Scrapy framework project: scraping Lianjia second-hand housing listings across all of Wuhan
Posted by cwkcwk
import scrapy
from collections import Counter
from lianjia.items import LianjiaItem


class LianjiaSpiderSpider(scrapy.Spider):
    name = 'lianjia_spider'
    allowed_domains = ['wh.lianjia.com']
    start_urls = ['https://wh.lianjia.com/ershoufang/baibuting/']
    def parse(self, response):
        # Each listing on a result page lives in an <li class="clear LOGCLICKDATA"> node.
        info_list = response.xpath("//div//ul//li[@class='clear LOGCLICKDATA']")
        for i in info_list:
            item = LianjiaItem()  # create a fresh item per listing
            # community (xiaoqu) name
            item["xiaoqu_name"] = i.xpath('.//div[@class="houseInfo"]//a[@target="_blank"]/text()').extract()[0]
            # listing title
            item["name"] = i.xpath('.//div[@class="info clear"]//a/text()').extract()[0]
            # district / sub-area name
            item["area"] = i.xpath('.//div[@class="info clear"]//div[@class="positionInfo"]//a/text()').extract()[0]
            # link to the listing's detail page
            item["link"] = i.xpath(".//div[@class='title']//@href").extract()[0]
            # summary: layout, orientation, decoration, elevator, etc.
            item["summary"] = i.xpath('.//div[@class="houseInfo"]/text()').extract()[0]
            # floor information
            item["floor"] = i.xpath('.//div[@class="info clear"]//div[@class="positionInfo"]/text()').extract()[0]
            # total price (unit: 10,000 yuan)
            item["zongjia"] = i.xpath('.//div[@class="info clear"]//div[@class="totalPrice"]//span/text()').extract()[0]
            # unit price (yuan per square metre)
            item["danjia"] = i.xpath('.//div[@class="info clear"]//div[@class="unitPrice"]//span/text()').extract()[0]
            yield item
        # Analysis shows that searching directly under a large district such as
        # Wuchang or Hankou returns at most 30 pages of results, so for full
        # coverage every small sub-area link has to be crawled individually.
        area_list = ["baibuting", "dazhilu", "dijiao", "erqi2", "houhu", "huangpuyongqing", "qianjinjianghan", "sanyanglu", "tazihu", "yucaihuaqiao",
                     "changqinglu", "changfengchangmatou", "changganglu", "taibeixiangganglu", "tangjiadun", "wuguangwansongyuan", "xinhualuwanda", "yangchahu",
                     "baofengchongren", "changfengchangmatou", "cbdxibeihu", "gutian", "hanzhengjie", "jixian2", "wujiashan", "zongguan",
                     "changqinghuayuan", "dongxihuqita", "jinyinhu", "jiangjunlu", "baishazhou", "chuhehanjie", "donghudongting", "jiedaokou", "jiyuqiao", "shuiguohu", "shouyi", "shahu",
                     "tuanjiedadao", "wuchanghuochezhan", "xudong", "yangyuan", "zhongbeilu", "zhongnandingziqiao", "zhuodaoquan", "hongshanqita", "qingshan1", "huquanyangjiawan", "luoshinanlu",
                     "laonanhu", "nanhuwoerma", "xinnanhu", "qilimiao", "sixin", "wangjiawan", "zhongjiacun", "guanxichangzhi", "guangguguangchang", "guanshandadao", "guanggunan", "guanggudong",
                     "huakeda", "jinronggang", "minzudadao", "sanhuannan", "canglongdao", "jiangxiaqita", "miaoshan", "wenhuadadao", "caidianqita", "dunkou",
                     "hankoubei", "huangbeiqita", "panlongcheng", "qianchuan", "xinzhouqita", "yangluo"]
        # counter = Counter(area_list)  # check whether the list contains duplicates
        # print(counter)
        # ("changfengchangmatou" does appear twice above; the repeated requests it
        # produces are dropped by Scrapy's built-in dupefilter, so it is harmless.)
        # After covering every sub-area, walk through pages 1-30 of each one; only
        # then is all of the site's data reached, otherwise large parts are missed.
        for i in area_list:
            for num in range(1, 31):  # the site caps each sub-area at 30 pages
                yield scrapy.Request("https://wh.lianjia.com/ershoufang/" + i + "/pg" + str(num), callback=self.parse)
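As a side note, a slightly more idiomatic layout (a sketch only, assuming the area list is hoisted to a module-level AREA_LIST constant) generates the fan-out once in Scrapy's start_requests(), leaving parse() to do nothing but extraction instead of re-yielding the whole area/page loop for every response:

    def start_requests(self):
        # Emit every sub-area x page combination up front; parse() then only
        # needs to extract listings from each response.
        for area in AREA_LIST:  # AREA_LIST: the list above, moved to module level
            for page in range(1, 31):
                url = "https://wh.lianjia.com/ershoufang/{}/pg{}".format(area, page)
                yield scrapy.Request(url, callback=self.parse)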
items.py and pipelines.py contain nothing special; the standard Scrapy boilerplate works as-is.
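For reference, a minimal items.py matching the fields the spider fills in could look like the sketch below (the field names come from the spider above; the class skeleton is the standard Scrapy template):

import scrapy

class LianjiaItem(scrapy.Item):
    xiaoqu_name = scrapy.Field()  # community (xiaoqu) name
    name = scrapy.Field()         # listing title
    area = scrapy.Field()         # district / sub-area
    link = scrapy.Field()         # detail-page URL
    summary = scrapy.Field()      # layout, orientation, decoration, elevator
    floor = scrapy.Field()        # floor information
    zongjia = scrapy.Field()      # total price (10,000 yuan)
    danjia = scrapy.Field()       # unit price (yuan per square metre)

If no custom pipeline is needed, Scrapy's built-in feed export can dump the results directly, e.g. scrapy crawl lianjia_spider -o wuhan_ershoufang.csv.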