Crawling Property Data from the Entire Fang.com (房天下) Site
Posted by 月下柳梢映
The amount of data I crawled before was a bit small, so this time I'm writing a whole-site crawler: Redis handles URL deduplication, MySQL stores the cleaned property data, and a thread pool schedules the multi-threaded crawl.
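The Redis deduplication layer is not shown in the code below, so here is a minimal sketch of the idea. It assumes a local Redis instance and a set key named `ftx:seen_urls`; both are illustrative choices, not taken from the project code:

```python
import redis

# assumes a Redis server on localhost:6379; the key name is an illustrative choice
r = redis.StrictRedis(host='localhost', port=6379, db=0)

def is_new_url(url):
    """Mark a URL as seen; True only the first time it is added."""
    # SADD returns 1 when the member is newly added, 0 when it already existed,
    # so check-and-mark happens in a single atomic Redis call
    return r.sadd('ftx:seen_urls', url) == 1

# only enqueue URLs that have not been crawled before
if is_new_url('http://esf.fang.com/house/i31'):
    print('new URL, schedule it for crawling')
```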
Below is the code that collects the new-house and second-hand-house listing URLs for every region on Fang.com; these serve as the start URLs for the crawl:
```python
import requests
from lxml import etree


class Ftx_newhouse_Secondhandhouse(object):
    """Collects the start URLs for every city's new-house and second-hand listings."""

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Cookie': 'global_cookie=5n55ylc24xzrdp58gka2fm0mx2lj4mqfqak; Integrateactivity=notincludemc; vh_newhouse=3_1499483589_17454%5B%3A%7C%40%7C%3A%5D9af16b0d610e2cdd596b0d5a35400fbd; newhouse_user_guid=925B3734-6802-3162-165C-B593DAA860F1; recentViewlpNew_newhouse=3_1502607112_9948%5B%3A%7C%40%7C%3A%5D54e263288e4374965795dfe7c94c7fd3; city=heyuan; polling_imei=232d98985399f89e; token=59c66a51681142018630f1745e1e739f; Captcha=6E6B7334505855746454384A743161514A46696B346D577833476C613647745662647355494E7570596D4C52612B564F45473832462B59674B5A6E504C63386A34614767326774426455773D; __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; sfut=33A48A581B218095B1D7CE492BDDCA86292F2A06B82634CBDD1201D2545F42EE4B54A2BC1247390DE02741E7CA2C9A911EA425B693C59EC2D62EDD7A4D70012C0F8DEE007CB20A5E2A74C8A9B17D4A8E3A7698ADDEAEC479D29D9DC82BC746FB; passport=usertype=1&userid=100371905&username=huangsonghui&password=&isvalid=1&validation=; agent_validation=a=0; __utma=147393320.331855580.1499000907.1504415980.1508935988.27; __utmb=147393320.49.10.1508935988; __utmc=147393320; __utmz=147393320.1508935988.27.21.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; unique_cookie=U_35b7j0utahefmuagw4fol4w8y1bj971iz3h*14',
    }

    def __init__(self):
        self.url = 'http://newhouse.fang.com/house/s/'
        self.s = requests.session()

    def Newhouse_ftx(self):
        """Return a {city_name: url} dict of new-house listing pages."""
        try:
            response = self.s.post(self.url, headers=self.headers, verify=False)
        except Exception as e:
            print('error:', e)
            return {}
        response.encoding = 'gb2312'
        urls = etree.HTML(response.text)
        # city names and their hrefs live in the 3rd-5th divs of the city picker
        xf_adress = urls.xpath('//div[@class="city20141104"]/div[3]/a/text()|'
                               '//div[@class="city20141104"]/div[4]/a/text()|'
                               '//div[@class="city20141104"]/div[5]/a/text()')
        xf_url = urls.xpath('//div[@class="city20141104"]/div[3]/a/@href|'
                            '//div[@class="city20141104"]/div[4]/a/@href|'
                            '//div[@class="city20141104"]/div[5]/a/@href')
        return dict(zip(xf_adress, xf_url))

    def Secondhandhouse_ftx(self):
        """Return a {city_name: url} dict of second-hand listing pages."""
        self.url = 'http://esf.sh.fang.com/newsecond/esfcities.aspx'
        try:
            html = requests.get(self.url, headers=self.headers, timeout=4)
        except Exception as e:
            print('error:', e)
            return {}
        html.encoding = 'gb2312'
        Secondhandhouse_urls = etree.HTML(html.text)
        # a/text() is the city name, a/@href is its listing URL
        xf_adress = Secondhandhouse_urls.xpath('//div[@class="onCont"]/ul/li/a/text()')
        xf_url = Secondhandhouse_urls.xpath('//div[@class="onCont"]/ul/li/a/@href')
        return dict(zip(xf_adress, xf_url))
```
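Both methods return a {city_name: start_url} dict. A quick sanity check might look like this (the exact cities and URLs depend on what the site returns at crawl time):

```python
# assumes the class above is importable from the current module
collector = Ftx_newhouse_Secondhandhouse()
newhouse_starts = collector.Newhouse_ftx()
secondhand_starts = collector.Secondhandhouse_ftx()
print(len(newhouse_starts), 'new-house start URLs')
print(len(secondhand_starts), 'second-hand start URLs')
```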
Below is the code that crawls the property data itself:
```python
import requests, redis, pymysql  # redis/pymysql back the dedup and storage layers
import re, os, time
from concurrent.futures import ThreadPoolExecutor
from lxml import etree
from mywed.fangtianxia.url import Ftx_newhouse_Secondhandhouse
from mywed.fangtianxia.logs import log_run

# seed the start-URL set, then add every city's second-hand listing URL
Secondhandhouse_urls_set = {'http://esf.hbjs.fang.com'}
dr = Ftx_newhouse_Secondhandhouse()
w = dr.Secondhandhouse_ftx()
for i in w.values():
    Secondhandhouse_urls_set.add(i)
print(Secondhandhouse_urls_set)


class Secondhandhouse(object):

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Cookie': 'global_cookie=5n55ylc24xzrdp58gka2fm0mx2lj4mqfqak; Integrateactivity=notincludemc; vh_newhouse=3_1499483589_17454%5B%3A%7C%40%7C%3A%5D9af16b0d610e2cdd596b0d5a35400fbd; newhouse_user_guid=925B3734-6802-3162-165C-B593DAA860F1; recentViewlpNew_newhouse=3_1502607112_9948%5B%3A%7C%40%7C%3A%5D54e263288e4374965795dfe7c94c7fd3; city=heyuan; polling_imei=232d98985399f89e; token=59c66a51681142018630f1745e1e739f; Captcha=6E6B7334505855746454384A743161514A46696B346D577833476C613647745662647355494E7570596D4C52612B564F45473832462B59674B5A6E504C63386A34614767326774426455773D; __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; sfut=33A48A581B218095B1D7CE492BDDCA86292F2A06B82634CBDD1201D2545F42EE4B54A2BC1247390DE02741E7CA2C9A911EA425B693C59EC2D62EDD7A4D70012C0F8DEE007CB20A5E2A74C8A9B17D4A8E3A7698ADDEAEC479D29D9DC82BC746FB; passport=usertype=1&userid=100371905&username=huangsonghui&password=&isvalid=1&validation=; agent_validation=a=0; __utma=147393320.331855580.1499000907.1504415980.1508935988.27; __utmb=147393320.49.10.1508935988; __utmc=147393320; __utmz=147393320.1508935988.27.21.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; unique_cookie=U_35b7j0utahefmuagw4fol4w8y1bj971iz3h*14',
    }

    def get_newhouse_data(self, url):
        # listing pages are paginated as <city_url>/house/i3<n>
        for num in range(102):
            second_url = url + '/house/i3' + str(num)
            try:
                while True:
                    response = requests.get(second_url, headers=self.headers, timeout=3)
                    response.encoding = 'gbk'
                    if response.status_code == 200:
                        break
                    else:
                        print('restart downloading ......')
            except Exception as e:
                log_run.File_enter_error(e)
                continue  # skip this page if the request keeps failing
            select = etree.HTML(response.text)

            # stop when the "next page" link disappears
            if not len(select.xpath('//a[@id="PageControl1_hlk_next"]/text()')):
                break

            content_list = select.xpath('//dd[@class="info rel floatr"]')
            for i in content_list:
                title = i.xpath('./p[1]/a/@title')
                content = i.xpath('./p[2]/text()')
                name = i.xpath('./p[3]/a/span/text()')
                adress = i.xpath('./p[3]/span/text()')
                try:
                    # page-level lists: one entry per listing on the page
                    size_list = select.xpath('//div[@class="area alignR"]')
                    size = [ii.xpath('./p/text()') for ii in size_list]
                    average_price_list = select.xpath('//p[@class="danjia alignR mt5"]')
                    average_price = ['/'.join(iii.xpath('./text()')) for iii in average_price_list]
                    sum_price_list = select.xpath('//p[@class="mt5 alignR"]')
                    sum_price = [''.join(iiii.xpath('./span/text()')) for iiii in sum_price_list]
                except Exception as e:
                    log_run.File_enter_error(e)
                print(title)


if __name__ == "__main__":
    t = Secondhandhouse()
    # pass the city root; get_newhouse_data appends the /house/i3<n> page suffix itself
    t.get_newhouse_data('http://esf.fang.com')
    # to crawl every city in parallel, enable the thread pool:
    # pool = ThreadPoolExecutor(30)
    # pool.map(t.get_newhouse_data, Secondhandhouse_urls_set)
```
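The MySQL storage promised above is only hinted at by the pymysql import. Below is a minimal sketch of how the cleaned fields could be written out; the connection parameters, the `fangtianxia` database, and the `secondhand` table are all assumptions for illustration, not part of the original project:

```python
import pymysql

# connection parameters, database and table names are assumptions for illustration
conn = pymysql.connect(host='localhost', user='root', password='your_password',
                       database='fangtianxia', charset='utf8mb4')

def save_listing(title, content, name, adress, size, average_price, sum_price):
    """Insert one cleaned listing row into the (hypothetical) secondhand table."""
    sql = ('INSERT INTO secondhand '
           '(title, content, name, adress, size, average_price, sum_price) '
           'VALUES (%s, %s, %s, %s, %s, %s, %s)')
    with conn.cursor() as cursor:
        # parameterized query; pymysql escapes the values for us
        cursor.execute(sql, (title, content, name, adress, size, average_price, sum_price))
    conn.commit()
```

With storage wired into the parsing loop, the commented-out ThreadPoolExecutor lines in `__main__` can then be enabled so 30 worker threads fan out over the per-city start URLs.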