Crawling Property Data from the Entire Fang.com (房天下) Site
Posted by 月下柳梢映
The amount of data I crawled before was a bit small, so this time I'm writing a whole-site crawler: Redis handles URL deduplication, MySQL stores the cleaned property data, and a thread pool schedules the multi-threaded crawl.
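The Redis deduplication layer is not shown in the code below, so here is a minimal sketch of the idea. It assumes a local Redis instance and a set key named `ftx:seen_urls`; both are illustrative choices, not taken from the project code:

```python
import redis

# assumes a Redis server on localhost:6379; the key name is an illustrative choice
r = redis.StrictRedis(host='localhost', port=6379, db=0)

def is_new_url(url):
    """Mark a URL as seen; True only the first time it is added."""
    # SADD returns 1 when the member is newly added, 0 when it already existed,
    # so check-and-mark happens in a single atomic Redis call
    return r.sadd('ftx:seen_urls', url) == 1

# only enqueue URLs that have not been crawled before
if is_new_url('http://esf.fang.com/house/i31'):
    print('new URL, schedule it for crawling')
```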
Below is the code that collects the new-house and second-hand-house listing URLs for every region on Fang.com; these serve as the start URLs for the crawl:
```python
import requests
from lxml import etree


class Ftx_newhouse_Secondhandhouse(object):
    """Collects the start URLs for every city's new-house and second-hand listings."""

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Cookie': 'global_cookie=5n55ylc24xzrdp58gka2fm0mx2lj4mqfqak; Integrateactivity=notincludemc; vh_newhouse=3_1499483589_17454%5B%3A%7C%40%7C%3A%5D9af16b0d610e2cdd596b0d5a35400fbd; newhouse_user_guid=925B3734-6802-3162-165C-B593DAA860F1; recentViewlpNew_newhouse=3_1502607112_9948%5B%3A%7C%40%7C%3A%5D54e263288e4374965795dfe7c94c7fd3; city=heyuan; polling_imei=232d98985399f89e; token=59c66a51681142018630f1745e1e739f; Captcha=6E6B7334505855746454384A743161514A46696B346D577833476C613647745662647355494E7570596D4C52612B564F45473832462B59674B5A6E504C63386A34614767326774426455773D; __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; sfut=33A48A581B218095B1D7CE492BDDCA86292F2A06B82634CBDD1201D2545F42EE4B54A2BC1247390DE02741E7CA2C9A911EA425B693C59EC2D62EDD7A4D70012C0F8DEE007CB20A5E2A74C8A9B17D4A8E3A7698ADDEAEC479D29D9DC82BC746FB; passport=usertype=1&userid=100371905&username=huangsonghui&password=&isvalid=1&validation=; agent_validation=a=0; __utma=147393320.331855580.1499000907.1504415980.1508935988.27; __utmb=147393320.49.10.1508935988; __utmc=147393320; __utmz=147393320.1508935988.27.21.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; unique_cookie=U_35b7j0utahefmuagw4fol4w8y1bj971iz3h*14',
    }

    def __init__(self):
        self.url = 'http://newhouse.fang.com/house/s/'
        self.s = requests.session()

    def Newhouse_ftx(self):
        """Return a {city_name: url} dict of new-house listing pages."""
        try:
            response = self.s.post(self.url, headers=self.headers, verify=False)
        except Exception as e:
            print('error:', e)
            return {}
        response.encoding = 'gb2312'
        urls = etree.HTML(response.text)
        # city names and their hrefs live in the 3rd-5th divs of the city picker
        xf_adress = urls.xpath('//div[@class="city20141104"]/div[3]/a/text()|'
                               '//div[@class="city20141104"]/div[4]/a/text()|'
                               '//div[@class="city20141104"]/div[5]/a/text()')
        xf_url = urls.xpath('//div[@class="city20141104"]/div[3]/a/@href|'
                            '//div[@class="city20141104"]/div[4]/a/@href|'
                            '//div[@class="city20141104"]/div[5]/a/@href')
        return dict(zip(xf_adress, xf_url))

    def Secondhandhouse_ftx(self):
        """Return a {city_name: url} dict of second-hand listing pages."""
        self.url = 'http://esf.sh.fang.com/newsecond/esfcities.aspx'
        try:
            html = requests.get(self.url, headers=self.headers, timeout=4)
        except Exception as e:
            print('error:', e)
            return {}
        html.encoding = 'gb2312'
        Secondhandhouse_urls = etree.HTML(html.text)
        # a/text() is the city name, a/@href is its listing URL
        xf_adress = Secondhandhouse_urls.xpath('//div[@class="onCont"]/ul/li/a/text()')
        xf_url = Secondhandhouse_urls.xpath('//div[@class="onCont"]/ul/li/a/@href')
        return dict(zip(xf_adress, xf_url))
```
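Both methods return a {city_name: start_url} dict. A quick sanity check might look like this (the exact cities and URLs depend on what the site returns at crawl time):

```python
# assumes the class above is importable from the current module
collector = Ftx_newhouse_Secondhandhouse()
newhouse_starts = collector.Newhouse_ftx()
secondhand_starts = collector.Secondhandhouse_ftx()
print(len(newhouse_starts), 'new-house start URLs')
print(len(secondhand_starts), 'second-hand start URLs')
```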
Below is the code that crawls the property data itself:
```python
import requests, redis, pymysql  # redis/pymysql back the dedup and storage layers
import re, os, time
from concurrent.futures import ThreadPoolExecutor
from lxml import etree
from mywed.fangtianxia.url import Ftx_newhouse_Secondhandhouse
from mywed.fangtianxia.logs import log_run

# seed the start-URL set, then add every city's second-hand listing URL
Secondhandhouse_urls_set = {'http://esf.hbjs.fang.com'}
dr = Ftx_newhouse_Secondhandhouse()
w = dr.Secondhandhouse_ftx()
for i in w.values():
    Secondhandhouse_urls_set.add(i)
print(Secondhandhouse_urls_set)


class Secondhandhouse(object):

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Cookie': 'global_cookie=5n55ylc24xzrdp58gka2fm0mx2lj4mqfqak; Integrateactivity=notincludemc; vh_newhouse=3_1499483589_17454%5B%3A%7C%40%7C%3A%5D9af16b0d610e2cdd596b0d5a35400fbd; newhouse_user_guid=925B3734-6802-3162-165C-B593DAA860F1; recentViewlpNew_newhouse=3_1502607112_9948%5B%3A%7C%40%7C%3A%5D54e263288e4374965795dfe7c94c7fd3; city=heyuan; polling_imei=232d98985399f89e; token=59c66a51681142018630f1745e1e739f; Captcha=6E6B7334505855746454384A743161514A46696B346D577833476C613647745662647355494E7570596D4C52612B564F45473832462B59674B5A6E504C63386A34614767326774426455773D; __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; sfut=33A48A581B218095B1D7CE492BDDCA86292F2A06B82634CBDD1201D2545F42EE4B54A2BC1247390DE02741E7CA2C9A911EA425B693C59EC2D62EDD7A4D70012C0F8DEE007CB20A5E2A74C8A9B17D4A8E3A7698ADDEAEC479D29D9DC82BC746FB; passport=usertype=1&userid=100371905&username=huangsonghui&password=&isvalid=1&validation=; agent_validation=a=0; __utma=147393320.331855580.1499000907.1504415980.1508935988.27; __utmb=147393320.49.10.1508935988; __utmc=147393320; __utmz=147393320.1508935988.27.21.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; unique_cookie=U_35b7j0utahefmuagw4fol4w8y1bj971iz3h*14',
    }

    def get_newhouse_data(self, url):
        # listing pages are paginated as <city_url>/house/i3<n>
        for num in range(102):
            second_url = url + '/house/i3' + str(num)
            try:
                while True:
                    response = requests.get(second_url, headers=self.headers, timeout=3)
                    response.encoding = 'gbk'
                    if response.status_code == 200:
                        break
                    else:
                        print('restart downloading ......')
            except Exception as e:
                log_run.File_enter_error(e)
                continue  # skip this page if the request keeps failing
            select = etree.HTML(response.text)

            # stop when the "next page" link disappears
            if not len(select.xpath('//a[@id="PageControl1_hlk_next"]/text()')):
                break

            content_list = select.xpath('//dd[@class="info rel floatr"]')
            for i in content_list:
                title = i.xpath('./p[1]/a/@title')
                content = i.xpath('./p[2]/text()')
                name = i.xpath('./p[3]/a/span/text()')
                adress = i.xpath('./p[3]/span/text()')
                try:
                    # page-level lists: one entry per listing on the page
                    size_list = select.xpath('//div[@class="area alignR"]')
                    size = [ii.xpath('./p/text()') for ii in size_list]
                    average_price_list = select.xpath('//p[@class="danjia alignR mt5"]')
                    average_price = ['/'.join(iii.xpath('./text()')) for iii in average_price_list]
                    sum_price_list = select.xpath('//p[@class="mt5 alignR"]')
                    sum_price = [''.join(iiii.xpath('./span/text()')) for iiii in sum_price_list]
                except Exception as e:
                    log_run.File_enter_error(e)
                print(title)


if __name__ == "__main__":
    t = Secondhandhouse()
    # pass the city root; get_newhouse_data appends the /house/i3<n> page suffix itself
    t.get_newhouse_data('http://esf.fang.com')
    # to crawl every city in parallel, enable the thread pool:
    # pool = ThreadPoolExecutor(30)
    # pool.map(t.get_newhouse_data, Secondhandhouse_urls_set)
```
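The MySQL storage promised above is only hinted at by the pymysql import. Below is a minimal sketch of how the cleaned fields could be written out; the connection parameters, the `fangtianxia` database, and the `secondhand` table are all assumptions for illustration, not part of the original project:

```python
import pymysql

# connection parameters, database and table names are assumptions for illustration
conn = pymysql.connect(host='localhost', user='root', password='your_password',
                       database='fangtianxia', charset='utf8mb4')

def save_listing(title, content, name, adress, size, average_price, sum_price):
    """Insert one cleaned listing row into the (hypothetical) secondhand table."""
    sql = ('INSERT INTO secondhand '
           '(title, content, name, adress, size, average_price, sum_price) '
           'VALUES (%s, %s, %s, %s, %s, %s, %s)')
    with conn.cursor() as cursor:
        # parameterized query; pymysql escapes the values for us
        cursor.execute(sql, (title, content, name, adress, size, average_price, sum_price))
    conn.commit()
```

With storage wired into the parsing loop, the commented-out ThreadPoolExecutor lines in `__main__` can then be enabled so 30 worker threads fan out over the per-city start URLs.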