链家深圳租房信息爬取练习 附加源码
Posted jackzz
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了“链家深圳租房信息爬取练习 附加源码”相关的知识,希望对你有一定的参考价值。
"""Practice scraper for Lianjia Shenzhen rental listings.

Fetches the rental listing page, follows each listing link to its detail
page, reads the contact (broker) name there, and stores one
``(listing title, broker name)`` row per listing in ``house.csv``.
"""
import csv
from time import sleep
from urllib import request

from lxml import etree

# Headers sent with every request unless the caller overrides them.
DEFAULT_HEADERS = {
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
}

# Upper bound for a single HTTP request so a stalled connection cannot hang
# the whole run.
REQUEST_TIMEOUT_SECONDS = 10


def getRequet(url: str, xpath: str, **headers: str):
    """Download *url* and evaluate *xpath* against the parsed HTML.

    Args:
        url: Address of the page to fetch.
        xpath: XPath expression evaluated on the parsed document.
        **headers: Extra HTTP request headers. They are merged over
            ``DEFAULT_HEADERS``, so a caller-supplied value wins when the
            same header name appears in both.

    Returns:
        Whatever ``xpath()`` yields for the expression (a list of elements
        for node-selecting expressions). An empty list is returned when the
        parser produced no document tree.

    Raises:
        urllib.error.URLError: If the request fails or times out.
    """
    # Defaults first, caller headers last: later keys override earlier ones.
    merged_headers = {**DEFAULT_HEADERS, **headers}
    req = request.Request(url, headers=merged_headers)

    # The context manager closes the HTTP response even if read() fails.
    with request.urlopen(req, timeout=REQUEST_TIMEOUT_SECONDS) as response:
        page_source = response.read().decode()

    html = etree.HTML(page_source)
    if html is None:
        # Defensive: avoid an AttributeError if parsing yielded no tree.
        return []
    return html.xpath(xpath)


def main():
    """Scrape listing titles and broker names into ``house.csv``.

    One CSV row is written per listing: ``title, broker name``. The broker
    column is left empty when the detail page has no matching node.
    """
    zf_url = 'https://sz.lianjia.com/zufang/'  # listing page to crawl
    zf_xpath = '//ul[@id="house-lst"]/li/div[@class="info-panel"]/h2/a'  # listing links
    name_xpath = '//div[@class="brokerName"]/a'  # broker name on a detail page

    # newline='' lets csv.writer control the row terminator itself.
    with open('house.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, dialect='excel')

        for house in getRequet(zf_url, zf_xpath):
            attrib = house.attrib
            house_name = attrib['title']
            url = attrib['href']
            print('正在下载:', url)  # report the detail page being fetched

            brokers = getRequet(url, name_xpath)
            # Tolerate a missing broker node or a node without text.
            username = (brokers[0].text or '') if brokers else ''

            writer.writerow([house_name, username])

            # Fixed pause between requests; a randomized delay could be
            # substituted here to look less like a bot.
            sleep(1)

    print('下载完成')


if __name__ == '__main__':
    main()
以上是关于“链家深圳租房信息爬取练习 附加源码”的主要内容。如果未能解决你的问题,请参考以下文章: