使用requests、re、BeautifulSoup、线程池爬取携程酒店信息并保存到Excel中
Posted wuyan717
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了使用requests、re、BeautifulSoup、线程池爬取携程酒店信息并保存到Excel中相关的知识,希望对你有一定的参考价值。
"""Scrape Ctrip hotel listings city by city and save each city to a CSV file.

The city list is read from ``address.json`` (one ``letters,number,name`` record
per line).  Cities whose number is already recorded in ``has_address.json`` are
skipped, so an interrupted run can be restarted.  Cities are processed
concurrently on a pool of 8 worker threads.
"""
import ast
import csv
import json
import logging
import os
import random
import re
import time

import requests
import threadpool
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

logger = logging.getLogger(__name__)

# Seconds to wait for any single HTTP request; without a timeout a stalled
# connection would block its worker thread forever.
REQUEST_TIMEOUT = 30

# Matches a backslash that does not start one of the escapes \/, \u or \".
# NOTE(review): the pattern in the published copy of this script had lost a
# backslash and did not compile (unbalanced parenthesis); this restores the
# form that compiles and matches the evident intent -- confirm against the
# author's original if available.
_STRAY_BACKSLASH = re.compile(r'\\(?![/u"])')


def _fetch_hotel_page(city_letter, city_num, page, user_agent):
    """POST the hotel-list query for one result page and return the parsed JSON.

    Raises whatever ``requests`` or ``json.loads`` raise on network or parse
    failure; the caller decides how to recover.
    """
    url = "http://hotels.ctrip.com/Domestic/Tool/AjaxHotelList.aspx"
    headers = {
        "Connection": "keep-alive",
        "origin": "http://hotels.ctrip.com",
        "Host": "hotels.ctrip.com",
        "referer": "http://hotels.ctrip.com/hotel/%s" % city_letter,
        "user-agent": user_agent,
        "Content-Type": "application/x-www-form-urlencoded",
    }
    data = {
        "StartTime": "2019-02-25",
        "DepTime": "2019-02-26",
        "RoomGuestCount": "1,1,0",
        "city": city_num,
        "page": page,
    }
    response = requests.post(url, headers=headers, data=data,
                             timeout=REQUEST_TIMEOUT)
    # Double every stray backslash so json.loads does not reject the payload
    # with an "invalid \escape" error.
    fixed = _STRAY_BACKSLASH.sub(r"\\\\", response.text)
    return json.loads(fixed)


def _fetch_introduction(hotel_id):
    """Download a hotel's detail page and assemble its introduction text.

    The result concatenates the cleaned ``htlDes`` paragraph, the telephone
    fragment cut out of the ``J_realContact`` span, and the long description.
    Raises (AttributeError, TypeError, IndexError, KeyError, requests errors)
    when the page cannot be fetched or lacks any of the expected elements.
    """
    response = requests.get("http://hotels.ctrip.com/hotel/%s.html" % hotel_id,
                            timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.text, "html5lib")

    summary = soup.find("div", id="htlDes").findAll("p")[0].get_text()
    summary = summary.replace("联系方式", "").replace("资质备案", "")
    # NOTE(review): the published copy shows three identical-looking
    # replace(' ', '') calls, presumably different whitespace escapes that were
    # lost in publication; removing all whitespace covers every such variant.
    summary = "".join(summary.split())

    contact = soup.find("span", id="J_realContact")["data-real"].replace(" ", ",")
    # Keep the text from the last "电话" up to two characters before the last "<a".
    tel = contact[contact.rfind("电话"): contact.rfind("<a") - 2]

    description = soup.find(
        "span", id="ctl00_MainContentPlaceHolder_hotelDetailInfo_lbDesc"
    ).get_text().replace(" ", ",")

    return str(summary) + str(tel) + str(description)


def hotel(city_letter, city_num, city_name):
    """Scrape up to 99 result pages of hotels for one city into a CSV file.

    Args:
        city_letter: Value appended to the ``/hotel/`` path in the Referer header.
        city_num: City identifier sent as the ``city`` form field; it is also
            appended to ``has_address.json`` so later runs skip this city.
        city_name: Used to name the output file ``携程/<city_name>.csv``.

    Rows are written as: running number, name, price, star rating, address,
    introduction.  A page or hotel that cannot be fetched or parsed is logged
    and skipped; the scrape continues with the next one.
    """
    # The city is recorded before scraping starts, so a restarted run will not
    # pick it up again even if this call is interrupted part-way through.
    with open("has_address.json", "a+", encoding="utf-8") as done_file:
        done_file.write(str(city_num) + "\n")

    output_path = "携程/%s.csv" % city_name
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    user_agents = UserAgent(verify_ssl=False)
    row_number = 0
    # newline="" is what the csv module requires to avoid blank rows on Windows.
    with open(output_path, "w+", encoding="utf-8-sig", newline="") as hotel_xie:
        writer = csv.writer(hotel_xie, dialect="excel")
        writer.writerow(["序号", "名称", "价格", "星级", "地址", "酒店介绍"])

        for page in range(1, 100):
            time.sleep(random.randint(1, 5))  # random delay between list requests
            try:
                page_data = _fetch_hotel_page(
                    city_letter, city_num, page, user_agents.random)
                positions = page_data["hotelPositionJSON"]
                # SECURITY: this string comes from the remote server.  The
                # original passed it to eval(); literal_eval accepts the same
                # literal syntax without executing arbitrary code.
                prices = ast.literal_eval(
                    page_data["HotelMaiDianData"]["value"]["htllist"])
            except Exception:
                # Skip the page instead of falling through: otherwise the
                # previous page's data would be written a second time.
                logger.warning("city %s page %s: request or parse failed",
                               city_num, page, exc_info=True)
                continue

            for index, position in enumerate(positions):
                try:
                    hotel_name = position["name"]
                    hotel_id = position["id"]
                    hotel_address = position["address"]
                    price = prices[index]["amount"]
                    star_class = position["star"][-2:]
                    time.sleep(random.randint(1, 3))  # delay before detail request
                    introduction = _fetch_introduction(hotel_id)
                    # %-formatting also works when the amount is numeric.
                    writer.writerow([row_number + 1, hotel_name,
                                     "%s元起" % price, star_class,
                                     hotel_address, introduction])
                    row_number += 1
                except Exception:
                    logger.warning("city %s page %s item %s: skipped",
                                   city_num, page, index, exc_info=True)
                    continue

            time.sleep(random.randint(1, 4))


def _load_finished_city_ids(path):
    """Return the set of city numbers recorded in *path* (empty if no file)."""
    try:
        with open(path, encoding="utf-8") as done_file:
            tokens = done_file.read().split()
    except FileNotFoundError:
        return set()

    finished = set()
    for token in tokens:
        try:
            finished.add(int(token))
        except ValueError:
            logger.warning("ignoring non-numeric entry %r in %s", token, path)
    return finished


def _load_pending_cities(path, finished):
    """Read city records from *path* and return threadpool argument tuples.

    Each returned item is ``([letters, number, name], None)``, the
    ``(args, kwargs)`` form that ``threadpool.makeRequests`` unpacks into
    positional arguments for ``hotel``.
    """
    pending = []
    with open(path, encoding="utf-8") as address_file:
        for line in address_file:
            # strip() drops the line terminator so it cannot leak into the
            # city name (and from there into the CSV file name).
            fields = line.strip().replace(" ", "").split(",")
            if fields == [""]:
                continue
            if len(fields) != 3:
                logger.warning("ignoring malformed city record %r", line)
                continue
            try:
                city_id = int(fields[1])
            except ValueError:
                logger.warning("ignoring city record with bad number %r", line)
                continue
            if city_id in finished:
                continue
            pending.append((fields, None))
    return pending


def main():
    """Scrape every city that has not been processed yet, 8 cities at a time."""
    logging.basicConfig(level=logging.INFO)
    finished = _load_finished_city_ids("has_address.json")
    will_req_list = _load_pending_cities("address.json", finished)

    pool = threadpool.ThreadPool(8)
    for request in threadpool.makeRequests(hotel, will_req_list):
        pool.putRequest(request)
    pool.wait()


if __name__ == "__main__":
    main()

# One-off snippet that was used to build address.json (kept for reference).
# NOTE(review): the "\D" and "\n" escapes below were missing in the published
# copy and have been restored from context -- confirm before re-running.
# h = {
#     "Connection": "keep-alive",
#     "origin": "http://hotels.ctrip.com",
#     "Host": "hotels.ctrip.com",
#     "referer": "http://hotels.ctrip.com/hotel/beijing1",
#     "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
#     "Content-Type": "application/x-www-form-urlencoded",
# }
# res = requests.get('http://hotels.ctrip.com/Domestic/Tool/AjaxGetCitySuggestion.aspx', headers=h)
# a_list = re.findall('data:(.*?),group:', res.text)
# with open('address.json', 'w+', encoding="utf-8") as f:
#     for address in a_list:
#         i = 0
#         al = address.split(',')
#         for a in al:
#             city_letter = ''.join(re.findall(r'[A-Za-z]', a))
#             f.write(city_letter + ',')
#             city_num = re.sub(r"\D", "", a)
#             f.write(str(city_num))
#             city_name = re.sub('[A-Za-z0-9"|]', "", a)
#             f.write(',' + str(city_name))
#             f.write('\n')
#             i += 1
#     f.close()
以上是关于使用requests、re、BeautifulSoup、线程池爬取携程酒店信息并保存到Excel中的主要内容,如果未能解决你的问题,请参考以下文章
在使用加载数据流步骤的猪中,使用(使用 PigStorage)和不使用它有啥区别?
Qt静态编译时使用OpenSSL有三种方式(不使用,动态使用,静态使用,默认是动态使用)