使用requests、re、BeautifulSoup、线程池爬取携程酒店信息并保存到Excel中

Posted wuyan717

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了使用requests、re、BeautifulSoup、线程池爬取携程酒店信息并保存到Excel中的相关知识,希望对你有一定的参考价值。

import csv
import json
import os
import random
import re
import time

import requests
import threadpool
from bs4 import BeautifulSoup
from fake_useragent import UserAgent


# Seconds to wait for any single HTTP response before giving up on it.
_REQUEST_TIMEOUT = 30


def _hotel_introduction(hotel_id, timeout=_REQUEST_TIMEOUT):
    """Fetch a hotel's detail page and return its combined introduction text.

    The result concatenates, in this order: the cleaned first paragraph of
    the ``htlDes`` block, the phone part of the ``J_realContact`` element and
    the text of the long-description element.

    Raises:
        Whatever ``requests`` raises on a failed download, or the
        ``AttributeError``/``TypeError``/``IndexError``/``KeyError`` produced
        when an expected element is missing from the page.  The caller
        treats any of these as "skip this hotel".
    """
    page = requests.get(
        "http://hotels.ctrip.com/hotel/%s.html" % hotel_id, timeout=timeout
    )
    soup = BeautifulSoup(page.text, "html5lib")

    summary = soup.find("div", id="htlDes").findAll("p")[0].get_text()
    summary = summary.replace("联系方式", "").replace("资质备案", "")
    # NOTE(review): the published listing lost the literal whitespace
    # characters that were removed here (a newline plus two others), so all
    # whitespace is stripped -- confirm this matches the intended cleanup.
    summary = "".join(summary.split())

    contact = soup.find("span", id="J_realContact")["data-real"].replace("\n", ",")
    # Keep the text from the last "电话" up to two characters before the last "<a".
    phone = contact[contact.rfind("电话"): contact.rfind("<a") - 2]

    description = (
        soup.find("span", id="ctl00_MainContentPlaceHolder_hotelDetailInfo_lbDesc")
        .get_text()
        .replace("\n", ",")
    )
    return summary + phone + description


def hotel(city_letter, city_num, city_name,
          start_time="2019-02-25", dep_time="2019-02-26"):
    """Scrape Ctrip hotel listings for one city into ``携程/<city_name>.csv``.

    The city number is first appended to ``has_address.json``.  Result pages
    1-99 of the Ctrip AJAX hotel-list endpoint are then requested; for each
    of up to 25 hotels per page the detail page is fetched and one CSV row
    (sequence number, name, price, star rating, address, introduction) is
    written.  Random sleeps are inserted between requests.

    Args:
        city_letter: Value interpolated into the ``referer`` header URL.
        city_num: Value sent as the ``city`` form field and recorded in
            ``has_address.json``.
        city_name: Used as the CSV file name inside the ``携程`` directory.
        start_time: Value of the ``StartTime`` form field.
        dep_time: Value of the ``DepTime`` form field.
    """
    # Record the city before crawling; the entry script reads this file to
    # decide which cities to skip.
    with open("has_address.json", "a+", encoding="utf-8") as f:
        f.write(str(city_num) + "\n")

    url = "http://hotels.ctrip.com/Domestic/Tool/AjaxHotelList.aspx"
    # A backslash that does not start a `\/`, `\u` or `\"` escape makes
    # json.loads reject the response, so such backslashes are doubled.
    bad_escape = re.compile(r'\\(?![/u"])')
    user_agents = UserAgent(verify_ssl=False)

    # Without the directory the open() below fails after the city has
    # already been marked as handled.
    os.makedirs("携程", exist_ok=True)

    ss = 0
    # newline="" is what the csv module requires to avoid blank rows on Windows.
    with open("携程/%s.csv" % city_name, "w+", encoding="utf-8-sig",
              newline="") as hotel_xie:
        k = csv.writer(hotel_xie, dialect="excel")
        k.writerow(["序号", "名称", "价格", "星级", "地址", "酒店介绍"])
        for i in range(1, 100):
            headers = {
                "Connection": "keep-alive",
                "origin": "http://hotels.ctrip.com",
                "Host": "hotels.ctrip.com",
                "referer": "http://hotels.ctrip.com/hotel/%s" % city_letter,
                "user-agent": user_agents.random,
                "Content-Type": "application/x-www-form-urlencoded",
            }
            data = {
                "StartTime": start_time,
                "DepTime": dep_time,
                "RoomGuestCount": "1,1,0",
                "city": city_num,
                "page": i,
            }
            time.sleep(random.randint(1, 5))
            try:
                html = requests.post(url, headers=headers, data=data,
                                     timeout=_REQUEST_TIMEOUT)
                # The replacement must be four backslashes in a raw string:
                # r"\\" would substitute a single backslash, i.e. change nothing.
                aa = json.loads(bad_escape.sub(r"\\\\", html.text))
            except (requests.RequestException, ValueError):
                # Skip this page; falling through would re-use the previous
                # page's data and write duplicate rows.
                continue

            for n in range(0, 25):
                try:
                    position = aa["hotelPositionJSON"][n]
                    hotel_name = position["name"]
                    hotel_id = position["id"]
                    hotel_address = position["address"]
                    # NOTE(review): eval() runs text received from the remote
                    # server; replace with json.loads/ast.literal_eval once
                    # the payload format is confirmed.
                    price = eval(aa["HotelMaiDianData"]["value"]["htllist"])[n]["amount"]
                    star_class = position["star"][-2:]
                    time.sleep(random.randint(1, 3))
                    introduction = _hotel_introduction(hotel_id)
                    row = [ss + 1, hotel_name, price + "元起", star_class,
                           hotel_address, introduction]
                    k.writerow(row)
                    # Count only rows actually written so numbering has no gaps.
                    ss += 1
                except Exception:
                    # Deliberate best effort: a missing index, field or page
                    # element just means this hotel is skipped.
                    continue
                time.sleep(random.randint(1, 4))


if __name__ == "__main__":
    # City numbers already recorded in has_address.json are skipped.  The
    # file is read once into a set rather than once per address line.
    has_num = set()
    try:
        with open("has_address.json", encoding="utf-8") as done_file:
            has_num = {int(has) for has in done_file if has.strip()}
    except FileNotFoundError:
        # No progress file yet: nothing to skip.
        pass

    # Each address.json line is "<city_letter>,<city_num>,<city_name>".
    will_req_list = []
    with open("address.json", encoding="utf-8") as address_file:
        for line in address_file:
            if not line.strip():
                continue
            single_list = line.replace("\n", "").split(",")
            if int(single_list[1]) in has_num:
                continue
            # threadpool unpacks an (args, kwargs) tuple into hotel(*args).
            will_req_list.append((single_list, None))

    pool = threadpool.ThreadPool(8)
    for req in threadpool.makeRequests(hotel, will_req_list):
        pool.putRequest(req)
    pool.wait()

    # Scrape the city list into address.json (the file the entry script reads)
    # h = {
    #         "Connection": "keep-alive",
    #         "origin": "http://hotels.ctrip.com",
    #         "Host": "hotels.ctrip.com",
    #         "referer": "http://hotels.ctrip.com/hotel/beijing1",
    #         "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
    #         "Content-Type": "application/x-www-form-urlencoded",
    #     }
    # res = requests.get('http://hotels.ctrip.com/Domestic/Tool/AjaxGetCitySuggestion.aspx', headers=h)
    # a_list = re.findall('data:(.*?),group:', res.text)
    # with open('address.json', 'w+',  encoding="utf-8") as f:
    #     for address in a_list:
    #         i = 0
    #         al = address.split(',')
    #         for a in al:
    #             city_letter = ''.join(re.findall(r'[A-Za-z]', a))
    #             f.write(city_letter + ',')
    #             city_num = re.sub(r"\D", "", a)
    #             f.write(str(city_num))
    #             city_name = re.sub('[A-Za-z0-9"|]', "", a)
    #             f.write(',' + str(city_name))
    #             f.write('\n')
    #         i += 1
    # f.close()

 

以上是关于使用requests、re、BeautifulSoup、线程池爬取携程酒店信息并保存到Excel中的主要内容,如果未能解决你的问题,请参考以下文章

第一篇 用于测试使用

在 Apache Pig 的数据流加载(LOAD)步骤中,使用 PigStorage 与不使用它有什么区别?

今目标使用教程 今目标任务使用篇

Qt静态编译时使用OpenSSL有三种方式(不使用,动态使用,静态使用,默认是动态使用)

MySQL 数据库在按日期排序时出现"Using where; Using temporary; Using filesort"

使用"use strict"作为"use strong"的后备