Python spider Requests && Lxml && bs4

Posted by Adorable_Rocy


1. Installing Requests && Lxml && bs4

pip install requests
pip install bs4
pip install lxml
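
Before moving on, a quick sanity check that all three packages import correctly can save debugging time later. The snippet below is just a convenience sketch, not part of the original post.

import requests
import bs4
from lxml import etree

# Print the installed versions to confirm the installs worked.
print("requests:", requests.__version__)
print("bs4:", bs4.__version__)
print("lxml:", ".".join(str(n) for n in etree.LXML_VERSION))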

2. Basic usage of requests

  • Observe how Baidu Translate works
  • Follow that workflow to write your own translation tool
  1. Translation
import requests
import json

if __name__ == "__main__":
    # Send the same AJAX request URL that Baidu Translate uses
    post_url = "https://fanyi.baidu.com/sug"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
    }
    key = input("Input word:")

    params = {
        'kw': key
    }

    response = requests.post(url=post_url, data=params, headers=headers)

    dic_obj = response.json()
    # Persist the JSON data to disk
    with open('./translation.json', 'w', encoding='utf-8') as fp:
        json.dump(dic_obj, fp=fp, ensure_ascii=False)
    # Print the JSON data we received
    print(dic_obj)

The output looks like this:

Input word:python
{'errno': 0, 'data': [{'k': 'Python', 'v': '蛇属,蟒蛇属'}, {'k': 'python', 'v': 'n. 巨蛇,大蟒'}, {'k': 'pythons', 'v': 'n. 巨蛇,大蟒( python的名词复数 )'}]}
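
If you only need the suggested meanings rather than the whole response, a short post-processing loop (assuming the 'data' list shape shown above) does the trick:

# Print just the word/meaning pairs from the 'sug' response.
for entry in dic_obj.get('data', []):
    print(entry['k'], '->', entry['v'])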

  2. KFC store lookup
import requests

if __name__ == "__main__":
    post_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
    }

    cname = input("Enter a city: ")
    params = {
        'cname': cname,
        'pid': '',
        'keyword': '',
        'pageIndex': '1',
        'pageSize': '10'
    }

    response = requests.post(url=post_url, params=params, headers=headers)

    page_txt = response.text

    filename = cname + '.html'
    with open(filename, 'w', encoding='utf-8') as fp:
        fp.write(page_txt)

    print(page_txt, 'OVER!')

Enter a city name and the nearby-store lookup completes:

Enter a city: 北京
"Table":["rowcount":443],"Table1":["rownum":1,"storeName":"前门","addressDetail":"西城区前门西大街正阳市场1号楼中部","pro":"Wi-Fi,礼品卡","provinceName":"北京市","cityName":"北京市","rownum":2,"storeName":"京源","addressDetail":"左家庄新源街24号","pro":"Wi-Fi,礼品卡,生日餐会","provinceName":"北京市","cityName":"北京市","rownum":3,"storeName":"东大桥","addressDetail":"朝外大街东大桥路1号楼","pro":"Wi-Fi,店内参观,礼品卡","provinceName":"北京市","cityName":"北京市","rownum":4,"storeName":"方庄","addressDetail":"蒲芳路26号","pro":"Wi-Fi,店内参观,礼品卡,生日餐会","provinceName":"北京市","cityName":"北京市","rownum":5,"storeName":"安定门","addressDetail":"安定门外大街西河沿13号楼","pro":"Wi-Fi,礼品卡,生日餐会","provinceName":"北京市","cityName":"北京市","rownum":6,"storeName":"展览路(德宝)","addressDetail":"西外大街德宝新园14号","pro":"Wi-Fi,店内参观,礼品卡","provinceName":"北京市","cityName":"北京市","rownum":7,"storeName":"劲松","addressDetail":"劲松4区401楼","pro":"24小时,Wi-Fi,店内参观,礼品卡,生日餐会","provinceName":"北京市","cityName":"北京市","rownum":8,"storeName":"西罗园","addressDetail":"西罗园4区南二段","pro":"Wi-Fi,店内参观,礼品卡,生日餐会","provinceName":"北京市","cityName":"北京市","rownum":9,"storeName":"蓝桥","addressDetail":"蓝桥餐厅工体北路11-1号","pro":"24小时,Wi-Fi,点唱机,礼品卡","provinceName":"北京市","cityName":"北京市","rownum":10,"storeName":"万惠","addressDetail":"金台里甲15号","pro":"Wi-Fi,礼品卡,生日餐会","provinceName":"北京市","cityName":"北京市"] OVER!
  3. Scraping images with a regular expression
import os
import re
import requests

if __name__ == "__main__":
    if not os.path.exists('./img_url'):
        os.mkdir('./img_url')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/99.0.4844.51 Safari/537.36'
    }
    # Regular expression that captures the src attribute of every <img> tag
    reg = '<img src="(.*?)".*?/>'
    # {} is the page-number placeholder filled in by str.format below
    home_page_url = "https://www.biedoul.com/t/5pCe56yR5Zu%2B5paH_{}.html"
    for index in range(1, 14):

        home_page_url_num = home_page_url.format(str(index))

        print(home_page_url_num)

        home_text = requests.get(url=home_page_url_num, headers=headers).text
        # print(home_text)
        # ex = '<div class="nr"><dl class="xhlist" id="xh.*?"><dd>.*?<img src="(.*?)">.*?</dd></dl></div>'

        img_src = re.findall(reg, home_text, re.S)

        for i, src in enumerate(img_src):
            # Skip the first few matches (site decoration rather than content images)
            if i < 3:
                continue

            img_cont = requests.get(url=src, headers=headers).content

            img_path = './img_url/' + src.split('/')[-1]

            with open(img_path, 'wb') as fp:
                fp.write(img_cont)

        print("Page scraped")

3. Scraping with bs4

from bs4 import BeautifulSoup
import re
import requests

if __name__ == "__main__":
    book_url = 'https://www.shicimingju.com/book/sanguoyanyi.html'

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0'
    }
    # Fetch the table of contents; decode manually to avoid garbled characters
    page_text = requests.get(url=book_url, headers=headers).content.decode('utf-8')

    soup = BeautifulSoup(page_text, 'lxml')

    # Every chapter is an <li> inside <div class="book-mulu"><ul>
    list_data = soup.select('.book-mulu ul li')
    fp = open('sanguo.txt', 'w', encoding='utf-8')
    for src in list_data:

        title = src.a.string

        detail_url = 'https://www.shicimingju.com' + src.a['href']

        # Fetch the chapter page; decode manually to avoid garbled characters
        page_texts = requests.get(url=detail_url, headers=headers).content.decode('utf-8')

        # Parse the chapter page and pull out the text
        detail_soup = BeautifulSoup(page_texts, 'lxml')
        page_content = detail_soup.find('div', class_='chapter_content')

        content = page_content.text
        fp.write(title + ":" + content + '\n')
        print(title + " scraped")
    fp.close()
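
For reference, the CSS selector used above can be swapped for BeautifulSoup's find/find_all API; on this page both return the same chapter list. A minimal sketch against the same soup object:

# Equivalent lookup with find/find_all instead of a CSS selector.
mulu_div = soup.find('div', class_='book-mulu')
list_data = mulu_div.find_all('li')
print(len(list_data), 'chapters found')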

4. Simulating login with lxml etree

  1. Logging in by cracking the captcha
import requests
import ddddocr
from lxml import etree

if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
    }

    login_url = 'http://www.chaojiying.com/user/login/'

    page_text = requests.get(url=login_url, headers=headers).text
    # Parse the login page with etree
    page_tree = etree.HTML(page_text)
    # Use XPath to locate the captcha image
    img_url = page_tree.xpath('/html/body/div[3]/div/div[3]/div[1]/form/div/img/@src')[0]

    img_data = requests.get(url='http://www.chaojiying.com' + img_url, headers=headers).content
    with open('./img_code.jpg', 'wb') as fp:
        fp.write(img_data)
    # ddddocr: a handy open-source OCR library that recognizes simple captchas
    ocr = ddddocr.DdddOcr()
    with open('./img_code.jpg', 'rb') as f:
        img_bytes = f.read()
    res = ocr.classification(img_bytes)

    user_login_url = 'http://www.chaojiying.com/user/login/'
    data = {
        'user': 'username',
        'pass': 'password',
        'imgtxt': res,  # captcha text recognized from the image
        'act': '1'
    }
    # POST the login form; keep the status code to check for success
    user_page_text = requests.post(url=user_login_url, headers=headers, data=data).status_code
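
Note that in the snippet above the captcha image and the login POST use separate connections, so the captcha you solved may not belong to the session that logs in. A requests.Session keeps the cookies consistent across both requests; a minimal sketch, reusing the URLs and data fields defined above:

# Use one Session so the captcha fetch and the login POST share the same cookies.
session = requests.Session()
page_text = session.get(url=login_url, headers=headers).text
img_url = etree.HTML(page_text).xpath('/html/body/div[3]/div/div[3]/div[1]/form/div/img/@src')[0]
img_data = session.get(url='http://www.chaojiying.com' + img_url, headers=headers).content
data['imgtxt'] = ddddocr.DdddOcr().classification(img_data)
status = session.post(url=user_login_url, headers=headers, data=data).status_code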

1.2: Alternatively, log in with a cookie

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36',
    'cookie': 'cookie value copied from the browser'
}
if user_page_text == 200:
    succ_url = 'http://www.chaojiying.com/user/'
    success_page_text = requests.get(url=succ_url, headers=headers).text
    print(success_page_text)
  2. Batch-downloading free PPT templates
import os
import requests
from lxml import etree

if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
    }
    if not os.path.exists('./pptPic'):
        os.mkdir('./pptPic')
    if not os.path.exists('./ppt'):
        os.mkdir('./ppt')
    # Scrape the PPT thumbnails from the listing page and save them locally
    home_url = 'http://www.51pptmoban.com/ppt/'

    home_page = requests.get(url=home_url, headers=headers).text

    home_tree = etree.HTML(home_page)

    # All PPT thumbnails on the current page
    img_list = home_tree.xpath('//div[@class="pdiv"]//img/@src')

    for img_path in img_list:
        img_path = 'http://www.51pptmoban.com' + img_path
        img_name = img_path.split('/')[-1]
        img_store = 'pptPic/' + img_name

        img_binary = requests.get(url=img_path, headers=headers).content
        with open(img_store, 'wb') as fp:
            fp.write(img_binary)

    # Collect each PPT's detail-page URL

    ppt_list = home_tree.xpath('//div[@class="pdiv"]/a/@href')

    for ppt_url in ppt_list:
        ppt_url = 'http://www.51pptmoban.com' + ppt_url
        middle_page = requests.get(url=ppt_url, headers=headers).text
        middle_tree = etree.HTML(middle_page)
        down_url = 'http://www.51pptmoban.com' + middle_tree.xpath('//div[@class="ppt_xz"]/a/@href')[0]
        down_page = requests.get(url=down_url, headers=headers).text
        down_tree = etree.HTML(down_page)
        load_url = down_tree.xpath('//div[@class="down"]/a/@href')[0]
        # The href starts with "..", so strip it and rebuild the absolute URL
        load_url = load_url.split('..')[1]
        load_url = 'http://www.51pptmoban.com/e/DownSys' + load_url
        # Download the zip file and write the bytes to disk
        f = requests.get(url=load_url, headers=headers).content
        down_name = down_tree.xpath('//div[@class="wz"]/a/text()')[1]
        with open('ppt/' + down_name + '.zip', 'wb') as output:
            output.write(f)
        print(down_name + ": download finished")
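
The zip is named after text scraped from the page, which can contain characters that are not valid in filenames on some systems. A small sanitizing helper (illustrative only; safe_name is not part of the original code) avoids write errors:

import re

def safe_name(name: str) -> str:
    # Replace characters that Windows and most filesystems reject.
    return re.sub(r'[\\/:*?"<>|]', '_', name).strip()

with open('ppt/' + safe_name(down_name) + '.zip', 'wb') as output:
    output.write(f)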

Since all of this is fairly simple, I won't belabor it further.
