Python spider Requests && Lxml && bs4
Posted by Adorable_Rocy
1. Installing Requests, lxml, and bs4
pip install requests
pip install bs4
pip install lxml
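A quick way to confirm the three packages installed correctly is to import them and print their versions. A minimal check (the version attributes shown are the usual ones, though naming can vary between releases):
import requests
import bs4
from lxml import etree

# If any of these imports fails, the corresponding pip install did not succeed
print('requests', requests.__version__)
print('bs4', bs4.__version__)
print('lxml', etree.__version__)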
2. Basic usage of requests
- Observe how Baidu Translate issues its AJAX request
- Follow that workflow to write your own translator
- The translator:
import requests
import json

if __name__ == "__main__":
    # Send the same POST request that Baidu Translate's AJAX call uses
    post_url = "https://fanyi.baidu.com/sug"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
    }
    key = input("Input word:")
    params = {
        'kw': key
    }
    response = requests.post(url=post_url, data=params, headers=headers)
    dic_obj = response.json()
    # Persist the JSON response to disk
    with open('./translation.json', 'w', encoding='utf-8') as fp:
        json.dump(dic_obj, fp=fp, ensure_ascii=False)
    # Print the JSON data we fetched
    print(dic_obj)
The output looks like this:
Input word:python
{'errno': 0, 'data': [{'k': 'Python', 'v': '蛇属,蟒蛇属'}, {'k': 'python', 'v': 'n. 巨蛇,大蟒'}, {'k': 'pythons', 'v': 'n. 巨蛇,大蟒( python的名词复数 )'}]}
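If you only want the translations rather than the raw JSON, you can pull the 'v' fields out of the 'data' list. A minimal sketch, reusing the dic_obj returned above:
# Assumes dic_obj is the parsed response from fanyi.baidu.com/sug shown above
for entry in dic_obj.get('data', []):
    # each entry carries 'k' (the matched word) and 'v' (its translation)
    print(entry['k'], '->', entry['v'])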
- KFC store lookup
post_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
cname = input("请输入城市:")
params = {
    'cname': cname,
    'pid': '',
    'keyword': '',
    'pageIndex': '1',
    'pageSize': '10'
}
# headers is the same User-Agent dict defined above
response = requests.post(url=post_url, params=params, headers=headers)
page_txt = response.text
filename = cname + '.html'
with open(filename, 'w', encoding='utf-8') as fp:
    fp.write(page_txt)
print(page_txt, 'OVER!')
Enter a city name and the nearby-store query completes:
请输入城市:北京
"Table":["rowcount":443],"Table1":["rownum":1,"storeName":"前门","addressDetail":"西城区前门西大街正阳市场1号楼中部","pro":"Wi-Fi,礼品卡","provinceName":"北京市","cityName":"北京市","rownum":2,"storeName":"京源","addressDetail":"左家庄新源街24号","pro":"Wi-Fi,礼品卡,生日餐会","provinceName":"北京市","cityName":"北京市","rownum":3,"storeName":"东大桥","addressDetail":"朝外大街东大桥路1号楼","pro":"Wi-Fi,店内参观,礼品卡","provinceName":"北京市","cityName":"北京市","rownum":4,"storeName":"方庄","addressDetail":"蒲芳路26号","pro":"Wi-Fi,店内参观,礼品卡,生日餐会","provinceName":"北京市","cityName":"北京市","rownum":5,"storeName":"安定门","addressDetail":"安定门外大街西河沿13号楼","pro":"Wi-Fi,礼品卡,生日餐会","provinceName":"北京市","cityName":"北京市","rownum":6,"storeName":"展览路(德宝)","addressDetail":"西外大街德宝新园14号","pro":"Wi-Fi,店内参观,礼品卡","provinceName":"北京市","cityName":"北京市","rownum":7,"storeName":"劲松","addressDetail":"劲松4区401楼","pro":"24小时,Wi-Fi,店内参观,礼品卡,生日餐会","provinceName":"北京市","cityName":"北京市","rownum":8,"storeName":"西罗园","addressDetail":"西罗园4区南二段","pro":"Wi-Fi,店内参观,礼品卡,生日餐会","provinceName":"北京市","cityName":"北京市","rownum":9,"storeName":"蓝桥","addressDetail":"蓝桥餐厅工体北路11-1号","pro":"24小时,Wi-Fi,点唱机,礼品卡","provinceName":"北京市","cityName":"北京市","rownum":10,"storeName":"万惠","addressDetail":"金台里甲15号","pro":"Wi-Fi,礼品卡,生日餐会","provinceName":"北京市","cityName":"北京市"] OVER!
- Scraping image data with a regular expression
import os
import re
import requests

if not os.path.exists('./img_url'):
    os.mkdir('./img_url')
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/99.0.4844.51 Safari/537.36'
}
# Regular expression that captures the src attribute of every <img> tag
reg = '<img src="(.*?)".*?/>'
# {} is the page-number placeholder filled in by format() below
home_page_url = "https://www.biedoul.com/t/5pCe56yR5Zu%2B5paH_{}.html"
for index in range(1, 14):
    home_page_url_num = home_page_url.format(str(index))
    print(home_page_url_num)
    home_text = requests.get(url=home_page_url_num, headers=headers).text
    # print(home_text)
    # ex = '<div class="nr"><dl class="xhlist" id="xh.*?"><dd>.*?<img src="(.*?)">.*?</dd></dl></div>'
    img_src = re.findall(reg, home_text, re.S)
    for i, src in enumerate(img_src):
        # skip the first three matches (presumably non-content images)
        if i < 3:
            continue
        img_cont = requests.get(url=src, headers=headers).content
        img_path = './img_url/' + src.split('/')[-1]
        with open(img_path, 'wb') as fp:
            fp.write(img_cont)
print("爬取完成")
3. Fetching and parsing pages with bs4
from bs4 import BeautifulSoup
import requests

if __name__ == "__main__":
    book_url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0'
    }
    # Fetch the table of contents; decode manually to avoid garbled characters
    page_text = requests.get(url=book_url, headers=headers).content.decode('utf-8')
    soup = BeautifulSoup(page_text, 'lxml')
    list_data = soup.select('.book-mulu ul li')
    with open('sanguo.txt', 'w', encoding='utf-8') as fp:
        for src in list_data:
            title = src.a.string
            detail_url = 'https://www.shicimingju.com' + src.a['href']
            # Fetch the chapter page; decode manually to avoid garbled characters
            page_texts = requests.get(url=detail_url, headers=headers).content.decode('utf-8')
            # Parse the chapter page and pull out its text
            detail_soup = BeautifulSoup(page_texts, 'lxml')
            page_content = detail_soup.find('div', class_='chapter_content')
            content = page_content.text
            fp.write(title + ":" + content + '\n')
            print(title + "爬取成功")
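As a quick reference for the two bs4 access patterns used above (CSS selection, then attribute/text access on the anchor), here is a self-contained toy example on a made-up fragment:
from bs4 import BeautifulSoup

# Made-up fragment mirroring the .book-mulu structure used above
html = '<div class="book-mulu"><ul><li><a href="/book/ch1.html">Chapter 1</a></li></ul></div>'
soup = BeautifulSoup(html, 'lxml')
for li in soup.select('.book-mulu ul li'):
    # .string gives the anchor text, ['href'] gives the attribute value
    print(li.a.string, li.a['href'])
# Chapter 1 /book/ch1.html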
4. Simulated login with lxml etree
- Log in to a site by recognizing its CAPTCHA
import requests
import ddddocr
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
}
login_url = 'http://www.chaojiying.com/user/login/'
page_text = requests.get(url=login_url, headers=headers).text
# Parse the login page with etree
page_tree = etree.HTML(page_text)
# Use XPath to locate the CAPTCHA image
img_url = page_tree.xpath('/html/body/div[3]/div/div[3]/div[1]/form/div/img/@src')[0]
img_data = requests.get(url='http://www.chaojiying.com' + img_url, headers=headers).content
with open('./img_code.jpg', 'wb') as fp:
    fp.write(img_data)
# ddddocr: a handy open-source OCR project on GitHub
ocr = ddddocr.DdddOcr()
with open('./img_code.jpg', 'rb') as f:
    img_bytes = f.read()
res = ocr.classification(img_bytes)
user_login_url = 'http://www.chaojiying.com/user/login/'
data = {
    'user': '用户名',    # your username
    'pass': '密码',      # your password
    'imgtxt': res,       # the CAPTCHA text recognized from the image
    'act': '1'
}
# POST the login form; status code 200 means the request went through
user_page_text = requests.post(url=user_login_url, headers=headers, data=data).status_code
1.2: Alternatively, log in with a cookie
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36',
    'cookie': 'cookie值'   # paste your logged-in cookie value here
}
if user_page_text == 200:
    succ_url = 'http://www.chaojiying.com/user/'
    success_page_text = requests.get(url=succ_url, headers=headers).text
    print(success_page_text)
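Instead of copying the cookie by hand, a requests.Session keeps the cookies set by the login POST and sends them on later requests automatically. A minimal sketch, assuming the same login_url, headers, and data as in the snippet above:
# Assumes login_url, headers, and data are defined as in the login snippet above
session = requests.Session()
# The session stores any cookies returned by the login response
login_resp = session.post(url=login_url, headers=headers, data=data)
if login_resp.status_code == 200:
    # Subsequent requests reuse those cookies, so the user page is fetched as a logged-in user
    user_page = session.get('http://www.chaojiying.com/user/', headers=headers).text
    print(user_page)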
- Batch-downloading free PPT templates
import os
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
}
if not os.path.exists('./pptPic'):
    os.mkdir('./pptPic')
if not os.path.exists('./ppt'):
    os.mkdir('./ppt')
# Scrape the PPT preview images and collect their URLs
home_url = 'http://www.51pptmoban.com/ppt/'
home_page = requests.get(url=home_url, headers=headers).text
home_tree = etree.HTML(home_page)
# All PPT preview images on the current page
img_list = home_tree.xpath('//div[@class="pdiv"]//img/@src')
for img_path in img_list:
    img_path = 'http://www.51pptmoban.com' + img_path
    img_name = img_path.split('/')[-1]
    img_store = 'pptPic/' + img_name
    img_binary = requests.get(url=img_path, headers=headers).content
    with open(img_store, 'wb') as fp:
        fp.write(img_binary)
# Collect the PPT detail-page links
ppt_list = home_tree.xpath('//div[@class="pdiv"]/a/@href')
for ppt_url in ppt_list:
    ppt_url = 'http://www.51pptmoban.com' + ppt_url
    middle_page = requests.get(url=ppt_url, headers=headers).text
    middle_tree = etree.HTML(middle_page)
    down_url = 'http://www.51pptmoban.com' + middle_tree.xpath('//div[@class="ppt_xz"]/a/@href')[0]
    down_page = requests.get(url=down_url, headers=headers).text
    down_tree = etree.HTML(down_page)
    load_url = down_tree.xpath('//div[@class="down"]/a/@href')[0]
    # The link is relative (starts with '..'); rebuild it against the DownSys path
    load_url = load_url.split('..')[1]
    load_url = 'http://www.51pptmoban.com/e/DownSys' + load_url
    # Download the zip archive and write it out as binary
    f = requests.get(url=load_url, headers=headers).content
    down_name = down_tree.xpath('//div[@class="wz"]/a/text()')[1]
    with open('ppt/' + down_name + '.zip', 'wb') as output:
        output.write(f)
    print(down_name + ": 下载完成")
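If you also want the templates unpacked, the archives downloaded into ./ppt can be extracted with the standard zipfile module. A small follow-up sketch (the per-template folder layout is my own choice, not part of the original script):
import os
import zipfile

# Unpack every archive downloaded into ./ppt by the script above
for name in os.listdir('./ppt'):
    if not name.endswith('.zip'):
        continue
    target = os.path.join('./ppt', name[:-4])   # one folder per template
    os.makedirs(target, exist_ok=True)
    with zipfile.ZipFile(os.path.join('./ppt', name)) as zf:
        zf.extractall(target)
    print(name, 'extracted to', target)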
Since this part is fairly straightforward, I won't elaborate further.