爬虫 requests,bs4 用法示例
Posted shijieli
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬虫 requests,bs4 用法示例相关的知识,希望对你有一定的参考价值。
requests 模块
用法
import requests

# 1. Methods
"""
requests.get
requests.post
requests.put
requests.delete
...
requests.request(method='POST')
"""

# 2. Parameters
"""
2.1 url

2.2 headers

2.3 cookies

2.4 params

2.5 data -- request body, form-encoded
    requests.post(..., data={'user': 'alex', 'pwd': '123'})
    # The body goes over the wire as a form:
    #   GET /index http1.1
    #   host: c1.com
    #
    #   user=alex&pwd=123

2.6 json -- request body, JSON document
    requests.post(..., json={'user': 'alex', 'pwd': '123'})
    # The body goes over the wire as JSON:
    #   GET /index http1.1
    #   host: c1.com
    #   Content-Type: application/json
    #
    #   {"user": "alex", "pwd": 123}

2.7 proxies
    # Proxy without authentication.
    proxie_dict = {
        "http": "61.172.249.96:80",                           # all http requests use this proxy
        "https://www.proxy360.cn/Proxy": "61.172.249.96:80",  # exact-URL match
        "https": "http://61.185.219.126:3128",
    }
    ret = requests.get("https://www.proxy360.cn/Proxy", proxies=proxie_dict)

    # Proxy that requires authentication.
    from requests.auth import HTTPProxyAuth

    proxyDict = {
        'http': '77.75.105.165',                   # all http requests go through this proxy
        'http://www.google.com': '77.75.105.165',  # exact URL -> this specific proxy
        'https': '77.75.106.165'                   # all https requests go through this one
    }
    # Credentials live in a separate auth object.
    auth = HTTPProxyAuth('username', 'password')
    # NOTE(review): the original snippet was missing the comma before `proxies=`.
    r = requests.get("http://www.google.com", data={'xxx': 'ffff'},
                     proxies=proxyDict, auth=auth)
    print(r.text)

2.8 files -- file upload
    file_dict = {
        'f1': open('xxxx.log', 'rb')
    }
    requests.request(
        method='POST',
        url='http://127.0.0.1:8000/test/',
        files=file_dict
    )

2.9 auth -- basic authentication
    # Internally the username/password pair is encoded and sent in a header:
    #   - "user:password"
    #   - base64("user:password")
    #   - header:  Authorization: "Basic base64(...)"
    from requests.auth import HTTPBasicAuth, HTTPDigestAuth

    ret = requests.get('https://api.github.com/user', auth=HTTPBasicAuth('admin', 'admin'))
    print(ret.text)

2.10 timeout
    # ret = requests.get('http://google.com/', timeout=1)       # connect timeout 1s
    # print(ret)
    # ret = requests.get('http://google.com/', timeout=(5, 1))  # connect 5s, read 1s
    # print(ret)

2.11 allow_redirects -- whether to follow redirects to the final response
    ret = requests.get('http://127.0.0.1:8000/test/', allow_redirects=False)
    print(ret.text)

2.12 stream -- large file download
    from contextlib import closing
    with closing(requests.get('http://httpbin.org/get', stream=True)) as r1:
        # Process the response chunk by chunk here.
        for i in r1.iter_content():
            print(i)

2.13 cert -- client certificate
    # Well-known sites (Baidu, Tencent, ...): no certificate needed,
    # the system handles it. Custom certificates:
    requests.get('http://127.0.0.1:8000/test/', cert="xxxx/xxx/xxx.pem")
    requests.get('http://127.0.0.1:8000/test/', cert=("xxxx/xxx/xxx.pem", "xxx.xxx.xx.key"))

2.14 verify=False -- skip certificate verification
"""
requests.get('http://127.0.0.1:8000/test/', cert="xxxx/xxx/xxx.pem")
示例
import requests from urllib.parse import urlencode # 请求方式 kwords = input("请输入关键字:>>").strip() res = urlencode({"wd":kwords}) # # 请求的url,当你在百度输入中文的时候,你把url拿下来会变成下面的这样格式的url,所以得urlencode一下 url ="https://www.baidu.com/s?"+res #https://www.baidu.com/s?wd=%E5%9B%BE%E7%89%87 response = requests.get( # 请求的url,当你在百度输入中文的时候,你把url拿下来会变成下面的这样格式的url url, # 请求头 headers={ "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (Khtml, like Gecko) Chrome/63.0.3239.108 Safari/537.36", }, ) with open("a.html","w",encoding="utf-8") as f: f.write(response.text) # print(response.status_code)
kwords = input("请输入关键字:>>").strip() response = requests.get( "https://www.baidu.com/s?", # 请求的url,当你在百度输入中文的时候,你把url拿下来会变成下面的这样格式的url params={ "wd":kwords, ‘pn‘:20 }, # 请求头 headers={ "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36", }, ) with open("b.html","w",encoding="utf-8") as f: f.write(response.text) # print(response.status_code)
bs4 模块
基本使用
from bs4 import BeautifulSoup import requests r1 = requests.get( ... ) print(r1.text) # 取出来内容 soup = BeautifulSoup(r1.text,‘html.parser‘) # (内容,解析器) # 标签对象 # content_list = soup.find(name=‘div‘,id=‘content-list‘) # find 取到第一个 find_all 取到所有 content_list = soup.find(name=‘div‘,attrs={"id":"content-list"}) # [标签对象,标签对象] item_list = content_list.find_all(name=‘div‘,attrs={‘class‘:‘item‘}) for item in item_list: a = item.find(name=‘a‘,attrs={‘class‘:‘show-content color-chag‘}) print(a.text.strip())
常用方法
r1 = requests.get(...)  # placeholder request
soup = BeautifulSoup(r1.text, 'html.parser')
# First matching tag only:
soup.find(name='a', attrs={'class': 'show-content color-chag'})
# Every matching tag:
soup.find_all(name='a', attrs={'class': 'show-content color-chag'})
简单爬虫示例
爬取抽屉,以及自动登陆抽屉点赞
先查看首页拿到cookie,然后登陆要携带首页拿到的 cookie 才可以通过验证
"""""" # ################################### 示例一:爬取数据(携带请起头) ################################### """ import requests from bs4 import BeautifulSoup r1 = requests.get( url=‘https://dig.chouti.com/‘, headers={ ‘user-agent‘:‘Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36‘ } ) soup = BeautifulSoup(r1.text,‘html.parser‘) content_list = soup.find(name=‘div‘,attrs={"id":"content-list"}) item_list = content_list.find_all(name=‘div‘,attrs={‘class‘:‘item‘}) for item in item_list: a = item.find(name=‘a‘,attrs={‘class‘:‘show-content color-chag‘}) print(a.text.strip()) """ # ################################### 示例二:登陆点赞 ################################### """ import requests # 1. 查看首页 r1 = requests.get( url=‘https://dig.chouti.com/‘, headers={ ‘user-agent‘:‘Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36‘ } ) # 2. 提交用户名和密码 r2 = requests.post( url=‘https://dig.chouti.com/login‘, headers={ ‘user-agent‘:‘Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36‘ }, data={ ‘phone‘:‘8613121758648‘, ‘password‘:‘woshiniba‘, ‘oneMonth‘:1 }, cookies=r1.cookies.get_dict() # 套路 正常用户必然会先访问首页然后再登陆 # 如果你直接登陆必然是爬虫,因此设计在第一次访问首页的时候先创建cookie 并且返回了回去 # 并且要求你第二次访问的时候要带着这个 cookie ) # 3. 
点赞 r3 = requests.post( url=‘https://dig.chouti.com/link/vote?linksId=20435396‘, headers={ ‘user-agent‘:‘Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36‘ }, cookies=r1.cookies.get_dict() ) print(r3.text) """ # ############## 方式二 session 方式 ############## """ # 用 session 自动封装好 cookie 不用在以后自己携带 import requests session = requests.Session() i1 = session.get(url="http://dig.chouti.com/help/service") i2 = session.post( url="http://dig.chouti.com/login", data={ ‘phone‘: "8615131255089", ‘password‘: "xxooxxoo", ‘oneMonth‘: "" } ) i3 = session.post( url="http://dig.chouti.com/link/vote?linksId=8589523" ) print(i3.text) """
爬取拉勾网
请求头中存在自定义的验证字段,要想办法拿到才可以正确爬取,以及 Referer 的使用
import re import requests """ 密码加密了的时候 找js 通过 python 实现加密方式 直接把加密后的密文拿来用 """ r1 = requests.get( url=‘https://passport.lagou.com/login/login.html‘, headers={ ‘User-Agent‘: ‘Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36‘, } ) """ 有两个奇怪的东西,是网站的防御机制 这两个数据必然是对方发给我们的 要不在响应头里面,要不在响应体里面 响应头看不到。那就去响应体里面找。 """ # 因为不是写在标签里面的。只能用正则来拿了 X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = ‘(.*?)‘", r1.text, re.S)[0] X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = ‘(.*?)‘", r1.text, re.S)[0] # print(X_Anti_Forge_Token, X_Anti_Forge_Code) r2 = requests.post( url=‘https://passport.lagou.com/login/login.json‘, headers={ ‘User-Agent‘: ‘Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36‘, ‘X-Anit-Forge-Code‘:X_Anti_Forge_Code, ‘X-Anit-Forge-Token‘:X_Anti_Forge_Token, ‘Referer‘: ‘https://passport.lagou.com/login/login.html‘, # 上一次请求地址是什么?很多网站会要求带着个才可以继续 }, data={ "isValidate": True, ‘username‘: ‘15131255089‘, ‘password‘: ‘ab18d270d7126ea65915c50288c22c0d‘, # 直接发密文了 ‘request_form_verifyCode‘: ‘‘, ‘submit‘: ‘‘ }, cookies=r1.cookies.get_dict() ) print(r2.text)
自动登陆GitHub
csrf_token 的验证
"""""" # ################################### 示例三:自动登录GitHub ################################### # 1. GET,访问登录页面 """ - 去HTML中找隐藏的Input标签获取csrf token - 获取cookie """ # 2. POST,用户名和密码 """ - 发送数据: - csrf - 用户名 - 密码 - 携带cookie """ # 3. GET,访问https://github.com/settings/emails """ - 携带 cookie """ import requests from bs4 import BeautifulSoup # ########################################################## # 访问登陆页面,获取 authenticity_token i1 = requests.get( url=‘https://github.com/login‘ ) soup1 = BeautifulSoup(i1.text, features=‘lxml‘) tag = soup1.find(name=‘input‘, attrs={‘name‘: ‘authenticity_token‘}) authenticity_token = tag.get(‘value‘) # authenticity_token 拿到 c1 = i1.cookies.get_dict() i1.close() # 携带authenticity_token和用户名密码等信息,发送用户验证 form_data = { "authenticity_token": authenticity_token, # 放在请求体中发过去 "utf8": "", "commit": "Sign in", "login": "", ‘password‘: ‘‘ } i2 = requests.post( url=‘https://github.com/session‘, data=form_data, cookies=c1 ) c2 = i2.cookies.get_dict() c1.update(c2) # 将两次的 cookie 整合一起 i3 = requests.get(‘https://github.com/settings/repositories‘, cookies=c1) soup3 = BeautifulSoup(i3.text, features=‘lxml‘) list_group = soup3.find(name=‘div‘, class_=‘listgroup‘) from bs4.element import Tag for child in list_group.children: if isinstance(child, Tag): project_tag = child.find(name=‘a‘, class_=‘mr-1‘) size_tag = child.find(name=‘small‘) temp = "项目:%s(%s); 项目路径:%s" % (project_tag.get(‘href‘), size_tag.string, project_tag.string, ) print(temp)
总结
请求头:
user-agent
referer
host
cookie
特殊请求头,查看上一次请求获取内容。
'X-Anit-Forge-Code': ...  'X-Anit-Forge-Token': ...
请求体:
- 原始数据 - 原始数据 + token - 密文 - 找算法 - 使用密文
套路:
- post登录获取cookie,以后携带cookie
- get获取未授权cookie,post登录携带cookie去授权,以后携带cookie
以上是关于爬虫 requests,bs4 用法示例的主要内容,如果未能解决你的问题,请参考以下文章