Python爬虫系列-Requests库详解

Posted carious

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Python爬虫系列-Requests库详解相关的知识,希望对你有一定的参考价值。

Requests基于urllib3,比urllib更加方便,可以节约我们大量的工作,完全满足HTTP测试需求。

实例引入

 import requests
 
 response = requests.get('https://www.baidu.com/')
 print(type(response))
 print(response.status_code)
 print(type(response.text))
 print(response.cookies)

<class 'requests.models.Response'>
200
<class 'str'>
<RequestsCookieJar[<Cookie BDORZ=27315 for .baidu.com/>]>

各种请求方式

 import requests
 requests.post('http://httpbin.org/post')

<Response [200]>

 requests.put('http://httpbin.org/put')

<Response [200]>

 requests.delete('http://httpbin.org/delete')

<Response [200]>

 requests.head('http://httpbin.org/gett')

<Response [404]>

 requests.head('http://httpbin.org/get')

<Response [200]>

 requests.options('http://httpbin.org/get')

<Response [200]>

基本GET请求

 import requests
 
 response = requests.get('http://httpbin.org/get')
 print(response.text)

{
"args": {},
"headers": {
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate",
"Connection": "close",
"Host": "httpbin.org",
"User-Agent": "python-requests/2.20.1"
},
"origin": "58.34.235.37",
"url": "http://httpbin.org/get"
}

带参数的GET请求

 import requests
 response = requests.get('http://httpbin.org/get?name=germey&age=22')
 print(response.text)

{
"args": {
"age": "22",
"name": "germey"
},
"headers": {
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate",
"Connection": "close",
"Host": "httpbin.org",
"User-Agent": "python-requests/2.20.1"
},
"origin": "58.34.235.37",
"url": "http://httpbin.org/get?name=germey&age=22"
}

 import requests

 data = { 'name':'germery','age':22 }
 response = requests.get('http://httpbin.org/get',params=data)
 print(response.text)

{
"args": {
"age": "22",
"name": "germery"
},
"headers": {
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate",
"Connection": "close",
"Host": "httpbin.org",
"User-Agent": "python-requests/2.20.1"
},
"origin": "58.34.235.37",
"url": "http://httpbin.org/get?name=germery&age=22"
}

解析json

 import requests
 
 response = requests.get('http://httpbin.org/get')
 print(type(response.text))

<class 'str'>

 print(response.json())  # 与json.loads(response.text)完全一样

{'args': {}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Connection': 'close', 'Host': 'httpbin.org', 'User-Agent': 'python-requests/2.20.1'}, 'origin': '58.34.235.37', 'url': 'http://httpbin.org/get'}

 print(type(response.json()))

<class 'dict'>

获取二进制数据

 import requests

 response = requests.get('https://github.com/favicon.ico')
 print(type(response.text),type(response.content))

<class 'str'> <class 'bytes'>

下载图片

 import requests
 response = requests.get('https://github.com/favicon.ico')
 with open('favicon.ico','wb') as f:
     f.write(response.content)

添加headers

 import requests
 headers = { 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36' }
 response = requests.get('https://www.zhihu.com/explore',headers=headers)

基本POST请求

import requests
data = { 'name':'germey','age':22 }
response = requests.post('http://httpbin.org/post',data=data)
print(response.text)

{
"args": {},
"data": "",
"files": {},
"form": {
"age": "22",
"name": "germey"
},
"headers": {
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate",
"Connection": "close",
"Content-Length": "18",
"Content-Type": "application/x-www-form-urlencoded",
"Host": "httpbin.org",
"User-Agent": "python-requests/2.20.1"
},
"json": null,
"origin": "58.34.235.37",
"url": "http://httpbin.org/post"
}

添加headers

 import requests
 
 data = {'name':'germey','age':22}
 headers = { 'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36' }
 response = requests.post('http://httpbin.org/post',data=data,headers=headers)
 print(response.json())

{'args': {}, 'data': '', 'files': {}, 'form': {'age': '22', 'name': 'germey'}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Connection': 'close', 'Content-Length': '18', 'Content-Type': 'application/x-www-form-urlencoded', 'Host': 'httpbin.org', 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}, 'json': None, 'origin': '58.34.235.37', 'url': 'http://httpbin.org/post'}

响应

response属性

 import requests

 response = requests.get('http://www.jianshu.com')
 print(type(response.status_code),response.status_code)

<class 'int'> 403

 print(type(response.headers),response.headers)

<class 'requests.structures.CaseInsensitiveDict'> {'Date': 'Tue, 27 Nov 2018 20:03:06 GMT', 'Content-Type': 'text/html', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Server': 'Tengine', 'Strict-Transport-Security': 'max-age=31536000; includeSubDomains; preload', 'Content-Encoding': 'gzip', 'X-Via': '1.1 dianxinxiazai180:5 (Cdn Cache Server V2.0), 1.1 PSjsntdx3xf38:1 (Cdn Cache Server V2.0)'}

 print(type(response.cookies),response.cookies)

<class 'requests.cookies.RequestsCookieJar'> <RequestsCookieJar[]>

 print(type(response.url),response.url)

<class 'str'> https://www.jianshu.com/

 print(type(response.history),response.history)

<class 'list'> [<Response [301]>]

状态码判断:

 import requests
 
 response = requests.get('http://www.jianshu.com')
 exit() if not response.status_code==requests.codes.ok else print('Request Successfully')

也可以直接与状态码数值比较:

 import requests
 
 response = requests.get('http://www.jianshu.com')
 exit() if not response.status_code==200 else print('Request Successfully')

Requests高级操作

文件上传

 import requests
 
 files = {'files':open('favicon.ico','rb')}
 response = requests.post('http://httpbin.org/post',files=files)
 print(response.text)

{
"args": {},
"data": "",
"files": {
"files": "内容省略"
},
"form": {},
"headers": {
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate",
"Connection": "close",
"Content-Length": "148",
"Content-Type": "multipart/form-data; boundary=6e864227a6fd1cd7a1655802d20d7bd9",
"Host": "httpbin.org",
"User-Agent": "python-requests/2.20.1"
},
"json": null,
"origin": "58.34.235.37",
"url": "http://httpbin.org/post"
}

获取cookie值

 import requests
 
 response = requests.get('http://www.baidu.com')
 print(response.cookies)

<RequestsCookieJar[<Cookie BDORZ=27315 for .baidu.com/>]>

 for key,value in response.cookies.items():
     print(key+'='+value)

BDORZ=27315

会话维持 模拟登录

 import requests
 requests.get('http://httpbin.org/cookies/set/number/123456789')

<Response [200]>

 response = requests.get('http://httpbin.org/cookies')
 print(response.text)

{
"cookies": {}
}

 s = requests.Session()
 s.get('http://httpbin.org/cookies/set/number/123456789')

<Response [200]>

 response = s.get('http://httpbin.org/cookies')
 print(response.text)

{
"cookies": {
"number": "123456789"
}
}

证书验证

 import requests
 
 response = requests.get('https://www.12306.cn')
 print(response.status_code)
 response = requests.get('https://www.12306.cn',verify=False)

/home/dex/.local/lib/python3.6/site-packages/urllib3/connectionpool.py:847: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
InsecureRequestWarning)
/home/dex/.local/lib/python3.6/site-packages/urllib3/connectionpool.py:847: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
InsecureRequestWarning)

添加证书

response = requests.get('https://www.12306.cn',cert=('/path/server.crt','/path/key'))


























































































以上是关于Python爬虫系列-Requests库详解的主要内容,如果未能解决你的问题,请参考以下文章

爬虫系列

全网最全Requests库详解,实例引入,代码分析(ip代理,用户认证,证书检测)

Requests库详解

Python爬虫:Requests库

Python爬虫之Requests库入门

Python爬虫Requests库网络爬虫实战