requests库的基础知识

Posted 嘟嘟小冰

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了requests库的基础知识相关的知识,希望对你有一定的参考价值。

1.安装。

cmd----------->> pip install requests.

2. 七种操作方法。

# GET        全部信息
# HEADER     仅头部信息

# Put        全部重置 
# Patch 局部更新 ## 更改操作用户
# POST 后面添加新内容 ## 搜索使用 # DELETE 删除全部 import requests \'\'\' r = requests.get("http://www.baidu.com") # 获得全部文本信息 uRL对应的页面内容 print(r.headers) # 头部信息 print(r.text) # seem is also all information \'\'\' # requests.head \'\'\' r2 = requests.head("http://www.baidu.com") # just head information print(r.headers) # head information print(r2.text) # no ! because just get the head information \'\'\' # payload = {"key1":"value1","key2":"value2"} r3 = requests.post("http://www.baidu.com",data=payload) print(r3.text)

2.Response对象的属性。

import requests
r = requests.get("http://www.baidu.com")

print(r.status_code)           # HTTP请求的返回状态,200表示连接成功。404表示失败
print(r.text)                  # HTTP响应内容的字符串形式,即,uRL对应的页面内容
print(r.encoding)              # 从HTTP header中猜测的响应内容编码方式
print(r.apparent_encoding)     # 内容中分析出的响应内容编码方式
print(r.content)               # 响应内容的二进制形式 (处理图片,视频等使用

r.encoding = r.apparent_encoding # 转化编码   r.apparent_encoding  根据它的结果转码
print(r.text)

小结:通过 r.status_code 返回的状态码,判断是否连接成功。

3.通用代码框架

def gethtmlText(url):
    try:
        r = requests.get(url,timeout = 30)
        r.raise_for_status()   # 如果状态是200,引发异常
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return "404"

if __name__ == "__main__":      # 没搞懂这个是什么鬼
    url = "http://www.baidu.com"
    print(getHTMLText(url))

 

4.ROBOTS.txt协议。

实战练习。

1.京东页面的提取。

 1 \'\'\'
 2 import requests
 3 r = requests.get(\'https://item.jd.com/13115733485.html\')
 4 print(r.status_code)
 5 print(r.encoding)
 6 print(r.text)
 7 \'\'\'
 8 
 9 \'\'\'
10 import requests
11 url = \'https://item.jd.com/13115733485.html\'
12 try:
13     r = requests.get(url, timeout=30)
14     r.raise_for_status()
15     r.encoding = r.apparent_encoding
16     print(r.text[:1000]) 
17 except:
18     print("404")
19 \'\'\'
20 
21 import requests
22 def getHTMLText(url):
23     try:
24         r = requests.get(url,timeout = 30)
25         r.raise_for_status()   # 如果状态是200,引发异常
26         r.encoding = r.apparent_encoding
27         return r.text
28     except:
29         return "404"
30 
31 if __name__ == "__main__":
32     url = "https://item.jd.com/13115733485.html"
33     print(getHTMLText(url)[:1000])

2.亚马逊。

\'\'\'
import requests
r = requests.get("https://www.amazon.cn/gp/product/B00PG0MMLO/ref=s9_acsd_al_bw_c_x_5_w?pf_rd_m=A1AJ19PSB66TGU&pf_rd_s=merchandised-search-5&pf_rd_r=N8WFJMBB60D92VPHAREM&pf_rd_r=N8WFJMBB60D92VPHAREM&pf_rd_t=101&pf_rd_p=26451395-7952-4f3c-b948-09e79ff542f8&pf_rd_p=26451395-7952-4f3c-b948-09e79ff542f8&pf_rd_i=1885051071")
print(r.status_code)                     # 503   连接出错,不是404
print(r.encoding)                        # ISO-8859-1
r.encoding = r.apparent_encoding         # 转码
print(r.text)           # 有反应,说明受限  # 报歉,由于程序执行时,遇到意外错误,您刚刚操作没有执行成功,请稍后重试。或将此错误报告给我们的客服中心

print(r.request.headers)   # 获取请求头部信息     # {\'User-Agent\': \'python-requests/2.14.2\', \'Accept-Encoding\': \'gzip, deflate\', \'Accept\': \'*/*\', \'Connection\': \'keep-alive\'}
\'\'\'

\'\'\'
import requests
kv = {\'User-Agent\':"Mazilla/5.0"}
url = "https://www.amazon.cn/gp/product/B00PG0MMLO/ref=s9_acsd_al_bw_c_x_5_w?pf_rd_m=A1AJ19PSB66TGU&pf_rd_s=merchandised-search-5&pf_rd_r=N8WFJMBB60D92VPHAREM&pf_rd_r=N8WFJMBB60D92VPHAREM&pf_rd_t=101&pf_rd_p=26451395-7952-4f3c-b948-09e79ff542f8&pf_rd_p=26451395-7952-4f3c-b948-09e79ff542f8&pf_rd_i=1885051071"

r = requests.get(url,headers= kv)       # 更改头部信息
print(r.status_code)                    # 200
print(r.request.headers)                # {\'User-Agent\': \'Mazilla/5.0\', \'Accept-Encoding\': \'gzip, deflate\', \'Accept\': \'*/*\', \'Connection\': \'keep-alive\'}
print(r.text[:1000])
\'\'\'

import requests
def getHTMLText(url):
    try:
        kv = {\'User-Agent\': "Mazilla/5.0"}
        r = requests.get(url,headers= kv,timeout = 30)         # headers=
        r.raise_for_status()   # 如果状态是200,引发异常
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return "404"

if __name__ == "__main__":
    url = "https://www.amazon.cn/gp/product/B00PG0MMLO/ref=s9_acsd_al_bw_c_x_5_w?pf_rd_m=A1AJ19PSB66TGU&pf_rd_s=merchandised-search-5&pf_rd_r=N8WFJMBB60D92VPHAREM&pf_rd_r=N8WFJMBB60D92VPHAREM&pf_rd_t=101&pf_rd_p=26451395-7952-4f3c-b948-09e79ff542f8&pf_rd_p=26451395-7952-4f3c-b948-09e79ff542f8&pf_rd_i=1885051071"
    print(getHTMLText(url)[:1000])

 

3.百度搜索

\'\'\'
import requests
kv = {"wd":"Python"}
r = requests.get("http://www.baidu.com/s",params=kv)
print(r.status_code)                  # 200
print(r.request.url)                  # http://www.baidu.com/s?wd=Python
print(len(r.text))                    # 196429
\'\'\'
import requests
keyword = "Python"
def getHTMLText(url):
    try:
        kv = {\'wd\':keyword}                                      # 如何添加的????
        r = requests.get(url,params=kv,timeout = 30)              # params
        r.raise_for_status()   # 如果状态是200,引发异常
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return "404"

if __name__ == "__main__":
    url = "http://www.baidu.com"
    print(getHTMLText(url)[:1000])

 

其他控制参数的使用方法:

 

实例 查询IP

import requests
url = "http://www.ip138.com/ips138.asp?ip="
r = requests.get(url+"202.204.80.112")           # 对URL内容进行修改
print(r.status_code)
print(r.encoding)
# r.encoding = "utf-8"
print(r.text[:-500])

下载图片:

\'\'\'
import requests
path = "F:/abc.jpg"
url = "http://image.nationalgeographic.com.cn/2017/0721/20170721020325584.jpg"
r = requests.get(url)
print(r.status_code)

with open(path,"wb") as f:         # 文件存储
    f.write(r.content)   # 文件写入          # r.content 响应内容的二进制形式  # 这个句子是什么意思来着??

\'\'\'

import requests
import os

url = "http://image.nationalgeographic.com.cn/2017/0721/20170721020325584.jpg"
root = \'F://pics//\'
path = root + url.split(\'/\')[-1]

if not os.path.exists(root):
    os.mkdir(root)
if not os.path.exists(path):
    r = requests.get(url)
    with open(path, \'wb\') as f:
        f.write(r.content)
        f.close()
        print(\'文件保存成功!\')
else:
    print(\'文件已存在。\')

遇到问题很多,需要多多练习呀!!!

以上是关于requests库的基础知识的主要内容,如果未能解决你的问题,请参考以下文章

Python爬虫之Requests库入门

爬虫基础(requests库的基本使用)--02

全方面的掌握Requests库的使用【python爬虫入门进阶】(02)

Urllib 库的基础和实用

Requests库的基本使用

爬虫3-Requests库的主要方法