urllib库中常见的类和方法
Posted qyan-blog
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了urllib库中常见的类和方法相关的知识,希望对你有一定的参考价值。
from urllib.request import urlopen
from http.client import HTTPResponse
# Fetch a page with the simplest urlopen() call and inspect the response.
# For http(s) URLs urlopen() returns an http.client.HTTPResponse, which is a
# context manager: leaving the `with` block closes the connection.
with urlopen('http://www.baidu.com') as response:
    print(type(response))
    print(1, response.status)
    print(2, response.reason)
    print(3, response.geturl())
    print(4, response.info())  # response headers
    print(5, response.read())
# urlopen 只能传递url和data,不能自定义HTTP请求(例如请求头),所以需要用Request类来实现
# 初始化方法,构造一个请求对象 可以添加一个headers字典,data参数决定是GET或者POST
# add_header(key,value)也可以为headers中增加一个键值对
from urllib.request import Request,urlopen
import random
# Build a Request with a randomly chosen User-agent header and send it.
url = 'http://www.bing.com'
ua_list = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11 ",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
]
ua = random.choice(ua_list)

req = Request(url)  # a urllib.request.Request instance
req.add_header('User-agent', ua)
print('type_req', type(req))

response = urlopen(req, timeout=20)  # an http.client.HTTPResponse instance
print(type(response))
with response:
    print(1, response.status, response.getcode(), response.reason)
    print(2, response.geturl())
    print(3, response.info())  # response headers
    print(4, response.read())

# The request object is still usable after the response has been closed.
print(5, req.get_header('User-agent'))
# Shows the capitalized spelling of the header name used for the lookup above.
print(6, 'user-agent'.capitalize())
# urllib.parse 模块
from urllib import parse
# urlencode() turns a mapping into a percent-encoded query string.
params = {
    'url': 'http://www.baidu.com',
    'p_url': 'http://www.baidu.com',
}
x = parse.urlencode(params)
print(x)

u = parse.urlencode({'wd': '中国'})  # encode: non-ASCII text becomes %XX escapes
print(u)
url = "https://www.baidu.com/s?{}".format(u)
print(url)

# The raw UTF-8 bytes, for comparison with the %XX escapes printed above.
print('中国'.encode('utf-8'))
print(parse.unquote(u))  # decode
print(parse.unquote(url))
需求:通过关键字在bing中搜索,返回结果保存在html文件中
from urllib.request import Request,urlopen
from urllib.parse import urlencode
import random
# Search Bing for a keyword typed by the user and save the result page to 1.html.
keyword = input("请输入关键字")
data = urlencode({'q': keyword})  # percent-encode the user input for the query string
base_url = 'http://cn.bing.com/search'
url = '{}?{}'.format(base_url, data)
print(url)

ua_list = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11 ",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
]
ua = random.choice(ua_list)
req = Request(url, headers={'User-agent': ua})

# Binary mode: the body is written exactly as received, without decoding.
with urlopen(req) as response, open('1.html', 'wb') as f:
    f.write(response.read())
print("success")
# POST方法
from urllib.request import Request,urlopen
from urllib.parse import urlencode
import simplejson
import random
# POST form data, once as a raw (un-encoded) body and once URL-encoded.
ua_list = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11 ",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
]
ua = random.choice(ua_list)

# The host was garbled as 'httonin.org' in the original text.
req = Request('http://httpbin.org/post')
req.add_header('User-agent', ua)

data = urlencode({'name': '张三,@=/&*', 'age': '6'})
print(data)

# Supplying `data` makes urlopen() send a POST.
# First the body without URL-encoding, for comparison; each response is
# closed by its own `with` block so no connection is leaked.
with urlopen(req, data='name=张三,@=/&*,&age=6'.encode()) as res1:
    print(res1.read())

# Then the properly URL-encoded form submission.
with urlopen(req, data=data.encode()) as res2:
    print(res2.read())
# 豆瓣https://movie.douban.com/explore#!type=movie&tag=%E7%83%AD%E9%97%A8&sort=rank&page_limit=20&page_start=0
from urllib.request import Request,urlopen
from urllib.parse import urlencode
import random
# Send the same query parameters to Douban once via POST and once via GET.
ua_list = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11 ",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
]
ua = random.choice(ua_list)

# NOTE(review): everything after '#' is a URL fragment, which is not sent to
# the server, so the query string appended after '#!' in the GET below most
# likely never reaches Douban. Confirm the intended endpoint before relying
# on the GET result.
url = 'https://movie.douban.com/explore#!'
req = Request(url)
req.add_header('User-agent', ua)

data = urlencode({
    'type': 'movie',
    'tag': '热门',
    'sort': 'rank',
    'page_limit': 8,
    'page_start': 10,
})

# POST: supplying `data` makes urlopen() send the parameters in the body.
res = urlopen(req, data=data.encode())
with res:
    print(res._method)  # private attribute; printed only to show the verb
    print(1, res.read().decode())

# GET: parameters go into the URL. A Request object is used (instead of a
# bare URL string) so that the chosen User-agent is sent here as well.
get_req = Request('{}?{}'.format(url, data), headers={'User-agent': ua})
with urlopen(get_req) as res:
    print(res._method)
    print(2, res.read().decode())
from urllib.request import Request,urlopen
import ssl
import random
# Fetch an HTTPS page while accepting an untrusted server certificate.
ua_list = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11 ",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
]
ua = random.choice(ua_list)

request = Request('https://www.12306.cn/mormhweb/')
request.add_header('User-agent', ua)

# SECURITY: certificate and hostname verification are deliberately disabled
# so that an untrusted certificate is accepted. This exposes the connection
# to man-in-the-middle attacks; do not use it for anything sensitive.
# Built from the public ssl API rather than ssl._create_unverified_context().
context = ssl.create_default_context()
context.check_hostname = False  # must be turned off before verify_mode
context.verify_mode = ssl.CERT_NONE

with urlopen(request, context=context) as res:
    print(res._method)  # private attribute; printed only to show the verb
    print(res.geturl())
    print(res.read().decode())
标准库urllib缺少一些关键功能,第三方库urllib3提供了这些功能,比如连接池管理
import urllib3
import random
# Issue a GET through a urllib3 PoolManager with a random User-agent.
url = 'https://movie.douban.com'
ua_list = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11 ",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
]
ua = random.choice(ua_list)

# The PoolManager is used as a context manager and is cleaned up on exit.
with urllib3.PoolManager() as http:
    response = http.request('GET', url, headers={'User-agent': ua})
    print(type(response))  # urllib3's own HTTPResponse class
    print(response.status, response.reason)
    print(response.headers)
    print(response.data)
requests库使用了urllib3库,提供了更友好的API
import requests
import random
# Fetch a page with requests, print its metadata, and save the HTML to disk.
url = 'https://movie.douban.com'
ua_list = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11 ",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
]
ua = random.choice(ua_list)

response = requests.request('GET', url, headers={'User-Agent': ua})
with response:
    print(type(response))
    print(response.url)
    print(response.status_code)
    print(response.request.headers)  # request headers
    print(response.headers)  # response headers
    print(response.text)
    # Write the decoded body as UTF-8 text.
    with open('movie.html', 'w', encoding='utf-8') as f:
        f.write(response.text)
requests默认使用了Session对象,是为了多次与服务器交互保留会话信息:
# 直接使用session
import requests
import random
# Reuse one requests.Session for several requests and print the headers and
# cookies of each exchange.
ua_list = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11 ",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
]
ua = random.choice(ua_list)
urls = ['https://www.baidu.com', 'https://www.baidu.com']

# The session is a context manager and is closed when the block exits.
with requests.Session() as session:
    print(type(session))
    for url in urls:
        response = session.get(url, headers={'User-agent': ua})
        with response:
            print(type(response))  # requests.models.Response
            print(response.url)
            print(response.status_code)
            print('headers', response.request.headers)
            print('cookie', response.cookies)
            print(response.text[:20])
以上是关于urllib库中常见的类和方法的主要内容,如果未能解决你的问题,请参考以下文章