A Detailed Guide to Python's urllib Library
Posted by qqw-1995
urllib is Python's built-in HTTP request library. It consists of four modules:
urllib.request — the request module
urllib.error — the exception handling module
urllib.parse — the URL parsing module
urllib.robotparser — the robots.txt parsing module (not covered in the walkthrough below; see the short sketch right after this list)
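Since the walkthrough code below only exercises the first three modules, here is a minimal sketch of urllib.robotparser. RobotFileParser fetches and parses a site's robots.txt and answers whether a crawler may fetch a path; the Baidu URLs are just sample targets, and the actual answer depends on the live robots.txt.

from urllib.robotparser import RobotFileParser

# Fetch and parse robots.txt, then ask whether a given user agent
# ('*' = any crawler here) is allowed to fetch a given path
rp = RobotFileParser()
rp.set_url('http://www.baidu.com/robots.txt')
rp.read()
print(rp.can_fetch('*', 'http://www.baidu.com/index.html'))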
import urllib.request
from urllib.parse import urlparse, urlunparse, urljoin, urlencode
import http.cookiejar
from urllib import error
import socket

# urlopen
# Sending a request:
# data = bytes(urllib.parse.urlencode({'world': 'hello'}), encoding='utf-8')
# response = urllib.request.urlopen('http://httpbin.org/post', data=data)  # POST request
# print(response.read())

# The response object and its type
# response = urllib.request.urlopen('https://www.python.org')
# print(type(response))

# Status code and response headers
# response = urllib.request.urlopen('http://www.python.org')
# print(response.status)
# print(response.getheaders())
# print(response.getheader('Server'))

# Reading the response body
# response = urllib.request.urlopen('http://www.python.org')
# print(response.read().decode('utf-8'))

# Request objects
# request = urllib.request.Request('http://www.python.org')
# response = urllib.request.urlopen(request)
# print(response.read().decode('utf-8'))

# Building a POST request with some headers
# url = 'http://httpbin.org/post'
# headers = {
#     'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
#     'Host': 'httpbin.org'
# }
# form_dict = {
#     'name': 'Germey'
# }
# data = bytes(urllib.parse.urlencode(form_dict), encoding='utf-8')  # form data
# req = urllib.request.Request(url=url, data=data, headers=headers, method='POST')
# response = urllib.request.urlopen(req)
# print(response.read().decode('utf-8'))

# Handlers
# Proxies: switch the outgoing IP
# proxy_handler = urllib.request.ProxyHandler({
#     'http': 'http://127.0.0.1:9743',
#     'https': 'https://127.0.0.1:9743'
# })
# opener = urllib.request.build_opener(proxy_handler)
# response = opener.open('http://httpbin.org/post')
# print(response.read())

# Cookies record the user's identity and keep the login session alive
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://baidu.com')
for item in cookie:
    print(item.name + '=' + item.value)

# To preserve the login state across runs, cookies can be saved to a text file
# filename = 'cookie.txt'
# cookie = http.cookiejar.LWPCookieJar(filename)
# handler = urllib.request.HTTPCookieProcessor(cookie)
# opener = urllib.request.build_opener(handler)
# response = opener.open('http://baidu.com')
# cookie.save(ignore_discard=True, ignore_expires=True)

# Loading the cookies back from the text file
# cookie = http.cookiejar.LWPCookieJar()
# cookie.load('cookie.txt', ignore_expires=True, ignore_discard=True)
# handler = urllib.request.HTTPCookieProcessor(cookie)
# opener = urllib.request.build_opener(handler)
# response = opener.open('http://www.baidu.com')
# print(response.read().decode('utf-8'))

# Exception handling: urllib's exceptions fall into two main classes, HTTPError
# and URLError. HTTPError is a subclass of URLError, so it must be caught first.
try:
    response = urllib.request.urlopen('http://cuiqingcai.com/index.htm')
except error.HTTPError as e:
    print(e.reason, e.code, e.headers, sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print('Request Successfully')

# Pinning down the exact cause with isinstance
try:
    response = urllib.request.urlopen('http://www.baidu.com', timeout=0.01)
except error.URLError as e:
    print(type(e.reason))
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')

# URL parsing
# urllib.parse.urlparse(url, scheme='', allow_fragments=True)
# urlparse splits a URL into its components: scheme, netloc, path, params, query, fragment
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(type(result), result)

# The scheme argument only takes effect when the URL itself carries no scheme;
# if the URL already has one, scheme is ignored
result = urlparse('www.baidu.com/index.html;user?id=5#comment', scheme='https')
print(result)
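A side note not in the original walkthrough: the ParseResult that urlparse returns is a named tuple, so its six fields can be read either by attribute or by index. A quick sketch using the same sample URL:

# ParseResult fields can be accessed by name or by position
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(result.scheme, result[0])  # http http
print(result.netloc, result[1])  # www.baidu.com www.baidu.com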
# The fragment is the anchor part of the URL; with allow_fragments=False it is
# folded into the preceding component (the query, or the path if there is no query)
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment', allow_fragments=False)
print(result)

# urlunparse assembles a URL from its six components. They must be an ordered
# sequence (list or tuple), not a set, or the parts come out scrambled
data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
print(urlunparse(data))  # http://www.baidu.com/index.html;user?a=6#comment

# urljoin joins URLs; components present in the second argument override
# those in the first
print(urljoin('http://www.baidu.com', 'FAQ.html'))  # http://www.baidu.com/FAQ.html
print(urljoin('http://www.baidu.com', 'https://cuiqingcai.com/FAQ.html'))  # https://cuiqingcai.com/FAQ.html

# urlencode converts a dict into GET request parameters
params = {
    'name': 'gemmey',
    'age': 22
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(params)
print(url)  # http://www.baidu.com?name=gemmey&age=22
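For completeness (not shown in the original code), urllib.parse can also go in the reverse direction: parse_qs turns a query string back into a dict of lists, and parse_qsl returns it as a list of (key, value) pairs. A minimal sketch using the query string produced above:

from urllib.parse import parse_qs, parse_qsl

# Decode a query string back into Python structures
query = 'name=gemmey&age=22'
print(parse_qs(query))   # {'name': ['gemmey'], 'age': ['22']}
print(parse_qsl(query))  # [('name', 'gemmey'), ('age', '22')]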