Using urllib2 and urllib in Python 2.x
1. The simplest usage
urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,...)
import urllib2

response = urllib2.urlopen("http://www.baidu.com")

print 'getcode():', response.getcode()
print 'geturl():', response.geturl()
print 'url:', response.url
print 'headers:\n', response.headers
print 'msg:', response.msg

#------------------------------------- output --------------------------------------
getcode(): 200
geturl(): http://www.baidu.com
url: http://www.baidu.com
headers:
Date: Thu, 29 Dec 2016 06:28:36 GMT
Content-Type: text/html; charset=utf-8
Transfer-Encoding: chunked
Connection: Close
Vary: Accept-Encoding
Set-Cookie: BAIDUID=9A1E663B4C3AB33D11266F0D865A1F59:FG=1; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
Set-Cookie: BIDUPSID=9A1E663B4C3AB33D11266F0D865A1F59; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
Set-Cookie: PSTM=1482992916; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
Set-Cookie: BDSVRTM=0; path=/
Set-Cookie: BD_HOME=0; path=/
Set-Cookie: H_PS_PSSID=21858_1464_21112_17001_21553_20930; path=/; domain=.baidu.com
P3P: CP=" OTI DSP COR IVA OUR IND COM "
Cache-Control: private
Cxy_all: baidu+0ba0b09e0fa305471b5e3b42c352570f
Expires: Thu, 29 Dec 2016 06:27:54 GMT
X-Powered-By: HPHP
Server: BWS/1.1
X-UA-Compatible: IE=Edge,chrome=1
BDPAGETYPE: 1
BDQID: 0x889c1bcd00004be7
BDUSERID: 0

msg: OK
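The timeout argument in the signature above deserves a mention: without it, a stalled server can block the call indefinitely. A minimal sketch (the 5-second value is an arbitrary choice for illustration):

import socket
import urllib2

try:
    # timeout is in seconds; a stalled connection raises instead of hanging
    response = urllib2.urlopen("http://www.baidu.com", timeout=5)
    print response.getcode()
except socket.timeout:
    print 'request timed out'
except urllib2.URLError, e:
    # urllib2 sometimes wraps the timeout in a URLError instead
    print 'failed:', e.reason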
Fetching the HTML content:
print response.read()        # returns the whole page as a single str
print response.readline()    # returns one line per call
print response.readlines()   # returns all lines as a list
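read() pulls the whole body into memory at once. Since the response object is file-like, a large page can instead be consumed in fixed-size chunks; a minimal sketch:

import urllib2

response = urllib2.urlopen("http://www.baidu.com")
while True:
    chunk = response.read(8192)   # up to 8 KB per call
    if not chunk:                 # an empty string means the body is exhausted
        break
    # process chunk here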
2. Building a Request and setting headers
def set_headers():
    # Build a Request and set its headers
    # Request.__init__(self, url, data=None, headers={}, origin_req_host=None, unverifiable=False)
    import urllib2
    headers = {'User-Agent': 'liubi-Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
    request = urllib2.Request("http://localhost:5000/urllib2testget", headers=headers)

    response = urllib2.urlopen(request)
    print request.headers
    print response.read()
    print '------------------------------------------------'
    # Append one more header
    request.add_header("addheader", "nice")
    response = urllib2.urlopen(request)
    print request.headers
    print response.read()

set_headers()

#-------------------------------- output:
{'User-agent': 'liubi-Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
{"a": "1", "": "2"}
------------------------------------------------
{'Addheader': 'nice', 'User-agent': 'liubi-Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
{"a": "1", "": "2"}
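Note the casing in the output: urllib2 normalizes header names with str.capitalize(), which is why 'addheader' comes back as 'Addheader' and 'User-Agent' as 'User-agent'. Lookups on a Request must use that normalized form, as in this small sketch (same hypothetical local endpoint as above):

import urllib2

req = urllib2.Request("http://localhost:5000/urllib2testget")
req.add_header("X-Demo-Header", "1")

# add_header() stores the key as "X-Demo-Header".capitalize() == "X-demo-header"
print req.has_header("X-demo-header")   # True
print req.get_header("X-demo-header")   # '1'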
3. Sending GET and POST requests
def get_post():
    # GET vs. POST
    import json
    import urllib
    import urllib2
    headers = {'User-Agent': 'liu bi'}
    values = {"username": "diaosir_get", "password": "diao123_get"}
    data = urllib.urlencode(values)
    print '---------------------get:'
    url = "http://localhost:5000/urllib2testget"
    get_url = url + "?" + data   # for GET, the encoded data goes into the query string
    request = urllib2.Request(get_url, headers=headers)
    response = urllib2.urlopen(request)
    print json.loads(response.read())
    print '---------------------post:'
    url = "http://localhost:5000/urllib2testpost"
    request = urllib2.Request(url, data, headers=headers)   # passing data makes it a POST
    response = urllib2.urlopen(request)
    print json.loads(response.read())

get_post()

#--------------------------------------------------------- output:
---------------------get:
{u'username': u'diaosir_get', u'password': u'diao123_get'}
---------------------post:
{u'username': u'diaosir_get', u'password': u'diao123_get'}
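urllib.urlencode() produces an application/x-www-form-urlencoded body. To POST a different content type, set the Content-Type header yourself; a sketch, assuming the same hypothetical test server would also accept a JSON body:

import json
import urllib2

payload = json.dumps({"username": "diaosir_json"})
request = urllib2.Request("http://localhost:5000/urllib2testpost", payload,
                          {'Content-Type': 'application/json'})
response = urllib2.urlopen(request)
print response.read()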
4. Setting up a proxy
def set_proxies():
    # 1. Create a ProxyHandler
    # 2. Build an opener
    # 3. Install the opener globally [optional]
    # 4. Use the opener to request the URL
    import urllib2
    enable_proxy = True
    proxy_handler = urllib2.ProxyHandler({"http": 'http://120.24.73.165:3128'})
    null_proxy_handler = urllib2.ProxyHandler({})
    if enable_proxy:
        opener = urllib2.build_opener(proxy_handler)   # opener that routes through the proxy
    else:
        opener = urllib2.build_opener(null_proxy_handler)
    request = urllib2.Request('http://www.baidu.com')
    print '--------------------- without proxy'
    response = urllib2.urlopen(request)
    print response.getcode(), request.host
    print '--------------------- with proxy'
    response = opener.open(request)
    print response.getcode(), request.host

set_proxies()

#---------------------------------------------------------- output:
--------------------- without proxy
200 www.baidu.com
--------------------- with proxy
200 120.24.73.165:3128
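Step 3 from the comments, installing the opener, was skipped in the example. urllib2.install_opener() makes the opener process-global, so plain urllib2.urlopen() calls are routed through the proxy as well. A minimal sketch, assuming the same example proxy is reachable:

import urllib2

proxy_handler = urllib2.ProxyHandler({"http": 'http://120.24.73.165:3128'})
opener = urllib2.build_opener(proxy_handler)
urllib2.install_opener(opener)   # every urllib2.urlopen() call now uses this opener

response = urllib2.urlopen('http://www.baidu.com')
print response.getcode()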
5. Debug mode: printing the raw HTTP exchange
def debug_set():
    # Turn on debug logging: the handler prints the raw request and response
    import urllib2

    httpHandler = urllib2.HTTPHandler(debuglevel=1)
    opener = urllib2.build_opener(httpHandler)
    urllib2.install_opener(opener)
    request = urllib2.Request('http://127.0.0.1:5000/urllib2testget?a=2&b=3', headers={'User-Agent': 'liubi00'})
    response = opener.open(request)
    print response.getcode(), response.read()

debug_set()

#------------------------------------------- output:
send: 'GET /urllib2testget?a=2&b=3 HTTP/1.1\r\nAccept-Encoding: identity\r\nHost: 127.0.0.1:5000\r\nConnection: close\r\nUser-Agent: liubi00\r\n\r\n'
reply: 'HTTP/1.0 200 OK\r\n'
header: Content-Type: text/html; charset=utf-8
header: Content-Length: 20
header: Server: Werkzeug/0.11.11 Python/2.7.12
header: Date: Fri, 30 Dec 2016 15:12:40 GMT
200 {"a": "2", "b": "3"}
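The original snippet also referenced an httpsHandler, which was undefined and has been dropped above. If the Python build includes SSL support, urllib2.HTTPSHandler accepts the same debuglevel flag, so HTTPS traffic can be traced the same way; a sketch:

import urllib2

# HTTPSHandler is only available when Python is compiled with SSL support
httpsHandler = urllib2.HTTPSHandler(debuglevel=1)
opener = urllib2.build_opener(httpsHandler)
response = opener.open('https://www.baidu.com')
print response.getcode()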
6. Capturing cookies and saving them to cookie.txt
import cookielib
import urllib2

def get_cookie():
    filename = 'cookie.txt'
    # Create a MozillaCookieJar instance to hold the cookies and later write them to a file
    cookie = cookielib.MozillaCookieJar(filename)
    # Use urllib2's HTTPCookieProcessor to build a cookie-handling handler
    handler = urllib2.HTTPCookieProcessor(cookie)
    # Build an opener from the handler
    opener = urllib2.build_opener(handler)
    request = urllib2.Request('http://www.baidu.com')
    request.add_header('User-Agent', 'fuckyou')
    response = opener.open(request)
    # Save the cookies to the file
    cookie.save(ignore_discard=True, ignore_expires=True)
    print response.getcode()

get_cookie()

#---------------------------------------------- output:
200
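If the cookies only need to be inspected rather than persisted, an in-memory CookieJar is enough; the jar is iterable. A minimal sketch:

import cookielib
import urllib2

cookie = cookielib.CookieJar()   # in-memory jar, no backing file
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
opener.open('http://www.baidu.com')

# a CookieJar iterates over the Cookie objects it holds
for c in cookie:
    print c.name, '=', c.value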
7. Making requests with saved cookies (see http://www.cnblogs.com/sysu-blackbear/p/3629770.html for more)
import cookielib
import urllib2

def use_cookie():
    # Read cookies back from cookie.txt and send them with the request
    cookie_file = 'cookie.txt'
    # Create a MozillaCookieJar instance
    cookie = cookielib.MozillaCookieJar(cookie_file)
    # Load the cookie contents from the file into the jar
    cookie.load(ignore_discard=True, ignore_expires=True)
    # Build the request
    req = urllib2.Request("http://www.baidu.com")
    # Build an opener with urllib2's build_opener
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
    response = opener.open(req)
    print response.read()
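In practice the save/load round trip is often unnecessary: reusing one cookie-aware opener keeps a session alive across requests. A sketch with hypothetical login and profile endpoints (the URLs and form fields are placeholders, not from the original):

import cookielib
import urllib
import urllib2

cookie = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))

login_data = urllib.urlencode({'username': 'user', 'password': 'pass'})
opener.open('http://localhost:5000/login', login_data)    # server sets a session cookie
response = opener.open('http://localhost:5000/profile')   # the cookie is sent back automatically
print response.read()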
8. Exception handling
def deal_errors():
    # Exception handling
    import urllib2

    # HTTPError
    req = urllib2.Request('http://blog.csdn.net/cqcre')
    try:
        urllib2.urlopen(req)
    except urllib2.HTTPError, e:
        print e.code
        print e.reason

    # URLError
    request = urllib2.Request('http://www.xxxxx.com')
    try:
        urllib2.urlopen(request)
    except urllib2.URLError, e:
        print e.reason

    # HTTPError & URLError together: HTTPError is a subclass of URLError,
    # so catching URLError and checking for a code attribute covers both
    req = urllib2.Request('http://blog.csdn.net/cqcre')
    try:
        urllib2.urlopen(req)
    except urllib2.URLError, e:
        if hasattr(e, "code"):
            print e.code
        if hasattr(e, "reason"):
            print e.reason
    else:
        print "OK"
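One detail worth knowing: an HTTPError is itself a file-like response object, so the error page body can still be read. A minimal sketch:

import urllib2

try:
    urllib2.urlopen('http://blog.csdn.net/cqcre')
except urllib2.HTTPError, e:
    # HTTPError doubles as a response: both the status code and the body are available
    print e.code
    print e.read()[:200]   # first 200 bytes of the error page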