Using urllib2 and urllib in Python 2.x

1. The simplest usage

  urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,...)

import urllib2

response = urllib2.urlopen("http://www.baidu.com")

print 'getcode():', response.getcode()
print 'geturl():', response.geturl()
print 'url:', response.url
print 'headers:\n', response.headers
print 'msg:', response.msg
12 
#-------------------------------------out--------------------------------------
getcode(): 200
geturl(): http://www.baidu.com
url: http://www.baidu.com
headers:
Date: Thu, 29 Dec 2016 06:28:36 GMT
Content-Type: text/html; charset=utf-8
Transfer-Encoding: chunked
Connection: Close
Vary: Accept-Encoding
Set-Cookie: BAIDUID=9A1E663B4C3AB33D11266F0D865A1F59:FG=1; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
Set-Cookie: BIDUPSID=9A1E663B4C3AB33D11266F0D865A1F59; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
Set-Cookie: PSTM=1482992916; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
Set-Cookie: BDSVRTM=0; path=/
Set-Cookie: BD_HOME=0; path=/
Set-Cookie: H_PS_PSSID=21858_1464_21112_17001_21553_20930; path=/; domain=.baidu.com
P3P: CP=" OTI DSP COR IVA OUR IND COM "
Cache-Control: private
Cxy_all: baidu+0ba0b09e0fa305471b5e3b42c352570f
Expires: Thu, 29 Dec 2016 06:27:54 GMT
X-Powered-By: HPHP
Server: BWS/1.1
X-UA-Compatible: IE=Edge,chrome=1
BDPAGETYPE: 1
BDQID: 0x889c1bcd00004be7
BDUSERID: 0

msg: OK
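The timeout argument in the urlopen signature above is worth exercising explicitly: pass a number of seconds and the call gives up when the server is too slow. A minimal sketch (the 3-second value is arbitrary; a timeout usually surfaces as urllib2.URLError, or as socket.timeout while reading):

import urllib2
import socket

try:
    # Give up if no response arrives within 3 seconds
    response = urllib2.urlopen("http://www.baidu.com", timeout=3)
    print response.getcode()
except urllib2.URLError, e:
    print 'request failed:', e.reason
except socket.timeout:
    print 'request timed out while reading'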

Reading the HTML content

print response.read()      # returns the whole page as one str
print response.readline()  # returns one line per call
print response.readlines() # returns all lines as a list
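Note that read() returns raw bytes (a Python 2 str); to get unicode you have to decode it yourself. A small sketch, assuming the server declares a charset in its Content-Type header and falling back to utf-8 otherwise:

import urllib2

response = urllib2.urlopen("http://www.baidu.com")
raw = response.read()  # bytes (str in Python 2)
# headers.getparam('charset') pulls the charset out of Content-Type, if present
charset = response.headers.getparam('charset') or 'utf-8'
html = raw.decode(charset)  # now a unicode object
print type(raw), type(html)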

2. Building a Request and setting headers

def set_headers():
    # Build a Request and set its headers
    # __init__(self, url, data=None, headers={}, origin_req_host=None, unverifiable=False)
    import urllib2
    headers = {'User-Agent': 'liubi-Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
    request = urllib2.Request("http://localhost:5000/urllib2testget", headers=headers)

    response = urllib2.urlopen(request)
    print request.headers
    # Append one more header
    request.add_header("addheader", "nice")
    response = urllib2.urlopen(request)
    print request.headers

set_headers()

#-------------------------------- Output:

{'User-agent': 'liubi-Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
{"a": "1", "": "2"}
------------------------------------------------
{'Addheader': 'nice', 'User-agent': 'liubi-Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
{"a": "1", "": "2"}
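Besides setting headers on each individual Request, you can attach default headers to an opener so that every request made through it carries them. A quick sketch (the header values here are made up):

import urllib2

opener = urllib2.build_opener()
# addheaders is sent with every request this opener makes
opener.addheaders = [('User-Agent', 'liubi-spider/1.0'),
                     ('Accept-Language', 'zh-CN,zh;q=0.8')]
response = opener.open("http://localhost:5000/urllib2testget")
print response.getcode()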

3. Sending GET and POST requests

def get_post():
    import urllib2
    import urllib
    import json
    headers = {'User-Agent': 'liu bi'}
    values = {"username": "diaosir_get", "password": "diao123_get"}
    data = urllib.urlencode(values)
    # GET: append the encoded parameters to the URL
    print '---------------------get:'
    url = "http://localhost:5000/urllib2testget"
    get_url = url + "?" + data
    request = urllib2.Request(get_url, headers=headers)
    response = urllib2.urlopen(request)
    print json.loads(response.read())
    # POST: pass the encoded parameters as the data argument
    print '---------------------post:'
    url = "http://localhost:5000/urllib2testpost"
    request = urllib2.Request(url, data, headers=headers)
    response = urllib2.urlopen(request)
    print json.loads(response.read())

get_post()

#--------------------------------------------------------- Output:
---------------------get:
{u'username': u'diaosir_get', u'password': u'diao123_get'}
---------------------post:
{u'username': u'diaosir_get', u'password': u'diao123_get'}
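The encoding above is done by urllib.urlencode. Two details worth knowing: passing doseq=1 expands list values into repeated parameters, and urllib.quote_plus encodes a single value on its own. A short sketch (key order in the output may vary):

import urllib

params = {"q": "python urllib2", "tag": ["a", "b"]}
# Without doseq the list is stringified wholesale; with doseq=1 it becomes repeated keys
print urllib.urlencode(params)
print urllib.urlencode(params, doseq=1)   # e.g. tag=a&tag=b&q=python+urllib2
# quote_plus encodes one value, turning spaces into '+'
print urllib.quote_plus("python urllib2")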

4. Configuring a proxy

import urllib2

def set_proxies():
    # 1. Create a ProxyHandler
    # 2. Build an opener from it
    # 3. Install the opener [optional]
    # 4. Use the opener to fetch the URL
    enable_proxy = True
    proxy_handler = urllib2.ProxyHandler({"http": "http://120.24.73.165:3128"})
    null_proxy_handler = urllib2.ProxyHandler({})
    if enable_proxy:
        opener = urllib2.build_opener(proxy_handler)  # opener that routes through the proxy
    else:
        opener = urllib2.build_opener(null_proxy_handler)
    request = urllib2.Request("http://www.baidu.com")
    print '---------------------without proxy'
    response = urllib2.urlopen(request)
    print response.getcode(), request.host
    print '---------------------with proxy'
    response = opener.open(request)
    print response.getcode(), request.host

set_proxies()

#---------------------------------------------------------- Output
---------------------without proxy
200 www.baidu.com
---------------------with proxy
200 120.24.73.165:3128
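If the proxy requires authentication, the credentials can be embedded in the proxy URL in user:password@host:port form. A sketch (the credentials and the proxy address are placeholders):

import urllib2

proxy_handler = urllib2.ProxyHandler({"http": "http://user:pass@120.24.73.165:3128"})
opener = urllib2.build_opener(proxy_handler)
# install_opener makes the plain urllib2.urlopen use this opener globally
urllib2.install_opener(opener)
print urllib2.urlopen("http://www.baidu.com").getcode()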

5. Debug mode (the undefined httpsHandler must be dropped from the urllib2.build_opener call)

def debug_set():
    # Debug logging
    import urllib2

    # With debuglevel=1 the handler prints the raw HTTP traffic to stdout
    httpHandler = urllib2.HTTPHandler(debuglevel=1)
    opener = urllib2.build_opener(httpHandler)
    urllib2.install_opener(opener)
    request = urllib2.Request('http://127.0.0.1:5000/urllib2testget?a=2&b=3', headers={'User-Agent': 'liubi00'})
    response = opener.open(request)
    print response.getcode(), response.read()

debug_set()

#------------------------------------------- Output:
send: 'GET /urllib2testget?a=2&b=3 HTTP/1.1\r\nAccept-Encoding: identity\r\nHost: 127.0.0.1:5000\r\nConnection: close\r\nUser-Agent: liubi00\r\n\r\n'
reply: 'HTTP/1.0 200 OK\r\n'
header: Content-Type: text/html; charset=utf-8
header: Content-Length: 20
header: Server: Werkzeug/0.11.11 Python/2.7.12
header: Date: Fri, 30 Dec 2016 15:12:40 GMT
200 {"a": "2", "b": "3"}
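If you do want the same debug log for https URLs, define an HTTPSHandler properly before passing it to build_opener; it accepts the same debuglevel argument (this is what the undefined httpsHandler was meant to be, and it requires Python built with SSL support). A sketch:

import urllib2

httpHandler = urllib2.HTTPHandler(debuglevel=1)
httpsHandler = urllib2.HTTPSHandler(debuglevel=1)  # requires SSL support
opener = urllib2.build_opener(httpHandler, httpsHandler)
urllib2.install_opener(opener)
# The raw request and response lines are printed to stdout
print urllib2.urlopen('https://www.baidu.com').getcode()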

 

6. Capturing cookies and saving them to cookie.txt

import cookielib
import urllib2

def get_cookie():
    filename = 'cookie.txt'
    # Declare a MozillaCookieJar instance to collect the cookies before writing them to a file
    cookie = cookielib.MozillaCookieJar(filename)
    # Use urllib2's HTTPCookieProcessor to build a cookie handler
    handler = urllib2.HTTPCookieProcessor(cookie)
    # Build an opener from the handler
    opener = urllib2.build_opener(handler)
    request = urllib2.Request('http://www.baidu.com')
    request.add_header('User-Agent', 'fuckyou')
    response = opener.open(request)
    # Save the cookies to the file
    cookie.save(ignore_discard=True, ignore_expires=True)
    print response.getcode()

get_cookie()

#---------------------------------------------- Output:
200
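The cookie jar is iterable, so you can inspect what was captured without writing it to disk at all. A small sketch using an in-memory CookieJar instead of the file-backed MozillaCookieJar:

import cookielib
import urllib2

cookie = cookielib.CookieJar()  # in-memory jar, no file involved
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
opener.open('http://www.baidu.com')
for c in cookie:
    print c.name, '=', c.value, '(domain:', c.domain + ')'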

7. Making requests with the saved cookies; for more detail see http://www.cnblogs.com/sysu-blackbear/p/3629770.html

import cookielib
import urllib2

def use_cookie():
    # Read cookies from cookie.txt and send them with the request
    cookie_file = 'cookie.txt'
    # Create a MozillaCookieJar instance
    cookie = cookielib.MozillaCookieJar(cookie_file)
    # Load the cookie contents from the file
    cookie.load(ignore_discard=True, ignore_expires=True)
    # Build the Request
    req = urllib2.Request("http://www.baidu.com")
    # Use urllib2's build_opener method to create an opener
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
    response = opener.open(req)
    print response.read()
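A typical pattern is to log in once, save the session cookies, and then reuse the same jar for later requests. A hedged sketch (the /login URL and form field names are hypothetical placeholders):

import cookielib
import urllib
import urllib2

cookie = cookielib.MozillaCookieJar('cookie.txt')
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))

# Hypothetical login form; URL and field names are placeholders
data = urllib.urlencode({'username': 'diaosir', 'password': 'diao123'})
opener.open('http://localhost:5000/login', data)
cookie.save(ignore_discard=True, ignore_expires=True)

# Later requests through the same opener carry the session cookie automatically
print opener.open('http://localhost:5000/urllib2testget').getcode()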

8. Exception handling

def deal_errors():
    # Exception handling
    import urllib2

    # HTTPError
    req = urllib2.Request('http://blog.csdn.net/cqcre')
    try:
        urllib2.urlopen(req)
    except urllib2.HTTPError, e:
        print e.code
        print e.reason

    # URLError
    request = urllib2.Request('http://www.xxxxx.com')
    try:
        urllib2.urlopen(request)
    except urllib2.URLError, e:
        print e.reason

    # HTTPError & URLError together: HTTPError is a subclass of URLError,
    # so catch URLError and check for an HTTP status code
    req = urllib2.Request('http://blog.csdn.net/cqcre')
    try:
        urllib2.urlopen(req)
    except urllib2.URLError, e:
        if hasattr(e, "code"):
            print e.code
        if hasattr(e, "reason"):
            print e.reason
    else:
        print "OK"
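One more detail: an HTTPError is itself a file-like response object, so the error page the server sent back can still be read. A brief sketch:

import urllib2

try:
    urllib2.urlopen('http://blog.csdn.net/cqcre')
except urllib2.HTTPError, e:
    print e.code
    # HTTPError doubles as a response: the error body is readable
    print e.read()[:200]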

 
