python 爬虫学习第一课
Posted helenandyoyo
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python 爬虫学习第一课相关的知识,希望对你有一定的参考价值。
python 爬虫学习之urllib模块
- get方式请求
- post方式请求
- 超时时间设置
- 超时时间设置,对错误进行处理
- 响应:响应类型、状态码、响应头
- 设置request请求头,添加代理
- cookie, HTTPCookieProcessor
- cookie保存到文件,MozillaCookieJar
- cookie保存到文件, LWPCookieJar
- 读取文件中的cookie
- 异常处理
- urlparse url拆分
- urlunparse url合并
- urljoin url拼接
- urlencode 字典url
get方式请求
# GET request: fetch a page and print its body decoded as UTF-8.
import urllib.request

resp = urllib.request.urlopen('http://www.baidu.com')
body = resp.read()
print(body.decode('utf-8'))
post方式请求
# POST request: urlencode the form fields and send them as the request body.
import urllib.parse
import urllib.request

# urlencode requires a mapping — the dict braces were lost in the original,
# which made this line a SyntaxError. The encoded string is turned into bytes
# because urlopen's data parameter must be bytes.
data = bytes(urllib.parse.urlencode({'form_email': 'sun', 'form_password': '123456'}),
             encoding='utf8')
print(data)
# Supplying data= switches urlopen from GET to POST.
response = urllib.request.urlopen('https://www.douban.com/accounts/login?source=main', data=data)
print(response.read())
超时时间设置
# Timeout demo: an unrealistically small timeout makes the request fail fast.
import urllib.request

reply = urllib.request.urlopen('https://www.douban.com/', timeout=0.01)
print(reply.read())
超时时间设置,对错误进行处理
# Timeout with error handling: catch URLError and recognize a socket timeout.
# The try/except bodies lost their indentation in the original (SyntaxError);
# restored here.
import socket
import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen('https://www.douban.com/', timeout=0.01)
except urllib.error.URLError as e:
    # e.reason holds the underlying error; it is a socket.timeout instance
    # when the deadline expired.
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
响应:响应类型、状态码、响应头
# Response inspection: type of the response object, status code, and headers.
import urllib.request

page = urllib.request.urlopen('https://www.douban.com/')
print(type(page))
print(type(page.status))
print(page.status)
# getheaders() returns the full list of (name, value) pairs;
# getheader(name) returns the value of one header.
print(type(page.getheaders()))
print(page.getheader('date'))
设置request请求头,添加代理
# Custom request headers plus HTTP/HTTPS proxies.
from urllib import request, parse

url = 'https://36kr.com/'
# Both dicts below lost their braces in the original (SyntaxError); restored.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (Khtml, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Host': '36kr.com',
}
# ProxyHandler maps scheme -> proxy URL. The original used the key 'http'
# twice, so the first entry was silently overwritten; the second entry
# (an https proxy URL) clearly belongs under 'https'.
proxy_handle = request.ProxyHandler({
    'http': 'http://183.47.40.35:8088',
    'https': 'https://42.176.36.251:37000',
})
opener = request.build_opener(proxy_handle)
req = request.Request(url, headers=headers)
response = opener.open(req)
print(response.read().decode('utf-8'))
cookie, HTTPCookieProcessor
# In-memory cookie handling with CookieJar + HTTPCookieProcessor.
import http.cookiejar, urllib.request

cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
# Fixed variable-name typo 'opner' -> 'opener'.
opener = urllib.request.build_opener(handler)
# Fixed URL typo: the original had 'wwww.baidu.com' (four w's).
response = opener.open('http://www.baidu.com')
print(cookie)
# Iterate the cookies the request just stored in the jar. The loop body
# lost its indentation in the original (SyntaxError); restored.
for item in cookie:
    print(type(item))
    print(item.name + "=" + item.value)
cookie保存到文件,MozillaCookieJar
# Persist cookies to disk in Mozilla/Netscape cookies.txt format.
import http.cookiejar
import urllib.request
import os

filename = "cookie.txt"
# Report whether the cookie file already exists before the request.
print(os.path.exists(filename))

jar = http.cookiejar.MozillaCookieJar(filename)
processor = urllib.request.HTTPCookieProcessor(jar)
opener = urllib.request.build_opener(processor)
response = opener.open('http://www.baidu.com')
# ignore_discard/ignore_expires keep session and expired cookies in the dump.
jar.save(ignore_discard=True, ignore_expires=True)
cookie保存到文件, LWPCookieJar
# Persist cookies to disk in libwww-perl (LWP) format instead of Mozilla's.
import http.cookiejar
import urllib.request

jar = http.cookiejar.LWPCookieJar('cookie1.txt')
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(jar))
response = opener.open('http://www.baidu.com')
# Keep session cookies and expired cookies in the saved file as well.
jar.save(ignore_discard=True, ignore_expires=True)
读取文件中的cookie
# Load previously saved cookies and reuse them for a new request.
import http.cookiejar
import urllib.request

jar = http.cookiejar.LWPCookieJar()
# The file must be in LWP format (as written by the previous example).
jar.load('cookie1.txt', ignore_discard=True, ignore_expires=True)
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(jar))
print(opener.open('http://www.baidu.com').read().decode('utf-8'))
异常处理
# Exception handling: HTTPError is a subclass of URLError, so it must be
# caught first. The try/except bodies lost their indentation in the
# original (SyntaxError); restored here.
from urllib import request, error

try:
    response = request.urlopen('http://pythonsite.com/1111.html')
except error.HTTPError as e:
    # HTTPError additionally carries the status code and response headers.
    print(e.reason)
    print(e.code)
    print(e.headers)
except error.URLError as e:
    print(e.reason)
urlparse url拆分
# Split a URL into its six components (scheme, netloc, path, params,
# query, fragment) with urlparse.
from urllib.parse import urlparse

result = urlparse('https://www.cnblogs.com/zhaof/p/6910871.html')
print(result)
urlunparse url合并
# Assemble a URL from its six components — the inverse of urlparse.
from urllib.parse import urlunparse

# Order: scheme, netloc, path, params, query, fragment.
data = ['http', 'www.cnblogs.com', '/zhaof/p/6910871.html', '', '', '']
print(urlunparse(data))
urljoin url拼接
# Resolve a reference against a base URL with urljoin.
from urllib.parse import urljoin

# A relative reference is resolved against the base URL.
print(urljoin('http://www.baidu.com', 'FAQ.html'))
# An absolute reference replaces the base URL entirely.
print(urljoin('http://www.baidu.com', 'http://www.qdaily.com/tags/29.html'))
urlencode 字典url
# Build a URL query string from a dict with urlencode.
from urllib.parse import urlencode

# The dict braces were lost in the original, which made these lines a
# SyntaxError; restored here.
param = {
    "name": "Lyli",
    "age": "23",
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(param)
print(url)
注:本文学习材料来自于python爬虫从入门到放弃(三)之 Urllib库的基本使用
以上是关于python 爬虫学习第一课的主要内容,如果未能解决你的问题,请参考以下文章