python 爬虫学习第一课
Posted helenandyoyo
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python 爬虫学习第一课相关的知识,希望对你有一定的参考价值。
python 爬虫学习之urllib模块
- get方式请求
- post方式请求
- 超时时间设置
- 超时时间设置,对错误进行处理
- 响应:响应类型、状态码、响应头
- 设置request请求头,添加代理
- cookie, HTTPCookieProcessor
- cookie保存到文件,MozillaCookieJar
- cookie保存到文件, LWPCookieJar
- 读取文件中的cookie
- 异常处理
- urlparse url拆分
- urlunparse url合并
- urljoin url拼接
- urlencode 字典url
get方式请求
# GET request: fetch a page and print its body decoded as UTF-8.
import urllib.request

resp = urllib.request.urlopen('http://www.baidu.com')
body = resp.read()
print(body.decode('utf-8'))
post方式请求
# POST request: urlencode the form fields and send them as the request body.
import urllib.parse
import urllib.request

# urlencode requires a mapping — the dict braces were lost in the original,
# which made this line a SyntaxError. The encoded string is turned into bytes
# because urlopen's data parameter must be bytes.
data = bytes(urllib.parse.urlencode({'form_email': 'sun', 'form_password': '123456'}),
             encoding='utf8')
print(data)
# Supplying data= switches urlopen from GET to POST.
response = urllib.request.urlopen('https://www.douban.com/accounts/login?source=main', data=data)
print(response.read())
超时时间设置
# Timeout demo: an unrealistically small timeout makes the request fail fast.
import urllib.request

reply = urllib.request.urlopen('https://www.douban.com/', timeout=0.01)
print(reply.read())
超时时间设置,对错误进行处理
# Timeout with error handling: catch URLError and recognize a socket timeout.
# The try/except bodies lost their indentation in the original (SyntaxError);
# restored here.
import socket
import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen('https://www.douban.com/', timeout=0.01)
except urllib.error.URLError as e:
    # e.reason holds the underlying error; it is a socket.timeout instance
    # when the deadline expired.
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
响应:响应类型、状态码、响应头
# Response inspection: type of the response object, status code, and headers.
import urllib.request

page = urllib.request.urlopen('https://www.douban.com/')
print(type(page))
print(type(page.status))
print(page.status)
# getheaders() returns the full list of (name, value) pairs;
# getheader(name) returns the value of one header.
print(type(page.getheaders()))
print(page.getheader('date'))
设置request请求头,添加代理
# Custom request headers plus HTTP/HTTPS proxies.
from urllib import request, parse

url = 'https://36kr.com/'
# Both dicts below lost their braces in the original (SyntaxError); restored.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (Khtml, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Host': '36kr.com',
}
# ProxyHandler maps scheme -> proxy URL. The original used the key 'http'
# twice, so the first entry was silently overwritten; the second entry
# (an https proxy URL) clearly belongs under 'https'.
proxy_handle = request.ProxyHandler({
    'http': 'http://183.47.40.35:8088',
    'https': 'https://42.176.36.251:37000',
})
opener = request.build_opener(proxy_handle)
req = request.Request(url, headers=headers)
response = opener.open(req)
print(response.read().decode('utf-8'))
cookie, HTTPCookieProcessor
# In-memory cookie handling with CookieJar + HTTPCookieProcessor.
import http.cookiejar, urllib.request

cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
# Fixed variable-name typo 'opner' -> 'opener'.
opener = urllib.request.build_opener(handler)
# Fixed URL typo: the original had 'wwww.baidu.com' (four w's).
response = opener.open('http://www.baidu.com')
print(cookie)
# Iterate the cookies the request just stored in the jar. The loop body
# lost its indentation in the original (SyntaxError); restored.
for item in cookie:
    print(type(item))
    print(item.name + "=" + item.value)
cookie保存到文件,MozillaCookieJar
# Persist cookies to disk in Mozilla/Netscape cookies.txt format.
import http.cookiejar
import urllib.request
import os

filename = "cookie.txt"
# Report whether the cookie file already exists before the request.
print(os.path.exists(filename))

jar = http.cookiejar.MozillaCookieJar(filename)
processor = urllib.request.HTTPCookieProcessor(jar)
opener = urllib.request.build_opener(processor)
response = opener.open('http://www.baidu.com')
# ignore_discard/ignore_expires keep session and expired cookies in the dump.
jar.save(ignore_discard=True, ignore_expires=True)
cookie保存到文件, LWPCookieJar
# Persist cookies to disk in libwww-perl (LWP) format instead of Mozilla's.
import http.cookiejar
import urllib.request

jar = http.cookiejar.LWPCookieJar('cookie1.txt')
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(jar))
response = opener.open('http://www.baidu.com')
# Keep session cookies and expired cookies in the saved file as well.
jar.save(ignore_discard=True, ignore_expires=True)
读取文件中的cookie
# Load previously saved cookies and reuse them for a new request.
import http.cookiejar
import urllib.request

jar = http.cookiejar.LWPCookieJar()
# The file must be in LWP format (as written by the previous example).
jar.load('cookie1.txt', ignore_discard=True, ignore_expires=True)
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(jar))
print(opener.open('http://www.baidu.com').read().decode('utf-8'))
异常处理
# Exception handling: HTTPError is a subclass of URLError, so it must be
# caught first. The try/except bodies lost their indentation in the
# original (SyntaxError); restored here.
from urllib import request, error

try:
    response = request.urlopen('http://pythonsite.com/1111.html')
except error.HTTPError as e:
    # HTTPError additionally carries the status code and response headers.
    print(e.reason)
    print(e.code)
    print(e.headers)
except error.URLError as e:
    print(e.reason)
urlparse url拆分
# Split a URL into its six components (scheme, netloc, path, params,
# query, fragment) with urlparse.
from urllib.parse import urlparse

result = urlparse('https://www.cnblogs.com/zhaof/p/6910871.html')
print(result)
urlunparse url合并
# Assemble a URL from its six components — the inverse of urlparse.
from urllib.parse import urlunparse

# Order: scheme, netloc, path, params, query, fragment.
data = ['http', 'www.cnblogs.com', '/zhaof/p/6910871.html', '', '', '']
print(urlunparse(data))
urljoin url拼接
# Resolve a reference against a base URL with urljoin.
from urllib.parse import urljoin

# A relative reference is resolved against the base URL.
print(urljoin('http://www.baidu.com', 'FAQ.html'))
# An absolute reference replaces the base URL entirely.
print(urljoin('http://www.baidu.com', 'http://www.qdaily.com/tags/29.html'))
urlencode 字典url
# Build a URL query string from a dict with urlencode.
from urllib.parse import urlencode

# The dict braces were lost in the original, which made these lines a
# SyntaxError; restored here.
param = {
    "name": "Lyli",
    "age": "23",
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(param)
print(url)
注:本文学习材料来自于python爬虫从入门到放弃(三)之 Urllib库的基本使用
以上是关于python 爬虫学习第一课的主要内容,如果未能解决你的问题,请参考以下文章