Some Python Web Scraping Operations
Posted by 知我几分
1. First, one with no anti-scraping measures
"""这个不设置反爬措施,练手最好用""" import requests from bs4 import BeautifulSoup response = requests.get("https://www.autohome.com.cn/news/") # 转换编码 response.encoding = ‘gbk‘ # 封装html到soup soup = BeautifulSoup(response.text, ‘html.parser‘) # 找到匹配的第一个div div = soup.find(name=‘div‘, attrs={‘id‘: ‘auto-channel-lazyload-article‘}) # 找到此div下所有li li_list = div.find_all(name=‘li‘) # 循环获取数据 for li in li_list: title = li.find(name=‘h3‘) if not title: continue p = li.find(name=‘p‘) a = li.find(name=‘a‘) print(title.text) print(a.attrs.get(‘href‘)) print(p.text) img = li.find(name=‘img‘) src = img.get(‘src‘) src = "https:" + src print(type(src)) print(type(title.text)) # 再次发起请求,下载图片到本地 file_name = src.rsplit(‘/‘, maxsplit=1)[1] ret = requests.get(src) with open(file_name, ‘wb‘) as f: f.write(ret.content)
2. Now one that pulls out data
"""进阶爬虫1""" import requests from bs4 import BeautifulSoup res = requests.get( url="http://jandan.net/", ) soup = BeautifulSoup(res.text, "html.parser") div = soup.find(name="div", attrs={"id": "content"}) div_list = div.find_all(name="div", attrs={"class": "post f list-post"}) for div in div_list: print(div.text.strip()) # 获取所有文本 # img = div.find(name="img") # src = img.get("src") # if not src: # continue # src = "https:" + src # print(src) 获取图片 # h = div.find(name="h2") # a = h.find(name="a") # print(a.text) 获取标题
3. One with a bit of difficulty
"""爬虫进阶2""" import requests # 1. 查看首页 r1 = requests.get( url=‘https://dig.chouti.com/‘, headers={ ‘user-agent‘:‘Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36‘ } ) # 2. 提交用户名和密码 r2 = requests.post( url=‘https://dig.chouti.com/login‘, headers={ ‘user-agent‘:‘Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36‘ }, data={ ‘phone‘:‘86你的账号‘, ‘password‘:‘你的密码‘, ‘oneMonth‘:1 }, cookies=r1.cookies.get_dict() ) # 3. 点赞 r3 = requests.post( url=‘https://dig.chouti.com/link/vote?linksId=20435396‘, headers={ ‘user-agent‘:‘Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36‘ }, cookies=r1.cookies.get_dict() ) print(r3.text)
4. One that is harder still
"""进阶爬取3""" import requests import re from bs4 import BeautifulSoup # 先伪装login请求 res = requests.get( url="https://passport.lagou.com/login/login.html", headers={ "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.360" } ) # print(res.text) 原话(动态token,防御伪造请求,重复提交)(小坑) # 笑一会儿 # 获取token(正则匹配) X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = ‘(.*?)‘", res.text, re.S)[0] X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = ‘(.*?)‘", res.text, re.S)[0] ret = requests.post( url="https://passport.lagou.com/login/login.json", # 登录网址发送前发个错的获取登录url headers={ "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.360", "X-Anit-Forge-Token": X_Anti_Forge_Token, "X_Anti_Forge_Code": X_Anti_Forge_Code, "Referer": "https://passport.lagou.com/login/login.html", # 上一次提交地址(小坑) }, data={ # 发送post数据 "isValidate": True, "username": 你的账号, "password": "你的密码", "request_form_verifyCode": "", "submit": "", "challenge": "c87407cd89add055d8f1b54ad579cec8", }, cookies=res.cookies.get_dict(), # 带着登录页面的cookies获取权限(小坑) ) r1 = requests.get( url="https://www.lagou.com/zhaopin/Python/?labelWords=label", headers={ "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.360", "Referer": "https://www.lagou.com/", # 上一次的登录网址(可以re匹配一下动态获取) }, cookies=ret.cookies.get_dict(), ) soup = BeautifulSoup(r1.text, "html.parser") div = soup.find(name="div", attrs={"id": "s_position_list"}) li_list = div.find_all(name="li") for li in li_list: title = li.find(name="h3") if not title: continue money = li.find(name="span") div = li.find(name="div", attrs={"class": "li_b_l"}) a = li.find(name="a") print(title.text) print(money.text) print(div.text) print(a.text)
5. A GitHub one
"""进阶爬取4""" import requests from bs4 import BeautifulSoup r1 = requests.get( url="https://github.com/session", # 这点注意url,登录是login获取cookies是session(小坑) headers={ ‘User-Agent‘: ‘Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36‘, } ) soup = BeautifulSoup(r1.text, "html.parser") inp = soup.find(name="input", attrs={"name": "authenticity_token"}) cookies = r1.cookies.get_dict() token = inp.get("value") # 登录 r2 = requests.post( url="https://github.com/login", headers={ ‘User-Agent‘: ‘Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36‘, }, data={ "commit": "Sign in", "utf8": "?", "authenticity_token": token, "login": "你的账号", "password": "你的密码", }, cookies=cookies ) # 后续要啥随你 print(r2.text)