Python爬虫的一些操作

Posted 知我几分

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Python爬虫的一些操作相关的知识,希望对你有一定的参考价值。

1.先来个不反爬的

技术分享图片
"""这个不设置反爬措施,练手最好用"""
import requests
from bs4 import BeautifulSoup


# Fetch the Autohome news listing page (the tutorial uses it as a practice
# target because it applies no anti-scraping measures).
response = requests.get("https://www.autohome.com.cn/news/")
# Decode the response body as GBK before reading .text.
response.encoding = "gbk"
# Parse the HTML into a BeautifulSoup tree.
soup = BeautifulSoup(response.text, "html.parser")
# Find the first div matching the article-list container id.
div = soup.find(name="div", attrs={"id": "auto-channel-lazyload-article"})
# Collect every <li> below that div.
li_list = div.find_all(name="li")
# Walk the list items and print / download their contents.
for li in li_list:
    title = li.find(name="h3")
    if not title:
        # Items without a headline are skipped.
        continue
    p = li.find(name="p")
    a = li.find(name="a")
    print(title.text)
    print(a.attrs.get("href"))
    print(p.text)
    img = li.find(name="img")
    src = img.get("src") if img is not None else None
    if not src:
        # No image to download for this item; avoid crashing on None.
        continue
    # The src is prefixed with the scheme to form a full URL.
    src = "https:" + src
    print(type(src))
    print(type(title.text))

    # Issue a second request and save the image locally, named after the
    # last path segment of its URL.
    file_name = src.rsplit("/", maxsplit=1)[1]
    ret = requests.get(src)
    with open(file_name, "wb") as f:
        f.write(ret.content)
View Code

2.来个获取数据的

技术分享图片
"""进阶爬虫1"""
import requests
from bs4 import BeautifulSoup


# Download the Jandan front page and parse it.
res = requests.get(url="http://jandan.net/")
soup = BeautifulSoup(res.text, "html.parser")

# Attribute filters for the content container and the individual posts.
content_attrs = {"id": "content"}
post_attrs = {"class": "post f list-post"}

div = soup.find(name="div", attrs=content_attrs)
div_list = div.find_all(name="div", attrs=post_attrs)
for post in div_list:
    # Print the whole text of the post with surrounding whitespace removed.
    post_text = post.text
    print(post_text.strip())
    # Alternative extractions, left disabled:
    #
    # Image URL:
    # img = post.find(name="img")
    # src = img.get("src")
    # if not src:
    #     continue
    # src = "https:" + src
    # print(src)
    #
    # Title:
    # h = post.find(name="h2")
    # a = h.find(name="a")
    # print(a.text)
View Code

3.来个有点难度的

 

技术分享图片
"""爬虫进阶2"""
import requests
# Browser-like User-Agent header sent with every request below.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
}

# 1. Visit the home page; the cookies it returns are reused below.
r1 = requests.get(
    url="https://dig.chouti.com/",
    headers=headers,
)

# 2. Submit the phone number and password, carrying the home-page cookies.
r2 = requests.post(
    url="https://dig.chouti.com/login",
    headers=headers,
    data={
        "phone": "86你的账号",
        "password": "你的密码",
        "oneMonth": 1,
    },
    cookies=r1.cookies.get_dict(),
)


# 3. Up-vote a link. The cookies from step 1 (not from the login response)
#    are sent again here, exactly as in the original snippet.
r3 = requests.post(
    url="https://dig.chouti.com/link/vote?linksId=20435396",
    headers=headers,
    cookies=r1.cookies.get_dict(),
)
print(r3.text)
View Code

 

4.来个再难一点的

技术分享图片
"""进阶爬取3"""
import requests
import re
from bs4 import BeautifulSoup

# Request the login page first, posing as a browser.
res = requests.get(
    url="https://passport.lagou.com/login/login.html",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.360"
    }
)
# Pitfall: the login page embeds a dynamic token (a defence against forged
# and repeated submissions). Extract both values with a regex; [0] raises
# IndexError if the page no longer contains them.
X_Anti_Forge_Token = re.findall(r"X_Anti_Forge_Token = '(.*?)'", res.text, re.S)[0]
X_Anti_Forge_Code = re.findall(r"X_Anti_Forge_Code = '(.*?)'", res.text, re.S)[0]

ret = requests.post(
    # Login endpoint; it can be discovered by submitting a wrong login in
    # the browser and watching the request that is sent.
    url="https://passport.lagou.com/login/login.json",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.360",
        # NOTE(review): the two token header names use different spellings
        # and separators ("X-Anit-..." vs "X_Anti_..."); kept as in the
        # original - confirm against the site's actual request headers.
        "X-Anit-Forge-Token": X_Anti_Forge_Token,
        "X_Anti_Forge_Code": X_Anti_Forge_Code,
        # Pitfall: address of the page the form was submitted from.
        "Referer": "https://passport.lagou.com/login/login.html",
    },
    data={           # POST form fields
        "isValidate": True,
        "username": "你的账号",
        "password": "你的密码",
        "request_form_verifyCode": "",
        "submit": "",
        "challenge": "c87407cd89add055d8f1b54ad579cec8",
    },
    # Pitfall: carry the login page's cookies so the login is accepted.
    cookies=res.cookies.get_dict(),
)

r1 = requests.get(
    url="https://www.lagou.com/zhaopin/Python/?labelWords=label",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.360",
        # Previous page address (could be obtained dynamically via regex).
        "Referer": "https://www.lagou.com/",
    },
    cookies=ret.cookies.get_dict(),
)

# Parse the job listing and print the fields of every entry.
soup = BeautifulSoup(r1.text, "html.parser")
div = soup.find(name="div", attrs={"id": "s_position_list"})
li_list = div.find_all(name="li")
for li in li_list:
    title = li.find(name="h3")
    if not title:
        # Items without an <h3> are skipped.
        continue
    money = li.find(name="span")
    div = li.find(name="div", attrs={"class": "li_b_l"})
    a = li.find(name="a")
    print(title.text)
    print(money.text)
    print(div.text)
    print(a.text)
View Code

5.来个github的

技术分享图片
"""进阶爬取4"""
import requests
from bs4 import BeautifulSoup


# Fetch the page that carries the authenticity token and the initial cookies.
# Pitfall noted by the author: mind which URL is used for which step.
# NOTE(review): GitHub's login form is normally served at /login and posts
# to /session; the two URLs in this script look swapped - confirm before use.
r1 = requests.get(
    url="https://github.com/session",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    }
)
soup = BeautifulSoup(r1.text, "html.parser")
# The hidden <input name="authenticity_token"> holds the CSRF token.
inp = soup.find(name="input", attrs={"name": "authenticity_token"})
cookies = r1.cookies.get_dict()
token = inp.get("value")
# Log in, sending the token together with the cookies from the first request.
r2 = requests.post(
    url="https://github.com/login",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    },
    data={
        "commit": "Sign in",
        # The original "?" looks like a mis-encoded check mark; restored.
        "utf8": "✓",
        "authenticity_token": token,
        "login": "你的账号",
        "password": "你的密码",
    },
    cookies=cookies
)
# From here on, do whatever you need with the logged-in response.
print(r2.text)
View Code

 

以上是关于Python爬虫的一些操作的主要内容,如果未能解决你的问题,请参考以下文章:

Python爬虫的一些操作

python [代码片段]一些有趣的代码#sort

关于一些python爬虫示例代码

Python爬虫入门:27270图片爬取

Python练习册 第 0013 题: 用 Python 写一个爬图片的程序,爬这个链接里的日本妹子图片 :-)(http://tieba.baidu.com/p/2166231880)(代码片段)

Python爬虫入门教程 5-100 27270图片爬取