03: Scraping Web Data with requests Combined with BeautifulSoup

Posted by 不做大哥好多年


1.1 Review of Crawler-Related Module Commands

  1. The requests module

     1. pip install requests

     2. response = requests.get('http://www.baidu.com/')          # fetch the page content at the given URL

     3. response.text                                              # response body as text

     4. response.content                                           # response body as bytes

     5. response.encoding = 'utf-8'                                # decode the fetched page as utf-8

        response.encoding = response.apparent_encoding             # use whatever encoding the downloaded page actually has

     6. response.cookies                                           # get the cookies

        response.cookies.get_dict()                                # get the cookies as a dict

  2. The beautifulsoup4 module

     1. pip install beautifulsoup4

     2. Convert the response text into a soup object

        1) html.parser is built into Python, no extra install needed

           soup = BeautifulSoup(response.text, features='html.parser')

        2) lxml is a third-party library, but it performs better (use it in production)

           soup = BeautifulSoup(response.text, features='lxml')

     3. .find() usage: returns a single tag object

        1) Find the div with id="auto-channel-lazyload-article" in the fetched content

           target = soup.find(id="auto-channel-lazyload-article")

        2) Find a div that also carries the attribute id='i1'

           target = soup.find('div', id='i1')

     4. .find_all() usage: returns a list of tag objects

        1) Find all li tags inside the target object obtained above

           li_list = target.find_all('li')

     5. Pull the attributes you need from an object returned by .find()

        a.attrs.get('href')                          # href attribute of the a tag (its URL)

        a.find('h3').text                            # text of the h3 tag found inside the a tag

        img_url = a.find('img').attrs.get('src')     # src attribute of the img tag inside the a tag (image URL)

     A minimal sketch stringing these calls together is shown right after this list.
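
The snippet below is a minimal sketch, not from the original post, that exercises the calls listed above. The HTML fragment is invented purely for illustration; the requests part reuses the baidu URL from the list and assumes network access.

import requests
from bs4 import BeautifulSoup

# Fetch a page, normalize its encoding and read its cookies
response = requests.get('http://www.baidu.com/')
response.encoding = response.apparent_encoding
print(response.cookies.get_dict())                        # cookies as a plain dict

# Parse a small hand-written HTML fragment to show find() / find_all() / attrs
html = """
<div id="auto-channel-lazyload-article">
    <li><a href="/news/1"><h3>First article</h3><img src="//img.example.com/1.jpg"></a></li>
    <li><a href="/news/2"><h3>Second article</h3><img src="//img.example.com/2.jpg"></a></li>
</div>
"""
soup = BeautifulSoup(html, 'html.parser')                 # built-in parser, no extra install
target = soup.find(id='auto-channel-lazyload-article')    # .find() returns one tag object
for li in target.find_all('li'):                          # .find_all() returns a list of tags
    a = li.find('a')
    print(a.attrs.get('href'))                            # href attribute of the a tag
    print(a.find('h3').text)                              # text of the h3 inside the a tag
    print(a.find('img').attrs.get('src'))                 # src attribute of the img tag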

1.2 Scraping Pages That Do and Do Not Require Login

import uuid

import requests
from bs4 import BeautifulSoup

response = requests.get(
    url='http://www.autohome.com.cn/news/'
)

response.encoding = response.apparent_encoding               # use whatever encoding the downloaded page actually has

# 1 Convert the response text into a soup object
# soup = BeautifulSoup(response.text, features='lxml')        # lxml is a third-party library with better performance (use in production)
soup = BeautifulSoup(response.text, features='html.parser')  # html.parser is built into Python, no extra install

# 2 Find the div with id="auto-channel-lazyload-article" in the fetched page
target = soup.find(id="auto-channel-lazyload-article")

# 3.1 Find all li tags; .find() only returns the first match
# 3.2 Combined lookups also work: .find('div', id='i1')
# 3.3 .find() returns a single object, .find_all() returns a list
li_list = target.find_all('li')

for i in li_list:
    a = i.find('a')
    if a:
        print(a.attrs.get('href'))                   # URL of each a tag
        # a.find('h3') returns an object; add .text to get its text
        txt = a.find('h3').text                      # text of the h3 tag inside the a tag
        print(txt, type(txt))
        img_url = a.find('img').attrs.get('src')     # src attribute of the img tag inside the a tag (image URL)
        file_name = str(uuid.uuid4()) + '.jpg'

        if img_url.startswith('//www2'):             # the image URLs on the page have been rewritten, hence this handling
            img_url2 = img_url.replace('//www2', 'http://www3')
            img_response = requests.get(url=img_url2)
            with open(file_name, 'wb') as f:
                f.write(img_response.content)        # save the image to a local file
Example 1: Scraping the Autohome news page (a page that does not require login)
import requests

# 1 Put the chouti username and password into a dict
post_dict = {
    "phone": '86185387525',
    'password': '74810',
    'oneMonth': 1
}

# 2 POST the credential dict to the chouti login endpoint
response = requests.post(
    url='http://dig.chouti.com/login',
    data=post_dict
)

# 3 The body below is what a successful chouti login returns
print(response.text)
# {"result":{"code":"9999", "message":"", "data":{"complateReg":"0","destJid":"cdu_49844923242"}}}

# 4 Print the cookie dict returned after a successful login
cookie_dict = response.cookies.get_dict()
print(cookie_dict)
# {'JSESSIONID': 'aaaVizwwcod_L5QcwwR9v', 'puid': 'd332ef55361217e544b91f081090ad5e',
#  'route': '37316285ff8286c7a96cd0b03d38e13b', 'gpsd': 'f8b07e259141ae5a11d930334fbfb609'}

# 5 Whenever we need a page that is only visible after logging in,
#   attach the cookie dict returned by the successful login to the request
response = requests.get(
    url='http://dig.chouti.com/profile',
    cookies=cookie_dict
)
Example 2: Automatically logging in to chouti and fetching the user profile page (cookie approach)

1.3 Summary of Crawler Login Examples

The examples below cover the common login patterns used when scraping: passing cookies by hand after a login POST, letting requests.Session carry cookies automatically, and extracting a CSRF/verification token from the login page before submitting credentials.

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests

# ## 1. First visit any page to obtain an initial cookie
i1 = requests.get(url="http://dig.chouti.com/")
i1_cookies = i1.cookies.get_dict()

# ## 2. Log in carrying the previous cookie; the backend authorizes the gpsd value inside it
i2 = requests.post(
    url="http://dig.chouti.com/login",
    data={
        'phone': "8618538752511",
        'password': "7481079xl",
        'oneMonth': ""
    },
    cookies=i1_cookies
)

# ## 3. Upvote (only the already-authorized gpsd cookie is needed)
gpsd = i1_cookies['gpsd']
i3 = requests.post(
    url="http://dig.chouti.com/link/vote?linksId=15074576",
    cookies={'gpsd': gpsd}
)
print(i3.text)
Example 1 (method 1): Upvoting on chouti by passing cookies manually
import requests

session = requests.Session()
i1 = session.get(url="http://dig.chouti.com/help/service")
i2 = session.post(
    url="http://dig.chouti.com/login",
    data={
        'phone': "8618538752511",
        'password': "7481079xl",
        'oneMonth': ""
    },
)
i3 = session.post(
    url="http://dig.chouti.com/link/vote?linksId=15074576"
)
print(i3.text)
Example 2 (method 2): Upvoting on chouti using a session
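
The session approach works because requests.Session stores the cookies set by each response and sends them back automatically on later requests, so nothing has to be passed by hand. Below is a minimal sketch of that behaviour; it is my own illustration using httpbin.org rather than chouti, assuming that site is reachable.

import requests

session = requests.Session()

# httpbin sets a cookie via this endpoint; the Session object remembers it
session.get('http://httpbin.org/cookies/set/gpsd/demo-value')

# On the next request the stored cookie is sent back automatically
r = session.get('http://httpbin.org/cookies')
print(r.json())        # expected: {'cookies': {'gpsd': 'demo-value'}}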
import requests
from bs4 import BeautifulSoup

# Step 1: get the csrf token
# 1.1 Fetch the login page
r1 = requests.get(url='https://github.com/login')
# 1.2 Parse the response text into a soup object
b1 = BeautifulSoup(r1.text, 'html.parser')
# 1.3 Find the csrf_token input tag
tag = b1.find(name='input', attrs={'name': 'authenticity_token'})
# 1.4 Read the csrf_token value
# tag.get('value') is equivalent to tag.attrs.get('value')
token = tag.get('value')                # the csrf_token value
# 1.5 Grab the cookie dict returned by this first GET request
r1_cookie = r1.cookies.get_dict()       # cookies from the first GET
print('first request', r1_cookie)

# Step 2: POST the username, password and the cookie from the first GET so the backend authorizes it
# 2.1 Send the login POST carrying the csrf_token, cookies, username and password
# requests.post() is equivalent to requests.request('post', ...)
r2 = requests.post(
    url='https://github.com/session',
    data={                              # this data dict must match the real login form fields
        'commit': 'Sign in',
        'utf8': '',
        'authenticity_token': token,
        'login': '1532363461@qq.com',
        'password': '7481079xl',
    },
    cookies=r1_cookie,
)
# 2.2 Grab the cookie dict returned by the second request
r2_cookie = r2.cookies.get_dict()
print('second request', r2_cookie)
# 2.3 Merge the two cookie dicts: keep r1_cookie entries, let r2_cookie overwrite any overlap
r1_cookie.update(r2_cookie)

# Step 3: fetch the profile page carrying the merged cookies
r3 = requests.get(
    url='https://github.com/settings/profile',
    cookies=r1_cookie,                  # carry the cookies from the successful login
)
print(r3.text)
Example 3: Logging in to GitHub with a crawler and fetching the user profile settings
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import time

import requests
from bs4 import BeautifulSoup

session = requests.Session()

i1 = session.get(
    url='https://www.zhihu.com/#signin',
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    }
)

soup1 = BeautifulSoup(i1.text, 'lxml')
xsrf_tag = soup1.find(name='input', attrs={'name': '_xsrf'})
xsrf = xsrf_tag.get('value')

current_time = time.time()
i2 = session.get(
    url='https://www.zhihu.com/captcha.gif',
    params={'r': current_time, 'type': 'login'},
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    })

with open('zhihu.gif', 'wb') as f:
    f.write(i2.content)

captcha = input('Open the zhihu.gif file, look at the captcha and type it in: ')
form_data = {
    "_xsrf": xsrf,
    'password': 'xxooxxoo',
    "captcha": captcha,          # use the captcha the user just typed, not a literal string
    'email': '424662508@qq.com'
}

i3 = session.post(
    url='https://www.zhihu.com/login/email',
    data=form_data,
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    }
)

i4 = session.get(
    url='https://www.zhihu.com/settings/profile',
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    }
)

soup4 = BeautifulSoup(i4.text, 'lxml')
tag = soup4.find(id='rename-section')
nick_name = tag.find('span', class_='name').string
print(nick_name)
Example 4: Logging in to Zhihu
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
import json
import base64

import rsa
import requests


def js_encrypt(text):
    # RSA-encrypt the text with the site's public key, mirroring what the page's JavaScript does
    b64der = 'MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQCp0wHYbg/NOPO3nzMD3dndwS0MccuMeXCHgVlGOoYyFwLdS24Im2e7YyhB0wrUsyYf0/nhzCzBK8ZC9eCWqd0aHbdgOQT6CuFQBMjbyGYvlVYU2ZP7kG9Ft6YV6oc9ambuO7nPZh+bvXH0zDKfi02prknrScAKC0XhadTHT3Al0QIDAQAB'
    der = base64.standard_b64decode(b64der)

    pk = rsa.PublicKey.load_pkcs1_openssl_der(der)
    v1 = rsa.encrypt(bytes(text, 'utf8'), pk)
    value = base64.encodebytes(v1).replace(b'\n', b'')
    value = value.decode('utf8')

    return value


session = requests.Session()
i1 = session.get('https://passport.cnblogs.com/user/signin')
rep = re.compile("'VerificationToken': '(.*)'")
v = re.search(rep, i1.text)
verification_token = v.group(1)

form_data = {
    'input1': js_encrypt('wptawy'),       # encrypted credential fields expected by the login form
    'input2': js_encrypt('asdfasdf'),
    'remember': False
}
i2 = session.post(url='https://passport.cnblogs.com/user/signin',
                  data=json.dumps(form_data),
                  headers={
                      'Content-Type': 'application/json; charset=UTF-8',
                      'X-Requested-With': 'XMLHttpRequest',
                      'VerificationToken': verification_token}
                  )

i3 = session.get(url='https://i.cnblogs.com/EditDiary.aspx')
print(i3.text)
Example 5: Logging in to cnblogs

 
