Python Web Crawlers


1. Definition: a crawler is a program that sends requests to a website, then analyzes the returned resources and extracts useful data.

2. The basic crawler workflow (a minimal end-to-end sketch follows these steps):

#1 Send a request
Use an HTTP library to send a Request to the target site.
A Request contains: request headers, a request body, etc.

#2 Get the response content
If the server responds normally, you get back a Response.
A Response may contain: HTML, JSON, images, video, etc.

#3 Parse the content
HTML data: regular expressions, or third-party parsing libraries such as BeautifulSoup and pyquery
JSON data: the json module
Binary data: write it to a file in binary ('b') mode

#4 Save the data
To a database
To a file
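
A minimal sketch of these four steps, assuming the requests and beautifulsoup4 packages are installed and using example.com purely as a placeholder target:

import requests
from bs4 import BeautifulSoup

url = 'https://example.com'    # placeholder target site

# 1. Send a request
response = requests.get(url)

# 2. Get the response content
html = response.text

# 3. Parse the content (here with BeautifulSoup)
soup = BeautifulSoup(html, 'html.parser')
title = soup.title.get_text() if soup.title else ''

# 4. Save the data (here to a plain file)
with open('result.txt', 'w', encoding='utf-8') as f:
    f.write(title)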

3. The call signature:

    requests.get/post(        # requests only downloads the page content; it does not execute
                              # JavaScript. For JS-rendered data you must analyze the target
                              # site yourself and send new requests for it.
            url,
            params={},        # query-string data
            cookies={},
            headers={
                'User-Agent': ...,
                'Cookie': ...,     # cookies may also be sent via this header
                'Referer': ...,
                },
            data={},          # for GET the request body is empty;
                              # for POST the request body carries the form data
            allow_redirects=False,   # default is True
            )
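
A concrete call following this skeleton; httpbin.org is used here only as an assumed test endpoint, not part of the original example:

import requests

response = requests.get(
    'https://httpbin.org/get',                 # assumed test endpoint
    params={'wd': 'python'},                   # encoded into the query string
    headers={'User-Agent': 'Mozilla/5.0'},     # many sites reject requests' default UA
    allow_redirects=False,
)
print(response.status_code)
print(response.url)    # final URL including the encoded query string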


4. Simple examples:

import requests  # pip3 install requests
import re
import hashlib
import time

movie_path = r'C:\mp4'

def get_page(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
    except Exception:
        pass

def parse_index(index_page):
    # extract the detail-page links from the index page
    urls = re.findall(r'class="items".*?href="(.*?)"', index_page, re.S)
    for url in urls:
        if not url.startswith('http'):
            url = 'http://www.xiaohuar.com' + url
        yield url

def parse_detail(detail_page):
    # extract the video url from the detail page
    l = re.findall(r'id="media".*?src="(.*?)"', detail_page, re.S)
    if l:
        movie_url = l[0]
        if movie_url.endswith('mp4'):
            yield movie_url

def get_movie(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            # hash the timestamp plus the url to get a unique file name
            m = hashlib.md5()
            m.update(str(time.time()).encode('utf-8'))
            m.update(url.encode('utf-8'))
            filepath = '%s\\%s.mp4' % (movie_path, m.hexdigest())
            with open(filepath, 'wb') as f:
                f.write(response.content)
                print('%s downloaded successfully' % url)
    except Exception:
        pass

def main():
    base_url = 'http://www.xiaohuar.com/list-3-{page_num}.html'
    for i in range(5):
        url = base_url.format(page_num=i)
        index_page = get_page(url)
        detail_urls = parse_index(index_page)
        for detail_url in detail_urls:
            detail_page = get_page(detail_url)
            movie_urls = parse_detail(detail_page)
            for movie_url in movie_urls:
                get_movie(movie_url)

if __name__ == '__main__':
    main()
Scraping xiaohuar.com videos (no concurrency)
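
Two design points worth noting: parse_index and parse_detail are generators (yield), so detail pages are fetched lazily one by one, and the file name is the MD5 of the current timestamp plus the URL, which guarantees a unique name for every download.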
import requests  # pip3 install requests
import re
import hashlib
import time
from concurrent.futures import ThreadPoolExecutor

pool = ThreadPoolExecutor(50)
movie_path = r'C:\mp4'

def get_page(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
    except Exception:
        pass

def parse_index(index_page):
    index_page = index_page.result()  # callbacks receive a Future, not the value
    urls = re.findall(r'class="items".*?href="(.*?)"', index_page, re.S)
    for detail_url in urls:
        if not detail_url.startswith('http'):
            detail_url = 'http://www.xiaohuar.com' + detail_url
        pool.submit(get_page, detail_url).add_done_callback(parse_detail)

def parse_detail(detail_page):
    detail_page = detail_page.result()
    l = re.findall(r'id="media".*?src="(.*?)"', detail_page, re.S)
    if l:
        movie_url = l[0]
        if movie_url.endswith('mp4'):
            pool.submit(get_movie, movie_url)

def get_movie(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            m = hashlib.md5()
            m.update(str(time.time()).encode('utf-8'))
            m.update(url.encode('utf-8'))
            filepath = '%s\\%s.mp4' % (movie_path, m.hexdigest())
            with open(filepath, 'wb') as f:
                f.write(response.content)
                print('%s downloaded successfully' % url)
    except Exception:
        pass

def main():
    base_url = 'http://www.xiaohuar.com/list-3-{page_num}.html'
    for i in range(5):
        url = base_url.format(page_num=i)
        pool.submit(get_page, url).add_done_callback(parse_index)

if __name__ == '__main__':
    main()
Scraping xiaohuar.com videos (concurrent)
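
The key pattern in the concurrent version is chaining work through the pool: submit() returns a Future, and add_done_callback() hands that Future (not the return value) to the callback, which is why each parser first calls .result(). A minimal, self-contained sketch of the same pattern, with fetch and on_done as made-up names for illustration:

from concurrent.futures import ThreadPoolExecutor

pool = ThreadPoolExecutor(4)

def fetch(x):
    return x * 2              # stand-in for a network call

def on_done(future):
    print(future.result())    # unwrap the Future to get the actual value

pool.submit(fetch, 21).add_done_callback(on_done)
pool.shutdown(wait=True)      # block until all submitted work completes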

5. Detailed usage of the requests module

import requests
from urllib.parse import urlencode

5.1 The request headers should carry a User-Agent

keyword = input('>>: ').strip()
res = urlencode({'wd': keyword}, encoding='utf-8')   # manually encode the query string
url = 'https://www.baidu.com/s?' + res
print(url)
response = requests.get(url,
                        headers={
                            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                        },
                        )
print(response.status_code)
with open('a.html', 'w', encoding='utf-8') as f:
    f.write(response.text)

5.2 Query data can be passed via the params argument instead

keyword = input('>>: ').strip()
response = requests.get('https://www.baidu.com/s?',
                        params={
                            'wd': keyword,
                            'pn': 20,
                        },
                        headers={
                            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                        },
                        )
print(response.status_code)
with open('a.html', 'w', encoding='utf-8') as f:
    f.write(response.text)

response = requests.get('https://www.zhihu.com/explore',
                        headers={
                            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                        })
print(response.status_code)
print(response.text)

5.3 Cookies can be sent inside the request headers

response = requests.get(
    url='https://github.com/settings/emails',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Cookie': '_octo=GH1.1.9390043.1516008745; _ga=GA1.2.674621867.1516008745; _gat=1; tz=Asia%2FShanghai; user_session=gPR8zXuoKMY-h9R6WVlju1xa-jBcNRpEoWNOa9k3B922hlqy; __Host-user_session_same_site=gPR8zXuoKMY-h9R6WVlju1xa-jBcNRpEoWNOa9k3B922hlqy; logged_in=yes; dotcom_user=egonLin; _gh_sess=eyJsYXN0X3dyaXRlIjoxNTE2MDA4Nzc1NTkyLCJmbGFzaCI6eyJkaXNjYXJkIjpbXSwiZmxhc2hlcyI6eyJhbmFseXRpY3NfZGltZW5zaW9uIjp7Im5hbWUiOiJkaW1lbnNpb241IiwidmFsdWUiOiJMb2dnZWQgSW4ifX19LCJzZXNzaW9uX2lkIjoiMzllOGI4NjI4ODdjMTFlMmEyYTg5ZDUyMmU0NzQ4ODEifQ%3D%3D--37b89b9cb319fba0e2f7df68bcffe1a56bea7a41',
    }
)
print('[email protected]' in response.text)

5.4 Cookies can also be passed via the cookies argument

response = requests.get(
    url='https://github.com/settings/emails',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    },
    cookies={
        'k1': 'v1',
    },
)
print('[email protected]' in response.text)

5.5 Whether to allow the requested page to redirect

response = requests.get(
    url='https://github.com/settings/emails',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    },
    cookies={
        'k1': 'v1',
    },
    allow_redirects=False,
)
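
With allow_redirects=False the redirect is not followed; the Response itself carries the redirect status and target, which can be inspected directly (a short sketch continuing the snippet above):

print(response.status_code)                # e.g. 302 when GitHub bounces you to the login page
print(response.headers.get('Location'))   # where the redirect would have gone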

6. A small exercise

import requests
import re

# Step 1: GET the login page to obtain authenticity_token and the cookies:
# 1 request url: https://github.com/login
# 2 request method: GET
# 3 request headers:
#    User-Agent
r1 = requests.get('https://github.com/login',
                  headers={
                      'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                  },
                  )
authenticity_token = re.findall(r'name="authenticity_token".*?value="(.*?)"', r1.text, re.S)[0]
r1_cookies=r1.cookies.get_dict()
print(authenticity_token)
print(r1_cookies)
# Step 2: POST the form data to complete the login
# 1 request url: https://github.com/session
# 2 request method: POST
# 3 request headers:
#    Referer: https://github.com/
#    User-Agent
# 4 request body:
# commit:Sign in
# utf8:✓
# authenticity_token:pFLyO9choCgUd6mm1AMP7BoeEQ613TRDe49MBREZ7EU7MKM7IELFgmyGfcKXS0hsaIiGJ8YlkTD5nwwV4tebig==
# login:[email protected]
# password:alex3714
r2 = requests.post('https://github.com/session',
                   headers={
                       'Referer': 'https://github.com/',
                       'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                   },
                   cookies=r1_cookies,
                   data={
                       'commit': 'Sign in',
                       'utf8': '✓',
                       'authenticity_token': authenticity_token,
                       'login': '[email protected]',
                       'password': '66666',
                   },
                   allow_redirects=False
                   )

# print(r2.status_code)
# print(r2.history)      # when redirects are allowed, holds the responses from before the redirect

cookies=r2.cookies.get_dict()

r3 = requests.get('https://github.com/settings/emails',
                  headers={
                      'Referer': 'https://github.com/',
                      'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                  },
                  cookies=cookies)
print('[email protected]' in r3.text)
Simulated GitHub login exercise
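
Passing cookies around by hand works, but requests also provides Session, which persists cookies across requests automatically. Below is a sketch of the same exercise rewritten with it, using the same URLs and form fields as above; the login credentials remain placeholders:

import requests
import re

UA = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'

session = requests.Session()    # cookies from each response are carried over automatically
r1 = session.get('https://github.com/login', headers={'User-Agent': UA})
token = re.findall(r'name="authenticity_token".*?value="(.*?)"', r1.text, re.S)[0]

session.post('https://github.com/session',
             headers={'Referer': 'https://github.com/', 'User-Agent': UA},
             data={
                 'commit': 'Sign in',
                 'utf8': '✓',
                 'authenticity_token': token,
                 'login': '[email protected]',    # placeholder credentials
                 'password': '66666',
             })

r3 = session.get('https://github.com/settings/emails', headers={'User-Agent': UA})
print(r3.status_code)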

