request模块的简单使用+爬虫小程序

Posted 周建豪的博客

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了request模块的简单使用+爬虫小程序相关的知识,希望对你有一定的参考价值。

爬虫之request

各种请求方式

get

# GET request example: fetch the Pear Video homepage.
host_url = 'https://www.pearvideo.com/'
# Browser identification so the server treats us as a normal browser.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
}
res = requests.get(host_url, headers=headers)

post

 r = requests.post(‘http://httpbin.org/post‘, data = {‘key‘:‘value‘})

delete

# DELETE request example.
r = requests.delete('http://httpbin.org/delete')

put

# PUT request example: replace a resource, form data in the body.
r = requests.put('http://httpbin.org/put', data={'key': 'value'})

响应response的属性

import requests

# Response object attributes.
response = requests.get('http://www.jianshu.com')
# Response body decoded to str (using response.encoding).
print(response.text)
# Raw response body as bytes.
print(response.content)
# HTTP status code of the response.
print(response.status_code)

print(response.headers)
print(response.cookies)
# Cookies returned by the server, as a plain dict.
print(response.cookies.get_dict())
print(response.cookies.items())

print(response.url)
# Redirect history: list of intermediate responses, if any.
print(response.history)

print(response.encoding)

爬取梨视频首页视频

import os
import re
from concurrent.futures import ThreadPoolExecutor

import requests

# Target site and a browser-like User-Agent so requests are not rejected.
host_url = 'https://www.pearvideo.com/'
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
}


def get_index():
    """Download the Pear Video homepage and return its HTML as text."""
    response = requests.get(host_url, headers=headers)
    return response.text


def parser_index(text):
    """Extract video detail-page links from the homepage HTML.

    :param text: homepage HTML as a string
    :return: list of absolute detail-page URLs
    """
    # Relative hrefs on the homepage, e.g. "video_123456".
    paths = re.findall(r'<a href="(.*?)" class="vervideo-lilink actplay">', text)
    # Prepend the site root to turn them into absolute URLs.
    return [host_url + path for path in paths]


def get_detail(html_text):
    """Parse a video detail page.

    :param html_text: detail-page HTML as a string
    :return: dict with 'download_index' (direct .mp4 URL) and 'title'
    :raises AttributeError: if either pattern is missing from the page
    """
    # Direct video file URL; the dot before "mp4" is escaped so the pattern
    # only matches a literal ".mp4" suffix (the original ".mp4" also matched
    # e.g. "xmp4").
    download_index = re.search(r'srcUrl="(.*?\.mp4)"', html_text).group(1)

    # Video title shown on the page.
    title = re.search('<h1 class="video-tt">(.*?)</h1>', html_text).group(1)

    dic = {
        'download_index': download_index,
        'title': title
    }
    print('成功链接到[%s]视频文件' % title)
    return dic


def get_video(video_url, title):
    """Download one video and save it as down_pearvideos/<title>.mp4.

    :param video_url: direct URL of the .mp4 file
    :param title: video title, used as the file name
    """
    video_bytes = requests.get(video_url).content
    # exist_ok avoids a race when several pool threads create the directory
    # at the same time (the original exists()/mkdir() pair could raise).
    os.makedirs('down_pearvideos', exist_ok=True)
    file_path = os.path.join('down_pearvideos', title) + '.mp4'
    with open(file_path, 'wb') as f:
        f.write(video_bytes)
    print(file_path + '下载成功!')


if __name__ == '__main__':
    # Thread pool: downloads run concurrently while the main thread
    # keeps fetching and parsing detail pages.
    pool = ThreadPoolExecutor(10)
    text = get_index()
    url_list = parser_index(text)
    for url in url_list:
        detail_html = requests.get(url, headers=headers).text
        content_dic = get_detail(detail_html)
        # Submit the (slow) download to the pool instead of blocking here.
        pool.submit(get_video, content_dic['download_index'], content_dic['title'])
    # Wait for all queued downloads to finish before the process exits.
    pool.shutdown(wait=True)

模拟登陆github

import re

import requests

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
}

# Step 1: GET the login page to obtain the CSRF token and the session cookie.
login_url = 'https://github.com/login'

login_response = requests.get(login_url, headers=headers)
# GitHub embeds a per-session CSRF token in a hidden form field; the POST
# below is rejected without it.
login_token = re.search('name="authenticity_token" value="(.*?)"', login_response.text).group(1)
print(login_token)
login_cookie = login_response.cookies.get_dict()
print(login_cookie)

# Step 2: POST the credentials together with the token and the cookie.
session_url = 'https://github.com/session'

# NOTE(review): credentials are hard-coded for demonstration only — never
# commit real usernames/passwords; read them from the environment instead.
session_response = requests.post(
    session_url,
    headers=headers,
    cookies=login_cookie,
    data={
        "commit": "Sign in",
        "utf8": "?",
        "authenticity_token": login_token,
        "login": "yangyuanhu",
        "password": "123654asd"
    }
)

print(session_response.text)

以上是关于request模块的简单使用+爬虫小程序的主要内容,如果未能解决你的问题,请参考以下文章

Python "爬虫"出发前的装备之简单实用的 Requests 模块

你还不会Python网络爬虫中的requests模块使用《一》

Python爬虫:requests模块使用

nodejs实现一个简单的爬虫

入门学Python一定要知道的requests模块安装及使用

Python练习册 第 0013 题: 用 Python 写一个爬图片的程序,爬 这个链接里的日本妹子图片 :-),(http://tieba.baidu.com/p/2166231880)(代码片段