1.爬虫的定义: 向网站发起请求,获取资源后分析并提取有用数据的程序
2.爬虫的基本流程:
#1、发起请求
    使用http库向目标站点发起请求,即发送一个Request
    Request包含:请求头、请求体等
#2、获取响应内容
    如果服务器能正常响应,则会得到一个Response
    Response包含:html,json,图片,视频等
#3、解析内容
    解析html数据:正则表达式,第三方解析库如BeautifulSoup,pyquery等
    解析json数据:json模块
    解析二进制数据:以b的方式写入文件
#4、保存数据
    数据库
    文件
3.格式:
requests.get/post(
    # requests库发送请求将网页内容下载下来以后,并不会执行js代码,
    # 这需要我们自己分析目标站点然后发起新的request请求
    url,
    params={},               # 请求数据
    cookies={},
    headers={
        User-Agent:
        (cookie):
        Referer:
    },
    data={
        # 如果是get方式,请求体没有内容
        # 如果是post方式,请求体是form data
    },
    allow_redirects=False,   # 默认是True
)
4.简单示例:
# Sequential crawler: walk the list pages, follow every detail link found on
# them, and download each mp4 the detail pages point to.
import hashlib
import os
import re
import time

import requests  # pip3 install requests

movie_path = r'C:\mp4'
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests.get() can block forever


def get_page(url):
    """Fetch *url* and return the response body as text.

    Returns None when the request fails or the status code is not 200, so
    callers must be prepared for a missing page.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # Best effort: report the failure and let the crawl continue.
        print('%s 请求失败: %s' % (url, exc))
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_index(index_page):
    """Yield the absolute detail-page URLs found in a list page.

    Yields nothing when *index_page* is None or empty (failed download).
    """
    if not index_page:
        return
    urls = re.findall(r'class="items".*?href="(.*?)"', index_page, re.S)
    for url in urls:
        if not url.startswith('http'):
            # Relative link: prefix it with the site root.
            url = 'http://www.xiaohuar.com' + url
        yield url


def parse_detail(detail_page):
    """Yield the mp4 URL of a detail page, if it has one.

    Only the first ``id="media"`` match is considered, and only when its
    ``src`` ends with ``mp4``. Yields nothing for a None or empty page.
    """
    if not detail_page:
        return
    matches = re.findall(r'id="media".*?src="(.*?)"', detail_page, re.S)
    if matches:
        movie_url = matches[0]
        if movie_url.endswith('mp4'):
            yield movie_url


def get_movie(url):
    """Download *url* into ``movie_path`` under an md5-derived file name.

    The name hashes the current time together with the URL. Failures are
    reported on stdout and otherwise ignored.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print('%s 请求失败: %s' % (url, exc))
        return
    if response.status_code != 200:
        return

    m = hashlib.md5()
    m.update(str(time.time()).encode('utf-8'))
    m.update(url.encode('utf-8'))
    # os.path.join replaces the old '%s\%s.mp4' template, whose '\%' is an
    # invalid escape sequence.
    filepath = os.path.join(movie_path, '%s.mp4' % m.hexdigest())
    try:
        os.makedirs(movie_path, exist_ok=True)
        with open(filepath, 'wb') as f:
            f.write(response.content)
    except OSError as exc:
        print('%s 下载失败: %s' % (url, exc))
        return
    print('%s 下载成功' % url)


def main():
    """Crawl list pages 0-4 and download every movie they lead to."""
    base_url = 'http://www.xiaohuar.com/list-3-{page_num}.html'
    for i in range(5):
        index_page = get_page(base_url.format(page_num=i))
        for detail_url in parse_index(index_page):
            detail_page = get_page(detail_url)
            for movie_url in parse_detail(detail_page):
                get_movie(movie_url)


if __name__ == '__main__':
    main()
# Thread-pool crawler: the same pipeline as the sequential version, but every
# download runs in a worker thread and each stage is chained to the next one
# through a done-callback.
import hashlib
import os
import re
import threading
import time
from concurrent.futures import ThreadPoolExecutor

import requests  # pip3 install requests

pool = ThreadPoolExecutor(50)
movie_path = r'C:\mp4'
REQUEST_TIMEOUT = 10  # seconds; a hung request would otherwise pin a worker forever

# Number of submitted tasks whose work AND callback have not finished yet.
# main() waits for it to drop to zero: if main() simply returned, interpreter
# shutdown would begin and the callbacks' pool.submit() calls would be
# rejected with RuntimeError on current Python versions.
_in_flight = 0
_all_done = threading.Condition()


def _submit(func, arg, callback=None):
    """Run ``func(arg)`` in the pool, then ``callback(future)``, tracking both."""
    global _in_flight
    with _all_done:
        _in_flight += 1

    def _finish(future):
        global _in_flight
        try:
            if callback is not None:
                callback(future)
        finally:
            # Decrement only after the callback ran, so tasks it submitted
            # are already counted and the total cannot hit zero early.
            with _all_done:
                _in_flight -= 1
                if _in_flight == 0:
                    _all_done.notify_all()

    pool.submit(func, arg).add_done_callback(_finish)


def get_page(url):
    """Fetch *url* and return its text, or None on failure / non-200 status."""
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print('%s 请求失败: %s' % (url, exc))
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_index(index_page):
    """Done-callback for a list page: schedule a fetch of every detail page.

    *index_page* is the finished future of a ``get_page`` call.
    """
    index_page = index_page.result()
    if not index_page:
        # The download failed; nothing to parse.
        return
    urls = re.findall(r'class="items".*?href="(.*?)"', index_page, re.S)
    for detail_url in urls:
        if not detail_url.startswith('http'):
            # Relative link: prefix it with the site root.
            detail_url = 'http://www.xiaohuar.com' + detail_url
        _submit(get_page, detail_url, parse_detail)


def parse_detail(detail_page):
    """Done-callback for a detail page: schedule the mp4 download, if any.

    *detail_page* is the finished future of a ``get_page`` call.
    """
    detail_page = detail_page.result()
    if not detail_page:
        return
    matches = re.findall(r'id="media".*?src="(.*?)"', detail_page, re.S)
    if matches:
        movie_url = matches[0]
        if movie_url.endswith('mp4'):
            _submit(get_movie, movie_url)


def get_movie(url):
    """Download *url* into ``movie_path`` under an md5-derived file name.

    The name hashes the current time together with the URL. Failures are
    reported on stdout and otherwise ignored.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print('%s 请求失败: %s' % (url, exc))
        return
    if response.status_code != 200:
        return

    m = hashlib.md5()
    m.update(str(time.time()).encode('utf-8'))
    m.update(url.encode('utf-8'))
    # os.path.join replaces the old '%s\%s.mp4' template, whose '\%' is an
    # invalid escape sequence.
    filepath = os.path.join(movie_path, '%s.mp4' % m.hexdigest())
    try:
        os.makedirs(movie_path, exist_ok=True)
        with open(filepath, 'wb') as f:
            f.write(response.content)
    except OSError as exc:
        print('%s 下载失败: %s' % (url, exc))
        return
    print('%s 下载成功' % url)


def main():
    """Start the crawl for list pages 0-4 and block until the pipeline drains."""
    base_url = 'http://www.xiaohuar.com/list-3-{page_num}.html'
    for i in range(5):
        _submit(get_page, base_url.format(page_num=i), parse_index)

    with _all_done:
        while _in_flight:
            _all_done.wait()
    pool.shutdown()


if __name__ == '__main__':
    main()
5. requests模块的详细用法
import requests
from urllib.parse import urlencode
5.1 请求头中要有 'User-Agent'
# Every request below sends the same desktop-browser User-Agent, so the value
# is defined once instead of being repeated in each call.
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
)

# 5.1 (continued) Build the query string by hand with urlencode and send the
# request with a User-Agent header; the result page is saved to a.html.
keyword = input('>>: ').strip()
res = urlencode({'wd': keyword}, encoding='utf-8')
url = 'https://www.baidu.com/s?' + res
print(url)

response = requests.get(url, headers={'User-Agent': USER_AGENT})
print(response.status_code)
with open('a.html', 'w', encoding='utf-8') as f:
    f.write(response.text)

# 5.2 URL query data can be passed through `params` instead of being encoded
# by hand.
keyword = input('>>: ').strip()
response = requests.get(
    'https://www.baidu.com/s?',
    params={
        'wd': keyword,
        'pn': 20,
    },
    headers={'User-Agent': USER_AGENT},
)
print(response.status_code)
with open('a.html', 'w', encoding='utf-8') as f:
    f.write(response.text)

response = requests.get(
    'https://www.zhihu.com/explore',
    headers={'User-Agent': USER_AGENT},
)
print(response.status_code)
print(response.text)

# 5.3 A cookie can be sent as a plain request header.
# NOTE(review): the Cookie value below is a captured login session that was
# pasted into the notes verbatim. Never commit a live session cookie; replace
# it with one copied from your own browser when running this example.
# NOTE(review): '[email protected]' looks like a redaction placeholder left by the
# page this snippet was copied from - substitute the account's real email.
response = requests.get(
    url='https://github.com/settings/emails',
    headers={
        'User-Agent': USER_AGENT,
        "Cookie": "_octo=GH1.1.9390043.1516008745; _ga=GA1.2.674621867.1516008745; _gat=1; tz=Asia%2FShanghai; user_session=gPR8zXuoKMY-h9R6WVlju1xa-jBcNRpEoWNOa9k3B922hlqy; __Host-user_session_same_site=gPR8zXuoKMY-h9R6WVlju1xa-jBcNRpEoWNOa9k3B922hlqy; logged_in=yes; dotcom_user=egonLin; _gh_sess=eyJsYXN0X3dyaXRlIjoxNTE2MDA4Nzc1NTkyLCJmbGFzaCI6eyJkaXNjYXJkIjpbXSwiZmxhc2hlcyI6eyJhbmFseXRpY3NfZGltZW5zaW9uIjp7Im5hbWUiOiJkaW1lbnNpb241IiwidmFsdWUiOiJMb2dnZWQgSW4ifX19LCJzZXNzaW9uX2lkIjoiMzllOGI4NjI4ODdjMTFlMmEyYTg5ZDUyMmU0NzQ4ODEifQ%3D%3D--37b89b9cb319fba0e2f7df68bcffe1a56bea7a41",
    },
)
print('[email protected]' in response.text)

# 5.4 Cookies can also be passed outside the headers, through `cookies=`.
response = requests.get(
    url='https://github.com/settings/emails',
    headers={'User-Agent': USER_AGENT},
    cookies={
        "k1": "v1",
    },
)
print('[email protected]' in response.text)

# 5.5 `allow_redirects` controls whether the request follows redirects
# (False here, so they are not followed).
response = requests.get(
    url='https://github.com/settings/emails',
    headers={'User-Agent': USER_AGENT},
    cookies={
        "k1": "v1",
    },
    allow_redirects=False,
)
6.小练习
import re

import requests

USER_AGENT = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
)

# Step 1: fetch the login page to obtain authenticity_token and the cookies.
#   1 request url:    https://github.com/login
#   2 request method: GET
#   3 request headers:
#       User-Agent
r1 = requests.get(
    'https://github.com/login',
    headers={'User-Agent': USER_AGENT},
)
# re.search makes a missing token an explicit error instead of the bare
# IndexError that re.findall(...)[0] would raise.
token_match = re.search(
    r'name="authenticity_token".*?value="(.*?)"', r1.text, re.S
)
if token_match is None:
    raise RuntimeError('authenticity_token not found in the login page')
authenticity_token = token_match.group(1)
r1_cookies = r1.cookies.get_dict()
print(authenticity_token)
print(r1_cookies)

# Step 2: submit the form data to log in.
#   1 request url:    https://github.com/session
#   2 request method: POST
#   3 request headers:
#       Referer: https://github.com/
#       User-Agent
#   4 request body:
#       commit: Sign in
#       utf8: ?
#       authenticity_token: pFLyO9choCgUd6mm1AMP7BoeEQ613TRDe49MBREZ7EU7MKM7IELFgmyGfcKXS0hsaIiGJ8YlkTD5nwwV4tebig==
#       login: <your account email>
#       password: <your password>
# NOTE(review): the "?" sent for utf8 may be a character that was garbled when
# these notes were copied - compare with the form the browser submits.
# NOTE(review): '[email protected]' looks like a redaction placeholder; replace the
# login and password below with your own credentials before running.
r2 = requests.post(
    'https://github.com/session',
    headers={
        "Referer": "https://github.com/",
        'User-Agent': USER_AGENT,
    },
    cookies=r1_cookies,
    data={
        "commit": "Sign in",
        'utf8': "?",
        "authenticity_token": authenticity_token,
        "login": "[email protected]",
        "password": "66666",
    },
    # Do not follow the redirect, so the cookies read below come from the
    # login response itself.
    allow_redirects=False,
)

# Debugging aids: r2.status_code shows the login result; when redirects are
# allowed, r2.history holds the responses received before the final page.

# Step 3: reuse the cookies set by the login response to open a page that
# requires authentication, then check whether the account email appears.
cookies = r2.cookies.get_dict()
r3 = requests.get(
    'https://github.com/settings/emails',
    headers={
        "Referer": "https://github.com/",
        'User-Agent': USER_AGENT,
    },
    cookies=cookies,
)
print('[email protected]' in r3.text)