Crawler Performance Analysis
For a crawler, Python offers three main ways to fetch pages concurrently: processes, threads, and coroutines.
The cost is almost entirely IO: in single-process, single-thread mode each URL request blocks until the response arrives, so the whole crawl is only as fast as the sum of its waits.
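For comparison, a minimal sketch of that serial baseline (the URLs are only placeholders); every request has to finish before the next one starts:

import time
import requests

def fetch_serial(url_list):
    # each get() blocks until the response arrives, so the total time is
    # roughly the sum of all the individual round trips
    start = time.time()
    for url in url_list:
        response = requests.get(url)
        print(url, response.status_code)
    print('elapsed:', time.time() - start)

fetch_serial(['http://www.baidu.com', 'http://www.bing.com'])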
1. Multi-process execution
Concurrency is achieved, but between sending a request and receiving its response the process sits idle.
Two ways to write it:
1) Multi-process, handling the result directly in the task
from concurrent.futures import ProcessPoolExecutor
import requests

def task(url):
    response = requests.get(url)
    print(url, response)
    # parse the response here, e.g. with a regular expression
    return response

# on Windows/macOS, guard the pool setup below with `if __name__ == '__main__':`
pool = ProcessPoolExecutor(7)
url_list = [
    'http://www.cnblogs.com/wupeiqi',
    'http://huaban.com/favorite/beauty/',
    'http://www.bing.com',
    'http://www.zhihu.com',
    'http://www.sina.com',
    'http://www.baidu.com',
    'http://www.autohome.com.cn',
]

for url in url_list:
    pool.submit(task, url)

pool.shutdown(wait=True)
2) Multi-process, handling the result in a callback
from concurrent.futures import ProcessPoolExecutor
import requests

def task(url):
    response = requests.get(url)
    return response

def done(future, *args, **kwargs):
    response = future.result()
    print(response.status_code, response.content)

pool = ProcessPoolExecutor(7)
url_list = [
    'http://www.cnblogs.com/wupeiqi',
    'http://huaban.com/favorite/beauty/',
    'http://www.bing.com',
    'http://www.zhihu.com',
    'http://www.sina.com',
    'http://www.baidu.com',
    'http://www.autohome.com.cn',
]
for url in url_list:
    v = pool.submit(task, url)
    v.add_done_callback(done)

pool.shutdown(wait=True)
2. Multi-thread execution
The crawler achieves concurrency, but between sending a request and receiving its response the thread sits idle.
Two ways to write it:
1) Multi-thread, handling the result directly in the task
from concurrent.futures import ThreadPoolExecutor
import requests

def task(url):
    response = requests.get(url)
    print(url, response)
    # parse the response here, e.g. with a regular expression

pool = ThreadPoolExecutor(7)
url_list = [
    'http://www.cnblogs.com/wupeiqi',
    'http://huaban.com/favorite/beauty/',
    'http://www.bing.com',
    'http://www.zhihu.com',
    'http://www.sina.com',
    'http://www.baidu.com',
    'http://www.autohome.com.cn',
]
for url in url_list:
    pool.submit(task, url)

pool.shutdown(wait=True)
2) Multi-thread, handling the result in a callback
from concurrent.futures import ThreadPoolExecutor
import requests

def task(url):
    """Download the page at url and return the response."""
    response = requests.get(url)
    return response

def done(future, *args, **kwargs):
    response = future.result()
    print(response.status_code, response.content)

pool = ThreadPoolExecutor(7)
url_list = [
    'http://www.cnblogs.com/wupeiqi',
    'http://huaban.com/favorite/beauty/',
    'http://www.bing.com',
    'http://www.zhihu.com',
    'http://www.sina.com',
    'http://www.baidu.com',
    'http://www.autohome.com.cn',
]
for url in url_list:
    v = pool.submit(task, url)
    v.add_done_callback(done)

pool.shutdown(wait=True)
All of the approaches above raise request throughput. Their drawback is that whenever IO blocks, the waiting threads or processes are simply wasted.
3. Asynchronous non-blocking modules
Coroutines (micro-threads) + asynchronous IO: a single thread can keep N HTTP requests in flight.
# Example 1: asyncio.sleep stands in for an IO-bound request
import asyncio

@asyncio.coroutine
def task():
    print('before...task......')
    yield from asyncio.sleep(5)  # in a real crawler this would be an HTTP request; asyncio speaks TCP natively
    print('end...task......')

tasks = [task(), task()]

loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.gather(*tasks))
loop.close()
# Example 2: asyncio, building the HTTP request by hand over a TCP connection
import asyncio

@asyncio.coroutine
def task(host, url='/'):
    print('start', host, url)
    reader, writer = yield from asyncio.open_connection(host, 80)

    request_header_content = "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n" % (url, host,)
    request_header_content = bytes(request_header_content, encoding='utf-8')

    writer.write(request_header_content)
    yield from writer.drain()
    text = yield from reader.read()
    print('end', host, url, text)
    writer.close()

tasks = [
    task('www.cnblogs.com', '/gregoryli/'),
    task('dig.chouti.com', '/pic/show?nid=4073644713430508&lid=10273091')
]

loop = asyncio.get_event_loop()
results = loop.run_until_complete(asyncio.gather(*tasks))
loop.close()
# Example 3: asyncio + aiohttp (written against the old aiohttp API;
# newer aiohttp versions use aiohttp.ClientSession, see the async/await sketch later in this section)
import aiohttp
import asyncio

@asyncio.coroutine
def fetch_async(url):
    print(url)
    response = yield from aiohttp.request('GET', url)
    print(url, response)
    response.close()

tasks = [fetch_async('http://www.baidu.com/'), fetch_async('http://www.chouti.com/')]

event_loop = asyncio.get_event_loop()
results = event_loop.run_until_complete(asyncio.gather(*tasks))
event_loop.close()
# Example 4: asyncio + requests (requests is blocking, so it runs in a thread pool via run_in_executor)
import asyncio
import requests

@asyncio.coroutine
def task(func, *args):
    print(func, args)
    loop = asyncio.get_event_loop()
    future = loop.run_in_executor(None, func, *args)  # e.g. requests.get('http://www.cnblogs.com/wupeiqi/')
    response = yield from future
    print(response.url, response.content)

tasks = [
    task(requests.get, 'http://www.cnblogs.com/gregoryli/'),
    task(requests.get, 'http://dig.chouti.com/pic/show?nid=4073644713430508&lid=10273091')
]

loop = asyncio.get_event_loop()
results = loop.run_until_complete(asyncio.gather(*tasks))
loop.close()
# gevent + requests: monkey-patching makes the sockets inside requests cooperative
import gevent
from gevent import monkey
monkey.patch_all()  # patch before importing requests so its sockets yield instead of blocking
import requests

def task(method, url, req_kwargs):
    print(method, url, req_kwargs)
    response = requests.request(method=method, url=url, **req_kwargs)
    print(response.url, response.content)

# ##### send the requests #####
# gevent.joinall([
#     gevent.spawn(task, method='get', url='https://www.python.org/', req_kwargs={}),
#     gevent.spawn(task, method='get', url='https://www.yahoo.com/', req_kwargs={}),
#     gevent.spawn(task, method='get', url='https://github.com/', req_kwargs={}),
# ])

# ##### send the requests (a coroutine pool caps the number of concurrent greenlets) #####
from gevent.pool import Pool
pool = Pool(5)
gevent.joinall([
    pool.spawn(task, method='get', url='https://www.python.org/', req_kwargs={}),
    pool.spawn(task, method='get', url='https://www.yahoo.com/', req_kwargs={}),
    pool.spawn(task, method='get', url='https://www.github.com/', req_kwargs={}),
])
# gevent + requests packaged together: grequests
import grequests

request_list = [
    grequests.get('http://httpbin.org/delay/1', timeout=0.001),
    grequests.get('http://fakedomain/'),
    grequests.get('http://httpbin.org/status/500')
]

# ##### execute and collect the list of responses #####
response_list = grequests.map(request_list, size=5)
print(response_list)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Twisted: deferreds + the reactor event loop
from twisted.internet import defer
from twisted.web.client import getPage
from twisted.internet import reactor

def one_done(arg):
    print(arg)

def all_done(arg):
    print('done')
    reactor.stop()

@defer.inlineCallbacks
def task(url):
    res = getPage(bytes(url, encoding='utf8'))  # send the HTTP request
    res.addCallback(one_done)
    yield res

url_list = [
    'http://www.cnblogs.com',
    'http://www.cnblogs.com',
    'http://www.cnblogs.com',
    'http://www.cnblogs.com',
]

defer_list = []  # a list of Deferreds, each with its request already in flight
for url in url_list:
    v = task(url)
    defer_list.append(v)

d = defer.DeferredList(defer_list)
d.addBoth(all_done)

reactor.run()  # the event loop; runs until reactor.stop() is called
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Tornado: AsyncHTTPClient + the IOLoop event loop
from tornado.httpclient import AsyncHTTPClient
from tornado.httpclient import HTTPRequest
from tornado import ioloop

COUNT = 0

def handle_response(response):
    global COUNT
    COUNT -= 1
    if response.error:
        print("Error:", response.error)
    else:
        print(response.body)
    # same idea as the Twisted example: stop the loop once every request has returned
    if COUNT == 0:
        ioloop.IOLoop.current().stop()

def func():
    url_list = [
        'http://www.baidu.com',
        'http://www.bing.com',
    ]
    global COUNT
    COUNT = len(url_list)
    for url in url_list:
        print(url)
        http_client = AsyncHTTPClient()
        http_client.fetch(HTTPRequest(url), handle_response)

ioloop.IOLoop.current().add_callback(func)
ioloop.IOLoop.current().start()  # the event loop
To recap the examples above:
- asyncio
  - Example 1: asyncio.sleep(5)
  - Example 2: build the HTTP request by hand
  - Example 3: asyncio + aiohttp
    aiohttp builds the HTTP request for you: pip3 install aiohttp
  - Example 4: asyncio + requests
    requests builds the HTTP request for you: pip3 install requests
- gevent, greenlet + asynchronous IO
  pip3 install greenlet
  pip3 install gevent
  - Example 1: gevent + requests
  - Example 2: gevent (with a coroutine pool capping the number of concurrent requests) + requests
  - Example 3: gevent + requests rolled into one package: grequests
    pip3 install grequests
- Twisted (pip3 install twisted): deferreds plus the reactor event loop
- Tornado (pip3 install tornado): AsyncHTTPClient plus the IOLoop
A rough efficiency ranking: gevent > Twisted > Tornado > asyncio
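The asyncio examples above use the legacy @asyncio.coroutine / yield from style. A minimal sketch of the same kind of fetch using the async/await syntax available since Python 3.5 and aiohttp's ClientSession API (the URLs are just placeholders):

import asyncio
import aiohttp

async def fetch_async(url):
    # one ClientSession can serve many requests and pools connections
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            print(url, response.status)
            return await response.text()

async def main():
    urls = ['http://www.baidu.com/', 'http://www.bing.com/']
    await asyncio.gather(*(fetch_async(u) for u in urls))

asyncio.run(main())  # asyncio.run requires Python 3.7+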
4. Sockets
1. Socket client and server
By default, connect() and recv() block until the connection completes or data arrives.
setblocking(0): if nothing is ready yet (the connection has not completed; no data has come back), the call raises an error instead of waiting.
What an HTTP request boils down to, blocking version:
import socket

sk = socket.socket()

# 1. connect
sk.connect(('www.baidu.com', 80,))  # blocks on IO
print('connected...')

# 2. once connected, send the request
sk.send(b'GET / HTTP/1.0\r\nHost:www.baidu.com\r\n\r\n')
# sk.send(b'POST / HTTP/1.0\r\nHost:www.baidu.com\r\n\r\nk1=v1&k2=v2')

# 3. wait for the server's response
data = sk.recv(8096)  # response headers and body; blocks on IO
print(data)

# close the connection
sk.close()
2. IO multiplexing
Client-side idea (sketch):

try:
    sock1.connect()
    sock2.connect()
    sock3.connect()
except Exception as e:
    pass

while True:
    r, w, e = select.select([sock1, sock2, sock3], [sock1, sock2, sock3], [], 0.05)
    # r = [sock1, ...]  sockets that have data ready for us to read
    data = sock1.recv(8096)
    # w = [sock1, ...]  sockets whose connection has been established
    sock1.send(b'GET /index HTTP/1.0\r\nHost: baidu.com\r\n\r\n')

What an HTTP request boils down to, non-blocking version:
import socket

sk = socket.socket()
sk.setblocking(False)

# 1. connect
try:
    sk.connect(('www.baidu.com', 80,))  # returns immediately; raises instead of blocking
    print('connected...')
except BlockingIOError as e:
    print(e)

# 2. send the request once connected
# NOTE: until the connection is actually established, send() and recv() on a
# non-blocking socket also raise -- which is exactly why IO multiplexing (select)
# is needed to find out when each socket is ready.
sk.send(b'GET / HTTP/1.0\r\nHost:www.baidu.com\r\n\r\n')
# sk.send(b'POST / HTTP/1.0\r\nHost:www.baidu.com\r\n\r\nk1=v1&k2=v2')

# 3. wait for the server's response
data = sk.recv(8096)
print(data)

# close the connection
sk.close()
3. What select expects of the objects it is given

class Foo:
    def __init__(self):
        self.obj = socket.socket()
    def fileno(self):
        return self.obj.fileno()

r, w, e = select.select([sock1, sock2, Foo()], [], [])
# every object passed to select must expose a fileno() method that returns a file descriptor

Key points:
a. Internally, select calls .fileno() on each object it is handed.
b. Foo() wraps a socket and exposes that socket's file descriptor through fileno() (a minimal runnable sketch follows).
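A minimal runnable sketch of that idea, with illustrative host names and a hypothetical SockWrapper class; select only ever asks the wrapper for its file descriptor:

import select
import socket

class SockWrapper:
    """Any object whose fileno() returns a real file descriptor can be handed to select."""
    def __init__(self, host, port=80):
        self.sock = socket.socket()
        self.sock.setblocking(False)
        try:
            self.sock.connect((host, port))
        except BlockingIOError:
            pass  # expected: the non-blocking connect is still in progress

    def fileno(self):
        return self.sock.fileno()

conns = [SockWrapper('www.baidu.com'), SockWrapper('cn.bing.com')]
# select calls fileno() on each wrapper and reports the sockets that have finished connecting (writable)
_, writable, _ = select.select([], conns, [], 5)
for w in writable:
    print('connected:', w.sock.getpeername())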
IO multiplexing: r, w, e = select.select(...) inside a while loop, watching many socket objects at once.
Asynchronous IO: non-blocking sockets + IO multiplexing
- non-blocking sockets
- select over our own wrapper objects, giving back the writable (w) and readable (r) lists
Putting the two together yields a hand-rolled asynchronous request module:
import socket
import select

class HttpRequest:
    def __init__(self, sk, host, callback):
        self.socket = sk
        self.host = host
        self.callback = callback

    def fileno(self):
        return self.socket.fileno()

class HttpResponse:
    def __init__(self, recv_data):
        self.recv_data = recv_data
        self.header_dict = {}
        self.body = None
        self.initialize()

    def initialize(self):
        headers, body = self.recv_data.split(b'\r\n\r\n', 1)
        self.body = body
        header_list = headers.split(b'\r\n')
        for h in header_list:
            h_str = str(h, encoding='utf-8')
            v = h_str.split(':', 1)
            if len(v) == 2:
                self.header_dict[v[0]] = v[1]

class AsyncRequest:
    def __init__(self):
        self.conn = []        # all outstanding requests, checked for readability
        self.connection = []  # requests whose connection has not yet completed, checked for writability

    def add_request(self, host, callback):
        try:
            sk = socket.socket()
            sk.setblocking(0)  # non-blocking
            sk.connect((host, 80,))
        except BlockingIOError as e:
            pass
        request = HttpRequest(sk, host, callback)
        self.conn.append(request)
        self.connection.append(request)

    def run(self):
        while True:
            rlist, wlist, elist = select.select(self.conn, self.connection, self.conn, 0.05)
            for w in wlist:
                # showing up in wlist means the socket has finished connecting
                print(w.host, 'connected...')
                tpl = "GET / HTTP/1.0\r\nHost:%s\r\n\r\n" % (w.host,)
                w.socket.send(bytes(tpl, encoding='utf-8'))
                self.connection.remove(w)
            for r in rlist:
                # r is an HttpRequest whose socket has data to read
                recv_data = bytes()
                while True:
                    try:
                        chunk = r.socket.recv(8096)
                        recv_data += chunk
                    except Exception as e:
                        break
                print(r.host, 'response received...', recv_data)
                response = HttpResponse(recv_data)
                r.callback(response)
                r.socket.close()
                self.conn.remove(r)
            if len(self.conn) == 0:
                break

def f1(response):
    print('save to a file', response.header_dict)

def f2(response):
    print('save to the database', response.header_dict)

url_list = [
    {'host': 'www.baidu.com', 'callback': f1},
    {'host': 'cn.bing.com', 'callback': f2},
    {'host': 'www.cnblogs.com', 'callback': f2},
]

req = AsyncRequest()
for item in url_list:
    req.add_request(item['host'], item['callback'])

req.run()