Crawler Warm-up: Performance

Posted by gaosy-math


   Single-threaded serial vs. multi-threaded (multi-process) parallel

When writing a crawler, most of the cost is in IO: in single-process, single-thread mode, every URL request blocks until the response arrives, which slows the whole run down.

1. Single-threaded serial

# -*- coding: UTF-8 -*-
import requests, time

def be_async(url):
    print(url)
    res = requests.get(url)
    print(res)

url_list = ["https://www.baidu.com", "http://github.com/"]


a = time.time()  # start time

for url in url_list:
    be_async(url)

b = time.time()  # end time
print("cost time: %s s" % (b - a))

 2. Multi-threaded

# -*- coding: UTF-8 -*-
from concurrent.futures import ThreadPoolExecutor
import requests
import time

def be_async(url):
    print(url)
    res = requests.get(url)
    print(res)

url_list = ["https://www.baidu.com", "http://github.com/"]

pool = ThreadPoolExecutor(4)

a = time.time()  # start time

for url in url_list:
    pool.submit(be_async, url)

pool.shutdown(wait=True)  # block until every submitted task finishes

b = time.time()  # end time
print("cost time: %s s" % (b - a))

 + With a callback

# -*- coding: UTF-8 -*-
import requests, time
from concurrent.futures import ThreadPoolExecutor

def be_async(url):
    print(url)
    res = requests.get(url)
    print(res)

url_list = ["https://www.baidu.com", "http://github.com/"]


def callback(future):
    print(future.result())  # result() returns be_async's return value (None here)


pool = ThreadPoolExecutor(4)

a = time.time()  # start time

for url in url_list:
    r = pool.submit(be_async, url)
    r.add_done_callback(callback)

pool.shutdown(wait=True)

b = time.time()  # end time
print("cost time: %s s" % (b - a))
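
The examples above only print inside the worker; if you also want the Response objects back in the main thread, concurrent.futures.as_completed yields each future as it finishes. A minimal sketch along the same lines (the fetch helper and the status-code printing are illustrative additions, not from the original):

# A sketch, not from the original: collect responses as they finish.
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests

url_list = ["https://www.baidu.com", "http://github.com/"]

def fetch(url):
    # hypothetical helper that returns the Response object
    return requests.get(url)

with ThreadPoolExecutor(4) as pool:
    futures = {pool.submit(fetch, url): url for url in url_list}
    for future in as_completed(futures):
        res = future.result()  # re-raises any exception from the worker
        print(futures[future], res.status_code)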

 3. Multi-process

# -*- coding: UTF-8 -*-
from concurrent.futures import ProcessPoolExecutor
import requests
import time

def be_async(url):
    print(url)
    res = requests.get(url)
    print(res)

url_list = ["https://www.baidu.com", "http://github.com/"]


if __name__ == "__main__":

    pool = ProcessPoolExecutor(4)

    a = time.time()  # start time

    for url in url_list:
        pool.submit(be_async, url)

    pool.shutdown(wait=True)

    b = time.time()  # end time
    print("cost time: %s s" % (b - a))

 + With a callback

# -*- coding: UTF-8 -*-
import requests, time
from concurrent.futures import ProcessPoolExecutor

def be_async(url):
    print(url)
    res = requests.get(url)
    print(res)

url_list = ["https://www.baidu.com", "http://github.com/"]


def callback(future):
    # the callback runs in the parent process once the worker is done
    print(future.result())

if __name__ == "__main__":

    pool = ProcessPoolExecutor(4)

    a = time.time()  # start time

    for url in url_list:
        r = pool.submit(be_async, url)
        r.add_done_callback(callback)

    pool.shutdown(wait=True)

    b = time.time()  # end time
    print("cost time: %s s" % (b - a))
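
Both executor types share the same interface, so pool.map is a more compact alternative to submit in a loop. Note that for IO-bound fetching a thread pool is usually the better fit: the GIL is released while a thread waits on the socket, whereas a process pool adds pickling and startup overhead. A sketch under those assumptions (fetch_status is an illustrative helper, not from the original):

# A sketch, not from the original: executor.map returns results in input order.
from concurrent.futures import ProcessPoolExecutor
import requests

def fetch_status(url):
    # hypothetical helper returning just the HTTP status code
    return requests.get(url).status_code

url_list = ["https://www.baidu.com", "http://github.com/"]

if __name__ == "__main__":
    with ProcessPoolExecutor(4) as pool:
        for url, status in zip(url_list, pool.map(fetch_status, url_list)):
            print(url, status)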

 

 

   Asynchronous non-blocking IO

 All of the approaches above improve request throughput, but threads and processes have a drawback: each worker sits idle while it is blocked on IO. Asynchronous IO avoids that waste, so it is the preferred choice:

 

1. asyncio example. On its own, asyncio only provides the TCP transport, so the HTTP request has to be composed by hand.

 How it works: a single-threaded event loop multiplexes all the open sockets; while one coroutine waits on IO, the loop switches to another, so nothing sits idle.

Code:

# -*- coding: UTF-8 -*-
import asyncio

async def be_async(host, url="/"):
    print(host, url)
    # Open a raw TCP connection and write an HTTP/1.0 request by hand
    reader, writer = await asyncio.open_connection(host, 80)

    req_header = "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n" % (url, host)
    writer.write(req_header.encode("utf-8"))
    await writer.drain()

    text = await reader.read()  # HTTP/1.0: read until the server closes
    print(host, url, text)
    writer.close()

async def main():
    tasks = [
        be_async("www.cnblogs.com", url="/gaosy-math"),
        be_async("github.com", url="/gaoshao52")
    ]
    await asyncio.gather(*tasks)

asyncio.run(main())

 Note: Python 3.7+ only (asyncio.run and async/await); it will not run on Python 2.

 2. asyncio + aiohttp example

How it works: same as above.

Code:

# -*- coding: UTF-8 -*-
import asyncio
import aiohttp

async def be_async(url):
    print(url)
    # aiohttp.request is used as an async context manager
    async with aiohttp.request("GET", url) as res:
        data = await res.read()
        print(url, data)

async def main():
    tasks = [
        be_async("https://github.com/gaoshao52"),
        be_async("https://www.cnblogs.com/gaosy-math")
    ]
    await asyncio.gather(*tasks)

asyncio.run(main())

 Note: needs Python 3.7+ and a recent aiohttp release; Python 2 is not supported.
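
A real crawler normally caps the number of in-flight requests rather than opening a socket per URL all at once. A minimal sketch using asyncio.Semaphore with a shared ClientSession (the limit of 2 and the session structure are assumptions, not from the original):

# A sketch, not from the original: a semaphore caps concurrent requests.
import asyncio
import aiohttp

async def fetch(session, sem, url):
    async with sem:  # at most N coroutines get past this line at once
        async with session.get(url) as res:
            return url, res.status

async def main(urls):
    sem = asyncio.Semaphore(2)  # arbitrary limit; tune per target site
    async with aiohttp.ClientSession() as session:
        results = await asyncio.gather(*(fetch(session, sem, u) for u in urls))
        for url, status in results:
            print(url, status)

asyncio.run(main(["https://github.com/gaoshao52", "https://www.cnblogs.com/gaosy-math"]))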

 3. asyncio + requests example

How it works: same as above.

Code:

# -*- coding: UTF-8 -*-
import asyncio
import requests

async def be_async(func, *args):
    # Hand the blocking requests call to the loop's default thread pool
    loop = asyncio.get_running_loop()
    res = await loop.run_in_executor(None, func, *args)
    print(res.url, res.text)

async def main():
    tasks = [
        be_async(requests.get, "https://github.com/gaoshao52"),
        be_async(requests.get, "https://www.cnblogs.com/gaosy-math")
    ]
    await asyncio.gather(*tasks)

asyncio.run(main())

 Note: Python 3.7+ only; Python 2 is not supported.
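
Passing None as the first argument of run_in_executor uses the loop's default thread pool; passing your own executor bounds how many blocking calls run at once. A sketch with an arbitrary pool size of 4 (the fetch helper is illustrative):

# A sketch, not from the original: an explicit executor caps the worker threads.
import asyncio
import requests
from concurrent.futures import ThreadPoolExecutor

async def fetch(loop, executor, url):
    res = await loop.run_in_executor(executor, requests.get, url)
    print(res.url, res.status_code)

async def main(urls):
    loop = asyncio.get_running_loop()
    with ThreadPoolExecutor(4) as executor:  # arbitrary size
        await asyncio.gather(*(fetch(loop, executor, u) for u in urls))

asyncio.run(main(["https://github.com/gaoshao52", "https://www.cnblogs.com/gaosy-math"]))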

 4. gevent + requests example

How it works: same as above.

Code:

# -*- coding: UTF-8 -*-
from gevent import monkey

monkey.patch_all()  # patch the standard library before requests is imported

import gevent
import requests

def be_async(method, url, req_kwargs):
    print(method, url, req_kwargs)
    res = requests.request(method=method, url=url, **req_kwargs)
    print(res.url, res.status_code)

gevent.joinall([
    gevent.spawn(be_async, method="GET", url="https://github.com/gaoshao52", req_kwargs={}),
    gevent.spawn(be_async, method="GET", url="https://www.cnblogs.com/gaosy-math", req_kwargs={})
])


# ##### Send requests (a coroutine pool caps the number of greenlets) #####
# from gevent.pool import Pool
# pool = Pool(None)
# gevent.joinall([
#     pool.spawn(be_async, method="GET", url="https://github.com/gaoshao52", req_kwargs={}),
#     pool.spawn(be_async, method="GET", url="https://www.cnblogs.com/gaosy-math", req_kwargs={})
# ])

 Note: the author ran this on Python 2 and hit problems on Python 3; current gevent releases support Python 3 as well.

 5. grequests example

How it works: a wrapper around gevent + requests.

Code:

# -*- coding: UTF-8 -*-

import grequests

request_list = [
    grequests.get("http://www.runoob.com/python/python-json.html"),
    grequests.get("https://www.git-scm.com/download/win")
]

# ##### Send the batch and collect the response list #####
# response_list = grequests.map(request_list)
# print(response_list)


# ##### Send the batch with an exception handler #####
def exception_handler(request, exception):
    print(request, exception)
    print("Request failed")

response_list = grequests.map(request_list, exception_handler=exception_handler)
print(response_list)
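
grequests also provides imap, which yields responses as they complete instead of waiting for the whole batch like map does; a short sketch (the size of 2 is an arbitrary concurrency cap):

# A sketch, not from the original: imap yields responses as they finish.
import grequests

request_list = [
    grequests.get("http://www.runoob.com/python/python-json.html"),
    grequests.get("https://www.git-scm.com/download/win")
]

for res in grequests.imap(request_list, size=2):  # size caps concurrency
    print(res.url, res.status_code)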

 
