使用Python实现同步&异步爬虫
Posted 再认真点
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了使用Python实现同步&异步爬虫相关的知识,希望对你有一定的参考价值。
同步
import requests
from lxml import etree # lxml:一个用xpath语法解析网页的库
from urllib.request import urlretrieve # 下载模块
import os
class DoutulaSpider(object):
    """Synchronous crawler: walks pkdoutu.com listing pages and downloads every emoji image."""

    def __init__(self):
        # {page} placeholder is required by .format(page=x) in run(); the
        # original literal "page=page" was never substituted, so every
        # iteration fetched the same URL.
        self.base_url = "https://www.pkdoutu.com/article/list/?page={page}"
        # Original source lost the surrounding braces (syntax error) and
        # misspelled the attribute as "hearders".
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
        }

    def parser_page(self, url):
        """Fetch one listing page, extract image URLs, and download each into ./emoji/.

        :param url: fully formatted listing-page URL
        """
        resp = requests.get(url, headers=self.headers)
        parser = etree.HTML(resp.text)
        # The original XPath was lost in extraction ("" raises XPathEvalError).
        # pkdoutu lazy-loads images and keeps the real URL in @data-original.
        # TODO(review): confirm this selector against the live page markup.
        emoji_url_list = parser.xpath("//img[@data-original]/@data-original")
        # urlretrieve cannot create the target directory itself.
        os.makedirs("emoji", exist_ok=True)
        for emoji_url in emoji_url_list:
            emoji_name = emoji_url.split("/")[-1]
            emoji_path = os.path.join("emoji", emoji_name)
            urlretrieve(emoji_url, emoji_path)
            print("%s表情下载成功!" % emoji_name)

    def run(self):
        """Crawl listing pages 1 through 10 sequentially."""
        for x in range(1, 11):
            page_url = self.base_url.format(page=x)
            # Parse the page and download every emoji it references.
            self.parser_page(page_url)
if __name__ == "__main__":
    # Entry point: build the crawler and walk all ten listing pages.
    DoutulaSpider().run()
异步
# import requests 异步爬虫不能使用requests库去做,因为requests库是一个同步的。
from lxml import etree # lxml:一个用xpath语法解析网页的库
# from urllib.request import urlretrieve # 下载模块
import os
# 异步发送网络请求:aiohttp
import aiohttp
import aiofiles # 异步文件操作,需要会用到aiofiles这个库
import asyncio
class DoutulaSpider(object):
    """Asynchronous crawler (aiohttp + aiofiles) for emoji images on pkdoutu.com."""

    def __init__(self):
        # {page} placeholder is required by .format(page=x) in run(); the
        # original literal "page=page" was never substituted.
        self.base_url = "https://www.pkdoutu.com/article/list/?page={page}"
        # Original source lost the surrounding braces (syntax error) and
        # misspelled the attribute as "hearders".
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
        }

    async def parser_page(self, url):
        """Fetch one listing page and asynchronously download every emoji on it.

        :param url: fully formatted listing-page URL
        """
        async with self.session.get(url) as resp:
            html = await resp.text()
        parser = etree.HTML(html)
        # The original XPath was lost in extraction ("" raises XPathEvalError).
        # pkdoutu lazy-loads images and keeps the real URL in @data-original.
        # TODO(review): confirm this selector against the live page markup.
        emoji_url_list = parser.xpath("//img[@data-original]/@data-original")
        # aiofiles.open cannot create the target directory itself.
        os.makedirs("emoji", exist_ok=True)
        for emoji_url in emoji_url_list:
            async with self.session.get(emoji_url) as emoji_resp:
                data = await emoji_resp.read()
            emoji_name = emoji_url.split("/")[-1]
            emoji_path = os.path.join("emoji", emoji_name)
            async with aiofiles.open(emoji_path, "wb") as fp:
                await fp.write(data)
            print("%s表情下载成功!" % emoji_name)

    async def run(self):
        """Crawl listing pages 1 through 10, always closing the HTTP session."""
        self.session = aiohttp.ClientSession(headers=self.headers)
        try:
            for x in range(1, 11):
                await self.parser_page(self.base_url.format(page=x))
        finally:
            # Original leaked the session when a request raised mid-crawl.
            await self.session.close()
if __name__ == "__main__":
    # Drive the async crawler to completion on a fresh event loop.
    asyncio.run(DoutulaSpider().run())
以上是关于使用Python实现同步&异步爬虫的主要内容,如果未能解决你的问题,请参考以下文章