5-4: Scrapy image crawling and using downloader middleware
Posted by 吴平凡
01
—
Scrapy image crawling
The spider scrapes image URLs from the 站长素材 gallery (sc.chinaz.com/tupian). Because the page lazy-loads its images, the real URL has to be read from the pseudo attribute src2:
import scrapy
from imgsPro.items import ImgsproItem


class ImgSpider(scrapy.Spider):
    name = 'img'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://sc.chinaz.com/tupian/']

    def parse(self, response):
        div_list = response.xpath('//*[@id="container"]/div')
        for div in div_list:
            # Note: read the pseudo attribute src2 (the page lazy-loads its images)
            src1 = 'https:' + div.xpath('./div[1]/a/img/@src2').extract_first()
            src = src1.split('_')[0] + '.jpg'
            print(src)
            item = ImgsproItem()
            item['src'] = src
            yield item
A custom ImagesPipeline subclass then requests each image URL and decides the file name it is stored under:
from scrapy.pipelines.images import ImagesPipeline
import scrapy


class imgsPipeline(ImagesPipeline):
    # issue a request for the image data based on the image URL in the item
    def get_media_requests(self, item, info):
        yield scrapy.Request(item['src'])

    # specify the file name the image is stored under (inside IMAGES_STORE)
    def file_path(self, request, response=None, info=None):
        imgName = request.url.split('/')[-1]
        return imgName

    def item_completed(self, results, item, info):
        return item  # hand the item on to the next pipeline class to be executed
Finally, IMAGES_STORE tells the pipeline which directory to store the downloaded images in:
IMAGES_STORE = './imgs'
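For completeness, here is a minimal sketch of the item definition and the pipeline registration the code above relies on. It assumes the project is named imgsPro and that the imgsPipeline class lives in imgsPro/pipelines.py; the priority 300 is just the conventional default:
# items.py
import scrapy

class ImgsproItem(scrapy.Item):
    src = scrapy.Field()  # the reconstructed image URL yielded by the spider

# settings.py
ITEM_PIPELINES = {
    'imgsPro.pipelines.imgsPipeline': 300,
}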
02
—
Downloader middleware
A downloader middleware sits between the engine and the downloader. Here process_request spoofs the User-Agent and attaches a proxy, and process_exception swaps in a fresh proxy and resends requests that failed:
from scrapy import signals
import random
from itemadapter import is_item, ItemAdapter


class MiddleproDownloaderMiddleware:
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"]
    # pool of proxy IPs to choose from
    PROXY_http = [
        '153.180.102.104:80',
        '195.208.131.189:56055',
    ]
    PROXY_https = [
        '120.83.49.90:9000',
        '95.189.112.214:35508',
    ]

    # intercept outgoing requests
    def process_request(self, request, spider):
        # spoof the User-Agent
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
        # fixed proxy set here only to verify that the proxy mechanism works
        request.meta['proxy'] = 'https://49.70.95.200:9999'
        # closing the spider here would stop the crawl after the first request,
        # so keep this commented out unless that is what you want:
        # spider.crawler.engine.close_spider(spider, 'closespider')
        return None

    # intercept requests that raised an exception
    def process_exception(self, request, exception, spider):
        if request.url.split(':')[0] == 'http':  # scheme of the failed request
            # switch to a random proxy of the matching scheme
            request.meta['proxy'] = 'http://' + random.choice(self.PROXY_http)
        else:
            request.meta['proxy'] = 'https://' + random.choice(self.PROXY_https)
        return request  # resend the corrected request
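The middleware only takes effect once it is registered in settings.py. A minimal sketch, assuming the project is named middlePro and the class lives in middlePro/middlewares.py (543 is the priority Scrapy's project template uses):
# settings.py
DOWNLOADER_MIDDLEWARES = {
    'middlePro.middlewares.MiddleproDownloaderMiddleware': 543,
}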
A fuller example: a spider for NetEase news (news.163.com). The news titles on each section page are loaded dynamically, so a downloader middleware combined with Selenium will later swap in a rendered response. First, the spider skeleton:
import scrapy


class WangyiSpider(scrapy.Spider):
    name = 'wangyi'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://news.163.com/']
    module_urls = []  # stores the URLs of the five section pages

    # parse the URLs of the five section pages from the home page
    def parse(self, response):
        li_list = response.xpath('//*[@id="index2016_wrap"]/div[1]/div[2]/div[2]/div[2]/div[2]/div/ul/li')
        alist = [3, 4, 6, 7, 8]
        for index in alist:
            module_url = li_list[index].xpath('./a/@href').extract_first()
            self.module_urls.append(module_url)
        for url in self.module_urls:  # send a request to every section URL
            yield scrapy.Request(url, callback=self.parse_module)

    # the news titles in every section are loaded dynamically
    def parse_module(self, response):  # parse each section page for news titles and detail-page URLs
        pass
Next, the downloader middleware skeleton. It intercepts only the five section responses and replaces each with a new response object that actually contains the dynamically loaded news data:
from scrapy import signals
from itemadapter import is_item, ItemAdapter
from time import sleep
from scrapy.http import HtmlResponse


class WangyiproDownloaderMiddleware:
    def process_request(self, request, spider):
        return None

    # intercept the responses of the five section pages and tamper with them
    def process_response(self, request, response, spider):
        # pick out the responses that need tampering:
        # the url identifies the request, the request identifies the response
        if request.url in spider.module_urls:
            # response  # one of the five section responses
            # build a new response object that meets the requirement (it contains the
            # dynamically loaded news data); page_text is obtained via Selenium, shown next
            new_response = HtmlResponse(url=request.url, body=page_text,
                                        encoding='utf-8', request=request)
            return new_response
        else:
            # response  # the responses of all other requests
            return response

    def process_exception(self, request, exception, spider):
        pass
The missing pieces are added step by step (the complete files follow below):
# (1) In the spider: create one shared browser object and close it when the spider finishes
from selenium import webdriver

def __init__(self):
    self.browser = webdriver.Chrome(executable_path='D:\学习\编程\2020爬虫全套教程\第七章,动态加载数据处理\chromedriver.exe')

def closed(self, spider):
    self.browser.quit()

# (2) In process_response of the middleware: load the section URL with Selenium so that
#     the page source contains the dynamically loaded news data
browser.get(request.url)
sleep(2)
page_text = browser.page_source
new_response = HtmlResponse(url=request.url, body=page_text, encoding='utf-8', request=request)

# (3) In parse_module: build the item and request the news detail page, passing the item along via meta
item = WangyiproItem()
item['title'] = title
yield scrapy.Request(url=news_detail_url, callback=self.parse_detail, meta={'item': item})

# (4) In parse_detail: extract the news content and hand the finished item to the pipeline
def parse_detail(self, response):  # parses the news content
    content = response.xpath('//*[@id="content"]/div[2]/article//text()').extract()
    content = ''.join(content)
    item = response.meta['item']
    item['content'] = content
    yield item  # pass the item to the pipeline
The complete spider:
import scrapy
from selenium import webdriver
from wangyiPro.items import WangyiproItem


class WangyiSpider(scrapy.Spider):
    name = 'wangyi'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://news.163.com/']
    module_urls = []  # stores the URLs of the five section pages

    # instantiate one browser object shared by the whole spider
    def __init__(self):
        self.browser = webdriver.Chrome(executable_path='E:\pywork\小猿圈爬虫课程(2020爬虫全套教程)\第七章:动态加载数据处理\chromedriver.exe')

    # parse the URLs of the five section pages from the home page
    def parse(self, response):
        li_list = response.xpath('//*[@id="index2016_wrap"]/div[1]/div[2]/div[2]/div[2]/div[2]/div/ul/li')
        alist = [3, 4, 6, 7, 8]
        for index in alist:
            module_url = li_list[index].xpath('./a/@href').extract_first()
            self.module_urls.append(module_url)
        for url in self.module_urls:  # send a request to every section URL
            yield scrapy.Request(url, callback=self.parse_module)

    # the news titles in every section are loaded dynamically
    def parse_module(self, response):  # parse each section page for news titles and detail-page URLs
        div_list = response.xpath('/html/body/div/div[3]/div[4]/div[1]/div/div/ul/li/div/div')
        for div in div_list:
            title = div.xpath('./div/div[1]/h3/a/text() | ./div[1]/h3/a/text()').extract_first()
            news_detail_url = div.xpath('./a/@href | ./div[1]/div[1]/h3/a/@href | ./div[1]/h3/a/@href').extract_first()
            print(title, news_detail_url)
            item = WangyiproItem()
            item['title'] = title
            # request the news detail page, passing the item along via meta
            yield scrapy.Request(url=news_detail_url, callback=self.parse_detail, meta={'item': item})

    def parse_detail(self, response):  # parse the news content
        content = response.xpath('//*[@id="content"]/div[2]/article//text() | '
                                 '//*[@id="content"]/div[2]//text() | '
                                 '/html/body/div[2]/div[2]/div[1]/div[2]/img/@src').extract()
        content = ''.join(content).strip()
        item = response.meta['item']
        item['content'] = content
        yield item

    def closed(self, spider):
        self.browser.quit()
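The spider imports WangyiproItem from wangyiPro.items, but the item class itself is not shown above; a minimal sketch with just the two fields the spider uses:
# items.py
import scrapy

class WangyiproItem(scrapy.Item):
    title = scrapy.Field()    # news title, filled in parse_module
    content = scrapy.Field()  # news body, filled in parse_detail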
The complete downloader middleware:
from scrapy import signals
from itemadapter import is_item, ItemAdapter
from time import sleep
from scrapy.http import HtmlResponse


class WangyiproDownloaderMiddleware:
    def process_request(self, request, spider):
        return None

    # intercept the responses of the five section pages and tamper with them
    def process_response(self, request, response, spider):  # spider: the spider object
        browser = spider.browser  # grab the browser object created in the spider class
        # pick out the responses that need tampering:
        # the url identifies the request, the request identifies the response
        if request.url in spider.module_urls:
            browser.get(request.url)  # load the section URL in the browser
            sleep(2)
            page_text = browser.page_source  # now contains the dynamically loaded news data
            # response  # one of the five section responses
            # replace it with a new response object that meets the requirement:
            # Selenium conveniently renders the dynamically loaded data for us
            new_response = HtmlResponse(url=request.url, body=page_text,
                                        encoding='utf-8', request=request)
            return new_response
        else:
            # response  # the responses of all other requests
            return response

    def process_exception(self, request, exception, spider):
        pass
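As before, the middleware has to be registered in settings.py. A minimal sketch, assuming the project is named wangyiPro (matching the import above); the WangyiproPipeline entry is a hypothetical pipeline for persisting the items:
# settings.py
DOWNLOADER_MIDDLEWARES = {
    'wangyiPro.middlewares.WangyiproDownloaderMiddleware': 543,
}
ITEM_PIPELINES = {
    'wangyiPro.pipelines.WangyiproPipeline': 300,  # hypothetical pipeline that stores the news items
}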