Python：使用 Scrapy 文件管道（FilesPipeline）下载图集

Posted 2021-03-05 brady-wang

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了python文件管道下载图集相关的知识，希望对你有一定的参考价值。

# -*- coding: utf-8 -*-
import re
from time import sleep

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class AngelSpider(CrawlSpider):
    """Collect the image URLs of every gallery on angelimg.spbeen.com.

    The crawl rule picks up gallery entry pages (``/ang/<digits>``).  Each
    gallery is paginated with one image per page, so ``parse_item`` keeps
    following the "next" link while carrying a single item dict along in
    ``request.meta``.  The item is yielded once, on the last page, with the
    accumulated ``file_urls`` for the files pipeline to download.
    """

    name = 'angel'
    allowed_domains = ['angelimg.spbeen.com']
    start_urls = ['http://angelimg.spbeen.com/']

    # Kept for backward compatibility; links are now resolved with
    # ``response.urljoin`` instead of string concatenation.
    base_url = "http://angelimg.spbeen.com"
    rules = (
        Rule(
            # Dots are escaped so they only match literal dots in the host.
            LinkExtractor(allow=r'^http://angelimg\.spbeen\.com/ang/\d+$'),
            callback='parse_item',
            follow=False,
        ),
    )

    # Matches everything except CJK ideographs, ASCII digits and ASCII
    # letters; used to strip characters from the gallery directory name.
    _DIR_NAME_STRIP_RE = re.compile(
        u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])"
    )

    def parse_item(self, response):
        """Parse one gallery page and either follow the next page or emit the item.

        Args:
            response: The gallery page.  ``response.meta['item']`` holds the
                partially filled item when this is not the first page.

        Yields:
            A ``scrapy.Request`` for the next page while one exists,
            otherwise the finished item dict with the keys ``files``,
            ``file_urls`` and ``dir_name``.
        """
        self.logger.debug('Parsing gallery page %s', response.url)

        item = response.meta.get('item')
        if not item:
            item = self._new_item(response)

        img_url = response.xpath('.//div[@id="content"]/a/img/@src').extract_first()
        if img_url:
            # urljoin leaves absolute URLs untouched and resolves relative ones.
            item['file_urls'].append(response.urljoin(img_url))
        else:
            # Appending None would make the files pipeline fail on Request(None).
            self.logger.warning('No image found on %s', response.url)

        # Follow the next page if there is one; otherwise hand the item to
        # the pipelines.
        next_url = response.xpath(
            './/div[@class="page"]//a[contains(@class,"next")]/@href'
        ).extract_first()

        if next_url:
            yield scrapy.Request(
                response.urljoin(next_url),
                callback=self.parse_item,
                meta={'item': item},
            )
        else:
            yield item

    def _new_item(self, response):
        """Build the empty item for a gallery from its first page."""
        title = response.xpath(
            './/div[@class="article"]/h2/text()'
        ).extract_first(default='')
        # Drop the "【...】" suffix of the title, then strip every character
        # that is not a CJK ideograph, ASCII digit or ASCII letter.
        dir_name = self._DIR_NAME_STRIP_RE.sub('', title.split('【')[0])
        if not dir_name:
            # Missing or fully stripped title: fall back to the last path
            # segment of the page URL so the gallery still gets a directory.
            dir_name = response.url.rstrip('/').rsplit('/', 1)[-1]
        return {'files': [], 'file_urls': [], 'dir_name': dir_name}

管道（pipelines.py）：继承 Scrapy 的文件管道 FilesPipeline，按图集名称分目录保存文件

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import hashlib
import os

from scrapy.pipelines.files import FilesPipeline

class AngelimgPipeline:
    """Pass-through item pipeline: every item is handed on unchanged."""

    def process_item(self, item, spider):
        """Return ``item`` exactly as received; ``spider`` is not used."""
        return item



from scrapy.http import Request
from scrapy.utils.python import to_bytes

class DealFilePathPipeline(FilesPipeline):
    """Files pipeline that stores each download in a per-gallery directory.

    Every download request carries its item in ``request.meta`` so that
    ``file_path`` can read the item's ``dir_name`` and place the file under
    ``full2/<dir_name>/``.
    """

    def get_media_requests(self, item, info):
        """Return one download request per URL in the item's file-URL field.

        Falsy entries (``None`` or empty strings) are skipped because
        ``Request`` cannot be built from them.
        """
        urls = item.get(self.files_urls_field, [])
        return [Request(url, meta={'item': item}) for url in urls if url]

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path, relative to the files store, for a download.

        Args:
            request: The download ``Request``; a plain URL string is still
                accepted for the deprecated ``file_key(url)`` call style.
            response: Unused; part of the pipeline interface.
            info: Unused; part of the pipeline interface.
            item: Optional item supplied by the caller as a keyword
                argument.  Used only when ``request.meta`` carries no item.

        Returns:
            ``full2/<dir_name>/<sha1(url)><ext>`` when the item has a
            non-empty ``dir_name``, otherwise ``full/<sha1(url)><ext>``.
        """
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            import warnings
            from scrapy.exceptions import ScrapyDeprecationWarning
            warnings.warn('FilesPipeline.file_key(url) method is deprecated, please use '
                          'file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from file_key with url as first argument
        if isinstance(request, Request):
            url = request.url
            meta_item = request.meta.get('item')
        else:
            _warn()
            url = request
            # A bare URL string has no meta to read the item from.
            meta_item = None

        # detect if file_key() method has been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        ## end of deprecation warning block

        # Prefer the item attached in get_media_requests; fall back to the
        # keyword argument, then to an empty mapping.
        source = meta_item if meta_item is not None else (item or {})
        dir_name = source.get('dir_name')

        media_guid = hashlib.sha1(to_bytes(url)).hexdigest()
        media_ext = os.path.splitext(url)[1]
        if dir_name:
            return 'full2/{}/{}{}'.format(dir_name, media_guid, media_ext)
        # No gallery name available: use the flat default layout.
        return 'full/%s%s' % (media_guid, media_ext)

    # deprecated
    def file_key(self, url):
        """Deprecated URL-based entry point; delegates to ``file_path``."""
        return self.file_path(url)

    # Marker read by file_path to tell whether a subclass overrode file_key.
    file_key._base = True

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for angelImg project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

# Project name and the package in which spiders live / are generated.
BOT_NAME = 'angelImg'

SPIDER_MODULES = ['angelImg.spiders']
NEWSPIDER_MODULE = 'angelImg.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'angelImg (+http://www.yourdomain.com)'

# Obey robots.txt rules (turned off for this project)
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    # Entries from the stock project template, left disabled:
    # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
    # Default headers: a desktop Chrome User-Agent string and the site
    # root as Referer.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/79.0.3945.88 Safari/537.36"
    ),
    "Referer": "http://angelimg.spbeen.com/",
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'angelImg.middlewares.AngelimgSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'angelImg.middlewares.AngelimgDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Only the custom files pipeline is enabled; the other entries are kept
# commented out for reference.
ITEM_PIPELINES = {
    # 'angelImg.pipelines.AngelimgPipeline': 300,
    'angelImg.pipelines.DealFilePathPipeline': 200,
    # 'scrapy.pipelines.files.FilesPipeline': 2,
}

# Storage location used by the files pipeline.
# NOTE(review): 'file_doload' looks like a misspelling of 'file_download';
# kept as-is because it is the on-disk path.
FILES_STORE = 'file_doload'

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

以上是关于python文件管道下载图集的主要内容，如果未能解决你的问题，请参考以下文章

python文件管道 下载图集

python文件管道下载图集