Python爬虫 —— 抓取美女图片(Scrapy篇)
Posted by h_z_cong
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Python爬虫 —— 抓取美女图片(Scrapy篇)相关的知识,希望对你有一定的参考价值。
杂谈:
之前用requests模块爬取了美女图片,今天用scrapy框架实现了一遍。
(图片尺度确实大了点,但老衲早已无恋红尘,权当观赏哈哈哈)
Item:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class GirlpicItem(scrapy.Item):
    """Container for one scraped image and the metadata needed to store it."""

    # Text of the gallery link the image was reached from.
    title = scrapy.Field()
    # Raw body of the image response.
    image = scrapy.Field()
    # Page number of the image inside its gallery.
    index = scrapy.Field()
Spider:
# coding: utf-8
"""Spider that walks the mzitu.com archive and yields one item per image."""
from scrapy.spiders import Spider
from scrapy.http import Request
from scrapy.selector import Selector
from girlpic.items import GirlpicItem
import scrapy
import sys

if sys.version_info[0] == 2:
    # Python 2 only: switch the implicit str<->unicode codec to UTF-8.
    # `reload` is not a builtin and `setdefaultencoding` does not exist on
    # Python 3, so running this unguarded fails at import time there.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')


class GirlpicSipder(Spider):
    """Crawl archive page -> gallery -> gallery page -> image file.

    Each callback forwards only the keys the next stage needs
    (``title``, ``groupUrl``, ``index``) through ``Request.meta``.
    """

    name = 'girlpic'
    allowed_domains = []  # allowed domains (left empty in this spider)
    start_urls = ["http://www.mzitu.com/all/"]

    # Number of gallery links taken from the archive page; ``None`` takes all.
    max_groups = 5

    def parse(self, response):
        """Yield one request per gallery link found on the archive page.

        Only the first ``max_groups`` links are considered. Links without
        an ``href`` or without link text are skipped with a warning.
        """
        groups = response.xpath(
            "//div[@class='main-content']//ul[@class='archives']//a")
        # Slicing bounds the crawl without terminating the process.
        for group in groups[:self.max_groups]:
            groupUrl = group.xpath('@href').extract_first()
            title = group.xpath("text()").extract_first()
            if not groupUrl or not title:
                self.logger.warning(
                    "Skipping archive link without href/title on %s",
                    response.url)
                continue
            yield scrapy.Request(
                url=groupUrl,
                callback=self.getGroup,
                meta={'title': title, 'groupUrl': groupUrl},
                dont_filter=True)

    def getGroup(self, response):
        """Yield one request per page of the gallery in ``response``.

        The page count is read from the second-to-last ``<span>`` of the
        ``pagenavi`` block (presumably the last page number, followed by a
        "next" link -- verify against the live markup). If it is missing or
        not an integer, nothing is yielded.
        """
        labels = response.xpath(
            "//div[@class='pagenavi']//span/text()").extract()
        try:
            maxIndex = int(labels[-2])
        except (IndexError, ValueError):
            self.logger.warning(
                "Cannot determine page count for %s", response.url)
            return

        title = response.meta['title']
        groupUrl = response.meta['groupUrl']
        # Avoid a double slash when the gallery URL already ends with '/'.
        baseUrl = groupUrl.rstrip('/')
        for index in range(1, maxIndex + 1):
            # Build a fresh dict per request: forwarding response.meta would
            # also forward Scrapy's own bookkeeping keys (e.g. retry_times)
            # and mutate the dict shared with the current response.
            meta = {'title': title, 'groupUrl': groupUrl, 'index': index}
            yield scrapy.Request(
                url=baseUrl + '/' + str(index),
                callback=self.getPage,
                meta=meta,
                dont_filter=True)

    def getPage(self, response):
        """Yield a request for the image shown on a gallery page."""
        # Get the image URL.
        imageurl = response.xpath(
            "//div[@class='main-image']//img/@src").extract_first()
        if not imageurl:
            self.logger.warning("No main image found on %s", response.url)
            return
        meta = {
            'title': response.meta['title'],
            'groupUrl': response.meta['groupUrl'],
            'index': response.meta['index'],
        }
        yield scrapy.Request(
            url=response.urljoin(imageurl),  # no-op for absolute URLs
            callback=self.FormItem,
            meta=meta,
            dont_filter=True)

    def FormItem(self, response):
        """Wrap the downloaded image bytes and their metadata in an item."""
        yield GirlpicItem(
            title=response.meta['title'],
            index=response.meta['index'],
            image=response.body)
Pipeline:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import os
import codecs
import re
import sys

if sys.version_info[0] == 2:
    # Python 2 only: switch the implicit str<->unicode codec to UTF-8.
    # `reload` is not a builtin and `setdefaultencoding` does not exist on
    # Python 3, so running this unguarded fails at import time there.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')

# Path separators plus the characters Windows rejects in file names.
_UNSAFE_NAME_CHARS = re.compile(u'[\\\\/:*?"<>|]')


def _safe_dirname(title):
    """Return ``title`` reduced to a single, safe directory name."""
    # Stripping dots/spaces also neutralises '.' and '..' path components.
    cleaned = _UNSAFE_NAME_CHARS.sub(u'_', title).strip(u' .')
    return cleaned or u'untitled'


def _ensure_dir(path):
    """Create ``path`` (and parents) unless it already is a directory."""
    try:
        os.makedirs(path)
    except OSError:
        # Tolerate "already exists"; re-raise anything else.
        if not os.path.isdir(path):
            raise


class GirlpicPipeline(object):
    """Write each item's image bytes to ``<dirpath>/<title>/<index>.jpg``."""

    # NOTE(review): there is no separator after the drive letter, so on
    # Windows this is a path relative to the current directory of drive D:.
    # Confirm whether u'D:\\学习资料' was intended.
    DEFAULT_DIRPATH = u'D:学习资料'

    def __init__(self, dirpath=None):
        """Prepare the output root.

        Args:
            dirpath: Directory under which gallery folders are created.
                Defaults to ``DEFAULT_DIRPATH`` when omitted.
        """
        self.dirpath = self.DEFAULT_DIRPATH if dirpath is None else dirpath
        _ensure_dir(self.dirpath)

    def process_item(self, item, spider):
        """Save ``item['image']`` to disk and return the item unchanged.

        Args:
            item: Mapping with ``title`` (text), ``index`` and ``image``
                (bytes) keys.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object.
        """
        # The title comes from a scraped page, so it is sanitised before
        # being used as a directory name.
        groupdir = os.path.join(self.dirpath, _safe_dirname(item['title']))
        _ensure_dir(groupdir)
        imagepath = os.path.join(groupdir, str(item['index']) + u'.jpg')
        # The context manager closes the file even if the write fails.
        with open(imagepath, 'wb') as handle:
            handle.write(item['image'])
        return item
以上是关于Python爬虫 —— 抓取美女图片(Scrapy篇)的主要内容,如果未能解决你的问题,请参考以下文章
python爬虫之Scrapy框架,基本介绍使用以及用框架下载图片案例
4.python爬虫之新建 scrapy 爬虫项目(抓取和保存)