用scrapy爬取搜狗Lofter图片
# -*- coding: utf-8 -*-
import json
import scrapy
from scrapy.http import Request
from urllib import parse
from scrapy.loader import ItemLoader
from tutorial.items import LofterSpiderItem
class LofterSpider(scrapy.Spider):
    """Collect picture URLs from the LOFTER category of Sogou Images.

    The site home page is fetched first (``start_urls``); ``parse`` then
    switches to the paged JSON endpoint and ``parse_url`` walks it page by
    page, yielding one ``LofterSpiderItem`` per picture.
    """

    name = "lofter"
    allowed_domains = ["pic.sogou.com"]
    start_urls = ['http://pic.sogou.com/']

    # Paged JSON endpoint for the LOFTER category; ``{0}`` receives the offset
    # of the first entry of the requested page.
    start_answer_url = "http://pic.sogou.com/pics/channel/getAllRecomPicByTag.jsp?category=LOFTER&tag=%E5%85%A8%E9%83%A8&start={0}&len=15"
    # Number of entries per page; must match the ``len=`` query parameter in
    # ``start_answer_url``.
    page_size = 15
    headers = {
        "HOST": "pic.sogou.com",
        "Referer": "http://pic.sogou.com",
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0"
    }

    def parse(self, response):
        """Ignore the home page body and request the first JSON page."""
        yield scrapy.Request(
            self.start_answer_url.format(0),
            headers=self.headers,
            callback=self.parse_url,
        )

    def parse_url(self, response):
        """Yield one item per picture on this page, then request the next page.

        Pagination stops as soon as a page carries no ``all_items`` entries;
        without that check the spider would keep requesting ever-larger
        offsets indefinitely.
        """
        page = json.loads(response.text)
        pictures = page.get('all_items') or []
        if not pictures:
            self.logger.info("No more pictures at %s; stopping.", response.url)
            return

        for picture in pictures:
            image_url = picture.get('ori_pic_url')
            if not image_url:
                # An entry without a URL must not abort the rest of the page.
                continue
            item_loader = ItemLoader(item=LofterSpiderItem(), response=response)
            item_loader.add_value("lofter_image_url", image_url)
            yield item_loader.load_item()

        # Issued once per page (after the loop), not once per picture.
        next_start = page['startIndex'] + self.page_size
        yield scrapy.Request(
            self.start_answer_url.format(next_start),
            headers=self.headers,
            callback=self.parse_url,
        )
settings.py
import os

# Enabled item pipelines. Scrapy runs them in ascending order of the integer
# value, so the image pipeline (1) runs before the general pipeline (300).
ITEM_PIPELINES = {
    'tutorial.pipelines.TutorialPipeline': 300,
    'tutorial.pipelines.TutorialImagePipeline': 1,
}
# Left disabled as in the original. NOTE(review): the item field is named
# ``lofter_image_url``, so presumably TutorialImagePipeline reads that field
# itself -- confirm before enabling this setting.
# IMAGES_URLS_FIELD = "front_image_url"

# Downloaded images are stored in an ``image`` directory next to this file.
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, 'image')
items.py
class LofterSpiderItem(scrapy.Item):
    """Item carrying the picture URL(s) scraped by the ``lofter`` spider."""

    # The output processor is MapCompose(return_value).
    # NOTE(review): ``return_value`` is defined outside this snippet;
    # presumably it is an identity helper used so the loaded value stays a
    # list of URLs rather than a single string -- confirm against its
    # definition.
    lofter_image_url = scrapy.Field(
        output_processor=MapCompose(return_value)
    )