from scrapy.loader import ItemLoader #继在jobbole中导入模块
# Populate a JobBoleArticleItem through an ItemLoader so that per-field
# cleaning lives in the item's input/output processors, not in the spider.
# JobBoleArticleItem must match the class name declared in items.py.
item_loader = ItemLoader(item=JobBoleArticleItem(), response=response)
item_loader.add_css("title", ".entry-header h1::text")
item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
# add_value() takes the value itself; passing the quoted expression would
# store the literal text "response.url" instead of the page URL.
item_loader.add_value("url", response.url)
# NOTE(review): front_image_url and get_md5 are assumed to be defined
# earlier in the spider (not visible in this snippet) -- confirm.
item_loader.add_value("front_image_url", front_image_url)
item_loader.add_value("front_image_path", get_md5(response.url))
# XPath string literals need plain ASCII quotes.
item_loader.add_xpath("praise_nums", "//span[contains(@class,'vote-post-up')]/h10/text()")
item_loader.add_xpath("comment_nums", "//a[@href='#article-comment']/span/text()")
item_loader.add_xpath("fav_nums", "//span[contains(@class,'bookmark-btn')]/text()")
item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
# "div.entry" is a CSS selector, so it must go through add_css(); as an
# XPath expression it would not select the <div class="entry"> element.
item_loader.add_css("content", "div.entry")
# Run every collected value through the field processors and build the item.
article_item = item_loader.load_item()
其中同时使用了css和xpath两种提取方法,二者可以根据需要任意切换。
from scrapy.loader.processors import MapCompose,TakeFirst,Join#在items.py中导入模块,其中类MapCompos可以传递任意多的函数
def add_jobbole(value):
    """Return *value* with the "-jobbole" suffix appended.

    Meant to be used as a MapCompose step, so it receives one extracted
    string at a time.
    """
    return value + "-jobbole"
def date_convert(value):
    """Parse a "YYYY/MM/DD" string into a ``datetime.date``.

    Args:
        value: The extracted date text; surrounding whitespace is ignored.

    Returns:
        The parsed date, or today's date when *value* is not a string or
        does not match the expected format.
    """
    try:
        # Parse the argument itself (not a fixed literal); strip first
        # because strptime rejects leading/trailing whitespace.
        return datetime.datetime.strptime(value.strip(), "%Y/%m/%d").date()
    except (AttributeError, TypeError, ValueError):
        # Fall back to a plain date so both paths return the same type.
        return datetime.datetime.now().date()
def get_nums(value):
    """Return the first integer found in *value*, or 0 if there is none.

    Args:
        value: Text such as " 12 comments" extracted from the page.

    Returns:
        The first run of digits converted to ``int``; 0 when *value*
        contains no digit.
    """
    # Capture the whole run of digits: a single ``\d`` would turn "12"
    # into 1. re.search also finds digits that follow a newline.
    match_re = re.search(r"\d+", value)
    if match_re:
        return int(match_re.group(0))
    return 0
def remove_comment_tags(value):
    """Blank out tag entries that are really the comment-count link.

    Args:
        value: One extracted tag string.

    Returns:
        An empty string when *value* contains "评论" (the comment link
        that is picked up together with the tags), otherwise *value*
        unchanged.
    """
    if "评论" in value:
        return ""
    return value
def return_value(value):
    """Return *value* unchanged.

    Identity function, usable as a processor step for a field whose
    values should pass through unmodified.
    """
    return value
class JobBoleArticleItem(scrapy.Item):
title=scrapy.Field(
#input_processor=MapCompose(add_jobbole)#进行预处理
input_processor= MapCompose(lambda x:x+"-jobbole",add_jobbole)
)
create_date=scrapy.Field(
input_processor= MapCompose(date_convert),
output_processor=TakeFirst() #取第一个
)
url=scrapy.Field()
url_object_id=scrapy.Field()
front_image_url=scrapy.Field(
output_processor=MapCompose(return_value)
)
front_image_path=scrapy.Field()
praise_nums=scrapy.Field(
input_processor=MapCompose(get_nums)
)
comment_nums=scrapy.Field(
input_processor=MapCompose(get_nums)
)
fav_nums=scrapy.Field(
input_processor=MapCompose(get_nums)
)
tags=scrapy.Field(
intput_processor=MapCompose(remove_comment_tags),
output_processor=Join(",")