Scrapy crawler: matching every product URL with its images and writing them to Excel
Posted by 沧海一粒水
items.py — item definitions
import scrapy


class DangdangItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    comment = scrapy.Field()
    pic_addr = scrapy.Field()
dd.py — the spider and parsing logic
# -*- coding: utf-8 -*-
import scrapy
import re
from dangdang.items import DangdangItem
from scrapy.http import Request


class DdSpider(scrapy.Spider):
    name = 'dd'
    allowed_domains = ['dangdang.com']
    start_urls = ['http://category.dangdang.com/pg1-cid4003844.html']

    def parse(self, response):
        item = DangdangItem()
        item['title'] = response.xpath('//a[@name="itemlist-title"]/@title').extract()
        item['link'] = response.xpath('//a[@name="itemlist-title"]/@href').extract()
        item['comment'] = response.xpath('//a[@name="itemlist-review"]/text()').extract()
        item['pic_addr'] = response.xpath('//a/img/@data-original').extract()
        lst = item['pic_addr']
        lnk = item['link']
        # Group image URLs that share the same 5-digit product id
        i = 0
        lst1 = []
        while i < len(lst):
            pat = 'http://.*?/.*?/.*?/([0-9]{5})'
            data = re.compile(pat).findall(lst[i])
            k = 0
            j = 0
            ll = []
            while j < len(lst):
                data1 = re.compile(pat).findall(lst[j])
                if data == data1:
                    ll.append(lst[j])
                    k += 1
                    if k > 1:
                        # keep only the first occurrence of each id in lst
                        lst.pop(j)
                        j = j - 1
                j += 1
            lst1.append(ll)
            i += 1
        # Match each product link to its image group by id,
        # so links and images line up row for row when written out
        lst = []
        for m in range(0, len(lnk)):
            pat1 = 'http://.*?/([0-9]{5})'
            d = re.compile(pat1).findall(lnk[m])
            for n in range(0, len(lst1)):
                pat2 = 'http://.*?/.*?/.*?/([0-9]{5})'
                d1 = re.compile(pat2).findall(lst1[n][0])
                if d == d1:
                    lst.append(lst1[n])
                    break
        item['pic_addr'] = lst
        yield item
        # Queue the remaining category pages; Scrapy's duplicate filter
        # stops the same page from being fetched twice
        for page in range(2, 81):
            url = 'http://category.dangdang.com/pg' + str(page) + '-cid4003844.html'
            yield Request(url, callback=self.parse)
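Both passes above key on the 5-digit product id that the regular expressions pull out of the URLs: an image belongs to a product exactly when both URLs yield the same id. A minimal standalone sketch of that idea, using hypothetical URLs shaped the way pat1 and pat2 expect (real Dangdang URLs may differ):

import re

# Hypothetical URLs, shaped the way the two patterns expect
link = 'http://product.dangdang.com/23456.html'
pics = ['http://img.dangdang.com/a/b/23456-1.jpg',
        'http://img.dangdang.com/a/b/23456-2.jpg',
        'http://img.dangdang.com/a/b/67890-1.jpg']

link_id = re.findall('http://.*?/([0-9]{5})', link)
# Keep the images whose extracted id equals the link's id
matches = [p for p in pics
           if re.findall('http://.*?/.*?/.*?/([0-9]{5})', p) == link_id]
print(matches)  # the two 23456 images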
Create the Excel file (a one-off script; run it before the crawl so the header row exists for the pipeline to append to)
import xlwt

book = xlwt.Workbook(encoding="utf-8")
sht = book.add_sheet("publisher", cell_overwrite_ok=True)
# Header row: 序号 = index, 标题 = title, 链接 = link, 评论 = comments,
# 图片链接1-5 = image links 1-5
sht.write(0, 0, u'序号')
sht.write(0, 1, u'标题')
sht.write(0, 2, u'链接')
sht.write(0, 3, u'评论')
sht.write(0, 4, u'图片链接1')
sht.write(0, 5, u'图片链接2')
sht.write(0, 6, u'图片链接3')
sht.write(0, 7, u'图片链接4')
sht.write(0, 8, u'图片链接5')
book.save("d:\\data\\dangdang\\dangdang.xls")
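To confirm the workbook came out as intended before starting the crawl, a quick sanity check with xlrd (the same library the pipeline uses) can read the header row back:

import xlrd

rb = xlrd.open_workbook('d:\\data\\dangdang\\dangdang.xls')
sht = rb.sheets()[0]
print(sht.nrows)          # 1: only the header row exists so far
print(sht.row_values(0))  # the nine column headers written above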
pipelines.py — writing the data
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import xlrd
from xlutils.copy import copy


class DangdangPipeline(object):
    def process_item(self, item, spider):
        # xlrd/xlwt cannot append in place, so read the existing file,
        # copy it with xlutils, write the new rows, and save it back
        rb = xlrd.open_workbook('d:\\data\\dangdang\\dangdang.xls')
        sht = rb.sheets()[0]
        nrows = sht.nrows
        wb = copy(rb)
        ws = wb.get_sheet(0)
        for i in range(0, len(item['title'])):
            title = item['title'][i]
            link = item['link'][i]
            comment = item['comment'][i]
            pic_addr = item['pic_addr'][i]
            try:
                ws.write(nrows, 0, nrows)  # running row number as the index
                ws.write(nrows, 1, title)
                ws.write(nrows, 2, link)
                ws.write(nrows, 3, comment)
                for j in range(0, len(pic_addr)):
                    ws.write(nrows, 4 + j, pic_addr[j])
                nrows += 1
            except Exception as err:
                print(err)
        wb.save('d:\\data\\dangdang\\dangdang.xls')
        return item
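As the generated comment notes, the pipeline only runs once it is registered in the project's settings.py. A sketch of that entry, assuming the default project layout shown by the imports (the priority 300 is just the conventional middle value):

# settings.py
ITEM_PIPELINES = {
    'dangdang.pipelines.DangdangPipeline': 300,
}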