Scrapy: scraping the Inner Mongolia Autonomous Region Environmental Protection Department (内蒙古自治区环境保护厅) website
Posted by mayunji
Preface: this article, compiled by the editors at cha138.com, walks through a Scrapy project that scrapes project-acceptance listings from the Inner Mongolia Autonomous Region Environmental Protection Department website; hopefully it serves as a useful reference.
Main spider:
# -*- coding: utf-8 -*-
import re
from urllib.parse import urljoin

import scrapy

from nmgepb.items import NmgepbItem

# Three detail pages whose layout does not fit the table parsing below are skipped.
SKIP_URLS = (
    'http://www.nmgepb.gov.cn/ywgl/hjpj/xmslqk/201412/t20141230_1472451.html',
    'http://www.nmgepb.gov.cn/ywgl/hjpj/xmslqk/201412/t20141230_1472450.html',
    'http://www.nmgepb.gov.cn/ywgl/hjpj/xmslqk/201412/t20141230_1472443.html',
)


class BasicNmgepbSpider(scrapy.Spider):
    name = 'basic_nmgepb'
    allowed_domains = ['nmgepb.gov.cn']
    start_urls = ['http://nmgepb.gov.cn/']

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.countNum = 1  # running counter for the console output
        self.startLink = 'http://www.nmgepb.gov.cn/ywgl/hjpj/xmslqk/index.html'

    def start_requests(self):
        yield scrapy.Request(url=self.startLink, dont_filter=True, callback=self.link_parse)

    def customXpathParse(self, value):
        # Join an extract() list into one stripped string.
        return ''.join(value).strip()

    def customReParse(self, condition, index=1):
        # Return the captured group of a re.search() result, or '' if there was no match.
        if condition:
            return condition.group(index).strip()
        return ''

    def checkItem(self, item):
        # Guard shared by every branch below: skip empty titles and table-header
        # rows ('编号' / '项目名称'), log the hit, and advance the counter.
        if item['title'] and item['title'] != '编号' and item['title'] != '项目名称':
            print('{0:>3}:\n\t项目链接:{1}\n\t项目名称:{2}\n\n'.format(
                self.countNum, item['link'], item['title']))
            self.countNum += 1
            return True
        return False

    def link_parse(self, response):
        if len(response.text) < 1000:
            # The response looks truncated; request the same URL again.
            yield scrapy.Request(url=response.url, dont_filter=True, callback=self.link_parse)
        else:
            # Detail-page links on the list page.
            allLinks = response.xpath('/html/body/div[3]/div/div[3]/div[2]/ul/li/span[2]/a/@href').extract()
            for link in allLinks:
                link = urljoin(response.url, link)
                yield scrapy.Request(url=link, callback=self.info_parse)
            if response.url == self.startLink:
                # Follow the paginated list pages index_1.html .. index_5.html.
                for pageNum in range(1, 6):
                    link = '{0}_{1}.html'.format(self.startLink.split('.html')[0], pageNum)
                    yield scrapy.Request(url=link, callback=self.link_parse)

    def info_parse(self, response):
        if response.url in SKIP_URLS:
            return
        item = NmgepbItem()
        trData = response.xpath('//table//tr')
        tableClass = self.customXpathParse(response.xpath('//table/@class').extract())
        if trData:
            # The detail page contains a table; the column layout varies between
            # pages, so branch on the number of <td> cells in each row.
            for data in trData:
                tdNum = len(data.xpath('./td'))
                firstTd = self.customXpathParse(data.xpath('./td[1]//text()').extract())
                lastTd = self.customXpathParse(data.xpath('./td[6]//text()').extract())
                if tdNum == 3:
                    if tableClass == 'MsoTableGrid':
                        item['link'] = response.url
                        item['title'] = self.customXpathParse(data.xpath('./td[2]//text()').extract())
                        item['place'] = self.customXpathParse(data.xpath('./td[3]//text()').extract())
                        item['company'] = ''
                        item['mechanism'] = ''
                        item['date'] = ''
                        if self.checkItem(item):
                            yield item
                    elif tableClass == 'FCK__ShowTableBorders':
                        item['link'] = response.url
                        item['title'] = self.customXpathParse(data.xpath('./td[3]//text()').extract())
                        item['place'] = ''
                        item['company'] = ''
                        item['mechanism'] = ''
                        item['date'] = self.customXpathParse(data.xpath('./td[2]//text()').extract())
                        if self.checkItem(item):
                            yield item
                elif tdNum == 6 and lastTd:
                    item['link'] = response.url
                    item['title'] = self.customXpathParse(data.xpath('./td[2]//text()').extract())
                    item['place'] = self.customXpathParse(data.xpath('./td[3]//text()').extract())
                    item['company'] = self.customXpathParse(data.xpath('./td[4]//text()').extract())
                    item['mechanism'] = self.customXpathParse(data.xpath('./td[5]//text()').extract())
                    item['date'] = self.customXpathParse(data.xpath('./td[6]//text()').extract())
                    if self.checkItem(item):
                        yield item
                elif (tdNum == 5 or tdNum == 6) and not lastTd:
                    if firstTd.isdigit():
                        # First cell is a row number, so the fields start at td[2].
                        item['link'] = response.url
                        item['title'] = self.customXpathParse(data.xpath('./td[2]//text()').extract())
                        item['place'] = self.customXpathParse(data.xpath('./td[3]//text()').extract())
                        item['company'] = self.customXpathParse(data.xpath('./td[4]//text()').extract())
                        item['mechanism'] = ''
                        item['date'] = self.customXpathParse(data.xpath('./td[5]//text()').extract())
                    else:
                        # No row-number column; the fields start at td[1].
                        item['link'] = response.url
                        item['title'] = self.customXpathParse(data.xpath('./td[1]//text()').extract())
                        item['place'] = self.customXpathParse(data.xpath('./td[2]//text()').extract())
                        item['company'] = self.customXpathParse(data.xpath('./td[3]//text()').extract())
                        item['mechanism'] = self.customXpathParse(data.xpath('./td[4]//text()').extract())
                        item['date'] = self.customXpathParse(data.xpath('./td[5]//text()').extract())
                    if self.checkItem(item):
                        yield item
                elif tdNum == 7:
                    item['link'] = response.url
                    item['title'] = self.customXpathParse(data.xpath('./td[2]//text()').extract())
                    item['place'] = self.customXpathParse(data.xpath('./td[3]//text()').extract())
                    item['company'] = self.customXpathParse(data.xpath('./td[4]//text()').extract())
                    item['mechanism'] = ''
                    item['date'] = self.customXpathParse(data.xpath('./td[7]//text()').extract())
                    if self.checkItem(item):
                        yield item
                elif tdNum == 9:
                    item['link'] = response.url
                    item['title'] = self.customXpathParse(data.xpath('./td[2]//text()').extract())
                    item['place'] = self.customXpathParse(data.xpath('./td[3]//text()').extract())
                    item['company'] = self.customXpathParse(data.xpath('./td[4]//text()').extract())
                    item['mechanism'] = ''
                    item['date'] = self.customXpathParse(data.xpath('./td[9]//text()').extract())
                    if self.checkItem(item):
                        yield item
        else:
            # No table on the page: fall back to regular expressions over the raw HTML.
            item['link'] = response.url
            item['title'] = self.customReParse(re.search(r'<strong>项目名称:</strong>(.*?)<', response.text, re.I))
            item['place'] = self.customReParse(re.search(r'<strong>建设地点:</strong>(.*?)<', response.text, re.I))
            item['company'] = self.customReParse(re.search(r'<strong>建设单位:</strong>(.*?)<', response.text, re.I))
            item['mechanism'] = self.customReParse(re.search(r'<strong>环境影响评价机构:</strong>(.*?)<', response.text, re.I))
            item['date'] = self.customReParse(re.search(r'<strong>受理日期:</strong>(.*?)<', response.text, re.I))
            if self.checkItem(item):
                yield item
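With the project files below in place, the spider can be started with scrapy crawl basic_nmgepb from the directory containing scrapy.cfg. For launching it from a script instead, here is a minimal sketch; it assumes the standard layout created by scrapy startproject nmgepb, and the spider's module path in the import is an assumption that may need to match your actual file name.

# run.py -- minimal sketch for launching the spider from a script
# (equivalent to: scrapy crawl basic_nmgepb)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Assumed module path; adjust to wherever the spider file actually lives.
from nmgepb.spiders.basic_nmgepb import BasicNmgepbSpider

if __name__ == '__main__':
    # get_project_settings() picks up settings.py, including ITEM_PIPELINES.
    process = CrawlerProcess(get_project_settings())
    process.crawl(BasicNmgepbSpider)
    process.start()  # blocks until the crawl finishes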
items:
import scrapy


class NmgepbItem(scrapy.Item):
    # One accepted EIA project record.
    link = scrapy.Field()       # detail-page URL
    title = scrapy.Field()      # 项目名称
    place = scrapy.Field()      # 建设地点
    company = scrapy.Field()    # 建设单位
    mechanism = scrapy.Field()  # 环境影响评价机构
    date = scrapy.Field()       # 受理日期
middlewares (the unmodified template generated by scrapy startproject):
from scrapy import signals


class NmgepbSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class NmgepbDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
pipelines:
import csv
import os


class NmgepbPipeline(object):
    def __init__(self):
        # Write nmgepb.csv next to scrapy.cfg (one level above the package directory).
        self.csvFilePath = os.path.join(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'nmgepb.csv')
        # gb18030 keeps the Chinese headers readable when the CSV is opened in
        # Excel on a Chinese-locale Windows machine.
        self.csvFile = open(self.csvFilePath, 'w', encoding='gb18030', newline='')
        self.csvWrite = csv.writer(self.csvFile)
        self.csvWrite.writerow(['页面链接', '项目名称', '建设地点', '建设单位', '评价机构', '受理日期'])

    def process_item(self, item, spider):
        self.csvWrite.writerow([
            item.get('link'), item.get('title'), item.get('place'),
            item.get('company'), item.get('mechanism'), item.get('date'),
        ])
        return item

    def close_spider(self, spider):
        self.csvFile.close()
        print("恭喜, 数据采集完成, 存储路径:%s" % self.csvFilePath)
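As a quick sanity check after a run, the exported file can be read back with the same gb18030 encoding the pipeline wrote it with. This is just a sketch (not part of the original project) and assumes nmgepb.csv ended up in the current directory, i.e. the project root computed above.

# check_csv.py -- read the exported CSV back and report what was collected.
import csv

with open('nmgepb.csv', encoding='gb18030', newline='') as f:
    reader = csv.reader(f)
    header = next(reader)   # 页面链接, 项目名称, 建设地点, 建设单位, 评价机构, 受理日期
    rows = list(reader)

print('columns:', header)
print('rows collected:', len(rows))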
settings (add the following):
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0',
}

ITEM_PIPELINES = {
    'nmgepb.pipelines.NmgepbPipeline': 300,
}