scrapy爬虫部分
Posted marier
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了scrapy爬虫部分相关的知识,希望对你有一定的参考价值。
items.py部分
import scrapy
class App01Item(scrapy.Item):
    """Container for one punishment-notice record scraped by App01Spider."""

    original_url = scrapy.Field()     # detail-page URL the record came from
    management_info = scrapy.Field()  # notice title (from the list page link)
    com_name = scrapy.Field()         # company name from the detail table
    punish_num = scrapy.Field()       # punishment document number
    mana_results = scrapy.Field()     # decision / result text
    law_depart = scrapy.Field()       # enforcing law department
    get_date = scrapy.Field()         # publication date from the list page
app_01.py项目部分
# -*- coding: utf-8 -*-
import scrapy
import requests
from pyquery import PyQuery
from app01.items import *
class App01Spider(scrapy.Spider):
    """Crawl punishment notices from gtghj.wuhan.gov.cn.

    ``parse`` reads the pager to learn the total page count, walks every
    list page, and schedules one request per detail link; ``parse_info``
    extracts the detail-table fields and yields an ``App01Item``.
    """

    name = 'app_01'
    # allowed_domains must contain bare domain names, not URLs — a full URL
    # here makes the offsite middleware drop every scheduled request.
    allowed_domains = ['gtghj.wuhan.gov.cn']
    start_urls = ['http://gtghj.wuhan.gov.cn/pt-2256-7-1.html']

    def parse(self, response):
        """Discover all list pages and schedule a detail request per notice."""
        s = PyQuery(response.text)
        # Pager text looks like "current/total"; take the total page count.
        page_text = s('#info > div > strong').text()
        total_pages = int(page_text.split('/')[1])
        for i in range(total_pages):
            url_page = 'http://gtghj.wuhan.gov.cn/pt-2256-7-{}.html'.format(i + 1)
            # NOTE(review): requests.get is synchronous and blocks Scrapy's
            # reactor; kept to preserve the original flow, but yielding a
            # scrapy.Request per list page would be the idiomatic fix.
            original_r = requests.get(url_page)
            original_r.encoding = 'gbk'  # site pages are GBK-encoded
            original_s = PyQuery(original_r.text)
            original_urls = original_s('#info > ul > li > a').items()
            get_dates = original_s('#info > ul > li > span').items()
            for link, date_el in zip(original_urls, get_dates):
                original_url = 'http://gtghj.wuhan.gov.cn{}'.format(link.attr('href'))
                management_info = link.attr('title')
                get_date = date_el.text()
                # dont_filter=True keeps the dupe filter from dropping detail
                # pages, so parse_info is always invoked.
                yield scrapy.Request(
                    url=original_url,
                    callback=self.parse_info,
                    dont_filter=True,
                    meta={
                        'management_info': management_info,
                        'get_date': get_date,
                        'original_url': original_url,
                    },
                )

    def parse_info(self, response):
        """Parse one detail page and yield a populated App01Item."""
        item = App01Item()
        detail_s = PyQuery(response.text)
        item['com_name'] = detail_s('#show > table:nth-child(2) > tr > td > div > table > tr:nth-child(2) > td:nth-child(2)').text()
        item['mana_results'] = detail_s('#show > table:nth-child(2) > tr > td > div > table> tr:nth-child(8) > td:nth-child(2)').text()
        item['punish_num'] = detail_s('#show > table:nth-child(2) > tr > td > div > table > tr:nth-child(4) > td:nth-child(2)').text()
        item['law_depart'] = detail_s('#show > table:nth-child(2) > tr > td > div > table> tr:nth-child(9) > td:nth-child(2)').text()
        item['management_info'] = response.meta['management_info']
        item['get_date'] = response.meta['get_date']
        item['original_url'] = response.meta['original_url']
        # yield (not print) so the item actually reaches pipelines/exporters.
        yield item
main.py部分
from scrapy.cmdline import execute

if __name__ == '__main__':
    # Equivalent to running "scrapy crawl app_01" from the project root.
    execute(['scrapy', 'crawl', 'app_01'])
以上是关于scrapy爬虫部分的主要内容,如果未能解决你的问题,请参考以下文章