scrapy的安装
环境:python3.6
1
pip install -i https://pypi.douban.com/simple/ scrapy
2
scrapy startproject ArticleSpider
main.py 是后面创建、用来运行 scrapy 的文件
3
cd ArticleSpider
scrapy genspider jobbole blog.jobbole.com
------- ----------------
spider名 网站域名
4
在ArticleSpider创建main.py,可以通过此文件运行scrapy
# main.py — launch the "jobbole" spider from inside an IDE,
# equivalent to running `scrapy crawl jobbole` on the command line.
import os
import sys

from scrapy.cmdline import execute

# Put the project directory on sys.path so Scrapy can locate the
# ArticleSpider project settings — required when run from an IDE.
project_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(project_dir)

execute(["scrapy", "crawl", "jobbole"])
5
以上运行可能在win下会报错
6
settings.py
准备工作完
1
在jobbole.py下
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
import re
from urllib import parse
class JobboleSpider(scrapy.Spider):
name = ‘jobbole‘
allowed_domains = [‘blog.jobbole.com‘]
start_urls = [‘http://blog.jobbole.com/all-posts/‘]
def parse(self, response):
    """Parse a listing page: yield one Request per article detail page,
    then a Request for the next listing page (if any).

    Fixes vs. the pasted notes: mojibake curly quotes (`‘…‘`) replaced with
    real quotes (the snippet was a syntax error), indentation restored, and
    a leftover debug print removed.
    """
    # Each item card on the listing page links to an article detail page.
    post_urls = response.css('#archive .floated-thumb .post-thumb a::attr(href)').extract()
    for post_url in post_urls:
        # urljoin handles both relative and absolute hrefs; hand the
        # detail-page URL to parse_info for field extraction.
        yield Request(url=parse.urljoin(response.url, post_url), callback=self.parse_info)
    # Follow pagination: re-enter this method until no "next" link exists.
    next_url = response.css('.next.page-numbers::attr(href)').extract_first()
    if next_url:
        yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
"""获取详情页的信息"""
def parse_info(self, response):
# 以下都是获取详情页信息
res_title = response.xpath(‘//div[@class="entry-header"]/h1/text()‘).extract_first()
res_date = response.xpath(‘//p[@class="entry-meta-hide-on-mobile"]/text()‘).extract_first().strip().replace(‘·‘, ‘‘).strip()
res_zhan = response.xpath(‘//span[contains(@class, "vote-post-up")]/h10/text()‘).extract_first()
res_content = response.xpath(‘//div[@class="entry"]/p/text()‘).extract_first()
res_cate_a = response.xpath(‘//p[@class="entry-meta-hide-on-mobile"]/a/text()‘).extract_first()
res_cate_b = [i.strip() for i in res_cate_a if not i.strip().endswith(‘评论‘)]
res_cate_c = ‘,‘.join(res_cate_b)
res_shoucang = response.xpath(‘//div[@class="post-adds"]/span[2]/text()‘).extract_first().strip()
match_obj1 = re.match(‘.*(\d+).*‘, res_shoucang)
if match_obj1:
res_shoucang = match_obj1.group(1)
else:
res_shoucang = 0
res_comment = response.xpath(‘//div[@class="post-adds"]/a/span/text()‘).extract_first().strip()
match_obj2 = re.match(‘.*(\d+).*‘, res_comment)
if match_obj2:
res_comment = match_obj2.group(1)
else:
res_comment = 0