scrapy 初探(css)
Posted 小溪彼岸
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了scrapy 初探(css)相关的知识,希望对你有一定的参考价值。
初始化项目及项目配置同scrapy 初探(xpath)一样
新建zwblog/spiders/lianjia_spider.py
内容如下:
# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from LjCrwaler.items import LianjiaItem
class LJCrwalerSpider(CrawlSpider):
    """Crawl second-hand housing listings on qd.lianjia.com.

    The spider starts from the listing index, follows pagination links and
    listing-detail links, and parses every detail page with
    :meth:`process_item`.
    """

    name = 'ljcrwaler'
    allowed_domains = ['lianjia.com']
    start_urls = ['https://qd.lianjia.com/ershoufang/']

    # Crawl rules. ``rules`` must be an iterable of Rule objects.
    rules = (
        # Links to the listing detail pages; each one is parsed by
        # process_item.
        Rule(
            LinkExtractor(
                restrict_xpaths="//ul[@class='sellListContent']/li/div[@class='info clear']/div[@class='title']/a"),
            follow=True,
            callback="process_item",
        ),
        # Pagination links: followed only, no callback.
        Rule(
            LinkExtractor(restrict_xpaths="//div[@class='pagination_group_a']/a"),
            follow=True,
        ),
    )

    def process_item(self, response):
        """Extract the key fields of one listing detail page.

        Args:
            response: The Scrapy response for a listing detail page.

        Yields:
            A ``LianjiaItem`` populated with the extracted fields. A field
            whose selector matches nothing is ``None`` (or an empty list for
            the multi-valued ``region`` and ``image_urls`` fields).
        """
        def first(query):
            # First match of a CSS query, or None when nothing matches.
            return response.css(query).extract_first()

        # Headline / price information.
        title = first('title::text')
        price = first('div.overview div.content > div.price > span.total::text')
        unit_price = first(
            'div.overview div.content > div.price span.unitPriceValue::text')
        community_name = first(
            'div.overview div.content > div.aroundInfo > div.communityName > a::text')
        region = response.css('div.areaName span.info a::text').extract()

        # Broker contact. The nested queries start with ".//" so they are
        # evaluated relative to the broker block; a leading "//" would
        # search the whole document and ignore the context node.
        broker = response.xpath('//*[@class="brokerInfo clear"]/div')
        linkman = broker.xpath('.//*[@class="brokerName"]/a/text()').extract_first()
        linktel = broker.xpath('.//*[@class="phone"]/text()').extract_first()

        # Basic attributes, addressed by the position of each <li>.
        house_type = first('#introduction div.base ul > li:first-child::text')
        floor = first('#introduction div.base ul > li:nth-child(2)::text')
        construction_area = first('#introduction div.base ul > li:nth-child(3)::text')
        actual_area = first('#introduction div.base ul > li:nth-child(5)::text')
        orientation = first('#introduction div.base ul > li:nth-child(7)::text')
        decoration = first('#introduction div.base ul > li:nth-child(9)::text')
        elevator = first('#introduction div.base ul > li:nth-child(12)::text')
        property_right = first('#introduction div.base ul > li:nth-child(13)::text')

        # Transaction attributes.
        release_date = first(
            '#introduction div.transaction ul > li:first-child span:nth-child(2)::text')
        purposes = first(
            '#introduction div.transaction ul > li:nth-child(4) span:nth-child(2)::text')
        house_years = first(
            '#introduction div.transaction li:nth-child(5) span:nth-child(2)::text')
        mortgage = first(
            '#introduction div.transaction li:nth-child(7) span:nth-child(2)::text')
        if mortgage is not None:
            # Guard the strip: the selector may match nothing.
            mortgage = mortgage.strip()

        image_urls = response.css('div.content-wrapper img::attr(src)').extract()

        # NOTE(review): the field names below are assumed to match the
        # fields declared on LianjiaItem (they mirror the original local
        # variable names) -- confirm against LjCrwaler/items.py.
        item = LianjiaItem()
        item['title'] = title
        item['price'] = price
        item['unit_price'] = unit_price
        item['community_name'] = community_name
        item['region'] = region
        item['linkman'] = linkman
        item['linktel'] = linktel
        item['type'] = house_type
        item['construction_area'] = construction_area
        item['actual_area'] = actual_area
        item['orientation'] = orientation
        item['decoration'] = decoration
        item['floor'] = floor
        item['elevator'] = elevator
        item['property'] = property_right
        item['house_years'] = house_years
        item['mortgage'] = mortgage
        item['purposes'] = purposes
        item['release_date'] = release_date
        item['image_urls'] = image_urls
        item['from_url'] = response.url
        yield item
CSS 选择器的用法,基本上就是根据标签的 class/id 以及层级关系一层层筛选,从而定位并获取数据。
`title::text` 表示取 `<title>` 标签的文本内容,类似 XPath 中 `a/text()` 的用法。
`div.overview div.content > div.aroundInfo` 中的 `>` 只匹配直接子元素,而 `div.overview div.content div.aroundInfo`(空格)匹配任意层级的后代元素;在本页面的结构下,两种写法取到的结果一样。
以上是关于scrapy 初探(css)的主要内容,如果未能解决你的问题,请参考以下文章