Scraping paginated content with Scrapy and Selenium
I am using Selenium and Scrapy to scrape a website whose content is loaded via AJAX. First of all, I cannot simulate the AJAX request directly because of the "csrf" token. As for the main problem, here is the link I want to scrape:
https://lastsecond.ir/hotels
The URL pattern of this website looks like this:
https://lastsecond.ir/hotels?page=1
https://lastsecond.ir/hotels?page=2
https://lastsecond.ir/hotels?page=3
....
https://lastsecond.ir/hotels?page=230
The content is generated by AJAX, so I have to use Selenium and wait a little in the browser, but I cannot walk through the pages and get all of the content! I only get the content of the first page, and I don't get any errors. Here is my spider code:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import TextResponse
from scrapy.loader import ItemLoader
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from ..items import HotelItem  # the project's item class (assumed location)


class HotelsSpider(CrawlSpider):
    name = 'hotels'
    allowed_domains = ['lastsecond.ir']
    start_urls = ['http://lastsecond.ir/hotels']

    rules = (
        Rule(LinkExtractor(allow=r'/hotels?page=[0-9]/'),
             callback='parse_item', follow=True),
    )

    def __init__(self, *args, **kwargs):
        super(HotelsSpider, self).__init__(*args, **kwargs)
        self.driver = webdriver.Chrome(executable_path='chromedriver.exe')

    def parse_item(self, response):
        self.driver.get("http://lastsecond.ir/hotels?page=1")
        WebDriverWait(self.driver, 30).until(
            EC.presence_of_element_located((By.ID, "panel1"))
        )
        response = TextResponse(url=response.url,
                                body=self.driver.page_source,
                                encoding='utf-8')
        hotel = ItemLoader(item=HotelItem(), response=response)
        hotel.add_css('hotel_name', '#panel1 h2.semimedium-font-size a::text')
        return hotel.load_item()
Each page issues an AJAX request; there is no "next" link, only numbered pagination. My main problem is that I only get the content of the first page!
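For reference, the reason only the first page comes back is that parse_item always calls driver.get() on page=1. Below is a minimal sketch of how the Selenium approach could step through the numbered pages instead; it only reuses pieces from the asker's own code (HotelItem, the #panel1 wait, the 230-page range from the URL list above) and is an illustration, not the approach taken in the answer below, which drops Selenium entirely:

    def parse_item(self, response):
        # walk every numbered page instead of always loading page=1
        for page in range(1, 231):
            self.driver.get("https://lastsecond.ir/hotels?page={}".format(page))
            WebDriverWait(self.driver, 30).until(
                EC.presence_of_element_located((By.ID, "panel1"))
            )
            # note: once #panel1 already exists, this wait returns immediately,
            # so a staleness or content check would be more robust
            page_response = TextResponse(url=self.driver.current_url,
                                         body=self.driver.page_source,
                                         encoding='utf-8')
            hotel = ItemLoader(item=HotelItem(), response=page_response)
            hotel.add_css('hotel_name', '#panel1 h2.semimedium-font-size a::text')
            yield hotel.load_item()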
Answer
The token is in the JavaScript code of the page http://lastsecond.ir/hotels, i.e.

var csrftoken = 'P7E5Txa5GGmMdJaEf6Y99RsD24vlzD74zEqKg83f';

so you can use standard string functions to get it.
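For illustration only, the token could also be pulled out with a regular expression instead of the find()-based string handling used in the spider below; the names here are mine and `html` stands for the page source (e.g. response.text inside parse()):

import re

# `html` is the downloaded page source, e.g. response.text in parse()
match = re.search(r"var csrftoken = '([^']+)';", html)
csrftoken = match.group(1) if match else None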
Once you have the token, you can create a POST request to http://lastsecond.ir/hotels/ajax with FormRequest(), and you don't need Selenium. Use dont_filter=True in the FormRequest(), because it requests the same URL many times and Scrapy normally skips duplicated URLs.
#!/usr/bin/env python3

import scrapy
#from scrapy.commands.view import open_in_browser
import json


class MySpider(scrapy.Spider):

    name = 'myspider'

    #allowed_domains = []

    start_urls = ['https://lastsecond.ir/hotels']

    #def start_requests(self):
    #    self.url_template = 'http://quotes.toscrape.com/tag/{}/page/{}/'
    #    self.tags = ['love', 'inspirational', 'life', 'humor', 'books']
    #    self.pages = 10
    #
    #    for tag in self.tags:
    #        for page in range(self.pages):
    #            url = self.url_template.format(tag, page)
    #            yield scrapy.Request(url)

    def parse(self, response):
        print('url:', response.url)

        # get csrftoken from the JavaScript code on the page
        html = response.body_as_unicode()
        start = html.find("var csrftoken = '")
        start = start + len("var csrftoken = '")
        end = html.find("';", start)
        self.csrftoken = html[start:end]
        print('csrftoken:', self.csrftoken)

        yield self.create_ajax_request('1')

    def create_ajax_request(self, page):
        '''
        subfunction can't use `yield`, it has to `return` a Request to `parse`,
        and `parse` can use `yield`
        '''
        print('yield page:', page)

        url = 'https://lastsecond.ir/hotels/ajax'

        headers = {
            'X-CSRF-TOKEN': self.csrftoken,
            'X-Requested-With': 'XMLHttpRequest',
        }

        params = {
            'filter_score': '',
            'sort': 'reviewed_at',
            'duration': '0',
            'page': page,
            'base_location_id': '1',
        }

        return scrapy.FormRequest(url,
            callback=self.parse_details,
            formdata=params,
            headers=headers,
            dont_filter=True,
        )

    # --- commented-out template examples (not part of the working spider) ---

    #open_in_browser(response)

    # save JSON in a separate file
    #number = response.url.split('/')[-1]
    #filename = 'page-{}.json'.format(number)
    #with open(filename, 'wb') as f:
    #    f.write(response.body)

    # convert JSON into Python's dictionary
    #data = json.loads(response.text)

    # download files
    #for href in response.css('img::attr(href)').extract():
    #    url = response.urljoin(src)
    #    yield {'file_urls': [url]}

    # download images and convert to JPG
    #for src in response.css('img::attr(src)').extract():
    #    url = response.urljoin(src)
    #    yield {'image_urls': [url]}

    #item = {'url': '...', 'title': '...'}
    #yield self.Request(url, meta={'item': item}, callback=self.parse_details)

    def parse_details(self, response):
        print('url:', response.url)

        data = json.loads(response.body_as_unicode())

        current = data['pagination']['current']
        last = data['pagination']['last']
        print('page:', current, '/', last)

        print('keys:', data.keys())
        print('keys[hotels]:', data['hotels'][0].keys())
        print('pagination:', data['pagination'])

        for hotel in data['hotels']:
            print('title_en:', hotel['title_en'])
            yield hotel

        # request the next page until the last one is reached
        if current != last:
            yield self.create_ajax_request(str(int(current) + 1))


# --- it runs without a project and saves the data in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',

    # save in a file as CSV, JSON or XML
    'FEED_FORMAT': 'csv',      # csv, json, xml
    'FEED_URI': 'output.csv',

    # download files to `FILES_STORE/full`
    # it needs `yield {'file_urls': [url]}` in `parse()`
    #'ITEM_PIPELINES': {'scrapy.pipelines.files.FilesPipeline': 1},
    #'FILES_STORE': '/path/to/valid/dir',

    # download images and convert to JPG
    # it needs `yield {'image_urls': [url]}` in `parse()`
    #'ITEM_PIPELINES': {'scrapy.pipelines.images.ImagesPipeline': 1},
    #'IMAGES_STORE': '/path/to/valid/dir',
})
c.crawl(MySpider)
c.start()
Part of the results is printed on the screen; all of the data is saved in output.csv.
page: 1
keys: dict_keys(['hotels', 'pagination', 'grades', 'locations', 'scores'])
keys[hotels]: dict_keys(['id', 'title_fa', 'title_en', 'link', 'logo_link', 'decorated_grade', 'location', 'rank', 'is_recommended_percent', 'decorated_score', 'reviews_count'])
title_en: Heliya Kish hotel
title_en: Amara Prestige Elite
title_en: All Seasons Hotel
title_en: Hotel Grand Unal
title_en: Marmaray hotel
title_en: Nova Plaza Taksim Square
title_en: Flora Grand Hotel
title_en: Boulevard Autograph Collection hotel
title_en: Alfa Istanbul hotel
title_en: Ramada Hotel & Suites Istanbul Merter
title_en: Sabena hotel
title_en: Taksim Gonen
title_en: Fame Residence Lara & SPA
title_en: Palazzo Donizetti Hotel
title_en: Twin Towers hotel
title_en: Grand Hotel de Pera hotel
title_en: Grand Hotel Halic
title_en: Grand Pamir hotel
title_en: St George hotel
title_en: The Royal Paradise hotel
page: 2
keys: dict_keys(['hotels', 'pagination', 'grades', 'locations', 'scores'])
keys[hotels]: dict_keys(['id', 'title_fa', 'title_en', 'link', 'logo_link', 'decorated_grade', 'location', 'rank', 'is_recommended_percent', 'decorated_score', 'reviews_count'])
title_en: Radisson Royal moscow hotel
title_en: Avenue hotel
title_en: jamshid esfahan hotel
title_en: Aquatek hotel
title_en: Adalya Elite Lara
title_en: Federal Kuala Lumpur hotel
title_en: Feronya Hotel
title_en: Dolabauri Tbilisi hotel
title_en: Limak Limra hotel
title_en: Urban Boutique Hotel
title_en: Doubletree Hilton Piyalepasa hotel
title_en: Ferman Hilal hotel
title_en: Grand Oztanik Hotel
title_en: Lara Family Club hotel
title_en: Swissotel The Bosphorus
title_en: Berjaya Times Square hotel
title_en: Gardenia hotel
title_en: Rixos Sungate
title_en: Jumeirah Emirates Towers hotel
title_en: Kervansaray Lara Hotel