抽屉爬取
Posted chvv
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了抽屉爬取相关的知识,希望对你有一定的参考价值。
# -*- coding: utf-8 -*-
from urllib.parse import urlencode

import scrapy
from scrapy.http.cookies import CookieJar
from scrapy.http import Request


class ChoutiSpider(scrapy.Spider):
    """Log in to dig.chouti.com and up-vote every item on the hot list.

    Request chain: ``parse`` (collect cookies, POST the login form) ->
    ``check_login`` (request the hot list) -> ``index`` (POST one vote per
    item) -> ``check_result`` (print the vote response).

    NOTE (from the original author): the ``ROBOTSTXT_OBEY = True`` setting
    needs to be commented out for this spider.
    """

    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['https://dig.chouti.com/']

    # Placeholder credentials. Scrapy copies ``-a key=value`` spider arguments
    # onto the instance, so they can be overridden without editing the code:
    #   scrapy crawl chouti -a login_phone=... -a login_password=...
    login_phone = "8618588888888"
    login_password = "88888888"

    # Cookies are needed by every later request, so they are kept on the
    # spider. ``parse`` rebinds this name on the instance instead of mutating
    # this class-level dict, so instances never share cookie state.
    cookie_dict = {}

    def parse(self, response):
        """Save the cookies set by the landing page and submit the login form.

        Voting requires a logged-in session, so the cookies carried by this
        first response are stored in ``self.cookie_dict`` and sent along with
        the login request (and with every request after it).
        """
        print(response.xpath("//div[@id='dig_lcpage']/a/@href"))
        print('=====')

        # Parse the response's cookies into a CookieJar object.
        cookie_jar = CookieJar()
        cookie_jar.extract_cookies(response, response.request)

        # Flatten the jar into a plain name -> value dict. Iterating the jar
        # yields the stored cookie objects, which avoids reaching into the
        # private nested ``_cookies`` mapping.
        self.cookie_dict = {cookie.name: cookie.value for cookie in cookie_jar}
        print("+++++")
        print(self.cookie_dict)

        # urlencode escapes any special characters in the credentials; for the
        # default values it produces exactly
        # "phone=8618588888888&password=88888888&oneMonth=1".
        login_body = urlencode([
            ("phone", self.login_phone),
            ("password", self.login_password),
            ("oneMonth", 1),
        ])
        yield Request(
            url="https://dig.chouti.com/login",
            method="POST",
            body=login_body,
            cookies=self.cookie_dict,
            headers={"content-type": "application/x-www-form-urlencoded; charset=UTF-8"},
            callback=self.check_login,
        )

    def check_login(self, response):
        """Print the login response, then request the first hot-list page."""
        print(response.text)
        print("==============")
        yield Request(
            "https://dig.chouti.com/all/hot/recent/1",
            cookies=self.cookie_dict,
            callback=self.index,
        )

    def index(self, response):
        """Send one up-vote request for each item found on the list page."""
        items = response.xpath("//div[@id='content-list']/div[@class='item']")
        for item in items:
            link_id = item.xpath('*/div[@class="part2"]/@share-linkid').extract_first()
            print(link_id)
            if not link_id:
                # No share-linkid on this item: skip it rather than POSTing
                # a vote for "linksId=None".
                continue
            # Vote request. Uses https like every other request in this
            # spider (the original used plain http for this one URL only).
            yield Request(
                url='https://dig.chouti.com/link/vote?linksId=%s' % (link_id,),
                method='POST',
                cookies=self.cookie_dict,
                callback=self.check_result,
            )

    def check_result(self, response):
        """Print the body of the vote response."""
        print(response.text)
以上是关于抽屉爬取的主要内容,如果未能解决你的问题,请参考以下文章: