Scrapy: Crawling Sites That Require Registration and Login
Posted by funsion
This post walks through using Scrapy to crawl a site whose content is only reachable after logging in: the spider first fetches the login page to obtain a session cookie, then posts the login form to authorize that cookie, and finally crawls the protected pages with it.
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
from scrapy.http import Request, FormRequest
from scrapy.spiders import CrawlSpider
from spider_test.items import SpiderTestItem
from spider_test import settings


# @author Funsion Wu
class ScrapyTestSpider(CrawlSpider):
    name = "spider_test"
    allowed_domains = [settings.SPIDER_DOMAIN]

    def start_requests(self):
        # Step 1: request the login page with the cookiejar enabled so that
        # Scrapy receives the session cookie; parse() handles the response.
        yield Request('http://%s/admin/account/login.html' % settings.SPIDER_DOMAIN,
                      meta={'cookiejar': 1},
                      callback=self.parse)

    def parse(self, response):
        data = dict(username="xiaoming",  # account field of the login form
                    password="888888")    # password field of the login form
        print('Logging in...')
        # Step 2: POST the login form, carrying the cookie, the browser user
        # agent and the credentials, so the server authorizes the cookie.
        yield FormRequest(url='http://%s/admin/account/dologin.html' % settings.SPIDER_DOMAIN,  # the real POST endpoint
                          meta={'cookiejar': 1},
                          formdata=data,
                          callback=self.jump_office_list)

    def jump_office_list(self, response):
        print('Requesting a page that is only reachable after login...')
        yield Request('http://%s/admin/office/getofficelist.html' % settings.SPIDER_DOMAIN,
                      meta={'cookiejar': 1},
                      callback=self.parser_office_list)

    def parser_office_list(self, response):
        soup = BeautifulSoup(response.body, 'html.parser')
        # Follow every pagination link, feeding each page back into this parser.
        page_list = soup.find(attrs={'class': 'pagination'}).find_all('a')
        if page_list:
            for page in page_list:
                page_url = 'http://%s%s' % (settings.SPIDER_DOMAIN, page.get('href'))
                yield Request(page_url, meta={'cookiejar': 1}, callback=self.parser_office_list)
        # Follow every office detail link on the current listing page.
        office_list = soup.find_all('a', attrs={'class': 'ui-office-list'})
        if office_list:
            for office in office_list:
                office_url = 'http://%s%s' % (settings.SPIDER_DOMAIN, office.attrs['href'])
                yield Request(office_url, meta={'cookiejar': 1}, callback=self.parse_article)

    def parse_article(self, response):
        # Extract the detail page into an item: source URL, page title, and
        # the HTML of the statistics table.
        test_item = SpiderTestItem()
        soup = BeautifulSoup(response.body, 'html.parser')
        container = soup.find('table', attrs={'class': 'index-statistics-table'})
        test_item['source_url'] = response.url
        test_item['title'] = soup.title.get_text()
        test_item['article_content'] = container.prettify()
        return test_item
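The spider relies on two project files the post does not show: `spider_test/items.py` and `spider_test/settings.py`. The sketches below are assumptions inferred from how the spider uses them, not code from the original project; the field names follow `parse_article()`, and the domain and user-agent values are placeholders.

A minimal `SpiderTestItem` only needs the three fields the spider fills in:

    # spider_test/items.py -- minimal sketch; fields inferred from parse_article()
    import scrapy

    class SpiderTestItem(scrapy.Item):
        source_url = scrapy.Field()       # URL the page was scraped from
        title = scrapy.Field()            # text of the page's <title>
        article_content = scrapy.Field()  # prettified HTML of the stats table

`SPIDER_DOMAIN` is a custom key rather than a built-in Scrapy setting, and the login step mentions sending a browser user agent, so `settings.py` presumably contains entries along these lines:

    # spider_test/settings.py -- relevant entries only; values are placeholders
    SPIDER_DOMAIN = 'www.example.com'  # target host of the original post (unknown)
    COOKIES_ENABLED = True             # cookie middleware must stay on for meta={'cookiejar': 1}
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'  # browser-like UA

With those in place, the spider runs like any other Scrapy project:

    scrapy crawl spider_test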