Crawling a website with Python Scrapy and capturing the different response status codes
Can someone help me understand how to capture the response status code of every request my Scrapy spider crawls? I can get output for response code 200, but if the site returns a 404 nothing is written at all, and the same goes for 301 and 302.
Here is the code I implemented for other websites; a domain containing my name has been substituted in for reference.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.spiders import Rule, CrawlSpider


class TestSpider(CrawlSpider):
    name = 'TestSpider'
    handle_httpstatus_list = [404]

    # Log files for the different response categories (raw strings so the
    # backslashes are not treated as escape sequences)
    resp_log_file = r'C:\resp'
    ok_log_file = r'C:\alright'
    bad_log_file = r'C:\badresp'
    redirect_log_file = r'C:\redirect'

    allowed_domains = ['santhosh.com']
    start_urls = ['santhosh.com/']

    # This spider has one rule: extract all (unique and canonicalized) links,
    # follow them and parse them using the parse_item method
    rules = [
        Rule(
            LinkExtractor(
                canonicalize=True,
                unique=True
            ),
            follow=True,
            callback="parse_item"
        )
    ]

    def parse_item(self, response):
        # The list of items that are found on the particular page
        items = []
        res = Selector(response)
        self.append(self.resp_log_file, str(response))
        # Only extract canonicalized and unique links (with respect to the current page)
        links = LinkExtractor(canonicalize=True, unique=True).extract_links(response)
        try:
            if response.status == 404:
                ## 404s are also tracked separately
                self.append(self.bad_log_file, response.url)
            elif response.status == 200:
                ## write to ok_log_file
                self.append(self.ok_log_file, response.url)
            elif response.status == 302:
                ## write to redirect_log_file
                self.append(self.redirect_log_file, response.url)
            else:
                self.append(self.bad_log_file, response.url)
        except Exception as e:
            pass
        return None

    def append(self, file, string):
        print(" Writing content to File ")
        file = open(file, 'a')
        file.write(string + "\n")
        file.close()
I have seen questions related to capturing response codes, but they are not quite the same as mine, hence this new post. If an existing question already covers this, please ignore this one and redirect me there. Thanks in advance!
Answer
I tried the code and I see that it sends the 404 and 301 responses to parse(), not to parse_item(). But I had no page with broken links, so the LinkExtractor never started. I used the httpbin.org service to generate pages with different statuses. Maybe if I had a page with broken URLs, the LinkExtractor could run and I would get different results.
#!/usr/bin/env python3

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.spiders import Rule, CrawlSpider
#from scrapy.commands.view import open_in_browser


class MySpider(CrawlSpider):
    name = 'MySpider'

    handle_httpstatus_list = [404, 301, 302, 303]

    all_responses_log = './responses_all.log'
    ok_responses_log = './responses_ok.log'
    bad_responses_log = './responses_bad.log'
    redirects_responses_log = './responses_redirect.log'

    start_urls = [
        'http://httpbin.org/status/301',
        'http://httpbin.org/status/302',
        'http://httpbin.org/status/303',
        'http://httpbin.org/status/404',
        'http://httpbin.org/status/200',
    ]

    # This spider has one rule: extract all (unique and canonicalized) links,
    # follow them and parse them using the parse_item method
    rules = [
        Rule(
            LinkExtractor(
                canonicalize=True,
                unique=True
            ),
            follow=True,
            callback="parse_item"
        )
    ]

    def parse(self, response):
        # responses for the start_urls arrive here
        print('parse url:', response.url)
        self.test_status('parse()', response)

    def parse_item(self, response):
        # responses for links extracted by the Rule arrive here
        print('parse item url:', response.url)
        self.test_status('parse_item()', response)

        # The list of items that are found on the particular page
        items = []
        res = Selector(response)
        self.append(self.all_responses_log, str(response))

        # Only extract canonicalized and unique links (with respect to the current page)
        links = LinkExtractor(canonicalize=True, unique=True).extract_links(response)

    def test_status(self, text, response):
        try:
            if response.status == 404:
                log = self.bad_responses_log
            elif response.status == 200:
                log = self.ok_responses_log
            #elif 299 < response.status < 400:
            elif response.status in (301, 302, 303, 307):
                log = self.redirects_responses_log
            else:
                log = self.bad_responses_log
            message = "{} | {} | {}\n".format(response.status, text, response.url)
            self.append(log, message)
        except Exception as e:
            print('Error:', e)

    def append(self, filename, string):
        print('Writing log:', filename)
        with open(filename, 'a') as f:
            f.write(string)


# --- it runs without a project and writes the status log files defined above ---
from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
})
c.crawl(MySpider)
c.start()
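A side note that may help explain the behaviour above: by default Scrapy's HttpErrorMiddleware filters out non-2xx responses before they reach spider callbacks, and RedirectMiddleware follows 301/302/303 automatically, which is why those codes only become visible once they are listed in handle_httpstatus_list. As an alternative to the class-level list, below is a minimal sketch (not from the original post) that opts in per request with the handle_httpstatus_all meta key and adds an errback for requests that fail at the network level; the spider name and log path are made up for illustration.

import scrapy
from scrapy.spidermiddlewares.httperror import HttpError


class StatusLogSpider(scrapy.Spider):
    name = 'status_log_example'            # hypothetical name
    status_log = './responses_status.log'  # hypothetical log path

    start_urls = [
        'http://httpbin.org/status/200',
        'http://httpbin.org/status/301',
        'http://httpbin.org/status/404',
    ]

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback=self.log_status,
                errback=self.log_failure,
                # pass every status (2xx, 3xx, 4xx, 5xx) straight to the callback;
                # this also stops RedirectMiddleware from following 3xx responses
                meta={'handle_httpstatus_all': True},
            )

    def log_status(self, response):
        # every response, whatever its status code, reaches this callback
        with open(self.status_log, 'a') as f:
            f.write('{} | {}\n'.format(response.status, response.url))

    def log_failure(self, failure):
        # network-level failures (DNS errors, timeouts, ...) end up here;
        # without handle_httpstatus_all, non-2xx responses would instead arrive
        # as HttpError failures carrying the original response
        if failure.check(HttpError):
            response = failure.value.response
            self.logger.warning('HttpError %s on %s', response.status, response.url)
        else:
            self.logger.error('Request failed: %s', repr(failure))

If the goal is simply to switch off the error filtering project-wide, the HTTPERROR_ALLOWED_CODES or HTTPERROR_ALLOW_ALL settings achieve the same thing globally; redirects are still followed unless handled per request as above or REDIRECT_ENABLED is disabled.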