Python：这个 Python 脚本是著名的 Wayback Machine 网站存档爬虫的简单命令行实现。
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了 Python：这个 Python 脚本是著名的 Wayback Machine 网站存档爬虫的简单命令行实现。相关的知识,希望对你有一定的参考价值。
#!/bin/python
# The Wayback Machine is a digital archive of the World Wide Web and other information
# on the Internet created by the Internet Archive, a nonprofit organization.
# Author: Kristof Toth
# Description: Command line program for the Wayback machine
# Link to the API: https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server
import argparse
import json
import sys
import urllib2
import requests
from datetime import datetime
from prettytable import PrettyTable
def main():
    """Parse CLI options, query the Wayback CDX API, and print or save results.

    Builds the request from the command-line switches, downloads the snapshot
    index as JSON, formats it with PrettyTable, and either prints the table or
    writes it to a file (asking the user first when the result is large).
    """
    usage = "Usage: python wayback.py -t example.com [options]"
    description = "Command line program for the Wayback machine. Version 1.0"
    parser = argparse.ArgumentParser(usage=usage, description=description)
    # add command line options
    parser.add_argument('--version', action='version', version='%(prog)s 1.0')
    parser.add_argument("-t", "--target", action="store", required=True,
                        dest="target", help="(required) specifies the target domain")
    parser.add_argument("-m", "--match", action="store", choices=["exact", "prefix", "host", "domain"],
                        dest="match", help="(optional) return matches for an exact url"
                                           "exact (default): return exactly matching results"
                                           "prefix: return results for all results under the path"
                                           "host: return result for the given host"
                                           "domain: return results from host and all subhosts")
    parser.add_argument("-f", "--filter", action="store",
                        dest="filter", help="(optional) you can specify different criterias"
                                            "'!' before the query inverts the match"
                                            "Filter for 'OK' response codes: statuscode:200"
                                            "Filter for mime type: mimetype:text/html"
                                            "Filter for a certain digest: digest=...")
    parser.add_argument("--from", action="store",
                        dest="start", help="(optional) you can specify the starting date"
                                           "date format: yyyyMMddhhmmss"
                                           "Example: --from=2015")
    parser.add_argument("--to", action="store",
                        dest="end", help="(optional) you can specify the ending date"
                                         "date format: yyyyMMddhhmmss"
                                         "Example: --to=2017")
    parser.add_argument("-l", "--limit", action="store",
                        dest="limit", help="(optional) limits the number of results"
                                           "-N returns the last N result")
    parser.add_argument("-o", "--output", action="store",
                        dest="result", help="(optional) outputs the result to a file")
    options = parser.parse_args()
    print("[ * ] Target: {0}".format(options.target))
    # CDX API endpoint url
    url = "http://web.archive.org/cdx/search/cdx?"
    # Query parameters for the endpoint. requests silently omits keys whose
    # value is None, so unused switches are simply not sent.
    # BUGFIX: the CDX server expects "matchType" -- the original sent "match",
    # which the API ignores, so the -m/--match switch had no effect at all.
    parameters = {"url": options.target,
                  "limit": options.limit,
                  "matchType": options.match,
                  "filter": options.filter,
                  "from": options.start,
                  "to": options.end,
                  "output": "json"}
    print("[ * ] Downloading data...")
    response = requests.get(url=url, params=parameters)
    # An empty body (no snapshots found) would make json.loads raise
    # ValueError; treat it as "no rows" instead of crashing.
    data = json.loads(response.text) if response.text.strip() else []
    print("[ * ] Download finished!")
    # default choice for saving to a file is no
    choice = "no"
    # if the data is more than 100 lines and the output file is not explicitly
    # defined, ask whether to store it in a file instead
    if options.result is None and len(data) > 100:
        sys.stdout.write("The amount of data is not suitable for standard output. \n"
                         "Would you like store it in a file instead? [ y / n ] : ")
        choice = raw_input().lower()
    # define a pretty table
    table = PrettyTable()
    table.field_names = ["Date", "Time", "Link", "Mime Type", "Status code", "Digest", "Length"]
    table.align["Link"] = "l"
    print("[ * ] This may take some time. Please wait!")
    # data[0] is the JSON header row, so real entries start at index 1
    for i in range(1, len(data)):
        # uncomment to search for username and passwords
        # if row(data, i) is not None:
        table.add_row(row(data, i))
    # user wants to store the data in a file
    if options.result or choice in ["yes", "y"]:
        print("[ * ] Creating file...")
        if options.result:
            # the user used the -o switch, so put the result into that file
            file_name = options.result
        else:
            file_name = "archives.txt"
        # "with" guarantees the handle is closed even if the write fails
        # (the original leaked the handle on a write error)
        with open(file_name, "w+") as output_file:
            print("[ * ] File successfully created!")
            print("[ * ] Writing data to the file...")
            output_file.write(table.get_string().encode("utf-8"))
        print("[ * ] Result successfully saved to {0}!".format(file_name))
    else:
        # just print the whole table to stdout
        print("[ * ] Displaying results for {0}: ".format(options.target))
        print(table)
    print("[ * ] Operation successfully finished!")
def row(data, i):
    """Build one table row from entry *i* of the CDX JSON payload.

    Args:
        data: parsed JSON list; data[0] is the header row and data[i] is
              [urlkey, timestamp, original, mimetype, statuscode, digest, length].
        i: index of the entry to format (>= 1, past the header row).

    Returns:
        A list in PrettyTable column order:
        [date, time, link, mime type, status code, digest, length].
    """
    timestamp = data[i][1]
    try:
        # Parse the 14-digit CDX timestamp ONCE and reuse it for both columns
        # (the original parsed the same string twice with strptime).
        snapshot = datetime.strptime(timestamp, "%Y%m%d%H%M%S")
        date = snapshot.strftime("%Y.%m.%d.")
        time = snapshot.strftime("%H:%M:%S")
    except ValueError:
        # Some snapshots carry a malformed timestamp; show the raw value.
        date = timestamp
        time = timestamp
    # Percent-decode the archived URL and drop any non-ASCII characters.
    domain = urllib2.unquote(data[i][2]).encode('ascii', 'ignore').decode('ascii')
    link = urllib2.unquote("https://web.archive.org/web/" + timestamp + "/" + domain)
    mime_type = data[i][3]
    status_code = data[i][4]
    digest = data[i][5]
    length = data[i][6]
    # use this if you want to search for username and passwords
    # if "user" in link or "pass" in link:
    #     return [date, time, link, mime_type, status_code, digest, length]
    # else:
    #     return None
    return [date, time, link, mime_type, status_code, digest, length]
# Run the CLI entry point only when executed as a script (not on import).
if __name__ == '__main__':
    main()
以上是关于 Python：这个 Python 脚本是著名的 Wayback Machine 网站存档爬虫的简单命令行实现。的主要内容,如果未能解决你的问题,请参考以下文章