python: This Python script is a simple command-line implementation of a scraper for the famous Wayback Machine web archive.

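For context before the listing: the script talks to the Wayback CDX server, whose JSON output is an array of arrays in which the first element is a header row naming the columns. A minimal sketch of that shape (the record below is illustrative, not real archive data) explains the data[i][...] indexing used in the script:

# Shape of the CDX server's JSON output when "output" is "json".
# The first element names the columns; every later row follows the same order.
# The record shown here is illustrative, not real archive data.
example_response = [
    ["urlkey", "timestamp", "original", "mimetype",
     "statuscode", "digest", "length"],
    ["com,example)/", "20170101000000", "http://example.com/",
     "text/html", "200", "EXAMPLEDIGESTPLACEHOLDER", "1234"],
]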

#!/usr/bin/env python2

# The Wayback Machine is a digital archive of the World Wide Web and other information
# on the Internet created by the Internet Archive, a nonprofit organization.

# Author: Kristof Toth
# Description: Command line program for the Wayback machine
# Link to the API: https://github.com/internetarchive/wayback/tree/master/wayback-cdx-server

import argparse
import json
import sys
import urllib2
import requests
from datetime import datetime
from prettytable import PrettyTable


def main():
    usage = "Usage: python wayback.py -t example.com [options]"
    description = "Command line program for the Wayback machine. Version 1.0"
    parser = argparse.ArgumentParser(usage=usage, description=description)
    # add the command-line options
    parser.add_argument('--version', action='version', version='%(prog)s 1.0')
    parser.add_argument("-t", "--target", action="store", required=True,
                        dest="target", help="(required) specifies the target domain")
    parser.add_argument("-m", "--match", action="store", choices=["exact", "prefix", "host", "domain"],
                        dest="match", help="(optional) return matches for an exact url"
                                           "exact (default): return exactly matching results"
                                           "prefix: return results for all results under the path"
                                           "host: return result for the given host"
                                           "domain: return results from host and all subhosts")
    parser.add_argument("-f", "--filter", action="store",
                        dest="filter", help="(optional) you can specify different criterias"
                                            "'!' before the query inverts the match"
                                            "Filter for 'OK' response codes: statuscode:200"
                                            "Filter for mime type: mimetype:text/html"
                                            "Filter for a certain digest: digest=...")
    parser.add_argument("--from", action="store",
                        dest="start", help="(optional) you can specify the starting date"
                                           "date format: yyyyMMddhhmmss"
                                           "Example: --from=2015")
    parser.add_argument("--to", action="store",
                        dest="end", help="(optional) you can specify the ending date"
                                         "date format: yyyyMMddhhmmss"
                                         "Example: --to=2017")
    parser.add_argument("-l", "--limit", action="store",
                        dest="limit", help="(optional) limits the number of results"
                                           "-N returns the last N result")
    parser.add_argument("-o", "--output", action="store",
                        dest="result", help="(optional) outputs the result to a file")

    options = parser.parse_args()

    print "[ * ] Target: {0}".format(options.target)

    # API endpoint url
    url = "http://web.archive.org/cdx/search/cdx?"

    # additional parameters to the endpoint
    parameters = {"url": options.target,
                  "limit": options.limit,
                  "match": options.match,
                  "filter": options.filter,
                  "from": options.start,
                  "to": options.end,
                  "output": "json"}

    print "[ * ] Downloading data..."
    response = requests.get(url=url, params=parameters)
    data = json.loads(response.text)
    print "[ * ] Download finished!"

    # default choice for saving to a file is no
    choice = "no"

    # if the data is more than 100 lines and the output file is not explicitly defined, ask the question
    if options.result is None and len(data) > 100:
        sys.stdout.write("The amount of data is not suitable for standard output. \n"
                         "Would you like store it in a file instead? [ y / n ] : ")
        choice = raw_input().lower()

    # define a pretty table
    table = PrettyTable()
    table.field_names = ["Date", "Time", "Link", "Mime Type", "Status code", "Digest", "Length"]
    table.align["Link"] = "l"
    print "[ * ] This may take some time. Please wait!"
    for i in range(1, len(data)):
        # uncomment to search for username and passwords
        # if row(data, i) is not None:
        table.add_row(row(data, i))

    # user wants to store the data in a file
    if options.result or choice in ["yes", "y"]:
        print "[ * ] Creating file..."
        if options.result:
            file_name = options.result
        else:
            file_name = "archives.txt"
        # write the rendered table to the chosen file
        print "[ * ] Writing data to the file..."
        with open(file_name, "w") as output_file:
            output_file.write(table.get_string().encode("utf-8"))
        print "[ * ] Result successfully saved to {0}!".format(file_name)
    else:
        # just iterate through the result and print it to stdout
        print "[ * ] Displaying results for {0}: ".format(options.target)
        print table
        print "[ * ] Operation successfully finished!"


def row(data, i):
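    # each CDX row is:
    # [urlkey, timestamp, original, mimetype, statuscode, digest, length]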
    try:
        # try to format the timestamp into a more readable form
        parsed = datetime.strptime(data[i][1], "%Y%m%d%H%M%S")
        date = parsed.strftime("%Y.%m.%d.")
        time = parsed.strftime("%H:%M:%S")
    except ValueError:
        # sometimes the timestamp cannot be parsed, so leave it as it is
        date = data[i][1]
        time = data[i][1]
    # urllib2 re-exports unquote from urllib in Python 2
    domain = urllib2.unquote(data[i][2]).encode('ascii', 'ignore').decode('ascii')
    # the url part is already unquoted above, so build the snapshot link directly
    link = "https://web.archive.org/web/" + data[i][1] + "/" + domain
    mime_type = data[i][3]
    status_code = data[i][4]
    digest = data[i][5]
    length = data[i][6]

    # use this if you want to search for username and passwords
    # if "user" in link or "pass" in link:
    #    return [date, time, link, mime_type, status_code, digest, length]
    # else:
    #    return None

    return [date, time, link, mime_type, status_code, digest, length]


if __name__ == '__main__':
    main()
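The script above targets Python 2 (print statements, urllib2, raw_input). For readers on Python 3, here is a minimal sketch of the same core CDX query, assuming only the requests and prettytable packages are installed; the file name wayback3.py and the fetch_snapshots helper are illustrative, not part of the original:

# wayback3.py -- minimal Python 3 sketch of the core query (not the full CLI)
import requests
from prettytable import PrettyTable

CDX_URL = "http://web.archive.org/cdx/search/cdx"

def fetch_snapshots(target, limit="10"):
    # requests drops None-valued parameters, so optional arguments
    # can simply be left out of this dict
    params = {"url": target, "limit": limit, "output": "json"}
    data = requests.get(CDX_URL, params=params).json()
    table = PrettyTable()
    if data:
        # the first row of the JSON output names the columns
        table.field_names = data[0]
        for entry in data[1:]:
            table.add_row(entry)
    return table

if __name__ == "__main__":
    print(fetch_snapshots("example.com"))

Running it prints a table of up to ten snapshots for example.com, mirroring what the Python 2 script renders with its own PrettyTable.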
