import mechanize
import urllib2
import urlparse
import argparse
import json
# Command-line options: the URL to start crawling from and an optional domain restriction.
p = argparse.ArgumentParser()
p.add_argument("-s", "--site", required=True, help="URL to start crawling from")
p.add_argument("-d", "--domain_limit", help="only follow links in this domain, e.g. example.com")
def get_all_links(br, links, visited=None, recursion=0, domain_limit=None):
    """Recursively follow every URL in `links` and return the full set of links discovered."""
    if visited is None:
        visited = set()
    if recursion:
        print "***** RECURSION %d *****" % recursion
    new_links = set()
    for link in links:
        if link not in visited:
            if domain_limit:
                # Compare only the last two labels of the hostname (e.g. "example.com").
                link_parsed = urlparse.urlparse(link)
                dom = ".".join(link_parsed.netloc.split(".")[-2:])
                if dom != domain_limit:
                    print "Skipping %s because it's not in the %s domain" % (link, domain_limit)
                    continue
            print "Getting page: %s" % link
            visited.add(link)
            try:
                br.open(link)
                if not br.viewing_html():
                    # Non-HTML responses (images, PDFs, etc.) have no links to follow.
                    continue
            except urllib2.HTTPError, e:
                if e.getcode() == 403:
                    # mechanize reports URLs disallowed by robots.txt as a 403 HTTPError.
                    print "Skipping %s because it's in robots.txt" % link
                continue
            except urllib2.URLError, e:
                print "URLError: %s" % e
                continue
            # Queue every link on the page that hasn't been seen or queued already.
            for l in br.links():
                if l.absolute_url not in links and l.absolute_url not in new_links:
                    new_links.add(l.absolute_url)
    if new_links:
        # Recurse one level deeper on the newly discovered links.
        recursion += 1
        links = links.union(get_all_links(br, new_links, links.union(visited), recursion, domain_limit))
    return links
if __name__ == "__main__":
    args = p.parse_args()
    br = mechanize.Browser()
    links = set()
    try:
        # Start crawling from the URL given on the command line.
        links = get_all_links(br, set([args.site]), domain_limit=args.domain_limit)
    except Exception, e:
        print e
    if links:
        print "Found %d links!" % len(links)
        # Dump everything we found to <hostname>.json in the current directory.
        url = urlparse.urlparse(args.site)
        with open("%s.json" % url.netloc, "w") as f:
            f.write(json.dumps(list(links), indent=2))