#!/usr/bin/env python
'''Build up a set of URLs using the Common Crawl index. See
http://commoncrawl.org/2015/04/announcing-the-common-crawl-index/ for more info.
'''
from __future__ import print_function
import gzip
import logging
import os
import random

import boto3

log = logging.getLogger('urlutils.profiling.get_urls')


def _here(*paths):
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), *paths)


def get_common_crawl_urls(week='2016-07', max_urls=10000000):
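    '''Yield up to max_urls URLs taken from a few randomly chosen shards of the
    Common Crawl index for the weekly crawl named by week.'''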
    num_urls = 0
    bucket = boto3.resource('s3').Bucket('commoncrawl')
    objects = bucket.objects.filter(Prefix='cc-index/collections/CC-MAIN-{}/indexes/'.format(week))
    objects = [o for o in objects if o.key.endswith('.gz')]
    # Common Crawl URLs are alphabetically sorted, so reading the index in order
    # would only give us stuff like http://69.30.227.140/showthread.php?tid=35992.
    # Sample a few index shards at random instead.
    objects = random.sample(objects, 3)
    for object_ in objects:
        filename = _here(os.path.basename(object_.key))
        if not os.path.exists(filename):
            log.info('Downloading common crawl index file %s to %s', object_.key, filename)
            bucket.download_file(object_.key, filename)
            log.info('Downloaded %s to %s', object_.key, filename)
        # Open in text mode so each line is a str on Python 3 as well as Python 2.
        with gzip.open(filename, 'rt') as fp:
            for line in fp:
                if num_urls == max_urls:
                    break
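                # Each index line looks roughly like:
                #   com,example)/ 20160207100926 {"url": "http://example.com/", ...}
                # so the fourth space-separated field is the quoted URL followed by
                # a comma; strip the leading '"' and trailing '",' to get the bare URL.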
                yield line.split(' ')[3][1:-2]
                num_urls += 1
        os.unlink(filename)
        if num_urls == max_urls:
            break


logging.basicConfig(level=logging.INFO)
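# boto3 and botocore are chatty at INFO level; only show their warnings.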
for name in ('boto3', 'botocore'):
    logging.getLogger(name).setLevel(logging.WARNING)
filename = _here('urls.txt')
max_urls = 10000000
log.info('Writing up to %s URLs to %s', '{:,}'.format(max_urls), filename)
with open(filename, 'w') as fp:
    for url in get_common_crawl_urls(max_urls=max_urls):
        print(url, file=fp)