Python:一个多线程挖掘 GitHub 可用用户名的脚本
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了用 Python 多线程挖掘 GitHub 可用用户名的相关知识,希望对你有一定的参考价值。
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Github available username mine
import itertools
import threading
import urllib.error
import urllib.request
from datetime import datetime
from queue import Queue
# ---- Crawl tuning ---------------------------------------------------------
# Number of crawler threads started by the script.
workers = 32
# Longest username to generate (passed to dict_maker as ``length``).
name_length = 4
# Shortest username to generate (passed to dict_maker as ``point``).
mine_length = 4

# ---- HTTP settings --------------------------------------------------------
# A username is probed by requesting base_url + username.
base_url = 'https://github.com/'
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36'
headers = {'User-Agent': user_agent}
# Reason phrase of an HTTP 404 reply; not referenced by the visible code.
error_code = 'Not Found'

# ---- Shared state ---------------------------------------------------------
# args_queue carries candidate usernames to the crawlers; data_queue carries
# the usernames judged available on to the writers.
args_queue, data_queue = Queue(), Queue()
# Guards writes to (and the final close of) the output file.
lock = threading.Lock()
# Output file (not a folder): available usernames are appended, one per line.
f = open('data.txt', mode='a')
class ThreadCrawl(threading.Thread):
    """Worker thread that probes GitHub profile URLs for unclaimed usernames.

    Each iteration takes one candidate username from ``args_queue`` and
    requests ``base_url + username`` with the module-level ``headers``.
    When the server answers HTTP 404 the username is forwarded to
    ``data_queue``; any other outcome is only printed.
    """

    def __init__(self, m_args_queue, m_data_queue):
        """Remember the queue of candidates and the queue for results.

        Args:
            m_args_queue: Queue yielding candidate usernames (``str``).
            m_data_queue: Queue receiving usernames judged available.
        """
        super().__init__()
        self.args_queue = m_args_queue
        self.data_queue = m_data_queue

    def run(self):
        """Process candidates forever; this method never returns."""
        while True:
            args = self.args_queue.get()
            try:
                url = base_url + args
                print(url)
                request = urllib.request.Request(url, headers=headers)
                # Only the status matters, so release the connection at once.
                with urllib.request.urlopen(request):
                    pass
            except urllib.error.HTTPError as e:
                print(args, e.reason)
                # Only 404 means the profile does not exist. Other statuses
                # (rate limiting, server errors, ...) say nothing about
                # availability and must not be recorded.
                if e.code == 404:
                    self.data_queue.put(args)
            except urllib.error.URLError as e:
                # Network-level failure (DNS, refused connection, ...).
                print(args, e.reason)
            except OSError as e:
                # Socket errors raised outside urllib's own URLError wrapping;
                # report them and keep the worker alive.
                print(args, e)
            finally:
                # Always balance get() so args_queue.join() cannot hang.
                self.args_queue.task_done()
class ThreadWrite(threading.Thread):
    """Worker thread that writes queued strings to a shared file object.

    Each item taken from ``data_queue`` is written to ``f`` followed by a
    newline, while holding ``lock`` so that lines from several writers do
    not interleave.
    """

    def __init__(self, m_data_queue, m_lock, m_f):
        """Remember the result queue, the file lock and the output file.

        Args:
            m_data_queue: Queue yielding the strings to write.
            m_lock: Lock serialising access to ``m_f``.
            m_f: Writable text file object.
        """
        super().__init__()
        self.data_queue = m_data_queue
        self.lock = m_lock
        self.f = m_f

    def run(self):
        """Write queued items forever; this method never returns normally."""
        while True:
            data = self.data_queue.get()
            try:
                with self.lock:
                    self.f.write(data + '\n')
            finally:
                # Mark the item done even if the write failed; otherwise
                # data_queue.join() would block forever on it.
                self.data_queue.task_done()
def dict_maker(length, point=1, chars=None):
    """Generate every candidate name whose length is in ``point..length``.

    Args:
        length: Longest name length to generate (inclusive).
        point: Shortest name length to generate (inclusive); at least 1.
        chars: Characters to build names from. Defaults to the lowercase
            ASCII letters followed by the digits.

    Returns:
        An iterator of character tuples: all names of length ``point``
        first, then ``point + 1``, and so on up to ``length``. Join a
        tuple with ``"".join(...)`` to obtain the name.

    Raises:
        ValueError: If ``length >= point >= 1`` does not hold.
    """
    # An explicit check instead of ``assert`` so that it survives ``python -O``.
    if not length >= point >= 1:
        raise ValueError(
            'expected length >= point >= 1, got length=%r, point=%r'
            % (length, point)
        )
    if chars is None:
        import string
        chars = string.ascii_lowercase + string.digits
    return itertools.chain.from_iterable(
        itertools.product(chars, repeat=size)
        for size in range(point, length + 1)
    )
if __name__ == '__main__':
    # Wall-clock start, used only for the elapsed-time report at the end.
    time_point = datetime.now()

    # Start the crawlers first so they consume names while the queue fills.
    for i in range(workers):
        t = ThreadCrawl(args_queue, data_queue)
        # Daemon threads: their run() loops never end, so they must not keep
        # the interpreter alive. (Thread.setDaemon() is deprecated.)
        t.daemon = True
        t.start()

    # Enqueue every candidate name; dict_maker yields tuples of characters.
    dicts = dict_maker(name_length, point=mine_length)
    for n in dicts:
        str_n = "".join(n)
        args_queue.put(str_n)

    for i in range(2):
        t = ThreadWrite(data_queue, lock, f)
        t.daemon = True
        t.start()

    # Wait until every candidate was probed and every result was written.
    args_queue.join()
    data_queue.join()
    with lock:
        f.close()

    # total_seconds() keeps whole days that ``.seconds`` would silently drop;
    # int() preserves the original whole-second output format.
    print(int((datetime.now() - time_point).total_seconds()))
以上是关于用 Python 多线程挖掘 GitHub 可用用户名的主要内容。如果未能解决你的问题,请参考以下文章。