Python:一个多线程挖掘 GitHub 可用用户名的脚本

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了如何用 Python 多线程挖掘 GitHub 上可用的用户名,希望对你有一定的参考价值。

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Github available username mine
import itertools
import threading
import urllib.error
import urllib.request
from datetime import datetime
from queue import Queue

# Output file for usernames that look available. Opened in append mode at
# import time, so results from earlier runs are kept.
f = open('data.txt', 'a')
# Number of crawler threads started by the script.
workers = 32
# Longest username to generate (passed to dict_maker as `length`).
name_length = 4
# Shortest username to generate (passed to dict_maker as `point`).
mine_length = 4

# Candidate usernames waiting to be checked by the crawler threads.
args_queue = Queue()
# Usernames the crawler threads reported, waiting to be written to `f`.
data_queue = Queue()
# Guards writes to (and the final close of) the shared output file.
lock = threading.Lock()

# A candidate username is appended to this prefix to form the URL to request.
base_url = 'https://github.com/'
# Browser-like User-Agent sent with every request.
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36'
headers = {'User-Agent': user_agent}
# NOTE(review): not referenced anywhere in the visible code; presumably the
# reason phrase of an HTTP 404 response -- confirm before relying on it.
error_code = 'Not Found'


class ThreadCrawl(threading.Thread):
    """Worker thread that probes profile URLs for unused usernames.

    Each worker repeatedly takes a candidate username from ``args_queue``,
    requests ``base_url + username`` and, only when the server answers with
    HTTP 404, puts the username on ``data_queue``.  Any other failure
    (timeout, DNS error, rate limiting, server error) is reported on stdout
    but is NOT recorded, because it says nothing about availability.

    The loop never exits on its own: run the thread as a daemon and wait for
    completion with ``args_queue.join()``.
    """

    # Seconds to wait on a single request, so that one stalled connection
    # cannot block a worker (and therefore ``args_queue.join()``) forever.
    request_timeout = 10

    def __init__(self, m_args_queue, m_data_queue):
        """Store the queue of candidates and the queue of results."""
        threading.Thread.__init__(self)
        self.args_queue = m_args_queue
        self.data_queue = m_data_queue

    @staticmethod
    def _is_available(error):
        """Return True when *error* is an HTTP 404 ("Not Found") response.

        Plain ``URLError`` instances have no ``code`` attribute, so network
        level failures are never mistaken for a free username.
        """
        return getattr(error, 'code', None) == 404

    def run(self):
        """Consume candidates forever, reporting the ones that return 404."""
        # Local import: keeps this class usable without touching the
        # file-level import block.
        import http.client

        while True:
            args = self.args_queue.get()
            try:
                url = base_url + args
                print(url)
                request = urllib.request.Request(url, headers=headers)
                # Only the status matters; close the response right away so
                # sockets are not leaked across many requests.
                with urllib.request.urlopen(request,
                                            timeout=self.request_timeout):
                    pass
            except urllib.error.URLError as e:
                # HTTPError is a subclass of URLError, so this branch sees
                # both HTTP status errors and network-level failures.
                print(args, e.reason)
                if self._is_available(e):
                    self.data_queue.put(args)
            except (OSError, http.client.HTTPException) as e:
                # Errors raised while reading the response (socket timeout,
                # dropped connection, malformed status line) are not wrapped
                # in URLError; report them and keep the worker alive.
                print(args, e)
            finally:
                # Always acknowledge the item, otherwise join() never returns.
                self.args_queue.task_done()


class ThreadWrite(threading.Thread):
    """Worker thread that writes queued usernames to a shared file object.

    Each item taken from ``data_queue`` is written as one line to ``f``
    while holding ``lock``.  The loop never exits on its own: run the thread
    as a daemon and wait for completion with ``data_queue.join()``.
    """

    def __init__(self, m_data_queue, m_lock, m_f):
        """Store the result queue, the file lock and the output file."""
        threading.Thread.__init__(self)
        self.data_queue = m_data_queue
        self.lock = m_lock
        self.f = m_f

    def run(self):
        """Consume results forever, writing one username per line."""
        while True:
            data = self.data_queue.get()
            try:
                with self.lock:
                    self.f.write(data + '\n')
            except (OSError, ValueError) as e:
                # OSError: the write itself failed; ValueError: the file was
                # already closed.  Report it and keep the writer alive.
                print(data, e)
            finally:
                # Always acknowledge the item, otherwise join() never returns.
                self.data_queue.task_done()


def dict_maker(length, point=1, chars=None):
    """Generate every candidate of ``point`` to ``length`` characters.

    Candidates are produced shortest first, each as a tuple of single
    characters taken from ``chars`` (lowercase ASCII letters followed by
    digits when ``chars`` is None).

    Returns an ``itertools.chain`` iterator over those tuples.
    Asserts that ``length >= point >= 1``.
    """
    assert length >= point >= 1

    if chars is None:
        import string
        chars = string.ascii_lowercase + string.digits

    # One cartesian product per candidate width, built up front and then
    # walked back to back.
    per_width = [
        itertools.product(chars, repeat=width)
        for width in range(point, length + 1)
    ]
    return itertools.chain.from_iterable(per_width)


if __name__ == '__main__':
    # Script entry point: start the crawler threads, enqueue every candidate
    # username, start the writer threads, wait until both queues are
    # drained, then close the output file and print the elapsed seconds.
    time_point = datetime.now()

    # Worker loops never return, so the threads are marked as daemons to let
    # the process exit once the queues are drained.  The `daemon` attribute
    # replaces the deprecated setDaemon() call.
    for _ in range(workers):
        t = ThreadCrawl(args_queue, data_queue)
        t.daemon = True
        t.start()

    dicts = dict_maker(name_length, point=mine_length)

    for n in dicts:
        # Each candidate arrives as a tuple of characters.
        str_n = "".join(n)
        args_queue.put(str_n)

    for _ in range(2):
        t = ThreadWrite(data_queue, lock, f)
        t.daemon = True
        t.start()

    # Wait for all candidates to be checked, then for all results to be
    # written out.
    args_queue.join()
    data_queue.join()

    with lock:
        f.close()

    # total_seconds() covers the whole run; timedelta.seconds is only the
    # seconds component and wraps around after 24 hours.
    print(int((datetime.now() - time_point).total_seconds()))

以上是关于用 Python 多线程挖掘 GitHub 可用用户名的主要内容。如果未能解决你的问题,请参考以下文章:

数据挖掘_多线程抓取

数据挖掘_多进程抓取

如何让 Python 多线程管道使用 90% 的可用内存?

基于Windows平台的Python多线程及多进程学习小结

testNG 多线程测试(xml文件实现)

‘高并发&高性能&高可用服务程序’编写及运维指南