内核月报搜索普通版代码

Posted 2022-04-23 渔夫数据库笔记
tags:
篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了内核月报搜索普通版代码相关的知识，希望对你有一定的参考价值。
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time    : 2022/4/11 17:58
# @Author  : shaofei
# @Email   : shaochenshuo@126.com
# @File    : 内核月报搜索.py
# @Software: PyCharm

"""
该程序通过指定(-k)关键字搜索淘宝内核月报的标题，正文，代码等部分，只要有其中某个部分命中就输出该月报标题及链接
"""

#pip3 install beautifulsoup4
#pip3 install lxml
#pip install requests
#import urllib2
import urllib.request
import requests
import threading
import argparse
import re
import sys
from bs4 import BeautifulSoup
# Index page that lists one link per monthly issue.
url = "http://mysql.taobao.org/monthly"
# Request headers that imitate a browser, to avoid header-based anti-crawler
# rejections (e.g. "HTTP Error 418"). requests expects the headers as a dict.
USER_AGENT = {
    'user-agent': 'Mozilla/5.0 (Linux; android 6.0; Nexus 5 Build/MRA58N) '
                  'AppleWebKit/537.36 (Khtml, like Gecko) '
                  'Chrome/100.0.4896.75 Mobile Safari/537.36'
}
# Timeout, in seconds, passed to every HTTP request.
URL_TIMEOUT = 10


def parameter_parse():
    """Parse and validate the command-line arguments.

    ``-k/--key`` (required) takes one to three search keywords separated by
    spaces. ``-l/--relation`` (``or`` / ``and``) states how several keywords
    are combined and is mandatory as soon as more than one keyword is given.

    Returns:
        The ``argparse.Namespace`` holding ``key`` (list of str) and
        ``relation`` (``'or'``, ``'and'`` or ``None``).

    Raises:
        SystemExit: on argparse errors, when several keywords are given
            without ``-l``, or when more than three keywords are given.
    """
    parse = argparse.ArgumentParser()
    parse.add_argument('-k', '--key', nargs='+', type=str, default=None,
                       help='指定搜索的关键字，必填参数，支持指定多个关键字，多个关键字间以空格分隔', required=True)
    parse.add_argument('-l', '--relation', type=str, choices=['or', 'and'], help='多个关键词之间的关系 or或者and')
    args = parse.parse_args()

    keyword_count = len(args.key)
    if keyword_count == 1:
        print('本次过滤指定的关键字为：{}'.format(args.key[0]))
    elif keyword_count <= 3:
        # Two or three keywords: the combination rule must be explicit.
        if not args.relation:
            print('当你使用 -k 指定多个关键字时，必须使用 -l 指定关键字间的关系(and 或者 or)')
            sys.exit(1)
        print('本次过滤指定的关键字列表为：{}，关键字间关系为：{}'.format(args.key, args.relation))
    else:
        print('  -k 最多只支持指定三个关键字')
        sys.exit(1)
    return args

def get_html(url, USER_AGENT, URL_TIMEOUT):
    """Fetch ``url`` and return the response body decoded as UTF-8 text.

    Args:
        url: Address of the page to download.
        USER_AGENT: Dict of HTTP request headers passed to ``requests.get``.
        URL_TIMEOUT: Timeout passed to ``requests.get``.

    Returns:
        The response body as a ``str``.

    Raises:
        Whatever ``requests.get`` raises (e.g. on timeouts or connection
        errors), and ``UnicodeDecodeError`` if the body is not valid UTF-8.
    """
    response = requests.get(url, headers=USER_AGENT, timeout=URL_TIMEOUT)
    # The raw bytes are decoded explicitly, so the charset guess stored in
    # ``response.encoding`` is never consulted; the former
    # ``apparent_encoding`` detection over the body was wasted work.
    return response.content.decode('utf-8')

def call_gethtml(url, USER_AGENT, URL_TIMEOUT):
    """Fetch a page through ``get_html``, retrying once after a timeout.

    Any failure is reported on stdout instead of being raised, so one bad
    page does not abort the whole crawl.

    Args:
        url: Address of the page to download.
        USER_AGENT: Dict of HTTP request headers.
        URL_TIMEOUT: Request timeout forwarded to ``get_html``.

    Returns:
        The page HTML as a ``str``, or ``''`` when the page could not be
        fetched. The empty string parses to an empty document, so callers
        simply find no tags in it.
    """
    try:
        return get_html(url, USER_AGENT, URL_TIMEOUT)
    except requests.exceptions.Timeout:
        # Match on the exception type rather than on the message text so
        # that both connect and read timeouts get one more attempt.
        print('访问网页超时，重新访问')
        try:
            return get_html(url, USER_AGENT, URL_TIMEOUT)
        except Exception as e:
            print('访问网页报错，报错代码如下：{}'.format(e))
    except Exception as e:
        print('访问网页报错，报错代码如下：{}'.format(e))
    # Previously this path hit ``return html`` with ``html`` unbound.
    return ''

def get_month_list():
    """Build the list of per-month URLs from the monthly-report index page.

    Downloads the module-level ``url`` and turns the first ``<a>`` of every
    ``<h3>`` on that page into an absolute URL.

    Returns:
        list of str: one URL per ``<h3>`` that carries a link; empty when
        the page has no such headings.
    """
    html = call_gethtml(url, USER_AGENT, URL_TIMEOUT)
    # Parse with BeautifulSoup so the headings can be queried as tags.
    soup = BeautifulSoup(html, 'lxml')
    return [
        # ``h3.a`` is the first <a> tag inside the heading.
        '{}{}'.format('http://mysql.taobao.org', h3.a['href'])
        for h3 in soup.find_all('h3')
        # Skip headings without a usable link instead of crashing on them.
        if h3.a is not None and h3.a.get('href')
    ]


def _keyword_patterns(keywords):
    """Compile each keyword into a case-insensitive regular expression."""
    patterns = []
    for word in keywords:
        try:
            patterns.append(re.compile(word, re.I))
        except re.error:
            # Keywords such as "c++" are not valid regexes; match them literally.
            patterns.append(re.compile(re.escape(word), re.I))
    return patterns


def _section_matches(node, tag_name, patterns, combine):
    """Apply ``combine`` (any/all) to "some ``tag_name`` tag's string matches" per pattern."""
    return combine(node.find(tag_name, string=pattern) is not None for pattern in patterns)


def key_check(url_month, key: list, relation=0):
    """Search every article of one monthly issue for the given keywords.

    For each article listed on the month page the title is checked first,
    then the ``<p>`` tags of the article page, then its ``<code>`` tags. The
    first section that matches is printed and recorded in the module-level
    ``article_url`` dict (assigned in the ``__main__`` section); later
    sections of that article are not checked. The article page is only
    downloaded when the title did not match.

    A tag only counts when its ``.string`` matches, i.e. (per the Beautiful
    Soup documentation) when the tag holds a single text node. With
    ``relation='and'`` all keywords must match within the same kind of
    section (all in the title, or all in ``<p>`` tags, or all in ``<code>``
    tags).

    :param url_month: month page, e.g. http://mysql.taobao.org/monthly/2022/03/
    :param key: list of keywords; each is used as a case-insensitive regex
        and falls back to a literal match when it is not a valid regex
    :param relation: ``'or'`` or ``'and'`` when several keywords are given;
        ignored for a single keyword. With several keywords and any other
        value nothing is searched.
    :return: None; hits are stored in ``article_url`` as
        ``'<section>匹配：<title>' -> article URL``
    """
    if not key:
        return
    if len(key) > 1 and relation not in ('or', 'and'):
        return
    # For a single keyword any() and all() are equivalent.
    combine = any if relation == 'or' else all
    patterns = _keyword_patterns(key)

    # Month page listing the articles (e.g. http://mysql.taobao.org/monthly/2022/03/).
    html_title = call_gethtml(url_month, USER_AGENT, URL_TIMEOUT)
    month_soup = BeautifulSoup(html_title, 'lxml')
    for h3_in in month_soup.find_all('h3'):
        link = h3_in.a
        if link is None or not link.get('href'):
            # Heading without a link: nothing to follow or report.
            continue
        url_page = '{}{}'.format('http://mysql.taobao.org', link['href'])
        # get_text() equals .string for a plain-text link and also copes
        # with links whose .string is None (nested markup).
        url_string = link.get_text().strip()

        # A title hit is recorded directly; the article body is not fetched.
        if _section_matches(h3_in, 'a', patterns, combine):
            print('标题匹配上为：{}，链接地址为：{}'.format(url_string, url_page))
            article_url['标题匹配：{}'.format(url_string)] = url_page
            continue

        # Article page (e.g. http://mysql.taobao.org/monthly/2022/03/01/).
        html_page = call_gethtml(url_page, USER_AGENT, URL_TIMEOUT)
        page_soup = BeautifulSoup(html_page, 'lxml')
        if _section_matches(page_soup, 'p', patterns, combine):
            article_url['正文匹配：{}'.format(url_string)] = url_page
            print('正文匹配上为：{}，链接地址为：{}'.format(url_string, url_page))
        elif _section_matches(page_soup, 'code', patterns, combine):
            article_url['代码匹配：{}'.format(url_string)] = url_page
            print('代码匹配上为：{}，链接地址为：{}'.format(url_string, url_page))


if __name__ == '__main__':
    # Script entry point: parse the arguments, scan every monthly issue in
    # small batches of threads and print all hits at the end.
    args = parameter_parse()
    key = args.key
    # The relation only matters when several keywords were given.
    key_relation = args.relation if len(key) > 1 else 0
    # Shared result dict filled by key_check from the worker threads. The
    # original author's note: no locking is used because the threads only
    # add key/value pairs to it.
    article_url = {}
    # Number of month pages processed concurrently.
    i_parallel = 2
    month_list = get_month_list()
    list_count = len(month_list)
    for start in range(0, list_count, i_parallel):
        batch = month_list[start:start + i_parallel]
        threads = [
            threading.Thread(target=key_check, args=(month_url, key, key_relation))
            for month_url in batch
        ]
        for worker in threads:
            worker.start()
        # Wait for the whole batch before starting the next one.
        for worker in threads:
            worker.join()
        finished = start + len(batch)
        print('当前过滤分析进度为 ---------------------{}%---------------------'.format(round((finished / list_count) * 100, 2)))
    print('最终检查结果为：{}'.format(article_url))
以上是关于内核月报搜索普通版代码的主要内容，如果未能解决你的问题，请参考以下文章
内核月报搜索普通版 代码

内核月报搜索普通版代码