Python.biqukan

Posted Breathing...

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Python.biqukan相关的知识,希望对你有一定的参考价值。

"""
https://www.biqukan.com
"""

import requests
import requests.adapters
import redis
from lxml import etree
from pyquery import PyQuery as pq
import pymongo
import multiprocessing
import datetime

# Upper bound on the attempt counter stored in each queue entry; a chapter
# whose download fails is only re-queued while its counter is below this.
redis_retries = 5  # maximum number of attempts
# Name of the Redis list holding pending 'chapter_name|count|href' entries.
redis_key_chapter = 'redis_key_chapter'
# MongoDB database name used for the downloaded chapters.
mongo_db_name = 'mongo_db_name'
# MongoDB collection name (inside mongo_db_name) the chapters are written to.
mongo_db_table = 'mongo_db_table'


def get_url_txt(url, headers, encoding, data=None, timeout=None):
    """Fetch ``url`` with an HTTP GET and return the decoded response body.

    Args:
        url: Address to request.
        headers: Mapping of HTTP request headers handed to ``requests``.
        encoding: Codec name assigned to ``response.encoding`` before the
            body is decoded.
        data: Optional request payload forwarded as ``data=``; ``None``
            (the default) sends no body.
        timeout: Optional timeout in seconds forwarded to ``requests``;
            ``None`` (the default) waits indefinitely, as before.

    Returns:
        The response text when the status code is 200, otherwise ``''``.
        Any exception is printed and also yields ``''``.
    """
    ret = ''
    try:
        # Context managers release the session and the response even when
        # the request or the decoding raises.
        with requests.Session() as session:
            # Retries must be configured on a mounted adapter; assigning
            # requests.adapters.DEFAULT_RETRIES after import does not
            # enable them.
            adapter = requests.adapters.HTTPAdapter(max_retries=5)
            session.mount('http://', adapter)
            session.mount('https://', adapter)
            with session.get(url, headers=headers, data=data,
                             timeout=timeout) as response:
                if response.status_code == 200:
                    response.encoding = encoding
                    ret = response.text
    except Exception as e:
        # Best effort: callers treat '' as "download failed".
        print(e)
    return ret


def flush_redis_key_chapter():
    """Delete the chapter queue key from the Redis server at 127.0.0.1:6379 (db 0).

    Any exception is printed and swallowed.
    """
    try:
        pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=0)
        client = redis.StrictRedis(connection_pool=pool)
        client.delete(redis_key_chapter)
    except Exception as err:
        print(err)


def get_chapter_href(url):
    """Queue every chapter link found on a novel's index page.

    Downloads ``url`` (decoded as GBK), selects the ``<a>`` elements inside
    ``.listmain`` and appends one ``chapter_name|0|href`` entry per link to
    the Redis list ``redis_key_chapter``. The ``0`` is the initial attempt
    counter.

    Args:
        url: Address of the index page listing the chapters.

    Any exception is printed and swallowed.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
        text = get_url_txt(url=url, headers=headers, encoding='gbk')
        if not text:
            return
        # One client for the whole page instead of a new connection pool
        # per chapter link.
        client = redis.StrictRedis(
            connection_pool=redis.ConnectionPool(host='127.0.0.1', port=6379, db=0))
        doc1 = pq(text)('.listmain')
        for anchor in pq(doc1)('a'):
            link = pq(anchor)
            href = link.attr('href')
            if not href:
                # An anchor without href would raise on concatenation and
                # abort queuing of all remaining chapters; skip it.
                continue
            client.rpush(
                redis_key_chapter,
                link.text() + '|0|' + 'https://www.biqukan.com' + href)  # chaptername|count|href
    except Exception as e:
        print(e)


def flush_mongo_db_table():
    """Drop the chapter collection from the MongoDB server at localhost:27017.

    Any exception is printed and swallowed.
    """
    try:
        # The context manager closes the client when done.
        with pymongo.MongoClient('localhost:27017') as client:
            client[mongo_db_name][mongo_db_table].drop()
    except Exception as e:
        print(e)


def get_chapter_content_(redis_value):
    """Download one chapter and store it in MongoDB, re-queuing on failure.

    Args:
        redis_value: Queue entry of the form ``chapter_name|count|href``,
            where ``count`` is the number of attempts made so far.

    On a successful download the text of ``#content`` is inserted into the
    MongoDB collection with ``chapter_name`` set to the numeric page id
    taken from the href followed by the chapter title. On an empty download
    the entry is pushed back onto the Redis list with ``count + 1`` as long
    as ``count`` is below ``redis_retries``; otherwise it is dropped.

    Any exception is printed and swallowed.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
        encoding = 'gbk'
        # Split from the right so a '|' inside the chapter title cannot
        # shift the fields (assumes the href itself contains no '|').
        chapter_name, conn_count, chapter_href = redis_value.rsplit('|', 2)
        conn_count = int(conn_count)
        # 'https://host/book/2098012.html' -> '2098012'
        index = chapter_href.rsplit('/', 1)[-1].split('.')[0]
        text = get_url_txt(url=chapter_href, headers=headers, encoding=encoding)
        if text:
            print(redis_value)
            doc1 = pq(text)('#content')
            content = pq(doc1).text()
            # Close the client after the insert; this function runs once
            # per chapter.
            with pymongo.MongoClient('localhost:27017') as client:
                client[mongo_db_name][mongo_db_table].insert_one(
                    {'chapter_name': index + chapter_name, 'content': content})
        elif conn_count < redis_retries:
            redis.StrictRedis(connection_pool=redis.ConnectionPool(host='127.0.0.1', port=6379, db=0)).rpush(
                redis_key_chapter,
                chapter_name + '|' + str(conn_count + 1) + '|' + chapter_href)  # chaptername|count|href
    except Exception as e:
        print(e)


def get_chapter_content():
    """Drain the Redis chapter queue, downloading chapters in a process pool.

    While the Redis list ``redis_key_chapter`` is non-empty, every queued
    entry is popped and handed to ``get_chapter_content_`` on a
    ``multiprocessing.Pool``. The outer loop repeats because workers may
    push failed entries back onto the list.

    Any exception is printed and swallowed.
    """
    try:
        # One client for the whole run instead of a new connection pool
        # for every llen/lpop call.
        client = redis.StrictRedis(
            connection_pool=redis.ConnectionPool(host='127.0.0.1', port=6379, db=0))
        while client.llen(redis_key_chapter) > 0:
            # The with-block terminates the workers if anything below
            # raises; on the normal path close() + join() wait for them.
            with multiprocessing.Pool() as pool:
                while True:
                    redis_value = client.lpop(redis_key_chapter)
                    if redis_value is None:
                        break
                    redis_value = redis_value.decode(encoding='utf8', errors='ignore')
                    pool.apply_async(get_chapter_content_, (redis_value,))
                pool.close()
                pool.join()
    except Exception as e:
        print(e)


if __name__ == '__main__':
    # Script entry point: prints the start time, the end time and the
    # elapsed seconds. The scraping steps are kept commented out below;
    # uncomment them to run a full download.
    start = datetime.datetime.now()
    print(start.strftime('%Y-%m-%d %H:%M:%S'))

    # Step 1: reset the queue and collect the chapter links.
    # flush_redis_key_chapter()
    # get_chapter_href('https://www.biqukan.com/4_4438/')

    # Step 2: reset the collection and download every queued chapter.
    # flush_mongo_db_table()
    # get_chapter_content()

    # Step 3: export the chapters, sorted by chapter_name, to temp.txt.
    # NOTE(review): the escape sequences below were reconstructed from a
    # garbled copy ('\t', '\n' and presumably '\xa0') - confirm before use.
    # with open('temp.txt', 'w'):
    #     pass
    # with open('temp.txt', 'a') as f:
    #     for i in pymongo.MongoClient('localhost:27017')[mongo_db_name][mongo_db_table].find({}).sort(
    #             [('chapter_name', pymongo.ASCENDING)]):
    #         f.write('\t' + i['chapter_name'] + '\n')
    #         f.write(i['content'].replace('\xa0', '').replace('\n\n', '\n') + '\n\n')

    end = datetime.datetime.now()
    print(end.strftime('%Y-%m-%d %H:%M:%S'))
    # total_seconds() keeps whole days; timedelta.seconds would drop them.
    print('cost seconds : %d' % (end - start).total_seconds())

以上是关于Python.biqukan的主要内容,如果未能解决你的问题,请参考以下文章

VSCode自定义代码片段——CSS选择器

谷歌浏览器调试jsp 引入代码片段,如何调试代码片段中的js

片段和活动之间的核心区别是啥?哪些代码可以写成片段?

VSCode自定义代码片段——.vue文件的模板

VSCode自定义代码片段6——CSS选择器

VSCode自定义代码片段——声明函数