Python.biqukan
Posted Breathing...
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Python.biqukan相关的知识,希望对你有一定的参考价值。
"""
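Scrape novel chapters from https://www.biqukan.com.

Chapter links are queued in a Redis list; chapter text is stored in MongoDB.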
"""
import requests
import requests.adapters
import redis
from lxml import etree
from pyquery import PyQuery as pq
import pymongo
import multiprocessing
import datetime
redis_retries = 5  # maximum number of times a failed chapter is pushed back onto the queue
redis_key_chapter = 'redis_key_chapter'  # Redis list key holding "chapter_name|count|href" entries
mongo_db_name = 'mongo_db_name'  # name of the MongoDB database used for storage
mongo_db_table = 'mongo_db_table'  # name of the MongoDB collection holding chapter documents
def get_url_txt(url, headers, encoding, data=None, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the decoded response body.

    Args:
        url: Address to request.
        headers: Mapping of HTTP headers sent with the request.
        encoding: Character encoding used to decode the response body.
        data: Optional payload forwarded as the ``data`` argument of the GET.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the caller forever.

    Returns:
        The response text when the server answers with status 200, otherwise
        an empty string. Errors are printed and never raised.
    """
    ret = ''
    try:
        # Assigning requests.adapters.DEFAULT_RETRIES at call time does not
        # change the retry count of new adapters (HTTPAdapter bound its
        # default when it was defined), so the retry budget is configured
        # explicitly on a mounted adapter instead.
        adapter = requests.adapters.HTTPAdapter(max_retries=5)
        # Context managers release the session and the response even when
        # the request raises.
        with requests.Session() as session:
            session.mount('http://', adapter)
            session.mount('https://', adapter)
            with session.get(url, headers=headers, data=data,
                             timeout=timeout) as response:
                if response.status_code == 200:
                    response.encoding = encoding
                    ret = response.text
    except Exception as e:
        # Best-effort fetch: callers treat '' as "failed" and may retry.
        print(e)
    return ret
def flush_redis_key_chapter():
    """Delete the chapter queue key from the local Redis server.

    Any error is printed and swallowed.
    """
    try:
        pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=0)
        client = redis.StrictRedis(connection_pool=pool)
        client.delete(redis_key_chapter)
    except Exception as exc:
        print(exc)
def get_chapter_href(url):
    """Collect chapter links from a book index page and queue them in Redis.

    Every ``<a>`` inside the ``.listmain`` element of the page is pushed onto
    the ``redis_key_chapter`` list as ``"chapter_name|0|href"``, where ``0``
    is the initial attempt counter.

    Args:
        url: Address of the book's chapter index page.

    Errors are printed and swallowed.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
        encoding = 'gbk'
        text = get_url_txt(url=url, headers=headers, encoding=encoding)
        if not text:
            return
        # One client for the whole page instead of a new connection pool
        # per link.
        client = redis.StrictRedis(
            connection_pool=redis.ConnectionPool(host='127.0.0.1', port=6379, db=0))
        listing = pq(text)('.listmain')
        for anchor in pq(listing)('a'):
            link = pq(anchor)
            href = link.attr('href')
            if not href:
                # An anchor without href would raise on concatenation and
                # abort the remaining chapters; skip it instead.
                continue
            chapter_name = link.text()
            chapter_href = 'https://www.biqukan.com' + href
            client.rpush(
                redis_key_chapter,
                chapter_name + '|0|' + chapter_href)  # chaptername|count|href
    except Exception as e:
        print(e)
def flush_mongo_db_table():
    """Drop the chapter collection from the local MongoDB server.

    Any error is printed and swallowed.
    """
    try:
        # The context manager closes the client's connections once the
        # collection has been dropped.
        with pymongo.MongoClient('localhost:27017') as client:
            client[mongo_db_name][mongo_db_table].drop()
    except Exception as e:
        print(e)
def get_chapter_content_(redis_value):
    """Download one chapter and store it in MongoDB, re-queueing on failure.

    Args:
        redis_value: Queue entry of the form ``"chapter_name|count|href"``,
            where ``count`` is the number of attempts already made.

    On a successful fetch the text of the ``#content`` element is inserted
    as ``{'chapter_name': <page id + name>, 'content': <text>}``. When the
    fetch returns nothing and ``count`` is below ``redis_retries``, the entry
    is pushed back onto the queue with ``count + 1``. Errors are printed and
    swallowed.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
        encoding = 'gbk'
        # Split from the right so a '|' inside the chapter title cannot
        # shift the counter and href fields.
        chapter_name, attempts, chapter_href = redis_value.rsplit('|', 2)
        conn_count = int(attempts)
        # File name of the page without its extension, e.g. "2098012";
        # prefixed to the stored chapter name.
        index = chapter_href.rsplit('/', 1)[-1].split('.')[0]
        text = get_url_txt(url=chapter_href, headers=headers, encoding=encoding)
        if text:
            print(redis_value)
            content = pq(pq(text)('#content')).text()
            # Close the client after the insert; this runs once per chapter
            # inside worker processes, so leaked clients would pile up.
            with pymongo.MongoClient('localhost:27017') as client:
                client[mongo_db_name][mongo_db_table].insert_one(
                    {'chapter_name': index + chapter_name, 'content': content})
        elif conn_count < redis_retries:
            redis.StrictRedis(
                connection_pool=redis.ConnectionPool(host='127.0.0.1', port=6379, db=0)).rpush(
                redis_key_chapter,
                chapter_name + '|' + str(conn_count + 1) + '|' + chapter_href)  # chaptername|count|href
    except Exception as e:
        print(e)
def get_chapter_content():
    """Drain the Redis chapter queue, downloading chapters in parallel.

    Each pass pops every queued entry and hands it to a
    ``multiprocessing.Pool`` worker running ``get_chapter_content_``. After
    the workers finish, the queue length is checked again, because failed
    chapters may have been pushed back; passes repeat until the queue is
    empty. Errors are printed and swallowed.
    """
    try:
        # One client for the whole drain instead of a new connection pool
        # for every llen/lpop call.
        client = redis.StrictRedis(
            connection_pool=redis.ConnectionPool(host='127.0.0.1', port=6379, db=0))
        while client.llen(redis_key_chapter) > 0:
            p = multiprocessing.Pool()
            try:
                while True:
                    redis_value = client.lpop(redis_key_chapter)
                    if redis_value is None:
                        break
                    redis_value = redis_value.decode(encoding='utf8', errors='ignore')
                    p.apply_async(get_chapter_content_, (redis_value,))
            finally:
                # Always shut the pool down and wait for dispatched work,
                # even if popping from Redis fails part-way through.
                p.close()
                p.join()
    except Exception as e:
        print(e)
if __name__ == '__main__':
    # Script entry point: prints start/end timestamps and the elapsed time
    # around whichever pipeline steps are uncommented below.
    start = datetime.datetime.now()
    print(start.strftime('%Y-%m-%d %H:%M:%S'))

    # Request settings used by the ad-hoc snippets below.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
    encoding = 'gbk'

    # --- Ad-hoc snippets kept for manual experiments -----------------------
    # redis.StrictRedis(connection_pool=redis.ConnectionPool(host='127.0.0.1', port=6379, db=0)).delete(
    #     'redis_key_chapter')
    # pymongo.MongoClient('localhost:27017')['db']['table'].drop()
    #
    # text = get_url_txt('https://www.biqukan.com/4_4438/', headers, encoding)
    # print(text)
    # doc1 = pq(text)('.listmain')
    # print(doc1)
    # doc2 = pq(doc1)('a')
    # print(doc2)
    # for i in doc2:
    #     chapter_name = pq(i).text()
    #     chapter_href = pq(i).attr('href')
    #     print(chapter_name, 'https://www.biqukan.com' + chapter_href)
    #     redis.StrictRedis(connection_pool=redis.ConnectionPool(host='127.0.0.1', port=6379, db=0)).rpush(
    #         redis_key_chapter,
    #         chapter_name + '|0|' + 'https://www.haotxt.com' + chapter_href)  # chaptername|count|href
    #
    # text = get_url_txt('https://www.biqukan.com/4_4438/2098012.html', headers, encoding)
    # doc1 = pq(text)('#content')
    # content = pq(doc1).text()
    # print(content)

    # --- Pipeline steps: uncomment the ones to run -------------------------
    # flush_redis_key_chapter()
    # get_chapter_href('https://www.biqukan.com/4_4438/')
    # flush_mongo_db_table()
    # get_chapter_content()

    # --- Export stored chapters to a text file -----------------------------
    # NOTE(review): the escape sequences in this snippet were lost in the
    # original (line breaks appeared in place of '\n', and 'xa0' lacked its
    # backslash); they are reconstructed here -- confirm before enabling.
    # with open('temp.txt', 'w'):
    #     pass
    # with open('temp.txt', 'a') as f:
    #     for i in pymongo.MongoClient('localhost:27017')[mongo_db_name][mongo_db_table].find({}).sort(
    #             [('chapter_name', pymongo.ASCENDING)]):
    #         f.write(' ' + i['chapter_name'] + '\n')
    #         f.write(i['content'].replace('\xa0', '').replace('\n', '\n') + '\n')

    end = datetime.datetime.now()
    print(end.strftime('%Y-%m-%d %H:%M:%S'))
    # total_seconds() covers the full duration; .seconds alone wraps around
    # after 24 hours.
    print('cost seconds : %d' % int((end - start).total_seconds()))
以上是关于Python.biqukan的主要内容,如果未能解决你的问题,请参考以下文章