My First Little Crawler (Improved): Downloading Novels from 下书网 (xiashu.la), v2
Known issues:
- The free proxy addresses go stale and need to be refreshed.
- An interrupted run can only be restarted from the beginning (a resume sketch follows this list).
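To address the second issue, here is a minimal checkpointing sketch; it is my addition, not part of the original script, and the `progress_path` name and integration point are assumptions. The idea is to record the last chapter written to disk and start the next run from there:

```python
import os

def load_progress(progress_path):
    # Return the last successfully downloaded chapter, or 0 when starting fresh.
    if os.path.exists(progress_path):
        with open(progress_path) as f:
            return int(f.read().strip() or 0)
    return 0

def save_progress(progress_path, chapter):
    # Overwrite the checkpoint after each chapter is safely written to disk.
    with open(progress_path, 'w') as f:
        f.write(str(chapter))

# In the main loop below, instead of range(1, chapters + 1):
#   start = load_progress(progress_path) + 1
#   for chapter in range(start, chapters + 1):
#       get_txt(open_url(url1 + 'read_%d.html' % chapter))
#       save_progress(progress_path, chapter)
```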
```python
import re
import os
import time
import random
from urllib import request

path = os.getcwd()  # working directory; the book's .txt file is saved here


def open_url(url):
    """Fetch a page through a randomly chosen HTTP proxy and return its HTML."""
    # Known issue: these free proxies go stale and need to be refreshed.
    proxy_list = [
        {'http': '112.85.129.9:9999'},
        {'http': '113.105.202.7:3128'},
        {'http': '180.121.115.181:48184'},
        {'http': '123.162.168.192:40274'},
        {'http': '115.207.77.72:8118'},
        {'http': '112.85.129.9:9999'},
        {'http': '61.184.109.33:61320'},
        # {'https': '58.218.201.188:'},  # port was missing in the original entry; disabled
    ]
    proxy = random.choice(proxy_list)
    print(str(proxy))
    px = request.ProxyHandler(proxy)
    opener = request.build_opener(px)
    opener.addheaders = [('User-Agent',
                          'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')]
    request.install_opener(opener)
    try:
        page = request.urlopen(url, timeout=10)
        return page.read().decode('utf-8')
    except Exception:
        print('Request failed, retrying with another proxy...')
        return open_url(url)  # retry until a proxy works


def get_txt(html):
    """Extract a chapter's title and body and append them to the book file."""
    lst1 = r'<head>[\s\S]*<title>.*</title>'
    lst2 = r'<div id="tac">[\s\S]*<div class="info bottominfo">'
    l1 = str(re.findall(lst1, html))
    l1 = l1[51:].replace('\'', '').replace('"', '').replace('>]', '')
    l1list = l1.split(',')[0]  # chapter title
    l2 = str(re.findall(lst2, html))
    l2 = l2[92:].replace(r'\u3000\u3000', ' ').replace('<br/><br/>', '\n')[:-60]
    l2 = re.sub(r'\*', ' ', l2)
    with open(os.path.join(path, '%s.txt' % l5), 'a', encoding='utf-8') as f:
        f.write(l1list)
        f.write('\n\n')
        f.write(l2)
        f.write('\n\n\n')
    print(l1list + ' --- chapter downloaded ---')


def get_titlename(html):
    """Write the book's title line at the top of the output file."""
    lst3 = r'<head>[\s\S]*<title>.*</title>'
    l3 = str(re.findall(lst3, html))
    l3 = l3[43:].split('_')[0].replace('txt下载', '\n ——').replace('(', '').replace(')', '')
    print(l3 + ' --- downloading ---')
    with open(os.path.join(path, '%s.txt' % l5), 'a', encoding='utf-8') as f:
        f.write(l3)
        f.write('\n\n')
    print(l3 + ' --- title written ---')


def get_txtname(html):
    """Derive the book's name from the index page's <title> and create the output file."""
    lst4 = r'<head>[\s\S]*<title>.*</title>'
    l4 = str(re.findall(lst4, html))
    name = l4[43:].split('txt')[0]
    open(os.path.join(path, '%s.txt' % name), 'a', encoding='utf-8').close()
    return name


if __name__ == '__main__':
    print('\nUsage: for《武道乾坤》at https://www.xiashu.la/2186/ the book ID is 2186')
    url0 = 'https://www.xiashu.la'
    ml = input('Enter the book ID: ')
    url1 = url0 + '/' + ml + '/'
    print('Index URL: %s' % url1)
    chapters = int(input('Enter the total number of chapters (e.g. for 80 pages, enter 80): '))
    print('Working directory: %s' % path)
    l5 = get_txtname(open_url(url1))
    get_titlename(open_url(url1))
    for chapter in range(1, chapters + 1):
        url = url1 + 'read_' + str(chapter) + '.html'
        t = random.randint(1, 2)
        time.sleep(t)  # polite random delay, in seconds
        get_txt(open_url(url))
```
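One fragile spot worth noting: the parsing slices hardcoded offsets (`[51:]`, `[92:]`, `[43:]`, `[:-60]`) out of the stringified `re.findall(...)` result, which breaks as soon as the page markup shifts by a character. A more robust variant, sketched below under the assumption that the pages use the same `<title>` and `<div id="tac">` markup the regexes above target, captures the interesting parts with groups instead (this is a swapped-in technique, not the original author's approach):

```python
import re

def extract_chapter(html):
    # Capture the <title> text and the chapter body directly instead of
    # slicing fixed offsets out of str(re.findall(...)).
    title_m = re.search(r'<title>(.*?)</title>', html)
    body_m = re.search(r'<div id="tac">([\s\S]*?)<div class="info bottominfo">', html)
    if not (title_m and body_m):
        return None, None
    title = title_m.group(1)
    body = (body_m.group(1)
            .replace('\u3000\u3000', ' ')    # fullwidth-space paragraph indents
            .replace('<br/><br/>', '\n'))    # paragraph breaks
    return title, body
```

The non-greedy `[\s\S]*?` also stops at the first closing `<div>`, whereas the greedy `[\s\S]*` in the original would swallow everything up to the last match on pages with repeated markup.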