Novel Scraping with python + urllib + lxml
Posted by Dmail
Preface: This article, compiled by the editors at 小常识网 (cha138.com), walks through scraping a web novel with python + urllib + lxml; hopefully it is of some reference value.
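The whole script leans on a single pattern: fetch a page with urllib.request, parse the HTML with lxml's etree.HTML, and pull out elements with XPath. Here is a minimal sketch of that pattern; the URL is a placeholder (the real script targets m.wenxuemi6.com with site-specific XPath expressions), and the User-Agent header matters because many sites reject urllib's default one:

```python
from urllib import request
from lxml import etree

# Placeholder URL for illustration only.
url = 'https://example.com/'
headers = {'User-Agent': 'Mozilla/5.0'}

req = request.Request(url, headers=headers)
page = request.urlopen(req).read().decode('utf-8')

html = etree.HTML(page)           # parse into an element tree
for a in html.xpath('//a'):       # every <a> element on the page
    print(a.get('href'), a.text)  # attribute and text access
```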
The full script:

```python
from urllib import parse
from urllib import request
from lxml import etree
import time


class Novel:
    def __init__(self, *args):
        self.name = args[0]
        self.dict = args[1]
        self.txt = ''
        # Keys are stringified numbers, so sort them numerically;
        # a plain string sort would put '10' before '2'.
        for key in sorted(self.dict, key=int):
            self.txt = self.txt + self.dict[key]

    def write(self):
        f = open(self.name + '.txt', 'w', encoding='utf-8')
        f.write(self.txt)
        f.close()


# Fetch a page's source, decoding as GBK unless told otherwise
def get_http_page(url, **kw):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/77.0.3865.90 Safari/537.36"
    }
    req = request.Request(url, headers=headers)
    response = request.urlopen(req)
    page = response.read()
    encoding = 'gbk'
    if kw:
        encoding = kw['encoding']
    return page.decode(encoding)


# Resolve a search URL to the list of table-of-contents pages
# (the site splits a novel's TOC across several <option> pages)
def get_comics_directory(url):
    url_list = []
    page = get_http_page(url, encoding='utf-8')
    html = etree.HTML(page)
    # First search result on the search page
    result = html.xpath('/html/body/div[2]/div/div[2]/h3/a')
    if len(result):
        url2 = result[0].get('href')
        if url2:
            page = get_http_page(url2)
            html = etree.HTML(page)
            # <select> element whose <option>s paginate the TOC
            elment_select = html.xpath('/html/body/div[4]/div[9]/span[2]/select')
            if len(elment_select):
                for option in elment_select[0].findall('option'):
                    url_list.append('https://m.wenxuemi6.com{}'.format(option.get('value')))
    return url_list


def downdload_txt(url_list, **kw):
    # Default to the full TOC; narrow the range only if valid
    # start/stop keyword arguments were passed.
    count = 0
    count_max = len(url_list)
    if kw:
        start = int(kw['start'])
        stop = int(kw['stop'])
        if 0 <= start < len(url_list) and start < stop <= len(url_list):
            count = start
            count_max = stop
    print('Crawling the TOC and chapter URLs, please wait...')
    d = {}
    while count < count_max:
        page = get_http_page(url_list[count])
        html = etree.HTML(page)
        # Chapter links on this TOC page
        result = html.xpath('/html/body/div[4]/ul[2]/li/a')
        txt = ''
        if isinstance(result, list):
            for l in result:
                url = 'https://m.wenxuemi6.com{}'.format(l.get('href'))
                print('Download chapters by URL:{}'.format(url))
                page = get_http_page(url)
                html = etree.HTML(page)
                t = html.xpath('//*[@id="nr1"]/text()')
                t2 = html.xpath('//*[@id="nr1"]/p')
                txt_title_list = html.xpath('//*[@id="nr_title"]/text()')
                if isinstance(txt_title_list, list) and len(txt_title_list) == 1:
                    txt = txt + txt_title_list[0] + '\n'
                for l2 in t:
                    txt = txt + l2 + '\n'
                # A <p> inside #nr1 means the chapter continues on a
                # second page named *_2.html; fetch and append it
                if isinstance(t2, list) and len(t2) == 1:
                    url = 'https://m.wenxuemi6.com{}'.format(l.get('href')[:-5] + '_2.html')
                    print('Download chapters by URL:{}'.format(url))
                    page = get_http_page(url)
                    html = etree.HTML(page)
                    t = html.xpath('//*[@id="nr1"]/text()')
                    for l2 in t:
                        txt = txt + l2 + '\n'
        # All chapters from this TOC page are stored under one key
        d['{}'.format(count)] = txt
        count = count + 1
        time.sleep(1)  # be polite to the server
    return d


if __name__ == '__main__':
    txt_name = input('Enter the title to search for: ')
    url = 'https://m.wenxuemi6.com/search.php?keyword={}'.format(parse.quote(txt_name))
    url_list = get_comics_directory(url)
    # Download just the first TOC page of chapters
    d = downdload_txt(url_list, start=0, stop=1)
    n1 = Novel(txt_name, d)
    # Writes [txt_name].txt to the current directory
    n1.write()
    # Download the whole novel (overwrites the file written above)
    d2 = downdload_txt(url_list)
    n2 = Novel(txt_name, d2)
    n2.write()
```
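One caveat worth flagging: get_http_page has no timeout and no error handling, so a single dropped connection kills a crawl that may already be hundreds of chapters in. Below is a hedged sketch of a hardened drop-in replacement using only the standard library; the retry count and delay are arbitrary choices, not from the original post:

```python
from urllib import request, error
import time


def get_http_page_safe(url, encoding='gbk', retries=3, timeout=10):
    """Like get_http_page, but with a timeout and simple retries."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }
    req = request.Request(url, headers=headers)
    for attempt in range(retries):
        try:
            with request.urlopen(req, timeout=timeout) as response:
                return response.read().decode(encoding, errors='replace')
        except (error.URLError, OSError) as e:
            print('Fetch failed ({}), retry {}/{}'.format(e, attempt + 1, retries))
            time.sleep(2)
    raise RuntimeError('Could not fetch {}'.format(url))
```

Since the keyword-style calls in the script (get_http_page(url) and get_http_page(url, encoding='utf-8')) match this signature, it can be swapped in without touching the rest of the code.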
That covers the main points of novel scraping with python + urllib + lxml.