Crawling the full novel contents of an entire novel site -《狗嗨默示录》-
Posted by 李·狗嗨
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib.request
import re
import MySQLdb
import socket

domain = 'http://www.quanshuwang.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}

# Fetch the category page for a genre id and return (href, novel name) pairs
def getTypeList(type):
    req = urllib.request.Request('http://www.quanshuwang.com/map/%s.html' % type)
    req.headers = headers  # replace the request headers wholesale
    # req.add_header()     # alternative: add headers one at a time
    res = urllib.request.urlopen(req)   # fetch the page
    html = res.read().decode('gbk')     # the site serves GBK-encoded pages
    reg = r'<a href="(/book/.+?)" target="_blank">(.+?)</a>'
    reg = re.compile(reg)  # compile the pattern
    return re.findall(reg, html)

# Fetch a novel's index page and return (chapter href, title attribute, chapter title) tuples
def getNovelList(href):
    req = urllib.request.Request(domain + href)
    req.headers = headers
    res = urllib.request.urlopen(req)
    html = res.read().decode('gbk')
    reg = r'<li><a href="(.+?)" title="(.+?)">(.+?)</a></li>'
    reg = re.compile(reg)
    return re.findall(reg, html)

# Fetch a chapter page and return the text between the style5()/style6() script tags
def getNovelContent(url):
    req = urllib.request.Request(domain + url)
    req.headers = headers
    res = urllib.request.urlopen(req)
    html = res.read().decode('gbk', 'ignore')
    reg = r'style5\(\);</script>(.*?)<script type="text/javascript">style6\(\)'
    reg = re.compile(reg, re.S)
    print(domain + url)
    return re.findall(reg, html)[0]

class Sql(object):
    # One shared connection for the whole run; host/port/user/password are placeholders
    conn = MySQLdb.connect(host='localhost', port=x, user='x', password='x', db='novel', charset='utf8')

    def addnovels(self, sort, novelname):
        cur = self.conn.cursor()  # cursor
        cur.execute("insert into novel(sort,novelname) values('%s','%s')" % (sort, novelname))
        lastrowid = cur.lastrowid  # id of the novel row just inserted
        cur.close()
        self.conn.commit()
        return lastrowid

    def addchapters(self, novelid, chaptername, content):
        cur = self.conn.cursor()
        cur.execute("insert into chapter(novelid,chaptername,content) values(%s,'%s','%s')" % (novelid, chaptername, content))
        cur.close()
        self.conn.commit()

mysql = Sql()

if __name__ == '__main__':
    for type in range(1, 10):
        if type == 1:
            sort = "玄幻魔法"
        elif type == 2:
            sort = "武侠修真"
        elif type == 3:
            sort = "历史军事"
        elif type == 4:
            sort = "女频言情"
        elif type == 5:
            sort = "侦探推理"
        elif type == 6:
            sort = "网络动漫"
        elif type == 7:
            sort = "科幻小说"
        elif type == 8:
            sort = "恐怖灵异"
        elif type == 9:
            sort = "美文同人"
        else:
            print("Invalid novel category requested!!!")
        for href, novelname in getTypeList(type):
            lastrowid = mysql.addnovels(sort, novelname)
            for url, _, title in getNovelList(href):
                try:
                    print("Crawling ------------ %s 《%s》 %s" % (sort, novelname, title))
                    content = getNovelContent(href.replace('index.html', url))
                    mysql.addchapters(novelid=lastrowid, chaptername=title, content=content)
                    socket.setdefaulttimeout(30)  # default timeout for subsequent requests
                except Exception as e:
                    print("Connection interrupted, error: %s !!!!" % e)
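The post does not include the database schema that the Sql class writes to. Based on the two INSERT statements above, a plausible layout is sketched below; the table and column names come straight from the script, while the types, lengths, and the auto-increment primary key that cur.lastrowid depends on are assumptions.

import MySQLdb

# Placeholder credentials, mirroring the script above.
conn = MySQLdb.connect(host='localhost', user='x', password='x', db='novel', charset='utf8')
cur = conn.cursor()
# `novel` needs an auto-increment primary key so that cur.lastrowid returns the new novel's id.
cur.execute("""
    create table if not exists novel (
        id int auto_increment primary key,
        sort varchar(32),
        novelname varchar(128)
    ) default charset=utf8
""")
# `chapter` stores one row per crawled chapter, linked back to novel.id via novelid.
cur.execute("""
    create table if not exists chapter (
        id int auto_increment primary key,
        novelid int,
        chaptername varchar(255),
        content mediumtext
    ) default charset=utf8
""")
conn.commit()
cur.close()

Also worth noting: addchapters interpolates the chapter text directly into the SQL string, so any chapter containing a quote character will break the INSERT (and this is a classic injection risk). A minimal variant, assuming the same table layout, is to pass the values as parameters and let the driver escape them:

    def addchapters(self, novelid, chaptername, content):
        cur = self.conn.cursor()
        # The driver escapes each value, so quotes in the chapter text no longer break the statement.
        cur.execute(
            "insert into chapter(novelid, chaptername, content) values (%s, %s, %s)",
            (novelid, chaptername, content),
        )
        cur.close()
        self.conn.commit()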