爬取小说网站整站小说内容 -《狗嗨默示录》-

Posted 李·狗嗨

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬取小说网站整站小说内容 -《狗嗨默示录》-相关的知识,希望对你有一定的参考价值。

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Whole-site novel scraper for www.quanshuwang.com storing results in MySQL."""

import urllib.request
import re
import MySQLdb  # third-party driver (mysqlclient); original had the wrong case "mysqldb"
import socket


# Base URL of the target site; hrefs scraped from listing pages are relative to it.
domain = "http://www.quanshuwang.com"
# Minimal browser-like headers so the site does not reject the scraper as a bot.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (Khtml, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
}

#获取分类列表
# 获取分类列表 (fetch the category listing for one numeric type id)
def getTypeList(type):
    """Return ``[(book_href, book_name), ...]`` scraped from the site-map page for *type*.

    *type* is the site's numeric category id (1-9); book_href is relative
    (e.g. ``/book/...``) and must be joined with ``domain`` by the caller.
    """
    req = urllib.request.Request("http://www.quanshuwang.com/map/%s.html" % type)
    req.headers = headers  # 替换头信息 (replace all request headers at once)
    # req.add_header()  # 添加单个头信息 (alternative: add a single header)
    res = urllib.request.urlopen(req)  # 获取源码 (fetch the page source)
    html = res.read().decode("gbk")  # 解码 — the site serves GBK-encoded pages
    reg = r'<a href="(/book/.+?)" target="_blank">(.+?)</a>'
    reg = re.compile(reg)  # 编译 (compile once; findall returns the 2 groups per match)
    return re.findall(reg, html)


def getNovelList(href):
    """Return ``[(chapter_url, title_attr, link_text), ...]`` for one novel's index page.

    *href* is a site-relative path (as returned by :func:`getTypeList`);
    chapter_url is relative to the novel's directory.
    """
    req = urllib.request.Request(domain + href)
    req.headers = headers
    res = urllib.request.urlopen(req)
    html = res.read().decode("gbk")  # site pages are GBK-encoded
    reg = r'<li><a href="(.+?)" title="(.+?)">(.+?)</a></li>'
    reg = re.compile(reg)
    return re.findall(reg, html)


def getNovelContent(url):
    """Return the raw HTML chapter body scraped from one chapter page.

    *url* is a site-relative chapter path. Raises ``IndexError`` if the
    page does not contain the expected ``style5()``/``style6()`` markers.
    """
    req = urllib.request.Request(domain + url)
    req.headers = headers
    res = urllib.request.urlopen(req)
    # Some chapter pages contain bytes invalid in GBK; drop them rather than crash.
    html = res.read().decode("gbk", "ignore")
    # Chapter text sits between the style5() and style6() script markers.
    reg = r'style5\(\);</script>(.*?)<script type="text/javascript">style6\(\)'
    reg = re.compile(reg, re.S)  # re.S: the chapter body spans multiple lines
    print(domain + url)
    return re.findall(reg, html)[0]

class Sql(object):
    """Thin wrapper around one shared MySQLdb connection for the novel/chapter tables."""

    # NOTE(review): host/port/user/passwd below are placeholders from the original
    # post ("x") — fill in real credentials before running.
    conn = MySQLdb.connect(host="localhost", port=3306, user="x",
                           passwd="x", db="novel", charset="utf8")

    def addnovels(self, sort, novelname):
        """Insert one novel row and return its auto-increment id."""
        cur = self.conn.cursor()  # 游标 (cursor)
        # Parameterized query: scraped titles may contain quotes, so the
        # original %-string formatting was both fragile and SQL-injectable.
        cur.execute("insert into novel(sort,novelname) values(%s,%s)",
                    (sort, novelname))
        lastrowid = cur.lastrowid
        cur.close()
        self.conn.commit()
        return lastrowid

    def addchapters(self, novelid, chaptername, content):
        """Insert one chapter row belonging to novel *novelid*."""
        cur = self.conn.cursor()
        cur.execute("insert into chapter(novelid,chaptername,content) values(%s,%s,%s)",
                    (novelid, chaptername, content))
        cur.close()
        self.conn.commit()

# Numeric category id -> Chinese display name (mirrors the site's map pages).
SORTS = {
    1: "玄幻魔法", 2: "武侠修真", 3: "历史军事", 4: "女频言情",
    5: "侦探推理", 6: "网络动漫", 7: "科幻小说", 8: "恐怖灵异", 9: "美文同人",
}

mysql = Sql()

if __name__ == "__main__":
    # Set the socket timeout ONCE before any request; the original called
    # setdefaulttimeout(30) after every chapter fetch, so the very first
    # request ran with no timeout at all.
    socket.setdefaulttimeout(30)
    for type in range(1, 10):
        sort = SORTS.get(type)
        if sort is None:
            print("请求的小说类型有误!!!")
            continue
        for href, novelname in getTypeList(type):
            lastrowid = mysql.addnovels(sort, novelname)
            # Regex yields (url, title_attr, link_text); the original bound the
            # name "title" twice, silently discarding the title attribute — use
            # "_" to make that explicit (title = link text, as before).
            for url, _, title in getNovelList(href):
                try:
                    print("正在爬取------------%s 《%s》 %s" % (sort, novelname, title))
                    content = getNovelContent(href.replace("index.html", url))
                    mysql.addchapters(novelid=lastrowid, chaptername=title, content=content)
                except Exception as e:
                    # Best-effort: log and keep scraping the remaining chapters.
                    print("连接中断,发生错误:%s !!!!" % e)

 

以上是关于爬取小说网站整站小说内容 -《狗嗨默示录》-的主要内容,如果未能解决你的问题,请参考以下文章

爬取全本小说内容并进行文本分析

Python 爬取笔趣阁小说

spider爬虫练习,爬取顶点小说网,小说内容。

Python爬取小说网站下载小说

10分钟入门爬虫-小说网站爬取

小说爬虫(基于requests+BeautifulSoup)