python 爬小说

Posted 2020-10-24 我们是煮过的花朵

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了python 爬小说相关的知识，希望对你有一定的参考价值。

#coding=utf-8
import datetime
import time
import sys
import os 

import urllib2
import urllib

# Base URL of the novel site. The value below is a placeholder and must be
# replaced with the real address; it is fetched as the chapter index and is
# also prepended to every chapter link found on that page.
sx = '小说站网址'

# NOTE(review): shadows the builtin ``type`` and is not referenced by the
# code visible in this file; kept so existing users of the name still work.
type = sys.getfilesystemencoding()
# Browser-like User-Agent sent with every request.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}


# Output file shared by all chapters; opened in binary mode because the
# crawler writes already-encoded UTF-8 byte strings.
fo = open("note.txt", "wb")

def getHtml(url):
    """Download *url* and return the page re-encoded as UTF-8.

    The request carries the module-level ``headers``. The body is decoded
    as GBK and re-encoded to UTF-8, and the length of the result is printed.

    Returns:
        The UTF-8 encoded page, or ``None`` when ``urllib2`` raises
        ``URLError`` (the error's ``code`` and/or ``reason`` is printed).

    Raises:
        UnicodeDecodeError: if the body is not valid GBK.
    """
    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        try:
            data = response.read()
        finally:
            # Release the connection even if reading fails half-way.
            response.close()
    except urllib2.URLError as e:
        # HTTPError carries ``code``; plain URLError carries ``reason``.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        return None

    data = data.decode('gbk').encode('utf-8')
    print(len(data))
    return data

def dealIndex(url):
    """Walk the chapter index at *url* and download every linked chapter.

    The chapter list is taken to be the part of the page between the
    ``ChapterList_HengFu_1`` and ``ChapterList_HengFu_2`` markers. For each
    quoted ``href=`` link found there, the link target is appended to the
    module-level ``sx`` and passed to ``dealContext`` together with the
    link text.

    Nothing is done when the index page cannot be fetched (``getHtml``
    returned ``None``).

    Raises:
        ValueError: if either marker is missing from the page.
    """
    data = getHtml(url)
    if data is None:
        # The download failed; there is no index to walk.
        return

    # The +/-10 offsets only bound the region that is scanned for links;
    # they do not need to line up exactly with the marker text.
    bgnpos = data.index('ChapterList_HengFu_1') + 10
    endpos = data.index('ChapterList_HengFu_2') - 10
    print(bgnpos)
    print(endpos)

    achfx = data[bgnpos:endpos]

    # Every offset below is relative to ``achfx`` (not ``data``), so the
    # scan starts at 0 and simply runs until no further link is found.
    pos = 0
    while True:
        newpos = achfx.find('href=', pos)
        if newpos == -1:
            break

        # The character right after ``href=`` is the opening quote.
        quote = achfx[newpos + 5:newpos + 6]
        if quote not in ('"', "'"):
            # Unquoted or truncated attribute: skip it and keep scanning.
            pos = newpos + 5
            continue

        urlbgn = newpos + 6
        urlend = achfx.find(quote, urlbgn)
        tagend = achfx.find('>', urlend) if urlend != -1 else -1
        titlepos = achfx.find('</a>', tagend) if tagend != -1 else -1
        if titlepos == -1:
            # The last link is cut off by the slice boundary; stop here
            # rather than restarting the scan from the beginning.
            break

        indexurl = achfx[urlbgn:urlend]
        # Link text sits between the end of the opening tag and ``</a>``.
        titlename = achfx[tagend + 1:titlepos]
        pos = titlepos + len('</a>')

        dealContext(sx + indexurl, titlename)


def dealContext(url, title):
    """Download one chapter page and append its text to the output file.

    The chapter body is taken to start 15 characters after the first
    ``name="content"`` occurrence (searching from offset 10) and to end at
    the first ``</div>`` found from 50 characters before the following
    ``yuedu_bottom`` marker. ``&nbsp;`` and ``<br />`` are replaced by
    spaces, and *title* plus two spaces is prepended before the result is
    written to the module-level file object ``fo``.

    The chapter is skipped (nothing is written) when the page cannot be
    fetched or when any of the expected markers is missing.
    """
    print(url)
    print(title)

    data = getHtml(url)
    if data is None:
        # The download failed; there is no page to extract from.
        return

    marker = data.find('name="content"', 10)
    if marker == -1:
        print("content start not found: " + url)
        return
    # Skip the 14-character marker and the one character that follows it.
    bgnpos = marker + 15

    footer = data.find('yuedu_bottom', bgnpos)
    if footer == -1:
        print("content end not found: " + url)
        return

    # Look for the closing tag shortly before the footer marker, but never
    # before the start of the content itself.
    endpos = data.find('</div>', max(bgnpos, footer - 50))
    if endpos == -1:
        print("content end not found: " + url)
        return

    sContent = data[bgnpos:endpos]
    sContent = sContent.replace('&nbsp;', ' ')
    sContent = sContent.replace('<br />', ' ')

    sContent = title + "  " + sContent
    fo.write(sContent)

# Crawl the whole index starting from the site URL, and close the output
# file even when the crawl aborts with an exception.
try:
    dealIndex(sx)
finally:
    fo.close()

以上是关于python 爬小说的主要内容，如果未能解决你的问题，请参考以下文章

python 爬qidian小说

40行代码爬取金庸所有武侠小说

教你用Python批量爬取小说！这年头了谁看小说还充钱啊!

python之如何爬取一篇小说的第一章内容

用python爬虫简单爬取笔趣网：类“起点网”的小说

python 爬小说