python 爬小说

Posted 我们是煮过的花朵

tags:

篇首语：本文由小常识网（cha138.com）小编为大家整理，主要介绍了用 Python 爬取小说的相关知识，希望对你有一定的参考价值。

#coding=utf-8
import datetime
import time
import sys
import os 

import urllib2
import urllib

# Root URL of the novel site.  It is fetched as the chapter-index page and is
# also the prefix joined with every chapter's relative link.  The value is a
# placeholder (it reads "novel site URL") and must be replaced with a real
# address before running.
sx = '小说站网址'

# NOTE(review): this shadows the builtin ``type``; the name is kept so that
# any code relying on it keeps working.
type = sys.getfilesystemencoding()
# Browser-like User-Agent sent with every request.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}


# Output file that collects every chapter.  Binary mode: the page text is
# written as UTF-8 encoded bytes.
fo = open("note.txt", "wb")

def getHtml(url):
    """Download ``url`` and return the page re-encoded as UTF-8.

    The request is sent with the module-level ``headers``.  The response body
    is decoded as GBK and encoded back to UTF-8 before being returned.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page content as a UTF-8 encoded byte string, or ``None`` when the
        request fails with ``urllib2.URLError`` (the error code and/or reason
        are printed in that case).
    """
    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        try:
            data = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()
    except urllib2.URLError as e:
        # HTTP errors carry ``code``; lower-level failures carry ``reason``.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        return None

    data = data.decode('gbk').encode('utf-8')
    print(len(data))
    return data

def _iterChapterLinks(fragment):
    """Yield ``(href, title)`` for each quoted ``href=`` anchor in ``fragment``.

    ``href`` is the text between the quotes following ``href=`` and ``title``
    is the text between the end of that opening tag and the next ``</a>``.
    """
    pos = 0
    while True:
        hrefpos = fragment.find('href=', pos)
        if hrefpos == -1:
            return

        valuepos = hrefpos + len('href=')
        quote = fragment[valuepos:valuepos + 1]
        if quote not in ('"', "'"):
            # Unquoted or truncated attribute: skip it and keep scanning.
            pos = valuepos
            continue

        urlend = fragment.find(quote, valuepos + 1)
        tagend = fragment.find('>', urlend + 1) if urlend != -1 else -1
        titleend = fragment.find('</a>', tagend + 1) if tagend != -1 else -1
        if titleend == -1:
            # The anchor is cut off by the end of the fragment; stop instead
            # of looping forever on a -1 offset.
            return

        yield fragment[valuepos + 1:urlend], fragment[tagend + 1:titleend]
        pos = titleend + len('</a>')


def dealIndex(url):
    """Fetch the chapter index at ``url`` and download every linked chapter.

    Only the region between the ``ChapterList_HengFu_1`` and
    ``ChapterList_HengFu_2`` markers is scanned.  Each link found there is
    appended to the module-level ``sx`` prefix and handed to ``dealContext``
    together with its anchor text.

    Args:
        url: Address of the chapter index page.

    Raises:
        ValueError: If either marker is missing from the page.
    """
    data = getHtml(url)
    if data is None:
        # getHtml has already printed the failure; there is nothing to parse.
        return

    bgnpos = data.index('ChapterList_HengFu_1') + 10
    endpos = data.index('ChapterList_HengFu_2') - 10
    print(bgnpos)
    print(endpos)

    # The helper scans this slice from offset 0.  Offsets inside the slice
    # are relative to the slice, not to the whole page.
    achfx = data[bgnpos:endpos]
    for indexurl, titlename in _iterChapterLinks(achfx):
        dealContext(sx + indexurl, titlename)


def dealContext(url, title):
    """Fetch one chapter page and append its title and text to ``fo``.

    The chapter text is taken from just after the ``name="content"``
    attribute up to the ``</div>`` found near the ``yuedu_bottom`` marker.
    ``&nbsp;`` entities and ``<br />`` tags are replaced with spaces.

    Args:
        url: Address of the chapter page.
        title: Chapter title, written in front of the chapter text.
    """
    print(url)
    print(title)

    data = getHtml(url)
    if data is None:
        # getHtml has already printed the failure; skip this chapter.
        return

    marker = data.find('name="content"', 10)
    bottom = data.find('yuedu_bottom', marker + 15) if marker != -1 else -1
    if bottom == -1:
        print("chapter content not found: " + url)
        return

    # Skip the 14-character attribute plus the closing '>' of the tag.
    bgnpos = marker + 15
    # The content's closing </div> sits shortly before the bottom marker.
    # Never start the search before the content itself.
    endpos = data.find('</div>', max(bgnpos, bottom - 50))
    if endpos == -1:
        print("chapter content not found: " + url)
        return

    sContent = data[bgnpos:endpos]
    sContent = sContent.replace('&nbsp;', ' ')
    sContent = sContent.replace('<br />', ' ')

    fo.write(title + "  " + sContent)

# Crawl the whole novel starting from the index page.  The output file is
# closed even if the crawl aborts with an exception.
try:
    dealIndex(sx)
finally:
    fo.close()

 

以上是关于用 Python 爬取小说的主要内容。如果未能解决你的问题，请参考以下文章：

python 爬qidian小说

40行代码爬取金庸所有武侠小说

教你用Python批量爬取小说!这年头了谁看小说还充钱啊!

python之如何爬取一篇小说的第一章内容

用python爬虫简单爬取 笔趣网:类“起点网”的小说

python 爬小说