python爬虫-糗百阅读器
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了 Python 爬虫——糗百阅读器的相关知识,希望对你有一定的参考价值。
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Console reader for the "hot" listing pages of qiushibaike.com.

Pages are downloaded one at a time, the stories on each page are extracted
with a regular expression, and they are printed one per key press.  Entering
``q`` or ``Q`` stops the reader.

The script runs on Python 2 (urllib2 / raw_input) and on Python 3, where the
equivalent names are taken from urllib.request / urllib.error / input.
"""
import re
import urllib

try:  # Python 2
    import urllib2
    from urllib2 import URLError
except ImportError:  # Python 3: the same API lives in urllib.request
    import urllib.request as urllib2
    from urllib.error import URLError

try:  # Python 2
    _read_line = raw_input
except NameError:  # Python 3
    _read_line = input


class Turtle(object):
    """Download pages of stories and show them one by one on the console.

    Attributes:
        pageIndex: number of the next listing page to request (starts at 1).
        stories: queue of loaded pages; each page is a list of
            ``[author, text, like_count]`` entries.
        enable: ``True`` while the read loop in ``start`` should keep going.
        header: HTTP headers sent with every page request.
    """

    _BASE_URL = 'http://www.qiushibaike.com/hot/page/'

    # Groups: author name, story body, content of the HTML comment that
    # follows the body (captured but unused), like counter.
    _ITEM_PATTERN = re.compile(
        r'<div.*?author.*?<h2>(.*?)</h2>.*?'
        r'<div.*?content">(.*?)<!--(.*?)-->.*?</div>'
        r'.*?<div.*?class="stats.*?class="number">(.*?)</i>',
        re.S)
    _LINE_BREAK = re.compile(r'<br/>')

    def __init__(self):
        self.pageIndex = 1
        self.stories = []
        self.enable = True
        self.header = {
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
        }

    def getPage(self, pageIndex):
        """Download one listing page.

        Args:
            pageIndex: page number appended to the listing URL.

        Returns:
            The page body decoded as UTF-8, or ``None`` when the request
            fails with a ``URLError`` (the error code / reason is printed).
        """
        url = self._BASE_URL + str(pageIndex)
        request = urllib2.Request(url, headers=self.header)
        try:
            response = urllib2.urlopen(request)
            try:
                return response.read().decode('utf-8')
            finally:
                # Release the connection even if read/decode fails.
                response.close()
        except URLError as e:
            # HTTPError carries both attributes, a plain URLError only
            # ``reason``.
            if hasattr(e, 'code'):
                print(u'错误码: %s' % e.code)
            if hasattr(e, 'reason'):
                print(u'错误原因: %s' % e.reason)
            return None

    def getPageItem(self, pageIndex):
        """Extract the stories found on one listing page.

        Args:
            pageIndex: page number to download and parse.

        Returns:
            A list of ``[author, text, like_count]`` lists (all stripped
            strings, ``<br/>`` in the text replaced by newlines), or ``None``
            when the page could not be downloaded or was empty.
        """
        pageContent = self.getPage(pageIndex)
        if not pageContent:
            print(u'页面加载失败。。。')
            return None
        return [
            [
                author.strip(),
                self._LINE_BREAK.sub('\n', content).strip(),
                likes.strip(),
            ]
            for author, content, _comment, likes
            in self._ITEM_PATTERN.findall(pageContent)
        ]

    def loadPage(self):
        """Preload the next page when fewer than two pages are queued.

        A page that fails to download or yields no stories is not queued and
        ``pageIndex`` is left unchanged, so readers never see a ``None`` or
        empty page and a failed page is not skipped.
        """
        if len(self.stories) >= 2:
            return
        print('==============剩余未读小于两页,预加载下一页==============')
        pageStories = self.getPageItem(self.pageIndex)
        if not pageStories:
            return
        self.stories.append(pageStories)
        self.pageIndex += 1

    def getOneStory(self):
        """Show the stories of the first queued page, one per key press.

        Before each story a line is read from standard input; ``q`` / ``Q``
        (or end of input) clears ``enable`` and returns immediately.
        Otherwise a preload is attempted and the story is printed.
        """
        if not self.stories:
            return
        for story in self.stories[0]:
            try:
                isQ = _read_line()
            except EOFError:
                # Closed stdin: nothing more can be asked, treat as quit.
                isQ = 'q'
            if isQ in ('q', 'Q'):
                self.enable = False
                return
            self.loadPage()
            print(story[1])
            print('-----%s,liked by %s' % (story[0], story[2]))

    def start(self):
        """Run the interactive read loop until the user quits.

        The loop also ends (clearing ``enable``) when no page is queued and
        another load attempt still produces nothing, instead of spinning
        forever on an empty queue.
        """
        print('start to read page 1')
        self.loadPage()
        while self.enable:
            if not self.stories:
                self.loadPage()
                if not self.stories:
                    print('no more stories to read, exit')
                    self.enable = False
                    break
            self.getOneStory()
            if not self.enable:
                # User quit mid-page: do not report the page as finished.
                break
            del self.stories[0]
            print('===========该页已读完,读取下一页===========')


turtle = Turtle()

if __name__ == '__main__':
    # Only start the interactive loop when executed as a script, so that
    # importing this module performs no network or console I/O.
    turtle.start()
以上是关于 Python 爬虫——糗百阅读器的主要内容。如果未能解决你的问题,请参考以下文章: