python 爬虫第二例--百度贴吧
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python 爬虫第二例--百度贴吧相关的知识,希望对你有一定的参考价值。
python 第二例,爬取百度贴吧的帖子,获取帖子的标题,内容,所在楼层,发布时间
其中存在一个问题,当该帖子是手机端发布的帖子,此时在页面中会有标识,因此多一个span标签,与楼层和发布时间的标签一样
解决方法: 目前想到的解决方法是通过判断爬到的值来进行选择,但该方案效率肯定低,因此未使用,等知识体系丰富后再进行改进
附爬取的代码:
# -*- coding: utf-8 -*-
"""Baidu Tieba thread crawler.

Fetches a thread's title, post contents, floor numbers and post times,
and writes them to a local text file.
"""
import re
import urllib.error
import urllib.request


class Tool:
    """Regex-based helper that strips HTML markup from post content."""

    # Remove <img ...> tags entirely.  (The original pattern also contained
    # a stray ' {0,100}' alternative that silently deleted all spaces.)
    removeImg = re.compile(r'<img.*?>')
    # Remove hyperlink tags but keep the link text.
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Convert structural tags to a newline.
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # Convert table cells to a tab.
    replaceTD = re.compile(r'<td>')
    # Convert paragraph breaks to newline + two-space indent.  (The original
    # also listed '<br><br><br>' last, which was unreachable because
    # '<br><br>' always matched first.)
    replacePara = re.compile(r'<br><br>|<br>')
    # Strip any remaining tags.
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self, x):
        """Return *x* with HTML markup stripped and surrounding whitespace trimmed."""
        x = re.sub(self.removeImg, "", x)
        x = re.sub(self.removeAddr, "", x)
        x = re.sub(self.replaceLine, "\n", x)
        x = re.sub(self.replaceTD, "\t", x)
        x = re.sub(self.replacePara, "\n  ", x)
        x = re.sub(self.removeExtraTag, "", x)
        return x.strip()


class BDTB:
    """Downloads all pages of one Tieba thread and writes them to a .txt file."""

    def __init__(self, baseUrl, onlyLz, floorTag):
        # Thread base URL, e.g. http://tieba.baidu.com/p/<id>
        self.baseUrl = baseUrl
        # '1' restricts the crawl to the original poster's posts.
        self.onlyLz = '?see_lz=' + str(onlyLz)
        self.tool = Tool()
        # Output file handle; opened lazily by setFileTitle().
        self.file = None
        # Fallback file name when the thread title cannot be parsed.
        self.defaultTitle = u"百度贴吧帖子"
        # '1' means floor number and post time are written before each post.
        self.floorTag = floorTag

    def getPage(self, pageNum):
        """Fetch page *pageNum* of the thread; return its HTML or None on error."""
        url = self.baseUrl + self.onlyLz + '&pn=' + str(pageNum)
        try:
            with urllib.request.urlopen(url) as response:
                return response.read().decode('utf-8')
        except urllib.error.URLError as e:
            # e.code is an int, so convert before concatenating (the original
            # raised TypeError here).  The original's second hasattr check was
            # also unreachable after the first 'return None'.
            if hasattr(e, 'code'):
                print(u"连接百度贴吧失败,错误编码:" + str(e.code))
            elif hasattr(e, 'reason'):
                print(u"连接百度贴吧失败,错误原因:" + str(e.reason))
            return None

    def getTitle(self, page):
        """Extract the thread title from *page*, or None if not found."""
        pattern = re.compile(
            r'<h3 class="core_title_txt pull-left text-overflow ".*?>(.*?)</h3>',
            re.S)
        match = re.search(pattern, page)
        return match.group(1).strip() if match else None

    def getPageNum(self, page):
        """Extract the total page count (as a string) from *page*, or None."""
        pattern = re.compile(
            r'<ul class="l_posts_num">.*?<span class="red">(.*?)</span>', re.S)
        match = re.search(pattern, page)
        return match.group(1).strip() if match else None

    def getContent(self, page):
        """Return a list of [text, floor, post_time] triples for every post."""
        pattern = re.compile(
            r'<div id="post_content_.*?>(.*?)</div>.*?'
            r'<div class="core_reply_tail clearfix">.*?'
            r'<span class="tail-info">(.*?)</span>.*?'
            r'<span class="tail-info">(.*?)</span>', re.S)
        contents = []
        for text, floor, writeTime in re.findall(pattern, page):
            # NOTE(review): posts made from the mobile client reportedly carry
            # an extra tail-info span, which can shift these two captures —
            # known limitation described in the article.
            contents.append(["\n" + self.tool.replace(text) + "\n",
                             floor, writeTime])
        return contents

    def setFileTitle(self, title):
        """Open the output file, named after *title* (or the default title)."""
        name = title if title is not None else self.defaultTitle
        self.file = open(name + ".txt", "w+", encoding="utf-8")

    # Backward-compatible alias for the original misspelled method name.
    setFileTIltle = setFileTitle

    def writeData(self, contents):
        """Write the extracted posts (triples from getContent) to the open file."""
        for text, floor, writeTime in contents:
            if self.floorTag == '1':
                self.file.write(
                    "\n" + floor + "-----------------------------\n" + writeTime)
            self.file.write(text)

    def start(self):
        """Crawl every page of the thread and write the result to disk."""
        indexPage = self.getPage(1)
        if indexPage is None:
            print("URL已经失效")
            return
        pageNum = self.getPageNum(indexPage)
        if pageNum is None:
            # The original only printed here, then crashed on int(None) below.
            print("URL已经失效")
            return
        self.setFileTitle(self.getTitle(indexPage))
        try:
            print("该帖子一共有" + str(pageNum) + "页")
            for i in range(1, int(pageNum) + 1):
                print("正在写入第" + str(i) + "页数据")
                page = self.getPage(i)
                if page is None:
                    continue
                self.writeData(self.getContent(page))
        except IOError as e:
            print("写入异常,原因:" + str(e))
        finally:
            # The original never closed the file handle.
            if self.file is not None:
                self.file.close()
            print("写入完成")


if __name__ == "__main__":
    # Guarded so importing this module does not prompt for input or crawl.
    print(u"请输入帖子代号")
    baseUrl = 'http://tieba.baidu.com/p/' + str(input(u"http://tieba.baidu.com/p/"))
    seeLZ = input("是否只获取楼主的帖子是输入1,否输入0\n")
    floorTag = input("是否写入楼层与时间信息是输入1,否输入0\n")
    bdtb = BDTB(baseUrl, seeLZ, floorTag)
    bdtb.start()