网络爬虫入门——案例一:爬取百度贴吧帖子
Posted 可爱的熊乖乖
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了网络爬虫入门——案例一:爬取百度贴吧帖子相关的知识,希望对你有一定的参考价值。
参考资料:
Python:http://www.runoob.com/python/python-intro.html
Python爬虫系列教程:http://www.cnblogs.com/xin-xin/p/4297852.html
正则表达式:http://www.cnblogs.com/deerchao/archive/2006/08/24/zhengzhe30fengzhongjiaocheng.html
本帖目标:
1.对百度贴吧的任意帖子进行抓取
2.指定是否只抓取楼主发帖内容
3.将抓取到的内容分析并保存到文件
4.抓取帖子中出现的美图
# -*- coding: utf-8 -*- """ Created on Fri Apr 15 11:47:02 2016 @author: wuhan """ import urllib import urllib2 import re import time import os #reload(sys) #sys.setdefaultencoding("utf-8") class Tool: removeImg = re.compile(\'<img.*?>| {12}\') removeAddr = re.compile(\'<a.*?>|</a>\') replaceLine = re.compile(\'<tr>|<div>|</div>|</p>\') replaceTD = re.compile(\'<td>\') replacePara = re.compile(\'<p.*?>\') replaceBR = re.compile(\'<br><br>|<br>\') removeExtraTag = re.compile(\'<.*?>\') def replace(self,x): x = re.sub(self.removeImg, "", x) x = re.sub(self.removeAddr, "", x) x = re.sub(self.replaceLine, "\\n", x) x = re.sub(self.replaceBR, "\\n", x) x = re.sub(self.replacePara, "\\n ", x) x = re.sub(self.replaceTD, "\\t", x) x = re.sub(self.removeExtraTag, "", x) return x.strip() class BDTB: def __init__(self, baseUrl, seeLZ, floorTag): self.baseURL = baseUrl self.seeLZ = \'?see_lz=\' + str(seeLZ) self.tool = Tool() self.file = None self.floor = 1 self.defaultTitle = u\'百度贴吧\' self.floorTag = floorTag def getPage(self, pageNum): try: url = self.baseURL + self.seeLZ + \'&pn=\' + str(pageNum) request = urllib2.Request(url) response = urllib2.urlopen(request) return response.read().decode(\'utf-8\') except urllib2.URLError, e: if hasattr(e, "reason"): print u\'百度贴吧链接失败,错误原因 :\', e.reason return None def getTitle(self, page): pattern = re.compile(\'<h1 class="core_title_txt.*?>(.*?)</h1>\',re.S) result = re.search(pattern, page) if result: return result.group(1).strip() else: return None def getPageNum(self, page): pattern = re.compile(\'<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>\',re.S) result = re.search(pattern, page) if result: return result.group(1).strip() else: return None def getContents(self,page): pattern = re.compile(\'<div id="post_content.*?>(.*?)</div>\', re.S) items = re.findall(pattern, page) contents = [] for item in items: content = "\\n" + self.tool.replace(item) + "\\n" contents.append(content.encode(\'utf-8\')) return contents def 
setFileTitle(self, title): if title is not None: self.file = open(title + ".txt" , "w+") else: self.file = open(self.defaultTitle + ".txt" , "w+") def writeData(self, contents): for item in contents: if self.floorTag == \'1\': floorLine = "\\n" + str(self.floor) + u"-----------------------------------------------------------------------------------------------------------------------------------------\\n" self.file.write(floorLine) self.file.write(item) self.floor += 1 def start(self): indexPage = self.getPage(1) pageNum = self.getPageNum(indexPage) title = self.getTitle(indexPage) self.setFileTitle(title) if pageNum == None: print "URL已失效,请重试" return try: print "该贴子共有" + str(pageNum) + "页" for i in range(1, int(pageNum)+1): print "正在写入第" + str(i) + "页数据" page = self.getPage(i) contents = self.getContents(page) self.writeData(contents) self.getPicture(page, i) except IOError, e: print "写入异常,原因" + e.message finally: print "写入任务完成" def getPicture(self, page, PageNum): reg = r\'<img class="BDE_Image".*?src="(.+?.jpg)\' imgre = re.compile(reg)#可以把正则表达式编译成一个正则表达式对象 imglist = re.findall(imgre,page)#读取html 中包含 imgre(正则表达式)的数据 t = time.localtime(time.time()) foldername = str(t.__getattribute__("tm_year"))+"-"+str(t.__getattribute__("tm_mon"))+"-"+str(t.__getattribute__("tm_mday")) picpath = \'E:\\\\Python\\\\ImageDownload\\\\%s\' % (foldername) #下载到的本地目录 if not os.path.exists(picpath): #路径不存在时创建一个 os.makedirs(picpath) x = 0 for imgurl in imglist: target = picpath+\'\\\\%s_%s.jpg\' % (PageNum, x) urllib.urlretrieve(imgurl, target)#直接将远程数据下载到本地 x+=1 print u"请输入帖子代号" baseURL = \'http://tieba.baidu.com/p/\' + str(raw_input(u\'http://tieba.baidu.com/p/\')) seeLZ = raw_input("是否只获取楼主发言,是输入1,否输入0\\n".decode(\'utf-8\').encode(\'gbk\')) floorTag = raw_input("是否写入楼层信息,是输入1,否输入0\\n".decode(\'utf-8\').encode(\'gbk\')) bdtb = BDTB(baseURL,seeLZ,floorTag) bdtb.start()
以上是关于网络爬虫入门——案例一:爬取百度贴吧帖子的主要内容,如果未能解决你的问题,请参考以下文章