11-内涵段子-爬虫
Posted zhumengdexiaobai
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了11-内涵段子-爬虫相关的知识,希望对你有一定的参考价值。
爬取内涵段子,使用正则进行简单处理:
# _*_ coding: utf-8 _*_
"""
Created on 2018-07-14

@author: sss
function: scrape jokes from neihan8.com (static-page scraping), clean the
HTML wrappers with a simple regex, and append the text to duanzi.txt.
"""
import random
import re
import urllib.request


class Spider:
    """Paginated crawler for the neihan8.com joke listing pages."""

    def __init__(self):
        # Page number the crawl starts from.
        self.page = 1
        # Crawl switch: True means keep fetching the next page.
        self.switch = True

    def loadPage(self):
        """Download the current listing page and pass its jokes to dealPage."""
        print('开始下载第' + str(self.page) + '页:')
        url = 'https://www.neihan8.com/article/list_5_' + str(self.page) + '.html'

        # Rotate through a few User-Agent strings to look less like a bot.
        ua_list = [
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0)like Gecko",
            "Mozilla/5.0 (Macintosh; U; PPC Mac OS X ",
            "Mozilla/5.0 (Macintosh; Intel Mac OS ",
        ]
        user_agent = random.choice(ua_list)
        headers = {
            "Connection": "keep-alive",
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "User-Agent": user_agent,
        }

        request = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(request)
        # The site serves gbk-encoded pages.
        html = response.read().decode('gbk')

        # Each joke sits inside <div class="f18 mb20">...</div>; re.S makes
        # '.' match newlines so multi-line jokes are captured too.
        # (The original pattern '<divsclass=...' was missing the \s and
        # could never match.)
        pattern = re.compile(r'<div\s*class="f18 mb20">(.*?)</div>', re.S)
        content_list = pattern.findall(html)
        self.dealPage(content_list)

    @staticmethod
    def _clean_item(item):
        """Strip the HTML tags that wrap a single joke's text."""
        for tag in ('<p>', '</p>', '<br>', '<br />'):
            item = item.replace(tag, '')
        return item

    def dealPage(self, content_list):
        """Clean every joke in content_list and write each one to disk.

        Fixes the original's misplaced parenthesis, which called .replace on
        an empty string instead of chaining, so '<br>' was never stripped.
        """
        for item in content_list:
            self.writePage(self._clean_item(item))

    def writePage(self, item):
        """Append one cleaned joke to duanzi.txt."""
        # Explicit encoding so output does not depend on the platform default.
        with open('duanzi.txt', 'a', encoding='utf-8') as f:
            f.write(item)

    def startWork(self):
        """Main loop: fetch pages until the user types 'q'."""
        while self.switch:
            self.loadPage()
            # Python 3: input(), not the py2-only raw_input the original
            # smuggled in via Tools.scripts.treesync.
            command = input('如果继续爬去,请按回车(退出输入q)')
            if command == 'q':
                self.switch = False
            self.page += 1
        print('finish!')


if __name__ == '__main__':
    duanziSpider = Spider()
    duanziSpider.startWork()
以上是关于11-内涵段子-爬虫的主要内容,如果未能解决你的问题,请参考以下文章