Web Crawler Final Project
Posted by xjh602545141
This project crawls the game news pages of Ali213 (游侠网), extracts each news item's title and author, and tallies the authors. The site:
http://www.ali213.net/news/game/
First, we need to send a request to the site. Here is my code:
import string
import urllib.request
from urllib.parse import quote


class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        # percent-encode any non-ASCII characters in the URL, otherwise it comes out garbled
        s = quote(url, safe=string.printable)
        response = urllib.request.urlopen(s)
        if response.getcode() != 200:
            return None
        return response.read()  # return the page content
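A quick way to sanity-check the downloader is to fetch the listing page itself. This is a minimal sketch; it assumes the site is reachable and responds with HTTP 200:

# smoke test for the downloader (assumes network access to ali213.net)
downloader = HtmlDownloader()
content = downloader.download('http://www.ali213.net/news/game/')
print(len(content) if content else 'download failed')  # rough page size in bytes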
Next, we parse and collect the target URLs, handled through a URL manager. The code is as follows:
# -*- coding:utf8 -*-


class UrlManage(object):
    def __init__(self):
        self.detail_urls = set()      # URLs of detail pages waiting to be crawled
        self.old_detail_urls = set()  # URLs that have already been crawled

    def add_detail_url(self, url):
        if url is None:
            return
        if url not in self.detail_urls and url not in self.old_detail_urls:
            self.detail_urls.add(url)

    # add several URLs at once
    def add_new_detail_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_detail_url(url)

    def has_new_detail_url(self):
        return len(self.detail_urls) != 0

    def get_detail_url(self):
        new_detail_url = self.detail_urls.pop()
        self.old_detail_urls.add(new_detail_url)
        return new_detail_url
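Here is a short usage sketch of the manager with a couple of real listing URLs, showing that duplicates are ignored and each URL is handed out exactly once:

# usage sketch: URLs are deduplicated and each one is returned only once
manager = UrlManage()
manager.add_new_detail_urls([
    'http://www.ali213.net/news/game/',
    'http://www.ali213.net/news/game/index_2.html',
    'http://www.ali213.net/news/game/',  # duplicate, silently ignored
])
while manager.has_new_detail_url():
    print(manager.get_detail_url())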
Inspect the structure of Ali213's game news pages (as shown in the images below) and build a matching parser.
From page to page, the listing URLs differ only in the number after index_ (page 3 and page 5, for example, differ only in that digit), and the rest of the pages follow the same pattern, as the sketch below shows.
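The snippet below is purely illustrative; it prints the first few listing URLs the same way get_detail_url in the parser builds them (page 1 is the bare listing URL, every later page appends index_k.html):

# illustrate the listing-page URL pattern
base_url = 'http://www.ali213.net/news/game/'
for k in range(1, 6):
    print(base_url if k == 1 else base_url + 'index_{}.html'.format(k))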
One thing to watch out for: the third entry on each page is an advertisement. My approach is to skip it with a counter.
# -*- coding:utf8 -*-
from bs4 import BeautifulSoup


class HtmlParser(object):
    @staticmethod
    def soup(cont):
        return BeautifulSoup(cont, 'html.parser', from_encoding='utf-8')

    # extract the actual data: a {title: author} mapping for one listing page
    @staticmethod
    def get_new_data(soup):
        data = {}
        count = 0
        if soup.select('.t5c_l')[0].contents:
            li = soup.select('.t5c_l')[0].select('.n_lone')
            di = {}
            for i in li:
                if count == 2:  # the third entry is always an ad, so skip it
                    print('this one is an ad')
                    count = count + 1
                    continue
                moviename = i.select('h2')[0].select('a')[0].attrs['title']  # news title
                comment = i.select('.lone_f')[0].select('.lone_f_r')[0].select('.lone_f_r_f')[0].text
                comment = comment.lstrip()
                comment = comment[9:].lstrip()  # drop the fixed-length label before the author name
                di[moviename] = comment
                print(moviename, di[moviename])
                count = count + 1
            if di:  # only merge when the page yielded something
                data.update(di)
        return data

    # build the URLs of all 200 listing pages
    @staticmethod
    def get_detail_url(base_url):
        detail_urls = set()
        for k in range(1, 201):
            if k == 1:
                urls = base_url  # the first page has no index_ suffix
            else:
                urls = base_url + 'index_{}.html'.format(k)
            detail_urls.add(urls)
        return detail_urls
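Putting the downloader and parser together on a single listing page looks like this. It is a sketch that assumes both classes are imported into the same module and the page is reachable:

# parse one listing page end to end
cont = HtmlDownloader().download('http://www.ali213.net/news/game/')
page = HtmlParser.soup(cont)
print(HtmlParser.get_new_data(page))  # {title: author, ...} for that page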
Next, save the scraped data in Excel format:
# -*- coding:utf8 -*-
import xlwt  # library for writing Excel files


class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def output_excel(self, di):
        wbk = xlwt.Workbook(encoding='utf-8')
        sheet = wbk.add_sheet("wordCount")  # worksheet name
        k = 0
        for i in di.items():
            sheet.write(k, 0, label=i[0])  # column 0: news title
            sheet.write(k, 1, label=i[1])  # column 1: author
            k = k + 1
        wbk.save('wordCount.xls')  # save as wordCount.xls
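A usage sketch with placeholder data (the titles and authors below are made up, not real scraped values):

# write two placeholder rows; wordCount.xls gets titles in column 0, authors in column 1
outputer = HtmlOutputer()
outputer.output_excel({'Sample Title A': 'author_one', 'Sample Title B': 'author_two'})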
Finally, write the main class. The code is as follows:
# -*- coding:utf8 -*-
from dazuoye import url_manager, html_downloader, html_parser, html_outputer


class SpiderMain(object):
    def __init__(self):
        self.urls = url_manager.UrlManage()
        self.downloader = html_downloader.HtmlDownloader()
        self.htmlparser = html_parser.HtmlParser  # the parser's methods are static, so the class itself is enough
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        dictdata = {}
        try:
            detail_urls = self.htmlparser.get_detail_url(root_url)
            self.urls.add_new_detail_urls(detail_urls)
        except Exception:
            print('craw failed')
        while self.urls.has_new_detail_url():
            try:
                detail_url = self.urls.get_detail_url()
                print('craw %d : %s' % (count, detail_url))
                html_cont = self.downloader.download(detail_url)
                soup = self.htmlparser.soup(html_cont)
                page_data = self.htmlparser.get_new_data(soup)
                dictdata.update(page_data)
                if count == 200:
                    break
                count = count + 1
            except Exception:
                print('craw failed')
        self.outputer.output_excel(dictdata)


# program entry point
if __name__ == "__main__":
    url = 'http://www.ali213.net/news/game/'
    obj_spider = SpiderMain()
    obj_spider.craw(url)
Run the main class; the results are as follows.
Then store the authors in a list, tally their frequencies with a dictionary, and finally generate a word cloud:
# -*- coding:utf8 -*-
import numpy as np
import matplotlib.pyplot as plt
import xlrd
from PIL import Image
from wordcloud import WordCloud

file = xlrd.open_workbook('wordCount.xls')
sheet = file.sheet_by_name('wordCount')
list_li = []
for i in range(sheet.nrows):
    rows = sheet.row_values(i)
    list_li.append(rows[1].rstrip('\n'))  # column 1 holds the author

# tally the authors that appear more than once
list_di = {}
for i in list_li:
    if list_li.count(i) > 1:
        list_di[i] = list_li.count(i)
print(list_di)

image = Image.open('./005.jpg')  # mask image that gives the cloud its shape
graph = np.array(image)
wc = WordCloud(font_path='./fonts/simhei.ttf', background_color='white',
               max_words=50, max_font_size=100, min_font_size=10,
               mask=graph, random_state=10)
wc.generate_from_frequencies(list_di)

# display the word cloud
plt.figure()
plt.imshow(wc)
plt.axis("off")
plt.show()
The result is shown in the figure (hahaha).
Now, a few words about the problems I ran into while writing the code.
In the parsing step, the advertisement kept causing errors. I first tried to skip that iteration by checking whether the data pulled from the tags was empty, but that still failed. In the end I took a different route and set up a counter: when the count reaches 2 (that is, the third entry), the loop skips that iteration.
Admittedly, this only works because the site always puts the advertisement in the third slot. A possible structural alternative is sketched below.
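A hypothetical alternative would be to keep only the entries that have the expected h2 > a title link, instead of relying on the ad's position. The sketch below is untested against the live page (and given that the emptiness check above failed, it may need adjusting); the stand-in markup exists purely for demonstration:

from bs4 import BeautifulSoup

# hypothetical structural filter instead of the positional counter (untested)
def looks_like_article(entry):
    # keep an .n_lone entry only when it has the expected h2 > a title link
    title_tags = entry.select('h2')
    return bool(title_tags) and bool(title_tags[0].select('a'))

# tiny demonstration with stand-in markup (not the real page HTML)
demo = BeautifulSoup('<div class="n_lone"><h2><a title="t">t</a></h2></div>'
                     '<div class="n_lone">ad</div>', 'html.parser')
for entry in demo.select('.n_lone'):
    print(looks_like_article(entry))  # True, then False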
The other issue was reading the Excel file back and converting it into a dictionary. I had originally planned to use jieba, but once I saw that my own approach was simpler, I went with that.
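For comparison, the standard library's collections.Counter does the same author tally in a couple of lines; this minimal sketch assumes the wordCount.xls produced above is present:

# same tally with collections.Counter, keeping only authors that appear more than once
from collections import Counter
import xlrd

sheet = xlrd.open_workbook('wordCount.xls').sheet_by_name('wordCount')
authors = [sheet.row_values(i)[1].rstrip('\n') for i in range(sheet.nrows)]
freq = {author: n for author, n in Counter(authors).items() if n > 1}
print(freq)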
All in all, this project has greatly increased my interest in Python, so much so that I now think Python is the best language in the world!