爬虫大作业
Posted 宇健
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬虫大作业相关的知识,希望对你有一定的参考价值。
import re
from datetime import datetime

import jieba
import matplotlib.pyplot as plt
import pandas
import requests
from bs4 import BeautifulSoup
from wordcloud import WordCloud

# Accumulates the article text of every crawled page. writeNews() appends to
# it and getKeyWords() reads it back, so both must use the same path.
NEWS_FILE = 'news.txt'
# Holds the space-separated keywords. writekeyword() appends to it and
# getWordCloud() reads it back, so both must use the same path.
KEYWORDS_FILE = 'keywords.txt'
# Seconds to wait on a single HTTP request; without a timeout one stalled
# server would hang the whole crawl indefinitely.
REQUEST_TIMEOUT = 10


def getNewsDetail(newsUrl):
    """Download one article page and extract its details.

    Args:
        newsUrl: URL of the article page.

    Returns:
        A dict with the keys 'title', 'newsUrl', 'time' (a datetime),
        'source' and 'content'.

    Raises:
        IndexError: if one of the expected page elements is missing.
        AttributeError: if the timestamp or source pattern is not found.
        ValueError: if the timestamp does not use '-' and ':' separators.
        requests.exceptions.RequestException: on network failure or timeout.
    """
    res = requests.get(newsUrl, timeout=REQUEST_TIMEOUT)
    res.encoding = 'gb2312'
    soupd = BeautifulSoup(res.text, 'html.parser')
    # The '.post_time_source' element carries both the timestamp and the source.
    timeSource = soupd.select('.post_time_source')[0].text
    # '.' matches any separator here, but strptime below only accepts the
    # 'YYYY-MM-DD HH:MM:SS' layout.
    timestamp = re.search(
        r'(\d{4}.\d{2}.\d{2}\s\d{2}.\d{2}.\d{2})', timeSource).group(1)
    return {
        'title': soupd.select('#epContentLeft')[0].h1.text,
        'newsUrl': newsUrl,
        'time': datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S'),
        'source': re.search('来源:(.*)', timeSource).group(1),
        'content': soupd.select('#endText')[0].text,
    }


def getKeyWords():
    """Segment the saved news text with jieba and store the keywords.

    Reads NEWS_FILE, keeps every distinct word of two or more characters,
    orders the words by how often they occur in the text (most frequent
    first) and hands them to writekeyword().
    """
    with open(NEWS_FILE, 'r', encoding='utf-8') as f:
        content = f.read()
    # Keep only CJK characters so punctuation never ends up inside a token.
    chineseOnly = ''.join(re.findall(r'[\u4e00-\u9fa5]', content))
    wordSet = set(jieba.lcut(chineseOnly))
    # Single-character tokens carry no meaning for the word cloud; skip them.
    wordDict = {word: content.count(word) for word in wordSet if len(word) >= 2}
    ranked = sorted(wordDict.items(), key=lambda item: item[1], reverse=True)
    writekeyword([word for word, _ in ranked])


def writeNews(pagedetail):
    """Append the 'content' of every article in pagedetail to NEWS_FILE.

    Args:
        pagedetail: iterable of dicts as returned by getNewsDetail().
    """
    # Append mode: text from earlier pages (and earlier runs) is kept.
    with open(NEWS_FILE, 'a', encoding='utf-8') as f:
        for detail in pagedetail:
            f.write(detail['content'])


def writekeyword(keywords):
    """Append every keyword to KEYWORDS_FILE, each preceded by a space.

    Args:
        keywords: iterable of keyword strings.
    """
    # Append mode: keywords from earlier runs are kept.
    with open(KEYWORDS_FILE, 'a', encoding='utf-8') as f:
        for word in keywords:
            f.write(' ' + word)


def getListPage(listUrl):
    """Fetch one news list page and the details of every article on it.

    Args:
        listUrl: URL of the list page.

    Returns:
        A list with one getNewsDetail() dict per '<li>' entry found inside
        the '#news-flow-content' element.

    Raises:
        IndexError: if the list container or an entry's link is missing.
        requests.exceptions.RequestException: on network failure or timeout.
    """
    res = requests.get(listUrl, timeout=REQUEST_TIMEOUT)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    entries = soup.select('#news-flow-content')[0].select('li')
    return [getNewsDetail(news.select('a')[0]['href']) for news in entries]


def getWordCloud():
    """Render the keywords in KEYWORDS_FILE as a word cloud.

    Saves the image to 'kwords.png' and shows it with matplotlib.
    """
    with open(KEYWORDS_FILE, 'r', encoding='utf-8') as f:
        keywords = f.read()
    # A font containing CJK glyphs is required, otherwise the Chinese words
    # cannot be drawn. The path is Windows-specific.
    wc = WordCloud(font_path=r'C:\Windows\Fonts\simfang.ttf',
                   background_color='white', max_words=100)
    wc.generate(keywords)
    wc.to_file('kwords.png')
    plt.imshow(wc)
    plt.axis('off')
    plt.show()


# Crawl the front page first, then the numbered archive pages.
pagedetail = getListPage('http://tech.163.com/internet/')
writeNews(pagedetail)
# NOTE(review): the original comment says the channel keeps 20 pages, but
# range(2, 20) stops at page 19 - confirm whether page 20 should be fetched.
for i in range(2, 20):
    # Archive page numbers are zero-padded to two digits.
    listUrl = 'http://tech.163.com/special/tele_2016_%02d/' % i
    pagedetail = getListPage(listUrl)
    writeNews(pagedetail)
getKeyWords()   # extract the keywords and write them to KEYWORDS_FILE
getWordCloud()  # read the keywords back and render the word cloud
以上是关于爬虫大作业的主要内容,如果未能解决你的问题,请参考以下文章:
HTML5期末大作业:餐饮美食网站设计——咖啡(10页) HTML+CSS+JavaScript 学生DW网页设计作业成品 web课程设计网页规划与设计 咖啡网页设计 美食餐饮网页设计...(代码片段)
Python大作业——爬虫+可视化+数据分析+数据库(可视化篇)