获取全部校园新闻
Posted 146-王星宇
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了获取全部校园新闻相关的知识,希望对你有一定的参考价值。
1.取出一个新闻列表页的全部新闻,并包装成函数。
2.获取总的新闻篇数,算出新闻总页数。
3.获取全部新闻列表页的全部新闻详情。
# Crawl the campus-news section of news.gzcc.cn: walk every list page and
# print the details (title, publish time, source, author, click count) of
# each article found.
import re
from datetime import datetime

import requests
from bs4 import BeautifulSoup

INDEX_URL = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
PAGE_URL_TEMPLATE = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'
CLICK_URL_TEMPLATE = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'

# NOTE(review): assumes the site shows 10 articles per list page -- confirm.
NEWS_PER_PAGE = 10

# Seconds to wait for any single HTTP response; without a timeout
# requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT = 10

_TIMESTAMP_RE = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}')


def _fetch_soup(url, encoding='utf-8'):
    """Download *url*, decode it with *encoding* and parse it with html.parser."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.encoding = encoding
    return BeautifulSoup(response.text, 'html.parser')


def _extract_field(info, label):
    """Return the whitespace-delimited value that follows ``label:`` in *info*.

    Returns the string 'none' when the label is absent or has no value.
    """
    # A real prefix match is used instead of str.lstrip(), which strips a
    # *set of characters* and could eat the first characters of the value.
    # NOTE(review): the live page may use a full-width colon after the
    # label, so both colon forms are accepted -- confirm against the site.
    match = re.search(re.escape(label) + r'[:\uff1a](\S+)', info)
    return match.group(1) if match else 'none'


def getClickCount(newsUrl):
    """Return the click count of the article at *newsUrl* as a string.

    The news id is the second '/'-separated segment of the text between the
    first underscore and '.html' in the URL; it is sent to the counter API
    and the number is pulled out of the returned JavaScript snippet.

    Raises:
        ValueError: if the id cannot be extracted from the URL or the
            counter response does not contain a hit count.
    """
    id_match = re.search(r'_(.*)\.html', newsUrl)
    segments = id_match.group(1).split('/') if id_match else []
    if len(segments) < 2:
        raise ValueError('cannot extract news id from URL: {}'.format(newsUrl))
    newsId = segments[1]

    clickUrl = CLICK_URL_TEMPLATE.format(newsId)
    clickStr = requests.get(clickUrl, timeout=REQUEST_TIMEOUT).text
    count_match = re.search(r"hits'\)\.html\('(.*)'\);", clickStr)
    if count_match is None:
        raise ValueError('no click count in response from {}'.format(clickUrl))
    return count_match.group(1)


def getNewDetail(url):
    """Fetch one article page and print its link, title, time, source, author and clicks.

    Raises:
        ValueError: if the info line contains no 'YYYY-MM-DD HH:MM:SS' timestamp.
        IndexError: if the page has no '.show-title' or '.show-info' element.
    """
    soup = _fetch_soup(url)
    title = soup.select('.show-title')[0].text
    info = soup.select('.show-info')[0].text

    # Search for the timestamp itself rather than stripping the label in
    # front of it, so the parse does not depend on the exact label text.
    stamp = _TIMESTAMP_RE.search(info)
    if stamp is None:
        raise ValueError('no publish time found in: {!r}'.format(info))
    published = datetime.strptime(stamp.group(0), '%Y-%m-%d %H:%M:%S')

    source = _extract_field(info, '来源')
    author = _extract_field(info, '作者')
    clickcount = getClickCount(url)

    print('链接:{0}\n标题:{1}\n发布时间:{2}\n来源:{3}\n作者:{4}\n点击次数:{5}'.format(
        url, title, published, source, author, clickcount))
    print('-----------------')


def getListPage(listPageUrl):
    """Print the details of every article linked from one news list page."""
    soup = _fetch_soup(listPageUrl)
    for item in soup.select('li'):
        # Only <li> elements that contain a '.news-list-title' are articles.
        if item.select('.news-list-title'):
            getNewDetail(item.a.attrs['href'])


def main():
    """Read the total article count, derive the page count and crawl every page."""
    index_soup = _fetch_soup(INDEX_URL)
    total_news = int(index_soup.select('.a1')[0].text.rstrip('条'))
    # Ceiling division: a partially filled last page still counts as a page.
    page_count = -(-total_news // NEWS_PER_PAGE)

    # NOTE(review): assumes page 1 is the section index itself and pages
    # 2..N live at '<n>.html' -- confirm against the site.
    getListPage(INDEX_URL)
    for page in range(2, page_count + 1):
        getListPage(PAGE_URL_TEMPLATE.format(page))


if __name__ == '__main__':
    main()
4.找一个自己感兴趣的主题,进行数据爬取,并进行分词分析。不能与其它同学雷同。
# Crawl tech.163.com internet-news list pages, and for each article print
# its publish time, source, most frequent words (via jieba) and full text.
import re
from collections import Counter
from datetime import datetime

import jieba
import requests
from bs4 import BeautifulSoup

FIRST_LIST_URL = 'http://tech.163.com/internet/'
LIST_URL_TEMPLATE = 'http://tech.163.com/special/it_2016_%02d/'

# Seconds to wait for any single HTTP response; without a timeout
# requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT = 10

# Every character listed here is replaced by a space before segmentation.
_PUNCTUATION = '''一!“”,。?;’"',.、:\n'''
_PUNCTUATION_TABLE = str.maketrans(_PUNCTUATION, ' ' * len(_PUNCTUATION))

# Tokens that are never reported as keywords.
_STOP_WORDS = frozenset({
    '这', '\u3000', '\r', '\xa0', '时候', '对', '上', '与', '等', '不', '',
    '没有', '很多', '的', '大', '出来', '_', '到', ' ', '将', '在', '是', '了',
    '一', '还', '也', '《', '》', '(', ')', '和', '我', '我们', '其', '能够',
    '以', '个', '短', '中', '不是',
})


def _fetch_soup(url, encoding):
    """Download *url*, decode it with *encoding* and parse it with html.parser."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.encoding = encoding
    return BeautifulSoup(response.text, 'html.parser')


def getNewsDetail(newsUrl):
    """Fetch one article and print its publish time, source, top words and text.

    Raises:
        ValueError: if the info line contains no 'YYYY-MM-DD HH:MM:SS' timestamp.
        IndexError: if the page has no '#endText' or '.post_time_source' element.
    """
    # GBK is a superset of GB2312, so it decodes everything the narrower
    # codec would and matches the encoding used for the list pages.
    soup = _fetch_soup(newsUrl, 'gbk')
    content = soup.select('#endText')[0].text
    info = soup.select('.post_time_source')[0].text

    # Separators are matched literally so that anything the regex accepts
    # is also accepted by the strptime format below.
    date_match = re.search(r'\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2}', info)
    if date_match is None:
        raise ValueError('no publish time found in: {!r}'.format(info))
    dateTime = datetime.strptime(date_match.group(0), '%Y-%m-%d %H:%M:%S')

    # NOTE(review): the live page may use a full-width colon after the
    # label, so both colon forms are accepted -- confirm against the site.
    source_match = re.search(r'来源[:\uff1a]\s*(.*)', info)
    sources = source_match.group(1) if source_match else 'none'

    top_words = getTopWords(content)

    print('发布时间:{0}\n来源:{1}'.format(dateTime, sources))
    # Slice instead of indexing [0]..[4] so short articles with fewer than
    # five distinct words do not raise IndexError.
    print('关键词:{}'.format('、'.join(str(entry) for entry in top_words[:5])))
    print(content)
    print('---------------------------')


def getTopWords(content):
    """Return ``(word, count)`` pairs for *content*, most frequent first.

    Punctuation is blanked out, the text is segmented with jieba, stop
    words are dropped, and the remaining tokens are counted. Words with
    equal counts keep the order in which they first appear in the text.
    """
    cleaned = content.translate(_PUNCTUATION_TABLE)
    counts = Counter(
        word for word in jieba.cut(cleaned) if word not in _STOP_WORDS
    )
    return counts.most_common()


def getListPage(listUrl, limit=1):
    """Print title/link and details for articles on one list page.

    Args:
        listUrl: URL of the news list page.
        limit: maximum number of articles to process from the page. The
            default of 1 processes only the first article; pass ``None``
            to process every article on the page.

    Raises:
        IndexError: if the page has no '#news-flow-content' element or a
            list item has no link.
    """
    soup = _fetch_soup(listUrl, 'gbk')
    items = soup.select('#news-flow-content')[0].select('li')
    if limit is not None:
        items = items[:limit]
    for item in items:
        link = item.select('a')[0]
        url = link['href']
        print('标题:{0}\n链接:{1}'.format(link.text, url))
        getNewsDetail(url)


def main():
    """Crawl the first list page, then the numbered archive pages 02 through 09."""
    getListPage(FIRST_LIST_URL)
    for page in range(2, 10):
        getListPage(LIST_URL_TEMPLATE % page)


if __name__ == '__main__':
    main()
以上是关于获取全部校园新闻的主要内容,如果未能解决你的问题,请参考以下文章。