广州商学院新闻获取
Posted 127li
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了广州商学院新闻获取相关的知识,希望对你有一定的参考价值。
import re
import time
from multiprocessing import Process, Pool

import pandas
import requests
import xlwt
from bs4 import BeautifulSoup, Tag

# First page of the campus-news listing; later pages are "<base>{n}.html".
NEWS_LIST_URL = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
# Endpoint queried for the click counter of a single article.
CLICK_COUNT_API = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'
# Number of articles the page-count computation assumes per listing page.
NEWS_PER_PAGE = 10
# Seconds to wait for the server so a stalled connection cannot hang the run.
REQUEST_TIMEOUT = 10

# Accumulates one dict per scraped article; filled by getnews().
news = []


def _fetch_soup(url):
    """Download *url*, decode it as UTF-8 and return the parsed document."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'html.parser')


def getClickCount(newUrl):
    """Return the click count of a news article.

    The article id is the file name of the last path segment that follows
    an underscore-suffixed directory (``..._0404/9183.html`` -> ``9183``).
    The counter endpoint is expected to answer with a script fragment
    containing ``hits').html('<number>');``.

    :param newUrl: URL of the article page.
    :return: int
    :raises ValueError: if the id cannot be extracted from the URL or the
        response does not contain the expected counter fragment.
    """
    id_match = re.search(r'_[^/]*/([^/]+)\.html', newUrl)
    if id_match is None:
        raise ValueError('cannot extract a news id from {!r}'.format(newUrl))

    response = requests.get(
        CLICK_COUNT_API.format(id_match.group(1)),
        timeout=REQUEST_TIMEOUT,
    )
    hits_match = re.search(r"hits'\)\.html\('(\d+)'\);", response.text)
    if hits_match is None:
        raise ValueError('no click count in response for {!r}'.format(newUrl))
    return int(hits_match.group(1))


def getNewDetail(newsUrl):
    """Scrape a single news article.

    The returned dict holds every ``label:value`` pair found in the
    ``show-info`` block, plus the article URL under the link key, the
    concatenated body text under the body key, and the click counter
    formatted as ``'<n>次'``. When a publish-time entry is present it is
    converted to a ``time.struct_time``.

    :param newsUrl: URL of the article page.
    :return: dict
    """
    soup = _fetch_soup(newsUrl)

    # Article body: all text fragments joined without separators.
    body = soup.find('div', {'class': 'show-content'})
    content = ''.join(body.stripped_strings) if body is not None else ''

    details = {}
    info = soup.find('div', {'class': 'show-info'})
    info_text = info.text if info is not None else ''
    # Fields are separated by non-breaking spaces and/or newlines.
    for raw_field in re.split(r'[\xa0\n]', info_text):
        field = raw_field.strip()
        if len(field) <= 3:
            continue
        if '次' in field:
            # The counter is filled in by script on the page, so ask the API.
            details['点击'] = '{}次'.format(getClickCount(newsUrl))
            continue
        # Split on the FIRST colon only (ASCII or full-width): the publish
        # time value itself contains colons.
        parts = re.split(r'[::]', field, maxsplit=1)
        if len(parts) != 2:
            continue
        label, value = parts
        details[label.strip()] = value.strip()

    details['链接'] = newsUrl
    details['正文'] = content
    if '发布时间' in details:
        details['发布时间'] = time.strptime(
            details['发布时间'], '%Y-%m-%d %H:%M:%S')
    return details


def getNewsUrl(url):
    """Return the links of all articles on one news listing page.

    :param url: URL of the listing page.
    :return: list of ``href`` values, in page order; empty if the page has
        no ``news-list`` element.
    """
    news_list = _fetch_soup(url).find('ul', {'class': 'news-list'})
    if news_list is None:
        return []

    links = []
    for child in news_list.children:
        # Skip the whitespace text nodes between item tags.
        if not isinstance(child, Tag) or child.a is None:
            continue
        href = child.a.get('href')
        if href:
            links.append(href)
    return links


def getPage(url):
    """Return the number of listing pages.

    The number is derived from the first integer in the text of the
    ``a.a1`` element (presumably the total article count - confirm against
    the live page), divided by NEWS_PER_PAGE and rounded up.

    :param url: URL of the first listing page.
    :return: int, at least 1
    :raises ValueError: if the count element is missing or has no number.
    """
    total_link = _fetch_soup(url).find('a', {'class': 'a1'})
    count_match = None
    if total_link is not None:
        count_match = re.search(r'\d+', total_link.get_text())
    if count_match is None:
        raise ValueError('cannot find the news count on {!r}'.format(url))

    total = int(count_match.group())
    # Ceiling division: an exact multiple of the page size must not
    # produce an extra, non-existent page.
    return max(1, -(-total // NEWS_PER_PAGE))


def getnews(url):
    """Scrape every article linked from one listing page into ``news``.

    :param url: URL of the listing page.
    :return: None
    """
    # url[39:] is the part after ".../xiaoyuanxinwen" ("/" or "/<n>.html").
    print('start in %s' % url[39:])
    for news_url in getNewsUrl(url):
        news.append(getNewDetail(news_url))
    print(' end ', end='')


def main():
    """Scrape all listing pages and write the result to 'gzccnews.xls'."""
    page = getPage(NEWS_LIST_URL)
    for i in range(1, page + 1):
        if i == 1:
            url = NEWS_LIST_URL
        else:
            url = '{}{}.html'.format(NEWS_LIST_URL, i)
        getnews(url)
    df = pandas.DataFrame(news)
    # NOTE(review): the legacy .xls format needs the xlwt writer; confirm the
    # installed pandas version still supports it before changing the name.
    df.to_excel('gzccnews.xls')


if __name__ == '__main__':
    main()
以上是关于广州商学院新闻获取的主要内容。如果未能解决你的问题,请参考以下文章: