广州商学院新闻获取

Posted 2020-10-31 127li
tags:
篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了广州商学院新闻获取相关的知识，希望对你有一定的参考价值。
import re
import xlwt
import time
import pandas
import requests
from multiprocessing import Process,Pool
from bs4 import BeautifulSoup


def getClickCount(newUrl):
    """
    Return the click (hit) count of a news article.

    The article id is the path segment that follows the first ``/`` inside
    the text between the first underscore of ``newUrl`` and its ``.html``
    suffix.  That id is sent to the site's counter API and the number
    written into the ``hits`` element of the response is returned.

    :param newUrl: URL of the news article page.
    :return: int
    :raises ValueError: if no article id can be extracted from ``newUrl``
        or the counter response contains no hit count.
    """
    id_match = re.search(r'_(.*)\.html', newUrl)
    if id_match is None:
        raise ValueError('cannot extract news id from {!r}'.format(newUrl))
    segments = id_match.group(1).split('/')
    if len(segments) < 2:
        raise ValueError('cannot extract news id from {!r}'.format(newUrl))
    new_id = segments[1]

    url = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(new_id)
    response = requests.get(url)

    # Raw-string pattern with an explicit digit group: a greedy ``(.*)`` would
    # swallow any statements that follow the hits assignment in the response.
    hits_match = re.search(r"hits'\)\.html\('(\d+)'\);", response.text)
    if hits_match is None:
        raise ValueError('no hit count in counter response for {!r}'.format(newUrl))
    return int(hits_match.group(1))

def getNewDetail(newsUrl):
    """
    Fetch one news article page and return its details.

    The returned dict contains one entry per ``label：value`` field found in
    the page's ``show-info`` block, plus:

    * ``'链接'``     - the article URL (``newsUrl``),
    * ``'正文'``     - the concatenated text of the ``show-content`` block,
    * ``'发布时间'`` - the publish time parsed into a ``time.struct_time``.

    :param newsUrl: URL of the news article page.
    :return: Dict
    :raises KeyError: if the info block has no ``发布时间`` field.
    """
    web = requests.get(newsUrl)
    web.encoding = 'utf-8'
    soup = BeautifulSoup(web.text, 'html.parser')

    body = soup.find('div', {'class': 'show-content'})  # article body
    content = ''.join(body.stripped_strings)

    detail = {}
    info = soup.find('div', {'class': 'show-info'})  # metadata line
    # Fields are separated by non-breaking spaces (and possibly newlines).
    # Splitting on the plain letter "n" would cut any value that contains it.
    for field in re.split(r'[\xa0\n]', info.text):
        if len(field) <= 3:
            continue
        if '发布时间' in field:
            field = field.strip()
            # Normalise an ASCII label separator to the full-width colon used
            # by the other fields; leave the colons of the time value alone
            # when the label already uses a full-width colon.
            if '：' not in field:
                field = field.replace(':', '：', 1)
        if '次' in field:
            # The static page carries no count; ask the counter API instead.
            field = '点击：{}次'.format(getClickCount(newsUrl))

        # Split on the first separator only, so a value may contain '：'.
        key, separator, value = field.partition('：')
        if not separator:
            continue  # not a "label：value" field
        detail[key] = value

    detail['链接'] = newsUrl
    detail['正文'] = content
    detail['发布时间'] = time.strptime(detail['发布时间'], '%Y-%m-%d %H:%M:%S')
    return detail
def getNewsUrl(url):
    """
    Collect the article links found on one news list page.

    Looks up the ``<ul class="news-list">`` element and, for each of its
    direct child tags, takes the ``href`` of the first ``<a>`` inside it.

    :param url: URL of a news list page.
    :return: List of link strings in page order; empty if the page has no
        ``news-list`` element.
    """
    web = requests.get(url)
    web.encoding = 'utf-8'

    soup = BeautifulSoup(web.text, 'html.parser')
    news_list = soup.find('ul', {'class': 'news-list'})
    if news_list is None:
        return []

    links = []
    # Iterate over direct child *tags* only; whitespace text nodes between the
    # items are skipped by find_all, so no length heuristic is needed.
    for item in news_list.find_all(True, recursive=False):
        anchor = item.a
        if anchor is None:
            continue
        href = anchor.get('href')
        if href is not None:
            links.append(href)
    return links

def getPage(url):
    """
    Return the number of news list pages.

    Reads the text of the first ``<a class="a1">`` element, drops its last
    character and parses the remainder as the total item count (presumably
    a number followed by a one-character unit - confirm against the page).
    The page count assumes 10 items per list page.

    :param url: URL of the first news list page.
    :return: int, always at least 1.
    """
    web = requests.get(url)
    web.encoding = 'utf-8'

    soup = BeautifulSoup(web.text, 'html.parser')
    total_text = soup.find('a', {'class': 'a1'}).string[:-1]
    total = int(total_text)

    # Ceiling division: ``total // 10 + 1`` reports one page too many when
    # the total is an exact multiple of 10.  The first page always exists.
    return max(1, -(-total // 10))

def getnews(url):
    """
    Scrape every article linked from one list page into the global ``news``.

    Each article's detail dict (from ``getNewDetail``) is appended to the
    module-level ``news`` list; progress markers are printed before and
    after the page is processed.

    :param url: URL of a news list page.
    :return: None
    """
    print('start in %s'%url[39:])
    # A distinct loop name keeps the ``url`` parameter intact.
    for news_url in getNewsUrl(url):
        news.append(getNewDetail(news_url))
    print(' end ' ,end='')

if __name__=='__main__':

    # Module-level accumulator that getnews() appends article dicts to.
    news=[]

    base_url='http://news.gzcc.cn/html/xiaoyuanxinwen/'
    page=getPage(base_url)
    for i in range(1,page+1):
        # The first list page lives at the bare directory URL; later pages
        # are numbered "<n>.html".
        if i==1:
            url=base_url
        else:
            url=base_url+'{}.html'.format(i)
        getnews(url)

    # NOTE(review): writing ".xls" relies on the xlwt engine, which newer
    # pandas releases no longer ship - confirm the installed pandas version.
    df=pandas.DataFrame(news)
    df.to_excel('gzccnews.xls')
以上是关于广州商学院新闻获取的主要内容，如果未能解决你的问题，请参考以下文章：