获取全部校园新闻

Posted 2020-10-30 146-王星宇

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了获取全部校园新闻相关的知识，希望对你有一定的参考价值。

1.取出一个新闻列表页的全部新闻，并包装成函数。

2.获取总的新闻篇数，算出新闻总页数。

3.获取全部新闻列表页的全部新闻详情。

import requests
from  bs4 import  BeautifulSoup
from datetime import datetime
import re

def getClickCount(newsUrl):
    """Return the click count of a news article as reported by the counter API.

    The article id is the second '/'-separated piece of the text found
    between the first ``_`` and ``.html`` in ``newsUrl``.  That id is sent to
    the ``api.php?op=count`` endpoint, whose response is a small script
    containing ``hits').html('<count>');``.

    Args:
        newsUrl: URL of the news detail page.

    Returns:
        The click count as the string found in the API response.

    Raises:
        IndexError: if ``newsUrl`` does not have the ``_<x>/<id>.html`` shape.
        AttributeError: if the API response does not contain the hits snippet.
    """
    # Raw strings with an escaped dot so that only a literal ".html" matches.
    newsId = re.findall(r'_(.*)\.html', newsUrl)[0].split('/')[1]
    clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
    # A timeout keeps one stalled request from hanging the whole crawl.
    clickStr = requests.get(clickUrl, timeout=10).text
    count = re.search(r"hits'\)\.html\('(.*)'\);", clickStr).group(1)
    return count


def _infoField(info, label):
    """Return the text that directly follows ``label`` in ``info``, or 'none'.

    The value ends at the first whitespace after the label.
    """
    start = info.find(label)
    # find() returns -1 when absent; index 0 is a valid hit.
    if start < 0:
        return 'none'
    token = info[start:].split()[0]
    # Slice the label off as a prefix.  str.lstrip(label) would strip a
    # *character set* and could eat leading characters of the value itself.
    return token[len(label):]


def getNewDetail(url):
    """Fetch one news detail page and print its metadata.

    Prints the link, title, publish time, source, author and click count,
    followed by a separator line.

    Args:
        url: URL of the news detail page.

    Raises:
        IndexError: if the page lacks a ``.show-title`` or ``.show-info`` element.
        ValueError: if no ``YYYY-MM-DD HH:MM:SS`` timestamp is found in the info line.
    """
    resd = requests.get(url, timeout=10)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    title = soupd.select('.show-title')[0].text
    info = soupd.select('.show-info')[0].text

    # Locate the timestamp by pattern instead of by position, so leading
    # whitespace or a changed label does not break parsing.
    stamp = re.search(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', info)
    if stamp is None:
        raise ValueError('no publish time found in: {!r}'.format(info))
    dt = datetime.strptime(stamp.group(0), '%Y-%m-%d %H:%M:%S')

    source = _infoField(info, '来源：')
    author = _infoField(info, '作者：')

    clickcount = getClickCount(url)

    print('链接：{0}\n标题：{1}\n发布时间：{2}\n来源：{3}\n作者：{4}\n点击次数：{5}'.format(url, title, dt, source, author, clickcount))
    print('-----------------')

def getListPage(listPageUrl):
    """Process every news entry found on one list page.

    Each ``<li>`` that contains a ``.news-list-title`` element is treated as a
    news entry; the ``href`` of its first link is passed to ``getNewDetail``.

    Args:
        listPageUrl: URL of the news list page.
    """
    res = requests.get(listPageUrl, timeout=10)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('li'):
        # Other <li> elements on the page (menus etc.) carry no news title.
        if not news.select('.news-list-title'):
            continue
        link = news.a
        # Skip malformed entries instead of failing on a missing link.
        if link is None or not link.get('href'):
            continue
        # Fetch and print the details of the linked article.
        getNewDetail(link['href'])




url = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
resn = requests.get(url, timeout=10)
resn.encoding = 'utf-8'
soupn = BeautifulSoup(resn.text, 'html.parser')
# The '.a1' element's text is the total number of articles followed by '条'.
n = int(soupn.select('.a1')[0].text.rstrip('条'))

# Derive the page size from the entries actually shown on this first page,
# then turn the article count into a page count (ceiling division).
perPage = sum(1 for item in soupn.select('li') if item.select('.news-list-title'))
pageCount = (n + perPage - 1) // perPage if perPage else 1

# NOTE(review): assumes the index URL is list page 1 and that further pages
# are numbered from 2 ("2.html", "3.html", ...) — confirm against the site.
getListPage(url)
for i in range(2, pageCount + 1):
    pageUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    getListPage(pageUrl)

4.找一个自己感兴趣的主题，进行数据爬取，并进行分词分析。不能与其它同学雷同。

import requests
import re
import jieba
from bs4 import BeautifulSoup
from datetime import datetime


def getNewsDetail(newsUrl):
    """Fetch one article page and print its time, source, keywords and text.

    Prints the publish time and source, up to five of the most frequent
    words (as ``(word, count)`` pairs from ``getTopWords``), the article
    body and a separator line.  A missing time or source is printed as
    'none' instead of aborting.

    Args:
        newsUrl: URL of the article page.

    Raises:
        IndexError: if the page lacks ``#endText`` or ``.post_time_source``.
    """
    resd = requests.get(newsUrl, timeout=10)
    resd.encoding = 'gb2312'
    soupd = BeautifulSoup(resd.text, 'html.parser')

    content = soupd.select('#endText')[0].text
    info = soupd.select('.post_time_source')[0].text

    # Match exactly the separators that the strptime format below expects.
    dateMatch = re.search(r'(\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2})', info)
    if dateMatch is not None:
        dateTime = datetime.strptime(dateMatch.group(1), '%Y-%m-%d %H:%M:%S')
    else:
        dateTime = 'none'

    sourceMatch = re.search(r'来源:\s*(.*)', info)
    sources = sourceMatch.group(1) if sourceMatch is not None else 'none'

    topWords = getTopWords(content)
    print('发布时间：{0}\n来源：{1}'.format(dateTime, sources))
    # Slicing tolerates articles that yield fewer than five keywords.
    print('关键词：' + '、'.join(str(item) for item in topWords[:5]))
    print(content)
    print('---------------------------')

def getTopWords(content):
    """Return the word frequencies of ``content``, most frequent first.

    Punctuation characters are replaced by spaces, the text is segmented with
    ``jieba.cut`` and stop words are dropped.

    Args:
        content: The text to analyse.

    Returns:
        A list of ``(word, count)`` tuples sorted by descending count.  Words
        with equal counts keep the order of their first appearance.
    """
    from collections import Counter

    # Characters treated as separators; each one becomes a single space.
    separators = '''一！“”，。？；’"',.、：\n'''
    content = content.translate(str.maketrans(dict.fromkeys(separators, ' ')))

    exclude = {'这', '\u3000', '\r', '\xa0', '时候', '对', '上', '与', '等', '不', '', '没有', '很多', '的', '大', '出来', '_', '到', ' ', '将', '在', '是', '了', '一', '还', '也', '《', '》', '（', '）', '和', '我', '我们', '其', '能够', '以', '个', '短', '中', '是', '不是'}

    # Counter tallies in one pass, replacing a list.count() call per word.
    counts = Counter(word for word in jieba.cut(content) if word not in exclude)
    return counts.most_common()

def getListPage(listUrl, limit=1):
    """Print and process the news entries of one list page.

    For each processed ``<li>`` inside ``#news-flow-content`` the title and
    link of its first anchor are printed and the link is passed to
    ``getNewsDetail``.

    Args:
        listUrl: URL of the list page.
        limit: Maximum number of entries to process.  The default of 1 keeps
            the original behaviour (first entry only); pass ``None`` to
            process every entry.

    Raises:
        IndexError: if the page has no ``#news-flow-content`` element.
    """
    res = requests.get(listUrl, timeout=10)
    res.encoding = 'gbk'
    soup = BeautifulSoup(res.text, 'html.parser')
    handled = 0
    for new in soup.select('#news-flow-content')[0].select('li'):
        if limit is not None and handled >= limit:
            break
        links = new.select('a')
        # Skip entries without a usable link instead of failing on them.
        if not links or not links[0].get('href'):
            continue
        url = links[0]['href']
        title = links[0].text
        print('标题：{0}\n链接：{1}'.format(title, url))
        getNewsDetail(url)
        handled += 1

# Process the section's landing page first.
listUrl = 'http://tech.163.com/internet/'
getListPage(listUrl)
# Then the numbered archive pages 02 through 09 (zero-padded to two digits).
for i in range(2, 10):
    listUrl = 'http://tech.163.com/special/it_2016_%02d/' % i
    getListPage(listUrl)

以上是关于获取全部校园新闻的主要内容，如果未能解决你的问题，请参考以下文章。

获取全部校园新闻