Fetching All Campus News
Posted by Hiro-D
Scraping the news pages of the Sun Yat-sen University official site:
import requests
from bs4 import BeautifulSoup
from datetime import datetime

# Crawl the summary info from one news list page
def crawlOnePage(url):
    res = requests.get(url)
    res.encoding = 'UTF-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    cont = soup.select('li')
    for i in cont:
        print()
        print('News URL: ' + 'http://news2.sysu.edu.cn/news01/' + i.select('a')[0]['href'])
        # detailUrl = 'http://news2.sysu.edu.cn/news01/' + i.select('a')[0]['href']
        print('News title: ' + i.select('a')[0].text)
        # getDetail(detailUrl)

# Get the detailed info of one news article
def getDetail(url):
    res = requests.get(url)
    res.encoding = 'UTF-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    cont = soup.select('p')[2].text.split('|')
    times = cont[4].split(':')[1]   # date
    source = cont[0]                # source
    author = cont[1]                # author
    editor = cont[3]                # editor
    # Convert the date string to a datetime; strip() removes the trailing
    # space so the format string does not need to encode it
    release_time = datetime.strptime(times.strip(), '%Y-%m-%d')
    print(source, author, editor, release_time)
    content = soup.select('p')[-1].text
    print(content)

# Crawl the news on every list page
def getTotalPage(url):
    res = requests.get(url)
    res.encoding = 'UTF-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # The pager text looks like '1/247'; take the count after the slash.
    # Note lstrip('1/') would be wrong here: it strips every leading
    # character in the set {'1', '/'}, so '1/17' would become '7'.
    n = int(soup.select('strong')[0].text.split('/')[1])
    for i in range(1, n):
        page = str(i)
        geturl = 'http://news2.sysu.edu.cn/news01/index' + page + '.htm'
        crawlOnePage(geturl)

crawlOnePage('http://news2.sysu.edu.cn/news01/index.htm')
getDetail('http://news2.sysu.edu.cn/news01/152940.htm')
getTotalPage('http://news2.sysu.edu.cn/news01/index.htm')
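The metadata line in getDetail() is split purely by position on '|' and ':'. A minimal sketch of that parsing step, using a hypothetical sample string (the real page's field order and labels are assumed here, not verified against the live site):

```python
from datetime import datetime

# Hypothetical metadata line in the same "a | b | c | d | label:date" shape
# that getDetail() expects from soup.select('p')[2].text
meta = 'Source: News Center | Author: A | Photo: B | Editor: C | Date:2018-04-09 '

parts = meta.split('|')
source, author, editor = parts[0], parts[1], parts[3]
# The last field is 'Date:2018-04-09 '; take the text after the colon
times = parts[4].split(':')[1]
release_time = datetime.strptime(times.strip(), '%Y-%m-%d')
print(release_time)  # 2018-04-09 00:00:00
```

Because the indices are hard-coded, a page whose metadata line has a different number of '|'-separated fields will raise an IndexError or mis-assign fields, so this parsing only holds while the site's template stays unchanged.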
Screenshot: