爬取校园新闻首页的新闻的详情,使用正则表达式,函数抽离
Posted 157 符致伟
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬取校园新闻首页的新闻的详情,使用正则表达式,函数抽离相关的知识,希望对你有一定的参考价值。
# Crawl the campus-news index page of news.gzcc.cn and print, for every
# article listed there, its title, description, publication info, click
# count, link and body text.
import re

import requests
from bs4 import BeautifulSoup

# Index page that lists the campus news articles.
url = "http://news.gzcc.cn/html/xiaoyuanxinwen/"

# Hit-counter endpoint; "{}" is replaced by the numeric id of the article.
COUNT_API = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'

# Captures the part of an article URL between the first "_" and ".html".
NEWS_NUMBER_RE = re.compile(r'_(.*)\.html')

# Seconds to wait for a response before giving up, so a stalled server
# cannot hang the crawl forever.
REQUEST_TIMEOUT = 10

SEPARATOR = '------------------------------------------------------------------------------'


def _fetch_soup(page_url):
    """Download *page_url*, decode it as UTF-8 and return the parsed tree."""
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'html.parser')


def getClickCount(url):
    """Print the click count and the news number of the article at *url*.

    The news number is the text between the first "_" and ".html" in the
    URL. Its last "/"-separated segment is used as the ``id`` parameter of
    the hit-counter API, so each article gets its own count instead of the
    count of one fixed article.

    Args:
        url: Address of a single news article.

    Raises:
        ValueError: If no news number can be extracted from *url*.
    """
    found = NEWS_NUMBER_RE.search(url)
    if found is None:
        raise ValueError(
            'cannot extract a news number from URL: {!r}'.format(url))
    news_number = found.group(1)
    # NOTE(review): presumably the URL ends in "_<mmdd>/<id>.html", so the
    # trailing segment is the numeric id the counter expects -- confirm.
    news_id = news_number.split('/')[-1]

    reply = requests.get(COUNT_API.format(news_id),
                         timeout=REQUEST_TIMEOUT).text
    # Keep whatever follows the last ".html" and peel off the surrounding
    # "('" and "');" characters, leaving only the counter value.
    hitNumber = reply.split('.html')[-1].lstrip("('").rstrip("');")
    print("点击次数:", hitNumber)
    print('新闻编号:', news_number)


def getNewDetail(url):
    """Print the details of every news article listed on the page at *url*.

    Each ``<li>`` containing a ``.news-list-title`` element is treated as
    one news entry. The linked article page is downloaded and its info line
    (``.show-info``) and body (``#content``) are printed together with the
    title and description taken from the list page.

    Args:
        url: Address of the news list page.
    """
    index_page = _fetch_soup(url)
    for news in index_page.select('li'):
        titles = news.select('.news-list-title')
        if not titles:
            # This <li> is not a news entry.
            continue

        # Data available directly on the list page.
        title = titles[0].text
        description = news.select('.news-list-description')[0].text
        href = news.select('a')[0]['href']

        # Data taken from the article page itself.
        article = _fetch_soup(href)
        newinfo = article.select('.show-info')[0].text
        content = article.select('#content')[0].text

        # The info line is split on whitespace into six fields; pad with
        # empty strings so an article lacking some of them does not raise
        # IndexError.
        fields = (newinfo.split() + [''] * 6)[:6]
        date, clock, author, checker, source, photography = fields

        print(SEPARATOR)
        print("文章标题:" + title)
        print("\n文章描述:" + description)
        print("\n文章信息:\n" + date + ' ' + clock + '\n' + author + '\n'
              + checker + '\n' + source + '\n' + photography)
        getClickCount(href)  # click count and news number
        print("\n文章链接:" + href)
        print(content)
        print(SEPARATOR)


if __name__ == "__main__":
    getNewDetail(url)
以上是关于“爬取校园新闻首页的新闻的详情,使用正则表达式,函数抽离”的主要内容。如果未能解决你的问题,请参考以下文章: