1. Use the requests and BeautifulSoup libraries to crawl the campus news home page and extract each item's title, link, body text, and show-info block.
2. Parse the show-info string to get each article's publish time, author, source, photographer, and similar fields.
import requests
from bs4 import BeautifulSoup

# Fetch the campus news list page.
res = requests.get('http://news.gzcc.cn/html/xiaoyuanxinwen/')
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')

# Exploratory snippets, kept for reference:
# get all news titles
# for i in soup.select('.news-list-title'):
#     print(i)
# get all news descriptions
# for j in soup.select('.news-list-description'):
#     print(j)
# get all link targets on the page
# for k in soup.find_all('a'):
#     print(k['href'])
# print(soup.select('.news-list-title')[0].text)

# Only <li> elements that contain a .news-list-title are real news entries.
for news in soup.select('li'):
    if len(news.select('.news-list-title')) > 0:
        t = news.select('.news-list-title')[0].text          # title
        # d = news.select('.news-list-description')[0].text  # description
        a = news.select('a')[0].attrs                         # link attributes
        print(t, a['href'])

        # Follow the link and pull the article body and the show-info line.
        res1 = requests.get(a['href'])
        res1.encoding = 'utf-8'
        soup1 = BeautifulSoup(res1.text, 'html.parser')
        content = soup1.select('#content')[0].text
        show = soup1.select('.show-info')[0].text
        print(show)
        print(content)
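The loop above only prints the raw show-info string; step 2 asks for the individual fields (publish time, author, source, photographer). Below is a minimal parsing sketch, assuming the show-info text contains a 'YYYY-MM-DD HH:MM:SS' timestamp plus labelled parts such as 作者:…, 来源:…, 摄影:…; the actual labels and separators on the page may differ, so adjust the patterns to match what print(show) displays.

import re
from datetime import datetime

def parse_show_info(show):
    # Assumption: show carries a 'YYYY-MM-DD HH:MM:SS' timestamp plus
    # whitespace-separated labelled parts like '作者:xxx 来源:xxx 摄影:xxx'.
    info = {}
    # Publish time: first timestamp found in the string.
    m = re.search(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', show)
    if m:
        info['publish_time'] = datetime.strptime(m.group(), '%Y-%m-%d %H:%M:%S')
    # Labelled fields: accept either a half-width or full-width colon.
    for label, key in (('作者', 'author'), ('来源', 'source'), ('摄影', 'photographer')):
        m = re.search(label + r'[::]\s*(\S+)', show)
        if m:
            info[key] = m.group(1)
    return info

# Example: call it inside the loop, right after show is extracted.
# print(parse_show_info(show))

Calling parse_show_info(show) per article returns a dict, which is easier to print or store than the raw string.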