爬取所有校园新闻

Posted ZJQ-013

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了爬取所有校园新闻相关的知识,希望对你有一定的参考价值。

 1、获取单条新闻的#标题#链接#时间#来源#内容 #点击次数,并包装成一个函数

import requests
import re
from bs4 import BeautifulSoup
from datetime import datetime

def getchick(url):
    """Return the click count of a campus-news article.

    The article id is the last path segment of ``url`` (without ``.html``).
    It is sent to the ``op=count`` endpoint on oa.gzcc.cn, and the number
    inside the last ``html('...')`` call of the response is returned.

    Args:
        url: Article URL of the form
            ``http://news.gzcc.cn/html/<year>/xiaoyuanxinwen_<segment>/<id>.html``.

    Returns:
        The click count as an ``int``.

    Raises:
        ValueError: If ``url`` does not have the expected shape, or the
            response contains no ``html('<digits>')`` call.
    """
    # Raw string with escaped dots; the year is matched generically so that
    # articles from any year are accepted, not only 2017.
    matched = re.match(
        r'http://news\.gzcc\.cn/html/\d{4}/xiaoyuanxinwen_[^/]*/([^/]+)\.html',
        url,
    )
    if matched is None:
        raise ValueError('not a campus-news article URL: {!r}'.format(url))
    news_id = matched.group(1)

    chickurl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(news_id)
    # Without a timeout a stalled connection would block the crawl forever.
    reply = requests.get(chickurl, timeout=10).text

    # Pull the digits out explicitly: str.lstrip/rstrip strip *character
    # sets*, not prefixes/suffixes, and break on trailing whitespace.
    counts = re.findall(r"html\('(\d+)'\)", reply)
    if not counts:
        raise ValueError('no click count in response: {!r}'.format(reply))
    return int(counts[-1])

# Fetch the campus-news list page and parse it.
gzccurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
res = requests.get(gzccurl, timeout=10)  # timeout so a stalled server cannot hang the script
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')

# Walk the <li> elements and print the details of the first one that is a
# news entry (i.e. has a '.news-list-title' child), then stop.
for news in soup.select('li'):
    if not news.select('.news-list-title'):
        continue
    title = news.select('.news-list-title')[0].text  # title
    url = news.select('a')[0]['href']  # link
    time = news.select('.news-list-info')[0].contents[0].text  # publication date text
    # Strip surrounding whitespace so strptime does not fail on padding.
    dt = datetime.strptime(time.strip(), '%Y-%m-%d')
    source = news.select('.news-list-info')[0].contents[1].text  # source
    # article body
    resd = requests.get(url, timeout=10)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    detail = soupd.select('.show-content')[0].text
    chick = getchick(url)
    print(title, chick, url, dt, source, detail)
    break

 

 

2、获取一个新闻列表页的所有新闻的上述详情,并包装成一个函数

import requests
import re
from bs4 import BeautifulSoup
from datetime import datetime

def getchick(url):
    """Return the click count of a campus-news article.

    The article id is the last path segment of ``url`` (without ``.html``).
    It is sent to the ``op=count`` endpoint on oa.gzcc.cn, and the number
    inside the last ``html('...')`` call of the response is returned.

    Args:
        url: Article URL of the form
            ``http://news.gzcc.cn/html/<year>/xiaoyuanxinwen_<segment>/<id>.html``.

    Returns:
        The click count as an ``int``.

    Raises:
        ValueError: If ``url`` does not have the expected shape, or the
            response contains no ``html('<digits>')`` call.
    """
    # Raw string with escaped dots; the year is matched generically so that
    # articles from any year are accepted, not only 2017.
    matched = re.match(
        r'http://news\.gzcc\.cn/html/\d{4}/xiaoyuanxinwen_[^/]*/([^/]+)\.html',
        url,
    )
    if matched is None:
        raise ValueError('not a campus-news article URL: {!r}'.format(url))
    news_id = matched.group(1)

    chickurl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(news_id)
    # Without a timeout a stalled connection would block the crawl forever.
    reply = requests.get(chickurl, timeout=10).text

    # Pull the digits out explicitly: str.lstrip/rstrip strip *character
    # sets*, not prefixes/suffixes, and break on trailing whitespace.
    counts = re.findall(r"html\('(\d+)'\)", reply)
    if not counts:
        raise ValueError('no click count in response: {!r}'.format(reply))
    return int(counts[-1])
def getonepage(url):
    """Print the details of every news entry on one list page.

    The page at ``url`` is downloaded and parsed here, so the function does
    not depend on any module-level ``soup``.  For each ``<li>`` that has a
    ``.news-list-title`` child, the article page is fetched as well and the
    title, click count, link, date, source and body text are printed.

    Args:
        url: Absolute URL of a news list page.
    """
    res = requests.get(url, timeout=10)  # timeout so a stalled server cannot hang the crawl
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('li'):
        titles = news.select('.news-list-title')
        if not titles:
            continue  # <li> elements without a title are not news entries
        title = titles[0].text  # title
        newsurl = news.select('a')[0]['href']  # link (kept separate from the `url` argument)
        info = news.select('.news-list-info')[0]
        time_text = info.contents[0].text  # publication date text
        # Strip surrounding whitespace so strptime does not fail on padding.
        dt = datetime.strptime(time_text.strip(), '%Y-%m-%d')
        source = info.contents[1].text  # source
        # article body
        resd = requests.get(newsurl, timeout=10)
        resd.encoding = 'utf-8'
        soupd = BeautifulSoup(resd.text, 'html.parser')
        detail = soupd.select('.show-content')[0].text
        chick = getchick(newsurl)
        print(title, chick, newsurl, dt, source, detail)


gzccurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'

# The first list page is index.html under the section root.
getonepage(gzccurl + 'index.html')

 

3、获取所有新闻列表页的网址,调用上述函数

import requests
import re
from bs4 import BeautifulSoup
from datetime import datetime

def getdetail(url):
    """Download one news article and return its body text.

    Args:
        url: Absolute URL of the article page.

    Returns:
        The text of the first ``.show-content`` element, or an empty string
        when the page contains no such element.
    """
    resd = requests.get(url, timeout=10)  # timeout so a stalled server cannot hang the crawl
    resd.encoding = 'utf-8'  # force UTF-8 decoding of the response body
    soupd = BeautifulSoup(resd.text, 'html.parser')
    found = soupd.select('.show-content')
    # Indexing [0] on an empty result would abort the whole crawl with an
    # IndexError because of a single article without a content container.
    return found[0].text if found else ''
def getchick(newsurl):
    """Return the click count of a campus-news article.

    The article id is the last path segment of ``newsurl`` (without
    ``.html``).  It is sent to the ``op=count`` endpoint on oa.gzcc.cn, and
    the number inside the last ``html('...')`` call of the response is
    returned.

    Args:
        newsurl: Article URL of the form
            ``http://news.gzcc.cn/html/<year>/xiaoyuanxinwen_<segment>/<id>.html``.

    Returns:
        The click count as an ``int``.

    Raises:
        ValueError: If ``newsurl`` does not have the expected shape, or the
            response contains no ``html('<digits>')`` call.
    """
    # Raw string with escaped dots; the year is matched generically so that
    # articles from any year are accepted, not only 2017.
    matched = re.match(
        r'http://news\.gzcc\.cn/html/\d{4}/xiaoyuanxinwen_[^/]*/([^/]+)\.html',
        newsurl,
    )
    if matched is None:
        raise ValueError('not a campus-news article URL: {!r}'.format(newsurl))
    news_id = matched.group(1)

    chickurl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(news_id)
    # Without a timeout a stalled connection would block the crawl forever.
    reply = requests.get(chickurl, timeout=10).text

    # Pull the digits out explicitly: str.lstrip/rstrip strip *character
    # sets*, not prefixes/suffixes, and break on trailing whitespace.
    counts = re.findall(r"html\('(\d+)'\)", reply)
    if not counts:
        raise ValueError('no click count in response: {!r}'.format(reply))
    return int(counts[-1])
def getonepage(listurl):
    """Collect the details of every news entry on one list page.

    The page at ``listurl`` is downloaded and parsed; for each ``<li>`` that
    has a ``.news-list-title`` child the title, link, date and source are
    read, and the article body and click count are fetched via
    ``getdetail`` and ``getchick``.  Nothing is printed or returned in this
    step (the output line is switched off).

    Args:
        listurl: Absolute URL of a news list page.
    """
    res = requests.get(listurl, timeout=10)  # timeout so a stalled server cannot hang the crawl
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('li'):
        titles = news.select('.news-list-title')
        if not titles:
            continue  # <li> elements without a title are not news entries
        title = titles[0].text  # title
        url = news.select('a')[0]['href']  # link
        info = news.select('.news-list-info')[0]
        time_text = info.contents[0].text  # publication date text
        # Strip surrounding whitespace so strptime does not fail on padding.
        dt = datetime.strptime(time_text.strip(), '%Y-%m-%d')
        source = info.contents[1].text  # source
        # article body and click count
        detail = getdetail(url)
        chick = getchick(url)
        # Output is disabled in this step:
        # print(title, chick, url, dt, source, detail)

gzccurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
res = requests.get(gzccurl, timeout=10)  # timeout so a stalled server cannot hang the script
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')

# The first '.a1' element carries the total number of news items.  Take the
# leading run of digits so any non-digit suffix cannot break int().
# NOTE(review): presumably the text looks like "<count><unit>" - confirm.
total = int(re.search(r'\d+', soup.select('.a1')[0].text).group())
# Ceiling division: 10 items per list page (as the original arithmetic
# assumed), without an extra empty page when total is a multiple of 10.
page = (total + 9) // 10
# Page 1 is index.html; the numbered list pages start at 2.
for i in range(2, page + 1):
    listurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    print(listurl)

4、完成所有校园新闻的爬取工作

import requests
import re
from bs4 import BeautifulSoup
from datetime import datetime

def getdetail(url):
    """Download one news article and return its body text.

    Args:
        url: Absolute URL of the article page.

    Returns:
        The text of the first ``.show-content`` element, or an empty string
        when the page contains no such element.
    """
    resd = requests.get(url, timeout=10)  # timeout so a stalled server cannot hang the crawl
    resd.encoding = 'utf-8'  # force UTF-8 decoding of the response body
    soupd = BeautifulSoup(resd.text, 'html.parser')
    found = soupd.select('.show-content')
    # Indexing [0] on an empty result would abort the whole crawl with an
    # IndexError because of a single article without a content container.
    return found[0].text if found else ''
def getchick(newsurl):
    """Return the click count of a campus-news article.

    The article id is the last path segment of ``newsurl`` (without
    ``.html``).  It is sent to the ``op=count`` endpoint on oa.gzcc.cn, and
    the number inside the last ``html('...')`` call of the response is
    returned.

    Args:
        newsurl: Article URL of the form
            ``http://news.gzcc.cn/html/<year>/xiaoyuanxinwen_<segment>/<id>.html``.

    Returns:
        The click count as an ``int``.

    Raises:
        ValueError: If ``newsurl`` does not have the expected shape, or the
            response contains no ``html('<digits>')`` call.
    """
    # Raw string with escaped dots; the year is matched generically so that
    # articles from any year are accepted, not only 2017.
    matched = re.match(
        r'http://news\.gzcc\.cn/html/\d{4}/xiaoyuanxinwen_[^/]*/([^/]+)\.html',
        newsurl,
    )
    if matched is None:
        raise ValueError('not a campus-news article URL: {!r}'.format(newsurl))
    news_id = matched.group(1)

    chickurl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(news_id)
    # Without a timeout a stalled connection would block the crawl forever.
    reply = requests.get(chickurl, timeout=10).text

    # Pull the digits out explicitly: str.lstrip/rstrip strip *character
    # sets*, not prefixes/suffixes, and break on trailing whitespace.
    counts = re.findall(r"html\('(\d+)'\)", reply)
    if not counts:
        raise ValueError('no click count in response: {!r}'.format(reply))
    return int(counts[-1])
def getonepage(listurl):
    """Print the details of every news entry on one list page.

    The page at ``listurl`` is downloaded and parsed; for each ``<li>`` that
    has a ``.news-list-title`` child the title, click count, link, date,
    source and body text are printed.  The body and click count come from
    ``getdetail`` and ``getchick``.

    Args:
        listurl: Absolute URL of a news list page.
    """
    res = requests.get(listurl, timeout=10)  # timeout so a stalled server cannot hang the crawl
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('li'):
        titles = news.select('.news-list-title')
        if not titles:
            continue  # <li> elements without a title are not news entries
        title = titles[0].text  # title
        url = news.select('a')[0]['href']  # link
        info = news.select('.news-list-info')[0]
        time_text = info.contents[0].text  # publication date text
        # Strip surrounding whitespace so strptime does not fail on padding.
        dt = datetime.strptime(time_text.strip(), '%Y-%m-%d')
        source = info.contents[1].text  # source
        # article body and click count
        detail = getdetail(url)
        chick = getchick(url)
        print(title, chick, url, dt, source, detail)

# The first list page is index.html; the remaining ones are numbered from 2.
getonepage('http://news.gzcc.cn/html/xiaoyuanxinwen/index.html')
gzccurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
res = requests.get(gzccurl, timeout=10)  # timeout so a stalled server cannot hang the script
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')

# The first '.a1' element carries the total number of news items.  Take the
# leading run of digits so any non-digit suffix cannot break int().
# NOTE(review): presumably the text looks like "<count><unit>" - confirm.
total = int(re.search(r'\d+', soup.select('.a1')[0].text).group())
# Ceiling division: 10 items per list page (as the original arithmetic
# assumed), without an extra empty page when total is a multiple of 10.
page = (total + 9) // 10
for i in range(2, page + 1):
    getonepage('http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i))

 

 5、完成自己所选其他主题相应数据的爬取工作

 

import requests
import re
from bs4 import BeautifulSoup
from datetime import datetime

def getdetail(url):
    """Download one news article and return its body text.

    Args:
        url: Absolute URL of the article page.

    Returns:
        The text of the first ``.show-content`` element, or an empty string
        when the page contains no such element.
    """
    resd = requests.get(url, timeout=10)  # timeout so a stalled server cannot hang the crawl
    resd.encoding = 'utf-8'  # force UTF-8 decoding of the response body
    soupd = BeautifulSoup(resd.text, 'html.parser')
    found = soupd.select('.show-content')
    # Indexing [0] on an empty result would abort the whole crawl with an
    # IndexError because of a single article without a content container.
    return found[0].text if found else ''
def getchick(newsurl):
    """Return the click count of an article in the ``meitishijie`` section.

    The article id is the last path segment of ``newsurl`` (without
    ``.html``).  It is sent to the ``op=count`` endpoint on oa.gzcc.cn, and
    the number inside the last ``html('...')`` call of the response is
    returned.

    Args:
        newsurl: Article URL of the form
            ``http://news.gzcc.cn/html/<year>/meitishijie_<segment>/<id>.html``.

    Returns:
        The click count as an ``int``.

    Raises:
        ValueError: If ``newsurl`` does not have the expected shape, or the
            response contains no ``html('<digits>')`` call.
    """
    # Raw string with escaped dots; the year is matched generically so that
    # articles from any year are accepted, not only 2017.
    matched = re.match(
        r'http://news\.gzcc\.cn/html/\d{4}/meitishijie_[^/]*/([^/]+)\.html',
        newsurl,
    )
    if matched is None:
        raise ValueError('not a meitishijie article URL: {!r}'.format(newsurl))
    news_id = matched.group(1)

    chickurl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(news_id)
    # Without a timeout a stalled connection would block the crawl forever.
    reply = requests.get(chickurl, timeout=10).text

    # Pull the digits out explicitly: str.lstrip/rstrip strip *character
    # sets*, not prefixes/suffixes, and break on trailing whitespace.
    counts = re.findall(r"html\('(\d+)'\)", reply)
    if not counts:
        raise ValueError('no click count in response: {!r}'.format(reply))
    return int(counts[-1])
def getonepage(listurl):
    """Print the details of every entry on one ``meitishijie`` list page.

    The page at ``listurl`` is downloaded and parsed; for each ``<li>`` that
    has a ``.news-list-title`` child the title, click count, link, date and
    body text are printed.  The body and click count come from
    ``getdetail`` and ``getchick``.

    Args:
        listurl: Absolute URL of a list page.
    """
    res = requests.get(listurl, timeout=10)  # timeout so a stalled server cannot hang the crawl
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('li'):
        titles = news.select('.news-list-title')
        if not titles:
            continue  # <li> elements without a title are not news entries
        title = titles[0].text  # title
        url = news.select('a')[0]['href']  # link
        time_text = news.select('.news-list-info')[0].contents[0].text  # publication date text
        # Strip surrounding whitespace so strptime does not fail on padding.
        dt = datetime.strptime(time_text.strip(), '%Y-%m-%d')
        # article body and click count
        detail = getdetail(url)
        chick = getchick(url)
        print(title, chick, url, dt, detail)

# The first list page is index.html; the remaining ones are numbered from 2.
getonepage('http://news.gzcc.cn/html/meitishijie/index.html')
gzccurl = 'http://news.gzcc.cn/html/meitishijie/'
res = requests.get(gzccurl, timeout=10)  # timeout so a stalled server cannot hang the script
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')

# The first '.a1' element carries the total number of items.  Take the
# leading run of digits so any non-digit suffix cannot break int().
# NOTE(review): presumably the text looks like "<count><unit>" - confirm.
total = int(re.search(r'\d+', soup.select('.a1')[0].text).group())
# Ceiling division: 10 items per list page (as the original arithmetic
# assumed), without an extra empty page when total is a multiple of 10.
page = (total + 9) // 10
for i in range(2, page + 1):
    getonepage('http://news.gzcc.cn/html/meitishijie/{}.html'.format(i))

 

 

 

以上是关于爬取所有校园新闻的主要内容,如果未能解决你的问题,请参考以下文章

爬取所有校园新闻

爬取所有校园新闻

爬取所有校园新闻

爬取所有校园新闻

爬取所有校园新闻

爬取所有校园新闻