数据结构化与保存

Posted 2020-10-31 087林金龙
tags:
篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了数据结构化与保存相关的知识，希望对你有一定的参考价值。
import requests
from bs4 import BeautifulSoup
import string
import time
import datetime
import re
import pandas



def getNewDetail(d,hist):
    """Download one article page and return its structured details.

    Args:
        d: URL of the article detail page.
        hist: Click-count value, handed unchanged to ``newsDetailItems``.

    Returns:
        The dict produced by ``newsDetailItems`` for this article.

    Raises:
        IndexError: If the page contains no ``.show-info`` element.
    """
    print("详情：")
    # ``head`` is the module-level dict of request headers.
    response = requests.get(d, headers=head)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    info_text = soup.select(".show-info")[0].text
    print()
    return newsDetailItems(info_text, hist, soup)
   

def _field_after_label(text, label, default='未知'):
    """Return the whitespace-delimited value that directly follows *label*."""
    start = text.find(label)
    if start < 0:
        return default
    # The first token starts with the label itself; slice the label off as a
    # prefix. (``str.lstrip(label)`` would strip a *character set* and could
    # eat leading characters of the value.)
    first_token = text[start:].split()[0]
    return first_token[len(label):]


def newsDetailItems(e,hist,soup):
    """Turn the info line and body of an article page into a dict.

    Args:
        e: Text of the page's info line; expected to contain a
            ``YYYY-MM-DD HH:MM:SS`` timestamp and optional ``作者：``,
            ``审核：``, ``来源：``, ``摄影：`` and ``点击：`` labels.
        hist: Click-count value stored under ``'点击'`` when the info line
            contains the ``点击：`` label.
        soup: Parsed page; must support ``select("#content")`` returning a
            sequence whose first item has a ``text`` attribute.

    Returns:
        dict with keys ``'时间'`` (``datetime.datetime``), ``'作者'``,
        ``'审核'``, ``'来源'``, ``'摄影'`` (each ``'未知'`` when the label is
        absent), optionally ``'点击'``, and ``'新闻内容：'``.

    Raises:
        ValueError: If no timestamp can be found in *e*.
        IndexError: If the page has no ``#content`` element.
    """
    news = {}

    # Locate the timestamp by pattern instead of relying on what precedes it.
    stamp = re.search(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', e)
    if stamp is None:
        raise ValueError('no publication timestamp found in %r' % (e,))
    news['时间'] = datetime.datetime.strptime(stamp.group(), '%Y-%m-%d %H:%M:%S')

    # Author, reviewer, source and photographer share one extraction rule.
    for key in ('作者', '审核', '来源', '摄影'):
        news[key] = _field_after_label(e, key + '：')

    # The click count comes from a separate request; it is only recorded when
    # the info line carries the label.
    if '点击：' in e:
        news['点击'] = hist

    news['新闻内容：'] = soup.select("#content")[0].text

    # Five blank lines separate consecutive articles in the console output.
    for _ in range(5):
        print()
    return news

def getClickCount(d):
    """Fetch the click count of one article from the counter endpoint.

    Args:
        d: URL of the article; the news id is the part between
            ``_<digits>/`` and the next ``.`` in it.

    Returns:
        The click count as the string extracted from the endpoint's response.
    """
    # All captures are concatenated; an article URL is expected to yield
    # exactly one id.
    news_id = "".join(re.findall(r'_\d+/(.*?)\.', d, re.S))

    count_url = 'http://oa.gzcc.cn/api.php?op=count&id=' + news_id + '&modelid=80'
    # ``head`` is the module-level dict of request headers.
    response = requests.get(count_url, headers=head)

    # Keep what follows the last ".html" in the response and strip the
    # surrounding "('" and "');" characters, leaving the bare count.
    return response.text.split('.html')[-1].lstrip("(')").rstrip("');")

def newscounter(counter):
    """Return the running news count advanced by one."""
    return counter + 1

def getListPagel(r,counter):
    """Collect every news entry of one list page as structured dicts.

    For each ``li`` that contains a ``.news-list-title`` element this builds
    a summary dict, appends a text rendering of it to the module-level
    ``file_handle`` (which must be open for writing), and fetches the
    article's detail dict via ``getNewDetail``.

    Args:
        r: Response of the list page; only ``r.text`` is read.
        counter: Number of news items collected before this page.

    Returns:
        Tuple ``(counter, summaries, details)``: the updated running count,
        the list of summary dicts and the list of detail dicts of this page.
    """
    soup = BeautifulSoup(r.text, 'html.parser')
    summaries = []
    details = []

    for item in soup.select('li'):
        titles = item.select(".news-list-title")
        if not titles:
            # Not a news entry (navigation or other list markup).
            continue

        info = item.select(".news-list-info")[0]
        news1 = {}
        news1["标题"] = titles[0].text
        news1["时间"] = info.contents[0].text
        news1["来源"] = info.contents[1].text
        news1["链接"] = item.select("a")[0].attrs['href']
        news1["内容概述"] = item.select(".news-list-description")[0].text
        news1["点击"] = getClickCount(news1["链接"]) + ' 次'

        counter = newscounter(counter)
        print("已获取新闻数：" + str(counter))
        print()

        showone = (
            "标题：" + news1["标题"] + '\n'
            + "时间：" + news1["时间"] + '\n'
            + "来源：" + news1["来源"] + '\n'
            + "链接：" + news1["链接"] + '\n'
            + "点击：" + news1["点击"] + '\n'
            + "内容概述：" + '\n' + news1["内容概述"] + '\n\n'
        )
        file_handle.write(showone)

        print()

        summaries.append(news1)
        details.append(getNewDetail(news1["链接"], news1["点击"]))

    return counter, summaries, details



# Request headers that make the crawler look like a desktop browser.
head = {}
head['user-agent']='Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'

# Number of news titles found on the last fetched page; the loop stops at 0.
temp=1

# List page to start from.
i=230

# Running count of collected news items.
counter=0

# Start with an empty output file; opening in 'w' mode already truncates it.
with open('1.txt', mode='w', encoding='utf-8'):
    pass

# Summary dicts of all pages.
newslist1=[]

# Detail dicts of all pages.
newslist2=[]

# Keep fetching list pages until one contains no ".news-list-title" element.
while temp>0:
    page=i

    # Page 1 lives at the directory URL; every other page has its own file.
    if i==1:
        MainLink="http://news.gzcc.cn/html/xiaoyuanxinwen/"
    else:
        MainLink="http://news.gzcc.cn/html/xiaoyuanxinwen/"+str(page)+".html"
    r=requests.get(MainLink,headers=head)
    r.encoding='utf-8'
    soup=BeautifulSoup(r.text,'html.parser')

    # getListPagel writes to the module-level name ``file_handle``; the with
    # block closes the file even when a page fails half-way.
    with open('1.txt',mode='a',encoding='utf-8') as file_handle:
        counter,page_summaries,page_details=getListPagel(r,counter)

    newslist1.extend(page_summaries)
    newslist2.extend(page_details)

    print(newslist1)

    temp=len(soup.select(".news-list-title"))
    print("已爬取页数"+str(page))
    i=i+1
    print()
    print("----------------------------------------------------------------------------------------------------------------------")
    print()

# Build a DataFrame from the summary dicts. Naming the columns explicitly
# keeps them present (and in this order) even when nothing was crawled.
df=pandas.DataFrame(newslist1,columns=["标题","时间","来源","链接","内容概述","点击"])
print(df)

# Save the extracted data to a CSV file.
df.to_csv("1.csv")

# The click column holds strings such as "123 次"; pull out the digits once
# so the count can be compared numerically (unparsable values become NaN).
clicks=pandas.to_numeric(df['点击'].str.extract(r'(\d+)',expand=False),errors='coerce')

# First 6 rows with click count, title and source.
print(df[['点击', '标题', '来源']].head(6))

# News published by '学校综合办' with more than 3000 clicks.
print(df[(clicks > 3000) & (df['来源'] == '学校综合办')])

# News published by '国际学院' or '学生工作处'.
print(df[df['来源'].isin(['国际学院', '学生工作处'])])
从230页开始爬取：
由于是从第230页开始爬取，所以并无满足后两个条件的新闻：
以上是关于数据结构化与保存的主要内容，如果未能解决你的问题，请参考以下文章