CSIC_716_20191028

Posted csic716

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了CSIC_716_20191028相关的知识,希望对你有一定的参考价值。

 

1、爬取小破站的弹幕

2、展示爬取内容

 

打开视频网页,按照之前教的方法找到 cid 和请求头(header)。

import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Script: download the danmaku (bullet-comment) XML of one video, keep only
# the Chinese text of every comment and save it to sign.txt.

# Request headers. The User-Agent value can be copied from the
# "Request Headers" panel of the browser's Network tab.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
    ),
}

# Comment feed for the video whose cid is 125507930; change the number to
# scrape another video.
url = 'http://comment.bilibili.com/125507930.xml'

response = requests.get(url=url, headers=header)
response.raise_for_status()  # stop early instead of parsing an error page
response.encoding = response.apparent_encoding
data = response.text

soup = BeautifulSoup(data, 'lxml')
dlist = soup.find_all('d')  # every <d> element carries one comment

# One record per comment. The explicit column list keeps the '弹幕' column
# present even when no <d> element was found.
d_list = [{'弹幕': i.text} for i in dlist]
df = pd.DataFrame(d_list, columns=['弹幕'])  # a pandas DataFrame, not a list

# Compiled once, outside the loop: matches runs of Chinese characters.
pat = re.compile(r'[一-龥]+')

with open('sign.txt', 'w', encoding='utf-8') as f:
    for p in df['弹幕'].values:
        filter_data = pat.findall(p)
        # One comment per line so that words from neighbouring comments are
        # not glued together in the output file.
        f.write(" ".join(filter_data) + "\n")

------------------------------------------------------------------------------------------------------------------------------------------------------

利用上面获得的文件 sign.txt进行展示

import jieba
import matplotlib.pyplot as plt
from imageio import imread
from wordcloud import WordCloud

# Script: read the text saved in sign.txt, segment it with jieba and render
# it as a word cloud image.

with open('sign.txt', encoding='utf-8') as f:
    data = f.read()

# WordCloud splits on whitespace, so join the segmented words with spaces.
result = " ".join(jieba.lcut(data))

# Placeholder file name: replace with the image that gives the cloud its shape.
mask_color = imread('XXXX.jpg')

wc = WordCloud(
    font_path='font的路径',  # placeholder: path of a font that contains Chinese glyphs
    mask=mask_color,
    width=1920,   # example size; adjust as needed
    height=1080,
)
wc.generate(result)
wc.to_file('xxxx.png')  # placeholder output file name

plt.imshow(wc)
plt.show()

 

 

----------------------------------------------------以下为正式代码将两者合并--------------------------------------------------------------------------------------------------

 

如果不生成中间文件,爬完网页直接生成图片,代码合并,如下所示

 1 #coding:utf-8
 2 import requests
 3 import jieba
 4 import pandas as pd
 5 import re
 6 import matplotlib.pyplot as plt
 7 from bs4 import BeautifulSoup
 8 from imageio import imread
 9 from wordcloud import WordCloud
10 header={
11 User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36
12 }
13 url=http://comment.bilibili.com/122593266.xml‘  #换一下数字的cid即可
14 response = requests.get(url=url,headers=header)
15 response.encoding=response.apparent_encoding
16 data=response.text
17 suop=BeautifulSoup(data,lxml)
18 list_filter=suop.find_all(d)
19 plist=[]
20 for i in list_filter:
21     danmu={}
22     danmu[弹幕]= i.text
23     plist.append(danmu)
24 df=pd.DataFrame(plist)
25 reslist=[]
26 for p in df[弹幕].values:
27     txtfilter= re.compile(r[一-龥]+)
28     res=re.findall(pattern=txtfilter,string=p)
29     result = " ".join(res)
30     reslist.append(result)
31 result= " ".join(reslist)
32 finalResult=" ".join(jieba.lcut(result))
33 mask_color=imread(五角星.jpg)  #图片可以随意替换,放在project中即可
34 wc=WordCloud(
35     font_path=rC:WindowsFontsSTLITI.TTF,
36     mask=mask_color,
37     width=1920,
38     height=1080,
39     background_color=white
40 )
41 wc.generate(finalResult)
42 wc.to_file(hunt.png)
43 plt.imshow(wc)
44 plt.show( )

 

以上是关于CSIC_716_20191028的主要内容,如果未能解决你的问题,请参考以下文章:

CSIC_716_20191112闭包函数和装饰器

CSIC_716_20191101

CSIC_716_20191108

CSIC_716_20191216事务视图触发器存储过程索引

CSIC_716_20191213exec内置函数元类pymysql模块

CSIC_716_20191105