基于新浪微博的日志数据分析
Posted 一加六
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了基于新浪微博的日志数据分析相关的知识,希望对你有一定的参考价值。
[{"beCommentWeiboId":"","beForwardWeiboId":"","catchTime":"1387159495","commentCount":"1419","content":"分享图片","createTime":"1386981067","info1":"","info2":"","info3":"","mlevel":"","musicurl":[],"pic_list":["http://ww3.sinaimg.cn/thumbnail/40d61044jw1ebixhnsiknj20qo0qognx.jpg"],"praiseCount":"5265","reportCount":"1285","source":"iPad客户端","userId":"1087770692","videourl":[],"weiboId":"3655325888057474","weiboUrl":"http://weibo.com/1087770692/AndhixO7g"}]
[{"beCommentWeiboId":"","beForwardWeiboId":"","catchTime":"1387159495","commentCount":"91","content":"行走:#去远方发现自己#@费勇主编,跨界明星联合执笔,分享他们观行思趣的心发现、他们的成长与心路历程,当当网限量赠送出品人@陈坤抄诵印刷版《心经》,赠完不再加印哦!详情请戳:http://t.cn/8k622Sj","createTime":"1386925242","info1":"","info2":"","info3":"","mlevel":"","musicurl":[],"pic_list":["http://ww4.sinaimg.cn/thumbnail/b2336177jw1ebi6j4twk7j20m80tkgra.jpg"],"praiseCount":"1","reportCount":"721","source":"","userId":"2989711735","videourl":[],"weiboId":"3655091741442099","weiboUrl":"http://weibo.com/2989711735/An7bE639F"}]
beCommentWeiboId 是否评论
beForwardWeiboId 是否是转发微博
catchTime 抓取时间
commentCount 评论次数
content 内容
createTime 创建时间
info1 信息字段1
info2 信息字段2
info3 信息字段3
mlevel 含义不确定
musicurl 音乐链接
pic_list 照片列表(可以有多个)
praiseCount 点赞人数
reportCount 转发人数
source 数据来源
userId 用户id
videourl 视频链接
weiboId 微博id
weiboUrl 微博网址
在hadoop创建目录
# Create the HDFS directory that the Hive external table's location ('/weibo') points at.
# An absolute path is used so the directory does not end up under the user's HDFS home.
hadoop fs -mkdir -p /weibo
# Upload the local log files; -put needs an explicit destination directory.
hadoop fs -put ./weibo/* /weibo
# List the directory to confirm the upload.
hadoop fs -ls /weibo
hive创建库
-- Create the database on first run and make it the current one.
create database if not exists weibo;
use weibo;

-- External table over the files under /weibo; the raw log text is exposed
-- through a single string column named json.
create external table weibo (
    json string
)
location '/weibo';

-- Peek at a few raw rows.
select *
from weibo
limit 3;
处理json格式数据使用到get_json_object()和json_tuple(),两者都只在最外层是花括号 { } 时才能正常解析;最外层是 [ ] 时不能解析。
当最外层是 [ ] 时,可使用substring方法去掉最外层的 [ ]:
-- substring is 1-based: starting at position 2 and taking length(json) - 2
-- characters drops both the leading '[' and the trailing ']'.
select get_json_object(substring(json, 2, length(json) - 2), '$.userId') as user_id
from weibo
limit 1;
统计需求
微博总量和独立用户数
-- Total number of weibo records.
select count(*)
from weibo;

-- Number of distinct users.
-- The inner query strips the enclosing [ ] (first and last character) so that
-- get_json_object receives the bare JSON object.
select count(distinct get_json_object(a.j, '$.userId'))
from (
    select substring(json, 2, length(json) - 2) as j
    from weibo
) a;
用户所有微博被转发的总数,输出前3个用户
使用json_tuple提取多个字段
-- Sum of reportCount (forwards) over all weibos of each user; top 3 users.
select
    b.id,
    -- json_tuple yields strings, so cast explicitly before summing.
    sum(cast(b.cnt as bigint)) as bsum
from (
    -- json_tuple extracts several top-level keys in one pass.
    select json_tuple(a.j, 'userId', 'reportCount') as (id, cnt)
    from (
        -- Drop the enclosing [ ] (first and last character).
        select substring(json, 2, length(json) - 2) as j
        from weibo
    ) a
) b
group by b.id
order by bsum desc
limit 3;
被转发次数最多的前3条微博,输出用户id
-- The 3 weibos with the highest reportCount (forwards), with their user id.
select
    get_json_object(a.j, '$.userId') as id,
    -- Cast so the ordering is numeric rather than lexicographic.
    cast(get_json_object(a.j, '$.reportCount') as int) as cnt
from (
    -- Drop the enclosing [ ] (first and last character).
    select substring(json, 2, length(json) - 2) as j
    from weibo
) a
order by cnt desc
limit 3;
每个用户发布的微博总数,存储到临时表
-- Result table: one row per user with the number of weibos they posted.
-- Stored as tab-separated text.
create table weibo_uid_wbcnt (
    userid string,
    wbcnt int
)
row format delimited
fields terminated by '\t';

-- Recompute the per-user counts, replacing any previous contents.
insert overwrite table weibo_uid_wbcnt
select
    get_json_object(a.j, '$.userId'),
    count(1)
from (
    -- Drop the enclosing [ ] (first and last character).
    select substring(json, 2, length(json) - 2) as j
    from weibo
) a
group by get_json_object(a.j, '$.userId');

-- Spot-check the result.
select * from weibo_uid_wbcnt limit 10;
统计带图片的微博数
-- Count the weibos whose pic_list contains at least one http URL.
select count(1)
from (
    -- Drop the enclosing [ ] (first and last character).
    select substring(json, 2, length(json) - 2) as body
    from weibo
) w
where get_json_object(w.body, '$.pic_list') like '%http%';
统计使用iphone发微博的独立用户数
-- Distinct users with at least one weibo whose source mentions "iphone"
-- (compared in lower case, so the match ignores case).
select count(distinct get_json_object(w.body, '$.userId'))
from (
    -- Drop the enclosing [ ] (first and last character).
    select substring(json, 2, length(json) - 2) as body
    from weibo
) w
where lower(get_json_object(w.body, '$.source')) like '%iphone%';
微博中评论次数小于1000的用户id和数据来源,放入视图
-- View of (user id, source) for every weibo with fewer than 1000 comments.
create view weibo_view as
select
    get_json_object(a.j, '$.userId') as id,
    get_json_object(a.j, '$.source') as source
from (
    -- Drop the enclosing [ ] (first and last character).
    select substring(json, 2, length(json) - 2) as j
    from weibo
) a
-- get_json_object returns a string; cast it so the comparison is explicitly numeric.
where cast(get_json_object(a.j, '$.commentCount') as int) < 1000;

-- Spot-check the view.
select * from weibo_view limit 10;
统计上条视图中数据来源“iPad客户端”的用户数目
-- Distinct user ids in weibo_view whose source is exactly 'iPad客户端'.
select count(distinct id) as cnt
from weibo_view
where source = 'iPad客户端';
将微博的点赞数和转发数求和,降序,取前10条。
public class DemoTest1 extends UDF
public Integer evaluate(Integer num1,Integer num2)
try
return num1+num2;
catch (Exception e)
return null;
-- Register the DemoTest1 class as the session-scoped function wb.
-- NOTE(review): presumably the jar containing DemoTest1 was added to the session
-- beforehand; that step is not shown here.
create temporary function wb as 'DemoTest1';

-- praiseCount + reportCount per weibo, highest 10 sums.
select
    wb(
        cast(get_json_object(w.body, '$.praiseCount') as int),
        cast(get_json_object(w.body, '$.reportCount') as int)
    ) as cnt
from (
    -- Drop the enclosing [ ] (first and last character).
    select substring(json, 2, length(json) - 2) as body
    from weibo
) w
order by cnt desc
limit 10;
用户微博内容中出现iphone关键词的最大次数
public class DemoTest2 extends UDF
public int evaluate(String content,String word)
int count = 0;
if(content != null&&content.length()>0)
String[] array = content.split(word);
count = array.length-1;
return count;
-- Register the DemoTest2 class as the session-scoped function wcount.
-- NOTE(review): presumably the jar containing DemoTest2 was added to the session
-- beforehand; that step is not shown here.
create temporary function wcount as 'DemoTest2';

-- For each user, the largest number of "iphone" mentions in any single weibo; top 10 users.
select
    b.id,
    max(b.cnt) as cn
from (
    select
        get_json_object(a.j, '$.userId') as id,
        -- Lower-case the content so "iPhone" / "IPHONE" are counted as well.
        wcount(lower(get_json_object(a.j, '$.content')), 'iphone') as cnt
    from (
        -- Drop the enclosing [ ] (first and last character).
        select substring(json, 2, length(json) - 2) as j
        from weibo
    ) a
) b
group by b.id
order by cn desc
limit 10;
以上是关于基于新浪微博的日志数据分析的主要内容,如果未能解决你的问题,请参考以下文章