Creating ODS (source-aligned) data tables in the Hive warehouse
The data analysis for this project runs on a Hadoop cluster and relies mainly on the Hive data warehouse tool. The data that has been collected and preprocessed therefore needs to be loaded into the Hive warehouse for the subsequent mining and analysis.
Creating the raw data tables
-- Create the ODS (source-aligned) table ods_weblog_origin in the Hive warehouse
drop table if exists shizhan.ods_weblog_origin;
create table shizhan.ods_weblog_origin(
  valid string,
  remote_addr string,
  remote_user string,
  time_local string,
  request string,
  status string,
  body_bytes_sent string,
  http_referer string,
  http_user_agent string)
partitioned by (datestr string)
row format delimited fields terminated by '\001';
ODS table for the pageviews log model: ods_click_pageviews
drop table if exists ods_click_pageviews;
create table ods_click_pageviews(
  session string,
  remote_addr string,
  remote_user string,
  time_local string,
  request string,
  visit_step string,
  page_staylong string,
  http_referer string,
  http_user_agent string,
  body_bytes_sent string,
  status string)
partitioned by (datestr string)
row format delimited fields terminated by '\001';
Table for the visit log model: click_stream_visit
drop table if exists click_stream_visit;
create table click_stream_visit(
  session string,
  remote_addr string,
  inTime string,
  outTime string,
  inPage string,
  outPage string,
  referal string,
  pageVisits int)
partitioned by (datestr string);
Creating the time dimension table
drop table if exists dim_time;
create table dim_time(
  year string,
  month string,
  day string,
  hour string)
row format delimited fields terminated by ',';
Loading data
Load the cleaned result data into the ODS table ods_weblog_origin:

load data inpath '/weblog/preprocessed/16-02-24-16/' overwrite into table ods_weblog_origin partition(datestr='2013-09-18');
0: jdbc:hive2://localhost:10000> show partitions ods_weblog_origin;
+-------------------+--+
|     partition     |
+-------------------+--+
| timestr=20151203  |
+-------------------+--+

0: jdbc:hive2://localhost:10000> select count(*) from ods_weblog_origin;
+--------+--+
|  _c0   |
+--------+--+
| 11347  |
+--------+--+
Load the pageviews log model data into the ods_click_pageviews table:

0: jdbc:hive2://hdp-node-01:10000> load data inpath '/weblog/clickstream/pageviews' overwrite into table ods_click_pageviews partition(datestr='2013-09-18');

0: jdbc:hive2://hdp-node-01:10000> select count(1) from ods_click_pageviews;
+------+--+
| _c0  |
+------+--+
| 66   |
+------+--+
Load the visit log model data into the click_stream_visit table:

0: jdbc:hive2://mini1:10000> load data local inpath '/weblog/clickstream/visits' overwrite into table click_stream_visit partition(datestr='2013-09-18');
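As with the other tables, a quick row-count check can be run after the load; a minimal sketch (the actual count depends on your own data):

-- Sanity check on the loaded visit partition
select count(*) from click_stream_visit where datestr='2013-09-18';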
ODS-layer detail wide table
Requirements overview
The entire analysis follows the layered structure of a data warehouse. In general, intermediate tables are first derived from the ODS raw data (for example, to make later analysis easier, semi-structured fields in the raw data such as the time and URL are extracted into structured columns and the various fields are broken down in detail, forming a detail table), and the various metrics are then computed on top of these intermediate tables.
ETL implementation
Create the detail table ods_weblog_detail (source: ods_weblog_origin; target: ods_weblog_detail)
drop table ods_weblog_detail;
create table ods_weblog_detail(
  valid string,            -- valid flag
  remote_addr string,      -- source IP
  remote_user string,      -- user identifier
  time_local string,       -- full access time
  daystr string,           -- access date
  timestr string,          -- access time
  month string,            -- access month
  day string,              -- access day
  hour string,             -- access hour
  request string,          -- requested URL
  status string,           -- response code
  body_bytes_sent string,  -- bytes sent
  http_referer string,     -- referer URL
  ref_host string,         -- referer host
  ref_path string,         -- referer path
  ref_query string,        -- referer query string
  ref_query_id string,     -- referer query id value
  http_user_agent string   -- client user agent
)
partitioned by(datestr string);
-- Extract the referer URL into the intermediate table t_ods_tmp_referurl
-- (split the referer URL into host, path, query and query id)
drop table if exists t_ods_tmp_referurl;
create table t_ods_tmp_referurl as
SELECT a.*, b.*
FROM ods_weblog_origin a
LATERAL VIEW parse_url_tuple(regexp_replace(http_referer, "\"", ""),
  'HOST', 'PATH', 'QUERY', 'QUERY:id') b as host, path, query, query_id;
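To illustrate what parse_url_tuple extracts, here is a minimal standalone sketch with a made-up referer value (the URL below is purely hypothetical):

-- Hypothetical referer value, just to show the four extracted fields
select b.host, b.path, b.query, b.query_id
from (select 'http://www.example.com/about?id=123' as url) a
lateral view parse_url_tuple(a.url, 'HOST', 'PATH', 'QUERY', 'QUERY:id') b
  as host, path, query, query_id;
-- expected: www.example.com   /about   id=123   123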
-- Extract and transform the time_local field into the intermediate detail table t_ods_tmp_detail
drop table if exists t_ods_tmp_detail;
create table t_ods_tmp_detail as
select b.*,
  substring(time_local,0,10) as daystr,
  substring(time_local,11) as tmstr,
  substring(time_local,5,2) as month,
  substring(time_local,8,2) as day,
  substring(time_local,11,2) as hour
from t_ods_tmp_referurl b;
The statements above can be combined and rewritten as:
insert into table zs.ods_weblog_detail partition(datestr='2013-09-18')
select c.valid, c.remote_addr, c.remote_user, c.time_local,
  substring(c.time_local,0,10) as daystr,
  substring(c.time_local,12) as tmstr,
  substring(c.time_local,6,2) as month,
  substring(c.time_local,9,2) as day,
  substring(c.time_local,11,3) as hour,
  c.request, c.status, c.body_bytes_sent, c.http_referer,
  c.ref_host, c.ref_path, c.ref_query, c.ref_query_id, c.http_user_agent
from
  (SELECT a.valid, a.remote_addr, a.remote_user, a.time_local,
     a.request, a.status, a.body_bytes_sent, a.http_referer, a.http_user_agent,
     b.ref_host, b.ref_path, b.ref_query, b.ref_query_id
   FROM zs.ods_weblog_origin a
   LATERAL VIEW parse_url_tuple(regexp_replace(http_referer, "\"", ""),
     'HOST', 'PATH', 'QUERY', 'QUERY:id') b as ref_host, ref_path, ref_query, ref_query_id) c;
0: jdbc:hive2://localhost:10000> show partitions ods_weblog_detail;
+---------------------+--+
|      partition      |
+---------------------+--+
| dd=18%2FSep%2F2013  |
+---------------------+--+
1 row selected (0.134 seconds)
Statistical analysis
Note: every metric can be crossed with each dimension table to produce statistics along every dimension.
Due to space constraints, the cross-join code and its accompanying comments can be found in the project's source code files.
To speed up front-end display, the results of each metric along each dimension are pre-computed and stored in MySQL.
Prepare the dimension data in advance and create the corresponding dimension tables in the Hive warehouse, for example:
Time dimension table:
create table v_time(
  year string,
  month string,
  day string,
  hour string)
row format delimited fields terminated by ',';

load data local inpath '/home/hadoop/v_time.txt' into table v_time;
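A few hypothetical rows of v_time.txt, only to illustrate the comma-delimited layout expected by the table above (year,month,day,hour); the actual file contents and value formats depend on your own project:

2013,09,18,06
2013,09,18,07
2013,09,18,08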
In practice, the exact metrics to compute are usually requested by the data-consuming departments, and new requirements keep arriving. The following are some typical metrics used in website traffic analysis.
PV statistics
Total PV across multiple dimensions
1. Time dimension
-- Compute PVs for a given hour
select count(*), month, day, hour
from dw_click.ods_weblog_detail
group by month, day, hour;
-- Compute PVs for each hour of the processing batch (one day)
drop table dw_pvs_hour;
create table dw_pvs_hour(
  month string,
  day string,
  hour string,
  pvs bigint)
partitioned by(datestr string);
insert into table dw_pvs_hour partition(datestr='2016-03-18')
select a.month as month, a.day as day, a.hour as hour, count(1) as pvs
from ods_weblog_detail a
where a.datestr='2016-03-18'
group by a.month, a.day, a.hour;
Alternatively, join with the time dimension table (see the sketch below).
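The original query for this variant is not included here; a minimal sketch, assuming dim_time has been loaded and that its month/day/hour values match those extracted into ods_weblog_detail, might look like this:

-- Hour-level PV count driven by the dim_time dimension table (a sketch)
insert into table dw_pvs_hour partition(datestr='2016-03-18')
select a.month as month, a.day as day, a.hour as hour, count(1) as pvs
from dim_time a
join ods_weblog_detail b
  on a.month=b.month and a.day=b.day and a.hour=b.hour
where b.datestr='2016-03-18'
group by a.month, a.day, a.hour;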
Dimension: day
drop table dw_pvs_day;
create table dw_pvs_day(
  pvs bigint,
  month string,
  day string);

insert into table dw_pvs_day
select count(1) as pvs, a.month as month, a.day as day
from dim_time a
join ods_weblog_detail b
  on b.dd='18/Sep/2013' and a.month=b.month and a.day=b.day
group by a.month, a.day;
-- Alternatively, aggregate from the hourly results computed earlier
insert into table dw_pvs_day
select sum(pvs) as pvs, month, day
from dw_pvs_hour
group by month, day
having day='18';
Dimension: month
drop table t_display_pv_month;
create table t_display_pv_month(
  pvs bigint,
  month string);

insert into table t_display_pv_month
select count(*) as pvs, a.month
from t_dim_time a
join t_ods_detail_prt b on a.month=b.month
group by a.month;
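By analogy with the day-level alternative above, the month total could also be derived from the day-level results instead of re-joining the detail table; a sketch, assuming dw_pvs_day has already been populated:

-- Month-level PVs aggregated from the day-level results (a sketch)
insert into table t_display_pv_month
select sum(pvs) as pvs, month
from dw_pvs_day
group by month;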
2. Total PV by terminal dimension
Note: first explore the terminal (user agent) types present in the data
select distinct(http_user_agent)
from ods_weblog_detail
where http_user_agent like '%Mozilla%'
limit 200;
Terminal dimension: uc
drop table t_display_pv_terminal_uc;
create table t_display_pv_terminal_uc(
  pvs bigint,
  mm string,
  dd string,
  hh string);
Terminal dimension: chrome
drop table t_display_pv_terminal_chrome;
create table t_display_pv_terminal_chrome(
  pvs bigint,
  mm string,
  dd string,
  hh string);
Terminal dimension: safari
drop table t_display_pv_terminal_safari;
create table t_display_pv_terminal_safari(
  pvs bigint,
  mm string,
  dd string,
  hh string);
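The insert statements that populate these terminal tables are not shown in the original; a minimal sketch for the chrome table, assuming simple substring matching on http_user_agent (real user-agent classification is usually more involved), might be:

-- Hour-level PVs for Chrome clients (a sketch; the LIKE filter is an assumption)
insert into table t_display_pv_terminal_chrome
select count(1) as pvs, month as mm, day as dd, hour as hh
from ods_weblog_detail
where http_user_agent like '%Chrome%'
group by month, day, hour;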