Hive拉链表实现
Posted lucas-zhao
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Hive拉链表实现相关的知识,希望对你有一定的参考价值。
拉链表测试:
有如下测试数据
--2019/12/1号订单的全量数据 id status create_time operation_time 1 待支付 2019-12-01 2 待支付 2019-12-01 3 已支付 2019-12-01 --2019/12/2号订单的全量数据 id status create_time operation_time 1 待支付 2019-12-01 2 已支付 2019-12-01 2019-12-02 3 已支付 2019-12-01 4 待支付 2019-12-02 5 已支付 2019-12-02
--创建订单表 drop table if exists order_info; create table order_info( id int, status string, create_time string, operate_time string ) partitioned by(dt string) row format delimited fields terminated by ‘ ‘;
--加载12/1的数据 load data local inpath "/opt/data/order_info1" into table chain.order_info partition(dt=‘2019-12-01‘); --加载12/2的数据 load data local inpath "/opt/data/order_info2" into table chain.order_info partition(dt=‘2019-12-02‘);
--创建拉链表 drop table if exists order_info_chain; create table order_info_chain( id int, status string, create_time string, operate_time string, start_time string, end_time string ) row format delimited fields terminated by ‘ ‘; --初始化拉链表,加载12/1号的数据 insert overwrite table order_info_chain select id, status, create_time, operate_time, ‘2019-12-01‘, ‘9999-99-99‘ from order_info where dt=‘2019-12-01‘;
--订单变化表 create table order_change( id string, status string, create_time string, operate_time string ) partitioned by (dt string); --灌入12/2变化和新增的数据,通过create_time,create_time判断 insert overwrite table order_change partition(dt=‘2019-12-02‘) select id, status, create_time, operate_time from order_info where create_time=‘2019-12-02‘ or operate_time=‘2019-12-02‘;
--创建临时拉链表 create table tmp_chain( id string, status string, create_time string COMMENT ‘创建时间‘, operate_time string COMMENT ‘修改时间‘, start_time string COMMENT ‘有效开始时间‘, end_time string COMMENT ‘有效结束时间‘ );
--向临时拉链表导入数据 insert overwrite table tmp_chain select * from ( select id, status, create_time, operate_time, ‘2019-12-02‘ as start_time, ‘9999-99-99‘ as end_time from order_change where dt=‘2019-12-02‘ union all --修改发生过更新的记录的end_time为前一天 select orch.id, orch.status, orch.create_time, orch.operate_time, orch.start_time, if(ch.id is null, orch.end_time, date_add(ch.dt,-1)) as end_time from order_info_chain orch left join (select * from order_change where dt=‘2019-12-02‘) ch on orch.id = ch.id and orch.end_time=‘9999-99-99‘ ) t order by t.id,t.start_time;
--把临时表覆盖给拉链表 insert overwrite table order_info_chain select * from tmp_chain;
以上是关于Hive拉链表实现的主要内容,如果未能解决你的问题,请参考以下文章