拉链表

Posted by yin-fei

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了拉链表相关的知识,希望对你有一定的参考价值。

1.数据准备

-- Source table holding one full member snapshot per statis_date partition.
-- Hive DDL requires COMMENT text to be a quoted string literal.
create table sospdm.tmp_ods_user
(
     cust_num   string comment '会员编码'
    ,mbl_phone  string comment '会员手机号'
)partitioned by (statis_date string comment '统计时间')
stored as rcfile
;

-- Load four daily sample snapshots (20170101 .. 20170104) into tmp_ods_user.
-- statis_date is the last select column, so it drives the dynamic partition;
-- dynamic partitioning in nonstrict mode is enabled for that reason.
-- All values are written as quoted string literals: the target columns are
-- strings, and a bare 001 is an integer literal that would be stored as '1'.
-- NOTE(review): sospdm.dual is presumably a one-row helper table - confirm.
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table sospdm.tmp_ods_user partition (statis_date)
select '001' as cust_num,'111111' as mbl_phone,'20170101' as statis_date from sospdm.dual union all
select '002' as cust_num,'222222' as mbl_phone,'20170101' as statis_date from sospdm.dual union all
select '003' as cust_num,'333333' as mbl_phone,'20170101' as statis_date from sospdm.dual union all
select '004' as cust_num,'444444' as mbl_phone,'20170101' as statis_date from sospdm.dual union all

select '001' as cust_num,'111111' as mbl_phone,'20170102' as statis_date from sospdm.dual union all
select '002' as cust_num,'233333' as mbl_phone,'20170102' as statis_date from sospdm.dual union all
select '003' as cust_num,'333333' as mbl_phone,'20170102' as statis_date from sospdm.dual union all
select '004' as cust_num,'432432' as mbl_phone,'20170102' as statis_date from sospdm.dual union all
select '005' as cust_num,'555555' as mbl_phone,'20170102' as statis_date from sospdm.dual union all

select '001' as cust_num,'111111' as mbl_phone,'20170103' as statis_date from sospdm.dual union all
select '002' as cust_num,'233333' as mbl_phone,'20170103' as statis_date from sospdm.dual union all
select '003' as cust_num,'333333' as mbl_phone,'20170103' as statis_date from sospdm.dual union all
select '004' as cust_num,'654321' as mbl_phone,'20170103' as statis_date from sospdm.dual union all
select '005' as cust_num,'115115' as mbl_phone,'20170103' as statis_date from sospdm.dual union all
select '006' as cust_num,'666666' as mbl_phone,'20170103' as statis_date from sospdm.dual union all

select '001' as cust_num,'111111' as mbl_phone,'20170104' as statis_date from sospdm.dual union all
select '002' as cust_num,'233333' as mbl_phone,'20170104' as statis_date from sospdm.dual union all
select '003' as cust_num,'333333' as mbl_phone,'20170104' as statis_date from sospdm.dual union all
select '004' as cust_num,'654321' as mbl_phone,'20170104' as statis_date from sospdm.dual union all
select '005' as cust_num,'115115' as mbl_phone,'20170104' as statis_date from sospdm.dual union all
select '006' as cust_num,'666767' as mbl_phone,'20170104' as statis_date from sospdm.dual;

2.拉链逻辑

-- Full-load zipper (history chain) table.
-- Each row is one version of a member, starting at start_date; the table is
-- partitioned by end_date, and the load scripts use 99991231 for open rows.
-- Hive DDL requires COMMENT text to be a quoted string literal.
create table sospdm.tmp_dwd_his
(
     cust_num string comment '会员编码'
    ,mbl_phone string comment '手机号'
    ,start_date string comment '开始时间'
)partitioned by (end_date string comment '结束时间')
;

-- Initialise the zipper table from the 20170101 snapshot: every member gets
-- one open row (start_date = 20170101, end_date = 99991231).
-- The third column is aliased start_date to match the target column it fills,
-- and date values are quoted because the columns are strings.
insert overwrite table sospdm.tmp_dwd_his partition(end_date)
select
     cust_num
    ,mbl_phone
    ,'20170101' as start_date
    ,'99991231' as end_date
from sospdm.tmp_ods_user
where statis_date = '20170101'
;

-- Daily zipper refresh for ${statis_date}.
-- Compares the chain rows valid on ${statis_date} (t2) with that day's full
-- snapshot (t1) and rewrites the affected end_date partitions with three sets:
--   1. new members and members whose phone changed -> a new open row
--   2. members whose phone changed                 -> the old row, closed at ${statis_date}
--   3. members whose phone is unchanged            -> the existing row as-is
-- The closed row must carry the OLD phone (t2.mbl_phone); taking it from t1
-- would overwrite the history with today's value.
-- Phones are compared with the null-safe operator <=> so that a change to or
-- from NULL counts as a change instead of matching none of the three sets.
-- NOTE(review): a member that is open in the chain but missing from today's
-- snapshot matches none of the three sets and so is not written back to the
-- rewritten 99991231 partition - confirm whether that is intended.
insert overwrite table sospdm.tmp_dwd_his partition(end_date)
select 
     t1.cust_num
    ,t1.mbl_phone
    ,${statis_date} as start_date
    ,99991231 as end_date
from
(
    select
         cust_num
        ,mbl_phone
    from sospdm.tmp_ods_user where statis_date=${statis_date}
) t1
left join 
(
    select
         cust_num
        ,mbl_phone
    from sospdm.tmp_dwd_his where start_date<=${statis_date} and end_date>=${statis_date}
) t2 
on t1.cust_num=t2.cust_num
where t2.cust_num is null -- new member
or not (t1.mbl_phone <=> t2.mbl_phone) -- changed

union all 

select 
     t2.cust_num
    ,t2.mbl_phone                 -- keep the previous value in the closed row
    ,t2.start_date
    ,${statis_date} as end_date   -- close the chain of the changed row
from
(
    select
         cust_num
        ,mbl_phone
    from sospdm.tmp_ods_user where statis_date=${statis_date}
) t1 
inner join 
(
    select
         cust_num
        ,mbl_phone
        ,start_date
        ,end_date
    from sospdm.tmp_dwd_his where start_date<=${statis_date} and end_date>=${statis_date}
) t2 
on t1.cust_num=t2.cust_num  
where not (t1.mbl_phone <=> t2.mbl_phone) -- changed

union all 

select 
     t2.cust_num
    ,t2.mbl_phone
    ,t2.start_date
    ,t2.end_date
from
(
    select
         cust_num
        ,mbl_phone
    from sospdm.tmp_ods_user where statis_date=${statis_date}
) t1 
inner join 
(
    select
         cust_num
        ,mbl_phone
        ,start_date
        ,end_date
    from sospdm.tmp_dwd_his where start_date<=${statis_date} and end_date>=${statis_date}
) t2 
on t1.cust_num=t2.cust_num
where t1.mbl_phone <=> t2.mbl_phone -- unchanged
;

3.拉链回滚

-- Rollback: rebuild the chain as it stood before the 20170103 load.
-- Rows already closed before the rollback date stay closed and pass through.
-- Date literals are quoted: the columns are strings, and both union branches
-- must produce end_date with the same type.
select 
     cust_num
    ,mbl_phone
    ,start_date
    ,end_date
from sospdm.tmp_dwd_his where end_date < '20170103'
union all 
-- Rows that were still open before the rollback date (closed on or after it,
-- or still open now) are reopened. Rows that only started on or after the
-- rollback date are excluded; without the start_date filter they would remain
-- as a second open row for the same member.
select 
     cust_num
    ,mbl_phone
    ,start_date
    ,'99991231' as end_date
from sospdm.tmp_dwd_his
where end_date >= '20170103'
  and start_date < '20170103'
;

 

以上是关于拉链表的主要内容,如果未能解决你的问题,请参考以下文章

数仓-拉链表的详细实现过程

拉链表流水表

拉链表

如何在Hive中更新拉链表

数据仓库:拉链表详解

Hive拉链表实战-SQL模拟hive仓库拉链表实现