Doris 数据模型及自动分区使用案例
Posted ShenLiang2025
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Doris 数据模型及自动分区使用案例相关的知识,希望对你有一定的参考价值。
Doris 数据模型及自动分区使用案例代码说明
简介
Doris数据模型使用案例演示。
使用
Duplicate 模型
-- Duplicate-model demo: every loaded row is kept as-is, including exact duplicates.
-- Drop the demo table itself (not an unrelated name) so the script can be re-run;
-- IF EXISTS keeps the first run from failing when the table is not there yet.
DROP TABLE IF EXISTS test_stu_duplicate;
CREATE TABLE test_stu_duplicate
(id int,
name varchar(100),
gender TINYINT,
score TINYINT
)
DUPLICATE KEY(id, name)
DISTRIBUTED BY HASH (name);
-- Explicit column lists keep the loads correct if the table layout changes.
INSERT INTO test_stu_duplicate (id, name, gender, score) VALUES (1, 'shenliang', 1, 10);
INSERT INTO test_stu_duplicate (id, name, gender, score) VALUES (2, 'shenliang', 1, 20);
-- Intentional exact duplicate of the first row, to show that nothing is filtered out.
INSERT INTO test_stu_duplicate (id, name, gender, score) VALUES (1, 'shenliang', 1, 10);
INSERT INTO test_stu_duplicate (id, name, gender, score) VALUES (3, 'liangshen', 0, 20);
id | name | gender | score |
3 | liangshen | 0 | 20 |
1 | shenliang | 1 | 10 |
2 | shenliang | 1 | 20 |
1 | shenliang | 1 | 10 |
#第3条和第1条完全重复,但入表时不会被过滤,即Duplicate模型不会对重复数据去重。
Unique模型
-- Unique-model demo: rows that share the same (id, name) key collapse into one row.
CREATE TABLE test_stu_unique (
    id     INT,
    name   VARCHAR(100),
    gender TINYINT,
    score  TINYINT
)
UNIQUE KEY (id, name)
DISTRIBUTED BY HASH (id)
PROPERTIES (
    "replication_num" = "1"
);
INSERT INTO test_stu_unique (id, name, gender, score) VALUES (1, 'shenliang', 1, 10);
INSERT INTO test_stu_unique (id, name, gender, score) VALUES (2, 'shenliang', 1, 20);
-- Same key as the first load, so only one row for (1, 'shenliang') remains.
INSERT INTO test_stu_unique (id, name, gender, score) VALUES (1, 'shenliang', 1, 10);
INSERT INTO test_stu_unique (id, name, gender, score) VALUES (3, 'liangshen', 0, 20);
id | name | gender | score |
3 | liangshen | 0 | 20 |
1 | shenliang | 1 | 10 |
2 | shenliang | 1 | 20 |
Aggr模型
-- Aggregate-model demo: rows with the same (name, id, gender) key are merged,
-- each value column using the aggregation declared next to its type.
CREATE TABLE test_stu_aggr (
    name           VARCHAR(100),
    id             INT,
    gender         TINYINT,
    score          TINYINT REPLACE,  -- value from the later load replaces the earlier one
    acc_classtimes INT SUM,          -- running total
    max_classtimes INT MAX,          -- largest value seen
    min_classtimes INT MIN           -- smallest value seen
)
AGGREGATE KEY (name, id, gender)
DISTRIBUTED BY HASH (id)
PROPERTIES (
    "replication_num" = "1"
);
-- Two loads with an identical key: they end up as a single merged row.
INSERT INTO test_stu_aggr (name, id, gender, score, acc_classtimes, max_classtimes, min_classtimes)
VALUES ('shenliang', 1, 1, 30, 5, 10, 1);
INSERT INTO test_stu_aggr (name, id, gender, score, acc_classtimes, max_classtimes, min_classtimes)
VALUES ('shenliang', 1, 1, 40, 8, 12, 3);
name | id | gender | score | acc_classtimes | max_classtimes | min_classtimes |
shenliang | 1 | 1 | 40 | 13 | 12 | 1 |
这里虽然插入了2条记录,但由于两条记录的key列(name,id,gender)完全相同,会按各指标列定义的聚合方式(REPLACE、SUM、MAX、MIN)合并为1条记录。
-- Two further loads whose keys differ from each other and from the existing row,
-- so each one stays a separate row.
INSERT INTO test_stu_aggr (name, id, gender, score, acc_classtimes, max_classtimes, min_classtimes)
VALUES ('shenliang', 2, 1, 10, 2, 15, 5);
INSERT INTO test_stu_aggr (name, id, gender, score, acc_classtimes, max_classtimes, min_classtimes)
VALUES ('shenliang', 2, 0, 20, 6, 7, 7);
name | id | gender | score | acc_classtimes | max_classtimes | min_classtimes |
shenliang | 1 | 1 | 40 | 13 | 12 | 1 |
shenliang | 2 | 1 | 10 | 2 | 15 | 5 |
shenliang | 2 | 0 | 20 | 6 | 7 | 7 |
物化视图
物化视图是将预先计算(根据定义好的 SELECT 语句)好的数据集,存储在 Doris 中的一个特殊的表。
-- Pre-compute the per-gender maximum of max_classtimes as a materialized view.
CREATE MATERIALIZED VIEW test_stu_maxclasstimes AS
SELECT
    gender,
    MAX(max_classtimes)
FROM test_stu_aggr
GROUP BY gender;
-- Inspect the plan of the same query against the base table; the "rollup:" line
-- of the scan node shows which table or view is actually read.
EXPLAIN
SELECT
    gender,
    MAX(max_classtimes)
FROM test_stu_aggr
GROUP BY gender;
Explain String
PLAN FRAGMENT 0
OUTPUT EXPRS:<slot 2> `gender` | <slot 3> max(`max_classtimes`)
PARTITION: UNPARTITIONED
RESULT SINK
4:EXCHANGE
PLAN FRAGMENT 1
OUTPUT EXPRS:
PARTITION: HASH_PARTITIONED: <slot 2> `gender`
STREAM DATA SINK
EXCHANGE ID: 04
UNPARTITIONED
3:AGGREGATE (merge finalize)
| output: max(<slot 3> max(`max_classtimes`))
| group by: <slot 2> `gender`
| cardinality=-1
|
2:EXCHANGE
PLAN FRAGMENT 2
OUTPUT EXPRS:
PARTITION: HASH_PARTITIONED: `default_cluster:dw_incubate`.`test_stu_aggr`.`id`
STREAM DATA SINK
EXCHANGE ID: 02
HASH_PARTITIONED: <slot 2> `gender`
1:AGGREGATE (update serialize)
| STREAMING
| output: max(`max_classtimes`)
| group by: `gender`
| cardinality=-1
|
0:OlapScanNode
TABLE: test_stu_aggr
PREAGGREGATION: ON
partitions=1/1
rollup: test_stu_maxclasstimes
tabletRatio=10/10
tabletList=292497,292499,292501,292503,292505,292507,292509,292511,292513,292515
cardinality=0
avgRowSize=5.0
numNodes=1
Rollup
Rollup可认为是物化视图的一个子集。
-- Add a rollup that keeps only id and acc_classtimes from the base table.
ALTER TABLE test_stu_aggr ADD ROLLUP rollup_id (id, acc_classtimes);
-- Inspect the plan of a query that needs only those two columns; the "rollup:"
-- line of the scan node shows which rollup is read.
-- The statement is now terminated so it can be run together with other statements.
EXPLAIN
SELECT
    id,
    SUM(acc_classtimes)
FROM test_stu_aggr
GROUP BY id;
Explain String
PLAN FRAGMENT 0
OUTPUT EXPRS:<slot 2> `id` | <slot 3> sum(`acc_classtimes`)
PARTITION: UNPARTITIONED
RESULT SINK
2:EXCHANGE
PLAN FRAGMENT 1
OUTPUT EXPRS:
PARTITION: HASH_PARTITIONED: `default_cluster:dw_incubate`.`test_stu_aggr`.`id`
STREAM DATA SINK
EXCHANGE ID: 02
UNPARTITIONED
1:AGGREGATE (update finalize)
| output: sum(`acc_classtimes`)
| group by: `id`
| cardinality=-1
|
0:OlapScanNode
TABLE: test_stu_aggr
PREAGGREGATION: ON
partitions=1/1
rollup: rollup_id
tabletRatio=10/10
tabletList=292475,292477,292479,292481,292483,292485,292487,292489,292491,292493
cardinality=0
avgRowSize=8.0
numNodes=1
均值
Doris的聚合模型不支持AVG聚合方式,在指标类数据不为NULL的情况下,可通过追加计数器字段来计算均值。详见#2里指标不为NULL的情况。
#1定义常量字段cnt,通过REPLACE方式使用。
-- Variant 1: cnt is a constant column (default 1) merged with REPLACE, so it
-- stays 1 no matter how many rows are loaded for an id.
CREATE TABLE t_temp
( id int,
score int SUM,
cnt int REPLACE default '1'
) AGGREGATE KEY (id)
distributed by hash(id);
-- cnt is left out of the column list on purpose so that its default applies.
INSERT INTO t_temp(id,score) VALUES(1,80);
INSERT INTO t_temp(id,score) VALUES(1,90);
INSERT INTO t_temp(id,score) VALUES(1,70);
INSERT INTO t_temp(id,score) VALUES(2,80);
-- A NULL score, to show how a missing metric is handled.
INSERT INTO t_temp(id,score) VALUES(2,NULL);
-- Columns are named explicitly so the output does not depend on the table layout.
SELECT id, score, cnt FROM t_temp;
id score cnt
1 240 1
2 80 1
#2 定义累计字段cnt并求和
-- Variant 2: cnt is a counter column (default 1) merged with SUM, so it ends up
-- holding the number of rows loaded for each id.
CREATE TABLE t_temp_2
( id int,
score int SUM,
cnt int SUM default '1'
) AGGREGATE KEY (id)
distributed by hash(id);
-- cnt is left out of the column list on purpose so that its default applies.
INSERT INTO t_temp_2(id,score) VALUES(1,80);
INSERT INTO t_temp_2(id,score) VALUES(1,90);
INSERT INTO t_temp_2(id,score) VALUES(1,70);
INSERT INTO t_temp_2(id,score) VALUES(2,80);
-- A NULL score still adds 1 to cnt, so score/cnt is only a true average
-- when no score is NULL.
INSERT INTO t_temp_2(id,score) VALUES(2,NULL);
INSERT INTO t_temp_2(id,score) VALUES(3,70);
INSERT INTO t_temp_2(id,score) VALUES(3,50);
-- Columns are named explicitly so the output does not depend on the table layout.
SELECT id, score, cnt FROM t_temp_2;
id score cnt
1 240 3
3 120 2
2 80 2
均值avg可通过score/cnt得到。
分区简介
Doris字段分区支持手动建分区和动态加分区并追加历史分区信息。
手动指定分区
# 建表时手动指定分区信息,这里核心为
-- Fragment of a CREATE TABLE statement: one range partition per day, each
-- bounded by an exclusive upper date. The column name matches the full script.
PARTITION BY RANGE(collectdate)
(
PARTITION p20211031 VALUES LESS THAN ("2021-11-01"),
PARTITION p20211101 VALUES LESS THAN ("2021-11-02"),
PARTITION p20211102 VALUES LESS THAN ("2021-11-03")
)
注:分区字段需要在key列表里,且key字段列表必须放在建表语句的前面。
详细见如下脚本:
-- Device metrics table with hand-written daily range partitions.
-- IF EXISTS keeps the first run from failing when the table is not there yet.
DROP TABLE IF EXISTS t_deviceinfo;
CREATE TABLE t_deviceinfo(
collectdate date COMMENT '采集时间',
deviceid int COMMENT '设备ID',
value bigint COMMENT '指标值'
)UNIQUE KEY(collectdate,deviceid )
-- The partition column is one of the key columns.
PARTITION BY RANGE(collectdate )
(
PARTITION p20211031 VALUES LESS THAN ("2021-11-01"),
PARTITION p20211101 VALUES LESS THAN ("2021-11-02"),
PARTITION p20211102 VALUES LESS THAN ("2021-11-03")
)
distributed by hash(collectdate,deviceid) buckets 20;
动态生成分区
# 建表时指定动态分区信息,这里主要是指定动态分区相关参数,详细解释见下:
"dynamic_partition.enable" = "true", -- turn dynamic partitioning on
"dynamic_partition.create_history_partition" = "true", -- also create partitions for past dates
"dynamic_partition.history_partition_num" = "200", -- how far back to create history partitions (in days here)
"dynamic_partition.time_unit" = "DAY", -- partition granularity; day, week, month and year are supported
-- "dynamic_partition.start" = "-100", -- drop partitions older than 100 days (same idea for other units); when unset, history partitions are never dropped
"dynamic_partition.end" = "7", -- create partitions 7 days ahead (same idea for other units)
"dynamic_partition.prefix" = "p", -- prefix of the generated partition names, here "p"
"dynamic_partition.buckets" = "32" -- number of buckets in each dynamically created partition
详细建表语句见下:
-- Device metrics table whose daily partitions are created by dynamic partitioning.
-- IF EXISTS keeps the first run from failing when the table is not there yet.
DROP TABLE IF EXISTS t_deviceinfo;
CREATE TABLE t_deviceinfo(
collectdate date COMMENT '采集时间',
deviceid int COMMENT '设备ID',
value bigint COMMENT '指标值'
)
UNIQUE KEY(collectdate,deviceid )
-- The partition list is left empty; partitions come from the properties below.
partition by range(collectdate)()
distributed by hash(collectdate,deviceid) buckets 20
PROPERTIES
(
"dynamic_partition.enable" = "true",
"dynamic_partition.create_history_partition" = "true",
"dynamic_partition.history_partition_num" = "200",
"dynamic_partition.time_unit" = "DAY",
-- Left disabled so that history partitions are never dropped automatically.
-- "dynamic_partition.start" = "-100",
"dynamic_partition.end" = "7",
"dynamic_partition.prefix" = "p",
"dynamic_partition.buckets" = "32"
);
以上是关于Doris 数据模型及自动分区使用案例的主要内容,如果未能解决你的问题,请参考以下文章