Doris -- 查询语法和内置函数

Posted _泡泡

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Doris -- 查询语法和内置函数相关的知识,希望对你有一定的参考价值。

查询语法和内置函数

查询语法整体结构

SELECT
[ALL | DISTINCT | DISTINCTROW ]            -- 对查询字段的结果是否需要去重,还是全部保留等参数
select_expr [, select_expr ...]            -- select的查询字段
[FROM table_references
[PARTITION partition_list]                 -- from 哪个库里面的那张表甚至哪一个(几个)分区
[WHERE where_condition]                    -- WHERE 查询
[GROUP BY col_name | expr | position     -- group by  聚合
[ASC | DESC], ... [WITH ROLLUP]]
[HAVING where_condition]                   -- having 针对聚合函数的再一次过滤
[ORDER BY col_name | expr | position     -- 对结果数据按照字段进行排序
[ASC | DESC], ...]                                       -- 排序规则
[LIMIT [offset,] row_count | row_count OFFSET offset]  -- 限制输出多少行内容
[INTO OUTFILE \'file_name\']                 -- 将查询的结果导出到文件中

内置函数

条件函数

if

if(boolean condition, type valueTrue, type valueFalseOrNull)
--如果表达式 condition 成立,返回结果 valueTrue;否则,返回结果 valueFalseOrNull
--返回值类型:valueTrue 表达式结果的类型
示例:
SQL
mysql> select  user_id, if(user_id = 1, "true", "false") as test_if from test;
+---------+---------+
| user_id | test_if |
+---------+---------+
| 1       | true    |
| 2       | false   |
+---------+---------+

ifnull,nvl,coalesce,nullif

ifnull(expr1, expr2)
--如果 expr1 的值不为 NULL 则返回 expr1,否则返回 expr2

nvl(expr1, expr2)
--如果 expr1 的值不为 NULL 则返回 expr1,否则返回 expr2

coalesce(expr1, expr2, ...., expr_n))
--返回参数中的第一个非空表达式(从左向右)

nullif(expr1, expr2)
-- 如果两个参数相等,则返回NULL。否则返回第一个参数的值

mysql> select ifnull(1,0);
+--------------+
| ifnull(1, 0) |
+--------------+
|            1 |
+--------------+

mysql> select nvl(null,10);
+------------------+
| nvl(null,10)     |
+------------------+
|               10 |
+------------------+

mysql> select coalesce(NULL, \'1111\', \'0000\');
+--------------------------------+
| coalesce(NULL, \'1111\', \'0000\') |
+--------------------------------+
| 1111                           |
+--------------------------------+

mysql> select coalesce(NULL, NULL,NULL,\'0000\', NULL);
+----------------------------------------+
| coalesce(NULL, NULL,NULL,\'0000\', NULL) |
+----------------------------------------+
| 0000                                   |
+----------------------------------------+

mysql> select nullif(1,1);
+--------------+
| nullif(1, 1) |
+--------------+
|         NULL |
+--------------+

mysql> select nullif(1,0);
+--------------+
| nullif(1, 0) |
+--------------+
|            1 |
+--------------+


case

-- 方式一
CASE expression
    WHEN condition1 THEN result1
    [WHEN condition2 THEN result2]
    ...
    [WHEN conditionN THEN resultN]
    [ELSE result]
END


-- 方式二
CASE WHEN condition1 THEN result1
    [WHEN condition2 THEN result2]
    ...
    [WHEN conditionN THEN resultN]
    [ELSE result]
END

-- 将表达式和多个可能的值进行比较,当匹配时返回相应的结果


mysql> select user_id, 
case user_id 
when 1 then \'user_id = 1\' 
when 2 then \'user_id = 2\' 
else \'user_id not exist\' 
end as test_case 
from test;
+---------+-------------+
| user_id | test_case   |
+---------+-------------+
| 1       | user_id = 1 |
| 2       | user_id = 2 |
| 3       | \'user_id not exist\' |
+---------+-------------+
 
mysql> select user_id, 
case 
when user_id = 1 then \'user_id = 1\' 
when user_id = 2 then \'user_id = 2\' 
else \'user_id not exist\' 
end as test_case 
from test;
+---------+-------------+
| user_id | test_case   |
+---------+-------------+
| 1       | user_id = 1 |
| 2       | user_id = 2 |
+---------+-------------+


聚合函数

min,max,sum,avg,count和mysql用法一致

min_by和max_by

MAX_BY(expr1, expr2)
返回expr2最大值所在行的 expr1 (求分组top1的简介函数)

MySQL > select * from tbl;
+------+------+------+------+
| k1   | k2   | k3   | k4   |
+------+------+------+------+
|    0 | 3    | 2    |  100 |
|    1 | 2    | 3    |    4 |
|    4 | 3    | 2    |    2 |
|    3 | 4    | 2    |    1 |
+------+------+------+------+

MySQL > select max_by(k1, k4) from tbl;
select max_by(k1, k4) from tbl;
--取k4这个列中的最大值对应的k1这个列的值
+--------------------+
| max_by(`k1`, `k4`) |
+--------------------+
|                  0 |
+--------------------+ 


-- 练习
name   subject   score 
zss,chinese,99
zss,math,89
zss,English,79
lss,chinese,88
lss,math,88
lss,English,22
www,chinese,99
www,math,45
zll,chinese,23
zll,math,88
zll,English,80
www,English,94

-- 建表语句
create table score
(
name varchar(50),
subject varchar(50),
score double
)
DUPLICATE KEY(name)
DISTRIBUTED BY HASH(name) BUCKETS 1;

-- 通过本地文件的方式导入数据
curl \\
 -u root: \\
 -H "label:salary" \\
 -H "column_separator:," \\
 -T /root/data/salary.txt \\
 http://doitedu01:8040/api/test/salary/_stream_load


-- 求每门课程成绩最高分的那个人
select  
subject,max_by(name,score) as name
from score
group by subject


+---------+------+
| subject | name |
+---------+------+
| English | www  |
| math    | lss  |
| chinese | www  |
+---------+------+

group_concat

VARCHAR GROUP_CONCAT([DISTINCT] VARCHAR 列名[, VARCHAR sep]

该函数是类似于 sum() 的聚合函数,group_concat 将结果集中的多行结果连接成一个字符串

-- group_concat对于收集的字段只能是string,varchar,char类型  
--当不指定分隔符的时候,默认使用 \',\'

VARCHAR :代表GROUP_CONCAT函数返回值类型
[DISTINCT]:可选参数,针对需要拼接的列的值进行去重  
[, VARCHAR sep]:拼接成字符串的分隔符,默认是 \',\'


--建表
create table example(
id int,
name varchar(50),
age int,
gender string,
is_marry boolean,
marry_date date,
marry_datetime datetime
)engine = olap
distributed by hash(id) buckets 3;

--插入数据
insert into example values \\
(1,\'zss\',18,\'male\',0,null,null),\\
(2,\'lss\',28,\'female\',1,\'2022-01-01\',\'2022-01-01 11:11:11\'),\\
(3,\'ww\',38,\'male\',1,\'2022-02-01\',\'2022-02-01 11:11:11\'),\\
(4,\'zl\',48,\'female\',0,null,null),\\
(5,\'tq\',58,\'male\',1,\'2022-03-01\',\'2022-03-01 11:11:11\'),\\
(6,\'mly\',18,\'male\',1,\'2022-04-01\',\'2022-04-01 11:11:11\'),\\
(7,null,18,\'male\',1,\'2022-05-01\',\'2022-05-01 11:11:11\');

--当收集的那一列,有值为null时,他会自动将null的值过滤掉
select 
gender,
group_concat(name,\',\') as gc_name
from example 
group by gender;
+--------+---------------+
| gender | gc_name       |
+--------+---------------+
| female | zl,lss        |
| male   | zss,ww,tq,mly |
+--------+---------------+

select 
gender,
group_concat(DISTINCT cast(age as string)) as gc_age
from example 
group by gender;

+--------+------------+
| gender | gc_age     |
+--------+------------+
| female | 48, 28     |
| male   | 58, 38, 18 |
+--------+------------+


-- 求:每一个人有考试成绩的所有科目
select
name,
group_concat(subject,\',\')  as all_subject
from score
group by name

collect_list,collect_set (1.2版本上线)

ARRAY<T> collect_list(expr)
--返回一个包含 expr 中所有元素(不包括NULL)的数组,数组中元素顺序是不确定的。

ARRAY<T> collect_set(expr)
--返回一个包含 expr 中所有去重后元素(不包括NULL)的数组,数组中元素顺序是不确定的。

日期函数

获取当前时间

-- curdate,current_date,now,curtime,current_time,current_timestamp
select current_date();
+----------------+
| current_date() |
+----------------+
| 2022-11-25     |
+----------------+

select curdate();
+------------+
| curdate()  |
+------------+
| 2022-11-25 |
+------------+

 select now();
+---------------------+
| now()               |
+---------------------+
| 2022-11-25 00:55:15 |
+---------------------+

select curtime();
+-----------+
| curtime() |
+-----------+
| 00:42:13  |
+-----------+

select current_timestamp();
+---------------------+
| current_timestamp() |
+---------------------+
| 2022-11-25 00:42:30 |
+---------------------+

last_day(1.2版本上线)

DATE last_day(DATETIME date) 
-- 返回输入日期中月份的最后一天; 
--\'28\'(非闰年的二月份), 
--\'29\'(闰年的二月份), 
--\'30\'(四月,六月,九月,十一月), 
--\'31\'(一月,三月,五月,七月,八月,十月,十二月)

select last_day(\'2000-03-03 01:00:00\'); -- 给我返回这个月份中的最后一天的日期  年月日
ERROR 1105 (HY000): errCode = 2, detailMessage = No matching function with signature: last_day(varchar(-1)).

from_unixtime

DATETIME FROM_UNIXTIME(INT unix_timestamp[, VARCHAR string_format])
-- 将 unix 时间戳转化为对应的 time 格式,返回的格式由 string_format 指定
--支持date_format中的format格式,默认为 %Y-%m-%d %H:%i:%s

-- 正常使用的三种格式
yyyyMMdd
yyyy-MM-dd
yyyy-MM-dd HH:mm:ss
示例:
SQL
mysql> select from_unixtime(1196440219);  -- 时区
+---------------------------+
| from_unixtime(1196440219) |
+---------------------------+
| 2007-12-01 00:30:19       |
+---------------------------+

mysql> select from_unixtime(1196440219, \'yyyy-MM-dd HH:mm:ss\');
+--------------------------------------------------+
| from_unixtime(1196440219, \'yyyy-MM-dd HH:mm:ss\') |
+--------------------------------------------------+
| 2007-12-01 00:30:19                              |
+--------------------------------------------------+

mysql> select from_unixtime(1196440219, \'%Y-%m-%d\');
+-----------------------------------------+
| from_unixtime(1196440219, \'%Y-%m-%d\') |
+-----------------------------------------+
| 2007-12-01                              |
+-----------------------------------------+

unix_timestamp

UNIX_TIMESTAMP(), 
UNIX_TIMESTAMP(DATETIME date), 
UNIX_TIMESTAMP(DATETIME date, STRING fmt) -- 给一个日期,指定这个日期的格式
-- 将日期转换成时间戳,返回值是一个int类型

-- 获取当前日期的时间戳
select unix_timestamp();
+------------------+
| unix_timestamp() |
+------------------+
|       1669309722 |
+------------------+

-- 获取指定日期的时间戳
select unix_timestamp(\'2022-11-26 01:09:01\');
+---------------------------------------+
| unix_timestamp(\'2022-11-26 01:09:01\') |
+---------------------------------------+
|                            1669396141 |
+---------------------------------------+

-- 给定一个特殊日期格式的时间戳,指定格式
select unix_timestamp(\'2022-11-26 01:09-01\', \'%Y-%m-%d %H:%i-%s\');
+------------------------------------------------------------+
| unix_timestamp(\'2022-11-26 01:09-01\', \'%Y-%m-%d %H:%i-%s\') |
+------------------------------------------------------------+
|                                                 1669396141 |
+------------------------------------------------------------+

to_date

DATE TO_DATE(DATETIME)
--返回 DATETIME 类型中的日期部分。
示例:
SQL
select to_date("2022-11-20 00:00:00");     
+--------------------------------+
| to_date(\'2022-11-20 00:00:00\') |
+--------------------------------+
| 2022-11-20                     |
+--------------------------------+

extract

extract(unit FROM DATETIME)   --抽取
-- 提取DATETIME某个指定单位的值。
--unit单位可以为year, month, day, hour, minute或者second
示例:
SQL
select 
extract(year from \'2022-09-22 17:01:30\') as year,
extract(month from \'2022-09-22 17:01:30\') as month,
extract(day from \'2022-09-22 17:01:30\') as day,
extract(hour from \'2022-09-22 17:01:30\') as hour,
extract(minute from \'2022-09-22 17:01:30\') as minute,
extract(second from \'2022-09-22 17:01:30\') as second;

+------+-------+------+------+--------+--------+
| year | month | day  | hour | minute | second |
+------+-------+------+------+--------+--------+
| 2022 |     9 |   22 |   17 |      1 |     30 |
+------+-------+------+------+--------+--------+

date_add,date_sub,datediff

DATE_ADD(DATETIME date,INTERVAL expr type)

DATE_SUB(DATETIME date,INTERVAL expr type)

DATEDIFF(DATETIME expr1,DATETIME expr2)
-- 计算两个日期相差多少天,结果精确到天。

-- 向日期添加指定的时间间隔。
-- date 参数是合法的日期表达式。
-- expr 参数是您希望添加的时间间隔。
-- type 参数可以是下列值:YEAR, MONTH, DAY, HOUR, MINUTE, SECOND



select date_add(\'2010-11-30 23:59:59\', INTERVAL 2 DAY);
+-------------------------------------------------+
| date_add(\'2010-11-30 23:59:59\', INTERVAL 2 DAY) |
+-------------------------------------------------+
| 2010-12-02 23:59:59                             |
+-------------------------------------------------+


--传一个负数进去也就等同于date_sub
select date_add(\'2010-11-30 23:59:59\', INTERVAL -2 DAY);
+--------------------------------------------------+
| date_add(\'2010-11-30 23:59:59\', INTERVAL -2 DAY) |
+--------------------------------------------------+
| 2010-11-28 23:59:59                              |
+--------------------------------------------------+


mysql> select datediff(\'2022-11-27 22:51:56\',\'2022-11-24 22:50:56\');
+--------------------------------------------------------+
| datediff(\'2022-11-27 22:51:56\', \'2022-11-24 22:50:56\') |
+--------------------------------------------------------+
|                                                      3 |
+--------------------------------------------------------+

date_format

VARCHAR DATE_FORMAT(DATETIME date, VARCHAR format)
--将日期类型按照format的类型转化为字符串
示例:
SQL
select date_format(\'2007-10-04 22:23:00\', \'%H:%i:%s\');
+------------------------------------------------+
| date_format(\'2007-10-04 22:23:00\', \'%H:%i:%s\') |
+------------------------------------------------+
| 22:23:00                                       |
+------------------------------------------------+

select date_format(\'2007-10-04 22:23:00\', \'yyyy-MM-dd\');
+------------------------------------------------+
| date_format(\'2007-10-04 22:23:00\', \'%Y-%m-%d\') |
+------------------------------------------------+
| 2007-10-04                                     |
+------------------------------------------------+

字符串函数

length,lower,upper,reverse
获取到字符串的长度,对字符串转大小写和字符串的反转

lpad,rpad

VARCHAR rpad(VARCHAR str, INT len, VARCHAR pad)

VARCHAR lpad(VARCHAR str, INT len, VARCHAR pad)

-- 返回 str 中长度为 len(从首字母开始算起)的字符串。
--如果 len 大于 str 的长度,则在 str 的后面不断补充 pad  字符,
--直到该字符串的长度达到 len 为止。如果 len 小于 str 的长度,
--该函数相当于截断 str 字符串,只返回长度为 len  的字符串。
--len 指的是字符长度而不是字节长度。
示例:
SQL
-- 向左边补齐
SELECT lpad("1", 5, "hellohello");
+---------------------+
| lpad("1", 5, "0") |
+---------------------+
| 00001             |
+---------------------+

-- 向右边补齐
SELECT rpad(\'11\', 5, \'0\');
+---------------------+
| rpad(\'11\', 5, \'0\')  |
+---------------------+
| 11000               |
+---------------------+

concat,concat_ws

select concat("a", "b");
+------------------+
| concat(\'a\', \'b\') |
+------------------+
| ab               |
+------------------+

select concat("a", "b", "c");
+-----------------------+
| concat(\'a\', \'b\', \'c\') |
+-----------------------+
| abc                   |
+-----------------------+

-- concat中,如果有一个值为null,那么得到的结果就是null
mysql> select concat("a", null, "c");
+------------------------+
| concat(\'a\', NULL, \'c\') |
+------------------------+
| NULL                   |
+------------------------+


--使用第一个参数 sep 作为连接符
--将第二个参数以及后续所有参数(或ARRAY中的所有字符串)拼接成一个字符串。
-- 如果分隔符是 NULL,返回 NULL。 concat_ws函数不会跳过空字符串,会跳过 NULL 值。
mysql> select concat_ws("_", "a", "b");
+----------------------------+
| concat_ws("_", "a", "b")   |
+----------------------------+
| a_b                        |
+----------------------------+

mysql> select concat_ws(NULL, "d", "is");
+----------------------------+
| concat_ws(NULL, \'d\', \'is\') |
+----------------------------+
| NULL                       |
+----------------------------+

substr

--求子字符串,返回第一个参数描述的字符串中从start开始长度为len的部分字符串。
--首字母的下标为1。
mysql> select substr("Hello doris", 3, 5);
+-----------------------------+
| substr(\'Hello doris\', 2, 1) |
+-----------------------------+
| e                           |
+-----------------------------+
mysql> select substr("Hello doris", 1, 2);
+-----------------------------+
| substr(\'Hello doris\', 1, 2) |
+-----------------------------+
| He                          |
+-----------------------------+

ends_with,starts_with

BOOLEAN ENDS_WITH (VARCHAR str, VARCHAR suffix)
--如果字符串以指定后缀结尾,返回true。否则,返回false。
--任意参数为NULL,返回NULL。

BOOLEAN STARTS_WITH (VARCHAR str, VARCHAR prefix)
--如果字符串以指定前缀开头,返回true。否则,返回false。
--任意参数为NULL,返回NULL。
示例:
SQL
select ends_with("Hello doris", "doris");
+-----------------------------------+
| ends_with(\'Hello doris\', \'doris\') |
+-----------------------------------+
|                                 1 | 
+-----------------------------------+

select ends_with("Hello doris", "Hello");
+-----------------------------------+
| ends_with(\'Hello doris\', \'Hello\') |
+-----------------------------------+
|                                 0 | 
+-----------------------------------+


select starts_with("hello world","hello");
+-------------------------------------+
| starts_with(\'hello world\', \'hello\') |
+-------------------------------------+
|                                   1 |
+-------------------------------------+

select starts_with("hello world","world");
+-------------------------------------+
| starts_with(\'hello world\', \'world\') |
+-------------------------------------+
|                                   0 |
+-------------------------------------+

trim,ltrim,rtrim

VARCHAR trim(VARCHAR str)
-- 将参数 str 中左侧和右侧开始部分连续出现的空格去掉
mysql> SELECT trim(\'   ab d   \') str;
+------+
| str  |
+------+
| ab d |
+------+

VARCHAR ltrim(VARCHAR str)
-- 将参数 str 中从左侧部分开始部分连续出现的空格去掉
mysql> SELECT ltrim(\'   ab d\') str;
+------+
| str  |
+------+
| ab d |
+------+

VARCHAR rtrim(VARCHAR str)
--将参数 str 中从右侧部分开始部分连续出现的空格去掉

mysql> SELECT rtrim(\'ab d   \') str;
+------+
| str  |
+------+
| ab d |
+------+

null_or_empty,not_null_or_empty

BOOLEAN NULL_OR_EMPTY (VARCHAR str)

-- 如果字符串为空字符串或者NULL,返回true。否则,返回false。
select null_or_empty(null);
+---------------------+
| null_or_empty(NULL) |
+---------------------+
|                   1 |
+---------------------+

select null_or_empty("");
+-------------------+
| null_or_empty(\'\') |
+-------------------+
|                 1 |
+-------------------+

select null_or_empty("a");
+--------------------+
| null_or_empty(\'a\') |
+--------------------+
|                  0 |
+--------------------+

BOOLEAN NOT_NULL_OR_EMPTY (VARCHAR str)
如果字符串为空字符串或者NULL,返回false。否则,返回true。

select not_null_or_empty(null);
+-------------------------+
| not_null_or_empty(NULL) |
+-------------------------+
|                       0 |
+-------------------------+

select not_null_or_empty("");
+-----------------------+
| not_null_or_empty(\'\') |
+-----------------------+
|                     0 |
+-----------------------+

select not_null_or_empty("a");
+------------------------+
| not_null_or_empty(\'a\') |
+------------------------+
|                      1 |
+------------------------+

replace

VARCHAR REPLACE (VARCHAR str, VARCHAR old, VARCHAR new)
-- 将str字符串中的old子串全部替换为new串

mysql> select replace("http://www.baidu.com:9090", "9090", "");
+------------------------------------------------------+
| replace(\'http://www.baidu.com:9090\', \'9090\', \'\') |
+------------------------------------------------------+
| http://www.baidu.com:                                |
+------------------------------------------------------+

split_part

VARCHAR split_part(VARCHAR content, VARCHAR delimiter, INT field)
-- 根据分割符拆分字符串, 返回指定的分割部分(从一开始计数)。

mysql> select split_part("hello world", " ", 1);
+----------------------------------+
| split_part(\'hello world\', \' \', 1) |
+----------------------------------+
| hello                            |
+----------------------------------+


mysql> select split_part("hello world", " ", 2);
+----------------------------------+
| split_part(\'hello world\', \' \', 2) |
+----------------------------------+
| world                             |
+----------------------------------+

mysql> select split_part("2019年7月8号", "月", 1);
+-----------------------------------------+
| split_part(\'2019年7月8号\', \'月\', 1)     |
+-----------------------------------------+
| 2019年7                                 |
+-----------------------------------------+

mysql> select split_part("abca", "a", 1);
+----------------------------+
| split_part(\'abca\', \'a\', 1) |
+----------------------------+
|                            |
+----------------------------+

money_format

VARCHAR money_format(Number)
-- 将数字按照货币格式输出,整数部分每隔3位用逗号分隔,小数部分保留2位

mysql> select money_format(17014116);
+------------------------+
| money_format(17014116) |
+------------------------+
| 17,014,116.00          |
+------------------------+

mysql> select money_format(1123.456);
+------------------------+
| money_format(1123.456) |
+------------------------+
| 1,123.46               |
+------------------------+

mysql> select money_format(1123.4);
+----------------------+
| money_format(1123.4) |
+----------------------+
| 1,123.40             |
+----------------------+

数学函数

ceil和floor

BIGINT ceil(DOUBLE x) 
-- 返回大于或等于x的最小整数值.
mysql> select ceil(1);

+-----------+
| ceil(1.0) |
+-----------+
|         1 |
+-----------+
mysql> select ceil(2.4);
+-----------+
| ceil(2.4) |
+-----------+
|         3 |
+-----------+
mysql> select ceil(-10.3);
+-------------+
| ceil(-10.3) |
+-------------+
|         -10 |
+-------------+

BIGINT floor(DOUBLE x) 
-- 返回小于或等于x的最大整数值.
mysql> select floor(1);
+------------+
| floor(1.0) |
+------------+
|          1 |
+------------+
mysql> select floor(2.4);
+------------+
| floor(2.4) |
+------------+
|          2 |
+------------+
mysql> select floor(-10.3);
+--------------+
| floor(-10.3) |
+--------------+
|          -11 |
+--------------+

round

round(x), round(x, d) 
-- 将x四舍五入后保留d位小数,d默认为0。
-- 如果d为负数,则小数点左边d位为0。如果x或d为null,返回null。

mysql> select round(2.4);
+------------+
| round(2.4) |
+------------+
|          2 |
+------------+
mysql> select round(2.5);
+------------+
| round(2.5) |
+------------+
|          3 |
+------------+
mysql> select round(-3.4);
+-------------+
| round(-3.4) |
+-------------+
|          -3 |
+-------------+
mysql> select round(-3.5);
+-------------+
| round(-3.5) |
+-------------+
|          -4 |
+-------------+
mysql> select round(1667.2725, 2);
+---------------------+
| round(1667.2725, 2) |
+---------------------+
|             1667.27 |
+---------------------+
mysql> select round(1667.2725, -2);
+----------------------+
| round(1667.2725, -2) |
+----------------------+
|                 1700 |
+----------------------+

truncate


DOUBLE truncate(DOUBLE x, INT d) 
-- 按照保留小数的位数d对x进行数值截取。
-- 规则如下: 
-- 当d > 0时:保留x的d位小数 
-- 当d = 0时:将x的小数部分去除,只保留整数部分 
-- 当d < 0时:将x的小数部分去除,整数部分按照 d所指定的位数,采用数字0进行替换

mysql> select truncate(124.3867, 2);
+-----------------------+
| truncate(124.3867, 2) |
+-----------------------+
|                124.38 |
+-----------------------+
mysql> select truncate(124.3867, 0);
+-----------------------+
| truncate(124.3867, 0) |
+-----------------------+
|                   124 |
+-----------------------+
mysql> select truncate(-124.3867, -2);
+-------------------------+
| truncate(-124.3867, -2) |
+-------------------------+
|                    -100 |
+-------------------------+

abs

数值类型 abs(数值类型 x) 
-- 返回x的绝对值.

mysql> select abs(-2);
+---------+
| abs(-2) |
+---------+
|       2 |
+---------+
mysql> select abs(3.254655654);
+------------------+
| abs(3.254655654) |
+------------------+
|      3.254655654 |
+------------------+
mysql> select abs(-3254654236547654354654767);
+---------------------------------+
| abs(-3254654236547654354654767) |
+---------------------------------+
| 3254654236547654354654767       |
+---------------------------------+

pow

DOUBLE pow(DOUBLE a, DOUBLE b) 
-- 求幂次:返回a的b次方.

mysql> select pow(2,0);
+---------------+
| pow(2.0, 0.0) |
+---------------+
|             1 |
+---------------+
mysql> select pow(2,3);
+---------------+
| pow(2.0, 3.0) |
+---------------+
|             8 |
+---------------+
mysql> select round(pow(3,2.4),2);
+--------------------+
| pow(3.0, 2.4)      |
+--------------------+
| 13.966610165238235 |
+--------------------+


greatest和 least


greatest(col_a, col_b, …, col_n)
-- 返回一行中 n个column的最大值.若column中有NULL,则返回NULL.

least(col_a, col_b, …, col_n)
-- 返回一行中 n个column的最小值.若column中有NULL,则返回NULL.

mysql> select greatest(-1, 0, 5, 8);
+-----------------------+
| greatest(-1, 0, 5, 8) |
+-----------------------+
|                     8 |
+-----------------------+
mysql> select greatest(-1, 0, 5, NULL);
+--------------------------+
| greatest(-1, 0, 5, NULL) |
+--------------------------+
| NULL                     |
+--------------------------+
mysql> select greatest(6.3, 4.29, 7.6876);
+-----------------------------+
| greatest(6.3, 4.29, 7.6876) |
+-----------------------------+
|                      7.6876 |
+-----------------------------+
mysql> select greatest("2022-02-26 20:02:11","2020-01-23 20:02:11","2020-06-22 20:02:11");
+-------------------------------------------------------------------------------+
| greatest(\'2022-02-26 20:02:11\', \'2020-01-23 20:02:11\', \'2020-06-22 20:02:11\') |
+-------------------------------------------------------------------------------+
| 2022-02-26 20:02:11                                                           |
+-------------------------------------------------------------------------------+

JSON函数

CREATE TABLE test_json (
  id INT,
  json_string String
)
DUPLICATE KEY(id)
DISTRIBUTED BY HASH(id) BUCKETS 3
PROPERTIES("replication_num" = "1");

--测试数据
"k1":"v31", "k2": 300, "a1": ["k1":"v41", "k2": 400, 1, "a", 3.14]
"k1":"v32", "k2": 400, "a1": ["k1":"v41", "k2": 400, 2, "a", 4.14],"a2":"k3":"v33", "k4": 200,"a2": ["k1":"v41", "k2": 400, 2, "a", 4.14]
"k1":"v33", "k2": 500, "a1": ["k1":"v41", "k2": 400, 3, "a", 5.14],"a2":"k3":"v33", "k4": 200,"a2": ["k5":"v42", "k6": 600]
"k1":"v31"
"k1":"v31", "k2": 300
"k1":"v31", "k2": 200 "a1": []
--json是一种里面存着一对对key,value类型的结构
--针对值类型的不同:
-- 1.简单值:"k1":"v31"
-- 2.数组:["k1":"v41", "k2": 400, 1, "a", 3.14]
-- 3.对象:"a2":"k3":"v33", "k4": 200,"a2": ["k5":"v42", "k6": 600]

取值的时候,指定的\'$.k1\'==>这样的东西我们称之为json path ,json的路劲

-- 通过本地文件的方式导入

curl \\
 -u root: \\
 -H "label:load_local_file1" \\
 -H "column_separator:_" \\
 -T /root/data/json.txt \\
http://doitedu01:8040/api/test/test_json/_stream_load
-- 用insert into 的方式导入一条
INSERT INTO test_json VALUES(7, \'"k1":"v1", "k2": 200\');
5.1.7.1get_json_double,get_json_int,get_json_string


DOUBLE get_json_int(VARCHAR json_str, VARCHAR json_path)
INT get_json_int(VARCHAR json_str, VARCHAR json_path)
VARCHAR get_json_string(VARCHAR json_str, VARCHAR json_path)

-- 解析并获取 json 字符串内指定路径的double,int,string 类型的内容。 
-- 其中 json_path 必须以 $ 符号作为开头,使用 . 作为路径分割符。
-- 如果路径中包含 . ,则可以使用双引号包围。 
-- 使用 [ ] 表示数组下标,从 0 开始。 
-- path 的内容不能包含 ", [ 和 ]。 
-- 如果 json_string 格式不对,或 json_path 格式不对,或无法找到匹配项,则返回 NULL。

--1.获取到k1对应的value的值
mysql> select  id, get_json_string(json_string,\'$.k1\') as k1 from test_json;
+------+------+
| id   | k1   |
+------+------+
|    2 | v32  |
|    4 | v31  |
|    5 | v31  |
|    6 | v31  |
|    1 | v31  |
|    3 | v33  |
+------+------+

--2.获取到key 为a1 里面的数组
mysql> select  id, get_json_string(json_string,\'$.a1\') as arr from test_json;
+------+------------------------------------+
| id   | arr                                |
+------+------------------------------------+
|    1 | ["k1":"v41","k2":400,1,"a",3.14] |
|    3 | ["k1":"v41","k2":400,3,"a",5.14] |
|    2 | ["k1":"v41","k2":400,2,"a",4.14] |
|    4 | NULL                               |
|    5 | NULL                               |
|    6 | []                                 |
+------+------------------------------------+


--3.获取到key 为a1 里面的数组中第一个元素的值
mysql> select  id, get_json_string(json_string,\'$.a1[0]\') as arr from test_json;
+------+-----------------------+
| id   | arr                   |
+------+-----------------------+
|    2 | "k1":"v41","k2":400 |
|    1 | "k1":"v41","k2":400 |
|    3 | "k1":"v41","k2":400 |
|    4 | NULL                  |
|    5 | NULL                  |
|    6 | NULL                  |
+------+-----------------------+

--4.获取到key 为a1 里面的数组中第一个元素的值(这个值是一个json串,再次获取到这个字符串中)
select  id, get_json_string(get_json_string(json_string,\'$.a1[0]\'),\'$.k1\') as arr from test_json; 
+------+------+
| id   | arr  |
+------+------+
|    2 | v41  |
|    1 | v41  |
|    3 | v41  |
|    4 | NULL |
|    5 | NULL |
|    6 | NULL |
+------+------+
6 rows in set (0.02 sec)


json_object

VARCHAR json_object(VARCHAR,...)
-- 生成一个包含指定Key-Value对的json object, 
-- 传入的参数是key,value对,且key不能为null


MySQL> select json_object(\'time\',curtime());
+--------------------------------+
| json_object(\'time\', curtime()) |
+--------------------------------+
| "time": "10:49:18"           |
+--------------------------------+


MySQL> SELECT json_object(\'id\', 87, \'name\', \'carrot\');
+-----------------------------------------+
| json_object(\'id\', 87, \'name\', \'carrot\') |
+-----------------------------------------+
| "id": 87, "name": "carrot"            |
+-----------------------------------------+

json_object(\'id\', 87, \'name\', \'carrot\');

MySQL> select json_object(\'username\',null);
+---------------------------------+
| json_object(\'username\', \'NULL\') |
+---------------------------------+
| "username": NULL              |
+---------------------------------+

窗口函数

ROW_NUMBER(),DENSE_RANK(),RANK()

-- 测试rank打行号,名次相同会并列排名,比如两个第一名,就是1 1 然后第二名会显示3
select x, y, rank() over(partition by x order by y) as rank from int_t;

| x  | y    | rank     |
|----|------|----------|
| 1  | 1    | 1        |
| 1  | 2    | 2        |
| 1  | 2    | 2        |
| 2  | 1    | 1        |
| 2  | 2    | 2        |
| 2  | 3    | 3        |
| 3  | 1    | 1        |
| 3  | 1    | 1        |
| 3  | 2    | 3        |


-- 测试dense_rank(),名词相同会并列排名,比如两个第一名,就是1 1 然后第二名会显示2
select x, y, dense_rank() over(partition by x order by y) as rank from int_t;
 | x  | y    | rank     |
 |----|------|----------|
 | 1  | 1    | 1        |
 | 1  | 2    | 2        |
 | 1  | 2    | 2        |
 | 2  | 1    | 1        |
 | 2  | 2    | 2        |
 | 2  | 3    | 3        |
 | 3  | 1    | 1        |
 | 3  | 1    | 1        |
 | 3  | 2    | 2        |
 
 -- 测试ROW_NUMBER() 按照分组排序要求,返回的编号依次底层,1 2 3 4 5 ,
 -- 不会有重复值,也不会有空缺值,就是连续递增的整数,从1 开始
 select x, y, row_number() over(partition by x order by y) as rank from int_t;

| x | y    | rank     |
|---|------|----------|
| 1 | 1    | 1        |
| 1 | 2    | 2        |
| 1 | 2    | 3        |
| 2 | 1    | 1        |
| 2 | 2    | 2        |
| 2 | 3    | 3        |
| 3 | 1    | 1        |
| 3 | 1    | 2        |
| 3 | 2    | 3        |



-- 数据
孙悟空,语文,87
孙悟空,数学,95
娜娜,英语,84
宋宋,语文,64
孙悟空,英语,68
宋宋,英语,84
婷婷,语文,65
娜娜,语文,94
宋宋,数学,86
婷婷,数学,85
娜娜,数学,56
婷婷,英语,78

-- 建表语句
create table stu
(
name varchar(50),
subject varchar(50),
score double
)
DUPLICATE KEY(name)
DISTRIBUTED BY HASH(name) BUCKETS 1;

-- 通过本地文件的方式导入数据
curl \\
 -u root: \\
 -H "label:num_test" \\
 -H "column_separator:," \\
 -T /root/data/stu.txt \\
http://doitedu01:8040/api/test/stu/_stream_load

需求:
【相同分数并列(假设第一名有两个,排名就是并列第一,然后第三名从2开始)】
1.按照分数降序排序,求每个学科中每个人的名次

2.按照每个人的总分进行升序排列,得到每个人总分名次的名次

【相同分数并列(假设第一名有两个,排名就是并列第一,然后第三名从3开始)】
3.按照学科进行升序排列,得到每个人的每个学科的名次

4.按照每个人的总分进行升序排列,得到每个人总分名次的名次

【相同分数并列
(假设第一名有两个,排名就是并列第一,
就再单独比语文的成绩,然后数学,最后英语,
分数全部一样,按照学生名字的字典顺序,在前的为第一)】

5.按照每个人的总分进行升序排列,得到每个人总分名次的名次

-- 1.按照学科进行升序排列,得到每个人的每个学科的名次 
select 
name,subject,score,
dense_rank() over(partition by subject order by score desc) as rank
from stu
+-----------+---------+-------+------+
| name      | subject | score | rank |
+-----------+---------+-------+------+
| 孙悟空    | 数学    |    95 |    1 |
| 宋宋      | 数学    |    86 |    2 |
| 婷婷      | 数学    |    85 |    3 |
| 娜娜      | 数学    |    56 |    4 |
| 娜娜      | 英语    |    84 |    1 |
| 宋宋      | 英语    |    84 |    1 |
| 婷婷      | 英语    |    78 |    2 |
| 孙悟空    | 英语    |    68 |    3 |
| 娜娜      | 语文    |    94 |    1 |
| 孙悟空    | 语文    |    87 |    2 |
| 婷婷      | 语文    |    65 |    3 |
| 宋宋      | 语文    |    64 |    4 |
+-----------+---------+-------+------+

-- 2.按照每个人的总分进行升序排列,得到每个人总分名次的名次
select
name,sum_score,
-- 因为是整体按照学生的总分进行求名次,所有学生为1组,就不需要分组了
dense_rank() over(order by sum_score desc) as rank
from 
(
    select 
    name,sum(score) as sum_score
    from stu
    group by name
) as t ;
+-----------+-----------+------+
| name      | sum_score | rank |
+-----------+-----------+------+
| 孙悟空    |       250 |    1 |
| 宋宋      |       234 |    2 |
| 娜娜      |       234 |    2 |
| 婷婷      |       228 |    3 |
+-----------+-----------+------+


【相同分数并列(假设第一名有两个,排名就是并列第一,然后第三名从3开始)】
-- 3.按照学科进行升序排列,得到每个人的每个学科的名次 
select 
name,subject,score,
rank() over(partition by subject order by score desc) as rank
from stu
+-----------+---------+-------+------+
| name      | subject | score | rank |
+-----------+---------+-------+------+
| 孙悟空    | 数学    |    95 |    1 |
| 宋宋      | 数学    |    86 |    2 |
| 婷婷      | 数学    |    85 |    3 |
| 娜娜      | 数学    |    56 |    4 |
| 娜娜      | 英语    |    84 |    1 |
| 宋宋      | 英语    |    84 |    1 |
| 婷婷      | 英语    |    78 |    3 |
| 孙悟空    | 英语    |    68 |    4 |
| 娜娜      | 语文    |    94 |    1 |
| 孙悟空    | 语文    |    87 |    2 |
| 婷婷      | 语文    |    65 |    3 |
| 宋宋      | 语文    |    64 |    4 |
+-----------+---------+-------+------+

-- 4.按照每个人的总分进行升序排列,得到每个人总分名次的名次
select
name,sum_score,
-- 因为是整体按照学生的总分进行求名次,所有学生为1组,就不需要分组了
rank() over(order by sum_score desc) as rank
from 
(
    select 
    name,sum(score) as sum_score
    from stu
    group by name
) as t ;
+-----------+-----------+------+
| name      | sum_score | rank |
+-----------+-----------+------+
| 孙悟空    |       250 |    1 |
| 宋宋      |       234 |    2 |
| 娜娜      |       234 |    2 |
| 婷婷      |       228 |    4 |
+-----------+-----------+------+

【相同分数并列
(假设第一名有两个,排名就是并列第一,
就再单独比语文的成绩,然后数学,最后英语,
分数全部一样,按照学生名字的字典顺序,在前的为第一)】

-- 5.按照每个人的总分进行升序排列,得到每个人总分名次的名次
--方案1:利用窗口函数来列转行
 select 
 name,subject,score as math_score,english_score,chinese_score,sum_score,
 row_number()over(order by sum_score desc ,chinese_score desc ,score desc ,english_score desc,name asc) as num
 from 
 (
        select  
        name,subject,score,
        lead(score,1,0)over(partition by name order by subject) as english_score,
        lead(score,2,0)over(partition by name order by subject) as chinese_score, 
        sum(score)over(partition by name) as sum_score,
        row_number()over(partition by name) as num
        from stu 
 ) as tmp
 where num = 1
 
 -- 方案2:利用if判断来列转行
  select 
 name,chinese_score,match_score,english_score,sum_score,
 row_number()over(order by sum_score desc ,chinese_score desc ,match_score desc ,english_score desc,name asc) as num
 from 
 (
    select 
    name,
    sum(chinese_score) as chinese_score,
    sum(match_score) as match_score,
    sum(english_score) as english_score,
    sum(chinese_score) + sum(match_score) + sum(english_score) as sum_score
    from 
    (
        select  name,subject,
        if(subject = \'语文\',score,0) as chinese_score,
        if(subject = \'数学\',score,0) as match_score,
        if(subject = \'英语\',score,0) as english_score
        from stu 
    )as t 
    group by name
 ) as t1

+-----------+---------+------------+---------------+---------------+-----------+------+
| name      | subject | math_score | english_score | chinese_score | sum_score | num  |
+-----------+---------+------------+---------------+---------------+-----------+------+
| 孙悟空    | 数学    |         95 |            68 |            87 |       250 |    1 |
| 娜娜      | 数学    |         56 |            84 |            94 |       234 |    2 |
| 宋宋      | 数学    |         86 |            84 |            64 |       234 |    3 |
| 婷婷      | 数学    |         85 |            78 |            65 |       228 |    4 |
+-----------+---------+------------+---------------+---------------+-----------+------+


min,max,sum,avg,count

min(x)over()   -- 取窗口中x列的最小值
max(x)over()   -- 取窗口中x列的最大值
sum(x)over()   -- 取窗口中x列的数据总和
avg(x)over()   -- 取窗口中x列的数据平均值
count(x)over() -- 取窗口中x列有多少行

unbounded preceding
current row
1 following
1 PRECEDING 

rows between unbounded preceding and current row   --指在当前窗口中第一行到当前行的范围
rows between unbounded preceding and 1 following   --指在当前窗口中第一行到当前行下一行的范围 
rows between unbounded preceding and 1 PRECEDING   --指在当前窗口中第一行到当前行前一行的范围

LEAD() ,LAG()

-- LAG() 方法用来计算当前行向前数若干行的值。
LAG(expr, offset, default) OVER (partition_by_clause order_by_clause)

-- LEAD() 方法用来计算当前行向后数若干行的值。
LEAD(expr, offset, default]) OVER (partition_by_clause order_by_clause)

漏斗模型分析函数window_funnel

业务目标、到达路径,路径步骤、步骤人数,步骤之间的相对转换率和绝对转换率
每一种业务都有他的核心任务和流程,而流程的每一个步骤,都可能有用户流失。
所以如果把每一个步骤及其对应的数据(如UV)拼接起来,就会形成一个上大下小的漏斗形态,这就是漏斗模型


-- 准备数据
uid  event_id   event_action  event_time
u001,e1,view_detail_page,2022-11-01 01:10:21
u001,e2,add_bag_page,2022-11-01 01:11:13
u001,e3,collect_goods_page,2022-11-01 02:07:11
u002,e3,collect_goods_page,2022-11-01 01:10:21
u002,e4,order_detail_page,2022-11-01 01:11:13
u002,e5,pay_detail_page,2022-11-01 02:07:11
u002,e6,click_adver_page,2022-11-01 13:07:23
u002,e7,home_page,2022-11-01 08:18:12
u002,e8,list_detail_page,2022-11-01 23:34:29
u002,e1,view_detail_page,2022-11-01 11:25:32
u002,e2,add_bag_page,2022-11-01 12:41:21
u002,e3,collect_goods_page,2022-11-01 16:21:15
u002,e4,order_detail_page,2022-11-01 21:41:12
u003,e5,pay_detail_page,2022-11-01 01:10:21
u003,e6,click_adver_page,2022-11-01 01:11:13
u003,e7,home_page,2022-11-01 02:07:11
u001,e4,order_detail_page,2022-11-01 13:07:23
u001,e5,pay_detail_page,2022-11-01 08:18:12
u001,e6,click_adver_page,2022-11-01 23:34:29
u001,e7,home_page,2022-11-01 11:25:32
u001,e8,list_detail_page,2022-11-01 12:41:21
u001,e1,view_detail_page,2022-11-01 16:21:15
u001,e2,add_bag_page,2022-11-01 21:41:12
u003,e8,list_detail_page,2022-11-01 13:07:23
u003,e1,view_detail_page,2022-11-01 08:18:12
u003,e2,add_bag_page,2022-11-01 23:34:29
u003,e3,collect_goods_page,2022-11-01 11:25:32
u003,e4,order_detail_page,2022-11-01 12:41:21
u003,e5,pay_detail_page,2022-11-01 16:21:15
u003,e6,click_adver_page,2022-11-01 21:41:12
u004,e7,home_page,2022-11-01 01:10:21
u004,e8,list_detail_page,2022-11-01 01:11:13
u004,e1,view_detail_page,2022-11-01 02:07:11
u004,e2,add_bag_page,2022-11-01 13:07:23
u004,e3,collect_goods_page,2022-11-01 08:18:12
u004,e4,order_detail_page,2022-11-01 23:34:29
u004,e5,pay_detail_page,2022-11-01 11:25:32
u004,e6,click_adver_page,2022-11-01 12:41:21
u004,e7,home_page,2022-11-01 16:21:15
u004,e8,list_detail_page,2022-11-01 21:41:12
u005,e1,view_detail_page,2022-11-01 01:10:21
u005,e2,add_bag_page,2022-11-01 01:11:13
u005,e3,collect_goods_page,2022-11-01 02:07:11
u005,e4,order_detail_page,2022-11-01 13:07:23
u005,e5,pay_detail_page,2022-11-01 08:18:12
u005,e6,click_adver_page,2022-11-01 23:34:29
u005,e7,home_page,2022-11-01 11:25:32
u005,e8,list_detail_page,2022-11-01 12:41:21
u005,e1,view_detail_page,2022-11-01 16:21:15
u005,e2,add_bag_page,2022-11-01 21:41:12
u005,e3,collect_goods_page,2022-11-01 01:10:21
u006,e4,order_detail_page,2022-11-01 01:11:13
u006,e5,pay_detail_page,2022-11-01 02:07:11
u006,e6,click_adver_page,2022-11-01 13:07:23
u006,e7,home_page,2022-11-01 08:18:12
u006,e8,list_detail_page,2022-11-01 23:34:29
u006,e1,view_detail_page,2022-11-01 11:25:32
u006,e2,add_bag_page,2022-11-01 12:41:21
u006,e3,collect_goods_page,2022-11-01 16:21:15
u006,e4,order_detail_page,2022-11-01 21:41:12
u006,e5,pay_detail_page,2022-11-01 23:10:21
u006,e6,click_adver_page,2022-11-01 01:11:13
u007,e7,home_page,2022-11-01 02:07:11
u007,e8,list_detail_page,2022-11-01 13:07:23
u007,e1,view_detail_page,2022-11-01 08:18:12
u007,e2,add_bag_page,2022-11-01 23:34:29
u007,e3,collect_goods_page,2022-11-01 11:25:32
u007,e4,order_detail_page,2022-11-01 12:41:21
u007,e5,pay_detail_page,2022-11-01 16:21:15
u007,e6,click_adver_page,2022-11-01 21:41:12
u007,e7,home_page,2022-11-01 01:10:21
u008,e8,list_detail_page,2022-11-01 01:11:13
u008,e1,view_detail_page,2022-11-01 02:07:11
u008,e2,add_bag_page,2022-11-01 13:07:23
u008,e3,collect_goods_page,2022-11-01 08:18:12
u008,e4,order_detail_page,2022-11-01 23:34:29
u008,e5,pay_detail_page,2022-11-01 11:25:32
u008,e6,click_adver_page,2022-11-01 12:41:21
u008,e7,home_page,2022-11-01 16:21:15
u008,e8,list_detail_page,2022-11-01 21:41:12
u008,e1,view_detail_page,2022-11-01 01:10:21
u009,e2,add_bag_page,2022-11-01 01:11:13
u009,e3,collect_goods_page,2022-11-01 02:07:11
u009,e4,order_detail_page,2022-11-01 13:07:23
u009,e5,pay_detail_page,2022-11-01 08:18:12
u009,e6,click_adver_page,2022-11-01 23:34:29
u009,e7,home_page,2022-11-01 11:25:32
u009,e8,list_detail_page,2022-11-01 12:41:21
u009,e1,view_detail_page,2022-11-01 16:21:15
u009,e2,add_bag_page,2022-11-01 21:41:12
u009,e3,collect_goods_page,2022-11-01 01:10:21
u010,e4,order_detail_page,2022-11-01 01:11:13
u010,e5,pay_detail_page,2022-11-01 02:07:11
u010,e6,click_adver_page,2022-11-01 13:07:23
u010,e7,home_page,2022-11-01 08:18:12
u010,e8,list_detail_page,2022-11-01 23:34:29
u010,e5,pay_detail_page,2022-11-01 11:25:32
u010,e6,click_adver_page,2022-11-01 12:41:21
u010,e7,home_page,2022-11-01 16:21:15
u010,e8,list_detail_page,2022-11-01 21:41:12

-- 创建表
drop table if exists event_info_log;
create table event_info_log
(
uid varchar(20),
event_id varchar(20),
event_action varchar(20),
event_time datetime
)
DUPLICATE KEY(uid)
DISTRIBUTED BY HASH(uid) BUCKETS 1;

-- 通过本地文件的方式导入数据
curl \\
 -u root: \\
 -H "label:event_info_log" \\
 -H "column_separator:," \\
 -T /root/data/event_log.txt \\
 http://linux01:8040/api/test/event_info_log/_stream_load


封装、要素(时间范围,事件的排序时间依据,漏斗模型的事件链)

window_funnel(window, mode, timestamp_column, event1, event2, ... , eventN)

-- 漏斗分析函数搜索滑动时间窗口内最大的发生的最大事件序列长度。
-- window :滑动时间窗口大小,单位为秒。
-- mode  :保留,目前只支持default。-- 相邻两个事件之间没有时间间隔要求,并且相邻两个事件中可以做其他的事件
-- timestamp_column :指定时间列,类型为DATETIME, 滑动窗口沿着此列工作。
-- eventN :表示事件的布尔表达式。

select 
user_id,
window_funnel(3600*24, \'default\', event_time, event_id=\'e1\', event_id=\'e2\' , event_id=\'e4\', event_id=\'e5\') as step
from event_info_log 
group by user_id

+---------+------+
| user_id | step |
+---------+------+
| u006    |    4 |
| u007    |    2 |
| u005    |    3 |
| u004    |    3 |
| u010    |    0 |
| u001    |    3 |
| u003    |    2 |
| u002    |    3 |
| u008    |    3 |
| u009    |    2 |
+---------+------+



-- 算每一层级的转换率
select
\'购买转化漏斗\' as funnel_name,
sum(if(step >= 1 ,1,0)) as step1,
sum(if(step >= 2 ,1,0)) as step2,
sum(if(step >= 3 ,1,0)) as step3,
sum(if(step >= 4 ,1,0)) as step4,
round(sum(if(step >= 2 ,1,0))/sum(if(step >= 1 ,1,0)),2) as \'step1->step2_radio\',
round(sum(if(step >= 3 ,1,0))/sum(if(step >= 2 ,1,0)),2) as \'step2->step3_radio\',
round(sum(if(step >= 4 ,1,0))/sum(if(step >= 3 ,1,0)),2) as \'step3->step4_radio\'
from
(
select 
user_id,
window_funnel(3600*24, \'default\', report_date, event_id=\'e1\', event_id=\'e2\' , event_id=\'e4\', event_id=\'e5\') as step
from event_info_log 
where to_date(report_date) = \'2022-11-01\'
and event_id in(\'e1\',\'e4\',\'e5\',\'e2\')
group by user_id
) as t1 

-- res
+--------------------+-------+-------+-------+-------+--------------------+--------------------+--------------------+
| funnel_name        | step1 | step2 | step3 | step4 | step1->step2_radio | step2->step3_radio | step3->step4_radio |
+--------------------+-------+-------+-------+-------+--------------------+--------------------+--------------------+
| 购买转化漏斗       |     9 |     9 |     6 |     1 |                  1 |               0.67 |               0.17 |
+--------------------+-------+-------+-------+-------+--------------------+--------------------+--------------------+

Hive内部函数简介及查询语法

1.Hive内置函数:

在Hive中 系统给我们内置了很多函数 具体参考官方地址

  • 看下官网给我们的介绍:
SHOW FUNCTIONS; --查看所有内置函数
DESCRIBE FUNCTION <function_name>; --查看某个函数的描述
DESCRIBE FUNCTION EXTENDED <function_name>; --查看某个函数的具体使用方法
hive> DESCRIBE FUNCTION case;
OK
CASE a WHEN b THEN c [WHEN d THEN e]* [ELSE f] END - When a = b, returns c; when a = d, return e; else return f
Time taken: 0.006 seconds, Fetched: 1 row(s)
hive> DESCRIBE FUNCTION EXTENDED case;
OK
CASE a WHEN b THEN c [WHEN d THEN e]* [ELSE f] END - When a = b, returns c; when a = d, return e; else return f
Example:
 SELECT
 CASE deptno
   WHEN 1 THEN Engineering
   WHEN 2 THEN Finance
   ELSE admin
 END,
 CASE zone
   WHEN 7 THEN Americas
   ELSE Asia-Pac
 END
 FROM emp_details
Time taken: 0.008 seconds, Fetched: 13 row(s)
# DESCRIBE 可简写为desc
hive> desc FUNCTION EXTENDED case;
OK
CASE a WHEN b THEN c [WHEN d THEN e]* [ELSE f] END - When a = b, returns c; when a = d, return e; else return f
Example:
 SELECT
 CASE deptno
   WHEN 1 THEN Engineering
   WHEN 2 THEN Finance
   ELSE admin
 END,
 CASE zone
   WHEN 7 THEN Americas
   ELSE Asia-Pac
 END
 FROM emp_details
Time taken: 0.009 seconds, Fetched: 13 row(s)

下面我们了解下常用函数的使用方法:

# 为了方便测试 我们创建常用的dual表
hive> create table dual(x string);
OK
Time taken: 0.11 seconds
hive> insert into table dual values(‘‘);
Query ID = hadoop_20180702100505_f0566585-06b2-4c53-910a-b6a58791fc2d
Total jobs = 3
Launching Job 1 out of 3
...
OK
Time taken: 29.535 seconds
hive> select * from dual;
OK

Time taken: 0.147 seconds, Fetched: 1 row(s)
# 测试当前时间
hive> select current_date from dual;
OK
2018-07-02
Time taken: 0.111 seconds, Fetched: 1 row(s)
# 测试当前时间戳
hive> select current_timestamp from dual;
OK
2018-07-02 15:03:28.919
Time taken: 0.117 seconds, Fetched: 1 row(s)
# 测试substr函数 用于截取字符串
hive> desc function extended substr;
OK
substr(str, pos[, len]) - returns the substring of str that starts at pos and is of length len orsubstr(bin, pos[, len]) - returns the slice of byte array that starts at pos and is of length len
Synonyms: substring
pos is a 1-based index. If pos<0 the starting position is determined by counting backwards from the end of str.
Example:
   > SELECT substr(‘Facebook‘, 5) FROM src LIMIT 1;
  ‘book‘
  > SELECT substr(‘Facebook‘, -5) FROM src LIMIT 1;
  ‘ebook‘
  > SELECT substr(‘Facebook‘, 5, 1) FROM src LIMIT 1;
  ‘b‘
Time taken: 0.016 seconds, Fetched: 10 row(s)
hive> SELECT substr(‘helloworld‘,-5) FROM dual;
OK
world
Time taken: 0.171 seconds, Fetched: 1 row(s)
hive> SELECT substr(‘helloworld‘,5) FROM dual;
OK
oworld
Time taken: 0.12 seconds, Fetched: 1 row(s)
hive> SELECT substr(‘helloworld‘,5,3) FROM dual;
OK
owo
Time taken: 0.142 seconds, Fetched: 1 row(s)
# 测试函数concat 用于将字符连接起来
hive> desc function extended concat_ws;
OK
concat_ws(separator, [string | array(string)]+) - returns the concatenation of the strings separated by the separator.
Example:
  > SELECT concat_ws(‘.‘, ‘www‘, array(‘facebook‘, ‘com‘)) FROM src LIMIT 1;
  ‘www.facebook.com‘
Time taken: 0.019 seconds, Fetched: 4 row(s)
hive> select concat_ws(".","192","168","199","151") from dual;
OK
192.168.199.151
Time taken: 0.152 seconds, Fetched: 1 row(s)
# 测试函数split 用于拆分
hive> desc function extended split;
OK
split(str, regex) - Splits str around occurances that match regex
Example:
  > SELECT split(‘oneAtwoBthreeC‘, ‘[ABC]‘) FROM src LIMIT 1;
  ["one", "two", "three"]
Time taken: 0.021 seconds, Fetched: 4 row(s)
hive> select split("192.168.199.151","\.") from dual;
OK
["192","168","199","151"]
Time taken: 0.169 seconds, Fetched: 1 row(s)

2.Hive查询语法:

  • 简单select语法:
    # 简单select语法
    hive> select * from emp where deptno=10;
    OK
    7782    CLARK   MANAGER 7839    1981-6-9        2450.0  NULL    10
    7839    KING    PRESIDENT       NULL    1981-11-17      5000.0  NULL    10
    7934    MILLER  CLERK   7782    1982-1-23       1300.0  NULL    10
    Time taken: 0.899 seconds, Fetched: 3 row(s)
    hive> select * from emp where empno <= 7800;
    OK
    7369    SMITH   CLERK   7902    1980-12-17      800.0   NULL    20
    7499    ALLEN   SALESMAN        7698    1981-2-20       1600.0  300.0   30
    7521    WARD    SALESMAN        7698    1981-2-22       1250.0  500.0   30
    7566    JONES   MANAGER 7839    1981-4-2        2975.0  NULL    20
    7654    MARTIN  SALESMAN        7698    1981-9-28       1250.0  1400.0  30
    7698    BLAKE   MANAGER 7839    1981-5-1        2850.0  NULL    30
    7782    CLARK   MANAGER 7839    1981-6-9        2450.0  NULL    10
    7788    SCOTT   ANALYST 7566    1987-4-19       3000.0  NULL    20
    Time taken: 0.277 seconds, Fetched: 8 row(s)
    hive> select * from emp where salary between 1000 and 1500;
    OK
    7521    WARD    SALESMAN        7698    1981-2-22       1250.0  500.0   30
    7654    MARTIN  SALESMAN        7698    1981-9-28       1250.0  1400.0  30
    7844    TURNER  SALESMAN        7698    1981-9-8        1500.0  0.0     30
    7876    ADAMS   CLERK   7788    1987-5-23       1100.0  NULL    20
    7934    MILLER  CLERK   7782    1982-1-23       1300.0  NULL    10
    Time taken: 0.187 seconds, Fetched: 5 row(s)
    hive> select * from emp limit 5;
    OK
    7369    SMITH   CLERK   7902    1980-12-17      800.0   NULL    20
    7499    ALLEN   SALESMAN        7698    1981-2-20       1600.0  300.0   30
    7521    WARD    SALESMAN        7698    1981-2-22       1250.0  500.0   30
    7566    JONES   MANAGER 7839    1981-4-2        2975.0  NULL    20
    7654    MARTIN  SALESMAN        7698    1981-9-28       1250.0  1400.0  30
    Time taken: 0.154 seconds, Fetched: 5 row(s)
    hive> select * from emp where empno in(7566,7499);
    OK
    7499    ALLEN   SALESMAN        7698    1981-2-20       1600.0  300.0   30
    7566    JONES   MANAGER 7839    1981-4-2        2975.0  NULL    20
    Time taken: 0.153 seconds, Fetched: 2 row(s)
    hive> select * from emp where comm is not null;
    OK
    7499    ALLEN   SALESMAN        7698    1981-2-20       1600.0  300.0   30
    7521    WARD    SALESMAN        7698    1981-2-22       1250.0  500.0   30
    7654    MARTIN  SALESMAN        7698    1981-9-28       1250.0  1400.0  30
    7844    TURNER  SALESMAN        7698    1981-9-8        1500.0  0.0     30
    Time taken: 0.291 seconds, Fetched: 4 row(s)
  • 聚合函数及分组函数:
# 聚合函数及分组函数
#  max/min/count/sum/avg 特点:多进一出,进来很多条记录出去只有一条记录

# 查询部门编号为10的有多少条记录
hive> select count(1) from emp where deptno=10;
Query ID = hadoop_20180703142525_af460dc7-287b-41b2-8af3-ba27cc0ea6ce
Total jobs = 1
...
OK
3
Time taken: 38.951 seconds, Fetched: 1 row(s)
# 求最大工资,最小工资,平均工资,工资的和
hive> select max(salary),min(salary),avg(salary),sum(salary) from emp;
Query ID = hadoop_20180703142525_af460dc7-287b-41b2-8af3-ba27cc0ea6ce
Total jobs = 1
...
OK
5000.0  800.0   2073.214285714286       29025.0
Time taken: 23.748 seconds, Fetched: 1 row(s)

# 分组函数 group by
# 求部门的平均工资
# 注:select中出现的字段,如果没有出现在组函数/聚合函数中,必须出现在group by里面,否则就会产生报错
hive> select deptno,avg(salary) from emp group by deptno;
Query ID = hadoop_20180703142525_af460dc7-287b-41b2-8af3-ba27cc0ea6ce
Total jobs = 1
...
OK
10      2916.6666666666665
20      2175.0
30      1566.6666666666667
Time taken: 36.502 seconds, Fetched: 3 row(s)
# 求每个部门(deptno)、工作岗位(job)的最高工资(salary)
hive> select deptno,job,max(salary) from emp group by deptno,job;
Query ID = hadoop_20180703142525_af460dc7-287b-41b2-8af3-ba27cc0ea6ce
Total jobs = 1
...
OK
10      CLERK   1300.0
10      MANAGER 2450.0
10      PRESIDENT       5000.0
20      ANALYST 3000.0
20      CLERK   1100.0
20      MANAGER 2975.0
30      CLERK   950.0
30      MANAGER 2850.0
30      SALESMAN        1600.0
Time taken: 36.096 seconds, Fetched: 9 row(s)
# 查询平均工资大于2000的部门(使用having子句限定分组查询)
hive> select deptno,avg(salary) from emp group by deptno having avg(salary) >2000;
Query ID = hadoop_20180703142525_af460dc7-287b-41b2-8af3-ba27cc0ea6ce
Total jobs = 1
...
OK
10      2916.6666666666665
20      2175.0
Time taken: 24.71 seconds, Fetched: 2 row(s)

# case when then end(不会跑mr)
hive> select ename, salary, 
    > case
    > when salary > 1 and salary <= 1000 then ‘LOWER‘
    > when salary > 1000 and salary <= 2000 then ‘MIDDLE‘
    > when salary > 2000 and salary <= 4000 then ‘HIGH‘
    > ELSE ‘HIGHEST‘
    > end
    > from emp;
OK
SMITH   800.0   LOWER
ALLEN   1600.0  MIDDLE
WARD    1250.0  MIDDLE
JONES   2975.0  HIGH
MARTIN  1250.0  MIDDLE
BLAKE   2850.0  HIGH
CLARK   2450.0  HIGH
SCOTT   3000.0  HIGH
KING    5000.0  HIGHEST
TURNER  1500.0  MIDDLE
ADAMS   1100.0  MIDDLE
JAMES   950.0   LOWER
FORD    3000.0  HIGH
MILLER  1300.0  MIDDLE
Time taken: 0.096 seconds, Fetched: 14 row(s)
  • 多表join查询:
# 创建测试表
hive> create table a(
    > id int, name string
    > ) row format delimited fields terminated by ‘	‘;
OK
Time taken: 0.311 seconds
hive> create table b(
    > id int, age int
    > ) row format delimited fields terminated by ‘	‘;
OK
Time taken: 0.142 seconds
# insert或load数据 最后表数据如下
hive> select * from a;
OK
1       zhangsan
2       lisi
3       wangwu
hive> select * from b;
OK
1       20
2       30
4       40
Time taken: 0.2 seconds, Fetched: 3 row(s)

# 内连接 inner join = join 仅列出表1和表2符合连接条件的数据
hive> select a.id,a.name,b.age from a join b on a.id=b.id;
Query ID = hadoop_20180703142525_af460dc7-287b-41b2-8af3-ba27cc0ea6ce
Total jobs = 1
...
OK
1       zhangsan        20
2       lisi    30
Time taken: 24.415 seconds, Fetched: 2 row(s)
# 左外连接(left join) 以左边的为基准,左边的数据全部数据全部出现,如果没有出现就赋null值
hive> select a.id,a.name,b.age from a left join b on a.id=b.id;
Query ID = hadoop_20180703142525_af460dc7-287b-41b2-8af3-ba27cc0ea6ce
Total jobs = 1
...
OK
1       zhangsan        20
2       lisi    30
3       wangwu  NULL
Time taken: 26.218 seconds, Fetched: 3 row(s)
# 右外连接(right join) 以右表为基准
hive> select a.id,a.name,b.age from a right join b on a.id=b.id;
Query ID = hadoop_20180703142525_af460dc7-287b-41b2-8af3-ba27cc0ea6ce
Total jobs = 1
...
OK
1       zhangsan        20
2       lisi    30
NULL    NULL    40
Time taken: 24.027 seconds, Fetched: 3 row(s)
# 全连接(full join)相当于表1和表2的数据都显示,如果没有对应的数据,则显示Null.
hive> select a.id,a.name,b.age from a full join b on a.id=b.id;
Query ID = hadoop_20180703142525_af460dc7-287b-41b2-8af3-ba27cc0ea6ce
Total jobs = 1
...
OK
1       zhangsan        20
2       lisi    30
3       wangwu  NULL
NULL    NULL    40
Time taken: 32.94 seconds, Fetched: 4 row(s)
# 笛卡尔积(cross join) 没有连接条件 会针对表1和表2的每条数据做连接
hive> select a.id,a.name,b.age from a cross join b;
Warning: Map Join MAPJOIN[7][bigTable=a] in task ‘Stage-3:MAPRED‘ is a cross product
Query ID = hadoop_20180703142525_af460dc7-287b-41b2-8af3-ba27cc0ea6ce
Total jobs = 1
...
OK
1       zhangsan        20
1       zhangsan        30
1       zhangsan        40
2       lisi    20
2       lisi    30
2       lisi    40
3       wangwu  20
3       wangwu  30
3       wangwu  40
Time taken: 29.825 seconds, Fetched: 9 row(s)

3.利用Hive sql实现wordcount:

# 创建表 加载测试数据
hive> create table hive_wc(sentence string);
OK
Time taken: 0.149 seconds

[[email protected] ~]$ cat hive-wc.txt
hello,world,welcome
hello,welcome

hive> load data local inpath ‘/home/hadoop/hive-wc.txt‘ into table hive_wc;
Loading data to table default.hive_wc
Table default.hive_wc stats: [numFiles=1, totalSize=34]
OK
Time taken: 0.729 seconds
hive> select * from hive_wc;
OK
hello,world,welcome
hello,welcome
Time taken: 0.13 seconds, Fetched: 2 row(s)

# 获取每个单词 利用split分割
hive> select split(sentence,",") from hive_wc;
OK
["hello","world","welcome"]
["hello","welcome"]
Time taken: 0.163 seconds, Fetched: 2 row(s)
# explode把数组转成多行 结合split使用如下
hive> select explode(split(sentence,",")) from hive_wc;
OK
hello
world
welcome
hello
welcome
Time taken: 0.068 seconds, Fetched: 5 row(s)
# 做group by操作 一条语句即可实现wordcount统计
hive> select word, count(1) as c 
    > from (select explode(split(sentence,",")) as word from hive_wc) t
    > group by word ;
Query ID = hadoop_20180703142525_af460dc7-287b-41b2-8af3-ba27cc0ea6ce
Total jobs = 1
...
OK
hello   2
welcome 2
world   1
Time taken: 34.168 seconds, Fetched: 3 row(s)

以上是关于Doris -- 查询语法和内置函数的主要内容,如果未能解决你的问题,请参考以下文章

Clickhouse优点 缺点 / Doris 优点 缺点

Clickhouse优点 缺点 / Doris 优点 缺点

Doris和Mysql的批同步测试

Clickhouse与Doris的区别

Hive内部函数简介及查询语法

doris03-简单使用