hive窗口函数及示例
Posted Z-hhhhh
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了hive窗口函数及示例相关的知识,希望对你有一定的参考价值。
窗口函数
其实只有over()才是真正的窗口函数,只是over常于其他函数组合使用,采用强大的效果。
一、over()的语法
分析函数 over(partition by 列名 order by 列名 rows between 开始位置 and 结束位置)
1.1、窗口控制,既window子句
over(partition by col order by 排序字段 rows between 1 preceding and 1 fllowing)
二、常用窗口函数
- sum(col) over() : 分组对col累计求和
- count(col) over() : 分组对col累计
- min(col) over() : 分组对col求最小值
- max(col) over() : 分组求col的最大值
- avg(col) over() : 分组求col列的平均值
- first_value(col) over() : 某分组排序后的第一个col值
- last_value(col) over() : 某分组排序后的最后一个col值
- lag(col,n,DEFAULT) : 统计往前n行的col值,n可选,默认为1,DEFAULT当往上第n行为NULL时候,取默认值,如不指定,则为NULL
- lead(col,n,DEFAULT) : 统计往后n行的col值,n可选,默认为1,DEFAULT当往下第n行为NULL时候,取默认值,如不指定,则为NULL
- ntile(n) : 用于将分组数据按照顺序切分成n片,返回当前切片值。注意:n必须为int类型
- row_number() over() : 排名函数,不会重复,适合于生成主键或者不并列排名
- rank() over() : 排名函数,有并列名次,名次不连续。如:1,1,3
- dense_rank() over() : 排名函数,有并列名次,名次连续。如:1,1,2
测试数据
create table shop_data(
shop_id int,
stat_date string,
ordamt double
)
row format delimited
fields terminated by '\\t'
lines terminated by '\\n'
stored as textfile;
-- 插入数据
insert into shop_data values
(10026,'201901230030',5170),
(10026,'201901230100',5669),
(10026,'201901230130',2396),
(10026,'201901230200',1498),
(10026,'201901230230',1997),
(10026,'201901230300',1188),
(10026,'201901230330',598),
(10026,'201901230400',479),
(10026,'201901230430',1587),
(10026,'201901230530',799),
(10027,'201901230030',2170),
(10027,'201901230100',1623),
(10027,'201901230130',3397),
(10027,'201901230200',1434),
(10027,'201901230230',1001),
(10028,'201901230300',1687),
(10028,'201901230330',1298),
(10028,'201901230400',149),
(10029,'201901230430',2587),
(10029,'201901230530',589);
三、案例
count
select shop_id,stat_date,ordamt,
-- 以符合条件的所有行作为窗口
count(shop_id) over() as count1,
-- 以按shop_id分组的所有行作为窗口
count(shop_id) over(partition by shop_id) as count2,
-- 以按shop_id分组、按stat_date排序的所有行作为窗口
count(shop_id) over(partition by shop_id order by stat_date) as count3,
-- 以按shop_id分组、按stat_date排序、按当前行+往前1行+往后2行的行作为窗口
count(ordamt) over(partition by shop_id order by stat_date rows between 1 preceding and 2 following) as count4,
-- 以按shop_id分组、按stat_date排序、按从起点到末尾,默认从起点到末尾和count2结果相同
count(ordamt) over(partition by shop_id order by stat_date rows between unbounded preceding and unbounded following) as count5,
-- 以按shop_id分组、按stat_date排序、按从起点到当前行的前一行
count(ordamt) over(partition by shop_id order by stat_date rows between unbounded preceding and 1 preceding) as count6,
-- 以按shop_id分组、按stat_date排序、按从起点到当前行
count(ordamt) over(partition by shop_id order by stat_date rows between unbounded preceding and current row) as count7,
-- 以按shop_id分组、按stat_date排序、按从当前行到末尾
count(ordamt) over(partition by shop_id order by stat_date rows between current row and unbounded following) as count8,
-- 以按shop_id分组、按stat_date排序、按从当前行往后一行到末尾
count(ordamt) over(partition by shop_id order by stat_date rows between 1 following and unbounded following) as count9,
-- 以按shop_id分组、按stat_date排序、按从当前行往后一行到当前行往后2行
count(ordamt) over(partition by shop_id order by stat_date rows between 1 following and 2 following) as count10
from shop_data;
--查询结果如下
+----------+---------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+----------+
| shop_id | stat_date | ordamt | count1 | count2 | count3 | count4 | count5 | count6 | count7 | count8 | count9 | count10 |
+----------+---------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+----------+
| 10026 | 201901230030 | 5170.0 | 20 | 10 | 1 | 3 | 10 | 0 | 1 | 10 | 9 | 2 |
| 10026 | 201901230100 | 5669.0 | 20 | 10 | 2 | 4 | 10 | 1 | 2 | 9 | 8 | 2 |
| 10026 | 201901230130 | 2396.0 | 20 | 10 | 3 | 4 | 10 | 2 | 3 | 8 | 7 | 2 |
| 10026 | 201901230200 | 1498.0 | 20 | 10 | 4 | 4 | 10 | 3 | 4 | 7 | 6 | 2 |
| 10026 | 201901230230 | 1997.0 | 20 | 10 | 5 | 4 | 10 | 4 | 5 | 6 | 5 | 2 |
| 10026 | 201901230300 | 1188.0 | 20 | 10 | 6 | 4 | 10 | 5 | 6 | 5 | 4 | 2 |
| 10026 | 201901230330 | 598.0 | 20 | 10 | 7 | 4 | 10 | 6 | 7 | 4 | 3 | 2 |
| 10026 | 201901230400 | 479.0 | 20 | 10 | 8 | 4 | 10 | 7 | 8 | 3 | 2 | 2 |
| 10026 | 201901230430 | 1587.0 | 20 | 10 | 9 | 3 | 10 | 8 | 9 | 2 | 1 | 1 |
| 10026 | 201901230530 | 799.0 | 20 | 10 | 10 | 2 | 10 | 9 | 10 | 1 | 0 | 0 |
| 10027 | 201901230030 | 2170.0 | 20 | 5 | 1 | 3 | 5 | 0 | 1 | 5 | 4 | 2 |
| 10027 | 201901230100 | 1623.0 | 20 | 5 | 2 | 4 | 5 | 1 | 2 | 4 | 3 | 2 |
| 10027 | 201901230130 | 3397.0 | 20 | 5 | 3 | 4 | 5 | 2 | 3 | 3 | 2 | 2 |
| 10027 | 201901230200 | 1434.0 | 20 | 5 | 4 | 3 | 5 | 3 | 4 | 2 | 1 | 1 |
| 10027 | 201901230230 | 1001.0 | 20 | 5 | 5 | 2 | 5 | 4 | 5 | 1 | 0 | 0 |
| 10028 | 201901230300 | 1687.0 | 20 | 3 | 1 | 3 | 3 | 0 | 1 | 3 | 2 | 2 |
| 10028 | 201901230330 | 1298.0 | 20 | 3 | 2 | 3 | 3 | 1 | 2 | 2 | 1 | 1 |
| 10028 | 201901230400 | 149.0 | 20 | 3 | 3 | 2 | 3 | 2 | 3 | 1 | 0 | 0 |
| 10029 | 201901230430 | 2587.0 | 20 | 2 | 1 | 2 | 2 | 0 | 1 | 2 | 1 | 1 |
| 10029 | 201901230530 | 589.0 | 20 | 2 | 2 | 2 | 2 | 1 | 2 | 1 | 0 | 0 |
+----------+---------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+----------+
sum
select
shop_id, stat_date, ordamt,
-- 以按shop_id分组、按stat_date排序、统计每个商品截止到当前时间的销售总额,默认从起点到当前行
sum(ordamt) over(partition by shop_id order by stat_date) as sum_amt1,
-- 以按shop_id分组、按stat_date排序、统计每个商品前半小时到后一小时的销售额(按当前行+往前1行+往后2行的行作为窗口)
sum(ordamt) over(partition by shop_id order by stat_date rows between 1 preceding and 2 following) as sum_amt2,
-- 以按shop_id分组、按stat_date排序、统计每个商品的销售总额(从起点到末尾)
sum(ordamt) over(partition by shop_id order by stat_date rows between unbounded preceding and unbounded following) as sum_amt3,
-- 以按shop_id分组、按stat_date排序、统计截止到前半小时的销售总额(从起点到当前行的前一行)
sum(ordamt) over(partition by shop_id order by stat_date rows between unbounded preceding and 1 preceding) as sum_amt4,
-- 以按shop_id分组、按stat_date排序、统计每个商品截止到当前时间的销售总额,默认从起点到当前行(从起点到当前行)
sum(ordamt) over(partition by shop_id order by stat_date rows between unbounded preceding and current row) as sum_amt5,
-- 以按shop_id分组、按stat_date排序、统计当前时间及之后的销售总额(从当前行的末尾)
sum(ordamt) over(partition by shop_id order by stat_date rows between current row and unbounded following) as sum_amt6,
-- 以按shop_id分组、按stat_date排序、统计当前时间的后半小时及之后的销售额(当前行后一行到末尾)
sum(ordamt) over(partition by shop_id order by stat_date rows between 1 following and unbounded following) as sum_amt7,
-- 以按shop_id分组、按stat_date排序、统计当前时间后半小时到后一小时之间的销售额(按从当前行往后一行到当前行往后2行)
sum(ordamt) over(partition by shop_id order by stat_date rows between 1 following and 2 following) as sum_amt8
from shop_data;
查询结果
+----------+---------------+---------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
| shop_id | stat_date | ordamt | sum_amt1 | sum_amt2 | sum_amt3 | sum_amt4 | sum_amt5 | sum_amt6 | sum_amt7 | sum_amt8 |
+----------+---------------+---------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
| 10026 | 201901230030 | 5170.0 | 5170.0 | 13235.0 | 21381.0 | NULL | 5170.0 | 21381.0 | 16211.0 | 8065.0 |
| 10026 | 201901230100 | 5669.0 | 10839.0 | 14733.0 | 21381.0 | 5170.0 | 10839.0 | 16211.0 | 10542.0 | 3894.0 |
| 10026 | 201901230130 | 2396.0 | 13235.0 | 11560.0 | 21381.0 | 10839.0 | 13235.0 | 10542.0 | 8146.0 | 3495.0 |
| 10026 | 201901230200 | 1498.0 | 14733.0 | 7079.0 | 21381.0 | 13235.0 | 14733.0 | 8146.0 | 6648.0 | 3185.0 |
| 10026 | 201901230230 | 1997.0 | 16730.0 | 5281.0 | 21381.0 | 14733.0 | 16730.0 | 6648.0 | 4651.0 | 1786.0 |
| 10026 | 201901230300 | 1188.0 | 17918.0 | 4262.0 | 21381.0 | 16730.0 | 17918.0 | 4651.0 | 3463.0 | 1077.0 |
| 10026 | 201901230330 | 598.0 | 18516.0 | 3852.0 | 21381.0 | 17918.0 | 18516.0 | 3463.0 | 2865.0 | 2066.0 |
| 10026 | 201901230400 | 479.0 | 18995.0 | 3463.0 | 21381.0 | 18516.0 | 18995.0 | 2865.0 | 2386.0 | 2386.0 |
| 10026 | 201901230430 | 1587.0 | 20582.0 | 2865.0 | 21381.0 | 18995.0 | 20582.0 | 2386.0 | 799.0 | 799.0 |
| 10026 | 201901230530 | 799.0 | 21381.0 | 2386.0 | 21381.0 | 20582.0 | 21381.0 | 799.0 | NULL | NULL |
| 10027 | 201901230030 | 2170.0 | 2170.0 | 7190.0 | 9625.0 | NULL | 2170.0 | 9625.0 | 7455.0 | 5020.0 |
| 10027 | 201901230100 | 1623.0 | 3793.0 | 8624.0 | 9625.0 | 2170.0 | 3793.0 | 7455.0 | 5832.0 | 4831.0 |
| 10027 | 201901230130 | 3397.0 | 7190.0 | 7455.0 | 9625.0 | 3793.0 | 7190.0 | 5832.0 | 2435.0 | 2435.0 |
| 10027 | 201901230200 | 1434.0 | 8624.0 | 5832.0 | 9625.0 | 7190.0 | 8624.0 | 2435.0 | 1001.0 | 1001.0 |
| 10027 | 201901230230 | 1001.0 | 9625.0 | 2435.0 | 9625.0 | 8624.0 | 9625.0 | 1001.0 | NULL | NULL |
| 10028 | 201901230300 | 1687.0 | 1以上是关于hive窗口函数及示例的主要内容,如果未能解决你的问题,请参考以下文章