Hive函数
Posted 星辰安安
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Hive函数相关的知识,希望对你有一定的参考价值。
文章目录
Hive 常用函数
关系运算
等值比较 = == <=>
不等值比较 != <>
区间比较: select * from default.students where id between 1500100001 and 1500100010;
空值/非空值判断:is null、is not null、nvl()、isnull()
like、rlike、regexp用法
数值计算
取整函数(四舍五入):round
向上取整:ceil
向下取整:floor
条件函数
- if: if(表达式,如果表达式成立的返回值,如果表达式不成立的返回值)
select if(1>0,1,0);
select if(1>0,if(-1>0,-1,1),0);
- COALESCE
select COALESCE(null,'1','2'); // 1 从左往右 一次匹配 直到非空为止
select COALESCE('1',null,'2'); // 1
- case when
select score
,case when score>120 then '优秀'
when score>100 then '良好'
when score>90 then '及格'
else '不及格'
end as pingfen
from default.score limit 20;
select name
,case name when "施笑槐" then "槐ge"
when "吕金鹏" then "鹏ge"
when "单乐蕊" then "蕊jie"
else "算了不叫了"
end as nickname
from default.students limit 10;
日期函数
select from_unixtime(1610611142,'YYYY/MM/dd HH:mm:ss');
select from_unixtime(unix_timestamp(),'YYYY/MM/dd HH:mm:ss');
// '2021年01月14日' -> '2021-01-14'
select from_unixtime(unix_timestamp('2021年01月14日','yyyy年MM月dd日'),'yyyy-MM-dd');
// "04牛2021数加16逼" -> "2021/04/16"
select from_unixtime(unix_timestamp("04牛2021数加16逼","MM牛yyyy数加dd逼"),"yyyy/MM/dd");
字符串函数
concat('123','456'); // 123456
concat('123','456',null); // NULL
select concat_ws('#','a','b','c'); // a#b#c
select concat_ws('#','a','b','c',NULL); // a#b#c 可以指定分隔符,并且会自动忽略NULL
select concat_ws("|",cast(id as string),name,cast(age as string),gender,clazz) from students limit 10;
select substring("abcdefg",1); // abcdefg HQL中涉及到位置的时候 是从1开始计数
// '2021/01/14' -> '2021-01-14'
select concat_ws("-",substring('2021/01/14',1,4),substring('2021/01/14',6,2),substring('2021/01/14',9,2));
select split("abcde,fgh",","); // ["abcde","fgh"]
select split("a,b,c,d,e,f",",")[2]; // c
select explode(split("abcde,fgh",",")); // abcde
// fgh
// 解析json格式的数据
select get_json_object('"name":"zhangsan","age":18,"score":["course_name":"math","score":100,"course_name":"english","score":60]',"$.score[0].score"); // 100
Hive 中的wordCount
create table words(
words string
)row format delimited fields terminated by '|';
// 数据
hello,java,hello,java,scala,python
hbase,hadoop,hadoop,hdfs,hive,hive
hbase,hadoop,hadoop,hdfs,hive,hive
select word,count(*) from (select explode(split(words,',')) word from words) a group by a.word;
// 结果
hadoop 4
hbase 2
hdfs 2
hello 2
hive 4
java 2
python 1
scala 1
Hive 开窗函数
好像给每一份数据 开一扇窗户 所以叫开窗函数, 在sql中有一类函数叫做聚合函数,例如sum()、avg()、max()等等,这类函数可以将多行数据按照规则聚集为一行,一般来讲聚集后的行数是要少于聚集前的行数的.但是有时我们想要既显示聚集前的数据,又要显示聚集后的数据,这时我们便引入了窗口函数.
测试数据
111,69,class1,department1
112,80,class1,department1
113,74,class1,department1
114,94,class1,department1
115,93,class1,department1
121,74,class2,department1
122,86,class2,department1
123,78,class2,department1
124,70,class2,department1
211,93,class1,department2
212,83,class1,department2
213,94,class1,department2
214,94,class1,department2
215,82,class1,department2
216,74,class1,department2
221,99,class2,department2
222,78,class2,department2
223,74,class2,department2
224,80,class2,department2
225,85,class2,department2
建表语句
create table new_score(
id int
,score int
,clazz string
,department string
) row format delimited fields terminated by ",";
row_number:无并列排名
用法: select xxxx, row_number() over(partition by 分组字段 order by 排序字段 desc) as rn from tb group by xxxx
dense_rank:有并列排名,并且依次递增
rank:有并列排名,不依次递增
PERCENT_RANK:(rank的结果-1)/(分区内数据的个数-1)
select id
,score
,clazz
,department
,row_number() over (partition by clazz order by score desc) as row_number_rk
,dense_rank() over (partition by clazz order by score desc) as dense_rk
,rank() over (partition by clazz order by score desc) as rk
,percent_rank() over (partition by clazz order by score desc) as percent_rk
from new_score;
id score clazz department row_number_rk dense_rk rk percent_rk
114 94 class1 department1 1 1 1 0.0
214 94 class1 department2 2 1 1 0.0
213 94 class1 department2 3 1 1 0.0
211 93 class1 department2 4 2 4 0.3
115 93 class1 department1 5 2 4 0.3
212 83 class1 department2 6 3 6 0.5
215 82 class1 department2 7 4 7 0.6
112 80 class1 department1 8 5 8 0.7
113 74 class1 department1 9 6 9 0.8
216 74 class1 department2 10 6 9 0.8
111 69 class1 department1 11 7 11 1.0
221 99 class2 department2 1 1 1 0.0
122 86 class2 department1 2 2 2 0.125
225 85 class2 department2 3 3 3 0.25
224 80 class2 department2 4 4 4 0.375
123 78 class2 department1 5 5 5 0.5
222 78 class2 department2 6 5 5 0.5
121 74 class2 department1 7 6 7 0.75
223 74 class2 department2 8 6 7 0.75
124 70 class2 department1 9 7 9 1.0
LAG(col,n):往前第n行数据
LEAD(col,n):往后第n行数据
FIRST_VALUE:取分组内排序后,截止到当前行,第一个值
LAST_VALUE:取分组内排序后,截止到当前行,最后一个值,对于并列的排名,取最后一个
NTILE(n):对分区内数据再分成n组,然后打上组号
select id
,score
,clazz
,department
,lag(id,2) over (partition by clazz order by score desc) as lag_num
,LEAD(id,2) over (partition by clazz order by score desc) as lead_num
,FIRST_VALUE(id) over (partition by clazz order by score desc) as first_v_num
,LAST_VALUE(id) over (partition by clazz order by score desc) as last_v_num
,NTILE(3) over (partition by clazz order by score desc) as ntile_num
from new_score;
id score clazz department lag_num lead_num first_v_num last_v_num ntile_num
114 94 class1 department1 NULL 213 114 213 1
214 94 class1 department2 NULL 211 114 213 1
213 94 class1 department2 114 115 114 213 1
211 93 class1 department2 214 212 114 115 1
115 93 class1 department1 213 215 114 115 2
212 83 class1 department2 211 112 114 212 2
215 82 class1 department2 115 113 114 215 2
112 80 class1 department1 212 216 114 112 2
113 74 class1 department1 215 111 114 216 3
216 74 class1 department2 112 NULL 114 216 3
111 69 class1 department1 113 NULL 114 111 3
221 99 class2 department2 NULL 225 221 221 1
122 86 class2 department1 NULL 224 221 122 1
225 85 class2 department2 221 123 221 225 1
224 80 class2 department2 122 222 221 224 2
123 78 class2 department1 225 121 221 222 2
222 78 class2 department2 224 223 221 222 2
121 74 class2 department1 123 124 221 223 3
223 74 class2 department2 222 NULL 221 223 3
124 70 class2 department1 121 NULL 221 124 3
Hive 行转列
lateral view explode
create table testArray2(
name string,
weight array<string>
)row format delimited
fields terminated by '\\t'
COLLECTION ITEMS terminated by ',';
志凯 "150","170","180"
上单 "150","180","190"
select name,col1 from testarray2 lateral view explode(weight) t1 as col1;
志凯 150
志凯 170
志凯 180
上单 150
上单 180
上单 190
select key from (select explode(map('key1',1,'key2',2,'key3',3)) as (key,value)) t;
key1
key2
key3
select name,col1,col2 from testarray2 lateral view explode(map('key1',1,'key2',2,'key3',3)) t1 as col1,col2;
志凯 key1 1
志凯 key2 2
志凯 key3 3
上单 key1 1
上单 key2 2
上单 key3 3
select name,pos,col1 from testarray2 lateral view posexplode(weight) t1 as pos,col1;
志凯 0 150
志凯 1 170
志凯 2 180
上单 0 150
上单 1 180
上单 2 190
Hive 列转行
// testLieToLine
name col1
志凯 150
志凯 170
志凯 180
上单 150
上单 180
上单 190
create table testLieToLine(
name string,
col1 int
)row format delimited
fields terminated by '\\t';
select name,collect_list(col1) from testLieToLine group by name;
// 结果
上单 ["150","180","190"]
志凯 ["150","170","180"]
select t1.name
,collect_list(t1.col1)
from (
select name
,col1
from testarray2
lateral view explode(weight) t1 as col1
) t1 group by t1.name;
Hive自定义函数UserDefineFunction
UDF:一进一出
- 创建maven项目,并加入依赖
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>1.2.1</version>
</dependency>
- 编写代码,继承org.apache.hadoop.hive.ql.exec.UDF,实现evaluate方法,在evaluate方法中实现自己的逻辑
import org.apache.hadoop.hive.ql.exec.UDF;
public class HiveUDF extends UDF
// hadoop => #hadoop#
public String evaluate(String col1)
// 给传进来的数据 左边加上 # 号 右边加上 $
String result = "#" + col1 + "$";
return result;
- 打成jar包并上传至Linux虚拟机
- 在hive shell中,使用
add jar 路径
将jar包作为资源添加到hive环境中
add jar /usr/local/soft/jars/HiveUDF2-1.0.jar;
- 使用jar包资源注册一个临时函数,fxxx1是你的函数名
create temporary function fxxx1 as 'MyUDF';
- 使用函数名处理数据
select fxx1(name) as fxx_name from students limit 10;
#施笑槐$
#吕金鹏$
#单乐蕊$
#葛德曜$
#宣谷芹$
#边昂雄$
#尚孤风$
#符半双$
#沈德昌$
#羿彦昌$
UDTF:一进多出
"key1:value1,key2:value2,key3:value3"
key1 value1
key2 value2
key3 value3
方法一:使用 explode+split
select split(t.col1,":")[0],split(t.col1,":")[1]
from (select explode(split("key1:value1,key2:value2,key3:value3",",")) as col1) t;
方法二:自定UDTF
- 代码
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import java.util.ArrayList;
public class HiveUDTF extends GenericUDTF
// 指定输出的列名 及 类型
@Override
public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException
ArrayList<String> filedNames = new ArrayList<String>();
ArrayList<ObjectInspector> filedObj = new ArrayList<ObjectInspector>();
filedNames.add("col1");
filedObj.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
filedNames.add("col2");
filedObj.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
<以上是关于Hive函数的主要内容,如果未能解决你的问题,请参考以下文章