(五)Hive的UDF、UDAF和UDTF自定义函数
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了(五)Hive的UDF、UDAF和UDTF自定义函数相关的知识,希望对你有一定的参考价值。
参考技术A order by(全局排序):不经常用;sort by + distribute by:经常用
-- Request 3 reduce tasks so the per-reducer behaviour of SORT BY is visible.
-- The Hadoop property name is plural: mapreduce.job.reduces.
set mapreduce.job.reduces=3;
-- SORT BY orders rows inside each reducer only; the overall output is not globally ordered.
select * from emp sort by empno desc;
-- DISTRIBUTE BY sends rows with the same sal to the same reducer; SORT BY then orders each reducer's rows.
select sal,deptno from emp distribute by sal sort by sal;
cluster by:只能升序排序,相当于对同一列同时使用 distribute by 和 sort by
-- CLUSTER BY sal is shorthand for DISTRIBUTE BY sal SORT BY sal (ascending only).
select sal,deptno from emp cluster by sal;
-- Join the strings with '_' as the separator.
select concat_ws('_','sdfsdf','sdfs','123');
-- Cast a zero-padded numeric string to BIGINT.
select cast('000000000000123123123123' as bigint);
-- Extract the host part of a URL.
select parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1','HOST');
-- The first argument must be a valid JSON document (enclosed in braces); otherwise the result is NULL.
select get_json_object('{"name":"jack","age":"20"}','$.name');
实现wordcount
-- Movie table with two string columns; the queries below split `type` on ','.
create table t_movie(
name string,
type string
)
row format delimited fields terminated by ' '
lines terminated by '\n'
location '/data/inner/ODS/01/t_movie';
-- Load the raw file from its HDFS path into the table.
load data inpath '/data/inner/RAW/01/t_movie/movie' into table t_movie;
select * from t_movie;
-- split() turns the comma-separated type string into an array.
select name,split(type,',') from t_movie;
-- explode() emits one row per array element.
select explode(split(type,',')) from t_movie;
-- LATERAL VIEW joins each exploded element (tcol) back to the name of its source row.
select name,tcol from t_movie LATERAL VIEW explode(split(type,',')) typetable AS tcol;
-- Single-column table; each row holds one value in `wordline`, which the queries below split on ' '.
create table t_wordcount(
wordline string
)
row format delimited fields terminated by '\n'
location '/data/inner/ODS/01/t_wordcount';
-- Load the raw text file from its HDFS path into the table.
load data inpath '/data/inner/RAW/01/t_wordcount/harry' into table t_wordcount;
-- Word count, variant 1: explode the words through LATERAL VIEW, then group and count.
select word,count(word) from t_wordcount lateral view explode(split(wordline,' ')) eswtable as word group by word;
-- Word count, variant 2: explode the words in a subquery, then group and count.
select word,count(word) from (select explode(split(wordline,' ')) word from t_wordcount) esw group by word;
1,继承类
2,重写方法(实现逻辑)
3,打包
4,上传,创建函数
<!-- Maven dependencies for the first example: Hadoop 2.6.5 client and HDFS, plus hive-exec 1.2.1. -->
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.6.5</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.6.5</version>
</dependency>
<!-- hive-exec: presumably the artifact that supplies the UDF/UDAF/UDTF base classes used below; confirm. -->
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>1.2.1</version>
</dependency>
</dependencies>
UDF:
// Sketch of a simple UDF: a class extending UDF that declares evaluate(String).
// Braces and the method body are elided in the original text.
public class UDFHello extends UDF
public static String evaluate(String value)
..................
# Create the HDFS directory that will hold the UDF jar (-p also creates missing parent directories).
hadoop fs -mkdir -p /bdp/hive/bin/lib
# Upload the jar from the current local directory into that HDFS directory.
hadoop fs -put ./demouf.jar /bdp/hive/bin/lib
-- Register the function sxt_hello, backed by class com.vincent.UDFHello from the jar on HDFS.
-- The URI uses the canonical hdfs:/// form: empty authority followed by the absolute path.
create function sxt_hello as 'com.vincent.UDFHello' using jar 'hdfs:///bdp/hive/bin/lib/demouf.jar';
UDAF:
// Sketch of a UDAF evaluator; braces and method bodies are elided in the original text.
public static class SxtInnerClass implements UDAFEvaluator
@Override
// Original note: obtains Hive cluster information.
// NOTE(review): the body is not shown, so this cannot be confirmed from here.
public void init()
// Input side: receives one value; the original note describes this as the map-side step.
public boolean iterate(String value)
// Output side: the original note describes this as the reduce-side output.
public int terminatePartial()
public String terminate()
UDTF:
// Sketch of a UDTF; braces and method bodies are elided in the original text.
public class ExplodeMap extends GenericUDTF
@Override
// Produces the results: this is where the row-processing logic goes.
public void process(Object[] args)
@Override
// Input side: mainly used to check that the incoming arguments are acceptable.
public StructObjectInspector initialize(ObjectInspector[] args)
@Override
// Called on shutdown.
public void close()
Hive--10---函数----自定义函数 (UDF-UDAF-UDTF)
提示:文章写完后,目录可以自动生成,如何生成可参考右边的帮助文档
文章目录
自定义函数
官网地址:https://cwiki.apache.org/confluence/display/Hive/HivePlugins
1.函数分类
- UDF—普通函数
- UDAF—聚合函数
- UDTF—炸裂函数
2.编程步骤
3.自定义 UDF函数-----案例
<!-- Maven dependencies for the GenericUDF/GenericUDTF examples: hive-exec 3.1.2 with Hadoop 3.1.3. -->
<!-- All <dependency> entries are direct children of a single <dependencies> element; -->
<!-- a nested <dependencies> element is not valid in a POM. -->
<dependencies>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>3.1.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>3.1.3</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>3.1.3</version>
</dependency>
</dependencies>
MyUDF
package com.atguigu.udf;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
public class MyUDF extends GenericUDF
//校验数据参数个数
@Override
public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException
if (arguments.length != 1)
throw new UDFArgumentException("参数个数不为1");
return PrimitiveObjectInspectorFactory.javaIntObjectInspector;
//处理数据
@Override
public Object evaluate(DeferredObject[] arguments) throws HiveException
//1.取出输入数据
String input = arguments[0].get().toString();
//2.判断输入数据是否为null
if (input == null)
return 0;
//3.返回输入数据的长度
return input.length();
@Override
public String getDisplayString(String[] children)
return "";
4.自定义 UDTF函数-----案例
MyUDTF
package com.atguigu.udf;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import java.util.ArrayList;
import java.util.List;
/**
* 输入数据:hello,atguigu,hive
* 输出数据:
* hello
* atguigu
* hive
*/
public class MyUDTF extends GenericUDTF
//输出数据的集合
private ArrayList<String> outPutList = new ArrayList<>();
@Override
public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException
//输出数据的默认列名,可以别别名覆盖
List<String> fieldNames = new ArrayList<>();
fieldNames.add("word");
//输出数据的类型
List<ObjectInspector> fieldOIs = new ArrayList<>();
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
//最终返回值
return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
//处理输入数据:hello,atguigu,hive
@Override
public void process(Object[] args) throws HiveException
//1.取出输入数据
String input = args[0].toString();
//2.按照","分割字符串
String[] words = input.split(",");
//3.遍历数据写出
for (String word : words)
//清空集合
outPutList.clear();
//将数据放入集合
outPutList.add(word);
//输出数据
forward(outPutList);
//收尾方法
@Override
public void close() throws HiveException
MyUDTF2
package com.atguigu.udf;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import java.util.ArrayList;
import java.util.List;
/**
* 输入数据:hello,atguigu:hello,hive
* 输出数据:
* hello atguigu
* hello hive
*/
public class MyUDTF2 extends GenericUDTF
//输出数据的集合
private ArrayList<String> outPutList = new ArrayList<>();
@Override
public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException
//输出数据的默认列名,可以别别名覆盖
List<String> fieldNames = new ArrayList<>();
fieldNames.add("word1");
fieldNames.add("word2");
//输出数据的类型
List<ObjectInspector> fieldOIs = new ArrayList<>();
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
//最终返回值
return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
//处理输入数据:hello,atguigu:hello,hive
@Override
public void process(Object[] args) throws HiveException
//1.取出输入数据
String input = args[0].toString();
//2.按照","分割字符串
String[] fields = input.split(":");
//3.遍历数据写出
for (String field : fields)
//清空集合
outPutList.clear();
//将field按照','分割
String[] words = field.split(",");
//将words放入集合
outPutList.add(words[0]);
outPutList.add(words[1]);
//写出数据
forward(outPutList);
//收尾方法
@Override
public void close() throws HiveException
以上是关于(五)Hive的UDF、UDAF和UDTF自定义函数的主要内容,如果未能解决你的问题,请参考以下文章