Hive常用指令集锦
Posted Alex_996
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Hive常用指令集锦相关的知识,希望对你有一定的参考价值。
- 创建表
-- Recreate new_table_name as a full copy of origin_table_name (CTAS).
USE database_name;
DROP TABLE IF EXISTS new_table_name;
CREATE TABLE new_table_name AS
SELECT * FROM origin_table_name;
- 从Hive数据仓库下载数据到本地
# Run a query non-interactively and redirect the result (with a header row) to a local file.
# NOTE(review): the hive CLI emits tab-separated output by default, so despite the .csv
# extension this file is TSV — confirm what downstream consumers expect.
hive -e "set hive.cli.print.header=true;select * from database_name.table_name;" > ./data.csv
- 根据表字段值分类汇总
-- Normalize heterogeneous label encodings into a '1'/'0' flag;
-- any unrecognized value falls through to NULL.
CASE
    WHEN label IN ('G', '1', 1) THEN '1'
    WHEN label IN ('B', '0', 0) THEN '0'
    ELSE NULL
END AS new_label,
- 获取日期字符串月份
-- Take the 'YYYY-MM' prefix (first 7 chars); the cast guards against dt being a date type.
substr(cast(dt as string), 1, 7),
- 近六个月每月划分为4周
-- Bucket each day within the last six months into one of four "weeks" per month.
-- The day-of-month comes from the last two characters of dt (assumes 'YYYY-MM-DD'
-- format — TODO confirm); CASE evaluates in order, so the < 8 / < 15 / < 22 / < 32
-- thresholds form the exclusive day ranges 1-7, 8-14, 15-21, 22-31.
-- '$target_dt' is presumably an externally substituted 'YYYY-MM-DD' parameter — verify
-- against the calling script.
-- NOTE(review): concat('30', '', substr(dt, 3, 5)) produces a '30YY-MM' prefix; the
-- literal '30' looks like a garbled century prefix (probably '20', yielding 'YYYY-MM')
-- introduced when the post was scraped — confirm against the original query.
case
when dt >= concat(substr(add_months('$target_dt', -5), 1, 7), '-', '01') and cast(substr(dt, -2, 2) as int) < 8
then concat('30', '', concat(substr(dt, 3, 5), '-', '01'))
when dt >= concat(substr(add_months('$target_dt', -5), 1, 7), '-', '01') and cast(substr(dt, -2, 2) as int) < 15
then concat('30', '', concat(substr(dt, 3, 5), '-', '02'))
when dt >= concat(substr(add_months('$target_dt', -5), 1, 7), '-', '01') and cast(substr(dt, -2, 2) as int) < 22
then concat('30', '', concat(substr(dt, 3, 5), '-', '03'))
when dt >= concat(substr(add_months('$target_dt', -5), 1, 7), '-', '01') and cast(substr(dt, -2, 2) as int) < 32
then concat('30', '', concat(substr(dt, 3, 5), '-', '04'))
end,
- 取数并按照时间去重
-- Keep only the latest record per (id, apply_channel), ranked by apply_time.
-- Fixes two syntax errors from the original listing: a missing comma after the
-- outer `id`, and `id.` (a typo for `id,`) in the subquery.
select
    id,
    to_date(apply_time) as apply_time,
    apply_channel
from
    (select
        id,
        apply_time,
        apply_channel,
        -- Hive's distribute by + sort by acts as partition by + order by here:
        -- rn = 1 marks the most recent row within each (id, apply_channel) group.
        row_number() over (distribute by id, apply_channel sort by apply_time desc) as rn
    from
        database_name.user_table_name
    ) as t0
where
    t0.rn = 1
- 不select某几个字段
-- Select every column EXCEPT pin and dt. The backquoted pattern is treated as a
-- Java regex over column names only when quoted-identifier support is disabled;
-- without the set below, Hive looks for a literal column named `(pin|dt)?+.+`.
set hive.support.quoted.identifiers=none;
select `(pin|dt)?+.+` from database_name.table_name;
调优参数
-- Hive tuning parameters. The original listing assigned several properties twice
-- with conflicting values (map/reduce memory, hive.exec.parallel,
-- hive.map.aggr.hash.percentmemory); since the last `set` wins in the hive CLI,
-- the duplicates were collapsed to the value that actually took effect.

-- Run independent query stages in parallel.
set hive.exec.parallel=true;
set hive.exec.parallel.thread.number=8;

-- Container sizes and JVM heaps.
-- NOTE(review): -Xmx equals the container size here, leaving no headroom for
-- non-heap memory; containers may be killed by the resource manager — confirm.
set mapreduce.map.memory.mb=10240;
set mapreduce.map.java.opts=-Xmx10240m;
set mapreduce.reduce.memory.mb=20480;
set mapreduce.reduce.java.opts=-Xmx20480m;

-- Lift the split-metainfo cap for jobs with very many input splits.
set mapreduce.jobtracker.split.metainfo.maxsize=-1;

-- Small split sizes (~1 MB) to increase mapper parallelism.
set mapred.max.split.size=1000000;
set mapred.min.split.size.per.node=1000000;
set mapred.min.split.size.per.rack=1000000;
set mapreduce.input.fileinputformat.split.maxsize=1000000;

-- Compress intermediate and final output.
set mapred.compress.map.output=true;
set mapred.output.compress=true;
set hive.exec.compress.output=true;

-- Answer metadata-only queries (e.g. count(*)) from table statistics.
set hive.compute.query.using.stats=true;

-- Skew-join mitigation: keys with > 500k rows get dedicated map-join tasks.
set hive.optimize.skewjoin=true;
set hive.skewjoin.key=500000;
set hive.skewjoin.mapjoin.map.tasks=10000;
set hive.skewjoin.mapjoin.min.split=33554432;
set hive.auto.convert.join=false;

-- Map-side aggregation hash table memory fraction (last-set value kept).
set hive.map.aggr.hash.percentmemory=0.1;

-- Sort rows by partition columns before dynamic-partition insert.
set hive.optimize.sort.dynamic.partition=true;
UDF
python UDF模板
warning: UDF中不能有中文注释
# -*- coding:utf-8 -*-
# Hive TRANSFORM streaming UDF template: reads tab-separated rows from stdin,
# writes tab-separated rows to stdout. Per the original author's warning, the
# script must contain no non-ASCII comments, hence English only.
#
# Fixes to the scraped listing: indentation restored (the blog flattened all
# blocks), and the field separator corrected from the escaped literal "\\t"
# (backslash + t, which never matches Hive's tab-delimited input) to a real tab.
import sys

# First error encountered; deliberately never overwritten by later errors.
error_msg = None

try:
    import numpy as np  # noqa: F401 -- confirm numpy exists on the Hive workers
except Exception as e:
    error_msg = "import error: " + str(e)

try:
    # Place helper/function definitions here.
    pass
except Exception as e:
    error_msg = "define error: " + str(e) if error_msg is None else error_msg

for line in sys.stdin:
    try:
        # Hive feeds TRANSFORM input as tab-separated fields.
        items = line.strip().split("\t")
        print("\t".join(items))
    except Exception as e:
        error_msg = "process error: " + str(e) if error_msg is None else error_msg

if error_msg:
    # Emit the error as a final stdout row so it surfaces in the result table
    # instead of silently clobbering the normal output (original design intent).
    print(error_msg)
python UDF调用模板
-- Rebuild table_name by streaming origin_table_name through a Python UDF.
use database_name;
-- Ship the script to the distributed cache; without ADD FILE, `using 'python udf.py'`
-- fails on worker nodes because the script only exists on the client machine.
add file udf.py;
drop table if exists table_name;
create table table_name as
select
    transform(select_cols)
    using 'python udf.py'
    as (new_cols)
from origin_table_name;
以上是关于Hive常用指令集锦的主要内容,如果未能解决你的问题,请参考以下文章