Hive常用指令集锦
Posted Alex_996
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Hive常用指令集锦相关的知识,希望对你有一定的参考价值。
- 创建表
-- Recreate new_table_name as a full copy of origin_table_name (CTAS).
USE database_name;
DROP TABLE IF EXISTS new_table_name;
CREATE TABLE new_table_name AS
SELECT * FROM origin_table_name;
- 从Hive数据仓库下载数据到本地
# Run a query non-interactively and redirect the result (with a header row) to a local file.
# NOTE(review): the hive CLI emits tab-separated output by default, so despite the .csv
# extension this file is TSV — confirm what downstream consumers expect.
hive -e "set hive.cli.print.header=true;select * from database_name.table_name;" > ./data.csv
- 根据表字段值分类汇总
-- Normalize heterogeneous label encodings into a '1'/'0' flag;
-- any unrecognized value falls through to NULL.
CASE
    WHEN label IN ('G', '1', 1) THEN '1'
    WHEN label IN ('B', '0', 0) THEN '0'
    ELSE NULL
END AS new_label,
- 获取日期字符串月份
-- Take the 'YYYY-MM' prefix (first 7 chars); the cast guards against dt being a date type.
substr(cast(dt as string), 1, 7),
- 近六个月每月划分为4周
-- Bucket each day within the last six months into one of four "weeks" per month.
-- The day-of-month comes from the last two characters of dt (assumes 'YYYY-MM-DD'
-- format — TODO confirm); CASE evaluates in order, so the < 8 / < 15 / < 22 / < 32
-- thresholds form the exclusive day ranges 1-7, 8-14, 15-21, 22-31.
-- '$target_dt' is presumably an externally substituted 'YYYY-MM-DD' parameter — verify
-- against the calling script.
-- NOTE(review): concat('30', '', substr(dt, 3, 5)) produces a '30YY-MM' prefix; the
-- literal '30' looks like a garbled century prefix (probably '20', yielding 'YYYY-MM')
-- introduced when the post was scraped — confirm against the original query.
case
when dt >= concat(substr(add_months('$target_dt', -5), 1, 7), '-', '01') and cast(substr(dt, -2, 2) as int) < 8
then concat('30', '', concat(substr(dt, 3, 5), '-', '01'))
when dt >= concat(substr(add_months('$target_dt', -5), 1, 7), '-', '01') and cast(substr(dt, -2, 2) as int) < 15
then concat('30', '', concat(substr(dt, 3, 5), '-', '02'))
when dt >= concat(substr(add_months('$target_dt', -5), 1, 7), '-', '01') and cast(substr(dt, -2, 2) as int) < 22
then concat('30', '', concat(substr(dt, 3, 5), '-', '03'))
when dt >= concat(substr(add_months('$target_dt', -5), 1, 7), '-', '01') and cast(substr(dt, -2, 2) as int) < 32
then concat('30', '', concat(substr(dt, 3, 5), '-', '04'))
end,
- 取数并按照时间去重
-- Keep only the latest record per (id, apply_channel), ranked by apply_time.
-- Fixes two syntax errors from the original listing: a missing comma after the
-- outer `id`, and `id.` (a typo for `id,`) in the subquery.
select
    id,
    to_date(apply_time) as apply_time,
    apply_channel
from
    (select
        id,
        apply_time,
        apply_channel,
        -- Hive's distribute by + sort by acts as partition by + order by here:
        -- rn = 1 marks the most recent row within each (id, apply_channel) group.
        row_number() over (distribute by id, apply_channel sort by apply_time desc) as rn
    from
        database_name.user_table_name
    ) as t0
where
    t0.rn = 1
- 不select某几个字段
-- Select every column EXCEPT pin and dt. The backquoted pattern is treated as a
-- Java regex over column names only when quoted-identifier support is disabled;
-- without the set below, Hive looks for a literal column named `(pin|dt)?+.+`.
set hive.support.quoted.identifiers=none;
select `(pin|dt)?+.+` from database_name.table_name;
调优参数
-- Hive tuning parameters. The original listing assigned several properties twice
-- with conflicting values (map/reduce memory, hive.exec.parallel,
-- hive.map.aggr.hash.percentmemory); since the last `set` wins in the hive CLI,
-- the duplicates were collapsed to the value that actually took effect.

-- Run independent query stages in parallel.
set hive.exec.parallel=true;
set hive.exec.parallel.thread.number=8;

-- Container sizes and JVM heaps.
-- NOTE(review): -Xmx equals the container size here, leaving no headroom for
-- non-heap memory; containers may be killed by the resource manager — confirm.
set mapreduce.map.memory.mb=10240;
set mapreduce.map.java.opts=-Xmx10240m;
set mapreduce.reduce.memory.mb=20480;
set mapreduce.reduce.java.opts=-Xmx20480m;

-- Lift the split-metainfo cap for jobs with very many input splits.
set mapreduce.jobtracker.split.metainfo.maxsize=-1;

-- Small split sizes (~1 MB) to increase mapper parallelism.
set mapred.max.split.size=1000000;
set mapred.min.split.size.per.node=1000000;
set mapred.min.split.size.per.rack=1000000;
set mapreduce.input.fileinputformat.split.maxsize=1000000;

-- Compress intermediate and final output.
set mapred.compress.map.output=true;
set mapred.output.compress=true;
set hive.exec.compress.output=true;

-- Answer metadata-only queries (e.g. count(*)) from table statistics.
set hive.compute.query.using.stats=true;

-- Skew-join mitigation: keys with > 500k rows get dedicated map-join tasks.
set hive.optimize.skewjoin=true;
set hive.skewjoin.key=500000;
set hive.skewjoin.mapjoin.map.tasks=10000;
set hive.skewjoin.mapjoin.min.split=33554432;
set hive.auto.convert.join=false;

-- Map-side aggregation hash table memory fraction (last-set value kept).
set hive.map.aggr.hash.percentmemory=0.1;

-- Sort rows by partition columns before dynamic-partition insert.
set hive.optimize.sort.dynamic.partition=true;
UDF
python UDF模板
warning: UDF中不能有中文注释
# -*- coding:utf-8 -*-
# Hive TRANSFORM streaming UDF template: reads tab-separated rows from stdin,
# writes tab-separated rows to stdout. Per the original author's warning, the
# script must contain no non-ASCII comments, hence English only.
#
# Fixes to the scraped listing: indentation restored (the blog flattened all
# blocks), and the field separator corrected from the escaped literal "\\t"
# (backslash + t, which never matches Hive's tab-delimited input) to a real tab.
import sys

# First error encountered; deliberately never overwritten by later errors.
error_msg = None

try:
    import numpy as np  # noqa: F401 -- confirm numpy exists on the Hive workers
except Exception as e:
    error_msg = "import error: " + str(e)

try:
    # Place helper/function definitions here.
    pass
except Exception as e:
    error_msg = "define error: " + str(e) if error_msg is None else error_msg

for line in sys.stdin:
    try:
        # Hive feeds TRANSFORM input as tab-separated fields.
        items = line.strip().split("\t")
        print("\t".join(items))
    except Exception as e:
        error_msg = "process error: " + str(e) if error_msg is None else error_msg

if error_msg:
    # Emit the error as a final stdout row so it surfaces in the result table
    # instead of silently clobbering the normal output (original design intent).
    print(error_msg)
python UDF调用模板
-- Rebuild table_name by streaming origin_table_name through a Python UDF.
use database_name;
-- Ship the script to the distributed cache; without ADD FILE, `using 'python udf.py'`
-- fails on worker nodes because the script only exists on the client machine.
add file udf.py;
drop table if exists table_name;
create table table_name as
select
    transform(select_cols)
    using 'python udf.py'
    as (new_cols)
from origin_table_name;
以上是关于Hive常用指令集锦的主要内容,如果未能解决你的问题,请参考以下文章