Basic Hive Operations
Posted by wadeyu
The beeline client
- A client for HiveServer and HiveServer2; it talks to the server over the JDBC interface under the hood.
- The hive CLI is deprecated and will eventually be removed; beeline is its replacement.
- Embedded mode, which needs no separately started HiveServer2 (similar to the hive CLI):
$ $HIVE_HOME/bin/beeline -u jdbc:hive2://
- Remote mode, which connects to a separately started HiveServer2:
$ $HIVE_HOME/bin/beeline -u jdbc:hive2://$HS2_HOST:$HS2_PORT
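- If HiveServer2 has authentication enabled, beeline also accepts a username and password via its standard -n and -p flags. A minimal sketch; the host, port, and credentials below are placeholders:
$ $HIVE_HOME/bin/beeline -u jdbc:hive2://$HS2_HOST:$HS2_PORT -n hive_user -p hive_password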
Local mode
By default, queries are submitted as jobs to the Hadoop cluster. If the data volume is not very large, you can instead run the job on the machine where Hive itself runs, i.e. without submitting it to the cluster.
# Check the execution mode; the default is the cluster (yarn)
0: jdbc:hive2://> set mapreduce.framework.name;
+--------------------------------+--+
| set |
+--------------------------------+--+
| mapreduce.framework.name=yarn |
+--------------------------------+--+
# Switch to local execution mode
0: jdbc:hive2://> set mapreduce.framework.name=local;
0: jdbc:hive2://> set mapreduce.framework.name;
+---------------------------------+--+
| set |
+---------------------------------+--+
| mapreduce.framework.name=local |
+---------------------------------+--+
# Run locally (slow if the data volume is large)
0: jdbc:hive2://> select * from orders order by user_id desc limit 10;
+------------------+-----------------+------------------+----------------------+-------------------+---------------------------+--------------------------------+--+
| orders.order_id | orders.user_id | orders.eval_set | orders.order_number | orders.order_dow | orders.order_hour_of_day | orders.days_since_prior_order |
+------------------+-----------------+------------------+----------------------+-------------------+---------------------------+--------------------------------+--+
| 2558525 | 206209 | prior | 9 | 4 | 15 | 22.0 |
| 3154581 | 206209 | prior | 1 | 3 | 11 | |
| 2266710 | 206209 | prior | 10 | 5 | 18 | 29.0 |
| 3186442 | 206209 | prior | 6 | 0 | 16 | 3.0 |
| 1854736 | 206209 | prior | 11 | 4 | 10 | 30.0 |
| 272231 | 206209 | train | 14 | 6 | 14 | 30.0 |
| 2977660 | 206209 | prior | 13 | 1 | 12 | 7.0 |
| 626363 | 206209 | prior | 12 | 1 | 12 | 18.0 |
| 2307371 | 206209 | prior | 5 | 4 | 15 | 3.0 |
| 688306 | 206209 | prior | 4 | 1 | 10 | 30.0 |
+------------------+-----------------+------------------+----------------------+-------------------+---------------------------+--------------------------------+--+
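Instead of toggling the mode by hand, Hive can also pick local mode automatically for small jobs. A minimal sketch: hive.exec.mode.local.auto and the two threshold properties below are standard Hive settings, and the values shown are the usual defaults (worth verifying against your Hive version).
# Let Hive choose local mode whenever a job's input is small enough
0: jdbc:hive2://> set hive.exec.mode.local.auto=true;
# Upper bounds for auto local mode: total input bytes and number of input files
0: jdbc:hive2://> set hive.exec.mode.local.auto.inputbytes.max=134217728;
0: jdbc:hive2://> set hive.exec.mode.local.auto.input.files.max=4;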
Hive operations
- List the built-in functions
0: jdbc:hive2://> show functions;
# Show a function's description
0: jdbc:hive2://> desc function collect_list;
+--------------------------------------------------------------+--+
| tab_name |
+--------------------------------------------------------------+--+
| collect_list(x) - Returns a list of objects with duplicates |
+--------------------------------------------------------------+--+
0: jdbc:hive2://> desc function extended collect_list;
+--------------------------------------------------------------+--+
| tab_name |
+--------------------------------------------------------------+--+
| collect_list(x) - Returns a list of objects with duplicates |
+--------------------------------------------------------------+--+
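As a quick usage sketch (not part of the original session), collect_list aggregates a column into an array per group; run against the orders table from above, it would gather each user's order ids:
0: jdbc:hive2://> select user_id, collect_list(order_id) as order_ids from orders group by user_id limit 5;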
- List databases
0: jdbc:hive2://> show databases;
+----------------+--+
| database_name |
+----------------+--+
| badou |
| default |
+----------------+--+
- Switch to a database
0: jdbc:hive2://> use badou;
- List tables
0: jdbc:hive2://> show tables;
+-----------------------+--+
| tab_name |
+-----------------------+--+
| order_products_prior |
| orders |
+-----------------------+--+
0: jdbc:hive2://> show tables '.*s';
+-----------+--+
| tab_name |
+-----------+--+
| orders    |
+-----------+--+
- List the partitions of a table
SHOW PARTITIONS foo;
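SHOW PARTITIONS only applies to partitioned tables and lists nothing until at least one partition exists. A minimal sketch of creating one, borrowing the invites table from the Hive wiki examples used further down in this post:
CREATE TABLE invites (foo INT, bar STRING) PARTITIONED BY (ds STRING);
SHOW PARTITIONS invites;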
- Inspect a table's schema
0: jdbc:hive2://> desc orders;
+-------------------------+------------+-------------------------+--+
| col_name | data_type | comment |
+-------------------------+------------+-------------------------+--+
| order_id                | int        | order id                |
| user_id                 | int        | user id                 |
| eval_set                | string     | eval_set                |
| order_number            | int        | order_number            |
| order_dow               | int        | day of week             |
| order_hour_of_day       | string     | hour of day             |
| days_since_prior_order | string | days_since_prior_order |
+-------------------------+------------+-------------------------+--+
0: jdbc:hive2://> show create table orders;
+----------------------------------------------------------------------+--+
| createtab_stmt |
+----------------------------------------------------------------------+--+
| CREATE TABLE `orders`( |
| `order_id` int COMMENT 'order id',                                   |
| `user_id` int COMMENT 'user id',                                     |
| `eval_set` string COMMENT 'eval_set',                                |
| `order_number` int COMMENT 'order_number',                           |
| `order_dow` int COMMENT 'day of week',                               |
| `order_hour_of_day` string COMMENT 'hour of day',                    |
| `days_since_prior_order` string COMMENT 'days_since_prior_order') |
| COMMENT 'orders' |
| ROW FORMAT DELIMITED |
| FIELDS TERMINATED BY ',' |
| LINES TERMINATED BY '\n'                                             |
| STORED AS INPUTFORMAT |
| 'org.apache.hadoop.mapred.TextInputFormat' |
| OUTPUTFORMAT |
| 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' |
| LOCATION |
| 'hdfs://master:9000/user/hive/warehouse/badou.db/orders' |
| TBLPROPERTIES ( |
| 'COLUMN_STATS_ACCURATE'='false', |
| 'numFiles'='1', |
| 'numRows'='-1', |
| 'rawDataSize'='-1', |
| 'totalSize'='108968555', |
| 'transient_lastDdlTime'='1539096287') |
+----------------------------------------------------------------------+--+
- Create a table (managed/internal by default)
0: jdbc:hive2://> create table pokes(foo int, bar string);
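For contrast, an external table only registers metadata over files Hive does not own, so dropping it leaves the data in place. A hedged sketch; the table name and HDFS location are placeholders:
0: jdbc:hive2://> create external table pokes_ext(foo int, bar string) row format delimited fields terminated by ',' location '/data/pokes_ext';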
- Alter a table
# Rename a table
0: jdbc:hive2://> alter table pokes rename to pokes2;
# Add columns
0: jdbc:hive2://> alter table pokes2 add columns (new_col2 int comment 'add column1');
0: jdbc:hive2://> desc pokes2;
+-----------+------------+--------------+--+
| col_name | data_type | comment |
+-----------+------------+--------------+--+
| foo | int | |
| bar | string | |
| new_col1 | int | |
| new_col2 | int | add column1 |
+-----------+------------+--------------+--+
# Replace all columns
0: jdbc:hive2://> alter table pokes2 replace columns (foo1 int comment 'new column1', bar2 string comment 'new column2');
0: jdbc:hive2://> desc pokes2;
+-----------+------------+--------------+--+
| col_name | data_type | comment |
+-----------+------------+--------------+--+
| foo1 | int | new column1 |
| bar2 | string | new column2 |
+-----------+------------+--------------+--+
# Change a column's type
ALTER TABLE foo CHANGE COLUMN dec_column_name dec_column_name DECIMAL(38,18);
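# CHANGE COLUMN can also rename a column. An illustrative sketch against the
# pokes2 columns from above (this is a metadata-only change; the data files are untouched):
0: jdbc:hive2://> alter table pokes2 change column bar2 bar3 string comment 'renamed column';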
- Drop a table (for a managed table, the data is deleted along with it)
0: jdbc:hive2://> drop table pokes2;
0: jdbc:hive2://> show tables;
+-----------------------+--+
| tab_name |
+-----------------------+--+
| order_products_prior |
| orders |
+-----------------------+--+
- Load data from the local filesystem
hive> LOAD DATA LOCAL INPATH './examples/files/kv1.txt' OVERWRITE INTO TABLE pokes;
- Load data from HDFS
hive> LOAD DATA INPATH '/examples/files/kv1.txt' OVERWRITE INTO TABLE pokes;
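Both forms can also target a specific partition; this variant comes from the Hive wiki examples and assumes the partitioned invites table sketched earlier:
hive> LOAD DATA LOCAL INPATH './examples/files/kv2.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-15');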
- SQL operations
# Select and filter
hive> SELECT a.foo FROM invites a WHERE a.ds='2008-08-15';
# Save query results to HDFS
INSERT OVERWRITE DIRECTORY '/tmp/hdfs_out' SELECT a.* FROM invites a WHERE a.ds='2008-08-15';
# Save query results to a local directory
hive> INSERT OVERWRITE LOCAL DIRECTORY '/tmp/local_out' SELECT a.* FROM pokes a;
# Save query results into another table
hive> INSERT OVERWRITE TABLE events SELECT a.* FROM profiles a;
# GROUP BY
hive> FROM invites a INSERT OVERWRITE TABLE events SELECT a.bar, count(*) WHERE a.foo > 0 GROUP BY a.bar;
# JOIN
hive> FROM pokes t1 JOIN invites t2 ON (t1.bar = t2.bar) INSERT OVERWRITE TABLE events SELECT t1.bar, t1.foo, t2.foo;
# MULTITABLE INSERT
FROM src
INSERT OVERWRITE TABLE dest1 SELECT src.* WHERE src.key < 100
INSERT OVERWRITE TABLE dest2 SELECT src.key, src.value WHERE src.key >= 100 and src.key < 200
INSERT OVERWRITE TABLE dest3 PARTITION(ds='2008-04-08', hr='12') SELECT src.key WHERE src.key >= 200 and src.key < 300
INSERT OVERWRITE LOCAL DIRECTORY '/tmp/dest4.out' SELECT src.value WHERE src.key >= 300;
# STREAMING
hive> FROM invites a INSERT OVERWRITE TABLE events SELECT TRANSFORM(a.foo, a.bar) AS (oof, rab) USING '/bin/cat' WHERE a.ds > '2008-08-09';
A worked Hive example
- Prepare the data source
[root@master ~]# wget http://files.grouplens.org/datasets/movielens/ml-100k.zip
[root@master ~]# unzip ml-100k.zip
Archive: ml-100k.zip
creating: ml-100k/
inflating: ml-100k/allbut.pl
inflating: ml-100k/mku.sh
inflating: ml-100k/README
inflating: ml-100k/u.data
inflating: ml-100k/u.genre
inflating: ml-100k/u.info
inflating: ml-100k/u.item
inflating: ml-100k/u.occupation
inflating: ml-100k/u.user
inflating: ml-100k/u1.base
inflating: ml-100k/u1.test
inflating: ml-100k/u2.base
inflating: ml-100k/u2.test
inflating: ml-100k/u3.base
inflating: ml-100k/u3.test
inflating: ml-100k/u4.base
inflating: ml-100k/u4.test
inflating: ml-100k/u5.base
inflating: ml-100k/u5.test
inflating: ml-100k/ua.base
inflating: ml-100k/ua.test
inflating: ml-100k/ub.base
inflating: ml-100k/ub.test
- Create the table
CREATE TABLE u_data (
userid INT,
movieid INT,
rating INT,
unixtime STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;
- Load the local data into Hive
0: jdbc:hive2://> load data local inpath "./u.data" overwrite into table u_data;
Loading data to table badou.u_data
Table badou.u_data stats: [numFiles=1, numRows=0, totalSize=1979173, rawDataSize=0]
OK
No rows affected (5.05 seconds)
- Count the total rows in the table
0: jdbc:hive2://> select count(*) from u_data;
Total MapReduce CPU Time Spent: 6 seconds 100 msec
OK
+---------+--+
| _c0 |
+---------+--+
| 100000 |
+---------+--+
1 row selected (78.877 seconds)
- Transform the data with Python and store the result in another table
# Create the new table
CREATE TABLE u_data_new (
userid INT,
movieid INT,
rating INT,
weekday INT)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;
# The Python transform script, weekday_mapper.py
#!/usr/bin/env python
import sys
from datetime import datetime

# Read tab-separated rows from stdin, convert the unix timestamp to a
# day-of-week number (1 = Monday ... 7 = Sunday), and emit tab-separated rows.
for line in sys.stdin:
    userid, movieid, rating, unixtime = line.strip().split('\t')
    weekday = datetime.fromtimestamp(float(unixtime)).isoweekday()
    print('\t'.join([userid, movieid, rating, str(weekday)]))
# Register the transform script as a resource in Hive
0: jdbc:hive2://> add file weekday_mapper.py;
Added resources: [weekday_mapper.py]
No rows affected (0.047 seconds)
# Run the transform
insert overwrite table u_data_new
select
transform(userid, movieid, rating, unixtime)
using 'python weekday_mapper.py'
as (userid, movieid, rating, weekday)
from u_data;
# Query the first 10 rows of the new table
0: jdbc:hive2://> select * from u_data_new limit 10;
+--------------------+---------------------+--------------------+---------------------+--+
| u_data_new.userid | u_data_new.movieid | u_data_new.rating | u_data_new.weekday |
+--------------------+---------------------+--------------------+---------------------+--+
| 196 | 242 | 3 | 4 |
| 186 | 302 | 3 | 7 |
| 22 | 377 | 1 | 5 |
| 244 | 51 | 2 | 4 |
| 166 | 346 | 1 | 1 |
| 298 | 474 | 4 | 3 |
| 115 | 265 | 2 | 4 |
| 253 | 465 | 5 | 6 |
| 305 | 451 | 3 | 7 |
| 6 | 86 | 3 | 4 |
+--------------------+---------------------+--------------------+---------------------+--+
# Totals per day of the week
0: jdbc:hive2://> select weekday, count(*) as tt from u_data_new group by weekday;
+----------+--------+--+
| weekday | tt |
+----------+--------+--+
| 1 | 12254 |
| 2 | 13579 |
| 3 | 14430 |
| 4 | 15114 |
| 5 | 14743 |
| 6 | 18229 |
| 7 | 11651 |
+----------+--------+--+
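To rank the days by volume, the same aggregate can simply be ordered (an illustrative follow-up query, not from the original session):
0: jdbc:hive2://> select weekday, count(*) as tt from u_data_new group by weekday order by tt desc;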
References
[0] Hive wiki - GettingStarted
https://cwiki.apache.org/confluence/display/Hive/GettingStarted
[1] Hive wiki - User FAQ
https://cwiki.apache.org/confluence/display/Hive/User+FAQ