CDH hive建表注释中文乱码问题解决
Posted 扫地增
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了CDH hive建表注释中文乱码问题解决相关的知识,希望对你有一定的参考价值。
1. 问题描述
笔者在入职新公司后,由于现司大数据起步较晚,数仓还在初级建设阶段,集群使用CDH 5.10.0搭建比较古老,在筹建数仓是发现hive本身不支持中文注释,出现乱码。
show create table equipment;
CREATE EXTERNAL TABLE `equipment`(
`id` bigint COMMENT '????',
`customer_id` bigint COMMENT '??id',
`eq_id` bigint COMMENT '设备id',
`create_time` bigint COMMENT '??????')
COMMENT '??????????'
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
'hdfs://cloud-11:8020/big-data/ods/xxx/xxx/xxx/equipment'
TBLPROPERTIES (
'COLUMN_STATS_ACCURATE'='false',
'numFiles'='0',
'numRows'='-1',
'orc.compress'='LZO',
'rawDataSize'='-1',
'totalSize'='0',
'transient_lastDdlTime'='1633659470')
DESC equipment;
id bigint ?????
customer_id bigint ??id
eq_id bigint ??id
create_time bigint ??????
2. 问题分析
出现这个问题的本身我需要考虑元数据的信息来自于哪里,也就是hive在执行 show create table xxx
和desc xxx
时hive读取的是哪里的数据,很明显这两个命令其实都是查询的数仓中表的schema,而类似于schema的都属于元数据,因此我们想到这可能是由于我们元数据库字符编码设置与我们hive使用的编码方式不一致造成,我们知道hive默认使用utf-8编码。由于笔者当前hive是使用的mysql作为元数据存储,于是我们查看了我们设置的元数据库的编码方式:
我们发现数据库元数据库使用的编码格式是utf8
,是没有问题的,于是我又查看了字段注释、表注释、分区注释、表名注释、视图注释、数据库名称注释相关的元数据表的信息:
CREATE TABLE `TABLE_PARAMS` (
`TBL_ID` bigint(20) NOT NULL,
`PARAM_KEY` varchar(256) CHARACTER SET latin1 COLLATE latin1_bin NOT NULL,
`PARAM_VALUE` varchar(4000) CHAR
ACTER SET latin1 COLLATE latin1_bin DEFAULT NULL,
PRIMARY KEY (`TBL_ID`,`PARAM_KEY`),
KEY `TABLE_PARAMS_N49` (`TBL_ID`),
CONSTRAINT `TABLE_PARAMS_FK1` FOREIGN KEY (`TBL_ID`) REFERENCES `TBLS` (`TBL_ID`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1
CREATE TABLE `PARTITION_PARAMS` (
`PART_ID` bigint(20) NOT NULL,
`PARAM_KEY` varchar(256) CHARACTER SET latin1 COLLATE latin1_bin NOT NULL,
`PARAM_VALUE` varchar(4000) CHARACTER SET latin1 COLLATE latin1_bin DEFAULT NULL,
PRIMARY KEY (`PART_ID`,`PARAM_KEY`),
KEY `PARTITION_PARAMS_N49` (`PART_ID`),
CONSTRAINT `PARTITION_PARAMS_FK1` FOREIGN KEY (`PART_ID`) REFERENCES `PARTITIONS` (`PART_ID`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1
CREATE TABLE `PARTITION_KEYS` (
`TBL_ID` bigint(20) NOT NULL,
`PKEY_COMMENT` varchar(4000) CHARACTER SET latin1 COLLATE latin1_bin DEFAULT NULL,
`PKEY_NAME` varchar(128) CHARACTER SET latin1 COLLATE latin1_bin NOT NULL,
`PKEY_TYPE` varchar(767) CHARACTER SET latin1 COLLATE latin1_bin NOT NULL,
`INTEGER_IDX` int(11) NOT NULL,
PRIMARY KEY (`TBL_ID`,`PKEY_NAME`),
KEY `PARTITION_KEYS_N49` (`TBL_ID`),
CONSTRAINT `PARTITION_KEYS_FK1` FOREIGN KEY (`TBL_ID`) REFERENCES `TBLS` (`TBL_ID`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1
CREATE TABLE `INDEX_PARAMS` (
`INDEX_ID` bigint(20) NOT NULL,
`PARAM_KEY` varchar(256) CHARACTER SET latin1 COLLATE latin1_bin NOT NULL,
`PARAM_VALUE` varchar(4000) CHARACTER SET latin1 COLLATE latin1_bin DEFAULT NULL,
PRIMARY KEY (`INDEX_ID`,`PARAM_KEY`),
KEY `INDEX_PARAMS_N49` (`INDEX_ID`),
CONSTRAINT `INDEX_PARAMS_FK1` FOREIGN KEY (`INDEX_ID`) REFERENCES `IDXS` (`INDEX_ID`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1
CREATE TABLE `TBLS` (
`TBL_ID` bigint(20) NOT NULL,
`CREATE_TIME` int(11) NOT NULL,
`DB_ID` bigint(20) DEFAULT NULL,
`LAST_ACCESS_TIME` int(11) NOT NULL,
`OWNER` varchar(767) CHARACTER SET latin1 COLLATE latin1_bin DEFAULT NULL,
`RETENTION` int(11) NOT NULL,
`SD_ID` bigint(20) DEFAULT NULL,
`TBL_NAME` varchar(128) CHARACTER SET latin1 COLLATE latin1_bin DEFAULT NULL,
`TBL_TYPE` varchar(128) CHARACTER SET latin1 COLLATE latin1_bin DEFAULT NULL,
`VIEW_EXPANDED_TEXT` mediumtext,
`VIEW_ORIGINAL_TEXT` mediumtext,
`LINK_TARGET_ID` bigint(20) DEFAULT NULL,
PRIMARY KEY (`TBL_ID`),
UNIQUE KEY `UNIQUETABLE` (`TBL_NAME`,`DB_ID`),
KEY `TBLS_N50` (`SD_ID`),
KEY `TBLS_N49` (`DB_ID`),
KEY `TBLS_N51` (`LINK_TARGET_ID`),
CONSTRAINT `TBLS_FK1` FOREIGN KEY (`SD_ID`) REFERENCES `SDS` (`SD_ID`),
CONSTRAINT `TBLS_FK2` FOREIGN KEY (`DB_ID`) REFERENCES `DBS` (`DB_ID`),
CONSTRAINT `TBLS_FK3` FOREIGN KEY (`LINK_TARGET_ID`) REFERENCES `TBLS` (`TBL_ID`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1
CREATE TABLE `DBS` (
`DB_ID` bigint(20) NOT NULL,
`DESC` varchar(4000) CHARACTER SET latin1 COLLATE latin1_bin DEFAULT NULL,
`DB_LOCATION_URI` varchar(4000) CHARACTER SET latin1 COLLATE latin1_bin NOT NULL,
`NAME` varchar(128) CHARACTER SET latin1 COLLATE latin1_bin DEFAULT NULL,
`OWNER_NAME` varchar(128) CHARACTER SET latin1 COLLATE latin1_bin DEFAULT NULL,
`OWNER_TYPE` varchar(10) CHARACTER SET latin1 COLLATE latin1_bin DEFAULT NULL,
PRIMARY KEY (`DB_ID`),
UNIQUE KEY `UNIQUE_DATABASE` (`NAME`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1
我们发现了问题所在,原来这些元数据表使用的是latin1
的编码方式造成了元数据字符编码方式不一致。排查到这里相信大家都已经有了比较好的解决思路。
解决方案:
通过以上分析,很明显我们尝试下改变这些元数据表的字符编码方式为utf8
。如下:
#修改数据库字符集
alter database metastore character set utf8;
#修改字段注释字符集
ALTER TABLE COLUMNS_V2 modify column COMMENT varchar(256) character set utf8;
#修改表注释字符集
ALTER TABLE TABLE_PARAMS modify column PARAM_VALUE varchar(40000) character set utf8;
#修改分区参数,支持分区建用中文表示
ALTER TABLE PARTITION_PARAMS modify column PARAM_VALUE varchar(40000) character set utf8;
ALTER TABLE PARTITION_KEYS modify column PKEY_COMMENT varchar(40000) character set utf8;
#修改表名注释,支持中文表示
ALTER TABLE INDEX_PARAMS modify column PARAM_VALUE varchar(4000) character set utf8;
#修改视图,支持视图中文
ALTER TABLE TBLS modify COLUMN VIEW_EXPANDED_TEXT mediumtext CHARACTER SET utf8;
ALTER TABLE TBLS modify COLUMN VIEW_ORIGINAL_TEXT mediumtext CHARACTER SET utf8;
#修改数据库名称注释
ALTER TABLE `DBS` CHANGE COLUMN `DESC` `DESC` VARCHAR(4000) CHARACTER SET 'utf8' NULL DEFAULT NULL ;
4. 效果
这里提供下笔者解决完成后的效果图。如下:
仅此提供一个问题的解决思路,希望对大家有帮助。
以上是关于CDH hive建表注释中文乱码问题解决的主要内容,如果未能解决你的问题,请参考以下文章