把mongoDB数据导入hive
Posted 上官沐雪
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了把mongoDB数据导入hive相关的知识,希望对你有一定的参考价值。
hive映射mongoDB表
mongodb基本脚本
// Create the collection that will be mapped into Hive.
db.createCollection("company_info_mapping");

// Insert one sample document so the mapping can be checked from the Hive side.
// hid and uid are stored as strings rather than numbers.
db.getCollection("company_info_mapping").insert({
    companyName: "test",
    licenseNumber: "test",
    socialCreditCode: "test",
    hid: "7752395865026566931",
    uid: "1396656205685932577"
});
hive表映射mongodb数据表
-- Register the mongo-hadoop connector jars and the MongoDB Java driver for the
-- current Hive session (the jars can be downloaded from the accompanying resources).
-- They must be added before any table backed by MongoStorageHandler is created or queried.
ADD JAR /var/lib/hadoop-hdfs/bin/hive_mongoDB/mongo-hadoop-core-2.0.2.jar;
ADD JAR /var/lib/hadoop-hdfs/bin/hive_mongoDB/mongo-hadoop-hive-2.0.2.jar;
ADD JAR /var/lib/hadoop-hdfs/bin/hive_mongoDB/mongo-java-driver-3.12.8.jar;
-- External Hive table backed by a MongoDB collection through the mongo-hadoop
-- storage handler. EXTERNAL means dropping the Hive table only removes the
-- metastore entry.
CREATE EXTERNAL TABLE `mongodb_patent2020` (
    `ZhuanLiXinXiInfo` string COMMENT 'from deserializer'
)
ROW FORMAT SERDE 'com.mongodb.hadoop.hive.BSONSerDe'
STORED BY 'com.mongodb.hadoop.hive.MongoStorageHandler'
WITH SERDEPROPERTIES (
    -- mongo.columns.mapping: JSON object mapping Hive column names to MongoDB field names.
    'mongo.columns.mapping'='{"ZhuanLiXinXiInfo":"ZhuanLiXinXiInfo"}',
    'serialization.format'='1'
)
TBLPROPERTIES (
    -- mongo.uri: address of the MongoDB data to read.
    'mongo.uri'='mongodb://172.16.98.159:21000/patent.patent2020'
);
## mongodb的地址
mongo.uri
## mongodb表映射的字段
mongo.columns.mapping
-- If queries against the MongoDB-backed table fail, run the following first.
-- Per the mongo-hadoop configuration reference, setting this to false skips
-- input-split calculation so the collection is read as a single split.
SET mongo.input.split.create_input_splits=false;
开始把mongodb数据导出
-- Export step: rewrite hive_patent2020_pre_name with one output row per
-- applicant company found in hive_patent2020_pre.applicantname.
-- NOTE(review): applicantname is presumably a bracketed, comma-separated list of
-- double-quoted names (e.g. ["A","B"]) - confirm against the source data.
-- Columns are matched to the target table by position, so keep this order.
INSERT OVERWRITE TABLE hive_patent2020_pre_name
SELECT
    -- Row key: the second-to-last character of _id prepended to _id
    -- (presumably a salt prefix; verify with whoever consumes the key).
    CONCAT(SUBSTR(`_id`, LENGTH(`_id`) - 1, 1), `_id`) AS key,
    `_id` AS id,
    patentname,
    patenttype,
    applicationpublishtime,
    applicationnum,
    applicationtime,
    publishnum,
    applicantname,
    address,
    agency,
    agent,
    abstracts,
    status,
    type,
    TRIM(tmp.companyname) AS companyname
FROM hive_patent2020_pre
-- Steps applied to applicantname, innermost first:
--   1. SUBSTR(..., 2, LENGTH - 2) drops the first and last character, then TRIM.
--   2. Full-width parentheses are normalised to ASCII '(' and ')'. The original
--      replaced '(' with '(' and ')' with ')', which did nothing.
--   3. Double quotes are removed.
--   4. The remainder is split on ',' and exploded to one row per name.
-- Plain LATERAL VIEW (not OUTER) drops source rows whose split result is NULL or empty.
LATERAL VIEW EXPLODE(
    SPLIT(
        REPLACE(
            REPLACE(
                REPLACE(
                    TRIM(SUBSTR(applicantname, 2, LENGTH(applicantname) - 2)),
                    '（', '('
                ),
                '）', ')'
            ),
            '"', ''
        ),
        ','
    )
) tmp AS companyname;
注意事项:如果查询时报出与 split(输入分片)相关的错误,可设置如下参数:
-- Workaround for split-related errors when reading the MongoDB-backed table.
SET mongo.input.split.create_input_splits=false;
以上是关于把mongoDB数据导入hive的主要内容,如果未能解决你的问题,请参考以下文章