Importing MongoDB Data into Hive
Posted by 上官沐雪
Mapping a MongoDB collection in Hive
Basic MongoDB script
// Create a collection
db.createCollection("company_info_mapping");
// Insert a test document (the document must be wrapped in braces)
db.getCollection("company_info_mapping").insert({
    companyName: "test",
    licenseNumber: "test",
    socialCreditCode: "test",
    hid: "7752395865026566931",
    uid: "1396656205685932577"
});
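To confirm the document landed, a quick sanity query in the mongo shell (not part of the original script):

// Fetch the test document back
db.getCollection("company_info_mapping").find({ companyName: "test" }).pretty();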
Creating the Hive table that maps the MongoDB collection
-- Register the mongo-hadoop jars (downloadable from the accompanying resources)
add jar /var/lib/hadoop-hdfs/bin/hive_mongoDB/mongo-hadoop-core-2.0.2.jar;
add jar /var/lib/hadoop-hdfs/bin/hive_mongoDB/mongo-hadoop-hive-2.0.2.jar;
add jar /var/lib/hadoop-hdfs/bin/hive_mongoDB/mongo-java-driver-3.12.8.jar;
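To avoid re-running add jar in every session, the same jars can be registered permanently through hive.aux.jars.path in hive-site.xml; the file path below simply reuses the directory above and should be adjusted to wherever the jars actually live:

<property>
  <name>hive.aux.jars.path</name>
  <value>file:///var/lib/hadoop-hdfs/bin/hive_mongoDB/mongo-hadoop-core-2.0.2.jar,file:///var/lib/hadoop-hdfs/bin/hive_mongoDB/mongo-hadoop-hive-2.0.2.jar,file:///var/lib/hadoop-hdfs/bin/hive_mongoDB/mongo-java-driver-3.12.8.jar</value>
</property>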
-- MongoStorageHandler supplies the BSON SerDe itself, so no ROW FORMAT SERDE clause
-- is needed; mongo.columns.mapping is a JSON object of the form {"hive_column":"mongo_field"}
CREATE EXTERNAL TABLE `mongodb_patent2020`(
  `ZhuanLiXinXiInfo` string)
STORED BY
  'com.mongodb.hadoop.hive.MongoStorageHandler'
WITH SERDEPROPERTIES (
  'mongo.columns.mapping'='{"ZhuanLiXinXiInfo":"ZhuanLiXinXiInfo"}')
TBLPROPERTIES (
  'mongo.uri'='mongodb://172.16.98.159:21000/patent.patent2020');
mongo.uri: the MongoDB connection address (host:port/database.collection)
mongo.columns.mapping: which MongoDB field each Hive column maps to
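If the collection has several fields you want as separate Hive columns, each one can be listed in the mapping. A sketch using the company_info_mapping collection from earlier (the database name patent here is an assumption):

-- hypothetical multi-column mapping over company_info_mapping
CREATE EXTERNAL TABLE `mongodb_company_info`(
  `id` string,
  `companyname` string,
  `licensenumber` string)
STORED BY
  'com.mongodb.hadoop.hive.MongoStorageHandler'
WITH SERDEPROPERTIES (
  'mongo.columns.mapping'='{"id":"_id","companyname":"companyName","licensenumber":"licenseNumber"}')
TBLPROPERTIES (
  'mongo.uri'='mongodb://172.16.98.159:21000/patent.company_info_mapping');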
-- If queries against the mapped table fail, add the following setting
set mongo.input.split.create_input_splits=false;
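With the table in place, a quick sanity check that Hive can actually read the collection (assuming it already contains documents):

select `ZhuanLiXinXiInfo` from mongodb_patent2020 limit 5;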
Loading the MongoDB data into a Hive table
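The original post defines neither hive_patent2020_pre (a staging table presumably parsed out of mongodb_patent2020) nor the target hive_patent2020_pre_name, so the DDL below is a hypothetical sketch with every column assumed to be string:

-- hypothetical target table; all column types are assumptions
CREATE TABLE hive_patent2020_pre_name(
  `key` string,
  `id` string,
  `patentname` string,
  `patenttype` string,
  `applicationpublishtime` string,
  `applicationnum` string,
  `applicationtime` string,
  `publishnum` string,
  `applicantname` string,
  `address` string,
  `agency` string,
  `agent` string,
  `abstracts` string,
  `status` string,
  `type` string,
  `companyname` string);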
insert overwrite table hive_patent2020_pre_name
select
    -- prepend the second-to-last character of _id, presumably to spread row keys
    concat(substr(`_id`, length(`_id`) - 1, 1), `_id`) as key,
    `_id` as id,
    patentname,
    patenttype,
    applicationpublishtime,
    applicationnum,
    applicationtime,
    publishnum,
    applicantname,
    address,
    agency,
    agent,
    abstracts,
    status,
    type,
    trim(companyname)
from hive_patent2020_pre
-- applicantname arrives as a JSON-style list such as ["a","b"]: strip the outer
-- brackets, normalize full-width parentheses, drop the quotes, then split on
-- commas and explode each applicant into its own row
lateral view explode(
    split(
        replace(replace(replace(trim(substr(applicantname, 2, length(`applicantname`) - 2)), '(', '('), ')', ')'), '"', ''),
        ','
    )
) tmp as companyname;
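To see what the lateral view produces, here is a self-contained example run against a made-up literal (the sample has no parentheses, so the two paren replaces are omitted):

-- '["companyA","companyB"]' explodes into two rows: companyA and companyB
select tmp.companyname
from (select '["companyA","companyB"]' as applicantname) t
lateral view explode(
    split(replace(trim(substr(applicantname, 2, length(applicantname) - 2)), '"', ''), ',')
) tmp as companyname;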
Note: if the job fails with a split-related error, set the following parameter:
set mongo.input.split.create_input_splits=false;
When inserting data into the MongoDB-backed table, an error like the following may be thrown:
Error: java.io.IOException: java.lang.reflect.InvocationTargetException
    at org.apache.hadoop.hive.io.HiveIOExceptionHandlerChain.handleRecordReaderCreationException(HiveIOExceptionHandlerChain.java:97)
    at org.apache.hadoop.hive.io.HiveIOExceptionHandlerUtil.handleRecordReaderCreationException(HiveIOExceptionHandlerUtil.java:57)
    at org.apache.hadoop.hive.shims.HadoopShimsSecure$CombineFileRecordReader.initNextRecordReader(HadoopShimsSecure.java:271)
    at org.apache.hadoop.hive.shims.HadoopShimsSecure$CombineFileRecordReader.next(HadoopShimsSecure.java:144)
    at org.apache.hadoop.mapred.MapTask$TrackedRecordReader.moveToNext(MapTask.java:205)
    at org.apache.hadoop.mapred.MapTask$TrackedRecordReader.next(MapTask.java:191)
    at org.apache.hadoop.mapred.MapRunner.run(MapRunner.java:52)
    at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:465)
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:349)
    at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:174)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:422)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1875)
    at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:168)
Caused by: java.lang.reflect.InvocationTargetException
    at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
    at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
    at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
    at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
    at org.apache.hadoop.hive.shims.HadoopShimsSecure$CombineFileRecordReader.initNextRecordReader(HadoopShimsSecure.java:257)
    ... 11 more
Caused by: java.lang.IndexOutOfBoundsException: Index: 1, Size: 1
    at java.util.ArrayList.rangeCheck(ArrayList.java:657)
    at java.util.ArrayList.get(ArrayList.java:433)
    at org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport.getProjectedGroupFields(DataWritableReadSupport.java:116)
    at org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport.getSchemaByName(DataWritableReadSupport.java:176)
    at org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport.getRequestedSchemaForIndexAccess(DataWritableReadSupport.java:289)
    at org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport.init(DataWritableReadSupport.java:231)
    at org.apache.hadoop.hive.ql.io.parquet.ParquetRecordReaderBase.getSplit(ParquetRecordReaderBase.java:84)
    at org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper.<init>(ParquetRecordReaderWrapper.java:78)
    at org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper.<init>(ParquetRecordReaderWrapper.java:63)
    at org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat.getRecordReader(MapredParquetInputFormat.java:75)
    at org.apache.hadoop.hive.ql.io.CombineHiveRecordReader.<init>(CombineHiveRecordReader.java:68)
    ... 16 more
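Reading the trace from the innermost cause outward, the failure is in Hive's Parquet reader over the source table (DataWritableReadSupport indexing past the file schema), not in the MongoDB connector itself; this usually points at a mismatch between the table's current schema and the Parquet files on disk. Since the failing frames run through CombineFileRecordReader, one workaround worth trying is to fall back to the plain input format; this is my suggestion, not something the original author verified:

-- bypass CombineHiveInputFormat in favor of plain HiveInputFormat
set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;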