hive中文乱码-解析json
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了hive中文乱码-解析json相关的知识,希望对你有一定的参考价值。
--最近在做手机型号维度分析统计中,手机型号出现中文乱码的现象,查找的最终的原因的,在json解析的过程中,出现乱码,定位是分隔符的问题 ‘||‘
解决思路:用split函数的(||)切割json和ip,最终没有乱码的现象,但是问题解决了,不知是什么原因,估计是其中类的原因
select phone_mode_name,phone_brand_name from intdata.kesheng_sdk_device where phone_mode_name like ‘%?%‘ or phone_brand_name and src_file_day = 20170124 like ‘%?%‘ limit 3 ;
Total MapReduce CPU Time Spent: 5 minutes 2 seconds 850 msec
OK
?? GT-i9220 ??
?? GT-S5830 ??
?? GT-S5830 ??
Time taken: 25.528 seconds, Fetched: 3 row(s)
--create table
CREATE TABLE intdata.kesheng_sdk_device(
device_id string,
imei string,
idfa string,
idfv string,
imsi string,
sim string,
udid string,
os_type_code string,
macId string,
app_ver_code string,
phone_mode_name string,
phone_brand_name string,
phone_number string,
sreen_width string,
sreen_height string,
sreen_density string,
app_key string,
app_pkg_name string,
app_channel_id string,
user_id string,
language_code string,
upload_unix_time string,
os_ver_code string,
sdk_ver_code string,
is_BTB_flag string,
back_cnt bigint,
install_id string
)
partitioned by(src_file_day string)
clustered by(device_id) into 128 buckets;
---
add jar /opt/cloudera/parcels/CDH/lib/hive/lib/hive-contrib.jar;
set mapreduce.job.name=kesheng_sdk_device_${EXTRACT_DATE};
set hive.enforce.bucketing=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.dynamic.partition=true;
INSERT OVERWRITE TABLE intdata.kesheng_sdk_device PARTITION(src_file_day)
SELECT
concat_ws(‘_‘, nvl(install_id, ‘‘), nvl(COALESCE(imei, idfa), ‘‘)) device_id,
imei,
idfa,
idfv,
imsi,
sim,
udid,
os_type,
macId,
app_ver_code,
phone_mode_name,
phone_brand_name,
phone_number,
sreen_width,
sreen_height,
sreen_density,
app_key,
app_pkg_name,
app_channel_id,
user_id,
language_code,
upload_unix_time,
os_ver_code,
sdk_ver_code,
is_BTB_flag,
back_cnt,
install_id,
src_file_day
FROM (SELECT
v.imei,
v.idfa,
v.idfv,
v.imsi,
v.sim,
v.udid,
v.os as os_type,
v.macId,
v.appVersion as app_ver_code,
v.phoneMode as phone_mode_name,
v.phoneBrand as phone_brand_name,
v.phoneNumber as phone_number,
v.sreenWidth as sreen_width,
v.sreenHeight as sreen_height,
v.sreenDensity as sreen_density,
v.appkey as app_key,
v.apppkg as app_pkg_name,
v.appchannel as app_channel_id,
v.userid as user_id,
v.language as language_code,
cast(v.uploadTs as bigint) as upload_unix_time,
v.osversion as os_ver_code,
v.sdkversion as sdk_ver_code,
v.isBTB as is_BTB_flag,
cast(v.backCount as bigint) as back_cnt,
v.installationID as install_id,
v.extract_date_label as src_file_day,
row_number() over(partition by v.installationID, v.imei, v.idfa order by v.uploadTs desc) row_num
FROM ods.kesheng_sdk_deviceinfo_v v
WHERE v.extract_date_label = ‘${EXTRACT_DATE}‘ and
(
(v.os = ‘AD‘ and (v.installationID is not null or v.imei is not null))
or (v.os = ‘ios‘ and (v.installationID is not null or v.idfa is not null))
)
) x
WHERE row_num = 1;
---
CREATE OR REPLACE VIEW ods.kesheng_sdk_deviceinfo_v
AS
SELECT concat(s.INPUT__FILE__NAME, ‘:‘, s.BLOCK__OFFSET__INSIDE__FILE) as rowkey,
d.os,
d.imei,
d.imsi,
d.idfa,
d.idfv,
d.sim,
d.macId,
d.appVersion,
d.phoneMode,
d.phoneBrand,
d.phoneNumber,
d.sreenWidth,
d.sreenHeight,
d.sreenDensity,
d.appkey,
d.apppkg,
d.appchannel,
d.userid,
d.language,
d.isBTB,
d.backCount,
d.uploadTs,
d.clientId,
d.sessionId,
d.osversion,
d.sdkversion,
d.ints,
d.networktype,
d.udid,
d.installationID,
s.client_ip,
s.extract_date_label,
s.extract_hour_label
FROM ods.kesheng_sdk_json_ex_v s
LATERAL VIEW json_tuple(s.json, ‘deviceInfo‘) p as deviceInfo
LATERAL VIEW json_tuple(p.deviceInfo, ‘os‘, ‘imei‘, ‘imsi‘, ‘idfa‘, ‘idfv‘, ‘sim‘, ‘macId‘, ‘appVersion‘, ‘phoneMode‘, ‘phoneBrand‘, ‘phoneNumber‘, ‘sreenWidth‘, ‘sreenHeight‘, ‘sreenDensity‘, ‘appkey‘, ‘apppkg‘, ‘appchannel‘, ‘userid‘, ‘language‘, ‘isBTB‘, ‘backCount‘, ‘uploadTs‘, ‘clientId‘, ‘sessionId‘, ‘osversion‘, ‘sdkversion‘, ‘ints‘, ‘networktype‘, ‘udid‘, ‘installationID‘) d AS os, imei, imsi, idfa, idfv, sim, macId, appVersion, phoneMode, phoneBrand, phoneNumber, sreenWidth, sreenHeight, sreenDensity, appkey, apppkg, appchannel, userid, language, isBTB, backCount, uploadTs, clientId, sessionId, osversion, sdkversion, ints, networktype, udid, installationID
WHERE p.deviceInfo IS NOT NULL;
----
create or replace view ods.kesheng_sdk_json_ex_v
as
select line[0] json, line[1] client_ip
,src_file_day extract_date_label
,src_file_hour extract_hour_label
,input_file_name ,block_offset_inside_file
from (select split(t1.line,‘\\|\\|‘) line
,src_file_day,src_file_hour
,INPUT__FILE__NAME input_file_name ,BLOCK__OFFSET__INSIDE__FILE block_offset_inside_file
from ods.kesheng_sdk_raw_ex t1
) t2
注意:
分割字符串函数: split
语法: split(string str, stringpat)
返回值: array
说明:按照pat字符串分割str,会返回分割后的字符串数组
举例:
hive> select split(‘abtcdtef‘,‘t‘) from lxw_dual;
["ab","cd","ef"]
drop table ods.kesheng_sdk_raw_ex;
create external table ods.kesheng_sdk_raw_ex (
line string
)
partitioned by (src_file_day string, src_file_hour string)
location ‘/user/hadoop/ods/kesheng/‘;
/* --------------------
-- 补分区
import datetime
beg_date = datetime.datetime.strptime(‘20170101‘,‘%Y%m%d‘)
end_date = datetime.datetime.strptime(‘20170123‘,‘%Y%m%d‘)
fmt_add_partition = "alter table ods.kesheng_sdk_raw_ex add if not exists partition(src_file_day=‘{0}‘, src_file_hour=‘00‘) location ‘{0}/00‘;"
‘‘‘
while beg_date < end_date:
beg_date = beg_date + datetime.timedelta(days=1)
print fmt_add_partition.format(datetime.datetime.strftime(beg_date, ‘%Y%m%d‘))
beg_date = datetime.datetime.strptime(‘20160811‘,‘%Y%m%d‘)
end_date = datetime.datetime.strptime(‘20161109‘,‘%Y%m%d‘)
‘‘‘
fmt_add_partition = "alter table ods.kesheng_sdk_raw_ex add if not exists partition(src_file_day=‘{0}‘, src_file_hour=‘{1}‘) location ‘{0}/{1}‘;"
while beg_date < end_date:
beg_date = beg_date + datetime.timedelta(days=1)
for i in xrange(24):
print fmt_add_partition.format(datetime.datetime.strftime(beg_date, ‘%Y%m%d‘), str(i).rjust(2,‘0‘))
*-----------------------------/
以上是关于hive中文乱码-解析json的主要内容,如果未能解决你的问题,请参考以下文章