ydb数据去重

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了ydb数据去重相关的知识,希望对你有一定的参考价值。

1.创建ydb导出表vehiclepass_txt_export
create external table vehiclepass_txt_export(
rowKey string, regionID string, cityID string, equipmentCode string, directionCode string,laneCode string,inOutTownFlag string,tollCode string,communityID string,tollTypeCode string,dataCompanyCode string,vehiclePlateSrc string,speed int,time string,url string,year string,month string,day string,hour string,week string,dataClassify string,vehiclePlate string,plateColor string,vehicleColor string,brand string,childBrand string,model string,species string,manufacturer string,headOrTail string,brandYear string,reliability int,vehicleTypeCode string,vehiclePosition string,firstTimeInCity int,firstTimeInCounty int,firstTimeInCommunity int,iword string,ivehiclePlate string,analysisData string
) PARTITIONED BY ( `month1` string)
location ‘/data/mr/vehiclepass_txt_export‘;
2.将ydb表vehiclepass分区为201603的数据导出到外部表vehiclepass_export_txt
insert overwrite table vehiclepass_txt_export partition(month1=‘201604‘)
select
r[0] as rowKey,
r[1] as regionID,
r[2] as cityID,
r[3] as equipmentCode,
r[4] as directionCode,
r[5] as laneCode,
r[6] as inOutTownFlag,
r[7] as tollCode,
r[8] as communityID,
r[9] as tollTypeCode,
r[10] as dataCompanyCode,
r[11] as vehiclePlateSrc,
r[12] as speed,
r[13] as time,
r[14] as url,
r[15] as year,
r[16] as month,
r[17] as day,
r[18] as hour,
r[19] as week,
r[20] as dataClassify,
r[21] as vehiclePlate,
r[22] as plateColor,
r[23] as vehicleColor,
r[24] as brand,
r[25] as childBrand,
r[26] as model,
r[27] as species,
r[28] as manufacturer,
r[29] as headOrTail,
r[30] as brandYear,
r[31] as reliability,
r[32] as vehicleTypeCode,
r[33] as vehiclePosition,
r[34] as firstTimeInCity,
r[35] as firstTimeInCounty,
r[36] as firstTimeInCommunity,
r[37] as iword,
r[38] as ivehiclePlate,
r[39] as analysisData
from ydb where YSQL(‘from‘,‘select rowKey,regionID,cityID,equipmentCode,directionCode,laneCode,inOutTownFlag,tollCode,communityID,tollTypeCode,dataCompanyCode,vehiclePlateSrc,speed,time,url,year,month,day,hour,week,dataClassify,vehiclePlate,plateColor,vehicleColor,brand,childBrand,model,species,manufacturer,headOrTail,brandYear,reliability,vehicleTypeCode,vehiclePosition,firstTimeInCity,firstTimeInCounty,firstTimeInCommunity,iword,ivehiclePlate,analysisData from vehiclepass where ydbpartion = \‘201604\‘ ‘,‘segment‘);

###删除导出表指定分区数据ALTER TABLE vehiclepass_txt_export DROP IF EXISTS PARTITION (month1=‘201603‘);
3.将表vehiclepass_export_txt中201603的数据去重保存到hdfs
/hadoop/bin/hadoop jar DereplicationYunnan.jar hdfs://10.166.112.190:9000/data/mr/vehiclepass_txt_export/month1=201603 hdfs://10.166.112.190:9000/data/mr/vehiclepass/201603
//将去重后的数据进行备份
hadoop fs -cp -f /data/mr/vehiclepass/201603 /data/mr/vehiclepass_bak
4.将去重后的数据导入hive表vehiclepass_hive(同时会自动将/data/mr/vehiclepass/201603/part-r-00000去掉)
load data inpath ‘/data/mr/vehiclepass/201603/part-r-00000‘ into table vehiclepass_hive partition(month1=201603);
5.清除ydb对应分区的数据
***************************************************
select count(r[0]) as cnt from
ydb where YSQL(‘from‘,‘select count(*) from vehiclepass where ydbpartion=\‘201603\‘ and ydbkv=\‘ydb.truncate:true\‘ ‘,‘segment‘)
***************************************************
http://hwits-master:9999/droppartion?tablename=vehiclepass&partions=201603
http://hwits-master:9999/droppartion?tablename=vehiclepass&partions=201603&isforce=true

6.将hive表vhiclepass_hive的数据导入ydb表vehiclepass
###insert into table vehiclepass select *,‘201603‘,‘‘ from vehiclepass_hive;
###将表的字段按列名对应写
INSERT INTO TABLE vehiclepass SELECT rowKey,regionID,
cityID,
equipmentCode,
directionCode,
laneCode,
inOutTownFlag,
tollCode,
communityID,
tollTypeCode,
dataCompanyCode,
vehiclePlateSrc,
speed,
time,
url,
YEAR,
MONTH,
DAY,
HOUR,
WEEK,
dataClassify,
vehiclePlate,
plateColor,
vehicleColor,
brand,
childBrand,
model,
species,
manufacturer,
headOrTail,
brandYear,
reliability,
vehicleTypeCode,
vehiclePosition,
firstTimeInCity,
firstTimeInCounty,
firstTimeInCommunity,
iword,
ivehiclePlate,
analysisData,
‘201603‘,
‘‘
FROM
vehiclepass_hive;
#######################################################################
ALTER TABLE vehiclepass_hive DROP IF EXISTS PARTITION (month1=‘201603‘);
确认:
select count(*) from vehiclepass_hive;

#####################
vehiclepass_hive

CREATE TABLE vehiclepass_hive
(
rowKey string,
regionID string,
cityID string,
equipmentCode string,
directionCode string,
laneCode string,
inOutTownFlag string,
tollCode string,
communityID string,
tollTypeCode string,
dataCompanyCode string,
vehiclePlateSrc string,
speed int,
time string,
url string,
year string,
month string,
day string,
hour string,
week string,
dataClassify string,
vehiclePlate string,
plateColor string,
vehicleColor string,
brand string,
childBrand string,
model string,
species string,
manufacturer string,
headOrTail string,
brandYear string,
reliability int,
vehicleTypeCode string,
vehiclePosition string,
firstTimeInCity int,
firstTimeInCounty int,
firstTimeInCommunity int,
iword string,
ivehiclePlate string,
analysisData string
)
PARTITIONED BY(month1 STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ‘,‘ STORED AS TEXTFILE;

以上是关于ydb数据去重的主要内容,如果未能解决你的问题,请参考以下文章

python去重(汉字一样,里面的编码不一样)?

链表Linked List注意事项

数据库练习题(牛客网sql入门篇部分)

SQL快速入门

SQL快速入门

SQL快速入门