Hive Java UDTF 错误:ArrayIndexOutofBounds:1

Posted

技术标签:

【中文标题】Hive Java UDTF 错误:ArrayIndexOutofBounds:1【英文标题】:Hive Java UDTF Error: ArrayIndexOutofBounds:1 【发布时间】:2017-07-31 08:52:10 【问题描述】:

当我在 Hive 上运行自己编写的 Java 函数时,总是出现 ArrayIndexOutOfBoundsException(数组下标越界)错误。

这是我正在使用的 Java 代码,我已将其打包成 Jar 文件以便在 Hive 中使用;调用这段 Java 代码的 Hive 脚本附在代码之后。

public class similarity_report extends GenericUDTF 

private PrimitiveObjectInspector stringOI = null;
@Override
public StructObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException

//if (args.length != 1) 
//
 // throw new UDFArgumentException("similarityReport() takes exactly one argument"); // 
if (args[0].getCategory() != ObjectInspector.Category.PRIMITIVE
&& ((PrimitiveObjectInspector) args[0]).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.STRING)
 throw new UDFArgumentException("similarityReport() takes a string as a parameter"); 
stringOI = (PrimitiveObjectInspector) args[0];
List<String> fieldNames = new ArrayList<String>(41);
List<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>(41);
fieldNames.add("NAME_x");
fieldNames.add("VOTER ID_x");
fieldNames.add("FATHERS' NAME_x");
fieldNames.add("PIN CODE_x");
fieldNames.add("AREA_x");
fieldNames.add("TEHSIL_x");
fieldNames.add("DISTRICT_x");
fieldNames.add("POLICE STATION_x");
fieldNames.add("AGE_x");
fieldNames.add("Y-O-B_x");
fieldNames.add("GENDER_x");
fieldNames.add("HOUSE NUMBER_x");
fieldNames.add("STREET ADDRESS_x");
fieldNames.add("UNIQUE ID_x");
fieldNames.add("EDIT MAX_x");
fieldNames.add("MATCH ID_x");
fieldNames.add("FAKE MAX_x");
fieldNames.add("NAME_y");
fieldNames.add("VOTER ID_y");
fieldNames.add("FATHERS' NAME_y");
fieldNames.add("PIN CODE_y");
fieldNames.add("AREA_y");
fieldNames.add("TEHSIL_y");
fieldNames.add("DISTRICT_y");
fieldNames.add("POLICE STATION_y");
fieldNames.add("AGE_y");
fieldNames.add("Y-O-B_y");
fieldNames.add("GENDER_y");
fieldNames.add("HOUSE NUMBER_y");
fieldNames.add("STREET ADDRESS_y");
fieldNames.add("UNIQUE ID_y");
fieldNames.add("EDIT MAX_y");
fieldNames.add("MATCH ID_y");
fieldNames.add("FAKE MAX_y");
fieldNames.add("NAME SCORE");
fieldNames.add("ADDRESS SCORE");
fieldNames.add("CITY MATCH");
fieldNames.add("ZIP MATCH");
fieldNames.add("RELATIVE NAME SCORE");
fieldNames.add("VOTER ID MATCH");
fieldNames.add("KEY");

fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);

。 . .

fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);

public ArrayList<Object[]> processInputRecord(String row)

ArrayList<Object[]> result = new ArrayList<Object[]>();
//ensure none of the fields are empty
String[] tokens = row.split("\t");

System.out.println(tokens.length);

根据建议我添加了上面的行^^

String Name_x = tokens[0];
String VoterID_x = tokens[1];
String FathersName_x = tokens[2];
String PinCode_x = tokens[3];
String Area_x = tokens[4];
String Tehsil_x = tokens[5];
String District_x = tokens[6];
String PoliceStation_x = tokens[7];
String Age_x = tokens[8];
String YOB_x = tokens[9];
String Gender_x = tokens[10];
String HouseNumber_x = tokens[11];
String StreetAddress_x = tokens[12];
String UniqueID_x = tokens[1];
String EditMax_x = tokens[14];
String MatchID_x = tokens[15];
String FakeMax_x = tokens[16];
String Name_y = tokens[17];
String VoterID_y = tokens[18];
String FathersName_y = tokens[19];
String PinCode_y = tokens[20];
String Area_y = tokens[21];
String Tehsil_y = tokens[22];
String District_y = tokens[23];
String PoliceStation_y = tokens[24];
String Age_y = tokens[25];
String YOB_y = tokens[26];
String Gender_y = tokens[27];
String HouseNumber_y = tokens[28];
String StreetAddress_y = tokens[29];
String UniqueID_y = tokens[18];
String EditMax_y = tokens[31];
String MatchID_y = tokens[32];
String FakeMax_y = tokens[33];
String NameScore = tokens[34];
String AddressScore = tokens[35];
String CityMatch = tokens[36];
String ZipMatch = tokens[37];
String RelativeNameScore = tokens[38];
String VoterIDMatch = tokens[39];
String Key = tokens[40];
String Address_x;
String Address_y;
String matchType = "";

if (matchType == "similar")

result.add(new Object[]
 Name_x, VoterID_x, FathersName_x, PinCode_x, Area_x, Tehsil_x, District_x, PoliceStation_x, Age_x, YOB_x, Gender_x, HouseNumber_x, StreetAddress_x, UniqueID_x, EditMax_x, MatchID_x, FakeMax_x, NameScore, AddressScore, CityMatch, ZipMatch, RelativeNameScore, VoterIDMatch, Key);
result.add(new Object[]  Name_y, VoterID_y, FathersName_y, PinCode_y, Area_y, Tehsil_y, District_y, PoliceStation_y, Age_y, YOB_y, Gender_y, HouseNumber_y, StreetAddress_y, UniqueID_y, EditMax_y, MatchID_y, FakeMax_y, NameScore, AddressScore, CityMatch, ZipMatch, RelativeNameScore, VoterIDMatch, Key);



else if (matchType == "identical")


result.add(new Object[]  Name_x, VoterID_x, FathersName_x, PinCode_x, Area_x, Tehsil_x, District_x, PoliceStation_x, Age_x, YOB_x, Gender_x, HouseNumber_x, StreetAddress_x, UniqueID_x, EditMax_x, MatchID_x, FakeMax_x, NameScore, AddressScore, CityMatch, ZipMatch, RelativeNameScore, VoterIDMatch, Key
);

else if (matchType == "different")

result.add(new Object[]
 Name_x, VoterID_x, FathersName_x, PinCode_x, Area_x, Tehsil_x, District_x, PoliceStation_x, Age_x, YOB_x, Gender_x, HouseNumber_x, StreetAddress_x, UniqueID_x, EditMax_x, MatchID_x, FakeMax_x, NameScore, AddressScore, CityMatch, ZipMatch, RelativeNameScore, VoterIDMatch, Key
);
result.add(new Object[]
 Name_y, VoterID_y, FathersName_y, PinCode_y, Area_y, Tehsil_y, District_y, PoliceStation_y, Age_y, YOB_y, Gender_y, HouseNumber_y, StreetAddress_y, UniqueID_y, EditMax_y, MatchID_y, FakeMax_y, NameScore, AddressScore, CityMatch, ZipMatch, RelativeNameScore, VoterIDMatch, Key
);

return result;

@Override
public void process(Object[] record) throws HiveException 

final String row = stringOI.getPrimitiveJavaObject(record[0]).toString();
ArrayList<Object[]> results = processInputRecord(row);
Iterator<Object[]> it = results.iterator();
while (it.hasNext())
 Object[] r = it.next(); forward(r); 

@Override
public void close() throws HiveException
 // do nothing 

这是在配置单元表上处理上述代码的配置单元代码:

-- Builds two copies of the voter file, joins them on name, and runs the
-- similarity_report UDTF (registered as fun3) over the joined rows.
set mapred.job.queue.name=buanlst;
CREATE DATABASE IF NOT EXISTS saihieldb;
USE saihieldb;

-- "_x" side: 13 columns loaded from the file, then 4 bookkeeping columns appended.
CREATE TABLE datafile_to_dedupe (name_x String, voterid_x String, fathersname_x String, pincode_x String, area_x String, tehsil_x String, district_x String, policestation_x String, age_x String, yob_x String, gender_x String, housenumber_x String, streetaddress_x String)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
LOAD DATA LOCAL INPATH '/idn/home/sbaks31/APRIORI_MUMBAI_SAMPLE_TAB_DELIMITED.txt' OVERWRITE INTO TABLE datafile_to_dedupe;
ALTER TABLE datafile_to_dedupe ADD COLUMNS (uniqueid_x String, editmax_x String, matchid_x String, fakemax_x String);

-- "_y" side: same file, same layout.
CREATE TABLE datafile_to_dedupe1 (name_y String, voterid_y String, fathersname_y String, pincode_y String, area_y String, tehsil_y String, district_y String, policestation_y String, age_y String, yob_y String, gender_y String, housenumber_y String, streetaddress_y String)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
LOAD DATA LOCAL INPATH '/idn/home/sbaks31/APRIORI_MUMBAI_SAMPLE_TAB_DELIMITED.txt' OVERWRITE INTO TABLE datafile_to_dedupe1;
-- Fixed: the "_y" bookkeeping columns belong on datafile_to_dedupe1. Adding them to
-- datafile_to_dedupe (as originally written) left the "_y" table with 13 columns and
-- shifted the column order of the SELECT * below relative to the layout of "crossed".
ALTER TABLE datafile_to_dedupe1 ADD COLUMNS (uniqueid_y String, editmax_y String, matchid_y String, fakemax_y String);

-- Joined pairs: 17 "_x" columns followed by 17 "_y" columns.
CREATE TABLE crossed (name_x String, voterid_x String, fathersname_x String, pincode_x String, area_x String, tehsil_x String, district_x String, policestation_x String, age_x String, yob_x String, gender_x String, housenumber_x String, streetaddress_x String, uniqueid_x String, editmax_x String, matchid_x String, fakemax_x String, name_y String, voterid_y String, fathersname_y String, pincode_y String, area_y String, tehsil_y String, district_y String, policestation_y String, age_y String, yob_y String, gender_y String, housenumber_y String, streetaddress_y String, uniqueid_y String, editmax_y String, matchid_y String, fakemax_y String)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
-- NOTE(review): this is written as CROSS JOIN but carries an ON condition, so it
-- presumably behaves as an equi-join on name - confirm that is the intent.
-- SELECT * relies on the column order of the two source tables matching "crossed".
INSERT OVERWRITE TABLE crossed SELECT * FROM saihieldb.datafile_to_dedupe CROSS JOIN saihieldb.datafile_to_dedupe1 on (datafile_to_dedupe.name_x = datafile_to_dedupe1.name_y);
-- The 7 score columns are appended after the insert, so they read as NULL for existing rows.
ALTER TABLE crossed ADD COLUMNS (namescore String, addressscore String, citymatch String, zipmatch String, relativenamescore String, voteridmatch String, Key String);

-- NOTE(review): a ".filepart" suffix usually marks an incomplete file transfer -
-- verify that this jar was uploaded completely.
add jar /idn/home/sbaks31/DedupeFinal1.jar.filepart;
create temporary function fun3 as 'com.similarity_report';
-- The function is invoked with 41 separate column arguments (not one tab-delimited string).
CREATE VIEW newview4 AS select fun3(name_x, voterid_x, fathersname_x, pincode_x, area_x, tehsil_x, district_x, policestation_x, age_x, yob_x, gender_x, housenumber_x, streetaddress_x, uniqueid_x, editmax_x, matchid_x, fakemax_x, name_y, voterid_y, fathersname_y, pincode_y, area_y, tehsil_y, district_y, policestation_y, age_y, yob_y, gender_y, housenumber_y, streetaddress_y, uniqueid_y, editmax_y, matchid_y, fakemax_y, namescore, addressscore, citymatch, zipmatch, relativenamescore, voteridmatch, Key) from saihieldb.crossed;
select * from newview4 limit 10;

^^ 错误就出现在上面这条查询上。请问问题出在哪里?

在打印 tokens 数组的长度之后,我发现它的长度只有 1。有人能解释一下为什么数组里只有 1 个值吗?

【问题讨论】:

你能把代码缩减到你认为出问题的那几行吗?现在的代码实在太多了。请阅读如何创建最小可复现示例(minimal reproducible example)。

@philantrovert 我已经进一步缩减了代码,但新的问题是 tokens 数组的长度只有 1,我不确定问题可能出在哪里。

可能是因为你在这里拆分的字符串并不是以制表符分隔的。

@philantrovert 可是,这个字符串不是从 Hive 表里读出来的吗?而 Hive 表的格式就是制表符分隔的。

【参考方案1】:

这行代码之后:

String[] tokens = row.split("\t");

写:

System.out.println(tokens.length);

并在该行设置断点进行调试,这样就能看到数组的实际长度。如果数组 tokens 的长度只有 5,那么访问 tokens[5]、tokens[6]……直到 tokens[40] 中的任何一个下标都会抛出异常。

【讨论】:

添加您建议的那一行后,打印出来的是 1,也就是说数组的长度只有 1。您觉得为什么会这样?谢谢!

以上是关于Hive Java UDTF 错误:ArrayIndexOutofBounds:1的主要内容,如果未能解决你的问题,请参考以下文章

Hive UDTF 不接受超过 2 列的输出

Hive 查询:有没有办法将 UDTF 与“cluster by”一起使用?

hive自定义UDTF函数,步骤讲解

HIVE自定义UDTF函数

Hive UDTF开发指南

hive_UDTF函数