关于性能:使用存储过程进行数据转换
Posted
技术标签:
【中文标题】关于性能:使用存储过程进行数据转换【英文标题】:Reg Performance : Data Transformation using Stored procedure 【发布时间】:2020-06-01 04:10:26 【问题描述】:我尝试使用 XL 规格的仓库在 Snowflake 中转换仅 300 万(3M)条记录。转换规则附后。 错误记录写入错误表,成功记录写入清理表。 使用的仓库规格是 XL,处理时间为 1 小时 5 分钟。 与记录数量和转换规则的复杂度相比,所花费的时间似乎相当长。 谁能帮忙查看一下这个存储过程,并建议是否需要修改代码以提高性能? 谢谢
create or replace procedure clean_transform_table(UPLOAD_DATE_IN VARCHAR)
returns varchar not null
language javascript
as
$$
//-------------define function to check valid date format
/**
 * Checks whether a string is laid out like a date: four digits, a dash,
 * one or two digits, a dash, one or two digits (e.g. "2020-06-01", "2020-6-1").
 * Only the digit layout is tested; calendar validity (month 13, day 40) is not.
 *
 * @param {string} datevalue - Candidate date text; must be a non-null string.
 * @returns {Array|null} The match array when the layout fits, otherwise null.
 */
function isValidDate(datevalue) {
    // The quantifiers need their braces: without them `\d4` means
    // "one digit followed by a literal 4", which rejects real dates.
    var pattern = /^\d{4}-\d{1,2}-\d{1,2}$/;
    return datevalue.match(pattern);
}
//--------define function to check valid email
/**
 * Checks whether a string looks like an e-mail address ending in ".com".
 * The rule is intentionally loose: some non-whitespace text, "@", some
 * non-whitespace text, then ".com" at the very end of the string.
 *
 * @param {string} email - Candidate address; must be a non-null string.
 * @returns {Array|null} The match array when the rule is met, otherwise null.
 */
function isvalidEmail(email) {
    // No leading anchor, so text before the address is tolerated.
    var pattern = /\S+@\S+\.com$/;
    return email.match(pattern);
}
//------------define function to insert error records
/**
 * Writes the collected error tuples into error_records_log.
 *
 * Each element of errorDetails is expected to already be a complete SQL row
 * tuple such as "('1','42','T','COL','msg','2020-6-1','x')"; the tuples are
 * comma-joined behind "insert into error_records_log values".
 * An empty list executes nothing.
 *
 * @param {string[]} errorDetails - Pre-formatted SQL row tuples.
 * @returns {undefined}
 */
function insertErrorRecords(errorDetails) {
    // Rows are sent in batches so that one statement never carries an
    // unbounded VALUES list.
    // NOTE(review): Snowflake documents a per-statement row cap for the
    // VALUES clause (16,384 when this was written) - confirm against the
    // current INSERT documentation before raising the batch size.
    var MAX_ROWS_PER_INSERT = 10000;

    for (var start = 0; start < errorDetails.length; start += MAX_ROWS_PER_INSERT) {
        var batch = errorDetails.slice(start, start + MAX_ROWS_PER_INSERT);
        var cmd = "insert into error_records_log values" + batch.join(",");
        var stmt = snowflake.createStatement({
            sqlText: cmd
        });
        stmt.execute();
    }
}
//-------shared helpers used by the validation checks below
/**
 * Renders a value as a single-quoted SQL string literal.
 * null/undefined become the empty string; backslashes and single quotes are
 * escaped so that data such as O'Brien cannot terminate the literal early.
 *
 * @param {*} value - Value to embed.
 * @returns {string} Quoted, escaped literal.
 */
function toSqlLiteral(value) {
    var text = (value == null) ? "" : String(value);
    return "'" + text.replace(/\\/g, "\\\\").replace(/'/g, "''") + "'";
}

/**
 * Appends one error tuple to the shared errorDetails list.
 * Tuple layout: row_num, person_id, tableName, column name, message,
 * update_date, offending value. tableName, update_date and errorDetails are
 * read from the enclosing procedure scope.
 *
 * @param {string} row_num - Row number of the offending record.
 * @param {string} person_id - Person id of the offending record.
 * @param {string} columnName - Name of the column that failed validation.
 * @param {string} message - Human-readable failure reason.
 * @param {?string} value - The rejected value.
 * @returns {undefined}
 */
function logValidationError(row_num, person_id, columnName, message, value) {
    var fields = [row_num, person_id, tableName, columnName, message, update_date, value];
    errorDetails.push("(" + fields.map(toSqlLiteral).join(",") + ")");
}

//-------function to check valid birth date
/**
 * Flags a record whose birth date is null, blank, or not date-shaped
 * according to isValidDate.
 *
 * @param {string} row_num
 * @param {string} person_id
 * @param {?string} birth_date - Raw birth date text.
 * @param {number} recordStatus - Status accumulated from earlier checks.
 * @returns {number} 1 when this check fails, otherwise recordStatus unchanged.
 */
function birthDateCheck(row_num, person_id, birth_date, recordStatus) {
    if (birth_date == null || birth_date.trim() === '' || isValidDate(birth_date.trim()) == null) {
        logValidationError(row_num, person_id, birthDateVar, "Value is null or empty or invalid", birth_date);
        return 1;
    }
    return recordStatus;
}

//------function to check valid gender value
/**
 * Flags a record whose gender is null or, after trimming, not 'M' or 'F'.
 *
 * @param {string} row_num
 * @param {string} person_id
 * @param {?string} gender - Raw gender code.
 * @param {number} recordStatus - Status accumulated from earlier checks.
 * @returns {number} 1 when this check fails, otherwise recordStatus unchanged.
 */
function genderCheck(row_num, person_id, gender, recordStatus) {
    if (gender == null || !['M', 'F'].includes(gender.trim())) {
        logValidationError(row_num, person_id, genderVar, "Value is null or other in [M,F]", gender);
        return 1;
    }
    return recordStatus;
}

//--------function to check valid country value
/**
 * Flags a record whose country differs from its country of birth.
 *
 * @param {string} row_num
 * @param {string} person_id
 * @param {?string} country
 * @param {?string} country_of_Birth
 * @param {number} recordStatus - Status accumulated from earlier checks.
 * @returns {number} 1 when this check fails, otherwise recordStatus unchanged.
 */
function countryCheck(row_num, person_id, country, country_of_Birth, recordStatus) {
    // Loose comparison on purpose: two missing values (null/undefined)
    // count as equal, exactly as before.
    if (country != country_of_Birth) {
        logValidationError(row_num, person_id, countryVar, "Value is not same as country_of_Birth", country);
        return 1;
    }
    return recordStatus;
}

//-------function to check valid loan amount
/**
 * Flags a record whose loan amount is present and numerically negative.
 * Null, blank and non-numeric text are not flagged by this check.
 *
 * @param {string} row_num
 * @param {string} person_id
 * @param {?string} loan_amount - Raw loan amount text.
 * @param {number} recordStatus - Status accumulated from earlier checks.
 * @returns {number} 1 when this check fails, otherwise recordStatus unchanged.
 */
function loanCheck(row_num, person_id, loan_amount, recordStatus) {
    // Number() makes the string-to-number conversion explicit; non-numeric
    // text yields NaN, and NaN < 0 is false.
    if (loan_amount != null && loan_amount.trim() !== '' && Number(loan_amount.trim()) < 0) {
        logValidationError(row_num, person_id, loanVar, "Value is negative number", loan_amount);
        return 1;
    }
    return recordStatus;
}

//------function to validate email
/**
 * Flags a record whose e-mail is present but rejected by isvalidEmail.
 * A null e-mail is not flagged.
 *
 * @param {string} row_num
 * @param {string} person_id
 * @param {?string} email
 * @param {number} recordStatus - Status accumulated from earlier checks.
 * @returns {number} 1 when this check fails, otherwise recordStatus unchanged.
 */
function emailCheck(row_num, person_id, email, recordStatus) {
    if (email != null && isvalidEmail(email) == null) {
        logValidationError(row_num, person_id, emailVar, "value is invalid email", email);
        return 1;
    }
    return recordStatus;
}
//----------function extract all data, validate and trace errors
/**
 * Reads every SF_STRUCT_STAGE_RAW row for the requested upload date, runs the
 * five validation checks on it and records the row number of each failing
 * row in the shared errorRowNum list.
 *
 * The upload date comes from the procedure argument UPLOAD_DATE_IN and is
 * bound as :1. The result set is walked one row at a time; each check returns
 * 1 on failure or passes the running status through, so a row is recorded
 * once if any check fails.
 *
 * @returns {undefined}
 */
function validateAllDataTraceErrors() {
    var cmd = "select * from SF_STRUCT_STAGE_RAW where upload_date = to_date(:1,'YYYY-MM-DD')";
    var stmt = snowflake.createStatement({
        sqlText: cmd,
        binds: [UPLOAD_DATE_IN]
    });
    var resultSet = stmt.execute();

    //----loop thru all the data
    while (resultSet.next()) {
        var row_num = resultSet.getColumnValueAsString('ROW_NUM');
        var person_id = resultSet.getColumnValueAsString('PERSON_ID');
        var birth_date = resultSet.getColumnValueAsString('BIRTH_DATE');
        var gender = resultSet.getColumnValueAsString('GENDER');
        var country = resultSet.getColumnValueAsString('COUNTRY');
        var country_of_Birth = resultSet.getColumnValueAsString('COUNTRY_OF_BIRTH');
        var loan_amount = resultSet.getColumnValueAsString('LOAN_AMOUNT');
        var email = resultSet.getColumnValueAsString('EMAIL');

        // 0 = clean so far; any failing check switches it to 1.
        var recordStatus = 0;
        //----birth date check
        // birthDateCheck takes four arguments; passing errorDetails as an
        // extra one shifted the array into the recordStatus slot.
        recordStatus = birthDateCheck(row_num, person_id, birth_date, recordStatus);
        //------gender check
        recordStatus = genderCheck(row_num, person_id, gender, recordStatus);
        //------country and country of birth check
        recordStatus = countryCheck(row_num, person_id, country, country_of_Birth, recordStatus);
        //------email check
        recordStatus = emailCheck(row_num, person_id, email, recordStatus);
        //------Loan amount negative check
        recordStatus = loanCheck(row_num, person_id, loan_amount, recordStatus);

        //------Update error Rownum's in variable
        if (recordStatus === 1) {
            errorRowNum.push(row_num);
        }
    }
}
//------function to transfer valid records to clean table
/**
 * Copies the rows of the requested upload date from SF_STRUCT_STAGE_RAW into
 * SF_STRUCT_CLEAN, skipping every row whose row_num is listed in errorRowNum.
 *
 * Given_Name and Family_Name are stored reversed, the gender code is replaced
 * by its lkp_gender description (left join, so unknown codes yield NULL), a
 * new key is drawn from seq1 and current_date() is stored as the last column.
 * The upload date comes from the procedure argument UPLOAD_DATE_IN, bound as :1.
 *
 * @param {string[]} errorRowNum - Row numbers to exclude; may be empty.
 * @returns {undefined}
 */
function transfer_valid_records(errorRowNum) {
    var condition = "where upload_date = to_date(:1,'YYYY-MM-DD')";
    if (errorRowNum.length > 0) {
        // NOTE(review): the row numbers are inlined as one NOT IN list.
        // Snowflake has documented a cap on expressions per list - confirm
        // the current limit if large numbers of rows can fail validation.
        condition += " and raw.row_num not in (" + errorRowNum.toString() + ")";
    }
    condition += ";";

    var cmd = `insert into SF_STRUCT_CLEAN
select
seq1.nextval,
raw.Person_id,
reverse(raw.Given_Name),
reverse(raw.Family_Name),
raw.Title,
raw.BIRTH_DATE,
lkp_gen.description,
raw.Mobile_Phone,
raw.Email,
raw.Address_Line_1,
raw.Postcode,
raw.State,
raw.Country,
raw.Country_of_Birth,
raw.loan_amount,
current_date()
from SF_STRUCT_STAGE_RAW raw
left join lkp_gender lkp_gen
on raw.Gender = lkp_gen.code ` + condition;

    var stmt = snowflake.createStatement({
        sqlText: cmd,
        binds: [UPLOAD_DATE_IN]
    });
    stmt.execute();
}
//----------------------call main functions
try
//-------------define variables
var birthDateVar = 'BIRTH_DATE';
var genderVar = 'GENDER';
var countryVar = 'COUNTRY';
var loanVar = 'LOAN_AMOUNT';
var emailVar = 'EMAIL';
var tableName = 'SF_STRUCT_STAGE_RAW';
var errorRowNum = [];
var errorDetails = [];
var currentDate = new Date();
var update_date = currentDate.getFullYear()+'-'+(currentDate.getMonth()+1)+'-'+currentDate.getDate();
validateAllDataTraceErrors();
insertErrorRecords(errorDetails);
transfer_valid_records(errorRowNum);
result = 'Success';
catch (err)
result = "Failed: Code: " + err.code + "\n State: " + err.state;
result += "\n Message: " + err.message;
result += "\nStack Trace:\n" + err.stackTraceTxt;
return result;
$$
;
原始表具有以下架构(列名 数据类型): ROW_NUM NUMBER(38,0) PERSON_ID NUMBER(38,0) GIVEN_NAME VARCHAR(100) FAMILY_NAME VARCHAR(100) TITLE VARCHAR(100) BIRTH_DATE VARCHAR(100) GENDER VARCHAR(100) MOBILE_PHONE VARCHAR(100) EMAIL VARCHAR(100) ADDRESS_LINE_1 VARCHAR(100) POSTCODE VARCHAR(100) STATE VARCHAR(100) COUNTRY VARCHAR(100) COUNTRY_OF_BIRTH VARCHAR(100) LOAN_AMOUNT VARCHAR(50) 文件名(原列名推测为 FILE_NAME) VARCHAR(100) UPLOAD_DATE DATE
转换规则非常基本,如下所示
【问题讨论】:
【参考方案1】:1) 您正在逐行处理。我看到您使用正则表达式,因此应该可以将此存储过程转换为纯 SQL(或至少将行作为一组处理)。
https://docs.snowflake.com/en/sql-reference/functions-regexp.html
2) 它在 JavaScript 对象中保存失败的行号,如果有大量的行失败,您可能会遇到内存问题。
【讨论】:
嗯,这确实可以用纯 SQL 完成,但这只是一个 POC,其中的规则很简单;将来可能会有更复杂的规则,无法用纯 SQL 实现。所以我想逐行应用转换。我知道使用 JavaScript 对象可能存在内存问题,但我确实想按照已定义的错误表来跟踪错误。即便如此,与其他同样逐行处理的 ETL 工具相比,处理所花费的时间还是太长了。 即使在这种情况下,与其打开游标并逐行循环,不如运行一条 SELECT 语句来统一调用您的 UDF,例如:SELECT * FROM Y WHERE checkmail(x) AND morecomplex(z)。当然,也可能存在无法使用这种方法的情况,那时就会出现性能问题。以上是关于注册性能:使用存储过程的数据转换的主要内容,如果未能解决你的问题,请参考以下文章
将 varchar 转换为数字数据类型时出现算术溢出错误。找不到存储过程''。?
将 DbDataReader 的结果转换为 ASP.NET MVC 4 中的数据库模型,来自使用 ADO.NET 的存储过程 [重复]