关于性能:使用存储过程进行数据转换

Posted

技术标签:

【中文标题】关于性能:使用存储过程进行数据转换【英文标题】:Reg Performance : Data Transformation using Stored procedure 【发布时间】:2020-06-01 04:10:26 【问题描述】:

我尝试使用 XL 规格的仓库在 Snowflake 中转换仅 300 万条记录。转换规则附后。错误记录写入错误表,成功记录写入清理表。使用的仓库规格是 XL,处理时间为 1 小时 5 分钟。与记录数量和转换规则的复杂度相比,花费的时间似乎相当长。能否请大家查看下面的存储过程,并建议是否需要修改代码以提高性能?谢谢

-- Snowflake stored procedure with a JavaScript body.
-- Reads the rows of SF_STRUCT_STAGE_RAW whose upload_date equals UPLOAD_DATE_IN
-- (parsed with to_date(..., 'YYYY-MM-DD')), runs the validation checks defined
-- below on each row, inserts the collected failures into error_records_log and
-- copies the remaining rows into SF_STRUCT_CLEAN.
-- Returns 'Success', or a "Failed: ..." message built from the caught error.
create or replace procedure clean_transform_table(UPLOAD_DATE_IN VARCHAR)
returns varchar not null
  language javascript
  as
$$

/**
 * Checks that a string is shaped like a date: four digits, a dash, one or two
 * digits, a dash, one or two digits (e.g. "2020-06-01" or "2020-6-1").
 * Only the format is checked; calendar validity (month 13, day 40) is not.
 *
 * @param {string} datevalue - Text to test; must be a string (not null).
 * @returns {RegExpMatchArray|null} The match result, or null when the text
 *   does not have the expected shape. Callers compare the result with null.
 */
function isValidDate(datevalue) {
    const pattern = /^\d{4}-\d{1,2}-\d{1,2}$/;
    return datevalue.match(pattern);
}


/**
 * Checks that a string ends with "<non-space>@<non-space>.com".
 * The pattern is not anchored at the start and only accepts ".com" addresses.
 *
 * @param {string} email - Text to test; must be a string (not null).
 * @returns {RegExpMatchArray|null} The match result, or null when the text
 *   does not match. Callers compare the result with null.
 */
function isvalidEmail(email) {
    const pattern = /\S+@\S+\.com$/;
    return email.match(pattern);
}


/**
 * Writes all collected error tuples to error_records_log in one INSERT.
 *
 * @param {string[]} errorDetails - Pre-formatted SQL value tuples such as
 *   "('1','2',...)". They are joined with commas and appended verbatim to the
 *   statement, so each entry must already be valid, escaped SQL.
 * @returns {undefined} Nothing; no statement is issued when the list is empty.
 */
function insertErrorRecords(errorDetails) {
    if (errorDetails.length === 0) {
        return;
    }
    const cmd = "insert into error_records_log values" + errorDetails.join(",");
    const stmt = snowflake.createStatement({ sqlText: cmd });
    stmt.execute();
}




/**
 * Validates the birth date of one staged row.
 * The value fails when it is null, blank after trimming, or rejected by
 * isValidDate. On failure an error tuple is appended to the shared
 * errorDetails list (read from the enclosing procedure scope together with
 * tableName, birthDateVar and update_date).
 *
 * @param {string} row_num - Row number of the record.
 * @param {string} person_id - Person id of the record.
 * @param {?string} birth_date - Raw birth date text.
 * @param {number} recordStatus - Status accumulated by earlier checks.
 * @returns {number} 1 when this check fails, otherwise recordStatus unchanged.
 */
function birthDateCheck(row_num, person_id, birth_date, recordStatus) {
    const trimmed = birth_date == null ? '' : birth_date.trim();
    if (trimmed !== '' && isValidDate(trimmed) != null) {
        return recordStatus;
    }
    const fields = [row_num, person_id, tableName, birthDateVar, "Value is null or empty or invalid", update_date, birth_date];
    // The tuple is spliced into an INSERT as text, so backslashes and single
    // quotes are escaped; null becomes an empty string, as Array.join did.
    const quoted = fields.map((value) =>
        value == null ? "" : String(value).replace(/\\/g, "\\\\").replace(/'/g, "''"));
    errorDetails.push("('" + quoted.join("','") + "')");
    return 1;
}
    




/**
 * Validates the gender code of one staged row.
 * The value fails when it is null or, after trimming, is not 'M' or 'F'.
 * On failure an error tuple is appended to the shared errorDetails list
 * (read from the enclosing procedure scope together with tableName,
 * genderVar and update_date).
 *
 * @param {string} row_num - Row number of the record.
 * @param {string} person_id - Person id of the record.
 * @param {?string} gender - Raw gender code.
 * @param {number} recordStatus - Status accumulated by earlier checks.
 * @returns {number} 1 when this check fails, otherwise recordStatus unchanged.
 */
function genderCheck(row_num, person_id, gender, recordStatus) {
    if (gender != null && ['M', 'F'].includes(gender.trim())) {
        return recordStatus;
    }
    const fields = [row_num, person_id, tableName, genderVar, "Value is null or other in [M,F]", update_date, gender];
    // The tuple is spliced into an INSERT as text, so backslashes and single
    // quotes are escaped; null becomes an empty string, as Array.join did.
    const quoted = fields.map((value) =>
        value == null ? "" : String(value).replace(/\\/g, "\\\\").replace(/'/g, "''"));
    errorDetails.push("('" + quoted.join("','") + "')");
    return 1;
}
    



/**
 * Checks that country and country of birth are the same for one staged row.
 * On mismatch an error tuple is appended to the shared errorDetails list
 * (read from the enclosing procedure scope together with tableName,
 * countryVar and update_date).
 *
 * @param {string} row_num - Row number of the record.
 * @param {string} person_id - Person id of the record.
 * @param {?string} country - Country value.
 * @param {?string} country_of_Birth - Country-of-birth value.
 * @param {number} recordStatus - Status accumulated by earlier checks.
 * @returns {number} 1 when this check fails, otherwise recordStatus unchanged.
 */
function countryCheck(row_num, person_id, country, country_of_Birth, recordStatus) {
    // Loose comparison kept on purpose: null and undefined count as equal.
    if (country == country_of_Birth) {
        return recordStatus;
    }
    const fields = [row_num, person_id, tableName, countryVar, "Value is not same as country_of_Birth", update_date, country];
    // The tuple is spliced into an INSERT as text, so backslashes and single
    // quotes are escaped; null becomes an empty string, as Array.join did.
    const quoted = fields.map((value) =>
        value == null ? "" : String(value).replace(/\\/g, "\\\\").replace(/'/g, "''"));
    errorDetails.push("('" + quoted.join("','") + "')");
    return 1;
}
    



/**
 * Checks that the loan amount of one staged row is not negative.
 * Null, blank and non-numeric values pass this check; only a value that
 * converts to a number below zero fails. On failure an error tuple is
 * appended to the shared errorDetails list (read from the enclosing procedure
 * scope together with tableName, loanVar and update_date).
 *
 * @param {string} row_num - Row number of the record.
 * @param {string} person_id - Person id of the record.
 * @param {?string} loan_amount - Raw loan amount text.
 * @param {number} recordStatus - Status accumulated by earlier checks.
 * @returns {number} 1 when this check fails, otherwise recordStatus unchanged.
 */
function loanCheck(row_num, person_id, loan_amount, recordStatus) {
    const trimmed = loan_amount == null ? '' : loan_amount.trim();
    // Number() makes the string-to-number conversion explicit; NaN < 0 is false.
    if (trimmed === '' || !(Number(trimmed) < 0)) {
        return recordStatus;
    }
    const fields = [row_num, person_id, tableName, loanVar, "Value is negative number", update_date, loan_amount];
    // The tuple is spliced into an INSERT as text, so backslashes and single
    // quotes are escaped; null becomes an empty string, as Array.join did.
    const quoted = fields.map((value) =>
        value == null ? "" : String(value).replace(/\\/g, "\\\\").replace(/'/g, "''"));
    errorDetails.push("('" + quoted.join("','") + "')");
    return 1;
}
    



/**
 * Validates the e-mail address of one staged row.
 * A null e-mail passes; a non-null value fails when isvalidEmail rejects it.
 * On failure an error tuple is appended to the shared errorDetails list
 * (read from the enclosing procedure scope together with tableName,
 * emailVar and update_date).
 *
 * @param {string} row_num - Row number of the record.
 * @param {string} person_id - Person id of the record.
 * @param {?string} email - Raw e-mail text.
 * @param {number} recordStatus - Status accumulated by earlier checks.
 * @returns {number} 1 when this check fails, otherwise recordStatus unchanged.
 */
function emailCheck(row_num, person_id, email, recordStatus) {
    if (email == null || isvalidEmail(email) != null) {
        return recordStatus;
    }
    const fields = [row_num, person_id, tableName, emailVar, "value is invalid email", update_date, email];
    // The tuple is spliced into an INSERT as text, so backslashes and single
    // quotes are escaped; null becomes an empty string, as Array.join did.
    const quoted = fields.map((value) =>
        value == null ? "" : String(value).replace(/\\/g, "\\\\").replace(/'/g, "''"));
    errorDetails.push("('" + quoted.join("','") + "')");
    return 1;
}
    



/**
 * Reads every SF_STRUCT_STAGE_RAW row for the requested upload date, runs all
 * validation checks on it and records the row number of each failing row.
 *
 * Uses names from the enclosing procedure scope: UPLOAD_DATE_IN (bound as the
 * date filter) and errorRowNum (row numbers of failed rows are pushed onto
 * it). The individual check functions append their own error tuples.
 *
 * @returns {undefined} Nothing; results are accumulated in errorRowNum.
 */
function validateAllDataTraceErrors() {
    const cmd = "select * from SF_STRUCT_STAGE_RAW where upload_date = to_date(:1,'YYYY-MM-DD')";
    const stmt = snowflake.createStatement({
        sqlText: cmd,
        binds: [UPLOAD_DATE_IN],
    });
    const resultSet = stmt.execute();

    // Rows are validated one at a time, in result-set order.
    while (resultSet.next()) {
        const row_num = resultSet.getColumnValueAsString('ROW_NUM');
        const person_id = resultSet.getColumnValueAsString('PERSON_ID');
        const birth_date = resultSet.getColumnValueAsString('BIRTH_DATE');
        const gender = resultSet.getColumnValueAsString('GENDER');
        const country = resultSet.getColumnValueAsString('COUNTRY');
        const country_of_Birth = resultSet.getColumnValueAsString('COUNTRY_OF_BIRTH');
        const loan_amount = resultSet.getColumnValueAsString('LOAN_AMOUNT');
        const email = resultSet.getColumnValueAsString('EMAIL');

        // Each check returns 1 on failure or passes the status through, so a
        // single failure keeps the status at 1 while later checks still run.
        let recordStatus = 0;
        // birthDateCheck takes four arguments; passing errorDetails as a fourth
        // argument would put the array into the recordStatus slot.
        recordStatus = birthDateCheck(row_num, person_id, birth_date, recordStatus);
        recordStatus = genderCheck(row_num, person_id, gender, recordStatus);
        recordStatus = countryCheck(row_num, person_id, country, country_of_Birth, recordStatus);
        recordStatus = emailCheck(row_num, person_id, email, recordStatus);
        recordStatus = loanCheck(row_num, person_id, loan_amount, recordStatus);

        if (recordStatus === 1) {
            errorRowNum.push(row_num);
        }
    }
}
    




/**
 * Copies the rows of SF_STRUCT_STAGE_RAW for the requested upload date into
 * SF_STRUCT_CLEAN, skipping every row whose row number is in errorRowNum.
 * Given and family names are reversed and the gender code is replaced by its
 * description from lkp_gender (left join).
 *
 * Uses UPLOAD_DATE_IN from the enclosing procedure scope as the bound date.
 *
 * @param {string[]} errorRowNum - Row numbers to exclude. They are spliced
 *   into the statement as a comma-separated NOT IN list, so they must be
 *   plain numbers. NOTE(review): a very large list produces a very large
 *   statement; confirm it stays within Snowflake's statement limits.
 * @returns {undefined} Nothing; the INSERT is executed for its side effect.
 */
function transfer_valid_records(errorRowNum) {
    let condition = "where upload_date = to_date(:1,'YYYY-MM-DD')";
    if (errorRowNum.length > 0) {
        condition += " and raw.row_num not in (" + errorRowNum.join(",") + ")";
    }
    condition += ";";

    const cmd = `insert into SF_STRUCT_CLEAN
select
seq1.nextval,
raw.Person_id,
reverse(raw.Given_Name),
reverse(raw.Family_Name),
raw.Title,
raw.BIRTH_DATE,
lkp_gen.description,
raw.Mobile_Phone,
raw.Email,
raw.Address_Line_1,
raw.Postcode,
raw.State,
raw.Country,
raw.Country_of_Birth,
raw.loan_amount,
current_date()
from SF_STRUCT_STAGE_RAW raw
left join lkp_gender lkp_gen
on raw.Gender = lkp_gen.code ` + condition;

    const stmt = snowflake.createStatement({
        sqlText: cmd,
        binds: [UPLOAD_DATE_IN],
    });
    stmt.execute();
}


//----------------------call main functions 
try

//-------------define variables
var birthDateVar = 'BIRTH_DATE';
var genderVar = 'GENDER';
var countryVar = 'COUNTRY';
var loanVar = 'LOAN_AMOUNT';
var emailVar = 'EMAIL';
var tableName = 'SF_STRUCT_STAGE_RAW';
var errorRowNum = [];
var errorDetails = [];
var currentDate =  new Date();
var update_date = currentDate.getFullYear()+'-'+(currentDate.getMonth()+1)+'-'+currentDate.getDate();

validateAllDataTraceErrors();

insertErrorRecords(errorDetails);

transfer_valid_records(errorRowNum);

result = 'Success';

catch (err)
result =  "Failed: Code: " + err.code + "\n  State: " + err.state;
result += "\n  Message: " + err.message;
result += "\nStack Trace:\n" + err.stackTraceTxt; 


return result;
$$
  ;

原始表具有以下架构(列名 数据类型): ROW_NUM NUMBER(38,0)、PERSON_ID NUMBER(38,0)、GIVEN_NAME VARCHAR(100)、FAMILY_NAME VARCHAR(100)、TITLE VARCHAR(100)、BIRTH_DATE VARCHAR(100)、GENDER VARCHAR(100)、MOBILE_PHONE VARCHAR(100)、EMAIL VARCHAR(100)、ADDRESS_LINE_1 VARCHAR(100)、POSTCODE VARCHAR(100)、STATE VARCHAR(100)、COUNTRY VARCHAR(100)、COUNTRY_OF_BIRTH VARCHAR(100)、LOAN_AMOUNT VARCHAR(50)、文件名 VARCHAR(100)、UPLOAD_DATE DATE

转换规则非常基本,如下所示

【问题讨论】:

【参考方案1】:

1) 您正在逐行处理。我看到您使用正则表达式,因此应该可以将此存储过程转换为纯 SQL(或至少将行作为一组处理)。

https://docs.snowflake.com/en/sql-reference/functions-regexp.html

2) 它在 JavaScript 对象中保存失败的行号,如果有大量的行失败,您可能会遇到内存问题。

【讨论】:

嗯,这确实可以用纯 SQL 完成,但这只是一个 POC,其中的规则很简单;之后可能会有无法用纯 SQL 实现的更复杂规则,所以我想逐行应用转换。我知道使用 JavaScript 对象可能存在内存问题,但我确实需要按照既定的错误表来记录错误。即便如此,与其他逐行处理的 ETL 工具相比,这里的处理时间仍然过长。

即使在这种情况下,与其打开游标并对每一行循环,不如运行一条 SELECT 语句来统一调用您的 UDF,例如 SELECT * FROM Y WHERE checkmail(x) AND morecomplex(z)。当然,也可能存在无法使用这种方法的情况,那时才会出现性能问题。

以上是关于注册性能:使用存储过程的数据转换的主要内容,如果未能解决你的问题,请参考以下文章

mySQL 存储过程中未知的无效类型转换

将 varchar 转换为数字数据类型时出现算术溢出错误。找不到存储过程''。?

将 DbDataReader 的结果转换为 ASP.NET MVC 4 中的数据库模型,来自使用 ADO.NET 的存储过程 [重复]

如何通过在SSIS中调用函数或存储过程来使用拆分转换过滤和拆分数据?

将 CTE 存储过程转换为 Mysql 兼容的数据库查询

将存储过程 PL/SQL 转换为 Java