Custom evaluation script for regression models in R for Azure Machine Learning


This post presents a custom R script for evaluating regression models in Azure Machine Learning; hopefully it serves as a useful reference.
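The script below is intended to run inside an Execute R Script module in Azure Machine Learning Studio: maml.mapInputPort(1) receives the scored dataset (containing the "Scored Labels" column produced by the scoring module), maml.mapInputPort(2) receives the original dataset, and maml.mapOutputPort("res") returns the metrics table on the module's output port. To experiment with the script outside Azure ML, one possible approach is to stub the maml functions before running it; the sketch below is only a hypothetical example and the CSV file names are assumptions:

# Hypothetical local stubs -- uncomment only when running OUTSIDE Azure ML:
# maml.mapInputPort  <- function(port) {
#   if (port == 1) read.csv("scored_dataset.csv") else read.csv("original_dataset.csv")
# }
# maml.mapOutputPort <- function(name) print(get(name, envir = .GlobalEnv))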

# Map 1-based optional input ports to variables
df <- maml.mapInputPort(1) # class: data.frame
df_orig <- maml.mapInputPort(2)

### INPUT PARAMETERS #################################################
# k is the number of parameters to be estimated, excluding the intercept
target_variable_name            <- "LOG_QUANTITY"
predicted_values_variable_name  <- "Scored Labels"
k                               <- 23
######################################################################


### LIBRARIES ##################################
library(ggplot2)


### FUNCTIONS ##################################

# QQplot as ggplot
# Source: https://gist.github.com/rentrop/d39a8406ad8af2a1066c
gg_qq <- function(x, distribution = "norm", ..., line.estimate = NULL, conf = 0.95,
                  labels = names(x)){
  q.function <- eval(parse(text = paste0("q", distribution)))
  d.function <- eval(parse(text = paste0("d", distribution)))
  x <- na.omit(x)
  ord <- order(x)
  n <- length(x)
  P <- ppoints(length(x))
  df <- data.frame(ord.x = x[ord], z = q.function(P, ...))

  if(is.null(line.estimate)){
    Q.x <- quantile(df$ord.x, c(0.25, 0.75))
    Q.z <- q.function(c(0.25, 0.75), ...)
    b <- diff(Q.x)/diff(Q.z)
    coef <- c(Q.x[1] - b * Q.z[1], b)
  } else {
    coef <- coef(line.estimate(ord.x ~ z, data = df))
  }

  zz <- qnorm(1 - (1 - conf)/2)
  SE <- (coef[2]/d.function(df$z)) * sqrt(P * (1 - P)/n)
  fit.value <- coef[1] + coef[2] * df$z
  df$upper <- fit.value + zz * SE
  df$lower <- fit.value - zz * SE

  if(!is.null(labels)){ 
    df$label <- ifelse(df$ord.x > df$upper | df$ord.x < df$lower, labels[ord],"")
    }

  p <- ggplot(df, aes(x=z, y=ord.x)) +
    geom_point(color="blue") + 
    geom_abline(intercept = coef[1], slope = coef[2], linetype = "dashed") +
    geom_ribbon(aes(ymin = lower, ymax = upper), alpha=0.2) 
  if(!is.null(labels)) p <- p + geom_text( aes(label = label))
  
  #print(p)
  #coef
  return(list(plot = p, coef = coef))
}
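# Example usage of gg_qq() (hypothetical, for interactive testing only):
#   qq <- gg_qq(rnorm(100))
#   qq$plot   # the ggplot object with points, reference line and confidence band
#   qq$coef   # intercept and slope of the reference line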



### MAIN #######################################

#---------------
#--- Metrics ---
#---------------
actual_values <- df[, target_variable_name]
predicted_values <- df[, predicted_values_variable_name]

error_values <- actual_values - predicted_values

sd_error_values <- sd(error_values)
standardized_residual_values <- error_values / sd_error_values

n <- length(actual_values)
degrees_of_freedom <- n - k - 1

# Names of the resulting data frame's rows
names <- c("MAE", "MAPE", "RMSE", "NRMSE", "R^2", "adj-R^2", "S")
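# Metric definitions, with e = actual - predicted, y = actual values,
# n = number of observations, k = number of estimated parameters:
#   MAE     = mean(|e|)
#   MAPE    = mean(|100 * e / y|)
#   RMSE    = sqrt(mean(e^2))
#   NRMSE   = RMSE / (max(y) - min(y))
#   R^2     = 1 - SSE / SST, with SSE = sum(e^2) and SST = sum((y - mean(y))^2)
#   adj-R^2 = 1 - (1 - R^2) * (n - 1) / (n - k - 1)
#   S       = sqrt(SSE / (n - k - 1))   (residual standard error)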


mae     <- c(value = mean( abs(error_values), na.rm = TRUE ))
mape    <- c(value = mean( abs(100 * error_values / actual_values), na.rm = TRUE ))
rmses   <- c(value = sqrt( mean( error_values^2 , na.rm = TRUE ) ))
nrmses  <- c(value = rmses /
                ( max(actual_values, na.rm = TRUE) - min(actual_values, na.rm = TRUE) )
           )
rsq     <- c( value = 1 - ( sum(error_values[!is.na(error_values)]^2) /
                            sum( (actual_values[!is.na(actual_values)] - mean(actual_values, na.rm = TRUE) )^2) )
           )
s       <- c(value = sqrt( sum(error_values[!is.na(error_values)]^2) / degrees_of_freedom))
radjsq  <- 1 - (1 - rsq) * (n - 1) / (n - k - 1)

res <- as.data.frame(
        cbind(stats = names,
              round(rbind(mae, mape, rmses, nrmses, rsq, radjsq, s), 6)
        )
       )
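# Note: cbind() combines the character vector 'stats' with the rounded numeric
# matrix, so every value in 'res' is coerced to character in the output table.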


#-------------
#--- Plots ---
#-------------

title0 <- 'HISTOGRAM OF RESIDUALS'
subtitle0 <- target_variable_name

p0 <- ggplot(data = df, aes(error_values)) +
        geom_histogram(fill = "blue") +
        ggtitle(bquote(atop(.(title0), atop(italic(.(subtitle0)), "")))) +
        xlab("Residuals") + ylab("Frequence")

#print(p0)  # Use to debug the code
try(print(p0), silent = TRUE)
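# Wrapping print() in try(..., silent = TRUE) keeps the script from stopping
# when no graphics device is available; in Azure ML Studio the plots are
# rendered on the Execute R Script module's "R Device" output port.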



title1 <- 'PREDICTED VS ACTUAL'
subtitle1 <- target_variable_name

p1 <- ggplot(data = df, aes(x = predicted_values, y = actual_values)) +
        geom_point(color="blue") +
        geom_abline(intercept = 0, linetype = "dashed") +
        ggtitle(bquote(atop(.(title1), atop(italic(.(subtitle1)), "")))) +
        xlab("Predicted values") + ylab("Actual values") #+ expand_limits(x = 0, y = 0)

#print(p1)  # Use to debug the code
try(print(p1), silent = TRUE)



title2 <- 'PREDICTED VS RESIDUAL'
subtitle2 <- target_variable_name

p2 <- ggplot(data = df, aes(x = predicted_values, y = standardized_residual_values)) +
        geom_point(color="blue") +
        geom_hline(yintercept = 0, linetype = "dashed") +
        ggtitle(bquote(atop(.(title2), atop(italic(.(subtitle2)), "")))) +
        xlab("Predicted values") + ylab("Standardized residuals")

#print(p2)  # Use to debug the code
try(print(p2), silent = TRUE)



title3 <- 'NORMAL Q-Q PLOT'
subtitle3 <- target_variable_name

p3 <- gg_qq(error_values)$plot +
        ggtitle(bquote(atop(.(title3), atop(italic(.(subtitle3)), "")))) +
        xlab("Theoretical Quantiles") + ylab("Residuals Quantiles")

#print(p3)  # Use to debug the code
try(print(p3), silent = TRUE)




for (regressor_name in names(df_orig))
{
  if (regressor_name %in% names(df))
    feature_type <- "FEATURE"
  else
    feature_type <- "EXTERNAL FEATURE"
    
  variable_nature <- ifelse(regressor_name == target_variable_name, "LABEL", feature_type)
  if (!grepl("Scored", regressor_name, fixed = TRUE))
  {
    title4 <- paste(variable_nature, ' VS RESIDUAL', sep="")
    subtitle4 <- regressor_name
    
    if (!is.factor(df_orig[, regressor_name]))
    {
      p4 <- ggplot(data = df_orig, aes(x = get(regressor_name), y = standardized_residual_values)) +
              geom_point(color="blue") +
              geom_hline(yintercept = 0, linetype = "dashed") +
              ggtitle(bquote(atop(.(title4), atop(italic(.(subtitle4)), "")))) +
              xlab(tolower(regressor_name)) + ylab("Standardized residuals")
    }
    else
    {
      p4 <- ggplot(data = df_orig, aes(x = get(regressor_name), y = standardized_residual_values)) +
              geom_boxplot(color="blue") +
              #geom_hline(yintercept = 0, linetype = "dashed") +
              ggtitle(bquote(atop(.(title4), atop(italic(.(subtitle4)), "")))) +
              xlab(tolower(regressor_name)) + ylab("Standardized residuals")
    }
    #print(p4)  # Use to debug the code
    try(print(p4), silent = TRUE)
  }
}
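# Note: aes(x = get(regressor_name)) relies on 'regressor_name' being visible
# when each plot is built; with ggplot2 >= 3.0 a more robust alternative
# would be aes(x = .data[[regressor_name]]).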

# Select data.frame to be sent to the output Dataset port
maml.mapOutputPort("res");
