r R中的自定义评估回归模型脚本用于Azure机器学习
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了r R中的自定义评估回归模型脚本用于Azure机器学习相关的知识,希望对你有一定的参考价值。
# Map 1-based optional input ports to variables
df <- maml.mapInputPort(1) # class: data.frame
df_orig <- maml.mapInputPort(2)
### INPUT PARAMETERS #################################################
# k is the number of parameters to be estimated excuding the intercept
target_variable_name <- "LOG_QUANTITY"
predicted_values_variable_name <- "Scored Labels"
k <- 23
######################################################################
### LIBRARIES ##################################
library(ggplot2)
### FUNCTIONS ##################################
# QQplot as ggplot
# Source: https://gist.github.com/rentrop/d39a8406ad8af2a1066c
gg_qq <- function(x, distribution = "norm", ..., line.estimate = NULL, conf = 0.95,
labels = names(x)){
q.function <- eval(parse(text = paste0("q", distribution)))
d.function <- eval(parse(text = paste0("d", distribution)))
x <- na.omit(x)
ord <- order(x)
n <- length(x)
P <- ppoints(length(x))
df <- data.frame(ord.x = x[ord], z = q.function(P, ...))
if(is.null(line.estimate)){
Q.x <- quantile(df$ord.x, c(0.25, 0.75))
Q.z <- q.function(c(0.25, 0.75), ...)
b <- diff(Q.x)/diff(Q.z)
coef <- c(Q.x[1] - b * Q.z[1], b)
} else {
coef <- coef(line.estimate(ord.x ~ z))
}
zz <- qnorm(1 - (1 - conf)/2)
SE <- (coef[2]/d.function(df$z)) * sqrt(P * (1 - P)/n)
fit.value <- coef[1] + coef[2] * df$z
df$upper <- fit.value + zz * SE
df$lower <- fit.value - zz * SE
if(!is.null(labels)){
df$label <- ifelse(df$ord.x > df$upper | df$ord.x < df$lower, labels[ord],"")
}
p <- ggplot(df, aes(x=z, y=ord.x)) +
geom_point(color="blue") +
geom_abline(intercept = coef[1], slope = coef[2], linetype = "dashed") +
geom_ribbon(aes(ymin = lower, ymax = upper), alpha=0.2)
if(!is.null(labels)) p <- p + geom_text( aes(label = label))
#print(p)
#coef
return(list(plot = p, coef))
}
### MAIN #######################################
#---------------
#--- Metrics ---
#---------------
actual_values <- df[, target_variable_name]
predicted_values <- df[, predicted_values_variable_name]
error_values <- actual_values - predicted_values
sd_error_values <- sd(error_values)
standardized_residual_values <- error_values / sd_error_values
n <- length(actual_values)
degrees_of_freedom <- n - k - 1
# Names of the resulting data frame's rows
names <- c("MAE", "MAPE", "RMSE", "NRMSE", "R^2", "adj-R^2", "S")
mae <- c(value = mean( abs(error_values) ))
mape <- c(value = mean( abs(100 * error_values / actual_values) ))
rmses <- c(value = sqrt( mean( error_values^2 , na.rm = TRUE ) ))
nrmses <- c(value = rmses /
( max(actual_values, na.rm = TRUE) - min(actual_values, na.rm = TRUE) )
)
rsq <- c( value = 1 - ( sum(error_values[!is.na(error_values)]^2) /
sum( (actual_values[!is.na(actual_values)] - mean(actual_values, na.rm = TRUE) )^2) )
)
s <- c(value = sqrt( sum(error_values[!is.na(error_values)]^2) / degrees_of_freedom))
radjsq <- 1 - (1 - rsq) * (n - 1) / (n - k - 1)
res <- as.data.frame(
cbind(stats = names,
round(rbind(mae, mape, rmses, nrmses, rsq, radjsq, s), 6)
)
)
#-------------
#--- Plots ---
#-------------
title0 <- 'HISTOGRAM OF RESIDUALS'
subtitle0 <- target_variable_name
p0 <- ggplot(data = df, aes(error_values)) +
geom_histogram(fill = "blue") +
ggtitle(bquote(atop(.(title0), atop(italic(.(subtitle0)), "")))) +
xlab("Residuals") + ylab("Frequence")
#print(p0) # Use to debug the code
try(print(p0), silent = TRUE)
title1 <- 'PREDICTED VS ACTUAL'
subtitle1 <- target_variable_name
p1 <- ggplot(data = df, aes(x = predicted_values, y = actual_values)) +
geom_point(color="blue") +
geom_abline(intercept = 0, linetype = "dashed") +
ggtitle(bquote(atop(.(title1), atop(italic(.(subtitle1)), "")))) +
xlab("Predicted values") + ylab("Actual values") #+ expand_limits(x = 0, y = 0)
#print(p1) # Use to debug the code
try(print(p1), silent = TRUE)
title2 <- 'PREDICTED VS RESIDUAL'
subtitle2 <- target_variable_name
p2 <- ggplot(data = df, aes(x = predicted_values, y = standardized_residual_values)) +
geom_point(color="blue") +
geom_hline(yintercept = 0, linetype = "dashed") +
ggtitle(bquote(atop(.(title2), atop(italic(.(subtitle2)), "")))) +
xlab("Predicted values") + ylab("Standardized residuals")
#print(p2) # Use to debug the code
try(print(p2), silent = TRUE)
title3 <- 'NORMAL Q-Q PLOT'
subtitle3 <- target_variable_name
p3 <- gg_qq(error_values)$plot +
ggtitle(bquote(atop(.(title3), atop(italic(.(subtitle3)), "")))) +
xlab("Theoretical Quantiles") + ylab("Residuals Quantiles")
#print(p3) # Use to debug the code
try(print(p3), silent = TRUE)
for (regressor_name in names(df_orig))
{
if (regressor_name %in% names(df))
feature_type <- "FEATURE"
else
feature_type <- "EXTERNAL FEATURE"
variable_nature <- ifelse(regressor_name == target_variable_name, "LABEL", feature_type)
if (!grepl("Scored", regressor_name, fixed = TRUE))
{
title4 <- paste(variable_nature, ' VS RESIDUAL', sep="")
subtitle4 <- regressor_name
if (class(df_orig[,regressor_name]) != "factor")
{
p4 <- ggplot(data = df_orig, aes(x = get(regressor_name), y = standardized_residual_values)) +
geom_point(color="blue") +
geom_hline(yintercept = 0, linetype = "dashed") +
ggtitle(bquote(atop(.(title4), atop(italic(.(subtitle4)), "")))) +
xlab(tolower(regressor_name)) + ylab("Standardized residuals")
}
else
{
p4 <- ggplot(data = df_orig, aes(x = get(regressor_name), y = standardized_residual_values)) +
geom_boxplot(color="blue") +
#geom_hline(yintercept = 0, linetype = "dashed") +
ggtitle(bquote(atop(.(title4), atop(italic(.(subtitle4)), "")))) +
xlab(tolower(regressor_name)) + ylab("Standardized residuals")
}
print(p4) # Use to debug the code
#try(print(p4), silent = TRUE)
}
}
# Select data.frame to be sent to the output Dataset port
maml.mapOutputPort("res");
以上是关于r R中的自定义评估回归模型脚本用于Azure机器学习的主要内容,如果未能解决你的问题,请参考以下文章
R语言编写自定义函数评估回归模型预测变量的相对重要性(Relative importance)通过在所有可能的子模型中添加一个预测变量而获得的R方的平均增加评估预测变量的重要度并通过点图可视化
R语言使用caret包的train函数构建多元自适应回归样条(MARS)模型模型调优自定义设置trainControl函数和tuneLength参数自定义调优评估指标
R语言计算F1评估指标实战:F1 score使用R中caret包中的confusionMatrix()函数为给定的logistic回归模型计算F1得分(和其他指标)
R语言使用caret包的train函数构建多元自适应回归样条(MARS)模型模型调优自定义设置tuneGrid参数多个超参数组合调优trainControl函数自定义调优评估指标
R语言广义线性模型函数GLMglm函数构建泊松回归模型(Poisson regression)分析模型是否过离散(Overdispersion)使用残差偏差与模型中的剩余自由度的比率值评估