r R中数据科学竞赛的共同特征探索功能
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了r R中数据科学竞赛的共同特征探索功能相关的知识,希望对你有一定的参考价值。
# remove duplicate columns
remDupcols <- function(data){
rem = which(!(names(data) %in% colnames(unique(as.matrix(data), MARGIN=2))))
return(rem)
}
# fast parallel cor
bigcorPar <- function(x, nblocks = 10, verbose = TRUE, ncore="all", ...){
library(ff, quietly = TRUE)
require(doMC)
if(ncore=="all"){
ncore = multicore:::detectCores()
registerDoMC(cores = ncore)
} else{
registerDoMC(cores = ncore)
}
NCOL <- ncol(x)
## test if ncol(x) %% nblocks gives remainder 0
if (NCOL %% nblocks != 0){stop("Choose different 'nblocks' so that ncol(x) %% nblocks = 0!")}
## preallocate square matrix of dimension
## ncol(x) in 'ff' single format
corMAT <- ff(vmode = "single", dim = c(NCOL, NCOL))
## split column numbers into 'nblocks' groups
SPLIT <- split(1:NCOL, rep(1:nblocks, each = NCOL/nblocks))
## create all unique combinations of blocks
COMBS <- expand.grid(1:length(SPLIT), 1:length(SPLIT))
COMBS <- t(apply(COMBS, 1, sort))
COMBS <- unique(COMBS)
## iterate through each block combination, calculate correlation matrix
## between blocks and store them in the preallocated matrix on both
## symmetric sides of the diagonal
results <- foreach(i = 1:nrow(COMBS)) %dopar% {
COMB <- COMBS[i, ]
G1 <- SPLIT[[COMB[1]]]
G2 <- SPLIT[[COMB[2]]]
if (verbose) cat("Block", COMB[1], "with Block", COMB[2], "\n")
flush.console()
COR <- cor(x[, G1], x[, G2], ...)
corMAT[G1, G2] <- COR
corMAT[G2, G1] <- t(COR)
COR <- NULL
}
gc()
return(corMAT)
}
# remove highly correlated features from data
remHighcor <- function(data, cutoff, ...){
data_cor <- cor(sapply(data, as.numeric), ...)
data_cor[is.na(data_cor)] <- 0
rem <- findCorrelation(data_cor, cutoff=cutoff, verbose=T)
return(rem)
}
# remove highly correlated features from data faster
remHighcorPar <- function(data, nblocks, ncore, cutoff, ...){
data_cor = bigcorPar(data, nblocks = nblocks, ncore = ncore, ...)
data_cor = data.matrix(as.data.frame(as.ffdf(data_cor)))
data_cor[is.na(data_cor)] <- 0
rem <- findCorrelation(data_cor, cutoff=cutoff, verbose=T)
return(rem)
}
# remove features from data which are highly correlated to those in main
remHighcor0 <- function(data, main, cutoff, ...){
res = cor(sapply(data, as.numeric), sapply(main, as.numeric), ...) # res is names(data) X names(main) matrix
res[is.na(res)] <- 0
res = res > cutoff
rem = unname(which(rowSums(res) > 0))
return(rem)
}
# one-hot/dummy encode factors
# does not depend on train/test split as long as # of factors is same
categtoOnehot <- function(data, fullrank=T, ...){
data[,names(data)] = lapply(data[,names(data)] , factor) # convert character to factors
if (fullrank){
res = as.data.frame(as.matrix(model.matrix( ~., data, ...)))[,-1]
} else {
res = as.data.frame(as.matrix(model.matrix(~ ., data, contrasts.arg = lapply(data, contrasts, contrasts=FALSE), ...)))[,-1]
}
return(res)
}
# orthogonal polynomial encoding for ordered factors - use only for a few 2-5 levels !
# not all features may actually be important and must be removed
# include dependence on train/test split ?
categtoOrthPoly <- function(data, fullrank=T, ...){
data[,names(data)] = lapply(data[,names(data)] , ordered) # convert character to factors
if (fullrank){
res = as.data.frame(as.matrix(model.matrix( ~., data, ...)))[,-1]
} else {
res = as.data.frame(as.matrix(model.matrix(~ ., data, contrasts.arg = lapply(data, contrasts, contrasts=FALSE), ...)))[,-1]
}
return(res)
}
# Out-of-fold mean/sd/median deviation encoding.
# Useful for high cardinality categorical variables.
# always full rank
categtoDeviationenc <- function(char_data, num_data, traininds=NULL, funcs = funs(mean(.,na.rm=T), sd(.,na.rm=T), 'median' = median(.,na.rm=T))){
if(length(traininds) == 0){
train_char_data = char_data
train_num_data = num_data
} else {
train_char_data = char_data[traininds, ]
train_num_data =num_data[traininds, ]
}
res = list()
for(x in names(train_char_data)){
res[[x]] = train_num_data %>% group_by(.dots=train_char_data[,x]) %>% summarise_each(funcs) # calculate mean/sd/median encodings
res[[x]][,-1] = apply(res[[x]][,-1], 2, scale, scale=FALSE, center=TRUE) # calculate deviances of mean/sd/median encodings
# rename columns
colnames(res[[x]])[1] = x
if (ncol(train_num_data) == 1)
colnames(res[[x]])[-1] = paste0(names(train_num_data),'_',names(res[[x]])[-1])
res[[x]] <- merge(char_data[,x,drop=F], res[[x]], all.x=T, by=x)[,-1] # apply encodings to all data
}
res = data.frame(do.call(cbind, res))
return(res)
}
# function to remove equivalent factors
remEquivfactors <- function(x.data, ref.data = NULL){
if(length(ref.data) == 0L){
all = x.data
} else{
all = data.frame(cbind(ref.data, x.data))
}
all[,names(all)] = lapply(all[,names(all), drop=F], function(l){
as.numeric(reorder(x=l, X=seq(1:nrow(all)), FUN=mean))
})
rem = which(!(names(x.data) %in% colnames(unique(as.matrix(all), MARGIN=2, fromLast = F)))) # removal of cols towards end will be preferred
return(rem)
}
# function to create n-way categorical-categorical interactions
nwayInterac <- function(char_data, n){
nway <- as.data.frame(combn(ncol(char_data), n, function(y) do.call(paste0, char_data[,y])))
names(nway) = combn(ncol(char_data), n, function(y) paste0(names(char_data)[y], collapse='.'))
rem = remEquivfactors(x.data = nway, ref.data=NULL)
if(length(rem) > 0)
nway = nway[,-rem]
return(nway)
}
# create xgbfi xlsx output
xgbfi <- function(xgbfi_path = 'code/xgbfi/bin/XgbFeatureInteractions.exe', # need to clone code from xgbfi repo
model_path,
output_path, # if filename then without xlsx extension
d = 3, # upper bound for extracted feature interactions depth
g = -1, # upper bound for interaction start deepening (zero deepening => interactions starting @root only)
t = 100, #upper bound for trees to be parsed
k = 100, # upper bound for exported feature interactions per depth level
s = 'Gain', # score metric to sort by (Gain, Fscore, wFScore, AvgwFScore, AvgGain, ExpGain)
h = 10 # amounts of split value histograms
){
system(paste0('mono ',xgbfi_path,' -m ',model_path,' -o ',output_path,
' -d ',d,' -g ',g,' -t ',t,' -k ',k,' -s ',s,' -h ',h)) # saves output .xlsx file in given ouput directory
}
# function to extract calendar features
# function to call in holidays open data / other open data from python/othersources ?
# function to calcualte mode?
# time series outliers
# time series outliers
tsoutliers <- function(x,plot=FALSE)
{
x <- as.ts(x)
if(frequency(x)>1)
resid <- stl(x,s.window="periodic",robust=TRUE)$time.series[,3]
else
{
tt <- 1:length(x)
resid <- residuals(loess(x ~ tt))
}
resid.q <- quantile(resid,prob=c(0.25,0.75))
iqr <- diff(resid.q)
limits <- resid.q + 1.5*iqr*c(-1,1)
score <- abs(pmin((resid-limits[1])/iqr,0) + pmax((resid - limits[2])/iqr,0))
if(plot)
{
plot(x)
x2 <- ts(rep(NA,length(x)))
x2[score>0] <- x[score>0]
tsp(x2) <- tsp(x)
points(x2,pch=19,col="red")
return(invisible(score))
}
else
return(score)
}
# time series lag features
以上是关于r R中数据科学竞赛的共同特征探索功能的主要内容,如果未能解决你的问题,请参考以下文章
R语言实战应用精讲50篇(三十一)-R语言入门系列-tidyverse数据分析流程
R语言SVM支持向量机模型数据分类实战:探索性数据分析模型调优特征选择核函数选择
R语言数据集探索性数据分析(exploratory data analysis, EDA)示例:所有特征的直方图可视化基于目标变量的分组可视化每个特征的箱图特征的相关性分析pairs散点图矩阵