r 如何使用ROC分析比较复杂的复发风险模型。该分析使用置换方法的实现
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了r 如何使用ROC分析比较复杂的复发风险模型。该分析使用置换方法的实现相关的知识,希望对你有一定的参考价值。
###################################
### Load R libraries
#Contains survival analysis function
library(survival)
#Functions to build ROC curves from Cox models
library(risksetROC)
###################################
### Data loading
#Function to quickly read .csv files
rcsv <- function(...)
{
read.csv(..., header=T, stringsAsFactors=FALSE,na.strings=c("NA", "#N/A", "#NULL!"))
}
#Read the .csv files, of subject data and survival times
covs <- rcsv('miestimation/pgbd_mi_data_imputed-stata12.csv') #All predictor variables. Variables with imputations have 35 entries each.
surv <- rcsv('pgbdsurvivalroc.csv')
#Merge data based on ID.
#Note: all.y makes sure that all subjects in the surv dataframe are included here, whether or not they have data in the covs datasheet
#Note: I set it so that if two variables have the same name, the variables from the covs sheet will have the suffix _covs
dat <- merge(covs,surv,by="subject_id",all.y=TRUE,suffixes=c("_covs",""))
###################################
### Data handling (Make it usable for the analysis)
##Per talking with Peter, we just want to take an average of the multiply imputed variables to use as our data
#This is a list of all imputed variables.
#The scan function takes user input and saves it as a vector
#This is the lazy way of doing it because you can just copy/paste lists from .csv files
imputed_vars_cat <- scan(what="character",sep=",")
childhoodphysabuse_binary,chronicaffdis_bin,comorbidpersonality2,functmorbid_bin,migraines2,mixedepisodes_collapsed,suicidethobehav_collapsed
imputed_vars_cont <- scan(what="character",sep=",")
anxietybaseline_continuous,negative_leq,sheehanimpairment,sheehantotal,suicidethobehav_collapsed
#This loop will go through the list of imputed variables
#and for each subject, take the median value
#iv is an incrementor variable
for (iv in imputed_vars_cont)
{
#For an imputed variable, find the column number of all imputations
#This is done by grepping for the variable in the column names of the data
#note: I prepend a _ before iv, so the original, unimputed data does not come up in the grep result
var_location <- grep(paste("_",iv,sep=""),names(dat))
ivdat <- dat[,var_location]
#apply: take the input data (ivdat), and performs a function (in this case median) on every row (as indicated by the 1 value)
#Will return a vector of length
imputed_collapsed <- apply(ivdat,1,median)
#add the vector of
dat <- cbind(dat,imputed_collapsed)
names(dat)[dim(dat)[2]] <- paste(iv,"i",sep="_")
}
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
for (iv in imputed_vars_cat)
{
#For an imputed variable, find the column number of all imputations
#This is done by grepping for the variable in the column names of the data
#note: I prepend a _ before iv, so the original, unimputed data does not come up in the grep result
var_location <- grep(paste("_",iv,sep=""),names(dat))
ivdat <- dat[,var_location]
#apply: take the input data (ivdat), and performs a function (in this case median) on every row (as indicated by the 1 value)
#Will return a vector of length
imputed_collapsed <- apply(ivdat,1,Mode)
#add the vector of
dat <- cbind(dat,imputed_collapsed)
names(dat)[dim(dat)[2]] <- paste(iv,"i",sep="_")
}
#subset data to relevant covariates only
dat_sub <- subset(dat, select=c(time,event,age,sex,racecat,li_status_collapsed,anxietybaseline_continuous_i,
chronicaffdis_bin_i,migraines2_i, suicidethobehav_collapsed_i,
mixedepisodes_collapsed_i,comorbidpersonality2_i , sheehantotal_i,
negative_leq_i))
###################################
### Permutation and analysis code
#Note: As of Feb 6, 2017, the model reported in the paper is the 730 day, full model
### turn off resampling the phenotype to get the baseline analysis, which should be 0.75240930 0.67311803 0.07929127 if the seed is set to 17
do_not_permute = 0 #Set to 1 if you want to turn off permutations and run the default analysis
full_model = 1 # Set to 1 to include the sheehan and others in the full model (limited set of clinical predictors or a really expanded set of clinical predictors)
null_baseline = 1 # Set to 1 if you want to compare to a model with aboslutely no covariates
#Set number of permutations
nperm=10000
if (do_not_permute == 1)
{
nperm= 1
}
#Makes a matrix that contains number of repetitions dimension, ROC for full, reduced, and difference models, for 365 and 730 days
roc_outputs <- data.frame(matrix(data=NA,nrow=nperm,ncol=6))
names(roc_outputs) <- c("AUC_f_365","AUC_r_365","AUC_d_365","AUC_f_730","AUC_r_730","AUC_d_730")
#Print a status update for the first or every 1000th permutation
for (permnum in 1:nperm)
{
if(permnum %% 1000 == 0 | permnum == 1)
{
print(paste("permutation run", permnum))
}
##Permute the phenotype
#Set the seed for random sampling
set.seed(permnum)
#There are N = dim(dat_sub)[1] rows in the data. Sampling numbers 1:N without
#replacement N times will give us a randomly ordered list of row numbers
shuffled <- sample(1:dim(dat_sub)[1],dim(dat_sub)[1],replace=F)
#Make a permuted dataframe by loading the data in the shuffled order for the
#phenotype column (columns 1 and 2) and in the typical order for all other covariates
dat_sub_shuffle <- data.frame(dat_sub[shuffled,c(1:2)],dat_sub[,-c(1:2)]) #For use in permutation analysis
##Open issue: does it matter or not if I don't reset the row names?
#If you said to not permute, this uses the original data frame instead of the shuffled one
if (do_not_permute == 1)
{
dat_sub_shuffle <- dat_sub
}
#Randomly reorder the data, so that training/test sets contain different observations for each permutation
set.seed(permnum) #Note: set to 17 for the un-permuted data to recreate original results set.seed=17
if (do_not_permute == 1)
{
set.seed(17)
}
new_order <- sample(x=1:dim(dat_sub)[1], size=dim(dat_sub)[1], replace=F) #Defines the new ordering
dat_r <- dat_sub_shuffle[new_order,] #Accomplishes the reordering
#Determine which subjects have a survival event, and which do not
#the which function returns the row number of these subjects
cases <- which(dat_r$event == 1)
controls <- which(dat_r$event == 0)
#Make separate dataframes for cases and controls
#I do this because it's an easy way to insure that
#the training/test sets contain the correct proportion of cases
case_set <- dat_r[cases,]
control_set <- dat_r[controls,]
#Repeat the sequence 1...10
numberseq <- rep(1:10,dim(dat_r)[1])
#Cut the sequence down to the dimension of the datasets
#Now every subject will have been assigned a number from 1 to 10
#We will filter on this number to build training/test sets
control_numbers <- numberseq[1:length(controls)] #Give each control subject a set number
case_numbers <- numberseq[1:length(cases)] #Give each case subject a set number
###Perform cox regression on each subset of data
#Make a matrix to hold the predicted AUCs
#Make it into a data frame so variables can be named easily
pred_base_lps <- data.frame(matrix(data=NA,nrow=dim(dat_r)[1],ncol=2))
names(pred_base_lps) <- c("red","full")
row.names(pred_base_lps) <- paste("REM",1:dim(pred_base_lps)[1],sep="") # Have to remove the existing row names because we're going to merge on row names, which have been shuffled.
#Formula for basic model
if (null_baseline == 0)
{
red_model_formula <- formula(Surv(time,event) ~ age + sex + as.factor(racecat) + as.factor(li_status_collapsed))
}
if (null_baseline == 1)
{
red_model_formula <- formula(Surv(time,event) ~ 1)
}
#Formula for model with clinical predictors
#If choosing to include sheehan, negative leq, and chronicity
if (full_model ==1)
{
model_formula <- formula(Surv(time,event) ~ age + sex + as.factor(racecat) + as.factor(li_status_collapsed) + anxietybaseline_continuous_i +
migraines2_i + as.factor(suicidethobehav_collapsed_i) + mixedepisodes_collapsed_i +
comorbidpersonality2_i + sheehantotal_i + negative_leq_i + chronicaffdis_bin_i)
}
#If choosing to NOT include sheehan, negative leq, and chronicity
if (full_model ==0)
{
model_formula <- formula(Surv(time,event) ~ age + sex + as.factor(racecat) + as.factor(li_status_collapsed) + anxietybaseline_continuous_i +
migraines2_i + as.factor(suicidethobehav_collapsed_i) + mixedepisodes_collapsed_i +
comorbidpersonality2_i )
}
incstart <- 1 # Need to increment this matrix of AUCs based on the dimension of the test sets, which are not uniformly sized
for (i in 1:10)
{
train_set <- rbind(case_set[case_numbers != i,], control_set[control_numbers != i,]) #Get all cases and controls who aren't in the ith set. Combine the set of cases and controls. Regression will be done in these subjects
test_set <- rbind(case_set[case_numbers == i,], control_set[control_numbers == i,]) #Get the ith set to make the testing set
coxmod_base <- coxph(red_model_formula,data=train_set) #Fit the reduced cox model to the training set
coxmod_full <- coxph(model_formula, data= train_set) #Fit the full cox model to the training set
pred_base <- predict(coxmod_base,test_set,"lp") #obtain linear predictors for reduced model on the test set
pred_full <- predict(coxmod_full,test_set,"lp") #obtain linear predictors for full model on the test set
inc <- length(pred_base)
pred_base_lps[incstart:(incstart+inc-1),]$red <- pred_base
pred_base_lps[incstart:(incstart+inc-1),]$full <- pred_full
row.names(pred_base_lps)[incstart:(incstart+inc-1)] <- names(pred_full) #Set teh row names to match the existing data row names
incstart <- incstart + inc #Increment the counter
}
#Based on the row names, merge back with the original data
dmerge <- merge(pred_base_lps,dat_r,by="row.names")
#Now take the ROC from both models
#Do for each cutoff date
t1r365 <- CoxWeights(marker=dmerge$red,Stime = dmerge$time, status=dmerge$event, predict.time=365)
t1f365 <- CoxWeights(marker=dmerge$full,Stime = dmerge$time, status=dmerge$event, predict.time=365)
rocd365 <- t1f365$AUC-t1r365$AUC
t1r730 <- CoxWeights(marker=dmerge$red,Stime = dmerge$time, status=dmerge$event, predict.time=730)
t1f730 <- CoxWeights(marker=dmerge$full,Stime = dmerge$time, status=dmerge$event, predict.time=730)
rocd730 <- t1f730$AUC-t1r730$AUC
roc_outputs[permnum,] <- c(t1f365$AUC, t1r365$AUC, rocd365,t1f730$AUC, t1r730$AUC, rocd730)
}
##Handle the output depending on parameters
if (do_not_permute == 1 & full_model == 1 & null_baseline == 1)
{
t1r730_unpermuted_allpreds <- t1r730
t1f730_unpermuted_allpreds <- t1f730
t1r365_unpermuted_allpreds <- t1r365
t1f365_unpermuted_allpreds <- t1f365
roc_outputs_unpermuted_allpreds <- roc_outputs
save(t1r730_unpermuted_allpreds,t1f730_unpermuted_allpreds,t1r365_unpermuted_allpreds,t1f365_unpermuted_allpreds,roc_outputs_unpermuted_allpreds, file="roc_outputs_unpermuted_allpreds_nullbaseline_v1_feb6_2017.R")
}
table(0.7489 > roc_outputs_unpermuted_allpreds[,6])
if (do_not_permute == 0 & full_model == 1 & null_baseline == 1)
{
t1r730_unpermuted_allpreds <- t1r730
t1f730_unpermuted_allpreds <- t1f730
t1r365_unpermuted_allpreds <- t1r365
t1f365_unpermuted_allpreds <- t1f365
roc_outputs_unpermuted_allpreds <- roc_outputs
save(t1r730_unpermuted_allpreds,t1f730_unpermuted_allpreds,t1r365_unpermuted_allpreds,t1f365_unpermuted_allpreds,roc_outputs_unpermuted_allpreds, file="roc_outputs_permuted_allpreds_nullbaseline_v1_feb6_2017.R")
}
if (do_not_permute == 1 & full_model == 1 & null_baseline == 0)
{
t1r730_unpermuted_allpreds <- t1r730
t1f730_unpermuted_allpreds <- t1f730
t1r365_unpermuted_allpreds <- t1r365
t1f365_unpermuted_allpreds <- t1f365
roc_outputs_unpermuted_allpreds <- roc_outputs
save(t1r730_unpermuted_allpreds,t1f730_unpermuted_allpreds,t1r365_unpermuted_allpreds,t1f365_unpermuted_allpreds,roc_outputs_unpermuted_allpreds, file="roc_outputs_unpermuted_allpreds_v1_aug10_2016.R")
}
if (do_not_permute == 1 & full_model == 1 & null_baseline == 1)
{
t1r730_unpermuted_allpreds <- t1r730
t1f730_unpermuted_allpreds <- t1f730
t1r365_unpermuted_allpreds <- t1r365
t1f365_unpermuted_allpreds <- t1f365
roc_outputs_unpermuted_allpreds <- roc_outputs
save(t1r730_unpermuted_allpreds,t1f730_unpermuted_allpreds,t1r365_unpermuted_allpreds,t1f365_unpermuted_allpreds,roc_outputs_unpermuted_allpreds, file="roc_outputs_unpermuted_allpreds_nullbaseline_v1_feb6_2017.R")
}
if (do_not_permute == 1 & full_model == 0 & null_baseline == 0)
{
t1r730_unpermuted_lesspreds <- t1r730
t1f730_unpermuted_lesspreds <- t1f730
t1r365_unpermuted_lesspreds <- t1r365
t1f365_unpermuted_lesspreds <- t1f365
roc_outputs_unpermuted_lesspreds <- roc_outputs
save(t1r730_unpermuted_lesspreds,t1f730_unpermuted_lesspreds,t1r365_unpermuted_lesspreds,t1f365_unpermuted_lesspreds,roc_outputs_unpermuted_lesspreds, file="roc_outputs_unpermuted_lesspreds_v1_aug10_2016.R")
}
if (do_not_permute == 0 & full_model == 1 & null_baseline == 0)
{
t1r730_permed_allpreds <- t1r730
t1f730_permed_allpreds <- t1f730
t1r365_permed_allpreds <- t1r365
t1f365_permed_allpreds <- t1f365
roc_outputs_permed_allpreds <- roc_outputs
save(t1r730_permed_allpreds,t1f730_permed_allpreds,t1r365_permed_allpreds,t1f365_permed_allpreds,roc_outputs_permed_allpreds, file="roc_outputs_permed_allpreds_v1_aug10_2016.R")
}
if (do_not_permute == 0 & full_model == 0 & null_baseline == 0)
{
t1r730_permed_lesspreds <- t1r730
t1f730_permed_lesspreds <- t1f730
t1r365_permed_lesspreds <- t1r365
t1f365_permed_lesspreds <- t1f365
roc_outputs_permed_lesspreds <- roc_outputs
save(t1r730_permed_lesspreds,t1f730_permed_lesspreds,t1r365_permed_lesspreds,t1f365_permed_lesspreds,roc_outputs_permed_lesspreds, file="roc_outputs_permed_lesspreds_v1_aug10_2016.R")
}
load('roc_outputs_permed_allpreds_v1_aug10_2016.R')
load('roc_outputs_unpermuted_allpreds_v1_aug10_2016.R')
load('roc_outputs_permed_lesspreds_v1_aug10_2016.R')
load('roc_outputs_unpermuted_lesspreds_v1_aug10_2016.R')
pdf("PGBD_ROC_allpredictors_730days_v1_aug10_2016.pdf")
plot(t1f730_unpermuted_allpreds$FP,t1f730_unpermuted_allpreds$TP,type="l",lwd=2,cex.axis=1.45,main="",col="blue",xlab="False Positive Rate (1 - Specificity)",ylab="True Positive Rate (Sensitivity)")
lines(t1r730_unpermuted_allpreds$FP,t1r730_unpermuted_allpreds$TP,type="l",lwd=2,cex.axis=1.45,main="",col="red")
abline(0,1,col="black",lty=2)
pv <- table(roc_outputs_unpermuted_allpreds[,6] > roc_outputs_permed_allpreds[,6])[1] / length( roc_outputs_permed_allpreds[,6])
legend("topleft", legend=c(paste("Full model AUC =", round(t1f730_unpermuted_allpreds$AUC,3)),paste("Reduced model AUC =", round(t1r730_unpermuted_allpreds$AUC,3)),paste("P-value =", pv)),col=c("blue","red"),pch=c(18,18,NA))
dev.off()
pdf("PGBD_ROC_allpredictors_365days_v1_aug10_2016.pdf")
plot(t1f365_unpermuted_allpreds$FP,t1f365_unpermuted_allpreds$TP,type="l",lwd=2,cex.axis=1.45,main="",col="blue",xlab="False Positive Rate (1 - Specificity)",ylab="True Positive Rate (Sensitivity)")
lines(t1r365_unpermuted_allpreds$FP,t1r365_unpermuted_allpreds$TP,type="l",lwd=2,cex.axis=1.45,main="",col="red")
abline(0,1,col="black",lty=2)
pv <- table(roc_outputs_unpermuted_allpreds[,3] > roc_outputs_permed_allpreds[,3])[1] / length( roc_outputs_permed_allpreds[,3])
legend("topleft", legend=c(paste("Full model AUC =", round(t1f365_unpermuted_allpreds$AUC,3)),paste("Reduced model AUC =", round(t1r365_unpermuted_allpreds$AUC,3)),paste("P-value =", pv)),col=c("blue","red"),pch=c(18,18,NA))
dev.off()
pdf("PGBD_ROC_lesspredictors_730days_v1_aug10_2016.pdf")
plot(t1f730_unpermuted_lesspreds$FP,t1f730_unpermuted_lesspreds$TP,type="l",lwd=2,cex.axis=1.45,main="",col="blue",xlab="False Positive Rate (1 - Specificity)",ylab="True Positive Rate (Sensitivity)")
lines(t1r730_unpermuted_lesspreds$FP,t1r730_unpermuted_lesspreds$TP,type="l",lwd=2,cex.axis=1.45,main="",col="red")
abline(0,1,col="black",lty=2)
pv <- table(roc_outputs_unpermuted_lesspreds[,6] > roc_outputs_permed_lesspreds[,6])[1] / length( roc_outputs_permed_lesspreds[,6])
legend("topleft", legend=c(paste("Full model AUC =", round(t1f730_unpermuted_lesspreds$AUC,3)),paste("Reduced model AUC =", round(t1r730_unpermuted_lesspreds$AUC,3)),paste("P-value =", pv)),col=c("blue","red"),pch=c(18,18,NA))
dev.off()
pdf("PGBD_ROC_lesspredictors_365days_v1_aug10_2016.pdf")
plot(t1f365_unpermuted_lesspreds$FP,t1f365_unpermuted_lesspreds$TP,type="l",lwd=2,cex.axis=1.45,main="",col="blue",xlab="False Positive Rate (1 - Specificity)",ylab="True Positive Rate (Sensitivity)")
lines(t1r365_unpermuted_lesspreds$FP,t1r365_unpermuted_lesspreds$TP,type="l",lwd=2,cex.axis=1.45,main="",col="red")
abline(0,1,col="black",lty=2)
pv <- table(roc_outputs_unpermuted_lesspreds[,3] > roc_outputs_permed_lesspreds[,3])[1] / length( roc_outputs_permed_lesspreds[,3])
legend("topleft", legend=c(paste("Full model AUC =", round(t1f365_unpermuted_lesspreds$AUC,3)),paste("Reduced model AUC =", round(t1r365_unpermuted_lesspreds$AUC,3)),paste("P-value =", pv)),col=c("blue","red"),pch=c(18,18,NA))
dev.off()
#No Sheehan
table(roc_outputs_unpermuted_lesspreds[,1] > roc_outputs_lesspreds[,1])
table(roc_outputs_unpermuted_lesspreds[,2] > roc_outputs_lesspreds[,2])
table(roc_outputs_unpermuted_lesspreds[,3] > roc_outputs_lesspreds[,3])
pdf("roc_unpermuted_lesspreds_730days_v1_aug1_2016.pdf")
plot(t1f730$FP,t1f730$TP,type="l",lwd=2,cex.axis=1.45,main="",col="blue")
lines(t1r730$FP,t1r730$TP,type="l",lwd=2,cex.axis=1.45,main="",col="red")
abline(0,1,col="black",lty=2)
dev.off()
plot(t1f365$FP,t1f365$TP,type="l",lwd=2,cex.axis=1.45,main="",col="blue")
lines(t1r365$FP,t1r365$TP,type="l",lwd=2,cex.axis=1.45,main="",col="red")
abline(0,1,col="black",lty=2)
#Default AUC values without the
par(mfrow=c(2,2))
hist(roc_outputs[,1],freq=FALSE,xlim=c(0.5,.8),main="Full model AUCs")
abline(v= 0.75240930,col="red",lwd=2)
hist(roc_outputs[,2],freq=FALSE,xlim=c(0.5,.8),main="Reduced model AUCs")
abline(v= 0.67311803,col="red",lwd=2)
hist(roc_outputs[,3],freq=FALSE,xlim=c(-.2,.2),main="Full - Reduced AUCs")
abline(v= 0.07929127,col="red",lwd=2)
dev.off()
#730 days
#No Sheehan
table(roc_outputs_unpermuted_lesspreds[,4] > roc_outputs_permed_lesspreds[,4])
table(roc_outputs_unpermuted_lesspreds[,5] > roc_outputs_permed_lesspreds[,5])
table(roc_outputs_unpermuted_lesspreds[,6] > roc_outputs_permed_lesspreds[,6])
#With Sheehan
table(roc_outputs_unpermuted_allpreds[,4] > roc_outputs_permed_allpreds[,4])
table(roc_outputs_unpermuted_allpreds[,5] > roc_outputs_permed_allpreds[,5])
table(roc_outputs_unpermuted_allpreds[,6] > roc_outputs_permed_allpreds[,6])
#365 days
#No Sheehan
table(roc_outputs_unpermuted_lesspreds[,1] > roc_outputs_permed_lesspreds[,1])
table(roc_outputs_unpermuted_lesspreds[,2] > roc_outputs_permed_lesspreds[,2])
table(roc_outputs_unpermuted_lesspreds[,3] > roc_outputs_permed_lesspreds[,3])
#With Sheehan
table(roc_outputs_unpermuted_allpreds[,1] > roc_outputs_permed_allpreds[,1])
table(roc_outputs_unpermuted_allpreds[,2] > roc_outputs_permed_allpreds[,2])
table(roc_outputs_unpermuted_allpreds[,3] > roc_outputs_permed_allpreds[,3])
#Get correlation of measures
library(psych)
corrs <- corr.test(dat_sub[,c("anxietybaseline_continuous_i",
"chronicaffdis_bin_i","migraines2_i", "suicidethobehav_collapsed_i",
"mixedepisodes_collapsed_i","comorbidpersonality2_i" , "sheehantotal_i",
"negative_leq_i")],method="spearman")
以上是关于r 如何使用ROC分析比较复杂的复发风险模型。该分析使用置换方法的实现的主要内容,如果未能解决你的问题,请参考以下文章
R语言使用survival包的coxph函数构建cox回归模型使用ggrisk包的ggrisk函数可视化Cox回归的风险评分图(风险得分图)使用风险得分的roc曲线的约登值(基于LIRI基因数据集
R语言生存分析详解:KM曲线COX比例风险模型HR值解读模型比较残差分析是否比例风险验证:基于survival包lung数据集