r 如何使用ROC分析比较复杂的复发风险模型。该分析使用置换方法的实现

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了r 如何使用ROC分析比较复杂的复发风险模型。该分析使用置换方法的实现相关的知识,希望对你有一定的参考价值。

################################### 
### Load R libraries

#Contains survival analysis function
 library(survival)
#Functions to build ROC curves from Cox models
 library(risksetROC)


################################### 
### Data loading

#Function to quickly read .csv files
rcsv <- function(...)
	{
		read.csv(..., header=T, stringsAsFactors=FALSE,na.strings=c("NA", "#N/A", "#NULL!"))
	}

#Read the .csv files, of subject data and survival times
 covs <- rcsv('miestimation/pgbd_mi_data_imputed-stata12.csv') #All predictor variables. Variables with imputations have 35 entries each.
 surv <- rcsv('pgbdsurvivalroc.csv')
 
#Merge data based on ID. 
#Note: all.y makes sure that all subjects in the surv dataframe are included here, whether or not they have data in the covs datasheet
#Note: I set it so that if two variables have the same name, the variables from the covs sheet will have the suffix _covs 
 dat <- merge(covs,surv,by="subject_id",all.y=TRUE,suffixes=c("_covs",""))


################################### 
### Data handling (Make it usable for the analysis)

##Per talking with Peter, we just want to take an average of the multiply imputed variables to use as our data

#This is a list of all imputed variables. 
#The scan function takes user input and saves it as a vector
#This is the lazy way of doing it because you can just copy/paste lists from .csv files
imputed_vars_cat <- scan(what="character",sep=",")
childhoodphysabuse_binary,chronicaffdis_bin,comorbidpersonality2,functmorbid_bin,migraines2,mixedepisodes_collapsed,suicidethobehav_collapsed

imputed_vars_cont <- scan(what="character",sep=",")
anxietybaseline_continuous,negative_leq,sheehanimpairment,sheehantotal,suicidethobehav_collapsed


#This loop will go through the list of imputed variables
#and for each subject, take the median value
#iv is an incrementor variable

for (iv in imputed_vars_cont)
{
 #For an imputed variable, find the column number of all imputations
 #This is done by grepping for the variable in the column names of the data
 #note: I prepend a _ before iv, so the original, unimputed data does not come up in the grep result

 var_location <- grep(paste("_",iv,sep=""),names(dat)) 
 ivdat <- dat[,var_location]

 #apply: take the input data (ivdat), and performs a function (in this case median) on every row (as indicated by the 1 value)
 #Will return a vector of length
  imputed_collapsed <- apply(ivdat,1,median) 
 #add the vector of 
  dat <- cbind(dat,imputed_collapsed)
  names(dat)[dim(dat)[2]] <- paste(iv,"i",sep="_")
}

Mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

for (iv in imputed_vars_cat)
{
 #For an imputed variable, find the column number of all imputations
 #This is done by grepping for the variable in the column names of the data
 #note: I prepend a _ before iv, so the original, unimputed data does not come up in the grep result

 var_location <- grep(paste("_",iv,sep=""),names(dat)) 
 ivdat <- dat[,var_location]

 #apply: take the input data (ivdat), and performs a function (in this case median) on every row (as indicated by the 1 value)
 #Will return a vector of length
  imputed_collapsed <- apply(ivdat,1,Mode) 
 #add the vector of 
  dat <- cbind(dat,imputed_collapsed)
  names(dat)[dim(dat)[2]] <- paste(iv,"i",sep="_")
}



#subset data to relevant covariates only

dat_sub <- subset(dat, select=c(time,event,age,sex,racecat,li_status_collapsed,anxietybaseline_continuous_i,
					chronicaffdis_bin_i,migraines2_i, suicidethobehav_collapsed_i, 
					mixedepisodes_collapsed_i,comorbidpersonality2_i , sheehantotal_i, 
					negative_leq_i))


################################### 
### Permutation and analysis code 

#Note: As of Feb 6, 2017, the model reported in the paper is the 730 day, full model
### turn off resampling the phenotype to get the baseline analysis, which should be  0.75240930 0.67311803 0.07929127 if the seed is set to 17 


do_not_permute = 0 #Set to 1 if you want to turn off permutations and run the default analysis 
full_model = 1 # Set to 1 to include the sheehan and others in the full model (limited set of clinical predictors or a really expanded set of clinical predictors)
null_baseline = 1 # Set to 1 if you want to compare to a model with aboslutely no covariates 

#Set number of permutations
nperm=10000

if (do_not_permute == 1)
{
 nperm= 1
}

#Makes a matrix that contains number of repetitions dimension, ROC for full, reduced, and difference models, for 365 and 730 days
 roc_outputs <- data.frame(matrix(data=NA,nrow=nperm,ncol=6))
 names(roc_outputs) <- c("AUC_f_365","AUC_r_365","AUC_d_365","AUC_f_730","AUC_r_730","AUC_d_730")

#Print a status update for the first or every 1000th permutation
for (permnum in 1:nperm)
{

 if(permnum %% 1000 == 0 | permnum == 1)
 {
  print(paste("permutation run", permnum))
 }

##Permute the phenotype

#Set the seed for random sampling
 set.seed(permnum)

#There are N = dim(dat_sub)[1] rows in the data. Sampling numbers 1:N without 
#replacement N times will give us a randomly ordered list of row numbers
 shuffled <- sample(1:dim(dat_sub)[1],dim(dat_sub)[1],replace=F)

#Make a permuted dataframe by loading the data in the shuffled order for the 
#phenotype column (columns 1 and 2) and in the typical order for all other covariates

 dat_sub_shuffle <- data.frame(dat_sub[shuffled,c(1:2)],dat_sub[,-c(1:2)]) #For use in permutation analysis
 ##Open issue: does it matter or not if I don't reset the row names?


#If you said to not permute, this uses the original data frame instead of the shuffled one
 if (do_not_permute == 1)
 {
  dat_sub_shuffle <- dat_sub 
 }


#Randomly reorder the data, so that training/test sets contain different observations for each permutation

 set.seed(permnum) #Note: set to 17 for the un-permuted data to recreate original results set.seed=17

 if (do_not_permute == 1)
 {
  set.seed(17)
 }
 new_order <- sample(x=1:dim(dat_sub)[1],    size=dim(dat_sub)[1],   replace=F) #Defines the new ordering
 dat_r <- dat_sub_shuffle[new_order,]  #Accomplishes the reordering
 
#Determine which subjects have a survival event, and which do not
#the which function returns the row number of these subjects
 cases     <- which(dat_r$event == 1)
 controls  <- which(dat_r$event == 0)

#Make separate dataframes for cases and controls
#I do this because it's an easy way to insure that 
#the training/test sets contain the correct proportion of cases
 case_set <- dat_r[cases,]
 control_set <- dat_r[controls,]
 
#Repeat the sequence 1...10
 numberseq <- rep(1:10,dim(dat_r)[1]) 

#Cut the sequence down to the dimension of the datasets
#Now every subject will have been assigned a number from 1 to 10
#We will filter on this number to build training/test sets

 control_numbers <- numberseq[1:length(controls)] #Give each control subject a set number
 case_numbers <- numberseq[1:length(cases)] #Give each case subject a set number

###Perform cox regression on each subset of data

#Make a matrix to hold the predicted AUCs
#Make it into a data frame so variables can be named easily

 pred_base_lps <- data.frame(matrix(data=NA,nrow=dim(dat_r)[1],ncol=2))
 names(pred_base_lps) <- c("red","full")
 row.names(pred_base_lps) <- paste("REM",1:dim(pred_base_lps)[1],sep="") # Have to remove the existing row names because we're going to merge on row names, which have been shuffled.

 #Formula for basic model
 if (null_baseline == 0)
 {
  red_model_formula <- formula(Surv(time,event) ~  age + sex + as.factor(racecat) + as.factor(li_status_collapsed))
 }

 if (null_baseline == 1)
 {
  red_model_formula <- formula(Surv(time,event) ~  1)
 }

 #Formula for model with clinical predictors
 #If choosing to include sheehan, negative leq, and chronicity
 if (full_model ==1)
 {
   model_formula <- formula(Surv(time,event) ~ age + sex + as.factor(racecat) + as.factor(li_status_collapsed) + anxietybaseline_continuous_i + 
							 migraines2_i + as.factor(suicidethobehav_collapsed_i) + mixedepisodes_collapsed_i +  
							comorbidpersonality2_i + sheehantotal_i + negative_leq_i + chronicaffdis_bin_i)
 }
 #If choosing to NOT include sheehan, negative leq, and chronicity
 if (full_model ==0)
 {
   model_formula <- formula(Surv(time,event) ~ age + sex + as.factor(racecat) + as.factor(li_status_collapsed) + anxietybaseline_continuous_i + 
							 migraines2_i + as.factor(suicidethobehav_collapsed_i) + mixedepisodes_collapsed_i +  
							comorbidpersonality2_i )
 }

 incstart <- 1 # Need to increment this matrix of AUCs based on the dimension of the test sets, which are not uniformly sized
 
 for (i in 1:10)
 {
    train_set <- rbind(case_set[case_numbers != i,], control_set[control_numbers != i,]) #Get all cases and controls who aren't in the ith set. Combine the set of cases and controls. Regression will be done in these subjects
    test_set  <- rbind(case_set[case_numbers == i,], control_set[control_numbers == i,]) #Get the ith set to make the testing set
    coxmod_base   <- coxph(red_model_formula,data=train_set) #Fit the reduced cox model to the training set
    coxmod_full   <- coxph(model_formula, data= train_set) #Fit the full cox model to the training set 
    pred_base <- predict(coxmod_base,test_set,"lp") #obtain  linear predictors for reduced model on the test set
    pred_full <- predict(coxmod_full,test_set,"lp") #obtain  linear predictors for full model on the test set
    inc <- length(pred_base) 
    pred_base_lps[incstart:(incstart+inc-1),]$red <- pred_base
    pred_base_lps[incstart:(incstart+inc-1),]$full <- pred_full 
    row.names(pred_base_lps)[incstart:(incstart+inc-1)] <- names(pred_full) #Set teh row names to match the existing data row names
    incstart <- incstart + inc  #Increment the counter   
  }

#Based on the row names, merge back with the original data
 dmerge <- merge(pred_base_lps,dat_r,by="row.names")

#Now take the ROC from both models
#Do for each cutoff date 

 t1r365 <- CoxWeights(marker=dmerge$red,Stime = dmerge$time, status=dmerge$event, predict.time=365)
 t1f365 <- CoxWeights(marker=dmerge$full,Stime = dmerge$time, status=dmerge$event, predict.time=365)
 rocd365 <- t1f365$AUC-t1r365$AUC

 t1r730 <- CoxWeights(marker=dmerge$red,Stime = dmerge$time, status=dmerge$event, predict.time=730)
 t1f730 <- CoxWeights(marker=dmerge$full,Stime = dmerge$time, status=dmerge$event, predict.time=730)
 rocd730 <- t1f730$AUC-t1r730$AUC


 roc_outputs[permnum,] <- c(t1f365$AUC, t1r365$AUC, rocd365,t1f730$AUC, t1r730$AUC, rocd730)
 
}

##Handle the output depending on parameters


if (do_not_permute == 1 & full_model == 1 & null_baseline == 1)
{

 t1r730_unpermuted_allpreds <-  t1r730
 t1f730_unpermuted_allpreds <-  t1f730

 t1r365_unpermuted_allpreds <-  t1r365
 t1f365_unpermuted_allpreds <-  t1f365

 roc_outputs_unpermuted_allpreds <- roc_outputs
 save(t1r730_unpermuted_allpreds,t1f730_unpermuted_allpreds,t1r365_unpermuted_allpreds,t1f365_unpermuted_allpreds,roc_outputs_unpermuted_allpreds, file="roc_outputs_unpermuted_allpreds_nullbaseline_v1_feb6_2017.R")

}
table(0.7489 > roc_outputs_unpermuted_allpreds[,6])


if (do_not_permute == 0 & full_model == 1 & null_baseline == 1)
{

 t1r730_unpermuted_allpreds <-  t1r730
 t1f730_unpermuted_allpreds <-  t1f730

 t1r365_unpermuted_allpreds <-  t1r365
 t1f365_unpermuted_allpreds <-  t1f365

 roc_outputs_unpermuted_allpreds <- roc_outputs
 save(t1r730_unpermuted_allpreds,t1f730_unpermuted_allpreds,t1r365_unpermuted_allpreds,t1f365_unpermuted_allpreds,roc_outputs_unpermuted_allpreds, file="roc_outputs_permuted_allpreds_nullbaseline_v1_feb6_2017.R")

}



if (do_not_permute == 1 & full_model == 1 & null_baseline == 0)
{

 t1r730_unpermuted_allpreds <-  t1r730
 t1f730_unpermuted_allpreds <-  t1f730

 t1r365_unpermuted_allpreds <-  t1r365
 t1f365_unpermuted_allpreds <-  t1f365

 roc_outputs_unpermuted_allpreds <- roc_outputs
 save(t1r730_unpermuted_allpreds,t1f730_unpermuted_allpreds,t1r365_unpermuted_allpreds,t1f365_unpermuted_allpreds,roc_outputs_unpermuted_allpreds, file="roc_outputs_unpermuted_allpreds_v1_aug10_2016.R")

}

if (do_not_permute == 1 & full_model == 1 & null_baseline == 1)
{

 t1r730_unpermuted_allpreds <-  t1r730
 t1f730_unpermuted_allpreds <-  t1f730

 t1r365_unpermuted_allpreds <-  t1r365
 t1f365_unpermuted_allpreds <-  t1f365

 roc_outputs_unpermuted_allpreds <- roc_outputs
 save(t1r730_unpermuted_allpreds,t1f730_unpermuted_allpreds,t1r365_unpermuted_allpreds,t1f365_unpermuted_allpreds,roc_outputs_unpermuted_allpreds, file="roc_outputs_unpermuted_allpreds_nullbaseline_v1_feb6_2017.R")

}



if (do_not_permute == 1 & full_model == 0 & null_baseline == 0)
{

 t1r730_unpermuted_lesspreds <-  t1r730
 t1f730_unpermuted_lesspreds <-  t1f730

 t1r365_unpermuted_lesspreds <-  t1r365
 t1f365_unpermuted_lesspreds <-  t1f365

 roc_outputs_unpermuted_lesspreds <- roc_outputs
 save(t1r730_unpermuted_lesspreds,t1f730_unpermuted_lesspreds,t1r365_unpermuted_lesspreds,t1f365_unpermuted_lesspreds,roc_outputs_unpermuted_lesspreds, file="roc_outputs_unpermuted_lesspreds_v1_aug10_2016.R")

}

if (do_not_permute == 0 & full_model == 1 & null_baseline == 0)
{

 t1r730_permed_allpreds <-  t1r730
 t1f730_permed_allpreds <-  t1f730

 t1r365_permed_allpreds <-  t1r365
 t1f365_permed_allpreds <-  t1f365

 roc_outputs_permed_allpreds <- roc_outputs
 save(t1r730_permed_allpreds,t1f730_permed_allpreds,t1r365_permed_allpreds,t1f365_permed_allpreds,roc_outputs_permed_allpreds, file="roc_outputs_permed_allpreds_v1_aug10_2016.R")

}

if (do_not_permute == 0 & full_model == 0 & null_baseline == 0)
{

 t1r730_permed_lesspreds <-  t1r730
 t1f730_permed_lesspreds <-  t1f730

 t1r365_permed_lesspreds <-  t1r365
 t1f365_permed_lesspreds <-  t1f365

 roc_outputs_permed_lesspreds <- roc_outputs
 save(t1r730_permed_lesspreds,t1f730_permed_lesspreds,t1r365_permed_lesspreds,t1f365_permed_lesspreds,roc_outputs_permed_lesspreds, file="roc_outputs_permed_lesspreds_v1_aug10_2016.R")

}

load('roc_outputs_permed_allpreds_v1_aug10_2016.R')
load('roc_outputs_unpermuted_allpreds_v1_aug10_2016.R')
load('roc_outputs_permed_lesspreds_v1_aug10_2016.R')
load('roc_outputs_unpermuted_lesspreds_v1_aug10_2016.R')



pdf("PGBD_ROC_allpredictors_730days_v1_aug10_2016.pdf")
 plot(t1f730_unpermuted_allpreds$FP,t1f730_unpermuted_allpreds$TP,type="l",lwd=2,cex.axis=1.45,main="",col="blue",xlab="False Positive Rate (1 - Specificity)",ylab="True Positive Rate (Sensitivity)")
 lines(t1r730_unpermuted_allpreds$FP,t1r730_unpermuted_allpreds$TP,type="l",lwd=2,cex.axis=1.45,main="",col="red")
 abline(0,1,col="black",lty=2)
 pv <- table(roc_outputs_unpermuted_allpreds[,6] > roc_outputs_permed_allpreds[,6])[1] / length( roc_outputs_permed_allpreds[,6])

 legend("topleft", legend=c(paste("Full model AUC =", round(t1f730_unpermuted_allpreds$AUC,3)),paste("Reduced model AUC =", round(t1r730_unpermuted_allpreds$AUC,3)),paste("P-value =", pv)),col=c("blue","red"),pch=c(18,18,NA))
dev.off()

pdf("PGBD_ROC_allpredictors_365days_v1_aug10_2016.pdf")
 plot(t1f365_unpermuted_allpreds$FP,t1f365_unpermuted_allpreds$TP,type="l",lwd=2,cex.axis=1.45,main="",col="blue",xlab="False Positive Rate (1 - Specificity)",ylab="True Positive Rate (Sensitivity)")
 lines(t1r365_unpermuted_allpreds$FP,t1r365_unpermuted_allpreds$TP,type="l",lwd=2,cex.axis=1.45,main="",col="red")
 abline(0,1,col="black",lty=2)
 pv <- table(roc_outputs_unpermuted_allpreds[,3] > roc_outputs_permed_allpreds[,3])[1] / length( roc_outputs_permed_allpreds[,3])

 legend("topleft", legend=c(paste("Full model AUC =", round(t1f365_unpermuted_allpreds$AUC,3)),paste("Reduced model AUC =", round(t1r365_unpermuted_allpreds$AUC,3)),paste("P-value =", pv)),col=c("blue","red"),pch=c(18,18,NA))
dev.off()

pdf("PGBD_ROC_lesspredictors_730days_v1_aug10_2016.pdf")
 plot(t1f730_unpermuted_lesspreds$FP,t1f730_unpermuted_lesspreds$TP,type="l",lwd=2,cex.axis=1.45,main="",col="blue",xlab="False Positive Rate (1 - Specificity)",ylab="True Positive Rate (Sensitivity)")
 lines(t1r730_unpermuted_lesspreds$FP,t1r730_unpermuted_lesspreds$TP,type="l",lwd=2,cex.axis=1.45,main="",col="red")
 abline(0,1,col="black",lty=2)
 pv <- table(roc_outputs_unpermuted_lesspreds[,6] > roc_outputs_permed_lesspreds[,6])[1] / length( roc_outputs_permed_lesspreds[,6])

 legend("topleft", legend=c(paste("Full model AUC =", round(t1f730_unpermuted_lesspreds$AUC,3)),paste("Reduced model AUC =", round(t1r730_unpermuted_lesspreds$AUC,3)),paste("P-value =", pv)),col=c("blue","red"),pch=c(18,18,NA))
dev.off()

pdf("PGBD_ROC_lesspredictors_365days_v1_aug10_2016.pdf")
 plot(t1f365_unpermuted_lesspreds$FP,t1f365_unpermuted_lesspreds$TP,type="l",lwd=2,cex.axis=1.45,main="",col="blue",xlab="False Positive Rate (1 - Specificity)",ylab="True Positive Rate (Sensitivity)")
 lines(t1r365_unpermuted_lesspreds$FP,t1r365_unpermuted_lesspreds$TP,type="l",lwd=2,cex.axis=1.45,main="",col="red")
 abline(0,1,col="black",lty=2)
 pv <- table(roc_outputs_unpermuted_lesspreds[,3] > roc_outputs_permed_lesspreds[,3])[1] / length( roc_outputs_permed_lesspreds[,3])

 legend("topleft", legend=c(paste("Full model AUC =", round(t1f365_unpermuted_lesspreds$AUC,3)),paste("Reduced model AUC =", round(t1r365_unpermuted_lesspreds$AUC,3)),paste("P-value =", pv)),col=c("blue","red"),pch=c(18,18,NA))
dev.off()






#No Sheehan  
table(roc_outputs_unpermuted_lesspreds[,1] > roc_outputs_lesspreds[,1])
table(roc_outputs_unpermuted_lesspreds[,2] > roc_outputs_lesspreds[,2])
table(roc_outputs_unpermuted_lesspreds[,3] > roc_outputs_lesspreds[,3])






pdf("roc_unpermuted_lesspreds_730days_v1_aug1_2016.pdf")
plot(t1f730$FP,t1f730$TP,type="l",lwd=2,cex.axis=1.45,main="",col="blue")
lines(t1r730$FP,t1r730$TP,type="l",lwd=2,cex.axis=1.45,main="",col="red")
abline(0,1,col="black",lty=2)
dev.off()


plot(t1f365$FP,t1f365$TP,type="l",lwd=2,cex.axis=1.45,main="",col="blue")
lines(t1r365$FP,t1r365$TP,type="l",lwd=2,cex.axis=1.45,main="",col="red")
abline(0,1,col="black",lty=2)




#Default AUC values without the 
par(mfrow=c(2,2))
hist(roc_outputs[,1],freq=FALSE,xlim=c(0.5,.8),main="Full model AUCs")
abline(v= 0.75240930,col="red",lwd=2)
hist(roc_outputs[,2],freq=FALSE,xlim=c(0.5,.8),main="Reduced model AUCs")
abline(v= 0.67311803,col="red",lwd=2)
hist(roc_outputs[,3],freq=FALSE,xlim=c(-.2,.2),main="Full - Reduced  AUCs")
abline(v= 0.07929127,col="red",lwd=2)
dev.off()

#730 days
#No Sheehan  
table(roc_outputs_unpermuted_lesspreds[,4] > roc_outputs_permed_lesspreds[,4])
table(roc_outputs_unpermuted_lesspreds[,5] > roc_outputs_permed_lesspreds[,5])
table(roc_outputs_unpermuted_lesspreds[,6] > roc_outputs_permed_lesspreds[,6])

#With Sheehan  
table(roc_outputs_unpermuted_allpreds[,4] > roc_outputs_permed_allpreds[,4])
table(roc_outputs_unpermuted_allpreds[,5] > roc_outputs_permed_allpreds[,5])
table(roc_outputs_unpermuted_allpreds[,6] > roc_outputs_permed_allpreds[,6])

#365 days
#No Sheehan  
table(roc_outputs_unpermuted_lesspreds[,1] > roc_outputs_permed_lesspreds[,1])
table(roc_outputs_unpermuted_lesspreds[,2] > roc_outputs_permed_lesspreds[,2])
table(roc_outputs_unpermuted_lesspreds[,3] > roc_outputs_permed_lesspreds[,3])

#With Sheehan  
table(roc_outputs_unpermuted_allpreds[,1] > roc_outputs_permed_allpreds[,1])
table(roc_outputs_unpermuted_allpreds[,2] > roc_outputs_permed_allpreds[,2])
table(roc_outputs_unpermuted_allpreds[,3] > roc_outputs_permed_allpreds[,3])





#Get correlation of measures
library(psych)
corrs <- corr.test(dat_sub[,c("anxietybaseline_continuous_i",
					"chronicaffdis_bin_i","migraines2_i", "suicidethobehav_collapsed_i", 
					"mixedepisodes_collapsed_i","comorbidpersonality2_i" , "sheehantotal_i", 
					"negative_leq_i")],method="spearman")


  

以上是关于r 如何使用ROC分析比较复杂的复发风险模型。该分析使用置换方法的实现的主要内容,如果未能解决你的问题,请参考以下文章

R语言使用survival包的coxph函数构建cox回归模型使用ggrisk包的ggrisk函数可视化Cox回归的风险评分图(风险得分图)使用风险得分的roc曲线的约登值(基于LIRI基因数据集

生存分析入门和R分析

R语言生存分析详解:KM曲线COX比例风险模型HR值解读模型比较残差分析是否比例风险验证:基于survival包lung数据集

基于R语言PredictABEL包对Logistic回归模型外部验证

R语言ROC曲线下的面积 - 评估逻辑回归中的歧视

R:聚类——如何预测新病例?