r 用小鼠多次插补

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了r 用小鼠多次插补相关的知识,希望对你有一定的参考价值。

library(mice)

#Continuous and categorical variables should be defined prior to MI. This will prevent nonsensical imputations from happening - e.g. cohorts are 1,2,3,4, you impute, then suddenly someone is in cohort 1.5

#Note feb 5: Put other surrogate mobility variables in noting imputation

#I prefer to make lists of all continuous and categorical variables going in

contvars <- scan(what="character",sep="/n")
AGE
BMI
CREA
CRP
DIAS
GLUC
HDL
INSU
LDL
SYST
TCHO
TEXPWK
TRIG

 #Variables to convert to factors
catvars <- scan(what="character",sep="/n")
ALCOHOL
antidiabetic
antihyperlipidemic
antihypertensive
CABG
cancer_baseline
DIABTRT
DRESS
DRESS_BASELINE
EAT_BASELINE
EDUC
ETHNIC
FEEDSELF
GENHEL
HICHOLRP
HORMSTAT
INOUTBED
INOUTBED_BASELINE
MARITAL
MI
PAD
SHOWER
SHOWER_BASELINE
SMOKING
STAIR
STAIR_BASELINE
statin
STROKE
SURVAGE90
WALK1BLK
WALK1BLK_BASELINE
WALKAID

#Then I subset the data to just these variables
 keepvarlist <- c(catvars,contvars)
 dat_for_imp <- subset(dat, select=keepvarlist )
 
#And if they aren't already coded as factors (e.g. coded as strings), I make the factor variables into factors
 dat_for_imp[,catvars] <-  lapply(dat_for_imp[,c(catvars)] , factor)

#View missing data patterns
 md_dat <- md.pattern(dat_for_imp)
 
pdf('missing_data.plot.pdf',14,7)
aggr_plot <- aggr(dat_for_imp, col=c('navyblue','red','orange'), only.miss=TRUE,
 numbers=TRUE,combined=T,labels=names(dat_for_imp), cex.axis=.7, 
 ylab=c("Histogram of missing data","Pattern"),sortVars=TRUE,)
dev.off()

#Set your seed! 
dat_for_imp_ids <- dat$ID #I'm making a vector of subject IDs to keep on hand, since I'm not keeping them in the data that goes into the imputation model
#Alternatively, you could include your subject ID variable but tell the function to not use it to try and impute stuff!

#Do imputations. Here I'm running 100 imputations using default methods. 
#The default method of imputation will depend upon the variable - e.g. binary variables get imputed using logistic regression. 
#You can tell it which method to use if you have the patience to do so.

dat_imp <- mice(dat_for_imp, m=100,seed=17)
#Save the imputed data as an R object, this takes a lot of time!
save(dat_for_imp_ids,dat_imp,file="alldat_imputations_v6_april7_2017.Rdata")

#Load imputed data
load('alldat_imputations_v6_april7_2017.Rdata')


#This is to demonstrate how variables can be coded in the imputed data itself - e.g. if you imputed blood pressure and then wanted to categorize people into hypertensive or not

#This 'unpacks' the imputed data into a big long dataframe
 long1 <- complete(dat_imp, action='long', include=TRUE)
 #Surv to age 90 w/ wo mobility code
 long1$disabled_recentA <- NA
 not_disabled_recentA <- which(long1$STAIR  == 3  &  long1$WALK1BLK  == 3  )
 disabled_recentA     <- which(long1$STAIR != 3  |  long1$WALK1BLK != 3 ) #ne operation is OK because all data is filled in 
 long1[not_disabled_recentA,]$disabled_recentA <- 0 
 long1[disabled_recentA,]$disabled_recentA <- 1 
 long1$survdis1A <- NA
 long1[which(long1$SURVAGE90 == 0),]$survdis1A <- 1
 long1[which(long1$disabled_recentA == 1 & long1$SURVAGE90 == 1),]$survdis1A <- 2
 long1[which(long1$disabled_recentA == 0 & long1$SURVAGE90 == 1),]$survdis1A <- 3
 
 long2 <- long1
 long2$HDL_cl <- as.numeric(scale(log(long2$HDL)))
 long2$LDL_cl <- as.numeric( scale(log(long2$LDL)))
 long2$LDL_q <- as.numeric(quantcut(long2$LDL, q=seq(0,1,by=0.25), na.rm=TRUE))
 long2$HDL_q <- as.numeric(quantcut(long2$HDL, q=seq(0,1,by=0.25), na.rm=TRUE))
 #Convert education and general health to reduce number of levels 
 long2$GENHEL <- as.numeric(long2$GENHEL)
 long2$EDUC <- as.numeric(long2$EDUC)
 long2$SMOKING <- as.numeric(long2$SMOKING)
 long2$ALCOHOL <- as.numeric(long2$ALCOHOL)

 #Change ethnic v ariable
 long2$ethcat <- long2$ETHNIC
 long2[which(long2$ETHNIC %in% c(1,2,8)),]$ethcat <- 8
 long2$ethcat <- as.factor(long2$ethcat)

 #This repacks the data, after you've done your recoding stuff
 midsobj2 <- as.mids(long2)
 
 
 #Regression models sample code - this is going to run the regression model for every copy of the imputed data
 m1 <- with(midsobj2,glm(SURVAGE90 ~  as.factor(HDL_q) + as.factor(LDL_q)  + AGE +  as.factor (ETHNIC) ,family='binomial'))  #Unadjusted
 
 #estimates can be pooled
 m1pooled <- pool(m1)'
 #And summarized
 summary(m1pooled) 

  #You can also compare nested models 
  m1.a <- with(midsobj2,glm(SURVAGE90 ~    as.factor (ETHNIC)  + AGE,family='binomial'))  #Unadjusted
  m1.c <- with(midsobj2,glm(SURVAGE90 ~    as.factor (ETHNIC)  + AGE+ as.factor(HDL_q) ,family='binomial'))  #Unadjusted

  pool.compare(m1.b,m1.a)$pvalue
 

以上是关于r 用小鼠多次插补的主要内容,如果未能解决你的问题,请参考以下文章

在进行小鼠插补并保存结果后,如何使缺失值保持不变?

R语言用多重插补法估算相对风险

拓端tecdat|R语言编程指导用线性模型进行臭氧预测: 加权泊松回归,普通最小二乘,加权负二项式模型,多重插补缺失值

R语言高级方法进行缺失数据多重插补案例演示

R中缺失和删失数据的多重插补

FOJ1205 小鼠迷宫问题 (BFD+递推)