r 用小鼠多次插补
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了r 用小鼠多次插补相关的知识,希望对你有一定的参考价值。
library(mice)
#Continuous and categorical variables should be defined prior to MI. This will prevent nonsensical imputations from happening - e.g. cohorts are 1,2,3,4, you impute, then suddenly someone is in cohort 1.5
#Note feb 5: Put other surrogate mobility variables in noting imputation
#I prefer to make lists of all continuous and categorical variables going in
contvars <- scan(what="character",sep="/n")
AGE
BMI
CREA
CRP
DIAS
GLUC
HDL
INSU
LDL
SYST
TCHO
TEXPWK
TRIG
#Variables to convert to factors
catvars <- scan(what="character",sep="/n")
ALCOHOL
antidiabetic
antihyperlipidemic
antihypertensive
CABG
cancer_baseline
DIABTRT
DRESS
DRESS_BASELINE
EAT_BASELINE
EDUC
ETHNIC
FEEDSELF
GENHEL
HICHOLRP
HORMSTAT
INOUTBED
INOUTBED_BASELINE
MARITAL
MI
PAD
SHOWER
SHOWER_BASELINE
SMOKING
STAIR
STAIR_BASELINE
statin
STROKE
SURVAGE90
WALK1BLK
WALK1BLK_BASELINE
WALKAID
#Then I subset the data to just these variables
keepvarlist <- c(catvars,contvars)
dat_for_imp <- subset(dat, select=keepvarlist )
#And if they aren't already coded as factors (e.g. coded as strings), I make the factor variables into factors
dat_for_imp[,catvars] <- lapply(dat_for_imp[,c(catvars)] , factor)
#View missing data patterns
md_dat <- md.pattern(dat_for_imp)
pdf('missing_data.plot.pdf',14,7)
aggr_plot <- aggr(dat_for_imp, col=c('navyblue','red','orange'), only.miss=TRUE,
numbers=TRUE,combined=T,labels=names(dat_for_imp), cex.axis=.7,
ylab=c("Histogram of missing data","Pattern"),sortVars=TRUE,)
dev.off()
#Set your seed!
dat_for_imp_ids <- dat$ID #I'm making a vector of subject IDs to keep on hand, since I'm not keeping them in the data that goes into the imputation model
#Alternatively, you could include your subject ID variable but tell the function to not use it to try and impute stuff!
#Do imputations. Here I'm running 100 imputations using default methods.
#The default method of imputation will depend upon the variable - e.g. binary variables get imputed using logistic regression.
#You can tell it which method to use if you have the patience to do so.
dat_imp <- mice(dat_for_imp, m=100,seed=17)
#Save the imputed data as an R object, this takes a lot of time!
save(dat_for_imp_ids,dat_imp,file="alldat_imputations_v6_april7_2017.Rdata")
#Load imputed data
load('alldat_imputations_v6_april7_2017.Rdata')
#This is to demonstrate how variables can be coded in the imputed data itself - e.g. if you imputed blood pressure and then wanted to categorize people into hypertensive or not
#This 'unpacks' the imputed data into a big long dataframe
long1 <- complete(dat_imp, action='long', include=TRUE)
#Surv to age 90 w/ wo mobility code
long1$disabled_recentA <- NA
not_disabled_recentA <- which(long1$STAIR == 3 & long1$WALK1BLK == 3 )
disabled_recentA <- which(long1$STAIR != 3 | long1$WALK1BLK != 3 ) #ne operation is OK because all data is filled in
long1[not_disabled_recentA,]$disabled_recentA <- 0
long1[disabled_recentA,]$disabled_recentA <- 1
long1$survdis1A <- NA
long1[which(long1$SURVAGE90 == 0),]$survdis1A <- 1
long1[which(long1$disabled_recentA == 1 & long1$SURVAGE90 == 1),]$survdis1A <- 2
long1[which(long1$disabled_recentA == 0 & long1$SURVAGE90 == 1),]$survdis1A <- 3
long2 <- long1
long2$HDL_cl <- as.numeric(scale(log(long2$HDL)))
long2$LDL_cl <- as.numeric( scale(log(long2$LDL)))
long2$LDL_q <- as.numeric(quantcut(long2$LDL, q=seq(0,1,by=0.25), na.rm=TRUE))
long2$HDL_q <- as.numeric(quantcut(long2$HDL, q=seq(0,1,by=0.25), na.rm=TRUE))
#Convert education and general health to reduce number of levels
long2$GENHEL <- as.numeric(long2$GENHEL)
long2$EDUC <- as.numeric(long2$EDUC)
long2$SMOKING <- as.numeric(long2$SMOKING)
long2$ALCOHOL <- as.numeric(long2$ALCOHOL)
#Change ethnic v ariable
long2$ethcat <- long2$ETHNIC
long2[which(long2$ETHNIC %in% c(1,2,8)),]$ethcat <- 8
long2$ethcat <- as.factor(long2$ethcat)
#This repacks the data, after you've done your recoding stuff
midsobj2 <- as.mids(long2)
#Regression models sample code - this is going to run the regression model for every copy of the imputed data
m1 <- with(midsobj2,glm(SURVAGE90 ~ as.factor(HDL_q) + as.factor(LDL_q) + AGE + as.factor (ETHNIC) ,family='binomial')) #Unadjusted
#estimates can be pooled
m1pooled <- pool(m1)'
#And summarized
summary(m1pooled)
#You can also compare nested models
m1.a <- with(midsobj2,glm(SURVAGE90 ~ as.factor (ETHNIC) + AGE,family='binomial')) #Unadjusted
m1.c <- with(midsobj2,glm(SURVAGE90 ~ as.factor (ETHNIC) + AGE+ as.factor(HDL_q) ,family='binomial')) #Unadjusted
pool.compare(m1.b,m1.a)$pvalue
以上是关于r 用小鼠多次插补的主要内容,如果未能解决你的问题,请参考以下文章