r 加载JCO数据
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了r 加载JCO数据相关的知识,希望对你有一定的参考价值。
load.jcoExprData <- function(jco.datadirectory){
################################################################################
# load vancouvers JCO data given by
# Michael Altenbuchinger in his mail from 21.9.2016 22:12
# on my mail-account gunther.glehr@stud.uni-regensburg.de
# USES:
# data/allvanc/scott_et_al_JCO.zip - Data from JCO
# A_ORIG.GENESIG - original gene signature
#
# Result:
# expr.jco.train - Counts for 309 Samples from vancouver (JCO study)
# expr.jco.test - Counts for 81 Samples from vancouver (JCO study)
#
#
################################################################################
# unzip
unzip(file.path(jco.datadirectory, "scott_et_al_JCO.zip"), exdir = file.path(jco.datadirectory))
# load data
# 1. trainingsdata
# F:\Dropbox\#UNI\Programming\vanc-signature-nanostringV2\AllData_20160810\Daten\JCO\scott_et_al_JCO\training_cohort\JCO_codeset
expr.jco.train <- as.matrix(read.table(file=file.path(jco.datadirectory, "scott_et_al_JCO", "training_cohort", "JCO_codeset"
, "scott_et_al_JCO_training_cohort_JCO_codeset_gene_exprs_mat.tsv")
,header = TRUE
,sep = "\t"
,quote = ""
,row.names = 1))
expr.jco.test <- as.matrix(read.table(file=file.path(jco.datadirectory, "scott_et_al_JCO", "validation_cohort", "JCO_codeset"
, "scott_et_al_JCO_validation_cohort_JCO_codeset_gene_exprs_mat.tsv")
,header = TRUE
,sep = "\t"
,quote = ""
,row.names = 1))
expr.jco.train.HL27 <- as.matrix(read.table(file=file.path(jco.datadirectory, "scott_et_al_JCO_training_cohort_HL27_codeset_gene_exprs_mat.tsv")
,header = TRUE
,sep = "\t"
,quote = ""
,row.names = 1))
mapping <- read.table(file=file.path(jco.datadirectory, "HL27_to_ecog_id.txt")
, header = TRUE
,stringsAsFactors = FALSE)
mapping <- mapping[mapping$sample_id %in% colnames(expr.jco.train.HL27), ]
expr.jco.train.HL27 <- expr.jco.train.HL27[, mapping$sample_id]
colnames(expr.jco.train.HL27) <- paste0("X", mapping$ecog_id)
# Problem: we miss some genes plus 2 Samples are there twice
rn.test <- rownames(expr.jco.test)
rn.train <- rownames(expr.jco.train)
test.in.train <- rn.test %in% rn.train
train.in.test <- rn.train %in% rn.test
# cat("Testgenes not in Traingenes:\n")
# cat(rn.test[!test.in.train], "\n")
# cat("Traingenes not in Testgenes:\n")
# cat(rn.train[!train.in.test], "\n")
#
# cat("According to genecards.com:\n")
# cat("MORG1 <-> WDR83\n")
# cat("TRAIL <-> TNFSF10\n")
# cat("The left genes just have a - instead of a _\n")
# 1. replace _ with - (The origsig names have the -
rn.test <- sub("_", "-", rn.test)
rn.test <- sub("TRAIL", "TNFSF10", rn.test)
rn.test <- sub("MORG1", "WDR83", rn.test)
rn.train <- sub("_", "-", rn.train)
test.in.train <- rn.test %in% rn.train
train.in.test <- rn.train %in% rn.test
# cat("Testgenes not in Traingenes:\n")
# cat(rn.test[!test.in.train], "\n")
# cat("Traingenes not in Testgenes:\n")
# cat(rn.train[!train.in.test], "\n")
catt("In expr.jco.test\n")
cat(" ", rownames(expr.jco.test)[!rownames(expr.jco.test) %in% rn.test], "\n ")
catt("are changed into")
cat("\n ", rn.test[!rownames(expr.jco.test) %in% rn.test], "\n\n")
rownames(expr.jco.test) <- rn.test
catt("In expr.jco.train\n")
cat(" ", rownames(expr.jco.train)[!rownames(expr.jco.train) %in% rn.train], "\n ")
catt("are changed into")
cat("\n ", rn.train[!rownames(expr.jco.train) %in% rn.train], "\n ")
rownames(expr.jco.train) <- rn.train
catt("\n")
catt("In expr.jco.train.HL27, all _ are replaced by -\n")
rownames(expr.jco.train.HL27) <- sub("_", "-", rownames(expr.jco.train.HL27))
# sort the names
expr.jco.train <- expr.jco.train[sort(rownames(expr.jco.train)), ]
expr.jco.train <- expr.jco.train[, sort(colnames(expr.jco.train))]
expr.jco.train.HL27 <- expr.jco.train.HL27[sort(rownames(expr.jco.train.HL27)), ]
expr.jco.train.HL27 <- expr.jco.train.HL27[, sort(colnames(expr.jco.train.HL27))]
expr.jco.test <- expr.jco.test[sort(rownames(expr.jco.test)), ]
expr.jco.test <- expr.jco.test[, sort(colnames(expr.jco.test))]
# we have negative values in expr.jco.test, take absolute values
# plus 1 because there are 0-values
expr.jco.test <- abs(expr.jco.test)+1
expr.jco.train <- abs(expr.jco.train)+1
return(list("expr.jco.test"=expr.jco.test, "expr.jco.train"=expr.jco.train, "expr.jco.train.HL27"=expr.jco.train.HL27))
}
load.jcoPhenoData <- function(jco.datadirectory
,exprmat.jcoTest
,exprmat.jcoTrain){
################################################################################
# load vancouvers JCO phenodata given by
# Fong Chun Chan in his mail from 10.10.2016, 21:48
# on my mail-account gunther.glehr@stud.uni-regensburg.de
# USES:
# AllData_20160810/Daten/jco/mail_fong20161010/ECOG_E2496_annot.clean.tsv - trainingset phenodata
# AllData_20160810/Daten/jco/mail_fong20161010/van_cohort.clean.tsv - testset phenodata
#
# Result:
# pheno.jco.train - trainingset phenodata for 309 Samples from vancouver (JCO study)
# pheno.jco.test - testset phenodata for 81 Samples from vancouver (JCO study)
# expr.jco.train.excl - JCO trainingssamples with some samples excluded through quality control
# expr.jco.test.excl - JCO testsamples with 3 samples excluded through quality control
#
################################################################################
# #### Fong told that the duplicate samples are the same tissue and should have a high correlation.
# #### test now
# allcor <- cor(expr.jco.train)**2
# allcor["X27115.1", "X27115"]
# allcor["X27095.1", "X27095"]
# allcor[upper.tri(allcor, diag = TRUE)] <- NA
# allcor.big <- allcor
# allcor.big[allcor.big<0.98] <- 0
# library(reshape2)
# melted.allcor <- melt(allcor)
# melted.allcor.big <- melt(allcor.big)
# library(ggplot2)
# plot00 <- ggplot(data = melted.allcor, aes(x=Var1, y=Var2, fill=value)) +
# geom_tile()
# plot01 <- ggplot(data = melted.allcor.big, aes(x=Var1, y=Var2, fill=value)) +
# geom_tile()
# sizes <- 20
# pdf("test.pdf", width=sizes, height=sizes)
# print(plot00)
# print(plot01)
# plot(density(c(allcor), na.rm = TRUE))
# abline(v=allcor["X27115.1", "X27115"], col="red")
# abline(v=allcor["X27095.1", "X27095"], col="green")
# hist(c(allcor), breaks=500)
# abline(v=allcor["X27115.1", "X27115"], col="red")
# abline(v=allcor["X27095.1", "X27095"], col="green")
# axis(4)
# dev.off()
##### load data #####
pheno.jco.train <- read.table(file = file.path(jco.datadirectory, "mail_fong20161010", "ECOG_E2496_annot.clean.tsv")
,header = TRUE
,stringsAsFactors = FALSE)
pheno.jco.train$sample_id <- paste0("X", pheno.jco.train$sample_id)
rownames(pheno.jco.train) <- pheno.jco.train$sample_id
pheno.jco.test <- read.table(file = file.path(jco.datadirectory, "mail_fong20161010", "van_cohort.clean.tsv")
,header = TRUE
,stringsAsFactors = FALSE)
rownames(pheno.jco.test) <- pheno.jco.test$sample_id
# exclude the 3 samples Fong described in his mail.
pheno.jco.test <- pheno.jco.test[-which(rownames(pheno.jco.test) %in% c("HL0024", "HL0036", "HL0090")), ]
#### the survival times are probably years, I expect months ####
pheno.jco.train$os <- pheno.jco.train$os * 12
pheno.jco.train$ffs <- pheno.jco.train$ffs * 12
pheno.jco.test$os <- pheno.jco.test$os * 12
pheno.jco.test$ffs <- pheno.jco.test$ffs * 12
##### make exclude-count-data #####
expr.jco.train.excl <- exprmat.jcoTrain[, rownames(pheno.jco.train)]
expr.jco.test.excl <- exprmat.jcoTest[, rownames(pheno.jco.test)]
return(list("pheno.jco.train"=pheno.jco.train, "pheno.jco.test"=pheno.jco.test
,"expr.jco.train.excl"=expr.jco.train.excl, "expr.jco.test.excl"=expr.jco.test.excl))
}
catt("\n", A_scriptname, " started\n")
catt("-----------------------------------------------------\n")
catt <- define.catt(nspace=2)
tmp <- load.jcoExprData(jco.datadirectory = "Data/RawData/jco/")
expr.jco.test.full <- tmp[["expr.jco.test"]]
expr.jco.train.full <- tmp[["expr.jco.train"]]
expr.jco.train.HL27 <- tmp[["expr.jco.train.HL27"]]
tmp <- load.jcoPhenoData(jco.datadirectory = "Data/RawData/jco/"
,exprmat.jcoTest = expr.jco.test.full
,exprmat.jcoTrain = expr.jco.train.full)
pheno.jco.train <- tmp[["pheno.jco.train"]]
pheno.jco.test <- tmp[["pheno.jco.test"]]
expr.jco.train <- tmp[["expr.jco.train.excl"]]
expr.jco.test <- tmp[["expr.jco.test.excl"]]
catt <- define.catt()
catt("-----------------------------------------------------\n")
catt(A_scriptname, " finished\n")
以上是关于r 加载JCO数据的主要内容,如果未能解决你的问题,请参考以下文章