r 加载JCO数据

Posted 2021-05-23
tags:
篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了r 加载JCO数据相关的知识，希望对你有一定的参考价值。
load.jcoExprData <- function(jco.datadirectory){
	################################################################################
	# load vancouvers JCO data given by 
	# Michael Altenbuchinger in his mail from 21.9.2016 22:12
	# on my mail-account gunther.glehr@stud.uni-regensburg.de
	
	# USES:
	# 	data/allvanc/scott_et_al_JCO.zip	- Data from JCO
	# 	A_ORIG.GENESIG						- original gene signature
	#
	# Result:
	#	expr.jco.train	- Counts for 309 Samples from vancouver (JCO study)
	#	expr.jco.test	- Counts for 81 Samples from vancouver (JCO study)
	#
	#
	################################################################################
	
	# unzip
	unzip(file.path(jco.datadirectory, "scott_et_al_JCO.zip"), exdir = file.path(jco.datadirectory))
	
	# load data
	# 1. trainingsdata
	# F:\Dropbox\#UNI\Programming\vanc-signature-nanostringV2\AllData_20160810\Daten\JCO\scott_et_al_JCO\training_cohort\JCO_codeset
	expr.jco.train <- as.matrix(read.table(file=file.path(jco.datadirectory, "scott_et_al_JCO", "training_cohort", "JCO_codeset"
														  , "scott_et_al_JCO_training_cohort_JCO_codeset_gene_exprs_mat.tsv")
										   ,header = TRUE
										   ,sep = "\t"
										   ,quote = ""
										   ,row.names = 1))
	
	
	expr.jco.test <- as.matrix(read.table(file=file.path(jco.datadirectory, "scott_et_al_JCO", "validation_cohort", "JCO_codeset"
														 , "scott_et_al_JCO_validation_cohort_JCO_codeset_gene_exprs_mat.tsv")
										  ,header = TRUE
										  ,sep = "\t"
										  ,quote = ""
										  ,row.names = 1))
	
	expr.jco.train.HL27 <- as.matrix(read.table(file=file.path(jco.datadirectory, "scott_et_al_JCO_training_cohort_HL27_codeset_gene_exprs_mat.tsv")
												,header = TRUE
												,sep = "\t"
												,quote = ""
												,row.names = 1))
	mapping <- read.table(file=file.path(jco.datadirectory, "HL27_to_ecog_id.txt")
						  , header = TRUE
						  ,stringsAsFactors = FALSE)
	
	mapping <- mapping[mapping$sample_id %in% colnames(expr.jco.train.HL27), ]
	expr.jco.train.HL27 <- expr.jco.train.HL27[, mapping$sample_id]
	colnames(expr.jco.train.HL27) <- paste0("X", mapping$ecog_id)
	
	# Problem: we miss some genes plus 2 Samples are there twice
	rn.test <- rownames(expr.jco.test)
	rn.train <- rownames(expr.jco.train)
	test.in.train <- rn.test %in% rn.train
	train.in.test <- rn.train %in% rn.test
	# cat("Testgenes not in Traingenes:\n")
	# cat(rn.test[!test.in.train], "\n")
	# cat("Traingenes not in Testgenes:\n")
	# cat(rn.train[!train.in.test], "\n")
	# 
	# cat("According to genecards.com:\n")
	# cat("MORG1 <-> WDR83\n")
	# cat("TRAIL <-> TNFSF10\n")
	# cat("The left genes just have a - instead of a _\n")
	# 1. replace _ with - (The origsig names have the -
	rn.test <- sub("_", "-", rn.test)
	rn.test <- sub("TRAIL", "TNFSF10", rn.test)
	rn.test <- sub("MORG1", "WDR83", rn.test)
	
	rn.train <- sub("_", "-", rn.train)
	
	test.in.train <- rn.test %in% rn.train
	train.in.test <- rn.train %in% rn.test
	# cat("Testgenes not in Traingenes:\n")
	# cat(rn.test[!test.in.train], "\n")
	# cat("Traingenes not in Testgenes:\n")
	# cat(rn.train[!train.in.test], "\n")
	catt("In expr.jco.test\n")
	cat("  ", rownames(expr.jco.test)[!rownames(expr.jco.test) %in% rn.test], "\n ")
	catt("are changed into")
	cat("\n  ", rn.test[!rownames(expr.jco.test) %in% rn.test], "\n\n")
	rownames(expr.jco.test) <- rn.test
	
	catt("In expr.jco.train\n")
	cat("  ", rownames(expr.jco.train)[!rownames(expr.jco.train) %in% rn.train], "\n ")
	catt("are changed into")
	cat("\n  ", rn.train[!rownames(expr.jco.train) %in% rn.train], "\n ")
	rownames(expr.jco.train) <- rn.train
	
	catt("\n")
	catt("In expr.jco.train.HL27, all _ are replaced by -\n")
	rownames(expr.jco.train.HL27) <- sub("_", "-", rownames(expr.jco.train.HL27))
	
	
	
	# sort the names
	
	expr.jco.train <- expr.jco.train[sort(rownames(expr.jco.train)), ]
	expr.jco.train <- expr.jco.train[, sort(colnames(expr.jco.train))]
	expr.jco.train.HL27 <- expr.jco.train.HL27[sort(rownames(expr.jco.train.HL27)), ]
	expr.jco.train.HL27 <- expr.jco.train.HL27[, sort(colnames(expr.jco.train.HL27))]
	expr.jco.test <- expr.jco.test[sort(rownames(expr.jco.test)), ]
	expr.jco.test <- expr.jco.test[, sort(colnames(expr.jco.test))]
	
	
	# we have negative values in expr.jco.test, take absolute values
	# plus 1 because there are 0-values
	expr.jco.test <- abs(expr.jco.test)+1
	expr.jco.train <- abs(expr.jco.train)+1
	
	return(list("expr.jco.test"=expr.jco.test, "expr.jco.train"=expr.jco.train, "expr.jco.train.HL27"=expr.jco.train.HL27))
}

load.jcoPhenoData <- function(jco.datadirectory
							  ,exprmat.jcoTest
							  ,exprmat.jcoTrain){
	################################################################################
	# load vancouvers JCO phenodata given by 
	# Fong Chun Chan in his mail from 10.10.2016, 21:48
	# on my mail-account gunther.glehr@stud.uni-regensburg.de
	
	# USES:
	# 	AllData_20160810/Daten/jco/mail_fong20161010/ECOG_E2496_annot.clean.tsv	- trainingset phenodata
	# 	AllData_20160810/Daten/jco/mail_fong20161010/van_cohort.clean.tsv		- testset phenodata
	#
	# Result:
	#	pheno.jco.train		- trainingset phenodata for 309 Samples from vancouver (JCO study)
	#	pheno.jco.test		- testset phenodata for 81 Samples from vancouver (JCO study)
	#	expr.jco.train.excl	- JCO trainingssamples with some samples excluded through quality control
	#	expr.jco.test.excl	- JCO testsamples with 3 samples excluded through quality control
	#
	################################################################################
	
	
	# #### Fong told that the duplicate samples are the same tissue and should have a high correlation.
	# #### test now
	# allcor <- cor(expr.jco.train)**2 
	# allcor["X27115.1", "X27115"]
	# allcor["X27095.1", "X27095"]
	# allcor[upper.tri(allcor, diag = TRUE)] <- NA
	# allcor.big <- allcor
	# allcor.big[allcor.big<0.98] <- 0
	# library(reshape2)
	# melted.allcor <- melt(allcor)
	# melted.allcor.big <- melt(allcor.big)
	# library(ggplot2)
	# plot00 <- ggplot(data = melted.allcor, aes(x=Var1, y=Var2, fill=value)) + 
	# 	geom_tile()
	# plot01 <- ggplot(data = melted.allcor.big, aes(x=Var1, y=Var2, fill=value)) + 
	# 	geom_tile()
	# sizes <- 20
	# pdf("test.pdf", width=sizes, height=sizes)
	# print(plot00)
	# print(plot01)
	# plot(density(c(allcor), na.rm = TRUE))
	# abline(v=allcor["X27115.1", "X27115"], col="red")
	# abline(v=allcor["X27095.1", "X27095"], col="green")
	# hist(c(allcor), breaks=500)
	# abline(v=allcor["X27115.1", "X27115"], col="red")
	# abline(v=allcor["X27095.1", "X27095"], col="green")
	# axis(4)
	# dev.off()
	##### load data #####
	pheno.jco.train <- read.table(file = file.path(jco.datadirectory, "mail_fong20161010", "ECOG_E2496_annot.clean.tsv")
								  ,header = TRUE
								  ,stringsAsFactors = FALSE)
	pheno.jco.train$sample_id <- paste0("X", pheno.jco.train$sample_id)
	rownames(pheno.jco.train) <- pheno.jco.train$sample_id
	
	
	
	pheno.jco.test <- read.table(file = file.path(jco.datadirectory, "mail_fong20161010", "van_cohort.clean.tsv")
								 ,header = TRUE
								 ,stringsAsFactors = FALSE)
	rownames(pheno.jco.test) <- pheno.jco.test$sample_id
	
	# exclude the 3 samples Fong described in his mail.
	pheno.jco.test <- pheno.jco.test[-which(rownames(pheno.jco.test) %in% c("HL0024", "HL0036", "HL0090")), ]
	
	#### the survival times are probably years, I expect months ####
	pheno.jco.train$os <- pheno.jco.train$os * 12
	pheno.jco.train$ffs <- pheno.jco.train$ffs * 12
	pheno.jco.test$os <- pheno.jco.test$os * 12
	pheno.jco.test$ffs <- pheno.jco.test$ffs * 12
	
	##### make exclude-count-data #####
	expr.jco.train.excl <- exprmat.jcoTrain[, rownames(pheno.jco.train)]
	expr.jco.test.excl <- exprmat.jcoTest[, rownames(pheno.jco.test)]
	return(list("pheno.jco.train"=pheno.jco.train, "pheno.jco.test"=pheno.jco.test
				,"expr.jco.train.excl"=expr.jco.train.excl, "expr.jco.test.excl"=expr.jco.test.excl))
}



catt("\n", A_scriptname, " started\n")
catt("-----------------------------------------------------\n")
catt <- define.catt(nspace=2)

tmp <- load.jcoExprData(jco.datadirectory = "Data/RawData/jco/")
expr.jco.test.full <- tmp[["expr.jco.test"]]
expr.jco.train.full <- tmp[["expr.jco.train"]]
expr.jco.train.HL27 <- tmp[["expr.jco.train.HL27"]]

tmp <- load.jcoPhenoData(jco.datadirectory = "Data/RawData/jco/"
				  ,exprmat.jcoTest = expr.jco.test.full
				  ,exprmat.jcoTrain = expr.jco.train.full)

pheno.jco.train <- tmp[["pheno.jco.train"]]
pheno.jco.test <- tmp[["pheno.jco.test"]]
expr.jco.train <- tmp[["expr.jco.train.excl"]]
expr.jco.test <- tmp[["expr.jco.test.excl"]]



catt <- define.catt()
catt("-----------------------------------------------------\n")
catt(A_scriptname, " finished\n")
以上是关于r 加载JCO数据的主要内容，如果未能解决你的问题，请参考以下文章