r 蛋白质簇的ERGM

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了r 蛋白质簇的ERGM相关的知识,希望对你有一定的参考价值。

library(tidyverse)
library(igraph)
library(assortnet)
library(protr)
library(ergm)
library(intergraph)
library(network)

setwd("~/Downloads/PC_meren/")

# Read files
# Protein clusters
pc <- read.table(file = "protein-clusters.txt", header = T) %>% tbl_df

# PC sequences
pc_seqs <- readFASTA("protein-clusters.fasta")

# Remove sequences with amino acids different than the 20 common types
pc_seqs_good <- plyr::ldply(pc_seqs, protcheck) %>% filter(V1 == "TRUE")

pc_seqs <- pc_seqs[pc_seqs_good$.id]

# Sequence-similarity network (blastp all-vs-all)
sim_net <- read.table(file = "protein-cluster.filt.m8", header = F) %>% 
  filter(V1 %in% pc_seqs_good$.id, V2 %in% pc_seqs_good$.id) %>%
  tbl_df

# Build graph (undirected, weighted and without parallel/self-loops)
sim_net <- sim_net %>% 
  dplyr::select(V1, V2, V3) %>%
  dplyr::rename(node1 = V1, node2 = V2, weight = V3) %>%
  graph_from_data_frame(directed = F) %>%
  simplify(edge.attr.comb = "max") 

# Remove isolates edges
sim_net <- igraph::delete.vertices(sim_net, which(degree(sim_net)<1))

# Check how the graph looks like

dd <- degree.distribution(sim_net, cumulative = T) %>%
  as.data.frame %>%
  mutate(x = 1:length(.)) %>% 
  magrittr::set_colnames(c("y", "x")) %>%
  tbl_df %>% 
  filter(y > 0) %>%
  ggplot(aes(x,y)) + 
  geom_point() + 
  scale_x_log10() + 
  scale_y_log10() + 
  xlab("Degree") + 
  ylab("Cumulative Frequency") +
  theme_bw()

dd

network_description <- function (X) {
  require(igraph)
  cat("NETWORK DESCRIPTION","\n")
  cat("----------------------------------------------","\n")
  #cat("Diameter:",diameter(X),"\n")
  #cat("Girth:",girth(X)$girth,"\n")
  #cat("Average betweenness:",sum(betweenness(X))/vcount(X),"\n")
  #cat("Average closeness:",sum(closeness(X))/vcount(X),"\n")
  #cat("Average path length:",average.path.length(simplify(X)),"\n")
  cat("Transitivity:",transitivity(X),"\n")
  cat("Total number of motifs size 3:",graph.motifs.no(X),"\n")
  cat("Density:",graph.density(X),"\n")
  cat("Average vextex neighbor degree:",sum(graph.knn(X)$knn)/vcount(X),"\n")
  cat("Unique links:",ecount(simplify(X)),"\n")
  cat("Link ratio:",ecount(X)/ecount(simplify(X)),"\n")
  cat("Number of clusters:",no.clusters(X),"\n")
  cat("----------------------------------------------","\n")
}

network_description(sim_net)

mcl_bin <- "mcl"

gx <- sim_net

E(gx)$weight <- E(gx)$weight - min(E(gx)$weight) + 0.001

g_mcl_list <- vector( mode = "list")

options <- paste("--abc -I", 2.5, sep=" ") 
I <- as.character(2.5)

g_mcl_list <- run_mcl(X = "g", graph = gx, options = options, mcl_bin = mcl_bin)
g_mcl_list <- g_mcl_list[match(igraph::get.vertex.attribute(gx, "name"), g_mcl_list$vertex),]
g_mcl_list <- make_clusters(gx, as.numeric(g_mcl_list$com), algorithm = 'MCL', modularity = TRUE)

com_mclX <- paste("com_mcl",I,sep="")
sim_net <- igraph::set.vertex.attribute(graph = sim_net, name = eval(com_mclX), index=V(sim_net),value = as.character(membership(g_mcl_list)) )

dist2origin <- function(x,y){
  m <- rbind(c(0,0), cbind(x, y))
  m <- as.matrix(dist(m, method = "euclidean"))[1,][-1]
}

binary_search_filter <- function(g, low, high) {
  x<-g
  if ( high < low ) {
    return(NULL)
  }else {
    mid <- floor((low + high) / 2)
    x<-igraph::delete.edges(g, which(E(g)$weight <= weights[mid])) 
    cat("low:", low, " mid:", mid, " high:", high, " mid_weight:", weights[mid], " components:", count_components(x), " edges:", ecount(x), "\n")
    if ((high - low) == 0){
      cat("low:", low, " mid:", mid, " high:", high, " mid_weight:", weights[mid], " components:",count_components(x), " edges:", ecount(x), "\n")
      x<-igraph::delete.edges(g, which(E(g)$weight < weights[mid+1])) 
      return(list(weight=mid+1, graph=x))
      exit
    }
    if (!(is.connected(x))){
      binary_search_filter(g, low, mid - 1)
    }else if ( is.connected(x) ){
      binary_search_filter(g, mid + 1, high)
    }
  }
}

binary_search_filter_1 <- function(g, low, high) {
  x<-g
  if ( high < low ) {
    
    return(NULL)
    
  }else {
    mid <- floor((low + high) / 2)
    x<-igraph::delete.edges(g, which(E(g)$weight <= weights[mid])) 
    cat("low:", low, " mid:", mid, " high:", high, " mid_weight:", weights[mid], " components:", count_components(x), " edges:", ecount(x), "\n")
    
    if ((mid - low) == 0){
      x<-igraph::delete.edges(g, which(E(g)$weight <= weights[mid-1])) 
      cat("Final: low:", low - 1, " mid:", mid - 1, " high:", high - 1, " mid_weight:", weights[mid - 1], " components:",count_components(x), " edges:", ecount(x), "\n")
      return(list(weight=mid - 1, graph=x))
      exit
    }
    
    if (((high - low) == 0)){
      x<-igraph::delete.edges(g, which(E(g)$weight <= weights[mid+1])) 
      cat("Final: low:", low, " mid:", mid, " high:", high, " mid_weight:", weights[mid], " components:",count_components(x), " edges:", ecount(x), "\n")
      return(list(weight=mid , graph=x))
      exit
    }
    
    
    if (!(is.connected(x))){
      binary_search_filter_1(g, low, mid - 1)
    }else if (is.connected(x)){
      binary_search_filter_1(g, mid + 1, high)
    }
  }
}

zero_range <- function(x, tol = .Machine$double.eps ^ 0.5) {
  if (length(x) == 1) return(TRUE)
  x <- range(x) / mean(x)
  isTRUE(all.equal(x[1], x[2], tolerance = tol))
}


G <- sim_net
X <- as_data_frame(G , what="vertices")
clmethod <- 'com_mcl2.5'
G.comms <- tapply( X$name, as.numeric(X[,clmethod]), FUN=function(x) { induced.subgraph(G,vids = x ) }  )

com_stats <- vector(mode = "list")
com_ergm <- vector(mode = "list")

for ( i in 1:length(G.comms)) {
  com <- G.comms[[i]]
  if (vcount(com) >=10){
  com_seqs <- pc_seqs[as.character(V(com)$name)]
  com_seqs <- Filter(length, com_seqs)
  den <- graph.density(com)
  if (graph.density(com) == 1){
    weights <- unique(sort(E(com)$weight, decreasing = FALSE))
    com <- binary_search_filter_1(g = com, low = 1, high = length(weights))$graph
  } 
  
  # Protein length
  l <- plyr::ldply(com_seqs, function(x){length(unlist(strsplit(x, "")))})
  length_equal <- zero_range(l$V1)
  com <- igraph::set.vertex.attribute(graph = com, name = "length", index=V(com),value = l$V1) 
  #assortment.continuous(as_adjacency_matrix(com, sparse = F, attr = "weight" ), aac_pc1)
  
  # Acidic-basic ratio
  l <- plyr::ldply(com_seqs, extractAAC)
  row.names(l) <- l$.id
  l <- plyr::adply(l, 1, function(x){
    ab<-(x$D + x$E)/(x$H + x$R + x$K)
    ab <- ifelse(is.infinite(ab), 0, ab)
  })
  ab_ratio_equal <- zero_range(l$V1)
  com <- igraph::set.vertex.attribute(graph = com, name = "ab_ratio", index=V(com),value = l$V1)
  
  # Amino acid composition
  l <- plyr::ldply(com_seqs, extractAAC)
  row.names(l) <- l$.id
  l$.id <- NULL
  aac_equal <- unique(plyr::ldply(l, zero_range)$V1)
  if (aac_equal != TRUE || length(aac_equal) > 1){
    aac_pca <- rda(l, scale = T)
    aac_inertia <- round(cumsum(100*aac_pca$CA$eig/sum(aac_pca$CA$eig)),2)
    if (length(aac_inertia) > 1){
      aac_ed <- dist2origin(aac_pca$CA$u[,1], aac_pca$CA$u[,2])
      com <- igraph::set.vertex.attribute(graph = com, name = "aac_ed", index=V(com),value = aac_ed) 
    }else{
      com <- igraph::set.vertex.attribute(graph = com, name = "aac_ed", index=V(com),value = aac_pca$CA$u[,1])
    }
  } 
  
  # Descriptors: hydrophobicity, normalized van der Waals volume polarity, and polarizability
  l <- plyr::ldply(com_seqs, extractCTDC)
  row.names(l) <- l$.id
  l$.id <- NULL
  ctdc_equal <- unique(plyr::ldply(l, zero_range)$V1)
  if (ctdc_equal != TRUE || length(ctdc_equal) > 1){
    ctdc_pca <- rda(l, scale = T)
    ctdc_inertia <- round(cumsum(100*ctdc_pca$CA$eig/sum(ctdc_pca$CA$eig)),2)
    if (length(ctdc_inertia) > 1){
      ctdc_ed <- dist2origin(ctdc_pca$CA$u[,1], ctdc_pca$CA$u[,2])
      com <- igraph::set.vertex.attribute(graph = com, name = "ctdc_ed", index=V(com), value = ctdc_ed) 
    }else{
      com <- igraph::set.vertex.attribute(graph = com, name = "ctdc_ed", index=V(com), value = ctdc_pca$CA$u[,1]) 
    }
  }
  # Conjoint Triad Descriptors
  # l <- plyr::ldply(com_seqs, extractCTriad)
  # row.names(l) <- l$.id
  # l$.id <- NULL
  # triad_pca <- rda(l, scale = T)
  # triad_inertia <- round(cumsum(100*triad_pca$CA$eig/sum(triad_pca$CA$eig)),2)
  # triad_ed <- dist2origin(triad_pca$CA$u[,1], triad_pca$CA$u[,2])
  # com <- igraph::set.vertex.attribute(graph = com, name = "triad_ed", index=V(com), value = triad_ed) 
  # 
  
  if ((length(length_equal) == 1 & length_equal == TRUE) & 
      (length(aac_equal) == 1 & aac_equal == TRUE) & 
      (length(ctdc_equal) == 1 & ctdc_equal == TRUE)){
    com_ergm[[i]] <- data_frame(Estimate = "equal", Std..Error = "equal",  MCMC..= "equal", p.value= "equal", com = i)
  }else{
      net <- asNetwork(com)
      
      # We test that the edges are different enough to model homophily, if not we will get Inf
      reg_edges <- ergm(net ~ edges)
      
      if (!is.infinite(reg_edges$coef)){
        
        reg <- ergm(net ~ edges + absdiff("ctdc_ed") + absdiff("aac_ed") + 
                      absdiff("length"), verbose = T)
        com_ergm[[i]]  <- data.frame(summary(reg)$coefs, com = i)
        
      }else{
        com_ergm[[i]] <- data_frame(Estimate = Inf, Std..Error = Inf,  MCMC..= Inf, p.value= Inf, com = i)
      }
      
   
    
    com_stats[[i]] <- data.frame(com = i, nodes = V(com)$name, p_len = V(com)$length, 
                                 ctdc_ed = V(com)$ctdc_ed, aac_ed = V(com)$aac_ed,
                                 ab_ratio = V(com)$ab_ratio, den = den)
    }
  }else{
    com_ergm[[i]] <- data_frame(Estimate = NA, Std..Error = NA,  MCMC..= NA, p.value= NA, com = i)
  }
}

p$sign <- ifelse(p$p.value < 0.001, "sig", "no-sig")

p %>% 
  filter(class == "absdiff.ctdc_ed" | class == "absdiff.aac_ed", Estimate <= 100, Estimate >= -100) %>% 
  ggplot(aes(class, Estimate)) + 
  geom_jitter(size = 1, alpha = 0.6, aes(color = sign)) + 
  geom_violin(alpha = 0, color = "red") +
  theme_bw()  

以上是关于r 蛋白质簇的ERGM的主要内容,如果未能解决你的问题,请参考以下文章

无法在 Windows 10 上的 R 中加载 statnet

r 围绕簇的ggplot椭圆

r 计算单一和多域蛋白质

R语言层次聚类(hierarchical clustering):特征缩放抽取hclust中的聚类簇(cutree函数从hclust对象中提取每个聚类簇的成员)基于主成分分析的进行聚类结果可视化

R语言层次聚类:通过内平方和WSS选择最优的聚类K值可视化不同K下的BSS和WSS通过Calinski-Harabasz指数(准则)与聚类簇个数的关系获取最优聚类簇的个数

R语言ggplot2可视化可视化聚类图使用geom_encircle函数绘制多边形标定属于同一聚类簇的数据点并自定义每个聚类簇数据点的颜色多边形框的颜色(Cluster Plot)主副标题题注