r 用于分析原始alma数据集的脚本

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了r 用于分析原始alma数据集的脚本相关的知识,希望对你有一定的参考价值。

#!/bin/bash
#set -x
DIR="${1}"
EXT1="${2}"
EXT2="${3}"

# programs
TRIMEXE=trimmomatic
USEAEXE=~/Downloads/usearch10.0.240_i86osx32

# Folders
QCFILES=qc_files
MEFILES=merged_files
FASTA=16s_fna
CHIMFILES=${FASTA}/usearch61_chimera_checking/
NOCHIMFILES=${FASTA}/no_chimera
OTUFILES=otu_picking_16s_uclust99/
TMP=tmp
# Get all pair-ends

FILES=($(find "${DIR}" -name "*${EXT1}*" -exec basename {} \;))

mkdir -p "${QCFILES}" "${TMP}" "${MEFILES}" "${FASTA}" "${OTUFILES}" "${NOCHIMFILES}"

for element in $(seq 0 $((${#FILES[@]} - 1))); do
	FWD="${FILES[$element]}"
	REV="${FILES[$element]/${EXT1}/${EXT2}}"

	# quality trim
	QCF="${FWD/fastq.gz/qc.fastq.gz}"
	QCR=$"{REV/fastq.gz/qc.fastq.gz}"

	"${TRIMEXE}" PE -phred33 "${DIR}"/"${FWD}" "${DIR}"/"${REV}" \
		"${TMP}"/R1_pe "${TMP}"/R1_se "${TMP}"/R2_pe "${TMP}"/R2_se \
		SLIDINGWINDOW:4:20 MINLEN:200

	mv "${TMP}"/R1_pe "${QCFILES}"/"${QCF}"
	mv "${TMP}"/R2_pe "${QCFILES}"/"${QCR}"
	rm "${TMP}"/R1_se
	rm "${TMP}"/R2_se

	# Merge reads
	ME="${MEFILES}"/"${FILES[$element]/${EXT1}/merged.fastq}"
	MEFA="${FASTA}"/"${FILES[$element]/${EXT1}/merged.fasta}"
	SE1="${MEFILES}"/notmerged_"${QCF}"
	SE2="${MEFILES}"/notmerged_"${QCR}"

	"${USEAEXE}" -fastq_mergepairs "${QCFILES}"/"${QCF}" -reverse "${QCFILES}"/"${QCR}" \
		-fastqout "${ME}" -fastaout "${MEFA}" -fastq_maxdiffs 3 -fastq_minmergelen 300 \
		-fastqout_notmerged_fwd "${SE1}" -fastqout_notmerged_rev "${SE2}"
	
	# Convert to fasta Skipped because to slow for the test
	# convert_fastaqual_fastq.py -f "${ME}" -o "${FASTA}" -c fastq_to_fastaqual

	# identify chimeric sequences
	identify_chimeric_seqs.py -m usearch61 -i "${MEFA}" -o "${CHIMFILES}" --suppress_usearch61_ref
 	
	NOCHIM="${MEFA/merged.fasta/nochimera.merged.fasta}"
	filter_fasta.py -f "${MEFA}" -o "${NOCHIM}" -s "${CHIMFILES}"/chimeras.txt -n

	# rename reads to be used with qiime (using gawk for mac os X)
	SAMPLE="${FWD/.${EXT1}/}"
	gawk -i inplace -vS="${SAMPLE}" 'BEGIN{i=1}{if ($0 ~ />/){print ">"S"_"i; i++}else{print $0}}' "${NOCHIM}"
	mv "${NOCHIM}" "${NOCHIMFILES}"
done

# Combine all fasta files
cat "${NOCHIMFILES}"/*.fasta > "${OTUFILES}"/16s.insilico_mock_combined_chimera_removed.fna
# Pick OTUs
pick_otus.py -i "${OTUFILES}"/16s.insilico_mock_combined_chimera_removed.fna -o "${OTUFILES}" -s 0.99

# Pick representative
pick_rep_set.py -i "${OTUFILES}"/16s.insilico_mock_combined_chimera_removed_otus.txt -f "${OTUFILES}"/16s.insilico_mock_combined_chimera_removed.fna \
	-m most_abundant -o "${OTUFILES}"/rep_otus.fasta

# Make OTU table
make_otu_table.py -i "${OTUFILES}"/16s.insilico_mock_combined_chimera_removed_otus.txt -o "${OTUFILES}"/16s.insilico_mock.biom

# Convert to tsv
biom convert -i "${OTUFILES}"/16s.insilico_mock.tsv -o test --to-tsv
# UCLUST workflow ---------------------------------------------------------
library(tidyverse)
library(cowplot)
library(ggpubr)

setwd("~/Downloads/alma_original_test")

uclust_df <- read_tsv("16s.insilico_mock.tsv", col_names = T)

uclust_16S_summary <- uclust_df%>% dplyr::rename(otu_name = `#OTU ID`) %>%
  gather(label, count, -otu_name) %>% 
  tbl_df %>% 
  group_by(label) %>% 
  mutate(rel_abun = count/sum(count)) %>% 
  arrange(otu_name, label) %>% 
  filter(count > 0)

uclust_16S_prevalence <- uclust_16S_summary %>% 
  dplyr::select(label,otu_name, count) %>% 
  group_by(otu_name) %>% 
  summarise(prev = sum(count >0), total_counts = sum(count)) 

uclust_16S_abs_singletons <- uclust_16S_prevalence %>% dplyr::filter(total_counts <= 1, prev <= 1)
uclust_16S_abun_singletons <- uclust_16S_prevalence %>% dplyr::filter(total_counts > 1, prev <= 1)

uclust_16S_abs_singletons_names <- uclust_16S_abs_singletons$otu_name

uclust_16S_abun_doubletons <- uclust_16S_prevalence %>% filter(!(otu_name %in% uclust_16S_abs_singletons_names)) %>% dplyr::filter( prev <= 2)

# Remove absolute singletons
uclust_16S_prevalence <- uclust_16S_prevalence %>% filter(!(otu_name %in% uclust_16S_abs_singletons_names))

uclust_16S_counts <- data.frame(class = c("Total ASVs","Absolute singletons", "Abundant singletons"), 
                                counts = c(length(unique(uclust_16S_summary$otu_name)),
                                           dim(uclust_16S_abs_singletons)[1], 
                                           dim(uclust_16S_abun_singletons)[1]))

print(uclust_16S_counts)

uclust_16S_counts$class <- factor(uclust_16S_counts$class, levels=c("Total ASVs","Absolute singletons", "Abundant singletons"))
ggplot(uclust_16S_counts, aes(class, counts)) +
  geom_bar(stat = "identity") + theme_bw() +
  xlab("") + ylab("# ASVs")


# VS ref ------------------------------------------------------------------
# Get mock communities ----------------------------------------------------
# EVEN ERR2098507
# STAG ERR2098508

uclust_mock <- uclust_16S_summary %>%
  filter(otu_name %in% uclust_16S_prevalence$otu_name) %>%
  filter(grepl("Even", label) | grepl("Staggered", label))

uclust_mock_even <- uclust_mock %>%
  filter(grepl("Even", label)) %>%
  ungroup %>%
  select(otu_name, count, label) 

uclust_mock_stag <- uclust_mock %>%
  filter(grepl("Staggered", label)) %>%
  ungroup %>%
  select(otu_name, count, label) 

# Compare references
# usearch10.0.240_i86osx32 -usearch_global rep_otus.fasta \
# -db references_even_mock_noprimers.fasta 
# -strand both -id 0.5 -blast6out otu_rep_vs_even.tsv


alma_mock <- read_tsv("alma_mock_data.txt", col_names = TRUE)

uclust_even_vs_ref <- read_tsv("otu_rep_vs_even.tsv", col_names = F) %>%
  select(X1, X2, X3) %>%
  dplyr::rename(otu_name = X1, ref = X2, id = X3) %>%
  separate(otu_name,into = "otu_name", sep = " ", remove = TRUE, extra = "drop") %>%
  inner_join(uclust_mock_even)


uclust_even_den <- ggplot(uclust_even_vs_ref, aes(id/100, fill = label, color = label)) +
  geom_density(alpha = 0.6) +
  theme_light() +
  xlab("Identity") +
  ylab("Density") +
  scale_x_continuous(labels = scales::percent) +
  ggtitle("uclust")

uclust_even_vs_ref_100 <- uclust_even_vs_ref  %>%
  select(-otu_name) %>%
  group_by(label) %>%
  mutate(N=sum(count),prop = count/N) %>%
  filter(id >= 97) %>%
  group_by(ref, label) %>%
  summarise(prop = sum(prop)) %>%
  ungroup() %>% group_by(ref) %>%
  summarise(prop = mean((prop))) %>%
  right_join(alma_mock %>% filter(!is.na(exp_even))) %>%
  mutate(exp_even = exp_even/100, obs_even = obs_even/100) %>%
  select(-contains("Stag"), -ref) %>%
  dplyr::rename(emose_dada_obs = prop, alma_exp = exp_even, alma_obs = obs_even) %>%
  gather(exp,values, c(emose_dada_obs, alma_exp, alma_obs)) %>%
  ungroup() #%>% filter(!is.na(label))

uclust_even_vs_ref_100$ref_name <- factor(uclust_even_vs_ref_100$ref_name, levels = rev(sort(unique(uclust_even_vs_ref_100$ref_name))))
uclust_even_vs_ref_100_comp <- ggplot(uclust_even_vs_ref_100, aes(ref_name, values, fill = exp)) +
  geom_bar(position = "dodge", stat = "identity") +
  coord_flip() +
  theme_light() +
  theme(legend.position = "top",
        legend.title = element_blank()) +
  scale_fill_manual(values = rev(c( "#1B9E77", "#D95F02" ,"#7570B3"))) +
  ylab("Proportion (hits >= 97%)") +
  xlab("") +
  scale_y_continuous(labels = scales::percent)


# Staggered
# Compare references
# usearch10.0.240_i86osx32 -usearch_global rep_otus.fasta \
# -db references_staggered_mock_noprimers.fasta 
# -strand both -id 0.5 -blast6out otu_rep_vs_stag.tsv

uclust_stag_vs_ref <- read_tsv("otu_rep_vs_stag.tsv", col_names = F) %>%
  select(X1, X2, X3) %>%
  dplyr::rename(otu_name = X1, ref = X2, id = X3) %>%
  separate(otu_name,into = "otu_name", sep = " ", remove = TRUE, extra = "drop") %>%
  inner_join(uclust_mock_stag)


uclust_stag_den <- ggplot(uclust_stag_vs_ref, aes(id/100, fill = label, color = label)) +
  geom_density(alpha = 0.6) +
  theme_light() +
  xlab("Identity") +
  ylab("Density") +
  scale_x_continuous(labels = scales::percent) +
  ggtitle("UCLUST")

uclust_stag_vs_ref_100 <- uclust_stag_vs_ref  %>%
  select(-otu_name) %>%
  group_by(label) %>%
  mutate(N=sum(count),prop = count/N) %>%
  filter(id >= 97) %>%
  group_by(ref, label) %>%
  summarise(prop = sum(prop)) %>%
  ungroup() %>% group_by(ref) %>%
  summarise(prop = mean((prop))) %>%
  right_join(alma_mock) %>%
  mutate(exp_stag = exp_stag/100, obs_stag = obs_stag/100) %>%
  select(-contains("even"), -ref) %>%
  dplyr::rename(emose_dada_obs = prop, alma_exp = exp_stag, alma_obs = obs_stag) %>%
  gather(exp,values, c(emose_dada_obs, alma_exp, alma_obs)) %>%
  ungroup()

uclust_stag_vs_ref_100$ref_name <- factor(uclust_stag_vs_ref_100$ref_name, levels = rev(sort(unique(uclust_stag_vs_ref_100$ref_name))))
uclust_stag_vs_ref_100_comp <- ggplot(uclust_stag_vs_ref_100, aes(ref_name, values, fill = exp)) +
  geom_bar(position = "dodge", stat = "identity") +
  coord_flip() +
  theme_light() +
  theme(legend.position = "top",
        legend.title = element_blank()) +
  scale_fill_manual(values = rev(c( "#1B9E77", "#D95F02" ,"#7570B3"))) +
  ylab("Proportion (hits >= 97%)") +
  xlab("") +
  scale_y_continuous(labels = scales::percent)

all_combined_stag <- bind_rows(uclust_stag_vs_ref_100 %>% 
                                 spread(exp, values) %>% 
                                 select(ref_name, emose_dada_obs, alma_exp) %>%
                                 dplyr::rename(X = emose_dada_obs, Y = alma_exp) %>% 
                                 mutate(exp = "UCLUST"),
                               uclust_stag_vs_ref_100 %>% 
                                 spread(exp, values) %>% 
                                 select(ref_name, alma_obs, alma_exp) %>%
                                 dplyr::rename(X = alma_obs, Y = alma_exp) %>% 
                                 mutate(exp = "Parada et al. 2015"))


all <- ggpubr::ggscatter(all_combined_stag, x = "X",  y = "Y",
                         add = "reg.line",                         # Add regression line
                         conf.int = TRUE,                          # Add confidence interval
                         color = "exp", 
                         palette = get_palette(c("#00AFBB", "#FC4E07"), 3),           # Color by groups "cyl"
                         #shape = "exp",                             # Change point shape by groups "cyl"
                         fullrange = TRUE, 
                         alpha = 0.9, # Extending the regression line
                         facet.by = "exp"
                         #rug = TRUE 
)+ stat_cor(aes(color = exp))  +
  scale_y_log10(labels = scales::percent) +
  scale_x_log10(labels = scales::percent) +
  ylab("515F-Y/926R Observed abundance") +
  xlab("Expected abundance") +
  theme_light() +
  theme(legend.position = "none")

ggdraw() +
  draw_plot(uclust_even_vs_ref_100_comp, x = 0, y = .5, width = .5, height = .5) +
  draw_plot(uclust_stag_vs_ref_100_comp, x = .5, y = .5, width = .5, height = .5) +
  draw_plot(all, x = 0, y = 0, width = 1, height = 0.5) +
  draw_plot_label(label = c("A", "B", "C"), size = 15,
                  x = c(0, 0.5, 0), y = c(1, 1, 0.5))

以上是关于r 用于分析原始alma数据集的脚本的主要内容,如果未能解决你的问题,请参考以下文章

R语言数据挖掘实战系列

16种常用的数据分析方法-主成分分析

R中用于大型复杂调查数据集的方法?

机器学习9 主成分分析

R:训练数据集的 k 折交叉验证

博士师兄手把手教你用R语言做PCA分析,不存在学不会!