r 用于分析原始alma数据集的脚本
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了r 用于分析原始alma数据集的脚本相关的知识,希望对你有一定的参考价值。
#!/bin/bash
#set -x
DIR="${1}"
EXT1="${2}"
EXT2="${3}"
# programs
TRIMEXE=trimmomatic
USEAEXE=~/Downloads/usearch10.0.240_i86osx32
# Folders
QCFILES=qc_files
MEFILES=merged_files
FASTA=16s_fna
CHIMFILES=${FASTA}/usearch61_chimera_checking/
NOCHIMFILES=${FASTA}/no_chimera
OTUFILES=otu_picking_16s_uclust99/
TMP=tmp
# Get all pair-ends
FILES=($(find "${DIR}" -name "*${EXT1}*" -exec basename {} \;))
mkdir -p "${QCFILES}" "${TMP}" "${MEFILES}" "${FASTA}" "${OTUFILES}" "${NOCHIMFILES}"
for element in $(seq 0 $((${#FILES[@]} - 1))); do
FWD="${FILES[$element]}"
REV="${FILES[$element]/${EXT1}/${EXT2}}"
# quality trim
QCF="${FWD/fastq.gz/qc.fastq.gz}"
QCR=$"{REV/fastq.gz/qc.fastq.gz}"
"${TRIMEXE}" PE -phred33 "${DIR}"/"${FWD}" "${DIR}"/"${REV}" \
"${TMP}"/R1_pe "${TMP}"/R1_se "${TMP}"/R2_pe "${TMP}"/R2_se \
SLIDINGWINDOW:4:20 MINLEN:200
mv "${TMP}"/R1_pe "${QCFILES}"/"${QCF}"
mv "${TMP}"/R2_pe "${QCFILES}"/"${QCR}"
rm "${TMP}"/R1_se
rm "${TMP}"/R2_se
# Merge reads
ME="${MEFILES}"/"${FILES[$element]/${EXT1}/merged.fastq}"
MEFA="${FASTA}"/"${FILES[$element]/${EXT1}/merged.fasta}"
SE1="${MEFILES}"/notmerged_"${QCF}"
SE2="${MEFILES}"/notmerged_"${QCR}"
"${USEAEXE}" -fastq_mergepairs "${QCFILES}"/"${QCF}" -reverse "${QCFILES}"/"${QCR}" \
-fastqout "${ME}" -fastaout "${MEFA}" -fastq_maxdiffs 3 -fastq_minmergelen 300 \
-fastqout_notmerged_fwd "${SE1}" -fastqout_notmerged_rev "${SE2}"
# Convert to fasta Skipped because to slow for the test
# convert_fastaqual_fastq.py -f "${ME}" -o "${FASTA}" -c fastq_to_fastaqual
# identify chimeric sequences
identify_chimeric_seqs.py -m usearch61 -i "${MEFA}" -o "${CHIMFILES}" --suppress_usearch61_ref
NOCHIM="${MEFA/merged.fasta/nochimera.merged.fasta}"
filter_fasta.py -f "${MEFA}" -o "${NOCHIM}" -s "${CHIMFILES}"/chimeras.txt -n
# rename reads to be used with qiime (using gawk for mac os X)
SAMPLE="${FWD/.${EXT1}/}"
gawk -i inplace -vS="${SAMPLE}" 'BEGIN{i=1}{if ($0 ~ />/){print ">"S"_"i; i++}else{print $0}}' "${NOCHIM}"
mv "${NOCHIM}" "${NOCHIMFILES}"
done
# Combine all fasta files
cat "${NOCHIMFILES}"/*.fasta > "${OTUFILES}"/16s.insilico_mock_combined_chimera_removed.fna
# Pick OTUs
pick_otus.py -i "${OTUFILES}"/16s.insilico_mock_combined_chimera_removed.fna -o "${OTUFILES}" -s 0.99
# Pick representative
pick_rep_set.py -i "${OTUFILES}"/16s.insilico_mock_combined_chimera_removed_otus.txt -f "${OTUFILES}"/16s.insilico_mock_combined_chimera_removed.fna \
-m most_abundant -o "${OTUFILES}"/rep_otus.fasta
# Make OTU table
make_otu_table.py -i "${OTUFILES}"/16s.insilico_mock_combined_chimera_removed_otus.txt -o "${OTUFILES}"/16s.insilico_mock.biom
# Convert to tsv
biom convert -i "${OTUFILES}"/16s.insilico_mock.tsv -o test --to-tsv
# UCLUST workflow ---------------------------------------------------------
library(tidyverse)
library(cowplot)
library(ggpubr)
setwd("~/Downloads/alma_original_test")
uclust_df <- read_tsv("16s.insilico_mock.tsv", col_names = T)
uclust_16S_summary <- uclust_df%>% dplyr::rename(otu_name = `#OTU ID`) %>%
gather(label, count, -otu_name) %>%
tbl_df %>%
group_by(label) %>%
mutate(rel_abun = count/sum(count)) %>%
arrange(otu_name, label) %>%
filter(count > 0)
uclust_16S_prevalence <- uclust_16S_summary %>%
dplyr::select(label,otu_name, count) %>%
group_by(otu_name) %>%
summarise(prev = sum(count >0), total_counts = sum(count))
uclust_16S_abs_singletons <- uclust_16S_prevalence %>% dplyr::filter(total_counts <= 1, prev <= 1)
uclust_16S_abun_singletons <- uclust_16S_prevalence %>% dplyr::filter(total_counts > 1, prev <= 1)
uclust_16S_abs_singletons_names <- uclust_16S_abs_singletons$otu_name
uclust_16S_abun_doubletons <- uclust_16S_prevalence %>% filter(!(otu_name %in% uclust_16S_abs_singletons_names)) %>% dplyr::filter( prev <= 2)
# Remove absolute singletons
uclust_16S_prevalence <- uclust_16S_prevalence %>% filter(!(otu_name %in% uclust_16S_abs_singletons_names))
uclust_16S_counts <- data.frame(class = c("Total ASVs","Absolute singletons", "Abundant singletons"),
counts = c(length(unique(uclust_16S_summary$otu_name)),
dim(uclust_16S_abs_singletons)[1],
dim(uclust_16S_abun_singletons)[1]))
print(uclust_16S_counts)
uclust_16S_counts$class <- factor(uclust_16S_counts$class, levels=c("Total ASVs","Absolute singletons", "Abundant singletons"))
ggplot(uclust_16S_counts, aes(class, counts)) +
geom_bar(stat = "identity") + theme_bw() +
xlab("") + ylab("# ASVs")
# VS ref ------------------------------------------------------------------
# Get mock communities ----------------------------------------------------
# EVEN ERR2098507
# STAG ERR2098508
uclust_mock <- uclust_16S_summary %>%
filter(otu_name %in% uclust_16S_prevalence$otu_name) %>%
filter(grepl("Even", label) | grepl("Staggered", label))
uclust_mock_even <- uclust_mock %>%
filter(grepl("Even", label)) %>%
ungroup %>%
select(otu_name, count, label)
uclust_mock_stag <- uclust_mock %>%
filter(grepl("Staggered", label)) %>%
ungroup %>%
select(otu_name, count, label)
# Compare references
# usearch10.0.240_i86osx32 -usearch_global rep_otus.fasta \
# -db references_even_mock_noprimers.fasta
# -strand both -id 0.5 -blast6out otu_rep_vs_even.tsv
alma_mock <- read_tsv("alma_mock_data.txt", col_names = TRUE)
uclust_even_vs_ref <- read_tsv("otu_rep_vs_even.tsv", col_names = F) %>%
select(X1, X2, X3) %>%
dplyr::rename(otu_name = X1, ref = X2, id = X3) %>%
separate(otu_name,into = "otu_name", sep = " ", remove = TRUE, extra = "drop") %>%
inner_join(uclust_mock_even)
uclust_even_den <- ggplot(uclust_even_vs_ref, aes(id/100, fill = label, color = label)) +
geom_density(alpha = 0.6) +
theme_light() +
xlab("Identity") +
ylab("Density") +
scale_x_continuous(labels = scales::percent) +
ggtitle("uclust")
uclust_even_vs_ref_100 <- uclust_even_vs_ref %>%
select(-otu_name) %>%
group_by(label) %>%
mutate(N=sum(count),prop = count/N) %>%
filter(id >= 97) %>%
group_by(ref, label) %>%
summarise(prop = sum(prop)) %>%
ungroup() %>% group_by(ref) %>%
summarise(prop = mean((prop))) %>%
right_join(alma_mock %>% filter(!is.na(exp_even))) %>%
mutate(exp_even = exp_even/100, obs_even = obs_even/100) %>%
select(-contains("Stag"), -ref) %>%
dplyr::rename(emose_dada_obs = prop, alma_exp = exp_even, alma_obs = obs_even) %>%
gather(exp,values, c(emose_dada_obs, alma_exp, alma_obs)) %>%
ungroup() #%>% filter(!is.na(label))
uclust_even_vs_ref_100$ref_name <- factor(uclust_even_vs_ref_100$ref_name, levels = rev(sort(unique(uclust_even_vs_ref_100$ref_name))))
uclust_even_vs_ref_100_comp <- ggplot(uclust_even_vs_ref_100, aes(ref_name, values, fill = exp)) +
geom_bar(position = "dodge", stat = "identity") +
coord_flip() +
theme_light() +
theme(legend.position = "top",
legend.title = element_blank()) +
scale_fill_manual(values = rev(c( "#1B9E77", "#D95F02" ,"#7570B3"))) +
ylab("Proportion (hits >= 97%)") +
xlab("") +
scale_y_continuous(labels = scales::percent)
# Staggered
# Compare references
# usearch10.0.240_i86osx32 -usearch_global rep_otus.fasta \
# -db references_staggered_mock_noprimers.fasta
# -strand both -id 0.5 -blast6out otu_rep_vs_stag.tsv
uclust_stag_vs_ref <- read_tsv("otu_rep_vs_stag.tsv", col_names = F) %>%
select(X1, X2, X3) %>%
dplyr::rename(otu_name = X1, ref = X2, id = X3) %>%
separate(otu_name,into = "otu_name", sep = " ", remove = TRUE, extra = "drop") %>%
inner_join(uclust_mock_stag)
uclust_stag_den <- ggplot(uclust_stag_vs_ref, aes(id/100, fill = label, color = label)) +
geom_density(alpha = 0.6) +
theme_light() +
xlab("Identity") +
ylab("Density") +
scale_x_continuous(labels = scales::percent) +
ggtitle("UCLUST")
uclust_stag_vs_ref_100 <- uclust_stag_vs_ref %>%
select(-otu_name) %>%
group_by(label) %>%
mutate(N=sum(count),prop = count/N) %>%
filter(id >= 97) %>%
group_by(ref, label) %>%
summarise(prop = sum(prop)) %>%
ungroup() %>% group_by(ref) %>%
summarise(prop = mean((prop))) %>%
right_join(alma_mock) %>%
mutate(exp_stag = exp_stag/100, obs_stag = obs_stag/100) %>%
select(-contains("even"), -ref) %>%
dplyr::rename(emose_dada_obs = prop, alma_exp = exp_stag, alma_obs = obs_stag) %>%
gather(exp,values, c(emose_dada_obs, alma_exp, alma_obs)) %>%
ungroup()
uclust_stag_vs_ref_100$ref_name <- factor(uclust_stag_vs_ref_100$ref_name, levels = rev(sort(unique(uclust_stag_vs_ref_100$ref_name))))
uclust_stag_vs_ref_100_comp <- ggplot(uclust_stag_vs_ref_100, aes(ref_name, values, fill = exp)) +
geom_bar(position = "dodge", stat = "identity") +
coord_flip() +
theme_light() +
theme(legend.position = "top",
legend.title = element_blank()) +
scale_fill_manual(values = rev(c( "#1B9E77", "#D95F02" ,"#7570B3"))) +
ylab("Proportion (hits >= 97%)") +
xlab("") +
scale_y_continuous(labels = scales::percent)
all_combined_stag <- bind_rows(uclust_stag_vs_ref_100 %>%
spread(exp, values) %>%
select(ref_name, emose_dada_obs, alma_exp) %>%
dplyr::rename(X = emose_dada_obs, Y = alma_exp) %>%
mutate(exp = "UCLUST"),
uclust_stag_vs_ref_100 %>%
spread(exp, values) %>%
select(ref_name, alma_obs, alma_exp) %>%
dplyr::rename(X = alma_obs, Y = alma_exp) %>%
mutate(exp = "Parada et al. 2015"))
all <- ggpubr::ggscatter(all_combined_stag, x = "X", y = "Y",
add = "reg.line", # Add regression line
conf.int = TRUE, # Add confidence interval
color = "exp",
palette = get_palette(c("#00AFBB", "#FC4E07"), 3), # Color by groups "cyl"
#shape = "exp", # Change point shape by groups "cyl"
fullrange = TRUE,
alpha = 0.9, # Extending the regression line
facet.by = "exp"
#rug = TRUE
)+ stat_cor(aes(color = exp)) +
scale_y_log10(labels = scales::percent) +
scale_x_log10(labels = scales::percent) +
ylab("515F-Y/926R Observed abundance") +
xlab("Expected abundance") +
theme_light() +
theme(legend.position = "none")
ggdraw() +
draw_plot(uclust_even_vs_ref_100_comp, x = 0, y = .5, width = .5, height = .5) +
draw_plot(uclust_stag_vs_ref_100_comp, x = .5, y = .5, width = .5, height = .5) +
draw_plot(all, x = 0, y = 0, width = 1, height = 0.5) +
draw_plot_label(label = c("A", "B", "C"), size = 15,
x = c(0, 0.5, 0), y = c(1, 1, 0.5))
以上是关于r 用于分析原始alma数据集的脚本的主要内容,如果未能解决你的问题,请参考以下文章