sh Create TARA subsampled read files (at the nt level)
#!/bin/bash
# 1. Read the raw paired-end FASTQ files for one sample and subsample them
# 2. Merge the pairs, quality-trim and drop short sequences
# 3. Extract ORFs and search them against the bactArNOG (eggNOG) database with MMseqs2
#    (the Kaiju and FragGeneScan steps are kept below but commented out)
#. "${MODULESHOME}"/init/bash
set -x
set -e
main(){
trap clean_ok EXIT
NSLOTS=${SLURM_JOB_CPUS_PER_NODE}  # threads, taken from the SLURM allocation
NSUB=2000000                       # number of reads to subsample from each FASTQ file
ITER=RUN                           # run identifier (NOM/RUN look like per-sample placeholders)
NAM=NOM_${ITER}
IDIR=/data/oerc-microb3/afernandezguerra/tara_nogs/input
ODIR=/data/oerc-microb3/afernandezguerra/tara_nogs/results/${ITER}/NOM
if [ -d ${ODIR} ]; then
rm -rf ${ODIR}/${NAM}* ${ODIR}/tmp*
fi
mkdir -p ${ODIR}
cd ${ODIR}
# NAM=$(awk -v L="${SGE_TASK_ID}" 'NR==L' "${FFILE}")
VSEARCH=vsearch
BBDUK=bbduk.sh
ADAP=~/opt/bbmap/resources/tara.fa.gz  # adapter reference for BBDuk
PQUAL=33                               # Phred quality encoding offset (Phred+33)
# Original fastq files
PE1RAW="${IDIR}"/NOM_1.fastq.gz
PE2RAW="${IDIR}"/NOM_2.fastq.gz
# Subsampled fastq files
PE1="${ODIR}"/"${NAM}"_1.sub.fastq
PE2="${ODIR}"/"${NAM}"_2.sub.fastq
# Merged and not-merged read files
ME="${ODIR}"/"${NAM}".merged.fastq
NM1="${ODIR}"/"${NAM}"_1.nm.fastq
NM2="${ODIR}"/"${NAM}"_2.nm.fastq
# Quality-trimmed files
MEQC="${ODIR}"/"${NAM}".merged.qc.fastq.gz
NM1QC="${ODIR}"/"${NAM}"_1.nm.qc.fastq.gz
NM2QC="${ODIR}"/"${NAM}"_2.nm.qc.fastq.gz
PESQC="${ODIR}"/"${NAM}"_2.ms.qc.fastq.gz  # singletons from paired-end trimming (BBDuk outs=)
SR="${ODIR}"/"${NAM}".qc.fastq.gz
FA="${MEQC/.merged.qc.fastq.gz/.merged.fasta}"
LOG="${ODIR}"/"${NAM}".log
if [ -f "${LOG}" ]; then rm "${LOG}"; else touch "${LOG}"; fi
printf "Crunching sample %s using %s threads\n\n" "${NAM}" "${NSLOTS}" | tee "${LOG}"
printf "Sampling ${NSUB} reads to estimate the read length and quality encoding..." | tee -a "${LOG}"
printf "\n\n" >> "${LOG}"
subsample_reads
check_length
# guess_quality
printf "done.\n" | tee -a "${LOG}"
printf "Read length: %s nt\n\n" "${SLEN}" | tee -a "${LOG}"
printf "Merging reads..." | tee -a "${LOG}"
printf "\n\n" >> "${LOG}"
merge_reads
printf "done\n\n" | tee -a "${LOG}"
printf "Quality trimming with BBDUK (Q20) and min length >= 75..." | tee -a "${LOG}"
printf "\n\n" >> "${LOG}"
quality_trim_paired
quality_trim_single
concat_fastq
printf "done\n\n" | tee -a "${LOG}"
# printf "Taxonomic profile with Kaiju..." | tee -a "${LOG}"
# taxonomy_profiler
# printf "done.\n\n" | tee -a "${LOG}"
printf "Gene prediction with FragGeneScan..." | tee -a "${LOG}"
# gene_prediction
nog_mmseqs
printf "done.\n\n" | tee -a "${LOG}"
# printf "Concatenating and dereplicating fasta files..." | tee -a "${LOG}"
# dereplicate
# printf "done.\n\n" | tee -a "${LOG}"
printf "Cleaning files..." | tee -a "${LOG}"
# clean_ok  # cleanup is handled by the EXIT trap set at the top of main
printf "done.\n\n" | tee -a "${LOG}"
printf "%s status: DONE" "${NAM}" >> "${LOG}"
}
check_length(){
# Average read length over the first 10,000 subsampled reads (40,000 FASTQ lines)
SLEN=$(head -n 40000 "${PE1}" | awk 'BEGIN{N=0}NR%4==2{N=N+length($0)}END{printf("%.0f", N/10000)}')
}
guess_quality(){
PQUALG=$("${VSEARCH}" --fastq_chars "${PE1}" 2>&1 >/dev/null | grep 'fastq_ascii' | sed -e 's/Guess: //g')
PQUAL=$(echo "${PQUALG}" | awk '{print $6}')
#rm tmp.fastq
}
subsample_reads(){
# Use the same random seed for both files so that read pairs stay in sync
SEED=$RANDOM
#reformat.sh in=${PE1RAW} in2=${PE2RAW} samplereadstarget=${NSUB} out=${PE1} out2=${PE2} &>> ${LOG}
seqtk sample -s"${SEED}" "${PE1RAW}" "${NSUB}" > "${PE1}" &
seqtk sample -s"${SEED}" "${PE2RAW}" "${NSUB}" > "${PE2}" &
wait
}
merge_reads(){
"${VSEARCH}" --fastq_mergepairs "${PE1}" --reverse "${PE2}" --fastqout "${ME}" --fastqout_notmerged_fwd "${NM1}" --fastqout_notmerged_rev "${NM2}" --threads "${NSLOTS}" --fastq_allowmergestagger --notrunclabels -fastq_qmin 0 -fastq_qmax 41 -fastq_ascii 33 &>> "${LOG}"
rm "${PE1}" "${PE2}"
}
quality_trim_paired(){
"${BBDUK}" in="${NM1}" in2="${NM2}" out1="${NM1QC}" out2="${NM2QC}" outs="${PESQC}" qin="${PQUAL}" qtrim=rl trimq=20 ktrim=r k=25 mink=11 hdist=1 tbo tpe maxns=0 threads="${NSLOTS}" ref="${ADAP}" maxns=0 overwrite=true minlen=75 &>> "${LOG}"
rm "${NM1}" "${NM2}"
}
quality_trim_single(){
"${BBDUK}" in="${ME}" out="${MEQC}" qin="${PQUAL}" qtrim=rl trimq=20 ktrim=r k=25 mink=11 hdist=1 tbo tpe maxns=0 threads="${NSLOTS}" ref="${ADAP}" maxns=0 overwrite=true minlen=75 &>> "${LOG}"
rm "${ME}"
}
concat_fastq(){
# Append the paired-end singletons to the merged, quality-trimmed reads (concatenated gzip streams are still valid gzip)
cat "${PESQC}" >> "${MEQC}"
}
gene_prediction(){
# Convert the QC'ed FASTQ to FASTA with BBMap's reformat.sh; the FragGeneScan calls are kept below for reference
reformat.sh in="${MEQC}" out="${FA}"
# ~/opt/FragGeneScan1.30/run_FragGeneScan.pl -genome=${FA} -out="${NAM}" -complete=0 -train=illumina_5 -thread="${NSLOTS}" &>> "${LOG}"
# ~/opt/FragGeneScanPlus/FGS+ -s ${FA} -o ${NAM} -w 0 -p ${NSLOTS} -e 0 -d 0 -m 100000 -r ${HOME}/opt/FragGeneScanPlus/train -t illumina_5
}
clean_ok(){
# Remove intermediate files, then archive the per-sample results directory and delete it
rm -rf "${MEQC}" "${NM1QC}" "${NM2QC}" "${PESQC}" "${SR}" "${FA}" "${PE1}" "${PE2}" "${ME}" "${NM1}" "${NM2}" ${TMP} ${ODIR}/${NAM}_DB_aa* ${ODIR}/${NAM}_DB_orf*
cd ..
tar czvf "${NAM}".tar.gz "${ODIR}"
rm -rf ${ODIR}
}
clean_skip(){
rm "${PE1}" "${PE2}"
}
nog_mmseqs(){
TMP=$(mktemp -d -p "${ODIR}")
# Convert the QC'ed FASTQ to FASTA, build an MMseqs2 database, extract and translate the ORFs,
# then search them against the bactArNOG (eggNOG) protein database
reformat.sh in="${MEQC}" out="${FA}"
mmseqs createdb "${FA}" "${ODIR}"/"${NAM}"_DB
mmseqs extractorfs "${ODIR}"/"${NAM}"_DB "${ODIR}"/"${NAM}"_DB_orf --longest-orf --min-length 30 --max-length 48000
mmseqs translatenucs "${ODIR}"/"${NAM}"_DB_orf "${ODIR}"/"${NAM}"_DB_aa
mmseqs search "${ODIR}"/"${NAM}"_DB_aa /data/oerc-microb3/afernandezguerra/biodb/bactArNOGDB "${ODIR}"/"${NAM}"_eggnog "${TMP}" --max-seqs 50 --threads "${NSLOTS}" -a -e 1e-5
}
main "$@"