sh 创建TARA子采样(在nt级别)读取文件

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了sh 创建TARA子采样(在nt级别)读取文件相关的知识,希望对你有一定的参考价值。

#!/bin/bash

# 1. We read a folder and take all fastq files
# 2. We quality trim and remove short sequences
# 3. Run kaiju and generate output

#. "${MODULESHOME}"/init/bash
set -x
set -e
main(){
    trap clean_ok EXIT
    
    NSLOTS=${SLURM_JOB_CPUS_PER_NODE}
    NSUB=2000000
    ITER=RUN
    NAM=NOM_${ITER}
    IDIR=/data/oerc-microb3/afernandezguerra/tara_nogs/input
    ODIR=/data/oerc-microb3/afernandezguerra/tara_nogs/results/${ITER}/NOM
    if [ -d ${ODIR} ]; then
	rm -rf ${ODIR}/${NAM}* ${ODIR}/tmp*
    fi
    mkdir -p ${ODIR}
    cd ${ODIR}
    #    NAM=$(awk -v L="${SGE_TASK_ID}" 'NR==L' "${FFILE}")

    
    VSEARCH=vsearch
    BBDUK=bbduk.sh
    ADAP=~/opt/bbmap/resources/tara.fa.gz
    PQUAL=33

    # Original fastq files
    PE1RAW="${IDIR}"/NOM_1.fastq.gz
    PE2RAW="${IDIR}"/NOM_2.fastq.gz

    # Subsampled fastq files
    PE1="${ODIR}"/"${NAM}"_1.sub.fastq
    PE2="${ODIR}"/"${NAM}"_2.sub.fastq

    # Merging files
    ME="${ODIR}"/"${NAM}".merged.fastq
    NM1="${ODIR}"/"${NAM}"_1.nm.fastq
    NM2="${ODIR}"/"${NAM}"_2.nm.fastq

    # Quality trimming files
    MEQC="${ODIR}"/"${NAM}".merged.qc.fastq.gz
    NM1QC="${ODIR}"/"${NAM}"_1.nm.qc.fastq.gz
    NM2QC="${ODIR}"/"${NAM}"_2.nm.qc.fastq.gz
    PESQC="${ODIR}"/"${NAM}"_2.ms.qc.fastq.gz
    
    SR="${ODIR}"/"${NAM}".qc.fastq.gz
    FA="${MEQC/.merged.qc.fastq.gz/.merged.fasta}"
    
    LOG="${ODIR}"/"${NAM}".log
    
    if [ -f "${LOG}" ]; then rm "${LOG}"; else touch "${LOG}"; fi
    
    printf "Crunching sample %s using %s threads\n\n" "${NAM}" "${NSLOTS}" | tee "${LOG}"

    printf "Sampling ${NSUB} reads to estimate the read length and quality encoding..." | tee -a "${LOG}"
    printf "\n\n" >> "${LOG}"
    subsample_reads
    check_length
    #    guess_quality
    printf "done.\n" | tee -a "${LOG}"


    printf "Read length: %s nt\n\n" "${SLEN}" | tee -a "${LOG}"
    
    printf "Merging reads..." | tee -a "${LOG}"
    printf "\n\n" >> "${LOG}"
    merge_reads 
    printf "done\n\n" | tee -a "${LOG}"
    
    printf "Quality trimming with BBDUK (Q20) and min length >= 75..." | tee -a "${LOG}"
    printf "\n\n" >> "${LOG}"
    quality_trim_paired
    quality_trim_single
    concat_fastq
    printf "done\n\n" | tee -a "${LOG}"

    #	printf "Taxonomic profile with Kaiju..." | tee -a "${LOG}"
    #	taxonomy_profiler
    #	printf "done.\n\n" | tee -a "${LOG}"

    printf "Gene prediction with FragGeneScan..." | tee -a "${LOG}"
    #	gene_prediction
    nog_mmseqs
    printf "done.\n\n" | tee -a "${LOG}"
    
    #	printf "Concatenating and dereplicating fasta files..." | tee -a "${LOG}"
    #	dereplicate
    #	printf "done.\n\n" | tee -a "${LOG}"
    
    printf "Cleaning files..." | tee -a "${LOG}"
    #	clean_ok
    printf "done.\n\n" | tee -a "${LOG}"
    
    printf "%s status: DONE" "${NAM}" >> "${LOG}"
 }

check_length(){
    SLEN=$(cat "${PE1}" | head -n 40000 | awk 'BEGIN{N=0}NR%4==2{N=N+length($0)}END{printf("%.0f", N/10000)}')
}

guess_quality(){ 
    PQUALG=$("${VSEARCH}" --fastq_chars "${PE1}" 2>&1 >/dev/null | grep 'fastq_ascii' | sed -e 's/Guess: //g')
    PQUAL=$(echo "${PQUALG}" | awk '{print $6}')
    #rm tmp.fastq
}

subsample_reads(){
    SEED=$RANDOM
    #reformat.sh in=${PE1RAW} in2=${PE2RAW} samplereadstarget=${NSUB} out=${PE1} out2=${PE2} &>> ${LOG}
    seqtk sample -s${SEED} ${PE1RAW} ${NSUB} > ${PE1} &
    seqtk sample -s${SEED} ${PE2RAW} ${NSUB} > ${PE2} &
    wait
}

merge_reads(){
    "${VSEARCH}" --fastq_mergepairs "${PE1}" --reverse "${PE2}" --fastqout "${ME}" --fastqout_notmerged_fwd "${NM1}" --fastqout_notmerged_rev "${NM2}" --threads "${NSLOTS}" --fastq_allowmergestagger --notrunclabels -fastq_qmin 0 -fastq_qmax 41 -fastq_ascii 33 &>> "${LOG}"
    rm "${PE1}" "${PE2}"
}

quality_trim_paired(){
    "${BBDUK}" in="${NM1}" in2="${NM2}" out1="${NM1QC}" out2="${NM2QC}" outs="${PESQC}" qin="${PQUAL}" qtrim=rl trimq=20 ktrim=r k=25 mink=11 hdist=1 tbo tpe maxns=0 threads="${NSLOTS}" ref="${ADAP}" maxns=0 overwrite=true minlen=75 &>> "${LOG}"
    rm "${NM1}" "${NM2}"
}

quality_trim_single(){    
    "${BBDUK}" in="${ME}" out="${MEQC}" qin="${PQUAL}" qtrim=rl trimq=20 ktrim=r k=25 mink=11 hdist=1 tbo tpe maxns=0 threads="${NSLOTS}" ref="${ADAP}" maxns=0 overwrite=true minlen=75 &>> "${LOG}"
    rm "${ME}"
}

concat_fastq(){
    cat "${PESQC}" >> "${MEQC}"
}

gene_prediction(){
    reformat.sh in=${MEQC} out=${FA}
    #    ~/opt/FragGeneScan1.30/run_FragGeneScan.pl -genome=${FA} -out="${NAM}" -complete=0 -train=illumina_5 -thread="${NSLOTS}" &>> "${LOG}"
    #    ~/opt/FragGeneScanPlus/FGS+ -s ${FA} -o ${NAM} -w 0 -p ${NSLOTS} -e 0 -d 0 -m 100000 -r ${HOME}/opt/FragGeneScanPlus/train -t illumina_5
}

clean_ok(){
    rm -rf "${MEQC}" "${NM1QC}" "${NM2QC}" "${PESQC}" "${SR}" "${FA}" "${PE1}" "${PE2}" "${ME}" "${NM1}" "${NM2}" ${TMP} ${ODIR}/${NAM}_DB_aa* ${ODIR}/${NAM}_DB_orf*
    cd ..
    tar cvfz ${NAM}.tar.gz ${ODIR}
    rm -rf ${ODIR}
}

clean_skip(){
    rm "${PE1}" "${PE2}"
}

nog_mmseqs(){
    TMP=$(mktemp -d -p ${ODIR})
    reformat.sh in=${MEQC} out=${FA}
    mmseqs createdb ${FA} ${ODIR}/${NAM}_DB
    mmseqs extractorfs ${ODIR}/${NAM}_DB ${ODIR}/${NAM}_DB_orf --longest-orf --min-length 30 --max-length 48000
    mmseqs translatenucs  ${ODIR}/${NAM}_DB_orf ${ODIR}/${NAM}_DB_aa
    mmseqs search ${ODIR}/${NAM}_DB_aa /data/oerc-microb3/afernandezguerra/biodb/bactArNOGDB ${ODIR}/${NAM}_eggnog $TMP --max-seqs 50 --threads ${NSLOTS} -a -e 1e-5
}

main "$@"

以上是关于sh 创建TARA子采样(在nt级别)读取文件的主要内容,如果未能解决你的问题,请参考以下文章

读取具有多个名称空间的子节点

如何在 R 中对 SpatialPointsDataFrame 进行子采样

在 Linux Mint 19 Tara 上安装 Docker [关闭]

在目录中搜索文件并列出它们的名称和路径 - 两个级别的子文件夹

如何使用textureLod在glsl中对mip级别进行采样?

使用 Dask 从 CSV 文件中采样确切的行数