sh 获取所有装配统计信息
Posted
tags:
篇首语：本文由小常识网（cha138.com）小编为大家整理，主要介绍了“sh 获取所有装配统计信息”相关的知识，希望对你有一定的参考价值。
# Get number of contigs
# For every gzipped >500 bp SPAdes contig file found below the contigs/
# directory, write one row to stats/OSD2014_numer_contigs.txt:
#   sample <TAB> number of sequences <TAB> total residues <TAB> largest sequence
# The three values come from the "Number", "Total" and "Largest" lines of the
# seqstat report for that file.

# summarize_seqstat NAME
# Reads a seqstat report on stdin and prints
# "NAME<TAB>number<TAB>total<TAB>largest". Nothing is printed for report lines
# that are absent, so an empty report yields no output.
summarize_seqstat() {
  awk -v N="$1" '
    # Blanks are removed first so that "Label:   value" splits on ":" into
    # the label (a[1]) and the bare value (a[2]).
    /Number/  { gsub(" ", ""); split($0, a, ":"); printf "%s\t%s", N, a[2] }
    /Total/   { gsub(" ", ""); split($0, a, ":"); printf "\t%s\t", a[2] }
    /Largest/ { gsub(" ", ""); split($0, a, ":"); printf "%s\n", a[2] }
  '
}

# The redirect below fails outright when the output directory is missing.
mkdir -p stats

find /bioinf/projects/osd/main/2014/06/analysis-results/assembly-evaluation/mapping/bySample/contigs/ -name "OSD*spades*contigs.gt500.fa.gz" \
  | while IFS= read -r contigs_gz; do
      # Sample label = file name without the fixed assembly suffix.
      sample=$(basename "$contigs_gz" _spades_contigs.gt500.fa.gz)
      zcat "$contigs_gz" | seqstat - | summarize_seqstat "$sample"
    done > stats/OSD2014_numer_contigs.txt
# Get number of ORFs
# For every predicted-protein FASTA of the >500 bp assemblies, write one row
# "sample<TAB>count" to OSD2014_number_orfs.txt.

# orf_count_row FASTA
# Prints "sample<TAB>count" for one FASTA file. The sample label is the file
# name without the "_spades_contigs.gt500.aa.fasta" suffix; the count is the
# number of lines containing ">".
orf_count_row() {
  local fasta=$1
  local sample count
  sample=$(basename "$fasta" _spades_contigs.gt500.aa.fasta)
  count=$(LC_ALL=C grep -c '>' -- "$fasta")
  # Fixed format string: the sample label is data, never a printf format.
  printf '%s\t%s\n' "$sample" "$count"
}

find /bioinf/projects/osd/main/2014/06/analysis-results/assembly-evaluation/mapping/bySample/BWA/results/ -name "*_spades_contigs.gt500.aa.fasta" \
  | while IFS= read -r fasta; do
      orf_count_row "$fasta"
    done > OSD2014_number_orfs.txt
# n50
# For every gzipped >500 bp SPAdes contig file, write one row
# "sample<TAB>N50" to OSD2014_n50.txt. Contig lengths are taken from infoseq
# (length column only); its first output line is dropped as the heading.

# n50_from_lengths NAME
# Reads sequence lengths (one per line) on stdin and prints "NAME<TAB>N50".
# N50 is found by walking the lengths from longest to shortest and stopping at
# the first one where the running total reaches half of the overall total.
# An ascending walk with ">=" picks the wrong (shorter) contig whenever the
# running total lands exactly on the halfway point, hence the descending sort.
# Empty input prints "NAME<TAB>" with an empty value.
n50_from_lengths() {
  local sample=$1
  local awk_bin=awk
  # Use mawk when it is installed, plain awk otherwise.
  if command -v mawk >/dev/null 2>&1; then
    awk_bin=mawk
  fi
  LC_ALL=C sort -rn | "$awk_bin" -v N="$sample" '
    { len[n++] = $1; total += $1 }
    END {
      if (n == 0) {
        print N "\t"
      } else {
        for (j = 0; j < n; j++) {
          acc += len[j]
          if (acc >= total / 2) { print N "\t" len[j]; break }
        }
      }
    }'
}

find /bioinf/projects/osd/main/2014/06/analysis-results/assembly-evaluation/mapping/bySample/contigs/ -name "OSD*_spades*contigs.gt500.fa.gz" \
  | while IFS= read -r contigs_gz; do
      sample=$(basename "$contigs_gz" _spades_contigs.gt500.fa.gz)
      zcat "$contigs_gz" \
        | infoseq -only -length -stdout -filter \
        | tail -n +2 \
        | n50_from_lengths "$sample"
    done > OSD2014_n50.txt
# Number raw reads QC
# Count the records in the R1 FASTQ of every metagenome, once for the
# "workable" files and once for the raw files. Column 1 of
# osdfile2sample-mg.txt is used as the FASTQ file-name prefix and column 2 as
# the label written to the output. Each output row is "label<TAB>read count".

# count_fastq_reads
# Reads FASTQ text on stdin and prints the number of records, counted as every
# fourth line starting with line 1. Prints 0 for empty input.
count_fastq_reads() {
  local awk_bin=awk
  # Use mawk when it is installed, plain awk otherwise.
  if command -v mawk >/dev/null 2>&1; then
    awk_bin=mawk
  fi
  "$awk_bin" 'BEGIN { reads = 0 } NR % 4 == 1 { reads++ } END { print reads }'
}

# The first redirect below fails outright when the output directory is missing.
mkdir -p stats

# NOTE(review): this pass keeps line 1 of the mapping file (tail -n +1) while
# the raw pass below skips it (tail -n +2). Only one of the two can be right,
# depending on whether osdfile2sample-mg.txt has a header line - confirm and
# align them, otherwise the two tables differ by one row.
cut -f1,2 osdfile2sample-mg.txt | tail -n +1 \
  | while read -r file_prefix sample _; do
      reads=$(zcat "/bioinf/projects/osd/analysis-data/2014/datasets/workable/metagenomes/non-merged/${file_prefix}_R1_shotgun_workable.fastq.gz" | count_fastq_reads)
      printf '%s\t%s\n' "$sample" "$reads"
    done > stats/OSD2014_number_raw_reads_qc.txt

cut -f1,2 osdfile2sample-mg.txt | tail -n +2 \
  | while read -r file_prefix sample _; do
      reads=$(zcat "/bioinf/projects/osd/analysis-data/2014/datasets/raw/metagenomes/${file_prefix}_R1_shotgun_raw.fastq.gz" | count_fastq_reads)
      printf '%s\t%s\n' "$sample" "$reads"
    done > OSD2014_number_raw_reads.txt
# Get number of contigs cov2
# For every gzipped contig file found (following symlinks) below the
# assemblies/bySample/ directory, write one row to
# stats/OSD2014_cov2_numer_contigs.txt:
#   sample <TAB> number of sequences <TAB> total residues <TAB> largest sequence
# The three values come from the "Number", "Total" and "Largest" lines of the
# seqstat report for that file.

# cov2_sample_name PATH
# Prints the sample label: the file name without "_spades.contigs.fasta.gz".
# The suffix has to match the files selected by the find pattern below
# ("*contigs.fasta.gz"); otherwise basename leaves the whole file name in
# place and the row key no longer lines up with the other cov2 tables.
cov2_sample_name() {
  basename "$1" _spades.contigs.fasta.gz
}

# summarize_seqstat NAME
# Reads a seqstat report on stdin and prints
# "NAME<TAB>number<TAB>total<TAB>largest". Nothing is printed for report lines
# that are absent, so an empty report yields no output.
summarize_seqstat() {
  awk -v N="$1" '
    # Blanks are removed first so that "Label:   value" splits on ":" into
    # the label (a[1]) and the bare value (a[2]).
    /Number/  { gsub(" ", ""); split($0, a, ":"); printf "%s\t%s", N, a[2] }
    /Total/   { gsub(" ", ""); split($0, a, ":"); printf "\t%s\t", a[2] }
    /Largest/ { gsub(" ", ""); split($0, a, ":"); printf "%s\n", a[2] }
  '
}

# The redirect below fails outright when the output directory is missing.
mkdir -p stats

find -L /bioinf/projects/osd/analysis-data/2014/assemblies/bySample/ -name "*contigs.fasta.gz" \
  | while IFS= read -r contigs_gz; do
      zcat "$contigs_gz" | seqstat - | summarize_seqstat "$(cov2_sample_name "$contigs_gz")"
    done > stats/OSD2014_cov2_numer_contigs.txt
# n50 cov2
# For every gzipped contig file found (following symlinks) below the
# assemblies/bySample/ directory, write one row "sample<TAB>N50" to
# stats/OSD2014_n50_cov2.txt. Contig lengths are taken from infoseq (length
# column only); its first output line is dropped as the heading.

# n50_from_lengths NAME
# Reads sequence lengths (one per line) on stdin and prints "NAME<TAB>N50".
# N50 is found by walking the lengths from longest to shortest and stopping at
# the first one where the running total reaches half of the overall total.
# An ascending walk with ">=" picks the wrong (shorter) contig whenever the
# running total lands exactly on the halfway point, hence the descending sort.
# Empty input prints "NAME<TAB>" with an empty value.
n50_from_lengths() {
  local sample=$1
  local awk_bin=awk
  # Use mawk when it is installed, plain awk otherwise.
  if command -v mawk >/dev/null 2>&1; then
    awk_bin=mawk
  fi
  LC_ALL=C sort -rn | "$awk_bin" -v N="$sample" '
    { len[n++] = $1; total += $1 }
    END {
      if (n == 0) {
        print N "\t"
      } else {
        for (j = 0; j < n; j++) {
          acc += len[j]
          if (acc >= total / 2) { print N "\t" len[j]; break }
        }
      }
    }'
}

# The redirect below fails outright when the output directory is missing.
mkdir -p stats

find -L /bioinf/projects/osd/analysis-data/2014/assemblies/bySample/ -name "*contigs.fasta.gz" \
  | while IFS= read -r contigs_gz; do
      sample=$(basename "$contigs_gz" _spades.contigs.fasta.gz)
      zcat "$contigs_gz" \
        | infoseq -only -length -stdout -filter \
        | tail -n +2 \
        | n50_from_lengths "$sample"
    done > stats/OSD2014_n50_cov2.txt
# Number of ORFS cov2
# For every gzipped predicted-protein FASTA found (following symlinks) below
# the assemblies/bySample/ directory, write one row "sample<TAB>count" to
# OSD2014_number_orfs_cov2.txt.

# cov2_orf_count_row FASTA_GZ
# Prints "sample<TAB>count" for one gzipped FASTA file. The sample label is
# the file name without the "_spades.orfs.aa.fasta.gz" suffix; the count is
# the number of lines containing ">" in the decompressed content.
cov2_orf_count_row() {
  local fasta_gz=$1
  local sample count
  sample=$(basename "$fasta_gz" _spades.orfs.aa.fasta.gz)
  count=$(LC_ALL=C zgrep -c '>' "$fasta_gz")
  # Fixed format string: the sample label is data, never a printf format.
  printf '%s\t%s\n' "$sample" "$count"
}

find -L /bioinf/projects/osd/analysis-data/2014/assemblies/bySample/ -name "*spades.orfs.aa.fasta.gz" \
  | while IFS= read -r fasta_gz; do
      cov2_orf_count_row "$fasta_gz"
    done > OSD2014_number_orfs_cov2.txt
# Combine the per-sample tables side by side into OSD2014_all_stats.tsv.
# Intended columns:
# Sample num_raw_PE_reads num_QC_PE_reads num_contigs contig_length largest_contig mapped_reads mapped_proper_reads N50 num_predicted_genes num_contigs_cov2 contig_length_cov2 largest_contig_cov2 N50_cov2 num_predicted_genes_cov2
#
# Each input is sorted and the files are then joined line by line with paste.
# The QC read counts, both contig tables and the cov2 N50 table are written
# under stats/ by the commands above, so they are read from there; the other
# tables are written to the current directory.
# OSD2014_bwa_mapping_stats.txt is not produced by any command in this file.
# NOTE(review): paste matches rows purely by position and copies every column
# of every input, so rows only line up if all inputs contain the same set of
# row labels, and the label column of each input is repeated in the output -
# confirm this is acceptable or switch to a keyed join.
paste \
  <(sort OSD2014_number_raw_reads.txt) \
  <(sort stats/OSD2014_number_raw_reads_qc.txt) \
  <(sort stats/OSD2014_numer_contigs.txt) \
  <(sort OSD2014_bwa_mapping_stats.txt) \
  <(sort OSD2014_n50.txt) \
  <(sort OSD2014_number_orfs.txt) \
  <(sort stats/OSD2014_cov2_numer_contigs.txt) \
  <(sort stats/OSD2014_n50_cov2.txt) \
  <(sort OSD2014_number_orfs_cov2.txt) \
  > OSD2014_all_stats.tsv
以上是关于“sh 获取所有装配统计信息”的主要内容，如果未能解决你的问题，请参考以下文章：