python 计算任何给定fasta文件的GC内容
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python 计算任何给定fasta文件的GC内容相关的知识,希望对你有一定的参考价值。
#!/usr/bin/python
# Author: Johan Zicola
# Date: 2017-12-16
# Calculate GC content of any given fasta file (can contain >header)
# The file can be gzipped (should then have a .gz extension)
# Standard output provides the GC content per fasta sequence such as:
# file.fa name_fasta CG_content
# In case the name_fasta contains a colon (e.g. chr1:100-200),
# two columns are created such as : chr1 100-200
# If the DNA string does not contain any valid nucleotide (Ns for instance)
# NA is returned as value for GC_content
from __future__ import division
from Bio import SeqIO
import sys
import argparse
import re
import gzip
def calculate_GC_content(my_dna):
# Convert in string and uppercase
#my_dna = str(string_dna.strip())
my_dna = my_dna.upper()
a_content = my_dna.count("A")
t_content = my_dna.count("T")
c_content = my_dna.count("C")
g_content = my_dna.count("G")
# Total_content = a_content + t_content + g_content + c_content
# Short to use length function
# Important to sum the 4 nucleotides to avoid including potential Ns
total_content = a_content + t_content + c_content + g_content
gc = c_content + g_content
# Test if the sequence contains any nucleotides (not only Ns)
# If only Ns, it will return an "division by zero" error
if total_content == 0:
gc_content = "NA"
return gc_content
else:
gc_content = (gc/total_content)
gc_content = round(gc_content, 6)
return gc_content
def main():
# Get DNA sequence from standard input
if len(sys.argv) != 2:
print "Please provide a fasta file as argument"
sys.exit()
name_file = str(sys.argv[1])
if name_file.lower().endswith(".gz"):
handle = gzip.open(name_file, "rt")
fasta_sequences = SeqIO.parse(handle,'fasta')
else:
handle = open(name_file)
fasta_sequences = SeqIO.parse(handle ,'fasta')
for fasta in fasta_sequences:
name, sequence = fasta.id, fasta.seq
# Get name from the accession assumming it is a suite of digits
# and the only one located in the file name
# If no digits in the string, the accession takes the value "unknown"
accession = re.findall("\d+", name_file)
if not accession:
accession="unknown"
else:
accession = accession[0]
GC_content = calculate_GC_content(sequence)
# Check if the name of the sequence contains a colon (so chr1:201-300)
# and split accordingly to a chromosome column and a position column
if re.findall(":", name):
col1 = name.split(":")
chromosome = col1[0]
position = col1[1]
print name_file+"\t"+accession+"\t"+str(chromosome)+"\t"+str(position)+"\t"+str(GC_content)
else:
print name_file+"\t"+accession,"\t"+str(name)+"\t",str(GC_content)
if __name__ == "__main__":
sys.exit(main())
以上是关于python 计算任何给定fasta文件的GC内容的主要内容,如果未能解决你的问题,请参考以下文章
如何在 fasta 文件中并行化计算,其中每个处理器采用一个序列
生物信息学算法之Python实现|Rosalind刷题笔记:005 GC含量计算
python学习——读取染色体长度(七:读取fasta文件)