003生信人必练

Posted 2020-08-30 thinkanddo

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了003生信人必练相关的知识，希望对你有一定的参考价值。

gtf 文件

序列的编号	注释信息的来源	注释信息的类型	开始与结束的位置	得分	序列的方向	起始编码的位置，仅对CDS有效	注释信息描述
11	ensembl_havana	gene	5422111 5423206	"." 表示为空	+ 表示正义链，- 表示反义链，? 表示未知	有效值为 0、1、2	键 + 值

11    ensembl_havana  gene   5422111 5423206 .    +    .    gene_id "ENSG00000167360"; gene_version "4"; gene_name "OR51Q1"; gene_source "ensembl_havana"; gene_biotype "protein_coding";
11    ensembl_havana  transcript    5422111 5423206 .    +    .    gene_id "ENSG00000167360"; gene_version "4"; transcript_id "ENST00000300778"; transcript_version "4"; gene_name "OR51Q1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR51Q1-001"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS31381";
11    ensembl_havana  exon   5422111 5423206 .    +    .    gene_id "ENSG00000167360"; gene_version "4"; transcript_id "ENST00000300778"; transcript_version "4"; exon_number "1"; gene_name "OR51Q1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR51Q1-001"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS31381"; exon_id "ENSE00001276439"; exon_version "4";
11    ensembl_havana  CDS    5422201 5423151 .    +    0    gene_id "ENSG00000167360"; gene_version "4"; transcript_id "ENST00000300778"; transcript_version "4"; exon_number "1"; gene_name "OR51Q1"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR51Q1-001"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS31381"; protein_id "ENSP00000300778"; protein_version "4";

探究内容

import sys
import re

# sys.argv is the list of command-line argument strings; element 0 is the
# path of the script itself.
args = sys.argv


class Genome_info:
    """Base record shared by every feature type (gene, transcript, exon)."""

    def __init__(self):
        # Chromosome identifier, then the start and end coordinates.
        self.chr, self.start, self.end = "", 0, 0


class Gene(Genome_info):
    """Gene record; extends the base record with a strand and an identifier."""

    def __init__(self):
        super().__init__()
        # Strand symbol and gene identifier, both empty until filled in.
        self.orientation, self.id = "", ""


class Transcript(Genome_info):
    """Transcript record; extends the base record with its own id and a parent id."""

    def __init__(self):
        super().__init__()
        # Transcript identifier and the identifier of its parent record.
        self.id, self.parent = "", ""


class Exon(Genome_info):
    """Exon record; extends the base record with the id of its parent record."""

    def __init__(self):
        super().__init__()
        # Identifier of the parent record, empty until filled in.
        self.parent = ""


def _first_group(pattern, text):
    """Return capture group 1 of ``pattern`` searched in ``text``, or None if no match."""
    found = pattern.search(text)
    return found.group(1) if found else None


def main(args):
    """Parse a GTF file and write summary statistics for its protein-coding records.

    Only records whose attribute column contains ``protein_coding`` are used.
    Genes are indexed by gene_id and transcripts by transcript_id; a transcript
    is kept only if its gene was already seen, and an exon only if its
    transcript was already seen.  The collected records are then passed to
    chr_gene, gene_len, gene_transcript, transcript_exon and exon_pos.

    :param args: argv-style list; ``args[1]`` is the path of the GTF file
    :return: None
    """
    if len(args) < 2:
        sys.exit("usage: <script> <annotation.gtf>")

    # Compile once instead of re-parsing the patterns on every line.
    gene_id_re = re.compile(r'gene_id "([^;]+)";?')
    transcript_id_re = re.compile(r'transcript_id "([^;]+)";?')

    list_gene = {}          # gene_id -> Gene
    list_transcript = {}    # transcript_id -> Transcript
    list_exon = []
    with open(args[1]) as fp_gtf:
        for line in fp_gtf:
            if line.startswith("#"):    # header / comment lines carry no record
                continue
            fields = line.strip("\n").split("\t")
            if len(fields) < 9:         # blank or truncated line: not a full record
                continue
            attr = fields[8]
            if "protein_coding" not in attr:
                continue

            seq_name = fields[0]
            feature = fields[2]
            start = int(fields[3])
            end = int(fields[4])
            orientation = fields[6]

            if feature == "gene":
                gene_id = _first_group(gene_id_re, attr)
                if gene_id is None:
                    continue
                gene = Gene()
                gene.chr = seq_name
                gene.start = start
                gene.end = end
                gene.id = gene_id
                gene.orientation = orientation
                list_gene[gene_id] = gene
            elif feature == "transcript":
                transcript_id = _first_group(transcript_id_re, attr)
                parent = _first_group(gene_id_re, attr)
                # Skip transcripts without an id or whose gene was not recorded.
                if transcript_id is None or parent not in list_gene:
                    continue
                transcript = Transcript()
                transcript.chr = seq_name
                transcript.start = start
                transcript.end = end
                transcript.id = transcript_id
                transcript.parent = parent
                list_transcript[transcript_id] = transcript
            elif feature == "exon":
                parent = _first_group(transcript_id_re, attr)
                # Skip exons whose transcript was not recorded.
                if parent not in list_transcript:
                    continue
                exon = Exon()
                exon.chr = seq_name
                exon.start = start
                exon.end = end
                exon.parent = parent
                list_exon.append(exon)

    chr_gene(list_gene)
    gene_len(list_gene)
    gene_transcript(list_transcript)
    transcript_exon(list_exon)
    exon_pos(list_exon)


def chr_gene(list_gene):
    """Count genes per chromosome and write the counts to ``chr_gene.txt``.

    Each output line is ``<chromosome>\\t<count>``; every line is also echoed
    to stdout.

    :param list_gene: mapping of gene id to a record exposing a ``chr`` attribute
    :return: None
    """

    print("染色体上基因数量分布")
    count_gene = {}     # chromosome -> number of genes
    for info in list_gene.values():
        count_gene[info.chr] = count_gene.get(info.chr, 0) + 1
    with open("chr_gene.txt", 'w') as fp_out:
        for chrom, num in count_gene.items():
            row = "\t".join([chrom, str(num)]) + "\n"
            print(row)
            fp_out.write(row)


def gene_len(list_gene):
    """Write the length of every gene to ``gene_len.txt``.

    Length is ``end - start + 1``.  Each output line is
    ``<gene_id>\\t<length>``; every line is also echoed to stdout.

    :param list_gene: mapping of gene id to a record exposing ``start`` and ``end``
    :return: None
    """

    print("基因长度分布情况")
    with open("gene_len.txt", 'w') as fp_out:
        for gene_id, info in list_gene.items():
            length = info.end - info.start + 1
            row = "\t".join([gene_id, str(length)]) + "\n"
            fp_out.write(row)
            print(row)


def gene_transcript(list_transcript):
    """Count transcripts per gene and write the counts to ``gene_transcript.txt``.

    Each output line is ``<gene_id>\\t<count>``; every line is also echoed to
    stdout.

    :param list_transcript: mapping of transcript id to a record whose
        ``parent`` attribute holds the gene id
    :return: None
    """

    print("基因的转录本数量分布")
    count_transcript = {}   # gene id -> number of transcripts
    for info in list_transcript.values():
        count_transcript[info.parent] = count_transcript.get(info.parent, 0) + 1
    with open("gene_transcript.txt", 'w') as fp_out:
        for gene_id, num in count_transcript.items():
            row = "\t".join([gene_id, str(num)]) + "\n"
            print(row)
            fp_out.write(row)


def transcript_exon(list_exon):
    """Count exons per transcript and write the counts to ``transcript_exon.txt``.

    Each output line is ``<transcript_id>\\t<count>``; every line is also
    echoed to stdout.

    :param list_exon: iterable of records whose ``parent`` attribute holds the
        transcript id
    :return: None
    """

    print("转录本的外显子数量统计")
    count_exon = {}     # transcript id -> number of exons
    for exon in list_exon:
        count_exon[exon.parent] = count_exon.get(exon.parent, 0) + 1
    with open("transcript_exon.txt", 'w') as fp_out:
        for transcript_id, num in count_exon.items():
            row = "\t".join([transcript_id, str(num)]) + "\n"
            print(row)
            fp_out.write(row)


def exon_pos(list_exon):
    """Write the exon coordinates of every transcript to ``exon_pos.txt``.

    Each output line is ``<transcript_id>\\t<start>-<end>,<start>-<end>,...``
    with exons in the order they appear in ``list_exon``; every line is also
    echoed to stdout.

    :param list_exon: iterable of records exposing ``parent`` (transcript id),
        ``start`` and ``end``
    :return: None
    """

    print("外显子坐标统计")
    spans = {}      # transcript id -> list of "start-end" strings
    for exon in list_exon:
        spans.setdefault(exon.parent, []).append("%s-%s" % (exon.start, exon.end))
    with open("exon_pos.txt", 'w') as fp_out:
        for transcript_id, parts in spans.items():
            # Join once per transcript instead of growing a string with +=.
            row = "\t".join([transcript_id, ",".join(parts)]) + "\n"
            print(row)
            fp_out.write(row)


def gene_exon_pos(list_gene, list_transcript, list_exon):
    """
    Placeholder, not implemented yet.

    Planned steps:
    - map every exon to its transcript through the exon's parent
    - map every transcript to its gene through the transcript's parent
    - group genes by chr to obtain the chromosome list

    Then output all exon coordinates of a chosen gene from that chromosome
    list and plot them (exercise five of the bioinformatics programming
    live series).

    :param list_gene:
    :param list_transcript:
    :param list_exon:
    :return:
    """
    return None


if __name__ == "__main__":
    # Run the analysis only when executed as a script; args is the
    # command-line argument list captured at module level.
    main(args)

以上是关于003生信人必练的主要内容，如果未能解决你的问题，请参考以下文章