从gff3文件中获取fasta文件

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了从 gff3 文件中获取 fasta 文件的相关知识,希望对你有一定的参考价值。

chr1A	NRGenome	gene	1157233	1158291	.	+	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.path1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096860.1
chr1A	NRGenome	mRNA	1157233	1158291	.	+	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.mrna1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096860.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.path1;coverage=100.0;identity=100.0;matches=1059;mismatches=0;indels=0;unknowns=0
chr1A	NRGenome	exon	1157233	1158291	100	+	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.mrna1.exon1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096860.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096860.1 1 1059 +
chr1A	NRGenome	CDS	1157233	1158291	100	+	0	ID=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.mrna1.cds1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096860.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096860.1 1 1059 +
chr1A	NRGenome	gene	1159521	1162591	.	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.path1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1
chr1A	NRGenome	mRNA	1159521	1162591	.	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.path1;coverage=100.0;identity=100.0;matches=1527;mismatches=0;indels=0;unknowns=0
chr1A	NRGenome	exon	1162250	1162591	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 1 342 +
chr1A	NRGenome	exon	1161953	1162150	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon2;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 343 540 +
chr1A	NRGenome	exon	1161682	1161859	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon3;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 541 718 +
chr1A	NRGenome	exon	1161377	1161547	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon4;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 719 889 +
chr1A	NRGenome	exon	1160679	1160710	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon5;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 890 921 +
chr1A	NRGenome	exon	1160535	1160577	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon6;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 922 964 +
chr1A	NRGenome	exon	1160392	1160459	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon7;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 965 1032 +
chr1A	NRGenome	exon	1160086	1160127	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon8;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 1033 1074 +
chr1A	NRGenome	exon	1159521	1159973	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon9;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 1075 1527 +
chr1A	NRGenome	CDS	1162250	1162591	100	-	0	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 1 342 +
chr1A	NRGenome	CDS	1161953	1162150	100	-	0	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds2;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 343 540 +
chr1A	NRGenome	CDS	1161682	1161859	100	-	0	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds3;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 541 718 +
chr1A	NRGenome	CDS	1161377	1161547	100	-	1	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds4;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 719 889 +
chr1A	NRGenome	CDS	1160679	1160710	100	-	1	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds5;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 890 921 +
chr1A	NRGenome	CDS	1160535	1160577	100	-	0	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds6;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 922 964 +
chr1A	NRGenome	CDS	1160392	1160459	100	-	1	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds7;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 965 1032 +
chr1A	NRGenome	CDS	1160086	1160127	100	-	0	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds8;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 1033 1074 +
chr1A	NRGenome	CDS	1159521	1159973	100	-	0	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds9;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 1075 1527 +
chr1A	NRGenome	gene	1159521	1162591	.	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.path1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3
chr1A	NRGenome	mRNA	1159521	1162591	.	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.path1;coverage=100.0;identity=100.0;matches=1434;mismatches=0;indels=0;unknowns=0
chr1A	NRGenome	exon	1162546	1162591	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 1 46 +
chr1A	NRGenome	exon	1162250	1162452	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon2;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 47 249 +
chr1A	NRGenome	exon	1161953	1162150	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon3;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 250 447 +
chr1A	NRGenome	exon	1161682	1161859	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon4;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 448 625 +
chr1A	NRGenome	exon	1161377	1161547	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon5;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 626 796 +
chr1A	NRGenome	exon	1160679	1160710	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon6;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 797 828 +
chr1A	NRGenome	exon	1160535	1160577	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon7;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 829 871 +
chr1A	NRGenome	exon	1160392	1160459	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon8;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 872 939 +
chr1A	NRGenome	exon	1160086	1160127	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon9;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 940 981 +
chr1A	NRGenome	exon	1159521	1159973	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon10;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 982 1434 +
chr1A	NRGenome	CDS	1162546	1162591	100	-	0	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 1 46 +
chr1A	NRGenome	CDS	1162250	1162452	100	-	1	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds2;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 47 249 +
chr1A	NRGenome	CDS	1161953	1162150	100	-	0	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds3;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 250 447 +
chr1A	NRGenome	CDS	1161682	1161859	100	-	0	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds4;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 448 625 +
chr1A	NRGenome	CDS	1161377	1161547	100	-	1	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds5;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 626 796 +
chr1A	NRGenome	CDS	1160679	1160710	100	-	1	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds6;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 797 828 +
chr1A	NRGenome	CDS	1160535	1160577	100	-	0	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds7;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 829 871 +
chr1A	NRGenome	CDS	1160392	1160459	100	-	1	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds8;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 872 939 +
chr1A	NRGenome	CDS	1160086	1160127	100	-	0	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds9;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 940 981 +
chr1A	NRGenome	CDS	1159521	1159973	100	-	0	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds10;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 982 1434 +

Python 代码(根据上面的 GFF3 注释,从基因组 fasta 中提取 gene、mRNA、exon、CDS 以及基因上下游序列)

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from Bio import SeqIO

# Extract gene, mRNA, exon, CDS and flanking sequences described by a GFF3
# annotation from an indexed genome FASTA file.
#
# Outputs (all FASTA, headers are the feature's Name attribute):
#   gene.fasta               gene span, introns included
#   mRNA.fasta               mRNA span, introns included
#   exon.fasta               exons of each Name joined in transcript order
#   CDS.fasta                CDS segments of each Name joined in transcript order
#   pro_and_downstream.fasta gene plus 2 kb upstream and 1 kb downstream
#
# Minus-strand features are reverse-complemented so that every record reads
# 5'->3' on the annotated strand.

GENOME_FASTA = "/data2/masw_data/seqdb/chr1A.fasta"
GFF3_PATH = "1.txt"
UPSTREAM = 2000    # bases kept 5' of the gene (promoter side)
DOWNSTREAM = 1000  # bases kept 3' of the gene


def parse_attributes(column):
    """Turn a GFF3 attribute column ("ID=a;Name=b;...") into a dict."""
    attrs = {}
    for item in column.strip().split(";"):
        key, sep, value = item.partition("=")
        if sep:
            attrs[key.strip()] = value.strip()
    return attrs


def parse_gff3(lines):
    """Collect gene, mRNA, exon and CDS coordinates from GFF3 lines.

    *lines* is any iterable of text lines (an open file works).  Blank
    lines, ``#`` comment/directive lines and lines with fewer than nine
    columns are skipped.  Features are keyed by their ``Name`` attribute,
    falling back to ``ID`` when ``Name`` is absent.

    Returns a tuple ``(gene, mrna, exon, cds)``:
      * ``gene`` and ``mrna`` map name -> ``(seqid, start, end, strand)``
      * ``exon`` and ``cds`` map name -> list of such tuples, in file order
    ``start`` and ``end`` are the 1-based inclusive GFF3 coordinates as ints.

    Raises ValueError when one of these features has neither Name nor ID.
    """
    gene, mrna, exon, cds = {}, {}, {}, {}
    for number, line in enumerate(lines, 1):
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        # GFF3 is tab-separated; tolerate space-separated copies as well.
        # Limiting the split keeps the attribute column (which may contain
        # spaces, e.g. "Target=x 1 1059 +") in one piece.
        fields = line.split("\t") if "\t" in line else line.split(None, 8)
        if len(fields) < 9:
            continue
        seqid, feature, strand = fields[0], fields[2], fields[6]
        if feature not in ("gene", "mRNA", "exon", "CDS"):
            continue
        attrs = parse_attributes(fields[8])
        name = attrs.get("Name") or attrs.get("ID")
        if not name:
            raise ValueError("line %d: %s feature has no Name or ID attribute"
                             % (number, feature))
        span = (seqid, int(fields[3]), int(fields[4]), strand)
        if feature == "gene":
            gene[name] = span
        elif feature == "mRNA":
            mrna[name] = span
        elif feature == "exon":
            exon.setdefault(name, []).append(span)
        else:
            cds.setdefault(name, []).append(span)
    return gene, mrna, exon, cds


def flank_window(start, end, strand, upstream=UPSTREAM, downstream=DOWNSTREAM):
    """Return 0-based half-open slice bounds for a feature plus its flanks.

    *start*/*end* are 1-based inclusive.  "Upstream" is relative to the
    feature's strand, so on the minus strand the upstream bases lie to the
    right of *end*.  The lower bound is clamped at 0 so a feature close to
    the start of the sequence cannot produce a negative (wrap-around) index.
    """
    if strand == "-":
        left, right = downstream, upstream
    else:
        left, right = upstream, downstream
    return max(0, start - 1 - left), end + right


def transcript_order(segments):
    """Sort segments 5'->3': ascending start on '+', descending on '-'.

    The strand of the first segment decides the direction.
    """
    descending = segments[0][3] == "-"
    return sorted(segments, key=lambda seg: seg[1], reverse=descending)


def make_fetcher(record_dict):
    """Build ``fetch(seqid, lo, hi, strand)`` over an indexed FASTA.

    The most recently used sequence is kept so that consecutive features on
    the same chromosome do not each trigger a new ``record_dict`` lookup.
    Only one sequence is held at a time to bound memory use.
    """
    cache = {}

    def fetch(seqid, lo, hi, strand):
        if seqid not in cache:
            cache.clear()
            cache[seqid] = record_dict[seqid].seq
        piece = cache[seqid][lo:hi]
        if strand == "-":
            return piece.reverse_complement()
        return piece

    return fetch


def span_entries(features, fetch):
    """Yield (name, sequence) for single-span features (gene, mRNA)."""
    for name, (seqid, start, end, strand) in features.items():
        if strand in ("+", "-"):
            yield name, fetch(seqid, start - 1, end, strand)


def flank_entries(features, fetch):
    """Yield (name, sequence) for each gene including its flanking regions."""
    for name, (seqid, start, end, strand) in features.items():
        if strand in ("+", "-"):
            lo, hi = flank_window(start, end, strand)
            yield name, fetch(seqid, lo, hi, strand)


def spliced_entries(features, fetch):
    """Yield (name, sequence) with all segments of a name joined 5'->3'."""
    for name, segments in features.items():
        if segments[0][3] not in ("+", "-"):
            continue
        pieces = [str(fetch(seqid, start - 1, end, strand))
                  for seqid, start, end, strand in transcript_order(segments)]
        yield name, "".join(pieces)


def write_fasta(path, entries):
    """Write (name, sequence) pairs to *path*, one two-line record each."""
    with open(path, "w") as out:
        for name, sequence in entries:
            out.write(">%s\n%s\n" % (name, sequence))


def main():
    record_dict = SeqIO.index(GENOME_FASTA, "fasta")
    fetch = make_fetcher(record_dict)

    with open(GFF3_PATH, "r") as handle:
        gene, mrna, exon, cds = parse_gff3(handle)

    # gene / mRNA sequences, introns included
    write_fasta("gene.fasta", span_entries(gene, fetch))
    write_fasta("mRNA.fasta", span_entries(mrna, fetch))
    # 2 kb upstream + gene + 1 kb downstream
    write_fasta("pro_and_downstream.fasta", flank_entries(gene, fetch))
    # spliced CDS and exon sequences
    write_fasta("CDS.fasta", spliced_entries(cds, fetch))
    write_fasta("exon.fasta", spliced_entries(exon, fetch))


if __name__ == "__main__":
    main()

  

以上是关于“从 gff3 文件中获取 fasta 文件”的主要内容。如果未能解决你的问题,请参考以下文章:

from gff3 get gene fasta sequence

perl处理fasta文件

从文件中提取特定范围的 fasta 序列

如何在 fasta 文件中并行化计算,其中每个处理器采用一个序列

如何从大fasta文件中找出自己想要的序列

请教如何从FASTA文件中批量查找序列