从gff3文件中获取fasta文件
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了从gff3文件中获取fasta文件相关的知识,希望对你有一定的参考价值。
chr1A NRGenome gene 1157233 1158291 . + . ID=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.path1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096860.1 chr1A NRGenome mRNA 1157233 1158291 . + . ID=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.mrna1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096860.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.path1;coverage=100.0;identity=100.0;matches=1059;mismatches=0;indels=0;unknowns=0 chr1A NRGenome exon 1157233 1158291 100 + . ID=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.mrna1.exon1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096860.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096860.1 1 1059 + chr1A NRGenome CDS 1157233 1158291 100 + 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.mrna1.cds1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096860.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096860.1 1 1059 + chr1A NRGenome gene 1159521 1162591 . - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.path1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 chr1A NRGenome mRNA 1159521 1162591 . - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.path1;coverage=100.0;identity=100.0;matches=1527;mismatches=0;indels=0;unknowns=0 chr1A NRGenome exon 1162250 1162591 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 1 342 + chr1A NRGenome exon 1161953 1162150 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon2;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 343 540 + chr1A NRGenome exon 1161682 1161859 100 - . 
ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon3;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 541 718 + chr1A NRGenome exon 1161377 1161547 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon4;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 719 889 + chr1A NRGenome exon 1160679 1160710 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon5;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 890 921 + chr1A NRGenome exon 1160535 1160577 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon6;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 922 964 + chr1A NRGenome exon 1160392 1160459 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon7;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 965 1032 + chr1A NRGenome exon 1160086 1160127 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon8;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 1033 1074 + chr1A NRGenome exon 1159521 1159973 100 - . 
ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon9;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 1075 1527 + chr1A NRGenome CDS 1162250 1162591 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 1 342 + chr1A NRGenome CDS 1161953 1162150 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds2;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 343 540 + chr1A NRGenome CDS 1161682 1161859 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds3;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 541 718 + chr1A NRGenome CDS 1161377 1161547 100 - 1 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds4;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 719 889 + chr1A NRGenome CDS 1160679 1160710 100 - 1 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds5;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 890 921 + chr1A NRGenome CDS 1160535 1160577 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds6;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 922 964 + chr1A NRGenome CDS 1160392 1160459 100 - 1 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds7;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 965 1032 + chr1A NRGenome CDS 1160086 1160127 100 - 0 
ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds8;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 1033 1074 + chr1A NRGenome CDS 1159521 1159973 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds9;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 1075 1527 + chr1A NRGenome gene 1159521 1162591 . - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.path1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 chr1A NRGenome mRNA 1159521 1162591 . - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.path1;coverage=100.0;identity=100.0;matches=1434;mismatches=0;indels=0;unknowns=0 chr1A NRGenome exon 1162546 1162591 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 1 46 + chr1A NRGenome exon 1162250 1162452 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon2;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 47 249 + chr1A NRGenome exon 1161953 1162150 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon3;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 250 447 + chr1A NRGenome exon 1161682 1161859 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon4;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 448 625 + chr1A NRGenome exon 1161377 1161547 100 - . 
ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon5;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 626 796 + chr1A NRGenome exon 1160679 1160710 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon6;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 797 828 + chr1A NRGenome exon 1160535 1160577 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon7;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 829 871 + chr1A NRGenome exon 1160392 1160459 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon8;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 872 939 + chr1A NRGenome exon 1160086 1160127 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon9;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 940 981 + chr1A NRGenome exon 1159521 1159973 100 - . 
ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon10;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 982 1434 + chr1A NRGenome CDS 1162546 1162591 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 1 46 + chr1A NRGenome CDS 1162250 1162452 100 - 1 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds2;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 47 249 + chr1A NRGenome CDS 1161953 1162150 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds3;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 250 447 + chr1A NRGenome CDS 1161682 1161859 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds4;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 448 625 + chr1A NRGenome CDS 1161377 1161547 100 - 1 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds5;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 626 796 + chr1A NRGenome CDS 1160679 1160710 100 - 1 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds6;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 797 828 + chr1A NRGenome CDS 1160535 1160577 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds7;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 829 871 + chr1A NRGenome CDS 1160392 1160459 100 - 1 
ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds8;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 872 939 + chr1A NRGenome CDS 1160086 1160127 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds9;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 940 981 + chr1A NRGenome CDS 1159521 1159973 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds10;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 982 1434 +
python代码
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Extract FASTA sequences for the features listed in a GFF3-style file.

Feature lines are read from ``1.txt`` and reference sequences from
``/data2/masw_data/seqdb/chr1A.fasta``.  Five FASTA files are written:

* ``gene.fasta`` / ``mRNA.fasta`` -- full genomic span (introns included)
* ``exon.fasta`` / ``CDS.fasta`` -- segments joined in 5'->3' order
* ``pro_and_downstream.fasta`` -- gene span plus 2 kb upstream and
  1 kb downstream

Features on the ``-`` strand are reverse-complemented.  Records are keyed
by the ``Name`` attribute of each feature line.
"""

GENOME_FASTA = "/data2/masw_data/seqdb/chr1A.fasta"
ANNOTATION_FILE = "1.txt"
UPSTREAM = 2000    # bases added before the gene start (promoter side)
DOWNSTREAM = 1000  # bases added after the gene end
FEATURE_TYPES = ("gene", "mRNA", "exon", "CDS")


def parse_feature(line):
    """Parse one annotation line.

    Columns are split on any whitespace, so only the part of the attribute
    column before its first space is inspected.

    Returns ``(feature_type, name, (seqid, start, end, strand))`` with
    1-based inclusive integer coordinates, or ``None`` for blank lines,
    ``#`` comments, feature types other than gene/mRNA/exon/CDS and
    features whose strand is neither ``+`` nor ``-``.

    Raises ``ValueError`` for a relevant line that has fewer than nine
    columns or no ``Name=`` attribute.
    """
    fields = line.split()
    if not fields or fields[0].startswith("#"):
        return None
    if len(fields) < 9:
        raise ValueError("expected at least 9 columns: %r" % line)
    feature_type = fields[2]
    strand = fields[6]
    if feature_type not in FEATURE_TYPES or strand not in ("+", "-"):
        # Unstranded features cannot be oriented, so nothing is emitted.
        return None
    for attribute in fields[8].split(";"):
        if attribute.startswith("Name="):
            name = attribute[len("Name="):]
            break
    else:
        raise ValueError("no Name attribute: %r" % line)
    return feature_type, name, (fields[0], int(fields[3]), int(fields[4]), strand)


def collect_features(lines):
    """Group the features found in *lines* by type and name.

    Returns ``(gene, mRNA, exon, CDS)``.  ``gene`` and ``mRNA`` map a name
    to a single region tuple; ``exon`` and ``CDS`` map a name to the list
    of its region tuples in file order.
    """
    gene, mRNA, exon, CDS = {}, {}, {}, {}
    single = {"gene": gene, "mRNA": mRNA}
    multi = {"exon": exon, "CDS": CDS}
    for line in lines:
        parsed = parse_feature(line)
        if parsed is None:
            continue
        feature_type, name, region = parsed
        if feature_type in single:
            single[feature_type][name] = region
        else:
            multi[feature_type].setdefault(name, []).append(region)
    return gene, mRNA, exon, CDS


def region_slice(start, end, strand, upstream=0, downstream=0):
    """Return 0-based half-open ``(lo, hi)`` for a feature plus its flanks.

    *start* and *end* are 1-based inclusive.  On the ``-`` strand the
    upstream flank lies at higher genomic coordinates, so the two flank
    sizes swap sides.  ``lo`` is clamped at 0 because a negative slice
    start would wrap around to the end of the sequence.
    """
    if strand == "-":
        left, right = downstream, upstream
    else:
        left, right = upstream, downstream
    return max(start - 1 - left, 0), end + right


def transcript_order(segments):
    """Return *segments* sorted 5'->3' along the transcript.

    That is ascending start for ``+`` and descending start for ``-``; the
    strand is taken from the first segment.
    """
    descending = bool(segments) and segments[0][3] == "-"
    return sorted(segments, key=lambda segment: segment[1], reverse=descending)


def write_fasta(handle, name, sequence):
    """Write a single FASTA record to *handle*."""
    handle.write(">%s\n%s\n" % (name, sequence))


def main():
    """Run the extraction with the hard-coded input and output paths."""
    # Imported here so the helpers above stay importable without Biopython.
    from Bio import SeqIO

    with open(ANNOTATION_FILE, "r") as annotation:
        gene, mRNA, exon, CDS = collect_features(annotation)

    record_dict = SeqIO.index(GENOME_FASTA, "fasta")

    def fetch(region, upstream=0, downstream=0):
        """Return the strand-oriented sequence of *region* as a string."""
        seqid, start, end, strand = region
        lo, hi = region_slice(start, end, strand, upstream, downstream)
        sequence = record_dict[seqid][lo:hi].seq
        if strand == "-":
            sequence = sequence.reverse_complement()
        return str(sequence)

    try:
        with open("gene.fasta", "w") as gene_sequence, \
                open("mRNA.fasta", "w") as mRNA_sequence, \
                open("exon.fasta", "w") as exon_sequence, \
                open("CDS.fasta", "w") as CDS_sequence, \
                open("pro_and_downstream.fasta", "w") as pro_downstream:
            # Gene span (introns included) and the same span with flanks.
            for name, region in gene.items():
                write_fasta(gene_sequence, name, fetch(region))
                write_fasta(pro_downstream, name,
                            fetch(region, UPSTREAM, DOWNSTREAM))
            # mRNA span, introns included.
            for name, region in mRNA.items():
                write_fasta(mRNA_sequence, name, fetch(region))
            # One record per transcript: segments joined in 5'->3' order.
            for handle, groups in ((CDS_sequence, CDS), (exon_sequence, exon)):
                for name, segments in groups.items():
                    spliced = "".join(
                        fetch(segment) for segment in transcript_order(segments)
                    )
                    write_fasta(handle, name, spliced)
    finally:
        record_dict.close()


if __name__ == "__main__":
    main()
以上是关于从gff3文件中获取fasta文件的主要内容,如果未能解决你的问题,请参考以下文章
from gff3 get gene fasta sequence