markdown 在refseq中提取GU命中并绘制基因组邻域
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了markdown 在refseq中提取GU命中并绘制基因组邻域相关的知识,希望对你有一定的参考价值。
#!/usr/bin/python
# This script is designed to take a genbank file and 'slice out'/'subset'
# regions (genes/operons etc.) and produce a separate file. This can be
# done explicitly by telling the script which base sites to use, or can
# 'decide' for itself by blasting a fasta of the sequence you're inter-
# ed in against the Genbank you want to slice a record out of.
# Note, the script (obviously) does not preseve the index number of the
# bases from the original
# Based upon the tutorial at:
# http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc44
# This script depends on BLAST and having the makeblastdb functionality
# available if BLAST_MODE is active. It also depends on Biopython.
# Set up and handle arguments:
import warnings
import sys
import subprocess
import os
import time
from Bio import SeqIO
__author__ = "Joe R. J. Healey"
__version__ = "1.1.0"
__title__ = "Genbank_slicer"
__license__ = "GPLv3"
__author_email__ = "J.R.J.Healey@warwick.ac.uk"
# Import SearchIO and suppress experimental warning
from Bio import BiopythonExperimentalWarning
with warnings.catch_warnings():
warnings.simplefilter('ignore', BiopythonExperimentalWarning)
from Bio import SearchIO
def convert(basename, genbank):
'''Convert the provided genbank to a fasta to BLAST.'''
refFasta = "{}.fasta.tmp".format(basename)
SeqIO.convert(genbank, 'genbank', refFasta, 'fasta')
return refFasta
def runBlast(basename, refFasta, fasta, verbose):
'''Synthesise BLAST commands and make system calls'''
resultHandle = "{}.blastout.tmp".format(basename)
blastdb_cmd = 'makeblastdb -in {0} -dbtype nucl -title temp_blastdb'.format(refFasta)
blastn_cmd = 'blastn -query {0} -strand both -task blastn -db {1} -perc_identity 100 -outfmt 6 -out {2} -max_target_seqs 1'.format(fasta, refFasta, resultHandle)
print("Constructing BLAST Database: " + '\n' + blastdb_cmd)
print("BLASTing: " + '\n' + blastn_cmd)
DB_process = subprocess.Popen(blastdb_cmd,
shell=True,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
DB_process.wait()
BLAST_process = subprocess.Popen(blastn_cmd,
shell=True,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
BLAST_process.wait()
return resultHandle
def getIndices(resultHandle):
'''If not provided directly by the user, this function retrieves the best BLAST hit's indices.'''
blast_result = SearchIO.read(resultHandle, 'blast-tab')
print(blast_result[0][0])
start = blast_result[0][0].hit_start
end = blast_result[0][0].hit_end
return start, end
def slice(start, end, genbank, FPoffset, TPoffset):
'''Subset the provided genbank to return the sub record.'''
seqObj = SeqIO.read(genbank, 'genbank')
subRecord = seqObj[start-FPoffset:end+TPoffset]
return subRecord
def main():
###################################################################################################
# Parse command line arguments
import argparse
try:
parser = argparse.ArgumentParser(
description='This script slices entries such as genes or operons out of a genbank, subsetting them as their own file.')
parser.add_argument(
'-g',
'--genbank',
action='store',
required=True,
help='The genbank file you wish to subset.')
parser.add_argument(
'-o',
'--outfile',
action='store',
help='If specifed, the script will write a file, otherwise redirect STDOUT for pipelines.')
parser.add_argument(
'-s',
'--start',
type=int,
help='Integer base index to slice from.')
parser.add_argument(
'-e',
'--end',
type=int,
default=0,
help='Integer base index to slice to.')
parser.add_argument(
'-F',
'--FPoffset',
action='store',
type=int,
default=0,
help='If you want to capture regions around the target site, specify the 5\' offset.')
parser.add_argument(
'-T',
'--TPoffset',
action='store',
type=int,
default=0,
help='If you want to capture regions around the target site, specify the 3\' offset.')
parser.add_argument(
'-b',
'--blastmode',
action='store_true',
help='If flag is set, provide an input fasta (-f | --fasta), and BLAST will be called to find your sequence indices in the original genbank.')
parser.add_argument(
'-f',
'--fasta',
action='store',
help='The operon fasta to pull annotations from the provided genbank.')
parser.add_argument(
'-t',
'--tidy',
action='store_true',
help='Tell the script whether or not to remove the temporary files it generated during processing. Off by default. WARNING: removes files based on the "tmp" string.')
parser.add_argument(
'-m',
'--meta',
action='store',
default=None,
help='Specify a string to use as the Genbank meta-data fields if the parent one doesn\'t contain anything. Else inherits from parent sequence.')
parser.add_argument(
'-v',
'--verbose',
action='store_true',
help='Verbose behaviour. Shows progress of BLAST etc. if appropriate.')
args = parser.parse_args()
except NameError:
print "An exception occured with argument parsing. Check your provided options."
genbank = args.genbank
fasta = args.fasta
split = os.path.splitext(args.genbank)
basename = os.path.basename(split[0])
start = args.start
end = args.end
FPoffset = args.FPoffset
TPoffset = args.TPoffset
blastMode = args.blastmode
outfile = args.outfile
fasta = args.fasta
tidy = args.tidy
meta = args.meta
verbose = args.verbose
# Main code:
if blastMode is not False and fasta is not None:
refFasta = convert(basename,genbank)
resultHandle = runBlast(basename, refFasta, fasta, verbose)
start, end = getIndices(resultHandle)
else:
if fasta is None:
print("No fasta was provided so BLAST mode cannot be used.")
if start is None or end is None:
print('No slice indices have been specified or retrieved from blastout')
sys.exit(1)
subRecord = slice(start, end, genbank, FPoffset, TPoffset)
# Populate the metadata of the genbank
if meta is not None:
subRecord.id = meta
subRecord.locus = meta + 'subregion'
subRecord.accession = meta
subRecord.name = meta
subRecord.annotations["date"] = time.strftime("%d-%b-%Y").upper()
# Other field options include:
# subRecord.annotations["source"]
# ["taxonomy"]
# ["keywords"]
# ["references"]
# ["accessions"]
# ["data_file_division"] e.g. BCT, PLN, UNK
# ["organism"]
# ["topology"]
if outfile is not None:
SeqIO.write(subRecord, outfile, "genbank")
else:
print(subRecord.format('genbank'))
if tidy is True:
if verbose is True:
rm="rm -v ./*tmp*"
else:
rm="rm ./*tmp*"
subprocess.Popen(rm,shell=True)
if __name__ == "__main__":
main()
library(ggplot2)
library(genoPlotR)
setwd("~/Downloads/gb_files_geno/")
labelS <- 10L
temp <-list.files(pattern="*genoplot.tsv")
gbknames<- lapply(as.list(temp),function(x){strsplit(x = gsub(pattern = "[.]",replacement = " ",x = x),split = c(" "))[[1]][1]})
df <- lapply(temp, read_tsv, col_names = FALSE)
df <- lapply(df,function(x){colnames(x)<-c("name", "start", "end" ,"strand" ,"col" ,"lty" ,"lwd" ,"pch" ,"cex", "gene_type");x})
df <- lapply(df,function(x){dna_seg(x)})
names(df) <- gbknames
annot<-lapply(df,function(x){annot<-annotation(x1=x$start+10,
x2=x$end-10,
text=x$name,
rot=replicate(nrow(x),35));
annot$text<-paste(substr(x$name,start = 0,stop = labelS),"...")
annot})
uniqnames<-unique(do.call(rbind.data.frame, df)["name"])
uniqnames<-sort(uniqnames[,1])
color <- grDevices::colors()[grep('gr(a|e)y', grDevices::colors(), invert = T)]
colors<-sample(color, length(uniqnames))
df2color<-data.frame(as.matrix(uniqnames),as.matrix(colors))
df2color<-t(df2color)
colnames(df2color)<-df2color[1,]
df2color<- df2color[-1,]
df<-lapply(df,function(x){x["col"]<-"black";x})
df<-lapply(df,function(x){x["fill"]<-df2color[x$name];x})
uniqnames<-gsub(pattern = "_",x = as.matrix(uniqnames),replacement = " ")
uniqnames<-gsub(pattern = "[.]",x = as.matrix(uniqnames),replacement = ",")
plot(c(0,1000), c(0,1000), type="n", axes=FALSE, xlab="", ylab="")
legend("center", legend = c(as.matrix(uniqnames)),ncol = 1,xpd = NA, cex = 0.8,
bty="n",fill=c(as.matrix(colors)),border = c("white"),title = "Genes")
plot_gene_map(dna_segs = df, dna_seg_label_cex = 0.4, fixed_gap_length = TRUE, annotations = annot, annotation_cex = 0.4)
#!/usr/bin/env perl
#===============================================================================
#
# FILE: gb2ptt.pl
#
# USAGE: ./gb2ptt.pl
#
# DESCRIPTION: Script to convert a genbank flat file to ptt
#
# OPTIONS: ---
# REQUIREMENTS: ---
# BUGS: ---
# NOTES: ---
# AUTHOR: Dr. Scott A. Givan (sag), givans@missouri.edu
# COMPANY: University of Missouri, USA
# VERSION: 1.0
# CREATED: 05/04/15 15:23:32
# REVISION: ---
#===============================================================================
use 5.010; # Require at least Perl version 5.10
use autodie;
use Getopt::Long; # use GetOptions function to for CL args
use warnings;
use strict;
use Bio::SeqIO;
my ($debug,$verbose,$help,$infile);
my $result = GetOptions(
"debug" => \$debug,
"verbose" => \$verbose,
"help" => \$help,
"infile:s" => \$infile,
);
if ($help) {
help();
exit(0);
}
sub help {
say <<HELP;
--debug
--verbose
--help
--infile
HELP
}
$infile = 'test' unless ($infile);
my $seqio = Bio::SeqIO->new(
-file => $infile,
-format => 'genbank',
);
open(my $PTT, ">", $infile . ".ptt");
open(my $RNT, ">", $infile . ".rnt");
my %tags = ();
while (my $seq = $seqio->next_seq()) {
my $header1 = $seq->desc() || 'unknown' . " - 1.." . $seq->length();
if ($debug) {
say "\$seq isa '" . ref($seq) . "'" if ($debug);
say $seq->desc() . " - 1.." . $seq->length();
say "CDS: " . scalar($seq->get_SeqFeatures('CDS'));
exit();
}
my @CDS = $seq->get_SeqFeatures('CDS');
my @RNA = $seq->get_SeqFeatures('tRNA');
push(@RNA,$seq->get_SeqFeatures('rRNA'));
my $featcnt = 0;
say $PTT $header1;
say $PTT scalar(@CDS) . " proteins";
say $RNT $header1;
say $RNT scalar(@RNA) . " RNAs";
say $PTT "Location\tStrand\tLength\tPID\tGene\tSynonym\tCode\tCOG\tProduct";
say $RNT "Location\tStrand\tLength\tPID\tGene\tSynonym\tCode\tCOG\tProduct";
for my $feature (@CDS) {
next if $feature->has_tag('pseudo');
my $tag = $feature->primary_tag();
++$tags{$tag};
last if ($debug && ++$featcnt > 10);
my $start = $feature->start();
my $stop = $feature->end();
my $strand = $feature->strand();
my $length = $feature->length();
my @pid = $feature->has_tag('protein_id') ? $feature->get_tag_values('protein_id') : $feature->get_tag_values('locus_tag');
my @gene = $feature->has_tag('gene') ? $feature->get_tag_values('gene') : '-';
my @synonym = $feature->get_tag_values('locus_tag');
my $code = '-';
my $cog = '-';
my @description = $feature->get_tag_values('product');
if ($strand > 0) {
$strand = '+';
} elsif ($strand < 0) {
$strand = '-';
}
# my @misc = $feature->primary_tag('misc_feature');
#
# for my $misc_feature (@misc) {
# say "\$misc_feature isa '" . ref($misc_feature) . "'";
# if ($misc_feature->has_tag('note')) {
# my @notes = $misc_feature->get_tag_values('note');
# for my $note (@notes) {
# if ($note =~ /(COG\d\d\d\d)/) {
# $cog = $1;
# }
# }
# }
# }
say $PTT $start . ".." . "$stop\t$strand\t$length\t$pid[0]\t$gene[0]\t$synonym[0]\t$code\t$cog\t$description[0]";
}
#foreach my $feature (sort { $a->start() <=> $b->start() } (@tRNA, @rRNA)) {
for my $feature (sort { $a->start() <=> $b->start() } @RNA) {
my $start = $feature->start();
my $stop = $feature->end();
my $strand = '+';
my $length = $feature->length();
my $pid = $feature->has_tag('db_xref') ? ($feature->get_tag_values('db_xref'))[0] : ($feature->get_tag_values('locus_tag'))[0];
my @gene = $feature->has_tag('gene') ? $feature->get_tag_values('gene') : '-';
my @synonym = $feature->get_tag_values('locus_tag');
my $code = '-';
my $cog = '-';
my @description = $feature->get_tag_values('product');
say $RNT $start . ".." . "$stop\t$strand\t$length\t$pid\t$gene[0]\t$synonym[0]\t$code\t$cog\t$description[0]";
}
}
# Extract GU hits in refseq and plot genomic neighbourhood
Create needed folders
```bash
mkdir ft_files gb_files gb_files_filt gb_files_slice gb_files_ptt gb_files_geno
```
Get the assembly file from NCBI
```bash
wget ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt
```
An example of the sample file used to get the NCBI data (**genome_narrow.tsv**)
WP_011376775.1 32231714 0.958 60 2 0 1 60 1 60 5.74e-36 123 60 60 GCF_000012645.1_ASM1264v1 hypothetical protein [Prochlorococcus marinus] hypo Bacteria Cyanobacteria Prochloraceae NA Prochlorococcus Prochlorococcus_marinus 340128
WP_032512946.1 32231714 0.905 60 6 0 1 60 1 60 2.35e-33 116 60 60 GCF_000759855.1_ASM75985v1 hypothetical protein [Prochlorococcus marinus] hypo Bacteria Cyanobacteria Prochloraceae NA Prochlorococcus Prochlorococcus_marinus 340128
Get the feature table file from NCBI
```bash
grep -w -f <(cut -f15 genome_narrow.tsv) assembly_summary.txt | cut -f 20 | while read line; do wget -l1 -r --no-parent -A "*feature_table.txt.gz" $line; done
find ftp.ncbi.nlm.nih.gov/ -name '*feature_table.txt.gz' -exec cp {} ft_files/ \;
rm -rf ftp.ncbi.nlm.nih.gov/
```
Get the genbank format file from NCBI
```bash
grep -w -f <(cut -f15 genome_narrow.tsv) assembly_summary.txt | cut -f 20 | while read line; do wget -l1 -r --no-parent -A "*genomic.gbff.gz" $line; done
find ftp.ncbi.nlm.nih.gov/ -name '*gbff.gz' -exec cp {} gb_files/ \;
rm -rf ftp.ncbi.nlm.nih.gov/
```
Get the location of our genes of interest and calculate coordinates 10kb upstream/downstream
```bash
S=10000; grep -f <(cut -f1 genome_narrow.tsv) ft_files/* | tr ':' $'\t' | cut -f 2- -d '/' | sed -e 's/_feature_table.txt//' | awk -vO=$S -vFS="\t" '{print $1,$9,$10,$11,$9-O,$10+O,$13,$17,$18,$15}' | awk -vO=$S '$2 >= O' > hits_10kb_up.txt
```
Split incomplete genomes in a set of contig files and keep the ones containing our gene of interest
```bash
cat hits_10kb_up.txt | while read LINE; do mawk -vV=$(echo $LINE | cut -f1 -d ' ') -v n=1 '/^\/\//{close(V".partial."n);n++;next} {print > V".partial."n}' gb_files/$(echo $LINE | cut -f1 -d ' ')_genomic.gbff; done
grep -f <(cut -f7 -d ' ' hits_10kb_up.txt) *partial* | cut -f 1 -d ':' | sort -u | while read file; do mv $file ${file/.partial.*/.gbk}; done
mv *gbk gb_files_filt
rm *partial*
```
Slice the genbank files 10kb upstream/downstream and convert them to ptt format
```bash
cat hits_10kb_up.txt | while read LINE; do echo '//' >> gb_files_filt/$(echo $LINE | cut -f1 -d ' ').gbk; python slicer.py -g gb_files_filt/$(echo $LINE | cut -f1 -d ' ').gbk -s $(echo $LINE | cut -f5 -d ' ') -e $(echo $LINE | cut -f6 -d ' ') -o $(echo $LINE | cut -f1 -d ' ')_slice.gbk; done
for i in *_slice.gbk; do perl gb2ptt.pl --infile $i; done
mv *slice.gbk gb_files_slice
mv *gbk.ptt gb_files_ptt
```
>Slicer script from [here](https://gist.github.com/jrjhealey/06a7fbdfe495bc5a8824ed152dff7919)
>gb2ptt script from [here](https://github.com/sgivan/gb2ptt/blob/master/bin/gb2ptt.pl)
Create files for plotting the gene structures
```bash
for i in gb_files_ptt/*ptt; do NAM=$(basename $i _slice.gbk.ptt); awk 'NR>3' $i | sed -e 's/\.\./\t/' | awk -vFS="\t" -vOFS="\t" '{gsub(" ","_",$10); if($3 == "+"){S=1}else{S=-1}; print $10,$1,$2,S,"gray\t1\t1\t8\t1\tarrows"}' > ${NAM}_genoplot.tsv ; done
cat hits_10kb_up.txt | while read line; do F=$(echo $line | cut -f1 -d ' '); E=$(echo $line | cut -f7 -d ' '); awk -vC=$E '{if ($1 == C){$6="red"; $2="Genomic_unknown"}; print $0}' ${F}_genoplot.tsv | tr ' ' $'\t' | cut -f2- > tmp && mv tmp ${F}_genoplot.tsv; done
mv *_genoplot.tsv gb_files_geno
```
Plot with **geneplot.r** (Code modified from [here](https://github.com/Sanrrone/multiGenomicContext))
Calculate synteny of the sliced files using mauve (`conda install progressiveMauve`)
```bash
progressiveMauve --output=test gb_files_slice/*gbk
```
以上是关于markdown 在refseq中提取GU命中并绘制基因组邻域的主要内容,如果未能解决你的问题,请参考以下文章
使用 BigQuery 提取命中级别数据时,Google Analytics 指标被夸大了
Gulp Front Matter + Markdown 通过 Nunjucks