markdown 在refseq中提取GU命中并绘制基因组邻域

Posted 2021-05-06

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了markdown 在refseq中提取GU命中并绘制基因组邻域相关的知识，希望对你有一定的参考价值。

#!/usr/bin/python

# This script is designed to take a genbank file and 'slice out'/'subset'
# regions (genes/operons etc.) and produce a separate file. This can be
# done explicitly by telling the script which base sites to use, or can
# 'decide' for itself by blasting a fasta of the sequence you're inter-
# ed in against the Genbank you want to slice a record out of.

# Note, the script (obviously) does not preseve the index number of the
# bases from the original

# Based upon the tutorial at:
# http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc44

# This script depends on BLAST and having the makeblastdb functionality
# available if BLAST_MODE is active. It also depends on Biopython.

# Set up and handle arguments:

import warnings
import sys
import subprocess
import os
import time
from Bio import SeqIO

__author__ = "Joe R. J. Healey"
__version__ = "1.1.0"
__title__ = "Genbank_slicer"
__license__ = "GPLv3"
__author_email__ = "J.R.J.Healey@warwick.ac.uk"

# Import SearchIO and suppress experimental warning
from Bio import BiopythonExperimentalWarning
with warnings.catch_warnings():
	warnings.simplefilter('ignore', BiopythonExperimentalWarning)
	from Bio import SearchIO


def convert(basename, genbank):
	'''Convert the provided genbank to a fasta to BLAST.'''

	refFasta = "{}.fasta.tmp".format(basename)
	SeqIO.convert(genbank, 'genbank', refFasta, 'fasta')

	return refFasta

def runBlast(basename, refFasta, fasta, verbose):
	'''Synthesise BLAST commands and make system calls'''

	resultHandle = "{}.blastout.tmp".format(basename)
	blastdb_cmd = 'makeblastdb -in {0} -dbtype nucl -title temp_blastdb'.format(refFasta)
	blastn_cmd = 'blastn -query {0} -strand both -task blastn -db {1} -perc_identity 100 -outfmt 6 -out {2} -max_target_seqs 1'.format(fasta, refFasta, resultHandle)

	print("Constructing BLAST Database: " + '\n' + blastdb_cmd)
	print("BLASTing: " + '\n' + blastn_cmd)
	DB_process = subprocess.Popen(blastdb_cmd,
				      shell=True,
				      stdin=subprocess.PIPE,
				      stdout=subprocess.PIPE,
				      stderr=subprocess.PIPE)
	DB_process.wait()
	BLAST_process = subprocess.Popen(blastn_cmd,
					 shell=True,
					 stdin=subprocess.PIPE,
					 stdout=subprocess.PIPE,
					 stderr=subprocess.PIPE)
	BLAST_process.wait()

	return resultHandle

def getIndices(resultHandle):
	'''If not provided directly by the user, this function retrieves the best BLAST hit's indices.'''

	blast_result = SearchIO.read(resultHandle, 'blast-tab')

	print(blast_result[0][0])
	start = blast_result[0][0].hit_start
	end = blast_result[0][0].hit_end

	return start, end

def slice(start, end, genbank, FPoffset, TPoffset):
	'''Subset the provided genbank to return the sub record.'''

	seqObj = SeqIO.read(genbank, 'genbank')
	subRecord = seqObj[start-FPoffset:end+TPoffset]

	return subRecord


def main():
###################################################################################################
# Parse command line arguments
	import argparse
	try:
   		parser = argparse.ArgumentParser(
   			description='This script slices entries such as genes or operons out of a genbank, subsetting them as their own file.')
		parser.add_argument(
			'-g',
			'--genbank',
			action='store',
			required=True,
			help='The genbank file you wish to subset.')
		parser.add_argument(
			'-o',
			'--outfile',
			action='store',
			help='If specifed, the script will write a file, otherwise redirect STDOUT for pipelines.')
		parser.add_argument(
			'-s',
			'--start',
			type=int,
			help='Integer base index to slice from.')
		parser.add_argument(
			'-e',
			'--end',
			type=int,
			default=0,
			help='Integer base index to slice to.')
		parser.add_argument(
			'-F',
			'--FPoffset',
			action='store',
			type=int,
			default=0,
			help='If you want to capture regions around the target site, specify the 5\' offset.')
		parser.add_argument(
			'-T',
			'--TPoffset',
			action='store',
			type=int,
			default=0,
			help='If you want to capture regions around the target site, specify the 3\' offset.')
		parser.add_argument(
			'-b',
			'--blastmode',
			action='store_true',
			help='If flag is set, provide an input fasta (-f | --fasta), and BLAST will be called to find your sequence indices in the original genbank.')
		parser.add_argument(
			'-f',
			'--fasta',
			action='store',
			help='The operon fasta to pull annotations from the provided genbank.')
		parser.add_argument(
			'-t',
			'--tidy',
			action='store_true',
			help='Tell the script whether or not to remove the temporary files it generated during processing. Off by default. WARNING: removes files based on the "tmp" string.')
		parser.add_argument(
			'-m',
			'--meta',
			action='store',
			default=None,
			help='Specify a string to use as the Genbank meta-data fields if the parent one doesn\'t contain anything. Else inherits from parent sequence.')
		parser.add_argument(
			'-v',
			'--verbose',
			action='store_true',
			help='Verbose behaviour. Shows progress of BLAST etc. if appropriate.')


		args = parser.parse_args()

	except NameError:
		print "An exception occured with argument parsing. Check your provided options."

	genbank   =  args.genbank
	fasta     =  args.fasta
	split     =  os.path.splitext(args.genbank)
	basename  =  os.path.basename(split[0])
	start     =  args.start
	end       =  args.end
	FPoffset  =  args.FPoffset
	TPoffset  =  args.TPoffset
	blastMode =  args.blastmode
	outfile   =  args.outfile
	fasta     =  args.fasta
	tidy      =  args.tidy
	meta	  =  args.meta
	verbose   =  args.verbose

# Main code:
	if blastMode is not False and fasta is not None:
		refFasta = convert(basename,genbank)
		resultHandle = runBlast(basename, refFasta, fasta, verbose)
		start, end = getIndices(resultHandle)
	else:
		if fasta is None:
			print("No fasta was provided so BLAST mode cannot be used.")

	if start is None or end is None:
        	print('No slice indices have been specified or retrieved from blastout')
                sys.exit(1)


	subRecord = slice(start, end, genbank, FPoffset, TPoffset)
	# Populate the metadata of the genbank
	if meta is not None:
		subRecord.id = meta
		subRecord.locus = meta + 'subregion'
		subRecord.accession = meta
		subRecord.name = meta
		subRecord.annotations["date"] = time.strftime("%d-%b-%Y").upper()
		# Other field options include:
		#  subRecord.annotations["source"]
		#			["taxonomy"]
		#			["keywords"]
		#			["references"]
		#			["accessions"]
		#			["data_file_division"] e.g. BCT, PLN, UNK
		#			["organism"]
		#			["topology"]

	if outfile is not None:
		SeqIO.write(subRecord, outfile, "genbank")
	else:
		print(subRecord.format('genbank'))

	if tidy is True:
		if verbose is True:
			rm="rm -v ./*tmp*"
		else:
			rm="rm ./*tmp*"

		subprocess.Popen(rm,shell=True)

if __name__ == "__main__":
	main()

library(ggplot2)
library(genoPlotR)

setwd("~/Downloads/gb_files_geno/")
labelS <- 10L
temp <-list.files(pattern="*genoplot.tsv")


gbknames<- lapply(as.list(temp),function(x){strsplit(x = gsub(pattern = "[.]",replacement = " ",x = x),split = c(" "))[[1]][1]})

df <- lapply(temp, read_tsv, col_names = FALSE)
df <- lapply(df,function(x){colnames(x)<-c("name", "start",  "end" ,"strand"  ,"col" ,"lty" ,"lwd" ,"pch" ,"cex", "gene_type");x})
df <- lapply(df,function(x){dna_seg(x)})
names(df) <- gbknames



annot<-lapply(df,function(x){annot<-annotation(x1=x$start+10,
                                               x2=x$end-10,
                                               text=x$name,
                                               rot=replicate(nrow(x),35));
annot$text<-paste(substr(x$name,start = 0,stop = labelS),"...")
annot})



uniqnames<-unique(do.call(rbind.data.frame, df)["name"])
uniqnames<-sort(uniqnames[,1])
color <- grDevices::colors()[grep('gr(a|e)y', grDevices::colors(), invert = T)]
colors<-sample(color, length(uniqnames))
df2color<-data.frame(as.matrix(uniqnames),as.matrix(colors))
df2color<-t(df2color)
colnames(df2color)<-df2color[1,]
df2color<- df2color[-1,]
df<-lapply(df,function(x){x["col"]<-"black";x})
df<-lapply(df,function(x){x["fill"]<-df2color[x$name];x})
uniqnames<-gsub(pattern = "_",x = as.matrix(uniqnames),replacement = " ")
uniqnames<-gsub(pattern = "[.]",x = as.matrix(uniqnames),replacement = ",")

plot(c(0,1000), c(0,1000), type="n", axes=FALSE, xlab="", ylab="")
legend("center", legend = c(as.matrix(uniqnames)),ncol = 1,xpd = NA, cex = 0.8,
       bty="n",fill=c(as.matrix(colors)),border = c("white"),title = "Genes")
plot_gene_map(dna_segs = df, dna_seg_label_cex = 0.4, fixed_gap_length = TRUE, annotations = annot, annotation_cex = 0.4)

#!/usr/bin/env perl 
#===============================================================================
#
#         FILE:  gb2ptt.pl
#
#        USAGE:  ./gb2ptt.pl  
#
#  DESCRIPTION:  Script to convert a genbank flat file to ptt
#
#      OPTIONS:  ---
# REQUIREMENTS:  ---
#         BUGS:  ---
#        NOTES:  ---
#       AUTHOR:  Dr. Scott A. Givan (sag), givans@missouri.edu
#      COMPANY:  University of Missouri, USA
#      VERSION:  1.0
#      CREATED:  05/04/15 15:23:32
#     REVISION:  ---
#===============================================================================

use 5.010;      # Require at least Perl version 5.10
use autodie;
use Getopt::Long; # use GetOptions function to for CL args
use warnings;
use strict;
use Bio::SeqIO;

my ($debug,$verbose,$help,$infile);

my $result = GetOptions(
    "debug"     =>  \$debug,
    "verbose"   =>  \$verbose,
    "help"      =>  \$help,
    "infile:s"  =>  \$infile,
);

if ($help) {
    help();
    exit(0);
}

sub help {

say <<HELP;

--debug
--verbose
--help
--infile

HELP

}

$infile = 'test' unless ($infile);

my $seqio = Bio::SeqIO->new(
                                -file   =>  $infile,
                                -format =>  'genbank',
                            );

open(my $PTT, ">", $infile . ".ptt");
open(my $RNT, ">", $infile . ".rnt");

my %tags = ();
while (my $seq = $seqio->next_seq()) {
    my $header1 = $seq->desc() || 'unknown' . " - 1.." . $seq->length();

    if ($debug) {
        say "\$seq isa '" . ref($seq) . "'" if ($debug);
        say $seq->desc() . " - 1.." . $seq->length();
        say "CDS: " . scalar($seq->get_SeqFeatures('CDS'));
        exit();
    }

    my @CDS = $seq->get_SeqFeatures('CDS');
    my @RNA = $seq->get_SeqFeatures('tRNA');
    push(@RNA,$seq->get_SeqFeatures('rRNA'));

    my $featcnt = 0;
    
    say $PTT $header1;
    say $PTT scalar(@CDS) . " proteins";
    say $RNT $header1;
    say $RNT scalar(@RNA) . " RNAs";
    say $PTT "Location\tStrand\tLength\tPID\tGene\tSynonym\tCode\tCOG\tProduct";
    say $RNT "Location\tStrand\tLength\tPID\tGene\tSynonym\tCode\tCOG\tProduct";

    for my $feature (@CDS) {

        next if $feature->has_tag('pseudo');

        my $tag = $feature->primary_tag();
        ++$tags{$tag};


        last if ($debug && ++$featcnt > 10);

        my $start = $feature->start();
        my $stop = $feature->end();
        my $strand = $feature->strand();
        my $length = $feature->length();
        my @pid = $feature->has_tag('protein_id') ? $feature->get_tag_values('protein_id') : $feature->get_tag_values('locus_tag');
        my @gene = $feature->has_tag('gene') ? $feature->get_tag_values('gene') : '-';
        my @synonym = $feature->get_tag_values('locus_tag');
        my $code = '-';
        my $cog = '-';
        my @description = $feature->get_tag_values('product');

        if ($strand > 0) {
            $strand = '+';
        } elsif ($strand < 0) {
            $strand = '-';
        }

#            my @misc = $feature->primary_tag('misc_feature');
#
#            for my $misc_feature (@misc) {
#                say "\$misc_feature isa '" . ref($misc_feature) . "'";
#                if ($misc_feature->has_tag('note')) {
#                    my @notes = $misc_feature->get_tag_values('note');
#                    for my $note (@notes) {
#                        if ($note =~ /(COG\d\d\d\d)/) {
#                            $cog = $1;
#                        }
#                    }
#                }
#            }

        say $PTT $start . ".." . "$stop\t$strand\t$length\t$pid[0]\t$gene[0]\t$synonym[0]\t$code\t$cog\t$description[0]";
    }

    #foreach my $feature (sort { $a->start() <=> $b->start() } (@tRNA, @rRNA)) {
    for my $feature (sort { $a->start() <=> $b->start() } @RNA) {

            my $start = $feature->start();
            my $stop = $feature->end();
            my $strand = '+';
            my $length = $feature->length();
            my $pid = $feature->has_tag('db_xref') ? ($feature->get_tag_values('db_xref'))[0] : ($feature->get_tag_values('locus_tag'))[0];
            my @gene = $feature->has_tag('gene') ? $feature->get_tag_values('gene') : '-';
            my @synonym = $feature->get_tag_values('locus_tag');
            my $code = '-';
            my $cog = '-';
            my @description = $feature->get_tag_values('product');

            say $RNT $start . ".." . "$stop\t$strand\t$length\t$pid\t$gene[0]\t$synonym[0]\t$code\t$cog\t$description[0]";
        }
}

# Extract GU hits in refseq and plot genomic neighbourhood

Create needed folders

```bash
mkdir ft_files gb_files gb_files_filt gb_files_slice gb_files_ptt gb_files_geno
```

Get the assembly file from NCBI

```bash
wget ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt
```

An example of the sample file used to get the NCBI data (**genome_narrow.tsv**)

    WP_011376775.1  32231714        0.958   60      2       0       1       60      1       60      5.74e-36        123     60      60      GCF_000012645.1_ASM1264v1       hypothetical protein [Prochlorococcus marinus]  hypo    Bacteria        Cyanobacteria   Prochloraceae   NA   Prochlorococcus  Prochlorococcus_marinus 340128
    WP_032512946.1  32231714        0.905   60      6       0       1       60      1       60      2.35e-33        116     60      60      GCF_000759855.1_ASM75985v1      hypothetical protein [Prochlorococcus marinus]  hypo    Bacteria        Cyanobacteria   Prochloraceae   NA   Prochlorococcus  Prochlorococcus_marinus 340128

Get the feature table file from NCBI

```bash
grep -w -f <(cut -f15 genome_narrow.tsv) assembly_summary.txt | cut -f 20 | while read line; do wget -l1 -r --no-parent -A "*feature_table.txt.gz" $line; done

find ftp.ncbi.nlm.nih.gov/ -name '*feature_table.txt.gz' -exec cp {} ft_files/ \;

rm -rf ftp.ncbi.nlm.nih.gov/
```

Get the genbank format file from NCBI

```bash
grep -w -f <(cut -f15 genome_narrow.tsv) assembly_summary.txt | cut -f 20 | while read line; do wget -l1 -r --no-parent -A "*genomic.gbff.gz" $line; done

find ftp.ncbi.nlm.nih.gov/ -name '*gbff.gz' -exec cp {} gb_files/ \;

rm -rf ftp.ncbi.nlm.nih.gov/
```

Get the location of our genes of interest and calculate coordinates 10kb upstream/downstream

```bash
S=10000; grep -f <(cut -f1 genome_narrow.tsv) ft_files/* | tr ':' $'\t' | cut -f 2- -d '/' | sed -e 's/_feature_table.txt//' | awk -vO=$S -vFS="\t" '{print $1,$9,$10,$11,$9-O,$10+O,$13,$17,$18,$15}' | awk -vO=$S '$2 >= O' > hits_10kb_up.txt
```

Split incomplete genomes in a set of contig files and keep the ones containing our gene of interest

```bash
cat hits_10kb_up.txt | while read LINE; do mawk -vV=$(echo $LINE | cut -f1 -d ' ') -v n=1 '/^\/\//{close(V".partial."n);n++;next} {print > V".partial."n}' gb_files/$(echo $LINE | cut -f1 -d ' ')_genomic.gbff; done

grep -f <(cut -f7 -d ' ' hits_10kb_up.txt) *partial* | cut -f 1 -d ':' | sort -u | while read file; do mv $file ${file/.partial.*/.gbk}; done

mv *gbk gb_files_filt

rm *partial*
```

Slice the genbank files 10kb upstream/downstream and convert them to ptt format

```bash
cat hits_10kb_up.txt | while read LINE; do echo '//' >> gb_files_filt/$(echo $LINE | cut -f1 -d ' ').gbk; python slicer.py -g gb_files_filt/$(echo $LINE | cut -f1 -d ' ').gbk -s $(echo $LINE | cut -f5 -d ' ') -e $(echo $LINE | cut -f6 -d ' ') -o $(echo $LINE | cut -f1 -d ' ')_slice.gbk; done

for i in *_slice.gbk; do perl gb2ptt.pl --infile $i; done

mv *slice.gbk gb_files_slice

mv *gbk.ptt gb_files_ptt
```
>Slicer script from [here](https://gist.github.com/jrjhealey/06a7fbdfe495bc5a8824ed152dff7919)  
>gb2ptt script from [here](https://github.com/sgivan/gb2ptt/blob/master/bin/gb2ptt.pl)

Create files for plotting the gene structures

```bash
for i in gb_files_ptt/*ptt; do NAM=$(basename $i _slice.gbk.ptt); awk 'NR>3' $i | sed -e 's/\.\./\t/' | awk -vFS="\t" -vOFS="\t" '{gsub(" ","_",$10); if($3 == "+"){S=1}else{S=-1}; print $10,$1,$2,S,"gray\t1\t1\t8\t1\tarrows"}' > ${NAM}_genoplot.tsv ; done

cat hits_10kb_up.txt | while read line; do F=$(echo $line | cut -f1 -d ' '); E=$(echo $line | cut -f7 -d ' '); awk -vC=$E '{if ($1 == C){$6="red"; $2="Genomic_unknown"}; print $0}' ${F}_genoplot.tsv | tr ' ' $'\t' | cut -f2- > tmp && mv tmp ${F}_genoplot.tsv; done

mv *_genoplot.tsv gb_files_geno
```

Plot with **geneplot.r** (Code modified from [here](https://github.com/Sanrrone/multiGenomicContext))

Calculate synteny of the sliced files using mauve (`conda install progressiveMauve`)

```bash
progressiveMauve --output=test gb_files_slice/*gbk
```

以上是关于markdown 在refseq中提取GU命中并绘制基因组邻域的主要内容，如果未能解决你的问题，请参考以下文章

sh refseq chroms dict convert

使用 BigQuery 提取命中级别数据时，Google Analytics 指标被夸大了

Guava cache使用总结

Gulp Front Matter + Markdown 通过 Nunjucks

markdown Kubernetes：从私人仓库中提取图片

markdown 从JSON中提取数据的方法