sh 用于为hs37d5创建通用掩码的脚本和命令行

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了sh 用于为hs37d5创建通用掩码的脚本和命令行相关的知识,希望对你有一定的参考价值。

var s = 10;

var f = new File();
var b = new Bytes();

var last_chr = null, start = 0, end = 0;
while (f.readline(b) >= 0) {
	var t = b.toString().split("\t");
	t[1] = parseInt(t[1]);
	t[2] = parseInt(t[2]);
	if (t[0] != last_chr) {
		if (last_chr) print(last_chr, start, end);
		last_chr = t[0], start = t[1], end = t[2];
	} else if (t[1] - end > s) {
		print(last_chr, start, end);
		start = t[1], end = t[2];
	} else {
		end = end > t[2]? end : t[2];
	}
}
if (last_chr) print(last_chr, start, end);

b.destroy();
f.close();
var getopt = function(args, ostr) {
	var oli; // option letter list index
	if (typeof(getopt.place) == 'undefined')
		getopt.ind = 0, getopt.arg = null, getopt.place = -1;
	if (getopt.place == -1) { // update scanning pointer
		if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') {
			getopt.place = -1;
			return null;
		}
		if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--"
			++getopt.ind;
			getopt.place = -1;
			return null;
		}
	}
	var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity
	if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) {
		if (optopt == '-') return null; //  if the user didn't specify '-' as an option, assume it means null.
		if (getopt.place < 0) ++getopt.ind;
		return '?';
	}
	if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument
		getopt.arg = null;
		if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1;
	} else { // need an argument
		if (getopt.place >= 0 && getopt.place < args[getopt.ind].length)
			getopt.arg = args[getopt.ind].substr(getopt.place);
		else if (args.length <= ++getopt.ind) { // no arg
			getopt.place = -1;
			if (ostr.length > 0 && ostr.charAt(0) == ':') return ':';
			return '?';
		} else getopt.arg = args[getopt.ind]; // white space
		getopt.place = -1;
		++getopt.ind;
	}
	return optopt;
}

// parse command-line options
var c, w1 = 1000, w2 = 10000, min_n = 2;
while ((c = getopt(arguments, "w:W:n:")) != null) {
	if (c == 'w') w1 = parseInt(getopt.arg);
	else if (c == 'W') w2 = parseInt(getopt.arg);
	else if (c == 'n') min_n = parseInt(getopt.arg);
}

// read data
var file = getopt.ind < arguments.length? new File(arguments[getopt.ind]) : new File();
var buf = new Bytes();
var seqs = {};
while (file.readline(buf) >= 0) {
	var t = buf.toString().split("\t");
	if (seqs[t[0]] == null) seqs[t[0]] = [];
	seqs[t[0]].push(parseInt(t[1]));
}
buf.destroy();
file.close();

// cluster
function cluster(a, w1, w2)
{
	a.sort(function(b,c){return b-c});
	var x = [], y = [];
	for (var i = 0; i < a.length; ++i)
		x.push([i, a[i] - 1, a[i], 1]);
	for (var i = 0; i < a.length - 1; ++i)
		y.push([a[i+1] - a[i], i]);

	y.sort(function(b,c){return b[0]!=c[0]?b[0]-c[0]:b[1]-c[1]});
	for (var j = 0; j < y.length && y[j][0] <= w2; ++j) {
		var l = y[j][1] + 1;
		if (x[l][0] != l)
			throw Error("Bug!!!");
		// find the parent of y[j][1]
		var k = y[j][1], tmp = [];
		while (x[k][0] != k)
			tmp.push((k = x[k][0]));
		for (var i = 0; i < tmp.length; ++i) x[tmp[i]][0] = k;
		// test merge
		if ((x[l][2] - x[k][1]) / (x[l][3] + x[k][3]) > w1) continue;
		// merge l into k
		x[k][2] = x[l][2]; x[k][3] += x[l][3];
		x[l][0] = k;
	}
	y.length = 0;
	return x;
}

for (var seq in seqs) {
	var x = cluster(seqs[seq], w1, w2);
	for (var i = 0; i < x.length; ++i)
		if (x[i][0] == i && x[i][3] >= min_n)
			print(seq, x[i][1], x[i][2], x[i][3]);
}
#!/usr/bin/env perl

use strict;
use warnings;

while (<>) {
	next if /^#/;
	next if /INDEL/;
	my @t = split("\t");
	my ($hwe, $f) = (1, 0);
	my @g3;
	my $af = ($t[7] =~ /AF1=([^;]+)/)? $1 : 0;
	if ($t[7] =~ /G3=([^;,]+),([^;,]+),([^;,]+).*HWE=([^;,]+)/) {
		$hwe = $4 == 0? 1e-300 : $4;
		my $p = $1 + .5 * $2;
		$f = ($p > 0 && $p < 1)? 1 - $2 / (2 * $p * (1-$p)) : 0;
		$hwe = -4.343 * log($hwe);
		@g3 = ($1, $2, $3);
	} else {
		$hwe = 0;
	}
	$t[7] =~ /DP=(\d+)/;
	my $dp = $1;
	print join("\t", $t[0], $t[1], $t[5], $dp, sprintf("%.4g\t%.4g\t%.4f", $af, $f, $hwe), @g3), "\n";
}
#
# generate 01compositional.bed.gz
#

# low-complexity by mDUST
mdust hs37d5.fa -c -w7 -v28 \
  | hs37d5.mdust-w7-v28.txt \
  | cut -f1,3,4 \
  | gawk -vOFS="\t" '{--$2;print}' \
  | bgzip > 01compositional/hs37d5.mdust-w7-v28.bed.gz

# long homopolymer runs
seqtk hrun hs37d5.fa \
  | bgzip > 01compositional/hs37d5.hrun.bed.gz

# satellite sequences reported by RepeatMasker. rmsk.txt.gz is downloaded from UCSC.
zcat rmsk.txt.gz \
  | grep Satellite \
  | cut -f6,7,8 \
  | sed s,^chr,, \
  | perl -pe 's/^[^\s_]+_([^\s_]+)_random/$1.1/' \
  | tr "gl" "GL" \
  | sort-alt -k1,1N -k2,2n \
  | bgzip > 01compositional/hs37d5.satellite.bed.gz

# Low-complexity regions identified by RepeatMasker
zcat rmsk.txt.gz \
  | grep Low_complexity \
  | cut -f6,7,8 \
  | sed s,^chr,, \
  | perl -pe 's/^[^\s_]+_([^\s_]+)_random/$1.1/' \
  | tr "gl" "GL" \
  | sort-alt -k1,1N -k2,2n \
  | bgzip > 01compositional/hs37d5.rmsk-lc.bed.gz

# merge the four filters above with 10bp flanking sequences. bedmerge can be replaced by bedtools
zcat hs37d5.hrun.bed.gz hs37d5.rmsk-lc.bed.gz hs37d5.satellite.bed.gz hs37d5.mdust-w7-v28.bed.gz \
  | sort-alt -k1,1N -k2,2n \
  | gawk -v OFS="\t" '{$2=$2<10?0:$2-10;$3+=10;print}' \
  | bioutils.lua bedmerge \
  | bgzip > 01compositional.bed.gz

#
# generate 02structural.bed.gz
#

bcftools view -D hs37d5.fasta.fai -GeS unflt.vcf.gz \
  | ./filter.pl \
  | awk '$6<0&&($7>=100||($7>=50&&((/^[0-9]/&&$4>=22000)||(/^X/&&$4>=19000))))' \
  | k8 clustreg.js \
  | gawk -v OFS="\t" '{$2=$2<150?0:$2-150;$3+=150;print}' \
  | bioutils.lua bedmerge \
  | bgzip > 02structural.bed.gz

#
# Mappability filter is generated with instructions here:
#   http://lh3lh3.users.sourceforge.net/snpable.shtml
#

#
# Merge compositional, structural and mappability filters
#
zcat 01compositional/01compositional.bed.gz 02structural/02structural.bed.gz 03mappable/hs37d5.mask75_50.bed.gz \
  | sort-alt -k1,1N -k2,2n \
  | bioutils.lua bedmerge \
  | k8 merge-close.js \
  | bgzip > um75-hs37d5.bed.gz

以上是关于sh 用于为hs37d5创建通用掩码的脚本和命令行的主要内容,如果未能解决你的问题,请参考以下文章

sh 从命令行复制和重命名文件(用于NPM脚本)。需要https://github.com/mysticatea/cpx

模板匹配:为 minMaxLoc 创建掩码的有效方法?

sh 用于在OS X命令行上全局链接sublime文本的Shell脚本

子网掩码是啥?

sh 用于递归删除的Linux命令行脚本(使用`sudo ln -sv $ HOME / .bin / rmr.sh / usr / local / sbin`链接)

非分类掩码的 UNet 损失函数?