python metasoft等位基因比对(对NA的调整)
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python metasoft等位基因比对(对NA的调整)相关的知识,希望对你有一定的参考价值。
#!/usr/bin/env python
import math
########################################################
# plink2metasoft.py
# Convert Plink .assoc files to Metasoft input file
# Free license -- you are free to use it in any ways
# Buhm Han (2012)
########################################################
import sys, subprocess, os
comple = {'A':'T', 'T':'A', 'G':'C', 'C':'G'}
# PROCESS ARGUMENTS
if len(sys.argv) < 3:
print_and_exit()
out=sys.argv[1]
files=sys.argv[2:]
# READ FILES
studies=[]
for f in files:
study={}
fin=open(f)
colnames=fin.next().split()
for line in fin:
snp={}
for (x,y) in zip(colnames,line.split()):
snp[x]=y
rsid=snp['SNP']
study[rsid]=snp
fin.close()
studies.append(study)
# UNION OF SNPS
allsnps={}
for study in studies:
for rsid, snp in study.iteritems():
allsnps[rsid]=snp
# SORT SNPS
rsids=sorted(allsnps.keys(),
) # key=lambda x:(allsnps[x]['CHR'],int(allsnps[x]['BP'])))
# MERGE STUDIES
fout=open(out+'.meta','w') #open(out+'.meta.tmp','w')
fmap=open(out+'.mmap','w')
flog=open(out+'.log','w')
for rsid in rsids:
output=rsid+'\t'
pivot=-1
pivotstudyindex=-1
numstudy=0
for study in studies:
studyindex=studies.index(study)+1
if rsid in study:
snp=study[rsid]
if 'OR' in snp:
beta1=float(snp['OR'])
if beta1 == 0 :
beta='NA'
stderr='NA'
#Do not convert to log scale if OR is 0, just consider it invalid.
else:
beta=math.log(beta1)
beta=str(beta) #'log(%s)'%snp['OR']
stderr=snp['SE']
elif 'BETA' in snp:
beta=snp['BETA']
stderr=snp['SE']
else:
assert 0, 'OR or BETA must be in columns'
#if 'P' in snp:
#p=snp['P']
#else:
#assert 0, 'P must be in columns'
#'abs(%s/qnorm(%s/2))'%(beta,p)
#add some error handling to deal with NA ses...
se1=snp['SE']
if se1 != "NA":
se1=float(se1)
else:
stderr='NA'
if se1 == 0:
stderr='NA'
print 'SE of 0 found for ' , snp['SNP']
if pivot == -1:
pivot=snp # 1ST STUDY's SNP INFO IS PIVOT
pivotstudyindex=studyindex
else:
# CHECK ALLELE TO PIVOT
if 'A2' in pivot and 'A2' in snp and beta != "NA":
if pivot['A1'] == snp['A1'] and \
pivot['A2'] == snp['A2']:
# GOOD
pass
elif pivot['A1'] == snp['A2'] and \
pivot['A2'] == snp['A1']:
# SIMPLE FLIP
beta=str(-1*float(beta))
elif snp['A1'] not in comple or snp['A2'] not in comple:
beta='NA'
stderr='NA'
#Adam: Really hard to deal with multi allelics on different strands, i won't even bother with this shit
flog.write('Due to non acgt, cant align alleles for %s in study %d\n'%(rsid,studyindex))
elif pivot['A1'] == comple[snp['A1']] and \
pivot['A2'] == comple[snp['A2']]:
# STRAND INCONSIS., BUT GOOD
flog.write('FLIP_STRAND %s in study %d\n'%(rsid,studyindex))
elif pivot['A1'] == comple[snp['A2']] and \
pivot['A2'] == comple[snp['A1']]:
# STRAND INCONSIS., SIMPLE FLIP
flog.write('FLIP_STRAND %s in study %d\n'%(rsid,studyindex))
beta=str(-1*float(beta))
else:
print 'SNP' , snp['SNP'], 'Error. Expected alleles', pivot['A1'], pivot['A2'],' but got', snp['A1'], snp['A2']
flog.write('EXCLUDE %s due to allele inconsistency: A1:%s A2:%s in study %d but A1:%s A2:%s in study %d\n'
%(rsid, pivot['A1'], pivot['A2'], pivotstudyindex,
snp['A1'], snp['A2'], studyindex))
beta='NA'
stderr='NA'
#If you can't flip it, set it to NA.
else:
if pivot['A1'] == snp['A1']:
# GOOD
#This code exists if you DO NOT have an A2 allele in your results - just assume that if A1 matches both, that the second allele is OK.
pass
else:
flog.write('EXCLUDE %s due to allele inconsistency: A1:%s in study %d but A1:%s in study %d\n'
%(rsid, pivot['A1'], pivotstudyindex,
snp['A1'], studyindex))
beta='NA'
stderr='NA'
#If you can't flip it, set it to NA.
# CHECK CHR & BP TO PIVOT
#if pivot['CHR'] != snp['CHR']:
# flog.write('WARNING %s has chr inconsistency\n'%rsid)
#if pivot['BP'] != snp['BP']:
# flog.write('WARNING %s has basepair inconsistency\n'%rsid)
output+=beta+' '+stderr+' '
numstudy+=1
else:
output+='NA NA '
#If a marker isn't found, just output NA
if numstudy == 1:
flog.write('EXCLUDE %s due to being in single study\n'%rsid)
else:
fout.write(output+'\n')
fmap.write('%s\t%s\t%d\n'% # fmap.write('%s\t%s\t%s\t%s\t%d\n'%
(rsid, pivot['A1'], numstudy)) # (rsid, pivot['CHR'], pivot['BP'], pivot['A1'], numstudy))
fout.close()
fmap.close()
flog.close()
# CALL R TO EVALUATE MATH EXPRESSION
#subprocess.call(['R --vanilla --slave "--args '+out+'.meta.tmp '+out+'.meta" < plink2metasoft_subroutine.R'],shell=True)
#subprocess.call(['rm '+out+'.meta.tmp'], shell=True)
以上是关于python metasoft等位基因比对(对NA的调整)的主要内容,如果未能解决你的问题,请参考以下文章
python 从HapMap 2010-08_phaseII + III下载等位基因频率并构建一个sqlite数据库以便于访问。
双等位基因(biallelic sites )和多等位基因(multiallelic sites)