python metasoft等位基因比对(对NA的调整)

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python metasoft等位基因比对(对NA的调整)相关的知识,希望对你有一定的参考价值。

#!/usr/bin/env python
import math
########################################################
# plink2metasoft.py                                    
#   Convert Plink .assoc files to Metasoft input file  
#   Free license -- you are free to use it in any ways 
#   Buhm Han (2012)                                    
########################################################

import sys, subprocess, os

comple = {'A':'T', 'T':'A', 'G':'C', 'C':'G'}

# PROCESS ARGUMENTS
if len(sys.argv) < 3:
    print_and_exit()
out=sys.argv[1]
files=sys.argv[2:]

# READ FILES
studies=[]
for f in files:
    study={}
    fin=open(f)
    colnames=fin.next().split()
    for line in fin:
        snp={}
        for (x,y) in zip(colnames,line.split()):
            snp[x]=y
        rsid=snp['SNP']
        study[rsid]=snp
    fin.close()
    studies.append(study)
    
# UNION OF SNPS
allsnps={}
for study in studies:
    for rsid, snp in study.iteritems():
        allsnps[rsid]=snp

# SORT SNPS
rsids=sorted(allsnps.keys(), 
             ) #  key=lambda x:(allsnps[x]['CHR'],int(allsnps[x]['BP'])))

# MERGE STUDIES
fout=open(out+'.meta','w') #open(out+'.meta.tmp','w') 
fmap=open(out+'.mmap','w')
flog=open(out+'.log','w')  
for rsid in rsids:
    output=rsid+'\t'
    pivot=-1
    pivotstudyindex=-1
    numstudy=0
    for study in studies:
        studyindex=studies.index(study)+1
        if rsid in study:
            snp=study[rsid]
            if 'OR' in snp:
                beta1=float(snp['OR'])
                
                if beta1 == 0 :
                    beta='NA'
                    stderr='NA'
                    #Do not convert to log scale if OR is 0, just consider it invalid.
                else:
                    beta=math.log(beta1)
                    beta=str(beta) #'log(%s)'%snp['OR']
                    stderr=snp['SE']
            elif 'BETA' in snp:
                beta=snp['BETA']
                stderr=snp['SE']
            else:
                assert 0, 'OR or BETA must be in columns'
            #if 'P' in snp:
                #p=snp['P']
            #else:
                #assert 0, 'P must be in columns'
             #'abs(%s/qnorm(%s/2))'%(beta,p)
             
            #add some error handling to deal with NA ses...
            se1=snp['SE']
            if se1 != "NA":
                se1=float(se1)
            else:    
                stderr='NA'
            if se1 == 0:
                stderr='NA'
                print 'SE of 0 found for ' , snp['SNP']
            if pivot == -1:
                pivot=snp # 1ST STUDY's SNP INFO IS PIVOT
                pivotstudyindex=studyindex
            else:
                # CHECK ALLELE TO PIVOT
                
                if 'A2' in pivot and 'A2' in snp and beta != "NA": 
                    if pivot['A1'] == snp['A1'] and \
                       pivot['A2'] == snp['A2']:
                        # GOOD
                        pass
                    elif pivot['A1'] == snp['A2'] and \
                         pivot['A2'] == snp['A1']:
                        # SIMPLE FLIP
                        beta=str(-1*float(beta))
                    elif snp['A1'] not in comple or snp['A2'] not in comple: 
                         beta='NA'
                         stderr='NA'
                         #Adam: Really hard to deal with multi allelics on different strands, i won't even bother with this shit
                         flog.write('Due to non acgt, cant align alleles for %s in study %d\n'%(rsid,studyindex))
                    elif pivot['A1'] == comple[snp['A1']] and \
                         pivot['A2'] == comple[snp['A2']]:
                        # STRAND INCONSIS., BUT GOOD
                        flog.write('FLIP_STRAND %s in study %d\n'%(rsid,studyindex))
                    elif pivot['A1'] == comple[snp['A2']] and \
                         pivot['A2'] == comple[snp['A1']]:
                        # STRAND INCONSIS., SIMPLE FLIP
                        flog.write('FLIP_STRAND %s in study %d\n'%(rsid,studyindex))
                        beta=str(-1*float(beta))

                  
                    else:
                        print 'SNP' , snp['SNP'], 'Error. Expected alleles', pivot['A1'], pivot['A2'],' but got',  snp['A1'], snp['A2']
                        flog.write('EXCLUDE %s due to allele inconsistency: A1:%s A2:%s in study %d but A1:%s A2:%s in study %d\n'
                                   %(rsid, pivot['A1'], pivot['A2'], pivotstudyindex,
                                     snp['A1'], snp['A2'], studyindex))
                        beta='NA'
                        stderr='NA'   
                        #If you can't flip it, set it to NA.
                else: 
                    if pivot['A1'] == snp['A1']:
                        # GOOD
                        #This code exists if you DO NOT have an A2 allele in your results - just assume that if A1 matches both, that the second allele is OK.
                        pass
                    else:
                        flog.write('EXCLUDE %s due to allele inconsistency: A1:%s in study %d but A1:%s in study %d\n'
                                   %(rsid, pivot['A1'], pivotstudyindex,
                                     snp['A1'], studyindex))
                        beta='NA'
                        stderr='NA'   
                        #If you can't flip it, set it to NA.            
                        
                # CHECK CHR & BP TO PIVOT
                #if pivot['CHR'] != snp['CHR']:
                #    flog.write('WARNING %s has chr inconsistency\n'%rsid)
                #if pivot['BP'] != snp['BP']:
                #    flog.write('WARNING %s has basepair inconsistency\n'%rsid)
            output+=beta+' '+stderr+' '
            numstudy+=1
        else:
            output+='NA NA '
            #If a marker isn't found, just output NA
    if numstudy == 1:
        flog.write('EXCLUDE %s due to being in single study\n'%rsid)
    else:
        fout.write(output+'\n')
        fmap.write('%s\t%s\t%d\n'% # fmap.write('%s\t%s\t%s\t%s\t%d\n'%
                   (rsid, pivot['A1'], numstudy)) # (rsid, pivot['CHR'], pivot['BP'], pivot['A1'], numstudy))
fout.close()
fmap.close()
flog.close()

# CALL R TO EVALUATE MATH EXPRESSION
#subprocess.call(['R --vanilla --slave "--args '+out+'.meta.tmp '+out+'.meta" < plink2metasoft_subroutine.R'],shell=True)
#subprocess.call(['rm '+out+'.meta.tmp'], shell=True)


以上是关于python metasoft等位基因比对(对NA的调整)的主要内容,如果未能解决你的问题,请参考以下文章

python 从HapMap 2010-08_phaseII + III下载等位基因频率并构建一个sqlite数据库以便于访问。

双等位基因(biallelic sites )和多等位基因(multiallelic sites)

sh 得到含糊不清的等位基因

sh 获得1000G亚群的等位基因频率

r Rambo:Allelic Richness和Private等位基因通过多次随机减少

以rs开头例如rs3918254是指啥东东?跟等位基因有啥关系呢?