sh 推测单倍型用于局部血统分析

Posted 2021-05-06
tags:
篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了sh 推测单倍型用于局部血统分析相关的知识，希望对你有一定的参考价值。
wd=/home/maihofer/haps/
cd $wd

#Datasets that have (x) or need (y) to rephase haplotype data
 # safr y
 # mrsc y
 # dnhs y
 # gsdc y
 # fscd y
 # nss1 y
 # nss2 y
 # pts1 x
 # stro x
 # psy3 x
 # meg2 x
 # gtpc x
 # comc x
 # grac x
 # wrby x
 # cogb x  
 
 sha2=/home/maihofer/haps/shapeit.v2.900.glibcv2.12.linux/bin/shapeit
 imp2=/home/maihofer/haps/impute_v2.3.2_x86_64_static/impute2
 refdir="$wd"/refdat
 datdir="$wd"/mydata
 outdir="$wd"/imputed
 
 
 sdatdir="$wd"/mydata/shapeit
 soutdir="$wd"/mydata/

 
 mkdir errandout

##For data with haplotypes available;  extract haplotypes
 cd $datdir

 for chunk in 9_138_141 6_030_033
 do
  for study in pts1 stro psy3 meg2 gtpc comc grac wrby cogb
  do
   tar xvzf /archive/maihofer/"$study"_ha_v1_oct6_2016.tgz  --wildcards "*$chunk*" --strip-components=5
  done
 done

##For data without haps; extract aligned data, phase data
 cd $sdatdir
  for chunk in 9_138_141 6_030_033
 do
  for study in safr mrsc dnhs gsdc fscd nss1 nss2 
  do
   tar xvzf /archive/maihofer/"$study"_ha_v1_oct6_2016.tgz  --wildcards "*$chunk*" --strip-components=5
  done
 done

 

 #option for chr 9

 for chunk in 6_030_033 # 9_138_141  # chr13_054_057 #chr 6 needs all datasets
 do 
  ls $sdatdir | grep .bim | sed 's/.bim//g' | grep chr"$chunk" | grep nss1  > "$chunk"_shapeitdatfile.txt  #| grep -P "mrsc|nss1|nss2|prom|stro"
  sdatfile="$chunk"_shapeitdatfile.txt
  chr=$(echo $chunk | awk 'BEGIN { FS = "_" } ; { print $1 }')
  mapfile=genetic_map_chr"$chr"_combined_b37.txt
  ncommands=$(wc -l $sdatfile | awk '{print $1}')
  nodeuse=1 # $(($nodesize - 1)) #This many cores will be used at once
  jobsize=1 #this many samples will run in a given job
 #Total number of jobs = Number of commands / number of commands used per job (i'll say to do 100 at a time), rounded up 
  totjobs=$((( ($ncommands + $nodeuse - 1 ) / $jobsize + 1) - 1 ))
  force=yes #set to xxxx if you dont want the --force option enabled
  qsub -t1-$totjobs -lwalltime=02:30:00 shapeit2.pbs -e errandout/ -o errandout -d $wd -F "-i $sha2 -r $refdir -m $mapfile -d $sdatfile -k $sdatdir  -o $soutdir -j $jobsize -n 1 -f $force"
 done

##After data is phased, do imputation
echo "rs142174523" > 6_030_033.snplist 
echo "rs9410042
rs3123508" >  9_138_141.snplist

  for chunk in 6_030_033 #9_138_141
  do
  ls mydata | grep .haps | grep chr"$chunk"| grep .haps$ | grep -P "nss1" > "$chunk"_toimpute.txt #need to add the .haps files for data that you imputed, wont be split otherwise. maybe just take haps file without split

  datfile="$chunk"_toimpute.txt
  ncommands=$(wc -l $datfile | awk '{print $1}')
  chr=$(echo $chunk | awk 'BEGIN { FS = "_" } ; { print $1 }')
  mapfile=genetic_map_chr"$chr"_combined_b37.txt
  
  nodeuse=4 # $(($nodesize - 1)) #This many cores will be used at once
  jobsize=4 #this many samples will run in a given job
 #Total number of jobs = Number of commands / number of commands used per job (i'll say to do 100 at a time), rounded up 
  totjobs=$((( ($ncommands + $nodeuse - 1 ) / $jobsize + 1) - 1 ))

  reffile=sc_my.1000GP_Phase3_chr"$chunk"_1000.impute.phased.bgl
  start1=$(echo $chunk | awk 'BEGIN { FS = "_" } ; { print $2 }')
  start2=$((10#$start1))
  stop1=$(echo $chunk | awk 'BEGIN { FS = "_" } ; { print $3 }')
  stop2=$((10#$stop1))
  start=$((start2*1000000))
  stop=$((stop2*1000000))
  buffer=1000
  snplist="$chunk".snplist
  qsub -t1-$totjobs  -lwalltime=00:35:00 impute2.pbs -e errandout/ -o errandout -d $wd -F "-i $imp2 -r $refdir -m $mapfile -h $reffile -d $datfile -k $datdir  -s $start -p $stop -b $buffer -o $outdir -j $jobsize -n $nodeuse -l $snplist"
 done
 qsub -t1   -lwalltime=00:15:00 impute2.pbs -e errandout/ -o errandout -d $wd -F "-i $imp2 -r $refdir -m $mapfile -h $reffile -d $datfile -k $datdir  -s $start -p $stop -b $buffer -o $outdir -j $jobsize -n $nodeuse -l $snplist"


##After grep, join .fam file to .haps file
#This might just be a time waster?? This stuff is never called upon again...
# mkdir temp
# snp=rs9410042
# chunk=9_138_141 
# for study in mrsc nss1 nss2 stro PROM # safr mrsc dnhs gsdc fscd nss1 nss2 pts1 stro psy3 meg2 gtpc comc grac wrby cogb PROM
# do
  # wc -l /home/maihofer/haps/mydata/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".fam
  # awk '{print NR,$1,$2}' /home/maihofer/haps/mydata/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".fam > temp/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".fam.s1
  # cat temp/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".fam.s1 temp/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".fam.s1 | sort -n -k1 | awk '{print $2,$3}' > temp/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".fam.s2
  # grep $snp imputed/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".haps_haps | perl /home/maihofer/localanc/transpose.pl > temp/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".haps_haps_transpose
  # wc -l temp/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".fam.s2
  # tail -n+6 temp/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".haps_haps_transpose | wc -l 
  # paste <(tail -n+6 temp/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".haps_haps_transpose) > temp/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".fam.s2 
# done

##Align my hapfile and elizabeth's hapfile

#Pull out elizabeths hapfiles and ancfiles

 chr=9
 
for files in $(cat  elizabeth_ancfilelocations_haps2.csv | sed "s/chr13/chr"$chr"/g" | grep -P "mrsc|nss1|nss2|PROM|stro"  )
do
 study=$(echo $files | awk -F, '{print $1}')
 folder=$(echo $files | awk -F, '{print $2}')
 haps=$(echo $files | awk -F, '{print $4}' | sed 's/haps//g' )
 msp=$(echo $files | awk -F, '{print $3}')
 echo "$study | $folder | $msp"
 cp "$folder"/"$msp" ~/localanc/msp/"$study"_"$chr".msp.tsv
# cp "$folder"/"$haps"haps ~/localanc/haplotypes/"$study"_"$chr".haps
 #cp "$folder"/"$haps"sample  ~/localanc/haplotypes/"$study"_"$chr".sample
done

#From anc calls, get the top hit ancestry info
 #lat tophit
 chr=9
 snp=rs9410042
 snp_pos=140627715
 
 #lat male tophit
 chr=9
 snp=rs3123508
 snp_pos=140624309
 
 for study in  mrsc nss1 nss2 ppds PROM stro  # 
 do
  awk -v snp_pos=$snp_pos '{if(NR==2 || ($2 <= snp_pos && $3 >= snp_pos)) print}'  ~/localanc/msp/"$study"_"$chr".msp.tsv | cut -f7- > temp/"$snp"_"$study".msp
  nsub=$(awk 'NR==1{print NF}' temp/"$snp"_"$study".msp )
  perl ~/localanc/transpose.pl <(cat  temp/"$snp"_"$study".msp ) > temp/"$snp"_"$study".msp.an1
  paste <(cut -f 1 temp/"$snp"_"$study".msp.an1 | sed 's/[.]0$//g' | sed 's/[.]1$//g' ) temp/"$snp"_"$study".msp.an1 > ~/localanc/mspchr"$chr"/"$snp"_"$study".msp.an2

 done
 cp ~/localanc/mspchr"$chr"/"$snp"_nss1.msp.an2 ~/localanc/mspchr"$chr"/"$snp"_ppds.msp.an2
 cp ~/localanc/mspchr"$chr"/"$snp"_meg2.msp.an2 ~/localanc/mspchr"$chr"/"$snp"_maf1.msp.an2
 cp ~/localanc/mspchr"$chr"/"$snp"_meg2.msp.an2 ~/localanc/mspchr"$chr"/"$snp"_maf2.msp.an2
 cp ~/localanc/mspchr"$chr"/"$snp"_meg2.msp.an2 ~/localanc/mspchr"$chr"/"$snp"_maf3.msp.an2
 
 
 

 


#Use an IF statement to flip sort the mrs .fam file in elizabeths header file data
#Format each dataset to overlapping subjects and SNPs
 snp=rs115539978
 chr=13
  snp=rs9410042
 chunk=9_138_141 
 chr=9
 mkdir "$snp"_elizabeth
 
 for study in   mrsc nss1 nss2 stro PROM # safr mrsc dnhs gsdc fscd nss1 nss2 pts1 stro psy3 meg2 gtpc comc grac wrby cogb
 do
  echo $study
  #Identify all SNPs in elizabeths data
   awk '{print $1}' /home/maihofer/localanc/haplotypes/"$study"_"$chr".haps | LC_ALL=C sort -k1b,1 > "$snp"_elizabeth/"$snp"_"$study"_elizabeth.snplist
  #Identify all SNPs in my data
   awk '{print $2}' imputed/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".haps_haps | LC_ALL=C sort -k1b,1  > "$snp"_elizabeth/"$snp"_"$study"_adam.snplist
  #Identify overlapping SNPs
   LC_ALL=C join "$snp"_elizabeth/"$snp"_"$study"_elizabeth.snplist "$snp"_elizabeth/"$snp"_"$study"_adam.snplist > "$snp"_elizabeth/"$snp"_"$study"_overlap.snplist
   
  #Filter each dataset to overlapping SNPs
   LC_ALL=C join <(LC_ALL=C sort -k1b,1 /home/maihofer/localanc/haplotypes/"$study"_"$chr".haps) "$snp"_elizabeth/"$snp"_"$study"_overlap.snplist | awk '{OFS=" "} {$1=$1; print}' > /home/maihofer/haps/"$snp"_elizabeth/"$study"_"$chr".elizabeth_haps_overlap
   LC_ALL=C join -1 2 <(LC_ALL=C sort -k2b,2 imputed/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".haps_haps) "$snp"_elizabeth/"$snp"_"$study"_overlap.snplist  > /home/maihofer/haps/"$snp"_elizabeth/"$study"_"$chr".adam_haps_overlap
   
  #Identify subjects in my data (NOTE: this is one place where things can go very wrong, if somehow subjects were dropped from .fam 
  #Make a duplicate of .fam file, order it, so that it gives each subject 2 entries. Join this to .haps file
   awk '{print $1"_"$2, NR}' /home/maihofer/haps/mydata/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".fam > /home/maihofer/haps/"$snp"_elizabeth/"$study"_adam.subjects
  
   cat /home/maihofer/haps/"$snp"_elizabeth/"$study"_adam.subjects /home/maihofer/haps/"$snp"_elizabeth/"$study"_adam.subjects | sort -n -k2 | awk '{print $1}' | perl /home/maihofer/localanc/transpose.pl | awk '{OFS=" "} {$1=$1; print}' > /home/maihofer/haps/"$snp"_elizabeth/"$study"_adam.subheader
   echo "SNP CHR BP A1 A2" | paste -d " " - /home/maihofer/haps/"$snp"_elizabeth/"$study"_adam.subheader > /home/maihofer/haps/"$snp"_elizabeth/"$study"_adam.subheader2
   
   cat /home/maihofer/haps/"$snp"_elizabeth/"$study"_adam.subheader2 /home/maihofer/haps/"$snp"_elizabeth/"$study"_"$chr".adam_haps_overlap > /home/maihofer/haps/"$snp"_elizabeth/"$study"_"$chr".adam_haps_overlap_wheader
  
  #Do the same in elizabeths
  
   tail -n+3 /home/maihofer/localanc/haplotypes/"$study"_"$chr".sample | awk '{print $1"_"$2,NR}' > /home/maihofer/haps/"$snp"_elizabeth/"$study"_elizabeth.subjects
  #Add in exception handling for MRS to swap the IDs back
   if [ $study == "mrsc" ]
    then
     awk '{print $1,$2}' /home/maihofer/haps/mydata/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".fam  | LC_ALL=C sort -k1b,1 > temp/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".famcols
     tail -n+3 /home/maihofer/localanc/haplotypes/"$study"_"$chr".sample | awk '{print $1,NR}' | LC_ALL=C sort -k1b,1 > "$study"_"$chr".sample_s1
     LC_ALL=C join -e X -o auto -a2 temp/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".famcols "$study"_"$chr".sample_s1 | sort -n -k 3 | awk '{print $1"_"$2,NR}' > /home/maihofer/haps/"$snp"_elizabeth/"$study"_elizabeth.subjects  #Must have matching N for join!!!
    fi
    
   cat /home/maihofer/haps/"$snp"_elizabeth/"$study"_elizabeth.subjects /home/maihofer/haps/"$snp"_elizabeth/"$study"_elizabeth.subjects | sort -n -k2 | awk '{print $1}' | perl /home/maihofer/localanc/transpose.pl | awk '{OFS=" "} {$1=$1; print}' > /home/maihofer/haps/"$snp"_elizabeth/"$study"_elizabeth.subheader
   echo "SNP SNPA BP A1 A2" | paste -d " " - /home/maihofer/haps/"$snp"_elizabeth/"$study"_elizabeth.subheader > /home/maihofer/haps/"$snp"_elizabeth/"$study"_elizabeth.subheader2
   cat /home/maihofer/haps/"$snp"_elizabeth/"$study"_elizabeth.subheader2 /home/maihofer/haps/"$snp"_elizabeth/"$study"_"$chr".elizabeth_haps_overlap > /home/maihofer/haps/"$snp"_elizabeth/"$study"_"$chr".elizabeth_haps_overlap_wheader
   
 
  #Make a copy of just the target SNP haplotype data, with a header
  echo "CHR SNP BP A1 A2" | paste -d " " - /home/maihofer/haps/"$snp"_elizabeth/"$study"_adam.subheader > /home/maihofer/haps/"$snp"_elizabeth/"$study"_adam.subheader_fulldat
   cat /home/maihofer/haps/"$snp"_elizabeth/"$study"_adam.subheader_fulldat <(grep $snp imputed/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".haps_haps) > "$snp"_elizabeth/"$study"_"$snp".adam_haps_overlap_wheader
 done
 
 #Check back to .fam and .sample files to see if subjects really dont just line up, i have it that the lizabeth ones are differently ordered form mine..
 
 #Give orientation requirement for MY data to match elizabeths
 for study in  mrsc nss1 nss2 stro PROM  # safr mrsc dnhs gsdc fscd nss1 nss2 pts1 stro psy3 meg2 gtpc comc grac wrby cogb
 do
  echo $study
  Rscript haps_comp_v3.r "$snp"_elizabeth/"$study"_"$chr".elizabeth_haps_overlap_wheader "$snp"_elizabeth/"$study"_"$chr".adam_haps_overlap_wheader  "$snp"_elizabeth/"$study"_"$chr".hapcolmatch
 done
 
 #Copy NSS to be ppds.
 #Fix MRS by ?? flipping identities
 
 #Use orientation to flip haplotype data to the correct position
 for study in  mrsc nss1 nss2 stro PROM  #   safr mrsc dnhs gsdc fscd nss1 nss2 pts1 stro psy3 meg2 gtpc comc grac wrby cogb
 do
  echo $study
    echo "CHR SNP BP A1 A2" | paste -d " " - /home/maihofer/haps/"$snp"_elizabeth/"$study"_adam.subheader > /home/maihofer/haps/"$snp"_elizabeth/"$study"_adam.subheader_fulldat
   cat /home/maihofer/haps/"$snp"_elizabeth/"$study"_adam.subheader_fulldat <(grep $snp imputed/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".haps_haps) > "$snp"_elizabeth/"$study"_"$snp".adam_haps_overlap_wheader
 
  Rscript haps_align.r "$snp"_elizabeth/"$study"_"$chr".hapcolmatch  "$snp"_elizabeth/"$study"_"$snp".adam_haps_overlap_wheader   "$snp"_elizabeth/"$study"_"$snp".adam_haps_overlap_wheader_lizorder
 done
 
 cp "$snp"_elizabeth/nss1_"$snp".adam_haps_overlap_wheader_lizorder "$snp"_elizabeth/ppds_"$snp".adam_haps_overlap_wheader_lizorder
 cp "$snp"_elizabeth/meg2_"$snp".adam_haps_overlap_wheader_lizorder "$snp"_elizabeth/maf1_"$snp".adam_haps_overlap_wheader_lizorder
 cp "$snp"_elizabeth/meg2_"$snp".adam_haps_overlap_wheader_lizorder "$snp"_elizabeth/maf2_"$snp".adam_haps_overlap_wheader_lizorder
 cp "$snp"_elizabeth/meg2_"$snp".adam_haps_overlap_wheader_lizorder "$snp"_elizabeth/maf3_"$snp".adam_haps_overlap_wheader_lizorder
 
 
 
 
 #Get association statistics
 #make copies of meg2, nss1, to make extra datasets
 ##AAM tophit
 region=chr13_054_057
 chr=13
 snpr=rs115539978
 ancgroup=aam
 gender=mf
 
 region=chr9_138_141 
 chr=9
 snp=rs9410042
 ancgroup=lat
 gender=mf
 
#note: gender doesnt do anything here, only does for the actual combined haps analysis
#note: need global pcs
#Half of MRS subjects are gone for some fucking reason

 for study in ppds # mrsc nss1 nss2 stro PROM  #  maf1 maf2 maf3 safr mrsc dnhs gsdc cogb fscd nss1 ppds nss2 pts1 stro psy3 gtpc comc grac wrby # note: mrsc - may require columns to be flipped around. meg2 not analyzed because maf1 maf2 maf3 are analyzed instead
 do
 echo $study
  Rscript ../localanc/1_association_test_haplotype.r ../localanc/mspchr"$chr"/"$snp"_"$study".msp.an2 "$snp"_elizabeth/"$study"_"$snp".adam_haps_overlap_wheader_lizorder "$snp"   ../localanc/mds/all_"$ancgroup"/pts_"$study"_mix_am-qc-"$ancgroup"_pca.menv.mds_cov ../localanc/genotypes/pts_"$study"_mix_am-qc.fam $gender "$study" temp/"$snp"_"$study"_"$gender"_dose.txt
 done

 cat temp/"$snp"_*_"$gender"_dose.txt |  awk '{if (NR == 1 || $1 != "\"FID_Hap\"") print }'  > temp/"$snp"_"$gender"_dose_alldata.txt
 Rscript ../localanc/1_association_combinedhaps.r  temp/"$snp"_"$gender"_dose_alldata.txt $snp $gender allstudies.predpc tables_"$snp"_"$gender"_dose_alldata.txt
 
 #Sanity check: in SAFR, the allele is mono in europeans, and 14% in africans
 
 
 #Sanity check: The fact that the NEITHER error is rarely happening means that my data is aligned. Possibly output those subject names to a file, AS WELL AS the ones from my 
 #Sanity check : include max correspondence rate in the output file
 #Sanity check: The flip rate is about 50%, what I would expect for problem like this
 
 #Compare each file in R by subject, 
 
 
 #Maybe do this on SNP join, then transpose and subject join? First just put the fucking subject names in the file instead of screwing yourself over joining on IDs.

   
 
 ##Do basically the same thing, but for all SNPs

 
#Need to align my phasing with elizabeths (jesus FUCKING christ)
  Her .haps files have the actual study subjects first, then the reference
  #must make sure samples align too..
  
  Easiest correspondence check? Only easy if ref and alt alleles stay the same. Retain from each data where markers match (cols 4 and 5).
  #Take overlapping marker set in the right order. Filter to correc... this need sto be done on a per subject level this is a fucking nightmare
  #Get overlapping marker set and overlapping subjects. Check order of my .fams vs her .sample files to make sure the match
 #backup:impute a single da taset
 
 
  datname=$dataset
  mapfile=genetic_map_chr6_combined_b37.txt
  reffile=sc_my.1000GP_Phase3_"$chunk"_1000.impute.phased.bgl
  start=54000000
  stop=57000000 
  qsub -lwalltime=00:50:00 impute.pbs -e errandout/ -o errandout -d $wd -F "-i $imp2 -r $refdir -m $mapfile -h $reffile -d $datname -k $datdir  -s $start -p $stop -b $buffer -o $outdir"
 
 
#
以上是关于sh 推测单倍型用于局部血统分析的主要内容，如果未能解决你的问题，请参考以下文章