sh 推测单倍型用于局部血统分析
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了sh 推测单倍型用于局部血统分析相关的知识,希望对你有一定的参考价值。
wd=/home/maihofer/haps/
cd $wd
#Datasets that have (x) or need (y) to rephase haplotype data
# safr y
# mrsc y
# dnhs y
# gsdc y
# fscd y
# nss1 y
# nss2 y
# pts1 x
# stro x
# psy3 x
# meg2 x
# gtpc x
# comc x
# grac x
# wrby x
# cogb x
sha2=/home/maihofer/haps/shapeit.v2.900.glibcv2.12.linux/bin/shapeit
imp2=/home/maihofer/haps/impute_v2.3.2_x86_64_static/impute2
refdir="$wd"/refdat
datdir="$wd"/mydata
outdir="$wd"/imputed
sdatdir="$wd"/mydata/shapeit
soutdir="$wd"/mydata/
mkdir errandout
##For data with haplotypes available; extract haplotypes
cd $datdir
for chunk in 9_138_141 6_030_033
do
for study in pts1 stro psy3 meg2 gtpc comc grac wrby cogb
do
tar xvzf /archive/maihofer/"$study"_ha_v1_oct6_2016.tgz --wildcards "*$chunk*" --strip-components=5
done
done
##For data without haps; extract aligned data, phase data
cd $sdatdir
for chunk in 9_138_141 6_030_033
do
for study in safr mrsc dnhs gsdc fscd nss1 nss2
do
tar xvzf /archive/maihofer/"$study"_ha_v1_oct6_2016.tgz --wildcards "*$chunk*" --strip-components=5
done
done
#option for chr 9
for chunk in 6_030_033 # 9_138_141 # chr13_054_057 #chr 6 needs all datasets
do
ls $sdatdir | grep .bim | sed 's/.bim//g' | grep chr"$chunk" | grep nss1 > "$chunk"_shapeitdatfile.txt #| grep -P "mrsc|nss1|nss2|prom|stro"
sdatfile="$chunk"_shapeitdatfile.txt
chr=$(echo $chunk | awk 'BEGIN { FS = "_" } ; { print $1 }')
mapfile=genetic_map_chr"$chr"_combined_b37.txt
ncommands=$(wc -l $sdatfile | awk '{print $1}')
nodeuse=1 # $(($nodesize - 1)) #This many cores will be used at once
jobsize=1 #this many samples will run in a given job
#Total number of jobs = Number of commands / number of commands used per job (i'll say to do 100 at a time), rounded up
totjobs=$((( ($ncommands + $nodeuse - 1 ) / $jobsize + 1) - 1 ))
force=yes #set to xxxx if you dont want the --force option enabled
qsub -t1-$totjobs -lwalltime=02:30:00 shapeit2.pbs -e errandout/ -o errandout -d $wd -F "-i $sha2 -r $refdir -m $mapfile -d $sdatfile -k $sdatdir -o $soutdir -j $jobsize -n 1 -f $force"
done
##After data is phased, do imputation
echo "rs142174523" > 6_030_033.snplist
echo "rs9410042
rs3123508" > 9_138_141.snplist
for chunk in 6_030_033 #9_138_141
do
ls mydata | grep .haps | grep chr"$chunk"| grep .haps$ | grep -P "nss1" > "$chunk"_toimpute.txt #need to add the .haps files for data that you imputed, wont be split otherwise. maybe just take haps file without split
datfile="$chunk"_toimpute.txt
ncommands=$(wc -l $datfile | awk '{print $1}')
chr=$(echo $chunk | awk 'BEGIN { FS = "_" } ; { print $1 }')
mapfile=genetic_map_chr"$chr"_combined_b37.txt
nodeuse=4 # $(($nodesize - 1)) #This many cores will be used at once
jobsize=4 #this many samples will run in a given job
#Total number of jobs = Number of commands / number of commands used per job (i'll say to do 100 at a time), rounded up
totjobs=$((( ($ncommands + $nodeuse - 1 ) / $jobsize + 1) - 1 ))
reffile=sc_my.1000GP_Phase3_chr"$chunk"_1000.impute.phased.bgl
start1=$(echo $chunk | awk 'BEGIN { FS = "_" } ; { print $2 }')
start2=$((10#$start1))
stop1=$(echo $chunk | awk 'BEGIN { FS = "_" } ; { print $3 }')
stop2=$((10#$stop1))
start=$((start2*1000000))
stop=$((stop2*1000000))
buffer=1000
snplist="$chunk".snplist
qsub -t1-$totjobs -lwalltime=00:35:00 impute2.pbs -e errandout/ -o errandout -d $wd -F "-i $imp2 -r $refdir -m $mapfile -h $reffile -d $datfile -k $datdir -s $start -p $stop -b $buffer -o $outdir -j $jobsize -n $nodeuse -l $snplist"
done
qsub -t1 -lwalltime=00:15:00 impute2.pbs -e errandout/ -o errandout -d $wd -F "-i $imp2 -r $refdir -m $mapfile -h $reffile -d $datfile -k $datdir -s $start -p $stop -b $buffer -o $outdir -j $jobsize -n $nodeuse -l $snplist"
##After grep, join .fam file to .haps file
#This might just be a time waster?? This stuff is never called upon again...
# mkdir temp
# snp=rs9410042
# chunk=9_138_141
# for study in mrsc nss1 nss2 stro PROM # safr mrsc dnhs gsdc fscd nss1 nss2 pts1 stro psy3 meg2 gtpc comc grac wrby cogb PROM
# do
# wc -l /home/maihofer/haps/mydata/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".fam
# awk '{print NR,$1,$2}' /home/maihofer/haps/mydata/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".fam > temp/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".fam.s1
# cat temp/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".fam.s1 temp/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".fam.s1 | sort -n -k1 | awk '{print $2,$3}' > temp/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".fam.s2
# grep $snp imputed/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".haps_haps | perl /home/maihofer/localanc/transpose.pl > temp/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".haps_haps_transpose
# wc -l temp/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".fam.s2
# tail -n+6 temp/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".haps_haps_transpose | wc -l
# paste <(tail -n+6 temp/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".haps_haps_transpose) > temp/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".fam.s2
# done
##Align my hapfile and elizabeth's hapfile
#Pull out elizabeths hapfiles and ancfiles
chr=9
for files in $(cat elizabeth_ancfilelocations_haps2.csv | sed "s/chr13/chr"$chr"/g" | grep -P "mrsc|nss1|nss2|PROM|stro" )
do
study=$(echo $files | awk -F, '{print $1}')
folder=$(echo $files | awk -F, '{print $2}')
haps=$(echo $files | awk -F, '{print $4}' | sed 's/haps//g' )
msp=$(echo $files | awk -F, '{print $3}')
echo "$study | $folder | $msp"
cp "$folder"/"$msp" ~/localanc/msp/"$study"_"$chr".msp.tsv
# cp "$folder"/"$haps"haps ~/localanc/haplotypes/"$study"_"$chr".haps
#cp "$folder"/"$haps"sample ~/localanc/haplotypes/"$study"_"$chr".sample
done
#From anc calls, get the top hit ancestry info
#lat tophit
chr=9
snp=rs9410042
snp_pos=140627715
#lat male tophit
chr=9
snp=rs3123508
snp_pos=140624309
for study in mrsc nss1 nss2 ppds PROM stro #
do
awk -v snp_pos=$snp_pos '{if(NR==2 || ($2 <= snp_pos && $3 >= snp_pos)) print}' ~/localanc/msp/"$study"_"$chr".msp.tsv | cut -f7- > temp/"$snp"_"$study".msp
nsub=$(awk 'NR==1{print NF}' temp/"$snp"_"$study".msp )
perl ~/localanc/transpose.pl <(cat temp/"$snp"_"$study".msp ) > temp/"$snp"_"$study".msp.an1
paste <(cut -f 1 temp/"$snp"_"$study".msp.an1 | sed 's/[.]0$//g' | sed 's/[.]1$//g' ) temp/"$snp"_"$study".msp.an1 > ~/localanc/mspchr"$chr"/"$snp"_"$study".msp.an2
done
cp ~/localanc/mspchr"$chr"/"$snp"_nss1.msp.an2 ~/localanc/mspchr"$chr"/"$snp"_ppds.msp.an2
cp ~/localanc/mspchr"$chr"/"$snp"_meg2.msp.an2 ~/localanc/mspchr"$chr"/"$snp"_maf1.msp.an2
cp ~/localanc/mspchr"$chr"/"$snp"_meg2.msp.an2 ~/localanc/mspchr"$chr"/"$snp"_maf2.msp.an2
cp ~/localanc/mspchr"$chr"/"$snp"_meg2.msp.an2 ~/localanc/mspchr"$chr"/"$snp"_maf3.msp.an2
#Use an IF statement to flip sort the mrs .fam file in elizabeths header file data
#Format each dataset to overlapping subjects and SNPs
snp=rs115539978
chr=13
snp=rs9410042
chunk=9_138_141
chr=9
mkdir "$snp"_elizabeth
for study in mrsc nss1 nss2 stro PROM # safr mrsc dnhs gsdc fscd nss1 nss2 pts1 stro psy3 meg2 gtpc comc grac wrby cogb
do
echo $study
#Identify all SNPs in elizabeths data
awk '{print $1}' /home/maihofer/localanc/haplotypes/"$study"_"$chr".haps | LC_ALL=C sort -k1b,1 > "$snp"_elizabeth/"$snp"_"$study"_elizabeth.snplist
#Identify all SNPs in my data
awk '{print $2}' imputed/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".haps_haps | LC_ALL=C sort -k1b,1 > "$snp"_elizabeth/"$snp"_"$study"_adam.snplist
#Identify overlapping SNPs
LC_ALL=C join "$snp"_elizabeth/"$snp"_"$study"_elizabeth.snplist "$snp"_elizabeth/"$snp"_"$study"_adam.snplist > "$snp"_elizabeth/"$snp"_"$study"_overlap.snplist
#Filter each dataset to overlapping SNPs
LC_ALL=C join <(LC_ALL=C sort -k1b,1 /home/maihofer/localanc/haplotypes/"$study"_"$chr".haps) "$snp"_elizabeth/"$snp"_"$study"_overlap.snplist | awk '{OFS=" "} {$1=$1; print}' > /home/maihofer/haps/"$snp"_elizabeth/"$study"_"$chr".elizabeth_haps_overlap
LC_ALL=C join -1 2 <(LC_ALL=C sort -k2b,2 imputed/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".haps_haps) "$snp"_elizabeth/"$snp"_"$study"_overlap.snplist > /home/maihofer/haps/"$snp"_elizabeth/"$study"_"$chr".adam_haps_overlap
#Identify subjects in my data (NOTE: this is one place where things can go very wrong, if somehow subjects were dropped from .fam
#Make a duplicate of .fam file, order it, so that it gives each subject 2 entries. Join this to .haps file
awk '{print $1"_"$2, NR}' /home/maihofer/haps/mydata/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".fam > /home/maihofer/haps/"$snp"_elizabeth/"$study"_adam.subjects
cat /home/maihofer/haps/"$snp"_elizabeth/"$study"_adam.subjects /home/maihofer/haps/"$snp"_elizabeth/"$study"_adam.subjects | sort -n -k2 | awk '{print $1}' | perl /home/maihofer/localanc/transpose.pl | awk '{OFS=" "} {$1=$1; print}' > /home/maihofer/haps/"$snp"_elizabeth/"$study"_adam.subheader
echo "SNP CHR BP A1 A2" | paste -d " " - /home/maihofer/haps/"$snp"_elizabeth/"$study"_adam.subheader > /home/maihofer/haps/"$snp"_elizabeth/"$study"_adam.subheader2
cat /home/maihofer/haps/"$snp"_elizabeth/"$study"_adam.subheader2 /home/maihofer/haps/"$snp"_elizabeth/"$study"_"$chr".adam_haps_overlap > /home/maihofer/haps/"$snp"_elizabeth/"$study"_"$chr".adam_haps_overlap_wheader
#Do the same in elizabeths
tail -n+3 /home/maihofer/localanc/haplotypes/"$study"_"$chr".sample | awk '{print $1"_"$2,NR}' > /home/maihofer/haps/"$snp"_elizabeth/"$study"_elizabeth.subjects
#Add in exception handling for MRS to swap the IDs back
if [ $study == "mrsc" ]
then
awk '{print $1,$2}' /home/maihofer/haps/mydata/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".fam | LC_ALL=C sort -k1b,1 > temp/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".famcols
tail -n+3 /home/maihofer/localanc/haplotypes/"$study"_"$chr".sample | awk '{print $1,NR}' | LC_ALL=C sort -k1b,1 > "$study"_"$chr".sample_s1
LC_ALL=C join -e X -o auto -a2 temp/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".famcols "$study"_"$chr".sample_s1 | sort -n -k 3 | awk '{print $1"_"$2,NR}' > /home/maihofer/haps/"$snp"_elizabeth/"$study"_elizabeth.subjects #Must have matching N for join!!!
fi
cat /home/maihofer/haps/"$snp"_elizabeth/"$study"_elizabeth.subjects /home/maihofer/haps/"$snp"_elizabeth/"$study"_elizabeth.subjects | sort -n -k2 | awk '{print $1}' | perl /home/maihofer/localanc/transpose.pl | awk '{OFS=" "} {$1=$1; print}' > /home/maihofer/haps/"$snp"_elizabeth/"$study"_elizabeth.subheader
echo "SNP SNPA BP A1 A2" | paste -d " " - /home/maihofer/haps/"$snp"_elizabeth/"$study"_elizabeth.subheader > /home/maihofer/haps/"$snp"_elizabeth/"$study"_elizabeth.subheader2
cat /home/maihofer/haps/"$snp"_elizabeth/"$study"_elizabeth.subheader2 /home/maihofer/haps/"$snp"_elizabeth/"$study"_"$chr".elizabeth_haps_overlap > /home/maihofer/haps/"$snp"_elizabeth/"$study"_"$chr".elizabeth_haps_overlap_wheader
#Make a copy of just the target SNP haplotype data, with a header
echo "CHR SNP BP A1 A2" | paste -d " " - /home/maihofer/haps/"$snp"_elizabeth/"$study"_adam.subheader > /home/maihofer/haps/"$snp"_elizabeth/"$study"_adam.subheader_fulldat
cat /home/maihofer/haps/"$snp"_elizabeth/"$study"_adam.subheader_fulldat <(grep $snp imputed/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".haps_haps) > "$snp"_elizabeth/"$study"_"$snp".adam_haps_overlap_wheader
done
#Check back to .fam and .sample files to see if subjects really dont just line up, i have it that the lizabeth ones are differently ordered form mine..
#Give orientation requirement for MY data to match elizabeths
for study in mrsc nss1 nss2 stro PROM # safr mrsc dnhs gsdc fscd nss1 nss2 pts1 stro psy3 meg2 gtpc comc grac wrby cogb
do
echo $study
Rscript haps_comp_v3.r "$snp"_elizabeth/"$study"_"$chr".elizabeth_haps_overlap_wheader "$snp"_elizabeth/"$study"_"$chr".adam_haps_overlap_wheader "$snp"_elizabeth/"$study"_"$chr".hapcolmatch
done
#Copy NSS to be ppds.
#Fix MRS by ?? flipping identities
#Use orientation to flip haplotype data to the correct position
for study in mrsc nss1 nss2 stro PROM # safr mrsc dnhs gsdc fscd nss1 nss2 pts1 stro psy3 meg2 gtpc comc grac wrby cogb
do
echo $study
echo "CHR SNP BP A1 A2" | paste -d " " - /home/maihofer/haps/"$snp"_elizabeth/"$study"_adam.subheader > /home/maihofer/haps/"$snp"_elizabeth/"$study"_adam.subheader_fulldat
cat /home/maihofer/haps/"$snp"_elizabeth/"$study"_adam.subheader_fulldat <(grep $snp imputed/plink.pts_"$study"_mix_am-qc.hg19.ch.fl.chr"$chunk".haps_haps) > "$snp"_elizabeth/"$study"_"$snp".adam_haps_overlap_wheader
Rscript haps_align.r "$snp"_elizabeth/"$study"_"$chr".hapcolmatch "$snp"_elizabeth/"$study"_"$snp".adam_haps_overlap_wheader "$snp"_elizabeth/"$study"_"$snp".adam_haps_overlap_wheader_lizorder
done
cp "$snp"_elizabeth/nss1_"$snp".adam_haps_overlap_wheader_lizorder "$snp"_elizabeth/ppds_"$snp".adam_haps_overlap_wheader_lizorder
cp "$snp"_elizabeth/meg2_"$snp".adam_haps_overlap_wheader_lizorder "$snp"_elizabeth/maf1_"$snp".adam_haps_overlap_wheader_lizorder
cp "$snp"_elizabeth/meg2_"$snp".adam_haps_overlap_wheader_lizorder "$snp"_elizabeth/maf2_"$snp".adam_haps_overlap_wheader_lizorder
cp "$snp"_elizabeth/meg2_"$snp".adam_haps_overlap_wheader_lizorder "$snp"_elizabeth/maf3_"$snp".adam_haps_overlap_wheader_lizorder
#Get association statistics
#make copies of meg2, nss1, to make extra datasets
##AAM tophit
region=chr13_054_057
chr=13
snpr=rs115539978
ancgroup=aam
gender=mf
region=chr9_138_141
chr=9
snp=rs9410042
ancgroup=lat
gender=mf
#note: gender doesnt do anything here, only does for the actual combined haps analysis
#note: need global pcs
#Half of MRS subjects are gone for some fucking reason
for study in ppds # mrsc nss1 nss2 stro PROM # maf1 maf2 maf3 safr mrsc dnhs gsdc cogb fscd nss1 ppds nss2 pts1 stro psy3 gtpc comc grac wrby # note: mrsc - may require columns to be flipped around. meg2 not analyzed because maf1 maf2 maf3 are analyzed instead
do
echo $study
Rscript ../localanc/1_association_test_haplotype.r ../localanc/mspchr"$chr"/"$snp"_"$study".msp.an2 "$snp"_elizabeth/"$study"_"$snp".adam_haps_overlap_wheader_lizorder "$snp" ../localanc/mds/all_"$ancgroup"/pts_"$study"_mix_am-qc-"$ancgroup"_pca.menv.mds_cov ../localanc/genotypes/pts_"$study"_mix_am-qc.fam $gender "$study" temp/"$snp"_"$study"_"$gender"_dose.txt
done
cat temp/"$snp"_*_"$gender"_dose.txt | awk '{if (NR == 1 || $1 != "\"FID_Hap\"") print }' > temp/"$snp"_"$gender"_dose_alldata.txt
Rscript ../localanc/1_association_combinedhaps.r temp/"$snp"_"$gender"_dose_alldata.txt $snp $gender allstudies.predpc tables_"$snp"_"$gender"_dose_alldata.txt
#Sanity check: in SAFR, the allele is mono in europeans, and 14% in africans
#Sanity check: The fact that the NEITHER error is rarely happening means that my data is aligned. Possibly output those subject names to a file, AS WELL AS the ones from my
#Sanity check : include max correspondence rate in the output file
#Sanity check: The flip rate is about 50%, what I would expect for problem like this
#Compare each file in R by subject,
#Maybe do this on SNP join, then transpose and subject join? First just put the fucking subject names in the file instead of screwing yourself over joining on IDs.
##Do basically the same thing, but for all SNPs
#Need to align my phasing with elizabeths (jesus FUCKING christ)
Her .haps files have the actual study subjects first, then the reference
#must make sure samples align too..
Easiest correspondence check? Only easy if ref and alt alleles stay the same. Retain from each data where markers match (cols 4 and 5).
#Take overlapping marker set in the right order. Filter to correc... this need sto be done on a per subject level this is a fucking nightmare
#Get overlapping marker set and overlapping subjects. Check order of my .fams vs her .sample files to make sure the match
#backup:impute a single da taset
datname=$dataset
mapfile=genetic_map_chr6_combined_b37.txt
reffile=sc_my.1000GP_Phase3_"$chunk"_1000.impute.phased.bgl
start=54000000
stop=57000000
qsub -lwalltime=00:50:00 impute.pbs -e errandout/ -o errandout -d $wd -F "-i $imp2 -r $refdir -m $mapfile -h $reffile -d $datname -k $datdir -s $start -p $stop -b $buffer -o $outdir"
#
以上是关于sh 推测单倍型用于局部血统分析的主要内容,如果未能解决你的问题,请参考以下文章