# sh: UKBB CNV reformatting
# Posted
# tags:
# Preamble (translated from the cha138.com blog scrape): this post collects
# notes on reformatting UK Biobank CNV intensity data; hopefully a useful reference.
##Per-chromosome pipeline: split the huge SNP-by-subject l2r/baf matrices into
##100-line chunks, transpose each chunk (transpose_space.pl), paste the
##transposed chunks back together, then split the result one line (= one
##subject) per file. Also emits the SNP-name column used later when building
##per-subject .pcv files.
##Inputs expected in cwd: ukb_l2r_chr${chr}_v2.txt.gz, ukb_baf_chr${chr}_v2.txt.gz,
##ukb_snp_chr${chr}_v2.bim, transpose_space.pl, transpose.pl.
for chr in 2 6 11 # {7..20} # {2..20} # {1..20} #22 # {1..8} # {1..22} X MT XY Y
do
echo "$chr"
date
# -p: tolerate directories left over from a previous (partial) run
mkdir -p temporary_files/chr"$chr"_baf1
mkdir -p temporary_files/chr"$chr"_baf2
mkdir -p temporary_files/chr"$chr"_baf3
mkdir -p temporary_files/chr"$chr"_baf4
mkdir -p temporary_files/chr"$chr"_l2r1
mkdir -p temporary_files/chr"$chr"_l2r2
mkdir -p temporary_files/chr"$chr"_l2r3
mkdir -p temporary_files/chr"$chr"_l2r4
mkdir -p chr"$chr"_complete
##Split the file by 100 lines at a time
#l2r
echo "splitting l2r"
date
zcat ukb_l2r_chr"$chr"_v2.txt.gz | split -l 100 --suffix-length=6 -d - temporary_files/chr"$chr"_l2r1/chr"$chr"_l2r_
#baf
echo "splitting baf"
date
zcat ukb_baf_chr"$chr"_v2.txt.gz | split -l 100 --suffix-length=6 -d - temporary_files/chr"$chr"_baf1/chr"$chr"_baf_
#Benchmark: with 100 per file, can do about 400 SNPs/minute. For a whole genome thats ~1500 minutes (roughly 24 hours). So just over 2 days to do this for all sets
#With 50 per file, same speed. No speed up here...
#With 25 per file, same speed
#with 5 per file, 73 files per minute = 365 snps/minute. i dont think this isa slow down, just timing is more accurate given file inputes
#with 1 per file, 359 per minute.. so no real timing change here (6 SNPs per second).
##Transpose (I think this is the long step)
#l2r
filecount=1
# iterate with a glob instead of parsing `ls` output (robust to odd filenames)
for path in temporary_files/chr"$chr"_l2r1/*
do
files=${path##*/}
perl transpose_space.pl "$path" > temporary_files/chr"$chr"_l2r2/"$files"
# progress heartbeat every 100 chunks
if [ $((filecount % 100)) -eq 0 ]
then
echo "l2r is at $filecount"
date
fi
filecount=$((filecount+1))
done
#baf
filecount=1
for path in temporary_files/chr"$chr"_baf1/*
do
files=${path##*/}
perl transpose_space.pl "$path" > temporary_files/chr"$chr"_baf2/"$files"
if [ $((filecount % 100)) -eq 0 ]
then
echo "baf is at $filecount"
date
fi
filecount=$((filecount+1))
done
#with 100 lines, 1 transpose took 50 seconds. So again, 2 days to do all of this
#with 50 lines, 1 transpose took 20 seconds. 2 took 40 seconds.. so potential speed gain here. (150/min)
#With 25 lines, 1 transpose took..9 seconds..so the speedup isnt great here.. (150/min)
#with 5 lines, 2.2 seconds... 25 lines might be best.
#With a sed based transpose, 230 files in 1 minute...so this is much faster than the other approaches, provided I can paste themm....another says 85 in 30 seconds. so thats 170/min.. really not much faster?? (170/min)
##Rejoin file: paste the transposed chunks side by side (glob expands in
##lexical order, which matches the zero-padded split suffixes)
#L2r
echo "Rejoining l2r"
date
time paste temporary_files/chr"$chr"_l2r2/* > temporary_files/chr"$chr"_l2r3/ukb_l2r_chr"$chr"_v2_t.txt
#baf
echo "Rejoining baf"
date
paste temporary_files/chr"$chr"_baf2/* > temporary_files/chr"$chr"_baf3/ukb_baf_chr"$chr"_v2_t.txt
#WAs 62 minutes with 50 lines per file.
#Split file by line (I could alternatively paste after this but its too many i/o
#l2r
echo "spliting l2r"
date
split -l 1 --suffix-length=7 -d temporary_files/chr"$chr"_l2r3/ukb_l2r_chr"$chr"_v2_t.txt temporary_files/chr"$chr"_l2r4/ukb_l2r_chr"$chr"_
#baf
echo "spliting baf"
date
split -l 1 --suffix-length=7 -d temporary_files/chr"$chr"_baf3/ukb_baf_chr"$chr"_v2_t.txt temporary_files/chr"$chr"_baf4/ukb_baf_chr"$chr"_
#Based on previous stuff, this will take 2 days
#rm -rf temporary_files/chr"$chr"_l2r1
#rm -rf temporary_files/chr"$chr"_baf1
#rm -rf temporary_files/chr"$chr"_l2r2
#rm -rf temporary_files/chr"$chr"_baf2
#rm -rf temporary_files/chr"$chr"_l2r3
#rm -rf temporary_files/chr"$chr"_baf3
#For each subject:
#perl transpose_space.pl <(
#Concatenate together genotype Names and transpose back..
echo "adjusting bim file"
date
# column 2 of the .bim file is the SNP id; no need for the extra `cat`
perl transpose.pl <(awk 'BEGIN{OFS="\t"}{print $2}' ukb_snp_chr"$chr"_v2.bim) > chr"$chr"_pcv_snps.txt
awk 'BEGIN{OFS="\t"}{print $2}' ukb_snp_chr"$chr"_v2.bim > chr"$chr"_pcv_snps_ALT.txt
done
#need this for chr 2, 6?
#Get priority list of IDs: join the phenotyped subject IDs against the .fam
#order, attaching each subject's zero-padded row index (this index is the
#suffix of the per-subject split files), then carve into 5000-subject chunks.
#seq -w 0 488377   # stray leftover — dumped ~488k lines to stdout; the same seq is generated inline below
LC_ALL=C join <(LC_ALL=C sort -k1b,1 UKB_ptsd_eur_related_m3_may3_2019.pheno | awk '{print $1}' ) <( paste <(awk '{print $1}' /mnt/ukbb/adam/ptsd/preimputation_genotypes/ukb41209_cal_chr1_v2_s488292.fam) <(seq -w 0 488377 ) | LC_ALL=C sort -k1b,1) | sort -k2 > priority_subjects.txt
split --numeric-suffixes=1 -l 5000 priority_subjects.txt priority
##For each 5000-subject priority chunk and each chromosome, build one
##per-subject .pcv file: SNP id, l2r value, baf value (tab-separated, one SNP
##per row). The per-subject split files are single tab-separated rows, so each
##is turned back into a column with sed before pasting.
for n in $(seq -w 1 1 30 | head -n6 | tail -n 3 ) # seq -w 23 1 30
do
for chr in 1 {3..5} {7..15} # 4 5 {7..22} # 1 is half done #3 is 3/4 done #4 is 2/3 done #5 is 2/3... do 2000 subjects at a time
do
echo "Working on $chr"
date
now=$(date)
echo "It is $now. Processing chr $chr : chunk $n"
#mkdir temporary_files/chr"$chr"_l2r4_"$n"
# mkdir temporary_files/chr"$chr"_baf4_"$n"
#move the stuff from the chunks into what should be a smaller, easier to read folder...
# column 2 of priority$n is the zero-padded subject index (488377 subjects total);
# read line-by-line instead of word-splitting a command substitution
awk '{print $2}' priority"$n" | while read -r subject
do
# mv temporary_files/chr"$chr"_l2r4/ukb_l2r_chr"$chr"_0$subject temporary_files/chr"$chr"_l2r4_"$n"/.
#mv temporary_files/chr"$chr"_baf4/ukb_baf_chr"$chr"_0$subject temporary_files/chr"$chr"_baf4_"$n"/.
# skip subjects already written, so interrupted runs are resumable
if [ ! -f chr"$chr"_complete/"$chr"_"$subject".pcv ]
then
# leading 0: split used --suffix-length=7 while the index is 6 digits wide
paste chr"$chr"_pcv_snps_ALT.txt <(sed 's/\t/\n/g' temporary_files/chr"$chr"_l2r4/ukb_l2r_chr"$chr"_0"$subject") <(sed 's/\t/\n/g' temporary_files/chr"$chr"_baf4/ukb_baf_chr"$chr"_0"$subject") > chr"$chr"_complete/"$chr"_"$subject".pcv
fi
done
#zip up the folder
done
done
#note some parts of chr 10 and 11 have been moved into folders temporary_files/chr"$chr"_l2r4_"$n"/. temporary_files/chr"$chr"_baf4_"$n"/. (possibly also 9) do not retain this systems
#chr 13 and 14 have done 1:10 so far
#5 chr15 has done5-10
#nEXT: Then concatenate all files.. and add a header
#1000 subjects in 1 minute for 12000 SNPs... 2 minutes for 100000 SNPs...so scaling is not linear...1 day to finish..
#so 400 minutes per thing... thats 7 hours for just one chromosome... 20% faster (600 in 30 seconds) if i didnt have to mess with the headers, which shouldnt be in at this point anyway because im going to
#can do 2000 in 1 minute using the non transpose way...
#join with SNP names as well as BAFs processed in the the same way
#file checking info
##Sanity check: count the files produced for each chromosome at each pipeline
##stage (only the final *4 per-subject stages are currently enabled).
##--sort=none skips sorting — these directories hold ~488k files, so sorted ls
##is needlessly slow. (Spelled out in full; "--sort=non" only worked via GNU's
##option-argument abbreviation matching.)
for chr in {1..22} # 21 22
do
# echo "chr$chr baf1";
#ls --sort=none temporary_files/chr"$chr"_baf1 | wc -l
# echo "chr$chr baf2";
#ls --sort=none temporary_files/chr"$chr"_baf2 | wc -l
# echo "chr$chr baf3";
#ls --sort=none temporary_files/chr"$chr"_baf3 | wc -l
echo "chr$chr baf4";
ls --sort=none temporary_files/chr"$chr"_baf4 | wc -l
# echo "chr$chr l2r1";
#ls --sort=none temporary_files/chr"$chr"_l2r1 | wc -l
# echo "chr$chr l2r2";
#ls --sort=none temporary_files/chr"$chr"_l2r2 | wc -l
# echo "chr$chr l2r3";
#ls --sort=none temporary_files/chr"$chr"_l2r3 | wc -l
echo "chr$chr l2r4";
ls --sort=none temporary_files/chr"$chr"_l2r4 | wc -l
done
#do chr 2 enitrely
#chr6: restart chr 6
#chr11 is not done either
# Trailing blog-scrape footer (translated): the above covers "sh ukbb cnv
# reformatting"; if it did not solve your problem, see the related articles:
# FRB database: do different CNV types also differ in 5-year long-term
# outcomes? — occult CNV vs. classic CNV 5-year treatment outcomes