# sh: UKBB CNV reformatting
# Posted
# tags:
# Preamble (translated from the cha138.com blog scrape): this post collects
# notes on reformatting UK Biobank CNV intensity data; hopefully a useful reference.
##Per-chromosome pipeline: split the huge SNP-by-subject l2r/baf matrices into
##100-line chunks, transpose each chunk (transpose_space.pl), paste the
##transposed chunks back together, then split the result one line (= one
##subject) per file. Also emits the SNP-name column used later when building
##per-subject .pcv files.
##Inputs expected in cwd: ukb_l2r_chr${chr}_v2.txt.gz, ukb_baf_chr${chr}_v2.txt.gz,
##ukb_snp_chr${chr}_v2.bim, transpose_space.pl, transpose.pl.
for chr in 2 6 11 # {7..20} # {2..20} # {1..20} #22 # {1..8} # {1..22} X MT XY Y
do
echo "$chr"
date
# -p: tolerate directories left over from a previous (partial) run
mkdir -p temporary_files/chr"$chr"_baf1
mkdir -p temporary_files/chr"$chr"_baf2
mkdir -p temporary_files/chr"$chr"_baf3
mkdir -p temporary_files/chr"$chr"_baf4
mkdir -p temporary_files/chr"$chr"_l2r1
mkdir -p temporary_files/chr"$chr"_l2r2
mkdir -p temporary_files/chr"$chr"_l2r3
mkdir -p temporary_files/chr"$chr"_l2r4
mkdir -p chr"$chr"_complete
##Split the file by 100 lines at a time
#l2r
echo "splitting l2r"
date
zcat ukb_l2r_chr"$chr"_v2.txt.gz | split -l 100 --suffix-length=6 -d - temporary_files/chr"$chr"_l2r1/chr"$chr"_l2r_
#baf
echo "splitting baf"
date
zcat ukb_baf_chr"$chr"_v2.txt.gz | split -l 100 --suffix-length=6 -d - temporary_files/chr"$chr"_baf1/chr"$chr"_baf_
#Benchmark: with 100 per file, can do about 400 SNPs/minute. For a whole genome thats ~1500 minutes (roughly 24 hours). So just over 2 days to do this for all sets
#With 50 per file, same speed. No speed up here...
#With 25 per file, same speed
#with 5 per file, 73 files per minute = 365 snps/minute. i dont think this isa slow down, just timing is more accurate given file inputes
#with 1 per file, 359 per minute.. so no real timing change here (6 SNPs per second).
##Transpose (I think this is the long step)
#l2r
filecount=1
# iterate with a glob instead of parsing `ls` output (robust to odd filenames)
for path in temporary_files/chr"$chr"_l2r1/*
do
files=${path##*/}
perl transpose_space.pl "$path" > temporary_files/chr"$chr"_l2r2/"$files"
# progress heartbeat every 100 chunks
if [ $((filecount % 100)) -eq 0 ]
then
echo "l2r is at $filecount"
date
fi
filecount=$((filecount+1))
done
#baf
filecount=1
for path in temporary_files/chr"$chr"_baf1/*
do
files=${path##*/}
perl transpose_space.pl "$path" > temporary_files/chr"$chr"_baf2/"$files"
if [ $((filecount % 100)) -eq 0 ]
then
echo "baf is at $filecount"
date
fi
filecount=$((filecount+1))
done
#with 100 lines, 1 transpose took 50 seconds. So again, 2 days to do all of this
#with 50 lines, 1 transpose took 20 seconds. 2 took 40 seconds.. so potential speed gain here. (150/min)
#With 25 lines, 1 transpose took..9 seconds..so the speedup isnt great here.. (150/min)
#with 5 lines, 2.2 seconds... 25 lines might be best.
#With a sed based transpose, 230 files in 1 minute...so this is much faster than the other approaches, provided I can paste themm....another says 85 in 30 seconds. so thats 170/min.. really not much faster?? (170/min)
##Rejoin file: paste the transposed chunks side by side (glob expands in
##lexical order, which matches the zero-padded split suffixes)
#L2r
echo "Rejoining l2r"
date
time paste temporary_files/chr"$chr"_l2r2/* > temporary_files/chr"$chr"_l2r3/ukb_l2r_chr"$chr"_v2_t.txt
#baf
echo "Rejoining baf"
date
paste temporary_files/chr"$chr"_baf2/* > temporary_files/chr"$chr"_baf3/ukb_baf_chr"$chr"_v2_t.txt
#WAs 62 minutes with 50 lines per file.
#Split file by line (I could alternatively paste after this but its too many i/o
#l2r
echo "spliting l2r"
date
split -l 1 --suffix-length=7 -d temporary_files/chr"$chr"_l2r3/ukb_l2r_chr"$chr"_v2_t.txt temporary_files/chr"$chr"_l2r4/ukb_l2r_chr"$chr"_
#baf
echo "spliting baf"
date
split -l 1 --suffix-length=7 -d temporary_files/chr"$chr"_baf3/ukb_baf_chr"$chr"_v2_t.txt temporary_files/chr"$chr"_baf4/ukb_baf_chr"$chr"_
#Based on previous stuff, this will take 2 days
#rm -rf temporary_files/chr"$chr"_l2r1
#rm -rf temporary_files/chr"$chr"_baf1
#rm -rf temporary_files/chr"$chr"_l2r2
#rm -rf temporary_files/chr"$chr"_baf2
#rm -rf temporary_files/chr"$chr"_l2r3
#rm -rf temporary_files/chr"$chr"_baf3
#For each subject:
#perl transpose_space.pl <(
#Concatenate together genotype Names and transpose back..
echo "adjusting bim file"
date
# column 2 of the .bim file is the SNP id; no need for the extra `cat`
perl transpose.pl <(awk 'BEGIN{OFS="\t"}{print $2}' ukb_snp_chr"$chr"_v2.bim) > chr"$chr"_pcv_snps.txt
awk 'BEGIN{OFS="\t"}{print $2}' ukb_snp_chr"$chr"_v2.bim > chr"$chr"_pcv_snps_ALT.txt
done
#need this for chr 2, 6?
#Get priority list of IDs: join the phenotyped subject IDs against the .fam
#order, attaching each subject's zero-padded row index (this index is the
#suffix of the per-subject split files), then carve into 5000-subject chunks.
#seq -w 0 488377   # stray leftover — dumped ~488k lines to stdout; the same seq is generated inline below
LC_ALL=C join <(LC_ALL=C sort -k1b,1 UKB_ptsd_eur_related_m3_may3_2019.pheno | awk '{print $1}' ) <( paste <(awk '{print $1}' /mnt/ukbb/adam/ptsd/preimputation_genotypes/ukb41209_cal_chr1_v2_s488292.fam) <(seq -w 0 488377 ) | LC_ALL=C sort -k1b,1) | sort -k2 > priority_subjects.txt
split --numeric-suffixes=1 -l 5000 priority_subjects.txt priority
##For each 5000-subject priority chunk and each chromosome, build one
##per-subject .pcv file: SNP id, l2r value, baf value (tab-separated, one SNP
##per row). The per-subject split files are single tab-separated rows, so each
##is turned back into a column with sed before pasting.
for n in $(seq -w 1 1 30 | head -n6 | tail -n 3 ) # seq -w 23 1 30
do
for chr in 1 {3..5} {7..15} # 4 5 {7..22} # 1 is half done #3 is 3/4 done #4 is 2/3 done #5 is 2/3... do 2000 subjects at a time
do
echo "Working on $chr"
date
now=$(date)
echo "It is $now. Processing chr $chr : chunk $n"
#mkdir temporary_files/chr"$chr"_l2r4_"$n"
# mkdir temporary_files/chr"$chr"_baf4_"$n"
#move the stuff from the chunks into what should be a smaller, easier to read folder...
# column 2 of priority$n is the zero-padded subject index (488377 subjects total);
# read line-by-line instead of word-splitting a command substitution
awk '{print $2}' priority"$n" | while read -r subject
do
# mv temporary_files/chr"$chr"_l2r4/ukb_l2r_chr"$chr"_0$subject temporary_files/chr"$chr"_l2r4_"$n"/.
#mv temporary_files/chr"$chr"_baf4/ukb_baf_chr"$chr"_0$subject temporary_files/chr"$chr"_baf4_"$n"/.
# skip subjects already written, so interrupted runs are resumable
if [ ! -f chr"$chr"_complete/"$chr"_"$subject".pcv ]
then
# leading 0: split used --suffix-length=7 while the index is 6 digits wide
paste chr"$chr"_pcv_snps_ALT.txt <(sed 's/\t/\n/g' temporary_files/chr"$chr"_l2r4/ukb_l2r_chr"$chr"_0"$subject") <(sed 's/\t/\n/g' temporary_files/chr"$chr"_baf4/ukb_baf_chr"$chr"_0"$subject") > chr"$chr"_complete/"$chr"_"$subject".pcv
fi
done
#zip up the folder
done
done
#note some parts of chr 10 and 11 have been moved into folders temporary_files/chr"$chr"_l2r4_"$n"/. temporary_files/chr"$chr"_baf4_"$n"/. (possibly also 9) do not retain this systems
#chr 13 and 14 have done 1:10 so far
#5 chr15 has done5-10
#nEXT: Then concatenate all files.. and add a header
#1000 subjects in 1 minute for 12000 SNPs... 2 minutes for 100000 SNPs...so scaling is not linear...1 day to finish..
#so 400 minutes per thing... thats 7 hours for just one chromosome... 20% faster (600 in 30 seconds) if i didnt have to mess with the headers, which shouldnt be in at this point anyway because im going to
#can do 2000 in 1 minute using the non transpose way...
#join with SNP names as well as BAFs processed in the the same way
#file checking info
##Sanity check: count the files produced for each chromosome at each pipeline
##stage (only the final *4 per-subject stages are currently enabled).
##--sort=none skips sorting — these directories hold ~488k files, so sorted ls
##is needlessly slow. (Spelled out in full; "--sort=non" only worked via GNU's
##option-argument abbreviation matching.)
for chr in {1..22} # 21 22
do
# echo "chr$chr baf1";
#ls --sort=none temporary_files/chr"$chr"_baf1 | wc -l
# echo "chr$chr baf2";
#ls --sort=none temporary_files/chr"$chr"_baf2 | wc -l
# echo "chr$chr baf3";
#ls --sort=none temporary_files/chr"$chr"_baf3 | wc -l
echo "chr$chr baf4";
ls --sort=none temporary_files/chr"$chr"_baf4 | wc -l
# echo "chr$chr l2r1";
#ls --sort=none temporary_files/chr"$chr"_l2r1 | wc -l
# echo "chr$chr l2r2";
#ls --sort=none temporary_files/chr"$chr"_l2r2 | wc -l
# echo "chr$chr l2r3";
#ls --sort=none temporary_files/chr"$chr"_l2r3 | wc -l
echo "chr$chr l2r4";
ls --sort=none temporary_files/chr"$chr"_l2r4 | wc -l
done
#do chr 2 enitrely
#chr6: restart chr 6
#chr11 is not done either
# Trailing blog-scrape footer (translated): the above covers "sh ukbb cnv
# reformatting"; if it did not solve your problem, see the related articles:
# FRB database: do different CNV types also differ in 5-year long-term
# outcomes? — occult CNV vs. classic CNV 5-year treatment outcomes