set -euo pipefail

# Tile each locus window into consecutive 5000-bp BED records
# (tab-separated: chromosome, start, start+5000). A record is emitted for
# every start below the upper bound, so the final record can end past it.
awk 'BEGIN {
  step = 5000
  for (start = 6970000; start < 7030000; start += step)
    printf("chr8\t%d\t%d\n", start, start + step)
}' > DEFA_chunks.bed
awk 'BEGIN {
  step = 5000
  for (start = 102481200; start < 102580400; start += step)
    printf("chr7\t%d\t%d\n", start, start + step)
}' > SPDYE2_chunks.bed

### convert GRCh38 reference fasta to "2bit" format

# (commands below were run from the directory /n/data1/bwh/medicine/loh/data/GRCh38)
# wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/faToTwoBit
# chmod u+x faToTwoBit
# ./faToTwoBit GRCh38_full_analysis_set_plus_decoy_hla.fa GRCh38_full_analysis_set_plus_decoy_hla.2bit



# Reference paths do not change between loci, so set them once. The 2bit
# path is derived from the FASTA path so the two always point at the same
# reference build.
GRCH38_FA=/n/data1/bwh/medicine/loh/data/GRCh38/GRCh38_full_analysis_set_plus_decoy_hla.fa
GRCH38_2BIT="${GRCH38_FA%.fa}.2bit"

# For each locus: pull the chunk sequences listed in <LOCUS>_chunks.bed out of
# the reference FASTA, align them back to the reference with BLAT, then print
# the resulting PSL with a separator line between groups of rows.
for LOCUS in DEFA SPDYE2
do
    ### extract 5kb chunks to provide as input to BLAT
    # module load bedtools/2.27.1
    bedtools getfasta -fi "$GRCH38_FA" -bed "${LOCUS}_chunks.bed" > "${LOCUS}_chunks.fasta"

    ### run BLAT
    # module load gcc/6.2.0; module load blat
    blat -minScore=200 -minIdentity=98 \
	"$GRCH38_2BIT" \
	"${LOCUS}_chunks.fasta" \
	"${LOCUS}_chunks.psl"

    ### add line breaks to improve readability
    # Emit "----" before any row whose column 10 contains "chr" and differs
    # from the previous row's column 10; rows without "chr" in column 10
    # are printed unchanged. Output goes to stdout.
    awk '$10~/chr/ && $10!=prev {print "----"} {print; prev=$10}' "${LOCUS}_chunks.psl"

done



### generate "bonusCN" files indicating segments with more/fewer copies in GRCh38

# DEFA region: chr8:6971800-7020000
# canonical repeat: chr8:6976649-6995766
# blat chr8:7019800-7020000 -> chr8:6981591-6981791 => bonus 0/-2 transition at 6981591
# blat chr8:6971800-6972000 -> chr8:6990960-6991160 => bonus -2/0 transition at 6990960

# Write one "position bonusCN" line (space-separated) for every position from
# 6976650 through 6995766. Positions strictly between 6981591 and 6990960 get
# bonusCN -2; every other position gets 0.
awk 'BEGIN {
  first = 6976650; last = 6995766
  lo = 6981591; hi = 6990960
  pos = first
  while (pos <= last) {
    print pos, ((pos <= lo || pos >= hi) ? 0 : -2)
    pos++
  }
}' > DEFA_bonusCN.txt


# SPDYE2 regions: chr7:102341679-102357966 chr7:102473938-102691171
# reference region (~99kb): chr7:102481214-102580389
# chr7:102341679-102357966 -> chr7:102551313-102567586 (from seg dup track)
#                             => bonus +2 from 102551313-102567586
# chr7:102473938-102480264 -> chr7:102567587-102573920 (from seg dup track)
#                             => bonus +2 starting at 102567587
#                                ... so actually just +2 from 102551313-END
# chr7:102687791-102691171 -> chr7:102492712-102496097 (from seg dup track)
#                             => bonus +2 from START-102496097

# Write one "position bonusCN" line (space-separated) for every position from
# 102481215 through 102580389. Positions up to and including 102496097, and
# positions from 102551313 onward, get bonusCN 2; the stretch in between gets 0.
awk 'BEGIN {
  first = 102481215; last = 102580389
  left_end = 102496097; right_start = 102551313
  pos = first
  while (pos <= last) {
    print pos, ((pos <= left_end || pos >= right_start) ? 2 : 0)
    pos++
  }
}' > SPDYE2_bonusCN.txt
