# Usage: bash compute_PSAF.sh LOCUS REF_CHR REF_START REF_END
# Examples:
#   bash compute_PSAF.sh DEFA chr8 6976649 6995766
#   bash compute_PSAF.sh SPDYE2 chr7 102481214 102580389

# Abort on the first failing command, on use of an unset variable, and when
# any stage of a pipeline fails.
set -euo pipefail

# Positional arguments. ${N:?msg} aborts with a usage message when an argument
# is missing or empty, instead of the terse "unbound variable" error that
# `set -u` would otherwise produce.
USAGE='usage: bash compute_PSAF.sh LOCUS REF_CHR REF_START REF_END'
LOCUS=${1:?$USAGE}      # locus name: selects the results_$LOCUS inputs and prefixes the outputs
REF_CHR=${2:?$USAGE}    # chromosome name written to ref.bed and used in PSV column labels
REF_START=${3:?$USAGE}  # start coordinate written to ref.bed; also added to PSV positions
REF_END=${4:?$USAGE}    # end coordinate written to ref.bed

### extract reference sequence (needed to handle hom-REF positions suppressed from htsbox output)
# Describe the locus as a single three-column, tab-separated BED record, then
# pull that interval out of the GRCh38 FASTA into ref.fasta.
# Expansions are quoted so each argument lands in exactly one BED column.
printf '%s\t%s\t%s\n' "$REF_CHR" "$REF_START" "$REF_END" > ref.bed
bedtools getfasta \
    -fi /mnt/project/lohdata/resources/GRCh38/GRCh38_full_analysis_set_plus_decoy_hla.fa \
    -bed ref.bed \
    > ref.fasta

### tabulate fraction of samples in batch10 heterozygous for each PSV
# Writes $LOCUS.PSV_summary.txt: a tab-separated table with a header row
# (POS REF SEQ HET_RATE) followed by one row per (position, sequence) pair that
# was called heterozygous in at least one batch10 sample, sorted by position.
rm -rf out/
tar -zxf "/mnt/project/lohdata/ploh/WES_CNVs/PSVs_realign/results_$LOCUS/htsbox_batch10.tar.gz"
{
  # The header is emitted outside the sort so its placement does not depend on
  # how sort -n happens to rank a non-numeric first field.
  printf 'POS\tREF\tSEQ\tHET_RATE\n'

  # The awk program identifies its operands through FILENAME/ARGV/ARGC rather
  # than the gawk-only ARGIND, so it behaves the same under any POSIX awk.
  awk -v OFS=$'\t' '
    # First operand (ref.fasta): line 2 carries the extracted sequence.
    FILENAME==ARGV[1] {
      if (FNR==2) ref = $1;
      next;
    }
    # Remaining operands: one htsbox file per sample. Columns are read as
    # position, sequence, reads carrying that sequence, total reads.
    {
      pos = $1;
      seq = $2;
      readsSeq = $3;
      readsTot = $4;
      # Count the sample as heterozygous when at least 5 reads carry this
      # sequence and at least 5 reads do not.
      if (readsSeq>=5 && readsTot-readsSeq>=5)
        hets[pos,seq]++;
    }
    END {
      # ARGV holds the program name, ref.fasta and then the sample files, so
      # the number of sample files is ARGC-2.
      nSamples = ARGC-2;
      for (pos_seq in hets) {
        split(pos_seq,arr,SUBSEP);
        pos = arr[1];
        seq = arr[2];
        # pos is used as a 1-based index into the extracted reference sequence.
        print pos,substr(ref,pos,1),seq,hets[pos_seq]/nSamples;
      }
    }
  ' ref.fasta out/*.htsbox.txt \
    | sort -k1,1n -k2,2
} > "$LOCUS.PSV_summary.txt"



# Only PSVs whose HET_RATE in the summary table is strictly greater than this
# threshold get a column in the output.
MIN_HET_RATE=0.02
OUT_FILE=$LOCUS.minHetRate_gt_$MIN_HET_RATE.PSAF.txt

### generate header (list of common PSVs)
# Start the output file afresh (plain > truncates any previous run) with one
# header row: "ID" followed by a chr_pos_ref_seq label per retained PSV.
awk -v minHetRate="$MIN_HET_RATE" -v chr="$REF_CHR" -v offset="$REF_START" '
BEGIN { printf "ID"; }
# FNR!=1 skips the header row of the summary table.
FNR!=1 && $4>minHetRate {
  pos = $1 + offset; # shift coordinates to match GRCh38
  ref = $2;
  seq = $3;
  printf "\t%s_%d_%s_%s",chr,pos,ref,seq;
}
END { printf "\n"; }' "$LOCUS.PSV_summary.txt" \
    > "$OUT_FILE"

### compute PSAF for each sample for each common PSV
# For every batch archive, unpack it into out/ and append one row per sample
# file to $OUT_FILE: the sample ID followed by one %.3f value per common PSV,
# in the same order as the columns of the header row.
for BATCH in {10..60}; do
  echo "$BATCH"
  rm -rf out/
  tar -zxf "/mnt/project/lohdata/ploh/WES_CNVs/PSVs_realign/results_$LOCUS/htsbox_batch$BATCH.tar.gz"

  # Iterate with a glob rather than parsing ls output, so unusual file names
  # cannot be split into several words.
  for FILE in out/*.htsbox.txt; do
    # With no matches the pattern stays literal; skip it instead of emitting a
    # bogus row.
    [[ -e "$FILE" ]] || continue

    # Sample ID = first 7 characters of the file name.
    ID=$(basename "$FILE")
    ID=${ID:0:7}

    {
      printf '%s' "$ID"

      # Operands are told apart via FILENAME/ARGV instead of the gawk-only
      # ARGIND, so the program works under any POSIX awk and still handles an
      # empty sample file correctly.
      awk -v minHetRate="$MIN_HET_RATE" '
        # First operand: this sample, one line per (position, sequence) with
        # the reads carrying that sequence and the total reads.
        FILENAME==ARGV[1] {
          pos = $1;
          seq = $2;
          readsSeq = $3;
          readsTot = $4;
          psaf[pos,seq] = readsSeq / readsTot;
          depth[pos] = readsTot;
          next;
        }
        # Second operand: the PSV summary. Skip its header row and keep only
        # PSVs above the het-rate threshold.
        FNR!=1 && $4>minHetRate {
          pos = $1;
          ref = $2;
          seq = $3;
          if (ref==seq && depth[pos]=="")
            psaf[pos,seq] = 1; # pileup data was not saved because almost all reads were REF
          # A sequence never seen in this sample is unset and prints as 0.000.
          printf "\t%.3f",psaf[pos,seq];
        }
        END { printf "\n"; }
      ' "$FILE" "$LOCUS.PSV_summary.txt"
    } >> "$OUT_FILE"
  done
done

# Compress the finished table. -f forces compression even if a .gz from an
# earlier run already exists. The name is quoted and preceded by -- so it is
# always taken as a single file operand.
gzip -f -- "$OUT_FILE"
