#!/usr/bin/env bash
#
# Extract reads from an alignment file, realign them against the reference
# at $TMP_DIR/ref.fasta and tabulate per-position base support.
#
# Usage: <script> FILE REF_LENGTH TMP_DIR [REGION...]
#   FILE        input file handed to `samtools view`
#   REF_LENGTH  reference length, passed to the awk tabulation step as refLen
#   TMP_DIR     working directory for intermediate and output files
#   REGION...   optional regions forwarded to `samtools view`
set -euo pipefail

# Fail with a readable usage message instead of a bare "unbound variable"
# error; the exit status stays 1, as it was under `set -u`.
if (( $# < 3 )); then
  printf 'Usage: %s FILE REF_LENGTH TMP_DIR [REGION...]\n' "${0##*/}" >&2
  exit 1
fi

FILE=$1
REF_LENGTH=$2
TMP_DIR=$3 # NOTE: reference should be at $TMP_DIR/ref.fasta (w/ bwa index)
# NOTE: arg 4+ = regions for samtools view

# Sample ID = first 7 characters of the input file's basename.
# `--` keeps a file name starting with '-' from being parsed as an option.
ID=$(basename -- "$FILE")
ID=${ID:0:7}

### convert bam -> fastq (reads with no bad flags)
# -F 0xF00 excludes reads with any of the SAM flag bits 0x100 (secondary),
# 0x200 (QC fail), 0x400 (duplicate) or 0x800 (supplementary) set.
# "${@:4}" is quoted so every region argument reaches samtools as exactly one
# word and is never glob-expanded; it expands to nothing when no regions are
# given.
# Reads are name-sorted before `samtools fastq` splits them into
# read1 / read2 / singleton files.
samtools view -h \
    -T GRCh38_full_analysis_set_plus_decoy_hla.fa \
    -F 0xF00 \
    "$FILE" \
    "${@:4}" \
    | samtools sort -n - \
    | samtools fastq \
        -1 "$TMP_DIR/$ID.reads1.fastq" \
        -2 "$TMP_DIR/$ID.reads2.fastq" \
        -s "$TMP_DIR/$ID.reads_single.fastq" \
        -

### realign
# Singleton reads and read pairs are aligned separately against
# $TMP_DIR/ref.fasta, coordinate-sorted, and then merged into one BAM.
# All path expansions are quoted so a TMP_DIR containing whitespace or glob
# characters cannot be split into several arguments.
THREADS=1
./bwa mem -v 1 -t "$THREADS" "$TMP_DIR/ref.fasta" \
    "$TMP_DIR/$ID.reads_single.fastq" \
    | samtools sort -o "$TMP_DIR/$ID.unpaired.bam" -
./bwa mem -v 1 -t "$THREADS" "$TMP_DIR/ref.fasta" \
    "$TMP_DIR/$ID.reads1.fastq" "$TMP_DIR/$ID.reads2.fastq" \
    | samtools sort -o "$TMP_DIR/$ID.paired.bam" -
# -f: overwrite an existing merged BAM from a previous run.
samtools merge -f "$TMP_DIR/$ID.bam" \
    "$TMP_DIR/$ID.unpaired.bam" "$TMP_DIR/$ID.paired.bam"

# tabulate read support A,C,G,T

#######################################
# Aggregate per-position, per-base read counts from pileup text.
# Every ':' in the input is replaced by a space; awk then reads field 3 as
# the position, field 4 as the reference base, field 5 as a comma-separated
# base list and field 7 as the parallel comma-separated count list.
# NOTE(review): this field layout is taken from how the awk program uses the
# columns - confirm against the htsbox pileup output format in use.
# Arguments: $1 - reference length (refLen)
# Inputs:    pileup lines on stdin
# Outputs:   unsorted tab-separated lines "pos base count total" on stdout,
#            only for positions where at least 3 counted reads do not
#            carry the reference base
#######################################
tabulate_read_support() {
  sed 's/:/ /g' | awk -v refLen="$1" -v OFS=$'\t' '{
  # Positions past refLen are shifted down by refLen so that they
  # accumulate with the matching position in 1..refLen.
  # NOTE(review): presumably the reference is a doubled (circular)
  # sequence - confirm against how ref.fasta is built.
  pos = $3<=refLen ? $3 : $3-refLen;
  split($5,bases,",");
  split($7,counts,",");
  for (i in bases) {
    c[pos,bases[i]] += counts[i];   # support per (position, base)
    cTot[pos] += counts[i];         # total support at the position
    if (bases[i]==$4)
      cREF[pos] += counts[i];       # support for the reference base
  }
}
END {
  for (pos_seq in c) {
    split(pos_seq,arr,SUBSEP);
    pos = arr[1];
    seq = arr[2];
    # Report every base at a position once non-reference support >= 3.
    if (cREF[pos] <= cTot[pos]-3)
      print pos,seq,c[pos_seq],cTot[pos];
  }
}' \
    -
}

# Create the output directory up front so the final redirect cannot fail
# after all the alignment work has already been done.
mkdir -p -- "$TMP_DIR/out"
./htsbox pileup -f "$TMP_DIR/ref.fasta" -Q20 -l50 "$TMP_DIR/$ID.bam" \
    | tabulate_read_support "$REF_LENGTH" \
    | sort -k1,1n -k2,2 > "$TMP_DIR/out/$ID.htsbox.txt"

# clean up
# Remove the intermediate BAM/FASTQ files; only out/$ID.htsbox.txt is kept.
# The quoted prefix survives whitespace in TMP_DIR, and `--` stops option
# parsing in case the path starts with '-'.
rm -- "$TMP_DIR/$ID".{bam,unpaired.bam,paired.bam,reads1.fastq,reads2.fastq,reads_single.fastq}
