### run interactively on mem2_ssd1_v2_x4

# [old] set on input: $CHR, $OUT_FILE, $CORES

# Strict mode: abort on the first failing command, on use of an unset
# variable, and when any stage of a pipeline fails.
set -euo pipefail

# Run parameters (hard-coded here rather than supplied by the caller).
CHR=8                              # chromosome number; used below as "chr$CHR"
CORES=4                            # passed through to normPhaseRegionsPSVs
OUT_PREFIX=DEFA_PSVCN_WGS_phased   # prefix of every output file written below

# Local scratch directory that receives a copy of the phasing executable.
# Quoted because it is derived from $HOME, which may contain whitespace.
TMP_DIR=$HOME/tmp
mkdir -p "$TMP_DIR"

### phase region; output unphased and phased data

# Copy the phasing executable from project storage into the local scratch
# directory and mark the copy executable before running it.
# NOTE(review): presumably needed because the /mnt/project mount does not
# allow direct execution -- confirm.
cp -f /mnt/project/lohdata/ploh/WES_CNVs/region_phasing/normPhaseRegionsPSVs "$TMP_DIR"
chmod +x "$TMP_DIR/normPhaseRegionsPSVs"

INPUTS_DIR=/mnt/project/lohdata/ploh/WES_CNVs/region_phasing/inputs
RESOURCES_DIR=/mnt/project/lohdata/resources

PBWT_BAND_WIDTH=100 # increase because only 200K/487K samples have WGS

# Run the phasing program under GNU time (-v) to get a verbose resource-usage
# report. Arguments are positional and their order must be preserved.
# Every expansion is quoted so that no path is word-split or globbed.
# NOTE(review): the .sample file is always the chromosome-1 one (c1), even
# though CHR may differ; presumably the sample list is identical across
# chromosomes -- confirm.
/usr/bin/time -v "$TMP_DIR/normPhaseRegionsPSVs" \
    "chr$CHR" \
    /mnt/project/lohdata/ploh/WES_CNVs/PSVs_realign/results_DEFA_compiled/DEFA.minHetRate_gt_0.02.PSVCN.txt.gz \
    "$RESOURCES_DIR/bim_with_cM/chr$CHR.bim" \
    "$RESOURCES_DIR/mosaic500K_phasing/chr$CHR.snps.txt.gz" \
    "$INPUTS_DIR/chr$CHR.UKBsites.hg38.ucsc_bed" \
    "/mnt/project/Bulk/Imputation/UKB imputation from genotype/ukb22828_c1_b0_v3.sample" \
    "$RESOURCES_DIR/mosaic500K_phasing/mosaic500K_phasing.chr$CHR.hap64.bin" \
    "$INPUTS_DIR/trios_indep_EUR.40709.txt" \
    "$PBWT_BAND_WIDTH" \
    "$CORES" \
    "$OUT_PREFIX"


SLICES=5

#######################################
# Split PREFIX.hapCN.txt into per-slice files PREFIX.dipCN_t.sliceK.txt
# (K = 1..SLICES), summing each adjacent pair of value columns.
#
# Input layout as consumed by the awk program:
#   line 1   : header; every even-numbered field (2, 4, ...) is truncated to
#              its first 7 characters and used as a column label
#   line 2.. : field 1 is an underscore-separated ID whose second component is
#              read as a numeric position; fields (2,3), (4,5), ... are summed
#              pairwise and printed with 3 decimals
# NOTE(review): the column pairs are presumably the two haplotypes of one
# sample, and the 7-character label the sample ID -- confirm.
#
# A row goes to slice int((pos - start) / length * SLICES) + 1.
#
# Arguments:
#   $1 - output/input file prefix
#   $2 - number of slices
#   $3 - window start position (default 6976000, as originally hard-coded)
#   $4 - window length          (default 20000,   as originally hard-coded)
# Outputs:
#   Writes the slice files; prints a warning on stderr if any row maps to a
#   slice index outside 1..SLICES (such rows are still written, to a file
#   without a header, exactly as before, but later steps never read them).
# Returns:
#   awk exit status, or non-zero if the input file cannot be opened.
#######################################
split_hapcn_into_slices() {
    local prefix=$1
    local slices=$2
    local region_start=${3:-6976000}
    local region_len=${4:-20000}

    awk -v slices="$slices" -v prefix="$prefix" \
        -v region_start="$region_start" -v region_len="$region_len" '
NR==1 {
  # Every slice file gets the same header row.
  for (slice=1; slice<=slices; slice++) {
    out_file = prefix".dipCN_t.slice"slice".txt";
    printf "ID" > out_file;
    for (i=2; i<=NF; i+=2)
      printf("\t%s", substr($i,1,7)) > out_file;
    printf "\n" > out_file
  }
}
NR>1 {
  split($1,arr,"_"); pos = arr[2];
  slice = int((pos-region_start)/region_len*slices)+1;
  if (slice < 1 || slice > slices)
    n_outside++;
  out_file = prefix".dipCN_t.slice"slice".txt";
  printf "%s",$1 > out_file;
  for (i=2; i<=NF; i+=2)
    printf("\t%.3f", $i+$(i+1)) > out_file;
  printf "\n" > out_file;
}
END {
  if (n_outside > 0)
    printf("warning: %d row(s) mapped outside slices 1..%d\n", n_outside, slices) > "/dev/stderr";
}' < "$prefix.hapCN.txt"
}

split_hapcn_into_slices "$OUT_PREFIX" "$SLICES"

# Transpose every per-slice file: read PREFIX.dipCN_t.sliceK.txt and write
# PREFIX.dipCN.sliceK.txt. The slice number and a timestamp are printed before
# each transpose as a progress log.
# `transpose` is an external command that must be available on PATH; it is
# used as a stdin-to-stdout filter.
for SLICE in $(seq "$SLICES")
do
    echo "$SLICE"
    date
    transpose < "$OUT_PREFIX.dipCN_t.slice$SLICE.txt" > "$OUT_PREFIX.dipCN.slice$SLICE.txt"
done

# Compress the transposed slice files in parallel (one background gzip per
# slice), remembering each job PID so completion can be verified later.
gzip_pids=()
for SLICE in $(seq "$SLICES")
do
    { gzip "$OUT_PREFIX.dipCN.slice$SLICE.txt" && echo "done$SLICE"; } &
    gzip_pids+=("$!")
done

# Compress the haplotype-level matrix in the foreground meanwhile.
gzip "$OUT_PREFIX.hapCN.txt"

# Barrier: every background gzip must have finished successfully before the
# copy below. Without it, cp can pick up partially written .gz files, and
# `set -e` does not notice failures of background jobs.
gzip_failed=0
for pid in "${gzip_pids[@]}"
do
    wait "$pid" || gzip_failed=1
done
if [ "$gzip_failed" -ne 0 ]; then
    echo "error: at least one background gzip job failed; results not copied" >&2
    exit 1
fi

# Copy the compressed results and the stats file back to project storage.
cp "$OUT_PREFIX.hapCN.txt.gz" "$OUT_PREFIX.stats.txt" "$OUT_PREFIX".dipCN.slice*.txt.gz \
    /mnt/project/lohdata/ploh/WES_CNVs/PSVs_realign/results_DEFA_compiled/
