# assumes files from step 9 are located here: HICNV_OUTPUT_DIR/output/HI-CNV_calls/CNVs.chr*.ibd*.txt.gz
# assumes TMRCA/IBD parameters: 0,5,10,25,50,100 and chr1..22


# --- User configuration -------------------------------------------------------
# HICNV_OUTPUT_DIR : directory holding the step-9 results; the call files are
#                    read from $HICNV_OUTPUT_DIR/output/HI-CNV_calls/.
# DIR_CALLS        : path *prefix* of the per-chromosome call files; the
#                    "chr<CHR>.ibd<IBD>.txt.gz" part is appended when reading.
# OUTPUT_DIR       : directory that receives all intermediate and final files.
HICNV_OUTPUT_DIR=FILL_IN_WITH_STEP9_DIRECTORY
DIR_CALLS="$HICNV_OUTPUT_DIR/output/HI-CNV_calls/CNVs."
OUTPUT_DIR=FILL_IN_WITH_OUTPUT_DIRECTORY

# One sub-directory per pipeline stage. -p also creates OUTPUT_DIR itself when
# it does not exist yet, and keeps re-runs from failing on existing directories.
mkdir -p -- \
  "$OUTPUT_DIR/rawfiles/" \
  "$OUTPUT_DIR/mergedWithinTMRCA/" \
  "$OUTPUT_DIR/deduped/" \
  "$OUTPUT_DIR/flattened/"


# Quality control: 
# Step 1:
# subset to calls > 50 basepairs for deletions; for duplications calls > 500 bp + log10BF > 9


# Stage 1 (quality control). Keeps the header line plus
#   - DEL rows with column 7 > 0.05, and
#   - DUP rows with column 7 > 0.5 and column 11 > 9.
# NOTE(review): 0.05 / 0.5 match the 50 bp / 500 bp thresholds only if column 7
# is a length in kb, and column 11 is presumably log10BF -- confirm against the
# HI-CNV output format.
#
# filter_raw_calls CHR IBD
#   Reads   ${DIR_CALLS}chr<CHR>.ibd<IBD>.txt.gz
#   Writes  $OUTPUT_DIR/rawfiles/chr<CHR>.ibd<IBD>.txt.gz
#   Returns 1 after a warning on stderr when the input file is missing, so that
#           no empty output file is silently produced for it.
filter_raw_calls() {
  local chr=$1
  local ibd=$2
  # ${DIR_CALLS} (not {$DIR_CALLS}): the latter leaves literal braces in the path.
  local src="${DIR_CALLS}chr${chr}.ibd${ibd}.txt.gz"
  local dst="$OUTPUT_DIR/rawfiles/chr${chr}.ibd${ibd}.txt.gz"

  if [[ ! -f "$src" ]]; then
    printf 'warning: input file not found, skipped: %s\n' "$src" >&2
    return 1
  fi

  zcat "$src" \
    | awk 'NR==1 || ($3 == "DEL" && $7 > 0.05) || ($3 == "DUP" && $7 > 0.5 && $11 > 9)' \
    | gzip > "$dst"
}

for CHR in {1..22}; do
  for IBD in 0 5 10 25 50 100; do
    filter_raw_calls "$CHR" "$IBD"
  done
done

# run merging_withinTMRCA.R

# inputs: 
# -- CHR: chromosome of interest
# -- TMRCA: TMRCA of interest
# -- DIR_UKBB_calls: where HMM file outputs are stored
# -- DIR_Save_Merged_calls: where to save output (see below)
# output: 
# -- saves a merged file (merged within individual-haplotype pair) to DIR_Save_Merged_calls for given CHR-TMRCA pair
# -- columns in output are:
# ID_1	ID_2	HAP	CHR	CNV_TYPE	LOD	BP_START	BP_END	PROBE_START	PROBE_END	LOD_HAP_NBRS_ONLY

# notes: 
# could be simplified by only looking at 1 haplotype for TMRCA 0
# can be used to create a call set for a given TMRCA (e.g. if interest is in HI-CNV_0)
# LOD and LOD_HAP_NBRS_ONLY for bridged events is the sum of the events' LOD/LOD_HAP_NBRS_ONLY


# Stage 2: run merging_withinTMRCA.R once per chromosome/TMRCA pair.
# Arguments passed to the R script: CHR, TMRCA, input directory (filtered raw
# calls), output directory (merged calls).
# A loop body consisting only of comments is a bash syntax error, so the sample
# command is enabled here; replace it with e.g. a job-scheduler submission if
# the 132 runs should not execute serially.
# NOTE(review): assumes merging_withinTMRCA.R is in the current working
# directory -- adjust the path if it lives elsewhere.
for CHR in {1..22}; do
  for IBD in 0 5 10 25 50 100; do
    Rscript merging_withinTMRCA.R "$CHR" "$IBD" "$OUTPUT_DIR/rawfiles/" "$OUTPUT_DIR/mergedWithinTMRCA/"
  done
done

# run deduping_acrossTMRCA.R: de-duplicates calls of same type across TMRCA+haplotype
# Two CNV calls of the same type (DUP or DEL) are considered duplicates if their
# endpoints match within 4 SNP-array probes (i.e., ∆start <= 4 and ∆end <= 4).
# For each such duplicate pair, the call with the higher log10BF is retained.

# inputs: 
# -- CHR: chromosome of interest
# -- delta: two CNV calls of the same type (DUP or DEL) are considered duplicates if their endpoints match within delta SNP-array probes (i.e., ∆start ≤ delta and ∆end ≤ delta; in our manuscript this value was 4)
# -- Dir_merged_calls: where merged files are stored
# -- Dir_save_deduped_calls: where to save output (see below)
# output: 
# -- saves a de-duplicated file to Dir_save_deduped_calls
# -- columns in output are:
# ID_1	ID_2	CHR	CNV_TYPE	LOD	BP_START	BP_END	PROBE_START	PROBE_END	LOD_HAP_NBRS_ONLY	gt

# Stage 3: run deduping_acrossTMRCA.R once per chromosome (delta = 4).
# Arguments passed to the R script: CHR, delta, input directory (merged calls),
# output directory (de-duplicated calls).
# A loop body consisting only of comments is a bash syntax error, so the sample
# command is enabled here; replace it with e.g. a job-scheduler submission if
# preferred.
# NOTE(review): assumes deduping_acrossTMRCA.R is in the current working
# directory -- adjust the path if it lives elsewhere.
for CHR in {1..22}; do
  Rscript deduping_acrossTMRCA.R "$CHR" 4 "$OUTPUT_DIR/mergedWithinTMRCA/" "$OUTPUT_DIR/deduped/"
done


# run Flatten.R: the deduped callset can still contain overlapping CNV calls; this script creates a “unioned” callset in which we merged overlapping CNV calls of the same type
# (DUP or DEL). Can either return a union of overlapping events, or the event with the highest log10 BF

# inputs: 
# -- CHR: chromosome of interest
# -- delta: two CNV calls of the same type (DUP or DEL) are considered duplicates if their endpoints match within delta SNP-array probes (i.e., ∆start ≤ delta and ∆end ≤ delta; in our manuscript this value was 4)
# -- UnionIndicator: whether user wants a union of overlapping events (1), or the event with the highest log10 BF (0)
# -- Dir_deduped_calls: where deduped files are stored
# -- Dir_save_flattened_calls: where to save output (see below)
# output: 
# -- saves a flattened file to Dir_save_flattened_calls
# -- columns in output are:
# unioned:
# ID_1 ID_2 CHR CNV_TYPE BP_START BP_END PROBE_START PROBE_END
# highest log10 BF:
# ID_1 ID_2 CHR CNV_TYPE LOD BP_START BP_END PROBE_START PROBE_END LOD_HAP_NBRS_ONLY gt

# Stage 4: run Flatten.R once per chromosome (delta = 4, UnionIndicator = 1,
# i.e. return the union of overlapping events).
# Arguments passed to the R script: CHR, delta, UnionIndicator, input directory
# (de-duplicated calls), output directory (flattened calls).
# A loop body consisting only of comments is a bash syntax error, so the sample
# command is enabled here; replace it with e.g. a job-scheduler submission if
# preferred.
# NOTE(review): assumes Flatten.R is in the current working directory -- adjust
# the path if it lives elsewhere.
for CHR in {1..22}; do
  Rscript Flatten.R "$CHR" 4 1 "$OUTPUT_DIR/deduped/" "$OUTPUT_DIR/flattened/"
done

# Lastly, we applied a final set of length filters on the CNV calls, requiring deletions to be >75 bp and duplications to be >500 bp
# NOTE(review): the awk filters below apply the single >75 bp cut-off
# (BP_END - BP_START + 1) to every row regardless of CNV_TYPE; duplications
# presumably already satisfy the 500 bp requirement from step 1 -- confirm.
#
# filter_final_length CHR
#   deduped   : reads  $OUTPUT_DIR/deduped/chr<CHR>_delta4_deduped_and_genotyped.txt.gz
#               (BP_START = column 6, BP_END = column 7)
#               writes $OUTPUT_DIR/deduped/chr<CHR>_delta4_deduped_and_genotyped_greater75bp.txt.gz
#   flattened : reads  $OUTPUT_DIR/flattened/chr<CHR>_delta4_unioned.txt
#               (falls back to the same name with ".gz" appended)
#               (BP_START = column 5, BP_END = column 6)
#               writes $OUTPUT_DIR/flattened/chr<CHR>_delta4_unioned_greater75bp.txt.gz
#   The header line (NR==1) is always kept.
#   Returns 1 if either input is missing (a warning is printed on stderr and no
#   output is written for that input); the other input is still processed.
filter_final_length() {
  local chr=$1
  local rc=0
  local dedup_in="$OUTPUT_DIR/deduped/chr${chr}_delta4_deduped_and_genotyped.txt.gz"
  local dedup_out="$OUTPUT_DIR/deduped/chr${chr}_delta4_deduped_and_genotyped_greater75bp.txt.gz"
  local flat_in="$OUTPUT_DIR/flattened/chr${chr}_delta4_unioned.txt"
  local flat_out="$OUTPUT_DIR/flattened/chr${chr}_delta4_unioned_greater75bp.txt.gz"

  if [[ -f "$dedup_in" ]]; then
    zcat "$dedup_in" | awk 'NR==1 || $7-$6 + 1 > 75' | gzip > "$dedup_out"
  else
    printf 'warning: input file not found, skipped: %s\n' "$dedup_in" >&2
    rc=1
  fi

  # The flattened file is named .txt; accept a .txt.gz sibling as well.
  [[ -f "$flat_in" ]] || flat_in="${flat_in}.gz"
  if [[ -f "$flat_in" ]]; then
    # -f with -c makes gzip pass uncompressed text through unchanged, so the
    # file may be either plain text or gzip-compressed.
    gzip -cdf "$flat_in" | awk 'NR==1 || $6-$5 + 1 > 75' | gzip > "$flat_out"
  else
    printf 'warning: input file not found, skipped: %s\n' "${flat_in%.gz}" >&2
    rc=1
  fi

  return "$rc"
}

for CHR in {1..22}; do
  filter_final_length "$CHR"
done
