#!/usr/bin/env Rscript
options(echo = FALSE)
# Positional command-line arguments:
#   1: chromosome number
#   2: probe tolerance used when matching call endpoints (e.g. 4)
#   3: directory prefix holding the merged call files
#   4: directory prefix where the de-duplicated calls are written
# paste() turns each argument into a plain string (a missing one becomes "NA").
param <- commandArgs(trailingOnly = TRUE)
CHR <- as.numeric(paste(param[1]))
delta <- as.numeric(paste(param[2])) # 4
Dir_merged_calls <- paste(param[3])
Dir_save_deduped_calls <- paste(param[4])
# -----------#
library(dplyr, warn.conflicts = FALSE)
library(data.table)
options(dplyr.summarise.inform = FALSE)
# -----------#
print("Reading all TMCRA data")
# One merged-call file is read per TMRCA setting; each table is tagged with the
# setting it came from and the tables are then stacked into a single data frame.
tmrca_values <- c(0, 5, 10, 25, 50, 100)
per_tmrca_calls <- lapply(tmrca_values, function(tmrca) {
  calls_path <- paste0(Dir_merged_calls, "chr", CHR, ".ibd", tmrca, ".txt.gz")
  tbl <- fread(calls_path, header = TRUE, data.table = FALSE)
  tbl$tmrca <- tmrca
  tbl
})
allCalls <- do.call(rbind, per_tmrca_calls)
print("Done reading data")

# Two CNV calls of the same type (DUP or DEL) are considered duplicates if their
# endpoints match within `delta` SNP-array probes (i.e., ∆start ≤ delta and
# ∆end ≤ delta; delta = 4 in the original description).
# For each such duplicate pair, the call with the higher log10 Bayes factor
# (column LOD) is retained.

# Annotate every call with the number of calls sharing its
# (ID_1, ID_2, CHR, CNV_TYPE) group; the result stays grouped by those columns.
dplyr_tbl_iid_Type <- allCalls %>%
  group_by(ID_1, ID_2, CHR, CNV_TYPE) %>%
  mutate(n = n())

# De-duplication is only relevant for groups holding more than one call; calls
# that are alone in their group are passed through untouched later on.
group_has_several_calls <- dplyr_tbl_iid_Type$n > 1
deduping_needed <- dplyr_tbl_iid_Type[group_has_several_calls, ]
deduping_unneeded <- dplyr_tbl_iid_Type[!group_has_several_calls, ]

# Collapse near-duplicate CNV calls of one group and genotype the survivors.
#
# Arguments:
#   df  data frame of calls with columns LOD, BP_START, BP_END, PROBE_START,
#       PROBE_END, LOD_HAP_NBRS_ONLY, HAP and tmrca. It may have any number of
#       rows, including zero or one.
#
# The probe tolerance `delta` is not an argument; it is looked up in the
# enclosing (script-level) environment when the function runs.
#
# Algorithm:
#   1. Calls are visited in descending order of LOD (log10 Bayes factor).
#   2. A call whose PROBE_START and PROBE_END are both within `delta` of an
#      already retained call is a duplicate and joins that call's event (the
#      earliest retained match wins); otherwise it is retained and opens a new
#      event.
#   3. An event gets gt = 2 when at least one tmrca value has a call with
#      LOD_HAP_NBRS_ONLY > 6 on HAP 1 and another on HAP 2; otherwise gt = 1.
#
# Returns:
#   data frame with one row per retained call and the columns LOD, BP_START,
#   BP_END, PROBE_START, PROBE_END, LOD_HAP_NBRS_ONLY, gt.
#
# Errors:
#   stops if PROBE_START or PROBE_END contains NA, because the duplicate test
#   is undefined for such calls.
function_apply <- function(df) {
  if (anyNA(df$PROBE_START) || anyNA(df$PROBE_END)) {
    stop("PROBE_START and PROBE_END must not contain NA", call. = FALSE)
  }

  # Highest-scoring calls first, so the retained representative of every
  # event is its best-supported call.
  df <- df[order(df$LOD, decreasing = TRUE), ]
  n_calls <- nrow(df)
  probe_start <- df$PROBE_START
  probe_end <- df$PROBE_END

  # kept_rows[k] is the row (in sorted df) of the k-th retained call, which is
  # also the representative of event k; event_of_row maps each row to its event.
  kept_rows <- integer(n_calls)
  event_of_row <- integer(n_calls)
  n_kept <- 0L

  # seq_len() makes the loop a no-op for empty input and lets the first row
  # fall through the general "no match -> new event" branch.
  for (i in seq_len(n_calls)) {
    seen <- kept_rows[seq_len(n_kept)]
    is_duplicate <- abs(probe_start[seen] - probe_start[i]) <= delta &
      abs(probe_end[seen] - probe_end[i]) <= delta
    if (any(is_duplicate)) {
      # Event numbers equal positions in kept_rows, so the first match is the
      # smallest matching event number.
      event_of_row[i] <- which(is_duplicate)[1L]
    } else {
      n_kept <- n_kept + 1L
      kept_rows[n_kept] <- i
      event_of_row[i] <- n_kept
    }
  }

  # Genotyping. which() drops NA comparisons, so calls with a missing HAP or
  # LOD_HAP_NBRS_ONLY can never create a spurious match between haplotypes.
  gt <- rep(1, n_kept)
  well_supported <- !is.na(df$LOD_HAP_NBRS_ONLY) & df$LOD_HAP_NBRS_ONLY > 6
  for (event in seq_len(n_kept)) {
    in_event <- event_of_row == event & well_supported
    hap1_tmrca <- df$tmrca[which(in_event & df$HAP == 1)]
    hap2_tmrca <- df$tmrca[which(in_event & df$HAP == 2)]
    if (length(intersect(hap1_tmrca, hap2_tmrca)) > 0) {
      gt[event] <- 2
    }
  }

  out_columns <- c("LOD", "BP_START", "BP_END", "PROBE_START", "PROBE_END",
                   "LOD_HAP_NBRS_ONLY")
  callsToKeep <- df[kept_rows[seq_len(n_kept)], out_columns, drop = FALSE]
  callsToKeep$gt <- gt
  callsToKeep
}
# Adapter for per-group use: coerces the incoming table to a plain data.frame
# and hands it to function_apply(), returning that function's result unchanged.
pipe_function <- function(.data) {
  plain_df <- as.data.frame(.data)
  function_apply(plain_df)
}
print("deduping AND genotyping")
# Run the de-duplication / genotyping routine once per group; the second
# callback argument (the group key) is not needed.
dplyr_tbl_postDeDuping <- group_modify(
  deduping_needed,
  function(.x, .y) pipe_function(.x)
)
print("Done deduping AND genotyping")

# Calls that were alone in their group keep the same columns as the
# de-duplicated output and are given gt = 1.
passthrough_columns <- c("ID_1", "ID_2", "CHR", "CNV_TYPE", "LOD", "BP_START",
                         "BP_END", "PROBE_START", "PROBE_END",
                         "LOD_HAP_NBRS_ONLY")
dplyr_tbl_noDeDuping_needed <- deduping_unneeded[, passthrough_columns]
dplyr_tbl_noDeDuping_needed$gt <- 1
all <- rbind(dplyr_tbl_postDeDuping, dplyr_tbl_noDeDuping_needed)

# Write a tab-separated text file, then gzip it to "<file>.gz".
deduped_txt_path <- paste0(Dir_save_deduped_calls, "chr", CHR, "_delta", delta,
                           "_deduped_and_genotyped.txt")
fwrite(all, deduped_txt_path,
       quote = FALSE, col.names = TRUE, row.names = FALSE, sep = "\t")
R.utils::gzip(deduped_txt_path, destname = paste0(deduped_txt_path, ".gz"))

