#!/usr/bin/env Rscript
# Command-line arguments (all five are required, in this order):
#   1. CHR                       numeric chromosome label used in the file names
#   2. delta                     numeric delta value used in the file names
#   3. UnionIndicator            0 = keep the highest-LOD call among overlapping calls,
#                                1 = merge (union) overlapping calls
#   4. Dir_deduped_calls         prefix of the deduped input file
#   5. Dir_save_flattened_calls  prefix of the flattened output file
# The two prefixes are concatenated verbatim with the file name (no separator is
# inserted), so a directory must be passed with its trailing path separator.
options(echo = FALSE)
param <- commandArgs(trailingOnly = TRUE)
if (length(param) < 5) {
  stop("Expected 5 arguments: CHR delta UnionIndicator Dir_deduped_calls ",
       "Dir_save_flattened_calls (got ", length(param), ")", call. = FALSE)
}
# Arguments arrive as plain strings; they are converted directly, no evaluation needed.
CHR <- as.numeric(param[1])
delta <- as.numeric(param[2])
UnionIndicator <- as.numeric(param[3])
Dir_deduped_calls <- param[4]
Dir_save_flattened_calls <- param[5]
# Fail early instead of building file names such as "chrNA_deltaNA_...".
if (anyNA(c(CHR, delta, UnionIndicator))) {
  stop("CHR, delta and UnionIndicator must all be numeric", call. = FALSE)
}
# Any other value would read the input and then silently write nothing.
if (!(UnionIndicator %in% c(0, 1))) {
  stop("UnionIndicator must be 0 (highest log10BF) or 1 (union)", call. = FALSE)
}
# -----------#
library(dplyr, warn.conflicts = FALSE)
options(dplyr.summarise.inform = FALSE)
library(data.table)
# -----------#
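# The deduped callset can still contain overlapping CNV calls, so this script
# flattens it. Depending on UnionIndicator it either keeps the highest-scoring
# call among overlapping calls, or builds a "unioned" callset in which
# overlapping CNV calls of the same type (DUP or DEL) are merged.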

# Load the deduped and genotyped calls for this chromosome/delta as a plain
# data.frame, printing the elapsed time of the read.
print("Reading deduped data")
ptm <- proc.time()
deduped_calls_file <- paste0(Dir_deduped_calls, "chr", CHR, "_delta", delta,
                             "_deduped_and_genotyped.txt.gz")
tbl <- fread(deduped_calls_file, header = TRUE, data.table = FALSE)
print(proc.time() - ptm)
print("Done reading data")

# Two flattening modes are available: one in which overlapping calls are unioned
# (e.g. 1-10 and 5-15 --> 1-15), and one which keeps the event with the highest log10BF.

# Tag every call with n = number of calls sharing its ID_1 / ID_2 / CHR / CNV_TYPE,
# then split the table: groups with several calls may overlap and need flattening,
# groups with a single call can be passed through unchanged.
dplyr_tbl_iid_Type <- mutate(group_by(tbl, ID_1, ID_2, CHR, CNV_TYPE), n = n())
calls_in_group <- dplyr_tbl_iid_Type$n
flattening_needed <- dplyr_tbl_iid_Type[calls_in_group > 1, ]
flattening_unneeded <- dplyr_tbl_iid_Type[calls_in_group == 1, ]

# Greedily select non-overlapping calls, preferring the highest LOD.
#
# df: data.frame of calls for one group; must contain the columns LOD, BP_START,
#     BP_END, PROBE_START, PROBE_END, LOD_HAP_NBRS_ONLY and gt.
#
# Calls are sorted by decreasing LOD (log10 Bayes factor). The top call is always
# kept; each following call is kept only if its probe interval overlaps none of
# the calls kept so far (intervals are treated as closed, so touching endpoints
# count as an overlap).
#
# Returns the kept rows, in decreasing-LOD order, restricted to the seven columns
# listed above. A one-row input is returned as is and an empty input yields an
# empty frame. Missing probe coordinates raise an error in the comparison.
function_apply_highestLog10BF <- function(df){
  keep_cols <- c("LOD", "BP_START", "BP_END", "PROBE_START", "PROBE_END", "LOD_HAP_NBRS_ONLY", "gt")
  df <- df[order(df$LOD, decreasing = TRUE), ]
  n_calls <- nrow(df)
  if (n_calls == 0) {
    return(df[, keep_cols])
  }
  starts <- df$PROBE_START
  ends <- df$PROBE_END
  # Logical mask of retained calls; filled in place instead of rbind-ing rows.
  kept <- logical(n_calls)
  kept[1] <- TRUE
  # seq_len(n)[-1] is empty for a single call, unlike 2:n which would yield c(2, 1).
  for (i in seq_len(n_calls)[-1]) {
    kept_start <- starts[kept]
    kept_end <- ends[kept]
    overlaps_kept <- (kept_start <= starts[i] & kept_end >= starts[i]) |  # start of call i inside a kept call
      (kept_start <= ends[i] & kept_end >= ends[i]) |                     # end of call i inside a kept call
      (kept_start >= starts[i] & kept_end <= ends[i])                     # kept call contained in call i
    if (sum(overlaps_kept) == 0) {
      # no previously kept call overlaps this one
      kept[i] <- TRUE
    }
  }
  df[kept, keep_cols]
}
# Adapter used with group_modify(): converts the per-group data to a plain
# data.frame and applies the highest-LOD selection to it.
pipe_function_highestLog10BF <- function(.data){
  group_frame <- as.data.frame(.data)
  function_apply_highestLog10BF(group_frame)
}

# Merge overlapping calls of one group into unioned events.
#
# df: data frame of calls for one group; must contain BP_START, BP_END,
#     PROBE_START and PROBE_END.
#
# Calls are sorted by PROBE_START and swept left to right. A call whose
# PROBE_START is <= the largest PROBE_END seen so far joins the current event;
# otherwise it opens a new event. Each event is reported as the minimum start and
# maximum end of its calls, in both bp and probe coordinates.
#
# Returns one row per event with the columns BP_START, BP_END, PROBE_START and
# PROBE_END. A one-row input yields that call as a single event.
function_apply_union <- function(df){
  df <- df[order(df$PROBE_START), ]
  n_calls <- nrow(df)
  # running_end[i] = largest PROBE_END among calls 1..i
  running_end <- cummax(df$PROBE_END)
  # Call i (i >= 2) opens a new event when it starts after everything before it has ended.
  starts_new_event <- c(TRUE, df$PROBE_START[-1] > running_end[-n_calls])[seq_len(n_calls)]
  if (anyNA(starts_new_event)) {
    stop("PROBE_START / PROBE_END contain missing values; cannot union calls", call. = FALSE)
  }
  df$EventNumber <- cumsum(starts_new_event)
  df <- df %>%
    group_by(EventNumber) %>%
    summarise(BP_START = min(BP_START), BP_END = max(BP_END),
              PROBE_START = min(PROBE_START), PROBE_END = max(PROBE_END))
  return(df[, c("BP_START", "BP_END", "PROBE_START", "PROBE_END")])
}
# Adapter used with group_modify(): converts the per-group data to a plain
# data.frame and unions its overlapping calls.
pipe_function_union <- function(.data){
  group_frame <- as.data.frame(.data)
  function_apply_union(group_frame)
}

# Shared prefix of the output file name: <output prefix>chr<CHR>_delta<delta>
flattened_prefix <- paste0(Dir_save_flattened_calls, "chr", CHR, "_delta", delta)

# Mode 0: within each multi-call group keep only non-overlapping calls, preferring
# the highest LOD, then append the single-call groups and write a gzipped table.
if (UnionIndicator == 0) {
  print("Finding highest log10BF event for overlapping events")
  best_of_overlapping <- group_modify(flattening_needed, ~ pipe_function_highestLog10BF(.x))
  print("Done")

  best_cols <- c("ID_1", "ID_2", "CHR", "CNV_TYPE", "LOD", "BP_START", "BP_END",
                 "PROBE_START", "PROBE_END", "LOD_HAP_NBRS_ONLY", "gt")
  single_call_groups <- flattening_unneeded[, best_cols]
  all_highestlog10bf <- rbind(best_of_overlapping, single_call_groups)

  best_txt <- paste0(flattened_prefix, "_highestlog10BF.txt")
  fwrite(all_highestlog10bf, best_txt,
         quote = FALSE, col.names = TRUE, row.names = FALSE, sep = "\t")
  # Compress the text file written above to <name>.gz
  R.utils::gzip(best_txt, destname = paste0(best_txt, ".gz"))
}

# Mode 1: within each multi-call group merge overlapping calls into unioned events,
# then append the single-call groups and write a gzipped table.
if (UnionIndicator == 1) {
  print("Unioning overlapping events")
  unioned_overlapping <- group_modify(flattening_needed, ~ pipe_function_union(.x))
  print("Done unioning")

  union_cols <- c("ID_1", "ID_2", "CHR", "CNV_TYPE", "BP_START", "BP_END",
                  "PROBE_START", "PROBE_END")
  single_call_groups <- flattening_unneeded[, union_cols]
  all_unioning <- rbind(unioned_overlapping, single_call_groups)

  unioned_txt <- paste0(flattened_prefix, "_unioned.txt")
  fwrite(all_unioning, unioned_txt,
         quote = FALSE, col.names = TRUE, row.names = FALSE, sep = "\t")
  # Compress the text file written above to <name>.gz
  R.utils::gzip(unioned_txt, destname = paste0(unioned_txt, ".gz"))
}






