#!/usr/bin/env Rscript
options(echo=FALSE);
# Positional command-line arguments (all four are required):
#   1. CHR                   - chromosome number, used in the file names
#   2. TMRCA                 - numeric value, used in the file names
#   3. DIR_UKBB_calls        - path prefix of the input calls; it is pasted
#                              directly in front of "chr<CHR>...", so it must
#                              end with a path separator
#   4. DIR_Save_Merged_calls - path prefix for the merged output (same rule)
param <- commandArgs(trailingOnly = TRUE)
if (length(param) < 4) {
  # Fail early: a missing argument would otherwise become NA and surface
  # later as a confusing file path such as "NAchrNA.ibdNA.txt.gz".
  stop("Expected 4 arguments: CHR TMRCA DIR_UKBB_calls DIR_Save_Merged_calls",
       call. = FALSE)
}
CHR <- as.numeric(param[1])
TMRCA <- as.numeric(param[2])
DIR_UKBB_calls <- param[3]
DIR_Save_Merged_calls <- param[4]
# -------------#
library(dplyr, warn.conflicts = FALSE)
library(data.table)
options(dplyr.summarise.inform = FALSE)
# -------------#
# Load the call table for this chromosome; the file name is assembled from
# the input path prefix, CHR and TMRCA.
print(paste0("Reading data from",DIR_UKBB_calls))
calls_path <- paste0(DIR_UKBB_calls, "chr", CHR, ".ibd", TMRCA, ".txt.gz")
tbl <- fread(calls_path, header = TRUE)
print("Done reading data")
# -------------# 
# We have already restricted putative deletions to span at least 50 bp
# and duplications to span at least 500 bp with log10BF > 9 support.
# If this step hasn't been completed, a line such as the one below can be uncommented and run:
# tbl = tbl[(tbl$Type == "DEL" & tbl$LENGTH_KB >= 0.05) | (tbl$Type == "DUP" & tbl$LENGTH_KB >= 0.5 & tbl$LOD > 9) ,]
# -------------#
# We further post-processed the segments that survived filtering by bridging 
# short gaps between consecutive segments of the same copy-number state 
# (because the Viterbi path through long CNVs was sometimes interrupted by 
# short sequences of no-CNV states). 

# Annotate every call with the number of calls (n) in its
# (ID_1, ID_2, HAP, CHR, CNV_TYPE) group; the result stays grouped.
dplyr_tbl <- mutate(group_by(tbl, ID_1, ID_2, HAP, CHR, CNV_TYPE), n = n())
# A group holding a single call has nothing to merge with, so only groups
# with more than one call are sent through the merging step.
has_multiple_calls <- dplyr_tbl$n > 1
merging_needed <- dplyr_tbl[has_multiple_calls, ]
merging_unneeded <- dplyr_tbl[dplyr_tbl$n == 1, ]
#
# 
# Bridge short gaps between the CNV segments of a single group.
#
# Segments are visited in order of PROBE_START and folded into "events".
# A segment joins the current event when the gap separating it from the
# event either
#   (i)  contains at most 4 probes and spans less than 20 kb, or
#   (ii) spans at most 20% of the length the event would have after bridging.
# Each time the current event grows, earlier events are re-examined (most
# recent first) and absorbed for as long as criterion (ii) holds for them.
#
# Args:
#   df: data frame with the columns PROBE_START, PROBE_END, BP_START, BP_END,
#       LOD and LOD_HAP_NBRS_ONLY. Any other column is ignored.
#
# Returns:
#   A tibble with one row per merged event (ordered by event number) and the
#   columns LOD, BP_START, BP_END, PROBE_START, PROBE_END, LOD_HAP_NBRS_ONLY.
#   Coordinates are the minimum start / maximum end over the segments of the
#   event; both LOD columns are summed.
function_apply <- function(df){
  out_cols <- c("LOD","BP_START","BP_END","PROBE_START","PROBE_END","LOD_HAP_NBRS_ONLY")
  # A zero-row input (e.g. the prototype that group_modify() passes when there
  # is no group to process) has nothing to merge: return an empty result
  # instead of failing inside the loop.
  if (nrow(df) == 0) {
    return(dplyr::as_tibble(df[, out_cols, drop = FALSE]))
  }
  df <- df[order(df$PROBE_START),]
  numEvent <- c(1)      # event number assigned to each segment visited so far
  eventNumber <- 1      # number of the event currently being extended
  # Extent of the current event: last probe index, and first/last base pair.
  endIndex <- df$PROBE_END[1]
  endIndex_BP <- df$BP_END[1]
  startIndex_BP <- df$BP_START[1]

  # seq_len(n)[-1] is empty for a single segment, whereas 2:n would yield c(2, 1).
  for (i in seq_len(nrow(df))[-1]) {
    gap_bp <- df$BP_START[i] - endIndex_BP - 1
    few_probes_and_short <- df$PROBE_START[i] <= endIndex + 5 && gap_bp < 20000
    small_fraction <- gap_bp / (df$BP_END[i] - startIndex_BP + 1) <= 0.20
    if (few_probes_and_short || small_fraction) {
      # Merge segment i into the current event; its start is unchanged.
      numEvent <- c(numEvent, eventNumber)
      endIndex <- df$PROBE_END[i]
      endIndex_BP <- df$BP_END[i]
      event_ids <- unique(numEvent)
      if (length(event_ids) > 1) {
        # The event just grew, so earlier events may now satisfy the 20% rule.
        # Walk backwards from the most recent one; [-1] skips the current event.
        for (prevEvent in sort(event_ids, decreasing = TRUE)[-1]) {
          prev_rows <- which(numEvent == prevEvent)
          end_prev_event_BP <- max(df$BP_END[prev_rows])
          start_prev_event_BP <- min(df$BP_START[prev_rows])
          # Only the fractional criterion is checked here: a gap within
          # 4 probes and 20 kb would already have been merged.
          if ((startIndex_BP - end_prev_event_BP - 1) / (endIndex_BP - start_prev_event_BP + 1) <= 0.20) {
            # Absorb the earlier event; the current event now starts earlier.
            numEvent[prev_rows] <- eventNumber
            startIndex_BP <- start_prev_event_BP
          } else {
            break # the closest earlier event cannot be merged, so stop looking
          }
        }
      }
    } else {
      # Segment i opens a new event.
      eventNumber <- eventNumber + 1
      numEvent <- c(numEvent, eventNumber)
      endIndex <- df$PROBE_END[i]
      # Base-pair end of the new event. This must be BP_END: using the probe
      # index here would inflate every later gap measured against this event.
      endIndex_BP <- df$BP_END[i]
      startIndex_BP <- df$BP_START[i]
    }
  }
  df$EventNumber <- numEvent
  merged <- dplyr::summarise(dplyr::group_by(df, EventNumber),
                             BP_START = min(BP_START), BP_END = max(BP_END),
                             PROBE_START = min(PROBE_START), PROBE_END = max(PROBE_END),
                             LOD = sum(LOD), LOD_HAP_NBRS_ONLY = sum(LOD_HAP_NBRS_ONLY))
  return(merged[, out_cols])
}
# Adapter used with group_modify(): converts the per-group data it receives
# into a plain data.frame and hands it to function_apply().
pipe_function <- function(.data) {
  group_df <- as.data.frame(.data)
  function_apply(group_df)
}
# Run the gap-bridging step on every group that holds more than one call.
print("Merging calls")
dplyr_tbl_postMerging <- group_modify(merging_needed, ~ pipe_function(.x))
print("Done merging calls")
# Single-call groups are passed through unchanged, restricted to the same
# columns the merged groups carry.
output_cols <- c("ID_1", "ID_2", "HAP", "CHR", "CNV_TYPE",
                 "LOD", "BP_START", "BP_END",
                 "PROBE_START", "PROBE_END", "LOD_HAP_NBRS_ONLY")
dplyr_tbl_noMerging_needed <- merging_unneeded[, output_cols]
all <- rbind(dplyr_tbl_postMerging, dplyr_tbl_noMerging_needed)

# Write a tab-separated text file, then gzip it next to the original name.
merged_txt_path <- paste0(DIR_Save_Merged_calls, "chr", CHR, ".ibd", TMRCA, ".txt")
fwrite(all, merged_txt_path,
       quote = FALSE, col.names = TRUE, row.names = FALSE, sep = "\t")
R.utils::gzip(merged_txt_path, destname = paste0(merged_txt_path, ".gz"))
