#get saturation curve for peaks
#to compare peaks from newly detected and the rest
#compare the FDR too

args = commandArgs(trailingOnly=TRUE)
TISS = args[1] #Brain Heart Muscle Lung

# TISS2RoadmapLabel=c(Brain ="BRN",
#                     Heart ="HRT",
#                     Lung="LNG",
#                     Muscle="MUS")
RSC.CUTOFF = 0.8
READSTOTAL.CUTOFF = 1E7

SAMP.N=10
GRP2COLOR =c(newlyDetected="red", rest="lightblue")

library(ggplot2)

#file.in.peaks = paste0("a_2_peakHeightsNormalized_hg19.narrowPeak.RSC0.8.RdsTot1e7.logQ2_V1.3_CSFilt/", TISS, ".peaks.matchedCS.bed")
file.in.metaAndQC = "~/lhou.compbio/data/eGTEX-H3K27ac/Batch_summary_6.16.2018/sample.metaAndQC.RSC0.6.totalReads5e+06.RSC0.8OrMatchedForLung.txt"
#file.in.merge.res = paste("a_mergePeak2HeightsMean_V1.1/hg19.narrowPeak.RSC0.8.RdsTot1e+07.logQ2.merged.GoodSamps.", TISS, ".bed")
file.in.mergePk2Ind.RDS = paste0("a2_mergePeak2IndPeak_V1.1_allSamps/hg19.narrowPeak.RSC0.8.RdsTot10000000.logQ2.merged2IndPeak.matrix.", TISS, ".RDS")
file.in.mod.AREs.RData="../variationAcrossTissueandIndiv/a_4_modules_annotations_mergedPk4Tiss_Epimap_V1.2.3_CSandeGTExActivity/mARE2ModuleGrpName.RData"
file.in.tiss2ARE.RData="./c_mergePeaksV1.3FromTissues_hg19/4Tiss.mergedPeak2tissuePeak.hg19.RData"

dir.ou = "c_peaks_saturationCurve_V1.1_Grp14vsRest/"
dir.create(dir.ou, showWarnings = F, recursive = T)
file.ou.RData = paste0(dir.ou, TISS, ".peaks.ind.saturation.RData")
file.ou.pdf = paste0(dir.ou, TISS, ".peaks.ind.saturation.pdf")


#
meta = read.table(file.in.metaAndQC, sep="\t", header=T, row.names=1)
meta.tiss = meta[meta[,"tissue"]==TISS,]
samps.tiss.good = rownames(meta.tiss)[meta.tiss$totalReads>=READSTOTAL.CUTOFF & meta.tiss$RSC>=RSC.CUTOFF]

# #
# buf= read.table(file.in.peaks, sep="\t", row.names = NULL, header = F)
# peaks.filt=paste0(buf[,1], ":", buf[,2], "-", buf[,3])
load(file.in.mod.AREs.RData)
load(file.in.tiss2ARE.RData)

mergePeak2IndPeak = readRDS(file.in.mergePk2Ind.RDS)
rownames(mergePeak2IndPeak) = sub("\t", ":", rownames(mergePeak2IndPeak))
rownames(mergePeak2IndPeak) = sub("\t", "-", rownames(mergePeak2IndPeak))
mergePeak2PresCounts = apply(mergePeak2IndPeak!=0, 1, sum)

samp.ref= colnames(mergePeak2IndPeak)[grepl("^E", colnames(mergePeak2IndPeak))]

mAREs.byGrp = list()
mAREs.byGrp$newlyDetected = names(mARE2ModGrpName)[mARE2ModGrpName=="Newly detected"]
mAREs.byGrp$rest = setdiff(names(mARE2ModGrpName), mAREs.byGrp$newlyDetected)

tiss.pks.byGrp = lapply(mAREs.byGrp,
FUN=function(mAREs)
{
  unlist(tiss.mergedPeak2peaks[[TISS]][gsub(":|-", "\t", mAREs)])
})
  
inds.sampled=lapply(1:(length(samps.tiss.good)-1),
FUN=function(N)
{
  rbind(sapply(1:SAMP.N, FUN=function(i)sample(samps.tiss.good, N, replace = F)))
})


peaks.detectAndRepProp.list = lapply(names(tiss.pks.byGrp),
FUN=function(nm)
{
  peaks=tiss.pks.byGrp[[nm]]
  
  res=lapply(inds.sampled,
  FUN=function(inds.i)
  {
    t(apply(inds.i, 2, 
    FUN=function(samps.dis)
    {
      print(samps.dis)
      samps.dis=c(samps.dis, samp.ref)
       
      pks.dis= peaks[apply(mergePeak2IndPeak[peaks, samps.dis]>0, 1, sum)>=2]
      prop.dis= length(pks.dis)/length(peaks)
      
      samps.rep= setdiff(colnames(mergePeak2IndPeak), samps.dis)
      pks.rep= peaks[apply(mergePeak2IndPeak[peaks, samps.rep]>0, 1, sum)>=1]
      
      prop.rep=  length(intersect(pks.dis, pks.rep))/length(pks.dis)
      
      return(c(pks.dis=length(pks.dis),
               prop.detected=prop.dis,
               pks.rep=length(pks.rep),
               prop.replicated=prop.rep,
               indiv.number=length(samps.dis)-1))
    }))
  })
  data.frame(do.call(rbind, res),
             group=nm,
             stringsAsFactors = F)
}) 

peaks.detectAndRepProp.df=do.call(rbind, peaks.detectAndRepProp.list)
# peaks.detectAndRepProp.df = data.frame()
# for(nm in names(peaks.detectAndRepProp.list))
# {
#   df = data.frame(peaks.detectAndRepProp[[nm]],
#                   group = nm)
#                   #indiv.number = rep(1:ncol(peaks.detectAndRepProp[[nm]]), each =nrow(peaks.detectAndRepProp[[nm]])))
#   peaks.detectAndRepProp.df = rbind(peaks.detectAndRepProp.df, df)
# }


save(mergePeak2IndPeak,
     #peaks.filt,
     tiss.pks.byGrp,
     inds.sampled,
     peaks.detectAndRepProp.df,
     #peaks.detectAndRepProp.df,
     file=file.ou.RData)


pdf(file.ou.pdf, width=12, height=5)


# peak.indivCounts.df = data.frame(indiv.counts=mergePeak2PresCounts,
#                                  group="background",
#                                  stringsAsFactors = F)
# rownames(peak.indivCounts.df) = names(mergePeak2PresCounts)
# # for(grp in names(tiss.pks.byGrp))
# # {
# #   peak.indivCounts.df[tiss.pks.byGrp[[grp]], "group"] = grp
# # }
# for(grp in names(tiss.pks.byGrp))
# {
#   p = ggplot(peak.indivCounts.df[tiss.pks.byGrp[[grp]],], 
#              aes(x=indiv.counts)) + 
#     geom_histogram(binwidth=.5, fill=GRP2COLOR[grp]) + 
#     ggtitle(paste(TISS, grp)) +
#     #scale_fill_manual(values=c(novel = "red", rest = "cyan", background ="grey")) +
#     theme_bw()
#   print(p)
# }

p = ggplot(peaks.detectAndRepProp.df,
           aes(x=factor(indiv.number), y=prop.detected)) + 
  geom_boxplot(aes(fill=group), lwd=.2, outlier.shape = NA) +
  scale_fill_manual(values=GRP2COLOR) + 
  ggtitle(TISS) +
  theme_bw()

print(p)

p = ggplot(peaks.detectAndRepProp.df,
           aes(x=factor(indiv.number), y=prop.replicated)) + 
  geom_boxplot(aes(fill=group), lwd=.2, outlier.shape = NA) +
  scale_fill_manual(values=GRP2COLOR) + 
  ggtitle(TISS) +
  theme_bw()

print(p)


dev.off()