#V1.2
#we may need consider the length of the peak 
#for permutation, only interested in peaks as "nonMatchedTissueOnly"
#but stil use all the peaks in that group for permutation
#those longer peaks will be with higher reproducible rate thus more confident

#V1.1
#use peak present in each individual at merged peak location instead of number of individual peaks 
#use permutation to test how likely x merged peaks overlapping with two or more peaks each from different individuals by chance

#compare peaks found to all chromatin states from Roadmap
#to answer 
#a) do most peak comeing from brain promoter or enhancers
#b) how many are not detected as promoters or enhancers in brain regions
#c) how many are not detected as any promoter or enhancer regions in any tissue at all
#d) how is that related to overlap times with peaks from each individuals


args = commandArgs(trailingOnly=TRUE)
TISS = args[1] #Brain Heart Muscle Lung


options(scipen = 999)

OVLP.MIN= 50
#BIN.MAX = 100
PERM.NUM=50

TISS2RoadmapLabel=c(Brain ="BRN",
                    Heart ="HRT",
                    Lung="LNG",
                    Muscle="MUS")
PEAKLEN.BINs = c(0, 200, 400, 600, 800, 1000, 1500, 2000)

CSGRP2COLOR = c("red", "orange", "yellow", "grey")
#names(CSGRP2COLOR) = 

#file.in.peak = paste0("./a_mergePeak2HeightsMean_V1.1/hg19.narrowPeak.RSC0.8.RdsTot1e+07.logQ2.merged.GoodSamps.", TISS, ".bed")
#file.in.peak = paste0("a_mergePeak2HeightsMean_V1.1/hg19.narrowPeak.RSC0.8.RdsTot1e+07.logQ2.merged.GoodSamps.", TISS, ".bed")
file.in.peak = paste0("a_mergePeak2HeightsMean_V1.2_addRefPeak/hg19.narrowPeak.RSC0.8.RdsTot1e+07.logQ2.merged.GoodSamps.", TISS, ".bed")
file.in.mergePk2Ind.RDS = paste0("a2_mergePeak2IndPeak/hg19.narrowPeak.RSC0.8.RdsTot10000000.logQ2.merged2IndPeak.matrix.", TISS, ".RDS")
file.in.Roadmap.meta = "~/lhou.compbio/data/Roadmap/ROADMAP_sampleMeta_colorCode.txt"
files.in.Roadmap.CS = dir("~/lhou.compbio/data/Roadmap/18state_6marks_98epigen/", "_18_core_K27ac_mnemonics.bed.gz", full.names = T)


dir.tmp= "~/hptmp/eGTEx-H3K27ac/cmpPeak2RoadmapCS/"
if(!dir.exists(dir.tmp))
{
  dir.create(dir.tmp)
}

dir.ou="b_cmpPeak2RoadmapChromStates_V1.2_pkLen/"
if(!dir.exists(dir.ou))
{
  dir.create(dir.ou)
}
file.ou.RData = paste0(dir.ou, TISS, ".ovlp.Roadmap18CS.Ovlp", OVLP.MIN, "bp.RData")
file.ou.pdf = paste0(dir.ou, TISS, ".ovlp.Roadmap18CS.Ovlp", OVLP.MIN, "bp.CSTypeProp.pdf")


#
Roadmap.meta = read.table(file.in.Roadmap.meta, head=F, skip=1, sep="\t", row.names=1, comment.char = "", stringsAsFactors = F)
RoadmapSamps.tiss = rownames(Roadmap.meta)[grepl(TISS2RoadmapLabel[TISS], Roadmap.meta[, 4])]
#
mergePeak2IndPeak = readRDS(file.in.mergePk2Ind.RDS)

peaks.all.bed = rownames(mergePeak2IndPeak)
peaks.all2len = sapply(peaks.all.bed,
FUN=function(pk)
{
  buf = as.numeric(unlist(strsplit(pk, split="\t", fixed=T))[2:3])
  return(buf[2]-buf[1])
})
names(peaks.all2len) = peaks.all.bed
#
peak2mergIndNumber = apply(mergePeak2IndPeak, 1, 
FUN=function(x)
{
  sum(x!=0)
})
#names(peak2mergIndNumber) = peaks.all.bed
#
for(f.i.cs in files.in.Roadmap.CS)
{
  print(f.i.cs)
  f.tmp.ovlp = paste0(dir.tmp, TISS, ".ovlp.", gsub(".gz", "", basename(f.i.cs)))
  # if(!file.exists(f.tmp.ovlp))
  # {
  cmd =paste0("bedtools intersect -a ", file.in.peak, " -b ", f.i.cs, " -wo >", f.tmp.ovlp)
  system(cmd)
  # }
  
}

#
files.tmp.ovlp = dir(dir.tmp, paste0(TISS, ".ovlp.*_18_core_K27ac_mnemonics.bed"), full.names = T)
names(files.tmp.ovlp) = sapply(files.tmp.ovlp,
FUN=function(f.tmp)
{
  ref = gsub(".*(E\\d+)_.*","\\1", basename(f.tmp), perl =T)
}
)

peak2RefCS = sapply(files.tmp.ovlp,
FUN=function(f.tmp.ovlp)
{
  print(f.tmp.ovlp)
  
  pk2cs.all = rep(NA, length(peaks.all.bed))
  names(pk2cs.all) = peaks.all.bed
  
  cmd =paste0("awk '($9>=", OVLP.MIN, "){print $1 \"\\t\" $2 \"\\t\"$3 \",\" $8 }' ", f.tmp.ovlp)
  buf = sapply(system(cmd, intern = T), FUN=function(x){unlist(strsplit(x, split=","))})
  pk2cs = sapply(split(buf[2,], buf[1,]), paste, collapse=",")
  pk2cs.all[names(pk2cs)] = pk2cs
  
  return(pk2cs.all)
})
#colnames(peak2RefCS) = 

save(peaks.all.bed, peak2RefCS, file=file.ou.RData)

#
RoadmapSamps.tiss.ovlp = intersect(names(files.tmp.ovlp), RoadmapSamps.tiss)
RoadmapSamps.tiss.other = setdiff(names(files.tmp.ovlp),RoadmapSamps.tiss)
peak2RefCSType = apply(peak2RefCS, 1,
FUN=function(cs)
{
  #print(cs)
  csIsTssOrEnh = grepl("Tss|Enh", cs)
  names(csIsTssOrEnh) = names(cs)
  
  if(any(csIsTssOrEnh[RoadmapSamps.tiss.ovlp], na.rm = T))
  {
    if(any(csIsTssOrEnh[RoadmapSamps.tiss.other], na.rm = T))
    {
      return("Tss|Enh.MatchedAndNotTissue")
    }
    else
    {
      return("Tss|Enh.MatchedTissueOnly")
    }
  }else
  {
    if(any(csIsTssOrEnh, na.rm = T))
    {
      return("Tss|Enh.nonMatchedTissueOnly")
    }else
    {
      return("Tss|Enh.noTissue")
    }
  }
})

# peak2RefCSTypeAndmergePeakNum= data.frame(chromatinStates = peak2RefCSType,
#                                           mergePeakNum = peak2mergIndNumber)


RefCSType2peakProp.bin = sapply(1:ncol(mergePeak2IndPeak),
FUN=function(i)
{
  pks = names(peak2RefCSType)[peak2mergIndNumber==i]
  CSType2Counts = summary(factor(peak2RefCSType[pks], 
                                 levels=c("Tss|Enh.MatchedTissueOnly", 
                                          "Tss|Enh.MatchedAndNotTissue",
                                          "Tss|Enh.nonMatchedTissueOnly", 
                                          "Tss|Enh.noTissue")))
  
  CSType2prop = CSType2Counts/sum(CSType2Counts)
})
colnames(RefCSType2peakProp.bin) = 1:ncol(mergePeak2IndPeak)

peakNum.bin = sapply(1:ncol(mergePeak2IndPeak),
FUN=function(i)
{
  pks = names(peak2RefCSType)[peak2mergIndNumber==i]
  length(pks)
})

save(peaks.all.bed, RoadmapSamps.tiss.ovlp, 
     peak2RefCS, peak2RefCSType, RefCSType2peakProp.bin, peakNum.bin,
     file=file.ou.RData)

#permutation based estimation of background signal percentage
peakGrps.byCS=list()
peakGrps.byCS$TorE.MatchedTiss = names(peak2RefCSType)[peak2RefCSType=="Tss|Enh.MatchedTissueOnly"]
peakGrps.byCS$TorE.MatchedAndNotTiss = names(peak2RefCSType)[peak2RefCSType=="Tss|Enh.MatchedAndNotTissue"]
peakGrps.byCS$TorE.nonMatchedTissOnly = names(peak2RefCSType)[peak2RefCSType=="Tss|Enh.nonMatchedTissueOnly"]
#peakGrps.byCS$TorE.noMatchedTiss = c(names(peak2RefCSType)[peak2RefCSType=="Tss|Enh.noTissue"], 
                                  #names(peak2RefCSType)[peak2RefCSType=="Tss|Enh.nonMatchedTissueOnly"])
peakGrps.byCS$TorE.noTiss = names(peak2RefCSType)[peak2RefCSType=="Tss|Enh.noTissue"]


peakGrps.byCSandLen = lapply(peakGrps.byCS,
FUN=function(pks)
{
  pkGrp.byLen = lapply(1:length(PEAKLEN.BINs),
  FUN=function(i)
  {
    if(i<length(PEAKLEN.BINs))
    {
      return(pks[peaks.all2len[pks]>PEAKLEN.BINs[i] & peaks.all2len[pks]<=PEAKLEN.BINs[i+1]])
    }else
    {
      return(pks[peaks.all2len[pks]>=PEAKLEN.BINs[i]])
    }
  })
  names(pkGrp.byLen) = PEAKLEN.BINs
  
  return(pkGrp.byLen)
})


peakGrp.byCS2IndFreq = sapply(peakGrps.byCS,
FUN=function(pks)
{
  apply(mergePeak2IndPeak[pks,], 2,
  FUN=function(x)
  {
    sum(x!=0)/length(x)
  }
  )
})

peakGrp.byCSandLen2IndFreq = lapply(peakGrps.byCSandLen,
FUN=function(pks.grp)
{
  
  res=sapply(pks.grp,
  FUN=function(pks.subGrp)
  {
    if(length(pks.subGrp)==0)
    {
      return(NA)
    }
      
    apply(mergePeak2IndPeak[pks.subGrp,], 2,
    FUN=function(x)
    {
      sum(x!=0)/length(x)    
      
    })
    
  })
  names(res) = names(pks.grp)
  return(res)
  
})

#perm 

grp.slctForPerm = "TorE.nonMatchedTissOnly" #only focus on nonMatchedTissueOnly type of peaks
mP2IndPk.nonMatchedTissOnly = mergePeak2IndPeak[peakGrps.byCS[[grp.slctForPerm]], ]
peak.nonMatchedTissOnly.2IndCounts =  apply(mP2IndPk.nonMatchedTissOnly, 1, 
FUN=function(x)
{
  sum(x!=0)
})
peak.nonMatchedTissOnly.IndCountsDist=sapply(0:ncol(mergePeak2IndPeak),
FUN=function(i)
{
  sum(peak.nonMatchedTissOnly.2IndCounts>=i)
})

#permutation according to cs+length grp
peaks.nonMatchedTissOnly.IndCountsDist.perm = t(sapply(1:PERM.NUM,
FUN=function(i)
{
  print(i)
  # mP2IndPk.nonMatchedTissOnly.perm = apply(mP2IndPk.nonMatchedTissOnly, 2, 
  # FUN=function(x)
  # {
  #   unlist(lapply(peakGrps.byCSandLen[[grp.slctForPerm]],
  #   FUN=function(pks.grp)
  #   {
  #     sample(x[pks.grp], length(pks.grp))  
  #   }))
  #   
  # })
  mP2IndPk.nonMatchedTissOnly.perm = apply(mP2IndPk.nonMatchedTissOnly, 2, 
  FUN=function(x)
  {
    sample(x, length(x))  
    
  })
  
  pk2IndCounts= apply(mP2IndPk.nonMatchedTissOnly.perm, 1, 
  FUN=function(x)
  {
    sum(x!=0)
  })
  
  sapply(0:ncol(mergePeak2IndPeak),
  FUN=function(i)
  {
    sum(pk2IndCounts>=i)
  })
  
}))
colnames(peaks.nonMatchedTissOnly.IndCountsDist.perm)=0:ncol(mergePeak2IndPeak)


save(peaks.all.bed, RoadmapSamps.tiss.ovlp, 
     peak2RefCS, peak2RefCSType, RefCSType2peakProp.bin, 
     peak2mergIndNumber,
     peaks.all2len,
     peakNum.bin,
     peakGrps.byCS,
     peakGrps.byCSandLen,
     peakGrp.byCS2IndFreq,
     peakGrp.byCSandLen2IndFreq,
     peak.nonMatchedTissOnly.IndCountsDist,
     peaks.nonMatchedTissOnly.IndCountsDist.perm,
     file=file.ou.RData)


###############################
pdf(file.ou.pdf, width =20, heigh=9)
par(mar=c(6,6,6,18))
par(xpd = T)
barplot(RefCSType2peakProp.bin, 
        xlab="peaks detected in individuals #", 
        ylab ="proportion of chromatin state type",
        col = CSGRP2COLOR,
        las=2,
        border = NA, space=0.1,
        cex.lab = 1.5)
legend(#"topright", 
       x=1,
       y=1.15,
       legend=rownames(RefCSType2peakProp.bin), 
       #inset=c(-0.25,0),
       fill=CSGRP2COLOR)
par(new=T)
plot(1:ncol(mergePeak2IndPeak), peakNum.bin, 
     xaxt="n", yaxt="n",xlab="",ylab="", type="b", bty="n", 
     lwd=2, cex= 1.5)
axis(4)
mtext("peaks number",side=4, line=3, cex = 1.5)#,col="blue")

###################
par(mar=c(6,6,6,6))
boxplot(peakGrp.byCS2IndFreq, col = CSGRP2COLOR,
        ylab="proportion of merged peaks\ndetected in individual")
layout(matrix(1:4, ncol=2))
for(i in 1:length(peakGrp.byCSandLen2IndFreq))
{
  
  boxplot(peakGrp.byCSandLen2IndFreq[[i]], 
          col = CSGRP2COLOR[i],
          ylab="proportion of merged peaks\ndetected in individual",
          xlab="peak length cutoff (bp)",
          cex.lab=1.5,
          main=names(peakGrp.byCSandLen2IndFreq)[i])
  par(new=T)
  plot(1:length(peakGrps.byCSandLen[[i]]), 
       sapply(peakGrps.byCSandLen[[i]],length), 
       xaxt="n", yaxt="n",xlab="",ylab="", type="b", bty="n", 
       lwd=1.5, cex= 1)
  axis(4)
  mtext("peaks number",side=4, line=3, cex = 1.5)#,col="blue")


}


layout(matrix(1, ncol=1))
peaks.nonMatchedTissOnly.IndCountsDist.perm.mean=apply(peaks.nonMatchedTissOnly.IndCountsDist.perm,2,mean)
# peaks.nonMatchedTissOnly.IndCountsDist.perm.FC = sapply(1:sum(peaks.nonMatchedTissOnly.IndCountsDist.perm.mean!=0),
# FUN=function(i)
# {
#   peaks.nonMatchedTissOnly.IndCountsDist.perm[,peaks.nonMatchedTissOnly.IndCountsDist.perm.mean!=0][,i]/
#     peaks.nonMatchedTissOnly.IndCountsDist.perm.mean[peaks.nonMatchedTissOnly.IndCountsDist.perm.mean!=0][i]
# })
# colnames(peaks.nonMatchedTissOnly.IndCountsDist.perm.FC) = names(peaks.nonMatchedTissOnly.IndCountsDist.perm.mean)[peaks.nonMatchedTissOnly.IndCountsDist.perm.mean!=0]
# #rownames(peaks.nonMatchedTissOnly.IndCountsDist.perm.FC) = rownames(peaks.nonMatchedTissOnly.IndCountsDist.perm) 
peaks.nonMatchedTissOnly.IndCountsDist.perm.FC = apply(peaks.nonMatchedTissOnly.IndCountsDist.perm,2,
FUN=function(x)
{
  x/(mean(x)+1)
})
boxplot(ylim=c(0, 5), 
        log10(peaks.nonMatchedTissOnly.IndCountsDist.perm.FC+1),
        ylab="log10 FC of peaks # detected\n compared to mean in permutation",
        xlab="no less than individual #",
        main= paste(TISS, grp.slctForPerm, sep=" "))
points(1:ncol(peaks.nonMatchedTissOnly.IndCountsDist.perm.FC), 
       log10(peak.nonMatchedTissOnly.IndCountsDist/(peaks.nonMatchedTissOnly.IndCountsDist.perm.mean+1)+1), pch="x", col="red",
       cex=2,
       cex.lab=1.5)
legend("topright", 
       legend=c("observed", "permutation"), 
       #inset=c(-0.25,0),
       pch=c("X", "o"),
       col=c("red", "black"),
       cex= 1.5
       )
par(new=T)
plot(1:ncol(peaks.nonMatchedTissOnly.IndCountsDist.perm.FC),
     peak.nonMatchedTissOnly.IndCountsDist,
     log="y",
     xaxt="n", yaxt="n",xlab="",ylab="", type="b", bty="n", 
     lwd=2, cex= 1.5)
axis(4, at=c(1, 10, 100, 1000, 10000))
mtext("peaks #",side=4, line=3, cex = 1.5)#,col="blue")

# par(mar=c(6,6,6,6))
# layout(matrix(1:15, ncol=5))
# for(i in 1:ncol(peaks.noMatchTiss.2IndCounts.perm))
# {
#   
# }

dev.off()