#V4.2
#keep reads overlapping with all filtered merged peaks 
#compare the proportion of reads in reference overlapped peaks, selected reference overlapping peaks

#V4.1 
#merge peak only from samples with enough reads and good RSC
#use logq as cutoff to filter peaks

#V4
#merge peaks from samples with good quality [defined by mismatch] and only non-AD related samples would be included
#do not use wiggler to calculate peak heights, but use shifted reads coverage instead
#also caculate counts for referecen samples
#normalized to 1kb region are not used any more

#V3
#merge peaks from samples only with good quality and non-AD samples would be included
#remove those merged peaks overlapping with peak from at least two individuals
#get peak counts from normalized bw, and re-normalzied to 30,000,000 reads in total from 1*10^9


#V2
#merge peaks, get reads counts and normalize by GC content
#merge peak
# need to have overlap with peak from at least one other individuals


#for GC  normalization
#a) base-pair-level union of peak regions from all individuals as the set of active regions in our samples
#b) H3K27ac peak height was defined as the number of ChIP-seq fragments in the peak regions. For paired-end reads, the ChIP-seq fragment was defined as the entire region between the outer edges of the read pair, whereas for single-end reads we used the median fragment length across all libraries. Fragments that lay only partially within the peak region were counted according to their overlap fraction. 
#c) Peak heights were normalized for G+C content biases as in Pickrell et al
#https://www.nature.com/nature/journal/v464/n7289/full/nature08872.html


args = commandArgs(trailingOnly=TRUE)
HM = args[1]


R.LIBS.MYPATHs = c(V3.5="~/lhou.compbio/libs/R/x86_64-pc-linux-gnu-library/3.5/",
                V3.4="~/lhou.compbio/libs/R/x86_64-pc-linux-gnu-library/3.4/",
                V3.3="~/lhou.compbio/libs/R/x86_64-pc-linux-gnu-library/3.3/")
R.LIBS.PATH = .libPaths()

if(!grepl("R version 3.3", R.version$version.string))
{
  stop("reuse R-3.3")
}
.libPaths(c(R.LIBS.MYPATHs["V3.3"], R.LIBS.PATH))


N.CUTOFF=2 #need to appear in at least 10 percent of all samples
SUPER.TOTALREADs.CUTOFF=3e7
SUPER.RSC.CUTOFF=2
PEAK.LogQ.CUTOFF =2
GOOD.TOTALREADs.CUTOFF=2e7
GOOD.RSC.CUTOFF=1

TYPE2COL=c("grey","red")
names(TYPE2COL)=c("control", "case") #c("IBDRA_Control", "CD", "UC", "RA")

REFs.SELECT= c("E040","E039", "E048", "E047", "E029", "E032", "E046", "E030")# ,"  "BPNeutrop") #use neutrophil from ROADMAP instead of blueprint

REF2CELLTYPE= 
  c(
    E062="Primary mononuclear cells from peripheral blood",
    E034="Primary T cells from peripheral blood",
    E045="Primary T cells effector/memory enriched from peripheral blood",
    #E033="Primary T cells from cord blood",
    E044="Primary T regulatory cells",# from peripheral blood",
    E043="Primary T helper cells",# from peripheral blood",
    E041="Primary T helper cells PMA-I stimulated",
    E042="Primary T helper 17 cells PMA-I stimulated",
    E040="Primary T helper memory cells1",#  from peripheral blood 1",
    E037="Primary T helper memory cells2",# from peripheral blood 2",
    E038="Primary T helper naive cells",#  from peripheral blood",
    E039="Primary T helper naive cells",# from peripheral blood",
    E048="Primary T CD8+ memory cells",#  from peripheral blood",
    E047="Primary T CD8+ naive cells",#  from peripheral blood",   
    E029="Primary monocytes",#  from peripheral blood",
    #E031="Primary B cells from cord blood",
    # E035="Primary hematopoietic stem cells",
    # E051="Primary hematopoietic stem cells G-CSF-mobilized Male",
    #E050="Primary hematopoietic stem cells G-CSF-mobilized Female",
    # E036="Primary hematopoietic stem cells short term culture",
    E032="Primary B cells",#  from peripheral blood",
    E046="Primary Natural Killer cells",#,#  from peripheral blood",
    E030="Primary neutrophils"#  from peripheral blood"
    )

sh.head = paste("#!/bin/bash",
                "#$ -S /bin/bash",
                "#$ -P compbio_lab",
                "##$ -V",
                "#$ -cwd",
                "#$ -e ./error.$JOB_NAME.$JOB_ID",
                "#$ -o ./outpt.$JOB_NAME.$JOB_ID",
                "#$ -l h_vmem=30g",
                "source ~/.bashrc",
                "source ~/.my.bashrc",
                sep="\n")


##Mayo
dir.in = paste0("~/lhou.compbio/data/Mayo_Bipolar/Batch_Feb.2019/AQUAS_", HM, "/")
file.in.meta = paste0("~/lhou.compbio/data/Mayo_Bipolar/Batch_Feb.2019/c_metaAndQC/", HM, ".sample.metaDataAndQC.txt")
file.in.peak.suf = "out/peak/macs2/rep1/*nodup.tagAlign.pval0.01.500K.filt.narrowPeak.gz"
file.in.tagAlign.suf = "out/align/rep1/*nodup.tagAlign.gz"
# file.in.badSamp = "b_tissueSpecif/sample.QC.RSCcutoff1.txt"
# file.in.qc = "b_QCmetric/QC_metrics.txt"
# file.in.testMeta="~/lhou.compbio/data/Mayo_aim2_5.2017/H3K27ac_3.27.2018/sample.metaData.until3.27.2018.txt"

#roadmap
file.in.ref.tag=paste0("/broad/compbio/anshul/projects/roadmap/alignments/consolidated_nosubsampling/E000-", HM, ".tagAlign.gz")
file.in.ref.pk=paste0("/broad/compbio/anshul/projects/roadmap/peaks/consolidated/narrowPeak/E000-", HM, ".narrowPeak.gz")
file.in.ref.fragLen = "/broad/compbio/anshul/projects/roadmap/metadata/strandcorr/all.encode.roadmap.fraglen"

#blueprint
dir.blueprint = "/broad/compbio/data/ihec/Venous_blood_2018/Venous_blood/"
blueprint.indNames = c("S00D9Y", "S00DKC", "S00DP2", "S00E9U", "S00F9Q", "S00J9A", "S00P04", "S00R1V", "S0100M", "S01342")
blueprint.nm2celltypes = c("mature_neutrophil", "CD4-positive_alpha-beta_T_cell", "CD14-positive_CD16-negative_classical_monocyte")
names(blueprint.nm2celltypes) = c("BP-Neutrophil", "BP-CD4NaiveT", "BP-monocyte")
files.in.blueprint = list()
for (ind in blueprint.indNames)
{
  for(nm in names(blueprint.nm2celltypes))
  {
    ct = blueprint.nm2celltypes[nm]
    
    f.bw = system(paste("find ", dir.blueprint, ind, "/", ct, "/*/*/*", HM, "*.bw", sep=""), intern=T)
    f.pk = system(paste("find ", dir.blueprint, ind, "/", ct, "/*/*/*", HM, "*.bed.gz", sep=""), intern=T)
    
    if(length(f.bw)==1 & length(f.pk)==1)
    {
      files.in.blueprint[[paste(nm, ".", ind, sep="")]]=c(bw=f.bw, pk=f.pk)
    }else
    {
      print(paste("NO", ind, nm))
    }
  }
  
}


dir.tmp = "~/hptmp/MayoBipolar/peakMerge/"
if(! dir.exists(dir.tmp))
{
  dir.create(dir.tmp)
}
file.tmp.allPeak = paste(dir.tmp, HM, ".hg19.nodup.filt.narrowPeak.all.bed", sep="")
file.tmp.mergedPeak = paste(dir.tmp, HM, ".hg19.nodup.filt.narrowPeak.merged.bed", sep="")
file.tmp.filt.mergedPeak = paste(dir.tmp, HM, ".hg19.nodup.filt.narrowPeak.merged.filt", N.CUTOFF, "Samps.bed", sep="")
file.tmp.pkInRef = paste(dir.tmp, HM, ".merged.peaks.inRef.bed", sep="")


dir.ou="./a_mergePeak2readsCounts_V4.2_filteredMergedPeaks/"
if(! dir.exists(dir.ou))
{
  dir.create(dir.ou)
}
file.ou.filt.mergedPeak = paste(dir.ou, HM, ".hg19.narrowPeak.RSC", SUPER.RSC.CUTOFF, ".Tot", SUPER.TOTALREADs.CUTOFF, ".logQ", PEAK.LogQ.CUTOFF, ".merged.filt", N.CUTOFF, "GoodSamps.bed", sep="")
file.ou.RData=paste(dir.ou, HM, ".hg19.narrowPeak.RSC", SUPER.RSC.CUTOFF, ".Tot", SUPER.TOTALREADs.CUTOFF, ".logQ", PEAK.LogQ.CUTOFF, ".merged.filt", N.CUTOFF, "GoodSamps.heightsMean.RData", sep="")
#file.ou.fractMatrix = paste(dir.ou, "hg19.nodup.filt.narrowPeak.merged.No0Fraction", sep="")

file.ou.peakMerged.pdf = paste(dir.ou, HM, ".diagnose.peakMerge.pdf", sep="")
file.ou.ovlp.pdf = paste(dir.ou, HM, ".diagnose.ovlpInRef.pdf", sep="")
file.ou.pdf =   paste(dir.ou, HM, ".hg19.narrowPeak.merged.filt", N.CUTOFF, "GoodSamps.heigthsMean.pdf", sep="")
file.ou.readPerct.pdf =   paste(dir.ou, HM, ".hg19.narrowPeak.merged.filt", N.CUTOFF, "GoodSamps.readsPercent.pdf", sep="")

#
print("input meta info") ########################################################################
# qc.info=c() #sample names, peak file path, bw file path, whether good or not based RSC and comparison to ROADMAP, 
# 
# for(d.i in dirs.in)
# {
#   isBad = as.matrix(read.table(paste(d.i, file.in.badSamp, sep=""), sep="\t", header=T, row.names=1))[,"isBad.byMismatch"]
#   fragLen = as.matrix(read.table(paste(d.i, file.in.qc, sep=""), sep="\t", header=T, row.names=1))[,"FragLen"]
#   for(samp in names(isBad))
#   {
#     f.pk = system(paste("find ", d.i, "AQUAS/", samp, "/", file.in.peak.suf, sep=""), intern = T)
#     f.tag = system(paste("find ", d.i, "AQUAS/", samp, "/", file.in.tagAlign.suf, sep=""), intern = T)
#     qc.info=rbind(qc.info, c(samp, isBad[samp]==0, fragLen[samp], f.pk, f.tag))
#   }
#   
# }
# rownames(qc.info)=qc.info[,1]
# qc.info=qc.info[,2:ncol(qc.info)]
# colnames(qc.info) = c("GoodQuality", "fragLen", "Peak", "Tag")


qc.info = read.table(file.in.meta, sep="\t", row.names=1, header = T)


#
#samp2meta = read.table(file.in.testMeta, sep="\t", header=T, row.names=1)#[colnames(samp2Heights.withRef$testWithRef.GCnorm.quant)[1:60],]
# dis2samps = lapply(names(TYPE2COL),
# FUN=function(dis)
# {
#   rownames(samp2meta)[samp2meta$phenotype==dis]
# })
# names(dis2samps)=names(TYPE2COL)
#samps.nonAD = unlist(dis2samps[c("IBDRA_Control", "CD", "UC", "RA", "AD_control")])

#########################################################################
print("merged peak") 
if(file.exists(file.tmp.allPeak))
{
  file.remove(file.tmp.allPeak)
}

for(samp.id in rownames(qc.info)[qc.info$totalReads>=SUPER.TOTALREADs.CUTOFF & qc.info$RSC>=SUPER.RSC.CUTOFF & qc.info$isBad.byMismatch==0])
{
  f.i.pk = paste0(dir.in, samp.id, "/", file.in.peak.suf)
  print(f.i.pk)
  #cmd = paste("zcat ", f.i.pk, " >>", file.tmp.allPeak)
  cmd = paste("zcat ", f.i.pk, "|awk '($9>=", PEAK.LogQ.CUTOFF, "){print $1 \"\\t\" $2 \"\\t\" $3 \"\\t\" $4 \"_\" $8}' >>", file.tmp.allPeak, sep="")
  system(cmd)
}

cmd = paste("sort -k1,1 -k2,2n ", file.tmp.allPeak, " | mergeBed -i stdin -c 4 -o collapse -delim \",\" >", file.tmp.mergedPeak, sep="")
system(cmd)


#diagnosis
pk2len = as.numeric(system(paste("awk '{print $3-$2}'", file.tmp.mergedPeak), intern = T)) #- as.numeric(system(paste("awk '{print $2}'", file.tmp.mergedPeak), intern = T))
buf=system(paste("awk '{print $4}'", file.tmp.mergedPeak), intern = T)
pk2pvals = lapply(buf,
FUN=function(pks)
{
  sapply(unlist(strsplit(pks, split=",")),
  FUN=function(pk)
  {
    as.numeric(unlist(strsplit(pk, split="_"))[3])
  })
})
pk2pvals.size = sapply(pk2pvals,length)
size.seq = sort(as.numeric(levels(factor(pk2pvals.size))))
size2pval = lapply(size.seq,
FUN=function(size)
{
  unlist(pk2pvals[pk2pvals.size==size])
})
names(size2pval) = as.character(size.seq)

size2SecondTopPval = lapply(size.seq,
FUN=function(size)
{
  if(size==1)
  {
    return(unlist(pk2pvals[pk2pvals.size==1]))
  }
  sapply(pk2pvals[pk2pvals.size==size],
  FUN=function(qs)
  {
    
    sort(qs, decreasing = T)[2]
  })
  
})
pdf(file.ou.peakMerged.pdf, height=9, width=12)
layout(1:4)
hist(pk2pvals.size, xlab="number of peaks overlapped with merged peaks", breaks=100)
boxplot(size2pval, xlab="number of peaks overlapped with merged peaks", ylab="pvalues", outline=F)
boxplot(size2SecondTopPval[1:500], xlab="number of peaks overlapped with merged peaks", ylab="2nd largest pvalues", outline=F)
boxplot(pk2len~pk2pvals.size, xlab="number of peaks overlapped with merged peaks", ylab="peak length", outline=F)
layout(matrix(1:9, ncol=3))
for(i in  1:sum(qc.info$totalReads>=SUPER.TOTALREADs.CUTOFF & qc.info$RSC>=SUPER.RSC.CUTOFF & qc.info$isBad.byMismatch==0))
{
  print(i)
  hist(size2pval[[i]], main=paste("size ", i, sep=""), xlab="qvalue", breaks=200)
}
dev.off()

 
cmd = paste("awk '{if (gsub(/,/,\"\",$4)>=", N.CUTOFF-1,") print}' ", file.tmp.mergedPeak," > ", file.tmp.filt.mergedPeak, sep="")
system(cmd)

cmd= paste("awk '{print $1 \"\\t\" $2 \"\\t\" $3 \"\\t\" $1 \":\" $2 \"-\" $3}' ", file.tmp.filt.mergedPeak, ">", file.ou.filt.mergedPeak, sep="" )
system(cmd)

# cmd =paste("awk '{print $1 \":\" $2 \"-\" $3}' ", file.ou.filt.mergedPeak, sep="")
# merged.peaks = system(cmd, intern = T)
merged.peaks.df = read.table(file.ou.filt.mergedPeak, header=F, sep="\t", row.names = 4)
colnames(merged.peaks.df) = c("chr", "start", "end")
merged.peaks = rownames(merged.peaks.df)
merged.peaks.len = merged.peaks.df[,3]-merged.peaks.df[,2]
names(merged.peaks.len) = merged.peaks
# 
save(merged.peaks, merged.peaks.len, qc.info,file=file.ou.RData)

##################################################
print("get reads overlapping with filtered merged peaks for both testing samples and reference samples")
####################################################

print("mean counts in each peak for Mayo sample")
#ref region  names
 

for(samp in rownames(qc.info)[qc.info$totalReads>=GOOD.TOTALREADs.CUTOFF & qc.info$RSC>=GOOD.RSC.CUTOFF & qc.info$isBad.byMismatch==0])
{
  f.i.tag=paste(dir.in, samp, "/", file.in.tagAlign.suf, sep="")
  frag.len = qc.info[samp,"FragLen"]
  f.o.sh = paste("a.", samp, ".ovlp.merged.sh", sep="")
  f.tmp.merged = paste(dir.tmp, samp, ".", HM, ".ovlp.merged.bed", sep="")
  
  # if(unlist(strsplit(system(paste("wc -l ", f.tmp.merged, sep=""), intern = T), split=" "))[1] != 0)
  # {
  #   next
  # }
  write(sh.head, f.o.sh)
  
  cmd= paste("zcat ", f.i.tag, "|awk '{if($6==\"+\") {print $1 \"\\t\" $2 \"\\t\" $2+", frag.len, "} if($6==\"-\") {start =1>$3-", frag.len, "?1:$3-", frag.len, ";print $1 \"\\t\" start \"\\t\" $3 }}'|sortBed -i stdin| intersectBed -b stdin -a ", file.ou.filt.mergedPeak, " -wo -sorted|mergeBed -i stdin -c 4,8 -o distinct,sum >", f.tmp.merged, sep="")
  write(cmd, file=f.o.sh, append = T)
  
  write("echo done",file=f.o.sh, append = T)
  system(paste("qsub ", f.o.sh, sep=""))
}

#need to wait for the above codes running

samp2HeightsMean=list()

samp2HeightsMean$Mayo = sapply(rownames(qc.info)[qc.info$totalReads>=GOOD.TOTALREADs.CUTOFF & qc.info$RSC>=GOOD.RSC.CUTOFF & qc.info$isBad.byMismatch==0],
FUN=function(samp)
{
  print(samp)
  f.tmp.merged = paste(dir.tmp, samp, ".", HM, ".ovlp.merged.bed", sep="")

  heights  =as.matrix(read.table(f.tmp.merged, sep="\t", header=F, row.names=4, colClasses = "character"))[,4]
  pk2heightMean = rep(0, length(merged.peaks))
  names(pk2heightMean) = merged.peaks
  pk2heightMean[names(heights)] = as.numeric(heights)/merged.peaks.len[names(heights)]
  
  return(pk2heightMean)
})
colnames(samp2HeightsMean$Mayo) = rownames(qc.info)[qc.info$totalReads>=GOOD.TOTALREADs.CUTOFF & qc.info$RSC>=GOOD.RSC.CUTOFF & qc.info$isBad.byMismatch==0]

save(samp2HeightsMean, merged.peaks, merged.peaks.len, qc.info,file=file.ou.RData)

#rownames(samp2Heights$Mayo) = merged.peaks.inRef
#rownames(samp2Heigths) =merge.peaks

#################################################
print("get counts for ROADMAP samples")

file2fragLen = as.matrix(read.table(file.in.ref.fragLen, sep="\t", row.names=1, header=F))

for(samp in names(REF2CELLTYPE))
{
  f.i.tag= sub("E000", samp, file.in.ref.tag)
  frag.len = file2fragLen[basename(f.i.tag),1]
  f.o.sh = paste("a.", samp, ".ovlp.merged.sh", sep="")
  f.tmp.merged = paste(dir.tmp, samp, ".", HM, ".ovlp.merged.bed", sep="")
  
  # if(unlist(strsplit(system(paste("wc -l ", f.tmp.merged, sep=""), intern = T), split=" "))[1] != 0)
  # {
  #   next
  # }
  
  write(sh.head, f.o.sh)
  
  cmd= paste("zcat ", f.i.tag, "|awk '{if($6==\"+\") {print $1 \"\\t\" $2 \"\\t\" $2+", frag.len, "} if($6==\"-\") {start =1>$3-", frag.len, "?1:$3-", frag.len, ";print $1 \"\\t\" start \"\\t\" $3 }}'|sortBed -i stdin| intersectBed -b stdin -a ", file.ou.filt.mergedPeak, " -wo -sorted|mergeBed -i stdin -c 4,8 -o distinct,sum >", f.tmp.merged, sep="")
  write(cmd, file=f.o.sh, append = T)
  
  write("echo done",file=f.o.sh, append = T)
  system(paste("qsub ", f.o.sh, sep=""))
}

#need to wait for the above codes running
samp2HeightsMean$ROADMAP = sapply(names(REF2CELLTYPE),
FUN=function(samp)
{
  print(samp)
  f.tmp.merged =  paste(dir.tmp, samp, ".", HM, ".ovlp.merged.bed", sep="")
  
  heights  =as.matrix(read.table(f.tmp.merged, sep="\t", header=F, row.names=4, colClasses = "character"))[,4]
  pk2heightMean = rep(0, length(merged.peaks))
  names(pk2heightMean) = merged.peaks
  pk2heightMean[names(heights)] = as.numeric(heights)/merged.peaks.len[names(heights)]
  
  return(pk2heightMean)
  #return(counts)
})
colnames(samp2HeightsMean$ROADMAP) = names(REF2CELLTYPE)
#rownames(samp2Heights$ROADMAP) = merged.peaks.inRef


#####################################################
print("get counts for blueprint samples")

samp2HeightsMean$blueprint = sapply(names(files.in.blueprint),
FUN=function(samp)
{
  print(samp)
  f.i.bw = files.in.blueprint[[samp]]["bw"]
  f.tmp.heights = paste(dir.tmp, samp, ".bw2height.txt", sep="")
  
  cmd = paste("bigWigAverageOverBed ", f.i.bw, " ", file.ou.filt.mergedPeak, " ", f.tmp.heights, sep="")
  system(cmd)
  
  heights  =as.matrix(read.table(f.tmp.heights, sep="\t", header=F, row.names=1))[,4]
  return(heights)
})
colnames(samp2HeightsMean$blueprint) =  names(files.in.blueprint)
# rownames(samp2Heights$blueprint) =  merged.peaks.inRef


save(samp2HeightsMean, qc.info, merged.peaks, merged.peaks.df, merged.peaks.len, file=file.ou.RData)
#save(samp2HeightsMean, qc.info, ref2pkOvlpWithTest, pk2refOvlpTest.Weightsum, merged.peaks.inRef, merged.peaks, merged.peaks.df, merged.peaks.len, file=file.ou.RData)

##################################################
print("get peaks overlap with reference peaks")


pdf(file.ou.ovlp.pdf, height=6, width=12)
layout(matrix(1:12, ncol=4))
ref2pkOvlpWithTest = sapply(c(names(REF2CELLTYPE), names(files.in.blueprint)),
FUN=function(ref)
{
  f.i.pk = ""
  if(grepl("^E",ref))
  {
    f.i.pk=sub("E000", ref, file.in.ref.pk)
  }else if(grepl("^BP",ref))
  {
    f.i.pk = files.in.blueprint[[ref]]["pk"]  
  }
  print(f.i.pk)
  
  cmd = paste("intersectBed -b ", file.ou.filt.mergedPeak, " -a ", f.i.pk, " -wo|awk '{print $3-$2 \"\t\" $NF \"\t\" $(NF-1)  }'", sep="")
  pksOvlp.info=t(sapply(system(cmd, intern = T), FUN=function(x){unlist(strsplit(x, split="\t"))}))
  smoothScatter(as.numeric(pksOvlp.info[,1]), as.numeric(pksOvlp.info[,2])/as.numeric(pksOvlp.info[,1]), 
                xlab="peak length in ref", ylab="fraction of each ref peak\noverlapping by test",
                nbin = 500,
                main=ref) 
  
  #cmd = paste("intersectBed -a ", file.ou.filt.mergedPeak, " -b ", f.i.pk, " -u|awk '{print $1 \":\" $2 \"-\" $3}'  ", sep="")
  pks.isOvlp=rep(0, length(merged.peaks))
  names(pks.isOvlp) = merged.peaks
  
  pks.isOvlp[levels(factor(pksOvlp.info[,3]))]=1
  
  
  return(pks.isOvlp)
})
colnames(ref2pkOvlpWithTest) = c(names(REF2CELLTYPE), names(files.in.blueprint))
dev.off()

weights = rep(1, ncol(ref2pkOvlpWithTest))
names(weights) = colnames(ref2pkOvlpWithTest)
for(ct in names(blueprint.nm2celltypes))
{
  weights[grepl(paste("^", ct, sep=""), colnames(ref2pkOvlpWithTest))] = 1/sum(grepl(paste("^", ct, sep=""), colnames(ref2pkOvlpWithTest))) #from the same cell type, share the same weights  
}
weights["E040"] <- weights["E037"] <- 0.5
weights["E038"] <- weights["E039"] <- 0.5
pk2refOvlpTest.Weightsum = apply(ref2pkOvlpWithTest,1,
FUN=function(x)
{
  sum(weights[x==1])
})
merged.peaks.inRef = merged.peaks[pk2refOvlpTest.Weightsum>=0.5]

merged.peaks.inRef.bed = gsub(":", "\t", merged.peaks.inRef)
merged.peaks.inRef.bed = gsub("-", "\t", merged.peaks.inRef.bed)
write(paste(merged.peaks.inRef.bed, merged.peaks.inRef, sep="\t"), file.tmp.pkInRef)


save(samp2HeightsMean, qc.info, ref2pkOvlpWithTest, pk2refOvlpTest.Weightsum, merged.peaks.inRef, merged.peaks, merged.peaks.df, merged.peaks.len, file=file.ou.RData)


#proporiton of reads
colnames(ref2pkOvlpWithTest) = gsub("BP-", "BP",colnames(ref2pkOvlpWithTest) )
ref2pkOvlpWithTest.refSlct = ref2pkOvlpWithTest[,grepl(paste(REFs.SELECT, collapse="|"), colnames(ref2pkOvlpWithTest))]

ref2pkOvlpWithTest.refSlct.combBP = sapply(REFs.SELECT,
FUN=function(ref)
{
  if(ref %in% colnames(ref2pkOvlpWithTest.refSlct))
  {
    return(ref2pkOvlpWithTest.refSlct[,ref])
  }else
  {
    samps = colnames(ref2pkOvlpWithTest.refSlct)
    samps = samps[grepl(ref, samps)]
    apply(ref2pkOvlpWithTest.refSlct[,samps], 1, mean)
  }
  
})


peaks = list()
peaks$merged.2goodSample = merged.peaks
peaks$merged.2GS.inRef = merged.peaks.inRef
peaks$merged.2GS.inRef.slcted = rownames(ref2pkOvlpWithTest.refSlct.combBP)[apply(ref2pkOvlpWithTest.refSlct.combBP,1,max)>=0.5]

reads.total = lapply(samp2HeightsMean,
FUN=function(samps.heightMean)
{
  res= sapply(peaks,
  FUN=function(pks)
  {
    rbind(merged.peaks.len[pks]) %*% samps.heightMean[pks, ]
  })
  
})

reads.prop = lapply(reads.total,
FUN=function(rds.tot)
{
  rds.tot/rds.tot[,1]
})

#
library(ggplot2)
df = data.frame(reads.prop = c(reads.prop$Mayo[, "merged.2GS.inRef"], reads.prop$Mayo[, "merged.2GS.inRef.slcted"],
                                     reads.prop$ROADMAP[, "merged.2GS.inRef"], reads.prop$ROADMAP[, "merged.2GS.inRef.slcted"],
                                     reads.prop$blueprint[, "merged.2GS.inRef"], reads.prop$blueprint[, "merged.2GS.inRef.slcted"]),
                datasets =c(rep("Mayo", 2*nrow(reads.prop$Mayo)), 
                            rep("ROADMAP", 2*nrow(reads.prop$ROADMAP)),
                            rep("Blueprint", 2*nrow(reads.prop$blueprint))),
                type = c(rep("reads from peaks overlapping those from any reference samples", nrow(reads.prop$Mayo)), 
                         rep("reads from peaks overlapping those from selected reference samples", nrow(reads.prop$Mayo)),
                         rep("reads from peaks overlapping those from any reference samples", nrow(reads.prop$ROADMAP)), 
                         rep("reads from peaks overlapping those from selected reference samples", nrow(reads.prop$ROADMAP)),
                         rep("reads from peaks overlapping those from any reference samples", nrow(reads.prop$blueprint)), 
                         rep("reads from peaks overlapping those from selected reference samples", nrow(reads.prop$blueprint))
                         ),
                stringsAsFactors=F)
df$datasets=factor(df$datasets, levels=c("Mayo", "ROADMAP", "Blueprint"))
df$type=factor(df$type, levels=c("reads from peaks overlapping those from any reference samples", 
                                 "reads from peaks overlapping those from selected reference samples"))


pdf(file.ou.readPerct.pdf, width =12, height=5)

p <-ggplot(df, aes(x=datasets, y=reads.prop, fill=type)) +
  geom_violin() + 
  ggtitle("compared to reads in all filtered merged peaks") +
  xlab("datasets") + 
  ylab("reads proportion")

print(p)
# # Change the position
# p<-ggplot(ToothGrowth, aes(x=dose, y=len, fill=supp)) +
#   geom_violin(position=position_dodge(1))
# p

dev.off()


save(samp2HeightsMean, qc.info, ref2pkOvlpWithTest, pk2refOvlpTest.Weightsum, peaks, merged.peaks.df, merged.peaks.len, reads.total, reads.prop, file=file.ou.RData)