#get saturation curve for peaks #to compare peaks from newly detected and the rest #compare the FDR too args = commandArgs(trailingOnly=TRUE) TISS = args[1] #Brain Heart Muscle Lung # TISS2RoadmapLabel=c(Brain ="BRN", # Heart ="HRT", # Lung="LNG", # Muscle="MUS") RSC.CUTOFF = 0.8 READSTOTAL.CUTOFF = 1E7 SAMP.N=10 GRP2COLOR =c(newlyDetected="red", rest="lightblue") library(ggplot2) #file.in.peaks = paste0("a_2_peakHeightsNormalized_hg19.narrowPeak.RSC0.8.RdsTot1e7.logQ2_V1.3_CSFilt/", TISS, ".peaks.matchedCS.bed") file.in.metaAndQC = "~/lhou.compbio/data/eGTEX-H3K27ac/Batch_summary_6.16.2018/sample.metaAndQC.RSC0.6.totalReads5e+06.RSC0.8OrMatchedForLung.txt" #file.in.merge.res = paste("a_mergePeak2HeightsMean_V1.1/hg19.narrowPeak.RSC0.8.RdsTot1e+07.logQ2.merged.GoodSamps.", TISS, ".bed") file.in.mergePk2Ind.RDS = paste0("a2_mergePeak2IndPeak_V1.1_allSamps/hg19.narrowPeak.RSC0.8.RdsTot10000000.logQ2.merged2IndPeak.matrix.", TISS, ".RDS") file.in.mod.AREs.RData="../variationAcrossTissueandIndiv/a_4_modules_annotations_mergedPk4Tiss_Epimap_V1.2.3_CSandeGTExActivity/mARE2ModuleGrpName.RData" file.in.tiss2ARE.RData="./c_mergePeaksV1.3FromTissues_hg19/4Tiss.mergedPeak2tissuePeak.hg19.RData" dir.ou = "c_peaks_saturationCurve_V1.1_Grp14vsRest/" dir.create(dir.ou, showWarnings = F, recursive = T) file.ou.RData = paste0(dir.ou, TISS, ".peaks.ind.saturation.RData") file.ou.pdf = paste0(dir.ou, TISS, ".peaks.ind.saturation.pdf") # meta = read.table(file.in.metaAndQC, sep="\t", header=T, row.names=1) meta.tiss = meta[meta[,"tissue"]==TISS,] samps.tiss.good = rownames(meta.tiss)[meta.tiss$totalReads>=READSTOTAL.CUTOFF & meta.tiss$RSC>=RSC.CUTOFF] # # # buf= read.table(file.in.peaks, sep="\t", row.names = NULL, header = F) # peaks.filt=paste0(buf[,1], ":", buf[,2], "-", buf[,3]) load(file.in.mod.AREs.RData) load(file.in.tiss2ARE.RData) mergePeak2IndPeak = readRDS(file.in.mergePk2Ind.RDS) rownames(mergePeak2IndPeak) = sub("\t", ":", rownames(mergePeak2IndPeak)) rownames(mergePeak2IndPeak) = sub("\t", "-", rownames(mergePeak2IndPeak)) mergePeak2PresCounts = apply(mergePeak2IndPeak!=0, 1, sum) samp.ref= colnames(mergePeak2IndPeak)[grepl("^E", colnames(mergePeak2IndPeak))] mAREs.byGrp = list() mAREs.byGrp$newlyDetected = names(mARE2ModGrpName)[mARE2ModGrpName=="Newly detected"] mAREs.byGrp$rest = setdiff(names(mARE2ModGrpName), mAREs.byGrp$newlyDetected) tiss.pks.byGrp = lapply(mAREs.byGrp, FUN=function(mAREs) { unlist(tiss.mergedPeak2peaks[[TISS]][gsub(":|-", "\t", mAREs)]) }) inds.sampled=lapply(1:(length(samps.tiss.good)-1), FUN=function(N) { rbind(sapply(1:SAMP.N, FUN=function(i)sample(samps.tiss.good, N, replace = F))) }) peaks.detectAndRepProp.list = lapply(names(tiss.pks.byGrp), FUN=function(nm) { peaks=tiss.pks.byGrp[[nm]] res=lapply(inds.sampled, FUN=function(inds.i) { t(apply(inds.i, 2, FUN=function(samps.dis) { print(samps.dis) samps.dis=c(samps.dis, samp.ref) pks.dis= peaks[apply(mergePeak2IndPeak[peaks, samps.dis]>0, 1, sum)>=2] prop.dis= length(pks.dis)/length(peaks) samps.rep= setdiff(colnames(mergePeak2IndPeak), samps.dis) pks.rep= peaks[apply(mergePeak2IndPeak[peaks, samps.rep]>0, 1, sum)>=1] prop.rep= length(intersect(pks.dis, pks.rep))/length(pks.dis) return(c(pks.dis=length(pks.dis), prop.detected=prop.dis, pks.rep=length(pks.rep), prop.replicated=prop.rep, indiv.number=length(samps.dis)-1)) })) }) data.frame(do.call(rbind, res), group=nm, stringsAsFactors = F) }) peaks.detectAndRepProp.df=do.call(rbind, peaks.detectAndRepProp.list) # peaks.detectAndRepProp.df = data.frame() # for(nm in names(peaks.detectAndRepProp.list)) # { # df = data.frame(peaks.detectAndRepProp[[nm]], # group = nm) # #indiv.number = rep(1:ncol(peaks.detectAndRepProp[[nm]]), each =nrow(peaks.detectAndRepProp[[nm]]))) # peaks.detectAndRepProp.df = rbind(peaks.detectAndRepProp.df, df) # } save(mergePeak2IndPeak, #peaks.filt, tiss.pks.byGrp, inds.sampled, peaks.detectAndRepProp.df, #peaks.detectAndRepProp.df, file=file.ou.RData) pdf(file.ou.pdf, width=12, height=5) # peak.indivCounts.df = data.frame(indiv.counts=mergePeak2PresCounts, # group="background", # stringsAsFactors = F) # rownames(peak.indivCounts.df) = names(mergePeak2PresCounts) # # for(grp in names(tiss.pks.byGrp)) # # { # # peak.indivCounts.df[tiss.pks.byGrp[[grp]], "group"] = grp # # } # for(grp in names(tiss.pks.byGrp)) # { # p = ggplot(peak.indivCounts.df[tiss.pks.byGrp[[grp]],], # aes(x=indiv.counts)) + # geom_histogram(binwidth=.5, fill=GRP2COLOR[grp]) + # ggtitle(paste(TISS, grp)) + # #scale_fill_manual(values=c(novel = "red", rest = "cyan", background ="grey")) + # theme_bw() # print(p) # } p = ggplot(peaks.detectAndRepProp.df, aes(x=factor(indiv.number), y=prop.detected)) + geom_boxplot(aes(fill=group), lwd=.2, outlier.shape = NA) + scale_fill_manual(values=GRP2COLOR) + ggtitle(TISS) + theme_bw() print(p) p = ggplot(peaks.detectAndRepProp.df, aes(x=factor(indiv.number), y=prop.replicated)) + geom_boxplot(aes(fill=group), lwd=.2, outlier.shape = NA) + scale_fill_manual(values=GRP2COLOR) + ggtitle(TISS) + theme_bw() print(p) dev.off()