#V1.2 #we may need consider the length of the peak #for permutation, only interested in peaks as "nonMatchedTissueOnly" #but stil use all the peaks in that group for permutation #those longer peaks will be with higher reproducible rate thus more confident #V1.1 #use peak present in each individual at merged peak location instead of number of individual peaks #use permutation to test how likely x merged peaks overlapping with two or more peaks each from different individuals by chance #compare peaks found to all chromatin states from Roadmap #to answer #a) do most peak comeing from brain promoter or enhancers #b) how many are not detected as promoters or enhancers in brain regions #c) how many are not detected as any promoter or enhancer regions in any tissue at all #d) how is that related to overlap times with peaks from each individuals args = commandArgs(trailingOnly=TRUE) TISS = args[1] #Brain Heart Muscle Lung options(scipen = 999) OVLP.MIN= 50 #BIN.MAX = 100 PERM.NUM=50 TISS2RoadmapLabel=c(Brain ="BRN", Heart ="HRT", Lung="LNG", Muscle="MUS") PEAKLEN.BINs = c(0, 200, 400, 600, 800, 1000, 1500, 2000) CSGRP2COLOR = c("red", "orange", "yellow", "grey") #names(CSGRP2COLOR) = #file.in.peak = paste0("./a_mergePeak2HeightsMean_V1.1/hg19.narrowPeak.RSC0.8.RdsTot1e+07.logQ2.merged.GoodSamps.", TISS, ".bed") #file.in.peak = paste0("a_mergePeak2HeightsMean_V1.1/hg19.narrowPeak.RSC0.8.RdsTot1e+07.logQ2.merged.GoodSamps.", TISS, ".bed") file.in.peak = paste0("a_mergePeak2HeightsMean_V1.2_addRefPeak/hg19.narrowPeak.RSC0.8.RdsTot1e+07.logQ2.merged.GoodSamps.", TISS, ".bed") file.in.mergePk2Ind.RDS = paste0("a2_mergePeak2IndPeak/hg19.narrowPeak.RSC0.8.RdsTot10000000.logQ2.merged2IndPeak.matrix.", TISS, ".RDS") file.in.Roadmap.meta = "~/lhou.compbio/data/Roadmap/ROADMAP_sampleMeta_colorCode.txt" files.in.Roadmap.CS = dir("~/lhou.compbio/data/Roadmap/18state_6marks_98epigen/", "_18_core_K27ac_mnemonics.bed.gz", full.names = T) dir.tmp= "~/hptmp/eGTEx-H3K27ac/cmpPeak2RoadmapCS/" if(!dir.exists(dir.tmp)) { dir.create(dir.tmp) } dir.ou="b_cmpPeak2RoadmapChromStates_V1.2_pkLen/" if(!dir.exists(dir.ou)) { dir.create(dir.ou) } file.ou.RData = paste0(dir.ou, TISS, ".ovlp.Roadmap18CS.Ovlp", OVLP.MIN, "bp.RData") file.ou.pdf = paste0(dir.ou, TISS, ".ovlp.Roadmap18CS.Ovlp", OVLP.MIN, "bp.CSTypeProp.pdf") # Roadmap.meta = read.table(file.in.Roadmap.meta, head=F, skip=1, sep="\t", row.names=1, comment.char = "", stringsAsFactors = F) RoadmapSamps.tiss = rownames(Roadmap.meta)[grepl(TISS2RoadmapLabel[TISS], Roadmap.meta[, 4])] # mergePeak2IndPeak = readRDS(file.in.mergePk2Ind.RDS) peaks.all.bed = rownames(mergePeak2IndPeak) peaks.all2len = sapply(peaks.all.bed, FUN=function(pk) { buf = as.numeric(unlist(strsplit(pk, split="\t", fixed=T))[2:3]) return(buf[2]-buf[1]) }) names(peaks.all2len) = peaks.all.bed # peak2mergIndNumber = apply(mergePeak2IndPeak, 1, FUN=function(x) { sum(x!=0) }) #names(peak2mergIndNumber) = peaks.all.bed # for(f.i.cs in files.in.Roadmap.CS) { print(f.i.cs) f.tmp.ovlp = paste0(dir.tmp, TISS, ".ovlp.", gsub(".gz", "", basename(f.i.cs))) # if(!file.exists(f.tmp.ovlp)) # { cmd =paste0("bedtools intersect -a ", file.in.peak, " -b ", f.i.cs, " -wo >", f.tmp.ovlp) system(cmd) # } } # files.tmp.ovlp = dir(dir.tmp, paste0(TISS, ".ovlp.*_18_core_K27ac_mnemonics.bed"), full.names = T) names(files.tmp.ovlp) = sapply(files.tmp.ovlp, FUN=function(f.tmp) { ref = gsub(".*(E\\d+)_.*","\\1", basename(f.tmp), perl =T) } ) peak2RefCS = sapply(files.tmp.ovlp, FUN=function(f.tmp.ovlp) { print(f.tmp.ovlp) pk2cs.all = rep(NA, length(peaks.all.bed)) names(pk2cs.all) = peaks.all.bed cmd =paste0("awk '($9>=", OVLP.MIN, "){print $1 \"\\t\" $2 \"\\t\"$3 \",\" $8 }' ", f.tmp.ovlp) buf = sapply(system(cmd, intern = T), FUN=function(x){unlist(strsplit(x, split=","))}) pk2cs = sapply(split(buf[2,], buf[1,]), paste, collapse=",") pk2cs.all[names(pk2cs)] = pk2cs return(pk2cs.all) }) #colnames(peak2RefCS) = save(peaks.all.bed, peak2RefCS, file=file.ou.RData) # RoadmapSamps.tiss.ovlp = intersect(names(files.tmp.ovlp), RoadmapSamps.tiss) RoadmapSamps.tiss.other = setdiff(names(files.tmp.ovlp),RoadmapSamps.tiss) peak2RefCSType = apply(peak2RefCS, 1, FUN=function(cs) { #print(cs) csIsTssOrEnh = grepl("Tss|Enh", cs) names(csIsTssOrEnh) = names(cs) if(any(csIsTssOrEnh[RoadmapSamps.tiss.ovlp], na.rm = T)) { if(any(csIsTssOrEnh[RoadmapSamps.tiss.other], na.rm = T)) { return("Tss|Enh.MatchedAndNotTissue") } else { return("Tss|Enh.MatchedTissueOnly") } }else { if(any(csIsTssOrEnh, na.rm = T)) { return("Tss|Enh.nonMatchedTissueOnly") }else { return("Tss|Enh.noTissue") } } }) # peak2RefCSTypeAndmergePeakNum= data.frame(chromatinStates = peak2RefCSType, # mergePeakNum = peak2mergIndNumber) RefCSType2peakProp.bin = sapply(1:ncol(mergePeak2IndPeak), FUN=function(i) { pks = names(peak2RefCSType)[peak2mergIndNumber==i] CSType2Counts = summary(factor(peak2RefCSType[pks], levels=c("Tss|Enh.MatchedTissueOnly", "Tss|Enh.MatchedAndNotTissue", "Tss|Enh.nonMatchedTissueOnly", "Tss|Enh.noTissue"))) CSType2prop = CSType2Counts/sum(CSType2Counts) }) colnames(RefCSType2peakProp.bin) = 1:ncol(mergePeak2IndPeak) peakNum.bin = sapply(1:ncol(mergePeak2IndPeak), FUN=function(i) { pks = names(peak2RefCSType)[peak2mergIndNumber==i] length(pks) }) save(peaks.all.bed, RoadmapSamps.tiss.ovlp, peak2RefCS, peak2RefCSType, RefCSType2peakProp.bin, peakNum.bin, file=file.ou.RData) #permutation based estimation of background signal percentage peakGrps.byCS=list() peakGrps.byCS$TorE.MatchedTiss = names(peak2RefCSType)[peak2RefCSType=="Tss|Enh.MatchedTissueOnly"] peakGrps.byCS$TorE.MatchedAndNotTiss = names(peak2RefCSType)[peak2RefCSType=="Tss|Enh.MatchedAndNotTissue"] peakGrps.byCS$TorE.nonMatchedTissOnly = names(peak2RefCSType)[peak2RefCSType=="Tss|Enh.nonMatchedTissueOnly"] #peakGrps.byCS$TorE.noMatchedTiss = c(names(peak2RefCSType)[peak2RefCSType=="Tss|Enh.noTissue"], #names(peak2RefCSType)[peak2RefCSType=="Tss|Enh.nonMatchedTissueOnly"]) peakGrps.byCS$TorE.noTiss = names(peak2RefCSType)[peak2RefCSType=="Tss|Enh.noTissue"] peakGrps.byCSandLen = lapply(peakGrps.byCS, FUN=function(pks) { pkGrp.byLen = lapply(1:length(PEAKLEN.BINs), FUN=function(i) { if(iPEAKLEN.BINs[i] & peaks.all2len[pks]<=PEAKLEN.BINs[i+1]]) }else { return(pks[peaks.all2len[pks]>=PEAKLEN.BINs[i]]) } }) names(pkGrp.byLen) = PEAKLEN.BINs return(pkGrp.byLen) }) peakGrp.byCS2IndFreq = sapply(peakGrps.byCS, FUN=function(pks) { apply(mergePeak2IndPeak[pks,], 2, FUN=function(x) { sum(x!=0)/length(x) } ) }) peakGrp.byCSandLen2IndFreq = lapply(peakGrps.byCSandLen, FUN=function(pks.grp) { res=sapply(pks.grp, FUN=function(pks.subGrp) { if(length(pks.subGrp)==0) { return(NA) } apply(mergePeak2IndPeak[pks.subGrp,], 2, FUN=function(x) { sum(x!=0)/length(x) }) }) names(res) = names(pks.grp) return(res) }) #perm grp.slctForPerm = "TorE.nonMatchedTissOnly" #only focus on nonMatchedTissueOnly type of peaks mP2IndPk.nonMatchedTissOnly = mergePeak2IndPeak[peakGrps.byCS[[grp.slctForPerm]], ] peak.nonMatchedTissOnly.2IndCounts = apply(mP2IndPk.nonMatchedTissOnly, 1, FUN=function(x) { sum(x!=0) }) peak.nonMatchedTissOnly.IndCountsDist=sapply(0:ncol(mergePeak2IndPeak), FUN=function(i) { sum(peak.nonMatchedTissOnly.2IndCounts>=i) }) #permutation according to cs+length grp peaks.nonMatchedTissOnly.IndCountsDist.perm = t(sapply(1:PERM.NUM, FUN=function(i) { print(i) # mP2IndPk.nonMatchedTissOnly.perm = apply(mP2IndPk.nonMatchedTissOnly, 2, # FUN=function(x) # { # unlist(lapply(peakGrps.byCSandLen[[grp.slctForPerm]], # FUN=function(pks.grp) # { # sample(x[pks.grp], length(pks.grp)) # })) # # }) mP2IndPk.nonMatchedTissOnly.perm = apply(mP2IndPk.nonMatchedTissOnly, 2, FUN=function(x) { sample(x, length(x)) }) pk2IndCounts= apply(mP2IndPk.nonMatchedTissOnly.perm, 1, FUN=function(x) { sum(x!=0) }) sapply(0:ncol(mergePeak2IndPeak), FUN=function(i) { sum(pk2IndCounts>=i) }) })) colnames(peaks.nonMatchedTissOnly.IndCountsDist.perm)=0:ncol(mergePeak2IndPeak) save(peaks.all.bed, RoadmapSamps.tiss.ovlp, peak2RefCS, peak2RefCSType, RefCSType2peakProp.bin, peak2mergIndNumber, peaks.all2len, peakNum.bin, peakGrps.byCS, peakGrps.byCSandLen, peakGrp.byCS2IndFreq, peakGrp.byCSandLen2IndFreq, peak.nonMatchedTissOnly.IndCountsDist, peaks.nonMatchedTissOnly.IndCountsDist.perm, file=file.ou.RData) ############################### pdf(file.ou.pdf, width =20, heigh=9) par(mar=c(6,6,6,18)) par(xpd = T) barplot(RefCSType2peakProp.bin, xlab="peaks detected in individuals #", ylab ="proportion of chromatin state type", col = CSGRP2COLOR, las=2, border = NA, space=0.1, cex.lab = 1.5) legend(#"topright", x=1, y=1.15, legend=rownames(RefCSType2peakProp.bin), #inset=c(-0.25,0), fill=CSGRP2COLOR) par(new=T) plot(1:ncol(mergePeak2IndPeak), peakNum.bin, xaxt="n", yaxt="n",xlab="",ylab="", type="b", bty="n", lwd=2, cex= 1.5) axis(4) mtext("peaks number",side=4, line=3, cex = 1.5)#,col="blue") ################### par(mar=c(6,6,6,6)) boxplot(peakGrp.byCS2IndFreq, col = CSGRP2COLOR, ylab="proportion of merged peaks\ndetected in individual") layout(matrix(1:4, ncol=2)) for(i in 1:length(peakGrp.byCSandLen2IndFreq)) { boxplot(peakGrp.byCSandLen2IndFreq[[i]], col = CSGRP2COLOR[i], ylab="proportion of merged peaks\ndetected in individual", xlab="peak length cutoff (bp)", cex.lab=1.5, main=names(peakGrp.byCSandLen2IndFreq)[i]) par(new=T) plot(1:length(peakGrps.byCSandLen[[i]]), sapply(peakGrps.byCSandLen[[i]],length), xaxt="n", yaxt="n",xlab="",ylab="", type="b", bty="n", lwd=1.5, cex= 1) axis(4) mtext("peaks number",side=4, line=3, cex = 1.5)#,col="blue") } layout(matrix(1, ncol=1)) peaks.nonMatchedTissOnly.IndCountsDist.perm.mean=apply(peaks.nonMatchedTissOnly.IndCountsDist.perm,2,mean) # peaks.nonMatchedTissOnly.IndCountsDist.perm.FC = sapply(1:sum(peaks.nonMatchedTissOnly.IndCountsDist.perm.mean!=0), # FUN=function(i) # { # peaks.nonMatchedTissOnly.IndCountsDist.perm[,peaks.nonMatchedTissOnly.IndCountsDist.perm.mean!=0][,i]/ # peaks.nonMatchedTissOnly.IndCountsDist.perm.mean[peaks.nonMatchedTissOnly.IndCountsDist.perm.mean!=0][i] # }) # colnames(peaks.nonMatchedTissOnly.IndCountsDist.perm.FC) = names(peaks.nonMatchedTissOnly.IndCountsDist.perm.mean)[peaks.nonMatchedTissOnly.IndCountsDist.perm.mean!=0] # #rownames(peaks.nonMatchedTissOnly.IndCountsDist.perm.FC) = rownames(peaks.nonMatchedTissOnly.IndCountsDist.perm) peaks.nonMatchedTissOnly.IndCountsDist.perm.FC = apply(peaks.nonMatchedTissOnly.IndCountsDist.perm,2, FUN=function(x) { x/(mean(x)+1) }) boxplot(ylim=c(0, 5), log10(peaks.nonMatchedTissOnly.IndCountsDist.perm.FC+1), ylab="log10 FC of peaks # detected\n compared to mean in permutation", xlab="no less than individual #", main= paste(TISS, grp.slctForPerm, sep=" ")) points(1:ncol(peaks.nonMatchedTissOnly.IndCountsDist.perm.FC), log10(peak.nonMatchedTissOnly.IndCountsDist/(peaks.nonMatchedTissOnly.IndCountsDist.perm.mean+1)+1), pch="x", col="red", cex=2, cex.lab=1.5) legend("topright", legend=c("observed", "permutation"), #inset=c(-0.25,0), pch=c("X", "o"), col=c("red", "black"), cex= 1.5 ) par(new=T) plot(1:ncol(peaks.nonMatchedTissOnly.IndCountsDist.perm.FC), peak.nonMatchedTissOnly.IndCountsDist, log="y", xaxt="n", yaxt="n",xlab="",ylab="", type="b", bty="n", lwd=2, cex= 1.5) axis(4, at=c(1, 10, 100, 1000, 10000)) mtext("peaks #",side=4, line=3, cex = 1.5)#,col="blue") # par(mar=c(6,6,6,6)) # layout(matrix(1:15, ncol=5)) # for(i in 1:ncol(peaks.noMatchTiss.2IndCounts.perm)) # { # # } dev.off()