#calculation the distribution of peaks around genes HM2COL=c("H3K27ac"="#E5803C", "H3K4me1"="#E974FF", "H3K4me3"="#DD4545", "H3K36me3"="#5FAA37", "H3K27me3"="#4474EA" ) UPSTREAM.WIND=50000 DNSTREAM.WIND=50000 BIN.SIZE=500 BIN.NUM=UPSTREAM.WIND/BIN.SIZE BIN.PROM.NUM=2000/BIN.SIZE library(data.table) file.in.hg19.gtf= "~/lhou.compbio/data/gene/human/gencode.v26lift37.basic.annotation.gtf" files.in.peak=paste0("c_bulkDiffPeak_limma_V1.4.1_sva_noEHR/", names(HM2COL), "/", names(HM2COL), ".diffPeak.BG.bed") names(files.in.peak) = names(HM2COL) dir.tmp = "~/hptmp/b_bgPeak_homerAnnot/" dir.create(dir.tmp, showWarnings = F, recursive = T) dir.ou = "b_bgPeak_homerAnnot/" dir.create(dir.ou, showWarnings = F, recursive = T) files.ou.annot = paste0(dir.ou, names(HM2COL), ".bgPeaks.homerAnnot.txt") names(files.ou.annot) = names(HM2COL) file.ou.RData = paste0(dir.ou, "5HM.peakDistrib.scaledGeneBody.RData") file.ou.pdf=paste0(dir.ou, "5HM.peakDistrib.scaledGeneBody.pdf") # for(hm in names(files.in.peak)) { print(hm) f.tmp= paste0(dir.tmp, hm, ".pk.tmp") if(file.exists(files.ou.annot[hm])) next buf=read.table(files.in.peak[hm], sep="\t", head=F, row.names = NULL) write(paste(paste0(buf[,1], ":", buf[,2], "-", buf[,3]), buf[,1], buf[,2], buf[,3], "+", sep="\t"), file=f.tmp) cmd=paste0("annotatePeaks.pl ", f.tmp, " hg19 -gtf ", file.in.hg19.gtf, ">", files.ou.annot[hm]) system(cmd) } # if(file.exists(file.ou.RData)) { load(file.ou.RData) }else { buf=fread(file.in.hg19.gtf, sep="\t", header=T, data.table=F, stringsAsFactors=F) gene.annots=buf[buf[,3]=="gene", ] rownames(gene.annots)=sapply(gene.annots[, 9], FUN=function(x) { gsub("gene_id \"(ENSG.*?)\".*", "\\1", x) }) save(gene.annots, file=file.ou.RData) } # hm.distb.toGene=sapply(names(HM2COL), FUN=function(hm) { print(hm) buf=read.table(files.ou.annot[hm], sep="\t", header = F, row.names = 1, stringsAsFactors = F, skip=5) res=sapply(1:nrow(buf), FUN=function(i) { if(i %% 1000 ==0) print(i) pk.center=(buf[i,2]+buf[i,3])/2 g=buf[i,11] if(g =="" || is.na(g) || !(g %in% rownames(gene.annots))) return(NA) g.start=gene.annots[g,4] g.end=gene.annots[g,5] g.strand=gene.annots[g,7] pk.bin=NA if(g.strand=="+") { if(pk.centerUPSTREAM.WIND) return (NA) pk.bin= paste0("up", round((g.start-pk.center)/BIN.SIZE)) }else { if(pk.center>=g.start && pk.center<=g.end) { pk.bin= paste0("gb", round((pk.center-g.start)/((g.end-g.start)/BIN.NUM))) }else { if(pk.center-g.end>DNSTREAM.WIND) return (NA) pk.bin= paste0("dn", round((pk.center-g.end)/BIN.SIZE)) } } }else { if(pk.center>g.end) { if(pk.center-g.end>UPSTREAM.WIND) return (NA) pk.bin= paste0("up", round((pk.center-g.end)/BIN.SIZE)) }else { if(pk.center>=g.start && pk.center<=g.end) { pk.bin= paste0("gb", round((g.end-pk.center)/((g.end-g.start)/BIN.NUM))) }else { if(g.start-pk.center>DNSTREAM.WIND) return (NA) pk.bin= paste0("dn", round((g.start-pk.center)/BIN.SIZE)) #pk.bin= round((g.start-pk.center)/BIN.SIZE) } } } return(pk.bin) }) names(res) =rownames(buf) return(res) }) save(gene.annots, hm.distb.toGene, file=file.ou.RData) pdf(file.ou.pdf, width=12, height=10) layout(matrix(1:5, nrow=5)) for(hm in names(HM2COL)) { x = hm.distb.toGene[[hm]] x.gbProp = sum(!is.na(x))/length(x)*100 x = x[!is.na(x)] x[x=="up100"]="up99" x[x=="gb100"]="gb99" x[x=="dn100"]="dn99" bin2Num= summary(factor(x, levels=c(paste0("up", 99:0), paste0("gb", 0:99), paste0("dn", 0:99))), maxsum=3*BIN.NUM+1) bin.promot.avg= mean(bin2Num[c(paste0("up", 0:(BIN.PROM.NUM-1)), "gb0")]) bin.gb.avg=mean(bin2Num[paste0("gb", 0:99)]) xs=barplot(bin2Num, las=2, #lwd=0, #col=NA, border=HM2COL[hm], col=HM2COL[hm], xlab="genomic bins relative to genebody", ylab="counts of CREs", main=paste0(hm, " ", signif(x.gbProp,2), "% CREs within 50kb of a gene\ngeneBody to prom signal ratio: ", signif(bin.gb.avg/bin.promot.avg,2))) abline(v=c(xs[c(101, 200)]), lty=2) } dev.off() # #print how much in each category # for(hm in names(hm.distb.toGene)) # { # print(hm) # props=summary(factor(sub("\\d+", "", hm.distb.toGene[[hm]])))/length(hm.distb.toGene[[hm]]) # print(props) # }