#V1.2 #compare the correlation between H3K27ac at promoter and all Marks at gene body across individuals for each gene #to check whether H3K36me3 is best surrogate for the gene expression #only consider the peaks in gene bodies that are candidates for gene peak genetic linking #mapping peaks to gene body based on hg38 coordinates #do meta-analysis of differential signal of peaks taking into considertaion of sign between peak and gene expression #including peaks # library(clusterProfiler) # library(org.Hs.eg.db) # library(ggplot2) # library(DOSE) # source("~/lhou.compbio/codes/mylib/RCode/clusterProfiler_auxiliary_function.R") #KEGG.FDR=0.05 # args = commandArgs(trailingOnly=TRUE) HM = args[1] library(ggplot2) HM.Prom = "H3K27ac" HMs= union(HM.Prom,HM) #)"H3K36me3", "H3K4me1", "H3K4me3", "H3K27me3") PROM.WIND=2000 files.in.hg38tohg19 = paste0("a_2_peakHeightsNormalized_V1.4/", HMs, ".pk.hg38tohg19.RDS") files.in.meta.RData = paste0("a_2_peakHeightsNormalized_V1.4/", HMs, ".MayoWithRef.goodSamples.meta.RData") files.in.norm.RData = paste0("a_2_peakHeightsNormalized_V1.4/", HMs, ".MayoWithRef.heightsMean.depthGCNormed.RData") files.in.peak2gene.geneticScore.RData= paste0("../haQTLvseQTL/b_summary_Links.coloc.MR.PRS_V1.2_addGeneBody/Blood.", HMs, ".linkFromColoc.MR.PRS.RData") names(files.in.peak2gene.geneticScore.RData) <- names(files.in.hg38tohg19) <- names(files.in.meta.RData) <- names(files.in.norm.RData ) <- HMs dir.ou="d_CorrAcrossIndivBtwHMs_V1.2_promVSgeneBody/" dir.create(dir.ou, showWarnings = F, recursive = T) #file.ou.pkAndGeneBody.RDS = paste0(dir.ou, "pkAndGeneBody.RDS") file.ou.RData=paste0(dir.ou, "prom.", HM.Prom, ".VS.genebody.", HM, ".RData") file.ou.pdf=paste0(dir.ou, "prom.", HM.Prom, ".VS.genebody.", HM, ".pdf") #file.ou.KEGG.pdf=paste0(dir.ou, "geneBody2Peak.DP.meta.KEGG.pdf") # HM.hg38tohg19= lapply(HMs, FUN=function(HM) { pks.hg382hg19.uniq=readRDS(files.in.hg38tohg19[HM]) }) names(HM.hg38tohg19)=HMs #genetic link HM.geneticLinkScore=lapply(files.in.peak2gene.geneticScore.RData, FUN=function(f.i.RData) { load(f.i.RData) scores.all }) #peak activity HM.activity=lapply(files.in.norm.RData, FUN=function(f.i) { load(f.i) d=samp2HeightsMean$depth.GCnormTeng d=d[, grepl("id_", colnames(d))] }) #samples ovlp HM.sampsOvlp=intersect(colnames(HM.activity[[HM.Prom]]), colnames(HM.activity[[HM]])) #promoter signal buf=sapply(rownames(HM.geneticLinkScore[[HM.Prom]])[abs(HM.geneticLinkScore[[HM.Prom]]$distance)<=PROM.WIND], FUN=function(b) { unlist(strsplit(b, split=";")) }) HM.gene2Promot=split(buf[2,], buf[1,]) geneProm.signal= sapply(HM.gene2Promot, FUN=function(pks) { pks.hg19=HM.hg38tohg19[[HM.Prom]][pks] apply(rbind(HM.activity[[HM.Prom]][pks.hg19, HM.sampsOvlp]), 2, mean) }) #gene body signal HM.geneBody.score=HM.geneticLinkScore[[HM]] HM.geneBody.score.filt = HM.geneBody.score[HM.geneBody.score$geneBody,] buf= sapply(rownames(HM.geneBody.score.filt), FUN=function(b) { unlist(strsplit(b, split=";")) }) HM.geneBody.score.filt$gene=buf[1,] HM.geneBody.score.filt$pk=buf[2,] HM.gene2geneBody.score.filt= split(HM.geneBody.score.filt, HM.geneBody.score.filt$gene) save(HM.hg38tohg19, HM.geneticLinkScore, HM.activity, HM.sampsOvlp, HM.gene2Promot, geneProm.signal, HM.gene2geneBody.score.filt, file=file.ou.RData ) #correlation between gs.ovlp=intersect(names(HM.gene2geneBody.score.filt), colnames(geneProm.signal)) gs.ovlp.promVSgBody.cors=lapply(gs.ovlp, FUN=function(g) { pks.gb.hg38 = HM.gene2geneBody.score.filt[[g]]$peak pks.gb.hg19= HM.hg38tohg19[[HM]][pks.gb.hg38] pcors=cor(cbind(geneProm.signal[HM.sampsOvlp, g]), t(rbind(HM.activity[[HM]][pks.gb.hg19, HM.sampsOvlp]))) df = HM.gene2geneBody.score.filt[[g]] df$pVSgbody.cor=pcors[1,] return(df) }) gs.ovlp.promVSgBody.cors.df = do.call(rbind, gs.ovlp.promVSgBody.cors) gs.ovlp.promVSgBody.cors.df$MR.z.bin= floor(gs.ovlp.promVSgBody.cors.df$MR.z) gs.ovlp.promVSgBody.cors.df$MR.z.bin[gs.ovlp.promVSgBody.cors.df$MR.z.bin< -5 ] = -5 gs.ovlp.promVSgBody.cors.df$MR.z.bin[gs.ovlp.promVSgBody.cors.df$MR.z.bin >5 ] = 5 gs.ovlp.promVSgBody.cors.df$MR.z.bin[is.na(gs.ovlp.promVSgBody.cors.df$MR.z.bin)]="NA" gs.ovlp.promVSgBody.cors.df$MR.z.bin=factor(gs.ovlp.promVSgBody.cors.df$MR.z.bin, levels=c("NA", seq(from=-5, to=5))) gs.ovlp.promVSgBody.cors.df$dist.bin = floor(abs(gs.ovlp.promVSgBody.cors.df$distance)/100000) save(geneProm.signal, HM.gene2geneBody.score.filt, gs.ovlp.promVSgBody.cors.df, file=file.ou.RData ) pdf(file.ou.pdf, width=12, height=6) layout(matrix(1:2, ncol=2)) # smoothScatter(abs(gs.ovlp.promVSgBody.cors.df$distance.log), gs.ovlp.promVSgBody.cors.df$pVSgbody.cor, # xlab="log10 distance between gene body peak and promoter", # ylab=paste0("correlation: ", HM.Prom, " at promoter vs ", HM, " on gene body")) # abline(h=0, lty=2) boxplot( gs.ovlp.promVSgBody.cors.df$pVSgbody.cor ~gs.ovlp.promVSgBody.cors.df$dist.bin, xlab="distance bin of gene body peak to promoter (100kb)", ylab=paste0("correlation: ", HM.Prom, " at promoter vs ", HM, " on gene body")) abline(h=0, lty=2) smoothScatter(gs.ovlp.promVSgBody.cors.df$MR.z, gs.ovlp.promVSgBody.cors.df$pVSgbody.cor, xlab=paste0("gene expression mediated by ", HM, " MR.z"), ylab=paste0("correlation: ", HM.Prom, " at promoter vs ", HM, " on gene body")) abline(h=0, lty=2) abline(v=0, lty=2) # smoothScatter(gs.ovlp.promVSgBody.cors.df$distance.log, gs.ovlp.promVSgBody.cors.df$pVSgbody.cor) # abline(h=0, lty=2) p <- ggplot(gs.ovlp.promVSgBody.cors.df) + geom_boxplot(aes(x=factor(dist.bin), y=pVSgbody.cor, fill=MR.z.bin)) + xlab("distance bin of gene body peak to promoter (100kb)") + ylab(paste0("correlation: ", HM.Prom, " at promoter vs ", HM, " on gene body")) print(p) dev.off()