#V1.2 #fix the bug about sorted issue, which only produce part of reads overlapping with the peaks #filter peak wihout any reads across all samples #add peaks that overlapped with ref # #V1.1 #use qvalue cutoff for filtering before merging # #need to run twice, firt time to merge peaks and map tags, second time to put them in a matrix #need to merge peak for each tissue #only merge peaks from samples with good quality [defined by mismatch, RSC and total Reads] # use shifted reads coverage instead #do not caculate counts for referecen samples in this version, reference samples could be calculated separatedly args = commandArgs(trailingOnly=TRUE) TISS = args[1] #Brain Heart Muscle Lung #options(scipen = 999) RSC.CUTOFF = 0.8 READSTOTAL.CUTOFF = 1E7 LOGQ.CUTOFF=2 N.CUTOFF=2 #need to appear in at least 2 samples TISSUEs = c("Brain", "Heart", "Lung", "Muscle" ) TISS2RoadmapLabel=c(Brain ="E073", Heart ="E095", Lung="E096", Muscle="E108") sh.head = paste("#!/bin/bash", "#$ -S /bin/bash", "##$ -V", "#$ -cwd", "#$ -e ./error.$JOB_NAME.$JOB_ID", "#$ -o ./outpt.$JOB_NAME.$JOB_ID", "#$ -l h_vmem=40g", "source ~/.bashrc", "source ~/.my.bashrc", sep="\n") # file.in.peak.suf = "out/peak/macs2/rep1/*nodup.pval0.01.500K.filt.narrowPeak.gz" file.in.tagAlign.suf = "out/align/rep1/*nodup.tagAlign.gz" #file.in.badSamp = "b_tissueSpecif/sample.QC.RSCcutoff1.txt" file.in.metaAndQC = "~/lhou.compbio/data/eGTEX-H3K27ac/Batch_summary_6.16.2018/sample.metaAndQC.RSC0.6.totalReads5e+06.noMatchForLung.txt" file.in.Roadmap.pk.bed=paste0("/broad/compbio/anshul/projects/roadmap/peaks/consolidated/narrowPeak/", TISS2RoadmapLabel[TISS],"-H3K27ac.narrowPeak.gz") dir.tmp = "/broad/hptmp/leihou/eGTEx-H3K27ac/peakMerge/" dir.create(dir.tmp, showWarnings = F, recursive = T) file.tmp.allPeak = paste(dir.tmp, "hg19.nodup.filt.narrowPeak.all.bed", sep="") #file.tmp.pkInRef = paste(dir.tmp, "merged.peaks.inRef.bed", sep="") file.tmp.filt.mergedPeak = paste(dir.tmp, "hg19.nodup.filt.narrowPeak.merged.filt", N.CUTOFF, "Samps.bed", sep="") dir.ou="./a_mergePeak2HeightsMean_V1.2_addRefPeak/" dir.create(dir.ou, showWarnings = F, recursive = T) #file.ou.mergedPeak = paste(dir.ou, "hg19.nodup.filt.narrowPeak.merged.bed", sep="") file.ou.pref = paste0(dir.ou, "hg19.narrowPeak.RSC", RSC.CUTOFF, ".RdsTot", READSTOTAL.CUTOFF, ".logQ", LOGQ.CUTOFF, ".merged.") #.filt", N.CUTOFF, "GoodSamps", sep="") #file.ou.filt.mergedPeak = paste(file.ou.pref, ".bed", sep="") #file.ou.fractMatrix = paste(dir.ou, "hg19.nodup.filt.narrowPeak.merged.No0Fraction", sep="") #file.ou.ovlp.pdf = paste(dir.ou, "diagnose.ovlpInRef.pdf", sep="") #file.ou.pdf = paste(file.ou.pref, ".heigthsMean.pdf", sep="") ######################################################################### print("read metadata") meta = read.table(file.in.metaAndQC, sep="\t", header=T, row.names=1) tiss.meta = meta[meta[,"tissue"]==TISS,] file.ou.filt.mergedPeak.bed = paste(file.ou.pref, "filt", N.CUTOFF, "GoodSampsAndRef.", TISS, ".bed", sep="") file.ou.filt.mergedPeak.sorted.bed= paste(file.ou.pref, "filt", N.CUTOFF, "GoodSampsAndRef.", TISS, ".sorted.bed", sep="") # file.ou.filt2GoodSamps.mergedPeak.bed = paste(file.ou.pref, "filt", N.CUTOFF, "GoodSamps.", TISS, ".bed", sep="") # file.ou.filt2GoodSamps.mergedPeak.sorted.bed= paste(file.ou.pref, "filt", N.CUTOFF, "GoodSamps.", TISS, ".sorted.bed", sep="") isPeakMerged.filt = file.exists(file.ou.filt.mergedPeak.sorted.bed) ######################################################################### if(!isPeakMerged.filt) { file.ou.mergedPeak = paste0(file.ou.pref, "GoodSamps.", TISS, ".bed") file.ou.pdf = paste(file.ou.pref, ".", TISS, ".diag.pdf", sep="") print(paste(TISS, " merge peaks", sep="")) if(file.exists(file.tmp.allPeak)) { file.remove(file.tmp.allPeak) } samps.GoodForMergedPeaks = rownames(tiss.meta[tiss.meta[,"RSC"]>=RSC.CUTOFF & tiss.meta[,"totalReads"]>=READSTOTAL.CUTOFF,]) for(samp in samps.GoodForMergedPeaks) { f.i.pk = system(paste("find ", tiss.meta[samp,"dir"], file.in.peak.suf, sep=""), intern = T) #log10.pvals=as.numeric(system(paste("zcat ", f.i.pk, "| awk '{print $8}'", sep="" ), intern=T)) #hist(log10.pvals) cmd = paste("zcat ", f.i.pk, "|awk '($9>=", LOGQ.CUTOFF, "){print $1 \"\\t\" $2 \"\\t\" $3 \"\\t\" $1 \":\" $2 \"-\" $3 \"_\" $8}' >>", file.tmp.allPeak, sep="") system(cmd) } cmd = paste("sort -k1,1 -k2,2n ", file.tmp.allPeak, " | mergeBed -i stdin -c 4 -o collapse -delim \",\" >", file.ou.mergedPeak, sep="") system(cmd) #diagnosis mPK2len = as.numeric(system(paste("awk '{print $3}'", file.ou.mergedPeak), intern = T)) - as.numeric(system(paste("awk '{print $2}'", file.ou.mergedPeak), intern = T)) buf=system(paste("awk '{print $4}'", file.ou.mergedPeak), intern = T) mPK2pvals = lapply(buf, FUN=function(pks) { sapply(unlist(strsplit(pks, split=",")), FUN=function(pk) { as.numeric(unlist(strsplit(pk, split="_"))[2]) }) }) mPK2pkLen = lapply(buf, FUN=function(pks) { sapply(unlist(strsplit(pks, split=",")), FUN=function(pk) { buf = unlist(strsplit(gsub(":|-|_", "\t", pk), "\t", perl = T)) as.numeric(buf[3])-as.numeric(buf[2]) }) }) mPK2coverage = sapply(mPK2lengths, sum)/mPK2len mPK2pkNum = sapply(mPK2pvals,length) size.seq = sort(as.numeric(levels(factor(mPK2pkNum)))) size2pval = lapply(size.seq, FUN=function(size) { unlist(mPK2pvals[mPK2pkNum==size]) }) names(size2pval) = as.character(size.seq) size2SecondTopPval = lapply(size.seq, FUN=function(size) { if(size==1) { return(unlist(mPK2pvals[mPK2pkNum==1])) } sapply(mPK2pvals[mPK2pkNum==size], FUN=function(qs) { sort(qs, decreasing = T)[2] }) }) size2coverage=lapply(size.seq, FUN=function(size) { unlist(mPK2coverage[mPK2pkNum==size]) }) names(size2pval) = as.character(size.seq) # pdf(file.ou.pdf, height = 9, width=12) hist(mPK2pkNum, xlab="number of peaks overlapped with merged peaks", breaks=100) boxplot(size2pval, xlab="number of peaks overlapped with merged peaks", ylab="pvalues") boxplot(size2SecondTopPval, xlab="number of peaks overlapped with merged peaks", ylab="2nd largest pvalues") boxplot(size2coverage, xlab="number of peaks overlapped with merged peaks", ylab="coverage of peaks over merged peaks") boxplot(mPK2len~mPK2pkNum, xlab="number of peaks overlapped with merged peaks", ylab="peak length") smoothScatter(mPK2len, mPK2coverage, xlab="merged peak length", ylab="coverage of peaks over merged peaks") #layout(matrix(1:9, ncol=3)) for(i in 1:length(samps.GoodForMergedPeaks)) { print(i) hist(size2pval[[as.character(i)]], main=paste("size ", i, sep=""), xlab="qvalue", breaks=200) } dev.off() # cmd =paste0("intersectBed -u -a ", file.ou.mergedPeak, " -b ", file.in.Roadmap.pk.bed, "|awk '{print $1 \"\\t\" $2 \"\\t\" $3}'") #cmd =paste0("intersectBed -u -a ", files.in.Roadmap.pk.bed[TISS2RoadmapLabel[[TISS]]], " -b ",file.in.peaks.all.bed, "|awk '{print $1 \"\\t\" $2 \"\\t\" $3}'") peaks.ovlpRef.bed=system(cmd, intern = T) cmd = paste0("awk '{if (gsub(/,/,\"\",$4)>=", N.CUTOFF-1,") print}' ", file.ou.mergedPeak,"| awk '{print $1 \"\\t\" $2 \"\\t\" $3}'") peaks.2ind.bed = system(cmd, intern = T) peaks.filt.bed = union(peaks.ovlpRef.bed, peaks.2ind.bed) peaks.filt = sub("\t", ":", peaks.filt.bed) peaks.filt = sub("\t", "-", peaks.filt) write(paste(peaks.filt.bed, peaks.filt, sep="\t"), file.ou.filt.mergedPeak.bed) system(paste0("sortBed -i ", file.ou.filt.mergedPeak.bed, ">", file.ou.filt.mergedPeak.sorted.bed)) # peaks.2ind = sub("\t", ":", peaks.2ind.bed) peaks.2ind = sub("\t", "-", peaks.2ind) # write(paste(peaks.2ind.bed, peaks.2ind, sep="\t"), file.ou.filt2GoodSamps.mergedPeak.bed) # system(paste0("sortBed -i ", file.ou.filt2GoodSamps.mergedPeak.bed, ">", file.ou.filt2GoodSamps.mergedPeak.sorted.bed)) #################################################### print("mean counts in each peak for testing samples") # merged.peaks.df = read.table(file.ou.filt.mergedPeak, header=F, sep="\t", row.names = 4) # colnames(merged.peaks.df) = c("chr", "start", "end") # merged.peaks = rownames(merged.peaks.df) # # merged.peaks.len = merged.peaks.df[,3]-merged.peaks.df[,2] # names(merged.peaks.len) = merged.peaks for(samp in rownames(tiss.meta)) { print(samp) f.i.tag=system(paste("find ", tiss.meta[samp,"dir"], file.in.tagAlign.suf, sep=""), intern = T) frag.len = tiss.meta[samp,"FragLen"] f.o.sh = paste("a.", samp, ".ovlp.merged.sh", sep="") f.tmp.merged = paste(dir.tmp, samp, ".", TISS, ".ovlp.merged.bed", sep="") # if(unlist(strsplit(system(paste("wc -l ", f.tmp.merged, sep=""), intern = T), split=" "))[1] != 0) # { # next # } write(sh.head, f.o.sh) cmd= paste("zcat ", f.i.tag, "|awk '{if($6==\"+\") {print $1 \"\\t\" $2 \"\\t\" $2+", frag.len, "} if($6==\"-\") {start =1>$3-", frag.len, "?1:$3-", frag.len, ";print $1 \"\\t\" start \"\\t\" $3 }}'|sortBed -i stdin| intersectBed -b stdin -a ", file.ou.filt.mergedPeak.sorted.bed, " -wo -sorted|mergeBed -i stdin -c 4,8 -o distinct,sum >", f.tmp.merged, sep="") #the tatal bps overlapping with reads and peak regions #system(cmd) write(cmd, file=f.o.sh, append = T) write("echo done",file=f.o.sh, append = T) system(paste("qsub ", f.o.sh, sep="")) } }else { #need to wait for the above codes running print(TISS) file.ou.RData = paste(file.ou.pref, "heightsMean.", TISS, ".filt0.RData", sep="") file.ou.sum.pdf = paste(file.ou.pref, "heightsMean.", TISS, ".sum.pdf", sep="") file.ou.filt0.peak.bed = paste0(file.ou.pref, N.CUTOFF, "GoodSampsAndRef.filt0.", TISS, ".bed") merged.peaks.df = read.table(file.ou.filt.mergedPeak.sorted.bed, header=F, sep="\t", row.names = 4) colnames(merged.peaks.df) = c("chr", "start", "end") merged.peaks = rownames(merged.peaks.df) merged.peaks.len = merged.peaks.df[,3]-merged.peaks.df[,2] names(merged.peaks.len) = merged.peaks samp2HeightsMean = sapply(rownames(tiss.meta), FUN=function(samp) { print(samp) f.tmp.merged = paste(dir.tmp, samp, ".", TISS, ".ovlp.merged.bed", sep="") heights =as.matrix(read.table(f.tmp.merged, sep="\t", header=F, row.names=4, colClasses = "character"))[,4] pk2heightMean = rep(0, length(merged.peaks)) names(pk2heightMean) = merged.peaks pk2heightMean[names(heights)] = as.numeric(heights)/merged.peaks.len[names(heights)] return(pk2heightMean) }) colnames(samp2HeightsMean) = rownames(tiss.meta) # samp2HeightsMean.log.sum = apply(log10(samp2HeightsMean+1), 1, sum) pdf(file.ou.sum.pdf) hist(samp2HeightsMean.log.sum, xlab="sum of log10 signal", breaks=100, main=TISS) dev.off() samp2HeightsMean=samp2HeightsMean[samp2HeightsMean.log.sum!=0 ,] #write(paste(gsub(":|-", "\t", rownames(samp2HeightsMean)), rownames(samp2HeightsMean), sep="\t"), # file.ou.filt0.peak.bed) meta = tiss.meta save(samp2HeightsMean, meta, merged.peaks, merged.peaks.df, merged.peaks.len, file=file.ou.RData) }