#https://choishingwan.github.io/PRS-Tutorial/plink/ #calculate PRS plink #and decouple the immune components based on intersection with our peak sets related to BD #library(ggplot2) # cutoffs= rbind(c(0.001, 0, 0.001), # c(0.05, 0, 0.05), # c(0.1, 0, 0.1), # c(0.2, 0, 0.2), # c(0.3, 0, 0.3), # c(0.4, 0, 0.4), # c(0.5, 0, 0.5)) options(scipen=999) library(data.table) cutoffs= rbind( #c(0.001, 0, 0.001), c(0.05, 0, 0.05), c(0.1, 0, 0.1) #c(0.2, 0, 0.2), #c(0.3, 0, 0.3), #c(0.4, 0, 0.4), #c(0.5, 0, 0.5) ) CLUMP.LD.R2.CUTOFF=0.2 CLUMP.DIS.CUTOFF=10000 PEAK.WIND=2000 file.in.gwas.gz = "./a_BP_PGC_leaveMayoBDout_GWAS_QC/BDlooMAYO_PGC3_hg38.sorted.maf0.01.INFO0.8.NProp0.9.noDup.noAmbg.bed.gz"#"./BP_GWAS_QC/daner_PGC_BIP32b_mds7a_0416a.hg38.sorted.maf0.01.INFO0.8.NProp0.9.noDup.noAmbg.bed.gz" file.in.target.QC = "./b_Mayo.BP_PGC.leaveOutMayo.GWAS_MAF0.05.locID_V1.1_noSwapping_hg38_QC/plink.maf0.01.hwe1e-06.geno0.1.mind0.1.QC"#"./Kellis.WGS.subsetFromBiobank.hg38.snp.MAF0.05.locID_V1.1_noSwapping/plink.maf0.01.hwe1e-06.geno0.1.mind0.1.QC" file.in.WGSid2phenotype = "~/lhou.compbio/data/Mayo_Bipolar/Batch_Feb.2019/broad_wgs_metadata.csv" #file.in.meta2 = "~/lhou.compbio/data/Mayo_Bipolar/Bipolar_ptDemographics.csv" #file.in.clinic = "/broad/compbio/lhou/data/Mayo_Bipolar/210415_new_broad_data_deidentified.csv" files.in.pks.bed=dir("b_genomicRegions_forPRS/", "hg38.bed", full.names=T) names(files.in.pks.bed)=gsub(".hg38.bed", "", basename(files.in.pks.bed)) # dir.tmp=paste0("~/hptmp/MayoBipolar/c_PRS_genomicRegions/ld", CLUMP.LD.R2.CUTOFF, ".clumpDis", CLUMP.DIS.CUTOFF/1000, "kb.wind", PEAK.WIND/1000, "kb/") dir.create(dir.tmp, showWarnings = F, recursive = T) dir.ou=paste0("c_PRS_byPlink_PGC.leaveOutMayo.GWAS_genomicRegions/ld", CLUMP.LD.R2.CUTOFF, ".clumpDis", CLUMP.DIS.CUTOFF/1000, "kb.wind", PEAK.WIND/1000, "kb/") dir.create(dir.ou, showWarnings = F, recursive = T) file.ou.gwas.logOR = "./a_BP_PGC_leaveMayoBDout_GWAS_QC/BDlooMAYO_PGC3_hg38.hg38.sorted.maf0.01.INFO0.8.NProp0.9.noDup.noAmbg.logOR.bed" file.ou.target.QC.clump.snp = paste0(dir.ou, "plink.maf0.01.hwe1e-06.geno0.1.mind0.1.QC.clump.valid.snps") file.ou.target.QC.clump.GWAS.bed = paste0(dir.ou, "plink.maf0.01.hwe1e-06.geno0.1.mind0.1.QC.clump.valid.GWAS.bed") file.ou.base.snp.p = paste0(dir.ou, "BDlooMAYO_PGC3.hg38.sorted.maf0.01.INFO0.8.NProp0.9.noDup.noAmbg.P") file.ou.p.range= paste0(dir.ou, "P_range.list") file.ou.PRS.pref = paste0(dir.ou, "plink.maf0.01.hwe1e-06.geno0.1.mind0.1.QC.vs.BDlooMAYO_PGC3.hg38.sorted.maf0.01.INFO0.8.NProp0.9.noDup.noAmbg.") #file.ou.PRS.pdf = paste0(dir.ou, "plink.PRSvsphenotype.pdf") file.ou.PRS.RData = paste0(dir.ou, "plink_PRS_genomicRegions.RData") #Update Effect Size by log odds dat <- fread(file.in.gwas.gz, header=T, sep="\t", data.table=F) #beta = log(OR if(sum(grepl("beta", names(dat), ignore.case=T))==0) { dat$BETA <- log(dat$OR) }else { names(dat)[grepl("beta", names(dat), ignore.case=T)]="BETA" } #names(dat)[ names(dat) =="SNP"] = "rs" #recoganize rs as default if(!file.exists(file.ou.gwas.logOR)) { write.table(dat, file.ou.gwas.logOR, quote=F, row.names=F) } #clumping #may need LD estimated from elsewhere cmd =paste0("plink --bfile ", file.in.target.QC, " --clump-p1 1", " --clump-r2 ", CLUMP.LD.R2.CUTOFF, " --clump-kb ", CLUMP.DIS.CUTOFF/1000, " --clump ", file.ou.gwas.logOR, #By default, variant IDs are expected to be in the 'rs' column. You can change this with the --clump-snp-field flag, which takes a space-delimited sequence of field names to search for. With multiple field names, earlier names take precedence over later ones. #By default, p-values are expected to be in the 'P' column; change this with --clump-field. This has the same semantics as --clump-snp-field #" --clump-snp-field rs", # --clump-field P", " --out ", file.in.target.QC) system(cmd) #SNPs cmd= paste0("awk 'NR!=1{print $3}' ", file.in.target.QC, ".clumped > ", file.ou.target.QC.clump.snp) system(cmd) #PRS calculation, SNP and pvalue #system(paste0("awk '{print $4,$11}' ", file.ou.gwas.logOR, " > ", file.ou.base.snp.p)) system(paste0("awk '{print $4,$7}' ", file.ou.gwas.logOR, " > ", file.ou.base.snp.p)) write.table(cutoffs, file=file.ou.p.range, sep=" ", row.names = F, col.names = F, quote=F) #all SNPs cmd=paste0("plink ", " --bfile ", file.in.target.QC, " --score ", file.ou.gwas.logOR, " 4 5 8 header",#" 4 5 13 header", " --q-score-range ", file.ou.p.range, " ", file.ou.base.snp.p, " --extract ", file.ou.target.QC.clump.snp, " --out ", paste0(file.ou.PRS.pref, ".all")) system(cmd) #filter based on peaks and calculate snps.clump=read.table(file.ou.target.QC.clump.snp, stringsAsFactors=F)[,1] snps.clump.bed=dat[dat$SNP %in% snps.clump,] write.table(snps.clump.bed, file=file.ou.target.QC.clump.GWAS.bed, row.names=F, sep="\t", quote=F) for(nm in names(files.in.pks.bed)) { print(nm) #filter f.t.ovlp.bed=paste0(dir.tmp, "all.ovlp.", nm, ".hg38.bed") file.ou.target.QC.clump.snp.filt=paste0(file.ou.target.QC.clump.snp, ".filtBy.", nm) file.ou.pks.ovlp.bed=paste0(dir.ou, "peaks.window.index.snp.", nm, ".hg38.bed") # cmd=paste0("bedtools window -u -w ", PEAK.WIND, " -a ", file.in.gwas.gz, " -b ", files.in.pks.bed[[nm]], ">", f.t.ovlp.bed) # system(cmd) cmd=paste0("bedtools window -w ", PEAK.WIND, " -a ", file.ou.target.QC.clump.GWAS.bed, " -b ", files.in.pks.bed[[nm]], ">", f.t.ovlp.bed) system(cmd) df=read.table(f.t.ovlp.bed, sep="\t", head=F, row.names=NULL, stringsAsFactors=F) snps.filt=df[,4] snps.filt=snps.filt[!duplicated(snps.filt)] pks.ovlp=paste0(df[,15], ":", df[,16], "-", df[,17]) #hg38 pks.ovlp=pks.ovlp[!duplicated(pks.ovlp)] #snps.filt=intersect(snps.clump, snps.filt) print(length(snps.filt)) write(snps.filt, file=file.ou.target.QC.clump.snp.filt) write(paste(gsub(":|-", "\t", pks.ovlp), pks.ovlp, sep="\t"), file.ou.pks.ovlp.bed) snps.filt=read.table(f.t.ovlp.bed, sep="\t", head=F, row.names=NULL,stringsAsFactors=F)[,4] snps.filt=intersect(snps.clump, snps.filt) print(length(snps.filt)) write(snps.filt, file=file.ou.target.QC.clump.snp.filt) # cmd=paste0("plink ", " --bfile ", file.in.target.QC, " --score ", file.ou.gwas.logOR, " 4 5 8 header",#" 4 5 13 header", " --q-score-range ", file.ou.p.range, " ", file.ou.base.snp.p, " --extract ", file.ou.target.QC.clump.snp.filt, " --out ", paste0(file.ou.PRS.pref, ".filtBy.", nm)) system(cmd) } print("done") # samp2meta=read.table(file.in.WGSid2phenotype, sep=",", header = T, row.names = 5, stringsAsFactors = F) rownames(samp2meta)=gsub("s_", "", rownames(samp2meta)) #samp2meta$group[samp2meta$group=="case"] = "disease" wgsId2sampID = paste0("id_", samp2meta$newsubjectid) names(wgsId2sampID)=rownames(samp2meta) samp2PRS.list=lapply(cutoffs[,3], FUN=function(cutoff) { res=lapply(c("all", paste0("filtBy.", names(files.in.pks.bed))), FUN=function(nm) { f.i=paste0(file.ou.PRS.pref, ".", nm, ".", cutoff, ".profile") buf=read.table(f.i, sep="", header=T, row.names=2) rownames(buf)=wgsId2sampID[rownames(buf)] df=data.frame(#group=samp2grp[rownames(buf), "group"], buf$SCORE, (buf$SCORE-mean(buf$SCORE))/sd(buf$SCORE), #paste0("pval_", cutoff), #nm, stringsAsFactors=F) colnames(df)=c(paste0(nm, ".PRS"), paste0(nm, ".PRS.z"))#, "cutoff", "type") rownames(df) = rownames(buf) return(df) }) #names(res)=c("all", paste0("filtBy.", names(files.in.pks.bed))) res.df=do.call(cbind, res) #res.df=cbind(res.df, samp2grp[rownames(res.df), c("group", "clu.corHclust5")]) #res.df=res.df[!is.na(res.df$group),] res.df$cutoff=paste0("pval_", cutoff) return(res.df) }) names(samp2PRS.list)=paste0("pval_", cutoffs[,3]) save(samp2PRS.list, file=file.ou.PRS.RData)