#V1.3 #only focus on eGene #split the job across genome #need to first correct expression covariates # #GENO.DIR = argv[1] argv = commandArgs(trailingOnly = TRUE) MOD= argv[1] #rerun or TISS = argv[2] JOB.N = argv[3] if(!grepl("R version 3.6", R.version$version.string)) { stop("use .r-3.6.0-bioconductor") } R.LIB.MYPATH = "~/lhou.compbio/libs/R/x86_64-pc-linux-gnu-library/3.6/" R.LIBS.PATH = .libPaths() .libPaths(c(R.LIB.MYPATH, R.LIBS.PATH)) options(stringsAsFactors = FALSE) options(scipen = 999) sh.heads =list() sh.heads$norm= paste("#!/bin/bash", "#$ -binding linear:1", "#$ -S /bin/bash", "##$ -V", "#$ -cwd", "#$ -e ./error.$JOB_NAME.$JOB_ID", "#$ -o ./outpt.$JOB_NAME.$JOB_ID", "#$ -l h_vmem=30g", "#$ -l h_rt=2:00:00", "##$ -pe smp 4", "source ~/.bashrc", "source ~/.my.bashrc", "use .r-3.6.0-bioconductor", sep="\n") sh.heads$rerun= paste("#!/bin/bash", "#$ -binding linear:1", "#$ -S /bin/bash", "##$ -V", "#$ -cwd", "#$ -e ./error.$JOB_NAME.$JOB_ID", "#$ -o ./outpt.$JOB_NAME.$JOB_ID", "#$ -l h_vmem=30g", "#$ -l h_rt=5:00:00", "##$ -pe smp 4", "source ~/.bashrc", "source ~/.my.bashrc", "use .r-3.6.0-bioconductor", sep="\n") SAMP.PROP = 0.7 SAMP.TIMES = 10 EQTL.CUTOFF =0.05 GENE2eQTL.WIND = 1000000 tiss2GTExTiss = c(Brain = "Brain_Frontal_Cortex_BA9", Lung = "Lung", Muscle = "Muscle_Skeletal", Heart = "Heart_Left_Ventricle", Blood="Whole_Blood") # library(dplyr) # library(tidyr) # library(zqtl) library(data.table) #source('~/lhou.compbio/codes/mylib/RCode/Util_zqtl_byYongjin.R') file.in.exp.matrix = paste0("/broad/compbio/data/GTEx/v8/eqtl/GTEx_Analysis_v8_eQTL_expression_matrices/", tiss2GTExTiss[TISS], ".v8.normalized_expression.bed.gz") file.in.GTExCov = paste0("/broad/compbio/data/GTEx/v8/eqtl/GTEx_Analysis_v8_eQTL_covariates/", tiss2GTExTiss[TISS], ".v8.covariates.txt") file.in.eqtl.RData = paste0("../haQTLvseQTL/b.GTEx_eQTL_V8_MAFfilt/", tiss2GTExTiss[TISS], ".MAF0.05.RData") #"../haQTL/a_5_haQTL_partOfSamp_FixedFactorNum_withGTExMinCov_V1.1_quan.RUVg_2000k_bed_tabix/" dir.tmp = paste0("~/hptmp/eGTEx-H3K27ac/a_PGS_exp_glmnet/", TISS, "_", GENE2eQTL.WIND/1000, "kb/") dir.create(dir.tmp, showWarnings = F, recursive = T) dir.ou=paste0("a_exp_PGS_glmnet_V1.3_allData_eGene/", TISS, "/") dir.create(dir.ou, showWarnings = F, recursive = T) file.ou.RData = paste0(dir.ou, TISS, ".exp.rmCov.RData") file.ou.pdf = paste0(dir.ou, TISS, ".exp.rmCov.pdf") file.ou.eqtl.filt.RDS = paste0(dir.ou, TISS, ".eqtl.filt.p", EQTL.CUTOFF, ".RDS") SCRIPT = "a.estGeneticCompInExp_glmnet_V1.3_allData_eGene.R" #correct gene expression if(!file.exists(file.ou.RData)) { buf = read.table(gzfile(file.in.exp.matrix), sep="\t", row.names=4, header=T, comment="", check.names = F ) Y= as.matrix(buf[,-(1:3)]) genes.TSS = buf[,1:3] Y.cov = as.matrix(read.table(file.in.GTExCov, sep="\t", header = T, check.names = F, row.names=1))[, colnames(Y)] Y.rmCov = t(sapply(rownames(Y), FUN=function(g) { #print(g) df = data.frame(Y=Y[g, ], t(Y.cov) ) mod = lm(Y~., df) return(summary(mod)$resid) })) save(genes.TSS, Y.rmCov, file=file.ou.RData ) pccs = sapply(rownames(Y), FUN=function(g) { cor(Y[g, ], Y.rmCov[g,]) }) pdf(file.ou.pdf, width=12, height = 5) layout(matrix(1:2, ncol=2)) par(mar=c(5,7,5,5)) hist(pccs, xlab="correlation between before and after removing covariates\nof gene expression", main = TISS) plot(apply(Y,1,mean), pccs, xlab="mean of gene expression", ylab="correlation between before and after removing covariates\nof gene expression") dev.off() } #filter SNPs if(!file.exists(file.ou.eqtl.filt.RDS)) { load(file.in.eqtl.RData) eQTL.filt = eQTL[eQTL$pval_nominal<=EQTL.CUTOFF,] eQTL.filt.byGene = split(eQTL.filt[,c("variant_id", "tss_distance", "maf", "pval_nominal", "slope", "slope_se")], eQTL.filt$gene_id) #mutate(gene.tis, histone.tis) saveRDS(eQTL.filt.byGene, file=file.ou.eqtl.filt.RDS) } load(file.ou.RData) for(job.i in 1:JOB.N) { JOB.I.inFN = paste(c(rep(0, nchar(JOB.N)-nchar(job.i)), job.i), collapse="") f.tmp.RData = paste0(dir.tmp, TISS, ".PGS.", JOB.I.inFN, ".RData") f.o.sh = paste0("a_estPGSExp.", TISS, ".", job.i, ".sh") if(MOD=="rerun" && file.exists(f.tmp.RData)) next write(sh.heads[[MOD]], f.o.sh) write(paste("Rscript ", SCRIPT, TISS, GENE2eQTL.WIND, job.i, JOB.N, dir.ou, f.tmp.RData), f.o.sh, append = T) write("echo work done", f.o.sh, append=T) system(paste0("qsub ", f.o.sh)) }