#V1.2 #instead using training data, using whole data to build the model this time #V1.1 #add single lead eQTL prediction so that to compare how polygenic risk score is much better than single one #remove SNP that is not associated much before model fitting #build polygenic model from individual data by glmnet #need to impute the genotype first #GENO.DIR = argv[1] argv = commandArgs(trailingOnly = TRUE) TISS = argv[1] GENE2eQTL.WIND = as.numeric(argv[2]) JOB.i = as.numeric(argv[3]) JOB.N = as.numeric(argv[4]) dir.in = argv[5] f.tmp.RData = argv[6] if(!grepl("R version 3.6", R.version$version.string)) { stop("use .r-3.6.0-bioconductor") } R.LIB.MYPATH = "~/lhou.compbio/libs/R/x86_64-pc-linux-gnu-library/3.6/" R.LIBS.PATH = .libPaths() .libPaths(c(R.LIB.MYPATH, R.LIBS.PATH)) options(stringsAsFactors = FALSE) options(scipen = 999) #c(2000, 5000, 10000, 1000000) 50000# GENOTYPE.PERCT.CUTOFF=0.8 GLMNET.ALPHA = 0.5 tiss2GTExTiss = c(Brain = "Brain_Frontal_Cortex_BA9", Lung = "Lung", Muscle = "Muscle_Skeletal", Heart = "Heart_Left_Ventricle", Blood="Whole_Blood") # library(dplyr) # library(tidyr) # library(zqtl) library(glmnet) library(data.table) #source('~/lhou.compbio/codes/mylib/RCode/Util_zqtl_byYongjin.R') file.in.eGene = paste0("/broad/compbio/data/GTEx/v8/eqtl/GTEx_Analysis_v8_eQTL_significant/", tiss2GTExTiss[TISS], ".v8.egenes.txt.gz")#"/broad/compbio/data/GTEx/v8/references/gencode.v26.GRCh38.genes.gtf" # dir.in.genotype.plink = "/broad/compbio/data/GTEx/GTEx_restricted/v8_plink/plink-relaxed/" # file.in.sampWithGenotype = paste0(dir.in.genotype.plink, CHR, ".fam") file.in.totalVCF = "/broad/compbio/data/GTEx/GTEx_restricted/GTEx_Analysis_2017-06-05_v8/genotypes/WGS/variant_calls/GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.vcf.gz" #dir.in.haQTL.train = paste0("../haQTL/a_4_haQTL_V1.2_quan.RUVg_rmCov_trainingSamp_2000k/", TISS, "_nominal/") file.in.rmCov.RData = paste0(dir.in, TISS, ".exp.rmCov.RData") file.in.eqtl.filt.RDS = paste0(dir.in, TISS, ".eqtl.filt.p0.05.RDS") #file.ou.PGS.RData = paste0(dir, TISS, ".", CHR, ".PGS.", GENE2eQTL.WIND/1000, "kb.", JOB.i, ".RData") ################################################################ #samps = read.table(file.in.sampWithGenotype, sep=" ", header=F, row.names = NULL, stringsAsFactors = F)[,2] eGene.info = read.table(gzfile(file.in.eGene), sep="\t", header = T, row.names = 1, stringsAsFactors = F) eGenes = rownames(eGene.info)[eGene.info$qval<=0.05] ## hg38 peaks samps.all = system(paste0("bcftools query -l ", file.in.totalVCF), intern = T) #samps.train = rownames(read.table(file.in.inds.train, sep="\t", row.names=1, header = F)) load(file.in.rmCov.RData) samps.exp = colnames(Y.rmCov) eQTL.filt.byGene = readRDS(file.in.eqtl.filt.RDS) genes.start = round((JOB.i-1)*(length(eGenes)/JOB.N)) +1 genes.end = round(JOB.i*(length(eGenes)/JOB.N)) if(JOB.i == JOB.N) genes.end = length(eGenes) gene.genotypeAndModelAndEst_glmnet=lapply(eGenes[genes.start:genes.end], FUN=function(gene) { print(gene) snps.filt = eQTL.filt.byGene[[gene]]$variant_id gene.tss = genes.TSS[gene,3] gene.lb = max(1, gene.tss-GENE2eQTL.WIND) gene.ub = gene.tss + GENE2eQTL.WIND gene.chr= genes.TSS[gene,1] buf = system(paste0("tabix ", file.in.totalVCF, " ", gene.chr, ":", gene.lb, "-", gene.ub), intern=T) if(length(buf)==0) { return(NA) } names(buf) = sapply(buf, FUN=function(b) { snp.info = unlist(strsplit(b, split="\t")) snp.id = snp.info[3] }) buf.filt = buf[intersect(names(buf), snps.filt)] if(length(buf.filt)==0) { return(NA) } snps.AC = sapply(buf.filt, FUN=function(b) { snp.info = unlist(strsplit(b, split="\t")) snp.AC = sapply(snp.info[10:length(snp.info)], FUN=function(info) { sum(as.numeric(unlist(strsplit(unlist(strsplit(info, split=":"))[1],split="/")))) }) if(length(levels(factor(snp.AC[!is.na(snp.AC)])))==1) { return(rep(NA, length(snp.AC))) } return(snp.AC) }) # colnames(snps.AC) = sapply(buf, # FUN=function(b) # { # snp.info = unlist(strsplit(b, split="\t")) # snp.id = snp.info[3] # }) rownames(snps.AC) = samps.all #impute and filter snps.AC.noNAPerct = apply(snps.AC, 2, FUN=function(xi) { 1-sum(is.na(xi))/length(xi) }) snps.AC.filt = cbind(snps.AC[, snps.AC.noNAPerct>=GENOTYPE.PERCT.CUTOFF]) colnames(snps.AC.filt) = colnames(snps.AC)[snps.AC.noNAPerct>=GENOTYPE.PERCT.CUTOFF] if(ncol(snps.AC.filt)==0) { return(NA) } snps.AC.filt.impute=apply(snps.AC.filt, 2, FUN=function(xi) { xi[is.na(xi)]= mean(xi, na.rm=T) return(xi) }) y.train = Y.rmCov[gene, samps.exp] X.train= cbind(snps.AC.filt.impute[samps.exp,]) #single SNP view # cors = cor(y.train, X.train) X.topSNPid = colnames(X.train)[which.max(abs(cors))[1]] X.top= X.train[, X.topSNPid] mod=lm(y~., data.frame(y=y.train, X.top=X.top)) y.est_leadeQTL.all = predict(mod, data.frame(X.top=snps.AC.filt.impute[,X.topSNPid])) # #foldid=sample(1:10,size=length(y),replace=TRUE) #cv1=cv.glmnet(x=x, y=y, foldid=foldid,alpha=1) if(ncol(X.train)==1) { mod = lm(y.train~X.train) cvfit.coef = cbind(summary(mod)$coef[,"Estimate"]) rownames(cvfit.coef)[-1] = colnames(snps.AC.filt.impute) }else { cvfit=cv.glmnet(x=X.train, y=y.train, alpha=GLMNET.ALPHA) cvfit.coef = matrix(coef(cvfit, s = "lambda.min")) rownames(cvfit.coef) = rownames(coef(cvfit, s = "lambda.min")) } y.est_PGS.all = cbind(1,snps.AC.filt.impute) %*% cvfit.coef return(list(X.impute=snps.AC.filt.impute, cvfit.coef = cvfit.coef, y.est_PGS.all = y.est_PGS.all, y.est_leadeQTL.all = y.est_leadeQTL.all )) }) names(gene.genotypeAndModelAndEst_glmnet)=eGenes[genes.start:genes.end] #mutate(gene.tis, histone.tis) save(gene.genotypeAndModelAndEst_glmnet, file=f.tmp.RData) #eqtl.tab.filt, # hqtl.tab.filt, # g.pks, #output.bgzip(out.tab, OUT.FILE) #log.msg('Done')