#V1.2
#instead using training data, using whole data to build the model this time

#V1.1
#add single lead eQTL prediction so that to compare how polygenic risk score is much better than single one


#remove SNP that is not associated much before model fitting
#build polygenic model from individual data by glmnet
#need to impute the genotype first

#GENO.DIR    = argv[1]
argv = commandArgs(trailingOnly = TRUE)
TISS = argv[1]
GENE2eQTL.WIND = as.numeric(argv[2])
JOB.i = as.numeric(argv[3])
JOB.N = as.numeric(argv[4])
dir.in = argv[5]
f.tmp.RData = argv[6]

if(!grepl("R version 3.6", R.version$version.string))
{
  stop("use .r-3.6.0-bioconductor")
}

R.LIB.MYPATH = "~/lhou.compbio/libs/R/x86_64-pc-linux-gnu-library/3.6/"
R.LIBS.PATH = .libPaths()
.libPaths(c(R.LIB.MYPATH, R.LIBS.PATH))

options(stringsAsFactors = FALSE)
options(scipen = 999)


 #c(2000, 5000, 10000, 1000000) 50000#
GENOTYPE.PERCT.CUTOFF=0.8
GLMNET.ALPHA = 0.5


tiss2GTExTiss = c(Brain = "Brain_Frontal_Cortex_BA9",
                  Lung = "Lung",
                  Muscle = "Muscle_Skeletal",
                  Heart = "Heart_Left_Ventricle",
                  Blood="Whole_Blood")


# library(dplyr)
# library(tidyr)
# library(zqtl)
library(glmnet)
library(data.table)
#source('~/lhou.compbio/codes/mylib/RCode/Util_zqtl_byYongjin.R')


file.in.eGene = paste0("/broad/compbio/data/GTEx/v8/eqtl/GTEx_Analysis_v8_eQTL_significant/", tiss2GTExTiss[TISS], ".v8.egenes.txt.gz")#"/broad/compbio/data/GTEx/v8/references/gencode.v26.GRCh38.genes.gtf"
# dir.in.genotype.plink = "/broad/compbio/data/GTEx/GTEx_restricted/v8_plink/plink-relaxed/"
# file.in.sampWithGenotype = paste0(dir.in.genotype.plink, CHR, ".fam")
file.in.totalVCF = "/broad/compbio/data/GTEx/GTEx_restricted/GTEx_Analysis_2017-06-05_v8/genotypes/WGS/variant_calls/GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.vcf.gz"
#dir.in.haQTL.train = paste0("../haQTL/a_4_haQTL_V1.2_quan.RUVg_rmCov_trainingSamp_2000k/", TISS, "_nominal/")

file.in.rmCov.RData = paste0(dir.in, TISS, ".exp.rmCov.RData")
file.in.eqtl.filt.RDS = paste0(dir.in, TISS, ".eqtl.filt.p0.05.RDS")

#file.ou.PGS.RData  = paste0(dir, TISS, ".", CHR, ".PGS.", GENE2eQTL.WIND/1000, "kb.", JOB.i, ".RData")


################################################################
#samps = read.table(file.in.sampWithGenotype, sep=" ", header=F, row.names = NULL, stringsAsFactors = F)[,2]
eGene.info = read.table(gzfile(file.in.eGene), sep="\t", header = T, row.names = 1, stringsAsFactors = F)
eGenes = rownames(eGene.info)[eGene.info$qval<=0.05]

## hg38 peaks
samps.all = system(paste0("bcftools query -l ", file.in.totalVCF), intern = T)
#samps.train = rownames(read.table(file.in.inds.train, sep="\t", row.names=1, header = F))

load(file.in.rmCov.RData)
samps.exp = colnames(Y.rmCov)
eQTL.filt.byGene = readRDS(file.in.eqtl.filt.RDS)
  
genes.start = round((JOB.i-1)*(length(eGenes)/JOB.N)) +1
genes.end =   round(JOB.i*(length(eGenes)/JOB.N))
if(JOB.i == JOB.N) genes.end = length(eGenes)


gene.genotypeAndModelAndEst_glmnet=lapply(eGenes[genes.start:genes.end],
FUN=function(gene)
{
  print(gene)
  snps.filt = eQTL.filt.byGene[[gene]]$variant_id
  
  
  gene.tss = genes.TSS[gene,3]
  gene.lb = max(1, gene.tss-GENE2eQTL.WIND)
  gene.ub = gene.tss + GENE2eQTL.WIND
  
  gene.chr= genes.TSS[gene,1]
  
  buf = system(paste0("tabix ", file.in.totalVCF, " ", gene.chr, ":", gene.lb, "-", gene.ub), intern=T)
  if(length(buf)==0)
  {
    return(NA)
  }
  names(buf) = sapply(buf,
  FUN=function(b)
  {
    snp.info = unlist(strsplit(b, split="\t"))
    snp.id = snp.info[3]
  })
  buf.filt = buf[intersect(names(buf), snps.filt)]
  if(length(buf.filt)==0)
  {
    return(NA)
  }
  
  snps.AC = sapply(buf.filt,
  FUN=function(b)
  {
    snp.info = unlist(strsplit(b, split="\t"))
    
    snp.AC = sapply(snp.info[10:length(snp.info)],
    FUN=function(info)
    {
      sum(as.numeric(unlist(strsplit(unlist(strsplit(info, split=":"))[1],split="/"))))
    })
    if(length(levels(factor(snp.AC[!is.na(snp.AC)])))==1)
    {
      return(rep(NA, length(snp.AC)))
    }    
    return(snp.AC)
  })
  # colnames(snps.AC) =  sapply(buf,
  # FUN=function(b)
  # {
  #   snp.info = unlist(strsplit(b, split="\t"))
  #   snp.id = snp.info[3]
  # })
  rownames(snps.AC) = samps.all
  
  
  #impute and filter
  snps.AC.noNAPerct = apply(snps.AC, 2, 
  FUN=function(xi)
  {
    1-sum(is.na(xi))/length(xi)
  })
  snps.AC.filt = cbind(snps.AC[, snps.AC.noNAPerct>=GENOTYPE.PERCT.CUTOFF])
  colnames(snps.AC.filt) = colnames(snps.AC)[snps.AC.noNAPerct>=GENOTYPE.PERCT.CUTOFF]
  
  if(ncol(snps.AC.filt)==0)
  {
    return(NA)
    
  }
  snps.AC.filt.impute=apply(snps.AC.filt, 2, 
  FUN=function(xi)
  {
    xi[is.na(xi)]= mean(xi, na.rm=T)
    return(xi)
  })
  
  
  y.train = Y.rmCov[gene, samps.exp]
  X.train= cbind(snps.AC.filt.impute[samps.exp,])
 
  #single SNP view
  #
  cors = cor(y.train, X.train)
  X.topSNPid = colnames(X.train)[which.max(abs(cors))[1]]
  X.top= X.train[, X.topSNPid]
  mod=lm(y~., data.frame(y=y.train, X.top=X.top))
  y.est_leadeQTL.all = predict(mod, data.frame(X.top=snps.AC.filt.impute[,X.topSNPid]))
  #
  #foldid=sample(1:10,size=length(y),replace=TRUE)
  #cv1=cv.glmnet(x=x, y=y, foldid=foldid,alpha=1)
  if(ncol(X.train)==1)
  {
    mod = lm(y.train~X.train)
    cvfit.coef = cbind(summary(mod)$coef[,"Estimate"])
    rownames(cvfit.coef)[-1] = colnames(snps.AC.filt.impute)
  }else
  {
    cvfit=cv.glmnet(x=X.train, y=y.train, alpha=GLMNET.ALPHA)
  
    cvfit.coef = matrix(coef(cvfit, s = "lambda.min"))
    rownames(cvfit.coef) = rownames(coef(cvfit, s = "lambda.min"))
  }
  
  y.est_PGS.all = cbind(1,snps.AC.filt.impute) %*% cvfit.coef
  
  return(list(X.impute=snps.AC.filt.impute,
              cvfit.coef = cvfit.coef,
              y.est_PGS.all = y.est_PGS.all,
              y.est_leadeQTL.all = y.est_leadeQTL.all
              ))
})

names(gene.genotypeAndModelAndEst_glmnet)=eGenes[genes.start:genes.end]


#mutate(gene.tis, histone.tis)

save(gene.genotypeAndModelAndEst_glmnet,
     file=f.tmp.RData) 
#eqtl.tab.filt,
#     hqtl.tab.filt,
#     g.pks,
     
#output.bgzip(out.tab, OUT.FILE)
#log.msg('Done')