#clump the GWAS signal 
# in hg38
#

library(data.table)

options(scipen=999)

GWAS.P.CUTOFF=1e-6
CLUMP.LD.R2.CUTOFF=0.2
CLUMP.DIS.CUTOFF=250

file.in.GWAS= "~/lhou.compbio/data/GWAS/PGC_leaveMayoBDout/BDlooMAYO_PGC3_hg38.sorted.bed.gz" #"~/lhou.compbio/data/GWAS/pgc_bipolar/pgc_bip_hg38.sorted.bed.gz"#"~/lhou.compbio/data/GWAS/AD/AD_Bellenguez_NG2022/GCST90027158_buildGRCh38.tsv.gz"
files.in.BDMayo.genotype=dir("~/lhou.compbio/data/Mayo_Bipolar/WGS/plink_locID_hg38/", "bim", full.names=T) #hg19
files.in.BDMayo.genotype=gsub(".bim", "", files.in.BDMayo.genotype)
names(files.in.BDMayo.genotype)=basename(files.in.BDMayo.genotype)
# # files.in.GTEx.genotype=dir("/broad/compbio/data/GTEx/GTEx_restricted/v8_plink/plink/", "bed", full.names=T)
# # files.in.GTEx.genotype=gsub(".bed", "", files.in.GTEx.genotype)
# # names(files.in.GTEx.genotype)=basename(files.in.GTEx.genotype)
# file.in.GTEx.lookup= "/broad/compbio/data/GTEx/GTEx_restricted/GTEx_Analysis_2017-06-05_v8/genotypes/WGS/variant_calls/GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz"

dir.ou="a_GWAS_clump/BD_PGC_bip/"
dir.create(dir.ou, showWarnings=F, recursive=T)
file.ou.filt.GWAS=paste0(dir.ou, "BDlooMAYO_PGC3_hg38.sorted.P", GWAS.P.CUTOFF, ".tsv")
file.ou.clump=paste0(dir.ou, "BDlooMAYO_PGC3_hg38.R2", CLUMP.LD.R2.CUTOFF, ".dis", CLUMP.DIS.CUTOFF, "kb")
file.ou.clump.bed=paste0(dir.ou, "BDlooMAYO_PGC3_hg38.P", GWAS.P.CUTOFF, ".R2", CLUMP.LD.R2.CUTOFF, ".dis", CLUMP.DIS.CUTOFF, "kb.hg38.bed")


#GWAS with GTEx variants id
gwas.df=fread(file.in.GWAS, data.table=F, sep="\t", head=T,
             colClasses=c("character", rep("numeric",2), 
                            rep("character", 3),
                            rep("numeric", 8)))
gwas.pFilt.df=gwas.df[gwas.df$P<=GWAS.P.CUTOFF,]
colnames(gwas.pFilt.df)[1]="chr"
gwas.pFilt.df$loc_id=paste(gwas.pFilt.df$chr, gwas.pFilt.df$stop, gwas.pFilt.df$a1, gwas.pFilt.df$a2, sep="_")
gwas.pFilt.flip.df=gwas.pFilt.df
gwas.pFilt.flip.df$loc_id=paste(gwas.pFilt.flip.df$chr, gwas.pFilt.flip.df$stop, gwas.pFilt.flip.df$a2, gwas.pFilt.flip.df$a1, sep="_")
gwas.pFilt.cmb.df=rbind(gwas.pFilt.df, gwas.pFilt.flip.df)
#variants in 1KG
snps.BDMayo=unlist(lapply(files.in.BDMayo.genotype,
FUN=function(f.i.pref)
{
    snps=read.table(paste0(f.i.pref, ".bim"), sep="\t", header=F, row.names=NULL, stringsAsFactors=F)[,2]
    # snps.dup=snps[duplicated(snps)]
    # print(length(snps.dup))
    # setdiff(snps, snps.dup)

}))
gwas.pFilt.cmb.filt.df=gwas.pFilt.cmb.df[gwas.pFilt.cmb.df$loc_id %in% snps.BDMayo, ]
#colnames(gwas.pFilt.cmb.filt.df)[10]="P"
colnames(gwas.pFilt.cmb.filt.df)[4]="rs"
colnames(gwas.pFilt.cmb.filt.df)[15]="SNP" #for clumping
rownames(gwas.pFilt.cmb.filt.df)=gwas.pFilt.cmb.filt.df$SNP
write.table(gwas.pFilt.cmb.filt.df, file=file.ou.filt.GWAS, sep="\t", quote=F, row.names=F)

#clump 
#clumping
#may need LD estimated from elsewhere
for(chr in paste0("chr", 1:22))
{
    print(chr)
    cmd =paste0("plink  --bfile ", files.in.BDMayo.genotype[chr], 
            " --clump-p1 1",
            " --clump-r2 ", CLUMP.LD.R2.CUTOFF, 
            " --clump-kb ", CLUMP.DIS.CUTOFF, #kb
            " --clump ", file.ou.filt.GWAS, #By default, variant IDs are expected to be in the 'SNP' column. You can change this with the --clump-snp-field flag, which takes a space-delimited sequence of field names to search for. With multiple field names, earlier names take precedence over later ones.
#By default, p-values are expected to be in the 'P' column; change this with --clump-field. This has the same semantics as --clump-snp-field
          #  " --clump-snp-field rs", 
          #  " --clump-field p", 
            " --out ", paste0(file.ou.clump, ".", chr))
    system(cmd)  
}


#summary
write("#chr\tstart\tend\tSNP.loc_ID\trsID\ttagged.leadSNP\tP\tbeta\tse\tA1AF", file.ou.clump.bed)
files.in.clump=system(paste0("find ", file.ou.clump, "*.clumped"), intern=T)
for(f.i in files.in.clump)
{
    print(f.i)
    d=read.table(f.i, header=T, row.names=NULL, stringsAsFactors=F)

    d.list=lapply(1:nrow(d),
    FUN=function(i)
    {
        snp.lead = d[i, "SNP"]
        snps= unlist(strsplit(d[i, "SP2"], split=","))
        snps=setdiff(snps, "NONE")
        snps=gsub("(1)", "", snps, fixed=T)

        snps.all=c(snp.lead, snps)

        df=data.frame(
            chr=gwas.pFilt.cmb.filt.df[snps.all, "chr"],
            start=gwas.pFilt.cmb.filt.df[snps.all, "start"],
            end=gwas.pFilt.cmb.filt.df[snps.all, "stop"],
            SNP.loc_ID=gwas.pFilt.cmb.filt.df[snps.all, "SNP"],
            rsID=gwas.pFilt.cmb.filt.df[snps.all, "rs"],
            tagged.leadSNP=snp.lead,
            P=format(gwas.pFilt.cmb.filt.df[snps.all, "P"], scientific = T),
            beta=gwas.pFilt.cmb.filt.df[snps.all, "BETA"],
            se=gwas.pFilt.cmb.filt.df[snps.all, "SE"],
            A1AF=gwas.pFilt.cmb.filt.df[snps.all, "A1AF"],
            stringsAsFactors=F
            )
    })
    d.df=do.call(rbind, d.list)

    write.table(d.df, file.ou.clump.bed, sep="\t", quote=F, row.names=F, col.names=F, append=T)

}
d=read.table(file.ou.clump.bed, sep="\t", head=T, row.names=NULL, stringsAsFactors=F, comment.char="")