/*
 * The Broad Institute
 * SOFTWARE COPYRIGHT NOTICE AGREEMENT
 * This is copyright (2007-2009) by the Broad Institute/Massachusetts Institute
 * of Technology.  It is licensed to You under the Gnu Public License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *    http://www.opensource.org/licenses/gpl-2.0.php
 *
 * This software is supplied without any warranty or guaranteed support
 * whatsoever. Neither the Broad Institute nor MIT can be responsible for its
 * use, misuse, or functionality.
 */








package org.broad.igv.feature;

//~--- JDK imports ------------------------------------------------------------

import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;

/*
 *
chromosome1     GeneDB  CDS_parts       1       5662    .       -       .       mRNA SPAC212.11 ; Alias "tlh1" ; systematic_id "SPAC212.11" ; colour "2" ; controlled_curation "term=transcriptional repression dependent on heterochromatin and involves contributions from Chp1, Taz1 and Rik1 complex subunits in independent pathways; db_xref=PMID:16407326; date=20060327 ||| term=present at telomere ends; db_xref=PMID:15591066; date=20060327 ||| term=contains dh repeats; db_xref=PMID:15591066; date=20060327 ||| term=similar to S. pombe SPBCPT2R1.08c; date=19700101 ||| term=conserved fungal family; date=20070913 ||| term=no apparent S. cerevisiae ortholog; date=20070913" ; GO "GO:0043140; ATP-dependent 3' to 5' DNA helicase activity  GO:0000722; telomere maintenance via recombination  GO:0000781; chromosome, telomeric region  GO:0005634; nucleus" ; gene "SPAC212.11 ||| tlh1" ; primary_name "tlh1" ; product "RecQ type DNA helicase" ;
chromosome1     GeneDB  CDS     1       5662    .       -       .       mRNA SPAC212.11 ; colour "2" ;
chromosome1     GeneDB  5'-UTR  5663    7265    .       -       .       mRNA SPAC212.11 ; db_xref "EMBL:BK005597" ; gene "SPAC212.11 ||| tlh1" ;
chromosome1     GeneDB  CDS_parts       5726    6331    .       -       .       mRNA SPAC212.10 ; systematic_id "SPAC212.10" ; pseudo "_no_value" ; colour "13" ; controlled_curation "term=truncated C at terminal; date=19700101" ; gene "SPAC212.10" ; product "pseudo- malic acid transport protein" ; note "removed as part of UTR of SPAC212.11 and truncted with respect to paralogs" ;
 * */

/**
 * Parses a GFF feature file.
 * <p>
 * Each data line is split into its tab-delimited columns and the attribute column is read as
 * semicolon-separated <code>key value</code> pairs.  Lines that resolve to the same identifier
 * are merged into a single {@link BasicFeature}; rows whose type is listed in
 * {@link #geneParts} are attached to that feature as exons.
 */
public class GFFParser extends AbstractFeatureParser {

    /** Minimum number of tab-delimited columns a GFF data line must have. */
    private static final int REQUIRED_COLUMNS = 9;

    /**
     * Size of the token buffer handed to <code>ParsingUtils.split</code> for a data line.
     * NOTE(review): how the helper treats columns beyond this capacity is not visible here.
     */
    private static final int COLUMN_BUFFER_SIZE = 10;

    /**
     * Size of the token buffer handed to <code>ParsingUtils.split</code> for the attribute
     * column.
     * NOTE(review): presumably attributes beyond this capacity are not parsed -- confirm
     * against ParsingUtils.split.
     */
    private static final int ATTRIBUTE_BUFFER_SIZE = 50;

    /**
     * Features created so far, keyed by identifier, so that later lines with the same
     * identifier augment the existing feature instead of creating a new one.
     * NOTE(review): this cache is never cleared in this class; confirm that a parser instance
     * is not reused across files.
     */
    Map<String, BasicFeature> featureCache = new HashMap<String, BasicFeature>();

    /** GFF feature types (column 3) that are added to their feature as exons. */
    static HashSet<String> geneParts = new HashSet<String>();
    static
    {
        geneParts.add("CDS");
        geneParts.add("5'-UTR");
        geneParts.add("3'-UTR");
        geneParts.add("transcript");
    }


    /**
     * Create a feature from a single line of the GFF file.
     *
     * GFF files typically represent genes as a collection of exon, utr, and in some cases intron
     * features.  The current IGV model for genes is based on the UCSC model.   A gene is a single
     * feature with exons represented as "FeatureRegions".  To conform to this model a single
     * gene feature is created for all gene-part lines (see {@link #geneParts}) that resolve to
     * the same identifier.  The first line seen for an identifier creates the feature and is
     * returned; later lines with that identifier only augment the cached feature.
     *
     * @param line one line of the GFF file
     * @return the newly created feature, or <code>null</code> if the line is a comment or
     *         directive, has fewer than 9 columns, or only augmented an existing feature
     * @throws NumberFormatException if the start or end column is not an integer
     */
    @Override
    protected Feature parseLine(String line) {

        if (line.startsWith("#"))
        {
            // Comment and directive lines (including "##gff-version") produce no feature.
            // TODO -- record the version given by the "##gff-version" directive
            return null;
        }

        // Quotes are stripped from the whole line before it is split into columns.
        String[] tokens = new String[COLUMN_BUFFER_SIZE];
        int nTokens = ParsingUtils.split(line.replace("\"", ""), tokens, '\t');

        // GFF files have 9 columns
        if (nTokens < REQUIRED_COLUMNS)
        {
            return null;
        }

        String chromosome = tokens[0].trim().intern();
        String featureType = tokens[2].trim();

        // GFF coordinates are 1 based, and inclusive (includes end).  So subtract 1 from the
        // start, but leave the end unchanged (subtract 1 then add it back as UCSC coordinates
        // are exclusive).
        int start = Integer.parseInt(tokens[3].trim()) - 1;
        int end = Integer.parseInt(tokens[4].trim());

        Strand strand = strandFromGffColumn(tokens[6]);
        String phaseString = tokens[7].trim();

        String description = tokens[8];
        Map<String, String> attributes = parseDescription(description);

        // TODO -- if no identifier attribute is present, fall back to the first attribute
        String identifier =
                firstPresentAttribute(attributes, "id", "systematic_id", "transcript_id", "gene");

        Feature returnValue = null;
        BasicFeature f = ((identifier == null) ? null : featureCache.get(identifier));
        if (f == null)
        {
            f = new BasicFeature(chromosome, start, end, strand);
            f.setName(identifier);
            f.setIdentifier(identifier);
            f.setDescription(description.replace(";", "<br>"));
            if (identifier != null)
            {
                featureCache.put(identifier, f);
            }
            // Only a line that creates a feature is reported back to the caller.
            returnValue = f;
        }

        // Try to find a reasonable name for this feature.  The attributes below
        // are based on a single example.  TODO -- see if GFF spec has some
        // suggested name attribute
        String name = firstPresentAttribute(attributes, "alias", "gene", "primary_name", "locus");
        if (name != null)
        {
            f.setName(name);
        }

        // If this row represents a gene part create an exon
        if (geneParts.contains(featureType))
        {
            addGenePartExon(f, featureType, chromosome, start, end, strand, phaseString);
        }
        return returnValue;
    }

    /** Maps the GFF strand column to a Strand; anything other than "+" or "-" is NONE. */
    private static Strand strandFromGffColumn(String strandColumn) {
        String strandString = strandColumn.trim();
        if (strandString.equals("-"))
        {
            return Strand.NEGATIVE;
        }
        if (strandString.equals("+"))
        {
            return Strand.POSITIVE;
        }
        return Strand.NONE;
    }

    /** Returns the value of the first of the given keys that is present, or null if none is. */
    private static String firstPresentAttribute(Map<String, String> attributes, String... keys) {
        for (String key : keys)
        {
            String value = attributes.get(key);
            if (value != null)
            {
                return value;
            }
        }
        return null;
    }

    /** Adds an exon for a gene-part row to the feature and widens the feature to cover it. */
    private static void addGenePartExon(BasicFeature f, String featureType, String chromosome,
                                        int start, int end, Strand strand, String phaseString) {
        Exon exon = new Exon(chromosome, start, end, strand);
        f.addExon(exon);

        // Extend gene extent if necessary
        f.setStart(Math.min(f.getStart(), exon.getStart()));
        f.setEnd(Math.max(f.getEnd(), exon.getEnd()));

        // Flag UTR rows on the exon itself
        if (featureType.equals("5'-UTR") || featureType.equals("3'-UTR"))
        {
            exon.setUTR(true);
        }

        if ((phaseString.length() > 0) && !phaseString.equals("."))
        {
            try
            {
                int phase = Integer.parseInt(phaseString);
                if ((phase >= 0) && (phase <= 2))
                {
                    exon.setPhase(phase);
                }
            }
            catch (NumberFormatException e)
            {
                // apparently not a phase; the exon is left without one
            }
        }
    }

    /**
     * Parses the attribute column into a map of lower-cased key to value.
     * <p>
     * The column is split on ';' and each piece is split on whitespace: the first word is the
     * key, the remaining words (quotes removed) are joined with single spaces to form the
     * value.  Pieces with fewer than two words, or whose value is empty, are skipped.
     * <p>
     * Known limitations: a ';' inside a quoted value is treated as a separator, and attributes
     * written as <code>key=value</code> without whitespace are ignored.
     *
     * @param description the raw attribute column
     * @return the parsed attributes; never null
     */
    private Map<String, String> parseDescription(String description) {
        Map<String, String> kvalues = new HashMap<String, String>();

        String[] kvPairs = new String[ATTRIBUTE_BUFFER_SIZE];
        int nTokens = ParsingUtils.split(description.trim(), kvPairs, ';');

        for (int k = 0; k < nTokens; k++)
        {
            String kv = kvPairs[k].trim();
            if (kv.length() == 0)
            {
                continue;
            }

            String[] words = kv.split("\\s+");
            if (words.length < 2)
            {
                continue;
            }

            // Lower-case with a fixed locale so that keys such as "ID" always become "id",
            // whatever the default locale of the JVM is.
            String key = words[0].replace("\"", "").toLowerCase(Locale.ENGLISH);

            StringBuilder value = new StringBuilder();
            for (int i = 1; i < words.length; i++)
            {
                String word = words[i].replace("\"", "");
                if (word.length() == 0)
                {
                    continue;
                }
                if (value.length() > 0)
                {
                    value.append(' ');
                }
                value.append(word);
            }

            // No trailing blank is appended, so identifiers and names are clean strings.
            if (value.length() > 0)
            {
                kvalues.put(key, value.toString());
            }
        }
        return kvalues;
    }

    /**
     * Sorts the exons of every parsed feature.
     *
     * @param features the features returned by {@link #parseLine(String)}; each element is
     *                 cast to {@link BasicFeature}
     */
    @Override
    protected void parsingComplete(List<Feature> features) {
        for (Feature f : features)
        {
            ((BasicFeature) f).sortExons();
        }

    }

}
