/*
 * The Broad Institute
 * SOFTWARE COPYRIGHT NOTICE AGREEMENT
 * This is copyright (2007-2009) by the Broad Institute/Massachusetts Institute
 * of Technology.  It is licensed to You under the Gnu Public License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *    http://www.opensource.org/licenses/gpl-2.0.php
 *
 * This software is supplied without any warranty or guaranteed support
 * whatsoever. Neither the Broad Institute nor MIT can be responsible for its
 * use, misuse, or functionality.
 */
/*
 * GCTDatasetParser.java
 *
 * Parser for GCT and related file formats (e.g. RES).
 *
 * Created on October 18, 2007, 2:33 PM
 *
 * To change this template, choose Tools | Template Manager
 * and open the template in the editor.
 */
package org.broad.igv.data;

//~--- non-JDK imports --------------------------------------------------------
import org.broad.igv.preprocess.old.StatusMonitor;
import org.broad.igv.preprocess.old.ProbeListParser;
import org.broad.igv.feature.*;
import org.broad.igv.util.ResourceLocator;

import org.broad.igv.track.TrackManager;
import org.broad.igv.track.TrackType;


import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;

import java.util.*;
import org.broad.igv.util.AsciiLineReader;
import cern.colt.list.ObjectArrayList;
import org.broad.igv.ui.IGVModel;

/**
 *  TODO -- handle case with probe file
 * @author jrobinso
 */
public class GCTDatasetParser {

    /** Opening delimiter of an explicit locus embedded in a description, e.g. {@code |@chrX:1000-2000|}. */
    static String LOCUS_START_DELIMITER = "|@";
    /** Closing delimiter of an explicit locus embedded in a description. */
    static String LOCUS_END_DELIMITER = "|";

    /** File layouts distinguished by {@code parse()}. */
    enum FileType {

        RES, GCT, MAPPED, TAB, MET, DCHIP, MAGE_TAB
    }
    /** Location of the data file to parse. */
    ResourceLocator dataFileLocator;
    /** Optional probe -> gene symbol mapping file (legacy option); may be null. */
    File probeFile;
    /** Layout of the data file, determined at the start of {@code parse()}. */
    FileType type;
    /** Index of the first column holding data values. */
    int dataStartColumn;
    /** Index of the column holding the probe identifier. */
    int probeColumn;
    /** Index of the description column, or -1 if the layout has none. */
    int descriptionColumn;
    /** Identifier of the genome the data refers to. */
    String genome;
    /** Probe set loaded from {@code probeFile}; stays null when no probe file is given. */
    ProbeSet probeSet;
    /**
     * Flag to record server connect failures.  Cleared the first time the
     * probe -> gene lookup throws, so the lookup is not attempted again.
     */
    boolean canConnectToServer = true;
    /**
     * Flag presumably meant to record whether the user probe mapping file can
     * be loaded; it is not read anywhere in the visible code of this class.
     */
    boolean canLoadProbeMapping = true;
    /** Map column heading -> index for efficient reverse lookup */
    private Map<String, Integer> headingIndexMap = new HashMap<String, Integer>();
    /** Map chromosome -> data rows located on that chromosome. */
    Map<String, List<Row>> rowMap = new HashMap<String, List<Row>>();
    /** Optional progress monitor; may be null. */
    StatusMonitor statusMonitor;
    /** Gene manager for {@code genome}; assigned in the constructor. */
    GeneManager geneManager = null;

    /**
     * Creates a new instance of GCTDatasetParser that reports progress to a
     * status monitor.  Delegates to the three-argument constructor and then
     * records the monitor.
     *
     * @param resFile       locator of the data file to parse
     * @param probeFile     optional probe mapping file, may be null
     * @param genome        identifier of the genome the data refers to
     * @param statusMonitor monitor to be notified of parsing progress
     */
    public GCTDatasetParser(ResourceLocator resFile, File probeFile, String genome,
            StatusMonitor statusMonitor) {
        this(resFile, probeFile, genome);
        this.statusMonitor = statusMonitor;
    }

    /**
     * Constructs ...
     *
     *
     * @param resFile
     * @param probeFile
     * @param genome
     */
    public GCTDatasetParser(ResourceLocator resFile, File probeFile, String genome) {
        this.dataFileLocator = resFile;
        this.probeFile = probeFile;
        this.genome = genome;
        if (IGVModel.getInstance().getViewContext().getGenomeId() != null &&
                IGVModel.getInstance().getViewContext().getGenomeId().equals(genome)) {
            this.geneManager = TrackManager.getInstance().getGeneManager();
        } else {
            this.geneManager = GeneManager.getGeneManager(genome);
        }
    }

    /**
     * Convenience constructor for a data file on the local file system.  The
     * file's absolute path is wrapped in a ResourceLocator and passed to the
     * locator-based constructor.
     *
     * @param resFile   the data file to parse
     * @param probeFile optional probe mapping file, may be null
     * @param genome    identifier of the genome the data refers to
     */
    public GCTDatasetParser(File resFile, File probeFile, String genome) {
        this(new ResourceLocator(resFile.getAbsolutePath()), probeFile, genome);
    }

    /**
     * Tests whether a file looks like a MAGE-TAB data matrix that this class
     * can parse.  The first row is skipped; the second row must contain one of
     * the known MAGE-TAB "REF" identifiers.  The following rows are then
     * inspected and the file is rejected if one of them starts with
     * {@code SNP_A} or {@code CN_}.
     *
     * @param file locator of the file to inspect
     * @return true if the file is a parsable MAGE-TAB matrix; false otherwise,
     *         including when the file cannot be read
     */
    public static boolean parsableMAGE_TAB(ResourceLocator file) {
        final String[] mageTabMarkers = {
            "Reporter REF", "Composite Element REF", "Term Source REF",
            "CompositeElement REF", "TermSource REF", "Coordinates REF"
        };

        AsciiLineReader reader = null;
        try {
            reader = ParsingUtils.openAsciiReader(file);

            // The first row is never inspected
            reader.readLine();

            // The second row must carry one of the MAGE_TAB identifiers
            String identifierRow = reader.readLine();
            boolean hasMarker = false;
            if (identifierRow != null) {
                for (String marker : mageTabMarkers) {
                    if (identifierRow.contains(marker)) {
                        hasMarker = true;
                        break;
                    }
                }
            }
            if (!hasMarker) {
                return false;
            }

            // Check if this mage_tab data matrix can be parsed by this class
            int inspected = 0;
            String row;
            while ((row = reader.readLine()) != null && inspected < 5) {
                String trimmed = row.trim();
                if (trimmed.startsWith("SNP_A") || trimmed.startsWith("CN_")) {
                    return false;
                }
                inspected++;
            }
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        } finally {
            if (reader != null) {
                reader.close();
            }
        }
    }

    /**
     * Parse the file and return a Dataset.
     *
     * The file layout is chosen from the file name (after stripping a trailing
     * .txt, .tab, .xls or .gz) or, for MAGE_TAB, from the locator description.
     * Each data line becomes zero or more rows keyed by chromosome; the rows
     * are sorted by start location and copied into the dataset.
     *
     * @return the parsed dataset, or null if parsing was interrupted, the file
     *         has no header line, or a MAGE_TAB file's quantitation row is
     *         missing or does not match the column headings.  If an I/O error
     *         occurs the stack trace is printed and the dataset is returned as
     *         populated up to that point.
     */
    public GCTDataset parse() {

        // Create a buffer for the string split utility.  We use  a custom utility as opposed
        // to String.split() for performance.
        String[] tokens = new String[10000];

        String fn = dataFileLocator.getPath().toLowerCase();
        if (fn.endsWith(".txt") || fn.endsWith(".tab") || fn.endsWith(".xls") || fn.endsWith(".gz")) {
            fn = fn.substring(0, fn.lastIndexOf("."));
        }

        configureFileType(fn);

        boolean hasCalls = (type == FileType.RES);
        boolean hasDescription = (descriptionColumn >= 0);

        // A legacy option,  probeFile is a mapping of probe -> gene symbol
        if (probeFile != null) {
            probeSet = ProbeListParser.parseFile(probeFile);
        }

        GCTDataset dataset = new GCTDataset(genome);
        //Assume data is gene expression for now
        dataset.setType(TrackType.GENE_EXPRESSION);

        AsciiLineReader reader = null;
        try {
            reader = ParsingUtils.openAsciiReader(dataFileLocator);

            String nextLine = null;
            String headerLine = readHeaderLine(reader, dataset);
            if (headerLine == null) {
                // Empty or truncated file: there are no column headings to parse
                return null;
            }

            // Parse column headings.  In RES files every value column is
            // followed by a call column, hence the stride of 2.
            int skip = hasCalls ? 2 : 1;
            int nTokens = ParsingUtils.split(headerLine, tokens, '\t');

            int nColumns = (nTokens - dataStartColumn) / skip;
            ObjectArrayList columnHeadingsObj = new ObjectArrayList();
            for (int i = 0; i < nColumns; i++) {
                String heading = tokens[dataStartColumn + i * skip].replace('\"', ' ').trim();
                if (type == FileType.MAGE_TAB) {
                    // MAGE_TAB repeats a heading for each of its sub-columns; keep it once
                    if (!columnHeadingsObj.contains(heading, true)) {
                        columnHeadingsObj.add(heading);
                        headingIndexMap.put(heading, columnHeadingsObj.size() - 1);
                    }
                } else {
                    columnHeadingsObj.add(heading);
                    headingIndexMap.put(heading, i);
                }
            }

            String[] columnHeadings = (String[]) columnHeadingsObj.toArray(new String[0]);
            dataset.setColumnHeadings(columnHeadings);

            nColumns = columnHeadings.length;

            //parse quantitation type column header
            IntArrayList valuesIndices = new IntArrayList(nColumns);
            if (type == FileType.MAGE_TAB) {
                nextLine = reader.readLine();
                if (nextLine == null) {
                    // The quantitation type row is missing
                    return null;
                }
                nTokens = ParsingUtils.split(nextLine, tokens, '\t');
                for (int i = dataStartColumn; i < nTokens; i++) {
                    String heading = tokens[i].replace('\"', ' ').trim();

                    //Check for tcga data column headings
                    if (heading.contains("Beta value") || heading.contains("log2 Signal") || heading.contains("Signal") || heading.contains("unc_DWD_Batch_adjusted")) {
                        valuesIndices.add(i);
                    }
                    if (heading.contains("Gene symbol")) {
                        descriptionColumn = i;
                        hasDescription = true;
                    }
                }

                // Exactly one value column is required per column heading
                if (nColumns != valuesIndices.size()) {
                    return null;
                }
            }

            // If format is RES skip the two lines following the header
            if (type == FileType.RES) {
                reader.readLine();
                reader.readLine();
            }

            int lineCount = 0;

            while ((nextLine = reader.readLine()) != null) {
                nTokens = ParsingUtils.split(nextLine, tokens, '\t');
                if (nTokens <= probeColumn) {
                    // Blank or truncated line: the token buffer holds no probe id
                    // for this line (only stale content from earlier lines).
                    continue;
                }

                // Copy the tokens that are retained beyond this iteration
                String probeId = new String(tokens[probeColumn]);
                float[] values = new float[nColumns];
                char[] calls = hasCalls ? new char[nColumns] : (char[]) null;

                String description = (hasDescription && (nTokens > descriptionColumn))
                        ? new String(tokens[descriptionColumn]) : null;

                if (type == FileType.MAGE_TAB) {
                    if (probeId.startsWith("cg")) {
                        dataset.setType(TrackType.DNA_METHYLATION);
                    }

                    // Convert to description mapping format.  This is done once per
                    // row, so the delimiters are not nested once per data column.
                    if (description != null) {
                        description = LOCUS_START_DELIMITER + description + LOCUS_END_DELIMITER;
                    }
                }

                for (int i = 0; i < nColumns; i++) {
                    int dataIndex = (type == FileType.MAGE_TAB)
                            ? valuesIndices.get(i)
                            : dataStartColumn + i * skip;

                    // If we are out of value tokens, or the cell is blank, assign NAN to the cell.
                    if ((dataIndex >= nTokens) || (tokens[dataIndex].length() == 0)) {
                        values[i] = Float.NaN;
                    } else {
                        try {
                            values[i] = Float.parseFloat(tokens[dataIndex]);
                        } catch (NumberFormatException numberFormatException) {

                            // This is an expected condition.  IGV uses NaN to
                            // indicate non numbers (missing data values)
                            values[i] = Float.NaN;
                        }
                    }

                    // Calls are recorded but not used further.  A missing or
                    // empty call cell leaves the default char in place.
                    if (hasCalls) {
                        int callIndex = 3 + i * skip;
                        if ((callIndex < nTokens) && (tokens[callIndex].length() > 0)) {
                            calls[i] = tokens[callIndex].charAt(0);
                        }
                    }
                }
                addRow(probeId, description, values, calls);
                lineCount++;

                // This method is designed to be interruptable (canceled by
                // user).  Check every 1000 lines for an interrupt.
                if (lineCount == 1000) {
                    checkForInterrupt();
                    lineCount = 0;
                    if (statusMonitor != null) {
                        statusMonitor.incrementStatus(1);
                    }
                }
            }    // End loop through lines

            // Sort row for each chromosome by start location
            sortRows();

            // Update dataset
            for (String chr : rowMap.keySet()) {
                dataset.setStartLocations(chr, getStartLocations(chr));
                dataset.setEndLocations(chr, getEndLocations(chr));
                dataset.setFeatureNames(chr, getProbes(chr));
                for (String heading : columnHeadings) {
                    dataset.setData(heading, chr, getData(heading, chr));
                }
            }
        } catch (FileNotFoundException ex) {
            ex.printStackTrace();
        } catch (IOException ex) {
            ex.printStackTrace();
        } catch (InterruptedException e) {
            return null;
        } finally {
            if (reader != null) {
                reader.close();
            }
        }

        return dataset;

    }

    /**
     * Sets the file type and the probe, description and data start columns
     * from the (lower case, extension-stripped) file name, falling back to the
     * locator description for MAGE_TAB and to a plain tab layout otherwise.
     *
     * @param fn lower case file name without a trailing .txt/.tab/.xls/.gz
     */
    private void configureFileType(String fn) {
        descriptionColumn = -1;    // Default - no description column
        if (fn.endsWith("res")) {
            type = FileType.RES;
            dataStartColumn = 2;
            probeColumn = 1;
            descriptionColumn = 0;
        } else if (fn.endsWith("gct")) {
            type = FileType.GCT;
            dataStartColumn = 2;
            probeColumn = 0;
            descriptionColumn = 1;
        } else if (fn.endsWith("mapped")) {
            type = FileType.MAPPED;
            dataStartColumn = 4;
            probeColumn = 0;
        } else if (fn.endsWith("met")) {
            type = FileType.MET;
            dataStartColumn = 4;
            probeColumn = 0;
        } else if (fn.endsWith("dchip")) {
            type = FileType.DCHIP;
            dataStartColumn = 1;
            probeColumn = 0;
        } else if (dataFileLocator.getDescription() != null &&
                dataFileLocator.getDescription().equals("MAGE_TAB")) {
            type = FileType.MAGE_TAB;
            dataStartColumn = 1;
            probeColumn = 0;
        } else {
            type = FileType.TAB;
            dataStartColumn = 1;
            probeColumn = 0;
        }
    }

    /**
     * Consumes the lines preceding the column headings and returns the heading
     * line.  Lines starting with '#' are passed to parseComment.
     *
     * @param reader  reader positioned at the start of the file
     * @param dataset dataset that receives properties found in comment lines
     * @return the column heading line, or null if the file ends before it
     * @throws IOException if reading fails
     */
    private String readHeaderLine(AsciiLineReader reader, GCTDataset dataset) throws IOException {
        if (type == FileType.GCT) {
            // GCT files start with two fixed lines ahead of the headings
            for (int i = 0; i < 2; i++) {
                String preamble = reader.readLine();
                if ((preamble != null) && preamble.startsWith("#")) {
                    parseComment(preamble, dataset);
                }
            }
            return reader.readLine();
        }

        if (type == FileType.MAGE_TAB) {
            return reader.readLine();
        }

        // Skip meta data, if any.  The null test must come first so that
        // reaching the end of the file does not dereference a null line.
        String line;
        while (((line = reader.readLine()) != null) && line.startsWith("#")) {
            parseComment(line, dataset);
        }
        return line;
    }

    /**
     * Adds the values of one data line to the row map, once for every locus the
     * line can be mapped to.  Three sources are tried in order:
     * <ol>
     * <li>an explicit locus in the description, using the IGV convention
     *     (e.g. |@chrX:1000-2000|).  If the description contains such a
     *     specification no other source is consulted, even when none of the
     *     listed loci is valid;</li>
     * <li>the probe id itself, interpreted as a locus or gene;</li>
     * <li>the probe -> gene map, unless an earlier lookup failed.</li>
     * </ol>
     *
     * @param probeId     identifier of the probe (feature name of the row)
     * @param description description field of the line, may be null
     * @param values      data values, one per column heading
     * @param calls       call characters; not used by this method
     */
    public void addRow(String probeId, String description, float[] values, char[] calls) {

        // TODO -- simplify this with the use of a locus interface

        // 1. Locus given explicitly in the description string
        if ((description != null) && (description.length() > 3)) {
            String[] explicitLoci = getExplicitLocusStrings(description);
            if (explicitLoci != null) {
                for (String candidate : explicitLoci) {
                    Locus explicitLocus = getLocus(candidate.trim());
                    if ((explicitLocus != null) && explicitLocus.isValid()) {
                        addRow(probeId, explicitLocus, values);
                    }
                }
                return;
            }
        }

        // 2. Locus derived from the probe name itself
        Locus probeLocus = getLocus(probeId);
        if ((probeLocus != null) && probeLocus.isValid()) {
            addRow(probeId, probeLocus, values);
            return;
        }

        // 3. The probe -> gene map
        if (!canConnectToServer) {
            return;
        }
        try {
            String[] mappedLoci = ProbeToGeneMap.getInstance().getGenesForProbe(probeId);
            if (mappedLoci == null) {
                return;
            }
            for (String mapped : mappedLoci) {
                Locus mappedLocus = getLocus(mapped);
                if (mappedLocus != null) {
                    addRow(probeId, mappedLocus, values);
                }
            }
        } catch (Exception exception) {
            // Remember the failure so the lookup is not attempted again
            canConnectToServer = false;
        }
    }

    /**
     * Appends a row for the given locus to the list of rows of the locus'
     * chromosome, creating that list on first use.
     *
     * @param probeId identifier of the probe (feature name of the row)
     * @param locus   genomic location of the row
     * @param values  data values, one per column heading
     */
    private void addRow(String probeId, Locus locus, float[] values) {
        String chromosome = locus.getChr();
        List<Row> chromosomeRows = rowMap.get(chromosome);
        if (chromosomeRows == null) {
            chromosomeRows = new ArrayList<Row>();
            rowMap.put(chromosome, chromosomeRows);
        }
        Row row = new Row(probeId, chromosome, locus.getStart(), locus.getEnd(), values);
        chromosomeRows.add(row);
    }

    /**
     * Sort all row collections by ascending start location
     */
    private void sortRows() {
        Comparator<Row> byStart = new Comparator<Row>() {

            public int compare(GCTDatasetParser.Row left, GCTDatasetParser.Row right) {
                // Compare explicitly: subtracting the two ints can overflow
                // and report the wrong ordering.
                if (left.start < right.start) {
                    return -1;
                }
                return (left.start == right.start) ? 0 : 1;
            }
        };
        for (List<Row> rows : rowMap.values()) {
            Collections.sort(rows, byStart);
        }
    }

    /**
     * Returns the feature (probe) names of all rows on a chromosome, in the
     * current row order.
     *
     * @param chr chromosome name; must be a key of the row map
     *
     * @return one name per row
     */
    public String[] getProbes(String chr) {
        List<Row> chrRows = rowMap.get(chr);
        String[] names = new String[chrRows.size()];
        int next = 0;
        for (Row row : chrRows) {
            names[next++] = row.feature;
        }
        return names;
    }

    /**
     * Returns the start locations of all rows on a chromosome, in the current
     * row order.
     *
     * @param chr chromosome name; must be a key of the row map
     *
     * @return one start location per row
     */
    public int[] getStartLocations(String chr) {
        List<Row> chrRows = rowMap.get(chr);
        int[] starts = new int[chrRows.size()];
        int next = 0;
        for (Row row : chrRows) {
            starts[next++] = row.start;
        }
        return starts;
    }

    /**
     * Returns the end locations of all rows on a chromosome, in the current
     * row order.
     *
     * @param chr chromosome name; must be a key of the row map
     *
     * @return one end location per row
     */
    public int[] getEndLocations(String chr) {
        List<Row> chrRows = rowMap.get(chr);
        int[] ends = new int[chrRows.size()];
        int next = 0;
        for (Row row : chrRows) {
            ends[next++] = row.end;
        }
        return ends;
    }

    /**
     * Returns the values of one data column for all rows on a chromosome, in
     * the current row order.
     *
     * @param heading column heading; must be one of the parsed headings
     * @param chr     chromosome name; must be a key of the row map
     *
     * @return one value per row
     */
    public float[] getData(String heading, String chr) {
        int column = this.headingIndexMap.get(heading);
        List<Row> chrRows = rowMap.get(chr);

        float[] columnValues = new float[chrRows.size()];
        int next = 0;
        for (Row row : chrRows) {
            columnValues[next++] = row.values[column];
        }
        return columnValues;
    }

    /**
     * Search for a locus explicitly specified in the description field.
     * A locus can be specified either directly, as a UCSC style locus string
     * (e.g. chrX:1000-2000), or indirectly as a HUGO gene symbol (e.g. egfr).
     * The locus string is distinguished by the delimiters |@ and |.  Several
     * loci may be given, separated by commas.
     *
     * @param description the description text to search
     * @return the locus strings found between the delimiters, or null if the
     *         start delimiter is absent or the delimited text is shorter than
     *         four characters
     */
    private String[] getExplicitLocusStrings(String description) {

        int delimiterIndex = description.indexOf(LOCUS_START_DELIMITER);
        if (delimiterIndex < 0) {
            return null;
        }

        // The locus text begins right after the two-character start delimiter
        int locusStart = delimiterIndex + 2;

        int locusEnd = description.indexOf("|", locusStart + 1);
        if (locusEnd < 0) {
            // Assume the locus extends to the end of the string
            locusEnd = description.length();
        }

        if (locusEnd <= locusStart + 3) {
            return null;
        }

        String locusText = description.substring(locusStart, locusEnd);
        return locusText.contains(",") ? locusText.split(",") : new String[]{locusText};
    }

    /**
     * Resolves a string to a locus.  The string is first interpreted as a
     * locus specification; if that does not yield a valid locus and a gene
     * manager is available, the string is looked up as a feature name in the
     * feature database.
     *
     * @param locusString locus specification or feature (gene) name
     * @return the resolved locus, or null if the string could not be resolved
     */
    Locus getLocus(String locusString) {
        Locus parsed = new Locus(locusString);
        if (parsed.isValid()) {
            return parsed;
        }

        // Maybe its a gene
        if (geneManager == null) {
            return null;
        }
        Feature gene = FeatureDB.getFeature(locusString);
        if (gene == null) {
            return null;
        }
        return new Locus(gene.getChromosome(), gene.getStart(), gene.getEnd());
    }

    /**
     * Interprets a comment line.  After removal of the first character, a line
     * starting with "track" is handed to the track line parser.  Any other line
     * is expected to have the form key=value; the keys "name" and "type" set
     * the dataset name and track type, everything else is ignored.
     *
     * Note:  This is an exact copy of the method in IGVDatasetParser.  Refactor to merge these
     * two parsers, or share a common base class.
     * @param comment the comment line, including its leading marker character
     * @param dataset the dataset to update
     */
    private void parseComment(String comment, GCTDataset dataset) {

        String body = comment.substring(1);
        if (body.startsWith("track")) {
            DatasetParserUtils.parseTrackLine(body, dataset.getTrackProperties());
            return;
        }

        String[] keyValue = body.split("=");
        if (keyValue.length != 2) {
            return;
        }

        String key = keyValue[0].trim().toLowerCase();
        String value = keyValue[1].trim();
        if (key.equals("name")) {
            dataset.setName(value);
        } else if (key.equals("type")) {
            try {
                dataset.setType(TrackType.valueOf(value.toUpperCase()));
            } catch (Exception exception) {

                // Ignore
            }
        }
    }

    /**
     * Checks whether the current thread has been interrupted.  The interrupt
     * flag is tested directly rather than by sleeping, so the check adds no
     * delay.
     *
     * @throws InterruptedException if the current thread was interrupted; the
     *         thread's interrupted status is cleared when this is thrown
     */
    private void checkForInterrupt() throws InterruptedException {
        if (Thread.interrupted()) {
            throw new InterruptedException();
        }
    }

    /**
     * A single data row: a named feature, its genomic location and its data
     * values.
     */
    class Row {

        /** Name of the feature (the probe id). */
        String feature;
        /** Chromosome the feature is located on. */
        String chr;
        /** Start location of the feature. */
        int start;
        /** End location of the feature. */
        int end;
        /** Data values of the row; the array is stored as given, not copied. */
        float[] values;

        Row(String featureName, String chromosome, int startLocation, int endLocation,
                float[] rowValues) {
            feature = featureName;
            chr = chromosome;
            start = startLocation;
            end = endLocation;
            values = rowValues;
        }
    }
}
