/*
 * The Broad Institute
 * SOFTWARE COPYRIGHT NOTICE AGREEMENT
 * This is copyright (2007-2009) by the Broad Institute/Massachusetts Institute
 * of Technology.  It is licensed to You under the Gnu Public License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *    http://www.opensource.org/licenses/gpl-2.0.php
 *
 * This software is supplied without any warranty or guaranteed support
 * whatsoever. Neither the Broad Institute nor MIT can be responsible for its
 * use, misuse, or functionality.
 */
package org.broad.igv.data;

//~--- non-JDK imports --------------------------------------------------------
import org.apache.log4j.Logger;

import org.broad.igv.feature.ParsingUtils;
import org.broad.igv.util.ResourceLocator;
import org.broad.igv.track.TrackType;
import org.broad.igv.track.WindowFunction;

//~--- JDK imports ------------------------------------------------------------

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;

import java.io.InputStream;

import java.util.*;

import cern.colt.list.ObjectArrayList;
import org.broad.igv.ui.IGVMainFrame;
import org.broad.igv.util.AsciiLineReader;

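/**
 * Parser for tab-delimited, genome-ordered data matrices. The column layout is
 * inferred from the file name: ".igv", ".cn", ".xcn" and ".snp" files have their own
 * layouts, and anything else is assumed to be a MAGE-TAB data matrix.
 * <p>
 * {@code scan} makes one pass over the file to record where each chromosome starts
 * and to collect a sampled whole-genome summary; {@code loadChromosomeData} then
 * reads the rows of a single chromosome on demand.
 *
 * @version    08/11/11
 */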
public class IGVDatasetParser {

    private static final Logger log = Logger.getLogger(IGVDatasetParser.class);

    /** Trailing suffixes that are ignored when inferring the file format from its name. */
    private static final String[] IGNORED_SUFFIXES = {".txt", ".tab", ".xls"};

    /** Labels whose presence in the second row identifies a MAGE-TAB data matrix. */
    private static final String[] MAGE_TAB_IDENTIFIERS = {
        "Reporter REF", "Composite Element REF", "Term Source REF",
        "CompositeElement REF", "TermSource REF", "Coordinates REF"
    };

    /** Number of data rows inspected by {@link #parsableMAGE_TAB(ResourceLocator)}. */
    private static final int MAGE_TAB_PROBE_ROWS = 5;

    /**
     * Scratch buffer that receives the tokens of the line currently being parsed.
     * It is shared by every instance, so two parsers must not run concurrently.
     * Entries beyond the token count of the current line are stale values left over
     * from earlier lines and must never be read.
     */
    static String[] tokens = new String[10000];

    private ResourceLocator dataResourceLocator;
    /** Index of the column holding the chromosome name. */
    private int chrColumn;
    /** Index of the column holding the start location. */
    private int startColumn;
    /** Index of the column holding the end location, or -1 if the format has none. */
    private int endColumn;
    /** Index of the first column holding sample data. */
    private int firstDataColumn;
    private boolean hasEndLocations;
    /** True when every data column is followed by a second column that is skipped. */
    private boolean hasCalls;
    private String genomeId;
    private FileType type;
    /** Column indices of the "Signal" columns of a MAGE-TAB file; populated by scan(). */
    private IntArrayList valuesIndices = null;

    enum FileType {

        IGV, SNP, CN, XCN, MAGE_TAB
    }

    /**
     * Creates a parser for the given resource and works out its column layout from
     * the resource path.
     *
     * @param copyNoFile locator of the data file to parse
     * @param genomeId   genome identifier handed to the chromosome-name converter
     */
    public IGVDatasetParser(ResourceLocator copyNoFile, String genomeId) {
        this.dataResourceLocator = copyNoFile;
        this.genomeId = genomeId;
        initParameters();
    }

    /**
     * Sets the column layout and the file type from the file extension. A trailing
     * ".txt", ".tab" or ".xls" is ignored, consistent with the track-type checks.
     */
    private void initParameters() {
        String tmp = stripIgnoredSuffix(dataResourceLocator.getPath());

        if (tmp.endsWith(".igv")) {
            chrColumn = 0;
            startColumn = 1;
            endColumn = 2;
            firstDataColumn = 4;
            hasEndLocations = true;
            hasCalls = false;
            type = FileType.IGV;
        } else if (tmp.endsWith(".xcn") || tmp.endsWith(".cn") || tmp.endsWith(".snp")) {
            chrColumn = 1;
            startColumn = 2;
            endColumn = -1;
            firstDataColumn = 3;
            hasEndLocations = false;
            hasCalls = tmp.endsWith(".xcn") || tmp.endsWith(".snp");
            if (tmp.endsWith(".xcn")) {
                type = FileType.XCN;
            } else if (tmp.endsWith(".cn")) {
                type = FileType.CN;
            } else {
                type = FileType.SNP;
            }
        } else {
            // Assume this is a MAGE-TAB data matrix file.
            // NOTE(review): ".loh" files are recognised by isLOHFileExt but also end up
            // here -- confirm whether they should use the cn/snp column layout instead.
            chrColumn = 1;
            startColumn = 2;
            endColumn = -1;
            firstDataColumn = 1;
            hasEndLocations = false;
            hasCalls = false;
            type = FileType.MAGE_TAB;
        }
    }

    /**
     * Tests whether a file looks like a MAGE-TAB data matrix that this class can parse.
     * The second row must contain one of the MAGE-TAB identifier labels, and one of the
     * following five rows must start with "SNP_A" or "CN_".
     *
     * @param file locator of the file to inspect
     * @return true if the file passes both checks; false otherwise, including when the
     *         file cannot be read
     */
    public static boolean parsableMAGE_TAB(ResourceLocator file) {
        org.broad.igv.util.AsciiLineReader reader = null;
        try {
            reader = ParsingUtils.openAsciiReader(file);

            // The first row is skipped; the identifiers live in the second row
            reader.readLine();
            String identifierRow = reader.readLine();
            if (identifierRow == null || !containsMageTabIdentifier(identifierRow)) {
                return false;
            }

            for (int i = 0; i < MAGE_TAB_PROBE_ROWS; i++) {
                String row = reader.readLine();
                if (row == null) {
                    break;
                }
                row = row.trim();
                if (row.startsWith("SNP_A") || row.startsWith("CN_")) {
                    return true;
                }
            }
            return false;
        } catch (IOException e) {
            log.error("Error reading file while checking for MAGE-TAB format", e);
            return false;
        } finally {
            if (reader != null) {
                reader.close();
            }
        }
    }

    /** Returns true if the line contains any of the MAGE-TAB identifier labels. */
    private static boolean containsMageTabIdentifier(String line) {
        for (String identifier : MAGE_TAB_IDENTIFIERS) {
            if (line.contains(identifier)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Scan the datafile for chromosome breaks. Along the way the dataset receives its
     * track type and windowing function (from the extension and any comment lines), its
     * data headings, a sampled whole-genome summary, the observed data range and the
     * log-normalized flag (set when any sampled value is negative).
     *
     * @param dataset dataset to populate
     * @return one summary per run of rows sharing a chromosome, in file order, or null
     *         if a MAGE-TAB file has a different number of samples and "Signal" columns
     * @throws RuntimeException if the file cannot be read or has no heading row
     */
    public List<ChromosomeSummary> scan(IGVDataset dataset) {

        int estLineCount = ParsingUtils.estimateLineCount(dataResourceLocator.getPath());

        float dataMin = 0;
        float dataMax = 0;
        long filePosition = 0;
        InputStream is = null;
        AsciiLineReader reader = null;
        try {

            List<ChromosomeSummary> chrSummaries = new ArrayList<ChromosomeSummary>();
            int skipColumns = hasCalls ? 2 : 1;

            is = ParsingUtils.openInputStream(dataResourceLocator);
            reader = new AsciiLineReader(is);

            // Infer datatype from extension.  This can be overridden in the
            // comment section
            if (isCopyNumberFileExt(dataResourceLocator.getPath())) {
                dataset.setTrackType(TrackType.COPY_NUMBER);
                dataset.getTrackProperties().setWindowingFunction(WindowFunction.median);
            } else if (isLOHFileExt(dataResourceLocator.getPath())) {
                dataset.setTrackType(TrackType.LOH);
                dataset.getTrackProperties().setWindowingFunction(WindowFunction.median);
            } else {
                dataset.getTrackProperties().setWindowingFunction(WindowFunction.mean);
            }

            // Parse comments, if any.  Lines are read with readLine(true) wherever their
            // length feeds filePosition (presumably so that line terminators are counted).
            String nextLine = reader.readLine(true);
            while (nextLine != null && (nextLine.startsWith("#") || nextLine.trim().length() == 0)) {
                filePosition += nextLine.length();
                if (nextLine.startsWith("#")) {
                    parseComment(nextLine, dataset);
                }
                nextLine = reader.readLine(true);
            }
            if (nextLine == null) {
                throw new RuntimeException("No column headings found in: " + dataResourceLocator.getPath());
            }
            filePosition += nextLine.length();

            // Parse column headings
            String[] data = nextLine.trim().split("\t");

            String[] headings = null;
            if (type == FileType.MAGE_TAB) {
                headings = getHeadings(data, skipColumns, true);

                valuesIndices = new IntArrayList(headings.length);

                // The second header row names the sub-columns of each sample
                nextLine = reader.readLine(true);
                if (nextLine == null) {
                    throw new RuntimeException("Missing second header row in: " + dataResourceLocator.getPath());
                }
                filePosition += nextLine.length();

                int nHeaderTokens = ParsingUtils.split(nextLine, tokens, '\t');
                for (int i = startColumn; i < nHeaderTokens; i++) {
                    String heading = tokens[i].replace('\"', ' ').trim();

                    // Check for tcga data column headings
                    if (heading.contains("Signal")) {
                        valuesIndices.add(i);
                    }
                }

                if (headings.length != valuesIndices.size()) {
                    log.error("Number of samples is not equal to number of Signal columns");
                    return null;
                }
            } else {
                headings = getHeadings(data, skipColumns);
            }

            dataset.setDataHeadings(headings);

            // Infer if the data is logNormalized by looking for negative data values.
            // Assume it is not until proven otherwise
            boolean logNormalized = false;

            ChromosomeSummary chrSummary = null;
            WholeGenomeData wgData = new WholeGenomeData(headings);

            // Number of rows sampled for the current chromosome
            int nRows = 0;

            // Compute a sample interval for the whole genome view such that we
            // sample ~ 10000 rows
            int wgSampleInterval = estLineCount / 10000 + 1;
            int chrRowCount = 0;

            // Status bar update frequency, in lines
            int updateCount = 5000;
            int count = 0;

            // A row must at least reach the chromosome and location columns
            int minTokens = Math.max(chrColumn, startColumn) + 1;

            // Token count of a row that has exactly one entry per heading
            int expectedTokens = headings.length * skipColumns + firstDataColumn;

            while ((nextLine = reader.readLine(true)) != null) {

                // Advance the offset up front so that rows skipped below are still counted
                long lineStart = filePosition;
                filePosition += nextLine.length();

                if (++count % updateCount == 0) {
                    IGVMainFrame.getInstance().setStatusBarMessage("Loaded: " + count + " / " + estLineCount + " (est)");
                }

                int nTokens = ParsingUtils.split(nextLine.trim(), tokens, '\t');
                if (nTokens >= minTokens) {
                    String thisChr = ParsingUtils.convertChrString(genomeId, tokens[chrColumn]);
                    if (chrSummary == null || !thisChr.equals(chrSummary.getName())) {
                        // Update whole genome and previous chromosome summary, unless this is
                        // the first chromosome
                        if (chrSummary != null) {
                            updateWholeGenome(chrSummary.getName(), dataset, headings, wgData);
                            chrSummary.setNDataPoints(nRows);
                        }

                        // Start the next chromosome
                        chrSummary = new ChromosomeSummary(thisChr, lineStart);
                        chrSummaries.add(chrSummary);
                        nRows = 0;
                        wgData = new WholeGenomeData(headings);
                        chrRowCount = 0;
                    }

                    int location;
                    try {
                        location = Integer.parseInt(tokens[startColumn].trim());
                    } catch (NumberFormatException numberFormatException) {
                        log.info("Location column is not a number.  Skipping row: " + nextLine);
                        continue;
                    }

                    if (chrRowCount % wgSampleInterval == 0) {
                        wgData.locations.add(location);

                        if (type != FileType.MAGE_TAB && nTokens > expectedTokens) {
                            // Surplus tokens are ignored rather than indexing past the headings
                            log.info("Unexpected number of tokens.  Expected " + expectedTokens
                                    + " found: " + nTokens + "   (" + nextLine + ")");
                        }

                        // One value per heading.  Missing or unparsable entries become NaN so
                        // that every sample array stays aligned with the location array.
                        for (int s = 0; s < headings.length; s++) {
                            float copyNo = readValue(s, skipColumns, nTokens);
                            if (!Float.isNaN(copyNo)) {
                                dataMin = Math.min(dataMin, copyNo);
                                dataMax = Math.max(dataMax, copyNo);
                                if (copyNo < 0) {
                                    logNormalized = true;
                                }
                            }
                            wgData.data.get(headings[s]).add(copyNo);
                        }
                        nRows++;
                    }
                }
                chrRowCount++;
            }

            // Update last chromosome
            if (chrSummary != null) {
                updateWholeGenome(chrSummary.getName(), dataset, headings, wgData);
                chrSummary.setNDataPoints(nRows);
            }

            dataset.setLogNormalized(logNormalized);
            dataset.setDataMin(dataMin);
            dataset.setDataMax(dataMax);

            return chrSummaries;

        } catch (FileNotFoundException e) {
            log.error("CN file not found: " + dataResourceLocator);
            throw new RuntimeException(e);
        } catch (IOException e) {
            log.error(dataResourceLocator.getPath(), e);
            throw new RuntimeException(e);
        } finally {
            close(reader, is);
        }

    }

    /**
     * Load data for a single chromosome. Reading starts at the offset recorded in the
     * summary and stops at the first row of a different chromosome that follows the
     * requested one, at a blank line, or at the end of the file. For MAGE-TAB files
     * {@code scan} must have been called first, because it locates the data columns.
     *
     * @param chrSummary    summary produced by scan() for the chromosome to load
     * @param columnHeaders sample headings, in the order returned by getHeadings
     * @return the start (and, if present, end) locations and one array per heading
     * @throws IllegalStateException if this is a MAGE-TAB file and scan() has not run
     * @throws RuntimeException      if the file cannot be read
     */
    public ChromosomeData loadChromosomeData(
            ChromosomeSummary chrSummary, String[] columnHeaders) {

        if (type == FileType.MAGE_TAB && valuesIndices == null) {
            throw new IllegalStateException("scan() must be called before loadChromosomeData() for MAGE-TAB files");
        }

        InputStream is = null;
        AsciiLineReader reader = null;
        try {
            int skipColumns = hasCalls ? 2 : 1;

            // Only used to size the containers; it need not be exact
            int nRowsEst = chrSummary.getNDataPts();

            is = ParsingUtils.openInputStream(dataResourceLocator);
            position(is, chrSummary.getStartPosition());
            reader = new AsciiLineReader(is);

            // Create containers to hold data
            IntArrayList startLocations = new IntArrayList(nRowsEst);
            IntArrayList endLocations = (hasEndLocations ? new IntArrayList(nRowsEst) : null);

            Map<String, FloatArrayList> dataMap = new HashMap<String, FloatArrayList>();
            for (String h : columnHeaders) {
                dataMap.put(h, new FloatArrayList(nRowsEst));
            }

            // A row must reach every positional column before it can be used
            int minTokens = Math.max(chrColumn, Math.max(startColumn, endColumn)) + 1;

            // Begin loop through rows
            String chromosome = chrSummary.getName();
            boolean chromosomeStarted = false;
            String nextLine = reader.readLine();
            while ((nextLine != null) && (nextLine.trim().length() > 0)) {

                int nTokens = ParsingUtils.split(nextLine, tokens, '\t');
                if (nTokens >= minTokens) {
                    String thisChromosome = ParsingUtils.convertChrString(genomeId,
                            tokens[chrColumn].trim());
                    if (thisChromosome.equals(chromosome)) {
                        chromosomeStarted = true;
                        try {
                            // Parse both locations before storing anything, so that a bad
                            // row leaves every container untouched
                            int start = Integer.parseInt(tokens[startColumn].trim());
                            int end = hasEndLocations ? Integer.parseInt(tokens[endColumn].trim()) : -1;

                            startLocations.add(start);
                            if (hasEndLocations) {
                                endLocations.add(end);
                            }

                            // One value per header; blanks at the end of a line become NaN
                            // and surplus tokens are ignored
                            for (int s = 0; s < columnHeaders.length; s++) {
                                dataMap.get(columnHeaders[s]).add(readValue(s, skipColumns, nTokens));
                            }
                        } catch (NumberFormatException numberFormatException) {
                            // Skip line
                            log.info("Skipping line (NumberFormatException) " + nextLine);
                        }
                    } else if (chromosomeStarted) {
                        break;
                    }
                }

                nextLine = reader.readLine();
            }

            // Loop complete
            ChromosomeData cd = new ChromosomeData(chrSummary.getName());
            cd.setStartLocations(startLocations.toArray());
            if (hasEndLocations) {
                cd.setEndLocations(endLocations.toArray());
            }

            for (String h : columnHeaders) {
                cd.setData(h, dataMap.get(h).toArray());
            }

            return cd;

        } catch (IOException ex) {
            log.error("Error parsing cn file", ex);
            throw new RuntimeException("Error parsing cn file", ex);
        } finally {
            close(reader, is);
        }

    }

    /**
     * Returns the index of the column that holds the value of the given sample, or -1
     * if no such column is known.
     */
    private int columnFor(int sampleIndex, int skipColumns) {
        if (type == FileType.MAGE_TAB) {
            return sampleIndex < valuesIndices.size() ? valuesIndices.get(sampleIndex) : -1;
        }
        return firstDataColumn + sampleIndex * skipColumns;
    }

    /**
     * Reads the value of one sample from the current contents of the token buffer.
     *
     * @param nTokens number of valid tokens on the current line
     * @return the parsed value, or NaN if the column is absent from the line or its
     *         content is not a number
     */
    private float readValue(int sampleIndex, int skipColumns, int nTokens) {
        int column = columnFor(sampleIndex, skipColumns);
        if (column < 0 || column >= nTokens) {
            return Float.NaN;
        }
        return parseFloatOrNaN(tokens[column]);
    }

    /** Parses a float, returning NaN for null or non-numeric text. */
    private static float parseFloatOrNaN(String text) {
        if (text == null) {
            return Float.NaN;
        }
        try {
            return Float.parseFloat(text.trim());
        } catch (NumberFormatException e) {
            return Float.NaN;
        }
    }

    /**
     * Closes the reader and the stream it was built on. The stream is closed explicitly
     * as well, because it may have been opened before the reader could be created.
     */
    private static void close(AsciiLineReader reader, InputStream is) {
        if (reader != null) {
            reader.close();
        }
        if (is != null) {
            try {
                is.close();
            } catch (IOException e) {
                log.warn("Error closing input stream", e);
            }
        }
    }

    /**
     * Interprets a single "#" comment line. A line beginning "#track" is handed to the
     * track-line parser; otherwise a "key=value" pair is recognised for the keys
     * "name" and "type" (case-insensitive), and anything else is ignored.
     * <p>
     * Note:  This is a copy of the method in GCTDatasetParser.  Refactor to merge these
     * two parsers, or share a common base class.
     *
     * @param comment the comment line, including its leading "#"
     * @param dataset dataset whose name, track type or track properties are updated
     */
    private void parseComment(String comment, IGVDataset dataset) {

        String tmp = comment.substring(1, comment.length());
        if (tmp.startsWith("track")) {
            DatasetParserUtils.parseTrackLine(tmp, dataset.getTrackProperties());
            return;
        }

        String[] keyValue = tmp.split("=");
        if (keyValue.length != 2) {
            return;
        }

        // Use a fixed locale so that matching does not depend on the platform default
        String key = keyValue[0].trim().toLowerCase(Locale.ENGLISH);
        String value = keyValue[1].trim();
        if (key.equals("name")) {
            dataset.setName(value);
        } else if (key.equals("type")) {
            try {
                dataset.setTrackType(TrackType.valueOf(value.toUpperCase(Locale.ENGLISH)));
            } catch (IllegalArgumentException e) {
                // Keep the track type inferred from the extension
                log.info("Ignoring unrecognized track type: " + value);
            }
        }
    }

    /** Removes one trailing ".txt", ".tab" or ".xls" from the file name, if present. */
    private static String stripIgnoredSuffix(String filename) {
        for (String suffix : IGNORED_SUFFIXES) {
            if (filename.endsWith(suffix)) {
                return filename.substring(0, filename.length() - suffix.length());
            }
        }
        return filename;
    }

    /** Returns true if the name, minus any ignored suffix, ends in ".cn", ".xcn" or ".snp". */
    private boolean isCopyNumberFileExt(String filename) {
        String tmp = stripIgnoredSuffix(filename);
        return tmp.endsWith(".cn") || tmp.endsWith(".xcn") || tmp.endsWith(".snp");
    }

    /** Returns true if the name, minus any ignored suffix, ends in ".loh". */
    private boolean isLOHFileExt(String filename) {
        return stripIgnoredSuffix(filename).endsWith(".loh");
    }

    /**
     * Return the sample headings for the copy number file, keeping duplicates.
     *
     * @param tokens      the tokens of the heading row
     * @param skipColumns distance between consecutive sample columns
     * @return the trimmed headings, in column order
     */
    public String[] getHeadings(String[] tokens, int skipColumns) {
        return getHeadings(tokens, skipColumns, false);
    }

    /**
     * Return the sample headings for the copy number file. Every
     * {@code skipColumns}-th token, starting at the first data column, is taken as a
     * heading and trimmed.
     *
     * @param tokens           the tokens of the heading row
     * @param skipColumns      distance between consecutive sample columns; at least 1
     * @param removeDuplicates whether to drop empty headings and headings equal to the
     *                         previously kept heading
     * @return the trimmed headings, in column order
     * @throws IllegalArgumentException if skipColumns is less than 1
     */
    public String[] getHeadings(String[] tokens, int skipColumns, boolean removeDuplicates) {

        if (skipColumns < 1) {
            throw new IllegalArgumentException("skipColumns must be at least 1: " + skipColumns);
        }

        List<String> headings = new ArrayList<String>();
        String previousHeading = null;
        for (int i = firstDataColumn; i < tokens.length; i += skipColumns) {
            // Compare the trimmed text, since that is what gets stored
            String heading = tokens[i].trim();
            if (removeDuplicates) {
                if (heading.length() == 0 || heading.equals(previousHeading)) {
                    continue;
                }
                previousHeading = heading;
            }

            headings.add(heading);
        }

        return headings.toArray(new String[headings.size()]);
    }

    /**
     * Position the stream at the specified position. If the stream ends first, it is
     * simply left at its end.
     *
     * @param is       stream to advance; assumed to be at its beginning
     * @param position number of bytes from the beginning of the stream
     * @throws java.io.IOException if the stream cannot be repositioned
     */
    private void position(InputStream is, long position) throws IOException {
        // This is ugly, but most streams are filestreams and we want to take advantage of
        // the file channel
        if (is instanceof FileInputStream) {
            ((FileInputStream) is).getChannel().position(position);
            return;
        }

        // skip() may skip fewer bytes than requested, so repeat until done
        long remaining = position;
        while (remaining > 0) {
            long skipped = is.skip(remaining);
            if (skipped > 0) {
                remaining -= skipped;
            } else if (is.read() >= 0) {
                // skip() made no progress but the stream is not exhausted
                remaining--;
            } else {
                // End of stream reached before the requested position
                break;
            }
        }
    }

    /**
     * Adds the sampled locations and values of one chromosome to the dataset's genome
     * summary. Nothing is added if no location was sampled.
     */
    private void updateWholeGenome(String currentChromosome, IGVDataset dataset, String[] headings,
            IGVDatasetParser.WholeGenomeData sumData) {
        int[] locations = sumData.locations.toArray();
        if (locations.length > 0) {
            dataset.getGenomeSummary().addLocations(currentChromosome, locations);
            for (String h : headings) {
                dataset.getGenomeSummary().addData(h, currentChromosome, sumData.data.get(h).toArray());
            }
        }
    }

    /**
     * Accumulates the rows sampled from one chromosome for the whole-genome view: the
     * sampled locations plus one list of values per heading.
     */
    class WholeGenomeData {

        String[] headings;
        IntArrayList locations = new IntArrayList(25000);
        Map<String, FloatArrayList> data = new HashMap<String, FloatArrayList>();

        WholeGenomeData(String[] headings) {
            this.headings = headings;
            for (String h : headings) {
                data.put(h, new FloatArrayList(25000));
            }
        }

        /** Number of locations sampled so far. */
        int size() {
            return locations.size();
        }
    }
}
