/*
 * The Broad Institute
 * SOFTWARE COPYRIGHT NOTICE AGREEMENT
 * This is copyright (2007-2009) by the Broad Institute/Massachusetts Institute
 * of Technology.  It is licensed to You under the Gnu Public License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *    http://www.opensource.org/licenses/gpl-2.0.php
 *
 * This software is supplied without any warranty or guaranteed support
 * whatsoever. Neither the Broad Institute nor MIT can be responsible for its
 * use, misuse, or functionality.
*/

/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package org.broad.igv.feature;

import org.apache.log4j.Logger;

import org.broad.igv.IGVConstants;
import org.broad.igv.ui.util.ProgressMonitor;
import org.broad.igv.util.Utilities;

import org.broad.igv.util.ZipArchiveWrapper;
import org.broad.igv.util.ZipArchiveWrapper.ZipIterator;

import static org.broad.igv.IGVConstants.*;

//~--- JDK imports ------------------------------------------------------------

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;

import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;

import java.util.HashSet;
import java.util.Locale;
import java.util.Properties;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

/**
 * Imports user-supplied genomes. Validates FASTA input (a single file, a
 * directory of files or a zipped file set), splits FASTA data into one
 * sequence file per chromosome, generates cytoband and property files and
 * bundles the result into a genome archive.
 *
 * @author jrobinso
 */
public class GenomeImporter {

    static Logger log = Logger.getLogger(GenomeImporter.class);

    /** Maximum number of contigs (FASTA records) accepted from one input stream. */
    private static final int MAX_CONTIGS = 5000;

    /**
     * Create a cytoband file from chromosome sequences. One line is appended
     * per sequence file, of the form {@code <chromosome>\t0\t<file length>}.
     * Directory entries that are not regular files, or whose name does not end
     * with {@link GenomeManager#SEQUENCE_FILE_EXTENSION}, are skipped.
     * Failures are logged rather than propagated.
     *
     * @param sequenceDirectory A directory of chromosome sequence files.
     * @param cytobandFile The file to be created (appended to if it exists).
     *
     * @throws IOException declared for backward compatibility
     */
    public void createCytobandFile(File sequenceDirectory, File cytobandFile) throws IOException {

        if ((sequenceDirectory == null) || (cytobandFile == null) || !sequenceDirectory.isDirectory()) {
            log.error("Invalid input for cytoband creation: ");
            log.error("\tsequenceDirectory =" + sequenceDirectory);
            log.error("\tCytoband File =" + cytobandFile);
            return;
        }

        PrintWriter cytobandFileWriter = null;
        try {

            // Opening in append mode creates the file when it does not exist yet
            cytobandFileWriter = new PrintWriter(new FileWriter(cytobandFile, true));

            // listFiles() returns null (not an empty array) on an I/O error
            File[] sequenceFiles = sequenceDirectory.listFiles();
            if (sequenceFiles == null) {
                log.warn("Unable to list sequence files in " + sequenceDirectory.getAbsolutePath());
                return;
            }

            // Generate a single cytoband per chromosome.  Length == chromosome length
            String extension = GenomeManager.SEQUENCE_FILE_EXTENSION;
            for (File seqFile : sequenceFiles) {

                // Only real sequence files carry a chromosome name; stripping the
                // extension from anything else would produce a bogus name
                String filename = seqFile.getName();
                if (!seqFile.isFile() || !filename.endsWith(extension)) {
                    continue;
                }

                // Chromosome name is everything up to the extension
                String chrName = filename.substring(0, filename.length() - extension.length());
                cytobandFileWriter.println(chrName + "\t0\t" + seqFile.length());
            }
        } catch (Exception e) {
            log.warn("Error creating cytoband file from sequence files", e);
        } finally {
            if (cytobandFileWriter != null) {
                cytobandFileWriter.close();
            }
        }
    }

    /**
     * Validate FASTA files in a zip. An entry is valid when its first line is
     * at least two characters long and, once trimmed, starts with '&gt;'.
     *
     * @param fastaFileSetZip The zip file; its path must end with
     * {@link IGVConstants#FASTA_FILE_SET_EXTENSION}.
     * @param returnedInvalidZipEntries The set of invalid FASTA zip entries
     * found (only the zip entry name). May be null if the caller does not need
     * the names. Note that the result is false whenever this set is non-empty
     * on return, including entries it already held on entry.
     *
     * @return true if all entries were sequence files.
     *
     */
    public boolean checkFastaFileSetFormat(File fastaFileSetZip,
            HashSet<String> returnedInvalidZipEntries) {

        if (fastaFileSetZip == null) {
            return false;
        }

        HashSet<String> invalidEntries = (returnedInvalidZipEntries != null)
                ? returnedInvalidZipEntries
                : new HashSet<String>();

        boolean isOk = false;
        try {
            URL zipUrl = fastaFileSetZip.toURI().toURL();

            String zipFilePath = URLDecoder.decode(zipUrl.getFile(), "UTF-8");
            if ((zipFilePath == null) || !zipFilePath.endsWith(IGVConstants.FASTA_FILE_SET_EXTENSION)) {
                return false;
            }

            ZipInputStream zipInputStream = null;
            try {
                zipInputStream = new ZipInputStream(zipUrl.openStream());

                ZipEntry zipEntry;
                while ((zipEntry = zipInputStream.getNextEntry()) != null) {

                    // A fresh reader per entry; it must NOT be closed because that
                    // would close the shared zip stream.
                    BufferedReader data = new BufferedReader(new InputStreamReader(zipInputStream));
                    if (!isFastaHeaderLine(data.readLine())) {
                        invalidEntries.add(zipEntry.getName());
                    }
                }

                isOk = invalidEntries.isEmpty();
            } finally {
                closeQuietly(zipInputStream);
            }

        } catch (IOException e) {

            // Covers MalformedURLException and UnsupportedEncodingException too.
            // Report the file itself: the URL may never have been created.
            log.error(("Invalid sequence file set: " + fastaFileSetZip.getAbsolutePath()), e);
        }

        return isOk;
    }

    /**
     * Validate FASTA files in a directory. A file is valid when its first line
     * is at least two characters long and, once trimmed, starts with '&gt;'.
     *
     * @param sequenceDirectory directory containing ONLY sequence files.
     * @param returnedInvalidFastaFiles The set of invalid FASTA files found
     * (file names only). May be null if the caller does not need the names.
     * Note that the result is false whenever this set is non-empty on return.
     *
     * @return true if all files were sequence files.
     */
    public boolean checkFastaFilesInDirectory(File sequenceDirectory,
            HashSet<String> returnedInvalidFastaFiles) {

        if ((sequenceDirectory == null) || !sequenceDirectory.isDirectory()) {
            return false;
        }

        // listFiles() returns null (not an empty array) on an I/O error
        File[] files = sequenceDirectory.listFiles();
        if (files == null) {
            log.error("Invalid sequence file directory: " + sequenceDirectory.getAbsolutePath());
            return false;
        }

        HashSet<String> invalidFiles = (returnedInvalidFastaFiles != null)
                ? returnedInvalidFastaFiles
                : new HashSet<String>();

        boolean isOk = false;
        try {
            for (File sequenceFile : files) {

                InputStream inputStream = openSequenceStream(sequenceFile);
                try {
                    BufferedReader data = new BufferedReader(new InputStreamReader(inputStream));
                    if (!isFastaHeaderLine(data.readLine())) {
                        invalidFiles.add(sequenceFile.getName());
                    }
                } finally {
                    closeQuietly(inputStream);
                }
            }

            isOk = invalidFiles.isEmpty();

        } catch (IOException e) {
            log.error(("Invalid sequence file directory: " + sequenceDirectory.getAbsolutePath()),
                    e);
        }

        return isOk;
    }

    /**
     * Validate a file is a sequence file. Files whose name ends with
     * {@link IGVConstants#FASTA_GZIP_FILE_EXTENSION} are read as gzip data.
     *
     * @param fastaInputFile The file to check.
     *
     * @return true if file was a sequence file.
     */
    public boolean checkFastaFileFormat(File fastaInputFile) {

        if (fastaInputFile == null) {
            return false;
        }

        boolean isOk = false;
        try {

            // The stream is closed by checkFastaFileFormat(InputStream)
            isOk = checkFastaFileFormat(openSequenceStream(fastaInputFile));
        } catch (IOException e) {
            log.error(("Invalid sequence file URL:" + fastaInputFile.toURI()), e);
        }

        return isOk;
    }

    /**
     * Validates a sequence file by inspecting its first line only: the trimmed
     * line must start with '&gt;' and be longer than one character. The stream
     * is always closed before returning.
     *
     * @param inputStream sequence file stream.
     *
     * @return true if format ok.
     */
    private boolean checkFastaFileFormat(InputStream inputStream) {

        if (inputStream == null) {
            return false;
        }

        boolean isOk = false;
        try {
            BufferedReader dataReader = new BufferedReader(new InputStreamReader(inputStream));
            try {
                String firstLine = dataReader.readLine();
                if (firstLine != null) {
                    firstLine = firstLine.trim();
                    isOk = firstLine.startsWith(">") && (firstLine.length() > 1);
                }
            } finally {
                dataReader.close();
            }
        } catch (Exception e) {
            isOk = false;
            log.error("Error validating sequence file!", e);
        }

        return isOk;
    }

    /**
     * Create a zip containing all the information and data required to load a
     * genome. All file/directory validation is assumed to have been done
     * outside of this method.
     *
     * @param archiveOutputLocation Directory in which the archive is created.
     * @param genomeFileName File name of the archive.
     * @param genomeId Id of the genome.
     * @param genomeDisplayName The genome name that is user-friendly.
     * @param relativeSequenceLocation The location of sequence data, relative
     * to archiveOutputLocation.
     * @param sequenceInputFile A FASTA file, a directory of FASTA files or a
     * zipped FASTA file set; null if no sequence data has to be generated.
     * @param refFlatFile RefFlat file.
     * @param cytobandFile Cytoband file. If null, a temporary one is generated
     * from the sequence files (when sequence input is given) and deleted again
     * before returning.
     * @param sequenceOutputLocationOverride If not null, written to the property
     * file as the sequence location instead of relativeSequenceLocation.
     * @param monitor Progress listener; may be null.
     *
     * @return The newly created genome archive file, or null if the input was
     * invalid or creation failed before the archive location was determined.
     */
    public File createGenomeArchive(File archiveOutputLocation, String genomeFileName,
            String genomeId, String genomeDisplayName,
            String relativeSequenceLocation, File sequenceInputFile,
            File refFlatFile, File cytobandFile,
            String sequenceOutputLocationOverride,
            ProgressMonitor monitor) {

        if ((archiveOutputLocation == null) || (genomeFileName == null) || (genomeId == null) || (genomeDisplayName == null)) {

            log.error("Invalid input for genome creation: ");
            log.error("\tGenome Output Location=" + archiveOutputLocation);
            log.error("\tGenome filename=" + genomeFileName);
            log.error("\tGenome Id=" + genomeId);
            log.error("\tGenome Name=" + genomeDisplayName);
            return null;
        }

        boolean autoGeneratedCytobandFile = (cytobandFile == null);

        File archive = null;
        try {

            // If we have a FASTA file we need to use the passed sequence
            // location as a directory to place the generated sequences.
            if (sequenceInputFile != null) {
                File sequenceOutputFolder = new File(archiveOutputLocation,
                        relativeSequenceLocation);
                if (!sequenceOutputFolder.exists()) {

                    // mkdirs: the relative location may contain several levels
                    sequenceOutputFolder.mkdirs();
                }

                if (sequenceInputFile.isDirectory()) {

                    // Got a FASTA directory so we have to process all the files
                    // (each FASTA file must be for a single chromosome)
                    File[] files = sequenceInputFile.listFiles();
                    if (files == null) {
                        throw new IOException("Unable to list files in sequence directory: "
                                + sequenceInputFile.getAbsolutePath());
                    }

                    // NOTE: integer division, so the increment is 0 for more than 50 files
                    int progressIncrement = ((files.length > 0) ? (50 / files.length) : 50);
                    for (File file : files) {
                        createSequenceFiles(file, sequenceOutputFolder, genomeId, monitor);
                        if (monitor != null) {
                            monitor.fireProgressChange(progressIncrement);
                        }
                    }
                } else if (sequenceInputFile.getName().toLowerCase(Locale.ROOT).endsWith(
                        IGVConstants.FASTA_FILE_SET_EXTENSION)) {

                    // Process a FASTA file set: a zip archive in which each FASTA
                    // file must be for a single chromosome
                    ZipArchiveWrapper zip = new ZipArchiveWrapper(sequenceInputFile);
                    ZipIterator iterator = null;
                    try {

                        int entryCount = zip.getEntryCount();
                        int progressIncrement = ((entryCount > 0) ? 50 / entryCount : 50);

                        // Create all sequences
                        iterator = zip.iterator();
                        ZipInputStream inputStream = iterator.getZipInputStream();
                        while (iterator.hasNext()) {

                            iterator.next();    // Move to next entry

                            // The shared zip stream must stay open between entries
                            createSequenceFiles(inputStream, sequenceOutputFolder, genomeId,
                                    false, monitor);

                            if (monitor != null) {
                                monitor.fireProgressChange(progressIncrement);
                            }
                        }
                    } finally {
                        if (iterator != null) {
                            iterator.close();
                        }
                    }
                } else {

                    // Create sequence data file from a single FASTA file
                    createSequenceFiles(sequenceInputFile, sequenceOutputFolder, genomeId, monitor);
                }

                // Create Cytoband file
                if (autoGeneratedCytobandFile) {

                    // Construct a cytoband file
                    String cytobandFileName = genomeId + "_cytoband.txt";
                    cytobandFile = new File(IGVConstants.IGV_TEMP_DIRECTORY, cytobandFileName);
                    createCytobandFile(sequenceOutputFolder, cytobandFile);
                }
            }

            // Create Property File for genome archive
            if (sequenceOutputLocationOverride != null) {
                relativeSequenceLocation = sequenceOutputLocationOverride;
            }
            File propertyFile = createGenomePropertyFile(genomeId, genomeDisplayName,
                    relativeSequenceLocation, refFlatFile, cytobandFile);

            archive = new File(archiveOutputLocation, genomeFileName);
            File[] inputFiles = {refFlatFile, cytobandFile, propertyFile};
            Utilities.createZipFile(archive, inputFiles);
            propertyFile.delete();
        } catch (MaximumContigGenomeException e) {
            throw e;
        } catch (Exception e) {

            // 'archive' is still null when the failure happened early, so build
            // the path from the (validated, non-null) inputs instead
            String archivePath = new File(archiveOutputLocation, genomeFileName).getAbsolutePath();
            log.error("Failed to create genome archive: " + archivePath, e);
        } finally {
            if (autoGeneratedCytobandFile && (cytobandFile != null) && cytobandFile.exists()) {
                cytobandFile.delete();
            }
        }
        return archive;
    }

    /**
     * This method creates the property.txt file that is stored in each
     * .genome file. This is not the user-defined genome property file
     * created by storeUserDefinedGenomeListToFile(...). Null arguments are
     * simply left out of the file. Failures are logged, not thrown.
     *
     * @param genomeId The genome's id.
     * @param genomeDisplayName The user-friendly genome name.
     * @param relativeSequenceLocation Location of the sequence data; backslashes
     * are converted to forward slashes unless it starts with "http:".
     * @param refFlatFile Gene file; only its name is recorded.
     * @param cytobandFile Cytoband file; only its name is recorded.
     *
     * @return The property file, located in the IGV temp directory.
     */
    public File createGenomePropertyFile(String genomeId, String genomeDisplayName,
            String relativeSequenceLocation, File refFlatFile, File cytobandFile) {

        File propertyFile = new File(IGV_TEMP_DIRECTORY, "property.txt");
        FileWriter propertyFileWriter = null;
        try {

            // FileWriter creates (or truncates) the file
            propertyFileWriter = new FileWriter(propertyFile);

            if (genomeId != null) {
                writeLine(propertyFileWriter, IGVConstants.GENOME_ARCHIVE_ID_KEY + "=" + genomeId);
            }
            if (genomeDisplayName != null) {
                writeLine(propertyFileWriter,
                        IGVConstants.GENOME_ARCHIVE_NAME_KEY + "=" + genomeDisplayName);
            }
            if (cytobandFile != null) {
                writeLine(propertyFileWriter,
                        IGVConstants.GENOME_ARCHIVE_CYTOBAND_FILE_KEY + "=" + cytobandFile.getName());
            }
            if (refFlatFile != null) {
                writeLine(propertyFileWriter,
                        IGVConstants.GENOME_ARCHIVE_GENE_FILE_KEY + "=" + refFlatFile.getName());
            }
            if (relativeSequenceLocation != null) {

                // Local paths are stored with forward slashes on every platform
                if (!relativeSequenceLocation.startsWith("http:")) {
                    relativeSequenceLocation = relativeSequenceLocation.replace('\\', '/');
                }
                writeLine(propertyFileWriter,
                        IGVConstants.GENOME_ARCHIVE_SEQUENCE_FILE_LOCATION_KEY + "=" + relativeSequenceLocation);
            }
        } catch (Exception e) {
            log.error("Failed to create genome property file: " + propertyFile.getAbsolutePath(),
                    e);
        } finally {
            if (propertyFileWriter != null) {
                try {
                    propertyFileWriter.close();
                } catch (IOException ex) {
                    log.error(
                            "Failed to close genome property file: " + propertyFile.getAbsolutePath(),
                            ex);
                }
            }
        }
        return propertyFile;
    }

    /**
     * Writes a user-defined genome property file. Does nothing when either
     * argument is null. Errors while writing are logged, not thrown.
     *
     * @param outputFile A java properties file containing tab delimited data
     * (display name [tab] genome file location [tab] genome id) about
     * the user-defined genome.
     *
     * @param properties A list of properties to store.
     * @throws IOException if the output file does not exist and cannot be created
     */
    public static void storeUserDefinedGenomeListToFile(File outputFile, Properties properties)
            throws IOException {

        if ((properties == null) || (outputFile == null)) {
            return;
        }

        if (!outputFile.exists()) {
            outputFile.createNewFile();
        }

        FileOutputStream out = null;
        try {
            out = new FileOutputStream(outputFile);
            properties.store(out, "");
        } catch (FileNotFoundException e) {
            log.error("Property file for imported genomes was not " + "found!", e);
        } catch (IOException e) {
            log.error("Error writing property file for imported " + "genomes!", e);
        } finally {
            if (out != null) {
                try {
                    out.close();
                } catch (IOException e) {
                    log.error("Error closing property file for imported genomes!", e);
                }
            }
        }
    }

    /**
     * Creates chromosome sequence files from a FASTA file. Files whose name
     * ends with {@link IGVConstants#FASTA_GZIP_FILE_EXTENSION} are read as gzip
     * data.
     *
     * @param sequenceInputFile A FASTA file.
     * @param genomeSequenceFolder The output folder for chromosome sequence
     * files.
     * @param genomeId The genome Id.
     * @param monitor Progress listener (currently not used here).
     *
     * @throws IOException if the input file cannot be opened
     */
    public void createSequenceFiles(File sequenceInputFile, File genomeSequenceFolder,
            String genomeId, ProgressMonitor monitor)
            throws IOException {

        if (sequenceInputFile == null) {
            log.error("Invalid input for sequence creation: ");
            log.error("\tSequence Filename =" + sequenceInputFile);
            log.error("\tSequence Location =" + genomeSequenceFolder);
            log.error("\tGenome Id =" + genomeId);
            return;
        }

        InputStream inputStream = openSequenceStream(sequenceInputFile);
        createSequenceFiles(inputStream, genomeSequenceFolder, genomeId, true, monitor);
    }

    /**
     * Creates chromosome sequence files. Every FASTA record ('&gt;' header
     * followed by sequence lines) is written, upper-cased and without line
     * breaks, to a file named after the first word of its header plus
     * {@link GenomeManager#SEQUENCE_FILE_EXTENSION}. Lines preceding the first
     * header are ignored. I/O failures are logged, not thrown.
     *
     * @param sequenceInputStream Input stream for a FASTA file.
     * @param genomeSequenceFolder The output folder for chromosome sequence
     * files; created if missing.
     * @param genomeId The genome Id (used for diagnostics only).
     * @param closeStreamOnExit Flag to indicate whether the passed input
     * stream should be automatically closed before exiting this method.
     * @param monitor Progress listener (currently not used here).
     *
     * @throws IOException if closing a sequence file or the input stream fails
     * @throws MaximumContigGenomeException if the stream holds more than 5000
     * contigs
     */
    public void createSequenceFiles(InputStream sequenceInputStream, File genomeSequenceFolder,
            String genomeId, boolean closeStreamOnExit,
            ProgressMonitor monitor)
            throws IOException {

        if (sequenceInputStream == null) {
            log.error("Invalid input for sequence creation: ");
            log.error("\tInput Stream =" + sequenceInputStream);
            log.error("\tSequence Location =" + genomeSequenceFolder);
            log.error("\tGenome Id =" + genomeId);
            return;
        }

        if (genomeSequenceFolder == null) {
            log.error("Invalid input for sequence creation: ");
            log.error("\tSequence Location =" + genomeSequenceFolder);
            log.error("\tGenome Id =" + genomeId);
            if (closeStreamOnExit) {
                closeQuietly(sequenceInputStream);
            }
            return;
        }

        BufferedReader dataReader = new BufferedReader(new InputStreamReader(sequenceInputStream));
        BufferedWriter chromosomeFileWriter = null;
        int contigCounter = 0;
        boolean warnedAboutLeadingData = false;
        try {
            if (!genomeSequenceFolder.exists()) {
                genomeSequenceFolder.mkdirs();
            }

            String fastaDataLine;
            while ((fastaDataLine = dataReader.readLine()) != null) {

                fastaDataLine = fastaDataLine.trim();

                // If a new chromosome name
                if (fastaDataLine.startsWith(">")) {

                    // Refuse the contig BEFORE creating yet another file for it
                    if (contigCounter >= MAX_CONTIGS) {
                        throw new MaximumContigGenomeException(
                                "Maximum number of contigs exceeded (5000)");
                    }
                    ++contigCounter;

                    // Find the first word break.  According to the spec the id of the
                    // sequence is the first "word", the remaining part of the line is a
                    // comment.  isWhitespace covers TAB, which isSpaceChar does not.
                    int idEnd = 1;
                    while (idEnd < fastaDataLine.length()) {
                        char c = fastaDataLine.charAt(idEnd);
                        if (Character.isWhitespace(c) || Character.isSpaceChar(c)) {
                            break;
                        }
                        idEnd++;
                    }
                    String chromosome = fastaDataLine.substring(1, idEnd).trim()
                            + GenomeManager.SEQUENCE_FILE_EXTENSION;

                    if (chromosomeFileWriter != null) {
                        chromosomeFileWriter.close();
                        chromosomeFileWriter = null;
                    }

                    // FileWriter creates (or truncates) the sequence file
                    File chromosomeSequenceFile = new File(genomeSequenceFolder, chromosome);
                    chromosomeFileWriter =
                            new BufferedWriter(new FileWriter(chromosomeSequenceFile));
                    continue;
                }

                // Data (or blank lines) ahead of the first header belong to no contig
                if (chromosomeFileWriter == null) {
                    if ((fastaDataLine.length() > 0) && !warnedAboutLeadingData) {
                        log.warn("Ignoring FASTA data found before the first '>' header line");
                        warnedAboutLeadingData = true;
                    }
                    continue;
                }

                // Locale.ROOT: case conversion must not depend on the user's locale
                chromosomeFileWriter.write(fastaDataLine.toUpperCase(Locale.ROOT));
            }
        } catch (MaximumContigGenomeException e) {
            throw e;
        } catch (Exception e) {
            log.warn("Error writing FASTA sequence!", e);
        } finally {
            try {
                if (chromosomeFileWriter != null) {
                    chromosomeFileWriter.close();
                }
            } finally {

                // Runs even when closing the writer failed
                if (closeStreamOnExit) {
                    dataReader.close();
                }
            }
        }
    }

    /**
     * Tells whether the first line of a file looks like a FASTA header: at
     * least two characters long and, once trimmed, starting with '&gt;'.
     */
    private static boolean isFastaHeaderLine(String firstLine) {
        return (firstLine != null) && (firstLine.length() >= 2) && firstLine.trim().startsWith(">");
    }

    /**
     * Opens a FASTA file, transparently unzipping it when its name ends with
     * the gzip FASTA extension. No stream is left open if opening fails.
     */
    private static InputStream openSequenceStream(File sequenceFile) throws IOException {

        if (!sequenceFile.getName().toLowerCase(Locale.ROOT).endsWith(
                IGVConstants.FASTA_GZIP_FILE_EXTENSION)) {

            // A single FASTA file not in any type of compressed file
            return sequenceFile.toURI().toURL().openStream();
        }

        // A single FASTA file is in a .gz file
        InputStream rawStream = new FileInputStream(sequenceFile);
        try {
            return new GZIPInputStream(rawStream);
        } catch (IOException e) {

            // Not valid gzip data: release the underlying file handle
            closeQuietly(rawStream);
            throw e;
        }
    }

    /** Closes a stream, logging (instead of propagating) any failure. */
    private static void closeQuietly(InputStream stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException ex) {
            log.warn("Error closing sequence file stream!", ex);
        }
    }

    /** Writes one line, terminated by '\n', to the property file. */
    private static void writeLine(FileWriter writer, String line) throws IOException {
        writer.write(line);
        writer.write("\n");
    }
}
