// static executable:
// g++ -O3 -fopenmp -Wall -static-libgcc -static-libstdc++ count_reads_per_bin_batch.cpp -o count_reads_per_bin_batch -I/n/data1/bwh/medicine/loh/ploh/external_software/htslib/htslib-1.10.2/ -I/home/pl88/boost_1_58_0/install/include -L/n/data1/bwh/medicine/loh/ploh/external_software/htslib/htslib-1.10.2/ -L/n/groups/price/poru/external_software/libstdc++/usr/lib/gcc/x86_64-redhat-linux/4.8.5/ -L/n/groups/price/poru/external_software/zlib/zlib-1.2.11 -L/home/pl88/boost_1_58_0/install/lib -Wl,-Bstatic -lhts -lboost_iostreams -lz -Wl,-Bdynamic -lpthread

#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <map>
#include <cstdio>
#include <cstring>
#include <cstdlib>
#include <cctype>
#include <cassert>
#include <htslib/sam.h>
#include <boost/iostreams/filtering_stream.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include <sys/time.h>

using namespace std;

// Width of one counting bin, in base pairs.
#define BIN_SIZE 100
// Per-sample read counts, indexed as counts[chr][bin][k]:
//   chr: 1-22 = autosomes, 23 = X, 24 = Y (index 0 is unused)
//   bin: leftmost alignment position / BIN_SIZE (sized for positions below 250,000,000)
//   k:   0 = all counted reads, 1 = counted reads with MAPQ > 0
// The table is zeroed before each input file is processed, and each counter
// sticks at its maximum value instead of wrapping (see the read loop in main).
unsigned short counts[25][250000000/BIN_SIZE][2]; // table of counts per bin

// One interval on a chromosome, held as bin indices rather than base pairs.
//   startBin / endBin : interval bounds divided by BIN_SIZE; consumers iterate
//                       bins in [startBin, endBin)
//   minMAPQ           : 0 or 1; selects which counter of counts[chr][bin][.] is
//                       read for this interval
// The struct is also written to the output files as raw bytes (see
// openWriteHeader), so the member order must not change.
struct Region {
  int startBin, endBin, minMAPQ;

  // _start and _end are base-pair coordinates and must be multiples of BIN_SIZE.
  Region(int _start, int _end, int _minMAPQ)
    : startBin(_start / BIN_SIZE),
      endBin(_end / BIN_SIZE),
      minMAPQ(_minMAPQ) {
    assert(_start % BIN_SIZE == 0);
    assert(_end % BIN_SIZE == 0);
  }
};

// Reads a whitespace-delimited interval list and groups the intervals by chromosome.
//
// File format: one header line (ignored), then one record per line:
//   CHR START END [Q0|Q1] [anything else on the line is ignored]
// CHR may be "1".."24", "X", "Y", or the same with a "chr" prefix.
//
//   file       : path of the list to read
//   is100bpBin : true  -> every record must span exactly BIN_SIZE bp; minMAPQ is set to 1
//                false -> a 4th column "Q0" or "Q1" is required and sets minMAPQ to 0 or 1
//
// Returns a vector of 25 per-chromosome lists (index 0 stays empty; 1-22 = autosomes,
// 23 = X, 24 = Y), each in file order. Prints a per-chromosome summary to stdout.
// Exits with an error message on an unknown chromosome name or an unparseable record;
// other format violations trip an assert.
vector < vector <Region> > readRegions(const char *file, bool is100bpBin) {

  cout << "Reading region list: " << file << endl;

  vector < vector <Region> > regionsPerChr(25); // blank, chr1-22, X, Y

  // accept both bare ("7", "X") and "chr"-prefixed ("chr7", "chrX") names
  map <string, int> chrStrToInt;
  for (int i = 1; i <= 24; i++) {
    char buf[10]; sprintf(buf, "%d", i);
    chrStrToInt[buf] = i;
    chrStrToInt["chr" + string(buf)] = i;
  }
  chrStrToInt["X"] = chrStrToInt["chrX"] = 23;
  chrStrToInt["Y"] = chrStrToInt["chrY"] = 24;

  ifstream finRegions(file); assert(finRegions);
  string line, chrStr, Qstr;
  // NOTE(review): coordinates are parsed as double, presumably so that values
  // written in scientific notation (e.g. 1e+06) are still accepted -- confirm.
  double start, end;
  getline(finRegions, line); // throw away header
  int ctr = 0;
  while (finRegions >> chrStr >> start >> end) {
    ctr++;

    // single lookup; also avoids operator[] silently inserting an unknown name
    map <string, int>::const_iterator chrIt = chrStrToInt.find(chrStr);
    if (chrIt == chrStrToInt.end()) {
      cerr << "ERROR: unrecognized chromosome '" << chrStr << "' in record " << ctr
	   << " of " << file << endl;
      exit(1);
    }

    int minMAPQ = 1;
    if (is100bpBin)
      assert(end-start == BIN_SIZE);
    else {
      finRegions >> Qstr; assert(Qstr == "Q0" || Qstr == "Q1");
      minMAPQ = Qstr=="Q1";
    }
    regionsPerChr[chrIt->second].push_back(Region((int) start, (int) end, minMAPQ));
    getline(finRegions, line); // throw away rest of line
  }

  // The loop stops at the first failed extraction. If that was not caused by
  // running out of input, a record was malformed and everything after it would
  // otherwise be dropped without notice.
  if (!finRegions.eof()) {
    cerr << "ERROR: unable to parse record " << ctr+1 << " of " << file
	 << " (expected: CHR START END ...)" << endl;
    exit(1);
  }
  finRegions.close();

  for (int chr = 1; chr <= 24; chr++)
    if (!regionsPerChr[chr].empty())
      printf("chr%-2d : %6d\n", chr, (int) regionsPerChr[chr].size());
  cout << "Total regions: " << ctr << endl << endl;

  return regionsPerChr;
}

// Opens one gzip-compressed binary output file and writes its header.
//
//   of      : file stream to open on outFile (binary mode); must outlive foutGz's use of it
//   foutGz  : filtering stream that receives a gzip compressor followed by `of`
//   outFile : path of the file to create
//   Mchr    : number of intervals to describe in the header
//   regions : the intervals; temporarily rescaled in place while writing, restored on return
//
// Header layout (raw host-format bytes, before compression):
//   int Mchr, followed by Mchr Region structs whose startBin/endBin fields hold
//   base-pair coordinates (bin index * BIN_SIZE) rather than bin indices.
//
// Exits with an error message if the file cannot be opened.
void openWriteHeader(ofstream &of, boost::iostreams::filtering_ostream &foutGz,
		     const char *outFile, int Mchr, vector <Region> &regions) {

  of.open(outFile, std::ios_base::binary);
  // Check the file itself: the assert at the end only inspects the compressing
  // stream, which can still look healthy while its sink never opened.
  if (!of) {
    cerr << "ERROR: unable to open output file: " << outFile << endl;
    exit(1);
  }

  foutGz.push(boost::iostreams::gzip_compressor());
  foutGz.push(of);
  foutGz.write((const char *) &Mchr, sizeof(int));

  if (Mchr > 0) { // &regions[0] is only valid for a non-empty list
    // convert bin indices to base pairs for the header, then convert back
    for (int i = 0; i < Mchr; i++) { regions[i].startBin*=BIN_SIZE; regions[i].endBin*=BIN_SIZE; }
    foutGz.write((const char *) &regions[0], Mchr * sizeof(Region));
    for (int i = 0; i < Mchr; i++) { regions[i].startBin/=BIN_SIZE; regions[i].endBin/=BIN_SIZE; }
  }
  assert(foutGz);
}

int main(int argc, char *argv[]){

  if (argc != 6) {
    fprintf(stderr, "Usage:\n");
    fprintf(stderr, "- arg1 = ID bam/cram file list (col1 = ID, col2 = /path/to/cram)\n");
    fprintf(stderr, "- arg2 = list of 100bp bins (CHR START END; bed coords)\n");
    fprintf(stderr, "- arg3 = list of common CNP regions (CHR START END Q0/Q1)\n");
    fprintf(stderr, "- arg4 = threads\n");
    fprintf(stderr, "- arg5 = output prefix -> .{100bp,regions}.chr{1..22}.bin.gz\n");
    fflush(stderr);
    exit(1);
  }

  const char *IDpathFile = argv[1];
  const char *binsFile = argv[2];
  const char *regionsFile = argv[3];
  int threads; sscanf(argv[4], "%d", &threads);
  const char *outPrefix = argv[5];

  // read bin list + region list; split by chr
  vector < vector <Region> > binsPerChr = readRegions(binsFile, true);
  vector < vector <Region> > regionsPerChr = readRegions(regionsFile, false);

  unsigned short *binCounts[25];
  unsigned int *regionCounts[25];
  ofstream ofstreamBins[25], ofstreamRegions[25];
  boost::iostreams::filtering_ostream foutBinsGz[25], foutRegionsGz[25];

  // allocate buffers for bin, region output; open per-chr {bin,region} output files; write headers
  for (int chr = 1; chr <= 24; chr++) {
    if (!binsPerChr[chr].empty()) {
      int Mchr = binsPerChr[chr].size();
      binCounts[chr] = new unsigned short[Mchr];
      char outFile[1000]; sprintf(outFile, "%s.100bp.chr%d.bin.gz", outPrefix, chr);
      openWriteHeader(ofstreamBins[chr], foutBinsGz[chr], outFile, Mchr, binsPerChr[chr]);
    }
    if (!regionsPerChr[chr].empty()) {
      int Mchr = regionsPerChr[chr].size();
      regionCounts[chr] = new unsigned int[Mchr];
      char outFile[1000]; sprintf(outFile, "%s.regions.chr%d.bin.gz", outPrefix, chr);
      openWriteHeader(ofstreamRegions[chr], foutRegionsGz[chr], outFile, Mchr, regionsPerChr[chr]);
    }
  }

  struct timeval tv;
  gettimeofday(&tv, NULL); double tPrev = tv.tv_sec + tv.tv_usec*1e-6;

  // iterate over cram files
  ifstream finIDpath(IDpathFile);
  int ID;
  while (finIDpath >> ID) {
    char c; finIDpath.get(c); // throw away whitespace
    string cramFile; getline(finIDpath, cramFile);

    // zero count arrays
    memset(counts, 0, sizeof(counts));
    for (int chr = 1; chr <= 24; chr++)
      if (!regionsPerChr[chr].empty())
	memset(regionCounts[chr], 0, regionsPerChr[chr].size() * sizeof(regionCounts[chr][0]));
    
    cout << "Processing " << ID << "... " << flush;
    
    /***** open cram file; process header to find which "targets" correspond to chr1-22,X,Y *****/

    samFile *fin = hts_open(cramFile.c_str(), "r"); assert(fin != NULL); // open cram file
    hts_set_threads(fin, threads-1);
    // only extract FLAG RNAME(CHR) POS MAPQ RNEXT(CHR of mate)
    hts_set_opt(fin, CRAM_OPT_REQUIRED_FIELDS, 0x2 | 0x4 | 0x8 | 0x10 | 0x40);

    bam_hdr_t *hdr = sam_hdr_read(fin); assert(hdr != NULL); // read header
    int n_targets = hdr->n_targets;
    char *tid_to_chr = (char *) calloc(n_targets, 1); // map target IDs to 1-24 (autoXY) or else 0
    for (int tid = 0; tid < n_targets; tid++) {
      const char *tname = hdr->target_name[tid];
      if (strlen(tname) > 3 && tname[0]=='c' && tname[1]=='h' && tname[2]=='r') {
	if (strlen(tname) == 4) {
	  if (isdigit(tname[3]))
	    tid_to_chr[tid] = tname[3]-'0';
	  else if (tname[3]=='X')
	    tid_to_chr[tid] = 23;
	  else if (tname[3]=='Y')
	    tid_to_chr[tid] = 24;
	}
	else if (strlen(tname) == 5 && isdigit(tname[3]) && isdigit(tname[4]))
	  tid_to_chr[tid] = 10*(tname[3]-'0') + tname[4]-'0';
      }
    }

    /***** iterate through cram file, incrementing counts per bin *****/

    bam1_t *aln = bam_init1(); // initialize storage of alignment data
    while (sam_read1(fin, hdr, aln) > 0) {
      int chr = tid_to_chr[aln->core.tid];
      if (chr != 0 // require autosome or X or Y
	  && aln->core.tid==aln->core.mtid) { // require read and mate to align to same chromosome
	int flag = aln->core.flag;
	if (flag == 81 || flag == 83 || flag == 145 || flag == 147) { // require reverse, mate=fwd
	  int pos = aln->core.pos; // leftmost position of alignment (zero-based coord)
	  int bin = pos / BIN_SIZE;
	  counts[chr][bin][0]++; if (counts[chr][bin][0]==0) counts[chr][bin][0]--; // overflow
	  if (aln->core.qual) { // MAPQ > 0
	    counts[chr][bin][1]++; if (counts[chr][bin][1]==0) counts[chr][bin][1]--;
	  }
	}
      }
    }
    
    bam_destroy1(aln);
    free(tid_to_chr);
    bam_hdr_destroy(hdr);
    sam_close(fin);
    
    /***** output data for selected bins + regions *****/

#pragma omp parallel for schedule(dynamic) num_threads(threads)
    for (int chr = 1; chr <= 24; chr++) {
      if (!binsPerChr[chr].empty()) {
	foutBinsGz[chr].write((const char *) &ID, sizeof(int));
	int Mchr = binsPerChr[chr].size();
	for (int i = 0; i < Mchr; i++)
	  binCounts[chr][i] = counts[chr][binsPerChr[chr][i].startBin][1];
	foutBinsGz[chr].write((const char *) binCounts[chr], Mchr * sizeof(binCounts[chr][0]));
      }
      if (!regionsPerChr[chr].empty()) {
	foutRegionsGz[chr].write((const char *) &ID, sizeof(int));
	int Mchr = regionsPerChr[chr].size();
	for (int i = 0; i < Mchr; i++) {
	  const Region &region = regionsPerChr[chr][i];
	  for (int bin = region.startBin; bin < region.endBin; bin++)
	    regionCounts[chr][i] += counts[chr][bin][region.minMAPQ];
	}
	foutRegionsGz[chr].write((const char *) regionCounts[chr],
				 Mchr * sizeof(regionCounts[chr][0]));
      }
    }

    gettimeofday(&tv, NULL); double tCur = tv.tv_sec + tv.tv_usec*1e-6;
    cout << tCur-tPrev << " sec" << endl;
    tPrev = tCur;
  }

  // free buffers; close output files
  for (int chr = 1; chr <= 24; chr++) {
    if (!binsPerChr[chr].empty()) {
      delete[] binCounts[chr];
      foutBinsGz[chr].reset();
    }
    if (!regionsPerChr[chr].empty()) {
      delete[] regionCounts[chr];
      foutRegionsGz[chr].reset();
    }
  }

  return 0;
}
