// g++ -O3 -fopenmp -Wall -static-libgcc -static-libstdc++ computeSDscales.cpp -o computeSDscales -I/n/groups/price/poru/HSPH_SVN/src/EAGLE -I/home/pl88/boost_1_58_0/install/include -L/n/groups/price/poru/external_software/libstdc++/usr/lib/gcc/x86_64-redhat-linux/4.8.5/ -L/n/groups/price/poru/external_software/zlib/zlib-1.2.11 -L/home/pl88/boost_1_58_0/install/lib -Wl,-Bstatic -lboost_iostreams -lz

#include <iostream>
#include <vector>
#include <string>
#include <set>
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cassert>
#include <cmath>

#include <boost/math/distributions/negative_binomial.hpp>

#include "omp.h"

#include "Types.hpp"
#include "FileUtils.cpp"
#include "StringUtils.cpp"
#include "Timer.cpp"

using namespace std;

#define RC_QUANTILES 10

// One bin record as stored in the binary baselineScale files. main() reads an
// array of these straight from disk with fread, so the field order and types
// define the on-disk layout and must not change.
struct Region {
  int start38;  // bin start; matched against the starts listed in the noCommonSV file
  int end38;    // bin end (not used in this file)
  int minMAPQ;  // not used in this file; presumably a mapping-quality threshold -- TODO confirm
};

// Returns the square of x, computed in single precision.
inline float sq(float x) {
  const float squared = x * x;
  return squared;
}

double negLogLc(const vector <float> &expectRCs, const vector <ushort> &RCs,
		const vector <float> &theta0s, float cInv) {
  const float clipProb = 1e-4; // for robust estimation (M-estimator?)
  double negLogLsum = 0;
  int Nbins = RCs.size();
  for (int i = 0; i < Nbins; i++) {
    float theta0 = theta0s[i];
    float r = 1 / (theta0 * cInv);
    float theta = expectRCs[i] * theta0 * cInv;
    float p = 1 / (1+theta);
    float prob =
      boost::math::pdf(boost::math::negative_binomial_distribution <float> (r, p), RCs[i]);
    negLogLsum += -logf(max(prob, clipProb));
  }
  return negLogLsum;
}


int main(int argc, char *argv[]) {

  if (argc != 6) {
    cerr << "Usage:" << endl;
    cerr << "- arg1 = format string for binary baselineScale files" << endl;
    cerr << "- arg2 = format string for binary read count files" << endl;
    cerr << "- arg3 = file listing noCommonSV regions" << endl;
    cerr << "- arg4 = threads" << endl;
    cerr << "- arg5 = output file (ID stdScale per-decile-cInv)" << endl;
    exit(1);
  }
  const char *baselineScaleFormatStr = argv[1];
  const char *RCsFormatStr = argv[2];
  const char *noCommonSVregionsFile = argv[3];
  int threads; sscanf(argv[4], "%d", &threads);
  const char *outFile = argv[5];
  const float coeffVarBaselineRCthresh = 0.2f;

  Timer timer;

  cout << "Setting number of threads to " << threads << endl << endl;
  omp_set_num_threads(threads);

  // read list of noCommonSV bins to use for std scale + negative binomial shape calibration
  set < pair <int, int> > noCommonSVchrStarts;
  {
    FileUtils::AutoGzIfstream finRegions; finRegions.openOrExit(noCommonSVregionsFile);
    int chr, start, end; double sd_highCov_50K, sd_highCov_150K;
    while (finRegions >> chr >> start >> end >> sd_highCov_50K >> sd_highCov_150K) {
      assert(end == start+100);
      noCommonSVchrStarts.insert(make_pair(chr, start));
    }
  }
  cout << "Calibrating s.d. using " << noCommonSVchrStarts.size()
       << " bins annotated as noCommonSV" << endl;

  // open per-chr files; read headers
  FILE *finBaselineScales[23], *finRCs[23];
  uint64 Rchr[23];
  vector < vector <bool> > isNoCommonSVbaselineCVok(23);
  vector < vector <float> > meanRCs(23);
  vector < vector <float> > sdNormRCsInv(23);
  vector < vector <float> > meanBaselineRCs(23);
  vector < vector <float> > theta0s(23);
  vector < vector <uchar> > baselineScales(23);
  vector < vector <ushort> > RCs(23);
  vector <float> meanRCsAllChr;
  int ctrNoCommonSV = 0, ctrRkept = 0;
  for (int chr = 1; chr <= 22; chr++) {
    char buf[1000]; sprintf(buf, baselineScaleFormatStr, chr);
    finBaselineScales[chr] = fopen(buf, "rb"); assert(finBaselineScales[chr] != NULL);
    uint64 R;
    fread(&R, sizeof(R), 1, finBaselineScales[chr]);
    Rchr[chr] = R;

    isNoCommonSVbaselineCVok[chr].resize(R);
    meanRCs[chr].resize(R);
    sdNormRCsInv[chr].resize(R);
    meanBaselineRCs[chr].resize(R);
    baselineScales[chr].resize(R);
    theta0s[chr].resize(R);
    RCs[chr].resize(R);
    vector <Region> regions(R);
    vector <float> sdNormRCs(R), coeffVarBaselineRCs(R);

    fread(&regions[0], sizeof(regions[0]), R, finBaselineScales[chr]);
    fread(&meanRCs[chr][0], sizeof(meanRCs[chr][0]), R, finBaselineScales[chr]);
    fread(&sdNormRCs[0], sizeof(sdNormRCs[0]), R, finBaselineScales[chr]);
    fread(&meanBaselineRCs[chr][0], sizeof(meanBaselineRCs[chr][0]), R, finBaselineScales[chr]);
    fread(&coeffVarBaselineRCs[0], sizeof(coeffVarBaselineRCs[0]), R, finBaselineScales[chr]);
    fread(&theta0s[chr][0], sizeof(theta0s[chr][0]), R, finBaselineScales[chr]);
    
    meanRCsAllChr.insert(meanRCsAllChr.end(), meanRCs[chr].begin(), meanRCs[chr].end());

    for (uint r = 0; r < R; r++) {
      bool isNoCommonSVregion = noCommonSVchrStarts.count(make_pair(chr, regions[r].start38));
      ctrNoCommonSV += isNoCommonSVregion;
      isNoCommonSVbaselineCVok[chr][r] = isNoCommonSVregion
	&& coeffVarBaselineRCs[r] < coeffVarBaselineRCthresh;
      ctrRkept += isNoCommonSVbaselineCVok[chr][r];
      sdNormRCsInv[chr][r] = 1 / sdNormRCs[r];
    }
    cout << "chr" << chr << ": " << R << " bins" << endl;

    sprintf(buf, RCsFormatStr, chr);
    finRCs[chr] = fopen(buf, "rb"); assert(finRCs[chr] != NULL);
  }
  cout << "Found " << ctrNoCommonSV << " bins from noCommonSV list in normRC data" << endl;
  cout << "Kept " << ctrRkept << " bins with coeffVar(baselineRC) < "
       << coeffVarBaselineRCthresh << endl;
  
  // write header line 1 of output file
  FileUtils::AutoGzOfstream fout; fout.openOrExit(outFile);
  fout << "ID\tstd_scale";
  for (int q = 0; q < RC_QUANTILES; q++)
    fout << "\tRC_q" << q;
  fout << endl;

  // write header line 2 of output file = upper limit of each meanRC quantile
  fout << "0\t0";
  sort(meanRCsAllChr.begin(), meanRCsAllChr.end());
  vector <float> meanRCquantilesLo(RC_QUANTILES+1);
  for (int q = 1; q <= RC_QUANTILES; q++) {
    meanRCquantilesLo[q] = meanRCsAllChr[q*meanRCsAllChr.size()/RC_QUANTILES - 1];
    fout << "\t" << meanRCquantilesLo[q];
  }
  fout << endl;

  // assign bins to mean read count quantiles
  vector < vector <char> > quantileAssign(23);
  vector <int> quantileFreqs(RC_QUANTILES);
  for (int chr = 1; chr <= 22; chr++) {
    quantileAssign[chr].resize(Rchr[chr]);
    for (uint r = 0; r < Rchr[chr]; r++)
      for (int q = RC_QUANTILES-1; q >= 0; q--)
	if (meanRCs[chr][r] > meanRCquantilesLo[q]) {
	  quantileAssign[chr][r] = q;
	  quantileFreqs[q]++;
	  break;
	}
  }
  cout << "meanRCquantilesLo:" << endl;
  for (int q = 1; q <= RC_QUANTILES; q++)
    cout << q << "/" << RC_QUANTILES << ": " << meanRCquantilesLo[q] << " count="
	 << quantileFreqs[q-1] << endl;

  // compute calibration; write data lines of outputfile
  vector <float> zAbs(ctrRkept);
  vector < vector <ushort> > RCvecs(RC_QUANTILES);
  vector < vector <float> > expectRCvecs(RC_QUANTILES), theta0vecs(RC_QUANTILES);
  int ID; float coverageScale;
  int ctr = 0;
  while (fread(&ID, sizeof(ID), 1, finBaselineScales[1])) {
    fread(&coverageScale, sizeof(coverageScale), 1, finBaselineScales[1]);
    for (int q = 0; q < RC_QUANTILES; q++) {
      RCvecs[q].clear();
      expectRCvecs[q].clear();
      theta0vecs[q].clear();
    }
    for (int chr = 2; chr <= 22; chr++) {
      int IDchr; fread(&IDchr, sizeof(IDchr), 1, finBaselineScales[chr]);
      assert(ID==IDchr);
      float covScaleChr; fread(&covScaleChr, sizeof(covScaleChr), 1, finBaselineScales[chr]);
      assert(coverageScale==covScaleChr);
    }
    ctrRkept = 0;
    for (int chr = 1; chr <= 22; chr++) {
      assert(fread(&baselineScales[chr][0], sizeof(baselineScales[chr][0]), Rchr[chr],
		   finBaselineScales[chr]) == Rchr[chr]);
      assert(fread(&RCs[chr][0], sizeof(RCs[chr][0]), Rchr[chr], finRCs[chr]) == Rchr[chr]);
      for (uint r = 0; r < Rchr[chr]; r++)
	if (isNoCommonSVbaselineCVok[chr][r]) {
	  ushort RC = RCs[chr][r];
	  float expectRC = coverageScale * baselineScales[chr][r] * meanBaselineRCs[chr][r] / 128;
	  float normRC = RC / expectRC;
	  zAbs[ctrRkept] = fabsf((normRC-1) * sdNormRCsInv[chr][r]);
	  int q = quantileAssign[chr][r];
	  RCvecs[q].push_back(RC);
	  expectRCvecs[q].push_back(expectRC);
	  theta0vecs[q].push_back(theta0s[chr][r]);
	  ctrRkept++;
	}
    }
    // std_scale calibration of Gaussian (MAD-based)
    nth_element(zAbs.begin(), zAbs.begin() + ctrRkept/2, zAbs.end());
    fout << ID << "\t" << zAbs[ctrRkept/2] / 0.67449;

    // calibration of negative Binomial (per-meanRC quantile)
    float cInvOpts[RC_QUANTILES];
#pragma omp parallel for schedule(dynamic)
    for (int q = 0; q < RC_QUANTILES; q++) {

      const float cInvMin = 0.01f, cInvMax = 100;

      // quadratic iteration
      vector < pair <double, float> > yx;
      float xCur;
      for (xCur = 0.5f; xCur <= 1.5f; xCur += 0.5f)
        yx.push_back(make_pair(negLogLc(expectRCvecs[q], RCvecs[q], theta0vecs[q], xCur), xCur));
      float xLast = 1;
      while (true) {
	sort(yx.begin(), yx.end());
	float x[3] = {yx[0].second, yx[1].second, yx[2].second};
	double y[3] = {yx[0].first, yx[1].first, yx[2].first};
	xCur =
	  - (sq(x[2]) * (y[0] - y[1]) + sq(x[1]) * (y[2] - y[0]) + sq(x[0]) * (y[1] - y[2]))
	  / (2 * (x[2] * (y[1] - y[0]) + x[1] * (y[0] - y[2]) + x[0] * (y[2] - y[1])));
	xCur = min(max(xCur, cInvMin), cInvMax);

	if (isnan(xCur)) xCur = xLast; // error (usually same value used twice) => take last est
	if ((fabsf(xCur-xLast)<0.01f || fabsf(xCur/xLast-1)<0.01f) && yx.size() != 3) // converged
	  break;
	if (yx.size() == 20) // ran out of iters; use best attempt
	  xCur = yx[0].second;
	yx.push_back(make_pair(negLogLc(expectRCvecs[q], RCvecs[q], theta0vecs[q], xCur), xCur));
	xLast = xCur;
      }

      cInvOpts[q] = xCur;
    }
    for (int q = 0; q < RC_QUANTILES; q++)
      fout << "\t" << cInvOpts[q];
    fout << endl;

    ctr++;
    if (ctr % 100 == 0) cout << "." << flush;
  }
  fout.close();

  for (int chr = 1; chr <= 22; chr++) {
    fclose(finBaselineScales[chr]);
    fclose(finRCs[chr]);
  }

  cout << "\nTime for calibration: " << timer.update_time() << " sec" << endl;

  return 0;
}
