#include <iostream>
#include <iomanip>
#include <sstream>
#include <vector>
#include <string>
#include <set>
#include <queue>
#include <utility>
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cassert>
#include <cmath>

#include "omp.h"

#include "VersionHeader.hpp"
#include "AssignBatches.hpp"
#include "FileUtils.cpp"
#include "StringUtils.cpp"
#include "Timer.cpp"

using namespace std;

// Sentinel value of GenoInfo::lrr / GenoInfo::theta: computeBFs() writes zero log BFs when
// either field equals it.
const char NAN_CHAR = -128;
// Marker standard deviation: readClusterData() treats a cluster whose SigmaXX equals
// INF_STD*INF_STD as "ignore the x-coordinate (theta)".
const double INF_STD = 1000;

// Per-sample record for one SNP. main() fread()s arrays of these straight from the
// LRR/theta/genotype binary file, so field order and widths must stay as they are.
struct GenoInfo {
  char lrr;   // y-coordinate in computeBFs(); NAN_CHAR is checked there as a sentinel
  char theta; // x-coordinate in computeBFs(); NAN_CHAR is checked there as a sentinel
  unsigned char geno: 2; // 2-bit field; not read by the code visible in this file
  unsigned char conf: 6; // 6-bit field; not read by the code visible in this file
};

// One 2-D Gaussian cluster in (x, y) = (theta, lrr) space, stored in the form needed to
// evaluate its log-density: the mean, log|Sigma|, and the entries of Sigma^{-1}.
// The defaults describe a zero-mean cluster with identity inverse covariance.
struct Cluster {
  double muX, muY;
  double logdetSigma, SigmaInvXX, SigmaInvYY, SigmaInvXY;

  Cluster(double _muX=0, double _muY=0, double _logdetSigma=0,
	  double _SigmaInvXX=1, double _SigmaInvYY=1, double _SigmaInvXY=0)
    : muX(_muX),
      muY(_muY),
      logdetSigma(_logdetSigma),
      SigmaInvXX(_SigmaInvXX),
      SigmaInvYY(_SigmaInvYY),
      SigmaInvXY(_SigmaInvXY)
  {}
};

// Cluster table for one (SNP, batch) pair, indexed c[CN][geno].
// readClusterData() fills CN=1..3 with geno=0..CN and additionally stores the CN=0 cluster
// in the otherwise unused slot c[1][2]; the remaining entries stay default-constructed.
struct ClusterBatch {
  Cluster c[4][4];
};

// Returns the square of x.
inline double sq(double x) {
  const double squared = x * x;
  return squared;
}

// Reads the predicted CNV genotype clusters for batches 1..B from
// "<predClusterPrefix>.batch<b>.txt.gz" and converts every covariance matrix into the
// (log-determinant, inverse) form stored in Cluster.
//
// For each of the M SNPs a file provides one record per (CN=1..3, geno=0..CN) of the form
//   g muX muY SigmaXX SigmaYY SigmaXY
// followed by three integers (flipType, chr, snpNum).
//
// chr:     output; receives the chr field of the last SNP record read
// returns: clusters[m][batch-1] for m in [0,M), batch in [1,B]
//
// Exits with status 1 if a record cannot be parsed or if data remains after the M-th SNP.
// (Previously a failed extraction went unnoticed and the only end-of-file check lived
// inside an assert, which passes trivially on a failed stream and disappears under NDEBUG.)
vector < vector <ClusterBatch> > readClusterData(int &chr, const string &predClusterPrefix, int M,
						 int B) {

  vector < vector <ClusterBatch> > clusters(M, vector <ClusterBatch> (B));

  for (int batchNum = 1; batchNum <= B; batchNum++) {
    string predClusterFile = predClusterPrefix + ".batch" + StringUtils::itos(batchNum) + ".txt.gz";
    cout << "Reading predicted CNV genotype clusters: " << predClusterFile << endl;
    FileUtils::AutoGzIfstream finClusters; finClusters.openOrExit(predClusterFile);
    for (int m = 0; m < M; m++) {
      for (int CN = 1; CN <= 3; CN++)
	for (int geno = 0; geno <= CN; geno++) {
	  int g; double muX, muY, SigmaXX, SigmaYY, SigmaXY;
	  if (!(finClusters >> g >> muX >> muY >> SigmaXX >> SigmaYY >> SigmaXY)) {
	    cerr << "ERROR: failed to read cluster record (SNP index " << m << ", CN=" << CN
		 << ", geno=" << geno << ") from " << predClusterFile << endl;
	    exit(1);
	  }
	  // invert the 2x2 covariance matrix
	  double det = SigmaXX*SigmaYY - sq(SigmaXY);
	  double logdetSigma = log(det);
	  double SigmaInvXX = 1/det * SigmaYY;
	  double SigmaInvYY = 1/det * SigmaXX;
	  double SigmaInvXY = 1/det * -SigmaXY;
	  if (SigmaXX == INF_STD*INF_STD) { // ignore x-coordinate (theta)
	    logdetSigma = log(SigmaYY);
	    SigmaInvXX = 0;
	    SigmaInvYY = 1/SigmaYY;
	    SigmaInvXY = 0;
	  }
	  Cluster c(muX, muY, logdetSigma, SigmaInvXX, SigmaInvYY, SigmaInvXY);
	  assert(CN==2 || g==geno);
	  clusters[m][batchNum-1].c[CN][geno] = c;
	}
      // CN=0 (treat as CN=1 geno=2)
      Cluster &c0 = clusters[m][batchNum-1].c[1][2];
      c0 = clusters[m][batchNum-1].c[2][1];
      c0.muY -= 100; // shift CN=2 down 100 units
      c0.logdetSigma += log(4); // expand dimensions by sqrt(2)
      c0.SigmaInvXX *= 0.5;
      c0.SigmaInvYY *= 0.5;
      c0.SigmaInvXY *= 0.5;
      // last 3 columns: only chr is kept
      int flipType, snpNum;
      if (!(finClusters >> flipType >> chr >> snpNum)) {
	cerr << "ERROR: failed to read trailing columns (SNP index " << m << ") from "
	     << predClusterFile << endl;
	exit(1);
      }
    }
    // anything left after the M-th SNP means the file does not match the bim file
    string str;
    if (finClusters >> str) {
      cerr << "ERROR: unexpected data after " << M << " SNPs in " << predClusterFile << endl;
      exit(1);
    }
  }
  return clusters;
}

// Computes the scaled log Bayes factors of one sample at one SNP.
// output: logBFs[0] = CN=1 vs. CN=2, logBFs[1] = CN=3 vs. CN=2, each clamped to
//         [-127,127] and truncated to an integer; both are 0 when lrr or theta is NAN_CHAR
// b is the 0-based batch index into cb; relScaleInv scales the distance to each cluster.
void computeBFs(char *logBFs, const GenoInfo &gInfo, int b, double relScaleInv,
		const vector <ClusterBatch> &cb, bool clipAbsFlag, double logBFscale) {
  const char lrr = gInfo.lrr;
  const char theta = gInfo.theta;

  // missing measurement: no evidence either way
  if (lrr == NAN_CHAR || theta == NAN_CHAR) {
    logBFs[0] = 0;
    logBFs[1] = 0;
    return;
  }

  const double logBFoffset = logBFscale * -log(2*M_PI/relScaleInv);

  // best (largest) scaled log-likelihood over the clusters of each copy number
  double bestForCN[4];
  for (int CN = 1; CN <= 3; CN++) {
    double best = -1e9;
    const int lastGeno = max(CN, 2); // "CN=1 geno=2" = CN=0 cluster
    for (int geno = 0; geno <= lastGeno; geno++) {
      const Cluster &c = cb[b].c[CN][geno];
      double dy = lrr - c.muY;
      double dx = theta - c.muX;
      double scaledLogLik = logBFoffset +
	logBFscale * -0.5 * (c.logdetSigma + sq(relScaleInv) *
			     (sq(dx)*c.SigmaInvXX + 2*dx*dy*c.SigmaInvXY + sq(dy)*c.SigmaInvYY));
      if (clipAbsFlag)
	scaledLogLik = max(-127.0, scaledLogLik);
      if (scaledLogLik > best)
	best = scaledLogLik;
    }
    bestForCN[CN] = best;
  }

  // Bayes factors relative to CN=2, squeezed into a char
  logBFs[0] = (int) min(127.0, max(-127.0, bestForCN[1]-bestForCN[2]));
  logBFs[1] = (int) min(127.0, max(-127.0, bestForCN[3]-bestForCN[2]));
}


const int maxNumLong = 10; // maximum number of IBD neighbors
// Timing accumulators: every runHMM() call adds its elapsed time per phase (IBD match
// extraction, Bayes factor combination, HMM) under "#pragma omp atomic".
double tIBD, tBF, tHMM;

// One IBD match: the matching haplotype and the SNP index range it covers (mEnd is
// exclusive, see Match_cM and extractMatchArrays). main() fread()s arrays of these from
// the IBD file, so the member order must not change.
struct Match {
  int hap;
  ushort mStart, mEnd;

  Match(int _hap=0, ushort _mStart=0, ushort _mEnd=0)
    : hap(_hap), mStart(_mStart), mEnd(_mEnd) {}

  // endOpp_hap_start order: a LARGER mEnd compares as smaller, so a std::priority_queue
  // (a max-heap) keeps the match with the earliest end on top; ties fall back to hap and
  // then mStart, both ascending.
  bool operator < (const Match &match) const {
    if (mEnd > match.mEnd) return true;
    if (mEnd < match.mEnd) return false;
    if (hap < match.hap) return true;
    if (hap > match.hap) return false;
    return mStart < match.mStart;
  }
};
// Sort predicate: ascending mStart, then DESCENDING mEnd, then ascending hap.
bool comp_start_endOpp_hap(const Match &match1, const Match &match2) {
  if (match1.mStart < match2.mStart) return true;
  if (match1.mStart > match2.mStart) return false;
  if (match1.mEnd > match2.mEnd) return true;
  if (match1.mEnd < match2.mEnd) return false;
  return match1.hap < match2.hap;
}

// A Match together with its genetic length, cMvec[mEnd-1] - cMvec[mStart].
struct Match_cM {
  Match match;
  double cMlen;

  Match_cM(const Match &_match, const vector <double> &cMvec)
    : match(_match),
      cMlen(cMvec[_match.mEnd-1]-cMvec[_match.mStart]) {}

  // sort order for active set: longest match first, ties resolved by Match::operator<
  bool operator < (const Match_cM &m) const {
    const bool sameLength = (cMlen == m.cMlen);
    return sameLength ? (match < m.match) : (cMlen > m.cMlen);
  }
};

// An IBD neighbor haplotype and the weight computed for it by prob_lt_T_gen().
struct MatchWeight {
  int hap;
  double prob_lt_T_gen;

  MatchWeight(int _hap=0, double _prob_lt_T_gen=0)
    : hap(_hap),
      prob_lt_T_gen(_prob_lt_T_gen)
  {}
};

// One interval of SNP indices together with the IBD neighbors that extractMatchArrays()
// selected for it.
struct MatchArray {
  int mStart, mEnd; // half-open interval: runHMM() tests mStart <= m && m < mEnd
  vector <MatchWeight> mList; // selected neighbors, in the order they were picked
};

// Weight of an IBD match spanning SNPs [mStart, mEnd): with len = match length in Morgans
// (cMvec is in cM) and x = 2*len*IBDparam, returns 1 - exp(-x)*(1 + x + x^2/2).
// Returns NAN when IBDparam <= 0 (the weight is not used in that mode).
double prob_lt_T_gen(int mStart, int mEnd, const vector <double> &cMvec, int IBDparam) {
  if (IBDparam <= 0)
    return NAN;

  const int genT = IBDparam;
  const double len = 0.01 * (cMvec[mEnd-1] - cMvec[mStart]);
  const double decay = exp(-2*len*genT);
  const double series = 1+2*len*genT+0.5*sq(2*len*genT);
  return 1-decay*series;
}

// note: matchData for haplotype h is sorted by this function
vector <MatchArray> extractMatchArrays(const vector <double> &cMvec,
				       const vector <uint64> &blockStarts, Match *matchData,
				       int h, int IBDparam) {

  vector <MatchArray> matchArrays;

  // read matches
  int matchesN = blockStarts[h+1] - blockStarts[h];
  Match *matchesMerged = matchData + blockStarts[h];

  // prune to top maxNumLong matches per position
  priority_queue <Match> activeMatchPQ;
  set <Match_cM> activeMatches;
  set <Match> usedMatches;
  sort(matchesMerged, matchesMerged + matchesN, comp_start_endOpp_hap); // sort by start
  int jMerged = 0; // position in matchesMerged list
  int mPrev = 0;
  while (jMerged < matchesN || !activeMatchPQ.empty()) {
    int earliestActiveEnd = activeMatchPQ.empty() ? (1<<30) : (int) activeMatchPQ.top().mEnd;
    int incomingStart = (jMerged < matchesN) ? (int) matchesMerged[jMerged].mStart : (1<<30);
      
    int mFirst = min(earliestActiveEnd, incomingStart);
    if (mFirst > mPrev) {
      MatchArray matchArray;
      matchArray.mStart = mPrev;
      matchArray.mEnd = mFirst;
      int topCtr = 0;
      set <int> usedInds; usedInds.insert(h/2); // don't use HBD matches (note: set is a bit slow)
      for (set <Match_cM>::iterator it = activeMatches.begin();
	   it != activeMatches.end() && topCtr < maxNumLong; it++)
	if (!usedInds.count(it->match.hap/2)) { // don't use an individual twice
	  usedInds.insert(it->match.hap/2);
	  topCtr++;
	  matchArray.mList.push_back(MatchWeight(it->match.hap,
						 prob_lt_T_gen(it->match.mStart, it->match.mEnd,
							       cMvec, IBDparam)));
	}
      matchArrays.push_back(matchArray);
    }

    // pop all matches with earliestActiveEnd; delete them from activeMatches set
    if (earliestActiveEnd <= incomingStart) {
      while (!activeMatchPQ.empty() && activeMatchPQ.top().mEnd == earliestActiveEnd) {
	activeMatches.erase(Match_cM(activeMatchPQ.top(), cMvec));
	activeMatchPQ.pop();
      }
    }
    // add all matches with incomingStart to active set
    if (incomingStart <= earliestActiveEnd) {
      while (jMerged < matchesN && matchesMerged[jMerged].mStart==incomingStart) {
	activeMatchPQ.push(matchesMerged[jMerged]);
	activeMatches.insert(Match_cM(matchesMerged[jMerged], cMvec));
	jMerged++;
      }
    }
    mPrev = mFirst;
  }
  assert(matchArrays[0].mStart==0);

  return matchArrays;
}

// Fills logPtrans[m][from][to] with the log transition probability between copy-number
// states (index 0: CN=1, 1: CN=2, 2: CN=3) for arriving at SNP m from SNP m-1.
// bps holds the base-pair position of each of the M SNPs; for m=0 a distance of 1e9 is used.
void computeTransitionProbs(double (*logPtrans)[3][3], const vector <int> &bps, int M) {

  const double Ddel = 100*1e3, Ddup = 100*1e3, D = 150e6; // typical lengths of del, dup, CN=2
  const double avgNum_del = 15, avgNum_dup = 5; // typical numbers of del and dup calls
  const double flipP = 1e-4; // del->dup and dup->del relative probability (assuming transitioning out)
  const int M_allchr = 784256;

  for (int m = 0; m < M; m++) {
    double di = m>0 ? bps[m] - bps[m-1] : 1e9;

    double P[3][3]; // [from][to], linear scale

    // leaving a deletion
    const double stayDel = exp(-di/Ddel);
    const double leaveDel = 1-stayDel;
    P[0][0] = stayDel;
    P[0][1] = leaveDel*(1-flipP);
    P[0][2] = leaveDel*flipP;

    // leaving CN=2: capped by the expected number of calls per SNP
    const double leave_del = min(1-exp(-di/D), avgNum_del/M_allchr);
    const double leave_dup = min(1-exp(-di/D), avgNum_dup/M_allchr);
    P[1][0] = leave_del;
    P[1][1] = 1-leave_del-leave_dup;
    P[1][2] = leave_dup;

    // leaving a duplication
    const double stayDup = exp(-di/Ddup);
    const double leaveDup = 1-stayDup;
    P[2][0] = leaveDup*flipP;
    P[2][1] = leaveDup*(1-flipP);
    P[2][2] = stayDup;

    for (int from = 0; from < 3; from++)
      for (int to = 0; to < 3; to++)
	logPtrans[m][from][to] = log(P[from][to]);
  }
}

bool maskNearbySNP(const uint64 *maskBits, int M, int m, int i) {
  uint64 x = i * (uint64) M + m;
  return (maskBits[x>>6ULL]>>(x&63ULL))&1;
}

// Calls deletions and duplications on haplotype h with a 3-state Viterbi HMM
// (state 0 = del / CN=1, state 1 = CN=2, state 2 = dup / CN=3).
//
// Per SNP, the emission for del and dup is the individual's own log BF (undoing the char
// scaling with logBFscaleInv) plus the log BFs of its IBD neighbors at that SNP:
//   IBDparam < 0: the first -IBDparam neighbors, weight 1 each
//   IBDparam > 0: all selected neighbors, each weighted by its prob_lt_T_gen value
//   IBDparam = 0: no neighbors
// Samples flagged by maskNearbySNP() contribute nothing at that SNP.
//
// logPtrans:  log transition probabilities [m][from][to] (see computeTransitionProbs)
// logBFs:     indiv-major array, element [i*2*M + 2*m + c] with c = 0 (del) / 1 (dup)
// H:          number of haplotypes; not needed here, kept so callers stay unchanged
// M:          number of SNPs
// returns:    one tab-separated line per called segment, in increasing position order:
//             ID, type, chr, bp start, bp end, length (kb), probe start, probe end,
//             summed unmasked fraction, LOD, LOD from neighbors only, haplotype (1/2),
//             mean neighbor usage. Empty string when nothing is called or M <= 0.
//
// Adds its per-phase timings to the globals tIBD, tBF and tHMM (atomically).
string runHMM(const double (*logPtrans)[3][3], const vector <int> &bps,
	      const vector <double> &cMvec, const vector <uint64> &blockStarts, Match *matchData,
	      const char *logBFs, double logBFscaleInv, const uint64 *maskBits, int H, int M,
	      int IBDparam, int h, const string &ID, int chr) {

  (void) H;
  if (M <= 0) return string(); // no probes: the arrays below would be indexed out of range

  Timer timer;

  vector <MatchArray> matchArrays = extractMatchArrays(cMvec, blockStarts, matchData, h, IBDparam);

#pragma omp atomic
  tIBD += timer.update_time();

  // compute and store combined, rescaled Bayes factors for CN=1 vs. 2 and CN=3 vs. 2
  double (*logBFsIBD)[2] = new double[M][2]; // [m][0=del,1=dup]
  double (*logBFsIndiv)[2] = new double[M][2]; // [m][0=del,1=dup]
  double *usedIBD = new double[M], *unmaskedFrac = new double[M];
  int pos = 0;
  for (int m = 0; m < M; m++) {
    // update current index into matchArrays
    while (pos < (int) matchArrays.size()
	   && !(matchArrays[pos].mStart <= m && m < matchArrays[pos].mEnd))
      pos++;

    // set logBFs for current individual; undo scaling to char [-127,127]
    usedIBD[m] = unmaskedFrac[m] = 0;
    if (!maskNearbySNP(maskBits, M, m, h/2)) {
      for (int c = 0; c < 2; c++)
	logBFsIndiv[m][c] = logBFs[(h/2)*2LL*M + 2*m + c] * logBFscaleInv;
      unmaskedFrac[m]++;
    }
    else
      logBFsIndiv[m][0] = logBFsIndiv[m][1] = 0;

    // initialize combined logBFs (indiv + IBD neighbors) to logBFsIndiv
    for (int c = 0; c < 2; c++)
      logBFsIBD[m][c] = logBFsIndiv[m][c];

    // add in logBFs of neighbors, weighting according to nbrCount/genThresh
    if (pos < (int) matchArrays.size()) { // haven't overshot last IBD match
      const MatchArray &matchArray = matchArrays[pos];
      if (IBDparam < 0) {
	for (int k = 0; k < -IBDparam && k < (int) matchArray.mList.size(); k++) {
	  int iPhased = matchArray.mList[k].hap/2;
	  if (!maskNearbySNP(maskBits, M, m, iPhased)) {
	    for (int c = 0; c < 2; c++)
	      logBFsIBD[m][c] += logBFs[iPhased*2LL*M + 2*m + c] * logBFscaleInv;
	    unmaskedFrac[m]++;
	  }
	  usedIBD[m]++;
	}
      }
      else if (IBDparam > 0) { // P(IBD>generation) weighting
	for (int k = 0; k < (int) matchArray.mList.size(); k++) {
	  int iPhased = matchArray.mList[k].hap/2;
	  if (!maskNearbySNP(maskBits, M, m, iPhased)) {
	    for (int c = 0; c < 2; c++)
	      logBFsIBD[m][c] += logBFs[iPhased*2LL*M + 2*m + c] * logBFscaleInv *
		matchArray.mList[k].prob_lt_T_gen;
	    unmaskedFrac[m] += matchArray.mList[k].prob_lt_T_gen;
	  }
	  usedIBD[m] += matchArray.mList[k].prob_lt_T_gen;
	}
      }
    }
    // fraction of the total weight (self + neighbors) that was not masked
    unmaskedFrac[m] /= (1+usedIBD[m]);
  }

#pragma omp atomic
  tBF += timer.update_time();

  double (*cumLogP)[3] = new double[M][3];
  char (*prev)[3] = new char[M][3]; // prev[0] is never written nor read
  //const double logPjump = log(1e-3);

  // initialize: enter SNP 0 from the CN=2 state
  int m = 0;
  cumLogP[m][0] = logPtrans[m][1][0] + logBFsIBD[m][0]; // del
  cumLogP[m][1] = logPtrans[m][1][1]; // CN=2
  cumLogP[m][2] = logPtrans[m][1][2] + logBFsIBD[m][1]; // dup
  // iterate
  for (m = 1; m < M; m++) {
    // transition; set prev
    for (int s = 0; s <= 2; s++) // cur state
      for (int t = 0; t <= 2; t++) // prev state
	if (t==0 || (cumLogP[m][s] < cumLogP[m-1][t] + logPtrans[m][t][s])) {
	  cumLogP[m][s] = cumLogP[m-1][t] + logPtrans[m][t][s];
	  prev[m][s] = t;
	}
    // emission
    cumLogP[m][0] += logBFsIBD[m][0]; // del
    cumLogP[m][2] += logBFsIBD[m][1]; // dup
  }
  // finalize: penalty for ending in del/dup state
  //cumLogP[m][0] += logPjump;
  //cumLogP[m][2] += logPjump;
  m = M-1;

  // backtrack from the best final state, collecting maximal non-CN=2 runs
  int s = 0;
  if (cumLogP[m][1] > cumLogP[m][s]) s = 1;
  if (cumLogP[m][2] > cumLogP[m][s]) s = 2;
  int mSegEnd = m;
  vector < pair <int, int> > CNVsegs; // inclusive
  vector <bool> states; // 0=del, 1=dup
  while (m > 0) {
    if (prev[m][s] != s) {
      if (s != 1) { CNVsegs.push_back(make_pair(m, mSegEnd)); states.push_back(s/2); }
      mSegEnd = m-1;
      s = prev[m][s];
    }
    m--;
  }
  if (s != 1) { CNVsegs.push_back(make_pair(m, mSegEnd)); states.push_back(s/2); }

#pragma omp atomic
  tHMM += timer.update_time();

  // print output (segments were collected right-to-left, so walk them backwards)
  ostringstream oss;
  oss << std::fixed;

  for (int k = CNVsegs.size()-1; k >= 0; k--) {
    int mStart = CNVsegs[k].first, mEnd = CNVsegs[k].second, state = states[k];
    // trim fully masked probes off both ends without ever leaving the segment
    while (mStart < mEnd && unmaskedFrac[mStart]==0) mStart++;
    while (mEnd > mStart && unmaskedFrac[mEnd]==0) mEnd--;
    double logBF = 0, logBFindiv = 0, meanUsedIBD = 0, sumUnmaskedFrac = 0;
    for (int mm = mStart; mm <= mEnd; mm++) {
      logBF += logBFsIBD[mm][state];
      logBFindiv += logBFsIndiv[mm][state];
      meanUsedIBD += usedIBD[mm];
      sumUnmaskedFrac += unmaskedFrac[mm];
    }
    meanUsedIBD /= (mEnd-mStart+1);
    const char types[2][4] = {"DEL", "DUP"};
    oss << ID << "\t" << types[state] << "\t"
	<< chr << "\t" << bps[mStart] << "\t" << bps[mEnd] << "\t"
	<< std::setprecision(3) << (bps[mEnd]-bps[mStart]+1)*1e-3 << "\t"
	<< mStart << "\t" << mEnd << "\t"
	<< std::setprecision(1) << sumUnmaskedFrac << "\t"
	<< std::setprecision(2) << logBF/log(10) << "\t" << (logBF-logBFindiv)/log(10) << "\t"
	<< (h&1)+1 << "\t" << std::setprecision(1) << meanUsedIBD << endl;
    /*
    sprintf(buf, "%s %d %3s %7.2f %4.1f %9.3f kb %9d %9d %5d %5d %5.1f %7.2f\n",
	    ID.c_str(), (h&1)+1, types[state], logBF/log(10), meanUsedIBD,
	    (bps[mEnd]-bps[mStart]+1)*1e-3, bps[mStart], bps[mEnd], mStart, mEnd, sumUnmaskedFrac,
	    (logBF-logBFindiv)/log(10));
    oss << string(buf);
    */
    //cout << string(buf);
  }

  delete[] logBFsIBD;
  delete[] logBFsIndiv;
  delete[] usedIBD;
  delete[] unmaskedFrac;
  delete[] cumLogP;
  delete[] prev;

  return oss.str();
}

// Opens a binary input file for reading; exits with status 1 if that fails.
static FILE *openBinaryOrExit(const char *fileName) {
  FILE *fin = fopen(fileName, "rb");
  if (fin == NULL) {
    cerr << "ERROR: unable to open " << fileName << endl;
    exit(1);
  }
  return fin;
}

// Reads exactly count items of the given size into buf; exits with status 1 on a short
// read so that truncated input is never silently processed as data.
static void readBinaryOrExit(void *buf, size_t size, size_t count, FILE *fin,
			     const char *fileName) {
  if (fread(buf, size, count, fin) != count) {
    cerr << "ERROR: unexpected end of file or read error in " << fileName << endl;
    exit(1);
  }
}

// call_CNVs entry point: reads per-sample noise scales, the bim file, IBD matches,
// predicted clusters, SNP-array intensities and the imputation mask; computes per-sample
// log Bayes factors; runs the HMM on every haplotype in parallel; writes all calls to the
// output file. Invalid arguments and unreadable/truncated inputs end the program with
// status 1 regardless of whether assertions are compiled in.
int main(int argc, char *argv[]) {

  printVersion();

  cout << "call_CNVs:" << endl;
  cout << "- arg1 = $LRR_STD_SCALE_FILE" << endl;
  cout << "- arg2 = $BIM_FILE" << endl;
  cout << "- arg3 = $LRR_THETA_GENO_FILE" << endl;
  cout << "- arg4 = $PRED_CLUSTER_PREFIX" << endl;
  cout << "- arg5 = $LIKELIHOOD_CLIP_THRESH" << endl;
  cout << "- arg6 = $IBD_FILE" << endl;
  cout << "- arg7 = $IBD_PARAM (<0 for fixed # of neighbors; >=0 for P(IBD>generations)" << endl;
  cout << "- arg8 = $NEARBY_IMP_MASK_FILE" << endl;
  cout << "- arg9 = $THREADS" << endl;
  cout << "- arg10 = output file" << endl;
  cout << endl;

  printCmd(argc, argv);

  if (argc != 11) {
    cout << "ERROR: 10 arguments required" << endl;
    exit(1);
  }

  // note: parsing must not live inside assert(); it would vanish under NDEBUG, and
  // sscanf's EOF (-1) return would count as success
  const char *lrrStdScaleFile = checkInputFileExt(argv, 1, ".txt");
  const char *bimFile = checkInputFileExt(argv, 2, ".bim");
  const char *lrrThetaGenoFile = checkInputFileExt(argv, 3, ".bin");
  const char *predClusterPrefix = argv[4];
  double pClip;
  if (sscanf(argv[5], "%lf", &pClip) != 1) {
    cerr << "ERROR: unable to parse $LIKELIHOOD_CLIP_THRESH: " << argv[5] << endl;
    exit(1);
  }
  const double logBFscale = -127 / log(pClip), logBFscaleInv = 1/logBFscale;
  const bool clipAbsFlag = false;
  const char *ibdFile = checkInputFileExt(argv, 6, ".bin");
  int IBDparam;
  if (sscanf(argv[7], "%d", &IBDparam) != 1) {
    cerr << "ERROR: unable to parse $IBD_PARAM: " << argv[7] << endl;
    exit(1);
  }
  const char *maskFile = checkInputFileExt(argv, 8, ".bin");
  int threads;
  if (sscanf(argv[9], "%d", &threads) != 1) {
    cerr << "ERROR: unable to parse $THREADS: " << argv[9] << endl;
    exit(1);
  }
  const char *outFile = argv[10];

  FileUtils::requireReadable(lrrStdScaleFile);
  FileUtils::requireReadable(bimFile);
  FileUtils::requireReadable(lrrThetaGenoFile);
  FileUtils::requireReadable(predClusterPrefix + string(".batch1.txt.gz"));
  FileUtils::requireReadable(ibdFile);
  FileUtils::requireReadable(maskFile);
  FileUtils::requireWriteable(outFile);

  Timer timer; double t0 = timer.get_time();

  cout << "Probe intensity likelihood clipping parameter: " << pClip << endl;
  if (!(1e-6 <= pClip && pClip <= 0.1)) {
    cerr << "ERROR: $LIKELIHOOD_CLIP_THRESH must be between 1e-6 and 0.1" << endl;
    exit(1);
  }
  cout << "IBD weight parameter: " << IBDparam << endl;
  if (!(IBDparam >= -maxNumLong && IBDparam <= 10000)) {
    cerr << "ERROR: $IBD_PARAM must be between " << -maxNumLong << " and 10000" << endl;
    exit(1);
  }
  cout << "Setting number of threads to " << threads << endl;
  if (!(threads > 0)) {
    cerr << "ERROR: $THREADS must be positive" << endl;
    exit(1);
  }
  cout << endl;
  omp_set_num_threads(threads);

  /***** read lrr std scale file (noise per indiv) *****/
  vector < pair <int, int> > batchMinMaxPerSet;
  vector <int> batch; vector <double> relScale; vector <string> IDpairs;
  int B = assignBatches(batchMinMaxPerSet, batch, relScale, IDpairs, lrrStdScaleFile);
  int Nfam = IDpairs.size();

  /***** read bim file *****/
  cout << "Reading bim file: " << bimFile << endl;
  FileUtils::AutoGzIfstream finBim; finBim.openOrExit(bimFile);
  vector <int> bps; vector <double> cMvec; int c, bp; double genpos; string rsID;
  while (finBim >> c >> rsID >> genpos >> bp) {
    bps.push_back(bp);
    cMvec.push_back(100*genpos);
    string line; getline(finBim, line);
  }
  finBim.close();
  uint64 M = bps.size();
  cout << "Read " << M << " variants from bim file" << endl;

  /***** read IBD data *****/
  FILE *finBin;
  cout << "Reading IBD file: " << ibdFile << endl;
  finBin = openBinaryOrExit(ibdFile);
  int Nphased;
  readBinaryOrExit(&Nphased, sizeof(int), 1, finBin, ibdFile);
  if (Nphased < 0) {
    cerr << "ERROR: negative sample count in " << ibdFile << endl;
    exit(1);
  }
  vector <int> famInds(Nphased);
  if (Nphased > 0)
    readBinaryOrExit(&famInds[0], sizeof(int), Nphased, finBin, ibdFile); // Nphased int: fam inds (header1)
  vector <string> IDs(Nphased);
  for (int ip = 0; ip < Nphased; ip++) {
    // famInds also indexes batch, relScale and genoRow below
    if (famInds[ip] < 0 || famInds[ip] >= Nfam) {
      cerr << "ERROR: sample index " << famInds[ip] << " in " << ibdFile
	   << " is outside the " << Nfam << " samples of " << lrrStdScaleFile << endl;
      exit(1);
    }
    IDs[ip] = IDpairs[famInds[ip]];
  }
  int H = 2*Nphased;
  vector <uint64> blockStarts(H+1); // start of match blocks in binary file (after headers)
  readBinaryOrExit(&blockStarts[0], sizeof(uint64), H+1, finBin, ibdFile); // H+1 uint64: match block starts (header2)
  for (int h = H; h >= 0; h--)
    blockStarts[h] = (blockStarts[h] - blockStarts[0]) / sizeof(Match); // change to array indices
  Match *matchData = new Match[blockStarts[H]]; assert(matchData != NULL);
  readBinaryOrExit(matchData, sizeof(Match), blockStarts[H], finBin, ibdFile);
  fclose(finBin);

  cout << "Read IBD data for " << Nphased << " samples (" << timer.update_time() << " sec)"
       << endl;

  /***** read cluster data *****/
  int chr = 0; // stays 0 if there is no cluster record to take it from
  vector < vector <ClusterBatch> > clusters = readClusterData(chr, predClusterPrefix, M, B);

  /***** read + process lrr theta confgeno info *****/
  cout << "Reading SNP-array data: " << lrrThetaGenoFile << endl;
  finBin = openBinaryOrExit(lrrThetaGenoFile);
  vector <GenoInfo> genoRow(Nfam);
  char *logBFs = new char[Nphased*2LL*M]; assert(logBFs != NULL); // indiv-major (transposed)
  // [0]: scaled log p(lrr|del) - log p(lrr|CN=2)
  // [1]: scaled log p(lrr|dup) - log p(lrr|CN=2)
  for (uint m = 0; m < M; m++) {
    readBinaryOrExit(&genoRow[0], sizeof(GenoInfo), Nfam, finBin, lrrThetaGenoFile);
#pragma omp parallel for
    for (int ip = 0; ip < Nphased; ip++) {
      int i = famInds[ip];
      computeBFs(&logBFs[ip*2LL*M+2*m], genoRow[i], batch[i]-1, 1/relScale[i], clusters[m],
		 clipAbsFlag, logBFscale);
    }
  }
  fclose(finBin);

  cout << "Computed logBFs (time = " << timer.update_time() << " sec)" << endl;

  /***** read maskBits data *****/
  cout << "Reading mask file: " << maskFile << endl;
  finBin = openBinaryOrExit(maskFile);
  uint64 maskBitsULLs = (Nphased*M + 63)>>6;
  uint64 *maskBits = new uint64[maskBitsULLs]; assert(maskBits != NULL);
  readBinaryOrExit(maskBits, sizeof(uint64), maskBitsULLs, finBin, maskFile);
  fclose(finBin);

  cout << "Read maskBits data (time = " << timer.update_time() << " sec)" << endl;

  /***** set up transition matrix *****/
  double (*logPtrans)[3][3] = new double[M][3][3]; assert(logPtrans != NULL);
  computeTransitionProbs(logPtrans, bps, M);

  vector <string> output(H);
#pragma omp parallel for
  for (int h = 0; h < H; h++)
    output[h] = runHMM(logPtrans, bps, cMvec, blockStarts, matchData, logBFs, logBFscaleInv,
		       maskBits, H, M, IBDparam, h, IDs[h/2], chr);

  cout << "Finished HMM computation (time = " << timer.update_time() << " sec)" << endl;
  /*
  cout << "tIBD: " << tIBD << endl;
  cout << "tBF:  " << tBF << endl;
  cout << "tHMM: " << tHMM << endl;
  */
  delete[] matchData;
  delete[] logBFs;
  delete[] maskBits;
  delete[] logPtrans;

  FileUtils::AutoGzOfstream fout; fout.openOrExit(outFile);
  fout << "ID_1\tID_2\tCNV_TYPE\tCHR\tBP_START\tBP_END\tLENGTH_KB\tPROBE_START\tPROBE_END\tPROBES_USED\tLOD\tLOD_HAP_NBRS_ONLY\tHAP\tHAP_NBRS_USED" << endl;
  for (int h = 0; h < H; h++)
    fout << output[h];
  fout.close();

  cout << "Finished call_CNVs; total time = " << timer.get_time()-t0 << " sec" << endl;

  return 0;
}
