#ifndef ASSIGNBATCHES_HPP
#define ASSIGNBATCHES_HPP

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

#include "FileUtils.hpp"

using namespace std;

// Upper limit on the number of sub-batches that assignBatches() creates for a
// single genotypingSet, regardless of how many samples the set contains.
const int MAX_BATCHES_PER_SET = 10;

// batchMinMaxPerSet: minimum and maximum batch number in each genotypingSet
// batch: fam ind -> sample's batch assignment
// relScale: fam ind -> sample's LRR std dev multiplier (relative to median in batch)
// IDpairs: fam ind -> ID_1 ID_2
// lrrStdScaleFile: previously generated by denoise_LRR
int assignBatches(vector < pair <int, int> > &batchMinMaxPerSet, vector <int> &batch,
		  vector <double> &relScale, vector <string> &IDpairs, const char *lrrStdScaleFile)
{
  const int maxBatchSize = 20000;

  cout << "Reading LRR noise data computed by denoise_lrr: " << lrrStdScaleFile << endl;
  FileUtils::AutoGzIfstream fin; fin.openOrExit(lrrStdScaleFile);
  cout << "Assigning samples to batches based on genotypingSet and LRR noise" << endl;
  string line; getline(fin, line);
  assert(line == "ID_1\tID_2\tgenotypingSet\tstd_scale\tnum_snps");
  map < string, vector < pair <double, int> > > setNameToScaleInds;
  string ID_1, ID_2, genotypingSet; double stdScale; int numSNPs;
  int N = 0, ctrBatch = 0;
  while (fin >> ID_1 >> ID_2 >> genotypingSet >> stdScale >> numSNPs) {
    IDpairs.push_back(ID_1 + "\t" + ID_2);
    setNameToScaleInds[genotypingSet].push_back(make_pair(stdScale, N++));
  }
  fin.close();
  cout << "Read data for " << N << " samples" << endl;

  batch.resize(N); relScale.resize(N);

  for (map < string, vector < pair <double, int> > >::iterator it = setNameToScaleInds.begin();
       it != setNameToScaleInds.end(); it++) {
    vector < pair <double, int> > &scaleInds = it->second;
    int Nset = scaleInds.size();
    cout << "  genotypingSet = " << it->first << " (" << Nset << " samples" << ")" << endl;
    sort(scaleInds.begin(), scaleInds.end());
    // divide genotyping set into sub-batches
    int numBatches = min(MAX_BATCHES_PER_SET, (Nset + maxBatchSize - 1) / maxBatchSize);
    batchMinMaxPerSet.push_back(make_pair(ctrBatch+1, ctrBatch+numBatches));
    for (int batchNum = 1; batchNum <= numBatches; batchNum++) { // batch index within set
      int Nstart = (batchNum-1)*Nset/numBatches;
      int Nend = batchNum*Nset/numBatches;
      ctrBatch++;
      printf("    batch%d: relScale=%.3f-%.3f (%d samples)\n", ctrBatch,
	     scaleInds[Nstart].first, scaleInds[Nend-1].first, Nend-Nstart);
      for (int i = Nstart; i < Nend; i++) {
	int famInd = scaleInds[i].second;
	batch[famInd] = ctrBatch;
	relScale[famInd] = scaleInds[i].first / scaleInds[(Nstart+Nend)/2].first;
      }
    }
  }

  return ctrBatch;
}

#endif
