// g++ -O3 -Wall computeR2_200K.cpp -I/n/groups/price/poru/HSPH_SVN/src/EAGLE -I/home/pl88/boost_1_58_0/install/include -Wl,-rpath,/home/pl88/boost_1_58_0/install/lib -o computeR2_200K -L/home/pl88/boost_1_58_0/install/lib -lboost_iostreams -lz #include #include #include #include #include #include #include "FileUtils.cpp" #include "NumericUtils.cpp" #include "StatsUtils.cpp" #include "StringUtils.cpp" using namespace std; void processFiles(const string &file1, const string &file2, vector &mafs, vector &r2s) { FileUtils::AutoGzIfstream fin1; fin1.openOrExit(file1); FileUtils::AutoGzIfstream fin2; fin2.openOrExit(file2); string line1, line2; getline(fin1, line1); getline(fin2, line2); assert(line1==line2); int N = -5; for (int i = 0; i < (int) line1.length(); i++) N += line1[i]=='\t'; cerr << "Number of samples: N = " << N << endl; vector x(N), y(N); int M = 0; string chr, snp, cm, pos1, ref1, alt1, pos2, ref2, alt2, token; while (fin1 >> chr >> snp >> cm >> pos1 >> ref1 >> alt1) { bool found2 = false; while (fin2 >> chr >> snp >> cm >> pos2 >> ref2 >> alt2) { if (pos1==pos2 && ref1==ref2 && alt1==alt2) { found2 = true; break; } else getline(fin2, token); } if (!found2) cerr << pos1 << " " << ref1 << " " << alt1 << endl; assert(found2); printf("%s:%s_%s_%s", chr.c_str(), pos1.c_str(), ref1.c_str(), alt1.c_str()); M++; // augment mafs and r2s int nonMissing1 = 0, nonMissing2 = 0, nonMissing = 0; double sumx = 0, sumy = 0; for (int i = 0; i < N; i++) { fin1 >> token; double dose1 = NAN; if (token!="NA") sscanf(token.c_str(), "%lf", &dose1); fin2 >> token; double dose2 = NAN; if (token!="NA") sscanf(token.c_str(), "%lf", &dose2); if (!isnan(dose1)) nonMissing1++; if (!isnan(dose2)) nonMissing2++; if (!isnan(dose1) && !isnan(dose2)) { x[nonMissing] = dose1; y[nonMissing] = dose2; sumx += x[nonMissing]; sumy += y[nonMissing]; nonMissing++; } } double xBar = sumx / (double) nonMissing, yBar = sumy / (double) nonMissing; double xx = 0, xy = 0, yy = 0; for (int i = 0; i < nonMissing; i++) { xx += (x[i] - xBar) * (x[i] - xBar); xy += (x[i] - xBar) * (y[i] - yBar); yy += (y[i] - yBar) * (y[i] - yBar); } double r2 = 0, af1 = 0, maf2 = 0; if (xx == 0) { // SNP is monomorphic ; } else { if (xx * yy == 0) r2 = 0; else { r2 = xy*xy / (xx*yy); r2 = r2 - (1-r2)/(nonMissing-2); } if (yBar/2 < 0.5) { maf2 = yBar/2; af1 = xBar/2; } else { maf2 = 1 - yBar/2; af1 = 1 - xBar/2; } mafs.push_back(maf2); r2s.push_back(r2); } printf("\t%.6f\t%.6f\t%.3f\t%.3f\t%.4f\t%6d", af1, maf2, fabs(af1 - maf2), 1-nonMissing2/(double) N, r2, nonMissing); cout << endl; } cerr << "Number of SNPs: M = " << M << endl; } int main(int argc, char *argv[]) { if (argc != 3) { cerr << "Usage:" << endl; cerr << "- arg 1: traw file 1" << endl; cerr << "- arg 2: traw file 2" << endl; exit(1); } vector mafs, r2s; processFiles(argv[1], argv[2], mafs, r2s); const int B = 13; const double mafBounds[B+1] = {1e-5, 2e-5, 5e-5, 1e-4, 2e-4, 5e-4, 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.500001}; int tooRare = 0; vector mafBinCounts(B); vector sumr2s(B); vector < vector > r2sPerBin(B); for (int m = 0; m < (int) mafs.size(); m++) { if (mafs[m] < mafBounds[0]) tooRare++; else for (int b = 0; b < B; b++) { if (mafBounds[b] <= mafs[m] && mafs[m] < mafBounds[b+1]) { mafBinCounts[b]++; sumr2s[b] += r2s[m]; r2sPerBin[b].push_back(r2s[m]); } } } cerr << "Discarded " << tooRare << " SNPs with MAF < " << mafBounds[0] << endl; cerr << endl; cerr << "Avg accuracy per MAF bin: " << endl; char buf[200]; for (int b = 0; b < B; b++) { sprintf(buf, "%5.2g-%5.2g%%: r2 = %.3f (%.3f) over %5d SNPs\n", 100*mafBounds[b], 100*mafBounds[b+1], sumr2s[b] / mafBinCounts[b], StatsUtils::stdDev(r2sPerBin[b]) / sqrt(mafBinCounts[b]), mafBinCounts[b]); cerr << string(buf); } cerr << endl << endl; /* printf("RAW\tBIN_NUM\tBIN_LO\tBIN_HI\tMAF\tR2\n"); for (int m = 0; m < (int) mafs.size(); m++) for (int b = -1; b < B; b++) { double binLo = (b==-1 ? 0 : mafBounds[b]); double binHi = mafBounds[b+1]; if (binLo <= mafs[m] && mafs[m] < binHi) printf("RAW\t%d\t%.3f\t%.3f\t%.4f\t%.4f\n", b, binLo, binHi, mafs[m], r2s[m]); } */ return 0; }