// g++ -fopenmp -O3 -Wall match_carriers_vcf.cpp -I/n/groups/price/poru/HSPH_SVN/src/EAGLE -I/home/pl88/boost_1_58_0/install/include -Wl,-rpath,/home/pl88/boost_1_58_0/install/lib -o match_carriers_vcf -L/home/pl88/boost_1_58_0/install/lib -lboost_iostreams -lz #include #include #include #include #include #include using namespace std; #include "FileUtils.cpp" #include "StringUtils.cpp" int main(void) { FileUtils::AutoGzIfstream fin; set nonWhiteIDs; string line; int ID; fin.openOrExit("/n/groups/price/UKBiobank/sampleQC/remove.nonWhite.FID_IID.txt"); getline(fin, line); while (fin >> ID) { nonWhiteIDs.insert(ID); fin >> ID; } fin.close(); cout << "Read " << nonWhiteIDs.size() << " non-white IDs to remove" << endl; map IDtoInd; fin.openOrExit("vcf/chrALL.vcf.gz"); while (true) { getline(fin, line); if (line[0]=='#' && line[1]=='C') { istringstream iss(line); int ctr = 0; string token; while (iss >> token) { ctr++; if (ctr >= 10) { int ID1, ID2; sscanf(token.c_str(), "%d_%d", &ID1, &ID2); IDtoInd[ID1] = ctr-10; } } break; } } fin.close(); int Nvcf = IDtoInd.size(); cout << "Read " << Nvcf << " sample IDs from vcf" << endl; vector order(Nvcf); for (int i = 0; i < Nvcf; i++) order[i] = i; vector covar_masks(IDtoInd.size(), -1); fin.openOrExit("ID.sex.birth_year.txt.gz"); getline(fin, line); int sex, birth_year, max_mask = 0, found = 0;; while (fin >> ID >> sex >> birth_year) { if (ID>0 && !nonWhiteIDs.count(ID) && IDtoInd.find(ID) != IDtoInd.end()) { int i = IDtoInd[ID]; covar_masks[i] = 0; covar_masks[i] |= sex<<1; covar_masks[i] |= ((birth_year-1934)/3)<<2; // 1934-1971 if (covar_masks[i] > max_mask) max_mask = covar_masks[i]; found++; } } fin.close(); cout << "Found " << found << " non-masked individuals with covars" << endl; max_mask++; cout << "max_mask: " << max_mask << endl; fin.openOrExit("/n/groups/price/UKBiobank/WES_50K/chr1_v1.SPB.hg19.fam"); found = 0; while (fin >> ID) { if (ID>0 && !nonWhiteIDs.count(ID) && IDtoInd.find(ID) != IDtoInd.end()) { int i = IDtoInd[ID]; covar_masks[i] |= 1; found++; } getline(fin, line); } fin.close(); cout << "Found " << found << " non-masked individuals in WES" << endl; FileUtils::AutoGzOfstream fout; fout.openOrExit("vcf/chrALL.masked.vcf"); fin.openOrExit("vcf/chrALL.vcf.gz"); while (getline(fin, line)) { if (line[0] != '#') { string token; istringstream iss(line); for (int f = 0; f < 9; f++) { iss >> token; if (f==2) printf("%-20s ", token.c_str()); } vector isCarrier(Nvcf); int counts[max_mask+1][2]; int keep_counts[max_mask+1][2]; memset(counts, 0, (max_mask+1)*2*4); int Ncarrier = 0; for (int i = 0; i < Nvcf; i++) { iss >> token; if (token != "0|0" && covar_masks[i]>0) { isCarrier[i] = 1; Ncarrier++; } if (covar_masks[i]>0) counts[covar_masks[i]][isCarrier[i]]++; } int ratio_best = 0, Nkeep_best = 0; double Neff_best = 0; for (int ratio = 1; ratio < 100000; ratio++) { int Nkeep = 0; for (int mask = 0; mask <= max_mask; mask++) Nkeep += min(counts[mask][1], counts[mask][0]/ratio); double Neff = Nkeep * 4 / (1 + 1.0/ratio); if (Neff > Neff_best) { Neff_best = Neff; Nkeep_best = Nkeep; ratio_best = ratio; } } printf("Ncarrier: %4d Nkeep: %4d ratio: %4d Neff: %7.1f\n", Ncarrier, Nkeep_best, ratio_best, Neff_best); /* for (int mask = 0; mask <= max_mask; mask++) cout << counts[mask][0] << " "; cout << endl; */ for (int mask = 0; mask <= max_mask; mask++) { keep_counts[mask][1] = min(counts[mask][1], counts[mask][0]/ratio_best); keep_counts[mask][0] = keep_counts[mask][1]*ratio_best; } random_shuffle(order.begin(), order.end()); int geno_start = line.length()-4*Nvcf+1; for (int j = 0; j < Nvcf; j++) { int i = order[j]; if (covar_masks[i]>0 && keep_counts[covar_masks[i]][isCarrier[i]]) keep_counts[covar_masks[i]][isCarrier[i]]--; else line[geno_start + 4*i] = line[geno_start + 4*i+2] = '.'; } } fout << line << endl; } fin.close(); fout.close(); return 0; }