1 // Ported from blast2lca
2 // https://github.com/emepyc/Blast2lca
3 // Originally licensed under GPLv2 or later
4
5 #include "NcbiTaxonomy.h"
6 #include "FileUtil.h"
7 #include "MathUtil.h"
8 #include "Debug.h"
9 #include "Util.h"
10 #include "sys/mman.h"
11
12 #include <fstream>
13 #include <algorithm>
14 #include <cassert>
15
16 const int NcbiTaxonomy::SERIALIZATION_VERSION = 2;
17
makeMatrix(size_t maxNodes)18 int **makeMatrix(size_t maxNodes) {
19 size_t dimension = maxNodes * 2;
20 int **M = new int*[dimension];
21 int k = (int)(MathUtil::flog2(dimension)) + 1;
22 M[0] = new int[dimension * k]();
23 for(size_t i = 1; i < dimension; i++) {
24 M[i] = M[i-1] + k;
25 }
26
27 return M;
28 }
29
deleteMatrix(int ** M)30 void deleteMatrix(int** M) {
31 delete[] M[0];
32 delete[] M;
33 }
34
NcbiTaxonomy(const std::string & namesFile,const std::string & nodesFile,const std::string & mergedFile)35 NcbiTaxonomy::NcbiTaxonomy(const std::string &namesFile, const std::string &nodesFile, const std::string &mergedFile) : externalData(false) {
36 block = new StringBlock<unsigned int>();
37 std::vector<TaxonNode> tmpNodes;
38 loadNodes(tmpNodes, nodesFile);
39 loadMerged(mergedFile);
40 loadNames(tmpNodes, namesFile);
41
42 maxNodes = tmpNodes.size();
43 taxonNodes = new TaxonNode[maxNodes];
44 std::copy(tmpNodes.begin(), tmpNodes.end(), taxonNodes);
45
46 std::vector<int> tmpE;
47 tmpE.reserve(maxNodes * 2);
48
49 std::vector<int> tmpL;
50 tmpL.reserve(maxNodes * 2);
51
52 H = new int[maxNodes];
53 std::fill(H, H + maxNodes, 0);
54
55 std::vector<std::vector<TaxID>> children(tmpNodes.size());
56 for (std::vector<TaxonNode>::const_iterator it = tmpNodes.begin(); it != tmpNodes.end(); ++it) {
57 if (it->parentTaxId != it->taxId) {
58 children[nodeId(it->parentTaxId)].push_back(it->taxId);
59 }
60 }
61
62 elh(children, 1, 0, tmpE, tmpL);
63 tmpE.resize(maxNodes * 2, 0);
64 tmpL.resize(maxNodes * 2, 0);
65
66 E = new int[maxNodes * 2];
67 std::copy(tmpE.begin(), tmpE.end(), E);
68 L = new int[maxNodes * 2];
69 std::copy(tmpL.begin(), tmpL.end(), L);
70
71 M = makeMatrix(maxNodes);
72 InitRangeMinimumQuery();
73
74 mmapData = NULL;
75 mmapSize = 0;
76 }
77
~NcbiTaxonomy()78 NcbiTaxonomy::~NcbiTaxonomy() {
79 if (externalData) {
80 delete[] M;
81 } else {
82 delete[] taxonNodes;
83 delete[] H;
84 delete[] D;
85 delete[] E;
86 delete[] L;
87 deleteMatrix(M);
88 }
89 delete block;
90 if (mmapData != NULL) {
91 munmap(mmapData, mmapSize);
92 }
93 }
94
splitByDelimiter(const std::string & s,const std::string & delimiter,int maxCol)95 std::vector<std::string> splitByDelimiter(const std::string &s, const std::string &delimiter, int maxCol) {
96 std::vector<std::string> result;
97 size_t prev = 0, pos = 0;
98 int i = 0;
99 do {
100 pos = s.find(delimiter, prev);
101 if (pos == std::string::npos) pos = s.length();
102 result.emplace_back(s.substr(prev, pos - prev));
103 prev = pos + delimiter.length();
104 i++;
105 } while (pos < s.length() && prev < s.length() && i < maxCol);
106
107 return result;
108 }
109
loadNodes(std::vector<TaxonNode> & tmpNodes,const std::string & nodesFile)110 size_t NcbiTaxonomy::loadNodes(std::vector<TaxonNode> &tmpNodes, const std::string &nodesFile) {
111 Debug(Debug::INFO) << "Loading nodes file ...";
112 std::ifstream ss(nodesFile);
113 if (ss.fail()) {
114 Debug(Debug::ERROR) << "File " << nodesFile << " not found!\n";
115 EXIT(EXIT_FAILURE);
116 }
117
118 std::map<TaxID, int> Dm; // temporary map TaxID -> internal ID;
119 maxTaxID = 0;
120 int currentId = 0;
121 std::string line;
122 while (std::getline(ss, line)) {
123 std::vector<std::string> result = splitByDelimiter(line, "\t|\t", 3);
124 TaxID taxId = (TaxID) strtol(result[0].c_str(), NULL, 10);
125 TaxID parentTaxId = (TaxID) strtol(result[1].c_str(), NULL, 10);
126 if (taxId > maxTaxID) {
127 maxTaxID = taxId;
128 }
129 size_t rankIdx = block->append(result[2].c_str(), result[2].size());
130 tmpNodes.emplace_back(currentId, taxId, parentTaxId, rankIdx, (size_t)-1);
131 Dm.emplace(taxId, currentId);
132 ++currentId;
133 }
134
135 D = new int[maxTaxID + 1];
136 std::fill_n(D, maxTaxID + 1, -1);
137 for (std::map<TaxID, int>::iterator it = Dm.begin(); it != Dm.end(); ++it) {
138 assert(it->first <= maxTaxID);
139 D[it->first] = it->second;
140 }
141
142 // Loop over taxonNodes and check all parents exist
143 for (std::vector<TaxonNode>::iterator it = tmpNodes.begin(); it != tmpNodes.end(); ++it) {
144 if (!nodeExists(it->parentTaxId)) {
145 Debug(Debug::ERROR) << "Inconsistent nodes.dmp taxonomy file! Cannot find parent taxon with ID " << it->parentTaxId << "!\n";
146 EXIT(EXIT_FAILURE);
147 }
148 }
149
150 Debug(Debug::INFO) << " Done, got " << tmpNodes.size() << " nodes\n";
151 return tmpNodes.size();
152 }
153
parseName(const std::string & line)154 std::pair<int, std::string> parseName(const std::string &line) {
155 std::vector<std::string> result = splitByDelimiter(line, "\t|\t", 2);
156 if (result.size() != 2) {
157 Debug(Debug::ERROR) << "Invalid name entry!\n";
158 EXIT(EXIT_FAILURE);
159 }
160 return std::make_pair((int)strtol(result[0].c_str(), NULL, 10), result[1]);
161 }
162
loadNames(std::vector<TaxonNode> & tmpNodes,const std::string & namesFile)163 void NcbiTaxonomy::loadNames(std::vector<TaxonNode> &tmpNodes, const std::string &namesFile) {
164 Debug(Debug::INFO) << "Loading names file ...";
165 std::ifstream ss(namesFile);
166 if (ss.fail()) {
167 Debug(Debug::ERROR) << "File " << namesFile << " not found!\n";
168 EXIT(EXIT_FAILURE);
169 }
170
171 std::string line;
172 while (std::getline(ss, line)) {
173 if (line.find("scientific name") == std::string::npos) {
174 continue;
175 }
176
177 std::pair<int, std::string> entry = parseName(line);
178 if (!nodeExists(entry.first)) {
179 Debug(Debug::ERROR) << "loadNames: Taxon " << entry.first << " not present in nodes file!\n";
180 EXIT(EXIT_FAILURE);
181 }
182 tmpNodes[nodeId(entry.first)].nameIdx = block->append(entry.second.c_str(), entry.second.size());
183 }
184 Debug(Debug::INFO) << " Done\n";
185 }
186
187 // Euler traversal of tree
elh(std::vector<std::vector<TaxID>> const & children,TaxID taxId,int level,std::vector<int> & tmpE,std::vector<int> & tmpL)188 void NcbiTaxonomy::elh(std::vector<std::vector<TaxID>> const & children, TaxID taxId, int level, std::vector<int> &tmpE, std::vector<int> &tmpL) {
189 assert (taxId > 0);
190 int id = nodeId(taxId);
191
192 if (H[id] == 0) {
193 H[id] = tmpE.size();
194 }
195
196 tmpE.emplace_back(id);
197 tmpL.emplace_back(level);
198
199 for (std::vector<TaxID>::const_iterator child_it = children[id].begin(); child_it != children[id].end(); ++child_it) {
200 elh(children, *child_it, level + 1, tmpE, tmpL);
201 }
202 tmpE.emplace_back(nodeId(taxonNodes[id].parentTaxId));
203 tmpL.emplace_back(level - 1);
204 }
205
InitRangeMinimumQuery()206 void NcbiTaxonomy::InitRangeMinimumQuery() {
207 Debug(Debug::INFO) << "Init RMQ ...";
208
209 for (unsigned int i = 0; i < (maxNodes * 2); ++i) {
210 M[i][0] = i;
211 }
212
213 for (unsigned int j = 1; (1ul << j) <= (maxNodes * 2); ++j) {
214 for (unsigned int i = 0; (i + (1ul << j) - 1) < (maxNodes * 2); ++i) {
215 int A = M[i][j - 1];
216 int B = M[i + (1ul << (j - 1))][j - 1];
217 if (L[A] < L[B]) {
218 M[i][j] = A;
219 } else {
220 M[i][j] = B;
221 }
222 }
223 }
224 Debug(Debug::INFO) << "Done\n";
225 }
226
RangeMinimumQuery(int i,int j) const227 int NcbiTaxonomy::RangeMinimumQuery(int i, int j) const {
228 assert(j >= i);
229 int k = (int)MathUtil::flog2(j - i + 1);
230 int A = M[i][k];
231 int B = M[j - MathUtil::ipow<int>(2, k) + 1][k];
232 if (L[A] <= L[B]) {
233 return A;
234 }
235 return B;
236 }
237
lcaHelper(int i,int j) const238 int NcbiTaxonomy::lcaHelper(int i, int j) const {
239 if (i == 0 || j == 0) {
240 return 0;
241 }
242 assert(i > 0);
243 assert(j > 0);
244 if (i == j) {
245 return i;
246 }
247 int v1 = H[i];
248 int v2 = H[j];
249 if (v1 > v2) {
250 int tmp = v1;
251 v1 = v2;
252 v2 = tmp;
253 }
254 int rmq = RangeMinimumQuery(v1, v2);
255 assert(E[rmq] >= 0);
256 return E[rmq];
257 }
258
IsAncestor(TaxID ancestor,TaxID child)259 bool NcbiTaxonomy::IsAncestor(TaxID ancestor, TaxID child) {
260 if (ancestor == child) {
261 return true;
262 }
263
264 if (ancestor == 0 || child == 0) {
265 return false;
266 }
267
268 if (!nodeExists(child)) {
269 return false;
270 }
271
272 if (!nodeExists(ancestor)) {
273 return false;
274 }
275
276 return lcaHelper(nodeId(child), nodeId(ancestor)) == nodeId(ancestor);
277 }
278
279
LCA(TaxID taxonA,TaxID taxonB) const280 TaxID NcbiTaxonomy::LCA(TaxID taxonA, TaxID taxonB) const {
281 if (!nodeExists(taxonA)) {
282 return taxonB;
283 } else if (!nodeExists(taxonB)) {
284 return taxonA;
285 }
286 return taxonNodes[lcaHelper(nodeId(taxonA), nodeId(taxonB))].taxId;
287 }
288
289
LCA(const std::vector<TaxID> & taxa) const290 TaxonNode const * NcbiTaxonomy::LCA(const std::vector<TaxID>& taxa) const {
291 std::vector<int>::const_iterator it = taxa.begin();
292 while (it != taxa.end() && !nodeExists(*it)) {
293 Debug(Debug::WARNING) << "No node for taxID " << *it << ", ignoring it.\n";
294 ++it;
295 }
296 if (it == taxa.end()) { return NULL; }
297 int red = nodeId(*it++);
298 for (; it != taxa.end(); ++it) {
299 if (nodeExists(*it)) {
300 red = lcaHelper(red, nodeId(*it));
301 } else {
302 Debug(Debug::WARNING) << "No node for taxID " << *it << ", ignoring it.\n";
303 }
304 }
305
306 assert(red >= 0 && static_cast<unsigned int>(red) < maxNodes);
307
308 return &(taxonNodes[red]);
309 }
310
311
312 // AtRanks returns a slice of slices having the taxons at the specified taxonomic levels
AtRanks(TaxonNode const * node,const std::vector<std::string> & levels) const313 std::vector<std::string> NcbiTaxonomy::AtRanks(TaxonNode const *node, const std::vector<std::string> &levels) const {
314 std::vector<std::string> result;
315 std::map<std::string, std::string> allRanks = AllRanks(node);
316 // map does not include "no rank" nor "no_rank"
317 const char* rank = getString(node->rankIdx);
318 int baseRankIndex = findRankIndex(rank);
319 std::string baseRank = "uc_";
320 baseRank.append(getString(node->nameIdx));
321 for (std::vector<std::string>::const_iterator it = levels.begin(); it != levels.end(); ++it) {
322 std::map<std::string, std::string>::iterator jt = allRanks.find(*it);
323 if (jt != allRanks.end()) {
324 result.emplace_back(jt->second);
325 continue;
326 }
327
328 // If not ... 2 possible causes: i) too low level ("uc_")
329 if (NcbiRanks.at(*it) < baseRankIndex) {
330 result.emplace_back(baseRank);
331 continue;
332 }
333
334 // ii) No taxon for the LCA at the required level -- give the first known upstream
335 result.emplace_back("unknown");
336 }
337 return result;
338 }
339
parseRanks(const std::string & ranks)340 std::vector<std::string> NcbiTaxonomy::parseRanks(const std::string& ranks) {
341 std::vector<std::string> temp = Util::split(ranks, ",");
342 for (size_t i = 0; i < temp.size(); ++i) {
343 if (findRankIndex(temp[i]) == -1) {
344 Debug(Debug::ERROR) << "Invalid taxonomic rank " << temp[i] << "given\n";
345 EXIT(EXIT_FAILURE);
346 }
347 }
348 return temp;
349 }
350
findRankIndex(const std::string & rank)351 int NcbiTaxonomy::findRankIndex(const std::string& rank) {
352 std::map<std::string, int>::const_iterator it;
353 if ((it = NcbiRanks.find(rank)) != NcbiRanks.end()) {
354 return it->second;
355 }
356 return -1;
357 }
358
findShortRank(const std::string & rank)359 char NcbiTaxonomy::findShortRank(const std::string& rank) {
360 std::map<std::string, char>::const_iterator it;
361 if ((it = NcbiShortRanks.find(rank)) != NcbiShortRanks.end()) {
362 return it->second;
363 }
364 return '-';
365 }
366
taxLineage(TaxonNode const * node,bool infoAsName)367 std::string NcbiTaxonomy::taxLineage(TaxonNode const *node, bool infoAsName) {
368 std::vector<TaxonNode const *> taxLineageVec;
369 std::string taxLineage;
370 taxLineage.reserve(4096);
371 do {
372 taxLineageVec.push_back(node);
373 node = taxonNode(node->parentTaxId);
374 } while (node->parentTaxId != node->taxId);
375
376 for (int i = taxLineageVec.size() - 1; i >= 0; --i) {
377 if (infoAsName) {
378 taxLineage += findShortRank(getString(taxLineageVec[i]->rankIdx));
379 taxLineage += '_';
380 taxLineage += getString(taxLineageVec[i]->nameIdx);
381 } else {
382 taxLineage += SSTR(taxLineageVec[i]->taxId);
383 }
384
385 if (i > 0) {
386 taxLineage += ";";
387 }
388 }
389 return taxLineage;
390 }
391
nodeId(TaxID taxonId) const392 int NcbiTaxonomy::nodeId(TaxID taxonId) const {
393 if (taxonId < 0 || !nodeExists(taxonId)) {
394 Debug(Debug::ERROR) << "Invalid node " << taxonId << "!\n";
395 EXIT(EXIT_FAILURE);
396 }
397 return D[taxonId];
398 }
399
nodeExists(TaxID taxonId) const400 bool NcbiTaxonomy::nodeExists(TaxID taxonId) const {
401 return taxonId <= maxTaxID && D[taxonId] != -1;
402 }
403
taxonNode(TaxID taxonId,bool fail) const404 TaxonNode const * NcbiTaxonomy::taxonNode(TaxID taxonId, bool fail) const {
405 if (taxonId == 0 || (!fail && !nodeExists(taxonId))) {
406 return NULL;
407 }
408 return &(taxonNodes[nodeId(taxonId)]);
409 }
410
AllRanks(TaxonNode const * node) const411 std::map<std::string, std::string> NcbiTaxonomy::AllRanks(TaxonNode const *node) const {
412 std::map<std::string, std::string> result;
413 while (true) {
414 std::string rank = getString(node->rankIdx);
415 std::string name = getString(node->nameIdx);
416 if (node->taxId == 1) {
417 result.emplace(rank, name);
418 return result;
419 }
420
421 if ((rank != "no_rank") && (rank != "no rank")) {
422 result.emplace(rank, name);
423 }
424
425 node = taxonNode(node->parentTaxId);
426 }
427 }
428
loadMerged(const std::string & mergedFile)429 size_t NcbiTaxonomy::loadMerged(const std::string &mergedFile) {
430 Debug(Debug::INFO) << "Loading merged file ...";
431 std::ifstream ss(mergedFile);
432 if (ss.fail()) {
433 Debug(Debug::ERROR) << "File " << mergedFile << " not found!\n";
434 EXIT(EXIT_FAILURE);
435 }
436
437 std::string line;
438 size_t count = 0;
439 while (std::getline(ss, line)) {
440 std::vector<std::string> result = splitByDelimiter(line, "\t|\t", 2);
441 if (result.size() != 2) {
442 Debug(Debug::ERROR) << "Invalid name entry!\n";
443 EXIT(EXIT_FAILURE);
444 }
445
446 unsigned int oldId = (unsigned int)strtoul(result[0].c_str(), NULL, 10);
447 unsigned int mergedId = (unsigned int)strtoul(result[1].c_str(), NULL, 10);
448 if (!nodeExists(oldId) && nodeExists(mergedId)) {
449 D[oldId] = D[mergedId];
450 ++count;
451 }
452 }
453 Debug(Debug::INFO) << " Done, added " << count << " merged nodes.\n";
454 return count;
455 }
456
getCladeCounts(std::unordered_map<TaxID,unsigned int> & taxonCounts) const457 std::unordered_map<TaxID, TaxonCounts> NcbiTaxonomy::getCladeCounts(std::unordered_map<TaxID, unsigned int>& taxonCounts) const {
458 Debug(Debug::INFO) << "Calculating clade counts ... ";
459 std::unordered_map<TaxID, TaxonCounts> cladeCounts;
460
461 for (std::unordered_map<TaxID, unsigned int>::const_iterator it = taxonCounts.begin(); it != taxonCounts.end(); ++it) {
462 cladeCounts[it->first].taxCount = it->second;
463 cladeCounts[it->first].cladeCount += it->second;
464 if (nodeExists(it->first)) {
465 TaxonNode const* taxon = taxonNode(it->first);
466 while (taxon->parentTaxId != taxon->taxId && nodeExists(taxon->parentTaxId)) {
467 taxon = taxonNode(taxon->parentTaxId);
468 cladeCounts[taxon->taxId].cladeCount += it->second;
469 }
470 }
471 }
472
473 for (size_t i = 0; i < maxNodes; ++i) {
474 TaxonNode& tn = taxonNodes[i];
475 if (tn.parentTaxId != tn.taxId && cladeCounts.count(tn.taxId)) {
476 std::unordered_map<TaxID, TaxonCounts>::iterator itp = cladeCounts.find(tn.parentTaxId);
477 itp->second.children.push_back(tn.taxId);
478 }
479 }
480
481 Debug(Debug::INFO) << " Done\n";
482 return cladeCounts;
483 }
484
openTaxonomy(const std::string & database)485 NcbiTaxonomy * NcbiTaxonomy::openTaxonomy(const std::string &database){
486 std::string binFile = database + "_taxonomy";
487 if (FileUtil::fileExists(binFile.c_str())) {
488 FILE* handle = fopen(binFile.c_str(), "r");
489 struct stat sb;
490 if (fstat(fileno(handle), &sb) < 0) {
491 Debug(Debug::ERROR) << "Failed to fstat file " << binFile << "\n";
492 EXIT(EXIT_FAILURE);
493 }
494 char* data = (char*)mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fileno(handle), 0);
495 if (data == MAP_FAILED){
496 Debug(Debug::ERROR) << "Failed to mmap file " << binFile << " with error " << errno << "\n";
497 EXIT(EXIT_FAILURE);
498 }
499 fclose(handle);
500 NcbiTaxonomy* t = NcbiTaxonomy::unserialize(data);
501 if (t != NULL) {
502 t->mmapData = data;
503 t->mmapSize = sb.st_size;
504 return t;
505 } else {
506 Debug(Debug::WARNING) << "Outdated taxonomy information, please recreate with createtaxdb.\n";
507 }
508 }
509 Debug(Debug::INFO) << "Loading NCBI taxonomy\n";
510 std::string nodesFile = database + "_nodes.dmp";
511 std::string namesFile = database + "_names.dmp";
512 std::string mergedFile = database + "_merged.dmp";
513 if (FileUtil::fileExists(nodesFile.c_str())
514 && FileUtil::fileExists(namesFile.c_str())
515 && FileUtil::fileExists(mergedFile.c_str())) {
516 } else if (FileUtil::fileExists("nodes.dmp")
517 && FileUtil::fileExists("names.dmp")
518 && FileUtil::fileExists("merged.dmp")) {
519 nodesFile = "nodes.dmp";
520 namesFile = "names.dmp";
521 mergedFile = "merged.dmp";
522 } else {
523 Debug(Debug::ERROR) << "names.dmp, nodes.dmp, merged.dmp from NCBI taxdump could not be found!\n";
524 EXIT(EXIT_FAILURE);
525 }
526 return new NcbiTaxonomy(namesFile, nodesFile, mergedFile);
527 }
528
529 const TaxID ROOT_TAXID = 1;
530 const int ROOT_RANK = INT_MAX;
531
532 struct TaxNode {
TaxNodeTaxNode533 TaxNode(const double weight, const bool isCandidate, const TaxID childTaxon)
534 : weight(weight), isCandidate(isCandidate), childTaxon(childTaxon) {}
535
updateTaxNode536 void update(const double weightToAdd, const TaxID & childTaxonInput) {
537 if (childTaxon != childTaxonInput) {
538 isCandidate = true;
539 childTaxon = childTaxonInput;
540 }
541 weight += weightToAdd;
542 }
543
544 double weight;
545 bool isCandidate;
546 TaxID childTaxon;
547 };
548
getString(size_t blockIdx) const549 const char* NcbiTaxonomy::getString(size_t blockIdx) const {
550 return block->getString(blockIdx);
551 }
552
WeightedTaxHit(const TaxID taxon,const float evalue,const int weightVoteMode)553 WeightedTaxHit::WeightedTaxHit(const TaxID taxon, const float evalue, const int weightVoteMode) : taxon(taxon) {
554 switch (weightVoteMode) {
555 case Parameters::AGG_TAX_UNIFORM:
556 weight = 1.0;
557 break;
558 case Parameters::AGG_TAX_MINUS_LOG_EVAL:
559 weight = evalue;
560 if (evalue != FLT_MAX) {
561 if (evalue > 0) {
562 weight = -log(evalue);
563 } else {
564 weight = MAX_TAX_WEIGHT;
565 }
566 }
567 break;
568 case Parameters::AGG_TAX_SCORE:
569 weight = evalue;
570 break;
571 default:
572 Debug(Debug::ERROR) << "Invalid weight vote mode\n";
573 EXIT(EXIT_FAILURE);
574 }
575 }
576
weightedMajorityLCA(const std::vector<WeightedTaxHit> & setTaxa,const float majorityCutoff)577 WeightedTaxResult NcbiTaxonomy::weightedMajorityLCA(const std::vector<WeightedTaxHit> &setTaxa, const float majorityCutoff) {
578 // count num occurences of each ancestor, possibly weighted
579 std::map<TaxID, TaxNode> ancTaxIdsCounts;
580
581 // initialize counters and weights
582 size_t assignedSeqs = 0;
583 size_t unassignedSeqs = 0;
584 double totalAssignedSeqsWeights = 0.0;
585
586 for (size_t i = 0; i < setTaxa.size(); ++i) {
587 TaxID currTaxId = setTaxa[i].taxon;
588 double currWeight = setTaxa[i].weight;
589 // ignore unassigned sequences
590 if (currTaxId == 0) {
591 unassignedSeqs++;
592 continue;
593 }
594 TaxonNode const *node = taxonNode(currTaxId, false);
595 if (node == NULL) {
596 Debug(Debug::ERROR) << "taxonid: " << currTaxId << " does not match a legal taxonomy node.\n";
597 EXIT(EXIT_FAILURE);
598 }
599 totalAssignedSeqsWeights += currWeight;
600 assignedSeqs++;
601
602 // each start of a path due to an orf is a candidate
603 std::map<TaxID, TaxNode>::iterator it;
604 if ((it = ancTaxIdsCounts.find(currTaxId)) != ancTaxIdsCounts.end()) {
605 it->second.update(currWeight, 0);
606 } else {
607 TaxNode current(currWeight, true, 0);
608 ancTaxIdsCounts.emplace(currTaxId, current);
609 }
610
611 // iterate all ancestors up to root (including). add currWeight and candidate status to each
612 TaxID currParentTaxId = node->parentTaxId;
613 while (currParentTaxId != currTaxId) {
614 if ((it = ancTaxIdsCounts.find(currParentTaxId)) != ancTaxIdsCounts.end()) {
615 it->second.update(currWeight, currTaxId);
616 } else {
617 TaxNode parent(currWeight, false, currTaxId);
618 ancTaxIdsCounts.emplace(currParentTaxId, parent);
619 }
620 // move up
621 currTaxId = currParentTaxId;
622 node = taxonNode(currParentTaxId, false);
623 currParentTaxId = node->parentTaxId;
624 }
625 }
626
627 TaxID selctedTaxon = 0;
628 if (totalAssignedSeqsWeights == 0) {
629 return WeightedTaxResult(selctedTaxon, assignedSeqs, unassignedSeqs, 0, 0.0);
630 }
631
632 // select the lowest ancestor that meets the cutoff
633 int minRank = INT_MAX;
634 double selectedPercent = 0;
635 for (std::map<TaxID, TaxNode>::iterator it = ancTaxIdsCounts.begin(); it != ancTaxIdsCounts.end(); it++) {
636 // consider only candidates
637 if (it->second.isCandidate == false) {
638 continue;
639 }
640
641 double currPercent = it->second.weight / totalAssignedSeqsWeights;
642 if (currPercent >= majorityCutoff) {
643 // iterate all ancestors to find lineage min rank (the candidate is a descendant of a node with this rank)
644 TaxID currTaxId = it->first;
645 TaxonNode const *node = taxonNode(currTaxId, false);
646 int currMinRank = ROOT_RANK;
647 TaxID currParentTaxId = node->parentTaxId;
648 while (currParentTaxId != currTaxId) {
649 int currRankInd = NcbiTaxonomy::findRankIndex(getString(node->rankIdx));
650 if ((currRankInd > 0) && (currRankInd < currMinRank)) {
651 currMinRank = currRankInd;
652 // the rank can only go up on the way to the root, so we can break
653 break;
654 }
655 // move up:
656 currTaxId = currParentTaxId;
657 node = taxonNode(currParentTaxId, false);
658 currParentTaxId = node->parentTaxId;
659 }
660
661 if ((currMinRank < minRank) || ((currMinRank == minRank) && (currPercent > selectedPercent))) {
662 selctedTaxon = it->first;
663 minRank = currMinRank;
664 selectedPercent = currPercent;
665 }
666 }
667 }
668
669 // count the number of seqs who have selectedTaxon in their ancestors (agree with selection):
670 if (selctedTaxon == ROOT_TAXID) {
671 // all agree with "root"
672 return WeightedTaxResult(selctedTaxon, assignedSeqs, unassignedSeqs, assignedSeqs, selectedPercent);
673 }
674 if (selctedTaxon == 0) {
675 // nothing informative
676 return WeightedTaxResult(selctedTaxon, assignedSeqs, unassignedSeqs, 0, selectedPercent);
677 }
678 size_t seqsAgreeWithSelectedTaxon = 0;
679 // otherwise, iterate over all seqs
680 for (size_t i = 0; i < setTaxa.size(); ++i) {
681 TaxID currTaxId = setTaxa[i].taxon;
682 // ignore unassigned sequences
683 if (currTaxId == 0) {
684 continue;
685 }
686 TaxonNode const *node = taxonNode(currTaxId, false);
687
688 // iterate all ancestors up to the root
689 TaxID currParentTaxId = node->parentTaxId;
690 while (currParentTaxId != currTaxId) {
691 if (currTaxId == selctedTaxon) {
692 seqsAgreeWithSelectedTaxon++;
693 break;
694 }
695 currTaxId = currParentTaxId;
696 node = taxonNode(currParentTaxId, false);
697 currParentTaxId = node->parentTaxId;
698 }
699 }
700
701 return WeightedTaxResult(selctedTaxon, assignedSeqs, unassignedSeqs, seqsAgreeWithSelectedTaxon, selectedPercent);
702 }
703
serialize(const NcbiTaxonomy & t)704 std::pair<char*, size_t> NcbiTaxonomy::serialize(const NcbiTaxonomy& t) {
705 t.block->compact();
706 size_t matrixDim = (t.maxNodes * 2);
707 size_t matrixK = (int)(MathUtil::flog2(matrixDim)) + 1;
708 size_t matrixSize = matrixDim * matrixK * sizeof(int);
709 size_t blockSize = StringBlock<unsigned int>::memorySize(*t.block);
710 size_t memSize = sizeof(int) // SERIALIZATION_VERSION
711 + sizeof(size_t) // maxNodes
712 + sizeof(int) // maxTaxID
713 + t.maxNodes * sizeof(TaxonNode) // taxonNodes
714 + (t.maxTaxID + 1) * sizeof(int) // D
715 + 2 * (t.maxNodes * 2) * sizeof(int) // E,L
716 + t.maxNodes * sizeof(int) // H
717 + matrixSize // M
718 + blockSize; // block
719
720 char* mem = (char*) malloc(memSize);
721 char* p = mem;
722 memcpy(p, &t.SERIALIZATION_VERSION, sizeof(int));
723 p += sizeof(int);
724 memcpy(p, &t.maxNodes, sizeof(size_t));
725 p += sizeof(size_t);
726 memcpy(p, &t.maxTaxID, sizeof(int));
727 p += sizeof(int);
728 memcpy(p, t.taxonNodes, t.maxNodes * sizeof(TaxonNode));
729 p += t.maxNodes * sizeof(TaxonNode);
730 memcpy(p, t.D, (t.maxTaxID + 1) * sizeof(int));
731 p += (t.maxTaxID + 1) * sizeof(int);
732 memcpy(p, t.E, (t.maxNodes * 2) * sizeof(int));
733 p += (t.maxNodes * 2) * sizeof(int);
734 memcpy(p, t.L, (t.maxNodes * 2) * sizeof(int));
735 p += (t.maxNodes * 2) * sizeof(int);
736 memcpy(p, t.H, t.maxNodes * sizeof(int));
737 p += t.maxNodes * sizeof(int);
738 memcpy(p, t.M[0], matrixSize);
739 p += matrixSize;
740 char* blockData = StringBlock<unsigned int>::serialize(*t.block);
741 memcpy(p, blockData, blockSize);
742 p += blockSize;
743 free(blockData);
744 return std::make_pair(mem, memSize);
745 }
746
unserialize(char * mem)747 NcbiTaxonomy* NcbiTaxonomy::unserialize(char* mem) {
748 const char* p = mem;
749 int version = *((int*)p);
750 p += sizeof(int);
751 if (version != NcbiTaxonomy::SERIALIZATION_VERSION) {
752 return NULL;
753 }
754 size_t maxNodes = *((size_t*)p);
755 p += sizeof(size_t);
756 int maxTaxID = *((int*)p);
757 p += sizeof(int);
758 TaxonNode* taxonNodes = (TaxonNode*)p;
759 p += maxNodes * sizeof(TaxonNode);
760 int* D = (int*)p;
761 p += (maxTaxID + 1) * sizeof(int);
762 int* E = (int*)p;
763 p += (maxNodes * 2) * sizeof(int);
764 int* L = (int*)p;
765 p += (maxNodes * 2) * sizeof(int);
766 int* H = (int*)p;
767 p += maxNodes * sizeof(int);
768 size_t matrixDim = (maxNodes * 2);
769 size_t matrixK = (int)(MathUtil::flog2(matrixDim)) + 1;
770 size_t matrixSize = matrixDim * matrixK * sizeof(int);
771 int** M = new int*[matrixDim];
772 M[0] = (int*)p;
773 for(size_t i = 1; i < matrixDim; i++) {
774 M[i] = M[i-1] + matrixK;
775 }
776 p += matrixSize;
777 StringBlock<unsigned int>* block = StringBlock<unsigned int>::unserialize(p);
778 return new NcbiTaxonomy(taxonNodes, maxNodes, maxTaxID, D, E, L, H, M, block);
779 }
780