1 // Ported from blast2lca
2 // https://github.com/emepyc/Blast2lca
3 // Originally licensed under GPLv2 or later
4 
5 #include "NcbiTaxonomy.h"
6 #include "FileUtil.h"
7 #include "MathUtil.h"
8 #include "Debug.h"
9 #include "Util.h"
10 #include "sys/mman.h"
11 
12 #include <fstream>
13 #include <algorithm>
14 #include <cassert>
15 
16 const int NcbiTaxonomy::SERIALIZATION_VERSION = 2;
17 
makeMatrix(size_t maxNodes)18 int **makeMatrix(size_t maxNodes) {
19     size_t dimension = maxNodes * 2;
20     int **M = new int*[dimension];
21     int k = (int)(MathUtil::flog2(dimension)) + 1;
22     M[0] = new int[dimension * k]();
23     for(size_t i = 1; i < dimension; i++) {
24         M[i] = M[i-1] + k;
25     }
26 
27     return M;
28 }
29 
deleteMatrix(int ** M)30 void deleteMatrix(int** M) {
31     delete[] M[0];
32     delete[] M;
33 }
34 
NcbiTaxonomy(const std::string & namesFile,const std::string & nodesFile,const std::string & mergedFile)35 NcbiTaxonomy::NcbiTaxonomy(const std::string &namesFile, const std::string &nodesFile, const std::string &mergedFile) : externalData(false) {
36     block = new StringBlock<unsigned int>();
37     std::vector<TaxonNode> tmpNodes;
38     loadNodes(tmpNodes, nodesFile);
39     loadMerged(mergedFile);
40     loadNames(tmpNodes, namesFile);
41 
42     maxNodes = tmpNodes.size();
43     taxonNodes = new TaxonNode[maxNodes];
44     std::copy(tmpNodes.begin(), tmpNodes.end(), taxonNodes);
45 
46     std::vector<int> tmpE;
47     tmpE.reserve(maxNodes * 2);
48 
49     std::vector<int> tmpL;
50     tmpL.reserve(maxNodes * 2);
51 
52     H = new int[maxNodes];
53     std::fill(H, H + maxNodes, 0);
54 
55     std::vector<std::vector<TaxID>> children(tmpNodes.size());
56     for (std::vector<TaxonNode>::const_iterator it = tmpNodes.begin(); it != tmpNodes.end(); ++it) {
57         if (it->parentTaxId != it->taxId) {
58             children[nodeId(it->parentTaxId)].push_back(it->taxId);
59         }
60     }
61 
62     elh(children, 1, 0, tmpE, tmpL);
63     tmpE.resize(maxNodes * 2, 0);
64     tmpL.resize(maxNodes * 2, 0);
65 
66     E = new int[maxNodes * 2];
67     std::copy(tmpE.begin(), tmpE.end(), E);
68     L = new int[maxNodes * 2];
69     std::copy(tmpL.begin(), tmpL.end(), L);
70 
71     M = makeMatrix(maxNodes);
72     InitRangeMinimumQuery();
73 
74     mmapData = NULL;
75     mmapSize = 0;
76 }
77 
~NcbiTaxonomy()78 NcbiTaxonomy::~NcbiTaxonomy() {
79     if (externalData) {
80         delete[] M;
81     } else {
82         delete[] taxonNodes;
83         delete[] H;
84         delete[] D;
85         delete[] E;
86         delete[] L;
87         deleteMatrix(M);
88     }
89     delete block;
90     if (mmapData != NULL) {
91         munmap(mmapData, mmapSize);
92     }
93 }
94 
splitByDelimiter(const std::string & s,const std::string & delimiter,int maxCol)95 std::vector<std::string> splitByDelimiter(const std::string &s, const std::string &delimiter, int maxCol) {
96     std::vector<std::string> result;
97     size_t prev = 0, pos = 0;
98     int i = 0;
99     do {
100         pos = s.find(delimiter, prev);
101         if (pos == std::string::npos) pos = s.length();
102         result.emplace_back(s.substr(prev, pos - prev));
103         prev = pos + delimiter.length();
104         i++;
105     } while (pos < s.length() && prev < s.length() && i < maxCol);
106 
107     return result;
108 }
109 
loadNodes(std::vector<TaxonNode> & tmpNodes,const std::string & nodesFile)110 size_t NcbiTaxonomy::loadNodes(std::vector<TaxonNode> &tmpNodes, const std::string &nodesFile) {
111     Debug(Debug::INFO) << "Loading nodes file ...";
112     std::ifstream ss(nodesFile);
113     if (ss.fail()) {
114         Debug(Debug::ERROR) << "File " << nodesFile << " not found!\n";
115         EXIT(EXIT_FAILURE);
116     }
117 
118     std::map<TaxID, int> Dm; // temporary map TaxID -> internal ID;
119     maxTaxID = 0;
120     int currentId = 0;
121     std::string line;
122     while (std::getline(ss, line)) {
123         std::vector<std::string> result = splitByDelimiter(line, "\t|\t", 3);
124         TaxID taxId = (TaxID) strtol(result[0].c_str(), NULL, 10);
125         TaxID parentTaxId = (TaxID) strtol(result[1].c_str(), NULL, 10);
126         if (taxId > maxTaxID) {
127             maxTaxID = taxId;
128         }
129         size_t rankIdx = block->append(result[2].c_str(), result[2].size());
130         tmpNodes.emplace_back(currentId, taxId, parentTaxId, rankIdx, (size_t)-1);
131         Dm.emplace(taxId, currentId);
132         ++currentId;
133     }
134 
135     D = new int[maxTaxID + 1];
136     std::fill_n(D, maxTaxID + 1, -1);
137     for (std::map<TaxID, int>::iterator it = Dm.begin(); it != Dm.end(); ++it) {
138         assert(it->first <= maxTaxID);
139         D[it->first] = it->second;
140     }
141 
142     // Loop over taxonNodes and check all parents exist
143     for (std::vector<TaxonNode>::iterator it = tmpNodes.begin(); it != tmpNodes.end(); ++it) {
144         if (!nodeExists(it->parentTaxId)) {
145             Debug(Debug::ERROR) << "Inconsistent nodes.dmp taxonomy file! Cannot find parent taxon with ID " << it->parentTaxId << "!\n";
146             EXIT(EXIT_FAILURE);
147         }
148     }
149 
150     Debug(Debug::INFO) << " Done, got " << tmpNodes.size() << " nodes\n";
151     return tmpNodes.size();
152 }
153 
parseName(const std::string & line)154 std::pair<int, std::string> parseName(const std::string &line) {
155     std::vector<std::string> result = splitByDelimiter(line, "\t|\t", 2);
156     if (result.size() != 2) {
157         Debug(Debug::ERROR) << "Invalid name entry!\n";
158         EXIT(EXIT_FAILURE);
159     }
160     return std::make_pair((int)strtol(result[0].c_str(), NULL, 10), result[1]);
161 }
162 
loadNames(std::vector<TaxonNode> & tmpNodes,const std::string & namesFile)163 void NcbiTaxonomy::loadNames(std::vector<TaxonNode> &tmpNodes, const std::string &namesFile) {
164     Debug(Debug::INFO) << "Loading names file ...";
165     std::ifstream ss(namesFile);
166     if (ss.fail()) {
167         Debug(Debug::ERROR) << "File " << namesFile << " not found!\n";
168         EXIT(EXIT_FAILURE);
169     }
170 
171     std::string line;
172     while (std::getline(ss, line)) {
173         if (line.find("scientific name") == std::string::npos) {
174             continue;
175         }
176 
177         std::pair<int, std::string> entry = parseName(line);
178         if (!nodeExists(entry.first)) {
179             Debug(Debug::ERROR) << "loadNames: Taxon " << entry.first << " not present in nodes file!\n";
180             EXIT(EXIT_FAILURE);
181         }
182         tmpNodes[nodeId(entry.first)].nameIdx = block->append(entry.second.c_str(), entry.second.size());
183     }
184     Debug(Debug::INFO) << " Done\n";
185 }
186 
187 // Euler traversal of tree
elh(std::vector<std::vector<TaxID>> const & children,TaxID taxId,int level,std::vector<int> & tmpE,std::vector<int> & tmpL)188 void NcbiTaxonomy::elh(std::vector<std::vector<TaxID>> const & children, TaxID taxId, int level, std::vector<int> &tmpE, std::vector<int> &tmpL) {
189     assert (taxId > 0);
190     int id = nodeId(taxId);
191 
192     if (H[id] == 0) {
193         H[id] = tmpE.size();
194     }
195 
196     tmpE.emplace_back(id);
197     tmpL.emplace_back(level);
198 
199     for (std::vector<TaxID>::const_iterator child_it = children[id].begin(); child_it != children[id].end(); ++child_it) {
200         elh(children, *child_it, level + 1, tmpE, tmpL);
201     }
202     tmpE.emplace_back(nodeId(taxonNodes[id].parentTaxId));
203     tmpL.emplace_back(level - 1);
204 }
205 
InitRangeMinimumQuery()206 void NcbiTaxonomy::InitRangeMinimumQuery() {
207     Debug(Debug::INFO) << "Init RMQ ...";
208 
209     for (unsigned int i = 0; i < (maxNodes * 2); ++i) {
210         M[i][0] = i;
211     }
212 
213     for (unsigned int j = 1; (1ul << j) <= (maxNodes * 2); ++j) {
214         for (unsigned int i = 0; (i + (1ul << j) - 1) < (maxNodes * 2); ++i) {
215             int A = M[i][j - 1];
216             int B = M[i + (1ul << (j - 1))][j - 1];
217             if (L[A] < L[B]) {
218                 M[i][j] = A;
219             } else {
220                 M[i][j] = B;
221             }
222         }
223     }
224     Debug(Debug::INFO) << "Done\n";
225 }
226 
RangeMinimumQuery(int i,int j) const227 int NcbiTaxonomy::RangeMinimumQuery(int i, int j) const {
228     assert(j >= i);
229     int k = (int)MathUtil::flog2(j - i + 1);
230     int A = M[i][k];
231     int B = M[j - MathUtil::ipow<int>(2, k) + 1][k];
232     if (L[A] <= L[B]) {
233         return A;
234     }
235     return B;
236 }
237 
lcaHelper(int i,int j) const238 int NcbiTaxonomy::lcaHelper(int i, int j) const {
239     if (i == 0 || j == 0) {
240         return 0;
241     }
242     assert(i > 0);
243     assert(j > 0);
244     if (i == j) {
245         return i;
246     }
247     int v1 = H[i];
248     int v2 = H[j];
249     if (v1 > v2) {
250         int tmp = v1;
251         v1 = v2;
252         v2 = tmp;
253     }
254     int rmq = RangeMinimumQuery(v1, v2);
255     assert(E[rmq] >= 0);
256     return E[rmq];
257 }
258 
IsAncestor(TaxID ancestor,TaxID child)259 bool NcbiTaxonomy::IsAncestor(TaxID ancestor, TaxID child) {
260     if (ancestor == child) {
261         return true;
262     }
263 
264     if (ancestor == 0 || child == 0) {
265         return false;
266     }
267 
268     if (!nodeExists(child)) {
269         return false;
270     }
271 
272     if (!nodeExists(ancestor)) {
273         return false;
274     }
275 
276     return lcaHelper(nodeId(child), nodeId(ancestor)) == nodeId(ancestor);
277 }
278 
279 
LCA(TaxID taxonA,TaxID taxonB) const280 TaxID NcbiTaxonomy::LCA(TaxID taxonA, TaxID taxonB) const {
281     if (!nodeExists(taxonA)) {
282         return taxonB;
283     } else if (!nodeExists(taxonB)) {
284         return taxonA;
285     }
286     return taxonNodes[lcaHelper(nodeId(taxonA), nodeId(taxonB))].taxId;
287 }
288 
289 
LCA(const std::vector<TaxID> & taxa) const290 TaxonNode const * NcbiTaxonomy::LCA(const std::vector<TaxID>& taxa) const {
291     std::vector<int>::const_iterator it = taxa.begin();
292     while (it != taxa.end() && !nodeExists(*it)) {
293         Debug(Debug::WARNING) << "No node for taxID " << *it << ", ignoring it.\n";
294         ++it;
295     }
296     if (it == taxa.end()) { return NULL; }
297     int red = nodeId(*it++);
298     for (; it != taxa.end(); ++it) {
299         if (nodeExists(*it)) {
300             red = lcaHelper(red, nodeId(*it));
301         } else {
302             Debug(Debug::WARNING) << "No node for taxID " << *it << ", ignoring it.\n";
303         }
304     }
305 
306     assert(red >= 0 && static_cast<unsigned int>(red) < maxNodes);
307 
308     return &(taxonNodes[red]);
309 }
310 
311 
312 // AtRanks returns a slice of slices having the taxons at the specified taxonomic levels
AtRanks(TaxonNode const * node,const std::vector<std::string> & levels) const313 std::vector<std::string> NcbiTaxonomy::AtRanks(TaxonNode const *node, const std::vector<std::string> &levels) const {
314     std::vector<std::string> result;
315     std::map<std::string, std::string> allRanks = AllRanks(node);
316     // map does not include "no rank" nor "no_rank"
317     const char* rank = getString(node->rankIdx);
318     int baseRankIndex = findRankIndex(rank);
319     std::string baseRank = "uc_";
320     baseRank.append(getString(node->nameIdx));
321     for (std::vector<std::string>::const_iterator it = levels.begin(); it != levels.end(); ++it) {
322         std::map<std::string, std::string>::iterator jt = allRanks.find(*it);
323         if (jt != allRanks.end()) {
324             result.emplace_back(jt->second);
325             continue;
326         }
327 
328         // If not ... 2 possible causes: i) too low level ("uc_")
329         if (NcbiRanks.at(*it) < baseRankIndex) {
330             result.emplace_back(baseRank);
331             continue;
332         }
333 
334         // ii) No taxon for the LCA at the required level -- give the first known upstream
335         result.emplace_back("unknown");
336     }
337     return result;
338 }
339 
parseRanks(const std::string & ranks)340 std::vector<std::string> NcbiTaxonomy::parseRanks(const std::string& ranks) {
341     std::vector<std::string> temp = Util::split(ranks, ",");
342     for (size_t i = 0; i < temp.size(); ++i) {
343         if (findRankIndex(temp[i]) == -1) {
344             Debug(Debug::ERROR) << "Invalid taxonomic rank " << temp[i] << "given\n";
345             EXIT(EXIT_FAILURE);
346         }
347     }
348     return temp;
349 }
350 
findRankIndex(const std::string & rank)351 int NcbiTaxonomy::findRankIndex(const std::string& rank) {
352     std::map<std::string, int>::const_iterator it;
353     if ((it = NcbiRanks.find(rank)) != NcbiRanks.end()) {
354         return it->second;
355     }
356     return -1;
357 }
358 
findShortRank(const std::string & rank)359 char NcbiTaxonomy::findShortRank(const std::string& rank) {
360     std::map<std::string, char>::const_iterator it;
361     if ((it = NcbiShortRanks.find(rank)) != NcbiShortRanks.end()) {
362         return it->second;
363     }
364     return '-';
365 }
366 
taxLineage(TaxonNode const * node,bool infoAsName)367 std::string NcbiTaxonomy::taxLineage(TaxonNode const *node, bool infoAsName) {
368     std::vector<TaxonNode const *> taxLineageVec;
369     std::string taxLineage;
370     taxLineage.reserve(4096);
371     do {
372         taxLineageVec.push_back(node);
373         node = taxonNode(node->parentTaxId);
374     } while (node->parentTaxId != node->taxId);
375 
376     for (int i = taxLineageVec.size() - 1; i >= 0; --i) {
377         if (infoAsName) {
378             taxLineage += findShortRank(getString(taxLineageVec[i]->rankIdx));
379             taxLineage += '_';
380             taxLineage += getString(taxLineageVec[i]->nameIdx);
381         } else {
382             taxLineage += SSTR(taxLineageVec[i]->taxId);
383         }
384 
385         if (i > 0) {
386             taxLineage += ";";
387         }
388     }
389     return taxLineage;
390 }
391 
nodeId(TaxID taxonId) const392 int NcbiTaxonomy::nodeId(TaxID taxonId) const {
393     if (taxonId < 0 || !nodeExists(taxonId)) {
394         Debug(Debug::ERROR) << "Invalid node " << taxonId << "!\n";
395         EXIT(EXIT_FAILURE);
396     }
397     return D[taxonId];
398 }
399 
nodeExists(TaxID taxonId) const400 bool NcbiTaxonomy::nodeExists(TaxID taxonId) const {
401     return taxonId <= maxTaxID && D[taxonId] != -1;
402 }
403 
taxonNode(TaxID taxonId,bool fail) const404 TaxonNode const * NcbiTaxonomy::taxonNode(TaxID taxonId, bool fail) const {
405     if (taxonId == 0 || (!fail && !nodeExists(taxonId))) {
406         return NULL;
407     }
408     return &(taxonNodes[nodeId(taxonId)]);
409 }
410 
AllRanks(TaxonNode const * node) const411 std::map<std::string, std::string> NcbiTaxonomy::AllRanks(TaxonNode const *node) const {
412     std::map<std::string, std::string> result;
413     while (true) {
414         std::string rank = getString(node->rankIdx);
415         std::string name = getString(node->nameIdx);
416         if (node->taxId == 1) {
417             result.emplace(rank, name);
418             return result;
419         }
420 
421         if ((rank != "no_rank") && (rank != "no rank")) {
422             result.emplace(rank, name);
423         }
424 
425         node = taxonNode(node->parentTaxId);
426     }
427 }
428 
loadMerged(const std::string & mergedFile)429 size_t NcbiTaxonomy::loadMerged(const std::string &mergedFile) {
430     Debug(Debug::INFO) << "Loading merged file ...";
431     std::ifstream ss(mergedFile);
432     if (ss.fail()) {
433         Debug(Debug::ERROR) << "File " << mergedFile << " not found!\n";
434         EXIT(EXIT_FAILURE);
435     }
436 
437     std::string line;
438     size_t count = 0;
439     while (std::getline(ss, line)) {
440         std::vector<std::string> result = splitByDelimiter(line, "\t|\t", 2);
441         if (result.size() != 2) {
442             Debug(Debug::ERROR) << "Invalid name entry!\n";
443             EXIT(EXIT_FAILURE);
444         }
445 
446         unsigned int oldId = (unsigned int)strtoul(result[0].c_str(), NULL, 10);
447         unsigned int mergedId = (unsigned int)strtoul(result[1].c_str(), NULL, 10);
448         if (!nodeExists(oldId) && nodeExists(mergedId)) {
449             D[oldId] = D[mergedId];
450             ++count;
451         }
452     }
453     Debug(Debug::INFO) << " Done, added " << count << " merged nodes.\n";
454     return count;
455 }
456 
getCladeCounts(std::unordered_map<TaxID,unsigned int> & taxonCounts) const457 std::unordered_map<TaxID, TaxonCounts> NcbiTaxonomy::getCladeCounts(std::unordered_map<TaxID, unsigned int>& taxonCounts) const {
458     Debug(Debug::INFO) << "Calculating clade counts ... ";
459     std::unordered_map<TaxID, TaxonCounts> cladeCounts;
460 
461     for (std::unordered_map<TaxID, unsigned int>::const_iterator it = taxonCounts.begin(); it != taxonCounts.end(); ++it) {
462         cladeCounts[it->first].taxCount = it->second;
463         cladeCounts[it->first].cladeCount += it->second;
464         if (nodeExists(it->first)) {
465             TaxonNode const* taxon = taxonNode(it->first);
466             while (taxon->parentTaxId != taxon->taxId && nodeExists(taxon->parentTaxId)) {
467                 taxon = taxonNode(taxon->parentTaxId);
468                 cladeCounts[taxon->taxId].cladeCount += it->second;
469             }
470         }
471     }
472 
473     for (size_t i = 0; i < maxNodes; ++i) {
474         TaxonNode& tn = taxonNodes[i];
475         if (tn.parentTaxId != tn.taxId && cladeCounts.count(tn.taxId)) {
476             std::unordered_map<TaxID, TaxonCounts>::iterator itp = cladeCounts.find(tn.parentTaxId);
477             itp->second.children.push_back(tn.taxId);
478         }
479     }
480 
481     Debug(Debug::INFO) << " Done\n";
482     return cladeCounts;
483 }
484 
openTaxonomy(const std::string & database)485 NcbiTaxonomy * NcbiTaxonomy::openTaxonomy(const std::string &database){
486     std::string binFile = database + "_taxonomy";
487     if (FileUtil::fileExists(binFile.c_str())) {
488         FILE* handle = fopen(binFile.c_str(), "r");
489         struct stat sb;
490         if (fstat(fileno(handle), &sb) < 0) {
491             Debug(Debug::ERROR) << "Failed to fstat file " << binFile << "\n";
492             EXIT(EXIT_FAILURE);
493         }
494         char* data = (char*)mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fileno(handle), 0);
495         if (data == MAP_FAILED){
496             Debug(Debug::ERROR) << "Failed to mmap file " << binFile << " with error " << errno << "\n";
497             EXIT(EXIT_FAILURE);
498         }
499         fclose(handle);
500         NcbiTaxonomy* t = NcbiTaxonomy::unserialize(data);
501         if (t != NULL) {
502             t->mmapData = data;
503             t->mmapSize = sb.st_size;
504             return t;
505         } else {
506             Debug(Debug::WARNING) << "Outdated taxonomy information, please recreate with createtaxdb.\n";
507         }
508     }
509     Debug(Debug::INFO) << "Loading NCBI taxonomy\n";
510     std::string nodesFile = database + "_nodes.dmp";
511     std::string namesFile = database + "_names.dmp";
512     std::string mergedFile = database + "_merged.dmp";
513     if (FileUtil::fileExists(nodesFile.c_str())
514         && FileUtil::fileExists(namesFile.c_str())
515         && FileUtil::fileExists(mergedFile.c_str())) {
516     } else if (FileUtil::fileExists("nodes.dmp")
517                && FileUtil::fileExists("names.dmp")
518                && FileUtil::fileExists("merged.dmp")) {
519         nodesFile = "nodes.dmp";
520         namesFile = "names.dmp";
521         mergedFile = "merged.dmp";
522     } else {
523         Debug(Debug::ERROR) << "names.dmp, nodes.dmp, merged.dmp from NCBI taxdump could not be found!\n";
524         EXIT(EXIT_FAILURE);
525     }
526     return new NcbiTaxonomy(namesFile, nodesFile, mergedFile);
527 }
528 
529 const TaxID ROOT_TAXID = 1;
530 const int ROOT_RANK = INT_MAX;
531 
532 struct TaxNode {
TaxNodeTaxNode533     TaxNode(const double weight, const bool isCandidate, const TaxID childTaxon)
534             : weight(weight), isCandidate(isCandidate), childTaxon(childTaxon) {}
535 
updateTaxNode536     void update(const double weightToAdd, const TaxID & childTaxonInput) {
537         if (childTaxon != childTaxonInput) {
538             isCandidate = true;
539             childTaxon = childTaxonInput;
540         }
541         weight += weightToAdd;
542     }
543 
544     double weight;
545     bool isCandidate;
546     TaxID childTaxon;
547 };
548 
getString(size_t blockIdx) const549 const char* NcbiTaxonomy::getString(size_t blockIdx) const {
550     return block->getString(blockIdx);
551 }
552 
WeightedTaxHit(const TaxID taxon,const float evalue,const int weightVoteMode)553 WeightedTaxHit::WeightedTaxHit(const TaxID taxon, const float evalue, const int weightVoteMode) : taxon(taxon) {
554     switch (weightVoteMode) {
555         case Parameters::AGG_TAX_UNIFORM:
556             weight = 1.0;
557             break;
558         case Parameters::AGG_TAX_MINUS_LOG_EVAL:
559             weight = evalue;
560             if (evalue != FLT_MAX) {
561                 if (evalue > 0) {
562                     weight = -log(evalue);
563                 } else {
564                     weight = MAX_TAX_WEIGHT;
565                 }
566             }
567             break;
568         case Parameters::AGG_TAX_SCORE:
569             weight = evalue;
570             break;
571         default:
572             Debug(Debug::ERROR) << "Invalid weight vote mode\n";
573             EXIT(EXIT_FAILURE);
574     }
575 }
576 
weightedMajorityLCA(const std::vector<WeightedTaxHit> & setTaxa,const float majorityCutoff)577 WeightedTaxResult NcbiTaxonomy::weightedMajorityLCA(const std::vector<WeightedTaxHit> &setTaxa, const float majorityCutoff) {
578     // count num occurences of each ancestor, possibly weighted
579     std::map<TaxID, TaxNode> ancTaxIdsCounts;
580 
581     // initialize counters and weights
582     size_t assignedSeqs = 0;
583     size_t unassignedSeqs = 0;
584     double totalAssignedSeqsWeights = 0.0;
585 
586     for (size_t i = 0; i < setTaxa.size(); ++i) {
587         TaxID currTaxId = setTaxa[i].taxon;
588         double currWeight = setTaxa[i].weight;
589         // ignore unassigned sequences
590         if (currTaxId == 0) {
591             unassignedSeqs++;
592             continue;
593         }
594         TaxonNode const *node = taxonNode(currTaxId, false);
595         if (node == NULL) {
596             Debug(Debug::ERROR) << "taxonid: " << currTaxId << " does not match a legal taxonomy node.\n";
597             EXIT(EXIT_FAILURE);
598         }
599         totalAssignedSeqsWeights += currWeight;
600         assignedSeqs++;
601 
602         // each start of a path due to an orf is a candidate
603         std::map<TaxID, TaxNode>::iterator it;
604         if ((it = ancTaxIdsCounts.find(currTaxId)) != ancTaxIdsCounts.end()) {
605             it->second.update(currWeight, 0);
606         } else {
607             TaxNode current(currWeight, true, 0);
608             ancTaxIdsCounts.emplace(currTaxId, current);
609         }
610 
611         // iterate all ancestors up to root (including). add currWeight and candidate status to each
612         TaxID currParentTaxId = node->parentTaxId;
613         while (currParentTaxId != currTaxId) {
614             if ((it = ancTaxIdsCounts.find(currParentTaxId)) != ancTaxIdsCounts.end()) {
615                 it->second.update(currWeight, currTaxId);
616             } else {
617                 TaxNode parent(currWeight, false, currTaxId);
618                 ancTaxIdsCounts.emplace(currParentTaxId, parent);
619             }
620             // move up
621             currTaxId = currParentTaxId;
622             node = taxonNode(currParentTaxId, false);
623             currParentTaxId = node->parentTaxId;
624         }
625     }
626 
627     TaxID selctedTaxon = 0;
628     if (totalAssignedSeqsWeights == 0) {
629         return WeightedTaxResult(selctedTaxon, assignedSeqs, unassignedSeqs, 0, 0.0);
630     }
631 
632     // select the lowest ancestor that meets the cutoff
633     int minRank = INT_MAX;
634     double selectedPercent = 0;
635     for (std::map<TaxID, TaxNode>::iterator it = ancTaxIdsCounts.begin(); it != ancTaxIdsCounts.end(); it++) {
636         // consider only candidates
637         if (it->second.isCandidate == false) {
638             continue;
639         }
640 
641         double currPercent = it->second.weight / totalAssignedSeqsWeights;
642         if (currPercent >= majorityCutoff) {
643             // iterate all ancestors to find lineage min rank (the candidate is a descendant of a node with this rank)
644             TaxID currTaxId = it->first;
645             TaxonNode const *node = taxonNode(currTaxId, false);
646             int currMinRank = ROOT_RANK;
647             TaxID currParentTaxId = node->parentTaxId;
648             while (currParentTaxId != currTaxId) {
649                 int currRankInd = NcbiTaxonomy::findRankIndex(getString(node->rankIdx));
650                 if ((currRankInd > 0) && (currRankInd < currMinRank)) {
651                     currMinRank = currRankInd;
652                     // the rank can only go up on the way to the root, so we can break
653                     break;
654                 }
655                 // move up:
656                 currTaxId = currParentTaxId;
657                 node = taxonNode(currParentTaxId, false);
658                 currParentTaxId = node->parentTaxId;
659             }
660 
661             if ((currMinRank < minRank) || ((currMinRank == minRank) && (currPercent > selectedPercent))) {
662                 selctedTaxon = it->first;
663                 minRank = currMinRank;
664                 selectedPercent = currPercent;
665             }
666         }
667     }
668 
669     // count the number of seqs who have selectedTaxon in their ancestors (agree with selection):
670     if (selctedTaxon == ROOT_TAXID) {
671         // all agree with "root"
672         return WeightedTaxResult(selctedTaxon, assignedSeqs, unassignedSeqs, assignedSeqs, selectedPercent);
673     }
674     if (selctedTaxon == 0) {
675         // nothing informative
676         return WeightedTaxResult(selctedTaxon, assignedSeqs, unassignedSeqs, 0, selectedPercent);
677     }
678     size_t seqsAgreeWithSelectedTaxon = 0;
679     // otherwise, iterate over all seqs
680     for (size_t i = 0; i < setTaxa.size(); ++i) {
681         TaxID currTaxId = setTaxa[i].taxon;
682         // ignore unassigned sequences
683         if (currTaxId == 0) {
684             continue;
685         }
686         TaxonNode const *node = taxonNode(currTaxId, false);
687 
688         // iterate all ancestors up to the root
689         TaxID currParentTaxId = node->parentTaxId;
690         while (currParentTaxId != currTaxId) {
691             if (currTaxId == selctedTaxon) {
692                 seqsAgreeWithSelectedTaxon++;
693                 break;
694             }
695             currTaxId = currParentTaxId;
696             node = taxonNode(currParentTaxId, false);
697             currParentTaxId = node->parentTaxId;
698         }
699     }
700 
701     return WeightedTaxResult(selctedTaxon, assignedSeqs, unassignedSeqs, seqsAgreeWithSelectedTaxon, selectedPercent);
702 }
703 
serialize(const NcbiTaxonomy & t)704 std::pair<char*, size_t> NcbiTaxonomy::serialize(const NcbiTaxonomy& t) {
705     t.block->compact();
706     size_t matrixDim = (t.maxNodes * 2);
707     size_t matrixK = (int)(MathUtil::flog2(matrixDim)) + 1;
708     size_t matrixSize = matrixDim * matrixK * sizeof(int);
709     size_t blockSize = StringBlock<unsigned int>::memorySize(*t.block);
710     size_t memSize = sizeof(int) // SERIALIZATION_VERSION
711         + sizeof(size_t) // maxNodes
712         + sizeof(int) // maxTaxID
713         + t.maxNodes * sizeof(TaxonNode) // taxonNodes
714         + (t.maxTaxID + 1) * sizeof(int) // D
715         + 2 * (t.maxNodes * 2) * sizeof(int) // E,L
716         + t.maxNodes * sizeof(int) // H
717         + matrixSize // M
718         + blockSize; // block
719 
720     char* mem = (char*) malloc(memSize);
721     char* p = mem;
722     memcpy(p, &t.SERIALIZATION_VERSION, sizeof(int));
723     p += sizeof(int);
724     memcpy(p, &t.maxNodes, sizeof(size_t));
725     p += sizeof(size_t);
726     memcpy(p, &t.maxTaxID, sizeof(int));
727     p += sizeof(int);
728     memcpy(p, t.taxonNodes, t.maxNodes * sizeof(TaxonNode));
729     p += t.maxNodes * sizeof(TaxonNode);
730     memcpy(p, t.D, (t.maxTaxID + 1) * sizeof(int));
731     p += (t.maxTaxID + 1) * sizeof(int);
732     memcpy(p, t.E, (t.maxNodes * 2) * sizeof(int));
733     p += (t.maxNodes * 2) * sizeof(int);
734     memcpy(p, t.L, (t.maxNodes * 2) * sizeof(int));
735     p += (t.maxNodes * 2) * sizeof(int);
736     memcpy(p, t.H, t.maxNodes * sizeof(int));
737     p += t.maxNodes * sizeof(int);
738     memcpy(p, t.M[0], matrixSize);
739     p += matrixSize;
740     char* blockData = StringBlock<unsigned int>::serialize(*t.block);
741     memcpy(p, blockData, blockSize);
742     p += blockSize;
743     free(blockData);
744     return std::make_pair(mem, memSize);
745 }
746 
unserialize(char * mem)747 NcbiTaxonomy* NcbiTaxonomy::unserialize(char* mem) {
748     const char* p = mem;
749     int version = *((int*)p);
750     p += sizeof(int);
751     if (version != NcbiTaxonomy::SERIALIZATION_VERSION) {
752         return NULL;
753     }
754     size_t maxNodes = *((size_t*)p);
755     p += sizeof(size_t);
756     int maxTaxID = *((int*)p);
757     p += sizeof(int);
758     TaxonNode* taxonNodes = (TaxonNode*)p;
759     p += maxNodes * sizeof(TaxonNode);
760     int* D = (int*)p;
761     p += (maxTaxID + 1) * sizeof(int);
762     int* E = (int*)p;
763     p += (maxNodes * 2) * sizeof(int);
764     int* L = (int*)p;
765     p += (maxNodes * 2) * sizeof(int);
766     int* H = (int*)p;
767     p += maxNodes * sizeof(int);
768     size_t matrixDim = (maxNodes * 2);
769     size_t matrixK = (int)(MathUtil::flog2(matrixDim)) + 1;
770     size_t matrixSize = matrixDim * matrixK * sizeof(int);
771     int** M = new int*[matrixDim];
772     M[0] = (int*)p;
773     for(size_t i = 1; i < matrixDim; i++) {
774         M[i] = M[i-1] + matrixK;
775     }
776     p += matrixSize;
777     StringBlock<unsigned int>* block = StringBlock<unsigned int>::unserialize(p);
778     return new NcbiTaxonomy(taxonNodes, maxNodes, maxTaxID, D, E, L, H, M, block);
779 }
780