1 // Ported from blast2lca
2 // Copyright: 2010 Miguel Pignatelli
3 // License: GPLv2 or later
4 // https://github.com/emepyc/Blast2lca
5 
6 #ifndef MMSEQS_NCBITAXONOMY_H
7 #define MMSEQS_NCBITAXONOMY_H
8 
9 #include "StringBlock.h"
10 
11 #include <map>
12 #include <unordered_map>
13 #include <vector>
14 #include <string>
15 
16 typedef int TaxID;
17 
18 struct TaxonNode {
19 public:
20     int id;
21     TaxID taxId;
22     TaxID parentTaxId;
23     size_t rankIdx;
24     size_t nameIdx;
25 
TaxonNodeTaxonNode26     TaxonNode() {};
27 
TaxonNodeTaxonNode28     TaxonNode(int id, TaxID taxId, TaxID parentTaxId, size_t rankIdx, size_t nameIdx)
29             : id(id), taxId(taxId), parentTaxId(parentTaxId), rankIdx(rankIdx), nameIdx(nameIdx) {};
30 };
31 
32 const double MAX_TAX_WEIGHT = 1000;
33 struct WeightedTaxHit {
34     WeightedTaxHit(const TaxID taxon, const float evalue, const int weightVoteMode);
35 
36     TaxID taxon;
37     double weight;
38 };
39 
40 struct WeightedTaxResult {
WeightedTaxResultWeightedTaxResult41     WeightedTaxResult(TaxID taxon, size_t assignedSeqs, size_t unassignedSeqs, size_t seqsAgreeWithSelectedTaxon, double selectedPercent)
42             : taxon(taxon), assignedSeqs(assignedSeqs), unassignedSeqs(unassignedSeqs), seqsAgreeWithSelectedTaxon(seqsAgreeWithSelectedTaxon), selectedPercent(selectedPercent) {};
43 
44     TaxID  taxon;
45     size_t assignedSeqs;
46     size_t unassignedSeqs;
47     size_t seqsAgreeWithSelectedTaxon;
48     double selectedPercent;
49 };
50 
51 struct TaxonCounts {
52     unsigned int taxCount;       // number of reads/sequences matching to taxa
53     unsigned int cladeCount;     // number of reads/sequences matching to taxa or its children
54     std::vector<TaxID> children; // list of children
55 };
56 
57 static const std::map<std::string, int> NcbiRanks = {{ "forma", 1 },
58                                                      { "varietas", 2 },
59                                                      { "subspecies", 3 },
60                                                      { "species", 4 },
61                                                      { "species subgroup", 5 },
62                                                      { "species group", 6 },
63                                                      { "subgenus", 7 },
64                                                      { "genus", 8 },
65                                                      { "subtribe", 9 },
66                                                      { "tribe", 10 },
67                                                      { "subfamily", 11 },
68                                                      { "family", 12 },
69                                                      { "superfamily", 13 },
70                                                      { "parvorder", 14 },
71                                                      { "infraorder", 15 },
72                                                      { "suborder", 16 },
73                                                      { "order", 17 },
74                                                      { "superorder", 18 },
75                                                      { "infraclass", 19 },
76                                                      { "subclass", 20 },
77                                                      { "class", 21 },
78                                                      { "superclass", 22 },
79                                                      { "subphylum", 23 },
80                                                      { "phylum", 24 },
81                                                      { "superphylum", 25 },
82                                                      { "subkingdom", 26 },
83                                                      { "kingdom", 27 },
84                                                      { "superkingdom", 28 }};
85 
86 static const std::map<std::string, char> NcbiShortRanks = {{ "species", 's' },
87                                                            { "genus", 'g' },
88                                                            { "family", 'f' },
89                                                            { "order", 'o' },
90                                                            { "class", 'c' },
91                                                            { "phylum", 'p' },
92                                                            { "kingdom", 'k' },
93                                                            { "superkingdom", 'd' }};
94 
95 class NcbiTaxonomy {
96 public:
97     static NcbiTaxonomy* openTaxonomy(const std::string &database);
98     NcbiTaxonomy(const std::string &namesFile,  const std::string &nodesFile, const std::string &mergedFile);
99     ~NcbiTaxonomy();
100 
101     TaxonNode const * LCA(const std::vector<TaxID>& taxa) const;
102     TaxID LCA(TaxID taxonA, TaxID taxonB) const;
103     std::vector<std::string> AtRanks(TaxonNode const * node, const std::vector<std::string> &levels) const;
104     std::map<std::string, std::string> AllRanks(TaxonNode const *node) const;
105     std::string taxLineage(TaxonNode const *node, bool infoAsName = true);
106 
107     static std::vector<std::string> parseRanks(const std::string& ranks);
108     static int findRankIndex(const std::string& rank);
109     static char findShortRank(const std::string& rank);
110 
111     bool IsAncestor(TaxID ancestor, TaxID child);
112     TaxonNode const* taxonNode(TaxID taxonId, bool fail = true) const;
113     bool nodeExists(TaxID taxId) const;
114 
115     std::unordered_map<TaxID, TaxonCounts> getCladeCounts(std::unordered_map<TaxID, unsigned int>& taxonCounts) const;
116 
117     WeightedTaxResult weightedMajorityLCA(const std::vector<WeightedTaxHit> &setTaxa, const float majorityCutoff);
118 
119     const char* getString(size_t blockIdx) const;
120 
121     static std::pair<char*, size_t> serialize(const NcbiTaxonomy& taxonomy);
122     static NcbiTaxonomy* unserialize(char* data);
123 
124     TaxonNode* taxonNodes;
125     size_t maxNodes;
126 private:
127     size_t loadNodes(std::vector<TaxonNode> &tmpNodes, const std::string &nodesFile);
128     size_t loadMerged(const std::string &mergedFile);
129     void loadNames(std::vector<TaxonNode> &tmpNodes, const std::string &namesFile);
130     void elh(std::vector<std::vector<TaxID>> const & children, int node, int level, std::vector<int> &tmpE, std::vector<int> &tmpL);
131     void InitRangeMinimumQuery();
132     int nodeId(TaxID taxId) const;
133 
134     int RangeMinimumQuery(int i, int j) const;
135     int lcaHelper(int i, int j) const;
136 
NcbiTaxonomy(TaxonNode * taxonNodes,size_t maxNodes,int maxTaxID,int * D,int * E,int * L,int * H,int ** M,StringBlock<unsigned int> * block)137     NcbiTaxonomy(TaxonNode* taxonNodes, size_t maxNodes, int maxTaxID, int *D, int *E, int *L, int *H, int **M, StringBlock<unsigned int> *block)
138         : taxonNodes(taxonNodes), maxNodes(maxNodes), maxTaxID(maxTaxID), D(D), E(E), L(L), H(H), M(M), block(block), externalData(true), mmapData(NULL), mmapSize(0) {};
139     int maxTaxID;
140     int *D; // maps from taxID to node ID in taxonNodes
141     int *E; // for Euler tour sequence (size 2N-1)
142     int *L; // Level of nodes in tour sequence (size 2N-1)
143     int *H;
144     int **M;
145     StringBlock<unsigned int>* block;
146 
147     bool externalData;
148     char* mmapData;
149     size_t mmapSize;
150 
151     static const int SERIALIZATION_VERSION;
152 };
153 
154 #endif
155