1 // Ported from blast2lca 2 // Copyright: 2010 Miguel Pignatelli 3 // License: GPLv2 or later 4 // https://github.com/emepyc/Blast2lca 5 6 #ifndef MMSEQS_NCBITAXONOMY_H 7 #define MMSEQS_NCBITAXONOMY_H 8 9 #include "StringBlock.h" 10 11 #include <map> 12 #include <unordered_map> 13 #include <vector> 14 #include <string> 15 16 typedef int TaxID; 17 18 struct TaxonNode { 19 public: 20 int id; 21 TaxID taxId; 22 TaxID parentTaxId; 23 size_t rankIdx; 24 size_t nameIdx; 25 TaxonNodeTaxonNode26 TaxonNode() {}; 27 TaxonNodeTaxonNode28 TaxonNode(int id, TaxID taxId, TaxID parentTaxId, size_t rankIdx, size_t nameIdx) 29 : id(id), taxId(taxId), parentTaxId(parentTaxId), rankIdx(rankIdx), nameIdx(nameIdx) {}; 30 }; 31 32 const double MAX_TAX_WEIGHT = 1000; 33 struct WeightedTaxHit { 34 WeightedTaxHit(const TaxID taxon, const float evalue, const int weightVoteMode); 35 36 TaxID taxon; 37 double weight; 38 }; 39 40 struct WeightedTaxResult { WeightedTaxResultWeightedTaxResult41 WeightedTaxResult(TaxID taxon, size_t assignedSeqs, size_t unassignedSeqs, size_t seqsAgreeWithSelectedTaxon, double selectedPercent) 42 : taxon(taxon), assignedSeqs(assignedSeqs), unassignedSeqs(unassignedSeqs), seqsAgreeWithSelectedTaxon(seqsAgreeWithSelectedTaxon), selectedPercent(selectedPercent) {}; 43 44 TaxID taxon; 45 size_t assignedSeqs; 46 size_t unassignedSeqs; 47 size_t seqsAgreeWithSelectedTaxon; 48 double selectedPercent; 49 }; 50 51 struct TaxonCounts { 52 unsigned int taxCount; // number of reads/sequences matching to taxa 53 unsigned int cladeCount; // number of reads/sequences matching to taxa or its children 54 std::vector<TaxID> children; // list of children 55 }; 56 57 static const std::map<std::string, int> NcbiRanks = {{ "forma", 1 }, 58 { "varietas", 2 }, 59 { "subspecies", 3 }, 60 { "species", 4 }, 61 { "species subgroup", 5 }, 62 { "species group", 6 }, 63 { "subgenus", 7 }, 64 { "genus", 8 }, 65 { "subtribe", 9 }, 66 { "tribe", 10 }, 67 { "subfamily", 11 }, 68 { "family", 12 }, 69 { "superfamily", 13 }, 70 { "parvorder", 14 }, 71 { "infraorder", 15 }, 72 { "suborder", 16 }, 73 { "order", 17 }, 74 { "superorder", 18 }, 75 { "infraclass", 19 }, 76 { "subclass", 20 }, 77 { "class", 21 }, 78 { "superclass", 22 }, 79 { "subphylum", 23 }, 80 { "phylum", 24 }, 81 { "superphylum", 25 }, 82 { "subkingdom", 26 }, 83 { "kingdom", 27 }, 84 { "superkingdom", 28 }}; 85 86 static const std::map<std::string, char> NcbiShortRanks = {{ "species", 's' }, 87 { "genus", 'g' }, 88 { "family", 'f' }, 89 { "order", 'o' }, 90 { "class", 'c' }, 91 { "phylum", 'p' }, 92 { "kingdom", 'k' }, 93 { "superkingdom", 'd' }}; 94 95 class NcbiTaxonomy { 96 public: 97 static NcbiTaxonomy* openTaxonomy(const std::string &database); 98 NcbiTaxonomy(const std::string &namesFile, const std::string &nodesFile, const std::string &mergedFile); 99 ~NcbiTaxonomy(); 100 101 TaxonNode const * LCA(const std::vector<TaxID>& taxa) const; 102 TaxID LCA(TaxID taxonA, TaxID taxonB) const; 103 std::vector<std::string> AtRanks(TaxonNode const * node, const std::vector<std::string> &levels) const; 104 std::map<std::string, std::string> AllRanks(TaxonNode const *node) const; 105 std::string taxLineage(TaxonNode const *node, bool infoAsName = true); 106 107 static std::vector<std::string> parseRanks(const std::string& ranks); 108 static int findRankIndex(const std::string& rank); 109 static char findShortRank(const std::string& rank); 110 111 bool IsAncestor(TaxID ancestor, TaxID child); 112 TaxonNode const* taxonNode(TaxID taxonId, bool fail = true) const; 113 bool nodeExists(TaxID taxId) const; 114 115 std::unordered_map<TaxID, TaxonCounts> getCladeCounts(std::unordered_map<TaxID, unsigned int>& taxonCounts) const; 116 117 WeightedTaxResult weightedMajorityLCA(const std::vector<WeightedTaxHit> &setTaxa, const float majorityCutoff); 118 119 const char* getString(size_t blockIdx) const; 120 121 static std::pair<char*, size_t> serialize(const NcbiTaxonomy& taxonomy); 122 static NcbiTaxonomy* unserialize(char* data); 123 124 TaxonNode* taxonNodes; 125 size_t maxNodes; 126 private: 127 size_t loadNodes(std::vector<TaxonNode> &tmpNodes, const std::string &nodesFile); 128 size_t loadMerged(const std::string &mergedFile); 129 void loadNames(std::vector<TaxonNode> &tmpNodes, const std::string &namesFile); 130 void elh(std::vector<std::vector<TaxID>> const & children, int node, int level, std::vector<int> &tmpE, std::vector<int> &tmpL); 131 void InitRangeMinimumQuery(); 132 int nodeId(TaxID taxId) const; 133 134 int RangeMinimumQuery(int i, int j) const; 135 int lcaHelper(int i, int j) const; 136 NcbiTaxonomy(TaxonNode * taxonNodes,size_t maxNodes,int maxTaxID,int * D,int * E,int * L,int * H,int ** M,StringBlock<unsigned int> * block)137 NcbiTaxonomy(TaxonNode* taxonNodes, size_t maxNodes, int maxTaxID, int *D, int *E, int *L, int *H, int **M, StringBlock<unsigned int> *block) 138 : taxonNodes(taxonNodes), maxNodes(maxNodes), maxTaxID(maxTaxID), D(D), E(E), L(L), H(H), M(M), block(block), externalData(true), mmapData(NULL), mmapSize(0) {}; 139 int maxTaxID; 140 int *D; // maps from taxID to node ID in taxonNodes 141 int *E; // for Euler tour sequence (size 2N-1) 142 int *L; // Level of nodes in tour sequence (size 2N-1) 143 int *H; 144 int **M; 145 StringBlock<unsigned int>* block; 146 147 bool externalData; 148 char* mmapData; 149 size_t mmapSize; 150 151 static const int SERIALIZATION_VERSION; 152 }; 153 154 #endif 155