1 /**** 2 DIAMOND protein aligner 3 Copyright (C) 2016-2020 Max Planck Society for the Advancement of Science e.V. 4 Benjamin Buchfink 5 6 Code developed by Benjamin Buchfink <benjamin.buchfink@tue.mpg.de> 7 8 This program is free software: you can redistribute it and/or modify 9 it under the terms of the GNU General Public License as published by 10 the Free Software Foundation, either version 3 of the License, or 11 (at your option) any later version. 12 13 This program is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with this program. If not, see <http://www.gnu.org/licenses/>. 20 ****/ 21 22 #pragma once 23 #include <map> 24 #include <vector> 25 #include <set> 26 #include <string> 27 #include "../util/io/serializer.h" 28 #include "../util/io/deserializer.h" 29 30 struct Rank { RankRank31 Rank() : 32 r(none) 33 {} RankRank34 Rank(size_t i) : 35 r((char)i) 36 {} 37 Rank(const char *s); 38 enum { 39 count = 45, strain = 34, biotype = 35, clade = 36, forma_specialis = 37, genotype = 38, isolate = 39, morph = 40, pathogroup = 41, serogroup = 42, serotype = 43, subvariety = 44, 40 forma = 33, varietas = 32, subspecies = 31, species = 30, species_subgroup = 29, species_group = 28, series = 27, subsection = 26, section = 25, subgenus = 24, genus = 23, subtribe = 22, 41 tribe = 21, subfamily = 20, family = 19, superfamily = 18, parvorder = 17, infraorder = 16, suborder = 15, order = 14, superorder = 13, subcohort = 12, cohort = 11, infraclass = 10, 42 subclass = 9, class_rank = 8, superclass = 7, subphylum = 6, phylum = 5, superphylum = 4, subkingdom = 3, kingdom = 2, superkingdom = 1, none = 0 43 }; 44 operator int() const { 45 return (int)r; 46 } 47 friend std::ostream& operator<<(std::ostream &s, Rank &r) { 48 s << std::string(names[(int)r.r]); 49 return s; 50 } 51 static const char* names[count]; 52 private: 53 char r; 54 static const std::map<std::string, Rank> rank_map; 55 static std::map<std::string, Rank> init_map(); 56 }; 57 58 struct TaxonomyNodes 59 { 60 61 TaxonomyNodes(Deserializer &in, uint32_t db_build); 62 static void build(Serializer &out); get_parentTaxonomyNodes63 unsigned get_parent(unsigned taxid) const 64 { 65 if (taxid >= parent_.size()) 66 throw std::runtime_error(std::string("No taxonomy node found for taxon id ") + std::to_string(taxid)); 67 return parent_[taxid]; 68 } 69 unsigned rank_taxid(unsigned taxid, Rank rank) const; 70 std::set<unsigned> rank_taxid(const std::vector<unsigned> &taxid, Rank rank) const; 71 unsigned get_lca(unsigned t1, unsigned t2) const; 72 bool contained(unsigned query, const std::set<unsigned> &filter); 73 bool contained(const std::vector<unsigned> query, const std::set<unsigned> &filter); 74 75 private: 76 set_cachedTaxonomyNodes77 void set_cached(unsigned taxon_id, bool contained) 78 { 79 cached_[taxon_id] = true; 80 contained_[taxon_id] = contained; 81 } 82 83 std::vector<uint32_t> parent_; 84 std::vector<Rank> rank_; 85 std::vector<bool> cached_, contained_; 86 87 }; 88