1 /****
2 DIAMOND protein aligner
3 Copyright (C) 2016-2020 Max Planck Society for the Advancement of Science e.V.
4                         Benjamin Buchfink
5 
6 Code developed by Benjamin Buchfink <benjamin.buchfink@tue.mpg.de>
7 
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
12 
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 GNU General Public License for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with this program.  If not, see <http://www.gnu.org/licenses/>.
20 ****/
21 
22 #pragma once
23 #include <map>
24 #include <vector>
25 #include <set>
26 #include <string>
27 #include "../util/io/serializer.h"
28 #include "../util/io/deserializer.h"
29 
/// Taxonomic rank stored as a single-byte numeric code.
///
/// The unnamed enumeration below assigns every rank a small integer.
/// `count` is one past the largest code and is the length of `names`.
struct Rank {
	/// Constructs the `none` rank.
	Rank() :
		r(none)
	{}
	/// Constructs a rank from its numeric code.
	/// The value is narrowed to `char` without any range validation.
	/// Left non-explicit: the enumerators below convert to `Rank` through
	/// this constructor, so making it explicit could break callers.
	Rank(size_t i) :
		r(static_cast<char>(i))
	{}
	/// Constructs a rank from a textual name. Defined out of line.
	/// NOTE(review): presumably resolved through `rank_map` -- confirm in
	/// the implementation file.
	Rank(const char *s);
	enum {
		count = 45, strain = 34, biotype = 35, clade = 36, forma_specialis = 37, genotype = 38, isolate = 39, morph = 40, pathogroup = 41, serogroup = 42, serotype = 43, subvariety = 44,
		forma = 33, varietas = 32, subspecies = 31, species = 30, species_subgroup = 29, species_group = 28, series = 27, subsection = 26, section = 25, subgenus = 24, genus = 23, subtribe = 22,
		tribe = 21, subfamily = 20, family = 19, superfamily = 18, parvorder = 17, infraorder = 16, suborder = 15, order = 14, superorder = 13, subcohort = 12, cohort = 11, infraclass = 10,
		subclass = 9, class_rank = 8, superclass = 7, subphylum = 6, phylum = 5, superphylum = 4, subkingdom = 3, kingdom = 2, superkingdom = 1, none = 0
	};
	/// Returns the numeric code of this rank.
	operator int() const {
		return static_cast<int>(r);
	}
	/// Writes the entry of `names` selected by this rank's code to `s`.
	///
	/// The rank is taken by const reference so that const objects and
	/// temporaries select this overload. With a non-const reference they
	/// would fall back to `operator int()` and be printed as a bare number.
	/// The code is used as an index without a bounds check, so it must lie
	/// in `[0, count)`.
	friend std::ostream& operator<<(std::ostream &s, const Rank &r) {
		s << names[static_cast<int>(r.r)];
		return s;
	}
	/// Name table indexed by rank code; defined out of line.
	static const char* names[count];
private:
	/// Numeric rank code (one of the enumerators above when valid).
	char r;
	/// Lookup table keyed by string; defined out of line, built by `init_map`
	/// according to its name -- confirm in the implementation file.
	static const std::map<std::string, Rank> rank_map;
	static std::map<std::string, Rank> init_map();
};
57 
58 struct TaxonomyNodes
59 {
60 
61 	TaxonomyNodes(Deserializer &in, uint32_t db_build);
62 	static void build(Serializer &out);
get_parentTaxonomyNodes63 	unsigned get_parent(unsigned taxid) const
64 	{
65 		if (taxid >= parent_.size())
66 			throw std::runtime_error(std::string("No taxonomy node found for taxon id ") + std::to_string(taxid));
67 		return parent_[taxid];
68 	}
69 	unsigned rank_taxid(unsigned taxid, Rank rank) const;
70 	std::set<unsigned> rank_taxid(const std::vector<unsigned> &taxid, Rank rank) const;
71 	unsigned get_lca(unsigned t1, unsigned t2) const;
72 	bool contained(unsigned query, const std::set<unsigned> &filter);
73 	bool contained(const std::vector<unsigned> query, const std::set<unsigned> &filter);
74 
75 private:
76 
set_cachedTaxonomyNodes77 	void set_cached(unsigned taxon_id, bool contained)
78 	{
79 		cached_[taxon_id] = true;
80 		contained_[taxon_id] = contained;
81 	}
82 
83 	std::vector<uint32_t> parent_;
84 	std::vector<Rank> rank_;
85 	std::vector<bool> cached_, contained_;
86 
87 };
88