1 /****
2 DIAMOND protein aligner
3 Copyright (C) 2013-2021 Max Planck Society for the Advancement of Science e.V.
4                         Benjamin Buchfink
5                         Eberhard Karls Universitaet Tuebingen
6 
7 Code developed by Benjamin Buchfink <benjamin.buchfink@tue.mpg.de>
8 
9 This program is free software: you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation, either version 3 of the License, or
12 (at your option) any later version.
13 
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17 GNU General Public License for more details.
18 
19 You should have received a copy of the GNU General Public License
20 along with this program.  If not, see <http://www.gnu.org/licenses/>.
21 ****/
22 
23 #pragma once
24 #include <assert.h>
25 #include <vector>
26 #include <stddef.h>
27 #include "../basic/sequence.h"
28 #include "../util/algo/binary_search.h"
29 
30 template<typename T, char _pchar, size_t _padding = 1lu>
31 struct StringSetBase
32 {
33 
34 	enum { PERIMETER_PADDING = 256 };
35 	static const char DELIMITER = _pchar;
36 
StringSetBaseStringSetBase37 	StringSetBase():
38 		data_ (PERIMETER_PADDING, _pchar)
39 	{
40 		limits_.push_back(PERIMETER_PADDING);
41 	}
42 
finish_reserveStringSetBase43 	void finish_reserve()
44 	{
45 		data_.resize(raw_len() + PERIMETER_PADDING);
46 		std::fill(data_.begin() + raw_len(), data_.end(), _pchar);
47 	}
48 
reserveStringSetBase49 	void reserve(size_t n)
50 	{
51 		limits_.push_back(raw_len() + n + _padding);
52 	}
53 
reserveStringSetBase54 	void reserve(size_t entries, size_t length) {
55 		limits_.reserve(entries + 1);
56 		data_.reserve(length + 2 * PERIMETER_PADDING + entries * _padding);
57 	}
58 
clearStringSetBase59 	void clear() {
60 		limits_.resize(1);
61 		data_.resize(PERIMETER_PADDING);
62 	}
63 
shrink_to_fitStringSetBase64 	void shrink_to_fit() {
65 		limits_.shrink_to_fit();
66 		data_.shrink_to_fit();
67 	}
68 
69 	template<typename _it>
push_backStringSetBase70 	void push_back(_it begin, _it end)
71 	{
72 		assert(begin <= end);
73 		limits_.push_back(raw_len() + (end - begin) + _padding);
74 		data_.insert(data_.end(), begin, end);
75 		data_.insert(data_.end(), _padding, _pchar);
76 	}
77 
78 	template<typename It>
assignStringSetBase79 	void assign(const size_t i, const It begin, const It end) {
80 		std::copy(begin, end, ptr(i));
81 	}
82 
fillStringSetBase83 	void fill(size_t n, T v)
84 	{
85 		limits_.push_back(raw_len() + n + _padding);
86 		data_.insert(data_.end(), n, v);
87 		data_.insert(data_.end(), _padding, _pchar);
88 	}
89 
ptrStringSetBase90 	T* ptr(size_t i)
91 	{ return &data_[limits_[i]]; }
92 
ptrStringSetBase93 	const T* ptr(size_t i) const
94 	{ return &data_[limits_[i]]; }
95 
endStringSetBase96 	const T* end(size_t i) const {
97 		return &data_[limits_[i + 1] - _padding];
98 	}
99 
check_idxStringSetBase100 	size_t check_idx(size_t i) const
101 	{
102 		if (limits_.size() < i + 2)
103 			throw std::runtime_error("Sequence set index out of bounds.");
104 		return i;
105 	}
106 
lengthStringSetBase107 	size_t length(size_t i) const
108 	{ return limits_[i+1] - limits_[i] - _padding; }
109 
sizeStringSetBase110 	size_t size() const
111 	{ return limits_.size() - 1; }
112 
emptyStringSetBase113 	bool empty() const {
114 		return limits_.size() <= 1;
115 	}
116 
raw_lenStringSetBase117 	size_t raw_len() const
118 	{ return limits_.back(); }
119 
lettersStringSetBase120 	size_t letters() const
121 	{ return raw_len() - size() - PERIMETER_PADDING; }
122 
123 	T* data(uint64_t p = 0)
124 	{ return &data_[p]; }
125 
126 	const T* data(uint64_t p = 0) const
127 	{ return &data_[p]; }
128 
positionStringSetBase129 	size_t position(const T* p) const
130 	{ return p - data(); }
131 
positionStringSetBase132 	size_t position(size_t i, size_t j) const
133 	{ return limits_[i] + j; }
134 
local_positionStringSetBase135 	std::pair<size_t, size_t> local_position(size_t p) const
136 	{
137 		size_t i = std::upper_bound(limits_.begin(), limits_.end(), p) - limits_.begin() - 1;
138 		return std::pair<size_t, size_t>(i, p - limits_[i]);
139 	}
140 
141 	template<typename It, typename Out, typename Cmp>
local_position_batchStringSetBase142 	void local_position_batch(It begin, It end, Out out, Cmp cmp) const {
143 		batch_binary_search(begin, end, limits_.begin(), limits_.end(), out, cmp);
144 	}
145 
146 	const T* operator[](size_t i) const
147 	{
148 		return ptr(i);
149 	}
150 
backStringSetBase151 	const T* back() const {
152 		return ptr(limits_.size() - 2);
153 	}
154 
limits_beginStringSetBase155 	typename std::vector<size_t>::const_iterator limits_begin() const {
156 		return limits_.begin();
157 	}
158 
limits_endStringSetBase159 	typename std::vector<size_t>::const_iterator limits_end() const {
160 		return limits_.end();
161 	}
162 
163 	struct ConstIterator {
164 
ConstIteratorStringSetBase::ConstIterator165 		ConstIterator(const T* data, const size_t* limits):
166 			data_(data),
167 			limits_(limits)
168 		{}
169 
170 		using iterator_category = std::random_access_iterator_tag;
171 		using difference_type = ptrdiff_t;
172 		using value_type = T;
173 		using pointer = T*;
174 		using reference = T&;
175 
176 		ptrdiff_t operator-(const ConstIterator& it) const {
177 			return limits_ - it.limits_;
178 		}
179 
180 		bool operator==(const ConstIterator& it) const {
181 			return limits_ == it.limits_;
182 		}
183 
184 		ConstIterator operator+(ptrdiff_t d) const {
185 			return { data_ + *(limits_ + d) - *limits_, limits_ + d };
186 		}
187 
188 		bool operator<(const ConstIterator& it) const {
189 			return limits_ < it.limits_;
190 		}
191 
192 		ConstIterator& operator+=(ptrdiff_t d) {
193 			data_ += *(limits_ + d) - *limits_;
194 			limits_ += d;
195 			return *this;
196 		}
197 
198 		std::pair<const T*, int64_t> operator[](const ptrdiff_t i) const {
199 			return { data_ + limits_[i] - limits_[0], limits_[i + 1] - limits_[i] - _padding };
200 		}
201 
202 	private:
203 
204 		const T* data_;
205 		const size_t* limits_;
206 
207 	};
208 
cbeginStringSetBase209 	ConstIterator cbegin() const {
210 		return ConstIterator(ptr(0), limits_.data());
211 	}
212 
cendStringSetBase213 	ConstIterator cend() const {
214 		return ConstIterator(nullptr, &limits_[size()]);
215 	}
216 
217 private:
218 
219 	std::vector<T> data_;
220 	std::vector<size_t> limits_;
221 
222 };
223 
224 using StringSet = StringSetBase<char, '\0'>;