1 /**** 2 DIAMOND protein aligner 3 Copyright (C) 2013-2021 Max Planck Society for the Advancement of Science e.V. 4 Benjamin Buchfink 5 Eberhard Karls Universitaet Tuebingen 6 7 Code developed by Benjamin Buchfink <benjamin.buchfink@tue.mpg.de> 8 9 This program is free software: you can redistribute it and/or modify 10 it under the terms of the GNU General Public License as published by 11 the Free Software Foundation, either version 3 of the License, or 12 (at your option) any later version. 13 14 This program is distributed in the hope that it will be useful, 15 but WITHOUT ANY WARRANTY; without even the implied warranty of 16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 GNU General Public License for more details. 18 19 You should have received a copy of the GNU General Public License 20 along with this program. If not, see <http://www.gnu.org/licenses/>. 21 ****/ 22 23 #pragma once 24 #include <assert.h> 25 #include <vector> 26 #include <stddef.h> 27 #include "../basic/sequence.h" 28 #include "../util/algo/binary_search.h" 29 30 template<typename T, char _pchar, size_t _padding = 1lu> 31 struct StringSetBase 32 { 33 34 enum { PERIMETER_PADDING = 256 }; 35 static const char DELIMITER = _pchar; 36 StringSetBaseStringSetBase37 StringSetBase(): 38 data_ (PERIMETER_PADDING, _pchar) 39 { 40 limits_.push_back(PERIMETER_PADDING); 41 } 42 finish_reserveStringSetBase43 void finish_reserve() 44 { 45 data_.resize(raw_len() + PERIMETER_PADDING); 46 std::fill(data_.begin() + raw_len(), data_.end(), _pchar); 47 } 48 reserveStringSetBase49 void reserve(size_t n) 50 { 51 limits_.push_back(raw_len() + n + _padding); 52 } 53 reserveStringSetBase54 void reserve(size_t entries, size_t length) { 55 limits_.reserve(entries + 1); 56 data_.reserve(length + 2 * PERIMETER_PADDING + entries * _padding); 57 } 58 clearStringSetBase59 void clear() { 60 limits_.resize(1); 61 data_.resize(PERIMETER_PADDING); 62 } 63 shrink_to_fitStringSetBase64 void shrink_to_fit() { 65 limits_.shrink_to_fit(); 66 data_.shrink_to_fit(); 67 } 68 69 template<typename _it> push_backStringSetBase70 void push_back(_it begin, _it end) 71 { 72 assert(begin <= end); 73 limits_.push_back(raw_len() + (end - begin) + _padding); 74 data_.insert(data_.end(), begin, end); 75 data_.insert(data_.end(), _padding, _pchar); 76 } 77 78 template<typename It> assignStringSetBase79 void assign(const size_t i, const It begin, const It end) { 80 std::copy(begin, end, ptr(i)); 81 } 82 fillStringSetBase83 void fill(size_t n, T v) 84 { 85 limits_.push_back(raw_len() + n + _padding); 86 data_.insert(data_.end(), n, v); 87 data_.insert(data_.end(), _padding, _pchar); 88 } 89 ptrStringSetBase90 T* ptr(size_t i) 91 { return &data_[limits_[i]]; } 92 ptrStringSetBase93 const T* ptr(size_t i) const 94 { return &data_[limits_[i]]; } 95 endStringSetBase96 const T* end(size_t i) const { 97 return &data_[limits_[i + 1] - _padding]; 98 } 99 check_idxStringSetBase100 size_t check_idx(size_t i) const 101 { 102 if (limits_.size() < i + 2) 103 throw std::runtime_error("Sequence set index out of bounds."); 104 return i; 105 } 106 lengthStringSetBase107 size_t length(size_t i) const 108 { return limits_[i+1] - limits_[i] - _padding; } 109 sizeStringSetBase110 size_t size() const 111 { return limits_.size() - 1; } 112 emptyStringSetBase113 bool empty() const { 114 return limits_.size() <= 1; 115 } 116 raw_lenStringSetBase117 size_t raw_len() const 118 { return limits_.back(); } 119 lettersStringSetBase120 size_t letters() const 121 { return raw_len() - size() - PERIMETER_PADDING; } 122 123 T* data(uint64_t p = 0) 124 { return &data_[p]; } 125 126 const T* data(uint64_t p = 0) const 127 { return &data_[p]; } 128 positionStringSetBase129 size_t position(const T* p) const 130 { return p - data(); } 131 positionStringSetBase132 size_t position(size_t i, size_t j) const 133 { return limits_[i] + j; } 134 local_positionStringSetBase135 std::pair<size_t, size_t> local_position(size_t p) const 136 { 137 size_t i = std::upper_bound(limits_.begin(), limits_.end(), p) - limits_.begin() - 1; 138 return std::pair<size_t, size_t>(i, p - limits_[i]); 139 } 140 141 template<typename It, typename Out, typename Cmp> local_position_batchStringSetBase142 void local_position_batch(It begin, It end, Out out, Cmp cmp) const { 143 batch_binary_search(begin, end, limits_.begin(), limits_.end(), out, cmp); 144 } 145 146 const T* operator[](size_t i) const 147 { 148 return ptr(i); 149 } 150 backStringSetBase151 const T* back() const { 152 return ptr(limits_.size() - 2); 153 } 154 limits_beginStringSetBase155 typename std::vector<size_t>::const_iterator limits_begin() const { 156 return limits_.begin(); 157 } 158 limits_endStringSetBase159 typename std::vector<size_t>::const_iterator limits_end() const { 160 return limits_.end(); 161 } 162 163 struct ConstIterator { 164 ConstIteratorStringSetBase::ConstIterator165 ConstIterator(const T* data, const size_t* limits): 166 data_(data), 167 limits_(limits) 168 {} 169 170 using iterator_category = std::random_access_iterator_tag; 171 using difference_type = ptrdiff_t; 172 using value_type = T; 173 using pointer = T*; 174 using reference = T&; 175 176 ptrdiff_t operator-(const ConstIterator& it) const { 177 return limits_ - it.limits_; 178 } 179 180 bool operator==(const ConstIterator& it) const { 181 return limits_ == it.limits_; 182 } 183 184 ConstIterator operator+(ptrdiff_t d) const { 185 return { data_ + *(limits_ + d) - *limits_, limits_ + d }; 186 } 187 188 bool operator<(const ConstIterator& it) const { 189 return limits_ < it.limits_; 190 } 191 192 ConstIterator& operator+=(ptrdiff_t d) { 193 data_ += *(limits_ + d) - *limits_; 194 limits_ += d; 195 return *this; 196 } 197 198 std::pair<const T*, int64_t> operator[](const ptrdiff_t i) const { 199 return { data_ + limits_[i] - limits_[0], limits_[i + 1] - limits_[i] - _padding }; 200 } 201 202 private: 203 204 const T* data_; 205 const size_t* limits_; 206 207 }; 208 cbeginStringSetBase209 ConstIterator cbegin() const { 210 return ConstIterator(ptr(0), limits_.data()); 211 } 212 cendStringSetBase213 ConstIterator cend() const { 214 return ConstIterator(nullptr, &limits_[size()]); 215 } 216 217 private: 218 219 std::vector<T> data_; 220 std::vector<size_t> limits_; 221 222 }; 223 224 using StringSet = StringSetBase<char, '\0'>;