1 /**** 2 DIAMOND protein aligner 3 Copyright (C) 2013-2021 Max Planck Society for the Advancement of Science e.V. 4 Benjamin Buchfink 5 Eberhard Karls Universitaet Tuebingen 6 7 Code developed by Benjamin Buchfink <benjamin.buchfink@tue.mpg.de> 8 9 This program is free software: you can redistribute it and/or modify 10 it under the terms of the GNU General Public License as published by 11 the Free Software Foundation, either version 3 of the License, or 12 (at your option) any later version. 13 14 This program is distributed in the hope that it will be useful, 15 but WITHOUT ANY WARRANTY; without even the implied warranty of 16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 GNU General Public License for more details. 18 19 You should have received a copy of the GNU General Public License 20 along with this program. If not, see <http://www.gnu.org/licenses/>. 21 ****/ 22 23 #pragma once 24 #include <stdint.h> 25 #include "../basic/packed_loc.h" 26 #include "../basic/value.h" 27 #include "../util/io/input_file.h" 28 #include "../util/io/serialize.h" 29 30 // #define HIT_KEEP_TARGET_ID 31 32 namespace Search { 33 34 #pragma pack(1) 35 36 struct Hit 37 { 38 using Key = uint32_t; 39 using SeedOffset = uint32_t; 40 41 uint32_t query_; 42 PackedLoc subject_; 43 SeedOffset seed_offset_; 44 uint16_t score_; 45 #ifdef HIT_KEEP_TARGET_ID 46 uint32_t target_block_id; 47 #endif 48 HitHit49 Hit() : 50 query_(), 51 subject_(), 52 seed_offset_() 53 { } 54 Hit(unsigned query, PackedLoc subject, SeedOffset seed_offset, uint16_t score = 0, uint32_t target_block_id = 0) : query_Hit55 query_(query), 56 subject_(subject), 57 seed_offset_(seed_offset), 58 score_(score) 59 #ifdef HIT_KEEP_TARGET_ID 60 ,target_block_id(target_block_id) 61 #endif 62 { } 63 bool operator==(const Hit& h) const { 64 return query_ == h.query_ && subject_ == h.subject_ && seed_offset_ == h.seed_offset_ && score_ == h.score_; 65 } 66 bool operator<(const Hit& rhs) const 67 { 68 return query_ < rhs.query_; 69 } blankHit70 bool blank() const 71 { 72 return uint64_t(subject_) == 0; 73 } 74 unsigned operator%(unsigned i) const 75 { 76 return (query_ / align_mode.query_contexts) % i; 77 } 78 unsigned operator/(size_t i) const 79 { 80 return (query_ / align_mode.query_contexts) / (unsigned)i; 81 } frameHit82 unsigned frame() const { 83 return query_ % align_mode.query_contexts; 84 } global_diagonalHit85 int64_t global_diagonal() const 86 { 87 return (int64_t)subject_ - (int64_t)seed_offset_; 88 } 89 template<unsigned _d> query_idHit90 static unsigned query_id(const Hit& x) 91 { 92 return x.query_ / _d; 93 } 94 template<unsigned _d> 95 struct Query_id 96 { operatorHit::Query_id97 unsigned operator()(const Hit& x) const 98 { 99 return query_id<_d>(x); 100 } 101 }; 102 struct Query { operatorHit::Query103 unsigned operator()(const Hit& h) const { 104 return h.query_; 105 } 106 }; 107 struct SourceQuery { operatorHit::SourceQuery108 int32_t operator()(const Hit& h) const { 109 return h.query_ / contexts; 110 } 111 const int32_t contexts; 112 }; 113 struct Subject { operatorHit::Subject114 uint64_t operator()(const Hit& h) const { 115 return h.subject_; 116 } 117 }; 118 struct CmpSubject { operatorHit::CmpSubject119 bool operator()(const Hit& lhs, const Hit& rhs) const 120 { 121 return lhs.subject_ < rhs.subject_ 122 || (lhs.subject_ == rhs.subject_ && lhs.seed_offset_ < rhs.seed_offset_); 123 } 124 }; 125 struct CmpQueryTarget { operatorHit::CmpQueryTarget126 bool operator()(const Hit& x, const Hit& y) const { 127 return x.query_ < y.query_ || (x.query_ == y.query_ && x.subject_ < y.subject_); 128 } 129 }; 130 struct CmpTargetOffset { operatorHit::CmpTargetOffset131 bool operator()(const Hit& x, size_t s) const { 132 return (uint64_t)x.subject_ < s; 133 } 134 }; cmp_normalized_subjectHit135 static bool cmp_normalized_subject(const Hit &lhs, const Hit &rhs) 136 { 137 const uint64_t x = (uint64_t)lhs.subject_ + (uint64_t)rhs.seed_offset_, y = (uint64_t)rhs.subject_ + (uint64_t)lhs.seed_offset_; 138 return x < y || (x == y && lhs.seed_offset_ < rhs.seed_offset_); 139 } cmp_frameHit140 static bool cmp_frame(const Hit &x, const Hit &y) { 141 return x.frame() < y.frame(); 142 } 143 friend std::ostream& operator<<(std::ostream &s, const Hit &me) 144 { 145 s << me.query_ << '\t' << uint64_t(me.subject_) << '\t' << me.seed_offset_ << '\t' << me.score_ << '\n'; 146 return s; 147 } 148 } PACKED_ATTRIBUTE; 149 150 #pragma pack() 151 152 } 153 154 template<> struct SerializerTraits<Search::Hit> { 155 SerializerTraits(bool long_subject_offsets, int32_t query_contexts): 156 long_subject_offsets(long_subject_offsets), 157 key{ query_contexts } 158 {} 159 const bool long_subject_offsets; 160 const struct Key { 161 int32_t operator()(const Search::Hit& hit) const { 162 return hit.query_ / query_contexts; 163 } 164 const int32_t query_contexts; 165 } key; 166 static Search::Hit make_sentry(uint32_t query, uint32_t seed_offset) { 167 return { query, 0, seed_offset,0 }; 168 } 169 static bool is_sentry(const Search::Hit& hit) { 170 return hit.score_ == 0; 171 } 172 }; 173 174 template<> struct TypeSerializer<Search::Hit> { 175 176 TypeSerializer(TextBuffer& buf, const SerializerTraits<Search::Hit>& traits): 177 traits(traits), 178 buf_(&buf) 179 {} 180 181 TypeSerializer& operator<<(const Search::Hit& hit) { 182 if (SerializerTraits<Search::Hit>::is_sentry(hit)) { 183 buf_->write((uint16_t)0); 184 buf_->write_varint(hit.query_); 185 buf_->write_varint(hit.seed_offset_); 186 return *this; 187 } 188 buf_->write((uint16_t)hit.score_); 189 if (traits.long_subject_offsets) 190 buf_->write_raw((const char*)&hit.subject_, 5); 191 else 192 buf_->write(hit.subject_.low); 193 #ifdef HIT_KEEP_TARGET_ID 194 buf_->write(hit.target_block_id); 195 #endif 196 return *this; 197 } 198 199 const SerializerTraits<Search::Hit> traits; 200 201 private: 202 203 TextBuffer* buf_; 204 205 }; 206 207 template<> struct TypeDeserializer<Search::Hit> { 208 209 TypeDeserializer(InputFile& f, const SerializerTraits<Search::Hit>& traits): 210 f_(&f), 211 traits_(traits) 212 { 213 } 214 215 template<typename It> 216 TypeDeserializer<Search::Hit>& operator>>(It& it) { 217 uint16_t x; 218 f_->read(x); 219 220 for (;;) { 221 uint32_t query_id, seed_offset; 222 f_->varint = true; 223 (*f_) >> query_id >> seed_offset; 224 PackedLoc subject_loc; 225 uint32_t x; 226 f_->varint = false; 227 for (;;) { 228 uint16_t score; 229 try { 230 f_->read(score); 231 } 232 catch (EndOfStream&) { 233 return *this; 234 } 235 if (score == 0) 236 break; 237 if (traits_.long_subject_offsets) 238 f_->read(subject_loc); 239 else { 240 f_->read(x); 241 subject_loc = x; 242 } 243 #ifdef HIT_KEEP_TARGET_ID 244 uint32_t target_block_id; 245 f_->read(target_block_id); 246 *it = { query_id, subject_loc, seed_offset, score, target_block_id }; 247 #else 248 *it = { query_id, subject_loc, seed_offset, score }; 249 #endif 250 } 251 } 252 } 253 254 private: 255 256 InputFile* f_; 257 const SerializerTraits<Search::Hit> traits_; 258 259 }; 260