1 /****
2 DIAMOND protein aligner
3 Copyright (C) 2013-2021 Max Planck Society for the Advancement of Science e.V.
4                         Benjamin Buchfink
5                         Eberhard Karls Universitaet Tuebingen
6 
7 Code developed by Benjamin Buchfink <benjamin.buchfink@tue.mpg.de>
8 
9 This program is free software: you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation, either version 3 of the License, or
12 (at your option) any later version.
13 
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17 GNU General Public License for more details.
18 
19 You should have received a copy of the GNU General Public License
20 along with this program.  If not, see <http://www.gnu.org/licenses/>.
21 ****/
22 
23 #pragma once
24 #include <stdint.h>
25 #include "../basic/packed_loc.h"
26 #include "../basic/value.h"
27 #include "../util/io/input_file.h"
28 #include "../util/io/serialize.h"
29 
30 // #define HIT_KEEP_TARGET_ID
31 
32 namespace Search {
33 
34 #pragma pack(1)
35 
36 struct Hit
37 {
38 	using Key = uint32_t;
39 	using SeedOffset = uint32_t;
40 
41 	uint32_t query_;
42 	PackedLoc subject_;
43 	SeedOffset seed_offset_;
44 	uint16_t score_;
45 #ifdef HIT_KEEP_TARGET_ID
46 	uint32_t target_block_id;
47 #endif
48 
HitHit49 	Hit() :
50 		query_(),
51 		subject_(),
52 		seed_offset_()
53 	{ }
54 	Hit(unsigned query, PackedLoc subject, SeedOffset seed_offset, uint16_t score = 0, uint32_t target_block_id = 0) :
query_Hit55 		query_(query),
56 		subject_(subject),
57 		seed_offset_(seed_offset),
58 		score_(score)
59 #ifdef HIT_KEEP_TARGET_ID
60 		,target_block_id(target_block_id)
61 #endif
62 	{ }
63 	bool operator==(const Hit& h) const {
64 		return query_ == h.query_ && subject_ == h.subject_ && seed_offset_ == h.seed_offset_ && score_ == h.score_;
65 	}
66 	bool operator<(const Hit& rhs) const
67 	{
68 		return query_ < rhs.query_;
69 	}
blankHit70 	bool blank() const
71 	{
72 		return uint64_t(subject_) == 0;
73 	}
74 	unsigned operator%(unsigned i) const
75 	{
76 		return (query_ / align_mode.query_contexts) % i;
77 	}
78 	unsigned operator/(size_t i) const
79 	{
80 		return (query_ / align_mode.query_contexts) / (unsigned)i;
81 	}
frameHit82 	unsigned frame() const {
83 		return query_ % align_mode.query_contexts;
84 	}
global_diagonalHit85 	int64_t global_diagonal() const
86 	{
87 		return (int64_t)subject_ - (int64_t)seed_offset_;
88 	}
89 	template<unsigned _d>
query_idHit90 	static unsigned query_id(const Hit& x)
91 	{
92 		return x.query_ / _d;
93 	}
94 	template<unsigned _d>
95 	struct Query_id
96 	{
operatorHit::Query_id97 		unsigned operator()(const Hit& x) const
98 		{
99 			return query_id<_d>(x);
100 		}
101 	};
102 	struct Query {
operatorHit::Query103 		unsigned operator()(const Hit& h) const {
104 			return h.query_;
105 		}
106 	};
107 	struct SourceQuery {
operatorHit::SourceQuery108 		int32_t operator()(const Hit& h) const {
109 			return h.query_ / contexts;
110 		}
111 		const int32_t contexts;
112 	};
113 	struct Subject {
operatorHit::Subject114 		uint64_t operator()(const Hit& h) const {
115 			return h.subject_;
116 		}
117 	};
118 	struct CmpSubject {
operatorHit::CmpSubject119 		bool operator()(const Hit& lhs, const Hit& rhs) const
120 		{
121 			return lhs.subject_ < rhs.subject_
122 				|| (lhs.subject_ == rhs.subject_ && lhs.seed_offset_ < rhs.seed_offset_);
123 		}
124 	};
125 	struct CmpQueryTarget {
operatorHit::CmpQueryTarget126 		bool operator()(const Hit& x, const Hit& y) const {
127 			return x.query_ < y.query_ || (x.query_ == y.query_ && x.subject_ < y.subject_);
128 		}
129 	};
130 	struct CmpTargetOffset {
operatorHit::CmpTargetOffset131 		bool operator()(const Hit& x, size_t s) const {
132 			return (uint64_t)x.subject_ < s;
133 		}
134 	};
cmp_normalized_subjectHit135 	static bool cmp_normalized_subject(const Hit &lhs, const Hit &rhs)
136 	{
137 		const uint64_t x = (uint64_t)lhs.subject_ + (uint64_t)rhs.seed_offset_, y = (uint64_t)rhs.subject_ + (uint64_t)lhs.seed_offset_;
138 		return x < y || (x == y && lhs.seed_offset_ < rhs.seed_offset_);
139 	}
cmp_frameHit140 	static bool cmp_frame(const Hit &x, const Hit &y) {
141 		return x.frame() < y.frame();
142 	}
143 	friend std::ostream& operator<<(std::ostream &s, const Hit &me)
144 	{
145 		s << me.query_ << '\t' << uint64_t(me.subject_) << '\t' << me.seed_offset_ << '\t' << me.score_ << '\n';
146 		return s;
147 	}
148 } PACKED_ATTRIBUTE;
149 
150 #pragma pack()
151 
152 }
153 
154 template<> struct SerializerTraits<Search::Hit> {
155 	SerializerTraits(bool long_subject_offsets, int32_t query_contexts):
156 		long_subject_offsets(long_subject_offsets),
157 		key{ query_contexts }
158 	{}
159 	const bool long_subject_offsets;
160 	const struct Key {
161 		int32_t operator()(const Search::Hit& hit) const {
162 			return hit.query_ / query_contexts;
163 		}
164 		const int32_t query_contexts;
165 	} key;
166 	static Search::Hit make_sentry(uint32_t query, uint32_t seed_offset) {
167 		return { query, 0, seed_offset,0 };
168 	}
169 	static bool is_sentry(const Search::Hit& hit) {
170 		return hit.score_ == 0;
171 	}
172 };
173 
174 template<> struct TypeSerializer<Search::Hit> {
175 
176 	TypeSerializer(TextBuffer& buf, const SerializerTraits<Search::Hit>& traits):
177 		traits(traits),
178 		buf_(&buf)
179 	{}
180 
181 	TypeSerializer& operator<<(const Search::Hit& hit) {
182 		if (SerializerTraits<Search::Hit>::is_sentry(hit)) {
183 			buf_->write((uint16_t)0);
184 			buf_->write_varint(hit.query_);
185 			buf_->write_varint(hit.seed_offset_);
186 			return *this;
187 		}
188 		buf_->write((uint16_t)hit.score_);
189 		if (traits.long_subject_offsets)
190 			buf_->write_raw((const char*)&hit.subject_, 5);
191 		else
192 			buf_->write(hit.subject_.low);
193 #ifdef HIT_KEEP_TARGET_ID
194 		buf_->write(hit.target_block_id);
195 #endif
196 		return *this;
197 	}
198 
199 	const SerializerTraits<Search::Hit> traits;
200 
201 private:
202 
203 	TextBuffer* buf_;
204 
205 };
206 
207 template<> struct TypeDeserializer<Search::Hit> {
208 
209 	TypeDeserializer(InputFile& f, const SerializerTraits<Search::Hit>& traits):
210 		f_(&f),
211 		traits_(traits)
212 	{
213 	}
214 
215 	template<typename It>
216 	TypeDeserializer<Search::Hit>& operator>>(It& it) {
217 		uint16_t x;
218 		f_->read(x);
219 
220 		for (;;) {
221 			uint32_t query_id, seed_offset;
222 			f_->varint = true;
223 			(*f_) >> query_id >> seed_offset;
224 			PackedLoc subject_loc;
225 			uint32_t x;
226 			f_->varint = false;
227 			for (;;) {
228 				uint16_t score;
229 				try {
230 					f_->read(score);
231 				}
232 				catch (EndOfStream&) {
233 					return *this;
234 				}
235 				if (score == 0)
236 					break;
237 				if (traits_.long_subject_offsets)
238 					f_->read(subject_loc);
239 				else {
240 					f_->read(x);
241 					subject_loc = x;
242 				}
243 #ifdef HIT_KEEP_TARGET_ID
244 				uint32_t target_block_id;
245 				f_->read(target_block_id);
246 				*it = { query_id, subject_loc, seed_offset, score, target_block_id };
247 #else
248 				*it = { query_id, subject_loc, seed_offset, score };
249 #endif
250 			}
251 		}
252 	}
253 
254 private:
255 
256 	InputFile* f_;
257 	const SerializerTraits<Search::Hit> traits_;
258 
259 };
260