1 // Copyright (c) 2020 Robert Vaser
2
3 #ifndef SIMD_ALIGNMENT_ENGINE_IMPLEMENTATION_HPP_
4 #define SIMD_ALIGNMENT_ENGINE_IMPLEMENTATION_HPP_
5
6 #include "simd_alignment_engine.hpp"
7
8 #include <algorithm>
9 #include <cmath>
10 #include <iostream>
11 #include <limits>
12 #include <memory>
13 #include <stdexcept>
14 #include <vector>
15
16 extern "C" {
17 #ifdef USE_SIMDE
18 #ifdef __AVX2__
19 #include <simde/x86/avx2.h>
20 #else
21 #include <simde/x86/sse4.1.h> // SSE4.1 is covered better
22 #endif
23 #elif defined(__AVX2__) || defined(__SSE4_1__)
24 #include <immintrin.h> // AVX2 and lower
25 #endif
26 }
27
28 #include "spoa/graph.hpp"
29
30 namespace spoa {
31
32 // Taken from https://gcc.gnu.org/viewcvs/gcc?view=revision&revision=216149
// Taken from https://gcc.gnu.org/viewcvs/gcc?view=revision&revision=216149
//
// Bumps ptr forward to the next multiple of alignment (a power of two) if the
// remaining space allows size bytes at that position; returns the aligned
// pointer (also written back to ptr) and shrinks space by the padding used,
// or returns nullptr leaving ptr/space untouched.
inline void* align(
    std::size_t alignment,
    std::size_t size,
    void*& ptr,  // NOLINT
    std::size_t& space) noexcept {  // NOLINT
  const auto address = reinterpret_cast<uintptr_t>(ptr);
  // round up to the next multiple of alignment (power of two)
  const auto aligned_address = (address - 1u + alignment) & -alignment;
  const auto padding = aligned_address - address;
  if (space < size + padding) {
    return nullptr;
  }
  space -= padding;
  ptr = reinterpret_cast<void*>(aligned_address);
  return ptr;
}
48
49 template<Architecture A, typename T>
AllocateAlignedMemory(T ** storage,std::size_t size,std::size_t alignment)50 T* AllocateAlignedMemory(
51 T** storage,
52 std::size_t size,
53 std::size_t alignment) {
54 *storage = new T[size + alignment - 1];
55 void* ptr = static_cast<void*>(*storage);
56 std::size_t storage_size = (size + alignment - 1) * sizeof(T);
57 return static_cast<T*>(align(alignment, size * sizeof(T), ptr, storage_size));
58 }
59
// Architecture/score-type traits; specialized below for std::int16_t and
// std::int32_t under both the AVX2 and the SSE4.1/SIMDe instruction sets.
template<Architecture A, typename T>
struct InstructionSet;

#if defined(__AVX2__)

// 256-bit SIMD registers
constexpr std::uint32_t kRegisterSize = 256;
using __mxxxi = __m256i;

// Width-agnostic wrappers so the alignment kernels can be written once for
// both the 128-bit and the 256-bit register variants.
inline __mxxxi _mmxxx_load_si(__mxxxi const* mem_addr) {
  return _mm256_load_si256(mem_addr);
}

inline void _mmxxx_store_si(__mxxxi* mem_addr, const __mxxxi& a) {
  _mm256_store_si256(mem_addr, a);
}

inline __mxxxi _mmxxx_or_si(const __mxxxi& a, const __mxxxi& b) {
  return _mm256_or_si256(a, b);
}

// Whole-register byte shifts by a compile-time constant n (in bytes). AVX2
// byte shifts operate within 128-bit lanes only, so the lane crossing is
// emulated with _mm256_permute2x128_si256.
#define _mmxxx_slli_si(a, n) n < 16 ? \
    _mm256_alignr_epi8(a, _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 0, 2, 0)), 16 - n) : \
    _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 0, 2, 0))

// NOTE(review): only valid for n >= 16 — all uses below pass kRSS (30 or 28)
#define _mmxxx_srli_si(a, n) \
    _mm256_srli_si256(_mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), n - 16)  // NOLINT
86
// 16 packed 16-bit scores per 256-bit register.
template<Architecture A>
struct InstructionSet<A, std::int16_t> {
  using type = std::int16_t;
  static constexpr std::uint32_t kNumVar = kRegisterSize / (8 * sizeof(type));
  static constexpr std::uint32_t kLogNumVar = 4;  // log2(kNumVar)
  static constexpr std::uint32_t kLSS = 2;  // Left Shift Size (one element, in bytes)
  static constexpr std::uint32_t kRSS = 30;  // Right Shift Size (kNumVar - 1 elements, in bytes)
  static inline __mxxxi _mmxxx_add_epi(const __mxxxi& a, const __mxxxi& b) {
    return _mm256_add_epi16(a, b);
  }
  static inline __mxxxi _mmxxx_sub_epi(const __mxxxi& a, const __mxxxi& b) {
    return _mm256_sub_epi16(a, b);
  }
  static inline __mxxxi _mmxxx_min_epi(const __mxxxi& a, const __mxxxi& b) {
    return _mm256_min_epi16(a, b);
  }
  static inline __mxxxi _mmxxx_max_epi(const __mxxxi& a, const __mxxxi& b) {
    return _mm256_max_epi16(a, b);
  }
  static inline __mxxxi _mmxxx_set1_epi(type a) {
    return _mm256_set1_epi16(a);
  }
  // In-place prefix maximum over the packed elements of a in kLogNumVar
  // doubling steps: each step shifts by 2^k elements (2^k * kLSS bytes),
  // adds the accumulated gap penalties and masks off the vacated low
  // elements so they cannot win the max.
  static inline void _mmxxx_prefix_max(
      __mxxxi& a,  // NOLINT
      const __mxxxi* masks,
      const __mxxxi* penalties) {
    a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[0], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[0]), 2)));  // NOLINT
    a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[1], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[1]), 4)));  // NOLINT
    a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[2], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[2]), 8)));  // NOLINT
    a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[3], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[3]), 16)));  // NOLINT
  }
};
119
// 8 packed 32-bit scores per 256-bit register.
template<Architecture A>
struct InstructionSet<A, std::int32_t> {
  using type = std::int32_t;
  static constexpr std::uint32_t kNumVar = kRegisterSize / (8 * sizeof(type));
  static constexpr std::uint32_t kLogNumVar = 3;  // log2(kNumVar)
  static constexpr std::uint32_t kLSS = 4;  // one element, in bytes
  static constexpr std::uint32_t kRSS = 28;  // kNumVar - 1 elements, in bytes
  static inline __mxxxi _mmxxx_add_epi(const __mxxxi& a, const __mxxxi& b) {
    return _mm256_add_epi32(a, b);
  }
  static inline __mxxxi _mmxxx_sub_epi(const __mxxxi& a, const __mxxxi& b) {
    return _mm256_sub_epi32(a, b);
  }
  static inline __mxxxi _mmxxx_min_epi(const __mxxxi& a, const __mxxxi& b) {
    return _mm256_min_epi32(a, b);
  }
  static inline __mxxxi _mmxxx_max_epi(const __mxxxi& a, const __mxxxi& b) {
    return _mm256_max_epi32(a, b);
  }
  static inline __mxxxi _mmxxx_set1_epi(type a) {
    return _mm256_set1_epi32(a);
  }
  // Prefix maximum, as in the int16 specialization but with 3 doubling steps
  // (byte shifts of 4, 8, 16 = one, two, four 32-bit elements).
  static inline void _mmxxx_prefix_max(
      __mxxxi& a,  // NOLINT
      const __mxxxi* masks,
      const __mxxxi* penalties) {
    a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[0], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[0]), 4)));  // NOLINT
    a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[1], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[1]), 8)));  // NOLINT
    a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[2], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[2]), 16)));  // NOLINT
  }
};
151
#elif defined(__SSE4_1__) || defined(USE_SIMDE)

// 128-bit SIMD registers (SSE4.1, or any target via SIMDe translation)
constexpr std::uint32_t kRegisterSize = 128;
using __mxxxi = __m128i;

// Width-agnostic wrappers mirroring the AVX2 branch above.
inline __mxxxi _mmxxx_load_si(__mxxxi const* mem_addr) {
  return _mm_load_si128(mem_addr);
}

inline void _mmxxx_store_si(__mxxxi* mem_addr, const __mxxxi& a) {
  _mm_store_si128(mem_addr, a);
}

inline __mxxxi _mmxxx_or_si(const __mxxxi& a, const __mxxxi& b) {
  return _mm_or_si128(a, b);
}

// Whole-register byte shifts map directly to the 128-bit instructions here;
// n is in bytes and must be a compile-time constant.
#define _mmxxx_slli_si(a, n) \
    _mm_slli_si128(a, n)

#define _mmxxx_srli_si(a, n) \
    _mm_srli_si128(a, n)
174
// 8 packed 16-bit scores per 128-bit register.
template<Architecture A>
struct InstructionSet<A, std::int16_t> {
  using type = std::int16_t;
  static constexpr std::uint32_t kNumVar = kRegisterSize / (8 * sizeof(type));
  static constexpr std::uint32_t kLogNumVar = 3;  // log2(kNumVar)
  static constexpr std::uint32_t kLSS = 2;  // one element, in bytes
  static constexpr std::uint32_t kRSS = 14;  // kNumVar - 1 elements, in bytes
  static inline __mxxxi _mmxxx_add_epi(const __mxxxi& a, const __mxxxi& b) {
    return _mm_add_epi16(a, b);
  }
  static inline __mxxxi _mmxxx_sub_epi(const __mxxxi& a, const __mxxxi& b) {
    return _mm_sub_epi16(a, b);
  }
  static inline __mxxxi _mmxxx_min_epi(const __mxxxi& a, const __mxxxi& b) {
    return _mm_min_epi16(a, b);
  }
  static inline __mxxxi _mmxxx_max_epi(const __mxxxi& a, const __mxxxi& b) {
    return _mm_max_epi16(a, b);
  }
  static inline __mxxxi _mmxxx_set1_epi(type a) {
    return _mm_set1_epi16(a);
  }
  // Prefix maximum over the packed elements: 3 doubling steps shifting by
  // one, two, then four 16-bit elements (2, 4, 8 bytes).
  static inline void _mmxxx_prefix_max(
      __mxxxi& a,  // NOLINT
      const __mxxxi* masks,
      const __mxxxi* penalties) {
    a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[0], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[0]), 2)));  // NOLINT
    a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[1], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[1]), 4)));  // NOLINT
    a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[2], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[2]), 8)));  // NOLINT
  }
};
206
// 4 packed 32-bit scores per 128-bit register.
template<Architecture A>
struct InstructionSet<A, std::int32_t> {
  using type = std::int32_t;
  static constexpr std::uint32_t kNumVar = kRegisterSize / (8 * sizeof(type));
  static constexpr std::uint32_t kLogNumVar = 2;  // log2(kNumVar)
  static constexpr std::uint32_t kLSS = 4;  // one element, in bytes
  static constexpr std::uint32_t kRSS = 12;  // kNumVar - 1 elements, in bytes
  static inline __mxxxi _mmxxx_add_epi(const __mxxxi& a, const __mxxxi& b) {
    return _mm_add_epi32(a, b);
  }
  static inline __mxxxi _mmxxx_sub_epi(const __mxxxi& a, const __mxxxi& b) {
    return _mm_sub_epi32(a, b);
  }
  static inline __mxxxi _mmxxx_min_epi(const __mxxxi& a, const __mxxxi& b) {
    return _mm_min_epi32(a, b);
  }
  static inline __mxxxi _mmxxx_max_epi(const __mxxxi& a, const __mxxxi& b) {
    return _mm_max_epi32(a, b);
  }
  static inline __mxxxi _mmxxx_set1_epi(type a) {
    return _mm_set1_epi32(a);
  }
  // Prefix maximum over the packed elements: 2 doubling steps shifting by
  // one then two 32-bit elements (4, 8 bytes).
  static inline void _mmxxx_prefix_max(
      __mxxxi& a,  // NOLINT
      const __mxxxi* masks,
      const __mxxxi* penalties) {
    a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[0], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[0]), 4)));  // NOLINT
    a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[1], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[1]), 8)));  // NOLINT
  }
};
237
238 #endif
239
240 #if defined(__AVX2__) || defined(__SSE4_1__) || defined(USE_SIMDE)
241
// Debug helper: prints the packed values of a to stdout, lowest element
// first, separated by spaces (no trailing newline).
template<Architecture A, typename T>
void _mmxxx_print(const __mxxxi& a) {
  __attribute__((aligned(kRegisterSize / 8))) typename T::type unpacked[T::kNumVar];  // NOLINT
  _mmxxx_store_si(reinterpret_cast<__mxxxi*>(unpacked), a);

  for (std::uint32_t i = 0; i < T::kNumVar; i++) {
    std::cout << unpacked[i] << " ";
  }
}
251
// Horizontal maximum of the packed elements of a.
// NOTE(review): the accumulator starts at 0, so an all-negative register
// yields 0 rather than its true maximum — callers appear to rely on this
// clamping; confirm before changing.
template<Architecture A, typename T>
typename T::type _mmxxx_max_value(const __mxxxi& a) {
  typename T::type max_score = 0;
  __attribute__((aligned(kRegisterSize / 8))) typename T::type unpacked[T::kNumVar];  // NOLINT
  _mmxxx_store_si(reinterpret_cast<__mxxxi*>(unpacked), a);

  for (std::uint32_t i = 0; i < T::kNumVar; i++) {
    max_score = std::max(max_score, unpacked[i]);
  }
  return max_score;
}
263
// Extracts element i (0-based, lowest first) from the packed register a;
// i must be < T::kNumVar.
template<Architecture A, typename T>
typename T::type _mmxxx_value_at(const __mxxxi& a, std::uint32_t i) {
  __attribute__((aligned(kRegisterSize / 8))) typename T::type unpacked[T::kNumVar];  // NOLINT
  _mmxxx_store_si(reinterpret_cast<__mxxxi*>(unpacked), a);

  return unpacked[i];
}
271
// Linearly scans a row of row_width packed registers for the first element
// equal to value; returns its scalar column index (register index *
// kNumVar + lane) or -1 when absent.
template<Architecture A, typename T>
std::int32_t _mmxxx_index_of(
    const __mxxxi* row,
    std::uint32_t row_width,
    typename T::type value) {
  for (std::uint32_t i = 0; i < row_width; ++i) {
    __attribute__((aligned(kRegisterSize / 8))) typename T::type unpacked[T::kNumVar];  // NOLINT
    _mmxxx_store_si(reinterpret_cast<__mxxxi*>(unpacked), row[i]);

    for (std::uint32_t j = 0; j < T::kNumVar; j++) {
      if (unpacked[j] == value) {
        return i * T::kNumVar + j;
      }
    }
  }
  return -1;
}
289
290 #endif
291
// Factory for the SIMD engine. Returns nullptr when the build has no SIMD
// support (neither AVX2, SSE4.1 nor SIMDe), so callers can fall back to a
// scalar engine. Scoring parameters are forwarded to the constructor:
// m (match), n (mismatch), g/e (first gap open/extend), q/c (second gap
// open/extend, used by the convex subtype).
template<Architecture A>
std::unique_ptr<AlignmentEngine> SimdAlignmentEngine<A>::Create(
    AlignmentType type,
    AlignmentSubtype subtype,
    std::int8_t m,
    std::int8_t n,
    std::int8_t g,
    std::int8_t e,
    std::int8_t q,
    std::int8_t c) {
#if defined(__AVX2__) || defined(__SSE4_1__) || defined(USE_SIMDE)
  return std::unique_ptr<AlignmentEngine>(
      new SimdAlignmentEngine<A>(type, subtype, m, n, g, e, q, c));
#else
  return nullptr;
#endif
}
309
310 template<Architecture A>
311 struct SimdAlignmentEngine<A>::Implementation {
312 #if defined(__AVX2__) || defined(__SSE4_1__) || defined(USE_SIMDE)
313 std::vector<std::uint32_t> node_id_to_rank;
314
315 std::unique_ptr<__mxxxi[]> sequence_profile_storage;
316 std::uint64_t sequence_profile_size;
317 __mxxxi* sequence_profile;
318
319 std::vector<std::int32_t> first_column;
320 std::unique_ptr<__mxxxi[]> M_storage;
321 std::uint64_t M_size;
322 __mxxxi* H;
323 __mxxxi* F;
324 __mxxxi* E;
325 __mxxxi* O;
326 __mxxxi* Q;
327
328 std::unique_ptr<__mxxxi[]> masks_storage;
329 std::uint32_t masks_size;
330 __mxxxi* masks;
331
332 std::unique_ptr<__mxxxi[]> penalties_storage;
333 std::uint32_t penalties_size;
334 __mxxxi* penalties;
335
Implementationspoa::SimdAlignmentEngine::Implementation336 Implementation()
337 : node_id_to_rank(),
338 sequence_profile_storage(nullptr),
339 sequence_profile_size(0),
340 sequence_profile(nullptr),
341 first_column(),
342 M_storage(nullptr),
343 M_size(0),
344 H(nullptr),
345 F(nullptr),
346 E(nullptr),
347 O(nullptr),
348 Q(nullptr),
349 masks_storage(nullptr),
350 masks_size(0),
351 masks(nullptr),
352 penalties_storage(nullptr),
353 penalties_size(0),
354 penalties(nullptr) {
355 }
356 #endif
357 };
358
// Forwards the alignment configuration and scoring parameters to the base
// AlignmentEngine and creates the empty buffer holder; all buffers are
// allocated lazily in Prealloc/Realloc.
template<Architecture A>
SimdAlignmentEngine<A>::SimdAlignmentEngine(
    AlignmentType type,
    AlignmentSubtype subtype,
    std::int8_t m,
    std::int8_t n,
    std::int8_t g,
    std::int8_t e,
    std::int8_t q,
    std::int8_t c)
    : AlignmentEngine(type, subtype, m, n, g, e, q, c),
      pimpl_(new Implementation()) {
}
372
// Reserves all buffers needed to align a sequence of up to max_sequence_len
// characters, so later Align calls avoid reallocation. The worst-case score
// decides the narrowest packing that cannot underflow (scores are biased by
// 1024 above the type minimum, see kNegativeInfinity in Initialize):
// fits neither type -> silently skip (Align will throw), fits int32 only ->
// reserve for 32-bit lanes, otherwise for 16-bit lanes (the larger lane
// count, hence the larger register-row width).
template<Architecture A>
void SimdAlignmentEngine<A>::Prealloc(
    std::uint32_t max_sequence_len,
    std::uint8_t alphabet_size) {
  if (max_sequence_len > std::numeric_limits<int32_t>::max()) {
    throw std::invalid_argument(
        "[spoa::SimdAlignmentEngine::Prealloc] error: too large sequence!");
  }

#if defined(__AVX2__) || defined(__SSE4_1__) || defined(USE_SIMDE)

  // +8 rows of slack mirrors the bound used in Align
  std::int64_t worst_case_score = WorstCaseAlignmentScore(
      static_cast<std::int64_t>(max_sequence_len) + 8,
      static_cast<std::int64_t>(max_sequence_len) * alphabet_size);

  if (worst_case_score < std::numeric_limits<std::int32_t>::min() + 1024) {
    return;  // even 32-bit lanes would overflow; Align reports the error
  } else if (worst_case_score < std::numeric_limits<std::int16_t>::min() + 1024) {  // NOLINT
    try {
      Realloc(
          (max_sequence_len / InstructionSet<A, std::int32_t>::kNumVar) + 1,
          static_cast<std::uint64_t>(max_sequence_len) * alphabet_size,
          alphabet_size);
    } catch (std::bad_alloc& ba) {
      throw std::invalid_argument(
          "[spoa::SimdAlignmentEngine::Prealloc] error: insufficient memory!");
    }
  } else {
    try {
      Realloc(
          (max_sequence_len / InstructionSet<A, std::int16_t>::kNumVar) + 1,
          static_cast<std::uint64_t>(max_sequence_len) * alphabet_size,
          alphabet_size);
    } catch (std::bad_alloc& ba) {
      throw std::invalid_argument(
          "[spoa::SimdAlignmentEngine::Prealloc] error: insufficient memory!");
    }
  }

#endif
}
414
// Grows (never shrinks) the pimpl buffers to fit a DP problem of
// matrix_width registers per row, matrix_height rows (graph nodes + 1) and
// num_codes alphabet symbols. The score matrices share one aligned backing
// allocation: linear needs only H, affine carves H|F|E out of it, convex
// carves H|F|E|O|Q. Raw allocations are kept in *_storage unique_ptrs while
// the aligned views are exposed through the plain pointers.
template<Architecture A>
void SimdAlignmentEngine<A>::Realloc(
    std::uint64_t matrix_width,
    std::uint64_t matrix_height,
    std::uint8_t num_codes) {
#if defined(__AVX2__) || defined(__SSE4_1__) || defined(USE_SIMDE)
  if (pimpl_->node_id_to_rank.size() < matrix_height - 1) {
    pimpl_->node_id_to_rank.resize(matrix_height - 1, 0);
  }
  // one profile row of matrix_width registers per alphabet code
  if (pimpl_->sequence_profile_size < num_codes * matrix_width) {
    __mxxxi* storage = nullptr;
    pimpl_->sequence_profile_size = num_codes * matrix_width;
    pimpl_->sequence_profile = AllocateAlignedMemory<A>(
        &storage,
        pimpl_->sequence_profile_size,
        kRegisterSize / 8);
    pimpl_->sequence_profile_storage.reset();
    pimpl_->sequence_profile_storage = std::unique_ptr<__mxxxi[]>(storage);
  }
  if (subtype_ == AlignmentSubtype::kLinear) {
    // single matrix H; first_column holds one scalar per row
    if (pimpl_->first_column.size() < matrix_height) {
      pimpl_->first_column.resize(matrix_height, 0);
    }
    if (pimpl_->M_size < matrix_height * matrix_width) {
      __mxxxi* storage = nullptr;
      pimpl_->M_size = matrix_height * matrix_width;
      pimpl_->H = AllocateAlignedMemory<A>(
          &storage,
          pimpl_->M_size,
          kRegisterSize / 8);
      pimpl_->M_storage.reset();
      pimpl_->M_storage = std::unique_ptr<__mxxxi[]>(storage);
    }
  } else if (subtype_ == AlignmentSubtype::kAffine) {
    // three matrices H, F, E; first_column holds two scalars per row
    if (pimpl_->first_column.size() < 2 * matrix_height) {
      pimpl_->first_column.resize(2 * matrix_height, 0);
    }
    if (pimpl_->M_size < 3 * matrix_height * matrix_width) {
      __mxxxi* storage = nullptr;
      pimpl_->M_size = 3 * matrix_height * matrix_width;
      pimpl_->H = AllocateAlignedMemory<A>(
          &storage,
          pimpl_->M_size,
          kRegisterSize / 8);
      pimpl_->F = pimpl_->H + matrix_height * matrix_width;
      pimpl_->E = pimpl_->F + matrix_height * matrix_width;
      pimpl_->M_storage.reset();
      pimpl_->M_storage = std::unique_ptr<__mxxxi[]>(storage);
    }
  } else if (subtype_ == AlignmentSubtype::kConvex) {
    // five matrices H, F, E, O, Q; first_column holds three scalars per row
    if (pimpl_->first_column.size() < 3 * matrix_height) {
      pimpl_->first_column.resize(3 * matrix_height, 0);
    }
    if (pimpl_->M_size < 5 * matrix_height * matrix_width) {
      __mxxxi* storage = nullptr;
      pimpl_->M_size = 5 * matrix_height * matrix_width;
      pimpl_->H = AllocateAlignedMemory<A>(
          &storage,
          pimpl_->M_size,
          kRegisterSize / 8);
      pimpl_->F = pimpl_->H + matrix_height * matrix_width;
      pimpl_->E = pimpl_->F + matrix_height * matrix_width;
      pimpl_->O = pimpl_->E + matrix_height * matrix_width;
      pimpl_->Q = pimpl_->O + matrix_height * matrix_width;
      pimpl_->M_storage.reset();
      pimpl_->M_storage = std::unique_ptr<__mxxxi[]>(storage);
    }
  }
  // kLogNumVar + 1 masks and 2 * kLogNumVar penalty registers are sized for
  // the int16 variant, which needs the most of both
  if (pimpl_->masks_size < InstructionSet<A, std::int16_t>::kLogNumVar + 1) {
    __mxxxi* storage = nullptr;
    pimpl_->masks_size = InstructionSet<A, std::int16_t>::kLogNumVar + 1;
    pimpl_->masks = AllocateAlignedMemory<A>(
        &storage,
        pimpl_->masks_size,
        kRegisterSize / 8);
    pimpl_->masks_storage.reset();
    pimpl_->masks_storage = std::unique_ptr<__mxxxi[]>(storage);
  }
  if (pimpl_->penalties_size < 2 * InstructionSet<A, std::int16_t>::kLogNumVar) {  // NOLINT
    __mxxxi* storage = nullptr;
    pimpl_->penalties_size = 2 * InstructionSet<A, std::int16_t>::kLogNumVar;
    pimpl_->penalties = AllocateAlignedMemory<A>(
        &storage,
        pimpl_->penalties_size,
        kRegisterSize / 8);
    pimpl_->penalties_storage.reset();
    pimpl_->penalties_storage = std::unique_ptr<__mxxxi[]>(storage);
  }
#endif
}
505
// Fills the query profile and the boundary conditions of the DP matrices for
// one Align call. T is the InstructionSet specialization chosen by Align.
// normal_matrix_width is the sequence length in scalars, matrix_width the
// row width in registers (padded lanes get the worst penalty so they never
// win a max), matrix_height the number of graph nodes + 1.
template<Architecture A> template<typename T>
void SimdAlignmentEngine<A>::Initialize(
    const char* sequence,
    const Graph& graph,
    std::uint64_t normal_matrix_width,
    std::uint64_t matrix_width,
    std::uint64_t matrix_height) noexcept {
#if defined(__AVX2__) || defined(__SSE4_1__) || defined(USE_SIMDE)
  // worst of all scoring parameters, used for the padding lanes (sic: typo
  // "penatly" kept — renaming is out of scope for a doc-only pass)
  std::int32_t padding_penatly = -1 * std::max(
      std::max(abs(m_), abs(n_)),
      std::max(abs(g_), abs(q_)));

  __attribute__((aligned(kRegisterSize / 8))) typename T::type unpacked[T::kNumVar] = {};  // NOLINT

  // profile[code][column] = match/mismatch score of sequence[column] vs code
  for (std::uint32_t i = 0; i < graph.num_codes(); ++i) {
    char c = graph.decoder(i);
    for (std::uint32_t j = 0; j < matrix_width; ++j) {
      for (std::uint32_t k = 0; k < T::kNumVar; ++k) {
        unpacked[k] = (j * T::kNumVar + k) < normal_matrix_width ?
            (c == sequence[j * T::kNumVar + k] ? m_ : n_) : padding_penatly;
      }
      pimpl_->sequence_profile[i * matrix_width + j] =
          _mmxxx_load_si(reinterpret_cast<const __mxxxi*>(unpacked));
    }
  }

  const auto& rank_to_node = graph.rank_to_node();
  for (std::uint32_t i = 0; i < rank_to_node.size(); ++i) {
    pimpl_->node_id_to_rank[rank_to_node[i]->id] = i;
  }

  // biased minimum so additions cannot wrap around
  typename T::type kNegativeInfinity =
      std::numeric_limits<typename T::type>::min() + 1024;

  __mxxxi negative_infinities = T::_mmxxx_set1_epi(kNegativeInfinity);
  __mxxxi zeroes = T::_mmxxx_set1_epi(0);

  // initialize secondary matrices
  // NOTE: the case fallthrough is intentional — kConvex also needs the
  // affine initialization of F/E, and both end at kLinear (no-op).
  switch (subtype_) {
    case AlignmentSubtype::kConvex:
      // first row of O/Q: O blocked, Q = q + (column index) * c
      for (std::uint32_t j = 0; j < matrix_width; ++j) {
        pimpl_->O[j] = negative_infinities;
        pimpl_->Q[j] = T::_mmxxx_set1_epi(q_ + j * T::kNumVar * c_);

        __mxxxi c = T::_mmxxx_set1_epi(c_);
        for (std::uint32_t k = 1; k < T::kNumVar; ++k) {
          // shift-and-add builds the per-lane offsets 0, c, 2c, ...
          c = _mmxxx_slli_si(c, T::kLSS);
          pimpl_->Q[j] = T::_mmxxx_add_epi(pimpl_->Q[j], c);
        }
      }
      // first column of Q, propagated along graph predecessors
      pimpl_->first_column[2 * matrix_height] = 0;
      for (std::uint32_t i = 1; i < matrix_height; ++i) {
        const auto& edges = rank_to_node[i - 1]->inedges;
        std::int32_t penalty = edges.empty() ? q_ - c_ : kNegativeInfinity;
        for (const auto& it : edges) {
          std::uint32_t pred_i = pimpl_->node_id_to_rank[it->tail->id] + 1;
          penalty = std::max(penalty, pimpl_->first_column[2 * matrix_height + pred_i]);  // NOLINT
        }
        pimpl_->first_column[2 * matrix_height + i] = penalty + c_;
      }
      // fallthrough: convex also initializes the affine matrices
    case AlignmentSubtype::kAffine:
      // first row of F/E: F blocked, E = g + (column index) * e
      for (std::uint32_t j = 0; j < matrix_width; ++j) {
        pimpl_->F[j] = negative_infinities;
        pimpl_->E[j] = T::_mmxxx_set1_epi(g_ + j * T::kNumVar * e_);

        __mxxxi e = T::_mmxxx_set1_epi(e_);
        for (std::uint32_t k = 1; k < T::kNumVar; ++k) {
          e = _mmxxx_slli_si(e, T::kLSS);
          pimpl_->E[j] = T::_mmxxx_add_epi(pimpl_->E[j], e);
        }
      }
      // first column of E, propagated along graph predecessors
      pimpl_->first_column[matrix_height] = 0;
      for (std::uint32_t i = 1; i < matrix_height; ++i) {
        const auto& edges = rank_to_node[i - 1]->inedges;
        std::int32_t penalty = edges.empty() ? g_ - e_ : kNegativeInfinity;
        for (const auto& it : edges) {
          std::uint32_t pred_i = pimpl_->node_id_to_rank[it->tail->id] + 1;
          penalty = std::max(penalty, pimpl_->first_column[matrix_height + pred_i]);  // NOLINT
        }
        pimpl_->first_column[matrix_height + i] = penalty + e_;
      }
      // fallthrough
    case AlignmentSubtype::kLinear:
    default:
      break;
  }

  // initialize primary matrix
  switch (type_) {
    case AlignmentType::kSW:
      // local alignment starts anywhere: zero boundary
      for (std::uint32_t j = 0; j < matrix_width; ++j) {
        pimpl_->H[j] = zeroes;
      }
      for (std::uint32_t i = 0; i < matrix_height; ++i) {
        pimpl_->first_column[i] = 0;
      }
      break;
    case AlignmentType::kNW:
      // global alignment: boundary carries the gap penalties
      switch (subtype_) {
        case AlignmentSubtype::kConvex:
          for (std::uint32_t i = 0; i < matrix_height; ++i) {
            pimpl_->first_column[i] = std::max(
                pimpl_->first_column[matrix_height + i],
                pimpl_->first_column[2 * matrix_height + i]);
          }
          for (std::uint32_t j = 0; j < matrix_width; ++j) {
            pimpl_->H[j] = T::_mmxxx_max_epi(pimpl_->E[j], pimpl_->Q[j]);
          }
          break;
        case AlignmentSubtype::kAffine:
          for (std::uint32_t i = 0; i < matrix_height; ++i) {
            pimpl_->first_column[i] = pimpl_->first_column[matrix_height + i];
          }
          for (std::uint32_t j = 0; j < matrix_width; ++j) {
            pimpl_->H[j] = pimpl_->E[j];
          }
          break;
        case AlignmentSubtype::kLinear:
          pimpl_->first_column[0] = 0;
          for (std::uint32_t i = 1; i < matrix_height; ++i) {
            const auto& edges = rank_to_node[i - 1]->inedges;
            std::int32_t penalty = edges.empty() ? 0 : kNegativeInfinity;
            for (const auto& it : edges) {
              std::uint32_t pred_i = pimpl_->node_id_to_rank[it->tail->id] + 1;
              penalty = std::max(penalty, pimpl_->first_column[pred_i]);
            }
            pimpl_->first_column[i] = penalty + g_;
          }
          for (std::uint32_t j = 0; j < matrix_width; ++j) {
            pimpl_->H[j] = T::_mmxxx_set1_epi(g_ + j * T::kNumVar * g_);
            __mxxxi g = T::_mmxxx_set1_epi(g_);

            for (std::uint32_t k = 1; k < T::kNumVar; ++k) {
              g = _mmxxx_slli_si(g, T::kLSS);
              pimpl_->H[j] = T::_mmxxx_add_epi(pimpl_->H[j], g);
            }
          }
          // fallthrough into default (no-op)
        default:
          break;
      }
      break;
    case AlignmentType::kOV:
      // overlap alignment: free first column, penalized first row
      switch (subtype_) {
        case AlignmentSubtype::kConvex:
          for (std::uint32_t j = 0; j < matrix_width; ++j) {
            pimpl_->H[j] = T::_mmxxx_max_epi(pimpl_->E[j],
                pimpl_->Q[j]);
          }
          break;
        case AlignmentSubtype::kAffine:
          for (std::uint32_t j = 0; j < matrix_width; ++j) {
            pimpl_->H[j] = pimpl_->E[j];
          }
          break;
        case AlignmentSubtype::kLinear:
          for (std::uint32_t j = 0; j < matrix_width; ++j) {
            pimpl_->H[j] = T::_mmxxx_set1_epi(g_ + j * T::kNumVar * g_);
            __mxxxi g = T::_mmxxx_set1_epi(g_);

            for (std::uint32_t k = 1; k < T::kNumVar; ++k) {
              g = _mmxxx_slli_si(g, T::kLSS);
              pimpl_->H[j] = T::_mmxxx_add_epi(pimpl_->H[j], g);
            }
          }
          break;
        default:
          break;
      }
      for (std::uint32_t i = 0; i < matrix_height; ++i) {
        pimpl_->first_column[i] = 0;
      }
      break;
    default:
      break;
  }
#endif
}
682
// Aligns sequence against graph, optionally reporting the best score.
// Chooses the narrowest score type that cannot underflow (given the 1024
// bias of kNegativeInfinity): 16-bit lanes when the worst-case score fits,
// 32-bit otherwise, and throws if even 32 bits would overflow. Buffers are
// (re)allocated on demand, then the run is dispatched to the
// Linear/Affine/Convex kernel for the configured subtype.
// Returns an empty Alignment for an empty graph or sequence, or for builds
// without SIMD support.
template<Architecture A>
Alignment SimdAlignmentEngine<A>::Align(
    const char* sequence, std::uint32_t sequence_len,
    const Graph& graph,
    std::int32_t* score) {
  if (sequence_len > std::numeric_limits<int32_t>::max()) {
    throw std::invalid_argument(
        "[spoa::SimdAlignmentEngine::Align] error: too large sequence!");
  }

  if (graph.nodes().empty() || sequence_len == 0) {
    return Alignment();
  }

#if defined(__AVX2__) || defined(__SSE4_1__) || defined(USE_SIMDE)

  // +8 rows of slack mirrors the bound used in Prealloc
  std::int64_t worst_case_score = WorstCaseAlignmentScore(
      sequence_len + 8,
      graph.nodes().size());

  if (worst_case_score < std::numeric_limits<std::int32_t>::min() + 1024) {
    throw std::invalid_argument(
        "[spoa::SimdAlignmentEngine::Align] error: possible overflow!");
  } else if (worst_case_score < std::numeric_limits<std::int16_t>::min() + 1024) {  // NOLINT
    // needs 32-bit lanes
    try {
      Realloc(
          std::ceil(static_cast<double>(sequence_len) / InstructionSet<A, std::int32_t>::kNumVar),  // NOLINT
          graph.nodes().size() + 1,
          graph.num_codes());
    } catch (std::bad_alloc& ba) {
      throw std::invalid_argument(
          "[spoa::SimdAlignmentEngine::Align] error: insufficient memory!");
    }
    Initialize<InstructionSet<A, std::int32_t>>(
        sequence,
        graph,
        sequence_len,
        std::ceil(static_cast<double>(sequence_len) / InstructionSet<A, std::int32_t>::kNumVar),  // NOLINT
        graph.nodes().size() + 1);

    if (subtype_ == AlignmentSubtype::kLinear) {
      return Linear<InstructionSet<A, std::int32_t>>(sequence_len, graph, score);  // NOLINT
    } else if (subtype_ == AlignmentSubtype::kAffine) {
      return Affine<InstructionSet<A, std::int32_t>>(sequence_len, graph, score);  // NOLINT
    } else if (subtype_ == AlignmentSubtype::kConvex) {
      return Convex<InstructionSet<A, std::int32_t>>(sequence_len, graph, score);  // NOLINT
    }
  } else {
    // 16-bit lanes are sufficient (twice the parallelism)
    try {
      Realloc(
          std::ceil(static_cast<double>(sequence_len) / InstructionSet<A, std::int16_t>::kNumVar),  // NOLINT
          graph.nodes().size() + 1,
          graph.num_codes());
    } catch (std::bad_alloc& ba) {
      throw std::invalid_argument(
          "[spoa::SimdAlignmentEngine::Align] error: insufficient memory!");
    }
    Initialize<InstructionSet<A, std::int16_t>>(
        sequence,
        graph,
        sequence_len,
        std::ceil(static_cast<double>(sequence_len) / InstructionSet<A, std::int16_t>::kNumVar),  // NOLINT
        graph.nodes().size() + 1);

    if (subtype_ == AlignmentSubtype::kLinear) {
      return Linear<InstructionSet<A, std::int16_t>>(sequence_len, graph, score);  // NOLINT
    } else if (subtype_ == AlignmentSubtype::kAffine) {
      return Affine<InstructionSet<A, std::int16_t>>(sequence_len, graph, score);  // NOLINT
    } else if (subtype_ == AlignmentSubtype::kConvex) {
      return Convex<InstructionSet<A, std::int16_t>>(sequence_len, graph, score);  // NOLINT
    }
  }

#endif
  return Alignment();
}
759
760 template<Architecture A> template <typename T>
Linear(std::uint32_t sequence_len,const Graph & graph,std::int32_t * score)761 Alignment SimdAlignmentEngine<A>::Linear(
762 std::uint32_t sequence_len,
763 const Graph& graph,
764 std::int32_t* score) noexcept {
765 #if defined(__AVX2__) || defined(__SSE4_1__) || defined(USE_SIMDE)
766 std::uint64_t normal_matrix_width = sequence_len;
767 std::uint64_t matrix_width =
768 std::ceil(static_cast<double>(sequence_len) / T::kNumVar);
769 const auto& rank_to_node = graph.rank_to_node();
770
771 typename T::type kNegativeInfinity =
772 std::numeric_limits<typename T::type>::min() + 1024;
773
774 __attribute__((aligned(kRegisterSize / 8))) typename T::type unpacked[T::kNumVar] = {0}; // NOLINT
775
776 for (std::uint32_t i = 0, j = 0; i < T::kNumVar && j < T::kLogNumVar; ++i) {
777 unpacked[i] = kNegativeInfinity;
778 if ((i & (i + 1)) == 0) {
779 pimpl_->masks[j++] = _mmxxx_load_si(
780 reinterpret_cast<const __mxxxi*>(unpacked));
781 }
782 }
783 pimpl_->masks[T::kLogNumVar] = _mmxxx_slli_si(
784 T::_mmxxx_set1_epi(kNegativeInfinity),
785 T::kLSS);
786
787 pimpl_->penalties[0] = T::_mmxxx_set1_epi(g_);
788 for (std::uint32_t i = 1; i < T::kLogNumVar; ++i) {
789 pimpl_->penalties[i] = T::_mmxxx_add_epi(
790 pimpl_->penalties[i - 1],
791 pimpl_->penalties[i - 1]);
792 }
793
794 typename T::type max_score = type_ == AlignmentType::kSW ? 0 : kNegativeInfinity; // NOLINT
795 std::int32_t max_i = -1;
796 std::int32_t max_j = -1;
797 std::uint32_t last_column_id = (normal_matrix_width - 1) % T::kNumVar;
798 __mxxxi zeroes = T::_mmxxx_set1_epi(0);
799 __mxxxi g = T::_mmxxx_set1_epi(g_);
800
801 // alignment
802 for (const auto& it : rank_to_node) {
803 __mxxxi* char_profile = &(pimpl_->sequence_profile[it->code * matrix_width]); // NOLINT
804
805 std::uint32_t i = pimpl_->node_id_to_rank[it->id] + 1;
806 std::uint32_t pred_i = it->inedges.empty() ?
807 0 : pimpl_->node_id_to_rank[it->inedges[0]->tail->id] + 1;
808
809 __mxxxi* H_row = &(pimpl_->H[i * matrix_width]);
810 __mxxxi* H_pred_row = &(pimpl_->H[pred_i * matrix_width]);
811
812 __mxxxi x = _mmxxx_srli_si(
813 T::_mmxxx_set1_epi(pimpl_->first_column[pred_i]),
814 T::kRSS);
815
816 for (std::uint64_t j = 0; j < matrix_width; ++j) {
817 // get diagonal
818 __mxxxi t1 = _mmxxx_srli_si(H_pred_row[j], T::kRSS);
819 H_row[j] = _mmxxx_or_si(
820 _mmxxx_slli_si(H_pred_row[j], T::kLSS),
821 x);
822 x = t1;
823
824 // update M
825 H_row[j] = T::_mmxxx_max_epi(
826 T::_mmxxx_add_epi(H_row[j], char_profile[j]),
827 T::_mmxxx_add_epi(H_pred_row[j], g));
828 }
829 // check other predecessors
830 for (std::uint32_t p = 1; p < it->inedges.size(); ++p) {
831 pred_i = pimpl_->node_id_to_rank[it->inedges[p]->tail->id] + 1;
832
833 H_pred_row = &(pimpl_->H[pred_i * matrix_width]);
834
835 x = _mmxxx_srli_si(
836 T::_mmxxx_set1_epi(pimpl_->first_column[pred_i]),
837 T::kRSS);
838
839 for (std::uint64_t j = 0; j < matrix_width; ++j) {
840 // get diagonal
841 __mxxxi t1 = _mmxxx_srli_si(H_pred_row[j], T::kRSS);
842 __mxxxi m = _mmxxx_or_si(
843 _mmxxx_slli_si(H_pred_row[j], T::kLSS),
844 x);
845 x = t1;
846
847 // updage M
848 H_row[j] = T::_mmxxx_max_epi(
849 H_row[j],
850 T::_mmxxx_max_epi(
851 T::_mmxxx_add_epi(m, char_profile[j]),
852 T::_mmxxx_add_epi(H_pred_row[j], g)));
853 }
854 }
855
856 __mxxxi score = T::_mmxxx_set1_epi(kNegativeInfinity);
857 x = _mmxxx_srli_si(
858 T::_mmxxx_add_epi(
859 T::_mmxxx_set1_epi(pimpl_->first_column[i]),
860 g),
861 T::kRSS);
862
863 for (std::uint64_t j = 0; j < matrix_width; ++j) {
864 // add last element of previous vector into this one
865 H_row[j] = T::_mmxxx_max_epi(
866 H_row[j],
867 _mmxxx_or_si(x, pimpl_->masks[T::kLogNumVar]));
868
869 T::_mmxxx_prefix_max(H_row[j], pimpl_->masks, pimpl_->penalties);
870
871 x = _mmxxx_srli_si(
872 T::_mmxxx_add_epi(H_row[j], g),
873 T::kRSS);
874
875 if (type_ == AlignmentType::kSW) {
876 H_row[j] = T::_mmxxx_max_epi(H_row[j], zeroes);
877 }
878 score = T::_mmxxx_max_epi(score, H_row[j]);
879 }
880
881 if (type_ == AlignmentType::kSW) {
882 std::int32_t max_row_score = _mmxxx_max_value<A, T>(score);
883 if (max_score < max_row_score) {
884 max_score = max_row_score;
885 max_i = i;
886 }
887 } else if (type_ == AlignmentType::kOV) {
888 if (it->outedges.empty()) {
889 std::int32_t max_row_score = _mmxxx_max_value<A, T>(score);
890 if (max_score < max_row_score) {
891 max_score = max_row_score;
892 max_i = i;
893 }
894 }
895 } else if (type_ == AlignmentType::kNW) {
896 if (it->outedges.empty()) {
897 std::int32_t max_row_score = _mmxxx_value_at<A, T>(
898 H_row[matrix_width - 1],
899 last_column_id);
900 if (max_score < max_row_score) {
901 max_score = max_row_score;
902 max_i = i;
903 }
904 }
905 }
906 }
907
908 if (max_i == -1 && max_j == -1) {
909 return Alignment();
910 }
911 if (score) {
912 *score = max_score;
913 }
914
915 if (type_ == AlignmentType::kSW) {
916 max_j = _mmxxx_index_of<A, T>(
917 &(pimpl_->H[max_i * matrix_width]),
918 matrix_width,
919 max_score);
920 } else if (type_ == AlignmentType::kOV) {
921 if (rank_to_node[max_i - 1]->outedges.empty()) {
922 max_j = _mmxxx_index_of<A, T>(
923 &(pimpl_->H[max_i * matrix_width]),
924 matrix_width,
925 max_score);
926 } else {
927 max_j = normal_matrix_width - 1;
928 }
929 } else if (type_ == AlignmentType::kNW) {
930 max_j = normal_matrix_width - 1;
931 }
932
933 // backtrack
934 std::uint32_t max_num_predecessors = 1;
935 for (std::uint32_t i = 0; i < static_cast<std::uint32_t>(max_i); ++i) {
936 max_num_predecessors = std::max(
937 max_num_predecessors,
938 static_cast<std::uint32_t>(rank_to_node[i]->inedges.size()));
939 }
940
941 typename T::type* backtrack_storage = nullptr;
942 typename T::type* H = AllocateAlignedMemory<A>(
943 &backtrack_storage,
944 3 * T::kNumVar + 2 * T::kNumVar * max_num_predecessors,
945 kRegisterSize / 8);
946 typename T::type* H_pred = H + T::kNumVar;
947 typename T::type* H_diag_pred = H_pred + T::kNumVar * max_num_predecessors;
948 typename T::type* H_left_pred = H_diag_pred + T::kNumVar * max_num_predecessors; // NOLINT
949 typename T::type* profile = H_left_pred + T::kNumVar;
950
951 std::vector<std::uint32_t> predecessors;
952
953 std::int32_t i = max_i;
954 std::int32_t j = max_j;
955 std::int32_t prev_i = 0, prev_j = 0;
956
957 std::uint32_t j_div = j / T::kNumVar;
958 std::uint32_t j_mod = j % T::kNumVar;
959
960 bool load_next_segment = true;
961
962 Alignment alignment;
963
964 do {
965 // check stop condition
966 if (j == -1 || i == 0) {
967 break;
968 }
969
970 const auto& it = rank_to_node[i - 1];
971 // load everything
972 if (load_next_segment) {
973 predecessors.clear();
974
975 // load current cells
976 _mmxxx_store_si(
977 reinterpret_cast<__mxxxi*>(H),
978 pimpl_->H[i * matrix_width + j_div]);
979
980 // load predecessors cells
981 if (it->inedges.empty()) {
982 predecessors.emplace_back(0);
983 _mmxxx_store_si(reinterpret_cast<__mxxxi*>(H_pred), pimpl_->H[j_div]);
984 } else {
985 std::uint32_t store_pos = 0;
986 for (const auto& jt : it->inedges) {
987 predecessors.emplace_back(pimpl_->node_id_to_rank[jt->tail->id] + 1);
988 _mmxxx_store_si(
989 reinterpret_cast<__mxxxi*>(&H_pred[store_pos * T::kNumVar]),
990 pimpl_->H[predecessors.back() * matrix_width + j_div]);
991 ++store_pos;
992 }
993 }
994
995 // load query profile cells
996 _mmxxx_store_si(
997 reinterpret_cast<__mxxxi*>(profile),
998 pimpl_->sequence_profile[it->code * matrix_width + j_div]);
999 }
1000
1001 // check stop condition
1002 if (type_ == AlignmentType::kSW && H[j_mod] == 0) {
1003 break;
1004 }
1005
1006 if (j_mod == 0) {
1007 // border case
1008 if (j_div > 0) {
1009 _mmxxx_store_si(
1010 reinterpret_cast<__mxxxi*>(H_left_pred),
1011 pimpl_->H[i * matrix_width + j_div - 1]);
1012
1013 for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
1014 _mmxxx_store_si(
1015 reinterpret_cast<__mxxxi*>(&H_diag_pred[p * T::kNumVar]),
1016 pimpl_->H[predecessors[p] * matrix_width + (j_div - 1)]);
1017 }
1018 } else {
1019 H_left_pred[T::kNumVar - 1] = pimpl_->first_column[i];
1020
1021 for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
1022 H_diag_pred[(p + 1) * T::kNumVar - 1] =
1023 pimpl_->first_column[predecessors[p]];
1024 }
1025 }
1026 }
1027
1028 // find best predecessor cell
1029 bool predecessor_found = false;
1030
1031 if (i != 0) {
1032 for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
1033 if ((j_mod == 0 && H[j_mod] == H_diag_pred[(p + 1) * T::kNumVar - 1] + profile[j_mod]) || // NOLINT
1034 (j_mod != 0 && H[j_mod] == H_pred[p * T::kNumVar + j_mod - 1] + profile[j_mod])) { // NOLINT
1035 prev_i = predecessors[p];
1036 prev_j = j - 1;
1037 predecessor_found = true;
1038 break;
1039 }
1040 }
1041 }
1042
1043 if (!predecessor_found && i != 0) {
1044 for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
1045 if (H[j_mod] == H_pred[p * T::kNumVar + j_mod] + g_) {
1046 prev_i = predecessors[p];
1047 prev_j = j;
1048 predecessor_found = true;
1049 break;
1050 }
1051 }
1052 }
1053
1054 if (!predecessor_found) {
1055 if ((j_mod == 0 && H[j_mod] == H_left_pred[T::kNumVar - 1] + g_) ||
1056 (j_mod != 0 && H[j_mod] == H[j_mod - 1] + g_)) {
1057 prev_i = i;
1058 prev_j = j - 1;
1059 predecessor_found = true;
1060 }
1061 }
1062
1063 alignment.emplace_back(
1064 i == prev_i ? -1 : rank_to_node[i - 1]->id,
1065 j == prev_j ? -1 : j);
1066
1067 // update for next round
1068 load_next_segment =
1069 (i == prev_i ? false : true) ||
1070 (j != prev_j && prev_j % T::kNumVar == T::kNumVar - 1 ? true : false);
1071
1072 i = prev_i;
1073 j = prev_j;
1074 j_div = j / T::kNumVar;
1075 j_mod = j % T::kNumVar;
1076 } while (true);
1077
1078 delete[] backtrack_storage;
1079
1080 // update alignment for NW (backtrack stops on first row or column)
1081 if (type_ == AlignmentType::kNW) {
1082 while (i == 0 && j != -1) {
1083 alignment.emplace_back(-1, j);
1084 --j;
1085 }
1086 while (i != 0 && j == -1) {
1087 alignment.emplace_back(rank_to_node[i - 1]->id, -1);
1088
1089 const auto& it = rank_to_node[i - 1];
1090 if (it->inedges.empty()) {
1091 i = 0;
1092 } else {
1093 for (const auto& jt : it->inedges) {
1094 std::uint32_t pred_i = pimpl_->node_id_to_rank[jt->tail->id] + 1;
1095 if (pimpl_->first_column[i] == pimpl_->first_column[pred_i] + g_) {
1096 i = pred_i;
1097 break;
1098 }
1099 }
1100 }
1101 }
1102 }
1103
1104 std::reverse(alignment.begin(), alignment.end());
1105 return alignment;
1106 #else
1107 return Alignment();
1108 #endif
1109 }
1110
1111 template<Architecture A> template <typename T>
Affine(std::uint32_t sequence_len,const Graph & graph,std::int32_t * score)1112 Alignment SimdAlignmentEngine<A>::Affine(
1113 std::uint32_t sequence_len,
1114 const Graph& graph,
1115 std::int32_t* score) noexcept {
1116 #if defined(__AVX2__) || defined(__SSE4_1__) || defined(USE_SIMDE)
1117 std::uint64_t normal_matrix_width = sequence_len;
1118 std::uint64_t matrix_width =
1119 std::ceil(static_cast<double>(sequence_len) / T::kNumVar);
1120 const auto& rank_to_node = graph.rank_to_node();
1121
1122 typename T::type kNegativeInfinity =
1123 std::numeric_limits<typename T::type>::min() + 1024;
1124
1125 typename T::type max_score = type_ == AlignmentType::kSW ? 0 : kNegativeInfinity; // NOLINT
1126 std::int32_t max_i = -1;
1127 std::int32_t max_j = -1;
1128 std::uint32_t last_column_id = (normal_matrix_width - 1) % T::kNumVar;
1129 __mxxxi zeroes = T::_mmxxx_set1_epi(0);
1130 __mxxxi g = T::_mmxxx_set1_epi(g_ - e_);
1131 __mxxxi e = T::_mmxxx_set1_epi(e_);
1132
1133 __attribute__((aligned(kRegisterSize / 8))) typename T::type unpacked[T::kNumVar] = {0}; // NOLINT
1134
1135 for (std::uint32_t i = 0, j = 0; i < T::kNumVar && j < T::kLogNumVar; ++i) {
1136 unpacked[i] = kNegativeInfinity;
1137 if ((i & (i + 1)) == 0) {
1138 pimpl_->masks[j++] = _mmxxx_load_si(
1139 reinterpret_cast<const __mxxxi*>(unpacked));
1140 }
1141 }
1142 pimpl_->masks[T::kLogNumVar] = _mmxxx_slli_si(
1143 T::_mmxxx_set1_epi(kNegativeInfinity),
1144 T::kLSS);
1145
1146 pimpl_->penalties[0] = T::_mmxxx_set1_epi(e_);
1147 for (std::uint32_t i = 1; i < T::kLogNumVar; ++i) {
1148 pimpl_->penalties[i] = T::_mmxxx_add_epi(
1149 pimpl_->penalties[i - 1],
1150 pimpl_->penalties[i - 1]);
1151 }
1152
1153 // alignment
1154 for (const auto& it : rank_to_node) {
1155 __mxxxi* char_profile = &(pimpl_->sequence_profile[it->code * matrix_width]); // NOLINT
1156
1157 std::uint32_t i = pimpl_->node_id_to_rank[it->id] + 1;
1158
1159 __mxxxi* H_row = &(pimpl_->H[i * matrix_width]);
1160 __mxxxi* F_row = &(pimpl_->F[i * matrix_width]);
1161
1162 std::uint32_t pred_i = it->inedges.empty() ?
1163 0 : pimpl_->node_id_to_rank[it->inedges[0]->tail->id] + 1;
1164
1165 __mxxxi* H_pred_row = &(pimpl_->H[pred_i * matrix_width]);
1166 __mxxxi* F_pred_row = &(pimpl_->F[pred_i * matrix_width]);
1167
1168 __mxxxi x = _mmxxx_srli_si(
1169 T::_mmxxx_set1_epi(pimpl_->first_column[pred_i]),
1170 T::kRSS);
1171
1172 for (std::uint64_t j = 0; j < matrix_width; ++j) {
1173 // update F
1174 F_row[j] = T::_mmxxx_add_epi(
1175 T::_mmxxx_max_epi(
1176 T::_mmxxx_add_epi(H_pred_row[j], g),
1177 F_pred_row[j]),
1178 e);
1179
1180 // update H
1181 H_row[j] = T::_mmxxx_add_epi(
1182 _mmxxx_or_si(
1183 _mmxxx_slli_si(H_pred_row[j], T::kLSS),
1184 x),
1185 char_profile[j]);
1186 x = _mmxxx_srli_si(H_pred_row[j], T::kRSS);
1187 }
1188 // check other predecessors
1189 for (std::uint32_t p = 1; p < it->inedges.size(); ++p) {
1190 pred_i = pimpl_->node_id_to_rank[it->inedges[p]->tail->id] + 1;
1191
1192 H_pred_row = &(pimpl_->H[pred_i * matrix_width]);
1193 F_pred_row = &(pimpl_->F[pred_i * matrix_width]);
1194
1195 x = _mmxxx_srli_si(
1196 T::_mmxxx_set1_epi(pimpl_->first_column[pred_i]),
1197 T::kRSS);
1198
1199 for (std::uint64_t j = 0; j < matrix_width; ++j) {
1200 // update F
1201 F_row[j] = T::_mmxxx_max_epi(
1202 F_row[j],
1203 T::_mmxxx_add_epi(
1204 T::_mmxxx_max_epi(
1205 T::_mmxxx_add_epi(H_pred_row[j], g),
1206 F_pred_row[j]),
1207 e));
1208
1209 // update H
1210 H_row[j] = T::_mmxxx_max_epi(
1211 H_row[j],
1212 T::_mmxxx_add_epi(
1213 _mmxxx_or_si(
1214 _mmxxx_slli_si(H_pred_row[j], T::kLSS),
1215 x),
1216 char_profile[j]));
1217 x = _mmxxx_srli_si(H_pred_row[j], T::kRSS);
1218 }
1219 }
1220
1221 __mxxxi* E_row = &(pimpl_->E[i * matrix_width]);
1222 __mxxxi score = zeroes;
1223 x = T::_mmxxx_set1_epi(pimpl_->first_column[i]);
1224
1225 for (std::uint64_t j = 0; j < matrix_width; ++j) {
1226 H_row[j] = T::_mmxxx_max_epi(H_row[j], F_row[j]);
1227
1228 E_row[j] = T::_mmxxx_add_epi(
1229 T::_mmxxx_add_epi(
1230 _mmxxx_or_si(
1231 _mmxxx_slli_si(H_row[j], T::kLSS),
1232 _mmxxx_srli_si(x, T::kRSS)),
1233 g),
1234 e);
1235
1236 T::_mmxxx_prefix_max(E_row[j], pimpl_->masks, pimpl_->penalties);
1237
1238 H_row[j] = T::_mmxxx_max_epi(H_row[j], E_row[j]);
1239 x = T::_mmxxx_max_epi(
1240 H_row[j],
1241 T::_mmxxx_sub_epi(E_row[j], g));
1242
1243 if (type_ == AlignmentType::kSW) {
1244 H_row[j] = T::_mmxxx_max_epi(H_row[j], zeroes);
1245 }
1246 score = T::_mmxxx_max_epi(score, H_row[j]);
1247 }
1248
1249 if (type_ == AlignmentType::kSW) {
1250 std::int32_t max_row_score = _mmxxx_max_value<A, T>(score);
1251 if (max_score < max_row_score) {
1252 max_score = max_row_score;
1253 max_i = i;
1254 }
1255 } else if (type_ == AlignmentType::kOV) {
1256 if (it->outedges.empty()) {
1257 std::int32_t max_row_score = _mmxxx_max_value<A, T>(score);
1258 if (max_score < max_row_score) {
1259 max_score = max_row_score;
1260 max_i = i;
1261 }
1262 }
1263 } else if (type_ == AlignmentType::kNW) {
1264 if (it->outedges.empty()) {
1265 std::int32_t max_row_score = _mmxxx_value_at<A, T>(
1266 H_row[matrix_width - 1],
1267 last_column_id);
1268 if (max_score < max_row_score) {
1269 max_score = max_row_score;
1270 max_i = i;
1271 }
1272 }
1273 }
1274 }
1275
1276 if (max_i == -1 && max_j == -1) {
1277 return Alignment();
1278 }
1279 if (score) {
1280 *score = max_score;
1281 }
1282
1283 if (type_ == AlignmentType::kSW) {
1284 max_j = _mmxxx_index_of<A, T>(
1285 &(pimpl_->H[max_i * matrix_width]),
1286 matrix_width, max_score);
1287 } else if (type_ == AlignmentType::kOV) {
1288 if (rank_to_node[max_i - 1]->outedges.empty()) {
1289 max_j = _mmxxx_index_of<A, T>(
1290 &(pimpl_->H[max_i * matrix_width]),
1291 matrix_width, max_score);
1292 } else {
1293 max_j = normal_matrix_width - 1;
1294 }
1295 } else if (type_ == AlignmentType::kNW) {
1296 max_j = normal_matrix_width - 1;
1297 }
1298
1299 // backtrack
1300 std::uint32_t max_num_predecessors = 1;
1301 for (std::uint32_t i = 0; i < static_cast<std::uint32_t>(max_i); ++i) {
1302 max_num_predecessors = std::max(
1303 max_num_predecessors,
1304 static_cast<std::uint32_t>(rank_to_node[i]->inedges.size()));
1305 }
1306
1307 typename T::type* backtrack_storage = nullptr;
1308 typename T::type* H = AllocateAlignedMemory<A>(
1309 &backtrack_storage,
1310 6 * T::kNumVar + 3 * T::kNumVar * max_num_predecessors,
1311 kRegisterSize / 8);
1312 typename T::type* H_pred = H + T::kNumVar;
1313 typename T::type* H_diag_pred = H_pred + T::kNumVar * max_num_predecessors;
1314 typename T::type* H_left = H_diag_pred + T::kNumVar * max_num_predecessors;
1315 typename T::type* F = H_left + T::kNumVar;
1316 typename T::type* F_pred = F + T::kNumVar;
1317 typename T::type* E = F_pred + T::kNumVar * max_num_predecessors;
1318 typename T::type* E_left = E + T::kNumVar;
1319 typename T::type* profile = E_left + T::kNumVar;
1320
1321 std::vector<std::uint32_t> predecessors;
1322
1323 std::int32_t i = max_i;
1324 std::int32_t j = max_j;
1325 std::int32_t prev_i = 0, prev_j = 0;
1326
1327 std::uint32_t j_div = j / T::kNumVar;
1328 std::uint32_t j_mod = j % T::kNumVar;
1329
1330 bool load_next_segment = true;
1331
1332 Alignment alignment;
1333
1334 do {
1335 // check stop condition
1336 if (j == -1 || i == 0) {
1337 break;
1338 }
1339
1340 const auto& it = rank_to_node[i - 1];
1341 // load everything
1342 if (load_next_segment) {
1343 predecessors.clear();
1344
1345 // load current cells
1346 _mmxxx_store_si(
1347 reinterpret_cast<__mxxxi*>(H),
1348 pimpl_->H[i * matrix_width + j_div]);
1349 _mmxxx_store_si(
1350 reinterpret_cast<__mxxxi*>(E),
1351 pimpl_->E[i * matrix_width + j_div]);
1352
1353 // load predecessors cells
1354 if (it->inedges.empty()) {
1355 predecessors.emplace_back(0);
1356 _mmxxx_store_si(reinterpret_cast<__mxxxi*>(H_pred), pimpl_->H[j_div]);
1357 _mmxxx_store_si(reinterpret_cast<__mxxxi*>(F_pred), pimpl_->F[j_div]);
1358 } else {
1359 std::uint32_t store_pos = 0;
1360 for (const auto& jt : it->inedges) {
1361 predecessors.emplace_back(pimpl_->node_id_to_rank[jt->tail->id] + 1);
1362 _mmxxx_store_si(
1363 reinterpret_cast<__mxxxi*>(&H_pred[store_pos * T::kNumVar]),
1364 pimpl_->H[predecessors.back() * matrix_width + j_div]);
1365 _mmxxx_store_si(
1366 reinterpret_cast<__mxxxi*>(&F_pred[store_pos * T::kNumVar]),
1367 pimpl_->F[predecessors.back() * matrix_width + j_div]);
1368 ++store_pos;
1369 }
1370 }
1371
1372 // load query profile cells
1373 _mmxxx_store_si(
1374 reinterpret_cast<__mxxxi*>(profile),
1375 pimpl_->sequence_profile[it->code * matrix_width + j_div]);
1376 }
1377
1378 // check stop condition
1379 if (type_ == AlignmentType::kSW && H[j_mod] == 0) {
1380 break;
1381 }
1382
1383 if (j_mod == 0) {
1384 // border case
1385 if (j_div > 0) {
1386 for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
1387 _mmxxx_store_si(
1388 reinterpret_cast<__mxxxi*>(&H_diag_pred[p * T::kNumVar]),
1389 pimpl_->H[predecessors[p] * matrix_width + (j_div - 1)]);
1390 }
1391 _mmxxx_store_si(
1392 reinterpret_cast<__mxxxi*>(H_left),
1393 pimpl_->H[i * matrix_width + j_div - 1]);
1394 _mmxxx_store_si(
1395 reinterpret_cast<__mxxxi*>(E_left),
1396 pimpl_->E[i * matrix_width + j_div - 1]);
1397 } else {
1398 for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
1399 H_diag_pred[(p + 1) * T::kNumVar - 1] =
1400 pimpl_->first_column[predecessors[p]];
1401 }
1402 H_left[T::kNumVar - 1] = pimpl_->first_column[i];
1403 E_left[T::kNumVar - 1] = pimpl_->first_column[i];
1404 }
1405 }
1406
1407 // find best predecessor cell
1408 bool predecessor_found = false, extend_left = false, extend_up = false;
1409
1410 if (i != 0) {
1411 for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
1412 if ((j_mod == 0 && H[j_mod] == H_diag_pred[(p + 1) * T::kNumVar - 1] + profile[j_mod]) || // NOLINT
1413 (j_mod != 0 && H[j_mod] == H_pred[p * T::kNumVar + j_mod - 1] + profile[j_mod])) { // NOLINT
1414 prev_i = predecessors[p];
1415 prev_j = j - 1;
1416 predecessor_found = true;
1417 break;
1418 }
1419 }
1420 }
1421
1422 if (!predecessor_found && i != 0) {
1423 for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
1424 if ((extend_up = H[j_mod] == F_pred[p * T::kNumVar + j_mod] + e_) ||
1425 H[j_mod] == H_pred[p * T::kNumVar + j_mod] + g_) {
1426 prev_i = predecessors[p];
1427 prev_j = j;
1428 predecessor_found = true;
1429 break;
1430 }
1431 }
1432 }
1433
1434 if (!predecessor_found) {
1435 if ((j_mod != 0 && ((extend_left = H[j_mod] == E[j_mod - 1] + e_) ||
1436 H[j_mod] == H[j_mod - 1] + g_)) ||
1437 (j_mod == 0 && ((extend_left = H[j_mod] == E_left[T::kNumVar - 1] + e_ ) || // NOLINT
1438 H[j_mod] == H_left[T::kNumVar - 1] + g_))) { // NOLINT
1439 prev_i = i;
1440 prev_j = j - 1;
1441 predecessor_found = true;
1442 }
1443 }
1444
1445 alignment.emplace_back(
1446 i == prev_i ? -1 : rank_to_node[i - 1]->id,
1447 j == prev_j ? -1 : j);
1448
1449 // update for next round
1450 load_next_segment =
1451 (i == prev_i ? false : true) ||
1452 (j != prev_j && prev_j % T::kNumVar == T::kNumVar - 1 ? true : false);
1453
1454 i = prev_i;
1455 j = prev_j;
1456 j_div = j / T::kNumVar;
1457 j_mod = j % T::kNumVar;
1458
1459 if (extend_left) {
1460 while (true) {
1461 // load
1462 if (j_mod == T::kNumVar - 1) {
1463 _mmxxx_store_si(
1464 reinterpret_cast<__mxxxi*>(E),
1465 pimpl_->E[i * matrix_width + j_div]);
1466 } else if (j_mod == 0) { // boarder case
1467 if (j_div > 0) {
1468 _mmxxx_store_si(
1469 reinterpret_cast<__mxxxi*>(E_left),
1470 pimpl_->E[i * matrix_width + j_div - 1]);
1471 }
1472 }
1473
1474 alignment.emplace_back(-1, j);
1475 --j;
1476 j_div = j / T::kNumVar;
1477 j_mod = j % T::kNumVar;
1478 if ((j == -1) ||
1479 (j_mod != T::kNumVar - 1 && E[j_mod] + e_ != E[j_mod + 1]) ||
1480 (j_mod == T::kNumVar - 1 && E_left[j_mod] + e_ != E[0])) {
1481 break;
1482 }
1483 }
1484 load_next_segment = true;
1485 } else if (extend_up) {
1486 while (true) {
1487 // load
1488 _mmxxx_store_si(
1489 reinterpret_cast<__mxxxi*>(F),
1490 pimpl_->F[i * matrix_width + j_div]);
1491
1492 prev_i = 0;
1493 predecessors.clear();
1494 std::uint32_t store_pos = 0;
1495 for (const auto& it : rank_to_node[i - 1]->inedges) {
1496 predecessors.emplace_back(pimpl_->node_id_to_rank[it->tail->id] + 1);
1497 _mmxxx_store_si(
1498 reinterpret_cast<__mxxxi*>(&H_pred[store_pos * T::kNumVar]),
1499 pimpl_->H[predecessors.back() * matrix_width + j_div]);
1500 _mmxxx_store_si(
1501 reinterpret_cast<__mxxxi*>(&F_pred[store_pos * T::kNumVar]),
1502 pimpl_->F[predecessors.back() * matrix_width + j_div]);
1503 ++store_pos;
1504 }
1505
1506 bool stop = false;
1507 for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
1508 if ((stop = F[j_mod] == H_pred[p * T::kNumVar + j_mod] + g_) ||
1509 F[j_mod] == F_pred[p * T::kNumVar + j_mod] + e_) {
1510 prev_i = predecessors[p];
1511 break;
1512 }
1513 }
1514
1515 alignment.emplace_back(rank_to_node[i - 1]->id, -1);
1516 i = prev_i;
1517
1518 if (stop || i == 0) {
1519 break;
1520 }
1521 }
1522 }
1523 } while (true);
1524
1525 delete[] backtrack_storage;
1526
1527 // update alignment for NW (backtrack stops on first row or column)
1528 if (type_ == AlignmentType::kNW) {
1529 while (i == 0 && j != -1) {
1530 alignment.emplace_back(-1, j);
1531 --j;
1532 }
1533 while (i != 0 && j == -1) {
1534 alignment.emplace_back(rank_to_node[i - 1]->id, -1);
1535
1536 const auto& it = rank_to_node[i - 1];
1537 if (it->inedges.empty()) {
1538 i = 0;
1539 } else {
1540 for (const auto& jt : it->inedges) {
1541 std::uint32_t pred_i = pimpl_->node_id_to_rank[jt->tail->id] + 1;
1542 if (pimpl_->first_column[i] == pimpl_->first_column[pred_i] + e_) {
1543 i = pred_i;
1544 break;
1545 }
1546 }
1547 }
1548 }
1549 }
1550
1551 std::reverse(alignment.begin(), alignment.end());
1552 return alignment;
1553 #else
1554 return Alignment();
1555 #endif
1556 }
1557
1558 template<Architecture A> template <typename T>
Convex(std::uint32_t sequence_len,const Graph & graph,std::int32_t * score)1559 Alignment SimdAlignmentEngine<A>::Convex(
1560 std::uint32_t sequence_len,
1561 const Graph& graph,
1562 std::int32_t* score) noexcept {
1563 #if defined(__AVX2__) || defined(__SSE4_1__) || defined(USE_SIMDE)
1564 std::uint64_t normal_matrix_width = sequence_len;
1565 std::uint64_t matrix_width =
1566 std::ceil(static_cast<double>(sequence_len) / T::kNumVar);
1567 std::uint64_t matrix_height = graph.nodes().size() + 1;
1568 const auto& rank_to_node = graph.rank_to_node();
1569
1570 typename T::type kNegativeInfinity =
1571 std::numeric_limits<typename T::type>::min() + 1024;
1572
1573 typename T::type max_score = type_ == AlignmentType::kSW ? 0 : kNegativeInfinity; // NOLINT
1574 std::int32_t max_i = -1;
1575 std::int32_t max_j = -1;
1576 std::uint32_t last_column_id = (normal_matrix_width - 1) % T::kNumVar;
1577 __mxxxi zeroes = T::_mmxxx_set1_epi(0);
1578 __mxxxi g = T::_mmxxx_set1_epi(g_ - e_);
1579 __mxxxi e = T::_mmxxx_set1_epi(e_);
1580 __mxxxi q = T::_mmxxx_set1_epi(q_ - c_);
1581 __mxxxi c = T::_mmxxx_set1_epi(c_);
1582
1583 __attribute__((aligned(kRegisterSize / 8))) typename T::type unpacked[T::kNumVar] = {0}; // NOLINT
1584
1585 for (std::uint32_t i = 0, j = 0; i < T::kNumVar && j < T::kLogNumVar; ++i) {
1586 unpacked[i] = kNegativeInfinity;
1587 if ((i & (i + 1)) == 0) {
1588 pimpl_->masks[j++] = _mmxxx_load_si(
1589 reinterpret_cast<const __mxxxi*>(unpacked));
1590 }
1591 }
1592 pimpl_->masks[T::kLogNumVar] = _mmxxx_slli_si(
1593 T::_mmxxx_set1_epi(kNegativeInfinity),
1594 T::kLSS);
1595
1596 pimpl_->penalties[0] = T::_mmxxx_set1_epi(e_);
1597 for (std::uint32_t i = 1; i < T::kLogNumVar; ++i) {
1598 pimpl_->penalties[i] = T::_mmxxx_add_epi(
1599 pimpl_->penalties[i - 1],
1600 pimpl_->penalties[i - 1]);
1601 }
1602 pimpl_->penalties[T::kLogNumVar] = T::_mmxxx_set1_epi(c_);
1603 for (std::uint32_t i = T::kLogNumVar + 1; i < 2 * T::kLogNumVar; ++i) {
1604 pimpl_->penalties[i] = T::_mmxxx_add_epi(
1605 pimpl_->penalties[i - 1],
1606 pimpl_->penalties[i - 1]);
1607 }
1608
1609 // alignment
1610 for (const auto& it : rank_to_node) {
1611 __mxxxi* char_profile = &(pimpl_->sequence_profile[it->code * matrix_width]); // NOLINT
1612
1613 std::uint32_t i = pimpl_->node_id_to_rank[it->id] + 1;
1614
1615 __mxxxi* H_row = &(pimpl_->H[i * matrix_width]);
1616 __mxxxi* F_row = &(pimpl_->F[i * matrix_width]);
1617 __mxxxi* O_row = &(pimpl_->O[i * matrix_width]);
1618
1619 std::uint32_t pred_i = it->inedges.empty() ?
1620 0 : pimpl_->node_id_to_rank[it->inedges[0]->tail->id] + 1;
1621
1622 __mxxxi* H_pred_row = &(pimpl_->H[pred_i * matrix_width]);
1623 __mxxxi* F_pred_row = &(pimpl_->F[pred_i * matrix_width]);
1624 __mxxxi* O_pred_row = &(pimpl_->O[pred_i * matrix_width]);
1625
1626 __mxxxi x = _mmxxx_srli_si(
1627 T::_mmxxx_set1_epi(pimpl_->first_column[pred_i]),
1628 T::kRSS);
1629
1630 for (std::uint64_t j = 0; j < matrix_width; ++j) {
1631 // update F
1632 F_row[j] = T::_mmxxx_add_epi(
1633 T::_mmxxx_max_epi(
1634 T::_mmxxx_add_epi(H_pred_row[j], g),
1635 F_pred_row[j]),
1636 e);
1637
1638 // update O
1639 O_row[j] = T::_mmxxx_add_epi(
1640 T::_mmxxx_max_epi(
1641 T::_mmxxx_add_epi(H_pred_row[j], q),
1642 O_pred_row[j]),
1643 c);
1644
1645 // update H
1646 H_row[j] = T::_mmxxx_add_epi(
1647 _mmxxx_or_si(
1648 _mmxxx_slli_si(H_pred_row[j], T::kLSS),
1649 x),
1650 char_profile[j]);
1651 x = _mmxxx_srli_si(H_pred_row[j], T::kRSS);
1652 }
1653 // check other predecessors
1654 for (std::uint32_t p = 1; p < it->inedges.size(); ++p) {
1655 pred_i = pimpl_->node_id_to_rank[it->inedges[p]->tail->id] + 1;
1656
1657 H_pred_row = &(pimpl_->H[pred_i * matrix_width]);
1658 F_pred_row = &(pimpl_->F[pred_i * matrix_width]);
1659 O_pred_row = &(pimpl_->O[pred_i * matrix_width]);
1660
1661 x = _mmxxx_srli_si(
1662 T::_mmxxx_set1_epi(pimpl_->first_column[pred_i]),
1663 T::kRSS);
1664
1665 for (std::uint64_t j = 0; j < matrix_width; ++j) {
1666 // update F
1667 F_row[j] = T::_mmxxx_max_epi(
1668 F_row[j],
1669 T::_mmxxx_add_epi(
1670 T::_mmxxx_max_epi(
1671 T::_mmxxx_add_epi(H_pred_row[j], g),
1672 F_pred_row[j]),
1673 e));
1674
1675 // update O
1676 O_row[j] = T::_mmxxx_max_epi(
1677 O_row[j],
1678 T::_mmxxx_add_epi(
1679 T::_mmxxx_max_epi(
1680 T::_mmxxx_add_epi(H_pred_row[j], q),
1681 O_pred_row[j]),
1682 c));
1683
1684 // update H
1685 H_row[j] = T::_mmxxx_max_epi(
1686 H_row[j],
1687 T::_mmxxx_add_epi(
1688 _mmxxx_or_si(
1689 _mmxxx_slli_si(H_pred_row[j], T::kLSS),
1690 x),
1691 char_profile[j]));
1692
1693 x = _mmxxx_srli_si(H_pred_row[j], T::kRSS);
1694 }
1695 }
1696
1697 __mxxxi* E_row = &(pimpl_->E[i * matrix_width]);
1698 __mxxxi* Q_row = &(pimpl_->Q[i * matrix_width]);
1699
1700 x = T::_mmxxx_set1_epi(pimpl_->first_column[i]);
1701 __mxxxi y = T::_mmxxx_set1_epi(pimpl_->first_column[i]);
1702
1703 __mxxxi score = zeroes;
1704
1705 for (std::uint64_t j = 0; j < matrix_width; ++j) {
1706 H_row[j] = T::_mmxxx_max_epi(
1707 H_row[j],
1708 T::_mmxxx_max_epi(F_row[j], O_row[j]));
1709
1710 E_row[j] = T::_mmxxx_add_epi(
1711 T::_mmxxx_add_epi(
1712 _mmxxx_or_si(
1713 _mmxxx_slli_si(H_row[j], T::kLSS),
1714 _mmxxx_srli_si(x, T::kRSS)),
1715 g),
1716 e);
1717
1718 T::_mmxxx_prefix_max(E_row[j], pimpl_->masks, pimpl_->penalties);
1719
1720 Q_row[j] = T::_mmxxx_add_epi(
1721 T::_mmxxx_add_epi(
1722 _mmxxx_or_si(
1723 _mmxxx_slli_si(H_row[j], T::kLSS),
1724 _mmxxx_srli_si(y, T::kRSS)),
1725 q),
1726 c);
1727
1728 T::_mmxxx_prefix_max(Q_row[j], pimpl_->masks, &pimpl_->penalties[T::kLogNumVar]); // NOLINT
1729
1730 H_row[j] = T::_mmxxx_max_epi(
1731 H_row[j],
1732 T::_mmxxx_max_epi(E_row[j], Q_row[j]));
1733
1734 x = T::_mmxxx_max_epi(
1735 H_row[j],
1736 T::_mmxxx_sub_epi(E_row[j], g));
1737
1738 y = T::_mmxxx_max_epi(
1739 H_row[j],
1740 T::_mmxxx_sub_epi(Q_row[j], q));
1741
1742 if (type_ == AlignmentType::kSW) {
1743 H_row[j] = T::_mmxxx_max_epi(H_row[j], zeroes);
1744 }
1745 score = T::_mmxxx_max_epi(score, H_row[j]);
1746 }
1747
1748 if (type_ == AlignmentType::kSW) {
1749 std::int32_t max_row_score = _mmxxx_max_value<A, T>(score);
1750 if (max_score < max_row_score) {
1751 max_score = max_row_score;
1752 max_i = i;
1753 }
1754 } else if (type_ == AlignmentType::kOV) {
1755 if (it->outedges.empty()) {
1756 std::int32_t max_row_score = _mmxxx_max_value<A, T>(score);
1757 if (max_score < max_row_score) {
1758 max_score = max_row_score;
1759 max_i = i;
1760 }
1761 }
1762 } else if (type_ == AlignmentType::kNW) {
1763 if (it->outedges.empty()) {
1764 std::int32_t max_row_score = _mmxxx_value_at<A, T>(
1765 H_row[matrix_width - 1],
1766 last_column_id);
1767 if (max_score < max_row_score) {
1768 max_score = max_row_score;
1769 max_i = i;
1770 }
1771 }
1772 }
1773 }
1774
1775 if (max_i == -1 && max_j == -1) {
1776 return Alignment();
1777 }
1778 if (score) {
1779 *score = max_score;
1780 }
1781
1782 if (type_ == AlignmentType::kSW) {
1783 max_j = _mmxxx_index_of<A, T>(
1784 &(pimpl_->H[max_i * matrix_width]),
1785 matrix_width, max_score);
1786 } else if (type_ == AlignmentType::kOV) {
1787 if (rank_to_node[max_i - 1]->outedges.empty()) {
1788 max_j = _mmxxx_index_of<A, T>(
1789 &(pimpl_->H[max_i * matrix_width]),
1790 matrix_width, max_score);
1791 } else {
1792 max_j = normal_matrix_width - 1;
1793 }
1794 } else if (type_ == AlignmentType::kNW) {
1795 max_j = normal_matrix_width - 1;
1796 }
1797
1798 // backtrack
1799 std::uint32_t max_num_predecessors = 1;
1800 for (std::uint32_t i = 0; i < static_cast<std::uint32_t>(max_i); ++i) {
1801 max_num_predecessors = std::max(
1802 max_num_predecessors,
1803 static_cast<std::uint32_t>(rank_to_node[i]->inedges.size()));
1804 }
1805
1806 typename T::type* backtrack_storage = nullptr;
1807 typename T::type* H = AllocateAlignedMemory<A>(
1808 &backtrack_storage,
1809 9 * T::kNumVar + 4 * T::kNumVar * max_num_predecessors,
1810 kRegisterSize / 8);
1811 typename T::type* H_pred = H + T::kNumVar;
1812 typename T::type* H_diag_pred = H_pred + T::kNumVar * max_num_predecessors;
1813 typename T::type* H_left = H_diag_pred + T::kNumVar * max_num_predecessors;
1814 typename T::type* F = H_left + T::kNumVar;
1815 typename T::type* F_pred = F + T::kNumVar;
1816 typename T::type* O = F_pred + T::kNumVar * max_num_predecessors;
1817 typename T::type* O_pred = O + T::kNumVar;
1818 typename T::type* E = O_pred + T::kNumVar * max_num_predecessors;
1819 typename T::type* E_left = E + T::kNumVar;
1820 typename T::type* Q = E_left + T::kNumVar;
1821 typename T::type* Q_left = Q + T::kNumVar;
1822 typename T::type* profile = Q_left + T::kNumVar;
1823
1824 std::vector<std::uint32_t> predecessors;
1825
1826 std::int32_t i = max_i;
1827 std::int32_t j = max_j;
1828 std::int32_t prev_i = 0, prev_j = 0;
1829
1830 std::uint32_t j_div = j / T::kNumVar;
1831 std::uint32_t j_mod = j % T::kNumVar;
1832
1833 bool load_next_segment = true;
1834
1835 Alignment alignment;
1836
1837 do {
1838 // check stop condition
1839 if (j == -1 || i == 0) {
1840 break;
1841 }
1842
1843 const auto& it = rank_to_node[i - 1];
1844 // load everything
1845 if (load_next_segment) {
1846 predecessors.clear();
1847
1848 // load current cells
1849 _mmxxx_store_si(
1850 reinterpret_cast<__mxxxi*>(H),
1851 pimpl_->H[i * matrix_width + j_div]);
1852 _mmxxx_store_si(
1853 reinterpret_cast<__mxxxi*>(E),
1854 pimpl_->E[i * matrix_width + j_div]);
1855 _mmxxx_store_si(
1856 reinterpret_cast<__mxxxi*>(Q),
1857 pimpl_->Q[i * matrix_width + j_div]);
1858
1859 // load predecessors cells
1860 if (it->inedges.empty()) {
1861 predecessors.emplace_back(0);
1862 _mmxxx_store_si(reinterpret_cast<__mxxxi*>(H_pred), pimpl_->H[j_div]);
1863 _mmxxx_store_si(reinterpret_cast<__mxxxi*>(F_pred), pimpl_->F[j_div]);
1864 _mmxxx_store_si(reinterpret_cast<__mxxxi*>(O_pred), pimpl_->O[j_div]);
1865 } else {
1866 std::uint32_t store_pos = 0;
1867 for (const auto& jt : it->inedges) {
1868 predecessors.emplace_back(pimpl_->node_id_to_rank[jt->tail->id] + 1);
1869 _mmxxx_store_si(
1870 reinterpret_cast<__mxxxi*>(&H_pred[store_pos * T::kNumVar]),
1871 pimpl_->H[predecessors.back() * matrix_width + j_div]);
1872 _mmxxx_store_si(
1873 reinterpret_cast<__mxxxi*>(&F_pred[store_pos * T::kNumVar]),
1874 pimpl_->F[predecessors.back() * matrix_width + j_div]);
1875 _mmxxx_store_si(
1876 reinterpret_cast<__mxxxi*>(&O_pred[store_pos * T::kNumVar]),
1877 pimpl_->O[predecessors.back() * matrix_width + j_div]);
1878 ++store_pos;
1879 }
1880 }
1881
1882 // load query profile cells
1883 _mmxxx_store_si(
1884 reinterpret_cast<__mxxxi*>(profile),
1885 pimpl_->sequence_profile[it->code * matrix_width + j_div]);
1886 }
1887
1888 // check stop condition
1889 if (type_ == AlignmentType::kSW && H[j_mod] == 0) {
1890 break;
1891 }
1892
1893 if (j_mod == 0) {
1894 // border case
1895 if (j_div > 0) {
1896 for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
1897 _mmxxx_store_si(
1898 reinterpret_cast<__mxxxi*>(&H_diag_pred[p * T::kNumVar]),
1899 pimpl_->H[predecessors[p] * matrix_width + (j_div - 1)]);
1900 }
1901 _mmxxx_store_si(
1902 reinterpret_cast<__mxxxi*>(H_left),
1903 pimpl_->H[i * matrix_width + j_div - 1]);
1904 _mmxxx_store_si(
1905 reinterpret_cast<__mxxxi*>(E_left),
1906 pimpl_->E[i * matrix_width + j_div - 1]);
1907 _mmxxx_store_si(
1908 reinterpret_cast<__mxxxi*>(Q_left),
1909 pimpl_->Q[i * matrix_width + j_div - 1]);
1910 } else {
1911 for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
1912 H_diag_pred[(p + 1) * T::kNumVar - 1] = pimpl_->first_column[predecessors[p]]; // NOLINT
1913 }
1914 H_left[T::kNumVar - 1] = pimpl_->first_column[i];
1915 E_left[T::kNumVar - 1] = pimpl_->first_column[i];
1916 Q_left[T::kNumVar - 1] = pimpl_->first_column[i];
1917 }
1918 }
1919
1920 // find best predecessor cell
1921 bool predecessor_found = false, extend_left = false, extend_up = false;
1922
1923 if (i != 0) {
1924 for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
1925 if ((j_mod == 0 && H[j_mod] == H_diag_pred[(p + 1) * T::kNumVar - 1] + profile[j_mod]) || // NOLINT
1926 (j_mod != 0 && H[j_mod] == H_pred[p * T::kNumVar + j_mod - 1] + profile[j_mod])) { // NOLINT
1927 prev_i = predecessors[p];
1928 prev_j = j - 1;
1929 predecessor_found = true;
1930 break;
1931 }
1932 }
1933 }
1934
1935 if (!predecessor_found && i != 0) {
1936 for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
1937 if ((extend_up = H[j_mod] == F_pred[p * T::kNumVar + j_mod] + e_) ||
1938 H[j_mod] == H_pred[p * T::kNumVar + j_mod] + g_ ||
1939 (extend_up = H[j_mod] == O_pred[p * T::kNumVar + j_mod] + c_) ||
1940 H[j_mod] == H_pred[p * T::kNumVar + j_mod] + q_) {
1941 prev_i = predecessors[p];
1942 prev_j = j;
1943 predecessor_found = true;
1944 break;
1945 }
1946 }
1947 }
1948
1949 if (!predecessor_found) {
1950 if ((j_mod != 0 && ((extend_left = H[j_mod] == E[j_mod - 1] + e_) ||
1951 H[j_mod] == H[j_mod - 1] + g_ ||
1952 (extend_left = H[j_mod] == Q[j_mod - 1] + c_) ||
1953 H[j_mod] == H[j_mod - 1] + q_)) ||
1954 (j_mod == 0 && ((extend_left = H[j_mod] == E_left[T::kNumVar - 1] + e_) || // NOLINT
1955 H[j_mod] == H_left[T::kNumVar - 1] + g_ || // NOLINT
1956 (extend_left = H[j_mod] == Q_left[T::kNumVar - 1] + c_) || // NOLINT
1957 H[j_mod] == H_left[T::kNumVar - 1] + q_))) { // NOLINT
1958 prev_i = i;
1959 prev_j = j - 1;
1960 predecessor_found = true;
1961 }
1962 }
1963
1964 alignment.emplace_back(
1965 i == prev_i ? -1 : rank_to_node[i - 1]->id,
1966 j == prev_j ? -1 : j);
1967
1968 // update for next round
1969 load_next_segment =
1970 (i == prev_i ? false : true) ||
1971 (j != prev_j && prev_j % T::kNumVar == T::kNumVar - 1 ? true : false);
1972
1973 i = prev_i;
1974 j = prev_j;
1975 j_div = j / T::kNumVar;
1976 j_mod = j % T::kNumVar;
1977
1978 if (extend_left) {
1979 while (true) {
1980 // load
1981 if (j_mod == T::kNumVar - 1) {
1982 _mmxxx_store_si(
1983 reinterpret_cast<__mxxxi*>(E),
1984 pimpl_->E[i * matrix_width + j_div]);
1985 _mmxxx_store_si(
1986 reinterpret_cast<__mxxxi*>(Q),
1987 pimpl_->Q[i * matrix_width + j_div]);
        } else if (j_mod == 0) {  // border case
1989 if (j_div > 0) {
1990 _mmxxx_store_si(
1991 reinterpret_cast<__mxxxi*>(E_left),
1992 pimpl_->E[i * matrix_width + j_div - 1]);
1993 _mmxxx_store_si(
1994 reinterpret_cast<__mxxxi*>(Q_left),
1995 pimpl_->Q[i * matrix_width + j_div - 1]);
1996 }
1997 }
1998
1999 alignment.emplace_back(-1, j);
2000 --j;
2001 j_div = j / T::kNumVar;
2002 j_mod = j % T::kNumVar;
2003 if ((j == -1) ||
2004 (j_mod != T::kNumVar - 1 && E[j_mod] + e_ != E[j_mod + 1]) ||
2005 (j_mod == T::kNumVar - 1 && E_left[j_mod] + e_ != E[0]) ||
2006 (j_mod != T::kNumVar - 1 && Q[j_mod] + c_ != Q[j_mod + 1]) ||
2007 (j_mod == T::kNumVar - 1 && Q_left[j_mod] + c_ != Q[0])) {
2008 break;
2009 }
2010 }
2011 load_next_segment = true;
2012 } else if (extend_up) {
2013 while (true) {
2014 // load
2015 _mmxxx_store_si(
2016 reinterpret_cast<__mxxxi*>(F),
2017 pimpl_->F[i * matrix_width + j_div]);
2018 _mmxxx_store_si(
2019 reinterpret_cast<__mxxxi*>(O),
2020 pimpl_->O[i * matrix_width + j_div]);
2021
2022 predecessors.clear();
2023 std::uint32_t store_pos = 0;
2024 for (const auto& it : rank_to_node[i - 1]->inedges) {
2025 predecessors.emplace_back(pimpl_->node_id_to_rank[it->tail->id] + 1);
2026 _mmxxx_store_si(
2027 reinterpret_cast<__mxxxi*>(&H_pred[store_pos * T::kNumVar]),
2028 pimpl_->H[predecessors.back() * matrix_width + j_div]);
2029 _mmxxx_store_si(
2030 reinterpret_cast<__mxxxi*>(&F_pred[store_pos * T::kNumVar]),
2031 pimpl_->F[predecessors.back() * matrix_width + j_div]);
2032 _mmxxx_store_si(
2033 reinterpret_cast<__mxxxi*>(&O_pred[store_pos * T::kNumVar]),
2034 pimpl_->O[predecessors.back() * matrix_width + j_div]);
2035 ++store_pos;
2036 }
2037
2038 bool stop = true;
2039 prev_i = 0;
2040 for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
2041 if (F[j_mod] == F_pred[p * T::kNumVar + j_mod] + e_ ||
2042 O[j_mod] == O_pred[p * T::kNumVar + j_mod] + c_) {
2043 prev_i = predecessors[p];
2044 stop = false;
2045 break;
2046 }
2047 }
2048 if (stop == true) {
2049 for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
2050 if (F[j_mod] == H_pred[p * T::kNumVar + j_mod] + g_ ||
2051 O[j_mod] == H_pred[p * T::kNumVar + j_mod] + q_) {
2052 prev_i = predecessors[p];
2053 break;
2054 }
2055 }
2056 }
2057
2058 alignment.emplace_back(rank_to_node[i - 1]->id, -1);
2059 i = prev_i;
2060
2061 if (stop || i == 0) {
2062 break;
2063 }
2064 }
2065 }
2066 } while (true);
2067
2068 delete[] backtrack_storage;
2069
2070 // update alignment for NW (backtrack stops on first row or column)
2071 if (type_ == AlignmentType::kNW) {
2072 while (i == 0 && j != -1) {
2073 alignment.emplace_back(-1, j);
2074 --j;
2075 }
2076 while (i != 0 && j == -1) {
2077 alignment.emplace_back(rank_to_node[i - 1]->id, -1);
2078
2079 const auto& it = rank_to_node[i - 1];
2080 if (it->inedges.empty()) {
2081 i = 0;
2082 } else {
2083 for (const auto& jt : it->inedges) {
2084 std::uint32_t pred_i = pimpl_->node_id_to_rank[jt->tail->id] + 1;
2085 if (pimpl_->first_column[matrix_height + i] == pimpl_->first_column[matrix_height + pred_i] + e_ || // NOLINT
2086 pimpl_->first_column[2 * matrix_height + i] == pimpl_->first_column[2 * matrix_height + pred_i] + c_ ) { // NOLINT
2087 i = pred_i;
2088 break;
2089 }
2090 }
2091 }
2092 }
2093 }
2094
2095 std::reverse(alignment.begin(), alignment.end());
2096 return alignment;
2097 #else
2098 return Alignment();
2099 #endif
2100 }
2101
2102 } // namespace spoa
2103
2104 #endif // SIMD_ALIGNMENT_ENGINE_IMPLEMENTATION_HPP_
2105