1 // Copyright (c) 2020 Robert Vaser
2 
3 #ifndef SIMD_ALIGNMENT_ENGINE_IMPLEMENTATION_HPP_
4 #define SIMD_ALIGNMENT_ENGINE_IMPLEMENTATION_HPP_
5 
6 #include "simd_alignment_engine.hpp"
7 
8 #include <algorithm>
9 #include <cmath>
10 #include <iostream>
11 #include <limits>
12 #include <memory>
13 #include <stdexcept>
14 #include <vector>
15 
16 extern "C" {
17 #ifdef USE_SIMDE
18   #ifdef __AVX2__
19     #include <simde/x86/avx2.h>
20   #else
21     #include <simde/x86/sse4.1.h>  // SSE4.1 is covered better
22   #endif
23 #elif defined(__AVX2__) || defined(__SSE4_1__)
24   #include <immintrin.h>  // AVX2 and lower
25 #endif
26 }
27 
28 #include "spoa/graph.hpp"
29 
30 namespace spoa {
31 
32 // Taken from https://gcc.gnu.org/viewcvs/gcc?view=revision&revision=216149
inline void* align(
    std::size_t __align,
    std::size_t __size,
    void*& __ptr,  // NOLINT
    std::size_t& __space) noexcept {  // NOLINT
  // Drop-in replacement for std::align (absent from older libstdc++):
  // advances __ptr to the next __align-byte boundary if __size bytes still
  // fit in __space afterwards; returns nullptr otherwise.
  const auto __intptr = reinterpret_cast<uintptr_t>(__ptr);
  // Round up to the next multiple of __align; -__align is the mask that
  // clears the low bits (__align is expected to be a power of two).
  const auto __aligned = (__intptr - 1u + __align) & -__align;
  const auto __diff = __aligned - __intptr;
  if ((__size + __diff) > __space) {
    return nullptr;  // not enough room once aligned
  } else {
    __space -= __diff;  // only the skipped bytes are consumed
    return __ptr = reinterpret_cast<void*>(__aligned);
  }
}
48 
49 template<Architecture A, typename T>
AllocateAlignedMemory(T ** storage,std::size_t size,std::size_t alignment)50 T* AllocateAlignedMemory(
51     T** storage,
52     std::size_t size,
53     std::size_t alignment) {
54   *storage = new T[size + alignment - 1];
55   void* ptr = static_cast<void*>(*storage);
56   std::size_t storage_size = (size + alignment - 1) * sizeof(T);
57   return static_cast<T*>(align(alignment, size * sizeof(T), ptr, storage_size));
58 }
59 
// Per-architecture, per-score-type table of the SIMD intrinsics used by the
// alignment kernels; specialized below for int16_t and int32_t lanes.
template<Architecture A, typename T>
struct InstructionSet;

#if defined(__AVX2__)

constexpr std::uint32_t kRegisterSize = 256;  // SIMD register width in bits
using __mxxxi = __m256i;
67 
// Aligned load of one SIMD register from mem_addr.
inline __mxxxi _mmxxx_load_si(__mxxxi const* mem_addr) {
  return _mm256_load_si256(mem_addr);
}
71 
// Aligned store of one SIMD register to mem_addr.
inline void _mmxxx_store_si(__mxxxi* mem_addr, const __mxxxi& a) {
  _mm256_store_si256(mem_addr, a);
}
75 
// Bitwise OR of two SIMD registers.
inline __mxxxi _mmxxx_or_si(const __mxxxi& a, const __mxxxi& b) {
  return _mm256_or_si256(a, b);
}
79 
// Byte-wise left shift of the whole 256-bit register. AVX2 has no cross-lane
// byte shift, so the register is stitched back together from a permuted copy
// of itself; n must be a compile-time constant (the kLSS values below).
#define _mmxxx_slli_si(a, n) n < 16 ? \
  _mm256_alignr_epi8(a, _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 0, 2, 0)), 16 - n) : \
  _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 0, 2, 0))

// Byte-wise right shift of the whole 256-bit register. Note the n - 16:
// this form appears valid only for n >= 16 (kRSS below is 30 or 28) --
// verify before using with smaller shifts.
#define _mmxxx_srli_si(a, n) \
  _mm256_srli_si256(_mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), n - 16)  // NOLINT
86 
// AVX2 instruction table for 16-bit score lanes (16 lanes per register).
template<Architecture A>
struct InstructionSet<A, std::int16_t> {
  using type = std::int16_t;
  // 256 bits / 16 bits = 16 lanes.
  static constexpr std::uint32_t kNumVar = kRegisterSize / (8 * sizeof(type));
  static constexpr std::uint32_t kLogNumVar = 4;
  static constexpr std::uint32_t kLSS = 2;   // Left Shift Size
  static constexpr std::uint32_t kRSS = 30;  // Right Shift Size
  static inline __mxxxi _mmxxx_add_epi(const __mxxxi& a, const __mxxxi& b) {
    return _mm256_add_epi16(a, b);
  }
  static inline __mxxxi _mmxxx_sub_epi(const __mxxxi& a, const __mxxxi& b) {
    return _mm256_sub_epi16(a, b);
  }
  static inline __mxxxi _mmxxx_min_epi(const __mxxxi& a, const __mxxxi& b) {
    return _mm256_min_epi16(a, b);
  }
  static inline __mxxxi _mmxxx_max_epi(const __mxxxi& a, const __mxxxi& b) {
    return _mm256_max_epi16(a, b);
  }
  static inline __mxxxi _mmxxx_set1_epi(type a) {
    return _mm256_set1_epi16(a);
  }
  // Running in-register prefix maximum in log2(kNumVar) doubling steps:
  // each step shifts the penalized scores left by 2/4/8/16 bytes and ORs in
  // a mask so that lanes shifted in from outside stay at -infinity.
  static inline void _mmxxx_prefix_max(
      __mxxxi& a,  // NOLINT
      const __mxxxi* masks,
      const __mxxxi* penalties) {
    a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[0], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[0]), 2)));  // NOLINT
    a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[1], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[1]), 4)));  // NOLINT
    a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[2], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[2]), 8)));  // NOLINT
    a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[3], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[3]), 16)));  // NOLINT
  }
};
119 
120 template<Architecture A>
121 struct InstructionSet<A, std::int32_t> {
122   using type = std::int32_t;
123   static constexpr std::uint32_t kNumVar = kRegisterSize / (8 * sizeof(type));
124   static constexpr std::uint32_t kLogNumVar = 3;
125   static constexpr std::uint32_t kLSS = 4;
126   static constexpr std::uint32_t kRSS = 28;
_mmxxx_add_epispoa::InstructionSet127   static inline __mxxxi _mmxxx_add_epi(const __mxxxi& a, const __mxxxi& b) {
128     return _mm256_add_epi32(a, b);
129   }
_mmxxx_sub_epispoa::InstructionSet130   static inline __mxxxi _mmxxx_sub_epi(const __mxxxi& a, const __mxxxi& b) {
131     return _mm256_sub_epi32(a, b);
132   }
_mmxxx_min_epispoa::InstructionSet133   static inline __mxxxi _mmxxx_min_epi(const __mxxxi& a, const __mxxxi& b) {
134     return _mm256_min_epi32(a, b);
135   }
_mmxxx_max_epispoa::InstructionSet136   static inline __mxxxi _mmxxx_max_epi(const __mxxxi& a, const __mxxxi& b) {
137     return _mm256_max_epi32(a, b);
138   }
_mmxxx_set1_epispoa::InstructionSet139   static inline __mxxxi _mmxxx_set1_epi(type a) {
140     return _mm256_set1_epi32(a);
141   }
_mmxxx_prefix_maxspoa::InstructionSet142   static inline void _mmxxx_prefix_max(
143       __mxxxi& a,  // NOLINT
144       const __mxxxi* masks,
145       const __mxxxi* penalties) {
146     a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[0], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[0]), 4)));  // NOLINT
147     a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[1], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[1]), 8)));  // NOLINT
148     a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[2], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[2]), 16)));  // NOLINT
149     }
150 };
151 
#elif defined(__SSE4_1__) || defined(USE_SIMDE)

constexpr std::uint32_t kRegisterSize = 128;  // SIMD register width in bits
using __mxxxi = __m128i;
156 
// Aligned load of one SIMD register from mem_addr.
inline __mxxxi _mmxxx_load_si(__mxxxi const* mem_addr) {
  return _mm_load_si128(mem_addr);
}
160 
// Aligned store of one SIMD register to mem_addr.
inline void _mmxxx_store_si(__mxxxi* mem_addr, const __mxxxi& a) {
  _mm_store_si128(mem_addr, a);
}
164 
// Bitwise OR of two SIMD registers.
inline __mxxxi _mmxxx_or_si(const __mxxxi& a, const __mxxxi& b) {
  return _mm_or_si128(a, b);
}
168 
// On SSE the 128-bit register maps directly onto the native byte shifts;
// n must be a compile-time constant.
#define _mmxxx_slli_si(a, n) \
    _mm_slli_si128(a, n)

#define _mmxxx_srli_si(a, n) \
    _mm_srli_si128(a, n)
174 
// SSE4.1 instruction table for 16-bit score lanes (8 lanes per register).
template<Architecture A>
struct InstructionSet<A, std::int16_t> {
  using type = std::int16_t;
  // 128 bits / 16 bits = 8 lanes.
  static constexpr std::uint32_t kNumVar = kRegisterSize / (8 * sizeof(type));
  static constexpr std::uint32_t kLogNumVar = 3;
  static constexpr std::uint32_t kLSS = 2;   // Left Shift Size (bytes)
  static constexpr std::uint32_t kRSS = 14;  // Right Shift Size (bytes)
  static inline __mxxxi _mmxxx_add_epi(const __mxxxi& a, const __mxxxi& b) {
    return _mm_add_epi16(a, b);
  }
  static inline __mxxxi _mmxxx_sub_epi(const __mxxxi& a, const __mxxxi& b) {
    return _mm_sub_epi16(a, b);
  }
  static inline __mxxxi _mmxxx_min_epi(const __mxxxi& a, const __mxxxi& b) {
    return _mm_min_epi16(a, b);
  }
  static inline __mxxxi _mmxxx_max_epi(const __mxxxi& a, const __mxxxi& b) {
    return _mm_max_epi16(a, b);
  }
  static inline __mxxxi _mmxxx_set1_epi(type a) {
    return _mm_set1_epi16(a);
  }
  // Running in-register prefix maximum in log2(kNumVar) = 3 doubling steps;
  // masks keep lanes shifted in from outside at -infinity.
  static inline void _mmxxx_prefix_max(
      __mxxxi& a,  // NOLINT
      const __mxxxi* masks,
      const __mxxxi* penalties) {
    a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[0], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[0]), 2)));  // NOLINT
    a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[1], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[1]), 4)));  // NOLINT
    a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[2], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[2]), 8)));  // NOLINT
  }
};
206 
// SSE4.1 instruction table for 32-bit score lanes (4 lanes per register).
template<Architecture A>
struct InstructionSet<A, std::int32_t> {
  using type = std::int32_t;
  // 128 bits / 32 bits = 4 lanes.
  static constexpr std::uint32_t kNumVar = kRegisterSize / (8 * sizeof(type));
  static constexpr std::uint32_t kLogNumVar = 2;
  static constexpr std::uint32_t kLSS = 4;   // Left Shift Size (bytes)
  static constexpr std::uint32_t kRSS = 12;  // Right Shift Size (bytes)
  static inline __mxxxi _mmxxx_add_epi(const __mxxxi& a, const __mxxxi& b) {
    return _mm_add_epi32(a, b);
  }
  static inline __mxxxi _mmxxx_sub_epi(const __mxxxi& a, const __mxxxi& b) {
    return _mm_sub_epi32(a, b);
  }
  static inline __mxxxi _mmxxx_min_epi(const __mxxxi& a, const __mxxxi& b) {
    return _mm_min_epi32(a, b);
  }
  static inline __mxxxi _mmxxx_max_epi(const __mxxxi& a, const __mxxxi& b) {
    return _mm_max_epi32(a, b);
  }
  static inline __mxxxi _mmxxx_set1_epi(type a) {
    return _mm_set1_epi32(a);
  }
  // Running in-register prefix maximum in log2(kNumVar) = 2 doubling steps;
  // masks keep lanes shifted in from outside at -infinity.
  static inline void _mmxxx_prefix_max(
      __mxxxi& a,  // NOLINT
      const __mxxxi* masks,
      const __mxxxi* penalties) {
    a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[0], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[0]), 4)));  // NOLINT
    a = _mmxxx_max_epi(a, _mmxxx_or_si(masks[1], _mmxxx_slli_si(_mmxxx_add_epi(a, penalties[1]), 8)));  // NOLINT
  }
};
237 
238 #endif
239 
240 #if defined(__AVX2__) || defined(__SSE4_1__) || defined(USE_SIMDE)
241 
242 template<Architecture A, typename T>
_mmxxx_print(const __mxxxi & a)243 void _mmxxx_print(const __mxxxi& a) {
244   __attribute__((aligned(kRegisterSize / 8))) typename T::type unpacked[T::kNumVar];  // NOLINT
245   _mmxxx_store_si(reinterpret_cast<__mxxxi*>(unpacked), a);
246 
247   for (std::uint32_t i = 0; i < T::kNumVar; i++) {
248     std::cout << unpacked[i] << " ";
249   }
250 }
251 
// Horizontal maximum over all lanes of a.
// NOTE(review): the accumulator starts at 0, so an all-negative register
// yields 0 rather than the true lane maximum -- presumably callers only rely
// on this where relevant scores are non-negative; verify at call sites.
template<Architecture A, typename T>
typename T::type _mmxxx_max_value(const __mxxxi& a) {
  typename T::type max_score = 0;
  __attribute__((aligned(kRegisterSize / 8))) typename T::type unpacked[T::kNumVar]; // NOLINT
  _mmxxx_store_si(reinterpret_cast<__mxxxi*>(unpacked), a);

  for (std::uint32_t i = 0; i < T::kNumVar; i++) {
    max_score = std::max(max_score, unpacked[i]);
  }
  return max_score;
}
263 
264 template<Architecture A, typename T>
_mmxxx_value_at(const __mxxxi & a,std::uint32_t i)265 typename T::type _mmxxx_value_at(const __mxxxi& a, std::uint32_t i) {
266   __attribute__((aligned(kRegisterSize / 8))) typename T::type unpacked[T::kNumVar]; // NOLINT
267   _mmxxx_store_si(reinterpret_cast<__mxxxi*>(unpacked), a);
268 
269   return unpacked[i];
270 }
271 
272 template<Architecture A, typename T>
_mmxxx_index_of(const __mxxxi * row,std::uint32_t row_width,typename T::type value)273 std::int32_t _mmxxx_index_of(
274     const __mxxxi* row,
275     std::uint32_t row_width,
276     typename T::type value) {
277   for (std::uint32_t i = 0; i < row_width; ++i) {
278     __attribute__((aligned(kRegisterSize / 8))) typename T::type unpacked[T::kNumVar]; // NOLINT
279     _mmxxx_store_si(reinterpret_cast<__mxxxi*>(unpacked), row[i]);
280 
281     for (std::uint32_t j = 0; j < T::kNumVar; j++) {
282       if (unpacked[j] == value) {
283         return i * T::kNumVar + j;
284       }
285     }
286   }
287   return -1;
288 }
289 
290 #endif
291 
// Factory: builds a SIMD alignment engine with the given alignment type,
// gap model subtype and scoring parameters (match m, mismatch n, gap open g,
// gap extend e, second gap open q, second gap extend c). Returns nullptr
// when compiled without any SIMD support.
template<Architecture A>
std::unique_ptr<AlignmentEngine> SimdAlignmentEngine<A>::Create(
    AlignmentType type,
    AlignmentSubtype subtype,
    std::int8_t m,
    std::int8_t n,
    std::int8_t g,
    std::int8_t e,
    std::int8_t q,
    std::int8_t c) {
#if defined(__AVX2__) || defined(__SSE4_1__) || defined(USE_SIMDE)
  return std::unique_ptr<AlignmentEngine>(
      new SimdAlignmentEngine<A>(type, subtype, m, n, g, e, q, c));
#else
  return nullptr;
#endif
}
309 
310 template<Architecture A>
311 struct SimdAlignmentEngine<A>::Implementation {
312 #if defined(__AVX2__) || defined(__SSE4_1__) || defined(USE_SIMDE)
313   std::vector<std::uint32_t> node_id_to_rank;
314 
315   std::unique_ptr<__mxxxi[]> sequence_profile_storage;
316   std::uint64_t sequence_profile_size;
317   __mxxxi* sequence_profile;
318 
319   std::vector<std::int32_t> first_column;
320   std::unique_ptr<__mxxxi[]> M_storage;
321   std::uint64_t M_size;
322   __mxxxi* H;
323   __mxxxi* F;
324   __mxxxi* E;
325   __mxxxi* O;
326   __mxxxi* Q;
327 
328   std::unique_ptr<__mxxxi[]> masks_storage;
329   std::uint32_t masks_size;
330   __mxxxi* masks;
331 
332   std::unique_ptr<__mxxxi[]> penalties_storage;
333   std::uint32_t penalties_size;
334   __mxxxi* penalties;
335 
Implementationspoa::SimdAlignmentEngine::Implementation336   Implementation()
337       : node_id_to_rank(),
338         sequence_profile_storage(nullptr),
339         sequence_profile_size(0),
340         sequence_profile(nullptr),
341         first_column(),
342         M_storage(nullptr),
343         M_size(0),
344         H(nullptr),
345         F(nullptr),
346         E(nullptr),
347         O(nullptr),
348         Q(nullptr),
349         masks_storage(nullptr),
350         masks_size(0),
351         masks(nullptr),
352         penalties_storage(nullptr),
353         penalties_size(0),
354         penalties(nullptr) {
355   }
356 #endif
357 };
358 
// Constructs the engine: stores the scoring parameters in the base class
// and creates an empty Implementation (buffers are allocated on demand).
template<Architecture A>
SimdAlignmentEngine<A>::SimdAlignmentEngine(
    AlignmentType type,
    AlignmentSubtype subtype,
    std::int8_t m,
    std::int8_t n,
    std::int8_t g,
    std::int8_t e,
    std::int8_t q,
    std::int8_t c)
    : AlignmentEngine(type, subtype, m, n, g, e, q, c),
      pimpl_(new Implementation()) {
}
372 
373 template<Architecture A>
Prealloc(std::uint32_t max_sequence_len,std::uint8_t alphabet_size)374 void SimdAlignmentEngine<A>::Prealloc(
375     std::uint32_t max_sequence_len,
376     std::uint8_t alphabet_size) {
377   if (max_sequence_len > std::numeric_limits<int32_t>::max()) {
378     throw std::invalid_argument(
379         "[spoa::SimdAlignmentEngine::Prealloc] error: too large sequence!");
380   }
381 
382 #if defined(__AVX2__) || defined(__SSE4_1__) || defined(USE_SIMDE)
383 
384   std::int64_t worst_case_score = WorstCaseAlignmentScore(
385       static_cast<std::int64_t>(max_sequence_len) + 8,
386       static_cast<std::int64_t>(max_sequence_len) * alphabet_size);
387 
388   if (worst_case_score < std::numeric_limits<std::int32_t>::min() + 1024) {
389     return;
390   } else if (worst_case_score < std::numeric_limits<std::int16_t>::min() + 1024) {  // NOLINT
391     try {
392       Realloc(
393           (max_sequence_len / InstructionSet<A, std::int32_t>::kNumVar) + 1,
394           static_cast<std::uint64_t>(max_sequence_len) * alphabet_size,
395           alphabet_size);
396     } catch (std::bad_alloc& ba) {
397       throw std::invalid_argument(
398           "[spoa::SimdAlignmentEngine::Prealloc] error: insufficient memory!");
399     }
400   } else {
401     try {
402       Realloc(
403           (max_sequence_len / InstructionSet<A, std::int16_t>::kNumVar) + 1,
404           static_cast<std::uint64_t>(max_sequence_len) * alphabet_size,
405           alphabet_size);
406     } catch (std::bad_alloc& ba) {
407       throw std::invalid_argument(
408           "[spoa::SimdAlignmentEngine::Prealloc] error: insufficient memory!");
409     }
410   }
411 
412 #endif
413 }
414 
415 template<Architecture A>
Realloc(std::uint64_t matrix_width,std::uint64_t matrix_height,std::uint8_t num_codes)416 void SimdAlignmentEngine<A>::Realloc(
417     std::uint64_t matrix_width,
418     std::uint64_t matrix_height,
419     std::uint8_t num_codes) {
420 #if defined(__AVX2__) || defined(__SSE4_1__) || defined(USE_SIMDE)
421   if (pimpl_->node_id_to_rank.size() < matrix_height - 1) {
422     pimpl_->node_id_to_rank.resize(matrix_height - 1, 0);
423   }
424   if (pimpl_->sequence_profile_size < num_codes * matrix_width) {
425     __mxxxi* storage = nullptr;
426     pimpl_->sequence_profile_size = num_codes * matrix_width;
427     pimpl_->sequence_profile = AllocateAlignedMemory<A>(
428         &storage,
429         pimpl_->sequence_profile_size,
430         kRegisterSize / 8);
431     pimpl_->sequence_profile_storage.reset();
432     pimpl_->sequence_profile_storage = std::unique_ptr<__mxxxi[]>(storage);
433   }
434   if (subtype_ == AlignmentSubtype::kLinear) {
435     if (pimpl_->first_column.size() < matrix_height) {
436       pimpl_->first_column.resize(matrix_height, 0);
437     }
438     if (pimpl_->M_size < matrix_height * matrix_width) {
439       __mxxxi* storage = nullptr;
440       pimpl_->M_size = matrix_height * matrix_width;
441       pimpl_->H = AllocateAlignedMemory<A>(
442         &storage,
443         pimpl_->M_size,
444         kRegisterSize / 8);
445       pimpl_->M_storage.reset();
446       pimpl_->M_storage = std::unique_ptr<__mxxxi[]>(storage);
447     }
448   } else if (subtype_ == AlignmentSubtype::kAffine) {
449     if (pimpl_->first_column.size() < 2 * matrix_height) {
450       pimpl_->first_column.resize(2 * matrix_height, 0);
451     }
452     if (pimpl_->M_size < 3 * matrix_height * matrix_width) {
453       __mxxxi* storage = nullptr;
454       pimpl_->M_size = 3 * matrix_height * matrix_width;
455       pimpl_->H = AllocateAlignedMemory<A>(
456         &storage,
457         pimpl_->M_size,
458         kRegisterSize / 8);
459       pimpl_->F = pimpl_->H + matrix_height * matrix_width;
460       pimpl_->E = pimpl_->F + matrix_height * matrix_width;
461       pimpl_->M_storage.reset();
462       pimpl_->M_storage = std::unique_ptr<__mxxxi[]>(storage);
463     }
464   } else if (subtype_ == AlignmentSubtype::kConvex) {
465     if (pimpl_->first_column.size() < 3 * matrix_height) {
466       pimpl_->first_column.resize(3 * matrix_height, 0);
467     }
468     if (pimpl_->M_size < 5 * matrix_height * matrix_width) {
469       __mxxxi* storage = nullptr;
470       pimpl_->M_size = 5 * matrix_height * matrix_width;
471       pimpl_->H = AllocateAlignedMemory<A>(
472           &storage,
473           pimpl_->M_size,
474           kRegisterSize / 8);
475       pimpl_->F = pimpl_->H + matrix_height * matrix_width;
476       pimpl_->E = pimpl_->F + matrix_height * matrix_width;
477       pimpl_->O = pimpl_->E + matrix_height * matrix_width;
478       pimpl_->Q = pimpl_->O + matrix_height * matrix_width;
479       pimpl_->M_storage.reset();
480       pimpl_->M_storage = std::unique_ptr<__mxxxi[]>(storage);
481     }
482   }
483   if (pimpl_->masks_size < InstructionSet<A, std::int16_t>::kLogNumVar + 1) {
484     __mxxxi* storage = nullptr;
485     pimpl_->masks_size = InstructionSet<A, std::int16_t>::kLogNumVar + 1;
486     pimpl_->masks = AllocateAlignedMemory<A>(
487         &storage,
488         pimpl_->masks_size,
489         kRegisterSize / 8);
490     pimpl_->masks_storage.reset();
491     pimpl_->masks_storage = std::unique_ptr<__mxxxi[]>(storage);
492   }
493   if (pimpl_->penalties_size < 2 * InstructionSet<A, std::int16_t>::kLogNumVar) {  // NOLINT
494     __mxxxi* storage = nullptr;
495     pimpl_->penalties_size = 2 * InstructionSet<A, std::int16_t>::kLogNumVar;
496     pimpl_->penalties = AllocateAlignedMemory<A>(
497         &storage,
498         pimpl_->penalties_size,
499         kRegisterSize / 8);
500     pimpl_->penalties_storage.reset();
501     pimpl_->penalties_storage = std::unique_ptr<__mxxxi[]>(storage);
502   }
503 #endif
504 }
505 
// Fills the query scoring profile and the boundary conditions (first row of
// every matrix, first column stripes) for the chosen alignment type and gap
// model. Buffers must already be sized by Realloc. T is an InstructionSet
// specialization; matrix_width is in SIMD registers, normal_matrix_width in
// sequence characters.
template<Architecture A> template<typename T>
void SimdAlignmentEngine<A>::Initialize(
    const char* sequence,
    const Graph& graph,
    std::uint64_t normal_matrix_width,
    std::uint64_t matrix_width,
    std::uint64_t matrix_height) noexcept {
#if defined(__AVX2__) || defined(__SSE4_1__) || defined(USE_SIMDE)
  // Harsh penalty for padding lanes past the sequence end so they can never
  // win over real lanes. (sic: "penatly" kept to avoid a code change here.)
  std::int32_t padding_penatly = -1 * std::max(
      std::max(abs(m_), abs(n_)),
      std::max(abs(g_), abs(q_)));

  __attribute__((aligned(kRegisterSize / 8))) typename T::type unpacked[T::kNumVar] = {};  // NOLINT

  // Build the profile: for each graph code, a packed row of match/mismatch
  // scores against every sequence position (padding lanes get the penalty).
  for (std::uint32_t i = 0; i < graph.num_codes(); ++i) {
    char c = graph.decoder(i);
    for (std::uint32_t j = 0; j < matrix_width; ++j) {
      for (std::uint32_t k = 0; k < T::kNumVar; ++k) {
        unpacked[k] = (j * T::kNumVar + k) < normal_matrix_width ?
            (c == sequence[j * T::kNumVar + k] ? m_ : n_) : padding_penatly;
      }
      pimpl_->sequence_profile[i * matrix_width + j] =
          _mmxxx_load_si(reinterpret_cast<const __mxxxi*>(unpacked));
    }
  }

  // Matrix row i corresponds to the node with topological rank i - 1.
  const auto& rank_to_node = graph.rank_to_node();
  for (std::uint32_t i = 0; i < rank_to_node.size(); ++i) {
    pimpl_->node_id_to_rank[rank_to_node[i]->id] = i;
  }

  // Offset by 1024 leaves headroom so additions cannot wrap around.
  typename T::type kNegativeInfinity =
      std::numeric_limits<typename T::type>::min() + 1024;

  __mxxxi negative_infinities = T::_mmxxx_set1_epi(kNegativeInfinity);
  __mxxxi zeroes = T::_mmxxx_set1_epi(0);

  // initialize secondary matrices
  // NOTE: the case fallthroughs below are deliberate -- kConvex also needs
  // the kAffine initialization of F/E and its first-column stripe.
  switch (subtype_) {
    case AlignmentSubtype::kConvex:
      // First row of O is -inf; first row of Q ramps by the second gap
      // extension c_ per column (lane-wise ramp built by shifted adds).
      for (std::uint32_t j = 0; j < matrix_width; ++j) {
        pimpl_->O[j] = negative_infinities;
        pimpl_->Q[j] = T::_mmxxx_set1_epi(q_ + j * T::kNumVar * c_);

        __mxxxi c = T::_mmxxx_set1_epi(c_);
        for (std::uint32_t k = 1; k < T::kNumVar; ++k) {
          c = _mmxxx_slli_si(c, T::kLSS);
          pimpl_->Q[j] = T::_mmxxx_add_epi(pimpl_->Q[j], c);
        }
      }
      // Third first-column stripe (index base 2 * matrix_height): each row
      // extends the best predecessor by c_ (q_ - c_ + c_ = q_ for roots).
      pimpl_->first_column[2 * matrix_height] = 0;
      for (std::uint32_t i = 1; i < matrix_height; ++i) {
        const auto& edges = rank_to_node[i - 1]->inedges;
        std::int32_t penalty = edges.empty() ? q_ - c_ : kNegativeInfinity;
        for (const auto& it : edges) {
          std::uint32_t pred_i = pimpl_->node_id_to_rank[it->tail->id] + 1;
          penalty = std::max(penalty, pimpl_->first_column[2 * matrix_height + pred_i]);  // NOLINT
        }
        pimpl_->first_column[2 * matrix_height + i] = penalty + c_;
      }
      // fall through: convex also performs the affine initialization
    case AlignmentSubtype::kAffine:
      // First row of F is -inf; first row of E ramps by the gap extension
      // e_ per column.
      for (std::uint32_t j = 0; j < matrix_width; ++j) {
        pimpl_->F[j] = negative_infinities;
        pimpl_->E[j] = T::_mmxxx_set1_epi(g_ + j * T::kNumVar * e_);

        __mxxxi e = T::_mmxxx_set1_epi(e_);
        for (std::uint32_t k = 1; k < T::kNumVar; ++k) {
          e = _mmxxx_slli_si(e, T::kLSS);
          pimpl_->E[j] = T::_mmxxx_add_epi(pimpl_->E[j], e);
        }
      }
      // Second first-column stripe (index base matrix_height).
      pimpl_->first_column[matrix_height] = 0;
      for (std::uint32_t i = 1; i < matrix_height; ++i) {
        const auto& edges = rank_to_node[i - 1]->inedges;
        std::int32_t penalty = edges.empty() ? g_ - e_ : kNegativeInfinity;
        for (const auto& it : edges) {
          std::uint32_t pred_i = pimpl_->node_id_to_rank[it->tail->id] + 1;
          penalty = std::max(penalty, pimpl_->first_column[matrix_height + pred_i]);  // NOLINT
        }
        pimpl_->first_column[matrix_height + i] = penalty + e_;
      }
      // fall through: linear needs no secondary matrices
    case AlignmentSubtype::kLinear:
      default:
        break;
  }

  // initialize primary matrix
  switch (type_) {
    case AlignmentType::kSW:
      // Local alignment: free start anywhere -> zero boundaries.
      for (std::uint32_t j = 0; j < matrix_width; ++j) {
        pimpl_->H[j] = zeroes;
      }
      for (std::uint32_t i = 0; i < matrix_height; ++i) {
        pimpl_->first_column[i] = 0;
      }
      break;
    case AlignmentType::kNW:
      // Global alignment: boundaries carry accumulated gap penalties.
      switch (subtype_) {
        case AlignmentSubtype::kConvex:
          for (std::uint32_t i = 0; i < matrix_height; ++i) {
            pimpl_->first_column[i] = std::max(
                pimpl_->first_column[matrix_height + i],
                pimpl_->first_column[2 * matrix_height + i]);
          }
          for (std::uint32_t j = 0; j < matrix_width; ++j) {
            pimpl_->H[j] = T::_mmxxx_max_epi(pimpl_->E[j], pimpl_->Q[j]);
          }
          break;
        case AlignmentSubtype::kAffine:
          for (std::uint32_t i = 0; i < matrix_height; ++i) {
            pimpl_->first_column[i] = pimpl_->first_column[matrix_height + i];
          }
          for (std::uint32_t j = 0; j < matrix_width; ++j) {
            pimpl_->H[j] = pimpl_->E[j];
          }
          break;
        case AlignmentSubtype::kLinear:
          // First column: best predecessor plus one linear gap g_ per row.
          pimpl_->first_column[0] = 0;
          for (std::uint32_t i = 1; i < matrix_height; ++i) {
            const auto& edges = rank_to_node[i - 1]->inedges;
            std::int32_t penalty = edges.empty() ? 0 : kNegativeInfinity;
            for (const auto& it : edges) {
              std::uint32_t pred_i = pimpl_->node_id_to_rank[it->tail->id] + 1;
              penalty = std::max(penalty, pimpl_->first_column[pred_i]);
            }
            pimpl_->first_column[i] = penalty + g_;
          }
          // First row: lane-wise ramp of g_ per column.
          for (std::uint32_t j = 0; j < matrix_width; ++j) {
            pimpl_->H[j] = T::_mmxxx_set1_epi(g_ + j * T::kNumVar * g_);
            __mxxxi g = T::_mmxxx_set1_epi(g_);

            for (std::uint32_t k = 1; k < T::kNumVar; ++k) {
              g = _mmxxx_slli_si(g, T::kLSS);
              pimpl_->H[j] = T::_mmxxx_add_epi(pimpl_->H[j], g);
            }
          }
        default:
          break;
      }
      break;
    case AlignmentType::kOV:
      // Overlap alignment: free first column, penalized first row.
      switch (subtype_) {
        case AlignmentSubtype::kConvex:
          for (std::uint32_t j = 0; j < matrix_width; ++j) {
            pimpl_->H[j] = T::_mmxxx_max_epi(pimpl_->E[j],
            pimpl_->Q[j]);
          }
          break;
        case AlignmentSubtype::kAffine:
          for (std::uint32_t j = 0; j < matrix_width; ++j) {
            pimpl_->H[j] = pimpl_->E[j];
          }
          break;
        case AlignmentSubtype::kLinear:
          for (std::uint32_t j = 0; j < matrix_width; ++j) {
            pimpl_->H[j] = T::_mmxxx_set1_epi(g_ + j * T::kNumVar * g_);
            __mxxxi g = T::_mmxxx_set1_epi(g_);

            for (std::uint32_t k = 1; k < T::kNumVar; ++k) {
              g = _mmxxx_slli_si(g, T::kLSS);
              pimpl_->H[j] = T::_mmxxx_add_epi(pimpl_->H[j], g);
            }
          }
          break;
        default:
          break;
      }
      for (std::uint32_t i = 0; i < matrix_height; ++i) {
        pimpl_->first_column[i] = 0;
      }
      break;
    default:
      break;
  }
#endif
}
682 
// Aligns `sequence` to `graph`: picks 16- or 32-bit score lanes from the
// worst-case score bound, (re)allocates and initializes the matrices, and
// dispatches to the kernel matching the gap model. Optionally reports the
// alignment score through `score`. Returns an empty Alignment for empty
// input or when compiled without SIMD support; throws std::invalid_argument
// on oversized input, unavoidable overflow, or allocation failure.
template<Architecture A>
Alignment SimdAlignmentEngine<A>::Align(
    const char* sequence, std::uint32_t sequence_len,
    const Graph& graph,
    std::int32_t* score) {
  if (sequence_len > std::numeric_limits<int32_t>::max()) {
    throw std::invalid_argument(
        "[spoa::SimdAlignmentEngine::Align] error: too large sequence!");
  }

  if (graph.nodes().empty() || sequence_len == 0) {
    return Alignment();
  }

#if defined(__AVX2__) || defined(__SSE4_1__) || defined(USE_SIMDE)

  // +8 pads the length by one SIMD stripe for the worst-case bound.
  std::int64_t worst_case_score = WorstCaseAlignmentScore(
      sequence_len + 8,
      graph.nodes().size());

  if (worst_case_score < std::numeric_limits<std::int32_t>::min() + 1024) {
    // Not even 32-bit lanes can represent the score range safely.
    throw std::invalid_argument(
        "[spoa::SimdAlignmentEngine::Align] error: possible overflow!");
  } else if (worst_case_score < std::numeric_limits<std::int16_t>::min() + 1024) {  // NOLINT
    // Fits 32-bit lanes but not 16-bit ones.
    try {
      Realloc(
          std::ceil(static_cast<double>(sequence_len) / InstructionSet<A, std::int32_t>::kNumVar),  // NOLINT
          graph.nodes().size() + 1,
          graph.num_codes());
    } catch (std::bad_alloc& ba) {
      throw std::invalid_argument(
          "[spoa::SimdAlignmentEngine::Align] error: insufficient memory!");
    }
    Initialize<InstructionSet<A, std::int32_t>>(
        sequence,
        graph,
        sequence_len,
        std::ceil(static_cast<double>(sequence_len) / InstructionSet<A, std::int32_t>::kNumVar),  // NOLINT
        graph.nodes().size() + 1);

    if (subtype_ == AlignmentSubtype::kLinear) {
      return Linear<InstructionSet<A, std::int32_t>>(sequence_len, graph, score);  // NOLINT
    } else if (subtype_ == AlignmentSubtype::kAffine) {
      return Affine<InstructionSet<A, std::int32_t>>(sequence_len, graph, score);  // NOLINT
    } else if (subtype_ == AlignmentSubtype::kConvex) {
      return Convex<InstructionSet<A, std::int32_t>>(sequence_len, graph, score);  // NOLINT
    }
  } else {
    // 16-bit lanes suffice (twice the parallelism).
    try {
      Realloc(
          std::ceil(static_cast<double>(sequence_len) / InstructionSet<A, std::int16_t>::kNumVar),  // NOLINT
          graph.nodes().size() + 1,
          graph.num_codes());
    } catch (std::bad_alloc& ba) {
      throw std::invalid_argument(
          "[spoa::SimdAlignmentEngine::Align] error: insufficient memory!");
    }
    Initialize<InstructionSet<A, std::int16_t>>(
        sequence,
        graph,
        sequence_len,
        std::ceil(static_cast<double>(sequence_len) / InstructionSet<A, std::int16_t>::kNumVar),  // NOLINT
        graph.nodes().size() + 1);

    if (subtype_ == AlignmentSubtype::kLinear) {
      return Linear<InstructionSet<A, std::int16_t>>(sequence_len, graph, score);  // NOLINT
    } else if (subtype_ == AlignmentSubtype::kAffine) {
      return Affine<InstructionSet<A, std::int16_t>>(sequence_len, graph, score);  // NOLINT
    } else if (subtype_ == AlignmentSubtype::kConvex) {
      return Convex<InstructionSet<A, std::int16_t>>(sequence_len, graph, score);  // NOLINT
    }
  }

#endif
  return Alignment();
}
759 
// Aligns a sequence to the partial order graph using a LINEAR gap model
// (every gap position costs g_), vectorized so that T::kNumVar consecutive
// sequence positions are processed per SIMD register. Rows of the H matrix
// follow the topological rank order of graph nodes; pimpl_ buffers (H, masks,
// penalties, sequence_profile, first_column) were sized and filled by
// Realloc/Initialize before this call.
//
// sequence_len - query length (fixes the number of SIMD vectors per row)
// graph        - partial order graph to align against
// score        - optional out-parameter for the best score (may be nullptr)
//
// Returns the alignment as (node_id, sequence_position) pairs with -1 marking
// gaps, or an empty Alignment when no cell qualifies or SIMD is compiled out.
760 template<Architecture A> template <typename T>
Linear(std::uint32_t sequence_len,const Graph & graph,std::int32_t * score)761 Alignment SimdAlignmentEngine<A>::Linear(
762     std::uint32_t sequence_len,
763     const Graph& graph,
764     std::int32_t* score) noexcept {
765 #if defined(__AVX2__) || defined(__SSE4_1__) || defined(USE_SIMDE)
766   std::uint64_t normal_matrix_width = sequence_len;
  // number of SIMD vectors needed to cover one matrix row
767   std::uint64_t matrix_width =
768       std::ceil(static_cast<double>(sequence_len) / T::kNumVar);
769   const auto& rank_to_node = graph.rank_to_node();
770 
  // +1024 leaves headroom so adding penalties to "minus infinity" cells
  // cannot wrap around the integer range
771   typename T::type kNegativeInfinity =
772       std::numeric_limits<typename T::type>::min() + 1024;
773 
774   __attribute__((aligned(kRegisterSize / 8))) typename T::type unpacked[T::kNumVar] = {0};  // NOLINT
775 
  // masks[j] has its first 2^j lanes set to kNegativeInfinity (i hits
  // 0, 1, 3, 7, ... via the i & (i + 1) == 0 test); used by
  // _mmxxx_prefix_max for the log-time in-register prefix maximum
776   for (std::uint32_t i = 0, j = 0; i < T::kNumVar && j < T::kLogNumVar; ++i) {
777     unpacked[i] = kNegativeInfinity;
778     if ((i & (i + 1)) == 0) {
779       pimpl_->masks[j++] = _mmxxx_load_si(
780         reinterpret_cast<const __mxxxi*>(unpacked));
781     }
782   }
  // mask with every lane but the first set; ORed with a carry value that
  // occupies only the first lane when propagating across vector boundaries
783   pimpl_->masks[T::kLogNumVar] = _mmxxx_slli_si(
784       T::_mmxxx_set1_epi(kNegativeInfinity),
785       T::kLSS);
786 
  // penalties[i] = 2^i * g_, matching the doubling strides of the prefix max
787   pimpl_->penalties[0] = T::_mmxxx_set1_epi(g_);
788   for (std::uint32_t i = 1; i < T::kLogNumVar; ++i) {
789     pimpl_->penalties[i] = T::_mmxxx_add_epi(
790         pimpl_->penalties[i - 1],
791         pimpl_->penalties[i - 1]);
792   }
793 
794   typename T::type max_score = type_ == AlignmentType::kSW ? 0 : kNegativeInfinity;  // NOLINT
795   std::int32_t max_i = -1;
796   std::int32_t max_j = -1;
797   std::uint32_t last_column_id = (normal_matrix_width - 1) % T::kNumVar;
798   __mxxxi zeroes = T::_mmxxx_set1_epi(0);
799   __mxxxi g = T::_mmxxx_set1_epi(g_);
800 
801   // alignment
802   for (const auto& it : rank_to_node) {
803     __mxxxi* char_profile = &(pimpl_->sequence_profile[it->code * matrix_width]);  // NOLINT
804 
805     std::uint32_t i = pimpl_->node_id_to_rank[it->id] + 1;
    // row 0 is the virtual start row used for nodes without predecessors
806     std::uint32_t pred_i = it->inedges.empty() ?
807         0 : pimpl_->node_id_to_rank[it->inedges[0]->tail->id] + 1;
808 
809     __mxxxi* H_row = &(pimpl_->H[i * matrix_width]);
810     __mxxxi* H_pred_row = &(pimpl_->H[pred_i * matrix_width]);
811 
    // x carries the predecessor row's cell just left of the current vector
    // (initially its first-column value), aligned into a single lane
812     __mxxxi x = _mmxxx_srli_si(
813         T::_mmxxx_set1_epi(pimpl_->first_column[pred_i]),
814         T::kRSS);
815 
816     for (std::uint64_t j = 0; j < matrix_width; ++j) {
817       // get diagonal
818       __mxxxi t1 = _mmxxx_srli_si(H_pred_row[j], T::kRSS);
819       H_row[j] = _mmxxx_or_si(
820           _mmxxx_slli_si(H_pred_row[j], T::kLSS),
821           x);
822       x = t1;
823 
824       // update M
825       H_row[j] = T::_mmxxx_max_epi(
826           T::_mmxxx_add_epi(H_row[j], char_profile[j]),
827           T::_mmxxx_add_epi(H_pred_row[j], g));
828     }
829     // check other predecessors
830     for (std::uint32_t p = 1; p < it->inedges.size(); ++p) {
831       pred_i = pimpl_->node_id_to_rank[it->inedges[p]->tail->id] + 1;
832 
833       H_pred_row = &(pimpl_->H[pred_i * matrix_width]);
834 
835       x = _mmxxx_srli_si(
836           T::_mmxxx_set1_epi(pimpl_->first_column[pred_i]),
837           T::kRSS);
838 
839       for (std::uint64_t j = 0; j < matrix_width; ++j) {
840         // get diagonal
841         __mxxxi t1 = _mmxxx_srli_si(H_pred_row[j], T::kRSS);
842         __mxxxi m = _mmxxx_or_si(
843             _mmxxx_slli_si(H_pred_row[j], T::kLSS),
844             x);
845         x = t1;
846 
847         // update M
848         H_row[j] = T::_mmxxx_max_epi(
849             H_row[j],
850             T::_mmxxx_max_epi(
851                 T::_mmxxx_add_epi(m, char_profile[j]),
852                 T::_mmxxx_add_epi(H_pred_row[j], g)));
853       }
854     }
855 
    // horizontal pass: resolve in-row gaps with the prefix max and carry the
    // last lane (plus gap penalty) of each vector into the next one via x
856     __mxxxi score = T::_mmxxx_set1_epi(kNegativeInfinity);
857     x = _mmxxx_srli_si(
858         T::_mmxxx_add_epi(
859             T::_mmxxx_set1_epi(pimpl_->first_column[i]),
860             g),
861         T::kRSS);
862 
863     for (std::uint64_t j = 0; j < matrix_width; ++j) {
864       // add last element of previous vector into this one
865       H_row[j] = T::_mmxxx_max_epi(
866           H_row[j],
867           _mmxxx_or_si(x, pimpl_->masks[T::kLogNumVar]));
868 
869       T::_mmxxx_prefix_max(H_row[j], pimpl_->masks, pimpl_->penalties);
870 
871       x = _mmxxx_srli_si(
872           T::_mmxxx_add_epi(H_row[j], g),
873           T::kRSS);
874 
875       if (type_ == AlignmentType::kSW) {
876         H_row[j] = T::_mmxxx_max_epi(H_row[j], zeroes);
877       }
878       score = T::_mmxxx_max_epi(score, H_row[j]);
879     }
880 
    // track the best cell according to the alignment type: anywhere for SW,
    // any column of sink nodes for OV, last column of sink nodes for NW
881     if (type_ == AlignmentType::kSW) {
882       std::int32_t max_row_score = _mmxxx_max_value<A, T>(score);
883       if (max_score < max_row_score) {
884         max_score = max_row_score;
885         max_i = i;
886       }
887     } else if (type_ == AlignmentType::kOV) {
888       if (it->outedges.empty()) {
889         std::int32_t max_row_score = _mmxxx_max_value<A, T>(score);
890         if (max_score < max_row_score) {
891           max_score = max_row_score;
892           max_i = i;
893         }
894       }
895     } else if (type_ == AlignmentType::kNW) {
896       if (it->outedges.empty()) {
897         std::int32_t max_row_score = _mmxxx_value_at<A, T>(
898             H_row[matrix_width - 1],
899             last_column_id);
900         if (max_score < max_row_score) {
901           max_score = max_row_score;
902           max_i = i;
903         }
904       }
905     }
906   }
907 
908   if (max_i == -1 && max_j == -1) {
909     return Alignment();
910   }
911   if (score) {
912     *score = max_score;
913   }
914 
  // locate the column of the maximum inside the chosen row
915   if (type_ == AlignmentType::kSW) {
916     max_j = _mmxxx_index_of<A, T>(
917         &(pimpl_->H[max_i * matrix_width]),
918         matrix_width,
919         max_score);
920   } else if (type_ == AlignmentType::kOV) {
921     if (rank_to_node[max_i - 1]->outedges.empty()) {
922       max_j = _mmxxx_index_of<A, T>(
923           &(pimpl_->H[max_i * matrix_width]),
924           matrix_width,
925           max_score);
926     } else {
927       max_j = normal_matrix_width - 1;
928     }
929   } else if (type_ == AlignmentType::kNW) {
930     max_j = normal_matrix_width - 1;
931   }
932 
933   // backtrack
  // aligned scratch buffers unpack whole SIMD vectors of the current cell,
  // its predecessors, the left neighbour and the query profile so that
  // individual lanes can be read as scalars during backtracking
934   std::uint32_t max_num_predecessors = 1;
935   for (std::uint32_t i = 0; i < static_cast<std::uint32_t>(max_i); ++i) {
936     max_num_predecessors = std::max(
937         max_num_predecessors,
938         static_cast<std::uint32_t>(rank_to_node[i]->inedges.size()));
939   }
940 
941   typename T::type* backtrack_storage = nullptr;
942   typename T::type* H = AllocateAlignedMemory<A>(
943       &backtrack_storage,
944       3 * T::kNumVar + 2 * T::kNumVar * max_num_predecessors,
945       kRegisterSize / 8);
946   typename T::type* H_pred = H + T::kNumVar;
947   typename T::type* H_diag_pred = H_pred + T::kNumVar * max_num_predecessors;
948   typename T::type* H_left_pred = H_diag_pred + T::kNumVar * max_num_predecessors;  // NOLINT
949   typename T::type* profile = H_left_pred + T::kNumVar;
950 
951   std::vector<std::uint32_t> predecessors;
952 
953   std::int32_t i = max_i;
954   std::int32_t j = max_j;
955   std::int32_t prev_i = 0, prev_j = 0;
956 
  // j_div selects the SIMD vector in the row, j_mod the lane within it
957   std::uint32_t j_div = j / T::kNumVar;
958   std::uint32_t j_mod = j % T::kNumVar;
959 
960   bool load_next_segment = true;
961 
962   Alignment alignment;
963 
964   do {
965     // check stop condition
966     if (j == -1 || i == 0) {
967       break;
968     }
969 
970     const auto& it = rank_to_node[i - 1];
971     // load everything
972     if (load_next_segment) {
973       predecessors.clear();
974 
975       // load current cells
976       _mmxxx_store_si(
977           reinterpret_cast<__mxxxi*>(H),
978           pimpl_->H[i * matrix_width + j_div]);
979 
980       // load predecessors cells
981       if (it->inedges.empty()) {
982         predecessors.emplace_back(0);
983         _mmxxx_store_si(reinterpret_cast<__mxxxi*>(H_pred), pimpl_->H[j_div]);
984       } else {
985         std::uint32_t store_pos = 0;
986         for (const auto& jt : it->inedges) {
987           predecessors.emplace_back(pimpl_->node_id_to_rank[jt->tail->id] + 1);
988           _mmxxx_store_si(
989               reinterpret_cast<__mxxxi*>(&H_pred[store_pos * T::kNumVar]),
990               pimpl_->H[predecessors.back() * matrix_width + j_div]);
991           ++store_pos;
992         }
993       }
994 
995       // load query profile cells
996       _mmxxx_store_si(
997           reinterpret_cast<__mxxxi*>(profile),
998           pimpl_->sequence_profile[it->code * matrix_width + j_div]);
999     }
1000 
1001     // check stop condition
1002     if (type_ == AlignmentType::kSW && H[j_mod] == 0) {
1003       break;
1004     }
1005 
1006     if (j_mod == 0) {
1007       // border case
      // the diagonal/left cells live in the previous vector (or, for the
      // very first vector, in the dedicated first_column buffer)
1008       if (j_div > 0) {
1009         _mmxxx_store_si(
1010             reinterpret_cast<__mxxxi*>(H_left_pred),
1011             pimpl_->H[i * matrix_width + j_div - 1]);
1012 
1013         for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
1014           _mmxxx_store_si(
1015               reinterpret_cast<__mxxxi*>(&H_diag_pred[p * T::kNumVar]),
1016               pimpl_->H[predecessors[p] * matrix_width + (j_div - 1)]);
1017         }
1018       } else {
1019         H_left_pred[T::kNumVar - 1] = pimpl_->first_column[i];
1020 
1021         for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
1022           H_diag_pred[(p + 1) * T::kNumVar - 1] =
1023               pimpl_->first_column[predecessors[p]];
1024         }
1025       }
1026     }
1027 
1028     // find best predecessor cell
    // move 1: diagonal (match/mismatch against each predecessor row)
1029     bool predecessor_found = false;
1030 
1031     if (i != 0) {
1032       for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
1033         if ((j_mod == 0 && H[j_mod] == H_diag_pred[(p + 1) * T::kNumVar - 1] + profile[j_mod]) ||  // NOLINT
1034             (j_mod != 0 && H[j_mod] == H_pred[p * T::kNumVar + j_mod - 1] + profile[j_mod])) {  // NOLINT
1035           prev_i = predecessors[p];
1036           prev_j = j - 1;
1037           predecessor_found = true;
1038           break;
1039         }
1040       }
1041     }
1042 
    // move 2: vertical (gap in the sequence)
1043     if (!predecessor_found && i != 0) {
1044       for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
1045         if (H[j_mod] == H_pred[p * T::kNumVar + j_mod] + g_) {
1046           prev_i = predecessors[p];
1047           prev_j = j;
1048           predecessor_found = true;
1049           break;
1050         }
1051       }
1052     }
1053 
    // move 3: horizontal (gap in the graph)
1054     if (!predecessor_found) {
1055       if ((j_mod == 0 && H[j_mod] == H_left_pred[T::kNumVar - 1] + g_) ||
1056           (j_mod != 0 && H[j_mod] == H[j_mod - 1] + g_)) {
1057         prev_i = i;
1058         prev_j = j - 1;
1059         predecessor_found = true;
1060       }
1061     }
1062 
1063     alignment.emplace_back(
1064         i == prev_i ? -1 : rank_to_node[i - 1]->id,
1065         j == prev_j ? -1 : j);
1066 
1067     // update for next round
    // reload SIMD segments only when the row changed or the walk crossed
    // into the previous vector of the same row
1068     load_next_segment =
1069         (i == prev_i ? false : true) ||
1070         (j != prev_j && prev_j % T::kNumVar == T::kNumVar - 1 ? true : false);
1071 
1072     i = prev_i;
1073     j = prev_j;
1074     j_div = j / T::kNumVar;
1075     j_mod = j % T::kNumVar;
1076   } while (true);
1077 
1078   delete[] backtrack_storage;
1079 
1080   // update alignment for NW (backtrack stops on first row or column)
1081   if (type_ == AlignmentType::kNW) {
1082     while (i == 0 && j != -1) {
1083       alignment.emplace_back(-1, j);
1084       --j;
1085     }
1086     while (i != 0 && j == -1) {
1087       alignment.emplace_back(rank_to_node[i - 1]->id, -1);
1088 
1089       const auto& it = rank_to_node[i - 1];
1090       if (it->inedges.empty()) {
1091           i = 0;
1092       } else {
        // follow the predecessor whose first-column score explains ours
1093         for (const auto& jt : it->inedges) {
1094           std::uint32_t pred_i = pimpl_->node_id_to_rank[jt->tail->id] + 1;
1095           if (pimpl_->first_column[i] == pimpl_->first_column[pred_i] + g_) {
1096             i = pred_i;
1097             break;
1098           }
1099         }
1100       }
1101     }
1102   }
1103 
  // the path was built back-to-front
1104   std::reverse(alignment.begin(), alignment.end());
1105   return alignment;
1106 #else
1107   return Alignment();
1108 #endif
1109 }
1110 
// Aligns a sequence to the partial order graph using an AFFINE gap model
// (gap open g_, gap extend e_), i.e. a vectorized Gotoh-style recurrence over
// T::kNumVar lanes. Besides the match matrix H, pimpl_->F holds vertical
// (graph-side) gap scores and pimpl_->E horizontal (sequence-side) gap
// scores.
//
// sequence_len - query length (fixes the number of SIMD vectors per row)
// graph        - partial order graph to align against
// score        - optional out-parameter for the best score (may be nullptr)
//
// Returns the alignment as (node_id, sequence_position) pairs with -1 marking
// gaps, or an empty Alignment when no cell qualifies or SIMD is compiled out.
1111 template<Architecture A> template <typename T>
Affine(std::uint32_t sequence_len,const Graph & graph,std::int32_t * score)1112 Alignment SimdAlignmentEngine<A>::Affine(
1113     std::uint32_t sequence_len,
1114     const Graph& graph,
1115     std::int32_t* score) noexcept {
1116 #if defined(__AVX2__) || defined(__SSE4_1__) || defined(USE_SIMDE)
1117   std::uint64_t normal_matrix_width = sequence_len;
  // number of SIMD vectors needed to cover one matrix row
1118   std::uint64_t matrix_width =
1119       std::ceil(static_cast<double>(sequence_len) / T::kNumVar);
1120   const auto& rank_to_node = graph.rank_to_node();
1121 
  // +1024 leaves headroom so penalty additions cannot wrap around
1122   typename T::type kNegativeInfinity =
1123       std::numeric_limits<typename T::type>::min() + 1024;
1124 
1125   typename T::type max_score = type_ == AlignmentType::kSW ? 0 : kNegativeInfinity;  // NOLINT
1126   std::int32_t max_i = -1;
1127   std::int32_t max_j = -1;
1128   std::uint32_t last_column_id = (normal_matrix_width - 1) % T::kNumVar;
1129   __mxxxi zeroes = T::_mmxxx_set1_epi(0);
  // g holds (open - extend) so that a uniform + e below yields the full open
  // cost on the open branch and a plain extension on the extend branch
1130   __mxxxi g = T::_mmxxx_set1_epi(g_ - e_);
1131   __mxxxi e = T::_mmxxx_set1_epi(e_);
1132 
1133   __attribute__((aligned(kRegisterSize / 8))) typename T::type unpacked[T::kNumVar] = {0};  // NOLINT
1134 
  // masks[j] has its first 2^j lanes set to kNegativeInfinity; consumed by
  // _mmxxx_prefix_max for the log-time in-register prefix maximum
1135   for (std::uint32_t i = 0, j = 0; i < T::kNumVar && j < T::kLogNumVar; ++i) {
1136     unpacked[i] = kNegativeInfinity;
1137     if ((i & (i + 1)) == 0) {
1138       pimpl_->masks[j++] = _mmxxx_load_si(
1139           reinterpret_cast<const __mxxxi*>(unpacked));
1140     }
1141   }
1142   pimpl_->masks[T::kLogNumVar] = _mmxxx_slli_si(
1143       T::_mmxxx_set1_epi(kNegativeInfinity),
1144       T::kLSS);
1145 
  // penalties[i] = 2^i * e_ — gap extension steps for the prefix max over E
1146   pimpl_->penalties[0] = T::_mmxxx_set1_epi(e_);
1147   for (std::uint32_t i = 1; i < T::kLogNumVar; ++i) {
1148     pimpl_->penalties[i] = T::_mmxxx_add_epi(
1149         pimpl_->penalties[i - 1],
1150         pimpl_->penalties[i - 1]);
1151   }
1152 
1153   // alignment
1154   for (const auto& it : rank_to_node) {
1155     __mxxxi* char_profile = &(pimpl_->sequence_profile[it->code * matrix_width]);  // NOLINT
1156 
1157     std::uint32_t i = pimpl_->node_id_to_rank[it->id] + 1;
1158 
1159     __mxxxi* H_row = &(pimpl_->H[i * matrix_width]);
1160     __mxxxi* F_row = &(pimpl_->F[i * matrix_width]);
1161 
    // row 0 is the virtual start row used for nodes without predecessors
1162     std::uint32_t pred_i = it->inedges.empty() ?
1163         0 : pimpl_->node_id_to_rank[it->inedges[0]->tail->id] + 1;
1164 
1165     __mxxxi* H_pred_row = &(pimpl_->H[pred_i * matrix_width]);
1166     __mxxxi* F_pred_row = &(pimpl_->F[pred_i * matrix_width]);
1167 
    // x carries the predecessor row's cell just left of the current vector
1168     __mxxxi x = _mmxxx_srli_si(
1169         T::_mmxxx_set1_epi(pimpl_->first_column[pred_i]),
1170         T::kRSS);
1171 
1172     for (std::uint64_t j = 0; j < matrix_width; ++j) {
1173       // update F
1174       F_row[j] = T::_mmxxx_add_epi(
1175           T::_mmxxx_max_epi(
1176               T::_mmxxx_add_epi(H_pred_row[j], g),
1177               F_pred_row[j]),
1178           e);
1179 
1180       // update H
1181       H_row[j] = T::_mmxxx_add_epi(
1182           _mmxxx_or_si(
1183               _mmxxx_slli_si(H_pred_row[j], T::kLSS),
1184               x),
1185           char_profile[j]);
1186       x = _mmxxx_srli_si(H_pred_row[j], T::kRSS);
1187     }
1188     // check other predecessors
1189     for (std::uint32_t p = 1; p < it->inedges.size(); ++p) {
1190       pred_i = pimpl_->node_id_to_rank[it->inedges[p]->tail->id] + 1;
1191 
1192       H_pred_row = &(pimpl_->H[pred_i * matrix_width]);
1193       F_pred_row = &(pimpl_->F[pred_i * matrix_width]);
1194 
1195       x = _mmxxx_srli_si(
1196           T::_mmxxx_set1_epi(pimpl_->first_column[pred_i]),
1197           T::kRSS);
1198 
1199       for (std::uint64_t j = 0; j < matrix_width; ++j) {
1200         // update F
1201         F_row[j] = T::_mmxxx_max_epi(
1202             F_row[j],
1203             T::_mmxxx_add_epi(
1204                 T::_mmxxx_max_epi(
1205                     T::_mmxxx_add_epi(H_pred_row[j], g),
1206                     F_pred_row[j]),
1207                 e));
1208 
1209         // update H
1210         H_row[j] = T::_mmxxx_max_epi(
1211             H_row[j],
1212             T::_mmxxx_add_epi(
1213                 _mmxxx_or_si(
1214                     _mmxxx_slli_si(H_pred_row[j], T::kLSS),
1215                     x),
1216                 char_profile[j]));
1217         x = _mmxxx_srli_si(H_pred_row[j], T::kRSS);
1218       }
1219     }
1220 
    // horizontal pass: compute E via the prefix max and fold it into H,
    // carrying the last lane of each vector into the next one via x
1221     __mxxxi* E_row = &(pimpl_->E[i * matrix_width]);
1222     __mxxxi score = zeroes;
1223     x = T::_mmxxx_set1_epi(pimpl_->first_column[i]);
1224 
1225     for (std::uint64_t j = 0; j < matrix_width; ++j) {
1226       H_row[j] = T::_mmxxx_max_epi(H_row[j], F_row[j]);
1227 
1228       E_row[j] = T::_mmxxx_add_epi(
1229           T::_mmxxx_add_epi(
1230               _mmxxx_or_si(
1231                   _mmxxx_slli_si(H_row[j], T::kLSS),
1232                   _mmxxx_srli_si(x, T::kRSS)),
1233               g),
1234           e);
1235 
1236       T::_mmxxx_prefix_max(E_row[j], pimpl_->masks, pimpl_->penalties);
1237 
1238       H_row[j] = T::_mmxxx_max_epi(H_row[j], E_row[j]);
1239       x = T::_mmxxx_max_epi(
1240           H_row[j],
1241           T::_mmxxx_sub_epi(E_row[j], g));
1242 
1243       if (type_ == AlignmentType::kSW) {
1244         H_row[j] = T::_mmxxx_max_epi(H_row[j], zeroes);
1245       }
1246       score = T::_mmxxx_max_epi(score, H_row[j]);
1247     }
1248 
    // track the best cell according to the alignment type: anywhere for SW,
    // any column of sink nodes for OV, last column of sink nodes for NW
1249     if (type_ == AlignmentType::kSW) {
1250       std::int32_t max_row_score = _mmxxx_max_value<A, T>(score);
1251       if (max_score < max_row_score) {
1252         max_score = max_row_score;
1253         max_i = i;
1254       }
1255     } else if (type_ == AlignmentType::kOV) {
1256       if (it->outedges.empty()) {
1257         std::int32_t max_row_score = _mmxxx_max_value<A, T>(score);
1258         if (max_score < max_row_score) {
1259           max_score = max_row_score;
1260           max_i = i;
1261         }
1262       }
1263     } else if (type_ == AlignmentType::kNW) {
1264       if (it->outedges.empty()) {
1265         std::int32_t max_row_score = _mmxxx_value_at<A, T>(
1266             H_row[matrix_width - 1],
1267             last_column_id);
1268         if (max_score < max_row_score) {
1269           max_score = max_row_score;
1270           max_i = i;
1271         }
1272       }
1273     }
1274   }
1275 
1276   if (max_i == -1 && max_j == -1) {
1277     return Alignment();
1278   }
1279   if (score) {
1280     *score = max_score;
1281   }
1282 
  // locate the column of the maximum inside the chosen row
1283   if (type_ == AlignmentType::kSW) {
1284     max_j = _mmxxx_index_of<A, T>(
1285         &(pimpl_->H[max_i * matrix_width]),
1286         matrix_width, max_score);
1287   } else if (type_ == AlignmentType::kOV) {
1288     if (rank_to_node[max_i - 1]->outedges.empty()) {
1289       max_j = _mmxxx_index_of<A, T>(
1290           &(pimpl_->H[max_i * matrix_width]),
1291           matrix_width, max_score);
1292     } else {
1293       max_j = normal_matrix_width - 1;
1294     }
1295   } else if (type_ == AlignmentType::kNW) {
1296     max_j = normal_matrix_width - 1;
1297   }
1298 
1299   // backtrack
  // aligned scratch buffers unpack whole SIMD vectors of H, E, F (current
  // cell, predecessors, left neighbour) and the query profile so individual
  // lanes can be read as scalars during backtracking
1300   std::uint32_t max_num_predecessors = 1;
1301   for (std::uint32_t i = 0; i < static_cast<std::uint32_t>(max_i); ++i) {
1302     max_num_predecessors = std::max(
1303         max_num_predecessors,
1304         static_cast<std::uint32_t>(rank_to_node[i]->inedges.size()));
1305   }
1306 
1307   typename T::type* backtrack_storage = nullptr;
1308   typename T::type* H = AllocateAlignedMemory<A>(
1309       &backtrack_storage,
1310       6 * T::kNumVar + 3 * T::kNumVar * max_num_predecessors,
1311       kRegisterSize / 8);
1312   typename T::type* H_pred = H + T::kNumVar;
1313   typename T::type* H_diag_pred = H_pred + T::kNumVar * max_num_predecessors;
1314   typename T::type* H_left = H_diag_pred + T::kNumVar * max_num_predecessors;
1315   typename T::type* F = H_left + T::kNumVar;
1316   typename T::type* F_pred = F + T::kNumVar;
1317   typename T::type* E = F_pred + T::kNumVar * max_num_predecessors;
1318   typename T::type* E_left = E + T::kNumVar;
1319   typename T::type* profile = E_left + T::kNumVar;
1320 
1321   std::vector<std::uint32_t> predecessors;
1322 
1323   std::int32_t i = max_i;
1324   std::int32_t j = max_j;
1325   std::int32_t prev_i = 0, prev_j = 0;
1326 
  // j_div selects the SIMD vector in the row, j_mod the lane within it
1327   std::uint32_t j_div = j / T::kNumVar;
1328   std::uint32_t j_mod = j % T::kNumVar;
1329 
1330   bool load_next_segment = true;
1331 
1332   Alignment alignment;
1333 
1334   do {
1335     // check stop condition
1336     if (j == -1 || i == 0) {
1337       break;
1338     }
1339 
1340     const auto& it = rank_to_node[i - 1];
1341     // load everything
1342     if (load_next_segment) {
1343       predecessors.clear();
1344 
1345       // load current cells
1346       _mmxxx_store_si(
1347           reinterpret_cast<__mxxxi*>(H),
1348           pimpl_->H[i * matrix_width + j_div]);
1349       _mmxxx_store_si(
1350           reinterpret_cast<__mxxxi*>(E),
1351           pimpl_->E[i * matrix_width + j_div]);
1352 
1353       // load predecessors cells
1354       if (it->inedges.empty()) {
1355         predecessors.emplace_back(0);
1356         _mmxxx_store_si(reinterpret_cast<__mxxxi*>(H_pred), pimpl_->H[j_div]);
1357         _mmxxx_store_si(reinterpret_cast<__mxxxi*>(F_pred), pimpl_->F[j_div]);
1358       } else {
1359         std::uint32_t store_pos = 0;
1360         for (const auto& jt : it->inedges) {
1361           predecessors.emplace_back(pimpl_->node_id_to_rank[jt->tail->id] + 1);
1362           _mmxxx_store_si(
1363               reinterpret_cast<__mxxxi*>(&H_pred[store_pos * T::kNumVar]),
1364               pimpl_->H[predecessors.back() * matrix_width + j_div]);
1365           _mmxxx_store_si(
1366               reinterpret_cast<__mxxxi*>(&F_pred[store_pos * T::kNumVar]),
1367               pimpl_->F[predecessors.back() * matrix_width + j_div]);
1368           ++store_pos;
1369         }
1370       }
1371 
1372       // load query profile cells
1373       _mmxxx_store_si(
1374           reinterpret_cast<__mxxxi*>(profile),
1375           pimpl_->sequence_profile[it->code * matrix_width + j_div]);
1376     }
1377 
1378     // check stop condition
1379     if (type_ == AlignmentType::kSW && H[j_mod] == 0) {
1380       break;
1381     }
1382 
1383     if (j_mod == 0) {
1384       // border case
      // the diagonal/left cells live in the previous vector (or, for the
      // very first vector, in the dedicated first_column buffer)
1385       if (j_div > 0) {
1386         for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
1387           _mmxxx_store_si(
1388               reinterpret_cast<__mxxxi*>(&H_diag_pred[p * T::kNumVar]),
1389               pimpl_->H[predecessors[p] * matrix_width + (j_div - 1)]);
1390         }
1391         _mmxxx_store_si(
1392             reinterpret_cast<__mxxxi*>(H_left),
1393             pimpl_->H[i * matrix_width + j_div - 1]);
1394         _mmxxx_store_si(
1395             reinterpret_cast<__mxxxi*>(E_left),
1396             pimpl_->E[i * matrix_width + j_div - 1]);
1397       } else {
1398         for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
1399           H_diag_pred[(p + 1) * T::kNumVar - 1] =
1400               pimpl_->first_column[predecessors[p]];
1401         }
1402         H_left[T::kNumVar - 1] = pimpl_->first_column[i];
1403         E_left[T::kNumVar - 1] = pimpl_->first_column[i];
1404       }
1405     }
1406 
1407     // find best predecessor cell
    // extend_left / extend_up record whether the move came from the E / F
    // matrix, i.e. whether a gap extension run must be followed afterwards
1408     bool predecessor_found = false, extend_left = false, extend_up = false;
1409 
    // move 1: diagonal (match/mismatch against each predecessor row)
1410     if (i != 0) {
1411       for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
1412         if ((j_mod == 0 && H[j_mod] == H_diag_pred[(p + 1) * T::kNumVar - 1] + profile[j_mod]) ||  // NOLINT
1413             (j_mod != 0 && H[j_mod] == H_pred[p * T::kNumVar + j_mod - 1] + profile[j_mod])) {  // NOLINT
1414           prev_i = predecessors[p];
1415           prev_j = j - 1;
1416           predecessor_found = true;
1417           break;
1418         }
1419       }
1420     }
1421 
    // move 2: vertical (gap in the sequence; extension checked via F_pred)
1422     if (!predecessor_found && i != 0) {
1423       for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
1424         if ((extend_up = H[j_mod] == F_pred[p * T::kNumVar + j_mod] + e_) ||
1425                          H[j_mod] == H_pred[p * T::kNumVar + j_mod] + g_) {
1426           prev_i = predecessors[p];
1427           prev_j = j;
1428           predecessor_found = true;
1429           break;
1430         }
1431       }
1432     }
1433 
    // move 3: horizontal (gap in the graph; extension checked via E)
1434     if (!predecessor_found) {
1435       if ((j_mod != 0 && ((extend_left = H[j_mod] == E[j_mod - 1] + e_) ||
1436                                          H[j_mod] == H[j_mod - 1] + g_)) ||
1437           (j_mod == 0 && ((extend_left = H[j_mod] == E_left[T::kNumVar - 1] + e_ ) ||  // NOLINT
1438                                          H[j_mod] == H_left[T::kNumVar - 1] + g_))) {  // NOLINT
1439         prev_i = i;
1440         prev_j = j - 1;
1441         predecessor_found = true;
1442       }
1443     }
1444 
1445     alignment.emplace_back(
1446         i == prev_i ? -1 : rank_to_node[i - 1]->id,
1447         j == prev_j ? -1 : j);
1448 
1449     // update for next round
    // reload SIMD segments only when the row changed or the walk crossed
    // into the previous vector of the same row
1450     load_next_segment =
1451         (i == prev_i ? false : true) ||
1452         (j != prev_j && prev_j % T::kNumVar == T::kNumVar - 1 ? true : false);
1453 
1454     i = prev_i;
1455     j = prev_j;
1456     j_div = j / T::kNumVar;
1457     j_mod = j % T::kNumVar;
1458 
    // follow the remainder of a horizontal gap-extension run through E
1459     if (extend_left) {
1460       while (true) {
1461         // load
1462         if (j_mod == T::kNumVar - 1) {
1463           _mmxxx_store_si(
1464               reinterpret_cast<__mxxxi*>(E),
1465               pimpl_->E[i * matrix_width + j_div]);
1466         } else if (j_mod == 0) {  // border case
1467           if (j_div > 0) {
1468             _mmxxx_store_si(
1469                 reinterpret_cast<__mxxxi*>(E_left),
1470                 pimpl_->E[i * matrix_width + j_div - 1]);
1471           }
1472         }
1473 
1474         alignment.emplace_back(-1, j);
1475         --j;
1476         j_div = j / T::kNumVar;
1477         j_mod = j % T::kNumVar;
1478         if ((j == -1) ||
1479             (j_mod != T::kNumVar - 1 && E[j_mod] + e_ != E[j_mod + 1]) ||
1480             (j_mod == T::kNumVar - 1 && E_left[j_mod] + e_ != E[0])) {
1481           break;
1482         }
1483       }
1484       load_next_segment = true;
    // follow the remainder of a vertical gap-extension run through F
1485     } else if (extend_up) {
1486       while (true) {
1487         // load
1488         _mmxxx_store_si(
1489             reinterpret_cast<__mxxxi*>(F),
1490             pimpl_->F[i * matrix_width + j_div]);
1491 
1492         prev_i = 0;
1493         predecessors.clear();
1494         std::uint32_t store_pos = 0;
1495         for (const auto& it : rank_to_node[i - 1]->inedges) {
1496           predecessors.emplace_back(pimpl_->node_id_to_rank[it->tail->id] + 1);
1497           _mmxxx_store_si(
1498               reinterpret_cast<__mxxxi*>(&H_pred[store_pos * T::kNumVar]),
1499               pimpl_->H[predecessors.back() * matrix_width + j_div]);
1500           _mmxxx_store_si(
1501               reinterpret_cast<__mxxxi*>(&F_pred[store_pos * T::kNumVar]),
1502               pimpl_->F[predecessors.back() * matrix_width + j_div]);
1503           ++store_pos;
1504         }
1505 
        // stop once the gap open (through H) that started the run is found
1506         bool stop = false;
1507         for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
1508           if ((stop = F[j_mod] == H_pred[p * T::kNumVar + j_mod] + g_) ||
1509                       F[j_mod] == F_pred[p * T::kNumVar + j_mod] + e_) {
1510             prev_i = predecessors[p];
1511             break;
1512           }
1513         }
1514 
1515         alignment.emplace_back(rank_to_node[i - 1]->id, -1);
1516         i = prev_i;
1517 
1518         if (stop || i == 0) {
1519           break;
1520         }
1521       }
1522     }
1523   } while (true);
1524 
1525   delete[] backtrack_storage;
1526 
1527   // update alignment for NW (backtrack stops on first row or column)
1528   if (type_ == AlignmentType::kNW) {
1529     while (i == 0 && j != -1) {
1530       alignment.emplace_back(-1, j);
1531       --j;
1532     }
1533     while (i != 0 && j == -1) {
1534       alignment.emplace_back(rank_to_node[i - 1]->id, -1);
1535 
1536       const auto& it = rank_to_node[i - 1];
1537       if (it->inedges.empty()) {
1538         i = 0;
1539       } else {
        // first-column steps differ by the extension penalty e_ here
1540         for (const auto& jt : it->inedges) {
1541           std::uint32_t pred_i = pimpl_->node_id_to_rank[jt->tail->id] + 1;
1542           if (pimpl_->first_column[i] == pimpl_->first_column[pred_i] + e_) {
1543             i = pred_i;
1544             break;
1545           }
1546         }
1547       }
1548     }
1549   }
1550 
  // the path was built back-to-front
1551   std::reverse(alignment.begin(), alignment.end());
1552   return alignment;
1553 #else
1554   return Alignment();
1555 #endif
1556 }
1557 
1558 template<Architecture A> template <typename T>
Convex(std::uint32_t sequence_len,const Graph & graph,std::int32_t * score)1559 Alignment SimdAlignmentEngine<A>::Convex(
1560     std::uint32_t sequence_len,
1561     const Graph& graph,
1562     std::int32_t* score) noexcept {
1563 #if defined(__AVX2__) || defined(__SSE4_1__) || defined(USE_SIMDE)
1564   std::uint64_t normal_matrix_width = sequence_len;
1565   std::uint64_t matrix_width =
1566       std::ceil(static_cast<double>(sequence_len) / T::kNumVar);
1567   std::uint64_t matrix_height = graph.nodes().size() + 1;
1568   const auto& rank_to_node = graph.rank_to_node();
1569 
1570   typename T::type kNegativeInfinity =
1571       std::numeric_limits<typename T::type>::min() + 1024;
1572 
1573   typename T::type max_score = type_ == AlignmentType::kSW ? 0 : kNegativeInfinity;  // NOLINT
1574   std::int32_t max_i = -1;
1575   std::int32_t max_j = -1;
1576   std::uint32_t last_column_id = (normal_matrix_width - 1) % T::kNumVar;
1577   __mxxxi zeroes = T::_mmxxx_set1_epi(0);
1578   __mxxxi g = T::_mmxxx_set1_epi(g_ - e_);
1579   __mxxxi e = T::_mmxxx_set1_epi(e_);
1580   __mxxxi q = T::_mmxxx_set1_epi(q_ - c_);
1581   __mxxxi c = T::_mmxxx_set1_epi(c_);
1582 
1583   __attribute__((aligned(kRegisterSize / 8))) typename T::type unpacked[T::kNumVar] = {0};  // NOLINT
1584 
1585   for (std::uint32_t i = 0, j = 0; i < T::kNumVar && j < T::kLogNumVar; ++i) {
1586     unpacked[i] = kNegativeInfinity;
1587     if ((i & (i + 1)) == 0) {
1588       pimpl_->masks[j++] = _mmxxx_load_si(
1589           reinterpret_cast<const __mxxxi*>(unpacked));
1590     }
1591   }
1592   pimpl_->masks[T::kLogNumVar] = _mmxxx_slli_si(
1593       T::_mmxxx_set1_epi(kNegativeInfinity),
1594       T::kLSS);
1595 
1596   pimpl_->penalties[0] = T::_mmxxx_set1_epi(e_);
1597   for (std::uint32_t i = 1; i < T::kLogNumVar; ++i) {
1598     pimpl_->penalties[i] = T::_mmxxx_add_epi(
1599         pimpl_->penalties[i - 1],
1600         pimpl_->penalties[i - 1]);
1601   }
1602   pimpl_->penalties[T::kLogNumVar] = T::_mmxxx_set1_epi(c_);
1603   for (std::uint32_t i = T::kLogNumVar + 1; i < 2 * T::kLogNumVar; ++i) {
1604     pimpl_->penalties[i] = T::_mmxxx_add_epi(
1605         pimpl_->penalties[i - 1],
1606         pimpl_->penalties[i - 1]);
1607   }
1608 
1609   // alignment
1610   for (const auto& it : rank_to_node) {
1611     __mxxxi* char_profile = &(pimpl_->sequence_profile[it->code * matrix_width]);  // NOLINT
1612 
1613     std::uint32_t i = pimpl_->node_id_to_rank[it->id] + 1;
1614 
1615     __mxxxi* H_row = &(pimpl_->H[i * matrix_width]);
1616     __mxxxi* F_row = &(pimpl_->F[i * matrix_width]);
1617     __mxxxi* O_row = &(pimpl_->O[i * matrix_width]);
1618 
1619     std::uint32_t pred_i = it->inedges.empty() ?
1620         0 : pimpl_->node_id_to_rank[it->inedges[0]->tail->id] + 1;
1621 
1622     __mxxxi* H_pred_row = &(pimpl_->H[pred_i * matrix_width]);
1623     __mxxxi* F_pred_row = &(pimpl_->F[pred_i * matrix_width]);
1624     __mxxxi* O_pred_row = &(pimpl_->O[pred_i * matrix_width]);
1625 
1626     __mxxxi x = _mmxxx_srli_si(
1627         T::_mmxxx_set1_epi(pimpl_->first_column[pred_i]),
1628         T::kRSS);
1629 
1630     for (std::uint64_t j = 0; j < matrix_width; ++j) {
1631       // update F
1632       F_row[j] = T::_mmxxx_add_epi(
1633           T::_mmxxx_max_epi(
1634               T::_mmxxx_add_epi(H_pred_row[j], g),
1635               F_pred_row[j]),
1636           e);
1637 
1638       // update O
1639       O_row[j] = T::_mmxxx_add_epi(
1640           T::_mmxxx_max_epi(
1641               T::_mmxxx_add_epi(H_pred_row[j], q),
1642               O_pred_row[j]),
1643           c);
1644 
1645       // update H
1646       H_row[j] = T::_mmxxx_add_epi(
1647           _mmxxx_or_si(
1648               _mmxxx_slli_si(H_pred_row[j], T::kLSS),
1649               x),
1650           char_profile[j]);
1651       x = _mmxxx_srli_si(H_pred_row[j], T::kRSS);
1652     }
1653     // check other predecessors
1654     for (std::uint32_t p = 1; p < it->inedges.size(); ++p) {
1655       pred_i = pimpl_->node_id_to_rank[it->inedges[p]->tail->id] + 1;
1656 
1657       H_pred_row = &(pimpl_->H[pred_i * matrix_width]);
1658       F_pred_row = &(pimpl_->F[pred_i * matrix_width]);
1659       O_pred_row = &(pimpl_->O[pred_i * matrix_width]);
1660 
1661       x = _mmxxx_srli_si(
1662           T::_mmxxx_set1_epi(pimpl_->first_column[pred_i]),
1663           T::kRSS);
1664 
1665       for (std::uint64_t j = 0; j < matrix_width; ++j) {
1666         // update F
1667         F_row[j] = T::_mmxxx_max_epi(
1668             F_row[j],
1669             T::_mmxxx_add_epi(
1670                 T::_mmxxx_max_epi(
1671                     T::_mmxxx_add_epi(H_pred_row[j], g),
1672                     F_pred_row[j]),
1673                 e));
1674 
1675         // update O
1676         O_row[j] = T::_mmxxx_max_epi(
1677             O_row[j],
1678             T::_mmxxx_add_epi(
1679                 T::_mmxxx_max_epi(
1680                     T::_mmxxx_add_epi(H_pred_row[j], q),
1681                     O_pred_row[j]),
1682                 c));
1683 
1684         // update H
1685         H_row[j] = T::_mmxxx_max_epi(
1686             H_row[j],
1687             T::_mmxxx_add_epi(
1688                 _mmxxx_or_si(
1689                     _mmxxx_slli_si(H_pred_row[j], T::kLSS),
1690                     x),
1691                 char_profile[j]));
1692 
1693         x = _mmxxx_srli_si(H_pred_row[j], T::kRSS);
1694       }
1695     }
1696 
1697     __mxxxi* E_row = &(pimpl_->E[i * matrix_width]);
1698     __mxxxi* Q_row = &(pimpl_->Q[i * matrix_width]);
1699 
1700     x = T::_mmxxx_set1_epi(pimpl_->first_column[i]);
1701     __mxxxi y = T::_mmxxx_set1_epi(pimpl_->first_column[i]);
1702 
1703     __mxxxi score = zeroes;
1704 
1705     for (std::uint64_t j = 0; j < matrix_width; ++j) {
1706       H_row[j] = T::_mmxxx_max_epi(
1707           H_row[j],
1708           T::_mmxxx_max_epi(F_row[j], O_row[j]));
1709 
1710       E_row[j] = T::_mmxxx_add_epi(
1711           T::_mmxxx_add_epi(
1712               _mmxxx_or_si(
1713                   _mmxxx_slli_si(H_row[j], T::kLSS),
1714                   _mmxxx_srli_si(x, T::kRSS)),
1715               g),
1716           e);
1717 
1718       T::_mmxxx_prefix_max(E_row[j], pimpl_->masks, pimpl_->penalties);
1719 
1720       Q_row[j] = T::_mmxxx_add_epi(
1721           T::_mmxxx_add_epi(
1722               _mmxxx_or_si(
1723                   _mmxxx_slli_si(H_row[j], T::kLSS),
1724                   _mmxxx_srli_si(y, T::kRSS)),
1725               q),
1726           c);
1727 
1728       T::_mmxxx_prefix_max(Q_row[j], pimpl_->masks, &pimpl_->penalties[T::kLogNumVar]);  // NOLINT
1729 
1730       H_row[j] = T::_mmxxx_max_epi(
1731           H_row[j],
1732           T::_mmxxx_max_epi(E_row[j], Q_row[j]));
1733 
1734       x = T::_mmxxx_max_epi(
1735           H_row[j],
1736           T::_mmxxx_sub_epi(E_row[j], g));
1737 
1738       y = T::_mmxxx_max_epi(
1739           H_row[j],
1740           T::_mmxxx_sub_epi(Q_row[j], q));
1741 
1742       if (type_ == AlignmentType::kSW) {
1743         H_row[j] = T::_mmxxx_max_epi(H_row[j], zeroes);
1744       }
1745       score = T::_mmxxx_max_epi(score, H_row[j]);
1746     }
1747 
1748     if (type_ == AlignmentType::kSW) {
1749       std::int32_t max_row_score = _mmxxx_max_value<A, T>(score);
1750       if (max_score < max_row_score) {
1751         max_score = max_row_score;
1752         max_i = i;
1753       }
1754     } else if (type_ == AlignmentType::kOV) {
1755       if (it->outedges.empty()) {
1756         std::int32_t max_row_score = _mmxxx_max_value<A, T>(score);
1757         if (max_score < max_row_score) {
1758           max_score = max_row_score;
1759           max_i = i;
1760         }
1761       }
1762     } else if (type_ == AlignmentType::kNW) {
1763       if (it->outedges.empty()) {
1764         std::int32_t max_row_score = _mmxxx_value_at<A, T>(
1765             H_row[matrix_width - 1],
1766             last_column_id);
1767         if (max_score < max_row_score) {
1768           max_score = max_row_score;
1769           max_i = i;
1770         }
1771       }
1772     }
1773   }
1774 
1775   if (max_i == -1 && max_j == -1) {
1776     return Alignment();
1777   }
1778   if (score) {
1779     *score = max_score;
1780   }
1781 
1782   if (type_ == AlignmentType::kSW) {
1783     max_j = _mmxxx_index_of<A, T>(
1784         &(pimpl_->H[max_i * matrix_width]),
1785         matrix_width, max_score);
1786   } else if (type_ == AlignmentType::kOV) {
1787     if (rank_to_node[max_i - 1]->outedges.empty()) {
1788       max_j = _mmxxx_index_of<A, T>(
1789           &(pimpl_->H[max_i * matrix_width]),
1790           matrix_width, max_score);
1791     } else {
1792       max_j = normal_matrix_width - 1;
1793     }
1794   } else if (type_ == AlignmentType::kNW) {
1795     max_j = normal_matrix_width - 1;
1796   }
1797 
1798   // backtrack
1799   std::uint32_t max_num_predecessors = 1;
1800   for (std::uint32_t i = 0; i < static_cast<std::uint32_t>(max_i); ++i) {
1801     max_num_predecessors = std::max(
1802         max_num_predecessors,
1803         static_cast<std::uint32_t>(rank_to_node[i]->inedges.size()));
1804   }
1805 
1806   typename T::type* backtrack_storage = nullptr;
1807   typename T::type* H = AllocateAlignedMemory<A>(
1808       &backtrack_storage,
1809       9 * T::kNumVar + 4 * T::kNumVar * max_num_predecessors,
1810       kRegisterSize / 8);
1811   typename T::type* H_pred = H + T::kNumVar;
1812   typename T::type* H_diag_pred = H_pred + T::kNumVar * max_num_predecessors;
1813   typename T::type* H_left = H_diag_pred + T::kNumVar * max_num_predecessors;
1814   typename T::type* F = H_left + T::kNumVar;
1815   typename T::type* F_pred = F + T::kNumVar;
1816   typename T::type* O = F_pred + T::kNumVar * max_num_predecessors;
1817   typename T::type* O_pred = O + T::kNumVar;
1818   typename T::type* E = O_pred + T::kNumVar * max_num_predecessors;
1819   typename T::type* E_left = E + T::kNumVar;
1820   typename T::type* Q = E_left + T::kNumVar;
1821   typename T::type* Q_left = Q + T::kNumVar;
1822   typename T::type* profile = Q_left + T::kNumVar;
1823 
1824   std::vector<std::uint32_t> predecessors;
1825 
1826   std::int32_t i = max_i;
1827   std::int32_t j = max_j;
1828   std::int32_t prev_i = 0, prev_j = 0;
1829 
1830   std::uint32_t j_div = j / T::kNumVar;
1831   std::uint32_t j_mod = j % T::kNumVar;
1832 
1833   bool load_next_segment = true;
1834 
1835   Alignment alignment;
1836 
1837   do {
1838     // check stop condition
1839     if (j == -1 || i == 0) {
1840       break;
1841     }
1842 
1843     const auto& it = rank_to_node[i - 1];
1844     // load everything
1845     if (load_next_segment) {
1846       predecessors.clear();
1847 
1848       // load current cells
1849       _mmxxx_store_si(
1850           reinterpret_cast<__mxxxi*>(H),
1851           pimpl_->H[i * matrix_width + j_div]);
1852       _mmxxx_store_si(
1853           reinterpret_cast<__mxxxi*>(E),
1854           pimpl_->E[i * matrix_width + j_div]);
1855       _mmxxx_store_si(
1856           reinterpret_cast<__mxxxi*>(Q),
1857           pimpl_->Q[i * matrix_width + j_div]);
1858 
1859       // load predecessors cells
1860       if (it->inedges.empty()) {
1861         predecessors.emplace_back(0);
1862         _mmxxx_store_si(reinterpret_cast<__mxxxi*>(H_pred), pimpl_->H[j_div]);
1863         _mmxxx_store_si(reinterpret_cast<__mxxxi*>(F_pred), pimpl_->F[j_div]);
1864         _mmxxx_store_si(reinterpret_cast<__mxxxi*>(O_pred), pimpl_->O[j_div]);
1865       } else {
1866         std::uint32_t store_pos = 0;
1867         for (const auto& jt : it->inedges) {
1868           predecessors.emplace_back(pimpl_->node_id_to_rank[jt->tail->id] + 1);
1869           _mmxxx_store_si(
1870               reinterpret_cast<__mxxxi*>(&H_pred[store_pos * T::kNumVar]),
1871               pimpl_->H[predecessors.back() * matrix_width + j_div]);
1872           _mmxxx_store_si(
1873               reinterpret_cast<__mxxxi*>(&F_pred[store_pos * T::kNumVar]),
1874               pimpl_->F[predecessors.back() * matrix_width + j_div]);
1875           _mmxxx_store_si(
1876               reinterpret_cast<__mxxxi*>(&O_pred[store_pos * T::kNumVar]),
1877               pimpl_->O[predecessors.back() * matrix_width + j_div]);
1878           ++store_pos;
1879         }
1880       }
1881 
1882       // load query profile cells
1883       _mmxxx_store_si(
1884           reinterpret_cast<__mxxxi*>(profile),
1885           pimpl_->sequence_profile[it->code * matrix_width + j_div]);
1886     }
1887 
1888     // check stop condition
1889     if (type_ == AlignmentType::kSW && H[j_mod] == 0) {
1890       break;
1891     }
1892 
1893     if (j_mod == 0) {
1894       // border case
1895       if (j_div > 0) {
1896         for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
1897           _mmxxx_store_si(
1898               reinterpret_cast<__mxxxi*>(&H_diag_pred[p * T::kNumVar]),
1899               pimpl_->H[predecessors[p] * matrix_width + (j_div - 1)]);
1900         }
1901         _mmxxx_store_si(
1902             reinterpret_cast<__mxxxi*>(H_left),
1903             pimpl_->H[i * matrix_width + j_div - 1]);
1904         _mmxxx_store_si(
1905             reinterpret_cast<__mxxxi*>(E_left),
1906             pimpl_->E[i * matrix_width + j_div - 1]);
1907         _mmxxx_store_si(
1908             reinterpret_cast<__mxxxi*>(Q_left),
1909             pimpl_->Q[i * matrix_width + j_div - 1]);
1910       } else {
1911         for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
1912           H_diag_pred[(p + 1) * T::kNumVar - 1] = pimpl_->first_column[predecessors[p]];  // NOLINT
1913         }
1914         H_left[T::kNumVar - 1] = pimpl_->first_column[i];
1915         E_left[T::kNumVar - 1] = pimpl_->first_column[i];
1916         Q_left[T::kNumVar - 1] = pimpl_->first_column[i];
1917       }
1918     }
1919 
1920     // find best predecessor cell
1921     bool predecessor_found = false, extend_left = false, extend_up = false;
1922 
1923     if (i != 0) {
1924       for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
1925         if ((j_mod == 0 && H[j_mod] == H_diag_pred[(p + 1) * T::kNumVar - 1] + profile[j_mod]) ||  // NOLINT
1926             (j_mod != 0 && H[j_mod] == H_pred[p * T::kNumVar + j_mod - 1] + profile[j_mod])) {  // NOLINT
1927           prev_i = predecessors[p];
1928           prev_j = j - 1;
1929           predecessor_found = true;
1930           break;
1931         }
1932       }
1933     }
1934 
1935     if (!predecessor_found && i != 0) {
1936       for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
1937         if ((extend_up = H[j_mod] == F_pred[p * T::kNumVar + j_mod] + e_) ||
1938                          H[j_mod] == H_pred[p * T::kNumVar + j_mod] + g_  ||
1939             (extend_up = H[j_mod] == O_pred[p * T::kNumVar + j_mod] + c_) ||
1940                          H[j_mod] == H_pred[p * T::kNumVar + j_mod] + q_) {
1941           prev_i = predecessors[p];
1942           prev_j = j;
1943           predecessor_found = true;
1944           break;
1945         }
1946       }
1947     }
1948 
1949     if (!predecessor_found) {
1950       if ((j_mod != 0 && ((extend_left = H[j_mod] == E[j_mod - 1] + e_)  ||
1951                                          H[j_mod] == H[j_mod - 1] + g_   ||
1952                           (extend_left = H[j_mod] == Q[j_mod - 1] + c_)  ||
1953                                          H[j_mod] == H[j_mod - 1] + q_)) ||
1954           (j_mod == 0 && ((extend_left = H[j_mod] == E_left[T::kNumVar - 1] + e_) ||  // NOLINT
1955                                          H[j_mod] == H_left[T::kNumVar - 1] + g_  ||  // NOLINT
1956                           (extend_left = H[j_mod] == Q_left[T::kNumVar - 1] + c_) ||  // NOLINT
1957                                          H[j_mod] == H_left[T::kNumVar - 1] + q_))) {  // NOLINT
1958         prev_i = i;
1959         prev_j = j - 1;
1960         predecessor_found = true;
1961       }
1962     }
1963 
1964     alignment.emplace_back(
1965         i == prev_i ? -1 : rank_to_node[i - 1]->id,
1966         j == prev_j ? -1 : j);
1967 
1968     // update for next round
1969     load_next_segment =
1970         (i == prev_i ? false : true) ||
1971         (j != prev_j && prev_j % T::kNumVar == T::kNumVar - 1 ? true : false);
1972 
1973     i = prev_i;
1974     j = prev_j;
1975     j_div = j / T::kNumVar;
1976     j_mod = j % T::kNumVar;
1977 
1978     if (extend_left) {
1979       while (true) {
1980         // load
1981         if (j_mod == T::kNumVar - 1) {
1982           _mmxxx_store_si(
1983               reinterpret_cast<__mxxxi*>(E),
1984               pimpl_->E[i * matrix_width + j_div]);
1985           _mmxxx_store_si(
1986               reinterpret_cast<__mxxxi*>(Q),
1987               pimpl_->Q[i * matrix_width + j_div]);
1988         } else if (j_mod == 0) {  // boarder case
1989           if (j_div > 0) {
1990             _mmxxx_store_si(
1991                 reinterpret_cast<__mxxxi*>(E_left),
1992                 pimpl_->E[i * matrix_width + j_div - 1]);
1993             _mmxxx_store_si(
1994                 reinterpret_cast<__mxxxi*>(Q_left),
1995                 pimpl_->Q[i * matrix_width + j_div - 1]);
1996           }
1997         }
1998 
1999         alignment.emplace_back(-1, j);
2000         --j;
2001         j_div = j / T::kNumVar;
2002         j_mod = j % T::kNumVar;
2003         if ((j == -1) ||
2004             (j_mod != T::kNumVar - 1 &&      E[j_mod] + e_ != E[j_mod + 1]) ||
2005             (j_mod == T::kNumVar - 1 && E_left[j_mod] + e_ != E[0])         ||
2006             (j_mod != T::kNumVar - 1 &&      Q[j_mod] + c_ != Q[j_mod + 1]) ||
2007             (j_mod == T::kNumVar - 1 && Q_left[j_mod] + c_ != Q[0])) {
2008           break;
2009         }
2010       }
2011       load_next_segment = true;
2012     } else if (extend_up) {
2013       while (true) {
2014         // load
2015         _mmxxx_store_si(
2016             reinterpret_cast<__mxxxi*>(F),
2017             pimpl_->F[i * matrix_width + j_div]);
2018         _mmxxx_store_si(
2019             reinterpret_cast<__mxxxi*>(O),
2020             pimpl_->O[i * matrix_width + j_div]);
2021 
2022         predecessors.clear();
2023         std::uint32_t store_pos = 0;
2024         for (const auto& it : rank_to_node[i - 1]->inedges) {
2025           predecessors.emplace_back(pimpl_->node_id_to_rank[it->tail->id] + 1);
2026           _mmxxx_store_si(
2027               reinterpret_cast<__mxxxi*>(&H_pred[store_pos * T::kNumVar]),
2028               pimpl_->H[predecessors.back() * matrix_width + j_div]);
2029           _mmxxx_store_si(
2030               reinterpret_cast<__mxxxi*>(&F_pred[store_pos * T::kNumVar]),
2031               pimpl_->F[predecessors.back() * matrix_width + j_div]);
2032           _mmxxx_store_si(
2033               reinterpret_cast<__mxxxi*>(&O_pred[store_pos * T::kNumVar]),
2034               pimpl_->O[predecessors.back() * matrix_width + j_div]);
2035           ++store_pos;
2036         }
2037 
2038         bool stop = true;
2039         prev_i = 0;
2040         for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
2041           if (F[j_mod] == F_pred[p * T::kNumVar + j_mod] + e_ ||
2042               O[j_mod] == O_pred[p * T::kNumVar + j_mod] + c_) {
2043             prev_i = predecessors[p];
2044             stop = false;
2045             break;
2046           }
2047         }
2048         if (stop == true) {
2049           for (std::uint32_t p = 0; p < predecessors.size(); ++p) {
2050             if (F[j_mod] == H_pred[p * T::kNumVar + j_mod] + g_ ||
2051                 O[j_mod] == H_pred[p * T::kNumVar + j_mod] + q_) {
2052               prev_i = predecessors[p];
2053               break;
2054             }
2055           }
2056         }
2057 
2058         alignment.emplace_back(rank_to_node[i - 1]->id, -1);
2059         i = prev_i;
2060 
2061         if (stop || i == 0) {
2062           break;
2063         }
2064       }
2065     }
2066   } while (true);
2067 
2068   delete[] backtrack_storage;
2069 
2070   // update alignment for NW (backtrack stops on first row or column)
2071   if (type_ == AlignmentType::kNW) {
2072     while (i == 0 && j != -1) {
2073       alignment.emplace_back(-1, j);
2074       --j;
2075     }
2076     while (i != 0 && j == -1) {
2077       alignment.emplace_back(rank_to_node[i - 1]->id, -1);
2078 
2079       const auto& it = rank_to_node[i - 1];
2080       if (it->inedges.empty()) {
2081         i = 0;
2082       } else {
2083         for (const auto& jt : it->inedges) {
2084           std::uint32_t pred_i = pimpl_->node_id_to_rank[jt->tail->id] + 1;
2085           if (pimpl_->first_column[matrix_height + i]     == pimpl_->first_column[matrix_height + pred_i] + e_ ||  // NOLINT
2086               pimpl_->first_column[2 * matrix_height + i] == pimpl_->first_column[2 * matrix_height + pred_i] + c_ ) {  // NOLINT
2087             i = pred_i;
2088             break;
2089           }
2090         }
2091       }
2092     }
2093   }
2094 
2095   std::reverse(alignment.begin(), alignment.end());
2096   return alignment;
2097 #else
2098   return Alignment();
2099 #endif
2100 }
2101 
2102 }  // namespace spoa
2103 
2104 #endif  // SIMD_ALIGNMENT_ENGINE_IMPLEMENTATION_HPP_
2105