1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                          License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16 // Copyright (C) 2015, Itseez Inc., all rights reserved.
17 // Third party copyrights are property of their respective owners.
18 //
19 // Redistribution and use in source and binary forms, with or without modification,
20 // are permitted provided that the following conditions are met:
21 //
22 //   * Redistribution's of source code must retain the above copyright notice,
23 //     this list of conditions and the following disclaimer.
24 //
25 //   * Redistribution's in binary form must reproduce the above copyright notice,
26 //     this list of conditions and the following disclaimer in the documentation
27 //     and/or other materials provided with the distribution.
28 //
29 //   * The name of the copyright holders may not be used to endorse or promote products
30 //     derived from this software without specific prior written permission.
31 //
32 // This software is provided by the copyright holders and contributors "as is" and
33 // any express or implied warranties, including, but not limited to, the implied
34 // warranties of merchantability and fitness for a particular purpose are disclaimed.
35 // In no event shall the Intel Corporation or contributors be liable for any direct,
36 // indirect, incidental, special, exemplary, or consequential damages
37 // (including, but not limited to, procurement of substitute goods or services;
38 // loss of use, data, or profits; or business interruption) however caused
39 // and on any theory of liability, whether in contract, strict liability,
40 // or tort (including negligence or otherwise) arising in any way out of
41 // the use of this software, even if advised of the possibility of such damage.
42 //
43 //M*/
44 
45 #ifndef OPENCV_HAL_INTRIN_CPP_HPP
46 #define OPENCV_HAL_INTRIN_CPP_HPP
47 
48 #include <limits>
49 #include <cstring>
50 #include <algorithm>
51 #include "opencv2/core/saturate.hpp"
52 
53 //! @cond IGNORED
54 #define CV_SIMD128_CPP 1
55 #if defined(CV_FORCE_SIMD128_CPP)
56 #define CV_SIMD128 1
57 #define CV_SIMD128_64F 1
58 #endif
59 #if defined(CV_DOXYGEN)
60 #define CV_SIMD128 1
61 #define CV_SIMD128_64F 1
62 #define CV_SIMD256 1
63 #define CV_SIMD256_64F 1
64 #define CV_SIMD512 1
65 #define CV_SIMD512_64F 1
66 #else
67 #define CV_SIMD256 0 // Explicitly disable SIMD256 and SIMD512 support for scalar intrinsic implementation
68 #define CV_SIMD512 0 // to avoid warnings during compilation
69 #endif
70 //! @endcond
71 
72 namespace cv
73 {
74 
75 #ifndef CV_DOXYGEN
76 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
77 #endif
78 
79 /** @addtogroup core_hal_intrin
80 
"Universal intrinsics" is a set of types and functions intended to simplify vectorization of code
across different platforms. Currently a few different SIMD extensions on different architectures are supported.
83 128 bit registers of various types support is implemented for a wide range of architectures
84 including x86(__SSE/SSE2/SSE4.2__), ARM(__NEON__), PowerPC(__VSX__), MIPS(__MSA__).
85 256 bit long registers are supported on x86(__AVX2__) and 512 bit long registers are supported on x86(__AVX512__).
86 In case when there is no SIMD extension available during compilation, fallback C++ implementation of intrinsics
87 will be chosen and code will work as expected although it could be slower.
88 
89 ### Types
90 
There are several types representing vector registers of packed values; each type is
implemented as a structure based on a single SIMD register.
93 
94 - cv::v_uint8 and cv::v_int8: 8-bit integer values (unsigned/signed) - char
95 - cv::v_uint16 and cv::v_int16: 16-bit integer values (unsigned/signed) - short
96 - cv::v_uint32 and cv::v_int32: 32-bit integer values (unsigned/signed) - int
97 - cv::v_uint64 and cv::v_int64: 64-bit integer values (unsigned/signed) - int64
98 - cv::v_float32: 32-bit floating point values (signed) - float
99 - cv::v_float64: 64-bit floating point values (signed) - double
100 
The exact bit length (and lane count) of the listed types is deduced at compile time and depends on the
architecture SIMD capabilities chosen as available during compilation of the library. All the types contain
the __nlanes__ enumeration to query the exact lane count of the type.
104 
105 In case the exact bit length of the type is important it is possible to use specific fixed length register types.
106 
107 There are several types representing 128-bit registers.
108 
109 - cv::v_uint8x16 and cv::v_int8x16: sixteen 8-bit integer values (unsigned/signed) - char
110 - cv::v_uint16x8 and cv::v_int16x8: eight 16-bit integer values (unsigned/signed) - short
111 - cv::v_uint32x4 and cv::v_int32x4: four 32-bit integer values (unsigned/signed) - int
112 - cv::v_uint64x2 and cv::v_int64x2: two 64-bit integer values (unsigned/signed) - int64
113 - cv::v_float32x4: four 32-bit floating point values (signed) - float
114 - cv::v_float64x2: two 64-bit floating point values (signed) - double
115 
116 There are several types representing 256-bit registers.
117 
118 - cv::v_uint8x32 and cv::v_int8x32: thirty two 8-bit integer values (unsigned/signed) - char
119 - cv::v_uint16x16 and cv::v_int16x16: sixteen 16-bit integer values (unsigned/signed) - short
120 - cv::v_uint32x8 and cv::v_int32x8: eight 32-bit integer values (unsigned/signed) - int
121 - cv::v_uint64x4 and cv::v_int64x4: four 64-bit integer values (unsigned/signed) - int64
122 - cv::v_float32x8: eight 32-bit floating point values (signed) - float
123 - cv::v_float64x4: four 64-bit floating point values (signed) - double
124 
125 @note
126 256 bit registers at the moment implemented for AVX2 SIMD extension only, if you want to use this type directly,
127 don't forget to check the CV_SIMD256 preprocessor definition:
128 @code
129 #if CV_SIMD256
130 //...
131 #endif
132 @endcode
133 
134 There are several types representing 512-bit registers.
135 
136 - cv::v_uint8x64 and cv::v_int8x64: sixty four 8-bit integer values (unsigned/signed) - char
137 - cv::v_uint16x32 and cv::v_int16x32: thirty two 16-bit integer values (unsigned/signed) - short
138 - cv::v_uint32x16 and cv::v_int32x16: sixteen 32-bit integer values (unsigned/signed) - int
139 - cv::v_uint64x8 and cv::v_int64x8: eight 64-bit integer values (unsigned/signed) - int64
140 - cv::v_float32x16: sixteen 32-bit floating point values (signed) - float
141 - cv::v_float64x8: eight 64-bit floating point values (signed) - double
142 @note
143 512 bit registers at the moment implemented for AVX512 SIMD extension only, if you want to use this type directly,
144 don't forget to check the CV_SIMD512 preprocessor definition.
145 
146 @note
147 cv::v_float64x2 is not implemented in NEON variant, if you want to use this type, don't forget to
148 check the CV_SIMD128_64F preprocessor definition.
149 
150 ### Load and store operations
151 
These operations allow setting the contents of the register explicitly or loading it from a memory
block, and saving the contents of the register to a memory block.
154 
155 There are variable size register load operations that provide result of maximum available size
156 depending on chosen platform capabilities.
157 - Constructors:
158 @ref v_reg::v_reg(const _Tp *ptr) "from memory",
159 - Other create methods:
160 vx_setall_s8, vx_setall_u8, ...,
161 vx_setzero_u8, vx_setzero_s8, ...
162 - Memory load operations:
163 vx_load, vx_load_aligned, vx_load_low, vx_load_halves,
164 - Memory operations with expansion of values:
165 vx_load_expand, vx_load_expand_q
166 
167 Also there are fixed size register load/store operations.
168 
169 For 128 bit registers
170 - Constructors:
171 @ref v_reg::v_reg(const _Tp *ptr) "from memory",
172 @ref v_reg::v_reg(_Tp s0, _Tp s1) "from two values", ...
173 - Other create methods:
174 @ref v_setall_s8, @ref v_setall_u8, ...,
175 @ref v_setzero_u8, @ref v_setzero_s8, ...
176 - Memory load operations:
177 @ref v_load, @ref v_load_aligned, @ref v_load_low, @ref v_load_halves,
178 - Memory operations with expansion of values:
179 @ref v_load_expand, @ref v_load_expand_q
180 
181 For 256 bit registers(check CV_SIMD256 preprocessor definition)
182 - Constructors:
183 @ref v_reg::v_reg(const _Tp *ptr) "from memory",
184 @ref v_reg::v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) "from four values", ...
185 - Other create methods:
186 @ref v256_setall_s8, @ref v256_setall_u8, ...,
187 @ref v256_setzero_u8, @ref v256_setzero_s8, ...
188 - Memory load operations:
189 @ref v256_load, @ref v256_load_aligned, @ref v256_load_low, @ref v256_load_halves,
190 - Memory operations with expansion of values:
191 @ref v256_load_expand, @ref v256_load_expand_q
192 
193 For 512 bit registers(check CV_SIMD512 preprocessor definition)
194 - Constructors:
195 @ref v_reg::v_reg(const _Tp *ptr) "from memory",
196 @ref v_reg::v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3, _Tp s4, _Tp s5, _Tp s6, _Tp s7) "from eight values", ...
197 - Other create methods:
198 @ref v512_setall_s8, @ref v512_setall_u8, ...,
199 @ref v512_setzero_u8, @ref v512_setzero_s8, ...
200 - Memory load operations:
201 @ref v512_load, @ref v512_load_aligned, @ref v512_load_low, @ref v512_load_halves,
202 - Memory operations with expansion of values:
203 @ref v512_load_expand, @ref v512_load_expand_q
204 
205 Store to memory operations are similar across different platform capabilities:
206 @ref v_store, @ref v_store_aligned,
207 @ref v_store_high, @ref v_store_low
208 
209 ### Value reordering
210 
These operations reorder or recombine the elements of one or multiple vectors.
212 
213 - Interleave, deinterleave (2, 3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
214 - Expand: @ref v_expand, @ref v_expand_low, @ref v_expand_high
215 - Pack: @ref v_pack, @ref v_pack_u, @ref v_pack_b, @ref v_rshr_pack, @ref v_rshr_pack_u,
216 @ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
217 - Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high
218 - Reverse: @ref v_reverse
219 - Extract: @ref v_extract
220 
221 
222 ### Arithmetic, bitwise and comparison operations
223 
224 Element-wise binary and unary operations.
225 
226 - Arithmetics:
227 @ref operator +(const v_reg &a, const v_reg &b) "+",
228 @ref operator -(const v_reg &a, const v_reg &b) "-",
229 @ref operator *(const v_reg &a, const v_reg &b) "*",
230 @ref operator /(const v_reg &a, const v_reg &b) "/",
231 @ref v_mul_expand
232 
233 - Non-saturating arithmetics: @ref v_add_wrap, @ref v_sub_wrap
234 
235 - Bitwise shifts:
236 @ref operator <<(const v_reg &a, int s) "<<",
237 @ref operator >>(const v_reg &a, int s) ">>",
238 @ref v_shl, @ref v_shr
239 
240 - Bitwise logic:
241 @ref operator &(const v_reg &a, const v_reg &b) "&",
242 @ref operator |(const v_reg &a, const v_reg &b) "|",
243 @ref operator ^(const v_reg &a, const v_reg &b) "^",
244 @ref operator ~(const v_reg &a) "~"
245 
246 - Comparison:
247 @ref operator >(const v_reg &a, const v_reg &b) ">",
248 @ref operator >=(const v_reg &a, const v_reg &b) ">=",
249 @ref operator <(const v_reg &a, const v_reg &b) "<",
250 @ref operator <=(const v_reg &a, const v_reg &b) "<=",
251 @ref operator ==(const v_reg &a, const v_reg &b) "==",
252 @ref operator !=(const v_reg &a, const v_reg &b) "!="
253 
254 - min/max: @ref v_min, @ref v_max
255 
256 ### Reduce and mask
257 
258 Most of these operations return only one value.
259 
260 - Reduce: @ref v_reduce_min, @ref v_reduce_max, @ref v_reduce_sum, @ref v_popcount
261 - Mask: @ref v_signmask, @ref v_check_all, @ref v_check_any, @ref v_select
262 
263 ### Other math
264 
265 - Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude
266 - Absolute values: @ref v_abs, @ref v_absdiff, @ref v_absdiffs
267 
268 ### Conversions
269 
270 Different type conversions and casts:
271 
272 - Rounding: @ref v_round, @ref v_floor, @ref v_ceil, @ref v_trunc,
273 - To float: @ref v_cvt_f32, @ref v_cvt_f64
274 - Reinterpret: @ref v_reinterpret_as_u8, @ref v_reinterpret_as_s8, ...
275 
276 ### Matrix operations
277 
278 In these operations vectors represent matrix rows/columns: @ref v_dotprod, @ref v_dotprod_fast,
279 @ref v_dotprod_expand, @ref v_dotprod_expand_fast, @ref v_matmul, @ref v_transpose4x4
280 
281 ### Usability
282 
283 Most operations are implemented only for some subset of the available types, following matrices
284 shows the applicability of different operations to the types.
285 
286 Regular integers:
287 
288 | Operations\\Types | uint 8 | int 8 | uint 16 | int 16 | uint 32 | int 32 |
289 |-------------------|:-:|:-:|:-:|:-:|:-:|:-:|
290 |load, store        | x | x | x | x | x | x |
291 |interleave         | x | x | x | x | x | x |
292 |expand             | x | x | x | x | x | x |
293 |expand_low         | x | x | x | x | x | x |
294 |expand_high        | x | x | x | x | x | x |
295 |expand_q           | x | x |   |   |   |   |
296 |add, sub           | x | x | x | x | x | x |
297 |add_wrap, sub_wrap | x | x | x | x |   |   |
298 |mul_wrap           | x | x | x | x |   |   |
299 |mul                | x | x | x | x | x | x |
300 |mul_expand         | x | x | x | x | x |   |
301 |compare            | x | x | x | x | x | x |
302 |shift              |   |   | x | x | x | x |
303 |dotprod            |   |   |   | x |   | x |
304 |dotprod_fast       |   |   |   | x |   | x |
305 |dotprod_expand     | x | x | x | x |   | x |
306 |dotprod_expand_fast| x | x | x | x |   | x |
307 |logical            | x | x | x | x | x | x |
308 |min, max           | x | x | x | x | x | x |
309 |absdiff            | x | x | x | x | x | x |
310 |absdiffs           |   | x |   | x |   |   |
311 |reduce             | x | x | x | x | x | x |
312 |mask               | x | x | x | x | x | x |
313 |pack               | x | x | x | x | x | x |
314 |pack_u             | x |   | x |   |   |   |
315 |pack_b             | x |   |   |   |   |   |
316 |unpack             | x | x | x | x | x | x |
317 |extract            | x | x | x | x | x | x |
318 |rotate (lanes)     | x | x | x | x | x | x |
319 |cvt_flt32          |   |   |   |   |   | x |
320 |cvt_flt64          |   |   |   |   |   | x |
321 |transpose4x4       |   |   |   |   | x | x |
322 |reverse            | x | x | x | x | x | x |
323 |extract_n          | x | x | x | x | x | x |
324 |broadcast_element  |   |   |   |   | x | x |
325 
326 Big integers:
327 
328 | Operations\\Types | uint 64 | int 64 |
329 |-------------------|:-:|:-:|
330 |load, store        | x | x |
331 |add, sub           | x | x |
332 |shift              | x | x |
333 |logical            | x | x |
334 |reverse            | x | x |
335 |extract            | x | x |
336 |rotate (lanes)     | x | x |
337 |cvt_flt64          |   | x |
338 |extract_n          | x | x |
339 
340 Floating point:
341 
342 | Operations\\Types | float 32 | float 64 |
343 |-------------------|:-:|:-:|
344 |load, store        | x | x |
345 |interleave         | x |   |
346 |add, sub           | x | x |
347 |mul                | x | x |
348 |div                | x | x |
349 |compare            | x | x |
350 |min, max           | x | x |
351 |absdiff            | x | x |
352 |reduce             | x |   |
353 |mask               | x | x |
354 |unpack             | x | x |
355 |cvt_flt32          |   | x |
356 |cvt_flt64          | x |   |
357 |sqrt, abs          | x | x |
358 |float math         | x | x |
359 |transpose4x4       | x |   |
360 |extract            | x | x |
361 |rotate (lanes)     | x | x |
362 |reverse            | x | x |
363 |extract_n          | x | x |
364 |broadcast_element  | x |   |
365 
366  @{ */
367 
template<typename _Tp, int n> struct v_reg
{
//! @cond IGNORED
    typedef _Tp lane_type;   // scalar type of a single lane
    enum { nlanes = n };     // number of lanes in the register
//! @endcond

    /** @brief Constructor

    Initializes register with data from memory
    @param ptr pointer to memory block with data for register */
    explicit v_reg(const _Tp* ptr) { for( int i = 0; i < n; i++ ) s[i] = ptr[i]; }

    /** @brief Constructor

    Initializes register with two 64-bit values */
    v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; }

    /** @brief Constructor

    Initializes register with four 32-bit values */
    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; }

    /** @brief Constructor

    Initializes register with eight 16-bit values */
    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
           _Tp s4, _Tp s5, _Tp s6, _Tp s7)
    {
        s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
        s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
    }

    /** @brief Constructor

    Initializes register with sixteen 8-bit values */
    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
           _Tp s4, _Tp s5, _Tp s6, _Tp s7,
           _Tp s8, _Tp s9, _Tp s10, _Tp s11,
           _Tp s12, _Tp s13, _Tp s14, _Tp s15)
    {
        s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
        s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
        s[8] = s8; s[9] = s9; s[10] = s10; s[11] = s11;
        s[12] = s12; s[13] = s13; s[14] = s14; s[15] = s15;
    }

    /** @brief Default constructor

    Does not initialize anything*/
    v_reg() {}

    /** @brief Copy constructor */
    v_reg(const v_reg<_Tp, n> & r)
    {
        for( int i = 0; i < n; i++ )
            s[i] = r.s[i];
    }
    /** @brief Access first value

    Returns value of the first lane according to register type, for example:
    @code{.cpp}
    v_int32x4 r(1, 2, 3, 4);
    int v = r.get0(); // returns 1
    v_uint64x2 r(1, 2);
    uint64_t v = r.get0(); // returns 1
    @endcode
    */
    _Tp get0() const { return s[0]; }

//! @cond IGNORED
    // Unchecked per-lane read access (no bounds checking on i).
    _Tp get(const int i) const { return s[i]; }
    // Returns a register whose lower half holds this register's upper half;
    // the upper lanes of the result are zeroed.
    v_reg<_Tp, n> high() const
    {
        v_reg<_Tp, n> c;
        int i;
        for( i = 0; i < n/2; i++ )
        {
            c.s[i] = s[i+(n/2)];
            c.s[i+(n/2)] = 0;
        }
        return c;
    }

    // Register with every lane set to zero.
    static v_reg<_Tp, n> zero()
    {
        v_reg<_Tp, n> c;
        for( int i = 0; i < n; i++ )
            c.s[i] = (_Tp)0;
        return c;
    }

    // Register with every lane set to the scalar s (broadcast).
    static v_reg<_Tp, n> all(_Tp s)
    {
        v_reg<_Tp, n> c;
        for( int i = 0; i < n; i++ )
            c.s[i] = s;
        return c;
    }

    // Bitwise reinterpretation into another lane type/count; copies
    // min(sizeof both registers) bytes, so no lane-wise conversion happens.
    template<typename _Tp2, int n2> v_reg<_Tp2, n2> reinterpret_as() const
    {
        size_t bytes = std::min(sizeof(_Tp2)*n2, sizeof(_Tp)*n);
        v_reg<_Tp2, n2> c;
        std::memcpy(&c.s[0], &s[0], bytes);
        return c;
    }

    // Lane-wise copy assignment.
    v_reg& operator=(const v_reg<_Tp, n> & r)
    {
        for( int i = 0; i < n; i++ )
            s[i] = r.s[i];
        return *this;
    }

    _Tp s[n];  // lane storage
//! @endcond
};
486 
/** @brief Sixteen 8-bit unsigned integer values */
typedef v_reg<uchar, 16> v_uint8x16;
/** @brief Sixteen 8-bit signed integer values */
typedef v_reg<schar, 16> v_int8x16;
/** @brief Eight 16-bit unsigned integer values */
typedef v_reg<ushort, 8> v_uint16x8;
/** @brief Eight 16-bit signed integer values */
typedef v_reg<short, 8> v_int16x8;
/** @brief Four 32-bit unsigned integer values */
typedef v_reg<unsigned, 4> v_uint32x4;
/** @brief Four 32-bit signed integer values */
typedef v_reg<int, 4> v_int32x4;
/** @brief Four 32-bit floating point values (single precision) */
typedef v_reg<float, 4> v_float32x4;
/** @brief Two 64-bit floating point values (double precision) */
typedef v_reg<double, 2> v_float64x2;
/** @brief Two 64-bit unsigned integer values */
typedef v_reg<uint64, 2> v_uint64x2;
/** @brief Two 64-bit signed integer values */
typedef v_reg<int64, 2> v_int64x2;

#if CV_SIMD256
/** @brief Thirty two 8-bit unsigned integer values */
typedef v_reg<uchar, 32> v_uint8x32;
/** @brief Thirty two 8-bit signed integer values */
typedef v_reg<schar, 32> v_int8x32;
/** @brief Sixteen 16-bit unsigned integer values */
typedef v_reg<ushort, 16> v_uint16x16;
/** @brief Sixteen 16-bit signed integer values */
typedef v_reg<short, 16> v_int16x16;
/** @brief Eight 32-bit unsigned integer values */
typedef v_reg<unsigned, 8> v_uint32x8;
/** @brief Eight 32-bit signed integer values */
typedef v_reg<int, 8> v_int32x8;
/** @brief Eight 32-bit floating point values (single precision) */
typedef v_reg<float, 8> v_float32x8;
/** @brief Four 64-bit floating point values (double precision) */
typedef v_reg<double, 4> v_float64x4;
/** @brief Four 64-bit unsigned integer values */
typedef v_reg<uint64, 4> v_uint64x4;
/** @brief Four 64-bit signed integer values */
typedef v_reg<int64, 4> v_int64x4;
#endif

#if CV_SIMD512
/** @brief Sixty four 8-bit unsigned integer values */
typedef v_reg<uchar, 64> v_uint8x64;
/** @brief Sixty four 8-bit signed integer values */
typedef v_reg<schar, 64> v_int8x64;
/** @brief Thirty two 16-bit unsigned integer values */
typedef v_reg<ushort, 32> v_uint16x32;
/** @brief Thirty two 16-bit signed integer values */
typedef v_reg<short, 32> v_int16x32;
/** @brief Sixteen 32-bit unsigned integer values */
typedef v_reg<unsigned, 16> v_uint32x16;
/** @brief Sixteen 32-bit signed integer values */
typedef v_reg<int, 16> v_int32x16;
/** @brief Sixteen 32-bit floating point values (single precision) */
typedef v_reg<float, 16> v_float32x16;
/** @brief Eight 64-bit floating point values (double precision) */
typedef v_reg<double, 8> v_float64x8;
/** @brief Eight 64-bit unsigned integer values */
typedef v_reg<uint64, 8> v_uint64x8;
/** @brief Eight 64-bit signed integer values */
typedef v_reg<int64, 8> v_int64x8;
#endif

// Register widths in bytes for each SIMD level; simdmax_width reflects the
// widest level enabled at compile time.
enum {
    simd128_width = 16,
#if CV_SIMD256
    simd256_width = 32,
#endif
#if CV_SIMD512
    simd512_width = 64,
    simdmax_width = simd512_width
#elif CV_SIMD256
    simdmax_width = simd256_width
#else
    simdmax_width = simd128_width
#endif
};
568 
// NOTE: the declarations below document the element-wise operators; their
// definitions are generated by the CV__HAL_INTRIN_IMPL_* macros further down
// (hidden from Doxygen by the #ifndef CV_DOXYGEN guard).

/** @brief Add values

For all types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator+(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator+=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);

/** @brief Subtract values

For all types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator-(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator-=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);

/** @brief Multiply values

For 16- and 32-bit integer types and floating types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator*(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator*=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);

/** @brief Divide values

For floating types only. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator/(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator/=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);


/** @brief Bitwise AND

Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator&(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator&=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);

/** @brief Bitwise OR

Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);

/** @brief Bitwise XOR

Only for integer types.*/
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator^(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator^=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);

/** @brief Bitwise NOT

Only for integer types.*/
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator~(const v_reg<_Tp, n>& a);
616 
617 
#ifndef CV_DOXYGEN

// Invokes macro_name(lane_type, ...) once for every integer lane type.
#define CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, ...) \
__CV_EXPAND(macro_name(uchar, __VA_ARGS__)) \
__CV_EXPAND(macro_name(schar, __VA_ARGS__)) \
__CV_EXPAND(macro_name(ushort, __VA_ARGS__)) \
__CV_EXPAND(macro_name(short, __VA_ARGS__)) \
__CV_EXPAND(macro_name(unsigned, __VA_ARGS__)) \
__CV_EXPAND(macro_name(int, __VA_ARGS__)) \
__CV_EXPAND(macro_name(uint64, __VA_ARGS__)) \
__CV_EXPAND(macro_name(int64, __VA_ARGS__)) \

// Invokes macro_name(lane_type, ...) once for each floating point lane type.
#define CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, ...) \
__CV_EXPAND(macro_name(float, __VA_ARGS__)) \
__CV_EXPAND(macro_name(double, __VA_ARGS__)) \

// Invokes macro_name(lane_type, ...) for all integer and floating point lane types.
#define CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(macro_name, ...) \
CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, __VA_ARGS__) \
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, __VA_ARGS__) \

// Generates saturating element-wise 'bin_op' and 'bin_op=' operators for one lane type.
#define CV__HAL_INTRIN_IMPL_BIN_OP_(_Tp, bin_op) \
template<int n> inline \
v_reg<_Tp, n> operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tp, n> c; \
    for( int i = 0; i < n; i++ ) \
        c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
    return c; \
} \
template<int n> inline \
v_reg<_Tp, n>& operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    for( int i = 0; i < n; i++ ) \
        a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
    return a; \
}

#define CV__HAL_INTRIN_IMPL_BIN_OP(bin_op) CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, bin_op)

CV__HAL_INTRIN_IMPL_BIN_OP(+)
CV__HAL_INTRIN_IMPL_BIN_OP(-)
CV__HAL_INTRIN_IMPL_BIN_OP(*)
// Division is generated for floating point lane types only.
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, /)

// Generates element-wise bitwise 'bit_op' and 'bit_op=' operators for one lane
// type; lanes are reinterpreted to/from a same-width integer via V_TypeTraits,
// which makes the operators usable on floating point lanes too.
#define CV__HAL_INTRIN_IMPL_BIT_OP_(_Tp, bit_op) \
template<int n> CV_INLINE \
v_reg<_Tp, n> operator bit_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tp, n> c; \
    typedef typename V_TypeTraits<_Tp>::int_type itype; \
    for( int i = 0; i < n; i++ ) \
        c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
                                                        V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
    return c; \
} \
template<int n> CV_INLINE \
v_reg<_Tp, n>& operator bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    typedef typename V_TypeTraits<_Tp>::int_type itype; \
    for( int i = 0; i < n; i++ ) \
        a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
                                                        V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
    return a; \
}

#define CV__HAL_INTRIN_IMPL_BIT_OP(bit_op) \
CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) \
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) /* TODO: FIXIT remove this after masks refactoring */


CV__HAL_INTRIN_IMPL_BIT_OP(&)
CV__HAL_INTRIN_IMPL_BIT_OP(|)
CV__HAL_INTRIN_IMPL_BIT_OP(^)

// Generates the element-wise bitwise NOT operator for one integer lane type
// (the second macro argument is unused).
#define CV__HAL_INTRIN_IMPL_BITWISE_NOT_(_Tp, dummy) \
template<int n> CV_INLINE \
v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) \
{ \
    v_reg<_Tp, n> c; \
    for( int i = 0; i < n; i++ ) \
        c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int(~V_TypeTraits<_Tp>::reinterpret_int(a.s[i])); \
    return c; \
} \

CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BITWISE_NOT_, ~)

#endif  // !CV_DOXYGEN
705 
706 
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
// Generates a per-lane unary function 'func' that applies the scalar function
// 'cfunc' to every lane; the result lane type is _Tp2.
#define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \
template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \
{ \
    v_reg<_Tp2, n> c; \
    for( int i = 0; i < n; i++ ) \
        c.s[i] = cfunc(a.s[i]); \
    return c; \
}

/** @brief Square root of elements

Only for floating point types.*/
OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp)

//! @cond IGNORED
OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
//! @endcond

/** @brief Absolute value of elements

Only for floating point types.*/
OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs,
                          typename V_TypeTraits<_Tp>::abs_type)
735 
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
// Generates a per-lane binary function 'func' applying the scalar function
// 'cfunc' to each pair of corresponding lanes.
#define OPENCV_HAL_IMPL_MINMAX_FUNC(func, cfunc) \
template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tp, n> c; \
    for( int i = 0; i < n; i++ ) \
        c.s[i] = cfunc(a.s[i], b.s[i]); \
    return c; \
}

//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
// Generates a horizontal reduction 'func' that folds all lanes into one scalar
// using the scalar function 'cfunc'.
#define OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(func, cfunc) \
template<typename _Tp, int n> inline _Tp func(const v_reg<_Tp, n>& a) \
{ \
    _Tp c = a.s[0]; \
    for( int i = 1; i < n; i++ ) \
        c = cfunc(c, a.s[i]); \
    return c; \
}

/** @brief Choose min values for each pair

Scheme:
@code
{A1 A2 ...}
{B1 B2 ...}
--------------
{min(A1,B1) min(A2,B2) ...}
@endcode
For all types except 64-bit integer. */
OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, std::min)

/** @brief Choose max values for each pair

Scheme:
@code
{A1 A2 ...}
{B1 B2 ...}
--------------
{max(A1,B1) max(A2,B2) ...}
@endcode
For all types except 64-bit integer. */
OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, std::max)

/** @brief Find one min value

Scheme:
@code
{A1 A2 A3 ...} => min(A1,A2,A3,...)
@endcode
For all types except 64-bit integer and 64-bit floating point types. */
OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_min, std::min)

/** @brief Find one max value

Scheme:
@code
{A1 A2 A3 ...} => max(A1,A2,A3,...)
@endcode
For all types except 64-bit integer and 64-bit floating point types. */
OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_max, std::max)
799 
// Lookup table: popCountTable[v] == number of set bits in the byte value v (0..255).
// Used by v_popcount below to count bits one byte at a time.
static const unsigned char popCountTable[] =
{
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};
/** @brief Count the 1 bits in the vector lanes and return result as corresponding unsigned type

Scheme:
@code
{A1 A2 A3 ...} => {popcount(A1), popcount(A2), popcount(A3), ...}
@endcode
For all integer types. */
template<typename _Tp, int n>
inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_popcount(const v_reg<_Tp, n>& a)
{
    // Walk every byte of the register; i/sizeof(_Tp) maps a byte index back to
    // the lane that owns it, and the byte's bit count is accumulated there.
    // NOTE(review): v_reinterpret_as_u8(a) is re-evaluated on each iteration;
    // this is the scalar fallback path, so the cost is acceptable.
    v_reg<typename V_TypeTraits<_Tp>::abs_type, n> b = v_reg<typename V_TypeTraits<_Tp>::abs_type, n>::zero();
    for (int i = 0; i < n*(int)sizeof(_Tp); i++)
        b.s[i/sizeof(_Tp)] += popCountTable[v_reinterpret_as_u8(a).s[i]];
    return b;
}
834 
835 
836 //! @cond IGNORED
837 template<typename _Tp, int n>
v_minmax(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b,v_reg<_Tp,n> & minval,v_reg<_Tp,n> & maxval)838 inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
839                       v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval )
840 {
841     for( int i = 0; i < n; i++ )
842     {
843         minval.s[i] = std::min(a.s[i], b.s[i]);
844         maxval.s[i] = std::max(a.s[i], b.s[i]);
845     }
846 }
847 //! @endcond
848 
//! @brief Helper macro: generates a lane-wise comparison operator producing an
//! all-ones (true) / all-zeros (false) bit mask per lane
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \
template<typename _Tp, int n> \
inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    typedef typename V_TypeTraits<_Tp>::int_type itype; \
    v_reg<_Tp, n> res; \
    for( int j = 0; j < n; j++ ) \
        res.s[j] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)-(int)(a.s[j] cmp_op b.s[j])); \
    return res; \
}
861 
// Each comparison below yields a per-lane mask: all bits set where the
// predicate holds, all bits clear otherwise (see OPENCV_HAL_IMPL_CMP_OP).
/** @brief Less-than comparison

For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP(<)

/** @brief Greater-than comparison

For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP(>)

/** @brief Less-than or equal comparison

For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP(<=)

/** @brief Greater-than or equal comparison

For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP(>=)

/** @brief Equal comparison

For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP(==)

/** @brief Not equal comparison

For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP(!=)
891 
892 template<int n>
v_not_nan(const v_reg<float,n> & a)893 inline v_reg<float, n> v_not_nan(const v_reg<float, n>& a)
894 {
895     typedef typename V_TypeTraits<float>::int_type itype;
896     v_reg<float, n> c;
897     for (int i = 0; i < n; i++)
898         c.s[i] = V_TypeTraits<float>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i]));
899     return c;
900 }
901 template<int n>
v_not_nan(const v_reg<double,n> & a)902 inline v_reg<double, n> v_not_nan(const v_reg<double, n>& a)
903 {
904     typedef typename V_TypeTraits<double>::int_type itype;
905     v_reg<double, n> c;
906     for (int i = 0; i < n; i++)
907         c.s[i] = V_TypeTraits<double>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i]));
908     return c;
909 }
910 
//! @brief Helper macro: generates a lane-wise binary arithmetic op; the result
//! of @p bin_op is converted with @p cast_op and stored as @p _Tp2
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_ARITHM_OP(func, bin_op, cast_op, _Tp2) \
template<typename _Tp, int n> \
inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    typedef _Tp2 rtype; \
    v_reg<rtype, n> res; \
    for( int j = 0; j < n; j++ ) \
        res.s[j] = cast_op(a.s[j] bin_op b.s[j]); \
    return res; \
}
923 
// Wrapping (non-saturating) arithmetic: results are cast back to the lane
// type, so overflow wraps instead of clamping.
/** @brief Add values without saturation

For 8- and 16-bit integer values. */
OPENCV_HAL_IMPL_ARITHM_OP(v_add_wrap, +, (_Tp), _Tp)

/** @brief Subtract values without saturation

For 8- and 16-bit integer values. */
OPENCV_HAL_IMPL_ARITHM_OP(v_sub_wrap, -, (_Tp), _Tp)

/** @brief Multiply values without saturation

For 8- and 16-bit integer values. */
OPENCV_HAL_IMPL_ARITHM_OP(v_mul_wrap, *, (_Tp), _Tp)
938 
//! @cond IGNORED
// Scalar absolute difference without relying on a signed intermediate;
// safe for unsigned types where (a - b) alone could wrap.
template<typename T> inline T _absdiff(T a, T b)
{
    return a < b ? b - a : a - b;
}
//! @endcond
945 
946 /** @brief Absolute difference
947 
948 Returns \f$ |a - b| \f$ converted to corresponding unsigned type.
949 Example:
950 @code{.cpp}
951 v_int32x4 a, b; // {1, 2, 3, 4} and {4, 3, 2, 1}
952 v_uint32x4 c = v_absdiff(a, b); // result is {3, 1, 1, 3}
953 @endcode
954 For 8-, 16-, 32-bit integer source types. */
955 template<typename _Tp, int n>
v_absdiff(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b)956 inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_absdiff(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> & b)
957 {
958     typedef typename V_TypeTraits<_Tp>::abs_type rtype;
959     v_reg<rtype, n> c;
960     const rtype mask = (rtype)(std::numeric_limits<_Tp>::is_signed ? (1 << (sizeof(rtype)*8 - 1)) : 0);
961     for( int i = 0; i < n; i++ )
962     {
963         rtype ua = a.s[i] ^ mask;
964         rtype ub = b.s[i] ^ mask;
965         c.s[i] = _absdiff(ua, ub);
966     }
967     return c;
968 }
969 
970 /** @overload
971 
972 For 32-bit floating point values */
v_absdiff(const v_reg<float,n> & a,const v_reg<float,n> & b)973 template<int n> inline v_reg<float, n> v_absdiff(const v_reg<float, n>& a, const v_reg<float, n>& b)
974 {
975     v_reg<float, n> c;
976     for( int i = 0; i < c.nlanes; i++ )
977         c.s[i] = _absdiff(a.s[i], b.s[i]);
978     return c;
979 }
980 
981 /** @overload
982 
983 For 64-bit floating point values */
v_absdiff(const v_reg<double,n> & a,const v_reg<double,n> & b)984 template<int n> inline v_reg<double, n> v_absdiff(const v_reg<double, n>& a, const v_reg<double, n>& b)
985 {
986     v_reg<double, n> c;
987     for( int i = 0; i < c.nlanes; i++ )
988         c.s[i] = _absdiff(a.s[i], b.s[i]);
989     return c;
990 }
991 
992 /** @brief Saturating absolute difference
993 
994 Returns \f$ saturate(|a - b|) \f$ .
995 For 8-, 16-bit signed integer source types. */
996 template<typename _Tp, int n>
v_absdiffs(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b)997 inline v_reg<_Tp, n> v_absdiffs(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
998 {
999     v_reg<_Tp, n> c;
1000     for( int i = 0; i < n; i++)
1001         c.s[i] = saturate_cast<_Tp>(std::abs(a.s[i] - b.s[i]));
1002     return c;
1003 }
1004 
1005 /** @brief Inversed square root
1006 
1007 Returns \f$ 1/sqrt(a) \f$
1008 For floating point types only. */
1009 template<typename _Tp, int n>
v_invsqrt(const v_reg<_Tp,n> & a)1010 inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a)
1011 {
1012     v_reg<_Tp, n> c;
1013     for( int i = 0; i < n; i++ )
1014         c.s[i] = 1.f/std::sqrt(a.s[i]);
1015     return c;
1016 }
1017 
1018 /** @brief Magnitude
1019 
1020 Returns \f$ sqrt(a^2 + b^2) \f$
1021 For floating point types only. */
1022 template<typename _Tp, int n>
v_magnitude(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b)1023 inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1024 {
1025     v_reg<_Tp, n> c;
1026     for( int i = 0; i < n; i++ )
1027         c.s[i] = std::sqrt(a.s[i]*a.s[i] + b.s[i]*b.s[i]);
1028     return c;
1029 }
1030 
1031 /** @brief Square of the magnitude
1032 
1033 Returns \f$ a^2 + b^2 \f$
1034 For floating point types only. */
1035 template<typename _Tp, int n>
v_sqr_magnitude(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b)1036 inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1037 {
1038     v_reg<_Tp, n> c;
1039     for( int i = 0; i < n; i++ )
1040         c.s[i] = a.s[i]*a.s[i] + b.s[i]*b.s[i];
1041     return c;
1042 }
1043 
1044 /** @brief Multiply and add
1045 
1046  Returns \f$ a*b + c \f$
1047  For floating point types and signed 32bit int only. */
1048 template<typename _Tp, int n>
v_fma(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b,const v_reg<_Tp,n> & c)1049 inline v_reg<_Tp, n> v_fma(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
1050                            const v_reg<_Tp, n>& c)
1051 {
1052     v_reg<_Tp, n> d;
1053     for( int i = 0; i < n; i++ )
1054         d.s[i] = a.s[i]*b.s[i] + c.s[i];
1055     return d;
1056 }
1057 
/** @brief A synonym for v_fma */
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
                              const v_reg<_Tp, n>& c)
{
    // Kept as a separate name for API parity with the SIMD backends; forwards directly.
    return v_fma(a, b, c);
}
1065 
1066 /** @brief Dot product of elements
1067 
1068 Multiply values in two registers and sum adjacent result pairs.
1069 
1070 Scheme:
1071 @code
1072   {A1 A2 ...} // 16-bit
1073 x {B1 B2 ...} // 16-bit
1074 -------------
1075 {A1B1+A2B2 ...} // 32-bit
1076 
1077 @endcode
1078 */
1079 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_dotprod(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b)1080 v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1081 {
1082     typedef typename V_TypeTraits<_Tp>::w_type w_type;
1083     v_reg<w_type, n/2> c;
1084     for( int i = 0; i < (n/2); i++ )
1085         c.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1];
1086     return c;
1087 }
1088 
1089 /** @brief Dot product of elements
1090 
1091 Same as cv::v_dotprod, but add a third element to the sum of adjacent pairs.
1092 Scheme:
1093 @code
1094   {A1 A2 ...} // 16-bit
1095 x {B1 B2 ...} // 16-bit
1096 -------------
1097   {A1B1+A2B2+C1 ...} // 32-bit
1098 @endcode
1099 */
1100 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_dotprod(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b,const v_reg<typename V_TypeTraits<_Tp>::w_type,n/2> & c)1101 v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
1102           const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
1103 {
1104     typedef typename V_TypeTraits<_Tp>::w_type w_type;
1105     v_reg<w_type, n/2> s;
1106     for( int i = 0; i < (n/2); i++ )
1107         s.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1] + c.s[i];
1108     return s;
1109 }
1110 
/** @brief Fast Dot product of elements

Same as cv::v_dotprod, but on some platforms the result pairs may be summed in
an unspecified order. Use this intrinsic when only the total sum over all lanes
matters; it may yield better performance on the affected platforms.

*/
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_dotprod_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{ return v_dotprod(a, b); }  // scalar fallback: plain v_dotprod, order is deterministic
1121 
/** @brief Fast Dot product of elements

Same as cv::v_dotprod_fast, but add a third element to the sum of adjacent pairs.
*/
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_dotprod_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
               const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
{ return v_dotprod(a, b, c); }  // scalar fallback: forwards to the exact version
1130 
1131 /** @brief Dot product of elements and expand
1132 
1133 Multiply values in two registers and expand the sum of adjacent result pairs.
1134 
1135 Scheme:
1136 @code
1137   {A1 A2 A3 A4 ...} // 8-bit
1138 x {B1 B2 B3 B4 ...} // 8-bit
1139 -------------
1140   {A1B1+A2B2+A3B3+A4B4 ...} // 32-bit
1141 
1142 @endcode
1143 */
1144 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
v_dotprod_expand(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b)1145 v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1146 {
1147     typedef typename V_TypeTraits<_Tp>::q_type q_type;
1148     v_reg<q_type, n/4> s;
1149     for( int i = 0; i < (n/4); i++ )
1150         s.s[i] = (q_type)a.s[i*4    ]*b.s[i*4    ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] +
1151                  (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3];
1152     return s;
1153 }
1154 
1155 /** @brief Dot product of elements
1156 
1157 Same as cv::v_dotprod_expand, but add a third element to the sum of adjacent pairs.
1158 Scheme:
1159 @code
1160   {A1 A2 A3 A4 ...} // 8-bit
1161 x {B1 B2 B3 B4 ...} // 8-bit
1162 -------------
1163   {A1B1+A2B2+A3B3+A4B4+C1 ...} // 32-bit
1164 @endcode
1165 */
1166 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
v_dotprod_expand(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b,const v_reg<typename V_TypeTraits<_Tp>::q_type,n/4> & c)1167 v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
1168                  const v_reg<typename V_TypeTraits<_Tp>::q_type, n / 4>& c)
1169 {
1170     typedef typename V_TypeTraits<_Tp>::q_type q_type;
1171     v_reg<q_type, n/4> s;
1172     for( int i = 0; i < (n/4); i++ )
1173         s.s[i] = (q_type)a.s[i*4    ]*b.s[i*4    ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] +
1174                  (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3] + c.s[i];
1175     return s;
1176 }
1177 
/** @brief Fast Dot product of elements and expand

Multiply values in two registers and expand the sum of adjacent result pairs.

Same as cv::v_dotprod_expand, but on some platforms the result pairs may be
summed in an unspecified order. Use this intrinsic when only the total sum over
all lanes matters; it may yield better performance on the affected platforms.

*/
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
v_dotprod_expand_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{ return v_dotprod_expand(a, b); }  // scalar fallback: order is deterministic
1190 
/** @brief Fast Dot product of elements

Same as cv::v_dotprod_expand_fast, but add a third element to the sum of adjacent pairs.
*/
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
v_dotprod_expand_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
                      const v_reg<typename V_TypeTraits<_Tp>::q_type, n / 4>& c)
{ return v_dotprod_expand(a, b, c); }  // scalar fallback: forwards to the exact version
1199 
1200 /** @brief Multiply and expand
1201 
1202 Multiply values two registers and store results in two registers with wider pack type.
1203 Scheme:
1204 @code
1205   {A B C D} // 32-bit
1206 x {E F G H} // 32-bit
1207 ---------------
1208 {AE BF}         // 64-bit
1209         {CG DH} // 64-bit
1210 @endcode
1211 Example:
1212 @code{.cpp}
1213 v_uint32x4 a, b; // {1,2,3,4} and {2,2,2,2}
1214 v_uint64x2 c, d; // results
1215 v_mul_expand(a, b, c, d); // c, d = {2,4}, {6, 8}
1216 @endcode
1217 Implemented only for 16- and unsigned 32-bit source types (v_int16x8, v_uint16x8, v_uint32x4).
1218 */
v_mul_expand(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b,v_reg<typename V_TypeTraits<_Tp>::w_type,n/2> & c,v_reg<typename V_TypeTraits<_Tp>::w_type,n/2> & d)1219 template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
1220                                                        v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c,
1221                                                        v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d)
1222 {
1223     typedef typename V_TypeTraits<_Tp>::w_type w_type;
1224     for( int i = 0; i < (n/2); i++ )
1225     {
1226         c.s[i] = (w_type)a.s[i]*b.s[i];
1227         d.s[i] = (w_type)a.s[i+(n/2)]*b.s[i+(n/2)];
1228     }
1229 }
1230 
1231 /** @brief Multiply and extract high part
1232 
1233 Multiply values two registers and store high part of the results.
1234 Implemented only for 16-bit source types (v_int16x8, v_uint16x8). Returns \f$ a*b >> 16 \f$
1235 */
v_mul_hi(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b)1236 template<typename _Tp, int n> inline v_reg<_Tp, n> v_mul_hi(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1237 {
1238     typedef typename V_TypeTraits<_Tp>::w_type w_type;
1239     v_reg<_Tp, n> c;
1240     for (int i = 0; i < n; i++)
1241         c.s[i] = (_Tp)(((w_type)a.s[i] * b.s[i]) >> sizeof(_Tp)*8);
1242     return c;
1243 }
1244 
1245 //! @cond IGNORED
v_hsum(const v_reg<_Tp,n> & a,v_reg<typename V_TypeTraits<_Tp>::w_type,n/2> & c)1246 template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
1247                                                  v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c)
1248 {
1249     typedef typename V_TypeTraits<_Tp>::w_type w_type;
1250     for( int i = 0; i < (n/2); i++ )
1251     {
1252         c.s[i] = (w_type)a.s[i*2] + a.s[i*2+1];
1253     }
1254 }
1255 //! @endcond
1256 
//! @brief Helper macro: generates a lane-wise bit shift by a runtime amount
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
{ \
    v_reg<_Tp, n> res; \
    for( int j = 0; j < n; j++ ) \
        res.s[j] = (_Tp)(a.s[j] shift_op imm); \
    return res; \
}
1267 
// Lane-wise shifts generated from OPENCV_HAL_IMPL_SHIFT_OP above.
/** @brief Bitwise shift left

For 16-, 32- and 64-bit integer values. */
OPENCV_HAL_IMPL_SHIFT_OP(<< )

/** @brief Bitwise shift right

For 16-, 32- and 64-bit integer values. */
OPENCV_HAL_IMPL_SHIFT_OP(>> )
1277 
//! @brief Helper macro: generates lane-rotation ops. The one-register form
//! shifts lanes and fills vacated positions with zero; the two-register form
//! pulls the fill lanes from the second register instead.
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(suffix,opA,opB) \
template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a) \
{ \
    v_reg<_Tp, n> dst; \
    for (int j = 0; j < n; j++) \
    { \
        int src = j opA imm; \
        dst.s[j] = (0 <= src && src < n) ? a.s[src] : (_Tp)0; \
    } \
    return dst; \
} \
template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tp, n> dst; \
    for (int j = 0; j < n; j++) \
    { \
        int srcA = j opA imm; \
        int srcB = j opA imm opB n; \
        if (0 <= srcB && srcB < n) \
            dst.s[j] = b.s[srcB]; \
        else if (0 <= srcA && srcA < n) \
            dst.s[j] = a.s[srcA]; \
        else \
            dst.s[j] = (_Tp)0; \
    } \
    return dst; \
}
1320 
// Lane rotations generated from OPENCV_HAL_IMPL_ROTATE_SHIFT_OP above.
/** @brief Element shift left among vector

For all type */
OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(left,  -, +)

/** @brief Element shift right among vector

For all type */
OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(right, +, -)
1330 
1331 /** @brief Sum packed values
1332 
1333 Scheme:
1334 @code
1335 {A1 A2 A3 ...} => sum{A1,A2,A3,...}
1336 @endcode
1337 */
v_reduce_sum(const v_reg<_Tp,n> & a)1338 template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a)
1339 {
1340     typename V_TypeTraits<_Tp>::sum_type c = a.s[0];
1341     for( int i = 1; i < n; i++ )
1342         c += a.s[i];
1343     return c;
1344 }
1345 
1346 /** @brief Sums all elements of each input vector, returns the vector of sums
1347 
1348  Scheme:
1349  @code
1350  result[0] = a[0] + a[1] + a[2] + a[3]
1351  result[1] = b[0] + b[1] + b[2] + b[3]
1352  result[2] = c[0] + c[1] + c[2] + c[3]
1353  result[3] = d[0] + d[1] + d[2] + d[3]
1354  @endcode
1355 */
v_reduce_sum4(const v_reg<float,n> & a,const v_reg<float,n> & b,const v_reg<float,n> & c,const v_reg<float,n> & d)1356 template<int n> inline v_reg<float, n> v_reduce_sum4(const v_reg<float, n>& a, const v_reg<float, n>& b,
1357     const v_reg<float, n>& c, const v_reg<float, n>& d)
1358 {
1359     v_reg<float, n> r;
1360     for(int i = 0; i < (n/4); i++)
1361     {
1362         r.s[i*4 + 0] = a.s[i*4 + 0] + a.s[i*4 + 1] + a.s[i*4 + 2] + a.s[i*4 + 3];
1363         r.s[i*4 + 1] = b.s[i*4 + 0] + b.s[i*4 + 1] + b.s[i*4 + 2] + b.s[i*4 + 3];
1364         r.s[i*4 + 2] = c.s[i*4 + 0] + c.s[i*4 + 1] + c.s[i*4 + 2] + c.s[i*4 + 3];
1365         r.s[i*4 + 3] = d.s[i*4 + 0] + d.s[i*4 + 1] + d.s[i*4 + 2] + d.s[i*4 + 3];
1366     }
1367     return r;
1368 }
1369 
1370 /** @brief Sum absolute differences of values
1371 
1372 Scheme:
1373 @code
1374 {A1 A2 A3 ...} {B1 B2 B3 ...} => sum{ABS(A1-B1),abs(A2-B2),abs(A3-B3),...}
1375 @endcode
1376 For all types except 64-bit types.*/
v_reduce_sad(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b)1377 template<typename _Tp, int n> inline typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type v_reduce_sad(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1378 {
1379     typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type c = _absdiff(a.s[0], b.s[0]);
1380     for (int i = 1; i < n; i++)
1381         c += _absdiff(a.s[i], b.s[i]);
1382     return c;
1383 }
1384 
1385 /** @brief Get negative values mask
1386 @deprecated v_signmask depends on a lane count heavily and therefore isn't universal enough
1387 
1388 Returned value is a bit mask with bits set to 1 on places corresponding to negative packed values indexes.
1389 Example:
1390 @code{.cpp}
1391 v_int32x4 r; // set to {-1, -1, 1, 1}
1392 int mask = v_signmask(r); // mask = 3 <== 00000000 00000000 00000000 00000011
1393 @endcode
1394 */
v_signmask(const v_reg<_Tp,n> & a)1395 template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
1396 {
1397     int mask = 0;
1398     for( int i = 0; i < n; i++ )
1399         mask |= (V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0) << i;
1400     return mask;
1401 }
1402 
/** @brief Get first negative lane index

Returned value is an index of first negative lane (undefined for input of all positive values)
Example:
@code{.cpp}
v_int32x4 r; // set to {0, 0, -1, -1}
int idx = v_scan_forward(r); // idx = 2
@endcode
*/
template <typename _Tp, int n> inline int v_scan_forward(const v_reg<_Tp, n>& a)
{
    // Scan lanes in order and report the first whose sign bit is set.
    for (int i = 0; i < n; i++)
        if(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0)
            return i;
    return 0;  // no negative lane: result documented as undefined for callers
}
1419 
1420 /** @brief Check if all packed values are less than zero
1421 
1422 Unsigned values will be casted to signed: `uchar 254 => char -2`.
1423 */
v_check_all(const v_reg<_Tp,n> & a)1424 template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a)
1425 {
1426     for( int i = 0; i < n; i++ )
1427         if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) >= 0 )
1428             return false;
1429     return true;
1430 }
1431 
1432 /** @brief Check if any of packed values is less than zero
1433 
1434 Unsigned values will be casted to signed: `uchar 254 => char -2`.
1435 */
v_check_any(const v_reg<_Tp,n> & a)1436 template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a)
1437 {
1438     for( int i = 0; i < n; i++ )
1439         if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0 )
1440             return true;
1441     return false;
1442 }
1443 
1444 /** @brief Per-element select (blend operation)
1445 
1446 Return value will be built by combining values _a_ and _b_ using the following scheme:
1447     result[i] = mask[i] ? a[i] : b[i];
1448 
1449 @note: _mask_ element values are restricted to these values:
1450 - 0: select element from _b_
1451 - 0xff/0xffff/etc: select element from _a_
1452 (fully compatible with bitwise-based operator)
1453 */
v_select(const v_reg<_Tp,n> & mask,const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b)1454 template<typename _Tp, int n> inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask,
1455                                                            const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1456 {
1457     typedef V_TypeTraits<_Tp> Traits;
1458     typedef typename Traits::int_type int_type;
1459     v_reg<_Tp, n> c;
1460     for( int i = 0; i < n; i++ )
1461     {
1462         int_type m = Traits::reinterpret_int(mask.s[i]);
1463         CV_DbgAssert(m == 0 || m == (~(int_type)0));  // restrict mask values: 0 or 0xff/0xffff/etc
1464         c.s[i] = m ? a.s[i] : b.s[i];
1465     }
1466     return c;
1467 }
1468 
1469 /** @brief Expand values to the wider pack type
1470 
1471 Copy contents of register to two registers with 2x wider pack type.
1472 Scheme:
1473 @code
1474  int32x4     int64x2 int64x2
1475 {A B C D} ==> {A B} , {C D}
1476 @endcode */
v_expand(const v_reg<_Tp,n> & a,v_reg<typename V_TypeTraits<_Tp>::w_type,n/2> & b0,v_reg<typename V_TypeTraits<_Tp>::w_type,n/2> & b1)1477 template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
1478                             v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b0,
1479                             v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b1)
1480 {
1481     for( int i = 0; i < (n/2); i++ )
1482     {
1483         b0.s[i] = a.s[i];
1484         b1.s[i] = a.s[i+(n/2)];
1485     }
1486 }
1487 
1488 /** @brief Expand lower values to the wider pack type
1489 
1490 Same as cv::v_expand, but return lower half of the vector.
1491 
1492 Scheme:
1493 @code
1494  int32x4     int64x2
1495 {A B C D} ==> {A B}
1496 @endcode */
1497 template<typename _Tp, int n>
1498 inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_expand_low(const v_reg<_Tp,n> & a)1499 v_expand_low(const v_reg<_Tp, n>& a)
1500 {
1501     v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> b;
1502     for( int i = 0; i < (n/2); i++ )
1503         b.s[i] = a.s[i];
1504     return b;
1505 }
1506 
1507 /** @brief Expand higher values to the wider pack type
1508 
1509 Same as cv::v_expand_low, but expand higher half of the vector instead.
1510 
1511 Scheme:
1512 @code
1513  int32x4     int64x2
1514 {A B C D} ==> {C D}
1515 @endcode */
1516 template<typename _Tp, int n>
1517 inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_expand_high(const v_reg<_Tp,n> & a)1518 v_expand_high(const v_reg<_Tp, n>& a)
1519 {
1520     v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> b;
1521     for( int i = 0; i < (n/2); i++ )
1522         b.s[i] = a.s[i+(n/2)];
1523     return b;
1524 }
1525 
1526 //! @cond IGNORED
1527 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::int_type, n>
v_reinterpret_as_int(const v_reg<_Tp,n> & a)1528     v_reinterpret_as_int(const v_reg<_Tp, n>& a)
1529 {
1530     v_reg<typename V_TypeTraits<_Tp>::int_type, n> c;
1531     for( int i = 0; i < n; i++ )
1532         c.s[i] = V_TypeTraits<_Tp>::reinterpret_int(a.s[i]);
1533     return c;
1534 }
1535 
1536 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::uint_type, n>
v_reinterpret_as_uint(const v_reg<_Tp,n> & a)1537     v_reinterpret_as_uint(const v_reg<_Tp, n>& a)
1538 {
1539     v_reg<typename V_TypeTraits<_Tp>::uint_type, n> c;
1540     for( int i = 0; i < n; i++ )
1541         c.s[i] = V_TypeTraits<_Tp>::reinterpret_uint(a.s[i]);
1542     return c;
1543 }
1544 //! @endcond
1545 
1546 /** @brief Interleave two vectors
1547 
1548 Scheme:
1549 @code
1550   {A1 A2 A3 A4}
1551   {B1 B2 B3 B4}
1552 ---------------
1553   {A1 B1 A2 B2} and {A3 B3 A4 B4}
1554 @endcode
1555 For all types except 64-bit.
1556 */
v_zip(const v_reg<_Tp,n> & a0,const v_reg<_Tp,n> & a1,v_reg<_Tp,n> & b0,v_reg<_Tp,n> & b1)1557 template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
1558                                                v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 )
1559 {
1560     int i;
1561     for( i = 0; i < n/2; i++ )
1562     {
1563         b0.s[i*2] = a0.s[i];
1564         b0.s[i*2+1] = a1.s[i];
1565     }
1566     for( ; i < n; i++ )
1567     {
1568         b1.s[i*2-n] = a0.s[i];
1569         b1.s[i*2-n+1] = a1.s[i];
1570     }
1571 }
1572 
/** @brief Load register contents from memory

@param ptr pointer to memory block with data
@return register object

@note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x16, int ==> cv::v_int32x4, etc.

@note Use vx_load version to get maximum available register length result

@note Alignment requirement:
if CV_STRONG_ALIGNMENT=1 then passed pointer must be aligned (`sizeof(lane type)` should be enough).
Do not cast pointer types without runtime check for pointer alignment (like `uchar*` => `int*`).
 */
template<typename _Tp>
inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load(const _Tp* ptr)
{
#if CV_STRONG_ALIGNMENT
    // Strict-alignment targets: reject pointers not aligned to the lane size.
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    return v_reg<_Tp, simd128_width / sizeof(_Tp)>(ptr);
}
1594 
#if CV_SIMD256
/** @brief Load 256-bit length register contents from memory

@param ptr pointer to memory block with data
@return register object

@note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x32, int ==> cv::v_int32x8, etc.

@note Check CV_SIMD256 preprocessor definition prior to use.
Use vx_load version to get maximum available register length result

@note Alignment requirement:
if CV_STRONG_ALIGNMENT=1 then passed pointer must be aligned (`sizeof(lane type)` should be enough).
Do not cast pointer types without runtime check for pointer alignment (like `uchar*` => `int*`).
 */
template<typename _Tp>
inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load(const _Tp* ptr)
{
#if CV_STRONG_ALIGNMENT
    // Strict-alignment targets: reject pointers not aligned to the lane size.
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    return v_reg<_Tp, simd256_width / sizeof(_Tp)>(ptr);
}
#endif
1619 
#if CV_SIMD512
/** @brief Load 512-bit length register contents from memory

@param ptr pointer to memory block with data
@return register object

@note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x64, int ==> cv::v_int32x16, etc.

@note Check CV_SIMD512 preprocessor definition prior to use.
Use vx_load version to get maximum available register length result

@note Alignment requirement:
if CV_STRONG_ALIGNMENT=1 then passed pointer must be aligned (`sizeof(lane type)` should be enough).
Do not cast pointer types without runtime check for pointer alignment (like `uchar*` => `int*`).
 */
template<typename _Tp>
inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load(const _Tp* ptr)
{
#if CV_STRONG_ALIGNMENT
    // Strict-alignment targets: reject pointers not aligned to the lane size.
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    return v_reg<_Tp, simd512_width / sizeof(_Tp)>(ptr);
}
#endif
1644 
1645 /** @brief Load register contents from memory (aligned)
1646 
1647 similar to cv::v_load, but source memory block should be aligned (to 16-byte boundary in case of SIMD128, 32-byte - SIMD256, etc)
1648 
1649 @note Use vx_load_aligned version to get maximum available register length result
1650 */
1651 template<typename _Tp>
v_load_aligned(const _Tp * ptr)1652 inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load_aligned(const _Tp* ptr)
1653 {
1654     CV_Assert(isAligned<sizeof(v_reg<_Tp, simd128_width / sizeof(_Tp)>)>(ptr));
1655     return v_reg<_Tp, simd128_width / sizeof(_Tp)>(ptr);
1656 }
1657 
1658 #if CV_SIMD256
1659 /** @brief Load register contents from memory (aligned)
1660 
1661 similar to cv::v256_load, but source memory block should be aligned (to 32-byte boundary in case of SIMD256, 64-byte - SIMD512, etc)
1662 
1663 @note Check CV_SIMD256 preprocessor definition prior to use.
1664 Use vx_load_aligned version to get maximum available register length result
1665 */
1666 template<typename _Tp>
v256_load_aligned(const _Tp * ptr)1667 inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load_aligned(const _Tp* ptr)
1668 {
1669     CV_Assert(isAligned<sizeof(v_reg<_Tp, simd256_width / sizeof(_Tp)>)>(ptr));
1670     return v_reg<_Tp, simd256_width / sizeof(_Tp)>(ptr);
1671 }
1672 #endif
1673 
1674 #if CV_SIMD512
1675 /** @brief Load register contents from memory (aligned)
1676 
1677 similar to cv::v512_load, but source memory block should be aligned (to 64-byte boundary in case of SIMD512, etc)
1678 
1679 @note Check CV_SIMD512 preprocessor definition prior to use.
1680 Use vx_load_aligned version to get maximum available register length result
1681 */
1682 template<typename _Tp>
v512_load_aligned(const _Tp * ptr)1683 inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load_aligned(const _Tp* ptr)
1684 {
1685     CV_Assert(isAligned<sizeof(v_reg<_Tp, simd512_width / sizeof(_Tp)>)>(ptr));
1686     return v_reg<_Tp, simd512_width / sizeof(_Tp)>(ptr);
1687 }
1688 #endif
1689 
1690 /** @brief Load 64-bits of data to lower part (high part is undefined).
1691 
1692 @param ptr memory block containing data for first half (0..n/2)
1693 
1694 @code{.cpp}
1695 int lo[2] = { 1, 2 };
1696 v_int32x4 r = v_load_low(lo);
1697 @endcode
1698 
1699 @note Use vx_load_low version to get maximum available register length result
1700 */
1701 template<typename _Tp>
v_load_low(const _Tp * ptr)1702 inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load_low(const _Tp* ptr)
1703 {
1704 #if CV_STRONG_ALIGNMENT
1705     CV_Assert(isAligned<sizeof(_Tp)>(ptr));
1706 #endif
1707     v_reg<_Tp, simd128_width / sizeof(_Tp)> c;
1708     for( int i = 0; i < c.nlanes/2; i++ )
1709     {
1710         c.s[i] = ptr[i];
1711     }
1712     return c;
1713 }
1714 
1715 #if CV_SIMD256
1716 /** @brief Load 128-bits of data to lower part (high part is undefined).
1717 
1718 @param ptr memory block containing data for first half (0..n/2)
1719 
1720 @code{.cpp}
1721 int lo[4] = { 1, 2, 3, 4 };
1722 v_int32x8 r = v256_load_low(lo);
1723 @endcode
1724 
1725 @note Check CV_SIMD256 preprocessor definition prior to use.
1726 Use vx_load_low version to get maximum available register length result
1727 */
1728 template<typename _Tp>
v256_load_low(const _Tp * ptr)1729 inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load_low(const _Tp* ptr)
1730 {
1731 #if CV_STRONG_ALIGNMENT
1732     CV_Assert(isAligned<sizeof(_Tp)>(ptr));
1733 #endif
1734     v_reg<_Tp, simd256_width / sizeof(_Tp)> c;
1735     for (int i = 0; i < c.nlanes / 2; i++)
1736     {
1737         c.s[i] = ptr[i];
1738     }
1739     return c;
1740 }
1741 #endif
1742 
1743 #if CV_SIMD512
1744 /** @brief Load 256-bits of data to lower part (high part is undefined).
1745 
1746 @param ptr memory block containing data for first half (0..n/2)
1747 
1748 @code{.cpp}
1749 int lo[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
1750 v_int32x16 r = v512_load_low(lo);
1751 @endcode
1752 
1753 @note Check CV_SIMD512 preprocessor definition prior to use.
1754 Use vx_load_low version to get maximum available register length result
1755 */
1756 template<typename _Tp>
v512_load_low(const _Tp * ptr)1757 inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load_low(const _Tp* ptr)
1758 {
1759 #if CV_STRONG_ALIGNMENT
1760     CV_Assert(isAligned<sizeof(_Tp)>(ptr));
1761 #endif
1762     v_reg<_Tp, simd512_width / sizeof(_Tp)> c;
1763     for (int i = 0; i < c.nlanes / 2; i++)
1764     {
1765         c.s[i] = ptr[i];
1766     }
1767     return c;
1768 }
1769 #endif
1770 
1771 /** @brief Load register contents from two memory blocks
1772 
1773 @param loptr memory block containing data for first half (0..n/2)
1774 @param hiptr memory block containing data for second half (n/2..n)
1775 
1776 @code{.cpp}
1777 int lo[2] = { 1, 2 }, hi[2] = { 3, 4 };
1778 v_int32x4 r = v_load_halves(lo, hi);
1779 @endcode
1780 
1781 @note Use vx_load_halves version to get maximum available register length result
1782 */
1783 template<typename _Tp>
v_load_halves(const _Tp * loptr,const _Tp * hiptr)1784 inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
1785 {
1786 #if CV_STRONG_ALIGNMENT
1787     CV_Assert(isAligned<sizeof(_Tp)>(loptr));
1788     CV_Assert(isAligned<sizeof(_Tp)>(hiptr));
1789 #endif
1790     v_reg<_Tp, simd128_width / sizeof(_Tp)> c;
1791     for( int i = 0; i < c.nlanes/2; i++ )
1792     {
1793         c.s[i] = loptr[i];
1794         c.s[i+c.nlanes/2] = hiptr[i];
1795     }
1796     return c;
1797 }
1798 
1799 #if CV_SIMD256
1800 /** @brief Load register contents from two memory blocks
1801 
1802 @param loptr memory block containing data for first half (0..n/2)
1803 @param hiptr memory block containing data for second half (n/2..n)
1804 
1805 @code{.cpp}
1806 int lo[4] = { 1, 2, 3, 4 }, hi[4] = { 5, 6, 7, 8 };
1807 v_int32x8 r = v256_load_halves(lo, hi);
1808 @endcode
1809 
1810 @note Check CV_SIMD256 preprocessor definition prior to use.
1811 Use vx_load_halves version to get maximum available register length result
1812 */
1813 template<typename _Tp>
v256_load_halves(const _Tp * loptr,const _Tp * hiptr)1814 inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load_halves(const _Tp* loptr, const _Tp* hiptr)
1815 {
1816 #if CV_STRONG_ALIGNMENT
1817     CV_Assert(isAligned<sizeof(_Tp)>(loptr));
1818     CV_Assert(isAligned<sizeof(_Tp)>(hiptr));
1819 #endif
1820     v_reg<_Tp, simd256_width / sizeof(_Tp)> c;
1821     for (int i = 0; i < c.nlanes / 2; i++)
1822     {
1823         c.s[i] = loptr[i];
1824         c.s[i + c.nlanes / 2] = hiptr[i];
1825     }
1826     return c;
1827 }
1828 #endif
1829 
1830 #if CV_SIMD512
1831 /** @brief Load register contents from two memory blocks
1832 
1833 @param loptr memory block containing data for first half (0..n/2)
1834 @param hiptr memory block containing data for second half (n/2..n)
1835 
1836 @code{.cpp}
int lo[8] = { 1, 2, 3, 4, 5, 6, 7, 8 }, hi[8] = { 9, 10, 11, 12, 13, 14, 15, 16 };
1838 v_int32x16 r = v512_load_halves(lo, hi);
1839 @endcode
1840 
1841 @note Check CV_SIMD512 preprocessor definition prior to use.
1842 Use vx_load_halves version to get maximum available register length result
1843 */
1844 template<typename _Tp>
v512_load_halves(const _Tp * loptr,const _Tp * hiptr)1845 inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load_halves(const _Tp* loptr, const _Tp* hiptr)
1846 {
1847 #if CV_STRONG_ALIGNMENT
1848     CV_Assert(isAligned<sizeof(_Tp)>(loptr));
1849     CV_Assert(isAligned<sizeof(_Tp)>(hiptr));
1850 #endif
1851     v_reg<_Tp, simd512_width / sizeof(_Tp)> c;
1852     for (int i = 0; i < c.nlanes / 2; i++)
1853     {
1854         c.s[i] = loptr[i];
1855         c.s[i + c.nlanes / 2] = hiptr[i];
1856     }
1857     return c;
1858 }
1859 #endif
1860 
1861 /** @brief Load register contents from memory with double expand
1862 
1863 Same as cv::v_load, but result pack type will be 2x wider than memory type.
1864 
1865 @code{.cpp}
1866 short buf[4] = {1, 2, 3, 4}; // type is int16
1867 v_int32x4 r = v_load_expand(buf); // r = {1, 2, 3, 4} - type is int32
1868 @endcode
1869 For 8-, 16-, 32-bit integer source types.
1870 
1871 @note Use vx_load_expand version to get maximum available register length result
1872 */
1873 template<typename _Tp>
1874 inline v_reg<typename V_TypeTraits<_Tp>::w_type, simd128_width / sizeof(typename V_TypeTraits<_Tp>::w_type)>
v_load_expand(const _Tp * ptr)1875 v_load_expand(const _Tp* ptr)
1876 {
1877 #if CV_STRONG_ALIGNMENT
1878     CV_Assert(isAligned<sizeof(_Tp)>(ptr));
1879 #endif
1880     typedef typename V_TypeTraits<_Tp>::w_type w_type;
1881     v_reg<w_type, simd128_width / sizeof(w_type)> c;
1882     for( int i = 0; i < c.nlanes; i++ )
1883     {
1884         c.s[i] = ptr[i];
1885     }
1886     return c;
1887 }
1888 
1889 #if CV_SIMD256
1890 /** @brief Load register contents from memory with double expand
1891 
1892 Same as cv::v256_load, but result pack type will be 2x wider than memory type.
1893 
1894 @code{.cpp}
1895 short buf[8] = {1, 2, 3, 4, 5, 6, 7, 8}; // type is int16
1896 v_int32x8 r = v256_load_expand(buf); // r = {1, 2, 3, 4, 5, 6, 7, 8} - type is int32
1897 @endcode
1898 For 8-, 16-, 32-bit integer source types.
1899 
1900 @note Check CV_SIMD256 preprocessor definition prior to use.
1901 Use vx_load_expand version to get maximum available register length result
1902 */
1903 template<typename _Tp>
1904 inline v_reg<typename V_TypeTraits<_Tp>::w_type, simd256_width / sizeof(typename V_TypeTraits<_Tp>::w_type)>
v256_load_expand(const _Tp * ptr)1905 v256_load_expand(const _Tp* ptr)
1906 {
1907 #if CV_STRONG_ALIGNMENT
1908     CV_Assert(isAligned<sizeof(_Tp)>(ptr));
1909 #endif
1910     typedef typename V_TypeTraits<_Tp>::w_type w_type;
1911     v_reg<w_type, simd256_width / sizeof(w_type)> c;
1912     for (int i = 0; i < c.nlanes; i++)
1913     {
1914         c.s[i] = ptr[i];
1915     }
1916     return c;
1917 }
1918 #endif
1919 
1920 #if CV_SIMD512
1921 /** @brief Load register contents from memory with double expand
1922 
1923 Same as cv::v512_load, but result pack type will be 2x wider than memory type.
1924 
1925 @code{.cpp}
short buf[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; // type is int16
1927 v_int32x16 r = v512_load_expand(buf); // r = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} - type is int32
1928 @endcode
1929 For 8-, 16-, 32-bit integer source types.
1930 
1931 @note Check CV_SIMD512 preprocessor definition prior to use.
1932 Use vx_load_expand version to get maximum available register length result
1933 */
1934 template<typename _Tp>
1935 inline v_reg<typename V_TypeTraits<_Tp>::w_type, simd512_width / sizeof(typename V_TypeTraits<_Tp>::w_type)>
v512_load_expand(const _Tp * ptr)1936 v512_load_expand(const _Tp* ptr)
1937 {
1938 #if CV_STRONG_ALIGNMENT
1939     CV_Assert(isAligned<sizeof(_Tp)>(ptr));
1940 #endif
1941     typedef typename V_TypeTraits<_Tp>::w_type w_type;
1942     v_reg<w_type, simd512_width / sizeof(w_type)> c;
1943     for (int i = 0; i < c.nlanes; i++)
1944     {
1945         c.s[i] = ptr[i];
1946     }
1947     return c;
1948 }
1949 #endif
1950 
1951 /** @brief Load register contents from memory with quad expand
1952 
1953 Same as cv::v_load_expand, but result type is 4 times wider than source.
1954 @code{.cpp}
1955 char buf[4] = {1, 2, 3, 4}; // type is int8
1956 v_int32x4 r = v_load_expand_q(buf); // r = {1, 2, 3, 4} - type is int32
1957 @endcode
1958 For 8-bit integer source types.
1959 
1960 @note Use vx_load_expand_q version to get maximum available register length result
1961 */
1962 template<typename _Tp>
1963 inline v_reg<typename V_TypeTraits<_Tp>::q_type, simd128_width / sizeof(typename V_TypeTraits<_Tp>::q_type)>
v_load_expand_q(const _Tp * ptr)1964 v_load_expand_q(const _Tp* ptr)
1965 {
1966 #if CV_STRONG_ALIGNMENT
1967     CV_Assert(isAligned<sizeof(_Tp)>(ptr));
1968 #endif
1969     typedef typename V_TypeTraits<_Tp>::q_type q_type;
1970     v_reg<q_type, simd128_width / sizeof(q_type)> c;
1971     for( int i = 0; i < c.nlanes; i++ )
1972     {
1973         c.s[i] = ptr[i];
1974     }
1975     return c;
1976 }
1977 
1978 #if CV_SIMD256
1979 /** @brief Load register contents from memory with quad expand
1980 
1981 Same as cv::v256_load_expand, but result type is 4 times wider than source.
1982 @code{.cpp}
1983 char buf[8] = {1, 2, 3, 4, 5, 6, 7, 8}; // type is int8
1984 v_int32x8 r = v256_load_expand_q(buf); // r = {1, 2, 3, 4, 5, 6, 7, 8} - type is int32
1985 @endcode
1986 For 8-bit integer source types.
1987 
1988 @note Check CV_SIMD256 preprocessor definition prior to use.
1989 Use vx_load_expand_q version to get maximum available register length result
1990 */
1991 template<typename _Tp>
1992 inline v_reg<typename V_TypeTraits<_Tp>::q_type, simd256_width / sizeof(typename V_TypeTraits<_Tp>::q_type)>
v256_load_expand_q(const _Tp * ptr)1993 v256_load_expand_q(const _Tp* ptr)
1994 {
1995 #if CV_STRONG_ALIGNMENT
1996     CV_Assert(isAligned<sizeof(_Tp)>(ptr));
1997 #endif
1998     typedef typename V_TypeTraits<_Tp>::q_type q_type;
1999     v_reg<q_type, simd256_width / sizeof(q_type)> c;
2000     for (int i = 0; i < c.nlanes; i++)
2001     {
2002         c.s[i] = ptr[i];
2003     }
2004     return c;
2005 }
2006 #endif
2007 
2008 #if CV_SIMD512
2009 /** @brief Load register contents from memory with quad expand
2010 
2011 Same as cv::v512_load_expand, but result type is 4 times wider than source.
2012 @code{.cpp}
2013 char buf[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; // type is int8
2014 v_int32x16 r = v512_load_expand_q(buf); // r = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} - type is int32
2015 @endcode
2016 For 8-bit integer source types.
2017 
2018 @note Check CV_SIMD512 preprocessor definition prior to use.
2019 Use vx_load_expand_q version to get maximum available register length result
2020 */
2021 template<typename _Tp>
2022 inline v_reg<typename V_TypeTraits<_Tp>::q_type, simd512_width / sizeof(typename V_TypeTraits<_Tp>::q_type)>
v512_load_expand_q(const _Tp * ptr)2023 v512_load_expand_q(const _Tp* ptr)
2024 {
2025 #if CV_STRONG_ALIGNMENT
2026     CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2027 #endif
2028     typedef typename V_TypeTraits<_Tp>::q_type q_type;
2029     v_reg<q_type, simd512_width / sizeof(q_type)> c;
2030     for (int i = 0; i < c.nlanes; i++)
2031     {
2032         c.s[i] = ptr[i];
2033     }
2034     return c;
2035 }
2036 #endif
2037 
2038 /** @brief Load and deinterleave (2 channels)
2039 
2040 Load data from memory deinterleave and store to 2 registers.
2041 Scheme:
2042 @code
2043 {A1 B1 A2 B2 ...} ==> {A1 A2 ...}, {B1 B2 ...}
2044 @endcode
2045 For all types except 64-bit. */
v_load_deinterleave(const _Tp * ptr,v_reg<_Tp,n> & a,v_reg<_Tp,n> & b)2046 template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
2047                                                             v_reg<_Tp, n>& b)
2048 {
2049 #if CV_STRONG_ALIGNMENT
2050     CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2051 #endif
2052     int i, i2;
2053     for( i = i2 = 0; i < n; i++, i2 += 2 )
2054     {
2055         a.s[i] = ptr[i2];
2056         b.s[i] = ptr[i2+1];
2057     }
2058 }
2059 
2060 /** @brief Load and deinterleave (3 channels)
2061 
2062 Load data from memory deinterleave and store to 3 registers.
2063 Scheme:
2064 @code
2065 {A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}
2066 @endcode
2067 For all types except 64-bit. */
v_load_deinterleave(const _Tp * ptr,v_reg<_Tp,n> & a,v_reg<_Tp,n> & b,v_reg<_Tp,n> & c)2068 template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
2069                                                             v_reg<_Tp, n>& b, v_reg<_Tp, n>& c)
2070 {
2071 #if CV_STRONG_ALIGNMENT
2072     CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2073 #endif
2074     int i, i3;
2075     for( i = i3 = 0; i < n; i++, i3 += 3 )
2076     {
2077         a.s[i] = ptr[i3];
2078         b.s[i] = ptr[i3+1];
2079         c.s[i] = ptr[i3+2];
2080     }
2081 }
2082 
2083 /** @brief Load and deinterleave (4 channels)
2084 
2085 Load data from memory deinterleave and store to 4 registers.
2086 Scheme:
2087 @code
2088 {A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...}
2089 @endcode
2090 For all types except 64-bit. */
2091 template<typename _Tp, int n>
v_load_deinterleave(const _Tp * ptr,v_reg<_Tp,n> & a,v_reg<_Tp,n> & b,v_reg<_Tp,n> & c,v_reg<_Tp,n> & d)2092 inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
2093                                 v_reg<_Tp, n>& b, v_reg<_Tp, n>& c,
2094                                 v_reg<_Tp, n>& d)
2095 {
2096 #if CV_STRONG_ALIGNMENT
2097     CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2098 #endif
2099     int i, i4;
2100     for( i = i4 = 0; i < n; i++, i4 += 4 )
2101     {
2102         a.s[i] = ptr[i4];
2103         b.s[i] = ptr[i4+1];
2104         c.s[i] = ptr[i4+2];
2105         d.s[i] = ptr[i4+3];
2106     }
2107 }
2108 
2109 /** @brief Interleave and store (2 channels)
2110 
2111 Interleave and store data from 2 registers to memory.
2112 Scheme:
2113 @code
2114 {A1 A2 ...}, {B1 B2 ...} ==> {A1 B1 A2 B2 ...}
2115 @endcode
2116 For all types except 64-bit. */
2117 template<typename _Tp, int n>
v_store_interleave(_Tp * ptr,const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b,hal::StoreMode=hal::STORE_UNALIGNED)2118 inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
2119                                const v_reg<_Tp, n>& b,
2120                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
2121 {
2122 #if CV_STRONG_ALIGNMENT
2123     CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2124 #endif
2125     int i, i2;
2126     for( i = i2 = 0; i < n; i++, i2 += 2 )
2127     {
2128         ptr[i2] = a.s[i];
2129         ptr[i2+1] = b.s[i];
2130     }
2131 }
2132 
2133 /** @brief Interleave and store (3 channels)
2134 
2135 Interleave and store data from 3 registers to memory.
2136 Scheme:
2137 @code
2138 {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...} ==> {A1 B1 C1 A2 B2 C2 ...}
2139 @endcode
2140 For all types except 64-bit. */
2141 template<typename _Tp, int n>
v_store_interleave(_Tp * ptr,const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b,const v_reg<_Tp,n> & c,hal::StoreMode=hal::STORE_UNALIGNED)2142 inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
2143                                 const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
2144                                 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
2145 {
2146 #if CV_STRONG_ALIGNMENT
2147     CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2148 #endif
2149     int i, i3;
2150     for( i = i3 = 0; i < n; i++, i3 += 3 )
2151     {
2152         ptr[i3] = a.s[i];
2153         ptr[i3+1] = b.s[i];
2154         ptr[i3+2] = c.s[i];
2155     }
2156 }
2157 
2158 /** @brief Interleave and store (4 channels)
2159 
2160 Interleave and store data from 4 registers to memory.
2161 Scheme:
2162 @code
2163 {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} ==> {A1 B1 C1 D1 A2 B2 C2 D2 ...}
2164 @endcode
2165 For all types except 64-bit. */
v_store_interleave(_Tp * ptr,const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b,const v_reg<_Tp,n> & c,const v_reg<_Tp,n> & d,hal::StoreMode=hal::STORE_UNALIGNED)2166 template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
2167                                                             const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
2168                                                             const v_reg<_Tp, n>& d,
2169                                                             hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
2170 {
2171 #if CV_STRONG_ALIGNMENT
2172     CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2173 #endif
2174     int i, i4;
2175     for( i = i4 = 0; i < n; i++, i4 += 4 )
2176     {
2177         ptr[i4] = a.s[i];
2178         ptr[i4+1] = b.s[i];
2179         ptr[i4+2] = c.s[i];
2180         ptr[i4+3] = d.s[i];
2181     }
2182 }
2183 
2184 /** @brief Store data to memory
2185 
2186 Store register contents to memory.
2187 Scheme:
2188 @code
2189   REG {A B C D} ==> MEM {A B C D}
2190 @endcode
2191 Pointer can be unaligned. */
2192 template<typename _Tp, int n>
v_store(_Tp * ptr,const v_reg<_Tp,n> & a)2193 inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a)
2194 {
2195 #if CV_STRONG_ALIGNMENT
2196     CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2197 #endif
2198     for( int i = 0; i < n; i++ )
2199         ptr[i] = a.s[i];
2200 }
2201 
2202 template<typename _Tp, int n>
v_store(_Tp * ptr,const v_reg<_Tp,n> & a,hal::StoreMode)2203 inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/)
2204 {
2205 #if CV_STRONG_ALIGNMENT
2206     CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2207 #endif
2208     v_store(ptr, a);
2209 }
2210 
2211 /** @brief Store data to memory (lower half)
2212 
2213 Store lower half of register contents to memory.
2214 Scheme:
2215 @code
2216   REG {A B C D} ==> MEM {A B}
2217 @endcode */
2218 template<typename _Tp, int n>
v_store_low(_Tp * ptr,const v_reg<_Tp,n> & a)2219 inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a)
2220 {
2221 #if CV_STRONG_ALIGNMENT
2222     CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2223 #endif
2224     for( int i = 0; i < (n/2); i++ )
2225         ptr[i] = a.s[i];
2226 }
2227 
2228 /** @brief Store data to memory (higher half)
2229 
2230 Store higher half of register contents to memory.
2231 Scheme:
2232 @code
2233   REG {A B C D} ==> MEM {C D}
2234 @endcode */
2235 template<typename _Tp, int n>
v_store_high(_Tp * ptr,const v_reg<_Tp,n> & a)2236 inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a)
2237 {
2238 #if CV_STRONG_ALIGNMENT
2239     CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2240 #endif
2241     for( int i = 0; i < (n/2); i++ )
2242         ptr[i] = a.s[i+(n/2)];
2243 }
2244 
2245 /** @brief Store data to memory (aligned)
2246 
2247 Store register contents to memory.
2248 Scheme:
2249 @code
2250   REG {A B C D} ==> MEM {A B C D}
2251 @endcode
2252 Pointer __should__ be aligned by 16-byte boundary. */
2253 template<typename _Tp, int n>
v_store_aligned(_Tp * ptr,const v_reg<_Tp,n> & a)2254 inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
2255 {
2256     CV_Assert(isAligned<sizeof(v_reg<_Tp, n>)>(ptr));
2257     v_store(ptr, a);
2258 }
2259 
// Aligned store with a non-temporal (cache-bypassing) hint. This scalar
// fallback has no such instruction, so it behaves exactly like
// v_store_aligned: it only enforces full-register alignment and forwards
// to v_store.
template<typename _Tp, int n>
inline void v_store_aligned_nocache(_Tp* ptr, const v_reg<_Tp, n>& a)
{
    CV_Assert(isAligned<sizeof(v_reg<_Tp, n>)>(ptr));
    v_store(ptr, a);
}
2266 
2267 template<typename _Tp, int n>
v_store_aligned(_Tp * ptr,const v_reg<_Tp,n> & a,hal::StoreMode)2268 inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/)
2269 {
2270     CV_Assert(isAligned<sizeof(v_reg<_Tp, n>)>(ptr));
2271     v_store(ptr, a);
2272 }
2273 
2274 /** @brief Combine vector from first elements of two vectors
2275 
2276 Scheme:
2277 @code
2278   {A1 A2 A3 A4}
2279   {B1 B2 B3 B4}
2280 ---------------
2281   {A1 A2 B1 B2}
2282 @endcode
2283 For all types except 64-bit. */
2284 template<typename _Tp, int n>
v_combine_low(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b)2285 inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
2286 {
2287     v_reg<_Tp, n> c;
2288     for( int i = 0; i < (n/2); i++ )
2289     {
2290         c.s[i] = a.s[i];
2291         c.s[i+(n/2)] = b.s[i];
2292     }
2293     return c;
2294 }
2295 
2296 /** @brief Combine vector from last elements of two vectors
2297 
2298 Scheme:
2299 @code
2300   {A1 A2 A3 A4}
2301   {B1 B2 B3 B4}
2302 ---------------
2303   {A3 A4 B3 B4}
2304 @endcode
2305 For all types except 64-bit. */
2306 template<typename _Tp, int n>
v_combine_high(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b)2307 inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
2308 {
2309     v_reg<_Tp, n> c;
2310     for( int i = 0; i < (n/2); i++ )
2311     {
2312         c.s[i] = a.s[i+(n/2)];
2313         c.s[i+(n/2)] = b.s[i+(n/2)];
2314     }
2315     return c;
2316 }
2317 
2318 /** @brief Combine two vectors from lower and higher parts of two other vectors
2319 
2320 @code{.cpp}
2321 low = cv::v_combine_low(a, b);
2322 high = cv::v_combine_high(a, b);
2323 @endcode */
2324 template<typename _Tp, int n>
v_recombine(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b,v_reg<_Tp,n> & low,v_reg<_Tp,n> & high)2325 inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
2326                         v_reg<_Tp, n>& low, v_reg<_Tp, n>& high)
2327 {
2328     for( int i = 0; i < (n/2); i++ )
2329     {
2330         low.s[i] = a.s[i];
2331         low.s[i+(n/2)] = b.s[i];
2332         high.s[i] = a.s[i+(n/2)];
2333         high.s[i+(n/2)] = b.s[i+(n/2)];
2334     }
2335 }
2336 
2337 /** @brief Vector reverse order
2338 
2339 Reverse the order of the vector
2340 Scheme:
2341 @code
2342   REG {A1 ... An} ==> REG {An ... A1}
2343 @endcode
2344 For all types. */
2345 template<typename _Tp, int n>
v_reverse(const v_reg<_Tp,n> & a)2346 inline v_reg<_Tp, n> v_reverse(const v_reg<_Tp, n>& a)
2347 {
2348     v_reg<_Tp, n> c;
2349     for( int i = 0; i < n; i++ )
2350         c.s[i] = a.s[n-i-1];
2351     return c;
2352 }
2353 
2354 /** @brief Vector extract
2355 
2356 Scheme:
2357 @code
2358   {A1 A2 A3 A4}
2359   {B1 B2 B3 B4}
2360 ========================
2361 shift = 1  {A2 A3 A4 B1}
2362 shift = 2  {A3 A4 B1 B2}
2363 shift = 3  {A4 B1 B2 B3}
2364 @endcode
2365 Restriction: 0 <= shift < nlanes
2366 
2367 Usage:
2368 @code
2369 v_int32x4 a, b, c;
2370 c = v_extract<2>(a, b);
2371 @endcode
2372 For all types. */
2373 template<int s, typename _Tp, int n>
v_extract(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b)2374 inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
2375 {
2376     v_reg<_Tp, n> r;
2377     const int shift = n - s;
2378     int i = 0;
2379     for (; i < shift; ++i)
2380         r.s[i] = a.s[i+s];
2381     for (; i < n; ++i)
2382         r.s[i] = b.s[i-shift];
2383     return r;
2384 }
2385 
2386 /** @brief Vector extract
2387 
2388 Scheme:
2389 Return the s-th element of v.
2390 Restriction: 0 <= s < nlanes
2391 
2392 Usage:
2393 @code
2394 v_int32x4 a;
2395 int r;
2396 r = v_extract_n<2>(a);
2397 @endcode
2398 For all types. */
// Return lane s of the register. The lane index is validated in debug
// builds only (CV_DbgAssert compiles away in release builds).
template<int s, typename _Tp, int n>
inline _Tp v_extract_n(const v_reg<_Tp, n>& v)
{
    CV_DbgAssert(s >= 0 && s < n);
    return v.s[s];
}
2405 
2406 /** @brief Broadcast i-th element of vector
2407 
2408 Scheme:
2409 @code
2410 { v[0] v[1] v[2] ... v[SZ] } => { v[i], v[i], v[i] ... v[i] }
2411 @endcode
2412 Restriction: 0 <= i < nlanes
2413 Supported types: 32-bit integers and floats (s32/u32/f32)
2414  */
// Build a register with every lane set to lane i of the input. The lane
// index is validated in debug builds only.
template<int i, typename _Tp, int n>
inline v_reg<_Tp, n> v_broadcast_element(const v_reg<_Tp, n>& a)
{
    CV_DbgAssert(i >= 0 && i < n);
    return v_reg<_Tp, n>::all(a.s[i]);
}
2421 
2422 /** @brief Round elements
2423 
2424 Rounds each value. Input type is float vector ==> output type is int vector.
2425 @note Only for floating point types.
2426 */
v_round(const v_reg<float,n> & a)2427 template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
2428 {
2429     v_reg<int, n> c;
2430     for( int i = 0; i < n; i++ )
2431         c.s[i] = cvRound(a.s[i]);
2432     return c;
2433 }
2434 
2435 /** @overload */
v_round(const v_reg<double,n> & a,const v_reg<double,n> & b)2436 template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a, const v_reg<double, n>& b)
2437 {
2438     v_reg<int, n*2> c;
2439     for( int i = 0; i < n; i++ )
2440     {
2441         c.s[i] = cvRound(a.s[i]);
2442         c.s[i+n] = cvRound(b.s[i]);
2443     }
2444     return c;
2445 }
2446 
2447 /** @brief Floor elements
2448 
2449 Floor each value. Input type is float vector ==> output type is int vector.
2450 @note Only for floating point types.
2451 */
v_floor(const v_reg<float,n> & a)2452 template<int n> inline v_reg<int, n> v_floor(const v_reg<float, n>& a)
2453 {
2454     v_reg<int, n> c;
2455     for( int i = 0; i < n; i++ )
2456         c.s[i] = cvFloor(a.s[i]);
2457     return c;
2458 }
2459 
2460 /** @brief Ceil elements
2461 
2462 Ceil each value. Input type is float vector ==> output type is int vector.
2463 @note Only for floating point types.
2464 */
v_ceil(const v_reg<float,n> & a)2465 template<int n> inline v_reg<int, n> v_ceil(const v_reg<float, n>& a)
2466 {
2467     v_reg<int, n> c;
2468     for( int i = 0; i < n; i++ )
2469         c.s[i] = cvCeil(a.s[i]);
2470     return c;
2471 }
2472 
2473 /** @brief Truncate elements
2474 
2475 Truncate each value. Input type is float vector ==> output type is int vector.
2476 @note Only for floating point types.
2477 */
v_trunc(const v_reg<float,n> & a)2478 template<int n> inline v_reg<int, n> v_trunc(const v_reg<float, n>& a)
2479 {
2480     v_reg<int, n> c;
2481     for( int i = 0; i < n; i++ )
2482         c.s[i] = (int)(a.s[i]);
2483     return c;
2484 }
2485 
2486 /** @overload */
v_round(const v_reg<double,n> & a)2487 template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a)
2488 {
2489     v_reg<int, n*2> c;
2490     for( int i = 0; i < n; i++ )
2491     {
2492         c.s[i] = cvRound(a.s[i]);
2493         c.s[i+n] = 0;
2494     }
2495     return c;
2496 }
2497 
2498 /** @overload */
v_floor(const v_reg<double,n> & a)2499 template<int n> inline v_reg<int, n*2> v_floor(const v_reg<double, n>& a)
2500 {
2501     v_reg<int, n*2> c;
2502     for( int i = 0; i < n; i++ )
2503     {
2504         c.s[i] = cvFloor(a.s[i]);
2505         c.s[i+n] = 0;
2506     }
2507     return c;
2508 }
2509 
2510 /** @overload */
v_ceil(const v_reg<double,n> & a)2511 template<int n> inline v_reg<int, n*2> v_ceil(const v_reg<double, n>& a)
2512 {
2513     v_reg<int, n*2> c;
2514     for( int i = 0; i < n; i++ )
2515     {
2516         c.s[i] = cvCeil(a.s[i]);
2517         c.s[i+n] = 0;
2518     }
2519     return c;
2520 }
2521 
2522 /** @overload */
v_trunc(const v_reg<double,n> & a)2523 template<int n> inline v_reg<int, n*2> v_trunc(const v_reg<double, n>& a)
2524 {
2525     v_reg<int, n*2> c;
2526     for( int i = 0; i < n; i++ )
2527     {
2528         c.s[i] = (int)(a.s[i]);
2529         c.s[i+n] = 0;
2530     }
2531     return c;
2532 }
2533 
2534 /** @brief Convert to float
2535 
2536 Supported input type is cv::v_int32. */
v_cvt_f32(const v_reg<int,n> & a)2537 template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a)
2538 {
2539     v_reg<float, n> c;
2540     for( int i = 0; i < n; i++ )
2541         c.s[i] = (float)a.s[i];
2542     return c;
2543 }
2544 
2545 /** @brief Convert lower half to float
2546 
2547 Supported input type is cv::v_float64. */
v_cvt_f32(const v_reg<double,n> & a)2548 template<int n> inline v_reg<float, n*2> v_cvt_f32(const v_reg<double, n>& a)
2549 {
2550     v_reg<float, n*2> c;
2551     for( int i = 0; i < n; i++ )
2552     {
2553         c.s[i] = (float)a.s[i];
2554         c.s[i+n] = 0;
2555     }
2556     return c;
2557 }
2558 
2559 /** @brief Convert to float
2560 
2561 Supported input type is cv::v_float64. */
v_cvt_f32(const v_reg<double,n> & a,const v_reg<double,n> & b)2562 template<int n> inline v_reg<float, n*2> v_cvt_f32(const v_reg<double, n>& a, const v_reg<double, n>& b)
2563 {
2564     v_reg<float, n*2> c;
2565     for( int i = 0; i < n; i++ )
2566     {
2567         c.s[i] = (float)a.s[i];
2568         c.s[i+n] = (float)b.s[i];
2569     }
2570     return c;
2571 }
2572 
2573 /** @brief Convert lower half to double
2574 
2575 Supported input type is cv::v_int32. */
v_cvt_f64(const v_reg<int,n> & a)2576 template<int n> CV_INLINE v_reg<double, n/2> v_cvt_f64(const v_reg<int, n>& a)
2577 {
2578     v_reg<double, (n/2)> c;
2579     for( int i = 0; i < (n/2); i++ )
2580         c.s[i] = (double)a.s[i];
2581     return c;
2582 }
2583 
2584 /** @brief Convert to double high part of vector
2585 
2586 Supported input type is cv::v_int32. */
v_cvt_f64_high(const v_reg<int,n> & a)2587 template<int n> CV_INLINE v_reg<double, (n/2)> v_cvt_f64_high(const v_reg<int, n>& a)
2588 {
2589     v_reg<double, (n/2)> c;
2590     for( int i = 0; i < (n/2); i++ )
2591         c.s[i] = (double)a.s[i + (n/2)];
2592     return c;
2593 }
2594 
2595 /** @brief Convert lower half to double
2596 
2597 Supported input type is cv::v_float32. */
v_cvt_f64(const v_reg<float,n> & a)2598 template<int n> CV_INLINE v_reg<double, (n/2)> v_cvt_f64(const v_reg<float, n>& a)
2599 {
2600     v_reg<double, (n/2)> c;
2601     for( int i = 0; i < (n/2); i++ )
2602         c.s[i] = (double)a.s[i];
2603     return c;
2604 }
2605 
2606 /** @brief Convert to double high part of vector
2607 
2608 Supported input type is cv::v_float32. */
v_cvt_f64_high(const v_reg<float,n> & a)2609 template<int n> CV_INLINE v_reg<double, (n/2)> v_cvt_f64_high(const v_reg<float, n>& a)
2610 {
2611     v_reg<double, (n/2)> c;
2612     for( int i = 0; i < (n/2); i++ )
2613         c.s[i] = (double)a.s[i + (n/2)];
2614     return c;
2615 }
2616 
2617 /** @brief Convert to double
2618 
2619 Supported input type is cv::v_int64. */
v_cvt_f64(const v_reg<int64,n> & a)2620 template<int n> CV_INLINE v_reg<double, n> v_cvt_f64(const v_reg<int64, n>& a)
2621 {
2622     v_reg<double, n> c;
2623     for( int i = 0; i < n; i++ )
2624         c.s[i] = (double)a.s[i];
2625     return c;
2626 }
2627 
2628 
v_lut(const _Tp * tab,const int * idx)2629 template<typename _Tp> inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_lut(const _Tp* tab, const int* idx)
2630 {
2631     v_reg<_Tp, simd128_width / sizeof(_Tp)> c;
2632     for (int i = 0; i < c.nlanes; i++)
2633         c.s[i] = tab[idx[i]];
2634     return c;
2635 }
v_lut_pairs(const _Tp * tab,const int * idx)2636 template<typename _Tp> inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_lut_pairs(const _Tp* tab, const int* idx)
2637 {
2638     v_reg<_Tp, simd128_width / sizeof(_Tp)> c;
2639     for (int i = 0; i < c.nlanes; i++)
2640         c.s[i] = tab[idx[i / 2] + i % 2];
2641     return c;
2642 }
v_lut_quads(const _Tp * tab,const int * idx)2643 template<typename _Tp> inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_lut_quads(const _Tp* tab, const int* idx)
2644 {
2645     v_reg<_Tp, simd128_width / sizeof(_Tp)> c;
2646     for (int i = 0; i < c.nlanes; i++)
2647         c.s[i] = tab[idx[i / 4] + i % 4];
2648     return c;
2649 }
2650 
template<int n> inline v_reg<int, n> v_lut(const int* tab, const v_reg<int, n>& idx)
{
    // Gather with vector indices: lane i is tab[idx.s[i]].
    v_reg<int, n> res;
    for( int lane = 0; lane < n; lane++ )
        res.s[lane] = tab[idx.s[lane]];
    return res;
}
2658 
template<int n> inline v_reg<unsigned, n> v_lut(const unsigned* tab, const v_reg<int, n>& idx)
{
    // Gather with vector indices: lane i is tab[idx.s[i]].
    // Fix: the accumulator must match the declared return type
    // v_reg<unsigned, n>; it was v_reg<int, n>, which both mismatches the
    // return type (v_reg has no cross-type converting constructor) and
    // loses the unsignedness of the looked-up values.
    v_reg<unsigned, n> c;
    for (int i = 0; i < n; i++)
        c.s[i] = tab[idx.s[i]];
    return c;
}
2666 
template<int n> inline v_reg<float, n> v_lut(const float* tab, const v_reg<int, n>& idx)
{
    // Gather with vector indices: lane i is tab[idx.s[i]].
    v_reg<float, n> res;
    for( int lane = 0; lane < n; lane++ )
        res.s[lane] = tab[idx.s[lane]];
    return res;
}
2674 
template<int n> inline v_reg<double, n/2> v_lut(const double* tab, const v_reg<int, n>& idx)
{
    // Gather for doubles: only the low n/2 indices are used, since a
    // double vector holds half as many lanes as an int vector.
    v_reg<double, n/2> res;
    for( int lane = 0; lane < n/2; lane++ )
        res.s[lane] = tab[idx.s[lane]];
    return res;
}
2682 
2683 
v_lut_deinterleave(const float * tab,const v_reg<int,n> & idx,v_reg<float,n> & x,v_reg<float,n> & y)2684 template<int n> inline void v_lut_deinterleave(const float* tab, const v_reg<int, n>& idx,
2685                                                v_reg<float, n>& x, v_reg<float, n>& y)
2686 {
2687     for( int i = 0; i < n; i++ )
2688     {
2689         int j = idx.s[i];
2690         x.s[i] = tab[j];
2691         y.s[i] = tab[j+1];
2692     }
2693 }
2694 
v_lut_deinterleave(const double * tab,const v_reg<int,n * 2> & idx,v_reg<double,n> & x,v_reg<double,n> & y)2695 template<int n> inline void v_lut_deinterleave(const double* tab, const v_reg<int, n*2>& idx,
2696                                                v_reg<double, n>& x, v_reg<double, n>& y)
2697 {
2698     for( int i = 0; i < n; i++ )
2699     {
2700         int j = idx.s[i];
2701         x.s[i] = tab[j];
2702         y.s[i] = tab[j+1];
2703     }
2704 }
2705 
template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_pairs(const v_reg<_Tp, n>& vec)
{
    // Within each group of 4 lanes, swap the two middle elements:
    // {a b c d} -> {a c b d}.
    v_reg<_Tp, n> res;
    for (int g = 0; g < n/4; g++)
    {
        const int base = 4*g;
        res.s[base    ] = vec.s[base    ];
        res.s[base + 1] = vec.s[base + 2];
        res.s[base + 2] = vec.s[base + 1];
        res.s[base + 3] = vec.s[base + 3];
    }
    return res;
}
2718 
template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_quads(const v_reg<_Tp, n>& vec)
{
    // Within each group of 8 lanes, interleave the two quads:
    // {a b c d e f g h} -> {a e b f c g d h}.
    v_reg<_Tp, n> res;
    for (int g = 0; g < n/8; g++)
    {
        const int base = 8*g;
        for (int k = 0; k < 4; k++)
        {
            res.s[base + 2*k    ] = vec.s[base + k    ];
            res.s[base + 2*k + 1] = vec.s[base + k + 4];
        }
    }
    return res;
}
2735 
template<typename _Tp, int n> inline v_reg<_Tp, n> v_pack_triplets(const v_reg<_Tp, n>& vec)
{
    // Compact each 4-lane group to its first 3 lanes, dropping lane 4k+3:
    // {a b c _ e f g _ ...} -> {a b c e f g ...}. Trailing lanes of the
    // result are left untouched (default-initialized).
    v_reg<_Tp, n> res;
    for (int g = 0; g < n/4; g++)
        for (int k = 0; k < 3; k++)
            res.s[3*g + k] = vec.s[4*g + k];
    return res;
}
2747 
2748 /** @brief Transpose 4x4 matrix
2749 
2750 Scheme:
2751 @code
2752 a0  {A1 A2 A3 A4}
2753 a1  {B1 B2 B3 B4}
2754 a2  {C1 C2 C3 C4}
2755 a3  {D1 D2 D3 D4}
2756 ===============
2757 b0  {A1 B1 C1 D1}
2758 b1  {A2 B2 C2 D2}
2759 b2  {A3 B3 C3 D3}
2760 b3  {A4 B4 C4 D4}
2761 @endcode
2762 */
2763 template<typename _Tp, int n>
v_transpose4x4(v_reg<_Tp,n> & a0,const v_reg<_Tp,n> & a1,const v_reg<_Tp,n> & a2,const v_reg<_Tp,n> & a3,v_reg<_Tp,n> & b0,v_reg<_Tp,n> & b1,v_reg<_Tp,n> & b2,v_reg<_Tp,n> & b3)2764 inline void v_transpose4x4( v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
2765                             const v_reg<_Tp, n>& a2, const v_reg<_Tp, n>& a3,
2766                             v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1,
2767                             v_reg<_Tp, n>& b2, v_reg<_Tp, n>& b3 )
2768 {
2769     for (int i = 0; i < n / 4; i++)
2770     {
2771         b0.s[0 + i*4] = a0.s[0 + i*4]; b0.s[1 + i*4] = a1.s[0 + i*4];
2772         b0.s[2 + i*4] = a2.s[0 + i*4]; b0.s[3 + i*4] = a3.s[0 + i*4];
2773         b1.s[0 + i*4] = a0.s[1 + i*4]; b1.s[1 + i*4] = a1.s[1 + i*4];
2774         b1.s[2 + i*4] = a2.s[1 + i*4]; b1.s[3 + i*4] = a3.s[1 + i*4];
2775         b2.s[0 + i*4] = a0.s[2 + i*4]; b2.s[1 + i*4] = a1.s[2 + i*4];
2776         b2.s[2 + i*4] = a2.s[2 + i*4]; b2.s[3 + i*4] = a3.s[2 + i*4];
2777         b3.s[0 + i*4] = a0.s[3 + i*4]; b3.s[1 + i*4] = a1.s[3 + i*4];
2778         b3.s[2 + i*4] = a2.s[3 + i*4]; b3.s[3 + i*4] = a3.s[3 + i*4];
2779     }
2780 }
2781 
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
// Generates e.g. v_setzero_u8() / v256_setzero_u8() returning an all-zero
// vector of the given type via _Tpvec::zero().
#define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, prefix, suffix) \
inline _Tpvec prefix##_setzero_##suffix() { return _Tpvec::zero(); }

//! @name Init with zero
//! @{
//! @brief Create new vector with zero elements
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x16, v, u8)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x16, v, s8)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x8, v, u16)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x8, v, s16)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x4, v, u32)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x4, v, s32)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x4, v, f32)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x2, v, f64)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x2, v, u64)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x2, v, s64)

// 256-bit variants (v256_ prefix), only when 256-bit SIMD is emulated/enabled.
#if CV_SIMD256
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x32, v256, u8)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x32, v256, s8)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x16, v256, u16)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x16, v256, s16)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x8, v256, u32)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x8, v256, s32)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x8, v256, f32)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x4, v256, f64)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x4, v256, u64)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x4, v256, s64)
#endif

// 512-bit variants (v512_ prefix).
#if CV_SIMD512
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x64, v512, u8)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x64, v512, s8)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x32, v512, u16)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x32, v512, s16)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x16, v512, u32)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x16, v512, s32)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x16, v512, f32)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x8, v512, f64)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x8, v512, u64)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x8, v512, s64)
#endif
//! @}
2827 
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
// Generates e.g. v_setall_u8(val) returning a vector with every lane set
// to val via _Tpvec::all(val).
#define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, prefix, suffix) \
inline _Tpvec prefix##_setall_##suffix(_Tp val) { return _Tpvec::all(val); }

//! @name Init with value
//! @{
//! @brief Create new vector with elements set to a specific value
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x16, uchar, v, u8)
OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x16, schar, v, s8)
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x8, ushort, v, u16)
OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x8, short, v, s16)
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x4, unsigned, v, u32)
OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x4, int, v, s32)
OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x4, float, v, f32)
OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x2, double, v, f64)
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x2, uint64, v, u64)
OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x2, int64, v, s64)

// 256-bit variants (v256_ prefix).
#if CV_SIMD256
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x32, uchar, v256, u8)
OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x32, schar, v256, s8)
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x16, ushort, v256, u16)
OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x16, short, v256, s16)
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x8, unsigned, v256, u32)
OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x8, int, v256, s32)
OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x8, float, v256, f32)
OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x4, double, v256, f64)
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x4, uint64, v256, u64)
OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x4, int64, v256, s64)
#endif

// 512-bit variants (v512_ prefix).
#if CV_SIMD512
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x64, uchar, v512, u8)
OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x64, schar, v512, s8)
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x32, ushort, v512, u16)
OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x32, short, v512, s16)
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x16, unsigned, v512, u32)
OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x16, int, v512, s32)
OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x16, float, v512, f32)
OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x8, double, v512, f64)
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x8, uint64, v512, u64)
OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x8, int64, v512, s64)
#endif
//! @}
2873 
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
// Generates v_reinterpret_as_<suffix>() for any source lane type; the lane
// count is rescaled (n0*sizeof(_Tp0)/sizeof(_Tp)) so the total byte width
// of the vector is preserved.
#define OPENCV_HAL_IMPL_C_REINTERPRET(_Tp, suffix) \
template<typename _Tp0, int n0> inline v_reg<_Tp, n0*sizeof(_Tp0)/sizeof(_Tp)> \
    v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \
{ return a.template reinterpret_as<_Tp, n0*sizeof(_Tp0)/sizeof(_Tp)>(); }

//! @name Reinterpret
//! @{
//! @brief Convert vector to different type without modifying underlying data.
OPENCV_HAL_IMPL_C_REINTERPRET(uchar, u8)
OPENCV_HAL_IMPL_C_REINTERPRET(schar, s8)
OPENCV_HAL_IMPL_C_REINTERPRET(ushort, u16)
OPENCV_HAL_IMPL_C_REINTERPRET(short, s16)
OPENCV_HAL_IMPL_C_REINTERPRET(unsigned, u32)
OPENCV_HAL_IMPL_C_REINTERPRET(int, s32)
OPENCV_HAL_IMPL_C_REINTERPRET(float, f32)
OPENCV_HAL_IMPL_C_REINTERPRET(double, f64)
OPENCV_HAL_IMPL_C_REINTERPRET(uint64, u64)
OPENCV_HAL_IMPL_C_REINTERPRET(int64, s64)
//! @}
2895 
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
// Generates v_shl<shift>(a): compile-time left shift delegating to the
// vector's operator<<.
#define OPENCV_HAL_IMPL_C_SHIFTL(_Tp) \
template<int shift, int n> inline v_reg<_Tp, n> v_shl(const v_reg<_Tp, n>& a) \
{ return a << shift; }

//! @name Left shift
//! @{
//! @brief Shift left
OPENCV_HAL_IMPL_C_SHIFTL(ushort)
OPENCV_HAL_IMPL_C_SHIFTL(short)
OPENCV_HAL_IMPL_C_SHIFTL(unsigned)
OPENCV_HAL_IMPL_C_SHIFTL(int)
OPENCV_HAL_IMPL_C_SHIFTL(uint64)
OPENCV_HAL_IMPL_C_SHIFTL(int64)
//! @}
2912 
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
// Generates v_shr<shift>(a): compile-time right shift delegating to the
// vector's operator>>.
#define OPENCV_HAL_IMPL_C_SHIFTR(_Tp) \
template<int shift, int n> inline v_reg<_Tp, n> v_shr(const v_reg<_Tp, n>& a) \
{ return a >> shift; }

//! @name Right shift
//! @{
//! @brief Shift right
OPENCV_HAL_IMPL_C_SHIFTR(ushort)
OPENCV_HAL_IMPL_C_SHIFTR(short)
OPENCV_HAL_IMPL_C_SHIFTR(unsigned)
OPENCV_HAL_IMPL_C_SHIFTR(int)
OPENCV_HAL_IMPL_C_SHIFTR(uint64)
OPENCV_HAL_IMPL_C_SHIFTR(int64)
//! @}
2929 
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
// Generates v_rshr<shift>(a): right shift with rounding to nearest,
// implemented by adding half of the shift divisor (1 << (shift-1)) before
// shifting. shift must be >= 1 or (shift-1) is a negative shift count (UB).
#define OPENCV_HAL_IMPL_C_RSHIFTR(_Tp) \
template<int shift, int n> inline v_reg<_Tp, n> v_rshr(const v_reg<_Tp, n>& a) \
{ \
    v_reg<_Tp, n> c; \
    for( int i = 0; i < n; i++ ) \
        c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (shift - 1))) >> shift); \
    return c; \
}

//! @name Rounding shift
//! @{
//! @brief Rounding shift right
OPENCV_HAL_IMPL_C_RSHIFTR(ushort)
OPENCV_HAL_IMPL_C_RSHIFTR(short)
OPENCV_HAL_IMPL_C_RSHIFTR(unsigned)
OPENCV_HAL_IMPL_C_RSHIFTR(int)
OPENCV_HAL_IMPL_C_RSHIFTR(uint64)
OPENCV_HAL_IMPL_C_RSHIFTR(int64)
//! @}
2951 
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
// Generates v_pack / v_pack_u: narrows two input vectors into one vector
// with twice the lane count; `cast` is saturate_cast (clamping) or
// static_cast (truncating, used for the 64-bit types).
#define OPENCV_HAL_IMPL_C_PACK(_Tp, _Tpn, pack_suffix, cast) \
template<int n> inline v_reg<_Tpn, 2*n> v_##pack_suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tpn, 2*n> c; \
    for( int i = 0; i < n; i++ ) \
    { \
        c.s[i] = cast<_Tpn>(a.s[i]); \
        c.s[i+n] = cast<_Tpn>(b.s[i]); \
    } \
    return c; \
}

//! @name Pack
//! @{
//! @brief Pack values from two vectors to one
//!
//! Return vector type have twice more elements than input vector types. Variant with _u_ suffix also
//! converts to corresponding unsigned type.
//!
//! - pack: for 16-, 32- and 64-bit integer input types
//! - pack_u: for 16- and 32-bit signed integer input types
//!
//! @note All variants except 64-bit use saturation.
OPENCV_HAL_IMPL_C_PACK(ushort, uchar, pack, saturate_cast)
OPENCV_HAL_IMPL_C_PACK(short, schar, pack, saturate_cast)
OPENCV_HAL_IMPL_C_PACK(unsigned, ushort, pack, saturate_cast)
OPENCV_HAL_IMPL_C_PACK(int, short, pack, saturate_cast)
OPENCV_HAL_IMPL_C_PACK(uint64, unsigned, pack, static_cast)
OPENCV_HAL_IMPL_C_PACK(int64, int, pack, static_cast)
OPENCV_HAL_IMPL_C_PACK(short, uchar, pack_u, saturate_cast)
OPENCV_HAL_IMPL_C_PACK(int, ushort, pack_u, saturate_cast)
//! @}
2986 
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
// Generates v_rshr_pack / v_rshr_pack_u: rounding right shift by `shift`
// (adds 1 << (shift-1) first), then narrowing via `cast`, packing two
// vectors into one of twice the lane count. shift must be >= 1.
#define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tp, _Tpn, pack_suffix, cast) \
template<int shift, int n> inline v_reg<_Tpn, 2*n> v_rshr_##pack_suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tpn, 2*n> c; \
    for( int i = 0; i < n; i++ ) \
    { \
        c.s[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (shift - 1))) >> shift); \
        c.s[i+n] = cast<_Tpn>((b.s[i] + ((_Tp)1 << (shift - 1))) >> shift); \
    } \
    return c; \
}

//! @name Pack with rounding shift
//! @{
//! @brief Pack values from two vectors to one with rounding shift
//!
//! Values from the input vectors will be shifted right by _n_ bits with rounding, converted to narrower
//! type and returned in the result vector. Variant with _u_ suffix converts to unsigned type.
//!
//! - pack: for 16-, 32- and 64-bit integer input types
//! - pack_u: for 16- and 32-bit signed integer input types
//!
//! @note All variants except 64-bit use saturation.
OPENCV_HAL_IMPL_C_RSHR_PACK(ushort, uchar, pack, saturate_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK(short, schar, pack, saturate_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK(unsigned, ushort, pack, saturate_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK(int, short, pack, saturate_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK(uint64, unsigned, pack, static_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK(int64, int, pack, static_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK(short, uchar, pack_u, saturate_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK(int, ushort, pack_u, saturate_cast)
//! @}
3021 
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
// Generates v_pack_store / v_pack_u_store: narrows one vector via `cast`
// and writes the n results directly to memory (no return value).
#define OPENCV_HAL_IMPL_C_PACK_STORE(_Tp, _Tpn, pack_suffix, cast) \
template<int n> inline void v_##pack_suffix##_store(_Tpn* ptr, const v_reg<_Tp, n>& a) \
{ \
    for( int i = 0; i < n; i++ ) \
        ptr[i] = cast<_Tpn>(a.s[i]); \
}

//! @name Pack and store
//! @{
//! @brief Store values from the input vector into memory with pack
//!
//! Values will be stored into memory with conversion to narrower type.
//! Variant with _u_ suffix converts to corresponding unsigned type.
//!
//! - pack: for 16-, 32- and 64-bit integer input types
//! - pack_u: for 16- and 32-bit signed integer input types
//!
//! @note All variants except 64-bit use saturation.
OPENCV_HAL_IMPL_C_PACK_STORE(ushort, uchar, pack, saturate_cast)
OPENCV_HAL_IMPL_C_PACK_STORE(short, schar, pack, saturate_cast)
OPENCV_HAL_IMPL_C_PACK_STORE(unsigned, ushort, pack, saturate_cast)
OPENCV_HAL_IMPL_C_PACK_STORE(int, short, pack, saturate_cast)
OPENCV_HAL_IMPL_C_PACK_STORE(uint64, unsigned, pack, static_cast)
OPENCV_HAL_IMPL_C_PACK_STORE(int64, int, pack, static_cast)
OPENCV_HAL_IMPL_C_PACK_STORE(short, uchar, pack_u, saturate_cast)
OPENCV_HAL_IMPL_C_PACK_STORE(int, ushort, pack_u, saturate_cast)
//! @}
3051 
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
// Generates v_rshr_pack_store / v_rshr_pack_u_store: rounding right shift
// by `shift` (adds 1 << (shift-1) first), narrows via `cast`, and writes
// the n results directly to memory. shift must be >= 1.
#define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tp, _Tpn, pack_suffix, cast) \
template<int shift, int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const v_reg<_Tp, n>& a) \
{ \
    for( int i = 0; i < n; i++ ) \
        ptr[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (shift - 1))) >> shift); \
}

//! @name Pack and store with rounding shift
//! @{
//! @brief Store values from the input vector into memory with pack
//!
//! Values will be shifted _n_ bits right with rounding, converted to narrower type and stored into
//! memory. Variant with _u_ suffix converts to unsigned type.
//!
//! - pack: for 16-, 32- and 64-bit integer input types
//! - pack_u: for 16- and 32-bit signed integer input types
//!
//! @note All variants except 64-bit use saturation.
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(ushort, uchar, pack, saturate_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(short, schar, pack, saturate_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(unsigned, ushort, pack, saturate_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(int, short, pack, saturate_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(uint64, unsigned, pack, static_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(int64, int, pack, static_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(short, uchar, pack_u, saturate_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(int, ushort, pack_u, saturate_cast)
//! @}
3081 
3082 //! @cond IGNORED
3083 template<typename _Tpm, typename _Tp, int n>
3084 inline void _pack_b(_Tpm* mptr, const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
3085 {
3086     for (int i = 0; i < n; ++i)
3087     {
3088         mptr[i] = (_Tpm)a.s[i];
3089         mptr[i + n] = (_Tpm)b.s[i];
3090     }
3091 }
3092 //! @endcond
3093 
3094 //! @name Pack boolean values
3095 //! @{
3096 //! @brief Pack boolean values from multiple vectors to one unsigned 8-bit integer vector
3097 //!
3098 //! @note Must provide valid boolean values to guarantee same result for all architectures.
3099 
3100 /** @brief
3101 //! For 16-bit boolean values
3102 
3103 Scheme:
3104 @code
3105 a  {0xFFFF 0 0 0xFFFF 0 0xFFFF 0xFFFF 0}
3106 b  {0xFFFF 0 0xFFFF 0 0 0xFFFF 0 0xFFFF}
3107 ===============
3108 {
3109    0xFF 0 0 0xFF 0 0xFF 0xFF 0
3110    0xFF 0 0xFF 0 0 0xFF 0 0xFF
3111 }
3112 @endcode */
3113 
v_pack_b(const v_reg<ushort,n> & a,const v_reg<ushort,n> & b)3114 template<int n> inline v_reg<uchar, 2*n> v_pack_b(const v_reg<ushort, n>& a, const v_reg<ushort, n>& b)
3115 {
3116     v_reg<uchar, 2*n> mask;
3117     _pack_b(mask.s, a, b);
3118     return mask;
3119 }
3120 
3121 /** @overload
3122 For 32-bit boolean values
3123 
3124 Scheme:
3125 @code
3126 a  {0xFFFF.. 0 0 0xFFFF..}
3127 b  {0 0xFFFF.. 0xFFFF.. 0}
3128 c  {0xFFFF.. 0 0xFFFF.. 0}
3129 d  {0 0xFFFF.. 0 0xFFFF..}
3130 ===============
3131 {
3132    0xFF 0 0 0xFF 0 0xFF 0xFF 0
3133    0xFF 0 0xFF 0 0 0xFF 0 0xFF
3134 }
3135 @endcode */
3136 
v_pack_b(const v_reg<unsigned,n> & a,const v_reg<unsigned,n> & b,const v_reg<unsigned,n> & c,const v_reg<unsigned,n> & d)3137 template<int n> inline v_reg<uchar, 4*n> v_pack_b(const v_reg<unsigned, n>& a, const v_reg<unsigned, n>& b,
3138                                                   const v_reg<unsigned, n>& c, const v_reg<unsigned, n>& d)
3139 {
3140     v_reg<uchar, 4*n> mask;
3141     _pack_b(mask.s, a, b);
3142     _pack_b(mask.s + 2*n, c, d);
3143     return mask;
3144 }
3145 
3146 /** @overload
3147 For 64-bit boolean values
3148 
3149 Scheme:
3150 @code
3151 a  {0xFFFF.. 0}
3152 b  {0 0xFFFF..}
3153 c  {0xFFFF.. 0}
3154 d  {0 0xFFFF..}
3155 
3156 e  {0xFFFF.. 0}
3157 f  {0xFFFF.. 0}
3158 g  {0 0xFFFF..}
3159 h  {0 0xFFFF..}
3160 ===============
3161 {
3162    0xFF 0 0 0xFF 0xFF 0 0 0xFF
3163    0xFF 0 0xFF 0 0 0xFF 0 0xFF
3164 }
3165 @endcode */
v_pack_b(const v_reg<uint64,n> & a,const v_reg<uint64,n> & b,const v_reg<uint64,n> & c,const v_reg<uint64,n> & d,const v_reg<uint64,n> & e,const v_reg<uint64,n> & f,const v_reg<uint64,n> & g,const v_reg<uint64,n> & h)3166 template<int n> inline v_reg<uchar, 8*n> v_pack_b(const v_reg<uint64, n>& a, const v_reg<uint64, n>& b,
3167                                                   const v_reg<uint64, n>& c, const v_reg<uint64, n>& d,
3168                                                   const v_reg<uint64, n>& e, const v_reg<uint64, n>& f,
3169                                                   const v_reg<uint64, n>& g, const v_reg<uint64, n>& h)
3170 {
3171     v_reg<uchar, 8*n> mask;
3172     _pack_b(mask.s, a, b);
3173     _pack_b(mask.s + 2*n, c, d);
3174     _pack_b(mask.s + 4*n, e, f);
3175     _pack_b(mask.s + 6*n, g, h);
3176     return mask;
3177 }
3178 //! @}
3179 
3180 /** @brief Matrix multiplication
3181 
3182 Scheme:
3183 @code
3184 {A0 A1 A2 A3}   |V0|
3185 {B0 B1 B2 B3}   |V1|
3186 {C0 C1 C2 C3}   |V2|
3187 {D0 D1 D2 D3} x |V3|
3188 ====================
3189 {R0 R1 R2 R3}, where:
3190 R0 = A0V0 + B0V1 + C0V2 + D0V3,
3191 R1 = A1V0 + B1V1 + C1V2 + D1V3
3192 ...
3193 @endcode
3194 */
3195 template<int n>
v_matmul(const v_reg<float,n> & v,const v_reg<float,n> & a,const v_reg<float,n> & b,const v_reg<float,n> & c,const v_reg<float,n> & d)3196 inline v_reg<float, n> v_matmul(const v_reg<float, n>& v,
3197                                 const v_reg<float, n>& a, const v_reg<float, n>& b,
3198                                 const v_reg<float, n>& c, const v_reg<float, n>& d)
3199 {
3200     v_reg<float, n> res;
3201     for (int i = 0; i < n / 4; i++)
3202     {
3203         res.s[0 + i*4] = v.s[0 + i*4] * a.s[0 + i*4] + v.s[1 + i*4] * b.s[0 + i*4] + v.s[2 + i*4] * c.s[0 + i*4] + v.s[3 + i*4] * d.s[0 + i*4];
3204         res.s[1 + i*4] = v.s[0 + i*4] * a.s[1 + i*4] + v.s[1 + i*4] * b.s[1 + i*4] + v.s[2 + i*4] * c.s[1 + i*4] + v.s[3 + i*4] * d.s[1 + i*4];
3205         res.s[2 + i*4] = v.s[0 + i*4] * a.s[2 + i*4] + v.s[1 + i*4] * b.s[2 + i*4] + v.s[2 + i*4] * c.s[2 + i*4] + v.s[3 + i*4] * d.s[2 + i*4];
3206         res.s[3 + i*4] = v.s[0 + i*4] * a.s[3 + i*4] + v.s[1 + i*4] * b.s[3 + i*4] + v.s[2 + i*4] * c.s[3 + i*4] + v.s[3 + i*4] * d.s[3 + i*4];
3207     }
3208     return res;
3209 }
3210 
3211 /** @brief Matrix multiplication and add
3212 
3213 Scheme:
3214 @code
3215 {A0 A1 A2 A3}   |V0|   |D0|
3216 {B0 B1 B2 B3}   |V1|   |D1|
3217 {C0 C1 C2 C3} x |V2| + |D2|
3218 ====================   |D3|
3219 {R0 R1 R2 R3}, where:
3220 R0 = A0V0 + B0V1 + C0V2 + D0,
3221 R1 = A1V0 + B1V1 + C1V2 + D1
3222 ...
3223 @endcode
3224 */
3225 template<int n>
v_matmuladd(const v_reg<float,n> & v,const v_reg<float,n> & a,const v_reg<float,n> & b,const v_reg<float,n> & c,const v_reg<float,n> & d)3226 inline v_reg<float, n> v_matmuladd(const v_reg<float, n>& v,
3227                                    const v_reg<float, n>& a, const v_reg<float, n>& b,
3228                                    const v_reg<float, n>& c, const v_reg<float, n>& d)
3229 {
3230     v_reg<float, n> res;
3231     for (int i = 0; i < n / 4; i++)
3232     {
3233         res.s[0 + i * 4] = v.s[0 + i * 4] * a.s[0 + i * 4] + v.s[1 + i * 4] * b.s[0 + i * 4] + v.s[2 + i * 4] * c.s[0 + i * 4] + d.s[0 + i * 4];
3234         res.s[1 + i * 4] = v.s[0 + i * 4] * a.s[1 + i * 4] + v.s[1 + i * 4] * b.s[1 + i * 4] + v.s[2 + i * 4] * c.s[1 + i * 4] + d.s[1 + i * 4];
3235         res.s[2 + i * 4] = v.s[0 + i * 4] * a.s[2 + i * 4] + v.s[1 + i * 4] * b.s[2 + i * 4] + v.s[2 + i * 4] * c.s[2 + i * 4] + d.s[2 + i * 4];
3236         res.s[3 + i * 4] = v.s[0 + i * 4] * a.s[3 + i * 4] + v.s[1 + i * 4] * b.s[3 + i * 4] + v.s[2 + i * 4] * c.s[3 + i * 4] + d.s[3 + i * 4];
3237     }
3238     return res;
3239 }
3240 
3241 
// Expanding dot product of 32-bit int vectors, accumulated in double.
// Result lane i is a[i]*b[i] + a[i+n/2]*b[i+n/2], computed by converting the
// low and high halves to double and combining with v_fma. (Lane pairing
// here follows the low/high-half split of v_cvt_f64 / v_cvt_f64_high.)
template<int n> inline v_reg<double, n/2> v_dotprod_expand(const v_reg<int, n>& a, const v_reg<int, n>& b)
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); }
// Same as above, with the additional accumulator c folded into the inner FMA.
template<int n> inline v_reg<double, n/2> v_dotprod_expand(const v_reg<int, n>& a, const v_reg<int, n>& b,
                                                           const v_reg<double, n/2>& c)
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); }

// "Fast" variants: in this C++ fallback there is no faster path, so they
// simply forward to the exact implementations above.
template<int n> inline v_reg<double, n/2> v_dotprod_expand_fast(const v_reg<int, n>& a, const v_reg<int, n>& b)
{ return v_dotprod_expand(a, b); }
template<int n> inline v_reg<double, n/2> v_dotprod_expand_fast(const v_reg<int, n>& a, const v_reg<int, n>& b,
                                                                const v_reg<double, n/2>& c)
{ return v_dotprod_expand(a, b, c); }
3253 
3254 ////// FP16 support ///////
3255 
3256 inline v_reg<float, simd128_width / sizeof(float)>
v_load_expand(const float16_t * ptr)3257 v_load_expand(const float16_t* ptr)
3258 {
3259     v_reg<float, simd128_width / sizeof(float)> v;
3260     for( int i = 0; i < v.nlanes; i++ )
3261     {
3262         v.s[i] = ptr[i];
3263     }
3264     return v;
3265 }
#if CV_SIMD256
// 256-bit variant of v_load_expand for FP16 input.
inline v_reg<float, simd256_width / sizeof(float)>
v256_load_expand(const float16_t* ptr)
{
    v_reg<float, simd256_width / sizeof(float)> res;
    for (int lane = 0; lane < res.nlanes; lane++)
        res.s[lane] = ptr[lane];
    return res;
}
#endif
#if CV_SIMD512
// 512-bit variant of v_load_expand for FP16 input.
inline v_reg<float, simd512_width / sizeof(float)>
v512_load_expand(const float16_t* ptr)
{
    v_reg<float, simd512_width / sizeof(float)> res;
    for (int lane = 0; lane < res.nlanes; lane++)
        res.s[lane] = ptr[lane];
    return res;
}
#endif
3290 
3291 template<int n> inline void
v_pack_store(float16_t * ptr,const v_reg<float,n> & v)3292 v_pack_store(float16_t* ptr, const v_reg<float, n>& v)
3293 {
3294     for( int i = 0; i < v.nlanes; i++ )
3295     {
3296         ptr[i] = float16_t(v.s[i]);
3297     }
3298 }
3299 
// SIMD state cleanup hooks. Intentionally empty in this C++ fallback;
// hardware backends may use them to reset vector state after SIMD code.
inline void v_cleanup() {}
#if CV_SIMD256
inline void v256_cleanup() {}
#endif
#if CV_SIMD512
inline void v512_cleanup() {}
#endif
3307 
3308 //! @}
3309 
3310 #ifndef CV_DOXYGEN
3311 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
3312 #endif
3313 }
3314 
3315 #if !defined(CV_DOXYGEN)
3316 #undef CV_SIMD256
3317 #undef CV_SIMD512
3318 #endif
3319 
3320 #endif
3321