1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
8 //
9 //
10 // License Agreement
11 // For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16 // Copyright (C) 2015, Itseez Inc., all rights reserved.
17 // Third party copyrights are property of their respective owners.
18 //
19 // Redistribution and use in source and binary forms, with or without modification,
20 // are permitted provided that the following conditions are met:
21 //
22 // * Redistribution's of source code must retain the above copyright notice,
23 // this list of conditions and the following disclaimer.
24 //
25 // * Redistribution's in binary form must reproduce the above copyright notice,
26 // this list of conditions and the following disclaimer in the documentation
27 // and/or other materials provided with the distribution.
28 //
29 // * The name of the copyright holders may not be used to endorse or promote products
30 // derived from this software without specific prior written permission.
31 //
32 // This software is provided by the copyright holders and contributors "as is" and
33 // any express or implied warranties, including, but not limited to, the implied
34 // warranties of merchantability and fitness for a particular purpose are disclaimed.
35 // In no event shall the Intel Corporation or contributors be liable for any direct,
36 // indirect, incidental, special, exemplary, or consequential damages
37 // (including, but not limited to, procurement of substitute goods or services;
38 // loss of use, data, or profits; or business interruption) however caused
39 // and on any theory of liability, whether in contract, strict liability,
40 // or tort (including negligence or otherwise) arising in any way out of
41 // the use of this software, even if advised of the possibility of such damage.
42 //
43 //M*/
44
45 #ifndef OPENCV_HAL_INTRIN_CPP_HPP
46 #define OPENCV_HAL_INTRIN_CPP_HPP
47
48 #include <limits>
49 #include <cstring>
50 #include <algorithm>
51 #include "opencv2/core/saturate.hpp"
52
53 //! @cond IGNORED
54 #define CV_SIMD128_CPP 1
55 #if defined(CV_FORCE_SIMD128_CPP)
56 #define CV_SIMD128 1
57 #define CV_SIMD128_64F 1
58 #endif
59 #if defined(CV_DOXYGEN)
60 #define CV_SIMD128 1
61 #define CV_SIMD128_64F 1
62 #define CV_SIMD256 1
63 #define CV_SIMD256_64F 1
64 #define CV_SIMD512 1
65 #define CV_SIMD512_64F 1
66 #else
67 #define CV_SIMD256 0 // Explicitly disable SIMD256 and SIMD512 support for scalar intrinsic implementation
68 #define CV_SIMD512 0 // to avoid warnings during compilation
69 #endif
70 //! @endcond
71
72 namespace cv
73 {
74
75 #ifndef CV_DOXYGEN
76 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
77 #endif
78
79 /** @addtogroup core_hal_intrin
80
"Universal intrinsics" is a set of types and functions intended to simplify vectorization of code on
different platforms. Currently a few different SIMD extensions on different architectures are supported.
83 128 bit registers of various types support is implemented for a wide range of architectures
84 including x86(__SSE/SSE2/SSE4.2__), ARM(__NEON__), PowerPC(__VSX__), MIPS(__MSA__).
85 256 bit long registers are supported on x86(__AVX2__) and 512 bit long registers are supported on x86(__AVX512__).
86 In case when there is no SIMD extension available during compilation, fallback C++ implementation of intrinsics
87 will be chosen and code will work as expected although it could be slower.
88
89 ### Types
90
There are several types representing vector registers of packed values; each type is
implemented as a structure based on a single SIMD register.
93
94 - cv::v_uint8 and cv::v_int8: 8-bit integer values (unsigned/signed) - char
95 - cv::v_uint16 and cv::v_int16: 16-bit integer values (unsigned/signed) - short
96 - cv::v_uint32 and cv::v_int32: 32-bit integer values (unsigned/signed) - int
97 - cv::v_uint64 and cv::v_int64: 64-bit integer values (unsigned/signed) - int64
98 - cv::v_float32: 32-bit floating point values (signed) - float
99 - cv::v_float64: 64-bit floating point values (signed) - double
100
The exact bit length (and lane count) of the listed types is deduced at compile time and depends on the
architecture SIMD capabilities chosen as available during compilation of the library. All the types contain
the __nlanes__ enumeration to query the exact lane count of the type.
104
105 In case the exact bit length of the type is important it is possible to use specific fixed length register types.
106
107 There are several types representing 128-bit registers.
108
109 - cv::v_uint8x16 and cv::v_int8x16: sixteen 8-bit integer values (unsigned/signed) - char
110 - cv::v_uint16x8 and cv::v_int16x8: eight 16-bit integer values (unsigned/signed) - short
111 - cv::v_uint32x4 and cv::v_int32x4: four 32-bit integer values (unsigned/signed) - int
112 - cv::v_uint64x2 and cv::v_int64x2: two 64-bit integer values (unsigned/signed) - int64
113 - cv::v_float32x4: four 32-bit floating point values (signed) - float
114 - cv::v_float64x2: two 64-bit floating point values (signed) - double
115
116 There are several types representing 256-bit registers.
117
118 - cv::v_uint8x32 and cv::v_int8x32: thirty two 8-bit integer values (unsigned/signed) - char
119 - cv::v_uint16x16 and cv::v_int16x16: sixteen 16-bit integer values (unsigned/signed) - short
120 - cv::v_uint32x8 and cv::v_int32x8: eight 32-bit integer values (unsigned/signed) - int
121 - cv::v_uint64x4 and cv::v_int64x4: four 64-bit integer values (unsigned/signed) - int64
122 - cv::v_float32x8: eight 32-bit floating point values (signed) - float
123 - cv::v_float64x4: four 64-bit floating point values (signed) - double
124
125 @note
126 256 bit registers at the moment implemented for AVX2 SIMD extension only, if you want to use this type directly,
127 don't forget to check the CV_SIMD256 preprocessor definition:
128 @code
129 #if CV_SIMD256
130 //...
131 #endif
132 @endcode
133
134 There are several types representing 512-bit registers.
135
136 - cv::v_uint8x64 and cv::v_int8x64: sixty four 8-bit integer values (unsigned/signed) - char
137 - cv::v_uint16x32 and cv::v_int16x32: thirty two 16-bit integer values (unsigned/signed) - short
138 - cv::v_uint32x16 and cv::v_int32x16: sixteen 32-bit integer values (unsigned/signed) - int
139 - cv::v_uint64x8 and cv::v_int64x8: eight 64-bit integer values (unsigned/signed) - int64
140 - cv::v_float32x16: sixteen 32-bit floating point values (signed) - float
141 - cv::v_float64x8: eight 64-bit floating point values (signed) - double
142 @note
143 512 bit registers at the moment implemented for AVX512 SIMD extension only, if you want to use this type directly,
144 don't forget to check the CV_SIMD512 preprocessor definition.
145
146 @note
147 cv::v_float64x2 is not implemented in NEON variant, if you want to use this type, don't forget to
148 check the CV_SIMD128_64F preprocessor definition.
149
150 ### Load and store operations
151
152 These operations allow to set contents of the register explicitly or by loading it from some memory
153 block and to save contents of the register to memory block.
154
155 There are variable size register load operations that provide result of maximum available size
156 depending on chosen platform capabilities.
157 - Constructors:
158 @ref v_reg::v_reg(const _Tp *ptr) "from memory",
159 - Other create methods:
160 vx_setall_s8, vx_setall_u8, ...,
161 vx_setzero_u8, vx_setzero_s8, ...
162 - Memory load operations:
163 vx_load, vx_load_aligned, vx_load_low, vx_load_halves,
164 - Memory operations with expansion of values:
165 vx_load_expand, vx_load_expand_q
166
167 Also there are fixed size register load/store operations.
168
169 For 128 bit registers
170 - Constructors:
171 @ref v_reg::v_reg(const _Tp *ptr) "from memory",
172 @ref v_reg::v_reg(_Tp s0, _Tp s1) "from two values", ...
173 - Other create methods:
174 @ref v_setall_s8, @ref v_setall_u8, ...,
175 @ref v_setzero_u8, @ref v_setzero_s8, ...
176 - Memory load operations:
177 @ref v_load, @ref v_load_aligned, @ref v_load_low, @ref v_load_halves,
178 - Memory operations with expansion of values:
179 @ref v_load_expand, @ref v_load_expand_q
180
181 For 256 bit registers(check CV_SIMD256 preprocessor definition)
182 - Constructors:
183 @ref v_reg::v_reg(const _Tp *ptr) "from memory",
184 @ref v_reg::v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) "from four values", ...
185 - Other create methods:
186 @ref v256_setall_s8, @ref v256_setall_u8, ...,
187 @ref v256_setzero_u8, @ref v256_setzero_s8, ...
188 - Memory load operations:
189 @ref v256_load, @ref v256_load_aligned, @ref v256_load_low, @ref v256_load_halves,
190 - Memory operations with expansion of values:
191 @ref v256_load_expand, @ref v256_load_expand_q
192
193 For 512 bit registers(check CV_SIMD512 preprocessor definition)
194 - Constructors:
195 @ref v_reg::v_reg(const _Tp *ptr) "from memory",
196 @ref v_reg::v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3, _Tp s4, _Tp s5, _Tp s6, _Tp s7) "from eight values", ...
197 - Other create methods:
198 @ref v512_setall_s8, @ref v512_setall_u8, ...,
199 @ref v512_setzero_u8, @ref v512_setzero_s8, ...
200 - Memory load operations:
201 @ref v512_load, @ref v512_load_aligned, @ref v512_load_low, @ref v512_load_halves,
202 - Memory operations with expansion of values:
203 @ref v512_load_expand, @ref v512_load_expand_q
204
205 Store to memory operations are similar across different platform capabilities:
206 @ref v_store, @ref v_store_aligned,
207 @ref v_store_high, @ref v_store_low
208
209 ### Value reordering
210
211 These operations allow to reorder or recombine elements in one or multiple vectors.
212
213 - Interleave, deinterleave (2, 3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
214 - Expand: @ref v_expand, @ref v_expand_low, @ref v_expand_high
215 - Pack: @ref v_pack, @ref v_pack_u, @ref v_pack_b, @ref v_rshr_pack, @ref v_rshr_pack_u,
216 @ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
217 - Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high
218 - Reverse: @ref v_reverse
219 - Extract: @ref v_extract
220
221
222 ### Arithmetic, bitwise and comparison operations
223
224 Element-wise binary and unary operations.
225
226 - Arithmetics:
227 @ref operator +(const v_reg &a, const v_reg &b) "+",
228 @ref operator -(const v_reg &a, const v_reg &b) "-",
229 @ref operator *(const v_reg &a, const v_reg &b) "*",
230 @ref operator /(const v_reg &a, const v_reg &b) "/",
231 @ref v_mul_expand
232
233 - Non-saturating arithmetics: @ref v_add_wrap, @ref v_sub_wrap
234
235 - Bitwise shifts:
236 @ref operator <<(const v_reg &a, int s) "<<",
237 @ref operator >>(const v_reg &a, int s) ">>",
238 @ref v_shl, @ref v_shr
239
240 - Bitwise logic:
241 @ref operator &(const v_reg &a, const v_reg &b) "&",
242 @ref operator |(const v_reg &a, const v_reg &b) "|",
243 @ref operator ^(const v_reg &a, const v_reg &b) "^",
244 @ref operator ~(const v_reg &a) "~"
245
246 - Comparison:
247 @ref operator >(const v_reg &a, const v_reg &b) ">",
248 @ref operator >=(const v_reg &a, const v_reg &b) ">=",
249 @ref operator <(const v_reg &a, const v_reg &b) "<",
250 @ref operator <=(const v_reg &a, const v_reg &b) "<=",
251 @ref operator ==(const v_reg &a, const v_reg &b) "==",
252 @ref operator !=(const v_reg &a, const v_reg &b) "!="
253
254 - min/max: @ref v_min, @ref v_max
255
256 ### Reduce and mask
257
258 Most of these operations return only one value.
259
260 - Reduce: @ref v_reduce_min, @ref v_reduce_max, @ref v_reduce_sum, @ref v_popcount
261 - Mask: @ref v_signmask, @ref v_check_all, @ref v_check_any, @ref v_select
262
263 ### Other math
264
265 - Some frequent operations: @ref v_sqrt, @ref v_invsqrt, @ref v_magnitude, @ref v_sqr_magnitude
266 - Absolute values: @ref v_abs, @ref v_absdiff, @ref v_absdiffs
267
268 ### Conversions
269
270 Different type conversions and casts:
271
272 - Rounding: @ref v_round, @ref v_floor, @ref v_ceil, @ref v_trunc,
273 - To float: @ref v_cvt_f32, @ref v_cvt_f64
274 - Reinterpret: @ref v_reinterpret_as_u8, @ref v_reinterpret_as_s8, ...
275
276 ### Matrix operations
277
278 In these operations vectors represent matrix rows/columns: @ref v_dotprod, @ref v_dotprod_fast,
279 @ref v_dotprod_expand, @ref v_dotprod_expand_fast, @ref v_matmul, @ref v_transpose4x4
280
281 ### Usability
282
283 Most operations are implemented only for some subset of the available types, following matrices
284 shows the applicability of different operations to the types.
285
286 Regular integers:
287
288 | Operations\\Types | uint 8 | int 8 | uint 16 | int 16 | uint 32 | int 32 |
289 |-------------------|:-:|:-:|:-:|:-:|:-:|:-:|
290 |load, store | x | x | x | x | x | x |
291 |interleave | x | x | x | x | x | x |
292 |expand | x | x | x | x | x | x |
293 |expand_low | x | x | x | x | x | x |
294 |expand_high | x | x | x | x | x | x |
295 |expand_q | x | x | | | | |
296 |add, sub | x | x | x | x | x | x |
297 |add_wrap, sub_wrap | x | x | x | x | | |
298 |mul_wrap | x | x | x | x | | |
299 |mul | x | x | x | x | x | x |
300 |mul_expand | x | x | x | x | x | |
301 |compare | x | x | x | x | x | x |
302 |shift | | | x | x | x | x |
303 |dotprod | | | | x | | x |
304 |dotprod_fast | | | | x | | x |
305 |dotprod_expand | x | x | x | x | | x |
306 |dotprod_expand_fast| x | x | x | x | | x |
307 |logical | x | x | x | x | x | x |
308 |min, max | x | x | x | x | x | x |
309 |absdiff | x | x | x | x | x | x |
310 |absdiffs | | x | | x | | |
311 |reduce | x | x | x | x | x | x |
312 |mask | x | x | x | x | x | x |
313 |pack | x | x | x | x | x | x |
314 |pack_u | x | | x | | | |
315 |pack_b | x | | | | | |
316 |unpack | x | x | x | x | x | x |
317 |extract | x | x | x | x | x | x |
318 |rotate (lanes) | x | x | x | x | x | x |
319 |cvt_flt32 | | | | | | x |
320 |cvt_flt64 | | | | | | x |
321 |transpose4x4 | | | | | x | x |
322 |reverse | x | x | x | x | x | x |
323 |extract_n | x | x | x | x | x | x |
324 |broadcast_element | | | | | x | x |
325
326 Big integers:
327
328 | Operations\\Types | uint 64 | int 64 |
329 |-------------------|:-:|:-:|
330 |load, store | x | x |
331 |add, sub | x | x |
332 |shift | x | x |
333 |logical | x | x |
334 |reverse | x | x |
335 |extract | x | x |
336 |rotate (lanes) | x | x |
337 |cvt_flt64 | | x |
338 |extract_n | x | x |
339
340 Floating point:
341
342 | Operations\\Types | float 32 | float 64 |
343 |-------------------|:-:|:-:|
344 |load, store | x | x |
345 |interleave | x | |
346 |add, sub | x | x |
347 |mul | x | x |
348 |div | x | x |
349 |compare | x | x |
350 |min, max | x | x |
351 |absdiff | x | x |
352 |reduce | x | |
353 |mask | x | x |
354 |unpack | x | x |
355 |cvt_flt32 | | x |
356 |cvt_flt64 | x | |
357 |sqrt, abs | x | x |
358 |float math | x | x |
359 |transpose4x4 | x | |
360 |extract | x | x |
361 |rotate (lanes) | x | x |
362 |reverse | x | x |
363 |extract_n | x | x |
364 |broadcast_element | x | |
365
366 @{ */
367
template<typename _Tp, int n> struct v_reg
{
//! @cond IGNORED
    typedef _Tp lane_type;
    enum { nlanes = n };
//! @endcond

    /** @brief Constructor

    Initializes register with data from memory
    @param ptr pointer to memory block with data for register */
    explicit v_reg(const _Tp* ptr) { for( int i = 0; i < n; i++ ) s[i] = ptr[i]; }

    /** @brief Constructor

    Initializes register with two 64-bit values */
    v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; }

    /** @brief Constructor

    Initializes register with four 32-bit values */
    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; }

    /** @brief Constructor

    Initializes register with eight 16-bit values */
    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
          _Tp s4, _Tp s5, _Tp s6, _Tp s7)
    {
        s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
        s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
    }

    /** @brief Constructor

    Initializes register with sixteen 8-bit values */
    v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
          _Tp s4, _Tp s5, _Tp s6, _Tp s7,
          _Tp s8, _Tp s9, _Tp s10, _Tp s11,
          _Tp s12, _Tp s13, _Tp s14, _Tp s15)
    {
        s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
        s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
        s[8] = s8; s[9] = s9; s[10] = s10; s[11] = s11;
        s[12] = s12; s[13] = s13; s[14] = s14; s[15] = s15;
    }

    /** @brief Default constructor

    Does not initialize anything*/
    v_reg() {}

    /** @brief Copy constructor */
    v_reg(const v_reg<_Tp, n> & r)
    {
        for( int i = 0; i < n; i++ )
            s[i] = r.s[i];
    }
    /** @brief Access first value

    Returns value of the first lane according to register type, for example:
    @code{.cpp}
    v_int32x4 r(1, 2, 3, 4);
    int v = r.get0(); // returns 1
    v_uint64x2 r(1, 2);
    uint64_t v = r.get0(); // returns 1
    @endcode
    */
    _Tp get0() const { return s[0]; }

//! @cond IGNORED
    // Access lane i (no bounds checking).
    _Tp get(const int i) const { return s[i]; }
    // Returns a register whose low n/2 lanes are the high half of this
    // register; the upper n/2 lanes of the result are set to zero.
    v_reg<_Tp, n> high() const
    {
        v_reg<_Tp, n> c;
        int i;
        for( i = 0; i < n/2; i++ )
        {
            c.s[i] = s[i+(n/2)];
            c.s[i+(n/2)] = 0;
        }
        return c;
    }

    // Returns a register with all lanes set to zero.
    static v_reg<_Tp, n> zero()
    {
        v_reg<_Tp, n> c;
        for( int i = 0; i < n; i++ )
            c.s[i] = (_Tp)0;
        return c;
    }

    // Returns a register with all lanes set to s.
    static v_reg<_Tp, n> all(_Tp s)
    {
        v_reg<_Tp, n> c;
        for( int i = 0; i < n; i++ )
            c.s[i] = s;
        return c;
    }

    // Reinterprets the raw bytes of this register as a register of another
    // lane type/count. Only min(sizeof(src), sizeof(dst)) bytes are copied;
    // if the destination is larger, its remaining bytes are uninitialized.
    template<typename _Tp2, int n2> v_reg<_Tp2, n2> reinterpret_as() const
    {
        size_t bytes = std::min(sizeof(_Tp2)*n2, sizeof(_Tp)*n);
        v_reg<_Tp2, n2> c;
        std::memcpy(&c.s[0], &s[0], bytes);
        return c;
    }

    // Lane-wise copy assignment.
    v_reg& operator=(const v_reg<_Tp, n> & r)
    {
        for( int i = 0; i < n; i++ )
            s[i] = r.s[i];
        return *this;
    }

    // Lane storage.
    _Tp s[n];
//! @endcond
};
486
// Fixed-size 128-bit register types (always available in this scalar fallback).
/** @brief Sixteen 8-bit unsigned integer values */
typedef v_reg<uchar, 16> v_uint8x16;
/** @brief Sixteen 8-bit signed integer values */
typedef v_reg<schar, 16> v_int8x16;
/** @brief Eight 16-bit unsigned integer values */
typedef v_reg<ushort, 8> v_uint16x8;
/** @brief Eight 16-bit signed integer values */
typedef v_reg<short, 8> v_int16x8;
/** @brief Four 32-bit unsigned integer values */
typedef v_reg<unsigned, 4> v_uint32x4;
/** @brief Four 32-bit signed integer values */
typedef v_reg<int, 4> v_int32x4;
/** @brief Four 32-bit floating point values (single precision) */
typedef v_reg<float, 4> v_float32x4;
/** @brief Two 64-bit floating point values (double precision) */
typedef v_reg<double, 2> v_float64x2;
/** @brief Two 64-bit unsigned integer values */
typedef v_reg<uint64, 2> v_uint64x2;
/** @brief Two 64-bit signed integer values */
typedef v_reg<int64, 2> v_int64x2;
507
// Fixed-size 256-bit register types (only when CV_SIMD256 is enabled).
#if CV_SIMD256
/** @brief Thirty two 8-bit unsigned integer values */
typedef v_reg<uchar, 32> v_uint8x32;
/** @brief Thirty two 8-bit signed integer values */
typedef v_reg<schar, 32> v_int8x32;
/** @brief Sixteen 16-bit unsigned integer values */
typedef v_reg<ushort, 16> v_uint16x16;
/** @brief Sixteen 16-bit signed integer values */
typedef v_reg<short, 16> v_int16x16;
/** @brief Eight 32-bit unsigned integer values */
typedef v_reg<unsigned, 8> v_uint32x8;
/** @brief Eight 32-bit signed integer values */
typedef v_reg<int, 8> v_int32x8;
/** @brief Eight 32-bit floating point values (single precision) */
typedef v_reg<float, 8> v_float32x8;
/** @brief Four 64-bit floating point values (double precision) */
typedef v_reg<double, 4> v_float64x4;
/** @brief Four 64-bit unsigned integer values */
typedef v_reg<uint64, 4> v_uint64x4;
/** @brief Four 64-bit signed integer values */
typedef v_reg<int64, 4> v_int64x4;
#endif
530
// Fixed-size 512-bit register types (only when CV_SIMD512 is enabled).
#if CV_SIMD512
/** @brief Sixty four 8-bit unsigned integer values */
typedef v_reg<uchar, 64> v_uint8x64;
/** @brief Sixty four 8-bit signed integer values */
typedef v_reg<schar, 64> v_int8x64;
/** @brief Thirty two 16-bit unsigned integer values */
typedef v_reg<ushort, 32> v_uint16x32;
/** @brief Thirty two 16-bit signed integer values */
typedef v_reg<short, 32> v_int16x32;
/** @brief Sixteen 32-bit unsigned integer values */
typedef v_reg<unsigned, 16> v_uint32x16;
/** @brief Sixteen 32-bit signed integer values */
typedef v_reg<int, 16> v_int32x16;
/** @brief Sixteen 32-bit floating point values (single precision) */
typedef v_reg<float, 16> v_float32x16;
/** @brief Eight 64-bit floating point values (double precision) */
typedef v_reg<double, 8> v_float64x8;
/** @brief Eight 64-bit unsigned integer values */
typedef v_reg<uint64, 8> v_uint64x8;
/** @brief Eight 64-bit signed integer values */
typedef v_reg<int64, 8> v_int64x8;
#endif
553
// Register widths in bytes for each SIMD level; simdmax_width is the widest
// register width available in the current build configuration.
enum {
    simd128_width = 16,
#if CV_SIMD256
    simd256_width = 32,
#endif
#if CV_SIMD512
    simd512_width = 64,
    simdmax_width = simd512_width
#elif CV_SIMD256
    simdmax_width = simd256_width
#else
    simdmax_width = simd128_width
#endif
};
568
// The operators below are declared here for documentation and overload
// visibility; their definitions are generated by the CV__HAL_INTRIN_IMPL_*
// macros further down in this file.

/** @brief Add values

For all types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator+(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator+=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);

/** @brief Subtract values

For all types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator-(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator-=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);

/** @brief Multiply values

For 16- and 32-bit integer types and floating types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator*(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator*=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);

/** @brief Divide values

For floating types only. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator/(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator/=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);


/** @brief Bitwise AND

Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator&(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator&=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);

/** @brief Bitwise OR

Only for integer types. */
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator|(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator|=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);

/** @brief Bitwise XOR

Only for integer types.*/
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator^(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n>& operator^=(v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b);

/** @brief Bitwise NOT

Only for integer types.*/
template<typename _Tp, int n> CV_INLINE v_reg<_Tp, n> operator~(const v_reg<_Tp, n>& a);
616
617
#ifndef CV_DOXYGEN

// Invokes macro_name(type, ...) once for every integer lane type.
#define CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, ...) \
__CV_EXPAND(macro_name(uchar, __VA_ARGS__)) \
__CV_EXPAND(macro_name(schar, __VA_ARGS__)) \
__CV_EXPAND(macro_name(ushort, __VA_ARGS__)) \
__CV_EXPAND(macro_name(short, __VA_ARGS__)) \
__CV_EXPAND(macro_name(unsigned, __VA_ARGS__)) \
__CV_EXPAND(macro_name(int, __VA_ARGS__)) \
__CV_EXPAND(macro_name(uint64, __VA_ARGS__)) \
__CV_EXPAND(macro_name(int64, __VA_ARGS__)) \

// Invokes macro_name(type, ...) once for every floating-point lane type.
#define CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, ...) \
__CV_EXPAND(macro_name(float, __VA_ARGS__)) \
__CV_EXPAND(macro_name(double, __VA_ARGS__)) \

// Invokes macro_name(type, ...) once for every supported lane type.
#define CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(macro_name, ...) \
CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, __VA_ARGS__) \
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, __VA_ARGS__) \

// Defines saturating element-wise 'bin_op' and 'bin_op=' operators for lane
// type _Tp (results are clamped to the lane type's range via saturate_cast).
#define CV__HAL_INTRIN_IMPL_BIN_OP_(_Tp, bin_op) \
template<int n> inline \
v_reg<_Tp, n> operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tp, n> c; \
    for( int i = 0; i < n; i++ ) \
        c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
    return c; \
} \
template<int n> inline \
v_reg<_Tp, n>& operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    for( int i = 0; i < n; i++ ) \
        a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
    return a; \
}

#define CV__HAL_INTRIN_IMPL_BIN_OP(bin_op) CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, bin_op)

CV__HAL_INTRIN_IMPL_BIN_OP(+)
CV__HAL_INTRIN_IMPL_BIN_OP(-)
CV__HAL_INTRIN_IMPL_BIN_OP(*)
// Division is only generated for floating-point lane types.
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, /)

// Defines element-wise bitwise 'bit_op' and 'bit_op=' operators for lane type
// _Tp, applied to the integer bit pattern of each lane.
#define CV__HAL_INTRIN_IMPL_BIT_OP_(_Tp, bit_op) \
template<int n> CV_INLINE \
v_reg<_Tp, n> operator bit_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tp, n> c; \
    typedef typename V_TypeTraits<_Tp>::int_type itype; \
    for( int i = 0; i < n; i++ ) \
        c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
                                                        V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
    return c; \
} \
template<int n> CV_INLINE \
v_reg<_Tp, n>& operator bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    typedef typename V_TypeTraits<_Tp>::int_type itype; \
    for( int i = 0; i < n; i++ ) \
        a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
                                                        V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
    return a; \
}

#define CV__HAL_INTRIN_IMPL_BIT_OP(bit_op) \
CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) \
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) /* TODO: FIXIT remove this after masks refactoring */


CV__HAL_INTRIN_IMPL_BIT_OP(&)
CV__HAL_INTRIN_IMPL_BIT_OP(|)
CV__HAL_INTRIN_IMPL_BIT_OP(^)

// Defines element-wise bitwise NOT for integer lane type _Tp. The unused
// second parameter keeps the signature compatible with the EXPAND helpers.
#define CV__HAL_INTRIN_IMPL_BITWISE_NOT_(_Tp, dummy) \
template<int n> CV_INLINE \
v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) \
{ \
    v_reg<_Tp, n> c; \
    for( int i = 0; i < n; i++ ) \
        c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int(~V_TypeTraits<_Tp>::reinterpret_int(a.s[i])); \
    return c; \
} \

CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BITWISE_NOT_, ~)

#endif // !CV_DOXYGEN
705
706
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
// Defines an element-wise unary function 'func' that applies the scalar
// function 'cfunc' to every lane, producing lanes of type _Tp2.
#define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \
template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \
{ \
    v_reg<_Tp2, n> c; \
    for( int i = 0; i < n; i++ ) \
        c.s[i] = cfunc(a.s[i]); \
    return c; \
}

/** @brief Square root of elements

Only for floating point types.*/
OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp)

//! @cond IGNORED
OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
//! @endcond

/** @brief Absolute value of elements

Only for floating point types.*/
OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs,
                          typename V_TypeTraits<_Tp>::abs_type)
735
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
// Defines an element-wise binary function 'func' that applies the scalar
// function 'cfunc' to each pair of corresponding lanes.
#define OPENCV_HAL_IMPL_MINMAX_FUNC(func, cfunc) \
template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tp, n> c; \
    for( int i = 0; i < n; i++ ) \
        c.s[i] = cfunc(a.s[i], b.s[i]); \
    return c; \
}

//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
// Defines a horizontal reduction 'func' that folds all lanes into a single
// scalar using the binary scalar function 'cfunc'.
#define OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(func, cfunc) \
template<typename _Tp, int n> inline _Tp func(const v_reg<_Tp, n>& a) \
{ \
    _Tp c = a.s[0]; \
    for( int i = 1; i < n; i++ ) \
        c = cfunc(c, a.s[i]); \
    return c; \
}

/** @brief Choose min values for each pair

Scheme:
@code
{A1 A2 ...}
{B1 B2 ...}
--------------
{min(A1,B1) min(A2,B2) ...}
@endcode
For all types except 64-bit integer. */
OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, std::min)

/** @brief Choose max values for each pair

Scheme:
@code
{A1 A2 ...}
{B1 B2 ...}
--------------
{max(A1,B1) max(A2,B2) ...}
@endcode
For all types except 64-bit integer. */
OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, std::max)

/** @brief Find one min value

Scheme:
@code
{A1 A2 A3 ...} => min(A1,A2,A3,...)
@endcode
For all types except 64-bit integer and 64-bit floating point types. */
OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_min, std::min)

/** @brief Find one max value

Scheme:
@code
{A1 A2 A3 ...} => max(A1,A2,A3,...)
@endcode
For all types except 64-bit integer and 64-bit floating point types. */
OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(v_reduce_max, std::max)
799
// Lookup table mapping every possible byte value (0..255) to the number of
// set bits it contains, used for per-lane population counts.
static const unsigned char popCountTable[] =
{
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};
819 /** @brief Count the 1 bits in the vector lanes and return result as corresponding unsigned type
820
821 Scheme:
822 @code
823 {A1 A2 A3 ...} => {popcount(A1), popcount(A2), popcount(A3), ...}
824 @endcode
825 For all integer types. */
826 template<typename _Tp, int n>
v_popcount(const v_reg<_Tp,n> & a)827 inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_popcount(const v_reg<_Tp, n>& a)
828 {
829 v_reg<typename V_TypeTraits<_Tp>::abs_type, n> b = v_reg<typename V_TypeTraits<_Tp>::abs_type, n>::zero();
830 for (int i = 0; i < n*(int)sizeof(_Tp); i++)
831 b.s[i/sizeof(_Tp)] += popCountTable[v_reinterpret_as_u8(a).s[i]];
832 return b;
833 }
834
835
836 //! @cond IGNORED
837 template<typename _Tp, int n>
v_minmax(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b,v_reg<_Tp,n> & minval,v_reg<_Tp,n> & maxval)838 inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
839 v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval )
840 {
841 for( int i = 0; i < n; i++ )
842 {
843 minval.s[i] = std::min(a.s[i], b.s[i]);
844 maxval.s[i] = std::max(a.s[i], b.s[i]);
845 }
846 }
847 //! @endcond
848
849 //! @brief Helper macro
850 //! @ingroup core_hal_intrin_impl
// Expands to a template comparison operator that applies `cmp_op` lane-wise.
// Each result lane is an all-ones bit pattern (reinterpreted -1) when the
// comparison holds and all-zeros otherwise, matching hardware SIMD
// comparison semantics.
#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \
template<typename _Tp, int n> \
inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    typedef typename V_TypeTraits<_Tp>::int_type itype; \
    v_reg<_Tp, n> c; \
    for( int i = 0; i < n; i++ ) \
        c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)-(int)(a.s[i] cmp_op b.s[i])); \
    return c; \
}
861
// Each operator below produces per-lane bit masks: all ones when the relation
// holds for that lane, all zeros otherwise (see OPENCV_HAL_IMPL_CMP_OP).

/** @brief Less-than comparison

For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP(<)

/** @brief Greater-than comparison

For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP(>)

/** @brief Less-than or equal comparison

For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP(<=)

/** @brief Greater-than or equal comparison

For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP(>=)

/** @brief Equal comparison

For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP(==)

/** @brief Not equal comparison

For all types except 64-bit integer values. */
OPENCV_HAL_IMPL_CMP_OP(!=)
891
892 template<int n>
v_not_nan(const v_reg<float,n> & a)893 inline v_reg<float, n> v_not_nan(const v_reg<float, n>& a)
894 {
895 typedef typename V_TypeTraits<float>::int_type itype;
896 v_reg<float, n> c;
897 for (int i = 0; i < n; i++)
898 c.s[i] = V_TypeTraits<float>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i]));
899 return c;
900 }
901 template<int n>
v_not_nan(const v_reg<double,n> & a)902 inline v_reg<double, n> v_not_nan(const v_reg<double, n>& a)
903 {
904 typedef typename V_TypeTraits<double>::int_type itype;
905 v_reg<double, n> c;
906 for (int i = 0; i < n; i++)
907 c.s[i] = V_TypeTraits<double>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i]));
908 return c;
909 }
910
911 //! @brief Helper macro
912 //! @ingroup core_hal_intrin_impl
// Expands to a lane-wise binary operation `func` applying `bin_op` to each
// pair of lanes, casting each result with `cast_op` into result type `_Tp2`.
// Used below to generate the non-saturating (wrap-around) arithmetic ops.
#define OPENCV_HAL_IMPL_ARITHM_OP(func, bin_op, cast_op, _Tp2) \
template<typename _Tp, int n> \
inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    typedef _Tp2 rtype; \
    v_reg<rtype, n> c; \
    for( int i = 0; i < n; i++ ) \
        c.s[i] = cast_op(a.s[i] bin_op b.s[i]); \
    return c; \
}
923
// The *_wrap operations below use plain modular arithmetic (C-style cast back
// to _Tp) instead of the saturating behaviour of the overloaded operators.

/** @brief Add values without saturation

For 8- and 16-bit integer values. */
OPENCV_HAL_IMPL_ARITHM_OP(v_add_wrap, +, (_Tp), _Tp)

/** @brief Subtract values without saturation

For 8- and 16-bit integer values. */
OPENCV_HAL_IMPL_ARITHM_OP(v_sub_wrap, -, (_Tp), _Tp)

/** @brief Multiply values without saturation

For 8- and 16-bit integer values. */
OPENCV_HAL_IMPL_ARITHM_OP(v_mul_wrap, *, (_Tp), _Tp)
938
939 //! @cond IGNORED
template<typename T> inline T _absdiff(T a, T b)
{
    // |a - b|, ordered so the subtraction never wraps for unsigned T.
    if (a > b)
        return a - b;
    return b - a;
}
944 //! @endcond
945
946 /** @brief Absolute difference
947
948 Returns \f$ |a - b| \f$ converted to corresponding unsigned type.
949 Example:
950 @code{.cpp}
951 v_int32x4 a, b; // {1, 2, 3, 4} and {4, 3, 2, 1}
952 v_uint32x4 c = v_absdiff(a, b); // result is {3, 1, 1, 3}
953 @endcode
954 For 8-, 16-, 32-bit integer source types. */
955 template<typename _Tp, int n>
v_absdiff(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b)956 inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_absdiff(const v_reg<_Tp, n>& a, const v_reg<_Tp, n> & b)
957 {
958 typedef typename V_TypeTraits<_Tp>::abs_type rtype;
959 v_reg<rtype, n> c;
960 const rtype mask = (rtype)(std::numeric_limits<_Tp>::is_signed ? (1 << (sizeof(rtype)*8 - 1)) : 0);
961 for( int i = 0; i < n; i++ )
962 {
963 rtype ua = a.s[i] ^ mask;
964 rtype ub = b.s[i] ^ mask;
965 c.s[i] = _absdiff(ua, ub);
966 }
967 return c;
968 }
969
970 /** @overload
971
972 For 32-bit floating point values */
template<int n> inline v_reg<float, n> v_absdiff(const v_reg<float, n>& a, const v_reg<float, n>& b)
{
    // Lane-wise |a - b| for single-precision lanes.
    v_reg<float, n> res;
    for( int lane = 0; lane < res.nlanes; lane++ )
        res.s[lane] = _absdiff(a.s[lane], b.s[lane]);
    return res;
}
980
981 /** @overload
982
983 For 64-bit floating point values */
template<int n> inline v_reg<double, n> v_absdiff(const v_reg<double, n>& a, const v_reg<double, n>& b)
{
    // Lane-wise |a - b| for double-precision lanes.
    v_reg<double, n> res;
    for( int lane = 0; lane < res.nlanes; lane++ )
        res.s[lane] = _absdiff(a.s[lane], b.s[lane]);
    return res;
}
991
992 /** @brief Saturating absolute difference
993
994 Returns \f$ saturate(|a - b|) \f$ .
995 For 8-, 16-bit signed integer source types. */
996 template<typename _Tp, int n>
v_absdiffs(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b)997 inline v_reg<_Tp, n> v_absdiffs(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
998 {
999 v_reg<_Tp, n> c;
1000 for( int i = 0; i < n; i++)
1001 c.s[i] = saturate_cast<_Tp>(std::abs(a.s[i] - b.s[i]));
1002 return c;
1003 }
1004
1005 /** @brief Inversed square root
1006
1007 Returns \f$ 1/sqrt(a) \f$
1008 For floating point types only. */
1009 template<typename _Tp, int n>
v_invsqrt(const v_reg<_Tp,n> & a)1010 inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a)
1011 {
1012 v_reg<_Tp, n> c;
1013 for( int i = 0; i < n; i++ )
1014 c.s[i] = 1.f/std::sqrt(a.s[i]);
1015 return c;
1016 }
1017
1018 /** @brief Magnitude
1019
1020 Returns \f$ sqrt(a^2 + b^2) \f$
1021 For floating point types only. */
1022 template<typename _Tp, int n>
v_magnitude(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b)1023 inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1024 {
1025 v_reg<_Tp, n> c;
1026 for( int i = 0; i < n; i++ )
1027 c.s[i] = std::sqrt(a.s[i]*a.s[i] + b.s[i]*b.s[i]);
1028 return c;
1029 }
1030
1031 /** @brief Square of the magnitude
1032
1033 Returns \f$ a^2 + b^2 \f$
1034 For floating point types only. */
1035 template<typename _Tp, int n>
v_sqr_magnitude(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b)1036 inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1037 {
1038 v_reg<_Tp, n> c;
1039 for( int i = 0; i < n; i++ )
1040 c.s[i] = a.s[i]*a.s[i] + b.s[i]*b.s[i];
1041 return c;
1042 }
1043
1044 /** @brief Multiply and add
1045
1046 Returns \f$ a*b + c \f$
1047 For floating point types and signed 32bit int only. */
1048 template<typename _Tp, int n>
v_fma(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b,const v_reg<_Tp,n> & c)1049 inline v_reg<_Tp, n> v_fma(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
1050 const v_reg<_Tp, n>& c)
1051 {
1052 v_reg<_Tp, n> d;
1053 for( int i = 0; i < n; i++ )
1054 d.s[i] = a.s[i]*b.s[i] + c.s[i];
1055 return d;
1056 }
1057
/** @brief A synonym for v_fma */
template<typename _Tp, int n>
inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
                              const v_reg<_Tp, n>& c)
{
    // Thin alias kept for API symmetry with HAL backends that distinguish
    // fused and unfused multiply-add; the reference version is identical.
    return v_fma(a, b, c);
}
1065
1066 /** @brief Dot product of elements
1067
1068 Multiply values in two registers and sum adjacent result pairs.
1069
1070 Scheme:
1071 @code
1072 {A1 A2 ...} // 16-bit
1073 x {B1 B2 ...} // 16-bit
1074 -------------
1075 {A1B1+A2B2 ...} // 32-bit
1076
1077 @endcode
1078 */
1079 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_dotprod(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b)1080 v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1081 {
1082 typedef typename V_TypeTraits<_Tp>::w_type w_type;
1083 v_reg<w_type, n/2> c;
1084 for( int i = 0; i < (n/2); i++ )
1085 c.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1];
1086 return c;
1087 }
1088
1089 /** @brief Dot product of elements
1090
1091 Same as cv::v_dotprod, but add a third element to the sum of adjacent pairs.
1092 Scheme:
1093 @code
1094 {A1 A2 ...} // 16-bit
1095 x {B1 B2 ...} // 16-bit
1096 -------------
1097 {A1B1+A2B2+C1 ...} // 32-bit
1098 @endcode
1099 */
1100 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_dotprod(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b,const v_reg<typename V_TypeTraits<_Tp>::w_type,n/2> & c)1101 v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
1102 const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
1103 {
1104 typedef typename V_TypeTraits<_Tp>::w_type w_type;
1105 v_reg<w_type, n/2> s;
1106 for( int i = 0; i < (n/2); i++ )
1107 s.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1] + c.s[i];
1108 return s;
1109 }
1110
1111 /** @brief Fast Dot product of elements
1112
1113 Same as cv::v_dotprod, but it may perform unorder sum between result pairs in some platforms,
1114 this intrinsic can be used if the sum among all lanes is only matters
1115 and also it should be yielding better performance on the affected platforms.
1116
1117 */
// The reference implementation has no faster unordered-sum variant,
// so this simply defers to the exact v_dotprod.
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_dotprod_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{ return v_dotprod(a, b); }
1121
1122 /** @brief Fast Dot product of elements
1123
1124 Same as cv::v_dotprod_fast, but add a third element to the sum of adjacent pairs.
1125 */
// Accumulating variant; same deferral to the exact v_dotprod as above.
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_dotprod_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
               const v_reg<typename V_TypeTraits<_Tp>::w_type, n / 2>& c)
{ return v_dotprod(a, b, c); }
1130
1131 /** @brief Dot product of elements and expand
1132
1133 Multiply values in two registers and expand the sum of adjacent result pairs.
1134
1135 Scheme:
1136 @code
1137 {A1 A2 A3 A4 ...} // 8-bit
1138 x {B1 B2 B3 B4 ...} // 8-bit
1139 -------------
1140 {A1B1+A2B2+A3B3+A4B4 ...} // 32-bit
1141
1142 @endcode
1143 */
1144 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
v_dotprod_expand(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b)1145 v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1146 {
1147 typedef typename V_TypeTraits<_Tp>::q_type q_type;
1148 v_reg<q_type, n/4> s;
1149 for( int i = 0; i < (n/4); i++ )
1150 s.s[i] = (q_type)a.s[i*4 ]*b.s[i*4 ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] +
1151 (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3];
1152 return s;
1153 }
1154
1155 /** @brief Dot product of elements
1156
1157 Same as cv::v_dotprod_expand, but add a third element to the sum of adjacent pairs.
1158 Scheme:
1159 @code
1160 {A1 A2 A3 A4 ...} // 8-bit
1161 x {B1 B2 B3 B4 ...} // 8-bit
1162 -------------
1163 {A1B1+A2B2+A3B3+A4B4+C1 ...} // 32-bit
1164 @endcode
1165 */
1166 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
v_dotprod_expand(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b,const v_reg<typename V_TypeTraits<_Tp>::q_type,n/4> & c)1167 v_dotprod_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
1168 const v_reg<typename V_TypeTraits<_Tp>::q_type, n / 4>& c)
1169 {
1170 typedef typename V_TypeTraits<_Tp>::q_type q_type;
1171 v_reg<q_type, n/4> s;
1172 for( int i = 0; i < (n/4); i++ )
1173 s.s[i] = (q_type)a.s[i*4 ]*b.s[i*4 ] + (q_type)a.s[i*4 + 1]*b.s[i*4 + 1] +
1174 (q_type)a.s[i*4 + 2]*b.s[i*4 + 2] + (q_type)a.s[i*4 + 3]*b.s[i*4 + 3] + c.s[i];
1175 return s;
1176 }
1177
1178 /** @brief Fast Dot product of elements and expand
1179
1180 Multiply values in two registers and expand the sum of adjacent result pairs.
1181
1182 Same as cv::v_dotprod_expand, but it may perform unorder sum between result pairs in some platforms,
1183 this intrinsic can be used if the sum among all lanes is only matters
1184 and also it should be yielding better performance on the affected platforms.
1185
1186 */
// No unordered fast path in the reference implementation; defer to the
// exact v_dotprod_expand.
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
v_dotprod_expand_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{ return v_dotprod_expand(a, b); }
1190
1191 /** @brief Fast Dot product of elements
1192
1193 Same as cv::v_dotprod_expand_fast, but add a third element to the sum of adjacent pairs.
1194 */
// Accumulating variant; same deferral to the exact v_dotprod_expand as above.
template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::q_type, n/4>
v_dotprod_expand_fast(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
                      const v_reg<typename V_TypeTraits<_Tp>::q_type, n / 4>& c)
{ return v_dotprod_expand(a, b, c); }
1199
1200 /** @brief Multiply and expand
1201
1202 Multiply values two registers and store results in two registers with wider pack type.
1203 Scheme:
1204 @code
1205 {A B C D} // 32-bit
1206 x {E F G H} // 32-bit
1207 ---------------
1208 {AE BF} // 64-bit
1209 {CG DH} // 64-bit
1210 @endcode
1211 Example:
1212 @code{.cpp}
1213 v_uint32x4 a, b; // {1,2,3,4} and {2,2,2,2}
1214 v_uint64x2 c, d; // results
1215 v_mul_expand(a, b, c, d); // c, d = {2,4}, {6, 8}
1216 @endcode
1217 Implemented only for 16- and unsigned 32-bit source types (v_int16x8, v_uint16x8, v_uint32x4).
1218 */
v_mul_expand(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b,v_reg<typename V_TypeTraits<_Tp>::w_type,n/2> & c,v_reg<typename V_TypeTraits<_Tp>::w_type,n/2> & d)1219 template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
1220 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c,
1221 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d)
1222 {
1223 typedef typename V_TypeTraits<_Tp>::w_type w_type;
1224 for( int i = 0; i < (n/2); i++ )
1225 {
1226 c.s[i] = (w_type)a.s[i]*b.s[i];
1227 d.s[i] = (w_type)a.s[i+(n/2)]*b.s[i+(n/2)];
1228 }
1229 }
1230
1231 /** @brief Multiply and extract high part
1232
1233 Multiply values two registers and store high part of the results.
1234 Implemented only for 16-bit source types (v_int16x8, v_uint16x8). Returns \f$ a*b >> 16 \f$
1235 */
template<typename _Tp, int n> inline v_reg<_Tp, n> v_mul_hi(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
    // Widen, multiply, then keep only the upper half of each product.
    typedef typename V_TypeTraits<_Tp>::w_type w_type;
    const int shift = (int)sizeof(_Tp)*8;
    v_reg<_Tp, n> res;
    for (int lane = 0; lane < n; lane++)
        res.s[lane] = (_Tp)(((w_type)a.s[lane] * b.s[lane]) >> shift);
    return res;
}
1244
1245 //! @cond IGNORED
v_hsum(const v_reg<_Tp,n> & a,v_reg<typename V_TypeTraits<_Tp>::w_type,n/2> & c)1246 template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
1247 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c)
1248 {
1249 typedef typename V_TypeTraits<_Tp>::w_type w_type;
1250 for( int i = 0; i < (n/2); i++ )
1251 {
1252 c.s[i] = (w_type)a.s[i*2] + a.s[i*2+1];
1253 }
1254 }
1255 //! @endcond
1256
1257 //! @brief Helper macro
1258 //! @ingroup core_hal_intrin_impl
// Expands to a template operator applying `shift_op` (<< or >>) with the same
// scalar shift amount `imm` to every lane, casting each result back to _Tp.
#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
{ \
    v_reg<_Tp, n> c; \
    for( int i = 0; i < n; i++ ) \
        c.s[i] = (_Tp)(a.s[i] shift_op imm); \
    return c; \
}
1267
// Lane-wise bit shifts by a runtime scalar amount.

/** @brief Bitwise shift left

For 16-, 32- and 64-bit integer values. */
OPENCV_HAL_IMPL_SHIFT_OP(<< )

/** @brief Bitwise shift right

For 16-, 32- and 64-bit integer values. */
OPENCV_HAL_IMPL_SHIFT_OP(>> )
1277
1278 //! @brief Helper macro
1279 //! @ingroup core_hal_intrin_impl
// Generates the pair of v_rotate_<suffix> overloads:
//  - the single-argument form shifts lanes within one vector by `imm`
//    positions, filling vacated lanes with zeros;
//  - the two-argument form shifts lanes across the (a, b) pair, so lanes that
//    would be zero-filled are instead taken from the second vector.
// `opA` encodes the direction of the source-lane index shift and `opB` its
// inverse (used to map indices into the second vector).
#define OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(suffix,opA,opB) \
template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a) \
{ \
    v_reg<_Tp, n> b; \
    for (int i = 0; i < n; i++) \
    { \
        int sIndex = i opA imm; \
        if (0 <= sIndex && sIndex < n) \
        { \
            b.s[i] = a.s[sIndex]; \
        } \
        else \
        { \
            b.s[i] = 0; \
        } \
    } \
    return b; \
} \
template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tp, n> c; \
    for (int i = 0; i < n; i++) \
    { \
        int aIndex = i opA imm; \
        int bIndex = i opA imm opB n; \
        if (0 <= bIndex && bIndex < n) \
        { \
            c.s[i] = b.s[bIndex]; \
        } \
        else if (0 <= aIndex && aIndex < n) \
        { \
            c.s[i] = a.s[aIndex]; \
        } \
        else \
        { \
            c.s[i] = 0; \
        } \
    } \
    return c; \
}
1320
// Lane rotations by a compile-time immediate (template parameter `imm`).

/** @brief Element shift left among vector

For all type */
OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(left, -, +)

/** @brief Element shift right among vector

For all type */
OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(right, +, -)
1330
1331 /** @brief Sum packed values
1332
1333 Scheme:
1334 @code
1335 {A1 A2 A3 ...} => sum{A1,A2,A3,...}
1336 @endcode
1337 */
v_reduce_sum(const v_reg<_Tp,n> & a)1338 template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a)
1339 {
1340 typename V_TypeTraits<_Tp>::sum_type c = a.s[0];
1341 for( int i = 1; i < n; i++ )
1342 c += a.s[i];
1343 return c;
1344 }
1345
1346 /** @brief Sums all elements of each input vector, returns the vector of sums
1347
1348 Scheme:
1349 @code
1350 result[0] = a[0] + a[1] + a[2] + a[3]
1351 result[1] = b[0] + b[1] + b[2] + b[3]
1352 result[2] = c[0] + c[1] + c[2] + c[3]
1353 result[3] = d[0] + d[1] + d[2] + d[3]
1354 @endcode
1355 */
template<int n> inline v_reg<float, n> v_reduce_sum4(const v_reg<float, n>& a, const v_reg<float, n>& b,
                                                     const v_reg<float, n>& c, const v_reg<float, n>& d)
{
    // Every group of 4 output lanes holds the horizontal sums of the
    // corresponding 4-lane groups of a, b, c and d, in that order.
    v_reg<float, n> res;
    for(int q = 0; q < (n/4); q++)
    {
        const int base = q*4;
        res.s[base + 0] = a.s[base] + a.s[base + 1] + a.s[base + 2] + a.s[base + 3];
        res.s[base + 1] = b.s[base] + b.s[base + 1] + b.s[base + 2] + b.s[base + 3];
        res.s[base + 2] = c.s[base] + c.s[base + 1] + c.s[base + 2] + c.s[base + 3];
        res.s[base + 3] = d.s[base] + d.s[base + 1] + d.s[base + 2] + d.s[base + 3];
    }
    return res;
}
1369
1370 /** @brief Sum absolute differences of values
1371
1372 Scheme:
1373 @code
1374 {A1 A2 A3 ...} {B1 B2 B3 ...} => sum{ABS(A1-B1),abs(A2-B2),abs(A3-B3),...}
1375 @endcode
1376 For all types except 64-bit types.*/
v_reduce_sad(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b)1377 template<typename _Tp, int n> inline typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type v_reduce_sad(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
1378 {
1379 typename V_TypeTraits< typename V_TypeTraits<_Tp>::abs_type >::sum_type c = _absdiff(a.s[0], b.s[0]);
1380 for (int i = 1; i < n; i++)
1381 c += _absdiff(a.s[i], b.s[i]);
1382 return c;
1383 }
1384
1385 /** @brief Get negative values mask
1386 @deprecated v_signmask depends on a lane count heavily and therefore isn't universal enough
1387
1388 Returned value is a bit mask with bits set to 1 on places corresponding to negative packed values indexes.
1389 Example:
1390 @code{.cpp}
1391 v_int32x4 r; // set to {-1, -1, 1, 1}
1392 int mask = v_signmask(r); // mask = 3 <== 00000000 00000000 00000000 00000011
1393 @endcode
1394 */
v_signmask(const v_reg<_Tp,n> & a)1395 template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
1396 {
1397 int mask = 0;
1398 for( int i = 0; i < n; i++ )
1399 mask |= (V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0) << i;
1400 return mask;
1401 }
1402
/** @brief Get first negative lane index

Returned value is an index of first negative lane (undefined for input of all positive values)
Example:
@code{.cpp}
v_int32x4 r; // set to {0, 0, -1, -1}
int idx = v_scan_forward(r); // idx = 2
@endcode
*/
template <typename _Tp, int n> inline int v_scan_forward(const v_reg<_Tp, n>& a)
{
    // Scan lanes in order; the reference implementation returns 0 when no
    // lane is negative, but callers must not rely on that (see note above).
    for (int i = 0; i < n; i++)
        if(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0)
            return i;
    return 0;
}
1419
1420 /** @brief Check if all packed values are less than zero
1421
1422 Unsigned values will be casted to signed: `uchar 254 => char -2`.
1423 */
v_check_all(const v_reg<_Tp,n> & a)1424 template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a)
1425 {
1426 for( int i = 0; i < n; i++ )
1427 if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) >= 0 )
1428 return false;
1429 return true;
1430 }
1431
1432 /** @brief Check if any of packed values is less than zero
1433
1434 Unsigned values will be casted to signed: `uchar 254 => char -2`.
1435 */
v_check_any(const v_reg<_Tp,n> & a)1436 template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a)
1437 {
1438 for( int i = 0; i < n; i++ )
1439 if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0 )
1440 return true;
1441 return false;
1442 }
1443
1444 /** @brief Per-element select (blend operation)
1445
1446 Return value will be built by combining values _a_ and _b_ using the following scheme:
1447 result[i] = mask[i] ? a[i] : b[i];
1448
1449 @note: _mask_ element values are restricted to these values:
1450 - 0: select element from _b_
1451 - 0xff/0xffff/etc: select element from _a_
1452 (fully compatible with bitwise-based operator)
1453 */
template<typename _Tp, int n> inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask,
                                                            const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
{
    // result[i] = mask[i] ? a[i] : b[i]; every mask lane must be either
    // all-zeros or all-ones, mirroring bitwise-blend hardware semantics.
    typedef V_TypeTraits<_Tp> Traits;
    typedef typename Traits::int_type int_type;
    v_reg<_Tp, n> res;
    for( int lane = 0; lane < n; lane++ )
    {
        const int_type m = Traits::reinterpret_int(mask.s[lane]);
        CV_DbgAssert(m == 0 || m == (~(int_type)0)); // restrict mask values: 0 or 0xff/0xffff/etc
        res.s[lane] = m ? a.s[lane] : b.s[lane];
    }
    return res;
}
1468
1469 /** @brief Expand values to the wider pack type
1470
1471 Copy contents of register to two registers with 2x wider pack type.
1472 Scheme:
1473 @code
1474 int32x4 int64x2 int64x2
1475 {A B C D} ==> {A B} , {C D}
1476 @endcode */
v_expand(const v_reg<_Tp,n> & a,v_reg<typename V_TypeTraits<_Tp>::w_type,n/2> & b0,v_reg<typename V_TypeTraits<_Tp>::w_type,n/2> & b1)1477 template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
1478 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b0,
1479 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b1)
1480 {
1481 for( int i = 0; i < (n/2); i++ )
1482 {
1483 b0.s[i] = a.s[i];
1484 b1.s[i] = a.s[i+(n/2)];
1485 }
1486 }
1487
1488 /** @brief Expand lower values to the wider pack type
1489
1490 Same as cv::v_expand, but return lower half of the vector.
1491
1492 Scheme:
1493 @code
1494 int32x4 int64x2
1495 {A B C D} ==> {A B}
1496 @endcode */
1497 template<typename _Tp, int n>
1498 inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_expand_low(const v_reg<_Tp,n> & a)1499 v_expand_low(const v_reg<_Tp, n>& a)
1500 {
1501 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> b;
1502 for( int i = 0; i < (n/2); i++ )
1503 b.s[i] = a.s[i];
1504 return b;
1505 }
1506
1507 /** @brief Expand higher values to the wider pack type
1508
1509 Same as cv::v_expand_low, but expand higher half of the vector instead.
1510
1511 Scheme:
1512 @code
1513 int32x4 int64x2
1514 {A B C D} ==> {C D}
1515 @endcode */
1516 template<typename _Tp, int n>
1517 inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
v_expand_high(const v_reg<_Tp,n> & a)1518 v_expand_high(const v_reg<_Tp, n>& a)
1519 {
1520 v_reg<typename V_TypeTraits<_Tp>::w_type, n/2> b;
1521 for( int i = 0; i < (n/2); i++ )
1522 b.s[i] = a.s[i+(n/2)];
1523 return b;
1524 }
1525
1526 //! @cond IGNORED
1527 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::int_type, n>
v_reinterpret_as_int(const v_reg<_Tp,n> & a)1528 v_reinterpret_as_int(const v_reg<_Tp, n>& a)
1529 {
1530 v_reg<typename V_TypeTraits<_Tp>::int_type, n> c;
1531 for( int i = 0; i < n; i++ )
1532 c.s[i] = V_TypeTraits<_Tp>::reinterpret_int(a.s[i]);
1533 return c;
1534 }
1535
1536 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::uint_type, n>
v_reinterpret_as_uint(const v_reg<_Tp,n> & a)1537 v_reinterpret_as_uint(const v_reg<_Tp, n>& a)
1538 {
1539 v_reg<typename V_TypeTraits<_Tp>::uint_type, n> c;
1540 for( int i = 0; i < n; i++ )
1541 c.s[i] = V_TypeTraits<_Tp>::reinterpret_uint(a.s[i]);
1542 return c;
1543 }
1544 //! @endcond
1545
1546 /** @brief Interleave two vectors
1547
1548 Scheme:
1549 @code
1550 {A1 A2 A3 A4}
1551 {B1 B2 B3 B4}
1552 ---------------
1553 {A1 B1 A2 B2} and {A3 B3 A4 B4}
1554 @endcode
1555 For all types except 64-bit.
1556 */
v_zip(const v_reg<_Tp,n> & a0,const v_reg<_Tp,n> & a1,v_reg<_Tp,n> & b0,v_reg<_Tp,n> & b1)1557 template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
1558 v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 )
1559 {
1560 int i;
1561 for( i = 0; i < n/2; i++ )
1562 {
1563 b0.s[i*2] = a0.s[i];
1564 b0.s[i*2+1] = a1.s[i];
1565 }
1566 for( ; i < n; i++ )
1567 {
1568 b1.s[i*2-n] = a0.s[i];
1569 b1.s[i*2-n+1] = a1.s[i];
1570 }
1571 }
1572
1573 /** @brief Load register contents from memory
1574
1575 @param ptr pointer to memory block with data
1576 @return register object
1577
1578 @note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x16, int ==> cv::v_int32x4, etc.
1579
1580 @note Use vx_load version to get maximum available register length result
1581
1582 @note Alignment requirement:
1583 if CV_STRONG_ALIGNMENT=1 then passed pointer must be aligned (`sizeof(lane type)` should be enough).
1584 Do not cast pointer types without runtime check for pointer alignment (like `uchar*` => `int*`).
1585 */
template<typename _Tp>
inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load(const _Tp* ptr)
{
#if CV_STRONG_ALIGNMENT
    // On strict-alignment targets, reject pointers not aligned to the lane
    // type (misaligned scalar loads would fault / be undefined behaviour).
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    return v_reg<_Tp, simd128_width / sizeof(_Tp)>(ptr);
}
1594
1595 #if CV_SIMD256
1596 /** @brief Load 256-bit length register contents from memory
1597
1598 @param ptr pointer to memory block with data
1599 @return register object
1600
1601 @note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x32, int ==> cv::v_int32x8, etc.
1602
1603 @note Check CV_SIMD256 preprocessor definition prior to use.
1604 Use vx_load version to get maximum available register length result
1605
1606 @note Alignment requirement:
1607 if CV_STRONG_ALIGNMENT=1 then passed pointer must be aligned (`sizeof(lane type)` should be enough).
1608 Do not cast pointer types without runtime check for pointer alignment (like `uchar*` => `int*`).
1609 */
1610 template<typename _Tp>
v256_load(const _Tp * ptr)1611 inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load(const _Tp* ptr)
1612 {
1613 #if CV_STRONG_ALIGNMENT
1614 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
1615 #endif
1616 return v_reg<_Tp, simd256_width / sizeof(_Tp)>(ptr);
1617 }
1618 #endif
1619
1620 #if CV_SIMD512
1621 /** @brief Load 512-bit length register contents from memory
1622
1623 @param ptr pointer to memory block with data
1624 @return register object
1625
1626 @note Returned type will be detected from passed pointer type, for example uchar ==> cv::v_uint8x64, int ==> cv::v_int32x16, etc.
1627
1628 @note Check CV_SIMD512 preprocessor definition prior to use.
1629 Use vx_load version to get maximum available register length result
1630
1631 @note Alignment requirement:
1632 if CV_STRONG_ALIGNMENT=1 then passed pointer must be aligned (`sizeof(lane type)` should be enough).
1633 Do not cast pointer types without runtime check for pointer alignment (like `uchar*` => `int*`).
1634 */
1635 template<typename _Tp>
v512_load(const _Tp * ptr)1636 inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load(const _Tp* ptr)
1637 {
1638 #if CV_STRONG_ALIGNMENT
1639 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
1640 #endif
1641 return v_reg<_Tp, simd512_width / sizeof(_Tp)>(ptr);
1642 }
1643 #endif
1644
1645 /** @brief Load register contents from memory (aligned)
1646
1647 similar to cv::v_load, but source memory block should be aligned (to 16-byte boundary in case of SIMD128, 32-byte - SIMD256, etc)
1648
1649 @note Use vx_load_aligned version to get maximum available register length result
1650 */
template<typename _Tp>
inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load_aligned(const _Tp* ptr)
{
    // Alignment to the full register size is part of this function's contract,
    // so it is checked unconditionally (not only under CV_STRONG_ALIGNMENT).
    CV_Assert(isAligned<sizeof(v_reg<_Tp, simd128_width / sizeof(_Tp)>)>(ptr));
    return v_reg<_Tp, simd128_width / sizeof(_Tp)>(ptr);
}
1657
1658 #if CV_SIMD256
1659 /** @brief Load register contents from memory (aligned)
1660
1661 similar to cv::v256_load, but source memory block should be aligned (to 32-byte boundary in case of SIMD256, 64-byte - SIMD512, etc)
1662
1663 @note Check CV_SIMD256 preprocessor definition prior to use.
1664 Use vx_load_aligned version to get maximum available register length result
1665 */
1666 template<typename _Tp>
v256_load_aligned(const _Tp * ptr)1667 inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load_aligned(const _Tp* ptr)
1668 {
1669 CV_Assert(isAligned<sizeof(v_reg<_Tp, simd256_width / sizeof(_Tp)>)>(ptr));
1670 return v_reg<_Tp, simd256_width / sizeof(_Tp)>(ptr);
1671 }
1672 #endif
1673
1674 #if CV_SIMD512
1675 /** @brief Load register contents from memory (aligned)
1676
1677 similar to cv::v512_load, but source memory block should be aligned (to 64-byte boundary in case of SIMD512, etc)
1678
1679 @note Check CV_SIMD512 preprocessor definition prior to use.
1680 Use vx_load_aligned version to get maximum available register length result
1681 */
1682 template<typename _Tp>
v512_load_aligned(const _Tp * ptr)1683 inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load_aligned(const _Tp* ptr)
1684 {
1685 CV_Assert(isAligned<sizeof(v_reg<_Tp, simd512_width / sizeof(_Tp)>)>(ptr));
1686 return v_reg<_Tp, simd512_width / sizeof(_Tp)>(ptr);
1687 }
1688 #endif
1689
1690 /** @brief Load 64-bits of data to lower part (high part is undefined).
1691
1692 @param ptr memory block containing data for first half (0..n/2)
1693
1694 @code{.cpp}
1695 int lo[2] = { 1, 2 };
1696 v_int32x4 r = v_load_low(lo);
1697 @endcode
1698
1699 @note Use vx_load_low version to get maximum available register length result
1700 */
template<typename _Tp>
inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load_low(const _Tp* ptr)
{
#if CV_STRONG_ALIGNMENT
    CV_Assert(isAligned<sizeof(_Tp)>(ptr));
#endif
    // Fill only the low half of the register; the high half is left
    // default-constructed (unspecified by contract).
    v_reg<_Tp, simd128_width / sizeof(_Tp)> c;
    for( int i = 0; i < c.nlanes/2; i++ )
    {
        c.s[i] = ptr[i];
    }
    return c;
}
1714
1715 #if CV_SIMD256
1716 /** @brief Load 128-bits of data to lower part (high part is undefined).
1717
1718 @param ptr memory block containing data for first half (0..n/2)
1719
1720 @code{.cpp}
1721 int lo[4] = { 1, 2, 3, 4 };
1722 v_int32x8 r = v256_load_low(lo);
1723 @endcode
1724
1725 @note Check CV_SIMD256 preprocessor definition prior to use.
1726 Use vx_load_low version to get maximum available register length result
1727 */
1728 template<typename _Tp>
v256_load_low(const _Tp * ptr)1729 inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load_low(const _Tp* ptr)
1730 {
1731 #if CV_STRONG_ALIGNMENT
1732 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
1733 #endif
1734 v_reg<_Tp, simd256_width / sizeof(_Tp)> c;
1735 for (int i = 0; i < c.nlanes / 2; i++)
1736 {
1737 c.s[i] = ptr[i];
1738 }
1739 return c;
1740 }
1741 #endif
1742
1743 #if CV_SIMD512
1744 /** @brief Load 256-bits of data to lower part (high part is undefined).
1745
1746 @param ptr memory block containing data for first half (0..n/2)
1747
1748 @code{.cpp}
1749 int lo[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
1750 v_int32x16 r = v512_load_low(lo);
1751 @endcode
1752
1753 @note Check CV_SIMD512 preprocessor definition prior to use.
1754 Use vx_load_low version to get maximum available register length result
1755 */
1756 template<typename _Tp>
v512_load_low(const _Tp * ptr)1757 inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load_low(const _Tp* ptr)
1758 {
1759 #if CV_STRONG_ALIGNMENT
1760 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
1761 #endif
1762 v_reg<_Tp, simd512_width / sizeof(_Tp)> c;
1763 for (int i = 0; i < c.nlanes / 2; i++)
1764 {
1765 c.s[i] = ptr[i];
1766 }
1767 return c;
1768 }
1769 #endif
1770
1771 /** @brief Load register contents from two memory blocks
1772
1773 @param loptr memory block containing data for first half (0..n/2)
1774 @param hiptr memory block containing data for second half (n/2..n)
1775
1776 @code{.cpp}
1777 int lo[2] = { 1, 2 }, hi[2] = { 3, 4 };
1778 v_int32x4 r = v_load_halves(lo, hi);
1779 @endcode
1780
1781 @note Use vx_load_halves version to get maximum available register length result
1782 */
1783 template<typename _Tp>
v_load_halves(const _Tp * loptr,const _Tp * hiptr)1784 inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_load_halves(const _Tp* loptr, const _Tp* hiptr)
1785 {
1786 #if CV_STRONG_ALIGNMENT
1787 CV_Assert(isAligned<sizeof(_Tp)>(loptr));
1788 CV_Assert(isAligned<sizeof(_Tp)>(hiptr));
1789 #endif
1790 v_reg<_Tp, simd128_width / sizeof(_Tp)> c;
1791 for( int i = 0; i < c.nlanes/2; i++ )
1792 {
1793 c.s[i] = loptr[i];
1794 c.s[i+c.nlanes/2] = hiptr[i];
1795 }
1796 return c;
1797 }
1798
1799 #if CV_SIMD256
1800 /** @brief Load register contents from two memory blocks
1801
1802 @param loptr memory block containing data for first half (0..n/2)
1803 @param hiptr memory block containing data for second half (n/2..n)
1804
1805 @code{.cpp}
1806 int lo[4] = { 1, 2, 3, 4 }, hi[4] = { 5, 6, 7, 8 };
1807 v_int32x8 r = v256_load_halves(lo, hi);
1808 @endcode
1809
1810 @note Check CV_SIMD256 preprocessor definition prior to use.
1811 Use vx_load_halves version to get maximum available register length result
1812 */
1813 template<typename _Tp>
v256_load_halves(const _Tp * loptr,const _Tp * hiptr)1814 inline v_reg<_Tp, simd256_width / sizeof(_Tp)> v256_load_halves(const _Tp* loptr, const _Tp* hiptr)
1815 {
1816 #if CV_STRONG_ALIGNMENT
1817 CV_Assert(isAligned<sizeof(_Tp)>(loptr));
1818 CV_Assert(isAligned<sizeof(_Tp)>(hiptr));
1819 #endif
1820 v_reg<_Tp, simd256_width / sizeof(_Tp)> c;
1821 for (int i = 0; i < c.nlanes / 2; i++)
1822 {
1823 c.s[i] = loptr[i];
1824 c.s[i + c.nlanes / 2] = hiptr[i];
1825 }
1826 return c;
1827 }
1828 #endif
1829
1830 #if CV_SIMD512
1831 /** @brief Load register contents from two memory blocks
1832
1833 @param loptr memory block containing data for first half (0..n/2)
1834 @param hiptr memory block containing data for second half (n/2..n)
1835
1836 @code{.cpp}
int lo[8] = { 1, 2, 3, 4, 5, 6, 7, 8 }, hi[8] = { 9, 10, 11, 12, 13, 14, 15, 16 };
1838 v_int32x16 r = v512_load_halves(lo, hi);
1839 @endcode
1840
1841 @note Check CV_SIMD512 preprocessor definition prior to use.
1842 Use vx_load_halves version to get maximum available register length result
1843 */
1844 template<typename _Tp>
v512_load_halves(const _Tp * loptr,const _Tp * hiptr)1845 inline v_reg<_Tp, simd512_width / sizeof(_Tp)> v512_load_halves(const _Tp* loptr, const _Tp* hiptr)
1846 {
1847 #if CV_STRONG_ALIGNMENT
1848 CV_Assert(isAligned<sizeof(_Tp)>(loptr));
1849 CV_Assert(isAligned<sizeof(_Tp)>(hiptr));
1850 #endif
1851 v_reg<_Tp, simd512_width / sizeof(_Tp)> c;
1852 for (int i = 0; i < c.nlanes / 2; i++)
1853 {
1854 c.s[i] = loptr[i];
1855 c.s[i + c.nlanes / 2] = hiptr[i];
1856 }
1857 return c;
1858 }
1859 #endif
1860
1861 /** @brief Load register contents from memory with double expand
1862
1863 Same as cv::v_load, but result pack type will be 2x wider than memory type.
1864
1865 @code{.cpp}
1866 short buf[4] = {1, 2, 3, 4}; // type is int16
1867 v_int32x4 r = v_load_expand(buf); // r = {1, 2, 3, 4} - type is int32
1868 @endcode
1869 For 8-, 16-, 32-bit integer source types.
1870
1871 @note Use vx_load_expand version to get maximum available register length result
1872 */
1873 template<typename _Tp>
1874 inline v_reg<typename V_TypeTraits<_Tp>::w_type, simd128_width / sizeof(typename V_TypeTraits<_Tp>::w_type)>
v_load_expand(const _Tp * ptr)1875 v_load_expand(const _Tp* ptr)
1876 {
1877 #if CV_STRONG_ALIGNMENT
1878 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
1879 #endif
1880 typedef typename V_TypeTraits<_Tp>::w_type w_type;
1881 v_reg<w_type, simd128_width / sizeof(w_type)> c;
1882 for( int i = 0; i < c.nlanes; i++ )
1883 {
1884 c.s[i] = ptr[i];
1885 }
1886 return c;
1887 }
1888
1889 #if CV_SIMD256
1890 /** @brief Load register contents from memory with double expand
1891
1892 Same as cv::v256_load, but result pack type will be 2x wider than memory type.
1893
1894 @code{.cpp}
1895 short buf[8] = {1, 2, 3, 4, 5, 6, 7, 8}; // type is int16
1896 v_int32x8 r = v256_load_expand(buf); // r = {1, 2, 3, 4, 5, 6, 7, 8} - type is int32
1897 @endcode
1898 For 8-, 16-, 32-bit integer source types.
1899
1900 @note Check CV_SIMD256 preprocessor definition prior to use.
1901 Use vx_load_expand version to get maximum available register length result
1902 */
1903 template<typename _Tp>
1904 inline v_reg<typename V_TypeTraits<_Tp>::w_type, simd256_width / sizeof(typename V_TypeTraits<_Tp>::w_type)>
v256_load_expand(const _Tp * ptr)1905 v256_load_expand(const _Tp* ptr)
1906 {
1907 #if CV_STRONG_ALIGNMENT
1908 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
1909 #endif
1910 typedef typename V_TypeTraits<_Tp>::w_type w_type;
1911 v_reg<w_type, simd256_width / sizeof(w_type)> c;
1912 for (int i = 0; i < c.nlanes; i++)
1913 {
1914 c.s[i] = ptr[i];
1915 }
1916 return c;
1917 }
1918 #endif
1919
1920 #if CV_SIMD512
1921 /** @brief Load register contents from memory with double expand
1922
1923 Same as cv::v512_load, but result pack type will be 2x wider than memory type.
1924
1925 @code{.cpp}
short buf[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; // type is int16
1927 v_int32x16 r = v512_load_expand(buf); // r = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} - type is int32
1928 @endcode
1929 For 8-, 16-, 32-bit integer source types.
1930
1931 @note Check CV_SIMD512 preprocessor definition prior to use.
1932 Use vx_load_expand version to get maximum available register length result
1933 */
1934 template<typename _Tp>
1935 inline v_reg<typename V_TypeTraits<_Tp>::w_type, simd512_width / sizeof(typename V_TypeTraits<_Tp>::w_type)>
v512_load_expand(const _Tp * ptr)1936 v512_load_expand(const _Tp* ptr)
1937 {
1938 #if CV_STRONG_ALIGNMENT
1939 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
1940 #endif
1941 typedef typename V_TypeTraits<_Tp>::w_type w_type;
1942 v_reg<w_type, simd512_width / sizeof(w_type)> c;
1943 for (int i = 0; i < c.nlanes; i++)
1944 {
1945 c.s[i] = ptr[i];
1946 }
1947 return c;
1948 }
1949 #endif
1950
1951 /** @brief Load register contents from memory with quad expand
1952
1953 Same as cv::v_load_expand, but result type is 4 times wider than source.
1954 @code{.cpp}
1955 char buf[4] = {1, 2, 3, 4}; // type is int8
1956 v_int32x4 r = v_load_expand_q(buf); // r = {1, 2, 3, 4} - type is int32
1957 @endcode
1958 For 8-bit integer source types.
1959
1960 @note Use vx_load_expand_q version to get maximum available register length result
1961 */
1962 template<typename _Tp>
1963 inline v_reg<typename V_TypeTraits<_Tp>::q_type, simd128_width / sizeof(typename V_TypeTraits<_Tp>::q_type)>
v_load_expand_q(const _Tp * ptr)1964 v_load_expand_q(const _Tp* ptr)
1965 {
1966 #if CV_STRONG_ALIGNMENT
1967 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
1968 #endif
1969 typedef typename V_TypeTraits<_Tp>::q_type q_type;
1970 v_reg<q_type, simd128_width / sizeof(q_type)> c;
1971 for( int i = 0; i < c.nlanes; i++ )
1972 {
1973 c.s[i] = ptr[i];
1974 }
1975 return c;
1976 }
1977
1978 #if CV_SIMD256
1979 /** @brief Load register contents from memory with quad expand
1980
1981 Same as cv::v256_load_expand, but result type is 4 times wider than source.
1982 @code{.cpp}
1983 char buf[8] = {1, 2, 3, 4, 5, 6, 7, 8}; // type is int8
1984 v_int32x8 r = v256_load_expand_q(buf); // r = {1, 2, 3, 4, 5, 6, 7, 8} - type is int32
1985 @endcode
1986 For 8-bit integer source types.
1987
1988 @note Check CV_SIMD256 preprocessor definition prior to use.
1989 Use vx_load_expand_q version to get maximum available register length result
1990 */
1991 template<typename _Tp>
1992 inline v_reg<typename V_TypeTraits<_Tp>::q_type, simd256_width / sizeof(typename V_TypeTraits<_Tp>::q_type)>
v256_load_expand_q(const _Tp * ptr)1993 v256_load_expand_q(const _Tp* ptr)
1994 {
1995 #if CV_STRONG_ALIGNMENT
1996 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
1997 #endif
1998 typedef typename V_TypeTraits<_Tp>::q_type q_type;
1999 v_reg<q_type, simd256_width / sizeof(q_type)> c;
2000 for (int i = 0; i < c.nlanes; i++)
2001 {
2002 c.s[i] = ptr[i];
2003 }
2004 return c;
2005 }
2006 #endif
2007
2008 #if CV_SIMD512
2009 /** @brief Load register contents from memory with quad expand
2010
2011 Same as cv::v512_load_expand, but result type is 4 times wider than source.
2012 @code{.cpp}
2013 char buf[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; // type is int8
2014 v_int32x16 r = v512_load_expand_q(buf); // r = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} - type is int32
2015 @endcode
2016 For 8-bit integer source types.
2017
2018 @note Check CV_SIMD512 preprocessor definition prior to use.
2019 Use vx_load_expand_q version to get maximum available register length result
2020 */
2021 template<typename _Tp>
2022 inline v_reg<typename V_TypeTraits<_Tp>::q_type, simd512_width / sizeof(typename V_TypeTraits<_Tp>::q_type)>
v512_load_expand_q(const _Tp * ptr)2023 v512_load_expand_q(const _Tp* ptr)
2024 {
2025 #if CV_STRONG_ALIGNMENT
2026 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2027 #endif
2028 typedef typename V_TypeTraits<_Tp>::q_type q_type;
2029 v_reg<q_type, simd512_width / sizeof(q_type)> c;
2030 for (int i = 0; i < c.nlanes; i++)
2031 {
2032 c.s[i] = ptr[i];
2033 }
2034 return c;
2035 }
2036 #endif
2037
2038 /** @brief Load and deinterleave (2 channels)
2039
2040 Load data from memory deinterleave and store to 2 registers.
2041 Scheme:
2042 @code
2043 {A1 B1 A2 B2 ...} ==> {A1 A2 ...}, {B1 B2 ...}
2044 @endcode
2045 For all types except 64-bit. */
v_load_deinterleave(const _Tp * ptr,v_reg<_Tp,n> & a,v_reg<_Tp,n> & b)2046 template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
2047 v_reg<_Tp, n>& b)
2048 {
2049 #if CV_STRONG_ALIGNMENT
2050 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2051 #endif
2052 int i, i2;
2053 for( i = i2 = 0; i < n; i++, i2 += 2 )
2054 {
2055 a.s[i] = ptr[i2];
2056 b.s[i] = ptr[i2+1];
2057 }
2058 }
2059
2060 /** @brief Load and deinterleave (3 channels)
2061
2062 Load data from memory deinterleave and store to 3 registers.
2063 Scheme:
2064 @code
2065 {A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}
2066 @endcode
2067 For all types except 64-bit. */
v_load_deinterleave(const _Tp * ptr,v_reg<_Tp,n> & a,v_reg<_Tp,n> & b,v_reg<_Tp,n> & c)2068 template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
2069 v_reg<_Tp, n>& b, v_reg<_Tp, n>& c)
2070 {
2071 #if CV_STRONG_ALIGNMENT
2072 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2073 #endif
2074 int i, i3;
2075 for( i = i3 = 0; i < n; i++, i3 += 3 )
2076 {
2077 a.s[i] = ptr[i3];
2078 b.s[i] = ptr[i3+1];
2079 c.s[i] = ptr[i3+2];
2080 }
2081 }
2082
2083 /** @brief Load and deinterleave (4 channels)
2084
2085 Load data from memory deinterleave and store to 4 registers.
2086 Scheme:
2087 @code
2088 {A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...}
2089 @endcode
2090 For all types except 64-bit. */
2091 template<typename _Tp, int n>
v_load_deinterleave(const _Tp * ptr,v_reg<_Tp,n> & a,v_reg<_Tp,n> & b,v_reg<_Tp,n> & c,v_reg<_Tp,n> & d)2092 inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
2093 v_reg<_Tp, n>& b, v_reg<_Tp, n>& c,
2094 v_reg<_Tp, n>& d)
2095 {
2096 #if CV_STRONG_ALIGNMENT
2097 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2098 #endif
2099 int i, i4;
2100 for( i = i4 = 0; i < n; i++, i4 += 4 )
2101 {
2102 a.s[i] = ptr[i4];
2103 b.s[i] = ptr[i4+1];
2104 c.s[i] = ptr[i4+2];
2105 d.s[i] = ptr[i4+3];
2106 }
2107 }
2108
2109 /** @brief Interleave and store (2 channels)
2110
2111 Interleave and store data from 2 registers to memory.
2112 Scheme:
2113 @code
2114 {A1 A2 ...}, {B1 B2 ...} ==> {A1 B1 A2 B2 ...}
2115 @endcode
2116 For all types except 64-bit. */
2117 template<typename _Tp, int n>
v_store_interleave(_Tp * ptr,const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b,hal::StoreMode=hal::STORE_UNALIGNED)2118 inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
2119 const v_reg<_Tp, n>& b,
2120 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
2121 {
2122 #if CV_STRONG_ALIGNMENT
2123 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2124 #endif
2125 int i, i2;
2126 for( i = i2 = 0; i < n; i++, i2 += 2 )
2127 {
2128 ptr[i2] = a.s[i];
2129 ptr[i2+1] = b.s[i];
2130 }
2131 }
2132
2133 /** @brief Interleave and store (3 channels)
2134
2135 Interleave and store data from 3 registers to memory.
2136 Scheme:
2137 @code
2138 {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...} ==> {A1 B1 C1 A2 B2 C2 ...}
2139 @endcode
2140 For all types except 64-bit. */
2141 template<typename _Tp, int n>
v_store_interleave(_Tp * ptr,const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b,const v_reg<_Tp,n> & c,hal::StoreMode=hal::STORE_UNALIGNED)2142 inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
2143 const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
2144 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
2145 {
2146 #if CV_STRONG_ALIGNMENT
2147 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2148 #endif
2149 int i, i3;
2150 for( i = i3 = 0; i < n; i++, i3 += 3 )
2151 {
2152 ptr[i3] = a.s[i];
2153 ptr[i3+1] = b.s[i];
2154 ptr[i3+2] = c.s[i];
2155 }
2156 }
2157
2158 /** @brief Interleave and store (4 channels)
2159
2160 Interleave and store data from 4 registers to memory.
2161 Scheme:
2162 @code
2163 {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} ==> {A1 B1 C1 D1 A2 B2 C2 D2 ...}
2164 @endcode
2165 For all types except 64-bit. */
v_store_interleave(_Tp * ptr,const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b,const v_reg<_Tp,n> & c,const v_reg<_Tp,n> & d,hal::StoreMode=hal::STORE_UNALIGNED)2166 template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
2167 const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
2168 const v_reg<_Tp, n>& d,
2169 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED)
2170 {
2171 #if CV_STRONG_ALIGNMENT
2172 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2173 #endif
2174 int i, i4;
2175 for( i = i4 = 0; i < n; i++, i4 += 4 )
2176 {
2177 ptr[i4] = a.s[i];
2178 ptr[i4+1] = b.s[i];
2179 ptr[i4+2] = c.s[i];
2180 ptr[i4+3] = d.s[i];
2181 }
2182 }
2183
2184 /** @brief Store data to memory
2185
2186 Store register contents to memory.
2187 Scheme:
2188 @code
2189 REG {A B C D} ==> MEM {A B C D}
2190 @endcode
2191 Pointer can be unaligned. */
2192 template<typename _Tp, int n>
v_store(_Tp * ptr,const v_reg<_Tp,n> & a)2193 inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a)
2194 {
2195 #if CV_STRONG_ALIGNMENT
2196 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2197 #endif
2198 for( int i = 0; i < n; i++ )
2199 ptr[i] = a.s[i];
2200 }
2201
2202 template<typename _Tp, int n>
v_store(_Tp * ptr,const v_reg<_Tp,n> & a,hal::StoreMode)2203 inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/)
2204 {
2205 #if CV_STRONG_ALIGNMENT
2206 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2207 #endif
2208 v_store(ptr, a);
2209 }
2210
2211 /** @brief Store data to memory (lower half)
2212
2213 Store lower half of register contents to memory.
2214 Scheme:
2215 @code
2216 REG {A B C D} ==> MEM {A B}
2217 @endcode */
2218 template<typename _Tp, int n>
v_store_low(_Tp * ptr,const v_reg<_Tp,n> & a)2219 inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a)
2220 {
2221 #if CV_STRONG_ALIGNMENT
2222 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2223 #endif
2224 for( int i = 0; i < (n/2); i++ )
2225 ptr[i] = a.s[i];
2226 }
2227
2228 /** @brief Store data to memory (higher half)
2229
2230 Store higher half of register contents to memory.
2231 Scheme:
2232 @code
2233 REG {A B C D} ==> MEM {C D}
2234 @endcode */
2235 template<typename _Tp, int n>
v_store_high(_Tp * ptr,const v_reg<_Tp,n> & a)2236 inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a)
2237 {
2238 #if CV_STRONG_ALIGNMENT
2239 CV_Assert(isAligned<sizeof(_Tp)>(ptr));
2240 #endif
2241 for( int i = 0; i < (n/2); i++ )
2242 ptr[i] = a.s[i+(n/2)];
2243 }
2244
2245 /** @brief Store data to memory (aligned)
2246
2247 Store register contents to memory.
2248 Scheme:
2249 @code
2250 REG {A B C D} ==> MEM {A B C D}
2251 @endcode
2252 Pointer __should__ be aligned by 16-byte boundary. */
2253 template<typename _Tp, int n>
v_store_aligned(_Tp * ptr,const v_reg<_Tp,n> & a)2254 inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
2255 {
2256 CV_Assert(isAligned<sizeof(v_reg<_Tp, n>)>(ptr));
2257 v_store(ptr, a);
2258 }
2259
2260 template<typename _Tp, int n>
v_store_aligned_nocache(_Tp * ptr,const v_reg<_Tp,n> & a)2261 inline void v_store_aligned_nocache(_Tp* ptr, const v_reg<_Tp, n>& a)
2262 {
2263 CV_Assert(isAligned<sizeof(v_reg<_Tp, n>)>(ptr));
2264 v_store(ptr, a);
2265 }
2266
2267 template<typename _Tp, int n>
v_store_aligned(_Tp * ptr,const v_reg<_Tp,n> & a,hal::StoreMode)2268 inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a, hal::StoreMode /*mode*/)
2269 {
2270 CV_Assert(isAligned<sizeof(v_reg<_Tp, n>)>(ptr));
2271 v_store(ptr, a);
2272 }
2273
2274 /** @brief Combine vector from first elements of two vectors
2275
2276 Scheme:
2277 @code
2278 {A1 A2 A3 A4}
2279 {B1 B2 B3 B4}
2280 ---------------
2281 {A1 A2 B1 B2}
2282 @endcode
2283 For all types except 64-bit. */
2284 template<typename _Tp, int n>
v_combine_low(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b)2285 inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
2286 {
2287 v_reg<_Tp, n> c;
2288 for( int i = 0; i < (n/2); i++ )
2289 {
2290 c.s[i] = a.s[i];
2291 c.s[i+(n/2)] = b.s[i];
2292 }
2293 return c;
2294 }
2295
2296 /** @brief Combine vector from last elements of two vectors
2297
2298 Scheme:
2299 @code
2300 {A1 A2 A3 A4}
2301 {B1 B2 B3 B4}
2302 ---------------
2303 {A3 A4 B3 B4}
2304 @endcode
2305 For all types except 64-bit. */
2306 template<typename _Tp, int n>
v_combine_high(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b)2307 inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
2308 {
2309 v_reg<_Tp, n> c;
2310 for( int i = 0; i < (n/2); i++ )
2311 {
2312 c.s[i] = a.s[i+(n/2)];
2313 c.s[i+(n/2)] = b.s[i+(n/2)];
2314 }
2315 return c;
2316 }
2317
2318 /** @brief Combine two vectors from lower and higher parts of two other vectors
2319
2320 @code{.cpp}
2321 low = cv::v_combine_low(a, b);
2322 high = cv::v_combine_high(a, b);
2323 @endcode */
2324 template<typename _Tp, int n>
v_recombine(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b,v_reg<_Tp,n> & low,v_reg<_Tp,n> & high)2325 inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
2326 v_reg<_Tp, n>& low, v_reg<_Tp, n>& high)
2327 {
2328 for( int i = 0; i < (n/2); i++ )
2329 {
2330 low.s[i] = a.s[i];
2331 low.s[i+(n/2)] = b.s[i];
2332 high.s[i] = a.s[i+(n/2)];
2333 high.s[i+(n/2)] = b.s[i+(n/2)];
2334 }
2335 }
2336
2337 /** @brief Vector reverse order
2338
2339 Reverse the order of the vector
2340 Scheme:
2341 @code
2342 REG {A1 ... An} ==> REG {An ... A1}
2343 @endcode
2344 For all types. */
2345 template<typename _Tp, int n>
v_reverse(const v_reg<_Tp,n> & a)2346 inline v_reg<_Tp, n> v_reverse(const v_reg<_Tp, n>& a)
2347 {
2348 v_reg<_Tp, n> c;
2349 for( int i = 0; i < n; i++ )
2350 c.s[i] = a.s[n-i-1];
2351 return c;
2352 }
2353
2354 /** @brief Vector extract
2355
2356 Scheme:
2357 @code
2358 {A1 A2 A3 A4}
2359 {B1 B2 B3 B4}
2360 ========================
2361 shift = 1 {A2 A3 A4 B1}
2362 shift = 2 {A3 A4 B1 B2}
2363 shift = 3 {A4 B1 B2 B3}
2364 @endcode
2365 Restriction: 0 <= shift < nlanes
2366
2367 Usage:
2368 @code
2369 v_int32x4 a, b, c;
2370 c = v_extract<2>(a, b);
2371 @endcode
2372 For all types. */
2373 template<int s, typename _Tp, int n>
v_extract(const v_reg<_Tp,n> & a,const v_reg<_Tp,n> & b)2374 inline v_reg<_Tp, n> v_extract(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
2375 {
2376 v_reg<_Tp, n> r;
2377 const int shift = n - s;
2378 int i = 0;
2379 for (; i < shift; ++i)
2380 r.s[i] = a.s[i+s];
2381 for (; i < n; ++i)
2382 r.s[i] = b.s[i-shift];
2383 return r;
2384 }
2385
2386 /** @brief Vector extract
2387
2388 Scheme:
2389 Return the s-th element of v.
2390 Restriction: 0 <= s < nlanes
2391
2392 Usage:
2393 @code
2394 v_int32x4 a;
2395 int r;
2396 r = v_extract_n<2>(a);
2397 @endcode
2398 For all types. */
2399 template<int s, typename _Tp, int n>
v_extract_n(const v_reg<_Tp,n> & v)2400 inline _Tp v_extract_n(const v_reg<_Tp, n>& v)
2401 {
2402 CV_DbgAssert(s >= 0 && s < n);
2403 return v.s[s];
2404 }
2405
2406 /** @brief Broadcast i-th element of vector
2407
2408 Scheme:
2409 @code
2410 { v[0] v[1] v[2] ... v[SZ] } => { v[i], v[i], v[i] ... v[i] }
2411 @endcode
2412 Restriction: 0 <= i < nlanes
2413 Supported types: 32-bit integers and floats (s32/u32/f32)
2414 */
2415 template<int i, typename _Tp, int n>
v_broadcast_element(const v_reg<_Tp,n> & a)2416 inline v_reg<_Tp, n> v_broadcast_element(const v_reg<_Tp, n>& a)
2417 {
2418 CV_DbgAssert(i >= 0 && i < n);
2419 return v_reg<_Tp, n>::all(a.s[i]);
2420 }
2421
2422 /** @brief Round elements
2423
2424 Rounds each value. Input type is float vector ==> output type is int vector.
2425 @note Only for floating point types.
2426 */
template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
{
    // Round each float lane to the nearest integer (cvRound semantics).
    v_reg<int, n> res;
    for (int lane = 0; lane < n; ++lane)
        res.s[lane] = cvRound(a.s[lane]);
    return res;
}
2434
2435 /** @overload */
template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a, const v_reg<double, n>& b)
{
    // Round both double inputs; a fills the lower half, b the upper half.
    v_reg<int, n*2> res;
    for (int lane = 0; lane < n; ++lane)
    {
        res.s[lane]     = cvRound(a.s[lane]);
        res.s[n + lane] = cvRound(b.s[lane]);
    }
    return res;
}
2446
2447 /** @brief Floor elements
2448
2449 Floor each value. Input type is float vector ==> output type is int vector.
2450 @note Only for floating point types.
2451 */
template<int n> inline v_reg<int, n> v_floor(const v_reg<float, n>& a)
{
    // Floor each float lane to int (cvFloor semantics).
    v_reg<int, n> res;
    for (int lane = 0; lane < n; ++lane)
        res.s[lane] = cvFloor(a.s[lane]);
    return res;
}
2459
2460 /** @brief Ceil elements
2461
2462 Ceil each value. Input type is float vector ==> output type is int vector.
2463 @note Only for floating point types.
2464 */
template<int n> inline v_reg<int, n> v_ceil(const v_reg<float, n>& a)
{
    // Ceil each float lane to int (cvCeil semantics).
    v_reg<int, n> res;
    for (int lane = 0; lane < n; ++lane)
        res.s[lane] = cvCeil(a.s[lane]);
    return res;
}
2472
2473 /** @brief Truncate elements
2474
2475 Truncate each value. Input type is float vector ==> output type is int vector.
2476 @note Only for floating point types.
2477 */
template<int n> inline v_reg<int, n> v_trunc(const v_reg<float, n>& a)
{
    // Truncate each float lane toward zero (plain C cast).
    v_reg<int, n> res;
    for (int lane = 0; lane < n; ++lane)
        res.s[lane] = (int)(a.s[lane]);
    return res;
}
2485
2486 /** @overload */
template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a)
{
    // Lower n lanes hold the rounded doubles; the upper n lanes are zeroed.
    v_reg<int, n*2> res;
    for (int lane = 0; lane < n; ++lane)
    {
        res.s[lane]     = cvRound(a.s[lane]);
        res.s[n + lane] = 0;
    }
    return res;
}
2497
2498 /** @overload */
template<int n> inline v_reg<int, n*2> v_floor(const v_reg<double, n>& a)
{
    // Lower n lanes hold the floored doubles; the upper n lanes are zeroed.
    v_reg<int, n*2> res;
    for (int lane = 0; lane < n; ++lane)
    {
        res.s[lane]     = cvFloor(a.s[lane]);
        res.s[n + lane] = 0;
    }
    return res;
}
2509
2510 /** @overload */
template<int n> inline v_reg<int, n*2> v_ceil(const v_reg<double, n>& a)
{
    // Lower n lanes hold the ceiled doubles; the upper n lanes are zeroed.
    v_reg<int, n*2> res;
    for (int lane = 0; lane < n; ++lane)
    {
        res.s[lane]     = cvCeil(a.s[lane]);
        res.s[n + lane] = 0;
    }
    return res;
}
2521
2522 /** @overload */
template<int n> inline v_reg<int, n*2> v_trunc(const v_reg<double, n>& a)
{
    // Lower n lanes hold the truncated doubles; the upper n lanes are zeroed.
    v_reg<int, n*2> res;
    for (int lane = 0; lane < n; ++lane)
    {
        res.s[lane]     = (int)(a.s[lane]);
        res.s[n + lane] = 0;
    }
    return res;
}
2533
2534 /** @brief Convert to float
2535
2536 Supported input type is cv::v_int32. */
template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a)
{
    // Convert each int lane to float.
    v_reg<float, n> res;
    for (int lane = 0; lane < n; ++lane)
        res.s[lane] = (float)a.s[lane];
    return res;
}
2544
2545 /** @brief Convert lower half to float
2546
2547 Supported input type is cv::v_float64. */
template<int n> inline v_reg<float, n*2> v_cvt_f32(const v_reg<double, n>& a)
{
    // Narrow each double lane to float; the upper half of the result is zeroed.
    v_reg<float, n*2> res;
    for (int lane = 0; lane < n; ++lane)
    {
        res.s[lane]     = (float)a.s[lane];
        res.s[n + lane] = 0;
    }
    return res;
}
2558
2559 /** @brief Convert to float
2560
2561 Supported input type is cv::v_float64. */
template<int n> inline v_reg<float, n*2> v_cvt_f32(const v_reg<double, n>& a, const v_reg<double, n>& b)
{
    // Narrow both double inputs to float; a fills the lower half, b the upper half.
    v_reg<float, n*2> res;
    for (int lane = 0; lane < n; ++lane)
    {
        res.s[lane]     = (float)a.s[lane];
        res.s[n + lane] = (float)b.s[lane];
    }
    return res;
}
2572
2573 /** @brief Convert lower half to double
2574
2575 Supported input type is cv::v_int32. */
v_cvt_f64(const v_reg<int,n> & a)2576 template<int n> CV_INLINE v_reg<double, n/2> v_cvt_f64(const v_reg<int, n>& a)
2577 {
2578 v_reg<double, (n/2)> c;
2579 for( int i = 0; i < (n/2); i++ )
2580 c.s[i] = (double)a.s[i];
2581 return c;
2582 }
2583
2584 /** @brief Convert to double high part of vector
2585
2586 Supported input type is cv::v_int32. */
v_cvt_f64_high(const v_reg<int,n> & a)2587 template<int n> CV_INLINE v_reg<double, (n/2)> v_cvt_f64_high(const v_reg<int, n>& a)
2588 {
2589 v_reg<double, (n/2)> c;
2590 for( int i = 0; i < (n/2); i++ )
2591 c.s[i] = (double)a.s[i + (n/2)];
2592 return c;
2593 }
2594
2595 /** @brief Convert lower half to double
2596
2597 Supported input type is cv::v_float32. */
v_cvt_f64(const v_reg<float,n> & a)2598 template<int n> CV_INLINE v_reg<double, (n/2)> v_cvt_f64(const v_reg<float, n>& a)
2599 {
2600 v_reg<double, (n/2)> c;
2601 for( int i = 0; i < (n/2); i++ )
2602 c.s[i] = (double)a.s[i];
2603 return c;
2604 }
2605
2606 /** @brief Convert to double high part of vector
2607
2608 Supported input type is cv::v_float32. */
v_cvt_f64_high(const v_reg<float,n> & a)2609 template<int n> CV_INLINE v_reg<double, (n/2)> v_cvt_f64_high(const v_reg<float, n>& a)
2610 {
2611 v_reg<double, (n/2)> c;
2612 for( int i = 0; i < (n/2); i++ )
2613 c.s[i] = (double)a.s[i + (n/2)];
2614 return c;
2615 }
2616
2617 /** @brief Convert to double
2618
2619 Supported input type is cv::v_int64. */
v_cvt_f64(const v_reg<int64,n> & a)2620 template<int n> CV_INLINE v_reg<double, n> v_cvt_f64(const v_reg<int64, n>& a)
2621 {
2622 v_reg<double, n> c;
2623 for( int i = 0; i < n; i++ )
2624 c.s[i] = (double)a.s[i];
2625 return c;
2626 }
2627
2628
v_lut(const _Tp * tab,const int * idx)2629 template<typename _Tp> inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_lut(const _Tp* tab, const int* idx)
2630 {
2631 v_reg<_Tp, simd128_width / sizeof(_Tp)> c;
2632 for (int i = 0; i < c.nlanes; i++)
2633 c.s[i] = tab[idx[i]];
2634 return c;
2635 }
v_lut_pairs(const _Tp * tab,const int * idx)2636 template<typename _Tp> inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_lut_pairs(const _Tp* tab, const int* idx)
2637 {
2638 v_reg<_Tp, simd128_width / sizeof(_Tp)> c;
2639 for (int i = 0; i < c.nlanes; i++)
2640 c.s[i] = tab[idx[i / 2] + i % 2];
2641 return c;
2642 }
v_lut_quads(const _Tp * tab,const int * idx)2643 template<typename _Tp> inline v_reg<_Tp, simd128_width / sizeof(_Tp)> v_lut_quads(const _Tp* tab, const int* idx)
2644 {
2645 v_reg<_Tp, simd128_width / sizeof(_Tp)> c;
2646 for (int i = 0; i < c.nlanes; i++)
2647 c.s[i] = tab[idx[i / 4] + i % 4];
2648 return c;
2649 }
2650
template<int n> inline v_reg<int, n> v_lut(const int* tab, const v_reg<int, n>& idx)
{
    // Gather per lane using vector indices: res[i] = tab[idx[i]].
    v_reg<int, n> res;
    for (int lane = 0; lane < n; ++lane)
        res.s[lane] = tab[idx.s[lane]];
    return res;
}
2658
template<int n> inline v_reg<unsigned, n> v_lut(const unsigned* tab, const v_reg<int, n>& idx)
{
    // Gather per lane using vector indices: c[i] = tab[idx[i]].
    // Fix: the accumulator was declared as v_reg<int, n>, which does not match
    // the v_reg<unsigned, n> return type (v_reg has no such conversion), so the
    // template failed to compile once instantiated. Use the unsigned register.
    v_reg<unsigned, n> c;
    for (int i = 0; i < n; i++)
        c.s[i] = tab[idx.s[i]];
    return c;
}
2666
template<int n> inline v_reg<float, n> v_lut(const float* tab, const v_reg<int, n>& idx)
{
    // Gather per lane using vector indices: res[i] = tab[idx[i]].
    v_reg<float, n> res;
    for (int lane = 0; lane < n; ++lane)
        res.s[lane] = tab[idx.s[lane]];
    return res;
}
2674
template<int n> inline v_reg<double, n/2> v_lut(const double* tab, const v_reg<int, n>& idx)
{
    // Gather doubles for the half-width register; only the first n/2 indices are used.
    v_reg<double, n/2> res;
    const int half = n / 2;
    for (int lane = 0; lane < half; ++lane)
        res.s[lane] = tab[idx.s[lane]];
    return res;
}
2682
2683
v_lut_deinterleave(const float * tab,const v_reg<int,n> & idx,v_reg<float,n> & x,v_reg<float,n> & y)2684 template<int n> inline void v_lut_deinterleave(const float* tab, const v_reg<int, n>& idx,
2685 v_reg<float, n>& x, v_reg<float, n>& y)
2686 {
2687 for( int i = 0; i < n; i++ )
2688 {
2689 int j = idx.s[i];
2690 x.s[i] = tab[j];
2691 y.s[i] = tab[j+1];
2692 }
2693 }
2694
v_lut_deinterleave(const double * tab,const v_reg<int,n * 2> & idx,v_reg<double,n> & x,v_reg<double,n> & y)2695 template<int n> inline void v_lut_deinterleave(const double* tab, const v_reg<int, n*2>& idx,
2696 v_reg<double, n>& x, v_reg<double, n>& y)
2697 {
2698 for( int i = 0; i < n; i++ )
2699 {
2700 int j = idx.s[i];
2701 x.s[i] = tab[j];
2702 y.s[i] = tab[j+1];
2703 }
2704 }
2705
template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_pairs(const v_reg<_Tp, n>& vec)
{
    // Within every group of 4 lanes, swap the middle two: {a b c d} -> {a c b d}.
    // When n < 4 the loop runs zero times and the result lanes stay unspecified,
    // matching the original behavior.
    v_reg<_Tp, n> res;
    const int groups = n / 4;
    for (int g = 0; g < groups; ++g)
    {
        const int base = 4 * g;
        res.s[base    ] = vec.s[base    ];
        res.s[base + 1] = vec.s[base + 2];
        res.s[base + 2] = vec.s[base + 1];
        res.s[base + 3] = vec.s[base + 3];
    }
    return res;
}
2718
template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_quads(const v_reg<_Tp, n>& vec)
{
    // Within every group of 8 lanes, interleave the two quads:
    // {a0 a1 a2 a3 b0 b1 b2 b3} -> {a0 b0 a1 b1 a2 b2 a3 b3}.
    v_reg<_Tp, n> res;
    const int groups = n / 8;
    for (int g = 0; g < groups; ++g)
    {
        const int base = 8 * g;
        res.s[base    ] = vec.s[base    ];
        res.s[base + 1] = vec.s[base + 4];
        res.s[base + 2] = vec.s[base + 1];
        res.s[base + 3] = vec.s[base + 5];
        res.s[base + 4] = vec.s[base + 2];
        res.s[base + 5] = vec.s[base + 6];
        res.s[base + 6] = vec.s[base + 3];
        res.s[base + 7] = vec.s[base + 7];
    }
    return res;
}
2735
template<typename _Tp, int n> inline v_reg<_Tp, n> v_pack_triplets(const v_reg<_Tp, n>& vec)
{
    // Compact each 4-lane group {a b c d} into a 3-lane triplet {a b c};
    // trailing lanes of the result stay unspecified.
    v_reg<_Tp, n> res;
    const int groups = n / 4;
    for (int g = 0; g < groups; ++g)
    {
        res.s[3*g    ] = vec.s[4*g    ];
        res.s[3*g + 1] = vec.s[4*g + 1];
        res.s[3*g + 2] = vec.s[4*g + 2];
    }
    return res;
}
2747
2748 /** @brief Transpose 4x4 matrix
2749
2750 Scheme:
2751 @code
2752 a0 {A1 A2 A3 A4}
2753 a1 {B1 B2 B3 B4}
2754 a2 {C1 C2 C3 C4}
2755 a3 {D1 D2 D3 D4}
2756 ===============
2757 b0 {A1 B1 C1 D1}
2758 b1 {A2 B2 C2 D2}
2759 b2 {A3 B3 C3 D3}
2760 b3 {A4 B4 C4 D4}
2761 @endcode
2762 */
2763 template<typename _Tp, int n>
v_transpose4x4(v_reg<_Tp,n> & a0,const v_reg<_Tp,n> & a1,const v_reg<_Tp,n> & a2,const v_reg<_Tp,n> & a3,v_reg<_Tp,n> & b0,v_reg<_Tp,n> & b1,v_reg<_Tp,n> & b2,v_reg<_Tp,n> & b3)2764 inline void v_transpose4x4( v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
2765 const v_reg<_Tp, n>& a2, const v_reg<_Tp, n>& a3,
2766 v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1,
2767 v_reg<_Tp, n>& b2, v_reg<_Tp, n>& b3 )
2768 {
2769 for (int i = 0; i < n / 4; i++)
2770 {
2771 b0.s[0 + i*4] = a0.s[0 + i*4]; b0.s[1 + i*4] = a1.s[0 + i*4];
2772 b0.s[2 + i*4] = a2.s[0 + i*4]; b0.s[3 + i*4] = a3.s[0 + i*4];
2773 b1.s[0 + i*4] = a0.s[1 + i*4]; b1.s[1 + i*4] = a1.s[1 + i*4];
2774 b1.s[2 + i*4] = a2.s[1 + i*4]; b1.s[3 + i*4] = a3.s[1 + i*4];
2775 b2.s[0 + i*4] = a0.s[2 + i*4]; b2.s[1 + i*4] = a1.s[2 + i*4];
2776 b2.s[2 + i*4] = a2.s[2 + i*4]; b2.s[3 + i*4] = a3.s[2 + i*4];
2777 b3.s[0 + i*4] = a0.s[3 + i*4]; b3.s[1 + i*4] = a1.s[3 + i*4];
2778 b3.s[2 + i*4] = a2.s[3 + i*4]; b3.s[3 + i*4] = a3.s[3 + i*4];
2779 }
2780 }
2781
2782 //! @brief Helper macro
2783 //! @ingroup core_hal_intrin_impl
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
// Defines prefix##_setzero_##suffix(): returns a vector with all lanes zero,
// delegating to the register type's static zero() factory.
#define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, prefix, suffix) \
inline _Tpvec prefix##_setzero_##suffix() { return _Tpvec::zero(); }

//! @name Init with zero
//! @{
//! @brief Create new vector with zero elements
// 128-bit registers (v_setzero_*)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x16, v, u8)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x16, v, s8)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x8, v, u16)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x8, v, s16)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x4, v, u32)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x4, v, s32)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x4, v, f32)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x2, v, f64)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x2, v, u64)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x2, v, s64)

#if CV_SIMD256
// 256-bit registers (v256_setzero_*)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x32, v256, u8)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x32, v256, s8)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x16, v256, u16)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x16, v256, s16)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x8, v256, u32)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x8, v256, s32)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x8, v256, f32)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x4, v256, f64)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x4, v256, u64)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x4, v256, s64)
#endif

#if CV_SIMD512
// 512-bit registers (v512_setzero_*)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint8x64, v512, u8)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int8x64, v512, s8)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint16x32, v512, u16)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int16x32, v512, s16)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint32x16, v512, u32)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int32x16, v512, s32)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_float32x16, v512, f32)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_float64x8, v512, f64)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_uint64x8, v512, u64)
OPENCV_HAL_IMPL_C_INIT_ZERO(v_int64x8, v512, s64)
#endif
//! @}
2827
2828 //! @brief Helper macro
2829 //! @ingroup core_hal_intrin_impl
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
// Defines prefix##_setall_##suffix(val): broadcasts a scalar into every lane
// via the register type's static all() factory.
#define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, prefix, suffix) \
inline _Tpvec prefix##_setall_##suffix(_Tp val) { return _Tpvec::all(val); }

//! @name Init with value
//! @{
//! @brief Create new vector with elements set to a specific value
// 128-bit registers (v_setall_*)
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x16, uchar, v, u8)
OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x16, schar, v, s8)
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x8, ushort, v, u16)
OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x8, short, v, s16)
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x4, unsigned, v, u32)
OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x4, int, v, s32)
OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x4, float, v, f32)
OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x2, double, v, f64)
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x2, uint64, v, u64)
OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x2, int64, v, s64)

#if CV_SIMD256
// 256-bit registers (v256_setall_*)
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x32, uchar, v256, u8)
OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x32, schar, v256, s8)
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x16, ushort, v256, u16)
OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x16, short, v256, s16)
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x8, unsigned, v256, u32)
OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x8, int, v256, s32)
OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x8, float, v256, f32)
OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x4, double, v256, f64)
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x4, uint64, v256, u64)
OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x4, int64, v256, s64)
#endif

#if CV_SIMD512
// 512-bit registers (v512_setall_*)
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint8x64, uchar, v512, u8)
OPENCV_HAL_IMPL_C_INIT_VAL(v_int8x64, schar, v512, s8)
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint16x32, ushort, v512, u16)
OPENCV_HAL_IMPL_C_INIT_VAL(v_int16x32, short, v512, s16)
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint32x16, unsigned, v512, u32)
OPENCV_HAL_IMPL_C_INIT_VAL(v_int32x16, int, v512, s32)
OPENCV_HAL_IMPL_C_INIT_VAL(v_float32x16, float, v512, f32)
OPENCV_HAL_IMPL_C_INIT_VAL(v_float64x8, double, v512, f64)
OPENCV_HAL_IMPL_C_INIT_VAL(v_uint64x8, uint64, v512, u64)
OPENCV_HAL_IMPL_C_INIT_VAL(v_int64x8, int64, v512, s64)
#endif
//! @}
2873
2874 //! @brief Helper macro
2875 //! @ingroup core_hal_intrin_impl
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
// Defines v_reinterpret_as_##suffix(a): reinterprets the register's bytes as
// a different lane type; the lane count is rescaled by the element-size ratio
// so the total register width is preserved.
#define OPENCV_HAL_IMPL_C_REINTERPRET(_Tp, suffix) \
template<typename _Tp0, int n0> inline v_reg<_Tp, n0*sizeof(_Tp0)/sizeof(_Tp)> \
v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \
{ return a.template reinterpret_as<_Tp, n0*sizeof(_Tp0)/sizeof(_Tp)>(); }

//! @name Reinterpret
//! @{
//! @brief Convert vector to different type without modifying underlying data.
OPENCV_HAL_IMPL_C_REINTERPRET(uchar, u8)
OPENCV_HAL_IMPL_C_REINTERPRET(schar, s8)
OPENCV_HAL_IMPL_C_REINTERPRET(ushort, u16)
OPENCV_HAL_IMPL_C_REINTERPRET(short, s16)
OPENCV_HAL_IMPL_C_REINTERPRET(unsigned, u32)
OPENCV_HAL_IMPL_C_REINTERPRET(int, s32)
OPENCV_HAL_IMPL_C_REINTERPRET(float, f32)
OPENCV_HAL_IMPL_C_REINTERPRET(double, f64)
OPENCV_HAL_IMPL_C_REINTERPRET(uint64, u64)
OPENCV_HAL_IMPL_C_REINTERPRET(int64, s64)
//! @}
2895
2896 //! @brief Helper macro
2897 //! @ingroup core_hal_intrin_impl
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
// Defines v_shl<shift>(a): element-wise left shift by a compile-time amount,
// delegating to the vector operator<<. Instantiated for 16/32/64-bit
// integer lane types only.
#define OPENCV_HAL_IMPL_C_SHIFTL(_Tp) \
template<int shift, int n> inline v_reg<_Tp, n> v_shl(const v_reg<_Tp, n>& a) \
{ return a << shift; }

//! @name Left shift
//! @{
//! @brief Shift left
OPENCV_HAL_IMPL_C_SHIFTL(ushort)
OPENCV_HAL_IMPL_C_SHIFTL(short)
OPENCV_HAL_IMPL_C_SHIFTL(unsigned)
OPENCV_HAL_IMPL_C_SHIFTL(int)
OPENCV_HAL_IMPL_C_SHIFTL(uint64)
OPENCV_HAL_IMPL_C_SHIFTL(int64)
//! @}
2912
2913 //! @brief Helper macro
2914 //! @ingroup core_hal_intrin_impl
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
// Defines v_shr<shift>(a): element-wise right shift by a compile-time amount,
// delegating to the vector operator>>. Instantiated for 16/32/64-bit
// integer lane types only.
#define OPENCV_HAL_IMPL_C_SHIFTR(_Tp) \
template<int shift, int n> inline v_reg<_Tp, n> v_shr(const v_reg<_Tp, n>& a) \
{ return a >> shift; }

//! @name Right shift
//! @{
//! @brief Shift right
OPENCV_HAL_IMPL_C_SHIFTR(ushort)
OPENCV_HAL_IMPL_C_SHIFTR(short)
OPENCV_HAL_IMPL_C_SHIFTR(unsigned)
OPENCV_HAL_IMPL_C_SHIFTR(int)
OPENCV_HAL_IMPL_C_SHIFTR(uint64)
OPENCV_HAL_IMPL_C_SHIFTR(int64)
//! @}
2929
2930 //! @brief Helper macro
2931 //! @ingroup core_hal_intrin_impl
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
// Defines v_rshr<shift>(a): element-wise rounding right shift — adds
// 1 << (shift-1) before shifting so the result rounds to nearest.
// Requires shift >= 1 (the rounding addend is ill-formed for shift == 0).
#define OPENCV_HAL_IMPL_C_RSHIFTR(_Tp) \
template<int shift, int n> inline v_reg<_Tp, n> v_rshr(const v_reg<_Tp, n>& a) \
{ \
    v_reg<_Tp, n> c; \
    for( int i = 0; i < n; i++ ) \
        c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (shift - 1))) >> shift); \
    return c; \
}

//! @name Rounding shift
//! @{
//! @brief Rounding shift right
OPENCV_HAL_IMPL_C_RSHIFTR(ushort)
OPENCV_HAL_IMPL_C_RSHIFTR(short)
OPENCV_HAL_IMPL_C_RSHIFTR(unsigned)
OPENCV_HAL_IMPL_C_RSHIFTR(int)
OPENCV_HAL_IMPL_C_RSHIFTR(uint64)
OPENCV_HAL_IMPL_C_RSHIFTR(int64)
//! @}
2951
2952 //! @brief Helper macro
2953 //! @ingroup core_hal_intrin_impl
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
// Defines v_##pack_suffix(a, b): narrows two n-lane vectors into one 2n-lane
// vector of the narrower type; a fills lanes [0, n), b fills lanes [n, 2n).
// 'cast' is saturate_cast (clamping) or static_cast (truncating).
#define OPENCV_HAL_IMPL_C_PACK(_Tp, _Tpn, pack_suffix, cast) \
template<int n> inline v_reg<_Tpn, 2*n> v_##pack_suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tpn, 2*n> c; \
    for( int i = 0; i < n; i++ ) \
    { \
        c.s[i] = cast<_Tpn>(a.s[i]); \
        c.s[i+n] = cast<_Tpn>(b.s[i]); \
    } \
    return c; \
}

//! @name Pack
//! @{
//! @brief Pack values from two vectors to one
//!
//! Return vector type have twice more elements than input vector types. Variant with _u_ suffix also
//! converts to corresponding unsigned type.
//!
//! - pack: for 16-, 32- and 64-bit integer input types
//! - pack_u: for 16- and 32-bit signed integer input types
//!
//! @note All variants except 64-bit use saturation.
OPENCV_HAL_IMPL_C_PACK(ushort, uchar, pack, saturate_cast)
OPENCV_HAL_IMPL_C_PACK(short, schar, pack, saturate_cast)
OPENCV_HAL_IMPL_C_PACK(unsigned, ushort, pack, saturate_cast)
OPENCV_HAL_IMPL_C_PACK(int, short, pack, saturate_cast)
OPENCV_HAL_IMPL_C_PACK(uint64, unsigned, pack, static_cast)
OPENCV_HAL_IMPL_C_PACK(int64, int, pack, static_cast)
OPENCV_HAL_IMPL_C_PACK(short, uchar, pack_u, saturate_cast)
OPENCV_HAL_IMPL_C_PACK(int, ushort, pack_u, saturate_cast)
//! @}
2986
2987 //! @brief Helper macro
2988 //! @ingroup core_hal_intrin_impl
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
// Defines v_rshr_##pack_suffix<shift>(a, b): rounding right shift by 'shift'
// bits (adds 1 << (shift-1) first), then narrowing cast; a fills lanes
// [0, n), b fills lanes [n, 2n). Requires shift >= 1.
#define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tp, _Tpn, pack_suffix, cast) \
template<int shift, int n> inline v_reg<_Tpn, 2*n> v_rshr_##pack_suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tpn, 2*n> c; \
    for( int i = 0; i < n; i++ ) \
    { \
        c.s[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (shift - 1))) >> shift); \
        c.s[i+n] = cast<_Tpn>((b.s[i] + ((_Tp)1 << (shift - 1))) >> shift); \
    } \
    return c; \
}

//! @name Pack with rounding shift
//! @{
//! @brief Pack values from two vectors to one with rounding shift
//!
//! Values from the input vectors will be shifted right by _n_ bits with rounding, converted to narrower
//! type and returned in the result vector. Variant with _u_ suffix converts to unsigned type.
//!
//! - pack: for 16-, 32- and 64-bit integer input types
//! - pack_u: for 16- and 32-bit signed integer input types
//!
//! @note All variants except 64-bit use saturation.
OPENCV_HAL_IMPL_C_RSHR_PACK(ushort, uchar, pack, saturate_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK(short, schar, pack, saturate_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK(unsigned, ushort, pack, saturate_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK(int, short, pack, saturate_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK(uint64, unsigned, pack, static_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK(int64, int, pack, static_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK(short, uchar, pack_u, saturate_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK(int, ushort, pack_u, saturate_cast)
//! @}
3021
3022 //! @brief Helper macro
3023 //! @ingroup core_hal_intrin_impl
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
// Defines v_##pack_suffix##_store(ptr, a): narrows each lane of a and stores
// the n converted values contiguously at ptr.
#define OPENCV_HAL_IMPL_C_PACK_STORE(_Tp, _Tpn, pack_suffix, cast) \
template<int n> inline void v_##pack_suffix##_store(_Tpn* ptr, const v_reg<_Tp, n>& a) \
{ \
    for( int i = 0; i < n; i++ ) \
        ptr[i] = cast<_Tpn>(a.s[i]); \
}

//! @name Pack and store
//! @{
//! @brief Store values from the input vector into memory with pack
//!
//! Values will be stored into memory with conversion to narrower type.
//! Variant with _u_ suffix converts to corresponding unsigned type.
//!
//! - pack: for 16-, 32- and 64-bit integer input types
//! - pack_u: for 16- and 32-bit signed integer input types
//!
//! @note All variants except 64-bit use saturation.
OPENCV_HAL_IMPL_C_PACK_STORE(ushort, uchar, pack, saturate_cast)
OPENCV_HAL_IMPL_C_PACK_STORE(short, schar, pack, saturate_cast)
OPENCV_HAL_IMPL_C_PACK_STORE(unsigned, ushort, pack, saturate_cast)
OPENCV_HAL_IMPL_C_PACK_STORE(int, short, pack, saturate_cast)
OPENCV_HAL_IMPL_C_PACK_STORE(uint64, unsigned, pack, static_cast)
OPENCV_HAL_IMPL_C_PACK_STORE(int64, int, pack, static_cast)
OPENCV_HAL_IMPL_C_PACK_STORE(short, uchar, pack_u, saturate_cast)
OPENCV_HAL_IMPL_C_PACK_STORE(int, ushort, pack_u, saturate_cast)
//! @}
3051
3052 //! @brief Helper macro
3053 //! @ingroup core_hal_intrin_impl
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
// Defines v_rshr_##pack_suffix##_store<shift>(ptr, a): rounding right shift
// (adds 1 << (shift-1) first), narrowing cast, then contiguous store of the
// n converted values at ptr. Requires shift >= 1.
#define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tp, _Tpn, pack_suffix, cast) \
template<int shift, int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const v_reg<_Tp, n>& a) \
{ \
    for( int i = 0; i < n; i++ ) \
        ptr[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (shift - 1))) >> shift); \
}

//! @name Pack and store with rounding shift
//! @{
//! @brief Store values from the input vector into memory with pack
//!
//! Values will be shifted _n_ bits right with rounding, converted to narrower type and stored into
//! memory. Variant with _u_ suffix converts to unsigned type.
//!
//! - pack: for 16-, 32- and 64-bit integer input types
//! - pack_u: for 16- and 32-bit signed integer input types
//!
//! @note All variants except 64-bit use saturation.
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(ushort, uchar, pack, saturate_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(short, schar, pack, saturate_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(unsigned, ushort, pack, saturate_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(int, short, pack, saturate_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(uint64, unsigned, pack, static_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(int64, int, pack, static_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(short, uchar, pack_u, saturate_cast)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(int, ushort, pack_u, saturate_cast)
//! @}
3081
3082 //! @cond IGNORED
3083 template<typename _Tpm, typename _Tp, int n>
3084 inline void _pack_b(_Tpm* mptr, const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
3085 {
3086 for (int i = 0; i < n; ++i)
3087 {
3088 mptr[i] = (_Tpm)a.s[i];
3089 mptr[i + n] = (_Tpm)b.s[i];
3090 }
3091 }
3092 //! @endcond
3093
3094 //! @name Pack boolean values
3095 //! @{
3096 //! @brief Pack boolean values from multiple vectors to one unsigned 8-bit integer vector
3097 //!
3098 //! @note Must provide valid boolean values to guarantee same result for all architectures.
3099
/** @brief For 16-bit boolean values
3102
3103 Scheme:
3104 @code
3105 a {0xFFFF 0 0 0xFFFF 0 0xFFFF 0xFFFF 0}
3106 b {0xFFFF 0 0xFFFF 0 0 0xFFFF 0 0xFFFF}
3107 ===============
3108 {
3109 0xFF 0 0 0xFF 0 0xFF 0xFF 0
3110 0xFF 0 0xFF 0 0 0xFF 0 0xFF
3111 }
3112 @endcode */
3113
template<int n> inline v_reg<uchar, 2*n> v_pack_b(const v_reg<ushort, n>& a, const v_reg<ushort, n>& b)
{
    // Truncate each 16-bit mask lane to 8 bits; a fills the low half of the
    // result, b the high half.
    v_reg<uchar, 2*n> mask;
    for (int i = 0; i < n; ++i)
    {
        mask.s[i]     = (uchar)a.s[i];
        mask.s[i + n] = (uchar)b.s[i];
    }
    return mask;
}
3120
3121 /** @overload
3122 For 32-bit boolean values
3123
3124 Scheme:
3125 @code
3126 a {0xFFFF.. 0 0 0xFFFF..}
3127 b {0 0xFFFF.. 0xFFFF.. 0}
3128 c {0xFFFF.. 0 0xFFFF.. 0}
3129 d {0 0xFFFF.. 0 0xFFFF..}
3130 ===============
3131 {
3132 0xFF 0 0 0xFF 0 0xFF 0xFF 0
3133 0xFF 0 0xFF 0 0 0xFF 0 0xFF
3134 }
3135 @endcode */
3136
template<int n> inline v_reg<uchar, 4*n> v_pack_b(const v_reg<unsigned, n>& a, const v_reg<unsigned, n>& b,
                                                  const v_reg<unsigned, n>& c, const v_reg<unsigned, n>& d)
{
    // Truncate each 32-bit mask lane to 8 bits; the four inputs are
    // concatenated in argument order.
    v_reg<uchar, 4*n> mask;
    for (int i = 0; i < n; ++i)
    {
        mask.s[i]         = (uchar)a.s[i];
        mask.s[i + n]     = (uchar)b.s[i];
        mask.s[i + 2*n]   = (uchar)c.s[i];
        mask.s[i + 3*n]   = (uchar)d.s[i];
    }
    return mask;
}
3145
3146 /** @overload
3147 For 64-bit boolean values
3148
3149 Scheme:
3150 @code
3151 a {0xFFFF.. 0}
3152 b {0 0xFFFF..}
3153 c {0xFFFF.. 0}
3154 d {0 0xFFFF..}
3155
3156 e {0xFFFF.. 0}
3157 f {0xFFFF.. 0}
3158 g {0 0xFFFF..}
3159 h {0 0xFFFF..}
3160 ===============
3161 {
3162 0xFF 0 0 0xFF 0xFF 0 0 0xFF
3163 0xFF 0 0xFF 0 0 0xFF 0 0xFF
3164 }
3165 @endcode */
template<int n> inline v_reg<uchar, 8*n> v_pack_b(const v_reg<uint64, n>& a, const v_reg<uint64, n>& b,
                                                  const v_reg<uint64, n>& c, const v_reg<uint64, n>& d,
                                                  const v_reg<uint64, n>& e, const v_reg<uint64, n>& f,
                                                  const v_reg<uint64, n>& g, const v_reg<uint64, n>& h)
{
    // Truncate each 64-bit mask lane to 8 bits; the eight inputs are
    // concatenated in argument order via a pointer table.
    const v_reg<uint64, n>* parts[8] = { &a, &b, &c, &d, &e, &f, &g, &h };
    v_reg<uchar, 8*n> mask;
    for (int p = 0; p < 8; ++p)
        for (int i = 0; i < n; ++i)
            mask.s[p*n + i] = (uchar)parts[p]->s[i];
    return mask;
}
3178 //! @}
3179
3180 /** @brief Matrix multiplication
3181
3182 Scheme:
3183 @code
3184 {A0 A1 A2 A3} |V0|
3185 {B0 B1 B2 B3} |V1|
3186 {C0 C1 C2 C3} |V2|
3187 {D0 D1 D2 D3} x |V3|
3188 ====================
3189 {R0 R1 R2 R3}, where:
3190 R0 = A0V0 + B0V1 + C0V2 + D0V3,
3191 R1 = A1V0 + B1V1 + C1V2 + D1V3
3192 ...
3193 @endcode
3194 */
3195 template<int n>
v_matmul(const v_reg<float,n> & v,const v_reg<float,n> & a,const v_reg<float,n> & b,const v_reg<float,n> & c,const v_reg<float,n> & d)3196 inline v_reg<float, n> v_matmul(const v_reg<float, n>& v,
3197 const v_reg<float, n>& a, const v_reg<float, n>& b,
3198 const v_reg<float, n>& c, const v_reg<float, n>& d)
3199 {
3200 v_reg<float, n> res;
3201 for (int i = 0; i < n / 4; i++)
3202 {
3203 res.s[0 + i*4] = v.s[0 + i*4] * a.s[0 + i*4] + v.s[1 + i*4] * b.s[0 + i*4] + v.s[2 + i*4] * c.s[0 + i*4] + v.s[3 + i*4] * d.s[0 + i*4];
3204 res.s[1 + i*4] = v.s[0 + i*4] * a.s[1 + i*4] + v.s[1 + i*4] * b.s[1 + i*4] + v.s[2 + i*4] * c.s[1 + i*4] + v.s[3 + i*4] * d.s[1 + i*4];
3205 res.s[2 + i*4] = v.s[0 + i*4] * a.s[2 + i*4] + v.s[1 + i*4] * b.s[2 + i*4] + v.s[2 + i*4] * c.s[2 + i*4] + v.s[3 + i*4] * d.s[2 + i*4];
3206 res.s[3 + i*4] = v.s[0 + i*4] * a.s[3 + i*4] + v.s[1 + i*4] * b.s[3 + i*4] + v.s[2 + i*4] * c.s[3 + i*4] + v.s[3 + i*4] * d.s[3 + i*4];
3207 }
3208 return res;
3209 }
3210
3211 /** @brief Matrix multiplication and add
3212
3213 Scheme:
3214 @code
3215 {A0 A1 A2 A3} |V0| |D0|
3216 {B0 B1 B2 B3} |V1| |D1|
3217 {C0 C1 C2 C3} x |V2| + |D2|
3218 ==================== |D3|
3219 {R0 R1 R2 R3}, where:
3220 R0 = A0V0 + B0V1 + C0V2 + D0,
3221 R1 = A1V0 + B1V1 + C1V2 + D1
3222 ...
3223 @endcode
3224 */
3225 template<int n>
v_matmuladd(const v_reg<float,n> & v,const v_reg<float,n> & a,const v_reg<float,n> & b,const v_reg<float,n> & c,const v_reg<float,n> & d)3226 inline v_reg<float, n> v_matmuladd(const v_reg<float, n>& v,
3227 const v_reg<float, n>& a, const v_reg<float, n>& b,
3228 const v_reg<float, n>& c, const v_reg<float, n>& d)
3229 {
3230 v_reg<float, n> res;
3231 for (int i = 0; i < n / 4; i++)
3232 {
3233 res.s[0 + i * 4] = v.s[0 + i * 4] * a.s[0 + i * 4] + v.s[1 + i * 4] * b.s[0 + i * 4] + v.s[2 + i * 4] * c.s[0 + i * 4] + d.s[0 + i * 4];
3234 res.s[1 + i * 4] = v.s[0 + i * 4] * a.s[1 + i * 4] + v.s[1 + i * 4] * b.s[1 + i * 4] + v.s[2 + i * 4] * c.s[1 + i * 4] + d.s[1 + i * 4];
3235 res.s[2 + i * 4] = v.s[0 + i * 4] * a.s[2 + i * 4] + v.s[1 + i * 4] * b.s[2 + i * 4] + v.s[2 + i * 4] * c.s[2 + i * 4] + d.s[2 + i * 4];
3236 res.s[3 + i * 4] = v.s[0 + i * 4] * a.s[3 + i * 4] + v.s[1 + i * 4] * b.s[3 + i * 4] + v.s[2 + i * 4] * c.s[3 + i * 4] + d.s[3 + i * 4];
3237 }
3238 return res;
3239 }
3240
3241
// Expanding dot product of s32 vectors computed in double precision:
// result[i] = (double)lo(a)[i]*lo(b)[i] + (double)hi(a)[i]*hi(b)[i],
// where lo()/hi() denote the halves converted by v_cvt_f64 / v_cvt_f64_high.
template<int n> inline v_reg<double, n/2> v_dotprod_expand(const v_reg<int, n>& a, const v_reg<int, n>& b)
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); }
// Same as above, with accumulator c folded into the innermost FMA.
template<int n> inline v_reg<double, n/2> v_dotprod_expand(const v_reg<int, n>& a, const v_reg<int, n>& b,
                                                           const v_reg<double, n/2>& c)
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); }

// In this fallback the _fast variants are plain aliases of the exact
// versions above (SIMD backends may use a faster lane ordering).
template<int n> inline v_reg<double, n/2> v_dotprod_expand_fast(const v_reg<int, n>& a, const v_reg<int, n>& b)
{ return v_dotprod_expand(a, b); }
template<int n> inline v_reg<double, n/2> v_dotprod_expand_fast(const v_reg<int, n>& a, const v_reg<int, n>& b,
                                                                const v_reg<double, n/2>& c)
{ return v_dotprod_expand(a, b, c); }
3253
3254 ////// FP16 support ///////
3255
3256 inline v_reg<float, simd128_width / sizeof(float)>
v_load_expand(const float16_t * ptr)3257 v_load_expand(const float16_t* ptr)
3258 {
3259 v_reg<float, simd128_width / sizeof(float)> v;
3260 for( int i = 0; i < v.nlanes; i++ )
3261 {
3262 v.s[i] = ptr[i];
3263 }
3264 return v;
3265 }
3266 #if CV_SIMD256
3267 inline v_reg<float, simd256_width / sizeof(float)>
v256_load_expand(const float16_t * ptr)3268 v256_load_expand(const float16_t* ptr)
3269 {
3270 v_reg<float, simd256_width / sizeof(float)> v;
3271 for (int i = 0; i < v.nlanes; i++)
3272 {
3273 v.s[i] = ptr[i];
3274 }
3275 return v;
3276 }
3277 #endif
3278 #if CV_SIMD512
3279 inline v_reg<float, simd512_width / sizeof(float)>
v512_load_expand(const float16_t * ptr)3280 v512_load_expand(const float16_t* ptr)
3281 {
3282 v_reg<float, simd512_width / sizeof(float)> v;
3283 for (int i = 0; i < v.nlanes; i++)
3284 {
3285 v.s[i] = ptr[i];
3286 }
3287 return v;
3288 }
3289 #endif
3290
3291 template<int n> inline void
v_pack_store(float16_t * ptr,const v_reg<float,n> & v)3292 v_pack_store(float16_t* ptr, const v_reg<float, n>& v)
3293 {
3294 for( int i = 0; i < v.nlanes; i++ )
3295 {
3296 ptr[i] = float16_t(v.s[i]);
3297 }
3298 }
3299
// Cleanup hooks called after a stretch of vectorized code. No-ops in this
// scalar fallback; SIMD backends may reset vector state here.
inline void v_cleanup() {}
#if CV_SIMD256
inline void v256_cleanup() {}
#endif
#if CV_SIMD512
inline void v512_cleanup() {}
#endif
3307
3308 //! @}
3309
3310 #ifndef CV_DOXYGEN
3311 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
3312 #endif
3313 }
3314
3315 #if !defined(CV_DOXYGEN)
3316 #undef CV_SIMD256
3317 #undef CV_SIMD512
3318 #endif
3319
3320 #endif
3321