1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html.
4
5 #ifndef OPENCV_HAL_INTRIN_WASM_HPP
6 #define OPENCV_HAL_INTRIN_WASM_HPP
7
8 #include <limits>
9 #include <cstring>
10 #include <algorithm>
11 #include "opencv2/core/saturate.hpp"
12
13 #define CV_SIMD128 1
14 #define CV_SIMD128_64F 0 // Now all implementation of f64 use fallback, so disable it.
15 #define CV_SIMD128_FP16 0
16
17 namespace cv
18 {
19
20 //! @cond IGNORED
21
22 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
23
24 #if (__EMSCRIPTEN_major__ * 1000000 + __EMSCRIPTEN_minor__ * 1000 + __EMSCRIPTEN_tiny__) < (1038046)
25 // handle renames: https://github.com/emscripten-core/emscripten/pull/9440 (https://github.com/emscripten-core/emscripten/commit/755d5b46cb84d0aa120c10981b11d05646c29673)
26 #define wasm_i32x4_trunc_saturate_f32x4 wasm_trunc_saturate_i32x4_f32x4
27 #define wasm_u32x4_trunc_saturate_f32x4 wasm_trunc_saturate_u32x4_f32x4
28 #define wasm_i64x2_trunc_saturate_f64x2 wasm_trunc_saturate_i64x2_f64x2
29 #define wasm_u64x2_trunc_saturate_f64x2 wasm_trunc_saturate_u64x2_f64x2
30 #define wasm_f32x4_convert_i32x4 wasm_convert_f32x4_i32x4
31 #define wasm_f32x4_convert_u32x4 wasm_convert_f32x4_u32x4
32 #define wasm_f64x2_convert_i64x2 wasm_convert_f64x2_i64x2
33 #define wasm_f64x2_convert_u64x2 wasm_convert_f64x2_u64x2
34 #endif // COMPATIBILITY: <1.38.46
35
36 ///////// Types ///////////
37
38 struct v_uint8x16
39 {
40 typedef uchar lane_type;
41 typedef v128_t vector_type;
42 enum { nlanes = 16 };
43
v_uint8x16cv::v_uint8x1644 v_uint8x16() {}
v_uint8x16cv::v_uint8x1645 explicit v_uint8x16(v128_t v) : val(v) {}
v_uint8x16cv::v_uint8x1646 v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
47 uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
48 {
49 uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
50 val = wasm_v128_load(v);
51 }
52
get0cv::v_uint8x1653 uchar get0() const
54 {
55 return (uchar)wasm_i8x16_extract_lane(val, 0);
56 }
57
58 v128_t val;
59 };
60
61 struct v_int8x16
62 {
63 typedef schar lane_type;
64 typedef v128_t vector_type;
65 enum { nlanes = 16 };
66
v_int8x16cv::v_int8x1667 v_int8x16() {}
v_int8x16cv::v_int8x1668 explicit v_int8x16(v128_t v) : val(v) {}
v_int8x16cv::v_int8x1669 v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
70 schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
71 {
72 schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
73 val = wasm_v128_load(v);
74 }
75
get0cv::v_int8x1676 schar get0() const
77 {
78 return wasm_i8x16_extract_lane(val, 0);
79 }
80
81 v128_t val;
82 };
83
84 struct v_uint16x8
85 {
86 typedef ushort lane_type;
87 typedef v128_t vector_type;
88 enum { nlanes = 8 };
89
v_uint16x8cv::v_uint16x890 v_uint16x8() {}
v_uint16x8cv::v_uint16x891 explicit v_uint16x8(v128_t v) : val(v) {}
v_uint16x8cv::v_uint16x892 v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
93 {
94 ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
95 val = wasm_v128_load(v);
96 }
97
get0cv::v_uint16x898 ushort get0() const
99 {
100 return (ushort)wasm_i16x8_extract_lane(val, 0); // wasm_u16x8_extract_lane() unimplemented yet
101 }
102
103 v128_t val;
104 };
105
106 struct v_int16x8
107 {
108 typedef short lane_type;
109 typedef v128_t vector_type;
110 enum { nlanes = 8 };
111
v_int16x8cv::v_int16x8112 v_int16x8() {}
v_int16x8cv::v_int16x8113 explicit v_int16x8(v128_t v) : val(v) {}
v_int16x8cv::v_int16x8114 v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
115 {
116 short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
117 val = wasm_v128_load(v);
118 }
119
get0cv::v_int16x8120 short get0() const
121 {
122 return wasm_i16x8_extract_lane(val, 0);
123 }
124
125 v128_t val;
126 };
127
128 struct v_uint32x4
129 {
130 typedef unsigned lane_type;
131 typedef v128_t vector_type;
132 enum { nlanes = 4 };
133
v_uint32x4cv::v_uint32x4134 v_uint32x4() {}
v_uint32x4cv::v_uint32x4135 explicit v_uint32x4(v128_t v) : val(v) {}
v_uint32x4cv::v_uint32x4136 v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
137 {
138 unsigned v[] = {v0, v1, v2, v3};
139 val = wasm_v128_load(v);
140 }
141
get0cv::v_uint32x4142 unsigned get0() const
143 {
144 return (unsigned)wasm_i32x4_extract_lane(val, 0);
145 }
146
147 v128_t val;
148 };
149
150 struct v_int32x4
151 {
152 typedef int lane_type;
153 typedef v128_t vector_type;
154 enum { nlanes = 4 };
155
v_int32x4cv::v_int32x4156 v_int32x4() {}
v_int32x4cv::v_int32x4157 explicit v_int32x4(v128_t v) : val(v) {}
v_int32x4cv::v_int32x4158 v_int32x4(int v0, int v1, int v2, int v3)
159 {
160 int v[] = {v0, v1, v2, v3};
161 val = wasm_v128_load(v);
162 }
163
get0cv::v_int32x4164 int get0() const
165 {
166 return wasm_i32x4_extract_lane(val, 0);
167 }
168
169 v128_t val;
170 };
171
172 struct v_float32x4
173 {
174 typedef float lane_type;
175 typedef v128_t vector_type;
176 enum { nlanes = 4 };
177
v_float32x4cv::v_float32x4178 v_float32x4() {}
v_float32x4cv::v_float32x4179 explicit v_float32x4(v128_t v) : val(v) {}
v_float32x4cv::v_float32x4180 v_float32x4(float v0, float v1, float v2, float v3)
181 {
182 float v[] = {v0, v1, v2, v3};
183 val = wasm_v128_load(v);
184 }
185
get0cv::v_float32x4186 float get0() const
187 {
188 return wasm_f32x4_extract_lane(val, 0);
189 }
190
191 v128_t val;
192 };
193
194 struct v_uint64x2
195 {
196 typedef uint64 lane_type;
197 typedef v128_t vector_type;
198 enum { nlanes = 2 };
199
v_uint64x2cv::v_uint64x2200 v_uint64x2() {}
v_uint64x2cv::v_uint64x2201 explicit v_uint64x2(v128_t v) : val(v) {}
v_uint64x2cv::v_uint64x2202 v_uint64x2(uint64 v0, uint64 v1)
203 {
204 uint64 v[] = {v0, v1};
205 val = wasm_v128_load(v);
206 }
207
get0cv::v_uint64x2208 uint64 get0() const
209 {
210 return (uint64)wasm_i64x2_extract_lane(val, 0);
211 }
212
213 v128_t val;
214 };
215
216 struct v_int64x2
217 {
218 typedef int64 lane_type;
219 typedef v128_t vector_type;
220 enum { nlanes = 2 };
221
v_int64x2cv::v_int64x2222 v_int64x2() {}
v_int64x2cv::v_int64x2223 explicit v_int64x2(v128_t v) : val(v) {}
v_int64x2cv::v_int64x2224 v_int64x2(int64 v0, int64 v1)
225 {
226 int64 v[] = {v0, v1};
227 val = wasm_v128_load(v);
228 }
229
get0cv::v_int64x2230 int64 get0() const
231 {
232 return wasm_i64x2_extract_lane(val, 0);
233 }
234
235 v128_t val;
236 };
237
238 struct v_float64x2
239 {
240 typedef double lane_type;
241 typedef v128_t vector_type;
242 enum { nlanes = 2 };
243
v_float64x2cv::v_float64x2244 v_float64x2() {}
v_float64x2cv::v_float64x2245 explicit v_float64x2(v128_t v) : val(v) {}
v_float64x2cv::v_float64x2246 v_float64x2(double v0, double v1)
247 {
248 double v[] = {v0, v1};
249 val = wasm_v128_load(v);
250 }
251
get0cv::v_float64x2252 double get0() const
253 {
254 return wasm_f64x2_extract_lane(val, 0);
255 }
256
257 v128_t val;
258 };
259
260 namespace
261 {
262 #define OPENCV_HAL_IMPL_REINTERPRET_INT(ft, tt) \
263 inline tt reinterpret_int(ft x) { union { ft l; tt i; } v; v.l = x; return v.i; }
264 OPENCV_HAL_IMPL_REINTERPRET_INT(uchar, schar)
265 OPENCV_HAL_IMPL_REINTERPRET_INT(schar, schar)
266 OPENCV_HAL_IMPL_REINTERPRET_INT(ushort, short)
267 OPENCV_HAL_IMPL_REINTERPRET_INT(short, short)
268 OPENCV_HAL_IMPL_REINTERPRET_INT(unsigned, int)
269 OPENCV_HAL_IMPL_REINTERPRET_INT(int, int)
270 OPENCV_HAL_IMPL_REINTERPRET_INT(float, int)
271 OPENCV_HAL_IMPL_REINTERPRET_INT(uint64, int64)
272 OPENCV_HAL_IMPL_REINTERPRET_INT(int64, int64)
273 OPENCV_HAL_IMPL_REINTERPRET_INT(double, int64)
274
275 static const unsigned char popCountTable[] =
276 {
277 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
278 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
279 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
280 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
281 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
282 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
283 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
284 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
285 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
286 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
287 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
288 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
289 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
290 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
291 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
292 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
293 };
294 } // namespace
295
wasm_unpacklo_i8x16(v128_t a,v128_t b)296 static v128_t wasm_unpacklo_i8x16(v128_t a, v128_t b) {
297 return wasm_v8x16_shuffle(a, b, 0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23);
298 }
299
wasm_unpacklo_i16x8(v128_t a,v128_t b)300 static v128_t wasm_unpacklo_i16x8(v128_t a, v128_t b) {
301 return wasm_v8x16_shuffle(a, b, 0,1,16,17,2,3,18,19,4,5,20,21,6,7,22,23);
302 }
303
wasm_unpacklo_i32x4(v128_t a,v128_t b)304 static v128_t wasm_unpacklo_i32x4(v128_t a, v128_t b) {
305 return wasm_v8x16_shuffle(a, b, 0,1,2,3,16,17,18,19,4,5,6,7,20,21,22,23);
306 }
307
wasm_unpacklo_i64x2(v128_t a,v128_t b)308 static v128_t wasm_unpacklo_i64x2(v128_t a, v128_t b) {
309 return wasm_v8x16_shuffle(a, b, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
310 }
311
wasm_unpackhi_i8x16(v128_t a,v128_t b)312 static v128_t wasm_unpackhi_i8x16(v128_t a, v128_t b) {
313 return wasm_v8x16_shuffle(a, b, 8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31);
314 }
315
wasm_unpackhi_i16x8(v128_t a,v128_t b)316 static v128_t wasm_unpackhi_i16x8(v128_t a, v128_t b) {
317 return wasm_v8x16_shuffle(a, b, 8,9,24,25,10,11,26,27,12,13,28,29,14,15,30,31);
318 }
319
wasm_unpackhi_i32x4(v128_t a,v128_t b)320 static v128_t wasm_unpackhi_i32x4(v128_t a, v128_t b) {
321 return wasm_v8x16_shuffle(a, b, 8,9,10,11,24,25,26,27,12,13,14,15,28,29,30,31);
322 }
323
wasm_unpackhi_i64x2(v128_t a,v128_t b)324 static v128_t wasm_unpackhi_i64x2(v128_t a, v128_t b) {
325 return wasm_v8x16_shuffle(a, b, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
326 }
327
328 /** Convert **/
329 // 8 >> 16
v128_cvtu8x16_i16x8(const v128_t & a)330 inline v128_t v128_cvtu8x16_i16x8(const v128_t& a)
331 {
332 const v128_t z = wasm_i8x16_splat(0);
333 return wasm_unpacklo_i8x16(a, z);
334 }
v128_cvti8x16_i16x8(const v128_t & a)335 inline v128_t v128_cvti8x16_i16x8(const v128_t& a)
336 { return wasm_i16x8_shr(wasm_unpacklo_i8x16(a, a), 8); }
337 // 8 >> 32
v128_cvtu8x16_i32x4(const v128_t & a)338 inline v128_t v128_cvtu8x16_i32x4(const v128_t& a)
339 {
340 const v128_t z = wasm_i8x16_splat(0);
341 return wasm_unpacklo_i16x8(wasm_unpacklo_i8x16(a, z), z);
342 }
v128_cvti8x16_i32x4(const v128_t & a)343 inline v128_t v128_cvti8x16_i32x4(const v128_t& a)
344 {
345 v128_t r = wasm_unpacklo_i8x16(a, a);
346 r = wasm_unpacklo_i8x16(r, r);
347 return wasm_i32x4_shr(r, 24);
348 }
349 // 16 >> 32
v128_cvtu16x8_i32x4(const v128_t & a)350 inline v128_t v128_cvtu16x8_i32x4(const v128_t& a)
351 {
352 const v128_t z = wasm_i8x16_splat(0);
353 return wasm_unpacklo_i16x8(a, z);
354 }
v128_cvti16x8_i32x4(const v128_t & a)355 inline v128_t v128_cvti16x8_i32x4(const v128_t& a)
356 { return wasm_i32x4_shr(wasm_unpacklo_i16x8(a, a), 16); }
357 // 32 >> 64
v128_cvtu32x4_i64x2(const v128_t & a)358 inline v128_t v128_cvtu32x4_i64x2(const v128_t& a)
359 {
360 const v128_t z = wasm_i8x16_splat(0);
361 return wasm_unpacklo_i32x4(a, z);
362 }
v128_cvti32x4_i64x2(const v128_t & a)363 inline v128_t v128_cvti32x4_i64x2(const v128_t& a)
364 { return wasm_unpacklo_i32x4(a, wasm_i32x4_shr(a, 31)); }
365
366 // 16 << 8
v128_cvtu8x16_i16x8_high(const v128_t & a)367 inline v128_t v128_cvtu8x16_i16x8_high(const v128_t& a)
368 {
369 const v128_t z = wasm_i8x16_splat(0);
370 return wasm_unpackhi_i8x16(a, z);
371 }
v128_cvti8x16_i16x8_high(const v128_t & a)372 inline v128_t v128_cvti8x16_i16x8_high(const v128_t& a)
373 { return wasm_i16x8_shr(wasm_unpackhi_i8x16(a, a), 8); }
374 // 32 << 16
v128_cvtu16x8_i32x4_high(const v128_t & a)375 inline v128_t v128_cvtu16x8_i32x4_high(const v128_t& a)
376 {
377 const v128_t z = wasm_i8x16_splat(0);
378 return wasm_unpackhi_i16x8(a, z);
379 }
v128_cvti16x8_i32x4_high(const v128_t & a)380 inline v128_t v128_cvti16x8_i32x4_high(const v128_t& a)
381 { return wasm_i32x4_shr(wasm_unpackhi_i16x8(a, a), 16); }
382 // 64 << 32
v128_cvtu32x4_i64x2_high(const v128_t & a)383 inline v128_t v128_cvtu32x4_i64x2_high(const v128_t& a)
384 {
385 const v128_t z = wasm_i8x16_splat(0);
386 return wasm_unpackhi_i32x4(a, z);
387 }
v128_cvti32x4_i64x2_high(const v128_t & a)388 inline v128_t v128_cvti32x4_i64x2_high(const v128_t& a)
389 { return wasm_unpackhi_i32x4(a, wasm_i32x4_shr(a, 31)); }
390
391 #define OPENCV_HAL_IMPL_WASM_INITVEC(_Tpvec, _Tp, suffix, zsuffix, _Tps) \
392 inline _Tpvec v_setzero_##suffix() { return _Tpvec(wasm_##zsuffix##_splat((_Tps)0)); } \
393 inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(wasm_##zsuffix##_splat((_Tps)v)); } \
394 template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
395 { return _Tpvec(a.val); }
396
OPENCV_HAL_IMPL_WASM_INITVEC(v_uint8x16,uchar,u8,i8x16,schar)397 OPENCV_HAL_IMPL_WASM_INITVEC(v_uint8x16, uchar, u8, i8x16, schar)
398 OPENCV_HAL_IMPL_WASM_INITVEC(v_int8x16, schar, s8, i8x16, schar)
399 OPENCV_HAL_IMPL_WASM_INITVEC(v_uint16x8, ushort, u16, i16x8, short)
400 OPENCV_HAL_IMPL_WASM_INITVEC(v_int16x8, short, s16, i16x8, short)
401 OPENCV_HAL_IMPL_WASM_INITVEC(v_uint32x4, unsigned, u32, i32x4, int)
402 OPENCV_HAL_IMPL_WASM_INITVEC(v_int32x4, int, s32, i32x4, int)
403 OPENCV_HAL_IMPL_WASM_INITVEC(v_float32x4, float, f32, f32x4, float)
404 OPENCV_HAL_IMPL_WASM_INITVEC(v_uint64x2, uint64, u64, i64x2, int64)
405 OPENCV_HAL_IMPL_WASM_INITVEC(v_int64x2, int64, s64, i64x2, int64)
406 OPENCV_HAL_IMPL_WASM_INITVEC(v_float64x2, double, f64, f64x2, double)
407
408 //////////////// PACK ///////////////
409 inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
410 {
411 v128_t maxval = wasm_i16x8_splat(255);
412 v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u16x8_gt(a.val, maxval));
413 v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u16x8_gt(b.val, maxval));
414 return v_uint8x16(wasm_v8x16_shuffle(a1, b1, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
415 }
v_pack(const v_int16x8 & a,const v_int16x8 & b)416 inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
417 {
418 v128_t maxval = wasm_i16x8_splat(127);
419 v128_t minval = wasm_i16x8_splat(-128);
420 v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval));
421 v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i16x8_gt(b.val, maxval));
422 v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval));
423 v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i16x8_lt(b1, minval));
424 return v_int8x16(wasm_v8x16_shuffle(a2, b2, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
425 }
v_pack(const v_uint32x4 & a,const v_uint32x4 & b)426 inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
427 {
428 v128_t maxval = wasm_i32x4_splat(65535);
429 v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u32x4_gt(a.val, maxval));
430 v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u32x4_gt(b.val, maxval));
431 return v_uint16x8(wasm_v8x16_shuffle(a1, b1, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
432 }
v_pack(const v_int32x4 & a,const v_int32x4 & b)433 inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)
434 {
435 v128_t maxval = wasm_i32x4_splat(32767);
436 v128_t minval = wasm_i32x4_splat(-32768);
437 v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval));
438 v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i32x4_gt(b.val, maxval));
439 v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval));
440 v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i32x4_lt(b1, minval));
441 return v_int16x8(wasm_v8x16_shuffle(a2, b2, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
442 }
v_pack(const v_uint64x2 & a,const v_uint64x2 & b)443 inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
444 {
445 return v_uint32x4(wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
446 }
v_pack(const v_int64x2 & a,const v_int64x2 & b)447 inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
448 {
449 return v_int32x4(wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
450 }
v_pack_u(const v_int16x8 & a,const v_int16x8 & b)451 inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)
452 {
453 v128_t maxval = wasm_i16x8_splat(255);
454 v128_t minval = wasm_i16x8_splat(0);
455 v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval));
456 v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i16x8_gt(b.val, maxval));
457 v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval));
458 v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i16x8_lt(b1, minval));
459 return v_uint8x16(wasm_v8x16_shuffle(a2, b2, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
460 }
v_pack_u(const v_int32x4 & a,const v_int32x4 & b)461 inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
462 {
463 v128_t maxval = wasm_i32x4_splat(65535);
464 v128_t minval = wasm_i32x4_splat(0);
465 v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval));
466 v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i32x4_gt(b.val, maxval));
467 v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval));
468 v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i32x4_lt(b1, minval));
469 return v_uint16x8(wasm_v8x16_shuffle(a2, b2, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
470 }
471
472 template<int n>
v_rshr_pack(const v_uint16x8 & a,const v_uint16x8 & b)473 inline v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
474 {
475 v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
476 v128_t a1 = wasm_u16x8_shr(wasm_i16x8_add(a.val, delta), n);
477 v128_t b1 = wasm_u16x8_shr(wasm_i16x8_add(b.val, delta), n);
478 v128_t maxval = wasm_i16x8_splat(255);
479 v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u16x8_gt(a1, maxval));
480 v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_u16x8_gt(b1, maxval));
481 return v_uint8x16(wasm_v8x16_shuffle(a2, b2, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
482 }
483 template<int n>
v_rshr_pack(const v_int16x8 & a,const v_int16x8 & b)484 inline v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
485 {
486 v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
487 v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add(a.val, delta), n);
488 v128_t b1 = wasm_i16x8_shr(wasm_i16x8_add(b.val, delta), n);
489 v128_t maxval = wasm_i16x8_splat(127);
490 v128_t minval = wasm_i16x8_splat(-128);
491 v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
492 v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i16x8_gt(b1, maxval));
493 v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));
494 v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i16x8_lt(b1, minval));
495 return v_int8x16(wasm_v8x16_shuffle(a3, b3, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
496 }
497 template<int n>
v_rshr_pack(const v_uint32x4 & a,const v_uint32x4 & b)498 inline v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
499 {
500 v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
501 v128_t a1 = wasm_u32x4_shr(wasm_i32x4_add(a.val, delta), n);
502 v128_t b1 = wasm_u32x4_shr(wasm_i32x4_add(b.val, delta), n);
503 v128_t maxval = wasm_i32x4_splat(65535);
504 v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u32x4_gt(a1, maxval));
505 v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_u32x4_gt(b1, maxval));
506 return v_uint16x8(wasm_v8x16_shuffle(a2, b2, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
507 }
508 template<int n>
v_rshr_pack(const v_int32x4 & a,const v_int32x4 & b)509 inline v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
510 {
511 v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
512 v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
513 v128_t b1 = wasm_i32x4_shr(wasm_i32x4_add(b.val, delta), n);
514 v128_t maxval = wasm_i32x4_splat(32767);
515 v128_t minval = wasm_i16x8_splat(-32768);
516 v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
517 v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i32x4_gt(b1, maxval));
518 v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
519 v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i32x4_lt(b1, minval));
520 return v_int16x8(wasm_v8x16_shuffle(a3, b3, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
521 }
522 template<int n>
v_rshr_pack(const v_uint64x2 & a,const v_uint64x2 & b)523 inline v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
524 {
525 v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1)));
526 v128_t a1 = wasm_u64x2_shr(wasm_i64x2_add(a.val, delta), n);
527 v128_t b1 = wasm_u64x2_shr(wasm_i64x2_add(b.val, delta), n);
528 return v_uint32x4(wasm_v8x16_shuffle(a1, b1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
529 }
530 template<int n>
v_rshr_pack(const v_int64x2 & a,const v_int64x2 & b)531 inline v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
532 {
533 v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1)));
534 v128_t a1 = wasm_i64x2_shr(wasm_i64x2_add(a.val, delta), n);
535 v128_t b1 = wasm_i64x2_shr(wasm_i64x2_add(b.val, delta), n);
536 return v_int32x4(wasm_v8x16_shuffle(a1, b1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
537 }
538 template<int n>
v_rshr_pack_u(const v_int16x8 & a,const v_int16x8 & b)539 inline v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
540 {
541 v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
542 v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add(a.val, delta), n);
543 v128_t b1 = wasm_i16x8_shr(wasm_i16x8_add(b.val, delta), n);
544 v128_t maxval = wasm_i16x8_splat(255);
545 v128_t minval = wasm_i16x8_splat(0);
546 v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
547 v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i16x8_gt(b1, maxval));
548 v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));
549 v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i16x8_lt(b1, minval));
550 return v_uint8x16(wasm_v8x16_shuffle(a3, b3, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
551 }
552 template<int n>
v_rshr_pack_u(const v_int32x4 & a,const v_int32x4 & b)553 inline v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
554 {
555 v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
556 v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
557 v128_t b1 = wasm_i32x4_shr(wasm_i32x4_add(b.val, delta), n);
558 v128_t maxval = wasm_i32x4_splat(65535);
559 v128_t minval = wasm_i16x8_splat(0);
560 v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
561 v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i32x4_gt(b1, maxval));
562 v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
563 v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i32x4_lt(b1, minval));
564 return v_uint16x8(wasm_v8x16_shuffle(a3, b3, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
565 }
566
v_pack_store(uchar * ptr,const v_uint16x8 & a)567 inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
568 {
569 v128_t maxval = wasm_i16x8_splat(255);
570 v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u16x8_gt(a.val, maxval));
571 v128_t r = wasm_v8x16_shuffle(a1, a1, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
572 uchar t_ptr[16];
573 wasm_v128_store(t_ptr, r);
574 for (int i=0; i<8; ++i) {
575 ptr[i] = t_ptr[i];
576 }
577 }
v_pack_store(schar * ptr,const v_int16x8 & a)578 inline void v_pack_store(schar* ptr, const v_int16x8& a)
579 {
580 v128_t maxval = wasm_i16x8_splat(127);
581 v128_t minval = wasm_i16x8_splat(-128);
582 v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval));
583 v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval));
584 v128_t r = wasm_v8x16_shuffle(a2, a2, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
585 schar t_ptr[16];
586 wasm_v128_store(t_ptr, r);
587 for (int i=0; i<8; ++i) {
588 ptr[i] = t_ptr[i];
589 }
590 }
v_pack_store(ushort * ptr,const v_uint32x4 & a)591 inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
592 {
593 v128_t maxval = wasm_i32x4_splat(65535);
594 v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u32x4_gt(a.val, maxval));
595 v128_t r = wasm_v8x16_shuffle(a1, a1, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
596 ushort t_ptr[8];
597 wasm_v128_store(t_ptr, r);
598 for (int i=0; i<4; ++i) {
599 ptr[i] = t_ptr[i];
600 }
601 }
v_pack_store(short * ptr,const v_int32x4 & a)602 inline void v_pack_store(short* ptr, const v_int32x4& a)
603 {
604 v128_t maxval = wasm_i32x4_splat(32767);
605 v128_t minval = wasm_i32x4_splat(-32768);
606 v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval));
607 v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval));
608 v128_t r = wasm_v8x16_shuffle(a2, a2, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
609 short t_ptr[8];
610 wasm_v128_store(t_ptr, r);
611 for (int i=0; i<4; ++i) {
612 ptr[i] = t_ptr[i];
613 }
614 }
v_pack_store(unsigned * ptr,const v_uint64x2 & a)615 inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
616 {
617 v128_t r = wasm_v8x16_shuffle(a.val, a.val, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
618 unsigned t_ptr[4];
619 wasm_v128_store(t_ptr, r);
620 for (int i=0; i<2; ++i) {
621 ptr[i] = t_ptr[i];
622 }
623 }
v_pack_store(int * ptr,const v_int64x2 & a)624 inline void v_pack_store(int* ptr, const v_int64x2& a)
625 {
626 v128_t r = wasm_v8x16_shuffle(a.val, a.val, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
627 int t_ptr[4];
628 wasm_v128_store(t_ptr, r);
629 for (int i=0; i<2; ++i) {
630 ptr[i] = t_ptr[i];
631 }
632 }
v_pack_u_store(uchar * ptr,const v_int16x8 & a)633 inline void v_pack_u_store(uchar* ptr, const v_int16x8& a)
634 {
635 v128_t maxval = wasm_i16x8_splat(255);
636 v128_t minval = wasm_i16x8_splat(0);
637 v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval));
638 v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval));
639 v128_t r = wasm_v8x16_shuffle(a2, a2, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
640 uchar t_ptr[16];
641 wasm_v128_store(t_ptr, r);
642 for (int i=0; i<8; ++i) {
643 ptr[i] = t_ptr[i];
644 }
645 }
v_pack_u_store(ushort * ptr,const v_int32x4 & a)646 inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
647 {
648 v128_t maxval = wasm_i32x4_splat(65535);
649 v128_t minval = wasm_i32x4_splat(0);
650 v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval));
651 v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval));
652 v128_t r = wasm_v8x16_shuffle(a2, a2, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
653 ushort t_ptr[8];
654 wasm_v128_store(t_ptr, r);
655 for (int i=0; i<4; ++i) {
656 ptr[i] = t_ptr[i];
657 }
658 }
659
660 template<int n>
v_rshr_pack_store(uchar * ptr,const v_uint16x8 & a)661 inline void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
662 {
663 v128_t delta = wasm_i16x8_splat((short)(1 << (n-1)));
664 v128_t a1 = wasm_u16x8_shr(wasm_i16x8_add(a.val, delta), n);
665 v128_t maxval = wasm_i16x8_splat(255);
666 v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u16x8_gt(a1, maxval));
667 v128_t r = wasm_v8x16_shuffle(a2, a2, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
668 uchar t_ptr[16];
669 wasm_v128_store(t_ptr, r);
670 for (int i=0; i<8; ++i) {
671 ptr[i] = t_ptr[i];
672 }
673 }
674 template<int n>
v_rshr_pack_store(schar * ptr,const v_int16x8 & a)675 inline void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
676 {
677 v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
678 v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add(a.val, delta), n);
679 v128_t maxval = wasm_i16x8_splat(127);
680 v128_t minval = wasm_i16x8_splat(-128);
681 v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
682 v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));
683 v128_t r = wasm_v8x16_shuffle(a3, a3, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
684 schar t_ptr[16];
685 wasm_v128_store(t_ptr, r);
686 for (int i=0; i<8; ++i) {
687 ptr[i] = t_ptr[i];
688 }
689 }
690 template<int n>
v_rshr_pack_store(ushort * ptr,const v_uint32x4 & a)691 inline void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
692 {
693 v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
694 v128_t a1 = wasm_u32x4_shr(wasm_i32x4_add(a.val, delta), n);
695 v128_t maxval = wasm_i32x4_splat(65535);
696 v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u32x4_gt(a1, maxval));
697 v128_t r = wasm_v8x16_shuffle(a2, a2, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
698 ushort t_ptr[8];
699 wasm_v128_store(t_ptr, r);
700 for (int i=0; i<4; ++i) {
701 ptr[i] = t_ptr[i];
702 }
703 }
704 template<int n>
v_rshr_pack_store(short * ptr,const v_int32x4 & a)705 inline void v_rshr_pack_store(short* ptr, const v_int32x4& a)
706 {
707 v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
708 v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
709 v128_t maxval = wasm_i32x4_splat(32767);
710 v128_t minval = wasm_i32x4_splat(-32768);
711 v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
712 v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
713 v128_t r = wasm_v8x16_shuffle(a3, a3, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
714 short t_ptr[8];
715 wasm_v128_store(t_ptr, r);
716 for (int i=0; i<4; ++i) {
717 ptr[i] = t_ptr[i];
718 }
719 }
720 template<int n>
v_rshr_pack_store(unsigned * ptr,const v_uint64x2 & a)721 inline void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
722 {
723 v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1)));
724 v128_t a1 = wasm_u64x2_shr(wasm_i64x2_add(a.val, delta), n);
725 v128_t r = wasm_v8x16_shuffle(a1, a1, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
726 unsigned t_ptr[4];
727 wasm_v128_store(t_ptr, r);
728 for (int i=0; i<2; ++i) {
729 ptr[i] = t_ptr[i];
730 }
731 }
732 template<int n>
v_rshr_pack_store(int * ptr,const v_int64x2 & a)733 inline void v_rshr_pack_store(int* ptr, const v_int64x2& a)
734 {
735 v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1)));
736 v128_t a1 = wasm_i64x2_shr(wasm_i64x2_add(a.val, delta), n);
737 v128_t r = wasm_v8x16_shuffle(a1, a1, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
738 int t_ptr[4];
739 wasm_v128_store(t_ptr, r);
740 for (int i=0; i<2; ++i) {
741 ptr[i] = t_ptr[i];
742 }
743 }
744 template<int n>
v_rshr_pack_u_store(uchar * ptr,const v_int16x8 & a)745 inline void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
746 {
747 v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
748 v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add(a.val, delta), n);
749 v128_t maxval = wasm_i16x8_splat(255);
750 v128_t minval = wasm_i16x8_splat(0);
751 v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
752 v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));
753 v128_t r = wasm_v8x16_shuffle(a3, a3, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
754 uchar t_ptr[16];
755 wasm_v128_store(t_ptr, r);
756 for (int i=0; i<8; ++i) {
757 ptr[i] = t_ptr[i];
758 }
759 }
760 template<int n>
v_rshr_pack_u_store(ushort * ptr,const v_int32x4 & a)761 inline void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
762 {
763 v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
764 v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
765 v128_t maxval = wasm_i32x4_splat(65535);
766 v128_t minval = wasm_i32x4_splat(0);
767 v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
768 v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
769 v128_t r = wasm_v8x16_shuffle(a3, a3, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
770 ushort t_ptr[8];
771 wasm_v128_store(t_ptr, r);
772 for (int i=0; i<4; ++i) {
773 ptr[i] = t_ptr[i];
774 }
775 }
776
v_pack_b(const v_uint16x8 & a,const v_uint16x8 & b)777 inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
778 {
779 v128_t maxval = wasm_i16x8_splat(255);
780 v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u16x8_gt(a.val, maxval));
781 v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u16x8_gt(b.val, maxval));
782 return v_uint8x16(wasm_v8x16_shuffle(a1, b1, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
783 }
784
v_pack_b(const v_uint32x4 & a,const v_uint32x4 & b,const v_uint32x4 & c,const v_uint32x4 & d)785 inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
786 const v_uint32x4& c, const v_uint32x4& d)
787 {
788 v128_t maxval = wasm_i32x4_splat(255);
789 v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u32x4_gt(a.val, maxval));
790 v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u32x4_gt(b.val, maxval));
791 v128_t c1 = wasm_v128_bitselect(maxval, c.val, wasm_u32x4_gt(c.val, maxval));
792 v128_t d1 = wasm_v128_bitselect(maxval, d.val, wasm_u32x4_gt(d.val, maxval));
793 v128_t ab = wasm_v8x16_shuffle(a1, b1, 0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28);
794 v128_t cd = wasm_v8x16_shuffle(c1, d1, 0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28);
795 return v_uint8x16(wasm_v8x16_shuffle(ab, cd, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23));
796 }
797
v_pack_b(const v_uint64x2 & a,const v_uint64x2 & b,const v_uint64x2 & c,const v_uint64x2 & d,const v_uint64x2 & e,const v_uint64x2 & f,const v_uint64x2 & g,const v_uint64x2 & h)798 inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
799 const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
800 const v_uint64x2& g, const v_uint64x2& h)
801 {
802 v128_t maxval = wasm_i32x4_splat(255);
803 v128_t a1 = wasm_v128_bitselect(maxval, a.val, ((__u64x2)(a.val) > (__u64x2)maxval));
804 v128_t b1 = wasm_v128_bitselect(maxval, b.val, ((__u64x2)(b.val) > (__u64x2)maxval));
805 v128_t c1 = wasm_v128_bitselect(maxval, c.val, ((__u64x2)(c.val) > (__u64x2)maxval));
806 v128_t d1 = wasm_v128_bitselect(maxval, d.val, ((__u64x2)(d.val) > (__u64x2)maxval));
807 v128_t e1 = wasm_v128_bitselect(maxval, e.val, ((__u64x2)(e.val) > (__u64x2)maxval));
808 v128_t f1 = wasm_v128_bitselect(maxval, f.val, ((__u64x2)(f.val) > (__u64x2)maxval));
809 v128_t g1 = wasm_v128_bitselect(maxval, g.val, ((__u64x2)(g.val) > (__u64x2)maxval));
810 v128_t h1 = wasm_v128_bitselect(maxval, h.val, ((__u64x2)(h.val) > (__u64x2)maxval));
811 v128_t ab = wasm_v8x16_shuffle(a1, b1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24);
812 v128_t cd = wasm_v8x16_shuffle(c1, d1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24);
813 v128_t ef = wasm_v8x16_shuffle(e1, f1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24);
814 v128_t gh = wasm_v8x16_shuffle(g1, h1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24);
815 v128_t abcd = wasm_v8x16_shuffle(ab, cd, 0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
816 v128_t efgh = wasm_v8x16_shuffle(ef, gh, 0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
817 return v_uint8x16(wasm_v8x16_shuffle(abcd, efgh, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23));
818 }
819
v_matmul(const v_float32x4 & v,const v_float32x4 & m0,const v_float32x4 & m1,const v_float32x4 & m2,const v_float32x4 & m3)820 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
821 const v_float32x4& m1, const v_float32x4& m2,
822 const v_float32x4& m3)
823 {
824 v128_t v0 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 0));
825 v128_t v1 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 1));
826 v128_t v2 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 2));
827 v128_t v3 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 3));
828 v0 = wasm_f32x4_mul(v0, m0.val);
829 v1 = wasm_f32x4_mul(v1, m1.val);
830 v2 = wasm_f32x4_mul(v2, m2.val);
831 v3 = wasm_f32x4_mul(v3, m3.val);
832
833 return v_float32x4(wasm_f32x4_add(wasm_f32x4_add(v0, v1), wasm_f32x4_add(v2, v3)));
834 }
835
v_matmuladd(const v_float32x4 & v,const v_float32x4 & m0,const v_float32x4 & m1,const v_float32x4 & m2,const v_float32x4 & a)836 inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
837 const v_float32x4& m1, const v_float32x4& m2,
838 const v_float32x4& a)
839 {
840 v128_t v0 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 0));
841 v128_t v1 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 1));
842 v128_t v2 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 2));
843 v0 = wasm_f32x4_mul(v0, m0.val);
844 v1 = wasm_f32x4_mul(v1, m1.val);
845 v2 = wasm_f32x4_mul(v2, m2.val);
846
847 return v_float32x4(wasm_f32x4_add(wasm_f32x4_add(v0, v1), wasm_f32x4_add(v2, a.val)));
848 }
849
850 #define OPENCV_HAL_IMPL_WASM_BIN_OP(bin_op, _Tpvec, intrin) \
851 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
852 { \
853 return _Tpvec(intrin(a.val, b.val)); \
854 } \
855 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
856 { \
857 a.val = intrin(a.val, b.val); \
858 return a; \
859 }
860
861 OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint8x16, wasm_u8x16_add_saturate)
862 OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint8x16, wasm_u8x16_sub_saturate)
863 OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int8x16, wasm_i8x16_add_saturate)
864 OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int8x16, wasm_i8x16_sub_saturate)
865 OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint16x8, wasm_u16x8_add_saturate)
866 OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint16x8, wasm_u16x8_sub_saturate)
867 OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int16x8, wasm_i16x8_add_saturate)
868 OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int16x8, wasm_i16x8_sub_saturate)
869 OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint32x4, wasm_i32x4_add)
870 OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint32x4, wasm_i32x4_sub)
871 OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_uint32x4, wasm_i32x4_mul)
872 OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int32x4, wasm_i32x4_add)
873 OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int32x4, wasm_i32x4_sub)
874 OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_int32x4, wasm_i32x4_mul)
875 OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_float32x4, wasm_f32x4_add)
876 OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_float32x4, wasm_f32x4_sub)
877 OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_float32x4, wasm_f32x4_mul)
878 OPENCV_HAL_IMPL_WASM_BIN_OP(/, v_float32x4, wasm_f32x4_div)
879 OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint64x2, wasm_i64x2_add)
880 OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint64x2, wasm_i64x2_sub)
881 OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int64x2, wasm_i64x2_add)
882 OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int64x2, wasm_i64x2_sub)
883 OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_float64x2, wasm_f64x2_add)
884 OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_float64x2, wasm_f64x2_sub)
885 OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_float64x2, wasm_f64x2_mul)
886 OPENCV_HAL_IMPL_WASM_BIN_OP(/, v_float64x2, wasm_f64x2_div)
887
888 // saturating multiply 8-bit, 16-bit
889 #define OPENCV_HAL_IMPL_WASM_MUL_SAT(_Tpvec, _Tpwvec) \
890 inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
891 { \
892 _Tpwvec c, d; \
893 v_mul_expand(a, b, c, d); \
894 return v_pack(c, d); \
895 } \
896 inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
897 { a = a * b; return a; }
898
OPENCV_HAL_IMPL_WASM_MUL_SAT(v_uint8x16,v_uint16x8)899 OPENCV_HAL_IMPL_WASM_MUL_SAT(v_uint8x16, v_uint16x8)
900 OPENCV_HAL_IMPL_WASM_MUL_SAT(v_int8x16, v_int16x8)
901 OPENCV_HAL_IMPL_WASM_MUL_SAT(v_uint16x8, v_uint32x4)
902 OPENCV_HAL_IMPL_WASM_MUL_SAT(v_int16x8, v_int32x4)
903
904 // Multiply and expand
905 inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
906 v_uint16x8& c, v_uint16x8& d)
907 {
908 v_uint16x8 a0, a1, b0, b1;
909 v_expand(a, a0, a1);
910 v_expand(b, b0, b1);
911 c = v_mul_wrap(a0, b0);
912 d = v_mul_wrap(a1, b1);
913 }
914
v_mul_expand(const v_int8x16 & a,const v_int8x16 & b,v_int16x8 & c,v_int16x8 & d)915 inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
916 v_int16x8& c, v_int16x8& d)
917 {
918 v_int16x8 a0, a1, b0, b1;
919 v_expand(a, a0, a1);
920 v_expand(b, b0, b1);
921 c = v_mul_wrap(a0, b0);
922 d = v_mul_wrap(a1, b1);
923 }
924
v_mul_expand(const v_int16x8 & a,const v_int16x8 & b,v_int32x4 & c,v_int32x4 & d)925 inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
926 v_int32x4& c, v_int32x4& d)
927 {
928 v_int32x4 a0, a1, b0, b1;
929 v_expand(a, a0, a1);
930 v_expand(b, b0, b1);
931 c.val = wasm_i32x4_mul(a0.val, b0.val);
932 d.val = wasm_i32x4_mul(a1.val, b1.val);
933 }
934
v_mul_expand(const v_uint16x8 & a,const v_uint16x8 & b,v_uint32x4 & c,v_uint32x4 & d)935 inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
936 v_uint32x4& c, v_uint32x4& d)
937 {
938 v_uint32x4 a0, a1, b0, b1;
939 v_expand(a, a0, a1);
940 v_expand(b, b0, b1);
941 c.val = wasm_i32x4_mul(a0.val, b0.val);
942 d.val = wasm_i32x4_mul(a1.val, b1.val);
943 }
944
v_mul_expand(const v_uint32x4 & a,const v_uint32x4 & b,v_uint64x2 & c,v_uint64x2 & d)945 inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
946 v_uint64x2& c, v_uint64x2& d)
947 {
948 v_uint64x2 a0, a1, b0, b1;
949 v_expand(a, a0, a1);
950 v_expand(b, b0, b1);
951 c.val = ((__u64x2)(a0.val) * (__u64x2)(b0.val));
952 d.val = ((__u64x2)(a1.val) * (__u64x2)(b1.val));
953 }
954
v_mul_hi(const v_int16x8 & a,const v_int16x8 & b)955 inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
956 {
957 v_int32x4 a0, a1, b0, b1;
958 v_expand(a, a0, a1);
959 v_expand(b, b0, b1);
960 v128_t c = wasm_i32x4_mul(a0.val, b0.val);
961 v128_t d = wasm_i32x4_mul(a1.val, b1.val);
962 return v_int16x8(wasm_v8x16_shuffle(c, d, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31));
963 }
v_mul_hi(const v_uint16x8 & a,const v_uint16x8 & b)964 inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
965 {
966 v_uint32x4 a0, a1, b0, b1;
967 v_expand(a, a0, a1);
968 v_expand(b, b0, b1);
969 v128_t c = wasm_i32x4_mul(a0.val, b0.val);
970 v128_t d = wasm_i32x4_mul(a1.val, b1.val);
971 return v_uint16x8(wasm_v8x16_shuffle(c, d, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31));
972 }
973
974 //////// Dot Product ////////
975
v_dotprod(const v_int16x8 & a,const v_int16x8 & b)976 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
977 {
978 v128_t a0 = wasm_i32x4_shr(wasm_i32x4_shl(a.val, 16), 16);
979 v128_t a1 = wasm_i32x4_shr(a.val, 16);
980 v128_t b0 = wasm_i32x4_shr(wasm_i32x4_shl(b.val, 16), 16);
981 v128_t b1 = wasm_i32x4_shr(b.val, 16);
982 v128_t c = wasm_i32x4_mul(a0, b0);
983 v128_t d = wasm_i32x4_mul(a1, b1);
984 return v_int32x4(wasm_i32x4_add(c, d));
985 }
986
v_dotprod(const v_int16x8 & a,const v_int16x8 & b,const v_int32x4 & c)987 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
988 { return v_dotprod(a, b) + c; }
989
v_dotprod(const v_int32x4 & a,const v_int32x4 & b)990 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
991 {
992 v128_t a0 = wasm_i64x2_shr(wasm_i64x2_shl(a.val, 32), 32);
993 v128_t a1 = wasm_i64x2_shr(a.val, 32);
994 v128_t b0 = wasm_i64x2_shr(wasm_i64x2_shl(b.val, 32), 32);
995 v128_t b1 = wasm_i64x2_shr(b.val, 32);
996 v128_t c = (v128_t)((__i64x2)a0 * (__i64x2)b0);
997 v128_t d = (v128_t)((__i64x2)a1 * (__i64x2)b1);
998 return v_int64x2(wasm_i64x2_add(c, d));
999 }
v_dotprod(const v_int32x4 & a,const v_int32x4 & b,const v_int64x2 & c)1000 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
1001 {
1002 return v_dotprod(a, b) + c;
1003 }
1004
1005 // 8 >> 32
v_dotprod_expand(const v_uint8x16 & a,const v_uint8x16 & b)1006 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
1007 {
1008 v128_t a0 = wasm_u16x8_shr(wasm_i16x8_shl(a.val, 8), 8);
1009 v128_t a1 = wasm_u16x8_shr(a.val, 8);
1010 v128_t b0 = wasm_u16x8_shr(wasm_i16x8_shl(b.val, 8), 8);
1011 v128_t b1 = wasm_u16x8_shr(b.val, 8);
1012 return v_uint32x4((
1013 v_dotprod(v_int16x8(a0), v_int16x8(b0)) +
1014 v_dotprod(v_int16x8(a1), v_int16x8(b1))).val
1015 );
1016 }
v_dotprod_expand(const v_uint8x16 & a,const v_uint8x16 & b,const v_uint32x4 & c)1017 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
1018 { return v_dotprod_expand(a, b) + c; }
1019
v_dotprod_expand(const v_int8x16 & a,const v_int8x16 & b)1020 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
1021 {
1022 v128_t a0 = wasm_i16x8_shr(wasm_i16x8_shl(a.val, 8), 8);
1023 v128_t a1 = wasm_i16x8_shr(a.val, 8);
1024 v128_t b0 = wasm_i16x8_shr(wasm_i16x8_shl(b.val, 8), 8);
1025 v128_t b1 = wasm_i16x8_shr(b.val, 8);
1026 return v_int32x4(
1027 v_dotprod(v_int16x8(a0), v_int16x8(b0)) +
1028 v_dotprod(v_int16x8(a1), v_int16x8(b1))
1029 );
1030 }
v_dotprod_expand(const v_int8x16 & a,const v_int8x16 & b,const v_int32x4 & c)1031 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
1032 { return v_dotprod_expand(a, b) + c; }
1033
1034 // 16 >> 64
v_dotprod_expand(const v_uint16x8 & a,const v_uint16x8 & b)1035 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
1036 {
1037 v128_t a0 = wasm_u32x4_shr(wasm_i32x4_shl(a.val, 16), 16);
1038 v128_t a1 = wasm_u32x4_shr(a.val, 16);
1039 v128_t b0 = wasm_u32x4_shr(wasm_i32x4_shl(b.val, 16), 16);
1040 v128_t b1 = wasm_u32x4_shr(b.val, 16);
1041 return v_uint64x2((
1042 v_dotprod(v_int32x4(a0), v_int32x4(b0)) +
1043 v_dotprod(v_int32x4(a1), v_int32x4(b1))).val
1044 );
1045 }
v_dotprod_expand(const v_uint16x8 & a,const v_uint16x8 & b,const v_uint64x2 & c)1046 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
1047 { return v_dotprod_expand(a, b) + c; }
1048
v_dotprod_expand(const v_int16x8 & a,const v_int16x8 & b)1049 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
1050 {
1051 v128_t a0 = wasm_i32x4_shr(wasm_i32x4_shl(a.val, 16), 16);
1052 v128_t a1 = wasm_i32x4_shr(a.val, 16);
1053 v128_t b0 = wasm_i32x4_shr(wasm_i32x4_shl(b.val, 16), 16);
1054 v128_t b1 = wasm_i32x4_shr(b.val, 16);
1055 return v_int64x2((
1056 v_dotprod(v_int32x4(a0), v_int32x4(b0)) +
1057 v_dotprod(v_int32x4(a1), v_int32x4(b1)))
1058 );
1059 }
1060
v_dotprod_expand(const v_int16x8 & a,const v_int16x8 & b,const v_int64x2 & c)1061 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
1062 { return v_dotprod_expand(a, b) + c; }
1063
1064 // 32 >> 64f
v_dotprod_expand(const v_int32x4 & a,const v_int32x4 & b)1065 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
1066 { return v_cvt_f64(v_dotprod(a, b)); }
v_dotprod_expand(const v_int32x4 & a,const v_int32x4 & b,const v_float64x2 & c)1067 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
1068 { return v_dotprod_expand(a, b) + c; }
1069
1070 //////// Fast Dot Product ////////
1071
1072 // 16 >> 32
v_dotprod_fast(const v_int16x8 & a,const v_int16x8 & b)1073 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
1074 { return v_dotprod(a, b); }
v_dotprod_fast(const v_int16x8 & a,const v_int16x8 & b,const v_int32x4 & c)1075 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
1076 { return v_dotprod(a, b, c); }
1077
1078 // 32 >> 64
v_dotprod_fast(const v_int32x4 & a,const v_int32x4 & b)1079 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
1080 { return v_dotprod(a, b); }
v_dotprod_fast(const v_int32x4 & a,const v_int32x4 & b,const v_int64x2 & c)1081 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
1082 { return v_dotprod(a, b, c); }
1083
1084 // 8 >> 32
v_dotprod_expand_fast(const v_uint8x16 & a,const v_uint8x16 & b)1085 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
1086 { return v_dotprod_expand(a, b); }
v_dotprod_expand_fast(const v_uint8x16 & a,const v_uint8x16 & b,const v_uint32x4 & c)1087 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
1088 { return v_dotprod_expand(a, b, c); }
v_dotprod_expand_fast(const v_int8x16 & a,const v_int8x16 & b)1089 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
1090 { return v_dotprod_expand(a, b); }
v_dotprod_expand_fast(const v_int8x16 & a,const v_int8x16 & b,const v_int32x4 & c)1091 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
1092 { return v_dotprod_expand(a, b, c); }
1093
1094 // 16 >> 64
v_dotprod_expand_fast(const v_uint16x8 & a,const v_uint16x8 & b)1095 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
1096 { return v_dotprod_expand(a, b); }
v_dotprod_expand_fast(const v_uint16x8 & a,const v_uint16x8 & b,const v_uint64x2 & c)1097 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
1098 { return v_dotprod_expand(a, b, c); }
v_dotprod_expand_fast(const v_int16x8 & a,const v_int16x8 & b)1099 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
1100 { return v_dotprod_expand(a, b); }
v_dotprod_expand_fast(const v_int16x8 & a,const v_int16x8 & b,const v_int64x2 & c)1101 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
1102 { return v_dotprod_expand(a, b, c); }
1103
1104 // 32 >> 64f
v_dotprod_expand_fast(const v_int32x4 & a,const v_int32x4 & b)1105 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
1106 { return v_dotprod_expand(a, b); }
v_dotprod_expand_fast(const v_int32x4 & a,const v_int32x4 & b,const v_float64x2 & c)1107 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
1108 { return v_dotprod_expand(a, b, c); }
1109
1110 #define OPENCV_HAL_IMPL_WASM_LOGIC_OP(_Tpvec) \
1111 OPENCV_HAL_IMPL_WASM_BIN_OP(&, _Tpvec, wasm_v128_and) \
1112 OPENCV_HAL_IMPL_WASM_BIN_OP(|, _Tpvec, wasm_v128_or) \
1113 OPENCV_HAL_IMPL_WASM_BIN_OP(^, _Tpvec, wasm_v128_xor) \
1114 inline _Tpvec operator ~ (const _Tpvec& a) \
1115 { \
1116 return _Tpvec(wasm_v128_not(a.val)); \
1117 }
1118
1119 OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint8x16)
OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int8x16)1120 OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int8x16)
1121 OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint16x8)
1122 OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int16x8)
1123 OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint32x4)
1124 OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int32x4)
1125 OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint64x2)
1126 OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int64x2)
1127 OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_float32x4)
1128 OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_float64x2)
1129
1130 inline v_float32x4 v_sqrt(const v_float32x4& x)
1131 {
1132 return v_float32x4(wasm_f32x4_sqrt(x.val));
1133 }
1134
v_invsqrt(const v_float32x4 & x)1135 inline v_float32x4 v_invsqrt(const v_float32x4& x)
1136 {
1137 const v128_t _1_0 = wasm_f32x4_splat(1.0);
1138 return v_float32x4(wasm_f32x4_div(_1_0, wasm_f32x4_sqrt(x.val)));
1139 }
1140
v_sqrt(const v_float64x2 & x)1141 inline v_float64x2 v_sqrt(const v_float64x2& x)
1142 {
1143 return v_float64x2(wasm_f64x2_sqrt(x.val));
1144 }
1145
v_invsqrt(const v_float64x2 & x)1146 inline v_float64x2 v_invsqrt(const v_float64x2& x)
1147 {
1148 const v128_t _1_0 = wasm_f64x2_splat(1.0);
1149 return v_float64x2(wasm_f64x2_div(_1_0, wasm_f64x2_sqrt(x.val)));
1150 }
1151
1152 #define OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(_Tpuvec, _Tpsvec, suffix, zsuffix, shiftWidth) \
1153 inline _Tpuvec v_abs(const _Tpsvec& x) \
1154 { \
1155 v128_t s = wasm_##suffix##_shr(x.val, shiftWidth); \
1156 v128_t f = wasm_##zsuffix##_shr(x.val, shiftWidth); \
1157 return _Tpuvec(wasm_##zsuffix##_add(wasm_v128_xor(x.val, f), s)); \
1158 }
1159
1160 OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(v_uint8x16, v_int8x16, u8x16, i8x16, 7)
1161 OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(v_uint16x8, v_int16x8, u16x8, i16x8, 15)
1162 OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(v_uint32x4, v_int32x4, u32x4, i32x4, 31)
1163
v_abs(const v_float32x4 & x)1164 inline v_float32x4 v_abs(const v_float32x4& x)
1165 { return v_float32x4(wasm_f32x4_abs(x.val)); }
v_abs(const v_float64x2 & x)1166 inline v_float64x2 v_abs(const v_float64x2& x)
1167 {
1168 return v_float64x2(wasm_f64x2_abs(x.val));
1169 }
1170
1171 // TODO: exp, log, sin, cos
1172
1173 #define OPENCV_HAL_IMPL_WASM_BIN_FUNC(_Tpvec, func, intrin) \
1174 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
1175 { \
1176 return _Tpvec(intrin(a.val, b.val)); \
1177 }
1178
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float32x4,v_min,wasm_f32x4_min)1179 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float32x4, v_min, wasm_f32x4_min)
1180 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float32x4, v_max, wasm_f32x4_max)
1181 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float64x2, v_min, wasm_f64x2_min)
1182 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float64x2, v_max, wasm_f64x2_max)
1183
1184 #define OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(_Tpvec, suffix) \
1185 inline _Tpvec v_min(const _Tpvec& a, const _Tpvec& b) \
1186 { \
1187 return _Tpvec(wasm_v128_bitselect(b.val, a.val, wasm_##suffix##_gt(a.val, b.val))); \
1188 } \
1189 inline _Tpvec v_max(const _Tpvec& a, const _Tpvec& b) \
1190 { \
1191 return _Tpvec(wasm_v128_bitselect(a.val, b.val, wasm_##suffix##_gt(a.val, b.val))); \
1192 }
1193
1194 OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(v_int8x16, i8x16)
1195 OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(v_int16x8, i16x8)
1196 OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(v_int32x4, i32x4)
1197
1198 #define OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(_Tpvec, suffix, deltaNum) \
1199 inline _Tpvec v_min(const _Tpvec& a, const _Tpvec& b) \
1200 { \
1201 v128_t delta = wasm_##suffix##_splat(deltaNum); \
1202 v128_t mask = wasm_##suffix##_gt(wasm_v128_xor(a.val, delta), wasm_v128_xor(b.val, delta)); \
1203 return _Tpvec(wasm_v128_bitselect(b.val, a.val, mask)); \
1204 } \
1205 inline _Tpvec v_max(const _Tpvec& a, const _Tpvec& b) \
1206 { \
1207 v128_t delta = wasm_##suffix##_splat(deltaNum); \
1208 v128_t mask = wasm_##suffix##_gt(wasm_v128_xor(a.val, delta), wasm_v128_xor(b.val, delta)); \
1209 return _Tpvec(wasm_v128_bitselect(a.val, b.val, mask)); \
1210 }
1211
1212 OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint8x16, i8x16, (schar)0x80)
1213 OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint16x8, i16x8, (short)0x8000)
1214 OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint32x4, i32x4, (int)0x80000000)
1215
1216 #define OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(_Tpvec, suffix, esuffix) \
1217 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
1218 { return _Tpvec(wasm_##esuffix##_eq(a.val, b.val)); } \
1219 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
1220 { return _Tpvec(wasm_##esuffix##_ne(a.val, b.val)); } \
1221 inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
1222 { return _Tpvec(wasm_##suffix##_lt(a.val, b.val)); } \
1223 inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
1224 { return _Tpvec(wasm_##suffix##_gt(a.val, b.val)); } \
1225 inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
1226 { return _Tpvec(wasm_##suffix##_le(a.val, b.val)); } \
1227 inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
1228 { return _Tpvec(wasm_##suffix##_ge(a.val, b.val)); }
1229
1230 OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint8x16, u8x16, i8x16)
1231 OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_int8x16, i8x16, i8x16)
1232 OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint16x8, u16x8, i16x8)
1233 OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_int16x8, i16x8, i16x8)
1234 OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint32x4, u32x4, i32x4)
1235 OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_int32x4, i32x4, i32x4)
1236 OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_float32x4, f32x4, f32x4)
1237 OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_float64x2, f64x2, f64x2)
1238
1239 #define OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(_Tpvec, cast) \
1240 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
1241 { return cast(v_reinterpret_as_f64(a) == v_reinterpret_as_f64(b)); } \
1242 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
1243 { return cast(v_reinterpret_as_f64(a) != v_reinterpret_as_f64(b)); }
1244
1245 OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64)
1246 OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64)
1247
1248 inline v_float32x4 v_not_nan(const v_float32x4& a)
1249 {
1250 v128_t z = wasm_i32x4_splat(0x7fffffff);
1251 v128_t t = wasm_i32x4_splat(0x7f800000);
1252 return v_float32x4(wasm_u32x4_lt(wasm_v128_and(a.val, z), t));
1253 }
v_not_nan(const v_float64x2 & a)1254 inline v_float64x2 v_not_nan(const v_float64x2& a)
1255 {
1256 v128_t z = wasm_i64x2_splat(0x7fffffffffffffff);
1257 v128_t t = wasm_i64x2_splat(0x7ff0000000000000);
1258 return v_float64x2((__u64x2)(wasm_v128_and(a.val, z)) < (__u64x2)t);
1259 }
1260
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16,v_add_wrap,wasm_i8x16_add)1261 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_add_wrap, wasm_i8x16_add)
1262 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_add_wrap, wasm_i8x16_add)
1263 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_add_wrap, wasm_i16x8_add)
1264 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_add_wrap, wasm_i16x8_add)
1265 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_sub_wrap, wasm_i8x16_sub)
1266 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_sub_wrap, wasm_i8x16_sub)
1267 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_sub_wrap, wasm_i16x8_sub)
1268 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_sub_wrap, wasm_i16x8_sub)
1269 #if (__EMSCRIPTEN_major__ * 1000000 + __EMSCRIPTEN_minor__ * 1000 + __EMSCRIPTEN_tiny__) >= (1039012)
1270 // details: https://github.com/opencv/opencv/issues/18097 ( https://github.com/emscripten-core/emscripten/issues/12018 )
1271 // 1.39.12: https://github.com/emscripten-core/emscripten/commit/cd801d0f110facfd694212a3c8b2ed2ffcd630e2
1272 inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b)
1273 {
1274 uchar a_[16], b_[16];
1275 wasm_v128_store(a_, a.val);
1276 wasm_v128_store(b_, b.val);
1277 for (int i = 0; i < 16; i++)
1278 a_[i] = (uchar)(a_[i] * b_[i]);
1279 return v_uint8x16(wasm_v128_load(a_));
1280 }
v_mul_wrap(const v_int8x16 & a,const v_int8x16 & b)1281 inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b)
1282 {
1283 schar a_[16], b_[16];
1284 wasm_v128_store(a_, a.val);
1285 wasm_v128_store(b_, b.val);
1286 for (int i = 0; i < 16; i++)
1287 a_[i] = (schar)(a_[i] * b_[i]);
1288 return v_int8x16(wasm_v128_load(a_));
1289 }
1290 #else
1291 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_mul_wrap, wasm_i8x16_mul)
1292 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_mul_wrap, wasm_i8x16_mul)
1293 #endif
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8,v_mul_wrap,wasm_i16x8_mul)1294 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_mul_wrap, wasm_i16x8_mul)
1295 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_mul_wrap, wasm_i16x8_mul)
1296
1297
1298 /** Absolute difference **/
1299
1300 inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b)
1301 { return v_add_wrap(a - b, b - a); }
v_absdiff(const v_uint16x8 & a,const v_uint16x8 & b)1302 inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b)
1303 { return v_add_wrap(a - b, b - a); }
v_absdiff(const v_uint32x4 & a,const v_uint32x4 & b)1304 inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
1305 { return v_max(a, b) - v_min(a, b); }
1306
v_absdiff(const v_int8x16 & a,const v_int8x16 & b)1307 inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
1308 {
1309 v_int8x16 d = v_sub_wrap(a, b);
1310 v_int8x16 m = a < b;
1311 return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
1312 }
v_absdiff(const v_int16x8 & a,const v_int16x8 & b)1313 inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
1314 {
1315 return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)));
1316 }
v_absdiff(const v_int32x4 & a,const v_int32x4 & b)1317 inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
1318 {
1319 v_int32x4 d = a - b;
1320 v_int32x4 m = a < b;
1321 return v_reinterpret_as_u32((d ^ m) - m);
1322 }
1323
1324 /** Saturating absolute difference **/
v_absdiffs(const v_int8x16 & a,const v_int8x16 & b)1325 inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
1326 {
1327 v_int8x16 d = a - b;
1328 v_int8x16 m = a < b;
1329 return (d ^ m) - m;
1330 }
v_absdiffs(const v_int16x8 & a,const v_int16x8 & b)1331 inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
1332 { return v_max(a, b) - v_min(a, b); }
1333
1334
v_fma(const v_int32x4 & a,const v_int32x4 & b,const v_int32x4 & c)1335 inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1336 {
1337 return a * b + c;
1338 }
1339
v_muladd(const v_int32x4 & a,const v_int32x4 & b,const v_int32x4 & c)1340 inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1341 {
1342 return v_fma(a, b, c);
1343 }
1344
v_fma(const v_float32x4 & a,const v_float32x4 & b,const v_float32x4 & c)1345 inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
1346 {
1347 return a * b + c;
1348 }
1349
v_fma(const v_float64x2 & a,const v_float64x2 & b,const v_float64x2 & c)1350 inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
1351 {
1352 return a * b + c;
1353 }
1354
v_absdiff(const v_float32x4 & a,const v_float32x4 & b)1355 inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
1356 {
1357 v128_t absmask_vec = wasm_i32x4_splat(0x7fffffff);
1358 return v_float32x4(wasm_v128_and(wasm_f32x4_sub(a.val, b.val), absmask_vec));
1359 }
v_absdiff(const v_float64x2 & a,const v_float64x2 & b)1360 inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
1361 {
1362 v128_t absmask_vec = wasm_u64x2_shr(wasm_i32x4_splat(-1), 1);
1363 return v_float64x2(wasm_v128_and(wasm_f64x2_sub(a.val, b.val), absmask_vec));
1364 }
1365
1366 #define OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(_Tpvec, suffix) \
1367 inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
1368 { \
1369 v128_t a_Square = wasm_##suffix##_mul(a.val, a.val); \
1370 v128_t b_Square = wasm_##suffix##_mul(b.val, b.val); \
1371 return _Tpvec(wasm_##suffix##_sqrt(wasm_##suffix##_add(a_Square, b_Square))); \
1372 } \
1373 inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
1374 { \
1375 v128_t a_Square = wasm_##suffix##_mul(a.val, a.val); \
1376 v128_t b_Square = wasm_##suffix##_mul(b.val, b.val); \
1377 return _Tpvec(wasm_##suffix##_add(a_Square, b_Square)); \
1378 } \
1379 inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1380 { \
1381 return _Tpvec(wasm_##suffix##_add(wasm_##suffix##_mul(a.val, b.val), c.val)); \
1382 }
1383
1384 OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(v_float32x4, f32x4)
1385 OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(v_float64x2, f64x2)
1386
1387 #define OPENCV_HAL_IMPL_WASM_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, ssuffix) \
1388 inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
1389 { \
1390 return _Tpuvec(wasm_##suffix##_shl(a.val, imm)); \
1391 } \
1392 inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
1393 { \
1394 return _Tpsvec(wasm_##suffix##_shl(a.val, imm)); \
1395 } \
1396 inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
1397 { \
1398 return _Tpuvec(wasm_##ssuffix##_shr(a.val, imm)); \
1399 } \
1400 inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
1401 { \
1402 return _Tpsvec(wasm_##suffix##_shr(a.val, imm)); \
1403 } \
1404 template<int imm> \
1405 inline _Tpuvec v_shl(const _Tpuvec& a) \
1406 { \
1407 return _Tpuvec(wasm_##suffix##_shl(a.val, imm)); \
1408 } \
1409 template<int imm> \
1410 inline _Tpsvec v_shl(const _Tpsvec& a) \
1411 { \
1412 return _Tpsvec(wasm_##suffix##_shl(a.val, imm)); \
1413 } \
1414 template<int imm> \
1415 inline _Tpuvec v_shr(const _Tpuvec& a) \
1416 { \
1417 return _Tpuvec(wasm_##ssuffix##_shr(a.val, imm)); \
1418 } \
1419 template<int imm> \
1420 inline _Tpsvec v_shr(const _Tpsvec& a) \
1421 { \
1422 return _Tpsvec(wasm_##suffix##_shr(a.val, imm)); \
1423 }
1424
1425 OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint8x16, v_int8x16, i8x16, u8x16)
1426 OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint16x8, v_int16x8, i16x8, u16x8)
1427 OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint32x4, v_int32x4, i32x4, u32x4)
1428 OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint64x2, v_int64x2, i64x2, u64x2)
1429
1430 namespace hal_wasm_internal
1431 {
1432 template <int imm,
1433 bool is_invalid = ((imm < 0) || (imm > 16)),
1434 bool is_first = (imm == 0),
1435 bool is_second = (imm == 16),
1436 bool is_other = (((imm > 0) && (imm < 16)))>
1437 class v_wasm_palignr_u8_class;
1438
1439 template <int imm>
1440 class v_wasm_palignr_u8_class<imm, true, false, false, false>;
1441
1442 template <int imm>
1443 class v_wasm_palignr_u8_class<imm, false, true, false, false>
1444 {
1445 public:
operator ()(const v128_t & a,const v128_t &) const1446 inline v128_t operator()(const v128_t& a, const v128_t&) const
1447 {
1448 return a;
1449 }
1450 };
1451
1452 template <int imm>
1453 class v_wasm_palignr_u8_class<imm, false, false, true, false>
1454 {
1455 public:
operator ()(const v128_t &,const v128_t & b) const1456 inline v128_t operator()(const v128_t&, const v128_t& b) const
1457 {
1458 return b;
1459 }
1460 };
1461
1462 template <int imm>
1463 class v_wasm_palignr_u8_class<imm, false, false, false, true>
1464 {
1465 public:
operator ()(const v128_t & a,const v128_t & b) const1466 inline v128_t operator()(const v128_t& a, const v128_t& b) const
1467 {
1468 enum { imm2 = (sizeof(v128_t) - imm) };
1469 return wasm_v8x16_shuffle(a, b,
1470 imm, imm+1, imm+2, imm+3,
1471 imm+4, imm+5, imm+6, imm+7,
1472 imm+8, imm+9, imm+10, imm+11,
1473 imm+12, imm+13, imm+14, imm+15);
1474 }
1475 };
1476
1477 template <int imm>
v_wasm_palignr_u8(const v128_t & a,const v128_t & b)1478 inline v128_t v_wasm_palignr_u8(const v128_t& a, const v128_t& b)
1479 {
1480 CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_wasm_palignr_u8.");
1481 return v_wasm_palignr_u8_class<imm>()(a, b);
1482 }
1483 }
1484
1485 template<int imm, typename _Tpvec>
v_rotate_right(const _Tpvec & a)1486 inline _Tpvec v_rotate_right(const _Tpvec &a)
1487 {
1488 using namespace hal_wasm_internal;
1489 enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
1490 v128_t z = wasm_i8x16_splat(0);
1491 return _Tpvec(v_wasm_palignr_u8<imm2>(a.val, z));
1492 }
1493
1494 template<int imm, typename _Tpvec>
v_rotate_left(const _Tpvec & a)1495 inline _Tpvec v_rotate_left(const _Tpvec &a)
1496 {
1497 using namespace hal_wasm_internal;
1498 enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
1499 v128_t z = wasm_i8x16_splat(0);
1500 return _Tpvec(v_wasm_palignr_u8<imm2>(z, a.val));
1501 }
1502
1503 template<int imm, typename _Tpvec>
v_rotate_right(const _Tpvec & a,const _Tpvec & b)1504 inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b)
1505 {
1506 using namespace hal_wasm_internal;
1507 enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
1508 return _Tpvec(v_wasm_palignr_u8<imm2>(a.val, b.val));
1509 }
1510
1511 template<int imm, typename _Tpvec>
v_rotate_left(const _Tpvec & a,const _Tpvec & b)1512 inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b)
1513 {
1514 using namespace hal_wasm_internal;
1515 enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
1516 return _Tpvec(v_wasm_palignr_u8<imm2>(b.val, a.val));
1517 }
1518
1519 #define OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(_Tpvec, _Tp) \
1520 inline _Tpvec v_load(const _Tp* ptr) \
1521 { return _Tpvec(wasm_v128_load(ptr)); } \
1522 inline _Tpvec v_load_aligned(const _Tp* ptr) \
1523 { return _Tpvec(wasm_v128_load(ptr)); } \
1524 inline _Tpvec v_load_low(const _Tp* ptr) \
1525 { \
1526 _Tp tmp[_Tpvec::nlanes] = {0}; \
1527 for (int i=0; i<_Tpvec::nlanes/2; ++i) { \
1528 tmp[i] = ptr[i]; \
1529 } \
1530 return _Tpvec(wasm_v128_load(tmp)); \
1531 } \
1532 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
1533 { \
1534 _Tp tmp[_Tpvec::nlanes]; \
1535 for (int i=0; i<_Tpvec::nlanes/2; ++i) { \
1536 tmp[i] = ptr0[i]; \
1537 tmp[i+_Tpvec::nlanes/2] = ptr1[i]; \
1538 } \
1539 return _Tpvec(wasm_v128_load(tmp)); \
1540 } \
1541 inline void v_store(_Tp* ptr, const _Tpvec& a) \
1542 { wasm_v128_store(ptr, a.val); } \
1543 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
1544 { wasm_v128_store(ptr, a.val); } \
1545 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
1546 { wasm_v128_store(ptr, a.val); } \
1547 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
1548 { \
1549 wasm_v128_store(ptr, a.val); \
1550 } \
1551 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
1552 { \
1553 _Tpvec::lane_type a_[_Tpvec::nlanes]; \
1554 wasm_v128_store(a_, a.val); \
1555 for (int i = 0; i < (_Tpvec::nlanes / 2); i++) \
1556 ptr[i] = a_[i]; \
1557 } \
1558 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
1559 { \
1560 _Tpvec::lane_type a_[_Tpvec::nlanes]; \
1561 wasm_v128_store(a_, a.val); \
1562 for (int i = 0; i < (_Tpvec::nlanes / 2); i++) \
1563 ptr[i] = a_[i + (_Tpvec::nlanes / 2)]; \
1564 }
1565
OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint8x16,uchar)1566 OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint8x16, uchar)
1567 OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int8x16, schar)
1568 OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint16x8, ushort)
1569 OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int16x8, short)
1570 OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint32x4, unsigned)
1571 OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int32x4, int)
1572 OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint64x2, uint64)
1573 OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int64x2, int64)
1574 OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_float32x4, float)
1575 OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_float64x2, double)
1576
1577
1578 /** Reverse **/
1579 inline v_uint8x16 v_reverse(const v_uint8x16 &a)
1580 { return v_uint8x16(wasm_v8x16_shuffle(a.val, a.val, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); }
1581
v_reverse(const v_int8x16 & a)1582 inline v_int8x16 v_reverse(const v_int8x16 &a)
1583 { return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
1584
v_reverse(const v_uint16x8 & a)1585 inline v_uint16x8 v_reverse(const v_uint16x8 &a)
1586 { return v_uint16x8(wasm_v8x16_shuffle(a.val, a.val, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); }
1587
v_reverse(const v_int16x8 & a)1588 inline v_int16x8 v_reverse(const v_int16x8 &a)
1589 { return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
1590
v_reverse(const v_uint32x4 & a)1591 inline v_uint32x4 v_reverse(const v_uint32x4 &a)
1592 { return v_uint32x4(wasm_v8x16_shuffle(a.val, a.val, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3)); }
1593
v_reverse(const v_int32x4 & a)1594 inline v_int32x4 v_reverse(const v_int32x4 &a)
1595 { return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
1596
v_reverse(const v_float32x4 & a)1597 inline v_float32x4 v_reverse(const v_float32x4 &a)
1598 { return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
1599
v_reverse(const v_uint64x2 & a)1600 inline v_uint64x2 v_reverse(const v_uint64x2 &a)
1601 { return v_uint64x2(wasm_v8x16_shuffle(a.val, a.val, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); }
1602
v_reverse(const v_int64x2 & a)1603 inline v_int64x2 v_reverse(const v_int64x2 &a)
1604 { return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
1605
v_reverse(const v_float64x2 & a)1606 inline v_float64x2 v_reverse(const v_float64x2 &a)
1607 { return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
1608
1609
1610 #define OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, esuffix) \
1611 inline scalartype v_reduce_sum(const _Tpvec& a) \
1612 { \
1613 regtype val = a.val; \
1614 val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7)); \
1615 val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3)); \
1616 return (scalartype)wasm_##esuffix##_extract_lane(val, 0); \
1617 }
1618
OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_uint32x4,unsigned,v128_t,i32x4,i32x4)1619 OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_uint32x4, unsigned, v128_t, i32x4, i32x4)
1620 OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_int32x4, int, v128_t, i32x4, i32x4)
1621 OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_float32x4, float, v128_t, f32x4, f32x4)
1622
1623 // To do: Optimize v_reduce_sum with wasm intrin.
1624 // Now use fallback implementation as there is no widening op in wasm intrin.
1625
1626 #define OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(_Tpvec, scalartype) \
1627 inline scalartype v_reduce_sum(const _Tpvec& a) \
1628 { \
1629 _Tpvec::lane_type a_[_Tpvec::nlanes]; \
1630 wasm_v128_store(a_, a.val); \
1631 scalartype c = a_[0]; \
1632 for (int i = 1; i < _Tpvec::nlanes; i++) \
1633 c += a_[i]; \
1634 return c; \
1635 }
1636
1637 OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_uint8x16, unsigned)
1638 OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_int8x16, int)
1639 OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_uint16x8, unsigned)
1640 OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_int16x8, int)
1641
1642
1643 #define OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(_Tpvec, scalartype, regtype, suffix, esuffix) \
1644 inline scalartype v_reduce_sum(const _Tpvec& a) \
1645 { \
1646 regtype val = a.val; \
1647 val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7)); \
1648 return (scalartype)wasm_##esuffix##_extract_lane(val, 0); \
1649 }
1650 OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(v_uint64x2, uint64, v128_t, i64x2, i64x2)
1651 OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(v_int64x2, int64, v128_t, i64x2, i64x2)
1652 OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(v_float64x2, double, v128_t, f64x2,f64x2)
1653
1654 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
1655 const v_float32x4& c, const v_float32x4& d)
1656 {
1657 v128_t ac = wasm_f32x4_add(wasm_unpacklo_i32x4(a.val, c.val), wasm_unpackhi_i32x4(a.val, c.val));
1658 v128_t bd = wasm_f32x4_add(wasm_unpacklo_i32x4(b.val, d.val), wasm_unpackhi_i32x4(b.val, d.val));
1659 return v_float32x4(wasm_f32x4_add(wasm_unpacklo_i32x4(ac, bd), wasm_unpackhi_i32x4(ac, bd)));
1660 }
1661
1662 #define OPENCV_HAL_IMPL_WASM_REDUCE_OP(_Tpvec, scalartype, func, scalar_func) \
1663 inline scalartype v_reduce_##func(const _Tpvec& a) \
1664 { \
1665 scalartype buf[_Tpvec::nlanes]; \
1666 v_store(buf, a); \
1667 scalartype tmp = buf[0]; \
1668 for (int i=1; i<_Tpvec::nlanes; ++i) { \
1669 tmp = scalar_func(tmp, buf[i]); \
1670 } \
1671 return tmp; \
1672 }
1673
OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint8x16,uchar,max,std::max)1674 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint8x16, uchar, max, std::max)
1675 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint8x16, uchar, min, std::min)
1676 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int8x16, schar, max, std::max)
1677 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int8x16, schar, min, std::min)
1678 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint16x8, ushort, max, std::max)
1679 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint16x8, ushort, min, std::min)
1680 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int16x8, short, max, std::max)
1681 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int16x8, short, min, std::min)
1682 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint32x4, unsigned, max, std::max)
1683 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint32x4, unsigned, min, std::min)
1684 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int32x4, int, max, std::max)
1685 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int32x4, int, min, std::min)
1686 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_float32x4, float, max, std::max)
1687 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_float32x4, float, min, std::min)
1688
1689 inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
1690 {
1691 v_uint16x8 l16, h16;
1692 v_uint32x4 l16_l32, l16_h32, h16_l32, h16_h32;
1693 v_expand(v_absdiff(a, b), l16, h16);
1694 v_expand(l16, l16_l32, l16_h32);
1695 v_expand(h16, h16_l32, h16_h32);
1696 return v_reduce_sum(l16_l32+l16_h32+h16_l32+h16_h32);
1697 }
v_reduce_sad(const v_int8x16 & a,const v_int8x16 & b)1698 inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
1699 {
1700 v_uint16x8 l16, h16;
1701 v_uint32x4 l16_l32, l16_h32, h16_l32, h16_h32;
1702 v_expand(v_absdiff(a, b), l16, h16);
1703 v_expand(l16, l16_l32, l16_h32);
1704 v_expand(h16, h16_l32, h16_h32);
1705 return v_reduce_sum(l16_l32+l16_h32+h16_l32+h16_h32);
1706 }
v_reduce_sad(const v_uint16x8 & a,const v_uint16x8 & b)1707 inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
1708 {
1709 v_uint32x4 l, h;
1710 v_expand(v_absdiff(a, b), l, h);
1711 return v_reduce_sum(l + h);
1712 }
v_reduce_sad(const v_int16x8 & a,const v_int16x8 & b)1713 inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
1714 {
1715 v_uint32x4 l, h;
1716 v_expand(v_absdiff(a, b), l, h);
1717 return v_reduce_sum(l + h);
1718 }
v_reduce_sad(const v_uint32x4 & a,const v_uint32x4 & b)1719 inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
1720 {
1721 return v_reduce_sum(v_absdiff(a, b));
1722 }
v_reduce_sad(const v_int32x4 & a,const v_int32x4 & b)1723 inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
1724 {
1725 return v_reduce_sum(v_absdiff(a, b));
1726 }
v_reduce_sad(const v_float32x4 & a,const v_float32x4 & b)1727 inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
1728 {
1729 return v_reduce_sum(v_absdiff(a, b));
1730 }
1731
v_popcount(const v_uint8x16 & a)1732 inline v_uint8x16 v_popcount(const v_uint8x16& a)
1733 {
1734 v128_t m1 = wasm_i32x4_splat(0x55555555);
1735 v128_t m2 = wasm_i32x4_splat(0x33333333);
1736 v128_t m4 = wasm_i32x4_splat(0x0f0f0f0f);
1737 v128_t p = a.val;
1738 p = wasm_i32x4_add(wasm_v128_and(wasm_u32x4_shr(p, 1), m1), wasm_v128_and(p, m1));
1739 p = wasm_i32x4_add(wasm_v128_and(wasm_u32x4_shr(p, 2), m2), wasm_v128_and(p, m2));
1740 p = wasm_i32x4_add(wasm_v128_and(wasm_u32x4_shr(p, 4), m4), wasm_v128_and(p, m4));
1741 return v_uint8x16(p);
1742 }
v_popcount(const v_uint16x8 & a)1743 inline v_uint16x8 v_popcount(const v_uint16x8& a)
1744 {
1745 v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
1746 p += v_rotate_right<1>(p);
1747 return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff);
1748 }
v_popcount(const v_uint32x4 & a)1749 inline v_uint32x4 v_popcount(const v_uint32x4& a)
1750 {
1751 v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
1752 p += v_rotate_right<1>(p);
1753 p += v_rotate_right<2>(p);
1754 return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff);
1755 }
v_popcount(const v_uint64x2 & a)1756 inline v_uint64x2 v_popcount(const v_uint64x2& a)
1757 {
1758 uint64 a_[2], b_[2] = { 0 };
1759 wasm_v128_store(a_, a.val);
1760 for (int i = 0; i < 16; i++)
1761 b_[i / 8] += popCountTable[((uint8_t*)a_)[i]];
1762 return v_uint64x2(wasm_v128_load(b_));
1763 }
v_popcount(const v_int8x16 & a)1764 inline v_uint8x16 v_popcount(const v_int8x16& a)
1765 { return v_popcount(v_reinterpret_as_u8(a)); }
v_popcount(const v_int16x8 & a)1766 inline v_uint16x8 v_popcount(const v_int16x8& a)
1767 { return v_popcount(v_reinterpret_as_u16(a)); }
v_popcount(const v_int32x4 & a)1768 inline v_uint32x4 v_popcount(const v_int32x4& a)
1769 { return v_popcount(v_reinterpret_as_u32(a)); }
v_popcount(const v_int64x2 & a)1770 inline v_uint64x2 v_popcount(const v_int64x2& a)
1771 { return v_popcount(v_reinterpret_as_u64(a)); }
1772
1773 #define OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(_Tpvec, suffix, scalarType) \
1774 inline int v_signmask(const _Tpvec& a) \
1775 { \
1776 _Tpvec::lane_type a_[_Tpvec::nlanes]; \
1777 wasm_v128_store(a_, a.val); \
1778 int mask = 0; \
1779 for (int i = 0; i < _Tpvec::nlanes; i++) \
1780 mask |= (reinterpret_int(a_[i]) < 0) << i; \
1781 return mask; \
1782 } \
1783 inline bool v_check_all(const _Tpvec& a) \
1784 { return wasm_i8x16_all_true(wasm_##suffix##_lt(a.val, wasm_##suffix##_splat(0))); } \
1785 inline bool v_check_any(const _Tpvec& a) \
1786 { return wasm_i8x16_any_true(wasm_##suffix##_lt(a.val, wasm_##suffix##_splat(0)));; }
1787
OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_uint8x16,i8x16,schar)1788 OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_uint8x16, i8x16, schar)
1789 OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_int8x16, i8x16, schar)
1790 OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_uint16x8, i16x8, short)
1791 OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_int16x8, i16x8, short)
1792 OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_uint32x4, i32x4, int)
1793 OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_int32x4, i32x4, int)
1794 OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_float32x4, i32x4, float)
1795 OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_float64x2, f64x2, double)
1796
1797 #define OPENCV_HAL_IMPL_WASM_CHECK_ALL_ANY(_Tpvec, suffix, esuffix) \
1798 inline bool v_check_all(const _Tpvec& a) \
1799 { \
1800 v128_t masked = v_reinterpret_as_##esuffix(a).val; \
1801 masked = wasm_i32x4_replace_lane(masked, 0, 0xffffffff); \
1802 masked = wasm_i32x4_replace_lane(masked, 2, 0xffffffff); \
1803 return wasm_i8x16_all_true(wasm_##suffix##_lt(masked, wasm_##suffix##_splat(0))); \
1804 } \
1805 inline bool v_check_any(const _Tpvec& a) \
1806 { \
1807 v128_t masked = v_reinterpret_as_##esuffix(a).val; \
1808 masked = wasm_i32x4_replace_lane(masked, 0, 0x0); \
1809 masked = wasm_i32x4_replace_lane(masked, 2, 0x0); \
1810 return wasm_i8x16_any_true(wasm_##suffix##_lt(masked, wasm_##suffix##_splat(0))); \
1811 } \
1812
1813 OPENCV_HAL_IMPL_WASM_CHECK_ALL_ANY(v_int64x2, i32x4, s32)
1814 OPENCV_HAL_IMPL_WASM_CHECK_ALL_ANY(v_uint64x2, i32x4, u32)
1815
1816
1817 inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
v_scan_forward(const v_uint8x16 & a)1818 inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
v_scan_forward(const v_int16x8 & a)1819 inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
v_scan_forward(const v_uint16x8 & a)1820 inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
v_scan_forward(const v_int32x4 & a)1821 inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
v_scan_forward(const v_uint32x4 & a)1822 inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
v_scan_forward(const v_float32x4 & a)1823 inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
v_scan_forward(const v_int64x2 & a)1824 inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
v_scan_forward(const v_uint64x2 & a)1825 inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
v_scan_forward(const v_float64x2 & a)1826 inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1827
1828 #define OPENCV_HAL_IMPL_WASM_SELECT(_Tpvec) \
1829 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1830 { \
1831 return _Tpvec(wasm_v128_bitselect(a.val, b.val, mask.val)); \
1832 }
1833
1834 OPENCV_HAL_IMPL_WASM_SELECT(v_uint8x16)
OPENCV_HAL_IMPL_WASM_SELECT(v_int8x16)1835 OPENCV_HAL_IMPL_WASM_SELECT(v_int8x16)
1836 OPENCV_HAL_IMPL_WASM_SELECT(v_uint16x8)
1837 OPENCV_HAL_IMPL_WASM_SELECT(v_int16x8)
1838 OPENCV_HAL_IMPL_WASM_SELECT(v_uint32x4)
1839 OPENCV_HAL_IMPL_WASM_SELECT(v_int32x4)
1840 OPENCV_HAL_IMPL_WASM_SELECT(v_uint64x2)
1841 OPENCV_HAL_IMPL_WASM_SELECT(v_int64x2)
1842 OPENCV_HAL_IMPL_WASM_SELECT(v_float32x4)
1843 OPENCV_HAL_IMPL_WASM_SELECT(v_float64x2)
1844
1845 #define OPENCV_HAL_IMPL_WASM_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
1846 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1847 { \
1848 b0.val = intrin(a.val); \
1849 b1.val = __CV_CAT(intrin, _high)(a.val); \
1850 } \
1851 inline _Tpwvec v_expand_low(const _Tpvec& a) \
1852 { return _Tpwvec(intrin(a.val)); } \
1853 inline _Tpwvec v_expand_high(const _Tpvec& a) \
1854 { return _Tpwvec(__CV_CAT(intrin, _high)(a.val)); } \
1855 inline _Tpwvec v_load_expand(const _Tp* ptr) \
1856 { \
1857 v128_t a = wasm_v128_load(ptr); \
1858 return _Tpwvec(intrin(a)); \
1859 }
1860
1861 OPENCV_HAL_IMPL_WASM_EXPAND(v_uint8x16, v_uint16x8, uchar, v128_cvtu8x16_i16x8)
1862 OPENCV_HAL_IMPL_WASM_EXPAND(v_int8x16, v_int16x8, schar, v128_cvti8x16_i16x8)
1863 OPENCV_HAL_IMPL_WASM_EXPAND(v_uint16x8, v_uint32x4, ushort, v128_cvtu16x8_i32x4)
1864 OPENCV_HAL_IMPL_WASM_EXPAND(v_int16x8, v_int32x4, short, v128_cvti16x8_i32x4)
1865 OPENCV_HAL_IMPL_WASM_EXPAND(v_uint32x4, v_uint64x2, unsigned, v128_cvtu32x4_i64x2)
1866 OPENCV_HAL_IMPL_WASM_EXPAND(v_int32x4, v_int64x2, int, v128_cvti32x4_i64x2)
1867
1868 #define OPENCV_HAL_IMPL_WASM_EXPAND_Q(_Tpvec, _Tp, intrin) \
1869 inline _Tpvec v_load_expand_q(const _Tp* ptr) \
1870 { \
1871 v128_t a = wasm_v128_load(ptr); \
1872 return _Tpvec(intrin(a)); \
1873 }
1874
1875 OPENCV_HAL_IMPL_WASM_EXPAND_Q(v_uint32x4, uchar, v128_cvtu8x16_i32x4)
1876 OPENCV_HAL_IMPL_WASM_EXPAND_Q(v_int32x4, schar, v128_cvti8x16_i32x4)
1877
1878 #define OPENCV_HAL_IMPL_WASM_UNPACKS(_Tpvec, suffix) \
1879 inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
1880 { \
1881 b0.val = wasm_unpacklo_##suffix(a0.val, a1.val); \
1882 b1.val = wasm_unpackhi_##suffix(a0.val, a1.val); \
1883 } \
1884 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
1885 { \
1886 return _Tpvec(wasm_unpacklo_i64x2(a.val, b.val)); \
1887 } \
1888 inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
1889 { \
1890 return _Tpvec(wasm_unpackhi_i64x2(a.val, b.val)); \
1891 } \
1892 inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
1893 { \
1894 c.val = wasm_unpacklo_i64x2(a.val, b.val); \
1895 d.val = wasm_unpackhi_i64x2(a.val, b.val); \
1896 }
1897
1898 OPENCV_HAL_IMPL_WASM_UNPACKS(v_uint8x16, i8x16)
1899 OPENCV_HAL_IMPL_WASM_UNPACKS(v_int8x16, i8x16)
1900 OPENCV_HAL_IMPL_WASM_UNPACKS(v_uint16x8, i16x8)
1901 OPENCV_HAL_IMPL_WASM_UNPACKS(v_int16x8, i16x8)
1902 OPENCV_HAL_IMPL_WASM_UNPACKS(v_uint32x4, i32x4)
1903 OPENCV_HAL_IMPL_WASM_UNPACKS(v_int32x4, i32x4)
1904 OPENCV_HAL_IMPL_WASM_UNPACKS(v_float32x4, i32x4)
1905 OPENCV_HAL_IMPL_WASM_UNPACKS(v_float64x2, i64x2)
1906
1907 template<int s, typename _Tpvec>
1908 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
1909 {
1910 return v_rotate_right<s>(a, b);
1911 }
1912
v_round(const v_float32x4 & a)1913 inline v_int32x4 v_round(const v_float32x4& a)
1914 {
1915 v128_t h = wasm_f32x4_splat(0.5);
1916 return v_int32x4(wasm_i32x4_trunc_saturate_f32x4(wasm_f32x4_add(a.val, h)));
1917 }
1918
v_floor(const v_float32x4 & a)1919 inline v_int32x4 v_floor(const v_float32x4& a)
1920 {
1921 v128_t a1 = wasm_i32x4_trunc_saturate_f32x4(a.val);
1922 v128_t mask = wasm_f32x4_lt(a.val, wasm_f32x4_convert_i32x4(a1));
1923 return v_int32x4(wasm_i32x4_add(a1, mask));
1924 }
1925
v_ceil(const v_float32x4 & a)1926 inline v_int32x4 v_ceil(const v_float32x4& a)
1927 {
1928 v128_t a1 = wasm_i32x4_trunc_saturate_f32x4(a.val);
1929 v128_t mask = wasm_f32x4_gt(a.val, wasm_f32x4_convert_i32x4(a1));
1930 return v_int32x4(wasm_i32x4_sub(a1, mask));
1931 }
1932
v_trunc(const v_float32x4 & a)1933 inline v_int32x4 v_trunc(const v_float32x4& a)
1934 { return v_int32x4(wasm_i32x4_trunc_saturate_f32x4(a.val)); }
1935
1936 #define OPENCV_HAL_IMPL_WASM_MATH_FUNC(func, cfunc) \
1937 inline v_int32x4 func(const v_float64x2& a) \
1938 { \
1939 double a_[2]; \
1940 wasm_v128_store(a_, a.val); \
1941 int c_[4]; \
1942 c_[0] = cfunc(a_[0]); \
1943 c_[1] = cfunc(a_[1]); \
1944 c_[2] = 0; \
1945 c_[3] = 0; \
1946 return v_int32x4(wasm_v128_load(c_)); \
1947 }
1948
OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_round,cvRound)1949 OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_round, cvRound)
1950 OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_floor, cvFloor)
1951 OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_ceil, cvCeil)
1952 OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_trunc, int)
1953
1954 inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
1955 {
1956 double a_[2], b_[2];
1957 wasm_v128_store(a_, a.val);
1958 wasm_v128_store(b_, b.val);
1959 int c_[4];
1960 c_[0] = cvRound(a_[0]);
1961 c_[1] = cvRound(a_[1]);
1962 c_[2] = cvRound(b_[0]);
1963 c_[3] = cvRound(b_[1]);
1964 return v_int32x4(wasm_v128_load(c_));
1965 }
1966
1967 #define OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(_Tpvec, suffix) \
1968 inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
1969 const _Tpvec& a2, const _Tpvec& a3, \
1970 _Tpvec& b0, _Tpvec& b1, \
1971 _Tpvec& b2, _Tpvec& b3) \
1972 { \
1973 v128_t t0 = wasm_unpacklo_##suffix(a0.val, a1.val); \
1974 v128_t t1 = wasm_unpacklo_##suffix(a2.val, a3.val); \
1975 v128_t t2 = wasm_unpackhi_##suffix(a0.val, a1.val); \
1976 v128_t t3 = wasm_unpackhi_##suffix(a2.val, a3.val); \
1977 \
1978 b0.val = wasm_unpacklo_i64x2(t0, t1); \
1979 b1.val = wasm_unpackhi_i64x2(t0, t1); \
1980 b2.val = wasm_unpacklo_i64x2(t2, t3); \
1981 b3.val = wasm_unpackhi_i64x2(t2, t3); \
1982 }
1983
OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(v_uint32x4,i32x4)1984 OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(v_uint32x4, i32x4)
1985 OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(v_int32x4, i32x4)
1986 OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(v_float32x4, i32x4)
1987
1988 // load deinterleave
1989 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
1990 {
1991 v128_t t00 = wasm_v128_load(ptr);
1992 v128_t t01 = wasm_v128_load(ptr + 16);
1993
1994 a.val = wasm_v8x16_shuffle(t00, t01, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30);
1995 b.val = wasm_v8x16_shuffle(t00, t01, 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31);
1996 }
1997
v_load_deinterleave(const uchar * ptr,v_uint8x16 & a,v_uint8x16 & b,v_uint8x16 & c)1998 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
1999 {
2000 v128_t t00 = wasm_v128_load(ptr);
2001 v128_t t01 = wasm_v128_load(ptr + 16);
2002 v128_t t02 = wasm_v128_load(ptr + 32);
2003
2004 v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,3,6,9,12,15,18,21,24,27,30,1,2,4,5,7);
2005 v128_t t11 = wasm_v8x16_shuffle(t00, t01, 1,4,7,10,13,16,19,22,25,28,31,0,2,3,5,6);
2006 v128_t t12 = wasm_v8x16_shuffle(t00, t01, 2,5,8,11,14,17,20,23,26,29,0,1,3,4,6,7);
2007
2008 a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29);
2009 b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30);
2010 c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31);
2011 }
2012
v_load_deinterleave(const uchar * ptr,v_uint8x16 & a,v_uint8x16 & b,v_uint8x16 & c,v_uint8x16 & d)2013 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
2014 {
2015 v128_t u0 = wasm_v128_load(ptr); // a0 b0 c0 d0 a1 b1 c1 d1 ...
2016 v128_t u1 = wasm_v128_load(ptr + 16); // a4 b4 c4 d4 ...
2017 v128_t u2 = wasm_v128_load(ptr + 32); // a8 b8 c8 d8 ...
2018 v128_t u3 = wasm_v128_load(ptr + 48); // a12 b12 c12 d12 ...
2019
2020 v128_t v0 = wasm_v8x16_shuffle(u0, u1, 0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29);
2021 v128_t v1 = wasm_v8x16_shuffle(u2, u3, 0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29);
2022 v128_t v2 = wasm_v8x16_shuffle(u0, u1, 2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31);
2023 v128_t v3 = wasm_v8x16_shuffle(u2, u3, 2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31);
2024
2025 a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
2026 b.val = wasm_v8x16_shuffle(v0, v1, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
2027 c.val = wasm_v8x16_shuffle(v2, v3, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
2028 d.val = wasm_v8x16_shuffle(v2, v3, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
2029 }
2030
v_load_deinterleave(const ushort * ptr,v_uint16x8 & a,v_uint16x8 & b)2031 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
2032 {
2033 v128_t v0 = wasm_v128_load(ptr); // a0 b0 a1 b1 a2 b2 a3 b3
2034 v128_t v1 = wasm_v128_load(ptr + 8); // a4 b4 a5 b5 a6 b6 a7 b7
2035
2036 a.val = wasm_v8x16_shuffle(v0, v1, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29); // a0 a1 a2 a3 a4 a5 a6 a7
2037 b.val = wasm_v8x16_shuffle(v0, v1, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31); // b0 b1 ab b3 b4 b5 b6 b7
2038 }
2039
v_load_deinterleave(const ushort * ptr,v_uint16x8 & a,v_uint16x8 & b,v_uint16x8 & c)2040 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
2041 {
2042 v128_t t00 = wasm_v128_load(ptr); // a0 b0 c0 a1 b1 c1 a2 b2
2043 v128_t t01 = wasm_v128_load(ptr + 8); // c2 a3 b3 c3 a4 b4 c4 a5
2044 v128_t t02 = wasm_v128_load(ptr + 16); // b5 c5 a6 b6 c6 a7 b7 c7
2045
2046 v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,6,7,12,13,18,19,24,25,30,31,2,3,4,5);
2047 v128_t t11 = wasm_v8x16_shuffle(t00, t01, 2,3,8,9,14,15,20,21,26,27,0,1,4,5,6,7);
2048 v128_t t12 = wasm_v8x16_shuffle(t00, t01, 4,5,10,11,16,17,22,23,28,29,0,1,2,3,6,7);
2049
2050 a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,26,27);
2051 b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,16,17,22,23,28,29);
2052 c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,8,9,18,19,24,25,30,31);
2053 }
2054
v_load_deinterleave(const ushort * ptr,v_uint16x8 & a,v_uint16x8 & b,v_uint16x8 & c,v_uint16x8 & d)2055 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
2056 {
2057 v128_t u0 = wasm_v128_load(ptr); // a0 b0 c0 d0 a1 b1 c1 d1
2058 v128_t u1 = wasm_v128_load(ptr + 8); // a2 b2 c2 d2 ...
2059 v128_t u2 = wasm_v128_load(ptr + 16); // a4 b4 c4 d4 ...
2060 v128_t u3 = wasm_v128_load(ptr + 24); // a6 b6 c6 d6 ...
2061
2062 v128_t v0 = wasm_v8x16_shuffle(u0, u1, 0,1,8,9,16,17,24,25,2,3,10,11,18,19,26,27); // a0 a1 a2 a3 b0 b1 b2 b3
2063 v128_t v1 = wasm_v8x16_shuffle(u2, u3, 0,1,8,9,16,17,24,25,2,3,10,11,18,19,26,27); // a4 a5 a6 a7 b4 b5 b6 b7
2064 v128_t v2 = wasm_v8x16_shuffle(u0, u1, 4,5,12,13,20,21,28,29,6,7,14,15,22,23,30,31); // c0 c1 c2 c3 d0 d1 d2 d3
2065 v128_t v3 = wasm_v8x16_shuffle(u2, u3, 4,5,12,13,20,21,28,29,6,7,14,15,22,23,30,31); // c4 c5 c6 c7 d4 d5 d6 d7
2066
2067 a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
2068 b.val = wasm_v8x16_shuffle(v0, v1, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
2069 c.val = wasm_v8x16_shuffle(v2, v3, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
2070 d.val = wasm_v8x16_shuffle(v2, v3, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
2071 }
2072
v_load_deinterleave(const unsigned * ptr,v_uint32x4 & a,v_uint32x4 & b)2073 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b)
2074 {
2075 v128_t v0 = wasm_v128_load(ptr); // a0 b0 a1 b1
2076 v128_t v1 = wasm_v128_load(ptr + 4); // a2 b2 a3 b3
2077
2078 a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); // a0 a1 a2 a3
2079 b.val = wasm_v8x16_shuffle(v0, v1, 4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); // b0 b1 b2 b3
2080 }
2081
v_load_deinterleave(const unsigned * ptr,v_uint32x4 & a,v_uint32x4 & b,v_uint32x4 & c)2082 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
2083 {
2084 v128_t t00 = wasm_v128_load(ptr); // a0 b0 c0 a1
2085 v128_t t01 = wasm_v128_load(ptr + 4); // b2 c2 a3 b3
2086 v128_t t02 = wasm_v128_load(ptr + 8); // c3 a4 b4 c4
2087
2088 v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,2,3,12,13,14,15,24,25,26,27,4,5,6,7);
2089 v128_t t11 = wasm_v8x16_shuffle(t00, t01, 4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3);
2090 v128_t t12 = wasm_v8x16_shuffle(t00, t01, 8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7);
2091
2092 a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
2093 b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
2094 c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
2095 }
2096
v_load_deinterleave(const unsigned * ptr,v_uint32x4 & a,v_uint32x4 & b,v_uint32x4 & c,v_uint32x4 & d)2097 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
2098 {
2099 v_uint32x4 s0(wasm_v128_load(ptr)); // a0 b0 c0 d0
2100 v_uint32x4 s1(wasm_v128_load(ptr + 4)); // a1 b1 c1 d1
2101 v_uint32x4 s2(wasm_v128_load(ptr + 8)); // a2 b2 c2 d2
2102 v_uint32x4 s3(wasm_v128_load(ptr + 12)); // a3 b3 c3 d3
2103
2104 v_transpose4x4(s0, s1, s2, s3, a, b, c, d);
2105 }
2106
v_load_deinterleave(const float * ptr,v_float32x4 & a,v_float32x4 & b)2107 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
2108 {
2109 v128_t v0 = wasm_v128_load(ptr); // a0 b0 a1 b1
2110 v128_t v1 = wasm_v128_load((ptr + 4)); // a2 b2 a3 b3
2111
2112 a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); // a0 a1 a2 a3
2113 b.val = wasm_v8x16_shuffle(v0, v1, 4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); // b0 b1 b2 b3
2114 }
2115
v_load_deinterleave(const float * ptr,v_float32x4 & a,v_float32x4 & b,v_float32x4 & c)2116 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c)
2117 {
2118 v128_t t00 = wasm_v128_load(ptr); // a0 b0 c0 a1
2119 v128_t t01 = wasm_v128_load(ptr + 4); // b2 c2 a3 b3
2120 v128_t t02 = wasm_v128_load(ptr + 8); // c3 a4 b4 c4
2121
2122 v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,2,3,12,13,14,15,24,25,26,27,4,5,6,7);
2123 v128_t t11 = wasm_v8x16_shuffle(t00, t01, 4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3);
2124 v128_t t12 = wasm_v8x16_shuffle(t00, t01, 8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7);
2125
2126 a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
2127 b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
2128 c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
2129 }
2130
v_load_deinterleave(const float * ptr,v_float32x4 & a,v_float32x4 & b,v_float32x4 & c,v_float32x4 & d)2131 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c, v_float32x4& d)
2132 {
2133 v_float32x4 s0(wasm_v128_load(ptr)); // a0 b0 c0 d0
2134 v_float32x4 s1(wasm_v128_load(ptr + 4)); // a1 b1 c1 d1
2135 v_float32x4 s2(wasm_v128_load(ptr + 8)); // a2 b2 c2 d2
2136 v_float32x4 s3(wasm_v128_load(ptr + 12)); // a3 b3 c3 d3
2137
2138 v_transpose4x4(s0, s1, s2, s3, a, b, c, d);
2139 }
2140
v_load_deinterleave(const uint64 * ptr,v_uint64x2 & a,v_uint64x2 & b)2141 inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b)
2142 {
2143 v128_t t0 = wasm_v128_load(ptr); // a0 b0
2144 v128_t t1 = wasm_v128_load(ptr + 2); // a1 b1
2145
2146 a.val = wasm_unpacklo_i64x2(t0, t1);
2147 b.val = wasm_unpackhi_i64x2(t0, t1);
2148 }
2149
v_load_deinterleave(const uint64 * ptr,v_uint64x2 & a,v_uint64x2 & b,v_uint64x2 & c)2150 inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
2151 {
2152 v128_t t0 = wasm_v128_load(ptr); // a0, b0
2153 v128_t t1 = wasm_v128_load(ptr + 2); // c0, a1
2154 v128_t t2 = wasm_v128_load(ptr + 4); // b1, c1
2155
2156 a.val = wasm_v8x16_shuffle(t0, t1, 0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31);
2157 b.val = wasm_v8x16_shuffle(t0, t2, 8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23);
2158 c.val = wasm_v8x16_shuffle(t1, t2, 0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31);
2159 }
2160
v_load_deinterleave(const uint64 * ptr,v_uint64x2 & a,v_uint64x2 & b,v_uint64x2 & c,v_uint64x2 & d)2161 inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a,
2162 v_uint64x2& b, v_uint64x2& c, v_uint64x2& d)
2163 {
2164 v128_t t0 = wasm_v128_load(ptr); // a0 b0
2165 v128_t t1 = wasm_v128_load(ptr + 2); // c0 d0
2166 v128_t t2 = wasm_v128_load(ptr + 4); // a1 b1
2167 v128_t t3 = wasm_v128_load(ptr + 6); // c1 d1
2168
2169 a.val = wasm_unpacklo_i64x2(t0, t2);
2170 b.val = wasm_unpackhi_i64x2(t0, t2);
2171 c.val = wasm_unpacklo_i64x2(t1, t3);
2172 d.val = wasm_unpackhi_i64x2(t1, t3);
2173 }
2174
2175 // store interleave
2176
v_store_interleave(uchar * ptr,const v_uint8x16 & a,const v_uint8x16 & b,hal::StoreMode=hal::STORE_UNALIGNED)2177 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2178 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2179 {
2180 v128_t v0 = wasm_unpacklo_i8x16(a.val, b.val);
2181 v128_t v1 = wasm_unpackhi_i8x16(a.val, b.val);
2182
2183 wasm_v128_store(ptr, v0);
2184 wasm_v128_store(ptr + 16, v1);
2185 }
2186
v_store_interleave(uchar * ptr,const v_uint8x16 & a,const v_uint8x16 & b,const v_uint8x16 & c,hal::StoreMode=hal::STORE_UNALIGNED)2187 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2188 const v_uint8x16& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2189 {
2190 v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5);
2191 v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 21,0,6,22,0,7,23,0,8,24,0,9,25,0,10,26);
2192 v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0);
2193
2194 v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15);
2195 v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15);
2196 v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31);
2197
2198 wasm_v128_store(ptr, t10);
2199 wasm_v128_store(ptr + 16, t11);
2200 wasm_v128_store(ptr + 32, t12);
2201 }
2202
v_store_interleave(uchar * ptr,const v_uint8x16 & a,const v_uint8x16 & b,const v_uint8x16 & c,const v_uint8x16 & d,hal::StoreMode=hal::STORE_UNALIGNED)2203 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2204 const v_uint8x16& c, const v_uint8x16& d,
2205 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2206 {
2207 // a0 a1 a2 a3 ....
2208 // b0 b1 b2 b3 ....
2209 // c0 c1 c2 c3 ....
2210 // d0 d1 d2 d3 ....
2211 v128_t u0 = wasm_unpacklo_i8x16(a.val, c.val); // a0 c0 a1 c1 ...
2212 v128_t u1 = wasm_unpackhi_i8x16(a.val, c.val); // a8 c8 a9 c9 ...
2213 v128_t u2 = wasm_unpacklo_i8x16(b.val, d.val); // b0 d0 b1 d1 ...
2214 v128_t u3 = wasm_unpackhi_i8x16(b.val, d.val); // b8 d8 b9 d9 ...
2215
2216 v128_t v0 = wasm_unpacklo_i8x16(u0, u2); // a0 b0 c0 d0 ...
2217 v128_t v1 = wasm_unpackhi_i8x16(u0, u2); // a4 b4 c4 d4 ...
2218 v128_t v2 = wasm_unpacklo_i8x16(u1, u3); // a8 b8 c8 d8 ...
2219 v128_t v3 = wasm_unpackhi_i8x16(u1, u3); // a12 b12 c12 d12 ...
2220
2221 wasm_v128_store(ptr, v0);
2222 wasm_v128_store(ptr + 16, v1);
2223 wasm_v128_store(ptr + 32, v2);
2224 wasm_v128_store(ptr + 48, v3);
2225 }
2226
v_store_interleave(ushort * ptr,const v_uint16x8 & a,const v_uint16x8 & b,hal::StoreMode=hal::STORE_UNALIGNED)2227 inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
2228 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2229 {
2230 v128_t v0 = wasm_unpacklo_i16x8(a.val, b.val);
2231 v128_t v1 = wasm_unpackhi_i16x8(a.val, b.val);
2232
2233 wasm_v128_store(ptr, v0);
2234 wasm_v128_store(ptr + 8, v1);
2235 }
2236
v_store_interleave(ushort * ptr,const v_uint16x8 & a,const v_uint16x8 & b,const v_uint16x8 & c,hal::StoreMode=hal::STORE_UNALIGNED)2237 inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
2238 const v_uint16x8& b, const v_uint16x8& c,
2239 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2240 {
2241 v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,16,17,0,0,2,3,18,19,0,0,4,5,20,21);
2242 v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 0,0,6,7,22,23,0,0,8,9,24,25,0,0,10,11);
2243 v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 26,27,0,0,12,13,28,29,0,0,14,15,30,31,0,0);
2244
2245 v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,16,17,6,7,8,9,18,19,12,13,14,15);
2246 v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 20,21,2,3,4,5,22,23,8,9,10,11,24,25,14,15);
2247 v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 0,1,26,27,4,5,6,7,28,29,10,11,12,13,30,31);
2248
2249 wasm_v128_store(ptr, t10);
2250 wasm_v128_store(ptr + 8, t11);
2251 wasm_v128_store(ptr + 16, t12);
2252 }
2253
v_store_interleave(ushort * ptr,const v_uint16x8 & a,const v_uint16x8 & b,const v_uint16x8 & c,const v_uint16x8 & d,hal::StoreMode=hal::STORE_UNALIGNED)2254 inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
2255 const v_uint16x8& c, const v_uint16x8& d,
2256 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2257 {
2258 // a0 a1 a2 a3 ....
2259 // b0 b1 b2 b3 ....
2260 // c0 c1 c2 c3 ....
2261 // d0 d1 d2 d3 ....
2262 v128_t u0 = wasm_unpacklo_i16x8(a.val, c.val); // a0 c0 a1 c1 ...
2263 v128_t u1 = wasm_unpackhi_i16x8(a.val, c.val); // a4 c4 a5 c5 ...
2264 v128_t u2 = wasm_unpacklo_i16x8(b.val, d.val); // b0 d0 b1 d1 ...
2265 v128_t u3 = wasm_unpackhi_i16x8(b.val, d.val); // b4 d4 b5 d5 ...
2266
2267 v128_t v0 = wasm_unpacklo_i16x8(u0, u2); // a0 b0 c0 d0 ...
2268 v128_t v1 = wasm_unpackhi_i16x8(u0, u2); // a2 b2 c2 d2 ...
2269 v128_t v2 = wasm_unpacklo_i16x8(u1, u3); // a4 b4 c4 d4 ...
2270 v128_t v3 = wasm_unpackhi_i16x8(u1, u3); // a6 b6 c6 d6 ...
2271
2272 wasm_v128_store(ptr, v0);
2273 wasm_v128_store(ptr + 8, v1);
2274 wasm_v128_store(ptr + 16, v2);
2275 wasm_v128_store(ptr + 24, v3);
2276 }
2277
v_store_interleave(unsigned * ptr,const v_uint32x4 & a,const v_uint32x4 & b,hal::StoreMode=hal::STORE_UNALIGNED)2278 inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2279 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2280 {
2281 v128_t v0 = wasm_unpacklo_i32x4(a.val, b.val);
2282 v128_t v1 = wasm_unpackhi_i32x4(a.val, b.val);
2283
2284 wasm_v128_store(ptr, v0);
2285 wasm_v128_store(ptr + 4, v1);
2286 }
2287
v_store_interleave(unsigned * ptr,const v_uint32x4 & a,const v_uint32x4 & b,const v_uint32x4 & c,hal::StoreMode=hal::STORE_UNALIGNED)2288 inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2289 const v_uint32x4& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2290 {
2291 v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,16,17,18,19,0,0,0,0,4,5,6,7);
2292 v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 20,21,22,23,0,0,0,0,8,9,10,11,24,25,26,27);
2293 v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,0,0,0,12,13,14,15,28,29,30,31,0,0,0,0);
2294
2295 v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,4,5,6,7,16,17,18,19,12,13,14,15);
2296 v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,1,2,3,20,21,22,23,8,9,10,11,12,13,14,15);
2297 v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 24,25,26,27,4,5,6,7,8,9,10,11,28,29,30,31);
2298
2299 wasm_v128_store(ptr, t10);
2300 wasm_v128_store(ptr + 4, t11);
2301 wasm_v128_store(ptr + 8, t12);
2302 }
2303
v_store_interleave(unsigned * ptr,const v_uint32x4 & a,const v_uint32x4 & b,const v_uint32x4 & c,const v_uint32x4 & d,hal::StoreMode=hal::STORE_UNALIGNED)2304 inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2305 const v_uint32x4& c, const v_uint32x4& d,
2306 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2307 {
2308 v_uint32x4 v0, v1, v2, v3;
2309 v_transpose4x4(a, b, c, d, v0, v1, v2, v3);
2310
2311 wasm_v128_store(ptr, v0.val);
2312 wasm_v128_store(ptr + 4, v1.val);
2313 wasm_v128_store(ptr + 8, v2.val);
2314 wasm_v128_store(ptr + 12, v3.val);
2315 }
2316
2317 // 2-channel, float only
v_store_interleave(float * ptr,const v_float32x4 & a,const v_float32x4 & b,hal::StoreMode=hal::STORE_UNALIGNED)2318 inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2319 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2320 {
2321 v128_t v0 = wasm_unpacklo_i32x4(a.val, b.val);
2322 v128_t v1 = wasm_unpackhi_i32x4(a.val, b.val);
2323
2324 wasm_v128_store(ptr, v0);
2325 wasm_v128_store(ptr + 4, v1);
2326 }
2327
v_store_interleave(float * ptr,const v_float32x4 & a,const v_float32x4 & b,const v_float32x4 & c,hal::StoreMode=hal::STORE_UNALIGNED)2328 inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2329 const v_float32x4& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2330 {
2331 v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,16,17,18,19,0,0,0,0,4,5,6,7);
2332 v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 20,21,22,23,0,0,0,0,8,9,10,11,24,25,26,27);
2333 v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,0,0,0,12,13,14,15,28,29,30,31,0,0,0,0);
2334
2335 v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,4,5,6,7,16,17,18,19,12,13,14,15);
2336 v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,1,2,3,20,21,22,23,8,9,10,11,12,13,14,15);
2337 v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 24,25,26,27,4,5,6,7,8,9,10,11,28,29,30,31);
2338
2339 wasm_v128_store(ptr, t10);
2340 wasm_v128_store(ptr + 4, t11);
2341 wasm_v128_store(ptr + 8, t12);
2342 }
2343
v_store_interleave(float * ptr,const v_float32x4 & a,const v_float32x4 & b,const v_float32x4 & c,const v_float32x4 & d,hal::StoreMode=hal::STORE_UNALIGNED)2344 inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2345 const v_float32x4& c, const v_float32x4& d,
2346 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2347 {
2348 v_float32x4 v0, v1, v2, v3;
2349 v_transpose4x4(a, b, c, d, v0, v1, v2, v3);
2350
2351 wasm_v128_store(ptr, v0.val);
2352 wasm_v128_store(ptr + 4, v1.val);
2353 wasm_v128_store(ptr + 8, v2.val);
2354 wasm_v128_store(ptr + 12, v3.val);
2355 }
2356
v_store_interleave(uint64 * ptr,const v_uint64x2 & a,const v_uint64x2 & b,hal::StoreMode=hal::STORE_UNALIGNED)2357 inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
2358 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2359 {
2360 v128_t v0 = wasm_unpacklo_i64x2(a.val, b.val);
2361 v128_t v1 = wasm_unpackhi_i64x2(a.val, b.val);
2362
2363 wasm_v128_store(ptr, v0);
2364 wasm_v128_store(ptr + 2, v1);
2365 }
2366
v_store_interleave(uint64 * ptr,const v_uint64x2 & a,const v_uint64x2 & b,const v_uint64x2 & c,hal::StoreMode=hal::STORE_UNALIGNED)2367 inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
2368 const v_uint64x2& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2369 {
2370 v128_t v0 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
2371 v128_t v1 = wasm_v8x16_shuffle(a.val, c.val, 16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15);
2372 v128_t v2 = wasm_v8x16_shuffle(b.val, c.val, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
2373
2374 wasm_v128_store(ptr, v0);
2375 wasm_v128_store(ptr + 2, v1);
2376 wasm_v128_store(ptr + 4, v2);
2377 }
2378
v_store_interleave(uint64 * ptr,const v_uint64x2 & a,const v_uint64x2 & b,const v_uint64x2 & c,const v_uint64x2 & d,hal::StoreMode=hal::STORE_UNALIGNED)2379 inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
2380 const v_uint64x2& c, const v_uint64x2& d,
2381 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2382 {
2383 v128_t v0 = wasm_unpacklo_i64x2(a.val, b.val);
2384 v128_t v1 = wasm_unpacklo_i64x2(c.val, d.val);
2385 v128_t v2 = wasm_unpackhi_i64x2(a.val, b.val);
2386 v128_t v3 = wasm_unpackhi_i64x2(c.val, d.val);
2387
2388 wasm_v128_store(ptr, v0);
2389 wasm_v128_store(ptr + 2, v1);
2390 wasm_v128_store(ptr + 4, v2);
2391 wasm_v128_store(ptr + 6, v3);
2392 }
2393
2394 #define OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
2395 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
2396 { \
2397 _Tpvec1 a1, b1; \
2398 v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
2399 a0 = v_reinterpret_as_##suffix0(a1); \
2400 b0 = v_reinterpret_as_##suffix0(b1); \
2401 } \
2402 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
2403 { \
2404 _Tpvec1 a1, b1, c1; \
2405 v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
2406 a0 = v_reinterpret_as_##suffix0(a1); \
2407 b0 = v_reinterpret_as_##suffix0(b1); \
2408 c0 = v_reinterpret_as_##suffix0(c1); \
2409 } \
2410 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
2411 { \
2412 _Tpvec1 a1, b1, c1, d1; \
2413 v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
2414 a0 = v_reinterpret_as_##suffix0(a1); \
2415 b0 = v_reinterpret_as_##suffix0(b1); \
2416 c0 = v_reinterpret_as_##suffix0(c1); \
2417 d0 = v_reinterpret_as_##suffix0(d1); \
2418 } \
2419 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2420 hal::StoreMode mode = hal::STORE_UNALIGNED ) \
2421 { \
2422 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2423 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2424 v_store_interleave((_Tp1*)ptr, a1, b1, mode); \
2425 } \
2426 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2427 const _Tpvec0& c0, hal::StoreMode mode = hal::STORE_UNALIGNED ) \
2428 { \
2429 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2430 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2431 _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2432 v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \
2433 } \
2434 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2435 const _Tpvec0& c0, const _Tpvec0& d0, \
2436 hal::StoreMode mode = hal::STORE_UNALIGNED ) \
2437 { \
2438 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2439 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2440 _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2441 _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
2442 v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
2443 }
2444
OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int8x16,schar,s8,v_uint8x16,uchar,u8)2445 OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
2446 OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
2447 OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
2448 OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int64x2, int64, s64, v_uint64x2, uint64, u64)
2449 OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, uint64, u64)
2450
2451 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
2452 {
2453 return v_float32x4(wasm_f32x4_convert_i32x4(a.val));
2454 }
2455
v_cvt_f32(const v_float64x2 & a)2456 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
2457 {
2458 double a_[2];
2459 wasm_v128_store(a_, a.val);
2460 float c_[4];
2461 c_[0] = (float)(a_[0]);
2462 c_[1] = (float)(a_[1]);
2463 c_[2] = 0;
2464 c_[3] = 0;
2465 return v_float32x4(wasm_v128_load(c_));
2466 }
2467
v_cvt_f32(const v_float64x2 & a,const v_float64x2 & b)2468 inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
2469 {
2470 double a_[2], b_[2];
2471 wasm_v128_store(a_, a.val);
2472 wasm_v128_store(b_, b.val);
2473 float c_[4];
2474 c_[0] = (float)(a_[0]);
2475 c_[1] = (float)(a_[1]);
2476 c_[2] = (float)(b_[0]);
2477 c_[3] = (float)(b_[1]);
2478 return v_float32x4(wasm_v128_load(c_));
2479 }
2480
v_cvt_f64(const v_int32x4 & a)2481 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
2482 {
2483 #ifdef __wasm_unimplemented_simd128__
2484 v128_t p = v128_cvti32x4_i64x2(a.val);
2485 return v_float64x2(wasm_f64x2_convert_i64x2(p));
2486 #else
2487 int a_[4];
2488 wasm_v128_store(a_, a.val);
2489 double c_[2];
2490 c_[0] = (double)(a_[0]);
2491 c_[1] = (double)(a_[1]);
2492 return v_float64x2(wasm_v128_load(c_));
2493 #endif
2494 }
2495
v_cvt_f64_high(const v_int32x4 & a)2496 inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
2497 {
2498 #ifdef __wasm_unimplemented_simd128__
2499 v128_t p = v128_cvti32x4_i64x2_high(a.val);
2500 return v_float64x2(wasm_f64x2_convert_i64x2(p));
2501 #else
2502 int a_[4];
2503 wasm_v128_store(a_, a.val);
2504 double c_[2];
2505 c_[0] = (double)(a_[2]);
2506 c_[1] = (double)(a_[3]);
2507 return v_float64x2(wasm_v128_load(c_));
2508 #endif
2509 }
2510
v_cvt_f64(const v_float32x4 & a)2511 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
2512 {
2513 float a_[4];
2514 wasm_v128_store(a_, a.val);
2515 double c_[2];
2516 c_[0] = (double)(a_[0]);
2517 c_[1] = (double)(a_[1]);
2518 return v_float64x2(wasm_v128_load(c_));
2519 }
2520
v_cvt_f64_high(const v_float32x4 & a)2521 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
2522 {
2523 float a_[4];
2524 wasm_v128_store(a_, a.val);
2525 double c_[2];
2526 c_[0] = (double)(a_[2]);
2527 c_[1] = (double)(a_[3]);
2528 return v_float64x2(wasm_v128_load(c_));
2529 }
2530
v_cvt_f64(const v_int64x2 & a)2531 inline v_float64x2 v_cvt_f64(const v_int64x2& a)
2532 {
2533 #ifdef __wasm_unimplemented_simd128__
2534 return v_float64x2(wasm_f64x2_convert_i64x2(a.val));
2535 #else
2536 int64 a_[2];
2537 wasm_v128_store(a_, a.val);
2538 double c_[2];
2539 c_[0] = (double)(a_[0]);
2540 c_[1] = (double)(a_[1]);
2541 return v_float64x2(wasm_v128_load(c_));
2542 #endif
2543 }
2544
2545 ////////////// Lookup table access ////////////////////
2546
v_lut(const schar * tab,const int * idx)2547 inline v_int8x16 v_lut(const schar* tab, const int* idx)
2548 {
2549 return v_int8x16(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
2550 tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]);
2551 }
v_lut_pairs(const schar * tab,const int * idx)2552 inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
2553 {
2554 return v_int8x16(tab[idx[0]], tab[idx[0]+1], tab[idx[1]], tab[idx[1]+1], tab[idx[2]], tab[idx[2]+1], tab[idx[3]], tab[idx[3]+1],
2555 tab[idx[4]], tab[idx[4]+1], tab[idx[5]], tab[idx[5]+1], tab[idx[6]], tab[idx[6]+1], tab[idx[7]], tab[idx[7]+1]);
2556 }
v_lut_quads(const schar * tab,const int * idx)2557 inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
2558 {
2559 return v_int8x16(tab[idx[0]], tab[idx[0]+1], tab[idx[0]+2], tab[idx[0]+3], tab[idx[1]], tab[idx[1]+1], tab[idx[1]+2], tab[idx[1]+3],
2560 tab[idx[2]], tab[idx[2]+1], tab[idx[2]+2], tab[idx[2]+3], tab[idx[3]], tab[idx[3]+1], tab[idx[3]+2], tab[idx[3]+3]);
2561 }
v_lut(const uchar * tab,const int * idx)2562 inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((const schar *)tab, idx)); }
v_lut_pairs(const uchar * tab,const int * idx)2563 inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((const schar *)tab, idx)); }
v_lut_quads(const uchar * tab,const int * idx)2564 inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((const schar *)tab, idx)); }
2565
v_lut(const short * tab,const int * idx)2566 inline v_int16x8 v_lut(const short* tab, const int* idx)
2567 {
2568 return v_int16x8(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
2569 tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]);
2570 }
v_lut_pairs(const short * tab,const int * idx)2571 inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
2572 {
2573 return v_int16x8(tab[idx[0]], tab[idx[0]+1], tab[idx[1]], tab[idx[1]+1],
2574 tab[idx[2]], tab[idx[2]+1], tab[idx[3]], tab[idx[3]+1]);
2575 }
v_lut_quads(const short * tab,const int * idx)2576 inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
2577 {
2578 return v_int16x8(tab[idx[0]], tab[idx[0]+1], tab[idx[0]+2], tab[idx[0]+3],
2579 tab[idx[1]], tab[idx[1]+1], tab[idx[1]+2], tab[idx[1]+3]);
2580 }
v_lut(const ushort * tab,const int * idx)2581 inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((const short *)tab, idx)); }
v_lut_pairs(const ushort * tab,const int * idx)2582 inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((const short *)tab, idx)); }
v_lut_quads(const ushort * tab,const int * idx)2583 inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((const short *)tab, idx)); }
2584
v_lut(const int * tab,const int * idx)2585 inline v_int32x4 v_lut(const int* tab, const int* idx)
2586 {
2587 return v_int32x4(tab[idx[0]], tab[idx[1]],
2588 tab[idx[2]], tab[idx[3]]);
2589 }
v_lut_pairs(const int * tab,const int * idx)2590 inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
2591 {
2592 return v_int32x4(tab[idx[0]], tab[idx[0]+1],
2593 tab[idx[1]], tab[idx[1]+1]);
2594 }
v_lut_quads(const int * tab,const int * idx)2595 inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
2596 {
2597 return v_int32x4(wasm_v128_load(tab + idx[0]));
2598 }
v_lut(const unsigned * tab,const int * idx)2599 inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int *)tab, idx)); }
v_lut_pairs(const unsigned * tab,const int * idx)2600 inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int *)tab, idx)); }
v_lut_quads(const unsigned * tab,const int * idx)2601 inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int *)tab, idx)); }
2602
v_lut(const int64_t * tab,const int * idx)2603 inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
2604 {
2605 return v_int64x2(tab[idx[0]], tab[idx[1]]);
2606 }
v_lut_pairs(const int64_t * tab,const int * idx)2607 inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
2608 {
2609 return v_int64x2(wasm_v128_load(tab + idx[0]));
2610 }
v_lut(const uint64_t * tab,const int * idx)2611 inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
v_lut_pairs(const uint64_t * tab,const int * idx)2612 inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
2613
v_lut(const float * tab,const int * idx)2614 inline v_float32x4 v_lut(const float* tab, const int* idx)
2615 {
2616 return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
2617 }
v_lut_pairs(const float * tab,const int * idx)2618 inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_pairs((const int *)tab, idx)); }
v_lut_quads(const float * tab,const int * idx)2619 inline v_float32x4 v_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_quads((const int *)tab, idx)); }
2620
v_lut(const double * tab,const int * idx)2621 inline v_float64x2 v_lut(const double* tab, const int* idx)
2622 {
2623 return v_float64x2(tab[idx[0]], tab[idx[1]]);
2624 }
v_lut_pairs(const double * tab,const int * idx)2625 inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
2626 {
2627 return v_float64x2(wasm_v128_load(tab + idx[0]));
2628 }
2629
v_lut(const int * tab,const v_int32x4 & idxvec)2630 inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
2631 {
2632 return v_int32x4(tab[wasm_i32x4_extract_lane(idxvec.val, 0)],
2633 tab[wasm_i32x4_extract_lane(idxvec.val, 1)],
2634 tab[wasm_i32x4_extract_lane(idxvec.val, 2)],
2635 tab[wasm_i32x4_extract_lane(idxvec.val, 3)]);
2636 }
2637
v_lut(const unsigned * tab,const v_int32x4 & idxvec)2638 inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
2639 {
2640 return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
2641 }
2642
v_lut(const float * tab,const v_int32x4 & idxvec)2643 inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
2644 {
2645 return v_float32x4(tab[wasm_i32x4_extract_lane(idxvec.val, 0)],
2646 tab[wasm_i32x4_extract_lane(idxvec.val, 1)],
2647 tab[wasm_i32x4_extract_lane(idxvec.val, 2)],
2648 tab[wasm_i32x4_extract_lane(idxvec.val, 3)]);
2649 }
2650
v_lut(const double * tab,const v_int32x4 & idxvec)2651 inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
2652 {
2653 return v_float64x2(tab[wasm_i32x4_extract_lane(idxvec.val, 0)],
2654 tab[wasm_i32x4_extract_lane(idxvec.val, 1)]);
2655 }
2656
2657 // loads pairs from the table and deinterleaves them, e.g. returns:
2658 // x = (tab[idxvec[0], tab[idxvec[1]], tab[idxvec[2]], tab[idxvec[3]]),
2659 // y = (tab[idxvec[0]+1], tab[idxvec[1]+1], tab[idxvec[2]+1], tab[idxvec[3]+1])
2660 // note that the indices are float's indices, not the float-pair indices.
2661 // in theory, this function can be used to implement bilinear interpolation,
2662 // when idxvec are the offsets within the image.
v_lut_deinterleave(const float * tab,const v_int32x4 & idxvec,v_float32x4 & x,v_float32x4 & y)2663 inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
2664 {
2665 x = v_float32x4(tab[wasm_i32x4_extract_lane(idxvec.val, 0)],
2666 tab[wasm_i32x4_extract_lane(idxvec.val, 1)],
2667 tab[wasm_i32x4_extract_lane(idxvec.val, 2)],
2668 tab[wasm_i32x4_extract_lane(idxvec.val, 3)]);
2669 y = v_float32x4(tab[wasm_i32x4_extract_lane(idxvec.val, 0)+1],
2670 tab[wasm_i32x4_extract_lane(idxvec.val, 1)+1],
2671 tab[wasm_i32x4_extract_lane(idxvec.val, 2)+1],
2672 tab[wasm_i32x4_extract_lane(idxvec.val, 3)+1]);
2673 }
2674
v_lut_deinterleave(const double * tab,const v_int32x4 & idxvec,v_float64x2 & x,v_float64x2 & y)2675 inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
2676 {
2677 v128_t xy0 = wasm_v128_load(tab + wasm_i32x4_extract_lane(idxvec.val, 0));
2678 v128_t xy1 = wasm_v128_load(tab + wasm_i32x4_extract_lane(idxvec.val, 1));
2679 x.val = wasm_unpacklo_i64x2(xy0, xy1);
2680 y.val = wasm_unpacklo_i64x2(xy0, xy1);
2681 }
2682
v_interleave_pairs(const v_int8x16 & vec)2683 inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
2684 {
2685 return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15));
2686 }
v_interleave_pairs(const v_uint8x16 & vec)2687 inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
v_interleave_quads(const v_int8x16 & vec)2688 inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
2689 {
2690 return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,4,1,5,2,6,3,7,8,12,9,13,10,14,11,15));
2691 }
v_interleave_quads(const v_uint8x16 & vec)2692 inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
2693
v_interleave_pairs(const v_int16x8 & vec)2694 inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
2695 {
2696 return v_int16x8(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15));
2697 }
v_interleave_pairs(const v_uint16x8 & vec)2698 inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
v_interleave_quads(const v_int16x8 & vec)2699 inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
2700 {
2701 return v_int16x8(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15));
2702 }
v_interleave_quads(const v_uint16x8 & vec)2703 inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
2704
v_interleave_pairs(const v_int32x4 & vec)2705 inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
2706 {
2707 return v_int32x4(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,8,9,10,11,4,5,6,7,12,13,14,15));
2708 }
v_interleave_pairs(const v_uint32x4 & vec)2709 inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
v_interleave_pairs(const v_float32x4 & vec)2710 inline v_float32x4 v_interleave_pairs(const v_float32x4& vec)
2711 {
2712 return v_float32x4(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,8,9,10,11,4,5,6,7,12,13,14,15));
2713 }
2714
v_pack_triplets(const v_int8x16 & vec)2715 inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
2716 {
2717 return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,4,5,6,8,9,10,12,13,14,16,16,16,16));
2718 }
v_pack_triplets(const v_uint8x16 & vec)2719 inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
2720
v_pack_triplets(const v_int16x8 & vec)2721 inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
2722 {
2723 return v_int16x8(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,4,5,8,9,10,11,12,13,14,15,6,7));
2724 }
v_pack_triplets(const v_uint16x8 & vec)2725 inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
2726
v_pack_triplets(const v_int32x4 & vec)2727 inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
v_pack_triplets(const v_uint32x4 & vec)2728 inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
v_pack_triplets(const v_float32x4 & vec)2729 inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
2730
2731 template<int i, typename _Tp>
v_extract_n(const _Tp & a)2732 inline typename _Tp::lane_type v_extract_n(const _Tp& a)
2733 {
2734 return v_rotate_right<i>(a).get0();
2735 }
2736
2737 template<int i>
v_broadcast_element(const v_uint32x4 & a)2738 inline v_uint32x4 v_broadcast_element(const v_uint32x4& a)
2739 {
2740 return v_setall_u32(v_extract_n<i>(a));
2741 }
2742 template<int i>
v_broadcast_element(const v_int32x4 & a)2743 inline v_int32x4 v_broadcast_element(const v_int32x4& a)
2744 {
2745 return v_setall_s32(v_extract_n<i>(a));
2746 }
2747 template<int i>
v_broadcast_element(const v_float32x4 & a)2748 inline v_float32x4 v_broadcast_element(const v_float32x4& a)
2749 {
2750 return v_setall_f32(v_extract_n<i>(a));
2751 }
2752
2753
2754 ////////////// FP16 support ///////////////////////////
2755
v_load_expand(const float16_t * ptr)2756 inline v_float32x4 v_load_expand(const float16_t* ptr)
2757 {
2758 float a[4];
2759 for (int i = 0; i < 4; i++)
2760 a[i] = ptr[i];
2761 return v_float32x4(wasm_v128_load(a));
2762 }
2763
v_pack_store(float16_t * ptr,const v_float32x4 & v)2764 inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
2765 {
2766 double v_[4];
2767 wasm_v128_store(v_, v.val);
2768 ptr[0] = float16_t(v_[0]);
2769 ptr[1] = float16_t(v_[1]);
2770 ptr[2] = float16_t(v_[2]);
2771 ptr[3] = float16_t(v_[3]);
2772 }
2773
v_cleanup()2774 inline void v_cleanup() {}
2775
2776 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
2777
2778 //! @endcond
2779
2780 }
2781
2782 #endif
2783