// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_HAL_INTRIN_WASM_HPP
#define OPENCV_HAL_INTRIN_WASM_HPP

#include <limits>
#include <cstring>
#include <algorithm>
#include "opencv2/core/saturate.hpp"

#define CV_SIMD128 1
#define CV_SIMD128_64F 0 // All f64 implementations currently use the fallback, so this is disabled.
#define CV_SIMD128_FP16 0

namespace cv
{

//! @cond IGNORED

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

#if (__EMSCRIPTEN_major__ * 1000000 + __EMSCRIPTEN_minor__ * 1000 + __EMSCRIPTEN_tiny__) < (1038046)
// handle renames: https://github.com/emscripten-core/emscripten/pull/9440 (https://github.com/emscripten-core/emscripten/commit/755d5b46cb84d0aa120c10981b11d05646c29673)
#define wasm_i32x4_trunc_saturate_f32x4 wasm_trunc_saturate_i32x4_f32x4
#define wasm_u32x4_trunc_saturate_f32x4 wasm_trunc_saturate_u32x4_f32x4
#define wasm_i64x2_trunc_saturate_f64x2 wasm_trunc_saturate_i64x2_f64x2
#define wasm_u64x2_trunc_saturate_f64x2 wasm_trunc_saturate_u64x2_f64x2
#define wasm_f32x4_convert_i32x4 wasm_convert_f32x4_i32x4
#define wasm_f32x4_convert_u32x4 wasm_convert_f32x4_u32x4
#define wasm_f64x2_convert_i64x2 wasm_convert_f64x2_i64x2
#define wasm_f64x2_convert_u64x2 wasm_convert_f64x2_u64x2
#endif // COMPATIBILITY: <1.38.46

///////// Types ///////////
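// Each v_<type>x<N> struct below wraps a single 128-bit WebAssembly SIMD register (v128_t)
// together with its lane type and lane count. The element-wise constructors spill the
// arguments into a small stack array and reload it with wasm_v128_load; get0() returns lane 0.
//
// Illustrative usage sketch (hypothetical values; uses only types and operators defined
// further down in this header):
//   v_uint8x16 a = v_setall_u8(200), b = v_setall_u8(100);
//   v_uint8x16 c = a + b;  // operator+ maps to wasm_u8x16_add_saturate, so every lane clamps to 255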

struct v_uint8x16
{
    typedef uchar lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 16 };

    v_uint8x16() {}
    explicit v_uint8x16(v128_t v) : val(v) {}
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
            uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    {
        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = wasm_v128_load(v);
    }

    uchar get0() const
    {
        return (uchar)wasm_i8x16_extract_lane(val, 0);
    }

    v128_t val;
};

struct v_int8x16
{
    typedef schar lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 16 };

    v_int8x16() {}
    explicit v_int8x16(v128_t v) : val(v) {}
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
            schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    {
        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = wasm_v128_load(v);
    }

    schar get0() const
    {
        return wasm_i8x16_extract_lane(val, 0);
    }

    v128_t val;
};

struct v_uint16x8
{
    typedef ushort lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 8 };

    v_uint16x8() {}
    explicit v_uint16x8(v128_t v) : val(v) {}
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = wasm_v128_load(v);
    }

    ushort get0() const
    {
        return (ushort)wasm_i16x8_extract_lane(val, 0);    // wasm_u16x8_extract_lane() is not implemented yet
    }

    v128_t val;
};

struct v_int16x8
{
    typedef short lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 8 };

    v_int16x8() {}
    explicit v_int16x8(v128_t v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = wasm_v128_load(v);
    }

    short get0() const
    {
        return wasm_i16x8_extract_lane(val, 0);
    }

    v128_t val;
};

struct v_uint32x4
{
    typedef unsigned lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 4 };

    v_uint32x4() {}
    explicit v_uint32x4(v128_t v) : val(v) {}
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    {
        unsigned v[] = {v0, v1, v2, v3};
        val = wasm_v128_load(v);
    }

    unsigned get0() const
    {
        return (unsigned)wasm_i32x4_extract_lane(val, 0);
    }

    v128_t val;
};

struct v_int32x4
{
    typedef int lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 4 };

    v_int32x4() {}
    explicit v_int32x4(v128_t v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)
    {
        int v[] = {v0, v1, v2, v3};
        val = wasm_v128_load(v);
    }

    int get0() const
    {
        return wasm_i32x4_extract_lane(val, 0);
    }

    v128_t val;
};

struct v_float32x4
{
    typedef float lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 4 };

    v_float32x4() {}
    explicit v_float32x4(v128_t v) : val(v) {}
    v_float32x4(float v0, float v1, float v2, float v3)
    {
        float v[] = {v0, v1, v2, v3};
        val = wasm_v128_load(v);
    }

    float get0() const
    {
        return wasm_f32x4_extract_lane(val, 0);
    }

    v128_t val;
};

struct v_uint64x2
{
    typedef uint64 lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 2 };

    v_uint64x2() {}
    explicit v_uint64x2(v128_t v) : val(v) {}
    v_uint64x2(uint64 v0, uint64 v1)
    {
        uint64 v[] = {v0, v1};
        val = wasm_v128_load(v);
    }

    uint64 get0() const
    {
        return (uint64)wasm_i64x2_extract_lane(val, 0);
    }

    v128_t val;
};

struct v_int64x2
{
    typedef int64 lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 2 };

    v_int64x2() {}
    explicit v_int64x2(v128_t v) : val(v) {}
    v_int64x2(int64 v0, int64 v1)
    {
        int64 v[] = {v0, v1};
        val = wasm_v128_load(v);
    }

    int64 get0() const
    {
        return wasm_i64x2_extract_lane(val, 0);
    }

    v128_t val;
};

struct v_float64x2
{
    typedef double lane_type;
    typedef v128_t vector_type;
    enum { nlanes = 2 };

    v_float64x2() {}
    explicit v_float64x2(v128_t v) : val(v) {}
    v_float64x2(double v0, double v1)
    {
        double v[] = {v0, v1};
        val = wasm_v128_load(v);
    }

    double get0() const
    {
        return wasm_f64x2_extract_lane(val, 0);
    }

    v128_t val;
};

namespace
{
#define OPENCV_HAL_IMPL_REINTERPRET_INT(ft, tt) \
inline tt reinterpret_int(ft x) { union { ft l; tt i; } v; v.l = x; return v.i; }
OPENCV_HAL_IMPL_REINTERPRET_INT(uchar, schar)
OPENCV_HAL_IMPL_REINTERPRET_INT(schar, schar)
OPENCV_HAL_IMPL_REINTERPRET_INT(ushort, short)
OPENCV_HAL_IMPL_REINTERPRET_INT(short, short)
OPENCV_HAL_IMPL_REINTERPRET_INT(unsigned, int)
OPENCV_HAL_IMPL_REINTERPRET_INT(int, int)
OPENCV_HAL_IMPL_REINTERPRET_INT(float, int)
OPENCV_HAL_IMPL_REINTERPRET_INT(uint64, int64)
OPENCV_HAL_IMPL_REINTERPRET_INT(int64, int64)
OPENCV_HAL_IMPL_REINTERPRET_INT(double, int64)

static const unsigned char popCountTable[] =
{
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};
}  // namespace

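// SSE-style interleave helpers: WASM SIMD has no unpacklo/unpackhi instructions, so they are
// emulated with a single byte shuffle that interleaves the low (or high) halves of the two
// operands. Shuffle indices 0-15 address bytes of `a`, indices 16-31 address bytes of `b`.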
static v128_t wasm_unpacklo_i8x16(v128_t a, v128_t b) {
    return wasm_v8x16_shuffle(a, b, 0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23);
}

static v128_t wasm_unpacklo_i16x8(v128_t a, v128_t b) {
    return wasm_v8x16_shuffle(a, b, 0,1,16,17,2,3,18,19,4,5,20,21,6,7,22,23);
}

static v128_t wasm_unpacklo_i32x4(v128_t a, v128_t b) {
    return wasm_v8x16_shuffle(a, b, 0,1,2,3,16,17,18,19,4,5,6,7,20,21,22,23);
}

static v128_t wasm_unpacklo_i64x2(v128_t a, v128_t b) {
    return wasm_v8x16_shuffle(a, b, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
}

static v128_t wasm_unpackhi_i8x16(v128_t a, v128_t b) {
    return wasm_v8x16_shuffle(a, b, 8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31);
}

static v128_t wasm_unpackhi_i16x8(v128_t a, v128_t b) {
    return wasm_v8x16_shuffle(a, b, 8,9,24,25,10,11,26,27,12,13,28,29,14,15,30,31);
}

static v128_t wasm_unpackhi_i32x4(v128_t a, v128_t b) {
    return wasm_v8x16_shuffle(a, b, 8,9,10,11,24,25,26,27,12,13,14,15,28,29,30,31);
}

static v128_t wasm_unpackhi_i64x2(v128_t a, v128_t b) {
    return wasm_v8x16_shuffle(a, b, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
}

/** Convert **/
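// Widening conversions: unsigned lanes are widened by interleaving with a zero vector
// (zero extension); signed lanes are widened by interleaving a value with itself and then
// applying an arithmetic right shift, which fills the upper half with the sign bit.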
// 8 >> 16
inline v128_t v128_cvtu8x16_i16x8(const v128_t& a)
{
    const v128_t z = wasm_i8x16_splat(0);
    return wasm_unpacklo_i8x16(a, z);
}
inline v128_t v128_cvti8x16_i16x8(const v128_t& a)
{ return wasm_i16x8_shr(wasm_unpacklo_i8x16(a, a), 8); }
// 8 >> 32
inline v128_t v128_cvtu8x16_i32x4(const v128_t& a)
{
    const v128_t z = wasm_i8x16_splat(0);
    return wasm_unpacklo_i16x8(wasm_unpacklo_i8x16(a, z), z);
}
inline v128_t v128_cvti8x16_i32x4(const v128_t& a)
{
    v128_t r = wasm_unpacklo_i8x16(a, a);
    r = wasm_unpacklo_i8x16(r, r);
    return wasm_i32x4_shr(r, 24);
}
// 16 >> 32
inline v128_t v128_cvtu16x8_i32x4(const v128_t& a)
{
    const v128_t z = wasm_i8x16_splat(0);
    return wasm_unpacklo_i16x8(a, z);
}
inline v128_t v128_cvti16x8_i32x4(const v128_t& a)
{ return wasm_i32x4_shr(wasm_unpacklo_i16x8(a, a), 16); }
// 32 >> 64
inline v128_t v128_cvtu32x4_i64x2(const v128_t& a)
{
    const v128_t z = wasm_i8x16_splat(0);
    return wasm_unpacklo_i32x4(a, z);
}
inline v128_t v128_cvti32x4_i64x2(const v128_t& a)
{ return wasm_unpacklo_i32x4(a, wasm_i32x4_shr(a, 31)); }

// 16 << 8
inline v128_t v128_cvtu8x16_i16x8_high(const v128_t& a)
{
    const v128_t z = wasm_i8x16_splat(0);
    return wasm_unpackhi_i8x16(a, z);
}
inline v128_t v128_cvti8x16_i16x8_high(const v128_t& a)
{ return wasm_i16x8_shr(wasm_unpackhi_i8x16(a, a), 8); }
// 32 << 16
inline v128_t v128_cvtu16x8_i32x4_high(const v128_t& a)
{
    const v128_t z = wasm_i8x16_splat(0);
    return wasm_unpackhi_i16x8(a, z);
}
inline v128_t v128_cvti16x8_i32x4_high(const v128_t& a)
{ return wasm_i32x4_shr(wasm_unpackhi_i16x8(a, a), 16); }
// 64 << 32
inline v128_t v128_cvtu32x4_i64x2_high(const v128_t& a)
{
    const v128_t z = wasm_i8x16_splat(0);
    return wasm_unpackhi_i32x4(a, z);
}
inline v128_t v128_cvti32x4_i64x2_high(const v128_t& a)
{ return wasm_unpackhi_i32x4(a, wasm_i32x4_shr(a, 31)); }

#define OPENCV_HAL_IMPL_WASM_INITVEC(_Tpvec, _Tp, suffix, zsuffix, _Tps) \
inline _Tpvec v_setzero_##suffix() { return _Tpvec(wasm_##zsuffix##_splat((_Tps)0)); } \
inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(wasm_##zsuffix##_splat((_Tps)v)); } \
template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
{ return _Tpvec(a.val); }

OPENCV_HAL_IMPL_WASM_INITVEC(v_uint8x16, uchar, u8, i8x16, schar)
OPENCV_HAL_IMPL_WASM_INITVEC(v_int8x16, schar, s8, i8x16, schar)
OPENCV_HAL_IMPL_WASM_INITVEC(v_uint16x8, ushort, u16, i16x8, short)
OPENCV_HAL_IMPL_WASM_INITVEC(v_int16x8, short, s16, i16x8, short)
OPENCV_HAL_IMPL_WASM_INITVEC(v_uint32x4, unsigned, u32, i32x4, int)
OPENCV_HAL_IMPL_WASM_INITVEC(v_int32x4, int, s32, i32x4, int)
OPENCV_HAL_IMPL_WASM_INITVEC(v_float32x4, float, f32, f32x4, float)
OPENCV_HAL_IMPL_WASM_INITVEC(v_uint64x2, uint64, u64, i64x2, int64)
OPENCV_HAL_IMPL_WASM_INITVEC(v_int64x2, int64, s64, i64x2, int64)
OPENCV_HAL_IMPL_WASM_INITVEC(v_float64x2, double, f64, f64x2, double)

//////////////// PACK ///////////////
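// Saturating narrowing is emulated in two steps: clamp each wide lane to the destination
// range with wasm_v128_bitselect driven by a comparison mask, then gather the low bytes of
// every lane into one register with a byte shuffle.
//
// Illustrative sketch (hypothetical values):
//   v_uint16x8 w = v_setall_u16(300);
//   v_uint8x16 p = v_pack(w, w);  // 300 > 255, so every output lane saturates to 255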
inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
{
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u16x8_gt(a.val, maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u16x8_gt(b.val, maxval));
    return v_uint8x16(wasm_v8x16_shuffle(a1, b1, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
}
inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
{
    v128_t maxval = wasm_i16x8_splat(127);
    v128_t minval = wasm_i16x8_splat(-128);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i16x8_gt(b.val, maxval));
    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval));
    v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i16x8_lt(b1, minval));
    return v_int8x16(wasm_v8x16_shuffle(a2, b2, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
}
inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
{
    v128_t maxval = wasm_i32x4_splat(65535);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u32x4_gt(a.val, maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u32x4_gt(b.val, maxval));
    return v_uint16x8(wasm_v8x16_shuffle(a1, b1, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
}
inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)
{
    v128_t maxval = wasm_i32x4_splat(32767);
    v128_t minval = wasm_i32x4_splat(-32768);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i32x4_gt(b.val, maxval));
    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval));
    v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i32x4_lt(b1, minval));
    return v_int16x8(wasm_v8x16_shuffle(a2, b2, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
}
inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
{
    return v_uint32x4(wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
}
inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
{
    return v_int32x4(wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
}
inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)
{
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t minval = wasm_i16x8_splat(0);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i16x8_gt(b.val, maxval));
    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval));
    v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i16x8_lt(b1, minval));
    return v_uint8x16(wasm_v8x16_shuffle(a2, b2, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
}
inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
{
    v128_t maxval = wasm_i32x4_splat(65535);
    v128_t minval = wasm_i32x4_splat(0);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i32x4_gt(b.val, maxval));
    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval));
    v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i32x4_lt(b1, minval));
    return v_uint16x8(wasm_v8x16_shuffle(a2, b2, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
}

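// v_rshr_pack<n>: pack with rounding right shift. Each lane is biased by 2^(n-1), shifted
// right by the template parameter n, clamped to the destination range and then narrowed with
// the same shuffle-based packing as above.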
template<int n>
inline v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
{
    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
    v128_t a1 = wasm_u16x8_shr(wasm_i16x8_add(a.val, delta), n);
    v128_t b1 = wasm_u16x8_shr(wasm_i16x8_add(b.val, delta), n);
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u16x8_gt(a1, maxval));
    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_u16x8_gt(b1, maxval));
    return v_uint8x16(wasm_v8x16_shuffle(a2, b2, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
}
template<int n>
inline v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
{
    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
    v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add(a.val, delta), n);
    v128_t b1 = wasm_i16x8_shr(wasm_i16x8_add(b.val, delta), n);
    v128_t maxval = wasm_i16x8_splat(127);
    v128_t minval = wasm_i16x8_splat(-128);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i16x8_gt(b1, maxval));
    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));
    v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i16x8_lt(b1, minval));
    return v_int8x16(wasm_v8x16_shuffle(a3, b3, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
}
template<int n>
inline v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
{
    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
    v128_t a1 = wasm_u32x4_shr(wasm_i32x4_add(a.val, delta), n);
    v128_t b1 = wasm_u32x4_shr(wasm_i32x4_add(b.val, delta), n);
    v128_t maxval = wasm_i32x4_splat(65535);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u32x4_gt(a1, maxval));
    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_u32x4_gt(b1, maxval));
    return v_uint16x8(wasm_v8x16_shuffle(a2, b2, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
}
template<int n>
inline v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
{
    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
    v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
    v128_t b1 = wasm_i32x4_shr(wasm_i32x4_add(b.val, delta), n);
    v128_t maxval = wasm_i32x4_splat(32767);
    v128_t minval = wasm_i32x4_splat(-32768);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i32x4_gt(b1, maxval));
    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
    v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i32x4_lt(b1, minval));
    return v_int16x8(wasm_v8x16_shuffle(a3, b3, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
}
template<int n>
inline v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
{
    v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1)));
    v128_t a1 = wasm_u64x2_shr(wasm_i64x2_add(a.val, delta), n);
    v128_t b1 = wasm_u64x2_shr(wasm_i64x2_add(b.val, delta), n);
    return v_uint32x4(wasm_v8x16_shuffle(a1, b1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
}
template<int n>
inline v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
{
    v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1)));
    v128_t a1 = wasm_i64x2_shr(wasm_i64x2_add(a.val, delta), n);
    v128_t b1 = wasm_i64x2_shr(wasm_i64x2_add(b.val, delta), n);
    return v_int32x4(wasm_v8x16_shuffle(a1, b1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
}
template<int n>
inline v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
{
    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
    v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add(a.val, delta), n);
    v128_t b1 = wasm_i16x8_shr(wasm_i16x8_add(b.val, delta), n);
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t minval = wasm_i16x8_splat(0);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i16x8_gt(b1, maxval));
    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));
    v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i16x8_lt(b1, minval));
    return v_uint8x16(wasm_v8x16_shuffle(a3, b3, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
}
template<int n>
inline v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
{
    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
    v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
    v128_t b1 = wasm_i32x4_shr(wasm_i32x4_add(b.val, delta), n);
    v128_t maxval = wasm_i32x4_splat(65535);
    v128_t minval = wasm_i32x4_splat(0);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i32x4_gt(b1, maxval));
    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
    v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i32x4_lt(b1, minval));
    return v_uint16x8(wasm_v8x16_shuffle(a3, b3, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
}

inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
{
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u16x8_gt(a.val, maxval));
    v128_t r = wasm_v8x16_shuffle(a1, a1, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
    uchar t_ptr[16];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<8; ++i) {
        ptr[i] = t_ptr[i];
    }
}
inline void v_pack_store(schar* ptr, const v_int16x8& a)
{
    v128_t maxval = wasm_i16x8_splat(127);
    v128_t minval = wasm_i16x8_splat(-128);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval));
    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval));
    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
    schar t_ptr[16];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<8; ++i) {
        ptr[i] = t_ptr[i];
    }
}
inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
{
    v128_t maxval = wasm_i32x4_splat(65535);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u32x4_gt(a.val, maxval));
    v128_t r = wasm_v8x16_shuffle(a1, a1, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
    ushort t_ptr[8];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<4; ++i) {
        ptr[i] = t_ptr[i];
    }
}
inline void v_pack_store(short* ptr, const v_int32x4& a)
{
    v128_t maxval = wasm_i32x4_splat(32767);
    v128_t minval = wasm_i32x4_splat(-32768);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval));
    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval));
    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
    short t_ptr[8];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<4; ++i) {
        ptr[i] = t_ptr[i];
    }
}
inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
{
    v128_t r = wasm_v8x16_shuffle(a.val, a.val, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
    unsigned t_ptr[4];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<2; ++i) {
        ptr[i] = t_ptr[i];
    }
}
inline void v_pack_store(int* ptr, const v_int64x2& a)
{
    v128_t r = wasm_v8x16_shuffle(a.val, a.val, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
    int t_ptr[4];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<2; ++i) {
        ptr[i] = t_ptr[i];
    }
}
inline void v_pack_u_store(uchar* ptr, const v_int16x8& a)
{
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t minval = wasm_i16x8_splat(0);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval));
    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval));
    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
    uchar t_ptr[16];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<8; ++i) {
        ptr[i] = t_ptr[i];
    }
}
inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
{
    v128_t maxval = wasm_i32x4_splat(65535);
    v128_t minval = wasm_i32x4_splat(0);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval));
    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval));
    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
    ushort t_ptr[8];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<4; ++i) {
        ptr[i] = t_ptr[i];
    }
}

template<int n>
inline void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
{
    v128_t delta = wasm_i16x8_splat((short)(1 << (n-1)));
    v128_t a1 = wasm_u16x8_shr(wasm_i16x8_add(a.val, delta), n);
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u16x8_gt(a1, maxval));
    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
    uchar t_ptr[16];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<8; ++i) {
        ptr[i] = t_ptr[i];
    }
}
template<int n>
inline void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
{
    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
    v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add(a.val, delta), n);
    v128_t maxval = wasm_i16x8_splat(127);
    v128_t minval = wasm_i16x8_splat(-128);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));
    v128_t r = wasm_v8x16_shuffle(a3, a3, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
    schar t_ptr[16];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<8; ++i) {
        ptr[i] = t_ptr[i];
    }
}
template<int n>
inline void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
{
    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
    v128_t a1 = wasm_u32x4_shr(wasm_i32x4_add(a.val, delta), n);
    v128_t maxval = wasm_i32x4_splat(65535);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u32x4_gt(a1, maxval));
    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
    ushort t_ptr[8];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<4; ++i) {
        ptr[i] = t_ptr[i];
    }
}
template<int n>
inline void v_rshr_pack_store(short* ptr, const v_int32x4& a)
{
    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
    v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
    v128_t maxval = wasm_i32x4_splat(32767);
    v128_t minval = wasm_i32x4_splat(-32768);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
    v128_t r = wasm_v8x16_shuffle(a3, a3, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
    short t_ptr[8];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<4; ++i) {
        ptr[i] = t_ptr[i];
    }
}
template<int n>
inline void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
{
    v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1)));
    v128_t a1 = wasm_u64x2_shr(wasm_i64x2_add(a.val, delta), n);
    v128_t r = wasm_v8x16_shuffle(a1, a1, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
    unsigned t_ptr[4];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<2; ++i) {
        ptr[i] = t_ptr[i];
    }
}
template<int n>
inline void v_rshr_pack_store(int* ptr, const v_int64x2& a)
{
    v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1)));
    v128_t a1 = wasm_i64x2_shr(wasm_i64x2_add(a.val, delta), n);
    v128_t r = wasm_v8x16_shuffle(a1, a1, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
    int t_ptr[4];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<2; ++i) {
        ptr[i] = t_ptr[i];
    }
}
template<int n>
inline void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
{
    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
    v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add(a.val, delta), n);
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t minval = wasm_i16x8_splat(0);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));
    v128_t r = wasm_v8x16_shuffle(a3, a3, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
    uchar t_ptr[16];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<8; ++i) {
        ptr[i] = t_ptr[i];
    }
}
template<int n>
inline void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
{
    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
    v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
    v128_t maxval = wasm_i32x4_splat(65535);
    v128_t minval = wasm_i32x4_splat(0);
    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
    v128_t r = wasm_v8x16_shuffle(a3, a3, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
    ushort t_ptr[8];
    wasm_v128_store(t_ptr, r);
    for (int i=0; i<4; ++i) {
        ptr[i] = t_ptr[i];
    }
}

inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
{
    v128_t maxval = wasm_i16x8_splat(255);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u16x8_gt(a.val, maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u16x8_gt(b.val, maxval));
    return v_uint8x16(wasm_v8x16_shuffle(a1, b1, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
}

inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
                           const v_uint32x4& c, const v_uint32x4& d)
{
    v128_t maxval = wasm_i32x4_splat(255);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u32x4_gt(a.val, maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u32x4_gt(b.val, maxval));
    v128_t c1 = wasm_v128_bitselect(maxval, c.val, wasm_u32x4_gt(c.val, maxval));
    v128_t d1 = wasm_v128_bitselect(maxval, d.val, wasm_u32x4_gt(d.val, maxval));
    v128_t ab = wasm_v8x16_shuffle(a1, b1, 0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28);
    v128_t cd = wasm_v8x16_shuffle(c1, d1, 0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28);
    return v_uint8x16(wasm_v8x16_shuffle(ab, cd, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23));
}

inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
                           const v_uint64x2& g, const v_uint64x2& h)
{
    v128_t maxval = wasm_i32x4_splat(255);
    v128_t a1 = wasm_v128_bitselect(maxval, a.val, ((__u64x2)(a.val) > (__u64x2)maxval));
    v128_t b1 = wasm_v128_bitselect(maxval, b.val, ((__u64x2)(b.val) > (__u64x2)maxval));
    v128_t c1 = wasm_v128_bitselect(maxval, c.val, ((__u64x2)(c.val) > (__u64x2)maxval));
    v128_t d1 = wasm_v128_bitselect(maxval, d.val, ((__u64x2)(d.val) > (__u64x2)maxval));
    v128_t e1 = wasm_v128_bitselect(maxval, e.val, ((__u64x2)(e.val) > (__u64x2)maxval));
    v128_t f1 = wasm_v128_bitselect(maxval, f.val, ((__u64x2)(f.val) > (__u64x2)maxval));
    v128_t g1 = wasm_v128_bitselect(maxval, g.val, ((__u64x2)(g.val) > (__u64x2)maxval));
    v128_t h1 = wasm_v128_bitselect(maxval, h.val, ((__u64x2)(h.val) > (__u64x2)maxval));
    v128_t ab = wasm_v8x16_shuffle(a1, b1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24);
    v128_t cd = wasm_v8x16_shuffle(c1, d1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24);
    v128_t ef = wasm_v8x16_shuffle(e1, f1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24);
    v128_t gh = wasm_v8x16_shuffle(g1, h1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24);
    v128_t abcd = wasm_v8x16_shuffle(ab, cd, 0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
    v128_t efgh = wasm_v8x16_shuffle(ef, gh, 0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
    return v_uint8x16(wasm_v8x16_shuffle(abcd, efgh, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23));
}

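// v_matmul broadcasts each lane of `v` with wasm_f32x4_splat and accumulates
// v0*m0 + v1*m1 + v2*m2 + v3*m3; v_matmuladd replaces the v3*m3 term with the plain
// additive vector `a`.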
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    v128_t v0 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 0));
    v128_t v1 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 1));
    v128_t v2 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 2));
    v128_t v3 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 3));
    v0 = wasm_f32x4_mul(v0, m0.val);
    v1 = wasm_f32x4_mul(v1, m1.val);
    v2 = wasm_f32x4_mul(v2, m2.val);
    v3 = wasm_f32x4_mul(v3, m3.val);

    return v_float32x4(wasm_f32x4_add(wasm_f32x4_add(v0, v1), wasm_f32x4_add(v2, v3)));
}

inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    v128_t v0 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 0));
    v128_t v1 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 1));
    v128_t v2 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 2));
    v0 = wasm_f32x4_mul(v0, m0.val);
    v1 = wasm_f32x4_mul(v1, m1.val);
    v2 = wasm_f32x4_mul(v2, m2.val);

    return v_float32x4(wasm_f32x4_add(wasm_f32x4_add(v0, v1), wasm_f32x4_add(v2, a.val)));
}

#define OPENCV_HAL_IMPL_WASM_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
    a.val = intrin(a.val, b.val); \
    return a; \
}

OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint8x16, wasm_u8x16_add_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint8x16, wasm_u8x16_sub_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int8x16, wasm_i8x16_add_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int8x16, wasm_i8x16_sub_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint16x8, wasm_u16x8_add_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint16x8, wasm_u16x8_sub_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int16x8, wasm_i16x8_add_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int16x8, wasm_i16x8_sub_saturate)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint32x4, wasm_i32x4_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint32x4, wasm_i32x4_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_uint32x4, wasm_i32x4_mul)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int32x4, wasm_i32x4_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int32x4, wasm_i32x4_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_int32x4, wasm_i32x4_mul)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_float32x4, wasm_f32x4_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_float32x4, wasm_f32x4_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_float32x4, wasm_f32x4_mul)
OPENCV_HAL_IMPL_WASM_BIN_OP(/, v_float32x4, wasm_f32x4_div)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint64x2, wasm_i64x2_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint64x2, wasm_i64x2_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int64x2, wasm_i64x2_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int64x2, wasm_i64x2_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_float64x2, wasm_f64x2_add)
OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_float64x2, wasm_f64x2_sub)
OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_float64x2, wasm_f64x2_mul)
OPENCV_HAL_IMPL_WASM_BIN_OP(/, v_float64x2, wasm_f64x2_div)

// saturating multiply 8-bit, 16-bit
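// There is no saturating multiply instruction, so operator* for 8-bit and 16-bit vectors
// expands both operands to the next wider lane type, multiplies there, and packs the result
// back down with saturation via v_pack.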
#define OPENCV_HAL_IMPL_WASM_MUL_SAT(_Tpvec, _Tpwvec)        \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b)  \
{                                                            \
    _Tpwvec c, d;                                            \
    v_mul_expand(a, b, c, d);                                \
    return v_pack(c, d);                                     \
}                                                            \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b)      \
{ a = a * b; return a; }

OPENCV_HAL_IMPL_WASM_MUL_SAT(v_uint8x16, v_uint16x8)
OPENCV_HAL_IMPL_WASM_MUL_SAT(v_int8x16,  v_int16x8)
OPENCV_HAL_IMPL_WASM_MUL_SAT(v_uint16x8, v_uint32x4)
OPENCV_HAL_IMPL_WASM_MUL_SAT(v_int16x8,  v_int32x4)

//  Multiply and expand
inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
                         v_uint16x8& c, v_uint16x8& d)
{
    v_uint16x8 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c = v_mul_wrap(a0, b0);
    d = v_mul_wrap(a1, b1);
}

inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
                         v_int16x8& c, v_int16x8& d)
{
    v_int16x8 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c = v_mul_wrap(a0, b0);
    d = v_mul_wrap(a1, b1);
}

inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                         v_int32x4& c, v_int32x4& d)
{
    v_int32x4 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c.val = wasm_i32x4_mul(a0.val, b0.val);
    d.val = wasm_i32x4_mul(a1.val, b1.val);
}

inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
                         v_uint32x4& c, v_uint32x4& d)
{
    v_uint32x4 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c.val = wasm_i32x4_mul(a0.val, b0.val);
    d.val = wasm_i32x4_mul(a1.val, b1.val);
}

inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
                         v_uint64x2& c, v_uint64x2& d)
{
    v_uint64x2 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    c.val = ((__u64x2)(a0.val) * (__u64x2)(b0.val));
    d.val = ((__u64x2)(a1.val) * (__u64x2)(b1.val));
}

inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
{
    v_int32x4 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    v128_t c = wasm_i32x4_mul(a0.val, b0.val);
    v128_t d = wasm_i32x4_mul(a1.val, b1.val);
    return v_int16x8(wasm_v8x16_shuffle(c, d, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31));
}
inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
{
    v_uint32x4 a0, a1, b0, b1;
    v_expand(a, a0, a1);
    v_expand(b, b0, b1);
    v128_t c = wasm_i32x4_mul(a0.val, b0.val);
    v128_t d = wasm_i32x4_mul(a1.val, b1.val);
    return v_uint16x8(wasm_v8x16_shuffle(c, d, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31));
}

//////// Dot Product ////////

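// v_dotprod multiplies adjacent lane pairs and adds each pair of products into the wider lane
// type. Even lanes are sign-extended in place with a shift-left/arithmetic-shift-right pair,
// odd lanes with a plain arithmetic right shift, and the two partial products are summed.
//
// Illustrative sketch (hypothetical values):
//   v_int16x8 a = v_setall_s16(3), b = v_setall_s16(4);
//   v_int32x4 s = v_dotprod(a, b);  // each 32-bit lane holds 3*4 + 3*4 = 24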
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
    v128_t a0 = wasm_i32x4_shr(wasm_i32x4_shl(a.val, 16), 16);
    v128_t a1 = wasm_i32x4_shr(a.val, 16);
    v128_t b0 = wasm_i32x4_shr(wasm_i32x4_shl(b.val, 16), 16);
    v128_t b1 = wasm_i32x4_shr(b.val, 16);
    v128_t c = wasm_i32x4_mul(a0, b0);
    v128_t d = wasm_i32x4_mul(a1, b1);
    return v_int32x4(wasm_i32x4_add(c, d));
}

inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_dotprod(a, b) + c; }

inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
{
    v128_t a0 = wasm_i64x2_shr(wasm_i64x2_shl(a.val, 32), 32);
    v128_t a1 = wasm_i64x2_shr(a.val, 32);
    v128_t b0 = wasm_i64x2_shr(wasm_i64x2_shl(b.val, 32), 32);
    v128_t b1 = wasm_i64x2_shr(b.val, 32);
    v128_t c = (v128_t)((__i64x2)a0 * (__i64x2)b0);
    v128_t d = (v128_t)((__i64x2)a1 * (__i64x2)b1);
    return v_int64x2(wasm_i64x2_add(c, d));
}
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{
    return v_dotprod(a, b) + c;
}

// 8 >> 32
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
{
    v128_t a0 = wasm_u16x8_shr(wasm_i16x8_shl(a.val, 8), 8);
    v128_t a1 = wasm_u16x8_shr(a.val, 8);
    v128_t b0 = wasm_u16x8_shr(wasm_i16x8_shl(b.val, 8), 8);
    v128_t b1 = wasm_u16x8_shr(b.val, 8);
    return v_uint32x4((
        v_dotprod(v_int16x8(a0), v_int16x8(b0)) +
        v_dotprod(v_int16x8(a1), v_int16x8(b1))).val
    );
}
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{ return v_dotprod_expand(a, b) + c; }

inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
    v128_t a0 = wasm_i16x8_shr(wasm_i16x8_shl(a.val, 8), 8);
    v128_t a1 = wasm_i16x8_shr(a.val, 8);
    v128_t b0 = wasm_i16x8_shr(wasm_i16x8_shl(b.val, 8), 8);
    v128_t b1 = wasm_i16x8_shr(b.val, 8);
    return v_int32x4(
        v_dotprod(v_int16x8(a0), v_int16x8(b0)) +
        v_dotprod(v_int16x8(a1), v_int16x8(b1))
    );
}
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{ return v_dotprod_expand(a, b) + c; }

// 16 >> 64
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
{
    v128_t a0 = wasm_u32x4_shr(wasm_i32x4_shl(a.val, 16), 16);
    v128_t a1 = wasm_u32x4_shr(a.val, 16);
    v128_t b0 = wasm_u32x4_shr(wasm_i32x4_shl(b.val, 16), 16);
    v128_t b1 = wasm_u32x4_shr(b.val, 16);
    return v_uint64x2((
        v_dotprod(v_int32x4(a0), v_int32x4(b0)) +
        v_dotprod(v_int32x4(a1), v_int32x4(b1))).val
    );
}
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_dotprod_expand(a, b) + c; }

inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
    v128_t a0 = wasm_i32x4_shr(wasm_i32x4_shl(a.val, 16), 16);
    v128_t a1 = wasm_i32x4_shr(a.val, 16);
    v128_t b0 = wasm_i32x4_shr(wasm_i32x4_shl(b.val, 16), 16);
    v128_t b1 = wasm_i32x4_shr(b.val, 16);
    return v_int64x2((
        v_dotprod(v_int32x4(a0), v_int32x4(b0)) +
        v_dotprod(v_int32x4(a1), v_int32x4(b1)))
    );
}

inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_dotprod_expand(a, b) + c; }

// 32 >> 64f
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_dotprod_expand(a, b) + c; }

//////// Fast Dot Product ////////

// 16 >> 32
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
{ return v_dotprod(a, b); }
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_dotprod(a, b, c); }

// 32 >> 64
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
{ return v_dotprod(a, b); }
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{ return v_dotprod(a, b, c); }

// 8 >> 32
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
{ return v_dotprod_expand(a, b); }
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{ return v_dotprod_expand(a, b, c); }
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
{ return v_dotprod_expand(a, b); }
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{ return v_dotprod_expand(a, b, c); }

// 16 >> 64
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
{ return v_dotprod_expand(a, b); }
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_dotprod_expand(a, b, c); }
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
{ return v_dotprod_expand(a, b); }
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_dotprod_expand(a, b, c); }

// 32 >> 64f
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
{ return v_dotprod_expand(a, b); }
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_dotprod_expand(a, b, c); }

#define OPENCV_HAL_IMPL_WASM_LOGIC_OP(_Tpvec) \
OPENCV_HAL_IMPL_WASM_BIN_OP(&, _Tpvec, wasm_v128_and) \
OPENCV_HAL_IMPL_WASM_BIN_OP(|, _Tpvec, wasm_v128_or) \
OPENCV_HAL_IMPL_WASM_BIN_OP(^, _Tpvec, wasm_v128_xor) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ \
    return _Tpvec(wasm_v128_not(a.val)); \
}

OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint8x16)
OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int8x16)
OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint16x8)
OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int16x8)
OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint32x4)
OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int32x4)
OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint64x2)
OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int64x2)
OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_float32x4)
OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_float64x2)

inline v_float32x4 v_sqrt(const v_float32x4& x)
{
    return v_float32x4(wasm_f32x4_sqrt(x.val));
}

inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    const v128_t _1_0 = wasm_f32x4_splat(1.0);
    return v_float32x4(wasm_f32x4_div(_1_0, wasm_f32x4_sqrt(x.val)));
}

inline v_float64x2 v_sqrt(const v_float64x2& x)
{
    return v_float64x2(wasm_f64x2_sqrt(x.val));
}

inline v_float64x2 v_invsqrt(const v_float64x2& x)
{
    const v128_t _1_0 = wasm_f64x2_splat(1.0);
    return v_float64x2(wasm_f64x2_div(_1_0, wasm_f64x2_sqrt(x.val)));
}

#define OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(_Tpuvec, _Tpsvec, suffix, zsuffix, shiftWidth) \
inline _Tpuvec v_abs(const _Tpsvec& x) \
{ \
    v128_t s = wasm_##suffix##_shr(x.val, shiftWidth); \
    v128_t f = wasm_##zsuffix##_shr(x.val, shiftWidth); \
    return _Tpuvec(wasm_##zsuffix##_add(wasm_v128_xor(x.val, f), s)); \
}

OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(v_uint8x16, v_int8x16, u8x16, i8x16, 7)
OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(v_uint16x8, v_int16x8, u16x8, i16x8, 15)
OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(v_uint32x4, v_int32x4, u32x4, i32x4, 31)

inline v_float32x4 v_abs(const v_float32x4& x)
{ return v_float32x4(wasm_f32x4_abs(x.val)); }
inline v_float64x2 v_abs(const v_float64x2& x)
{
    return v_float64x2(wasm_f64x2_abs(x.val));
}

// TODO: exp, log, sin, cos

#define OPENCV_HAL_IMPL_WASM_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
}

OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float32x4, v_min, wasm_f32x4_min)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float32x4, v_max, wasm_f32x4_max)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float64x2, v_min, wasm_f64x2_min)
OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float64x2, v_max, wasm_f64x2_max)

#define OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(_Tpvec, suffix) \
inline _Tpvec v_min(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(wasm_v128_bitselect(b.val, a.val, wasm_##suffix##_gt(a.val, b.val))); \
} \
inline _Tpvec v_max(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(wasm_v128_bitselect(a.val, b.val, wasm_##suffix##_gt(a.val, b.val))); \
}

OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(v_int8x16, i8x16)
OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(v_int16x8, i16x8)
OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(v_int32x4, i32x4)

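// Unsigned min/max are built on a signed greater-than comparison: XOR-ing both operands with
// the sign bit (deltaNum) maps unsigned ordering onto signed ordering, and the resulting mask
// drives wasm_v128_bitselect.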
1198 #define OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(_Tpvec, suffix, deltaNum) \
1199 inline _Tpvec v_min(const _Tpvec& a, const _Tpvec& b) \
1200 { \
1201     v128_t delta = wasm_##suffix##_splat(deltaNum); \
1202     v128_t mask = wasm_##suffix##_gt(wasm_v128_xor(a.val, delta), wasm_v128_xor(b.val, delta)); \
1203     return _Tpvec(wasm_v128_bitselect(b.val, a.val, mask)); \
1204 } \
1205 inline _Tpvec v_max(const _Tpvec& a, const _Tpvec& b) \
1206 { \
1207     v128_t delta = wasm_##suffix##_splat(deltaNum); \
1208     v128_t mask = wasm_##suffix##_gt(wasm_v128_xor(a.val, delta), wasm_v128_xor(b.val, delta)); \
1209     return _Tpvec(wasm_v128_bitselect(a.val, b.val, mask)); \
1210 }
1211 
1212 OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint8x16, i8x16, (schar)0x80)
1213 OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint16x8, i16x8, (short)0x8000)
1214 OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint32x4, i32x4, (int)0x80000000)
1215 
1216 #define OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(_Tpvec, suffix, esuffix) \
1217 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
1218 { return _Tpvec(wasm_##esuffix##_eq(a.val, b.val)); } \
1219 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
1220 { return _Tpvec(wasm_##esuffix##_ne(a.val, b.val)); } \
1221 inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
1222 { return _Tpvec(wasm_##suffix##_lt(a.val, b.val)); } \
1223 inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
1224 { return _Tpvec(wasm_##suffix##_gt(a.val, b.val)); } \
1225 inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
1226 { return _Tpvec(wasm_##suffix##_le(a.val, b.val)); } \
1227 inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
1228 { return _Tpvec(wasm_##suffix##_ge(a.val, b.val)); }
1229 
1230 OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint8x16, u8x16, i8x16)
1231 OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_int8x16, i8x16, i8x16)
1232 OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint16x8, u16x8, i16x8)
1233 OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_int16x8, i16x8, i16x8)
1234 OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint32x4, u32x4, i32x4)
1235 OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_int32x4, i32x4, i32x4)
1236 OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_float32x4, f32x4, f32x4)
1237 OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_float64x2, f64x2, f64x2)
1238 
1239 #define OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(_Tpvec, cast) \
1240 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
1241 { return cast(v_reinterpret_as_f64(a) == v_reinterpret_as_f64(b)); } \
1242 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
1243 { return cast(v_reinterpret_as_f64(a) != v_reinterpret_as_f64(b)); }
1244 
1245 OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64)
1246 OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64)
1247 
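// v_not_nan builds its mask with integer arithmetic: the sign bit is cleared and the remaining
// magnitude bits are compared (unsigned) against the exponent-all-ones pattern; NaNs, whose
// magnitude bits exceed that pattern, yield a zero lane.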
1248 inline v_float32x4 v_not_nan(const v_float32x4& a)
1249 {
1250     v128_t z = wasm_i32x4_splat(0x7fffffff);
1251     v128_t t = wasm_i32x4_splat(0x7f800000);
1252     return v_float32x4(wasm_u32x4_lt(wasm_v128_and(a.val, z), t));
1253 }
1254 inline v_float64x2 v_not_nan(const v_float64x2& a)
1255 {
1256     v128_t z = wasm_i64x2_splat(0x7fffffffffffffff);
1257     v128_t t = wasm_i64x2_splat(0x7ff0000000000000);
1258     return v_float64x2((__u64x2)(wasm_v128_and(a.val, z)) < (__u64x2)t);
1259 }
1260 
1261 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_add_wrap, wasm_i8x16_add)
1262 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_add_wrap, wasm_i8x16_add)
1263 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_add_wrap, wasm_i16x8_add)
1264 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_add_wrap, wasm_i16x8_add)
1265 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_sub_wrap, wasm_i8x16_sub)
1266 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_sub_wrap, wasm_i8x16_sub)
1267 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_sub_wrap, wasm_i16x8_sub)
1268 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_sub_wrap, wasm_i16x8_sub)
1269 #if (__EMSCRIPTEN_major__ * 1000000 + __EMSCRIPTEN_minor__ * 1000 + __EMSCRIPTEN_tiny__) >= (1039012)
1270 // details: https://github.com/opencv/opencv/issues/18097 ( https://github.com/emscripten-core/emscripten/issues/12018 )
1271 // 1.39.12: https://github.com/emscripten-core/emscripten/commit/cd801d0f110facfd694212a3c8b2ed2ffcd630e2
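// Note: these Emscripten versions no longer provide wasm_i8x16_mul (8-bit lane multiplication was
// removed from the WASM SIMD proposal), so the 8-bit v_mul_wrap variants fall back to a per-lane
// scalar multiply.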
1272 inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b)
1273 {
1274     uchar a_[16], b_[16];
1275     wasm_v128_store(a_, a.val);
1276     wasm_v128_store(b_, b.val);
1277     for (int i = 0; i < 16; i++)
1278         a_[i] = (uchar)(a_[i] * b_[i]);
1279     return v_uint8x16(wasm_v128_load(a_));
1280 }
1281 inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b)
1282 {
1283     schar a_[16], b_[16];
1284     wasm_v128_store(a_, a.val);
1285     wasm_v128_store(b_, b.val);
1286     for (int i = 0; i < 16; i++)
1287         a_[i] = (schar)(a_[i] * b_[i]);
1288     return v_int8x16(wasm_v128_load(a_));
1289 }
1290 #else
1291 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_mul_wrap, wasm_i8x16_mul)
1292 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_mul_wrap, wasm_i8x16_mul)
1293 #endif
1294 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_mul_wrap, wasm_i16x8_mul)
1295 OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_mul_wrap, wasm_i16x8_mul)
1296 
1297 
1298 /** Absolute difference **/
1299 
1300 inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b)
1301 { return v_add_wrap(a - b,  b - a); }
1302 inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b)
1303 { return v_add_wrap(a - b,  b - a); }
1304 inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
1305 { return v_max(a, b) - v_min(a, b); }
1306 
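// Signed v_absdiff returns an unsigned result without branches: with d = a - b and m = (a < b)
// (0 or all-ones per lane), (d ^ m) - m conditionally negates d exactly where the mask is set,
// giving |a - b|.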
1307 inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b)
1308 {
1309     v_int8x16 d = v_sub_wrap(a, b);
1310     v_int8x16 m = a < b;
1311     return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
1312 }
1313 inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b)
1314 {
1315     return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)));
1316 }
1317 inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
1318 {
1319     v_int32x4 d = a - b;
1320     v_int32x4 m = a < b;
1321     return v_reinterpret_as_u32((d ^ m) - m);
1322 }
1323 
1324 /** Saturating absolute difference **/
1325 inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b)
1326 {
1327     v_int8x16 d = a - b;
1328     v_int8x16 m = a < b;
1329     return (d ^ m) - m;
1330  }
1331 inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
1332 { return v_max(a, b) - v_min(a, b); }
1333 
1334 
1335 inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1336 {
1337     return a * b + c;
1338 }
1339 
1340 inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1341 {
1342     return v_fma(a, b, c);
1343 }
1344 
1345 inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
1346 {
1347     return a * b + c;
1348 }
1349 
1350 inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
1351 {
1352     return a * b + c;
1353 }
1354 
1355 inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
1356 {
1357     v128_t absmask_vec = wasm_i32x4_splat(0x7fffffff);
1358     return v_float32x4(wasm_v128_and(wasm_f32x4_sub(a.val, b.val), absmask_vec));
1359 }
1360 inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
1361 {
1362     v128_t absmask_vec = wasm_u64x2_shr(wasm_i32x4_splat(-1), 1);
1363     return v_float64x2(wasm_v128_and(wasm_f64x2_sub(a.val, b.val), absmask_vec));
1364 }
1365 
1366 #define OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(_Tpvec, suffix) \
1367 inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
1368 { \
1369     v128_t a_Square = wasm_##suffix##_mul(a.val, a.val); \
1370     v128_t b_Square = wasm_##suffix##_mul(b.val, b.val); \
1371     return _Tpvec(wasm_##suffix##_sqrt(wasm_##suffix##_add(a_Square, b_Square))); \
1372 } \
1373 inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
1374 { \
1375     v128_t a_Square = wasm_##suffix##_mul(a.val, a.val); \
1376     v128_t b_Square = wasm_##suffix##_mul(b.val, b.val); \
1377     return _Tpvec(wasm_##suffix##_add(a_Square, b_Square)); \
1378 } \
1379 inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1380 { \
1381     return _Tpvec(wasm_##suffix##_add(wasm_##suffix##_mul(a.val, b.val), c.val)); \
1382 }
1383 
1384 OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(v_float32x4, f32x4)
1385 OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(v_float64x2, f64x2)
1386 
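// Shift operators: left shift is the same for both signednesses; right shift uses the logical
// (u*) form for unsigned vectors and the arithmetic (i*) form for signed vectors. In this macro
// 'suffix' names the signed lane type and 'ssuffix' the unsigned one.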
1387 #define OPENCV_HAL_IMPL_WASM_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, ssuffix) \
1388 inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
1389 { \
1390     return _Tpuvec(wasm_##suffix##_shl(a.val, imm)); \
1391 } \
1392 inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
1393 { \
1394     return _Tpsvec(wasm_##suffix##_shl(a.val, imm)); \
1395 } \
1396 inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
1397 { \
1398     return _Tpuvec(wasm_##ssuffix##_shr(a.val, imm)); \
1399 } \
1400 inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
1401 { \
1402     return _Tpsvec(wasm_##suffix##_shr(a.val, imm)); \
1403 } \
1404 template<int imm> \
1405 inline _Tpuvec v_shl(const _Tpuvec& a) \
1406 { \
1407     return _Tpuvec(wasm_##suffix##_shl(a.val, imm)); \
1408 } \
1409 template<int imm> \
1410 inline _Tpsvec v_shl(const _Tpsvec& a) \
1411 { \
1412     return _Tpsvec(wasm_##suffix##_shl(a.val, imm)); \
1413 } \
1414 template<int imm> \
1415 inline _Tpuvec v_shr(const _Tpuvec& a) \
1416 { \
1417     return _Tpuvec(wasm_##ssuffix##_shr(a.val, imm)); \
1418 } \
1419 template<int imm> \
1420 inline _Tpsvec v_shr(const _Tpsvec& a) \
1421 { \
1422     return _Tpsvec(wasm_##suffix##_shr(a.val, imm)); \
1423 }
1424 
1425 OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint8x16, v_int8x16, i8x16, u8x16)
1426 OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint16x8, v_int16x8, i16x8, u16x8)
1427 OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint32x4, v_int32x4, i32x4, u32x4)
1428 OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint64x2, v_int64x2, i64x2, u64x2)
1429 
1430 namespace hal_wasm_internal
1431 {
1432     template <int imm,
1433         bool is_invalid = ((imm < 0) || (imm > 16)),
1434         bool is_first = (imm == 0),
1435         bool is_second = (imm == 16),
1436         bool is_other = (((imm > 0) && (imm < 16)))>
1437     class v_wasm_palignr_u8_class;
1438 
1439     template <int imm>
1440     class v_wasm_palignr_u8_class<imm, true, false, false, false>;
1441 
1442     template <int imm>
1443     class v_wasm_palignr_u8_class<imm, false, true, false, false>
1444     {
1445     public:
1446         inline v128_t operator()(const v128_t& a, const v128_t&) const
1447         {
1448             return a;
1449         }
1450     };
1451 
1452     template <int imm>
1453     class v_wasm_palignr_u8_class<imm, false, false, true, false>
1454     {
1455     public:
1456         inline v128_t operator()(const v128_t&, const v128_t& b) const
1457         {
1458             return b;
1459         }
1460     };
1461 
1462     template <int imm>
1463     class v_wasm_palignr_u8_class<imm, false, false, false, true>
1464     {
1465     public:
1466         inline v128_t operator()(const v128_t& a, const v128_t& b) const
1467         {
1468             enum { imm2 = (sizeof(v128_t) - imm) };
1469             return wasm_v8x16_shuffle(a, b,
1470                                       imm, imm+1, imm+2, imm+3,
1471                                       imm+4, imm+5, imm+6, imm+7,
1472                                       imm+8, imm+9, imm+10, imm+11,
1473                                       imm+12, imm+13, imm+14, imm+15);
1474         }
1475     };
1476 
1477     template <int imm>
1478     inline v128_t v_wasm_palignr_u8(const v128_t& a, const v128_t& b)
1479     {
1480         CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_wasm_palignr_u8.");
1481         return v_wasm_palignr_u8_class<imm>()(a, b);
1482     }
1483 }
1484 
1485 template<int imm, typename _Tpvec>
1486 inline _Tpvec v_rotate_right(const _Tpvec &a)
1487 {
1488     using namespace hal_wasm_internal;
1489     enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
1490     v128_t z = wasm_i8x16_splat(0);
1491     return _Tpvec(v_wasm_palignr_u8<imm2>(a.val, z));
1492 }
1493 
1494 template<int imm, typename _Tpvec>
1495 inline _Tpvec v_rotate_left(const _Tpvec &a)
1496 {
1497     using namespace hal_wasm_internal;
1498     enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
1499     v128_t z = wasm_i8x16_splat(0);
1500     return _Tpvec(v_wasm_palignr_u8<imm2>(z, a.val));
1501 }
1502 
1503 template<int imm, typename _Tpvec>
1504 inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b)
1505 {
1506     using namespace hal_wasm_internal;
1507     enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
1508     return _Tpvec(v_wasm_palignr_u8<imm2>(a.val, b.val));
1509 }
1510 
1511 template<int imm, typename _Tpvec>
1512 inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b)
1513 {
1514     using namespace hal_wasm_internal;
1515     enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
1516     return _Tpvec(v_wasm_palignr_u8<imm2>(b.val, a.val));
1517 }
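// Illustrative sketch (not part of this header): the rotate helpers shift whole lanes through the
// byte-level palignr above, e.g. with v_int32x4 a(1,2,3,4), b(5,6,7,8):
//   v_rotate_right<1>(a)    -> {2, 3, 4, 0}   (zeros shifted in)
//   v_rotate_right<1>(a, b) -> {2, 3, 4, 5}   (lanes of b shifted in)
//   v_rotate_left<1>(a)     -> {0, 1, 2, 3}
// The lane-count template argument is converted to a byte offset (imm * sizeof(lane)) first.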
1518 
1519 #define OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(_Tpvec, _Tp) \
1520 inline _Tpvec v_load(const _Tp* ptr) \
1521 { return _Tpvec(wasm_v128_load(ptr)); } \
1522 inline _Tpvec v_load_aligned(const _Tp* ptr) \
1523 { return _Tpvec(wasm_v128_load(ptr)); } \
1524 inline _Tpvec v_load_low(const _Tp* ptr) \
1525 { \
1526     _Tp tmp[_Tpvec::nlanes] = {0}; \
1527     for (int i=0; i<_Tpvec::nlanes/2; ++i) { \
1528         tmp[i] = ptr[i]; \
1529     } \
1530     return _Tpvec(wasm_v128_load(tmp)); \
1531 } \
1532 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
1533 { \
1534     _Tp tmp[_Tpvec::nlanes]; \
1535     for (int i=0; i<_Tpvec::nlanes/2; ++i) { \
1536         tmp[i] = ptr0[i]; \
1537         tmp[i+_Tpvec::nlanes/2] = ptr1[i]; \
1538     } \
1539     return _Tpvec(wasm_v128_load(tmp)); \
1540 } \
1541 inline void v_store(_Tp* ptr, const _Tpvec& a) \
1542 { wasm_v128_store(ptr, a.val); } \
1543 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
1544 { wasm_v128_store(ptr, a.val); } \
1545 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
1546 { wasm_v128_store(ptr, a.val); } \
1547 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
1548 { \
1549     wasm_v128_store(ptr, a.val); \
1550 } \
1551 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
1552 { \
1553     _Tpvec::lane_type a_[_Tpvec::nlanes]; \
1554     wasm_v128_store(a_, a.val); \
1555     for (int i = 0; i < (_Tpvec::nlanes / 2); i++) \
1556         ptr[i] = a_[i]; \
1557 } \
1558 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
1559 { \
1560     _Tpvec::lane_type a_[_Tpvec::nlanes]; \
1561     wasm_v128_store(a_, a.val); \
1562     for (int i = 0; i < (_Tpvec::nlanes / 2); i++) \
1563         ptr[i] = a_[i + (_Tpvec::nlanes / 2)]; \
1564 }
1565 
1566 OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint8x16, uchar)
1567 OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int8x16, schar)
1568 OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint16x8, ushort)
1569 OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int16x8, short)
1570 OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint32x4, unsigned)
1571 OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int32x4, int)
1572 OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint64x2, uint64)
1573 OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int64x2, int64)
1574 OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_float32x4, float)
1575 OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_float64x2, double)
1576 
1577 
1578 /** Reverse **/
1579 inline v_uint8x16 v_reverse(const v_uint8x16 &a)
1580 { return v_uint8x16(wasm_v8x16_shuffle(a.val, a.val, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); }
1581 
1582 inline v_int8x16 v_reverse(const v_int8x16 &a)
1583 { return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
1584 
1585 inline v_uint16x8 v_reverse(const v_uint16x8 &a)
1586 { return v_uint16x8(wasm_v8x16_shuffle(a.val, a.val, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); }
1587 
1588 inline v_int16x8 v_reverse(const v_int16x8 &a)
1589 { return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
1590 
1591 inline v_uint32x4 v_reverse(const v_uint32x4 &a)
1592 { return v_uint32x4(wasm_v8x16_shuffle(a.val, a.val, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3)); }
1593 
1594 inline v_int32x4 v_reverse(const v_int32x4 &a)
1595 { return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
1596 
1597 inline v_float32x4 v_reverse(const v_float32x4 &a)
1598 { return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
1599 
1600 inline v_uint64x2 v_reverse(const v_uint64x2 &a)
1601 { return v_uint64x2(wasm_v8x16_shuffle(a.val, a.val, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); }
1602 
1603 inline v_int64x2 v_reverse(const v_int64x2 &a)
1604 { return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
1605 
1606 inline v_float64x2 v_reverse(const v_float64x2 &a)
1607 { return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
1608 
1609 
1610 #define OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, esuffix) \
1611 inline scalartype v_reduce_sum(const _Tpvec& a) \
1612 { \
1613     regtype val = a.val; \
1614     val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7)); \
1615     val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3)); \
1616     return (scalartype)wasm_##esuffix##_extract_lane(val, 0); \
1617 }
1618 
1619 OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_uint32x4, unsigned, v128_t, i32x4, i32x4)
1620 OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_int32x4, int, v128_t, i32x4, i32x4)
1621 OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_float32x4, float, v128_t, f32x4, f32x4)
1622 
1623 // To do: Optimize v_reduce_sum with wasm intrin.
1624 //        Now use fallback implementation as there is no widening op in wasm intrin.
1625 
1626 #define OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(_Tpvec, scalartype) \
1627 inline scalartype v_reduce_sum(const _Tpvec& a) \
1628 { \
1629     _Tpvec::lane_type a_[_Tpvec::nlanes]; \
1630     wasm_v128_store(a_, a.val); \
1631     scalartype c = a_[0]; \
1632     for (int i = 1; i < _Tpvec::nlanes; i++) \
1633         c += a_[i]; \
1634     return c; \
1635 }
1636 
1637 OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_uint8x16, unsigned)
1638 OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_int8x16, int)
1639 OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_uint16x8, unsigned)
1640 OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_int16x8, int)
1641 
1642 
1643 #define OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(_Tpvec, scalartype, regtype, suffix, esuffix) \
1644 inline scalartype v_reduce_sum(const _Tpvec& a) \
1645 { \
1646     regtype val = a.val; \
1647     val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7)); \
1648     return (scalartype)wasm_##esuffix##_extract_lane(val, 0); \
1649 }
1650 OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(v_uint64x2, uint64, v128_t, i64x2, i64x2)
1651 OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(v_int64x2, int64, v128_t, i64x2, i64x2)
1652 OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(v_float64x2, double, v128_t, f64x2, f64x2)
1653 
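// v_reduce_sum4 returns {sum(a), sum(b), sum(c), sum(d)}: the inputs are interleaved pairwise and
// added twice, the usual transpose-and-add pattern for a 4x4 horizontal reduction.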
1654 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
1655                                  const v_float32x4& c, const v_float32x4& d)
1656 {
1657     v128_t ac = wasm_f32x4_add(wasm_unpacklo_i32x4(a.val, c.val), wasm_unpackhi_i32x4(a.val, c.val));
1658     v128_t bd = wasm_f32x4_add(wasm_unpacklo_i32x4(b.val, d.val), wasm_unpackhi_i32x4(b.val, d.val));
1659     return v_float32x4(wasm_f32x4_add(wasm_unpacklo_i32x4(ac, bd), wasm_unpackhi_i32x4(ac, bd)));
1660 }
1661 
1662 #define OPENCV_HAL_IMPL_WASM_REDUCE_OP(_Tpvec, scalartype, func, scalar_func) \
1663 inline scalartype v_reduce_##func(const _Tpvec& a) \
1664 { \
1665     scalartype buf[_Tpvec::nlanes]; \
1666     v_store(buf, a); \
1667     scalartype tmp = buf[0]; \
1668     for (int i=1; i<_Tpvec::nlanes; ++i) { \
1669         tmp = scalar_func(tmp, buf[i]); \
1670     } \
1671     return tmp; \
1672 }
1673 
1674 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint8x16, uchar, max, std::max)
1675 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint8x16, uchar, min, std::min)
1676 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int8x16, schar, max, std::max)
1677 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int8x16, schar, min, std::min)
1678 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint16x8, ushort, max, std::max)
1679 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint16x8, ushort, min, std::min)
1680 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int16x8, short, max, std::max)
1681 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int16x8, short, min, std::min)
1682 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint32x4, unsigned, max, std::max)
1683 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint32x4, unsigned, min, std::min)
1684 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int32x4, int, max, std::max)
1685 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int32x4, int, min, std::min)
1686 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_float32x4, float, max, std::max)
1687 OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_float32x4, float, min, std::min)
1688 
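// v_reduce_sad: horizontal sum of absolute differences. 8- and 16-bit inputs are widened to
// 32-bit lanes before the reduction so the accumulation cannot overflow.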
1689 inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
1690 {
1691     v_uint16x8 l16, h16;
1692     v_uint32x4 l16_l32, l16_h32, h16_l32, h16_h32;
1693     v_expand(v_absdiff(a, b), l16, h16);
1694     v_expand(l16, l16_l32, l16_h32);
1695     v_expand(h16, h16_l32, h16_h32);
1696     return v_reduce_sum(l16_l32+l16_h32+h16_l32+h16_h32);
1697 }
1698 inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
1699 {
1700     v_uint16x8 l16, h16;
1701     v_uint32x4 l16_l32, l16_h32, h16_l32, h16_h32;
1702     v_expand(v_absdiff(a, b), l16, h16);
1703     v_expand(l16, l16_l32, l16_h32);
1704     v_expand(h16, h16_l32, h16_h32);
1705     return v_reduce_sum(l16_l32+l16_h32+h16_l32+h16_h32);
1706 }
1707 inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
1708 {
1709     v_uint32x4 l, h;
1710     v_expand(v_absdiff(a, b), l, h);
1711     return v_reduce_sum(l + h);
1712 }
1713 inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
1714 {
1715     v_uint32x4 l, h;
1716     v_expand(v_absdiff(a, b), l, h);
1717     return v_reduce_sum(l + h);
1718 }
1719 inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b)
1720 {
1721     return v_reduce_sum(v_absdiff(a, b));
1722 }
1723 inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b)
1724 {
1725     return v_reduce_sum(v_absdiff(a, b));
1726 }
1727 inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
1728 {
1729     return v_reduce_sum(v_absdiff(a, b));
1730 }
1731 
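// v_popcount (u8 base case): classic SWAR bit count. Each step adds neighbouring bit groups in
// place - bit pairs (mask 0x55...), then 2-bit groups (0x33...), then nibbles (0x0f...) - leaving
// a per-byte population count. The wider lane types below reuse this result and fold adjacent
// bytes together with rotates (or a scalar table lookup for 64-bit lanes).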
1732 inline v_uint8x16 v_popcount(const v_uint8x16& a)
1733 {
1734     v128_t m1 = wasm_i32x4_splat(0x55555555);
1735     v128_t m2 = wasm_i32x4_splat(0x33333333);
1736     v128_t m4 = wasm_i32x4_splat(0x0f0f0f0f);
1737     v128_t p = a.val;
1738     p = wasm_i32x4_add(wasm_v128_and(wasm_u32x4_shr(p, 1), m1), wasm_v128_and(p, m1));
1739     p = wasm_i32x4_add(wasm_v128_and(wasm_u32x4_shr(p, 2), m2), wasm_v128_and(p, m2));
1740     p = wasm_i32x4_add(wasm_v128_and(wasm_u32x4_shr(p, 4), m4), wasm_v128_and(p, m4));
1741     return v_uint8x16(p);
1742 }
1743 inline v_uint16x8 v_popcount(const v_uint16x8& a)
1744 {
1745     v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
1746     p += v_rotate_right<1>(p);
1747     return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff);
1748 }
1749 inline v_uint32x4 v_popcount(const v_uint32x4& a)
1750 {
1751     v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
1752     p += v_rotate_right<1>(p);
1753     p += v_rotate_right<2>(p);
1754     return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff);
1755 }
1756 inline v_uint64x2 v_popcount(const v_uint64x2& a)
1757 {
1758     uint64 a_[2], b_[2] = { 0 };
1759     wasm_v128_store(a_, a.val);
1760     for (int i = 0; i < 16; i++)
1761         b_[i / 8] += popCountTable[((uint8_t*)a_)[i]];
1762     return v_uint64x2(wasm_v128_load(b_));
1763 }
1764 inline v_uint8x16 v_popcount(const v_int8x16& a)
1765 { return v_popcount(v_reinterpret_as_u8(a)); }
1766 inline v_uint16x8 v_popcount(const v_int16x8& a)
1767 { return v_popcount(v_reinterpret_as_u16(a)); }
1768 inline v_uint32x4 v_popcount(const v_int32x4& a)
1769 { return v_popcount(v_reinterpret_as_u32(a)); }
1770 inline v_uint64x2 v_popcount(const v_int64x2& a)
1771 { return v_popcount(v_reinterpret_as_u64(a)); }
1772 
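// v_signmask gathers the sign bit of every lane into an integer bitmask via a store-and-loop
// fallback; v_check_all / v_check_any test whether all (resp. any) lanes have their sign bit set.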
1773 #define OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(_Tpvec, suffix, scalarType) \
1774 inline int v_signmask(const _Tpvec& a) \
1775 { \
1776     _Tpvec::lane_type a_[_Tpvec::nlanes]; \
1777     wasm_v128_store(a_, a.val); \
1778     int mask = 0; \
1779     for (int i = 0; i < _Tpvec::nlanes; i++) \
1780         mask |= (reinterpret_int(a_[i]) < 0) << i; \
1781     return mask; \
1782 } \
1783 inline bool v_check_all(const _Tpvec& a) \
1784 { return wasm_i8x16_all_true(wasm_##suffix##_lt(a.val, wasm_##suffix##_splat(0))); } \
1785 inline bool v_check_any(const _Tpvec& a) \
1786 { return wasm_i8x16_any_true(wasm_##suffix##_lt(a.val, wasm_##suffix##_splat(0))); }
1787 
1788 OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_uint8x16, i8x16, schar)
1789 OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_int8x16, i8x16, schar)
1790 OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_uint16x8, i16x8, short)
1791 OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_int16x8, i16x8, short)
1792 OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_uint32x4, i32x4, int)
1793 OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_int32x4, i32x4, int)
1794 OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_float32x4, i32x4, float)
1795 OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_float64x2, f64x2, double)
1796 
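// 64-bit check_all/check_any reuse the 32-bit comparison: i32 lanes 0 and 2 are the low halves of
// the two 64-bit elements, so they are forced to all-ones (for "all") or zero (for "any") and only
// the high halves, which hold the 64-bit sign bits, influence the outcome.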
1797 #define OPENCV_HAL_IMPL_WASM_CHECK_ALL_ANY(_Tpvec, suffix, esuffix) \
1798 inline bool v_check_all(const _Tpvec& a) \
1799 { \
1800     v128_t masked = v_reinterpret_as_##esuffix(a).val; \
1801     masked = wasm_i32x4_replace_lane(masked, 0, 0xffffffff); \
1802     masked = wasm_i32x4_replace_lane(masked, 2, 0xffffffff); \
1803     return wasm_i8x16_all_true(wasm_##suffix##_lt(masked, wasm_##suffix##_splat(0))); \
1804 } \
1805 inline bool v_check_any(const _Tpvec& a) \
1806 { \
1807     v128_t masked = v_reinterpret_as_##esuffix(a).val; \
1808     masked = wasm_i32x4_replace_lane(masked, 0, 0x0); \
1809     masked = wasm_i32x4_replace_lane(masked, 2, 0x0); \
1810     return wasm_i8x16_any_true(wasm_##suffix##_lt(masked, wasm_##suffix##_splat(0))); \
1811 } \
1812 
1813 OPENCV_HAL_IMPL_WASM_CHECK_ALL_ANY(v_int64x2, i32x4, s32)
1814 OPENCV_HAL_IMPL_WASM_CHECK_ALL_ANY(v_uint64x2, i32x4, u32)
1815 
1816 
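// v_scan_forward returns the index of the first lane whose most significant bit is set: the
// byte-level sign mask is scanned for its lowest set bit and the bit position is divided by the
// lane size in bytes.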
1817 inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
1818 inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
1819 inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
1820 inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
1821 inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1822 inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1823 inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
1824 inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1825 inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1826 inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
1827 
1828 #define OPENCV_HAL_IMPL_WASM_SELECT(_Tpvec) \
1829 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1830 { \
1831     return _Tpvec(wasm_v128_bitselect(a.val, b.val, mask.val)); \
1832 }
1833 
1834 OPENCV_HAL_IMPL_WASM_SELECT(v_uint8x16)
1835 OPENCV_HAL_IMPL_WASM_SELECT(v_int8x16)
1836 OPENCV_HAL_IMPL_WASM_SELECT(v_uint16x8)
1837 OPENCV_HAL_IMPL_WASM_SELECT(v_int16x8)
1838 OPENCV_HAL_IMPL_WASM_SELECT(v_uint32x4)
1839 OPENCV_HAL_IMPL_WASM_SELECT(v_int32x4)
1840 OPENCV_HAL_IMPL_WASM_SELECT(v_uint64x2)
1841 OPENCV_HAL_IMPL_WASM_SELECT(v_int64x2)
1842 OPENCV_HAL_IMPL_WASM_SELECT(v_float32x4)
1843 OPENCV_HAL_IMPL_WASM_SELECT(v_float64x2)
1844 
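// Widening conversions: v_expand splits one register into two registers with lanes of twice the
// width. Illustrative sketch (hypothetical src8 pointer, not part of this header):
//   v_uint8x16 pix = v_load(src8);   // 16 x u8
//   v_uint16x8 lo, hi;
//   v_expand(pix, lo, hi);           // lo = lanes 0..7, hi = lanes 8..15, zero-extended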
1845 #define OPENCV_HAL_IMPL_WASM_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin)    \
1846 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1)      \
1847 {                                                                    \
1848     b0.val = intrin(a.val);                                          \
1849     b1.val = __CV_CAT(intrin, _high)(a.val);                         \
1850 }                                                                    \
1851 inline _Tpwvec v_expand_low(const _Tpvec& a)                         \
1852 { return _Tpwvec(intrin(a.val)); }                                   \
1853 inline _Tpwvec v_expand_high(const _Tpvec& a)                        \
1854 { return _Tpwvec(__CV_CAT(intrin, _high)(a.val)); }                  \
1855 inline _Tpwvec v_load_expand(const _Tp* ptr)                         \
1856 {                                                                    \
1857     v128_t a = wasm_v128_load(ptr);                                  \
1858     return _Tpwvec(intrin(a));                                       \
1859 }
1860 
1861 OPENCV_HAL_IMPL_WASM_EXPAND(v_uint8x16, v_uint16x8, uchar, v128_cvtu8x16_i16x8)
1862 OPENCV_HAL_IMPL_WASM_EXPAND(v_int8x16,  v_int16x8,  schar, v128_cvti8x16_i16x8)
1863 OPENCV_HAL_IMPL_WASM_EXPAND(v_uint16x8, v_uint32x4, ushort, v128_cvtu16x8_i32x4)
1864 OPENCV_HAL_IMPL_WASM_EXPAND(v_int16x8,  v_int32x4,  short, v128_cvti16x8_i32x4)
1865 OPENCV_HAL_IMPL_WASM_EXPAND(v_uint32x4, v_uint64x2, unsigned, v128_cvtu32x4_i64x2)
1866 OPENCV_HAL_IMPL_WASM_EXPAND(v_int32x4,  v_int64x2,  int, v128_cvti32x4_i64x2)
1867 
1868 #define OPENCV_HAL_IMPL_WASM_EXPAND_Q(_Tpvec, _Tp, intrin)  \
1869 inline _Tpvec v_load_expand_q(const _Tp* ptr)               \
1870 {                                                           \
1871     v128_t a = wasm_v128_load(ptr);                         \
1872     return _Tpvec(intrin(a));                               \
1873 }
1874 
1875 OPENCV_HAL_IMPL_WASM_EXPAND_Q(v_uint32x4, uchar, v128_cvtu8x16_i32x4)
1876 OPENCV_HAL_IMPL_WASM_EXPAND_Q(v_int32x4, schar, v128_cvti8x16_i32x4)
1877 
1878 #define OPENCV_HAL_IMPL_WASM_UNPACKS(_Tpvec, suffix) \
1879 inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
1880 { \
1881     b0.val = wasm_unpacklo_##suffix(a0.val, a1.val); \
1882     b1.val = wasm_unpackhi_##suffix(a0.val, a1.val); \
1883 } \
1884 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
1885 { \
1886     return _Tpvec(wasm_unpacklo_i64x2(a.val, b.val)); \
1887 } \
1888 inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
1889 { \
1890     return _Tpvec(wasm_unpackhi_i64x2(a.val, b.val)); \
1891 } \
1892 inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
1893 { \
1894     c.val = wasm_unpacklo_i64x2(a.val, b.val); \
1895     d.val = wasm_unpackhi_i64x2(a.val, b.val); \
1896 }
1897 
1898 OPENCV_HAL_IMPL_WASM_UNPACKS(v_uint8x16, i8x16)
1899 OPENCV_HAL_IMPL_WASM_UNPACKS(v_int8x16, i8x16)
1900 OPENCV_HAL_IMPL_WASM_UNPACKS(v_uint16x8, i16x8)
1901 OPENCV_HAL_IMPL_WASM_UNPACKS(v_int16x8, i16x8)
1902 OPENCV_HAL_IMPL_WASM_UNPACKS(v_uint32x4, i32x4)
1903 OPENCV_HAL_IMPL_WASM_UNPACKS(v_int32x4, i32x4)
1904 OPENCV_HAL_IMPL_WASM_UNPACKS(v_float32x4, i32x4)
1905 OPENCV_HAL_IMPL_WASM_UNPACKS(v_float64x2, i64x2)
1906 
1907 template<int s, typename _Tpvec>
1908 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
1909 {
1910     return v_rotate_right<s>(a, b);
1911 }
1912 
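// Rounding helpers: v_floor truncates and then subtracts 1 where the truncated value exceeds the
// input (the comparison mask is all-ones, i.e. -1, so adding it performs the subtraction); v_ceil
// is the mirrored construction. The f64 variants below fall back to scalar cvRound/cvFloor/cvCeil.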
1913 inline v_int32x4 v_round(const v_float32x4& a)
1914 {
1915     v128_t h = wasm_f32x4_splat(0.5);
1916     return v_int32x4(wasm_i32x4_trunc_saturate_f32x4(wasm_f32x4_add(a.val, h)));
1917 }
1918 
1919 inline v_int32x4 v_floor(const v_float32x4& a)
1920 {
1921     v128_t a1 = wasm_i32x4_trunc_saturate_f32x4(a.val);
1922     v128_t mask = wasm_f32x4_lt(a.val, wasm_f32x4_convert_i32x4(a1));
1923     return v_int32x4(wasm_i32x4_add(a1, mask));
1924 }
1925 
1926 inline v_int32x4 v_ceil(const v_float32x4& a)
1927 {
1928     v128_t a1 = wasm_i32x4_trunc_saturate_f32x4(a.val);
1929     v128_t mask = wasm_f32x4_gt(a.val, wasm_f32x4_convert_i32x4(a1));
1930     return v_int32x4(wasm_i32x4_sub(a1, mask));
1931 }
1932 
1933 inline v_int32x4 v_trunc(const v_float32x4& a)
1934 { return v_int32x4(wasm_i32x4_trunc_saturate_f32x4(a.val)); }
1935 
1936 #define OPENCV_HAL_IMPL_WASM_MATH_FUNC(func, cfunc) \
1937 inline v_int32x4 func(const v_float64x2& a) \
1938 { \
1939     double a_[2]; \
1940     wasm_v128_store(a_, a.val); \
1941     int c_[4]; \
1942     c_[0] = cfunc(a_[0]); \
1943     c_[1] = cfunc(a_[1]); \
1944     c_[2] = 0; \
1945     c_[3] = 0; \
1946     return v_int32x4(wasm_v128_load(c_)); \
1947 }
1948 
1949 OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_round, cvRound)
1950 OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_floor, cvFloor)
1951 OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_ceil, cvCeil)
1952 OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_trunc, int)
1953 
1954 inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
1955 {
1956     double a_[2], b_[2];
1957     wasm_v128_store(a_, a.val);
1958     wasm_v128_store(b_, b.val);
1959     int c_[4];
1960     c_[0] = cvRound(a_[0]);
1961     c_[1] = cvRound(a_[1]);
1962     c_[2] = cvRound(b_[0]);
1963     c_[3] = cvRound(b_[1]);
1964     return v_int32x4(wasm_v128_load(c_));
1965 }
1966 
1967 #define OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(_Tpvec, suffix) \
1968 inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
1969                            const _Tpvec& a2, const _Tpvec& a3, \
1970                            _Tpvec& b0, _Tpvec& b1, \
1971                            _Tpvec& b2, _Tpvec& b3) \
1972 { \
1973     v128_t t0 = wasm_unpacklo_##suffix(a0.val, a1.val); \
1974     v128_t t1 = wasm_unpacklo_##suffix(a2.val, a3.val); \
1975     v128_t t2 = wasm_unpackhi_##suffix(a0.val, a1.val); \
1976     v128_t t3 = wasm_unpackhi_##suffix(a2.val, a3.val); \
1977 \
1978     b0.val = wasm_unpacklo_i64x2(t0, t1); \
1979     b1.val = wasm_unpackhi_i64x2(t0, t1); \
1980     b2.val = wasm_unpacklo_i64x2(t2, t3); \
1981     b3.val = wasm_unpackhi_i64x2(t2, t3); \
1982 }
1983 
1984 OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(v_uint32x4, i32x4)
1985 OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(v_int32x4, i32x4)
1986 OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(v_float32x4, i32x4)
1987 
1988 // load deinterleave
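// These helpers split interleaved channel data into per-channel registers with byte shuffles.
// Illustrative sketch (hypothetical BGR uchar buffer, not part of this header):
//   const uchar* bgr = ...;            // B0 G0 R0 B1 G1 R1 ...
//   v_uint8x16 b, g, r;
//   v_load_deinterleave(bgr, b, g, r); // b = {B0..B15}, g = {G0..G15}, r = {R0..R15}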
1989 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
1990 {
1991     v128_t t00 = wasm_v128_load(ptr);
1992     v128_t t01 = wasm_v128_load(ptr + 16);
1993 
1994     a.val = wasm_v8x16_shuffle(t00, t01, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30);
1995     b.val = wasm_v8x16_shuffle(t00, t01, 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31);
1996 }
1997 
1998 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
1999 {
2000     v128_t t00 = wasm_v128_load(ptr);
2001     v128_t t01 = wasm_v128_load(ptr + 16);
2002     v128_t t02 = wasm_v128_load(ptr + 32);
2003 
2004     v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,3,6,9,12,15,18,21,24,27,30,1,2,4,5,7);
2005     v128_t t11 = wasm_v8x16_shuffle(t00, t01, 1,4,7,10,13,16,19,22,25,28,31,0,2,3,5,6);
2006     v128_t t12 = wasm_v8x16_shuffle(t00, t01, 2,5,8,11,14,17,20,23,26,29,0,1,3,4,6,7);
2007 
2008     a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29);
2009     b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30);
2010     c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31);
2011 }
2012 
2013 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
2014 {
2015     v128_t u0 = wasm_v128_load(ptr); // a0 b0 c0 d0 a1 b1 c1 d1 ...
2016     v128_t u1 = wasm_v128_load(ptr + 16); // a4 b4 c4 d4 ...
2017     v128_t u2 = wasm_v128_load(ptr + 32); // a8 b8 c8 d8 ...
2018     v128_t u3 = wasm_v128_load(ptr + 48); // a12 b12 c12 d12 ...
2019 
2020     v128_t v0 = wasm_v8x16_shuffle(u0, u1, 0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29);
2021     v128_t v1 = wasm_v8x16_shuffle(u2, u3, 0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29);
2022     v128_t v2 = wasm_v8x16_shuffle(u0, u1, 2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31);
2023     v128_t v3 = wasm_v8x16_shuffle(u2, u3, 2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31);
2024 
2025     a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
2026     b.val = wasm_v8x16_shuffle(v0, v1, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
2027     c.val = wasm_v8x16_shuffle(v2, v3, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
2028     d.val = wasm_v8x16_shuffle(v2, v3, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
2029 }
2030 
2031 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
2032 {
2033     v128_t v0 = wasm_v128_load(ptr);     // a0 b0 a1 b1 a2 b2 a3 b3
2034     v128_t v1 = wasm_v128_load(ptr + 8); // a4 b4 a5 b5 a6 b6 a7 b7
2035 
2036     a.val = wasm_v8x16_shuffle(v0, v1, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29); // a0 a1 a2 a3 a4 a5 a6 a7
2037     b.val = wasm_v8x16_shuffle(v0, v1, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31); // b0 b1 b2 b3 b4 b5 b6 b7
2038 }
2039 
2040 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
2041 {
2042     v128_t t00 = wasm_v128_load(ptr);        // a0 b0 c0 a1 b1 c1 a2 b2
2043     v128_t t01 = wasm_v128_load(ptr + 8);    // c2 a3 b3 c3 a4 b4 c4 a5
2044     v128_t t02 = wasm_v128_load(ptr + 16);  // b5 c5 a6 b6 c6 a7 b7 c7
2045 
2046     v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,6,7,12,13,18,19,24,25,30,31,2,3,4,5);
2047     v128_t t11 = wasm_v8x16_shuffle(t00, t01, 2,3,8,9,14,15,20,21,26,27,0,1,4,5,6,7);
2048     v128_t t12 = wasm_v8x16_shuffle(t00, t01, 4,5,10,11,16,17,22,23,28,29,0,1,2,3,6,7);
2049 
2050     a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,26,27);
2051     b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,16,17,22,23,28,29);
2052     c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,8,9,18,19,24,25,30,31);
2053 }
2054 
2055 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
2056 {
2057     v128_t u0 = wasm_v128_load(ptr); // a0 b0 c0 d0 a1 b1 c1 d1
2058     v128_t u1 = wasm_v128_load(ptr + 8); // a2 b2 c2 d2 ...
2059     v128_t u2 = wasm_v128_load(ptr + 16); // a4 b4 c4 d4 ...
2060     v128_t u3 = wasm_v128_load(ptr + 24); // a6 b6 c6 d6 ...
2061 
2062     v128_t v0 = wasm_v8x16_shuffle(u0, u1, 0,1,8,9,16,17,24,25,2,3,10,11,18,19,26,27); // a0 a1 a2 a3 b0 b1 b2 b3
2063     v128_t v1 = wasm_v8x16_shuffle(u2, u3, 0,1,8,9,16,17,24,25,2,3,10,11,18,19,26,27); // a4 a5 a6 a7 b4 b5 b6 b7
2064     v128_t v2 = wasm_v8x16_shuffle(u0, u1, 4,5,12,13,20,21,28,29,6,7,14,15,22,23,30,31); // c0 c1 c2 c3 d0 d1 d2 d3
2065     v128_t v3 = wasm_v8x16_shuffle(u2, u3, 4,5,12,13,20,21,28,29,6,7,14,15,22,23,30,31); // c4 c5 c6 c7 d4 d5 d6 d7
2066 
2067     a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
2068     b.val = wasm_v8x16_shuffle(v0, v1, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
2069     c.val = wasm_v8x16_shuffle(v2, v3, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
2070     d.val = wasm_v8x16_shuffle(v2, v3, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);
2071 }
2072 
2073 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b)
2074 {
2075     v128_t v0 = wasm_v128_load(ptr);     // a0 b0 a1 b1
2076     v128_t v1 = wasm_v128_load(ptr + 4); // a2 b2 a3 b3
2077 
2078     a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); // a0 a1 a2 a3
2079     b.val = wasm_v8x16_shuffle(v0, v1, 4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); // b0 b1 b2 b3
2080 }
2081 
2082 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
2083 {
2084     v128_t t00 = wasm_v128_load(ptr);        // a0 b0 c0 a1
2085     v128_t t01 = wasm_v128_load(ptr + 4);     // b1 c1 a2 b2
2086     v128_t t02 = wasm_v128_load(ptr + 8);    // c2 a3 b3 c3
2087 
2088     v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,2,3,12,13,14,15,24,25,26,27,4,5,6,7);
2089     v128_t t11 = wasm_v8x16_shuffle(t00, t01, 4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3);
2090     v128_t t12 = wasm_v8x16_shuffle(t00, t01, 8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7);
2091 
2092     a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
2093     b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
2094     c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
2095 }
2096 
2097 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
2098 {
2099     v_uint32x4 s0(wasm_v128_load(ptr));      // a0 b0 c0 d0
2100     v_uint32x4 s1(wasm_v128_load(ptr + 4));  // a1 b1 c1 d1
2101     v_uint32x4 s2(wasm_v128_load(ptr + 8));  // a2 b2 c2 d2
2102     v_uint32x4 s3(wasm_v128_load(ptr + 12)); // a3 b3 c3 d3
2103 
2104     v_transpose4x4(s0, s1, s2, s3, a, b, c, d);
2105 }
2106 
2107 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
2108 {
2109     v128_t v0 = wasm_v128_load(ptr);       // a0 b0 a1 b1
2110     v128_t v1 = wasm_v128_load((ptr + 4)); // a2 b2 a3 b3
2111 
2112     a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); // a0 a1 a2 a3
2113     b.val = wasm_v8x16_shuffle(v0, v1, 4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); // b0 b1 b2 b3
2114 }
2115 
2116 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c)
2117 {
2118     v128_t t00 = wasm_v128_load(ptr);        // a0 b0 c0 a1
2119     v128_t t01 = wasm_v128_load(ptr + 4);     // b1 c1 a2 b2
2120     v128_t t02 = wasm_v128_load(ptr + 8);    // c2 a3 b3 c3
2121 
2122     v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,2,3,12,13,14,15,24,25,26,27,4,5,6,7);
2123     v128_t t11 = wasm_v8x16_shuffle(t00, t01, 4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3);
2124     v128_t t12 = wasm_v8x16_shuffle(t00, t01, 8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7);
2125 
2126     a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
2127     b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
2128     c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
2129 }
2130 
2131 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c, v_float32x4& d)
2132 {
2133     v_float32x4 s0(wasm_v128_load(ptr));      // a0 b0 c0 d0
2134     v_float32x4 s1(wasm_v128_load(ptr + 4));  // a1 b1 c1 d1
2135     v_float32x4 s2(wasm_v128_load(ptr + 8));  // a2 b2 c2 d2
2136     v_float32x4 s3(wasm_v128_load(ptr + 12)); // a3 b3 c3 d3
2137 
2138     v_transpose4x4(s0, s1, s2, s3, a, b, c, d);
2139 }
2140 
2141 inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b)
2142 {
2143     v128_t t0 = wasm_v128_load(ptr);      // a0 b0
2144     v128_t t1 = wasm_v128_load(ptr + 2);  // a1 b1
2145 
2146     a.val = wasm_unpacklo_i64x2(t0, t1);
2147     b.val = wasm_unpackhi_i64x2(t0, t1);
2148 }
2149 
2150 inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
2151 {
2152     v128_t t0 = wasm_v128_load(ptr);     // a0, b0
2153     v128_t t1 = wasm_v128_load(ptr + 2); // c0, a1
2154     v128_t t2 = wasm_v128_load(ptr + 4); // b1, c1
2155 
2156     a.val = wasm_v8x16_shuffle(t0, t1, 0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31);
2157     b.val = wasm_v8x16_shuffle(t0, t2, 8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23);
2158     c.val = wasm_v8x16_shuffle(t1, t2, 0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31);
2159 }
2160 
2161 inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a,
2162                                 v_uint64x2& b, v_uint64x2& c, v_uint64x2& d)
2163 {
2164     v128_t t0 = wasm_v128_load(ptr);     // a0 b0
2165     v128_t t1 = wasm_v128_load(ptr + 2); // c0 d0
2166     v128_t t2 = wasm_v128_load(ptr + 4); // a1 b1
2167     v128_t t3 = wasm_v128_load(ptr + 6); // c1 d1
2168 
2169     a.val = wasm_unpacklo_i64x2(t0, t2);
2170     b.val = wasm_unpackhi_i64x2(t0, t2);
2171     c.val = wasm_unpacklo_i64x2(t1, t3);
2172     d.val = wasm_unpackhi_i64x2(t1, t3);
2173 }
2174 
2175 // store interleave
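// The inverse of v_load_deinterleave: per-channel registers are merged back into interleaved
// memory with unpack/shuffle sequences. The trailing hal::StoreMode argument is accepted only for
// API compatibility; every store here uses wasm_v128_store, which does not require alignment.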
2176 
2177 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2178                                 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2179 {
2180     v128_t v0 = wasm_unpacklo_i8x16(a.val, b.val);
2181     v128_t v1 = wasm_unpackhi_i8x16(a.val, b.val);
2182 
2183     wasm_v128_store(ptr, v0);
2184     wasm_v128_store(ptr + 16, v1);
2185 }
2186 
2187 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2188                                 const v_uint8x16& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2189 {
2190     v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5);
2191     v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 21,0,6,22,0,7,23,0,8,24,0,9,25,0,10,26);
2192     v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0);
2193 
2194     v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15);
2195     v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15);
2196     v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31);
2197 
2198     wasm_v128_store(ptr, t10);
2199     wasm_v128_store(ptr + 16, t11);
2200     wasm_v128_store(ptr + 32, t12);
2201 }
2202 
2203 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2204                                 const v_uint8x16& c, const v_uint8x16& d,
2205                                 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2206 {
2207     // a0 a1 a2 a3 ....
2208     // b0 b1 b2 b3 ....
2209     // c0 c1 c2 c3 ....
2210     // d0 d1 d2 d3 ....
2211     v128_t u0 = wasm_unpacklo_i8x16(a.val, c.val); // a0 c0 a1 c1 ...
2212     v128_t u1 = wasm_unpackhi_i8x16(a.val, c.val); // a8 c8 a9 c9 ...
2213     v128_t u2 = wasm_unpacklo_i8x16(b.val, d.val); // b0 d0 b1 d1 ...
2214     v128_t u3 = wasm_unpackhi_i8x16(b.val, d.val); // b8 d8 b9 d9 ...
2215 
2216     v128_t v0 = wasm_unpacklo_i8x16(u0, u2); // a0 b0 c0 d0 ...
2217     v128_t v1 = wasm_unpackhi_i8x16(u0, u2); // a4 b4 c4 d4 ...
2218     v128_t v2 = wasm_unpacklo_i8x16(u1, u3); // a8 b8 c8 d8 ...
2219     v128_t v3 = wasm_unpackhi_i8x16(u1, u3); // a12 b12 c12 d12 ...
2220 
2221     wasm_v128_store(ptr, v0);
2222     wasm_v128_store(ptr + 16, v1);
2223     wasm_v128_store(ptr + 32, v2);
2224     wasm_v128_store(ptr + 48, v3);
2225 }
2226 
2227 inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
2228                                 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2229 {
2230     v128_t v0 = wasm_unpacklo_i16x8(a.val, b.val);
2231     v128_t v1 = wasm_unpackhi_i16x8(a.val, b.val);
2232 
2233     wasm_v128_store(ptr, v0);
2234     wasm_v128_store(ptr + 8, v1);
2235 }
2236 
2237 inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
2238                                 const v_uint16x8& b, const v_uint16x8& c,
2239                                 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2240 {
2241     v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,16,17,0,0,2,3,18,19,0,0,4,5,20,21);
2242     v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 0,0,6,7,22,23,0,0,8,9,24,25,0,0,10,11);
2243     v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 26,27,0,0,12,13,28,29,0,0,14,15,30,31,0,0);
2244 
2245     v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,16,17,6,7,8,9,18,19,12,13,14,15);
2246     v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 20,21,2,3,4,5,22,23,8,9,10,11,24,25,14,15);
2247     v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 0,1,26,27,4,5,6,7,28,29,10,11,12,13,30,31);
2248 
2249     wasm_v128_store(ptr, t10);
2250     wasm_v128_store(ptr + 8, t11);
2251     wasm_v128_store(ptr + 16, t12);
2252 }
2253 
2254 inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
2255                                 const v_uint16x8& c, const v_uint16x8& d,
2256                                 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2257 {
2258     // a0 a1 a2 a3 ....
2259     // b0 b1 b2 b3 ....
2260     // c0 c1 c2 c3 ....
2261     // d0 d1 d2 d3 ....
2262     v128_t u0 = wasm_unpacklo_i16x8(a.val, c.val); // a0 c0 a1 c1 ...
2263     v128_t u1 = wasm_unpackhi_i16x8(a.val, c.val); // a4 c4 a5 c5 ...
2264     v128_t u2 = wasm_unpacklo_i16x8(b.val, d.val); // b0 d0 b1 d1 ...
2265     v128_t u3 = wasm_unpackhi_i16x8(b.val, d.val); // b4 d4 b5 d5 ...
2266 
2267     v128_t v0 = wasm_unpacklo_i16x8(u0, u2); // a0 b0 c0 d0 ...
2268     v128_t v1 = wasm_unpackhi_i16x8(u0, u2); // a2 b2 c2 d2 ...
2269     v128_t v2 = wasm_unpacklo_i16x8(u1, u3); // a4 b4 c4 d4 ...
2270     v128_t v3 = wasm_unpackhi_i16x8(u1, u3); // a6 b6 c6 d6 ...
2271 
2272     wasm_v128_store(ptr, v0);
2273     wasm_v128_store(ptr + 8, v1);
2274     wasm_v128_store(ptr + 16, v2);
2275     wasm_v128_store(ptr + 24, v3);
2276 }
2277 
2278 inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2279                                 hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2280 {
2281     v128_t v0 = wasm_unpacklo_i32x4(a.val, b.val);
2282     v128_t v1 = wasm_unpackhi_i32x4(a.val, b.val);
2283 
2284     wasm_v128_store(ptr, v0);
2285     wasm_v128_store(ptr + 4, v1);
2286 }
2287 
2288 inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2289                                 const v_uint32x4& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2290 {
2291     v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,16,17,18,19,0,0,0,0,4,5,6,7);
2292     v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 20,21,22,23,0,0,0,0,8,9,10,11,24,25,26,27);
2293     v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,0,0,0,12,13,14,15,28,29,30,31,0,0,0,0);
2294 
2295     v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,4,5,6,7,16,17,18,19,12,13,14,15);
2296     v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,1,2,3,20,21,22,23,8,9,10,11,12,13,14,15);
2297     v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 24,25,26,27,4,5,6,7,8,9,10,11,28,29,30,31);
2298 
2299     wasm_v128_store(ptr, t10);
2300     wasm_v128_store(ptr + 4, t11);
2301     wasm_v128_store(ptr + 8, t12);
2302 }
2303 
2304 inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2305                                const v_uint32x4& c, const v_uint32x4& d,
2306                                hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2307 {
2308     v_uint32x4 v0, v1, v2, v3;
2309     v_transpose4x4(a, b, c, d, v0, v1, v2, v3);
2310 
2311     wasm_v128_store(ptr, v0.val);
2312     wasm_v128_store(ptr + 4, v1.val);
2313     wasm_v128_store(ptr + 8, v2.val);
2314     wasm_v128_store(ptr + 12, v3.val);
2315 }
2316 
2317 // 2-channel, float only
2318 inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2319                                hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2320 {
2321     v128_t v0 = wasm_unpacklo_i32x4(a.val, b.val);
2322     v128_t v1 = wasm_unpackhi_i32x4(a.val, b.val);
2323 
2324     wasm_v128_store(ptr, v0);
2325     wasm_v128_store(ptr + 4, v1);
2326 }
2327 
v_store_interleave(float * ptr,const v_float32x4 & a,const v_float32x4 & b,const v_float32x4 & c,hal::StoreMode=hal::STORE_UNALIGNED)2328 inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2329                                const v_float32x4& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2330 {
2331     v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,16,17,18,19,0,0,0,0,4,5,6,7);
2332     v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 20,21,22,23,0,0,0,0,8,9,10,11,24,25,26,27);
2333     v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,0,0,0,12,13,14,15,28,29,30,31,0,0,0,0);
2334 
2335     v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,4,5,6,7,16,17,18,19,12,13,14,15);
2336     v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,1,2,3,20,21,22,23,8,9,10,11,12,13,14,15);
2337     v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 24,25,26,27,4,5,6,7,8,9,10,11,28,29,30,31);
2338 
2339     wasm_v128_store(ptr, t10);
2340     wasm_v128_store(ptr + 4, t11);
2341     wasm_v128_store(ptr + 8, t12);
2342 }
2343 
v_store_interleave(float * ptr,const v_float32x4 & a,const v_float32x4 & b,const v_float32x4 & c,const v_float32x4 & d,hal::StoreMode=hal::STORE_UNALIGNED)2344 inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2345                                const v_float32x4& c, const v_float32x4& d,
2346                                hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2347 {
2348     v_float32x4 v0, v1, v2, v3;
2349     v_transpose4x4(a, b, c, d, v0, v1, v2, v3);
2350 
2351     wasm_v128_store(ptr, v0.val);
2352     wasm_v128_store(ptr + 4, v1.val);
2353     wasm_v128_store(ptr + 8, v2.val);
2354     wasm_v128_store(ptr + 12, v3.val);
2355 }
2356 
v_store_interleave(uint64 * ptr,const v_uint64x2 & a,const v_uint64x2 & b,hal::StoreMode=hal::STORE_UNALIGNED)2357 inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
2358                                hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
2359 {
2360     v128_t v0 = wasm_unpacklo_i64x2(a.val, b.val);
2361     v128_t v1 = wasm_unpackhi_i64x2(a.val, b.val);
2362 
2363     wasm_v128_store(ptr, v0);
2364     wasm_v128_store(ptr + 2, v1);
2365 }
2366 
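// 3-channel interleave of 64-bit lanes: the shuffles produce (a0, b0), (c0, a1), (b1, c1),
// giving the stored sequence a0 b0 c0 a1 b1 c1.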
inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
                               const v_uint64x2& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
{
    v128_t v0 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
    v128_t v1 = wasm_v8x16_shuffle(a.val, c.val, 16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15);
    v128_t v2 = wasm_v8x16_shuffle(b.val, c.val, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31);

    wasm_v128_store(ptr, v0);
    wasm_v128_store(ptr + 2, v1);
    wasm_v128_store(ptr + 4, v2);
}

inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
                               const v_uint64x2& c, const v_uint64x2& d,
                               hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED)
{
    v128_t v0 = wasm_unpacklo_i64x2(a.val, b.val);
    v128_t v1 = wasm_unpacklo_i64x2(c.val, d.val);
    v128_t v2 = wasm_unpackhi_i64x2(a.val, b.val);
    v128_t v3 = wasm_unpackhi_i64x2(c.val, d.val);

    wasm_v128_store(ptr, v0);
    wasm_v128_store(ptr + 2, v1);
    wasm_v128_store(ptr + 4, v2);
    wasm_v128_store(ptr + 6, v3);
}

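// The signed, 64-bit and double overloads below reuse the unsigned implementations above:
// the vectors are reinterpreted to the matching unsigned type, processed, and reinterpreted back.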
#define OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
{ \
    _Tpvec1 a1, b1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
} \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
{ \
    _Tpvec1 a1, b1, c1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
    c0 = v_reinterpret_as_##suffix0(c1); \
} \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
{ \
    _Tpvec1 a1, b1, c1, d1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
    c0 = v_reinterpret_as_##suffix0(c1); \
    d0 = v_reinterpret_as_##suffix0(d1); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                hal::StoreMode mode = hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, mode);      \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                const _Tpvec0& c0, hal::StoreMode mode = hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode);  \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                const _Tpvec0& c0, const _Tpvec0& d0, \
                                hal::StoreMode mode = hal::STORE_UNALIGNED ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
    _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
}

OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int64x2, int64, s64, v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, uint64, u64)

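// Conversions: i32 -> f32 uses wasm_f32x4_convert_i32x4 directly; all paths involving f64
// fall back to scalar round-trips, except where __wasm_unimplemented_simd128__ provides
// an i64x2 -> f64x2 conversion.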
inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{
    return v_float32x4(wasm_f32x4_convert_i32x4(a.val));
}

inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{
    double a_[2];
    wasm_v128_store(a_, a.val);
    float c_[4];
    c_[0] = (float)(a_[0]);
    c_[1] = (float)(a_[1]);
    c_[2] = 0;
    c_[3] = 0;
    return v_float32x4(wasm_v128_load(c_));
}

inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{
    double a_[2], b_[2];
    wasm_v128_store(a_, a.val);
    wasm_v128_store(b_, b.val);
    float c_[4];
    c_[0] = (float)(a_[0]);
    c_[1] = (float)(a_[1]);
    c_[2] = (float)(b_[0]);
    c_[3] = (float)(b_[1]);
    return v_float32x4(wasm_v128_load(c_));
}

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
#ifdef __wasm_unimplemented_simd128__
    v128_t p = v128_cvti32x4_i64x2(a.val);
    return v_float64x2(wasm_f64x2_convert_i64x2(p));
#else
    int a_[4];
    wasm_v128_store(a_, a.val);
    double c_[2];
    c_[0] = (double)(a_[0]);
    c_[1] = (double)(a_[1]);
    return v_float64x2(wasm_v128_load(c_));
#endif
}

inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{
#ifdef __wasm_unimplemented_simd128__
    v128_t p = v128_cvti32x4_i64x2_high(a.val);
    return v_float64x2(wasm_f64x2_convert_i64x2(p));
#else
    int a_[4];
    wasm_v128_store(a_, a.val);
    double c_[2];
    c_[0] = (double)(a_[2]);
    c_[1] = (double)(a_[3]);
    return v_float64x2(wasm_v128_load(c_));
#endif
}

inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{
    float a_[4];
    wasm_v128_store(a_, a.val);
    double c_[2];
    c_[0] = (double)(a_[0]);
    c_[1] = (double)(a_[1]);
    return v_float64x2(wasm_v128_load(c_));
}

inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{
    float a_[4];
    wasm_v128_store(a_, a.val);
    double c_[2];
    c_[0] = (double)(a_[2]);
    c_[1] = (double)(a_[3]);
    return v_float64x2(wasm_v128_load(c_));
}

inline v_float64x2 v_cvt_f64(const v_int64x2& a)
{
#ifdef __wasm_unimplemented_simd128__
    return v_float64x2(wasm_f64x2_convert_i64x2(a.val));
#else
    int64 a_[2];
    wasm_v128_store(a_, a.val);
    double c_[2];
    c_[0] = (double)(a_[0]);
    c_[1] = (double)(a_[1]);
    return v_float64x2(wasm_v128_load(c_));
#endif
}

////////////// Lookup table access ////////////////////

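// v_lut gathers one element per index; v_lut_pairs and v_lut_quads read 2 and 4
// consecutive elements starting at each index, so they consume fewer indices.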
inline v_int8x16 v_lut(const schar* tab, const int* idx)
{
    return v_int8x16(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
                     tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]);
}
inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
{
    return v_int8x16(tab[idx[0]], tab[idx[0]+1], tab[idx[1]], tab[idx[1]+1], tab[idx[2]], tab[idx[2]+1], tab[idx[3]], tab[idx[3]+1],
                     tab[idx[4]], tab[idx[4]+1], tab[idx[5]], tab[idx[5]+1], tab[idx[6]], tab[idx[6]+1], tab[idx[7]], tab[idx[7]+1]);
}
inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
{
    return v_int8x16(tab[idx[0]], tab[idx[0]+1], tab[idx[0]+2], tab[idx[0]+3], tab[idx[1]], tab[idx[1]+1], tab[idx[1]+2], tab[idx[1]+3],
                     tab[idx[2]], tab[idx[2]+1], tab[idx[2]+2], tab[idx[2]+3], tab[idx[3]], tab[idx[3]+1], tab[idx[3]+2], tab[idx[3]+3]);
}
inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((const schar *)tab, idx)); }
inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((const schar *)tab, idx)); }
inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((const schar *)tab, idx)); }

inline v_int16x8 v_lut(const short* tab, const int* idx)
{
    return v_int16x8(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
                     tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]);
}
inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
{
    return v_int16x8(tab[idx[0]], tab[idx[0]+1], tab[idx[1]], tab[idx[1]+1],
                     tab[idx[2]], tab[idx[2]+1], tab[idx[3]], tab[idx[3]+1]);
}
inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
{
    return v_int16x8(tab[idx[0]], tab[idx[0]+1], tab[idx[0]+2], tab[idx[0]+3],
                     tab[idx[1]], tab[idx[1]+1], tab[idx[1]+2], tab[idx[1]+3]);
}
inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((const short *)tab, idx)); }
inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((const short *)tab, idx)); }
inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((const short *)tab, idx)); }

inline v_int32x4 v_lut(const int* tab, const int* idx)
{
    return v_int32x4(tab[idx[0]], tab[idx[1]],
                     tab[idx[2]], tab[idx[3]]);
}
inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
{
    return v_int32x4(tab[idx[0]], tab[idx[0]+1],
                     tab[idx[1]], tab[idx[1]+1]);
}
inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
{
    return v_int32x4(wasm_v128_load(tab + idx[0]));
}
inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int *)tab, idx)); }
inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int *)tab, idx)); }
inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int *)tab, idx)); }

inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
{
    return v_int64x2(tab[idx[0]], tab[idx[1]]);
}
inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
{
    return v_int64x2(wasm_v128_load(tab + idx[0]));
}
inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }

inline v_float32x4 v_lut(const float* tab, const int* idx)
{
    return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
}
inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_pairs((const int *)tab, idx)); }
inline v_float32x4 v_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_quads((const int *)tab, idx)); }

inline v_float64x2 v_lut(const double* tab, const int* idx)
{
    return v_float64x2(tab[idx[0]], tab[idx[1]]);
}
inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
{
    return v_float64x2(wasm_v128_load(tab + idx[0]));
}

inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
    return v_int32x4(tab[wasm_i32x4_extract_lane(idxvec.val, 0)],
                     tab[wasm_i32x4_extract_lane(idxvec.val, 1)],
                     tab[wasm_i32x4_extract_lane(idxvec.val, 2)],
                     tab[wasm_i32x4_extract_lane(idxvec.val, 3)]);
}

inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
{
    return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec));
}

inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
    return v_float32x4(tab[wasm_i32x4_extract_lane(idxvec.val, 0)],
                       tab[wasm_i32x4_extract_lane(idxvec.val, 1)],
                       tab[wasm_i32x4_extract_lane(idxvec.val, 2)],
                       tab[wasm_i32x4_extract_lane(idxvec.val, 3)]);
}

inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
    return v_float64x2(tab[wasm_i32x4_extract_lane(idxvec.val, 0)],
                       tab[wasm_i32x4_extract_lane(idxvec.val, 1)]);
}

// loads pairs from the table and deinterleaves them, e.g. returns:
//   x = (tab[idxvec[0]], tab[idxvec[1]], tab[idxvec[2]], tab[idxvec[3]]),
//   y = (tab[idxvec[0]+1], tab[idxvec[1]+1], tab[idxvec[2]+1], tab[idxvec[3]+1])
// note that the indices are indices of individual floats, not of float pairs.
// in theory, this function can be used to implement bilinear interpolation,
// when idxvec are the offsets within the image.
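// A minimal usage sketch (illustrative only, not part of this header): horizontal linear
// interpolation of one row, assuming hypothetical variables src_row (const float*),
// ofs (v_int32x4 with the offsets of the left neighbours) and w (v_float32x4 of fractional weights):
//   v_float32x4 x0, x1;
//   v_lut_deinterleave(src_row, ofs, x0, x1);
//   v_float32x4 res = v_muladd(x1 - x0, w, x0);  // x0 + (x1 - x0) * w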
inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
{
    x = v_float32x4(tab[wasm_i32x4_extract_lane(idxvec.val, 0)],
                    tab[wasm_i32x4_extract_lane(idxvec.val, 1)],
                    tab[wasm_i32x4_extract_lane(idxvec.val, 2)],
                    tab[wasm_i32x4_extract_lane(idxvec.val, 3)]);
    y = v_float32x4(tab[wasm_i32x4_extract_lane(idxvec.val, 0)+1],
                    tab[wasm_i32x4_extract_lane(idxvec.val, 1)+1],
                    tab[wasm_i32x4_extract_lane(idxvec.val, 2)+1],
                    tab[wasm_i32x4_extract_lane(idxvec.val, 3)+1]);
}

inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
{
    v128_t xy0 = wasm_v128_load(tab + wasm_i32x4_extract_lane(idxvec.val, 0));
    v128_t xy1 = wasm_v128_load(tab + wasm_i32x4_extract_lane(idxvec.val, 1));
    x.val = wasm_unpacklo_i64x2(xy0, xy1);
    y.val = wasm_unpackhi_i64x2(xy0, xy1);
}

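// In-register reordering: v_interleave_pairs permutes every 4 elements as (x0 x2 x1 x3),
// v_interleave_quads permutes every 8 elements as (x0 x4 x1 x5 x2 x6 x3 x7).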
inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
{
    return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15));
}
inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
{
    return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,4,1,5,2,6,3,7,8,12,9,13,10,14,11,15));
}
inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }

inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
{
    return v_int16x8(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15));
}
inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
{
    return v_int16x8(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15));
}
inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }

inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
{
    return v_int32x4(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,8,9,10,11,4,5,6,7,12,13,14,15));
}
inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_float32x4 v_interleave_pairs(const v_float32x4& vec)
{
    return v_float32x4(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,8,9,10,11,4,5,6,7,12,13,14,15));
}

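// v_pack_triplets drops every fourth element and packs the remaining triplets to the front;
// the contents of the trailing lanes are not meaningful. For 32-bit lanes it is a no-op.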
inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
{
    return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,4,5,6,8,9,10,12,13,14,16,16,16,16));
}
inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }

inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
{
    return v_int16x8(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,4,5,8,9,10,11,12,13,14,15,6,7));
}
inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }

inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }

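// v_extract_n<i> returns lane i by rotating the vector right by i lanes and reading lane 0;
// v_broadcast_element<i> splats that lane across all lanes of the result.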
template<int i, typename _Tp>
inline typename _Tp::lane_type v_extract_n(const _Tp& a)
{
    return v_rotate_right<i>(a).get0();
}

template<int i>
inline v_uint32x4 v_broadcast_element(const v_uint32x4& a)
{
    return v_setall_u32(v_extract_n<i>(a));
}
template<int i>
inline v_int32x4 v_broadcast_element(const v_int32x4& a)
{
    return v_setall_s32(v_extract_n<i>(a));
}
template<int i>
inline v_float32x4 v_broadcast_element(const v_float32x4& a)
{
    return v_setall_f32(v_extract_n<i>(a));
}


////////////// FP16 support ///////////////////////////

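// No packed f16 instructions are used here: values are converted lane by lane
// through the float16_t scalar type.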
inline v_float32x4 v_load_expand(const float16_t* ptr)
{
    float a[4];
    for (int i = 0; i < 4; i++)
        a[i] = ptr[i];
    return v_float32x4(wasm_v128_load(a));
}

inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
{
    float v_[4];
    wasm_v128_store(v_, v.val);
    ptr[0] = float16_t(v_[0]);
    ptr[1] = float16_t(v_[1]);
    ptr[2] = float16_t(v_[2]);
    ptr[3] = float16_t(v_[3]);
}

inline void v_cleanup() {}

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

//! @endcond

}

#endif