// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html

// Copyright (C) 2015, PingTouGe Semiconductor Co., Ltd., all rights reserved.

#ifndef OPENCV_HAL_INTRIN_RISCVV_HPP
#define OPENCV_HAL_INTRIN_RISCVV_HPP

#include <float.h>
#include <algorithm>
#include "opencv2/core/utility.hpp"

namespace cv
{

//! @cond IGNORED

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

#define CV_SIMD128 1
#define CV_SIMD128_64F 1
//////////// Types ////////////
struct v_uint8x16
{
    typedef uchar lane_type;
    enum { nlanes = 16 };

    v_uint8x16() {}
    explicit v_uint8x16(vuint8m1_t v) : val(v) {}
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    {
        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = (vuint8m1_t)vle_v_u8m1((unsigned char*)v, 16);
    }
    uchar get0() const
    {
        return vmv_x_s_u8m1_u8(val, 16);
    }

    vuint8m1_t val;
};

struct v_int8x16
{
    typedef schar lane_type;
    enum { nlanes = 16 };

    v_int8x16() {}
    explicit v_int8x16(vint8m1_t v) : val(v) {}
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    {
        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = (vint8m1_t)vle_v_i8m1((schar*)v, 16);
    }
    schar get0() const
    {
        return vmv_x_s_i8m1_i8(val, 16);
    }

    vint8m1_t val;
};

struct v_uint16x8
{
    typedef ushort lane_type;
    enum { nlanes = 8 };

    v_uint16x8() {}
    explicit v_uint16x8(vuint16m1_t v) : val(v) {}
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = (vuint16m1_t)vle_v_u16m1((unsigned short*)v, 8);
    }
    ushort get0() const
    {
        return vmv_x_s_u16m1_u16(val, 8);
    }

    vuint16m1_t val;
};

struct v_int16x8
{
    typedef short lane_type;
    enum { nlanes = 8 };

    v_int16x8() {}
    explicit v_int16x8(vint16m1_t v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = (vint16m1_t)vle_v_i16m1((signed short*)v, 8);
    }
    short get0() const
    {
        return vmv_x_s_i16m1_i16(val, 8);
    }

    vint16m1_t val;
};

struct v_uint32x4
{
    typedef unsigned lane_type;
    enum { nlanes = 4 };

    v_uint32x4() {}
    explicit v_uint32x4(vuint32m1_t v) : val(v) {}
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    {
        unsigned v[] = {v0, v1, v2, v3};
        val = (vuint32m1_t)vle_v_u32m1((unsigned int*)v, 4);
    }
    unsigned get0() const
    {
        return vmv_x_s_u32m1_u32(val, 4);
    }

    vuint32m1_t val;
};

struct v_int32x4
{
    typedef int lane_type;
    enum { nlanes = 4 };

    v_int32x4() {}
    explicit v_int32x4(vint32m1_t v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)
    {
        int v[] = {v0, v1, v2, v3};
        val = (vint32m1_t)vle_v_i32m1((signed int*)v, 4);
    }
    int get0() const
    {
        return vmv_x_s_i32m1_i32(val, 4);
    }
    vint32m1_t val;
};

struct v_float32x4
{
    typedef float lane_type;
    enum { nlanes = 4 };

    v_float32x4() {}
    explicit v_float32x4(vfloat32m1_t v) : val(v) {}
    v_float32x4(float v0, float v1, float v2, float v3)
    {
        float v[] = {v0, v1, v2, v3};
        val = (vfloat32m1_t)vle_v_f32m1((float*)v, 4);
    }
    float get0() const
    {
        return vfmv_f_s_f32m1_f32(val, 4);
    }
    vfloat32m1_t val;
};

struct v_uint64x2
{
    typedef uint64 lane_type;
    enum { nlanes = 2 };

    v_uint64x2() {}
    explicit v_uint64x2(vuint64m1_t v) : val(v) {}
    v_uint64x2(uint64 v0, uint64 v1)
    {
        uint64 v[] = {v0, v1};
        val = (vuint64m1_t)vle_v_u64m1((unsigned long*)v, 2);
    }
    uint64 get0() const
    {
        return vmv_x_s_u64m1_u64(val, 2);
    }
    vuint64m1_t val;
};

struct v_int64x2
{
    typedef int64 lane_type;
    enum { nlanes = 2 };

    v_int64x2() {}
    explicit v_int64x2(vint64m1_t v) : val(v) {}
    v_int64x2(int64 v0, int64 v1)
    {
        int64 v[] = {v0, v1};
        val = (vint64m1_t)vle_v_i64m1((long*)v, 2);
    }
    int64 get0() const
    {
        return vmv_x_s_i64m1_i64(val, 2);
    }
    vint64m1_t val;
};

struct v_float64x2
{
    typedef double lane_type;
    enum { nlanes = 2 };

    v_float64x2() {}
    explicit v_float64x2(vfloat64m1_t v) : val(v) {}
    v_float64x2(double v0, double v1)
    {
        double v[] = {v0, v1};
        val = (vfloat64m1_t)vle_v_f64m1((double*)v, 2);
    }
    double get0() const
    {
        return vfmv_f_s_f64m1_f64(val, 2);
    }
    vfloat64m1_t val;
};
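
// Illustrative usage of the 128-bit wrapper types above (a sketch, not part
// of the API surface of this header): each v_*x* type wraps one RVV m1
// register and exposes its scalar element type and lane count.
//
//     v_float32x4 v(1.f, 2.f, 3.f, 4.f);     // pack four floats into one vector
//     float first = v.get0();                // read lane 0, i.e. 1.f
//     int lanes = v_float32x4::nlanes;       // 4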

#define OPENCV_HAL_IMPL_RISCVV_INIT(_Tpv, _Tp, suffix) \
inline _Tp##m1_t vreinterpretq_##suffix##_##suffix(_Tp##m1_t v) { return v; } \
inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16((vuint8m1_t)(v.val)); } \
inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16((vint8m1_t)(v.val)); } \
inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8((vuint16m1_t)(v.val)); } \
inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8((vint16m1_t)(v.val)); } \
inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4((vuint32m1_t)(v.val)); } \
inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4((vint32m1_t)(v.val)); } \
inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2((vuint64m1_t)(v.val)); } \
inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2((vint64m1_t)(v.val)); } \
inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4((vfloat32m1_t)(v.val)); }\
inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2((vfloat64m1_t)(v.val)); }


OPENCV_HAL_IMPL_RISCVV_INIT(uint8x16, vuint8, u8)
OPENCV_HAL_IMPL_RISCVV_INIT(int8x16, vint8, s8)
OPENCV_HAL_IMPL_RISCVV_INIT(uint16x8, vuint16, u16)
OPENCV_HAL_IMPL_RISCVV_INIT(int16x8, vint16, s16)
OPENCV_HAL_IMPL_RISCVV_INIT(uint32x4, vuint32, u32)
OPENCV_HAL_IMPL_RISCVV_INIT(int32x4, vint32, s32)
OPENCV_HAL_IMPL_RISCVV_INIT(uint64x2, vuint64, u64)
OPENCV_HAL_IMPL_RISCVV_INIT(int64x2, vint64, s64)
OPENCV_HAL_IMPL_RISCVV_INIT(float64x2, vfloat64, f64)
OPENCV_HAL_IMPL_RISCVV_INIT(float32x4, vfloat32, f32)
#define OPENCV_HAL_IMPL_RISCVV_INIT_SET(__Tp, _Tp, suffix, len, num) \
inline v_##_Tp##x##num v_setzero_##suffix() { return v_##_Tp##x##num((v##_Tp##m1_t){0}); } \
inline v_##_Tp##x##num v_setall_##suffix(__Tp v) { return v_##_Tp##x##num(vmv_v_x_##len##m1(v, num)); }

OPENCV_HAL_IMPL_RISCVV_INIT_SET(uchar, uint8, u8, u8, 16)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(char, int8, s8, i8, 16)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(ushort, uint16, u16, u16, 8)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(short, int16, s16, i16, 8)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned int, uint32, u32, u32, 4)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(int, int32, s32, i32, 4)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned long, uint64, u64, u64, 2)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(long, int64, s64, i64, 2)
inline v_float32x4 v_setzero_f32() { return v_float32x4((vfloat32m1_t){0}); }
inline v_float32x4 v_setall_f32(float v) { return v_float32x4(vfmv_v_f_f32m1(v, 4)); }

inline v_float64x2 v_setzero_f64() { return v_float64x2(vfmv_v_f_f64m1(0, 2)); }
inline v_float64x2 v_setall_f64(double v) { return v_float64x2(vfmv_v_f_f64m1(v, 2)); }
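
// Illustrative use of the initializers and reinterpret casts above (a sketch;
// all names are defined in this header):
//
//     v_int32x4 z = v_setzero_s32();           // {0, 0, 0, 0}
//     v_int32x4 k = v_setall_s32(7);           // {7, 7, 7, 7}
//     v_uint32x4 u = v_reinterpret_as_u32(k);  // same bits, unsigned view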


#define OPENCV_HAL_IMPL_RISCVV_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
    a.val = intrin(a.val, b.val); \
    return a; \
}

#define OPENCV_HAL_IMPL_RISCVV_BIN_OPN(bin_op, _Tpvec, intrin, num) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val, num)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
    a.val = intrin(a.val, b.val, num); \
    return a; \
}

OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint8x16, vsaddu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint8x16, vssubu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int8x16, vsadd_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int8x16, vssub_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint16x8, vsaddu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint16x8, vssubu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int16x8, vsadd_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int16x8, vssub_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int32x4, vsadd_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int32x4, vssub_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_int32x4, vmul_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint32x4, vadd_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint32x4, vsub_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_uint32x4, vmul_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int64x2, vsadd_vv_i64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int64x2, vssub_vv_i64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint64x2, vadd_vv_u64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint64x2, vsub_vv_u64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float32x4, vfadd_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float32x4, vfsub_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float32x4, vfmul_vv_f32m1, 4)
inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
{
    return v_float32x4(vfdiv_vv_f32m1(a.val, b.val, 4));
}
inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
{
    a.val = vfdiv_vv_f32m1(a.val, b.val, 4);
    return a;
}

OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float64x2, vfadd_vv_f64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float64x2, vfsub_vv_f64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float64x2, vfmul_vv_f64m1, 2)
inline v_float64x2 operator / (const v_float64x2& a, const v_float64x2& b)
{
    return v_float64x2(vfdiv_vv_f64m1(a.val, b.val, 2));
}
inline v_float64x2& operator /= (v_float64x2& a, const v_float64x2& b)
{
    a.val = vfdiv_vv_f64m1(a.val, b.val, 2);
    return a;
}
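
// Note: 8/16-bit and signed 32/64-bit +/- above map to the saturating
// vsadd/vssub forms, while unsigned 32/64-bit use modular vadd/vsub.
// A small illustration (a sketch; values chosen for the example):
//
//     v_uint8x16 a = v_setall_u8(200), b = v_setall_u8(100);
//     v_uint8x16 s = a + b;   // saturates: every lane is 255, not 44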
// TODO: exp, log, sin, cos

#define OPENCV_HAL_IMPL_RISCVV_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
}

#define OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(_Tpvec, func, intrin, num) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val, num)); \
}
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_min, vminu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_max, vmaxu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_min, vmin_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_max, vmax_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_min, vminu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_max, vmaxu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_min, vmin_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_max, vmax_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint32x4, v_min, vminu_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint32x4, v_max, vmaxu_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int32x4, v_min, vmin_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int32x4, v_max, vmax_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float32x4, v_min, vfmin_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float32x4, v_max, vfmax_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float64x2, v_min, vfmin_vv_f64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float64x2, v_max, vfmax_vv_f64m1, 2)

inline v_float32x4 v_sqrt(const v_float32x4& x)
{
    return v_float32x4(vfsqrt_v_f32m1(x.val, 4));
}

inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    return v_float32x4(vfrdiv_vf_f32m1(vfsqrt_v_f32m1(x.val, 4), 1, 4));
}

inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    v_float32x4 x(vfmacc_vv_f32m1(vfmul_vv_f32m1(a.val, a.val, 4), b.val, b.val, 4));
    return v_sqrt(x);
}

inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    return v_float32x4(vfmacc_vv_f32m1(vfmul_vv_f32m1(a.val, a.val, 4), b.val, b.val, 4));
}

inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    return v_float32x4(vfmacc_vv_f32m1(c.val, a.val, b.val, 4));
}

inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_int32x4(vmacc_vv_i32m1(c.val, a.val, b.val, 4));
}

inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    return v_fma(a, b, c);
}

inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_fma(a, b, c);
}

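// v_fma computes a*b + c per lane (vfmacc/vmacc accumulate into their first
// operand). An illustrative check (a sketch):
//
//     v_float32x4 a = v_setall_f32(2.f), b = v_setall_f32(3.f), c = v_setall_f32(1.f);
//     v_float32x4 r = v_fma(a, b, c);   // every lane is 2*3 + 1 = 7
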
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    vfloat32m1_t res = vfmul_vf_f32m1(m0.val, v.val[0], 4); // res  = m0 * v[0]
    res = vfmacc_vf_f32m1(res, v.val[1], m1.val, 4);        // res += m1 * v[1]
    res = vfmacc_vf_f32m1(res, v.val[2], m2.val, 4);        // res += m2 * v[2]
    res = vfmacc_vf_f32m1(res, v.val[3], m3.val, 4);        // res += m3 * v[3]
    return v_float32x4(res);
}

inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    vfloat32m1_t res = vfmul_vf_f32m1(m0.val, v.val[0], 4); // res  = m0 * v[0]
    res = vfmacc_vf_f32m1(res, v.val[1], m1.val, 4);        // res += m1 * v[1]
    res = vfmacc_vf_f32m1(res, v.val[2], m2.val, 4);        // res += m2 * v[2]
    res = vfadd_vv_f32m1(res, a.val, 4);                    // res += a
    return v_float32x4(res);
}

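// v_matmul treats m0..m3 as the columns of a 4x4 matrix and v as a column
// vector, returning m0*v[0] + m1*v[1] + m2*v[2] + m3*v[3]; v_matmuladd does
// the same with the last column replaced by a plain additive term. An
// identity-matrix check (a sketch):
//
//     v_float32x4 c0(1, 0, 0, 0), c1(0, 1, 0, 0), c2(0, 0, 1, 0), c3(0, 0, 0, 1);
//     v_float32x4 v(5.f, 6.f, 7.f, 8.f);
//     v_float32x4 r = v_matmul(v, c0, c1, c2, c3);   // r == v
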
inline v_float64x2 v_sqrt(const v_float64x2& x)
{
    return v_float64x2(vfsqrt_v_f64m1(x.val, 2));
}

inline v_float64x2 v_invsqrt(const v_float64x2& x)
{
    return v_float64x2(vfrdiv_vf_f64m1(vfsqrt_v_f64m1(x.val, 2), 1, 2));
}

inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
{
    v_float64x2 x(vfmacc_vv_f64m1(vfmul_vv_f64m1(a.val, a.val, 2), b.val, b.val, 2));
    return v_sqrt(x);
}

inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
{
    return v_float64x2(vfmacc_vv_f64m1(vfmul_vv_f64m1(a.val, a.val, 2), b.val, b.val, 2));
}

inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    return v_float64x2(vfmacc_vv_f64m1(c.val, a.val, b.val, 2));
}

inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    return v_fma(a, b, c);
}

#define OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(_Tpvec, suffix, num) \
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(&, _Tpvec, vand_vv_##suffix, num) \
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(|, _Tpvec, vor_vv_##suffix, num) \
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(^, _Tpvec, vxor_vv_##suffix, num) \
inline _Tpvec operator ~ (const _Tpvec & a) \
{ \
    return _Tpvec(vnot_v_##suffix(a.val, num)); \
}

OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint8x16, u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint16x8, u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint32x4, u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint64x2, u64m1, 2)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int8x16, i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int16x8, i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int32x4, i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int64x2, i64m1, 2)

#define OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
{ \
    return v_float32x4(vfloat32m1_t(intrin(vint32m1_t(a.val), vint32m1_t(b.val), 4))); \
} \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
{ \
    a.val = vfloat32m1_t(intrin(vint32m1_t(a.val), vint32m1_t(b.val), 4)); \
    return a; \
}

OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(&, vand_vv_i32m1)
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(|, vor_vv_i32m1)
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(^, vxor_vv_i32m1)

inline v_float32x4 operator ~ (const v_float32x4& a)
{
    return v_float32x4((vfloat32m1_t)(vnot_v_i32m1((vint32m1_t)(a.val), 4)));
}

#define OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(bin_op, intrin) \
inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
{ \
    return v_float64x2(vfloat64m1_t(intrin(vint64m1_t(a.val), vint64m1_t(b.val), 2))); \
} \
inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
{ \
    a.val = vfloat64m1_t(intrin(vint64m1_t(a.val), vint64m1_t(b.val), 2)); \
    return a; \
}

OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(&, vand_vv_i64m1)
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(|, vor_vv_i64m1)
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(^, vxor_vv_i64m1)

inline v_float64x2 operator ~ (const v_float64x2& a)
{
    return v_float64x2((vfloat64m1_t)(vnot_v_i64m1((vint64m1_t)(a.val), 2)));
}
inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
{
    return v_int16x8(vmulh_vv_i16m1(a.val, b.val, 8));
}
inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
{
    return v_uint16x8(vmulhu_vv_u16m1(a.val, b.val, 8));
}

//#define OPENCV_HAL_IMPL_RISCVV_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
//inline _Tpuvec v_abs(const _Tpsvec& a) { \
//  E##xm1_t mask=vmflt_vf_e32xm1_f32m1(x.val, 0.0, 4);

//OPENCV_HAL_IMPL_RISCVV_ABS(v_uint8x16, v_int8x16, u8, s8)
//OPENCV_HAL_IMPL_RISCVV_ABS(v_uint16x8, v_int16x8, u16, s16)
//OPENCV_HAL_IMPL_RISCVV_ABS(v_uint32x4, v_int32x4, u32, s32)

inline v_uint32x4 v_abs(v_int32x4 x)
{
    vbool32_t mask = vmslt_vx_i32m1_b32(x.val, 0, 4);
    return v_uint32x4((vuint32m1_t)vrsub_vx_i32m1_m(mask, x.val, x.val, 0, 4));
}

inline v_uint16x8 v_abs(v_int16x8 x)
{
    vbool16_t mask = vmslt_vx_i16m1_b16(x.val, 0, 8);
    return v_uint16x8((vuint16m1_t)vrsub_vx_i16m1_m(mask, x.val, x.val, 0, 8));
}

inline v_uint8x16 v_abs(v_int8x16 x)
{
    vbool8_t mask = vmslt_vx_i8m1_b8(x.val, 0, 16);
    return v_uint8x16((vuint8m1_t)vrsub_vx_i8m1_m(mask, x.val, x.val, 0, 16));
}

inline v_float32x4 v_abs(v_float32x4 x)
{
    return (v_float32x4)vfsgnjx_vv_f32m1(x.val, x.val, 4);
}

inline v_float64x2 v_abs(v_float64x2 x)
{
    return (v_float64x2)vfsgnjx_vv_f64m1(x.val, x.val, 2);
}

inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
{
    vfloat32m1_t ret = vfsub_vv_f32m1(a.val, b.val, 4);
    return (v_float32x4)vfsgnjx_vv_f32m1(ret, ret, 4);
}

inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
{
    vfloat64m1_t ret = vfsub_vv_f64m1(a.val, b.val, 2);
    return (v_float64x2)vfsgnjx_vv_f64m1(ret, ret, 2);
}

#define OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(bit, num) \
inline v_uint##bit##x##num v_absdiff(v_uint##bit##x##num a, v_uint##bit##x##num b){ \
    vuint##bit##m1_t vmax = vmaxu_vv_u##bit##m1(a.val, b.val, num); \
    vuint##bit##m1_t vmin = vminu_vv_u##bit##m1(a.val, b.val, num); \
    return v_uint##bit##x##num(vsub_vv_u##bit##m1(vmax, vmin, num));\
}

OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(8, 16)
OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(16, 8)
OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(32, 4)

/** Saturating absolute difference **/
inline v_int8x16 v_absdiffs(v_int8x16 a, v_int8x16 b){
    vint8m1_t vmax = vmax_vv_i8m1(a.val, b.val, 16);
    vint8m1_t vmin = vmin_vv_i8m1(a.val, b.val, 16);
    return v_int8x16(vssub_vv_i8m1(vmax, vmin, 16));
}
inline v_int16x8 v_absdiffs(v_int16x8 a, v_int16x8 b){
    vint16m1_t vmax = vmax_vv_i16m1(a.val, b.val, 8);
    vint16m1_t vmin = vmin_vv_i16m1(a.val, b.val, 8);
    return v_int16x8(vssub_vv_i16m1(vmax, vmin, 8));
}

#define OPENCV_HAL_IMPL_RISCVV_ABSDIFF(_Tpvec, _Tpv, num) \
inline v_uint##_Tpvec v_absdiff(v_int##_Tpvec a, v_int##_Tpvec b){ \
    vint##_Tpv##_t max = vmax_vv_i##_Tpv(a.val, b.val, num);\
    vint##_Tpv##_t min = vmin_vv_i##_Tpv(a.val, b.val, num);\
    return v_uint##_Tpvec((vuint##_Tpv##_t)vsub_vv_i##_Tpv(max, min, num)); \
}

OPENCV_HAL_IMPL_RISCVV_ABSDIFF(8x16, 8m1, 16)
OPENCV_HAL_IMPL_RISCVV_ABSDIFF(16x8, 16m1, 8)
OPENCV_HAL_IMPL_RISCVV_ABSDIFF(32x4, 32m1, 4)

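// v_absdiff returns |a - b| in the unsigned counterpart type for signed
// integer inputs, so the result cannot overflow; v_absdiffs keeps the signed
// type and saturates instead. Illustration (a sketch):
//
//     v_int8x16 a = v_setall_s8(-100), b = v_setall_s8(100);
//     v_uint8x16 d  = v_absdiff(a, b);    // lanes are 200 (fits in uchar)
//     v_int8x16  ds = v_absdiffs(a, b);   // lanes saturate to 127
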
// Multiply and expand
inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
                         v_int16x8& c, v_int16x8& d)
{
    vint16m2_t res = vundefined_i16m2();
    res = vwmul_vv_i16m2(a.val, b.val, 16);
    c.val = vget_i16m2_i16m1(res, 0);
    d.val = vget_i16m2_i16m1(res, 1);
}

inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
                         v_uint16x8& c, v_uint16x8& d)
{
    vuint16m2_t res = vundefined_u16m2();
    res = vwmulu_vv_u16m2(a.val, b.val, 16);
    c.val = vget_u16m2_u16m1(res, 0);
    d.val = vget_u16m2_u16m1(res, 1);
}

inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                         v_int32x4& c, v_int32x4& d)
{
    vint32m2_t res = vundefined_i32m2();
    res = vwmul_vv_i32m2(a.val, b.val, 8);
    c.val = vget_i32m2_i32m1(res, 0);
    d.val = vget_i32m2_i32m1(res, 1);
}

inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
                         v_uint32x4& c, v_uint32x4& d)
{
    vuint32m2_t res = vundefined_u32m2();
    res = vwmulu_vv_u32m2(a.val, b.val, 8);
    c.val = vget_u32m2_u32m1(res, 0);
    d.val = vget_u32m2_u32m1(res, 1);
}

inline void v_mul_expand(const v_int32x4& a, const v_int32x4& b,
                         v_int64x2& c, v_int64x2& d)
{
    vint64m2_t res = vundefined_i64m2();
    res = vwmul_vv_i64m2(a.val, b.val, 4);
    c.val = vget_i64m2_i64m1(res, 0);
    d.val = vget_i64m2_i64m1(res, 1);
}

inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
                         v_uint64x2& c, v_uint64x2& d)
{
    vuint64m2_t res = vundefined_u64m2();
    res = vwmulu_vv_u64m2(a.val, b.val, 4);
    c.val = vget_u64m2_u64m1(res, 0);
    d.val = vget_u64m2_u64m1(res, 1);
}

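// v_mul_expand multiplies with widening: products of the low-half elements
// land in c and those of the high-half elements in d, each at double the
// element width. Illustration (a sketch):
//
//     v_uint8x16 a = v_setall_u8(200), b = v_setall_u8(200);
//     v_uint16x8 lo, hi;
//     v_mul_expand(a, b, lo, hi);   // all 16 products are 40000, no overflow
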
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_add_wrap, vadd_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_add_wrap, vadd_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_add_wrap, vadd_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_sub_wrap, vsub_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_sub_wrap, vsub_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_sub_wrap, vsub_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_sub_wrap, vsub_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_mul_wrap, vmul_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_mul_wrap, vmul_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_mul_wrap, vmul_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 8)
//////// Dot Product ////////
// 16 >> 32
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
    vint32m2_t res = vundefined_i32m2();
    res = vwmul_vv_i32m2(a.val, b.val, 8);
    res = vrgather_vv_i32m2(res, (vuint32m2_t){0, 2, 4, 6, 1, 3, 5, 7}, 8);
    return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(res, 0), vget_i32m2_i32m1(res, 1), 4));
}
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
    vint32m2_t res = vundefined_i32m2();
    res = vwmul_vv_i32m2(a.val, b.val, 8);
    res = vrgather_vv_i32m2(res, (vuint32m2_t){0, 2, 4, 6, 1, 3, 5, 7}, 8);
    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(res, 0), vget_i32m2_i32m1(res, 1), 4), c.val, 4));
}

// 32 >> 64
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
{
    vint64m2_t res = vundefined_i64m2();
    res = vwmul_vv_i64m2(a.val, b.val, 4);
    res = vrgather_vv_i64m2(res, (vuint64m2_t){0, 2, 1, 3}, 4);
    return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(res, 0), vget_i64m2_i64m1(res, 1), 2));
}
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{
    vint64m2_t res = vundefined_i64m2();
    res = vwmul_vv_i64m2(a.val, b.val, 4);
    res = vrgather_vv_i64m2(res, (vuint64m2_t){0, 2, 1, 3}, 4);
    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(res, 0), vget_i64m2_i64m1(res, 1), 2), c.val, 2));
}

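// v_dotprod multiplies adjacent element pairs at double width and sums each
// pair, halving the lane count (the vrgather above regroups even/odd products
// so the members of each pair land in matching m1 halves). Illustration
// (a sketch):
//
//     v_int16x8 a = v_setall_s16(3), b = v_setall_s16(4);
//     v_int32x4 d = v_dotprod(a, b);   // every lane is 3*4 + 3*4 = 24
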
701 // 8 >> 32
v_dotprod_expand(const v_uint8x16 & a,const v_uint8x16 & b)702 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
703 {
704 vuint16m2_t v1 = vundefined_u16m2();
705 vuint32m2_t v2 = vundefined_u32m2();
706 v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
707 v1 = vrgather_vv_u16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
708 v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8);
709 return v_uint32x4(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4));
710 }
711
v_dotprod_expand(const v_uint8x16 & a,const v_uint8x16 & b,const v_uint32x4 & c)712 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
713 const v_uint32x4& c)
714 {
715 vuint16m2_t v1 = vundefined_u16m2();
716 vuint32m2_t v2 = vundefined_u32m2();
717 v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
718 v1 = vrgather_vv_u16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
719 v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8);
720 return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4), c.val, 4));
721 }
722
v_dotprod_expand(const v_int8x16 & a,const v_int8x16 & b)723 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
724 {
725 vint16m2_t v1 = vundefined_i16m2();
726 vint32m2_t v2 = vundefined_i32m2();
727 v1 = vwmul_vv_i16m2(a.val, b.val, 16);
728 v1 = vrgather_vv_i16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
729 v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8);
730 return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4));
731 }
732
v_dotprod_expand(const v_int8x16 & a,const v_int8x16 & b,const v_int32x4 & c)733 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
734 const v_int32x4& c)
735 {
736 vint16m2_t v1 = vundefined_i16m2();
737 vint32m2_t v2 = vundefined_i32m2();
738 v1 = vwmul_vv_i16m2(a.val, b.val, 16);
739 v1 = vrgather_vv_i16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
740 v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8);
741 return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4), c.val, 4));
742 }
743
v_dotprod_expand(const v_uint16x8 & a,const v_uint16x8 & b)744 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
745 {
746 vuint32m2_t v1 = vundefined_u32m2();
747 vuint64m2_t v2 = vundefined_u64m2();
748 v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
749 v1 = vrgather_vv_u32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
750 v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4);
751 return v_uint64x2(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2));
752 }
753
v_dotprod_expand(const v_uint16x8 & a,const v_uint16x8 & b,const v_uint64x2 & c)754 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b,
755 const v_uint64x2& c)
756 {
757 vuint32m2_t v1 = vundefined_u32m2();
758 vuint64m2_t v2 = vundefined_u64m2();
759 v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
760 v1 = vrgather_vv_u32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
761 v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4);
762 return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2), c.val, 2));
763 }
764
v_dotprod_expand(const v_int16x8 & a,const v_int16x8 & b)765 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
766 {
767 vint32m2_t v1 = vundefined_i32m2();
768 vint64m2_t v2 = vundefined_i64m2();
769 v1 = vwmul_vv_i32m2(a.val, b.val, 8);
770 v1 = vrgather_vv_i32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
771 v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4);
772 return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2));
773 }
774
v_dotprod_expand(const v_int16x8 & a,const v_int16x8 & b,const v_int64x2 & c)775 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
776 const v_int64x2& c)
777 {
778 vint32m2_t v1 = vundefined_i32m2();
779 vint64m2_t v2 = vundefined_i64m2();
780 v1 = vwmul_vv_i32m2(a.val, b.val, 8);
781 v1 = vrgather_vv_i32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
782 v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4);
783 return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2), c.val, 2));
784 }
785
786 //////// Fast Dot Product ////////
787 // 16 >> 32
v_dotprod_fast(const v_int16x8 & a,const v_int16x8 & b)788 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
789 {
790 vint32m2_t v1 = vundefined_i32m2();
791 v1 = vwmul_vv_i32m2(a.val, b.val, 8);
792 return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4));
793 }
794
v_dotprod_fast(const v_int16x8 & a,const v_int16x8 & b,const v_int32x4 & c)795 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
796 {
797 vint32m2_t v1 = vundefined_i32m2();
798 v1 = vwmul_vv_i32m2(a.val, b.val, 8);
799 return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4), c.val, 4));
800 }
801
802 // 32 >> 64
v_dotprod_fast(const v_int32x4 & a,const v_int32x4 & b)803 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
804 {
805 vint64m2_t v1 = vundefined_i64m2();
806 v1 = vwmul_vv_i64m2(a.val, b.val, 4);
807 return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 2));
808 }
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{
    vint64m2_t v1 = vundefined_i64m2();
    v1 = vwmul_vv_i64m2(a.val, b.val, 4);
    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 2), c.val, 2));
}

// 8 >> 32
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
{
    vuint16m2_t v1 = vundefined_u16m2();
    vuint32m2_t v2 = vundefined_u32m2();
    v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
    v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8);
    return v_uint32x4(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4));
}

inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{
    vuint16m2_t v1 = vundefined_u16m2();
    vuint32m2_t v2 = vundefined_u32m2();
    v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
    v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8);
    return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4), c.val, 4));
}

inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
{
    vint16m2_t v1 = vundefined_i16m2();
    vint32m2_t v2 = vundefined_i32m2();
    v1 = vwmul_vv_i16m2(a.val, b.val, 16);
    v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8);
    return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4));
}
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{
    vint16m2_t v1 = vundefined_i16m2();
    vint32m2_t v2 = vundefined_i32m2();
    v1 = vwmul_vv_i16m2(a.val, b.val, 16);
    v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8);
    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4), c.val, 4));
}

// 16 >> 64
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
{
    vuint32m2_t v1 = vundefined_u32m2();
    vuint64m2_t v2 = vundefined_u64m2();
    v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
    v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4);
    return v_uint64x2(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2));
}
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{
    vuint32m2_t v1 = vundefined_u32m2();
    vuint64m2_t v2 = vundefined_u64m2();
    v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
    v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4);
    return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2), c.val, 2));
}

inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
{
    vint32m2_t v1 = vundefined_i32m2();
    vint64m2_t v2 = vundefined_i64m2();
    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
    v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4);
    return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2));
}
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{
    vint32m2_t v1 = vundefined_i32m2();
    vint64m2_t v2 = vundefined_i64m2();
    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
    v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4);
    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2), c.val, 2));
}


#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(_Tpvec, _Tpvec2, len, scalartype, func, intrin, num) \
inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \
{\
    v##_Tpvec2##m1_t val = vmv_v_x_##len##m1(0, num); \
    val = intrin(val, a.val, val, num); \
    return vmv_x_s_##len##m1_##len(val, num); \
}


#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(_Tpvec, _Tpvec2, scalartype, func, funcu, num) \
inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \
{\
    v##_Tpvec##m1_t val = (v##_Tpvec##m1_t)vmv_v_x_i8m1(0, num); \
    val = v##funcu##_vs_##_Tpvec2##m1_##_Tpvec2##m1(val, a.val, a.val, num); \
    return val[0]; \
}
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int8, int16, i16, int, sum, vwredsum_vs_i8m1_i16m1, 16)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int16, int32, i32, int, sum, vwredsum_vs_i16m1_i32m1, 8)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int32, int64, i64, int, sum, vwredsum_vs_i32m1_i64m1, 4)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint8, uint16, u16, unsigned, sum, vwredsumu_vs_u8m1_u16m1, 16)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint16, uint32, u32, unsigned, sum, vwredsumu_vs_u16m1_u32m1, 8)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint32, uint64, u64, unsigned, sum, vwredsumu_vs_u32m1_u64m1, 4)
inline float v_reduce_sum(const v_float32x4& a)
{
    vfloat32m1_t val = vfmv_v_f_f32m1(0.0, 4);
    val = vfredsum_vs_f32m1_f32m1(val, a.val, val, 4);
    return vfmv_f_s_f32m1_f32(val, 4);
}
inline double v_reduce_sum(const v_float64x2& a)
{
    vfloat64m1_t val = vfmv_v_f_f64m1(0.0, 2);
    val = vfredsum_vs_f64m1_f64m1(val, a.val, val, 2);
    return vfmv_f_s_f64m1_f64(val, 2);
}
inline uint64 v_reduce_sum(const v_uint64x2& a)
{ return vext_x_v_u64m1_u64((vuint64m1_t)a.val, 0, 2) + vext_x_v_u64m1_u64((vuint64m1_t)a.val, 1, 2); }

inline int64 v_reduce_sum(const v_int64x2& a)
{ return vext_x_v_i64m1_i64((vint64m1_t)a.val, 0, 2) + vext_x_v_i64m1_i64((vint64m1_t)a.val, 1, 2); }

#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(func) \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int8, i8, int, func, red##func, 16) \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int16, i16, int, func, red##func, 8) \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int32, i32, int, func, red##func, 4) \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int64, i64, int, func, red##func, 2) \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint8, u8, unsigned, func, red##func##u, 16) \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint16, u16, unsigned, func, red##func##u, 8) \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint32, u32, unsigned, func, red##func##u, 4) \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(float32, f32, float, func, fred##func, 4)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(max)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(min)

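// Reductions fold a whole vector into one scalar: v_reduce_sum accumulates,
// v_reduce_min/max select. Illustration (a sketch):
//
//     v_int32x4 v(1, -5, 3, 2);
//     int s = v_reduce_sum(v);   // 1
//     int m = v_reduce_min(v);   // -5
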
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
    vfloat32m1_t a0 = vfmv_v_f_f32m1(0.0, 4);
    vfloat32m1_t b0 = vfmv_v_f_f32m1(0.0, 4);
    vfloat32m1_t c0 = vfmv_v_f_f32m1(0.0, 4);
    vfloat32m1_t d0 = vfmv_v_f_f32m1(0.0, 4);
    a0 = vfredsum_vs_f32m1_f32m1(a0, a.val, a0, 4);
    b0 = vfredsum_vs_f32m1_f32m1(b0, b.val, b0, 4);
    c0 = vfredsum_vs_f32m1_f32m1(c0, c.val, c0, 4);
    d0 = vfredsum_vs_f32m1_f32m1(d0, d.val, d0, 4);
    return v_float32x4(a0[0], b0[0], c0[0], d0[0]);
}

inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
{
    vfloat32m1_t a0 = vfmv_v_f_f32m1(0.0, 4);
    vfloat32m1_t x = vfsub_vv_f32m1(a.val, b.val, 4);
    vbool32_t mask = vmflt_vf_f32m1_b32(x, 0, 4);
    vfloat32m1_t val = vfrsub_vf_f32m1_m(mask, x, x, 0, 4);
    a0 = vfredsum_vs_f32m1_f32m1(a0, val, a0, 4);
    return a0[0];
}

#define OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(_Tpvec, _Tpvec2) \
inline unsigned v_reduce_sad(const _Tpvec& a, const _Tpvec& b){ \
    _Tpvec2 x = v_absdiff(a, b); \
    return v_reduce_sum(x); \
}

OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int8x16, v_uint8x16)
OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint8x16, v_uint8x16)
OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int16x8, v_uint16x8)
OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint16x8, v_uint16x8)
OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int32x4, v_uint32x4)
OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint32x4, v_uint32x4)

#define OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(_Tpvec, _Tp, _T, num, uv) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ \
    vbool##_T##_t mask = vmseq_vv_##_Tp##_b##_T(a.val, b.val, num); \
    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
} \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ \
    vbool##_T##_t mask = vmsne_vv_##_Tp##_b##_T(a.val, b.val, num); \
    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
} \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ \
    vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(a.val, b.val, num); \
    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
} \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ \
    vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(b.val, a.val, num); \
    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
} \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ \
    vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(a.val, b.val, num); \
    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
} \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ \
    vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(b.val, a.val, num); \
    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num)); \
}

OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int8x16, i8m1, 8, 16, _vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int16x8, i16m1, 16, 8, _vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int32x4, i32m1, 32, 4, _vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int64x2, i64m1, 64, 2, _vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint8x16, u8m1, 8, 16, u_vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint16x8, u16m1, 16, 8, u_vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint32x4, u32m1, 32, 4, u_vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint64x2, u64m1, 64, 2, u_vv_)

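// Comparisons return all-ones (-1) in lanes where the predicate holds and 0
// elsewhere, which is what v_select and the masked operations expect.
// Illustration (a sketch):
//
//     v_int32x4 a(1, 2, 3, 4), b = v_setall_s32(2);
//     v_int32x4 m = (a > b);   // {0, 0, -1, -1}
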
//TODO: ==
inline v_float32x4 operator == (const v_float32x4& a, const v_float32x4& b)
{
    vbool32_t mask = vmfeq_vv_f32m1_b32(a.val, b.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0, 4), -1, 4);
    return v_float32x4((vfloat32m1_t)res);
}
inline v_float32x4 operator != (const v_float32x4& a, const v_float32x4& b)
{
    vbool32_t mask = vmfne_vv_f32m1_b32(a.val, b.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0, 4), -1, 4);
    return v_float32x4((vfloat32m1_t)res);
}
inline v_float32x4 operator < (const v_float32x4& a, const v_float32x4& b)
{
    vbool32_t mask = vmflt_vv_f32m1_b32(a.val, b.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0, 4), -1, 4);
    return v_float32x4((vfloat32m1_t)res);
}
inline v_float32x4 operator <= (const v_float32x4& a, const v_float32x4& b)
{
    vbool32_t mask = vmfle_vv_f32m1_b32(a.val, b.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0, 4), -1, 4);
    return v_float32x4((vfloat32m1_t)res);
}
inline v_float32x4 operator > (const v_float32x4& a, const v_float32x4& b)
{
    vbool32_t mask = vmfgt_vv_f32m1_b32(a.val, b.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0, 4), -1, 4);
    return v_float32x4((vfloat32m1_t)res);
}
inline v_float32x4 operator >= (const v_float32x4& a, const v_float32x4& b)
{
    vbool32_t mask = vmfge_vv_f32m1_b32(a.val, b.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0, 4), -1, 4);
    return v_float32x4((vfloat32m1_t)res);
}
inline v_float32x4 v_not_nan(const v_float32x4& a)
{
    vbool32_t mask = vmford_vv_f32m1_b32(a.val, a.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0, 4), -1, 4);
    return v_float32x4((vfloat32m1_t)res);
}

//TODO: ==
inline v_float64x2 operator == (const v_float64x2& a, const v_float64x2& b)
{
    vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, b.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0, 2), -1, 2);
    return v_float64x2((vfloat64m1_t)res);
}
inline v_float64x2 operator != (const v_float64x2& a, const v_float64x2& b)
{
    vbool64_t mask = vmfne_vv_f64m1_b64(a.val, b.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0, 2), -1, 2);
    return v_float64x2((vfloat64m1_t)res);
}
inline v_float64x2 operator < (const v_float64x2& a, const v_float64x2& b)
{
    vbool64_t mask = vmflt_vv_f64m1_b64(a.val, b.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0, 2), -1, 2);
    return v_float64x2((vfloat64m1_t)res);
}
inline v_float64x2 operator <= (const v_float64x2& a, const v_float64x2& b)
{
    vbool64_t mask = vmfle_vv_f64m1_b64(a.val, b.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0, 2), -1, 2);
    return v_float64x2((vfloat64m1_t)res);
}
inline v_float64x2 operator > (const v_float64x2& a, const v_float64x2& b)
{
    vbool64_t mask = vmfgt_vv_f64m1_b64(a.val, b.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0, 2), -1, 2);
    return v_float64x2((vfloat64m1_t)res);
}
inline v_float64x2 operator >= (const v_float64x2& a, const v_float64x2& b)
{
    vbool64_t mask = vmfge_vv_f64m1_b64(a.val, b.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0, 2), -1, 2);
    return v_float64x2((vfloat64m1_t)res);
}
inline v_float64x2 v_not_nan(const v_float64x2& a)
{
    vbool64_t mask = vmford_vv_f64m1_b64(a.val, a.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0, 2), -1, 2);
    return v_float64x2((vfloat64m1_t)res);
}
#define OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(_Tp, _T) \
inline void v_transpose4x4(const v_##_Tp##32x4& a0, const v_##_Tp##32x4& a1, \
                           const v_##_Tp##32x4& a2, const v_##_Tp##32x4& a3, \
                           v_##_Tp##32x4& b0, v_##_Tp##32x4& b1, \
                           v_##_Tp##32x4& b2, v_##_Tp##32x4& b3) \
{ \
    v##_Tp##32m4_t val = vundefined_##_T##m4(); \
    val = vset_##_T##m4(val, 0, a0.val); \
    val = vset_##_T##m4(val, 1, a1.val); \
    val = vset_##_T##m4(val, 2, a2.val); \
    val = vset_##_T##m4(val, 3, a3.val); \
    val = vrgather_vv_##_T##m4(val, (vuint32m4_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16); \
    b0.val = vget_##_T##m4_##_T##m1(val, 0); \
    b1.val = vget_##_T##m4_##_T##m1(val, 1); \
    b2.val = vget_##_T##m4_##_T##m1(val, 2); \
    b3.val = vget_##_T##m4_##_T##m1(val, 3); \
}
OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(uint, u32)
OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(int, i32)
OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(float, f32)


#define OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(_Tpvec, suffix, _T, num) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
{ return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); } \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); }

#define OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(_Tpvec, suffix, _T, num, intric) \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
{ return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); } \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); }\
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
{ return _Tpvec((v##intric##_vx_##_T##m1(vadd_vx_##_T##m1(a.val, 1<<(n-1), num), n, num))); }

// trade efficiency for convenience
#define OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(suffix, _T, num, intrin) \
OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(v_##suffix##x##num, suffix, _T, num) \
OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(v_##suffix##x##num, suffix, _T, num, intrin)

OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint8, u8, 16, srl)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint16, u16, 8, srl)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint32, u32, 4, srl)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint64, u64, 2, srl)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int8, i8, 16, sra)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int16, i16, 8, sra)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int32, i32, 4, sra)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int64, i64, 2, sra)

#if 0
#define VUP4(n) {0, 1, 2, 3}
#define VUP8(n) {0, 1, 2, 3, 4, 5, 6, 7}
#define VUP16(n) {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
#define VUP2(n) {0, 1}
#endif
#define OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(_Tpvec, suffix, _T, num, num2, vmv, len) \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ \
    suffix##m1_t tmp = vmv##_##_T##m1(0, num);\
    tmp = vslideup_vx_##_T##m1_m(vmset_m_##len(num), tmp, a.val, n, num);\
    return _Tpvec(tmp);\
} \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ \
    return _Tpvec(vslidedown_vx_##_T##m1(a.val, n, num));\
} \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
{ return a; } \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ \
    suffix##m2_t tmp = vundefined_##_T##m2(); \
    tmp = vset_##_T##m2(tmp, 0, a.val); \
    tmp = vset_##_T##m2(tmp, 1, b.val); \
    tmp = vslidedown_vx_##_T##m2(tmp, n, num2);\
    return _Tpvec(vget_##_T##m2_##_T##m1(tmp, 0));\
} \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ \
    suffix##m2_t tmp = vundefined_##_T##m2(); \
    tmp = vset_##_T##m2(tmp, 0, b.val); \
    tmp = vset_##_T##m2(tmp, 1, a.val); \
    tmp = vslideup_vx_##_T##m2(tmp, n, num2);\
    return _Tpvec(vget_##_T##m2_##_T##m1(tmp, 1));\
} \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
{ \
    CV_UNUSED(b); return a; \
}

OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint8x16, vuint8, u8, 16, 32, vmv_v_x, b8)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int8x16, vint8, i8, 16, 32, vmv_v_x, b8)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint16x8, vuint16, u16, 8, 16, vmv_v_x, b16)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int16x8, vint16, i16, 8, 16, vmv_v_x, b16)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint32x4, vuint32, u32, 4, 8, vmv_v_x, b32)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int32x4, vint32, i32, 4, 8, vmv_v_x, b32)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint64x2, vuint64, u64, 2, 4, vmv_v_x, b64)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int64x2, vint64, i64, 2, 4, vmv_v_x, b64)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float32x4, vfloat32, f32, 4, 8, vfmv_v_f, b32)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float64x2, vfloat64, f64, 2, 4, vfmv_v_f, b64)

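// v_rotate_right<n>(a) shifts lanes down by n, filling vacated lanes with
// zeros; the two-argument forms shift in lanes from a second vector instead.
// Illustration (a sketch):
//
//     v_int32x4 a(1, 2, 3, 4), b(5, 6, 7, 8);
//     v_int32x4 r1 = v_rotate_right<1>(a);      // {2, 3, 4, 0}
//     v_int32x4 r2 = v_rotate_right<1>(a, b);   // {2, 3, 4, 5}
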
#define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num) \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ \
    typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64; \
    vuint64m1_t tmp = {*(unaligned_uint64*)ptr0, *(unaligned_uint64*)ptr1};\
    return _Tpvec(_Tp2##_t(tmp)); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(vle_v_##len(ptr, hnum)); }\
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(vle_v_##len(ptr, num)); } \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec((_Tp2##_t)vle_v_##len((const _Tp *)ptr, num)); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ vse_v_##len(ptr, a.val, hnum);}\
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ \
    _Tp2##_t a0 = vslidedown_vx_##len(a.val, hnum, num); \
    vse_v_##len(ptr, a0, hnum);}\
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ vse_v_##len(ptr, a.val, num); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ vse_v_##len(ptr, a.val, num); } \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ vse_v_##len(ptr, a.val, num); } \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
{ vse_v_##len(ptr, a.val, num); }

OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint8x16, uchar, vuint8m1, u8m1, 8, 16)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16, schar, vint8m1, i8m1, 8, 16)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint16x8, ushort, vuint16m1, u16m1, 4, 8)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8, short, vint16m1, i16m1, 4, 8)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4, int, vint32m1, i32m1, 2, 4)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned long, vuint64m1, u64m1, 1, 2)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2, long, vint64m1, i64m1, 1, 2)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4)
OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2)

1244
1245 ////////////// Lookup table access ////////////////////
1246
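// The scalar paths below gather table entries through a small aligned buffer
// and reload it as one vector. The disabled #else branch in v_lut sketches an
// indexed-load (gather) alternative written against older intrinsic names.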
inline v_int8x16 v_lut(const schar* tab, const int* idx)
{
#if 1
    schar CV_DECL_ALIGNED(32) elems[16] =
    {
        tab[idx[ 0]], tab[idx[ 1]], tab[idx[ 2]], tab[idx[ 3]],
        tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
        tab[idx[ 8]], tab[idx[ 9]], tab[idx[10]], tab[idx[11]],
        tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]
    };
    return v_int8x16(vle_v_i8m1(elems, 16));
#else
    int32xm4_t index32 = vlev_int32xm4(idx, 16);
    vint16m2_t index16 = vnsra_vx_i16m2_int32xm4(index32, 0, 16);
    vint8m1_t index = vnsra_vx_i8m1_i16m2(index16, 0, 16);
    return v_int8x16(vlxbv_i8m1(tab, index, 16));
#endif
}

inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
{
    schar CV_DECL_ALIGNED(32) elems[16] =
    {
        tab[idx[0]], tab[idx[0] + 1], tab[idx[1]], tab[idx[1] + 1],
        tab[idx[2]], tab[idx[2] + 1], tab[idx[3]], tab[idx[3] + 1],
        tab[idx[4]], tab[idx[4] + 1], tab[idx[5]], tab[idx[5] + 1],
        tab[idx[6]], tab[idx[6] + 1], tab[idx[7]], tab[idx[7] + 1]
    };
    return v_int8x16(vle_v_i8m1(elems, 16));
}
inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
{
    schar CV_DECL_ALIGNED(32) elems[16] =
    {
        tab[idx[0]], tab[idx[0] + 1], tab[idx[0] + 2], tab[idx[0] + 3],
        tab[idx[1]], tab[idx[1] + 1], tab[idx[1] + 2], tab[idx[1] + 3],
        tab[idx[2]], tab[idx[2] + 1], tab[idx[2] + 2], tab[idx[2] + 3],
        tab[idx[3]], tab[idx[3] + 1], tab[idx[3] + 2], tab[idx[3] + 3]
    };
    return v_int8x16(vle_v_i8m1(elems, 16));
}

inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }

inline v_int16x8 v_lut(const short* tab, const int* idx)
{
    short CV_DECL_ALIGNED(32) elems[8] =
    {
        tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
        tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]
    };
    return v_int16x8(vle_v_i16m1(elems, 8));
}
inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
{
    short CV_DECL_ALIGNED(32) elems[8] =
    {
        tab[idx[0]], tab[idx[0] + 1], tab[idx[1]], tab[idx[1] + 1],
        tab[idx[2]], tab[idx[2] + 1], tab[idx[3]], tab[idx[3] + 1]
    };
    return v_int16x8(vle_v_i16m1(elems, 8));
}
inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
{
    short CV_DECL_ALIGNED(32) elems[8] =
    {
        tab[idx[0]], tab[idx[0] + 1], tab[idx[0] + 2], tab[idx[0] + 3],
        tab[idx[1]], tab[idx[1] + 1], tab[idx[1] + 2], tab[idx[1] + 3]
    };
    return v_int16x8(vle_v_i16m1(elems, 8));
}
inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }

inline v_int32x4 v_lut(const int* tab, const int* idx)
{
    int CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]
    };
    return v_int32x4(vle_v_i32m1(elems, 4));
}
inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
{
    int CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[idx[0]], tab[idx[0] + 1], tab[idx[1]], tab[idx[1] + 1]
    };
    return v_int32x4(vle_v_i32m1(elems, 4));
}
inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
{
    return v_int32x4(vle_v_i32m1(tab + idx[0], 4));
}
inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }

inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
{
    vint64m1_t res = {tab[idx[0]], tab[idx[1]]};
    return v_int64x2(res);
}
inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
{
    return v_int64x2(vle_v_i64m1(tab + idx[0], 2));
}

inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx)
{
    vuint64m1_t res = {tab[idx[0]], tab[idx[1]]};
    return v_uint64x2(res);
}
inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx)
{
    return v_uint64x2(vle_v_u64m1(tab + idx[0], 2));
}

inline v_float32x4 v_lut(const float* tab, const int* idx)
{
    float CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]
    };
    return v_float32x4(vle_v_f32m1(elems, 4));
}
inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
{
    float CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[idx[0]], tab[idx[0] + 1], tab[idx[1]], tab[idx[1] + 1]
    };
    return v_float32x4(vle_v_f32m1(elems, 4));
}
inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
{
    return v_float32x4(vle_v_f32m1(tab + idx[0], 4));
}
inline v_float64x2 v_lut(const double* tab, const int* idx)
{
    vfloat64m1_t res = {tab[idx[0]], tab[idx[1]]};
    return v_float64x2(res);
}
inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
{
    return v_float64x2(vle_v_f64m1(tab + idx[0], 2));
}

inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
    int CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[idxvec.val[0]], tab[idxvec.val[1]], tab[idxvec.val[2]], tab[idxvec.val[3]]
    };
    return v_int32x4(vle_v_i32m1(elems, 4));
}

inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
{
    unsigned CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[idxvec.val[0]], tab[idxvec.val[1]], tab[idxvec.val[2]], tab[idxvec.val[3]]
    };
    return v_uint32x4(vle_v_u32m1(elems, 4));
}

inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
    float CV_DECL_ALIGNED(32) elems[4] =
    {
        tab[idxvec.val[0]], tab[idxvec.val[1]], tab[idxvec.val[2]], tab[idxvec.val[3]]
    };
    return v_float32x4(vle_v_f32m1(elems, 4));
}
inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
    vfloat64m1_t res = {tab[idxvec.val[0]], tab[idxvec.val[1]]};
    return v_float64x2(res);
}
inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
{
    vint32m1_t index_x = vmul_vx_i32m1(idxvec.val, 4, 4);
    vint32m1_t index_y = vadd_vx_i32m1(index_x, 4, 4);

    x.val = vlxe_v_f32m1(tab, index_x, 4);
    y.val = vlxe_v_f32m1(tab, index_y, 4);
}

inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);

    x = v_float64x2(tab[idx[0]], tab[idx[1]]);
    y = v_float64x2(tab[idx[0] + 1], tab[idx[1] + 1]);
}

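// Packing with saturation: the two sources are placed in the halves of an
// LMUL=2 register group and narrowed in a single vnclip/vnsra/vnsrl step.
// For example, v_pack on two v_int16x8 yields a v_int8x16 whose low 8 lanes
// come from `a` and high 8 lanes from `b`, each saturated to [-128, 127].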
#define OPENCV_HAL_IMPL_RISCVV_PACKS(_Tp, _Tp2, _T2, num2, _T1, num, intrin, shr, _Type) \
inline v_##_Tp##x##num v_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \
{ \
    v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
    tmp = vset_##_T2##m2(tmp, 0, a.val); \
    tmp = vset_##_T2##m2(tmp, 1, b.val); \
    return v_##_Tp##x##num(shr##_##_T1##m1(tmp, 0, num)); \
} \
template<int n> inline \
v_##_Tp##x##num v_rshr_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \
{ \
    v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
    tmp = vset_##_T2##m2(tmp, 0, a.val); \
    tmp = vset_##_T2##m2(tmp, 1, b.val); \
    return v_##_Tp##x##num(intrin##_##_T1##m1(tmp, n, num)); \
} \
inline void v_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \
{ \
    v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
    tmp = vset_##_T2##m2(tmp, 0, a.val); \
    tmp = vset_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \
    asm("" ::: "memory"); \
    vse_v_##_T1##m1(ptr, shr##_##_T1##m1(tmp, 0, num), num2); \
} \
template<int n> inline \
void v_rshr_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \
{ \
    v##_Tp2##m2_t tmp = vundefined_##_T2##m2(); \
    tmp = vset_##_T2##m2(tmp, 0, a.val); \
    tmp = vset_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2)); \
    vse_v_##_T1##m1(ptr, intrin##_##_T1##m1(tmp, n, num), num2); \
}
OPENCV_HAL_IMPL_RISCVV_PACKS(int8, int16, i16, 8, i8, 16, vnclip_vx, vnclip_vx, signed char)
OPENCV_HAL_IMPL_RISCVV_PACKS(int16, int32, i32, 4, i16, 8, vnclip_vx, vnclip_vx, signed short)
OPENCV_HAL_IMPL_RISCVV_PACKS(int32, int64, i64, 2, i32, 4, vnclip_vx, vnsra_vx, int)
OPENCV_HAL_IMPL_RISCVV_PACKS(uint8, uint16, u16, 8, u8, 16, vnclipu_vx, vnclipu_vx, unsigned char)
OPENCV_HAL_IMPL_RISCVV_PACKS(uint16, uint32, u32, 4, u16, 8, vnclipu_vx, vnclipu_vx, unsigned short)
OPENCV_HAL_IMPL_RISCVV_PACKS(uint32, uint64, u64, 2, u32, 4, vnclipu_vx, vnsrl_vx, unsigned int)

// pack boolean
inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
{
    vuint16m2_t tmp = vundefined_u16m2();
    tmp = vset_u16m2(tmp, 0, a.val);
    tmp = vset_u16m2(tmp, 1, b.val);
    return v_uint8x16(vnsrl_vx_u8m1(tmp, 0, 16));
}

inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
                           const v_uint32x4& c, const v_uint32x4& d)
{
    vuint32m4_t vabcd = vundefined_u32m4();
    vuint16m2_t v16 = vundefined_u16m2();
    vabcd = vset_u32m4(vabcd, 0, a.val);
    vabcd = vset_u32m4(vabcd, 1, b.val);
    vabcd = vset_u32m4(vabcd, 2, c.val);
    vabcd = vset_u32m4(vabcd, 3, d.val);
    v16 = vnsrl_vx_u16m2(vabcd, 0, 16);
    return v_uint8x16(vnsrl_vx_u8m1(v16, 0, 16));
}

inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
                           const v_uint64x2& g, const v_uint64x2& h)
{
    vuint64m8_t v64 = vundefined_u64m8();
    vuint32m4_t v32 = vundefined_u32m4();
    vuint16m2_t v16 = vundefined_u16m2();
    v64 = vset_u64m8(v64, 0, a.val);
    v64 = vset_u64m8(v64, 1, b.val);
    v64 = vset_u64m8(v64, 2, c.val);
    v64 = vset_u64m8(v64, 3, d.val);
    v64 = vset_u64m8(v64, 4, e.val);
    v64 = vset_u64m8(v64, 5, f.val);
    v64 = vset_u64m8(v64, 6, g.val);
    v64 = vset_u64m8(v64, 7, h.val);
    v32 = vnsrl_vx_u32m4(v64, 0, 16);
    v16 = vnsrl_vx_u16m2(v32, 0, 16);
    return v_uint8x16(vnsrl_vx_u8m1(v16, 0, 16));
}

//inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b) \
//{ \
//    int16xm2_u tmp; \
//    tmp.m1[0] = (vint16m1_t)a.val; \
//    tmp.m1[1] = (vint16m1_t)b.val; \
//    e8xm1_t mask = (e8xm1_t)vmsge_vx_e16xm2_i16m2(tmp.v, 0, 16); \
//    return v_uint8x16(vnclipuvi_mask_u8m1_u16m2(vmv_v_x_u8m1(0, 16), (vuint16m2_t)tmp.v, 0, mask, 16));
//}

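// Unsigned pack of signed sources: negative lanes are first clamped to zero
// with vmax_vx, then the unsigned narrowing clip saturates the upper range,
// giving the [0, 2^N - 1] behaviour the universal intrinsics require.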
#define OPENCV_HAL_IMPL_RISCVV_PACK_U(tp1, num1, tp2, num2, _Tp) \
inline v_uint##tp1##x##num1 v_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \
{ \
    vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
    tmp = vset_##i##tp2##m2(tmp, 0, a.val); \
    tmp = vset_##i##tp2##m2(tmp, 1, b.val); \
    vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1); \
    return v_uint##tp1##x##num1(vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, 0, num1)); \
} \
inline void v_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \
{ \
    vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
    tmp = vset_##i##tp2##m2(tmp, 0, a.val); \
    vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1); \
    return vse_v_u##tp1##m1(ptr, vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, 0, num1), num2); \
} \
template<int n> inline \
v_uint##tp1##x##num1 v_rshr_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \
{ \
    vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
    tmp = vset_##i##tp2##m2(tmp, 0, a.val); \
    tmp = vset_##i##tp2##m2(tmp, 1, b.val); \
    vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1); \
    return v_uint##tp1##x##num1(vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, n, num1)); \
} \
template<int n> inline \
void v_rshr_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \
{ \
    vint##tp2##m2_t tmp = vundefined_##i##tp2##m2(); \
    tmp = vset_##i##tp2##m2(tmp, 0, a.val); \
    vint##tp2##m2_t val_ = vmax_vx_i##tp2##m2(tmp, 0, num1); \
    vuint##tp1##m1_t val = vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val_, n, num1); \
    return vse_v_u##tp1##m1(ptr, val, num2); \
}
OPENCV_HAL_IMPL_RISCVV_PACK_U(8, 16, 16, 8, unsigned char)
OPENCV_HAL_IMPL_RISCVV_PACK_U(16, 8, 32, 4, unsigned short)

#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
#endif

// saturating multiply 8-bit, 16-bit
#define OPENCV_HAL_IMPL_RISCVV_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpwvec c, d; \
    v_mul_expand(a, b, c, d); \
    return v_pack(c, d); \
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }

OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int8x16, v_int16x8)
OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint8x16, v_uint16x8)
OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int16x8, v_int32x4)
OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint16x8, v_uint32x4)
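// The widen-multiply-pack sequence keeps every product exact in the double
// width type before saturating, e.g. 300 * 200 in v_int16x8 packs to 32767
// instead of wrapping around.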

#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
static const signed char popCountTable[256] =
{
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};

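// Byte-wise popcount emulation: popcount(v) == popCountTable[v >> 1] + (v & 1),
// so the gather index stays within the first 128 table entries and the
// discarded low bit is added back separately.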
inline vuint8m1_t vcnt_u8(vuint8m1_t val)
{
    vuint8m1_t v0 = val & 1;
    return vlxe_v_u8m1((unsigned char*)popCountTable, val >> 1, 16) + v0;
}

inline v_uint8x16 v_popcount(const v_uint8x16& a)
{
    return v_uint8x16(vcnt_u8(a.val));
}

inline v_uint8x16 v_popcount(const v_int8x16& a)
{
    return v_uint8x16(vcnt_u8((vuint8m1_t)a.val));
}

inline v_uint16x8 v_popcount(const v_uint16x8& a)
{
    vuint8m2_t tmp = vundefined_u8m2();
    tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
    vuint64m2_t mask = (vuint64m2_t){0x0E0C0A0806040200, 0, 0x0F0D0B0907050301, 0};
    tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32);
    vuint16m2_t res = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 8);
    return v_uint16x8(vget_u16m2_u16m1(res, 0));
}

inline v_uint16x8 v_popcount(const v_int16x8& a)
{
    vuint8m2_t tmp = vundefined_u8m2();
    tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
    vuint64m2_t mask = (vuint64m2_t){0x0E0C0A0806040200, 0, 0x0F0D0B0907050301, 0};
    tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32);
    vuint16m2_t res = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 8);
    return v_uint16x8(vget_u16m2_u16m1(res, 0));
}

inline v_uint32x4 v_popcount(const v_uint32x4& a)
{
    vuint8m2_t tmp = vundefined_u8m2();
    tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
    vuint64m2_t mask = (vuint64m2_t){0xFFFFFFFF0C080400, 0xFFFFFFFF0D090501,
                                     0xFFFFFFFF0E0A0602, 0xFFFFFFFF0F0B0703};
    tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32);
    vuint16m2_t res_ = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 16);
    vuint32m2_t res = vwaddu_vv_u32m2(vget_u16m2_u16m1(res_, 0), vget_u16m2_u16m1(res_, 1), 8);
    return v_uint32x4(vget_u32m2_u32m1(res, 0));
}

inline v_uint32x4 v_popcount(const v_int32x4& a)
{
    vuint8m2_t tmp = vundefined_u8m2();
    tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
    vuint64m2_t mask = (vuint64m2_t){0xFFFFFFFF0C080400, 0xFFFFFFFF0D090501,
                                     0xFFFFFFFF0E0A0602, 0xFFFFFFFF0F0B0703};
    tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32);
    vuint16m2_t res_ = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 16);
    vuint32m2_t res = vwaddu_vv_u32m2(vget_u16m2_u16m1(res_, 0), vget_u16m2_u16m1(res_, 1), 8);
    return v_uint32x4(vget_u32m2_u32m1(res, 0));
}

inline v_uint64x2 v_popcount(const v_uint64x2& a)
{
    vuint8m2_t tmp = vundefined_u8m2();
    tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
    vuint64m2_t mask = (vuint64m2_t){0x0706050403020100, 0x0000000000000000,
                                     0x0F0E0D0C0B0A0908, 0x0000000000000000};
    tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32);
    vuint8m1_t zero = vmv_v_x_u8m1(0, 16);
    vuint8m1_t res1 = zero;
    vuint8m1_t res2 = zero;
    res1 = vredsum_vs_u8m1_u8m1(res1, vget_u8m2_u8m1(tmp, 0), zero, 8);
    res2 = vredsum_vs_u8m1_u8m1(res2, vget_u8m2_u8m1(tmp, 1), zero, 8);

    return v_uint64x2((unsigned long)vmv_x_s_u8m1_u8(res1, 8), (unsigned long)vmv_x_s_u8m1_u8(res2, 8));
}

inline v_uint64x2 v_popcount(const v_int64x2& a)
{
    vuint8m2_t tmp = vundefined_u8m2();
    tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
    vuint64m2_t mask = (vuint64m2_t){0x0706050403020100, 0x0000000000000000,
                                     0x0F0E0D0C0B0A0908, 0x0000000000000000};
    tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32);
    vuint8m1_t zero = vmv_v_x_u8m1(0, 16);
    vuint8m1_t res1 = zero;
    vuint8m1_t res2 = zero;
    res1 = vredsum_vs_u8m1_u8m1(res1, vget_u8m2_u8m1(tmp, 0), zero, 8);
    res2 = vredsum_vs_u8m1_u8m1(res2, vget_u8m2_u8m1(tmp, 1), zero, 8);

    return v_uint64x2((unsigned long)vmv_x_s_u8m1_u8(res1, 8), (unsigned long)vmv_x_s_u8m1_u8(res2, 8));
}

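// v_signmask collects the lane sign bits into an int, lane 0 in bit 0: the
// MSB of each lane is shifted down to 0/1, multiplied by the weight vector
// {1, 2, 4, ...} and reduced with a sum. E.g. i32 lanes {-1, 0, -5, 7} give
// sign bits {1, 0, 1, 0} and thus 1*1 + 4*1 = 5.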
#define SMASK 1, 2, 4, 8, 16, 32, 64, 128
inline int v_signmask(const v_uint8x16& a)
{
    vuint8m1_t t0 = vsrl_vx_u8m1(a.val, 7, 16);
    vuint8m1_t m1 = (vuint8m1_t){SMASK, SMASK};
    vuint16m2_t t1 = vwmulu_vv_u16m2(t0, m1, 16);
    vuint32m1_t res = vmv_v_x_u32m1(0, 4);
    vuint32m2_t t2 = vwmulu_vx_u32m2(vget_u16m2_u16m1(t1, 1), 256, 8);
    res = vredsum_vs_u32m2_u32m1(res, t2, res, 8);
    res = vwredsumu_vs_u16m1_u32m1(res, vget_u16m2_u16m1(t1, 0), res, 8);
    return vmv_x_s_u32m1_u32(res, 8);
}
inline int v_signmask(const v_int8x16& a)
{
    vuint8m1_t t0 = vsrl_vx_u8m1((vuint8m1_t)a.val, 7, 16);
    vuint8m1_t m1 = (vuint8m1_t){SMASK, SMASK};
    vint16m2_t t1 = (vint16m2_t)vwmulu_vv_u16m2(t0, m1, 16);
    vint32m1_t res = vmv_v_x_i32m1(0, 4);
    vint32m2_t t2 = vwmul_vx_i32m2(vget_i16m2_i16m1(t1, 1), 256, 8);
    res = vredsum_vs_i32m2_i32m1(res, t2, res, 8);
    res = vwredsum_vs_i16m1_i32m1(res, vget_i16m2_i16m1(t1, 0), res, 8);
    return vmv_x_s_i32m1_i32(res, 8);
}

inline int v_signmask(const v_int16x8& a)
{
    vint16m1_t t0 = (vint16m1_t)vsrl_vx_u16m1((vuint16m1_t)a.val, 15, 8);
    vint16m1_t m1 = (vint16m1_t){SMASK};
    vint16m1_t t1 = vmul_vv_i16m1(t0, m1, 8);
    vint16m1_t res = vmv_v_x_i16m1(0, 8);
    res = vredsum_vs_i16m1_i16m1(res, t1, res, 8);
    return vmv_x_s_i16m1_i16(res, 8);
}
inline int v_signmask(const v_uint16x8& a)
{
    vint16m1_t t0 = (vint16m1_t)vsrl_vx_u16m1((vuint16m1_t)a.val, 15, 8);
    vint16m1_t m1 = (vint16m1_t){SMASK};
    vint16m1_t t1 = vmul_vv_i16m1(t0, m1, 8);
    vint16m1_t res = vmv_v_x_i16m1(0, 8);
    res = vredsum_vs_i16m1_i16m1(res, t1, res, 8);
    return vmv_x_s_i16m1_i16(res, 8);
}
inline int v_signmask(const v_int32x4& a)
{
    vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1((vuint32m1_t)a.val, 31, 4);
    vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8};
    vint32m1_t res = vmv_v_x_i32m1(0, 4);
    vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4);
    res = vredsum_vs_i32m1_i32m1(res, t1, res, 4);
    return vmv_x_s_i32m1_i32(res, 4);
}
inline int v_signmask(const v_uint32x4& a)
{
    vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1(a.val, 31, 4);
    vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8};
    vint32m1_t res = vmv_v_x_i32m1(0, 4);
    vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4);
    res = vredsum_vs_i32m1_i32m1(res, t1, res, 4);
    return vmv_x_s_i32m1_i32(res, 4);
}
inline int v_signmask(const v_uint64x2& a)
{
    vuint64m1_t v0 = vsrl_vx_u64m1(a.val, 63, 2);
    int res = (int)vext_x_v_u64m1_u64(v0, 0, 2) + ((int)vext_x_v_u64m1_u64(v0, 1, 2) << 1);
    return res;
}
inline int v_signmask(const v_int64x2& a)
{ return v_signmask(v_reinterpret_as_u64(a)); }
inline int v_signmask(const v_float64x2& a)
{ return v_signmask(v_reinterpret_as_u64(a)); }
inline int v_signmask(const v_float32x4& a)
{
    vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1((vuint32m1_t)a.val, 31, 4);
    vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8};
    vint32m1_t res = vmv_v_x_i32m1(0, 4);
    vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4);
    res = vredsum_vs_i32m1_i32m1(res, t1, res, 4);
    return vmv_x_s_i32m1_i32(res, 4);
}

inline int v_scan_forward(const v_int8x16& a) {
    int val = v_signmask(a);
    if (val == 0) return 0;
    else return trailingZeros32(val);
}
inline int v_scan_forward(const v_uint8x16& a) {
    int val = v_signmask(a);
    if (val == 0) return 0;
    else return trailingZeros32(val);
}
inline int v_scan_forward(const v_int16x8& a) {
    int val = v_signmask(a);
    if (val == 0) return 0;
    else return trailingZeros32(val);
}
inline int v_scan_forward(const v_uint16x8& a) {
    int val = v_signmask(a);
    if (val == 0) return 0;
    else return trailingZeros32(val);
}
inline int v_scan_forward(const v_int32x4& a) {
    int val = v_signmask(a);
    if (val == 0) return 0;
    else return trailingZeros32(val);
}
inline int v_scan_forward(const v_uint32x4& a) {
    int val = v_signmask(a);
    if (val == 0) return 0;
    else return trailingZeros32(val);
}
inline int v_scan_forward(const v_float32x4& a) {
    int val = v_signmask(a);
    if (val == 0) return 0;
    else return trailingZeros32(val);
}
inline int v_scan_forward(const v_int64x2& a) {
    int val = v_signmask(a);
    if (val == 0) return 0;
    else return trailingZeros32(val);
}
inline int v_scan_forward(const v_uint64x2& a) {
    int val = v_signmask(a);
    if (val == 0) return 0;
    else return trailingZeros32(val);
}

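// v_check_all inverts the vector, shifts each lane's sign bit down to bit 0
// and tests the whole 128-bit register as two u64 halves: the OR is zero iff
// every lane had its MSB set. v_check_any is the same test without the
// inversion.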
#define OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(_Tpvec, suffix, _T, shift, num) \
inline bool v_check_all(const v_##_Tpvec& a) \
{ \
    suffix##m1_t v0 = vsrl_vx_##_T(vnot_v_##_T(a.val, num), shift, num); \
    vuint64m1_t v1 = vuint64m1_t(v0); \
    return (v1[0] | v1[1]) == 0; \
} \
inline bool v_check_any(const v_##_Tpvec& a) \
{ \
    suffix##m1_t v0 = vsrl_vx_##_T(a.val, shift, num); \
    vuint64m1_t v1 = vuint64m1_t(v0); \
    return (v1[0] | v1[1]) != 0; \
}

OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint8x16, vuint8, u8m1, 7, 16)
OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint16x8, vuint16, u16m1, 15, 8)
OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint32x4, vuint32, u32m1, 31, 4)
OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint64x2, vuint64, u64m1, 63, 2)

inline bool v_check_all(const v_int8x16& a)
{ return v_check_all(v_reinterpret_as_u8(a)); }
inline bool v_check_all(const v_int16x8& a)
{ return v_check_all(v_reinterpret_as_u16(a)); }
inline bool v_check_all(const v_int32x4& a)
{ return v_check_all(v_reinterpret_as_u32(a)); }
inline bool v_check_all(const v_float32x4& a)
{ return v_check_all(v_reinterpret_as_u32(a)); }
inline bool v_check_all(const v_int64x2& a)
{ return v_check_all(v_reinterpret_as_u64(a)); }
inline bool v_check_all(const v_float64x2& a)
{ return v_check_all(v_reinterpret_as_u64(a)); }

inline bool v_check_any(const v_int8x16& a)
{ return v_check_any(v_reinterpret_as_u8(a)); }
inline bool v_check_any(const v_int16x8& a)
{ return v_check_any(v_reinterpret_as_u16(a)); }
inline bool v_check_any(const v_int32x4& a)
{ return v_check_any(v_reinterpret_as_u32(a)); }
inline bool v_check_any(const v_float32x4& a)
{ return v_check_any(v_reinterpret_as_u32(a)); }
inline bool v_check_any(const v_int64x2& a)
{ return v_check_any(v_reinterpret_as_u64(a)); }
inline bool v_check_any(const v_float64x2& a)
{ return v_check_any(v_reinterpret_as_u64(a)); }

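// Per-lane select: lanes of `mask` are expected to be all-ones or all-zeros
// (as produced by the comparison operators); the mask vector is reinterpreted
// as an RVV mask register for vmerge, which picks `a` where the mask is set
// and `b` elsewhere.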
#define OPENCV_HAL_IMPL_RISCVV_SELECT(_Tpvec, suffix, _Tpvec2, num) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(vmerge_vvm_##suffix(_Tpvec2(mask.val), b.val, a.val, num)); \
}

OPENCV_HAL_IMPL_RISCVV_SELECT(v_int8x16, i8m1, vbool8_t, 16)
OPENCV_HAL_IMPL_RISCVV_SELECT(v_int16x8, i16m1, vbool16_t, 8)
OPENCV_HAL_IMPL_RISCVV_SELECT(v_int32x4, i32m1, vbool32_t, 4)
OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint8x16, u8m1, vbool8_t, 16)
OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint16x8, u16m1, vbool16_t, 8)
OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint32x4, u32m1, vbool32_t, 4)
inline v_float32x4 v_select(const v_float32x4& mask, const v_float32x4& a, const v_float32x4& b)
{
    return v_float32x4((vfloat32m1_t)vmerge_vvm_u32m1((vbool32_t)mask.val, (vuint32m1_t)b.val, (vuint32m1_t)a.val, 4));
}
inline v_float64x2 v_select(const v_float64x2& mask, const v_float64x2& a, const v_float64x2& b)
{
    return v_float64x2((vfloat64m1_t)vmerge_vvm_u64m1((vbool64_t)mask.val, (vuint64m1_t)b.val, (vuint64m1_t)a.val, 2));
}

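// Widening via vwadd/vwaddu with a zero operand: the widening add sign- or
// zero-extends every lane to double width in one instruction, and the m2
// result is split into its two m1 halves for b0/b1.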
#define OPENCV_HAL_IMPL_RISCVV_EXPAND(add, _Tpvec, _Tpwvec, _Tp, _Tp1, num1, _Tp2, num2, _T1, _T2) \
inline void v_expand(const _Tpvec& a, v_##_Tpwvec& b0, v_##_Tpwvec& b1) \
{ \
    _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num1), num1); \
    b0.val = vget_##_Tp2##m2_##_Tp2##m1(b, 0); \
    b1.val = vget_##_Tp2##m2_##_Tp2##m1(b, 1); \
} \
inline v_##_Tpwvec v_expand_low(const _Tpvec& a) \
{ \
    _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num2), num2); \
    return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 0)); \
} \
inline v_##_Tpwvec v_expand_high(const _Tpvec& a) \
{ \
    _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num1), num1); \
    return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 1)); \
} \
inline v_##_Tpwvec v_load_expand(const _Tp* ptr) \
{ \
    _T2##_t val = vle##_v_##_Tp1(ptr, num2); \
    _T1##_t b = vw##add##_vv_##_Tp2##m2(val, vmv_v_x_##_Tp1(0, num2), num2); \
    return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 0)); \
}

OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint8x16, uint16x8, uchar, u8m1, 16, u16, 8, vuint16m2, vuint8m1)
OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint16x8, uint32x4, ushort, u16m1, 8, u32, 4, vuint32m2, vuint16m1)
OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint32x4, uint64x2, uint, u32m1, 4, u64, 2, vuint64m2, vuint32m1)
OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int8x16, int16x8, schar, i8m1, 16, i16, 8, vint16m2, vint8m1)
OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int16x8, int32x4, short, i16m1, 8, i32, 4, vint32m2, vint16m1)
OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int32x4, int64x2, int, i32m1, 4, i64, 2, vint64m2, vint32m1)

inline v_uint32x4 v_load_expand_q(const uchar* ptr)
{
    vuint16m2_t b = vundefined_u16m2();
    vuint32m2_t c = vundefined_u32m2();
    vuint8m1_t val = vle_v_u8m1(ptr, 4);
    b = vwaddu_vv_u16m2(val, vmv_v_x_u8m1(0, 4), 4);
    c = vwaddu_vv_u32m2(vget_u16m2_u16m1(b, 0), vmv_v_x_u16m1(0, 4), 4);
    return v_uint32x4(vget_u32m2_u32m1(c, 0));
}

inline v_int32x4 v_load_expand_q(const schar* ptr)
{
    vint16m2_t b = vundefined_i16m2();
    vint32m2_t c = vundefined_i32m2();
    vint8m1_t val = vle_v_i8m1(ptr, 4);
    b = vwadd_vv_i16m2(val, vmv_v_x_i8m1(0, 4), 4);
    c = vwadd_vv_i32m2(vget_i16m2_i16m1(b, 0), vmv_v_x_i16m1(0, 4), 4);
    return v_int32x4(vget_i32m2_i32m1(c, 0));
}
#define VITL_16 (vuint64m2_t){0x1303120211011000, 0x1707160615051404, 0x1B0B1A0A19091808, 0x1F0F1E0E1D0D1C0C}
#define VITL_8 (vuint64m2_t){0x0009000100080000, 0x000B0003000A0002, 0x000D0005000C0004, 0x000F0007000E0006}
#define VITL_4 (vuint64m2_t){0x0000000400000000, 0x0000000500000001, 0x0000000600000002, 0x0000000700000003}
#define VITL_2 (vuint64m2_t){0, 2, 1, 3}
#define LOW_4 0x0000000100000000, 0x0000000500000004
#define LOW_8 0x0003000200010000, 0x000B000A00090008
#define LOW_16 0x0706050403020100, 0x1716151413121110
#define HIGH_4 0x0000000300000002, 0x0000000700000006
#define HIGH_8 0x0007000600050004, 0x000F000E000D000C
#define HIGH_16 0x0F0E0D0C0B0A0908, 0x1F1E1D1C1B1A1918
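// VITL_* are vrgather index tables that interleave the two halves of an m2
// register group: e.g. VITL_16 reads as bytes {0, 16, 1, 17, 2, 18, ...}, so
// gathering the concatenation a0:a1 produces {a0[0], a1[0], a0[1], a1[1], ...}
// as required by v_zip. LOW_*/HIGH_* list the indices of the low and high
// halves of such a pair.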
#define OPENCV_HAL_IMPL_RISCVV_UNPACKS(_Tpvec, _Tp, _T, _UTp, _UT, num, num2, len, numh) \
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
{ \
    v##_Tp##m2_t tmp = vundefined_##_T##m2(); \
    tmp = vset_##_T##m2(tmp, 0, a0.val); \
    tmp = vset_##_T##m2(tmp, 1, a1.val); \
    vuint64m2_t mask = VITL_##num; \
    tmp = (v##_Tp##m2_t)vrgather_vv_##_T##m2((v##_Tp##m2_t)tmp, (v##_UTp##m2_t)mask, num2); \
    b0.val = vget_##_T##m2_##_T##m1(tmp, 0); \
    b1.val = vget_##_T##m2_##_T##m1(tmp, 1); \
} \
inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    v##_Tp##m1_t b0 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a.val, b.val, numh, num); \
    return v_##_Tpvec(b0); \
} \
inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    v##_Tp##m1_t b0 = vslidedown_vx_##_T##m1(b.val, numh, num); \
    v##_Tp##m1_t a0 = vslidedown_vx_##_T##m1(a.val, numh, num); \
    v##_Tp##m1_t b1 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num); \
    return v_##_Tpvec(b1); \
} \
inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    c.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a.val, b.val, numh, num); \
    v##_Tp##m1_t b0 = vslidedown_vx_##_T##m1(b.val, numh, num); \
    v##_Tp##m1_t a0 = vslidedown_vx_##_T##m1(a.val, numh, num); \
    d.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num); \
}

OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint8x16, uint8, u8, uint8, u8, 16, 32, b8, 8)
OPENCV_HAL_IMPL_RISCVV_UNPACKS(int8x16, int8, i8, uint8, u8, 16, 32, b8, 8)
OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint16x8, uint16, u16, uint16, u16, 8, 16, b16, 4)
OPENCV_HAL_IMPL_RISCVV_UNPACKS(int16x8, int16, i16, uint16, u16, 8, 16, b16, 4)
OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint32x4, uint32, u32, uint32, u32, 4, 8, b32, 2)
OPENCV_HAL_IMPL_RISCVV_UNPACKS(int32x4, int32, i32, uint32, u32, 4, 8, b32, 2)
OPENCV_HAL_IMPL_RISCVV_UNPACKS(float32x4, float32, f32, uint32, u32, 4, 8, b32, 2)
OPENCV_HAL_IMPL_RISCVV_UNPACKS(float64x2, float64, f64, uint64, u64, 2, 4, b64, 1)

inline v_uint8x16 v_reverse(const v_uint8x16& a)
{
    vuint64m1_t mask = (vuint64m1_t){0x08090A0B0C0D0E0F, 0x0001020304050607};
    return v_uint8x16(vrgather_vv_u8m1(a.val, (vuint8m1_t)mask, 16));
}
inline v_int8x16 v_reverse(const v_int8x16& a)
{
    vint64m1_t mask = (vint64m1_t){0x08090A0B0C0D0E0F, 0x0001020304050607};
    return v_int8x16(vrgather_vv_i8m1(a.val, (vuint8m1_t)mask, 16));
}

inline v_uint16x8 v_reverse(const v_uint16x8& a)
{
    vuint64m1_t mask = (vuint64m1_t){0x0004000500060007, 0x0000000100020003};
    return v_uint16x8(vrgather_vv_u16m1(a.val, (vuint16m1_t)mask, 8));
}

inline v_int16x8 v_reverse(const v_int16x8& a)
{
    vint64m1_t mask = (vint64m1_t){0x0004000500060007, 0x0000000100020003};
    return v_int16x8(vrgather_vv_i16m1(a.val, (vuint16m1_t)mask, 8));
}
inline v_uint32x4 v_reverse(const v_uint32x4& a)
{
    return v_uint32x4(vrgather_vv_u32m1(a.val, (vuint32m1_t){3, 2, 1, 0}, 4));
}

inline v_int32x4 v_reverse(const v_int32x4& a)
{
    return v_int32x4(vrgather_vv_i32m1(a.val, (vuint32m1_t){3, 2, 1, 0}, 4));
}

inline v_float32x4 v_reverse(const v_float32x4& a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }

inline v_uint64x2 v_reverse(const v_uint64x2& a)
{
    return v_uint64x2(a.val[1], a.val[0]);
}

inline v_int64x2 v_reverse(const v_int64x2& a)
{
    return v_int64x2(a.val[1], a.val[0]);
}

inline v_float64x2 v_reverse(const v_float64x2& a)
{
    return v_float64x2(a.val[1], a.val[0]);
}

#define OPENCV_HAL_IMPL_RISCVV_EXTRACT(_Tpvec, suffix, size) \
template <int n> \
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
{ return v_rotate_right<n>(a, b); }
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint8x16, u8, 0)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int8x16, s8, 0)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint16x8, u16, 1)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int16x8, s16, 1)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint32x4, u32, 2)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int32x4, s32, 2)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint64x2, u64, 3)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int64x2, s64, 3)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float32x4, f32, 2)
OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float64x2, f64, 3)


#define OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(_Tpvec, _Tp, suffix) \
template<int i> inline _Tp v_extract_n(_Tpvec v) { return v.val[i]; }

OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int16x8, short, s16)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint32x4, uint, u32)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int32x4, int, s32)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int64x2, int64, s64)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float32x4, float, f32)
OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float64x2, double, f64)

#define OPENCV_HAL_IMPL_RISCVV_BROADCAST(_Tpvec, _Tp, num) \
template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) { return _Tpvec(vrgather_vx_##_Tp##m1(v.val, i, num)); }

OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint8x16, u8, 16)
OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int8x16, i8, 16)
OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint16x8, u16, 8)
OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int16x8, i16, 8)
OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint32x4, u32, 4)
OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int32x4, i32, 4)
OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint64x2, u64, 2)
OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int64x2, i64, 2)
OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_float32x4, f32, 4)
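// The rounding conversions below work by switching the dynamic floating-point
// rounding mode with the fsrm builtin (0 = round-to-nearest-even, 1 = toward
// zero, 2 = down, 3 = up), converting, and restoring mode 0. Lanes whose
// exponent field is all ones (NaN/Inf) are excluded by the mask and forced
// to 0 instead of being converted.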
inline v_int32x4 v_round(const v_float32x4& a)
{
    __builtin_riscv_fsrm(0);
    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4);
    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
    __builtin_riscv_fsrm(0);
    return v_int32x4(val);
}
inline v_int32x4 v_floor(const v_float32x4& a)
{
    __builtin_riscv_fsrm(2);
    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4);
    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
    __builtin_riscv_fsrm(0);
    return v_int32x4(val);
}

inline v_int32x4 v_ceil(const v_float32x4& a)
{
    __builtin_riscv_fsrm(3);
    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4);
    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
    __builtin_riscv_fsrm(0);
    return v_int32x4(val);
}

inline v_int32x4 v_trunc(const v_float32x4& a)
{
    __builtin_riscv_fsrm(1);
    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4);
    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
    __builtin_riscv_fsrm(0);
    return v_int32x4(val);
}

inline v_int32x4 v_round(const v_float64x2& a)
{
    __builtin_riscv_fsrm(0);
    vfloat64m2_t _val = vundefined_f64m2();
    _val = vset_f64m2(_val, 0, a.val);
    //_val = vset_f64m2(_val, 1, a.val);
    _val = vset_f64m2(_val, 1, vfmv_v_f_f64m1(0, 2));
    vint32m1_t val = vfncvt_x_f_v_i32m1(_val, 4);
    __builtin_riscv_fsrm(0);
    return v_int32x4(val);
}
inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
{
    __builtin_riscv_fsrm(0);
    vfloat64m2_t _val = vundefined_f64m2();
    _val = vset_f64m2(_val, 0, a.val);
    _val = vset_f64m2(_val, 1, b.val);
    vint32m1_t val = vfncvt_x_f_v_i32m1(_val, 4);
    __builtin_riscv_fsrm(0);
    return v_int32x4(val);
}
inline v_int32x4 v_floor(const v_float64x2& a)
{
    __builtin_riscv_fsrm(2);
    vfloat64m2_t _val = vundefined_f64m2();
    _val = vset_f64m2(_val, 0, a.val);
    vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2);

    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4);
    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
    __builtin_riscv_fsrm(0);
    return v_int32x4(val);
}

inline v_int32x4 v_ceil(const v_float64x2& a)
{
    __builtin_riscv_fsrm(3);
    vfloat64m2_t _val = vundefined_f64m2();
    _val = vset_f64m2(_val, 0, a.val);
    vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2);

    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4);
    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
    __builtin_riscv_fsrm(0);
    return v_int32x4(val);
}

inline v_int32x4 v_trunc(const v_float64x2& a)
{
    __builtin_riscv_fsrm(1);
    vfloat64m2_t _val = vundefined_f64m2();
    _val = vset_f64m2(_val, 0, a.val);
    vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2);

    vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4);
    vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
    vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
    __builtin_riscv_fsrm(0);
    return v_int32x4(val);
}

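// De-interleaving loads and interleaving stores map directly onto the RVV
// segment load/store instructions: vlseg<N>e pulls N-element structures apart
// into N registers and vsseg<N>e packs them back, so no separate shuffle is
// needed.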
#define OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(intrin, _Tpvec, num, _Tp, _T) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \
{ \
    v##_Tpvec##m1x2_t ret = intrin##2e_v_##_T##m1x2(ptr, num); \
    a.val = vget_##_T##m1x2_##_T##m1(ret, 0); \
    b.val = vget_##_T##m1x2_##_T##m1(ret, 1); \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \
{ \
    v##_Tpvec##m1x3_t ret = intrin##3e_v_##_T##m1x3(ptr, num); \
    a.val = vget_##_T##m1x3_##_T##m1(ret, 0); \
    b.val = vget_##_T##m1x3_##_T##m1(ret, 1); \
    c.val = vget_##_T##m1x3_##_T##m1(ret, 2); \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \
                                v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \
{ \
    v##_Tpvec##m1x4_t ret = intrin##4e_v_##_T##m1x4(ptr, num); \
    a.val = vget_##_T##m1x4_##_T##m1(ret, 0); \
    b.val = vget_##_T##m1x4_##_T##m1(ret, 1); \
    c.val = vget_##_T##m1x4_##_T##m1(ret, 2); \
    d.val = vget_##_T##m1x4_##_T##m1(ret, 3); \
}

#define OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(intrin, _Tpvec, num, _Tp, _T) \
inline void v_store_interleave(_Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    v##_Tpvec##m1x2_t ret = vundefined_##_T##m1x2(); \
    ret = vset_##_T##m1x2(ret, 0, a.val); \
    ret = vset_##_T##m1x2(ret, 1, b.val); \
    intrin##2e_v_##_T##m1x2(ptr, ret, num); \
} \
inline void v_store_interleave(_Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                               const v_##_Tpvec##x##num& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    v##_Tpvec##m1x3_t ret = vundefined_##_T##m1x3(); \
    ret = vset_##_T##m1x3(ret, 0, a.val); \
    ret = vset_##_T##m1x3(ret, 1, b.val); \
    ret = vset_##_T##m1x3(ret, 2, c.val); \
    intrin##3e_v_##_T##m1x3(ptr, ret, num); \
} \
inline void v_store_interleave(_Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                               const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \
                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    v##_Tpvec##m1x4_t ret = vundefined_##_T##m1x4(); \
    ret = vset_##_T##m1x4(ret, 0, a.val); \
    ret = vset_##_T##m1x4(ret, 1, b.val); \
    ret = vset_##_T##m1x4(ret, 2, c.val); \
    ret = vset_##_T##m1x4(ret, 3, d.val); \
    intrin##4e_v_##_T##m1x4(ptr, ret, num); \
}

#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(_Tpvec, _Tp, num, ld, st, _T) \
OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(ld, _Tpvec, num, _Tp, _T) \
OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(st, _Tpvec, num, _Tp, _T)

//OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, uchar, )
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int8, schar, 16, vlseg, vsseg, i8)
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int16, short, 8, vlseg, vsseg, i16)
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int32, int, 4, vlseg, vsseg, i32)

OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, unsigned char, 16, vlseg, vsseg, u8)
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint16, unsigned short, 8, vlseg, vsseg, u16)
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint32, unsigned int, 4, vlseg, vsseg, u32)

#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(_Tpvec, _Tp, num, _T) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \
{ \
    v##_Tpvec##m1x2_t ret = vlseg2e_v_##_T##m1x2(ptr, num); \
    a.val = vget_##_T##m1x2_##_T##m1(ret, 0); \
    b.val = vget_##_T##m1x2_##_T##m1(ret, 1); \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \
{ \
    v##_Tpvec##m1x3_t ret = vlseg3e_v_##_T##m1x3(ptr, num); \
    a.val = vget_##_T##m1x3_##_T##m1(ret, 0); \
    b.val = vget_##_T##m1x3_##_T##m1(ret, 1); \
    c.val = vget_##_T##m1x3_##_T##m1(ret, 2); \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \
                                v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \
{ \
    v##_Tpvec##m1x4_t ret = vlseg4e_v_##_T##m1x4(ptr, num); \
    a.val = vget_##_T##m1x4_##_T##m1(ret, 0); \
    b.val = vget_##_T##m1x4_##_T##m1(ret, 1); \
    c.val = vget_##_T##m1x4_##_T##m1(ret, 2); \
    d.val = vget_##_T##m1x4_##_T##m1(ret, 3); \
} \
inline void v_store_interleave(_Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    v##_Tpvec##m1x2_t ret = vundefined_##_T##m1x2(); \
    ret = vset_##_T##m1x2(ret, 0, a.val); \
    ret = vset_##_T##m1x2(ret, 1, b.val); \
    vsseg2e_v_##_T##m1x2(ptr, ret, num); \
} \
inline void v_store_interleave(_Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                               const v_##_Tpvec##x##num& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    v##_Tpvec##m1x3_t ret = vundefined_##_T##m1x3(); \
    ret = vset_##_T##m1x3(ret, 0, a.val); \
    ret = vset_##_T##m1x3(ret, 1, b.val); \
    ret = vset_##_T##m1x3(ret, 2, c.val); \
    vsseg3e_v_##_T##m1x3(ptr, ret, num); \
} \
inline void v_store_interleave(_Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                               const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \
                               hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    v##_Tpvec##m1x4_t ret = vundefined_##_T##m1x4(); \
    ret = vset_##_T##m1x4(ret, 0, a.val); \
    ret = vset_##_T##m1x4(ret, 1, b.val); \
    ret = vset_##_T##m1x4(ret, 2, c.val); \
    ret = vset_##_T##m1x4(ret, 3, d.val); \
    vsseg4e_v_##_T##m1x4(ptr, ret, num); \
}
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float32, float, 4, f32)
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float64, double, 2, f64)

OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(uint64, unsigned long, 2, u64)
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(int64, long, 2, i64)

inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{
    return v_float32x4(vfcvt_f_x_v_f32m1(a.val, 4));
}

#if CV_SIMD128_64F
inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{
    vfloat64m2_t _val = vundefined_f64m2();
    _val = vset_f64m2(_val, 0, a.val);
    vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2);
    return v_float32x4(aval);
}

inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{
    vfloat64m2_t _val = vundefined_f64m2();
    _val = vset_f64m2(_val, 0, a.val);
    _val = vset_f64m2(_val, 1, b.val);
    vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 4);
    return v_float32x4(aval);
}

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
    vfloat32m1_t val = vfcvt_f_x_v_f32m1(a.val, 4);
    vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(val, 4);
    return v_float64x2(vget_f64m2_f64m1(_val, 0));
}

inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{
    vfloat32m1_t val = vfcvt_f_x_v_f32m1(a.val, 4);
    vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(val, 4);
    return v_float64x2(vget_f64m2_f64m1(_val, 1));
}

inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{
    vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(a.val, 4);
    return v_float64x2(vget_f64m2_f64m1(_val, 0));
}

inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{
    vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(a.val, 4);
    return v_float64x2(vget_f64m2_f64m1(_val, 1));
}

inline v_float64x2 v_cvt_f64(const v_int64x2& a)
{
    return v_float64x2(vfcvt_f_x_v_f64m1(a.val, 2));
}

#endif
inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
{
    vuint64m1_t m0 = {0x0705060403010200, 0x0F0D0E0C0B090A08};
    return v_int8x16(vrgather_vv_i8m1(vec.val, (vuint8m1_t)m0, 16));
}
inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec)
{
    return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec)));
}

inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
{
    vuint64m1_t m0 = {0x0703060205010400, 0x0F0B0E0A0D090C08};
    return v_int8x16(vrgather_vv_i8m1(vec.val, (vuint8m1_t)m0, 16));
}
inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec)
{
    return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec)));
}

inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
{
    vuint64m1_t m0 = {0x0706030205040100, 0x0F0E0B0A0D0C0908};
    return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)vec.val, (vuint8m1_t)m0, 16));
}
inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
{
    vuint64m1_t m0 = {0x0B0A030209080100, 0x0F0E07060D0C0504};
    return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
}
inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }

inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
{
    vuint64m1_t m0 = {0x0B0A090803020100, 0x0F0E0D0C07060504};
    return v_int32x4((vint32m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
}
inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
{
    vuint64m1_t m0 = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A};
    return v_int8x16((vint8m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
}
inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }

inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
{
    vuint64m1_t m0 = {0x0908050403020100, 0xFFFFFFFF0D0C0B0A};
    return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
}
inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }

inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }

#if CV_SIMD128_64F
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
                                    const v_float64x2& c)
{ return v_dotprod_expand(a, b) + c; }
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
{
    vint64m2_t v1 = vwmul_vv_i64m2(a.val, b.val, 4);
    vfloat64m1_t res = vfcvt_f_x_v_f64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 2), 2);
    return v_float64x2(res);
}
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{
    v_float64x2 res = v_dotprod_expand_fast(a, b);
    return res + c;
}
#endif
////// FP16 support ///////
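// float16_t storage is reinterpreted as __fp16 and converted through the
// f16<->f32 widening/narrowing instructions; only the low half of the f32m2
// group is meaningful since one 128-bit register holds four floats.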
inline v_float32x4 v_load_expand(const float16_t* ptr)
{
    vfloat16m1_t v = vle_v_f16m1((__fp16*)ptr, 4);
    vfloat32m2_t v32 = vfwcvt_f_f_v_f32m2(v, 4);
    return v_float32x4(vget_f32m2_f32m1(v32, 0));
}

inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
{
    vfloat32m2_t v32 = vundefined_f32m2();
    v32 = vset_f32m2(v32, 0, v.val);
    vfloat16m1_t hv = vfncvt_f_f_v_f16m1(v32, 4);
    vse_v_f16m1((__fp16*)ptr, hv, 4);
}


inline void v_cleanup() {}

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

//! @endcond

}
#endif