// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html

// Copyright (C) 2015, PingTouGe Semiconductor Co., Ltd., all rights reserved.

#ifndef OPENCV_HAL_INTRIN_RISCVV_HPP
#define OPENCV_HAL_INTRIN_RISCVV_HPP

#include <float.h>
#include <algorithm>
#include "opencv2/core/utility.hpp"

namespace cv
{

//! @cond IGNORED

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN

#define CV_SIMD128 1
#define CV_SIMD128_64F 1
//////////// Types ////////////
struct v_uint8x16
{
    typedef uchar lane_type;
    enum { nlanes = 16 };

    v_uint8x16() {}
    explicit v_uint8x16(vuint8m1_t v) : val(v) {}
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    {
        uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = (vuint8m1_t)vle_v_u8m1((unsigned char*)v, 16);
    }
    uchar get0() const
    {
        return vmv_x_s_u8m1_u8(val, 16);
    }

    vuint8m1_t val;
};

struct v_int8x16
{
    typedef schar lane_type;
    enum { nlanes = 16 };

    v_int8x16() {}
    explicit v_int8x16(vint8m1_t v) : val(v) {}
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
               schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    {
        schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
        val = (vint8m1_t)vle_v_i8m1((schar*)v, 16);
    }
    schar get0() const
    {
        return vmv_x_s_i8m1_i8(val, 16);
    }

    vint8m1_t val;
};

struct v_uint16x8
{
    typedef ushort lane_type;
    enum { nlanes = 8 };

    v_uint16x8() {}
    explicit v_uint16x8(vuint16m1_t v) : val(v) {}
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = (vuint16m1_t)vle_v_u16m1((unsigned short*)v, 8);
    }
    ushort get0() const
    {
        return vmv_x_s_u16m1_u16(val, 8);
    }

    vuint16m1_t val;
};

struct v_int16x8
{
    typedef short lane_type;
    enum { nlanes = 8 };

    v_int16x8() {}
    explicit v_int16x8(vint16m1_t v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
        val = (vint16m1_t)vle_v_i16m1((signed short*)v, 8);
    }
    short get0() const
    {
        return vmv_x_s_i16m1_i16(val, 8);
    }

    vint16m1_t val;
};

struct v_uint32x4
{
    typedef unsigned lane_type;
    enum { nlanes = 4 };

    v_uint32x4() {}
    explicit v_uint32x4(vuint32m1_t v) : val(v) {}
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    {
        unsigned v[] = {v0, v1, v2, v3};
        val = (vuint32m1_t)vle_v_u32m1((unsigned int*)v, 4);
    }
    unsigned get0() const
    {
        return vmv_x_s_u32m1_u32(val, 4);
    }

    vuint32m1_t val;
};

struct v_int32x4
{
    typedef int lane_type;
    enum { nlanes = 4 };

    v_int32x4() {}
    explicit v_int32x4(vint32m1_t v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)
    {
        int v[] = {v0, v1, v2, v3};
        val = (vint32m1_t)vle_v_i32m1((signed int*)v, 4);
    }
    int get0() const
    {
        return vmv_x_s_i32m1_i32(val, 4);
    }
    vint32m1_t val;
};

struct v_float32x4
{
    typedef float lane_type;
    enum { nlanes = 4 };

    v_float32x4() {}
    explicit v_float32x4(vfloat32m1_t v) : val(v) {}
    v_float32x4(float v0, float v1, float v2, float v3)
    {
        float v[] = {v0, v1, v2, v3};
        val = (vfloat32m1_t)vle_v_f32m1((float*)v, 4);
    }
    float get0() const
    {
        return vfmv_f_s_f32m1_f32(val, 4);
    }
    vfloat32m1_t val;
};

struct v_uint64x2
{
    typedef uint64 lane_type;
    enum { nlanes = 2 };

    v_uint64x2() {}
    explicit v_uint64x2(vuint64m1_t v) : val(v) {}
    v_uint64x2(uint64 v0, uint64 v1)
    {
        uint64 v[] = {v0, v1};
        val = (vuint64m1_t)vle_v_u64m1((unsigned long*)v, 2);
    }
    uint64 get0() const
    {
        return vmv_x_s_u64m1_u64(val, 2);
    }
    vuint64m1_t val;
};

struct v_int64x2
{
    typedef int64 lane_type;
    enum { nlanes = 2 };

    v_int64x2() {}
    explicit v_int64x2(vint64m1_t v) : val(v) {}
    v_int64x2(int64 v0, int64 v1)
    {
        int64 v[] = {v0, v1};
        val = (vint64m1_t)vle_v_i64m1((long*)v, 2);
    }
    int64 get0() const
    {
        return vmv_x_s_i64m1_i64(val, 2);
    }
    vint64m1_t val;
};

struct v_float64x2
{
    typedef double lane_type;
    enum { nlanes = 2 };

    v_float64x2() {}
    explicit v_float64x2(vfloat64m1_t v) : val(v) {}
    v_float64x2(double v0, double v1)
    {
        double v[] = {v0, v1};
        val = (vfloat64m1_t)vle_v_f64m1((double*)v, 2);
    }
    double get0() const
    {
        return vfmv_f_s_f64m1_f64(val, 2);
    }
    vfloat64m1_t val;
};
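
// Illustrative sketch (not part of the API beyond what is defined above): the
// wrappers behave like fixed-size vectors while keeping a native RVV m1
// register in `val`, e.g.
//     v_float32x4 v(1.f, 2.f, 3.f, 4.f);   // pack four lanes
//     float first = v.get0();              // read lane 0 -> 1.f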

#define OPENCV_HAL_IMPL_RISCVV_INIT(_Tpv, _Tp, suffix) \
inline _Tp##m1_t vreinterpretq_##suffix##_##suffix(_Tp##m1_t v) { return v; } \
inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16((vuint8m1_t)(v.val)); } \
inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16((vint8m1_t)(v.val)); } \
inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8((vuint16m1_t)(v.val)); } \
inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8((vint16m1_t)(v.val)); } \
inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4((vuint32m1_t)(v.val)); } \
inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4((vint32m1_t)(v.val)); } \
inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2((vuint64m1_t)(v.val)); } \
inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2((vint64m1_t)(v.val)); } \
inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4((vfloat32m1_t)(v.val)); }\
inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2((vfloat64m1_t)(v.val)); }


OPENCV_HAL_IMPL_RISCVV_INIT(uint8x16, vuint8, u8)
OPENCV_HAL_IMPL_RISCVV_INIT(int8x16, vint8, s8)
OPENCV_HAL_IMPL_RISCVV_INIT(uint16x8, vuint16, u16)
OPENCV_HAL_IMPL_RISCVV_INIT(int16x8, vint16, s16)
OPENCV_HAL_IMPL_RISCVV_INIT(uint32x4, vuint32, u32)
OPENCV_HAL_IMPL_RISCVV_INIT(int32x4, vint32, s32)
OPENCV_HAL_IMPL_RISCVV_INIT(uint64x2, vuint64, u64)
OPENCV_HAL_IMPL_RISCVV_INIT(int64x2, vint64, s64)
OPENCV_HAL_IMPL_RISCVV_INIT(float64x2, vfloat64, f64)
OPENCV_HAL_IMPL_RISCVV_INIT(float32x4, vfloat32, f32)
#define OPENCV_HAL_IMPL_RISCVV_INIT_SET(__Tp, _Tp, suffix, len, num) \
inline v_##_Tp##x##num v_setzero_##suffix() { return v_##_Tp##x##num((v##_Tp##m1_t){0}); }     \
inline v_##_Tp##x##num v_setall_##suffix(__Tp v) { return v_##_Tp##x##num(vmv_v_x_##len##m1(v, num)); }

OPENCV_HAL_IMPL_RISCVV_INIT_SET(uchar, uint8, u8, u8, 16)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(char, int8, s8, i8, 16)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(ushort, uint16, u16, u16, 8)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(short, int16, s16, i16, 8)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned int, uint32, u32, u32, 4)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(int, int32, s32, i32, 4)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(unsigned long, uint64, u64, u64, 2)
OPENCV_HAL_IMPL_RISCVV_INIT_SET(long, int64, s64, i64, 2)
inline v_float32x4 v_setzero_f32() { return v_float32x4((vfloat32m1_t){0}); }
inline v_float32x4 v_setall_f32(float v) { return v_float32x4(vfmv_v_f_f32m1(v, 4)); }

inline v_float64x2 v_setzero_f64() { return v_float64x2(vfmv_v_f_f64m1(0, 2)); }
inline v_float64x2 v_setall_f64(double v) { return v_float64x2(vfmv_v_f_f64m1(v, 2)); }
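
// Illustrative sketch: the reinterpret casts above only relabel the register,
// while v_setall_*/v_setzero_* broadcast a scalar, e.g.
//     v_uint32x4 ones = v_setall_u32(1u);
//     v_int32x4  s    = v_reinterpret_as_s32(ones);   // same bits, signed view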


#define OPENCV_HAL_IMPL_RISCVV_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
    a.val = intrin(a.val, b.val); \
    return a; \
}

#define OPENCV_HAL_IMPL_RISCVV_BIN_OPN(bin_op, _Tpvec, intrin, num) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val, num)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
    a.val = intrin(a.val, b.val, num); \
    return a; \
}

OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint8x16, vsaddu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint8x16, vssubu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int8x16, vsadd_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int8x16, vssub_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint16x8, vsaddu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint16x8, vssubu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int16x8, vsadd_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int16x8, vssub_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int32x4, vsadd_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int32x4, vssub_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_int32x4, vmul_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint32x4, vadd_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint32x4, vsub_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_uint32x4, vmul_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_int64x2, vsadd_vv_i64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_int64x2, vssub_vv_i64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_uint64x2, vadd_vv_u64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_uint64x2, vsub_vv_u64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float32x4, vfadd_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float32x4, vfsub_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float32x4, vfmul_vv_f32m1, 4)
inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
{
    return v_float32x4(vfdiv_vv_f32m1(a.val, b.val, 4));
}
inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
{
    a.val = vfdiv_vv_f32m1(a.val, b.val, 4);
    return a;
}

OPENCV_HAL_IMPL_RISCVV_BIN_OPN(+, v_float64x2, vfadd_vv_f64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(-, v_float64x2, vfsub_vv_f64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BIN_OPN(*, v_float64x2, vfmul_vv_f64m1, 2)
inline v_float64x2 operator / (const v_float64x2& a, const v_float64x2& b)
{
    return v_float64x2(vfdiv_vv_f64m1(a.val, b.val, 2));
}
inline v_float64x2& operator /= (v_float64x2& a, const v_float64x2& b)
{
    a.val = vfdiv_vv_f64m1(a.val, b.val, 2);
    return a;
}
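
// Illustrative sketch of the operator mapping above: 8/16-bit and signed
// 32/64-bit add/sub use the saturating RVV forms, unsigned 32/64-bit wrap,
// and float ops are plain IEEE lane-wise arithmetic, e.g.
//     v_int8x16   s = a + b;   // saturating per-lane add
//     v_uint32x4  w = c + d;   // wrapping per-lane add
//     v_float32x4 q = x / y;   // lane-wise division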
// TODO: exp, log, sin, cos

#define OPENCV_HAL_IMPL_RISCVV_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
}

#define OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(_Tpvec, func, intrin, num) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val, num)); \
}
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_min, vminu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_max, vmaxu_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_min, vmin_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_max, vmax_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_min, vminu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_max, vmaxu_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_min, vmin_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_max, vmax_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint32x4, v_min, vminu_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint32x4, v_max, vmaxu_vv_u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int32x4, v_min, vmin_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int32x4, v_max, vmax_vv_i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float32x4, v_min, vfmin_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float32x4, v_max, vfmax_vv_f32m1, 4)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float64x2, v_min, vfmin_vv_f64m1, 2)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_float64x2, v_max, vfmax_vv_f64m1, 2)

inline v_float32x4 v_sqrt(const v_float32x4& x)
{
    return v_float32x4(vfsqrt_v_f32m1(x.val, 4));
}

inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    return v_float32x4(vfrdiv_vf_f32m1(vfsqrt_v_f32m1(x.val, 4), 1, 4));
}

inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    v_float32x4 x(vfmacc_vv_f32m1(vfmul_vv_f32m1(a.val, a.val, 4), b.val, b.val, 4));
    return v_sqrt(x);
}

inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    return v_float32x4(vfmacc_vv_f32m1(vfmul_vv_f32m1(a.val, a.val, 4), b.val, b.val, 4));
}

inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    return v_float32x4(vfmacc_vv_f32m1(c.val, a.val, b.val, 4));
}

inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_int32x4(vmacc_vv_i32m1(c.val, a.val, b.val, 4));
}

inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    return v_fma(a, b, c);
}

inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_fma(a, b, c);
}

inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    vfloat32m1_t res = vfmul_vf_f32m1(m0.val, v.val[0], 4);
    res = vfmacc_vf_f32m1(res, v.val[1], m1.val, 4);
    res = vfmacc_vf_f32m1(res, v.val[2], m2.val, 4);
    res = vfmacc_vf_f32m1(res, v.val[3], m3.val, 4);
    return v_float32x4(res);
}

inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    vfloat32m1_t res = vfmul_vf_f32m1(m0.val, v.val[0], 4);
    res = vfmacc_vf_f32m1(res, v.val[1], m1.val, 4);
    res = vfmacc_vf_f32m1(res, v.val[2], m2.val, 4);
    res = vfadd_vv_f32m1(res, a.val, 4);
    return v_float32x4(res);
}

inline v_float64x2 v_sqrt(const v_float64x2& x)
{
    return v_float64x2(vfsqrt_v_f64m1(x.val, 2));
}

inline v_float64x2 v_invsqrt(const v_float64x2& x)
{
    return v_float64x2(vfrdiv_vf_f64m1(vfsqrt_v_f64m1(x.val, 2), 1, 2));
}

inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
{
    v_float64x2 x(vfmacc_vv_f64m1(vfmul_vv_f64m1(a.val, a.val, 2), b.val, b.val, 2));
    return v_sqrt(x);
}

inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
{
    return v_float64x2(vfmacc_vv_f64m1(vfmul_vv_f64m1(a.val, a.val, 2), b.val, b.val, 2));
}

inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    return v_float64x2(vfmacc_vv_f64m1(c.val, a.val, b.val, 2));
}

inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    return v_fma(a, b, c);
}
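
// Illustrative sketch: v_fma/v_muladd compute a*b + c per lane via vfmacc,
// which is also the pattern v_magnitude and v_matmul build on, e.g.
//     v_float32x4 r = v_fma(x, y, z);   // x*y + z in each lane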

#define OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(_Tpvec, suffix, num) \
    OPENCV_HAL_IMPL_RISCVV_BIN_OPN(&, _Tpvec, vand_vv_##suffix, num) \
    OPENCV_HAL_IMPL_RISCVV_BIN_OPN(|, _Tpvec, vor_vv_##suffix, num) \
    OPENCV_HAL_IMPL_RISCVV_BIN_OPN(^, _Tpvec, vxor_vv_##suffix, num) \
    inline _Tpvec operator ~ (const _Tpvec & a) \
    { \
        return _Tpvec(vnot_v_##suffix(a.val, num)); \
    }

OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint8x16, u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint16x8, u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint32x4, u32m1, 4)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_uint64x2, u64m1, 2)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int8x16,  i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int16x8,  i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int32x4,  i32m1, 4)
OPENCV_HAL_IMPL_RISCVV_LOGIC_OPN(v_int64x2,  i64m1, 2)

#define OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
{ \
    return v_float32x4(vfloat32m1_t(intrin(vint32m1_t(a.val), vint32m1_t(b.val), 4))); \
} \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
{ \
    a.val = vfloat32m1_t(intrin(vint32m1_t(a.val), vint32m1_t(b.val), 4)); \
    return a; \
}

OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(&, vand_vv_i32m1)
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(|, vor_vv_i32m1)
OPENCV_HAL_IMPL_RISCVV_FLT_BIT_OP(^, vxor_vv_i32m1)

inline v_float32x4 operator ~ (const v_float32x4& a)
{
    return v_float32x4((vfloat32m1_t)(vnot_v_i32m1((vint32m1_t)(a.val), 4)));
}

#define OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(bin_op, intrin) \
inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
{ \
    return v_float64x2(vfloat64m1_t(intrin(vint64m1_t(a.val), vint64m1_t(b.val), 2))); \
} \
inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
{ \
    a.val = vfloat64m1_t(intrin(vint64m1_t(a.val), vint64m1_t(b.val), 2)); \
    return a; \
}

OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(&, vand_vv_i64m1)
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(|, vor_vv_i64m1)
OPENCV_HAL_IMPL_RISCVV_FLT_64BIT_OP(^, vxor_vv_i64m1)

inline v_float64x2 operator ~ (const v_float64x2& a)
{
    return v_float64x2((vfloat64m1_t)(vnot_v_i64m1((vint64m1_t)(a.val), 2)));
}
inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
{
    return v_int16x8(vmulh_vv_i16m1(a.val, b.val, 8));
}
inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
{
    return v_uint16x8(vmulhu_vv_u16m1(a.val, b.val, 8));
}

//#define OPENCV_HAL_IMPL_RISCVV_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
//inline _Tpuvec v_abs(const _Tpsvec& a) {    \
//    E##xm1_t mask=vmflt_vf_e32xm1_f32m1(x.val, 0.0, 4);

//OPENCV_HAL_IMPL_RISCVV_ABS(v_uint8x16, v_int8x16, u8, s8)
//OPENCV_HAL_IMPL_RISCVV_ABS(v_uint16x8, v_int16x8, u16, s16)
//OPENCV_HAL_IMPL_RISCVV_ABS(v_uint32x4, v_int32x4, u32, s32)

inline v_uint32x4 v_abs(v_int32x4 x)
{
    vbool32_t mask=vmslt_vx_i32m1_b32(x.val, 0, 4);
    return v_uint32x4((vuint32m1_t)vrsub_vx_i32m1_m(mask, x.val, x.val, 0, 4));
}

inline v_uint16x8 v_abs(v_int16x8 x)
{
    vbool16_t mask=vmslt_vx_i16m1_b16(x.val, 0, 8);
    return v_uint16x8((vuint16m1_t)vrsub_vx_i16m1_m(mask, x.val, x.val, 0, 8));
}

inline v_uint8x16 v_abs(v_int8x16 x)
{
    vbool8_t mask=vmslt_vx_i8m1_b8(x.val, 0, 16);
    return v_uint8x16((vuint8m1_t)vrsub_vx_i8m1_m(mask, x.val, x.val, 0, 16));
}

inline v_float32x4 v_abs(v_float32x4 x)
{
    return (v_float32x4)vfsgnjx_vv_f32m1(x.val, x.val, 4);
}

inline v_float64x2 v_abs(v_float64x2 x)
{
    return (v_float64x2)vfsgnjx_vv_f64m1(x.val, x.val, 2);
}

inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
{
    vfloat32m1_t ret = vfsub_vv_f32m1(a.val, b.val, 4);
    return (v_float32x4)vfsgnjx_vv_f32m1(ret, ret, 4);
}

inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
{
    vfloat64m1_t ret = vfsub_vv_f64m1(a.val, b.val, 2);
    return (v_float64x2)vfsgnjx_vv_f64m1(ret, ret, 2);
}

#define OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(bit, num) \
inline v_uint##bit##x##num v_absdiff(v_uint##bit##x##num a, v_uint##bit##x##num b){    \
    vuint##bit##m1_t vmax = vmaxu_vv_u##bit##m1(a.val, b.val, num);    \
    vuint##bit##m1_t vmin = vminu_vv_u##bit##m1(a.val, b.val, num);    \
    return v_uint##bit##x##num(vsub_vv_u##bit##m1(vmax, vmin, num));\
}

OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(8, 16)
OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(16, 8)
OPENCV_HAL_IMPL_RISCVV_ABSDIFF_U(32, 4)

/** Saturating absolute difference **/
inline v_int8x16 v_absdiffs(v_int8x16 a, v_int8x16 b){
    vint8m1_t vmax = vmax_vv_i8m1(a.val, b.val, 16);
    vint8m1_t vmin = vmin_vv_i8m1(a.val, b.val, 16);
    return v_int8x16(vssub_vv_i8m1(vmax, vmin, 16));
}
inline v_int16x8 v_absdiffs(v_int16x8 a, v_int16x8 b){
    vint16m1_t vmax = vmax_vv_i16m1(a.val, b.val, 8);
    vint16m1_t vmin = vmin_vv_i16m1(a.val, b.val, 8);
    return v_int16x8(vssub_vv_i16m1(vmax, vmin, 8));
}

#define OPENCV_HAL_IMPL_RISCVV_ABSDIFF(_Tpvec, _Tpv, num) \
inline v_uint##_Tpvec v_absdiff(v_int##_Tpvec a, v_int##_Tpvec b){    \
     vint##_Tpv##_t max = vmax_vv_i##_Tpv(a.val, b.val, num);\
     vint##_Tpv##_t min = vmin_vv_i##_Tpv(a.val, b.val, num);\
    return v_uint##_Tpvec((vuint##_Tpv##_t)vsub_vv_i##_Tpv(max, min, num));    \
}

OPENCV_HAL_IMPL_RISCVV_ABSDIFF(8x16, 8m1, 16)
OPENCV_HAL_IMPL_RISCVV_ABSDIFF(16x8, 16m1, 8)
OPENCV_HAL_IMPL_RISCVV_ABSDIFF(32x4, 32m1, 4)
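
// Illustrative sketch: for signed inputs v_absdiff returns the exact |a - b|
// in the matching unsigned type, while v_absdiffs stays signed and saturates.
// Given v_int8x16 a, b with lanes -128 and 127:
//     v_uint8x16 d  = v_absdiff(a, b);    // -> 255
//     v_int8x16  ds = v_absdiffs(a, b);   // -> 127 (saturated)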

//  Multiply and expand
inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b,
                         v_int16x8& c, v_int16x8& d)
{
    vint16m2_t res = vundefined_i16m2();
    res = vwmul_vv_i16m2(a.val, b.val, 16);
    c.val = vget_i16m2_i16m1(res, 0);
    d.val = vget_i16m2_i16m1(res, 1);
}

inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b,
                         v_uint16x8& c, v_uint16x8& d)
{
    vuint16m2_t res = vundefined_u16m2();
    res = vwmulu_vv_u16m2(a.val, b.val, 16);
    c.val = vget_u16m2_u16m1(res, 0);
    d.val = vget_u16m2_u16m1(res, 1);
}

inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                         v_int32x4& c, v_int32x4& d)
{
    vint32m2_t res = vundefined_i32m2();
    res = vwmul_vv_i32m2(a.val, b.val, 8);
    c.val = vget_i32m2_i32m1(res, 0);
    d.val = vget_i32m2_i32m1(res, 1);
}

inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
                         v_uint32x4& c, v_uint32x4& d)
{
    vuint32m2_t res = vundefined_u32m2();
    res = vwmulu_vv_u32m2(a.val, b.val, 8);
    c.val = vget_u32m2_u32m1(res, 0);
    d.val = vget_u32m2_u32m1(res, 1);
}

inline void v_mul_expand(const v_int32x4& a, const v_int32x4& b,
                         v_int64x2& c, v_int64x2& d)
{
    vint64m2_t res = vundefined_i64m2();
    res = vwmul_vv_i64m2(a.val, b.val, 4);
    c.val = vget_i64m2_i64m1(res, 0);
    d.val = vget_i64m2_i64m1(res, 1);
}

inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
                         v_uint64x2& c, v_uint64x2& d)
{
    vuint64m2_t res = vundefined_u64m2();
    res = vwmulu_vv_u64m2(a.val, b.val, 4);
    c.val = vget_u64m2_u64m1(res, 0);
    d.val = vget_u64m2_u64m1(res, 1);
}
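
// Illustrative sketch: the widening multiply keeps full precision by splitting
// the m2 product register into its low and high m1 halves. Given v_int16x8 a, b:
//     v_int32x4 lo, hi;
//     v_mul_expand(a, b, lo, hi);   // lo = products of lanes 0..3, hi = lanes 4..7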

OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_add_wrap, vadd_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_add_wrap, vadd_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_add_wrap, vadd_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_sub_wrap, vsub_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_sub_wrap, vsub_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_sub_wrap, vsub_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_sub_wrap, vsub_vv_i16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint8x16, v_mul_wrap, vmul_vv_u8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int8x16, v_mul_wrap, vmul_vv_i8m1, 16)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_uint16x8, v_mul_wrap, vmul_vv_u16m1, 8)
OPENCV_HAL_IMPL_RISCVV_BINN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 8)
//////// Dot Product ////////
// 16 >> 32
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
    vint32m2_t res = vundefined_i32m2();
    res = vwmul_vv_i32m2(a.val, b.val, 8);
    res = vrgather_vv_i32m2(res, (vuint32m2_t){0, 2, 4, 6, 1, 3, 5, 7}, 8);
    return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(res, 0), vget_i32m2_i32m1(res, 1), 4));
}
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
    vint32m2_t res = vundefined_i32m2();
    res = vwmul_vv_i32m2(a.val, b.val, 8);
    res = vrgather_vv_i32m2(res, (vuint32m2_t){0, 2, 4, 6, 1, 3, 5, 7}, 8);
    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(res, 0), vget_i32m2_i32m1(res, 1), 4), c.val, 4));
}

// 32 >> 64
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
{
    vint64m2_t res = vundefined_i64m2();
    res = vwmul_vv_i64m2(a.val, b.val, 4);
    res = vrgather_vv_i64m2(res, (vuint64m2_t){0, 2, 1, 3}, 4);
    return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(res, 0), vget_i64m2_i64m1(res, 1), 2));
}
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{
    vint64m2_t res = vundefined_i64m2();
    res = vwmul_vv_i64m2(a.val, b.val, 4);
    res = vrgather_vv_i64m2(res, (vuint64m2_t){0, 2, 1, 3}, 4);
    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(res, 0), vget_i64m2_i64m1(res, 1), 2), c.val, 2));
}

// 8 >> 32
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
{
    vuint16m2_t v1 = vundefined_u16m2();
    vuint32m2_t v2 = vundefined_u32m2();
    v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
    v1 = vrgather_vv_u16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
    v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8);
    return v_uint32x4(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4));
}

inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
                                   const v_uint32x4& c)
{
    vuint16m2_t v1 = vundefined_u16m2();
    vuint32m2_t v2 = vundefined_u32m2();
    v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
    v1 = vrgather_vv_u16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
    v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8);
    return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4), c.val, 4));
}

inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
    vint16m2_t v1 = vundefined_i16m2();
    vint32m2_t v2 = vundefined_i32m2();
    v1 = vwmul_vv_i16m2(a.val, b.val, 16);
    v1 = vrgather_vv_i16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
    v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8);
    return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4));
}

inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
                                   const v_int32x4& c)
{
    vint16m2_t v1 = vundefined_i16m2();
    vint32m2_t v2 = vundefined_i32m2();
    v1 = vwmul_vv_i16m2(a.val, b.val, 16);
    v1 = vrgather_vv_i16m2(v1, (vuint16m2_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);
    v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8);
    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4), c.val, 4));
}

inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
{
    vuint32m2_t v1 = vundefined_u32m2();
    vuint64m2_t v2 = vundefined_u64m2();
    v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
    v1 = vrgather_vv_u32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
    v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4);
    return v_uint64x2(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2));
}

inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b,
                                   const v_uint64x2& c)
{
    vuint32m2_t v1 = vundefined_u32m2();
    vuint64m2_t v2 = vundefined_u64m2();
    v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
    v1 = vrgather_vv_u32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
    v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4);
    return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2), c.val, 2));
}

inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
    vint32m2_t v1 = vundefined_i32m2();
    vint64m2_t v2 = vundefined_i64m2();
    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
    v1 = vrgather_vv_i32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
    v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4);
    return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2));
}

inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
                                   const v_int64x2& c)
{
    vint32m2_t v1 = vundefined_i32m2();
    vint64m2_t v2 = vundefined_i64m2();
    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
    v1 = vrgather_vv_i32m2(v1, (vuint32m2_t){0, 4, 1, 5, 2, 6, 3, 7}, 8);
    v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4);
    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2), c.val, 2));
}


//////// Fast Dot Product ////////
// 16 >> 32
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
{
    vint32m2_t v1 = vundefined_i32m2();
    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
    return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4));
}

inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
    vint32m2_t v1 = vundefined_i32m2();
    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4), c.val, 4));
}

// 32 >> 64
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
{
    vint64m2_t v1 = vundefined_i64m2();
    v1 = vwmul_vv_i64m2(a.val, b.val, 4);
    return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 2));
}
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{
    vint64m2_t v1 = vundefined_i64m2();
    v1 = vwmul_vv_i64m2(a.val, b.val, 4);  // 4 source lanes, matching the non-accumulating overload
    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 2), c.val, 2));
}

// 8 >> 32
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
{
    vuint16m2_t v1 = vundefined_u16m2();
    vuint32m2_t v2 = vundefined_u32m2();
    v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
    v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8);
    return v_uint32x4(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4));
}

inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{
    vuint16m2_t v1 = vundefined_u16m2();
    vuint32m2_t v2 = vundefined_u32m2();
    v1 = vwmulu_vv_u16m2(a.val, b.val, 16);
    v2 = vwaddu_vv_u32m2(vget_u16m2_u16m1(v1, 0), vget_u16m2_u16m1(v1, 1), 8);
    return v_uint32x4(vadd_vv_u32m1(vadd_vv_u32m1(vget_u32m2_u32m1(v2, 0), vget_u32m2_u32m1(v2, 1), 4), c.val, 4));
}

inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
{
    vint16m2_t v1 = vundefined_i16m2();
    vint32m2_t v2 = vundefined_i32m2();
    v1 = vwmul_vv_i16m2(a.val, b.val, 16);
    v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8);
    return v_int32x4(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4));
}
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{
    vint16m2_t v1 = vundefined_i16m2();
    vint32m2_t v2 = vundefined_i32m2();
    v1 = vwmul_vv_i16m2(a.val, b.val, 16);
    v2 = vwadd_vv_i32m2(vget_i16m2_i16m1(v1, 0), vget_i16m2_i16m1(v1, 1), 8);
    return v_int32x4(vadd_vv_i32m1(vadd_vv_i32m1(vget_i32m2_i32m1(v2, 0), vget_i32m2_i32m1(v2, 1), 4), c.val, 4));
}

// 16 >> 64
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
{
    vuint32m2_t v1 = vundefined_u32m2();
    vuint64m2_t v2 = vundefined_u64m2();
    v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
    v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4);
    return v_uint64x2(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2));
}
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{
    vuint32m2_t v1 = vundefined_u32m2();
    vuint64m2_t v2 = vundefined_u64m2();
    v1 = vwmulu_vv_u32m2(a.val, b.val, 8);
    v2 = vwaddu_vv_u64m2(vget_u32m2_u32m1(v1, 0), vget_u32m2_u32m1(v1, 1), 4);
    return v_uint64x2(vadd_vv_u64m1(vadd_vv_u64m1(vget_u64m2_u64m1(v2, 0), vget_u64m2_u64m1(v2, 1), 2), c.val, 2));
}

inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
{
    vint32m2_t v1 = vundefined_i32m2();
    vint64m2_t v2 = vundefined_i64m2();
    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
    v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4);
    return v_int64x2(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2));
}
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{
    vint32m2_t v1 = vundefined_i32m2();
    vint64m2_t v2 = vundefined_i64m2();
    v1 = vwmul_vv_i32m2(a.val, b.val, 8);
    v2 = vwadd_vv_i64m2(vget_i32m2_i32m1(v1, 0), vget_i32m2_i32m1(v1, 1), 4);
    return v_int64x2(vadd_vv_i64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v2, 0), vget_i64m2_i64m1(v2, 1), 2), c.val, 2));
}
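
// Illustrative sketch: v_dotprod pairs adjacent lanes in order, e.g. for
// v_int16x8 a, b the result lanes are a0*b0+a1*b1, a2*b2+a3*b3, and so on.
// The *_fast variants above skip the vrgather shuffle, so the partial sums may
// land in a different lane order; only the overall total is meant to match.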


#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(_Tpvec, _Tpvec2, len, scalartype, func, intrin, num) \
inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \
{\
    v##_Tpvec2##m1_t val = vmv_v_x_##len##m1(0, num); \
    val = intrin(val, a.val, val, num);    \
    return vmv_x_s_##len##m1_##len(val, num);    \
}


#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(_Tpvec, _Tpvec2, scalartype, func, funcu, num) \
inline scalartype v_reduce_##func(const v_##_Tpvec##x##num& a) \
{\
    v##_Tpvec##m1_t val = (v##_Tpvec##m1_t)vmv_v_x_i8m1(0, num); \
    val = v##funcu##_vs_##_Tpvec2##m1_##_Tpvec2##m1(val, a.val, a.val, num);    \
    return val[0];    \
}
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int8, int16, i16, int, sum, vwredsum_vs_i8m1_i16m1, 16)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int16, int32, i32, int, sum, vwredsum_vs_i16m1_i32m1, 8)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(int32, int64, i64, int, sum, vwredsum_vs_i32m1_i64m1, 4)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint8, uint16, u16, unsigned, sum, vwredsumu_vs_u8m1_u16m1, 16)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint16, uint32, u32, unsigned, sum, vwredsumu_vs_u16m1_u32m1, 8)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_W(uint32, uint64, u64, unsigned, sum, vwredsumu_vs_u32m1_u64m1, 4)
inline float v_reduce_sum(const v_float32x4& a)
{
    vfloat32m1_t val = vfmv_v_f_f32m1(0.0, 4);
    val = vfredsum_vs_f32m1_f32m1(val, a.val, val, 4);
    return vfmv_f_s_f32m1_f32(val, 4);
}
inline double v_reduce_sum(const v_float64x2& a)
{
    vfloat64m1_t val = vfmv_v_f_f64m1(0.0, 2);
    val = vfredsum_vs_f64m1_f64m1(val, a.val, val, 2);
    return vfmv_f_s_f64m1_f64(val, 2);
}
inline uint64 v_reduce_sum(const v_uint64x2& a)
{ return vext_x_v_u64m1_u64((vuint64m1_t)a.val, 0, 2)+vext_x_v_u64m1_u64((vuint64m1_t)a.val, 1, 2); }

inline int64 v_reduce_sum(const v_int64x2& a)
{ return vext_x_v_i64m1_i64((vint64m1_t)a.val, 0, 2)+vext_x_v_i64m1_i64((vint64m1_t)a.val, 1, 2); }

#define OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(func)    \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int8,  i8, int, func, red##func, 16)    \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int16, i16, int, func, red##func, 8)    \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int32, i32, int, func, red##func, 4)    \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(int64, i64, int, func, red##func, 2)    \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint8,  u8, unsigned, func, red##func##u, 16)    \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint16, u16, unsigned, func, red##func##u, 8)    \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(uint32, u32, unsigned, func, red##func##u, 4)    \
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP_(float32, f32, float, func, fred##func, 4)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(max)
OPENCV_HAL_IMPL_RISCVV_REDUCE_OP(min)

inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
    vfloat32m1_t a0 = vfmv_v_f_f32m1(0.0, 4);
    vfloat32m1_t b0 = vfmv_v_f_f32m1(0.0, 4);
    vfloat32m1_t c0 = vfmv_v_f_f32m1(0.0, 4);
    vfloat32m1_t d0 = vfmv_v_f_f32m1(0.0, 4);
    a0 = vfredsum_vs_f32m1_f32m1(a0, a.val, a0, 4);
    b0 = vfredsum_vs_f32m1_f32m1(b0, b.val, b0, 4);
    c0 = vfredsum_vs_f32m1_f32m1(c0, c.val, c0, 4);
    d0 = vfredsum_vs_f32m1_f32m1(d0, d.val, d0, 4);
    return v_float32x4(a0[0], b0[0], c0[0], d0[0]);
}

inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
{
    vfloat32m1_t a0 = vfmv_v_f_f32m1(0.0, 4);
    vfloat32m1_t x = vfsub_vv_f32m1(a.val, b.val, 4);
    vbool32_t mask=vmflt_vf_f32m1_b32(x, 0, 4);
    vfloat32m1_t val = vfrsub_vf_f32m1_m(mask, x, x, 0, 4);
    a0 = vfredsum_vs_f32m1_f32m1(a0, val, a0, 4);
    return a0[0];
}

#define OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(_Tpvec, _Tpvec2) \
inline unsigned v_reduce_sad(const _Tpvec& a, const _Tpvec&b){    \
    _Tpvec2 x = v_absdiff(a, b);    \
    return v_reduce_sum(x);    \
}

OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int8x16, v_uint8x16)
OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint8x16, v_uint8x16)
OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int16x8, v_uint16x8)
OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint16x8, v_uint16x8)
OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_int32x4, v_uint32x4)
OPENCV_HAL_IMPL_RISCVV_REDUCE_SAD(v_uint32x4, v_uint32x4)
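
// Illustrative sketch: reductions collapse a register to one scalar using the
// widening/ordinary RVV reduction instructions. Given v_uint8x16 a, b:
//     unsigned s   = v_reduce_sum(a);      // sum of all 16 lanes, widened to 32 bit
//     unsigned sad = v_reduce_sad(a, b);   // sum of |a_i - b_i|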

#define OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(_Tpvec, _Tp, _T, num, uv) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ \
    vbool##_T##_t mask = vmseq_vv_##_Tp##_b##_T(a.val, b.val, num);    \
    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num));    \
} \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ \
    vbool##_T##_t mask = vmsne_vv_##_Tp##_b##_T(a.val, b.val, num);    \
    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num));    \
} \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ \
    vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(a.val, b.val, num);    \
    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num));    \
} \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ \
    vbool##_T##_t mask = vmslt##uv##_Tp##_b##_T(b.val, a.val, num);    \
    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num));    \
} \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ \
    vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(a.val, b.val, num);    \
    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num));    \
} \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ \
    vbool##_T##_t mask = vmsle##uv##_Tp##_b##_T(b.val, a.val, num);    \
    return _Tpvec(vmerge_vxm_##_Tp(mask, vmv_v_x_##_Tp(0, num), -1, num));    \
} \

OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int8x16, i8m1,  8, 16, _vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int16x8, i16m1, 16, 8, _vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int32x4, i32m1, 32, 4, _vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_int64x2, i64m1, 64, 2, _vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint8x16, u8m1, 8, 16, u_vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint16x8, u16m1, 16, 8, u_vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint32x4, u32m1, 32, 4, u_vv_)
OPENCV_HAL_IMPL_RISCVV_INT_CMP_OP(v_uint64x2, u64m1, 64, 2, u_vv_)

//TODO: ==
inline v_float32x4 operator == (const v_float32x4& a, const v_float32x4& b)
{
    vbool32_t mask = vmfeq_vv_f32m1_b32(a.val, b.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
    return v_float32x4((vfloat32m1_t)res);
}
inline v_float32x4 operator != (const v_float32x4& a, const v_float32x4& b)
{
    vbool32_t mask = vmfne_vv_f32m1_b32(a.val, b.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
    return v_float32x4((vfloat32m1_t)res);
}
inline v_float32x4 operator < (const v_float32x4& a, const v_float32x4& b)
{
    vbool32_t mask = vmflt_vv_f32m1_b32(a.val, b.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
    return v_float32x4((vfloat32m1_t)res);
}
inline v_float32x4 operator <= (const v_float32x4& a, const v_float32x4& b)
{
    vbool32_t mask = vmfle_vv_f32m1_b32(a.val, b.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
    return v_float32x4((vfloat32m1_t)res);
}
inline v_float32x4 operator > (const v_float32x4& a, const v_float32x4& b)
{
    vbool32_t mask = vmfgt_vv_f32m1_b32(a.val, b.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
    return v_float32x4((vfloat32m1_t)res);
}
inline v_float32x4 operator >= (const v_float32x4& a, const v_float32x4& b)
{
    vbool32_t mask = vmfge_vv_f32m1_b32(a.val, b.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
    return v_float32x4((vfloat32m1_t)res);
}
inline v_float32x4 v_not_nan(const v_float32x4& a)
{
    vbool32_t mask = vmford_vv_f32m1_b32(a.val, a.val, 4);
    vint32m1_t res = vmerge_vxm_i32m1(mask, vmv_v_x_i32m1(0.0, 4), -1, 4);
    return v_float32x4((vfloat32m1_t)res);
}

//TODO: ==
inline v_float64x2 operator == (const v_float64x2& a, const v_float64x2& b)
{
    vbool64_t mask = vmfeq_vv_f64m1_b64(a.val, b.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
    return v_float64x2((vfloat64m1_t)res);
}
inline v_float64x2 operator != (const v_float64x2& a, const v_float64x2& b)
{
    vbool64_t mask = vmfne_vv_f64m1_b64(a.val, b.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
    return v_float64x2((vfloat64m1_t)res);
}
inline v_float64x2 operator < (const v_float64x2& a, const v_float64x2& b)
{
    vbool64_t mask = vmflt_vv_f64m1_b64(a.val, b.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
    return v_float64x2((vfloat64m1_t)res);
}
inline v_float64x2 operator <= (const v_float64x2& a, const v_float64x2& b)
{
    vbool64_t mask = vmfle_vv_f64m1_b64(a.val, b.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
    return v_float64x2((vfloat64m1_t)res);
}
inline v_float64x2 operator > (const v_float64x2& a, const v_float64x2& b)
{
    vbool64_t mask = vmfgt_vv_f64m1_b64(a.val, b.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
    return v_float64x2((vfloat64m1_t)res);
}
inline v_float64x2 operator >= (const v_float64x2& a, const v_float64x2& b)
{
    vbool64_t mask = vmfge_vv_f64m1_b64(a.val, b.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
    return v_float64x2((vfloat64m1_t)res);
}
inline v_float64x2 v_not_nan(const v_float64x2& a)
{
    vbool64_t mask = vmford_vv_f64m1_b64(a.val, a.val, 2);
    vint64m1_t res = vmerge_vxm_i64m1(mask, vmv_v_x_i64m1(0.0, 2), -1, 2);
    return v_float64x2((vfloat64m1_t)res);
}
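
// Illustrative sketch: comparisons return all-ones in lanes where the
// predicate holds and zero elsewhere (as the same vector type), so the result
// can drive bitwise selection, e.g.
//     v_float32x4 m = (a > b);             // per-lane mask
//     v_float32x4 r = (m & a) | (~m & b);  // element-wise max, for illustration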
#define OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(_Tp, _T) \
inline void v_transpose4x4(const v_##_Tp##32x4& a0, const v_##_Tp##32x4& a1, \
                         const v_##_Tp##32x4& a2, const v_##_Tp##32x4& a3, \
                         v_##_Tp##32x4& b0, v_##_Tp##32x4& b1, \
                         v_##_Tp##32x4& b2, v_##_Tp##32x4& b3) \
{ \
    v##_Tp##32m4_t val = vundefined_##_T##m4();    \
    val = vset_##_T##m4(val, 0, a0.val);    \
    val = vset_##_T##m4(val, 1, a1.val);    \
    val = vset_##_T##m4(val, 2, a2.val);    \
    val = vset_##_T##m4(val, 3, a3.val);   \
    val = vrgather_vv_##_T##m4(val, (vuint32m4_t){0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}, 16);    \
    b0.val = vget_##_T##m4_##_T##m1(val, 0);   \
    b1.val = vget_##_T##m4_##_T##m1(val, 1);   \
    b2.val = vget_##_T##m4_##_T##m1(val, 2);   \
    b3.val = vget_##_T##m4_##_T##m1(val, 3);   \
}
OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(uint, u32)
OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(int, i32)
OPENCV_HAL_IMPL_RISCVV_TRANSPOSE4x4(float, f32)


#define OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(_Tpvec, suffix, _T, num) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
{ return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); } \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec((vsll_vx_##_T##m1(a.val, n, num))); }

#define OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(_Tpvec, suffix, _T, num, intric) \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
{ return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); } \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec((v##intric##_vx_##_T##m1(a.val, n, num))); }\
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
{ return _Tpvec((v##intric##_vx_##_T##m1(vadd_vx_##_T##m1(a.val, 1<<(n-1), num), n, num))); }

// trade efficiency for convenience
#define OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(suffix, _T, num, intrin) \
OPENCV_HAL_IMPL_RISCVV_SHIFT_LEFT(v_##suffix##x##num, suffix, _T, num) \
OPENCV_HAL_IMPL_RISCVV_SHIFT_RIGHT(v_##suffix##x##num, suffix, _T, num, intrin)

OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint8, u8, 16, srl)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint16, u16, 8, srl)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint32, u32, 4, srl)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(uint64, u64, 2, srl)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int8, i8, 16, sra)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int16, i16, 8, sra)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int32, i32, 4, sra)
OPENCV_HAL_IMPL_RISCVV_SHIFT_OP(int64, i64, 2, sra)
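
// Illustrative sketch: v_shr shifts each lane (logical srl for unsigned,
// arithmetic sra for signed), and v_rshr rounds by adding 1 << (n-1) first.
// Given v_uint16x8 a:
//     v_uint16x8 r = v_rshr<4>(a);   // (a + 8) >> 4 per lane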

#if 0
#define VUP4(n) {0, 1, 2, 3}
#define VUP8(n) {0, 1, 2, 3, 4, 5, 6, 7}
#define VUP16(n) {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
#define VUP2(n) {0, 1}
#endif
#define OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(_Tpvec, suffix, _T, num, num2, vmv, len) \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
{    \
    suffix##m1_t tmp = vmv##_##_T##m1(0, num);\
        tmp = vslideup_vx_##_T##m1_m(vmset_m_##len(num), tmp, a.val, n, num);\
        return _Tpvec(tmp);\
} \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
{     \
        return _Tpvec(vslidedown_vx_##_T##m1(a.val, n, num));\
} \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
{ return a; } \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ \
    suffix##m2_t tmp = vundefined_##_T##m2();    \
    tmp = vset_##_T##m2(tmp, 0, a.val);          \
    tmp = vset_##_T##m2(tmp, 1, b.val);          \
        tmp = vslidedown_vx_##_T##m2(tmp, n, num2);\
        return _Tpvec(vget_##_T##m2_##_T##m1(tmp, 0));\
} \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ \
    suffix##m2_t tmp = vundefined_##_T##m2();    \
    tmp = vset_##_T##m2(tmp, 0, b.val);    \
    tmp = vset_##_T##m2(tmp, 1, a.val);    \
        tmp = vslideup_vx_##_T##m2(tmp, n, num2);\
        return _Tpvec(vget_##_T##m2_##_T##m1(tmp, 1));\
} \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
{ \
    CV_UNUSED(b); return a; \
}

OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint8x16, vuint8, u8, 16, 32, vmv_v_x, b8)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int8x16, vint8, i8, 16, 32, vmv_v_x, b8)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint16x8, vuint16, u16, 8, 16, vmv_v_x, b16)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int16x8, vint16, i16, 8, 16, vmv_v_x, b16)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint32x4, vuint32, u32, 4, 8, vmv_v_x, b32)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int32x4, vint32, i32, 4, 8, vmv_v_x, b32)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_uint64x2, vuint64, u64, 2, 4, vmv_v_x, b64)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_int64x2, vint64, i64, 2, 4, vmv_v_x, b64)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float32x4, vfloat32, f32, 4, 8, vfmv_v_f, b32)
OPENCV_HAL_IMPL_RISCVV_ROTATE_OP(v_float64x2, vfloat64, f64, 2, 4, vfmv_v_f, b64)
1205 
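// Loads and stores. v_load_halves assembles the vector from two 64-bit reads
// through an alignment-1 typedef; v_store_high slides the upper half down
// before storing it. Aligned, unaligned and "nocache" stores all lower to
// the same vle/vse intrinsics here.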
1206 #define OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(_Tpvec, _Tp, _Tp2, len, hnum, num) \
1207 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
1208 { \
1209   typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64; \
1210   vuint64m1_t tmp = {*(unaligned_uint64*)ptr0, *(unaligned_uint64*)ptr1};\
1211     return _Tpvec(_Tp2##_t(tmp)); } \
1212 inline _Tpvec v_load_low(const _Tp* ptr) \
1213 { return _Tpvec(vle_v_##len(ptr, hnum)); }\
1214 inline _Tpvec v_load_aligned(const _Tp* ptr) \
1215 { return _Tpvec(vle_v_##len(ptr, num)); } \
1216 inline _Tpvec v_load(const _Tp* ptr) \
1217 { return _Tpvec((_Tp2##_t)vle_v_##len((const _Tp *)ptr, num)); } \
1218 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
1219 { vse_v_##len(ptr, a.val, hnum);}\
1220 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
1221 { \
1222   _Tp2##_t a0 = vslidedown_vx_##len(a.val, hnum, num);    \
1223   vse_v_##len(ptr, a0, hnum);}\
1224 inline void v_store(_Tp* ptr, const _Tpvec& a) \
1225 { vse_v_##len(ptr, a.val, num); } \
1226 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
1227 { vse_v_##len(ptr, a.val, num); } \
1228 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
1229 { vse_v_##len(ptr, a.val, num); } \
1230 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
1231 { vse_v_##len(ptr, a.val, num); }
1232 
1233 OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint8x16, uchar, vuint8m1, u8m1, 8, 16)
1234 OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int8x16,  schar, vint8m1, i8m1, 8, 16)
1235 OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint16x8, ushort, vuint16m1, u16m1, 4, 8)
1236 OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int16x8,  short,  vint16m1, i16m1, 4, 8)
1237 OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint32x4, unsigned, vuint32m1, u32m1, 2, 4)
1238 OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int32x4,  int,     vint32m1, i32m1, 2, 4)
1239 OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_uint64x2, unsigned long, vuint64m1, u64m1, 1, 2)
1240 OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_int64x2,  long,     vint64m1, i64m1, 1, 2)
1241 OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float32x4, float, vfloat32m1, f32m1, 2, 4)
1242 OPENCV_HAL_IMPL_RISCVV_LOADSTORE_OP(v_float64x2, double, vfloat64m1, f64m1, 1, 2)
1243 
1244 
1245 ////////////// Lookup table access ////////////////////
1246 
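// Lookup helpers: v_lut gathers tab[idx[i]] for every lane, while
// v_lut_pairs / v_lut_quads read 2 / 4 consecutive table elements starting
// at each index; most variants fill an aligned scratch array and reload it
// as a vector.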
1247 inline v_int8x16 v_lut(const schar* tab, const int* idx)
1248 {
1249 #if 1
1250     schar CV_DECL_ALIGNED(32) elems[16] =
1251     {
1252         tab[idx[ 0]],
1253         tab[idx[ 1]],
1254         tab[idx[ 2]],
1255         tab[idx[ 3]],
1256         tab[idx[ 4]],
1257         tab[idx[ 5]],
1258         tab[idx[ 6]],
1259         tab[idx[ 7]],
1260         tab[idx[ 8]],
1261         tab[idx[ 9]],
1262         tab[idx[10]],
1263         tab[idx[11]],
1264         tab[idx[12]],
1265         tab[idx[13]],
1266         tab[idx[14]],
1267         tab[idx[15]]
1268     };
1269     return v_int8x16(vle_v_i8m1(elems, 16));
1270 #else
1271     int32xm4_t index32 = vlev_int32xm4(idx, 16);
1272     vint16m2_t index16 = vnsra_vx_i16m2_int32xm4(index32, 0, 16);
1273     vint8m1_t index = vnsra_vx_i8m1_i16m2(index16, 0, 16);
1274     return v_int8x16(vlxbv_i8m1(tab, index, 16));
1275 #endif
1276 }
1277 
1278 inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx){
1279     schar CV_DECL_ALIGNED(32) elems[16] =
1280     {
1281         tab[idx[0]],
1282         tab[idx[0] + 1],
1283         tab[idx[1]],
1284         tab[idx[1] + 1],
1285         tab[idx[2]],
1286         tab[idx[2] + 1],
1287         tab[idx[3]],
1288         tab[idx[3] + 1],
1289         tab[idx[4]],
1290         tab[idx[4] + 1],
1291         tab[idx[5]],
1292         tab[idx[5] + 1],
1293         tab[idx[6]],
1294         tab[idx[6] + 1],
1295         tab[idx[7]],
1296         tab[idx[7] + 1]
1297     };
1298     return v_int8x16(vle_v_i8m1(elems, 16));
1299 }
1300 inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
1301 {
1302     schar CV_DECL_ALIGNED(32) elems[16] =
1303     {
1304         tab[idx[0]],
1305         tab[idx[0] + 1],
1306         tab[idx[0] + 2],
1307         tab[idx[0] + 3],
1308         tab[idx[1]],
1309         tab[idx[1] + 1],
1310         tab[idx[1] + 2],
1311         tab[idx[1] + 3],
1312         tab[idx[2]],
1313         tab[idx[2] + 1],
1314         tab[idx[2] + 2],
1315         tab[idx[2] + 3],
1316         tab[idx[3]],
1317         tab[idx[3] + 1],
1318         tab[idx[3] + 2],
1319         tab[idx[3] + 3]
1320     };
1321     return v_int8x16(vle_v_i8m1(elems, 16));
1322 }
1323 
1324 inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
1325 inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
1326 inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
1327 
1328 inline v_int16x8 v_lut(const short* tab, const int* idx)
1329 {
1330     short CV_DECL_ALIGNED(32) elems[8] =
1331     {
1332         tab[idx[0]],
1333         tab[idx[1]],
1334         tab[idx[2]],
1335         tab[idx[3]],
1336         tab[idx[4]],
1337         tab[idx[5]],
1338         tab[idx[6]],
1339         tab[idx[7]]
1340     };
1341     return v_int16x8(vle_v_i16m1(elems, 8));
1342 }
1343 inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
1344 {
1345     short CV_DECL_ALIGNED(32) elems[8] =
1346     {
1347         tab[idx[0]],
1348         tab[idx[0] + 1],
1349         tab[idx[1]],
1350         tab[idx[1] + 1],
1351         tab[idx[2]],
1352         tab[idx[2] + 1],
1353         tab[idx[3]],
1354         tab[idx[3] + 1]
1355     };
1356     return v_int16x8(vle_v_i16m1(elems, 8));
1357 }
1358 inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
1359 {
1360     short CV_DECL_ALIGNED(32) elems[8] =
1361     {
1362         tab[idx[0]],
1363         tab[idx[0] + 1],
1364         tab[idx[0] + 2],
1365         tab[idx[0] + 3],
1366         tab[idx[1]],
1367         tab[idx[1] + 1],
1368         tab[idx[1] + 2],
1369         tab[idx[1] + 3]
1370     };
1371     return v_int16x8(vle_v_i16m1(elems, 8));
1372 }
1373 inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
1374 inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
1375 inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
1376 
1377 inline v_int32x4 v_lut(const int* tab, const int* idx)
1378 {
1379     int CV_DECL_ALIGNED(32) elems[4] =
1380     {
1381         tab[idx[0]],
1382         tab[idx[1]],
1383         tab[idx[2]],
1384         tab[idx[3]]
1385     };
1386     return v_int32x4(vle_v_i32m1(elems, 4));
1387 }
1388 inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
1389 {
1390     int CV_DECL_ALIGNED(32) elems[4] =
1391     {
1392         tab[idx[0]],
1393         tab[idx[0] + 1],
1394         tab[idx[1]],
1395         tab[idx[1] + 1]
1396     };
1397     return v_int32x4(vle_v_i32m1(elems, 4));
1398 }
1399 inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
1400 {
1401     return v_int32x4(vle_v_i32m1(tab+idx[0], 4));
1402 }
1403 inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
1404 inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
1405 inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
1406 
1407 inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
1408 {
1409     vint64m1_t res = {tab[idx[0]], tab[idx[1]]};
1410     return v_int64x2(res);
1411 }
1412 inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
1413 {
1414     return v_int64x2(vle_v_i64m1(tab+idx[0], 2));
1415 }
1416 
1417 inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx)
1418 {
1419     vuint64m1_t res = {tab[idx[0]], tab[idx[1]]};
1420     return v_uint64x2(res);
1421 }
1422 inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx)
1423 {
1424     return v_uint64x2(vle_v_u64m1(tab+idx[0], 2));
1425 }
1426 
1427 inline v_float32x4 v_lut(const float* tab, const int* idx)
1428 {
1429     float CV_DECL_ALIGNED(32) elems[4] =
1430     {
1431         tab[idx[0]],
1432         tab[idx[1]],
1433         tab[idx[2]],
1434         tab[idx[3]]
1435     };
1436     return v_float32x4(vle_v_f32m1(elems, 4));
1437 }
1438 inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
1439 {
1440     float CV_DECL_ALIGNED(32) elems[4] =
1441     {
1442         tab[idx[0]],
1443         tab[idx[0]+1],
1444         tab[idx[1]],
1445         tab[idx[1]+1]
1446     };
1447     return v_float32x4(vle_v_f32m1(elems, 4));
1448 }
1449 inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
1450 {
1451     return v_float32x4(vle_v_f32m1(tab + idx[0], 4));
1452 }
1453 inline v_float64x2 v_lut(const double* tab, const int* idx)
1454 {
1455     vfloat64m1_t res = {tab[idx[0]], tab[idx[1]]};
1456     return v_float64x2(res);
1457 }
1458 inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
1459 {
1460     return v_float64x2(vle_v_f64m1(tab+idx[0], 2));
1461 }
1462 
1463 inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
1464 {
1465     int CV_DECL_ALIGNED(32) elems[4] =
1466     {
1467         tab[idxvec.val[0]],
1468         tab[idxvec.val[1]],
1469         tab[idxvec.val[2]],
1470         tab[idxvec.val[3]]
1471     };
1472     return v_int32x4(vle_v_i32m1(elems, 4));
1473 }
1474 
1475 inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
1476 {
1477     unsigned CV_DECL_ALIGNED(32) elems[4] =
1478     {
1479         tab[idxvec.val[0]],
1480         tab[idxvec.val[1]],
1481         tab[idxvec.val[2]],
1482         tab[idxvec.val[3]]
1483     };
1484     return v_uint32x4(vle_v_u32m1(elems, 4));
1485 }
1486 
1487 inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
1488 {
1489     float CV_DECL_ALIGNED(32) elems[4] =
1490     {
1491         tab[idxvec.val[0]],
1492         tab[idxvec.val[1]],
1493         tab[idxvec.val[2]],
1494         tab[idxvec.val[3]]
1495     };
1496     return v_float32x4(vle_v_f32m1(elems, 4));
1497 }
1498 inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
1499 {
1500     vfloat64m1_t res = {tab[idxvec.val[0]], tab[idxvec.val[1]]};
1501     return v_float64x2(res);
1502 }
1503 inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
1504 {
1505     vint32m1_t index_x = vmul_vx_i32m1(idxvec.val, 4, 4);
1506     vint32m1_t index_y = vadd_vx_i32m1(index_x, 4, 4);
1507 
1508     x.val = vlxe_v_f32m1(tab, index_x, 4);
1509     y.val = vlxe_v_f32m1(tab, index_y, 4);
1510 }
1511 
1512 inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
1513 {
1514     int CV_DECL_ALIGNED(32) idx[4];
1515     v_store_aligned(idx, idxvec);
1516 
1517     x = v_float64x2(tab[idx[0]], tab[idx[1]]);
1518     y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
1519 }
1520 
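// Narrowing packs: a and b become the two halves of an LMUL=2 register and
// are narrowed in a single instruction (saturating vnclip/vnclipu in most
// cases, plain vnsra/vnsrl for the 64-bit v_pack); the v_rshr_* variants
// apply a right shift by n while narrowing.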
1521 #define OPENCV_HAL_IMPL_RISCVV_PACKS(_Tp, _Tp2, _T2, num2, _T1, num, intrin, shr, _Type) \
1522 inline v_##_Tp##x##num v_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \
1523 { \
1524     v##_Tp2##m2_t  tmp = vundefined_##_T2##m2();    \
1525     tmp = vset_##_T2##m2(tmp, 0, a.val);    \
1526     tmp = vset_##_T2##m2(tmp, 1, b.val);    \
1527     return v_##_Tp##x##num(shr##_##_T1##m1(tmp, 0, num)); \
1528 }\
1529 template<int n> inline \
1530 v_##_Tp##x##num v_rshr_pack(const v_##_Tp2##x##num2& a, const v_##_Tp2##x##num2& b) \
1531 { \
1532     v##_Tp2##m2_t  tmp = vundefined_##_T2##m2();    \
1533     tmp = vset_##_T2##m2(tmp, 0, a.val);    \
1534     tmp = vset_##_T2##m2(tmp, 1, b.val);    \
1535     return v_##_Tp##x##num(intrin##_##_T1##m1(tmp, n, num)); \
1536 }\
1537 inline void v_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \
1538 { \
1539     v##_Tp2##m2_t tmp = vundefined_##_T2##m2();    \
1540     tmp = vset_##_T2##m2(tmp, 0, a.val);    \
1541     tmp = vset_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2));    \
1542     asm("" ::: "memory");                                       \
1543     vse_v_##_T1##m1(ptr, shr##_##_T1##m1(tmp, 0, num), num2); \
1544 }\
1545 template<int n> inline \
1546 void v_rshr_pack_store(_Type* ptr, const v_##_Tp2##x##num2& a) \
1547 { \
1548     v##_Tp2##m2_t tmp = vundefined_##_T2##m2();    \
1549     tmp = vset_##_T2##m2(tmp, 0, a.val);    \
1550     tmp = vset_##_T2##m2(tmp, 1, vmv_v_x_##_T2##m1(0, num2));    \
1551     vse_v_##_T1##m1(ptr, intrin##_##_T1##m1(tmp, n, num), num2); \
1552 }
1553 OPENCV_HAL_IMPL_RISCVV_PACKS(int8, int16, i16, 8, i8, 16, vnclip_vx, vnclip_vx, signed char)
1554 OPENCV_HAL_IMPL_RISCVV_PACKS(int16, int32, i32, 4, i16, 8, vnclip_vx, vnclip_vx, signed short)
1555 OPENCV_HAL_IMPL_RISCVV_PACKS(int32, int64, i64, 2, i32, 4, vnclip_vx, vnsra_vx, int)
1556 OPENCV_HAL_IMPL_RISCVV_PACKS(uint8, uint16, u16, 8, u8, 16, vnclipu_vx, vnclipu_vx, unsigned char)
1557 OPENCV_HAL_IMPL_RISCVV_PACKS(uint16, uint32, u32, 4, u16, 8, vnclipu_vx, vnclipu_vx, unsigned short)
1558 OPENCV_HAL_IMPL_RISCVV_PACKS(uint32, uint64, u64, 2, u32, 4, vnclipu_vx, vnsrl_vx, unsigned int)
1559 
1560 // pack boolean
1561 inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
1562 {
1563     vuint16m2_t tmp = vundefined_u16m2();    \
1564     tmp = vset_u16m2(tmp, 0, a.val);    \
1565     tmp = vset_u16m2(tmp, 1, b.val);    \
1566     return v_uint8x16(vnsrl_vx_u8m1(tmp, 0, 16));
1567 }
1568 
1569 inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
1570                            const v_uint32x4& c, const v_uint32x4& d)
1571 {
1572     vuint32m4_t vabcd = vundefined_u32m4();    \
1573     vuint16m2_t v16 = vundefined_u16m2();    \
1574     vabcd = vset_u32m4(vabcd, 0, a.val);    \
1575     vabcd = vset_u32m4(vabcd, 1, b.val);    \
1576     vabcd = vset_u32m4(vabcd, 2, c.val);    \
1577     vabcd = vset_u32m4(vabcd, 3, d.val);    \
1578     v16 = vnsrl_vx_u16m2(vabcd, 0, 16);
1579     return v_uint8x16(vnsrl_vx_u8m1(v16, 0, 16));
1580 }
1581 
1582 inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
1583                            const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
1584                            const v_uint64x2& g, const v_uint64x2& h)
1585 {
1586     vuint64m8_t v64 = vundefined_u64m8();    \
1587     vuint32m4_t v32 = vundefined_u32m4();    \
1588     vuint16m2_t v16 = vundefined_u16m2();    \
1589     v64 = vset_u64m8(v64, 0, a.val);    \
1590     v64 = vset_u64m8(v64, 1, b.val);    \
1591     v64 = vset_u64m8(v64, 2, c.val);    \
1592     v64 = vset_u64m8(v64, 3, d.val);    \
1593     v64 = vset_u64m8(v64, 4, e.val);    \
1594     v64 = vset_u64m8(v64, 5, f.val);    \
1595     v64 = vset_u64m8(v64, 6, g.val);    \
1596     v64 = vset_u64m8(v64, 7, h.val);    \
1597     v32 = vnsrl_vx_u32m4(v64, 0, 16);
1598     v16 = vnsrl_vx_u16m2(v32, 0, 16);
1599     return v_uint8x16(vnsrl_vx_u8m1(v16, 0, 16));
1600 }
1601 
1602 //inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b) \
1603 //{ \
1604 //    int16xm2_u tmp;    \
1605 //    tmp.m1[0] = (vint16m1_t)a.val;    \
1606 //    tmp.m1[1] = (vint16m1_t)b.val;    \
1607 //    e8xm1_t mask = (e8xm1_t)vmsge_vx_e16xm2_i16m2(tmp.v, 0, 16);\
1608 //    return v_uint8x16(vnclipuvi_mask_u8m1_u16m2(vmv_v_x_u8m1(0, 16), (vuint16m2_t)tmp.v, 0, mask, 16));
1609 //}
1610 
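// v_pack_u: negative lanes are clamped to zero with vmax_vx first, then the
// result is reinterpreted as unsigned and narrowed with saturating vnclipu.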
1611 #define OPENCV_HAL_IMPL_RISCVV_PACK_U(tp1, num1, tp2, num2, _Tp) \
1612 inline v_uint##tp1##x##num1 v_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \
1613 { \
1614     vint##tp2##m2_t tmp = vundefined_##i##tp2##m2();    \
1615     tmp = vset_##i##tp2##m2(tmp, 0, a.val);    \
1616     tmp = vset_##i##tp2##m2(tmp, 1, b.val);    \
1617     vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
1618     return v_uint##tp1##x##num1(vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, 0, num1));    \
1619 } \
1620 inline void v_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \
1621 { \
1622     vint##tp2##m2_t tmp = vundefined_##i##tp2##m2();    \
1623     tmp = vset_##i##tp2##m2(tmp, 0, a.val);    \
1624     vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
1625     return vse_v_u##tp1##m1(ptr, vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, 0, num1), num2);    \
1626 } \
1627 template<int n> inline \
1628 v_uint##tp1##x##num1 v_rshr_pack_u(const v_int##tp2##x##num2& a, const v_int##tp2##x##num2& b) \
1629 { \
1630     vint##tp2##m2_t tmp = vundefined_##i##tp2##m2();    \
1631     tmp = vset_##i##tp2##m2(tmp, 0, a.val);    \
1632     tmp = vset_##i##tp2##m2(tmp, 1, b.val);    \
1633     vint##tp2##m2_t val = vmax_vx_i##tp2##m2(tmp, 0, num1);\
1634     return v_uint##tp1##x##num1(vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val, n, num1));    \
1635 } \
1636 template<int n> inline \
1637 void v_rshr_pack_u_store(_Tp* ptr, const v_int##tp2##x##num2& a) \
1638 { \
1639     vint##tp2##m2_t tmp = vundefined_##i##tp2##m2();    \
1640     tmp = vset_##i##tp2##m2(tmp, 0, a.val);    \
1641     vint##tp2##m2_t val_ = vmax_vx_i##tp2##m2(tmp, 0, num1);\
1642     vuint##tp1##m1_t val = vnclipu_vx_u##tp1##m1((vuint##tp2##m2_t)val_, n, num1);    \
1643     return vse_v_u##tp1##m1(ptr, val, num2);\
1644 }
1645 OPENCV_HAL_IMPL_RISCVV_PACK_U(8, 16, 16, 8, unsigned char )
1646 OPENCV_HAL_IMPL_RISCVV_PACK_U(16, 8, 32, 4, unsigned short)
1647 
1648 #ifdef __GNUC__
1649 #pragma GCC diagnostic push
1650 #pragma GCC diagnostic ignored "-Wuninitialized"
1651 #endif
1652 
1653 // saturating multiply 8-bit, 16-bit
1654 #define OPENCV_HAL_IMPL_RISCVV_MUL_SAT(_Tpvec, _Tpwvec)            \
1655     inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b)  \
1656     {                                                            \
1657         _Tpwvec c, d;                                            \
1658         v_mul_expand(a, b, c, d);                                \
1659         return v_pack(c, d);                                     \
1660     }                                                            \
1661     inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b)      \
1662     { a = a * b; return a; }
1663 
1664 OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int8x16,  v_int16x8)
1665 OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint8x16, v_uint16x8)
1666 OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_int16x8,  v_int32x4)
1667 OPENCV_HAL_IMPL_RISCVV_MUL_SAT(v_uint16x8, v_uint32x4)
1668 
1669 #ifdef __GNUC__
1670 #pragma GCC diagnostic pop
1671 #endif
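// Population count via a 256-entry byte table: vcnt_u8 computes
// popCountTable[v >> 1] + (v & 1) per byte, and the wider element types
// regroup the byte counts with vrgather shuffles and widening adds or
// reductions.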
1672 static const signed char popCountTable[256] =
1673 {
1674     0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1675     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1676     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1677     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1678     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1679     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1680     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1681     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1682     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1683     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1684     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1685     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1686     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1687     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1688     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1689     4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
1690 };
1691 
1692 inline vuint8m1_t vcnt_u8(vuint8m1_t val){
1693     vuint8m1_t v0 = val & 1;
1694     return vlxe_v_u8m1((unsigned char*)popCountTable, val >> 1, 16)+v0;
1695 }
1696 
1697 inline v_uint8x16
1698 v_popcount(const v_uint8x16& a)
1699 {
1700     return v_uint8x16(vcnt_u8(a.val));
1701 }
1702 
1703 inline v_uint8x16
1704 v_popcount(const v_int8x16& a)
1705 {
1706     return v_uint8x16(vcnt_u8((vuint8m1_t)a.val));
1707 }
1708 
1709 inline v_uint16x8
1710 v_popcount(const v_uint16x8& a)
1711 {
1712     vuint8m2_t tmp = vundefined_u8m2();
1713     tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
1714     vuint64m2_t mask = (vuint64m2_t){0x0E0C0A0806040200, 0, 0x0F0D0B0907050301, 0};
1715     tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32);    \
1716     vuint16m2_t res = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 8);
1717     return v_uint16x8(vget_u16m2_u16m1(res, 0));
1718 }
1719 
1720 inline v_uint16x8
1721 v_popcount(const v_int16x8& a)
1722 {
1723     vuint8m2_t tmp = vundefined_u8m2();
1724     tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
1725     vuint64m2_t mask = (vuint64m2_t){0x0E0C0A0806040200, 0, 0x0F0D0B0907050301, 0};
1726     tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32);    \
1727     vuint16m2_t res = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 8);
1728     return v_uint16x8(vget_u16m2_u16m1(res, 0));
1729 }
1730 
1731 inline v_uint32x4
1732 v_popcount(const v_uint32x4& a)
1733 {
1734     vuint8m2_t tmp = vundefined_u8m2();
1735     tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
1736     vuint64m2_t mask = (vuint64m2_t){0xFFFFFFFF0C080400, 0xFFFFFFFF0D090501,
1737                      0xFFFFFFFF0E0A0602, 0xFFFFFFFF0F0B0703};
1738     tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32);    \
1739     vuint16m2_t res_ = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 16);
1740     vuint32m2_t res  = vwaddu_vv_u32m2(vget_u16m2_u16m1(res_, 0), vget_u16m2_u16m1(res_, 1), 8);
1741     return v_uint32x4(vget_u32m2_u32m1(res, 0));
1742 }
1743 
1744 inline v_uint32x4
1745 v_popcount(const v_int32x4& a)
1746 {
1747     vuint8m2_t tmp = vundefined_u8m2();
1748     tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
1749     vuint64m2_t mask = (vuint64m2_t){0xFFFFFFFF0C080400, 0xFFFFFFFF0D090501,
1750                      0xFFFFFFFF0E0A0602, 0xFFFFFFFF0F0B0703};
1751     tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32);    \
1752     vuint16m2_t res_ = vwaddu_vv_u16m2(vget_u8m2_u8m1(tmp, 0), vget_u8m2_u8m1(tmp, 1), 16);
1753     vuint32m2_t res  = vwaddu_vv_u32m2(vget_u16m2_u16m1(res_, 0), vget_u16m2_u16m1(res_, 1), 8);
1754     return v_uint32x4(vget_u32m2_u32m1(res, 0));
1755 }
1756 
1757 inline v_uint64x2
1758 v_popcount(const v_uint64x2& a)
1759 {
1760     vuint8m2_t tmp = vundefined_u8m2();
1761     tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
1762     vuint64m2_t mask = (vuint64m2_t){0x0706050403020100, 0x0000000000000000,
1763                      0x0F0E0D0C0B0A0908, 0x0000000000000000};
1764     tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32);    \
1765     vuint8m1_t zero = vmv_v_x_u8m1(0, 16);
1766     vuint8m1_t res1 = zero;
1767     vuint8m1_t res2 = zero;
1768     res1 = vredsum_vs_u8m1_u8m1(res1, vget_u8m2_u8m1(tmp, 0), zero, 8);
1769     res2 = vredsum_vs_u8m1_u8m1(res2, vget_u8m2_u8m1(tmp, 1), zero, 8);
1770 
1771     return v_uint64x2((unsigned long)vmv_x_s_u8m1_u8(res1, 8), (unsigned long)vmv_x_s_u8m1_u8(res2, 8));
1772 }
1773 
1774 inline v_uint64x2
1775 v_popcount(const v_int64x2& a)
1776 {
1777     vuint8m2_t tmp = vundefined_u8m2();
1778     tmp = vset_u8m2(tmp, 0, vcnt_u8((vuint8m1_t)a.val));
1779     vuint64m2_t mask = (vuint64m2_t){0x0706050403020100, 0x0000000000000000,
1780                      0x0F0E0D0C0B0A0908, 0x0000000000000000};
1781     tmp = vrgather_vv_u8m2(tmp, (vuint8m2_t)mask, 32);    \
1782     vuint8m1_t zero = vmv_v_x_u8m1(0, 16);
1783     vuint8m1_t res1 = zero;
1784     vuint8m1_t res2 = zero;
1785     res1 = vredsum_vs_u8m1_u8m1(res1, vget_u8m2_u8m1(tmp, 0), zero, 8);
1786     res2 = vredsum_vs_u8m1_u8m1(res2, vget_u8m2_u8m1(tmp, 1), zero, 8);
1787 
1788     return v_uint64x2((unsigned long)vmv_x_s_u8m1_u8(res1, 8), (unsigned long)vmv_x_s_u8m1_u8(res2, 8));
1789 }
1790 
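// v_signmask: shift each lane's sign bit down to bit 0, weight the lanes by
// 1, 2, 4, ... and reduce-sum into the packed mask (the 64-bit variants just
// extract the two bits directly).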
1791 #define SMASK 1, 2, 4, 8, 16, 32, 64, 128
1792 inline int v_signmask(const v_uint8x16& a)
1793 {
1794     vuint8m1_t t0  = vsrl_vx_u8m1(a.val, 7, 16);
1795     vuint8m1_t m1  = (vuint8m1_t){SMASK, SMASK};
1796     vuint16m2_t t1 = vwmulu_vv_u16m2(t0, m1, 16);
1797     vuint32m1_t res = vmv_v_x_u32m1(0, 4);
1798     vuint32m2_t t2 = vwmulu_vx_u32m2(vget_u16m2_u16m1(t1, 1), 256, 8);
1799     res = vredsum_vs_u32m2_u32m1(res, t2, res, 8);
1800     res = vwredsumu_vs_u16m1_u32m1(res, vget_u16m2_u16m1(t1, 0), res, 8);
1801     return vmv_x_s_u32m1_u32(res, 8);
1802 }
1803 inline int v_signmask(const v_int8x16& a)
1804 {
1805     vuint8m1_t t0 = vsrl_vx_u8m1((vuint8m1_t)a.val, 7, 16);
1806     vuint8m1_t m1 = (vuint8m1_t){SMASK, SMASK};
1807     vint16m2_t t1 = (vint16m2_t)vwmulu_vv_u16m2(t0, m1, 16);
1808     vint32m1_t res = vmv_v_x_i32m1(0, 4);
1809     vint32m2_t t2 = vwmul_vx_i32m2(vget_i16m2_i16m1(t1, 1), 256, 8);
1810     res = vredsum_vs_i32m2_i32m1(res, t2, res, 8);
1811     res = vwredsum_vs_i16m1_i32m1(res, vget_i16m2_i16m1(t1, 0), res, 8);
1812     return vmv_x_s_i32m1_i32(res, 8);
1813 }
1814 
1815 inline int v_signmask(const v_int16x8& a)
1816 {
1817     vint16m1_t t0 = (vint16m1_t)vsrl_vx_u16m1((vuint16m1_t)a.val, 15, 8);
1818     vint16m1_t m1 = (vint16m1_t){SMASK};
1819     vint16m1_t t1 = vmul_vv_i16m1(t0, m1, 8);
1820     vint16m1_t res = vmv_v_x_i16m1(0, 8);
1821     res = vredsum_vs_i16m1_i16m1(res, t1, res, 8);
1822     return vmv_x_s_i16m1_i16(res, 8);
1823 }
1824 inline int v_signmask(const v_uint16x8& a)
1825 {
1826     vint16m1_t t0 = (vint16m1_t)vsrl_vx_u16m1((vuint16m1_t)a.val, 15, 8);
1827     vint16m1_t m1 = (vint16m1_t){SMASK};
1828     vint16m1_t t1 = vmul_vv_i16m1(t0, m1, 8);
1829     vint16m1_t res = vmv_v_x_i16m1(0, 8);
1830     res = vredsum_vs_i16m1_i16m1(res, t1, res, 8);
1831     return vmv_x_s_i16m1_i16(res, 8);
1832 }
1833 inline int v_signmask(const v_int32x4& a)
1834 {
1835     vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1((vuint32m1_t)a.val, 31, 4);
1836     vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8};
1837     vint32m1_t res = vmv_v_x_i32m1(0, 4);
1838     vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4);
1839     res = vredsum_vs_i32m1_i32m1(res, t1, res, 4);
1840     return vmv_x_s_i32m1_i32(res, 4);
1841 }
1842 inline int v_signmask(const v_uint32x4& a)
1843 {
1844     vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1(a.val, 31, 4);
1845     vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8};
1846     vint32m1_t res = vmv_v_x_i32m1(0, 4);
1847     vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4);
1848     res = vredsum_vs_i32m1_i32m1(res, t1, res, 4);
1849     return vmv_x_s_i32m1_i32(res, 4);
1850 }
1851 inline int v_signmask(const v_uint64x2& a)
1852 {
1853     vuint64m1_t v0 = vsrl_vx_u64m1(a.val, 63, 2);
1854     int res = (int)vext_x_v_u64m1_u64(v0, 0, 2) + ((int)vext_x_v_u64m1_u64(v0, 1, 2) << 1);
1855     return res;
1856 }
1857 inline int v_signmask(const v_int64x2& a)
1858 { return v_signmask(v_reinterpret_as_u64(a)); }
1859 inline int v_signmask(const v_float64x2& a)
1860 { return v_signmask(v_reinterpret_as_u64(a)); }
1861 inline int v_signmask(const v_float32x4& a)
1862 {
1863     vint32m1_t t0 = (vint32m1_t)vsrl_vx_u32m1((vuint32m1_t)a.val, 31, 4);
1864     vint32m1_t m1 = (vint32m1_t){1, 2, 4, 8};
1865     vint32m1_t res = vmv_v_x_i32m1(0, 4);
1866     vint32m1_t t1 = vmul_vv_i32m1(t0, m1, 4);
1867     res = vredsum_vs_i32m1_i32m1(res, t1, res, 4);
1868     return vmv_x_s_i32m1_i32(res, 4);
1869 }
1870 
1871 inline int v_scan_forward(const v_int8x16& a) {
1872 int val = v_signmask(a);
1873 if(val==0) return 0;
1874 else return trailingZeros32(val); }
1875 inline int v_scan_forward(const v_uint8x16& a) {
1876 int val = v_signmask(a);
1877 if(val==0) return 0;
1878 else return trailingZeros32(val); }
1879 inline int v_scan_forward(const v_int16x8& a) {
1880 int val = v_signmask(a);
1881 if(val==0) return 0;
1882 else return trailingZeros32(val); }
1883 inline int v_scan_forward(const v_uint16x8& a) {
1884 int val = v_signmask(a);
1885 if(val==0) return 0;
1886 else return trailingZeros32(val); }
1887 inline int v_scan_forward(const v_int32x4& a) {
1888 int val = v_signmask(a);
1889 if(val==0) return 0;
1890 else return trailingZeros32(val); }
1891 inline int v_scan_forward(const v_uint32x4& a) {
1892 int val = v_signmask(a);
1893 if(val==0) return 0;
1894 else return trailingZeros32(val); }
1895 inline int v_scan_forward(const v_float32x4& a) {
1896 int val = v_signmask(a);
1897 if(val==0) return 0;
1898 else return trailingZeros32(val); }
1899 inline int v_scan_forward(const v_int64x2& a) {
1900 int val = v_signmask(a);
1901 if(val==0) return 0;
1902 else return trailingZeros32(val); }
1903 inline int v_scan_forward(const v_uint64x2& a) {
1904 int val = v_signmask(a);
1905 if(val==0) return 0;
1906 else return trailingZeros32(val); }
1907 
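// v_check_all: invert, keep only the sign bits, and require the OR of the
// two 64-bit halves to be zero (every lane was all-ones); v_check_any keeps
// the sign bits of a directly and requires a non-zero OR.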
1908 #define OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(_Tpvec, suffix, _T, shift, num) \
1909 inline bool v_check_all(const v_##_Tpvec& a) \
1910 { \
1911     suffix##m1_t v0 = vsrl_vx_##_T(vnot_v_##_T(a.val, num), shift, num); \
1912     vuint64m1_t v1 = vuint64m1_t(v0); \
1913     return (v1[0] | v1[1]) == 0; \
1914 } \
1915 inline bool v_check_any(const v_##_Tpvec& a) \
1916 { \
1917     suffix##m1_t v0 = vsrl_vx_##_T(a.val, shift, num); \
1918     vuint64m1_t v1 = vuint64m1_t(v0); \
1919     return (v1[0] | v1[1]) != 0; \
1920 }
1921 
1922 OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint8x16, vuint8,  u8m1, 7, 16)
1923 OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint16x8, vuint16, u16m1, 15, 8)
1924 OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint32x4, vuint32, u32m1, 31, 4)
1925 OPENCV_HAL_IMPL_RISCVV_CHECK_ALLANY(uint64x2, vuint64, u64m1, 63, 2)
1926 
1927 inline bool v_check_all(const v_int8x16& a)
1928 { return v_check_all(v_reinterpret_as_u8(a)); }
1929 inline bool v_check_all(const v_int16x8& a)
1930 { return v_check_all(v_reinterpret_as_u16(a)); }
1931 inline bool v_check_all(const v_int32x4& a)
1932 { return v_check_all(v_reinterpret_as_u32(a)); }
1933 inline bool v_check_all(const v_float32x4& a)
1934 { return v_check_all(v_reinterpret_as_u32(a)); }
1935 inline bool v_check_all(const v_int64x2& a)
1936 { return v_check_all(v_reinterpret_as_u64(a)); }
1937 inline bool v_check_all(const v_float64x2& a)
1938 { return v_check_all(v_reinterpret_as_u64(a)); }
1939 
1940 inline bool v_check_any(const v_int8x16& a)
1941 { return v_check_any(v_reinterpret_as_u8(a)); }
1942 inline bool v_check_any(const v_int16x8& a)
1943 { return v_check_any(v_reinterpret_as_u16(a)); }
1944 inline bool v_check_any(const v_int32x4& a)
1945 { return v_check_any(v_reinterpret_as_u32(a)); }
1946 inline bool v_check_any(const v_float32x4& a)
1947 { return v_check_any(v_reinterpret_as_u32(a)); }
1948 inline bool v_check_any(const v_int64x2& a)
1949 { return v_check_any(v_reinterpret_as_u64(a)); }
1950 inline bool v_check_any(const v_float64x2& a)
1951 { return v_check_any(v_reinterpret_as_u64(a)); }
1952 
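// v_select returns a where the mask lane is set (all-ones) and b elsewhere;
// the comparison result is reinterpreted as a vector-bool and fed to vmerge.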
1953 #define OPENCV_HAL_IMPL_RISCVV_SELECT(_Tpvec, suffix, _Tpvec2, num) \
1954 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1955 { \
1956     return _Tpvec(vmerge_vvm_##suffix(_Tpvec2(mask.val), b.val, a.val, num)); \
1957 }
1958 
1959 OPENCV_HAL_IMPL_RISCVV_SELECT(v_int8x16,  i8m1, vbool8_t, 16)
1960 OPENCV_HAL_IMPL_RISCVV_SELECT(v_int16x8,  i16m1, vbool16_t, 8)
1961 OPENCV_HAL_IMPL_RISCVV_SELECT(v_int32x4,  i32m1, vbool32_t, 4)
1962 OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint8x16, u8m1, vbool8_t, 16)
1963 OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint16x8, u16m1, vbool16_t, 8)
1964 OPENCV_HAL_IMPL_RISCVV_SELECT(v_uint32x4, u32m1, vbool32_t, 4)
1965 inline v_float32x4 v_select(const v_float32x4& mask, const v_float32x4& a, const v_float32x4& b)
1966 {
1967     return v_float32x4((vfloat32m1_t)vmerge_vvm_u32m1((vbool32_t)mask.val, (vuint32m1_t)b.val, (vuint32m1_t)a.val, 4));
1968 }
1969 inline v_float64x2 v_select(const v_float64x2& mask, const v_float64x2& a, const v_float64x2& b)
1970 {
1971     return v_float64x2((vfloat64m1_t)vmerge_vvm_u64m1((vbool64_t)mask.val, (vuint64m1_t)b.val, (vuint64m1_t)a.val, 2));
1972 }
1973 
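// Widening expand: a widening add of zero (vwadd/vwaddu) promotes the lanes
// to twice the width; v_expand_low/high pick one half of the LMUL=2 result
// and v_load_expand widens right after a narrow load.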
1974 #define OPENCV_HAL_IMPL_RISCVV_EXPAND(add, _Tpvec, _Tpwvec, _Tp, _Tp1, num1, _Tp2, num2, _T1, _T2) \
1975 inline void v_expand(const _Tpvec& a, v_##_Tpwvec& b0, v_##_Tpwvec& b1) \
1976 { \
1977     _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num1), num1);    \
1978     b0.val = vget_##_Tp2##m2_##_Tp2##m1(b, 0);  \
1979     b1.val = vget_##_Tp2##m2_##_Tp2##m1(b, 1);  \
1980 } \
1981 inline v_##_Tpwvec v_expand_low(const _Tpvec& a) \
1982 { \
1983     _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num2), num2);    \
1984     return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 0)); \
1985 } \
1986 inline v_##_Tpwvec v_expand_high(const _Tpvec& a) \
1987 { \
1988     _T1##_t b = vw##add##_vv_##_Tp2##m2(a.val, vmv_v_x_##_Tp1(0, num1), num1);    \
1989     return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 1)); \
1990 } \
1991 inline v_##_Tpwvec v_load_expand(const _Tp* ptr) \
1992 { \
1993     _T2##_t val = vle##_v_##_Tp1(ptr, num2);    \
1994     _T1##_t b = vw##add##_vv_##_Tp2##m2(val, vmv_v_x_##_Tp1(0, num2), num2);    \
1995     return v_##_Tpwvec(vget_##_Tp2##m2_##_Tp2##m1(b, 0)); \
1996 }
1997 
1998 OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint8x16, uint16x8, uchar, u8m1, 16, u16, 8, vuint16m2, vuint8m1)
1999 OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint16x8, uint32x4, ushort,  u16m1, 8, u32, 4, vuint32m2, vuint16m1)
2000 OPENCV_HAL_IMPL_RISCVV_EXPAND(addu, v_uint32x4, uint64x2, uint,  u32m1, 4, u64, 2, vuint64m2, vuint32m1)
2001 OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int8x16, int16x8, schar,  i8m1, 16, i16, 8, vint16m2, vint8m1)
2002 OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int16x8, int32x4, short,  i16m1, 8, i32, 4, vint32m2, vint16m1)
2003 OPENCV_HAL_IMPL_RISCVV_EXPAND(add, v_int32x4, int64x2, int,  i32m1, 4, i64, 2, vint64m2, vint32m1)
2004 
2005 inline v_uint32x4 v_load_expand_q(const uchar* ptr)
2006 {
2007     vuint16m2_t b = vundefined_u16m2();
2008     vuint32m2_t c = vundefined_u32m2();
2009     vuint8m1_t val = vle_v_u8m1(ptr, 4);    \
2010     b = vwaddu_vv_u16m2(val, vmv_v_x_u8m1(0, 4), 4);    \
2011     c = vwaddu_vv_u32m2(vget_u16m2_u16m1(b, 0), vmv_v_x_u16m1(0, 4), 4);    \
2012     return v_uint32x4(vget_u32m2_u32m1(c, 0));
2013 }
2014 
2015 inline v_int32x4 v_load_expand_q(const schar* ptr)
2016 {
2017     vint16m2_t b = vundefined_i16m2();
2018     vint32m2_t c = vundefined_i32m2();
2019     vint8m1_t val = vle_v_i8m1(ptr, 4);    \
2020     b = vwadd_vv_i16m2(val, vmv_v_x_i8m1(0, 4), 4);    \
2021     c = vwadd_vv_i32m2(vget_i16m2_i16m1(b, 0), vmv_v_x_i16m1(0, 4), 4);    \
2022     return v_int32x4(vget_i32m2_i32m1(c, 0));
2023 }
2024 #define VITL_16 (vuint64m2_t){0x1303120211011000, 0x1707160615051404, 0x1B0B1A0A19091808, 0x1F0F1E0E1D0D1C0C}
2025 #define VITL_8 (vuint64m2_t){0x0009000100080000, 0x000B0003000A0002, 0x000D0005000C0004, 0x000F0007000E0006}
2026 #define VITL_4 (vuint64m2_t){0x0000000400000000, 0x0000000500000001, 0x0000000600000002, 0x0000000700000003}
2027 #define VITL_2 (vuint64m2_t){0, 2, 1, 3}
2028 #define LOW_4  0x0000000100000000, 0x0000000500000004
2029 #define LOW_8  0x0003000200010000, 0x000B000A00090008
2030 #define LOW_16 0x0706050403020100, 0x1716151413121110
2031 #define HIGH_4  0x0000000300000002, 0x0000000700000006
2032 #define HIGH_8  0x0007000600050004, 0x000F000E000D000C
2033 #define HIGH_16 0x0F0E0D0C0B0A0908,  0x1F1E1D1C1B1A1918
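// v_zip interleaves a0/a1 with a single vrgather over an LMUL=2 register
// using the VITL_* index patterns above; v_combine_low/high and v_recombine
// are built from masked vslideup/vslidedown of half-length pieces.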
2034 #define OPENCV_HAL_IMPL_RISCVV_UNPACKS(_Tpvec, _Tp, _T, _UTp, _UT, num, num2, len, numh) \
2035 inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
2036 { \
2037     v##_Tp##m2_t tmp = vundefined_##_T##m2();\
2038     tmp = vset_##_T##m2(tmp, 0, a0.val); \
2039     tmp = vset_##_T##m2(tmp, 1, a1.val); \
2040     vuint64m2_t mask = VITL_##num;    \
2041     tmp = (v##_Tp##m2_t)vrgather_vv_##_T##m2((v##_Tp##m2_t)tmp, (v##_UTp##m2_t)mask, num2);    \
2042     b0.val = vget_##_T##m2_##_T##m1(tmp, 0); \
2043     b1.val = vget_##_T##m2_##_T##m1(tmp, 1); \
2044 } \
2045 inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
2046 { \
2047     v##_Tp##m1_t b0 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a.val, b.val, numh, num);    \
2048     return v_##_Tpvec(b0);\
2049 } \
2050 inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
2051 { \
2052     v##_Tp##m1_t b0 = vslidedown_vx_##_T##m1(b.val, numh, num);    \
2053     v##_Tp##m1_t a0 = vslidedown_vx_##_T##m1(a.val, numh, num);    \
2054     v##_Tp##m1_t b1 = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num);    \
2055     return v_##_Tpvec(b1);\
2056 } \
2057 inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
2058 { \
2059     c.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a.val, b.val, numh, num);    \
2060     v##_Tp##m1_t b0 = vslidedown_vx_##_T##m1(b.val, numh, num);    \
2061     v##_Tp##m1_t a0 = vslidedown_vx_##_T##m1(a.val, numh, num);    \
2062     d.val = vslideup_vx_##_T##m1_m(vmset_m_##len(num), a0, b0, numh, num);    \
2063 }
2064 
2065 OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint8x16, uint8, u8, uint8, u8, 16, 32, b8, 8)
2066 OPENCV_HAL_IMPL_RISCVV_UNPACKS(int8x16, int8, i8, uint8, u8, 16, 32, b8, 8)
2067 OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint16x8, uint16, u16, uint16, u16, 8, 16, b16, 4)
2068 OPENCV_HAL_IMPL_RISCVV_UNPACKS(int16x8, int16, i16, uint16, u16, 8, 16, b16, 4)
2069 OPENCV_HAL_IMPL_RISCVV_UNPACKS(uint32x4, uint32, u32, uint32, u32, 4, 8, b32, 2)
2070 OPENCV_HAL_IMPL_RISCVV_UNPACKS(int32x4, int32, i32, uint32, u32, 4, 8, b32, 2)
2071 OPENCV_HAL_IMPL_RISCVV_UNPACKS(float32x4, float32, f32, uint32, u32, 4, 8, b32, 2)
2072 OPENCV_HAL_IMPL_RISCVV_UNPACKS(float64x2, float64, f64, uint64, u64, 2, 4, b64, 1)
2073 
2074 inline v_uint8x16 v_reverse(const v_uint8x16 &a)
2075 {
2076     vuint64m1_t mask = (vuint64m1_t){0x08090A0B0C0D0E0F, 0x0001020304050607};
2077     return v_uint8x16(vrgather_vv_u8m1(a.val, (vuint8m1_t)mask, 16));
2078 }
2079 inline v_int8x16 v_reverse(const v_int8x16 &a)
2080 {
2081     vint64m1_t mask = (vint64m1_t){0x08090A0B0C0D0E0F, 0x0001020304050607};
2082     return v_int8x16(vrgather_vv_i8m1(a.val, (vuint8m1_t)mask, 16));
2083 }
2084 
2085 inline v_uint16x8 v_reverse(const v_uint16x8 &a)
2086 {
2087     vuint64m1_t mask = (vuint64m1_t){0x0004000500060007, 0x0000000100020003};
2088     return v_uint16x8(vrgather_vv_u16m1(a.val, (vuint16m1_t)mask, 8));
2089 }
2090 
2091 inline v_int16x8 v_reverse(const v_int16x8 &a)
2092 {
2093     vint64m1_t mask = (vint64m1_t){0x0004000500060007, 0x0000000100020003};
2094     return v_int16x8(vrgather_vv_i16m1(a.val, (vuint16m1_t)mask, 8));
2095 }
2096 inline v_uint32x4 v_reverse(const v_uint32x4 &a)
2097 {
2098     return v_uint32x4(vrgather_vv_u32m1(a.val, (vuint32m1_t){3, 2, 1, 0}, 4));
2099 }
2100 
2101 inline v_int32x4 v_reverse(const v_int32x4 &a)
2102 {
2103     return v_int32x4(vrgather_vv_i32m1(a.val, (vuint32m1_t){3, 2, 1, 0}, 4));
2104 }
2105 
2106 inline v_float32x4 v_reverse(const v_float32x4 &a)
2107 { return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
2108 
2109 inline v_uint64x2 v_reverse(const v_uint64x2 &a)
2110 {
2111     return v_uint64x2(a.val[1], a.val[0]);
2112 }
2113 
2114 inline v_int64x2 v_reverse(const v_int64x2 &a)
2115 {
2116     return v_int64x2(a.val[1], a.val[0]);
2117 }
2118 
2119 inline v_float64x2 v_reverse(const v_float64x2 &a)
2120 {
2121     return v_float64x2(a.val[1], a.val[0]);
2122 }
2123 
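// v_extract<n>(a, b) is simply v_rotate_right<n>(a, b) on the pair;
// v_extract_n<i> reads lane i and v_broadcast_element<i> splats it with
// vrgather_vx.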
2124 #define OPENCV_HAL_IMPL_RISCVV_EXTRACT(_Tpvec, suffix, size) \
2125 template <int n> \
2126 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
2127 { return v_rotate_right<n>(a, b);}
2128 OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint8x16, u8, 0)
2129 OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int8x16, s8, 0)
2130 OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint16x8, u16, 1)
2131 OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int16x8, s16, 1)
2132 OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint32x4, u32, 2)
2133 OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int32x4, s32, 2)
2134 OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_uint64x2, u64, 3)
2135 OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_int64x2, s64, 3)
2136 OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float32x4, f32, 2)
2137 OPENCV_HAL_IMPL_RISCVV_EXTRACT(v_float64x2, f64, 3)
2138 
2139 
2140 #define OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(_Tpvec, _Tp, suffix) \
2141 template<int i> inline _Tp v_extract_n(_Tpvec v) { return v.val[i]; }
2142 
2143 OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint8x16, uchar, u8)
2144 OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int8x16, schar, s8)
2145 OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint16x8, ushort, u16)
2146 OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int16x8, short, s16)
2147 OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint32x4, uint, u32)
2148 OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int32x4, int, s32)
2149 OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_uint64x2, uint64, u64)
2150 OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_int64x2, int64, s64)
2151 OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float32x4, float, f32)
2152 OPENCV_HAL_IMPL_RISCVV_EXTRACT_N(v_float64x2, double, f64)
2153 
2154 #define OPENCV_HAL_IMPL_RISCVV_BROADCAST(_Tpvec, _Tp, num) \
2155 template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) { return _Tpvec(vrgather_vx_##_Tp##m1(v.val, i, num)); }
2156 
2157 OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint8x16, u8, 16)
2158 OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int8x16, i8, 16)
2159 OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint16x8, u16, 8)
2160 OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int16x8, i16, 8)
2161 OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint32x4, u32, 4)
2162 OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int32x4, i32, 4)
2163 OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_uint64x2, u64, 2)
2164 OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_int64x2, i64, 2)
2165 OPENCV_HAL_IMPL_RISCVV_BROADCAST(v_float32x4, f32, 4)
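// Float-to-int conversions pick the rounding mode through the frm CSR via
// __builtin_riscv_fsrm: 0 = nearest-even (v_round), 1 = toward zero
// (v_trunc), 2 = down (v_floor), 3 = up (v_ceil). Lanes whose exponent is
// all ones (NaN/Inf) are masked off to 0 in the masked variants.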
2166 inline v_int32x4 v_round(const v_float32x4& a)
2167 {
2168     __builtin_riscv_fsrm(0);
2169     vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4);
2170     vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2171     vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
2172     __builtin_riscv_fsrm(0);
2173     return v_int32x4(val);
2174 }
2175 inline v_int32x4 v_floor(const v_float32x4& a)
2176 {
2177     __builtin_riscv_fsrm(2);
2178     vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4);
2179     vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2180     vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
2181     __builtin_riscv_fsrm(0);
2182     return v_int32x4(val);
2183 }
2184 
2185 inline v_int32x4 v_ceil(const v_float32x4& a)
2186 {
2187     __builtin_riscv_fsrm(3);
2188     vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4);
2189     vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2190     vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
2191     __builtin_riscv_fsrm(0);
2192     return v_int32x4(val);
2193 }
2194 
2195 inline v_int32x4 v_trunc(const v_float32x4& a)
2196 {
2197     __builtin_riscv_fsrm(1);
2198     vint32m1_t nan = vand_vx_i32m1((vint32m1_t)a.val, 0x7f800000, 4);
2199     vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2200     vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), a.val, 4);
2201     __builtin_riscv_fsrm(0);
2202     return v_int32x4(val);
2203 }
2204 
2205 inline v_int32x4 v_round(const v_float64x2& a)
2206 {
2207     __builtin_riscv_fsrm(0);
2208     vfloat64m2_t _val = vundefined_f64m2();
2209     _val = vset_f64m2(_val, 0, a.val);
2210     //_val = vset_f64m2(_val, 1, a.val);
2211     _val = vset_f64m2(_val, 1, vfmv_v_f_f64m1(0, 2));
2212     vint32m1_t val = vfncvt_x_f_v_i32m1(_val, 4);
2213     __builtin_riscv_fsrm(0);
2214     return v_int32x4(val);
2215 }
2216 inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
2217 {
2218     __builtin_riscv_fsrm(0);
2219     vfloat64m2_t _val = vundefined_f64m2();
2220     _val = vset_f64m2(_val, 0, a.val);
2221     _val = vset_f64m2(_val, 1, b.val);
2222     vint32m1_t val = vfncvt_x_f_v_i32m1(_val, 4);
2223     __builtin_riscv_fsrm(0);
2224     return v_int32x4(val);
2225 }
2226 inline v_int32x4 v_floor(const v_float64x2& a)
2227 {
2228     __builtin_riscv_fsrm(2);
2229     vfloat64m2_t _val = vundefined_f64m2();
2230     _val = vset_f64m2(_val, 0, a.val);
2231     vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2);
2232 
2233     vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4);
2234     vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2235     vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
2236     __builtin_riscv_fsrm(0);
2237     return v_int32x4(val);
2238 }
2239 
2240 inline v_int32x4 v_ceil(const v_float64x2& a)
2241 {
2242     __builtin_riscv_fsrm(3);
2243     vfloat64m2_t _val = vundefined_f64m2();
2244     _val = vset_f64m2(_val, 0, a.val);
2245     vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2);
2246 
2247     vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4);
2248     vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2249     vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
2250     __builtin_riscv_fsrm(0);
2251     return v_int32x4(val);
2252 }
2253 
2254 inline v_int32x4 v_trunc(const v_float64x2& a)
2255 {
2256     __builtin_riscv_fsrm(1);
2257     vfloat64m2_t _val = vundefined_f64m2();
2258     _val = vset_f64m2(_val, 0, a.val);
2259     vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2);
2260 
2261     vint32m1_t nan = vand_vx_i32m1((vint32m1_t)aval, 0x7f800000, 4);
2262     vbool32_t mask = vmsne_vx_i32m1_b32(nan, 0x7f800000, 4);
2263     vint32m1_t val = vfcvt_x_f_v_i32m1_m(mask, vmv_v_x_i32m1(0, 4), aval, 4);
2264     __builtin_riscv_fsrm(0);
2265     return v_int32x4(val);
2266 }
2267 
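// Interleaved load/store maps directly onto RVV segment instructions
// (vlseg2/3/4e and vsseg2/3/4e) operating on m1x2/m1x3/m1x4 register tuples.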
2268 #define OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(intrin, _Tpvec, num, _Tp, _T)    \
2269 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \
2270 { \
2271     v##_Tpvec##m1x2_t ret = intrin##2e_v_##_T##m1x2(ptr, num);\
2272     a.val = vget_##_T##m1x2_##_T##m1(ret, 0);  \
2273     b.val = vget_##_T##m1x2_##_T##m1(ret, 1);  \
2274 } \
2275 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \
2276 { \
2277     v##_Tpvec##m1x3_t ret = intrin##3e_v_##_T##m1x3(ptr, num);\
2278     a.val = vget_##_T##m1x3_##_T##m1(ret, 0);  \
2279     b.val = vget_##_T##m1x3_##_T##m1(ret, 1);  \
2280     c.val = vget_##_T##m1x3_##_T##m1(ret, 2);  \
2281 }\
2282 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \
2283                                 v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \
2284 { \
2285     v##_Tpvec##m1x4_t ret = intrin##4e_v_##_T##m1x4(ptr, num);\
2286     a.val = vget_##_T##m1x4_##_T##m1(ret, 0);  \
2287     b.val = vget_##_T##m1x4_##_T##m1(ret, 1);  \
2288     c.val = vget_##_T##m1x4_##_T##m1(ret, 2);  \
2289     d.val = vget_##_T##m1x4_##_T##m1(ret, 3);  \
2290 } \
2291 
2292 #define OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(intrin, _Tpvec, num, _Tp, _T)    \
2293 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
2294                                 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2295 { \
2296     v##_Tpvec##m1x2_t ret = vundefined_##_T##m1x2();      \
2297     ret = vset_##_T##m1x2(ret, 0, a.val);  \
2298     ret = vset_##_T##m1x2(ret, 1, b.val);  \
2299     intrin##2e_v_##_T##m1x2(ptr, ret, num); \
2300 } \
2301 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
2302                                 const v_##_Tpvec##x##num& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2303 { \
2304     v##_Tpvec##m1x3_t ret = vundefined_##_T##m1x3();       \
2305     ret = vset_##_T##m1x3(ret, 0, a.val);  \
2306     ret = vset_##_T##m1x3(ret, 1, b.val);  \
2307     ret = vset_##_T##m1x3(ret, 2, c.val);  \
2308     intrin##3e_v_##_T##m1x3(ptr, ret, num); \
2309 } \
2310 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
2311                                 const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \
2312                                 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
2313 { \
2314     v##_Tpvec##m1x4_t ret = vundefined_##_T##m1x4();             \
2315     ret = vset_##_T##m1x4(ret, 0, a.val);  \
2316     ret = vset_##_T##m1x4(ret, 1, b.val);  \
2317     ret = vset_##_T##m1x4(ret, 2, c.val);  \
2318     ret = vset_##_T##m1x4(ret, 3, d.val);  \
2319     intrin##4e_v_##_T##m1x4(ptr, ret, num); \
2320 }
2321 
2322 #define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(_Tpvec, _Tp, num, ld, st, _T) \
2323 OPENCV_HAL_IMPL_RISCVV_LOAD_DEINTERLEAVED(ld, _Tpvec, num, _Tp, _T)    \
2324 OPENCV_HAL_IMPL_RISCVV_STORE_INTERLEAVED(st, _Tpvec, num, _Tp, _T)
2325 
2326 //OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, uchar, )
2327 OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int8, schar, 16, vlseg, vsseg, i8)
2328 OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int16, short, 8, vlseg, vsseg, i16)
2329 OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(int32, int, 4, vlseg, vsseg, i32)
2330 
2331 OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint8, unsigned char, 16, vlseg, vsseg, u8)
2332 OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint16, unsigned short, 8, vlseg, vsseg, u16)
2333 OPENCV_HAL_IMPL_RISCVV_INTERLEAVED(uint32, unsigned int, 4, vlseg, vsseg, u32)
2334 
#define OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(_Tpvec, _Tp, num, _T) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b) \
{ \
    v##_Tpvec##m1x2_t ret = vlseg2e_v_##_T##m1x2(ptr, num); \
    a.val = vget_##_T##m1x2_##_T##m1(ret, 0);  \
    b.val = vget_##_T##m1x2_##_T##m1(ret, 1);  \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, v_##_Tpvec##x##num& c) \
{ \
    v##_Tpvec##m1x3_t ret = vlseg3e_v_##_T##m1x3(ptr, num);    \
    a.val = vget_##_T##m1x3_##_T##m1(ret, 0);  \
    b.val = vget_##_T##m1x3_##_T##m1(ret, 1);  \
    c.val = vget_##_T##m1x3_##_T##m1(ret, 2);  \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec##x##num& a, v_##_Tpvec##x##num& b, \
                                v_##_Tpvec##x##num& c, v_##_Tpvec##x##num& d) \
{ \
    v##_Tpvec##m1x4_t ret = vlseg4e_v_##_T##m1x4(ptr, num);    \
    a.val = vget_##_T##m1x4_##_T##m1(ret, 0);  \
    b.val = vget_##_T##m1x4_##_T##m1(ret, 1);  \
    c.val = vget_##_T##m1x4_##_T##m1(ret, 2);  \
    d.val = vget_##_T##m1x4_##_T##m1(ret, 3);  \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    v##_Tpvec##m1x2_t ret = vundefined_##_T##m1x2();    \
    ret = vset_##_T##m1x2(ret, 0, a.val);  \
    ret = vset_##_T##m1x2(ret, 1, b.val);  \
    vsseg2e_v_##_T##m1x2(ptr, ret, num);    \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                const v_##_Tpvec##x##num& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
{ \
    v##_Tpvec##m1x3_t ret = vundefined_##_T##m1x3();    \
    ret = vset_##_T##m1x3(ret, 0, a.val);  \
    ret = vset_##_T##m1x3(ret, 1, b.val);  \
    ret = vset_##_T##m1x3(ret, 2, c.val);  \
    vsseg3e_v_##_T##m1x3(ptr, ret, num);    \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec##x##num& a, const v_##_Tpvec##x##num& b, \
                                const v_##_Tpvec##x##num& c, const v_##_Tpvec##x##num& d, \
                                hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
{ \
    v##_Tpvec##m1x4_t ret = vundefined_##_T##m1x4();    \
    ret = vset_##_T##m1x4(ret, 0, a.val);  \
    ret = vset_##_T##m1x4(ret, 1, b.val);  \
    ret = vset_##_T##m1x4(ret, 2, c.val);  \
    ret = vset_##_T##m1x4(ret, 3, d.val);  \
    vsseg4e_v_##_T##m1x4(ptr, ret, num);    \
}
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float32, float, 4, f32)
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(float64, double, 2, f64)

OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(uint64, unsigned long, 2, u64)
OPENCV_HAL_IMPL_RISCVV_INTERLEAVED_(int64, long, 2, i64)

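////// Type conversions ///////
// Widening conversions (i32/f32 -> f64) go through an m2 register group and
// extract the requested half; narrowing (f64 -> f32) packs one or two f64
// vectors into an m2 group before the down-convert.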
inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{
    return v_float32x4(vfcvt_f_x_v_f32m1(a.val, 4));
}

#if CV_SIMD128_64F
inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{
    vfloat64m2_t _val = vundefined_f64m2();
    _val = vset_f64m2(_val, 0, a.val);
    vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 2);
    return v_float32x4(aval);
}

inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{
    vfloat64m2_t _val = vundefined_f64m2();
    _val = vset_f64m2(_val, 0, a.val);
    _val = vset_f64m2(_val, 1, b.val);
    vfloat32m1_t aval = vfncvt_f_f_v_f32m1(_val, 4);
    return v_float32x4(aval);
}

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
    vfloat32m1_t val = vfcvt_f_x_v_f32m1(a.val, 4);
    vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(val, 4);
    return v_float64x2(vget_f64m2_f64m1(_val, 0));
}

inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{
    vfloat32m1_t val = vfcvt_f_x_v_f32m1(a.val, 4);
    vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(val, 4);
    return v_float64x2(vget_f64m2_f64m1(_val, 1));
}

inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{
    vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(a.val, 4);
    return v_float64x2(vget_f64m2_f64m1(_val, 0));
}

inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{
    vfloat64m2_t _val = vfwcvt_f_f_v_f64m2(a.val, 4);
    return v_float64x2(vget_f64m2_f64m1(_val, 1));
}

inline v_float64x2 v_cvt_f64(const v_int64x2& a)
{
    return v_float64x2(vfcvt_f_x_v_f64m1(a.val, 2));
}

#endif
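// In-register shuffles: elements are reordered with vrgather over the byte
// view of the vector, with each 128-bit index pattern written as two 64-bit
// literals (low half first).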
inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
{
    vuint64m1_t m0 = {0x0705060403010200, 0x0F0D0E0C0B090A08};
    return v_int8x16(vrgather_vv_i8m1(vec.val, (vuint8m1_t)m0, 16));
}
inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec)
{
    return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec)));
}

inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
{
    vuint64m1_t m0 = {0x0703060205010400, 0x0F0B0E0A0D090C08};
    return v_int8x16(vrgather_vv_i8m1(vec.val, (vuint8m1_t)m0, 16));
}
inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec)
{
    return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec)));
}

inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
{
    vuint64m1_t m0 = {0x0706030205040100, 0x0F0E0B0A0D0C0908};
    return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)vec.val, (vuint8m1_t)m0, 16));
}
inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
{
    vuint64m1_t m0 = {0x0B0A030209080100, 0x0F0E07060D0C0504};
    return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
}
inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }

inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
{
    vuint64m1_t m0 = {0x0B0A090803020100, 0x0F0E0D0C07060504};
    return v_int32x4((vint32m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
}
inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }

inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
{
    vuint64m1_t m0 = {0x0908060504020100, 0xFFFFFFFF0E0D0C0A};
    return v_int8x16((vint8m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
}
inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }

inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
{
    vuint64m1_t m0 = {0x0908050403020100, 0xFFFFFFFF0D0C0B0A};
    return v_int16x8((vint16m1_t)vrgather_vv_u8m1((vuint8m1_t)(vec.val), (vuint8m1_t)m0, 16));
}
inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }

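// For 4-lane vectors the first three elements are already contiguous, so the
// 32-bit v_pack_triplets variants are no-ops.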
inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }

#if CV_SIMD128_64F
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
                                    const v_float64x2& c)
{ return v_dotprod_expand(a, b) + c; }
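// The "fast" variant widens with a single vwmul and adds the low and high
// halves of the m2 product, so it sums lane pairs (0,2) and (1,3) rather than
// the adjacent pairs (0,1) and (2,3) used by v_dotprod_expand above.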
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
{
    vint64m2_t v1 = vwmul_vv_i64m2(a.val, b.val, 4);
    vfloat64m1_t res = vfcvt_f_x_v_f64m1(vadd_vv_i64m1(vget_i64m2_i64m1(v1, 0), vget_i64m2_i64m1(v1, 1), 2), 2);
    return v_float64x2(res);
}
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_dotprod_expand_fast(a, b) + c; }
#endif
////// FP16 support ///////
inline v_float32x4 v_load_expand(const float16_t* ptr)
{
    vfloat16m1_t v = vle_v_f16m1((__fp16*)ptr, 4);
    vfloat32m2_t v32 = vfwcvt_f_f_v_f32m2(v, 4);
    return v_float32x4(vget_f32m2_f32m1(v32, 0));
}

inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
{
    vfloat32m2_t v32 = vundefined_f32m2();
    v32 = vset_f32m2(v32, 0, v.val);
    vfloat16m1_t hv = vfncvt_f_f_v_f16m1(v32, 4);
    vse_v_f16m1((__fp16*)ptr, hv, 4);
}
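// Usage sketch (hypothetical `src`/`dst` buffers holding at least four
// float16_t values):
//     v_float32x4 v = v_load_expand(src);  // float16 -> float32
//     v_pack_store(dst, v);                // float32 -> float16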

inline void v_cleanup() {}

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

//! @endcond

}
#endif