1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html.
4 
5 // The original implementation has been contributed by Yin Zhang.
6 // Copyright (C) 2020, Institute of Software, Chinese Academy of Sciences.
7 
8 #ifndef OPENCV_HAL_INTRIN_RVV_HPP
9 #define OPENCV_HAL_INTRIN_RVV_HPP
10 
11 #include <algorithm>
12 
13 namespace cv
14 {
15 
16 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
17 
18 #define CV_SIMD128 1
19 #define CV_SIMD128_64F 1
20 
21 //////////// Unsupported native intrinsics in C++ ////////////
22 
23 struct vuint8mf2_t
24 {
25     uchar val[8] = {0};
vuint8mf2_tcv::vuint8mf2_t26     vuint8mf2_t() {}
vuint8mf2_tcv::vuint8mf2_t27     vuint8mf2_t(const uchar* ptr)
28     {
29         for (int i = 0; i < 8; ++i)
30         {
31             val[i] = ptr[i];
32         }
33     }
34 };
35 struct vint8mf2_t
36 {
37     schar val[8] = {0};
vint8mf2_tcv::vint8mf2_t38     vint8mf2_t() {}
vint8mf2_tcv::vint8mf2_t39     vint8mf2_t(const schar* ptr)
40     {
41         for (int i = 0; i < 8; ++i)
42         {
43             val[i] = ptr[i];
44         }
45     }
46 };
47 struct vuint16mf2_t
48 {
49     ushort val[4] = {0};
vuint16mf2_tcv::vuint16mf2_t50     vuint16mf2_t() {}
vuint16mf2_tcv::vuint16mf2_t51     vuint16mf2_t(const ushort* ptr)
52     {
53         for (int i = 0; i < 4; ++i)
54         {
55             val[i] = ptr[i];
56         }
57     }
58 };
59 struct vint16mf2_t
60 {
61     short val[4] = {0};
vint16mf2_tcv::vint16mf2_t62     vint16mf2_t() {}
vint16mf2_tcv::vint16mf2_t63     vint16mf2_t(const short* ptr)
64     {
65         for (int i = 0; i < 4; ++i)
66         {
67             val[i] = ptr[i];
68         }
69     }
70 };
71 struct vuint32mf2_t
72 {
73     unsigned val[2] = {0};
vuint32mf2_tcv::vuint32mf2_t74     vuint32mf2_t() {}
vuint32mf2_tcv::vuint32mf2_t75     vuint32mf2_t(const unsigned* ptr)
76     {
77         val[0] = ptr[0];
78         val[1] = ptr[1];
79     }
80 };
81 struct vint32mf2_t
82 {
83     int val[2] = {0};
vint32mf2_tcv::vint32mf2_t84     vint32mf2_t() {}
vint32mf2_tcv::vint32mf2_t85     vint32mf2_t(const int* ptr)
86     {
87         val[0] = ptr[0];
88         val[1] = ptr[1];
89     }
90 };
91 struct vfloat32mf2_t
92 {
93     float val[2] = {0};
vfloat32mf2_tcv::vfloat32mf2_t94     vfloat32mf2_t() {}
vfloat32mf2_tcv::vfloat32mf2_t95     vfloat32mf2_t(const float* ptr)
96     {
97         val[0] = ptr[0];
98         val[1] = ptr[1];
99     }
100 };
101 struct vuint64mf2_t
102 {
103     uint64 val[1] = {0};
vuint64mf2_tcv::vuint64mf2_t104     vuint64mf2_t() {}
vuint64mf2_tcv::vuint64mf2_t105     vuint64mf2_t(const uint64* ptr)
106     {
107         val[0] = ptr[0];
108     }
109 };
110 struct vint64mf2_t
111 {
112     int64 val[1] = {0};
vint64mf2_tcv::vint64mf2_t113     vint64mf2_t() {}
vint64mf2_tcv::vint64mf2_t114     vint64mf2_t(const int64* ptr)
115     {
116         val[0] = ptr[0];
117     }
118 };
119 struct vfloat64mf2_t
120 {
121     double val[1] = {0};
vfloat64mf2_tcv::vfloat64mf2_t122     vfloat64mf2_t() {}
vfloat64mf2_tcv::vfloat64mf2_t123     vfloat64mf2_t(const double* ptr)
124     {
125         val[0] = ptr[0];
126     }
127 };
128 struct vuint8mf4_t
129 {
130     uchar val[4] = {0};
vuint8mf4_tcv::vuint8mf4_t131     vuint8mf4_t() {}
vuint8mf4_tcv::vuint8mf4_t132     vuint8mf4_t(const uchar* ptr)
133     {
134         for (int i = 0; i < 4; ++i)
135         {
136             val[i] = ptr[i];
137         }
138     }
139 };
140 struct vint8mf4_t
141 {
142     schar val[4] = {0};
vint8mf4_tcv::vint8mf4_t143     vint8mf4_t() {}
vint8mf4_tcv::vint8mf4_t144     vint8mf4_t(const schar* ptr)
145     {
146         for (int i = 0; i < 4; ++i)
147         {
148             val[i] = ptr[i];
149         }
150     }
151 };
152 
153 #define OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(_Tpvec, _Tp, suffix, width, n) \
154 inline _Tpvec vle##width##_v_##suffix##mf2(const _Tp* ptr) \
155 { \
156     return _Tpvec(ptr); \
157 } \
158 inline void vse##width##_v_##suffix##mf2(_Tp* ptr, _Tpvec v) \
159 { \
160     for (int i = 0; i < n; ++i) \
161     { \
162             ptr[i] = v.val[i]; \
163     } \
164 }
165 
166 OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint8mf2_t, uint8_t, u8, 8, 8)
167 OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint8mf2_t, int8_t, i8, 8, 8)
168 OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint16mf2_t, uint16_t, u16, 16, 4)
169 OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint16mf2_t, int16_t, i16, 16, 4)
170 OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint32mf2_t, uint32_t, u32, 32, 2)
171 OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint32mf2_t, int32_t, i32, 32, 2)
172 OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vfloat32mf2_t, float32_t, f32, 32, 2)
173 OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint64mf2_t, uint64_t, u64, 64, 1)
174 OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint64mf2_t, int64_t, i64, 64, 1)
175 OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vfloat64mf2_t, float64_t, f64, 64, 1)
176 
177 
178 #define OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(_Tpwvec, _Tpvec, _wTp, wcvt, suffix, width, n) \
179 inline _Tpwvec wcvt (_Tpvec v) \
180 { \
181     _wTp tmp[n]; \
182     for (int i = 0; i < n; ++i) \
183     { \
184             tmp[i] = (_wTp)v.val[i]; \
185     } \
186     vsetvlmax_e##width##m1(); \
187     return vle##width##_v_##suffix##m1(tmp); \
188 }
189 
190 OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint16m1_t, vuint8mf2_t, ushort, vwcvtu_x_x_v_u16m1, u16, 16, 8)
191 OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint16m1_t, vint8mf2_t, short, vwcvt_x_x_v_i16m1, i16, 16, 8)
192 OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint32m1_t, vuint16mf2_t, unsigned, vwcvtu_x_x_v_u32m1, u32, 32, 4)
193 OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint32m1_t, vint16mf2_t, int, vwcvt_x_x_v_i32m1, i32, 32, 4)
194 OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint64m1_t, vuint32mf2_t, uint64, vwcvtu_x_x_v_u64m1, u64, 64, 2)
195 OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint64m1_t, vint32mf2_t, int64, vwcvt_x_x_v_i64m1, i64, 64, 2)
196 
vle8_v_u8mf4(const uint8_t * base)197 inline vuint8mf4_t vle8_v_u8mf4 (const uint8_t *base)
198 {
199     return vuint8mf4_t(base);
200 }
vle8_v_i8mf4(const int8_t * base)201 inline vint8mf4_t vle8_v_i8mf4 (const int8_t *base)
202 {
203     return vint8mf4_t(base);
204 }
205 
vwcvtu_x_x_v_u16mf2(vuint8mf4_t src)206 inline vuint16mf2_t vwcvtu_x_x_v_u16mf2 (vuint8mf4_t src)
207 {
208     ushort tmp[4];
209     for (int i = 0; i < 4; ++i)
210     {
211             tmp[i] = (ushort)src.val[i];
212     }
213     return vle16_v_u16mf2(tmp);
214 }
vwcvt_x_x_v_i16mf2(vint8mf4_t src)215 inline vint16mf2_t vwcvt_x_x_v_i16mf2 (vint8mf4_t src)
216 {
217     short tmp[4];
218     for (int i = 0; i < 4; ++i)
219     {
220             tmp[i] = (short)src.val[i];
221     }
222     return vle16_v_i16mf2(tmp);
223 }
224 
225 //////////// Types ////////////
226 
227 struct v_uint8x16
228 {
229     typedef uchar lane_type;
230     enum { nlanes = 16 };
231 
v_uint8x16cv::v_uint8x16232     v_uint8x16() {}
v_uint8x16cv::v_uint8x16233     explicit v_uint8x16(vuint8m1_t v)
234     {
235         vsetvlmax_e8m1();
236         vse8_v_u8m1(val, v);
237     }
v_uint8x16cv::v_uint8x16238     v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
239                uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
240     {
241         uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
242         for (int i = 0; i < nlanes; ++i)
243         {
244             val[i] = v[i];
245         }
246     }
operator vuint8m1_tcv::v_uint8x16247     operator vuint8m1_t() const
248     {
249         vsetvlmax_e8m1();
250         return vle8_v_u8m1(val);
251     }
get0cv::v_uint8x16252     uchar get0() const
253     {
254         return val[0];
255     }
256 
257     uchar val[16];
258 };
259 
260 struct v_int8x16
261 {
262     typedef schar lane_type;
263     enum { nlanes = 16 };
264 
v_int8x16cv::v_int8x16265     v_int8x16() {}
v_int8x16cv::v_int8x16266     explicit v_int8x16(vint8m1_t v)
267     {
268         vsetvlmax_e8m1();
269         vse8_v_i8m1(val, v);
270     }
v_int8x16cv::v_int8x16271     v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
272                schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
273     {
274         schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
275         for (int i = 0; i < nlanes; ++i)
276         {
277             val[i] = v[i];
278         }
279     }
operator vint8m1_tcv::v_int8x16280     operator vint8m1_t() const
281     {
282         vsetvlmax_e8m1();
283         return vle8_v_i8m1(val);
284     }
get0cv::v_int8x16285     schar get0() const
286     {
287         return val[0];
288     }
289 
290     schar val[16];
291 };
292 
293 struct v_uint16x8
294 {
295     typedef ushort lane_type;
296     enum { nlanes = 8 };
297 
v_uint16x8cv::v_uint16x8298     v_uint16x8() {}
v_uint16x8cv::v_uint16x8299     explicit v_uint16x8(vuint16m1_t v)
300     {
301         vsetvlmax_e16m1();
302         vse16_v_u16m1(val, v);
303     }
v_uint16x8cv::v_uint16x8304     v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
305     {
306         ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
307         for (int i = 0; i < nlanes; ++i)
308         {
309             val[i] = v[i];
310         }
311     }
operator vuint16m1_tcv::v_uint16x8312     operator vuint16m1_t() const
313     {
314         vsetvlmax_e16m1();
315         return vle16_v_u16m1(val);
316     }
get0cv::v_uint16x8317     ushort get0() const
318     {
319         return val[0];
320     }
321 
322     ushort val[8];
323 };
324 
325 struct v_int16x8
326 {
327     typedef short lane_type;
328     enum { nlanes = 8 };
329 
v_int16x8cv::v_int16x8330     v_int16x8() {}
v_int16x8cv::v_int16x8331     explicit v_int16x8(vint16m1_t v)
332     {
333         vsetvlmax_e16m1();
334         vse16_v_i16m1(val, v);
335     }
v_int16x8cv::v_int16x8336     v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
337     {
338         short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
339         for (int i = 0; i < nlanes; ++i)
340         {
341             val[i] = v[i];
342         }
343     }
operator vint16m1_tcv::v_int16x8344     operator vint16m1_t() const
345     {
346         vsetvlmax_e16m1();
347         return vle16_v_i16m1(val);
348     }
get0cv::v_int16x8349     short get0() const
350     {
351         return val[0];
352     }
353 
354     short val[8];
355 };
356 
357 struct v_uint32x4
358 {
359     typedef unsigned lane_type;
360     enum { nlanes = 4 };
361 
v_uint32x4cv::v_uint32x4362     v_uint32x4() {}
v_uint32x4cv::v_uint32x4363     explicit v_uint32x4(vuint32m1_t v)
364     {
365         vsetvlmax_e32m1();
366         vse32_v_u32m1(val, v);
367     }
v_uint32x4cv::v_uint32x4368     v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
369     {
370         unsigned v[] = {v0, v1, v2, v3};
371         for (int i = 0; i < nlanes; ++i)
372         {
373             val[i] = v[i];
374         }
375     }
operator vuint32m1_tcv::v_uint32x4376     operator vuint32m1_t() const
377     {
378         vsetvlmax_e32m1();
379         return vle32_v_u32m1(val);
380     }
get0cv::v_uint32x4381     unsigned get0() const
382     {
383         return val[0];
384     }
385 
386     unsigned val[4];
387 };
388 
389 struct v_int32x4
390 {
391     typedef int lane_type;
392     enum { nlanes = 4 };
393 
v_int32x4cv::v_int32x4394     v_int32x4() {}
v_int32x4cv::v_int32x4395     explicit v_int32x4(vint32m1_t v)
396     {
397         vsetvlmax_e32m1();
398         vse32_v_i32m1(val, v);
399     }
v_int32x4cv::v_int32x4400     v_int32x4(int v0, int v1, int v2, int v3)
401     {
402         int v[] = {v0, v1, v2, v3};
403         for (int i = 0; i < nlanes; ++i)
404         {
405             val[i] = v[i];
406         }
407     }
operator vint32m1_tcv::v_int32x4408     operator vint32m1_t() const
409     {
410         vsetvlmax_e32m1();
411         return vle32_v_i32m1(val);
412     }
get0cv::v_int32x4413     int get0() const
414     {
415         return val[0];
416     }
417     int val[4];
418 };
419 
420 struct v_float32x4
421 {
422     typedef float lane_type;
423     enum { nlanes = 4 };
424 
v_float32x4cv::v_float32x4425     v_float32x4() {}
v_float32x4cv::v_float32x4426     explicit v_float32x4(vfloat32m1_t v)
427     {
428         vsetvlmax_e32m1();
429         vse32_v_f32m1(val, v);
430     }
v_float32x4cv::v_float32x4431     v_float32x4(float v0, float v1, float v2, float v3)
432     {
433         float v[] = {v0, v1, v2, v3};
434         for (int i = 0; i < nlanes; ++i)
435         {
436             val[i] = v[i];
437         }
438     }
operator vfloat32m1_tcv::v_float32x4439     operator vfloat32m1_t() const
440     {
441         vsetvlmax_e32m1();
442         return vle32_v_f32m1(val);
443     }
get0cv::v_float32x4444     float get0() const
445     {
446         return val[0];
447     }
448     float val[4];
449 };
450 
451 struct v_uint64x2
452 {
453     typedef uint64 lane_type;
454     enum { nlanes = 2 };
455 
v_uint64x2cv::v_uint64x2456     v_uint64x2() {}
v_uint64x2cv::v_uint64x2457     explicit v_uint64x2(vuint64m1_t v)
458     {
459         vsetvlmax_e64m1();
460         vse64_v_u64m1(val, v);
461     }
v_uint64x2cv::v_uint64x2462     v_uint64x2(uint64 v0, uint64 v1)
463     {
464         uint64 v[] = {v0, v1};
465         for (int i = 0; i < nlanes; ++i)
466         {
467             val[i] = v[i];
468         }
469     }
operator vuint64m1_tcv::v_uint64x2470     operator vuint64m1_t() const
471     {
472         vsetvlmax_e64m1();
473         return vle64_v_u64m1(val);
474     }
get0cv::v_uint64x2475     uint64 get0() const
476     {
477         return val[0];
478     }
479 
480     uint64 val[2];
481 };
482 
483 struct v_int64x2
484 {
485     typedef int64 lane_type;
486     enum { nlanes = 2 };
487 
v_int64x2cv::v_int64x2488     v_int64x2() {}
v_int64x2cv::v_int64x2489     explicit v_int64x2(vint64m1_t v)
490     {
491         vsetvlmax_e64m1();
492         vse64_v_i64m1(val, v);
493     }
v_int64x2cv::v_int64x2494     v_int64x2(int64 v0, int64 v1)
495     {
496         int64 v[] = {v0, v1};
497         for (int i = 0; i < nlanes; ++i)
498         {
499             val[i] = v[i];
500         }
501     }
operator vint64m1_tcv::v_int64x2502     operator vint64m1_t() const
503     {
504         vsetvlmax_e64m1();
505         return vle64_v_i64m1(val);
506     }
get0cv::v_int64x2507     int64 get0() const
508     {
509         return val[0];
510     }
511 
512     int64 val[2];
513 };
514 
515 #if CV_SIMD128_64F
516 struct v_float64x2
517 {
518     typedef double lane_type;
519     enum { nlanes = 2 };
520 
v_float64x2cv::v_float64x2521     v_float64x2() {}
v_float64x2cv::v_float64x2522     explicit v_float64x2(vfloat64m1_t v)
523     {
524         vsetvlmax_e64m1();
525         vse64_v_f64m1(val, v);
526     }
v_float64x2cv::v_float64x2527     v_float64x2(double v0, double v1)
528     {
529         double v[] = {v0, v1};
530         for (int i = 0; i < nlanes; ++i)
531         {
532             val[i] = v[i];
533         }
534     }
operator vfloat64m1_tcv::v_float64x2535     operator vfloat64m1_t() const
536     {
537         vsetvlmax_e64m1();
538         return vle64_v_f64m1(val);
539     }
get0cv::v_float64x2540     double get0() const
541     {
542         return val[0];
543     }
544 
545     double val[2];
546 };
547 #endif
548 
549 
550 //////////// Initial ////////////
551 
552 #define OPENCV_HAL_IMPL_RVV_INIT_INTEGER(_Tpvec, _Tp, width, suffix1, suffix2) \
553 inline v_##_Tpvec v_setzero_##suffix1() \
554 { \
555     vsetvlmax_e##width##m1(); \
556     return v_##_Tpvec(vzero_##suffix2##m1()); \
557 } \
558 inline v_##_Tpvec v_setall_##suffix1(_Tp v) \
559 { \
560     vsetvlmax_e##width##m1(); \
561     return v_##_Tpvec(vmv_v_x_##suffix2##m1(v)); \
562 }
563 
564 OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint8x16, uchar, 8, u8, u8)
565 OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int8x16, schar, 8, s8, i8)
566 OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint16x8, ushort, 16, u16, u16)
567 OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int16x8, short, 16, s16, i16)
568 OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint32x4, unsigned, 32, u32, u32)
569 OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int32x4, int, 32, s32, i32)
570 OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint64x2, uint64, 64, u64, u64)
571 OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int64x2, int64, 64, s64, i64)
572 
573 #define OPENCV_HAL_IMPL_RVV_INIT_FP(_Tpv, _Tp, width, suffix) \
574 inline v_##_Tpv v_setzero_##suffix() \
575 { \
576     vsetvlmax_e##width##m1(); \
577     return v_##_Tpv(vzero_##suffix##m1()); \
578 } \
579 inline v_##_Tpv v_setall_##suffix(_Tp v) \
580 { \
581     vsetvlmax_e##width##m1(); \
582     return v_##_Tpv(vfmv_v_f_##suffix##m1(v)); \
583 }
584 
585 OPENCV_HAL_IMPL_RVV_INIT_FP(float32x4, float, 32, f32)
586 #if CV_SIMD128_64F
587 OPENCV_HAL_IMPL_RVV_INIT_FP(float64x2, double, 64, f64)
588 #endif
589 
590 //////////// Reinterpret ////////////
591 
592 #define OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(_Tpvec, suffix) \
593 inline v_##_Tpvec v_reinterpret_as_##suffix(const v_##_Tpvec& v) { return v; }
594 
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint8x16,u8)595 OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint8x16, u8)
596 OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int8x16, s8)
597 OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint16x8, u16)
598 OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int16x8, s16)
599 OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint32x4, u32)
600 OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int32x4, s32)
601 OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(float32x4, f32)
602 OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint64x2, u64)
603 OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int64x2, s64)
604 #if CV_SIMD128_64F
605 OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(float64x2, f64)
606 #endif
607 
608 #define OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(_Tpvec1, _Tpvec2, _nTpvec1, _nTpvec2, suffix1, suffix2, nsuffix1, nsuffix2, width1, width2) \
609 inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
610 { \
611     vsetvlmax_e##width2##m1(); \
612     return v_##_Tpvec1((_nTpvec1)vle##width2##_v_##nsuffix2##m1(v.val)); \
613 } \
614 inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
615 { \
616     vsetvlmax_e##width1##m1(); \
617     return v_##_Tpvec2((_nTpvec2)vle##width1##_v_##nsuffix1##m1(v.val)); \
618 }
619 
620 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, int8x16, vuint8m1_t, vint8m1_t, u8, s8, u8, i8, 8, 8)
621 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, int16x8, vuint16m1_t, vint16m1_t, u16, s16, u16, i16, 16, 16)
622 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, int32x4, vuint32m1_t, vint32m1_t, u32, s32, u32, i32, 32, 32)
623 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, float32x4, vuint32m1_t, vfloat32m1_t, u32, f32, u32, f32, 32, 32)
624 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int32x4, float32x4, vint32m1_t, vfloat32m1_t, s32, f32, i32, f32, 32, 32)
625 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, int64x2, vuint64m1_t, vint64m1_t, u64, s64, u64, i64, 64, 64)
626 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, uint16x8, vuint8m1_t, vuint16m1_t, u8, u16, u8, u16, 8, 16)
627 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, uint32x4, vuint8m1_t, vuint32m1_t, u8, u32, u8, u32, 8, 32)
628 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, uint64x2, vuint8m1_t, vuint64m1_t, u8, u64, u8, u64, 8, 64)
629 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, uint32x4, vuint16m1_t, vuint32m1_t, u16, u32, u16, u32, 16, 32)
630 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, uint64x2, vuint16m1_t, vuint64m1_t, u16, u64, u16, u64, 16, 64)
631 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, uint64x2, vuint32m1_t, vuint64m1_t, u32, u64, u32, u64, 32, 64)
632 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, int16x8, vint8m1_t, vint16m1_t, s8, s16, i8, i16, 8, 16)
633 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, int32x4, vint8m1_t, vint32m1_t, s8, s32, i8, i32, 8, 32)
634 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, int64x2, vint8m1_t, vint64m1_t, s8, s64, i8, i64, 8, 64)
635 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int16x8, int32x4, vint16m1_t, vint32m1_t, s16, s32, i16, i32, 16, 32)
636 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int16x8, int64x2, vint16m1_t, vint64m1_t, s16, s64, i16, i64, 16, 64)
637 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int32x4, int64x2, vint32m1_t, vint64m1_t, s32, s64, i32, i64, 32, 64)
638 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, int16x8, vuint8m1_t, vint16m1_t, u8, s16, u8, i16, 8, 16)
639 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, int32x4, vuint8m1_t, vint32m1_t, u8, s32, u8, i32, 8, 32)
640 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, int64x2, vuint8m1_t, vint64m1_t, u8, s64, u8, i64, 8, 64)
641 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, int8x16, vuint16m1_t, vint8m1_t, u16, s8, u16, i8, 16, 8)
642 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, int32x4, vuint16m1_t, vint32m1_t, u16, s32, u16, i32, 16, 32)
643 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, int64x2, vuint16m1_t, vint64m1_t, u16, s64, u16, i64, 16, 64)
644 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, int8x16, vuint32m1_t, vint8m1_t, u32, s8, u32, i8, 32, 8)
645 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, int16x8, vuint32m1_t, vint16m1_t, u32, s16, u32, i16, 32, 16)
646 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, int64x2, vuint32m1_t, vint64m1_t, u32, s64, u32, i64, 32, 64)
647 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, int8x16, vuint64m1_t, vint8m1_t, u64, s8, u64, i8, 64, 8)
648 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, int16x8, vuint64m1_t, vint16m1_t, u64, s16, u64, i16, 64, 16)
649 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, int32x4, vuint64m1_t, vint32m1_t, u64, s32, u64, i32, 64, 32)
650 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, float32x4, vuint8m1_t, vfloat32m1_t, u8, f32, u8, f32, 8, 32)
651 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, float32x4, vuint16m1_t, vfloat32m1_t, u16, f32, u16, f32, 16, 32)
652 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, float32x4, vuint64m1_t, vfloat32m1_t, u64, f32, u64, f32, 64, 32)
653 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, float32x4, vint8m1_t, vfloat32m1_t, s8, f32, i8, f32, 8, 32)
654 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int16x8, float32x4, vint16m1_t, vfloat32m1_t, s16, f32, i16, f32, 16, 32)
655 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int64x2, float32x4, vint64m1_t, vfloat32m1_t, s64, f32, i64, f32, 64, 32)
656 #if CV_SIMD128_64F
657 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, float64x2, vuint64m1_t, vfloat64m1_t, u64, f64, u64, f64, 64, 64)
658 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int64x2, float64x2, vint64m1_t, vfloat64m1_t, s64, f64, i64, f64, 64, 64)
659 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, float64x2, vuint8m1_t, vfloat64m1_t, u8, f64, u8, f64, 8, 64)
660 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, float64x2, vuint16m1_t, vfloat64m1_t, u16, f64, u16, f64, 16, 64)
661 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, float64x2, vuint32m1_t, vfloat64m1_t, u32, f64, u32, f64, 32, 64)
662 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, float64x2, vint8m1_t, vfloat64m1_t, s8, f64, i8, f64, 8, 64)
663 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int16x8, float64x2, vint16m1_t, vfloat64m1_t, s16, f64, i16, f64, 16, 64)
664 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int32x4, float64x2, vint32m1_t, vfloat64m1_t, s32, f64, i32, f64, 32, 64)
665 OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(float32x4, float64x2, vfloat32m1_t, vfloat64m1_t, f32, f64, f32, f64, 32, 64)
666 #endif
667 
668 ////////////// Extract //////////////
669 
670 #define OPENCV_HAL_IMPL_RVV_EXTRACT(_Tpvec, _Tp, suffix, width, vmv) \
671 template <int s> \
672 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
673 { \
674     vsetvlmax_e##width##m1(); \
675     return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), a, s), b, _Tpvec::nlanes - s)); \
676 } \
677 template<int i> inline _Tp v_extract_n(_Tpvec v) \
678 { \
679     vsetvlmax_e##width##m1(); \
680     return _Tp(vmv(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), v, i))); \
681 }
682 
683 
684 OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint8x16, uchar, u8, 8, vmv_x_s_u8m1_u8)
685 OPENCV_HAL_IMPL_RVV_EXTRACT(v_int8x16, schar, i8, 8, vmv_x_s_i8m1_i8)
686 OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint16x8, ushort, u16, 16, vmv_x_s_u16m1_u16)
687 OPENCV_HAL_IMPL_RVV_EXTRACT(v_int16x8, short, i16, 16, vmv_x_s_i16m1_i16)
688 OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint32x4, uint, u32, 32, vmv_x_s_u32m1_u32)
689 OPENCV_HAL_IMPL_RVV_EXTRACT(v_int32x4, int, i32, 32, vmv_x_s_i32m1_i32)
690 OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint64x2, uint64, u64, 64, vmv_x_s_u64m1_u64)
691 OPENCV_HAL_IMPL_RVV_EXTRACT(v_int64x2, int64, i64, 64, vmv_x_s_i64m1_i64)
692 OPENCV_HAL_IMPL_RVV_EXTRACT(v_float32x4, float, f32, 32, vfmv_f_s_f32m1_f32)
693 #if CV_SIMD128_64F
694 OPENCV_HAL_IMPL_RVV_EXTRACT(v_float64x2, double, f64, 64, vfmv_f_s_f64m1_f64)
695 #endif
696 
697 ////////////// Load/Store //////////////
698 
699 #define OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(_Tpvec, _nTpvec, _Tp, hvl, width, suffix) \
700 inline _Tpvec v_load(const _Tp* ptr) \
701 { \
702     vsetvlmax_e8m1(); \
703     return _Tpvec((_nTpvec)vle8_v_u8m1((uchar*)ptr)); \
704 } \
705 inline _Tpvec v_load_aligned(const _Tp* ptr) \
706 { \
707     vsetvlmax_e##width##m1(); \
708     return _Tpvec(vle##width##_v_##suffix##m1(ptr)); \
709 } \
710 inline _Tpvec v_load_low(const _Tp* ptr) \
711 { \
712     vsetvl_e##width##m1(hvl); \
713     _Tpvec res = _Tpvec(vle##width##_v_##suffix##m1(ptr)); \
714     vsetvlmax_e##width##m1(); \
715     return res; \
716 } \
717 inline void v_store(_Tp* ptr, const _Tpvec& a) \
718 { \
719     vsetvlmax_e8m1(); \
720     vse8_v_u8m1((uchar*)ptr, vle8_v_u8m1((uchar*)a.val)); \
721 } \
722 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
723 { \
724     vsetvlmax_e##width##m1(); \
725     vse##width##_v_##suffix##m1(ptr, a); \
726 } \
727 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
728 { \
729     vsetvlmax_e##width##m1(); \
730     vse##width##_v_##suffix##m1(ptr, a); \
731 } \
732 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
733 { \
734     vsetvlmax_e##width##m1(); \
735     vse##width##_v_##suffix##m1(ptr, a); \
736 } \
737 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
738 { \
739     _Tp CV_DECL_ALIGNED(32) tmp_ptr[_Tpvec::nlanes] = {0}; \
740     vsetvlmax_e##width##m1(); \
741     vse##width##_v_##suffix##m1(tmp_ptr, a); \
742     for(int i = 0; i < _Tpvec::nlanes/2; ++i) \
743     { \
744         ptr[i] = tmp_ptr[i]; \
745     } \
746 } \
747 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
748 { \
749     _Tp CV_DECL_ALIGNED(32) tmp_ptr[_Tpvec::nlanes] = {0}; \
750     vsetvlmax_e##width##m1(); \
751     vse##width##_v_##suffix##m1(tmp_ptr, a); \
752     for(int i = 0; i < _Tpvec::nlanes/2; ++i) \
753     { \
754         ptr[i] = tmp_ptr[i+_Tpvec::nlanes/2]; \
755     } \
756 }
757 
758 OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint8x16, vuint8m1_t, uchar, 8, 8, u8)
759 OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int8x16, vint8m1_t, schar, 8, 8, i8)
760 OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint16x8, vuint16m1_t, ushort, 4, 16, u16)
761 OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int16x8, vint16m1_t, short, 4, 16, i16)
762 OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint32x4, vuint32m1_t, unsigned, 2, 32, u32)
763 OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int32x4, vint32m1_t, int, 2, 32, i32)
764 OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint64x2, vuint64m1_t, uint64, 1, 64, u64)
765 OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int64x2, vint64m1_t, int64, 1, 64, i64)
766 OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32x4, vfloat32m1_t, float, 2, 32, f32)
767 #if CV_SIMD128_64F
768 OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64x2, vfloat64m1_t, double, 1, 64, f64)
769 #endif
770 
771 inline v_int8x16 v_load_halves(const schar* ptr0, const schar* ptr1)
772 {
773     schar CV_DECL_ALIGNED(32) elems[16] =
774     {
775         ptr0[0], ptr0[1], ptr0[2], ptr0[3], ptr0[4], ptr0[5], ptr0[6], ptr0[7],
776         ptr1[0], ptr1[1], ptr1[2], ptr1[3], ptr1[4], ptr1[5], ptr1[6], ptr1[7]
777     };
778     vsetvlmax_e8m1();
779     return v_int8x16(vle8_v_i8m1(elems));
780 }
v_load_halves(const uchar * ptr0,const uchar * ptr1)781 inline v_uint8x16 v_load_halves(const uchar* ptr0, const uchar* ptr1) { return v_reinterpret_as_u8(v_load_halves((schar*)ptr0, (schar*)ptr1)); }
782 
v_load_halves(const short * ptr0,const short * ptr1)783 inline v_int16x8 v_load_halves(const short* ptr0, const short* ptr1)
784 {
785     short CV_DECL_ALIGNED(32) elems[8] =
786     {
787         ptr0[0], ptr0[1], ptr0[2], ptr0[3], ptr1[0], ptr1[1], ptr1[2], ptr1[3]
788     };
789     vsetvlmax_e16m1();
790     return v_int16x8(vle16_v_i16m1(elems));
791 }
v_load_halves(const ushort * ptr0,const ushort * ptr1)792 inline v_uint16x8 v_load_halves(const ushort* ptr0, const ushort* ptr1) { return v_reinterpret_as_u16(v_load_halves((short*)ptr0, (short*)ptr1)); }
793 
v_load_halves(const int * ptr0,const int * ptr1)794 inline v_int32x4 v_load_halves(const int* ptr0, const int* ptr1)
795 {
796     int CV_DECL_ALIGNED(32) elems[4] =
797     {
798         ptr0[0], ptr0[1], ptr1[0], ptr1[1]
799     };
800     vsetvlmax_e32m1();
801     return v_int32x4(vle32_v_i32m1(elems));
802 }
v_load_halves(const float * ptr0,const float * ptr1)803 inline v_float32x4 v_load_halves(const float* ptr0, const float* ptr1)
804 {
805     float CV_DECL_ALIGNED(32) elems[4] =
806     {
807         ptr0[0], ptr0[1], ptr1[0], ptr1[1]
808     };
809     vsetvlmax_e32m1();
810     return v_float32x4(vle32_v_f32m1(elems));
811 }
v_load_halves(const unsigned * ptr0,const unsigned * ptr1)812 inline v_uint32x4 v_load_halves(const unsigned* ptr0, const unsigned* ptr1) { return v_reinterpret_as_u32(v_load_halves((int*)ptr0, (int*)ptr1)); }
813 
v_load_halves(const int64 * ptr0,const int64 * ptr1)814 inline v_int64x2 v_load_halves(const int64* ptr0, const int64* ptr1)
815 {
816     int64 CV_DECL_ALIGNED(32) elems[2] =
817     {
818         ptr0[0], ptr1[0]
819     };
820     vsetvlmax_e64m1();
821     return v_int64x2(vle64_v_i64m1(elems));
822 }
v_load_halves(const uint64 * ptr0,const uint64 * ptr1)823 inline v_uint64x2 v_load_halves(const uint64* ptr0, const uint64* ptr1) { return v_reinterpret_as_u64(v_load_halves((int64*)ptr0, (int64*)ptr1)); }
824 
825 #if CV_SIMD128_64F
v_load_halves(const double * ptr0,const double * ptr1)826 inline v_float64x2 v_load_halves(const double* ptr0, const double* ptr1)
827 {
828     double CV_DECL_ALIGNED(32) elems[2] =
829     {
830         ptr0[0], ptr1[0]
831     };
832     vsetvlmax_e64m1();
833     return v_float64x2(vle64_v_f64m1(elems));
834 }
835 #endif
836 
837 
838 ////////////// Lookup table access ////////////////////
839 
v_lut(const schar * tab,const int * idx)840 inline v_int8x16 v_lut(const schar* tab, const int* idx)
841 {
842     schar CV_DECL_ALIGNED(32) elems[16] =
843     {
844         tab[idx[ 0]],
845         tab[idx[ 1]],
846         tab[idx[ 2]],
847         tab[idx[ 3]],
848         tab[idx[ 4]],
849         tab[idx[ 5]],
850         tab[idx[ 6]],
851         tab[idx[ 7]],
852         tab[idx[ 8]],
853         tab[idx[ 9]],
854         tab[idx[10]],
855         tab[idx[11]],
856         tab[idx[12]],
857         tab[idx[13]],
858         tab[idx[14]],
859         tab[idx[15]]
860     };
861     vsetvlmax_e8m1();
862     return v_int8x16(vle8_v_i8m1(elems));
863 }
v_lut_pairs(const schar * tab,const int * idx)864 inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
865 {
866     schar CV_DECL_ALIGNED(32) elems[16] =
867     {
868         tab[idx[0]],
869         tab[idx[0] + 1],
870         tab[idx[1]],
871         tab[idx[1] + 1],
872         tab[idx[2]],
873         tab[idx[2] + 1],
874         tab[idx[3]],
875         tab[idx[3] + 1],
876         tab[idx[4]],
877         tab[idx[4] + 1],
878         tab[idx[5]],
879         tab[idx[5] + 1],
880         tab[idx[6]],
881         tab[idx[6] + 1],
882         tab[idx[7]],
883         tab[idx[7] + 1]
884     };
885     vsetvlmax_e8m1();
886     return v_int8x16(vle8_v_i8m1(elems));
887 }
v_lut_quads(const schar * tab,const int * idx)888 inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
889 {
890     schar CV_DECL_ALIGNED(32) elems[16] =
891     {
892         tab[idx[0]],
893         tab[idx[0] + 1],
894         tab[idx[0] + 2],
895         tab[idx[0] + 3],
896         tab[idx[1]],
897         tab[idx[1] + 1],
898         tab[idx[1] + 2],
899         tab[idx[1] + 3],
900         tab[idx[2]],
901         tab[idx[2] + 1],
902         tab[idx[2] + 2],
903         tab[idx[2] + 3],
904         tab[idx[3]],
905         tab[idx[3] + 1],
906         tab[idx[3] + 2],
907         tab[idx[3] + 3]
908     };
909     vsetvlmax_e8m1();
910     return v_int8x16(vle8_v_i8m1(elems));
911 }
v_lut(const uchar * tab,const int * idx)912 inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
v_lut_pairs(const uchar * tab,const int * idx)913 inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
v_lut_quads(const uchar * tab,const int * idx)914 inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
915 
v_lut(const short * tab,const int * idx)916 inline v_int16x8 v_lut(const short* tab, const int* idx)
917 {
918     short CV_DECL_ALIGNED(32) elems[8] =
919     {
920         tab[idx[0]],
921         tab[idx[1]],
922         tab[idx[2]],
923         tab[idx[3]],
924         tab[idx[4]],
925         tab[idx[5]],
926         tab[idx[6]],
927         tab[idx[7]]
928     };
929     vsetvlmax_e16m1();
930     return v_int16x8(vle16_v_i16m1(elems));
931 }
v_lut_pairs(const short * tab,const int * idx)932 inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
933 {
934     short CV_DECL_ALIGNED(32) elems[8] =
935     {
936         tab[idx[0]],
937         tab[idx[0] + 1],
938         tab[idx[1]],
939         tab[idx[1] + 1],
940         tab[idx[2]],
941         tab[idx[2] + 1],
942         tab[idx[3]],
943         tab[idx[3] + 1]
944     };
945     vsetvlmax_e16m1();
946     return v_int16x8(vle16_v_i16m1(elems));
947 }
v_lut_quads(const short * tab,const int * idx)948 inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
949 {
950     short CV_DECL_ALIGNED(32) elems[8] =
951     {
952         tab[idx[0]],
953         tab[idx[0] + 1],
954         tab[idx[0] + 2],
955         tab[idx[0] + 3],
956         tab[idx[1]],
957         tab[idx[1] + 1],
958         tab[idx[1] + 2],
959         tab[idx[1] + 3]
960     };
961     vsetvlmax_e16m1();
962     return v_int16x8(vle16_v_i16m1(elems));
963 }
v_lut(const ushort * tab,const int * idx)964 inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
v_lut_pairs(const ushort * tab,const int * idx)965 inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
v_lut_quads(const ushort * tab,const int * idx)966 inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
967 
v_lut(const int * tab,const int * idx)968 inline v_int32x4 v_lut(const int* tab, const int* idx)
969 {
970     int CV_DECL_ALIGNED(32) elems[4] =
971     {
972         tab[idx[0]],
973         tab[idx[1]],
974         tab[idx[2]],
975         tab[idx[3]]
976     };
977     vsetvlmax_e32m1();
978     return v_int32x4(vle32_v_i32m1(elems));
979 }
v_lut_pairs(const int * tab,const int * idx)980 inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
981 {
982     int CV_DECL_ALIGNED(32) elems[4] =
983     {
984         tab[idx[0]],
985         tab[idx[0] + 1],
986         tab[idx[1]],
987         tab[idx[1] + 1]
988     };
989     vsetvlmax_e32m1();
990     return v_int32x4(vle32_v_i32m1(elems));
991 }
v_lut_quads(const int * tab,const int * idx)992 inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
993 {
994     vsetvlmax_e32m1();
995     return v_int32x4(vle32_v_i32m1(tab + idx[0]));
996 }
997 
v_lut(const unsigned * tab,const int * idx)998 inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
v_lut_pairs(const unsigned * tab,const int * idx)999 inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
v_lut_quads(const unsigned * tab,const int * idx)1000 inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
1001 
v_lut(const int64_t * tab,const int * idx)1002 inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
1003 {
1004     int64_t CV_DECL_ALIGNED(32) elems[2] =
1005     {
1006         tab[idx[0]],
1007         tab[idx[1]]
1008     };
1009     vsetvlmax_e64m1();
1010     return v_int64x2(vle64_v_i64m1(elems));
1011 }
v_lut_pairs(const int64 * tab,const int * idx)1012 inline v_int64x2 v_lut_pairs(const int64* tab, const int* idx)
1013 {
1014     vsetvlmax_e64m1();
1015     return v_int64x2(vle64_v_i64m1(tab + idx[0]));
1016 }
v_lut(const uint64 * tab,const int * idx)1017 inline v_uint64x2 v_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
v_lut_pairs(const uint64 * tab,const int * idx)1018 inline v_uint64x2 v_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
1019 
v_lut(const float * tab,const int * idx)1020 inline v_float32x4 v_lut(const float* tab, const int* idx)
1021 {
1022     float CV_DECL_ALIGNED(32) elems[4] =
1023     {
1024         tab[idx[0]],
1025         tab[idx[1]],
1026         tab[idx[2]],
1027         tab[idx[3]]
1028     };
1029     vsetvlmax_e32m1();
1030     return v_float32x4(vle32_v_f32m1(elems));
1031 }
v_lut_pairs(const float * tab,const int * idx)1032 inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
1033 {
1034     float CV_DECL_ALIGNED(32) elems[4] =
1035     {
1036         tab[idx[0]],
1037         tab[idx[0] + 1],
1038         tab[idx[1]],
1039         tab[idx[1] + 1]
1040     };
1041     vsetvlmax_e32m1();
1042     return v_float32x4(vle32_v_f32m1(elems));
1043 }
v_lut_quads(const float * tab,const int * idx)1044 inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
1045 {
1046     vsetvlmax_e32m1();
1047     return v_float32x4(vle32_v_f32m1(tab + idx[0]));
1048 }
1049 
v_lut(const int * tab,const v_int32x4 & idxvec)1050 inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
1051 {
1052     int CV_DECL_ALIGNED(32) elems[4] =
1053     {
1054         tab[v_extract_n<0>(idxvec)],
1055         tab[v_extract_n<1>(idxvec)],
1056         tab[v_extract_n<2>(idxvec)],
1057         tab[v_extract_n<3>(idxvec)]
1058     };
1059     vsetvlmax_e32m1();
1060     return v_int32x4(vle32_v_i32m1(elems));
1061 }
1062 
v_lut(const unsigned * tab,const v_int32x4 & idxvec)1063 inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
1064 {
1065     unsigned CV_DECL_ALIGNED(32) elems[4] =
1066     {
1067         tab[v_extract_n<0>(idxvec)],
1068         tab[v_extract_n<1>(idxvec)],
1069         tab[v_extract_n<2>(idxvec)],
1070         tab[v_extract_n<3>(idxvec)]
1071     };
1072     vsetvlmax_e32m1();
1073     return v_uint32x4(vle32_v_u32m1(elems));
1074 }
1075 
v_lut(const float * tab,const v_int32x4 & idxvec)1076 inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
1077 {
1078     float CV_DECL_ALIGNED(32) elems[4] =
1079     {
1080         tab[v_extract_n<0>(idxvec)],
1081         tab[v_extract_n<1>(idxvec)],
1082         tab[v_extract_n<2>(idxvec)],
1083         tab[v_extract_n<3>(idxvec)]
1084     };
1085     vsetvlmax_e32m1();
1086     return v_float32x4(vle32_v_f32m1(elems));
1087 }
1088 
v_lut_deinterleave(const float * tab,const v_int32x4 & idxvec,v_float32x4 & x,v_float32x4 & y)1089 inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
1090 {
1091     int CV_DECL_ALIGNED(32) idx[4];
1092     v_store_aligned(idx, idxvec);
1093 
1094     x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
1095     y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]);
1096 }
1097 
1098 #if CV_SIMD128_64F
v_lut(const double * tab,const int * idx)1099 inline v_float64x2 v_lut(const double* tab, const int* idx)
1100 {
1101     double CV_DECL_ALIGNED(32) elems[2] =
1102     {
1103         tab[idx[0]],
1104         tab[idx[1]]
1105     };
1106     vsetvlmax_e64m1();
1107     return v_float64x2(vle64_v_f64m1(elems));
1108 }
1109 
v_lut_pairs(const double * tab,const int * idx)1110 inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
1111 {
1112     vsetvlmax_e64m1();
1113     return v_float64x2(vle64_v_f64m1(tab + idx[0]));
1114 }
1115 
v_lut(const double * tab,const v_int32x4 & idxvec)1116 inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
1117 {
1118     double CV_DECL_ALIGNED(32) elems[2] =
1119     {
1120         tab[v_extract_n<0>(idxvec)],
1121         tab[v_extract_n<1>(idxvec)]
1122     };
1123     vsetvlmax_e64m1();
1124     return v_float64x2(vle64_v_f64m1(elems));
1125 }
1126 
v_lut_deinterleave(const double * tab,const v_int32x4 & idxvec,v_float64x2 & x,v_float64x2 & y)1127 inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
1128 {
1129     int CV_DECL_ALIGNED(32) idx[4] = {0};
1130     v_store_aligned(idx, idxvec);
1131 
1132     x = v_float64x2(tab[idx[0]], tab[idx[1]]);
1133     y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
1134 }
1135 #endif
1136 
1137 ////////////// Pack boolean ////////////////////
1138 
v_pack_b(const v_uint16x8 & a,const v_uint16x8 & b)1139 inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
1140 {
1141     ushort CV_DECL_ALIGNED(32) ptr[16] = {0};
1142     v_store(ptr, a);
1143     v_store(ptr + 8, b);
1144     vsetvlmax_e8m1();
1145     return v_uint8x16(vnsrl_wx_u8m1(vle16_v_u16m2(ptr), 0));
1146 }
1147 
v_pack_b(const v_uint32x4 & a,const v_uint32x4 & b,const v_uint32x4 & c,const v_uint32x4 & d)1148 inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
1149                            const v_uint32x4& c, const v_uint32x4& d)
1150 {
1151     unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
1152     v_store(ptr, a);
1153     v_store(ptr + 4, b);
1154     v_store(ptr + 8, c);
1155     v_store(ptr + 12, d);
1156     vsetvlmax_e8m1();
1157     return v_uint8x16(vnsrl_wx_u8m1(vnsrl_wx_u16m2(vle32_v_u32m4(ptr), 0), 0));
1158 }
1159 
v_pack_b(const v_uint64x2 & a,const v_uint64x2 & b,const v_uint64x2 & c,const v_uint64x2 & d,const v_uint64x2 & e,const v_uint64x2 & f,const v_uint64x2 & g,const v_uint64x2 & h)1160 inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
1161                            const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
1162                            const v_uint64x2& g, const v_uint64x2& h)
1163 {
1164     uint64 CV_DECL_ALIGNED(32) ptr[16] = {0};
1165     v_store(ptr, a);
1166     v_store(ptr + 2, b);
1167     v_store(ptr + 4, c);
1168     v_store(ptr + 6, d);
1169     v_store(ptr + 8, e);
1170     v_store(ptr + 10, f);
1171     v_store(ptr + 12, g);
1172     v_store(ptr + 14, h);
1173     vsetvlmax_e8m1();
1174     return v_uint8x16(vnsrl_wx_u8m1(vnsrl_wx_u16m2(vnsrl_wx_u32m4(vle64_v_u64m8(ptr), 0), 0), 0));
1175 }
1176 
1177 ////////////// Arithmetics //////////////
1178 #define OPENCV_HAL_IMPL_RVV_BIN_OP(bin_op, _Tpvec, intrin, width) \
1179 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
1180 { \
1181     vsetvlmax_e##width##m1(); \
1182     return _Tpvec(intrin(a, b)); \
1183 } \
1184 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
1185 { \
1186     vsetvlmax_e##width##m1(); \
1187     a = _Tpvec(intrin(a, b)); \
1188     return a; \
1189 }
1190 
1191 OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint8x16, vsaddu_vv_u8m1, 8)
1192 OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint8x16, vssubu_vv_u8m1, 8)
1193 OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint8x16, vdivu_vv_u8m1, 8)
1194 OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int8x16, vsadd_vv_i8m1, 8)
1195 OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int8x16, vssub_vv_i8m1, 8)
1196 OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int8x16, vdiv_vv_i8m1, 8)
1197 OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint16x8, vsaddu_vv_u16m1, 16)
1198 OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint16x8, vssubu_vv_u16m1, 16)
1199 OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint16x8, vdivu_vv_u16m1, 16)
1200 OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int16x8, vsadd_vv_i16m1, 16)
1201 OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int16x8, vssub_vv_i16m1, 16)
1202 OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int16x8, vdiv_vv_i16m1, 16)
1203 OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint32x4, vadd_vv_u32m1, 32)
1204 OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint32x4, vsub_vv_u32m1, 32)
1205 OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_uint32x4, vmul_vv_u32m1, 32)
1206 OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint32x4, vdivu_vv_u32m1, 32)
1207 OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int32x4, vadd_vv_i32m1, 32)
1208 OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int32x4, vsub_vv_i32m1, 32)
1209 OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_int32x4, vmul_vv_i32m1, 32)
1210 OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int32x4, vdiv_vv_i32m1, 32)
1211 OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_float32x4, vfadd_vv_f32m1, 32)
1212 OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_float32x4, vfsub_vv_f32m1, 32)
1213 OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_float32x4, vfmul_vv_f32m1, 32)
1214 OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_float32x4, vfdiv_vv_f32m1, 32)
1215 OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint64x2, vadd_vv_u64m1, 64)
1216 OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint64x2, vsub_vv_u64m1, 64)
1217 OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_uint64x2, vmul_vv_u64m1, 64)
1218 OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint64x2, vdivu_vv_u64m1, 64)
1219 OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int64x2, vadd_vv_i64m1, 64)
1220 OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int64x2, vsub_vv_i64m1, 64)
1221 OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_int64x2, vmul_vv_i64m1, 64)
1222 OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int64x2, vdiv_vv_i64m1, 64)
1223 #if CV_SIMD128_64F
1224 OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_float64x2, vfadd_vv_f64m1, 64)
1225 OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_float64x2, vfsub_vv_f64m1, 64)
1226 OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_float64x2, vfmul_vv_f64m1, 64)
1227 OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_float64x2, vfdiv_vv_f64m1, 64)
1228 #endif
1229 
1230 
1231 ////////////// Bitwise logic //////////////
1232 
1233 #define OPENCV_HAL_IMPL_RVV_LOGIC_OP(_Tpvec, suffix, width) \
1234 OPENCV_HAL_IMPL_RVV_BIN_OP(&, _Tpvec, vand_vv_##suffix##m1, width) \
1235 OPENCV_HAL_IMPL_RVV_BIN_OP(|, _Tpvec, vor_vv_##suffix##m1, width) \
1236 OPENCV_HAL_IMPL_RVV_BIN_OP(^, _Tpvec, vxor_vv_##suffix##m1, width) \
1237 inline _Tpvec operator ~ (const _Tpvec& a) \
1238 { \
1239     vsetvlmax_e##width##m1(); \
1240     return _Tpvec(vnot_v_##suffix##m1(a)); \
1241 }
1242 
1243 OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint8x16, u8, 8)
1244 OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int8x16, i8, 8)
1245 OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint16x8, u16, 16)
1246 OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int16x8, i16, 16)
1247 OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint32x4, u32, 32)
1248 OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int32x4, i32, 32)
1249 OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint64x2, u64, 64)
1250 OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int64x2, i64, 64)
1251 
1252 #define OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(bin_op, intrin) \
1253 inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
1254 { \
1255     vsetvlmax_e32m1(); \
1256     return v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a), vreinterpret_v_f32m1_i32m1(b)))); \
1257 } \
1258 inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
1259 { \
1260     vsetvlmax_e32m1(); \
1261     a = v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a), vreinterpret_v_f32m1_i32m1(b)))); \
1262     return a; \
1263 }
1264 
1265 OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(&, vand_vv_i32m1)
1266 OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(|, vor_vv_i32m1)
1267 OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(^, vxor_vv_i32m1)
1268 
operator ~(const v_float32x4 & a)1269 inline v_float32x4 operator ~ (const v_float32x4& a)
1270 {
1271     vsetvlmax_e32m1();
1272     return v_float32x4(vreinterpret_v_i32m1_f32m1(vnot_v_i32m1(vreinterpret_v_f32m1_i32m1(a))));
1273 }
1274 
1275 #if CV_SIMD128_64F
1276 #define OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(bin_op, intrin) \
1277 inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
1278 { \
1279     vsetvlmax_e64m1(); \
1280     return v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a), vreinterpret_v_f64m1_i64m1(b)))); \
1281 } \
1282 inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
1283 { \
1284     vsetvlmax_e64m1(); \
1285     a = v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a), vreinterpret_v_f64m1_i64m1(b)))); \
1286     return a; \
1287 }
1288 
1289 OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(&, vand_vv_i64m1)
1290 OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(|, vor_vv_i64m1)
1291 OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(^, vxor_vv_i64m1)
1292 
operator ~(const v_float64x2 & a)1293 inline v_float64x2 operator ~ (const v_float64x2& a)
1294 {
1295     vsetvlmax_e64m1();
1296     return v_float64x2(vreinterpret_v_i64m1_f64m1(vnot_v_i64m1(vreinterpret_v_f64m1_i64m1(a))));
1297 }
1298 #endif
1299 
1300 ////////////// Bitwise shifts //////////////
1301 
1302 #define OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(_Tpvec, suffix, width) \
1303 inline _Tpvec operator << (const _Tpvec& a, int n) \
1304 { \
1305     vsetvlmax_e##width##m1(); \
1306     return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n))); \
1307 } \
1308 inline _Tpvec operator >> (const _Tpvec& a, int n) \
1309 { \
1310     vsetvlmax_e##width##m1(); \
1311     return _Tpvec(vsrl_vx_##suffix##m1(a, uint8_t(n))); \
1312 } \
1313 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
1314 { \
1315     vsetvlmax_e##width##m1(); \
1316     return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n))); \
1317 } \
1318 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
1319 { \
1320     vsetvlmax_e##width##m1(); \
1321     return _Tpvec(vsrl_vx_##suffix##m1(a, uint8_t(n))); \
1322 }
1323 
1324 #define OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(_Tpvec, suffix, width) \
1325 inline _Tpvec operator << (const _Tpvec& a, int n) \
1326 { \
1327     vsetvlmax_e##width##m1(); \
1328     return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n))); \
1329 } \
1330 inline _Tpvec operator >> (const _Tpvec& a, int n) \
1331 { \
1332     vsetvlmax_e##width##m1(); \
1333     return _Tpvec(vsra_vx_##suffix##m1(a, uint8_t(n))); \
1334 } \
1335 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
1336 { \
1337     vsetvlmax_e##width##m1(); \
1338     return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n))); \
1339 } \
1340 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
1341 { \
1342     vsetvlmax_e##width##m1(); \
1343     return _Tpvec(vsra_vx_##suffix##m1(a, uint8_t(n))); \
1344 }
1345 
1346 OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint8x16, u8, 8)
1347 OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint16x8, u16, 16)
1348 OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint32x4, u32, 32)
1349 OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint64x2, u64, 64)
1350 OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int8x16, i8, 8)
1351 OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int16x8, i16, 16)
1352 OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int32x4, i32, 32)
1353 OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int64x2, i64, 64)
1354 
1355 
1356 ////////////// Comparison //////////////
1357 
1358 #define OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, op, intrin, suffix, width) \
1359 inline _Tpvec operator op (const _Tpvec& a, const _Tpvec& b) \
1360 { \
1361     vsetvlmax_e##width##m1(); \
1362     return _Tpvec(vmerge_vxm_##suffix##m1(intrin(a, b), vzero_##suffix##m1(), 1)); \
1363 }
1364 
1365 #define OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, op, intrin, suffix, width) \
1366 inline _Tpvec operator op (const _Tpvec& a, const _Tpvec& b) \
1367 { \
1368     vsetvlmax_e##width##m1(); \
1369     return _Tpvec(vfmerge_vfm_##suffix##m1(intrin(a, b), vzero_##suffix##m1(), 1)); \
1370 }
1371 
1372 #define OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(_Tpvec, suffix, width) \
1373 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ==, vmseq_vv_##suffix##m1_b##width, suffix, width) \
1374 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, !=, vmsne_vv_##suffix##m1_b##width, suffix, width) \
1375 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <, vmsltu_vv_##suffix##m1_b##width, suffix, width) \
1376 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >, vmsgtu_vv_##suffix##m1_b##width, suffix, width) \
1377 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <=, vmsleu_vv_##suffix##m1_b##width, suffix, width) \
1378 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >=, vmsgeu_vv_##suffix##m1_b##width, suffix, width)
1379 
1380 #define OPENCV_HAL_IMPL_RVV_SIGNED_CMP(_Tpvec, suffix, width) \
1381 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ==, vmseq_vv_##suffix##m1_b##width, suffix, width) \
1382 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, !=, vmsne_vv_##suffix##m1_b##width, suffix, width) \
1383 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <, vmslt_vv_##suffix##m1_b##width, suffix, width) \
1384 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >, vmsgt_vv_##suffix##m1_b##width, suffix, width) \
1385 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <=, vmsle_vv_##suffix##m1_b##width, suffix, width) \
1386 OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >=, vmsge_vv_##suffix##m1_b##width, suffix, width)
1387 
1388 #define OPENCV_HAL_IMPL_RVV_FLOAT_CMP(_Tpvec, suffix, width) \
1389 OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, ==, vmfeq_vv_##suffix##m1_b##width, suffix, width) \
1390 OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, !=, vmfne_vv_##suffix##m1_b##width, suffix, width) \
1391 OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, <, vmflt_vv_##suffix##m1_b##width, suffix, width) \
1392 OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, >, vmfgt_vv_##suffix##m1_b##width, suffix, width) \
1393 OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, <=, vmfle_vv_##suffix##m1_b##width, suffix, width) \
1394 OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, >=, vmfge_vv_##suffix##m1_b##width, suffix, width)
1395 
1396 
1397 OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint8x16, u8, 8)
1398 OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint16x8, u16, 16)
1399 OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint32x4, u32, 32)
1400 OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint64x2, u64, 64)
1401 OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int8x16, i8, 8)
1402 OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int16x8, i16, 16)
1403 OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int32x4, i32, 32)
1404 OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int64x2, i64, 64)
1405 OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float32x4, f32, 32)
1406 #if CV_SIMD128_64F
1407 OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float64x2, f64, 64)
1408 #endif
1409 
v_not_nan(const v_float32x4 & a)1410 inline v_float32x4 v_not_nan(const v_float32x4& a)
1411 { return a == a; }
1412 
1413 #if CV_SIMD128_64F
v_not_nan(const v_float64x2 & a)1414 inline v_float64x2 v_not_nan(const v_float64x2& a)
1415 { return a == a; }
1416 #endif
1417 
1418 ////////////// Min/Max //////////////
1419 
1420 #define OPENCV_HAL_IMPL_RVV_BIN_FUNC(_Tpvec, func, intrin, width) \
1421 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
1422 { \
1423     vsetvlmax_e##width##m1(); \
1424     return _Tpvec(intrin(a, b)); \
1425 }
1426 
1427 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_min, vminu_vv_u8m1, 8)
1428 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_max, vmaxu_vv_u8m1, 8)
1429 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_min, vmin_vv_i8m1, 8)
1430 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_max, vmax_vv_i8m1, 8)
1431 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_min, vminu_vv_u16m1, 16)
1432 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_max, vmaxu_vv_u16m1, 16)
1433 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_min, vmin_vv_i16m1, 16)
1434 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_max, vmax_vv_i16m1, 16)
1435 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32x4, v_min, vminu_vv_u32m1, 32)
1436 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32x4, v_max, vmaxu_vv_u32m1, 32)
1437 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32x4, v_min, vmin_vv_i32m1, 32)
1438 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32x4, v_max, vmax_vv_i32m1, 32)
1439 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32x4, v_min, vfmin_vv_f32m1, 32)
1440 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32x4, v_max, vfmax_vv_f32m1, 32)
1441 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64x2, v_min, vminu_vv_u64m1, 64)
1442 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64x2, v_max, vmaxu_vv_u64m1, 64)
1443 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64x2, v_min, vmin_vv_i64m1, 64)
1444 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64x2, v_max, vmax_vv_i64m1, 64)
1445 #if CV_SIMD128_64F
1446 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64x2, v_min, vfmin_vv_f64m1, 64)
1447 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64x2, v_max, vfmax_vv_f64m1, 64)
1448 #endif
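
// A minimal usage sketch (illustrative only, not part of this header): clamping every
// lane to [0, 1] with the v_min/v_max wrappers above. The helper name clamp01 is
// hypothetical.
//
//     inline v_float32x4 clamp01(const v_float32x4& x)
//     {
//         // max with 0 first, then min with 1, element-wise
//         return v_min(v_max(x, v_setall_f32(0.0f)), v_setall_f32(1.0f));
//     }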
1449 
1450 ////////////// Arithmetics wrap //////////////
1451 
1452 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 8)
1453 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_add_wrap, vadd_vv_i8m1, 8)
1454 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_add_wrap, vadd_vv_u16m1, 16)
1455 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_add_wrap, vadd_vv_i16m1, 16)
1456 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_sub_wrap, vsub_vv_u8m1, 8)
1457 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_sub_wrap, vsub_vv_i8m1, 8)
1458 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_sub_wrap, vsub_vv_u16m1, 16)
1459 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_sub_wrap, vsub_vv_i16m1, 16)
1460 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_mul_wrap, vmul_vv_u8m1, 8)
1461 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_mul_wrap, vmul_vv_i8m1, 8)
1462 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_mul_wrap, vmul_vv_u16m1, 16)
1463 OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 16)
1464 
1465 ////////////// Reduce //////////////
1466 
1467 #define OPENCV_HAL_IMPL_RVV_REDUCE_SUM(_Tpvec, _wTpvec, _nwTpvec, scalartype, suffix, wsuffix, wwidth, red) \
1468 inline scalartype v_reduce_sum(const _Tpvec& a)  \
1469 { \
1470     vsetvlmax_e##wwidth##m1(); \
1471     _nwTpvec zero = vzero_##wsuffix##m1(); \
1472     _nwTpvec res = vzero_##wsuffix##m1(); \
1473     res = v##red##_vs_##suffix##m1_##wsuffix##m1(res, a, zero); \
1474     return (scalartype)(_wTpvec(res).get0()); \
1475 }
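
// The reductions above fold every lane of `a` into element 0 of the result vector,
// seeded with a zero accumulator; get0() then reads that element back as a scalar.
// The 8-, 16- and 32-bit integer variants use the widening forms (vwredsum/vwredsumu),
// so the sum is accumulated in the next wider lane type and cannot overflow.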
1476 
1477 OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint8x16, v_uint16x8, vuint16m1_t, unsigned, u8, u16, 16, wredsumu)
1478 OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int8x16, v_int16x8, vint16m1_t, int, i8, i16, 16, wredsum)
1479 OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint16x8, v_uint32x4, vuint32m1_t, unsigned, u16, u32, 32, wredsumu)
1480 OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int16x8, v_int32x4, vint32m1_t, int, i16, i32, 32, wredsum)
1481 OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint32x4, v_uint64x2, vuint64m1_t, unsigned, u32, u64, 64, wredsumu)
1482 OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int32x4, v_int64x2, vint64m1_t, int, i32, i64, 64, wredsum)
1483 OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_float32x4, v_float32x4, vfloat32m1_t, float, f32, f32, 32, fredsum)
1484 OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint64x2, v_uint64x2, vuint64m1_t, uint64, u64, u64, 64, redsum)
1485 OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int64x2, v_int64x2, vint64m1_t, int64, i64, i64, 64, redsum)
1486 #if CV_SIMD128_64F
1487 OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_float64x2, v_float64x2, vfloat64m1_t, double, f64, f64, 64, fredsum)
1488 #endif
1489 
1490 
1491 #define OPENCV_HAL_IMPL_RVV_REDUCE(_Tpvec, func, scalartype, suffix, width, red) \
1492 inline scalartype v_reduce_##func(const _Tpvec& a)  \
1493 { \
1494     vsetvlmax_e##width##m1(); \
1495     _Tpvec res = _Tpvec(v##red##_vs_##suffix##m1_##suffix##m1(a, a, a)); \
1496     return scalartype(res.get0()); \
1497 }
1498 
1499 OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8x16, min, uchar, u8, 8, redminu)
1500 OPENCV_HAL_IMPL_RVV_REDUCE(v_int8x16, min, schar, i8, 8, redmin)
1501 OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16x8, min, ushort, u16, 16, redminu)
1502 OPENCV_HAL_IMPL_RVV_REDUCE(v_int16x8, min, short, i16, 16, redmin)
1503 OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32x4, min, unsigned, u32, 32, redminu)
1504 OPENCV_HAL_IMPL_RVV_REDUCE(v_int32x4, min, int, i32, 32, redmin)
1505 OPENCV_HAL_IMPL_RVV_REDUCE(v_float32x4, min, float, f32, 32, fredmin)
1506 OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8x16, max, uchar, u8, 8, redmaxu)
1507 OPENCV_HAL_IMPL_RVV_REDUCE(v_int8x16, max, schar, i8, 8, redmax)
1508 OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16x8, max, ushort, u16, 16, redmaxu)
1509 OPENCV_HAL_IMPL_RVV_REDUCE(v_int16x8, max, short, i16, 16, redmax)
1510 OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32x4, max, unsigned, u32, 32, redmaxu)
1511 OPENCV_HAL_IMPL_RVV_REDUCE(v_int32x4, max, int, i32, 32, redmax)
1512 OPENCV_HAL_IMPL_RVV_REDUCE(v_float32x4, max, float, f32, 32, fredmax)
1513 
1514 
1515 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
1516                                  const v_float32x4& c, const v_float32x4& d)
1517 {
1518     float CV_DECL_ALIGNED(32) elems[4] =
1519     {
1520         v_reduce_sum(a),
1521         v_reduce_sum(b),
1522         v_reduce_sum(c),
1523         v_reduce_sum(d)
1524     };
1525     vsetvlmax_e32m1();
1526     return v_float32x4(vle32_v_f32m1(elems));
1527 }
1528 
1529 ////////////// Square-Root //////////////
1530 
1531 inline v_float32x4 v_sqrt(const v_float32x4& x)
1532 {
1533     vsetvlmax_e32m1();
1534     return v_float32x4(vfsqrt_v_f32m1(x));
1535 }
1536 
1537 inline v_float32x4 v_invsqrt(const v_float32x4& x)
1538 {
1539     v_float32x4 one = v_setall_f32(1.0f);
1540     return one / v_sqrt(x);
1541 }
1542 
1543 #if CV_SIMD128_64F
1544 inline v_float64x2 v_sqrt(const v_float64x2& x)
1545 {
1546     vsetvlmax_e64m1();
1547     return v_float64x2(vfsqrt_v_f64m1(x));
1548 }
1549 
1550 inline v_float64x2 v_invsqrt(const v_float64x2& x)
1551 {
1552     v_float64x2 one = v_setall_f64(1.0);
1553     return one / v_sqrt(x);
1554 }
1555 #endif
1556 
1557 inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
1558 {
1559     vsetvlmax_e32m1();
1560     v_float32x4 x(vfmacc_vv_f32m1(vfmul_vv_f32m1(a, a), b, b));
1561     return v_sqrt(x);
1562 }
1563 
1564 inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
1565 {
1566     vsetvlmax_e32m1();
1567     return v_float32x4(vfmacc_vv_f32m1(vfmul_vv_f32m1(a, a), b, b));
1568 }
1569 
1570 #if CV_SIMD128_64F
1571 inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
1572 {
1573     vsetvlmax_e64m1();
1574     v_float64x2 x(vfmacc_vv_f64m1(vfmul_vv_f64m1(a, a), b, b));
1575     return v_sqrt(x);
1576 }
1577 
1578 inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
1579 {
1580     vsetvlmax_e64m1();
1581     return v_float64x2(vfmacc_vv_f64m1(vfmul_vv_f64m1(a, a), b, b));
1582 }
1583 #endif
1584 
1585 ////////////// Multiply-Add //////////////
1586 
1587 inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
1588 {
1589     vsetvlmax_e32m1();
1590     return v_float32x4(vfmacc_vv_f32m1(c, a, b));
1591 }
1592 inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1593 {
1594     vsetvlmax_e32m1();
1595     return v_int32x4(vmacc_vv_i32m1(c, a, b));
1596 }
1597 
1598 inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
1599 {
1600     return v_fma(a, b, c);
1601 }
1602 
1603 inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1604 {
1605     return v_fma(a, b, c);
1606 }
1607 
1608 #if CV_SIMD128_64F
1609 inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
1610 {
1611     vsetvlmax_e64m1();
1612     return v_float64x2(vfmacc_vv_f64m1(c, a, b));
1613 }
1614 
1615 inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
1616 {
1617     return v_fma(a, b, c);
1618 }
1619 #endif
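
// A minimal usage sketch (illustrative only): evaluating c2*x^2 + c1*x + c0 per lane
// in Horner form with v_fma. The helper name poly2 and its coefficients are
// hypothetical.
//
//     inline v_float32x4 poly2(const v_float32x4& x, float c0, float c1, float c2)
//     {
//         v_float32x4 r = v_fma(v_setall_f32(c2), x, v_setall_f32(c1)); // c2*x + c1
//         return v_fma(r, x, v_setall_f32(c0));                         // (c2*x + c1)*x + c0
//     }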
1620 
1621 ////////////// Check all/any //////////////
1622 
1623 #define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, suffix, shift, width) \
1624 inline bool v_check_all(const _Tpvec& a) \
1625 { \
1626     vsetvlmax_e##width##m1(); \
1627     v_uint64x2 v = v_uint64x2((vuint64m1_t)vsrl_vx_##suffix##m1(vnot_v_##suffix##m1(a), shift)); \
1628     return (v.val[0] | v.val[1]) == 0; \
1629 } \
1630 inline bool v_check_any(const _Tpvec& a) \
1631 { \
1632     vsetvlmax_e##width##m1(); \
1633     v_uint64x2 v = v_uint64x2((vuint64m1_t)vsrl_vx_##suffix##m1(a, shift)); \
1634     return (v.val[0] | v.val[1]) != 0; \
1635 }
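
// The trick above: shift each lane's sign bit down to bit 0 (complementing the input
// first for v_check_all), then view the register as two 64-bit lanes and OR them.
// All lanes have their MSB set iff nothing survives the complement-and-shift; at
// least one lane has it set iff some bit survives the plain shift.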
1636 
1637 OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint8x16, u8, 7, 8)
1638 OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint16x8, u16, 15, 16)
1639 OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint32x4, u32, 31, 32)
1640 OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint64x2, u64, 63, 64)
1641 
1642 
1643 inline bool v_check_all(const v_int8x16& a)
1644 { return v_check_all(v_reinterpret_as_u8(a)); }
1645 inline bool v_check_any(const v_int8x16& a)
1646 { return v_check_any(v_reinterpret_as_u8(a)); }
1647 
1648 inline bool v_check_all(const v_int16x8& a)
1649 { return v_check_all(v_reinterpret_as_u16(a)); }
1650 inline bool v_check_any(const v_int16x8& a)
1651 { return v_check_any(v_reinterpret_as_u16(a)); }
1652 
1653 inline bool v_check_all(const v_int32x4& a)
1654 { return v_check_all(v_reinterpret_as_u32(a)); }
1655 inline bool v_check_any(const v_int32x4& a)
1656 { return v_check_any(v_reinterpret_as_u32(a)); }
1657 
1658 inline bool v_check_all(const v_float32x4& a)
1659 { return v_check_all(v_reinterpret_as_u32(a)); }
1660 inline bool v_check_any(const v_float32x4& a)
1661 { return v_check_any(v_reinterpret_as_u32(a)); }
1662 
1663 inline bool v_check_all(const v_int64x2& a)
1664 { return v_check_all(v_reinterpret_as_u64(a)); }
1665 inline bool v_check_any(const v_int64x2& a)
1666 { return v_check_any(v_reinterpret_as_u64(a)); }
1667 
1668 #if CV_SIMD128_64F
1669 inline bool v_check_all(const v_float64x2& a)
1670 { return v_check_all(v_reinterpret_as_u64(a)); }
1671 inline bool v_check_any(const v_float64x2& a)
1672 { return v_check_any(v_reinterpret_as_u64(a)); }
1673 #endif
1674 
1675 ////////////// abs //////////////
1676 
1677 #define OPENCV_HAL_IMPL_RVV_ABSDIFF(_Tpvec, abs) \
1678 inline _Tpvec v_##abs(const _Tpvec& a, const _Tpvec& b) \
1679 { \
1680     return v_max(a, b) - v_min(a, b); \
1681 }
1682 
1683 OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint8x16, absdiff)
1684 OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint16x8, absdiff)
1685 OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint32x4, absdiff)
1686 OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float32x4, absdiff)
1687 #if CV_SIMD128_64F
1688 OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float64x2, absdiff)
1689 #endif
1690 OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int8x16, absdiffs)
1691 OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int16x8, absdiffs)
1692 
1693 #define OPENCV_HAL_IMPL_RVV_ABSDIFF_S(_Tpvec, _rTpvec, _nwTpvec, sub, rshr, width) \
1694 inline _rTpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
1695 { \
1696     vsetvlmax_e##width##m1(); \
1697     return _rTpvec(rshr((_nwTpvec)sub(v_max(a, b), v_min(a, b)), 0)); \
1698 }
1699 
1700 OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int8x16, v_uint8x16, vuint16m2_t, vwsub_vv_i16m2, vnclipu_wx_u8m1, 8)
1701 OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int16x8, v_uint16x8, vuint32m2_t, vwsub_vv_i32m2, vnclipu_wx_u16m1, 16)
1702 OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int32x4, v_uint32x4, vuint64m2_t, vwsub_vv_i64m2, vnclipu_wx_u32m1, 32)
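
// For signed inputs, v_max(a, b) - v_min(a, b) is evaluated with a widening subtract
// into an m2 register group so it cannot overflow; vnclipu then narrows the
// (non-negative) difference back to the unsigned lane type with saturation.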
1703 
1704 #define OPENCV_HAL_IMPL_RVV_ABS(_Tprvec, _Tpvec, suffix) \
1705 inline _Tprvec v_abs(const _Tpvec& a) \
1706 { \
1707     return v_absdiff(a, v_setzero_##suffix()); \
1708 }
1709 
1710 OPENCV_HAL_IMPL_RVV_ABS(v_uint8x16, v_int8x16, s8)
1711 OPENCV_HAL_IMPL_RVV_ABS(v_uint16x8, v_int16x8, s16)
1712 OPENCV_HAL_IMPL_RVV_ABS(v_uint32x4, v_int32x4, s32)
1713 OPENCV_HAL_IMPL_RVV_ABS(v_float32x4, v_float32x4, f32)
1714 #if CV_SIMD128_64F
1715 OPENCV_HAL_IMPL_RVV_ABS(v_float64x2, v_float64x2, f64)
1716 #endif
1717 
1718 
1719 #define OPENCV_HAL_IMPL_RVV_REDUCE_SAD(_Tpvec, scalartype) \
1720 inline scalartype v_reduce_sad(const _Tpvec& a, const _Tpvec& b) \
1721 { \
1722     return v_reduce_sum(v_absdiff(a, b)); \
1723 }
1724 
1725 OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint8x16, unsigned)
1726 OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int8x16, unsigned)
1727 OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint16x8, unsigned)
1728 OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int16x8, unsigned)
1729 OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint32x4, unsigned)
1730 OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int32x4, unsigned)
1731 OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_float32x4, float)
1732 
1733 ////////////// Select //////////////
1734 
1735 #define OPENCV_HAL_IMPL_RVV_SELECT(_Tpvec, merge, ne, width) \
1736 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1737 { \
1738     vsetvlmax_e##width##m1(); \
1739     return _Tpvec(merge(ne(mask, 0), b, a)); \
1740 }
1741 
1742 OPENCV_HAL_IMPL_RVV_SELECT(v_uint8x16, vmerge_vvm_u8m1, vmsne_vx_u8m1_b8, 8)
1743 OPENCV_HAL_IMPL_RVV_SELECT(v_int8x16, vmerge_vvm_i8m1, vmsne_vx_i8m1_b8, 8)
1744 OPENCV_HAL_IMPL_RVV_SELECT(v_uint16x8, vmerge_vvm_u16m1, vmsne_vx_u16m1_b16, 16)
1745 OPENCV_HAL_IMPL_RVV_SELECT(v_int16x8, vmerge_vvm_i16m1, vmsne_vx_i16m1_b16, 16)
1746 OPENCV_HAL_IMPL_RVV_SELECT(v_uint32x4, vmerge_vvm_u32m1, vmsne_vx_u32m1_b32, 32)
1747 OPENCV_HAL_IMPL_RVV_SELECT(v_int32x4, vmerge_vvm_i32m1, vmsne_vx_i32m1_b32, 32)
1748 OPENCV_HAL_IMPL_RVV_SELECT(v_float32x4, vmerge_vvm_f32m1, vmfne_vf_f32m1_b32, 32)
1749 #if CV_SIMD128_64F
1750 OPENCV_HAL_IMPL_RVV_SELECT(v_float64x2, vmerge_vvm_f64m1, vmfne_vf_f64m1_b64, 64)
1751 #endif
1752 
1753 ////////////// Rotate shift //////////////
1754 
1755 #define OPENCV_HAL_IMPL_RVV_ROTATE_OP(_Tpvec, suffix, width) \
1756 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
1757 { \
1758     vsetvlmax_e##width##m1(); \
1759     return _Tpvec(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), a, n)); \
1760 } \
1761 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
1762 { \
1763     vsetvlmax_e##width##m1(); \
1764     return _Tpvec(vslideup_vx_##suffix##m1(vzero_##suffix##m1(), a, n)); \
1765 } \
1766 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
1767 { return a; } \
1768 template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
1769 { \
1770     vsetvlmax_e##width##m1(); \
1771     return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), a, n), b, _Tpvec::nlanes - n)); \
1772 } \
1773 template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
1774 { \
1775     vsetvlmax_e##width##m1(); \
1776     return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), b, _Tpvec::nlanes - n), a, n)); \
1777 } \
1778 template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
1779 { CV_UNUSED(b); return a; }
1780 
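// Rotates are built from vector slides into a zeroed destination: vslidedown drops the
// lowest n lanes, vslideup shifts lanes towards the top. The two-register forms compose
// a slidedown of one operand with a slideup of the other so the vacated lanes are
// filled from the second vector.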
1781 
1782 OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_uint8x16, u8, 8)
1783 OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_int8x16, i8, 8)
1784 OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_uint16x8, u16, 16)
1785 OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_int16x8, i16, 16)
1786 OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_uint32x4, u32, 32)
1787 OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_int32x4, i32, 32)
1788 OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_float32x4, f32, 32)
1789 OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_uint64x2, u64, 64)
1790 OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_int64x2, i64, 64)
1791 #if CV_SIMD128_64F
1792 OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_float64x2, f64, 64)
1793 #endif
1794 
1795 ////////////// Convert to float //////////////
1796 
1797 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
1798 {
1799     vsetvlmax_e32m1();
1800     return v_float32x4(vfcvt_f_x_v_f32m1(a));
1801 }
1802 
1803 #if CV_SIMD128_64F
1804 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
1805 {
1806     double arr[4] = {a.val[0], a.val[1], 0, 0};
1807     vsetvlmax_e64m2();
1808     vfloat64m2_t tmp = vle64_v_f64m2(arr);
1809     vsetvlmax_e32m1();
1810     return v_float32x4(vfncvt_f_f_w_f32m1(tmp));
1811 }
1812 
1813 inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
1814 {
1815     double arr[4] = {a.val[0], a.val[1], b.val[0], b.val[1]};
1816     vsetvlmax_e64m2();
1817     vfloat64m2_t tmp = vle64_v_f64m2(arr);
1818     vsetvlmax_e32m1();
1819     return v_float32x4(vfncvt_f_f_w_f32m1(tmp));
1820 }
1821 
1822 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
1823 {
1824     double CV_DECL_ALIGNED(32) ptr[4] = {0};
1825     vsetvlmax_e64m2();
1826     vse64_v_f64m2(ptr, vfwcvt_f_x_v_f64m2(a));
1827     double CV_DECL_ALIGNED(32) elems[2] =
1828     {
1829         ptr[0], ptr[1]
1830     };
1831     vsetvlmax_e64m1();
1832     return v_float64x2(vle64_v_f64m1(elems));
1833 }
1834 
1835 inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
1836 {
1837     double CV_DECL_ALIGNED(32) ptr[4] = {0};
1838     vsetvlmax_e64m2();
1839     vse64_v_f64m2(ptr, vfwcvt_f_x_v_f64m2(a));
1840     double CV_DECL_ALIGNED(32) elems[2] =
1841     {
1842         ptr[2], ptr[3]
1843     };
1844     vsetvlmax_e64m1();
1845     return v_float64x2(vle64_v_f64m1(elems));
1846 }
1847 
1848 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
1849 {
1850     double CV_DECL_ALIGNED(32) ptr[4] = {0};
1851     vsetvlmax_e64m2();
1852     vse64_v_f64m2(ptr, vfwcvt_f_f_v_f64m2(a));
1853     double CV_DECL_ALIGNED(32) elems[2] =
1854     {
1855         ptr[0], ptr[1]
1856     };
1857     vsetvlmax_e64m1();
1858     return v_float64x2(vle64_v_f64m1(elems));
1859 }
1860 
1861 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
1862 {
1863     double CV_DECL_ALIGNED(32) ptr[4] = {0};
1864     vsetvlmax_e64m2();
1865     vse64_v_f64m2(ptr, vfwcvt_f_f_v_f64m2(a));
1866     double CV_DECL_ALIGNED(32) elems[2] =
1867     {
1868         ptr[2], ptr[3]
1869     };
1870     vsetvlmax_e64m1();
1871     return v_float64x2(vle64_v_f64m1(elems));
1872 }
1873 
1874 inline v_float64x2 v_cvt_f64(const v_int64x2& a)
1875 {
1876     vsetvlmax_e64m1();
1877     return v_float64x2(vfcvt_f_x_v_f64m1(a));
1878 }
1879 #endif
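
// The f32/f64 conversions above go through small stack buffers: widening conversions
// produce an m2 register group, which is spilled so that the requested half can be
// reloaded as an m1 vector, while the narrowing direction first loads the doubles into
// an m2 group before vfncvt.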
1880 
1881 ////////////// Broadcast //////////////
1882 
1883 #define OPENCV_HAL_IMPL_RVV_BROADCAST(_Tpvec, suffix) \
1884 template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) \
1885 { \
1886     return v_setall_##suffix(v_extract_n<i>(v)); \
1887 }
1888 
1889 OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint8x16, u8)
1890 OPENCV_HAL_IMPL_RVV_BROADCAST(v_int8x16, s8)
1891 OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint16x8, u16)
1892 OPENCV_HAL_IMPL_RVV_BROADCAST(v_int16x8, s16)
1893 OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint32x4, u32)
1894 OPENCV_HAL_IMPL_RVV_BROADCAST(v_int32x4, s32)
1895 OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint64x2, u64)
1896 OPENCV_HAL_IMPL_RVV_BROADCAST(v_int64x2, s64)
1897 OPENCV_HAL_IMPL_RVV_BROADCAST(v_float32x4, f32)
1898 #if CV_SIMD128_64F
1899 OPENCV_HAL_IMPL_RVV_BROADCAST(v_float64x2, f64)
1900 #endif
1901 
1902 ////////////// Transpose4x4 //////////////
1903 
1904 #define OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(_Tpvec, _Tp, suffix) \
1905 inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
1906                          const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
1907                          v_##_Tpvec& b0, v_##_Tpvec& b1, \
1908                          v_##_Tpvec& b2, v_##_Tpvec& b3) \
1909 { \
1910     _Tp CV_DECL_ALIGNED(32) elems0[4] = \
1911     { \
1912         v_extract_n<0>(a0), \
1913         v_extract_n<0>(a1), \
1914         v_extract_n<0>(a2), \
1915         v_extract_n<0>(a3) \
1916     }; \
1917     b0 = v_load(elems0); \
1918     _Tp CV_DECL_ALIGNED(32) elems1[4] = \
1919     { \
1920         v_extract_n<1>(a0), \
1921         v_extract_n<1>(a1), \
1922         v_extract_n<1>(a2), \
1923         v_extract_n<1>(a3) \
1924     }; \
1925     b1 = v_load(elems1); \
1926     _Tp CV_DECL_ALIGNED(32) elems2[4] = \
1927     { \
1928         v_extract_n<2>(a0), \
1929         v_extract_n<2>(a1), \
1930         v_extract_n<2>(a2), \
1931         v_extract_n<2>(a3) \
1932     }; \
1933     b2 = v_load(elems2); \
1934     _Tp CV_DECL_ALIGNED(32) elems3[4] = \
1935     { \
1936         v_extract_n<3>(a0), \
1937         v_extract_n<3>(a1), \
1938         v_extract_n<3>(a2), \
1939         v_extract_n<3>(a3) \
1940     }; \
1941     b3 = v_load(elems3); \
1942 }
1943 
1944 OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(uint32x4, unsigned, u32)
1945 OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(int32x4, int, i32)
1946 OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(float32x4, float, f32)
1947 
1948 ////////////// Reverse //////////////
1949 
1950 #define OPENCV_HAL_IMPL_RVV_REVERSE(_Tpvec, _Tp, width, suffix) \
1951 inline _Tpvec v_reverse(const _Tpvec& a)  \
1952 { \
1953     _Tp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \
1954     _Tp CV_DECL_ALIGNED(32) ptra[_Tpvec::nlanes] = {0}; \
1955     v_store(ptra, a); \
1956     for (int i = 0; i < _Tpvec::nlanes; i++) \
1957     { \
1958         ptr[i] = ptra[_Tpvec::nlanes-i-1]; \
1959     } \
1960     return v_load(ptr); \
1961 }
1962 
1963 OPENCV_HAL_IMPL_RVV_REVERSE(v_uint8x16, uchar, 8, u8)
1964 OPENCV_HAL_IMPL_RVV_REVERSE(v_int8x16, schar, 8, i8)
1965 OPENCV_HAL_IMPL_RVV_REVERSE(v_uint16x8, ushort, 16, u16)
1966 OPENCV_HAL_IMPL_RVV_REVERSE(v_int16x8, short, 16, i16)
1967 OPENCV_HAL_IMPL_RVV_REVERSE(v_uint32x4, unsigned, 32, u32)
1968 OPENCV_HAL_IMPL_RVV_REVERSE(v_int32x4, int, 32, i32)
1969 OPENCV_HAL_IMPL_RVV_REVERSE(v_float32x4, float, 32, f32)
1970 OPENCV_HAL_IMPL_RVV_REVERSE(v_uint64x2, uint64, 64, u64)
1971 OPENCV_HAL_IMPL_RVV_REVERSE(v_int64x2, int64, 64, i64)
1972 #if CV_SIMD128_64F
1973 OPENCV_HAL_IMPL_RVV_REVERSE(v_float64x2, double, 64, f64)
1974 #endif
1975 
1976 //////////// Value reordering ////////////
1977 
1978 #define OPENCV_HAL_IMPL_RVV_EXPAND(_Tpwvec, _Tp, _Tpvec, width, suffix, wcvt) \
1979 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1980 { \
1981     _Tp CV_DECL_ALIGNED(32) lptr[_Tpvec::nlanes/2] = {0}; \
1982     _Tp CV_DECL_ALIGNED(32) hptr[_Tpvec::nlanes/2] = {0}; \
1983     v_store_low(lptr, a); \
1984     v_store_high(hptr, a); \
1985     b0 = _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr))); \
1986     b1 = _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(hptr))); \
1987 } \
1988 inline _Tpwvec v_expand_low(const _Tpvec& a) \
1989 { \
1990     _Tp CV_DECL_ALIGNED(32) lptr[_Tpvec::nlanes/2] = {0}; \
1991     v_store_low(lptr, a); \
1992     return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr))); \
1993 } \
1994 inline _Tpwvec v_expand_high(const _Tpvec& a) \
1995 { \
1996     _Tp CV_DECL_ALIGNED(32) hptr[_Tpvec::nlanes/2] = {0}; \
1997     v_store_high(hptr, a); \
1998     return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(hptr))); \
1999 } \
2000 inline _Tpwvec v_load_expand(const _Tp* ptr) \
2001 { \
2002     return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(ptr))); \
2003 }
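
// Expansion stores the requested half of the source to a small stack buffer, reloads
// it as a half-register (mf2) vector and widens it with vwcvt/vwcvtu; v_load_expand
// widens straight from memory without the intermediate store.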
2004 
2005 OPENCV_HAL_IMPL_RVV_EXPAND(v_uint16x8, uchar, v_uint8x16, 8, u8, vwcvtu_x_x_v_u16m1)
2006 OPENCV_HAL_IMPL_RVV_EXPAND(v_int16x8, schar, v_int8x16, 8, i8, vwcvt_x_x_v_i16m1)
2007 OPENCV_HAL_IMPL_RVV_EXPAND(v_uint32x4, ushort, v_uint16x8, 16, u16, vwcvtu_x_x_v_u32m1)
2008 OPENCV_HAL_IMPL_RVV_EXPAND(v_int32x4, short, v_int16x8, 16, i16, vwcvt_x_x_v_i32m1)
2009 OPENCV_HAL_IMPL_RVV_EXPAND(v_uint64x2, uint, v_uint32x4, 32, u32, vwcvtu_x_x_v_u64m1)
2010 OPENCV_HAL_IMPL_RVV_EXPAND(v_int64x2, int, v_int32x4, 32, i32, vwcvt_x_x_v_i64m1)
2011 
2012 inline v_uint32x4 v_load_expand_q(const uchar* ptr)
2013 {
2014     vsetvlmax_e32m1();
2015     return v_uint32x4(vwcvtu_x_x_v_u32m1(vwcvtu_x_x_v_u16mf2(vle8_v_u8mf4(ptr))));
2016 }
2017 
2018 inline v_int32x4 v_load_expand_q(const schar* ptr)
2019 {
2020     vsetvlmax_e32m1();
2021     return v_int32x4(vwcvt_x_x_v_i32m1(vwcvt_x_x_v_i16mf2(vle8_v_i8mf4(ptr))));
2022 }
2023 
2024 
2025 #define OPENCV_HAL_IMPL_RVV_PACK(_Tpvec, _Tp, _wTpvec, _wTp, width, suffix, rshr, shr) \
2026 inline _Tpvec v_pack(const _wTpvec& a, const _wTpvec& b) \
2027 { \
2028     _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
2029     v_store(arr, a); \
2030     v_store(arr + _wTpvec::nlanes, b); \
2031     vsetvlmax_e##width##m2(); \
2032     return _Tpvec(shr(vle##width##_v_##suffix##m2(arr), 0)); \
2033 } \
2034 inline void v_pack_store(_Tp* ptr, const _wTpvec& a) \
2035 { \
2036     _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
2037     v_store(arr, a); \
2038     v_store(arr + _wTpvec::nlanes, _wTpvec(vzero_##suffix##m1())); \
2039     vsetvlmax_e##width##m2(); \
2040     v_store(ptr, _Tpvec(shr(vle##width##_v_##suffix##m2(arr), 0))); \
2041 } \
2042 template<int n> inline \
2043 _Tpvec v_rshr_pack(const _wTpvec& a, const _wTpvec& b) \
2044 { \
2045     _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
2046     v_store(arr, a); \
2047     v_store(arr + _wTpvec::nlanes, b); \
2048     vsetvlmax_e##width##m2(); \
2049     return _Tpvec(rshr(vle##width##_v_##suffix##m2(arr), n)); \
2050 } \
2051 template<int n> inline \
2052 void v_rshr_pack_store(_Tp* ptr, const _wTpvec& a) \
2053 { \
2054     _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
2055     v_store(arr, a); \
2056     v_store(arr + _wTpvec::nlanes, _wTpvec(vzero_##suffix##m1())); \
2057     vsetvlmax_e##width##m2(); \
2058     v_store(ptr, _Tpvec(rshr(vle##width##_v_##suffix##m2(arr), n))); \
2059 }
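
// Packing writes both wide inputs back-to-back into a buffer, reloads them as a single
// m2 register group and narrows them: vnclip/vnclipu saturate to the destination lane
// type, while the 64-bit v_pack/v_pack_store use a plain narrowing shift (vnsrl/vnsra)
// instead of a saturating clip.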
2060 
2061 OPENCV_HAL_IMPL_RVV_PACK(v_uint8x16, uchar, v_uint16x8, ushort, 16, u16, vnclipu_wx_u8m1, vnclipu_wx_u8m1)
2062 OPENCV_HAL_IMPL_RVV_PACK(v_int8x16, schar, v_int16x8, short, 16, i16, vnclip_wx_i8m1, vnclip_wx_i8m1)
2063 OPENCV_HAL_IMPL_RVV_PACK(v_uint16x8, ushort, v_uint32x4, unsigned, 32, u32, vnclipu_wx_u16m1, vnclipu_wx_u16m1)
2064 OPENCV_HAL_IMPL_RVV_PACK(v_int16x8, short, v_int32x4, int, 32, i32, vnclip_wx_i16m1, vnclip_wx_i16m1)
2065 OPENCV_HAL_IMPL_RVV_PACK(v_uint32x4, unsigned, v_uint64x2, uint64, 64, u64, vnclipu_wx_u32m1, vnsrl_wx_u32m1)
2066 OPENCV_HAL_IMPL_RVV_PACK(v_int32x4, int, v_int64x2, int64, 64, i64, vnclip_wx_i32m1, vnsra_wx_i32m1)
2067 
2068 
2069 #define OPENCV_HAL_IMPL_RVV_PACK_U(_Tpvec, _Tp, _wTpvec, _wTp, width, suffix, rshr, cast) \
2070 inline _Tpvec v_pack_u(const _wTpvec& a, const _wTpvec& b) \
2071 { \
2072     _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
2073     v_store(arr, a); \
2074     v_store(arr + _wTpvec::nlanes, b); \
2075     vsetvlmax_e##width##m2(); \
2076     return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr), 0)), 0)); \
2077 } \
2078 inline void v_pack_u_store(_Tp* ptr, const _wTpvec& a) \
2079 { \
2080     _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
2081     v_store(arr, a); \
2082     v_store(arr + _wTpvec::nlanes, _wTpvec(vzero_##suffix##m1())); \
2083     vsetvlmax_e##width##m2(); \
2084     v_store(ptr, _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr), 0)), 0))); \
2085 } \
2086 template<int n> inline \
2087 _Tpvec v_rshr_pack_u(const _wTpvec& a, const _wTpvec& b) \
2088 { \
2089     _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
2090     v_store(arr, a); \
2091     v_store(arr + _wTpvec::nlanes, b); \
2092     vsetvlmax_e##width##m2(); \
2093     return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr), 0)), n)); \
2094 } \
2095 template<int n> inline \
2096 void v_rshr_pack_u_store(_Tp* ptr, const _wTpvec& a) \
2097 { \
2098     _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
2099     v_store(arr, a); \
2100     v_store(arr + _wTpvec::nlanes, _wTpvec(vzero_##suffix##m1())); \
2101     vsetvlmax_e##width##m2(); \
2102     v_store(ptr, _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr), 0)), n))); \
2103 }
2104 
2105 OPENCV_HAL_IMPL_RVV_PACK_U(v_uint8x16, uchar, v_int16x8, short, 16, i16, vnclipu_wx_u8m1, vreinterpret_v_i16m2_u16m2)
2106 OPENCV_HAL_IMPL_RVV_PACK_U(v_uint16x8, ushort, v_int32x4, int, 32, i32, vnclipu_wx_u16m1, vreinterpret_v_i32m2_u32m2)
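
// v_pack_u first clamps negative lanes to zero with vmax_vx, reinterprets the wide
// group as unsigned and then narrows with the saturating unsigned clip.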
2107 
2108 
2109 #define OPENCV_HAL_IMPL_RVV_UNPACKS(_Tpvec, _Tp, width, suffix) \
2110 inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
2111 { \
2112     _Tp CV_DECL_ALIGNED(32) ptra0[v_##_Tpvec::nlanes] = {0}; \
2113     _Tp CV_DECL_ALIGNED(32) ptra1[v_##_Tpvec::nlanes] = {0}; \
2114     _Tp CV_DECL_ALIGNED(32) ptrb0[v_##_Tpvec::nlanes] = {0}; \
2115     _Tp CV_DECL_ALIGNED(32) ptrb1[v_##_Tpvec::nlanes] = {0}; \
2116     v_store(ptra0, a0); \
2117     v_store(ptra1, a1); \
2118     int i; \
2119     for( i = 0; i < v_##_Tpvec::nlanes/2; i++ ) \
2120     { \
2121         ptrb0[i*2] = ptra0[i]; \
2122         ptrb0[i*2+1] = ptra1[i]; \
2123     } \
2124     for( ; i < v_##_Tpvec::nlanes; i++ ) \
2125     { \
2126         ptrb1[i*2-v_##_Tpvec::nlanes] = ptra0[i]; \
2127         ptrb1[i*2-v_##_Tpvec::nlanes+1] = ptra1[i]; \
2128     } \
2129     b0 = v_load(ptrb0); \
2130     b1 = v_load(ptrb1); \
2131 } \
2132 inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
2133 { \
2134     _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes/2] = {0}; \
2135     _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes/2] = {0}; \
2136     v_store_low(ptra, a); \
2137     v_store_low(ptrb, b); \
2138     return v_load_halves(ptra, ptrb); \
2139 } \
2140 inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
2141 { \
2142     _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes/2] = {0}; \
2143     _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes/2] = {0}; \
2144     v_store_high(ptra, a); \
2145     v_store_high(ptrb, b); \
2146     return v_load_halves(ptra, ptrb); \
2147 } \
2148 inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
2149 { \
2150     c = v_combine_low(a, b); \
2151     d = v_combine_high(a, b); \
2152 }
2153 
2154 OPENCV_HAL_IMPL_RVV_UNPACKS(uint8x16, uchar, 8, u8)
2155 OPENCV_HAL_IMPL_RVV_UNPACKS(int8x16, schar, 8, i8)
2156 OPENCV_HAL_IMPL_RVV_UNPACKS(uint16x8, ushort, 16, u16)
2157 OPENCV_HAL_IMPL_RVV_UNPACKS(int16x8, short, 16, i16)
2158 OPENCV_HAL_IMPL_RVV_UNPACKS(uint32x4, unsigned, 32, u32)
2159 OPENCV_HAL_IMPL_RVV_UNPACKS(int32x4, int, 32, i32)
2160 OPENCV_HAL_IMPL_RVV_UNPACKS(float32x4, float, 32, f32)
2161 #if CV_SIMD128_64F
2162 OPENCV_HAL_IMPL_RVV_UNPACKS(float64x2, double, 64, f64)
2163 #endif
2164 
2165 
2166 #define OPENCV_HAL_IMPL_RVV_INTERLEAVED(_Tpvec, _Tp, suffix, width) \
2167 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
2168 { \
2169     _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
2170     _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
2171     int i, i2; \
2172     for( i = i2 = 0; i < v_##_Tpvec::nlanes; i++, i2 += 2 ) \
2173     { \
2174         ptra[i] = ptr[i2]; \
2175         ptrb[i] = ptr[i2+1]; \
2176     } \
2177     a = v_load(ptra); \
2178     b = v_load(ptrb); \
2179 } \
2180 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
2181 { \
2182     _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
2183     _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
2184     _Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \
2185     int i, i3; \
2186     for( i = i3 = 0; i < v_##_Tpvec::nlanes; i++, i3 += 3 ) \
2187     { \
2188         ptra[i] = ptr[i3]; \
2189         ptrb[i] = ptr[i3+1]; \
2190         ptrc[i] = ptr[i3+2]; \
2191     } \
2192     a = v_load(ptra); \
2193     b = v_load(ptrb); \
2194     c = v_load(ptrc); \
2195 } \
2196 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
2197                                 v_##_Tpvec& c, v_##_Tpvec& d) \
2198 { \
2199     _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
2200     _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
2201     _Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \
2202     _Tp CV_DECL_ALIGNED(32) ptrd[v_##_Tpvec::nlanes] = {0}; \
2203     int i, i4; \
2204     for( i = i4 = 0; i < v_##_Tpvec::nlanes; i++, i4 += 4 ) \
2205     { \
2206         ptra[i] = ptr[i4]; \
2207         ptrb[i] = ptr[i4+1]; \
2208         ptrc[i] = ptr[i4+2]; \
2209         ptrd[i] = ptr[i4+3]; \
2210     } \
2211     a = v_load(ptra); \
2212     b = v_load(ptrb); \
2213     c = v_load(ptrc); \
2214     d = v_load(ptrd); \
2215 } \
2216 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
2217                                 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2218 { \
2219     int i, i2; \
2220     _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
2221     _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
2222     v_store(ptra, a); \
2223     v_store(ptrb, b); \
2224     for( i = i2 = 0; i < v_##_Tpvec::nlanes; i++, i2 += 2 ) \
2225     { \
2226         ptr[i2] = ptra[i]; \
2227         ptr[i2+1] = ptrb[i]; \
2228     } \
2229 } \
2230 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
2231                                 const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2232 { \
2233     int i, i3; \
2234     _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
2235     _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
2236     _Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \
2237     v_store(ptra, a); \
2238     v_store(ptrb, b); \
2239     v_store(ptrc, c); \
2240     for( i = i3 = 0; i < v_##_Tpvec::nlanes; i++, i3 += 3 ) \
2241     { \
2242         ptr[i3] = ptra[i]; \
2243         ptr[i3+1] = ptrb[i]; \
2244         ptr[i3+2] = ptrc[i]; \
2245     } \
2246 } \
2247 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
2248                                 const v_##_Tpvec& c, const v_##_Tpvec& d, \
2249                                 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
2250 { \
2251     int i, i4; \
2252     _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
2253     _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
2254     _Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \
2255     _Tp CV_DECL_ALIGNED(32) ptrd[v_##_Tpvec::nlanes] = {0}; \
2256     v_store(ptra, a); \
2257     v_store(ptrb, b); \
2258     v_store(ptrc, c); \
2259     v_store(ptrd, d); \
2260     for( i = i4 = 0; i < v_##_Tpvec::nlanes; i++, i4 += 4 ) \
2261     { \
2262         ptr[i4] = ptra[i]; \
2263         ptr[i4+1] = ptrb[i]; \
2264         ptr[i4+2] = ptrc[i]; \
2265         ptr[i4+3] = ptrd[i]; \
2266     } \
2267 } \
2268 inline v_##_Tpvec v_interleave_pairs(const v_##_Tpvec& vec) \
2269 { \
2270     _Tp CV_DECL_ALIGNED(32) ptr[v_##_Tpvec::nlanes] = {0}; \
2271     _Tp CV_DECL_ALIGNED(32) ptrvec[v_##_Tpvec::nlanes] = {0}; \
2272     v_store(ptrvec, vec); \
2273     for (int i = 0; i < v_##_Tpvec::nlanes/4; i++) \
2274     { \
2275         ptr[4*i  ] = ptrvec[4*i  ]; \
2276         ptr[4*i+1] = ptrvec[4*i+2]; \
2277         ptr[4*i+2] = ptrvec[4*i+1]; \
2278         ptr[4*i+3] = ptrvec[4*i+3]; \
2279     } \
2280     return v_load(ptr); \
2281 } \
2282 inline v_##_Tpvec v_interleave_quads(const v_##_Tpvec& vec) \
2283 { \
2284     _Tp CV_DECL_ALIGNED(32) ptr[v_##_Tpvec::nlanes] = {0}; \
2285     _Tp CV_DECL_ALIGNED(32) ptrvec[v_##_Tpvec::nlanes] = {0}; \
2286     v_store(ptrvec, vec); \
2287     for (int i = 0; i < v_##_Tpvec::nlanes/8; i++) \
2288     { \
2289         ptr[8*i  ] = ptrvec[4*i  ]; \
2290         ptr[8*i+1] = ptrvec[4*i+4]; \
2291         ptr[8*i+2] = ptrvec[4*i+1]; \
2292         ptr[8*i+3] = ptrvec[4*i+5]; \
2293         ptr[8*i+4] = ptrvec[4*i+2]; \
2294         ptr[8*i+5] = ptrvec[4*i+6]; \
2295         ptr[8*i+6] = ptrvec[4*i+3]; \
2296         ptr[8*i+7] = ptrvec[4*i+7]; \
2297     } \
2298     return v_load(ptr); \
2299 }
2300 
2301 OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint8x16, uchar, u8, 8)
2302 OPENCV_HAL_IMPL_RVV_INTERLEAVED(int8x16, schar, i8, 8)
2303 OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint16x8, ushort, u16, 16)
2304 OPENCV_HAL_IMPL_RVV_INTERLEAVED(int16x8, short, i16, 16)
2305 OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint32x4, unsigned, u32, 32)
2306 OPENCV_HAL_IMPL_RVV_INTERLEAVED(int32x4, int, i32, 32)
2307 OPENCV_HAL_IMPL_RVV_INTERLEAVED(float32x4, float, f32, 32)
2308 OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint64x2, uint64, u64, 64)
2309 OPENCV_HAL_IMPL_RVV_INTERLEAVED(int64x2, int64, i64, 64)
2310 #if CV_SIMD128_64F
2311 OPENCV_HAL_IMPL_RVV_INTERLEAVED(float64x2, double, f64, 64)
2312 #endif
2313 
2314 //////////// PopCount ////////////
2315 
2316 static const unsigned char popCountTable[] =
2317 {
2318     0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
2319     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2320     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2321     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2322     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2323     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2324     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2325     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2326     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2327     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2328     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2329     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2330     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2331     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2332     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2333     4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
2334 };
2335 
2336 #define OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(_rTpvec, _Tpvec, _rTp, _Tp, suffix) \
2337 inline _rTpvec v_popcount(const _Tpvec& a) \
2338 { \
2339     uchar CV_DECL_ALIGNED(32) ptra[16] = {0}; \
2340     v_store(ptra, v_reinterpret_as_u8(a)); \
2341     _rTp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \
2342     v_store(ptr, v_setzero_##suffix()); \
2343     for (int i = 0; i < _Tpvec::nlanes*(int)sizeof(_Tp); i++) \
2344         ptr[i/sizeof(_Tp)] += popCountTable[ptra[i]]; \
2345     return v_load(ptr); \
2346 }
2347 
2348 OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint8x16, v_uint8x16, uchar, uchar, u8)
2349 OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint8x16, v_int8x16, uchar, schar, u8)
2350 OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint16x8, v_uint16x8, ushort, ushort, u16)
2351 OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint16x8, v_int16x8, ushort, short, u16)
2352 OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint32x4, v_uint32x4, unsigned, unsigned, u32)
2353 OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint32x4, v_int32x4, unsigned, int, u32)
2354 OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint64x2, v_uint64x2, uint64, uint64, u64)
2355 OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint64x2, v_int64x2, uint64, int64, u64)
2356 
2357 //////////// SignMask ////////////
2358 
2359 #define OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(_Tpvec, _Tp, suffix, width, shift) \
2360 inline int v_signmask(const _Tpvec& a) \
2361 { \
2362     int mask = 0; \
2363     vsetvlmax_e##width##m1(); \
2364     _Tpvec tmp = _Tpvec(vsrl_vx_##suffix##m1(a, shift)); \
2365     for( int i = 0; i < _Tpvec::nlanes; i++ ) \
2366         mask |= (int)(tmp.val[i]) << i; \
2367     return mask; \
2368 }
2369 
2370 OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint8x16, uchar, u8, 8, 7)
2371 OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint16x8, ushort, u16, 16, 15)
2372 OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint32x4, unsigned, u32, 32, 31)
2373 OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint64x2, uint64, u64, 64, 63)
2374 
2375 inline int v_signmask(const v_int8x16& a)
2376 { return v_signmask(v_reinterpret_as_u8(a)); }
2377 inline int v_signmask(const v_int16x8& a)
2378 { return v_signmask(v_reinterpret_as_u16(a)); }
2379 inline int v_signmask(const v_int32x4& a)
2380 { return v_signmask(v_reinterpret_as_u32(a)); }
2381 inline int v_signmask(const v_float32x4& a)
2382 { return v_signmask(v_reinterpret_as_u32(a)); }
2383 inline int v_signmask(const v_int64x2& a)
2384 { return v_signmask(v_reinterpret_as_u64(a)); }
2385 #if CV_SIMD128_64F
2386 inline int v_signmask(const v_float64x2& a)
2387 { return v_signmask(v_reinterpret_as_u64(a)); }
2388 #endif
2389 
2390 
2391 //////////// Scan forward ////////////
2392 
2393 #define OPENCV_HAL_IMPL_RVV_SCAN_FORWARD_OP(_Tpvec, _Tp, suffix) \
2394 inline int v_scan_forward(const _Tpvec& a) \
2395 { \
2396     _Tp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \
2397     v_store(ptr, v_reinterpret_as_##suffix(a)); \
2398     for (int i = 0; i < _Tpvec::nlanes; i++) \
2399         if( ptr[i] < 0 ) \
2400             return i; \
2401     return 0; \
2402 }
2403 
2404 OPENCV_HAL_IMPL_RVV_SCAN_FORWARD_OP(v_uint8x16, schar, s8)
2405 OPENCV_HAL_IMPL_RVV_SCAN_FORWARD_OP(v_int8x16, schar, s8)
2406 OPENCV_HAL_IMPL_RVV_SCAN_FORWARD_OP(v_uint16x8, short, s16)
2407 OPENCV_HAL_IMPL_RVV_SCAN_FORWARD_OP(v_int16x8, short, s16)
2408 OPENCV_HAL_IMPL_RVV_SCAN_FORWARD_OP(v_uint32x4, int, s32)
2409 OPENCV_HAL_IMPL_RVV_SCAN_FORWARD_OP(v_int32x4, int, s32)
2410 OPENCV_HAL_IMPL_RVV_SCAN_FORWARD_OP(v_float32x4, int, s32)
2411 OPENCV_HAL_IMPL_RVV_SCAN_FORWARD_OP(v_uint64x2, int64, s64)
2412 OPENCV_HAL_IMPL_RVV_SCAN_FORWARD_OP(v_int64x2, int64, s64)
2413 #if CV_SIMD128_64F
2414 OPENCV_HAL_IMPL_RVV_SCAN_FORWARD_OP(v_float64x2, int64, s64)
2415 #endif
2416 
2417 //////////// Pack triplets ////////////
2418 
2419 #define OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(_Tpvec, _Tp) \
2420 inline _Tpvec v_pack_triplets(const _Tpvec& vec) \
2421 { \
2422     _Tp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \
2423     _Tp CV_DECL_ALIGNED(32) ptrvec[_Tpvec::nlanes] = {0}; \
2424     v_store(ptrvec, vec); \
2425     for (int i = 0; i < _Tpvec::nlanes/4; i++) \
2426     { \
2427         ptr[3*i  ] = ptrvec[4*i  ]; \
2428         ptr[3*i+1] = ptrvec[4*i+1]; \
2429         ptr[3*i+2] = ptrvec[4*i+2]; \
2430     } \
2431     return v_load(ptr); \
2432 }
2433 
2434 OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint8x16, uchar)
2435 OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int8x16, schar)
2436 OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint16x8, ushort)
2437 OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int16x8, short)
2438 OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint32x4, unsigned)
2439 OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int32x4, int)
2440 OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_float32x4, float)
2441 
2442 
2443 ////// FP16 support ///////
2444 
2445 #if CV_FP16
2446 inline v_float32x4 v_load_expand(const float16_t* ptr)
2447 {
2448     return v_float32x4(vfwcvt_f_f_v_f32m1(vle16_v_f16mf2(ptr)));
2449 }
2450 
2451 inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
2452 {
2453     vse16_v_f16mf2(ptr, vfncvt_f_f_w_f16mf2(v));
2454 }
2455 #else
2456 inline v_float32x4 v_load_expand(const float16_t* ptr)
2457 {
2458     const int N = 4;
2459     float buf[N];
2460     for( int i = 0; i < N; i++ ) buf[i] = (float)ptr[i];
2461     return v_load(buf);
2462 }
2463 
2464 inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
2465 {
2466     const int N = 4;
2467     float buf[N];
2468     v_store(buf, v);
2469     for( int i = 0; i < N; i++ ) ptr[i] = float16_t(buf[i]);
2470 }
2471 #endif
2472 
2473 ////////////// Rounding //////////////
2474 
2475 inline v_int32x4 v_round(const v_float32x4& a)
2476 {
2477     vsetvlmax_e32m1();
2478     return v_int32x4(vfcvt_x_f_v_i32m1(a));
2479 }
2480 
2481 inline v_int32x4 v_floor(const v_float32x4& a)
2482 {
2483     v_float32x4 ZP5 = v_setall_f32(0.5f);
2484     v_float32x4 t = a - ZP5;
2485     vsetvlmax_e32m1();
2486     return v_int32x4(vfcvt_x_f_v_i32m1(t));
2487 }
2488 
2489 inline v_int32x4 v_ceil(const v_float32x4& a)
2490 {
2491     v_float32x4 ZP5 = v_setall_f32(0.5f);
2492     v_float32x4 t = a + ZP5;
2493     vsetvlmax_e32m1();
2494     return v_int32x4(vfcvt_x_f_v_i32m1(t));
2495 }
2496 
2497 inline v_int32x4 v_trunc(const v_float32x4& a)
2498 {
2499     vsetvlmax_e32m1();
2500     return v_int32x4(vfcvt_rtz_x_f_v_i32m1(a));
2501 }
2502 #if CV_SIMD128_64F
2503 inline v_int32x4 v_round(const v_float64x2& a)
2504 {
2505     double arr[4] = {a.val[0], a.val[1], 0, 0};
2506     vsetvlmax_e64m2();
2507     vfloat64m2_t tmp = vle64_v_f64m2(arr);
2508     return v_int32x4(vfncvt_x_f_w_i32m1(tmp));
2509 }
2510 
2511 inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
2512 {
2513     double arr[4] = {a.val[0], a.val[1], b.val[0], b.val[1]};
2514     vsetvlmax_e64m2();
2515     vfloat64m2_t tmp = vle64_v_f64m2(arr);
2516     return v_int32x4(vfncvt_x_f_w_i32m1(tmp));
2517 }
2518 
2519 inline v_int32x4 v_floor(const v_float64x2& a)
2520 {
2521     double arr[4] = {a.val[0]-0.5f, a.val[1]-0.5f, 0, 0};
2522     vsetvlmax_e64m2();
2523     vfloat64m2_t tmp = vle64_v_f64m2(arr);
2524     return v_int32x4(vfncvt_x_f_w_i32m1(tmp));
2525 }
2526 
2527 inline v_int32x4 v_ceil(const v_float64x2& a)
2528 {
2529     double arr[4] = {a.val[0]+0.5f, a.val[1]+0.5f, 0, 0};
2530     vsetvlmax_e64m2();
2531     vfloat64m2_t tmp = vle64_v_f64m2(arr);
2532     return v_int32x4(vfncvt_x_f_w_i32m1(tmp));
2533 }
2534 
2535 inline v_int32x4 v_trunc(const v_float64x2& a)
2536 {
2537     double arr[4] = {a.val[0], a.val[1], 0, 0};
2538     vsetvlmax_e64m2();
2539     vfloat64m2_t tmp = vle64_v_f64m2(arr);
2540     return v_int32x4(vfncvt_rtz_x_f_w_i32m1(tmp));
2541 }
2542 #endif
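
// v_round relies on the conversion's round-to-nearest behaviour (the default FPU
// rounding mode is assumed here); v_floor and v_ceil are obtained by biasing the input
// by -0.5/+0.5 before that conversion, and v_trunc uses the explicit
// round-towards-zero (rtz) variants.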
2543 
2544 
2545 //////// Dot Product ////////
2546 
2547 // 16 >> 32
2548 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
2549 {
2550     int CV_DECL_ALIGNED(32) ptr[8] = {0};
2551     v_int32x4 t1, t2;
2552     vsetvlmax_e32m2();
2553     vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b));
2554     v_load_deinterleave(ptr, t1, t2);
2555     return t1 + t2;
2556 }
2557 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
2558 {
2559     int CV_DECL_ALIGNED(32) ptr[8] = {0};
2560     v_int32x4 t1, t2;
2561     vsetvlmax_e32m2();
2562     vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b));
2563     v_load_deinterleave(ptr, t1, t2);
2564     return t1 + t2 + c;
2565 }
2566 
2567 // 32 >> 64
2568 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
2569 {
2570     int64 CV_DECL_ALIGNED(32) ptr[4] = {0};
2571     v_int64x2 t1, t2;
2572     vsetvlmax_e64m2();
2573     vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b));
2574     v_load_deinterleave(ptr, t1, t2);
2575     return t1 + t2;
2576 }
2577 inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
2578 {
2579     int64 CV_DECL_ALIGNED(32) ptr[4] = {0};
2580     v_int64x2 t1, t2;
2581     vsetvlmax_e64m2();
2582     vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b));
2583     v_load_deinterleave(ptr, t1, t2);
2584     return t1 + t2 + c;
2585 }
2586 
2587 // 8 >> 32
2588 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
2589 {
2590     unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
2591     v_uint32x4 t1, t2, t3, t4;
2592     vsetvlmax_e32m4();
2593     vse32_v_u32m4(ptr, vqmaccu_vv_u32m4(vzero_u32m4(), a, b));
2594     v_load_deinterleave(ptr, t1, t2, t3, t4);
2595     return t1 + t2 + t3 + t4;
2596 }
2597 inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
2598                                    const v_uint32x4& c)
2599 {
2600     unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
2601     v_uint32x4 t1, t2, t3, t4;
2602     vsetvlmax_e32m4();
2603     vse32_v_u32m4(ptr, vqmaccu_vv_u32m4(vzero_u32m4(), a, b));
2604     v_load_deinterleave(ptr, t1, t2, t3, t4);
2605     return t1 + t2 + t3 + t4 + c;
2606 }
2607 
2608 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
2609 {
2610     int CV_DECL_ALIGNED(32) ptr[16] = {0};
2611     v_int32x4 t1, t2, t3, t4;
2612     vsetvlmax_e32m4();
2613     vse32_v_i32m4(ptr, vqmacc_vv_i32m4(vzero_i32m4(), a, b));
2614     v_load_deinterleave(ptr, t1, t2, t3, t4);
2615     return t1 + t2 + t3 + t4;
2616 }
2617 inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
2618                                   const v_int32x4& c)
2619 {
2620     int CV_DECL_ALIGNED(32) ptr[16] = {0};
2621     v_int32x4 t1, t2, t3, t4;
2622     vsetvlmax_e32m4();
2623     vse32_v_i32m4(ptr, vqmacc_vv_i32m4(vzero_i32m4(), a, b));
2624     v_load_deinterleave(ptr, t1, t2, t3, t4);
2625     return t1 + t2 + t3 + t4 + c;
2626 }
2627 
2628 // 16 >> 64
2629 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
2630 {
2631     uint64 CV_DECL_ALIGNED(32) ptr[8] = {0};
2632     v_uint64x2 t1, t2, t3, t4;
2633     vsetvlmax_e64m4();
2634     vse64_v_u64m4(ptr, vqmaccu_vv_u64m4(vzero_u64m4(), a, b));
2635     v_load_deinterleave(ptr, t1, t2, t3, t4);
2636     return t1 + t2 + t3 + t4;
2637 }
2638 inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
2639 {
2640     uint64 CV_DECL_ALIGNED(32) ptr[8] = {0};
2641     v_uint64x2 t1, t2, t3, t4;
2642     vsetvlmax_e64m4();
2643     vse64_v_u64m4(ptr, vqmaccu_vv_u64m4(vzero_u64m4(), a, b));
2644     v_load_deinterleave(ptr, t1, t2, t3, t4);
2645     return t1 + t2 + t3 + t4 + c;
2646 }
2647 
2648 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
2649 {
2650     int64 CV_DECL_ALIGNED(32) ptr[8] = {0};
2651     v_int64x2 t1, t2, t3, t4;
2652     vsetvlmax_e64m4();
2653     vse64_v_i64m4(ptr, vqmacc_vv_i64m4(vzero_i64m4(), a, b));
2654     v_load_deinterleave(ptr, t1, t2, t3, t4);
2655     return t1 + t2 + t3 + t4;
2656 }
2657 inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
2658                                   const v_int64x2& c)
2659 {
2660     int64 CV_DECL_ALIGNED(32) ptr[8] = {0};
2661     v_int64x2 t1, t2, t3, t4;
2662     vsetvlmax_e64m4();
2663     vse64_v_i64m4(ptr, vqmacc_vv_i64m4(vzero_i64m4(), a, b));
2664     v_load_deinterleave(ptr, t1, t2, t3, t4);
2665     return t1 + t2 + t3 + t4 + c;
2666 }
2667 
2668 // 32 >> 64f
2669 #if CV_SIMD128_64F
2670 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
2671 { return v_cvt_f64(v_dotprod(a, b)); }
2672 inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
2673                                     const v_float64x2& c)
2674 { return v_dotprod_expand(a, b) + c; }
2675 #endif
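
// A minimal usage sketch (illustrative only): accumulating the dot product of two
// uchar arrays 16 lanes at a time with v_dotprod_expand. The helper name dot_u8 is
// hypothetical and n is assumed to be a multiple of 16.
//
//     inline unsigned dot_u8(const uchar* a, const uchar* b, int n)
//     {
//         v_uint32x4 acc = v_setzero_u32();
//         for (int i = 0; i < n; i += v_uint8x16::nlanes)
//             acc = v_dotprod_expand(v_load(a + i), v_load(b + i), acc);
//         return v_reduce_sum(acc);
//     }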
2676 
2677 //////// Fast Dot Product ////////
2678 
2679 // 16 >> 32
2680 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
2681 {
2682     int CV_DECL_ALIGNED(32) ptr[8] = {0};
2683     vsetvlmax_e32m2();
2684     vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b));
2685     v_int32x4 t1 = v_load(ptr);
2686     v_int32x4 t2 = v_load(ptr+4);
2687     return t1 + t2;
2688 }
2689 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
2690 {
2691     int CV_DECL_ALIGNED(32) ptr[8] = {0};
2692     vsetvlmax_e32m2();
2693     vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b));
2694     v_int32x4 t1 = v_load(ptr);
2695     v_int32x4 t2 = v_load(ptr+4);
2696     return t1 + t2 + c;
2697 }
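
// The *_fast variants skip the even/odd deinterleave and simply add the halves (or
// quarters) of the widened product vector, so the per-lane layout of the partial sums
// differs from v_dotprod while the total sum is the same.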
2698 
2699 // 32 >> 64
2700 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
2701 {
2702     int64 CV_DECL_ALIGNED(32) ptr[4] = {0};
2703     vsetvlmax_e64m2();
2704     vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b));
2705     v_int64x2 t1 = v_load(ptr);
2706     v_int64x2 t2 = v_load(ptr+2);
2707     return t1 + t2;
2708 }
2709 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
2710 {
2711     int64 CV_DECL_ALIGNED(32) ptr[4] = {0};
2712     vsetvlmax_e64m2();
2713     vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b));
2714     v_int64x2 t1 = v_load(ptr);
2715     v_int64x2 t2 = v_load(ptr+2);
2716     return t1 + t2 + c;
2717 }
2718 
2719 
2720 // 8 >> 32
2721 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
2722 {
2723     unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
2724     vsetvlmax_e32m4();
2725     vse32_v_u32m4(ptr, vqmaccu_vv_u32m4(vzero_u32m4(), a, b));
2726     v_uint32x4 t1 = v_load(ptr);
2727     v_uint32x4 t2 = v_load(ptr+4);
2728     v_uint32x4 t3 = v_load(ptr+8);
2729     v_uint32x4 t4 = v_load(ptr+12);
2730     return t1 + t2 + t3 + t4;
2731 }
2732 inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
2733 {
2734     unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
2735     vsetvlmax_e32m4();
2736     vse32_v_u32m4(ptr, vqmaccu_vv_u32m4(vzero_u32m4(), a, b));
2737     v_uint32x4 t1 = v_load(ptr);
2738     v_uint32x4 t2 = v_load(ptr+4);
2739     v_uint32x4 t3 = v_load(ptr+8);
2740     v_uint32x4 t4 = v_load(ptr+12);
2741     return t1 + t2 + t3 + t4 + c;
2742 }
2743 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
2744 {
2745     int CV_DECL_ALIGNED(32) ptr[16] = {0};
2746     vsetvlmax_e32m4();
2747     vse32_v_i32m4(ptr, vqmacc_vv_i32m4(vzero_i32m4(), a, b));
2748     v_int32x4 t1 = v_load(ptr);
2749     v_int32x4 t2 = v_load(ptr+4);
2750     v_int32x4 t3 = v_load(ptr+8);
2751     v_int32x4 t4 = v_load(ptr+12);
2752     return t1 + t2 + t3 + t4;
2753 }
2754 inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
2755 {
2756     int CV_DECL_ALIGNED(32) ptr[16] = {0};
2757     vsetvlmax_e32m4();
2758     vse32_v_i32m4(ptr, vqmacc_vv_i32m4(vzero_i32m4(), a, b));
2759     v_int32x4 t1 = v_load(ptr);
2760     v_int32x4 t2 = v_load(ptr+4);
2761     v_int32x4 t3 = v_load(ptr+8);
2762     v_int32x4 t4 = v_load(ptr+12);
2763     return t1 + t2 + t3 + t4 + c;
2764 }
2765 
2766 // 16 >> 64
2767 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
2768 {
2769     uint64 CV_DECL_ALIGNED(32) ptr[8] = {0};
2770     vsetvlmax_e64m4();
2771     vse64_v_u64m4(ptr, vqmaccu_vv_u64m4(vzero_u64m4(), a, b));
2772     v_uint64x2 t1 = v_load(ptr);
2773     v_uint64x2 t2 = v_load(ptr+2);
2774     v_uint64x2 t3 = v_load(ptr+4);
2775     v_uint64x2 t4 = v_load(ptr+6);
2776     return t1 + t2 + t3 + t4;
2777 }
v_dotprod_expand_fast(const v_uint16x8 & a,const v_uint16x8 & b,const v_uint64x2 & c)2778 inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
2779 {
2780     uint64 CV_DECL_ALIGNED(32) ptr[8] = {0};
2781     vsetvlmax_e64m4();
2782     vse64_v_u64m4(ptr, vqmaccu_vv_u64m4(vzero_u64m4(), a, b));
2783     v_uint64x2 t1 = v_load(ptr);
2784     v_uint64x2 t2 = v_load(ptr+2);
2785     v_uint64x2 t3 = v_load(ptr+4);
2786     v_uint64x2 t4 = v_load(ptr+6);
2787     return t1 + t2 + t3 + t4 + c;
2788 }
v_dotprod_expand_fast(const v_int16x8 & a,const v_int16x8 & b)2789 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
2790 {
2791     int64 CV_DECL_ALIGNED(32) ptr[8] = {0};
2792     vsetvlmax_e64m4();
2793     vse64_v_i64m4(ptr, vqmacc_vv_i64m4(vzero_i64m4(), a, b));
2794     v_int64x2 t1 = v_load(ptr);
2795     v_int64x2 t2 = v_load(ptr+2);
2796     v_int64x2 t3 = v_load(ptr+4);
2797     v_int64x2 t4 = v_load(ptr+6);
2798     return t1 + t2 + t3 + t4;
2799 }
v_dotprod_expand_fast(const v_int16x8 & a,const v_int16x8 & b,const v_int64x2 & c)2800 inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
2801 {
2802     int64 CV_DECL_ALIGNED(32) ptr[8] = {0};
2803     vsetvlmax_e64m4();
2804     vse64_v_i64m4(ptr, vqmacc_vv_i64m4(vzero_i64m4(), a, b));
2805     v_int64x2 t1 = v_load(ptr);
2806     v_int64x2 t2 = v_load(ptr+2);
2807     v_int64x2 t3 = v_load(ptr+4);
2808     v_int64x2 t4 = v_load(ptr+6);
2809     return t1 + t2 + t3 + t4 + c;
2810 }
2811 
2812 // 32 >> 64f
2813 #if CV_SIMD128_64F
v_dotprod_expand_fast(const v_int32x4 & a,const v_int32x4 & b)2814 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
2815 { return v_cvt_f64(v_dotprod_fast(a, b)); }
v_dotprod_expand_fast(const v_int32x4 & a,const v_int32x4 & b,const v_float64x2 & c)2816 inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
2817 { return v_dotprod_expand_fast(a, b) + c; }
2818 #endif
2819 
2820 
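// v_matmul: res = v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3, computed with
// scalar-by-vector fused multiply-accumulate (vfmacc_vf).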
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    vsetvlmax_e32m1();
    vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n<0>(v));
    res = vfmacc_vf_f32m1(res, v_extract_n<1>(v), m1);
    res = vfmacc_vf_f32m1(res, v_extract_n<2>(v), m2);
    res = vfmacc_vf_f32m1(res, v_extract_n<3>(v), m3);
    return v_float32x4(res);
}

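// v_matmuladd: same as v_matmul but the last column is replaced by an
// additive term, i.e. res = v[0]*m0 + v[1]*m1 + v[2]*m2 + a.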
inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    vsetvlmax_e32m1();
    vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n<0>(v));
    res = vfmacc_vf_f32m1(res, v_extract_n<1>(v), m1);
    res = vfmacc_vf_f32m1(res, v_extract_n<2>(v), m2);
    return v_float32x4(res) + a;
}

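// v_mul_expand: widening multiply into an LMUL=2 register, spilled to an
// aligned buffer; the low and high halves are reloaded into c and d.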
#define OPENCV_HAL_IMPL_RVV_MUL_EXPAND(_Tpvec, _Tpwvec, _Tpw, suffix, wmul, width) \
inline void v_mul_expand(const _Tpvec& a, const _Tpvec& b, _Tpwvec& c, _Tpwvec& d) \
{ \
    _Tpw CV_DECL_ALIGNED(32) ptr[_Tpwvec::nlanes*2] = {0}; \
    vsetvlmax_e##width##m2(); \
    vse##width##_v_##suffix##m2(ptr, wmul(a, b)); \
    vsetvlmax_e##width##m1(); \
    c = _Tpwvec(vle##width##_v_##suffix##m1(ptr)); \
    d = _Tpwvec(vle##width##_v_##suffix##m1(ptr+_Tpwvec::nlanes)); \
}

OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint8x16, v_uint16x8, ushort, u16, vwmulu_vv_u16m2, 16)
OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int8x16, v_int16x8, short, i16, vwmul_vv_i16m2, 16)
OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint16x8, v_uint32x4, unsigned, u32, vwmulu_vv_u32m2, 32)
OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int16x8, v_int32x4, int, i32, vwmul_vv_i32m2, 32)
OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint32x4, v_uint64x2, uint64, u64, vwmulu_vv_u64m2, 64)


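// v_mul_hi returns the high half of the 16x16 -> 32-bit product: widening
// multiply followed by a narrowing shift right by 16 (arithmetic for signed,
// logical for unsigned).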
inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
{
    vsetvlmax_e16m1();
    return v_int16x8(vnsra_wx_i16m1(vwmul_vv_i32m2(a, b), 16));
}
inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
{
    vsetvlmax_e16m1();
    return v_uint16x8(vnsrl_wx_u16m1(vwmulu_vv_u32m2(a, b), 16));
}


//////// Saturating Multiply ////////

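// Saturating multiply is implemented as a widening multiply followed by a
// saturating pack back to the original element width. For example, with
// v_uint8x16 operands, 200 * 2 = 400 saturates to 255 in the packed result.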
#define OPENCV_HAL_IMPL_RVV_MUL_SAT(_Tpvec, _wTpvec) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
    _wTpvec c, d; \
    v_mul_expand(a, b, c, d); \
    return v_pack(c, d); \
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ \
    a = a * b; \
    return a; \
}

OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint8x16, v_uint16x8)
OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int8x16, v_int16x8)
OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint16x8, v_uint32x4)
OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int16x8, v_int32x4)


inline void v_cleanup() {}

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END


}

#endif