1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html.
4
5 // The original implementation has been contributed by Yin Zhang.
6 // Copyright (C) 2020, Institute of Software, Chinese Academy of Sciences.
7
8 #ifndef OPENCV_HAL_INTRIN_RVV_HPP
9 #define OPENCV_HAL_INTRIN_RVV_HPP
10
11 #include <algorithm>
12
13 namespace cv
14 {
15
16 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
17
18 #define CV_SIMD128 1
19 #define CV_SIMD128_64F 1
20
21 //////////// Unsupported native intrinsics in C++ ////////////
22
23 struct vuint8mf2_t
24 {
25 uchar val[8] = {0};
vuint8mf2_tcv::vuint8mf2_t26 vuint8mf2_t() {}
vuint8mf2_tcv::vuint8mf2_t27 vuint8mf2_t(const uchar* ptr)
28 {
29 for (int i = 0; i < 8; ++i)
30 {
31 val[i] = ptr[i];
32 }
33 }
34 };
35 struct vint8mf2_t
36 {
37 schar val[8] = {0};
vint8mf2_tcv::vint8mf2_t38 vint8mf2_t() {}
vint8mf2_tcv::vint8mf2_t39 vint8mf2_t(const schar* ptr)
40 {
41 for (int i = 0; i < 8; ++i)
42 {
43 val[i] = ptr[i];
44 }
45 }
46 };
// Scalar stand-in for the RVV half-register type vuint16mf2_t (4 x u16).
struct vuint16mf2_t
{
    ushort val[4] = {0};  // lanes, zero-initialized by default
    vuint16mf2_t() {}
    // Load 4 lanes from memory.
    vuint16mf2_t(const ushort* ptr)
    {
        std::copy(ptr, ptr + 4, val);
    }
};
// Scalar stand-in for the RVV half-register type vint16mf2_t (4 x i16).
struct vint16mf2_t
{
    short val[4] = {0};  // lanes, zero-initialized by default
    vint16mf2_t() {}
    // Load 4 lanes from memory.
    vint16mf2_t(const short* ptr)
    {
        std::copy(ptr, ptr + 4, val);
    }
};
// Scalar stand-in for the RVV half-register type vuint32mf2_t (2 x u32).
struct vuint32mf2_t
{
    unsigned val[2] = {0};  // lanes, zero-initialized by default
    vuint32mf2_t() {}
    // Load 2 lanes from memory.
    vuint32mf2_t(const unsigned* ptr)
    {
        for (int i = 0; i < 2; ++i)
            val[i] = ptr[i];
    }
};
// Scalar stand-in for the RVV half-register type vint32mf2_t (2 x i32).
struct vint32mf2_t
{
    int val[2] = {0};  // lanes, zero-initialized by default
    vint32mf2_t() {}
    // Load 2 lanes from memory.
    vint32mf2_t(const int* ptr)
    {
        for (int i = 0; i < 2; ++i)
            val[i] = ptr[i];
    }
};
// Scalar stand-in for the RVV half-register type vfloat32mf2_t (2 x f32).
struct vfloat32mf2_t
{
    float val[2] = {0};  // lanes, zero-initialized by default
    vfloat32mf2_t() {}
    // Load 2 lanes from memory.
    vfloat32mf2_t(const float* ptr)
    {
        for (int i = 0; i < 2; ++i)
            val[i] = ptr[i];
    }
};
101 struct vuint64mf2_t
102 {
103 uint64 val[1] = {0};
vuint64mf2_tcv::vuint64mf2_t104 vuint64mf2_t() {}
vuint64mf2_tcv::vuint64mf2_t105 vuint64mf2_t(const uint64* ptr)
106 {
107 val[0] = ptr[0];
108 }
109 };
110 struct vint64mf2_t
111 {
112 int64 val[1] = {0};
vint64mf2_tcv::vint64mf2_t113 vint64mf2_t() {}
vint64mf2_tcv::vint64mf2_t114 vint64mf2_t(const int64* ptr)
115 {
116 val[0] = ptr[0];
117 }
118 };
// Scalar stand-in for the RVV half-register type vfloat64mf2_t (1 x f64).
struct vfloat64mf2_t
{
    double val[1] = {0};  // single lane, zero-initialized by default
    vfloat64mf2_t() {}
    // Load the single lane from memory.
    vfloat64mf2_t(const double* ptr)
    {
        *val = *ptr;
    }
};
128 struct vuint8mf4_t
129 {
130 uchar val[4] = {0};
vuint8mf4_tcv::vuint8mf4_t131 vuint8mf4_t() {}
vuint8mf4_tcv::vuint8mf4_t132 vuint8mf4_t(const uchar* ptr)
133 {
134 for (int i = 0; i < 4; ++i)
135 {
136 val[i] = ptr[i];
137 }
138 }
139 };
140 struct vint8mf4_t
141 {
142 schar val[4] = {0};
vint8mf4_tcv::vint8mf4_t143 vint8mf4_t() {}
vint8mf4_tcv::vint8mf4_t144 vint8mf4_t(const schar* ptr)
145 {
146 for (int i = 0; i < 4; ++i)
147 {
148 val[i] = ptr[i];
149 }
150 }
151 };
152
// Emulated unit-stride load/store intrinsics for the mf2 fallback types above:
// vle<width>_v_<suffix>mf2 constructs the fallback struct from memory,
// vse<width>_v_<suffix>mf2 writes its n lanes back out one element at a time.
#define OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(_Tpvec, _Tp, suffix, width, n) \
inline _Tpvec vle##width##_v_##suffix##mf2(const _Tp* ptr) \
{ \
    return _Tpvec(ptr); \
} \
inline void vse##width##_v_##suffix##mf2(_Tp* ptr, _Tpvec v) \
{ \
    for (int i = 0; i < n; ++i) \
    { \
        ptr[i] = v.val[i]; \
    } \
}

OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint8mf2_t, uint8_t, u8, 8, 8)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint8mf2_t, int8_t, i8, 8, 8)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint16mf2_t, uint16_t, u16, 16, 4)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint16mf2_t, int16_t, i16, 16, 4)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint32mf2_t, uint32_t, u32, 32, 2)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint32mf2_t, int32_t, i32, 32, 2)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vfloat32mf2_t, float32_t, f32, 32, 2)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vuint64mf2_t, uint64_t, u64, 64, 1)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vint64mf2_t, int64_t, i64, 64, 1)
OPENCV_HAL_IMPL_RVV_NATIVE_LOADSTORE_MF2(vfloat64mf2_t, float64_t, f64, 64, 1)
176
177
// Emulated widening conversion (RVV vwcvt/vwcvtu): widen each of the n
// fallback lanes into a temporary array of the wider element type, then
// load that array into a native m1 register at vlmax.
#define OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(_Tpwvec, _Tpvec, _wTp, wcvt, suffix, width, n) \
inline _Tpwvec wcvt (_Tpvec v) \
{ \
    _wTp tmp[n]; \
    for (int i = 0; i < n; ++i) \
    { \
        tmp[i] = (_wTp)v.val[i]; \
    } \
    vsetvlmax_e##width##m1(); \
    return vle##width##_v_##suffix##m1(tmp); \
}

OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint16m1_t, vuint8mf2_t, ushort, vwcvtu_x_x_v_u16m1, u16, 16, 8)
OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint16m1_t, vint8mf2_t, short, vwcvt_x_x_v_i16m1, i16, 16, 8)
OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint32m1_t, vuint16mf2_t, unsigned, vwcvtu_x_x_v_u32m1, u32, 32, 4)
OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint32m1_t, vint16mf2_t, int, vwcvt_x_x_v_i32m1, i32, 32, 4)
OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vuint64m1_t, vuint32mf2_t, uint64, vwcvtu_x_x_v_u64m1, u64, 64, 2)
OPENCV_HAL_IMPL_RVV_NATIVE_WCVT(vint64m1_t, vint32mf2_t, int64, vwcvt_x_x_v_i64m1, i64, 64, 2)
196
vle8_v_u8mf4(const uint8_t * base)197 inline vuint8mf4_t vle8_v_u8mf4 (const uint8_t *base)
198 {
199 return vuint8mf4_t(base);
200 }
vle8_v_i8mf4(const int8_t * base)201 inline vint8mf4_t vle8_v_i8mf4 (const int8_t *base)
202 {
203 return vint8mf4_t(base);
204 }
205
vwcvtu_x_x_v_u16mf2(vuint8mf4_t src)206 inline vuint16mf2_t vwcvtu_x_x_v_u16mf2 (vuint8mf4_t src)
207 {
208 ushort tmp[4];
209 for (int i = 0; i < 4; ++i)
210 {
211 tmp[i] = (ushort)src.val[i];
212 }
213 return vle16_v_u16mf2(tmp);
214 }
vwcvt_x_x_v_i16mf2(vint8mf4_t src)215 inline vint16mf2_t vwcvt_x_x_v_i16mf2 (vint8mf4_t src)
216 {
217 short tmp[4];
218 for (int i = 0; i < 4; ++i)
219 {
220 tmp[i] = (short)src.val[i];
221 }
222 return vle16_v_i16mf2(tmp);
223 }
224
225 //////////// Types ////////////
226
227 struct v_uint8x16
228 {
229 typedef uchar lane_type;
230 enum { nlanes = 16 };
231
v_uint8x16cv::v_uint8x16232 v_uint8x16() {}
v_uint8x16cv::v_uint8x16233 explicit v_uint8x16(vuint8m1_t v)
234 {
235 vsetvlmax_e8m1();
236 vse8_v_u8m1(val, v);
237 }
v_uint8x16cv::v_uint8x16238 v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
239 uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
240 {
241 uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
242 for (int i = 0; i < nlanes; ++i)
243 {
244 val[i] = v[i];
245 }
246 }
operator vuint8m1_tcv::v_uint8x16247 operator vuint8m1_t() const
248 {
249 vsetvlmax_e8m1();
250 return vle8_v_u8m1(val);
251 }
get0cv::v_uint8x16252 uchar get0() const
253 {
254 return val[0];
255 }
256
257 uchar val[16];
258 };
259
260 struct v_int8x16
261 {
262 typedef schar lane_type;
263 enum { nlanes = 16 };
264
v_int8x16cv::v_int8x16265 v_int8x16() {}
v_int8x16cv::v_int8x16266 explicit v_int8x16(vint8m1_t v)
267 {
268 vsetvlmax_e8m1();
269 vse8_v_i8m1(val, v);
270 }
v_int8x16cv::v_int8x16271 v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
272 schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
273 {
274 schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15};
275 for (int i = 0; i < nlanes; ++i)
276 {
277 val[i] = v[i];
278 }
279 }
operator vint8m1_tcv::v_int8x16280 operator vint8m1_t() const
281 {
282 vsetvlmax_e8m1();
283 return vle8_v_i8m1(val);
284 }
get0cv::v_int8x16285 schar get0() const
286 {
287 return val[0];
288 }
289
290 schar val[16];
291 };
292
293 struct v_uint16x8
294 {
295 typedef ushort lane_type;
296 enum { nlanes = 8 };
297
v_uint16x8cv::v_uint16x8298 v_uint16x8() {}
v_uint16x8cv::v_uint16x8299 explicit v_uint16x8(vuint16m1_t v)
300 {
301 vsetvlmax_e16m1();
302 vse16_v_u16m1(val, v);
303 }
v_uint16x8cv::v_uint16x8304 v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
305 {
306 ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
307 for (int i = 0; i < nlanes; ++i)
308 {
309 val[i] = v[i];
310 }
311 }
operator vuint16m1_tcv::v_uint16x8312 operator vuint16m1_t() const
313 {
314 vsetvlmax_e16m1();
315 return vle16_v_u16m1(val);
316 }
get0cv::v_uint16x8317 ushort get0() const
318 {
319 return val[0];
320 }
321
322 ushort val[8];
323 };
324
325 struct v_int16x8
326 {
327 typedef short lane_type;
328 enum { nlanes = 8 };
329
v_int16x8cv::v_int16x8330 v_int16x8() {}
v_int16x8cv::v_int16x8331 explicit v_int16x8(vint16m1_t v)
332 {
333 vsetvlmax_e16m1();
334 vse16_v_i16m1(val, v);
335 }
v_int16x8cv::v_int16x8336 v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
337 {
338 short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
339 for (int i = 0; i < nlanes; ++i)
340 {
341 val[i] = v[i];
342 }
343 }
operator vint16m1_tcv::v_int16x8344 operator vint16m1_t() const
345 {
346 vsetvlmax_e16m1();
347 return vle16_v_i16m1(val);
348 }
get0cv::v_int16x8349 short get0() const
350 {
351 return val[0];
352 }
353
354 short val[8];
355 };
356
357 struct v_uint32x4
358 {
359 typedef unsigned lane_type;
360 enum { nlanes = 4 };
361
v_uint32x4cv::v_uint32x4362 v_uint32x4() {}
v_uint32x4cv::v_uint32x4363 explicit v_uint32x4(vuint32m1_t v)
364 {
365 vsetvlmax_e32m1();
366 vse32_v_u32m1(val, v);
367 }
v_uint32x4cv::v_uint32x4368 v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
369 {
370 unsigned v[] = {v0, v1, v2, v3};
371 for (int i = 0; i < nlanes; ++i)
372 {
373 val[i] = v[i];
374 }
375 }
operator vuint32m1_tcv::v_uint32x4376 operator vuint32m1_t() const
377 {
378 vsetvlmax_e32m1();
379 return vle32_v_u32m1(val);
380 }
get0cv::v_uint32x4381 unsigned get0() const
382 {
383 return val[0];
384 }
385
386 unsigned val[4];
387 };
388
389 struct v_int32x4
390 {
391 typedef int lane_type;
392 enum { nlanes = 4 };
393
v_int32x4cv::v_int32x4394 v_int32x4() {}
v_int32x4cv::v_int32x4395 explicit v_int32x4(vint32m1_t v)
396 {
397 vsetvlmax_e32m1();
398 vse32_v_i32m1(val, v);
399 }
v_int32x4cv::v_int32x4400 v_int32x4(int v0, int v1, int v2, int v3)
401 {
402 int v[] = {v0, v1, v2, v3};
403 for (int i = 0; i < nlanes; ++i)
404 {
405 val[i] = v[i];
406 }
407 }
operator vint32m1_tcv::v_int32x4408 operator vint32m1_t() const
409 {
410 vsetvlmax_e32m1();
411 return vle32_v_i32m1(val);
412 }
get0cv::v_int32x4413 int get0() const
414 {
415 return val[0];
416 }
417 int val[4];
418 };
419
420 struct v_float32x4
421 {
422 typedef float lane_type;
423 enum { nlanes = 4 };
424
v_float32x4cv::v_float32x4425 v_float32x4() {}
v_float32x4cv::v_float32x4426 explicit v_float32x4(vfloat32m1_t v)
427 {
428 vsetvlmax_e32m1();
429 vse32_v_f32m1(val, v);
430 }
v_float32x4cv::v_float32x4431 v_float32x4(float v0, float v1, float v2, float v3)
432 {
433 float v[] = {v0, v1, v2, v3};
434 for (int i = 0; i < nlanes; ++i)
435 {
436 val[i] = v[i];
437 }
438 }
operator vfloat32m1_tcv::v_float32x4439 operator vfloat32m1_t() const
440 {
441 vsetvlmax_e32m1();
442 return vle32_v_f32m1(val);
443 }
get0cv::v_float32x4444 float get0() const
445 {
446 return val[0];
447 }
448 float val[4];
449 };
450
451 struct v_uint64x2
452 {
453 typedef uint64 lane_type;
454 enum { nlanes = 2 };
455
v_uint64x2cv::v_uint64x2456 v_uint64x2() {}
v_uint64x2cv::v_uint64x2457 explicit v_uint64x2(vuint64m1_t v)
458 {
459 vsetvlmax_e64m1();
460 vse64_v_u64m1(val, v);
461 }
v_uint64x2cv::v_uint64x2462 v_uint64x2(uint64 v0, uint64 v1)
463 {
464 uint64 v[] = {v0, v1};
465 for (int i = 0; i < nlanes; ++i)
466 {
467 val[i] = v[i];
468 }
469 }
operator vuint64m1_tcv::v_uint64x2470 operator vuint64m1_t() const
471 {
472 vsetvlmax_e64m1();
473 return vle64_v_u64m1(val);
474 }
get0cv::v_uint64x2475 uint64 get0() const
476 {
477 return val[0];
478 }
479
480 uint64 val[2];
481 };
482
483 struct v_int64x2
484 {
485 typedef int64 lane_type;
486 enum { nlanes = 2 };
487
v_int64x2cv::v_int64x2488 v_int64x2() {}
v_int64x2cv::v_int64x2489 explicit v_int64x2(vint64m1_t v)
490 {
491 vsetvlmax_e64m1();
492 vse64_v_i64m1(val, v);
493 }
v_int64x2cv::v_int64x2494 v_int64x2(int64 v0, int64 v1)
495 {
496 int64 v[] = {v0, v1};
497 for (int i = 0; i < nlanes; ++i)
498 {
499 val[i] = v[i];
500 }
501 }
operator vint64m1_tcv::v_int64x2502 operator vint64m1_t() const
503 {
504 vsetvlmax_e64m1();
505 return vle64_v_i64m1(val);
506 }
get0cv::v_int64x2507 int64 get0() const
508 {
509 return val[0];
510 }
511
512 int64 val[2];
513 };
514
#if CV_SIMD128_64F
// 128-bit vector of 2 double-precision lanes, backed by a scalar array and
// converted to/from the native vfloat64m1_t register type on demand.
struct v_float64x2
{
    typedef double lane_type;
    enum { nlanes = 2 };

    v_float64x2() {}
    // Spill a native register into the scalar lane array.
    explicit v_float64x2(vfloat64m1_t v)
    {
        vsetvlmax_e64m1();
        vse64_v_f64m1(val, v);
    }
    // Element-wise construction from individual lane values.
    v_float64x2(double v0, double v1)
    {
        val[0] = v0;
        val[1] = v1;
    }
    // Reload the scalar lanes into a native register.
    operator vfloat64m1_t() const
    {
        vsetvlmax_e64m1();
        return vle64_v_f64m1(val);
    }
    // First lane.
    double get0() const
    {
        return val[0];
    }

    double val[2];
};
#endif
548
549
550 //////////// Initial ////////////
551
// Integer initializers: v_setzero_* via the vzero intrinsic, v_setall_* via
// scalar broadcast (vmv.v.x). suffix1 is the public name (s8/u8/...),
// suffix2 the intrinsic name (i8/u8/...).
#define OPENCV_HAL_IMPL_RVV_INIT_INTEGER(_Tpvec, _Tp, width, suffix1, suffix2) \
inline v_##_Tpvec v_setzero_##suffix1() \
{ \
    vsetvlmax_e##width##m1(); \
    return v_##_Tpvec(vzero_##suffix2##m1()); \
} \
inline v_##_Tpvec v_setall_##suffix1(_Tp v) \
{ \
    vsetvlmax_e##width##m1(); \
    return v_##_Tpvec(vmv_v_x_##suffix2##m1(v)); \
}

OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint8x16, uchar, 8, u8, u8)
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int8x16, schar, 8, s8, i8)
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint16x8, ushort, 16, u16, u16)
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int16x8, short, 16, s16, i16)
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint32x4, unsigned, 32, u32, u32)
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int32x4, int, 32, s32, i32)
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(uint64x2, uint64, 64, u64, u64)
OPENCV_HAL_IMPL_RVV_INIT_INTEGER(int64x2, int64, 64, s64, i64)
572
// Floating-point initializers: like the integer version, but the broadcast
// uses the float-scalar move (vfmv.v.f).
#define OPENCV_HAL_IMPL_RVV_INIT_FP(_Tpv, _Tp, width, suffix) \
inline v_##_Tpv v_setzero_##suffix() \
{ \
    vsetvlmax_e##width##m1(); \
    return v_##_Tpv(vzero_##suffix##m1()); \
} \
inline v_##_Tpv v_setall_##suffix(_Tp v) \
{ \
    vsetvlmax_e##width##m1(); \
    return v_##_Tpv(vfmv_v_f_##suffix##m1(v)); \
}

OPENCV_HAL_IMPL_RVV_INIT_FP(float32x4, float, 32, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_INIT_FP(float64x2, double, 64, f64)
#endif
589
590 //////////// Reinterpret ////////////
591
// Same-type reinterpretation is the identity.
#define OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(_Tpvec, suffix) \
inline v_##_Tpvec v_reinterpret_as_##suffix(const v_##_Tpvec& v) { return v; }

OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint8x16, u8)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int8x16, s8)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint16x8, u16)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int16x8, s16)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint32x4, u32)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int32x4, s32)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(float32x4, f32)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(uint64x2, u64)
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(int64x2, s64)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_SELF_REINTERPRET(float64x2, f64)
#endif
607
// Cross-type reinterpretation in both directions: the lane bytes are reloaded
// from the source's scalar array and the native register is cast to the
// destination register type. suffix1/suffix2 name the public types,
// nsuffix1/nsuffix2 the native intrinsic types.
#define OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(_Tpvec1, _Tpvec2, _nTpvec1, _nTpvec2, suffix1, suffix2, nsuffix1, nsuffix2, width1, width2) \
inline v_##_Tpvec1 v_reinterpret_as_##suffix1(const v_##_Tpvec2& v) \
{ \
    vsetvlmax_e##width2##m1(); \
    return v_##_Tpvec1((_nTpvec1)vle##width2##_v_##nsuffix2##m1(v.val)); \
} \
inline v_##_Tpvec2 v_reinterpret_as_##suffix2(const v_##_Tpvec1& v) \
{ \
    vsetvlmax_e##width1##m1(); \
    return v_##_Tpvec2((_nTpvec2)vle##width1##_v_##nsuffix1##m1(v.val)); \
}

OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, int8x16, vuint8m1_t, vint8m1_t, u8, s8, u8, i8, 8, 8)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, int16x8, vuint16m1_t, vint16m1_t, u16, s16, u16, i16, 16, 16)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, int32x4, vuint32m1_t, vint32m1_t, u32, s32, u32, i32, 32, 32)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, float32x4, vuint32m1_t, vfloat32m1_t, u32, f32, u32, f32, 32, 32)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int32x4, float32x4, vint32m1_t, vfloat32m1_t, s32, f32, i32, f32, 32, 32)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, int64x2, vuint64m1_t, vint64m1_t, u64, s64, u64, i64, 64, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, uint16x8, vuint8m1_t, vuint16m1_t, u8, u16, u8, u16, 8, 16)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, uint32x4, vuint8m1_t, vuint32m1_t, u8, u32, u8, u32, 8, 32)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, uint64x2, vuint8m1_t, vuint64m1_t, u8, u64, u8, u64, 8, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, uint32x4, vuint16m1_t, vuint32m1_t, u16, u32, u16, u32, 16, 32)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, uint64x2, vuint16m1_t, vuint64m1_t, u16, u64, u16, u64, 16, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, uint64x2, vuint32m1_t, vuint64m1_t, u32, u64, u32, u64, 32, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, int16x8, vint8m1_t, vint16m1_t, s8, s16, i8, i16, 8, 16)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, int32x4, vint8m1_t, vint32m1_t, s8, s32, i8, i32, 8, 32)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, int64x2, vint8m1_t, vint64m1_t, s8, s64, i8, i64, 8, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int16x8, int32x4, vint16m1_t, vint32m1_t, s16, s32, i16, i32, 16, 32)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int16x8, int64x2, vint16m1_t, vint64m1_t, s16, s64, i16, i64, 16, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int32x4, int64x2, vint32m1_t, vint64m1_t, s32, s64, i32, i64, 32, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, int16x8, vuint8m1_t, vint16m1_t, u8, s16, u8, i16, 8, 16)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, int32x4, vuint8m1_t, vint32m1_t, u8, s32, u8, i32, 8, 32)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, int64x2, vuint8m1_t, vint64m1_t, u8, s64, u8, i64, 8, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, int8x16, vuint16m1_t, vint8m1_t, u16, s8, u16, i8, 16, 8)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, int32x4, vuint16m1_t, vint32m1_t, u16, s32, u16, i32, 16, 32)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, int64x2, vuint16m1_t, vint64m1_t, u16, s64, u16, i64, 16, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, int8x16, vuint32m1_t, vint8m1_t, u32, s8, u32, i8, 32, 8)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, int16x8, vuint32m1_t, vint16m1_t, u32, s16, u32, i16, 32, 16)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, int64x2, vuint32m1_t, vint64m1_t, u32, s64, u32, i64, 32, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, int8x16, vuint64m1_t, vint8m1_t, u64, s8, u64, i8, 64, 8)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, int16x8, vuint64m1_t, vint16m1_t, u64, s16, u64, i16, 64, 16)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, int32x4, vuint64m1_t, vint32m1_t, u64, s32, u64, i32, 64, 32)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, float32x4, vuint8m1_t, vfloat32m1_t, u8, f32, u8, f32, 8, 32)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, float32x4, vuint16m1_t, vfloat32m1_t, u16, f32, u16, f32, 16, 32)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, float32x4, vuint64m1_t, vfloat32m1_t, u64, f32, u64, f32, 64, 32)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, float32x4, vint8m1_t, vfloat32m1_t, s8, f32, i8, f32, 8, 32)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int16x8, float32x4, vint16m1_t, vfloat32m1_t, s16, f32, i16, f32, 16, 32)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int64x2, float32x4, vint64m1_t, vfloat32m1_t, s64, f32, i64, f32, 64, 32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint64x2, float64x2, vuint64m1_t, vfloat64m1_t, u64, f64, u64, f64, 64, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int64x2, float64x2, vint64m1_t, vfloat64m1_t, s64, f64, i64, f64, 64, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint8x16, float64x2, vuint8m1_t, vfloat64m1_t, u8, f64, u8, f64, 8, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint16x8, float64x2, vuint16m1_t, vfloat64m1_t, u16, f64, u16, f64, 16, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(uint32x4, float64x2, vuint32m1_t, vfloat64m1_t, u32, f64, u32, f64, 32, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int8x16, float64x2, vint8m1_t, vfloat64m1_t, s8, f64, i8, f64, 8, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int16x8, float64x2, vint16m1_t, vfloat64m1_t, s16, f64, i16, f64, 16, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(int32x4, float64x2, vint32m1_t, vfloat64m1_t, s32, f64, i32, f64, 32, 64)
OPENCV_HAL_IMPL_RVV_ONE_TIME_REINTERPRET(float32x4, float64x2, vfloat32m1_t, vfloat64m1_t, f32, f64, f32, f64, 32, 64)
#endif
667
668 ////////////// Extract //////////////
669
// v_extract<s>(a, b): lane-concatenation shift — slides a down by s lanes,
// then slides b up into the freed top (nlanes - s) lanes.
// v_extract_n<i>(v): scalar read of lane i, via slidedown to lane 0 followed
// by the scalar-move intrinsic passed in as `vmv`.
#define OPENCV_HAL_IMPL_RVV_EXTRACT(_Tpvec, _Tp, suffix, width, vmv) \
template <int s> \
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), a, s), b, _Tpvec::nlanes - s)); \
} \
template<int i> inline _Tp v_extract_n(_Tpvec v) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tp(vmv(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), v, i))); \
}


OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint8x16, uchar, u8, 8, vmv_x_s_u8m1_u8)
OPENCV_HAL_IMPL_RVV_EXTRACT(v_int8x16, schar, i8, 8, vmv_x_s_i8m1_i8)
OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint16x8, ushort, u16, 16, vmv_x_s_u16m1_u16)
OPENCV_HAL_IMPL_RVV_EXTRACT(v_int16x8, short, i16, 16, vmv_x_s_i16m1_i16)
OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint32x4, uint, u32, 32, vmv_x_s_u32m1_u32)
OPENCV_HAL_IMPL_RVV_EXTRACT(v_int32x4, int, i32, 32, vmv_x_s_i32m1_i32)
OPENCV_HAL_IMPL_RVV_EXTRACT(v_uint64x2, uint64, u64, 64, vmv_x_s_u64m1_u64)
OPENCV_HAL_IMPL_RVV_EXTRACT(v_int64x2, int64, i64, 64, vmv_x_s_i64m1_i64)
OPENCV_HAL_IMPL_RVV_EXTRACT(v_float32x4, float, f32, 32, vfmv_f_s_f32m1_f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_EXTRACT(v_float64x2, double, f64, 64, vfmv_f_s_f64m1_f64)
#endif
696
697 ////////////// Load/Store //////////////
698
// Per-type load/store operations for the 128-bit wrapper types.
// - v_load / v_store move the whole register as raw bytes through e8
//   loads/stores; NOTE(review): presumably chosen so element-misaligned
//   pointers are handled, in contrast to the aligned variants — confirm.
// - v_load_aligned / v_store_aligned(_nocache) use the natural element width.
// - v_load_low sets vl to hvl (half the lanes), loads, then restores vlmax.
// - v_store_low / v_store_high spill to an aligned temporary buffer and copy
//   the requested half out element by element.
#define OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(_Tpvec, _nTpvec, _Tp, hvl, width, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \
{ \
    vsetvlmax_e8m1(); \
    return _Tpvec((_nTpvec)vle8_v_u8m1((uchar*)ptr)); \
} \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vle##width##_v_##suffix##m1(ptr)); \
} \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ \
    vsetvl_e##width##m1(hvl); \
    _Tpvec res = _Tpvec(vle##width##_v_##suffix##m1(ptr)); \
    vsetvlmax_e##width##m1(); \
    return res; \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ \
    vsetvlmax_e8m1(); \
    vse8_v_u8m1((uchar*)ptr, vle8_v_u8m1((uchar*)a.val)); \
} \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ \
    vsetvlmax_e##width##m1(); \
    vse##width##_v_##suffix##m1(ptr, a); \
} \
inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
{ \
    vsetvlmax_e##width##m1(); \
    vse##width##_v_##suffix##m1(ptr, a); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
{ \
    vsetvlmax_e##width##m1(); \
    vse##width##_v_##suffix##m1(ptr, a); \
} \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ \
    _Tp CV_DECL_ALIGNED(32) tmp_ptr[_Tpvec::nlanes] = {0}; \
    vsetvlmax_e##width##m1(); \
    vse##width##_v_##suffix##m1(tmp_ptr, a); \
    for(int i = 0; i < _Tpvec::nlanes/2; ++i) \
    { \
        ptr[i] = tmp_ptr[i]; \
    } \
} \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ \
    _Tp CV_DECL_ALIGNED(32) tmp_ptr[_Tpvec::nlanes] = {0}; \
    vsetvlmax_e##width##m1(); \
    vse##width##_v_##suffix##m1(tmp_ptr, a); \
    for(int i = 0; i < _Tpvec::nlanes/2; ++i) \
    { \
        ptr[i] = tmp_ptr[i+_Tpvec::nlanes/2]; \
    } \
}

OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint8x16, vuint8m1_t, uchar, 8, 8, u8)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int8x16, vint8m1_t, schar, 8, 8, i8)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint16x8, vuint16m1_t, ushort, 4, 16, u16)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int16x8, vint16m1_t, short, 4, 16, i16)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint32x4, vuint32m1_t, unsigned, 2, 32, u32)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int32x4, vint32m1_t, int, 2, 32, i32)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_uint64x2, vuint64m1_t, uint64, 1, 64, u64)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_int64x2, vint64m1_t, int64, 1, 64, i64)
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float32x4, vfloat32m1_t, float, 2, 32, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_LOADSTORE_OP(v_float64x2, vfloat64m1_t, double, 1, 64, f64)
#endif
770
771 inline v_int8x16 v_load_halves(const schar* ptr0, const schar* ptr1)
772 {
773 schar CV_DECL_ALIGNED(32) elems[16] =
774 {
775 ptr0[0], ptr0[1], ptr0[2], ptr0[3], ptr0[4], ptr0[5], ptr0[6], ptr0[7],
776 ptr1[0], ptr1[1], ptr1[2], ptr1[3], ptr1[4], ptr1[5], ptr1[6], ptr1[7]
777 };
778 vsetvlmax_e8m1();
779 return v_int8x16(vle8_v_i8m1(elems));
780 }
v_load_halves(const uchar * ptr0,const uchar * ptr1)781 inline v_uint8x16 v_load_halves(const uchar* ptr0, const uchar* ptr1) { return v_reinterpret_as_u8(v_load_halves((schar*)ptr0, (schar*)ptr1)); }
782
v_load_halves(const short * ptr0,const short * ptr1)783 inline v_int16x8 v_load_halves(const short* ptr0, const short* ptr1)
784 {
785 short CV_DECL_ALIGNED(32) elems[8] =
786 {
787 ptr0[0], ptr0[1], ptr0[2], ptr0[3], ptr1[0], ptr1[1], ptr1[2], ptr1[3]
788 };
789 vsetvlmax_e16m1();
790 return v_int16x8(vle16_v_i16m1(elems));
791 }
v_load_halves(const ushort * ptr0,const ushort * ptr1)792 inline v_uint16x8 v_load_halves(const ushort* ptr0, const ushort* ptr1) { return v_reinterpret_as_u16(v_load_halves((short*)ptr0, (short*)ptr1)); }
793
v_load_halves(const int * ptr0,const int * ptr1)794 inline v_int32x4 v_load_halves(const int* ptr0, const int* ptr1)
795 {
796 int CV_DECL_ALIGNED(32) elems[4] =
797 {
798 ptr0[0], ptr0[1], ptr1[0], ptr1[1]
799 };
800 vsetvlmax_e32m1();
801 return v_int32x4(vle32_v_i32m1(elems));
802 }
v_load_halves(const float * ptr0,const float * ptr1)803 inline v_float32x4 v_load_halves(const float* ptr0, const float* ptr1)
804 {
805 float CV_DECL_ALIGNED(32) elems[4] =
806 {
807 ptr0[0], ptr0[1], ptr1[0], ptr1[1]
808 };
809 vsetvlmax_e32m1();
810 return v_float32x4(vle32_v_f32m1(elems));
811 }
v_load_halves(const unsigned * ptr0,const unsigned * ptr1)812 inline v_uint32x4 v_load_halves(const unsigned* ptr0, const unsigned* ptr1) { return v_reinterpret_as_u32(v_load_halves((int*)ptr0, (int*)ptr1)); }
813
v_load_halves(const int64 * ptr0,const int64 * ptr1)814 inline v_int64x2 v_load_halves(const int64* ptr0, const int64* ptr1)
815 {
816 int64 CV_DECL_ALIGNED(32) elems[2] =
817 {
818 ptr0[0], ptr1[0]
819 };
820 vsetvlmax_e64m1();
821 return v_int64x2(vle64_v_i64m1(elems));
822 }
v_load_halves(const uint64 * ptr0,const uint64 * ptr1)823 inline v_uint64x2 v_load_halves(const uint64* ptr0, const uint64* ptr1) { return v_reinterpret_as_u64(v_load_halves((int64*)ptr0, (int64*)ptr1)); }
824
#if CV_SIMD128_64F
// Combine two single-lane halves into one 2 x f64 vector.
inline v_float64x2 v_load_halves(const double* ptr0, const double* ptr1)
{
    double CV_DECL_ALIGNED(32) elems[2];
    elems[0] = ptr0[0];
    elems[1] = ptr1[0];
    vsetvlmax_e64m1();
    return v_float64x2(vle64_v_f64m1(elems));
}
#endif
836
837
838 ////////////// Lookup table access ////////////////////
839
v_lut(const schar * tab,const int * idx)840 inline v_int8x16 v_lut(const schar* tab, const int* idx)
841 {
842 schar CV_DECL_ALIGNED(32) elems[16] =
843 {
844 tab[idx[ 0]],
845 tab[idx[ 1]],
846 tab[idx[ 2]],
847 tab[idx[ 3]],
848 tab[idx[ 4]],
849 tab[idx[ 5]],
850 tab[idx[ 6]],
851 tab[idx[ 7]],
852 tab[idx[ 8]],
853 tab[idx[ 9]],
854 tab[idx[10]],
855 tab[idx[11]],
856 tab[idx[12]],
857 tab[idx[13]],
858 tab[idx[14]],
859 tab[idx[15]]
860 };
861 vsetvlmax_e8m1();
862 return v_int8x16(vle8_v_i8m1(elems));
863 }
v_lut_pairs(const schar * tab,const int * idx)864 inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
865 {
866 schar CV_DECL_ALIGNED(32) elems[16] =
867 {
868 tab[idx[0]],
869 tab[idx[0] + 1],
870 tab[idx[1]],
871 tab[idx[1] + 1],
872 tab[idx[2]],
873 tab[idx[2] + 1],
874 tab[idx[3]],
875 tab[idx[3] + 1],
876 tab[idx[4]],
877 tab[idx[4] + 1],
878 tab[idx[5]],
879 tab[idx[5] + 1],
880 tab[idx[6]],
881 tab[idx[6] + 1],
882 tab[idx[7]],
883 tab[idx[7] + 1]
884 };
885 vsetvlmax_e8m1();
886 return v_int8x16(vle8_v_i8m1(elems));
887 }
v_lut_quads(const schar * tab,const int * idx)888 inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
889 {
890 schar CV_DECL_ALIGNED(32) elems[16] =
891 {
892 tab[idx[0]],
893 tab[idx[0] + 1],
894 tab[idx[0] + 2],
895 tab[idx[0] + 3],
896 tab[idx[1]],
897 tab[idx[1] + 1],
898 tab[idx[1] + 2],
899 tab[idx[1] + 3],
900 tab[idx[2]],
901 tab[idx[2] + 1],
902 tab[idx[2] + 2],
903 tab[idx[2] + 3],
904 tab[idx[3]],
905 tab[idx[3] + 1],
906 tab[idx[3] + 2],
907 tab[idx[3] + 3]
908 };
909 vsetvlmax_e8m1();
910 return v_int8x16(vle8_v_i8m1(elems));
911 }
v_lut(const uchar * tab,const int * idx)912 inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); }
v_lut_pairs(const uchar * tab,const int * idx)913 inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); }
v_lut_quads(const uchar * tab,const int * idx)914 inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); }
915
v_lut(const short * tab,const int * idx)916 inline v_int16x8 v_lut(const short* tab, const int* idx)
917 {
918 short CV_DECL_ALIGNED(32) elems[8] =
919 {
920 tab[idx[0]],
921 tab[idx[1]],
922 tab[idx[2]],
923 tab[idx[3]],
924 tab[idx[4]],
925 tab[idx[5]],
926 tab[idx[6]],
927 tab[idx[7]]
928 };
929 vsetvlmax_e16m1();
930 return v_int16x8(vle16_v_i16m1(elems));
931 }
v_lut_pairs(const short * tab,const int * idx)932 inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
933 {
934 short CV_DECL_ALIGNED(32) elems[8] =
935 {
936 tab[idx[0]],
937 tab[idx[0] + 1],
938 tab[idx[1]],
939 tab[idx[1] + 1],
940 tab[idx[2]],
941 tab[idx[2] + 1],
942 tab[idx[3]],
943 tab[idx[3] + 1]
944 };
945 vsetvlmax_e16m1();
946 return v_int16x8(vle16_v_i16m1(elems));
947 }
v_lut_quads(const short * tab,const int * idx)948 inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
949 {
950 short CV_DECL_ALIGNED(32) elems[8] =
951 {
952 tab[idx[0]],
953 tab[idx[0] + 1],
954 tab[idx[0] + 2],
955 tab[idx[0] + 3],
956 tab[idx[1]],
957 tab[idx[1] + 1],
958 tab[idx[1] + 2],
959 tab[idx[1] + 3]
960 };
961 vsetvlmax_e16m1();
962 return v_int16x8(vle16_v_i16m1(elems));
963 }
v_lut(const ushort * tab,const int * idx)964 inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); }
v_lut_pairs(const ushort * tab,const int * idx)965 inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); }
v_lut_quads(const ushort * tab,const int * idx)966 inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); }
967
v_lut(const int * tab,const int * idx)968 inline v_int32x4 v_lut(const int* tab, const int* idx)
969 {
970 int CV_DECL_ALIGNED(32) elems[4] =
971 {
972 tab[idx[0]],
973 tab[idx[1]],
974 tab[idx[2]],
975 tab[idx[3]]
976 };
977 vsetvlmax_e32m1();
978 return v_int32x4(vle32_v_i32m1(elems));
979 }
v_lut_pairs(const int * tab,const int * idx)980 inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
981 {
982 int CV_DECL_ALIGNED(32) elems[4] =
983 {
984 tab[idx[0]],
985 tab[idx[0] + 1],
986 tab[idx[1]],
987 tab[idx[1] + 1]
988 };
989 vsetvlmax_e32m1();
990 return v_int32x4(vle32_v_i32m1(elems));
991 }
// Quads variant for 4-lane vectors: one base index covers the whole vector,
// so load 4 consecutive ints directly from tab + idx[0].
inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
{
    vsetvlmax_e32m1();
    return v_int32x4(vle32_v_i32m1(tab + idx[0]));
}
997
v_lut(const unsigned * tab,const int * idx)998 inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); }
v_lut_pairs(const unsigned * tab,const int * idx)999 inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); }
v_lut_quads(const unsigned * tab,const int * idx)1000 inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); }
1001
v_lut(const int64_t * tab,const int * idx)1002 inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
1003 {
1004 int64_t CV_DECL_ALIGNED(32) elems[2] =
1005 {
1006 tab[idx[0]],
1007 tab[idx[1]]
1008 };
1009 vsetvlmax_e64m1();
1010 return v_int64x2(vle64_v_i64m1(elems));
1011 }
// Pairs variant for 2-lane vectors: one base index covers the whole vector,
// so load 2 consecutive int64 values directly from tab + idx[0].
inline v_int64x2 v_lut_pairs(const int64* tab, const int* idx)
{
    vsetvlmax_e64m1();
    return v_int64x2(vle64_v_i64m1(tab + idx[0]));
}
// Unsigned 64-bit LUT variants reuse the signed implementations and
// reinterpret the result.
inline v_uint64x2 v_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
inline v_uint64x2 v_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
1019
v_lut(const float * tab,const int * idx)1020 inline v_float32x4 v_lut(const float* tab, const int* idx)
1021 {
1022 float CV_DECL_ALIGNED(32) elems[4] =
1023 {
1024 tab[idx[0]],
1025 tab[idx[1]],
1026 tab[idx[2]],
1027 tab[idx[3]]
1028 };
1029 vsetvlmax_e32m1();
1030 return v_float32x4(vle32_v_f32m1(elems));
1031 }
v_lut_pairs(const float * tab,const int * idx)1032 inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
1033 {
1034 float CV_DECL_ALIGNED(32) elems[4] =
1035 {
1036 tab[idx[0]],
1037 tab[idx[0] + 1],
1038 tab[idx[1]],
1039 tab[idx[1] + 1]
1040 };
1041 vsetvlmax_e32m1();
1042 return v_float32x4(vle32_v_f32m1(elems));
1043 }
// Quads variant for 4-lane vectors: one base index covers the whole vector,
// so load 4 consecutive floats directly from tab + idx[0].
inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
{
    vsetvlmax_e32m1();
    return v_float32x4(vle32_v_f32m1(tab + idx[0]));
}
1049
// Vector-indexed gather: the four lane indices are extracted from idxvec,
// then the values are staged and loaded as one vector.
inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
    const int i0 = v_extract_n<0>(idxvec);
    const int i1 = v_extract_n<1>(idxvec);
    const int i2 = v_extract_n<2>(idxvec);
    const int i3 = v_extract_n<3>(idxvec);
    int CV_DECL_ALIGNED(32) elems[4] = { tab[i0], tab[i1], tab[i2], tab[i3] };
    vsetvlmax_e32m1();
    return v_int32x4(vle32_v_i32m1(elems));
}
1062
v_lut(const unsigned * tab,const v_int32x4 & idxvec)1063 inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
1064 {
1065 unsigned CV_DECL_ALIGNED(32) elems[4] =
1066 {
1067 tab[v_extract_n<0>(idxvec)],
1068 tab[v_extract_n<1>(idxvec)],
1069 tab[v_extract_n<2>(idxvec)],
1070 tab[v_extract_n<3>(idxvec)]
1071 };
1072 vsetvlmax_e32m1();
1073 return v_uint32x4(vle32_v_u32m1(elems));
1074 }
1075
v_lut(const float * tab,const v_int32x4 & idxvec)1076 inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
1077 {
1078 float CV_DECL_ALIGNED(32) elems[4] =
1079 {
1080 tab[v_extract_n<0>(idxvec)],
1081 tab[v_extract_n<1>(idxvec)],
1082 tab[v_extract_n<2>(idxvec)],
1083 tab[v_extract_n<3>(idxvec)]
1084 };
1085 vsetvlmax_e32m1();
1086 return v_float32x4(vle32_v_f32m1(elems));
1087 }
1088
// Gather interleaved (x, y) float pairs: x lanes come from tab[idx[i]] and
// y lanes from tab[idx[i] + 1], for the four indices held in idxvec.
inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);

    x = v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
    y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]);
}
1097
#if CV_SIMD128_64F
// Double-precision LUT: gather 2 doubles, lane i from tab[idx[i]].
inline v_float64x2 v_lut(const double* tab, const int* idx)
{
    double CV_DECL_ALIGNED(32) elems[2] =
    {
        tab[idx[0]],
        tab[idx[1]]
    };
    vsetvlmax_e64m1();
    return v_float64x2(vle64_v_f64m1(elems));
}

// Pairs variant for 2-lane vectors: one base index covers the whole vector,
// so load 2 consecutive doubles directly from tab + idx[0].
inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
{
    vsetvlmax_e64m1();
    return v_float64x2(vle64_v_f64m1(tab + idx[0]));
}

// Vector-indexed gather: only the first two lanes of idxvec are used.
inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
    double CV_DECL_ALIGNED(32) elems[2] =
    {
        tab[v_extract_n<0>(idxvec)],
        tab[v_extract_n<1>(idxvec)]
    };
    vsetvlmax_e64m1();
    return v_float64x2(vle64_v_f64m1(elems));
}

// Gather interleaved (x, y) double pairs; only the first two indices of
// idxvec are consumed (2-lane result).
inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
{
    int CV_DECL_ALIGNED(32) idx[4] = {0};
    v_store_aligned(idx, idxvec);

    x = v_float64x2(tab[idx[0]], tab[idx[1]]);
    y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
}
#endif
1136
1137 ////////////// Pack boolean ////////////////////
1138
// Pack two u16 mask vectors into one u8 vector: store both halves to a
// scratch buffer, reload as an m2 register group and narrow by taking the
// low byte of each element (vnsrl with shift 0).
inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
{
    ushort CV_DECL_ALIGNED(32) ptr[16] = {0};
    v_store(ptr, a);
    v_store(ptr + 8, b);
    vsetvlmax_e8m1();
    return v_uint8x16(vnsrl_wx_u8m1(vle16_v_u16m2(ptr), 0));
}
1147
// Pack four u32 mask vectors into one u8 vector: store all quarters to a
// scratch buffer, reload as an m4 register group and narrow twice
// (32 -> 16 -> 8 bits) via vnsrl with shift 0.
inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
                           const v_uint32x4& c, const v_uint32x4& d)
{
    unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
    v_store(ptr, a);
    v_store(ptr + 4, b);
    v_store(ptr + 8, c);
    v_store(ptr + 12, d);
    vsetvlmax_e8m1();
    return v_uint8x16(vnsrl_wx_u8m1(vnsrl_wx_u16m2(vle32_v_u32m4(ptr), 0), 0));
}
1159
// Pack eight u64 mask vectors into one u8 vector: store all eighths to a
// scratch buffer, reload as an m8 register group and narrow three times
// (64 -> 32 -> 16 -> 8 bits) via vnsrl with shift 0.
inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
                           const v_uint64x2& g, const v_uint64x2& h)
{
    uint64 CV_DECL_ALIGNED(32) ptr[16] = {0};
    v_store(ptr, a);
    v_store(ptr + 2, b);
    v_store(ptr + 4, c);
    v_store(ptr + 6, d);
    v_store(ptr + 8, e);
    v_store(ptr + 10, f);
    v_store(ptr + 12, g);
    v_store(ptr + 14, h);
    vsetvlmax_e8m1();
    return v_uint8x16(vnsrl_wx_u8m1(vnsrl_wx_u16m2(vnsrl_wx_u32m4(vle64_v_u64m8(ptr), 0), 0), 0));
}
1176
1177 ////////////// Arithmetics //////////////
// Defines 'operator bin_op' and 'operator bin_op=' for _Tpvec in terms of an
// RVV intrinsic. vsetvlmax_e<width>m1() must precede each intrinsic call:
// these non-explicit-vl intrinsics depend on the global vl/vtype CSR state.
#define OPENCV_HAL_IMPL_RVV_BIN_OP(bin_op, _Tpvec, intrin, width) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(intrin(a, b)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
    vsetvlmax_e##width##m1(); \
    a = _Tpvec(intrin(a, b)); \
    return a; \
}
1190
// 8/16-bit lanes: saturating add/sub (vsadd/vssub), truncating division.
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint8x16, vsaddu_vv_u8m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint8x16, vssubu_vv_u8m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint8x16, vdivu_vv_u8m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int8x16, vsadd_vv_i8m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int8x16, vssub_vv_i8m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int8x16, vdiv_vv_i8m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint16x8, vsaddu_vv_u16m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint16x8, vssubu_vv_u16m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint16x8, vdivu_vv_u16m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int16x8, vsadd_vv_i16m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int16x8, vssub_vv_i16m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int16x8, vdiv_vv_i16m1, 16)
// 32/64-bit lanes: plain modular arithmetic (vadd/vsub/vmul).
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint32x4, vadd_vv_u32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint32x4, vsub_vv_u32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_uint32x4, vmul_vv_u32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint32x4, vdivu_vv_u32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int32x4, vadd_vv_i32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int32x4, vsub_vv_i32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_int32x4, vmul_vv_i32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int32x4, vdiv_vv_i32m1, 32)
// Floating point: IEEE arithmetic (vfadd/vfsub/vfmul/vfdiv).
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_float32x4, vfadd_vv_f32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_float32x4, vfsub_vv_f32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_float32x4, vfmul_vv_f32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_float32x4, vfdiv_vv_f32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_uint64x2, vadd_vv_u64m1, 64)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_uint64x2, vsub_vv_u64m1, 64)
OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_uint64x2, vmul_vv_u64m1, 64)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_uint64x2, vdivu_vv_u64m1, 64)
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_int64x2, vadd_vv_i64m1, 64)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_int64x2, vsub_vv_i64m1, 64)
OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_int64x2, vmul_vv_i64m1, 64)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_int64x2, vdiv_vv_i64m1, 64)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_BIN_OP(+, v_float64x2, vfadd_vv_f64m1, 64)
OPENCV_HAL_IMPL_RVV_BIN_OP(-, v_float64x2, vfsub_vv_f64m1, 64)
OPENCV_HAL_IMPL_RVV_BIN_OP(*, v_float64x2, vfmul_vv_f64m1, 64)
OPENCV_HAL_IMPL_RVV_BIN_OP(/, v_float64x2, vfdiv_vv_f64m1, 64)
#endif
1229
1230
1231 ////////////// Bitwise logic //////////////
1232
// Defines &, |, ^ (plus compound forms) via OPENCV_HAL_IMPL_RVV_BIN_OP and
// unary ~ via vnot, for one integer vector type.
#define OPENCV_HAL_IMPL_RVV_LOGIC_OP(_Tpvec, suffix, width) \
OPENCV_HAL_IMPL_RVV_BIN_OP(&, _Tpvec, vand_vv_##suffix##m1, width) \
OPENCV_HAL_IMPL_RVV_BIN_OP(|, _Tpvec, vor_vv_##suffix##m1, width) \
OPENCV_HAL_IMPL_RVV_BIN_OP(^, _Tpvec, vxor_vv_##suffix##m1, width) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vnot_v_##suffix##m1(a)); \
}
1242
// Bitwise logic for all integer vector types.
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint8x16, u8, 8)
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int8x16, i8, 8)
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint16x8, u16, 16)
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int16x8, i16, 16)
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint32x4, u32, 32)
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int32x4, i32, 32)
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_uint64x2, u64, 64)
OPENCV_HAL_IMPL_RVV_LOGIC_OP(v_int64x2, i64, 64)
1251
// Bitwise ops on float lanes: reinterpret to i32, apply the integer
// intrinsic, reinterpret back (no value conversion, pure bit pattern).
#define OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(bin_op, intrin) \
inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \
{ \
    vsetvlmax_e32m1(); \
    return v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a), vreinterpret_v_f32m1_i32m1(b)))); \
} \
inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \
{ \
    vsetvlmax_e32m1(); \
    a = v_float32x4(vreinterpret_v_i32m1_f32m1(intrin(vreinterpret_v_f32m1_i32m1(a), vreinterpret_v_f32m1_i32m1(b)))); \
    return a; \
}
1264
OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(&, vand_vv_i32m1)
OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(|, vor_vv_i32m1)
OPENCV_HAL_IMPL_RVV_FLT_BIT_OP(^, vxor_vv_i32m1)

// Bitwise NOT on float lanes via integer reinterpretation.
inline v_float32x4 operator ~ (const v_float32x4& a)
{
    vsetvlmax_e32m1();
    return v_float32x4(vreinterpret_v_i32m1_f32m1(vnot_v_i32m1(vreinterpret_v_f32m1_i32m1(a))));
}
1274
#if CV_SIMD128_64F
// Same bit-pattern trick for f64 lanes: route through i64 reinterprets.
#define OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(bin_op, intrin) \
inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \
{ \
    vsetvlmax_e64m1(); \
    return v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a), vreinterpret_v_f64m1_i64m1(b)))); \
} \
inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \
{ \
    vsetvlmax_e64m1(); \
    a = v_float64x2(vreinterpret_v_i64m1_f64m1(intrin(vreinterpret_v_f64m1_i64m1(a), vreinterpret_v_f64m1_i64m1(b)))); \
    return a; \
}

OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(&, vand_vv_i64m1)
OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(|, vor_vv_i64m1)
OPENCV_HAL_IMPL_RVV_FLT64_BIT_OP(^, vxor_vv_i64m1)

// Bitwise NOT on f64 lanes via integer reinterpretation.
inline v_float64x2 operator ~ (const v_float64x2& a)
{
    vsetvlmax_e64m1();
    return v_float64x2(vreinterpret_v_i64m1_f64m1(vnot_v_i64m1(vreinterpret_v_f64m1_i64m1(a))));
}
#endif
1299
1300 ////////////// Bitwise shifts //////////////
1301
// Shift operators. Unsigned types use logical right shift (vsrl); signed
// types below use arithmetic right shift (vsra). Both runtime (operator)
// and compile-time (v_shl/v_shr template) forms are provided.
#define OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(_Tpvec, suffix, width) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n))); \
} \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vsrl_vx_##suffix##m1(a, uint8_t(n))); \
} \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n))); \
} \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vsrl_vx_##suffix##m1(a, uint8_t(n))); \
}

// Signed variant: right shifts are arithmetic (sign-extending, vsra).
#define OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(_Tpvec, suffix, width) \
inline _Tpvec operator << (const _Tpvec& a, int n) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n))); \
} \
inline _Tpvec operator >> (const _Tpvec& a, int n) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vsra_vx_##suffix##m1(a, uint8_t(n))); \
} \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vsll_vx_##suffix##m1(a, uint8_t(n))); \
} \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vsra_vx_##suffix##m1(a, uint8_t(n))); \
}
1345
// Shifts for all integer vector types.
OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint8x16, u8, 8)
OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint16x8, u16, 16)
OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint32x4, u32, 32)
OPENCV_HAL_IMPL_RVV_UNSIGNED_SHIFT_OP(v_uint64x2, u64, 64)
OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int8x16, i8, 8)
OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int16x8, i16, 16)
OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int32x4, i32, 32)
OPENCV_HAL_IMPL_RVV_SIGNED_SHIFT_OP(v_int64x2, i64, 64)
1354
1355
1356 ////////////// Comparison //////////////
1357
// Comparison operators. RVV compares produce a mask register; vmerge
// expands it to a full-width all-ones/all-zeros vector (merge of -1 over a
// zero vector) to match the universal-intrinsics mask convention.
#define OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, op, intrin, suffix, width) \
inline _Tpvec operator op (const _Tpvec& a, const _Tpvec& b) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vmerge_vxm_##suffix##m1(intrin(a, b), vzero_##suffix##m1(), 1)); \
}

// Float version: vfmerge writes a float scalar into masked lanes.
#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, op, intrin, suffix, width) \
inline _Tpvec operator op (const _Tpvec& a, const _Tpvec& b) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vfmerge_vfm_##suffix##m1(intrin(a, b), vzero_##suffix##m1(), 1)); \
}

// Full comparison suites: unsigned uses vmsltu/vmsgtu/..., signed uses
// vmslt/vmsgt/..., float uses vmflt/vmfgt/... (IEEE semantics).
#define OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(_Tpvec, suffix, width) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ==, vmseq_vv_##suffix##m1_b##width, suffix, width) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, !=, vmsne_vv_##suffix##m1_b##width, suffix, width) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <, vmsltu_vv_##suffix##m1_b##width, suffix, width) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >, vmsgtu_vv_##suffix##m1_b##width, suffix, width) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <=, vmsleu_vv_##suffix##m1_b##width, suffix, width) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >=, vmsgeu_vv_##suffix##m1_b##width, suffix, width)

#define OPENCV_HAL_IMPL_RVV_SIGNED_CMP(_Tpvec, suffix, width) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, ==, vmseq_vv_##suffix##m1_b##width, suffix, width) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, !=, vmsne_vv_##suffix##m1_b##width, suffix, width) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <, vmslt_vv_##suffix##m1_b##width, suffix, width) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >, vmsgt_vv_##suffix##m1_b##width, suffix, width) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, <=, vmsle_vv_##suffix##m1_b##width, suffix, width) \
OPENCV_HAL_IMPL_RVV_INT_CMP_OP(_Tpvec, >=, vmsge_vv_##suffix##m1_b##width, suffix, width)

#define OPENCV_HAL_IMPL_RVV_FLOAT_CMP(_Tpvec, suffix, width) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, ==, vmfeq_vv_##suffix##m1_b##width, suffix, width) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, !=, vmfne_vv_##suffix##m1_b##width, suffix, width) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, <, vmflt_vv_##suffix##m1_b##width, suffix, width) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, >, vmfgt_vv_##suffix##m1_b##width, suffix, width) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, <=, vmfle_vv_##suffix##m1_b##width, suffix, width) \
OPENCV_HAL_IMPL_RVV_FLOAT_CMP_OP(_Tpvec, >=, vmfge_vv_##suffix##m1_b##width, suffix, width)
1395
1396
// Comparison suites for every vector type.
OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint8x16, u8, 8)
OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint16x8, u16, 16)
OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint32x4, u32, 32)
OPENCV_HAL_IMPL_RVV_UNSIGNED_CMP(v_uint64x2, u64, 64)
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int8x16, i8, 8)
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int16x8, i16, 16)
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int32x4, i32, 32)
OPENCV_HAL_IMPL_RVV_SIGNED_CMP(v_int64x2, i64, 64)
OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float32x4, f32, 32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_FLOAT_CMP(v_float64x2, f64, 64)
#endif
1409
// a == a is false only for NaN lanes (IEEE-754), so this yields an
// all-ones mask for every non-NaN lane.
inline v_float32x4 v_not_nan(const v_float32x4& a)
{ return a == a; }

#if CV_SIMD128_64F
inline v_float64x2 v_not_nan(const v_float64x2& a)
{ return a == a; }
#endif
1417
1418 ////////////// Min/Max //////////////
1419
// Defines a named two-operand function (v_min, v_max, v_add_wrap, ...) as a
// thin wrapper over one RVV intrinsic, with the required vsetvl first.
#define OPENCV_HAL_IMPL_RVV_BIN_FUNC(_Tpvec, func, intrin, width) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(intrin(a, b)); \
}
1426
// Elementwise min/max for all types (vminu/vmaxu unsigned, vmin/vmax
// signed, vfmin/vfmax float).
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_min, vminu_vv_u8m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_max, vmaxu_vv_u8m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_min, vmin_vv_i8m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_max, vmax_vv_i8m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_min, vminu_vv_u16m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_max, vmaxu_vv_u16m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_min, vmin_vv_i16m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_max, vmax_vv_i16m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32x4, v_min, vminu_vv_u32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint32x4, v_max, vmaxu_vv_u32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32x4, v_min, vmin_vv_i32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int32x4, v_max, vmax_vv_i32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32x4, v_min, vfmin_vv_f32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float32x4, v_max, vfmax_vv_f32m1, 32)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64x2, v_min, vminu_vv_u64m1, 64)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint64x2, v_max, vmaxu_vv_u64m1, 64)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64x2, v_min, vmin_vv_i64m1, 64)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int64x2, v_max, vmax_vv_i64m1, 64)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64x2, v_min, vfmin_vv_f64m1, 64)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_float64x2, v_max, vfmax_vv_f64m1, 64)
#endif
1449
1450 ////////////// Arithmetics wrap //////////////
1451
// Wrapping (modular, non-saturating) 8/16-bit arithmetic: plain
// vadd/vsub/vmul, unlike the saturating operator+/- above.
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_add_wrap, vadd_vv_u8m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_add_wrap, vadd_vv_i8m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_add_wrap, vadd_vv_u16m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_add_wrap, vadd_vv_i16m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_sub_wrap, vsub_vv_u8m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_sub_wrap, vsub_vv_i8m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_sub_wrap, vsub_vv_u16m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_sub_wrap, vsub_vv_i16m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint8x16, v_mul_wrap, vmul_vv_u8m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int8x16, v_mul_wrap, vmul_vv_i8m1, 8)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_uint16x8, v_mul_wrap, vmul_vv_u16m1, 16)
OPENCV_HAL_IMPL_RVV_BIN_FUNC(v_int16x8, v_mul_wrap, vmul_vv_i16m1, 16)
1464
1465 ////////////// Reduce //////////////
1466
// Horizontal sum: reduce into a widened accumulator (vwredsum widens
// 8->16, 16->32, 32->64 to avoid overflow; 64-bit and float reduce at the
// same width), then extract lane 0 of the result.
#define OPENCV_HAL_IMPL_RVV_REDUCE_SUM(_Tpvec, _wTpvec, _nwTpvec, scalartype, suffix, wsuffix, wwidth, red) \
inline scalartype v_reduce_sum(const _Tpvec& a) \
{ \
    vsetvlmax_e##wwidth##m1(); \
    _nwTpvec zero = vzero_##wsuffix##m1(); \
    _nwTpvec res = vzero_##wsuffix##m1(); \
    res = v##red##_vs_##suffix##m1_##wsuffix##m1(res, a, zero); \
    return (scalartype)(_wTpvec(res).get0()); \
}
1476
// v_reduce_sum for all types; narrow integer types widen before summing.
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint8x16, v_uint16x8, vuint16m1_t, unsigned, u8, u16, 16, wredsumu)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int8x16, v_int16x8, vint16m1_t, int, i8, i16, 16, wredsum)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint16x8, v_uint32x4, vuint32m1_t, unsigned, u16, u32, 32, wredsumu)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int16x8, v_int32x4, vint32m1_t, int, i16, i32, 32, wredsum)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint32x4, v_uint64x2, vuint64m1_t, unsigned, u32, u64, 64, wredsumu)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int32x4, v_int64x2, vint64m1_t, int, i32, i64, 64, wredsum)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_float32x4, v_float32x4, vfloat32m1_t, float, f32, f32, 32, fredsum)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_uint64x2, v_uint64x2, vuint64m1_t, uint64, u64, u64, 64, redsum)
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_int64x2, v_int64x2, vint64m1_t, int64, i64, i64, 64, redsum)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_REDUCE_SUM(v_float64x2, v_float64x2, vfloat64m1_t, double, f64, f64, 64, fredsum)
#endif
1489
1490
// Horizontal min/max: vredmin/vredmax reduce all lanes into lane 0 of the
// destination (here seeded with 'a' itself), then extract lane 0.
#define OPENCV_HAL_IMPL_RVV_REDUCE(_Tpvec, func, scalartype, suffix, width, red) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    vsetvlmax_e##width##m1(); \
    _Tpvec res = _Tpvec(v##red##_vs_##suffix##m1_##suffix##m1(a, a, a)); \
    return scalartype(res.get0()); \
}
1498
// v_reduce_min / v_reduce_max for 8/16/32-bit integer and f32 types.
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8x16, min, uchar, u8, 8, redminu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int8x16, min, schar, i8, 8, redmin)
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16x8, min, ushort, u16, 16, redminu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int16x8, min, short, i16, 16, redmin)
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32x4, min, unsigned, u32, 32, redminu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int32x4, min, int, i32, 32, redmin)
OPENCV_HAL_IMPL_RVV_REDUCE(v_float32x4, min, float, f32, 32, fredmin)
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint8x16, max, uchar, u8, 8, redmaxu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int8x16, max, schar, i8, 8, redmax)
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint16x8, max, ushort, u16, 16, redmaxu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int16x8, max, short, i16, 16, redmax)
OPENCV_HAL_IMPL_RVV_REDUCE(v_uint32x4, max, unsigned, u32, 32, redmaxu)
OPENCV_HAL_IMPL_RVV_REDUCE(v_int32x4, max, int, i32, 32, redmax)
OPENCV_HAL_IMPL_RVV_REDUCE(v_float32x4, max, float, f32, 32, fredmax)
1513
1514
// Sum each input vector and pack the four scalar sums into one vector:
// lane 0 = sum(a), lane 1 = sum(b), lane 2 = sum(c), lane 3 = sum(d).
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
    float CV_DECL_ALIGNED(32) sums[4];
    sums[0] = v_reduce_sum(a);
    sums[1] = v_reduce_sum(b);
    sums[2] = v_reduce_sum(c);
    sums[3] = v_reduce_sum(d);
    vsetvlmax_e32m1();
    return v_float32x4(vle32_v_f32m1(sums));
}
1528
1529 ////////////// Square-Root //////////////
1530
// Elementwise single-precision square root.
inline v_float32x4 v_sqrt(const v_float32x4& x)
{
    vsetvlmax_e32m1();
    return v_float32x4(vfsqrt_v_f32m1(x));
}
1536
// Elementwise reciprocal square root: divide a vector of ones by sqrt(x).
inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    return v_setall_f32(1.0f) / v_sqrt(x);
}
1542
1543 #if CV_SIMD128_64F
// Elementwise double-precision square root.
inline v_float64x2 v_sqrt(const v_float64x2& x)
{
    vsetvlmax_e64m1();
    return v_float64x2(vfsqrt_v_f64m1(x));
}
1549
// Elementwise double-precision reciprocal square root: 1 / sqrt(x).
inline v_float64x2 v_invsqrt(const v_float64x2& x)
{
    // Use a double literal here — the original passed 1.0f (a float literal)
    // into the double-precision v_setall_f64. The converted value is
    // identical, but the literal should match the lane type.
    v_float64x2 one = v_setall_f64(1.0);
    return one / v_sqrt(x);
}
1555 #endif
1556
// Elementwise sqrt(a*a + b*b), with the sum of squares fused via vfmacc.
inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    vsetvlmax_e32m1();
    v_float32x4 x(vfmacc_vv_f32m1(vfmul_vv_f32m1(a, a), b, b));
    return v_sqrt(x);
}

// Elementwise a*a + b*b (squared magnitude, no square root).
inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b)
{
    vsetvlmax_e32m1();
    return v_float32x4(vfmacc_vv_f32m1(vfmul_vv_f32m1(a, a), b, b));
}
1569
#if CV_SIMD128_64F
// Double-precision counterparts of v_magnitude / v_sqr_magnitude.
inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b)
{
    vsetvlmax_e64m1();
    v_float64x2 x(vfmacc_vv_f64m1(vfmul_vv_f64m1(a, a), b, b));
    return v_sqrt(x);
}

inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b)
{
    vsetvlmax_e64m1();
    return v_float64x2(vfmacc_vv_f64m1(vfmul_vv_f64m1(a, a), b, b));
}
#endif
1584
1585 ////////////// Multiply-Add //////////////
1586
// Fused multiply-add: a * b + c in a single vfmacc (one rounding for f32).
inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    vsetvlmax_e32m1();
    return v_float32x4(vfmacc_vv_f32m1(c, a, b));
}
// Integer multiply-add: a * b + c via vmacc (modular arithmetic).
inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    vsetvlmax_e32m1();
    return v_int32x4(vmacc_vv_i32m1(c, a, b));
}

// v_muladd is an alias of v_fma on this backend (fused on RVV hardware).
inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    return v_fma(a, b, c);
}

inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_fma(a, b, c);
}
1607
#if CV_SIMD128_64F
// Double-precision fused multiply-add: a * b + c via vfmacc.
inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    vsetvlmax_e64m1();
    return v_float64x2(vfmacc_vv_f64m1(c, a, b));
}

// v_muladd is an alias of v_fma on this backend.
inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    return v_fma(a, b, c);
}
#endif
1620
1621 ////////////// Check all/any //////////////
1622
// v_check_all / v_check_any inspect the sign (most significant) bit of each
// lane: shifting right by (lane_bits - 1) leaves just that bit. The result
// is viewed as two u64 words; all-lanes-set <=> the NOT-ed, shifted vector
// is zero, any-lane-set <=> the shifted vector is nonzero.
#define OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(_Tpvec, suffix, shift, width) \
inline bool v_check_all(const _Tpvec& a) \
{ \
    vsetvlmax_e##width##m1(); \
    v_uint64x2 v = v_uint64x2((vuint64m1_t)vsrl_vx_##suffix##m1(vnot_v_##suffix##m1(a), shift)); \
    return (v.val[0] | v.val[1]) == 0; \
} \
inline bool v_check_any(const _Tpvec& a) \
{ \
    vsetvlmax_e##width##m1(); \
    v_uint64x2 v = v_uint64x2((vuint64m1_t)vsrl_vx_##suffix##m1(a, shift)); \
    return (v.val[0] | v.val[1]) != 0; \
}

OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint8x16, u8, 7, 8)
OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint16x8, u16, 15, 16)
OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint32x4, u32, 31, 32)
OPENCV_HAL_IMPL_RVV_CHECK_ALLANY(v_uint64x2, u64, 63, 64)
1641
1642
// Signed/float variants forward to the unsigned implementation of the same
// lane width — the check only looks at each lane's sign bit, which the
// reinterpret preserves.
inline bool v_check_all(const v_int8x16& a)
{ return v_check_all(v_reinterpret_as_u8(a)); }
inline bool v_check_any(const v_int8x16& a)
{ return v_check_any(v_reinterpret_as_u8(a)); }

inline bool v_check_all(const v_int16x8& a)
{ return v_check_all(v_reinterpret_as_u16(a)); }
inline bool v_check_any(const v_int16x8& a)
{ return v_check_any(v_reinterpret_as_u16(a)); }

inline bool v_check_all(const v_int32x4& a)
{ return v_check_all(v_reinterpret_as_u32(a)); }
inline bool v_check_any(const v_int32x4& a)
{ return v_check_any(v_reinterpret_as_u32(a)); }

inline bool v_check_all(const v_float32x4& a)
{ return v_check_all(v_reinterpret_as_u32(a)); }
inline bool v_check_any(const v_float32x4& a)
{ return v_check_any(v_reinterpret_as_u32(a)); }

inline bool v_check_all(const v_int64x2& a)
{ return v_check_all(v_reinterpret_as_u64(a)); }
inline bool v_check_any(const v_int64x2& a)
{ return v_check_any(v_reinterpret_as_u64(a)); }

#if CV_SIMD128_64F
inline bool v_check_all(const v_float64x2& a)
{ return v_check_all(v_reinterpret_as_u64(a)); }
inline bool v_check_any(const v_float64x2& a)
{ return v_check_any(v_reinterpret_as_u64(a)); }
#endif
1674
1675 ////////////// abs //////////////
1676
// Absolute difference as max(a,b) - min(a,b). For the unsigned/float
// instantiations below the subtraction is exact; for the signed 8/16-bit
// v_absdiffs instantiations operator- is saturating (vssub), giving the
// documented saturating behavior.
#define OPENCV_HAL_IMPL_RVV_ABSDIFF(_Tpvec, abs) \
inline _Tpvec v_##abs(const _Tpvec& a, const _Tpvec& b) \
{ \
    return v_max(a, b) - v_min(a, b); \
}

OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint8x16, absdiff)
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint16x8, absdiff)
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_uint32x4, absdiff)
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float32x4, absdiff)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_float64x2, absdiff)
#endif
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int8x16, absdiffs)
OPENCV_HAL_IMPL_RVV_ABSDIFF(v_int16x8, absdiffs)
1692
// Signed absolute difference returning the *unsigned* vector type:
// max(a,b) - min(a,b) is computed exactly in a double-width (LMUL=2) register
// via a widening subtract, then narrowed back with an unsigned clip
// (shift amount 0), so the full range of |a - b| is representable.
#define OPENCV_HAL_IMPL_RVV_ABSDIFF_S(_Tpvec, _rTpvec, _nwTpvec, sub, rshr, width) \
inline _rTpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
{ \
    vsetvlmax_e##width##m1(); \
    return _rTpvec(rshr((_nwTpvec)sub(v_max(a, b), v_min(a, b)), 0)); \
}

OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int8x16, v_uint8x16, vuint16m2_t, vwsub_vv_i16m2, vnclipu_wx_u8m1, 8)
OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int16x8, v_uint16x8, vuint32m2_t, vwsub_vv_i32m2, vnclipu_wx_u16m1, 16)
OPENCV_HAL_IMPL_RVV_ABSDIFF_S(v_int32x4, v_uint32x4, vuint64m2_t, vwsub_vv_i64m2, vnclipu_wx_u32m1, 32)
1703
// Absolute value as a special case of absolute difference against zero.
// For the signed integer types this returns the corresponding unsigned
// vector type (matching the universal-intrinsics contract for v_abs).
#define OPENCV_HAL_IMPL_RVV_ABS(_Tprvec, _Tpvec, suffix) \
inline _Tprvec v_abs(const _Tpvec& a) \
{ \
    return v_absdiff(a, v_setzero_##suffix()); \
}

OPENCV_HAL_IMPL_RVV_ABS(v_uint8x16, v_int8x16, s8)
OPENCV_HAL_IMPL_RVV_ABS(v_uint16x8, v_int16x8, s16)
OPENCV_HAL_IMPL_RVV_ABS(v_uint32x4, v_int32x4, s32)
OPENCV_HAL_IMPL_RVV_ABS(v_float32x4, v_float32x4, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_ABS(v_float64x2, v_float64x2, f64)
#endif
1717
1718
// Sum of absolute differences reduced to a scalar:
// v_reduce_sad(a, b) == sum over lanes of |a_i - b_i|.
// Built on v_absdiff + v_reduce_sum defined above/earlier in this file.
#define OPENCV_HAL_IMPL_RVV_REDUCE_SAD(_Tpvec, scalartype) \
inline scalartype v_reduce_sad(const _Tpvec& a, const _Tpvec& b) \
{ \
    return v_reduce_sum(v_absdiff(a, b)); \
}

OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint8x16, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int8x16, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint16x8, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int16x8, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_uint32x4, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_int32x4, unsigned)
OPENCV_HAL_IMPL_RVV_REDUCE_SAD(v_float32x4, float)
1732
1733 ////////////// Select //////////////
1734
// Per-lane select: lanes where `mask` is non-zero take the value from `a`,
// all other lanes take `b`.  A boolean mask register is produced with a
// not-equal-to-zero compare and fed to vmerge, which (per the RVV intrinsic
// semantics) picks the second vector operand where the mask bit is set --
// hence the (b, a) operand order.
#define OPENCV_HAL_IMPL_RVV_SELECT(_Tpvec, merge, ne, width) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(merge(ne(mask, 0), b, a)); \
}

OPENCV_HAL_IMPL_RVV_SELECT(v_uint8x16, vmerge_vvm_u8m1, vmsne_vx_u8m1_b8, 8)
OPENCV_HAL_IMPL_RVV_SELECT(v_int8x16, vmerge_vvm_i8m1, vmsne_vx_i8m1_b8, 8)
OPENCV_HAL_IMPL_RVV_SELECT(v_uint16x8, vmerge_vvm_u16m1, vmsne_vx_u16m1_b16, 16)
OPENCV_HAL_IMPL_RVV_SELECT(v_int16x8, vmerge_vvm_i16m1, vmsne_vx_i16m1_b16, 16)
OPENCV_HAL_IMPL_RVV_SELECT(v_uint32x4, vmerge_vvm_u32m1, vmsne_vx_u32m1_b32, 32)
OPENCV_HAL_IMPL_RVV_SELECT(v_int32x4, vmerge_vvm_i32m1, vmsne_vx_i32m1_b32, 32)
OPENCV_HAL_IMPL_RVV_SELECT(v_float32x4, vmerge_vvm_f32m1, vmfne_vf_f32m1_b32, 32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_SELECT(v_float64x2, vmerge_vvm_f64m1, vmfne_vf_f64m1_b64, 64)
#endif
1752
1753 ////////////// Rotate shift //////////////
1754
// Lane rotation (shift of whole lanes, vacated lanes filled with zero).
// Single-vector forms use vslidedown/vslideup with a zero vector as the
// destination so shifted-out positions become 0.  Two-vector forms stitch
// the tail of one input onto the head of the other by composing a slide-down
// with a slide-up of _Tpvec::nlanes - n.
// The n == 0 specializations of v_rotate_left return the input unchanged
// (avoiding a slide with a degenerate offset).
#define OPENCV_HAL_IMPL_RVV_ROTATE_OP(_Tpvec, suffix, width) \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), a, n)); \
} \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vslideup_vx_##suffix##m1(vzero_##suffix##m1(), a, n)); \
} \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \
{ return a; } \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), a, n), b, _Tpvec::nlanes - n)); \
} \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ \
    vsetvlmax_e##width##m1(); \
    return _Tpvec(vslideup_vx_##suffix##m1(vslidedown_vx_##suffix##m1(vzero_##suffix##m1(), b, _Tpvec::nlanes - n), a, n)); \
} \
template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \
{ CV_UNUSED(b); return a; }


OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_uint8x16, u8, 8)
OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_int8x16, i8, 8)
OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_uint16x8, u16, 16)
OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_int16x8, i16, 16)
OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_uint32x4, u32, 32)
OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_int32x4, i32, 32)
OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_float32x4, f32, 32)
OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_uint64x2, u64, 64)
OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_int64x2, i64, 64)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_ROTATE_OP(v_float64x2, f64, 64)
#endif
1794
1795 ////////////// Convert to float //////////////
1796
// Convert four 32-bit signed integers to float32 (per-lane int->float,
// rounded per the current rounding mode).
inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{
    vsetvlmax_e32m1();
    return v_float32x4(vfcvt_f_x_v_f32m1(a));
}
1802
#if CV_SIMD128_64F
// f64 -> f32 narrowing: the two double lanes are spilled to a 4-element
// scalar buffer (upper half zero-padded), reloaded as an LMUL=2 f64 register
// and narrowed in one vfncvt.  The upper two lanes of the result are 0.
inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{
    double arr[4] = {a.val[0], a.val[1], 0, 0};
    vsetvlmax_e64m2();
    vfloat64m2_t tmp = vle64_v_f64m2(arr);
    vsetvlmax_e32m1();
    return v_float32x4(vfncvt_f_f_w_f32m1(tmp));
}

// Same as above but packs two double vectors into the four float lanes:
// result = { a0, a1, b0, b1 } converted to float32.
inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{
    double arr[4] = {a.val[0], a.val[1], b.val[0], b.val[1]};
    vsetvlmax_e64m2();
    vfloat64m2_t tmp = vle64_v_f64m2(arr);
    vsetvlmax_e32m1();
    return v_float32x4(vfncvt_f_f_w_f32m1(tmp));
}

// i32 -> f64 widening of the LOW two lanes: widen all four lanes into an
// LMUL=2 f64 register, spill to memory, and reload lanes 0..1.
inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
    double CV_DECL_ALIGNED(32) ptr[4] = {0};
    vsetvlmax_e64m2();
    vse64_v_f64m2(ptr, vfwcvt_f_x_v_f64m2(a));
    double CV_DECL_ALIGNED(32) elems[2] =
    {
        ptr[0], ptr[1]
    };
    vsetvlmax_e64m1();
    return v_float64x2(vle64_v_f64m1(elems));
}

// i32 -> f64 widening of the HIGH two lanes (lanes 2..3 of the input).
inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{
    double CV_DECL_ALIGNED(32) ptr[4] = {0};
    vsetvlmax_e64m2();
    vse64_v_f64m2(ptr, vfwcvt_f_x_v_f64m2(a));
    double CV_DECL_ALIGNED(32) elems[2] =
    {
        ptr[2], ptr[3]
    };
    vsetvlmax_e64m1();
    return v_float64x2(vle64_v_f64m1(elems));
}

// f32 -> f64 widening of the LOW two lanes.
inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{
    double CV_DECL_ALIGNED(32) ptr[4] = {0};
    vsetvlmax_e64m2();
    vse64_v_f64m2(ptr, vfwcvt_f_f_v_f64m2(a));
    double CV_DECL_ALIGNED(32) elems[2] =
    {
        ptr[0], ptr[1]
    };
    vsetvlmax_e64m1();
    return v_float64x2(vle64_v_f64m1(elems));
}

// f32 -> f64 widening of the HIGH two lanes.
inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{
    double CV_DECL_ALIGNED(32) ptr[4] = {0};
    vsetvlmax_e64m2();
    vse64_v_f64m2(ptr, vfwcvt_f_f_v_f64m2(a));
    double CV_DECL_ALIGNED(32) elems[2] =
    {
        ptr[2], ptr[3]
    };
    vsetvlmax_e64m1();
    return v_float64x2(vle64_v_f64m1(elems));
}

// i64 -> f64, same-width per-lane conversion (may round for |x| >= 2^53).
inline v_float64x2 v_cvt_f64(const v_int64x2& a)
{
    vsetvlmax_e64m1();
    return v_float64x2(vfcvt_f_x_v_f64m1(a));
}
#endif
1880
1881 ////////////// Broadcast //////////////
1882
// Broadcast lane i (compile-time constant) to all lanes:
// extract the scalar with v_extract_n and splat it with v_setall_*.
#define OPENCV_HAL_IMPL_RVV_BROADCAST(_Tpvec, suffix) \
template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) \
{ \
    return v_setall_##suffix(v_extract_n<i>(v)); \
}

OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint8x16, u8)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_int8x16, s8)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint16x8, u16)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_int16x8, s16)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint32x4, u32)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_int32x4, s32)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_uint64x2, u64)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_int64x2, s64)
OPENCV_HAL_IMPL_RVV_BROADCAST(v_float32x4, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_BROADCAST(v_float64x2, f64)
#endif
1901
1902 ////////////// Transpose4x4 //////////////
1903
// 4x4 matrix transpose of four 4-lane vectors: b_j collects lane j of each
// input a0..a3.  Implemented purely with scalar extraction (v_extract_n with
// a compile-time index) into aligned stack buffers, then vector reloads --
// no gather/permute intrinsic is used.
#define OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(_Tpvec, _Tp, suffix) \
inline void v_transpose4x4(const v_##_Tpvec& a0, const v_##_Tpvec& a1, \
                           const v_##_Tpvec& a2, const v_##_Tpvec& a3, \
                           v_##_Tpvec& b0, v_##_Tpvec& b1, \
                           v_##_Tpvec& b2, v_##_Tpvec& b3) \
{ \
    _Tp CV_DECL_ALIGNED(32) elems0[4] = \
    { \
        v_extract_n<0>(a0), \
        v_extract_n<0>(a1), \
        v_extract_n<0>(a2), \
        v_extract_n<0>(a3) \
    }; \
    b0 = v_load(elems0); \
    _Tp CV_DECL_ALIGNED(32) elems1[4] = \
    { \
        v_extract_n<1>(a0), \
        v_extract_n<1>(a1), \
        v_extract_n<1>(a2), \
        v_extract_n<1>(a3) \
    }; \
    b1 = v_load(elems1); \
    _Tp CV_DECL_ALIGNED(32) elems2[4] = \
    { \
        v_extract_n<2>(a0), \
        v_extract_n<2>(a1), \
        v_extract_n<2>(a2), \
        v_extract_n<2>(a3) \
    }; \
    b2 = v_load(elems2); \
    _Tp CV_DECL_ALIGNED(32) elems3[4] = \
    { \
        v_extract_n<3>(a0), \
        v_extract_n<3>(a1), \
        v_extract_n<3>(a2), \
        v_extract_n<3>(a3) \
    }; \
    b3 = v_load(elems3); \
}

OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(int32x4, int, i32)
OPENCV_HAL_IMPL_RVV_TRANSPOSE4x4(float32x4, float, f32)
1947
1948 ////////////// Reverse //////////////
1949
1950 #define OPENCV_HAL_IMPL_RVV_REVERSE(_Tpvec, _Tp, width, suffix) \
1951 inline _Tpvec v_reverse(const _Tpvec& a) \
1952 { \
1953 _Tp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \
1954 _Tp CV_DECL_ALIGNED(32) ptra[_Tpvec::nlanes] = {0}; \
1955 v_store(ptra, a); \
1956 for (int i = 0; i < _Tpvec::nlanes; i++) \
1957 { \
1958 ptr[i] = ptra[_Tpvec::nlanes-i-1]; \
1959 } \
1960 return v_load(ptr); \
1961 }
1962
1963 OPENCV_HAL_IMPL_RVV_REVERSE(v_uint8x16, uchar, 8, u8)
1964 OPENCV_HAL_IMPL_RVV_REVERSE(v_int8x16, schar, 8, i8)
1965 OPENCV_HAL_IMPL_RVV_REVERSE(v_uint16x8, ushort, 16, u16)
1966 OPENCV_HAL_IMPL_RVV_REVERSE(v_int16x8, short, 16, i16)
1967 OPENCV_HAL_IMPL_RVV_REVERSE(v_uint32x4, unsigned, 32, u32)
1968 OPENCV_HAL_IMPL_RVV_REVERSE(v_int32x4, int, 32, i32)
1969 OPENCV_HAL_IMPL_RVV_REVERSE(v_float32x4, float, 32, f32)
1970 OPENCV_HAL_IMPL_RVV_REVERSE(v_uint64x2, uint64, 64, u64)
1971 OPENCV_HAL_IMPL_RVV_REVERSE(v_int64x2, int64, 64, i64)
1972 #if CV_SIMD128_64F
1973 OPENCV_HAL_IMPL_RVV_REVERSE(v_float64x2, double, 64, f64)
1974 #endif
1975
1976 //////////// Value reordering ////////////
1977
// Widening expansion: split a vector into low/high halves and widen each
// element to twice its width.  The halves are spilled with
// v_store_low/v_store_high, reloaded into an LMUL=1/2 (mf2) register of the
// narrow type, and widened with a vwcvt into a full m1 register.
// v_load_expand reads nlanes/2 narrow elements directly from memory.
#define OPENCV_HAL_IMPL_RVV_EXPAND(_Tpwvec, _Tp, _Tpvec, width, suffix, wcvt) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
{ \
    _Tp CV_DECL_ALIGNED(32) lptr[_Tpvec::nlanes/2] = {0}; \
    _Tp CV_DECL_ALIGNED(32) hptr[_Tpvec::nlanes/2] = {0}; \
    v_store_low(lptr, a); \
    v_store_high(hptr, a); \
    b0 = _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr))); \
    b1 = _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(hptr))); \
} \
inline _Tpwvec v_expand_low(const _Tpvec& a) \
{ \
    _Tp CV_DECL_ALIGNED(32) lptr[_Tpvec::nlanes/2] = {0}; \
    v_store_low(lptr, a); \
    return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(lptr))); \
} \
inline _Tpwvec v_expand_high(const _Tpvec& a) \
{ \
    _Tp CV_DECL_ALIGNED(32) hptr[_Tpvec::nlanes/2] = {0}; \
    v_store_high(hptr, a); \
    return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(hptr))); \
} \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ \
    return _Tpwvec(wcvt(vle##width##_v_##suffix##mf2(ptr))); \
}

OPENCV_HAL_IMPL_RVV_EXPAND(v_uint16x8, uchar, v_uint8x16, 8, u8, vwcvtu_x_x_v_u16m1)
OPENCV_HAL_IMPL_RVV_EXPAND(v_int16x8, schar, v_int8x16, 8, i8, vwcvt_x_x_v_i16m1)
OPENCV_HAL_IMPL_RVV_EXPAND(v_uint32x4, ushort, v_uint16x8, 16, u16, vwcvtu_x_x_v_u32m1)
OPENCV_HAL_IMPL_RVV_EXPAND(v_int32x4, short, v_int16x8, 16, i16, vwcvt_x_x_v_i32m1)
OPENCV_HAL_IMPL_RVV_EXPAND(v_uint64x2, uint, v_uint32x4, 32, u32, vwcvtu_x_x_v_u64m1)
OPENCV_HAL_IMPL_RVV_EXPAND(v_int64x2, int, v_int32x4, 32, i32, vwcvt_x_x_v_i64m1)
2011
// Quad expansion: load four 8-bit elements and widen each to 32 bits by
// chaining two widening converts through the fractional-LMUL register types
// (mf4 -> mf2 -> m1).
inline v_uint32x4 v_load_expand_q(const uchar* ptr)
{
    vsetvlmax_e32m1();
    return v_uint32x4(vwcvtu_x_x_v_u32m1(vwcvtu_x_x_v_u16mf2(vle8_v_u8mf4(ptr))));
}

// Signed counterpart: schar -> int with sign extension at each widening step.
inline v_int32x4 v_load_expand_q(const schar* ptr)
{
    vsetvlmax_e32m1();
    return v_int32x4(vwcvt_x_x_v_i32m1(vwcvt_x_x_v_i16mf2(vle8_v_i8mf4(ptr))));
}
2023
2024
// Narrowing pack: both wide inputs are spilled back-to-back into one scalar
// buffer, reloaded as an LMUL=2 register, and narrowed in a single
// instruction.  `shr` is the plain-pack narrowing op (shift amount 0 ==
// saturating cast for the vnclip variants, truncation for vnsrl/vnsra on the
// 64->32 packs, matching the universal v_pack contract); `rshr` is the
// narrowing op used by the v_rshr_* rounding-shift forms with shift n.
// The *_store forms pad the unused upper half with zeros before narrowing.
#define OPENCV_HAL_IMPL_RVV_PACK(_Tpvec, _Tp, _wTpvec, _wTp, width, suffix, rshr, shr) \
inline _Tpvec v_pack(const _wTpvec& a, const _wTpvec& b) \
{ \
    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, b); \
    vsetvlmax_e##width##m2(); \
    return _Tpvec(shr(vle##width##_v_##suffix##m2(arr), 0)); \
} \
inline void v_pack_store(_Tp* ptr, const _wTpvec& a) \
{ \
    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, _wTpvec(vzero_##suffix##m1())); \
    vsetvlmax_e##width##m2(); \
    v_store(ptr, _Tpvec(shr(vle##width##_v_##suffix##m2(arr), 0))); \
} \
template<int n> inline \
_Tpvec v_rshr_pack(const _wTpvec& a, const _wTpvec& b) \
{ \
    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, b); \
    vsetvlmax_e##width##m2(); \
    return _Tpvec(rshr(vle##width##_v_##suffix##m2(arr), n)); \
} \
template<int n> inline \
void v_rshr_pack_store(_Tp* ptr, const _wTpvec& a) \
{ \
    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, _wTpvec(vzero_##suffix##m1())); \
    vsetvlmax_e##width##m2(); \
    v_store(ptr, _Tpvec(rshr(vle##width##_v_##suffix##m2(arr), n))); \
}

OPENCV_HAL_IMPL_RVV_PACK(v_uint8x16, uchar, v_uint16x8, ushort, 16, u16, vnclipu_wx_u8m1, vnclipu_wx_u8m1)
OPENCV_HAL_IMPL_RVV_PACK(v_int8x16, schar, v_int16x8, short, 16, i16, vnclip_wx_i8m1, vnclip_wx_i8m1)
OPENCV_HAL_IMPL_RVV_PACK(v_uint16x8, ushort, v_uint32x4, unsigned, 32, u32, vnclipu_wx_u16m1, vnclipu_wx_u16m1)
OPENCV_HAL_IMPL_RVV_PACK(v_int16x8, short, v_int32x4, int, 32, i32, vnclip_wx_i16m1, vnclip_wx_i16m1)
OPENCV_HAL_IMPL_RVV_PACK(v_uint32x4, unsigned, v_uint64x2, uint64, 64, u64, vnclipu_wx_u32m1, vnsrl_wx_u32m1)
OPENCV_HAL_IMPL_RVV_PACK(v_int32x4, int, v_int64x2, int64, 64, i64, vnclip_wx_i32m1, vnsra_wx_i32m1)
2067
2068
// Signed -> unsigned narrowing pack (v_pack_u and friends): negative inputs
// are first clamped to 0 with vmax_vx, the register is reinterpreted as
// unsigned, and vnclipu performs the saturating narrow (with rounding shift
// n for the v_rshr_* forms).  Buffer-spill structure mirrors
// OPENCV_HAL_IMPL_RVV_PACK above.
#define OPENCV_HAL_IMPL_RVV_PACK_U(_Tpvec, _Tp, _wTpvec, _wTp, width, suffix, rshr, cast) \
inline _Tpvec v_pack_u(const _wTpvec& a, const _wTpvec& b) \
{ \
    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, b); \
    vsetvlmax_e##width##m2(); \
    return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr), 0)), 0)); \
} \
inline void v_pack_u_store(_Tp* ptr, const _wTpvec& a) \
{ \
    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, _wTpvec(vzero_##suffix##m1())); \
    vsetvlmax_e##width##m2(); \
    v_store(ptr, _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr), 0)), 0))); \
} \
template<int n> inline \
_Tpvec v_rshr_pack_u(const _wTpvec& a, const _wTpvec& b) \
{ \
    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, b); \
    vsetvlmax_e##width##m2(); \
    return _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr), 0)), n)); \
} \
template<int n> inline \
void v_rshr_pack_u_store(_Tp* ptr, const _wTpvec& a) \
{ \
    _wTp CV_DECL_ALIGNED(32) arr[_Tpvec::nlanes] = {0}; \
    v_store(arr, a); \
    v_store(arr + _wTpvec::nlanes, _wTpvec(vzero_##suffix##m1())); \
    vsetvlmax_e##width##m2(); \
    v_store(ptr, _Tpvec(rshr(cast(vmax_vx_##suffix##m2(vle##width##_v_##suffix##m2(arr), 0)), n))); \
}

OPENCV_HAL_IMPL_RVV_PACK_U(v_uint8x16, uchar, v_int16x8, short, 16, i16, vnclipu_wx_u8m1, vreinterpret_v_i16m2_u16m2)
OPENCV_HAL_IMPL_RVV_PACK_U(v_uint16x8, ushort, v_int32x4, int, 32, i32, vnclipu_wx_u16m1, vreinterpret_v_i32m2_u32m2)
2107
2108
// Zip / combine helpers, implemented through aligned scalar buffers.
// v_zip interleaves a0 and a1 element-wise: b0 gets the interleaving of the
// low halves, b1 of the high halves.  v_combine_low/high concatenate the
// low (resp. high) halves of two vectors via v_load_halves.
#define OPENCV_HAL_IMPL_RVV_UNPACKS(_Tpvec, _Tp, width, suffix) \
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
{ \
    _Tp CV_DECL_ALIGNED(32) ptra0[v_##_Tpvec::nlanes] = {0}; \
    _Tp CV_DECL_ALIGNED(32) ptra1[v_##_Tpvec::nlanes] = {0}; \
    _Tp CV_DECL_ALIGNED(32) ptrb0[v_##_Tpvec::nlanes] = {0}; \
    _Tp CV_DECL_ALIGNED(32) ptrb1[v_##_Tpvec::nlanes] = {0}; \
    v_store(ptra0, a0); \
    v_store(ptra1, a1); \
    int i; \
    for( i = 0; i < v_##_Tpvec::nlanes/2; i++ ) \
    { \
        ptrb0[i*2] = ptra0[i]; \
        ptrb0[i*2+1] = ptra1[i]; \
    } \
    for( ; i < v_##_Tpvec::nlanes; i++ ) \
    { \
        ptrb1[i*2-v_##_Tpvec::nlanes] = ptra0[i]; \
        ptrb1[i*2-v_##_Tpvec::nlanes+1] = ptra1[i]; \
    } \
    b0 = v_load(ptrb0); \
    b1 = v_load(ptrb1); \
} \
inline v_##_Tpvec v_combine_low(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes/2] = {0}; \
    _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes/2] = {0}; \
    v_store_low(ptra, a); \
    v_store_low(ptrb, b); \
    return v_load_halves(ptra, ptrb); \
} \
inline v_##_Tpvec v_combine_high(const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
    _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes/2] = {0}; \
    _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes/2] = {0}; \
    v_store_high(ptra, a); \
    v_store_high(ptrb, b); \
    return v_load_halves(ptra, ptrb); \
} \
inline void v_recombine(const v_##_Tpvec& a, const v_##_Tpvec& b, v_##_Tpvec& c, v_##_Tpvec& d) \
{ \
    c = v_combine_low(a, b); \
    d = v_combine_high(a, b); \
}

OPENCV_HAL_IMPL_RVV_UNPACKS(uint8x16, uchar, 8, u8)
OPENCV_HAL_IMPL_RVV_UNPACKS(int8x16, schar, 8, i8)
OPENCV_HAL_IMPL_RVV_UNPACKS(uint16x8, ushort, 16, u16)
OPENCV_HAL_IMPL_RVV_UNPACKS(int16x8, short, 16, i16)
OPENCV_HAL_IMPL_RVV_UNPACKS(uint32x4, unsigned, 32, u32)
OPENCV_HAL_IMPL_RVV_UNPACKS(int32x4, int, 32, i32)
OPENCV_HAL_IMPL_RVV_UNPACKS(float32x4, float, 32, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_UNPACKS(float64x2, double, 64, f64)
#endif
2164
2165
2166 #define OPENCV_HAL_IMPL_RVV_INTERLEAVED(_Tpvec, _Tp, suffix, width) \
2167 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
2168 { \
2169 _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
2170 _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
2171 int i, i2; \
2172 for( i = i2 = 0; i < v_##_Tpvec::nlanes; i++, i2 += 2 ) \
2173 { \
2174 ptra[i] = ptr[i2]; \
2175 ptrb[i] = ptr[i2+1]; \
2176 } \
2177 a = v_load(ptra); \
2178 b = v_load(ptrb); \
2179 } \
2180 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
2181 { \
2182 _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
2183 _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
2184 _Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \
2185 int i, i3; \
2186 for( i = i3 = 0; i < v_##_Tpvec::nlanes; i++, i3 += 3 ) \
2187 { \
2188 ptra[i] = ptr[i3]; \
2189 ptrb[i] = ptr[i3+1]; \
2190 ptrc[i] = ptr[i3+2]; \
2191 } \
2192 a = v_load(ptra); \
2193 b = v_load(ptrb); \
2194 c = v_load(ptrc); \
2195 } \
2196 inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
2197 v_##_Tpvec& c, v_##_Tpvec& d) \
2198 { \
2199 _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
2200 _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
2201 _Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \
2202 _Tp CV_DECL_ALIGNED(32) ptrd[v_##_Tpvec::nlanes] = {0}; \
2203 int i, i4; \
2204 for( i = i4 = 0; i < v_##_Tpvec::nlanes; i++, i4 += 4 ) \
2205 { \
2206 ptra[i] = ptr[i4]; \
2207 ptrb[i] = ptr[i4+1]; \
2208 ptrc[i] = ptr[i4+2]; \
2209 ptrd[i] = ptr[i4+3]; \
2210 } \
2211 a = v_load(ptra); \
2212 b = v_load(ptrb); \
2213 c = v_load(ptrc); \
2214 d = v_load(ptrd); \
2215 } \
2216 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
2217 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2218 { \
2219 int i, i2; \
2220 _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
2221 _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
2222 v_store(ptra, a); \
2223 v_store(ptrb, b); \
2224 for( i = i2 = 0; i < v_##_Tpvec::nlanes; i++, i2 += 2 ) \
2225 { \
2226 ptr[i2] = ptra[i]; \
2227 ptr[i2+1] = ptrb[i]; \
2228 } \
2229 } \
2230 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
2231 const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \
2232 { \
2233 int i, i3; \
2234 _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
2235 _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
2236 _Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \
2237 v_store(ptra, a); \
2238 v_store(ptrb, b); \
2239 v_store(ptrc, c); \
2240 for( i = i3 = 0; i < v_##_Tpvec::nlanes; i++, i3 += 3 ) \
2241 { \
2242 ptr[i3] = ptra[i]; \
2243 ptr[i3+1] = ptrb[i]; \
2244 ptr[i3+2] = ptrc[i]; \
2245 } \
2246 } \
2247 inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \
2248 const v_##_Tpvec& c, const v_##_Tpvec& d, \
2249 hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \
2250 { \
2251 int i, i4; \
2252 _Tp CV_DECL_ALIGNED(32) ptra[v_##_Tpvec::nlanes] = {0}; \
2253 _Tp CV_DECL_ALIGNED(32) ptrb[v_##_Tpvec::nlanes] = {0}; \
2254 _Tp CV_DECL_ALIGNED(32) ptrc[v_##_Tpvec::nlanes] = {0}; \
2255 _Tp CV_DECL_ALIGNED(32) ptrd[v_##_Tpvec::nlanes] = {0}; \
2256 v_store(ptra, a); \
2257 v_store(ptrb, b); \
2258 v_store(ptrc, c); \
2259 v_store(ptrd, d); \
2260 for( i = i4 = 0; i < v_##_Tpvec::nlanes; i++, i4 += 4 ) \
2261 { \
2262 ptr[i4] = ptra[i]; \
2263 ptr[i4+1] = ptrb[i]; \
2264 ptr[i4+2] = ptrc[i]; \
2265 ptr[i4+3] = ptrd[i]; \
2266 } \
2267 } \
2268 inline v_##_Tpvec v_interleave_pairs(const v_##_Tpvec& vec) \
2269 { \
2270 _Tp CV_DECL_ALIGNED(32) ptr[v_##_Tpvec::nlanes] = {0}; \
2271 _Tp CV_DECL_ALIGNED(32) ptrvec[v_##_Tpvec::nlanes] = {0}; \
2272 v_store(ptrvec, vec); \
2273 for (int i = 0; i < v_##_Tpvec::nlanes/4; i++) \
2274 { \
2275 ptr[4*i ] = ptrvec[4*i ]; \
2276 ptr[4*i+1] = ptrvec[4*i+2]; \
2277 ptr[4*i+2] = ptrvec[4*i+1]; \
2278 ptr[4*i+3] = ptrvec[4*i+3]; \
2279 } \
2280 return v_load(ptr); \
2281 } \
2282 inline v_##_Tpvec v_interleave_quads(const v_##_Tpvec& vec) \
2283 { \
2284 _Tp CV_DECL_ALIGNED(32) ptr[v_##_Tpvec::nlanes] = {0}; \
2285 _Tp CV_DECL_ALIGNED(32) ptrvec[v_##_Tpvec::nlanes] = {0}; \
2286 v_store(ptrvec, vec); \
2287 for (int i = 0; i < v_##_Tpvec::nlanes/8; i++) \
2288 { \
2289 ptr[8*i ] = ptrvec[4*i ]; \
2290 ptr[8*i+1] = ptrvec[4*i+4]; \
2291 ptr[8*i+2] = ptrvec[4*i+1]; \
2292 ptr[8*i+3] = ptrvec[4*i+5]; \
2293 ptr[8*i+4] = ptrvec[4*i+2]; \
2294 ptr[8*i+5] = ptrvec[4*i+6]; \
2295 ptr[8*i+6] = ptrvec[4*i+3]; \
2296 ptr[8*i+7] = ptrvec[4*i+7]; \
2297 } \
2298 return v_load(ptr); \
2299 }
2300
2301 OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint8x16, uchar, u8, 8)
2302 OPENCV_HAL_IMPL_RVV_INTERLEAVED(int8x16, schar, i8, 8)
2303 OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint16x8, ushort, u16, 16)
2304 OPENCV_HAL_IMPL_RVV_INTERLEAVED(int16x8, short, i16, 16)
2305 OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint32x4, unsigned, u32, 32)
2306 OPENCV_HAL_IMPL_RVV_INTERLEAVED(int32x4, int, i32, 32)
2307 OPENCV_HAL_IMPL_RVV_INTERLEAVED(float32x4, float, f32, 32)
2308 OPENCV_HAL_IMPL_RVV_INTERLEAVED(uint64x2, uint64, u64, 64)
2309 OPENCV_HAL_IMPL_RVV_INTERLEAVED(int64x2, int64, i64, 64)
2310 #if CV_SIMD128_64F
2311 OPENCV_HAL_IMPL_RVV_INTERLEAVED(float64x2, double, f64, 64)
2312 #endif
2313
2314 //////////// PopCount ////////////
2315
// 256-entry lookup table for 8-bit population count:
// popCountTable[x] == number of set bits in byte x.
static const unsigned char popCountTable[] =
{
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};
2335
// Per-lane population count, returning the corresponding *unsigned* vector
// type.  The input is reinterpreted as 16 bytes; each byte's bit count is
// looked up in popCountTable and accumulated into the lane it belongs to
// (byte index i maps to lane i/sizeof(_Tp)).
#define OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(_rTpvec, _Tpvec, _rTp, _Tp, suffix) \
inline _rTpvec v_popcount(const _Tpvec& a) \
{ \
    uchar CV_DECL_ALIGNED(32) ptra[16] = {0}; \
    v_store(ptra, v_reinterpret_as_u8(a)); \
    _rTp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \
    v_store(ptr, v_setzero_##suffix()); \
    for (int i = 0; i < _Tpvec::nlanes*(int)sizeof(_Tp); i++) \
        ptr[i/sizeof(_Tp)] += popCountTable[ptra[i]]; \
    return v_load(ptr); \
}

OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint8x16, v_uint8x16, uchar, uchar, u8)
OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint8x16, v_int8x16, uchar, schar, u8)
OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint16x8, v_uint16x8, ushort, ushort, u16)
OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint16x8, v_int16x8, ushort, short, u16)
OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint32x4, v_uint32x4, unsigned, unsigned, u32)
OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint32x4, v_int32x4, unsigned, int, u32)
OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint64x2, v_uint64x2, uint64, uint64, u64)
OPENCV_HAL_IMPL_RVV_POPCOUNT_OP(v_uint64x2, v_int64x2, uint64, int64, u64)
2356
2357 //////////// SignMask ////////////
2358
// v_signmask: collect the sign (top) bit of every lane into an int bitmask,
// lane i -> bit i.  The vector is logically shifted right by width-1 so each
// lane holds 0 or 1, then the lanes are OR-ed together in a scalar loop.
// NOTE(review): this indexes tmp.val[i] directly; it assumes the vector
// wrapper's `val` member is per-lane addressable -- confirm against the
// vector type definitions earlier in this file.
#define OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(_Tpvec, _Tp, suffix, width, shift) \
inline int v_signmask(const _Tpvec& a) \
{ \
    int mask = 0; \
    vsetvlmax_e##width##m1(); \
    _Tpvec tmp = _Tpvec(vsrl_vx_##suffix##m1(a, shift)); \
    for( int i = 0; i < _Tpvec::nlanes; i++ ) \
        mask |= (int)(tmp.val[i]) << i; \
    return mask; \
}

OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint8x16, uchar, u8, 8, 7)
OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint16x8, ushort, u16, 16, 15)
OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint32x4, unsigned, u32, 32, 31)
OPENCV_HAL_IMPL_RVV_SIGNMASK_OP(v_uint64x2, uint64, u64, 64, 63)

// Signed and floating-point overloads forward to the unsigned versions,
// since only the top bit of each lane is inspected.
inline int v_signmask(const v_int8x16& a)
{ return v_signmask(v_reinterpret_as_u8(a)); }
inline int v_signmask(const v_int16x8& a)
{ return v_signmask(v_reinterpret_as_u16(a)); }
inline int v_signmask(const v_int32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }
inline int v_signmask(const v_float32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }
inline int v_signmask(const v_int64x2& a)
{ return v_signmask(v_reinterpret_as_u64(a)); }
#if CV_SIMD128_64F
inline int v_signmask(const v_float64x2& a)
{ return v_signmask(v_reinterpret_as_u64(a)); }
#endif
2389
2390
2391 //////////// Scan forward ////////////
2392
// v_scan_forward: index of the first lane whose value is negative after
// reinterpreting to the given suffix type (i.e. the first lane of a
// comparison mask that is set); returns 0 when no lane matches.
// NOTE(review): the test `int(ptr[i]) < 0` converts the *value* to int.
// For the f32/f64 instantiations an all-ones mask lane is a NaN, and for
// 64-bit types the conversion truncates -- this is only reliable when the
// input is a canonical all-zeros/all-ones comparison mask; confirm callers
// never pass arbitrary data.
#define OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(_Tpvec, _Tp, suffix) \
inline int v_scan_forward(const _Tpvec& a) \
{ \
    _Tp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \
    v_store(ptr, v_reinterpret_as_##suffix(a)); \
    for (int i = 0; i < _Tpvec::nlanes; i++) \
        if(int(ptr[i]) < 0) \
            return i; \
    return 0; \
}

OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int16x8, short, s16)
OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int32x4, int, s32)
OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_float32x4, float, f32)
OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_int64x2, int64, s64)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_RVV_SCAN_FORWOARD_OP(v_float64x2, double, f64)
#endif
2416
2417 //////////// Pack triplets ////////////
2418
2419 #define OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(_Tpvec, _Tp) \
2420 inline _Tpvec v_pack_triplets(const _Tpvec& vec) \
2421 { \
2422 _Tp CV_DECL_ALIGNED(32) ptr[_Tpvec::nlanes] = {0}; \
2423 _Tp CV_DECL_ALIGNED(32) ptrvec[_Tpvec::nlanes] = {0}; \
2424 v_store(ptrvec, vec); \
2425 for (int i = 0; i < _Tpvec::nlanes/4; i++) \
2426 { \
2427 ptr[3*i ] = ptrvec[4*i ]; \
2428 ptr[3*i+1] = ptrvec[4*i+2]; \
2429 ptr[3*i+2] = ptrvec[4*i+2]; \
2430 } \
2431 return v_load(ptr); \
2432 }
2433
2434 OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint8x16, uchar)
2435 OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int8x16, schar)
2436 OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint16x8, ushort)
2437 OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int16x8, short)
2438 OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_uint32x4, unsigned)
2439 OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_int32x4, int)
2440 OPENCV_HAL_IMPL_RVV_PACK_TRIPLETS(v_float32x4, float)
2441
2442
2443 ////// FP16 support ///////
2444
#if CV_FP16
// Hardware FP16 path: widen four half-precision values to float32
// (mf2 f16 load + widening convert).
inline v_float32x4 v_load_expand(const float16_t* ptr)
{
    return v_float32x4(vfwcvt_f_f_v_f32m1(vle16_v_f16mf2(ptr)));
}

// Narrow four float32 lanes to half precision and store them.
inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
{
    vse16_v_f16mf2(ptr, vfncvt_f_f_w_f16mf2(v));
}
#else
// Software fallback: convert element-wise through float16_t's scalar
// float conversions.
inline v_float32x4 v_load_expand(const float16_t* ptr)
{
    const int N = 4;
    float buf[N];
    for( int i = 0; i < N; i++ ) buf[i] = (float)ptr[i];
    return v_load(buf);
}

inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
{
    const int N = 4;
    float buf[N];
    v_store(buf, v);
    for( int i = 0; i < N; i++ ) ptr[i] = float16_t(buf[i]);
}
#endif
2472
2473 ////////////// Rounding //////////////
2474
v_round(const v_float32x4 & a)2475 inline v_int32x4 v_round(const v_float32x4& a)
2476 {
2477 vsetvlmax_e32m1();
2478 return v_int32x4(vfcvt_x_f_v_i32m1(a));
2479 }
2480
v_floor(const v_float32x4 & a)2481 inline v_int32x4 v_floor(const v_float32x4& a)
2482 {
2483 v_float32x4 ZP5 = v_setall_f32(0.5f);
2484 v_float32x4 t = a - ZP5;
2485 vsetvlmax_e32m1();
2486 return v_int32x4(vfcvt_x_f_v_i32m1(t));
2487 }
2488
v_ceil(const v_float32x4 & a)2489 inline v_int32x4 v_ceil(const v_float32x4& a)
2490 {
2491 v_float32x4 ZP5 = v_setall_f32(0.5f);
2492 v_float32x4 t = a + ZP5;
2493 vsetvlmax_e32m1();
2494 return v_int32x4(vfcvt_x_f_v_i32m1(t));
2495 }
2496
v_trunc(const v_float32x4 & a)2497 inline v_int32x4 v_trunc(const v_float32x4& a)
2498 {
2499 vsetvlmax_e32m1();
2500 return v_int32x4(vfcvt_rtz_x_f_v_i32m1(a));
2501 }
#if CV_SIMD128_64F
// float64 -> int32 rounding conversions.  The two double lanes are spilled
// to a zero-padded 4-element buffer, reloaded as an LMUL=2 f64 register, and
// narrowed to i32 with vfncvt (current rounding mode unless the _rtz variant
// is used).  Upper two result lanes are 0.
inline v_int32x4 v_round(const v_float64x2& a)
{
    double arr[4] = {a.val[0], a.val[1], 0, 0};
    vsetvlmax_e64m2();
    vfloat64m2_t tmp = vle64_v_f64m2(arr);
    return v_int32x4(vfncvt_x_f_w_i32m1(tmp));
}

// Two-vector form: result lanes are { round(a0), round(a1), round(b0),
// round(b1) }.
inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
{
    double arr[4] = {a.val[0], a.val[1], b.val[0], b.val[1]};
    vsetvlmax_e64m2();
    vfloat64m2_t tmp = vle64_v_f64m2(arr);
    return v_int32x4(vfncvt_x_f_w_i32m1(tmp));
}

// NOTE(review): v_floor/v_ceil emulate floor/ceil as round(x -/+ 0.5); this
// misrounds once x -/+ 0.5 is inexact (|x| on the order of 2^52 for double).
// Acceptable for typical image-processing ranges, but not a true floor/ceil
// -- confirm intended.  (The 0.5f literals are exact in double, so the
// float-typed constant itself is harmless.)
inline v_int32x4 v_floor(const v_float64x2& a)
{
    double arr[4] = {a.val[0]-0.5f, a.val[1]-0.5f, 0, 0};
    vsetvlmax_e64m2();
    vfloat64m2_t tmp = vle64_v_f64m2(arr);
    return v_int32x4(vfncvt_x_f_w_i32m1(tmp));
}

inline v_int32x4 v_ceil(const v_float64x2& a)
{
    double arr[4] = {a.val[0]+0.5f, a.val[1]+0.5f, 0, 0};
    vsetvlmax_e64m2();
    vfloat64m2_t tmp = vle64_v_f64m2(arr);
    return v_int32x4(vfncvt_x_f_w_i32m1(tmp));
}

// Truncation toward zero via the static RTZ narrowing convert.
inline v_int32x4 v_trunc(const v_float64x2& a)
{
    double arr[4] = {a.val[0], a.val[1], 0, 0};
    vsetvlmax_e64m2();
    vfloat64m2_t tmp = vle64_v_f64m2(arr);
    return v_int32x4(vfncvt_rtz_x_f_w_i32m1(tmp));
}
#endif
2543
2544
2545 //////// Dot Product ////////
2546
2547 // 16 >> 32
// Pairwise dot product of signed 16-bit lanes into 32-bit lanes:
// widening-multiply into an i32m2 register, spill to an aligned scratch
// buffer, 2-way deinterleave the even/odd products and add them, so
// result[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1].
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
    int CV_DECL_ALIGNED(32) ptr[8] = {0};
    v_int32x4 t1, t2;
    vsetvlmax_e32m2();
    vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b));
    v_load_deinterleave(ptr, t1, t2);
    return t1 + t2;
}
// Same pairwise dot product, with accumulator 'c' added to the result.
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
    int CV_DECL_ALIGNED(32) ptr[8] = {0};
    v_int32x4 t1, t2;
    vsetvlmax_e32m2();
    vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b));
    v_load_deinterleave(ptr, t1, t2);
    return t1 + t2 + c;
}
2566
2567 // 32 >> 64
// Pairwise dot product of signed 32-bit lanes into 64-bit lanes, using the
// same widen -> spill -> deinterleave -> add scheme as the 16-bit version.
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
{
    int64 CV_DECL_ALIGNED(32) ptr[4] = {0};
    v_int64x2 t1, t2;
    vsetvlmax_e64m2();
    vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b));
    v_load_deinterleave(ptr, t1, t2);
    return t1 + t2;
}
// Same pairwise dot product, with accumulator 'c' added to the result.
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{
    int64 CV_DECL_ALIGNED(32) ptr[4] = {0};
    v_int64x2 t1, t2;
    vsetvlmax_e64m2();
    vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b));
    v_load_deinterleave(ptr, t1, t2);
    return t1 + t2 + c;
}
2586
2587 // 8 >> 32
// Dot product of unsigned 8-bit quadruples into 32-bit lanes: with a zero
// accumulator, vqmaccu yields all 16 byte products widened to 32 bits; a
// 4-way deinterleave + add then sums each group of 4 adjacent products into
// one output lane.
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
{
    unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
    v_uint32x4 t1, t2, t3, t4;
    vsetvlmax_e32m4();
    vse32_v_u32m4(ptr, vqmaccu_vv_u32m4(vzero_u32m4(), a, b));
    v_load_deinterleave(ptr, t1, t2, t3, t4);
    return t1 + t2 + t3 + t4;
}
// Same expanding dot product, with accumulator 'c' added to the result.
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
                                   const v_uint32x4& c)
{
    unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
    v_uint32x4 t1, t2, t3, t4;
    vsetvlmax_e32m4();
    vse32_v_u32m4(ptr, vqmaccu_vv_u32m4(vzero_u32m4(), a, b));
    v_load_deinterleave(ptr, t1, t2, t3, t4);
    return t1 + t2 + t3 + t4 + c;
}
2607
// Signed counterpart of the 8-bit expanding dot product: widen all 16
// products to 32 bits (zero accumulator), then 4-way deinterleave + add so
// each output lane covers 4 adjacent input products.
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
    int CV_DECL_ALIGNED(32) ptr[16] = {0};
    v_int32x4 t1, t2, t3, t4;
    vsetvlmax_e32m4();
    vse32_v_i32m4(ptr, vqmacc_vv_i32m4(vzero_i32m4(), a, b));
    v_load_deinterleave(ptr, t1, t2, t3, t4);
    return t1 + t2 + t3 + t4;
}
// Same expanding dot product, with accumulator 'c' added to the result.
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
                                  const v_int32x4& c)
{
    int CV_DECL_ALIGNED(32) ptr[16] = {0};
    v_int32x4 t1, t2, t3, t4;
    vsetvlmax_e32m4();
    vse32_v_i32m4(ptr, vqmacc_vv_i32m4(vzero_i32m4(), a, b));
    v_load_deinterleave(ptr, t1, t2, t3, t4);
    return t1 + t2 + t3 + t4 + c;
}
2627
2628 // 16 >> 64
// Dot product of unsigned 16-bit quadruples into 64-bit lanes: widen the 8
// products to 64 bits, then 4-way deinterleave + add so each of the 2 output
// lanes sums 4 adjacent products.
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
{
    uint64 CV_DECL_ALIGNED(32) ptr[8] = {0};
    v_uint64x2 t1, t2, t3, t4;
    vsetvlmax_e64m4();
    vse64_v_u64m4(ptr, vqmaccu_vv_u64m4(vzero_u64m4(), a, b));
    v_load_deinterleave(ptr, t1, t2, t3, t4);
    return t1 + t2 + t3 + t4;
}
// Same expanding dot product, with accumulator 'c' added to the result.
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{
    uint64 CV_DECL_ALIGNED(32) ptr[8] = {0};
    v_uint64x2 t1, t2, t3, t4;
    vsetvlmax_e64m4();
    vse64_v_u64m4(ptr, vqmaccu_vv_u64m4(vzero_u64m4(), a, b));
    v_load_deinterleave(ptr, t1, t2, t3, t4);
    return t1 + t2 + t3 + t4 + c;
}
2647
// Signed counterpart of the 16-bit -> 64-bit expanding dot product.
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
    int64 CV_DECL_ALIGNED(32) ptr[8] = {0};
    v_int64x2 t1, t2, t3, t4;
    vsetvlmax_e64m4();
    vse64_v_i64m4(ptr, vqmacc_vv_i64m4(vzero_i64m4(), a, b));
    v_load_deinterleave(ptr, t1, t2, t3, t4);
    return t1 + t2 + t3 + t4;
}
// Same expanding dot product, with accumulator 'c' added to the result.
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
                                  const v_int64x2& c)
{
    int64 CV_DECL_ALIGNED(32) ptr[8] = {0};
    v_int64x2 t1, t2, t3, t4;
    vsetvlmax_e64m4();
    vse64_v_i64m4(ptr, vqmacc_vv_i64m4(vzero_i64m4(), a, b));
    v_load_deinterleave(ptr, t1, t2, t3, t4);
    return t1 + t2 + t3 + t4 + c;
}
2667
2668 // 32 >> 64f
2669 #if CV_SIMD128_64F
// 32-bit signed dot product, widened to float64 lanes via v_cvt_f64.
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
// Same, with float64 accumulator 'c' added to the result.
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
                                    const v_float64x2& c)
{ return v_dotprod_expand(a, b) + c; }
2675 #endif
2676
2677 //////// Fast Dot Product ////////
2678
2679 // 16 >> 32
// "Fast" 16-bit dot product: sums the two contiguous halves of the widened
// products instead of deinterleaving even/odd pairs. Partial sums land in a
// different lane order than v_dotprod, but a full reduction of the result is
// identical — which is the only guarantee the *_fast API makes.
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
{
    int CV_DECL_ALIGNED(32) ptr[8] = {0};
    vsetvlmax_e32m2();
    vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b));
    v_int32x4 t1 = v_load(ptr);
    v_int32x4 t2 = v_load(ptr+4);
    return t1 + t2;
}
// Same fast dot product, with accumulator 'c' added to the result.
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
    int CV_DECL_ALIGNED(32) ptr[8] = {0};
    vsetvlmax_e32m2();
    vse32_v_i32m2(ptr, vwmul_vv_i32m2(a, b));
    v_int32x4 t1 = v_load(ptr);
    v_int32x4 t2 = v_load(ptr+4);
    return t1 + t2 + c;
}
2698
2699 // 32 >> 64
// "Fast" 32-bit dot product into 64-bit lanes: sums contiguous halves of the
// widened products (lane order differs from v_dotprod; total reduction is the
// same).
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
{
    int64 CV_DECL_ALIGNED(32) ptr[4] = {0};
    vsetvlmax_e64m2();
    vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b));
    v_int64x2 t1 = v_load(ptr);
    v_int64x2 t2 = v_load(ptr+2);
    return t1 + t2;
}
// Same fast dot product, with accumulator 'c' added to the result.
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{
    int64 CV_DECL_ALIGNED(32) ptr[4] = {0};
    vsetvlmax_e64m2();
    vse64_v_i64m2(ptr, vwmul_vv_i64m2(a, b));
    v_int64x2 t1 = v_load(ptr);
    v_int64x2 t2 = v_load(ptr+2);
    return t1 + t2 + c;
}
2718
2719
2720 // 8 >> 32
// "Fast" unsigned 8-bit expanding dot product: sums the four contiguous
// quarters of the 16 widened products rather than 4-way deinterleaving, so
// partial-sum lane order differs from v_dotprod_expand but the full
// reduction matches.
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
{
    unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
    vsetvlmax_e32m4();
    vse32_v_u32m4(ptr, vqmaccu_vv_u32m4(vzero_u32m4(), a, b));
    v_uint32x4 t1 = v_load(ptr);
    v_uint32x4 t2 = v_load(ptr+4);
    v_uint32x4 t3 = v_load(ptr+8);
    v_uint32x4 t4 = v_load(ptr+12);
    return t1 + t2 + t3 + t4;
}
// Same fast expanding dot product, with accumulator 'c' added to the result.
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{
    unsigned CV_DECL_ALIGNED(32) ptr[16] = {0};
    vsetvlmax_e32m4();
    vse32_v_u32m4(ptr, vqmaccu_vv_u32m4(vzero_u32m4(), a, b));
    v_uint32x4 t1 = v_load(ptr);
    v_uint32x4 t2 = v_load(ptr+4);
    v_uint32x4 t3 = v_load(ptr+8);
    v_uint32x4 t4 = v_load(ptr+12);
    return t1 + t2 + t3 + t4 + c;
}
// Signed counterpart of the fast 8-bit expanding dot product (contiguous
// quarter sums; lane order differs from v_dotprod_expand).
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
{
    int CV_DECL_ALIGNED(32) ptr[16] = {0};
    vsetvlmax_e32m4();
    vse32_v_i32m4(ptr, vqmacc_vv_i32m4(vzero_i32m4(), a, b));
    v_int32x4 t1 = v_load(ptr);
    v_int32x4 t2 = v_load(ptr+4);
    v_int32x4 t3 = v_load(ptr+8);
    v_int32x4 t4 = v_load(ptr+12);
    return t1 + t2 + t3 + t4;
}
// Same fast expanding dot product, with accumulator 'c' added to the result.
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{
    int CV_DECL_ALIGNED(32) ptr[16] = {0};
    vsetvlmax_e32m4();
    vse32_v_i32m4(ptr, vqmacc_vv_i32m4(vzero_i32m4(), a, b));
    v_int32x4 t1 = v_load(ptr);
    v_int32x4 t2 = v_load(ptr+4);
    v_int32x4 t3 = v_load(ptr+8);
    v_int32x4 t4 = v_load(ptr+12);
    return t1 + t2 + t3 + t4 + c;
}
2765
2766 // 16 >> 64
// "Fast" unsigned 16-bit -> 64-bit expanding dot product: sums the four
// contiguous 2-lane quarters of the widened products.
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
{
    uint64 CV_DECL_ALIGNED(32) ptr[8] = {0};
    vsetvlmax_e64m4();
    vse64_v_u64m4(ptr, vqmaccu_vv_u64m4(vzero_u64m4(), a, b));
    v_uint64x2 t1 = v_load(ptr);
    v_uint64x2 t2 = v_load(ptr+2);
    v_uint64x2 t3 = v_load(ptr+4);
    v_uint64x2 t4 = v_load(ptr+6);
    return t1 + t2 + t3 + t4;
}
// Same fast expanding dot product, with accumulator 'c' added to the result.
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{
    uint64 CV_DECL_ALIGNED(32) ptr[8] = {0};
    vsetvlmax_e64m4();
    vse64_v_u64m4(ptr, vqmaccu_vv_u64m4(vzero_u64m4(), a, b));
    v_uint64x2 t1 = v_load(ptr);
    v_uint64x2 t2 = v_load(ptr+2);
    v_uint64x2 t3 = v_load(ptr+4);
    v_uint64x2 t4 = v_load(ptr+6);
    return t1 + t2 + t3 + t4 + c;
}
// Signed counterpart of the fast 16-bit -> 64-bit expanding dot product.
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
{
    int64 CV_DECL_ALIGNED(32) ptr[8] = {0};
    vsetvlmax_e64m4();
    vse64_v_i64m4(ptr, vqmacc_vv_i64m4(vzero_i64m4(), a, b));
    v_int64x2 t1 = v_load(ptr);
    v_int64x2 t2 = v_load(ptr+2);
    v_int64x2 t3 = v_load(ptr+4);
    v_int64x2 t4 = v_load(ptr+6);
    return t1 + t2 + t3 + t4;
}
// Same fast expanding dot product, with accumulator 'c' added to the result.
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{
    int64 CV_DECL_ALIGNED(32) ptr[8] = {0};
    vsetvlmax_e64m4();
    vse64_v_i64m4(ptr, vqmacc_vv_i64m4(vzero_i64m4(), a, b));
    v_int64x2 t1 = v_load(ptr);
    v_int64x2 t2 = v_load(ptr+2);
    v_int64x2 t3 = v_load(ptr+4);
    v_int64x2 t4 = v_load(ptr+6);
    return t1 + t2 + t3 + t4 + c;
}
2811
2812 // 32 >> 64f
2813 #if CV_SIMD128_64F
// 32-bit signed "fast" dot product, widened to float64 lanes via v_cvt_f64.
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod_fast(a, b)); }
// Same, with float64 accumulator 'c' added to the result.
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_dotprod_expand_fast(a, b) + c; }
2818 #endif
2819
2820
// Matrix transform of a 4-vector by 4 column vectors:
// result = m0*v[0] + m1*v[1] + m2*v[2] + m3*v[3],
// built from one vector-scalar multiply and three fused multiply-adds.
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    vsetvlmax_e32m1();
    vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n<0>(v));
    res = vfmacc_vf_f32m1(res, v_extract_n<1>(v), m1);
    res = vfmacc_vf_f32m1(res, v_extract_n<2>(v), m2);
    res = vfmacc_vf_f32m1(res, v_extract_n<3>(v), m3);
    return v_float32x4(res);
}
2832
// Like v_matmul but with the fourth column replaced by an additive vector:
// result = m0*v[0] + m1*v[1] + m2*v[2] + a  (v[3] is ignored).
inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    vsetvlmax_e32m1();
    vfloat32m1_t res = vfmul_vf_f32m1(m0, v_extract_n<0>(v));
    res = vfmacc_vf_f32m1(res, v_extract_n<1>(v), m1);
    res = vfmacc_vf_f32m1(res, v_extract_n<2>(v), m2);
    return v_float32x4(res) + a;
}
2843
// Full-precision multiply: c/d receive the low/high halves of the
// element-wise widening product of a and b. The product is computed in an
// m2 register, spilled to an aligned scratch buffer, and reloaded as two m1
// vectors. (Comments must stay outside the macro body: a '//' on a
// continued line would swallow the trailing backslash.)
#define OPENCV_HAL_IMPL_RVV_MUL_EXPAND(_Tpvec, _Tpwvec, _Tpw, suffix, wmul, width) \
inline void v_mul_expand(const _Tpvec& a, const _Tpvec& b, _Tpwvec& c, _Tpwvec& d) \
{ \
    _Tpw CV_DECL_ALIGNED(32) ptr[_Tpwvec::nlanes*2] = {0}; \
    vsetvlmax_e##width##m2(); \
    vse##width##_v_##suffix##m2(ptr, wmul(a, b)); \
    vsetvlmax_e##width##m1(); \
    c = _Tpwvec(vle##width##_v_##suffix##m1(ptr)); \
    d = _Tpwvec(vle##width##_v_##suffix##m1(ptr+_Tpwvec::nlanes)); \
}

// Instantiations for 8->16, 16->32 and (unsigned only) 32->64 bit lanes.
OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint8x16, v_uint16x8, ushort, u16, vwmulu_vv_u16m2, 16)
OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int8x16, v_int16x8, short, i16, vwmul_vv_i16m2, 16)
OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint16x8, v_uint32x4, unsigned, u32, vwmulu_vv_u32m2, 32)
OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_int16x8, v_int32x4, int, i32, vwmul_vv_i32m2, 32)
OPENCV_HAL_IMPL_RVV_MUL_EXPAND(v_uint32x4, v_uint64x2, uint64, u64, vwmulu_vv_u64m2, 64)
2860
2861
// Multiply 16-bit lanes and keep the high 16 bits of each 32-bit product
// (arithmetic narrowing shift for the signed variant).
inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
{
    vsetvlmax_e16m1();
    return v_int16x8(vnsra_wx_i16m1(vwmul_vv_i32m2(a, b), 16));
}
// Unsigned variant: logical narrowing shift of the widened products.
inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
{
    vsetvlmax_e16m1();
    return v_uint16x8(vnsrl_wx_u16m1(vwmulu_vv_u32m2(a, b), 16));
}
2872
2873
2874 //////// Saturating Multiply ////////
2875
// Saturating multiply for 8/16-bit lanes: expand to the double-width vector
// type, multiply at full precision, then saturate-pack back down. Comments
// kept outside the macro body (a '//' would swallow the line-continuation
// backslash).
#define OPENCV_HAL_IMPL_RVV_MUL_SAT(_Tpvec, _wTpvec) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
    _wTpvec c, d; \
    v_mul_expand(a, b, c, d); \
    return v_pack(c, d); \
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ \
    a = a * b; \
    return a; \
}

OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint8x16, v_uint16x8)
OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int8x16, v_int16x8)
OPENCV_HAL_IMPL_RVV_MUL_SAT(v_uint16x8, v_uint32x4)
OPENCV_HAL_IMPL_RVV_MUL_SAT(v_int16x8, v_int32x4)
2893
2894
// No-op on RVV: there is no per-thread SIMD state that needs resetting here
// (kept for API parity with backends such as x86 MMX/EMMS).
inline void v_cleanup() {}
2896
2897 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
2898
2899
2900 }
2901
2902 #endif
2903