1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14
15 #include "unaryop_riscv.h"
16
17 #if __riscv_vector
18 #ifdef RVV_SPEC_0_7
19 #include "riscv_v_071_fix.h"
20 #else
21 #include <riscv_vector.h>
22 #endif
23 #include "rvv_mathfun.h"
24 #include "rvv_mathfun_fp16s.h"
25 #endif // __riscv_vector
26
27 #include <math.h>
28
29 namespace ncnn {
30
UnaryOp_riscv()31 UnaryOp_riscv::UnaryOp_riscv()
32 {
33 #if __riscv_vector
34 support_packing = true;
35 #if __riscv_zfh
36 support_fp16_storage = true;
37 #endif
38 #endif // __riscv_vector
39 }
40
41 #if __riscv_vector
42 template<typename Op>
unary_op_inplace(Mat & a,const Option & opt)43 static int unary_op_inplace(Mat& a, const Option& opt)
44 {
45 Op op;
46
47 int w = a.w;
48 int h = a.h;
49 int channels = a.c;
50 int size = w * h;
51 int elempack = a.elempack;
52
53 #pragma omp parallel for num_threads(opt.num_threads)
54 for (int q = 0; q < channels; q++)
55 {
56 float* ptr = a.channel(q);
57
58 int n = size * elempack;
59 while (n > 0)
60 {
61 word_type vl = vsetvl_e32m8(n);
62
63 vfloat32m8_t _p = vle32_v_f32m8(ptr, vl);
64 _p = op(_p, vl);
65 vse32_v_f32m8(ptr, _p, vl);
66
67 ptr += vl;
68 n -= vl;
69 }
70 }
71
72 return 0;
73 }
74
75 struct unary_op_abs
76 {
operator ()ncnn::unary_op_abs77 vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
78 {
79 return vfsgnj_vf_f32m8(x, 1.f, vl);
80 }
81 };
82
83 struct unary_op_neg
84 {
operator ()ncnn::unary_op_neg85 vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
86 {
87 return vfneg_v_f32m8(x, vl);
88 }
89 };
90
91 struct unary_op_floor
92 {
operator ()ncnn::unary_op_floor93 vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
94 {
95 vint32m8_t _xi = vfcvt_x_f_v_i32m8(x, vl);
96 vbool4_t _mask = vmfgt_vv_f32m8_b4(vfcvt_f_x_v_f32m8(_xi, vl), x, vl);
97 return vfcvt_f_x_v_f32m8(vsub_vx_i32m8_m(_mask, _xi, _xi, 1, vl), vl);
98 }
99 };
100
101 struct unary_op_ceil
102 {
operator ()ncnn::unary_op_ceil103 vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
104 {
105 vint32m8_t _xi = vfcvt_x_f_v_i32m8(x, vl);
106 vbool4_t _mask = vmflt_vv_f32m8_b4(vfcvt_f_x_v_f32m8(_xi, vl), x, vl);
107 return vfcvt_f_x_v_f32m8(vadd_vx_i32m8_m(_mask, _xi, _xi, 1, vl), vl);
108 }
109 };
110
111 struct unary_op_square
112 {
operator ()ncnn::unary_op_square113 vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
114 {
115 return vfmul_vv_f32m8(x, x, vl);
116 }
117 };
118
119 struct unary_op_sqrt
120 {
operator ()ncnn::unary_op_sqrt121 vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
122 {
123 return vfsqrt_v_f32m8(x, vl);
124 }
125 };
126
127 struct unary_op_rsqrt
128 {
operator ()ncnn::unary_op_rsqrt129 vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
130 {
131 vfloat32m8_t _reciprocal = vfrsqrt7_v_f32m8(x, vl);
132 _reciprocal = vfmul_vv_f32m8(vfrsub_vf_f32m8(vfmul_vv_f32m8(vfmul_vf_f32m8(x, 0.5f, vl), vfmul_vv_f32m8(_reciprocal, _reciprocal, vl), vl), 1.5f, vl), _reciprocal, vl);
133 // _reciprocal = vfmul_vv_f32m8(vfrsub_vf_f32m8(vfmul_vv_f32m8(vfmul_vf_f32m8(x, 0.5f, vl), vfmul_vv_f32m8(_reciprocal, _reciprocal, vl), vl), 1.5f, vl), _reciprocal, vl);
134 return _reciprocal;
135 }
136 };
137
138 struct unary_op_exp
139 {
operator ()ncnn::unary_op_exp140 vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
141 {
142 return exp_ps(x, vl);
143 }
144 };
145
146 struct unary_op_log
147 {
operator ()ncnn::unary_op_log148 vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
149 {
150 return log_ps(x, vl);
151 }
152 };
153
154 struct unary_op_sin
155 {
operator ()ncnn::unary_op_sin156 vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
157 {
158 return sin_ps(x, vl);
159 }
160 };
161
162 struct unary_op_cos
163 {
operator ()ncnn::unary_op_cos164 vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
165 {
166 return cos_ps(x, vl);
167 }
168 };
169
170 struct unary_op_tan
171 {
operator ()ncnn::unary_op_tan172 vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
173 {
174 // TODO rvv optimize
175 std::vector<float> tmp(vl);
176 vse32_v_f32m8(tmp.data(), x, vl);
177 for (int i = 0; i < vl; i++)
178 {
179 tmp[i] = tan(tmp[i]);
180 }
181 return vle32_v_f32m8(tmp.data(), vl);
182 }
183 };
184
185 struct unary_op_asin
186 {
operator ()ncnn::unary_op_asin187 vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
188 {
189 // TODO rvv optimize
190 std::vector<float> tmp(vl);
191 vse32_v_f32m8(tmp.data(), x, vl);
192 for (int i = 0; i < vl; i++)
193 {
194 tmp[i] = asin(tmp[i]);
195 }
196 return vle32_v_f32m8(tmp.data(), vl);
197 }
198 };
199
200 struct unary_op_acos
201 {
operator ()ncnn::unary_op_acos202 vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
203 {
204 // TODO rvv optimize
205 std::vector<float> tmp(vl);
206 vse32_v_f32m8(tmp.data(), x, vl);
207 for (int i = 0; i < vl; i++)
208 {
209 tmp[i] = acos(tmp[i]);
210 }
211 return vle32_v_f32m8(tmp.data(), vl);
212 }
213 };
214
215 struct unary_op_atan
216 {
operator ()ncnn::unary_op_atan217 vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
218 {
219 // TODO rvv optimize
220 std::vector<float> tmp(vl);
221 vse32_v_f32m8(tmp.data(), x, vl);
222 for (int i = 0; i < vl; i++)
223 {
224 tmp[i] = atan(tmp[i]);
225 }
226 return vle32_v_f32m8(tmp.data(), vl);
227 }
228 };
229
230 struct unary_op_reciprocal
231 {
operator ()ncnn::unary_op_reciprocal232 vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
233 {
234 vfloat32m8_t _reciprocal = vfrec7_v_f32m8(x, vl);
235 _reciprocal = vfmul_vv_f32m8(vfrsub_vf_f32m8(vfmul_vv_f32m8(x, _reciprocal, vl), 2.f, vl), _reciprocal, vl);
236 // _reciprocal = vfmul_vv_f32m8(vfrsub_vf_f32m8(vfmul_vv_f32m8(x, _reciprocal, vl), 2.f, vl), _reciprocal, vl);
237 return _reciprocal;
238 }
239 };
240
241 struct unary_op_tanh
242 {
operator ()ncnn::unary_op_tanh243 vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
244 {
245 return tanh_ps(x, vl);
246 }
247 };
248 #endif // __riscv_vector
249
forward_inplace(Mat & bottom_top_blob,const Option & opt) const250 int UnaryOp_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
251 {
252 int elembits = bottom_top_blob.elembits();
253
254 #if __riscv_vector && __riscv_zfh
255 if (opt.use_fp16_storage && elembits == 16)
256 return forward_inplace_fp16s(bottom_top_blob, opt);
257 #endif
258
259 #if __riscv_vector
260 if (op_type == Operation_ABS)
261 return unary_op_inplace<unary_op_abs>(bottom_top_blob, opt);
262
263 if (op_type == Operation_NEG)
264 return unary_op_inplace<unary_op_neg>(bottom_top_blob, opt);
265
266 if (op_type == Operation_FLOOR)
267 return unary_op_inplace<unary_op_floor>(bottom_top_blob, opt);
268
269 if (op_type == Operation_CEIL)
270 return unary_op_inplace<unary_op_ceil>(bottom_top_blob, opt);
271
272 if (op_type == Operation_SQUARE)
273 return unary_op_inplace<unary_op_square>(bottom_top_blob, opt);
274
275 if (op_type == Operation_SQRT)
276 return unary_op_inplace<unary_op_sqrt>(bottom_top_blob, opt);
277
278 if (op_type == Operation_RSQRT)
279 return unary_op_inplace<unary_op_rsqrt>(bottom_top_blob, opt);
280
281 if (op_type == Operation_EXP)
282 return unary_op_inplace<unary_op_exp>(bottom_top_blob, opt);
283
284 if (op_type == Operation_LOG)
285 return unary_op_inplace<unary_op_log>(bottom_top_blob, opt);
286
287 if (op_type == Operation_SIN)
288 return unary_op_inplace<unary_op_sin>(bottom_top_blob, opt);
289
290 if (op_type == Operation_COS)
291 return unary_op_inplace<unary_op_cos>(bottom_top_blob, opt);
292
293 if (op_type == Operation_TAN)
294 return unary_op_inplace<unary_op_tan>(bottom_top_blob, opt);
295
296 if (op_type == Operation_ASIN)
297 return unary_op_inplace<unary_op_asin>(bottom_top_blob, opt);
298
299 if (op_type == Operation_ACOS)
300 return unary_op_inplace<unary_op_acos>(bottom_top_blob, opt);
301
302 if (op_type == Operation_ATAN)
303 return unary_op_inplace<unary_op_atan>(bottom_top_blob, opt);
304
305 if (op_type == Operation_RECIPROCAL)
306 return unary_op_inplace<unary_op_reciprocal>(bottom_top_blob, opt);
307
308 if (op_type == Operation_TANH)
309 return unary_op_inplace<unary_op_tanh>(bottom_top_blob, opt);
310
311 return 0;
312 #else // __riscv_vector
313 return UnaryOp::forward_inplace(bottom_top_blob, opt);
314 #endif // __riscv_vector
315 }
316
317 #if __riscv_vector && __riscv_zfh
318 template<typename Op>
unary_op_inplace_fp16s(Mat & a,const Option & opt)319 static int unary_op_inplace_fp16s(Mat& a, const Option& opt)
320 {
321 Op op;
322
323 int w = a.w;
324 int h = a.h;
325 int channels = a.c;
326 int size = w * h;
327 int elempack = a.elempack;
328
329 #pragma omp parallel for num_threads(opt.num_threads)
330 for (int q = 0; q < channels; q++)
331 {
332 __fp16* ptr = a.channel(q);
333
334 int n = size * elempack;
335 while (n > 0)
336 {
337 word_type vl = vsetvl_e16m8(n);
338
339 vfloat16m8_t _p = vle16_v_f16m8(ptr, vl);
340 _p = op(_p, vl);
341 vse16_v_f16m8(ptr, _p, vl);
342
343 ptr += vl;
344 n -= vl;
345 }
346 }
347
348 return 0;
349 }
350
351 struct unary_op_abs_fp16s
352 {
operator ()ncnn::unary_op_abs_fp16s353 vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
354 {
355 return vfsgnj_vf_f16m8(x, 1.f, vl);
356 }
357 };
358
359 struct unary_op_neg_fp16s
360 {
operator ()ncnn::unary_op_neg_fp16s361 vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
362 {
363 return vfneg_v_f16m8(x, vl);
364 }
365 };
366
367 struct unary_op_floor_fp16s
368 {
operator ()ncnn::unary_op_floor_fp16s369 vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
370 {
371 vint16m8_t _xi = vfcvt_x_f_v_i16m8(x, vl);
372 vbool2_t _mask = vmfgt_vv_f16m8_b2(vfcvt_f_x_v_f16m8(_xi, vl), x, vl);
373 return vfcvt_f_x_v_f16m8(vsub_vx_i16m8_m(_mask, _xi, _xi, 1, vl), vl);
374 }
375 };
376
377 struct unary_op_ceil_fp16s
378 {
operator ()ncnn::unary_op_ceil_fp16s379 vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
380 {
381 vint16m8_t _xi = vfcvt_x_f_v_i16m8(x, vl);
382 vbool2_t _mask = vmflt_vv_f16m8_b2(vfcvt_f_x_v_f16m8(_xi, vl), x, vl);
383 return vfcvt_f_x_v_f16m8(vadd_vx_i16m8_m(_mask, _xi, _xi, 1, vl), vl);
384 }
385 };
386
387 struct unary_op_square_fp16s
388 {
operator ()ncnn::unary_op_square_fp16s389 vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
390 {
391 return vfmul_vv_f16m8(x, x, vl);
392 }
393 };
394
395 struct unary_op_sqrt_fp16s
396 {
operator ()ncnn::unary_op_sqrt_fp16s397 vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
398 {
399 return vfsqrt_v_f16m8(x, vl);
400 }
401 };
402
403 struct unary_op_rsqrt_fp16s
404 {
operator ()ncnn::unary_op_rsqrt_fp16s405 vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
406 {
407 vfloat16m8_t _reciprocal = vfrsqrt7_v_f16m8(x, vl);
408 _reciprocal = vfmul_vv_f16m8(vfrsub_vf_f16m8(vfmul_vv_f16m8(vfmul_vf_f16m8(x, 0.5f, vl), vfmul_vv_f16m8(_reciprocal, _reciprocal, vl), vl), 1.5f, vl), _reciprocal, vl);
409 // _reciprocal = vfmul_vv_f16m8(vfrsub_vf_f16m8(vfmul_vv_f16m8(vfmul_vf_f16m8(x, 0.5f, vl), vfmul_vv_f16m8(_reciprocal, _reciprocal, vl), vl), 1.5f, vl), _reciprocal, vl);
410 return _reciprocal;
411 }
412 };
413
414 struct unary_op_exp_fp16s
415 {
operator ()ncnn::unary_op_exp_fp16s416 vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
417 {
418 return exp_ps(x, vl);
419 }
420 };
421
422 struct unary_op_log_fp16s
423 {
operator ()ncnn::unary_op_log_fp16s424 vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
425 {
426 return log_ps(x, vl);
427 }
428 };
429
430 struct unary_op_sin_fp16s
431 {
operator ()ncnn::unary_op_sin_fp16s432 vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
433 {
434 return sin_ps(x, vl);
435 }
436 };
437
438 struct unary_op_cos_fp16s
439 {
operator ()ncnn::unary_op_cos_fp16s440 vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
441 {
442 return cos_ps(x, vl);
443 }
444 };
445
446 struct unary_op_tan_fp16s
447 {
operator ()ncnn::unary_op_tan_fp16s448 vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
449 {
450 // TODO rvv optimize
451 std::vector<__fp16> tmp(vl);
452 vse16_v_f16m8(tmp.data(), x, vl);
453 for (int i = 0; i < vl; i++)
454 {
455 tmp[i] = tan((float)tmp[i]);
456 }
457 return vle16_v_f16m8(tmp.data(), vl);
458 }
459 };
460
461 struct unary_op_asin_fp16s
462 {
operator ()ncnn::unary_op_asin_fp16s463 vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
464 {
465 // TODO rvv optimize
466 std::vector<__fp16> tmp(vl);
467 vse16_v_f16m8(tmp.data(), x, vl);
468 for (int i = 0; i < vl; i++)
469 {
470 tmp[i] = asin((float)tmp[i]);
471 }
472 return vle16_v_f16m8(tmp.data(), vl);
473 }
474 };
475
476 struct unary_op_acos_fp16s
477 {
operator ()ncnn::unary_op_acos_fp16s478 vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
479 {
480 // TODO rvv optimize
481 std::vector<__fp16> tmp(vl);
482 vse16_v_f16m8(tmp.data(), x, vl);
483 for (int i = 0; i < vl; i++)
484 {
485 tmp[i] = acos((float)tmp[i]);
486 }
487 return vle16_v_f16m8(tmp.data(), vl);
488 }
489 };
490
491 struct unary_op_atan_fp16s
492 {
operator ()ncnn::unary_op_atan_fp16s493 vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
494 {
495 // TODO rvv optimize
496 std::vector<__fp16> tmp(vl);
497 vse16_v_f16m8(tmp.data(), x, vl);
498 for (int i = 0; i < vl; i++)
499 {
500 tmp[i] = atan((float)tmp[i]);
501 }
502 return vle16_v_f16m8(tmp.data(), vl);
503 }
504 };
505
506 struct unary_op_reciprocal_fp16s
507 {
operator ()ncnn::unary_op_reciprocal_fp16s508 vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
509 {
510 vfloat16m8_t _reciprocal = vfrec7_v_f16m8(x, vl);
511 _reciprocal = vfmul_vv_f16m8(vfrsub_vf_f16m8(vfmul_vv_f16m8(x, _reciprocal, vl), 2.f, vl), _reciprocal, vl);
512 // _reciprocal = vfmul_vv_f16m8(vfrsub_vf_f16m8(vfmul_vv_f16m8(x, _reciprocal, vl), 2.f, vl), _reciprocal, vl);
513 return _reciprocal;
514 }
515 };
516
517 struct unary_op_tanh_fp16s
518 {
operator ()ncnn::unary_op_tanh_fp16s519 vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
520 {
521 return tanh_ps(x, vl);
522 }
523 };
524
forward_inplace_fp16s(Mat & bottom_top_blob,const Option & opt) const525 int UnaryOp_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
526 {
527 if (op_type == Operation_ABS)
528 return unary_op_inplace_fp16s<unary_op_abs_fp16s>(bottom_top_blob, opt);
529
530 if (op_type == Operation_NEG)
531 return unary_op_inplace_fp16s<unary_op_neg_fp16s>(bottom_top_blob, opt);
532
533 if (op_type == Operation_FLOOR)
534 return unary_op_inplace_fp16s<unary_op_floor_fp16s>(bottom_top_blob, opt);
535
536 if (op_type == Operation_CEIL)
537 return unary_op_inplace_fp16s<unary_op_ceil_fp16s>(bottom_top_blob, opt);
538
539 if (op_type == Operation_SQUARE)
540 return unary_op_inplace_fp16s<unary_op_square_fp16s>(bottom_top_blob, opt);
541
542 if (op_type == Operation_SQRT)
543 return unary_op_inplace_fp16s<unary_op_sqrt_fp16s>(bottom_top_blob, opt);
544
545 if (op_type == Operation_RSQRT)
546 return unary_op_inplace_fp16s<unary_op_rsqrt_fp16s>(bottom_top_blob, opt);
547
548 if (op_type == Operation_EXP)
549 return unary_op_inplace_fp16s<unary_op_exp_fp16s>(bottom_top_blob, opt);
550
551 if (op_type == Operation_LOG)
552 return unary_op_inplace_fp16s<unary_op_log_fp16s>(bottom_top_blob, opt);
553
554 if (op_type == Operation_SIN)
555 return unary_op_inplace_fp16s<unary_op_sin_fp16s>(bottom_top_blob, opt);
556
557 if (op_type == Operation_COS)
558 return unary_op_inplace_fp16s<unary_op_cos_fp16s>(bottom_top_blob, opt);
559
560 if (op_type == Operation_TAN)
561 return unary_op_inplace_fp16s<unary_op_tan_fp16s>(bottom_top_blob, opt);
562
563 if (op_type == Operation_ASIN)
564 return unary_op_inplace_fp16s<unary_op_asin_fp16s>(bottom_top_blob, opt);
565
566 if (op_type == Operation_ACOS)
567 return unary_op_inplace_fp16s<unary_op_acos_fp16s>(bottom_top_blob, opt);
568
569 if (op_type == Operation_ATAN)
570 return unary_op_inplace_fp16s<unary_op_atan_fp16s>(bottom_top_blob, opt);
571
572 if (op_type == Operation_RECIPROCAL)
573 return unary_op_inplace_fp16s<unary_op_reciprocal_fp16s>(bottom_top_blob, opt);
574
575 if (op_type == Operation_TANH)
576 return unary_op_inplace_fp16s<unary_op_tanh_fp16s>(bottom_top_blob, opt);
577
578 return 0;
579 }
580 #endif // __riscv_vector && __riscv_zfh
581
582 } // namespace ncnn
583