1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
15 #include "unaryop_riscv.h"
16 
17 #if __riscv_vector
18 #ifdef RVV_SPEC_0_7
19 #include "riscv_v_071_fix.h"
20 #else
21 #include <riscv_vector.h>
22 #endif
23 #include "rvv_mathfun.h"
24 #include "rvv_mathfun_fp16s.h"
25 #endif // __riscv_vector
26 
27 #include <math.h>
28 
29 namespace ncnn {
30 
UnaryOp_riscv()31 UnaryOp_riscv::UnaryOp_riscv()
32 {
33 #if __riscv_vector
34     support_packing = true;
35 #if __riscv_zfh
36     support_fp16_storage = true;
37 #endif
38 #endif // __riscv_vector
39 }
40 
41 #if __riscv_vector
42 template<typename Op>
unary_op_inplace(Mat & a,const Option & opt)43 static int unary_op_inplace(Mat& a, const Option& opt)
44 {
45     Op op;
46 
47     int w = a.w;
48     int h = a.h;
49     int channels = a.c;
50     int size = w * h;
51     int elempack = a.elempack;
52 
53     #pragma omp parallel for num_threads(opt.num_threads)
54     for (int q = 0; q < channels; q++)
55     {
56         float* ptr = a.channel(q);
57 
58         int n = size * elempack;
59         while (n > 0)
60         {
61             word_type vl = vsetvl_e32m8(n);
62 
63             vfloat32m8_t _p = vle32_v_f32m8(ptr, vl);
64             _p = op(_p, vl);
65             vse32_v_f32m8(ptr, _p, vl);
66 
67             ptr += vl;
68             n -= vl;
69         }
70     }
71 
72     return 0;
73 }
74 
75 struct unary_op_abs
76 {
operator ()ncnn::unary_op_abs77     vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
78     {
79         return vfsgnj_vf_f32m8(x, 1.f, vl);
80     }
81 };
82 
83 struct unary_op_neg
84 {
operator ()ncnn::unary_op_neg85     vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
86     {
87         return vfneg_v_f32m8(x, vl);
88     }
89 };
90 
91 struct unary_op_floor
92 {
operator ()ncnn::unary_op_floor93     vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
94     {
95         vint32m8_t _xi = vfcvt_x_f_v_i32m8(x, vl);
96         vbool4_t _mask = vmfgt_vv_f32m8_b4(vfcvt_f_x_v_f32m8(_xi, vl), x, vl);
97         return vfcvt_f_x_v_f32m8(vsub_vx_i32m8_m(_mask, _xi, _xi, 1, vl), vl);
98     }
99 };
100 
101 struct unary_op_ceil
102 {
operator ()ncnn::unary_op_ceil103     vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
104     {
105         vint32m8_t _xi = vfcvt_x_f_v_i32m8(x, vl);
106         vbool4_t _mask = vmflt_vv_f32m8_b4(vfcvt_f_x_v_f32m8(_xi, vl), x, vl);
107         return vfcvt_f_x_v_f32m8(vadd_vx_i32m8_m(_mask, _xi, _xi, 1, vl), vl);
108     }
109 };
110 
111 struct unary_op_square
112 {
operator ()ncnn::unary_op_square113     vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
114     {
115         return vfmul_vv_f32m8(x, x, vl);
116     }
117 };
118 
119 struct unary_op_sqrt
120 {
operator ()ncnn::unary_op_sqrt121     vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
122     {
123         return vfsqrt_v_f32m8(x, vl);
124     }
125 };
126 
127 struct unary_op_rsqrt
128 {
operator ()ncnn::unary_op_rsqrt129     vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
130     {
131         vfloat32m8_t _reciprocal = vfrsqrt7_v_f32m8(x, vl);
132         _reciprocal = vfmul_vv_f32m8(vfrsub_vf_f32m8(vfmul_vv_f32m8(vfmul_vf_f32m8(x, 0.5f, vl), vfmul_vv_f32m8(_reciprocal, _reciprocal, vl), vl), 1.5f, vl), _reciprocal, vl);
133         // _reciprocal = vfmul_vv_f32m8(vfrsub_vf_f32m8(vfmul_vv_f32m8(vfmul_vf_f32m8(x, 0.5f, vl), vfmul_vv_f32m8(_reciprocal, _reciprocal, vl), vl), 1.5f, vl), _reciprocal, vl);
134         return _reciprocal;
135     }
136 };
137 
138 struct unary_op_exp
139 {
operator ()ncnn::unary_op_exp140     vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
141     {
142         return exp_ps(x, vl);
143     }
144 };
145 
146 struct unary_op_log
147 {
operator ()ncnn::unary_op_log148     vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
149     {
150         return log_ps(x, vl);
151     }
152 };
153 
154 struct unary_op_sin
155 {
operator ()ncnn::unary_op_sin156     vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
157     {
158         return sin_ps(x, vl);
159     }
160 };
161 
162 struct unary_op_cos
163 {
operator ()ncnn::unary_op_cos164     vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
165     {
166         return cos_ps(x, vl);
167     }
168 };
169 
170 struct unary_op_tan
171 {
operator ()ncnn::unary_op_tan172     vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
173     {
174         // TODO rvv optimize
175         std::vector<float> tmp(vl);
176         vse32_v_f32m8(tmp.data(), x, vl);
177         for (int i = 0; i < vl; i++)
178         {
179             tmp[i] = tan(tmp[i]);
180         }
181         return vle32_v_f32m8(tmp.data(), vl);
182     }
183 };
184 
185 struct unary_op_asin
186 {
operator ()ncnn::unary_op_asin187     vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
188     {
189         // TODO rvv optimize
190         std::vector<float> tmp(vl);
191         vse32_v_f32m8(tmp.data(), x, vl);
192         for (int i = 0; i < vl; i++)
193         {
194             tmp[i] = asin(tmp[i]);
195         }
196         return vle32_v_f32m8(tmp.data(), vl);
197     }
198 };
199 
200 struct unary_op_acos
201 {
operator ()ncnn::unary_op_acos202     vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
203     {
204         // TODO rvv optimize
205         std::vector<float> tmp(vl);
206         vse32_v_f32m8(tmp.data(), x, vl);
207         for (int i = 0; i < vl; i++)
208         {
209             tmp[i] = acos(tmp[i]);
210         }
211         return vle32_v_f32m8(tmp.data(), vl);
212     }
213 };
214 
215 struct unary_op_atan
216 {
operator ()ncnn::unary_op_atan217     vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
218     {
219         // TODO rvv optimize
220         std::vector<float> tmp(vl);
221         vse32_v_f32m8(tmp.data(), x, vl);
222         for (int i = 0; i < vl; i++)
223         {
224             tmp[i] = atan(tmp[i]);
225         }
226         return vle32_v_f32m8(tmp.data(), vl);
227     }
228 };
229 
230 struct unary_op_reciprocal
231 {
operator ()ncnn::unary_op_reciprocal232     vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
233     {
234         vfloat32m8_t _reciprocal = vfrec7_v_f32m8(x, vl);
235         _reciprocal = vfmul_vv_f32m8(vfrsub_vf_f32m8(vfmul_vv_f32m8(x, _reciprocal, vl), 2.f, vl), _reciprocal, vl);
236         // _reciprocal = vfmul_vv_f32m8(vfrsub_vf_f32m8(vfmul_vv_f32m8(x, _reciprocal, vl), 2.f, vl), _reciprocal, vl);
237         return _reciprocal;
238     }
239 };
240 
241 struct unary_op_tanh
242 {
operator ()ncnn::unary_op_tanh243     vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const
244     {
245         return tanh_ps(x, vl);
246     }
247 };
248 #endif // __riscv_vector
249 
forward_inplace(Mat & bottom_top_blob,const Option & opt) const250 int UnaryOp_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
251 {
252     int elembits = bottom_top_blob.elembits();
253 
254 #if __riscv_vector && __riscv_zfh
255     if (opt.use_fp16_storage && elembits == 16)
256         return forward_inplace_fp16s(bottom_top_blob, opt);
257 #endif
258 
259 #if __riscv_vector
260     if (op_type == Operation_ABS)
261         return unary_op_inplace<unary_op_abs>(bottom_top_blob, opt);
262 
263     if (op_type == Operation_NEG)
264         return unary_op_inplace<unary_op_neg>(bottom_top_blob, opt);
265 
266     if (op_type == Operation_FLOOR)
267         return unary_op_inplace<unary_op_floor>(bottom_top_blob, opt);
268 
269     if (op_type == Operation_CEIL)
270         return unary_op_inplace<unary_op_ceil>(bottom_top_blob, opt);
271 
272     if (op_type == Operation_SQUARE)
273         return unary_op_inplace<unary_op_square>(bottom_top_blob, opt);
274 
275     if (op_type == Operation_SQRT)
276         return unary_op_inplace<unary_op_sqrt>(bottom_top_blob, opt);
277 
278     if (op_type == Operation_RSQRT)
279         return unary_op_inplace<unary_op_rsqrt>(bottom_top_blob, opt);
280 
281     if (op_type == Operation_EXP)
282         return unary_op_inplace<unary_op_exp>(bottom_top_blob, opt);
283 
284     if (op_type == Operation_LOG)
285         return unary_op_inplace<unary_op_log>(bottom_top_blob, opt);
286 
287     if (op_type == Operation_SIN)
288         return unary_op_inplace<unary_op_sin>(bottom_top_blob, opt);
289 
290     if (op_type == Operation_COS)
291         return unary_op_inplace<unary_op_cos>(bottom_top_blob, opt);
292 
293     if (op_type == Operation_TAN)
294         return unary_op_inplace<unary_op_tan>(bottom_top_blob, opt);
295 
296     if (op_type == Operation_ASIN)
297         return unary_op_inplace<unary_op_asin>(bottom_top_blob, opt);
298 
299     if (op_type == Operation_ACOS)
300         return unary_op_inplace<unary_op_acos>(bottom_top_blob, opt);
301 
302     if (op_type == Operation_ATAN)
303         return unary_op_inplace<unary_op_atan>(bottom_top_blob, opt);
304 
305     if (op_type == Operation_RECIPROCAL)
306         return unary_op_inplace<unary_op_reciprocal>(bottom_top_blob, opt);
307 
308     if (op_type == Operation_TANH)
309         return unary_op_inplace<unary_op_tanh>(bottom_top_blob, opt);
310 
311     return 0;
312 #else  // __riscv_vector
313     return UnaryOp::forward_inplace(bottom_top_blob, opt);
314 #endif // __riscv_vector
315 }
316 
317 #if __riscv_vector && __riscv_zfh
318 template<typename Op>
unary_op_inplace_fp16s(Mat & a,const Option & opt)319 static int unary_op_inplace_fp16s(Mat& a, const Option& opt)
320 {
321     Op op;
322 
323     int w = a.w;
324     int h = a.h;
325     int channels = a.c;
326     int size = w * h;
327     int elempack = a.elempack;
328 
329     #pragma omp parallel for num_threads(opt.num_threads)
330     for (int q = 0; q < channels; q++)
331     {
332         __fp16* ptr = a.channel(q);
333 
334         int n = size * elempack;
335         while (n > 0)
336         {
337             word_type vl = vsetvl_e16m8(n);
338 
339             vfloat16m8_t _p = vle16_v_f16m8(ptr, vl);
340             _p = op(_p, vl);
341             vse16_v_f16m8(ptr, _p, vl);
342 
343             ptr += vl;
344             n -= vl;
345         }
346     }
347 
348     return 0;
349 }
350 
351 struct unary_op_abs_fp16s
352 {
operator ()ncnn::unary_op_abs_fp16s353     vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
354     {
355         return vfsgnj_vf_f16m8(x, 1.f, vl);
356     }
357 };
358 
359 struct unary_op_neg_fp16s
360 {
operator ()ncnn::unary_op_neg_fp16s361     vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
362     {
363         return vfneg_v_f16m8(x, vl);
364     }
365 };
366 
367 struct unary_op_floor_fp16s
368 {
operator ()ncnn::unary_op_floor_fp16s369     vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
370     {
371         vint16m8_t _xi = vfcvt_x_f_v_i16m8(x, vl);
372         vbool2_t _mask = vmfgt_vv_f16m8_b2(vfcvt_f_x_v_f16m8(_xi, vl), x, vl);
373         return vfcvt_f_x_v_f16m8(vsub_vx_i16m8_m(_mask, _xi, _xi, 1, vl), vl);
374     }
375 };
376 
377 struct unary_op_ceil_fp16s
378 {
operator ()ncnn::unary_op_ceil_fp16s379     vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
380     {
381         vint16m8_t _xi = vfcvt_x_f_v_i16m8(x, vl);
382         vbool2_t _mask = vmflt_vv_f16m8_b2(vfcvt_f_x_v_f16m8(_xi, vl), x, vl);
383         return vfcvt_f_x_v_f16m8(vadd_vx_i16m8_m(_mask, _xi, _xi, 1, vl), vl);
384     }
385 };
386 
387 struct unary_op_square_fp16s
388 {
operator ()ncnn::unary_op_square_fp16s389     vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
390     {
391         return vfmul_vv_f16m8(x, x, vl);
392     }
393 };
394 
395 struct unary_op_sqrt_fp16s
396 {
operator ()ncnn::unary_op_sqrt_fp16s397     vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
398     {
399         return vfsqrt_v_f16m8(x, vl);
400     }
401 };
402 
403 struct unary_op_rsqrt_fp16s
404 {
operator ()ncnn::unary_op_rsqrt_fp16s405     vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
406     {
407         vfloat16m8_t _reciprocal = vfrsqrt7_v_f16m8(x, vl);
408         _reciprocal = vfmul_vv_f16m8(vfrsub_vf_f16m8(vfmul_vv_f16m8(vfmul_vf_f16m8(x, 0.5f, vl), vfmul_vv_f16m8(_reciprocal, _reciprocal, vl), vl), 1.5f, vl), _reciprocal, vl);
409         // _reciprocal = vfmul_vv_f16m8(vfrsub_vf_f16m8(vfmul_vv_f16m8(vfmul_vf_f16m8(x, 0.5f, vl), vfmul_vv_f16m8(_reciprocal, _reciprocal, vl), vl), 1.5f, vl), _reciprocal, vl);
410         return _reciprocal;
411     }
412 };
413 
414 struct unary_op_exp_fp16s
415 {
operator ()ncnn::unary_op_exp_fp16s416     vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
417     {
418         return exp_ps(x, vl);
419     }
420 };
421 
422 struct unary_op_log_fp16s
423 {
operator ()ncnn::unary_op_log_fp16s424     vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
425     {
426         return log_ps(x, vl);
427     }
428 };
429 
430 struct unary_op_sin_fp16s
431 {
operator ()ncnn::unary_op_sin_fp16s432     vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
433     {
434         return sin_ps(x, vl);
435     }
436 };
437 
438 struct unary_op_cos_fp16s
439 {
operator ()ncnn::unary_op_cos_fp16s440     vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
441     {
442         return cos_ps(x, vl);
443     }
444 };
445 
446 struct unary_op_tan_fp16s
447 {
operator ()ncnn::unary_op_tan_fp16s448     vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
449     {
450         // TODO rvv optimize
451         std::vector<__fp16> tmp(vl);
452         vse16_v_f16m8(tmp.data(), x, vl);
453         for (int i = 0; i < vl; i++)
454         {
455             tmp[i] = tan((float)tmp[i]);
456         }
457         return vle16_v_f16m8(tmp.data(), vl);
458     }
459 };
460 
461 struct unary_op_asin_fp16s
462 {
operator ()ncnn::unary_op_asin_fp16s463     vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
464     {
465         // TODO rvv optimize
466         std::vector<__fp16> tmp(vl);
467         vse16_v_f16m8(tmp.data(), x, vl);
468         for (int i = 0; i < vl; i++)
469         {
470             tmp[i] = asin((float)tmp[i]);
471         }
472         return vle16_v_f16m8(tmp.data(), vl);
473     }
474 };
475 
476 struct unary_op_acos_fp16s
477 {
operator ()ncnn::unary_op_acos_fp16s478     vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
479     {
480         // TODO rvv optimize
481         std::vector<__fp16> tmp(vl);
482         vse16_v_f16m8(tmp.data(), x, vl);
483         for (int i = 0; i < vl; i++)
484         {
485             tmp[i] = acos((float)tmp[i]);
486         }
487         return vle16_v_f16m8(tmp.data(), vl);
488     }
489 };
490 
491 struct unary_op_atan_fp16s
492 {
operator ()ncnn::unary_op_atan_fp16s493     vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
494     {
495         // TODO rvv optimize
496         std::vector<__fp16> tmp(vl);
497         vse16_v_f16m8(tmp.data(), x, vl);
498         for (int i = 0; i < vl; i++)
499         {
500             tmp[i] = atan((float)tmp[i]);
501         }
502         return vle16_v_f16m8(tmp.data(), vl);
503     }
504 };
505 
506 struct unary_op_reciprocal_fp16s
507 {
operator ()ncnn::unary_op_reciprocal_fp16s508     vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
509     {
510         vfloat16m8_t _reciprocal = vfrec7_v_f16m8(x, vl);
511         _reciprocal = vfmul_vv_f16m8(vfrsub_vf_f16m8(vfmul_vv_f16m8(x, _reciprocal, vl), 2.f, vl), _reciprocal, vl);
512         // _reciprocal = vfmul_vv_f16m8(vfrsub_vf_f16m8(vfmul_vv_f16m8(x, _reciprocal, vl), 2.f, vl), _reciprocal, vl);
513         return _reciprocal;
514     }
515 };
516 
517 struct unary_op_tanh_fp16s
518 {
operator ()ncnn::unary_op_tanh_fp16s519     vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const
520     {
521         return tanh_ps(x, vl);
522     }
523 };
524 
forward_inplace_fp16s(Mat & bottom_top_blob,const Option & opt) const525 int UnaryOp_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
526 {
527     if (op_type == Operation_ABS)
528         return unary_op_inplace_fp16s<unary_op_abs_fp16s>(bottom_top_blob, opt);
529 
530     if (op_type == Operation_NEG)
531         return unary_op_inplace_fp16s<unary_op_neg_fp16s>(bottom_top_blob, opt);
532 
533     if (op_type == Operation_FLOOR)
534         return unary_op_inplace_fp16s<unary_op_floor_fp16s>(bottom_top_blob, opt);
535 
536     if (op_type == Operation_CEIL)
537         return unary_op_inplace_fp16s<unary_op_ceil_fp16s>(bottom_top_blob, opt);
538 
539     if (op_type == Operation_SQUARE)
540         return unary_op_inplace_fp16s<unary_op_square_fp16s>(bottom_top_blob, opt);
541 
542     if (op_type == Operation_SQRT)
543         return unary_op_inplace_fp16s<unary_op_sqrt_fp16s>(bottom_top_blob, opt);
544 
545     if (op_type == Operation_RSQRT)
546         return unary_op_inplace_fp16s<unary_op_rsqrt_fp16s>(bottom_top_blob, opt);
547 
548     if (op_type == Operation_EXP)
549         return unary_op_inplace_fp16s<unary_op_exp_fp16s>(bottom_top_blob, opt);
550 
551     if (op_type == Operation_LOG)
552         return unary_op_inplace_fp16s<unary_op_log_fp16s>(bottom_top_blob, opt);
553 
554     if (op_type == Operation_SIN)
555         return unary_op_inplace_fp16s<unary_op_sin_fp16s>(bottom_top_blob, opt);
556 
557     if (op_type == Operation_COS)
558         return unary_op_inplace_fp16s<unary_op_cos_fp16s>(bottom_top_blob, opt);
559 
560     if (op_type == Operation_TAN)
561         return unary_op_inplace_fp16s<unary_op_tan_fp16s>(bottom_top_blob, opt);
562 
563     if (op_type == Operation_ASIN)
564         return unary_op_inplace_fp16s<unary_op_asin_fp16s>(bottom_top_blob, opt);
565 
566     if (op_type == Operation_ACOS)
567         return unary_op_inplace_fp16s<unary_op_acos_fp16s>(bottom_top_blob, opt);
568 
569     if (op_type == Operation_ATAN)
570         return unary_op_inplace_fp16s<unary_op_atan_fp16s>(bottom_top_blob, opt);
571 
572     if (op_type == Operation_RECIPROCAL)
573         return unary_op_inplace_fp16s<unary_op_reciprocal_fp16s>(bottom_top_blob, opt);
574 
575     if (op_type == Operation_TANH)
576         return unary_op_inplace_fp16s<unary_op_tanh_fp16s>(bottom_top_blob, opt);
577 
578     return 0;
579 }
580 #endif // __riscv_vector && __riscv_zfh
581 
582 } // namespace ncnn
583