// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef ARM_ACTIVATION_H
#define ARM_ACTIVATION_H

#include "fused_activation.h"

#include <algorithm> // std::max
#include <cmath>     // expf, logf, tanhf

#if __ARM_NEON
#include <arm_neon.h>
#include "neon_mathfun.h"

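// Apply the fused activation selected by activation_type to a packed fp32 vector.
// The type/parameter convention follows ncnn's fused activation types:
//   0 = identity (falls through unchanged), 1 = ReLU,
//   2 = leaky ReLU (params[0] = slope),
//   3 = clip (params[0] = min, params[1] = max), 4 = sigmoid,
//   5 = mish, 6 = hardswish (params[0] = alpha, params[1] = beta)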
static inline float32x4_t activation_ps(float32x4_t _v, int activation_type, const ncnn::Mat& activation_params)
{
    if (activation_type == 1)
    {
        // ReLU: max(v, 0)
        const float32x4_t _zero = vdupq_n_f32(0.f);
        _v = vmaxq_f32(_v, _zero);
    }
    else if (activation_type == 2)
    {
        // leaky ReLU: keep positive lanes, scale non-positive lanes by the slope
        const float32x4_t _zero = vdupq_n_f32(0.f);
        const float32x4_t _slope = vdupq_n_f32(activation_params[0]);
        const uint32x4_t _lemask = vcleq_f32(_v, _zero);
        float32x4_t _ps = vmulq_f32(_v, _slope);
        _v = vbslq_f32(_lemask, _ps, _v);
    }
    else if (activation_type == 3)
    {
        // clip: clamp v into [min, max]
        const float32x4_t _min = vdupq_n_f32(activation_params[0]);
        const float32x4_t _max = vdupq_n_f32(activation_params[1]);
        _v = vmaxq_f32(_v, _min);
        _v = vminq_f32(_v, _max);
    }
    else if (activation_type == 4)
    {
        // sigmoid: 1 / (1 + exp(-v))
        _v = sigmoid_ps(_v);
    }
    else if (activation_type == 5)
    {
        // mish: v * tanh(softplus(v)) = v * tanh(log(1 + exp(v)))
        _v = vmulq_f32(_v, tanh_ps(log_ps(vaddq_f32(exp_ps(_v), vdupq_n_f32(1.f)))));
    }
    else if (activation_type == 6)
    {
        // hardswish: v * clamp(alpha * v + beta, 0, 1)
        const float alpha = activation_params[0];
        const float beta = activation_params[1];
        const float32x4_t _zero = vdupq_n_f32(0.f);
        const float32x4_t _one = vdupq_n_f32(1.f);
        float32x4_t _ans = vdupq_n_f32(beta);
        _ans = vmlaq_n_f32(_ans, _v, alpha);
        _ans = vmaxq_f32(_ans, _zero);
        _ans = vminq_f32(_ans, _one);
        _v = vmulq_f32(_ans, _v);
    }

    return _v;
}
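
// A minimal usage sketch (illustrative only; ptr and size are hypothetical
// names, not part of this header): apply leaky ReLU (type 2) with slope 0.1f
// over a float buffer, four lanes at a time.
//
//   ncnn::Mat params(1);
//   params[0] = 0.1f;
//   for (int i = 0; i + 3 < size; i += 4)
//   {
//       float32x4_t _p = vld1q_f32(ptr + i);
//       _p = activation_ps(_p, 2, params);
//       vst1q_f32(ptr + i, _p);
//   }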

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include "neon_mathfun_fp16s.h"

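// Scalar __fp16 variant of the same fused activations, used for tail elements
// that do not fill a whole vector. The transcendental math is evaluated in
// fp32 and narrowed back to fp16.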
static inline __fp16 activation_ss(__fp16 v, int activation_type, const ncnn::Mat& activation_params)
{
    if (activation_type == 1)
    {
        // ReLU
        v = std::max(v, (__fp16)0.f);
    }
    else if (activation_type == 2)
    {
        // leaky ReLU
        const __fp16 slope = (__fp16)(activation_params[0]);
        v = v > 0.f ? v : v * slope;
    }
    else if (activation_type == 3)
    {
        // clip to [min, max]
        const __fp16 min = (__fp16)(activation_params[0]);
        const __fp16 max = (__fp16)(activation_params[1]);
        if (v < min)
            v = min;
        if (v > max)
            v = max;
    }
    else if (activation_type == 4)
    {
        // sigmoid, computed in fp32 to limit rounding error
        v = (__fp16)(1.f / (1.f + expf(-(float)v)));
    }
    else if (activation_type == 5)
    {
        // mish: v * tanh(log(1 + exp(v))), computed in fp32
        v = (__fp16)((float)v * tanhf(logf(expf((float)v) + 1.f)));
    }
    else if (activation_type == 6)
    {
        // hardswish: 0 below lower, unchanged above upper, v * (v * alpha + beta) in between
        const __fp16 alpha = (__fp16)(activation_params[0]);
        const __fp16 beta = (__fp16)(activation_params[1]);
        const __fp16 lower = -beta / alpha;
        const __fp16 upper = ((__fp16)1.f / alpha) + lower;
        if (v < lower)
            v = (__fp16)0.f;
        else if (v <= upper)
            v = v * (v * alpha + beta);
    }

    return v;
}

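// 4-lane fp16 variant; mirrors the fp32 path using the half-precision
// intrinsics and the fp16 routines from neon_mathfun_fp16s.h.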
static inline float16x4_t activation_ps(float16x4_t _v, int activation_type, const ncnn::Mat& activation_params)
{
    if (activation_type == 1)
    {
        // ReLU
        const float16x4_t _zero = vdup_n_f16(0.f);
        _v = vmax_f16(_v, _zero);
    }
    else if (activation_type == 2)
    {
        // leaky ReLU
        const float16x4_t _zero = vdup_n_f16(0.f);
        const float16x4_t _slope = vdup_n_f16((__fp16)activation_params[0]);
        const uint16x4_t _lemask = vcle_f16(_v, _zero);
        float16x4_t _ps = vmul_f16(_v, _slope);
        _v = vbsl_f16(_lemask, _ps, _v);
    }
    else if (activation_type == 3)
    {
        // clip to [min, max]
        const float16x4_t _min = vdup_n_f16((__fp16)activation_params[0]);
        const float16x4_t _max = vdup_n_f16((__fp16)activation_params[1]);
        _v = vmax_f16(_v, _min);
        _v = vmin_f16(_v, _max);
    }
    else if (activation_type == 4)
    {
        // sigmoid
        _v = sigmoid_ps(_v);
    }
    else if (activation_type == 5)
    {
        // mish: v * tanh(log(1 + exp(v)))
        _v = vmul_f16(_v, tanh_ps(log_ps(vadd_f16(exp_ps(_v), vdup_n_f16(1.f)))));
    }
    else if (activation_type == 6)
    {
        // hardswish: v * clamp(alpha * v + beta, 0, 1)
        const __fp16 alpha = (__fp16)activation_params[0];
        const __fp16 beta = (__fp16)activation_params[1];
        const float16x4_t _zero = vdup_n_f16(0.f);
        const float16x4_t _one = vdup_n_f16(1.f);
        float16x4_t _ans = vdup_n_f16(beta);
        _ans = vfma_n_f16(_ans, _v, alpha);
        _ans = vmax_f16(_ans, _zero);
        _ans = vmin_f16(_ans, _one);
        _v = vmul_f16(_ans, _v);
    }

    return _v;
}

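// 8-lane fp16 variant; identical logic on full 128-bit registers.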
static inline float16x8_t activation_ps(float16x8_t _v, int activation_type, const ncnn::Mat& activation_params)
{
    if (activation_type == 1)
    {
        // ReLU
        const float16x8_t _zero = vdupq_n_f16(0.f);
        _v = vmaxq_f16(_v, _zero);
    }
    else if (activation_type == 2)
    {
        // leaky ReLU
        const float16x8_t _zero = vdupq_n_f16(0.f);
        const float16x8_t _slope = vdupq_n_f16((__fp16)activation_params[0]);
        const uint16x8_t _lemask = vcleq_f16(_v, _zero);
        float16x8_t _ps = vmulq_f16(_v, _slope);
        _v = vbslq_f16(_lemask, _ps, _v);
    }
    else if (activation_type == 3)
    {
        // clip to [min, max]
        const float16x8_t _min = vdupq_n_f16((__fp16)activation_params[0]);
        const float16x8_t _max = vdupq_n_f16((__fp16)activation_params[1]);
        _v = vmaxq_f16(_v, _min);
        _v = vminq_f16(_v, _max);
    }
    else if (activation_type == 4)
    {
        // sigmoid
        _v = sigmoid_ps(_v);
    }
    else if (activation_type == 5)
    {
        // mish: v * tanh(log(1 + exp(v)))
        _v = vmulq_f16(_v, tanh_ps(log_ps(vaddq_f16(exp_ps(_v), vdupq_n_f16(1.f)))));
    }
    else if (activation_type == 6)
    {
        // hardswish: v * clamp(alpha * v + beta, 0, 1)
        const __fp16 alpha_fp16 = (__fp16)activation_params[0];
        const __fp16 beta_fp16 = (__fp16)activation_params[1];
        const float16x8_t _zero = vdupq_n_f16(0.f);
        const float16x8_t _one = vdupq_n_f16(1.f);
        float16x8_t _ans = vdupq_n_f16(beta_fp16);
        _ans = vfmaq_n_f16(_ans, _v, alpha_fp16);
        _ans = vmaxq_f16(_ans, _zero);
        _ans = vminq_f16(_ans, _one);
        _v = vmulq_f16(_ans, _v);
    }

    return _v;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#endif // __ARM_NEON

#endif // ARM_ACTIVATION_H