1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14
15 #include "sigmoid_arm.h"
16
17 #if __ARM_NEON
18 #include "neon_mathfun.h"
19
20 #include <arm_neon.h>
21 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
22 #include "neon_mathfun_fp16s.h"
23 #endif
24 #endif // __ARM_NEON
25
26 #include <math.h>
27
28 namespace ncnn {
29
Sigmoid_arm::Sigmoid_arm()
{
    // Advertise which optimized storage layouts this layer supports so the
    // graph runtime can feed it pre-packed / reduced-precision blobs.
#if __ARM_NEON
    // NEON build: elements may arrive packed 4-wide (elempack == 4).
    support_packing = true;
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    // armv8.2+ half-precision vector support: fp16 storage handled natively.
    support_fp16_storage = true;
#endif
#endif // __ARM_NEON

#if NCNN_BF16
    // bfloat16 storage supported on all builds (scalar fallback exists).
    support_bf16_storage = true;
#endif
}
43
forward_inplace(Mat & bottom_top_blob,const Option & opt) const44 int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
45 {
46 int elembits = bottom_top_blob.elembits();
47
48 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
49 if (opt.use_fp16_storage && elembits == 16)
50 {
51 if (opt.use_fp16_arithmetic)
52 return forward_inplace_fp16sa(bottom_top_blob, opt);
53 else
54 return forward_inplace_fp16s(bottom_top_blob, opt);
55 }
56 #endif
57
58 #if NCNN_BF16
59 if (opt.use_bf16_storage && elembits == 16)
60 return forward_inplace_bf16s(bottom_top_blob, opt);
61 #endif
62
63 int w = bottom_top_blob.w;
64 int h = bottom_top_blob.h;
65 int channels = bottom_top_blob.c;
66 int size = w * h;
67 int elempack = bottom_top_blob.elempack;
68
69 #if __ARM_NEON
70 if (elempack == 4)
71 {
72 #pragma omp parallel for num_threads(opt.num_threads)
73 for (int q = 0; q < channels; q++)
74 {
75 float* ptr = bottom_top_blob.channel(q);
76
77 for (int i = 0; i < size; i++)
78 {
79 float32x4_t _p = vld1q_f32(ptr);
80 _p = sigmoid_ps(_p);
81 vst1q_f32(ptr, _p);
82
83 ptr += 4;
84 }
85 }
86
87 return 0;
88 }
89 #endif // __ARM_NEON
90
91 #pragma omp parallel for num_threads(opt.num_threads)
92 for (int q = 0; q < channels; q++)
93 {
94 float* ptr = bottom_top_blob.channel(q);
95
96 #if __ARM_NEON
97 int nn = size >> 2;
98 int remain = size - (nn << 2);
99 #else
100 int remain = size;
101 #endif // __ARM_NEON
102
103 #if __ARM_NEON
104 for (; nn > 0; nn--)
105 {
106 float32x4_t _p = vld1q_f32(ptr);
107 _p = sigmoid_ps(_p);
108 vst1q_f32(ptr, _p);
109
110 ptr += 4;
111 }
112 #endif // __ARM_NEON
113 for (; remain > 0; remain--)
114 {
115 *ptr = 1.f / (1.f + exp(-*ptr));
116
117 ptr++;
118 }
119 }
120
121 return 0;
122 }
123
124 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
forward_inplace_fp16s(Mat & bottom_top_blob,const Option & opt) const125 int Sigmoid_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
126 {
127 int w = bottom_top_blob.w;
128 int h = bottom_top_blob.h;
129 int channels = bottom_top_blob.c;
130 int size = w * h;
131 int elempack = bottom_top_blob.elempack;
132
133 if (elempack == 4)
134 {
135 #pragma omp parallel for num_threads(opt.num_threads)
136 for (int q = 0; q < channels; q++)
137 {
138 __fp16* ptr = bottom_top_blob.channel(q);
139
140 for (int i = 0; i < size; i++)
141 {
142 float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr));
143 _p = sigmoid_ps(_p);
144 vst1_f16(ptr, vcvt_f16_f32(_p));
145
146 ptr += 4;
147 }
148 }
149
150 return 0;
151 }
152
153 #pragma omp parallel for num_threads(opt.num_threads)
154 for (int q = 0; q < channels; q++)
155 {
156 __fp16* ptr = bottom_top_blob.channel(q);
157
158 int i = 0;
159 for (; i + 3 < size; i += 4)
160 {
161 float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr));
162 _p = sigmoid_ps(_p);
163 vst1_f16(ptr, vcvt_f16_f32(_p));
164
165 ptr += 4;
166 }
167 for (; i < size; i++)
168 {
169 float v = (float)*ptr;
170 v = 1.f / (1.f + exp(-v));
171 *ptr = (__fp16)v;
172 ptr++;
173 }
174 }
175
176 return 0;
177 }
178
forward_inplace_fp16sa(Mat & bottom_top_blob,const Option & opt) const179 int Sigmoid_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const
180 {
181 int w = bottom_top_blob.w;
182 int h = bottom_top_blob.h;
183 int channels = bottom_top_blob.c;
184 int size = w * h;
185 int elempack = bottom_top_blob.elempack;
186
187 if (elempack == 8)
188 {
189 #pragma omp parallel for num_threads(opt.num_threads)
190 for (int q = 0; q < channels; q++)
191 {
192 __fp16* ptr = bottom_top_blob.channel(q);
193
194 for (int i = 0; i < size; i++)
195 {
196 float16x8_t _p = vld1q_f16(ptr);
197 _p = sigmoid_ps(_p);
198 vst1q_f16(ptr, _p);
199
200 ptr += 8;
201 }
202 }
203
204 return 0;
205 }
206
207 if (elempack == 4)
208 {
209 #pragma omp parallel for num_threads(opt.num_threads)
210 for (int q = 0; q < channels; q++)
211 {
212 __fp16* ptr = bottom_top_blob.channel(q);
213
214 for (int i = 0; i < size; i++)
215 {
216 float16x4_t _p = vld1_f16(ptr);
217 _p = sigmoid_ps(_p);
218 vst1_f16(ptr, _p);
219
220 ptr += 4;
221 }
222 }
223
224 return 0;
225 }
226
227 #pragma omp parallel for num_threads(opt.num_threads)
228 for (int q = 0; q < channels; q++)
229 {
230 __fp16* ptr = bottom_top_blob.channel(q);
231
232 int i = 0;
233 for (; i + 3 < size; i += 4)
234 {
235 float16x4_t _p = vld1_f16(ptr);
236 _p = sigmoid_ps(_p);
237 vst1_f16(ptr, _p);
238
239 ptr += 4;
240 }
241 for (; i < size; i++)
242 {
243 __fp16 v = *ptr;
244 v = 1.f / (1.f + exp(-v));
245 *ptr = v;
246 ptr++;
247 }
248 }
249
250 return 0;
251 }
252 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
253
254 #if NCNN_BF16
forward_inplace_bf16s(Mat & bottom_top_blob,const Option & opt) const255 int Sigmoid_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
256 {
257 int w = bottom_top_blob.w;
258 int h = bottom_top_blob.h;
259 int channels = bottom_top_blob.c;
260 int size = w * h;
261 int elempack = bottom_top_blob.elempack;
262
263 #if __ARM_NEON
264 if (elempack == 4)
265 {
266 #pragma omp parallel for num_threads(opt.num_threads)
267 for (int q = 0; q < channels; q++)
268 {
269 unsigned short* ptr = bottom_top_blob.channel(q);
270
271 for (int i = 0; i < size; i++)
272 {
273 float32x4_t _p = vcvt_f32_bf16(vld1_u16(ptr));
274 _p = sigmoid_ps(_p);
275 vst1_u16(ptr, vcvt_bf16_f32(_p));
276
277 ptr += 4;
278 }
279 }
280
281 return 0;
282 }
283 #endif // __ARM_NEON
284
285 #pragma omp parallel for num_threads(opt.num_threads)
286 for (int q = 0; q < channels; q++)
287 {
288 unsigned short* ptr = bottom_top_blob.channel(q);
289
290 #if __ARM_NEON
291 int nn = size >> 2;
292 int remain = size - (nn << 2);
293 #else
294 int remain = size;
295 #endif // __ARM_NEON
296
297 #if __ARM_NEON
298 for (; nn > 0; nn--)
299 {
300 float32x4_t _p = vcvt_f32_bf16(vld1_u16(ptr));
301 _p = sigmoid_ps(_p);
302 vst1_u16(ptr, vcvt_bf16_f32(_p));
303
304 ptr += 4;
305 }
306 #endif // __ARM_NEON
307 for (; remain > 0; remain--)
308 {
309 float v = bfloat16_to_float32(*ptr);
310 v = 1.f / (1.f + exp(-v));
311 *ptr = float32_to_bfloat16(v);
312
313 ptr++;
314 }
315 }
316
317 return 0;
318 }
319 #endif // NCNN_BF16
320
321 } // namespace ncnn
322