// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "sigmoid_arm.h"

#if __ARM_NEON
#include "neon_mathfun.h"

#include <arm_neon.h>
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include "neon_mathfun_fp16s.h"
#endif
#endif // __ARM_NEON

#include <math.h>

namespace ncnn {

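// The constructor only declares capabilities: it tells the framework which
// storage layouts this layer can consume, so the graph may hand it packed
// fp32, fp16 or bf16 blobs directly.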
Sigmoid_arm::Sigmoid_arm()
{
#if __ARM_NEON
    support_packing = true;
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    support_fp16_storage = true;
#endif
#endif // __ARM_NEON

#if NCNN_BF16
    support_bf16_storage = true;
#endif
}

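// forward_inplace dispatches on the blob's element width: 16-bit blobs go to
// the fp16 specializations (storage-only or full fp16 arithmetic) or the bf16
// specialization; everything else falls through to the fp32 path below.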
int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int elembits = bottom_top_blob.elembits();

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    if (opt.use_fp16_storage && elembits == 16)
    {
        if (opt.use_fp16_arithmetic)
            return forward_inplace_fp16sa(bottom_top_blob, opt);
        else
            return forward_inplace_fp16s(bottom_top_blob, opt);
    }
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_inplace_bf16s(bottom_top_blob, opt);
#endif

    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;
    int elempack = bottom_top_blob.elempack;

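    // elempack == 4 means each element holds 4 interleaved floats, so one
    // NEON register covers exactly one packed element and no scalar tail is
    // needed.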
#if __ARM_NEON
    if (elempack == 4)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);

            for (int i = 0; i < size; i++)
            {
                float32x4_t _p = vld1q_f32(ptr);
                _p = sigmoid_ps(_p);
                vst1q_f32(ptr, _p);

                ptr += 4;
            }
        }

        return 0;
    }
#endif // __ARM_NEON

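    // elempack == 1: process the bulk of each channel four floats at a time
    // with sigmoid_ps from neon_mathfun.h, then finish the remainder with a
    // scalar 1 / (1 + exp(-x)).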
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

#if __ARM_NEON
        int nn = size >> 2;
        int remain = size - (nn << 2);
#else
        int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
        for (; nn > 0; nn--)
        {
            float32x4_t _p = vld1q_f32(ptr);
            _p = sigmoid_ps(_p);
            vst1q_f32(ptr, _p);

            ptr += 4;
        }
#endif // __ARM_NEON
        for (; remain > 0; remain--)
        {
            *ptr = 1.f / (1.f + expf(-*ptr));

            ptr++;
        }
    }

    return 0;
}

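// fp16 variants: forward_inplace_fp16s keeps the data stored as fp16 but
// widens to fp32 for the sigmoid math, while forward_inplace_fp16sa does the
// arithmetic directly in half precision using the float16x4 / float16x8
// overloads of sigmoid_ps from neon_mathfun_fp16s.h.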
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
int Sigmoid_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;
    int elempack = bottom_top_blob.elempack;

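    // Each packed element is loaded as four halves, widened to fp32 with
    // vcvt_f32_f16, pushed through sigmoid_ps, and narrowed back on store.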
    if (elempack == 4)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            __fp16* ptr = bottom_top_blob.channel(q);

            for (int i = 0; i < size; i++)
            {
                float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr));
                _p = sigmoid_ps(_p);
                vst1_f16(ptr, vcvt_f16_f32(_p));

                ptr += 4;
            }
        }

        return 0;
    }

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        __fp16* ptr = bottom_top_blob.channel(q);

        int i = 0;
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr));
            _p = sigmoid_ps(_p);
            vst1_f16(ptr, vcvt_f16_f32(_p));

            ptr += 4;
        }
        for (; i < size; i++)
        {
            float v = (float)*ptr;
            v = 1.f / (1.f + expf(-v));
            *ptr = (__fp16)v;
            ptr++;
        }
    }

    return 0;
}

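// Full fp16 arithmetic: pack8 maps one 128-bit half register to one element,
// pack4 uses 64-bit registers, and the vector loops stay entirely in half
// precision via the fp16 overloads of sigmoid_ps.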
int Sigmoid_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;
    int elempack = bottom_top_blob.elempack;

    if (elempack == 8)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            __fp16* ptr = bottom_top_blob.channel(q);

            for (int i = 0; i < size; i++)
            {
                float16x8_t _p = vld1q_f16(ptr);
                _p = sigmoid_ps(_p);
                vst1q_f16(ptr, _p);

                ptr += 8;
            }
        }

        return 0;
    }

    if (elempack == 4)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            __fp16* ptr = bottom_top_blob.channel(q);

            for (int i = 0; i < size; i++)
            {
                float16x4_t _p = vld1_f16(ptr);
                _p = sigmoid_ps(_p);
                vst1_f16(ptr, _p);

                ptr += 4;
            }
        }

        return 0;
    }

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        __fp16* ptr = bottom_top_blob.channel(q);

        int i = 0;
        for (; i + 3 < size; i += 4)
        {
            float16x4_t _p = vld1_f16(ptr);
            _p = sigmoid_ps(_p);
            vst1_f16(ptr, _p);

            ptr += 4;
        }
        // scalar tail: compute in fp32 like the fp16s path, then narrow back
        for (; i < size; i++)
        {
            float v = (float)*ptr;
            v = 1.f / (1.f + expf(-v));
            *ptr = (__fp16)v;
            ptr++;
        }
    }

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

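// bf16 storage path: same structure as the fp32 code above, but elements are
// stored as raw 16-bit values and widened to fp32 around the math.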
#if NCNN_BF16
int Sigmoid_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;
    int elempack = bottom_top_blob.elempack;

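    // vcvt_f32_bf16 / vcvt_bf16_f32 are ncnn's bf16 helpers: a bf16 value is
    // the top 16 bits of an fp32, so widening places the stored bits in the
    // high half and narrowing keeps only the high half. The sigmoid itself
    // runs in fp32.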
#if __ARM_NEON
    if (elempack == 4)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            unsigned short* ptr = bottom_top_blob.channel(q);

            for (int i = 0; i < size; i++)
            {
                float32x4_t _p = vcvt_f32_bf16(vld1_u16(ptr));
                _p = sigmoid_ps(_p);
                vst1_u16(ptr, vcvt_bf16_f32(_p));

                ptr += 4;
            }
        }

        return 0;
    }
#endif // __ARM_NEON

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        unsigned short* ptr = bottom_top_blob.channel(q);

#if __ARM_NEON
        int nn = size >> 2;
        int remain = size - (nn << 2);
#else
        int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
        for (; nn > 0; nn--)
        {
            float32x4_t _p = vcvt_f32_bf16(vld1_u16(ptr));
            _p = sigmoid_ps(_p);
            vst1_u16(ptr, vcvt_bf16_f32(_p));

            ptr += 4;
        }
#endif // __ARM_NEON
        for (; remain > 0; remain--)
        {
            float v = bfloat16_to_float32(*ptr);
            v = 1.f / (1.f + expf(-v));
            *ptr = float32_to_bfloat16(v);

            ptr++;
        }
    }

    return 0;
}
#endif // NCNN_BF16

} // namespace ncnn