// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

15 #include "hardsigmoid_x86.h"
16
17 #if __SSE2__
18 #include <emmintrin.h>
19 #if __AVX__
20 #include <immintrin.h>
21 #endif // __AVX__
22 #endif // __SSE2__
23
24 namespace ncnn {
25
HardSigmoid_x86()26 HardSigmoid_x86::HardSigmoid_x86()
27 {
28 #if __SSE2__
29 support_packing = true;
30 #endif // __SSE2__
31 }
32
forward_inplace(Mat & bottom_top_blob,const Option & opt) const33 int HardSigmoid_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
34 {
35 int w = bottom_top_blob.w;
36 int h = bottom_top_blob.h;
37 int channels = bottom_top_blob.c;
38 int size = w * h;
39 #if __SSE2__
40 int elempack = bottom_top_blob.elempack;
41
42 #if __AVX__
43 if (elempack == 8)
44 {
45 #pragma omp parallel for num_threads(opt.num_threads)
46 for (int q = 0; q < channels; q++)
47 {
48 float* ptr = bottom_top_blob.channel(q);
49
50 __m256 _zero = _mm256_set1_ps(0.f);
51 __m256 _one = _mm256_set1_ps(1.f);
52 for (int i = 0; i < size; i++)
53 {
54 __m256 _p = _mm256_loadu_ps(ptr);
55 __m256 _ans = _mm256_set1_ps(beta);
56 _ans = _mm256_fmadd_ps(_p, _mm256_set1_ps(alpha), _ans);
57 _ans = _mm256_max_ps(_ans, _zero);
58 _ans = _mm256_min_ps(_ans, _one);
59 _mm256_storeu_ps(ptr, _ans);
60
61 ptr += 8;
62 }
63 }
64
65 return 0;
66 }
67 #endif // __AVX__
68
69 if (elempack == 4)
70 {
71 #pragma omp parallel for num_threads(opt.num_threads)
72 for (int q = 0; q < channels; q++)
73 {
74 float* ptr = bottom_top_blob.channel(q);
75
76 __m128 _zero = _mm_set1_ps(0.f);
77 __m128 _one = _mm_set1_ps(1.f);
78 for (int i = 0; i < size; i++)
79 {
80 __m128 _p = _mm_loadu_ps(ptr);
81 __m128 _ans = _mm_set1_ps(beta);
82 _ans = _mm_add_ps(_mm_mul_ps(_p, _mm_set1_ps(alpha)), _ans);
83 _ans = _mm_max_ps(_ans, _zero);
84 _ans = _mm_min_ps(_ans, _one);
85 _mm_storeu_ps(ptr, _ans);
86
87 ptr += 4;
88 }
89 }
90
91 return 0;
92 }
93 #endif // __SSE2__
94
95 #pragma omp parallel for num_threads(opt.num_threads)
96 for (int q = 0; q < channels; q++)
97 {
98 float* ptr = bottom_top_blob.channel(q);
99
100 int i = 0;
101 #if __AVX__
102 __m256 _zero = _mm256_set1_ps(0.f);
103 __m256 _one = _mm256_set1_ps(1.f);
104 for (; i + 7 < size; i += 8)
105 {
106 __m256 _p = _mm256_loadu_ps(ptr);
107 __m256 _ans = _mm256_set1_ps(beta);
108 _ans = _mm256_fmadd_ps(_p, _mm256_set1_ps(alpha), _ans);
109 _ans = _mm256_max_ps(_ans, _zero);
110 _ans = _mm256_min_ps(_ans, _one);
111 _mm256_storeu_ps(ptr, _ans);
112
113 ptr += 8;
114 }
115 #endif
116 for (; i < size; i++)
117 {
118 if (*ptr < lower)
119 *ptr = 0.f;
120 else if (*ptr > upper)
121 *ptr = 1.f;
122 else
123 *ptr = *ptr * alpha + beta;
124 ++ptr;
125 }
126 }
127
128 return 0;
129 }
130
131 } // namespace ncnn
132