1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
15 #include "hardswish_x86.h"
16 
17 #if __SSE2__
18 #include <emmintrin.h>
19 #if __AVX__
20 #include <immintrin.h>
21 #endif // __AVX__
22 #endif // __SSE2__
23 
24 namespace ncnn {
25 
HardSwish_x86()26 HardSwish_x86::HardSwish_x86()
27 {
28 #if __SSE2__
29     support_packing = true;
30 #endif // __SSE2__
31 }
32 
forward_inplace(Mat & bottom_top_blob,const Option & opt) const33 int HardSwish_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
34 {
35     int w = bottom_top_blob.w;
36     int h = bottom_top_blob.h;
37     int channels = bottom_top_blob.c;
38     int size = w * h;
39 #if __SSE2__
40     int elempack = bottom_top_blob.elempack;
41 
42 #if __AVX__
43     if (elempack == 8)
44     {
45         #pragma omp parallel for num_threads(opt.num_threads)
46         for (int q = 0; q < channels; q++)
47         {
48             float* ptr = bottom_top_blob.channel(q);
49 
50             __m256 _zero = _mm256_set1_ps(0.f);
51             __m256 _one = _mm256_set1_ps(1.f);
52             for (int i = 0; i < size; i++)
53             {
54                 __m256 _p = _mm256_loadu_ps(ptr);
55                 __m256 _ans = _mm256_set1_ps(beta);
56                 _ans = _mm256_fmadd_ps(_p, _mm256_set1_ps(alpha), _ans);
57                 _ans = _mm256_max_ps(_ans, _zero);
58                 _ans = _mm256_min_ps(_ans, _one);
59                 _ans = _mm256_mul_ps(_ans, _p);
60                 _mm256_storeu_ps(ptr, _ans);
61 
62                 ptr += 8;
63             }
64         }
65 
66         return 0;
67     }
68 #endif // __AVX__
69 
70     if (elempack == 4)
71     {
72         #pragma omp parallel for num_threads(opt.num_threads)
73         for (int q = 0; q < channels; q++)
74         {
75             float* ptr = bottom_top_blob.channel(q);
76 
77             __m128 _zero = _mm_set1_ps(0.f);
78             __m128 _one = _mm_set1_ps(1.f);
79             for (int i = 0; i < size; i++)
80             {
81                 __m128 _p = _mm_loadu_ps(ptr);
82                 __m128 _ans = _mm_set1_ps(beta);
83                 _ans = _mm_add_ps(_mm_mul_ps(_p, _mm_set1_ps(alpha)), _ans);
84                 _ans = _mm_max_ps(_ans, _zero);
85                 _ans = _mm_min_ps(_ans, _one);
86                 _ans = _mm_mul_ps(_ans, _p);
87                 _mm_storeu_ps(ptr, _ans);
88 
89                 ptr += 4;
90             }
91         }
92 
93         return 0;
94     }
95 #endif // __SSE2__
96 
97     #pragma omp parallel for num_threads(opt.num_threads)
98     for (int q = 0; q < channels; q++)
99     {
100         float* ptr = bottom_top_blob.channel(q);
101 
102         int i = 0;
103 #if __AVX__
104         __m256 _zero = _mm256_set1_ps(0.f);
105         __m256 _one = _mm256_set1_ps(1.f);
106         for (; i + 7 < size; i += 8)
107         {
108             __m256 _p = _mm256_loadu_ps(ptr);
109             __m256 _ans = _mm256_set1_ps(beta);
110             _ans = _mm256_fmadd_ps(_p, _mm256_set1_ps(alpha), _ans);
111             _ans = _mm256_max_ps(_ans, _zero);
112             _ans = _mm256_min_ps(_ans, _one);
113             _ans = _mm256_mul_ps(_ans, _p);
114             _mm256_storeu_ps(ptr, _ans);
115 
116             ptr += 8;
117         }
118 #endif
119         for (; i < size; i++)
120         {
121             if (*ptr < lower)
122                 *ptr = 0.f;
123             else if (*ptr > upper)
124                 ;
125             else
126                 *ptr = *ptr * (*ptr * alpha + beta);
127             ++ptr;
128         }
129     }
130 
131     return 0;
132 }
133 
134 } // namespace ncnn
135