1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
15 #include "bias_x86.h"
16 
17 #if __SSE2__
18 #include <emmintrin.h>
19 #if __AVX__
20 #include <immintrin.h>
21 #endif // __AVX__
22 #endif // __SSE2__
23 
24 namespace ncnn {
25 
forward_inplace(Mat & bottom_top_blob,const Option & opt) const26 int Bias_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
27 {
28     int w = bottom_top_blob.w;
29     int h = bottom_top_blob.h;
30     int channels = bottom_top_blob.c;
31     int size = w * h;
32 
33     const float* bias_ptr = bias_data;
34     #pragma omp parallel for num_threads(opt.num_threads)
35     for (int q = 0; q < channels; q++)
36     {
37         float* ptr = bottom_top_blob.channel(q);
38 
39         float bias = bias_ptr[q];
40 
41         int i = 0;
42 #if __SSE2__
43 #if __AVX__
44         {
45             __m256 _bias256 = _mm256_set1_ps(bias);
46             for (; i + 7 < size; i += 8)
47             {
48                 __m256 _p = _mm256_loadu_ps(ptr);
49                 __m256 _outp = _mm256_add_ps(_p, _bias256);
50                 _mm256_storeu_ps(ptr, _outp);
51 
52                 ptr += 8;
53             }
54         }
55 #endif // __AVX__
56         {
57             __m128 _bias = _mm_set1_ps(bias);
58             for (; i + 3 < size; i += 4)
59             {
60                 __m128 _p = _mm_loadu_ps(ptr);
61                 __m128 _outp = _mm_add_ps(_p, _bias);
62                 _mm_storeu_ps(ptr, _outp);
63 
64                 ptr += 4;
65             }
66         }
67 #endif // __SSE2__
68 
69         for (; i < size; i++)
70         {
71             *ptr = *ptr + bias;
72 
73             ptr++;
74         }
75     }
76 
77     return 0;
78 }
79 
80 } // namespace ncnn
81