1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
15 #include "clip_x86.h"
16 
17 #if __SSE2__
18 #include <emmintrin.h>
19 #if __AVX__
20 #include <immintrin.h>
21 #endif // __AVX__
22 #endif // __SSE2__
23 
24 namespace ncnn {
25 
Clip_x86()26 Clip_x86::Clip_x86()
27 {
28 #if __SSE2__
29     support_packing = true;
30 #endif // __SSE2__
31 }
32 
forward_inplace(Mat & bottom_top_blob,const Option & opt) const33 int Clip_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
34 {
35     int w = bottom_top_blob.w;
36     int h = bottom_top_blob.h;
37     int channels = bottom_top_blob.c;
38     int size = w * h;
39 #if __SSE2__
40     int elempack = bottom_top_blob.elempack;
41 
42 #if __AVX__
43     if (elempack == 8)
44     {
45         #pragma omp parallel for num_threads(opt.num_threads)
46         for (int q = 0; q < channels; q++)
47         {
48             float* ptr = bottom_top_blob.channel(q);
49 
50             __m256 _max = _mm256_set1_ps(max);
51             __m256 _min = _mm256_set1_ps(min);
52 
53             for (int i = 0; i < size; i++)
54             {
55                 __m256 _ptr = _mm256_loadu_ps(ptr);
56                 _ptr = _mm256_max_ps(_ptr, _min);
57                 _ptr = _mm256_min_ps(_ptr, _max);
58                 _mm256_storeu_ps(ptr, _ptr);
59 
60                 ptr += 8;
61             }
62         }
63 
64         return 0;
65     }
66 #endif // __AVX__
67 
68     if (elempack == 4)
69     {
70         #pragma omp parallel for num_threads(opt.num_threads)
71         for (int q = 0; q < channels; q++)
72         {
73             float* ptr = bottom_top_blob.channel(q);
74 
75             __m128 _max = _mm_set1_ps(max);
76             __m128 _min = _mm_set1_ps(min);
77 
78             for (int i = 0; i < size; i++)
79             {
80                 __m128 _ptr = _mm_loadu_ps(ptr);
81                 _ptr = _mm_max_ps(_ptr, _min);
82                 _ptr = _mm_min_ps(_ptr, _max);
83                 _mm_storeu_ps(ptr, _ptr);
84 
85                 ptr += 4;
86             }
87         }
88 
89         return 0;
90     }
91 #endif // __SSE2__
92 
93     #pragma omp parallel for num_threads(opt.num_threads)
94     for (int q = 0; q < channels; q++)
95     {
96         float* ptr = bottom_top_blob.channel(q);
97 
98         int remain = size;
99         for (; remain > 0; remain--)
100         {
101             if (*ptr < min)
102                 *ptr = min;
103 
104             if (*ptr > max)
105                 *ptr = max;
106 
107             ptr++;
108         }
109     }
110 
111     return 0;
112 }
113 
114 } //namespace ncnn
115