1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
15 #include "lrn_x86.h"
16 
17 #if __AVX__
18 #include "avx_mathfun.h"
19 #endif // __AVX__
20 
21 #include <math.h>
22 
23 namespace ncnn {
24 
forward_inplace(Mat & bottom_top_blob,const Option & opt) const25 int LRN_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
26 {
27     int w = bottom_top_blob.w;
28     int h = bottom_top_blob.h;
29     int channels = bottom_top_blob.c;
30     size_t elemsize = bottom_top_blob.elemsize;
31     int size = w * h;
32 
33     // squared values with local_size padding
34     Mat square_blob;
35     square_blob.create(w, h, channels, elemsize, opt.workspace_allocator);
36     if (square_blob.empty())
37         return -100;
38 
39     #pragma omp parallel for num_threads(opt.num_threads)
40     for (int q = 0; q < channels; q++)
41     {
42         const float* ptr = bottom_top_blob.channel(q);
43         float* outptr = square_blob.channel(q);
44 
45         int i = 0;
46 #if __AVX__
47         for (; i + 7 < size; i += 8)
48         {
49             __m256 _p = _mm256_loadu_ps(ptr);
50             __m256 _outp = _mm256_mul_ps(_p, _p);
51             _mm256_storeu_ps(outptr, _outp);
52 
53             ptr += 8;
54             outptr += 8;
55         }
56 #endif // __AVX__
57         for (; i < size; i++)
58         {
59             *outptr = *ptr * *ptr;
60 
61             ptr++;
62             outptr++;
63         }
64     }
65 
66     if (region_type == NormRegion_ACROSS_CHANNELS)
67     {
68         Mat square_sum;
69         square_sum.create(w, h, channels, elemsize, opt.workspace_allocator);
70         if (square_sum.empty())
71             return -100;
72         square_sum.fill(0.f);
73 
74         const float alpha_div_size = alpha / local_size;
75 
76         #pragma omp parallel for num_threads(opt.num_threads)
77         for (int q = 0; q < channels; q++)
78         {
79             // square sum
80             for (int p = q - local_size / 2; p <= q + local_size / 2; p++)
81             {
82                 if (p < 0 || p >= channels)
83                     continue;
84 
85                 const float* sptr = square_blob.channel(p);
86                 float* ssptr = square_sum.channel(q);
87 
88                 int i = 0;
89 #if __AVX__
90                 for (; i + 7 < size; i += 8)
91                 {
92                     __m256 _sp = _mm256_loadu_ps(sptr);
93                     __m256 _ssp = _mm256_loadu_ps(ssptr);
94                     _ssp = _mm256_add_ps(_ssp, _sp);
95                     _mm256_storeu_ps(ssptr, _ssp);
96 
97                     sptr += 8;
98                     ssptr += 8;
99                 }
100 #endif // __AVX__
101                 for (; i < size; i++)
102                 {
103                     *ssptr += *sptr;
104                     sptr++;
105                     ssptr++;
106                 }
107             }
108 
109             float* ptr = bottom_top_blob.channel(q);
110             float* ssptr = square_sum.channel(q);
111 
112             int i = 0;
113 #if __AVX__
114             __m256 _bias = _mm256_set1_ps(bias);
115             __m256 _ads = _mm256_set1_ps(alpha_div_size);
116             __m256 _mb = _mm256_set1_ps(-beta);
117             for (; i + 7 < size; i += 8)
118             {
119                 __m256 _p = _mm256_loadu_ps(ptr);
120                 __m256 _ssp = _mm256_loadu_ps(ssptr);
121                 _ssp = _mm256_mul_ps(_ssp, _ads);
122                 _ssp = _mm256_add_ps(_ssp, _bias);
123                 _ssp = pow_ps(_ssp, _mb);
124                 _p = _mm256_mul_ps(_p, _ssp);
125                 _mm256_storeu_ps(ptr, _p);
126 
127                 ssptr += 8;
128                 ptr += 8;
129             }
130 #endif // __AVX__
131             for (; i < size; i++)
132             {
133                 *ptr = *ptr * pow(bias + alpha_div_size * *ssptr, -beta);
134 
135                 ssptr++;
136                 ptr++;
137             }
138         }
139     }
140     else if (region_type == NormRegion_WITHIN_CHANNEL)
141     {
142         int outw = w;
143         int outh = h;
144 
145         Mat square_blob_bordered = square_blob;
146         int pad = local_size / 2;
147         if (pad > 0)
148         {
149             Option opt_b = opt;
150             opt_b.blob_allocator = opt.workspace_allocator;
151             copy_make_border(square_blob, square_blob_bordered, pad, local_size - pad - 1, pad, local_size - pad - 1, BORDER_CONSTANT, 0.f, opt_b);
152             if (square_blob_bordered.empty())
153                 return -100;
154 
155             w = square_blob_bordered.w;
156             h = square_blob_bordered.h;
157         }
158 
159         const int maxk = local_size * local_size;
160 
161         const float alpha_div_size = alpha / maxk;
162 
163         // norm window offsets
164         std::vector<int> _space_ofs(maxk);
165         int* space_ofs = &_space_ofs[0];
166         {
167             int p1 = 0;
168             int p2 = 0;
169             int gap = w - local_size;
170             for (int i = 0; i < local_size; i++)
171             {
172                 for (int j = 0; j < local_size; j++)
173                 {
174                     space_ofs[p1] = p2;
175                     p1++;
176                     p2++;
177                 }
178                 p2 += gap;
179             }
180         }
181 
182         #pragma omp parallel for num_threads(opt.num_threads)
183         for (int q = 0; q < channels; q++)
184         {
185             float* ptr = bottom_top_blob.channel(q);
186             const Mat m = square_blob_bordered.channel(q);
187 
188             for (int i = 0; i < outh; i++)
189             {
190                 for (int j = 0; j < outw; j++)
191                 {
192                     const float* sptr = m.row(i) + j;
193 
194                     float ss = 0.f;
195 
196                     for (int k = 0; k < maxk; k++)
197                     {
198                         float val = sptr[space_ofs[k]];
199                         ss += val;
200                     }
201 
202                     ptr[j] = ptr[j] * pow(bias + alpha_div_size * ss, -beta);
203                 }
204 
205                 ptr += outw;
206             }
207         }
208     }
209 
210     return 0;
211 }
212 
213 } // namespace ncnn
214