// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

15 #include "lrn_x86.h"
16
17 #if __AVX__
18 #include "avx_mathfun.h"
19 #endif // __AVX__
20
21 #include <math.h>
22
23 namespace ncnn {
24
forward_inplace(Mat & bottom_top_blob,const Option & opt) const25 int LRN_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
26 {
27 int w = bottom_top_blob.w;
28 int h = bottom_top_blob.h;
29 int channels = bottom_top_blob.c;
30 size_t elemsize = bottom_top_blob.elemsize;
31 int size = w * h;
32
33 // squared values with local_size padding
34 Mat square_blob;
35 square_blob.create(w, h, channels, elemsize, opt.workspace_allocator);
36 if (square_blob.empty())
37 return -100;
38
39 #pragma omp parallel for num_threads(opt.num_threads)
40 for (int q = 0; q < channels; q++)
41 {
42 const float* ptr = bottom_top_blob.channel(q);
43 float* outptr = square_blob.channel(q);
44
45 int i = 0;
46 #if __AVX__
47 for (; i + 7 < size; i += 8)
48 {
49 __m256 _p = _mm256_loadu_ps(ptr);
50 __m256 _outp = _mm256_mul_ps(_p, _p);
51 _mm256_storeu_ps(outptr, _outp);
52
53 ptr += 8;
54 outptr += 8;
55 }
56 #endif // __AVX__
57 for (; i < size; i++)
58 {
59 *outptr = *ptr * *ptr;
60
61 ptr++;
62 outptr++;
63 }
64 }
65
66 if (region_type == NormRegion_ACROSS_CHANNELS)
67 {
68 Mat square_sum;
69 square_sum.create(w, h, channels, elemsize, opt.workspace_allocator);
70 if (square_sum.empty())
71 return -100;
72 square_sum.fill(0.f);
73
74 const float alpha_div_size = alpha / local_size;
75
76 #pragma omp parallel for num_threads(opt.num_threads)
77 for (int q = 0; q < channels; q++)
78 {
79 // square sum
80 for (int p = q - local_size / 2; p <= q + local_size / 2; p++)
81 {
82 if (p < 0 || p >= channels)
83 continue;
84
85 const float* sptr = square_blob.channel(p);
86 float* ssptr = square_sum.channel(q);
87
88 int i = 0;
89 #if __AVX__
90 for (; i + 7 < size; i += 8)
91 {
92 __m256 _sp = _mm256_loadu_ps(sptr);
93 __m256 _ssp = _mm256_loadu_ps(ssptr);
94 _ssp = _mm256_add_ps(_ssp, _sp);
95 _mm256_storeu_ps(ssptr, _ssp);
96
97 sptr += 8;
98 ssptr += 8;
99 }
100 #endif // __AVX__
101 for (; i < size; i++)
102 {
103 *ssptr += *sptr;
104 sptr++;
105 ssptr++;
106 }
107 }
108
109 float* ptr = bottom_top_blob.channel(q);
110 float* ssptr = square_sum.channel(q);
111
112 int i = 0;
113 #if __AVX__
114 __m256 _bias = _mm256_set1_ps(bias);
115 __m256 _ads = _mm256_set1_ps(alpha_div_size);
116 __m256 _mb = _mm256_set1_ps(-beta);
117 for (; i + 7 < size; i += 8)
118 {
119 __m256 _p = _mm256_loadu_ps(ptr);
120 __m256 _ssp = _mm256_loadu_ps(ssptr);
121 _ssp = _mm256_mul_ps(_ssp, _ads);
122 _ssp = _mm256_add_ps(_ssp, _bias);
123 _ssp = pow_ps(_ssp, _mb);
124 _p = _mm256_mul_ps(_p, _ssp);
125 _mm256_storeu_ps(ptr, _p);
126
127 ssptr += 8;
128 ptr += 8;
129 }
130 #endif // __AVX__
131 for (; i < size; i++)
132 {
133 *ptr = *ptr * pow(bias + alpha_div_size * *ssptr, -beta);
134
135 ssptr++;
136 ptr++;
137 }
138 }
139 }
140 else if (region_type == NormRegion_WITHIN_CHANNEL)
141 {
142 int outw = w;
143 int outh = h;
144
145 Mat square_blob_bordered = square_blob;
146 int pad = local_size / 2;
147 if (pad > 0)
148 {
149 Option opt_b = opt;
150 opt_b.blob_allocator = opt.workspace_allocator;
151 copy_make_border(square_blob, square_blob_bordered, pad, local_size - pad - 1, pad, local_size - pad - 1, BORDER_CONSTANT, 0.f, opt_b);
152 if (square_blob_bordered.empty())
153 return -100;
154
155 w = square_blob_bordered.w;
156 h = square_blob_bordered.h;
157 }
158
159 const int maxk = local_size * local_size;
160
161 const float alpha_div_size = alpha / maxk;
162
163 // norm window offsets
164 std::vector<int> _space_ofs(maxk);
165 int* space_ofs = &_space_ofs[0];
166 {
167 int p1 = 0;
168 int p2 = 0;
169 int gap = w - local_size;
170 for (int i = 0; i < local_size; i++)
171 {
172 for (int j = 0; j < local_size; j++)
173 {
174 space_ofs[p1] = p2;
175 p1++;
176 p2++;
177 }
178 p2 += gap;
179 }
180 }
181
182 #pragma omp parallel for num_threads(opt.num_threads)
183 for (int q = 0; q < channels; q++)
184 {
185 float* ptr = bottom_top_blob.channel(q);
186 const Mat m = square_blob_bordered.channel(q);
187
188 for (int i = 0; i < outh; i++)
189 {
190 for (int j = 0; j < outw; j++)
191 {
192 const float* sptr = m.row(i) + j;
193
194 float ss = 0.f;
195
196 for (int k = 0; k < maxk; k++)
197 {
198 float val = sptr[space_ofs[k]];
199 ss += val;
200 }
201
202 ptr[j] = ptr[j] * pow(bias + alpha_div_size * ss, -beta);
203 }
204
205 ptr += outw;
206 }
207 }
208 }
209
210 return 0;
211 }
212
213 } // namespace ncnn
214