1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14
15 #include "padding_x86.h"
16
17 #if __SSE2__
18 #include <emmintrin.h>
19 #if __AVX__
20 #include <immintrin.h>
21 #endif // __AVX__
22 #endif // __SSE2__
23
24 namespace ncnn {
25
26 #if __SSE2__
27 #include "padding_pack4.h"
28 #include "padding_pack8_int8.h"
29 #if __AVX__
30 #include "padding_pack8.h"
31 #endif // __AVX__
32 #endif // __SSE2__
33
// Constructor: advertise packed-layout support when SSE2 is available, so the
// framework may feed this layer elempack=4/8 blobs handled by the fast paths
// in forward() / forward_int8().
Padding_x86::Padding_x86()
{
#if __SSE2__
    support_packing = true;
#endif // __SSE2__
}
40
// Pad a fp32 blob on x86, using packed SIMD fast paths where possible.
//
// Fast paths exist only for dims==3 with elempack 8 (AVX) or 4 (SSE2) and
// only when the channel-padding can be expressed in whole packs; every other
// configuration (dims 1/2 stubs below are TODO) falls through to the generic
// elempack=1 implementation in the base Padding::forward().
//
// Returns 0 on success, -100 on allocation failure.
int Padding_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // Nothing to pad: share the input (shallow copy) and succeed immediately.
    if (top == 0 && bottom == 0 && left == 0 && right == 0 && front == 0 && behind == 0)
    {
        top_blob = bottom_blob;
        return 0;
    }

    int elembits = bottom_blob.elembits();

    // 8-bit quantized blobs take the dedicated int8 path.
    if (elembits == 8)
        return forward_int8(bottom_blob, top_blob, opt);

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int dims = bottom_blob.dims;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

#if __SSE2__
#if __AVX__
    if (elempack == 8)
    {
        if (dims == 1)
        {
            // 1-D: padding applies along the (unpacked) element axis.
            int outw = w * elempack + left + right;

            // Keep the output as packed as its length allows (8 > 4 > 1).
            int out_elempack = outw % 8 == 0 ? 8 : outw % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            top_blob.create(outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            if (left % 8 == 0 && out_elempack == 8)
            {
                // TODO
            }
        }

        if (dims == 2)
        {
            // 2-D: top/bottom pad the packed row axis.
            int outw = w + left + right;
            int outh = h * elempack + top + bottom;

            int out_elempack = outh % 8 == 0 ? 8 : outh % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            if (top % 8 == 0 && out_elempack == 8)
            {
                // TODO
            }
        }

        if (dims == 3)
        {
            int outw = w + left + right;
            int outh = h + top + bottom;
            int outc = channels * elempack + front + behind;

            int out_elempack = outc % 8 == 0 ? 8 : outc % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            // Fast path requires: front padding is whole packs, output stays
            // pack-8, and if channel count changes then only constant padding
            // (type 0) can synthesize the new channels.
            if (front % 8 == 0 && out_elempack == 8 && !(outc != channels * elempack && type != 0))
            {
                int front_ = front / elempack;
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < outc / out_elempack; q++)
                {
                    Mat borderm = top_blob.channel(q);

                    // Per-channel pad values (from the base layer's
                    // per_channel_pad_data, presumably one float per output
                    // lane) or the scalar `value` broadcast to all 8 lanes.
                    __m256 pad_value = per_channel_pad_data_size ? _mm256_loadu_ps((const float*)per_channel_pad_data + q * 8) : _mm256_set1_ps(value);
                    //Channel padding
                    if ((q - front_) < 0 || (q - front_) >= channels)
                    {
                        // Entirely-new channel: fill with the pad value.
                        borderm.fill(pad_value);
                    }
                    else
                    {
                        // Existing channel: pad its 2-D plane edge-wise.
                        const Mat m = bottom_blob.channel(q - front_);
                        if (type == 0)
                            padding_constant_pack8_avx(m, borderm, top, bottom, left, right, pad_value);
                        if (type == 1)
                            padding_replicate_pack8_avx(m, borderm, top, bottom, left, right);
                        if (type == 2)
                            padding_reflect_pack8_avx(m, borderm, top, bottom, left, right);
                    }
                }

                return 0;
            }
        }
    }
#endif // __AVX__

    if (elempack == 4)
    {
        if (dims == 1)
        {
            int outw = w * elempack + left + right;

            // With AVX the output may even repack to 8; otherwise 4 or 1.
#if __AVX__
            int out_elempack = outw % 8 == 0 ? 8 : outw % 4 == 0 ? 4 : 1;
#else
            int out_elempack = outw % 4 == 0 ? 4 : 1;
#endif
            size_t out_elemsize = elemsize / elempack * out_elempack;

            top_blob.create(outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            if (left % 4 == 0 && out_elempack == 4)
            {
                // TODO
            }
        }

        if (dims == 2)
        {
            int outw = w + left + right;
            int outh = h * elempack + top + bottom;

#if __AVX__
            int out_elempack = outh % 8 == 0 ? 8 : outh % 4 == 0 ? 4 : 1;
#else
            int out_elempack = outh % 4 == 0 ? 4 : 1;
#endif
            size_t out_elemsize = elemsize / elempack * out_elempack;

            top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            if (top % 4 == 0 && out_elempack == 4)
            {
                // TODO
            }
        }

        if (dims == 3)
        {
            int outw = w + left + right;
            int outh = h + top + bottom;
            int outc = channels * elempack + front + behind;

#if __AVX__
            int out_elempack = outc % 8 == 0 ? 8 : outc % 4 == 0 ? 4 : 1;
#else
            int out_elempack = outc % 4 == 0 ? 4 : 1;
#endif
            size_t out_elemsize = elemsize / elempack * out_elempack;

            top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            // Same preconditions as the pack-8 path, at pack-4 granularity.
            if (front % 4 == 0 && out_elempack == 4 && !(outc != channels * elempack && type != 0))
            {
                int front_ = front / elempack;
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < outc / out_elempack; q++)
                {
                    Mat borderm = top_blob.channel(q);

                    __m128 pad_value = per_channel_pad_data_size ? _mm_loadu_ps((const float*)per_channel_pad_data + q * 4) : _mm_set1_ps(value);
                    //Channel padding
                    if ((q - front_) < 0 || (q - front_) >= channels)
                    {
                        borderm.fill(pad_value);
                    }
                    else
                    {
                        const Mat m = bottom_blob.channel(q - front_);
                        if (type == 0)
                            padding_constant_pack4_sse(m, borderm, top, bottom, left, right, pad_value);
                        if (type == 1)
                            padding_replicate_pack4_sse(m, borderm, top, bottom, left, right);
                        if (type == 2)
                            padding_reflect_pack4_sse(m, borderm, top, bottom, left, right);
                    }
                }

                return 0;
            }
        }
    }
#endif // __SSE2__

    // Generic fallback: unpack to elempack=1 (scratch via workspace allocator)
    // and let the reference implementation handle every remaining case.
    Mat bottom_blob_unpacked = bottom_blob;
    if (elempack != 1)
    {
        Option opt_pack1 = opt;
        opt_pack1.blob_allocator = opt.workspace_allocator;

        convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1);
    }

    return Padding::forward(bottom_blob_unpacked, top_blob, opt);
}
250
forward_int8(const Mat & bottom_blob,Mat & top_blob,const Option & opt) const251 int Padding_x86::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
252 {
253 int w = bottom_blob.w;
254 int h = bottom_blob.h;
255 int channels = bottom_blob.c;
256 int dims = bottom_blob.dims;
257 size_t elemsize = bottom_blob.elemsize;
258 int elempack = bottom_blob.elempack;
259
260 #if __SSE2__
261 if (elempack == 8)
262 {
263 if (dims == 1)
264 {
265 int outw = w * elempack + left + right;
266
267 int out_elempack = outw % 8 == 0 ? 8 : 1;
268 size_t out_elemsize = elemsize / elempack * out_elempack;
269
270 top_blob.create(outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
271 if (top_blob.empty())
272 return -100;
273
274 if (left % 8 == 0 && out_elempack == 8)
275 {
276 // TODO
277 }
278 }
279
280 if (dims == 2)
281 {
282 int outw = w + left + right;
283 int outh = h * elempack + top + bottom;
284
285 int out_elempack = outh % 8 == 0 ? 8 : 1;
286 size_t out_elemsize = elemsize / elempack * out_elempack;
287
288 top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
289 if (top_blob.empty())
290 return -100;
291
292 if (top % 8 == 0 && out_elempack == 8)
293 {
294 // TODO
295 }
296 }
297
298 if (dims == 3)
299 {
300 int outw = w + left + right;
301 int outh = h + top + bottom;
302 int outc = channels * elempack + front + behind;
303
304 int out_elempack = outc % 8 == 0 ? 8 : 1;
305 size_t out_elemsize = elemsize / elempack * out_elempack;
306
307 top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
308 if (top_blob.empty())
309 return -100;
310
311 if (front % 8 == 0 && out_elempack == 8 && !(outc != channels * elempack && type != 0))
312 {
313 int front_ = front / elempack;
314 #pragma omp parallel for num_threads(opt.num_threads)
315 for (int q = 0; q < outc / out_elempack; q++)
316 {
317 Mat borderm = top_blob.channel(q);
318
319 // TODO perchannel
320 // int64_t pad_value = per_channel_pad_data_size ? vld1_s8(per_channel_pad_data + q * 8) : vdup_n_s8((signed char)value);
321 int64_t v8 = (int64_t)value;
322 int64_t pad_value = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24) | (v8 << 32) | (v8 << 40) | (v8 << 48) | (v8 << 56);
323
324 //Channel padding
325 if ((q - front_) < 0 || (q - front_) >= channels)
326 {
327 borderm.fill<int64_t>(pad_value);
328 }
329 else
330 {
331 const Mat m = bottom_blob.channel(q - front_);
332 if (type == 0)
333 padding_constant_pack8_int8_sse(m, borderm, top, bottom, left, right, pad_value);
334 if (type == 1)
335 padding_replicate_pack8_int8_sse(m, borderm, top, bottom, left, right);
336 if (type == 2)
337 padding_reflect_pack8_int8_sse(m, borderm, top, bottom, left, right);
338 }
339 }
340
341 return 0;
342 }
343 }
344 }
345 #endif // __SSE2__
346
347 Mat bottom_blob_unpacked = bottom_blob;
348 if (elempack != 1)
349 {
350 Option opt_pack1 = opt;
351 opt_pack1.blob_allocator = opt.workspace_allocator;
352
353 convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1);
354 }
355
356 return Padding::forward(bottom_blob_unpacked, top_blob, opt);
357 }
358
359 } // namespace ncnn
360