// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "padding_x86.h"

#if __SSE2__
#include <emmintrin.h>
#if __AVX__
#include <immintrin.h>
#endif // __AVX__
#endif // __SSE2__

namespace ncnn {

#if __SSE2__
#include "padding_pack4.h"
#include "padding_pack8_int8.h"
#if __AVX__
#include "padding_pack8.h"
#endif // __AVX__
#endif // __SSE2__

Padding_x86::Padding_x86()
{
#if __SSE2__
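    // advertise support for packed (pack4/pack8) blob layouts when SSE2 is available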
    support_packing = true;
#endif // __SSE2__
}

int Padding_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    if (top == 0 && bottom == 0 && left == 0 && right == 0 && front == 0 && behind == 0)
    {
        top_blob = bottom_blob;
        return 0;
    }

    int elembits = bottom_blob.elembits();

    if (elembits == 8)
        return forward_int8(bottom_blob, top_blob, opt);

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int dims = bottom_blob.dims;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

#if __SSE2__
#if __AVX__
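    // AVX pack8 path: each packed element holds 8 floats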
    if (elempack == 8)
    {
        if (dims == 1)
        {
            int outw = w * elempack + left + right;

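            // pick the widest packing that still divides the padded length exactly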
            int out_elempack = outw % 8 == 0 ? 8 : outw % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            top_blob.create(outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            if (left % 8 == 0 && out_elempack == 8)
            {
                // TODO
            }
        }

        if (dims == 2)
        {
            int outw = w + left + right;
            int outh = h * elempack + top + bottom;

            int out_elempack = outh % 8 == 0 ? 8 : outh % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            if (top % 8 == 0 && out_elempack == 8)
            {
                // TODO
            }
        }

        if (dims == 3)
        {
            int outw = w + left + right;
            int outh = h + top + bottom;
            int outc = channels * elempack + front + behind;

            int out_elempack = outc % 8 == 0 ? 8 : outc % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

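            // the packed path below handles channel padding only when it is constant (type 0);
            // replicate/reflect padding that grows the channel dimension falls through to the
            // generic implementation at the end of this function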
            if (front % 8 == 0 && out_elempack == 8 && !(outc != channels * elempack && type != 0))
            {
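                // front/behind are given in unpacked channels; convert front to packed-channel units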
                int front_ = front / elempack;
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < outc / out_elempack; q++)
                {
                    Mat borderm = top_blob.channel(q);

                    __m256 pad_value = per_channel_pad_data_size ? _mm256_loadu_ps((const float*)per_channel_pad_data + q * 8) : _mm256_set1_ps(value);
                    //Channel padding
                    if ((q - front_) < 0 || (q - front_) >= channels)
                    {
                        borderm.fill(pad_value);
                    }
                    else
                    {
                        const Mat m = bottom_blob.channel(q - front_);
                        if (type == 0)
                            padding_constant_pack8_avx(m, borderm, top, bottom, left, right, pad_value);
                        if (type == 1)
                            padding_replicate_pack8_avx(m, borderm, top, bottom, left, right);
                        if (type == 2)
                            padding_reflect_pack8_avx(m, borderm, top, bottom, left, right);
                    }
                }

                return 0;
            }
        }
    }
#endif // __AVX__

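    // SSE pack4 path: each packed element holds 4 floats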
    if (elempack == 4)
    {
        if (dims == 1)
        {
            int outw = w * elempack + left + right;

#if __AVX__
            int out_elempack = outw % 8 == 0 ? 8 : outw % 4 == 0 ? 4 : 1;
#else
            int out_elempack = outw % 4 == 0 ? 4 : 1;
#endif
            size_t out_elemsize = elemsize / elempack * out_elempack;

            top_blob.create(outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            if (left % 4 == 0 && out_elempack == 4)
            {
                // TODO
            }
        }

        if (dims == 2)
        {
            int outw = w + left + right;
            int outh = h * elempack + top + bottom;

#if __AVX__
            int out_elempack = outh % 8 == 0 ? 8 : outh % 4 == 0 ? 4 : 1;
#else
            int out_elempack = outh % 4 == 0 ? 4 : 1;
#endif
            size_t out_elemsize = elemsize / elempack * out_elempack;

            top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            if (top % 4 == 0 && out_elempack == 4)
            {
                // TODO
            }
        }

        if (dims == 3)
        {
            int outw = w + left + right;
            int outh = h + top + bottom;
            int outc = channels * elempack + front + behind;

#if __AVX__
            int out_elempack = outc % 8 == 0 ? 8 : outc % 4 == 0 ? 4 : 1;
#else
            int out_elempack = outc % 4 == 0 ? 4 : 1;
#endif
            size_t out_elemsize = elemsize / elempack * out_elempack;

            top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            if (front % 4 == 0 && out_elempack == 4 && !(outc != channels * elempack && type != 0))
            {
                int front_ = front / elempack;
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < outc / out_elempack; q++)
                {
                    Mat borderm = top_blob.channel(q);

                    __m128 pad_value = per_channel_pad_data_size ? _mm_loadu_ps((const float*)per_channel_pad_data + q * 4) : _mm_set1_ps(value);
                    //Channel padding
                    if ((q - front_) < 0 || (q - front_) >= channels)
                    {
                        borderm.fill(pad_value);
                    }
                    else
                    {
                        const Mat m = bottom_blob.channel(q - front_);
                        if (type == 0)
                            padding_constant_pack4_sse(m, borderm, top, bottom, left, right, pad_value);
                        if (type == 1)
                            padding_replicate_pack4_sse(m, borderm, top, bottom, left, right);
                        if (type == 2)
                            padding_reflect_pack4_sse(m, borderm, top, bottom, left, right);
                    }
                }

                return 0;
            }
        }
    }
#endif // __SSE2__

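    // fallback: unpack to elempack=1 and defer to the generic Padding implementation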
    Mat bottom_blob_unpacked = bottom_blob;
    if (elempack != 1)
    {
        Option opt_pack1 = opt;
        opt_pack1.blob_allocator = opt.workspace_allocator;

        convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1);
    }

    return Padding::forward(bottom_blob_unpacked, top_blob, opt);
}

int Padding_x86::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int dims = bottom_blob.dims;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

#if __SSE2__
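    // int8 pack8 path: each packed element holds 8 int8 values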
    if (elempack == 8)
    {
        if (dims == 1)
        {
            int outw = w * elempack + left + right;

            int out_elempack = outw % 8 == 0 ? 8 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            top_blob.create(outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            if (left % 8 == 0 && out_elempack == 8)
            {
                // TODO
            }
        }

        if (dims == 2)
        {
            int outw = w + left + right;
            int outh = h * elempack + top + bottom;

            int out_elempack = outh % 8 == 0 ? 8 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            if (top % 8 == 0 && out_elempack == 8)
            {
                // TODO
            }
        }

        if (dims == 3)
        {
            int outw = w + left + right;
            int outh = h + top + bottom;
            int outc = channels * elempack + front + behind;

            int out_elempack = outc % 8 == 0 ? 8 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            if (front % 8 == 0 && out_elempack == 8 && !(outc != channels * elempack && type != 0))
            {
                int front_ = front / elempack;
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < outc / out_elempack; q++)
                {
                    Mat borderm = top_blob.channel(q);

                    // TODO perchannel
                    //                     int64_t pad_value = per_channel_pad_data_size ? vld1_s8(per_channel_pad_data + q * 8) : vdup_n_s8((signed char)value);
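                    // replicate the scalar pad value across all eight int8 lanes of a 64-bit word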
                    int64_t v8 = (int64_t)value;
                    int64_t pad_value = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24) | (v8 << 32) | (v8 << 40) | (v8 << 48) | (v8 << 56);

                    //Channel padding
                    if ((q - front_) < 0 || (q - front_) >= channels)
                    {
                        borderm.fill<int64_t>(pad_value);
                    }
                    else
                    {
                        const Mat m = bottom_blob.channel(q - front_);
                        if (type == 0)
                            padding_constant_pack8_int8_sse(m, borderm, top, bottom, left, right, pad_value);
                        if (type == 1)
                            padding_replicate_pack8_int8_sse(m, borderm, top, bottom, left, right);
                        if (type == 2)
                            padding_reflect_pack8_int8_sse(m, borderm, top, bottom, left, right);
                    }
                }

                return 0;
            }
        }
    }
#endif // __SSE2__

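    // fallback: unpack to elempack=1 and defer to the generic Padding implementation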
    Mat bottom_blob_unpacked = bottom_blob;
    if (elempack != 1)
    {
        Option opt_pack1 = opt;
        opt_pack1.blob_allocator = opt.workspace_allocator;

        convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1);
    }

    return Padding::forward(bottom_blob_unpacked, top_blob, opt);
}

} // namespace ncnn