1 // BUG1989 is pleased to support the open source community by supporting ncnn available.
2 //
3 // Copyright (C) 2019 BUG1989. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14
conv3x3s1_int8_sse(const Mat & bottom_blob,Mat & top_blob,const Mat & _kernel,const Option & opt)15 static void conv3x3s1_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt)
16 {
17 int w = bottom_blob.w;
18 int inch = bottom_blob.c;
19
20 int outw = top_blob.w;
21 int outh = top_blob.h;
22 int outch = top_blob.c;
23
24 const signed char* kernel = _kernel;
25
26 #pragma omp parallel for num_threads(opt.num_threads)
27 for (int p = 0; p < outch; p++)
28 {
29 Mat out0 = top_blob.channel(p);
30
31 out0.fill(0);
32
33 const signed char* kernel0 = (const signed char*)kernel + p * inch * 9;
34
35 for (int q = 0; q < inch; q++)
36 {
37 int* outptr0 = out0;
38
39 const signed char* img0 = bottom_blob.channel(q);
40
41 const signed char* r0 = img0;
42 const signed char* r1 = img0 + w;
43 const signed char* r2 = img0 + w * 2;
44
45 for (int i = 0; i < outh; i++)
46 {
47 int remain = outw;
48
49 for (; remain > 0; remain--)
50 {
51 int sum0 = 0;
52
53 sum0 += (int)r0[0] * kernel0[0];
54 sum0 += (int)r0[1] * kernel0[1];
55 sum0 += (int)r0[2] * kernel0[2];
56 sum0 += (int)r1[0] * kernel0[3];
57 sum0 += (int)r1[1] * kernel0[4];
58 sum0 += (int)r1[2] * kernel0[5];
59 sum0 += (int)r2[0] * kernel0[6];
60 sum0 += (int)r2[1] * kernel0[7];
61 sum0 += (int)r2[2] * kernel0[8];
62
63 *outptr0 += sum0;
64
65 r0++;
66 r1++;
67 r2++;
68 outptr0++;
69 }
70
71 r0 += 2;
72 r1 += 2;
73 r2 += 2;
74 }
75
76 kernel0 += 9;
77 }
78 }
79 }
80
conv3x3s1_winograd23_transform_kernel_int8_sse(const Mat & kernel,Mat & kernel_tm,int inch,int outch,const Option & opt)81 static void conv3x3s1_winograd23_transform_kernel_int8_sse(const Mat& kernel, Mat& kernel_tm, int inch, int outch, const Option& opt)
82 {
83 kernel_tm.create(4 * 4, inch, outch, (size_t)2u);
84
85 // G
86 const short ktm[4][3] = {
87 {2, 0, 0},
88 {1, 1, 1},
89 {1, -1, 1},
90 {0, 0, 2}
91 };
92
93 #pragma omp parallel for num_threads(opt.num_threads)
94 for (int p = 0; p < outch; p++)
95 {
96 for (int q = 0; q < inch; q++)
97 {
98 const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9;
99 short* kernel_tm0 = kernel_tm.channel(p).row<short>(q);
100
101 // transform kernel
102 const signed char* k0 = kernel0;
103 const signed char* k1 = kernel0 + 3;
104 const signed char* k2 = kernel0 + 6;
105
106 // h
107 short tmp[4][3];
108 for (int i = 0; i < 4; i++)
109 {
110 tmp[i][0] = (short)k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
111 tmp[i][1] = (short)k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
112 tmp[i][2] = (short)k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
113 }
114
115 // U
116 for (int j = 0; j < 4; j++)
117 {
118 short* tmpp = &tmp[j][0];
119
120 for (int i = 0; i < 4; i++)
121 {
122 kernel_tm0[j * 4 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
123 }
124 }
125 }
126 }
127 }
128
conv3x3s1_winograd23_int8_sse(const Mat & bottom_blob,Mat & top_blob,const Mat & kernel_tm,const Option & opt)129 static void conv3x3s1_winograd23_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt)
130 {
131 int w = bottom_blob.w;
132 int h = bottom_blob.h;
133 int inch = bottom_blob.c;
134
135 int outw = top_blob.w;
136 int outh = top_blob.h;
137 int outch = top_blob.c;
138
139 // pad to 2n+2, winograd F(2,3)
140 Mat bottom_blob_bordered = bottom_blob;
141
142 outw = (outw + 1) / 2 * 2;
143 outh = (outh + 1) / 2 * 2;
144
145 w = outw + 2;
146 h = outh + 2;
147 Option opt_b = opt;
148 opt_b.blob_allocator = opt.workspace_allocator;
149 copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt_b);
150
151 // BEGIN transform input
152 Mat bottom_blob_tm;
153 {
154 int w_tm = outw / 2 * 4;
155 int h_tm = outh / 2 * 4;
156
157 int nColBlocks = h_tm / 4; // may be the block num in Feathercnn
158 int nRowBlocks = w_tm / 4;
159
160 const int tiles = nColBlocks * nRowBlocks;
161
162 bottom_blob_tm.create(4 * 4, tiles, inch, 2u, opt.workspace_allocator);
163
164 // BT
165 // const float itm[4][4] = {
166 // {1.0f, 0.0f, -1.0f, 0.0f},
167 // {0.0f, 1.0f, 1.00f, 0.0f},
168 // {0.0f, -1.0f, 1.00f, 0.0f},
169 // {0.0f, -1.0f, 0.00f, 1.0f}
170 // };
171
172 #pragma omp parallel for num_threads(opt.num_threads)
173 for (int q = 0; q < inch; q++)
174 {
175 const signed char* img = bottom_blob_bordered.channel(q);
176 short* out_tm0 = bottom_blob_tm.channel(q);
177
178 for (int j = 0; j < nColBlocks; j++)
179 {
180 const signed char* r0 = img + w * j * 2;
181 const signed char* r1 = r0 + w;
182 const signed char* r2 = r1 + w;
183 const signed char* r3 = r2 + w;
184
185 for (int i = 0; i < nRowBlocks; i++)
186 {
187 short d0[4], d1[4], d2[4], d3[4];
188 short w0[4], w1[4], w2[4], w3[4];
189 short t0[4], t1[4], t2[4], t3[4];
190 // load
191 for (int n = 0; n < 4; n++)
192 {
193 d0[n] = r0[n];
194 d1[n] = r1[n];
195 d2[n] = r2[n];
196 d3[n] = r3[n];
197 }
198 // w = B_t * d
199 for (int n = 0; n < 4; n++)
200 {
201 w0[n] = d0[n] - d2[n];
202 w1[n] = d1[n] + d2[n];
203 w2[n] = d2[n] - d1[n];
204 w3[n] = d3[n] - d1[n];
205 }
206 // transpose d to d_t
207 {
208 t0[0] = w0[0];
209 t1[0] = w0[1];
210 t2[0] = w0[2];
211 t3[0] = w0[3];
212 t0[1] = w1[0];
213 t1[1] = w1[1];
214 t2[1] = w1[2];
215 t3[1] = w1[3];
216 t0[2] = w2[0];
217 t1[2] = w2[1];
218 t2[2] = w2[2];
219 t3[2] = w2[3];
220 t0[3] = w3[0];
221 t1[3] = w3[1];
222 t2[3] = w3[2];
223 t3[3] = w3[3];
224 }
225 // U = B_t * d_t
226 for (int n = 0; n < 4; n++)
227 {
228 d0[n] = t0[n] - t2[n];
229 d1[n] = t1[n] + t2[n];
230 d2[n] = t2[n] - t1[n];
231 d3[n] = t3[n] - t1[n];
232 }
233 // save to out_tm
234 for (int n = 0; n < 4; n++)
235 {
236 out_tm0[n] = d0[n];
237 out_tm0[n + 4] = d1[n];
238 out_tm0[n + 8] = d2[n];
239 out_tm0[n + 12] = d3[n];
240 }
241
242 r0 += 2;
243 r1 += 2;
244 r2 += 2;
245 r3 += 2;
246
247 out_tm0 += 16;
248 }
249 }
250 }
251 }
252 bottom_blob_bordered = Mat();
253
254 // BEGIN dot
255 Mat top_blob_tm;
256 {
257 int w_tm = outw / 2 * 4;
258 int h_tm = outh / 2 * 4;
259
260 int nColBlocks = h_tm / 4; // may be the block num in Feathercnn
261 int nRowBlocks = w_tm / 4;
262
263 const int tiles = nColBlocks * nRowBlocks;
264
265 top_blob_tm.create(16, tiles, outch, 4u, opt.workspace_allocator);
266
267 int nn_outch = outch >> 2;
268 int remain_outch_start = nn_outch << 2;
269
270 #pragma omp parallel for num_threads(opt.num_threads)
271 for (int pp = 0; pp < nn_outch; pp++)
272 {
273 int p = pp * 4;
274
275 Mat out0_tm = top_blob_tm.channel(p);
276 Mat out1_tm = top_blob_tm.channel(p + 1);
277 Mat out2_tm = top_blob_tm.channel(p + 2);
278 Mat out3_tm = top_blob_tm.channel(p + 3);
279
280 const Mat kernel0_tm = kernel_tm.channel(p);
281 const Mat kernel1_tm = kernel_tm.channel(p + 1);
282 const Mat kernel2_tm = kernel_tm.channel(p + 2);
283 const Mat kernel3_tm = kernel_tm.channel(p + 3);
284
285 for (int i = 0; i < tiles; i++)
286 {
287 int* output0_tm = out0_tm.row<int>(i);
288 int* output1_tm = out1_tm.row<int>(i);
289 int* output2_tm = out2_tm.row<int>(i);
290 int* output3_tm = out3_tm.row<int>(i);
291
292 int sum0[16] = {0};
293 int sum1[16] = {0};
294 int sum2[16] = {0};
295 int sum3[16] = {0};
296
297 int q = 0;
298 for (; q + 3 < inch; q += 4)
299 {
300 const short* r0 = bottom_blob_tm.channel(q).row<short>(i);
301 const short* r1 = bottom_blob_tm.channel(q + 1).row<short>(i);
302 const short* r2 = bottom_blob_tm.channel(q + 2).row<short>(i);
303 const short* r3 = bottom_blob_tm.channel(q + 3).row<short>(i);
304
305 const short* k0 = kernel0_tm.row<short>(q);
306 const short* k1 = kernel1_tm.row<short>(q);
307 const short* k2 = kernel2_tm.row<short>(q);
308 const short* k3 = kernel3_tm.row<short>(q);
309
310 for (int n = 0; n < 16; n++)
311 {
312 sum0[n] += (int)r0[n] * k0[n];
313 k0 += 16;
314 sum0[n] += (int)r1[n] * k0[n];
315 k0 += 16;
316 sum0[n] += (int)r2[n] * k0[n];
317 k0 += 16;
318 sum0[n] += (int)r3[n] * k0[n];
319 k0 -= 16 * 3;
320
321 sum1[n] += (int)r0[n] * k1[n];
322 k1 += 16;
323 sum1[n] += (int)r1[n] * k1[n];
324 k1 += 16;
325 sum1[n] += (int)r2[n] * k1[n];
326 k1 += 16;
327 sum1[n] += (int)r3[n] * k1[n];
328 k1 -= 16 * 3;
329
330 sum2[n] += (int)r0[n] * k2[n];
331 k2 += 16;
332 sum2[n] += (int)r1[n] * k2[n];
333 k2 += 16;
334 sum2[n] += (int)r2[n] * k2[n];
335 k2 += 16;
336 sum2[n] += (int)r3[n] * k2[n];
337 k2 -= 16 * 3;
338
339 sum3[n] += (int)r0[n] * k3[n];
340 k3 += 16;
341 sum3[n] += (int)r1[n] * k3[n];
342 k3 += 16;
343 sum3[n] += (int)r2[n] * k3[n];
344 k3 += 16;
345 sum3[n] += (int)r3[n] * k3[n];
346 k3 -= 16 * 3;
347 }
348 }
349
350 for (; q < inch; q++)
351 {
352 const short* r0 = bottom_blob_tm.channel(q).row<short>(i);
353
354 const short* k0 = kernel0_tm.row<short>(q);
355 const short* k1 = kernel1_tm.row<short>(q);
356 const short* k2 = kernel2_tm.row<short>(q);
357 const short* k3 = kernel3_tm.row<short>(q);
358
359 for (int n = 0; n < 16; n++)
360 {
361 sum0[n] += (int)r0[n] * k0[n];
362 sum1[n] += (int)r0[n] * k1[n];
363 sum2[n] += (int)r0[n] * k2[n];
364 sum3[n] += (int)r0[n] * k3[n];
365 }
366 }
367
368 for (int n = 0; n < 16; n++)
369 {
370 output0_tm[n] = sum0[n];
371 output1_tm[n] = sum1[n];
372 output2_tm[n] = sum2[n];
373 output3_tm[n] = sum3[n];
374 }
375 }
376 }
377
378 #pragma omp parallel for num_threads(opt.num_threads)
379 for (int p = remain_outch_start; p < outch; p++)
380 {
381 Mat out0_tm = top_blob_tm.channel(p);
382 const Mat kernel0_tm = kernel_tm.channel(p);
383
384 for (int i = 0; i < tiles; i++)
385 {
386 int* output0_tm = out0_tm.row<int>(i);
387
388 int sum0[16] = {0};
389
390 int q = 0;
391 for (; q + 3 < inch; q += 4)
392 {
393 const short* r0 = bottom_blob_tm.channel(q).row<short>(i);
394 const short* r1 = bottom_blob_tm.channel(q + 1).row<short>(i);
395 const short* r2 = bottom_blob_tm.channel(q + 2).row<short>(i);
396 const short* r3 = bottom_blob_tm.channel(q + 3).row<short>(i);
397
398 const short* k0 = kernel0_tm.row<short>(q);
399 const short* k1 = kernel0_tm.row<short>(q + 1);
400 const short* k2 = kernel0_tm.row<short>(q + 2);
401 const short* k3 = kernel0_tm.row<short>(q + 3);
402
403 for (int n = 0; n < 16; n++)
404 {
405 sum0[n] += (int)r0[n] * k0[n];
406 sum0[n] += (int)r1[n] * k1[n];
407 sum0[n] += (int)r2[n] * k2[n];
408 sum0[n] += (int)r3[n] * k3[n];
409 }
410 }
411
412 for (; q < inch; q++)
413 {
414 const short* r0 = bottom_blob_tm.channel(q).row<short>(i);
415 const short* k0 = kernel0_tm.row<short>(q);
416
417 for (int n = 0; n < 16; n++)
418 {
419 sum0[n] += (int)r0[n] * k0[n];
420 }
421 }
422
423 for (int n = 0; n < 16; n++)
424 {
425 output0_tm[n] = sum0[n];
426 }
427 }
428 }
429 }
430 bottom_blob_tm = Mat();
431 // END dot
432
433 // BEGIN transform output
434 Mat top_blob_bordered;
435 top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator);
436 {
437 // AT
438 // const float itm[2][4] = {
439 // {1.0f, 1.0f, 1.0f, 0.0f},
440 // {0.0f, 1.0f, -1.0f, 1.0f}
441 // };
442
443 int w_tm = outw / 2 * 4;
444 int h_tm = outh / 2 * 4;
445
446 int nColBlocks = h_tm / 4; // may be the block num in Feathercnn
447 int nRowBlocks = w_tm / 4;
448
449 #pragma omp parallel for num_threads(opt.num_threads)
450 for (int p = 0; p < outch; p++)
451 {
452 Mat out_tm = top_blob_tm.channel(p);
453 Mat out = top_blob_bordered.channel(p);
454
455 for (int j = 0; j < nColBlocks; j++)
456 {
457 int* outRow0 = out.row<int>(j * 2);
458 int* outRow1 = out.row<int>(j * 2 + 1);
459
460 for (int i = 0; i < nRowBlocks; i++)
461 {
462 int* out_tile = out_tm.row<int>(j * nRowBlocks + i);
463
464 int s0[4], s1[4], s2[4], s3[4];
465 int w0[4], w1[4];
466 int d0[2], d1[2], d2[2], d3[2];
467 int o0[2], o1[2];
468 // load
469 for (int n = 0; n < 4; n++)
470 {
471 s0[n] = out_tile[n];
472 s1[n] = out_tile[n + 4];
473 s2[n] = out_tile[n + 8];
474 s3[n] = out_tile[n + 12];
475 }
476 // w = A_T * W
477 for (int n = 0; n < 4; n++)
478 {
479 w0[n] = s0[n] + s1[n] + s2[n];
480 w1[n] = s1[n] - s2[n] + s3[n];
481 }
482 // transpose w to w_t
483 {
484 d0[0] = w0[0];
485 d0[1] = w1[0];
486 d1[0] = w0[1];
487 d1[1] = w1[1];
488 d2[0] = w0[2];
489 d2[1] = w1[2];
490 d3[0] = w0[3];
491 d3[1] = w1[3];
492 }
493 // Y = A_T * w_t
494 for (int n = 0; n < 2; n++)
495 {
496 o0[n] = d0[n] + d1[n] + d2[n];
497 o1[n] = d1[n] - d2[n] + d3[n];
498 }
499 // save to top blob tm,why right 2,because the G' = G*2
500 outRow0[0] = o0[0] >> 2;
501 outRow0[1] = o0[1] >> 2;
502 outRow1[0] = o1[0] >> 2;
503 outRow1[1] = o1[1] >> 2;
504
505 outRow0 += 2;
506 outRow1 += 2;
507 }
508 }
509 }
510 }
511 // END transform output
512
513 // cut result pad
514 copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
515 }
516
conv3x3s1_winograd43_transform_kernel_int8_sse(const Mat & kernel,Mat & kernel_tm,int inch,int outch,const Option & opt)517 static void conv3x3s1_winograd43_transform_kernel_int8_sse(const Mat& kernel, Mat& kernel_tm, int inch, int outch, const Option& opt)
518 {
519 kernel_tm.create(6 * 6, inch, outch, (size_t)2u);
520
521 // G
522 // const float ktm[6][3] = {
523 // { 1.0f/4, 0.0f, 0.0f},
524 // { -1.0f/6, -1.0f/6, -1.0f/6},
525 // { -1.0f/6, 1.0f/6, -1.0f/6},
526 // { 1.0f/24, 1.0f/12, 1.0f/6},
527 // { 1.0f/24, -1.0f/12, 1.0f/6},
528 // { 0.0f, 0.0f, 1.0f}
529 // };
530 const short ktm[6][3] = {
531 {6, 0, 0},
532 {-4, -4, -4},
533 {-4, 4, -4},
534 {1, 2, 4},
535 {1, -2, 4},
536 {0, 0, 24}
537 };
538
539 #pragma omp parallel for num_threads(opt.num_threads)
540 for (int p = 0; p < outch; p++)
541 {
542 for (int q = 0; q < inch; q++)
543 {
544 const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9;
545 short* kernel_tm0 = kernel_tm.channel(p).row<short>(q);
546
547 // transform kernel
548 const signed char* k0 = kernel0;
549 const signed char* k1 = kernel0 + 3;
550 const signed char* k2 = kernel0 + 6;
551
552 // h
553 short tmp[6][3];
554 for (int i = 0; i < 6; i++)
555 {
556 tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
557 tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
558 tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
559 }
560
561 // U
562 for (int j = 0; j < 6; j++)
563 {
564 short* tmpp = &tmp[j][0];
565
566 for (int i = 0; i < 6; i++)
567 {
568 kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
569 }
570 }
571 }
572 }
573 }
574
conv3x3s1_winograd43_int8_sse(const Mat & bottom_blob,Mat & top_blob,const Mat & kernel_tm,const Option & opt)575 static void conv3x3s1_winograd43_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt)
576 {
577 int w = bottom_blob.w;
578 int h = bottom_blob.h;
579 int inch = bottom_blob.c;
580
581 int outw = top_blob.w;
582 int outh = top_blob.h;
583 int outch = top_blob.c;
584
585 // pad to 4n+2, winograd F(4,3)
586 Mat bottom_blob_bordered = bottom_blob;
587
588 outw = (outw + 3) / 4 * 4;
589 outh = (outh + 3) / 4 * 4;
590
591 w = outw + 2;
592 h = outh + 2;
593 Option opt_b = opt;
594 opt_b.blob_allocator = opt.workspace_allocator;
595 copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt_b);
596
597 // BEGIN transform input
598 Mat bottom_blob_tm;
599 {
600 int w_tm = outw / 4 * 6;
601 int h_tm = outh / 4 * 6;
602
603 int nColBlocks = h_tm / 6; // may be the block num in Feathercnn
604 int nRowBlocks = w_tm / 6;
605
606 const int tiles = nColBlocks * nRowBlocks;
607
608 bottom_blob_tm.create(6 * 6, tiles, inch, 2u, opt.workspace_allocator);
609
610 // BT
611 // const float itm[4][4] = {
612 // {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f},
613 // {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f},
614 // {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f},
615 // {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f},
616 // {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f},
617 // {0.0f, 4.0f, 0.0f,-5.0f, 0.0f, 1.0f}
618 // };
619
620 // 0 = 4 * r00 - 5 * r02 + r04
621 // 1 = -4 * (r01 + r02) + r03 + r04
622 // 2 = 4 * (r01 - r02) - r03 + r04
623 // 3 = -2 * r01 - r02 + 2 * r03 + r04
624 // 4 = 2 * r01 - r02 - 2 * r03 + r04
625 // 5 = 4 * r01 - 5 * r03 + r05
626
627 #pragma omp parallel for num_threads(opt.num_threads)
628 for (int q = 0; q < inch; q++)
629 {
630 const signed char* img = bottom_blob_bordered.channel(q);
631 short* out_tm0 = bottom_blob_tm.channel(q);
632
633 for (int j = 0; j < nColBlocks; j++)
634 {
635 const signed char* r0 = img + w * j * 4;
636 const signed char* r1 = r0 + w;
637 const signed char* r2 = r1 + w;
638 const signed char* r3 = r2 + w;
639 const signed char* r4 = r3 + w;
640 const signed char* r5 = r4 + w;
641
642 for (int i = 0; i < nRowBlocks; i++)
643 {
644 short d0[6], d1[6], d2[6], d3[6], d4[6], d5[6];
645 short w0[6], w1[6], w2[6], w3[6], w4[6], w5[6];
646 short t0[6], t1[6], t2[6], t3[6], t4[6], t5[6];
647
648 // load
649 for (int n = 0; n < 6; n++)
650 {
651 d0[n] = r0[n];
652 d1[n] = r1[n];
653 d2[n] = r2[n];
654 d3[n] = r3[n];
655 d4[n] = r4[n];
656 d5[n] = r5[n];
657 }
658 // w = B_t * d
659 for (int n = 0; n < 6; n++)
660 {
661 w0[n] = 4 * d0[n] - 5 * d2[n] + d4[n];
662 w1[n] = -4 * d1[n] - 4 * d2[n] + d3[n] + d4[n];
663 w2[n] = 4 * d1[n] - 4 * d2[n] - d3[n] + d4[n];
664 w3[n] = -2 * d1[n] - d2[n] + 2 * d3[n] + d4[n];
665 w4[n] = 2 * d1[n] - d2[n] - 2 * d3[n] + d4[n];
666 w5[n] = 4 * d1[n] - 5 * d3[n] + d5[n];
667 }
668 // transpose d to d_t
669 {
670 t0[0] = w0[0];
671 t1[0] = w0[1];
672 t2[0] = w0[2];
673 t3[0] = w0[3];
674 t4[0] = w0[4];
675 t5[0] = w0[5];
676 t0[1] = w1[0];
677 t1[1] = w1[1];
678 t2[1] = w1[2];
679 t3[1] = w1[3];
680 t4[1] = w1[4];
681 t5[1] = w1[5];
682 t0[2] = w2[0];
683 t1[2] = w2[1];
684 t2[2] = w2[2];
685 t3[2] = w2[3];
686 t4[2] = w2[4];
687 t5[2] = w2[5];
688 t0[3] = w3[0];
689 t1[3] = w3[1];
690 t2[3] = w3[2];
691 t3[3] = w3[3];
692 t4[3] = w3[4];
693 t5[3] = w3[5];
694 t0[4] = w4[0];
695 t1[4] = w4[1];
696 t2[4] = w4[2];
697 t3[4] = w4[3];
698 t4[4] = w4[4];
699 t5[4] = w4[5];
700 t0[5] = w5[0];
701 t1[5] = w5[1];
702 t2[5] = w5[2];
703 t3[5] = w5[3];
704 t4[5] = w5[4];
705 t5[5] = w5[5];
706 }
707 // d = B_t * d_t
708 for (int n = 0; n < 6; n++)
709 {
710 d0[n] = 4 * t0[n] - 5 * t2[n] + t4[n];
711 d1[n] = -4 * t1[n] - 4 * t2[n] + t3[n] + t4[n];
712 d2[n] = 4 * t1[n] - 4 * t2[n] - t3[n] + t4[n];
713 d3[n] = -2 * t1[n] - t2[n] + 2 * t3[n] + t4[n];
714 d4[n] = 2 * t1[n] - t2[n] - 2 * t3[n] + t4[n];
715 d5[n] = 4 * t1[n] - 5 * t3[n] + t5[n];
716 }
717 // save to out_tm
718 for (int n = 0; n < 6; n++)
719 {
720 out_tm0[n] = d0[n];
721 out_tm0[n + 6] = d1[n];
722 out_tm0[n + 12] = d2[n];
723 out_tm0[n + 18] = d3[n];
724 out_tm0[n + 24] = d4[n];
725 out_tm0[n + 30] = d5[n];
726 }
727
728 r0 += 4;
729 r1 += 4;
730 r2 += 4;
731 r3 += 4;
732 r4 += 4;
733 r5 += 4;
734
735 out_tm0 += 36;
736 }
737 }
738 }
739 }
740 bottom_blob_bordered = Mat();
741
742 // BEGIN dot
743 Mat top_blob_tm;
744 {
745 int w_tm = outw / 4 * 6;
746 int h_tm = outh / 4 * 6;
747
748 int nColBlocks = h_tm / 6; // may be the block num in Feathercnn
749 int nRowBlocks = w_tm / 6;
750
751 const int tiles = nColBlocks * nRowBlocks;
752
753 top_blob_tm.create(36, tiles, outch, 4u, opt.workspace_allocator);
754
755 #pragma omp parallel for num_threads(opt.num_threads)
756 for (int p = 0; p < outch; p++)
757 {
758 Mat out0_tm = top_blob_tm.channel(p);
759 const Mat kernel0_tm = kernel_tm.channel(p);
760
761 for (int i = 0; i < tiles; i++)
762 {
763 int* output0_tm = out0_tm.row<int>(i);
764
765 int sum0[36] = {0};
766
767 for (int q = 0; q < inch; q++)
768 {
769 const short* r0 = bottom_blob_tm.channel(q).row<short>(i);
770 const short* k0 = kernel0_tm.row<short>(q);
771
772 for (int n = 0; n < 36; n++)
773 {
774 sum0[n] += (int)r0[n] * k0[n];
775 }
776 }
777
778 for (int n = 0; n < 36; n++)
779 {
780 output0_tm[n] = sum0[n];
781 }
782 }
783 }
784 }
785 bottom_blob_tm = Mat();
786 // END dot
787
788 // BEGIN transform output
789 Mat top_blob_bordered;
790 top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator);
791 {
792 // AT
793 // const float itm[4][6] = {
794 // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f},
795 // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f},
796 // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 0.0f},
797 // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f}
798 // };
799
800 // 0 = r00 + r01 + r02 + r03 + r04
801 // 1 = r01 - r02 + 2 * (r03 - r04)
802 // 2 = r01 + r02 + 4 * (r03 + r04)
803 // 3 = r01 - r02 + 8 * (r03 - r04) + r05
804
805 int w_tm = outw / 4 * 6;
806 int h_tm = outh / 4 * 6;
807
808 int nColBlocks = h_tm / 6; // may be the block num in Feathercnn
809 int nRowBlocks = w_tm / 6;
810
811 #pragma omp parallel for num_threads(opt.num_threads)
812 for (int p = 0; p < outch; p++)
813 {
814 Mat out_tm = top_blob_tm.channel(p);
815 Mat out = top_blob_bordered.channel(p);
816
817 for (int j = 0; j < nColBlocks; j++)
818 {
819 int* outRow0 = out.row<int>(j * 4);
820 int* outRow1 = out.row<int>(j * 4 + 1);
821 int* outRow2 = out.row<int>(j * 4 + 2);
822 int* outRow3 = out.row<int>(j * 4 + 3);
823
824 for (int i = 0; i < nRowBlocks; i++)
825 {
826 int* out_tile = out_tm.row<int>(j * nRowBlocks + i);
827
828 int s0[6], s1[6], s2[6], s3[6], s4[6], s5[6];
829 int w0[6], w1[6], w2[6], w3[6];
830 int d0[4], d1[4], d2[4], d3[4], d4[4], d5[4];
831 int o0[4], o1[4], o2[4], o3[4];
832 // load
833 for (int n = 0; n < 6; n++)
834 {
835 s0[n] = out_tile[n];
836 s1[n] = out_tile[n + 6];
837 s2[n] = out_tile[n + 12];
838 s3[n] = out_tile[n + 18];
839 s4[n] = out_tile[n + 24];
840 s5[n] = out_tile[n + 30];
841 }
842 // w = A_T * W
843 for (int n = 0; n < 6; n++)
844 {
845 w0[n] = s0[n] + s1[n] + s2[n] + s3[n] + s4[n];
846 w1[n] = s1[n] - s2[n] + 2 * s3[n] - 2 * s4[n];
847 w2[n] = s1[n] + s2[n] + 4 * s3[n] + 4 * s4[n];
848 w3[n] = s1[n] - s2[n] + 8 * s3[n] - 8 * s4[n] + s5[n];
849 }
850 // transpose w to w_t
851 {
852 d0[0] = w0[0];
853 d0[1] = w1[0];
854 d0[2] = w2[0];
855 d0[3] = w3[0];
856 d1[0] = w0[1];
857 d1[1] = w1[1];
858 d1[2] = w2[1];
859 d1[3] = w3[1];
860 d2[0] = w0[2];
861 d2[1] = w1[2];
862 d2[2] = w2[2];
863 d2[3] = w3[2];
864 d3[0] = w0[3];
865 d3[1] = w1[3];
866 d3[2] = w2[3];
867 d3[3] = w3[3];
868 d4[0] = w0[4];
869 d4[1] = w1[4];
870 d4[2] = w2[4];
871 d4[3] = w3[4];
872 d5[0] = w0[5];
873 d5[1] = w1[5];
874 d5[2] = w2[5];
875 d5[3] = w3[5];
876 }
877 // Y = A_T * w_t
878 for (int n = 0; n < 4; n++)
879 {
880 o0[n] = d0[n] + d1[n] + d2[n] + d3[n] + d4[n];
881 o1[n] = d1[n] - d2[n] + 2 * d3[n] - 2 * d4[n];
882 o2[n] = d1[n] + d2[n] + 4 * d3[n] + 4 * d4[n];
883 o3[n] = d1[n] - d2[n] + 8 * d3[n] - 8 * d4[n] + d5[n];
884 }
885 // save to top blob tm
886 for (int n = 0; n < 4; n++)
887 {
888 outRow0[n] = o0[n] / 576;
889 outRow1[n] = o1[n] / 576;
890 outRow2[n] = o2[n] / 576;
891 outRow3[n] = o3[n] / 576;
892 }
893
894 outRow0 += 4;
895 outRow1 += 4;
896 outRow2 += 4;
897 outRow3 += 4;
898 }
899 }
900 }
901 }
902 // END transform output
903
904 // cut result pad
905 copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
906 }
907
conv3x3s2_int8_sse(const Mat & bottom_blob,Mat & top_blob,const Mat & _kernel,const Option & opt)908 static void conv3x3s2_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt)
909 {
910 int w = bottom_blob.w;
911 int inch = bottom_blob.c;
912
913 int outw = top_blob.w;
914 int outh = top_blob.h;
915 int outch = top_blob.c;
916
917 const int tailstep = w - 2 * outw + w;
918
919 const signed char* kernel = _kernel;
920
921 #pragma omp parallel for num_threads(opt.num_threads)
922 for (int p = 0; p < outch; p++)
923 {
924 Mat out0 = top_blob.channel(p);
925
926 out0.fill(0);
927
928 const signed char* kernel0 = (const signed char*)kernel + p * inch * 9;
929
930 for (int q = 0; q < inch; q++)
931 {
932 int* outptr0 = out0;
933
934 const signed char* img0 = bottom_blob.channel(q);
935
936 const signed char* r0 = img0;
937 const signed char* r1 = img0 + w;
938 const signed char* r2 = img0 + w * 2;
939
940 for (int i = 0; i < outh; i++)
941 {
942 int remain = outw;
943
944 for (; remain > 0; remain--)
945 {
946 int sum0 = 0;
947
948 sum0 += (int)r0[0] * kernel0[0];
949 sum0 += (int)r0[1] * kernel0[1];
950 sum0 += (int)r0[2] * kernel0[2];
951 sum0 += (int)r1[0] * kernel0[3];
952 sum0 += (int)r1[1] * kernel0[4];
953 sum0 += (int)r1[2] * kernel0[5];
954 sum0 += (int)r2[0] * kernel0[6];
955 sum0 += (int)r2[1] * kernel0[7];
956 sum0 += (int)r2[2] * kernel0[8];
957
958 *outptr0 += sum0;
959
960 r0 += 2;
961 r1 += 2;
962 r2 += 2;
963 outptr0++;
964 }
965
966 r0 += tailstep;
967 r1 += tailstep;
968 r2 += tailstep;
969 }
970
971 kernel0 += 9;
972 }
973 }
974 }
975