1 // BUG1989 is pleased to support the open source community by supporting ncnn available.
2 //
3 // Copyright (C) 2019 BUG1989. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
conv3x3s1_int8_sse(const Mat & bottom_blob,Mat & top_blob,const Mat & _kernel,const Option & opt)15 static void conv3x3s1_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt)
16 {
17     int w = bottom_blob.w;
18     int inch = bottom_blob.c;
19 
20     int outw = top_blob.w;
21     int outh = top_blob.h;
22     int outch = top_blob.c;
23 
24     const signed char* kernel = _kernel;
25 
26     #pragma omp parallel for num_threads(opt.num_threads)
27     for (int p = 0; p < outch; p++)
28     {
29         Mat out0 = top_blob.channel(p);
30 
31         out0.fill(0);
32 
33         const signed char* kernel0 = (const signed char*)kernel + p * inch * 9;
34 
35         for (int q = 0; q < inch; q++)
36         {
37             int* outptr0 = out0;
38 
39             const signed char* img0 = bottom_blob.channel(q);
40 
41             const signed char* r0 = img0;
42             const signed char* r1 = img0 + w;
43             const signed char* r2 = img0 + w * 2;
44 
45             for (int i = 0; i < outh; i++)
46             {
47                 int remain = outw;
48 
49                 for (; remain > 0; remain--)
50                 {
51                     int sum0 = 0;
52 
53                     sum0 += (int)r0[0] * kernel0[0];
54                     sum0 += (int)r0[1] * kernel0[1];
55                     sum0 += (int)r0[2] * kernel0[2];
56                     sum0 += (int)r1[0] * kernel0[3];
57                     sum0 += (int)r1[1] * kernel0[4];
58                     sum0 += (int)r1[2] * kernel0[5];
59                     sum0 += (int)r2[0] * kernel0[6];
60                     sum0 += (int)r2[1] * kernel0[7];
61                     sum0 += (int)r2[2] * kernel0[8];
62 
63                     *outptr0 += sum0;
64 
65                     r0++;
66                     r1++;
67                     r2++;
68                     outptr0++;
69                 }
70 
71                 r0 += 2;
72                 r1 += 2;
73                 r2 += 2;
74             }
75 
76             kernel0 += 9;
77         }
78     }
79 }
80 
conv3x3s1_winograd23_transform_kernel_int8_sse(const Mat & kernel,Mat & kernel_tm,int inch,int outch)81 static void conv3x3s1_winograd23_transform_kernel_int8_sse(const Mat& kernel, Mat& kernel_tm, int inch, int outch)
82 {
83     kernel_tm.create(4 * 4, inch, outch, 2ul);
84 
85     // G
86     const short ktm[4][3] = {
87         {2, 0, 0},
88         {1, 1, 1},
89         {1, -1, 1},
90         {0, 0, 2}
91     };
92 
93     #pragma omp parallel for
94     for (int p = 0; p < outch; p++)
95     {
96         for (int q = 0; q < inch; q++)
97         {
98             const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9;
99             short* kernel_tm0 = kernel_tm.channel(p).row<short>(q);
100 
101             // transform kernel
102             const signed char* k0 = kernel0;
103             const signed char* k1 = kernel0 + 3;
104             const signed char* k2 = kernel0 + 6;
105 
106             // h
107             short tmp[4][3];
108             for (int i = 0; i < 4; i++)
109             {
110                 tmp[i][0] = (short)k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
111                 tmp[i][1] = (short)k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
112                 tmp[i][2] = (short)k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
113             }
114 
115             // U
116             for (int j = 0; j < 4; j++)
117             {
118                 short* tmpp = &tmp[j][0];
119 
120                 for (int i = 0; i < 4; i++)
121                 {
122                     kernel_tm0[j * 4 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
123                 }
124             }
125         }
126     }
127 }
128 
conv3x3s1_winograd23_int8_sse(const Mat & bottom_blob,Mat & top_blob,const Mat & kernel_tm,const Option & opt)129 static void conv3x3s1_winograd23_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt)
130 {
131     int w = bottom_blob.w;
132     int h = bottom_blob.h;
133     int inch = bottom_blob.c;
134 
135     int outw = top_blob.w;
136     int outh = top_blob.h;
137     int outch = top_blob.c;
138 
139     // pad to 2n+2, winograd F(2,3)
140     Mat bottom_blob_bordered = bottom_blob;
141 
142     outw = (outw + 1) / 2 * 2;
143     outh = (outh + 1) / 2 * 2;
144 
145     w = outw + 2;
146     h = outh + 2;
147     Option opt_b = opt;
148     opt_b.blob_allocator = opt.workspace_allocator;
149     copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt_b);
150 
151     // BEGIN transform input
152     Mat bottom_blob_tm;
153     {
154         int w_tm = outw / 2 * 4;
155         int h_tm = outh / 2 * 4;
156 
157         int nColBlocks = h_tm / 4; // may be the block num in Feathercnn
158         int nRowBlocks = w_tm / 4;
159 
160         const int tiles = nColBlocks * nRowBlocks;
161 
162         bottom_blob_tm.create(4 * 4, tiles, inch, 2u, opt.workspace_allocator);
163 
164         // BT
165         // const float itm[4][4] = {
166         //     {1.0f,  0.0f, -1.0f,  0.0f},
167         //     {0.0f,  1.0f,  1.00f, 0.0f},
168         //     {0.0f, -1.0f,  1.00f, 0.0f},
169         //     {0.0f, -1.0f,  0.00f, 1.0f}
170         // };
171 
172         #pragma omp parallel for num_threads(opt.num_threads)
173         for (int q = 0; q < inch; q++)
174         {
175             const signed char* img = bottom_blob_bordered.channel(q);
176             short* out_tm0 = bottom_blob_tm.channel(q);
177 
178             for (int j = 0; j < nColBlocks; j++)
179             {
180                 const signed char* r0 = img + w * j * 2;
181                 const signed char* r1 = r0 + w;
182                 const signed char* r2 = r1 + w;
183                 const signed char* r3 = r2 + w;
184 
185                 for (int i = 0; i < nRowBlocks; i++)
186                 {
187                     short d0[4], d1[4], d2[4], d3[4];
188                     short w0[4], w1[4], w2[4], w3[4];
189                     short t0[4], t1[4], t2[4], t3[4];
190                     // load
191                     for (int n = 0; n < 4; n++)
192                     {
193                         d0[n] = r0[n];
194                         d1[n] = r1[n];
195                         d2[n] = r2[n];
196                         d3[n] = r3[n];
197                     }
198                     // w = B_t * d
199                     for (int n = 0; n < 4; n++)
200                     {
201                         w0[n] = d0[n] - d2[n];
202                         w1[n] = d1[n] + d2[n];
203                         w2[n] = d2[n] - d1[n];
204                         w3[n] = d3[n] - d1[n];
205                     }
206                     // transpose d to d_t
207                     {
208                         t0[0] = w0[0];
209                         t1[0] = w0[1];
210                         t2[0] = w0[2];
211                         t3[0] = w0[3];
212                         t0[1] = w1[0];
213                         t1[1] = w1[1];
214                         t2[1] = w1[2];
215                         t3[1] = w1[3];
216                         t0[2] = w2[0];
217                         t1[2] = w2[1];
218                         t2[2] = w2[2];
219                         t3[2] = w2[3];
220                         t0[3] = w3[0];
221                         t1[3] = w3[1];
222                         t2[3] = w3[2];
223                         t3[3] = w3[3];
224                     }
225                     // U = B_t * d_t
226                     for (int n = 0; n < 4; n++)
227                     {
228                         d0[n] = t0[n] - t2[n];
229                         d1[n] = t1[n] + t2[n];
230                         d2[n] = t2[n] - t1[n];
231                         d3[n] = t3[n] - t1[n];
232                     }
233                     // save to out_tm
234                     for (int n = 0; n < 4; n++)
235                     {
236                         out_tm0[n] = d0[n];
237                         out_tm0[n + 4] = d1[n];
238                         out_tm0[n + 8] = d2[n];
239                         out_tm0[n + 12] = d3[n];
240                     }
241 
242                     r0 += 2;
243                     r1 += 2;
244                     r2 += 2;
245                     r3 += 2;
246 
247                     out_tm0 += 16;
248                 }
249             }
250         }
251     }
252     bottom_blob_bordered = Mat();
253 
254     // BEGIN dot
255     Mat top_blob_tm;
256     {
257         int w_tm = outw / 2 * 4;
258         int h_tm = outh / 2 * 4;
259 
260         int nColBlocks = h_tm / 4; // may be the block num in Feathercnn
261         int nRowBlocks = w_tm / 4;
262 
263         const int tiles = nColBlocks * nRowBlocks;
264 
265         top_blob_tm.create(16, tiles, outch, 4u, opt.workspace_allocator);
266 
267         int nn_outch = outch >> 2;
268         int remain_outch_start = nn_outch << 2;
269 
270         #pragma omp parallel for num_threads(opt.num_threads)
271         for (int pp = 0; pp < nn_outch; pp++)
272         {
273             int p = pp * 4;
274 
275             Mat out0_tm = top_blob_tm.channel(p);
276             Mat out1_tm = top_blob_tm.channel(p + 1);
277             Mat out2_tm = top_blob_tm.channel(p + 2);
278             Mat out3_tm = top_blob_tm.channel(p + 3);
279 
280             const Mat kernel0_tm = kernel_tm.channel(p);
281             const Mat kernel1_tm = kernel_tm.channel(p + 1);
282             const Mat kernel2_tm = kernel_tm.channel(p + 2);
283             const Mat kernel3_tm = kernel_tm.channel(p + 3);
284 
285             for (int i = 0; i < tiles; i++)
286             {
287                 int* output0_tm = out0_tm.row<int>(i);
288                 int* output1_tm = out1_tm.row<int>(i);
289                 int* output2_tm = out2_tm.row<int>(i);
290                 int* output3_tm = out3_tm.row<int>(i);
291 
292                 int sum0[16] = {0};
293                 int sum1[16] = {0};
294                 int sum2[16] = {0};
295                 int sum3[16] = {0};
296 
297                 int q = 0;
298                 for (; q + 3 < inch; q += 4)
299                 {
300                     const short* r0 = bottom_blob_tm.channel(q).row<short>(i);
301                     const short* r1 = bottom_blob_tm.channel(q + 1).row<short>(i);
302                     const short* r2 = bottom_blob_tm.channel(q + 2).row<short>(i);
303                     const short* r3 = bottom_blob_tm.channel(q + 3).row<short>(i);
304 
305                     const short* k0 = kernel0_tm.row<short>(q);
306                     const short* k1 = kernel1_tm.row<short>(q);
307                     const short* k2 = kernel2_tm.row<short>(q);
308                     const short* k3 = kernel3_tm.row<short>(q);
309 
310                     for (int n = 0; n < 16; n++)
311                     {
312                         sum0[n] += (int)r0[n] * k0[n];
313                         k0 += 16;
314                         sum0[n] += (int)r1[n] * k0[n];
315                         k0 += 16;
316                         sum0[n] += (int)r2[n] * k0[n];
317                         k0 += 16;
318                         sum0[n] += (int)r3[n] * k0[n];
319                         k0 -= 16 * 3;
320 
321                         sum1[n] += (int)r0[n] * k1[n];
322                         k1 += 16;
323                         sum1[n] += (int)r1[n] * k1[n];
324                         k1 += 16;
325                         sum1[n] += (int)r2[n] * k1[n];
326                         k1 += 16;
327                         sum1[n] += (int)r3[n] * k1[n];
328                         k1 -= 16 * 3;
329 
330                         sum2[n] += (int)r0[n] * k2[n];
331                         k2 += 16;
332                         sum2[n] += (int)r1[n] * k2[n];
333                         k2 += 16;
334                         sum2[n] += (int)r2[n] * k2[n];
335                         k2 += 16;
336                         sum2[n] += (int)r3[n] * k2[n];
337                         k2 -= 16 * 3;
338 
339                         sum3[n] += (int)r0[n] * k3[n];
340                         k3 += 16;
341                         sum3[n] += (int)r1[n] * k3[n];
342                         k3 += 16;
343                         sum3[n] += (int)r2[n] * k3[n];
344                         k3 += 16;
345                         sum3[n] += (int)r3[n] * k3[n];
346                         k3 -= 16 * 3;
347                     }
348                 }
349 
350                 for (; q < inch; q++)
351                 {
352                     const short* r0 = bottom_blob_tm.channel(q).row<short>(i);
353 
354                     const short* k0 = kernel0_tm.row<short>(q);
355                     const short* k1 = kernel1_tm.row<short>(q);
356                     const short* k2 = kernel2_tm.row<short>(q);
357                     const short* k3 = kernel3_tm.row<short>(q);
358 
359                     for (int n = 0; n < 16; n++)
360                     {
361                         sum0[n] += (int)r0[n] * k0[n];
362                         sum1[n] += (int)r0[n] * k1[n];
363                         sum2[n] += (int)r0[n] * k2[n];
364                         sum3[n] += (int)r0[n] * k3[n];
365                     }
366                 }
367 
368                 for (int n = 0; n < 16; n++)
369                 {
370                     output0_tm[n] = sum0[n];
371                     output1_tm[n] = sum1[n];
372                     output2_tm[n] = sum2[n];
373                     output3_tm[n] = sum3[n];
374                 }
375             }
376         }
377 
378         #pragma omp parallel for num_threads(opt.num_threads)
379         for (int p = remain_outch_start; p < outch; p++)
380         {
381             Mat out0_tm = top_blob_tm.channel(p);
382             const Mat kernel0_tm = kernel_tm.channel(p);
383 
384             for (int i = 0; i < tiles; i++)
385             {
386                 int* output0_tm = out0_tm.row<int>(i);
387 
388                 int sum0[16] = {0};
389 
390                 int q = 0;
391                 for (; q + 3 < inch; q += 4)
392                 {
393                     const short* r0 = bottom_blob_tm.channel(q).row<short>(i);
394                     const short* r1 = bottom_blob_tm.channel(q + 1).row<short>(i);
395                     const short* r2 = bottom_blob_tm.channel(q + 2).row<short>(i);
396                     const short* r3 = bottom_blob_tm.channel(q + 3).row<short>(i);
397 
398                     const short* k0 = kernel0_tm.row<short>(q);
399                     const short* k1 = kernel0_tm.row<short>(q + 1);
400                     const short* k2 = kernel0_tm.row<short>(q + 2);
401                     const short* k3 = kernel0_tm.row<short>(q + 3);
402 
403                     for (int n = 0; n < 16; n++)
404                     {
405                         sum0[n] += (int)r0[n] * k0[n];
406                         sum0[n] += (int)r1[n] * k1[n];
407                         sum0[n] += (int)r2[n] * k2[n];
408                         sum0[n] += (int)r3[n] * k3[n];
409                     }
410                 }
411 
412                 for (; q < inch; q++)
413                 {
414                     const short* r0 = bottom_blob_tm.channel(q).row<short>(i);
415                     const short* k0 = kernel0_tm.row<short>(q);
416 
417                     for (int n = 0; n < 16; n++)
418                     {
419                         sum0[n] += (int)r0[n] * k0[n];
420                     }
421                 }
422 
423                 for (int n = 0; n < 16; n++)
424                 {
425                     output0_tm[n] = sum0[n];
426                 }
427             }
428         }
429     }
430     bottom_blob_tm = Mat();
431     // END dot
432 
433     // BEGIN transform output
434     Mat top_blob_bordered;
435     top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator);
436     {
437         // AT
438         // const float itm[2][4] = {
439         //     {1.0f,  1.0f,  1.0f,  0.0f},
440         //     {0.0f,  1.0f, -1.0f,  1.0f}
441         // };
442 
443         int w_tm = outw / 2 * 4;
444         int h_tm = outh / 2 * 4;
445 
446         int nColBlocks = h_tm / 4; // may be the block num in Feathercnn
447         int nRowBlocks = w_tm / 4;
448 
449         #pragma omp parallel for num_threads(opt.num_threads)
450         for (int p = 0; p < outch; p++)
451         {
452             Mat out_tm = top_blob_tm.channel(p);
453             Mat out = top_blob_bordered.channel(p);
454 
455             for (int j = 0; j < nColBlocks; j++)
456             {
457                 int* outRow0 = out.row<int>(j * 2);
458                 int* outRow1 = out.row<int>(j * 2 + 1);
459 
460                 for (int i = 0; i < nRowBlocks; i++)
461                 {
462                     int* out_tile = out_tm.row<int>(j * nRowBlocks + i);
463 
464                     int s0[4], s1[4], s2[4], s3[4];
465                     int w0[4], w1[4];
466                     int d0[2], d1[2], d2[2], d3[2];
467                     int o0[2], o1[2];
468                     // load
469                     for (int n = 0; n < 4; n++)
470                     {
471                         s0[n] = out_tile[n];
472                         s1[n] = out_tile[n + 4];
473                         s2[n] = out_tile[n + 8];
474                         s3[n] = out_tile[n + 12];
475                     }
476                     // w = A_T * W
477                     for (int n = 0; n < 4; n++)
478                     {
479                         w0[n] = s0[n] + s1[n] + s2[n];
480                         w1[n] = s1[n] - s2[n] + s3[n];
481                     }
482                     // transpose w to w_t
483                     {
484                         d0[0] = w0[0];
485                         d0[1] = w1[0];
486                         d1[0] = w0[1];
487                         d1[1] = w1[1];
488                         d2[0] = w0[2];
489                         d2[1] = w1[2];
490                         d3[0] = w0[3];
491                         d3[1] = w1[3];
492                     }
493                     // Y = A_T * w_t
494                     for (int n = 0; n < 2; n++)
495                     {
496                         o0[n] = d0[n] + d1[n] + d2[n];
497                         o1[n] = d1[n] - d2[n] + d3[n];
498                     }
499                     // save to top blob tm,why right 2,because the G' = G*2
500                     outRow0[0] = o0[0] >> 2;
501                     outRow0[1] = o0[1] >> 2;
502                     outRow1[0] = o1[0] >> 2;
503                     outRow1[1] = o1[1] >> 2;
504 
505                     outRow0 += 2;
506                     outRow1 += 2;
507                 }
508             }
509         }
510     }
511     // END transform output
512 
513     // cut result pad
514     copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
515 }
516 
conv3x3s1_winograd43_transform_kernel_int8_sse(const Mat & kernel,Mat & kernel_tm,int inch,int outch)517 static void conv3x3s1_winograd43_transform_kernel_int8_sse(const Mat& kernel, Mat& kernel_tm, int inch, int outch)
518 {
519     kernel_tm.create(6 * 6, inch, outch, 2ul);
520 
521     // G
522     // const float ktm[6][3] = {
523     //     {  1.0f/4,     0.0f,    0.0f},
524     //     { -1.0f/6,  -1.0f/6, -1.0f/6},
525     //     { -1.0f/6,   1.0f/6, -1.0f/6},
526     //     { 1.0f/24,  1.0f/12,  1.0f/6},
527     //     { 1.0f/24, -1.0f/12,  1.0f/6},
528     //     {    0.0f,     0.0f,    1.0f}
529     // };
530     const short ktm[6][3] = {
531         {6, 0, 0},
532         {-4, -4, -4},
533         {-4, 4, -4},
534         {1, 2, 4},
535         {1, -2, 4},
536         {0, 0, 24}
537     };
538 
539     #pragma omp parallel for
540     for (int p = 0; p < outch; p++)
541     {
542         for (int q = 0; q < inch; q++)
543         {
544             const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9;
545             short* kernel_tm0 = kernel_tm.channel(p).row<short>(q);
546 
547             // transform kernel
548             const signed char* k0 = kernel0;
549             const signed char* k1 = kernel0 + 3;
550             const signed char* k2 = kernel0 + 6;
551 
552             // h
553             short tmp[6][3];
554             for (int i = 0; i < 6; i++)
555             {
556                 tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
557                 tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
558                 tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
559             }
560 
561             // U
562             for (int j = 0; j < 6; j++)
563             {
564                 short* tmpp = &tmp[j][0];
565 
566                 for (int i = 0; i < 6; i++)
567                 {
568                     kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
569                 }
570             }
571         }
572     }
573 }
574 
conv3x3s1_winograd43_int8_sse(const Mat & bottom_blob,Mat & top_blob,const Mat & kernel_tm,const Option & opt)575 static void conv3x3s1_winograd43_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt)
576 {
577     int w = bottom_blob.w;
578     int h = bottom_blob.h;
579     int inch = bottom_blob.c;
580 
581     int outw = top_blob.w;
582     int outh = top_blob.h;
583     int outch = top_blob.c;
584 
585     // pad to 4n+2, winograd F(4,3)
586     Mat bottom_blob_bordered = bottom_blob;
587 
588     outw = (outw + 3) / 4 * 4;
589     outh = (outh + 3) / 4 * 4;
590 
591     w = outw + 2;
592     h = outh + 2;
593     Option opt_b = opt;
594     opt_b.blob_allocator = opt.workspace_allocator;
595     copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt_b);
596 
597     // BEGIN transform input
598     Mat bottom_blob_tm;
599     {
600         int w_tm = outw / 4 * 6;
601         int h_tm = outh / 4 * 6;
602 
603         int nColBlocks = h_tm / 6; // may be the block num in Feathercnn
604         int nRowBlocks = w_tm / 6;
605 
606         const int tiles = nColBlocks * nRowBlocks;
607 
608         bottom_blob_tm.create(6 * 6, tiles, inch, 2u, opt.workspace_allocator);
609 
610         // BT
611         // const float itm[4][4] = {
612         //     {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f},
613         //     {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f},
614         //     {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f},
615         //     {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f},
616         //     {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f},
617         //     {0.0f, 4.0f,  0.0f,-5.0f, 0.0f, 1.0f}
618         // };
619 
620         // 0 =	4 * r00  - 5 * r02	+ r04
621         // 1 = -4 * (r01 + r02)  + r03 + r04
622         // 2 =	4 * (r01 - r02)  - r03 + r04
623         // 3 = -2 * r01 - r02 + 2 * r03 + r04
624         // 4 =	2 * r01 - r02 - 2 * r03 + r04
625         // 5 =	4 * r01 - 5 * r03 + r05
626 
627         #pragma omp parallel for num_threads(opt.num_threads)
628         for (int q = 0; q < inch; q++)
629         {
630             const signed char* img = bottom_blob_bordered.channel(q);
631             short* out_tm0 = bottom_blob_tm.channel(q);
632 
633             for (int j = 0; j < nColBlocks; j++)
634             {
635                 const signed char* r0 = img + w * j * 4;
636                 const signed char* r1 = r0 + w;
637                 const signed char* r2 = r1 + w;
638                 const signed char* r3 = r2 + w;
639                 const signed char* r4 = r3 + w;
640                 const signed char* r5 = r4 + w;
641 
642                 for (int i = 0; i < nRowBlocks; i++)
643                 {
644                     short d0[6], d1[6], d2[6], d3[6], d4[6], d5[6];
645                     short w0[6], w1[6], w2[6], w3[6], w4[6], w5[6];
646                     short t0[6], t1[6], t2[6], t3[6], t4[6], t5[6];
647 
648                     // load
649                     for (int n = 0; n < 6; n++)
650                     {
651                         d0[n] = r0[n];
652                         d1[n] = r1[n];
653                         d2[n] = r2[n];
654                         d3[n] = r3[n];
655                         d4[n] = r4[n];
656                         d5[n] = r5[n];
657                     }
658                     // w = B_t * d
659                     for (int n = 0; n < 6; n++)
660                     {
661                         w0[n] = 4 * d0[n] - 5 * d2[n] + d4[n];
662                         w1[n] = -4 * d1[n] - 4 * d2[n] + d3[n] + d4[n];
663                         w2[n] = 4 * d1[n] - 4 * d2[n] - d3[n] + d4[n];
664                         w3[n] = -2 * d1[n] - d2[n] + 2 * d3[n] + d4[n];
665                         w4[n] = 2 * d1[n] - d2[n] - 2 * d3[n] + d4[n];
666                         w5[n] = 4 * d1[n] - 5 * d3[n] + d5[n];
667                     }
668                     // transpose d to d_t
669                     {
670                         t0[0] = w0[0];
671                         t1[0] = w0[1];
672                         t2[0] = w0[2];
673                         t3[0] = w0[3];
674                         t4[0] = w0[4];
675                         t5[0] = w0[5];
676                         t0[1] = w1[0];
677                         t1[1] = w1[1];
678                         t2[1] = w1[2];
679                         t3[1] = w1[3];
680                         t4[1] = w1[4];
681                         t5[1] = w1[5];
682                         t0[2] = w2[0];
683                         t1[2] = w2[1];
684                         t2[2] = w2[2];
685                         t3[2] = w2[3];
686                         t4[2] = w2[4];
687                         t5[2] = w2[5];
688                         t0[3] = w3[0];
689                         t1[3] = w3[1];
690                         t2[3] = w3[2];
691                         t3[3] = w3[3];
692                         t4[3] = w3[4];
693                         t5[3] = w3[5];
694                         t0[4] = w4[0];
695                         t1[4] = w4[1];
696                         t2[4] = w4[2];
697                         t3[4] = w4[3];
698                         t4[4] = w4[4];
699                         t5[4] = w4[5];
700                         t0[5] = w5[0];
701                         t1[5] = w5[1];
702                         t2[5] = w5[2];
703                         t3[5] = w5[3];
704                         t4[5] = w5[4];
705                         t5[5] = w5[5];
706                     }
707                     // d = B_t * d_t
708                     for (int n = 0; n < 6; n++)
709                     {
710                         d0[n] = 4 * t0[n] - 5 * t2[n] + t4[n];
711                         d1[n] = -4 * t1[n] - 4 * t2[n] + t3[n] + t4[n];
712                         d2[n] = 4 * t1[n] - 4 * t2[n] - t3[n] + t4[n];
713                         d3[n] = -2 * t1[n] - t2[n] + 2 * t3[n] + t4[n];
714                         d4[n] = 2 * t1[n] - t2[n] - 2 * t3[n] + t4[n];
715                         d5[n] = 4 * t1[n] - 5 * t3[n] + t5[n];
716                     }
717                     // save to out_tm
718                     for (int n = 0; n < 6; n++)
719                     {
720                         out_tm0[n] = d0[n];
721                         out_tm0[n + 6] = d1[n];
722                         out_tm0[n + 12] = d2[n];
723                         out_tm0[n + 18] = d3[n];
724                         out_tm0[n + 24] = d4[n];
725                         out_tm0[n + 30] = d5[n];
726                     }
727 
728                     r0 += 4;
729                     r1 += 4;
730                     r2 += 4;
731                     r3 += 4;
732                     r4 += 4;
733                     r5 += 4;
734 
735                     out_tm0 += 36;
736                 }
737             }
738         }
739     }
740     bottom_blob_bordered = Mat();
741 
742     // BEGIN dot
743     Mat top_blob_tm;
744     {
745         int w_tm = outw / 4 * 6;
746         int h_tm = outh / 4 * 6;
747 
748         int nColBlocks = h_tm / 6; // may be the block num in Feathercnn
749         int nRowBlocks = w_tm / 6;
750 
751         const int tiles = nColBlocks * nRowBlocks;
752 
753         top_blob_tm.create(36, tiles, outch, 4u, opt.workspace_allocator);
754 
755         #pragma omp parallel for num_threads(opt.num_threads)
756         for (int p = 0; p < outch; p++)
757         {
758             Mat out0_tm = top_blob_tm.channel(p);
759             const Mat kernel0_tm = kernel_tm.channel(p);
760 
761             for (int i = 0; i < tiles; i++)
762             {
763                 int* output0_tm = out0_tm.row<int>(i);
764 
765                 int sum0[36] = {0};
766 
767                 for (int q = 0; q < inch; q++)
768                 {
769                     const short* r0 = bottom_blob_tm.channel(q).row<short>(i);
770                     const short* k0 = kernel0_tm.row<short>(q);
771 
772                     for (int n = 0; n < 36; n++)
773                     {
774                         sum0[n] += (int)r0[n] * k0[n];
775                     }
776                 }
777 
778                 for (int n = 0; n < 36; n++)
779                 {
780                     output0_tm[n] = sum0[n];
781                 }
782             }
783         }
784     }
785     bottom_blob_tm = Mat();
786     // END dot
787 
788     // BEGIN transform output
789     Mat top_blob_bordered;
790     top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator);
791     {
792         // AT
793         // const float itm[4][6] = {
794         //     {1.0f, 1.0f,  1.0f, 1.0f,  1.0f, 0.0f},
795         //     {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f},
796         //     {0.0f, 1.0f,  1.0f, 4.0f,  4.0f, 0.0f},
797         //     {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f}
798         // };
799 
800         // 0 =	r00 + r01 + r02 + r03 +	r04
801         // 1 =		  r01 - r02 + 2 * (r03 - r04)
802         // 2 =		  r01 + r02 + 4 * (r03 + r04)
803         // 3 =		  r01 - r02 + 8 * (r03 - r04)  + r05
804 
805         int w_tm = outw / 4 * 6;
806         int h_tm = outh / 4 * 6;
807 
808         int nColBlocks = h_tm / 6; // may be the block num in Feathercnn
809         int nRowBlocks = w_tm / 6;
810 
811         #pragma omp parallel for num_threads(opt.num_threads)
812         for (int p = 0; p < outch; p++)
813         {
814             Mat out_tm = top_blob_tm.channel(p);
815             Mat out = top_blob_bordered.channel(p);
816 
817             for (int j = 0; j < nColBlocks; j++)
818             {
819                 int* outRow0 = out.row<int>(j * 4);
820                 int* outRow1 = out.row<int>(j * 4 + 1);
821                 int* outRow2 = out.row<int>(j * 4 + 2);
822                 int* outRow3 = out.row<int>(j * 4 + 3);
823 
824                 for (int i = 0; i < nRowBlocks; i++)
825                 {
826                     int* out_tile = out_tm.row<int>(j * nRowBlocks + i);
827 
828                     int s0[6], s1[6], s2[6], s3[6], s4[6], s5[6];
829                     int w0[6], w1[6], w2[6], w3[6];
830                     int d0[4], d1[4], d2[4], d3[4], d4[4], d5[4];
831                     int o0[4], o1[4], o2[4], o3[4];
832                     // load
833                     for (int n = 0; n < 6; n++)
834                     {
835                         s0[n] = out_tile[n];
836                         s1[n] = out_tile[n + 6];
837                         s2[n] = out_tile[n + 12];
838                         s3[n] = out_tile[n + 18];
839                         s4[n] = out_tile[n + 24];
840                         s5[n] = out_tile[n + 30];
841                     }
842                     // w = A_T * W
843                     for (int n = 0; n < 6; n++)
844                     {
845                         w0[n] = s0[n] + s1[n] + s2[n] + s3[n] + s4[n];
846                         w1[n] = s1[n] - s2[n] + 2 * s3[n] - 2 * s4[n];
847                         w2[n] = s1[n] + s2[n] + 4 * s3[n] + 4 * s4[n];
848                         w3[n] = s1[n] - s2[n] + 8 * s3[n] - 8 * s4[n] + s5[n];
849                     }
850                     // transpose w to w_t
851                     {
852                         d0[0] = w0[0];
853                         d0[1] = w1[0];
854                         d0[2] = w2[0];
855                         d0[3] = w3[0];
856                         d1[0] = w0[1];
857                         d1[1] = w1[1];
858                         d1[2] = w2[1];
859                         d1[3] = w3[1];
860                         d2[0] = w0[2];
861                         d2[1] = w1[2];
862                         d2[2] = w2[2];
863                         d2[3] = w3[2];
864                         d3[0] = w0[3];
865                         d3[1] = w1[3];
866                         d3[2] = w2[3];
867                         d3[3] = w3[3];
868                         d4[0] = w0[4];
869                         d4[1] = w1[4];
870                         d4[2] = w2[4];
871                         d4[3] = w3[4];
872                         d5[0] = w0[5];
873                         d5[1] = w1[5];
874                         d5[2] = w2[5];
875                         d5[3] = w3[5];
876                     }
877                     // Y = A_T * w_t
878                     for (int n = 0; n < 4; n++)
879                     {
880                         o0[n] = d0[n] + d1[n] + d2[n] + d3[n] + d4[n];
881                         o1[n] = d1[n] - d2[n] + 2 * d3[n] - 2 * d4[n];
882                         o2[n] = d1[n] + d2[n] + 4 * d3[n] + 4 * d4[n];
883                         o3[n] = d1[n] - d2[n] + 8 * d3[n] - 8 * d4[n] + d5[n];
884                     }
885                     // save to top blob tm
886                     for (int n = 0; n < 4; n++)
887                     {
888                         outRow0[n] = o0[n] / 576;
889                         outRow1[n] = o1[n] / 576;
890                         outRow2[n] = o2[n] / 576;
891                         outRow3[n] = o3[n] / 576;
892                     }
893 
894                     outRow0 += 4;
895                     outRow1 += 4;
896                     outRow2 += 4;
897                     outRow3 += 4;
898                 }
899             }
900         }
901     }
902     // END transform output
903 
904     // cut result pad
905     copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
906 }
907 
conv3x3s2_int8_sse(const Mat & bottom_blob,Mat & top_blob,const Mat & _kernel,const Option & opt)908 static void conv3x3s2_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt)
909 {
910     int w = bottom_blob.w;
911     int inch = bottom_blob.c;
912 
913     int outw = top_blob.w;
914     int outh = top_blob.h;
915     int outch = top_blob.c;
916 
917     const int tailstep = w - 2 * outw + w;
918 
919     const signed char* kernel = _kernel;
920 
921     #pragma omp parallel for num_threads(opt.num_threads)
922     for (int p = 0; p < outch; p++)
923     {
924         Mat out0 = top_blob.channel(p);
925 
926         out0.fill(0);
927 
928         const signed char* kernel0 = (const signed char*)kernel + p * inch * 9;
929 
930         for (int q = 0; q < inch; q++)
931         {
932             int* outptr0 = out0;
933 
934             const signed char* img0 = bottom_blob.channel(q);
935 
936             const signed char* r0 = img0;
937             const signed char* r1 = img0 + w;
938             const signed char* r2 = img0 + w * 2;
939 
940             for (int i = 0; i < outh; i++)
941             {
942                 int remain = outw;
943 
944                 for (; remain > 0; remain--)
945                 {
946                     int sum0 = 0;
947 
948                     sum0 += (int)r0[0] * kernel0[0];
949                     sum0 += (int)r0[1] * kernel0[1];
950                     sum0 += (int)r0[2] * kernel0[2];
951                     sum0 += (int)r1[0] * kernel0[3];
952                     sum0 += (int)r1[1] * kernel0[4];
953                     sum0 += (int)r1[2] * kernel0[5];
954                     sum0 += (int)r2[0] * kernel0[6];
955                     sum0 += (int)r2[1] * kernel0[7];
956                     sum0 += (int)r2[2] * kernel0[8];
957 
958                     *outptr0 += sum0;
959 
960                     r0 += 2;
961                     r1 += 2;
962                     r2 += 2;
963                     outptr0++;
964                 }
965 
966                 r0 += tailstep;
967                 r1 += tailstep;
968                 r2 += tailstep;
969             }
970 
971             kernel0 += 9;
972         }
973     }
974 }
975