1 // BUG1989 is pleased to support the open source community by supporting ncnn available.
2 //
3 // Copyright (C) 2019 BUG1989. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
// Quantize a float to int8 with round-to-nearest and symmetric saturation.
// The result is clamped to [-127, 127] (not -128) so the int8 range stays
// symmetric around zero, which is the convention for symmetric quantization.
static inline signed char float2int8(float v)
{
    // roundf keeps the computation in single precision (plain round() would
    // promote the argument to double for no benefit); rounding is
    // half-away-from-zero, e.g. 1.5f -> 2, -1.5f -> -2.
    int int32 = static_cast<int>(roundf(v));
    if (int32 > 127) return 127;
    if (int32 < -127) return -127;
    return (signed char)int32;
}
22 
// Int8 convolution computed as im2col + 4x4-blocked GEMM (plain C++ path).
//
// Pipeline:
//   1. im2row: expand the input so each output pixel owns one contiguous row
//      of K = inch * kernel_h * kernel_w int8 values.
//   2. Pack those rows four at a time (interleaved two K-elements deep) into
//      bottom_tm, and the kernel rows four at a time into kernel_tm, so the
//      GEMM inner loops read both operands strictly sequentially.
//   3. GEMM with int32 accumulation; raw int32 sums are written to top_blob
//      (any dequant/requant of the result happens elsewhere).
//
// Assumes bottom_blob is already padded: every access input[row * w + col]
// below must be in bounds — TODO confirm the caller applies padding.
// NOTE(review): no dilation support; stride is the only geometry parameter.
static void conv_im2col_sgemm_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel,
                                       const int kernel_w, const int kernel_h, const int stride_w, const int stride_h, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const signed char* kernel = _kernel;

    // im2row
    // Row r = (i * outw + j) holds the receptive field of output pixel (i, j),
    // laid out as [inch][kernel_h][kernel_w] int8 values.
    Mat bottom_im2row(kernel_h * kernel_w * inch, outw * outh, 1UL, opt.workspace_allocator);
    {
        signed char* ret = (signed char*)bottom_im2row;
        int retID = 0;

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                for (int p = 0; p < inch; p++)
                {
                    const signed char* input = bottom_blob.channel(p);
                    for (int u = 0; u < kernel_h; u++)
                    {
                        for (int v = 0; v < kernel_w; v++)
                        {
                            int row = u + i * stride_h;
                            int col = v + j * stride_w;
                            int index = row * w + col;
                            ret[retID] = input[index];
                            retID++;
                        }
                    }
                }
            }
        }
    }

    int kernel_size = kernel_w * kernel_h;
    int out_size = outw * outh;

    // int M = outch;  // outch
    int N = outw * outh;                // outsize or out stride
    int K = kernel_w * kernel_h * inch; // ksize * inch

    // bottom_im2row memory packed 4 x 4
    // Channels [0, out_size/4) each hold 4 interleaved im2row rows; the
    // out_size % 4 leftover rows get one channel each (hence the channel
    // count out_size / 4 + out_size % 4).
    Mat bottom_tm(4 * kernel_size, inch, out_size / 4 + out_size % 4, (size_t)1u, opt.workspace_allocator);
    {
        int nn_size = out_size >> 2;
        int remain_size_start = nn_size << 2;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_size; ii++)
        {
            int i = ii * 4;

            const signed char* img0 = bottom_im2row.row<signed char>(i);
            const signed char* img1 = bottom_im2row.row<signed char>(i + 1);
            const signed char* img2 = bottom_im2row.row<signed char>(i + 2);
            const signed char* img3 = bottom_im2row.row<signed char>(i + 3);

            signed char* tmpptr = bottom_tm.channel(i / 4);

            // Interleave two K-elements from each of the 4 rows per step:
            // r0k0 r0k1 r1k0 r1k1 r2k0 r2k1 r3k0 r3k1 ...
            int q = 0;
            for (; q + 1 < inch * kernel_size; q = q + 2)
            {
                tmpptr[0] = img0[0];
                tmpptr[1] = img0[1];
                tmpptr[2] = img1[0];
                tmpptr[3] = img1[1];
                tmpptr[4] = img2[0];
                tmpptr[5] = img2[1];
                tmpptr[6] = img3[0];
                tmpptr[7] = img3[1];

                tmpptr += 8;
                img0 += 2;
                img1 += 2;
                img2 += 2;
                img3 += 2;
            }

            // Odd-K tail: one element from each row.
            for (; q < inch * kernel_size; q++)
            {
                tmpptr[0] = img0[0];
                tmpptr[1] = img1[0];
                tmpptr[2] = img2[0];
                tmpptr[3] = img3[0];

                tmpptr += 4;
                img0 += 1;
                img1 += 1;
                img2 += 1;
                img3 += 1;
            }
        }

        // Leftover rows (out_size not a multiple of 4): one row per channel,
        // stored densely. i / 4 == nn_size here, so i / 4 + i % 4 addresses
        // the channels just past the 4-wide tiles.
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = remain_size_start; i < out_size; i++)
        {
            const signed char* img0 = bottom_im2row.row<signed char>(i);

            signed char* tmpptr = bottom_tm.channel(i / 4 + i % 4);

            int q = 0;
            for (; q + 1 < inch * kernel_size; q = q + 2)
            {
                tmpptr[0] = img0[0];
                tmpptr[1] = img0[1];

                tmpptr += 2;
                img0 += 2;
            }

            for (; q < inch * kernel_size; q++)
            {
                tmpptr[0] = img0[0];

                tmpptr += 1;
                img0 += 1;
            }
        }
    }

    // kernel memory packed 4 x 4
    // Same scheme as bottom_tm but over output channels: 4 kernels
    // interleaved per channel, leftovers one per channel.
    Mat kernel_tm(4 * kernel_size, inch, outch / 4 + outch % 4, (size_t)1u, opt.workspace_allocator);
    {
        int nn_outch = 0;
        int remain_outch_start = 0;

        nn_outch = outch >> 2;
        remain_outch_start = nn_outch << 2;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int pp = 0; pp < nn_outch; pp++)
        {
            int p = pp * 4;

            const signed char* k0 = kernel + (p + 0) * inch * kernel_size;
            const signed char* k1 = kernel + (p + 1) * inch * kernel_size;
            const signed char* k2 = kernel + (p + 2) * inch * kernel_size;
            const signed char* k3 = kernel + (p + 3) * inch * kernel_size;

            signed char* ktmp = kernel_tm.channel(p / 4);

            int q = 0;
            for (; q + 1 < inch * kernel_size; q += 2)
            {
                ktmp[0] = k0[0];
                ktmp[1] = k0[1];
                ktmp[2] = k1[0];
                ktmp[3] = k1[1];
                ktmp[4] = k2[0];
                ktmp[5] = k2[1];
                ktmp[6] = k3[0];
                ktmp[7] = k3[1];

                ktmp += 8;

                k0 += 2;
                k1 += 2;
                k2 += 2;
                k3 += 2;
            }

            for (; q < inch * kernel_size; q++)
            {
                ktmp[0] = k0[0];
                ktmp[1] = k1[0];
                ktmp[2] = k2[0];
                ktmp[3] = k3[0];
                ktmp += 4;

                k0 += 1;
                k1 += 1;
                k2 += 1;
                k3 += 1;
            }
        }

        // Leftover output channels: dense copy, one kernel per channel.
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = remain_outch_start; p < outch; p++)
        {
            const signed char* k0 = kernel + (p + 0) * inch * kernel_size;

            signed char* ktmp = kernel_tm.channel(p / 4 + p % 4);

            int q = 0;
            for (; q + 1 < inch * kernel_size; q = q + 2)
            {
                ktmp[0] = k0[0];
                ktmp[1] = k0[1];
                ktmp += 2;
                k0 += 2;
            }

            for (; q < inch * kernel_size; q++)
            {
                ktmp[0] = k0[0];
                ktmp++;
                k0++;
            }
        }
    }

    // 4x4
    // sgemm(int M, int N, int K, float* A, float* B, float* C)
    {
        // int M = outch;  // outch
        // int N = outw * outh; // outsize or out stride
        // int L = kernel_w * kernel_h * inch; // ksize * inch

        int nn_outch = 0;
        int remain_outch_start = 0;

        nn_outch = outch >> 2;
        remain_outch_start = nn_outch << 2;

        // Main tiles: 4 output channels x 4 output pixels per iteration,
        // int32 accumulators, K unrolled by 2 to match the packed layout.
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int pp = 0; pp < nn_outch; pp++)
        {
            int i = pp * 4;

            int* output0 = top_blob.channel(i);
            int* output1 = top_blob.channel(i + 1);
            int* output2 = top_blob.channel(i + 2);
            int* output3 = top_blob.channel(i + 3);

            int j = 0;
            for (; j + 3 < N; j = j + 4)
            {
                signed char* vb = bottom_tm.channel(j / 4);
                signed char* va = kernel_tm.channel(i / 4);

                int sum0[4] = {0};
                int sum1[4] = {0};
                int sum2[4] = {0};
                int sum3[4] = {0};

                int k = 0;

                for (; k + 1 < K; k = k + 2)
                {
                    for (int n = 0; n < 4; n++)
                    {
                        sum0[n] += (int)va[0] * vb[2 * n]; // k0
                        sum0[n] += (int)va[1] * vb[2 * n + 1];

                        sum1[n] += (int)va[2] * vb[2 * n]; // k1
                        sum1[n] += (int)va[3] * vb[2 * n + 1];

                        sum2[n] += (int)va[4] * vb[2 * n]; // k2
                        sum2[n] += (int)va[5] * vb[2 * n + 1];

                        sum3[n] += (int)va[6] * vb[2 * n]; // k3
                        sum3[n] += (int)va[7] * vb[2 * n + 1];
                    }

                    va += 8;
                    vb += 8;
                }

                // Odd-K tail (packed 4-wide in both operands).
                for (; k < K; k++)
                {
                    for (int n = 0; n < 4; n++)
                    {
                        sum0[n] += (int)va[0] * vb[n];
                        sum1[n] += (int)va[1] * vb[n];
                        sum2[n] += (int)va[2] * vb[n];
                        sum3[n] += (int)va[3] * vb[n];
                    }

                    va += 4;
                    vb += 4;
                }

                for (int n = 0; n < 4; n++)
                {
                    output0[n] = sum0[n];
                    output1[n] = sum1[n];
                    output2[n] = sum2[n];
                    output3[n] = sum3[n];
                }
                output0 += 4;
                output1 += 4;
                output2 += 4;
                output3 += 4;
            }

            // Leftover output pixels for this channel tile (dense vb layout).
            for (; j < N; j++)
            {
                int sum0 = 0;
                int sum1 = 0;
                int sum2 = 0;
                int sum3 = 0;

                signed char* vb = bottom_tm.channel(j / 4 + j % 4);
                signed char* va = kernel_tm.channel(i / 4);

                int k = 0;

                for (; k + 1 < K; k = k + 2)
                {
                    sum0 += (int)va[0] * vb[0];
                    sum0 += (int)va[1] * vb[1];

                    sum1 += (int)va[2] * vb[0];
                    sum1 += (int)va[3] * vb[1];

                    sum2 += (int)va[4] * vb[0];
                    sum2 += (int)va[5] * vb[1];

                    sum3 += (int)va[6] * vb[0];
                    sum3 += (int)va[7] * vb[1];

                    va += 8;
                    vb += 2;
                }

                for (; k < K; k++)
                {
                    sum0 += (int)va[0] * vb[0];
                    sum1 += (int)va[1] * vb[0];
                    sum2 += (int)va[2] * vb[0];
                    sum3 += (int)va[3] * vb[0];

                    va += 4;
                    vb += 1;
                }

                output0[0] = sum0;
                output1[0] = sum1;
                output2[0] = sum2;
                output3[0] = sum3;

                output0++;
                output1++;
                output2++;
                output3++;
            }
        }

        // Leftover output channels: 1 channel x 4 pixels, then 1 x 1.
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = remain_outch_start; i < outch; i++)
        {
            int* output = top_blob.channel(i);

            int j = 0;
            for (; j + 3 < N; j = j + 4)
            {
                signed char* vb = bottom_tm.channel(j / 4);
                signed char* va = kernel_tm.channel(i / 4 + i % 4);
                int sum[4] = {0};

                int k = 0;
                for (; k + 1 < K; k = k + 2)
                {
                    for (int n = 0; n < 4; n++)
                    {
                        sum[n] += (int)va[0] * vb[2 * n];
                        sum[n] += (int)va[1] * vb[2 * n + 1];
                    }
                    va += 2; // single-kernel channels are packed 2-wide
                    vb += 8;
                }

                for (; k < K; k++)
                {
                    for (int n = 0; n < 4; n++)
                    {
                        sum[n] += (int)va[0] * vb[n];
                    }
                    va += 1;
                    vb += 4;
                }

                for (int n = 0; n < 4; n++)
                {
                    output[n] = sum[n];
                }
                output += 4;
            }

            for (; j < N; j++)
            {
                int sum = 0;

                signed char* vb = bottom_tm.channel(j / 4 + j % 4);
                signed char* va = kernel_tm.channel(i / 4 + i % 4);

                for (int k = 0; k < K; k++)
                {
                    sum += (int)va[0] * vb[0];

                    va += 1;
                    vb += 1;
                }
                output[0] = sum;

                output++;
            }
        }
    }

    // Reference (unpacked, unblocked) GEMM kept for documentation:
    // // sgemm(int M, int N, int K, float* A, float* B, float* C)
    // {
    //     for (int i=0; i<M; i++)
    //     {
    //         int* output = top_blob.channel(i);

    //         for (int j=0; j<N; j++)
    //         {
    //             int sum = 0;

    //             signed char* vb = (signed char*)bottom_im2row + K * j;
    //             const signed char* va = kernel + K * i;

    //             for (int k=0; k<K; k++)
    //             {
    //                 sum += (int)va[0] * vb[0];

    //                 va += 1;
    //                 vb += 1;
    //             }
    //             output[0] = sum;

    //             output++;
    //         }
    //     }
    // }
}
457 
conv_im2col_sgemm_int8_dequant_sse(const Mat & bottom_blob,Mat & top_blob,const Mat & _kernel,const int kernel_w,const int kernel_h,const int stride_w,const int stride_h,const Mat & _bias,std::vector<float> scale_dequant,const Option & opt)458 static void conv_im2col_sgemm_int8_dequant_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel,
459         const int kernel_w, const int kernel_h, const int stride_w, const int stride_h, const Mat& _bias, std::vector<float> scale_dequant, const Option& opt)
460 {
461     int w = bottom_blob.w;
462     int inch = bottom_blob.c;
463 
464     int outw = top_blob.w;
465     int outh = top_blob.h;
466     int outch = top_blob.c;
467 
468     const signed char* kernel = _kernel;
469     const float* bias = _bias;
470 
471     // im2row
472     Mat bottom_im2row(kernel_h * kernel_w * inch, outw * outh, 1UL, opt.workspace_allocator);
473     {
474         signed char* ret = (signed char*)bottom_im2row;
475         int retID = 0;
476 
477         for (int i = 0; i < outh; i++)
478         {
479             for (int j = 0; j < outw; j++)
480             {
481                 for (int p = 0; p < inch; p++)
482                 {
483                     const signed char* input = bottom_blob.channel(p);
484                     for (int u = 0; u < kernel_h; u++)
485                     {
486                         for (int v = 0; v < kernel_w; v++)
487                         {
488                             int row = u + i * stride_h;
489                             int col = v + j * stride_w;
490                             int index = row * w + col;
491                             ret[retID] = input[index];
492                             retID++;
493                         }
494                     }
495                 }
496             }
497         }
498     }
499 
500     int kernel_size = kernel_w * kernel_h;
501     int out_size = outw * outh;
502 
503     // int M = outch;  // outch
504     int N = outw * outh;                // outsize or out stride
505     int K = kernel_w * kernel_h * inch; // ksize * inch
506 
507     // bottom_im2row memory packed 4 x 4
508     Mat bottom_tm(4 * kernel_size, inch, out_size / 4 + out_size % 4, (size_t)1u, opt.workspace_allocator);
509     {
510         int nn_size = out_size >> 2;
511         int remain_size_start = nn_size << 2;
512 
513         #pragma omp parallel for num_threads(opt.num_threads)
514         for (int ii = 0; ii < nn_size; ii++)
515         {
516             int i = ii * 4;
517 
518             const signed char* img0 = bottom_im2row.row<signed char>(i);
519             const signed char* img1 = bottom_im2row.row<signed char>(i + 1);
520             const signed char* img2 = bottom_im2row.row<signed char>(i + 2);
521             const signed char* img3 = bottom_im2row.row<signed char>(i + 3);
522 
523             signed char* tmpptr = bottom_tm.channel(i / 4);
524 
525             int q = 0;
526             for (; q + 1 < inch * kernel_size; q = q + 2)
527             {
528                 tmpptr[0] = img0[0];
529                 tmpptr[1] = img0[1];
530                 tmpptr[2] = img1[0];
531                 tmpptr[3] = img1[1];
532                 tmpptr[4] = img2[0];
533                 tmpptr[5] = img2[1];
534                 tmpptr[6] = img3[0];
535                 tmpptr[7] = img3[1];
536 
537                 tmpptr += 8;
538                 img0 += 2;
539                 img1 += 2;
540                 img2 += 2;
541                 img3 += 2;
542             }
543 
544             for (; q < inch * kernel_size; q++)
545             {
546                 tmpptr[0] = img0[0];
547                 tmpptr[1] = img1[0];
548                 tmpptr[2] = img2[0];
549                 tmpptr[3] = img3[0];
550 
551                 tmpptr += 4;
552                 img0 += 1;
553                 img1 += 1;
554                 img2 += 1;
555                 img3 += 1;
556             }
557         }
558 
559         #pragma omp parallel for num_threads(opt.num_threads)
560         for (int i = remain_size_start; i < out_size; i++)
561         {
562             const signed char* img0 = bottom_im2row.row<signed char>(i);
563 
564             signed char* tmpptr = bottom_tm.channel(i / 4 + i % 4);
565 
566             int q = 0;
567             for (; q + 1 < inch * kernel_size; q = q + 2)
568             {
569                 tmpptr[0] = img0[0];
570                 tmpptr[1] = img0[1];
571 
572                 tmpptr += 2;
573                 img0 += 2;
574             }
575 
576             for (; q < inch * kernel_size; q++)
577             {
578                 tmpptr[0] = img0[0];
579 
580                 tmpptr += 1;
581                 img0 += 1;
582             }
583         }
584     }
585 
586     // kernel memory packed 4 x 4
587     Mat kernel_tm(4 * kernel_size, inch, outch / 4 + outch % 4, (size_t)1u, opt.workspace_allocator);
588     {
589         int nn_outch = 0;
590         int remain_outch_start = 0;
591 
592         nn_outch = outch >> 2;
593         remain_outch_start = nn_outch << 2;
594 
595         #pragma omp parallel for num_threads(opt.num_threads)
596         for (int pp = 0; pp < nn_outch; pp++)
597         {
598             int p = pp * 4;
599 
600             const signed char* k0 = kernel + (p + 0) * inch * kernel_size;
601             const signed char* k1 = kernel + (p + 1) * inch * kernel_size;
602             const signed char* k2 = kernel + (p + 2) * inch * kernel_size;
603             const signed char* k3 = kernel + (p + 3) * inch * kernel_size;
604 
605             signed char* ktmp = kernel_tm.channel(p / 4);
606 
607             int q = 0;
608             for (; q + 1 < inch * kernel_size; q += 2)
609             {
610                 ktmp[0] = k0[0];
611                 ktmp[1] = k0[1];
612                 ktmp[2] = k1[0];
613                 ktmp[3] = k1[1];
614                 ktmp[4] = k2[0];
615                 ktmp[5] = k2[1];
616                 ktmp[6] = k3[0];
617                 ktmp[7] = k3[1];
618 
619                 ktmp += 8;
620 
621                 k0 += 2;
622                 k1 += 2;
623                 k2 += 2;
624                 k3 += 2;
625             }
626 
627             for (; q < inch * kernel_size; q++)
628             {
629                 ktmp[0] = k0[0];
630                 ktmp[1] = k1[0];
631                 ktmp[2] = k2[0];
632                 ktmp[3] = k3[0];
633                 ktmp += 4;
634 
635                 k0 += 1;
636                 k1 += 1;
637                 k2 += 1;
638                 k3 += 1;
639             }
640         }
641 
642         #pragma omp parallel for num_threads(opt.num_threads)
643         for (int p = remain_outch_start; p < outch; p++)
644         {
645             const signed char* k0 = kernel + (p + 0) * inch * kernel_size;
646 
647             signed char* ktmp = kernel_tm.channel(p / 4 + p % 4);
648 
649             int q = 0;
650             for (; q + 1 < inch * kernel_size; q = q + 2)
651             {
652                 ktmp[0] = k0[0];
653                 ktmp[1] = k0[1];
654                 ktmp += 2;
655                 k0 += 2;
656             }
657 
658             for (; q < inch * kernel_size; q++)
659             {
660                 ktmp[0] = k0[0];
661                 ktmp++;
662                 k0++;
663             }
664         }
665     }
666 
667     // 4x4
668     // sgemm(int M, int N, int K, float* A, float* B, float* C)
669     {
670         // int M = outch;  // outch
671         // int N = outw * outh; // outsize or out stride
672         // int L = kernel_w * kernel_h * inch; // ksize * inch
673 
674         int nn_outch = 0;
675         int remain_outch_start = 0;
676 
677         nn_outch = outch >> 2;
678         remain_outch_start = nn_outch << 2;
679 
680         #pragma omp parallel for num_threads(opt.num_threads)
681         for (int pp = 0; pp < nn_outch; pp++)
682         {
683             int i = pp * 4;
684 
685             const float bias0 = bias ? bias[i] : 0.f;
686             const float bias1 = bias ? bias[i + 1] : 0.f;
687             const float bias2 = bias ? bias[i + 2] : 0.f;
688             const float bias3 = bias ? bias[i + 3] : 0.f;
689 
690             const float scale_dequant0 = scale_dequant[i];
691             const float scale_dequant1 = scale_dequant[i + 1];
692             const float scale_dequant2 = scale_dequant[i + 2];
693             const float scale_dequant3 = scale_dequant[i + 3];
694 
695             float* output0 = top_blob.channel(i);
696             float* output1 = top_blob.channel(i + 1);
697             float* output2 = top_blob.channel(i + 2);
698             float* output3 = top_blob.channel(i + 3);
699 
700             int j = 0;
701             for (; j + 3 < N; j = j + 4)
702             {
703                 signed char* vb = bottom_tm.channel(j / 4);
704                 signed char* va = kernel_tm.channel(i / 4);
705 
706                 int sum0[4] = {0};
707                 int sum1[4] = {0};
708                 int sum2[4] = {0};
709                 int sum3[4] = {0};
710 
711                 int k = 0;
712 
713                 for (; k + 1 < K; k = k + 2)
714                 {
715                     for (int n = 0; n < 4; n++)
716                     {
717                         sum0[n] += (int)va[0] * vb[2 * n]; // k0
718                         sum0[n] += (int)va[1] * vb[2 * n + 1];
719 
720                         sum1[n] += (int)va[2] * vb[2 * n]; // k1
721                         sum1[n] += (int)va[3] * vb[2 * n + 1];
722 
723                         sum2[n] += (int)va[4] * vb[2 * n]; // k2
724                         sum2[n] += (int)va[5] * vb[2 * n + 1];
725 
726                         sum3[n] += (int)va[6] * vb[2 * n]; // k3
727                         sum3[n] += (int)va[7] * vb[2 * n + 1];
728                     }
729 
730                     va += 8;
731                     vb += 8;
732                 }
733 
734                 for (; k < K; k++)
735                 {
736                     for (int n = 0; n < 4; n++)
737                     {
738                         sum0[n] += (int)va[0] * vb[n];
739                         sum1[n] += (int)va[1] * vb[n];
740                         sum2[n] += (int)va[2] * vb[n];
741                         sum3[n] += (int)va[3] * vb[n];
742                     }
743 
744                     va += 4;
745                     vb += 4;
746                 }
747 
748                 for (int n = 0; n < 4; n++)
749                 {
750                     output0[n] = (float)sum0[n] * scale_dequant0 + bias0;
751                     output1[n] = (float)sum1[n] * scale_dequant1 + bias1;
752                     output2[n] = (float)sum2[n] * scale_dequant2 + bias2;
753                     output3[n] = (float)sum3[n] * scale_dequant3 + bias3;
754                 }
755                 output0 += 4;
756                 output1 += 4;
757                 output2 += 4;
758                 output3 += 4;
759             }
760 
761             for (; j < N; j++)
762             {
763                 int sum0 = 0;
764                 int sum1 = 0;
765                 int sum2 = 0;
766                 int sum3 = 0;
767 
768                 signed char* vb = bottom_tm.channel(j / 4 + j % 4);
769                 signed char* va = kernel_tm.channel(i / 4);
770 
771                 int k = 0;
772 
773                 for (; k + 1 < K; k = k + 2)
774                 {
775                     sum0 += (int)va[0] * vb[0];
776                     sum0 += (int)va[1] * vb[1];
777 
778                     sum1 += (int)va[2] * vb[0];
779                     sum1 += (int)va[3] * vb[1];
780 
781                     sum2 += (int)va[4] * vb[0];
782                     sum2 += (int)va[5] * vb[1];
783 
784                     sum3 += (int)va[6] * vb[0];
785                     sum3 += (int)va[7] * vb[1];
786 
787                     va += 8;
788                     vb += 2;
789                 }
790 
791                 for (; k < K; k++)
792                 {
793                     sum0 += (int)va[0] * vb[0];
794                     sum1 += (int)va[1] * vb[0];
795                     sum2 += (int)va[2] * vb[0];
796                     sum3 += (int)va[3] * vb[0];
797 
798                     va += 4;
799                     vb += 1;
800                 }
801 
802                 output0[0] = (float)sum0 * scale_dequant0 + bias0;
803                 output1[0] = (float)sum1 * scale_dequant1 + bias1;
804                 output2[0] = (float)sum2 * scale_dequant2 + bias2;
805                 output3[0] = (float)sum3 * scale_dequant3 + bias3;
806 
807                 output0++;
808                 output1++;
809                 output2++;
810                 output3++;
811             }
812         }
813 
814         #pragma omp parallel for num_threads(opt.num_threads)
815         for (int i = remain_outch_start; i < outch; i++)
816         {
817             float* output = top_blob.channel(i);
818 
819             const float bias0 = bias ? bias[i] : 0.f;
820             const float scale_dequant0 = scale_dequant[i];
821 
822             int j = 0;
823             for (; j + 3 < N; j = j + 4)
824             {
825                 signed char* vb = bottom_tm.channel(j / 4);
826                 signed char* va = kernel_tm.channel(i / 4 + i % 4);
827                 int sum[4] = {0};
828 
829                 int k = 0;
830                 for (; k + 1 < K; k = k + 2)
831                 {
832                     for (int n = 0; n < 4; n++)
833                     {
834                         sum[n] += (int)va[0] * vb[2 * n];
835                         sum[n] += (int)va[1] * vb[2 * n + 1];
836                     }
837                     va += 2;
838                     vb += 8;
839                 }
840 
841                 for (; k < K; k++)
842                 {
843                     for (int n = 0; n < 4; n++)
844                     {
845                         sum[n] += (int)va[0] * vb[n];
846                     }
847                     va += 1;
848                     vb += 4;
849                 }
850 
851                 for (int n = 0; n < 4; n++)
852                 {
853                     output[n] = (float)sum[n] * scale_dequant0 + bias0;
854                 }
855                 output += 4;
856             }
857 
858             for (; j < N; j++)
859             {
860                 int sum = 0;
861 
862                 signed char* vb = bottom_tm.channel(j / 4 + j % 4);
863                 signed char* va = kernel_tm.channel(i / 4 + i % 4);
864 
865                 for (int k = 0; k < K; k++)
866                 {
867                     sum += (int)va[0] * vb[0];
868 
869                     va += 1;
870                     vb += 1;
871                 }
872                 output[0] = (float)sum * scale_dequant0 + bias0;
873 
874                 output++;
875             }
876         }
877     }
878 
879     // // sgemm(int M, int N, int K, float* A, float* B, float* C)
880     // {
881     //     for (int i=0; i<M; i++)
882     //     {
883     //         int* output = top_blob.channel(i);
884 
885     //         for (int j=0; j<N; j++)
886     //         {
887     //             int sum = 0;
888 
889     //             signed char* vb = (signed char*)bottom_im2row + K * j;
890     //             const signed char* va = kernel + K * i;
891 
892     //             for (int k=0; k<K; k++)
893     //             {
894     //                 sum += (int)va[0] * vb[0];
895 
896     //                 va += 1;
897     //                 vb += 1;
898     //             }
899     //             output[0] = sum;
900 
901     //             output++;
902     //         }
903     //     }
904     // }
905 }
906 
conv_im2col_sgemm_int8_requant_sse(const Mat & bottom_blob,Mat & top_blob,const Mat & _kernel,const int kernel_w,const int kernel_h,const int stride_w,const int stride_h,const Mat & _bias,std::vector<float> scale_requant,const Option & opt)907 static void conv_im2col_sgemm_int8_requant_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel,
908         const int kernel_w, const int kernel_h, const int stride_w, const int stride_h, const Mat& _bias, std::vector<float> scale_requant, const Option& opt)
909 {
910     int w = bottom_blob.w;
911     int inch = bottom_blob.c;
912 
913     int outw = top_blob.w;
914     int outh = top_blob.h;
915     int outch = top_blob.c;
916 
917     const signed char* kernel = _kernel;
918     const float* bias = _bias;
919 
920     // im2row
921     Mat bottom_im2row(kernel_h * kernel_w * inch, outw * outh, 1UL, opt.workspace_allocator);
922     {
923         signed char* ret = (signed char*)bottom_im2row;
924         int retID = 0;
925 
926         for (int i = 0; i < outh; i++)
927         {
928             for (int j = 0; j < outw; j++)
929             {
930                 for (int p = 0; p < inch; p++)
931                 {
932                     const signed char* input = bottom_blob.channel(p);
933                     for (int u = 0; u < kernel_h; u++)
934                     {
935                         for (int v = 0; v < kernel_w; v++)
936                         {
937                             int row = u + i * stride_h;
938                             int col = v + j * stride_w;
939                             int index = row * w + col;
940                             ret[retID] = input[index];
941                             retID++;
942                         }
943                     }
944                 }
945             }
946         }
947     }
948 
949     int kernel_size = kernel_w * kernel_h;
950     int out_size = outw * outh;
951 
952     // int M = outch;  // outch
953     int N = outw * outh;                // outsize or out stride
954     int K = kernel_w * kernel_h * inch; // ksize * inch
955 
956     // bottom_im2row memory packed 4 x 4
957     Mat bottom_tm(4 * kernel_size, inch, out_size / 4 + out_size % 4, (size_t)1u, opt.workspace_allocator);
958     {
959         int nn_size = out_size >> 2;
960         int remain_size_start = nn_size << 2;
961 
962         #pragma omp parallel for num_threads(opt.num_threads)
963         for (int ii = 0; ii < nn_size; ii++)
964         {
965             int i = ii * 4;
966 
967             const signed char* img0 = bottom_im2row.row<signed char>(i);
968             const signed char* img1 = bottom_im2row.row<signed char>(i + 1);
969             const signed char* img2 = bottom_im2row.row<signed char>(i + 2);
970             const signed char* img3 = bottom_im2row.row<signed char>(i + 3);
971 
972             signed char* tmpptr = bottom_tm.channel(i / 4);
973 
974             int q = 0;
975             for (; q + 1 < inch * kernel_size; q = q + 2)
976             {
977                 tmpptr[0] = img0[0];
978                 tmpptr[1] = img0[1];
979                 tmpptr[2] = img1[0];
980                 tmpptr[3] = img1[1];
981                 tmpptr[4] = img2[0];
982                 tmpptr[5] = img2[1];
983                 tmpptr[6] = img3[0];
984                 tmpptr[7] = img3[1];
985 
986                 tmpptr += 8;
987                 img0 += 2;
988                 img1 += 2;
989                 img2 += 2;
990                 img3 += 2;
991             }
992 
993             for (; q < inch * kernel_size; q++)
994             {
995                 tmpptr[0] = img0[0];
996                 tmpptr[1] = img1[0];
997                 tmpptr[2] = img2[0];
998                 tmpptr[3] = img3[0];
999 
1000                 tmpptr += 4;
1001                 img0 += 1;
1002                 img1 += 1;
1003                 img2 += 1;
1004                 img3 += 1;
1005             }
1006         }
1007 
1008         #pragma omp parallel for num_threads(opt.num_threads)
1009         for (int i = remain_size_start; i < out_size; i++)
1010         {
1011             const signed char* img0 = bottom_im2row.row<signed char>(i);
1012 
1013             signed char* tmpptr = bottom_tm.channel(i / 4 + i % 4);
1014 
1015             int q = 0;
1016             for (; q + 1 < inch * kernel_size; q = q + 2)
1017             {
1018                 tmpptr[0] = img0[0];
1019                 tmpptr[1] = img0[1];
1020 
1021                 tmpptr += 2;
1022                 img0 += 2;
1023             }
1024 
1025             for (; q < inch * kernel_size; q++)
1026             {
1027                 tmpptr[0] = img0[0];
1028 
1029                 tmpptr += 1;
1030                 img0 += 1;
1031             }
1032         }
1033     }
1034 
1035     // kernel memory packed 4 x 4
1036     Mat kernel_tm(4 * kernel_size, inch, outch / 4 + outch % 4, (size_t)1u, opt.workspace_allocator);
1037     {
1038         int nn_outch = 0;
1039         int remain_outch_start = 0;
1040 
1041         nn_outch = outch >> 2;
1042         remain_outch_start = nn_outch << 2;
1043 
1044         #pragma omp parallel for num_threads(opt.num_threads)
1045         for (int pp = 0; pp < nn_outch; pp++)
1046         {
1047             int p = pp * 4;
1048 
1049             const signed char* k0 = kernel + (p + 0) * inch * kernel_size;
1050             const signed char* k1 = kernel + (p + 1) * inch * kernel_size;
1051             const signed char* k2 = kernel + (p + 2) * inch * kernel_size;
1052             const signed char* k3 = kernel + (p + 3) * inch * kernel_size;
1053 
1054             signed char* ktmp = kernel_tm.channel(p / 4);
1055 
1056             int q = 0;
1057             for (; q + 1 < inch * kernel_size; q += 2)
1058             {
1059                 ktmp[0] = k0[0];
1060                 ktmp[1] = k0[1];
1061                 ktmp[2] = k1[0];
1062                 ktmp[3] = k1[1];
1063                 ktmp[4] = k2[0];
1064                 ktmp[5] = k2[1];
1065                 ktmp[6] = k3[0];
1066                 ktmp[7] = k3[1];
1067 
1068                 ktmp += 8;
1069 
1070                 k0 += 2;
1071                 k1 += 2;
1072                 k2 += 2;
1073                 k3 += 2;
1074             }
1075 
1076             for (; q < inch * kernel_size; q++)
1077             {
1078                 ktmp[0] = k0[0];
1079                 ktmp[1] = k1[0];
1080                 ktmp[2] = k2[0];
1081                 ktmp[3] = k3[0];
1082                 ktmp += 4;
1083 
1084                 k0 += 1;
1085                 k1 += 1;
1086                 k2 += 1;
1087                 k3 += 1;
1088             }
1089         }
1090 
1091         #pragma omp parallel for num_threads(opt.num_threads)
1092         for (int p = remain_outch_start; p < outch; p++)
1093         {
1094             const signed char* k0 = kernel + (p + 0) * inch * kernel_size;
1095 
1096             signed char* ktmp = kernel_tm.channel(p / 4 + p % 4);
1097 
1098             int q = 0;
1099             for (; q + 1 < inch * kernel_size; q = q + 2)
1100             {
1101                 ktmp[0] = k0[0];
1102                 ktmp[1] = k0[1];
1103                 ktmp += 2;
1104                 k0 += 2;
1105             }
1106 
1107             for (; q < inch * kernel_size; q++)
1108             {
1109                 ktmp[0] = k0[0];
1110                 ktmp++;
1111                 k0++;
1112             }
1113         }
1114     }
1115 
1116     // 4x4
1117     // sgemm(int M, int N, int K, float* A, float* B, float* C)
1118     {
1119         // int M = outch;  // outch
1120         // int N = outw * outh; // outsize or out stride
1121         // int L = kernel_w * kernel_h * inch; // ksize * inch
1122 
1123         int nn_outch = 0;
1124         int remain_outch_start = 0;
1125 
1126         nn_outch = outch >> 2;
1127         remain_outch_start = nn_outch << 2;
1128 
1129         #pragma omp parallel for num_threads(opt.num_threads)
1130         for (int pp = 0; pp < nn_outch; pp++)
1131         {
1132             int i = pp * 4;
1133 
1134             signed char* output0 = top_blob.channel(i);
1135             signed char* output1 = top_blob.channel(i + 1);
1136             signed char* output2 = top_blob.channel(i + 2);
1137             signed char* output3 = top_blob.channel(i + 3);
1138 
1139             const float bias0 = bias ? bias[i] : 0.f;
1140             const float bias1 = bias ? bias[i + 1] : 0.f;
1141             const float bias2 = bias ? bias[i + 2] : 0.f;
1142             const float bias3 = bias ? bias[i + 3] : 0.f;
1143 
1144             const float scale_requant_in0 = scale_requant[2 * i];
1145             const float scale_requant_out0 = scale_requant[2 * i + 1];
1146             const float scale_requant_in1 = scale_requant[2 * (i + 1)];
1147             const float scale_requant_out1 = scale_requant[2 * (i + 1) + 1];
1148             const float scale_requant_in2 = scale_requant[2 * (i + 2)];
1149             const float scale_requant_out2 = scale_requant[2 * (i + 2) + 1];
1150             const float scale_requant_in3 = scale_requant[2 * (i + 3)];
1151             const float scale_requant_out3 = scale_requant[2 * (i + 3) + 1];
1152 
1153             int j = 0;
1154             for (; j + 3 < N; j = j + 4)
1155             {
1156                 signed char* vb = bottom_tm.channel(j / 4);
1157                 signed char* va = kernel_tm.channel(i / 4);
1158 
1159                 int sum0[4] = {0};
1160                 int sum1[4] = {0};
1161                 int sum2[4] = {0};
1162                 int sum3[4] = {0};
1163 
1164                 int k = 0;
1165 
1166                 for (; k + 1 < K; k = k + 2)
1167                 {
1168                     for (int n = 0; n < 4; n++)
1169                     {
1170                         sum0[n] += (int)va[0] * vb[2 * n]; // k0
1171                         sum0[n] += (int)va[1] * vb[2 * n + 1];
1172 
1173                         sum1[n] += (int)va[2] * vb[2 * n]; // k1
1174                         sum1[n] += (int)va[3] * vb[2 * n + 1];
1175 
1176                         sum2[n] += (int)va[4] * vb[2 * n]; // k2
1177                         sum2[n] += (int)va[5] * vb[2 * n + 1];
1178 
1179                         sum3[n] += (int)va[6] * vb[2 * n]; // k3
1180                         sum3[n] += (int)va[7] * vb[2 * n + 1];
1181                     }
1182 
1183                     va += 8;
1184                     vb += 8;
1185                 }
1186 
1187                 for (; k < K; k++)
1188                 {
1189                     for (int n = 0; n < 4; n++)
1190                     {
1191                         sum0[n] += (int)va[0] * vb[n];
1192                         sum1[n] += (int)va[1] * vb[n];
1193                         sum2[n] += (int)va[2] * vb[n];
1194                         sum3[n] += (int)va[3] * vb[n];
1195                     }
1196 
1197                     va += 4;
1198                     vb += 4;
1199                 }
1200 
1201                 for (int n = 0; n < 4; n++)
1202                 {
1203                     output0[n] = float2int8(((float)sum0[n] * scale_requant_in0 + bias0) * scale_requant_out0);
1204                     output1[n] = float2int8(((float)sum1[n] * scale_requant_in1 + bias1) * scale_requant_out1);
1205                     output2[n] = float2int8(((float)sum2[n] * scale_requant_in2 + bias2) * scale_requant_out2);
1206                     output3[n] = float2int8(((float)sum3[n] * scale_requant_in3 + bias3) * scale_requant_out3);
1207                 }
1208                 output0 += 4;
1209                 output1 += 4;
1210                 output2 += 4;
1211                 output3 += 4;
1212             }
1213 
1214             for (; j < N; j++)
1215             {
1216                 int sum0 = 0;
1217                 int sum1 = 0;
1218                 int sum2 = 0;
1219                 int sum3 = 0;
1220 
1221                 signed char* vb = bottom_tm.channel(j / 4 + j % 4);
1222                 signed char* va = kernel_tm.channel(i / 4);
1223 
1224                 int k = 0;
1225 
1226                 for (; k + 1 < K; k = k + 2)
1227                 {
1228                     sum0 += (int)va[0] * vb[0];
1229                     sum0 += (int)va[1] * vb[1];
1230 
1231                     sum1 += (int)va[2] * vb[0];
1232                     sum1 += (int)va[3] * vb[1];
1233 
1234                     sum2 += (int)va[4] * vb[0];
1235                     sum2 += (int)va[5] * vb[1];
1236 
1237                     sum3 += (int)va[6] * vb[0];
1238                     sum3 += (int)va[7] * vb[1];
1239 
1240                     va += 8;
1241                     vb += 2;
1242                 }
1243 
1244                 for (; k < K; k++)
1245                 {
1246                     sum0 += (int)va[0] * vb[0];
1247                     sum1 += (int)va[1] * vb[0];
1248                     sum2 += (int)va[2] * vb[0];
1249                     sum3 += (int)va[3] * vb[0];
1250 
1251                     va += 4;
1252                     vb += 1;
1253                 }
1254 
1255                 output0[0] = float2int8(((float)sum0 * scale_requant_in0 + bias0) * scale_requant_out0);
1256                 output1[0] = float2int8(((float)sum1 * scale_requant_in1 + bias1) * scale_requant_out1);
1257                 output2[0] = float2int8(((float)sum2 * scale_requant_in2 + bias2) * scale_requant_out2);
1258                 output3[0] = float2int8(((float)sum3 * scale_requant_in3 + bias3) * scale_requant_out3);
1259 
1260                 output0++;
1261                 output1++;
1262                 output2++;
1263                 output3++;
1264             }
1265         }
1266 
1267         #pragma omp parallel for num_threads(opt.num_threads)
1268         for (int i = remain_outch_start; i < outch; i++)
1269         {
1270             signed char* output = top_blob.channel(i);
1271 
1272             const float bias0 = bias ? bias[i] : 0.f;
1273 
1274             const float scale_requant_in0 = scale_requant[2 * i];
1275             const float scale_requant_out0 = scale_requant[2 * i + 1];
1276 
1277             int j = 0;
1278             for (; j + 3 < N; j = j + 4)
1279             {
1280                 signed char* vb = bottom_tm.channel(j / 4);
1281                 signed char* va = kernel_tm.channel(i / 4 + i % 4);
1282                 int sum[4] = {0};
1283 
1284                 int k = 0;
1285                 for (; k + 1 < K; k = k + 2)
1286                 {
1287                     for (int n = 0; n < 4; n++)
1288                     {
1289                         sum[n] += (int)va[0] * vb[2 * n];
1290                         sum[n] += (int)va[1] * vb[2 * n + 1];
1291                     }
1292                     va += 2;
1293                     vb += 8;
1294                 }
1295 
1296                 for (; k < K; k++)
1297                 {
1298                     for (int n = 0; n < 4; n++)
1299                     {
1300                         sum[n] += (int)va[0] * vb[n];
1301                     }
1302                     va += 1;
1303                     vb += 4;
1304                 }
1305 
1306                 for (int n = 0; n < 4; n++)
1307                 {
1308                     output[n] = float2int8(((float)sum[n] * scale_requant_in0 + bias0) * scale_requant_out0);
1309                 }
1310                 output += 4;
1311             }
1312 
1313             for (; j < N; j++)
1314             {
1315                 int sum = 0;
1316 
1317                 signed char* vb = bottom_tm.channel(j / 4 + j % 4);
1318                 signed char* va = kernel_tm.channel(i / 4 + i % 4);
1319 
1320                 for (int k = 0; k < K; k++)
1321                 {
1322                     sum += (int)va[0] * vb[0];
1323 
1324                     va += 1;
1325                     vb += 1;
1326                 }
1327                 output[0] = float2int8(((float)sum * scale_requant_in0 + bias0) * scale_requant_out0);
1328 
1329                 output++;
1330             }
1331         }
1332     }
1333 
1334     // // sgemm(int M, int N, int K, float* A, float* B, float* C)
1335     // {
1336     //     for (int i=0; i<M; i++)
1337     //     {
1338     //         int* output = top_blob.channel(i);
1339 
1340     //         for (int j=0; j<N; j++)
1341     //         {
1342     //             int sum = 0;
1343 
1344     //             signed char* vb = (signed char*)bottom_im2row + K * j;
1345     //             const signed char* va = kernel + K * i;
1346 
1347     //             for (int k=0; k<K; k++)
1348     //             {
1349     //                 sum += (int)va[0] * vb[0];
1350 
1351     //                 va += 1;
1352     //                 vb += 1;
1353     //             }
1354     //             output[0] = sum;
1355 
1356     //             output++;
1357     //         }
1358     //     }
1359     // }
1360 }
1361