1 // BUG1989 is pleased to support the open source community by supporting ncnn available.
2 //
3 // Copyright (C) 2019 BUG1989. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14
// Quantize a float to int8 with round-half-away-from-zero, saturating to
// the symmetric range [-127, 127] (-128 is deliberately never produced).
static inline signed char float2int8(float v)
{
    const int q = static_cast<int>(round(v));
    if (q >= 127) return 127;
    if (q <= -127) return -127;
    return static_cast<signed char>(q);
}
22
// Int8 convolution via im2col + 4x4-blocked integer GEMM (scalar x86 path).
// Inputs/weights are signed char; the raw int32 accumulators are written to
// top_blob (accessed through int*), with no bias or dequantization applied.
// Assumes bottom_blob is already padded so every sampled row/col index is in
// bounds, and that top_blob is pre-sized to outch x outh x outw of int32
// elements — TODO confirm against the caller.
static void conv_im2col_sgemm_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel,
                                       const int kernel_w, const int kernel_h, const int stride_w, const int stride_h, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const signed char* kernel = _kernel;

    // im2row: unfold input patches so each output pixel owns one contiguous
    // row of length inch * kernel_h * kernel_w (row-major over output pixels).
    Mat bottom_im2row(kernel_h * kernel_w * inch, outw * outh, 1UL, opt.workspace_allocator);
    {
        signed char* ret = (signed char*)bottom_im2row;
        int retID = 0;

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                for (int p = 0; p < inch; p++)
                {
                    const signed char* input = bottom_blob.channel(p);
                    for (int u = 0; u < kernel_h; u++)
                    {
                        for (int v = 0; v < kernel_w; v++)
                        {
                            int row = u + i * stride_h;
                            int col = v + j * stride_w;
                            int index = row * w + col;
                            ret[retID] = input[index];
                            retID++;
                        }
                    }
                }
            }
        }
    }

    int kernel_size = kernel_w * kernel_h;
    int out_size = outw * outh;

    // GEMM dimensions: C[M x N] = A[M x K] * B[K x N]
    // int M = outch; // outch
    int N = outw * outh;               // outsize or out stride
    int K = kernel_w * kernel_h * inch; // ksize * inch

    // bottom_im2row memory packed 4 x 4: groups of 4 output-pixel rows are
    // interleaved (2 consecutive K values per row at a time) into one channel;
    // each leftover row gets its own channel at index i/4 + i%4.
    Mat bottom_tm(4 * kernel_size, inch, out_size / 4 + out_size % 4, (size_t)1u, opt.workspace_allocator);
    {
        int nn_size = out_size >> 2;
        int remain_size_start = nn_size << 2;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_size; ii++)
        {
            int i = ii * 4;

            const signed char* img0 = bottom_im2row.row<signed char>(i);
            const signed char* img1 = bottom_im2row.row<signed char>(i + 1);
            const signed char* img2 = bottom_im2row.row<signed char>(i + 2);
            const signed char* img3 = bottom_im2row.row<signed char>(i + 3);

            signed char* tmpptr = bottom_tm.channel(i / 4);

            int q = 0;
            // interleave pairs of K values from the 4 rows: [r0k0 r0k1 r1k0 r1k1 ...]
            for (; q + 1 < inch * kernel_size; q = q + 2)
            {
                tmpptr[0] = img0[0];
                tmpptr[1] = img0[1];
                tmpptr[2] = img1[0];
                tmpptr[3] = img1[1];
                tmpptr[4] = img2[0];
                tmpptr[5] = img2[1];
                tmpptr[6] = img3[0];
                tmpptr[7] = img3[1];

                tmpptr += 8;
                img0 += 2;
                img1 += 2;
                img2 += 2;
                img3 += 2;
            }

            // odd K tail: one value from each of the 4 rows
            for (; q < inch * kernel_size; q++)
            {
                tmpptr[0] = img0[0];
                tmpptr[1] = img1[0];
                tmpptr[2] = img2[0];
                tmpptr[3] = img3[0];

                tmpptr += 4;
                img0 += 1;
                img1 += 1;
                img2 += 1;
                img3 += 1;
            }
        }

        // leftover output pixels (< 4): copied sequentially, one per channel
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = remain_size_start; i < out_size; i++)
        {
            const signed char* img0 = bottom_im2row.row<signed char>(i);

            signed char* tmpptr = bottom_tm.channel(i / 4 + i % 4);

            int q = 0;
            for (; q + 1 < inch * kernel_size; q = q + 2)
            {
                tmpptr[0] = img0[0];
                tmpptr[1] = img0[1];

                tmpptr += 2;
                img0 += 2;
            }

            for (; q < inch * kernel_size; q++)
            {
                tmpptr[0] = img0[0];

                tmpptr += 1;
                img0 += 1;
            }
        }
    }

    // kernel memory packed 4 x 4: same interleaving as bottom_tm, but over
    // groups of 4 output channels; leftover channels at index p/4 + p%4.
    Mat kernel_tm(4 * kernel_size, inch, outch / 4 + outch % 4, (size_t)1u, opt.workspace_allocator);
    {
        int nn_outch = 0;
        int remain_outch_start = 0;

        nn_outch = outch >> 2;
        remain_outch_start = nn_outch << 2;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int pp = 0; pp < nn_outch; pp++)
        {
            int p = pp * 4;

            const signed char* k0 = kernel + (p + 0) * inch * kernel_size;
            const signed char* k1 = kernel + (p + 1) * inch * kernel_size;
            const signed char* k2 = kernel + (p + 2) * inch * kernel_size;
            const signed char* k3 = kernel + (p + 3) * inch * kernel_size;

            signed char* ktmp = kernel_tm.channel(p / 4);

            int q = 0;
            for (; q + 1 < inch * kernel_size; q += 2)
            {
                ktmp[0] = k0[0];
                ktmp[1] = k0[1];
                ktmp[2] = k1[0];
                ktmp[3] = k1[1];
                ktmp[4] = k2[0];
                ktmp[5] = k2[1];
                ktmp[6] = k3[0];
                ktmp[7] = k3[1];

                ktmp += 8;

                k0 += 2;
                k1 += 2;
                k2 += 2;
                k3 += 2;
            }

            for (; q < inch * kernel_size; q++)
            {
                ktmp[0] = k0[0];
                ktmp[1] = k1[0];
                ktmp[2] = k2[0];
                ktmp[3] = k3[0];
                ktmp += 4;

                k0 += 1;
                k1 += 1;
                k2 += 1;
                k3 += 1;
            }
        }

        // leftover output channels: weights copied linearly
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = remain_outch_start; p < outch; p++)
        {
            const signed char* k0 = kernel + (p + 0) * inch * kernel_size;

            signed char* ktmp = kernel_tm.channel(p / 4 + p % 4);

            int q = 0;
            for (; q + 1 < inch * kernel_size; q = q + 2)
            {
                ktmp[0] = k0[0];
                ktmp[1] = k0[1];
                ktmp += 2;
                k0 += 2;
            }

            for (; q < inch * kernel_size; q++)
            {
                ktmp[0] = k0[0];
                ktmp++;
                k0++;
            }
        }
    }

    // 4x4 GEMM micro-kernel: 4 output channels x 4 output pixels per tile,
    // K unrolled by 2 to match the interleaved packing above.
    // sgemm(int M, int N, int K, float* A, float* B, float* C)
    {
        // int M = outch;                      // outch
        // int N = outw * outh;                // outsize or out stride
        // int L = kernel_w * kernel_h * inch; // ksize * inch

        int nn_outch = 0;
        int remain_outch_start = 0;

        nn_outch = outch >> 2;
        remain_outch_start = nn_outch << 2;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int pp = 0; pp < nn_outch; pp++)
        {
            int i = pp * 4;

            int* output0 = top_blob.channel(i);
            int* output1 = top_blob.channel(i + 1);
            int* output2 = top_blob.channel(i + 2);
            int* output3 = top_blob.channel(i + 3);

            int j = 0;
            for (; j + 3 < N; j = j + 4)
            {
                signed char* vb = bottom_tm.channel(j / 4);
                signed char* va = kernel_tm.channel(i / 4);

                // 4x4 accumulator tile: sumX[n] = channel i+X, pixel j+n
                int sum0[4] = {0};
                int sum1[4] = {0};
                int sum2[4] = {0};
                int sum3[4] = {0};

                int k = 0;

                for (; k + 1 < K; k = k + 2)
                {
                    for (int n = 0; n < 4; n++)
                    {
                        sum0[n] += (int)va[0] * vb[2 * n]; // k0
                        sum0[n] += (int)va[1] * vb[2 * n + 1];

                        sum1[n] += (int)va[2] * vb[2 * n]; // k1
                        sum1[n] += (int)va[3] * vb[2 * n + 1];

                        sum2[n] += (int)va[4] * vb[2 * n]; // k2
                        sum2[n] += (int)va[5] * vb[2 * n + 1];

                        sum3[n] += (int)va[6] * vb[2 * n]; // k3
                        sum3[n] += (int)va[7] * vb[2 * n + 1];
                    }

                    va += 8;
                    vb += 8;
                }

                for (; k < K; k++)
                {
                    for (int n = 0; n < 4; n++)
                    {
                        sum0[n] += (int)va[0] * vb[n];
                        sum1[n] += (int)va[1] * vb[n];
                        sum2[n] += (int)va[2] * vb[n];
                        sum3[n] += (int)va[3] * vb[n];
                    }

                    va += 4;
                    vb += 4;
                }

                for (int n = 0; n < 4; n++)
                {
                    output0[n] = sum0[n];
                    output1[n] = sum1[n];
                    output2[n] = sum2[n];
                    output3[n] = sum3[n];
                }
                output0 += 4;
                output1 += 4;
                output2 += 4;
                output3 += 4;
            }

            // leftover pixels for this group of 4 output channels
            for (; j < N; j++)
            {
                int sum0 = 0;
                int sum1 = 0;
                int sum2 = 0;
                int sum3 = 0;

                signed char* vb = bottom_tm.channel(j / 4 + j % 4);
                signed char* va = kernel_tm.channel(i / 4);

                int k = 0;

                for (; k + 1 < K; k = k + 2)
                {
                    sum0 += (int)va[0] * vb[0];
                    sum0 += (int)va[1] * vb[1];

                    sum1 += (int)va[2] * vb[0];
                    sum1 += (int)va[3] * vb[1];

                    sum2 += (int)va[4] * vb[0];
                    sum2 += (int)va[5] * vb[1];

                    sum3 += (int)va[6] * vb[0];
                    sum3 += (int)va[7] * vb[1];

                    va += 8;
                    vb += 2;
                }

                for (; k < K; k++)
                {
                    sum0 += (int)va[0] * vb[0];
                    sum1 += (int)va[1] * vb[0];
                    sum2 += (int)va[2] * vb[0];
                    sum3 += (int)va[3] * vb[0];

                    va += 4;
                    vb += 1;
                }

                output0[0] = sum0;
                output1[0] = sum1;
                output2[0] = sum2;
                output3[0] = sum3;

                output0++;
                output1++;
                output2++;
                output3++;
            }
        }

        // leftover output channels: 1 channel x 4 pixels, then 1x1
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = remain_outch_start; i < outch; i++)
        {
            int* output = top_blob.channel(i);

            int j = 0;
            for (; j + 3 < N; j = j + 4)
            {
                signed char* vb = bottom_tm.channel(j / 4);
                signed char* va = kernel_tm.channel(i / 4 + i % 4);
                int sum[4] = {0};

                int k = 0;
                for (; k + 1 < K; k = k + 2)
                {
                    for (int n = 0; n < 4; n++)
                    {
                        sum[n] += (int)va[0] * vb[2 * n];
                        sum[n] += (int)va[1] * vb[2 * n + 1];
                    }
                    va += 2;
                    vb += 8;
                }

                for (; k < K; k++)
                {
                    for (int n = 0; n < 4; n++)
                    {
                        sum[n] += (int)va[0] * vb[n];
                    }
                    va += 1;
                    vb += 4;
                }

                for (int n = 0; n < 4; n++)
                {
                    output[n] = sum[n];
                }
                output += 4;
            }

            for (; j < N; j++)
            {
                int sum = 0;

                signed char* vb = bottom_tm.channel(j / 4 + j % 4);
                signed char* va = kernel_tm.channel(i / 4 + i % 4);

                for (int k = 0; k < K; k++)
                {
                    sum += (int)va[0] * vb[0];

                    va += 1;
                    vb += 1;
                }
                output[0] = sum;

                output++;
            }
        }
    }

    // Reference (unpacked) implementation kept for documentation:
    // // sgemm(int M, int N, int K, float* A, float* B, float* C)
    // {
    //     for (int i=0; i<M; i++)
    //     {
    //         int* output = top_blob.channel(i);

    //         for (int j=0; j<N; j++)
    //         {
    //             int sum = 0;

    //             signed char* vb = (signed char*)bottom_im2row + K * j;
    //             const signed char* va = kernel + K * i;

    //             for (int k=0; k<K; k++)
    //             {
    //                 sum += (int)va[0] * vb[0];

    //                 va += 1;
    //                 vb += 1;
    //             }
    //             output[0] = sum;

    //             output++;
    //         }
    //     }
    // }
}
457
conv_im2col_sgemm_int8_dequant_sse(const Mat & bottom_blob,Mat & top_blob,const Mat & _kernel,const int kernel_w,const int kernel_h,const int stride_w,const int stride_h,const Mat & _bias,std::vector<float> scale_dequant,const Option & opt)458 static void conv_im2col_sgemm_int8_dequant_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel,
459 const int kernel_w, const int kernel_h, const int stride_w, const int stride_h, const Mat& _bias, std::vector<float> scale_dequant, const Option& opt)
460 {
461 int w = bottom_blob.w;
462 int inch = bottom_blob.c;
463
464 int outw = top_blob.w;
465 int outh = top_blob.h;
466 int outch = top_blob.c;
467
468 const signed char* kernel = _kernel;
469 const float* bias = _bias;
470
471 // im2row
472 Mat bottom_im2row(kernel_h * kernel_w * inch, outw * outh, 1UL, opt.workspace_allocator);
473 {
474 signed char* ret = (signed char*)bottom_im2row;
475 int retID = 0;
476
477 for (int i = 0; i < outh; i++)
478 {
479 for (int j = 0; j < outw; j++)
480 {
481 for (int p = 0; p < inch; p++)
482 {
483 const signed char* input = bottom_blob.channel(p);
484 for (int u = 0; u < kernel_h; u++)
485 {
486 for (int v = 0; v < kernel_w; v++)
487 {
488 int row = u + i * stride_h;
489 int col = v + j * stride_w;
490 int index = row * w + col;
491 ret[retID] = input[index];
492 retID++;
493 }
494 }
495 }
496 }
497 }
498 }
499
500 int kernel_size = kernel_w * kernel_h;
501 int out_size = outw * outh;
502
503 // int M = outch; // outch
504 int N = outw * outh; // outsize or out stride
505 int K = kernel_w * kernel_h * inch; // ksize * inch
506
507 // bottom_im2row memory packed 4 x 4
508 Mat bottom_tm(4 * kernel_size, inch, out_size / 4 + out_size % 4, (size_t)1u, opt.workspace_allocator);
509 {
510 int nn_size = out_size >> 2;
511 int remain_size_start = nn_size << 2;
512
513 #pragma omp parallel for num_threads(opt.num_threads)
514 for (int ii = 0; ii < nn_size; ii++)
515 {
516 int i = ii * 4;
517
518 const signed char* img0 = bottom_im2row.row<signed char>(i);
519 const signed char* img1 = bottom_im2row.row<signed char>(i + 1);
520 const signed char* img2 = bottom_im2row.row<signed char>(i + 2);
521 const signed char* img3 = bottom_im2row.row<signed char>(i + 3);
522
523 signed char* tmpptr = bottom_tm.channel(i / 4);
524
525 int q = 0;
526 for (; q + 1 < inch * kernel_size; q = q + 2)
527 {
528 tmpptr[0] = img0[0];
529 tmpptr[1] = img0[1];
530 tmpptr[2] = img1[0];
531 tmpptr[3] = img1[1];
532 tmpptr[4] = img2[0];
533 tmpptr[5] = img2[1];
534 tmpptr[6] = img3[0];
535 tmpptr[7] = img3[1];
536
537 tmpptr += 8;
538 img0 += 2;
539 img1 += 2;
540 img2 += 2;
541 img3 += 2;
542 }
543
544 for (; q < inch * kernel_size; q++)
545 {
546 tmpptr[0] = img0[0];
547 tmpptr[1] = img1[0];
548 tmpptr[2] = img2[0];
549 tmpptr[3] = img3[0];
550
551 tmpptr += 4;
552 img0 += 1;
553 img1 += 1;
554 img2 += 1;
555 img3 += 1;
556 }
557 }
558
559 #pragma omp parallel for num_threads(opt.num_threads)
560 for (int i = remain_size_start; i < out_size; i++)
561 {
562 const signed char* img0 = bottom_im2row.row<signed char>(i);
563
564 signed char* tmpptr = bottom_tm.channel(i / 4 + i % 4);
565
566 int q = 0;
567 for (; q + 1 < inch * kernel_size; q = q + 2)
568 {
569 tmpptr[0] = img0[0];
570 tmpptr[1] = img0[1];
571
572 tmpptr += 2;
573 img0 += 2;
574 }
575
576 for (; q < inch * kernel_size; q++)
577 {
578 tmpptr[0] = img0[0];
579
580 tmpptr += 1;
581 img0 += 1;
582 }
583 }
584 }
585
586 // kernel memory packed 4 x 4
587 Mat kernel_tm(4 * kernel_size, inch, outch / 4 + outch % 4, (size_t)1u, opt.workspace_allocator);
588 {
589 int nn_outch = 0;
590 int remain_outch_start = 0;
591
592 nn_outch = outch >> 2;
593 remain_outch_start = nn_outch << 2;
594
595 #pragma omp parallel for num_threads(opt.num_threads)
596 for (int pp = 0; pp < nn_outch; pp++)
597 {
598 int p = pp * 4;
599
600 const signed char* k0 = kernel + (p + 0) * inch * kernel_size;
601 const signed char* k1 = kernel + (p + 1) * inch * kernel_size;
602 const signed char* k2 = kernel + (p + 2) * inch * kernel_size;
603 const signed char* k3 = kernel + (p + 3) * inch * kernel_size;
604
605 signed char* ktmp = kernel_tm.channel(p / 4);
606
607 int q = 0;
608 for (; q + 1 < inch * kernel_size; q += 2)
609 {
610 ktmp[0] = k0[0];
611 ktmp[1] = k0[1];
612 ktmp[2] = k1[0];
613 ktmp[3] = k1[1];
614 ktmp[4] = k2[0];
615 ktmp[5] = k2[1];
616 ktmp[6] = k3[0];
617 ktmp[7] = k3[1];
618
619 ktmp += 8;
620
621 k0 += 2;
622 k1 += 2;
623 k2 += 2;
624 k3 += 2;
625 }
626
627 for (; q < inch * kernel_size; q++)
628 {
629 ktmp[0] = k0[0];
630 ktmp[1] = k1[0];
631 ktmp[2] = k2[0];
632 ktmp[3] = k3[0];
633 ktmp += 4;
634
635 k0 += 1;
636 k1 += 1;
637 k2 += 1;
638 k3 += 1;
639 }
640 }
641
642 #pragma omp parallel for num_threads(opt.num_threads)
643 for (int p = remain_outch_start; p < outch; p++)
644 {
645 const signed char* k0 = kernel + (p + 0) * inch * kernel_size;
646
647 signed char* ktmp = kernel_tm.channel(p / 4 + p % 4);
648
649 int q = 0;
650 for (; q + 1 < inch * kernel_size; q = q + 2)
651 {
652 ktmp[0] = k0[0];
653 ktmp[1] = k0[1];
654 ktmp += 2;
655 k0 += 2;
656 }
657
658 for (; q < inch * kernel_size; q++)
659 {
660 ktmp[0] = k0[0];
661 ktmp++;
662 k0++;
663 }
664 }
665 }
666
667 // 4x4
668 // sgemm(int M, int N, int K, float* A, float* B, float* C)
669 {
670 // int M = outch; // outch
671 // int N = outw * outh; // outsize or out stride
672 // int L = kernel_w * kernel_h * inch; // ksize * inch
673
674 int nn_outch = 0;
675 int remain_outch_start = 0;
676
677 nn_outch = outch >> 2;
678 remain_outch_start = nn_outch << 2;
679
680 #pragma omp parallel for num_threads(opt.num_threads)
681 for (int pp = 0; pp < nn_outch; pp++)
682 {
683 int i = pp * 4;
684
685 const float bias0 = bias ? bias[i] : 0.f;
686 const float bias1 = bias ? bias[i + 1] : 0.f;
687 const float bias2 = bias ? bias[i + 2] : 0.f;
688 const float bias3 = bias ? bias[i + 3] : 0.f;
689
690 const float scale_dequant0 = scale_dequant[i];
691 const float scale_dequant1 = scale_dequant[i + 1];
692 const float scale_dequant2 = scale_dequant[i + 2];
693 const float scale_dequant3 = scale_dequant[i + 3];
694
695 float* output0 = top_blob.channel(i);
696 float* output1 = top_blob.channel(i + 1);
697 float* output2 = top_blob.channel(i + 2);
698 float* output3 = top_blob.channel(i + 3);
699
700 int j = 0;
701 for (; j + 3 < N; j = j + 4)
702 {
703 signed char* vb = bottom_tm.channel(j / 4);
704 signed char* va = kernel_tm.channel(i / 4);
705
706 int sum0[4] = {0};
707 int sum1[4] = {0};
708 int sum2[4] = {0};
709 int sum3[4] = {0};
710
711 int k = 0;
712
713 for (; k + 1 < K; k = k + 2)
714 {
715 for (int n = 0; n < 4; n++)
716 {
717 sum0[n] += (int)va[0] * vb[2 * n]; // k0
718 sum0[n] += (int)va[1] * vb[2 * n + 1];
719
720 sum1[n] += (int)va[2] * vb[2 * n]; // k1
721 sum1[n] += (int)va[3] * vb[2 * n + 1];
722
723 sum2[n] += (int)va[4] * vb[2 * n]; // k2
724 sum2[n] += (int)va[5] * vb[2 * n + 1];
725
726 sum3[n] += (int)va[6] * vb[2 * n]; // k3
727 sum3[n] += (int)va[7] * vb[2 * n + 1];
728 }
729
730 va += 8;
731 vb += 8;
732 }
733
734 for (; k < K; k++)
735 {
736 for (int n = 0; n < 4; n++)
737 {
738 sum0[n] += (int)va[0] * vb[n];
739 sum1[n] += (int)va[1] * vb[n];
740 sum2[n] += (int)va[2] * vb[n];
741 sum3[n] += (int)va[3] * vb[n];
742 }
743
744 va += 4;
745 vb += 4;
746 }
747
748 for (int n = 0; n < 4; n++)
749 {
750 output0[n] = (float)sum0[n] * scale_dequant0 + bias0;
751 output1[n] = (float)sum1[n] * scale_dequant1 + bias1;
752 output2[n] = (float)sum2[n] * scale_dequant2 + bias2;
753 output3[n] = (float)sum3[n] * scale_dequant3 + bias3;
754 }
755 output0 += 4;
756 output1 += 4;
757 output2 += 4;
758 output3 += 4;
759 }
760
761 for (; j < N; j++)
762 {
763 int sum0 = 0;
764 int sum1 = 0;
765 int sum2 = 0;
766 int sum3 = 0;
767
768 signed char* vb = bottom_tm.channel(j / 4 + j % 4);
769 signed char* va = kernel_tm.channel(i / 4);
770
771 int k = 0;
772
773 for (; k + 1 < K; k = k + 2)
774 {
775 sum0 += (int)va[0] * vb[0];
776 sum0 += (int)va[1] * vb[1];
777
778 sum1 += (int)va[2] * vb[0];
779 sum1 += (int)va[3] * vb[1];
780
781 sum2 += (int)va[4] * vb[0];
782 sum2 += (int)va[5] * vb[1];
783
784 sum3 += (int)va[6] * vb[0];
785 sum3 += (int)va[7] * vb[1];
786
787 va += 8;
788 vb += 2;
789 }
790
791 for (; k < K; k++)
792 {
793 sum0 += (int)va[0] * vb[0];
794 sum1 += (int)va[1] * vb[0];
795 sum2 += (int)va[2] * vb[0];
796 sum3 += (int)va[3] * vb[0];
797
798 va += 4;
799 vb += 1;
800 }
801
802 output0[0] = (float)sum0 * scale_dequant0 + bias0;
803 output1[0] = (float)sum1 * scale_dequant1 + bias1;
804 output2[0] = (float)sum2 * scale_dequant2 + bias2;
805 output3[0] = (float)sum3 * scale_dequant3 + bias3;
806
807 output0++;
808 output1++;
809 output2++;
810 output3++;
811 }
812 }
813
814 #pragma omp parallel for num_threads(opt.num_threads)
815 for (int i = remain_outch_start; i < outch; i++)
816 {
817 float* output = top_blob.channel(i);
818
819 const float bias0 = bias ? bias[i] : 0.f;
820 const float scale_dequant0 = scale_dequant[i];
821
822 int j = 0;
823 for (; j + 3 < N; j = j + 4)
824 {
825 signed char* vb = bottom_tm.channel(j / 4);
826 signed char* va = kernel_tm.channel(i / 4 + i % 4);
827 int sum[4] = {0};
828
829 int k = 0;
830 for (; k + 1 < K; k = k + 2)
831 {
832 for (int n = 0; n < 4; n++)
833 {
834 sum[n] += (int)va[0] * vb[2 * n];
835 sum[n] += (int)va[1] * vb[2 * n + 1];
836 }
837 va += 2;
838 vb += 8;
839 }
840
841 for (; k < K; k++)
842 {
843 for (int n = 0; n < 4; n++)
844 {
845 sum[n] += (int)va[0] * vb[n];
846 }
847 va += 1;
848 vb += 4;
849 }
850
851 for (int n = 0; n < 4; n++)
852 {
853 output[n] = (float)sum[n] * scale_dequant0 + bias0;
854 }
855 output += 4;
856 }
857
858 for (; j < N; j++)
859 {
860 int sum = 0;
861
862 signed char* vb = bottom_tm.channel(j / 4 + j % 4);
863 signed char* va = kernel_tm.channel(i / 4 + i % 4);
864
865 for (int k = 0; k < K; k++)
866 {
867 sum += (int)va[0] * vb[0];
868
869 va += 1;
870 vb += 1;
871 }
872 output[0] = (float)sum * scale_dequant0 + bias0;
873
874 output++;
875 }
876 }
877 }
878
879 // // sgemm(int M, int N, int K, float* A, float* B, float* C)
880 // {
881 // for (int i=0; i<M; i++)
882 // {
883 // int* output = top_blob.channel(i);
884
885 // for (int j=0; j<N; j++)
886 // {
887 // int sum = 0;
888
889 // signed char* vb = (signed char*)bottom_im2row + K * j;
890 // const signed char* va = kernel + K * i;
891
892 // for (int k=0; k<K; k++)
893 // {
894 // sum += (int)va[0] * vb[0];
895
896 // va += 1;
897 // vb += 1;
898 // }
899 // output[0] = sum;
900
901 // output++;
902 // }
903 // }
904 // }
905 }
906
conv_im2col_sgemm_int8_requant_sse(const Mat & bottom_blob,Mat & top_blob,const Mat & _kernel,const int kernel_w,const int kernel_h,const int stride_w,const int stride_h,const Mat & _bias,std::vector<float> scale_requant,const Option & opt)907 static void conv_im2col_sgemm_int8_requant_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel,
908 const int kernel_w, const int kernel_h, const int stride_w, const int stride_h, const Mat& _bias, std::vector<float> scale_requant, const Option& opt)
909 {
910 int w = bottom_blob.w;
911 int inch = bottom_blob.c;
912
913 int outw = top_blob.w;
914 int outh = top_blob.h;
915 int outch = top_blob.c;
916
917 const signed char* kernel = _kernel;
918 const float* bias = _bias;
919
920 // im2row
921 Mat bottom_im2row(kernel_h * kernel_w * inch, outw * outh, 1UL, opt.workspace_allocator);
922 {
923 signed char* ret = (signed char*)bottom_im2row;
924 int retID = 0;
925
926 for (int i = 0; i < outh; i++)
927 {
928 for (int j = 0; j < outw; j++)
929 {
930 for (int p = 0; p < inch; p++)
931 {
932 const signed char* input = bottom_blob.channel(p);
933 for (int u = 0; u < kernel_h; u++)
934 {
935 for (int v = 0; v < kernel_w; v++)
936 {
937 int row = u + i * stride_h;
938 int col = v + j * stride_w;
939 int index = row * w + col;
940 ret[retID] = input[index];
941 retID++;
942 }
943 }
944 }
945 }
946 }
947 }
948
949 int kernel_size = kernel_w * kernel_h;
950 int out_size = outw * outh;
951
952 // int M = outch; // outch
953 int N = outw * outh; // outsize or out stride
954 int K = kernel_w * kernel_h * inch; // ksize * inch
955
956 // bottom_im2row memory packed 4 x 4
957 Mat bottom_tm(4 * kernel_size, inch, out_size / 4 + out_size % 4, (size_t)1u, opt.workspace_allocator);
958 {
959 int nn_size = out_size >> 2;
960 int remain_size_start = nn_size << 2;
961
962 #pragma omp parallel for num_threads(opt.num_threads)
963 for (int ii = 0; ii < nn_size; ii++)
964 {
965 int i = ii * 4;
966
967 const signed char* img0 = bottom_im2row.row<signed char>(i);
968 const signed char* img1 = bottom_im2row.row<signed char>(i + 1);
969 const signed char* img2 = bottom_im2row.row<signed char>(i + 2);
970 const signed char* img3 = bottom_im2row.row<signed char>(i + 3);
971
972 signed char* tmpptr = bottom_tm.channel(i / 4);
973
974 int q = 0;
975 for (; q + 1 < inch * kernel_size; q = q + 2)
976 {
977 tmpptr[0] = img0[0];
978 tmpptr[1] = img0[1];
979 tmpptr[2] = img1[0];
980 tmpptr[3] = img1[1];
981 tmpptr[4] = img2[0];
982 tmpptr[5] = img2[1];
983 tmpptr[6] = img3[0];
984 tmpptr[7] = img3[1];
985
986 tmpptr += 8;
987 img0 += 2;
988 img1 += 2;
989 img2 += 2;
990 img3 += 2;
991 }
992
993 for (; q < inch * kernel_size; q++)
994 {
995 tmpptr[0] = img0[0];
996 tmpptr[1] = img1[0];
997 tmpptr[2] = img2[0];
998 tmpptr[3] = img3[0];
999
1000 tmpptr += 4;
1001 img0 += 1;
1002 img1 += 1;
1003 img2 += 1;
1004 img3 += 1;
1005 }
1006 }
1007
1008 #pragma omp parallel for num_threads(opt.num_threads)
1009 for (int i = remain_size_start; i < out_size; i++)
1010 {
1011 const signed char* img0 = bottom_im2row.row<signed char>(i);
1012
1013 signed char* tmpptr = bottom_tm.channel(i / 4 + i % 4);
1014
1015 int q = 0;
1016 for (; q + 1 < inch * kernel_size; q = q + 2)
1017 {
1018 tmpptr[0] = img0[0];
1019 tmpptr[1] = img0[1];
1020
1021 tmpptr += 2;
1022 img0 += 2;
1023 }
1024
1025 for (; q < inch * kernel_size; q++)
1026 {
1027 tmpptr[0] = img0[0];
1028
1029 tmpptr += 1;
1030 img0 += 1;
1031 }
1032 }
1033 }
1034
1035 // kernel memory packed 4 x 4
1036 Mat kernel_tm(4 * kernel_size, inch, outch / 4 + outch % 4, (size_t)1u, opt.workspace_allocator);
1037 {
1038 int nn_outch = 0;
1039 int remain_outch_start = 0;
1040
1041 nn_outch = outch >> 2;
1042 remain_outch_start = nn_outch << 2;
1043
1044 #pragma omp parallel for num_threads(opt.num_threads)
1045 for (int pp = 0; pp < nn_outch; pp++)
1046 {
1047 int p = pp * 4;
1048
1049 const signed char* k0 = kernel + (p + 0) * inch * kernel_size;
1050 const signed char* k1 = kernel + (p + 1) * inch * kernel_size;
1051 const signed char* k2 = kernel + (p + 2) * inch * kernel_size;
1052 const signed char* k3 = kernel + (p + 3) * inch * kernel_size;
1053
1054 signed char* ktmp = kernel_tm.channel(p / 4);
1055
1056 int q = 0;
1057 for (; q + 1 < inch * kernel_size; q += 2)
1058 {
1059 ktmp[0] = k0[0];
1060 ktmp[1] = k0[1];
1061 ktmp[2] = k1[0];
1062 ktmp[3] = k1[1];
1063 ktmp[4] = k2[0];
1064 ktmp[5] = k2[1];
1065 ktmp[6] = k3[0];
1066 ktmp[7] = k3[1];
1067
1068 ktmp += 8;
1069
1070 k0 += 2;
1071 k1 += 2;
1072 k2 += 2;
1073 k3 += 2;
1074 }
1075
1076 for (; q < inch * kernel_size; q++)
1077 {
1078 ktmp[0] = k0[0];
1079 ktmp[1] = k1[0];
1080 ktmp[2] = k2[0];
1081 ktmp[3] = k3[0];
1082 ktmp += 4;
1083
1084 k0 += 1;
1085 k1 += 1;
1086 k2 += 1;
1087 k3 += 1;
1088 }
1089 }
1090
1091 #pragma omp parallel for num_threads(opt.num_threads)
1092 for (int p = remain_outch_start; p < outch; p++)
1093 {
1094 const signed char* k0 = kernel + (p + 0) * inch * kernel_size;
1095
1096 signed char* ktmp = kernel_tm.channel(p / 4 + p % 4);
1097
1098 int q = 0;
1099 for (; q + 1 < inch * kernel_size; q = q + 2)
1100 {
1101 ktmp[0] = k0[0];
1102 ktmp[1] = k0[1];
1103 ktmp += 2;
1104 k0 += 2;
1105 }
1106
1107 for (; q < inch * kernel_size; q++)
1108 {
1109 ktmp[0] = k0[0];
1110 ktmp++;
1111 k0++;
1112 }
1113 }
1114 }
1115
1116 // 4x4
1117 // sgemm(int M, int N, int K, float* A, float* B, float* C)
1118 {
1119 // int M = outch; // outch
1120 // int N = outw * outh; // outsize or out stride
1121 // int L = kernel_w * kernel_h * inch; // ksize * inch
1122
1123 int nn_outch = 0;
1124 int remain_outch_start = 0;
1125
1126 nn_outch = outch >> 2;
1127 remain_outch_start = nn_outch << 2;
1128
1129 #pragma omp parallel for num_threads(opt.num_threads)
1130 for (int pp = 0; pp < nn_outch; pp++)
1131 {
1132 int i = pp * 4;
1133
1134 signed char* output0 = top_blob.channel(i);
1135 signed char* output1 = top_blob.channel(i + 1);
1136 signed char* output2 = top_blob.channel(i + 2);
1137 signed char* output3 = top_blob.channel(i + 3);
1138
1139 const float bias0 = bias ? bias[i] : 0.f;
1140 const float bias1 = bias ? bias[i + 1] : 0.f;
1141 const float bias2 = bias ? bias[i + 2] : 0.f;
1142 const float bias3 = bias ? bias[i + 3] : 0.f;
1143
1144 const float scale_requant_in0 = scale_requant[2 * i];
1145 const float scale_requant_out0 = scale_requant[2 * i + 1];
1146 const float scale_requant_in1 = scale_requant[2 * (i + 1)];
1147 const float scale_requant_out1 = scale_requant[2 * (i + 1) + 1];
1148 const float scale_requant_in2 = scale_requant[2 * (i + 2)];
1149 const float scale_requant_out2 = scale_requant[2 * (i + 2) + 1];
1150 const float scale_requant_in3 = scale_requant[2 * (i + 3)];
1151 const float scale_requant_out3 = scale_requant[2 * (i + 3) + 1];
1152
1153 int j = 0;
1154 for (; j + 3 < N; j = j + 4)
1155 {
1156 signed char* vb = bottom_tm.channel(j / 4);
1157 signed char* va = kernel_tm.channel(i / 4);
1158
1159 int sum0[4] = {0};
1160 int sum1[4] = {0};
1161 int sum2[4] = {0};
1162 int sum3[4] = {0};
1163
1164 int k = 0;
1165
1166 for (; k + 1 < K; k = k + 2)
1167 {
1168 for (int n = 0; n < 4; n++)
1169 {
1170 sum0[n] += (int)va[0] * vb[2 * n]; // k0
1171 sum0[n] += (int)va[1] * vb[2 * n + 1];
1172
1173 sum1[n] += (int)va[2] * vb[2 * n]; // k1
1174 sum1[n] += (int)va[3] * vb[2 * n + 1];
1175
1176 sum2[n] += (int)va[4] * vb[2 * n]; // k2
1177 sum2[n] += (int)va[5] * vb[2 * n + 1];
1178
1179 sum3[n] += (int)va[6] * vb[2 * n]; // k3
1180 sum3[n] += (int)va[7] * vb[2 * n + 1];
1181 }
1182
1183 va += 8;
1184 vb += 8;
1185 }
1186
1187 for (; k < K; k++)
1188 {
1189 for (int n = 0; n < 4; n++)
1190 {
1191 sum0[n] += (int)va[0] * vb[n];
1192 sum1[n] += (int)va[1] * vb[n];
1193 sum2[n] += (int)va[2] * vb[n];
1194 sum3[n] += (int)va[3] * vb[n];
1195 }
1196
1197 va += 4;
1198 vb += 4;
1199 }
1200
1201 for (int n = 0; n < 4; n++)
1202 {
1203 output0[n] = float2int8(((float)sum0[n] * scale_requant_in0 + bias0) * scale_requant_out0);
1204 output1[n] = float2int8(((float)sum1[n] * scale_requant_in1 + bias1) * scale_requant_out1);
1205 output2[n] = float2int8(((float)sum2[n] * scale_requant_in2 + bias2) * scale_requant_out2);
1206 output3[n] = float2int8(((float)sum3[n] * scale_requant_in3 + bias3) * scale_requant_out3);
1207 }
1208 output0 += 4;
1209 output1 += 4;
1210 output2 += 4;
1211 output3 += 4;
1212 }
1213
1214 for (; j < N; j++)
1215 {
1216 int sum0 = 0;
1217 int sum1 = 0;
1218 int sum2 = 0;
1219 int sum3 = 0;
1220
1221 signed char* vb = bottom_tm.channel(j / 4 + j % 4);
1222 signed char* va = kernel_tm.channel(i / 4);
1223
1224 int k = 0;
1225
1226 for (; k + 1 < K; k = k + 2)
1227 {
1228 sum0 += (int)va[0] * vb[0];
1229 sum0 += (int)va[1] * vb[1];
1230
1231 sum1 += (int)va[2] * vb[0];
1232 sum1 += (int)va[3] * vb[1];
1233
1234 sum2 += (int)va[4] * vb[0];
1235 sum2 += (int)va[5] * vb[1];
1236
1237 sum3 += (int)va[6] * vb[0];
1238 sum3 += (int)va[7] * vb[1];
1239
1240 va += 8;
1241 vb += 2;
1242 }
1243
1244 for (; k < K; k++)
1245 {
1246 sum0 += (int)va[0] * vb[0];
1247 sum1 += (int)va[1] * vb[0];
1248 sum2 += (int)va[2] * vb[0];
1249 sum3 += (int)va[3] * vb[0];
1250
1251 va += 4;
1252 vb += 1;
1253 }
1254
1255 output0[0] = float2int8(((float)sum0 * scale_requant_in0 + bias0) * scale_requant_out0);
1256 output1[0] = float2int8(((float)sum1 * scale_requant_in1 + bias1) * scale_requant_out1);
1257 output2[0] = float2int8(((float)sum2 * scale_requant_in2 + bias2) * scale_requant_out2);
1258 output3[0] = float2int8(((float)sum3 * scale_requant_in3 + bias3) * scale_requant_out3);
1259
1260 output0++;
1261 output1++;
1262 output2++;
1263 output3++;
1264 }
1265 }
1266
1267 #pragma omp parallel for num_threads(opt.num_threads)
1268 for (int i = remain_outch_start; i < outch; i++)
1269 {
1270 signed char* output = top_blob.channel(i);
1271
1272 const float bias0 = bias ? bias[i] : 0.f;
1273
1274 const float scale_requant_in0 = scale_requant[2 * i];
1275 const float scale_requant_out0 = scale_requant[2 * i + 1];
1276
1277 int j = 0;
1278 for (; j + 3 < N; j = j + 4)
1279 {
1280 signed char* vb = bottom_tm.channel(j / 4);
1281 signed char* va = kernel_tm.channel(i / 4 + i % 4);
1282 int sum[4] = {0};
1283
1284 int k = 0;
1285 for (; k + 1 < K; k = k + 2)
1286 {
1287 for (int n = 0; n < 4; n++)
1288 {
1289 sum[n] += (int)va[0] * vb[2 * n];
1290 sum[n] += (int)va[1] * vb[2 * n + 1];
1291 }
1292 va += 2;
1293 vb += 8;
1294 }
1295
1296 for (; k < K; k++)
1297 {
1298 for (int n = 0; n < 4; n++)
1299 {
1300 sum[n] += (int)va[0] * vb[n];
1301 }
1302 va += 1;
1303 vb += 4;
1304 }
1305
1306 for (int n = 0; n < 4; n++)
1307 {
1308 output[n] = float2int8(((float)sum[n] * scale_requant_in0 + bias0) * scale_requant_out0);
1309 }
1310 output += 4;
1311 }
1312
1313 for (; j < N; j++)
1314 {
1315 int sum = 0;
1316
1317 signed char* vb = bottom_tm.channel(j / 4 + j % 4);
1318 signed char* va = kernel_tm.channel(i / 4 + i % 4);
1319
1320 for (int k = 0; k < K; k++)
1321 {
1322 sum += (int)va[0] * vb[0];
1323
1324 va += 1;
1325 vb += 1;
1326 }
1327 output[0] = float2int8(((float)sum * scale_requant_in0 + bias0) * scale_requant_out0);
1328
1329 output++;
1330 }
1331 }
1332 }
1333
1334 // // sgemm(int M, int N, int K, float* A, float* B, float* C)
1335 // {
1336 // for (int i=0; i<M; i++)
1337 // {
1338 // int* output = top_blob.channel(i);
1339
1340 // for (int j=0; j<N; j++)
1341 // {
1342 // int sum = 0;
1343
1344 // signed char* vb = (signed char*)bottom_im2row + K * j;
1345 // const signed char* va = kernel + K * i;
1346
1347 // for (int k=0; k<K; k++)
1348 // {
1349 // sum += (int)va[0] * vb[0];
1350
1351 // va += 1;
1352 // vb += 1;
1353 // }
1354 // output[0] = sum;
1355
1356 // output++;
1357 // }
1358 // }
1359 // }
1360 }
1361