1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14
15 #include "quantize_x86.h"
16
17 #include <math.h>
18
19 #if __SSE2__
20 #include <emmintrin.h>
21 #if __AVX__
22 #include <immintrin.h>
23 #endif // __AVX__
24 #endif // __SSE2__
25
26 #include "x86_usability.h"
27
28 namespace ncnn {
29
Quantize_x86()30 Quantize_x86::Quantize_x86()
31 {
32 #if __SSE2__
33 support_packing = true;
34 #endif // __SSE2__
35 }
36
forward(const Mat & bottom_blob,Mat & top_blob,const Option & opt) const37 int Quantize_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
38 {
39 int dims = bottom_blob.dims;
40 int elempack = bottom_blob.elempack;
41
42 #if __SSE2__
43 #if __AVX__
44 if (elempack == 8)
45 {
46 if (dims == 1)
47 {
48 int w = bottom_blob.w;
49
50 top_blob.create(w, (size_t)8u, 8, opt.blob_allocator);
51 if (top_blob.empty())
52 return -100;
53
54 if (scale_data_size == 1)
55 {
56 __m256 _scale = _mm256_set1_ps(scale_data[0]);
57
58 #pragma omp parallel for num_threads(opt.num_threads)
59 for (int i = 0; i < w; i++)
60 {
61 const float* ptr = (const float*)bottom_blob + i * 8;
62 signed char* outptr = (signed char*)top_blob + i * 8;
63
64 __m256 _v = _mm256_loadu_ps(ptr);
65 _v = _mm256_mul_ps(_v, _scale);
66 *(int64_t*)outptr = float2int8_avx(_v);
67 }
68 }
69 else
70 {
71 #pragma omp parallel for num_threads(opt.num_threads)
72 for (int i = 0; i < w; i++)
73 {
74 const float* ptr = (const float*)bottom_blob + i * 8;
75 signed char* outptr = (signed char*)top_blob + i * 8;
76
77 __m256 _v = _mm256_loadu_ps(ptr);
78 __m256 _scale = _mm256_loadu_ps((const float*)scale_data + i * 8);
79 _v = _mm256_mul_ps(_v, _scale);
80 *(int64_t*)outptr = float2int8_avx(_v);
81 }
82 }
83 }
84
85 if (dims == 2)
86 {
87 int w = bottom_blob.w;
88 int h = bottom_blob.h;
89
90 top_blob.create(w, h, (size_t)8u, 8, opt.blob_allocator);
91 if (top_blob.empty())
92 return -100;
93
94 if (scale_data_size == 1)
95 {
96 __m256 _scale = _mm256_set1_ps(scale_data[0]);
97
98 #pragma omp parallel for num_threads(opt.num_threads)
99 for (int i = 0; i < h; i++)
100 {
101 const float* ptr = bottom_blob.row(i);
102 signed char* outptr = top_blob.row<signed char>(i);
103
104 int j = 0;
105 for (; j + 1 < w; j += 2)
106 {
107 __m256 _v0 = _mm256_loadu_ps(ptr);
108 __m256 _v1 = _mm256_loadu_ps(ptr + 8);
109 _v0 = _mm256_mul_ps(_v0, _scale);
110 _v1 = _mm256_mul_ps(_v1, _scale);
111 __m128i _v = float2int8_avx(_v0, _v1);
112 _mm_storeu_si128((__m128i*)outptr, _v);
113
114 ptr += 16;
115 outptr += 16;
116 }
117 for (; j < w; j++)
118 {
119 __m256 _v = _mm256_loadu_ps(ptr);
120 _v = _mm256_mul_ps(_v, _scale);
121 *(int64_t*)outptr = float2int8_avx(_v);
122
123 ptr += 8;
124 outptr += 8;
125 }
126 }
127 }
128 else
129 {
130 #pragma omp parallel for num_threads(opt.num_threads)
131 for (int i = 0; i < h; i++)
132 {
133 const float* ptr = bottom_blob.row(i);
134 signed char* outptr = top_blob.row<signed char>(i);
135
136 __m256 _scale = _mm256_loadu_ps((const float*)scale_data + i * 8);
137
138 int j = 0;
139 for (; j + 1 < w; j += 2)
140 {
141 __m256 _v0 = _mm256_loadu_ps(ptr);
142 __m256 _v1 = _mm256_loadu_ps(ptr + 8);
143 _v0 = _mm256_mul_ps(_v0, _scale);
144 _v1 = _mm256_mul_ps(_v1, _scale);
145 __m128i _v = float2int8_avx(_v0, _v1);
146 _mm_storeu_si128((__m128i*)outptr, _v);
147
148 ptr += 16;
149 outptr += 16;
150 }
151 for (; j < w; j++)
152 {
153 __m256 _v = _mm256_loadu_ps(ptr);
154 _v = _mm256_mul_ps(_v, _scale);
155 *(int64_t*)outptr = float2int8_avx(_v);
156
157 ptr += 8;
158 outptr += 8;
159 }
160 }
161 }
162 }
163
164 if (dims == 3)
165 {
166 int w = bottom_blob.w;
167 int h = bottom_blob.h;
168 int channels = bottom_blob.c;
169 int size = w * h;
170
171 top_blob.create(w, h, channels, (size_t)8u, 8, opt.blob_allocator);
172 if (top_blob.empty())
173 return -100;
174
175 if (scale_data_size == 1)
176 {
177 __m256 _scale = _mm256_set1_ps(scale_data[0]);
178
179 #pragma omp parallel for num_threads(opt.num_threads)
180 for (int q = 0; q < channels; q++)
181 {
182 const float* ptr = bottom_blob.channel(q);
183 signed char* outptr = top_blob.channel(q);
184
185 int i = 0;
186 for (; i + 1 < size; i += 2)
187 {
188 __m256 _v0 = _mm256_loadu_ps(ptr);
189 __m256 _v1 = _mm256_loadu_ps(ptr + 8);
190 _v0 = _mm256_mul_ps(_v0, _scale);
191 _v1 = _mm256_mul_ps(_v1, _scale);
192 __m128i _v = float2int8_avx(_v0, _v1);
193 _mm_storeu_si128((__m128i*)outptr, _v);
194
195 ptr += 16;
196 outptr += 16;
197 }
198 for (; i < size; i++)
199 {
200 __m256 _v = _mm256_loadu_ps(ptr);
201 _v = _mm256_mul_ps(_v, _scale);
202 *(int64_t*)outptr = float2int8_avx(_v);
203
204 ptr += 8;
205 outptr += 8;
206 }
207 }
208 }
209 else
210 {
211 #pragma omp parallel for num_threads(opt.num_threads)
212 for (int q = 0; q < channels; q++)
213 {
214 const float* ptr = bottom_blob.channel(q);
215 signed char* outptr = top_blob.channel(q);
216
217 __m256 _scale = _mm256_loadu_ps((const float*)scale_data + q * 8);
218
219 int i = 0;
220 for (; i + 1 < size; i += 2)
221 {
222 __m256 _v0 = _mm256_loadu_ps(ptr);
223 __m256 _v1 = _mm256_loadu_ps(ptr + 8);
224 _v0 = _mm256_mul_ps(_v0, _scale);
225 _v1 = _mm256_mul_ps(_v1, _scale);
226 __m128i _v = float2int8_avx(_v0, _v1);
227 _mm_storeu_si128((__m128i*)outptr, _v);
228
229 ptr += 16;
230 outptr += 16;
231 }
232 for (; i < size; i++)
233 {
234 __m256 _v = _mm256_loadu_ps(ptr);
235 _v = _mm256_mul_ps(_v, _scale);
236 *(int64_t*)outptr = float2int8_avx(_v);
237
238 ptr += 8;
239 outptr += 8;
240 }
241 }
242 }
243 }
244
245 return 0;
246 }
247 #endif // __AVX__
248
249 if (elempack == 4)
250 {
251 if (dims == 1)
252 {
253 int w = bottom_blob.w;
254 int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1;
255 int outw = w * elempack / out_elempack;
256
257 top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator);
258 if (top_blob.empty())
259 return -100;
260
261 if (scale_data_size == 1)
262 {
263 const float scale = scale_data[0];
264
265 #pragma omp parallel for num_threads(opt.num_threads)
266 for (int i = 0; i < w; i++)
267 {
268 const float* ptr0 = (const float*)bottom_blob + i * 4;
269 signed char* outptr = (signed char*)top_blob + i * 4;
270
271 outptr[0] = float2int8(ptr0[0] * scale);
272 outptr[1] = float2int8(ptr0[1] * scale);
273 outptr[2] = float2int8(ptr0[2] * scale);
274 outptr[3] = float2int8(ptr0[3] * scale);
275 }
276 }
277 else
278 {
279 #pragma omp parallel for num_threads(opt.num_threads)
280 for (int i = 0; i < w; i++)
281 {
282 const float* ptr0 = (const float*)bottom_blob + i * 4;
283 signed char* outptr = (signed char*)top_blob + i * 4;
284
285 outptr[0] = float2int8(ptr0[0] * scale_data[i * 4]);
286 outptr[1] = float2int8(ptr0[1] * scale_data[i * 4 + 1]);
287 outptr[2] = float2int8(ptr0[2] * scale_data[i * 4 + 2]);
288 outptr[3] = float2int8(ptr0[3] * scale_data[i * 4 + 3]);
289 }
290 }
291 }
292
293 if (dims == 2)
294 {
295 int w = bottom_blob.w;
296 int h = bottom_blob.h;
297 int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1;
298 int outh = h * elempack / out_elempack;
299
300 top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator);
301 if (top_blob.empty())
302 return -100;
303
304 if (out_elempack == 8)
305 {
306 if (scale_data_size == 1)
307 {
308 __m128 _scale = _mm_set1_ps(scale_data[0]);
309
310 #pragma omp parallel for num_threads(opt.num_threads)
311 for (int i = 0; i < outh; i++)
312 {
313 const float* ptr0 = bottom_blob.row(i * 2);
314 const float* ptr1 = bottom_blob.row(i * 2 + 1);
315 signed char* outptr = top_blob.row<signed char>(i);
316
317 int j = 0;
318 for (; j + 1 < w; j += 2)
319 {
320 __m128 _v0 = _mm_loadu_ps(ptr0);
321 __m128 _v1 = _mm_loadu_ps(ptr1);
322 __m128 _v2 = _mm_loadu_ps(ptr0 + 4);
323 __m128 _v3 = _mm_loadu_ps(ptr1 + 4);
324 _v0 = _mm_mul_ps(_v0, _scale);
325 _v1 = _mm_mul_ps(_v1, _scale);
326 _v2 = _mm_mul_ps(_v2, _scale);
327 _v3 = _mm_mul_ps(_v3, _scale);
328 __m128i _v = float2int8_sse(_v0, _v1, _v2, _v3);
329 _mm_storeu_si128((__m128i*)outptr, _v);
330
331 ptr0 += 8;
332 ptr1 += 8;
333 outptr += 16;
334 }
335 for (; j < w; j++)
336 {
337 __m128 _vlow = _mm_loadu_ps(ptr0);
338 __m128 _vhigh = _mm_loadu_ps(ptr1);
339 _vlow = _mm_mul_ps(_vlow, _scale);
340 _vhigh = _mm_mul_ps(_vhigh, _scale);
341 *(int64_t*)outptr = float2int8_sse(_vlow, _vhigh);
342
343 ptr0 += 4;
344 ptr1 += 4;
345 outptr += 8;
346 }
347 }
348 }
349 else
350 {
351 #pragma omp parallel for num_threads(opt.num_threads)
352 for (int i = 0; i < outh; i++)
353 {
354 const float* ptr0 = bottom_blob.row(i * 2);
355 const float* ptr1 = bottom_blob.row(i * 2 + 1);
356 signed char* outptr = top_blob.row<signed char>(i);
357
358 __m128 _scale0 = _mm_loadu_ps((const float*)scale_data + i * 8);
359 __m128 _scale1 = _mm_loadu_ps((const float*)scale_data + i * 8 + 4);
360
361 int j = 0;
362 for (; j + 1 < w; j += 2)
363 {
364 __m128 _v0 = _mm_loadu_ps(ptr0);
365 __m128 _v1 = _mm_loadu_ps(ptr1);
366 __m128 _v2 = _mm_loadu_ps(ptr0 + 4);
367 __m128 _v3 = _mm_loadu_ps(ptr1 + 4);
368 _v0 = _mm_mul_ps(_v0, _scale0);
369 _v1 = _mm_mul_ps(_v1, _scale1);
370 _v2 = _mm_mul_ps(_v2, _scale0);
371 _v3 = _mm_mul_ps(_v3, _scale1);
372 __m128i _v = float2int8_sse(_v0, _v1, _v2, _v3);
373 _mm_storeu_si128((__m128i*)outptr, _v);
374
375 ptr0 += 8;
376 ptr1 += 8;
377 outptr += 16;
378 }
379 for (; j < w; j++)
380 {
381 __m128 _vlow = _mm_loadu_ps(ptr0);
382 __m128 _vhigh = _mm_loadu_ps(ptr1);
383 _vlow = _mm_mul_ps(_vlow, _scale0);
384 _vhigh = _mm_mul_ps(_vhigh, _scale1);
385 *(int64_t*)outptr = float2int8_sse(_vlow, _vhigh);
386
387 ptr0 += 4;
388 ptr1 += 4;
389 outptr += 8;
390 }
391 }
392 }
393 }
394 if (out_elempack == 1)
395 {
396 if (scale_data_size == 1)
397 {
398 const float scale = scale_data[0];
399
400 #pragma omp parallel for num_threads(opt.num_threads)
401 for (int i = 0; i < h; i++)
402 {
403 const float* ptr0 = bottom_blob.row(i);
404 signed char* outptr0 = top_blob.row<signed char>(i * 4);
405 signed char* outptr1 = top_blob.row<signed char>(i * 4 + 1);
406 signed char* outptr2 = top_blob.row<signed char>(i * 4 + 2);
407 signed char* outptr3 = top_blob.row<signed char>(i * 4 + 3);
408
409 for (int j = 0; j < w; j++)
410 {
411 outptr0[0] = float2int8(ptr0[0] * scale);
412 outptr1[0] = float2int8(ptr0[1] * scale);
413 outptr2[0] = float2int8(ptr0[2] * scale);
414 outptr3[0] = float2int8(ptr0[3] * scale);
415
416 ptr0 += 4;
417 outptr0 += 1;
418 outptr1 += 1;
419 outptr2 += 1;
420 outptr3 += 1;
421 }
422 }
423 }
424 else
425 {
426 #pragma omp parallel for num_threads(opt.num_threads)
427 for (int i = 0; i < h; i++)
428 {
429 const float* ptr0 = bottom_blob.row(i);
430 signed char* outptr0 = top_blob.row<signed char>(i * 4);
431 signed char* outptr1 = top_blob.row<signed char>(i * 4 + 1);
432 signed char* outptr2 = top_blob.row<signed char>(i * 4 + 2);
433 signed char* outptr3 = top_blob.row<signed char>(i * 4 + 3);
434
435 const float s0 = scale_data[i * 4];
436 const float s1 = scale_data[i * 4 + 1];
437 const float s2 = scale_data[i * 4 + 2];
438 const float s3 = scale_data[i * 4 + 3];
439
440 for (int j = 0; j < w; j++)
441 {
442 outptr0[0] = float2int8(ptr0[0] * s0);
443 outptr1[0] = float2int8(ptr0[1] * s1);
444 outptr2[0] = float2int8(ptr0[2] * s2);
445 outptr3[0] = float2int8(ptr0[3] * s3);
446
447 ptr0 += 4;
448 outptr0 += 1;
449 outptr1 += 1;
450 outptr2 += 1;
451 outptr3 += 1;
452 }
453 }
454 }
455 }
456 }
457
458 if (dims == 3)
459 {
460 int w = bottom_blob.w;
461 int h = bottom_blob.h;
462 int channels = bottom_blob.c;
463 int size = w * h;
464 int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1;
465 int outc = channels * elempack / out_elempack;
466
467 top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator);
468 if (top_blob.empty())
469 return -100;
470
471 if (out_elempack == 8)
472 {
473 if (scale_data_size == 1)
474 {
475 __m128 _scale = _mm_set1_ps(scale_data[0]);
476
477 #pragma omp parallel for num_threads(opt.num_threads)
478 for (int q = 0; q < outc; q++)
479 {
480 const float* ptr0 = bottom_blob.channel(q * 2);
481 const float* ptr1 = bottom_blob.channel(q * 2 + 1);
482 signed char* outptr = top_blob.channel(q);
483
484 int i = 0;
485 for (; i + 1 < size; i += 2)
486 {
487 __m128 _v0 = _mm_loadu_ps(ptr0);
488 __m128 _v1 = _mm_loadu_ps(ptr1);
489 __m128 _v2 = _mm_loadu_ps(ptr0 + 4);
490 __m128 _v3 = _mm_loadu_ps(ptr1 + 4);
491 _v0 = _mm_mul_ps(_v0, _scale);
492 _v1 = _mm_mul_ps(_v1, _scale);
493 _v2 = _mm_mul_ps(_v2, _scale);
494 _v3 = _mm_mul_ps(_v3, _scale);
495 __m128i _v = float2int8_sse(_v0, _v1, _v2, _v3);
496 _mm_storeu_si128((__m128i*)outptr, _v);
497
498 ptr0 += 8;
499 ptr1 += 8;
500 outptr += 16;
501 }
502 for (; i < size; i++)
503 {
504 __m128 _vlow = _mm_loadu_ps(ptr0);
505 __m128 _vhigh = _mm_loadu_ps(ptr1);
506 _vlow = _mm_mul_ps(_vlow, _scale);
507 _vhigh = _mm_mul_ps(_vhigh, _scale);
508 *(int64_t*)outptr = float2int8_sse(_vlow, _vhigh);
509
510 ptr0 += 4;
511 ptr1 += 4;
512 outptr += 8;
513 }
514 }
515 }
516 else
517 {
518 #pragma omp parallel for num_threads(opt.num_threads)
519 for (int q = 0; q < outc; q++)
520 {
521 const float* ptr0 = bottom_blob.channel(q * 2);
522 const float* ptr1 = bottom_blob.channel(q * 2 + 1);
523 signed char* outptr = top_blob.channel(q);
524
525 __m128 _scale0 = _mm_loadu_ps((const float*)scale_data + q * 8);
526 __m128 _scale1 = _mm_loadu_ps((const float*)scale_data + q * 8 + 4);
527
528 int i = 0;
529 for (; i + 1 < size; i += 2)
530 {
531 __m128 _v0 = _mm_loadu_ps(ptr0);
532 __m128 _v1 = _mm_loadu_ps(ptr1);
533 __m128 _v2 = _mm_loadu_ps(ptr0 + 4);
534 __m128 _v3 = _mm_loadu_ps(ptr1 + 4);
535 _v0 = _mm_mul_ps(_v0, _scale0);
536 _v1 = _mm_mul_ps(_v1, _scale1);
537 _v2 = _mm_mul_ps(_v2, _scale0);
538 _v3 = _mm_mul_ps(_v3, _scale1);
539 __m128i _v = float2int8_sse(_v0, _v1, _v2, _v3);
540 _mm_storeu_si128((__m128i*)outptr, _v);
541
542 ptr0 += 8;
543 ptr1 += 8;
544 outptr += 16;
545 }
546 for (; i < size; i++)
547 {
548 __m128 _vlow = _mm_loadu_ps(ptr0);
549 __m128 _vhigh = _mm_loadu_ps(ptr1);
550 _vlow = _mm_mul_ps(_vlow, _scale0);
551 _vhigh = _mm_mul_ps(_vhigh, _scale1);
552 *(int64_t*)outptr = float2int8_sse(_vlow, _vhigh);
553
554 ptr0 += 4;
555 ptr1 += 4;
556 outptr += 8;
557 }
558 }
559 }
560 }
561 if (out_elempack == 1)
562 {
563 if (scale_data_size == 1)
564 {
565 const float scale = scale_data[0];
566
567 #pragma omp parallel for num_threads(opt.num_threads)
568 for (int q = 0; q < channels; q++)
569 {
570 const float* ptr0 = bottom_blob.channel(q);
571 signed char* outptr0 = top_blob.channel(q * 4);
572 signed char* outptr1 = top_blob.channel(q * 4 + 1);
573 signed char* outptr2 = top_blob.channel(q * 4 + 2);
574 signed char* outptr3 = top_blob.channel(q * 4 + 3);
575
576 for (int i = 0; i < size; i++)
577 {
578 outptr0[0] = float2int8(ptr0[0] * scale);
579 outptr1[0] = float2int8(ptr0[1] * scale);
580 outptr2[0] = float2int8(ptr0[2] * scale);
581 outptr3[0] = float2int8(ptr0[3] * scale);
582
583 ptr0 += 4;
584 outptr0 += 1;
585 outptr1 += 1;
586 outptr2 += 1;
587 outptr3 += 1;
588 }
589 }
590 }
591 else
592 {
593 #pragma omp parallel for num_threads(opt.num_threads)
594 for (int q = 0; q < channels; q++)
595 {
596 const float* ptr0 = bottom_blob.channel(q);
597 signed char* outptr0 = top_blob.channel(q * 4);
598 signed char* outptr1 = top_blob.channel(q * 4 + 1);
599 signed char* outptr2 = top_blob.channel(q * 4 + 2);
600 signed char* outptr3 = top_blob.channel(q * 4 + 3);
601
602 const float s0 = scale_data[q * 4];
603 const float s1 = scale_data[q * 4 + 1];
604 const float s2 = scale_data[q * 4 + 2];
605 const float s3 = scale_data[q * 4 + 3];
606
607 for (int i = 0; i < size; i++)
608 {
609 outptr0[0] = float2int8(ptr0[0] * s0);
610 outptr1[0] = float2int8(ptr0[1] * s1);
611 outptr2[0] = float2int8(ptr0[2] * s2);
612 outptr3[0] = float2int8(ptr0[3] * s3);
613
614 ptr0 += 4;
615 outptr0 += 1;
616 outptr1 += 1;
617 outptr2 += 1;
618 outptr3 += 1;
619 }
620 }
621 }
622 }
623 }
624
625 return 0;
626 }
627 #endif // __SSE2__
628
629 if (dims == 1)
630 {
631 int w = bottom_blob.w;
632
633 top_blob.create(w, (size_t)1u, opt.blob_allocator);
634 if (top_blob.empty())
635 return -100;
636
637 const float* ptr = bottom_blob;
638 signed char* outptr = top_blob;
639
640 if (scale_data_size == 1)
641 {
642 const float scale = scale_data[0];
643
644 #pragma omp parallel for num_threads(opt.num_threads)
645 for (int i = 0; i < w; i++)
646 {
647 outptr[i] = float2int8(ptr[i] * scale);
648 }
649 }
650 else
651 {
652 #pragma omp parallel for num_threads(opt.num_threads)
653 for (int i = 0; i < w; i++)
654 {
655 outptr[i] = float2int8(ptr[i] * scale_data[i]);
656 }
657 }
658 }
659
660 if (dims == 2)
661 {
662 int w = bottom_blob.w;
663 int h = bottom_blob.h;
664
665 top_blob.create(w, h, (size_t)1u, opt.blob_allocator);
666 if (top_blob.empty())
667 return -100;
668
669 #pragma omp parallel for num_threads(opt.num_threads)
670 for (int i = 0; i < h; i++)
671 {
672 const float* ptr0 = bottom_blob.row(i);
673 signed char* outptr0 = top_blob.row<signed char>(i);
674
675 const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i];
676
677 for (int j = 0; j < w; j++)
678 {
679 *outptr0++ = float2int8(*ptr0++ * scale);
680 }
681 }
682 }
683
684 if (dims == 3)
685 {
686 int w = bottom_blob.w;
687 int h = bottom_blob.h;
688 int channels = bottom_blob.c;
689 int size = w * h;
690
691 top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator);
692 if (top_blob.empty())
693 return -100;
694
695 #pragma omp parallel for num_threads(opt.num_threads)
696 for (int q = 0; q < channels; q++)
697 {
698 const float* ptr = bottom_blob.channel(q);
699 signed char* outptr = top_blob.channel(q);
700
701 const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q];
702
703 for (int i = 0; i < size; i++)
704 {
705 *outptr++ = float2int8(*ptr++ * scale);
706 }
707 }
708 }
709
710 return 0;
711 }
712
713 } // namespace ncnn
714