1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
15 #include "quantize_x86.h"
16 
17 #include <math.h>
18 
19 #if __SSE2__
20 #include <emmintrin.h>
21 #if __AVX__
22 #include <immintrin.h>
23 #endif // __AVX__
24 #endif // __SSE2__
25 
26 #include "x86_usability.h"
27 
28 namespace ncnn {
29 
Quantize_x86()30 Quantize_x86::Quantize_x86()
31 {
32 #if __SSE2__
33     support_packing = true;
34 #endif // __SSE2__
35 }
36 
forward(const Mat & bottom_blob,Mat & top_blob,const Option & opt) const37 int Quantize_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
38 {
39     int dims = bottom_blob.dims;
40     int elempack = bottom_blob.elempack;
41 
42 #if __SSE2__
43 #if __AVX__
44     if (elempack == 8)
45     {
46         if (dims == 1)
47         {
48             int w = bottom_blob.w;
49 
50             top_blob.create(w, (size_t)8u, 8, opt.blob_allocator);
51             if (top_blob.empty())
52                 return -100;
53 
54             if (scale_data_size == 1)
55             {
56                 __m256 _scale = _mm256_set1_ps(scale_data[0]);
57 
58                 #pragma omp parallel for num_threads(opt.num_threads)
59                 for (int i = 0; i < w; i++)
60                 {
61                     const float* ptr = (const float*)bottom_blob + i * 8;
62                     signed char* outptr = (signed char*)top_blob + i * 8;
63 
64                     __m256 _v = _mm256_loadu_ps(ptr);
65                     _v = _mm256_mul_ps(_v, _scale);
66                     *(int64_t*)outptr = float2int8_avx(_v);
67                 }
68             }
69             else
70             {
71                 #pragma omp parallel for num_threads(opt.num_threads)
72                 for (int i = 0; i < w; i++)
73                 {
74                     const float* ptr = (const float*)bottom_blob + i * 8;
75                     signed char* outptr = (signed char*)top_blob + i * 8;
76 
77                     __m256 _v = _mm256_loadu_ps(ptr);
78                     __m256 _scale = _mm256_loadu_ps((const float*)scale_data + i * 8);
79                     _v = _mm256_mul_ps(_v, _scale);
80                     *(int64_t*)outptr = float2int8_avx(_v);
81                 }
82             }
83         }
84 
85         if (dims == 2)
86         {
87             int w = bottom_blob.w;
88             int h = bottom_blob.h;
89 
90             top_blob.create(w, h, (size_t)8u, 8, opt.blob_allocator);
91             if (top_blob.empty())
92                 return -100;
93 
94             if (scale_data_size == 1)
95             {
96                 __m256 _scale = _mm256_set1_ps(scale_data[0]);
97 
98                 #pragma omp parallel for num_threads(opt.num_threads)
99                 for (int i = 0; i < h; i++)
100                 {
101                     const float* ptr = bottom_blob.row(i);
102                     signed char* outptr = top_blob.row<signed char>(i);
103 
104                     int j = 0;
105                     for (; j + 1 < w; j += 2)
106                     {
107                         __m256 _v0 = _mm256_loadu_ps(ptr);
108                         __m256 _v1 = _mm256_loadu_ps(ptr + 8);
109                         _v0 = _mm256_mul_ps(_v0, _scale);
110                         _v1 = _mm256_mul_ps(_v1, _scale);
111                         __m128i _v = float2int8_avx(_v0, _v1);
112                         _mm_storeu_si128((__m128i*)outptr, _v);
113 
114                         ptr += 16;
115                         outptr += 16;
116                     }
117                     for (; j < w; j++)
118                     {
119                         __m256 _v = _mm256_loadu_ps(ptr);
120                         _v = _mm256_mul_ps(_v, _scale);
121                         *(int64_t*)outptr = float2int8_avx(_v);
122 
123                         ptr += 8;
124                         outptr += 8;
125                     }
126                 }
127             }
128             else
129             {
130                 #pragma omp parallel for num_threads(opt.num_threads)
131                 for (int i = 0; i < h; i++)
132                 {
133                     const float* ptr = bottom_blob.row(i);
134                     signed char* outptr = top_blob.row<signed char>(i);
135 
136                     __m256 _scale = _mm256_loadu_ps((const float*)scale_data + i * 8);
137 
138                     int j = 0;
139                     for (; j + 1 < w; j += 2)
140                     {
141                         __m256 _v0 = _mm256_loadu_ps(ptr);
142                         __m256 _v1 = _mm256_loadu_ps(ptr + 8);
143                         _v0 = _mm256_mul_ps(_v0, _scale);
144                         _v1 = _mm256_mul_ps(_v1, _scale);
145                         __m128i _v = float2int8_avx(_v0, _v1);
146                         _mm_storeu_si128((__m128i*)outptr, _v);
147 
148                         ptr += 16;
149                         outptr += 16;
150                     }
151                     for (; j < w; j++)
152                     {
153                         __m256 _v = _mm256_loadu_ps(ptr);
154                         _v = _mm256_mul_ps(_v, _scale);
155                         *(int64_t*)outptr = float2int8_avx(_v);
156 
157                         ptr += 8;
158                         outptr += 8;
159                     }
160                 }
161             }
162         }
163 
164         if (dims == 3)
165         {
166             int w = bottom_blob.w;
167             int h = bottom_blob.h;
168             int channels = bottom_blob.c;
169             int size = w * h;
170 
171             top_blob.create(w, h, channels, (size_t)8u, 8, opt.blob_allocator);
172             if (top_blob.empty())
173                 return -100;
174 
175             if (scale_data_size == 1)
176             {
177                 __m256 _scale = _mm256_set1_ps(scale_data[0]);
178 
179                 #pragma omp parallel for num_threads(opt.num_threads)
180                 for (int q = 0; q < channels; q++)
181                 {
182                     const float* ptr = bottom_blob.channel(q);
183                     signed char* outptr = top_blob.channel(q);
184 
185                     int i = 0;
186                     for (; i + 1 < size; i += 2)
187                     {
188                         __m256 _v0 = _mm256_loadu_ps(ptr);
189                         __m256 _v1 = _mm256_loadu_ps(ptr + 8);
190                         _v0 = _mm256_mul_ps(_v0, _scale);
191                         _v1 = _mm256_mul_ps(_v1, _scale);
192                         __m128i _v = float2int8_avx(_v0, _v1);
193                         _mm_storeu_si128((__m128i*)outptr, _v);
194 
195                         ptr += 16;
196                         outptr += 16;
197                     }
198                     for (; i < size; i++)
199                     {
200                         __m256 _v = _mm256_loadu_ps(ptr);
201                         _v = _mm256_mul_ps(_v, _scale);
202                         *(int64_t*)outptr = float2int8_avx(_v);
203 
204                         ptr += 8;
205                         outptr += 8;
206                     }
207                 }
208             }
209             else
210             {
211                 #pragma omp parallel for num_threads(opt.num_threads)
212                 for (int q = 0; q < channels; q++)
213                 {
214                     const float* ptr = bottom_blob.channel(q);
215                     signed char* outptr = top_blob.channel(q);
216 
217                     __m256 _scale = _mm256_loadu_ps((const float*)scale_data + q * 8);
218 
219                     int i = 0;
220                     for (; i + 1 < size; i += 2)
221                     {
222                         __m256 _v0 = _mm256_loadu_ps(ptr);
223                         __m256 _v1 = _mm256_loadu_ps(ptr + 8);
224                         _v0 = _mm256_mul_ps(_v0, _scale);
225                         _v1 = _mm256_mul_ps(_v1, _scale);
226                         __m128i _v = float2int8_avx(_v0, _v1);
227                         _mm_storeu_si128((__m128i*)outptr, _v);
228 
229                         ptr += 16;
230                         outptr += 16;
231                     }
232                     for (; i < size; i++)
233                     {
234                         __m256 _v = _mm256_loadu_ps(ptr);
235                         _v = _mm256_mul_ps(_v, _scale);
236                         *(int64_t*)outptr = float2int8_avx(_v);
237 
238                         ptr += 8;
239                         outptr += 8;
240                     }
241                 }
242             }
243         }
244 
245         return 0;
246     }
247 #endif // __AVX__
248 
249     if (elempack == 4)
250     {
251         if (dims == 1)
252         {
253             int w = bottom_blob.w;
254             int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1;
255             int outw = w * elempack / out_elempack;
256 
257             top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator);
258             if (top_blob.empty())
259                 return -100;
260 
261             if (scale_data_size == 1)
262             {
263                 const float scale = scale_data[0];
264 
265                 #pragma omp parallel for num_threads(opt.num_threads)
266                 for (int i = 0; i < w; i++)
267                 {
268                     const float* ptr0 = (const float*)bottom_blob + i * 4;
269                     signed char* outptr = (signed char*)top_blob + i * 4;
270 
271                     outptr[0] = float2int8(ptr0[0] * scale);
272                     outptr[1] = float2int8(ptr0[1] * scale);
273                     outptr[2] = float2int8(ptr0[2] * scale);
274                     outptr[3] = float2int8(ptr0[3] * scale);
275                 }
276             }
277             else
278             {
279                 #pragma omp parallel for num_threads(opt.num_threads)
280                 for (int i = 0; i < w; i++)
281                 {
282                     const float* ptr0 = (const float*)bottom_blob + i * 4;
283                     signed char* outptr = (signed char*)top_blob + i * 4;
284 
285                     outptr[0] = float2int8(ptr0[0] * scale_data[i * 4]);
286                     outptr[1] = float2int8(ptr0[1] * scale_data[i * 4 + 1]);
287                     outptr[2] = float2int8(ptr0[2] * scale_data[i * 4 + 2]);
288                     outptr[3] = float2int8(ptr0[3] * scale_data[i * 4 + 3]);
289                 }
290             }
291         }
292 
293         if (dims == 2)
294         {
295             int w = bottom_blob.w;
296             int h = bottom_blob.h;
297             int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1;
298             int outh = h * elempack / out_elempack;
299 
300             top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator);
301             if (top_blob.empty())
302                 return -100;
303 
304             if (out_elempack == 8)
305             {
306                 if (scale_data_size == 1)
307                 {
308                     __m128 _scale = _mm_set1_ps(scale_data[0]);
309 
310                     #pragma omp parallel for num_threads(opt.num_threads)
311                     for (int i = 0; i < outh; i++)
312                     {
313                         const float* ptr0 = bottom_blob.row(i * 2);
314                         const float* ptr1 = bottom_blob.row(i * 2 + 1);
315                         signed char* outptr = top_blob.row<signed char>(i);
316 
317                         int j = 0;
318                         for (; j + 1 < w; j += 2)
319                         {
320                             __m128 _v0 = _mm_loadu_ps(ptr0);
321                             __m128 _v1 = _mm_loadu_ps(ptr1);
322                             __m128 _v2 = _mm_loadu_ps(ptr0 + 4);
323                             __m128 _v3 = _mm_loadu_ps(ptr1 + 4);
324                             _v0 = _mm_mul_ps(_v0, _scale);
325                             _v1 = _mm_mul_ps(_v1, _scale);
326                             _v2 = _mm_mul_ps(_v2, _scale);
327                             _v3 = _mm_mul_ps(_v3, _scale);
328                             __m128i _v = float2int8_sse(_v0, _v1, _v2, _v3);
329                             _mm_storeu_si128((__m128i*)outptr, _v);
330 
331                             ptr0 += 8;
332                             ptr1 += 8;
333                             outptr += 16;
334                         }
335                         for (; j < w; j++)
336                         {
337                             __m128 _vlow = _mm_loadu_ps(ptr0);
338                             __m128 _vhigh = _mm_loadu_ps(ptr1);
339                             _vlow = _mm_mul_ps(_vlow, _scale);
340                             _vhigh = _mm_mul_ps(_vhigh, _scale);
341                             *(int64_t*)outptr = float2int8_sse(_vlow, _vhigh);
342 
343                             ptr0 += 4;
344                             ptr1 += 4;
345                             outptr += 8;
346                         }
347                     }
348                 }
349                 else
350                 {
351                     #pragma omp parallel for num_threads(opt.num_threads)
352                     for (int i = 0; i < outh; i++)
353                     {
354                         const float* ptr0 = bottom_blob.row(i * 2);
355                         const float* ptr1 = bottom_blob.row(i * 2 + 1);
356                         signed char* outptr = top_blob.row<signed char>(i);
357 
358                         __m128 _scale0 = _mm_loadu_ps((const float*)scale_data + i * 8);
359                         __m128 _scale1 = _mm_loadu_ps((const float*)scale_data + i * 8 + 4);
360 
361                         int j = 0;
362                         for (; j + 1 < w; j += 2)
363                         {
364                             __m128 _v0 = _mm_loadu_ps(ptr0);
365                             __m128 _v1 = _mm_loadu_ps(ptr1);
366                             __m128 _v2 = _mm_loadu_ps(ptr0 + 4);
367                             __m128 _v3 = _mm_loadu_ps(ptr1 + 4);
368                             _v0 = _mm_mul_ps(_v0, _scale0);
369                             _v1 = _mm_mul_ps(_v1, _scale1);
370                             _v2 = _mm_mul_ps(_v2, _scale0);
371                             _v3 = _mm_mul_ps(_v3, _scale1);
372                             __m128i _v = float2int8_sse(_v0, _v1, _v2, _v3);
373                             _mm_storeu_si128((__m128i*)outptr, _v);
374 
375                             ptr0 += 8;
376                             ptr1 += 8;
377                             outptr += 16;
378                         }
379                         for (; j < w; j++)
380                         {
381                             __m128 _vlow = _mm_loadu_ps(ptr0);
382                             __m128 _vhigh = _mm_loadu_ps(ptr1);
383                             _vlow = _mm_mul_ps(_vlow, _scale0);
384                             _vhigh = _mm_mul_ps(_vhigh, _scale1);
385                             *(int64_t*)outptr = float2int8_sse(_vlow, _vhigh);
386 
387                             ptr0 += 4;
388                             ptr1 += 4;
389                             outptr += 8;
390                         }
391                     }
392                 }
393             }
394             if (out_elempack == 1)
395             {
396                 if (scale_data_size == 1)
397                 {
398                     const float scale = scale_data[0];
399 
400                     #pragma omp parallel for num_threads(opt.num_threads)
401                     for (int i = 0; i < h; i++)
402                     {
403                         const float* ptr0 = bottom_blob.row(i);
404                         signed char* outptr0 = top_blob.row<signed char>(i * 4);
405                         signed char* outptr1 = top_blob.row<signed char>(i * 4 + 1);
406                         signed char* outptr2 = top_blob.row<signed char>(i * 4 + 2);
407                         signed char* outptr3 = top_blob.row<signed char>(i * 4 + 3);
408 
409                         for (int j = 0; j < w; j++)
410                         {
411                             outptr0[0] = float2int8(ptr0[0] * scale);
412                             outptr1[0] = float2int8(ptr0[1] * scale);
413                             outptr2[0] = float2int8(ptr0[2] * scale);
414                             outptr3[0] = float2int8(ptr0[3] * scale);
415 
416                             ptr0 += 4;
417                             outptr0 += 1;
418                             outptr1 += 1;
419                             outptr2 += 1;
420                             outptr3 += 1;
421                         }
422                     }
423                 }
424                 else
425                 {
426                     #pragma omp parallel for num_threads(opt.num_threads)
427                     for (int i = 0; i < h; i++)
428                     {
429                         const float* ptr0 = bottom_blob.row(i);
430                         signed char* outptr0 = top_blob.row<signed char>(i * 4);
431                         signed char* outptr1 = top_blob.row<signed char>(i * 4 + 1);
432                         signed char* outptr2 = top_blob.row<signed char>(i * 4 + 2);
433                         signed char* outptr3 = top_blob.row<signed char>(i * 4 + 3);
434 
435                         const float s0 = scale_data[i * 4];
436                         const float s1 = scale_data[i * 4 + 1];
437                         const float s2 = scale_data[i * 4 + 2];
438                         const float s3 = scale_data[i * 4 + 3];
439 
440                         for (int j = 0; j < w; j++)
441                         {
442                             outptr0[0] = float2int8(ptr0[0] * s0);
443                             outptr1[0] = float2int8(ptr0[1] * s1);
444                             outptr2[0] = float2int8(ptr0[2] * s2);
445                             outptr3[0] = float2int8(ptr0[3] * s3);
446 
447                             ptr0 += 4;
448                             outptr0 += 1;
449                             outptr1 += 1;
450                             outptr2 += 1;
451                             outptr3 += 1;
452                         }
453                     }
454                 }
455             }
456         }
457 
458         if (dims == 3)
459         {
460             int w = bottom_blob.w;
461             int h = bottom_blob.h;
462             int channels = bottom_blob.c;
463             int size = w * h;
464             int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1;
465             int outc = channels * elempack / out_elempack;
466 
467             top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator);
468             if (top_blob.empty())
469                 return -100;
470 
471             if (out_elempack == 8)
472             {
473                 if (scale_data_size == 1)
474                 {
475                     __m128 _scale = _mm_set1_ps(scale_data[0]);
476 
477                     #pragma omp parallel for num_threads(opt.num_threads)
478                     for (int q = 0; q < outc; q++)
479                     {
480                         const float* ptr0 = bottom_blob.channel(q * 2);
481                         const float* ptr1 = bottom_blob.channel(q * 2 + 1);
482                         signed char* outptr = top_blob.channel(q);
483 
484                         int i = 0;
485                         for (; i + 1 < size; i += 2)
486                         {
487                             __m128 _v0 = _mm_loadu_ps(ptr0);
488                             __m128 _v1 = _mm_loadu_ps(ptr1);
489                             __m128 _v2 = _mm_loadu_ps(ptr0 + 4);
490                             __m128 _v3 = _mm_loadu_ps(ptr1 + 4);
491                             _v0 = _mm_mul_ps(_v0, _scale);
492                             _v1 = _mm_mul_ps(_v1, _scale);
493                             _v2 = _mm_mul_ps(_v2, _scale);
494                             _v3 = _mm_mul_ps(_v3, _scale);
495                             __m128i _v = float2int8_sse(_v0, _v1, _v2, _v3);
496                             _mm_storeu_si128((__m128i*)outptr, _v);
497 
498                             ptr0 += 8;
499                             ptr1 += 8;
500                             outptr += 16;
501                         }
502                         for (; i < size; i++)
503                         {
504                             __m128 _vlow = _mm_loadu_ps(ptr0);
505                             __m128 _vhigh = _mm_loadu_ps(ptr1);
506                             _vlow = _mm_mul_ps(_vlow, _scale);
507                             _vhigh = _mm_mul_ps(_vhigh, _scale);
508                             *(int64_t*)outptr = float2int8_sse(_vlow, _vhigh);
509 
510                             ptr0 += 4;
511                             ptr1 += 4;
512                             outptr += 8;
513                         }
514                     }
515                 }
516                 else
517                 {
518                     #pragma omp parallel for num_threads(opt.num_threads)
519                     for (int q = 0; q < outc; q++)
520                     {
521                         const float* ptr0 = bottom_blob.channel(q * 2);
522                         const float* ptr1 = bottom_blob.channel(q * 2 + 1);
523                         signed char* outptr = top_blob.channel(q);
524 
525                         __m128 _scale0 = _mm_loadu_ps((const float*)scale_data + q * 8);
526                         __m128 _scale1 = _mm_loadu_ps((const float*)scale_data + q * 8 + 4);
527 
528                         int i = 0;
529                         for (; i + 1 < size; i += 2)
530                         {
531                             __m128 _v0 = _mm_loadu_ps(ptr0);
532                             __m128 _v1 = _mm_loadu_ps(ptr1);
533                             __m128 _v2 = _mm_loadu_ps(ptr0 + 4);
534                             __m128 _v3 = _mm_loadu_ps(ptr1 + 4);
535                             _v0 = _mm_mul_ps(_v0, _scale0);
536                             _v1 = _mm_mul_ps(_v1, _scale1);
537                             _v2 = _mm_mul_ps(_v2, _scale0);
538                             _v3 = _mm_mul_ps(_v3, _scale1);
539                             __m128i _v = float2int8_sse(_v0, _v1, _v2, _v3);
540                             _mm_storeu_si128((__m128i*)outptr, _v);
541 
542                             ptr0 += 8;
543                             ptr1 += 8;
544                             outptr += 16;
545                         }
546                         for (; i < size; i++)
547                         {
548                             __m128 _vlow = _mm_loadu_ps(ptr0);
549                             __m128 _vhigh = _mm_loadu_ps(ptr1);
550                             _vlow = _mm_mul_ps(_vlow, _scale0);
551                             _vhigh = _mm_mul_ps(_vhigh, _scale1);
552                             *(int64_t*)outptr = float2int8_sse(_vlow, _vhigh);
553 
554                             ptr0 += 4;
555                             ptr1 += 4;
556                             outptr += 8;
557                         }
558                     }
559                 }
560             }
561             if (out_elempack == 1)
562             {
563                 if (scale_data_size == 1)
564                 {
565                     const float scale = scale_data[0];
566 
567                     #pragma omp parallel for num_threads(opt.num_threads)
568                     for (int q = 0; q < channels; q++)
569                     {
570                         const float* ptr0 = bottom_blob.channel(q);
571                         signed char* outptr0 = top_blob.channel(q * 4);
572                         signed char* outptr1 = top_blob.channel(q * 4 + 1);
573                         signed char* outptr2 = top_blob.channel(q * 4 + 2);
574                         signed char* outptr3 = top_blob.channel(q * 4 + 3);
575 
576                         for (int i = 0; i < size; i++)
577                         {
578                             outptr0[0] = float2int8(ptr0[0] * scale);
579                             outptr1[0] = float2int8(ptr0[1] * scale);
580                             outptr2[0] = float2int8(ptr0[2] * scale);
581                             outptr3[0] = float2int8(ptr0[3] * scale);
582 
583                             ptr0 += 4;
584                             outptr0 += 1;
585                             outptr1 += 1;
586                             outptr2 += 1;
587                             outptr3 += 1;
588                         }
589                     }
590                 }
591                 else
592                 {
593                     #pragma omp parallel for num_threads(opt.num_threads)
594                     for (int q = 0; q < channels; q++)
595                     {
596                         const float* ptr0 = bottom_blob.channel(q);
597                         signed char* outptr0 = top_blob.channel(q * 4);
598                         signed char* outptr1 = top_blob.channel(q * 4 + 1);
599                         signed char* outptr2 = top_blob.channel(q * 4 + 2);
600                         signed char* outptr3 = top_blob.channel(q * 4 + 3);
601 
602                         const float s0 = scale_data[q * 4];
603                         const float s1 = scale_data[q * 4 + 1];
604                         const float s2 = scale_data[q * 4 + 2];
605                         const float s3 = scale_data[q * 4 + 3];
606 
607                         for (int i = 0; i < size; i++)
608                         {
609                             outptr0[0] = float2int8(ptr0[0] * s0);
610                             outptr1[0] = float2int8(ptr0[1] * s1);
611                             outptr2[0] = float2int8(ptr0[2] * s2);
612                             outptr3[0] = float2int8(ptr0[3] * s3);
613 
614                             ptr0 += 4;
615                             outptr0 += 1;
616                             outptr1 += 1;
617                             outptr2 += 1;
618                             outptr3 += 1;
619                         }
620                     }
621                 }
622             }
623         }
624 
625         return 0;
626     }
627 #endif // __SSE2__
628 
629     if (dims == 1)
630     {
631         int w = bottom_blob.w;
632 
633         top_blob.create(w, (size_t)1u, opt.blob_allocator);
634         if (top_blob.empty())
635             return -100;
636 
637         const float* ptr = bottom_blob;
638         signed char* outptr = top_blob;
639 
640         if (scale_data_size == 1)
641         {
642             const float scale = scale_data[0];
643 
644             #pragma omp parallel for num_threads(opt.num_threads)
645             for (int i = 0; i < w; i++)
646             {
647                 outptr[i] = float2int8(ptr[i] * scale);
648             }
649         }
650         else
651         {
652             #pragma omp parallel for num_threads(opt.num_threads)
653             for (int i = 0; i < w; i++)
654             {
655                 outptr[i] = float2int8(ptr[i] * scale_data[i]);
656             }
657         }
658     }
659 
660     if (dims == 2)
661     {
662         int w = bottom_blob.w;
663         int h = bottom_blob.h;
664 
665         top_blob.create(w, h, (size_t)1u, opt.blob_allocator);
666         if (top_blob.empty())
667             return -100;
668 
669         #pragma omp parallel for num_threads(opt.num_threads)
670         for (int i = 0; i < h; i++)
671         {
672             const float* ptr0 = bottom_blob.row(i);
673             signed char* outptr0 = top_blob.row<signed char>(i);
674 
675             const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i];
676 
677             for (int j = 0; j < w; j++)
678             {
679                 *outptr0++ = float2int8(*ptr0++ * scale);
680             }
681         }
682     }
683 
684     if (dims == 3)
685     {
686         int w = bottom_blob.w;
687         int h = bottom_blob.h;
688         int channels = bottom_blob.c;
689         int size = w * h;
690 
691         top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator);
692         if (top_blob.empty())
693             return -100;
694 
695         #pragma omp parallel for num_threads(opt.num_threads)
696         for (int q = 0; q < channels; q++)
697         {
698             const float* ptr = bottom_blob.channel(q);
699             signed char* outptr = top_blob.channel(q);
700 
701             const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q];
702 
703             for (int i = 0; i < size; i++)
704             {
705                 *outptr++ = float2int8(*ptr++ * scale);
706             }
707         }
708     }
709 
710     return 0;
711 }
712 
713 } // namespace ncnn
714