1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
15 #include "requantize_x86.h"
16 
17 #if __SSE2__
18 #include <emmintrin.h>
19 #if __AVX__
20 #include <immintrin.h>
21 #endif // __AVX__
22 #endif // __SSE2__
23 
24 #include "x86_activation.h"
25 #include "x86_usability.h"
26 
27 namespace ncnn {
28 
Requantize_x86()29 Requantize_x86::Requantize_x86()
30 {
31 #if __SSE2__
32     support_packing = true;
33 #endif // __SSE2__
34 }
35 
forward(const Mat & bottom_blob,Mat & top_blob,const Option & opt) const36 int Requantize_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
37 {
38     int dims = bottom_blob.dims;
39     int elempack = bottom_blob.elempack;
40 
41 #if __SSE2__
42     if (elempack == 8)
43     {
44         if (dims == 1)
45         {
46             int w = bottom_blob.w;
47 
48             top_blob.create(w, (size_t)8u, 8, opt.blob_allocator);
49             if (top_blob.empty())
50                 return -100;
51 
52             if (scale_in_data_size == 1 && scale_out_data_size == 1)
53             {
54 #if __AVX__
55                 __m256 _scale_in = _mm256_set1_ps(scale_in_data[0]);
56                 __m256 _scale_out = _mm256_set1_ps(scale_out_data[0]);
57 #else
58                 __m128 _scale_in = _mm_set1_ps(scale_in_data[0]);
59                 __m128 _scale_out = _mm_set1_ps(scale_out_data[0]);
60 #endif
61 
62                 if (bias_data_size == 0)
63                 {
64                     #pragma omp parallel for num_threads(opt.num_threads)
65                     for (int i = 0; i < w; i++)
66                     {
67                         const int* intptr = (const int*)bottom_blob + i * 8;
68                         signed char* ptr = (signed char*)top_blob + i * 8;
69 
70 #if __AVX__
71                         __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
72                         _v = _mm256_mul_ps(_v, _scale_in);
73                         _v = activation_avx(_v, activation_type, activation_params);
74                         _v = _mm256_mul_ps(_v, _scale_out);
75                         *(int64_t*)ptr = float2int8_avx(_v);
76 #else
77                         __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
78                         __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
79                         _v0 = _mm_mul_ps(_v0, _scale_in);
80                         _v1 = _mm_mul_ps(_v1, _scale_in);
81                         _v0 = activation_sse(_v0, activation_type, activation_params);
82                         _v1 = activation_sse(_v1, activation_type, activation_params);
83                         _v0 = _mm_mul_ps(_v0, _scale_out);
84                         _v1 = _mm_mul_ps(_v1, _scale_out);
85                         *(int64_t*)ptr = float2int8_sse(_v0, _v1);
86 #endif
87                     }
88                 }
89                 else if (bias_data_size == 1)
90                 {
91 #if __AVX__
92                     __m256 _bias = _mm256_set1_ps(bias_data[0]);
93 #else
94                     __m128 _bias = _mm_set1_ps(bias_data[0]);
95 #endif
96 
97                     #pragma omp parallel for num_threads(opt.num_threads)
98                     for (int i = 0; i < w; i++)
99                     {
100                         const int* intptr = (const int*)bottom_blob + i * 8;
101                         signed char* ptr = (signed char*)top_blob + i * 8;
102 
103 #if __AVX__
104                         __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
105                         _v = _mm256_fmadd_ps(_v, _scale_in, _bias);
106                         _v = activation_avx(_v, activation_type, activation_params);
107                         _v = _mm256_mul_ps(_v, _scale_out);
108                         *(int64_t*)ptr = float2int8_avx(_v);
109 #else
110                         __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
111                         __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
112                         _v0 = _mm_add_ps(_bias, _mm_mul_ps(_v0, _scale_in));
113                         _v1 = _mm_add_ps(_bias, _mm_mul_ps(_v1, _scale_in));
114                         _v0 = activation_sse(_v0, activation_type, activation_params);
115                         _v1 = activation_sse(_v1, activation_type, activation_params);
116                         _v0 = _mm_mul_ps(_v0, _scale_out);
117                         _v1 = _mm_mul_ps(_v1, _scale_out);
118                         *(int64_t*)ptr = float2int8_sse(_v0, _v1);
119 #endif
120                     }
121                 }
122                 else
123                 {
124                     #pragma omp parallel for num_threads(opt.num_threads)
125                     for (int i = 0; i < w; i++)
126                     {
127                         const int* intptr = (const int*)bottom_blob + i * 8;
128                         signed char* ptr = (signed char*)top_blob + i * 8;
129 
130 #if __AVX__
131                         __m256 _bias = _mm256_loadu_ps((const float*)bias_data + i * 8);
132                         __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
133                         _v = _mm256_fmadd_ps(_v, _scale_in, _bias);
134                         _v = activation_avx(_v, activation_type, activation_params);
135                         _v = _mm256_mul_ps(_v, _scale_out);
136                         *(int64_t*)ptr = float2int8_avx(_v);
137 #else
138                         __m128 _bias0 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8);
139                         __m128 _bias1 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8 + 4);
140                         __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
141                         __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
142                         _v0 = _mm_add_ps(_bias0, _mm_mul_ps(_v0, _scale_in));
143                         _v1 = _mm_add_ps(_bias1, _mm_mul_ps(_v1, _scale_in));
144                         _v0 = activation_sse(_v0, activation_type, activation_params);
145                         _v1 = activation_sse(_v1, activation_type, activation_params);
146                         _v0 = _mm_mul_ps(_v0, _scale_out);
147                         _v1 = _mm_mul_ps(_v1, _scale_out);
148                         *(int64_t*)ptr = float2int8_sse(_v0, _v1);
149 #endif
150                     }
151                 }
152             }
153             else if (scale_in_data_size == 1 && scale_out_data_size > 1)
154             {
155 #if __AVX__
156                 __m256 _scale_in = _mm256_set1_ps(scale_in_data[0]);
157 #else
158                 __m128 _scale_in = _mm_set1_ps(scale_in_data[0]);
159 #endif
160 
161                 if (bias_data_size == 0)
162                 {
163                     #pragma omp parallel for num_threads(opt.num_threads)
164                     for (int i = 0; i < w; i++)
165                     {
166                         const int* intptr = (const int*)bottom_blob + i * 8;
167                         signed char* ptr = (signed char*)top_blob + i * 8;
168 
169 #if __AVX__
170                         __m256 _scale_out = _mm256_loadu_ps((const float*)scale_out_data + i * 8);
171                         __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
172                         _v = _mm256_mul_ps(_v, _scale_in);
173                         _v = activation_avx(_v, activation_type, activation_params);
174                         _v = _mm256_mul_ps(_v, _scale_out);
175                         *(int64_t*)ptr = float2int8_avx(_v);
176 #else
177                         __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8);
178                         __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8 + 4);
179                         __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
180                         __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
181                         _v0 = _mm_mul_ps(_v0, _scale_in);
182                         _v1 = _mm_mul_ps(_v1, _scale_in);
183                         _v0 = activation_sse(_v0, activation_type, activation_params);
184                         _v1 = activation_sse(_v1, activation_type, activation_params);
185                         _v0 = _mm_mul_ps(_v0, _scale_out0);
186                         _v1 = _mm_mul_ps(_v1, _scale_out1);
187                         *(int64_t*)ptr = float2int8_sse(_v0, _v1);
188 #endif
189                     }
190                 }
191                 else if (bias_data_size == 1)
192                 {
193 #if __AVX__
194                     __m256 _bias = _mm256_set1_ps(bias_data[0]);
195 #else
196                     __m128 _bias = _mm_set1_ps(bias_data[0]);
197 #endif
198 
199                     #pragma omp parallel for num_threads(opt.num_threads)
200                     for (int i = 0; i < w; i++)
201                     {
202                         const int* intptr = (const int*)bottom_blob + i * 8;
203                         signed char* ptr = (signed char*)top_blob + i * 8;
204 
205 #if __AVX__
206                         __m256 _scale_out = _mm256_loadu_ps((const float*)scale_out_data + i * 8);
207                         __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
208                         _v = _mm256_fmadd_ps(_v, _scale_in, _bias);
209                         _v = activation_avx(_v, activation_type, activation_params);
210                         _v = _mm256_mul_ps(_v, _scale_out);
211                         *(int64_t*)ptr = float2int8_avx(_v);
212 #else
213                         __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8);
214                         __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8 + 4);
215                         __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
216                         __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
217                         _v0 = _mm_add_ps(_bias, _mm_mul_ps(_v0, _scale_in));
218                         _v1 = _mm_add_ps(_bias, _mm_mul_ps(_v1, _scale_in));
219                         _v0 = activation_sse(_v0, activation_type, activation_params);
220                         _v1 = activation_sse(_v1, activation_type, activation_params);
221                         _v0 = _mm_mul_ps(_v0, _scale_out0);
222                         _v1 = _mm_mul_ps(_v1, _scale_out1);
223                         *(int64_t*)ptr = float2int8_sse(_v0, _v1);
224 #endif
225                     }
226                 }
227                 else
228                 {
229                     #pragma omp parallel for num_threads(opt.num_threads)
230                     for (int i = 0; i < w; i++)
231                     {
232                         const int* intptr = (const int*)bottom_blob + i * 8;
233                         signed char* ptr = (signed char*)top_blob + i * 8;
234 
235 #if __AVX__
236                         __m256 _scale_out = _mm256_loadu_ps((const float*)scale_out_data + i * 8);
237                         __m256 _bias = _mm256_loadu_ps((const float*)bias_data + i * 8);
238                         __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
239                         _v = _mm256_fmadd_ps(_v, _scale_in, _bias);
240                         _v = activation_avx(_v, activation_type, activation_params);
241                         _v = _mm256_mul_ps(_v, _scale_out);
242                         *(int64_t*)ptr = float2int8_avx(_v);
243 #else
244                         __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8);
245                         __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8 + 4);
246                         __m128 _bias0 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8);
247                         __m128 _bias1 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8 + 4);
248                         __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
249                         __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
250                         _v0 = _mm_add_ps(_bias0, _mm_mul_ps(_v0, _scale_in));
251                         _v1 = _mm_add_ps(_bias1, _mm_mul_ps(_v1, _scale_in));
252                         _v0 = activation_sse(_v0, activation_type, activation_params);
253                         _v1 = activation_sse(_v1, activation_type, activation_params);
254                         _v0 = _mm_mul_ps(_v0, _scale_out0);
255                         _v1 = _mm_mul_ps(_v1, _scale_out1);
256                         *(int64_t*)ptr = float2int8_sse(_v0, _v1);
257 #endif
258                     }
259                 }
260             }
261             else if (scale_in_data_size > 1 && scale_out_data_size == 1)
262             {
263 #if __AVX__
264                 __m256 _scale_out = _mm256_set1_ps(scale_out_data[0]);
265 #else
266                 __m128 _scale_out = _mm_set1_ps(scale_out_data[0]);
267 #endif
268 
269                 if (bias_data_size == 0)
270                 {
271                     #pragma omp parallel for num_threads(opt.num_threads)
272                     for (int i = 0; i < w; i++)
273                     {
274                         const int* intptr = (const int*)bottom_blob + i * 8;
275                         signed char* ptr = (signed char*)top_blob + i * 8;
276 
277 #if __AVX__
278                         __m256 _scale_in = _mm256_loadu_ps((const float*)scale_in_data + i * 8);
279                         __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
280                         _v = _mm256_mul_ps(_v, _scale_in);
281                         _v = activation_avx(_v, activation_type, activation_params);
282                         _v = _mm256_mul_ps(_v, _scale_out);
283                         *(int64_t*)ptr = float2int8_avx(_v);
284 #else
285                         __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8);
286                         __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8 + 4);
287                         __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
288                         __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
289                         _v0 = _mm_mul_ps(_v0, _scale_in0);
290                         _v1 = _mm_mul_ps(_v1, _scale_in1);
291                         _v0 = activation_sse(_v0, activation_type, activation_params);
292                         _v1 = activation_sse(_v1, activation_type, activation_params);
293                         _v0 = _mm_mul_ps(_v0, _scale_out);
294                         _v1 = _mm_mul_ps(_v1, _scale_out);
295                         *(int64_t*)ptr = float2int8_sse(_v0, _v1);
296 #endif
297                     }
298                 }
299                 else if (bias_data_size == 1)
300                 {
301 #if __AVX__
302                     __m256 _bias = _mm256_set1_ps(bias_data[0]);
303 #else
304                     __m128 _bias = _mm_set1_ps(bias_data[0]);
305 #endif
306 
307                     #pragma omp parallel for num_threads(opt.num_threads)
308                     for (int i = 0; i < w; i++)
309                     {
310                         const int* intptr = (const int*)bottom_blob + i * 8;
311                         signed char* ptr = (signed char*)top_blob + i * 8;
312 
313 #if __AVX__
314                         __m256 _scale_in = _mm256_loadu_ps((const float*)scale_in_data + i * 8);
315                         __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
316                         _v = _mm256_fmadd_ps(_v, _scale_in, _bias);
317                         _v = activation_avx(_v, activation_type, activation_params);
318                         _v = _mm256_mul_ps(_v, _scale_out);
319                         *(int64_t*)ptr = float2int8_avx(_v);
320 #else
321                         __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8);
322                         __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8 + 4);
323                         __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
324                         __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
325                         _v0 = _mm_add_ps(_bias, _mm_mul_ps(_v0, _scale_in0));
326                         _v1 = _mm_add_ps(_bias, _mm_mul_ps(_v1, _scale_in1));
327                         _v0 = activation_sse(_v0, activation_type, activation_params);
328                         _v1 = activation_sse(_v1, activation_type, activation_params);
329                         _v0 = _mm_mul_ps(_v0, _scale_out);
330                         _v1 = _mm_mul_ps(_v1, _scale_out);
331                         *(int64_t*)ptr = float2int8_sse(_v0, _v1);
332 #endif
333                     }
334                 }
335                 else
336                 {
337                     #pragma omp parallel for num_threads(opt.num_threads)
338                     for (int i = 0; i < w; i++)
339                     {
340                         const int* intptr = (const int*)bottom_blob + i * 8;
341                         signed char* ptr = (signed char*)top_blob + i * 8;
342 
343 #if __AVX__
344                         __m256 _scale_in = _mm256_loadu_ps((const float*)scale_in_data + i * 8);
345                         __m256 _bias = _mm256_loadu_ps((const float*)bias_data + i * 8);
346                         __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
347                         _v = _mm256_fmadd_ps(_v, _scale_in, _bias);
348                         _v = activation_avx(_v, activation_type, activation_params);
349                         _v = _mm256_mul_ps(_v, _scale_out);
350                         *(int64_t*)ptr = float2int8_avx(_v);
351 #else
352                         __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8);
353                         __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8 + 4);
354                         __m128 _bias0 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8);
355                         __m128 _bias1 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8 + 4);
356                         __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
357                         __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
358                         _v0 = _mm_add_ps(_bias0, _mm_mul_ps(_v0, _scale_in0));
359                         _v1 = _mm_add_ps(_bias1, _mm_mul_ps(_v1, _scale_in1));
360                         _v0 = activation_sse(_v0, activation_type, activation_params);
361                         _v1 = activation_sse(_v1, activation_type, activation_params);
362                         _v0 = _mm_mul_ps(_v0, _scale_out);
363                         _v1 = _mm_mul_ps(_v1, _scale_out);
364                         *(int64_t*)ptr = float2int8_sse(_v0, _v1);
365 #endif
366                     }
367                 }
368             }
369             else // if (scale_in_data_size > 1 && scale_out_data_size > 1)
370             {
371                 if (bias_data_size == 0)
372                 {
373                     #pragma omp parallel for num_threads(opt.num_threads)
374                     for (int i = 0; i < w; i++)
375                     {
376                         const int* intptr = (const int*)bottom_blob + i * 8;
377                         signed char* ptr = (signed char*)top_blob + i * 8;
378 
379 #if __AVX__
380                         __m256 _scale_in = _mm256_loadu_ps((const float*)scale_in_data + i * 8);
381                         __m256 _scale_out = _mm256_loadu_ps((const float*)scale_out_data + i * 8);
382                         __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
383                         _v = _mm256_mul_ps(_v, _scale_in);
384                         _v = activation_avx(_v, activation_type, activation_params);
385                         _v = _mm256_mul_ps(_v, _scale_out);
386                         *(int64_t*)ptr = float2int8_avx(_v);
387 #else
388                         __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8);
389                         __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8 + 4);
390                         __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8);
391                         __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8 + 4);
392                         __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
393                         __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
394                         _v0 = _mm_mul_ps(_v0, _scale_in0);
395                         _v1 = _mm_mul_ps(_v1, _scale_in1);
396                         _v0 = activation_sse(_v0, activation_type, activation_params);
397                         _v1 = activation_sse(_v1, activation_type, activation_params);
398                         _v0 = _mm_mul_ps(_v0, _scale_out0);
399                         _v1 = _mm_mul_ps(_v1, _scale_out1);
400                         *(int64_t*)ptr = float2int8_sse(_v0, _v1);
401 #endif
402                     }
403                 }
404                 else if (bias_data_size == 1)
405                 {
406 #if __AVX__
407                     __m256 _bias = _mm256_set1_ps(bias_data[0]);
408 #else
409                     __m128 _bias = _mm_set1_ps(bias_data[0]);
410 #endif
411 
412                     #pragma omp parallel for num_threads(opt.num_threads)
413                     for (int i = 0; i < w; i++)
414                     {
415                         const int* intptr = (const int*)bottom_blob + i * 8;
416                         signed char* ptr = (signed char*)top_blob + i * 8;
417 
418 #if __AVX__
419                         __m256 _scale_in = _mm256_loadu_ps((const float*)scale_in_data + i * 8);
420                         __m256 _scale_out = _mm256_loadu_ps((const float*)scale_out_data + i * 8);
421                         __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
422                         _v = _mm256_fmadd_ps(_v, _scale_in, _bias);
423                         _v = activation_avx(_v, activation_type, activation_params);
424                         _v = _mm256_mul_ps(_v, _scale_out);
425                         *(int64_t*)ptr = float2int8_avx(_v);
426 #else
427                         __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8);
428                         __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8 + 4);
429                         __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8);
430                         __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8 + 4);
431                         __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
432                         __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
433                         _v0 = _mm_add_ps(_bias, _mm_mul_ps(_v0, _scale_in0));
434                         _v1 = _mm_add_ps(_bias, _mm_mul_ps(_v1, _scale_in1));
435                         _v0 = activation_sse(_v0, activation_type, activation_params);
436                         _v1 = activation_sse(_v1, activation_type, activation_params);
437                         _v0 = _mm_mul_ps(_v0, _scale_out0);
438                         _v1 = _mm_mul_ps(_v1, _scale_out1);
439                         *(int64_t*)ptr = float2int8_sse(_v0, _v1);
440 #endif
441                     }
442                 }
443                 else
444                 {
445                     #pragma omp parallel for num_threads(opt.num_threads)
446                     for (int i = 0; i < w; i++)
447                     {
448                         const int* intptr = (const int*)bottom_blob + i * 8;
449                         signed char* ptr = (signed char*)top_blob + i * 8;
450 
451 #if __AVX__
452                         __m256 _scale_in = _mm256_loadu_ps((const float*)scale_in_data + i * 8);
453                         __m256 _scale_out = _mm256_loadu_ps((const float*)scale_out_data + i * 8);
454                         __m256 _bias = _mm256_loadu_ps((const float*)bias_data + i * 8);
455                         __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
456                         _v = _mm256_fmadd_ps(_v, _scale_in, _bias);
457                         _v = activation_avx(_v, activation_type, activation_params);
458                         _v = _mm256_mul_ps(_v, _scale_out);
459                         *(int64_t*)ptr = float2int8_avx(_v);
460 #else
461                         __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8);
462                         __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8 + 4);
463                         __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8);
464                         __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8 + 4);
465                         __m128 _bias0 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8);
466                         __m128 _bias1 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8 + 4);
467                         __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
468                         __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
469                         _v0 = _mm_add_ps(_bias0, _mm_mul_ps(_v0, _scale_in0));
470                         _v1 = _mm_add_ps(_bias1, _mm_mul_ps(_v1, _scale_in1));
471                         _v0 = activation_sse(_v0, activation_type, activation_params);
472                         _v1 = activation_sse(_v1, activation_type, activation_params);
473                         _v0 = _mm_mul_ps(_v0, _scale_out0);
474                         _v1 = _mm_mul_ps(_v1, _scale_out1);
475                         *(int64_t*)ptr = float2int8_sse(_v0, _v1);
476 #endif
477                     }
478                 }
479             }
480         }
481 
482         if (dims == 2)
483         {
484             int w = bottom_blob.w;
485             int h = bottom_blob.h;
486 
487             top_blob.create(w, h, (size_t)8u, 8, opt.blob_allocator);
488             if (top_blob.empty())
489                 return -100;
490 
491             if (bias_data_size == 0)
492             {
493                 #pragma omp parallel for num_threads(opt.num_threads)
494                 for (int i = 0; i < h; i++)
495                 {
496                     const int* intptr = bottom_blob.row<const int>(i);
497                     signed char* ptr = top_blob.row<signed char>(i);
498 
499 #if __AVX__
500                     __m256 _scale_in = scale_in_data_size == 1 ? _mm256_set1_ps(scale_in_data[0]) : _mm256_loadu_ps((const float*)scale_in_data + i * 8);
501                     __m256 _scale_out = scale_out_data_size == 1 ? _mm256_set1_ps(scale_out_data[0]) : _mm256_loadu_ps((const float*)scale_out_data + i * 8);
502 #else
503                     __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8);
504                     __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8 + 4);
505                     __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8);
506                     __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8 + 4);
507 #endif
508 
509                     for (int j = 0; j < w; j++)
510                     {
511 #if __AVX__
512                         __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
513                         _v = _mm256_mul_ps(_v, _scale_in);
514                         _v = activation_avx(_v, activation_type, activation_params);
515                         _v = _mm256_mul_ps(_v, _scale_out);
516                         *(int64_t*)ptr = float2int8_avx(_v);
517 #else
518                         __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
519                         __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
520                         _v0 = _mm_mul_ps(_v0, _scale_in0);
521                         _v1 = _mm_mul_ps(_v1, _scale_in1);
522                         _v0 = activation_sse(_v0, activation_type, activation_params);
523                         _v1 = activation_sse(_v1, activation_type, activation_params);
524                         _v0 = _mm_mul_ps(_v0, _scale_out0);
525                         _v1 = _mm_mul_ps(_v1, _scale_out1);
526                         *(int64_t*)ptr = float2int8_sse(_v0, _v1);
527 #endif
528 
529                         intptr += 8;
530                         ptr += 8;
531                     }
532                 }
533             }
534             else
535             {
536                 #pragma omp parallel for num_threads(opt.num_threads)
537                 for (int i = 0; i < h; i++)
538                 {
539                     const int* intptr = bottom_blob.row<const int>(i);
540                     signed char* ptr = top_blob.row<signed char>(i);
541 
542 #if __AVX__
543                     __m256 _scale_in = scale_in_data_size == 1 ? _mm256_set1_ps(scale_in_data[0]) : _mm256_loadu_ps((const float*)scale_in_data + i * 8);
544                     __m256 _scale_out = scale_out_data_size == 1 ? _mm256_set1_ps(scale_out_data[0]) : _mm256_loadu_ps((const float*)scale_out_data + i * 8);
545                     __m256 _bias = bias_data_size == 1 ? _mm256_set1_ps(bias_data[0]) : _mm256_loadu_ps((const float*)bias_data + i * 8);
546 #else
547                     __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8);
548                     __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8 + 4);
549                     __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8);
550                     __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8 + 4);
551                     __m128 _bias0 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8);
552                     __m128 _bias1 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8 + 4);
553 #endif
554 
555                     for (int j = 0; j < w; j++)
556                     {
557 #if __AVX__
558                         __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
559                         _v = _mm256_fmadd_ps(_v, _scale_in, _bias);
560                         _v = activation_avx(_v, activation_type, activation_params);
561                         _v = _mm256_mul_ps(_v, _scale_out);
562                         *(int64_t*)ptr = float2int8_avx(_v);
563 #else
564                         __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
565                         __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
566                         _v0 = _mm_add_ps(_bias0, _mm_mul_ps(_v0, _scale_in0));
567                         _v1 = _mm_add_ps(_bias1, _mm_mul_ps(_v1, _scale_in1));
568                         _v0 = activation_sse(_v0, activation_type, activation_params);
569                         _v1 = activation_sse(_v1, activation_type, activation_params);
570                         _v0 = _mm_mul_ps(_v0, _scale_out0);
571                         _v1 = _mm_mul_ps(_v1, _scale_out1);
572                         *(int64_t*)ptr = float2int8_sse(_v0, _v1);
573 #endif
574 
575                         intptr += 8;
576                         ptr += 8;
577                     }
578                 }
579             }
580         }
581 
582         if (dims == 3)
583         {
584             int w = bottom_blob.w;
585             int h = bottom_blob.h;
586             int channels = bottom_blob.c;
587             int size = w * h;
588 
589             top_blob.create(w, h, channels, (size_t)8u, 8, opt.blob_allocator);
590             if (top_blob.empty())
591                 return -100;
592 
593             if (bias_data_size == 0)
594             {
595                 #pragma omp parallel for num_threads(opt.num_threads)
596                 for (int q = 0; q < channels; q++)
597                 {
598                     const int* intptr = bottom_blob.channel(q);
599                     signed char* ptr = top_blob.channel(q);
600 
601 #if __AVX__
602                     __m256 _scale_in = scale_in_data_size == 1 ? _mm256_set1_ps(scale_in_data[0]) : _mm256_loadu_ps((const float*)scale_in_data + q * 8);
603                     __m256 _scale_out = scale_out_data_size == 1 ? _mm256_set1_ps(scale_out_data[0]) : _mm256_loadu_ps((const float*)scale_out_data + q * 8);
604 #else
605                     __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + q * 8);
606                     __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + q * 8 + 4);
607                     __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + q * 8);
608                     __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + q * 8 + 4);
609 #endif
610 
611                     for (int i = 0; i < size; i++)
612                     {
613 #if __AVX__
614                         __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
615                         _v = _mm256_mul_ps(_v, _scale_in);
616                         _v = activation_avx(_v, activation_type, activation_params);
617                         _v = _mm256_mul_ps(_v, _scale_out);
618                         *(int64_t*)ptr = float2int8_avx(_v);
619 #else
620                         __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
621                         __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
622                         _v0 = _mm_mul_ps(_v0, _scale_in0);
623                         _v1 = _mm_mul_ps(_v1, _scale_in1);
624                         _v0 = activation_sse(_v0, activation_type, activation_params);
625                         _v1 = activation_sse(_v1, activation_type, activation_params);
626                         _v0 = _mm_mul_ps(_v0, _scale_out0);
627                         _v1 = _mm_mul_ps(_v1, _scale_out1);
628                         *(int64_t*)ptr = float2int8_sse(_v0, _v1);
629 #endif
630 
631                         intptr += 8;
632                         ptr += 8;
633                     }
634                 }
635             }
636             else
637             {
638                 #pragma omp parallel for num_threads(opt.num_threads)
639                 for (int q = 0; q < channels; q++)
640                 {
641                     const int* intptr = bottom_blob.channel(q);
642                     signed char* ptr = top_blob.channel(q);
643 
644 #if __AVX__
645                     __m256 _scale_in = scale_in_data_size == 1 ? _mm256_set1_ps(scale_in_data[0]) : _mm256_loadu_ps((const float*)scale_in_data + q * 8);
646                     __m256 _scale_out = scale_out_data_size == 1 ? _mm256_set1_ps(scale_out_data[0]) : _mm256_loadu_ps((const float*)scale_out_data + q * 8);
647                     __m256 _bias = bias_data_size == 1 ? _mm256_set1_ps(bias_data[0]) : _mm256_loadu_ps((const float*)bias_data + q * 8);
648 #else
649                     __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + q * 8);
650                     __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + q * 8 + 4);
651                     __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + q * 8);
652                     __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + q * 8 + 4);
653                     __m128 _bias0 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + q * 8);
654                     __m128 _bias1 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + q * 8 + 4);
655 #endif
656 
657                     for (int i = 0; i < size; i++)
658                     {
659 #if __AVX__
660                         __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
661                         _v = _mm256_fmadd_ps(_v, _scale_in, _bias);
662                         _v = activation_avx(_v, activation_type, activation_params);
663                         _v = _mm256_mul_ps(_v, _scale_out);
664                         *(int64_t*)ptr = float2int8_avx(_v);
665 #else
666                         __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
667                         __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
668                         _v0 = _mm_add_ps(_bias0, _mm_mul_ps(_v0, _scale_in0));
669                         _v1 = _mm_add_ps(_bias1, _mm_mul_ps(_v1, _scale_in1));
670                         _v0 = activation_sse(_v0, activation_type, activation_params);
671                         _v1 = activation_sse(_v1, activation_type, activation_params);
672                         _v0 = _mm_mul_ps(_v0, _scale_out0);
673                         _v1 = _mm_mul_ps(_v1, _scale_out1);
674                         *(int64_t*)ptr = float2int8_sse(_v0, _v1);
675 #endif
676 
677                         intptr += 8;
678                         ptr += 8;
679                     }
680                 }
681             }
682         }
683 
684         return 0;
685     }
686 
687     if (elempack == 4)
688     {
689         if (dims == 1)
690         {
691             int w = bottom_blob.w;
692             int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1;
693             int outw = w * elempack / out_elempack;
694 
695             top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator);
696             if (top_blob.empty())
697                 return -100;
698 
699             if (scale_in_data_size == 1 && scale_out_data_size == 1)
700             {
701                 __m128 _scale_in = _mm_set1_ps(scale_in_data[0]);
702                 __m128 _scale_out = _mm_set1_ps(scale_out_data[0]);
703 
704                 if (bias_data_size == 0)
705                 {
706                     #pragma omp parallel for num_threads(opt.num_threads)
707                     for (int i = 0; i < w; i++)
708                     {
709                         const int* intptr = (const int*)bottom_blob + i * 4;
710                         signed char* ptr = (signed char*)top_blob + i * 4;
711 
712                         __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
713                         _v = _mm_mul_ps(_v, _scale_in);
714                         _v = activation_sse(_v, activation_type, activation_params);
715                         _v = _mm_mul_ps(_v, _scale_out);
716                         int64_t v = float2int8_sse(_v, _v);
717                         ptr[0] = (v >> 56) & 0xff;
718                         ptr[1] = (v >> 48) & 0xff;
719                         ptr[2] = (v >> 40) & 0xff;
720                         ptr[3] = (v >> 32) & 0xff;
721                     }
722                 }
723                 else if (bias_data_size == 1)
724                 {
725                     __m128 _bias = _mm_set1_ps(bias_data[0]);
726 
727                     #pragma omp parallel for num_threads(opt.num_threads)
728                     for (int i = 0; i < w; i++)
729                     {
730                         const int* intptr = (const int*)bottom_blob + i * 4;
731                         signed char* ptr = (signed char*)top_blob + i * 4;
732 
733                         __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
734                         _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale_in));
735                         _v = activation_sse(_v, activation_type, activation_params);
736                         _v = _mm_mul_ps(_v, _scale_out);
737                         int64_t v = float2int8_sse(_v, _v);
738                         ptr[0] = (v >> 56) & 0xff;
739                         ptr[1] = (v >> 48) & 0xff;
740                         ptr[2] = (v >> 40) & 0xff;
741                         ptr[3] = (v >> 32) & 0xff;
742                     }
743                 }
744                 else
745                 {
746                     #pragma omp parallel for num_threads(opt.num_threads)
747                     for (int i = 0; i < w; i++)
748                     {
749                         const int* intptr = (const int*)bottom_blob + i * 4;
750                         signed char* ptr = (signed char*)top_blob + i * 4;
751 
752                         __m128 _bias = _mm_loadu_ps((const float*)bias_data + i * 4);
753                         __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
754                         _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale_in));
755                         _v = activation_sse(_v, activation_type, activation_params);
756                         _v = _mm_mul_ps(_v, _scale_out);
757                         int64_t v = float2int8_sse(_v, _v);
758                         ptr[0] = (v >> 56) & 0xff;
759                         ptr[1] = (v >> 48) & 0xff;
760                         ptr[2] = (v >> 40) & 0xff;
761                         ptr[3] = (v >> 32) & 0xff;
762                     }
763                 }
764             }
765             else if (scale_in_data_size == 1 && scale_out_data_size > 1)
766             {
767                 __m128 _scale_in = _mm_set1_ps(scale_in_data[0]);
768 
769                 if (bias_data_size == 0)
770                 {
771                     #pragma omp parallel for num_threads(opt.num_threads)
772                     for (int i = 0; i < w; i++)
773                     {
774                         const int* intptr = (const int*)bottom_blob + i * 4;
775                         signed char* ptr = (signed char*)top_blob + i * 4;
776 
777                         __m128 _scale_out = _mm_loadu_ps((const float*)scale_out_data + i * 4);
778                         __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
779                         _v = _mm_mul_ps(_v, _scale_in);
780                         _v = activation_sse(_v, activation_type, activation_params);
781                         _v = _mm_mul_ps(_v, _scale_out);
782                         int64_t v = float2int8_sse(_v, _v);
783                         ptr[0] = (v >> 56) & 0xff;
784                         ptr[1] = (v >> 48) & 0xff;
785                         ptr[2] = (v >> 40) & 0xff;
786                         ptr[3] = (v >> 32) & 0xff;
787                     }
788                 }
789                 else if (bias_data_size == 1)
790                 {
791                     __m128 _bias = _mm_set1_ps(bias_data[0]);
792 
793                     #pragma omp parallel for num_threads(opt.num_threads)
794                     for (int i = 0; i < w; i++)
795                     {
796                         const int* intptr = (const int*)bottom_blob + i * 4;
797                         signed char* ptr = (signed char*)top_blob + i * 4;
798 
799                         __m128 _scale_out = _mm_loadu_ps((const float*)scale_out_data + i * 4);
800                         __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
801                         _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale_in));
802                         _v = activation_sse(_v, activation_type, activation_params);
803                         _v = _mm_mul_ps(_v, _scale_out);
804                         int64_t v = float2int8_sse(_v, _v);
805                         ptr[0] = (v >> 56) & 0xff;
806                         ptr[1] = (v >> 48) & 0xff;
807                         ptr[2] = (v >> 40) & 0xff;
808                         ptr[3] = (v >> 32) & 0xff;
809                     }
810                 }
811                 else
812                 {
813                     #pragma omp parallel for num_threads(opt.num_threads)
814                     for (int i = 0; i < w; i++)
815                     {
816                         const int* intptr = (const int*)bottom_blob + i * 4;
817                         signed char* ptr = (signed char*)top_blob + i * 4;
818 
819                         __m128 _scale_out = _mm_loadu_ps((const float*)scale_out_data + i * 4);
820                         __m128 _bias = _mm_loadu_ps((const float*)bias_data + i * 4);
821                         __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
822                         _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale_in));
823                         _v = activation_sse(_v, activation_type, activation_params);
824                         _v = _mm_mul_ps(_v, _scale_out);
825                         int64_t v = float2int8_sse(_v, _v);
826                         ptr[0] = (v >> 56) & 0xff;
827                         ptr[1] = (v >> 48) & 0xff;
828                         ptr[2] = (v >> 40) & 0xff;
829                         ptr[3] = (v >> 32) & 0xff;
830                     }
831                 }
832             }
833             else if (scale_in_data_size > 1 && scale_out_data_size == 1)
834             {
835                 __m128 _scale_out = _mm_set1_ps(scale_out_data[0]);
836 
837                 if (bias_data_size == 0)
838                 {
839                     #pragma omp parallel for num_threads(opt.num_threads)
840                     for (int i = 0; i < w; i++)
841                     {
842                         const int* intptr = (const int*)bottom_blob + i * 4;
843                         signed char* ptr = (signed char*)top_blob + i * 4;
844 
845                         __m128 _scale_in = _mm_loadu_ps((const float*)scale_in_data + i * 4);
846                         __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
847                         _v = _mm_mul_ps(_v, _scale_in);
848                         _v = activation_sse(_v, activation_type, activation_params);
849                         _v = _mm_mul_ps(_v, _scale_out);
850                         int64_t v = float2int8_sse(_v, _v);
851                         ptr[0] = (v >> 56) & 0xff;
852                         ptr[1] = (v >> 48) & 0xff;
853                         ptr[2] = (v >> 40) & 0xff;
854                         ptr[3] = (v >> 32) & 0xff;
855                     }
856                 }
857                 else if (bias_data_size == 1)
858                 {
859                     __m128 _bias = _mm_set1_ps(bias_data[0]);
860 
861                     #pragma omp parallel for num_threads(opt.num_threads)
862                     for (int i = 0; i < w; i++)
863                     {
864                         const int* intptr = (const int*)bottom_blob + i * 4;
865                         signed char* ptr = (signed char*)top_blob + i * 4;
866 
867                         __m128 _scale_in = _mm_loadu_ps((const float*)scale_in_data + i * 4);
868                         __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
869                         _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale_in));
870                         _v = activation_sse(_v, activation_type, activation_params);
871                         _v = _mm_mul_ps(_v, _scale_out);
872                         int64_t v = float2int8_sse(_v, _v);
873                         ptr[0] = (v >> 56) & 0xff;
874                         ptr[1] = (v >> 48) & 0xff;
875                         ptr[2] = (v >> 40) & 0xff;
876                         ptr[3] = (v >> 32) & 0xff;
877                     }
878                 }
879                 else
880                 {
881                     #pragma omp parallel for num_threads(opt.num_threads)
882                     for (int i = 0; i < w; i++)
883                     {
884                         const int* intptr = (const int*)bottom_blob + i * 4;
885                         signed char* ptr = (signed char*)top_blob + i * 4;
886 
887                         __m128 _scale_in = _mm_loadu_ps((const float*)scale_in_data + i * 4);
888                         __m128 _bias = _mm_loadu_ps((const float*)bias_data + i * 4);
889                         __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
890                         _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale_in));
891                         _v = activation_sse(_v, activation_type, activation_params);
892                         _v = _mm_mul_ps(_v, _scale_out);
893                         int64_t v = float2int8_sse(_v, _v);
894                         ptr[0] = (v >> 56) & 0xff;
895                         ptr[1] = (v >> 48) & 0xff;
896                         ptr[2] = (v >> 40) & 0xff;
897                         ptr[3] = (v >> 32) & 0xff;
898                     }
899                 }
900             }
901             else // if (scale_in_data_size > 1 && scale_out_data_size > 1)
902             {
903                 if (bias_data_size == 0)
904                 {
905                     #pragma omp parallel for num_threads(opt.num_threads)
906                     for (int i = 0; i < w; i++)
907                     {
908                         const int* intptr = (const int*)bottom_blob + i * 4;
909                         signed char* ptr = (signed char*)top_blob + i * 4;
910 
911                         __m128 _scale_in = _mm_loadu_ps((const float*)scale_in_data + i * 4);
912                         __m128 _scale_out = _mm_loadu_ps((const float*)scale_out_data + i * 4);
913                         __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
914                         _v = _mm_mul_ps(_v, _scale_in);
915                         _v = activation_sse(_v, activation_type, activation_params);
916                         _v = _mm_mul_ps(_v, _scale_out);
917                         int64_t v = float2int8_sse(_v, _v);
918                         ptr[0] = (v >> 56) & 0xff;
919                         ptr[1] = (v >> 48) & 0xff;
920                         ptr[2] = (v >> 40) & 0xff;
921                         ptr[3] = (v >> 32) & 0xff;
922                     }
923                 }
924                 else if (bias_data_size == 1)
925                 {
926                     __m128 _bias = _mm_set1_ps(bias_data[0]);
927 
928                     #pragma omp parallel for num_threads(opt.num_threads)
929                     for (int i = 0; i < w; i++)
930                     {
931                         const int* intptr = (const int*)bottom_blob + i * 4;
932                         signed char* ptr = (signed char*)top_blob + i * 4;
933 
934                         __m128 _scale_in = _mm_loadu_ps((const float*)scale_in_data + i * 4);
935                         __m128 _scale_out = _mm_loadu_ps((const float*)scale_out_data + i * 4);
936                         __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
937                         _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale_in));
938                         _v = activation_sse(_v, activation_type, activation_params);
939                         _v = _mm_mul_ps(_v, _scale_out);
940                         int64_t v = float2int8_sse(_v, _v);
941                         ptr[0] = (v >> 56) & 0xff;
942                         ptr[1] = (v >> 48) & 0xff;
943                         ptr[2] = (v >> 40) & 0xff;
944                         ptr[3] = (v >> 32) & 0xff;
945                     }
946                 }
947                 else
948                 {
949                     #pragma omp parallel for num_threads(opt.num_threads)
950                     for (int i = 0; i < w; i++)
951                     {
952                         const int* intptr = (const int*)bottom_blob + i * 4;
953                         signed char* ptr = (signed char*)top_blob + i * 4;
954 
955                         __m128 _scale_in = _mm_loadu_ps((const float*)scale_in_data + i * 4);
956                         __m128 _scale_out = _mm_loadu_ps((const float*)scale_out_data + i * 4);
957                         __m128 _bias = _mm_loadu_ps((const float*)bias_data + i * 4);
958                         __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
959                         _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale_in));
960                         _v = activation_sse(_v, activation_type, activation_params);
961                         _v = _mm_mul_ps(_v, _scale_out);
962                         int64_t v = float2int8_sse(_v, _v);
963                         ptr[0] = (v >> 56) & 0xff;
964                         ptr[1] = (v >> 48) & 0xff;
965                         ptr[2] = (v >> 40) & 0xff;
966                         ptr[3] = (v >> 32) & 0xff;
967                     }
968                 }
969             }
970         }
971 
972         if (dims == 2)
973         {
974             int w = bottom_blob.w;
975             int h = bottom_blob.h;
976             int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1;
977             int outh = h * elempack / out_elempack;
978 
979             top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator);
980             if (top_blob.empty())
981                 return -100;
982 
983             if (out_elempack == 8)
984             {
985                 if (bias_data_size == 0)
986                 {
987                     #pragma omp parallel for num_threads(opt.num_threads)
988                     for (int i = 0; i < outh; i++)
989                     {
990                         const int* intptr0 = bottom_blob.row<const int>(i * 2);
991                         const int* intptr1 = bottom_blob.row<const int>(i * 2 + 1);
992                         signed char* ptr = top_blob.row<signed char>(i);
993 
994                         __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8);
995                         __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8 + 4);
996                         __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8);
997                         __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8 + 4);
998 
999                         for (int j = 0; j < w; j++)
1000                         {
1001                             __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr0));
1002                             __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr1));
1003                             _v0 = _mm_mul_ps(_v0, _scale_in0);
1004                             _v1 = _mm_mul_ps(_v1, _scale_in1);
1005                             _v0 = activation_sse(_v0, activation_type, activation_params);
1006                             _v1 = activation_sse(_v1, activation_type, activation_params);
1007                             _v0 = _mm_mul_ps(_v0, _scale_out0);
1008                             _v1 = _mm_mul_ps(_v1, _scale_out1);
1009                             *(int64_t*)ptr = float2int8_sse(_v0, _v1);
1010 
1011                             intptr0 += 4;
1012                             intptr1 += 4;
1013                             ptr += 8;
1014                         }
1015                     }
1016                 }
1017                 else
1018                 {
1019                     #pragma omp parallel for num_threads(opt.num_threads)
1020                     for (int i = 0; i < outh; i++)
1021                     {
1022                         const int* intptr0 = bottom_blob.row<const int>(i * 2);
1023                         const int* intptr1 = bottom_blob.row<const int>(i * 2 + 1);
1024                         signed char* ptr = top_blob.row<signed char>(i);
1025 
1026                         __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8);
1027                         __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8 + 4);
1028                         __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8);
1029                         __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8 + 4);
1030                         __m128 _bias0 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8);
1031                         __m128 _bias1 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8 + 4);
1032 
1033                         for (int j = 0; j < w; j++)
1034                         {
1035                             __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr0));
1036                             __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr1));
1037                             _v0 = _mm_add_ps(_bias0, _mm_mul_ps(_v0, _scale_in0));
1038                             _v1 = _mm_add_ps(_bias1, _mm_mul_ps(_v1, _scale_in1));
1039                             _v0 = activation_sse(_v0, activation_type, activation_params);
1040                             _v1 = activation_sse(_v1, activation_type, activation_params);
1041                             _v0 = _mm_mul_ps(_v0, _scale_out0);
1042                             _v1 = _mm_mul_ps(_v1, _scale_out1);
1043                             *(int64_t*)ptr = float2int8_sse(_v0, _v1);
1044 
1045                             intptr0 += 4;
1046                             intptr1 += 4;
1047                             ptr += 8;
1048                         }
1049                     }
1050                 }
1051             }
1052             if (out_elempack == 1)
1053             {
1054                 if (bias_data_size == 0)
1055                 {
1056                     #pragma omp parallel for num_threads(opt.num_threads)
1057                     for (int i = 0; i < h; i++)
1058                     {
1059                         const int* intptr = bottom_blob.row<const int>(i);
1060                         signed char* ptr0 = top_blob.row<signed char>(i * 4);
1061                         signed char* ptr1 = top_blob.row<signed char>(i * 4 + 1);
1062                         signed char* ptr2 = top_blob.row<signed char>(i * 4 + 2);
1063                         signed char* ptr3 = top_blob.row<signed char>(i * 4 + 3);
1064 
1065                         __m128 _scale_in = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 4);
1066                         __m128 _scale_out = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 4);
1067 
1068                         for (int j = 0; j < w; j++)
1069                         {
1070                             __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
1071                             _v = _mm_mul_ps(_v, _scale_in);
1072                             _v = activation_sse(_v, activation_type, activation_params);
1073                             _v = _mm_mul_ps(_v, _scale_out);
1074                             int64_t v = float2int8_sse(_v, _v);
1075                             ptr0[0] = (v >> 56) & 0xff;
1076                             ptr1[0] = (v >> 48) & 0xff;
1077                             ptr2[0] = (v >> 40) & 0xff;
1078                             ptr3[0] = (v >> 32) & 0xff;
1079 
1080                             intptr += 4;
1081                             ptr0 += 1;
1082                             ptr1 += 1;
1083                             ptr2 += 1;
1084                             ptr3 += 1;
1085                         }
1086                     }
1087                 }
1088                 else
1089                 {
1090                     #pragma omp parallel for num_threads(opt.num_threads)
1091                     for (int i = 0; i < h; i++)
1092                     {
1093                         const int* intptr = bottom_blob.row<const int>(i);
1094                         signed char* ptr0 = top_blob.row<signed char>(i * 4);
1095                         signed char* ptr1 = top_blob.row<signed char>(i * 4 + 1);
1096                         signed char* ptr2 = top_blob.row<signed char>(i * 4 + 2);
1097                         signed char* ptr3 = top_blob.row<signed char>(i * 4 + 3);
1098 
1099                         __m128 _scale_in = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 4);
1100                         __m128 _scale_out = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 4);
1101                         __m128 _bias = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 4);
1102 
1103                         for (int j = 0; j < w; j++)
1104                         {
1105                             __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
1106                             _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale_in));
1107                             _v = activation_sse(_v, activation_type, activation_params);
1108                             _v = _mm_mul_ps(_v, _scale_out);
1109                             int64_t v = float2int8_sse(_v, _v);
1110                             ptr0[0] = (v >> 56) & 0xff;
1111                             ptr1[0] = (v >> 48) & 0xff;
1112                             ptr2[0] = (v >> 40) & 0xff;
1113                             ptr3[0] = (v >> 32) & 0xff;
1114 
1115                             intptr += 4;
1116                             ptr0 += 1;
1117                             ptr1 += 1;
1118                             ptr2 += 1;
1119                             ptr3 += 1;
1120                         }
1121                     }
1122                 }
1123             }
1124         }
1125 
1126         if (dims == 3)
1127         {
1128             int w = bottom_blob.w;
1129             int h = bottom_blob.h;
1130             int channels = bottom_blob.c;
1131             int size = w * h;
1132             int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1;
1133             int outc = channels * elempack / out_elempack;
1134 
1135             top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator);
1136             if (top_blob.empty())
1137                 return -100;
1138 
1139             if (out_elempack == 8)
1140             {
1141                 if (bias_data_size == 0)
1142                 {
1143                     #pragma omp parallel for num_threads(opt.num_threads)
1144                     for (int q = 0; q < outc; q++)
1145                     {
1146                         const int* intptr0 = bottom_blob.channel(q * 2);
1147                         const int* intptr1 = bottom_blob.channel(q * 2 + 1);
1148                         signed char* ptr = top_blob.channel(q);
1149 
1150                         __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + q * 8);
1151                         __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + q * 8 + 4);
1152                         __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + q * 8);
1153                         __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + q * 8 + 4);
1154 
1155                         for (int i = 0; i < size; i++)
1156                         {
1157                             __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr0));
1158                             __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr1));
1159                             _v0 = _mm_mul_ps(_v0, _scale_in0);
1160                             _v1 = _mm_mul_ps(_v1, _scale_in1);
1161                             _v0 = activation_sse(_v0, activation_type, activation_params);
1162                             _v1 = activation_sse(_v1, activation_type, activation_params);
1163                             _v0 = _mm_mul_ps(_v0, _scale_out0);
1164                             _v1 = _mm_mul_ps(_v1, _scale_out1);
1165                             *(int64_t*)ptr = float2int8_sse(_v0, _v1);
1166 
1167                             intptr0 += 4;
1168                             intptr1 += 4;
1169                             ptr += 8;
1170                         }
1171                     }
1172                 }
1173                 else
1174                 {
1175                     #pragma omp parallel for num_threads(opt.num_threads)
1176                     for (int q = 0; q < outc; q++)
1177                     {
1178                         const int* intptr0 = bottom_blob.channel(q * 2);
1179                         const int* intptr1 = bottom_blob.channel(q * 2 + 1);
1180                         signed char* ptr = top_blob.channel(q);
1181 
1182                         __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + q * 8);
1183                         __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + q * 8 + 4);
1184                         __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + q * 8);
1185                         __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + q * 8 + 4);
1186                         __m128 _bias0 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + q * 8);
1187                         __m128 _bias1 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + q * 8 + 4);
1188 
1189                         for (int i = 0; i < size; i++)
1190                         {
1191                             __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr0));
1192                             __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr1));
1193                             _v0 = _mm_add_ps(_bias0, _mm_mul_ps(_v0, _scale_in0));
1194                             _v1 = _mm_add_ps(_bias1, _mm_mul_ps(_v1, _scale_in1));
1195                             _v0 = activation_sse(_v0, activation_type, activation_params);
1196                             _v1 = activation_sse(_v1, activation_type, activation_params);
1197                             _v0 = _mm_mul_ps(_v0, _scale_out0);
1198                             _v1 = _mm_mul_ps(_v1, _scale_out1);
1199                             *(int64_t*)ptr = float2int8_sse(_v0, _v1);
1200 
1201                             intptr0 += 4;
1202                             intptr1 += 4;
1203                             ptr += 8;
1204                         }
1205                     }
1206                 }
1207             }
1208             if (out_elempack == 1)
1209             {
1210                 if (bias_data_size == 0)
1211                 {
1212                     #pragma omp parallel for num_threads(opt.num_threads)
1213                     for (int q = 0; q < channels; q++)
1214                     {
1215                         const int* intptr = bottom_blob.channel(q);
1216                         signed char* ptr0 = top_blob.channel(q * 4);
1217                         signed char* ptr1 = top_blob.channel(q * 4 + 1);
1218                         signed char* ptr2 = top_blob.channel(q * 4 + 2);
1219                         signed char* ptr3 = top_blob.channel(q * 4 + 3);
1220 
1221                         __m128 _scale_in = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + q * 4);
1222                         __m128 _scale_out = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + q * 4);
1223 
1224                         for (int i = 0; i < size; i++)
1225                         {
1226                             __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
1227                             _v = _mm_mul_ps(_v, _scale_in);
1228                             _v = activation_sse(_v, activation_type, activation_params);
1229                             _v = _mm_mul_ps(_v, _scale_out);
1230                             int64_t v = float2int8_sse(_v, _v);
1231                             ptr0[0] = (v >> 56) & 0xff;
1232                             ptr1[0] = (v >> 48) & 0xff;
1233                             ptr2[0] = (v >> 40) & 0xff;
1234                             ptr3[0] = (v >> 32) & 0xff;
1235 
1236                             intptr += 4;
1237                             ptr0 += 1;
1238                             ptr1 += 1;
1239                             ptr2 += 1;
1240                             ptr3 += 1;
1241                         }
1242                     }
1243                 }
1244                 else
1245                 {
1246                     #pragma omp parallel for num_threads(opt.num_threads)
1247                     for (int q = 0; q < channels; q++)
1248                     {
1249                         const int* intptr = bottom_blob.channel(q);
1250                         signed char* ptr0 = top_blob.channel(q * 4);
1251                         signed char* ptr1 = top_blob.channel(q * 4 + 1);
1252                         signed char* ptr2 = top_blob.channel(q * 4 + 2);
1253                         signed char* ptr3 = top_blob.channel(q * 4 + 3);
1254 
1255                         __m128 _scale_in = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + q * 4);
1256                         __m128 _scale_out = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + q * 4);
1257                         __m128 _bias = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + q * 4);
1258 
1259                         for (int i = 0; i < size; i++)
1260                         {
1261                             __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
1262                             _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale_in));
1263                             _v = activation_sse(_v, activation_type, activation_params);
1264                             _v = _mm_mul_ps(_v, _scale_out);
1265                             int64_t v = float2int8_sse(_v, _v);
1266                             ptr0[0] = (v >> 56) & 0xff;
1267                             ptr1[0] = (v >> 48) & 0xff;
1268                             ptr2[0] = (v >> 40) & 0xff;
1269                             ptr3[0] = (v >> 32) & 0xff;
1270 
1271                             intptr += 4;
1272                             ptr0 += 1;
1273                             ptr1 += 1;
1274                             ptr2 += 1;
1275                             ptr3 += 1;
1276                         }
1277                     }
1278                 }
1279             }
1280         }
1281 
1282         return 0;
1283     }
1284 #endif // __SSE2__
1285 
1286     if (dims == 1)
1287     {
1288         int w = bottom_blob.w;
1289 
1290         top_blob.create(w, (size_t)1u, opt.blob_allocator);
1291         if (top_blob.empty())
1292             return -100;
1293 
1294         const int* intptr = bottom_blob;
1295         signed char* ptr = top_blob;
1296 
1297         if (scale_in_data_size == 1 && scale_out_data_size == 1)
1298         {
1299             const float scale_in = scale_in_data[0];
1300             const float scale_out = scale_out_data[0];
1301 
1302             if (bias_data_size == 0)
1303             {
1304                 #pragma omp parallel for num_threads(opt.num_threads)
1305                 for (int i = 0; i < w; i++)
1306                 {
1307                     float v = intptr[i] * scale_in;
1308                     ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
1309                 }
1310             }
1311             else if (bias_data_size == 1)
1312             {
1313                 const float bias = bias_data[0];
1314 
1315                 #pragma omp parallel for num_threads(opt.num_threads)
1316                 for (int i = 0; i < w; i++)
1317                 {
1318                     float v = intptr[i] * scale_in + bias;
1319                     ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
1320                 }
1321             }
1322             else
1323             {
1324                 #pragma omp parallel for num_threads(opt.num_threads)
1325                 for (int i = 0; i < w; i++)
1326                 {
1327                     float v = intptr[i] * scale_in + bias_data[i];
1328                     ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
1329                 }
1330             }
1331         }
1332         else if (scale_in_data_size == 1 && scale_out_data_size > 1)
1333         {
1334             const float scale_in = scale_in_data[0];
1335 
1336             if (bias_data_size == 0)
1337             {
1338                 #pragma omp parallel for num_threads(opt.num_threads)
1339                 for (int i = 0; i < w; i++)
1340                 {
1341                     float v = intptr[i] * scale_in;
1342                     ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]);
1343                 }
1344             }
1345             else if (bias_data_size == 1)
1346             {
1347                 const float bias = bias_data[0];
1348 
1349                 #pragma omp parallel for num_threads(opt.num_threads)
1350                 for (int i = 0; i < w; i++)
1351                 {
1352                     float v = intptr[i] * scale_in + bias;
1353                     ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]);
1354                 }
1355             }
1356             else
1357             {
1358                 #pragma omp parallel for num_threads(opt.num_threads)
1359                 for (int i = 0; i < w; i++)
1360                 {
1361                     float v = intptr[i] * scale_in + bias_data[i];
1362                     ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]);
1363                 }
1364             }
1365         }
1366         else if (scale_in_data_size > 1 && scale_out_data_size == 1)
1367         {
1368             const float scale_out = scale_out_data[0];
1369 
1370             if (bias_data_size == 0)
1371             {
1372                 #pragma omp parallel for num_threads(opt.num_threads)
1373                 for (int i = 0; i < w; i++)
1374                 {
1375                     float v = intptr[i] * scale_in_data[i];
1376                     ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
1377                 }
1378             }
1379             else if (bias_data_size == 1)
1380             {
1381                 const float bias = bias_data[0];
1382 
1383                 #pragma omp parallel for num_threads(opt.num_threads)
1384                 for (int i = 0; i < w; i++)
1385                 {
1386                     float v = intptr[i] * scale_in_data[i] + bias;
1387                     ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
1388                 }
1389             }
1390             else
1391             {
1392                 #pragma omp parallel for num_threads(opt.num_threads)
1393                 for (int i = 0; i < w; i++)
1394                 {
1395                     float v = intptr[i] * scale_in_data[i] + bias_data[i];
1396                     ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
1397                 }
1398             }
1399         }
1400         else // if (scale_in_data_size > 1 && scale_out_data_size > 1)
1401         {
1402             if (bias_data_size == 0)
1403             {
1404                 #pragma omp parallel for num_threads(opt.num_threads)
1405                 for (int i = 0; i < w; i++)
1406                 {
1407                     float v = intptr[i] * scale_in_data[i];
1408                     ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]);
1409                 }
1410             }
1411             else if (bias_data_size == 1)
1412             {
1413                 const float bias = bias_data[0];
1414 
1415                 #pragma omp parallel for num_threads(opt.num_threads)
1416                 for (int i = 0; i < w; i++)
1417                 {
1418                     float v = intptr[i] * scale_in_data[i] + bias;
1419                     ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]);
1420                 }
1421             }
1422             else
1423             {
1424                 #pragma omp parallel for num_threads(opt.num_threads)
1425                 for (int i = 0; i < w; i++)
1426                 {
1427                     float v = intptr[i] * scale_in_data[i] + bias_data[i];
1428                     ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]);
1429                 }
1430             }
1431         }
1432     }
1433 
1434     if (dims == 2)
1435     {
1436         int w = bottom_blob.w;
1437         int h = bottom_blob.h;
1438 
1439         top_blob.create(w, h, (size_t)1u, opt.blob_allocator);
1440         if (top_blob.empty())
1441             return -100;
1442 
1443         if (bias_data_size == 0)
1444         {
1445             #pragma omp parallel for num_threads(opt.num_threads)
1446             for (int i = 0; i < h; i++)
1447             {
1448                 const int* intptr = bottom_blob.row<const int>(i);
1449                 signed char* ptr = top_blob.row<signed char>(i);
1450 
1451                 const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i];
1452                 const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i];
1453 
1454                 for (int j = 0; j < w; j++)
1455                 {
1456                     float v = intptr[j] * scale_in;
1457                     ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
1458                 }
1459             }
1460         }
1461         else
1462         {
1463             #pragma omp parallel for num_threads(opt.num_threads)
1464             for (int i = 0; i < h; i++)
1465             {
1466                 const int* intptr = bottom_blob.row<const int>(i);
1467                 signed char* ptr = top_blob.row<signed char>(i);
1468 
1469                 const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i];
1470                 const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i];
1471                 const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[i];
1472 
1473                 for (int j = 0; j < w; j++)
1474                 {
1475                     float v = intptr[j] * scale_in + bias;
1476                     ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
1477                 }
1478             }
1479         }
1480     }
1481 
1482     if (dims == 3)
1483     {
1484         int w = bottom_blob.w;
1485         int h = bottom_blob.h;
1486         int channels = bottom_blob.c;
1487         int size = w * h;
1488 
1489         top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator);
1490         if (top_blob.empty())
1491             return -100;
1492 
1493         if (bias_data_size == 0)
1494         {
1495             #pragma omp parallel for num_threads(opt.num_threads)
1496             for (int q = 0; q < channels; q++)
1497             {
1498                 const int* intptr = bottom_blob.channel(q);
1499                 signed char* ptr = top_blob.channel(q);
1500 
1501                 const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q];
1502                 const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q];
1503 
1504                 for (int i = 0; i < size; i++)
1505                 {
1506                     float v = intptr[i] * scale_in;
1507                     ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
1508                 }
1509             }
1510         }
1511         else
1512         {
1513             #pragma omp parallel for num_threads(opt.num_threads)
1514             for (int q = 0; q < channels; q++)
1515             {
1516                 const int* intptr = bottom_blob.channel(q);
1517                 signed char* ptr = top_blob.channel(q);
1518 
1519                 const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q];
1520                 const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q];
1521                 const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q];
1522 
1523                 for (int i = 0; i < size; i++)
1524                 {
1525                     float v = intptr[i] * scale_in + bias;
1526                     ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
1527                 }
1528             }
1529         }
1530     }
1531 
1532     return 0;
1533 }
1534 
1535 } // namespace ncnn
1536