// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "requantize_x86.h"

#if __SSE2__
#include <emmintrin.h>
#if __AVX__
#include <immintrin.h>
#endif // __AVX__
#endif // __SSE2__

#include "x86_activation.h"
#include "x86_usability.h"

namespace ncnn {

Requantize_x86::Requantize_x86()
{
#if __SSE2__
    support_packing = true;
#endif // __SSE2__
}

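// requantize: int8 = float2int8(activation(int32 * scale_in + bias) * scale_out)
// scale_in dequantizes the int32 accumulator, the optional bias and activation
// are applied in float, then scale_out quantizes the result back to int8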
int Requantize_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int dims = bottom_blob.dims;
    int elempack = bottom_blob.elempack;

#if __SSE2__
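    // pack8 path: AVX handles all 8 lanes of an element at once, while plain
    // SSE2 splits each element into two float4 halves (_v0 / _v1); the branches
    // below specialize for scalar vs per-element scale_in, scale_out and bias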
    if (elempack == 8)
    {
        if (dims == 1)
        {
            int w = bottom_blob.w;

            top_blob.create(w, (size_t)8u, 8, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            if (scale_in_data_size == 1 && scale_out_data_size == 1)
            {
#if __AVX__
                __m256 _scale_in = _mm256_set1_ps(scale_in_data[0]);
                __m256 _scale_out = _mm256_set1_ps(scale_out_data[0]);
#else
                __m128 _scale_in = _mm_set1_ps(scale_in_data[0]);
                __m128 _scale_out = _mm_set1_ps(scale_out_data[0]);
#endif

                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 8;
                        signed char* ptr = (signed char*)top_blob + i * 8;

#if __AVX__
                        __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
                        _v = _mm256_mul_ps(_v, _scale_in);
                        _v = activation_avx(_v, activation_type, activation_params);
                        _v = _mm256_mul_ps(_v, _scale_out);
                        *(int64_t*)ptr = float2int8_avx(_v);
#else
                        __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
                        _v0 = _mm_mul_ps(_v0, _scale_in);
                        _v1 = _mm_mul_ps(_v1, _scale_in);
                        _v0 = activation_sse(_v0, activation_type, activation_params);
                        _v1 = activation_sse(_v1, activation_type, activation_params);
                        _v0 = _mm_mul_ps(_v0, _scale_out);
                        _v1 = _mm_mul_ps(_v1, _scale_out);
                        *(int64_t*)ptr = float2int8_sse(_v0, _v1);
#endif
                    }
                }
                else if (bias_data_size == 1)
                {
#if __AVX__
                    __m256 _bias = _mm256_set1_ps(bias_data[0]);
#else
                    __m128 _bias = _mm_set1_ps(bias_data[0]);
#endif

                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 8;
                        signed char* ptr = (signed char*)top_blob + i * 8;

#if __AVX__
                        __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
                        _v = _mm256_fmadd_ps(_v, _scale_in, _bias);
                        _v = activation_avx(_v, activation_type, activation_params);
                        _v = _mm256_mul_ps(_v, _scale_out);
                        *(int64_t*)ptr = float2int8_avx(_v);
#else
                        __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
                        _v0 = _mm_add_ps(_bias, _mm_mul_ps(_v0, _scale_in));
                        _v1 = _mm_add_ps(_bias, _mm_mul_ps(_v1, _scale_in));
                        _v0 = activation_sse(_v0, activation_type, activation_params);
                        _v1 = activation_sse(_v1, activation_type, activation_params);
                        _v0 = _mm_mul_ps(_v0, _scale_out);
                        _v1 = _mm_mul_ps(_v1, _scale_out);
                        *(int64_t*)ptr = float2int8_sse(_v0, _v1);
#endif
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 8;
                        signed char* ptr = (signed char*)top_blob + i * 8;

#if __AVX__
                        __m256 _bias = _mm256_loadu_ps((const float*)bias_data + i * 8);
                        __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
                        _v = _mm256_fmadd_ps(_v, _scale_in, _bias);
                        _v = activation_avx(_v, activation_type, activation_params);
                        _v = _mm256_mul_ps(_v, _scale_out);
                        *(int64_t*)ptr = float2int8_avx(_v);
#else
                        __m128 _bias0 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8);
                        __m128 _bias1 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8 + 4);
                        __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
                        _v0 = _mm_add_ps(_bias0, _mm_mul_ps(_v0, _scale_in));
                        _v1 = _mm_add_ps(_bias1, _mm_mul_ps(_v1, _scale_in));
                        _v0 = activation_sse(_v0, activation_type, activation_params);
                        _v1 = activation_sse(_v1, activation_type, activation_params);
                        _v0 = _mm_mul_ps(_v0, _scale_out);
                        _v1 = _mm_mul_ps(_v1, _scale_out);
                        *(int64_t*)ptr = float2int8_sse(_v0, _v1);
#endif
                    }
                }
            }
            else if (scale_in_data_size == 1 && scale_out_data_size > 1)
            {
#if __AVX__
                __m256 _scale_in = _mm256_set1_ps(scale_in_data[0]);
#else
                __m128 _scale_in = _mm_set1_ps(scale_in_data[0]);
#endif

                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 8;
                        signed char* ptr = (signed char*)top_blob + i * 8;

#if __AVX__
                        __m256 _scale_out = _mm256_loadu_ps((const float*)scale_out_data + i * 8);
                        __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
                        _v = _mm256_mul_ps(_v, _scale_in);
                        _v = activation_avx(_v, activation_type, activation_params);
                        _v = _mm256_mul_ps(_v, _scale_out);
                        *(int64_t*)ptr = float2int8_avx(_v);
#else
                        __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8);
                        __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8 + 4);
                        __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
                        _v0 = _mm_mul_ps(_v0, _scale_in);
                        _v1 = _mm_mul_ps(_v1, _scale_in);
                        _v0 = activation_sse(_v0, activation_type, activation_params);
                        _v1 = activation_sse(_v1, activation_type, activation_params);
                        _v0 = _mm_mul_ps(_v0, _scale_out0);
                        _v1 = _mm_mul_ps(_v1, _scale_out1);
                        *(int64_t*)ptr = float2int8_sse(_v0, _v1);
#endif
                    }
                }
                else if (bias_data_size == 1)
                {
#if __AVX__
                    __m256 _bias = _mm256_set1_ps(bias_data[0]);
#else
                    __m128 _bias = _mm_set1_ps(bias_data[0]);
#endif

                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 8;
                        signed char* ptr = (signed char*)top_blob + i * 8;

#if __AVX__
                        __m256 _scale_out = _mm256_loadu_ps((const float*)scale_out_data + i * 8);
                        __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
                        _v = _mm256_fmadd_ps(_v, _scale_in, _bias);
                        _v = activation_avx(_v, activation_type, activation_params);
                        _v = _mm256_mul_ps(_v, _scale_out);
                        *(int64_t*)ptr = float2int8_avx(_v);
#else
                        __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8);
                        __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8 + 4);
                        __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
                        _v0 = _mm_add_ps(_bias, _mm_mul_ps(_v0, _scale_in));
                        _v1 = _mm_add_ps(_bias, _mm_mul_ps(_v1, _scale_in));
                        _v0 = activation_sse(_v0, activation_type, activation_params);
                        _v1 = activation_sse(_v1, activation_type, activation_params);
                        _v0 = _mm_mul_ps(_v0, _scale_out0);
                        _v1 = _mm_mul_ps(_v1, _scale_out1);
                        *(int64_t*)ptr = float2int8_sse(_v0, _v1);
#endif
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 8;
                        signed char* ptr = (signed char*)top_blob + i * 8;

#if __AVX__
                        __m256 _scale_out = _mm256_loadu_ps((const float*)scale_out_data + i * 8);
                        __m256 _bias = _mm256_loadu_ps((const float*)bias_data + i * 8);
                        __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
                        _v = _mm256_fmadd_ps(_v, _scale_in, _bias);
                        _v = activation_avx(_v, activation_type, activation_params);
                        _v = _mm256_mul_ps(_v, _scale_out);
                        *(int64_t*)ptr = float2int8_avx(_v);
#else
                        __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8);
                        __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8 + 4);
                        __m128 _bias0 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8);
                        __m128 _bias1 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8 + 4);
                        __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
                        _v0 = _mm_add_ps(_bias0, _mm_mul_ps(_v0, _scale_in));
                        _v1 = _mm_add_ps(_bias1, _mm_mul_ps(_v1, _scale_in));
                        _v0 = activation_sse(_v0, activation_type, activation_params);
                        _v1 = activation_sse(_v1, activation_type, activation_params);
                        _v0 = _mm_mul_ps(_v0, _scale_out0);
                        _v1 = _mm_mul_ps(_v1, _scale_out1);
                        *(int64_t*)ptr = float2int8_sse(_v0, _v1);
#endif
                    }
                }
            }
            else if (scale_in_data_size > 1 && scale_out_data_size == 1)
            {
#if __AVX__
                __m256 _scale_out = _mm256_set1_ps(scale_out_data[0]);
#else
                __m128 _scale_out = _mm_set1_ps(scale_out_data[0]);
#endif

                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 8;
                        signed char* ptr = (signed char*)top_blob + i * 8;

#if __AVX__
                        __m256 _scale_in = _mm256_loadu_ps((const float*)scale_in_data + i * 8);
                        __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
                        _v = _mm256_mul_ps(_v, _scale_in);
                        _v = activation_avx(_v, activation_type, activation_params);
                        _v = _mm256_mul_ps(_v, _scale_out);
                        *(int64_t*)ptr = float2int8_avx(_v);
#else
                        __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8);
                        __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8 + 4);
                        __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
                        _v0 = _mm_mul_ps(_v0, _scale_in0);
                        _v1 = _mm_mul_ps(_v1, _scale_in1);
                        _v0 = activation_sse(_v0, activation_type, activation_params);
                        _v1 = activation_sse(_v1, activation_type, activation_params);
                        _v0 = _mm_mul_ps(_v0, _scale_out);
                        _v1 = _mm_mul_ps(_v1, _scale_out);
                        *(int64_t*)ptr = float2int8_sse(_v0, _v1);
#endif
                    }
                }
                else if (bias_data_size == 1)
                {
#if __AVX__
                    __m256 _bias = _mm256_set1_ps(bias_data[0]);
#else
                    __m128 _bias = _mm_set1_ps(bias_data[0]);
#endif

                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 8;
                        signed char* ptr = (signed char*)top_blob + i * 8;

#if __AVX__
                        __m256 _scale_in = _mm256_loadu_ps((const float*)scale_in_data + i * 8);
                        __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
                        _v = _mm256_fmadd_ps(_v, _scale_in, _bias);
                        _v = activation_avx(_v, activation_type, activation_params);
                        _v = _mm256_mul_ps(_v, _scale_out);
                        *(int64_t*)ptr = float2int8_avx(_v);
#else
                        __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8);
                        __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8 + 4);
                        __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
                        _v0 = _mm_add_ps(_bias, _mm_mul_ps(_v0, _scale_in0));
                        _v1 = _mm_add_ps(_bias, _mm_mul_ps(_v1, _scale_in1));
                        _v0 = activation_sse(_v0, activation_type, activation_params);
                        _v1 = activation_sse(_v1, activation_type, activation_params);
                        _v0 = _mm_mul_ps(_v0, _scale_out);
                        _v1 = _mm_mul_ps(_v1, _scale_out);
                        *(int64_t*)ptr = float2int8_sse(_v0, _v1);
#endif
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 8;
                        signed char* ptr = (signed char*)top_blob + i * 8;

#if __AVX__
                        __m256 _scale_in = _mm256_loadu_ps((const float*)scale_in_data + i * 8);
                        __m256 _bias = _mm256_loadu_ps((const float*)bias_data + i * 8);
                        __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
                        _v = _mm256_fmadd_ps(_v, _scale_in, _bias);
                        _v = activation_avx(_v, activation_type, activation_params);
                        _v = _mm256_mul_ps(_v, _scale_out);
                        *(int64_t*)ptr = float2int8_avx(_v);
#else
                        __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8);
                        __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8 + 4);
                        __m128 _bias0 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8);
                        __m128 _bias1 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8 + 4);
                        __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
                        _v0 = _mm_add_ps(_bias0, _mm_mul_ps(_v0, _scale_in0));
                        _v1 = _mm_add_ps(_bias1, _mm_mul_ps(_v1, _scale_in1));
                        _v0 = activation_sse(_v0, activation_type, activation_params);
                        _v1 = activation_sse(_v1, activation_type, activation_params);
                        _v0 = _mm_mul_ps(_v0, _scale_out);
                        _v1 = _mm_mul_ps(_v1, _scale_out);
                        *(int64_t*)ptr = float2int8_sse(_v0, _v1);
#endif
                    }
                }
            }
            else // if (scale_in_data_size > 1 && scale_out_data_size > 1)
            {
                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 8;
                        signed char* ptr = (signed char*)top_blob + i * 8;

#if __AVX__
                        __m256 _scale_in = _mm256_loadu_ps((const float*)scale_in_data + i * 8);
                        __m256 _scale_out = _mm256_loadu_ps((const float*)scale_out_data + i * 8);
                        __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
                        _v = _mm256_mul_ps(_v, _scale_in);
                        _v = activation_avx(_v, activation_type, activation_params);
                        _v = _mm256_mul_ps(_v, _scale_out);
                        *(int64_t*)ptr = float2int8_avx(_v);
#else
                        __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8);
                        __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8 + 4);
                        __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8);
                        __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8 + 4);
                        __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
                        _v0 = _mm_mul_ps(_v0, _scale_in0);
                        _v1 = _mm_mul_ps(_v1, _scale_in1);
                        _v0 = activation_sse(_v0, activation_type, activation_params);
                        _v1 = activation_sse(_v1, activation_type, activation_params);
                        _v0 = _mm_mul_ps(_v0, _scale_out0);
                        _v1 = _mm_mul_ps(_v1, _scale_out1);
                        *(int64_t*)ptr = float2int8_sse(_v0, _v1);
#endif
                    }
                }
                else if (bias_data_size == 1)
                {
#if __AVX__
                    __m256 _bias = _mm256_set1_ps(bias_data[0]);
#else
                    __m128 _bias = _mm_set1_ps(bias_data[0]);
#endif

                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 8;
                        signed char* ptr = (signed char*)top_blob + i * 8;

#if __AVX__
                        __m256 _scale_in = _mm256_loadu_ps((const float*)scale_in_data + i * 8);
                        __m256 _scale_out = _mm256_loadu_ps((const float*)scale_out_data + i * 8);
                        __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
                        _v = _mm256_fmadd_ps(_v, _scale_in, _bias);
                        _v = activation_avx(_v, activation_type, activation_params);
                        _v = _mm256_mul_ps(_v, _scale_out);
                        *(int64_t*)ptr = float2int8_avx(_v);
#else
                        __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8);
                        __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8 + 4);
                        __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8);
                        __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8 + 4);
                        __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
                        _v0 = _mm_add_ps(_bias, _mm_mul_ps(_v0, _scale_in0));
                        _v1 = _mm_add_ps(_bias, _mm_mul_ps(_v1, _scale_in1));
                        _v0 = activation_sse(_v0, activation_type, activation_params);
                        _v1 = activation_sse(_v1, activation_type, activation_params);
                        _v0 = _mm_mul_ps(_v0, _scale_out0);
                        _v1 = _mm_mul_ps(_v1, _scale_out1);
                        *(int64_t*)ptr = float2int8_sse(_v0, _v1);
#endif
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 8;
                        signed char* ptr = (signed char*)top_blob + i * 8;

#if __AVX__
                        __m256 _scale_in = _mm256_loadu_ps((const float*)scale_in_data + i * 8);
                        __m256 _scale_out = _mm256_loadu_ps((const float*)scale_out_data + i * 8);
                        __m256 _bias = _mm256_loadu_ps((const float*)bias_data + i * 8);
                        __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
                        _v = _mm256_fmadd_ps(_v, _scale_in, _bias);
                        _v = activation_avx(_v, activation_type, activation_params);
                        _v = _mm256_mul_ps(_v, _scale_out);
                        *(int64_t*)ptr = float2int8_avx(_v);
#else
                        __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8);
                        __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8 + 4);
                        __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8);
                        __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8 + 4);
                        __m128 _bias0 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8);
                        __m128 _bias1 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8 + 4);
                        __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
                        _v0 = _mm_add_ps(_bias0, _mm_mul_ps(_v0, _scale_in0));
                        _v1 = _mm_add_ps(_bias1, _mm_mul_ps(_v1, _scale_in1));
                        _v0 = activation_sse(_v0, activation_type, activation_params);
                        _v1 = activation_sse(_v1, activation_type, activation_params);
                        _v0 = _mm_mul_ps(_v0, _scale_out0);
                        _v1 = _mm_mul_ps(_v1, _scale_out1);
                        *(int64_t*)ptr = float2int8_sse(_v0, _v1);
#endif
                    }
                }
            }
        }

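        // dims == 2: scale_in / scale_out / bias are indexed per row (i), so
        // they are hoisted out of the inner loop over w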
        if (dims == 2)
        {
            int w = bottom_blob.w;
            int h = bottom_blob.h;

            top_blob.create(w, h, (size_t)8u, 8, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            if (bias_data_size == 0)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < h; i++)
                {
                    const int* intptr = bottom_blob.row<const int>(i);
                    signed char* ptr = top_blob.row<signed char>(i);

#if __AVX__
                    __m256 _scale_in = scale_in_data_size == 1 ? _mm256_set1_ps(scale_in_data[0]) : _mm256_loadu_ps((const float*)scale_in_data + i * 8);
                    __m256 _scale_out = scale_out_data_size == 1 ? _mm256_set1_ps(scale_out_data[0]) : _mm256_loadu_ps((const float*)scale_out_data + i * 8);
#else
                    __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8);
                    __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8 + 4);
                    __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8);
                    __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8 + 4);
#endif

                    for (int j = 0; j < w; j++)
                    {
#if __AVX__
                        __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
                        _v = _mm256_mul_ps(_v, _scale_in);
                        _v = activation_avx(_v, activation_type, activation_params);
                        _v = _mm256_mul_ps(_v, _scale_out);
                        *(int64_t*)ptr = float2int8_avx(_v);
#else
                        __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
                        _v0 = _mm_mul_ps(_v0, _scale_in0);
                        _v1 = _mm_mul_ps(_v1, _scale_in1);
                        _v0 = activation_sse(_v0, activation_type, activation_params);
                        _v1 = activation_sse(_v1, activation_type, activation_params);
                        _v0 = _mm_mul_ps(_v0, _scale_out0);
                        _v1 = _mm_mul_ps(_v1, _scale_out1);
                        *(int64_t*)ptr = float2int8_sse(_v0, _v1);
#endif

                        intptr += 8;
                        ptr += 8;
                    }
                }
            }
            else
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < h; i++)
                {
                    const int* intptr = bottom_blob.row<const int>(i);
                    signed char* ptr = top_blob.row<signed char>(i);

#if __AVX__
                    __m256 _scale_in = scale_in_data_size == 1 ? _mm256_set1_ps(scale_in_data[0]) : _mm256_loadu_ps((const float*)scale_in_data + i * 8);
                    __m256 _scale_out = scale_out_data_size == 1 ? _mm256_set1_ps(scale_out_data[0]) : _mm256_loadu_ps((const float*)scale_out_data + i * 8);
                    __m256 _bias = bias_data_size == 1 ? _mm256_set1_ps(bias_data[0]) : _mm256_loadu_ps((const float*)bias_data + i * 8);
#else
                    __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8);
                    __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8 + 4);
                    __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8);
                    __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8 + 4);
                    __m128 _bias0 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8);
                    __m128 _bias1 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8 + 4);
#endif

                    for (int j = 0; j < w; j++)
                    {
#if __AVX__
                        __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
                        _v = _mm256_fmadd_ps(_v, _scale_in, _bias);
                        _v = activation_avx(_v, activation_type, activation_params);
                        _v = _mm256_mul_ps(_v, _scale_out);
                        *(int64_t*)ptr = float2int8_avx(_v);
#else
                        __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
                        _v0 = _mm_add_ps(_bias0, _mm_mul_ps(_v0, _scale_in0));
                        _v1 = _mm_add_ps(_bias1, _mm_mul_ps(_v1, _scale_in1));
                        _v0 = activation_sse(_v0, activation_type, activation_params);
                        _v1 = activation_sse(_v1, activation_type, activation_params);
                        _v0 = _mm_mul_ps(_v0, _scale_out0);
                        _v1 = _mm_mul_ps(_v1, _scale_out1);
                        *(int64_t*)ptr = float2int8_sse(_v0, _v1);
#endif

                        intptr += 8;
                        ptr += 8;
                    }
                }
            }
        }

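        // dims == 3: per-channel (q) scales and bias, hoisted out of the loop
        // over the w*h spatial positions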
        if (dims == 3)
        {
            int w = bottom_blob.w;
            int h = bottom_blob.h;
            int channels = bottom_blob.c;
            int size = w * h;

            top_blob.create(w, h, channels, (size_t)8u, 8, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            if (bias_data_size == 0)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const int* intptr = bottom_blob.channel(q);
                    signed char* ptr = top_blob.channel(q);

#if __AVX__
                    __m256 _scale_in = scale_in_data_size == 1 ? _mm256_set1_ps(scale_in_data[0]) : _mm256_loadu_ps((const float*)scale_in_data + q * 8);
                    __m256 _scale_out = scale_out_data_size == 1 ? _mm256_set1_ps(scale_out_data[0]) : _mm256_loadu_ps((const float*)scale_out_data + q * 8);
#else
                    __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + q * 8);
                    __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + q * 8 + 4);
                    __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + q * 8);
                    __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + q * 8 + 4);
#endif

                    for (int i = 0; i < size; i++)
                    {
#if __AVX__
                        __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
                        _v = _mm256_mul_ps(_v, _scale_in);
                        _v = activation_avx(_v, activation_type, activation_params);
                        _v = _mm256_mul_ps(_v, _scale_out);
                        *(int64_t*)ptr = float2int8_avx(_v);
#else
                        __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
                        _v0 = _mm_mul_ps(_v0, _scale_in0);
                        _v1 = _mm_mul_ps(_v1, _scale_in1);
                        _v0 = activation_sse(_v0, activation_type, activation_params);
                        _v1 = activation_sse(_v1, activation_type, activation_params);
                        _v0 = _mm_mul_ps(_v0, _scale_out0);
                        _v1 = _mm_mul_ps(_v1, _scale_out1);
                        *(int64_t*)ptr = float2int8_sse(_v0, _v1);
#endif

                        intptr += 8;
                        ptr += 8;
                    }
                }
            }
            else
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const int* intptr = bottom_blob.channel(q);
                    signed char* ptr = top_blob.channel(q);

#if __AVX__
                    __m256 _scale_in = scale_in_data_size == 1 ? _mm256_set1_ps(scale_in_data[0]) : _mm256_loadu_ps((const float*)scale_in_data + q * 8);
                    __m256 _scale_out = scale_out_data_size == 1 ? _mm256_set1_ps(scale_out_data[0]) : _mm256_loadu_ps((const float*)scale_out_data + q * 8);
                    __m256 _bias = bias_data_size == 1 ? _mm256_set1_ps(bias_data[0]) : _mm256_loadu_ps((const float*)bias_data + q * 8);
#else
                    __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + q * 8);
                    __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + q * 8 + 4);
                    __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + q * 8);
                    __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + q * 8 + 4);
                    __m128 _bias0 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + q * 8);
                    __m128 _bias1 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + q * 8 + 4);
#endif

                    for (int i = 0; i < size; i++)
                    {
#if __AVX__
                        __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
                        _v = _mm256_fmadd_ps(_v, _scale_in, _bias);
                        _v = activation_avx(_v, activation_type, activation_params);
                        _v = _mm256_mul_ps(_v, _scale_out);
                        *(int64_t*)ptr = float2int8_avx(_v);
#else
                        __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
                        _v0 = _mm_add_ps(_bias0, _mm_mul_ps(_v0, _scale_in0));
                        _v1 = _mm_add_ps(_bias1, _mm_mul_ps(_v1, _scale_in1));
                        _v0 = activation_sse(_v0, activation_type, activation_params);
                        _v1 = activation_sse(_v1, activation_type, activation_params);
                        _v0 = _mm_mul_ps(_v0, _scale_out0);
                        _v1 = _mm_mul_ps(_v1, _scale_out1);
                        *(int64_t*)ptr = float2int8_sse(_v0, _v1);
#endif

                        intptr += 8;
                        ptr += 8;
                    }
                }
            }
        }

        return 0;
    }

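    // pack4 path: when the total element count is divisible by 8 the output is
    // repacked to pack8, otherwise it falls back to pack1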
    if (elempack == 4)
    {
        if (dims == 1)
        {
            int w = bottom_blob.w;
            int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1;
            int outw = w * elempack / out_elempack;

            top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

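            // float2int8_sse packs two float4 vectors into eight int8 bytes of
            // an int64; passing _v twice duplicates the four results, and the
            // shifts below extract the four bytes from the upper half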
            if (scale_in_data_size == 1 && scale_out_data_size == 1)
            {
                __m128 _scale_in = _mm_set1_ps(scale_in_data[0]);
                __m128 _scale_out = _mm_set1_ps(scale_out_data[0]);

                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        _v = _mm_mul_ps(_v, _scale_in);
                        _v = activation_sse(_v, activation_type, activation_params);
                        _v = _mm_mul_ps(_v, _scale_out);
                        int64_t v = float2int8_sse(_v, _v);
                        ptr[0] = (v >> 56) & 0xff;
                        ptr[1] = (v >> 48) & 0xff;
                        ptr[2] = (v >> 40) & 0xff;
                        ptr[3] = (v >> 32) & 0xff;
                    }
                }
                else if (bias_data_size == 1)
                {
                    __m128 _bias = _mm_set1_ps(bias_data[0]);

                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale_in));
                        _v = activation_sse(_v, activation_type, activation_params);
                        _v = _mm_mul_ps(_v, _scale_out);
                        int64_t v = float2int8_sse(_v, _v);
                        ptr[0] = (v >> 56) & 0xff;
                        ptr[1] = (v >> 48) & 0xff;
                        ptr[2] = (v >> 40) & 0xff;
                        ptr[3] = (v >> 32) & 0xff;
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        __m128 _bias = _mm_loadu_ps((const float*)bias_data + i * 4);
                        __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale_in));
                        _v = activation_sse(_v, activation_type, activation_params);
                        _v = _mm_mul_ps(_v, _scale_out);
                        int64_t v = float2int8_sse(_v, _v);
                        ptr[0] = (v >> 56) & 0xff;
                        ptr[1] = (v >> 48) & 0xff;
                        ptr[2] = (v >> 40) & 0xff;
                        ptr[3] = (v >> 32) & 0xff;
                    }
                }
            }
            else if (scale_in_data_size == 1 && scale_out_data_size > 1)
            {
                __m128 _scale_in = _mm_set1_ps(scale_in_data[0]);

                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        __m128 _scale_out = _mm_loadu_ps((const float*)scale_out_data + i * 4);
                        __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        _v = _mm_mul_ps(_v, _scale_in);
                        _v = activation_sse(_v, activation_type, activation_params);
                        _v = _mm_mul_ps(_v, _scale_out);
                        int64_t v = float2int8_sse(_v, _v);
                        ptr[0] = (v >> 56) & 0xff;
                        ptr[1] = (v >> 48) & 0xff;
                        ptr[2] = (v >> 40) & 0xff;
                        ptr[3] = (v >> 32) & 0xff;
                    }
                }
                else if (bias_data_size == 1)
                {
                    __m128 _bias = _mm_set1_ps(bias_data[0]);

                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        __m128 _scale_out = _mm_loadu_ps((const float*)scale_out_data + i * 4);
                        __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale_in));
                        _v = activation_sse(_v, activation_type, activation_params);
                        _v = _mm_mul_ps(_v, _scale_out);
                        int64_t v = float2int8_sse(_v, _v);
                        ptr[0] = (v >> 56) & 0xff;
                        ptr[1] = (v >> 48) & 0xff;
                        ptr[2] = (v >> 40) & 0xff;
                        ptr[3] = (v >> 32) & 0xff;
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        __m128 _scale_out = _mm_loadu_ps((const float*)scale_out_data + i * 4);
                        __m128 _bias = _mm_loadu_ps((const float*)bias_data + i * 4);
                        __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale_in));
                        _v = activation_sse(_v, activation_type, activation_params);
                        _v = _mm_mul_ps(_v, _scale_out);
                        int64_t v = float2int8_sse(_v, _v);
                        ptr[0] = (v >> 56) & 0xff;
                        ptr[1] = (v >> 48) & 0xff;
                        ptr[2] = (v >> 40) & 0xff;
                        ptr[3] = (v >> 32) & 0xff;
                    }
                }
            }
            else if (scale_in_data_size > 1 && scale_out_data_size == 1)
            {
                __m128 _scale_out = _mm_set1_ps(scale_out_data[0]);

                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        __m128 _scale_in = _mm_loadu_ps((const float*)scale_in_data + i * 4);
                        __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        _v = _mm_mul_ps(_v, _scale_in);
                        _v = activation_sse(_v, activation_type, activation_params);
                        _v = _mm_mul_ps(_v, _scale_out);
                        int64_t v = float2int8_sse(_v, _v);
                        ptr[0] = (v >> 56) & 0xff;
                        ptr[1] = (v >> 48) & 0xff;
                        ptr[2] = (v >> 40) & 0xff;
                        ptr[3] = (v >> 32) & 0xff;
                    }
                }
                else if (bias_data_size == 1)
                {
                    __m128 _bias = _mm_set1_ps(bias_data[0]);

                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        __m128 _scale_in = _mm_loadu_ps((const float*)scale_in_data + i * 4);
                        __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale_in));
                        _v = activation_sse(_v, activation_type, activation_params);
                        _v = _mm_mul_ps(_v, _scale_out);
                        int64_t v = float2int8_sse(_v, _v);
                        ptr[0] = (v >> 56) & 0xff;
                        ptr[1] = (v >> 48) & 0xff;
                        ptr[2] = (v >> 40) & 0xff;
                        ptr[3] = (v >> 32) & 0xff;
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        __m128 _scale_in = _mm_loadu_ps((const float*)scale_in_data + i * 4);
                        __m128 _bias = _mm_loadu_ps((const float*)bias_data + i * 4);
                        __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale_in));
                        _v = activation_sse(_v, activation_type, activation_params);
                        _v = _mm_mul_ps(_v, _scale_out);
                        int64_t v = float2int8_sse(_v, _v);
                        ptr[0] = (v >> 56) & 0xff;
                        ptr[1] = (v >> 48) & 0xff;
                        ptr[2] = (v >> 40) & 0xff;
                        ptr[3] = (v >> 32) & 0xff;
                    }
                }
            }
            else // if (scale_in_data_size > 1 && scale_out_data_size > 1)
            {
                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        __m128 _scale_in = _mm_loadu_ps((const float*)scale_in_data + i * 4);
                        __m128 _scale_out = _mm_loadu_ps((const float*)scale_out_data + i * 4);
                        __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        _v = _mm_mul_ps(_v, _scale_in);
                        _v = activation_sse(_v, activation_type, activation_params);
                        _v = _mm_mul_ps(_v, _scale_out);
                        int64_t v = float2int8_sse(_v, _v);
                        ptr[0] = (v >> 56) & 0xff;
                        ptr[1] = (v >> 48) & 0xff;
                        ptr[2] = (v >> 40) & 0xff;
                        ptr[3] = (v >> 32) & 0xff;
                    }
                }
                else if (bias_data_size == 1)
                {
                    __m128 _bias = _mm_set1_ps(bias_data[0]);

                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        __m128 _scale_in = _mm_loadu_ps((const float*)scale_in_data + i * 4);
                        __m128 _scale_out = _mm_loadu_ps((const float*)scale_out_data + i * 4);
                        __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale_in));
                        _v = activation_sse(_v, activation_type, activation_params);
                        _v = _mm_mul_ps(_v, _scale_out);
                        int64_t v = float2int8_sse(_v, _v);
                        ptr[0] = (v >> 56) & 0xff;
                        ptr[1] = (v >> 48) & 0xff;
                        ptr[2] = (v >> 40) & 0xff;
                        ptr[3] = (v >> 32) & 0xff;
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        signed char* ptr = (signed char*)top_blob + i * 4;

                        __m128 _scale_in = _mm_loadu_ps((const float*)scale_in_data + i * 4);
                        __m128 _scale_out = _mm_loadu_ps((const float*)scale_out_data + i * 4);
                        __m128 _bias = _mm_loadu_ps((const float*)bias_data + i * 4);
                        __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale_in));
                        _v = activation_sse(_v, activation_type, activation_params);
                        _v = _mm_mul_ps(_v, _scale_out);
                        int64_t v = float2int8_sse(_v, _v);
                        ptr[0] = (v >> 56) & 0xff;
                        ptr[1] = (v >> 48) & 0xff;
                        ptr[2] = (v >> 40) & 0xff;
                        ptr[3] = (v >> 32) & 0xff;
                    }
                }
            }
        }

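        // dims == 2 repack: when out_elempack == 8, two pack4 input rows are
        // fused into one pack8 output row; when out_elempack == 1, the four
        // lanes of each input row are scattered to four pack1 output rows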
        if (dims == 2)
        {
            int w = bottom_blob.w;
            int h = bottom_blob.h;
            int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1;
            int outh = h * elempack / out_elempack;

            top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            if (out_elempack == 8)
            {
                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < outh; i++)
                    {
                        const int* intptr0 = bottom_blob.row<const int>(i * 2);
                        const int* intptr1 = bottom_blob.row<const int>(i * 2 + 1);
                        signed char* ptr = top_blob.row<signed char>(i);

                        __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8);
                        __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8 + 4);
                        __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8);
                        __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8 + 4);

                        for (int j = 0; j < w; j++)
                        {
                            __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr0));
                            __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr1));
                            _v0 = _mm_mul_ps(_v0, _scale_in0);
                            _v1 = _mm_mul_ps(_v1, _scale_in1);
                            _v0 = activation_sse(_v0, activation_type, activation_params);
                            _v1 = activation_sse(_v1, activation_type, activation_params);
                            _v0 = _mm_mul_ps(_v0, _scale_out0);
                            _v1 = _mm_mul_ps(_v1, _scale_out1);
                            *(int64_t*)ptr = float2int8_sse(_v0, _v1);

                            intptr0 += 4;
                            intptr1 += 4;
                            ptr += 8;
                        }
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < outh; i++)
                    {
                        const int* intptr0 = bottom_blob.row<const int>(i * 2);
                        const int* intptr1 = bottom_blob.row<const int>(i * 2 + 1);
                        signed char* ptr = top_blob.row<signed char>(i);

                        __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8);
                        __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 8 + 4);
                        __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8);
                        __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 8 + 4);
                        __m128 _bias0 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8);
                        __m128 _bias1 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8 + 4);

                        for (int j = 0; j < w; j++)
                        {
                            __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr0));
                            __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr1));
                            _v0 = _mm_add_ps(_bias0, _mm_mul_ps(_v0, _scale_in0));
                            _v1 = _mm_add_ps(_bias1, _mm_mul_ps(_v1, _scale_in1));
                            _v0 = activation_sse(_v0, activation_type, activation_params);
                            _v1 = activation_sse(_v1, activation_type, activation_params);
                            _v0 = _mm_mul_ps(_v0, _scale_out0);
                            _v1 = _mm_mul_ps(_v1, _scale_out1);
                            *(int64_t*)ptr = float2int8_sse(_v0, _v1);

                            intptr0 += 4;
                            intptr1 += 4;
                            ptr += 8;
                        }
                    }
                }
            }
            if (out_elempack == 1)
            {
                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < h; i++)
                    {
                        const int* intptr = bottom_blob.row<const int>(i);
                        signed char* ptr0 = top_blob.row<signed char>(i * 4);
                        signed char* ptr1 = top_blob.row<signed char>(i * 4 + 1);
                        signed char* ptr2 = top_blob.row<signed char>(i * 4 + 2);
                        signed char* ptr3 = top_blob.row<signed char>(i * 4 + 3);

                        __m128 _scale_in = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 4);
                        __m128 _scale_out = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 4);

                        for (int j = 0; j < w; j++)
                        {
                            __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                            _v = _mm_mul_ps(_v, _scale_in);
                            _v = activation_sse(_v, activation_type, activation_params);
                            _v = _mm_mul_ps(_v, _scale_out);
                            int64_t v = float2int8_sse(_v, _v);
                            ptr0[0] = (v >> 56) & 0xff;
                            ptr1[0] = (v >> 48) & 0xff;
                            ptr2[0] = (v >> 40) & 0xff;
                            ptr3[0] = (v >> 32) & 0xff;

                            intptr += 4;
                            ptr0 += 1;
                            ptr1 += 1;
                            ptr2 += 1;
                            ptr3 += 1;
                        }
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < h; i++)
                    {
                        const int* intptr = bottom_blob.row<const int>(i);
                        signed char* ptr0 = top_blob.row<signed char>(i * 4);
                        signed char* ptr1 = top_blob.row<signed char>(i * 4 + 1);
                        signed char* ptr2 = top_blob.row<signed char>(i * 4 + 2);
                        signed char* ptr3 = top_blob.row<signed char>(i * 4 + 3);

                        __m128 _scale_in = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + i * 4);
                        __m128 _scale_out = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + i * 4);
                        __m128 _bias = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 4);

                        for (int j = 0; j < w; j++)
                        {
                            __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                            _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale_in));
                            _v = activation_sse(_v, activation_type, activation_params);
                            _v = _mm_mul_ps(_v, _scale_out);
                            int64_t v = float2int8_sse(_v, _v);
                            ptr0[0] = (v >> 56) & 0xff;
                            ptr1[0] = (v >> 48) & 0xff;
                            ptr2[0] = (v >> 40) & 0xff;
                            ptr3[0] = (v >> 32) & 0xff;

                            intptr += 4;
                            ptr0 += 1;
                            ptr1 += 1;
                            ptr2 += 1;
                            ptr3 += 1;
                        }
                    }
                }
            }
        }

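        // dims == 3 repacks the same way along the channel axis: two pack4
        // channels fuse into one pack8 channel, or each lane gets its own
        // pack1 channel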
        if (dims == 3)
        {
            int w = bottom_blob.w;
            int h = bottom_blob.h;
            int channels = bottom_blob.c;
            int size = w * h;
            int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1;
            int outc = channels * elempack / out_elempack;

            top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            if (out_elempack == 8)
            {
                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int q = 0; q < outc; q++)
                    {
                        const int* intptr0 = bottom_blob.channel(q * 2);
                        const int* intptr1 = bottom_blob.channel(q * 2 + 1);
                        signed char* ptr = top_blob.channel(q);

                        __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + q * 8);
                        __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + q * 8 + 4);
                        __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + q * 8);
                        __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + q * 8 + 4);

                        for (int i = 0; i < size; i++)
                        {
                            __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr0));
                            __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr1));
                            _v0 = _mm_mul_ps(_v0, _scale_in0);
                            _v1 = _mm_mul_ps(_v1, _scale_in1);
                            _v0 = activation_sse(_v0, activation_type, activation_params);
                            _v1 = activation_sse(_v1, activation_type, activation_params);
                            _v0 = _mm_mul_ps(_v0, _scale_out0);
                            _v1 = _mm_mul_ps(_v1, _scale_out1);
                            *(int64_t*)ptr = float2int8_sse(_v0, _v1);

                            intptr0 += 4;
                            intptr1 += 4;
                            ptr += 8;
                        }
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int q = 0; q < outc; q++)
                    {
                        const int* intptr0 = bottom_blob.channel(q * 2);
                        const int* intptr1 = bottom_blob.channel(q * 2 + 1);
                        signed char* ptr = top_blob.channel(q);

                        __m128 _scale_in0 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + q * 8);
                        __m128 _scale_in1 = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + q * 8 + 4);
                        __m128 _scale_out0 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + q * 8);
                        __m128 _scale_out1 = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + q * 8 + 4);
                        __m128 _bias0 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + q * 8);
                        __m128 _bias1 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + q * 8 + 4);

                        for (int i = 0; i < size; i++)
                        {
                            __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr0));
                            __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr1));
                            _v0 = _mm_add_ps(_bias0, _mm_mul_ps(_v0, _scale_in0));
                            _v1 = _mm_add_ps(_bias1, _mm_mul_ps(_v1, _scale_in1));
                            _v0 = activation_sse(_v0, activation_type, activation_params);
                            _v1 = activation_sse(_v1, activation_type, activation_params);
                            _v0 = _mm_mul_ps(_v0, _scale_out0);
                            _v1 = _mm_mul_ps(_v1, _scale_out1);
                            *(int64_t*)ptr = float2int8_sse(_v0, _v1);

                            intptr0 += 4;
                            intptr1 += 4;
                            ptr += 8;
                        }
                    }
                }
            }
            if (out_elempack == 1)
            {
                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int q = 0; q < channels; q++)
                    {
                        const int* intptr = bottom_blob.channel(q);
                        signed char* ptr0 = top_blob.channel(q * 4);
                        signed char* ptr1 = top_blob.channel(q * 4 + 1);
                        signed char* ptr2 = top_blob.channel(q * 4 + 2);
                        signed char* ptr3 = top_blob.channel(q * 4 + 3);

                        __m128 _scale_in = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + q * 4);
                        __m128 _scale_out = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + q * 4);

                        for (int i = 0; i < size; i++)
                        {
                            __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                            _v = _mm_mul_ps(_v, _scale_in);
                            _v = activation_sse(_v, activation_type, activation_params);
                            _v = _mm_mul_ps(_v, _scale_out);
                            int64_t v = float2int8_sse(_v, _v);
                            ptr0[0] = (v >> 56) & 0xff;
                            ptr1[0] = (v >> 48) & 0xff;
                            ptr2[0] = (v >> 40) & 0xff;
                            ptr3[0] = (v >> 32) & 0xff;

                            intptr += 4;
                            ptr0 += 1;
                            ptr1 += 1;
                            ptr2 += 1;
                            ptr3 += 1;
                        }
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int q = 0; q < channels; q++)
                    {
                        const int* intptr = bottom_blob.channel(q);
                        signed char* ptr0 = top_blob.channel(q * 4);
                        signed char* ptr1 = top_blob.channel(q * 4 + 1);
                        signed char* ptr2 = top_blob.channel(q * 4 + 2);
                        signed char* ptr3 = top_blob.channel(q * 4 + 3);

                        __m128 _scale_in = scale_in_data_size == 1 ? _mm_set1_ps(scale_in_data[0]) : _mm_loadu_ps((const float*)scale_in_data + q * 4);
                        __m128 _scale_out = scale_out_data_size == 1 ? _mm_set1_ps(scale_out_data[0]) : _mm_loadu_ps((const float*)scale_out_data + q * 4);
                        __m128 _bias = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + q * 4);

                        for (int i = 0; i < size; i++)
                        {
                            __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                            _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale_in));
                            _v = activation_sse(_v, activation_type, activation_params);
                            _v = _mm_mul_ps(_v, _scale_out);
                            int64_t v = float2int8_sse(_v, _v);
                            ptr0[0] = (v >> 56) & 0xff;
                            ptr1[0] = (v >> 48) & 0xff;
                            ptr2[0] = (v >> 40) & 0xff;
                            ptr3[0] = (v >> 32) & 0xff;

                            intptr += 4;
                            ptr0 += 1;
                            ptr1 += 1;
                            ptr2 += 1;
                            ptr3 += 1;
                        }
                    }
                }
            }
        }

        return 0;
    }
#endif // __SSE2__

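    // generic scalar path for elempack == 1 (also the only path without SSE2)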
    if (dims == 1)
    {
        int w = bottom_blob.w;

        top_blob.create(w, (size_t)1u, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        const int* intptr = bottom_blob;
        signed char* ptr = top_blob;

        if (scale_in_data_size == 1 && scale_out_data_size == 1)
        {
            const float scale_in = scale_in_data[0];
            const float scale_out = scale_out_data[0];

            if (bias_data_size == 0)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    float v = intptr[i] * scale_in;
                    ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
                }
            }
            else if (bias_data_size == 1)
            {
                const float bias = bias_data[0];

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    float v = intptr[i] * scale_in + bias;
                    ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
                }
            }
            else
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    float v = intptr[i] * scale_in + bias_data[i];
                    ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
                }
            }
        }
        else if (scale_in_data_size == 1 && scale_out_data_size > 1)
        {
            const float scale_in = scale_in_data[0];

            if (bias_data_size == 0)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    float v = intptr[i] * scale_in;
                    ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]);
                }
            }
            else if (bias_data_size == 1)
            {
                const float bias = bias_data[0];

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    float v = intptr[i] * scale_in + bias;
                    ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]);
                }
            }
            else
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    float v = intptr[i] * scale_in + bias_data[i];
                    ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]);
                }
            }
        }
        else if (scale_in_data_size > 1 && scale_out_data_size == 1)
        {
            const float scale_out = scale_out_data[0];

            if (bias_data_size == 0)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    float v = intptr[i] * scale_in_data[i];
                    ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
                }
            }
            else if (bias_data_size == 1)
            {
                const float bias = bias_data[0];

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    float v = intptr[i] * scale_in_data[i] + bias;
                    ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
                }
            }
            else
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    float v = intptr[i] * scale_in_data[i] + bias_data[i];
                    ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
                }
            }
        }
        else // if (scale_in_data_size > 1 && scale_out_data_size > 1)
        {
            if (bias_data_size == 0)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    float v = intptr[i] * scale_in_data[i];
                    ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]);
                }
            }
            else if (bias_data_size == 1)
            {
                const float bias = bias_data[0];

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    float v = intptr[i] * scale_in_data[i] + bias;
                    ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]);
                }
            }
            else
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    float v = intptr[i] * scale_in_data[i] + bias_data[i];
                    ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]);
                }
            }
        }
    }

    if (dims == 2)
    {
        int w = bottom_blob.w;
        int h = bottom_blob.h;

        top_blob.create(w, h, (size_t)1u, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        if (bias_data_size == 0)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < h; i++)
            {
                const int* intptr = bottom_blob.row<const int>(i);
                signed char* ptr = top_blob.row<signed char>(i);

                const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i];
                const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i];

                for (int j = 0; j < w; j++)
                {
                    float v = intptr[j] * scale_in;
                    ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
                }
            }
        }
        else
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < h; i++)
            {
                const int* intptr = bottom_blob.row<const int>(i);
                signed char* ptr = top_blob.row<signed char>(i);

                const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i];
                const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i];
                const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[i];

                for (int j = 0; j < w; j++)
                {
                    float v = intptr[j] * scale_in + bias;
                    ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
                }
            }
        }
    }

    if (dims == 3)
    {
        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int channels = bottom_blob.c;
        int size = w * h;

        top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        if (bias_data_size == 0)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const int* intptr = bottom_blob.channel(q);
                signed char* ptr = top_blob.channel(q);

                const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q];
                const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q];

                for (int i = 0; i < size; i++)
                {
                    float v = intptr[i] * scale_in;
                    ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
                }
            }
        }
        else
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const int* intptr = bottom_blob.channel(q);
                signed char* ptr = top_blob.channel(q);

                const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q];
                const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q];
                const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q];

                for (int i = 0; i < size; i++)
                {
                    float v = intptr[i] * scale_in + bias;
                    ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
                }
            }
        }
    }

    return 0;
}

} // namespace ncnn