// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "dequantize_x86.h"

#if __SSE2__
#include <emmintrin.h>
#if __AVX__
#include <immintrin.h>
#endif // __AVX__
#endif // __SSE2__

namespace ncnn {

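// Dequantize_x86 declares SSE2 packing support so that int32 blobs can stay
// in interleaved packs of 4 (SSE2) or 8 (AVX) elements.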
Dequantize_x86::Dequantize_x86()
{
#if __SSE2__
    support_packing = true;
#endif // __SSE2__
}

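// Dequantize int32 values back to fp32: v = v * scale (+ bias).
// scale_data/bias_data hold either one shared value (size 1) or one value per
// channel; bias_data_size == 0 means no bias. The branches below specialize
// on elempack (8 with AVX, 8 split into two packs of 4 without AVX, 4 with
// SSE2, scalar otherwise) and on the blob dims (1/2/3).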
int Dequantize_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int dims = bottom_blob.dims;
    int elempack = bottom_blob.elempack;

#if __SSE2__
#if __AVX__
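    // AVX path: each packed element carries 8 int32 lanes, so one 256-bit
    // load/convert/store handles a whole element.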
    if (elempack == 8)
    {
        if (dims == 1)
        {
            int w = bottom_blob.w;

            top_blob.create(w, (size_t)32u, 8, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

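            // specialize on shared vs per-element scale, and on absent,
            // shared or per-element bias, keeping the hot loops branch-free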
            if (scale_data_size == 1)
            {
                __m256 _scale = _mm256_set1_ps(scale_data[0]);

                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 8;
                        float* ptr = (float*)top_blob + i * 8;

                        __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
                        _v = _mm256_mul_ps(_v, _scale);
                        _mm256_storeu_ps(ptr, _v);
                    }
                }
                else if (bias_data_size == 1)
                {
                    __m256 _bias = _mm256_set1_ps(bias_data[0]);

                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 8;
                        float* ptr = (float*)top_blob + i * 8;

                        __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
                        _v = _mm256_fmadd_ps(_v, _scale, _bias);
                        _mm256_storeu_ps(ptr, _v);
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 8;
                        float* ptr = (float*)top_blob + i * 8;

                        __m256 _bias = _mm256_loadu_ps((const float*)bias_data + i * 8);
                        __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
                        _v = _mm256_fmadd_ps(_v, _scale, _bias);
                        _mm256_storeu_ps(ptr, _v);
                    }
                }
            }
            else
            {
                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 8;
                        float* ptr = (float*)top_blob + i * 8;

                        __m256 _scale = _mm256_loadu_ps((const float*)scale_data + i * 8);
                        __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
                        _v = _mm256_mul_ps(_v, _scale);
                        _mm256_storeu_ps(ptr, _v);
                    }
                }
                else if (bias_data_size == 1)
                {
                    __m256 _bias = _mm256_set1_ps(bias_data[0]);

                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 8;
                        float* ptr = (float*)top_blob + i * 8;

                        __m256 _scale = _mm256_loadu_ps((const float*)scale_data + i * 8);
                        __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
                        _v = _mm256_fmadd_ps(_v, _scale, _bias);
                        _mm256_storeu_ps(ptr, _v);
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 8;
                        float* ptr = (float*)top_blob + i * 8;

                        __m256 _scale = _mm256_loadu_ps((const float*)scale_data + i * 8);
                        __m256 _bias = _mm256_loadu_ps((const float*)bias_data + i * 8);
                        __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
                        _v = _mm256_fmadd_ps(_v, _scale, _bias);
                        _mm256_storeu_ps(ptr, _v);
                    }
                }
            }
        }

        if (dims == 2)
        {
            int w = bottom_blob.w;
            int h = bottom_blob.h;

            top_blob.create(w, h, (size_t)32u, 8, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            if (bias_data_size == 0)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < h; i++)
                {
                    const int* intptr = bottom_blob.row<const int>(i);
                    float* ptr = top_blob.row(i);

                    __m256 _scale = scale_data_size == 1 ? _mm256_set1_ps(scale_data[0]) : _mm256_loadu_ps((const float*)scale_data + i * 8);

                    for (int j = 0; j < w; j++)
                    {
                        __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
                        _v = _mm256_mul_ps(_v, _scale);
                        _mm256_storeu_ps(ptr, _v);

                        intptr += 8;
                        ptr += 8;
                    }
                }
            }
            else
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < h; i++)
                {
                    const int* intptr = bottom_blob.row<const int>(i);
                    float* ptr = top_blob.row(i);

                    __m256 _scale = scale_data_size == 1 ? _mm256_set1_ps(scale_data[0]) : _mm256_loadu_ps((const float*)scale_data + i * 8);
                    __m256 _bias = bias_data_size == 1 ? _mm256_set1_ps(bias_data[0]) : _mm256_loadu_ps((const float*)bias_data + i * 8);

                    for (int j = 0; j < w; j++)
                    {
                        __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
                        _v = _mm256_fmadd_ps(_v, _scale, _bias);
                        _mm256_storeu_ps(ptr, _v);

                        intptr += 8;
                        ptr += 8;
                    }
                }
            }
        }

        if (dims == 3)
        {
            int w = bottom_blob.w;
            int h = bottom_blob.h;
            int channels = bottom_blob.c;
            int size = w * h;

            top_blob.create(w, h, channels, (size_t)32u, 8, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            if (bias_data_size == 0)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const int* intptr = bottom_blob.channel(q);
                    float* ptr = top_blob.channel(q);

                    __m256 _scale = scale_data_size == 1 ? _mm256_set1_ps(scale_data[0]) : _mm256_loadu_ps((const float*)scale_data + q * 8);

                    for (int i = 0; i < size; i++)
                    {
                        __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
                        _v = _mm256_mul_ps(_v, _scale);
                        _mm256_storeu_ps(ptr, _v);

                        intptr += 8;
                        ptr += 8;
                    }
                }
            }
            else
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const int* intptr = bottom_blob.channel(q);
                    float* ptr = top_blob.channel(q);

                    __m256 _scale = scale_data_size == 1 ? _mm256_set1_ps(scale_data[0]) : _mm256_loadu_ps((const float*)scale_data + q * 8);
                    __m256 _bias = bias_data_size == 1 ? _mm256_set1_ps(bias_data[0]) : _mm256_loadu_ps((const float*)bias_data + q * 8);

                    for (int i = 0; i < size; i++)
                    {
                        __m256 _v = _mm256_cvtepi32_ps(_mm256_loadu_si256((const __m256i*)intptr));
                        _v = _mm256_fmadd_ps(_v, _scale, _bias);
                        _mm256_storeu_ps(ptr, _v);

                        intptr += 8;
                        ptr += 8;
                    }
                }
            }
        }

        return 0;
    }
#else // __AVX__
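    // No AVX: split each 8-wide element into two 4-wide SSE packs, so the
    // output blob doubles its w/h/c and switches to elempack 4.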
    if (elempack == 8)
    {
        if (dims == 1)
        {
            int w = bottom_blob.w;
            int outw = w * 2;

            top_blob.create(outw, (size_t)16u, 4, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            if (scale_data_size == 1)
            {
                __m128 _scale = _mm_set1_ps(scale_data[0]);

                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < outw; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        float* ptr = (float*)top_blob + i * 4;

                        __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        _v = _mm_mul_ps(_v, _scale);
                        _mm_storeu_ps(ptr, _v);
                    }
                }
                else if (bias_data_size == 1)
                {
                    __m128 _bias = _mm_set1_ps(bias_data[0]);

                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < outw; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        float* ptr = (float*)top_blob + i * 4;

                        __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale));
                        _mm_storeu_ps(ptr, _v);
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < outw; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        float* ptr = (float*)top_blob + i * 4;

                        __m128 _bias = _mm_loadu_ps((const float*)bias_data + i * 4);
                        __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale));
                        _mm_storeu_ps(ptr, _v);
                    }
                }
            }
            else
            {
                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < outw; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        float* ptr = (float*)top_blob + i * 4;

                        __m128 _scale = _mm_loadu_ps((const float*)scale_data + i * 4);
                        __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        _v = _mm_mul_ps(_v, _scale);
                        _mm_storeu_ps(ptr, _v);
                    }
                }
                else if (bias_data_size == 1)
                {
                    __m128 _bias = _mm_set1_ps(bias_data[0]);

                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < outw; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        float* ptr = (float*)top_blob + i * 4;

                        __m128 _scale = _mm_loadu_ps((const float*)scale_data + i * 4);
                        __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale));
                        _mm_storeu_ps(ptr, _v);
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < outw; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        float* ptr = (float*)top_blob + i * 4;

                        __m128 _scale = _mm_loadu_ps((const float*)scale_data + i * 4);
                        __m128 _bias = _mm_loadu_ps((const float*)bias_data + i * 4);
                        __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale));
                        _mm_storeu_ps(ptr, _v);
                    }
                }
            }
        }

        if (dims == 2)
        {
            int w = bottom_blob.w;
            int h = bottom_blob.h;
            int outh = h * 2;

            top_blob.create(w, outh, (size_t)16u, 4, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            if (bias_data_size == 0)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < h; i++)
                {
                    const int* intptr = bottom_blob.row<const int>(i);
                    float* ptr0 = top_blob.row(i * 2);
                    float* ptr1 = top_blob.row(i * 2 + 1);

                    __m128 _scale0 = scale_data_size == 1 ? _mm_set1_ps(scale_data[0]) : _mm_loadu_ps((const float*)scale_data + i * 8);
                    __m128 _scale1 = scale_data_size == 1 ? _mm_set1_ps(scale_data[0]) : _mm_loadu_ps((const float*)scale_data + i * 8 + 4);

                    for (int j = 0; j < w; j++)
                    {
                        __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
                        _v0 = _mm_mul_ps(_v0, _scale0);
                        _v1 = _mm_mul_ps(_v1, _scale1);
                        _mm_storeu_ps(ptr0, _v0);
                        _mm_storeu_ps(ptr1, _v1);

                        intptr += 8;
                        ptr0 += 4;
                        ptr1 += 4;
                    }
                }
            }
            else
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < h; i++)
                {
                    const int* intptr = bottom_blob.row<const int>(i);
                    float* ptr0 = top_blob.row(i * 2);
                    float* ptr1 = top_blob.row(i * 2 + 1);

                    __m128 _scale0 = scale_data_size == 1 ? _mm_set1_ps(scale_data[0]) : _mm_loadu_ps((const float*)scale_data + i * 8);
                    __m128 _scale1 = scale_data_size == 1 ? _mm_set1_ps(scale_data[0]) : _mm_loadu_ps((const float*)scale_data + i * 8 + 4);
                    __m128 _bias0 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8);
                    __m128 _bias1 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 8 + 4);

                    for (int j = 0; j < w; j++)
                    {
                        __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
                        _v0 = _mm_add_ps(_bias0, _mm_mul_ps(_v0, _scale0));
                        _v1 = _mm_add_ps(_bias1, _mm_mul_ps(_v1, _scale1));
                        _mm_storeu_ps(ptr0, _v0);
                        _mm_storeu_ps(ptr1, _v1);

                        intptr += 8;
                        ptr0 += 4;
                        ptr1 += 4;
                    }
                }
            }
        }

        if (dims == 3)
        {
            int w = bottom_blob.w;
            int h = bottom_blob.h;
            int channels = bottom_blob.c;
            int size = w * h;
            int outc = channels * 2;

            top_blob.create(w, h, outc, (size_t)16u, 4, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            if (bias_data_size == 0)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const int* intptr = bottom_blob.channel(q);
                    float* ptr0 = top_blob.channel(q * 2);
                    float* ptr1 = top_blob.channel(q * 2 + 1);

                    __m128 _scale0 = scale_data_size == 1 ? _mm_set1_ps(scale_data[0]) : _mm_loadu_ps((const float*)scale_data + q * 8);
                    __m128 _scale1 = scale_data_size == 1 ? _mm_set1_ps(scale_data[0]) : _mm_loadu_ps((const float*)scale_data + q * 8 + 4);

                    for (int i = 0; i < size; i++)
                    {
                        __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
                        _v0 = _mm_mul_ps(_v0, _scale0);
                        _v1 = _mm_mul_ps(_v1, _scale1);
                        _mm_storeu_ps(ptr0, _v0);
                        _mm_storeu_ps(ptr1, _v1);

                        intptr += 8;
                        ptr0 += 4;
                        ptr1 += 4;
                    }
                }
            }
            else
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const int* intptr = bottom_blob.channel(q);
                    float* ptr0 = top_blob.channel(q * 2);
                    float* ptr1 = top_blob.channel(q * 2 + 1);

                    __m128 _scale0 = scale_data_size == 1 ? _mm_set1_ps(scale_data[0]) : _mm_loadu_ps((const float*)scale_data + q * 8);
                    __m128 _scale1 = scale_data_size == 1 ? _mm_set1_ps(scale_data[0]) : _mm_loadu_ps((const float*)scale_data + q * 8 + 4);
                    __m128 _bias0 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + q * 8);
                    __m128 _bias1 = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + q * 8 + 4);

                    for (int i = 0; i < size; i++)
                    {
                        __m128 _v0 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        __m128 _v1 = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)(intptr + 4)));
                        _v0 = _mm_add_ps(_bias0, _mm_mul_ps(_v0, _scale0));
                        _v1 = _mm_add_ps(_bias1, _mm_mul_ps(_v1, _scale1));
                        _mm_storeu_ps(ptr0, _v0);
                        _mm_storeu_ps(ptr1, _v1);

                        intptr += 8;
                        ptr0 += 4;
                        ptr1 += 4;
                    }
                }
            }
        }

        return 0;
    }
#endif // __AVX__

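    // SSE2 path: each packed element carries 4 int32 lanes, handled with one
    // 128-bit vector per element.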
    if (elempack == 4)
    {
        if (dims == 1)
        {
            int w = bottom_blob.w;

            top_blob.create(w, (size_t)16u, elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            if (scale_data_size == 1)
            {
                __m128 _scale = _mm_set1_ps(scale_data[0]);

                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        float* ptr = (float*)top_blob + i * 4;

                        __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        _v = _mm_mul_ps(_v, _scale);
                        _mm_storeu_ps(ptr, _v);
                    }
                }
                else if (bias_data_size == 1)
                {
                    __m128 _bias = _mm_set1_ps(bias_data[0]);

                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        float* ptr = (float*)top_blob + i * 4;

                        __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale));
                        _mm_storeu_ps(ptr, _v);
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        float* ptr = (float*)top_blob + i * 4;

                        __m128 _bias = _mm_loadu_ps((const float*)bias_data + i * 4);
                        __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale));
                        _mm_storeu_ps(ptr, _v);
                    }
                }
            }
            else
            {
                if (bias_data_size == 0)
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        float* ptr = (float*)top_blob + i * 4;

                        __m128 _scale = _mm_loadu_ps((const float*)scale_data + i * 4);
                        __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        _v = _mm_mul_ps(_v, _scale);
                        _mm_storeu_ps(ptr, _v);
                    }
                }
                else if (bias_data_size == 1)
                {
                    __m128 _bias = _mm_set1_ps(bias_data[0]);

                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        float* ptr = (float*)top_blob + i * 4;

                        __m128 _scale = _mm_loadu_ps((const float*)scale_data + i * 4);
                        __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale));
                        _mm_storeu_ps(ptr, _v);
                    }
                }
                else
                {
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int i = 0; i < w; i++)
                    {
                        const int* intptr = (const int*)bottom_blob + i * 4;
                        float* ptr = (float*)top_blob + i * 4;

                        __m128 _scale = _mm_loadu_ps((const float*)scale_data + i * 4);
                        __m128 _bias = _mm_loadu_ps((const float*)bias_data + i * 4);
                        __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale));
                        _mm_storeu_ps(ptr, _v);
                    }
                }
            }
        }

        if (dims == 2)
        {
            int w = bottom_blob.w;
            int h = bottom_blob.h;

            top_blob.create(w, h, (size_t)16u, elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            if (bias_data_size == 0)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < h; i++)
                {
                    const int* intptr = bottom_blob.row<const int>(i);
                    float* ptr = top_blob.row(i);

                    __m128 _scale = scale_data_size == 1 ? _mm_set1_ps(scale_data[0]) : _mm_loadu_ps((const float*)scale_data + i * 4);

                    for (int j = 0; j < w; j++)
                    {
                        __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        _v = _mm_mul_ps(_v, _scale);
                        _mm_storeu_ps(ptr, _v);

                        intptr += 4;
                        ptr += 4;
                    }
                }
            }
            else
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < h; i++)
                {
                    const int* intptr = bottom_blob.row<const int>(i);
                    float* ptr = top_blob.row(i);

                    __m128 _scale = scale_data_size == 1 ? _mm_set1_ps(scale_data[0]) : _mm_loadu_ps((const float*)scale_data + i * 4);
                    __m128 _bias = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + i * 4);

                    for (int j = 0; j < w; j++)
                    {
                        __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale));
                        _mm_storeu_ps(ptr, _v);

                        intptr += 4;
                        ptr += 4;
                    }
                }
            }
        }

        if (dims == 3)
        {
            int w = bottom_blob.w;
            int h = bottom_blob.h;
            int channels = bottom_blob.c;
            int size = w * h;

            top_blob.create(w, h, channels, (size_t)16u, elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            if (bias_data_size == 0)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const int* intptr = bottom_blob.channel(q);
                    float* ptr = top_blob.channel(q);

                    __m128 _scale = scale_data_size == 1 ? _mm_set1_ps(scale_data[0]) : _mm_loadu_ps((const float*)scale_data + q * 4);

                    for (int i = 0; i < size; i++)
                    {
                        __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        _v = _mm_mul_ps(_v, _scale);
                        _mm_storeu_ps(ptr, _v);

                        intptr += 4;
                        ptr += 4;
                    }
                }
            }
            else
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const int* intptr = bottom_blob.channel(q);
                    float* ptr = top_blob.channel(q);

                    __m128 _scale = scale_data_size == 1 ? _mm_set1_ps(scale_data[0]) : _mm_loadu_ps((const float*)scale_data + q * 4);
                    __m128 _bias = bias_data_size == 1 ? _mm_set1_ps(bias_data[0]) : _mm_loadu_ps((const float*)bias_data + q * 4);

                    for (int i = 0; i < size; i++)
                    {
                        __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                        _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale));
                        _mm_storeu_ps(ptr, _v);

                        intptr += 4;
                        ptr += 4;
                    }
                }
            }
        }

        return 0;
    }
#endif // __SSE2__

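    // elempack 1 fallback: plain scalar loops for dims == 1, and SSE2-assisted
    // loops with a scalar tail for dims == 2/3.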
    if (dims == 1)
    {
        int w = bottom_blob.w;

        top_blob.create(w, (size_t)4u, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        const int* intptr = bottom_blob;
        float* ptr = top_blob;

        if (scale_data_size == 1)
        {
            const float scale = scale_data[0];

            if (bias_data_size == 0)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    ptr[i] = intptr[i] * scale;
                }
            }
            else if (bias_data_size == 1)
            {
                const float bias = bias_data[0];

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    ptr[i] = intptr[i] * scale + bias;
                }
            }
            else
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    ptr[i] = intptr[i] * scale + bias_data[i];
                }
            }
        }
        else
        {
            if (bias_data_size == 0)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    ptr[i] = intptr[i] * scale_data[i];
                }
            }
            else if (bias_data_size == 1)
            {
                const float bias = bias_data[0];

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    ptr[i] = intptr[i] * scale_data[i] + bias;
                }
            }
            else
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    ptr[i] = intptr[i] * scale_data[i] + bias_data[i];
                }
            }
        }
    }

    if (dims == 2)
    {
        int w = bottom_blob.w;
        int h = bottom_blob.h;

        top_blob.create(w, h, (size_t)4u, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        if (bias_data_size == 0)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < h; i++)
            {
                const int* intptr = bottom_blob.row<const int>(i);
                float* ptr = top_blob.row(i);

                const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i];

                int j = 0;
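                // process 4 values per iteration with SSE2, then finish the
                // remainder in the scalar tail below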
#if __SSE2__
                __m128 _scale = _mm_set1_ps(scale);
                for (; j + 3 < w; j += 4)
                {
                    __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                    _v = _mm_mul_ps(_v, _scale);
                    _mm_storeu_ps(ptr, _v);

                    intptr += 4;
                    ptr += 4;
                }
#endif // __SSE2__
                for (; j < w; j++)
                {
                    *ptr++ = *intptr++ * scale;
                }
            }
        }
        else
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < h; i++)
            {
                const int* intptr = bottom_blob.row<const int>(i);
                float* ptr = top_blob.row(i);

                const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i];
                const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[i];

                int j = 0;
#if __SSE2__
                __m128 _scale = _mm_set1_ps(scale);
                __m128 _bias = _mm_set1_ps(bias);
                for (; j + 3 < w; j += 4)
                {
                    __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                    _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale));
                    _mm_storeu_ps(ptr, _v);

                    intptr += 4;
                    ptr += 4;
                }
#endif // __SSE2__
                for (; j < w; j++)
                {
                    *ptr++ = *intptr++ * scale + bias;
                }
            }
        }
    }

    if (dims == 3)
    {
        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int channels = bottom_blob.c;
        int size = w * h;

        top_blob.create(w, h, channels, (size_t)4u, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        if (bias_data_size == 0)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const int* intptr = bottom_blob.channel(q);
                float* ptr = top_blob.channel(q);

                const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q];

                int i = 0;
#if __SSE2__
                __m128 _scale = _mm_set1_ps(scale);
                for (; i + 3 < size; i += 4)
                {
                    __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                    _v = _mm_mul_ps(_v, _scale);
                    _mm_storeu_ps(ptr, _v);

                    intptr += 4;
                    ptr += 4;
                }
#endif // __SSE2__
                for (; i < size; i++)
                {
                    *ptr++ = *intptr++ * scale;
                }
            }
        }
        else
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const int* intptr = bottom_blob.channel(q);
                float* ptr = top_blob.channel(q);

                const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q];
                const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q];

                int i = 0;
#if __SSE2__
                __m128 _scale = _mm_set1_ps(scale);
                __m128 _bias = _mm_set1_ps(bias);
                for (; i + 3 < size; i += 4)
                {
                    __m128 _v = _mm_cvtepi32_ps(_mm_loadu_si128((const __m128i*)intptr));
                    _v = _mm_add_ps(_bias, _mm_mul_ps(_v, _scale));
                    _mm_storeu_ps(ptr, _v);

                    intptr += 4;
                    ptr += 4;
                }
#endif // __SSE2__
                for (; i < size; i++)
                {
                    *ptr++ = *intptr++ * scale + bias;
                }
            }
        }
    }

    return 0;
}

} // namespace ncnn