// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "eltwise_x86.h"

#if __SSE2__
#include <emmintrin.h>
#if __AVX__
#include <immintrin.h>
#endif // __AVX__
#endif // __SSE2__

24 namespace ncnn {
25
Eltwise_x86()26 Eltwise_x86::Eltwise_x86()
27 {
28 #if __SSE2__
29 support_packing = true;
30 #endif // __SSE2__
31 }
32
forward(const std::vector<Mat> & bottom_blobs,std::vector<Mat> & top_blobs,const Option & opt) const33 int Eltwise_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
34 {
35 const Mat& bottom_blob = bottom_blobs[0];
36 int w = bottom_blob.w;
37 int h = bottom_blob.h;
38 int channels = bottom_blob.c;
39 int elempack = bottom_blob.elempack;
40 int size = w * h;
41
42 Mat& top_blob = top_blobs[0];
43 top_blob.create_like(bottom_blob, opt.blob_allocator);
44 if (top_blob.empty())
45 return -100;
46
47 #if __SSE2__
48 #if __AVX__
49 if (elempack == 8)
50 {
51 if (op_type == Operation_PROD)
52 {
53 // first blob
54 const Mat& bottom_blob1 = bottom_blobs[1];
55 #pragma omp parallel for num_threads(opt.num_threads)
56 for (int q = 0; q < channels; q++)
57 {
58 const float* ptr = bottom_blob.channel(q);
59 const float* ptr1 = bottom_blob1.channel(q);
60 float* outptr = top_blob.channel(q);
61
62 for (int i = 0; i < size; i++)
63 {
64 __m256 _p = _mm256_loadu_ps(ptr);
65 __m256 _p1 = _mm256_loadu_ps(ptr1);
66 _p = _mm256_mul_ps(_p, _p1);
67 _mm256_storeu_ps(outptr, _p);
68
69 ptr += 8;
70 ptr1 += 8;
71 outptr += 8;
72 }
73 }
74
75 for (size_t b = 2; b < bottom_blobs.size(); b++)
76 {
77 const Mat& bottom_blob2 = bottom_blobs[b];
78 #pragma omp parallel for num_threads(opt.num_threads)
79 for (int q = 0; q < channels; q++)
80 {
81 const float* ptr = bottom_blob2.channel(q);
82 float* outptr = top_blob.channel(q);
83
84 for (int i = 0; i < size; i++)
85 {
86 __m256 _p = _mm256_loadu_ps(outptr);
87 __m256 _p1 = _mm256_loadu_ps(ptr);
88 _p = _mm256_mul_ps(_p, _p1);
89 _mm256_storeu_ps(outptr, _p);
90
91 ptr += 8;
92 outptr += 8;
93 }
94 }
95 }
96 }
97 if (op_type == Operation_SUM)
98 {
99 if (coeffs.w == 0)
100 {
101 // first blob
102 const Mat& bottom_blob1 = bottom_blobs[1];
103 #pragma omp parallel for num_threads(opt.num_threads)
104 for (int q = 0; q < channels; q++)
105 {
106 const float* ptr = bottom_blob.channel(q);
107 const float* ptr1 = bottom_blob1.channel(q);
108 float* outptr = top_blob.channel(q);
109
110 for (int i = 0; i < size; i++)
111 {
112 __m256 _p = _mm256_loadu_ps(ptr);
113 __m256 _p1 = _mm256_loadu_ps(ptr1);
114 _p = _mm256_add_ps(_p, _p1);
115 _mm256_storeu_ps(outptr, _p);
116
117 ptr += 8;
118 ptr1 += 8;
119 outptr += 8;
120 }
121 }
122
123 for (size_t b = 2; b < bottom_blobs.size(); b++)
124 {
125 const Mat& bottom_blob2 = bottom_blobs[b];
126 #pragma omp parallel for num_threads(opt.num_threads)
127 for (int q = 0; q < channels; q++)
128 {
129 const float* ptr = bottom_blob2.channel(q);
130 float* outptr = top_blob.channel(q);
131
132 for (int i = 0; i < size; i++)
133 {
134 __m256 _p = _mm256_loadu_ps(outptr);
135 __m256 _p1 = _mm256_loadu_ps(ptr);
136 _p = _mm256_add_ps(_p, _p1);
137 _mm256_storeu_ps(outptr, _p);
138
139 ptr += 8;
140 outptr += 8;
141 }
142 }
143 }
144 }
145 else
146 {
147 // first blob
148 const Mat& bottom_blob1 = bottom_blobs[1];
149 __m256 _coeff0 = _mm256_set1_ps(coeffs[0]);
150 __m256 _coeff1 = _mm256_set1_ps(coeffs[1]);
151 #pragma omp parallel for num_threads(opt.num_threads)
152 for (int q = 0; q < channels; q++)
153 {
154 const float* ptr = bottom_blob.channel(q);
155 const float* ptr1 = bottom_blob1.channel(q);
156 float* outptr = top_blob.channel(q);
157
158 for (int i = 0; i < size; i++)
159 {
160 __m256 _p = _mm256_loadu_ps(ptr);
161 __m256 _p1 = _mm256_loadu_ps(ptr1);
162 _p = _mm256_mul_ps(_p, _coeff0);
163 _p = _mm256_fmadd_ps(_p1, _coeff1, _p);
164 _mm256_storeu_ps(outptr, _p);
165
166 ptr += 8;
167 ptr1 += 8;
168 outptr += 8;
169 }
170 }
171
172 for (size_t b = 2; b < bottom_blobs.size(); b++)
173 {
174 const Mat& bottom_blob2 = bottom_blobs[b];
175 __m256 _coeff = _mm256_set1_ps(coeffs[b]);
176 #pragma omp parallel for num_threads(opt.num_threads)
177 for (int q = 0; q < channels; q++)
178 {
179 const float* ptr = bottom_blob2.channel(q);
180 float* outptr = top_blob.channel(q);
181
182 for (int i = 0; i < size; i++)
183 {
184 __m256 _p = _mm256_loadu_ps(outptr);
185 __m256 _p1 = _mm256_loadu_ps(ptr);
186 _p = _mm256_fmadd_ps(_p1, _coeff, _p);
187 _mm256_storeu_ps(outptr, _p);
188
189 ptr += 8;
190 outptr += 8;
191 }
192 }
193 }
194 }
195 }
196 if (op_type == Operation_MAX)
197 {
198 // first blob
199 const Mat& bottom_blob1 = bottom_blobs[1];
200 #pragma omp parallel for num_threads(opt.num_threads)
201 for (int q = 0; q < channels; q++)
202 {
203 const float* ptr = bottom_blob.channel(q);
204 const float* ptr1 = bottom_blob1.channel(q);
205 float* outptr = top_blob.channel(q);
206
207 for (int i = 0; i < size; i++)
208 {
209 __m256 _p = _mm256_loadu_ps(ptr);
210 __m256 _p1 = _mm256_loadu_ps(ptr1);
211 _p = _mm256_max_ps(_p, _p1);
212 _mm256_storeu_ps(outptr, _p);
213
214 ptr += 8;
215 ptr1 += 8;
216 outptr += 8;
217 }
218 }
219
220 for (size_t b = 2; b < bottom_blobs.size(); b++)
221 {
222 const Mat& bottom_blob2 = bottom_blobs[b];
223 #pragma omp parallel for num_threads(opt.num_threads)
224 for (int q = 0; q < channels; q++)
225 {
226 const float* ptr = bottom_blob2.channel(q);
227 float* outptr = top_blob.channel(q);
228
229 for (int i = 0; i < size; i++)
230 {
231 __m256 _p = _mm256_loadu_ps(outptr);
232 __m256 _p1 = _mm256_loadu_ps(ptr);
233 _p = _mm256_max_ps(_p, _p1);
234 _mm256_storeu_ps(outptr, _p);
235
236 ptr += 8;
237 outptr += 8;
238 }
239 }
240 }
241 }
242
243 return 0;
244 }
245 #endif // __AVX__
246
247 if (elempack == 4)
248 {
249 if (op_type == Operation_PROD)
250 {
251 // first blob
252 const Mat& bottom_blob1 = bottom_blobs[1];
253 #pragma omp parallel for num_threads(opt.num_threads)
254 for (int q = 0; q < channels; q++)
255 {
256 const float* ptr = bottom_blob.channel(q);
257 const float* ptr1 = bottom_blob1.channel(q);
258 float* outptr = top_blob.channel(q);
259
260 for (int i = 0; i < size; i++)
261 {
262 __m128 _p = _mm_load_ps(ptr);
263 __m128 _p1 = _mm_load_ps(ptr1);
264 _p = _mm_mul_ps(_p, _p1);
265 _mm_store_ps(outptr, _p);
266
267 ptr += 4;
268 ptr1 += 4;
269 outptr += 4;
270 }
271 }
272
273 for (size_t b = 2; b < bottom_blobs.size(); b++)
274 {
275 const Mat& bottom_blob2 = bottom_blobs[b];
276 #pragma omp parallel for num_threads(opt.num_threads)
277 for (int q = 0; q < channels; q++)
278 {
279 const float* ptr = bottom_blob2.channel(q);
280 float* outptr = top_blob.channel(q);
281
282 for (int i = 0; i < size; i++)
283 {
284 __m128 _p = _mm_load_ps(outptr);
285 __m128 _p1 = _mm_load_ps(ptr);
286 _p = _mm_mul_ps(_p, _p1);
287 _mm_store_ps(outptr, _p);
288
289 ptr += 4;
290 outptr += 4;
291 }
292 }
293 }
294 }
295 if (op_type == Operation_SUM)
296 {
297 if (coeffs.w == 0)
298 {
299 // first blob
300 const Mat& bottom_blob1 = bottom_blobs[1];
301 #pragma omp parallel for num_threads(opt.num_threads)
302 for (int q = 0; q < channels; q++)
303 {
304 const float* ptr = bottom_blob.channel(q);
305 const float* ptr1 = bottom_blob1.channel(q);
306 float* outptr = top_blob.channel(q);
307
308 for (int i = 0; i < size; i++)
309 {
310 __m128 _p = _mm_load_ps(ptr);
311 __m128 _p1 = _mm_load_ps(ptr1);
312 _p = _mm_add_ps(_p, _p1);
313 _mm_store_ps(outptr, _p);
314
315 ptr += 4;
316 ptr1 += 4;
317 outptr += 4;
318 }
319 }
320
321 for (size_t b = 2; b < bottom_blobs.size(); b++)
322 {
323 const Mat& bottom_blob2 = bottom_blobs[b];
324 #pragma omp parallel for num_threads(opt.num_threads)
325 for (int q = 0; q < channels; q++)
326 {
327 const float* ptr = bottom_blob2.channel(q);
328 float* outptr = top_blob.channel(q);
329
330 for (int i = 0; i < size; i++)
331 {
332 __m128 _p = _mm_load_ps(outptr);
333 __m128 _p1 = _mm_load_ps(ptr);
334 _p = _mm_add_ps(_p, _p1);
335 _mm_store_ps(outptr, _p);
336
337 ptr += 4;
338 outptr += 4;
339 }
340 }
341 }
342 }
343 else
344 {
345 // first blob
346 const Mat& bottom_blob1 = bottom_blobs[1];
347 __m128 _coeff0 = _mm_set1_ps(coeffs[0]);
348 __m128 _coeff1 = _mm_set1_ps(coeffs[1]);
349 #pragma omp parallel for num_threads(opt.num_threads)
350 for (int q = 0; q < channels; q++)
351 {
352 const float* ptr = bottom_blob.channel(q);
353 const float* ptr1 = bottom_blob1.channel(q);
354 float* outptr = top_blob.channel(q);
355
356 for (int i = 0; i < size; i++)
357 {
358 __m128 _p = _mm_load_ps(ptr);
359 __m128 _p1 = _mm_load_ps(ptr1);
360 _p = _mm_mul_ps(_p, _coeff0);
361 _p1 = _mm_mul_ps(_p1, _coeff1);
362 _p = _mm_add_ps(_p1, _p);
363 _mm_store_ps(outptr, _p);
364
365 ptr += 4;
366 ptr1 += 4;
367 outptr += 4;
368 }
369 }
370
371 for (size_t b = 2; b < bottom_blobs.size(); b++)
372 {
373 const Mat& bottom_blob2 = bottom_blobs[b];
374 __m128 _coeff = _mm_set1_ps(coeffs[b]);
375 #pragma omp parallel for num_threads(opt.num_threads)
376 for (int q = 0; q < channels; q++)
377 {
378 const float* ptr = bottom_blob2.channel(q);
379 float* outptr = top_blob.channel(q);
380
381 for (int i = 0; i < size; i++)
382 {
383 __m128 _p1 = _mm_load_ps(ptr);
384 __m128 _p = _mm_load_ps(outptr);
385 _p1 = _mm_mul_ps(_p1, _coeff);
386 _p = _mm_add_ps(_p1, _p);
387 _mm_store_ps(outptr, _p);
388
389 ptr += 4;
390 outptr += 4;
391 }
392 }
393 }
394 }
395 }
396 if (op_type == Operation_MAX)
397 {
398 // first blob
399 const Mat& bottom_blob1 = bottom_blobs[1];
400 #pragma omp parallel for num_threads(opt.num_threads)
401 for (int q = 0; q < channels; q++)
402 {
403 const float* ptr = bottom_blob.channel(q);
404 const float* ptr1 = bottom_blob1.channel(q);
405 float* outptr = top_blob.channel(q);
406
407 for (int i = 0; i < size; i++)
408 {
409 __m128 _p = _mm_load_ps(ptr);
410 __m128 _p1 = _mm_load_ps(ptr1);
411 _p = _mm_max_ps(_p, _p1);
412 _mm_store_ps(outptr, _p);
413
414 ptr += 4;
415 ptr1 += 4;
416 outptr += 4;
417 }
418 }
419
420 for (size_t b = 2; b < bottom_blobs.size(); b++)
421 {
422 const Mat& bottom_blob2 = bottom_blobs[b];
423 #pragma omp parallel for num_threads(opt.num_threads)
424 for (int q = 0; q < channels; q++)
425 {
426 const float* ptr = bottom_blob2.channel(q);
427 float* outptr = top_blob.channel(q);
428
429 for (int i = 0; i < size; i++)
430 {
431 __m128 _p = _mm_load_ps(outptr);
432 __m128 _p1 = _mm_load_ps(ptr);
433 _p = _mm_max_ps(_p, _p1);
434 _mm_store_ps(outptr, _p);
435
436 ptr += 4;
437 outptr += 4;
438 }
439 }
440 }
441 }
442
443 return 0;
444 }
445 #endif // __SSE2__
446
447 if (op_type == Operation_PROD)
448 {
449 // first blob
450 const Mat& bottom_blob1 = bottom_blobs[1];
451 #pragma omp parallel for num_threads(opt.num_threads)
452 for (int q = 0; q < channels; q++)
453 {
454 const float* ptr = bottom_blob.channel(q);
455 const float* ptr1 = bottom_blob1.channel(q);
456 float* outptr = top_blob.channel(q);
457 int remain = size;
458 for (; remain > 0; remain--)
459 {
460 *outptr = *ptr * *ptr1;
461
462 ptr++;
463 ptr1++;
464 outptr++;
465 }
466 }
467
468 for (size_t b = 2; b < bottom_blobs.size(); b++)
469 {
470 const Mat& bottom_blob2 = bottom_blobs[b];
471 #pragma omp parallel for num_threads(opt.num_threads)
472 for (int q = 0; q < channels; q++)
473 {
474 const float* ptr = bottom_blob2.channel(q);
475 float* outptr = top_blob.channel(q);
476 int remain = size;
477
478 for (; remain > 0; remain--)
479 {
480 *outptr *= *ptr;
481
482 ptr++;
483 outptr++;
484 }
485 }
486 }
487 }
488 if (op_type == Operation_SUM)
489 {
490 if (coeffs.w == 0)
491 {
492 // first blob
493 const Mat& bottom_blob1 = bottom_blobs[1];
494 #pragma omp parallel for num_threads(opt.num_threads)
495 for (int q = 0; q < channels; q++)
496 {
497 const float* ptr = bottom_blob.channel(q);
498 const float* ptr1 = bottom_blob1.channel(q);
499 float* outptr = top_blob.channel(q);
500 int remain = size;
501
502 for (; remain > 0; remain--)
503 {
504 *outptr = *ptr + *ptr1;
505
506 ptr++;
507 ptr1++;
508 outptr++;
509 }
510 }
511
512 for (size_t b = 2; b < bottom_blobs.size(); b++)
513 {
514 const Mat& bottom_blob2 = bottom_blobs[b];
515 #pragma omp parallel for num_threads(opt.num_threads)
516 for (int q = 0; q < channels; q++)
517 {
518 const float* ptr = bottom_blob2.channel(q);
519 float* outptr = top_blob.channel(q);
520
521 int remain = size;
522 for (; remain > 0; remain--)
523 {
524 *outptr += *ptr;
525
526 ptr++;
527 outptr++;
528 }
529 }
530 }
531 }
532 else
533 {
534 // first blob
535 const Mat& bottom_blob1 = bottom_blobs[1];
536 float coeff0 = coeffs[0];
537 float coeff1 = coeffs[1];
538 #pragma omp parallel for num_threads(opt.num_threads)
539 for (int q = 0; q < channels; q++)
540 {
541 const float* ptr = bottom_blob.channel(q);
542 const float* ptr1 = bottom_blob1.channel(q);
543 float* outptr = top_blob.channel(q);
544 int remain = size;
545 for (; remain > 0; remain--)
546 {
547 *outptr = *ptr * coeff0 + *ptr1 * coeff1;
548
549 ptr++;
550 ptr1++;
551 outptr++;
552 }
553 }
554
555 for (size_t b = 2; b < bottom_blobs.size(); b++)
556 {
557 const Mat& bottom_blob2 = bottom_blobs[b];
558 float coeff = coeffs[b];
559 #pragma omp parallel for num_threads(opt.num_threads)
560 for (int q = 0; q < channels; q++)
561 {
562 const float* ptr = bottom_blob2.channel(q);
563 float* outptr = top_blob.channel(q);
564
565 int remain = size;
566 for (; remain > 0; remain--)
567 {
568 *outptr += *ptr * coeff;
569
570 ptr++;
571 outptr++;
572 }
573 }
574 }
575 }
576 }
577 if (op_type == Operation_MAX)
578 {
579 // first blob
580 const Mat& bottom_blob1 = bottom_blobs[1];
581 #pragma omp parallel for num_threads(opt.num_threads)
582 for (int q = 0; q < channels; q++)
583 {
584 const float* ptr = bottom_blob.channel(q);
585 const float* ptr1 = bottom_blob1.channel(q);
586 float* outptr = top_blob.channel(q);
587
588 int remain = size;
589 for (; remain > 0; remain--)
590 {
591 *outptr = std::max(*ptr, *ptr1);
592
593 ptr++;
594 ptr1++;
595 outptr++;
596 }
597 }
598
599 for (size_t b = 2; b < bottom_blobs.size(); b++)
600 {
601 const Mat& bottom_blob2 = bottom_blobs[b];
602 #pragma omp parallel for num_threads(opt.num_threads)
603 for (int q = 0; q < channels; q++)
604 {
605 const float* ptr = bottom_blob2.channel(q);
606 float* outptr = top_blob.channel(q);
607
608 int remain = size;
609 for (; remain > 0; remain--)
610 {
611 *outptr = std::max(*ptr, *outptr);
612
613 ptr++;
614 outptr++;
615 }
616 }
617 }
618 }
619
620 return 0;
621 }
622
623 } // namespace ncnn
624