1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
15 #include "eltwise_x86.h"
16 
17 #if __SSE2__
18 #include <emmintrin.h>
19 #if __AVX__
20 #include <immintrin.h>
21 #endif // __AVX__
22 #endif // __SSE2__
23 
24 namespace ncnn {
25 
Eltwise_x86()26 Eltwise_x86::Eltwise_x86()
27 {
28 #if __SSE2__
29     support_packing = true;
30 #endif // __SSE2__
31 }
32 
forward(const std::vector<Mat> & bottom_blobs,std::vector<Mat> & top_blobs,const Option & opt) const33 int Eltwise_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
34 {
35     const Mat& bottom_blob = bottom_blobs[0];
36     int w = bottom_blob.w;
37     int h = bottom_blob.h;
38     int channels = bottom_blob.c;
39     int elempack = bottom_blob.elempack;
40     int size = w * h;
41 
42     Mat& top_blob = top_blobs[0];
43     top_blob.create_like(bottom_blob, opt.blob_allocator);
44     if (top_blob.empty())
45         return -100;
46 
47 #if __SSE2__
48 #if __AVX__
49     if (elempack == 8)
50     {
51         if (op_type == Operation_PROD)
52         {
53             // first blob
54             const Mat& bottom_blob1 = bottom_blobs[1];
55             #pragma omp parallel for num_threads(opt.num_threads)
56             for (int q = 0; q < channels; q++)
57             {
58                 const float* ptr = bottom_blob.channel(q);
59                 const float* ptr1 = bottom_blob1.channel(q);
60                 float* outptr = top_blob.channel(q);
61 
62                 for (int i = 0; i < size; i++)
63                 {
64                     __m256 _p = _mm256_loadu_ps(ptr);
65                     __m256 _p1 = _mm256_loadu_ps(ptr1);
66                     _p = _mm256_mul_ps(_p, _p1);
67                     _mm256_storeu_ps(outptr, _p);
68 
69                     ptr += 8;
70                     ptr1 += 8;
71                     outptr += 8;
72                 }
73             }
74 
75             for (size_t b = 2; b < bottom_blobs.size(); b++)
76             {
77                 const Mat& bottom_blob2 = bottom_blobs[b];
78                 #pragma omp parallel for num_threads(opt.num_threads)
79                 for (int q = 0; q < channels; q++)
80                 {
81                     const float* ptr = bottom_blob2.channel(q);
82                     float* outptr = top_blob.channel(q);
83 
84                     for (int i = 0; i < size; i++)
85                     {
86                         __m256 _p = _mm256_loadu_ps(outptr);
87                         __m256 _p1 = _mm256_loadu_ps(ptr);
88                         _p = _mm256_mul_ps(_p, _p1);
89                         _mm256_storeu_ps(outptr, _p);
90 
91                         ptr += 8;
92                         outptr += 8;
93                     }
94                 }
95             }
96         }
97         if (op_type == Operation_SUM)
98         {
99             if (coeffs.w == 0)
100             {
101                 // first blob
102                 const Mat& bottom_blob1 = bottom_blobs[1];
103                 #pragma omp parallel for num_threads(opt.num_threads)
104                 for (int q = 0; q < channels; q++)
105                 {
106                     const float* ptr = bottom_blob.channel(q);
107                     const float* ptr1 = bottom_blob1.channel(q);
108                     float* outptr = top_blob.channel(q);
109 
110                     for (int i = 0; i < size; i++)
111                     {
112                         __m256 _p = _mm256_loadu_ps(ptr);
113                         __m256 _p1 = _mm256_loadu_ps(ptr1);
114                         _p = _mm256_add_ps(_p, _p1);
115                         _mm256_storeu_ps(outptr, _p);
116 
117                         ptr += 8;
118                         ptr1 += 8;
119                         outptr += 8;
120                     }
121                 }
122 
123                 for (size_t b = 2; b < bottom_blobs.size(); b++)
124                 {
125                     const Mat& bottom_blob2 = bottom_blobs[b];
126                     #pragma omp parallel for num_threads(opt.num_threads)
127                     for (int q = 0; q < channels; q++)
128                     {
129                         const float* ptr = bottom_blob2.channel(q);
130                         float* outptr = top_blob.channel(q);
131 
132                         for (int i = 0; i < size; i++)
133                         {
134                             __m256 _p = _mm256_loadu_ps(outptr);
135                             __m256 _p1 = _mm256_loadu_ps(ptr);
136                             _p = _mm256_add_ps(_p, _p1);
137                             _mm256_storeu_ps(outptr, _p);
138 
139                             ptr += 8;
140                             outptr += 8;
141                         }
142                     }
143                 }
144             }
145             else
146             {
147                 // first blob
148                 const Mat& bottom_blob1 = bottom_blobs[1];
149                 __m256 _coeff0 = _mm256_set1_ps(coeffs[0]);
150                 __m256 _coeff1 = _mm256_set1_ps(coeffs[1]);
151                 #pragma omp parallel for num_threads(opt.num_threads)
152                 for (int q = 0; q < channels; q++)
153                 {
154                     const float* ptr = bottom_blob.channel(q);
155                     const float* ptr1 = bottom_blob1.channel(q);
156                     float* outptr = top_blob.channel(q);
157 
158                     for (int i = 0; i < size; i++)
159                     {
160                         __m256 _p = _mm256_loadu_ps(ptr);
161                         __m256 _p1 = _mm256_loadu_ps(ptr1);
162                         _p = _mm256_mul_ps(_p, _coeff0);
163                         _p = _mm256_fmadd_ps(_p1, _coeff1, _p);
164                         _mm256_storeu_ps(outptr, _p);
165 
166                         ptr += 8;
167                         ptr1 += 8;
168                         outptr += 8;
169                     }
170                 }
171 
172                 for (size_t b = 2; b < bottom_blobs.size(); b++)
173                 {
174                     const Mat& bottom_blob2 = bottom_blobs[b];
175                     __m256 _coeff = _mm256_set1_ps(coeffs[b]);
176                     #pragma omp parallel for num_threads(opt.num_threads)
177                     for (int q = 0; q < channels; q++)
178                     {
179                         const float* ptr = bottom_blob2.channel(q);
180                         float* outptr = top_blob.channel(q);
181 
182                         for (int i = 0; i < size; i++)
183                         {
184                             __m256 _p = _mm256_loadu_ps(outptr);
185                             __m256 _p1 = _mm256_loadu_ps(ptr);
186                             _p = _mm256_fmadd_ps(_p1, _coeff, _p);
187                             _mm256_storeu_ps(outptr, _p);
188 
189                             ptr += 8;
190                             outptr += 8;
191                         }
192                     }
193                 }
194             }
195         }
196         if (op_type == Operation_MAX)
197         {
198             // first blob
199             const Mat& bottom_blob1 = bottom_blobs[1];
200             #pragma omp parallel for num_threads(opt.num_threads)
201             for (int q = 0; q < channels; q++)
202             {
203                 const float* ptr = bottom_blob.channel(q);
204                 const float* ptr1 = bottom_blob1.channel(q);
205                 float* outptr = top_blob.channel(q);
206 
207                 for (int i = 0; i < size; i++)
208                 {
209                     __m256 _p = _mm256_loadu_ps(ptr);
210                     __m256 _p1 = _mm256_loadu_ps(ptr1);
211                     _p = _mm256_max_ps(_p, _p1);
212                     _mm256_storeu_ps(outptr, _p);
213 
214                     ptr += 8;
215                     ptr1 += 8;
216                     outptr += 8;
217                 }
218             }
219 
220             for (size_t b = 2; b < bottom_blobs.size(); b++)
221             {
222                 const Mat& bottom_blob2 = bottom_blobs[b];
223                 #pragma omp parallel for num_threads(opt.num_threads)
224                 for (int q = 0; q < channels; q++)
225                 {
226                     const float* ptr = bottom_blob2.channel(q);
227                     float* outptr = top_blob.channel(q);
228 
229                     for (int i = 0; i < size; i++)
230                     {
231                         __m256 _p = _mm256_loadu_ps(outptr);
232                         __m256 _p1 = _mm256_loadu_ps(ptr);
233                         _p = _mm256_max_ps(_p, _p1);
234                         _mm256_storeu_ps(outptr, _p);
235 
236                         ptr += 8;
237                         outptr += 8;
238                     }
239                 }
240             }
241         }
242 
243         return 0;
244     }
245 #endif // __AVX__
246 
247     if (elempack == 4)
248     {
249         if (op_type == Operation_PROD)
250         {
251             // first blob
252             const Mat& bottom_blob1 = bottom_blobs[1];
253             #pragma omp parallel for num_threads(opt.num_threads)
254             for (int q = 0; q < channels; q++)
255             {
256                 const float* ptr = bottom_blob.channel(q);
257                 const float* ptr1 = bottom_blob1.channel(q);
258                 float* outptr = top_blob.channel(q);
259 
260                 for (int i = 0; i < size; i++)
261                 {
262                     __m128 _p = _mm_load_ps(ptr);
263                     __m128 _p1 = _mm_load_ps(ptr1);
264                     _p = _mm_mul_ps(_p, _p1);
265                     _mm_store_ps(outptr, _p);
266 
267                     ptr += 4;
268                     ptr1 += 4;
269                     outptr += 4;
270                 }
271             }
272 
273             for (size_t b = 2; b < bottom_blobs.size(); b++)
274             {
275                 const Mat& bottom_blob2 = bottom_blobs[b];
276                 #pragma omp parallel for num_threads(opt.num_threads)
277                 for (int q = 0; q < channels; q++)
278                 {
279                     const float* ptr = bottom_blob2.channel(q);
280                     float* outptr = top_blob.channel(q);
281 
282                     for (int i = 0; i < size; i++)
283                     {
284                         __m128 _p = _mm_load_ps(outptr);
285                         __m128 _p1 = _mm_load_ps(ptr);
286                         _p = _mm_mul_ps(_p, _p1);
287                         _mm_store_ps(outptr, _p);
288 
289                         ptr += 4;
290                         outptr += 4;
291                     }
292                 }
293             }
294         }
295         if (op_type == Operation_SUM)
296         {
297             if (coeffs.w == 0)
298             {
299                 // first blob
300                 const Mat& bottom_blob1 = bottom_blobs[1];
301                 #pragma omp parallel for num_threads(opt.num_threads)
302                 for (int q = 0; q < channels; q++)
303                 {
304                     const float* ptr = bottom_blob.channel(q);
305                     const float* ptr1 = bottom_blob1.channel(q);
306                     float* outptr = top_blob.channel(q);
307 
308                     for (int i = 0; i < size; i++)
309                     {
310                         __m128 _p = _mm_load_ps(ptr);
311                         __m128 _p1 = _mm_load_ps(ptr1);
312                         _p = _mm_add_ps(_p, _p1);
313                         _mm_store_ps(outptr, _p);
314 
315                         ptr += 4;
316                         ptr1 += 4;
317                         outptr += 4;
318                     }
319                 }
320 
321                 for (size_t b = 2; b < bottom_blobs.size(); b++)
322                 {
323                     const Mat& bottom_blob2 = bottom_blobs[b];
324                     #pragma omp parallel for num_threads(opt.num_threads)
325                     for (int q = 0; q < channels; q++)
326                     {
327                         const float* ptr = bottom_blob2.channel(q);
328                         float* outptr = top_blob.channel(q);
329 
330                         for (int i = 0; i < size; i++)
331                         {
332                             __m128 _p = _mm_load_ps(outptr);
333                             __m128 _p1 = _mm_load_ps(ptr);
334                             _p = _mm_add_ps(_p, _p1);
335                             _mm_store_ps(outptr, _p);
336 
337                             ptr += 4;
338                             outptr += 4;
339                         }
340                     }
341                 }
342             }
343             else
344             {
345                 // first blob
346                 const Mat& bottom_blob1 = bottom_blobs[1];
347                 __m128 _coeff0 = _mm_set1_ps(coeffs[0]);
348                 __m128 _coeff1 = _mm_set1_ps(coeffs[1]);
349                 #pragma omp parallel for num_threads(opt.num_threads)
350                 for (int q = 0; q < channels; q++)
351                 {
352                     const float* ptr = bottom_blob.channel(q);
353                     const float* ptr1 = bottom_blob1.channel(q);
354                     float* outptr = top_blob.channel(q);
355 
356                     for (int i = 0; i < size; i++)
357                     {
358                         __m128 _p = _mm_load_ps(ptr);
359                         __m128 _p1 = _mm_load_ps(ptr1);
360                         _p = _mm_mul_ps(_p, _coeff0);
361                         _p1 = _mm_mul_ps(_p1, _coeff1);
362                         _p = _mm_add_ps(_p1, _p);
363                         _mm_store_ps(outptr, _p);
364 
365                         ptr += 4;
366                         ptr1 += 4;
367                         outptr += 4;
368                     }
369                 }
370 
371                 for (size_t b = 2; b < bottom_blobs.size(); b++)
372                 {
373                     const Mat& bottom_blob2 = bottom_blobs[b];
374                     __m128 _coeff = _mm_set1_ps(coeffs[b]);
375                     #pragma omp parallel for num_threads(opt.num_threads)
376                     for (int q = 0; q < channels; q++)
377                     {
378                         const float* ptr = bottom_blob2.channel(q);
379                         float* outptr = top_blob.channel(q);
380 
381                         for (int i = 0; i < size; i++)
382                         {
383                             __m128 _p1 = _mm_load_ps(ptr);
384                             __m128 _p = _mm_load_ps(outptr);
385                             _p1 = _mm_mul_ps(_p1, _coeff);
386                             _p = _mm_add_ps(_p1, _p);
387                             _mm_store_ps(outptr, _p);
388 
389                             ptr += 4;
390                             outptr += 4;
391                         }
392                     }
393                 }
394             }
395         }
396         if (op_type == Operation_MAX)
397         {
398             // first blob
399             const Mat& bottom_blob1 = bottom_blobs[1];
400             #pragma omp parallel for num_threads(opt.num_threads)
401             for (int q = 0; q < channels; q++)
402             {
403                 const float* ptr = bottom_blob.channel(q);
404                 const float* ptr1 = bottom_blob1.channel(q);
405                 float* outptr = top_blob.channel(q);
406 
407                 for (int i = 0; i < size; i++)
408                 {
409                     __m128 _p = _mm_load_ps(ptr);
410                     __m128 _p1 = _mm_load_ps(ptr1);
411                     _p = _mm_max_ps(_p, _p1);
412                     _mm_store_ps(outptr, _p);
413 
414                     ptr += 4;
415                     ptr1 += 4;
416                     outptr += 4;
417                 }
418             }
419 
420             for (size_t b = 2; b < bottom_blobs.size(); b++)
421             {
422                 const Mat& bottom_blob2 = bottom_blobs[b];
423                 #pragma omp parallel for num_threads(opt.num_threads)
424                 for (int q = 0; q < channels; q++)
425                 {
426                     const float* ptr = bottom_blob2.channel(q);
427                     float* outptr = top_blob.channel(q);
428 
429                     for (int i = 0; i < size; i++)
430                     {
431                         __m128 _p = _mm_load_ps(outptr);
432                         __m128 _p1 = _mm_load_ps(ptr);
433                         _p = _mm_max_ps(_p, _p1);
434                         _mm_store_ps(outptr, _p);
435 
436                         ptr += 4;
437                         outptr += 4;
438                     }
439                 }
440             }
441         }
442 
443         return 0;
444     }
445 #endif // __SSE2__
446 
447     if (op_type == Operation_PROD)
448     {
449         // first blob
450         const Mat& bottom_blob1 = bottom_blobs[1];
451         #pragma omp parallel for num_threads(opt.num_threads)
452         for (int q = 0; q < channels; q++)
453         {
454             const float* ptr = bottom_blob.channel(q);
455             const float* ptr1 = bottom_blob1.channel(q);
456             float* outptr = top_blob.channel(q);
457             int remain = size;
458             for (; remain > 0; remain--)
459             {
460                 *outptr = *ptr * *ptr1;
461 
462                 ptr++;
463                 ptr1++;
464                 outptr++;
465             }
466         }
467 
468         for (size_t b = 2; b < bottom_blobs.size(); b++)
469         {
470             const Mat& bottom_blob2 = bottom_blobs[b];
471             #pragma omp parallel for num_threads(opt.num_threads)
472             for (int q = 0; q < channels; q++)
473             {
474                 const float* ptr = bottom_blob2.channel(q);
475                 float* outptr = top_blob.channel(q);
476                 int remain = size;
477 
478                 for (; remain > 0; remain--)
479                 {
480                     *outptr *= *ptr;
481 
482                     ptr++;
483                     outptr++;
484                 }
485             }
486         }
487     }
488     if (op_type == Operation_SUM)
489     {
490         if (coeffs.w == 0)
491         {
492             // first blob
493             const Mat& bottom_blob1 = bottom_blobs[1];
494             #pragma omp parallel for num_threads(opt.num_threads)
495             for (int q = 0; q < channels; q++)
496             {
497                 const float* ptr = bottom_blob.channel(q);
498                 const float* ptr1 = bottom_blob1.channel(q);
499                 float* outptr = top_blob.channel(q);
500                 int remain = size;
501 
502                 for (; remain > 0; remain--)
503                 {
504                     *outptr = *ptr + *ptr1;
505 
506                     ptr++;
507                     ptr1++;
508                     outptr++;
509                 }
510             }
511 
512             for (size_t b = 2; b < bottom_blobs.size(); b++)
513             {
514                 const Mat& bottom_blob2 = bottom_blobs[b];
515                 #pragma omp parallel for num_threads(opt.num_threads)
516                 for (int q = 0; q < channels; q++)
517                 {
518                     const float* ptr = bottom_blob2.channel(q);
519                     float* outptr = top_blob.channel(q);
520 
521                     int remain = size;
522                     for (; remain > 0; remain--)
523                     {
524                         *outptr += *ptr;
525 
526                         ptr++;
527                         outptr++;
528                     }
529                 }
530             }
531         }
532         else
533         {
534             // first blob
535             const Mat& bottom_blob1 = bottom_blobs[1];
536             float coeff0 = coeffs[0];
537             float coeff1 = coeffs[1];
538             #pragma omp parallel for num_threads(opt.num_threads)
539             for (int q = 0; q < channels; q++)
540             {
541                 const float* ptr = bottom_blob.channel(q);
542                 const float* ptr1 = bottom_blob1.channel(q);
543                 float* outptr = top_blob.channel(q);
544                 int remain = size;
545                 for (; remain > 0; remain--)
546                 {
547                     *outptr = *ptr * coeff0 + *ptr1 * coeff1;
548 
549                     ptr++;
550                     ptr1++;
551                     outptr++;
552                 }
553             }
554 
555             for (size_t b = 2; b < bottom_blobs.size(); b++)
556             {
557                 const Mat& bottom_blob2 = bottom_blobs[b];
558                 float coeff = coeffs[b];
559                 #pragma omp parallel for num_threads(opt.num_threads)
560                 for (int q = 0; q < channels; q++)
561                 {
562                     const float* ptr = bottom_blob2.channel(q);
563                     float* outptr = top_blob.channel(q);
564 
565                     int remain = size;
566                     for (; remain > 0; remain--)
567                     {
568                         *outptr += *ptr * coeff;
569 
570                         ptr++;
571                         outptr++;
572                     }
573                 }
574             }
575         }
576     }
577     if (op_type == Operation_MAX)
578     {
579         // first blob
580         const Mat& bottom_blob1 = bottom_blobs[1];
581         #pragma omp parallel for num_threads(opt.num_threads)
582         for (int q = 0; q < channels; q++)
583         {
584             const float* ptr = bottom_blob.channel(q);
585             const float* ptr1 = bottom_blob1.channel(q);
586             float* outptr = top_blob.channel(q);
587 
588             int remain = size;
589             for (; remain > 0; remain--)
590             {
591                 *outptr = std::max(*ptr, *ptr1);
592 
593                 ptr++;
594                 ptr1++;
595                 outptr++;
596             }
597         }
598 
599         for (size_t b = 2; b < bottom_blobs.size(); b++)
600         {
601             const Mat& bottom_blob2 = bottom_blobs[b];
602             #pragma omp parallel for num_threads(opt.num_threads)
603             for (int q = 0; q < channels; q++)
604             {
605                 const float* ptr = bottom_blob2.channel(q);
606                 float* outptr = top_blob.channel(q);
607 
608                 int remain = size;
609                 for (; remain > 0; remain--)
610                 {
611                     *outptr = std::max(*ptr, *outptr);
612 
613                     ptr++;
614                     outptr++;
615                 }
616             }
617         }
618     }
619 
620     return 0;
621 }
622 
623 } // namespace ncnn
624