1 /*
2 * Simd Library (http://ermig1979.github.io/Simd).
3 *
4 * Copyright (c) 2011-2019 Yermalayeu Ihar.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24 #include "Simd/SimdMemory.h"
25 #include "Simd/SimdResizer.h"
26 #include "Simd/SimdStore.h"
27 
28 namespace Simd
29 {
30 #ifdef SIMD_AVX_ENABLE
31     namespace Avx
32     {
ResizerFloatBilinear(const ResParam & param)33         ResizerFloatBilinear::ResizerFloatBilinear(const ResParam & param)
34             : Base::ResizerFloatBilinear(param)
35         {
36         }
37 
Run(const float * src,size_t srcStride,float * dst,size_t dstStride)38         void ResizerFloatBilinear::Run(const float * src, size_t srcStride, float * dst, size_t dstStride)
39         {
40             size_t cn = _param.channels;
41             size_t rs = _param.dstW * cn;
42             float * pbx[2] = { _bx[0].data, _bx[1].data };
43             int32_t prev = -2;
44             size_t rsa = AlignLo(rs, Avx::F);
45             size_t rsh = AlignLo(rs, Sse::F);
46             for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride)
47             {
48                 float fy1 = _ay[dy];
49                 float fy0 = 1.0f - fy1;
50                 int32_t sy = _iy[dy];
51                 int32_t k = 0;
52 
53                 if (sy == prev)
54                     k = 2;
55                 else if (sy == prev + 1)
56                 {
57                     Swap(pbx[0], pbx[1]);
58                     k = 1;
59                 }
60 
61                 prev = sy;
62 
63                 for (; k < 2; k++)
64                 {
65                     float * pb = pbx[k];
66                     const float * ps = src + (sy + k)*srcStride;
67                     size_t dx = 0;
68                     if (cn == 1)
69                     {
70                         __m256 _1 = _mm256_set1_ps(1.0f);
71                         for (; dx < rsa; dx += Avx::F)
72                         {
73                             __m256 s0145 = Avx::Load(ps + _ix[dx + 0], ps + _ix[dx + 1], ps + _ix[dx + 4], ps + _ix[dx + 5]);
74                             __m256 s2367 = Avx::Load(ps + _ix[dx + 2], ps + _ix[dx + 3], ps + _ix[dx + 6], ps + _ix[dx + 7]);
75                             __m256 fx1 = _mm256_load_ps(_ax.data + dx);
76                             __m256 fx0 = _mm256_sub_ps(_1, fx1);
77                             __m256 m0 = _mm256_mul_ps(fx0, _mm256_shuffle_ps(s0145, s2367, 0x88));
78                             __m256 m1 = _mm256_mul_ps(fx1, _mm256_shuffle_ps(s0145, s2367, 0xDD));
79                             _mm256_store_ps(pb + dx, _mm256_add_ps(m0, m1));
80                         }
81                         for (; dx < rsh; dx += Sse::F)
82                         {
83                             __m128 s01 = Sse::Load(ps + _ix[dx + 0], ps + _ix[dx + 1]);
84                             __m128 s23 = Sse::Load(ps + _ix[dx + 2], ps + _ix[dx + 3]);
85                             __m128 fx1 = _mm_load_ps(_ax.data + dx);
86                             __m128 fx0 = _mm_sub_ps(_mm256_castps256_ps128(_1), fx1);
87                             __m128 m0 = _mm_mul_ps(fx0, _mm_shuffle_ps(s01, s23, 0x88));
88                             __m128 m1 = _mm_mul_ps(fx1, _mm_shuffle_ps(s01, s23, 0xDD));
89                             _mm_store_ps(pb + dx, _mm_add_ps(m0, m1));
90                         }
91                     }
92                     if (cn == 3 && rs > 3)
93                     {
94                         __m256 _1 = _mm256_set1_ps(1.0f);
95                         size_t rs3 = rs - 3;
96                         size_t rs6 = AlignLoAny(rs3, 6);
97                         for (; dx < rs6; dx += 6)
98                         {
99                             __m256 s0 = Load<false>(ps + _ix[dx + 0] + 0, ps + _ix[dx + 3] + 0);
100                             __m256 s1 = Load<false>(ps + _ix[dx + 0] + 3, ps + _ix[dx + 3] + 3);
101                             __m256 fx1 = Load<false>(_ax.data + dx + 0, _ax.data + dx + 3);
102                             __m256 fx0 = _mm256_sub_ps(_1, fx1);
103                             Store<false>(pb + dx + 0, pb + dx + 3, _mm256_add_ps(_mm256_mul_ps(fx0, s0), _mm256_mul_ps(fx1, s1)));
104                         }
105                         for (; dx < rs3; dx += 3)
106                         {
107                             __m128 s0 = _mm_loadu_ps(ps + _ix[dx] + 0);
108                             __m128 s1 = _mm_loadu_ps(ps + _ix[dx] + 3);
109                             __m128 fx1 = _mm_set1_ps(_ax.data[dx]);
110                             __m128 fx0 = _mm_sub_ps(_mm256_castps256_ps128(_1), fx1);
111                             _mm_storeu_ps(pb + dx, _mm_add_ps(_mm_mul_ps(fx0, s0), _mm_mul_ps(fx1, s1)));
112                         }
113                     }
114                     for (; dx < rs; dx++)
115                     {
116                         int32_t sx = _ix[dx];
117                         float fx = _ax[dx];
118                         pb[dx] = ps[sx] * (1.0f - fx) + ps[sx + cn] * fx;
119                     }
120                 }
121 
122                 size_t dx = 0;
123                 __m256 _fy0 = _mm256_set1_ps(fy0);
124                 __m256 _fy1 = _mm256_set1_ps(fy1);
125                 for (; dx < rsa; dx += Avx::F)
126                 {
127                     __m256 m0 = _mm256_mul_ps(_mm256_load_ps(pbx[0] + dx), _fy0);
128                     __m256 m1 = _mm256_mul_ps(_mm256_load_ps(pbx[1] + dx), _fy1);
129                     _mm256_storeu_ps(dst + dx, _mm256_add_ps(m0, m1));
130                 }
131                 for (; dx < rsh; dx += Sse::F)
132                 {
133                     __m128 m0 = _mm_mul_ps(_mm_load_ps(pbx[0] + dx), _mm256_castps256_ps128(_fy0));
134                     __m128 m1 = _mm_mul_ps(_mm_load_ps(pbx[1] + dx), _mm256_castps256_ps128(_fy1));
135                     _mm_storeu_ps(dst + dx, _mm_add_ps(m0, m1));
136                 }
137                 for (; dx < rs; dx++)
138                     dst[dx] = pbx[0][dx] * fy0 + pbx[1][dx] * fy1;
139             }
140         }
141 
142         //---------------------------------------------------------------------
143 
ResizerInit(size_t srcX,size_t srcY,size_t dstX,size_t dstY,size_t channels,SimdResizeChannelType type,SimdResizeMethodType method)144         void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method)
145         {
146             ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(__m256));
147             if (type == SimdResizeChannelFloat && (method == SimdResizeMethodBilinear || method == SimdResizeMethodCaffeInterp))
148                 return new ResizerFloatBilinear(param);
149             else
150                 return Sse41::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method);
151         }
152     }
153 #endif //SIMD_AVX_ENABLE
154 }
155 
156