1 /* 2 * Simd Library (http://ermig1979.github.io/Simd). 3 * 4 * Copyright (c) 2011-2019 Yermalayeu Ihar. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 */ 24 #include "Simd/SimdMemory.h" 25 #include "Simd/SimdResizer.h" 26 #include "Simd/SimdStore.h" 27 28 namespace Simd 29 { 30 #ifdef SIMD_AVX_ENABLE 31 namespace Avx 32 { ResizerFloatBilinear(const ResParam & param)33 ResizerFloatBilinear::ResizerFloatBilinear(const ResParam & param) 34 : Base::ResizerFloatBilinear(param) 35 { 36 } 37 Run(const float * src,size_t srcStride,float * dst,size_t dstStride)38 void ResizerFloatBilinear::Run(const float * src, size_t srcStride, float * dst, size_t dstStride) 39 { 40 size_t cn = _param.channels; 41 size_t rs = _param.dstW * cn; 42 float * pbx[2] = { _bx[0].data, _bx[1].data }; 43 int32_t prev = -2; 44 size_t rsa = AlignLo(rs, Avx::F); 45 size_t rsh = AlignLo(rs, Sse::F); 46 for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride) 47 { 48 float fy1 = _ay[dy]; 49 float fy0 = 1.0f - fy1; 50 int32_t sy = _iy[dy]; 51 int32_t k = 0; 52 53 if (sy == prev) 54 k = 2; 55 else if (sy == prev + 1) 56 { 57 Swap(pbx[0], pbx[1]); 58 k = 1; 59 } 60 61 prev = sy; 62 63 for (; k < 2; k++) 64 { 65 float * pb = pbx[k]; 66 const float * ps = src + (sy + k)*srcStride; 67 size_t dx = 0; 68 if (cn == 1) 69 { 70 __m256 _1 = _mm256_set1_ps(1.0f); 71 for (; dx < rsa; dx += Avx::F) 72 { 73 __m256 s0145 = Avx::Load(ps + _ix[dx + 0], ps + _ix[dx + 1], ps + _ix[dx + 4], ps + _ix[dx + 5]); 74 __m256 s2367 = Avx::Load(ps + _ix[dx + 2], ps + _ix[dx + 3], ps + _ix[dx + 6], ps + _ix[dx + 7]); 75 __m256 fx1 = _mm256_load_ps(_ax.data + dx); 76 __m256 fx0 = _mm256_sub_ps(_1, fx1); 77 __m256 m0 = _mm256_mul_ps(fx0, _mm256_shuffle_ps(s0145, s2367, 0x88)); 78 __m256 m1 = _mm256_mul_ps(fx1, _mm256_shuffle_ps(s0145, s2367, 0xDD)); 79 _mm256_store_ps(pb + dx, _mm256_add_ps(m0, m1)); 80 } 81 for (; dx < rsh; dx += Sse::F) 82 { 83 __m128 s01 = Sse::Load(ps + _ix[dx + 0], ps + _ix[dx + 1]); 84 __m128 s23 = Sse::Load(ps + _ix[dx + 2], ps + _ix[dx + 3]); 85 __m128 fx1 = _mm_load_ps(_ax.data + dx); 86 __m128 fx0 = _mm_sub_ps(_mm256_castps256_ps128(_1), fx1); 87 __m128 m0 = _mm_mul_ps(fx0, _mm_shuffle_ps(s01, s23, 0x88)); 88 __m128 m1 = _mm_mul_ps(fx1, _mm_shuffle_ps(s01, s23, 0xDD)); 89 _mm_store_ps(pb + dx, _mm_add_ps(m0, m1)); 90 } 91 } 92 if (cn == 3 && rs > 3) 93 { 94 __m256 _1 = _mm256_set1_ps(1.0f); 95 size_t rs3 = rs - 3; 96 size_t rs6 = AlignLoAny(rs3, 6); 97 for (; dx < rs6; dx += 6) 98 { 99 __m256 s0 = Load<false>(ps + _ix[dx + 0] + 0, ps + _ix[dx + 3] + 0); 100 __m256 s1 = Load<false>(ps + _ix[dx + 0] + 3, ps + _ix[dx + 3] + 3); 101 __m256 fx1 = Load<false>(_ax.data + dx + 0, _ax.data + dx + 3); 102 __m256 fx0 = _mm256_sub_ps(_1, fx1); 103 Store<false>(pb + dx + 0, pb + dx + 3, _mm256_add_ps(_mm256_mul_ps(fx0, s0), _mm256_mul_ps(fx1, s1))); 104 } 105 for (; dx < rs3; dx += 3) 106 { 107 __m128 s0 = _mm_loadu_ps(ps + _ix[dx] + 0); 108 __m128 s1 = _mm_loadu_ps(ps + _ix[dx] + 3); 109 __m128 fx1 = _mm_set1_ps(_ax.data[dx]); 110 __m128 fx0 = _mm_sub_ps(_mm256_castps256_ps128(_1), fx1); 111 _mm_storeu_ps(pb + dx, _mm_add_ps(_mm_mul_ps(fx0, s0), _mm_mul_ps(fx1, s1))); 112 } 113 } 114 for (; dx < rs; dx++) 115 { 116 int32_t sx = _ix[dx]; 117 float fx = _ax[dx]; 118 pb[dx] = ps[sx] * (1.0f - fx) + ps[sx + cn] * fx; 119 } 120 } 121 122 size_t dx = 0; 123 __m256 _fy0 = _mm256_set1_ps(fy0); 124 __m256 _fy1 = _mm256_set1_ps(fy1); 125 for (; dx < rsa; dx += Avx::F) 126 { 127 __m256 m0 = _mm256_mul_ps(_mm256_load_ps(pbx[0] + dx), _fy0); 128 __m256 m1 = _mm256_mul_ps(_mm256_load_ps(pbx[1] + dx), _fy1); 129 _mm256_storeu_ps(dst + dx, _mm256_add_ps(m0, m1)); 130 } 131 for (; dx < rsh; dx += Sse::F) 132 { 133 __m128 m0 = _mm_mul_ps(_mm_load_ps(pbx[0] + dx), _mm256_castps256_ps128(_fy0)); 134 __m128 m1 = _mm_mul_ps(_mm_load_ps(pbx[1] + dx), _mm256_castps256_ps128(_fy1)); 135 _mm_storeu_ps(dst + dx, _mm_add_ps(m0, m1)); 136 } 137 for (; dx < rs; dx++) 138 dst[dx] = pbx[0][dx] * fy0 + pbx[1][dx] * fy1; 139 } 140 } 141 142 //--------------------------------------------------------------------- 143 ResizerInit(size_t srcX,size_t srcY,size_t dstX,size_t dstY,size_t channels,SimdResizeChannelType type,SimdResizeMethodType method)144 void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) 145 { 146 ResParam param(srcX, srcY, dstX, dstY, channels, type, method, sizeof(__m256)); 147 if (type == SimdResizeChannelFloat && (method == SimdResizeMethodBilinear || method == SimdResizeMethodCaffeInterp)) 148 return new ResizerFloatBilinear(param); 149 else 150 return Sse41::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); 151 } 152 } 153 #endif //SIMD_AVX_ENABLE 154 } 155 156