/*
* Simd Library (http://ermig1979.github.io/Simd).
*
* Copyright (c) 2011-2017 Yermalayeu Ihar.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
23 */ 24 #include "Simd/SimdMemory.h" 25 #include "Simd/SimdStore.h" 26 27 namespace Simd 28 { 29 #ifdef SIMD_NEON_ENABLE 30 namespace Neon 31 { 32 namespace 33 { 34 struct Buffer 35 { BufferSimd::Neon::__anon1c40cfd90111::Buffer36 Buffer(size_t width) 37 { 38 _p = Allocate(sizeof(uint16_t)*(5 * width + A)); 39 in0 = (uint16_t*)_p; 40 in1 = in0 + width; 41 out0 = in1 + width; 42 out1 = out0 + width; 43 dst = out1 + width + HA; 44 } 45 ~BufferSimd::Neon::__anon1c40cfd90111::Buffer46 ~Buffer() 47 { 48 Free(_p); 49 } 50 51 uint16_t * in0; 52 uint16_t * in1; 53 uint16_t * out0; 54 uint16_t * out1; 55 uint16_t * dst; 56 private: 57 void *_p; 58 }; 59 } 60 61 template <bool compensation> SIMD_INLINE uint16x8_t DivideBy256(uint16x8_t value); 62 DivideBy256(uint16x8_t value)63 template <> SIMD_INLINE uint16x8_t DivideBy256<true>(uint16x8_t value) 64 { 65 return vshrq_n_u16(vaddq_u16(value, K16_0080), 8); 66 } 67 DivideBy256(uint16x8_t value)68 template <> SIMD_INLINE uint16x8_t DivideBy256<false>(uint16x8_t value) 69 { 70 return vshrq_n_u16(value, 8); 71 } 72 LoadUnpacked(const uint8_t * src)73 SIMD_INLINE uint16x8_t LoadUnpacked(const uint8_t * src) 74 { 75 return vmovl_u8(vld1_u8(src)); 76 } 77 FirstRow5x5(uint16x8_t src,Buffer & buffer,size_t offset)78 template<bool align> SIMD_INLINE void FirstRow5x5(uint16x8_t src, Buffer & buffer, size_t offset) 79 { 80 Store<align>(buffer.in0 + offset, src); 81 Store<align>(buffer.in1 + offset, vmulq_u16(src, K16_0005)); 82 } 83 FirstRow5x5(const uint8_t * src,Buffer & buffer,size_t offset)84 template<bool align> SIMD_INLINE void FirstRow5x5(const uint8_t * src, Buffer & buffer, size_t offset) 85 { 86 FirstRow5x5<align>(LoadUnpacked(src + offset), buffer, offset); 87 offset += HA; 88 FirstRow5x5<align>(LoadUnpacked(src + offset), buffer, offset); 89 } 90 MainRowY5x5(uint16x8_t odd,uint16x8_t even,Buffer & buffer,size_t offset)91 template<bool align> SIMD_INLINE void MainRowY5x5(uint16x8_t odd, uint16x8_t even, Buffer & buffer, 
size_t offset) 92 { 93 uint16x8_t cp = vmulq_u16(odd, K16_0004); 94 uint16x8_t c0 = Load<align>(buffer.in0 + offset); 95 uint16x8_t c1 = Load<align>(buffer.in1 + offset); 96 Store<align>(buffer.dst + offset, vaddq_u16(even, vaddq_u16(c1, vaddq_u16(cp, vmulq_u16(c0, K16_0006))))); 97 Store<align>(buffer.out1 + offset, vaddq_u16(c0, cp)); 98 Store<align>(buffer.out0 + offset, even); 99 } 100 MainRowY5x5(const uint8_t * odd,const uint8_t * even,Buffer & buffer,size_t offset)101 template<bool align> SIMD_INLINE void MainRowY5x5(const uint8_t *odd, const uint8_t *even, Buffer & buffer, size_t offset) 102 { 103 MainRowY5x5<align>(LoadUnpacked(odd + offset), LoadUnpacked(even + offset), buffer, offset); 104 offset += HA; 105 MainRowY5x5<align>(LoadUnpacked(odd + offset), LoadUnpacked(even + offset), buffer, offset); 106 } 107 MainRowX5x5(uint16_t * dst)108 template <bool align, bool compensation> SIMD_INLINE uint16x8_t MainRowX5x5(uint16_t * dst) 109 { 110 uint16x8_t t0 = vld1q_u16(dst - 2); 111 uint16x8_t t1 = vld1q_u16(dst - 1); 112 uint16x8_t t2 = Load<align>(dst); 113 uint16x8_t t3 = vld1q_u16(dst + 1); 114 uint16x8_t t4 = vld1q_u16(dst + 2); 115 t2 = vaddq_u16(vaddq_u16(vmulq_u16(t2, K16_0006), vmulq_u16(vaddq_u16(t1, t3), K16_0004)), vaddq_u16(t0, t4)); 116 return DivideBy256<compensation>(t2); 117 } 118 MainRowX5x5(Buffer & buffer,size_t offset,uint8_t * dst)119 template <bool align, bool compensation> SIMD_INLINE void MainRowX5x5(Buffer & buffer, size_t offset, uint8_t *dst) 120 { 121 uint16x8_t lo = MainRowX5x5<align, compensation>(buffer.dst + offset); 122 uint16x8_t hi = MainRowX5x5<align, compensation>(buffer.dst + offset + HA); 123 vst1_u8(dst, Deinterleave(PackU16(lo, hi)).val[0]); 124 } 125 ReduceGray5x5(const uint8_t * src,size_t srcWidth,size_t srcHeight,size_t srcStride,uint8_t * dst,size_t dstWidth,size_t dstHeight,size_t dstStride)126 template <bool compensation> void ReduceGray5x5( 127 const uint8_t* src, size_t srcWidth, size_t srcHeight, size_t 
srcStride, 128 uint8_t* dst, size_t dstWidth, size_t dstHeight, size_t dstStride) 129 { 130 assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight && srcWidth >= A); 131 132 size_t alignedWidth = Simd::AlignLo(srcWidth, A); 133 size_t bufferDstTail = Simd::AlignHi(srcWidth - A, 2); 134 135 Buffer buffer(Simd::AlignHi(srcWidth, A)); 136 137 for (size_t col = 0; col < alignedWidth; col += A) 138 FirstRow5x5<true>(src, buffer, col); 139 if (alignedWidth != srcWidth) 140 FirstRow5x5<false>(src, buffer, srcWidth - A); 141 src += srcStride; 142 143 for (size_t row = 1; row <= srcHeight; row += 2, dst += dstStride, src += 2 * srcStride) 144 { 145 const uint8_t *odd = src - (row < srcHeight ? 0 : srcStride); 146 const uint8_t *even = odd + (row < srcHeight - 1 ? srcStride : 0); 147 148 for (size_t col = 0; col < alignedWidth; col += A) 149 MainRowY5x5<true>(odd, even, buffer, col); 150 if (alignedWidth != srcWidth) 151 MainRowY5x5<false>(odd, even, buffer, srcWidth - A); 152 153 Swap(buffer.in0, buffer.out0); 154 Swap(buffer.in1, buffer.out1); 155 156 buffer.dst[-2] = buffer.dst[0]; 157 buffer.dst[-1] = buffer.dst[0]; 158 buffer.dst[srcWidth] = buffer.dst[srcWidth - 1]; 159 buffer.dst[srcWidth + 1] = buffer.dst[srcWidth - 1]; 160 161 for (size_t srcCol = 0, dstCol = 0; srcCol < alignedWidth; srcCol += A, dstCol += HA) 162 MainRowX5x5<true, compensation>(buffer, srcCol, dst + dstCol); 163 if (alignedWidth != srcWidth) 164 MainRowX5x5<false, compensation>(buffer, bufferDstTail, dst + dstWidth - HA); 165 } 166 } 167 ReduceGray5x5(const uint8_t * src,size_t srcWidth,size_t srcHeight,size_t srcStride,uint8_t * dst,size_t dstWidth,size_t dstHeight,size_t dstStride,int compensation)168 void ReduceGray5x5(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, 169 uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, int compensation) 170 { 171 if (compensation) 172 ReduceGray5x5<true>(src, srcWidth, srcHeight, srcStride, dst, 
dstWidth, dstHeight, dstStride); 173 else 174 ReduceGray5x5<false>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); 175 } 176 } 177 #endif// SIMD_NEON_ENABLE 178 } 179