/* -----------------------------------------------------------------------------
The copyright in this software is being made available under the BSD
License, included below. No patent rights, trademark rights and/or
other Intellectual Property Rights other than the copyrights concerning
the Software are granted under this license.

For any license concerning other Intellectual Property rights than the software,
especially patent licenses, a separate Agreement needs to be closed.
For more information please contact:

Fraunhofer Heinrich Hertz Institute
Einsteinufer 37
10587 Berlin, Germany
www.hhi.fraunhofer.de/vvc
vvc@hhi.fraunhofer.de

Copyright (c) 2018-2021, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 * Redistributions of source code must retain the above copyright notice,
   this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.
 * Neither the name of Fraunhofer nor the names of its contributors may
   be used to endorse or promote products derived from this software without
   specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
THE POSSIBILITY OF SUCH DAMAGE.


------------------------------------------------------------------------------------------- */

/** \file     InterPredX86.h
    \brief    SIMD for InterPrediction
*/

//! \ingroup CommonLib
//! \{


#include "CommonLib/CommonDef.h"
#include "CommonDefX86.h"
#include "CommonLib/InterPrediction.h"

#include <cstring>   // ::memcpy (may already be provided transitively by CommonDef.h)

namespace vvdec
{

#if ENABLE_SIMD_OPT_BIO
#ifdef TARGET_SIMD_X86

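// Not all supported compilers provide the _mm_storeu_si32 intrinsic; emulate it
// here as an unaligned 32-bit store of the lowest dword of the vector.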
#define _mm_storeu_si32(p, a) (void)(*(int*)(p) = _mm_cvtsi128_si32((a)))

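// Bring one row of reference samples to the internal BDOF precision: samples are
// shifted up by 'shift' and biased by -IF_INTERNAL_OFFS. The extra store after
// the main loop covers the 2 * BIO_EXTEND_SIZE padding samples of the row.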
template<X86_VEXT vext>
inline void PaddBIO_SIMD( const Pel* refPel, Pel* dstPel, unsigned width, const int shift )
{
  int w;
  __m128i off = _mm_set1_epi16( ( Pel ) IF_INTERNAL_OFFS );

  if( width > 4 )
  {
    for( w = 0; w < width; w += 8 )
    {
      __m128i ref = _mm_lddqu_si128( ( __m128i const * ) &refPel[w] );
      ref = _mm_slli_epi16( ref, shift );
      ref = _mm_sub_epi16 ( ref, off );
      _mm_storeu_si128( ( __m128i * ) &dstPel[w], ref );
    }
    // 2 * BIO_EXTEND_SIZE
    __m128i ref = _mm_lddqu_si128( ( __m128i const * ) &refPel[w] );
    ref = _mm_slli_epi16( ref, shift );
    ref = _mm_sub_epi16 ( ref, off );
    _mm_storeu_si32( ( __m128i * ) &dstPel[w], ref );
  }
  else
  {
    __m128i ref = _mm_lddqu_si128( ( __m128i const * ) &refPel[0] );
    ref = _mm_slli_epi16( ref, shift );
    ref = _mm_sub_epi16 ( ref, off );
    _mm_storel_epi64( ( __m128i * ) &dstPel[0], ref );
    ref = _mm_srli_si128( ref, 8 );
    _mm_storeu_si32( ( __m128i * ) &dstPel[4], ref );
  }
}

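// Cheap BDOF division: _bit_scan_reverse yields the index of the highest set bit,
// so the numerator is divided by the largest power of two not exceeding 'denom'.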
static inline int rightShiftMSB( int numer, int denom )
{
  int shiftIdx = _bit_scan_reverse( denom );
  return ( numer >> shiftIdx );
}

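// Produce the final BDOF output for one 4x4 sub-block: the per-pixel correction
// tmpx * ( gradX0 - gradX1 ) + tmpy * ( gradY0 - gradY1 ) is added to the sum of
// both predictions, then rounded, shifted and clipped to the output range.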
template<X86_VEXT vext>
static inline void addBIOAvg4_SSE( const int16_t* src0, const int16_t* src1, int16_t* dst, ptrdiff_t dstStride, const int16_t* gradX0, const int16_t* gradX1, const int16_t* gradY0, const int16_t* gradY1, ptrdiff_t widthG, int tmpx, int tmpy, int shift, int offset, const ClpRng& clpRng )
{
  const ptrdiff_t src0Stride = widthG;
  const ptrdiff_t src1Stride = widthG;
  const ptrdiff_t gradStride = widthG;

  __m128i mm_tmpx   = _mm_set1_epi32( ( tmpx & 0xffff ) | ( tmpy << 16 ) );
  __m128i mm_offset = _mm_set1_epi32( offset );
  __m128i vibdimin  = _mm_set1_epi16( clpRng.min() );
  __m128i vibdimax  = _mm_set1_epi16( clpRng.max() );
  __m128i mm_a;
  __m128i mm_b;
  __m128i mm_sum;

  for( int y = 0; y < 4; y++, dst += dstStride, src0 += src0Stride, src1 += src1Stride, gradX0 += gradStride, gradX1 += gradStride, gradY0 += gradStride, gradY1 += gradStride )
  {
    mm_a   = _mm_unpacklo_epi16( _mm_loadl_epi64( ( const __m128i * ) gradX0 ), _mm_loadl_epi64( ( const __m128i * ) gradY0 ) );
    mm_b   = _mm_unpacklo_epi16( _mm_loadl_epi64( ( const __m128i * ) gradX1 ), _mm_loadl_epi64( ( const __m128i * ) gradY1 ) );
    mm_a   = _mm_sub_epi16 ( mm_a, mm_b );
    mm_sum = _mm_madd_epi16( mm_a, mm_tmpx );
    mm_a   = _mm_cvtepi16_epi32( _mm_loadl_epi64( ( const __m128i * ) src0 ) );
    mm_b   = _mm_cvtepi16_epi32( _mm_loadl_epi64( ( const __m128i * ) src1 ) );
    mm_sum = _mm_add_epi32 ( _mm_add_epi32( mm_sum, mm_a ), _mm_add_epi32( mm_b, mm_offset ) );
    mm_sum = _mm_packs_epi32( _mm_srai_epi32( mm_sum, shift ), mm_a );
    mm_sum = _mm_min_epi16 ( vibdimax, _mm_max_epi16( vibdimin, mm_sum ) );
    _mm_storel_epi64( ( __m128i * ) dst, mm_sum );
  }
}

#if USE_AVX2
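// Same operation as addBIOAvg4_SSE, but for two horizontally adjacent 4x4
// sub-blocks at once: ( tmpx0, tmpy0 ) drives the low 128-bit lane and
// ( tmpx1, tmpy1 ) the high lane.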
static inline void addBIOAvg4_2x_AVX2( const int16_t* src0, const int16_t* src1, int16_t* dst, ptrdiff_t dstStride, const int16_t* gradX0, const int16_t* gradX1, const int16_t* gradY0, const int16_t* gradY1, ptrdiff_t widthG, int tmpx0, int tmpx1, int tmpy0, int tmpy1, int shift, int offset, const ClpRng& clpRng )
{
  const ptrdiff_t src0Stride = widthG;
  const ptrdiff_t src1Stride = widthG;
  const ptrdiff_t gradStride = widthG;

  __m256i mm_tmpx   = _mm256_inserti128_si256( _mm256_castsi128_si256( _mm_set1_epi32( ( tmpx0 & 0xffff ) | ( tmpy0 << 16 ) ) ), _mm_set1_epi32( ( tmpx1 & 0xffff ) | ( tmpy1 << 16 ) ), 1 );
  __m256i mm_offset = _mm256_set1_epi32( offset );
  __m256i vibdimin  = _mm256_set1_epi32( clpRng.min() );
  __m256i vibdimax  = _mm256_set1_epi32( clpRng.max() );
  __m256i mm_a;
  __m256i mm_b;
  __m256i mm_sum;
  __m128i xsrc0, xsrc1;

  for( int y = 0; y < 4; y++, dst += dstStride, src0 += src0Stride, src1 += src1Stride, gradX0 += gradStride, gradX1 += gradStride, gradY0 += gradStride, gradY1 += gradStride )
  {
    xsrc0  = _mm_loadu_si128( ( const __m128i * ) gradX0 );
    xsrc1  = _mm_loadu_si128( ( const __m128i * ) gradY0 );
    mm_a   = _mm256_castsi128_si256 ( _mm_unpacklo_epi16( xsrc0, xsrc1 ) );
    mm_a   = _mm256_inserti128_si256( mm_a, _mm_unpackhi_epi16( xsrc0, xsrc1 ), 1 );
    xsrc0  = _mm_loadu_si128( ( const __m128i * ) gradX1 );
    xsrc1  = _mm_loadu_si128( ( const __m128i * ) gradY1 );
    mm_b   = _mm256_castsi128_si256 ( _mm_unpacklo_epi16( xsrc0, xsrc1 ) );
    mm_b   = _mm256_inserti128_si256( mm_b, _mm_unpackhi_epi16( xsrc0, xsrc1 ), 1 );
    mm_a   = _mm256_sub_epi16 ( mm_a, mm_b );
    mm_sum = _mm256_madd_epi16( mm_a, mm_tmpx );
    mm_a   = _mm256_cvtepi16_epi32( _mm_loadu_si128( ( const __m128i * ) src0 ) );
    mm_b   = _mm256_cvtepi16_epi32( _mm_loadu_si128( ( const __m128i * ) src1 ) );
    mm_sum = _mm256_add_epi32 ( _mm256_add_epi32( mm_sum, mm_a ), _mm256_add_epi32( mm_b, mm_offset ) );
    mm_sum = _mm256_srai_epi32( mm_sum, shift );
    mm_sum = _mm256_min_epi32 ( vibdimax, _mm256_max_epi32( vibdimin, mm_sum ) );
    _mm_storeu_si128( ( __m128i * ) dst, _mm256_cvtepi32_epi16x( mm_sum ) );
  }
}
#endif

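// Gather the BDOF correlation sums over one 6x6 window ( sumAbsGX, sumAbsGY,
// sumDIX, sumDIY, sumSignGY_GX ) and derive the clipped motion refinement
// ( tmpx, tmpy ). Only the first 6 of the 8 lanes contribute; the trailing two
// are masked out by the 1,1,1,1,1,1,0,0 madd pattern.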
template< X86_VEXT vext >
static inline void calcBIOSums_SSE( const Pel* srcY0Tmp, const Pel* srcY1Tmp, const Pel* gradX0, const Pel* gradX1, const Pel* gradY0, const Pel* gradY1, const int widthG, const int bitDepth, int limit, int& tmpx, int& tmpy )
{
  static constexpr int shift4 = 4;
  static constexpr int shift5 = 1;
  const int srcStride = widthG;

  __m128i sumAbsGXTmp    = _mm_setzero_si128();
  __m128i sumDIXTmp      = _mm_setzero_si128();
  __m128i sumAbsGYTmp    = _mm_setzero_si128();
  __m128i sumDIYTmp      = _mm_setzero_si128();
  __m128i sumSignGyGxTmp = _mm_setzero_si128();

  for( int y = 0; y < 6; y++ )
  {
    __m128i shiftSrcY0Tmp = _mm_srai_epi16( _mm_loadu_si128( ( __m128i* ) srcY0Tmp ), shift4 );
    __m128i shiftSrcY1Tmp = _mm_srai_epi16( _mm_loadu_si128( ( __m128i* ) srcY1Tmp ), shift4 );
    __m128i loadGradX0    = _mm_loadu_si128( ( __m128i* ) gradX0 );
    __m128i loadGradX1    = _mm_loadu_si128( ( __m128i* ) gradX1 );
    __m128i loadGradY0    = _mm_loadu_si128( ( __m128i* ) gradY0 );
    __m128i loadGradY1    = _mm_loadu_si128( ( __m128i* ) gradY1 );
    __m128i subTemp1      = _mm_sub_epi16 ( shiftSrcY1Tmp, shiftSrcY0Tmp );
    __m128i packTempX     = _mm_srai_epi16( _mm_add_epi16( loadGradX0, loadGradX1 ), shift5 );
    __m128i packTempY     = _mm_srai_epi16( _mm_add_epi16( loadGradY0, loadGradY1 ), shift5 );
    __m128i gX            = _mm_abs_epi16 ( packTempX );
    __m128i gY            = _mm_abs_epi16 ( packTempY );
    __m128i dIX           = _mm_sign_epi16( subTemp1, packTempX );
    __m128i dIY           = _mm_sign_epi16( subTemp1, packTempY );
    __m128i signGY_GX     = _mm_sign_epi16( packTempX, packTempY );

    sumAbsGXTmp    = _mm_add_epi16( sumAbsGXTmp, gX );
    sumDIXTmp      = _mm_add_epi16( sumDIXTmp, dIX );
    sumAbsGYTmp    = _mm_add_epi16( sumAbsGYTmp, gY );
    sumDIYTmp      = _mm_add_epi16( sumDIYTmp, dIY );
    sumSignGyGxTmp = _mm_add_epi16( sumSignGyGxTmp, signGY_GX );
    srcY0Tmp += srcStride;
    srcY1Tmp += srcStride;
    gradX0 += widthG;
    gradX1 += widthG;
    gradY0 += widthG;
    gradY1 += widthG;
  }

  sumAbsGXTmp    = _mm_madd_epi16( sumAbsGXTmp,    _mm_setr_epi16( 1, 1, 1, 1, 1, 1, 0, 0 ) );
  sumDIXTmp      = _mm_madd_epi16( sumDIXTmp,      _mm_setr_epi16( 1, 1, 1, 1, 1, 1, 0, 0 ) );
  sumAbsGYTmp    = _mm_madd_epi16( sumAbsGYTmp,    _mm_setr_epi16( 1, 1, 1, 1, 1, 1, 0, 0 ) );
  sumDIYTmp      = _mm_madd_epi16( sumDIYTmp,      _mm_setr_epi16( 1, 1, 1, 1, 1, 1, 0, 0 ) );
  sumSignGyGxTmp = _mm_madd_epi16( sumSignGyGxTmp, _mm_setr_epi16( 1, 1, 1, 1, 1, 1, 0, 0 ) );

  __m128i a12 = _mm_unpacklo_epi32( sumAbsGXTmp, sumAbsGYTmp );
  __m128i a3  = _mm_unpackhi_epi32( sumAbsGXTmp, sumAbsGYTmp );
  __m128i b12 = _mm_unpacklo_epi32( sumDIXTmp, sumDIYTmp );
  __m128i b3  = _mm_unpackhi_epi32( sumDIXTmp, sumDIYTmp );
  __m128i c1  = _mm_unpacklo_epi64( a12, b12 );
  __m128i c2  = _mm_unpackhi_epi64( a12, b12 );
  __m128i c3  = _mm_unpacklo_epi64( a3, b3 );

  c1 = _mm_add_epi32( c1, c2 );
  c1 = _mm_add_epi32( c1, c3 );

  int sumAbsGX = _mm_cvtsi128_si32( c1 );
  int sumAbsGY = _mm_cvtsi128_si32( _mm_shuffle_epi32( c1, 0x55 ) );
  int sumDIX   = _mm_cvtsi128_si32( _mm_shuffle_epi32( c1, 0xaa ) );
  int sumDIY   = _mm_cvtsi128_si32( _mm_shuffle_epi32( c1, 0xff ) );

  sumSignGyGxTmp = _mm_add_epi32( sumSignGyGxTmp, _mm_shuffle_epi32( sumSignGyGxTmp, 0x4e ) );   // 01001110
  sumSignGyGxTmp = _mm_add_epi32( sumSignGyGxTmp, _mm_shuffle_epi32( sumSignGyGxTmp, 0xb1 ) );   // 10110001
  int sumSignGY_GX = _mm_cvtsi128_si32( sumSignGyGxTmp );

  tmpx = sumAbsGX == 0 ? 0 : rightShiftMSB( sumDIX << 2, sumAbsGX );
  tmpx = Clip3( -limit, limit, tmpx );

  int mainsGxGy = sumSignGY_GX >> 12;
  int secsGxGy  = sumSignGY_GX & ( ( 1 << 12 ) - 1 );
  int tmpData   = tmpx * mainsGxGy;
  tmpData       = ( ( tmpData << 12 ) + tmpx * secsGxGy ) >> 1;
  tmpy = sumAbsGY == 0 ? 0 : rightShiftMSB( ( ( sumDIY << 2 ) - tmpData ), sumAbsGY );
  tmpy = Clip3( -limit, limit, tmpy );
}

#if USE_AVX2
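// AVX2 variant of calcBIOSums_SSE: two 6x6 windows, 4 samples apart
// horizontally, are accumulated in parallel, one per 128-bit lane.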
static inline void calcBIOSums2x_AVX2( const Pel* srcY0Tmp, const Pel* srcY1Tmp, const Pel* gradX0, const Pel* gradX1, const Pel* gradY0, const Pel* gradY1, const int widthG, const int bitDepth, int limit, int& tmpx0, int& tmpx1, int& tmpy0, int& tmpy1 )
{
  static constexpr int shift4 = 4;
  static constexpr int shift5 = 1;
  const int srcStride = widthG;

  __m256i sumAbsGXTmp    = _mm256_setzero_si256();
  __m256i sumDIXTmp      = _mm256_setzero_si256();
  __m256i sumAbsGYTmp    = _mm256_setzero_si256();
  __m256i sumDIYTmp      = _mm256_setzero_si256();
  __m256i sumSignGyGxTmp = _mm256_setzero_si256();

#define _mm256_load2_si128_offset4(addr) _mm256_inserti128_si256( _mm256_castsi128_si256( _mm_loadu_si128( ( const __m128i* ) &addr[0] ) ), _mm_loadu_si128( ( const __m128i* ) &addr[4] ), 1 )

  for( int y = 0; y < 6; y++ )
  {
    __m256i shiftSrcY0Tmp = _mm256_srai_epi16( _mm256_load2_si128_offset4( srcY0Tmp ), shift4 );
    __m256i shiftSrcY1Tmp = _mm256_srai_epi16( _mm256_load2_si128_offset4( srcY1Tmp ), shift4 );
    __m256i loadGradX0    = _mm256_load2_si128_offset4( gradX0 );
    __m256i loadGradX1    = _mm256_load2_si128_offset4( gradX1 );
    __m256i loadGradY0    = _mm256_load2_si128_offset4( gradY0 );
    __m256i loadGradY1    = _mm256_load2_si128_offset4( gradY1 );
    __m256i subTemp1      = _mm256_sub_epi16 ( shiftSrcY1Tmp, shiftSrcY0Tmp );
    __m256i packTempX     = _mm256_srai_epi16( _mm256_add_epi16( loadGradX0, loadGradX1 ), shift5 );
    __m256i packTempY     = _mm256_srai_epi16( _mm256_add_epi16( loadGradY0, loadGradY1 ), shift5 );
    __m256i gX            = _mm256_abs_epi16 ( packTempX );
    __m256i gY            = _mm256_abs_epi16 ( packTempY );
    __m256i dIX           = _mm256_sign_epi16( subTemp1, packTempX );
    __m256i dIY           = _mm256_sign_epi16( subTemp1, packTempY );
    __m256i signGY_GX     = _mm256_sign_epi16( packTempX, packTempY );

    sumAbsGXTmp    = _mm256_add_epi16( sumAbsGXTmp, gX );
    sumDIXTmp      = _mm256_add_epi16( sumDIXTmp, dIX );
    sumAbsGYTmp    = _mm256_add_epi16( sumAbsGYTmp, gY );
    sumDIYTmp      = _mm256_add_epi16( sumDIYTmp, dIY );
    sumSignGyGxTmp = _mm256_add_epi16( sumSignGyGxTmp, signGY_GX );

    srcY0Tmp += srcStride;
    srcY1Tmp += srcStride;
    gradX0 += widthG;
    gradX1 += widthG;
    gradY0 += widthG;
    gradY1 += widthG;
  }

#undef _mm256_load2_si128_offset4

  sumAbsGXTmp    = _mm256_madd_epi16( sumAbsGXTmp,    _mm256_setr_epi16( 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0 ) );
  sumDIXTmp      = _mm256_madd_epi16( sumDIXTmp,      _mm256_setr_epi16( 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0 ) );
  sumAbsGYTmp    = _mm256_madd_epi16( sumAbsGYTmp,    _mm256_setr_epi16( 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0 ) );
  sumDIYTmp      = _mm256_madd_epi16( sumDIYTmp,      _mm256_setr_epi16( 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0 ) );
  sumSignGyGxTmp = _mm256_madd_epi16( sumSignGyGxTmp, _mm256_setr_epi16( 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0 ) );

  __m256i a12 = _mm256_unpacklo_epi32( sumAbsGXTmp, sumAbsGYTmp );
  __m256i a3  = _mm256_unpackhi_epi32( sumAbsGXTmp, sumAbsGYTmp );
  __m256i b12 = _mm256_unpacklo_epi32( sumDIXTmp, sumDIYTmp );
  __m256i b3  = _mm256_unpackhi_epi32( sumDIXTmp, sumDIYTmp );
  __m256i c1  = _mm256_unpacklo_epi64( a12, b12 );
  __m256i c2  = _mm256_unpackhi_epi64( a12, b12 );
  __m256i c3  = _mm256_unpacklo_epi64( a3, b3 );

  c1 = _mm256_add_epi32( c1, c2 );
  c1 = _mm256_add_epi32( c1, c3 );

  int tmpData[8];

  _mm256_storeu_si256( ( __m256i* ) &tmpData[0], c1 );

#define sumAbsGX0 tmpData[0]
#define sumAbsGX1 tmpData[4]

#define sumAbsGY0 tmpData[1]
#define sumAbsGY1 tmpData[5]

#define sumDIX0   tmpData[2]
#define sumDIX1   tmpData[6]

#define sumDIY0   tmpData[3]
#define sumDIY1   tmpData[7]

  sumSignGyGxTmp = _mm256_add_epi32( sumSignGyGxTmp, _mm256_shuffle_epi32( sumSignGyGxTmp, 0x4e ) );   // 01001110
  sumSignGyGxTmp = _mm256_add_epi32( sumSignGyGxTmp, _mm256_shuffle_epi32( sumSignGyGxTmp, 0xb1 ) );   // 10110001

  int sumSignGY_GX0 = _mm256_extract_epi32( sumSignGyGxTmp, 0 );
  int sumSignGY_GX1 = _mm256_extract_epi32( sumSignGyGxTmp, 4 );

#if 0
  tmpx0 = sumAbsGX0 == 0 ? 0 : Clip3( -limit, limit, rightShiftMSB( sumDIX0 << 2, sumAbsGX0 ) );
  tmpx1 = sumAbsGX1 == 0 ? 0 : Clip3( -limit, limit, rightShiftMSB( sumDIX1 << 2, sumAbsGX1 ) );
  __m128i vtmpx         = _mm_setr_epi32( tmpx0, tmpx1, 0, 0 );
  __m128i vsumSignGY_GX = _mm_setr_epi32( sumSignGY_GX0, sumSignGY_GX1, 0, 0 );
  __m128i vmainsGxGy    = _mm_srai_epi32( vsumSignGY_GX, 12 );
  __m128i vsecsGxGy     = _mm_and_si128 ( vsumSignGY_GX, _mm_set1_epi32( ( 1 << 12 ) - 1 ) );
  __m128i vtmpData      = _mm_mullo_epi32( vtmpx, vmainsGxGy );
  vtmpData              = _mm_slli_epi32 ( vtmpData, 12 );
  vtmpData              = _mm_add_epi32  ( vtmpData, _mm_mullo_epi32( vtmpx, vsecsGxGy ) );
  vtmpData              = _mm_srai_epi32 ( vtmpData, 1 );
  __m128i vtmpyIn       = _mm_slli_epi32 ( _mm_setr_epi32( sumDIY0, sumDIY1, 0, 0 ), 2 );
  vtmpyIn               = _mm_sub_epi32  ( vtmpyIn, vtmpData );

  tmpy0 = sumAbsGY0 == 0 ? 0 : Clip3( -limit, limit, rightShiftMSB( _mm_extract_epi32( vtmpyIn, 0 ), sumAbsGY0 ) );
  tmpy1 = sumAbsGY1 == 0 ? 0 : Clip3( -limit, limit, rightShiftMSB( _mm_extract_epi32( vtmpyIn, 1 ), sumAbsGY1 ) );
#else
  tmpx0 = sumAbsGX0 == 0 ? 0 : rightShiftMSB( sumDIX0 << 2, sumAbsGX0 );
  tmpx0 = Clip3( -limit, limit, tmpx0 );

  int mainsGxGy0 = sumSignGY_GX0 >> 12;
  int secsGxGy0  = sumSignGY_GX0 & ( ( 1 << 12 ) - 1 );
  int tmpData0   = tmpx0 * mainsGxGy0;
  tmpData0       = ( ( tmpData0 << 12 ) + tmpx0 * secsGxGy0 ) >> 1;
  tmpy0 = sumAbsGY0 == 0 ? 0 : rightShiftMSB( ( ( sumDIY0 << 2 ) - tmpData0 ), sumAbsGY0 );
  tmpy0 = Clip3( -limit, limit, tmpy0 );

  tmpx1 = sumAbsGX1 == 0 ? 0 : rightShiftMSB( sumDIX1 << 2, sumAbsGX1 );
  tmpx1 = Clip3( -limit, limit, tmpx1 );

  int mainsGxGy1 = sumSignGY_GX1 >> 12;
  int secsGxGy1  = sumSignGY_GX1 & ( ( 1 << 12 ) - 1 );
  int tmpData1   = tmpx1 * mainsGxGy1;
  tmpData1       = ( ( tmpData1 << 12 ) + tmpx1 * secsGxGy1 ) >> 1;
  tmpy1 = sumAbsGY1 == 0 ? 0 : rightShiftMSB( ( ( sumDIY1 << 2 ) - tmpData1 ), sumAbsGY1 );
  tmpy1 = Clip3( -limit, limit, tmpy1 );
#endif

#undef sumAbsGX0
#undef sumAbsGX1
#undef sumAbsGY0
#undef sumAbsGY1
#undef sumDIX0
#undef sumDIX1
#undef sumDIY0
#undef sumDIY1
}
#endif

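// BDOF core: for every 4x4 sub-block, derive the motion refinement from the
// surrounding 6x6 gradient window and write the corrected bi-prediction output.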
template< X86_VEXT vext >
void BiOptFlowCoreSIMD( const Pel*      srcY0,
                        const Pel*      srcY1,
                        const Pel*      gradX0,
                        const Pel*      gradX1,
                        const Pel*      gradY0,
                        const Pel*      gradY1,
                        const int       width,
                        const int       height,
                        Pel*            dstY,
                        const ptrdiff_t dstStride,
                        const int       shiftNum,
                        const int       offset,
                        const int       limit,
                        const ClpRng&   clpRng,
                        const int       bitDepth )
{
  const int widthG       = width + BIO_ALIGN_SIZE;
  const int stridePredMC = widthG;
  int       offsetPos    = widthG * BIO_EXTEND_SIZE + BIO_EXTEND_SIZE;
  const int xUnit        = ( width >> 2 );
  const int yUnit        = ( height >> 2 );

  const Pel* srcY0Temp;
  const Pel* srcY1Temp;
  Pel*       dstY0;

  int OffPos;
  int OffPad = 0;

  for( int yu = 0; yu < yUnit; yu++, srcY0 += ( stridePredMC << 2 ), srcY1 += ( stridePredMC << 2 ), dstY += ( dstStride << 2 ), offsetPos += ( widthG << 2 ) )
  {
    srcY0Temp = srcY0;
    srcY1Temp = srcY1;
    dstY0     = dstY;

    OffPos = offsetPos;
    OffPad = ( ( yu * widthG ) << 2 );

#if USE_AVX2
    for( int xu = 0; xu < xUnit; xu += 2, srcY0Temp += 8, srcY1Temp += 8, dstY0 += 8, OffPos += 8, OffPad += 8 )
    {
      int tmpx0, tmpy0, tmpx1, tmpy1;

      //calcBIOSums_SSE<vext>( srcY0Temp + 0, srcY1Temp + 0, gradX0 + OffPad + 0, gradX1 + OffPad + 0, gradY0 + OffPad + 0, gradY1 + OffPad + 0, stridePredMC, bitDepth, limit, tmpx, tmpy );
      calcBIOSums2x_AVX2( srcY0Temp, srcY1Temp, gradX0 + OffPad, gradX1 + OffPad, gradY0 + OffPad, gradY1 + OffPad, stridePredMC, bitDepth, limit, tmpx0, tmpx1, tmpy0, tmpy1 );

      //addBIOAvg4_SSE<vext>( srcY0Temp + stridePredMC + 1 + 0, srcY1Temp + stridePredMC + 1 + 0, dstY0 + 0, dstStride, gradX0 + OffPos + 0, gradX1 + OffPos + 0, gradY0 + OffPos + 0, gradY1 + OffPos + 0, widthG, tmpx0, tmpy0, shiftNum, offset, clpRng );
      //addBIOAvg4_SSE<vext>( srcY0Temp + stridePredMC + 1 + 4, srcY1Temp + stridePredMC + 1 + 4, dstY0 + 4, dstStride, gradX0 + OffPos + 4, gradX1 + OffPos + 4, gradY0 + OffPos + 4, gradY1 + OffPos + 4, widthG, tmpx1, tmpy1, shiftNum, offset, clpRng );
      addBIOAvg4_2x_AVX2( srcY0Temp + stridePredMC + 1, srcY1Temp + stridePredMC + 1, dstY0, dstStride, gradX0 + OffPos, gradX1 + OffPos, gradY0 + OffPos, gradY1 + OffPos, widthG, tmpx0, tmpx1, tmpy0, tmpy1, shiftNum, offset, clpRng );
    } // xu
#else
    for( int xu = 0; xu < xUnit; xu++, srcY0Temp += 4, srcY1Temp += 4, dstY0 += 4, OffPos += 4, OffPad += 4 )
    {
      int tmpx, tmpy;

      calcBIOSums_SSE<vext>( srcY0Temp, srcY1Temp, gradX0 + OffPad, gradX1 + OffPad, gradY0 + OffPad, gradY1 + OffPad, stridePredMC, bitDepth, limit, tmpx, tmpy );

      addBIOAvg4_SSE<vext>( srcY0Temp + stridePredMC + 1, srcY1Temp + stridePredMC + 1, dstY0, dstStride, gradX0 + OffPos, gradX1 + OffPos, gradY0 + OffPos, gradY1 + OffPos, widthG, tmpx, tmpy, shiftNum, offset, clpRng );
    } // xu
#endif
  } // yu
#if USE_AVX2
  _mm256_zeroupper();
#endif
}

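// PROF: add the per-pixel refinement dI = dMvX * gradX + dMvY * gradY to a 4x4
// block of prediction samples; dI is clipped to the intermediate range and, for
// uni-prediction ( bi == false ), rounded and clipped to the output range.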
template< X86_VEXT vext, bool bi >
void applyPROF_SSE( Pel* dstPel, ptrdiff_t dstStride, const Pel* srcPel, const Pel* gradX, const Pel* gradY, const int* dMvX, const int* dMvY, int shiftNum, Pel offset, const ClpRng& clpRng )
{
  static constexpr ptrdiff_t srcStride  = 6;
  static constexpr ptrdiff_t gradStride = 4;
  static constexpr ptrdiff_t dMvStride  = 4;

  const int dILimit = 1 << std::max<int>( clpRng.bd + 1, 13 );

#if USE_AVX2
  __m256i mm_dmvx, mm_dmvy, mm_gradx, mm_grady, mm_dI, mm_dI0, mm_src;
  __m256i mm_offset = _mm256_set1_epi16( offset );
  __m256i vibdimin  = _mm256_set1_epi16( clpRng.min() );
  __m256i vibdimax  = _mm256_set1_epi16( clpRng.max() );
  __m256i mm_dimin  = _mm256_set1_epi32( -dILimit );
  __m256i mm_dimax  = _mm256_set1_epi32( dILimit - 1 );

  const int *vX0 = dMvX, *vY0 = dMvY;
  const Pel *gX0 = gradX, *gY0 = gradY;

  // first two rows
  mm_dmvx = _mm256_loadu_si256( ( const __m256i * ) vX0 );
  mm_dmvy = _mm256_loadu_si256( ( const __m256i * ) vY0 );

  mm_dmvx = _mm256_packs_epi32( mm_dmvx, _mm256_setzero_si256() );
  mm_dmvy = _mm256_packs_epi32( mm_dmvy, _mm256_setzero_si256() );

  mm_gradx = _mm256_inserti128_si256( _mm256_castsi128_si256( _mm_loadl_epi64( ( __m128i* ) gX0 ) ), _mm_loadl_epi64( ( __m128i* )( gX0 + gradStride ) ), 1 );
  mm_grady = _mm256_inserti128_si256( _mm256_castsi128_si256( _mm_loadl_epi64( ( __m128i* ) gY0 ) ), _mm_loadl_epi64( ( __m128i* )( gY0 + gradStride ) ), 1 );

  mm_dI0 = _mm256_madd_epi16( _mm256_unpacklo_epi16( mm_dmvx, mm_dmvy ), _mm256_unpacklo_epi16( mm_gradx, mm_grady ) );
  mm_dI0 = _mm256_min_epi32 ( mm_dimax, _mm256_max_epi32( mm_dimin, mm_dI0 ) );

  // next two rows
  vX0 += ( dMvStride << 1 ); vY0 += ( dMvStride << 1 ); gX0 += ( gradStride << 1 ); gY0 += ( gradStride << 1 );

  mm_dmvx = _mm256_loadu_si256( ( const __m256i * ) vX0 );
  mm_dmvy = _mm256_loadu_si256( ( const __m256i * ) vY0 );

  mm_dmvx = _mm256_packs_epi32( mm_dmvx, _mm256_setzero_si256() );
  mm_dmvy = _mm256_packs_epi32( mm_dmvy, _mm256_setzero_si256() );

  mm_gradx = _mm256_inserti128_si256( _mm256_castsi128_si256( _mm_loadl_epi64( ( __m128i* ) gX0 ) ), _mm_loadl_epi64( ( __m128i* )( gX0 + gradStride ) ), 1 );
  mm_grady = _mm256_inserti128_si256( _mm256_castsi128_si256( _mm_loadl_epi64( ( __m128i* ) gY0 ) ), _mm_loadl_epi64( ( __m128i* )( gY0 + gradStride ) ), 1 );

  mm_dI = _mm256_madd_epi16( _mm256_unpacklo_epi16( mm_dmvx, mm_dmvy ), _mm256_unpacklo_epi16( mm_gradx, mm_grady ) );
  mm_dI = _mm256_min_epi32 ( mm_dimax, _mm256_max_epi32( mm_dimin, mm_dI ) );

  // combine four rows
  mm_dI = _mm256_packs_epi32( mm_dI0, mm_dI );
  const Pel* src0 = srcPel + srcStride;
  mm_src = _mm256_inserti128_si256(
    _mm256_castsi128_si256( _mm_unpacklo_epi64( _mm_loadl_epi64( ( const __m128i * ) srcPel ), _mm_loadl_epi64( ( const __m128i * )( srcPel + ( srcStride << 1 ) ) ) ) ),
    _mm_unpacklo_epi64( _mm_loadl_epi64( ( const __m128i * ) src0 ), _mm_loadl_epi64( ( const __m128i * )( src0 + ( srcStride << 1 ) ) ) ),
    1
  );
  mm_dI = _mm256_add_epi16( mm_dI, mm_src );
  if( !bi )
  {
    mm_dI = _mm256_srai_epi16( _mm256_adds_epi16( mm_dI, mm_offset ), shiftNum );
    mm_dI = _mm256_min_epi16 ( vibdimax, _mm256_max_epi16( vibdimin, mm_dI ) );
  }

  // store final results
  __m128i dITmp = _mm256_extracti128_si256( mm_dI, 1 );
  Pel* dst0 = dstPel;
  _mm_storel_epi64( ( __m128i * ) dst0, _mm256_castsi256_si128( mm_dI ) );
  dst0 += dstStride; _mm_storel_epi64( ( __m128i * ) dst0, dITmp );
  dst0 += dstStride; _mm_storel_epi64( ( __m128i * ) dst0, _mm_unpackhi_epi64( _mm256_castsi256_si128( mm_dI ), _mm256_castsi256_si128( mm_dI ) ) );
  dst0 += dstStride; _mm_storel_epi64( ( __m128i * ) dst0, _mm_unpackhi_epi64( dITmp, dITmp ) );
#else
  __m128i mm_dmvx, mm_dmvy, mm_gradx, mm_grady, mm_dI, mm_dI0;
  __m128i mm_offset = _mm_set1_epi16( offset );
  __m128i vibdimin  = _mm_set1_epi16( clpRng.min() );
  __m128i vibdimax  = _mm_set1_epi16( clpRng.max() );
  __m128i mm_dimin  = _mm_set1_epi32( -dILimit );
  __m128i mm_dimax  = _mm_set1_epi32( dILimit - 1 );

  static constexpr int height = 4;

  for( int h = 0; h < height; h += 2 )
  {
    const int* vX  = dMvX;
    const int* vY  = dMvY;
    const Pel* gX  = gradX;
    const Pel* gY  = gradY;
    const Pel* src = srcPel;
    Pel*       dst = dstPel;

    // first row
    mm_dmvx  = _mm_packs_epi32( _mm_loadu_si128( ( const __m128i * ) vX ), _mm_setzero_si128() );
    mm_dmvy  = _mm_packs_epi32( _mm_loadu_si128( ( const __m128i * ) vY ), _mm_setzero_si128() );
    mm_gradx = _mm_loadl_epi64( ( __m128i* ) gX );
    mm_grady = _mm_loadl_epi64( ( __m128i* ) gY );
    mm_dI0   = _mm_madd_epi16 ( _mm_unpacklo_epi16( mm_dmvx, mm_dmvy ), _mm_unpacklo_epi16( mm_gradx, mm_grady ) );
    mm_dI0   = _mm_min_epi32  ( mm_dimax, _mm_max_epi32( mm_dimin, mm_dI0 ) );

    // second row
    mm_dmvx  = _mm_packs_epi32( _mm_loadu_si128( ( const __m128i * )( vX + dMvStride ) ), _mm_setzero_si128() );
    mm_dmvy  = _mm_packs_epi32( _mm_loadu_si128( ( const __m128i * )( vY + dMvStride ) ), _mm_setzero_si128() );
    mm_gradx = _mm_loadl_epi64( ( __m128i* )( gX + gradStride ) );
    mm_grady = _mm_loadl_epi64( ( __m128i* )( gY + gradStride ) );
    mm_dI    = _mm_madd_epi16 ( _mm_unpacklo_epi16( mm_dmvx, mm_dmvy ), _mm_unpacklo_epi16( mm_gradx, mm_grady ) );
    mm_dI    = _mm_min_epi32  ( mm_dimax, _mm_max_epi32( mm_dimin, mm_dI ) );

    // combine both rows
    mm_dI = _mm_packs_epi32( mm_dI0, mm_dI );
    mm_dI = _mm_add_epi16  ( _mm_unpacklo_epi64( _mm_loadl_epi64( ( const __m128i * ) src ), _mm_loadl_epi64( ( const __m128i * )( src + srcStride ) ) ), mm_dI );
    if( !bi )
    {
      mm_dI = _mm_srai_epi16( _mm_adds_epi16( mm_dI, mm_offset ), shiftNum );
      mm_dI = _mm_min_epi16 ( vibdimax, _mm_max_epi16( vibdimin, mm_dI ) );
    }

    _mm_storel_epi64( ( __m128i * ) dst, mm_dI );
    _mm_storel_epi64( ( __m128i * )( dst + dstStride ), _mm_unpackhi_epi64( mm_dI, mm_dI ) );

    dMvX   += ( dMvStride  << 1 );
    dMvY   += ( dMvStride  << 1 );
    gradX  += ( gradStride << 1 );
    gradY  += ( gradStride << 1 );
    srcPel += ( srcStride  << 1 );
    dstPel += ( dstStride  << 1 );
  }
#endif
}

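// Symmetric rounding of a vector of dMv components: each element becomes
// ( v + ( 1 << ( nShift - 1 ) ) - ( v > 0 ) ) >> nShift, clipped to
// [ -dmvLimit, dmvLimit ].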
template< X86_VEXT vext >
void roundIntVector_SIMD( int* v, int size, unsigned int nShift, const int dmvLimit )
{
  CHECKD( size % 16 != 0, "Size must be multiple of 16!" );
#ifdef USE_AVX512
  if( vext >= AVX512 && size >= 16 )
  {
    __m512i dMvMin  = _mm512_set1_epi32( -dmvLimit );
    __m512i dMvMax  = _mm512_set1_epi32( dmvLimit );
    __m512i nOffset = _mm512_set1_epi32( 1 << ( nShift - 1 ) );
    __m512i vones   = _mm512_set1_epi32( 1 );
    __m512i vzero   = _mm512_setzero_si512();
    for( int i = 0; i < size; i += 16, v += 16 )
    {
      __m512i   src  = _mm512_loadu_si512( v );
      __mmask16 mask = _mm512_cmpge_epi32_mask( src, vzero );
      src            = _mm512_add_epi32( src, nOffset );
      __m512i dst    = _mm512_srai_epi32( _mm512_mask_sub_epi32( src, mask, src, vones ), nShift );
      dst            = _mm512_min_epi32( dMvMax, _mm512_max_epi32( dMvMin, dst ) );
      _mm512_storeu_si512( v, dst );
    }
  }
  else
#endif
#if USE_AVX2
  if( vext >= AVX2 && size >= 8 )
  {
    __m256i dMvMin  = _mm256_set1_epi32( -dmvLimit );
    __m256i dMvMax  = _mm256_set1_epi32( dmvLimit );
    __m256i nOffset = _mm256_set1_epi32( 1 << ( nShift - 1 ) );
    __m256i vzero   = _mm256_setzero_si256();
    for( int i = 0; i < size; i += 8, v += 8 )
    {
      __m256i src = _mm256_lddqu_si256( ( __m256i* ) v );
      __m256i of  = _mm256_cmpgt_epi32( src, vzero );
      __m256i dst = _mm256_srai_epi32( _mm256_add_epi32( _mm256_add_epi32( src, nOffset ), of ), nShift );
      dst         = _mm256_min_epi32( dMvMax, _mm256_max_epi32( dMvMin, dst ) );
      _mm256_storeu_si256( ( __m256i* ) v, dst );
    }
  }
  else
#endif
  {
    __m128i dMvMin  = _mm_set1_epi32( -dmvLimit );
    __m128i dMvMax  = _mm_set1_epi32( dmvLimit );
    __m128i nOffset = _mm_set1_epi32( 1 << ( nShift - 1 ) );
    __m128i vzero   = _mm_setzero_si128();
    for( int i = 0; i < size; i += 4, v += 4 )
    {
      __m128i src = _mm_loadu_si128( ( __m128i* ) v );
      __m128i of  = _mm_cmpgt_epi32( src, vzero );
      __m128i dst = _mm_srai_epi32( _mm_add_epi32( _mm_add_epi32( src, nOffset ), of ), nShift );
      dst         = _mm_min_epi32( dMvMax, _mm_max_epi32( dMvMin, dst ) );
      _mm_storeu_si128( ( __m128i* ) v, dst );
    }
  }
}

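// Central-difference gradients for BDOF ( PAD = true ) and PROF ( PAD = false ):
// source samples are pre-shifted by shift1 before the horizontal / vertical
// differences are taken. With PAD the outermost gradient column / row is
// replicated into the 1-sample border; without PAD a fixed 4x4 block is used.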
template< X86_VEXT vext, bool PAD = true >
void gradFilter_SSE( int16_t* src, ptrdiff_t _srcStride, int width, int height, ptrdiff_t _gradStride, int16_t* gradX, int16_t* gradY, const int bitDepth )
{
  const int widthInside  = PAD ? width  - 2 * BIO_EXTEND_SIZE : 4;
  const int heightInside = PAD ? height - 2 * BIO_EXTEND_SIZE : 4;
  const ptrdiff_t gradStride = PAD ? _gradStride : 4;
  const ptrdiff_t srcStride  = PAD ? _srcStride  : 6;

  int16_t* srcTmp   = PAD ? src   + srcStride  + 1 : src;
  int16_t* gradXTmp = PAD ? gradX + gradStride + 1 : gradX;
  int16_t* gradYTmp = PAD ? gradY + gradStride + 1 : gradY;

  const int shift1 = std::max<int>( 6, bitDepth - 6 );

  CHECKD( gradStride != _gradStride, "Wrong PROF stride!" );
  CHECKD( srcStride  != _srcStride,  "Wrong PROF stride!" );

#if USE_AVX2
  if( PAD && ( widthInside & 15 ) == 0 && vext >= AVX2 )
  {
    for( int y = 0; y < heightInside; y++ )
    {
      for( int x = 0; x < widthInside; x += 16 )
      {
        __m256i mmPixTop    = _mm256_srai_epi16( _mm256_loadu_si256( ( __m256i * )( srcTmp + x - srcStride ) ), shift1 );
        __m256i mmPixBottom = _mm256_srai_epi16( _mm256_loadu_si256( ( __m256i * )( srcTmp + x + srcStride ) ), shift1 );
        __m256i mmPixLeft   = _mm256_srai_epi16( _mm256_loadu_si256( ( __m256i * )( srcTmp + x - 1 ) ), shift1 );
        __m256i mmPixRight  = _mm256_srai_epi16( _mm256_loadu_si256( ( __m256i * )( srcTmp + x + 1 ) ), shift1 );

        __m256i mmGradVer = _mm256_sub_epi16( mmPixBottom, mmPixTop );
        __m256i mmGradHor = _mm256_sub_epi16( mmPixRight, mmPixLeft );

        _mm256_storeu_si256( ( __m256i * )( gradYTmp + x ), mmGradVer );
        _mm256_storeu_si256( ( __m256i * )( gradXTmp + x ), mmGradHor );
      }

      gradXTmp[widthInside] = gradXTmp[widthInside - 1];
      gradYTmp[widthInside] = gradYTmp[widthInside - 1];
      gradXTmp[-1] = gradXTmp[0];
      gradYTmp[-1] = gradYTmp[0];

      gradXTmp += gradStride;
      gradYTmp += gradStride;
      srcTmp   += srcStride;
    }
  }
  else
#endif
  if( PAD && ( widthInside & 7 ) == 0 )
  {
    for( int y = 0; y < heightInside; y++ )
    {
      for( int x = 0; x < widthInside; x += 8 )
      {
        __m128i mmPixTop    = _mm_srai_epi16( _mm_loadu_si128( ( __m128i* )( srcTmp + x - srcStride ) ), shift1 );
        __m128i mmPixBottom = _mm_srai_epi16( _mm_loadu_si128( ( __m128i* )( srcTmp + x + srcStride ) ), shift1 );
        __m128i mmPixLeft   = _mm_srai_epi16( _mm_loadu_si128( ( __m128i* )( srcTmp + x - 1 ) ), shift1 );
        __m128i mmPixRight  = _mm_srai_epi16( _mm_loadu_si128( ( __m128i* )( srcTmp + x + 1 ) ), shift1 );

        __m128i mmGradVer = _mm_sub_epi16( mmPixBottom, mmPixTop );
        __m128i mmGradHor = _mm_sub_epi16( mmPixRight, mmPixLeft );

        _mm_storeu_si128( ( __m128i * )( gradYTmp + x ), mmGradVer );
        _mm_storeu_si128( ( __m128i * )( gradXTmp + x ), mmGradHor );
      }

      if( PAD )
      {
        gradXTmp[widthInside] = gradXTmp[widthInside - 1];
        gradYTmp[widthInside] = gradYTmp[widthInside - 1];
        gradXTmp[-1] = gradXTmp[0];
        gradYTmp[-1] = gradYTmp[0];
      }

      gradXTmp += gradStride;
      gradYTmp += gradStride;
      srcTmp   += srcStride;
    }
  }
  else
  {
    CHECK( widthInside != 4, "Width needs to be '4'!" );

    for( int y = 0; y < ( PAD ? heightInside : 4 ); y++ )
    {
      __m128i mmPixTop    = _mm_srai_epi16( _mm_loadl_epi64( ( __m128i* )( srcTmp - srcStride ) ), shift1 );
      __m128i mmPixBottom = _mm_srai_epi16( _mm_loadl_epi64( ( __m128i* )( srcTmp + srcStride ) ), shift1 );
      __m128i mmPixLeft   = _mm_srai_epi16( _mm_loadl_epi64( ( __m128i* )( srcTmp - 1 ) ), shift1 );
      __m128i mmPixRight  = _mm_srai_epi16( _mm_loadl_epi64( ( __m128i* )( srcTmp + 1 ) ), shift1 );

      __m128i mmGradVer = _mm_sub_epi16( mmPixBottom, mmPixTop );
      __m128i mmGradHor = _mm_sub_epi16( mmPixRight, mmPixLeft );

      _mm_storel_epi64( ( __m128i * ) gradYTmp, mmGradVer );
      _mm_storel_epi64( ( __m128i * ) gradXTmp, mmGradHor );

      if( PAD )
      {
        gradXTmp[widthInside] = gradXTmp[widthInside - 1];
        gradYTmp[widthInside] = gradYTmp[widthInside - 1];
        gradXTmp[-1] = gradXTmp[0];
        gradYTmp[-1] = gradYTmp[0];
      }

      gradXTmp += gradStride;
      gradYTmp += gradStride;
      srcTmp   += srcStride;
    }
  }
#if USE_AVX2
  _mm256_zeroupper();
#endif

  if( PAD )
  {
    gradXTmp = gradX + gradStride;
    gradYTmp = gradY + gradStride;

    ::memcpy( gradXTmp + heightInside * gradStride, gradXTmp + ( heightInside - 1 ) * gradStride, sizeof( int16_t ) * width );
    ::memcpy( gradYTmp + heightInside * gradStride, gradYTmp + ( heightInside - 1 ) * gradStride, sizeof( int16_t ) * width );
    ::memcpy( gradXTmp - gradStride, gradXTmp, sizeof( int16_t ) * width );
    ::memcpy( gradYTmp - gradStride, gradYTmp, sizeof( int16_t ) * width );
  }
}

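// Hook the SIMD kernels above into the InterPrediction function pointers.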
template<X86_VEXT vext>
void InterPrediction::_initInterPredictionX86()
{
  BiOptFlow      = BiOptFlowCoreSIMD<vext>;
  PaddBIO        = PaddBIO_SIMD<vext>;
  BioGradFilter  = gradFilter_SSE<vext, true>;
  profGradFilter = gradFilter_SSE<vext, false>;
  applyPROF[0]   = applyPROF_SSE<vext, false>;
  applyPROF[1]   = applyPROF_SSE<vext, true>;
  roundIntVector = roundIntVector_SIMD<vext>;
}

template void InterPrediction::_initInterPredictionX86<SIMDX86>();

#endif   // TARGET_SIMD_X86
#endif   // ENABLE_SIMD_OPT_BIO

}   // namespace vvdec

//! \}