/* -----------------------------------------------------------------------------
The copyright in this software is being made available under the BSD
License, included below. No patent rights, trademark rights and/or
other Intellectual Property Rights other than the copyrights concerning
the Software are granted under this license.

For any license concerning other Intellectual Property rights than the software,
especially patent licenses, a separate Agreement needs to be closed.
For more information please contact:

Fraunhofer Heinrich Hertz Institute
Einsteinufer 37
10587 Berlin, Germany
www.hhi.fraunhofer.de/vvc
vvc@hhi.fraunhofer.de

Copyright (c) 2018-2021, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 * Redistributions of source code must retain the above copyright notice,
   this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.
 * Neither the name of Fraunhofer nor the names of its contributors may
   be used to endorse or promote products derived from this software without
   specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
THE POSSIBILITY OF SUCH DAMAGE.


------------------------------------------------------------------------------------------- */

/** \file     InterPredX86.h
    \brief    SIMD for InterPrediction
*/

//! \ingroup CommonLib
//! \{


#include "CommonLib/CommonDef.h"
#include "CommonDefX86.h"
#include "CommonLib/InterPrediction.h"

namespace vvdec
{

#if ENABLE_SIMD_OPT_BIO
#ifdef TARGET_SIMD_X86

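// Stores the low 32 bits of an XMM register to an unaligned address; presumably
// defined here as a replacement because not all supported compilers ship an
// _mm_storeu_si32 intrinsic.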
#define _mm_storeu_si32(p, a) (void)(*(int*)(p) = _mm_cvtsi128_si32((a)))

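// Prepares one padded row of reference samples for BDOF: left-shifts them by
// 'shift' and subtracts IF_INTERNAL_OFFS. The main loop handles multiples of
// eight samples; a short tail (2 * BIO_EXTEND_SIZE values) and the width <= 4
// case are stored separately.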
template<X86_VEXT vext>
inline void PaddBIO_SIMD( const Pel* refPel, Pel* dstPel, unsigned width, const int shift )
{
  int w;
  __m128i off = _mm_set1_epi16( ( Pel ) IF_INTERNAL_OFFS );

  if( width > 4 )
  {
    for( w = 0; w < width; w += 8 )
    {
      __m128i ref = _mm_lddqu_si128( ( __m128i const * )&refPel[w] );
      ref = _mm_slli_epi16( ref, shift );
      ref = _mm_sub_epi16( ref, off );
      _mm_storeu_si128( ( __m128i * )&dstPel[w], ref );
    }

    // 2 * BIO_EXTEND_SIZE
    __m128i ref = _mm_lddqu_si128( ( __m128i const * )&refPel[w] );
    ref = _mm_slli_epi16( ref, shift );
    ref = _mm_sub_epi16( ref, off );
    _mm_storeu_si32( ( __m128i * )&dstPel[w], ref );
  }
  else
  {
    __m128i ref = _mm_lddqu_si128( ( __m128i const * )&refPel[0] );
    ref = _mm_slli_epi16( ref, shift );
    ref = _mm_sub_epi16( ref, off );
    _mm_storel_epi64( ( __m128i * )&dstPel[0], ref );
    ref = _mm_srli_si128( ref, 8 );
    _mm_storeu_si32( ( __m128i * )&dstPel[4], ref );
  }
}

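// Approximate division: shifts numer right by the index of the most significant
// set bit of denom (denom must be positive).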
static inline int rightShiftMSB( int numer, int denom )
{
  int shiftIdx = _bit_scan_reverse( denom );
  return ( numer >> shiftIdx );
}

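// BDOF blending for one 4x4 block: per sample, adds the refinement
// tmpx * (gradX0 - gradX1) + tmpy * (gradY0 - gradY1) to src0 + src1, then
// rounds by (offset, shift) and clips to the output bit depth.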
template<X86_VEXT vext>
static inline void addBIOAvg4_SSE(const int16_t* src0, const int16_t* src1, int16_t* dst, ptrdiff_t dstStride, const int16_t* gradX0, const int16_t* gradX1, const int16_t* gradY0, const int16_t* gradY1, ptrdiff_t widthG, int tmpx, int tmpy, int shift, int offset, const ClpRng& clpRng)
{
  const ptrdiff_t src0Stride = widthG;
  const ptrdiff_t src1Stride = widthG;
  const ptrdiff_t gradStride = widthG;

  __m128i mm_tmpx    = _mm_set1_epi32( ( tmpx & 0xffff ) | ( tmpy << 16 ) );
  __m128i mm_offset  = _mm_set1_epi32( offset );
  __m128i vibdimin   = _mm_set1_epi16( clpRng.min() );
  __m128i vibdimax   = _mm_set1_epi16( clpRng.max() );
  __m128i mm_a;
  __m128i mm_b;
  __m128i mm_sum;

  for( int y = 0; y < 4; y++, dst += dstStride, src0 += src0Stride, src1 += src1Stride, gradX0 += gradStride, gradX1 += gradStride, gradY0 += gradStride, gradY1 += gradStride )
  {
    mm_a   = _mm_unpacklo_epi16 ( _mm_loadl_epi64( (const __m128i *) gradX0 ), _mm_loadl_epi64( (const __m128i *) gradY0 ) );
    mm_b   = _mm_unpacklo_epi16 ( _mm_loadl_epi64( (const __m128i *) gradX1 ), _mm_loadl_epi64( (const __m128i *) gradY1 ) );
    mm_a   = _mm_sub_epi16      ( mm_a, mm_b );
    mm_sum = _mm_madd_epi16     ( mm_a, mm_tmpx );
    mm_a   = _mm_cvtepi16_epi32 ( _mm_loadl_epi64( (const __m128i *) ( src0 ) ) );
    mm_b   = _mm_cvtepi16_epi32 ( _mm_loadl_epi64( (const __m128i *) ( src1 ) ) );
    mm_sum = _mm_add_epi32      ( _mm_add_epi32( mm_sum, mm_a ), _mm_add_epi32( mm_b, mm_offset ) );
    mm_sum = _mm_packs_epi32    ( _mm_srai_epi32( mm_sum, shift ), mm_a );
    mm_sum = _mm_min_epi16      ( vibdimax, _mm_max_epi16( vibdimin, mm_sum ) );
    _mm_storel_epi64            ( (__m128i *) dst, mm_sum );
  }
}

#if USE_AVX2
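// AVX2 variant of addBIOAvg4_SSE: blends two horizontally adjacent 4x4 blocks
// per call, using (tmpx0, tmpy0) for the lower 128-bit lane and (tmpx1, tmpy1)
// for the upper lane.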
static inline void addBIOAvg4_2x_AVX2(const int16_t* src0, const int16_t* src1, int16_t* dst, ptrdiff_t dstStride, const int16_t* gradX0, const int16_t* gradX1, const int16_t* gradY0, const int16_t* gradY1, ptrdiff_t widthG, int tmpx0, int tmpx1, int tmpy0, int tmpy1, int shift, int offset, const ClpRng& clpRng)
{
  const ptrdiff_t src0Stride = widthG;
  const ptrdiff_t src1Stride = widthG;
  const ptrdiff_t gradStride = widthG;

  __m256i mm_tmpx    = _mm256_inserti128_si256( _mm256_castsi128_si256( _mm_set1_epi32( ( tmpx0 & 0xffff ) | ( tmpy0 << 16 ) ) ), _mm_set1_epi32( ( tmpx1 & 0xffff ) | ( tmpy1 << 16 ) ), 1 );
  __m256i mm_offset  = _mm256_set1_epi32( offset );
  __m256i vibdimin   = _mm256_set1_epi32( clpRng.min() );
  __m256i vibdimax   = _mm256_set1_epi32( clpRng.max() );
  __m256i mm_a;
  __m256i mm_b;
  __m256i mm_sum;
  __m128i xsrc0, xsrc1;

  for( int y = 0; y < 4; y++, dst += dstStride, src0 += src0Stride, src1 += src1Stride, gradX0 += gradStride, gradX1 += gradStride, gradY0 += gradStride, gradY1 += gradStride )
  {
    xsrc0  = _mm_loadu_si128       ( ( const __m128i * ) gradX0 );
    xsrc1  = _mm_loadu_si128       ( ( const __m128i * ) gradY0 );
    mm_a   = _mm256_castsi128_si256( _mm_unpacklo_epi16( xsrc0, xsrc1 ) );
    mm_a   = _mm256_inserti128_si256( mm_a, _mm_unpackhi_epi16( xsrc0, xsrc1 ), 1 );
    xsrc0  = _mm_loadu_si128       ( ( const __m128i * ) gradX1 );
    xsrc1  = _mm_loadu_si128       ( ( const __m128i * ) gradY1 );
    mm_b   = _mm256_castsi128_si256( _mm_unpacklo_epi16( xsrc0, xsrc1 ) );
    mm_b   = _mm256_inserti128_si256( mm_b, _mm_unpackhi_epi16( xsrc0, xsrc1 ), 1 );
    mm_a   = _mm256_sub_epi16      ( mm_a, mm_b );
    mm_sum = _mm256_madd_epi16     ( mm_a, mm_tmpx );
    mm_a   = _mm256_cvtepi16_epi32 ( _mm_loadu_si128( (const __m128i *) ( src0 ) ) );
    mm_b   = _mm256_cvtepi16_epi32 ( _mm_loadu_si128( (const __m128i *) ( src1 ) ) );
    mm_sum = _mm256_add_epi32      ( _mm256_add_epi32( mm_sum, mm_a ), _mm256_add_epi32( mm_b, mm_offset ) );
    mm_sum = _mm256_srai_epi32     ( mm_sum, shift );
    mm_sum = _mm256_min_epi32      ( vibdimax, _mm256_max_epi32( vibdimin, mm_sum ) );
    _mm_storeu_si128               ( (__m128i *) dst, _mm256_cvtepi32_epi16x( mm_sum ) );
  }
}
#endif

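// Accumulates the BDOF correlation sums over a 6x6 window (the madd masks below
// keep only the first six of the eight 16-bit lanes): sums of |gX|, |gY|,
// dI*sign(gX), dI*sign(gY) and gX*sign(gY), where gX/gY are the averaged,
// down-shifted gradients and dI is the down-shifted sample difference. From
// these it derives the motion refinement (tmpx, tmpy), clipped to [-limit, limit].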
template< X86_VEXT vext >
static inline void calcBIOSums_SSE(const Pel* srcY0Tmp, const Pel* srcY1Tmp, const Pel* gradX0, const Pel* gradX1, const Pel* gradY0, const Pel* gradY1, const int widthG, const int bitDepth, int limit, int &tmpx, int &tmpy)
{
  static constexpr int shift4 = 4;
  static constexpr int shift5 = 1;
  const int srcStride = widthG;

  __m128i sumAbsGXTmp = _mm_setzero_si128();
  __m128i sumDIXTmp = _mm_setzero_si128();
  __m128i sumAbsGYTmp = _mm_setzero_si128();
  __m128i sumDIYTmp = _mm_setzero_si128();
  __m128i sumSignGyGxTmp = _mm_setzero_si128();

  for (int y = 0; y < 6; y++)
  {
    __m128i shiftSrcY0Tmp = _mm_srai_epi16(_mm_loadu_si128((__m128i*)(srcY0Tmp)), shift4);
    __m128i shiftSrcY1Tmp = _mm_srai_epi16(_mm_loadu_si128((__m128i*)(srcY1Tmp)), shift4);
    __m128i loadGradX0 = _mm_loadu_si128((__m128i*)(gradX0));
    __m128i loadGradX1 = _mm_loadu_si128((__m128i*)(gradX1));
    __m128i loadGradY0 = _mm_loadu_si128((__m128i*)(gradY0));
    __m128i loadGradY1 = _mm_loadu_si128((__m128i*)(gradY1));
    __m128i subTemp1 = _mm_sub_epi16(shiftSrcY1Tmp, shiftSrcY0Tmp);
    __m128i packTempX = _mm_srai_epi16(_mm_add_epi16(loadGradX0, loadGradX1), shift5);
    __m128i packTempY = _mm_srai_epi16(_mm_add_epi16(loadGradY0, loadGradY1), shift5);
    __m128i gX = _mm_abs_epi16(packTempX);
    __m128i gY = _mm_abs_epi16(packTempY);
    __m128i dIX       = _mm_sign_epi16(subTemp1,  packTempX );
    __m128i dIY       = _mm_sign_epi16(subTemp1,  packTempY );
    __m128i signGY_GX = _mm_sign_epi16(packTempX, packTempY );

    sumAbsGXTmp = _mm_add_epi16(sumAbsGXTmp, gX);
    sumDIXTmp = _mm_add_epi16(sumDIXTmp, dIX);
    sumAbsGYTmp = _mm_add_epi16(sumAbsGYTmp, gY);
    sumDIYTmp = _mm_add_epi16(sumDIYTmp, dIY);
    sumSignGyGxTmp = _mm_add_epi16(sumSignGyGxTmp, signGY_GX);
    srcY0Tmp += srcStride;
    srcY1Tmp += srcStride;
    gradX0 += widthG;
    gradX1 += widthG;
    gradY0 += widthG;
    gradY1 += widthG;
  }

  sumAbsGXTmp    = _mm_madd_epi16(sumAbsGXTmp,    _mm_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0));
  sumDIXTmp      = _mm_madd_epi16(sumDIXTmp,      _mm_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0));
  sumAbsGYTmp    = _mm_madd_epi16(sumAbsGYTmp,    _mm_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0));
  sumDIYTmp      = _mm_madd_epi16(sumDIYTmp,      _mm_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0));
  sumSignGyGxTmp = _mm_madd_epi16(sumSignGyGxTmp, _mm_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0));

  __m128i a12 = _mm_unpacklo_epi32(sumAbsGXTmp, sumAbsGYTmp);
  __m128i a3  = _mm_unpackhi_epi32(sumAbsGXTmp, sumAbsGYTmp);
  __m128i b12 = _mm_unpacklo_epi32(sumDIXTmp, sumDIYTmp);
  __m128i b3  = _mm_unpackhi_epi32(sumDIXTmp, sumDIYTmp);
  __m128i c1  = _mm_unpacklo_epi64(a12, b12);
  __m128i c2  = _mm_unpackhi_epi64(a12, b12);
  __m128i c3  = _mm_unpacklo_epi64(a3, b3);

  c1 = _mm_add_epi32(c1, c2);
  c1 = _mm_add_epi32(c1, c3);

  int sumAbsGX = _mm_cvtsi128_si32(c1);
  int sumAbsGY = _mm_cvtsi128_si32(_mm_shuffle_epi32(c1, 0x55));
  int sumDIX   = _mm_cvtsi128_si32(_mm_shuffle_epi32(c1, 0xaa));
  int sumDIY   = _mm_cvtsi128_si32(_mm_shuffle_epi32(c1, 0xff));

  sumSignGyGxTmp = _mm_add_epi32(sumSignGyGxTmp, _mm_shuffle_epi32(sumSignGyGxTmp, 0x4e));   // 01001110
  sumSignGyGxTmp = _mm_add_epi32(sumSignGyGxTmp, _mm_shuffle_epi32(sumSignGyGxTmp, 0xb1));   // 10110001
  int sumSignGY_GX  = _mm_cvtsi128_si32(sumSignGyGxTmp);

  tmpx = sumAbsGX == 0 ? 0 : rightShiftMSB( sumDIX << 2, sumAbsGX );
  tmpx = Clip3( -limit, limit, tmpx );

  int mainsGxGy = sumSignGY_GX >> 12;
  int secsGxGy  = sumSignGY_GX & ( ( 1 << 12 ) - 1 );
  int tmpData   = tmpx * mainsGxGy;
  tmpData       = ( ( tmpData << 12 ) + tmpx * secsGxGy ) >> 1;
  tmpy = sumAbsGY == 0 ? 0 : rightShiftMSB( ( ( sumDIY << 2 ) - tmpData ), sumAbsGY );
  tmpy = Clip3( -limit, limit, tmpy );
}

#if USE_AVX2
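// AVX2 variant of calcBIOSums_SSE: evaluates two 6x6 windows at once (the
// second one offset by four samples), producing the refinements (tmpx0, tmpy0)
// and (tmpx1, tmpy1) for two adjacent 4x4 subblocks.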
static inline void calcBIOSums2x_AVX2(const Pel* srcY0Tmp, const Pel* srcY1Tmp, const Pel* gradX0, const Pel* gradX1, const Pel* gradY0, const Pel* gradY1, const int widthG, const int bitDepth, int limit, int &tmpx0, int &tmpx1, int &tmpy0, int &tmpy1 )
{
  static constexpr int shift4 = 4;
  static constexpr int shift5 = 1;
  const int srcStride = widthG;

  __m256i sumAbsGXTmp     = _mm256_setzero_si256();
  __m256i sumDIXTmp       = _mm256_setzero_si256();
  __m256i sumAbsGYTmp     = _mm256_setzero_si256();
  __m256i sumDIYTmp       = _mm256_setzero_si256();
  __m256i sumSignGyGxTmp  = _mm256_setzero_si256();

#define _mm256_load2_si128_offset4(addr) _mm256_inserti128_si256( _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*) &addr[0])), _mm_loadu_si128((const __m128i*) &addr[4]), 1 )

  for (int y = 0; y < 6; y++)
  {
    __m256i shiftSrcY0Tmp = _mm256_srai_epi16(_mm256_load2_si128_offset4(srcY0Tmp), shift4);
    __m256i shiftSrcY1Tmp = _mm256_srai_epi16(_mm256_load2_si128_offset4(srcY1Tmp), shift4);
    __m256i loadGradX0    = _mm256_load2_si128_offset4(gradX0);
    __m256i loadGradX1    = _mm256_load2_si128_offset4(gradX1);
    __m256i loadGradY0    = _mm256_load2_si128_offset4(gradY0);
    __m256i loadGradY1    = _mm256_load2_si128_offset4(gradY1);
    __m256i subTemp1      = _mm256_sub_epi16(shiftSrcY1Tmp, shiftSrcY0Tmp);
    __m256i packTempX     = _mm256_srai_epi16(_mm256_add_epi16(loadGradX0, loadGradX1), shift5);
    __m256i packTempY     = _mm256_srai_epi16(_mm256_add_epi16(loadGradY0, loadGradY1), shift5);
    __m256i gX            = _mm256_abs_epi16(packTempX);
    __m256i gY            = _mm256_abs_epi16(packTempY);
    __m256i dIX           = _mm256_sign_epi16(subTemp1,  packTempX );
    __m256i dIY           = _mm256_sign_epi16(subTemp1,  packTempY );
    __m256i signGY_GX     = _mm256_sign_epi16(packTempX, packTempY );

    sumAbsGXTmp     = _mm256_add_epi16(sumAbsGXTmp, gX);
    sumDIXTmp       = _mm256_add_epi16(sumDIXTmp, dIX);
    sumAbsGYTmp     = _mm256_add_epi16(sumAbsGYTmp, gY);
    sumDIYTmp       = _mm256_add_epi16(sumDIYTmp, dIY);
    sumSignGyGxTmp  = _mm256_add_epi16(sumSignGyGxTmp, signGY_GX);

    srcY0Tmp += srcStride;
    srcY1Tmp += srcStride;
    gradX0 += widthG;
    gradX1 += widthG;
    gradY0 += widthG;
    gradY1 += widthG;
  }

#undef _mm256_load2_si128_offset4

  sumAbsGXTmp    = _mm256_madd_epi16(sumAbsGXTmp,    _mm256_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0));
  sumDIXTmp      = _mm256_madd_epi16(sumDIXTmp,      _mm256_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0));
  sumAbsGYTmp    = _mm256_madd_epi16(sumAbsGYTmp,    _mm256_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0));
  sumDIYTmp      = _mm256_madd_epi16(sumDIYTmp,      _mm256_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0));
  sumSignGyGxTmp = _mm256_madd_epi16(sumSignGyGxTmp, _mm256_setr_epi16(1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0));

  __m256i a12 = _mm256_unpacklo_epi32(sumAbsGXTmp, sumAbsGYTmp);
  __m256i a3  = _mm256_unpackhi_epi32(sumAbsGXTmp, sumAbsGYTmp);
  __m256i b12 = _mm256_unpacklo_epi32(sumDIXTmp, sumDIYTmp);
  __m256i b3  = _mm256_unpackhi_epi32(sumDIXTmp, sumDIYTmp);
  __m256i c1  = _mm256_unpacklo_epi64(a12, b12);
  __m256i c2  = _mm256_unpackhi_epi64(a12, b12);
  __m256i c3  = _mm256_unpacklo_epi64(a3, b3);

  c1 = _mm256_add_epi32(c1, c2);
  c1 = _mm256_add_epi32(c1, c3);

  int tmpData[8];

  _mm256_storeu_si256( ( __m256i* ) &tmpData[0], c1 );

  #define sumAbsGX0 tmpData[0]
  #define sumAbsGX1 tmpData[4]

  #define sumAbsGY0 tmpData[1]
  #define sumAbsGY1 tmpData[5]

  #define sumDIX0   tmpData[2]
  #define sumDIX1   tmpData[6]

  #define sumDIY0   tmpData[3]
  #define sumDIY1   tmpData[7]

  sumSignGyGxTmp = _mm256_add_epi32(sumSignGyGxTmp, _mm256_shuffle_epi32(sumSignGyGxTmp, 0x4e));   // 01001110
  sumSignGyGxTmp = _mm256_add_epi32(sumSignGyGxTmp, _mm256_shuffle_epi32(sumSignGyGxTmp, 0xb1));   // 10110001

  int sumSignGY_GX0 = _mm256_extract_epi32( sumSignGyGxTmp, 0 );
  int sumSignGY_GX1 = _mm256_extract_epi32( sumSignGyGxTmp, 4 );

#if 0
  tmpx0 = sumAbsGX0 == 0 ? 0 : Clip3( -limit, limit, rightShiftMSB( sumDIX0 << 2, sumAbsGX0 ) );
  tmpx1 = sumAbsGX1 == 0 ? 0 : Clip3( -limit, limit, rightShiftMSB( sumDIX1 << 2, sumAbsGX1 ) );
  __m128i vtmpx         = _mm_setr_epi32 ( tmpx0, tmpx1, 0, 0 );
  __m128i vsumSignGY_GX = _mm_setr_epi32 ( sumSignGY_GX0, sumSignGY_GX1, 0, 0 );
  __m128i vmainsGxGy    = _mm_srai_epi32 ( vsumSignGY_GX, 12 );
  __m128i vsecsGxGy     = _mm_and_si128  ( vsumSignGY_GX, _mm_set1_epi32( ( 1 << 12 ) - 1 ) );
  __m128i vtmpData      = _mm_mullo_epi32( vtmpx, vmainsGxGy );
  vtmpData              = _mm_slli_epi32 ( vtmpData, 12 );
  vtmpData              = _mm_add_epi32  ( vtmpData, _mm_mullo_epi32( vtmpx, vsecsGxGy ) );
  vtmpData              = _mm_srai_epi32 ( vtmpData, 1 );
  __m128i vtmpyIn       = _mm_slli_epi32 ( _mm_setr_epi32( sumDIY0, sumDIY1, 0, 0 ), 2 );
  vtmpyIn               = _mm_sub_epi32  ( vtmpyIn, vtmpData );

  tmpy0 = sumAbsGY0 == 0 ? 0 : Clip3( -limit, limit, rightShiftMSB( _mm_extract_epi32( vtmpyIn, 0 ), sumAbsGY0 ) );
  tmpy1 = sumAbsGY1 == 0 ? 0 : Clip3( -limit, limit, rightShiftMSB( _mm_extract_epi32( vtmpyIn, 1 ), sumAbsGY1 ) );
#else
  tmpx0 = sumAbsGX0 == 0 ? 0 : rightShiftMSB( sumDIX0 << 2, sumAbsGX0 );
  tmpx0 = Clip3( -limit, limit, tmpx0 );

  int mainsGxGy0 = sumSignGY_GX0 >> 12;
  int secsGxGy0  = sumSignGY_GX0 & ( ( 1 << 12 ) - 1 );
  int tmpData0   = tmpx0 * mainsGxGy0;
  tmpData0       = ( ( tmpData0 << 12 ) + tmpx0 * secsGxGy0 ) >> 1;
  tmpy0 = sumAbsGY0 == 0 ? 0 : rightShiftMSB( ( ( sumDIY0 << 2 ) - tmpData0 ), sumAbsGY0 );
  tmpy0 = Clip3( -limit, limit, tmpy0 );


  tmpx1 = sumAbsGX1 == 0 ? 0 : rightShiftMSB( sumDIX1 << 2, sumAbsGX1 );
  tmpx1 = Clip3( -limit, limit, tmpx1 );

  int mainsGxGy1 = sumSignGY_GX1 >> 12;
  int secsGxGy1  = sumSignGY_GX1 & ( ( 1 << 12 ) - 1 );
  int tmpData1   = tmpx1 * mainsGxGy1;
  tmpData1 = ( ( tmpData1 << 12 ) + tmpx1 * secsGxGy1 ) >> 1;
  tmpy1 = sumAbsGY1 == 0 ? 0 : rightShiftMSB( ( ( sumDIY1 << 2 ) - tmpData1 ), sumAbsGY1 );
  tmpy1 = Clip3( -limit, limit, tmpy1 );
#endif

#undef sumAbsGX0
#undef sumAbsGX1
#undef sumAbsGY0
#undef sumAbsGY1
#undef sumDIX0
#undef sumDIX1
#undef sumDIY0
#undef sumDIY1
}
#endif

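// Core BDOF loop: walks the CU in 4x4 subblocks, derives the per-subblock
// motion refinement with calcBIOSums and applies it with addBIOAvg4. With AVX2
// enabled, two horizontally adjacent subblocks are handled per iteration.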
template< X86_VEXT vext>
void BiOptFlowCoreSIMD( const Pel* srcY0,
                        const Pel* srcY1,
                        const Pel* gradX0,
                        const Pel* gradX1,
                        const Pel* gradY0,
                        const Pel* gradY1,
                        const int  width,
                        const int  height,
                              Pel* dstY,
                        const ptrdiff_t dstStride,
                        const int  shiftNum,
                        const int  offset,
                        const int  limit,
                        const ClpRng& clpRng,
                        const int bitDepth )
{
  const int widthG        = width  + BIO_ALIGN_SIZE;
  const int stridePredMC  = widthG;
        int offsetPos     = widthG * BIO_EXTEND_SIZE + BIO_EXTEND_SIZE;
  const int xUnit         = ( width  >> 2 );
  const int yUnit         = ( height >> 2 );

  const Pel* srcY0Temp;
  const Pel* srcY1Temp;
        Pel *dstY0;

  int OffPos;
  int OffPad = 0;

  for( int yu = 0; yu < yUnit; yu++, srcY0 += ( stridePredMC << 2 ), srcY1 += ( stridePredMC << 2 ), dstY += ( dstStride << 2 ), offsetPos += ( widthG << 2 ) )
  {
    srcY0Temp = srcY0;
    srcY1Temp = srcY1;
    dstY0     = dstY;

    OffPos = offsetPos;
    OffPad = ( ( yu * widthG ) << 2 );

#if USE_AVX2
    for( int xu = 0; xu < xUnit; xu += 2, srcY0Temp += 8, srcY1Temp += 8, dstY0 += 8, OffPos += 8, OffPad += 8 )
    {
      int tmpx0, tmpy0, tmpx1, tmpy1;

      //calcBIOSums_SSE<vext>( srcY0Temp + 0, srcY1Temp + 0, gradX0 + OffPad + 0, gradX1 + OffPad + 0, gradY0 + OffPad + 0, gradY1 + OffPad + 0, stridePredMC, bitDepth, limit, tmpx, tmpy );
      //calcBIOSums_SSE<vext>( srcY0Temp + 0, srcY1Temp + 0, gradX0 + OffPad + 0, gradX1 + OffPad + 0, gradY0 + OffPad + 0, gradY1 + OffPad + 0, stridePredMC, bitDepth, limit, tmpx, tmpy );
      calcBIOSums2x_AVX2( srcY0Temp, srcY1Temp, gradX0 + OffPad, gradX1 + OffPad, gradY0 + OffPad, gradY1 + OffPad, stridePredMC, bitDepth, limit, tmpx0, tmpx1, tmpy0, tmpy1 );

      //addBIOAvg4_SSE<vext>( srcY0Temp + stridePredMC + 1 + 0, srcY1Temp + stridePredMC + 1 + 0, dstY0 + 0, dstStride, gradX0 + OffPos + 0, gradX1 + OffPos + 0, gradY0 + OffPos + 0, gradY1 + OffPos + 0, widthG, tmpx0, tmpy0, shiftNum, offset, clpRng );
      //addBIOAvg4_SSE<vext>( srcY0Temp + stridePredMC + 1 + 4, srcY1Temp + stridePredMC + 1 + 4, dstY0 + 4, dstStride, gradX0 + OffPos + 4, gradX1 + OffPos + 4, gradY0 + OffPos + 4, gradY1 + OffPos + 4, widthG, tmpx1, tmpy1, shiftNum, offset, clpRng );
      addBIOAvg4_2x_AVX2( srcY0Temp + stridePredMC + 1, srcY1Temp + stridePredMC + 1, dstY0, dstStride, gradX0 + OffPos, gradX1 + OffPos, gradY0 + OffPos, gradY1 + OffPos, widthG, tmpx0, tmpx1, tmpy0, tmpy1, shiftNum, offset, clpRng );
    }  // xu
#else
    for( int xu = 0; xu < xUnit; xu++, srcY0Temp += 4, srcY1Temp += 4, dstY0 += 4, OffPos += 4, OffPad += 4 )
    {
      int tmpx, tmpy;

      calcBIOSums_SSE<vext>( srcY0Temp, srcY1Temp, gradX0 + OffPad, gradX1 + OffPad, gradY0 + OffPad, gradY1 + OffPad, stridePredMC, bitDepth, limit, tmpx, tmpy );

      addBIOAvg4_SSE<vext> ( srcY0Temp + stridePredMC + 1, srcY1Temp + stridePredMC + 1, dstY0, dstStride, gradX0 + OffPos, gradX1 + OffPos, gradY0 + OffPos, gradY1 + OffPos, widthG, tmpx, tmpy, shiftNum, offset, clpRng );
    }  // xu
#endif
  }  // yu
#if USE_AVX2

  _mm256_zeroupper();
#endif
}

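// PROF (prediction refinement with optical flow) for a 4x4 block: per sample,
// dI = dMvX * gradX + dMvY * gradY is clipped to the dILimit range and added to
// the source prediction; when bi == false the result is additionally rounded by
// (offset, shiftNum) and clipped to the output bit depth.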
template< X86_VEXT vext, bool bi >
void applyPROF_SSE(Pel* dstPel, ptrdiff_t dstStride, const Pel* srcPel, const Pel* gradX, const Pel* gradY, const int* dMvX, const int* dMvY, int shiftNum, Pel offset, const ClpRng& clpRng)
{
  static constexpr ptrdiff_t srcStride  = 6;
  static constexpr ptrdiff_t gradStride = 4;
  static constexpr ptrdiff_t dMvStride  = 4;

  const int dILimit = 1 << std::max<int>(clpRng.bd + 1, 13);

#if USE_AVX2
  __m256i mm_dmvx, mm_dmvy, mm_gradx, mm_grady, mm_dI, mm_dI0, mm_src;
  __m256i mm_offset = _mm256_set1_epi16( offset );
  __m256i vibdimin  = _mm256_set1_epi16( clpRng.min() );
  __m256i vibdimax  = _mm256_set1_epi16( clpRng.max() );
  __m256i mm_dimin  = _mm256_set1_epi32( -dILimit );
  __m256i mm_dimax  = _mm256_set1_epi32( dILimit - 1 );

  const int *vX0 = dMvX, *vY0 = dMvY;
  const Pel *gX0 = gradX, *gY0 = gradY;

  // first two rows
  mm_dmvx = _mm256_loadu_si256( ( const __m256i * ) vX0 );
  mm_dmvy = _mm256_loadu_si256( ( const __m256i * ) vY0 );

  mm_dmvx = _mm256_packs_epi32( mm_dmvx, _mm256_setzero_si256() );
  mm_dmvy = _mm256_packs_epi32( mm_dmvy, _mm256_setzero_si256() );

  mm_gradx = _mm256_inserti128_si256( _mm256_castsi128_si256( _mm_loadl_epi64( ( __m128i* )gX0 ) ), _mm_loadl_epi64( ( __m128i* )( gX0 + gradStride ) ), 1 );
  mm_grady = _mm256_inserti128_si256( _mm256_castsi128_si256( _mm_loadl_epi64( ( __m128i* )gY0 ) ), _mm_loadl_epi64( ( __m128i* )( gY0 + gradStride ) ), 1 );

  mm_dI0   = _mm256_madd_epi16( _mm256_unpacklo_epi16( mm_dmvx, mm_dmvy ), _mm256_unpacklo_epi16( mm_gradx, mm_grady ) );
  mm_dI0   = _mm256_min_epi32( mm_dimax, _mm256_max_epi32( mm_dimin, mm_dI0 ) );

  // next two rows
  vX0 += ( dMvStride << 1 ); vY0 += ( dMvStride << 1 ); gX0 += ( gradStride << 1 ); gY0 += ( gradStride << 1 );

  mm_dmvx = _mm256_loadu_si256( ( const __m256i * ) vX0 );
  mm_dmvy = _mm256_loadu_si256( ( const __m256i * ) vY0 );

  mm_dmvx = _mm256_packs_epi32( mm_dmvx, _mm256_setzero_si256() );
  mm_dmvy = _mm256_packs_epi32( mm_dmvy, _mm256_setzero_si256() );

  mm_gradx = _mm256_inserti128_si256( _mm256_castsi128_si256( _mm_loadl_epi64( ( __m128i* )gX0 ) ), _mm_loadl_epi64( ( __m128i* )( gX0 + gradStride ) ), 1 );
  mm_grady = _mm256_inserti128_si256( _mm256_castsi128_si256( _mm_loadl_epi64( ( __m128i* )gY0 ) ), _mm_loadl_epi64( ( __m128i* )( gY0 + gradStride ) ), 1 );

  mm_dI    = _mm256_madd_epi16( _mm256_unpacklo_epi16( mm_dmvx, mm_dmvy ), _mm256_unpacklo_epi16( mm_gradx, mm_grady ) );
  mm_dI    = _mm256_min_epi32( mm_dimax, _mm256_max_epi32( mm_dimin, mm_dI ) );

  // combine four rows
  mm_dI = _mm256_packs_epi32( mm_dI0, mm_dI );
  const Pel* src0 = srcPel + srcStride;
  mm_src = _mm256_inserti128_si256(
    _mm256_castsi128_si256(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)srcPel), _mm_loadl_epi64((const __m128i *)(srcPel + (srcStride << 1))))),
    _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)src0), _mm_loadl_epi64((const __m128i *)(src0 + (srcStride << 1)))),
    1
  );
  mm_dI = _mm256_add_epi16(mm_dI, mm_src);
  if (!bi)
  {
    mm_dI = _mm256_srai_epi16(_mm256_adds_epi16(mm_dI, mm_offset), shiftNum);
    mm_dI = _mm256_min_epi16(vibdimax, _mm256_max_epi16(vibdimin, mm_dI));
  }

  // store final results
  __m128i dITmp = _mm256_extracti128_si256(mm_dI, 1);
  Pel* dst0 = dstPel;
  _mm_storel_epi64((__m128i *)dst0, _mm256_castsi256_si128(mm_dI));
  dst0 += dstStride; _mm_storel_epi64((__m128i *)dst0, dITmp);
  dst0 += dstStride; _mm_storel_epi64((__m128i *)dst0, _mm_unpackhi_epi64(_mm256_castsi256_si128(mm_dI), _mm256_castsi256_si128(mm_dI)));
  dst0 += dstStride; _mm_storel_epi64((__m128i *)dst0, _mm_unpackhi_epi64(dITmp, dITmp));
#else
  __m128i mm_dmvx, mm_dmvy, mm_gradx, mm_grady, mm_dI, mm_dI0;
  __m128i mm_offset = _mm_set1_epi16( offset );
  __m128i vibdimin  = _mm_set1_epi16( clpRng.min() );
  __m128i vibdimax  = _mm_set1_epi16( clpRng.max() );
  __m128i mm_dimin  = _mm_set1_epi32( -dILimit );
  __m128i mm_dimax  = _mm_set1_epi32( dILimit - 1 );

  static constexpr int height = 4;

  for( int h = 0; h < height; h += 2 )
  {
    const int* vX = dMvX;
    const int* vY = dMvY;
    const Pel* gX = gradX;
    const Pel* gY = gradY;
    const Pel* src = srcPel;
    Pel*       dst = dstPel;

    // first row
    mm_dmvx  = _mm_packs_epi32( _mm_loadu_si128( ( const __m128i * ) vX ), _mm_setzero_si128() );
    mm_dmvy  = _mm_packs_epi32( _mm_loadu_si128( ( const __m128i * ) vY ), _mm_setzero_si128() );
    mm_gradx = _mm_loadl_epi64( ( __m128i* ) gX );
    mm_grady = _mm_loadl_epi64( ( __m128i* ) gY );
    mm_dI0   = _mm_madd_epi16 ( _mm_unpacklo_epi16( mm_dmvx, mm_dmvy ), _mm_unpacklo_epi16( mm_gradx, mm_grady ) );
    mm_dI0   = _mm_min_epi32  ( mm_dimax, _mm_max_epi32( mm_dimin, mm_dI0 ) );

    // second row
    mm_dmvx  = _mm_packs_epi32( _mm_loadu_si128( ( const __m128i * ) ( vX + dMvStride ) ), _mm_setzero_si128() );
    mm_dmvy  = _mm_packs_epi32( _mm_loadu_si128( ( const __m128i * ) ( vY + dMvStride ) ), _mm_setzero_si128() );
    mm_gradx = _mm_loadl_epi64( ( __m128i* ) ( gX + gradStride ) );
    mm_grady = _mm_loadl_epi64( ( __m128i* ) ( gY + gradStride ) );
    mm_dI    = _mm_madd_epi16 ( _mm_unpacklo_epi16( mm_dmvx, mm_dmvy ), _mm_unpacklo_epi16( mm_gradx, mm_grady ) );
    mm_dI    = _mm_min_epi32  ( mm_dimax, _mm_max_epi32( mm_dimin, mm_dI ) );

    // combine both rows
    mm_dI = _mm_packs_epi32( mm_dI0, mm_dI );
    mm_dI = _mm_add_epi16  ( _mm_unpacklo_epi64( _mm_loadl_epi64( ( const __m128i * )src ), _mm_loadl_epi64( ( const __m128i * )( src + srcStride ) ) ), mm_dI );
    if (!bi)
    {
      mm_dI = _mm_srai_epi16(_mm_adds_epi16(mm_dI, mm_offset), shiftNum);
      mm_dI = _mm_min_epi16(vibdimax, _mm_max_epi16(vibdimin, mm_dI));
    }

    _mm_storel_epi64( ( __m128i * )  dst,                                   mm_dI );
    _mm_storel_epi64( ( __m128i * )( dst + dstStride ), _mm_unpackhi_epi64( mm_dI, mm_dI ) );

    dMvX   += (dMvStride  << 1);
    dMvY   += (dMvStride  << 1);
    gradX  += (gradStride << 1);
    gradY  += (gradStride << 1);
    srcPel += (srcStride  << 1);
    dstPel += (dstStride  << 1);
  }
#endif
}


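// Rounds a vector of delta-MV components by nShift with ties broken toward zero
// (matching (v + (1 << (nShift - 1)) - (v > 0)) >> nShift) and clamps the result
// to [-dmvLimit, dmvLimit]. Dispatches to AVX-512, AVX2 or SSE code paths.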
template< X86_VEXT vext >
void roundIntVector_SIMD(int* v, int size, unsigned int nShift, const int dmvLimit)
{
  CHECKD(size % 16 != 0, "Size must be multiple of 16!");
#ifdef USE_AVX512
  if (vext >= AVX512 && size >= 16)
  {
    __m512i dMvMin = _mm512_set1_epi32(-dmvLimit);
    __m512i dMvMax = _mm512_set1_epi32(dmvLimit);
    __m512i nOffset = _mm512_set1_epi32((1 << (nShift - 1)));
    __m512i vones = _mm512_set1_epi32(1);
    __m512i vzero = _mm512_setzero_si512();
    for (int i = 0; i < size; i += 16, v += 16)
    {
      __m512i src = _mm512_loadu_si512(v);
      __mmask16 mask = _mm512_cmpge_epi32_mask(src, vzero);
      src = _mm512_add_epi32(src, nOffset);
      __m512i dst = _mm512_srai_epi32(_mm512_mask_sub_epi32(src, mask, src, vones), nShift);
      dst = _mm512_min_epi32(dMvMax, _mm512_max_epi32(dMvMin, dst));
      _mm512_storeu_si512(v, dst);
    }
  }
  else
#endif
#ifdef USE_AVX2
  if (vext >= AVX2 && size >= 8)
  {
    __m256i dMvMin = _mm256_set1_epi32(-dmvLimit);
    __m256i dMvMax = _mm256_set1_epi32(dmvLimit);
    __m256i nOffset = _mm256_set1_epi32(1 << (nShift - 1));
    __m256i vzero = _mm256_setzero_si256();
    for (int i = 0; i < size; i += 8, v += 8)
    {
      __m256i src = _mm256_lddqu_si256((__m256i*)v);
      __m256i of  = _mm256_cmpgt_epi32(src, vzero);
      __m256i dst = _mm256_srai_epi32(_mm256_add_epi32(_mm256_add_epi32(src, nOffset), of), nShift);
      dst = _mm256_min_epi32(dMvMax, _mm256_max_epi32(dMvMin, dst));
      _mm256_storeu_si256((__m256i*)v, dst);
    }
  }
  else
#endif
  {
    __m128i dMvMin = _mm_set1_epi32(-dmvLimit);
    __m128i dMvMax = _mm_set1_epi32(dmvLimit);
    __m128i nOffset = _mm_set1_epi32((1 << (nShift - 1)));
    __m128i vzero = _mm_setzero_si128();
    for (int i = 0; i < size; i += 4, v += 4)
    {
      __m128i src = _mm_loadu_si128((__m128i*)v);
      __m128i of  = _mm_cmpgt_epi32(src, vzero);
      __m128i dst = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(src, nOffset), of), nShift);
      dst = _mm_min_epi32(dMvMax, _mm_max_epi32(dMvMin, dst));
      _mm_storeu_si128((__m128i*)v, dst);
    }
  }
}

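// Computes horizontal and vertical gradients as central differences of samples
// pre-shifted right by shift1. With PAD (the BDOF path) the one-sample gradient
// border is replicated from the nearest inner column/row; without PAD (the PROF
// path) a fixed 4x4 block with gradient stride 4 and source stride 6 is assumed.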
template< X86_VEXT vext, bool PAD = true>
void gradFilter_SSE( int16_t* src, ptrdiff_t _srcStride, int width, int height, ptrdiff_t _gradStride, int16_t* gradX, int16_t* gradY, const int bitDepth)
{
  const int widthInside      = PAD ? width  - 2 * BIO_EXTEND_SIZE : 4;
  const int heightInside     = PAD ? height - 2 * BIO_EXTEND_SIZE : 4;
  const ptrdiff_t gradStride = PAD ? _gradStride                  : 4;
  const ptrdiff_t srcStride  = PAD ? _srcStride                   : 6;

  int16_t* srcTmp   = PAD ? src   + srcStride  + 1 : src;
  int16_t* gradXTmp = PAD ? gradX + gradStride + 1 : gradX;
  int16_t* gradYTmp = PAD ? gradY + gradStride + 1 : gradY;

  const int shift1 = std::max<int>( 6, bitDepth - 6 );

  CHECKD( gradStride != _gradStride, "Wrong PROF stride!" );
  CHECKD( srcStride  != _srcStride,  "Wrong PROF stride!" );

#if USE_AVX2
  if( PAD && ( widthInside & 15 ) == 0 && vext >= AVX2 )
  {
    for( int y = 0; y < heightInside; y++ )
    {
      for( int x = 0; x < widthInside; x += 16 )
      {
        __m256i mmPixTop    = _mm256_srai_epi16( _mm256_loadu_si256( ( __m256i * ) ( srcTmp + x - srcStride ) ), shift1 );
        __m256i mmPixBottom = _mm256_srai_epi16( _mm256_loadu_si256( ( __m256i * ) ( srcTmp + x + srcStride ) ), shift1 );
        __m256i mmPixLeft   = _mm256_srai_epi16( _mm256_loadu_si256( ( __m256i * ) ( srcTmp + x - 1 ) ),         shift1 );
        __m256i mmPixRight  = _mm256_srai_epi16( _mm256_loadu_si256( ( __m256i * ) ( srcTmp + x + 1 ) ),         shift1 );

        __m256i mmGradVer = _mm256_sub_epi16( mmPixBottom, mmPixTop );
        __m256i mmGradHor = _mm256_sub_epi16( mmPixRight, mmPixLeft );

        _mm256_storeu_si256( ( __m256i * )( gradYTmp + x ), mmGradVer );
        _mm256_storeu_si256( ( __m256i * )( gradXTmp + x ), mmGradHor );
      }

      gradXTmp[widthInside] = gradXTmp[widthInside - 1];
      gradYTmp[widthInside] = gradYTmp[widthInside - 1];
      gradXTmp[-1]          = gradXTmp[0];
      gradYTmp[-1]          = gradYTmp[0];

      gradXTmp += gradStride;
      gradYTmp += gradStride;
      srcTmp   += srcStride;
    }
  }
  else
#endif
  if( PAD && ( widthInside & 7 ) == 0 )
  {
    for( int y = 0; y < heightInside; y++ )
    {
      for( int x = 0; x < widthInside; x += 8 )
      {
        __m128i mmPixTop    = _mm_srai_epi16( _mm_loadu_si128( ( __m128i* )( srcTmp + x - srcStride ) ), shift1 );
        __m128i mmPixBottom = _mm_srai_epi16( _mm_loadu_si128( ( __m128i* )( srcTmp + x + srcStride ) ), shift1 );
        __m128i mmPixLeft   = _mm_srai_epi16( _mm_loadu_si128( ( __m128i* )( srcTmp + x - 1 ) ),         shift1 );
        __m128i mmPixRight  = _mm_srai_epi16( _mm_loadu_si128( ( __m128i* )( srcTmp + x + 1 ) ),         shift1 );

        __m128i mmGradVer = _mm_sub_epi16( mmPixBottom, mmPixTop );
        __m128i mmGradHor = _mm_sub_epi16( mmPixRight, mmPixLeft );

        _mm_storeu_si128((__m128i *) (gradYTmp + x), mmGradVer);
        _mm_storeu_si128((__m128i *) (gradXTmp + x), mmGradHor);
      }

      if( PAD )
      {
        gradXTmp[widthInside] = gradXTmp[widthInside - 1];
        gradYTmp[widthInside] = gradYTmp[widthInside - 1];
        gradXTmp[-1]          = gradXTmp[0];
        gradYTmp[-1]          = gradYTmp[0];
      }

      gradXTmp += gradStride;
      gradYTmp += gradStride;
      srcTmp += srcStride;
    }
  }
  else
  {
    CHECK( widthInside != 4, "Width needs to be '4'!" );

    for( int y = 0; y < ( PAD ? heightInside : 4 ); y++ )
    {
      __m128i mmPixTop    = _mm_srai_epi16( _mm_loadl_epi64( ( __m128i* )( srcTmp - srcStride ) ), shift1 );
      __m128i mmPixBottom = _mm_srai_epi16( _mm_loadl_epi64( ( __m128i* )( srcTmp + srcStride ) ), shift1 );
      __m128i mmPixLeft   = _mm_srai_epi16( _mm_loadl_epi64( ( __m128i* )( srcTmp - 1 ) ),         shift1 );
      __m128i mmPixRight  = _mm_srai_epi16( _mm_loadl_epi64( ( __m128i* )( srcTmp + 1 ) ),         shift1 );

      __m128i mmGradVer = _mm_sub_epi16( mmPixBottom, mmPixTop );
      __m128i mmGradHor = _mm_sub_epi16( mmPixRight, mmPixLeft );

      _mm_storel_epi64( ( __m128i * )( gradYTmp ), mmGradVer );
      _mm_storel_epi64( ( __m128i * )( gradXTmp ), mmGradHor );

      if( PAD )
      {
        gradXTmp[widthInside] = gradXTmp[widthInside - 1];
        gradYTmp[widthInside] = gradYTmp[widthInside - 1];
        gradXTmp[-1]          = gradXTmp[0];
        gradYTmp[-1]          = gradYTmp[0];
      }

      gradXTmp += gradStride;
      gradYTmp += gradStride;
      srcTmp   += srcStride;
    }
  }
#if USE_AVX2

  _mm256_zeroupper();
#endif

  if( PAD )
  {
    gradXTmp = gradX + gradStride;
    gradYTmp = gradY + gradStride;

    ::memcpy( gradXTmp + heightInside * gradStride, gradXTmp + ( heightInside - 1 ) * gradStride, sizeof( int16_t ) * ( width ) );
    ::memcpy( gradYTmp + heightInside * gradStride, gradYTmp + ( heightInside - 1 ) * gradStride, sizeof( int16_t ) * ( width ) );
    ::memcpy( gradXTmp - gradStride, gradXTmp, sizeof( int16_t ) * ( width ) );
    ::memcpy( gradYTmp - gradStride, gradYTmp, sizeof( int16_t ) * ( width ) );
  }
}

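// Binds the InterPrediction function pointers to the SIMD implementations above
// for the selected X86_VEXT level.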
template<X86_VEXT vext>
void InterPrediction::_initInterPredictionX86()
{
  BiOptFlow       = BiOptFlowCoreSIMD  <vext>;
  PaddBIO         = PaddBIO_SIMD       <vext>;
  BioGradFilter   = gradFilter_SSE     <vext, true>;
  profGradFilter  = gradFilter_SSE     <vext, false>;
  applyPROF[0]    = applyPROF_SSE      <vext, false>;
  applyPROF[1]    = applyPROF_SSE      <vext, true>;
  roundIntVector  = roundIntVector_SIMD<vext>;
}

template void InterPrediction::_initInterPredictionX86<SIMDX86>();

#endif // TARGET_SIMD_X86
#endif // ENABLE_SIMD_OPT_BIO

}   // namespace vvdec