/* -----------------------------------------------------------------------------
The copyright in this software is being made available under the BSD
License, included below. No patent rights, trademark rights and/or
other Intellectual Property Rights other than the copyrights concerning
the Software are granted under this license.

For any license concerning other Intellectual Property rights than the software,
especially patent licenses, a separate Agreement needs to be closed.
For more information please contact:

Fraunhofer Heinrich Hertz Institute
Einsteinufer 37
10587 Berlin, Germany
www.hhi.fraunhofer.de/vvc
vvc@hhi.fraunhofer.de

Copyright (c) 2018-2021, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 * Redistributions of source code must retain the above copyright notice,
   this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.
 * Neither the name of Fraunhofer nor the names of its contributors may
   be used to endorse or promote products derived from this software without
   specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
THE POSSIBILITY OF SUCH DAMAGE.


------------------------------------------------------------------------------------------- */

/** \file     TrafoX86.h
    \brief    SIMD trafo
*/

//! \ingroup CommonLib
//! \{


#include "CommonLib/CommonDef.h"
#include "CommonDefX86.h"

#include "TrQuant_EMT.h"

namespace vvdec
{

#if ENABLE_SIMD_TCOEFF_OPS
#ifdef TARGET_SIMD_X86

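// fastInv_SSE: SIMD core of the inverse transform. For each pair of transform-matrix
// rows (k, k+1), the 16-bit matrix coefficients are multiplied with the matching source
// coefficients of every line and accumulated into the 32-bit buffer 'dst' (the function
// adds to the existing contents of 'dst'). Source lines are visited in groups of up to
// four (maxLoopL); a group whose coefficients are zero for both rows is skipped.
// All loads and stores assume suitably aligned buffers.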
template< X86_VEXT vext, int W >
void fastInv_SSE( const TMatrixCoeff* it, const TCoeff* src, TCoeff* dst, unsigned trSize, unsigned lines, unsigned reducedLines, unsigned rows )
{
  unsigned maxLoopL = std::min<int>( reducedLines, 4 );

#if USE_AVX2
  if( W >= 8 && vext >= AVX2 )
  {
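    // AVX2: transform sizes that are a multiple of 16 columns take the wide loop below;
    // the remaining 8-column case is handled in the 'else' branch.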
    if( ( trSize & 15 ) == 0 )
    {
      unsigned trLoops = trSize >> 4;

      for( int k = 0; k < rows; k += 2 )
      {
              TCoeff* dstPtr =  dst;

        const TCoeff* srcPtr0 = &src[ k      * lines];
        const TCoeff* srcPtr1 = &src[(k + 1) * lines];

        __m256i vsrc1v[4][2];

        const TMatrixCoeff*  itPtr0 = &it[ k      * trSize];
        const TMatrixCoeff*  itPtr1 = &it[(k + 1) * trSize];

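        // Pre-load both matrix rows of this row pair and interleave their 16-bit
        // coefficients per 16-column block, so each _mm256_madd_epi16 below combines
        // the pair { it[k][col], it[k+1][col] } with one line's coefficient pair.
        // The 64-bit lane permute (imm 0xD8) keeps the columns in ascending order
        // after unpacklo/unpackhi across the two 128-bit lanes.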
        for( int col = 0; col < trLoops; col++, itPtr0 += 16, itPtr1 += 16 )
        {
#if defined( _MSC_VER ) && _MSC_VER > 1900
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
#else
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( (       __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( (       __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
#endif

          vsrc1v[col][0] = _mm256_unpacklo_epi16( vit16_0, vit16_1 );
          vsrc1v[col][1] = _mm256_unpackhi_epi16( vit16_0, vit16_1 );
        }

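        // Process the source lines in groups of maxLoopL (up to 4): pack the 32-bit
        // coefficients of rows k and k+1 for these lines into 16-bit pairs; if they
        // are all zero, the whole group contributes nothing and is skipped.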
        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
        {
          __m128i xscale = maxLoopL == 4
                         ? _mm_packs_epi32( _mm_load_si128 ( ( const __m128i* )srcPtr0 ), _mm_load_si128 ( ( const __m128i* )srcPtr1 ) )
                         : _mm_packs_epi32( _mm_loadl_epi64( ( const __m128i* )srcPtr0 ), _mm_loadl_epi64( ( const __m128i* )srcPtr1 ) );
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );

          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }

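          // Per line: broadcast its (row k, row k+1) coefficient pair and
          // multiply-accumulate it against the interleaved matrix columns.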
          for( int l = 0; l < maxLoopL; l++ )
          {
            __m256i
            vscale = _mm256_broadcastd_epi32( xscale );
            xscale = _mm_bsrli_si128( xscale, 4 );

            for( int col = 0; col < trLoops; col++, dstPtr += 16 )
            {
              __m256i vsrc0 = _mm256_load_si256       ( ( const __m256i * ) dstPtr );

              __m256i
              vsrc1 = vsrc1v[col][0];
              vsrc1 = _mm256_madd_epi16    ( vsrc1, vscale );
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );

              _mm256_store_si256           ( ( __m256i * ) dstPtr, vsrc0 );

              vsrc0 = _mm256_load_si256    ( ( const __m256i * ) &dstPtr[8] );

              vsrc1 = vsrc1v[col][1];
              vsrc1 = _mm256_madd_epi16    ( vsrc1, vscale );
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );

              _mm256_store_si256           ( ( __m256i * ) &dstPtr[8], vsrc0 );
            }
          }
        }
      }
    }
    else
    {
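      // trSize is not a multiple of 16 here (i.e. 8 columns): each matrix row fits a
      // single 128-bit streaming load; the interleaved row pair 'vit' covers all
      // eight columns and is reused for every line.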
      for( int k = 0; k < rows; k += 2 )
      {
              TCoeff* dstPtr  =  dst;

        const TCoeff* srcPtr0 = &src[ k      * lines];
        const TCoeff* srcPtr1 = &src[(k + 1) * lines];

        const TMatrixCoeff*  itPtr0 = &it[  k      * trSize];
        const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];

        __m256i vit;

        {
#if defined( _MSC_VER ) && _MSC_VER > 1900
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
#else
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
#endif
#if defined( _MSC_VER ) && _MSC_VER > 1900
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
#else
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
#endif

          vit = _mm256_unpacklo_epi16( vsrc1, vsrc2 );
        }

        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
        {
          __m128i xscale = maxLoopL == 4
                         ? _mm_packs_epi32( _mm_load_si128 ( ( const __m128i* )srcPtr0 ), _mm_load_si128 ( ( const __m128i* )srcPtr1 ) )
                         : _mm_packs_epi32( _mm_loadl_epi64( ( const __m128i* )srcPtr0 ), _mm_loadl_epi64( ( const __m128i* )srcPtr1 ) );
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );

          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }

          for( int l = 0; l < maxLoopL; l++ )
          {
            __m256i
            vscale = _mm256_broadcastd_epi32( xscale );
            xscale = _mm_bsrli_si128( xscale, 4 );

            for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
            {
              __m256i
              vsrc0 = _mm256_load_si256    ( ( const __m256i * ) dstPtr );
              __m256i
              vsrc1 = _mm256_madd_epi16    ( vit, vscale );
              vsrc0 = _mm256_add_epi32     ( vsrc0, vsrc1 );

              _mm256_store_si256           ( ( __m256i * ) dstPtr, vsrc0 );
            }
          }
        }
      }
    }
  }
#else
  if( W >= 8 )
  {
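    // SSE (non-AVX2) path for trSize >= 8: the matrix rows are streamed and
    // interleaved on the fly inside the column loop, eight columns at a time.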
    for( int k = 0; k < rows; k += 2 )
    {
            TCoeff* dstPtr  =  dst;

      const TCoeff* srcPtr0 = &src[ k      * lines];
      const TCoeff* srcPtr1 = &src[(k + 1) * lines];

      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
      {
        __m128i xscale = maxLoopL == 4
                        ? _mm_packs_epi32( _mm_load_si128 ( ( const __m128i* )srcPtr0 ), _mm_load_si128 ( ( const __m128i* )srcPtr1 ) )
                        : _mm_packs_epi32( _mm_loadl_epi64( ( const __m128i* )srcPtr0 ), _mm_loadl_epi64( ( const __m128i* )srcPtr1 ) );
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );

        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }

        for( int l = 0; l < maxLoopL; l++ )
        {
          const TMatrixCoeff*  itPtr0 = &it[k      * trSize];
          const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];

          __m128i
          vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
          xscale = _mm_bsrli_si128( xscale, 4 );

          for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
          {
            __m128i vsrc0   = _mm_load_si128       ( ( const __m128i * ) dstPtr );
#if defined( _MSC_VER ) && _MSC_VER > 1900
            __m128i vit16_0 = _mm_stream_load_si128( ( const __m128i * ) itPtr0 );
            __m128i vit16_1 = _mm_stream_load_si128( ( const __m128i * ) itPtr1 );
#else
            __m128i vit16_0 = _mm_stream_load_si128( (       __m128i * ) itPtr0 );
            __m128i vit16_1 = _mm_stream_load_si128( (       __m128i * ) itPtr1 );
#endif

            __m128i vsrc1 = _mm_unpacklo_epi16( vit16_0, vit16_1 );

            vsrc1 = _mm_madd_epi16 ( vsrc1, vscale );
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );

            _mm_store_si128        ( ( __m128i * ) dstPtr, vsrc0 );

            vsrc0 = _mm_load_si128 ( ( const __m128i * ) &dstPtr[4] );

            vsrc1 = _mm_unpackhi_epi16( vit16_0, vit16_1 );

            vsrc1 = _mm_madd_epi16 ( vsrc1, vscale );
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );

            _mm_store_si128        ( ( __m128i * ) &dstPtr[4], vsrc0 );
          }
        }
      }
    }
  }
#endif
  else if( W >= 4 )
  {
    CHECKD( trSize != 4, "trSize needs to be '4'!" );

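    // 4-point transform: one 64-bit load per matrix row provides all four columns,
    // interleaved once into 'vit' and reused for every line.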
    for( int k = 0; k < rows; k += 2 )
    {
            TCoeff* dstPtr  =  dst;

      const TCoeff* srcPtr0 = &src[ k      * lines];
      const TCoeff* srcPtr1 = &src[(k + 1) * lines];

      const TMatrixCoeff*  itPtr0 = &it[  k       * trSize];
      const TMatrixCoeff*  itPtr1 = &it[( k + 1 ) * trSize];

      __m128i vit = _mm_unpacklo_epi16( _mm_loadl_epi64( ( const __m128i * ) itPtr0 ), _mm_loadl_epi64( ( const __m128i * ) itPtr1 ) );

      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
      {
        __m128i xscale = maxLoopL == 4
                        ? _mm_packs_epi32( _mm_load_si128 ( ( const __m128i* )srcPtr0 ), _mm_load_si128 ( ( const __m128i* )srcPtr1 ) )
                        : _mm_packs_epi32( _mm_loadl_epi64( ( const __m128i* )srcPtr0 ), _mm_loadl_epi64( ( const __m128i* )srcPtr1 ) );
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );

        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }

        for( int l = 0; l < maxLoopL; l++ )
        {
          __m128i
          vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
          xscale = _mm_bsrli_si128( xscale, 4 );

          for( int col = 0; col < trSize; col += 4, dstPtr += 4 )
          {
            __m128i
            vsrc0 = _mm_load_si128 ( ( const __m128i * ) dstPtr );
            __m128i
            vsrc1 = _mm_madd_epi16 ( vit, vscale );
            vsrc0 = _mm_add_epi32  ( vsrc0, vsrc1 );

            _mm_store_si128        ( ( __m128i * ) dstPtr, vsrc0 );
          }
        }
      }
    }
  }
  else
  {
    THROW( "Unsupported size" );
  }
#if USE_AVX2

  _mm256_zeroupper();
#endif
}

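// roundClip_SSE: in-place rounding shift and clipping of a width x height block of
// 32-bit coefficients: dst[x] = min( outputMax, max( outputMin, ( dst[x] + round ) >> shift ) ).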
template< X86_VEXT vext, int W >
void roundClip_SSE( TCoeff *dst, unsigned width, unsigned height, unsigned stride, const TCoeff outputMin, const TCoeff outputMax, const TCoeff round, const TCoeff shift )
{
#if USE_AVX2
  if( W >= 8 && vext >= AVX2 )
  {
    __m256i vmin = _mm256_set1_epi32( outputMin );
    __m256i vmax = _mm256_set1_epi32( outputMax );
    __m256i vrnd = _mm256_set1_epi32( round );

    while( height-- )
    {
      for( int col = 0; col < width; col += 8 )
      {
        __m256i
        vdst = _mm256_load_si256( ( __m256i * ) &dst[col] );
        vdst = _mm256_add_epi32 ( vdst, vrnd );
        vdst = _mm256_srai_epi32( vdst, shift );
        vdst = _mm256_max_epi32 ( vdst, vmin );
        vdst = _mm256_min_epi32 ( vdst, vmax );
        _mm256_store_si256      ( ( __m256i * ) &dst[col], vdst );
      }

      dst += stride;
    }
  }
  else
#endif
  if( W >= 4 )
  {
    __m128i vmin = _mm_set1_epi32( outputMin );
    __m128i vmax = _mm_set1_epi32( outputMax );
    __m128i vrnd = _mm_set1_epi32( round );

    while( height-- )
    {
      for( int col = 0; col < width; col += 4 )
      {
        __m128i
        vdst = _mm_load_si128 ( ( __m128i * ) &dst[col] );
        vdst = _mm_add_epi32  ( vdst, vrnd );
        vdst = _mm_srai_epi32 ( vdst, shift );
        vdst = _mm_max_epi32  ( vdst, vmin );
        vdst = _mm_min_epi32  ( vdst, vmax );
        _mm_store_si128       ( ( __m128i * ) &dst[col], vdst );
      }

      dst += stride;
    }
  }
  else
  {
    THROW( "Unsupported size" );
  }
#if USE_AVX2

  _mm256_zeroupper();
#endif
}

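// cpyResi_SSE: copy a width x height block of 32-bit residual coefficients from the
// contiguous buffer 'src' into the strided 16-bit Pel buffer 'dst', narrowing each
// value to 16 bits.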
template< X86_VEXT vext, int W >
void cpyResi_SSE( const TCoeff* src, Pel* dst, ptrdiff_t stride, unsigned width, unsigned height )
{
#if USE_AVX2
  if( W >= 8 && vext >= AVX2 )
  {
    while( height-- )
    {
      for( int col = 0; col < width; col += 8 )
      {
        __m256i
        vsrc = _mm256_load_si256        ( ( const __m256i * ) &src[col] );
        __m128i
        vdst = _mm256_cvtepi32_epi16x   ( vsrc );
        _mm_storeu_si128                ( ( __m128i * ) &dst[col], vdst );
      }

      src += width;
      dst += stride;
    }
  }
  else
#endif
  if( W >= 4 )
  {
    __m128i vzero = _mm_setzero_si128();
    __m128i vdst;

    while( height-- )
    {
      for( int col = 0; col < width; col += 4 )
      {
        vdst = _mm_load_si128 ( ( const __m128i * ) &src[col] );
        vdst = _mm_packs_epi32( vdst, vzero );
        _mm_storel_epi64      ( ( __m128i * ) &dst[col], vdst );
      }

      src += width;
      dst += stride;
    }
  }
  else
  {
    THROW( "Unsupported size" );
  }
#if USE_AVX2

  _mm256_zeroupper();
#endif
}

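// _initTCoeffOpsX86: hook the SIMD kernels above into the TCoeffOps dispatch table;
// instantiated below for the SIMD extension level selected at build time (SIMDX86).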
template<X86_VEXT vext>
void TCoeffOps::_initTCoeffOpsX86()
{
  cpyResi4     = cpyResi_SSE  <vext, 4>;
  cpyResi8     = cpyResi_SSE  <vext, 8>;
  roundClip4   = roundClip_SSE<vext, 4>;
  roundClip8   = roundClip_SSE<vext, 8>;
  fastInvCore4 = fastInv_SSE  <vext, 4>;
  fastInvCore8 = fastInv_SSE  <vext, 8>;
}

template void TCoeffOps::_initTCoeffOpsX86<SIMDX86>();

#endif // TARGET_SIMD_X86
#endif

}