/* -----------------------------------------------------------------------------
The copyright in this software is being made available under the BSD
License, included below. No patent rights, trademark rights and/or
other Intellectual Property Rights other than the copyrights concerning
the Software are granted under this license.

For any license concerning other Intellectual Property rights than the software,
especially patent licenses, a separate Agreement needs to be closed.
For more information please contact:

Fraunhofer Heinrich Hertz Institute
Einsteinufer 37
10587 Berlin, Germany
www.hhi.fraunhofer.de/vvc
vvc@hhi.fraunhofer.de

Copyright (c) 2018-2021, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 * Redistributions of source code must retain the above copyright notice,
   this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.
 * Neither the name of Fraunhofer nor the names of its contributors may
   be used to endorse or promote products derived from this software without
   specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
THE POSSIBILITY OF SUCH DAMAGE.


------------------------------------------------------------------------------------------- */

/** \file     TrafoX86.h
    \brief    SIMD trafo
*/

//! \ingroup CommonLib
//! \{


#include "CommonLib/CommonDef.h"
#include "CommonDefX86.h"

#include "TrQuant_EMT.h"

namespace vvdec
{

#if ENABLE_SIMD_TCOEFF_OPS
#ifdef TARGET_SIMD_X86

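// fastInv_SSE: SIMD core of the inverse transform (partial butterfly). For every pair of
// transform-matrix rows k and k+1 it accumulates it[k][col] * src[k][line] +
// it[k+1][col] * src[k+1][line] into dst, processing up to four input lines per iteration
// and skipping line groups whose source coefficients are all zero.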
template< X86_VEXT vext, int W >
void fastInv_SSE( const TMatrixCoeff* it, const TCoeff* src, TCoeff* dst, unsigned trSize, unsigned lines, unsigned reducedLines, unsigned rows )
{
  unsigned maxLoopL = std::min<int>( reducedLines, 4 );

#if USE_AVX2
  if( W >= 8 && vext >= AVX2 )
  {
    if( ( trSize & 15 ) == 0 )
    {
      unsigned trLoops = trSize >> 4;

      for( int k = 0; k < rows; k += 2 )
      {
        TCoeff* dstPtr = dst;

        const TCoeff* srcPtr0 = &src[  k       * lines];
        const TCoeff* srcPtr1 = &src[( k + 1 ) * lines];

        __m256i vsrc1v[4][2];

        const TMatrixCoeff* itPtr0 = &it[  k       * trSize];
        const TMatrixCoeff* itPtr1 = &it[( k + 1 ) * trSize];

        for( int col = 0; col < trLoops; col++, itPtr0 += 16, itPtr1 += 16 )
        {
#if defined( _MSC_VER ) && _MSC_VER > 1900
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( const __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
#else
          __m256i vit16_0 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( __m256i * ) itPtr0 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
          __m256i vit16_1 = _mm256_permute4x64_epi64( _mm256_stream_load_si256( ( __m256i * ) itPtr1 ), ( 0 << 0 ) + ( 1 << 4 ) + ( 2 << 2 ) + ( 3 << 6 ) );
#endif

          vsrc1v[col][0] = _mm256_unpacklo_epi16( vit16_0, vit16_1 );
          vsrc1v[col][1] = _mm256_unpackhi_epi16( vit16_0, vit16_1 );
        }

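        // Gather the two source coefficients (rows k and k+1) of up to four lines, saturate
        // them to 16 bit and interleave them so that each 32-bit element of xscale holds the
        // coefficient pair of one line.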
        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
        {
          __m128i xscale = maxLoopL == 4
                             ? _mm_packs_epi32( _mm_load_si128 ( ( const __m128i* ) srcPtr0 ), _mm_load_si128 ( ( const __m128i* ) srcPtr1 ) )
                             : _mm_packs_epi32( _mm_loadl_epi64( ( const __m128i* ) srcPtr0 ), _mm_loadl_epi64( ( const __m128i* ) srcPtr1 ) );
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );

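          // If all gathered coefficients are zero, this group of lines contributes nothing
          // and the accumulation can be skipped.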
          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }

          for( int l = 0; l < maxLoopL; l++ )
          {
            __m256i vscale = _mm256_broadcastd_epi32( xscale );
            xscale         = _mm_bsrli_si128( xscale, 4 );

            for( int col = 0; col < trLoops; col++, dstPtr += 16 )
            {
              __m256i vsrc0 = _mm256_load_si256( ( const __m256i * ) dstPtr );

              __m256i vsrc1 = vsrc1v[col][0];
              vsrc1         = _mm256_madd_epi16( vsrc1, vscale );
              vsrc0         = _mm256_add_epi32 ( vsrc0, vsrc1 );

              _mm256_store_si256( ( __m256i * ) dstPtr, vsrc0 );

              vsrc0 = _mm256_load_si256( ( const __m256i * ) &dstPtr[8] );

              vsrc1 = vsrc1v[col][1];
              vsrc1 = _mm256_madd_epi16( vsrc1, vscale );
              vsrc0 = _mm256_add_epi32 ( vsrc0, vsrc1 );

              _mm256_store_si256( ( __m256i * ) &dstPtr[8], vsrc0 );
            }
          }
        }
      }
    }
    else
    {
      for( int k = 0; k < rows; k += 2 )
      {
        TCoeff* dstPtr = dst;

        const TCoeff* srcPtr0 = &src[  k       * lines];
        const TCoeff* srcPtr1 = &src[( k + 1 ) * lines];

        const TMatrixCoeff* itPtr0 = &it[  k       * trSize];
        const TMatrixCoeff* itPtr1 = &it[( k + 1 ) * trSize];

        __m256i vit;

        {
#if defined( _MSC_VER ) && _MSC_VER > 1900
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
#else
          __m256i vsrc1 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr0 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
#endif
#if defined( _MSC_VER ) && _MSC_VER > 1900
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( const __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
#else
          __m256i vsrc2 = _mm256_permute4x64_epi64( _mm256_castsi128_si256( _mm_stream_load_si128( ( __m128i * ) itPtr1 ) ), ( 0 << 0 ) + ( 1 << 4 ) );
#endif

          vit = _mm256_unpacklo_epi16( vsrc1, vsrc2 );
        }

        for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
        {
          __m128i xscale = maxLoopL == 4
                             ? _mm_packs_epi32( _mm_load_si128 ( ( const __m128i* ) srcPtr0 ), _mm_load_si128 ( ( const __m128i* ) srcPtr1 ) )
                             : _mm_packs_epi32( _mm_loadl_epi64( ( const __m128i* ) srcPtr0 ), _mm_loadl_epi64( ( const __m128i* ) srcPtr1 ) );
          xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );

          if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }

          for( int l = 0; l < maxLoopL; l++ )
          {
            __m256i vscale = _mm256_broadcastd_epi32( xscale );
            xscale         = _mm_bsrli_si128( xscale, 4 );

            for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
            {
              __m256i vsrc0 = _mm256_load_si256( ( const __m256i * ) dstPtr );
              __m256i vsrc1 = _mm256_madd_epi16( vit, vscale );
              vsrc0         = _mm256_add_epi32 ( vsrc0, vsrc1 );

              _mm256_store_si256( ( __m256i * ) dstPtr, vsrc0 );
            }
          }
        }
      }
    }
  }
#else
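  // SSE fallback: same accumulation scheme as the AVX2 branch above, but with 128-bit
  // registers, re-loading the interleaved transform coefficients for every processed line.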
  if( W >= 8 )
  {
    for( int k = 0; k < rows; k += 2 )
    {
      TCoeff* dstPtr = dst;

      const TCoeff* srcPtr0 = &src[  k       * lines];
      const TCoeff* srcPtr1 = &src[( k + 1 ) * lines];

      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
      {
        __m128i xscale = maxLoopL == 4
                           ? _mm_packs_epi32( _mm_load_si128 ( ( const __m128i* ) srcPtr0 ), _mm_load_si128 ( ( const __m128i* ) srcPtr1 ) )
                           : _mm_packs_epi32( _mm_loadl_epi64( ( const __m128i* ) srcPtr0 ), _mm_loadl_epi64( ( const __m128i* ) srcPtr1 ) );
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );

        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }

        for( int l = 0; l < maxLoopL; l++ )
        {
          const TMatrixCoeff* itPtr0 = &it[  k       * trSize];
          const TMatrixCoeff* itPtr1 = &it[( k + 1 ) * trSize];

          __m128i vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
          xscale         = _mm_bsrli_si128( xscale, 4 );

          for( int col = 0; col < trSize; col += 8, dstPtr += 8, itPtr0 += 8, itPtr1 += 8 )
          {
            __m128i vsrc0 = _mm_load_si128( ( const __m128i * ) dstPtr );
#if defined( _MSC_VER ) && _MSC_VER > 1900
            __m128i vit16_0 = _mm_stream_load_si128( ( const __m128i * ) itPtr0 );
            __m128i vit16_1 = _mm_stream_load_si128( ( const __m128i * ) itPtr1 );
#else
            __m128i vit16_0 = _mm_stream_load_si128( ( __m128i * ) itPtr0 );
            __m128i vit16_1 = _mm_stream_load_si128( ( __m128i * ) itPtr1 );
#endif

            __m128i vsrc1 = _mm_unpacklo_epi16( vit16_0, vit16_1 );

            vsrc1 = _mm_madd_epi16( vsrc1, vscale );
            vsrc0 = _mm_add_epi32 ( vsrc0, vsrc1 );

            _mm_store_si128( ( __m128i * ) dstPtr, vsrc0 );

            vsrc0 = _mm_load_si128( ( const __m128i * ) &dstPtr[4] );

            vsrc1 = _mm_unpackhi_epi16( vit16_0, vit16_1 );

            vsrc1 = _mm_madd_epi16( vsrc1, vscale );
            vsrc0 = _mm_add_epi32 ( vsrc0, vsrc1 );

            _mm_store_si128( ( __m128i * ) &dstPtr[4], vsrc0 );
          }
        }
      }
    }
  }
#endif
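  // 4-wide path: trSize is fixed to 4, so one 128-bit register holds both interleaved
  // transform rows and is reused for all lines.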
  else if( W >= 4 )
  {
    CHECKD( trSize != 4, "trSize needs to be '4'!" );

    for( int k = 0; k < rows; k += 2 )
    {
      TCoeff* dstPtr = dst;

      const TCoeff* srcPtr0 = &src[  k       * lines];
      const TCoeff* srcPtr1 = &src[( k + 1 ) * lines];

      const TMatrixCoeff* itPtr0 = &it[  k       * trSize];
      const TMatrixCoeff* itPtr1 = &it[( k + 1 ) * trSize];

      __m128i vit = _mm_unpacklo_epi16( _mm_loadl_epi64( ( const __m128i * ) itPtr0 ), _mm_loadl_epi64( ( const __m128i * ) itPtr1 ) );

      for( int i = 0; i < reducedLines; i += 4, srcPtr0 += maxLoopL, srcPtr1 += maxLoopL )
      {
        __m128i xscale = maxLoopL == 4
                           ? _mm_packs_epi32( _mm_load_si128 ( ( const __m128i* ) srcPtr0 ), _mm_load_si128 ( ( const __m128i* ) srcPtr1 ) )
                           : _mm_packs_epi32( _mm_loadl_epi64( ( const __m128i* ) srcPtr0 ), _mm_loadl_epi64( ( const __m128i* ) srcPtr1 ) );
        xscale = _mm_shuffle_epi8( xscale, _mm_setr_epi8( 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ) );

        if( _mm_test_all_zeros( xscale, xscale ) ) { dstPtr += ( trSize * maxLoopL ); continue; }

        for( int l = 0; l < maxLoopL; l++ )
        {
          __m128i vscale = _mm_set1_epi32( _mm_cvtsi128_si32( xscale ) );
          xscale         = _mm_bsrli_si128( xscale, 4 );

          for( int col = 0; col < trSize; col += 4, dstPtr += 4 )
          {
            __m128i vsrc0 = _mm_load_si128( ( const __m128i * ) dstPtr );
            __m128i vsrc1 = _mm_madd_epi16( vit, vscale );
            vsrc0         = _mm_add_epi32 ( vsrc0, vsrc1 );

            _mm_store_si128( ( __m128i * ) dstPtr, vsrc0 );
          }
        }
      }
    }
  }
  else
  {
    THROW( "Unsupported size" );
  }
#if USE_AVX2

  _mm256_zeroupper();
#endif
}

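// roundClip_SSE: in-place rounding shift and clipping of a width x height coefficient
// block: dst[x] = Clip3( outputMin, outputMax, ( dst[x] + round ) >> shift ).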
template< X86_VEXT vext, int W >
void roundClip_SSE( TCoeff *dst, unsigned width, unsigned height, unsigned stride, const TCoeff outputMin, const TCoeff outputMax, const TCoeff round, const TCoeff shift )
{
#if USE_AVX2
  if( W >= 8 && vext >= AVX2 )
  {
    __m256i vmin = _mm256_set1_epi32( outputMin );
    __m256i vmax = _mm256_set1_epi32( outputMax );
    __m256i vrnd = _mm256_set1_epi32( round );

    while( height-- )
    {
      for( int col = 0; col < width; col += 8 )
      {
        __m256i vdst = _mm256_load_si256( ( __m256i * ) &dst[col] );
        vdst         = _mm256_add_epi32 ( vdst, vrnd );
        vdst         = _mm256_srai_epi32( vdst, shift );
        vdst         = _mm256_max_epi32 ( vdst, vmin );
        vdst         = _mm256_min_epi32 ( vdst, vmax );
        _mm256_store_si256( ( __m256i * ) &dst[col], vdst );
      }

      dst += stride;
    }
  }
  else
#endif
  if( W >= 4 )
  {
    __m128i vmin = _mm_set1_epi32( outputMin );
    __m128i vmax = _mm_set1_epi32( outputMax );
    __m128i vrnd = _mm_set1_epi32( round );

    while( height-- )
    {
      for( int col = 0; col < width; col += 4 )
      {
        __m128i vdst = _mm_load_si128 ( ( __m128i * ) &dst[col] );
        vdst         = _mm_add_epi32  ( vdst, vrnd );
        vdst         = _mm_srai_epi32 ( vdst, shift );
        vdst         = _mm_max_epi32  ( vdst, vmin );
        vdst         = _mm_min_epi32  ( vdst, vmax );
        _mm_store_si128( ( __m128i * ) &dst[col], vdst );
      }

      dst += stride;
    }
  }
  else
  {
    THROW( "Unsupported size" );
  }
#if USE_AVX2

  _mm256_zeroupper();
#endif
}

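// cpyResi_SSE: copies a width x height block of 32-bit residual coefficients into the
// 16-bit Pel destination buffer, narrowing each value; src is read contiguously while dst
// advances by 'stride' per row.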
template< X86_VEXT vext, int W >
void cpyResi_SSE( const TCoeff* src, Pel* dst, ptrdiff_t stride, unsigned width, unsigned height )
{
#if USE_AVX2
  if( W >= 8 && vext >= AVX2 )
  {
    while( height-- )
    {
      for( int col = 0; col < width; col += 8 )
      {
        __m256i vsrc = _mm256_load_si256     ( ( const __m256i * ) &src[col] );
        __m128i vdst = _mm256_cvtepi32_epi16x( vsrc );
        _mm_storeu_si128( ( __m128i * ) &dst[col], vdst );
      }

      src += width;
      dst += stride;
    }
  }
  else
#endif
  if( W >= 4 )
  {
    __m128i vzero = _mm_setzero_si128();
    __m128i vdst;

    while( height-- )
    {
      for( int col = 0; col < width; col += 4 )
      {
        vdst = _mm_load_si128 ( ( const __m128i * ) &src[col] );
        vdst = _mm_packs_epi32( vdst, vzero );
        _mm_storel_epi64( ( __m128i * ) &dst[col], vdst );
      }

      src += width;
      dst += stride;
    }
  }
  else
  {
    THROW( "Unsupported size" );
  }
#if USE_AVX2

  _mm256_zeroupper();
#endif
}

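// Registers the SIMD implementations above in the TCoeffOps function-pointer table for the
// instantiated instruction-set extension.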
template<X86_VEXT vext>
void TCoeffOps::_initTCoeffOpsX86()
{
  cpyResi4     = cpyResi_SSE  <vext, 4>;
  cpyResi8     = cpyResi_SSE  <vext, 8>;
  roundClip4   = roundClip_SSE<vext, 4>;
  roundClip8   = roundClip_SSE<vext, 8>;
  fastInvCore4 = fastInv_SSE  <vext, 4>;
  fastInvCore8 = fastInv_SSE  <vext, 8>;
}

template void TCoeffOps::_initTCoeffOpsX86<SIMDX86>();

#endif // TARGET_SIMD_X86
#endif // ENABLE_SIMD_TCOEFF_OPS

}   // namespace vvdec

//! \}