1 /* -----------------------------------------------------------------------------
2 The copyright in this software is being made available under the BSD
3 License, included below. No patent rights, trademark rights and/or
4 other Intellectual Property Rights other than the copyrights concerning
5 the Software are granted under this license.
6
7 For any license concerning other Intellectual Property rights than the software,
8 especially patent licenses, a separate Agreement needs to be closed.
9 For more information please contact:
10
11 Fraunhofer Heinrich Hertz Institute
12 Einsteinufer 37
13 10587 Berlin, Germany
14 www.hhi.fraunhofer.de/vvc
15 vvc@hhi.fraunhofer.de
16
17 Copyright (c) 2018-2021, Fraunhofer-Gesellschaft zur Förderung der angewandten Forschung e.V.
18 All rights reserved.
19
20 Redistribution and use in source and binary forms, with or without
21 modification, are permitted provided that the following conditions are met:
22
23 * Redistributions of source code must retain the above copyright notice,
24 this list of conditions and the following disclaimer.
25 * Redistributions in binary form must reproduce the above copyright notice,
26 this list of conditions and the following disclaimer in the documentation
27 and/or other materials provided with the distribution.
28 * Neither the name of Fraunhofer nor the names of its contributors may
29 be used to endorse or promote products derived from this software without
30 specific prior written permission.
31
32 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
33 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
34 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
35 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
36 BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
37 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
38 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
39 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
40 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
41 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
42 THE POSSIBILITY OF SUCH DAMAGE.
43
44
45 ------------------------------------------------------------------------------------------- */
46
47 /** \file IntraPredX86.h
48 \brief SIMD for IntraPrediction
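    \details SIMD (SSE/AVX2) implementations of angular prediction for luma and chroma,
             the PDPC sample filter, planar prediction, and the 4:2:0 luma downsampling
             used for cross-component intra prediction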
49 */
50
51 #include "CommonLib/CommonDef.h"
52 #include "CommonDefX86.h"
53 #include "CommonLib/IntraPrediction.h"
54
55 namespace vvdec
56 {
57
58 #if ENABLE_SIMD_OPT_INTRAPRED
59 #ifdef TARGET_SIMD_X86
60
61 //#define USE_AVX2
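// Note: USE_AVX2 is normally expected to come from the build setup (e.g. CommonDefX86.h) when an
// AVX2 target is enabled; the commented-out define above appears to be only a local testing override.
//
// IntraPredAngleChroma_SIMD: angular prediction for chroma, which uses a plain 2-tap linear
// interpolation between neighbouring reference samples instead of the 4-tap luma filter.
// A scalar sketch of what the vector code below computes (illustrative only, not compiled):
//
//   for( int y = 0; y < height; y++, deltaPos += intraPredAngle )
//   {
//     const int deltaInt   = deltaPos >> 5;   // integer sample offset (1/32-pel accuracy)
//     const int deltaFract = deltaPos & 31;   // fractional position
//     for( int x = 0; x < width; x++ )
//     {
//       pDst[y * dstStride + x] = ( ( 32 - deltaFract ) * pBorder[x + deltaInt + 1]
//                                 +        deltaFract   * pBorder[x + deltaInt + 2] + 16 ) >> 5;
//     }
//   }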
62 template< X86_VEXT vext, int W >
63 void IntraPredAngleChroma_SIMD(int16_t* pDst,const ptrdiff_t dstStride,int16_t* pBorder,int width,int height,int deltaPos,int intraPredAngle)
64 {
65 int deltaInt;
66 int deltaFract;
67 int refMainIndex;
68
69 __m128i voffset = _mm_set1_epi16(16);
70 if( W == 8 )
71 {
72 if( vext >= AVX2 )
73 {
74 #ifdef USE_AVX2
75 if (( width & 15 ) == 0 )
76 {
77 int deltaInt;
78 int deltaFract;
79 int refMainIndex;
80
81 __m256i voffset = _mm256_set1_epi16(16);
82 for (int k=0; k<height; k++) {
83
84 deltaInt = deltaPos >> 5;
85 deltaFract = deltaPos & (32 - 1);
86
87 __m256i vfract = _mm256_set1_epi16(deltaFract);
88 __m256i v32minfract = _mm256_set1_epi16(32-deltaFract);
89 // Do linear filtering
90 for (int l=0; l<width; l+=16) {
91 refMainIndex = l+ deltaInt+1;
92 __m256i vpred0 = _mm256_lddqu_si256((__m256i*)&pBorder[refMainIndex]);
93 __m256i vpred1 = _mm256_lddqu_si256((__m256i*)&pBorder[refMainIndex+1]);
94 vpred0 = _mm256_mullo_epi16(v32minfract, vpred0);
95 vpred1 = _mm256_mullo_epi16(vfract, vpred1);
96 __m256i vpred = _mm256_srli_epi16(_mm256_add_epi16(_mm256_add_epi16(vpred0, vpred1), voffset), 5);
97 _mm256_storeu_si256((__m256i*)&pDst[l], vpred);
98 }
99 pDst+=dstStride;
100 deltaPos += intraPredAngle;
101 }
102 }
103 else // width==8
104 {
105 for (int k=0; k<height; k++)
106 {
107 deltaInt = deltaPos >> 5;
108 deltaFract = deltaPos & (32 - 1);
109
110 __m128i vfract = _mm_set1_epi16(deltaFract);
111 __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
112 // Do linear filtering
113 for (int l=0; l<width; l+=8) {
114 refMainIndex = l+ deltaInt+1;
115 __m128i vpred0 = _mm_lddqu_si128((__m128i*)&pBorder[refMainIndex]);
116 __m128i vpred1 = _mm_lddqu_si128((__m128i*)&pBorder[refMainIndex+1]);
117 vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
118 vpred1 = _mm_mullo_epi16(vfract, vpred1);
119 __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
120 _mm_storeu_si128((__m128i*)&pDst[l], vpred);
121 }
122 deltaPos += intraPredAngle;
123
124 pDst+=dstStride;
125 }
126
127 }
128 #endif
129 } //AVX2
130 else
131 {
132 for (int k=0; k<height; k++) {
133 deltaInt = deltaPos >> 5;
134 deltaFract = deltaPos & (32 - 1);
135
136 __m128i vfract = _mm_set1_epi16(deltaFract);
137 __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
138 // Do linear filtering
139 for (int l=0; l<width; l+=8) {
140 refMainIndex = l+ deltaInt+1;
141 __m128i vpred0 = _mm_lddqu_si128((__m128i*)&pBorder[refMainIndex]);
142 __m128i vpred1 = _mm_lddqu_si128((__m128i*)&pBorder[refMainIndex+1]);
143 vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
144 vpred1 = _mm_mullo_epi16(vfract, vpred1);
145 __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
146 _mm_storeu_si128((__m128i*)&pDst[l], vpred);
147 }
148 deltaPos += intraPredAngle;
149
150 pDst+=dstStride;
151 }
152 }
153
154 }
155 else if( W == 4 )
156 {
157 for (int k=0; k<height; k++) {
158 deltaInt = deltaPos >> 5;
159 deltaFract = deltaPos & (32 - 1);
160
161 __m128i vfract = _mm_set1_epi16(deltaFract);
162 __m128i v32minfract = _mm_set1_epi16(32-deltaFract);
163 // Do linear filtering
164 refMainIndex = deltaInt+1;
165 __m128i vpred0 = _mm_lddqu_si128((__m128i*)&pBorder[refMainIndex]);
166 __m128i vpred1 = _mm_lddqu_si128((__m128i*)&pBorder[refMainIndex+1]);
167 vpred0 = _mm_mullo_epi16(v32minfract, vpred0);
168 vpred1 = _mm_mullo_epi16(vfract, vpred1);
169 __m128i vpred = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(vpred0, vpred1), voffset), 5);
170 _mm_storel_epi64( ( __m128i * )(pDst ), vpred);
171 deltaPos += intraPredAngle;
172 pDst+=dstStride;
173 }
174 }
175 else
176 {
177 THROW( "Unsupported size in IntraPredAngleChroma_SIMD" );
178 }
179 #if USE_AVX2
180
181 _mm256_zeroupper();
182 #endif
183 }
184
185
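// IntraPredAngleCore_SIMD: angular prediction for luma using the 4-tap interpolation filters
// passed in via 'ff' (4 coefficients per 1/32-pel phase). For each row y the fractional position
// deltaPos advances by intraPredAngle. A scalar sketch of the per-sample computation below
// (illustrative only, not compiled; "clip" stands for clamping to [clpRng.min(), clpRng.max()]):
//
//   const int           deltaInt   = deltaPos >> 5;
//   const int           deltaFract = deltaPos & 31;
//   const TFilterCoeff* f          = &ff[deltaFract << 2];
//   int sum = 32;                                        // rounding offset for the >> 6
//   for( int i = 0; i < 4; i++ )
//   {
//     sum += f[i] * refMain[deltaInt + x + i];
//   }
//   pDst[x] = useCubicFilter ? clip( sum >> 6 ) : Pel( sum >> 6 );   // clip only for the cubic filter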
186 template< X86_VEXT vext, int W >
187 void IntraPredAngleCore_SIMD(int16_t* pDstBuf,const ptrdiff_t dstStride,int16_t* refMain,int width,int height,int deltaPos,int intraPredAngle,const TFilterCoeff *ff,const bool useCubicFilter,const ClpRng& clpRng)
188 {
189 int16_t* pDst;
190
191 if( W == 8 )
192 {
193
194 if( vext >= AVX2 )
195 {
196 #ifdef USE_AVX2
197 __m256i shflmask1= _mm256_set_epi8(0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4,
198 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
199 __m256i offset = _mm256_set1_epi32( 32 );
200
201 if (( width & 15 ) == 0 )
202 {
203 __m256i vbdmin,vbdmax;
204
205 if (useCubicFilter)
206 {
207 vbdmin = _mm256_set1_epi16( clpRng.min() );
208 vbdmax = _mm256_set1_epi16( clpRng.max() );
209 }
210
211 for (int y = 0; y<height; y++ )
212 {
213 int deltaInt = deltaPos >> 5;
214 int deltaFract = deltaPos & (32 - 1);
215 int refMainIndex = deltaInt + 1;
216 pDst=&pDstBuf[y*dstStride];
217 __m128i tmp = _mm_loadl_epi64( ( __m128i const * )&ff[deltaFract<<2] ); //load 4 16 bit filter coeffs
218 tmp = _mm_shuffle_epi32(tmp,0x44);
219 __m256i coeff = _mm256_broadcastsi128_si256(tmp);
220 for( int x = 0; x < width; x+=16)
221 {
222 __m256i src0 = _mm256_lddqu_si256( ( const __m256i * )&refMain[refMainIndex - 1] );//load 16 16 bit reference Pels -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
223 __m256i src2 = _mm256_castsi128_si256 (_mm_lddqu_si128( ( __m128i const * )&refMain[refMainIndex +4 - 1] ));
224 __m256i src1 = _mm256_permute2f128_si256 (src0,src0,0x00);
225 src2 = _mm256_permute2f128_si256 (src2,src2,0x00);
226 src1 = _mm256_shuffle_epi8(src1,shflmask1); // -1 0 1 2 0 1 2 3 1 2 3 4 2 3 4 5
227 src2 = _mm256_shuffle_epi8(src2,shflmask1); // 3 4 5 6 4 5 6 7 5 6 7 8 6 7 8 9
228
229 src1 = _mm256_madd_epi16 (src1, coeff);
230 src2 = _mm256_madd_epi16 (src2, coeff);
231
232 __m256i sum = _mm256_hadd_epi32( src1, src2 );
233 sum = _mm256_permute4x64_epi64(sum,0xD8);
234
235 sum = _mm256_add_epi32( sum, offset );
236 sum = _mm256_srai_epi32( sum, 6 );
237
238 refMainIndex+=8;
239
240 src1 = _mm256_permute2f128_si256 (src0,src0,0x1);
241 src2 = _mm256_inserti128_si256(src2, _mm_lddqu_si128( ( __m128i const * )&refMain[refMainIndex +4 - 1] ), 0x0);
242 src1 = _mm256_permute2f128_si256 (src1,src1,0x00);
243 src2 = _mm256_permute2f128_si256 (src2,src2,0x00);
244
245 src1 = _mm256_shuffle_epi8(src1,shflmask1); // -1 0 1 2 0 1 2 3 1 2 3 4 2 3 4 5
246 src2 = _mm256_shuffle_epi8(src2,shflmask1); // 3 4 5 6 4 5 6 7 5 6 7 8 6 7 8 9
247 src1 = _mm256_madd_epi16 (src1, coeff);
248 src2 = _mm256_madd_epi16 (src2, coeff);
249
250 __m256i sum1 = _mm256_hadd_epi32( src1, src2 );
251 sum1 = _mm256_permute4x64_epi64(sum1,0xD8);
252
253 sum1 = _mm256_add_epi32( sum1, offset );
254 sum1 = _mm256_srai_epi32( sum1, 6 );
255 src0 = _mm256_packs_epi32( sum, sum1 );
256
257 src0 = _mm256_permute4x64_epi64(src0,0xD8);
258
259 refMainIndex+=8;
260
261 if (useCubicFilter)
262 src0 = _mm256_min_epi16( vbdmax, _mm256_max_epi16( vbdmin, src0 ) );
263
264 _mm256_storeu_si256( ( __m256i * )(pDst + x), src0);
265 }
266 deltaPos += intraPredAngle;
267 }
268 }
269 else // width == 8
270 {
271 // printf("AVX2 Block %d \n",width);
272 __m128i vbdmin,vbdmax;
273
274 if (useCubicFilter)
275 {
276 vbdmin = _mm_set1_epi16( clpRng.min() );
277 vbdmax = _mm_set1_epi16( clpRng.max() );
278 }
279
280 for (int y = 0; y<height; y++ )
281 {
282 int deltaInt = deltaPos >> 5;
283 int deltaFract = deltaPos & (32 - 1);
284 int refMainIndex = deltaInt + 1;
285 pDst=&pDstBuf[y*dstStride];
286 __m128i tmp = _mm_loadl_epi64( ( __m128i const * )&ff[deltaFract<<2] ); //load 4 16 bit filter coeffs
287 tmp = _mm_shuffle_epi32(tmp,0x44);
288 __m256i coeff = _mm256_broadcastsi128_si256(tmp);
289 __m256i src0 = _mm256_lddqu_si256( ( const __m256i * )&refMain[refMainIndex - 1] );//load 16 16 bit reference Pels -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
290 // __m256i src2 = _mm256_inserti128_si256(src2, _mm_lddqu_si128( ( __m128i const * )&refMain[refMainIndex +4 - 1] ), 0x0);
291 __m256i src2 = _mm256_castsi128_si256 (_mm_lddqu_si128( ( __m128i const * )&refMain[refMainIndex +4 - 1] ));
292 __m256i src1 = _mm256_permute2f128_si256 (src0,src0,0x00);
293 src2 = _mm256_permute2f128_si256 (src2,src2,0x00);
294 src1 = _mm256_shuffle_epi8(src1,shflmask1); // -1 0 1 2 0 1 2 3 1 2 3 4 2 3 4 5
295 src2 = _mm256_shuffle_epi8(src2,shflmask1); // 3 4 5 6 4 5 6 7 5 6 7 8 6 7 8 9
296
297 src1 = _mm256_madd_epi16 (src1, coeff);
298 src2 = _mm256_madd_epi16 (src2, coeff);
299
300 __m256i sum = _mm256_hadd_epi32( src1, src2 );
301 sum = _mm256_permute4x64_epi64(sum,0xD8);
302
303 sum = _mm256_add_epi32( sum, offset );
304 sum = _mm256_srai_epi32( sum, 6 );
305 __m128i dest128 = _mm256_cvtepi32_epi16x( sum );
306
307 if (useCubicFilter)
308 dest128 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, dest128 ) );
309
310 _mm_storeu_si128( ( __m128i * )(pDst), dest128);
311 deltaPos += intraPredAngle;
312 }
313 }
314
315
316
317 #endif
318 }
319 else
320 {
321 __m128i shflmask1= _mm_set_epi8( 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
322 __m128i shflmask2= _mm_set_epi8( 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4 );
323 __m128i vbdmin,vbdmax;
324
325 __m128i offset = _mm_set1_epi32( 32 );
326
327 if (useCubicFilter)
328 {
329 vbdmin = _mm_set1_epi16( clpRng.min() );
330 vbdmax = _mm_set1_epi16( clpRng.max() );
331 }
332
333 for (int y = 0; y<height; y++ )
334 {
335 int deltaInt = deltaPos >> 5;
336 int deltaFract = deltaPos & (32 - 1);
337 int refMainIndex = deltaInt + 1;
338 pDst=&pDstBuf[y*dstStride];
339 __m128i coeff = _mm_loadl_epi64( ( __m128i const * )&ff[deltaFract<<2] ); //load 4 16 bit filter coeffs
340 coeff = _mm_shuffle_epi32(coeff,0x44);
341 for( int x = 0; x < width; x+=8)
342 {
343 __m128i src0 = _mm_lddqu_si128( ( __m128i const * )&refMain[refMainIndex - 1] ); //load 8 16 bit reference Pels -1 0 1 2 3 4 5 6
344 __m128i src1 = _mm_shuffle_epi8(src0,shflmask1); // -1 0 1 2 0 1 2 3
345 __m128i src2 = _mm_shuffle_epi8(src0,shflmask2); // 1 2 3 4 2 3 4 5
346 src0 = _mm_madd_epi16( coeff,src1 );
347 src1 = _mm_madd_epi16( coeff,src2 );
348 __m128i sum = _mm_hadd_epi32( src0, src1 );
349 sum = _mm_add_epi32( sum, offset );
350 sum = _mm_srai_epi32( sum, 6 );
351
352 refMainIndex+=4;
353 src0 = _mm_lddqu_si128( ( __m128i const * )&refMain[refMainIndex - 1] ); //load 8 16 bit reference Pels -1 0 1 2 3 4 5 6
354 src1 = _mm_shuffle_epi8(src0,shflmask1); // -1 0 1 2 0 1 2 3
355 src2 = _mm_shuffle_epi8(src0,shflmask2); // 1 2 3 4 2 3 4 5
356
357
358 src0 = _mm_madd_epi16( coeff,src1 );
359 src1 = _mm_madd_epi16( coeff,src2 );
360
361 __m128i sum1 = _mm_hadd_epi32( src0, src1 );
362 sum1 = _mm_add_epi32( sum1, offset );
363 sum1 = _mm_srai_epi32( sum1, 6 );
364 src0 = _mm_packs_epi32( sum, sum1 );
365
366 refMainIndex+=4;
367
368 if (useCubicFilter)
369 src0 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, src0 ) );
370
371 _mm_storeu_si128( ( __m128i * )(pDst + x), src0);
372
373 }
374 deltaPos += intraPredAngle;
375 }
376 }
377 }
378 else if( W == 4 )
379 {
380
381 __m128i shflmask1= _mm_set_epi8( 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 );
382 __m128i shflmask2= _mm_set_epi8( 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4 );
383 __m128i vbdmin,vbdmax;
384
385 __m128i offset = _mm_set1_epi32( 32 );
386
387 if (useCubicFilter)
388 {
389 vbdmin = _mm_set1_epi16( clpRng.min() );
390 vbdmax = _mm_set1_epi16( clpRng.max() );
391 }
392
393 for (int y = 0; y<height; y++ )
394 {
395 int deltaInt = deltaPos >> 5;
396 int deltaFract = deltaPos & (32 - 1);
397 int refMainIndex = deltaInt + 1;
398 pDst=&pDstBuf[y*dstStride];
399 __m128i coeff = _mm_loadl_epi64( ( __m128i const * )&ff[deltaFract<<2] ); //load 4 16 bit filter coeffs
400 coeff = _mm_shuffle_epi32(coeff,0x44);
401 {
402 __m128i src0 = _mm_lddqu_si128( ( __m128i const * )&refMain[refMainIndex - 1] ); //load 8 16 bit reference Pels -1 0 1 2 3 4 5 6
403 __m128i src1 = _mm_shuffle_epi8(src0,shflmask1); // -1 0 1 2 0 1 2 3
404 __m128i src2 = _mm_shuffle_epi8(src0,shflmask2); // 1 2 3 4 2 3 4 5
405 src0 = _mm_madd_epi16( coeff,src1 );
406 src1 = _mm_madd_epi16( coeff,src2 );
407 __m128i sum = _mm_hadd_epi32( src0, src1 );
408 sum = _mm_add_epi32( sum, offset );
409 sum = _mm_srai_epi32( sum, 6 );
410
411 src0 = _mm_packs_epi32( sum, sum );
412
413 refMainIndex+=4;
414
415 if (useCubicFilter)
416 src0 = _mm_min_epi16( vbdmax, _mm_max_epi16( vbdmin, src0 ) );
417
418 _mm_storel_epi64( ( __m128i * )(pDst ), src0);
419
420 }
421 deltaPos += intraPredAngle;
422 }
423 }
424 else
425 {
426 THROW( "Unsupported size in IntraPredAngleCore_SIMD" );
427 }
428 #if USE_AVX2
429 _mm256_zeroupper();
430 #endif
431 }
432
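// IntraPredSampleFilter_SIMD: PDPC (position dependent prediction combination) post-filter
// for PLANAR and DC predictions. Each predicted sample is blended with the left and top
// reference samples using position dependent weights. Scalar sketch (illustrative only;
// "clip" stands for clamping to [clpRng.min(), clpRng.max()]):
//
//   const int wT   = 32 >> std::min( 31, ( y << 1 ) >> scale );
//   const int wL   = 32 >> std::min( 31, ( x << 1 ) >> scale );
//   const Pel left = ptrSrc[( y + 1 ) * srcStride];
//   const Pel top  = ptrSrc[x + 1];
//   pDst[y * dstStride + x] =
//     clip( ( wL * left + wT * top + ( 64 - wL - wT ) * pDst[y * dstStride + x] + 32 ) >> 6 );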
433 template< X86_VEXT vext, int W >
434 void IntraPredSampleFilter_SIMD(Pel *ptrSrc,const ptrdiff_t srcStride,PelBuf &piPred,const uint32_t uiDirMode,const ClpRng& clpRng)
435 {
436 const int iWidth = piPred.width;
437 const int iHeight = piPred.height;
438 PelBuf dstBuf = piPred;
439 Pel* pDst = dstBuf.buf;
440 const ptrdiff_t dstStride = dstBuf.stride;
441
442 const int scale = ((getLog2(iWidth) - 2 + getLog2(iHeight) - 2 + 2) >> 2);
443 CHECK(scale < 0 || scale > 2, "PDPC: scale < 0 || scale > 2");
444
445 #if USE_AVX2
446 if( W > 8 )
447 {
448 __m256i tmplo,tmphi;
449 __m256i w64 = _mm256_set_epi16(64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64);
450 __m256i w32 = _mm256_set_epi32(32,32,32,32,32,32,32,32);
451 __m256i vbdmin = _mm256_set1_epi32( clpRng.min() );
452 __m256i vbdmax = _mm256_set1_epi32( clpRng.max() );
453 __m256i wl16,wl16start;
454
455 if (scale==0)
456 {
457 wl16start = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,32);
458 }
459 else if (scale==1)
460 {
461 wl16start = _mm256_set_epi16(0,0,0,0,0,0,0,0,0,0,1,2,4,8,16,32);
462 }
463 else if (scale==2)
464 {
465 wl16start = _mm256_set_epi16(0,0,0,0,1,1,2,2,4,4,8,8,16,16,32,32);
466 }
467
468 if (uiDirMode == PLANAR_IDX || uiDirMode == DC_IDX )
469 {
470 for (int y = 0; y < iHeight; y++)
471 {
472 int wT = 32 >> std::min(31, ((y << 1) >> scale));
473
474 __m256i wt16 = _mm256_set_epi16(wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT,wT);
475 __m256i x16left = _mm256_broadcastw_epi16(_mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride))));
476
477 if (wT)
478 {
479 for (int x = 0; x < iWidth; x+=16)
480 {
481 if (x==0)
482 {
483 wl16=wl16start;
484
485 __m256i x16top = _mm256_loadu_si256((__m256i *) (ptrSrc+x+1)); // load top
486 __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride+x)); // load dst
487
488 tmplo = _mm256_mullo_epi16(x16left,wl16); //wL * left
489 tmphi = _mm256_mulhi_epi16(x16left,wl16); //wL * left
490 __m256i leftlo = _mm256_unpacklo_epi16(tmplo,tmphi);
491 __m256i lefthi = _mm256_unpackhi_epi16(tmplo,tmphi);
492
493 tmplo = _mm256_mullo_epi16(x16top,wt16); // wT*top
494 tmphi = _mm256_mulhi_epi16(x16top,wt16); // wT*top
495 __m256i toplo = _mm256_unpacklo_epi16(tmplo,tmphi);
496 __m256i tophi = _mm256_unpackhi_epi16(tmplo,tmphi);
497
498 __m256i wX = _mm256_sub_epi16(w64,wl16);
499 wX = _mm256_sub_epi16(wX,wt16); // 64-wL-wT
500 tmplo = _mm256_mullo_epi16(x16dst,wX); // 64-wL-wT*dst
501 tmphi = _mm256_mulhi_epi16(x16dst,wX); // 64-wL-wT*dst
502 __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi);
503 __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi);
504
505 dstlo = _mm256_add_epi32(dstlo,toplo);
506 dsthi = _mm256_add_epi32(dsthi,tophi);
507 dstlo = _mm256_add_epi32(dstlo,leftlo);
508 dsthi = _mm256_add_epi32(dsthi,lefthi);
509 dstlo = _mm256_add_epi32(dstlo,w32);
510 dsthi = _mm256_add_epi32(dsthi,w32);
511
512 dstlo = _mm256_srai_epi32(dstlo,6);
513 dsthi = _mm256_srai_epi32(dsthi,6);
514
515 dstlo = _mm256_max_epi32(vbdmin,dstlo);
516 dsthi = _mm256_max_epi32(vbdmin,dsthi);
517 dstlo = _mm256_min_epi32(vbdmax,dstlo);
518 dsthi = _mm256_min_epi32(vbdmax,dsthi);
519
520 dstlo = _mm256_packs_epi32(dstlo,dsthi);
521 dstlo = _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );
522
523 _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride+x), dstlo );
524 }
525 else
526 {
527
528 __m256i x16top = _mm256_loadu_si256((__m256i *) (ptrSrc+x+1)); // load top
529 __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride+x)); // load dst
530
531
532 tmplo = _mm256_mullo_epi16(x16top,wt16); // wT*top
533 tmphi = _mm256_mulhi_epi16(x16top,wt16); // wT*top
534 __m256i toplo = _mm256_unpacklo_epi16(tmplo,tmphi);
535 __m256i tophi = _mm256_unpackhi_epi16(tmplo,tmphi);
536
537 __m256i wX = _mm256_sub_epi16(w64,wt16);
538 tmplo = _mm256_mullo_epi16(x16dst,wX); // 64-wL-wT*dst
539 tmphi = _mm256_mulhi_epi16(x16dst,wX); // 64-wL-wT*dst
540 __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi);
541 __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi);
542
543 dstlo = _mm256_add_epi32(dstlo,toplo);
544 dsthi = _mm256_add_epi32(dsthi,tophi);
545 dstlo = _mm256_add_epi32(dstlo,w32);
546 dsthi = _mm256_add_epi32(dsthi,w32);
547
548 dstlo = _mm256_srai_epi32(dstlo,6);
549 dsthi = _mm256_srai_epi32(dsthi,6);
550
551 dstlo = _mm256_max_epi32(vbdmin,dstlo);
552 dsthi = _mm256_max_epi32(vbdmin,dsthi);
553 dstlo = _mm256_min_epi32(vbdmax,dstlo);
554 dsthi = _mm256_min_epi32(vbdmax,dsthi);
555
556 dstlo = _mm256_packs_epi32(dstlo,dsthi);
557 dstlo = _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );
558
559 _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride+x), dstlo );
560 }
561
562 } // for
563 }
564 else
565 { // wT =0
566
567 wl16=wl16start;
568 __m256i x16dst = _mm256_loadu_si256((const __m256i *) (pDst+y*dstStride)); // load dst
569
570 tmplo = _mm256_mullo_epi16(x16left,wl16); //wL * left
571 tmphi = _mm256_mulhi_epi16(x16left,wl16); //wL * left
572 __m256i leftlo = _mm256_unpacklo_epi16(tmplo,tmphi);
573 __m256i lefthi = _mm256_unpackhi_epi16(tmplo,tmphi);
574
575 __m256i wX = _mm256_sub_epi16(w64,wl16);
576 tmplo = _mm256_mullo_epi16(x16dst,wX); // 64-wL-wT*dst
577 tmphi = _mm256_mulhi_epi16(x16dst,wX); // 64-wL-wT*dst
578 __m256i dstlo = _mm256_unpacklo_epi16(tmplo,tmphi);
579 __m256i dsthi = _mm256_unpackhi_epi16(tmplo,tmphi);
580
581 dstlo = _mm256_add_epi32(dstlo,leftlo);
582 dsthi = _mm256_add_epi32(dsthi,lefthi);
583 dstlo = _mm256_add_epi32(dstlo,w32);
584 dsthi = _mm256_add_epi32(dsthi,w32);
585
586 dstlo = _mm256_srai_epi32(dstlo,6);
587 dsthi = _mm256_srai_epi32(dsthi,6);
588
589 dstlo = _mm256_max_epi32(vbdmin,dstlo);
590 dsthi = _mm256_max_epi32(vbdmin,dsthi);
591 dstlo = _mm256_min_epi32(vbdmax,dstlo);
592 dsthi = _mm256_min_epi32(vbdmax,dsthi);
593
594 dstlo = _mm256_packs_epi32(dstlo,dsthi);
595 dstlo = _mm256_permute4x64_epi64 ( dstlo, ( 0 << 0 ) + ( 1 << 2 ) + ( 2 << 4 ) + ( 3 << 6 ) );
596
597 _mm256_storeu_si256( ( __m256i * )(pDst+y*dstStride), dstlo );
598 }
599 }
600 }
601 }
602 else
603 #endif
604 {
605 __m128i tmplo8,tmphi8;
606 __m128i w64_8 = _mm_set_epi16(64,64,64,64,64,64,64,64);
607 __m128i w32_8 = _mm_set_epi32(32,32,32,32);
608 __m128i vbdmin8 = _mm_set1_epi32( clpRng.min() );
609 __m128i vbdmax8 = _mm_set1_epi32( clpRng.max() );
610 __m128i wl8start,wl8start2;
611 CHECK(scale < 0 || scale > 2, "PDPC: scale < 0 || scale > 2");
612
613 if (scale==0)
614 {
615 wl8start = _mm_set_epi16(0,0,0,0,0,2,8,32);
616 wl8start2 = _mm_set_epi16(0,0,0,0,0,0,0,0);
617 }
618 else if (scale==1)
619 {
620 wl8start = _mm_set_epi16(0,0,1,2,4,8,16,32);
621 wl8start2 = _mm_set_epi16(0,0,0,0,0,0,0,0);
622 }
623 else if (scale==2)
624 {
625 wl8start = _mm_set_epi16(4,4,8,8,16,16,32,32);
626 wl8start2 = _mm_set_epi16(0,0,0,0,1,1,2,2);
627 }
628 if (uiDirMode == PLANAR_IDX || uiDirMode == DC_IDX )
629 {
630 __m128i wl8 = wl8start;
631 for (int y = 0; y < iHeight; y++)
632 {
633 int wT = 32 >> std::min(31, ((y << 1) >> scale));
634
635 __m128i wt8 = _mm_set_epi16(wT,wT,wT,wT,wT,wT,wT,wT);
636 // __m128i x8left = _mm_broadcastw_epi16(_mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride))));
637
638 __m128i x8left = _mm_loadu_si128 ((__m128i const *) (ptrSrc+((y+1)*srcStride)));
639 x8left =_mm_shufflelo_epi16(x8left,0);
640 x8left =_mm_shuffle_epi32(x8left,0);
641
642
643 if (wT)
644 {
645 for (int x = 0; x < iWidth; x+=8)
646 {
647 if (x>8)
648 {
649 __m128i x8top = _mm_loadu_si128((__m128i *) (ptrSrc+x+1)); // load top
650 __m128i x8dst = _mm_loadu_si128((const __m128i *) (pDst+y*dstStride+x)); // load dst
651
652 tmplo8 = _mm_mullo_epi16(x8top,wt8); // wT*top
653 tmphi8 = _mm_mulhi_epi16(x8top,wt8); // wT*top
654 __m128i toplo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
655 __m128i tophi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
656
657
658 __m128i wX = _mm_sub_epi16(w64_8,wt8);
659 tmplo8 = _mm_mullo_epi16(x8dst,wX); // 64-wL-wT*dst
660 tmphi8 = _mm_mulhi_epi16(x8dst,wX); // 64-wL-wT*dst
661 __m128i dstlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
662 __m128i dsthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
663
664 dstlo8 = _mm_add_epi32(dstlo8,toplo8);
665 dsthi8 = _mm_add_epi32(dsthi8,tophi8);
666 dstlo8 = _mm_add_epi32(dstlo8,w32_8);
667 dsthi8 = _mm_add_epi32(dsthi8,w32_8);
668
669 dstlo8 = _mm_srai_epi32(dstlo8,6);
670 dsthi8 = _mm_srai_epi32(dsthi8,6);
671
672 dstlo8 = _mm_max_epi32(vbdmin8,dstlo8);
673 dsthi8 = _mm_max_epi32(vbdmin8,dsthi8);
674 dstlo8 = _mm_min_epi32(vbdmax8,dstlo8);
675 dsthi8 = _mm_min_epi32(vbdmax8,dsthi8);
676
677 dstlo8 = _mm_packs_epi32(dstlo8,dsthi8);
678
679 _mm_storeu_si128(( __m128i * )(pDst+y*dstStride+x), (dstlo8) );
680 }
681 else // x<=8
682 {
683 if (x==0)
684 wl8=wl8start;
685 else if (x==8)
686 wl8=wl8start2;
687
688 __m128i x8top = _mm_loadu_si128((__m128i *) (ptrSrc+x+1)); // load top
689 __m128i x8dst = _mm_loadu_si128((const __m128i *) (pDst+y*dstStride+x)); // load dst
690
691 tmplo8 = _mm_mullo_epi16(x8left,wl8); //wL * left
692 tmphi8 = _mm_mulhi_epi16(x8left,wl8); //wL * left
693 __m128i leftlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
694 __m128i lefthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
695
696 tmplo8 = _mm_mullo_epi16(x8top,wt8); // wT*top
697 tmphi8 = _mm_mulhi_epi16(x8top,wt8); // wT*top
698 __m128i toplo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
699 __m128i tophi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
700
701 __m128i wX = _mm_sub_epi16(w64_8,wl8);
702 wX = _mm_sub_epi16(wX,wt8); // 64-wL-wT
703 tmplo8 = _mm_mullo_epi16(x8dst,wX); // 64-wL-wT*dst
704 tmphi8 = _mm_mulhi_epi16(x8dst,wX); // 64-wL-wT*dst
705 __m128i dstlo8 = _mm_unpacklo_epi16(tmplo8,tmphi8);
706 __m128i dsthi8 = _mm_unpackhi_epi16(tmplo8,tmphi8);
707
708 dstlo8 = _mm_add_epi32(dstlo8,toplo8);
709 dsthi8 = _mm_add_epi32(dsthi8,tophi8);
710 dstlo8 = _mm_add_epi32(dstlo8,leftlo8);
711 dsthi8 = _mm_add_epi32(dsthi8,lefthi8);
712 dstlo8 = _mm_add_epi32(dstlo8,w32_8);
713 dsthi8 = _mm_add_epi32(dsthi8,w32_8);
714
715 dstlo8 = _mm_srai_epi32(dstlo8,6);
716 dsthi8 = _mm_srai_epi32(dsthi8,6);
717
718 dstlo8 = _mm_max_epi32(vbdmin8,dstlo8);
719 dsthi8 = _mm_max_epi32(vbdmin8,dsthi8);
720 dstlo8 = _mm_min_epi32(vbdmax8,dstlo8);
721 dsthi8 = _mm_min_epi32(vbdmax8,dsthi8);
722
723 dstlo8 = _mm_packs_epi32(dstlo8,dsthi8);
724
725 if (iWidth>=8)
726 _mm_storeu_si128(( __m128i * )(pDst+y*dstStride+x), (dstlo8) );
727 else if (iWidth==4)
728 _mm_storel_epi64(( __m128i * )(pDst+y*dstStride+x), (dstlo8) );
729 else if (iWidth==2)
730 _mm_storeu_si32(( __m128i * )(pDst+y*dstStride+x),(dstlo8) );
731 }
732 }
733 }
734 else //wT =0
735 {
736 for( int x = 0; x < std::min( iWidth, 16 ); x += 8 )
737 {
738 if( x == 0 )
739 wl8 = wl8start;
740 else if( x == 8 )
741 wl8 = wl8start2;
742
743 __m128i x8dst;
744 if( iWidth >= 8 )
745 x8dst = _mm_loadu_si128( (const __m128i*)( pDst + y * dstStride + x ) ); // load dst
746 else if( iWidth == 4 )
747 x8dst = _mm_loadu_si64( (const __m128i*)( pDst + y * dstStride + x ) ); // load dst
748 else if( iWidth == 2 )
749 x8dst = _mm_loadu_si32( (const __m128i*)( pDst + y * dstStride + x ) ); // load dst
750 else
751 CHECK( true, "wrong iWidth in IntraPredSampleFilter_SIMD, only implemented for >=8, ==4, ==2" );
752
753
754 tmplo8 = _mm_mullo_epi16( x8left, wl8 ); // wL * left
755 tmphi8 = _mm_mulhi_epi16( x8left, wl8 ); // wL * left
756 __m128i leftlo8 = _mm_unpacklo_epi16( tmplo8, tmphi8 );
757 __m128i lefthi8 = _mm_unpackhi_epi16( tmplo8, tmphi8 );
758
759 __m128i wX = _mm_sub_epi16( w64_8, wl8 );
760 tmplo8 = _mm_mullo_epi16( x8dst, wX ); // 64-wL-wT*dst
761 tmphi8 = _mm_mulhi_epi16( x8dst, wX ); // 64-wL-wT*dst
762 __m128i dstlo8 = _mm_unpacklo_epi16( tmplo8, tmphi8 );
763 __m128i dsthi8 = _mm_unpackhi_epi16( tmplo8, tmphi8 );
764
765 dstlo8 = _mm_add_epi32( dstlo8, leftlo8 );
766 dsthi8 = _mm_add_epi32( dsthi8, lefthi8 );
767 dstlo8 = _mm_add_epi32( dstlo8, w32_8 );
768 dsthi8 = _mm_add_epi32( dsthi8, w32_8 );
769
770 dstlo8 = _mm_srai_epi32( dstlo8, 6 );
771 dsthi8 = _mm_srai_epi32( dsthi8, 6 );
772
773 dstlo8 = _mm_max_epi32( vbdmin8, dstlo8 );
774 dsthi8 = _mm_max_epi32( vbdmin8, dsthi8 );
775 dstlo8 = _mm_min_epi32( vbdmax8, dstlo8 );
776 dsthi8 = _mm_min_epi32( vbdmax8, dsthi8 );
777
778 dstlo8 = _mm_packs_epi32( dstlo8, dsthi8 );
779
780 if( iWidth >= 8 )
781 _mm_storeu_si128( (__m128i*)( pDst + y * dstStride + x ), dstlo8 );
782 else if( iWidth == 4 )
783 _mm_storel_epi64( (__m128i*)( pDst + y * dstStride + x ), ( dstlo8 ) );
784 else if( iWidth == 2 )
785 _mm_storeu_si32( (__m128i*)( pDst + y * dstStride + x ), dstlo8 );
786 }
787 }
788 }
789 }
790 }
791
792
793 #if USE_AVX2
794 _mm256_zeroupper();
795 #endif
796 }
797
798 /** Function for deriving planar intra prediction. This function derives the prediction samples for planar mode (intra coding).
799 */
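// Scalar sketch of the planar interpolation computed below (illustrative only, not compiled):
//
//   const Pel topRight   = pSrc.at( width + 1, 0 );
//   const Pel bottomLeft = pSrc.at( 0, height + 1 );
//   const Pel left       = pSrc.at( 0, y + 1 );
//   const Pel top        = pSrc.at( x + 1, 0 );
//   const int horPred    = ( left << log2W ) + ( x + 1 ) * ( topRight - left );
//   const int vertPred   = ( top  << log2H ) + ( y + 1 ) * ( bottomLeft - top );
//   pred[y * stride + x] = ( ( horPred << log2H ) + ( vertPred << log2W ) + offset ) >> ( 1 + log2W + log2H );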
800 template< X86_VEXT vext>
801 void xPredIntraPlanar_SIMD( const CPelBuf &pSrc, PelBuf &pDst, const SPS& sps )
802 {
803
804 const uint32_t width = pDst.width;
805 const uint32_t height = pDst.height;
806 const uint32_t log2W = getLog2( width );
807 const uint32_t log2H = getLog2( height );
808 const uint32_t offset = 1 << (log2W + log2H);
809 const ptrdiff_t stride = pDst.stride;
810 Pel* pred = pDst.buf;
811
812 const Pel *ptrSrc =pSrc.buf;
813
814 int leftColumn,rightColumn;
815 Pel tmp;
816 int topRight = pSrc.at( width + 1, 0 );
817
818 tmp=pSrc.at( 0, height+1 );
819 __m128i bottomLeft16 = _mm_set_epi16(tmp,tmp,tmp,tmp,tmp,tmp,tmp,tmp);
820 __m128i zero = _mm_xor_si128(bottomLeft16,bottomLeft16);
821 __m128i eight = _mm_set_epi16(8,8,8,8,8,8,8,8);
822 __m128i offset32 = _mm_set_epi32(offset,offset,offset,offset);
823
824 const uint32_t finalShift = 1 + log2W + log2H;
825
826 for( int y = 0; y < height; y++)
827 {
828 leftColumn=pSrc.at( 0, y + 1 );
829 rightColumn = topRight - leftColumn;
830 leftColumn = leftColumn << log2W;
831 __m128i leftColumn32 = _mm_set_epi32(leftColumn,leftColumn,leftColumn,leftColumn);
832 __m128i rightcolumn16 = _mm_set_epi16(rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn,rightColumn);
833 __m128i y16 = _mm_set_epi16(y+1,y+1,y+1,y+1,y+1,y+1,y+1,y+1);
834 __m128i x16 = _mm_set_epi16(8,7,6,5,4,3,2,1);
835
836 for( int x = 0; x < width; x+=8 )
837 {
838 //topRow[x] = pSrc.at( x + 1, 0 );
839 __m128i topRow16 = _mm_loadu_si128 ((__m128i const *) (ptrSrc+(x+1)));
840 //bottomRow[x] = bottomLeft - topRow[x];
841 __m128i bottomRow16L = _mm_sub_epi16(bottomLeft16,topRow16);
842 // (y+1)*bottomRow[x]
843 __m128i tmpH = _mm_mulhi_epi16(bottomRow16L,y16);
844 __m128i tmpL = _mm_mullo_epi16(bottomRow16L,y16);
845 bottomRow16L = _mm_unpacklo_epi16(tmpL,tmpH);
846 __m128i bottomRow16H = _mm_unpackhi_epi16(tmpL,tmpH);
847
848 // topRow[x] << log2H
849 __m128i topRow32L = _mm_unpacklo_epi16(topRow16,zero);
850 __m128i topRow32H = _mm_unpackhi_epi16(topRow16,zero);
851 topRow32L = _mm_slli_epi32(topRow32L,log2H);
852 topRow32H = _mm_slli_epi32(topRow32H,log2H);
853 // vertPred = (topRow[x] << log2H) + (y+1)*bottomRow[x];
854 topRow32L = _mm_add_epi32(topRow32L,bottomRow16L);
855 topRow32H = _mm_add_epi32(topRow32H,bottomRow16H);
856 // horPred = leftColumn + (x+1)*rightColumn;
857 tmpL = _mm_mullo_epi16(rightcolumn16,x16);
858 tmpH = _mm_mulhi_epi16(rightcolumn16,x16);
859 __m128i horpred32L = _mm_unpacklo_epi16(tmpL,tmpH);
860 __m128i horpred32H = _mm_unpackhi_epi16(tmpL,tmpH);
861 horpred32L = _mm_add_epi32(leftColumn32,horpred32L);
862 horpred32H = _mm_add_epi32(leftColumn32,horpred32H);
863 // pred[x] = ( ( horPred << log2H ) + ( vertPred << log2W ) + offset ) >> finalShift;
864 horpred32L = _mm_slli_epi32(horpred32L,log2H);
865 horpred32H = _mm_slli_epi32(horpred32H,log2H);
866 topRow32L = _mm_slli_epi32(topRow32L,log2W);
867 topRow32H = _mm_slli_epi32(topRow32H,log2W);
868 horpred32L = _mm_add_epi32(horpred32L,topRow32L);
869 horpred32H = _mm_add_epi32(horpred32H,topRow32H);
870 horpred32L = _mm_add_epi32(horpred32L,offset32);
871 horpred32H = _mm_add_epi32(horpred32H,offset32);
872 horpred32L = _mm_srli_epi32(horpred32L,finalShift);
873 horpred32H = _mm_srli_epi32(horpred32H,finalShift);
874
875 tmpL = _mm_packs_epi32(horpred32L,horpred32H);
876 if (width>=8)
877 _mm_storeu_si128(( __m128i * )(pred+y*stride+x), (tmpL) );
878 else if (width==4)
879 _mm_storel_epi64(( __m128i * )(pred+y*stride+x), (tmpL) );
880 else if (width==2)
881 _mm_storeu_si32(( __m128i * )(pred+y*stride+x),(tmpL) );
882 else
883 pred[y*stride+x]=(Pel)_mm_extract_epi16 (tmpL,0);
884
885 x16 = _mm_add_epi16(x16,eight);
886 }
887 }
888 }
889
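// GetLumaRecPixel420SIMD: downsamples the reconstructed luma block to chroma resolution (4:2:0)
// for cross-component intra prediction, applying a [1 2 1] filter over two luma rows.
// Scalar sketch of the per-sample computation below (illustrative only, not compiled):
//
//   const Pel* r0 = &pRecSrc0[2 * x];               // current luma row
//   const Pel* r1 = &pRecSrc0[2 * x + iRecStride];  // next luma row
//   pDst0[x] = ( r0[-1] + 2 * r0[0] + r0[1]
//              + r1[-1] + 2 * r1[0] + r1[1] + 4 ) >> 3;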
890 template< X86_VEXT vext>
891 void GetLumaRecPixel420SIMD (const int width,const int height, const Pel* pRecSrc0,const ptrdiff_t iRecStride,Pel* pDst0,const ptrdiff_t iDstStride)
892 {
893 #ifdef USE_AVX2
894 if( ( width & 15 ) == 0 ) // width>=16
895 // if( 0 ) // width>=16
896 {
897 __m256i vzero = _mm256_set1_epi8(0);
898 __m256i vfour = _mm256_set1_epi32(4);
899 for( int y = 0; y < height; y++ )
900 {
901 for( int x = 0; x < width; x += 16 )
902 {
903 int x2=x<<1;
904 __m256i vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2-1]); // -1 0 1 2 3 4 5 6
905 __m256i vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2]); // 0 1 2 3 4 5 6 7
906
907 __m256i vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55); // -1 1 3 5 32 Bit
908 __m256i vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55); // 0 2 4 6 32 Bit
909 __m256i vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA); // 1 3 5 7 32 Bit
910 vsrc10 = _mm256_srli_epi32(vsrc10,16); // 1 3 5 7 32 Bit
911 vsrc0 = _mm256_slli_epi32 (vsrc0,1); // 0 2 4 6 *2
912
913 vsrc0 = _mm256_add_epi32(vsrc0,vsrc10);
914 __m256i vdst0 = _mm256_add_epi32(vsrc0,vsrc01); // dst 0 1 2 3 32 Bit, bottom row still missing
915
916 vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 +15]); // 7 8 9 10 11 12 13 14
917 vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 16 ]); // 8 9 10 11 12 13 14 15
918
919 x2+= (int)iRecStride;
920
921 vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55);
922 vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55);
923 vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA);
924 vsrc10 = _mm256_srli_epi32(vsrc10,16); // 1 3 5 7 32 Bit
925 vsrc0 = _mm256_slli_epi32 (vsrc0,1); // 0 2 4 6 *2
926
927 vsrc0 = _mm256_add_epi32(vsrc0,vsrc10);
928 __m256i vdst1 = _mm256_add_epi32(vsrc0,vsrc01); // dst 4 5 6 7 32 Bit, bottom row still missing
929
930 // now add the next row
931 vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2-1]); // -1 0 1 2 3 4 5 6
932 vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2]); // 0 1 2 3 4 5 6 7
933
934 vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55); // -1 1 3 5 32 Bit
935 vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55); // 0 2 4 6 32 Bit
936 vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA); // 1 3 5 7 32 Bit
937 vsrc10 = _mm256_srli_epi32(vsrc10,16); // 1 3 5 7 32 Bit
938 vsrc0 = _mm256_slli_epi32 (vsrc0,1); // 0 2 4 6 *2
939
940 vsrc0 = _mm256_add_epi32(vsrc0,vsrc10);
941 __m256i vdst01 = _mm256_add_epi32(vsrc0,vsrc01); // dst 0 1 2 3 32 Bit, bottom row
942
943 vsrc_l = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 15]); // 7 8 9 10 11 12 13 14
944 vsrc = _mm256_loadu_si256((__m256i*)&pRecSrc0[x2 + 16 ]); // 8 9 10 11 12 13 14 15
945
946 vsrc01 = _mm256_blend_epi16(vzero,vsrc_l,0x55);
947 vsrc0 = _mm256_blend_epi16(vzero,vsrc,0x55);
948 vsrc10 = _mm256_blend_epi16(vzero,vsrc,0xAA);
949 vsrc10 = _mm256_srli_epi32(vsrc10,16); // 1 3 5 7 32 Bit
950 vsrc0 = _mm256_slli_epi32 (vsrc0,1); // 0 2 4 6 *2
951
952 vsrc0 = _mm256_add_epi32(vsrc0,vsrc10);
953 __m256i vdst11 = _mm256_add_epi32(vsrc0,vsrc01); // dst 4 5 6 7 32 Bit, bottom row
954
955 vdst0 = _mm256_add_epi32(vdst0,vdst01);
956 vdst1 = _mm256_add_epi32(vdst1,vdst11);
957 vdst0 = _mm256_add_epi32(vdst0,vfour);
958 vdst1 = _mm256_add_epi32(vdst1,vfour);
959 vdst0 = _mm256_srli_epi32(vdst0,3);
960 vdst1 = _mm256_srli_epi32(vdst1,3);
961 vdst0 = _mm256_packus_epi32 (vdst0,vdst1); // 16 bit
962 vdst0 = _mm256_permute4x64_epi64(vdst0,0xd8);
963
964 _mm256_storeu_si256((__m256i*)&pDst0[x], vdst0);
965 // _mm_storeu_si128((__m128i*)&pDstTmp[x], vdst0);
966 }
967 pDst0 += iDstStride;
968 pRecSrc0 += (iRecStride<<1);
969 }
970 }
971 else
972 #endif
973 if( ( width & 7 ) == 0 ) // width>=8
974 {
975 __m128i vzero = _mm_set1_epi8(0);
976 __m128i vfour = _mm_set1_epi32(4);
977
978
979 for( int y = 0; y < height; y++ )
980 {
981
982 for( int x = 0; x < width; x += 8 )
983 {
984 int x2=x<<1;
985 __m128i vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2-1]); // -1 0 1 2 3 4 5 6
986 __m128i vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2]); // 0 1 2 3 4 5 6 7
987
988 __m128i vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55); // -1 1 3 5 32 Bit
989 __m128i vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55); // 0 2 4 6 32 Bit
990 __m128i vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA); // 1 3 5 7 32 Bit
991 vsrc10 = _mm_srli_epi32(vsrc10,16); // 1 3 5 7 32 Bit
992 vsrc0 = _mm_slli_epi32 (vsrc0,1); // 0 2 4 6 *2
993
994 vsrc0 = _mm_add_epi32(vsrc0,vsrc10);
995 __m128i vdst0 = _mm_add_epi32(vsrc0,vsrc01); // dst 0 1 2 3 32 Bit, bottom row still missing
996
997 vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 +7]); // 7 8 9 10 11 12 13 14
998 vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 8 ]); // 8 9 10 11 12 13 14 15
999
1000 x2+=(int)iRecStride;
1001
1002 vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);
1003 vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);
1004 vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);
1005 vsrc10 = _mm_srli_epi32(vsrc10,16); // 1 3 5 7 32 Bit
1006 vsrc0 = _mm_slli_epi32 (vsrc0,1); // 0 2 4 6 *2
1007
1008 vsrc0 = _mm_add_epi32(vsrc0,vsrc10);
1009 __m128i vdst1 = _mm_add_epi32(vsrc0,vsrc01); // dst 4 5 6 7 32 Bit, bottom row still missing
1010
1011 // now add the next row
1012 vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2-1]); // -1 0 1 2 3 4 5 6
1013 vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2]); // 0 1 2 3 4 5 6 7
1014
1015 vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55); // -1 1 3 5 32 Bit
1016 vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55); // 0 2 4 6 32 Bit
1017 vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA); // 1 3 5 7 32 Bit
1018 vsrc10 = _mm_srli_epi32(vsrc10,16); // 1 3 5 7 32 Bit
1019 vsrc0 = _mm_slli_epi32 (vsrc0,1); // 0 2 4 6 *2
1020
1021 vsrc0 = _mm_add_epi32(vsrc0,vsrc10);
1022 __m128i vdst01 = _mm_add_epi32(vsrc0,vsrc01); // dst 0 1 2 3 32 Bit, bottom row
1023
1024 vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 7]); // 7 8 9 10 11 12 13 14
1025 vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[x2 + 8 ]); // 8 9 10 11 12 13 14 15
1026
1027 vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55);
1028 vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55);
1029 vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA);
1030 vsrc10 = _mm_srli_epi32(vsrc10,16); // 1 3 5 7 32 Bit
1031 vsrc0 = _mm_slli_epi32 (vsrc0,1); // 0 2 4 6 *2
1032
1033 vsrc0 = _mm_add_epi32(vsrc0,vsrc10);
1034 __m128i vdst11 = _mm_add_epi32(vsrc0,vsrc01); // dst 4 5 6 7 32 Bit, bottom row
1035
1036 vdst0 = _mm_add_epi32(vdst0,vdst01);
1037 vdst1 = _mm_add_epi32(vdst1,vdst11);
1038 vdst0 = _mm_add_epi32(vdst0,vfour);
1039 vdst1 = _mm_add_epi32(vdst1,vfour);
1040 vdst0 = _mm_srli_epi32(vdst0,3);
1041 vdst1 = _mm_srli_epi32(vdst1,3);
1042 vdst0 = _mm_packus_epi32 (vdst0,vdst1); // 16 bit
1043
1044 _mm_storeu_si128((__m128i*)&pDst0[x], vdst0);
1045 // _mm_storeu_si128((__m128i*)&pDstTmp[x], vdst0);
1046 }
1047 pDst0 += iDstStride;
1048 pRecSrc0 += (iRecStride<<1);
1049 }
1050 }
1051 else // width<=4
1052 {
1053 __m128i vzero = _mm_set1_epi8(0);
1054 __m128i vfour = _mm_set1_epi32(4);
1055
1056 for( int y = 0; y < height; y++ )
1057 {
1058 __m128i vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[-1]); // -1 0 1 2 3 4 5 6
1059 __m128i vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[0]); // 0 1 2 3 4 5 6 7
1060
1061 __m128i vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55); // -1 1 3 5 32 Bit
1062 __m128i vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55); // 0 2 4 6 32 Bit
1063 __m128i vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA); // 1 3 5 7 32 Bit
1064 vsrc10 = _mm_srli_epi32(vsrc10,16); // 1 3 5 7 32 Bit
1065 vsrc0 = _mm_slli_epi32 (vsrc0,1); // 0 2 4 6 *2
1066
1067 vsrc0 = _mm_add_epi32(vsrc0,vsrc10);
1068 __m128i vdst0 = _mm_add_epi32(vsrc0,vsrc01); // dst 0 1 2 3 32 Bit, bottom row still missing
1069
1070 // now add the next row
1071 vsrc_l = _mm_loadu_si128((__m128i*)&pRecSrc0[iRecStride-1]); // -1 0 1 2 3 4 5 6
1072 vsrc = _mm_loadu_si128((__m128i*)&pRecSrc0[iRecStride]); // 0 1 2 3 4 5 6 7
1073
1074 vsrc01 = _mm_blend_epi16(vzero,vsrc_l,0x55); // -1 1 3 5 32 Bit
1075 vsrc0 = _mm_blend_epi16(vzero,vsrc,0x55); // 0 2 4 6 32 Bit
1076 vsrc10 = _mm_blend_epi16(vzero,vsrc,0xAA); // 1 3 5 7 32 Bit
1077 vsrc10 = _mm_srli_epi32(vsrc10,16); // 1 3 5 7 32 Bit
1078 vsrc0 = _mm_slli_epi32 (vsrc0,1); // 0 2 4 6 *2
1079
1080 vsrc0 = _mm_add_epi32(vsrc0,vsrc10);
1081 __m128i vdst01 = _mm_add_epi32(vsrc0,vsrc01); // dst 0 1 2 3 32 Bit, bottom row
1082
1083
1084 vdst0 = _mm_add_epi32(vdst0,vdst01);
1085 vdst0 = _mm_add_epi32(vdst0,vfour);
1086 vdst0 = _mm_srli_epi32(vdst0,3);
1087 vdst0 = _mm_packus_epi32 (vdst0,vdst0); // 16 bit
1088
1089 if (width==4)
1090 _mm_storel_epi64(( __m128i * )&pDst0[0], (vdst0) );
1091 else if (width==2)
1092 _mm_storeu_si32(( __m128i * )&pDst0[0], (vdst0) );
1093 else
1094 {
1095 int tmp = _mm_cvtsi128_si32(vdst0);
1096 pDst0[0] = (Pel) tmp;
1097 }
1098
1099 pDst0 += iDstStride;
1100 pRecSrc0 += (iRecStride<<1);
1101 }
1102 }
1103 }
1104
1105
1106
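// Binds the IntraPrediction function pointers to the SIMD specializations above; the W template
// argument selects the block-width variant (4 vs. 8 for angular prediction, 8 vs. 16 for the PDPC filter).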
1107 template<X86_VEXT vext>
1108 void IntraPrediction::_initIntraPredictionX86()
1109 {
1110 IntraPredAngleCore4 = IntraPredAngleCore_SIMD<vext, 4>;
1111 IntraPredAngleCore8 = IntraPredAngleCore_SIMD<vext, 8>;
1112 IntraPredAngleChroma4 = IntraPredAngleChroma_SIMD<vext, 4>;
1113 IntraPredAngleChroma8 = IntraPredAngleChroma_SIMD<vext, 8>;
1114
1115 IntraPredSampleFilter8 = IntraPredSampleFilter_SIMD<vext, 8>;
1116 IntraPredSampleFilter16 = IntraPredSampleFilter_SIMD<vext, 16>;
1117
1118 xPredIntraPlanar = xPredIntraPlanar_SIMD<vext>;
1119
1120 GetLumaRecPixel420 = GetLumaRecPixel420SIMD<vext>;
1121
1122 }
1123 template void IntraPrediction::_initIntraPredictionX86<SIMDX86>();
1124
1125 #endif // TARGET_SIMD_X86
1126 #endif
1127
1128 }
1129