1 /*
2 * Copyright(c) 2018 Intel Corporation
 * SPDX-License-Identifier: BSD-2-Clause-Patent
4 */
5
6 #include "EbPictureOperators_SSE2.h"
7 #include <emmintrin.h>
8 #include "EbDefinitions.h"
9
/*
 * Load 64 bits from *p into the upper half of x, leaving the lower half
 * of x unchanged. SSE2 has no integer "load high" instruction; instead of
 * routing through the double-precision _mm_loadh_pd, this uses an integer
 * movq load followed by a 64-bit interleave, which produces the same bits.
 */
static __m128i _mm_loadh_epi64(__m128i x, __m128i *p)
{
    return _mm_unpacklo_epi64(x, _mm_loadl_epi64(p));
}
14
15 // Note: maximum energy within 1 TU considered to be 2^30-e
16 // All functions can accumulate up to 4 TUs to stay within 32-bit unsigned range
17
18 //-------
/*
 * FullDistortionKernel4x4_32bit_BT_SSE2
 * For a 4x4 block of 16-bit coefficients, computes:
 *   distortionResult[0] = sum of (coeff - reconCoeff)^2   (SSD vs. reconstruction)
 *   distortionResult[1] = sum of coeff^2                  (coefficient energy)
 * Both totals are stored as 64-bit values with the upper 32 bits zeroed,
 * so accumulation must stay within 32 bits (see the TU-energy note above).
 * Strides are in units of 16-bit elements.
 * areaWidth/areaHeight are unused — the block size is fixed at 4x4.
 */
void FullDistortionKernel4x4_32bit_BT_SSE2(
    EB_S16  *coeff,
    EB_U32   coeffStride,
    EB_S16  *reconCoeff,
    EB_U32   reconCoeffStride,
    EB_U64   distortionResult[2],
    EB_U32   areaWidth,
    EB_U32   areaHeight)
{
    EB_S32 rowCount;
    __m128i sum = _mm_setzero_si128();   // accumulates squared residuals
    __m128i sum2 = _mm_setzero_si128();  // accumulates squared coefficients

    rowCount = 2;                        // each iteration consumes two 4-sample rows
    do
    {
        __m128i x0;
        __m128i y0;
        __m128i z0;

        // Pack two 4x16-bit rows into one 128-bit register (low half, high half).
        x0 = _mm_loadl_epi64((__m128i *)coeff); coeff += coeffStride;
        x0 = _mm_loadh_epi64(x0, (__m128i *)coeff); coeff += coeffStride;
        y0 = _mm_loadl_epi64((__m128i *)reconCoeff); reconCoeff += reconCoeffStride;
        y0 = _mm_loadh_epi64(y0, (__m128i *)reconCoeff); reconCoeff += reconCoeffStride;

        z0 = _mm_madd_epi16(x0, x0);     // pairwise coeff^2 summed into 32-bit lanes

        sum2 = _mm_add_epi32(sum2, z0);

        x0 = _mm_sub_epi16(x0, y0);      // residual

        x0 = _mm_madd_epi16(x0, x0);     // pairwise residual^2

        sum = _mm_add_epi32(sum, x0);
    }
    while (--rowCount);

    // Horizontal reduction: fold the four 32-bit lanes of each accumulator,
    // interleave the two totals, and store them zero-extended to 64 bits.
    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4e)); // 01001110
    sum2 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 0x4e)); // 01001110
    sum = _mm_unpacklo_epi32(sum, sum2);
    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4e)); // 01001110
    _mm_storeu_si128((__m128i *)distortionResult, _mm_unpacklo_epi32(sum, _mm_setzero_si128()));

    (void)areaWidth;
    (void)areaHeight;
}
65
/*
 * FullDistortionKernel8x8_32bit_BT_SSE2
 * For an 8x8 block of 16-bit coefficients, computes:
 *   distortionResult[0] = sum of (coeff - reconCoeff)^2
 *   distortionResult[1] = sum of coeff^2
 * Both are stored as 64-bit values with the upper 32 bits zeroed.
 * Strides are in 16-bit elements; areaWidth/areaHeight are unused (fixed 8x8).
 */
void FullDistortionKernel8x8_32bit_BT_SSE2(
    EB_S16  *coeff,
    EB_U32   coeffStride,
    EB_S16  *reconCoeff,
    EB_U32   reconCoeffStride,
    EB_U64   distortionResult[2],
    EB_U32   areaWidth,
    EB_U32   areaHeight)
{
    __m128i distAcc = _mm_setzero_si128();    // squared residuals
    __m128i energyAcc = _mm_setzero_si128();  // squared coefficients
    EB_S32 row;

    for (row = 0; row < 8; ++row) {
        const __m128i c = _mm_loadu_si128((__m128i *)coeff);
        const __m128i r = _mm_loadu_si128((__m128i *)reconCoeff);
        const __m128i diff = _mm_sub_epi16(c, r);

        // madd gives pairwise products summed into 32-bit lanes.
        energyAcc = _mm_add_epi32(energyAcc, _mm_madd_epi16(c, c));
        distAcc = _mm_add_epi32(distAcc, _mm_madd_epi16(diff, diff));

        coeff += coeffStride;
        reconCoeff += reconCoeffStride;
    }

    // Fold lanes, interleave the two totals, store zero-extended to 64 bits.
    distAcc = _mm_add_epi32(distAcc, _mm_shuffle_epi32(distAcc, 0x4e));
    energyAcc = _mm_add_epi32(energyAcc, _mm_shuffle_epi32(energyAcc, 0x4e));
    distAcc = _mm_unpacklo_epi32(distAcc, energyAcc);
    distAcc = _mm_add_epi32(distAcc, _mm_shuffle_epi32(distAcc, 0x4e));
    _mm_storeu_si128((__m128i *)distortionResult, _mm_unpacklo_epi32(distAcc, _mm_setzero_si128()));

    (void)areaWidth;
    (void)areaHeight;
}
113
/*
 * FullDistortionKernel16MxN_32bit_BT_SSE2
 * For a WxH block of 16-bit coefficients where areaWidth is a multiple of 16
 * (the column loop consumes 16 coefficients per stripe), computes:
 *   distortionResult[0] = sum of (coeff - reconCoeff)^2
 *   distortionResult[1] = sum of coeff^2
 * Both are stored as 64-bit values with the upper 32 bits zeroed, so the
 * accumulated energy must stay within 32 bits (see the TU-energy note above).
 * Strides are in 16-bit elements.
 */
void FullDistortionKernel16MxN_32bit_BT_SSE2(
    EB_S16  *coeff,
    EB_U32   coeffStride,
    EB_S16  *reconCoeff,
    EB_U32   reconCoeffStride,
    EB_U64   distortionResult[2],
    EB_U32   areaWidth,
    EB_U32   areaHeight)
{
    EB_S32 rowCount, colCount;
    __m128i sum = _mm_setzero_si128();   // accumulates squared residuals
    __m128i sum2 = _mm_setzero_si128();  // accumulates squared coefficients

    colCount = areaWidth;
    do
    {
        // Walk one 16-column stripe from top to bottom.
        EB_S16 *coeffTemp = coeff;
        EB_S16 *reconCoeffTemp = reconCoeff;

        rowCount = areaHeight;
        do
        {
            __m128i x0, x1;
            __m128i y0, y1;
            __m128i z0, z1;

            x0 = _mm_loadu_si128((__m128i *)(coeffTemp + 0x00));
            x1 = _mm_loadu_si128((__m128i *)(coeffTemp + 0x08));
            y0 = _mm_loadu_si128((__m128i *)(reconCoeffTemp + 0x00));
            y1 = _mm_loadu_si128((__m128i *)(reconCoeffTemp + 0x08));
            coeffTemp += coeffStride;
            reconCoeffTemp += reconCoeffStride;

            z0 = _mm_madd_epi16(x0, x0);   // pairwise coeff^2 -> 32-bit lanes
            z1 = _mm_madd_epi16(x1, x1);

            sum2 = _mm_add_epi32(sum2, z0);
            sum2 = _mm_add_epi32(sum2, z1);

            x0 = _mm_sub_epi16(x0, y0);    // residuals
            x1 = _mm_sub_epi16(x1, y1);

            x0 = _mm_madd_epi16(x0, x0);   // pairwise residual^2
            x1 = _mm_madd_epi16(x1, x1);

            sum = _mm_add_epi32(sum, x0);
            sum = _mm_add_epi32(sum, x1);
        }
        while (--rowCount);

        // Advance to the next 16-column stripe.
        coeff += 16;
        reconCoeff += 16;
        colCount -= 16;
    }
    while (colCount > 0);

    // Fold each accumulator's four 32-bit lanes, interleave the two totals,
    // and store them zero-extended to 64 bits.
    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4e)); // 01001110
    sum2 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 0x4e)); // 01001110
    sum = _mm_unpacklo_epi32(sum, sum2);
    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4e)); // 01001110
    _mm_storeu_si128((__m128i *)distortionResult, _mm_unpacklo_epi32(sum, _mm_setzero_si128()));
}
176
177
/*
 * FullDistortionKernelIntra4x4_32bit_BT_SSE2
 * SSD between a 4x4 block of 16-bit coefficients and its reconstruction.
 * The same SSD total is written to both distortionResult[0] and [1]
 * (64-bit slots, upper 32 bits zeroed).
 * Strides are in 16-bit elements; areaWidth/areaHeight are unused.
 */
void FullDistortionKernelIntra4x4_32bit_BT_SSE2(
    EB_S16  *coeff,
    EB_U32   coeffStride,
    EB_S16  *reconCoeff,
    EB_U32   reconCoeffStride,
    EB_U64   distortionResult[2],
    EB_U32   areaWidth,
    EB_U32   areaHeight)
{
    __m128i ssd = _mm_setzero_si128();
    EB_S32 pair;

    // Each iteration packs two 4-sample rows into one register.
    for (pair = 0; pair < 2; ++pair) {
        __m128i c, r;

        c = _mm_loadl_epi64((__m128i *)coeff);
        coeff += coeffStride;
        c = _mm_loadh_epi64(c, (__m128i *)coeff);
        coeff += coeffStride;

        r = _mm_loadl_epi64((__m128i *)reconCoeff);
        reconCoeff += reconCoeffStride;
        r = _mm_loadh_epi64(r, (__m128i *)reconCoeff);
        reconCoeff += reconCoeffStride;

        c = _mm_sub_epi16(c, r);
        ssd = _mm_add_epi32(ssd, _mm_madd_epi16(c, c));
    }

    // Fold lanes; both 64-bit result slots receive the same total.
    ssd = _mm_add_epi32(ssd, _mm_shuffle_epi32(ssd, 0x4e));
    ssd = _mm_unpacklo_epi32(ssd, ssd);
    ssd = _mm_add_epi32(ssd, _mm_shuffle_epi32(ssd, 0x4e));
    _mm_storeu_si128((__m128i *)distortionResult, _mm_unpacklo_epi32(ssd, _mm_setzero_si128()));

    (void)areaWidth;
    (void)areaHeight;
}
218
/*
 * FullDistortionKernelIntra8x8_32bit_BT_SSE2
 * SSD between an 8x8 block of 16-bit coefficients and its reconstruction.
 * The same SSD total is written to both distortionResult[0] and [1]
 * (64-bit slots, upper 32 bits zeroed).
 * Strides are in 16-bit elements; areaWidth/areaHeight are unused.
 */
void FullDistortionKernelIntra8x8_32bit_BT_SSE2(
    EB_S16  *coeff,
    EB_U32   coeffStride,
    EB_S16  *reconCoeff,
    EB_U32   reconCoeffStride,
    EB_U64   distortionResult[2],
    EB_U32   areaWidth,
    EB_U32   areaHeight)
{
    __m128i ssd = _mm_setzero_si128();
    EB_S32 row;

    for (row = 0; row < 8; ++row) {
        const __m128i c = _mm_loadu_si128((__m128i *)coeff);
        const __m128i r = _mm_loadu_si128((__m128i *)reconCoeff);
        const __m128i diff = _mm_sub_epi16(c, r);

        ssd = _mm_add_epi32(ssd, _mm_madd_epi16(diff, diff));

        coeff += coeffStride;
        reconCoeff += reconCoeffStride;
    }

    // Fold lanes; both 64-bit result slots receive the same total.
    ssd = _mm_add_epi32(ssd, _mm_shuffle_epi32(ssd, 0x4e));
    ssd = _mm_unpacklo_epi32(ssd, ssd);
    ssd = _mm_add_epi32(ssd, _mm_shuffle_epi32(ssd, 0x4e));
    _mm_storeu_si128((__m128i *)distortionResult, _mm_unpacklo_epi32(ssd, _mm_setzero_si128()));

    (void)areaWidth;
    (void)areaHeight;
}
259
/*
 * FullDistortionKernelIntra16MxN_32bit_BT_SSE2
 * SSD between a WxH block of 16-bit coefficients and its reconstruction,
 * where areaWidth is a multiple of 16 (16 coefficients per column stripe).
 * The same SSD total is written to both distortionResult[0] and [1]
 * (64-bit slots, upper 32 bits zeroed).
 * Strides are in 16-bit elements.
 */
void FullDistortionKernelIntra16MxN_32bit_BT_SSE2(
    EB_S16  *coeff,
    EB_U32   coeffStride,
    EB_S16  *reconCoeff,
    EB_U32   reconCoeffStride,
    EB_U64   distortionResult[2],
    EB_U32   areaWidth,
    EB_U32   areaHeight)
{
    EB_S32 rowCount, colCount;
    __m128i sum = _mm_setzero_si128();   // accumulates squared residuals

    colCount = areaWidth;
    do
    {
        // Walk one 16-column stripe from top to bottom.
        EB_S16 *coeffTemp = coeff;
        EB_S16 *reconCoeffTemp = reconCoeff;

        rowCount = areaHeight;
        do
        {
            __m128i x0, x1;
            __m128i y0, y1;

            x0 = _mm_loadu_si128((__m128i *)(coeffTemp + 0x00));
            x1 = _mm_loadu_si128((__m128i *)(coeffTemp + 0x08));
            y0 = _mm_loadu_si128((__m128i *)(reconCoeffTemp + 0x00));
            y1 = _mm_loadu_si128((__m128i *)(reconCoeffTemp + 0x08));
            coeffTemp += coeffStride;
            reconCoeffTemp += reconCoeffStride;

            x0 = _mm_sub_epi16(x0, y0);    // residuals
            x1 = _mm_sub_epi16(x1, y1);

            x0 = _mm_madd_epi16(x0, x0);   // pairwise residual^2 -> 32-bit lanes
            x1 = _mm_madd_epi16(x1, x1);

            sum = _mm_add_epi32(sum, x0);
            sum = _mm_add_epi32(sum, x1);
        }
        while (--rowCount);

        // Advance to the next 16-column stripe.
        coeff += 16;
        reconCoeff += 16;
        colCount -= 16;
    }
    while (colCount > 0);

    // Fold lanes; both 64-bit result slots receive the same total.
    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4e)); // 01001110
    sum = _mm_unpacklo_epi32(sum, sum);
    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4e)); // 01001110
    _mm_storeu_si128((__m128i *)distortionResult, _mm_unpacklo_epi32(sum, _mm_setzero_si128()));
}
313
314
315
/*
 * FullDistortionKernelCbfZero4x4_32bit_BT_SSE2
 * Coefficient energy of a 4x4 block for the all-zero-CBF case: since the
 * reconstruction is zero, the distortion equals sum of coeff^2. That total
 * is written to both distortionResult[0] and [1] (64-bit slots, upper 32
 * bits zeroed). reconCoeff/reconCoeffStride and the area parameters are
 * accepted for signature parity but unused.
 */
void FullDistortionKernelCbfZero4x4_32bit_BT_SSE2(
    EB_S16  *coeff,
    EB_U32   coeffStride,
    EB_S16  *reconCoeff,
    EB_U32   reconCoeffStride,
    EB_U64   distortionResult[2],
    EB_U32   areaWidth,
    EB_U32   areaHeight)
{
    __m128i energy = _mm_setzero_si128();
    EB_S32 pair;

    // Each iteration packs two 4-sample rows into one register.
    for (pair = 0; pair < 2; ++pair) {
        __m128i c;

        c = _mm_loadl_epi64((__m128i *)coeff);
        coeff += coeffStride;
        c = _mm_loadh_epi64(c, (__m128i *)coeff);
        coeff += coeffStride;

        energy = _mm_add_epi32(energy, _mm_madd_epi16(c, c));
    }

    // Fold lanes; both 64-bit result slots receive the same total.
    energy = _mm_add_epi32(energy, _mm_shuffle_epi32(energy, 0x4e));
    energy = _mm_unpacklo_epi32(energy, energy);
    energy = _mm_add_epi32(energy, _mm_shuffle_epi32(energy, 0x4e));
    _mm_storeu_si128((__m128i *)distortionResult, _mm_unpacklo_epi32(energy, _mm_setzero_si128()));

    (void)areaWidth;
    (void)areaHeight;
    (void)reconCoeff;
    (void)reconCoeffStride;
}
353
/*
 * FullDistortionKernelCbfZero8x8_32bit_BT_SSE2
 * Coefficient energy of an 8x8 block for the all-zero-CBF case: the
 * distortion equals sum of coeff^2. That total is written to both
 * distortionResult[0] and [1] (64-bit slots, upper 32 bits zeroed).
 * reconCoeff/reconCoeffStride and the area parameters are unused.
 */
void FullDistortionKernelCbfZero8x8_32bit_BT_SSE2(
    EB_S16  *coeff,
    EB_U32   coeffStride,
    EB_S16  *reconCoeff,
    EB_U32   reconCoeffStride,
    EB_U64   distortionResult[2],
    EB_U32   areaWidth,
    EB_U32   areaHeight)
{
    __m128i energy = _mm_setzero_si128();
    EB_S32 row;

    for (row = 0; row < 8; ++row) {
        const __m128i c = _mm_loadu_si128((__m128i *)coeff);

        energy = _mm_add_epi32(energy, _mm_madd_epi16(c, c));
        coeff += coeffStride;
    }

    // Fold lanes; both 64-bit result slots receive the same total.
    energy = _mm_add_epi32(energy, _mm_shuffle_epi32(energy, 0x4e));
    energy = _mm_unpacklo_epi32(energy, energy);
    energy = _mm_add_epi32(energy, _mm_shuffle_epi32(energy, 0x4e));
    _mm_storeu_si128((__m128i *)distortionResult, _mm_unpacklo_epi32(energy, _mm_setzero_si128()));

    (void)areaWidth;
    (void)areaHeight;
    (void)reconCoeff;
    (void)reconCoeffStride;
}
391
/*
 * FullDistortionKernelCbfZero16MxN_32bit_BT_SSE2
 * Coefficient energy of a WxH block (areaWidth a multiple of 16) for the
 * all-zero-CBF case: the distortion equals sum of coeff^2. That total is
 * written to both distortionResult[0] and [1] (64-bit slots, upper 32 bits
 * zeroed). Strides are in 16-bit elements.
 * NOTE(review): reconCoeff is advanced alongside coeff but never read —
 * harmless, kept for symmetry with the non-CbfZero kernel.
 */
void FullDistortionKernelCbfZero16MxN_32bit_BT_SSE2(
    EB_S16  *coeff,
    EB_U32   coeffStride,
    EB_S16  *reconCoeff,
    EB_U32   reconCoeffStride,
    EB_U64   distortionResult[2],
    EB_U32   areaWidth,
    EB_U32   areaHeight)
{
    EB_S32 rowCount, colCount;
    __m128i sum2 = _mm_setzero_si128();  // accumulates squared coefficients

    colCount = areaWidth;
    do
    {
        // Walk one 16-column stripe from top to bottom.
        EB_S16 *coeffTemp = coeff;

        rowCount = areaHeight;
        do
        {
            __m128i x0, x1;
            __m128i z0, z1;

            x0 = _mm_loadu_si128((__m128i *)(coeffTemp + 0x00));
            x1 = _mm_loadu_si128((__m128i *)(coeffTemp + 0x08));
            coeffTemp += coeffStride;

            z0 = _mm_madd_epi16(x0, x0);   // pairwise coeff^2 -> 32-bit lanes
            z1 = _mm_madd_epi16(x1, x1);

            sum2 = _mm_add_epi32(sum2, z0);
            sum2 = _mm_add_epi32(sum2, z1);
        }
        while (--rowCount);

        // Advance to the next 16-column stripe.
        coeff += 16;
        reconCoeff += 16;
        colCount -= 16;
    }
    while (colCount > 0);

    // Fold lanes; both 64-bit result slots receive the same total.
    sum2 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 0x4e)); // 01001110
    sum2 = _mm_unpacklo_epi32(sum2, sum2);
    sum2 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 0x4e)); // 01001110
    _mm_storeu_si128((__m128i *)distortionResult, _mm_unpacklo_epi32(sum2, _mm_setzero_si128()));
    (void)reconCoeffStride;

}
440
441 /*******************************************************************************
442 PictureCopyKernel_INTRIN
443 *******************************************************************************/
/*
 * Copy a 4x4 block of bytes, one 32-bit word per row.
 * Strides are in bytes; areaWidth/areaHeight are unused (fixed 4x4).
 * NOTE(review): rows are moved via *(EB_U32 *) type punning, which relies
 * on x86's tolerance of unaligned/aliased accesses — confirm callers never
 * pass pointers where this matters on other targets.
 */
void PictureCopyKernel4x4_SSE_INTRIN(
    EB_BYTE  src,
    EB_U32   srcStride,
    EB_BYTE  dst,
    EB_U32   dstStride,
    EB_U32   areaWidth,
    EB_U32   areaHeight)
{
    *(EB_U32 *)dst = *(EB_U32 *)src;
    *(EB_U32 *)(dst + dstStride) = *(EB_U32 *)(src + srcStride);
    *(EB_U32 *)(dst + (dstStride << 1)) = *(EB_U32 *)(src + (srcStride << 1));
    *(EB_U32 *)(dst + (dstStride * 3)) = *(EB_U32 *)(src + (srcStride * 3));

    (void)areaWidth;
    (void)areaHeight;

    return;
}
462
/*
 * Copy an 8x8 block of bytes, 8 bytes (one row) per move.
 * Strides are in bytes; areaWidth/areaHeight are unused (fixed 8x8).
 *
 * Fixes vs. the previous version:
 *  - the destination rows inside each 4-row group were offset with
 *    srcStride instead of dstStride (while the group advance correctly
 *    used dstStride), producing wrong output whenever the two strides
 *    differ and diverging from every sibling copy kernel in this file;
 *  - rows are now moved with movq intrinsics (_mm_loadl_epi64 /
 *    _mm_storel_epi64) instead of *(EB_U64 *) type punning, which also
 *    removes the 64-bit-only _mm_cvtsi64_si128 dependency.
 */
void PictureCopyKernel8x8_SSE2_INTRIN(
    EB_BYTE  src,
    EB_U32   srcStride,
    EB_BYTE  dst,
    EB_U32   dstStride,
    EB_U32   areaWidth,
    EB_U32   areaHeight)
{
    EB_U32 y;

    for (y = 0; y < 8; ++y) {
        _mm_storel_epi64((__m128i *)dst, _mm_loadl_epi64((__m128i *)src));
        src += srcStride;
        dst += dstStride;
    }

    (void)areaWidth;
    (void)areaHeight;

    return;
}
489
/*
 * Copy a 16x16 block of bytes, one 16-byte unaligned move per row.
 * Strides are in bytes; areaWidth/areaHeight are unused (fixed 16x16).
 */
void PictureCopyKernel16x16_SSE2_INTRIN(
    EB_BYTE  src,
    EB_U32   srcStride,
    EB_BYTE  dst,
    EB_U32   dstStride,
    EB_U32   areaWidth,
    EB_U32   areaHeight)
{
    EB_U32 row;

    for (row = 0; row < 16; ++row) {
        _mm_storeu_si128((__m128i *)dst, _mm_loadu_si128((__m128i *)src));
        src += srcStride;
        dst += dstStride;
    }

    (void)areaWidth;
    (void)areaHeight;

    return;
}
535
536
/*
 * Copy a 32x32 block of bytes, two 16-byte unaligned moves per row.
 * Strides are in bytes; areaWidth/areaHeight are unused (fixed 32x32).
 */
void PictureCopyKernel32x32_SSE2_INTRIN(
    EB_BYTE  src,
    EB_U32   srcStride,
    EB_BYTE  dst,
    EB_U32   dstStride,
    EB_U32   areaWidth,
    EB_U32   areaHeight)
{
    EB_U32 row;

    for (row = 0; row < 32; ++row) {
        _mm_storeu_si128((__m128i *)dst, _mm_loadu_si128((__m128i *)src));
        _mm_storeu_si128((__m128i *)(dst + 16), _mm_loadu_si128((__m128i *)(src + 16)));
        src += srcStride;
        dst += dstStride;
    }

    (void)areaWidth;
    (void)areaHeight;

    return;
}
578
/*
 * Copy a 64x64 block of bytes, four 16-byte unaligned moves per row.
 * Strides are in bytes; areaWidth/areaHeight are unused (fixed 64x64).
 */
void PictureCopyKernel64x64_SSE2_INTRIN(
    EB_BYTE  src,
    EB_U32   srcStride,
    EB_BYTE  dst,
    EB_U32   dstStride,
    EB_U32   areaWidth,
    EB_U32   areaHeight)
{
    EB_U32 row;

    for (row = 0; row < 64; ++row) {
        _mm_storeu_si128((__m128i *)dst, _mm_loadu_si128((__m128i *)src));
        _mm_storeu_si128((__m128i *)(dst + 16), _mm_loadu_si128((__m128i *)(src + 16)));
        _mm_storeu_si128((__m128i *)(dst + 32), _mm_loadu_si128((__m128i *)(src + 32)));
        _mm_storeu_si128((__m128i *)(dst + 48), _mm_loadu_si128((__m128i *)(src + 48)));
        src += srcStride;
        dst += dstStride;
    }

    (void)areaWidth;
    (void)areaHeight;

    return;
}
636 /*******************************************************************************
637 PictureAdditionKernel_INTRIN
638 *******************************************************************************/
/*
 * Reconstruct a 4x4 block: recon = clip_to_u8(pred + residual).
 * pred/recon strides are in bytes, residual stride in 16-bit elements.
 * width/height are unused (fixed 4x4).
 * NOTE(review): pred is read and recon written via *(EB_U32 *) punning —
 * relies on x86 unaligned-access tolerance.
 */
void PictureAdditionKernel4x4_SSE_INTRIN(
    EB_U8  *predPtr,
    EB_U32  predStride,
    EB_S16 *residualPtr,
    EB_U32  residualStride,
    EB_U8  *reconPtr,
    EB_U32  reconStride,
    EB_U32  width,
    EB_U32  height)
{
    EB_U32 y;
    __m128i xmm0, recon_0_3;
    xmm0 = _mm_setzero_si128();

    for (y = 0; y < 4; ++y){
        // Widen 4 pred bytes to 16-bit, add 4 residuals, saturate back to U8.
        recon_0_3 = _mm_packus_epi16(_mm_add_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(EB_U32 *)predPtr), xmm0), _mm_loadl_epi64((__m128i *)residualPtr)), xmm0);

        // Store the 4 reconstructed bytes.
        *(EB_U32 *)reconPtr = _mm_cvtsi128_si32(recon_0_3);
        predPtr += predStride;
        residualPtr += residualStride;
        reconPtr += reconStride;
    }
    (void)width;
    (void)height;

    return;
}
667
/*
 * Reconstruct an 8x8 block: recon = clip_to_u8(pred + residual).
 * pred/recon strides are in bytes, residual stride in 16-bit elements.
 * width/height are unused (fixed 8x8).
 *
 * Fix: the row store now uses _mm_storel_epi64 (movq) instead of
 * *(EB_U64 *)reconPtr = _mm_cvtsi128_si64(...). The written bytes are
 * identical, but _mm_cvtsi128_si64 is unavailable on 32-bit x86 targets
 * and the pointer punning violated strict aliasing.
 */
void PictureAdditionKernel8x8_SSE2_INTRIN(
    EB_U8  *predPtr,
    EB_U32  predStride,
    EB_S16 *residualPtr,
    EB_U32  residualStride,
    EB_U8  *reconPtr,
    EB_U32  reconStride,
    EB_U32  width,
    EB_U32  height)
{
    __m128i recon_0_7, zero;
    EB_U32 y;

    zero = _mm_setzero_si128();

    for (y = 0; y < 8; ++y){
        // Widen 8 pred bytes to 16-bit, add the residual row, saturate to U8.
        recon_0_7 = _mm_packus_epi16(
            _mm_add_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)predPtr), zero),
                          _mm_loadu_si128((__m128i *)residualPtr)),
            zero);

        _mm_storel_epi64((__m128i *)reconPtr, recon_0_7);

        predPtr += predStride;
        residualPtr += residualStride;
        reconPtr += reconStride;
    }
    (void)width;
    (void)height;

    return;
}
698
/*
 * Reconstruct a 16x16 block: recon = clip_to_u8(pred + residual).
 * pred/recon strides are in bytes, residual stride in 16-bit elements.
 * width/height are unused (fixed 16x16).
 */
void PictureAdditionKernel16x16_SSE2_INTRIN(
    EB_U8  *predPtr,
    EB_U32  predStride,
    EB_S16 *residualPtr,
    EB_U32  residualStride,
    EB_U8  *reconPtr,
    EB_U32  reconStride,
    EB_U32  width,
    EB_U32  height)
{
    const __m128i zero = _mm_setzero_si128();
    EB_U32 row;

    for (row = 0; row < 16; ++row) {
        // Widen the 16 pred bytes to two 8x16-bit halves, add residuals,
        // then saturate both halves back down to one 16-byte row.
        const __m128i pred16 = _mm_loadu_si128((__m128i *)predPtr);
        const __m128i lo = _mm_add_epi16(_mm_unpacklo_epi8(pred16, zero),
                                         _mm_loadu_si128((__m128i *)residualPtr));
        const __m128i hi = _mm_add_epi16(_mm_unpackhi_epi8(pred16, zero),
                                         _mm_loadu_si128((__m128i *)(residualPtr + 8)));

        _mm_storeu_si128((__m128i *)reconPtr, _mm_packus_epi16(lo, hi));

        predPtr += predStride;
        residualPtr += residualStride;
        reconPtr += reconStride;
    }
    (void)width;
    (void)height;

    return;
}
/*
 * Reconstruct a 32x32 block: recon = clip_to_u8(pred + residual).
 * pred/recon strides are in bytes, residual stride in 16-bit elements.
 * width/height are unused (fixed 32x32).
 */
void PictureAdditionKernel32x32_SSE2_INTRIN(
    EB_U8  *predPtr,
    EB_U32  predStride,
    EB_S16 *residualPtr,
    EB_U32  residualStride,
    EB_U8  *reconPtr,
    EB_U32  reconStride,
    EB_U32  width,
    EB_U32  height)
{
    EB_U32 y;
    __m128i xmm0, pred_0_15, pred_16_31, recon_0_15_clipped, recon_0_7, recon_8_15, recon_16_23, recon_24_31, recon_16_31_clipped;
    xmm0 = _mm_setzero_si128();

    for (y = 0; y < 32; ++y){
        pred_0_15 = _mm_loadu_si128((__m128i *)predPtr);
        pred_16_31 = _mm_loadu_si128((__m128i *)(predPtr + 16));

        // Widen each group of 8 pred bytes to 16-bit and add the residuals.
        recon_0_7 = _mm_add_epi16(_mm_unpacklo_epi8(pred_0_15, xmm0), _mm_loadu_si128((__m128i *)residualPtr));
        recon_8_15 = _mm_add_epi16(_mm_unpackhi_epi8(pred_0_15, xmm0), _mm_loadu_si128((__m128i *)(residualPtr + 8)));
        recon_16_23 = _mm_add_epi16(_mm_unpacklo_epi8(pred_16_31, xmm0), _mm_loadu_si128((__m128i *)(residualPtr + 16)));
        recon_24_31 = _mm_add_epi16(_mm_unpackhi_epi8(pred_16_31, xmm0), _mm_loadu_si128((__m128i *)(residualPtr + 24)));

        // Saturate back to unsigned 8-bit.
        recon_0_15_clipped = _mm_packus_epi16(recon_0_7, recon_8_15);
        recon_16_31_clipped = _mm_packus_epi16(recon_16_23, recon_24_31);

        _mm_storeu_si128((__m128i*)reconPtr, recon_0_15_clipped);
        _mm_storeu_si128((__m128i*)(reconPtr + 16), recon_16_31_clipped);

        predPtr += predStride;
        residualPtr += residualStride;
        reconPtr += reconStride;
    }
    (void)width;
    (void)height;

    return;
}
771
/*
 * Reconstruct a 64x64 block: recon = clip_to_u8(pred + residual).
 * pred/recon strides are in bytes, residual stride in 16-bit elements.
 * width/height are unused (fixed 64x64).
 */
void PictureAdditionKernel64x64_SSE2_INTRIN(
    EB_U8  *predPtr,
    EB_U32  predStride,
    EB_S16 *residualPtr,
    EB_U32  residualStride,
    EB_U8  *reconPtr,
    EB_U32  reconStride,
    EB_U32  width,
    EB_U32  height)
{
    EB_U32 y;

    __m128i xmm0, pred_0_15, pred_16_31, pred_32_47, pred_48_63;
    __m128i recon_0_15_clipped, recon_16_31_clipped, recon_32_47_clipped, recon_48_63_clipped;
    __m128i recon_0_7, recon_8_15, recon_16_23, recon_24_31, recon_32_39, recon_40_47, recon_48_55, recon_56_63;

    xmm0 = _mm_setzero_si128();

    for (y = 0; y < 64; ++y){

        pred_0_15 = _mm_loadu_si128((__m128i *)predPtr);
        pred_16_31 = _mm_loadu_si128((__m128i *)(predPtr + 16));
        pred_32_47 = _mm_loadu_si128((__m128i *)(predPtr + 32));
        pred_48_63 = _mm_loadu_si128((__m128i *)(predPtr + 48));

        // Widen each group of 8 pred bytes to 16-bit and add the residuals.
        recon_0_7 = _mm_add_epi16(_mm_unpacklo_epi8(pred_0_15, xmm0), _mm_loadu_si128((__m128i *)residualPtr));
        recon_8_15 = _mm_add_epi16(_mm_unpackhi_epi8(pred_0_15, xmm0), _mm_loadu_si128((__m128i *)(residualPtr + 8)));
        recon_16_23 = _mm_add_epi16(_mm_unpacklo_epi8(pred_16_31, xmm0), _mm_loadu_si128((__m128i *)(residualPtr + 16)));
        recon_24_31 = _mm_add_epi16(_mm_unpackhi_epi8(pred_16_31, xmm0), _mm_loadu_si128((__m128i *)(residualPtr + 24)));
        recon_32_39 = _mm_add_epi16(_mm_unpacklo_epi8(pred_32_47, xmm0), _mm_loadu_si128((__m128i *)(residualPtr + 32)));
        recon_40_47 = _mm_add_epi16(_mm_unpackhi_epi8(pred_32_47, xmm0), _mm_loadu_si128((__m128i *)(residualPtr + 40)));
        recon_48_55 = _mm_add_epi16(_mm_unpacklo_epi8(pred_48_63, xmm0), _mm_loadu_si128((__m128i *)(residualPtr + 48)));
        recon_56_63 = _mm_add_epi16(_mm_unpackhi_epi8(pred_48_63, xmm0), _mm_loadu_si128((__m128i *)(residualPtr + 56)));

        // Saturate back to unsigned 8-bit.
        recon_0_15_clipped = _mm_packus_epi16(recon_0_7, recon_8_15);
        recon_16_31_clipped = _mm_packus_epi16(recon_16_23, recon_24_31);
        recon_32_47_clipped = _mm_packus_epi16(recon_32_39, recon_40_47);
        recon_48_63_clipped = _mm_packus_epi16(recon_48_55, recon_56_63);

        _mm_storeu_si128((__m128i*)reconPtr, recon_0_15_clipped);
        _mm_storeu_si128((__m128i*)(reconPtr + 16), recon_16_31_clipped);
        _mm_storeu_si128((__m128i*)(reconPtr + 32), recon_32_47_clipped);
        _mm_storeu_si128((__m128i*)(reconPtr + 48), recon_48_63_clipped);

        predPtr += predStride;
        residualPtr += residualStride;
        reconPtr += reconStride;
    }
    (void)width;
    (void)height;

    return;
}
825
826 /******************************************************************************************************
827 ResidualKernel
828 ***********************************************************************************************************/
829
/*
 * Compute a 4x4 residual: residual = (EB_S16)input - (EB_S16)pred.
 * input/pred strides are in bytes, residual stride in 16-bit elements.
 * areaWidth/areaHeight are unused (fixed 4x4).
 *
 * Fix: the row store now uses _mm_storel_epi64 (movq) instead of
 * *(EB_U64 *)residual = _mm_cvtsi128_si64(...). The 8 bytes written are
 * identical, but _mm_cvtsi128_si64 is unavailable on 32-bit x86 targets
 * and the pointer punning violated strict aliasing.
 */
void ResidualKernel4x4_SSE_INTRIN(
    EB_U8  *input,
    EB_U32  inputStride,
    EB_U8  *pred,
    EB_U32  predStride,
    EB_S16 *residual,
    EB_U32  residualStride,
    EB_U32  areaWidth,
    EB_U32  areaHeight)
{
    __m128i residual_0_3, zero = _mm_setzero_si128();
    EB_U32 y;

    for (y = 0; y < 4; ++y){
        // Widen 4 input and 4 pred bytes to 16-bit and subtract.
        residual_0_3 = _mm_sub_epi16(
            _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(EB_U32 *)input), zero),
            _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(EB_U32 *)pred), zero));

        _mm_storel_epi64((__m128i *)residual, residual_0_3);

        input += inputStride;
        pred += predStride;
        residual += residualStride;
    }
    (void)areaWidth;
    (void)areaHeight;

    return;
}
859
ResidualKernel8x8_SSE2_INTRIN(EB_U8 * input,EB_U32 inputStride,EB_U8 * pred,EB_U32 predStride,EB_S16 * residual,EB_U32 residualStride,EB_U32 areaWidth,EB_U32 areaHeight)860 void ResidualKernel8x8_SSE2_INTRIN(
861 EB_U8 *input,
862 EB_U32 inputStride,
863 EB_U8 *pred,
864 EB_U32 predStride,
865 EB_S16 *residual,
866 EB_U32 residualStride,
867 EB_U32 areaWidth,
868 EB_U32 areaHeight)
869 {
870 __m128i xmm0, residual_0_7;
871 EB_U32 y;
872
873 xmm0 = _mm_setzero_si128();
874
875 for (y = 0; y < 8; ++y){
876
877 residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)pred), xmm0));
878
879 _mm_storeu_si128((__m128i*)residual, residual_0_7);
880
881 input += inputStride;
882 pred += predStride;
883 residual += residualStride;
884 }
885 (void)areaWidth;
886 (void)areaHeight;
887
888 return;
889 }
890
ResidualKernel16x16_SSE2_INTRIN(EB_U8 * input,EB_U32 inputStride,EB_U8 * pred,EB_U32 predStride,EB_S16 * residual,EB_U32 residualStride,EB_U32 areaWidth,EB_U32 areaHeight)891 void ResidualKernel16x16_SSE2_INTRIN(
892 EB_U8 *input,
893 EB_U32 inputStride,
894 EB_U8 *pred,
895 EB_U32 predStride,
896 EB_S16 *residual,
897 EB_U32 residualStride,
898 EB_U32 areaWidth,
899 EB_U32 areaHeight)
900 {
901 __m128i xmm0, residual_0_7, residual_8_15;
902 EB_U32 y;
903
904 xmm0 = _mm_setzero_si128();
905
906 for (y = 0; y < 16; ++y){
907
908 residual_0_7 = _mm_sub_epi16( _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
909 residual_8_15 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
910
911 _mm_storeu_si128((__m128i*)residual, residual_0_7);
912 _mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
913
914 input += inputStride;
915 pred += predStride;
916 residual += residualStride;
917 }
918 (void)areaWidth;
919 (void)areaHeight;
920
921 return;
922 }
923
ResidualKernel32x32_SSE2_INTRIN(EB_U8 * input,EB_U32 inputStride,EB_U8 * pred,EB_U32 predStride,EB_S16 * residual,EB_U32 residualStride,EB_U32 areaWidth,EB_U32 areaHeight)924 void ResidualKernel32x32_SSE2_INTRIN(
925 EB_U8 *input,
926 EB_U32 inputStride,
927 EB_U8 *pred,
928 EB_U32 predStride,
929 EB_S16 *residual,
930 EB_U32 residualStride,
931 EB_U32 areaWidth,
932 EB_U32 areaHeight)
933 {
934 __m128i xmm0, residual_0_7, residual_8_15, residual_16_23, residual_24_31;
935 EB_U32 y;
936
937 xmm0 = _mm_setzero_si128();
938
939 for (y = 0; y < 32; ++y){
940
941 residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
942 residual_8_15 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
943 residual_16_23 = _mm_sub_epi16( _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
944 residual_24_31 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
945
946 _mm_storeu_si128((__m128i*)residual, residual_0_7);
947 _mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
948 _mm_storeu_si128((__m128i*)(residual + 16), residual_16_23);
949 _mm_storeu_si128((__m128i*)(residual + 24), residual_24_31);
950
951 input += inputStride;
952 pred += predStride;
953 residual += residualStride;
954 }
955 (void)areaWidth;
956 (void)areaHeight;
957
958 return;
959 }
960
ResidualKernel64x64_SSE2_INTRIN(EB_U8 * input,EB_U32 inputStride,EB_U8 * pred,EB_U32 predStride,EB_S16 * residual,EB_U32 residualStride,EB_U32 areaWidth,EB_U32 areaHeight)961 void ResidualKernel64x64_SSE2_INTRIN(
962 EB_U8 *input,
963 EB_U32 inputStride,
964 EB_U8 *pred,
965 EB_U32 predStride,
966 EB_S16 *residual,
967 EB_U32 residualStride,
968 EB_U32 areaWidth,
969 EB_U32 areaHeight)
970 {
971 __m128i xmm0, residual_0_7, residual_8_15, residual_16_23, residual_24_31, resdiaul_32_39, residual_40_47, residual_48_55, residual_56_63;
972 EB_U32 y;
973
974 xmm0 = _mm_setzero_si128();
975
976 for (y = 0; y < 64; ++y){
977
978 residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
979 residual_8_15 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
980 residual_16_23 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
981 residual_24_31 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
982 resdiaul_32_39 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 32)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 32)), xmm0));
983 residual_40_47 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 32)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 32)), xmm0));
984 residual_48_55 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 48)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 48)), xmm0));
985 residual_56_63 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 48)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 48)), xmm0));
986
987 _mm_storeu_si128((__m128i*)residual, residual_0_7);
988 _mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
989 _mm_storeu_si128((__m128i*)(residual + 16), residual_16_23);
990 _mm_storeu_si128((__m128i*)(residual + 24), residual_24_31);
991 _mm_storeu_si128((__m128i*)(residual + 32), resdiaul_32_39);
992 _mm_storeu_si128((__m128i*)(residual + 40), residual_40_47);
993 _mm_storeu_si128((__m128i*)(residual + 48), residual_48_55);
994 _mm_storeu_si128((__m128i*)(residual + 56), residual_56_63);
995
996 input += inputStride;
997 pred += predStride;
998 residual += residualStride;
999 }
1000 (void)areaWidth;
1001 (void)areaHeight;
1002
1003 return;
1004 }
1005
ResidualKernelSubSampled4x4_SSE_INTRIN(EB_U8 * input,EB_U32 inputStride,EB_U8 * pred,EB_U32 predStride,EB_S16 * residual,EB_U32 residualStride,EB_U32 areaWidth,EB_U32 areaHeight,EB_U8 lastLine)1006 void ResidualKernelSubSampled4x4_SSE_INTRIN(
1007 EB_U8 *input,
1008 EB_U32 inputStride,
1009 EB_U8 *pred,
1010 EB_U32 predStride,
1011 EB_S16 *residual,
1012 EB_U32 residualStride,
1013 EB_U32 areaWidth,
1014 EB_U32 areaHeight,
1015 EB_U8 lastLine)
1016 {
1017 __m128i residual_0_3, xmm0 = _mm_setzero_si128();
1018 EB_U32 y;
1019 //hard code subampling dimensions, keep residualStride
1020 areaHeight>>=1;
1021 inputStride<<=1;
1022 predStride<<=1;
1023
1024 for (y = 0; y < areaHeight; ++y){
1025
1026 residual_0_3 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(EB_U32 *)input), xmm0),
1027 _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(EB_U32 *)pred), xmm0));
1028
1029 *(EB_U64 *)residual = _mm_cvtsi128_si64(residual_0_3);
1030
1031 residual += residualStride;
1032 *(EB_U64 *)residual = _mm_cvtsi128_si64(residual_0_3);
1033
1034 input += inputStride;
1035 pred += predStride;
1036 residual += residualStride;
1037 }
1038 (void)areaWidth;
1039 //compute the last line
1040
1041 if(lastLine){
1042 input -= (inputStride)>>1;
1043 pred -= (predStride )>>1;
1044 residual -= residualStride;
1045 residual_0_3 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(EB_U32 *)input), xmm0),
1046 _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(EB_U32 *)pred), xmm0));
1047
1048 *(EB_U64 *)residual = _mm_cvtsi128_si64(residual_0_3);
1049 }
1050
1051 return;
1052 }
1053
ResidualKernelSubSampled8x8_SSE2_INTRIN(EB_U8 * input,EB_U32 inputStride,EB_U8 * pred,EB_U32 predStride,EB_S16 * residual,EB_U32 residualStride,EB_U32 areaWidth,EB_U32 areaHeight,EB_U8 lastLine)1054 void ResidualKernelSubSampled8x8_SSE2_INTRIN(
1055 EB_U8 *input,
1056 EB_U32 inputStride,
1057 EB_U8 *pred,
1058 EB_U32 predStride,
1059 EB_S16 *residual,
1060 EB_U32 residualStride,
1061 EB_U32 areaWidth,
1062 EB_U32 areaHeight,
1063 EB_U8 lastLine
1064
1065 )
1066 {
1067 __m128i xmm0, residual_0_7;
1068 EB_U32 y;
1069
1070 xmm0 = _mm_setzero_si128();
1071 //hard code subampling dimensions, keep residualStride
1072 areaHeight>>=1;
1073 inputStride<<=1;
1074 predStride<<=1;
1075
1076 for (y = 0; y < areaHeight; ++y){
1077
1078 residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)pred), xmm0));
1079
1080 _mm_storeu_si128((__m128i*)residual, residual_0_7);
1081
1082 residual += residualStride;
1083 _mm_storeu_si128((__m128i*)residual, residual_0_7);
1084
1085 input += inputStride;
1086 pred += predStride;
1087 residual += residualStride;
1088 }
1089 (void)areaWidth;
1090 //compute the last line
1091 if(lastLine){
1092
1093 input -= (inputStride)>>1;
1094 pred -= (predStride )>>1;
1095 residual -= residualStride;
1096
1097 residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)pred), xmm0));
1098
1099 _mm_storeu_si128((__m128i*)residual, residual_0_7);
1100
1101 }
1102
1103 return;
1104 }
1105
ResidualKernelSubSampled16x16_SSE2_INTRIN(EB_U8 * input,EB_U32 inputStride,EB_U8 * pred,EB_U32 predStride,EB_S16 * residual,EB_U32 residualStride,EB_U32 areaWidth,EB_U32 areaHeight,EB_U8 lastLine)1106 void ResidualKernelSubSampled16x16_SSE2_INTRIN(
1107 EB_U8 *input,
1108 EB_U32 inputStride,
1109 EB_U8 *pred,
1110 EB_U32 predStride,
1111 EB_S16 *residual,
1112 EB_U32 residualStride,
1113 EB_U32 areaWidth,
1114 EB_U32 areaHeight,
1115 EB_U8 lastLine
1116
1117 )
1118 {
1119 __m128i xmm0, residual_0_7, residual_8_15;
1120 EB_U32 y;
1121
1122 xmm0 = _mm_setzero_si128();
1123 //hard code subampling dimensions, keep residualStride
1124 areaHeight>>=1;
1125 inputStride<<=1;
1126 predStride<<=1;
1127
1128 for (y = 0; y < areaHeight; ++y){
1129
1130 residual_0_7 = _mm_sub_epi16( _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
1131 residual_8_15 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
1132
1133 _mm_storeu_si128((__m128i*)residual, residual_0_7);
1134 _mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
1135
1136 residual += residualStride;
1137 _mm_storeu_si128((__m128i*)residual, residual_0_7);
1138 _mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
1139
1140 input += inputStride;
1141 pred += predStride;
1142 residual += residualStride;
1143 }
1144 (void)areaWidth;
1145 //compute the last line
1146
1147 if(lastLine){
1148
1149 input -= (inputStride)>>1;
1150 pred -= (predStride )>>1;
1151 residual -= residualStride;
1152
1153 residual_0_7 = _mm_sub_epi16( _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
1154 residual_8_15 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
1155
1156 _mm_storeu_si128((__m128i*)residual, residual_0_7);
1157 _mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
1158
1159 }
1160 return;
1161 }
1162
ResidualKernelSubSampled32x32_SSE2_INTRIN(EB_U8 * input,EB_U32 inputStride,EB_U8 * pred,EB_U32 predStride,EB_S16 * residual,EB_U32 residualStride,EB_U32 areaWidth,EB_U32 areaHeight,EB_U8 lastLine)1163 void ResidualKernelSubSampled32x32_SSE2_INTRIN(
1164 EB_U8 *input,
1165 EB_U32 inputStride,
1166 EB_U8 *pred,
1167 EB_U32 predStride,
1168 EB_S16 *residual,
1169 EB_U32 residualStride,
1170 EB_U32 areaWidth,
1171 EB_U32 areaHeight,
1172 EB_U8 lastLine)
1173 {
1174 __m128i xmm0, residual_0_7, residual_8_15, residual_16_23, residual_24_31;
1175 EB_U32 y;
1176
1177 xmm0 = _mm_setzero_si128();
1178
1179 //hard code subampling dimensions, keep residualStride
1180 areaHeight>>=1;
1181 inputStride<<=1;
1182 predStride<<=1;
1183
1184
1185 for (y = 0; y < areaHeight; ++y){
1186
1187 residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
1188 residual_8_15 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
1189 residual_16_23 = _mm_sub_epi16( _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
1190 residual_24_31 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
1191
1192 _mm_storeu_si128((__m128i*)residual, residual_0_7);
1193 _mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
1194 _mm_storeu_si128((__m128i*)(residual + 16), residual_16_23);
1195 _mm_storeu_si128((__m128i*)(residual + 24), residual_24_31);
1196
1197 residual += residualStride;
1198 _mm_storeu_si128((__m128i*)residual, residual_0_7);
1199 _mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
1200 _mm_storeu_si128((__m128i*)(residual + 16), residual_16_23);
1201 _mm_storeu_si128((__m128i*)(residual + 24), residual_24_31);
1202
1203 input += inputStride;
1204 pred += predStride;
1205 residual += residualStride;
1206 }
1207 (void)areaWidth;
1208 //compute the last line
1209
1210 if(lastLine){
1211 input -= (inputStride)>>1;
1212 pred -= (predStride )>>1;
1213 residual -= residualStride;
1214
1215 residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
1216 residual_8_15 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
1217 residual_16_23 = _mm_sub_epi16( _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
1218 residual_24_31 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
1219
1220 _mm_storeu_si128((__m128i*)residual, residual_0_7);
1221 _mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
1222 _mm_storeu_si128((__m128i*)(residual + 16), residual_16_23);
1223 _mm_storeu_si128((__m128i*)(residual + 24), residual_24_31);
1224 }
1225
1226 return;
1227 }
1228
1229
ResidualKernelSubSampled64x64_SSE2_INTRIN(EB_U8 * input,EB_U32 inputStride,EB_U8 * pred,EB_U32 predStride,EB_S16 * residual,EB_U32 residualStride,EB_U32 areaWidth,EB_U32 areaHeight,EB_U8 lastLine)1230 void ResidualKernelSubSampled64x64_SSE2_INTRIN(
1231 EB_U8 *input,
1232 EB_U32 inputStride,
1233 EB_U8 *pred,
1234 EB_U32 predStride,
1235 EB_S16 *residual,
1236 EB_U32 residualStride,
1237 EB_U32 areaWidth,
1238 EB_U32 areaHeight,
1239 EB_U8 lastLine)
1240 {
1241 __m128i xmm0, residual_0_7, residual_8_15, residual_16_23, residual_24_31, resdiaul_32_39, residual_40_47, residual_48_55, residual_56_63;
1242 EB_U32 y;
1243
1244 xmm0 = _mm_setzero_si128();
1245
1246 //hard code subampling dimensions, keep residualStride
1247 areaHeight>>=1;
1248 inputStride<<=1;
1249 predStride<<=1;
1250
1251 for (y = 0; y < areaHeight; ++y){
1252
1253 residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
1254 residual_8_15 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
1255 residual_16_23 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
1256 residual_24_31 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
1257 resdiaul_32_39 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 32)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 32)), xmm0));
1258 residual_40_47 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 32)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 32)), xmm0));
1259 residual_48_55 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 48)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 48)), xmm0));
1260 residual_56_63 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 48)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 48)), xmm0));
1261
1262 _mm_storeu_si128((__m128i*)residual, residual_0_7);
1263 _mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
1264 _mm_storeu_si128((__m128i*)(residual + 16), residual_16_23);
1265 _mm_storeu_si128((__m128i*)(residual + 24), residual_24_31);
1266 _mm_storeu_si128((__m128i*)(residual + 32), resdiaul_32_39);
1267 _mm_storeu_si128((__m128i*)(residual + 40), residual_40_47);
1268 _mm_storeu_si128((__m128i*)(residual + 48), residual_48_55);
1269 _mm_storeu_si128((__m128i*)(residual + 56), residual_56_63);
1270
1271 //duplicate top field residual to bottom field
1272 residual += residualStride;
1273 _mm_storeu_si128((__m128i*)residual, residual_0_7);
1274 _mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
1275 _mm_storeu_si128((__m128i*)(residual + 16), residual_16_23);
1276 _mm_storeu_si128((__m128i*)(residual + 24), residual_24_31);
1277 _mm_storeu_si128((__m128i*)(residual + 32), resdiaul_32_39);
1278 _mm_storeu_si128((__m128i*)(residual + 40), residual_40_47);
1279 _mm_storeu_si128((__m128i*)(residual + 48), residual_48_55);
1280 _mm_storeu_si128((__m128i*)(residual + 56), residual_56_63);
1281
1282 input += inputStride;
1283 pred += predStride;
1284 residual += residualStride;
1285 }
1286 (void)areaWidth;
1287 //compute the last line
1288
1289 if(lastLine){
1290 input -= (inputStride)>>1;
1291 pred -= (predStride )>>1;
1292 residual -= residualStride;
1293
1294 residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
1295 residual_8_15 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
1296 residual_16_23 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
1297 residual_24_31 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
1298 resdiaul_32_39 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 32)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 32)), xmm0));
1299 residual_40_47 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 32)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 32)), xmm0));
1300 residual_48_55 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 48)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 48)), xmm0));
1301 residual_56_63 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 48)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 48)), xmm0));
1302
1303 _mm_storeu_si128((__m128i*)residual, residual_0_7);
1304 _mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
1305 _mm_storeu_si128((__m128i*)(residual + 16), residual_16_23);
1306 _mm_storeu_si128((__m128i*)(residual + 24), residual_24_31);
1307 _mm_storeu_si128((__m128i*)(residual + 32), resdiaul_32_39);
1308 _mm_storeu_si128((__m128i*)(residual + 40), residual_40_47);
1309 _mm_storeu_si128((__m128i*)(residual + 48), residual_48_55);
1310 _mm_storeu_si128((__m128i*)(residual + 56), residual_56_63);
1311
1312 }
1313
1314 return;
1315 }
1316 /******************************************************************************************************
1317 ResidualKernel16bit_SSE2_INTRIN
1318 ******************************************************************************************************/
ResidualKernel16bit_SSE2_INTRIN(EB_U16 * input,EB_U32 inputStride,EB_U16 * pred,EB_U32 predStride,EB_S16 * residual,EB_U32 residualStride,EB_U32 areaWidth,EB_U32 areaHeight)1319 void ResidualKernel16bit_SSE2_INTRIN(
1320 EB_U16 *input,
1321 EB_U32 inputStride,
1322 EB_U16 *pred,
1323 EB_U32 predStride,
1324 EB_S16 *residual,
1325 EB_U32 residualStride,
1326 EB_U32 areaWidth,
1327 EB_U32 areaHeight)
1328 {
1329 EB_U32 x, y;
1330 __m128i residual0, residual1;
1331
1332 if (areaWidth == 4)
1333 {
1334 for (y = 0; y < areaHeight; y += 2){
1335
1336 residual0 = _mm_sub_epi16(_mm_loadl_epi64((__m128i*)input), _mm_loadl_epi64((__m128i*)pred));
1337 residual1 = _mm_sub_epi16(_mm_loadl_epi64((__m128i*)(input + inputStride)), _mm_loadl_epi64((__m128i*)(pred + predStride)));
1338
1339 _mm_storel_epi64((__m128i*)residual, residual0);
1340 _mm_storel_epi64((__m128i*)(residual + residualStride), residual1);
1341
1342 input += inputStride << 1;
1343 pred += predStride << 1;
1344 residual += residualStride << 1;
1345 }
1346 }
1347 else if (areaWidth == 8){
1348 for (y = 0; y < areaHeight; y += 2){
1349
1350 residual0 = _mm_sub_epi16(_mm_loadu_si128((__m128i*)input), _mm_loadu_si128((__m128i*)pred));
1351 residual1 = _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + inputStride)), _mm_loadu_si128((__m128i*)(pred + predStride)));
1352
1353 _mm_storeu_si128((__m128i*) residual, residual0);
1354 _mm_storeu_si128((__m128i*) (residual + residualStride), residual1);
1355
1356 input += inputStride << 1;
1357 pred += predStride << 1;
1358 residual += residualStride << 1;
1359 }
1360 }
1361 else if(areaWidth == 16){
1362
1363 __m128i residual2, residual3;
1364
1365 for (y = 0; y < areaHeight; y += 2){
1366
1367 residual0 = _mm_sub_epi16(_mm_loadu_si128((__m128i*)input), _mm_loadu_si128((__m128i*)pred));
1368 residual1 = _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 8)), _mm_loadu_si128((__m128i*)(pred + 8)));
1369 residual2 = _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input+inputStride)), _mm_loadu_si128((__m128i*)(pred+predStride)));
1370 residual3 = _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + inputStride+8)), _mm_loadu_si128((__m128i*)(pred + predStride+8)));
1371
1372 _mm_storeu_si128((__m128i*)residual, residual0);
1373 _mm_storeu_si128((__m128i*)(residual + 8), residual1);
1374 _mm_storeu_si128((__m128i*)(residual+residualStride), residual2);
1375 _mm_storeu_si128((__m128i*)(residual +residualStride+ 8), residual3);
1376
1377 input += inputStride << 1;
1378 pred += predStride << 1;
1379 residual += residualStride << 1;
1380 }
1381 }
1382 else if(areaWidth == 32){
1383
1384 for (y = 0; y < areaHeight; y += 2){
1385 //residual[columnIndex] = ((EB_S16)input[columnIndex]) - ((EB_S16)pred[columnIndex]);
1386 _mm_storeu_si128((__m128i*) residual, _mm_sub_epi16(_mm_loadu_si128((__m128i*)input), _mm_loadu_si128((__m128i*)pred)));
1387 _mm_storeu_si128((__m128i*) (residual + 8), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 8)), _mm_loadu_si128((__m128i*)(pred + 8))));
1388 _mm_storeu_si128((__m128i*) (residual + 16), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 16)), _mm_loadu_si128((__m128i*)(pred + 16))));
1389 _mm_storeu_si128((__m128i*) (residual + 24), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 24)), _mm_loadu_si128((__m128i*)(pred + 24))));
1390
1391 _mm_storeu_si128((__m128i*) (residual + residualStride), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input+inputStride)), _mm_loadu_si128((__m128i*)(pred+predStride))));
1392 _mm_storeu_si128((__m128i*) (residual + residualStride + 8), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + inputStride + 8)), _mm_loadu_si128((__m128i*)(pred+predStride + 8))));
1393 _mm_storeu_si128((__m128i*) (residual + residualStride + 16), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + inputStride + 16)), _mm_loadu_si128((__m128i*)(pred + predStride+ 16))));
1394 _mm_storeu_si128((__m128i*) (residual + residualStride + 24), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + inputStride + 24)), _mm_loadu_si128((__m128i*)(pred + predStride+ 24))));
1395
1396 input += inputStride << 1;
1397 pred += predStride << 1;
1398 residual += residualStride << 1;
1399 }
1400 }
1401 else if(areaWidth == 64){ // Branch was not tested because the encoder had max tuSize of 32
1402
1403 for (y = 0; y < areaHeight; y += 2){
1404
1405 //residual[columnIndex] = ((EB_S16)input[columnIndex]) - ((EB_S16)pred[columnIndex]) 8 indices per _mm_sub_epi16
1406 _mm_storeu_si128((__m128i*) residual, _mm_sub_epi16(_mm_loadu_si128((__m128i*)input), _mm_loadu_si128((__m128i*)pred)));
1407 _mm_storeu_si128((__m128i*) (residual + 8), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 8)), _mm_loadu_si128((__m128i*)(pred + 8))));
1408 _mm_storeu_si128((__m128i*) (residual + 16), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 16)), _mm_loadu_si128((__m128i*)(pred + 16))));
1409 _mm_storeu_si128((__m128i*) (residual + 24), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 24)), _mm_loadu_si128((__m128i*)(pred + 24))));
1410 _mm_storeu_si128((__m128i*) (residual + 32), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 32)), _mm_loadu_si128((__m128i*)(pred + 32))));
1411 _mm_storeu_si128((__m128i*) (residual + 40), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 40)), _mm_loadu_si128((__m128i*)(pred + 40))));
1412 _mm_storeu_si128((__m128i*) (residual + 48), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 48)), _mm_loadu_si128((__m128i*)(pred + 48))));
1413 _mm_storeu_si128((__m128i*) (residual + 56), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 56)), _mm_loadu_si128((__m128i*)(pred + 56))));
1414
1415 _mm_storeu_si128((__m128i*) (residual + residualStride), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + inputStride)), _mm_loadu_si128((__m128i*)(pred + predStride))));
1416 _mm_storeu_si128((__m128i*) (residual + residualStride + 8), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + inputStride + 8)), _mm_loadu_si128((__m128i*)(pred + predStride + 8))));
1417 _mm_storeu_si128((__m128i*) (residual + residualStride + 16), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + inputStride + 16)), _mm_loadu_si128((__m128i*)(pred + predStride + 16))));
1418 _mm_storeu_si128((__m128i*) (residual + residualStride + 24), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + inputStride + 24)), _mm_loadu_si128((__m128i*)(pred + predStride + 24))));
1419 _mm_storeu_si128((__m128i*) (residual + residualStride + 32), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + inputStride + 32)), _mm_loadu_si128((__m128i*)(pred + predStride + 32))));
1420 _mm_storeu_si128((__m128i*) (residual + residualStride + 40), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + inputStride + 40)), _mm_loadu_si128((__m128i*)(pred + predStride + 40))));
1421 _mm_storeu_si128((__m128i*) (residual + residualStride + 48), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + inputStride + 48)), _mm_loadu_si128((__m128i*)(pred + predStride + 48))));
1422 _mm_storeu_si128((__m128i*) (residual + residualStride + 56), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + inputStride + 56)), _mm_loadu_si128((__m128i*)(pred + predStride + 56))));
1423
1424 input += inputStride << 1;
1425 pred += predStride << 1;
1426 residual += residualStride << 1;
1427 }
1428 }
1429 else {
1430
1431 EB_U32 inputStrideDiff = 2 * inputStride;
1432 EB_U32 predStrideDiff = 2 * predStride;
1433 EB_U32 residualStrideDiff = 2 * residualStride;
1434 inputStrideDiff -= areaWidth;
1435 predStrideDiff -= areaWidth;
1436 residualStrideDiff -= areaWidth;
1437
1438 if (!(areaWidth & 7)){
1439
1440 for (x = 0; x < areaHeight; x += 2){
1441 for (y = 0; y < areaWidth; y += 8){
1442
1443 _mm_storeu_si128((__m128i*) residual, _mm_sub_epi16(_mm_loadu_si128((__m128i*)input), _mm_loadu_si128((__m128i*)pred)));
1444 _mm_storeu_si128((__m128i*) (residual + residualStride), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + inputStride)), _mm_loadu_si128((__m128i*)(pred + predStride))));
1445
1446 input += 8;
1447 pred += 8;
1448 residual += 8;
1449 }
1450 input = input + inputStrideDiff;
1451 pred = pred + predStrideDiff;
1452 residual = residual + residualStrideDiff;
1453 }
1454 }
1455 else{
1456 for (x = 0; x < areaHeight; x += 2){
1457 for (y = 0; y < areaWidth; y += 4){
1458
1459 _mm_storel_epi64((__m128i*) residual, _mm_sub_epi16(_mm_loadu_si128((__m128i*)input), _mm_loadu_si128((__m128i*)pred)));
1460 _mm_storel_epi64((__m128i*) (residual + residualStride), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + inputStride)), _mm_loadu_si128((__m128i*)(pred + predStride))));
1461
1462 input += 4;
1463 pred += 4;
1464 residual += 4;
1465 }
1466 input = input + inputStrideDiff;
1467 pred = pred + predStrideDiff;
1468 residual = residual + residualStrideDiff;
1469 }
1470 }
1471 }
1472 return;
1473 }
1474
1475 /******************************************************************************************************
1476 PictureAdditionKernel16bit_SSE2_INTRIN
1477 ******************************************************************************************************/
1478
1479
PictureAdditionKernel16bit_SSE2_INTRIN(EB_U16 * predPtr,EB_U32 predStride,EB_S16 * residualPtr,EB_U32 residualStride,EB_U16 * reconPtr,EB_U32 reconStride,EB_U32 width,EB_U32 height)1480 void PictureAdditionKernel16bit_SSE2_INTRIN(
1481 EB_U16 *predPtr,
1482 EB_U32 predStride,
1483 EB_S16 *residualPtr,
1484 EB_U32 residualStride,
1485 EB_U16 *reconPtr,
1486 EB_U32 reconStride,
1487 EB_U32 width,
1488 EB_U32 height)
1489 {
1490 __m128i xmm_0, xmm_Max10bit;
1491
1492 EB_U32 y, x;
1493
1494 xmm_0 = _mm_setzero_si128();
1495 xmm_Max10bit = _mm_set1_epi16(1023);
1496
1497 if (width == 4)
1498 {
1499 __m128i xmm_sum_0_3, xmm_sum_s0_s3, xmm_clip3_0_3, xmm_clip3_s0_s3;
1500 for (y = 0; y < height; y += 2){
1501
1502 xmm_sum_0_3 = _mm_adds_epi16(_mm_loadl_epi64((__m128i*)predPtr), _mm_loadl_epi64((__m128i*)residualPtr));
1503 xmm_sum_s0_s3 = _mm_adds_epi16(_mm_loadl_epi64((__m128i*)(predPtr + predStride)), _mm_loadl_epi64((__m128i*)(residualPtr + residualStride)));
1504
1505 xmm_clip3_0_3 = _mm_max_epi16(_mm_min_epi16(xmm_sum_0_3, xmm_Max10bit), xmm_0);
1506 xmm_clip3_s0_s3 = _mm_max_epi16(_mm_min_epi16(xmm_sum_s0_s3, xmm_Max10bit), xmm_0);
1507
1508 _mm_storel_epi64((__m128i*) reconPtr, xmm_clip3_0_3);
1509 _mm_storel_epi64((__m128i*) (reconPtr + reconStride), xmm_clip3_s0_s3);
1510
1511 predPtr += predStride << 1;
1512 residualPtr += residualStride << 1;
1513 reconPtr += reconStride << 1;
1514 }
1515 }
1516 else if (width == 8){
1517
1518 __m128i xmm_sum_0_7, xmm_sum_s0_s7, xmm_clip3_0_7, xmm_clip3_s0_s7;
1519
1520 for (y = 0; y < height; y += 2){
1521
1522 xmm_sum_0_7 = _mm_adds_epi16( _mm_loadu_si128((__m128i*)predPtr),_mm_loadu_si128((__m128i*)residualPtr));
1523 xmm_sum_s0_s7 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + predStride)), _mm_loadu_si128((__m128i*)(residualPtr + residualStride)));
1524
1525 xmm_clip3_0_7 = _mm_max_epi16(_mm_min_epi16(xmm_sum_0_7, xmm_Max10bit), xmm_0);
1526 xmm_clip3_s0_s7 = _mm_max_epi16(_mm_min_epi16(xmm_sum_s0_s7, xmm_Max10bit), xmm_0);
1527
1528 _mm_storeu_si128((__m128i*) reconPtr, xmm_clip3_0_7);
1529 _mm_storeu_si128((__m128i*) (reconPtr + reconStride), xmm_clip3_s0_s7);
1530
1531 predPtr += predStride << 1;
1532 residualPtr += residualStride << 1;
1533 reconPtr += reconStride << 1;
1534 }
1535 }
1536 else if (width == 16){
1537
1538 __m128i sum_0_7, sum_8_15, sum_s0_s7, sum_s8_s15, clip3_0_7, clip3_8_15, clip3_s0_s7, clip3_s8_s15;
1539
1540 for (y = 0; y < height; y += 2){
1541
1542 sum_0_7 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)predPtr), _mm_loadu_si128((__m128i*)residualPtr));
1543 sum_8_15 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + 8)), _mm_loadu_si128((__m128i*)(residualPtr + 8)));
1544 sum_s0_s7 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + predStride)), _mm_loadu_si128((__m128i*)(residualPtr + residualStride)));
1545 sum_s8_s15 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + predStride + 8)), _mm_loadu_si128((__m128i*)(residualPtr + residualStride + 8)));
1546
1547 clip3_0_7 = _mm_max_epi16(_mm_min_epi16(sum_0_7, xmm_Max10bit), xmm_0);
1548 clip3_8_15 = _mm_max_epi16(_mm_min_epi16(sum_8_15, xmm_Max10bit), xmm_0);
1549 clip3_s0_s7 = _mm_max_epi16(_mm_min_epi16(sum_s0_s7, xmm_Max10bit), xmm_0);
1550 clip3_s8_s15 = _mm_max_epi16(_mm_min_epi16(sum_s8_s15, xmm_Max10bit), xmm_0);
1551
1552 _mm_storeu_si128((__m128i*) reconPtr, clip3_0_7);
1553 _mm_storeu_si128((__m128i*) (reconPtr + 8), clip3_8_15);
1554 _mm_storeu_si128((__m128i*) (reconPtr + reconStride), clip3_s0_s7);
1555 _mm_storeu_si128((__m128i*) (reconPtr + reconStride + 8), clip3_s8_s15);
1556
1557 predPtr += predStride << 1;
1558 residualPtr += residualStride << 1;
1559 reconPtr += reconStride << 1;
1560 }
1561 }
1562 else if (width == 32){
1563 __m128i sum_0_7, sum_8_15, sum_16_23, sum_24_31, sum_s0_s7, sum_s8_s15, sum_s16_s23, sum_s24_s31;
1564 __m128i clip3_0_7, clip3_8_15, clip3_16_23, clip3_24_31, clip3_s0_s7, clip3_s8_s15, clip3_s16_s23, clip3_s24_s31;
1565
1566 for (y = 0; y < height; y += 2){
1567
1568 sum_0_7 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)predPtr), _mm_loadu_si128((__m128i*)residualPtr));
1569 sum_8_15 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + 8)), _mm_loadu_si128((__m128i*)(residualPtr + 8)));
1570 sum_16_23 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + 16)), _mm_loadu_si128((__m128i*)(residualPtr + 16)));
1571 sum_24_31 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + 24)), _mm_loadu_si128((__m128i*)(residualPtr + 24)));
1572
1573 sum_s0_s7 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + predStride)), _mm_loadu_si128((__m128i*)(residualPtr + residualStride)));
1574 sum_s8_s15 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + predStride + 8)), _mm_loadu_si128((__m128i*)(residualPtr + residualStride + 8)));
1575 sum_s16_s23 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + predStride + 16)), _mm_loadu_si128((__m128i*)(residualPtr + residualStride + 16)));
1576 sum_s24_s31 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + predStride + 24)), _mm_loadu_si128((__m128i*)(residualPtr + residualStride + 24)));
1577
1578 clip3_0_7 = _mm_max_epi16(_mm_min_epi16(sum_0_7, xmm_Max10bit), xmm_0);
1579 clip3_8_15 = _mm_max_epi16(_mm_min_epi16(sum_8_15 , xmm_Max10bit), xmm_0);
1580 clip3_16_23 = _mm_max_epi16(_mm_min_epi16(sum_16_23, xmm_Max10bit), xmm_0);
1581 clip3_24_31 = _mm_max_epi16(_mm_min_epi16(sum_24_31, xmm_Max10bit), xmm_0);
1582
1583 clip3_s0_s7 = _mm_max_epi16(_mm_min_epi16(sum_s0_s7, xmm_Max10bit), xmm_0);
1584 clip3_s8_s15 = _mm_max_epi16(_mm_min_epi16(sum_s8_s15, xmm_Max10bit), xmm_0);
1585 clip3_s16_s23 = _mm_max_epi16(_mm_min_epi16(sum_s16_s23, xmm_Max10bit), xmm_0);
1586 clip3_s24_s31 = _mm_max_epi16(_mm_min_epi16(sum_s24_s31, xmm_Max10bit), xmm_0);
1587
1588 _mm_storeu_si128((__m128i*) reconPtr, clip3_0_7);
1589 _mm_storeu_si128((__m128i*) (reconPtr + 8), clip3_8_15);
1590 _mm_storeu_si128((__m128i*) (reconPtr + 16), clip3_16_23);
1591 _mm_storeu_si128((__m128i*) (reconPtr + 24), clip3_24_31);
1592
1593 _mm_storeu_si128((__m128i*) (reconPtr + reconStride), clip3_s0_s7);
1594 _mm_storeu_si128((__m128i*) (reconPtr + reconStride + 8), clip3_s8_s15);
1595 _mm_storeu_si128((__m128i*) (reconPtr + reconStride + 16), clip3_s16_s23);
1596 _mm_storeu_si128((__m128i*) (reconPtr + reconStride + 24), clip3_s24_s31);
1597
1598 predPtr += predStride << 1;
1599 residualPtr += residualStride << 1;
1600 reconPtr += reconStride << 1;
1601 }
1602 }
1603 else if (width == 64){ // Branch not tested due to Max TU size is 32 at time of development
1604
1605 __m128i sum_0_7, sum_8_15, sum_16_23, sum_24_31, sum_32_39, sum_40_47, sum_48_55, sum_56_63;
1606 __m128i clip3_0_7, clip3_8_15, clip3_16_23, clip3_24_31, clip3_32_39, clip3_40_47, clip3_48_55, clip3_56_63;
1607
1608 for (y = 0; y < height; ++y ){
1609
1610 sum_0_7 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)predPtr), _mm_loadu_si128((__m128i*)residualPtr));
1611 sum_8_15 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + 8)), _mm_loadu_si128((__m128i*)(residualPtr + 8)));
1612 sum_16_23 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + 16)), _mm_loadu_si128((__m128i*)(residualPtr + 16)));
1613 sum_24_31 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + 24)), _mm_loadu_si128((__m128i*)(residualPtr + 24)));
1614 sum_32_39 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + 32)), _mm_loadu_si128((__m128i*)(residualPtr + 32)));
1615 sum_40_47 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + 40)), _mm_loadu_si128((__m128i*)(residualPtr + 40)));
1616 sum_48_55 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + 48)), _mm_loadu_si128((__m128i*)(residualPtr + 48)));
1617 sum_56_63 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + 56)), _mm_loadu_si128((__m128i*)(residualPtr + 56)));
1618
1619 clip3_0_7 = _mm_max_epi16(_mm_min_epi16(sum_0_7 , xmm_Max10bit), xmm_0);
1620 clip3_8_15 = _mm_max_epi16(_mm_min_epi16(sum_8_15 , xmm_Max10bit), xmm_0);
1621 clip3_16_23 = _mm_max_epi16(_mm_min_epi16(sum_16_23, xmm_Max10bit), xmm_0);
1622 clip3_24_31 = _mm_max_epi16(_mm_min_epi16(sum_24_31, xmm_Max10bit), xmm_0);
1623 clip3_32_39 = _mm_max_epi16(_mm_min_epi16(sum_32_39, xmm_Max10bit), xmm_0);
1624 clip3_40_47 = _mm_max_epi16(_mm_min_epi16(sum_40_47, xmm_Max10bit), xmm_0);
1625 clip3_48_55 = _mm_max_epi16(_mm_min_epi16(sum_48_55, xmm_Max10bit), xmm_0);
1626 clip3_56_63 = _mm_max_epi16(_mm_min_epi16(sum_56_63, xmm_Max10bit), xmm_0);
1627
1628 _mm_storeu_si128((__m128i*) reconPtr, clip3_0_7 );
1629 _mm_storeu_si128((__m128i*) (reconPtr + 8), clip3_8_15 );
1630 _mm_storeu_si128((__m128i*) (reconPtr + 16), clip3_16_23);
1631 _mm_storeu_si128((__m128i*) (reconPtr + 24), clip3_24_31);
1632 _mm_storeu_si128((__m128i*) (reconPtr + 32), clip3_32_39);
1633 _mm_storeu_si128((__m128i*) (reconPtr + 40), clip3_40_47);
1634 _mm_storeu_si128((__m128i*) (reconPtr + 48), clip3_48_55);
1635 _mm_storeu_si128((__m128i*) (reconPtr + 56), clip3_56_63);
1636
1637 predPtr += predStride ;
1638 residualPtr += residualStride ;
1639 reconPtr += reconStride ;
1640 }
1641 }
1642 else
1643 {
1644 EB_U32 predStrideDiff = 2 * predStride;
1645 EB_U32 residualStrideDiff = 2 * residualStride;
1646 EB_U32 reconStrideDiff = 2 * reconStride;
1647 predStrideDiff -= width;
1648 residualStrideDiff -= width;
1649 reconStrideDiff -= width;
1650
1651 if (!(width & 7)){
1652
1653 __m128i xmm_sum_0_7, xmm_sum_s0_s7, xmm_clip3_0_7, xmm_clip3_s0_s7;
1654
1655 for (x = 0; x < height; x += 2){
1656 for (y = 0; y < width; y += 8){
1657
1658 xmm_sum_0_7 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)predPtr), _mm_loadu_si128((__m128i*)residualPtr));
1659 xmm_sum_s0_s7 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + predStride)), _mm_loadu_si128((__m128i*)(residualPtr + residualStride )));
1660
1661 xmm_clip3_0_7 = _mm_max_epi16(_mm_min_epi16(xmm_sum_0_7, xmm_Max10bit), xmm_0);
1662 xmm_clip3_s0_s7 = _mm_max_epi16(_mm_min_epi16(xmm_sum_s0_s7, xmm_Max10bit), xmm_0);
1663
1664 _mm_storeu_si128((__m128i*) reconPtr, xmm_clip3_0_7);
1665 _mm_storeu_si128((__m128i*) (reconPtr + reconStride), xmm_clip3_s0_s7);
1666
1667 predPtr += 8;
1668 residualPtr += 8;
1669 reconPtr += 8;
1670 }
1671 predPtr += predStrideDiff;
1672 residualPtr += residualStrideDiff;
1673 reconPtr += reconStrideDiff;
1674 }
1675 }
1676 else{
1677 __m128i xmm_sum_0_7, xmm_sum_s0_s7, xmm_clip3_0_3, xmm_clip3_s0_s3;
1678 for (x = 0; x < height; x += 2){
1679 for (y = 0; y < width; y += 4){
1680
1681 xmm_sum_0_7 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)predPtr), _mm_loadu_si128((__m128i*)residualPtr));
1682 xmm_sum_s0_s7 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + predStride)), _mm_loadu_si128((__m128i*)(residualPtr + residualStride)));
1683
1684 xmm_clip3_0_3 = _mm_max_epi16(_mm_min_epi16(xmm_sum_0_7, xmm_Max10bit), xmm_0);
1685 xmm_clip3_s0_s3 = _mm_max_epi16(_mm_min_epi16(xmm_sum_s0_s7, xmm_Max10bit), xmm_0);
1686
1687 _mm_storel_epi64((__m128i*) reconPtr, xmm_clip3_0_3);
1688 _mm_storel_epi64((__m128i*) (reconPtr + reconStride), xmm_clip3_s0_s3);
1689
1690 predPtr += 4;
1691 residualPtr += 4;
1692 reconPtr += 4;
1693 }
1694 predPtr += predStrideDiff;
1695 residualPtr += residualStrideDiff;
1696 reconPtr += reconStrideDiff;
1697 }
1698 }
1699 }
1700 return;
1701 }
1702
1703
1704