/*
* Copyright(c) 2018 Intel Corporation
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/

#include "EbPictureOperators_SSE2.h"
#include <emmintrin.h>
#include "EbDefinitions.h"

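// Loads 64 bits from *p into the upper half of x; SSE2 has no integer intrinsic for this,
// so the load goes through the double-precision domain.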
static __m128i _mm_loadh_epi64(__m128i x, __m128i *p)
{
  return _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(x), (double *)p));
}
14 
15 // Note: maximum energy within 1 TU considered to be 2^30-e
16 // All functions can accumulate up to 4 TUs to stay within 32-bit unsigned range
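// (Each TU therefore contributes less than 2^30, so four accumulated TUs stay below 2^32 and fit the unsigned 32-bit SIMD lanes used below.)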
17 
18 //-------
void FullDistortionKernel4x4_32bit_BT_SSE2(
	EB_S16  *coeff,
	EB_U32   coeffStride,
	EB_S16  *reconCoeff,
	EB_U32   reconCoeffStride,
	EB_U64   distortionResult[2],
	EB_U32   areaWidth,
	EB_U32   areaHeight)
27 {
28   EB_S32 rowCount;
29   __m128i sum = _mm_setzero_si128();
30   __m128i sum2 = _mm_setzero_si128();
31 
32   rowCount = 2;
33   do
34   {
35     __m128i x0;
36     __m128i y0;
37     __m128i z0;
38 
39     x0 = _mm_loadl_epi64((__m128i *)coeff); coeff += coeffStride;
40     x0 = _mm_loadh_epi64(x0, (__m128i *)coeff); coeff += coeffStride;
41     y0 = _mm_loadl_epi64((__m128i *)reconCoeff); reconCoeff += reconCoeffStride;
42     y0 = _mm_loadh_epi64(y0, (__m128i *)reconCoeff); reconCoeff += reconCoeffStride;
43 
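    // coefficient energy: square each 16-bit coefficient and sum adjacent pairs into 32-bit lanes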
44     z0 = _mm_madd_epi16(x0, x0);
45 
46     sum2 = _mm_add_epi32(sum2, z0);
47 
48     x0 = _mm_sub_epi16(x0, y0);
49 
50     x0 = _mm_madd_epi16(x0, x0);
51 
52     sum = _mm_add_epi32(sum, x0);
53   }
54   while (--rowCount);
55 
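  // horizontal reduction: fold each accumulator, interleave the two totals, and store them
  // zero-extended to 64 bits (distortionResult[0] = coeff/recon SSD, distortionResult[1] = coefficient energy)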
56   sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4e)); // 01001110
57   sum2 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 0x4e)); // 01001110
58   sum = _mm_unpacklo_epi32(sum, sum2);
59   sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4e)); // 01001110
60   _mm_storeu_si128((__m128i *)distortionResult, _mm_unpacklo_epi32(sum, _mm_setzero_si128()));
61 
62   (void)areaWidth;
63   (void)areaHeight;
64 }
65 
void FullDistortionKernel8x8_32bit_BT_SSE2(
	EB_S16  *coeff,
	EB_U32   coeffStride,
	EB_S16  *reconCoeff,
	EB_U32   reconCoeffStride,
	EB_U64   distortionResult[2],
	EB_U32   areaWidth,
	EB_U32   areaHeight)
74 {
75   EB_S32 rowCount;
76 
77   __m128i sum = _mm_setzero_si128();
78   __m128i sum2 = _mm_setzero_si128();
79 
80   rowCount = 8;
81   do
82   {
83     __m128i x0;
84     __m128i y0;
85     __m128i z0;
86 
87     x0 = _mm_loadu_si128((__m128i *)(coeff + 0x00));
88     y0 = _mm_loadu_si128((__m128i *)(reconCoeff + 0x00));
89     coeff += coeffStride;
90     reconCoeff += reconCoeffStride;
91 
92     z0 = _mm_madd_epi16(x0, x0);
93 
94     sum2 = _mm_add_epi32(sum2, z0);
95 
96     x0 = _mm_sub_epi16(x0, y0);
97 
98     x0 = _mm_madd_epi16(x0, x0);
99 
100     sum = _mm_add_epi32(sum, x0);
101   }
102   while (--rowCount);
103 
104   sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4e)); // 01001110
105   sum2 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 0x4e)); // 01001110
106   sum = _mm_unpacklo_epi32(sum, sum2);
107   sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4e)); // 01001110
108   _mm_storeu_si128((__m128i *)distortionResult, _mm_unpacklo_epi32(sum, _mm_setzero_si128()));
109 
110   (void)areaWidth;
111   (void)areaHeight;
112 }
113 
void FullDistortionKernel16MxN_32bit_BT_SSE2(
	EB_S16  *coeff,
	EB_U32   coeffStride,
	EB_S16  *reconCoeff,
	EB_U32   reconCoeffStride,
	EB_U64   distortionResult[2],
	EB_U32   areaWidth,
	EB_U32   areaHeight)
122 {
123   EB_S32 rowCount, colCount;
124   __m128i sum = _mm_setzero_si128();
125   __m128i sum2 = _mm_setzero_si128();
126 
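  // process the area in 16-coefficient-wide column strips (areaWidth is expected to be a multiple of 16)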
127   colCount = areaWidth;
128   do
129   {
130     EB_S16 *coeffTemp = coeff;
131     EB_S16 *reconCoeffTemp = reconCoeff;
132 
133     rowCount = areaHeight;
134     do
135     {
136       __m128i x0, x1;
137       __m128i y0, y1;
138       __m128i z0, z1;
139 
140       x0 = _mm_loadu_si128((__m128i *)(coeffTemp + 0x00));
141       x1 = _mm_loadu_si128((__m128i *)(coeffTemp + 0x08));
142       y0 = _mm_loadu_si128((__m128i *)(reconCoeffTemp + 0x00));
143       y1 = _mm_loadu_si128((__m128i *)(reconCoeffTemp + 0x08));
144       coeffTemp += coeffStride;
145       reconCoeffTemp += reconCoeffStride;
146 
147       z0 = _mm_madd_epi16(x0, x0);
148       z1 = _mm_madd_epi16(x1, x1);
149 
150       sum2 = _mm_add_epi32(sum2, z0);
151       sum2 = _mm_add_epi32(sum2, z1);
152 
153       x0 = _mm_sub_epi16(x0, y0);
154       x1 = _mm_sub_epi16(x1, y1);
155 
156       x0 = _mm_madd_epi16(x0, x0);
157       x1 = _mm_madd_epi16(x1, x1);
158 
159       sum = _mm_add_epi32(sum, x0);
160       sum = _mm_add_epi32(sum, x1);
161     }
162     while (--rowCount);
163 
164     coeff += 16;
165     reconCoeff += 16;
166     colCount -= 16;
167   }
168   while (colCount > 0);
169 
170   sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4e)); // 01001110
171   sum2 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 0x4e)); // 01001110
172   sum = _mm_unpacklo_epi32(sum, sum2);
173   sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4e)); // 01001110
174   _mm_storeu_si128((__m128i *)distortionResult, _mm_unpacklo_epi32(sum, _mm_setzero_si128()));
175 }
176 
177 
void FullDistortionKernelIntra4x4_32bit_BT_SSE2(
	EB_S16  *coeff,
	EB_U32   coeffStride,
	EB_S16  *reconCoeff,
	EB_U32   reconCoeffStride,
	EB_U64   distortionResult[2],
	EB_U32   areaWidth,
	EB_U32   areaHeight)
186 {
187   EB_S32 rowCount;
188 
189   __m128i sum = _mm_setzero_si128();
190 
191   rowCount = 2;
192   do
193   {
194     __m128i x0;
195     __m128i y0;
196 
197     x0 = _mm_loadl_epi64((__m128i *)coeff); coeff += coeffStride;
198     x0 = _mm_loadh_epi64(x0, (__m128i *)coeff); coeff += coeffStride;
199     y0 = _mm_loadl_epi64((__m128i *)reconCoeff); reconCoeff += reconCoeffStride;
200     y0 = _mm_loadh_epi64(y0, (__m128i *)reconCoeff); reconCoeff += reconCoeffStride;
201 
202     x0 = _mm_sub_epi16(x0, y0);
203 
204     x0 = _mm_madd_epi16(x0, x0);
205 
206     sum = _mm_add_epi32(sum, x0);
207   }
208   while (--rowCount);
209 
210   sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4e)); // 01001110
211   sum = _mm_unpacklo_epi32(sum, sum);
212   sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4e)); // 01001110
213   _mm_storeu_si128((__m128i *)distortionResult, _mm_unpacklo_epi32(sum, _mm_setzero_si128()));
214 
215   (void)areaWidth;
216   (void)areaHeight;
217 }
218 
void FullDistortionKernelIntra8x8_32bit_BT_SSE2(
	EB_S16  *coeff,
	EB_U32   coeffStride,
	EB_S16  *reconCoeff,
	EB_U32   reconCoeffStride,
	EB_U64   distortionResult[2],
	EB_U32   areaWidth,
	EB_U32   areaHeight)
227 {
228   EB_S32 rowCount;
229 
230   __m128i sum = _mm_setzero_si128();
231 
232   rowCount = 8;
233   do
234   {
235     __m128i x0;
236     __m128i y0;
237 
238     x0 = _mm_loadu_si128((__m128i *)(coeff + 0x00));
239     y0 = _mm_loadu_si128((__m128i *)(reconCoeff + 0x00));
240     coeff += coeffStride;
241     reconCoeff += reconCoeffStride;
242 
243     x0 = _mm_sub_epi16(x0, y0);
244 
245     x0 = _mm_madd_epi16(x0, x0);
246 
247     sum = _mm_add_epi32(sum, x0);
248   }
249   while (--rowCount);
250 
251   sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4e)); // 01001110
252   sum = _mm_unpacklo_epi32(sum, sum);
253   sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4e)); // 01001110
254   _mm_storeu_si128((__m128i *)distortionResult, _mm_unpacklo_epi32(sum, _mm_setzero_si128()));
255 
256   (void)areaWidth;
257   (void)areaHeight;
258 }
259 
void FullDistortionKernelIntra16MxN_32bit_BT_SSE2(
	EB_S16  *coeff,
	EB_U32   coeffStride,
	EB_S16  *reconCoeff,
	EB_U32   reconCoeffStride,
	EB_U64   distortionResult[2],
	EB_U32   areaWidth,
	EB_U32   areaHeight)
268 {
269   EB_S32 rowCount, colCount;
270   __m128i sum = _mm_setzero_si128();
271 
272   colCount = areaWidth;
273   do
274   {
275     EB_S16 *coeffTemp = coeff;
276     EB_S16 *reconCoeffTemp = reconCoeff;
277 
278     rowCount = areaHeight;
279     do
280     {
281       __m128i x0, x1;
282       __m128i y0, y1;
283 
284       x0 = _mm_loadu_si128((__m128i *)(coeffTemp + 0x00));
285       x1 = _mm_loadu_si128((__m128i *)(coeffTemp + 0x08));
286       y0 = _mm_loadu_si128((__m128i *)(reconCoeffTemp + 0x00));
287       y1 = _mm_loadu_si128((__m128i *)(reconCoeffTemp + 0x08));
288       coeffTemp += coeffStride;
289       reconCoeffTemp += reconCoeffStride;
290 
291       x0 = _mm_sub_epi16(x0, y0);
292       x1 = _mm_sub_epi16(x1, y1);
293 
294       x0 = _mm_madd_epi16(x0, x0);
295       x1 = _mm_madd_epi16(x1, x1);
296 
297       sum = _mm_add_epi32(sum, x0);
298       sum = _mm_add_epi32(sum, x1);
299     }
300     while (--rowCount);
301 
302     coeff += 16;
303     reconCoeff += 16;
304     colCount -= 16;
305   }
306   while (colCount > 0);
307 
308   sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4e)); // 01001110
309   sum = _mm_unpacklo_epi32(sum, sum);
310   sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4e)); // 01001110
311   _mm_storeu_si128((__m128i *)distortionResult, _mm_unpacklo_epi32(sum, _mm_setzero_si128()));
312 }
313 
314 
315 
void FullDistortionKernelCbfZero4x4_32bit_BT_SSE2(
	EB_S16  *coeff,
	EB_U32   coeffStride,
	EB_S16  *reconCoeff,
	EB_U32   reconCoeffStride,
	EB_U64   distortionResult[2],
	EB_U32   areaWidth,
	EB_U32   areaHeight)
324 {
325   EB_S32 rowCount;
326   __m128i sum2 = _mm_setzero_si128();
327 
328   rowCount = 2;
329   do
330   {
331     __m128i x0;
332     __m128i z0;
333 
334     x0 = _mm_loadl_epi64((__m128i *)coeff); coeff += coeffStride;
335     x0 = _mm_loadh_epi64(x0, (__m128i *)coeff); coeff += coeffStride;
336 
337     z0 = _mm_madd_epi16(x0, x0);
338 
339     sum2 = _mm_add_epi32(sum2, z0);
340   }
341   while (--rowCount);
342 
343   sum2 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 0x4e)); // 01001110
344   sum2 = _mm_unpacklo_epi32(sum2, sum2);
345   sum2 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 0x4e)); // 01001110
346   _mm_storeu_si128((__m128i *)distortionResult, _mm_unpacklo_epi32(sum2, _mm_setzero_si128()));
347 
348   (void)areaWidth;
349   (void)areaHeight;
350   (void)reconCoeff;
351   (void)reconCoeffStride;
352 }
353 
void FullDistortionKernelCbfZero8x8_32bit_BT_SSE2(
	EB_S16  *coeff,
	EB_U32   coeffStride,
	EB_S16  *reconCoeff,
	EB_U32   reconCoeffStride,
	EB_U64   distortionResult[2],
	EB_U32   areaWidth,
	EB_U32   areaHeight)
362 {
363   EB_S32 rowCount;
364   __m128i sum2 = _mm_setzero_si128();
365 
366   rowCount = 8;
367   do
368   {
369     __m128i x0;
370     __m128i z0;
371 
372     x0 = _mm_loadu_si128((__m128i *)(coeff + 0x00));
373     coeff += coeffStride;
374 
375     z0 = _mm_madd_epi16(x0, x0);
376 
377     sum2 = _mm_add_epi32(sum2, z0);
378   }
379   while (--rowCount);
380 
381   sum2 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 0x4e)); // 01001110
382   sum2 = _mm_unpacklo_epi32(sum2, sum2);
383   sum2 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 0x4e)); // 01001110
384   _mm_storeu_si128((__m128i *)distortionResult, _mm_unpacklo_epi32(sum2, _mm_setzero_si128()));
385 
386   (void)areaWidth;
387   (void)areaHeight;
388   (void)reconCoeff;
389   (void)reconCoeffStride;
390 }
391 
void FullDistortionKernelCbfZero16MxN_32bit_BT_SSE2(
	EB_S16  *coeff,
	EB_U32   coeffStride,
	EB_S16  *reconCoeff,
	EB_U32   reconCoeffStride,
	EB_U64   distortionResult[2],
	EB_U32   areaWidth,
	EB_U32   areaHeight)
400 {
401   EB_S32 rowCount, colCount;
402   __m128i sum2 = _mm_setzero_si128();
403 
404   colCount = areaWidth;
405   do
406   {
407     EB_S16 *coeffTemp = coeff;
408 
409     rowCount = areaHeight;
410     do
411     {
412       __m128i x0, x1;
413       __m128i z0, z1;
414 
415       x0 = _mm_loadu_si128((__m128i *)(coeffTemp + 0x00));
416       x1 = _mm_loadu_si128((__m128i *)(coeffTemp + 0x08));
417       coeffTemp += coeffStride;
418 
419       z0 = _mm_madd_epi16(x0, x0);
420       z1 = _mm_madd_epi16(x1, x1);
421 
422       sum2 = _mm_add_epi32(sum2, z0);
423       sum2 = _mm_add_epi32(sum2, z1);
424     }
425     while (--rowCount);
426 
427     coeff += 16;
428     reconCoeff += 16;
429     colCount -= 16;
430   }
431   while (colCount > 0);
432 
433   sum2 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 0x4e)); // 01001110
434   sum2 = _mm_unpacklo_epi32(sum2, sum2);
435   sum2 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 0x4e)); // 01001110
436   _mm_storeu_si128((__m128i *)distortionResult, _mm_unpacklo_epi32(sum2, _mm_setzero_si128()));
437   (void)reconCoeffStride;
438 
439 }
440 
441 /*******************************************************************************
442                          PictureCopyKernel_INTRIN
443 *******************************************************************************/
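// These kernels copy fixed-size blocks of 8-bit samples; the block size is implied by the
// function name, and the areaWidth/areaHeight arguments are ignored.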
void PictureCopyKernel4x4_SSE_INTRIN(
	EB_BYTE                  src,
	EB_U32                   srcStride,
	EB_BYTE                  dst,
	EB_U32                   dstStride,
	EB_U32                   areaWidth,
	EB_U32                   areaHeight)
451 {
452     *(EB_U32 *)dst = *(EB_U32 *)src;
453     *(EB_U32 *)(dst + dstStride) = *(EB_U32 *)(src + srcStride);
454     *(EB_U32 *)(dst + (dstStride << 1)) = *(EB_U32 *)(src + (srcStride << 1));
455     *(EB_U32 *)(dst + (dstStride * 3)) = *(EB_U32 *)(src + (srcStride * 3));
456 
457 	(void)areaWidth;
458 	(void)areaHeight;
459 
460 	return;
461 }
462 
void PictureCopyKernel8x8_SSE2_INTRIN(
	EB_BYTE                  src,
	EB_U32                   srcStride,
	EB_BYTE                  dst,
	EB_U32                   dstStride,
	EB_U32                   areaWidth,
	EB_U32                   areaHeight)
470 {
    _mm_storel_epi64((__m128i*)dst, _mm_cvtsi64_si128(*(EB_U64 *)src));
    _mm_storel_epi64((__m128i*)(dst + dstStride), _mm_cvtsi64_si128(*(EB_U64 *)(src + srcStride)));
    _mm_storel_epi64((__m128i*)(dst + (dstStride << 1)), _mm_cvtsi64_si128(*(EB_U64 *)(src + (srcStride << 1))));
    _mm_storel_epi64((__m128i*)(dst + 3*dstStride), _mm_cvtsi64_si128(*(EB_U64 *)(src + 3*srcStride)));

    src += (srcStride << 2);
    dst += (dstStride << 2);

    _mm_storel_epi64((__m128i*)dst, _mm_cvtsi64_si128(*(EB_U64 *)src));
    _mm_storel_epi64((__m128i*)(dst + dstStride), _mm_cvtsi64_si128(*(EB_U64 *)(src + srcStride)));
    _mm_storel_epi64((__m128i*)(dst + (dstStride << 1)), _mm_cvtsi64_si128(*(EB_U64 *)(src + (srcStride << 1))));
    _mm_storel_epi64((__m128i*)(dst + 3*dstStride), _mm_cvtsi64_si128(*(EB_U64 *)(src + 3*srcStride)));
483 
484 	(void)areaWidth;
485 	(void)areaHeight;
486 
487 	return;
488 }
489 
void PictureCopyKernel16x16_SSE2_INTRIN(
	EB_BYTE                  src,
	EB_U32                   srcStride,
	EB_BYTE                  dst,
	EB_U32                   dstStride,
	EB_U32                   areaWidth,
	EB_U32                   areaHeight)
497 {
498 	_mm_storeu_si128((__m128i*)dst, _mm_loadu_si128((__m128i*)src));
499     _mm_storeu_si128((__m128i*)(dst + dstStride), _mm_loadu_si128((__m128i*)(src + srcStride)));
500     _mm_storeu_si128((__m128i*)(dst + (dstStride << 1)), _mm_loadu_si128((__m128i*)(src + (srcStride << 1))));
501     _mm_storeu_si128((__m128i*)(dst + (dstStride * 3)), _mm_loadu_si128((__m128i*)(src + (srcStride * 3))));
502 
503 	src += (srcStride << 2);
504     dst += (dstStride << 2);
505 
506    	_mm_storeu_si128((__m128i*)dst, _mm_loadu_si128((__m128i*)src));
507     _mm_storeu_si128((__m128i*)(dst + dstStride), _mm_loadu_si128((__m128i*)(src + srcStride)));
508     _mm_storeu_si128((__m128i*)(dst + (dstStride << 1)), _mm_loadu_si128((__m128i*)(src + (srcStride << 1))));
509     _mm_storeu_si128((__m128i*)(dst + (dstStride * 3)), _mm_loadu_si128((__m128i*)(src + (srcStride * 3))));
510 
511    	src += (srcStride << 2);
512     dst += (dstStride << 2);
513 
514     _mm_storeu_si128((__m128i*)dst, _mm_loadu_si128((__m128i*)src));
515     _mm_storeu_si128((__m128i*)(dst + dstStride), _mm_loadu_si128((__m128i*)(src + srcStride)));
516     _mm_storeu_si128((__m128i*)(dst + (dstStride << 1)), _mm_loadu_si128((__m128i*)(src + (srcStride << 1))));
517     _mm_storeu_si128((__m128i*)(dst + (dstStride * 3)), _mm_loadu_si128((__m128i*)(src + (srcStride * 3))));
518 
519 	src += (srcStride << 2);
520     dst += (dstStride << 2);
521 
522    	_mm_storeu_si128((__m128i*)dst, _mm_loadu_si128((__m128i*)src));
523     _mm_storeu_si128((__m128i*)(dst + dstStride), _mm_loadu_si128((__m128i*)(src + srcStride)));
524     _mm_storeu_si128((__m128i*)(dst + (dstStride << 1)), _mm_loadu_si128((__m128i*)(src + (srcStride << 1))));
525     _mm_storeu_si128((__m128i*)(dst + (dstStride * 3)), _mm_loadu_si128((__m128i*)(src + (srcStride * 3))));
526 
527    	src += (srcStride << 2);
528     dst += (dstStride << 2);
529 
530 	(void)areaWidth;
531 	(void)areaHeight;
532 
533 	return;
534 }
535 
536 
void PictureCopyKernel32x32_SSE2_INTRIN(
	EB_BYTE                  src,
	EB_U32                   srcStride,
	EB_BYTE                  dst,
	EB_U32                   dstStride,
	EB_U32                   areaWidth,
	EB_U32                   areaHeight)
544 {
545 	EB_U32 y;
546 
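	// each iteration copies 8 rows of 32 bytes (two groups of 4 rows)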
547 	for (y = 0; y < 4; ++y){
548 
549 		_mm_storeu_si128((__m128i*)dst, _mm_loadu_si128((__m128i*)src));
550 		_mm_storeu_si128((__m128i*)(dst + 16), _mm_loadu_si128((__m128i*)(src + 16)));
551         _mm_storeu_si128((__m128i*)(dst + dstStride), _mm_loadu_si128((__m128i*)(src + srcStride)));
552 		_mm_storeu_si128((__m128i*)(dst + dstStride + 16), _mm_loadu_si128((__m128i*)(src + srcStride + 16)));
553         _mm_storeu_si128((__m128i*)(dst + (dstStride << 1)), _mm_loadu_si128((__m128i*)(src + (srcStride << 1))));
554 		_mm_storeu_si128((__m128i*)(dst + (dstStride << 1) + 16), _mm_loadu_si128((__m128i*)(src + (srcStride << 1) + 16)));
555         _mm_storeu_si128((__m128i*)(dst + 3 * dstStride), _mm_loadu_si128((__m128i*)(src + 3 * srcStride)));
556 		_mm_storeu_si128((__m128i*)(dst + 3 * dstStride + 16), _mm_loadu_si128((__m128i*)(src + 3 * srcStride + 16)));
557 
558         src += (srcStride << 2);
559         dst += (dstStride << 2);
560 
561         _mm_storeu_si128((__m128i*)dst, _mm_loadu_si128((__m128i*)src));
562 		_mm_storeu_si128((__m128i*)(dst + 16), _mm_loadu_si128((__m128i*)(src + 16)));
563         _mm_storeu_si128((__m128i*)(dst + dstStride), _mm_loadu_si128((__m128i*)(src + srcStride)));
564 		_mm_storeu_si128((__m128i*)(dst + dstStride + 16), _mm_loadu_si128((__m128i*)(src + srcStride + 16)));
565         _mm_storeu_si128((__m128i*)(dst + (dstStride << 1)), _mm_loadu_si128((__m128i*)(src + (srcStride << 1))));
566 		_mm_storeu_si128((__m128i*)(dst + (dstStride << 1) + 16), _mm_loadu_si128((__m128i*)(src + (srcStride << 1) + 16)));
567         _mm_storeu_si128((__m128i*)(dst + 3 * dstStride), _mm_loadu_si128((__m128i*)(src + 3 * srcStride)));
568 		_mm_storeu_si128((__m128i*)(dst + 3 * dstStride + 16), _mm_loadu_si128((__m128i*)(src + 3 * srcStride + 16)));
569 
570         src += (srcStride << 2);
571         dst += (dstStride << 2);
572 	}
573 	(void)areaWidth;
574 	(void)areaHeight;
575 
576 	return;
577 }
578 
void PictureCopyKernel64x64_SSE2_INTRIN(
	EB_BYTE                  src,
	EB_U32                   srcStride,
	EB_BYTE                  dst,
	EB_U32                   dstStride,
	EB_U32                   areaWidth,
	EB_U32                   areaHeight)
586 {
587 	EB_U32 y;
588 
589 	for (y = 0; y < 8; ++y){
590 
591 		_mm_storeu_si128((__m128i*)dst, _mm_loadu_si128((__m128i*)src));
592 		_mm_storeu_si128((__m128i*)(dst + 16), _mm_loadu_si128((__m128i*)(src + 16)));
593 		_mm_storeu_si128((__m128i*)(dst + 32), _mm_loadu_si128((__m128i*)(src + 32)));
594 		_mm_storeu_si128((__m128i*)(dst + 48), _mm_loadu_si128((__m128i*)(src + 48)));
595         _mm_storeu_si128((__m128i*)(dst + dstStride), _mm_loadu_si128((__m128i*)(src + srcStride)));
596 		_mm_storeu_si128((__m128i*)(dst + dstStride + 16), _mm_loadu_si128((__m128i*)(src + srcStride + 16)));
597 		_mm_storeu_si128((__m128i*)(dst + dstStride + 32), _mm_loadu_si128((__m128i*)(src + srcStride + 32)));
598 		_mm_storeu_si128((__m128i*)(dst + dstStride + 48), _mm_loadu_si128((__m128i*)(src + srcStride + 48)));
599         _mm_storeu_si128((__m128i*)(dst + (dstStride << 1)), _mm_loadu_si128((__m128i*)(src + (srcStride << 1))));
600 		_mm_storeu_si128((__m128i*)(dst + (dstStride << 1) + 16), _mm_loadu_si128((__m128i*)(src + (srcStride << 1) + 16)));
601 		_mm_storeu_si128((__m128i*)(dst + (dstStride << 1) + 32), _mm_loadu_si128((__m128i*)(src + (srcStride << 1) + 32)));
602 		_mm_storeu_si128((__m128i*)(dst + (dstStride << 1) + 48), _mm_loadu_si128((__m128i*)(src + (srcStride << 1) + 48)));
603         _mm_storeu_si128((__m128i*)(dst + 3 * dstStride), _mm_loadu_si128((__m128i*)(src + 3 * srcStride)));
604 		_mm_storeu_si128((__m128i*)(dst + 3 * dstStride + 16), _mm_loadu_si128((__m128i*)(src + 3 * srcStride + 16)));
605 		_mm_storeu_si128((__m128i*)(dst + 3 * dstStride + 32), _mm_loadu_si128((__m128i*)(src + 3 * srcStride + 32)));
606 		_mm_storeu_si128((__m128i*)(dst + 3 * dstStride + 48), _mm_loadu_si128((__m128i*)(src + 3 * srcStride + 48)));
607 
608         src += (srcStride << 2);
609         dst += (dstStride << 2);
610 
611         _mm_storeu_si128((__m128i*)dst, _mm_loadu_si128((__m128i*)src));
612 		_mm_storeu_si128((__m128i*)(dst + 16), _mm_loadu_si128((__m128i*)(src + 16)));
613 		_mm_storeu_si128((__m128i*)(dst + 32), _mm_loadu_si128((__m128i*)(src + 32)));
614 		_mm_storeu_si128((__m128i*)(dst + 48), _mm_loadu_si128((__m128i*)(src + 48)));
615         _mm_storeu_si128((__m128i*)(dst + dstStride), _mm_loadu_si128((__m128i*)(src + srcStride)));
616 		_mm_storeu_si128((__m128i*)(dst + dstStride + 16), _mm_loadu_si128((__m128i*)(src + srcStride + 16)));
617 		_mm_storeu_si128((__m128i*)(dst + dstStride + 32), _mm_loadu_si128((__m128i*)(src + srcStride + 32)));
618 		_mm_storeu_si128((__m128i*)(dst + dstStride + 48), _mm_loadu_si128((__m128i*)(src + srcStride + 48)));
619         _mm_storeu_si128((__m128i*)(dst + (dstStride << 1)), _mm_loadu_si128((__m128i*)(src + (srcStride << 1))));
620 		_mm_storeu_si128((__m128i*)(dst + (dstStride << 1) + 16), _mm_loadu_si128((__m128i*)(src + (srcStride << 1) + 16)));
621 		_mm_storeu_si128((__m128i*)(dst + (dstStride << 1) + 32), _mm_loadu_si128((__m128i*)(src + (srcStride << 1) + 32)));
622 		_mm_storeu_si128((__m128i*)(dst + (dstStride << 1) + 48), _mm_loadu_si128((__m128i*)(src + (srcStride << 1) + 48)));
623         _mm_storeu_si128((__m128i*)(dst + 3 * dstStride), _mm_loadu_si128((__m128i*)(src + 3 * srcStride)));
624 		_mm_storeu_si128((__m128i*)(dst + 3 * dstStride + 16), _mm_loadu_si128((__m128i*)(src + 3 * srcStride + 16)));
625 		_mm_storeu_si128((__m128i*)(dst + 3 * dstStride + 32), _mm_loadu_si128((__m128i*)(src + 3 * srcStride + 32)));
626 		_mm_storeu_si128((__m128i*)(dst + 3 * dstStride + 48), _mm_loadu_si128((__m128i*)(src + 3 * srcStride + 48)));
627 
628         src += (srcStride << 2);
629         dst += (dstStride << 2);
630 	}
631 	(void)areaWidth;
632 	(void)areaHeight;
633 
634 	return;
635 }
636 /*******************************************************************************
637                       PictureAdditionKernel_INTRIN
638 *******************************************************************************/
void PictureAdditionKernel4x4_SSE_INTRIN(
	EB_U8  *predPtr,
	EB_U32  predStride,
	EB_S16 *residualPtr,
	EB_U32  residualStride,
	EB_U8  *reconPtr,
	EB_U32  reconStride,
	EB_U32  width,
	EB_U32  height)
648 {
649 	EB_U32 y;
650     __m128i xmm0, recon_0_3;
651 	xmm0 = _mm_setzero_si128();
652 
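	// widen the 8-bit prediction to 16 bits, add the 16-bit residual, then pack with unsigned saturation to clip the reconstruction to [0, 255]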
653 	for (y = 0; y < 4; ++y){
654 
655 		recon_0_3 = _mm_packus_epi16(_mm_add_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(EB_U32 *)predPtr), xmm0), _mm_loadl_epi64((__m128i *)residualPtr)), xmm0);
656 
657 		*(EB_U32 *)reconPtr = _mm_cvtsi128_si32(recon_0_3);
658 		predPtr += predStride;
659 		residualPtr += residualStride;
660 		reconPtr += reconStride;
661 	}
662 	(void)width;
663 	(void)height;
664 
665 	return;
666 }
667 
void PictureAdditionKernel8x8_SSE2_INTRIN(
	EB_U8  *predPtr,
	EB_U32  predStride,
	EB_S16 *residualPtr,
	EB_U32  residualStride,
	EB_U8  *reconPtr,
	EB_U32  reconStride,
	EB_U32  width,
	EB_U32  height)
677 {
678 
679 	__m128i recon_0_7, xmm0;
680 	EB_U32 y;
681 
682 	xmm0 = _mm_setzero_si128();
683 
684 	for (y = 0; y < 8; ++y){
685 
686 		recon_0_7 = _mm_packus_epi16(_mm_add_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)predPtr), xmm0), _mm_loadu_si128((__m128i *)residualPtr)), xmm0);
687 
688 		*(EB_U64 *)reconPtr = _mm_cvtsi128_si64(recon_0_7);
689 		predPtr += predStride;
690 		residualPtr += residualStride;
691 		reconPtr += reconStride;
692 	}
693 	(void)width;
694 	(void)height;
695 
696 	return;
697 }
698 
void PictureAdditionKernel16x16_SSE2_INTRIN(
	EB_U8  *predPtr,
	EB_U32  predStride,
	EB_S16 *residualPtr,
	EB_U32  residualStride,
	EB_U8  *reconPtr,
	EB_U32  reconStride,
	EB_U32  width,
	EB_U32  height)
708 {
709 	__m128i xmm0, xmm_clip_U8, pred_0_15, recon_0_7, recon_8_15;
710 	EB_U32 y;
711 
712 	xmm0 = _mm_setzero_si128();
713 
714 	for (y = 0; y < 16; ++y){
715 
716 		pred_0_15 = _mm_loadu_si128((__m128i *)predPtr);
717 		recon_0_7 = _mm_add_epi16(_mm_unpacklo_epi8(pred_0_15, xmm0), _mm_loadu_si128((__m128i *)residualPtr));
718 		recon_8_15 = _mm_add_epi16(_mm_unpackhi_epi8(pred_0_15, xmm0), _mm_loadu_si128((__m128i *)(residualPtr + 8)));
719 		xmm_clip_U8 = _mm_packus_epi16(recon_0_7, recon_8_15);
720 
721 		_mm_storeu_si128((__m128i*)reconPtr, xmm_clip_U8);
722 
723 		predPtr += predStride;
724 		residualPtr += residualStride;
725 		reconPtr += reconStride;
726 	}
727 	(void)width;
728 	(void)height;
729 
730 	return;
731 
732 }
void PictureAdditionKernel32x32_SSE2_INTRIN(
	EB_U8  *predPtr,
	EB_U32  predStride,
	EB_S16 *residualPtr,
	EB_U32  residualStride,
	EB_U8  *reconPtr,
	EB_U32  reconStride,
	EB_U32  width,
	EB_U32  height)
742 {
743 	EB_U32 y;
744     __m128i xmm0, pred_0_15, pred_16_31, recon_0_15_clipped, recon_0_7, recon_8_15, recon_16_23, recon_24_31, recon_16_31_clipped;
745 	xmm0 = _mm_setzero_si128();
746 
747 	for (y = 0; y < 32; ++y){
748 		pred_0_15 = _mm_loadu_si128((__m128i *)predPtr);
749         pred_16_31 = _mm_loadu_si128((__m128i *)(predPtr + 16));
750 
751 		recon_0_7 = _mm_add_epi16(_mm_unpacklo_epi8(pred_0_15, xmm0), _mm_loadu_si128((__m128i *)residualPtr));
752 		recon_8_15 = _mm_add_epi16(_mm_unpackhi_epi8(pred_0_15, xmm0), _mm_loadu_si128((__m128i *)(residualPtr + 8)));
753 		recon_16_23 = _mm_add_epi16(_mm_unpacklo_epi8(pred_16_31, xmm0), _mm_loadu_si128((__m128i *)(residualPtr + 16)));
754 		recon_24_31 = _mm_add_epi16(_mm_unpackhi_epi8(pred_16_31, xmm0), _mm_loadu_si128((__m128i *)(residualPtr + 24)));
755 
756         recon_0_15_clipped = _mm_packus_epi16(recon_0_7, recon_8_15);
757 		recon_16_31_clipped = _mm_packus_epi16(recon_16_23, recon_24_31);
758 
759         _mm_storeu_si128((__m128i*)reconPtr, recon_0_15_clipped);
760 		_mm_storeu_si128((__m128i*)(reconPtr + 16), recon_16_31_clipped);
761 
762 		predPtr += predStride;
763 		residualPtr += residualStride;
764 		reconPtr += reconStride;
765 	}
766 	(void)width;
767 	(void)height;
768 
769 	return;
770 }
771 
void PictureAdditionKernel64x64_SSE2_INTRIN(
	EB_U8  *predPtr,
	EB_U32  predStride,
	EB_S16 *residualPtr,
	EB_U32  residualStride,
	EB_U8  *reconPtr,
	EB_U32  reconStride,
	EB_U32  width,
	EB_U32  height)
781 {
782 	EB_U32 y;
783 
784     __m128i xmm0, pred_0_15, pred_16_31, pred_32_47, pred_48_63;
785     __m128i recon_0_15_clipped, recon_16_31_clipped, recon_32_47_clipped, recon_48_63_clipped;
786     __m128i recon_0_7, recon_8_15, recon_16_23, recon_24_31, recon_32_39, recon_40_47, recon_48_55, recon_56_63;
787 
788     xmm0 = _mm_setzero_si128();
789 
790 	for (y = 0; y < 64; ++y){
791 
792 		pred_0_15 = _mm_loadu_si128((__m128i *)predPtr);
793         pred_16_31 = _mm_loadu_si128((__m128i *)(predPtr + 16));
794         pred_32_47 = _mm_loadu_si128((__m128i *)(predPtr + 32));
795         pred_48_63 = _mm_loadu_si128((__m128i *)(predPtr + 48));
796 
797 		recon_0_7 = _mm_add_epi16(_mm_unpacklo_epi8(pred_0_15, xmm0), _mm_loadu_si128((__m128i *)residualPtr));
798 		recon_8_15 = _mm_add_epi16(_mm_unpackhi_epi8(pred_0_15, xmm0), _mm_loadu_si128((__m128i *)(residualPtr + 8)));
799 		recon_16_23 = _mm_add_epi16(_mm_unpacklo_epi8(pred_16_31, xmm0), _mm_loadu_si128((__m128i *)(residualPtr + 16)));
800 		recon_24_31 = _mm_add_epi16(_mm_unpackhi_epi8(pred_16_31, xmm0), _mm_loadu_si128((__m128i *)(residualPtr + 24)));
801 		recon_32_39 = _mm_add_epi16(_mm_unpacklo_epi8(pred_32_47, xmm0), _mm_loadu_si128((__m128i *)(residualPtr + 32)));
802 		recon_40_47 = _mm_add_epi16(_mm_unpackhi_epi8(pred_32_47, xmm0), _mm_loadu_si128((__m128i *)(residualPtr + 40)));
803 		recon_48_55 = _mm_add_epi16(_mm_unpacklo_epi8(pred_48_63, xmm0), _mm_loadu_si128((__m128i *)(residualPtr + 48)));
804 		recon_56_63 = _mm_add_epi16(_mm_unpackhi_epi8(pred_48_63, xmm0), _mm_loadu_si128((__m128i *)(residualPtr + 56)));
805 
806         recon_0_15_clipped = _mm_packus_epi16(recon_0_7, recon_8_15);
807         recon_16_31_clipped = _mm_packus_epi16(recon_16_23, recon_24_31);
808         recon_32_47_clipped = _mm_packus_epi16(recon_32_39, recon_40_47);
809 		recon_48_63_clipped = _mm_packus_epi16(recon_48_55, recon_56_63);
810 
811         _mm_storeu_si128((__m128i*)reconPtr, recon_0_15_clipped);
812         _mm_storeu_si128((__m128i*)(reconPtr + 16), recon_16_31_clipped);
813         _mm_storeu_si128((__m128i*)(reconPtr + 32), recon_32_47_clipped);
814 		_mm_storeu_si128((__m128i*)(reconPtr + 48), recon_48_63_clipped);
815 
816 		predPtr += predStride;
817 		residualPtr += residualStride;
818 		reconPtr += reconStride;
819 	}
820 	(void)width;
821 	(void)height;
822 
823 	return;
824 }
825 
826 /******************************************************************************************************
827 ResidualKernel
828 ***********************************************************************************************************/
829 
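// residual = input - pred; the 8-bit samples are widened to 16 bits before the subtraction so the result keeps its sign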
void ResidualKernel4x4_SSE_INTRIN(
	EB_U8   *input,
	EB_U32   inputStride,
	EB_U8   *pred,
	EB_U32   predStride,
	EB_S16  *residual,
	EB_U32   residualStride,
	EB_U32   areaWidth,
	EB_U32   areaHeight)
839 {
840 	__m128i residual_0_3, xmm0 = _mm_setzero_si128();
841 	EB_U32 y;
842 
843 	for (y = 0; y < 4; ++y){
844 
845 		residual_0_3 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(EB_U32 *)input), xmm0),
846                                      _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(EB_U32 *)pred), xmm0));
847 
848 		*(EB_U64 *)residual = _mm_cvtsi128_si64(residual_0_3);
849 
850 		input += inputStride;
851 		pred += predStride;
852 		residual += residualStride;
853 	}
854 	(void)areaWidth;
855 	(void)areaHeight;
856 
857 	return;
858 }
859 
void ResidualKernel8x8_SSE2_INTRIN(
	EB_U8   *input,
	EB_U32   inputStride,
	EB_U8   *pred,
	EB_U32   predStride,
	EB_S16  *residual,
	EB_U32   residualStride,
	EB_U32   areaWidth,
	EB_U32   areaHeight)
869 {
870 	__m128i xmm0, residual_0_7;
871 	EB_U32 y;
872 
873 	xmm0 = _mm_setzero_si128();
874 
875 	for (y = 0; y < 8; ++y){
876 
877 		residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)pred), xmm0));
878 
879 		_mm_storeu_si128((__m128i*)residual, residual_0_7);
880 
881 		input += inputStride;
882 		pred += predStride;
883 		residual += residualStride;
884 	}
885 	(void)areaWidth;
886 	(void)areaHeight;
887 
888 	return;
889 }
890 
void ResidualKernel16x16_SSE2_INTRIN(
	EB_U8   *input,
	EB_U32   inputStride,
	EB_U8   *pred,
	EB_U32   predStride,
	EB_S16  *residual,
	EB_U32   residualStride,
	EB_U32   areaWidth,
	EB_U32   areaHeight)
900 {
901     __m128i xmm0, residual_0_7, residual_8_15;
902 	EB_U32 y;
903 
904 	xmm0 = _mm_setzero_si128();
905 
906 	for (y = 0; y < 16; ++y){
907 
908 		residual_0_7 = _mm_sub_epi16( _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
909 		residual_8_15 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
910 
911 		_mm_storeu_si128((__m128i*)residual, residual_0_7);
912 		_mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
913 
914 		input += inputStride;
915 		pred += predStride;
916 		residual += residualStride;
917 	}
918 	(void)areaWidth;
919 	(void)areaHeight;
920 
921 	return;
922 }
923 
void ResidualKernel32x32_SSE2_INTRIN(
	EB_U8   *input,
	EB_U32   inputStride,
	EB_U8   *pred,
	EB_U32   predStride,
	EB_S16  *residual,
	EB_U32   residualStride,
	EB_U32   areaWidth,
	EB_U32   areaHeight)
933 {
934 	__m128i xmm0, residual_0_7, residual_8_15, residual_16_23, residual_24_31;
935 	EB_U32 y;
936 
937 	xmm0 = _mm_setzero_si128();
938 
939 	for (y = 0; y < 32; ++y){
940 
941 		residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
942 		residual_8_15 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
943 		residual_16_23 = _mm_sub_epi16( _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
944 		residual_24_31 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
945 
946         _mm_storeu_si128((__m128i*)residual, residual_0_7);
947 		_mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
948 		_mm_storeu_si128((__m128i*)(residual + 16), residual_16_23);
949 		_mm_storeu_si128((__m128i*)(residual + 24), residual_24_31);
950 
951 		input += inputStride;
952 		pred += predStride;
953 		residual += residualStride;
954 	}
955 	(void)areaWidth;
956 	(void)areaHeight;
957 
958 	return;
959 }
960 
void ResidualKernel64x64_SSE2_INTRIN(
	EB_U8   *input,
	EB_U32   inputStride,
	EB_U8   *pred,
	EB_U32   predStride,
	EB_S16  *residual,
	EB_U32   residualStride,
	EB_U32   areaWidth,
	EB_U32   areaHeight)
970 {
	__m128i xmm0, residual_0_7, residual_8_15, residual_16_23, residual_24_31, residual_32_39, residual_40_47, residual_48_55, residual_56_63;
972 	EB_U32 y;
973 
974 	xmm0 = _mm_setzero_si128();
975 
976 	for (y = 0; y < 64; ++y){
977 
978 		residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
979 		residual_8_15 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
980 		residual_16_23 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
981 		residual_24_31 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
		residual_32_39 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 32)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 32)), xmm0));
983 		residual_40_47 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 32)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 32)), xmm0));
984 		residual_48_55 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 48)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 48)), xmm0));
985 		residual_56_63 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 48)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 48)), xmm0));
986 
987         _mm_storeu_si128((__m128i*)residual, residual_0_7);
988 		_mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
989 		_mm_storeu_si128((__m128i*)(residual + 16), residual_16_23);
990 		_mm_storeu_si128((__m128i*)(residual + 24), residual_24_31);
        _mm_storeu_si128((__m128i*)(residual + 32), residual_32_39);
992 		_mm_storeu_si128((__m128i*)(residual + 40), residual_40_47);
993 		_mm_storeu_si128((__m128i*)(residual + 48), residual_48_55);
994 		_mm_storeu_si128((__m128i*)(residual + 56), residual_56_63);
995 
996 		input += inputStride;
997 		pred += predStride;
998 		residual += residualStride;
999 	}
1000 	(void)areaWidth;
1001 	(void)areaHeight;
1002 
1003 	return;
1004 }
1005 
void ResidualKernelSubSampled4x4_SSE_INTRIN(
	EB_U8   *input,
	EB_U32   inputStride,
	EB_U8   *pred,
	EB_U32   predStride,
	EB_S16  *residual,
	EB_U32   residualStride,
	EB_U32   areaWidth,
	EB_U32   areaHeight,
	EB_U8    lastLine)
1016 {
1017 	__m128i residual_0_3, xmm0 = _mm_setzero_si128();
1018 	EB_U32 y;
    //hard-code the sub-sampling: halve the row count and double the source strides, but keep residualStride
    areaHeight>>=1;
    inputStride<<=1;
    predStride<<=1;
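    // each computed residual row is stored twice (top and bottom field rows), so the residual block still covers the full areaHeight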
1023 
1024 	for (y = 0; y < areaHeight; ++y){
1025 
1026 		residual_0_3 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(EB_U32 *)input), xmm0),
1027                                      _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(EB_U32 *)pred), xmm0));
1028 
1029 		*(EB_U64 *)residual = _mm_cvtsi128_si64(residual_0_3);
1030 
1031         residual += residualStride;
1032         *(EB_U64 *)residual = _mm_cvtsi128_si64(residual_0_3);
1033 
1034 		input += inputStride;
1035 		pred += predStride;
1036 		residual += residualStride;
1037 	}
1038 	(void)areaWidth;
1039     //compute the last line
1040 
1041     if(lastLine){
1042     input -= (inputStride)>>1;
1043 	pred  -= (predStride )>>1;
1044 	residual -= residualStride;
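    // step back one source row and overwrite the duplicated last row with the true last-line residual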
1045     residual_0_3 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(EB_U32 *)input), xmm0),
1046                                  _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(EB_U32 *)pred), xmm0));
1047 
1048 	*(EB_U64 *)residual = _mm_cvtsi128_si64(residual_0_3);
1049     }
1050 
1051 	return;
1052 }
1053 
void ResidualKernelSubSampled8x8_SSE2_INTRIN(
	EB_U8   *input,
	EB_U32   inputStride,
	EB_U8   *pred,
	EB_U32   predStride,
	EB_S16  *residual,
	EB_U32   residualStride,
	EB_U32   areaWidth,
	EB_U32   areaHeight,
	EB_U8    lastLine)
1066 {
1067 	__m128i xmm0, residual_0_7;
1068 	EB_U32 y;
1069 
1070 	xmm0 = _mm_setzero_si128();
    //hard-code the sub-sampling: halve the row count and double the source strides, but keep residualStride
    areaHeight>>=1;
    inputStride<<=1;
    predStride<<=1;
1075 
1076 	for (y = 0; y < areaHeight; ++y){
1077 
1078 		residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)pred), xmm0));
1079 
1080 		_mm_storeu_si128((__m128i*)residual, residual_0_7);
1081 
1082         residual += residualStride;
1083         _mm_storeu_si128((__m128i*)residual, residual_0_7);
1084 
1085 		input += inputStride;
1086 		pred += predStride;
1087 		residual += residualStride;
1088 	}
1089 	(void)areaWidth;
1090     //compute the last line
1091     if(lastLine){
1092 
1093     input -= (inputStride)>>1;
1094 	pred  -= (predStride )>>1;
1095 	residual -= residualStride;
1096 
1097     residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)pred), xmm0));
1098 
1099 	_mm_storeu_si128((__m128i*)residual, residual_0_7);
1100 
1101     }
1102 
1103 	return;
1104 }
1105 
void ResidualKernelSubSampled16x16_SSE2_INTRIN(
	EB_U8   *input,
	EB_U32   inputStride,
	EB_U8   *pred,
	EB_U32   predStride,
	EB_S16  *residual,
	EB_U32   residualStride,
	EB_U32   areaWidth,
	EB_U32   areaHeight,
	EB_U8    lastLine)
1118 {
1119     __m128i xmm0, residual_0_7, residual_8_15;
1120 	EB_U32 y;
1121 
1122 	xmm0 = _mm_setzero_si128();
    //hard-code the sub-sampling: halve the row count and double the source strides, but keep residualStride
    areaHeight>>=1;
    inputStride<<=1;
    predStride<<=1;
1127 
1128 	for (y = 0; y < areaHeight; ++y){
1129 
1130 		residual_0_7 = _mm_sub_epi16( _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
1131 		residual_8_15 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
1132 
1133 		_mm_storeu_si128((__m128i*)residual, residual_0_7);
1134 		_mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
1135 
1136         residual += residualStride;
1137         _mm_storeu_si128((__m128i*)residual, residual_0_7);
1138 		_mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
1139 
1140 		input += inputStride;
1141 		pred += predStride;
1142 		residual += residualStride;
1143 	}
1144 	(void)areaWidth;
1145     //compute the last line
1146 
1147     if(lastLine){
1148 
1149     input -= (inputStride)>>1;
1150 	pred  -= (predStride )>>1;
1151 	residual -= residualStride;
1152 
1153     residual_0_7 = _mm_sub_epi16( _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
1154 	residual_8_15 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
1155 
1156 	_mm_storeu_si128((__m128i*)residual, residual_0_7);
1157 	_mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
1158 
1159     }
1160 	return;
1161 }
1162 
void ResidualKernelSubSampled32x32_SSE2_INTRIN(
	EB_U8   *input,
	EB_U32   inputStride,
	EB_U8   *pred,
	EB_U32   predStride,
	EB_S16  *residual,
	EB_U32   residualStride,
	EB_U32   areaWidth,
	EB_U32   areaHeight,
	EB_U8    lastLine)
1173 {
1174 	__m128i xmm0, residual_0_7, residual_8_15, residual_16_23, residual_24_31;
1175 	EB_U32 y;
1176 
1177 	xmm0 = _mm_setzero_si128();
1178 
    //hard-code the sub-sampling: halve the row count and double the source strides, but keep residualStride
    areaHeight>>=1;
    inputStride<<=1;
    predStride<<=1;
1183 
1184 
1185 	for (y = 0; y < areaHeight; ++y){
1186 
1187 		residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
1188 		residual_8_15 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
1189 		residual_16_23 = _mm_sub_epi16( _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
1190 		residual_24_31 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
1191 
1192         _mm_storeu_si128((__m128i*)residual, residual_0_7);
1193 		_mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
1194 		_mm_storeu_si128((__m128i*)(residual + 16), residual_16_23);
1195 		_mm_storeu_si128((__m128i*)(residual + 24), residual_24_31);
1196 
1197          residual += residualStride;
1198         _mm_storeu_si128((__m128i*)residual, residual_0_7);
1199 		_mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
1200 		_mm_storeu_si128((__m128i*)(residual + 16), residual_16_23);
1201 		_mm_storeu_si128((__m128i*)(residual + 24), residual_24_31);
1202 
1203 		input += inputStride;
1204 		pred += predStride;
1205 		residual += residualStride;
1206 	}
1207 	(void)areaWidth;
1208         //compute the last line
1209 
1210     if(lastLine){
1211         input -= (inputStride)>>1;
1212 		pred  -= (predStride )>>1;
1213 		residual -= residualStride;
1214 
1215         residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
1216 		residual_8_15 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
1217 		residual_16_23 = _mm_sub_epi16( _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
1218 		residual_24_31 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
1219 
1220         _mm_storeu_si128((__m128i*)residual, residual_0_7);
1221 		_mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
1222 		_mm_storeu_si128((__m128i*)(residual + 16), residual_16_23);
1223 		_mm_storeu_si128((__m128i*)(residual + 24), residual_24_31);
1224     }
1225 
1226 	return;
1227 }
1228 
1229 
void ResidualKernelSubSampled64x64_SSE2_INTRIN(
	EB_U8   *input,
	EB_U32   inputStride,
	EB_U8   *pred,
	EB_U32   predStride,
	EB_S16  *residual,
	EB_U32   residualStride,
	EB_U32   areaWidth,
	EB_U32   areaHeight,
	EB_U8    lastLine)
1240 {
	__m128i xmm0, residual_0_7, residual_8_15, residual_16_23, residual_24_31, residual_32_39, residual_40_47, residual_48_55, residual_56_63;
1242 	EB_U32 y;
1243 
1244 	xmm0 = _mm_setzero_si128();
1245 
    //hard-code the sub-sampling: halve the row count and double the source strides, but keep residualStride
    areaHeight>>=1;
    inputStride<<=1;
    predStride<<=1;
1250 
1251 	for (y = 0; y < areaHeight; ++y){
1252 
1253 		residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
1254 		residual_8_15 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
1255 		residual_16_23 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
1256 		residual_24_31 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
		residual_32_39 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 32)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 32)), xmm0));
1258 		residual_40_47 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 32)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 32)), xmm0));
1259 		residual_48_55 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 48)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 48)), xmm0));
1260 		residual_56_63 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 48)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 48)), xmm0));
1261 
1262         _mm_storeu_si128((__m128i*)residual, residual_0_7);
1263 		_mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
1264 		_mm_storeu_si128((__m128i*)(residual + 16), residual_16_23);
1265 		_mm_storeu_si128((__m128i*)(residual + 24), residual_24_31);
        _mm_storeu_si128((__m128i*)(residual + 32), residual_32_39);
1267 		_mm_storeu_si128((__m128i*)(residual + 40), residual_40_47);
1268 		_mm_storeu_si128((__m128i*)(residual + 48), residual_48_55);
1269 		_mm_storeu_si128((__m128i*)(residual + 56), residual_56_63);
1270 
1271 //duplicate top field residual to bottom field
1272          residual += residualStride;
1273         _mm_storeu_si128((__m128i*)residual, residual_0_7);
1274 		_mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
1275 		_mm_storeu_si128((__m128i*)(residual + 16), residual_16_23);
1276 		_mm_storeu_si128((__m128i*)(residual + 24), residual_24_31);
        _mm_storeu_si128((__m128i*)(residual + 32), residual_32_39);
1278 		_mm_storeu_si128((__m128i*)(residual + 40), residual_40_47);
1279 		_mm_storeu_si128((__m128i*)(residual + 48), residual_48_55);
1280 		_mm_storeu_si128((__m128i*)(residual + 56), residual_56_63);
1281 
1282 		input += inputStride;
1283 		pred += predStride;
1284 		residual += residualStride;
1285 	}
1286 	(void)areaWidth;
1287         //compute the last line
1288 
1289     if(lastLine){
1290         input -= (inputStride)>>1;
1291 		pred  -= (predStride )>>1;
1292 		residual -= residualStride;
1293 
1294         residual_0_7 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
1295 		residual_8_15 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)input), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)pred), xmm0));
1296 		residual_16_23 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
1297 		residual_24_31 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 16)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 16)), xmm0));
		residual_32_39 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 32)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 32)), xmm0));
1299 		residual_40_47 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 32)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 32)), xmm0));
1300 		residual_48_55 = _mm_sub_epi16(_mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(input + 48)), xmm0), _mm_unpacklo_epi8(_mm_loadu_si128((__m128i *)(pred + 48)), xmm0));
1301 		residual_56_63 = _mm_sub_epi16(_mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(input + 48)), xmm0), _mm_unpackhi_epi8(_mm_loadu_si128((__m128i *)(pred + 48)), xmm0));
1302 
1303         _mm_storeu_si128((__m128i*)residual, residual_0_7);
1304 		_mm_storeu_si128((__m128i*)(residual + 8), residual_8_15);
1305 		_mm_storeu_si128((__m128i*)(residual + 16), residual_16_23);
1306 		_mm_storeu_si128((__m128i*)(residual + 24), residual_24_31);
        _mm_storeu_si128((__m128i*)(residual + 32), residual_32_39);
1308 		_mm_storeu_si128((__m128i*)(residual + 40), residual_40_47);
1309 		_mm_storeu_si128((__m128i*)(residual + 48), residual_48_55);
1310 		_mm_storeu_si128((__m128i*)(residual + 56), residual_56_63);
1311 
1312     }
1313 
1314 	return;
1315 }
1316 /******************************************************************************************************
1317                                        ResidualKernel16bit_SSE2_INTRIN
1318 ******************************************************************************************************/
void ResidualKernel16bit_SSE2_INTRIN(
	EB_U16  *input,
	EB_U32   inputStride,
	EB_U16  *pred,
	EB_U32   predStride,
	EB_S16  *residual,
	EB_U32   residualStride,
	EB_U32   areaWidth,
	EB_U32   areaHeight)
1328 {
1329 	EB_U32 x, y;
1330     __m128i residual0, residual1;
1331 
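	// 16-bit sample variant: dispatch on areaWidth and compute two rows per loop iteration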
1332 	if (areaWidth == 4)
1333 	{
1334 		for (y = 0; y < areaHeight; y += 2){
1335 
1336 			residual0 = _mm_sub_epi16(_mm_loadl_epi64((__m128i*)input), _mm_loadl_epi64((__m128i*)pred));
1337 			residual1 = _mm_sub_epi16(_mm_loadl_epi64((__m128i*)(input + inputStride)), _mm_loadl_epi64((__m128i*)(pred +  predStride)));
1338 
1339 			_mm_storel_epi64((__m128i*)residual, residual0);
1340 			_mm_storel_epi64((__m128i*)(residual + residualStride), residual1);
1341 
1342 			input += inputStride << 1;
1343 			pred += predStride << 1;
1344 			residual += residualStride << 1;
1345 		}
1346 	}
1347 	else if (areaWidth == 8){
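		// Width 8: two rows per iteration; each row is one full 128-bit load/store.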
1348 		for (y = 0; y < areaHeight; y += 2){
1349 
1350 			residual0 = _mm_sub_epi16(_mm_loadu_si128((__m128i*)input), _mm_loadu_si128((__m128i*)pred));
1351 			residual1 = _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + inputStride)), _mm_loadu_si128((__m128i*)(pred + predStride)));
1352 
1353 			_mm_storeu_si128((__m128i*) residual, residual0);
1354 			_mm_storeu_si128((__m128i*) (residual + residualStride), residual1);
1355 
1356 			input += inputStride << 1;
1357 			pred += predStride << 1;
1358 			residual += residualStride << 1;
1359 		}
1360 	}
1361 	else if(areaWidth == 16){
1362 
1363         __m128i residual2, residual3;
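		// Width 16: two rows per iteration; each row needs two 128-bit vectors.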
1364 
1365 		for (y = 0; y < areaHeight; y += 2){
1366 
1367 			residual0 = _mm_sub_epi16(_mm_loadu_si128((__m128i*)input), _mm_loadu_si128((__m128i*)pred));
1368 			residual1 = _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 8)), _mm_loadu_si128((__m128i*)(pred + 8)));
1369 			residual2 = _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input+inputStride)), _mm_loadu_si128((__m128i*)(pred+predStride)));
1370 			residual3 = _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + inputStride+8)), _mm_loadu_si128((__m128i*)(pred + predStride+8)));
1371 
1372 			_mm_storeu_si128((__m128i*)residual, residual0);
1373 			_mm_storeu_si128((__m128i*)(residual + 8), residual1);
1374 			_mm_storeu_si128((__m128i*)(residual+residualStride), residual2);
1375 			_mm_storeu_si128((__m128i*)(residual +residualStride+ 8), residual3);
1376 
1377 			input += inputStride << 1;
1378 			pred += predStride << 1;
1379 			residual += residualStride << 1;
1380 		}
1381 	}
1382 	else if(areaWidth == 32){
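		// Width 32: two rows per iteration; each row needs four 128-bit vectors, stored without temporaries.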
1383 
1384 		for (y = 0; y < areaHeight; y += 2){
1385             //residual[columnIndex] = ((EB_S16)input[columnIndex]) - ((EB_S16)pred[columnIndex]);
1386 			_mm_storeu_si128((__m128i*) residual, _mm_sub_epi16(_mm_loadu_si128((__m128i*)input), _mm_loadu_si128((__m128i*)pred)));
1387 			_mm_storeu_si128((__m128i*) (residual + 8), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 8)), _mm_loadu_si128((__m128i*)(pred + 8))));
1388 			_mm_storeu_si128((__m128i*) (residual + 16), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 16)), _mm_loadu_si128((__m128i*)(pred + 16))));
1389 			_mm_storeu_si128((__m128i*) (residual + 24), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 24)), _mm_loadu_si128((__m128i*)(pred + 24))));
1390 
1391 			_mm_storeu_si128((__m128i*) (residual + residualStride), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input+inputStride)), _mm_loadu_si128((__m128i*)(pred+predStride))));
1392 			_mm_storeu_si128((__m128i*) (residual + residualStride + 8), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + inputStride + 8)), _mm_loadu_si128((__m128i*)(pred+predStride + 8))));
1393 			_mm_storeu_si128((__m128i*) (residual + residualStride + 16), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + inputStride + 16)), _mm_loadu_si128((__m128i*)(pred + predStride+ 16))));
1394 			_mm_storeu_si128((__m128i*) (residual + residualStride + 24), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + inputStride + 24)), _mm_loadu_si128((__m128i*)(pred + predStride+ 24))));
1395 
1396 			input += inputStride << 1;
1397 			pred += predStride << 1;
1398 			residual += residualStride << 1;
1399 		}
1400 	}
1401 	else if(areaWidth == 64){ // Branch not tested because the encoder's maximum TU size was 32
1402 
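		// Width 64: two rows per iteration; each row needs eight 128-bit vectors.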
1403 		for (y = 0; y < areaHeight; y += 2){
1404 
1405 			// residual[columnIndex] = ((EB_S16)input[columnIndex]) - ((EB_S16)pred[columnIndex]); eight samples per _mm_sub_epi16
1406 			_mm_storeu_si128((__m128i*) residual,  _mm_sub_epi16(_mm_loadu_si128((__m128i*)input), _mm_loadu_si128((__m128i*)pred)));
1407 			_mm_storeu_si128((__m128i*) (residual + 8), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 8)), _mm_loadu_si128((__m128i*)(pred + 8))));
1408 			_mm_storeu_si128((__m128i*) (residual + 16), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 16)), _mm_loadu_si128((__m128i*)(pred + 16))));
1409 			_mm_storeu_si128((__m128i*) (residual + 24), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 24)), _mm_loadu_si128((__m128i*)(pred + 24))));
1410 			_mm_storeu_si128((__m128i*) (residual + 32), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 32)), _mm_loadu_si128((__m128i*)(pred + 32))));
1411 			_mm_storeu_si128((__m128i*) (residual + 40), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 40)), _mm_loadu_si128((__m128i*)(pred + 40))));
1412 			_mm_storeu_si128((__m128i*) (residual + 48), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 48)), _mm_loadu_si128((__m128i*)(pred + 48))));
1413 			_mm_storeu_si128((__m128i*) (residual + 56), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + 56)), _mm_loadu_si128((__m128i*)(pred + 56))));
1414 
1415 			_mm_storeu_si128((__m128i*) (residual + residualStride), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + inputStride)), _mm_loadu_si128((__m128i*)(pred + predStride))));
1416 			_mm_storeu_si128((__m128i*) (residual + residualStride + 8), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + inputStride + 8)), _mm_loadu_si128((__m128i*)(pred + predStride + 8))));
1417 			_mm_storeu_si128((__m128i*) (residual + residualStride + 16), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + inputStride + 16)), _mm_loadu_si128((__m128i*)(pred + predStride + 16))));
1418 			_mm_storeu_si128((__m128i*) (residual + residualStride + 24), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + inputStride + 24)), _mm_loadu_si128((__m128i*)(pred + predStride + 24))));
1419 			_mm_storeu_si128((__m128i*) (residual + residualStride + 32), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + inputStride + 32)), _mm_loadu_si128((__m128i*)(pred + predStride + 32))));
1420 			_mm_storeu_si128((__m128i*) (residual + residualStride + 40), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + inputStride + 40)), _mm_loadu_si128((__m128i*)(pred + predStride + 40))));
1421 			_mm_storeu_si128((__m128i*) (residual + residualStride + 48), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + inputStride + 48)), _mm_loadu_si128((__m128i*)(pred + predStride + 48))));
1422 			_mm_storeu_si128((__m128i*) (residual + residualStride + 56), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + inputStride + 56)), _mm_loadu_si128((__m128i*)(pred + predStride + 56))));
1423 
1424 			input += inputStride << 1;
1425 			pred += predStride << 1;
1426 			residual += residualStride << 1;
1427 		}
1428 	}
1429 	else {
1430 
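		// Generic width: process two rows at a time, 8 (or 4) columns per inner iteration.
		// The *StrideDiff values step each pointer from the end of one row pair to the start of the next.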
1431 		EB_U32 inputStrideDiff = 2 * inputStride;
1432 		EB_U32 predStrideDiff = 2 * predStride;
1433 		EB_U32 residualStrideDiff = 2 * residualStride;
1434 		inputStrideDiff -= areaWidth;
1435 		predStrideDiff -= areaWidth;
1436 		residualStrideDiff -= areaWidth;
1437 
1438 		if (!(areaWidth & 7)){
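			// Width is a multiple of 8: 8 columns and 2 rows per inner iteration.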
1439 
1440 			for (x = 0; x < areaHeight; x += 2){
1441 				for (y = 0; y < areaWidth; y += 8){
1442 
1443 					_mm_storeu_si128((__m128i*) residual, _mm_sub_epi16(_mm_loadu_si128((__m128i*)input), _mm_loadu_si128((__m128i*)pred)));
1444 					_mm_storeu_si128((__m128i*) (residual + residualStride), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + inputStride)), _mm_loadu_si128((__m128i*)(pred + predStride))));
1445 
1446 					input += 8;
1447 					pred += 8;
1448 					residual += 8;
1449 				}
1450 				input = input + inputStrideDiff;
1451 				pred = pred + predStrideDiff;
1452 				residual = residual + residualStrideDiff;
1453 			}
1454 		}
1455 		else{
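			// Width is a multiple of 4 (but not 8): the 128-bit loads fetch 8 samples,
			// but only the low 64 bits (4 residuals) are stored per row.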
1456 			for (x = 0; x < areaHeight; x += 2){
1457 				for (y = 0; y < areaWidth; y += 4){
1458 
1459 					_mm_storel_epi64((__m128i*) residual, _mm_sub_epi16(_mm_loadu_si128((__m128i*)input), _mm_loadu_si128((__m128i*)pred)));
1460 					_mm_storel_epi64((__m128i*) (residual + residualStride), _mm_sub_epi16(_mm_loadu_si128((__m128i*)(input + inputStride)), _mm_loadu_si128((__m128i*)(pred + predStride))));
1461 
1462 					input += 4;
1463 					pred += 4;
1464 					residual += 4;
1465 				}
1466 				input = input + inputStrideDiff;
1467 				pred = pred + predStrideDiff;
1468 				residual = residual + residualStrideDiff;
1469 			}
1470 		}
1471 	}
1472 	return;
1473 }
1474 
1475 /******************************************************************************************************
1476                                    PictureAdditionKernel16bit_SSE2_INTRIN
1477 ******************************************************************************************************/
1478 
1479 
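/*
 * Reconstructs 10-bit samples as recon = clip3(0, 1023, pred + residual), using a
 * saturating 16-bit add followed by min/max clamping, with width-specialized SSE2 paths.
 *
 * Illustrative scalar equivalent (a reference sketch only, not part of the build):
 *
 *   for (y = 0; y < height; ++y)
 *       for (x = 0; x < width; ++x) {
 *           EB_S32 sum = (EB_S32)predPtr[y * predStride + x] + residualPtr[y * residualStride + x];
 *           reconPtr[y * reconStride + x] = (EB_U16)(sum < 0 ? 0 : (sum > 1023 ? 1023 : sum));
 *       }
 */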
PictureAdditionKernel16bit_SSE2_INTRIN(EB_U16 * predPtr,EB_U32 predStride,EB_S16 * residualPtr,EB_U32 residualStride,EB_U16 * reconPtr,EB_U32 reconStride,EB_U32 width,EB_U32 height)1480 void PictureAdditionKernel16bit_SSE2_INTRIN(
1481 	EB_U16  *predPtr,
1482 	EB_U32  predStride,
1483 	EB_S16 *residualPtr,
1484 	EB_U32  residualStride,
1485 	EB_U16  *reconPtr,
1486 	EB_U32  reconStride,
1487 	EB_U32  width,
1488 	EB_U32  height)
1489 {
1490     __m128i xmm_0, xmm_Max10bit;
1491 
1492 	EB_U32 y, x;
1493 
1494 	xmm_0 = _mm_setzero_si128();
1495 	xmm_Max10bit = _mm_set1_epi16(1023);
1496 
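	// Every path below computes clip3(0, 1023, pred + residual): a saturating 16-bit add
	// (no wrap-around) followed by min/max against the 10-bit range.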
1497 	if (width == 4)
1498 	{
1499         __m128i xmm_sum_0_3, xmm_sum_s0_s3, xmm_clip3_0_3, xmm_clip3_s0_s3;
1500 		for (y = 0; y < height; y += 2){
1501 
1502 			xmm_sum_0_3 = _mm_adds_epi16(_mm_loadl_epi64((__m128i*)predPtr), _mm_loadl_epi64((__m128i*)residualPtr));
1503 			xmm_sum_s0_s3 = _mm_adds_epi16(_mm_loadl_epi64((__m128i*)(predPtr + predStride)), _mm_loadl_epi64((__m128i*)(residualPtr + residualStride)));
1504 
1505 			xmm_clip3_0_3 = _mm_max_epi16(_mm_min_epi16(xmm_sum_0_3, xmm_Max10bit), xmm_0);
1506 			xmm_clip3_s0_s3 = _mm_max_epi16(_mm_min_epi16(xmm_sum_s0_s3, xmm_Max10bit), xmm_0);
1507 
1508 			_mm_storel_epi64((__m128i*) reconPtr, xmm_clip3_0_3);
1509 			_mm_storel_epi64((__m128i*) (reconPtr + reconStride), xmm_clip3_s0_s3);
1510 
1511 			predPtr += predStride << 1;
1512 			residualPtr += residualStride << 1;
1513 			reconPtr += reconStride << 1;
1514 		}
1515 	}
1516 	else if (width == 8){
1517 
1518         __m128i xmm_sum_0_7, xmm_sum_s0_s7, xmm_clip3_0_7, xmm_clip3_s0_s7;
1519 
1520 		for (y = 0; y < height; y += 2){
1521 
1522 			xmm_sum_0_7 = _mm_adds_epi16( _mm_loadu_si128((__m128i*)predPtr),_mm_loadu_si128((__m128i*)residualPtr));
1523 			xmm_sum_s0_s7 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + predStride)), _mm_loadu_si128((__m128i*)(residualPtr + residualStride)));
1524 
1525 			xmm_clip3_0_7 = _mm_max_epi16(_mm_min_epi16(xmm_sum_0_7, xmm_Max10bit), xmm_0);
1526 			xmm_clip3_s0_s7 = _mm_max_epi16(_mm_min_epi16(xmm_sum_s0_s7, xmm_Max10bit), xmm_0);
1527 
1528 			_mm_storeu_si128((__m128i*) reconPtr, xmm_clip3_0_7);
1529 			_mm_storeu_si128((__m128i*) (reconPtr + reconStride), xmm_clip3_s0_s7);
1530 
1531 			predPtr += predStride << 1;
1532 			residualPtr += residualStride << 1;
1533 			reconPtr += reconStride << 1;
1534 		}
1535 	}
1536 	else if (width == 16){
1537 
1538         __m128i sum_0_7, sum_8_15, sum_s0_s7, sum_s8_s15, clip3_0_7, clip3_8_15, clip3_s0_s7, clip3_s8_s15;
1539 
1540 		for (y = 0; y < height; y += 2){
1541 
1542 			sum_0_7 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)predPtr), _mm_loadu_si128((__m128i*)residualPtr));
1543 			sum_8_15 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + 8)), _mm_loadu_si128((__m128i*)(residualPtr + 8)));
1544 			sum_s0_s7 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + predStride)), _mm_loadu_si128((__m128i*)(residualPtr + residualStride)));
1545 			sum_s8_s15 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + predStride + 8)), _mm_loadu_si128((__m128i*)(residualPtr + residualStride + 8)));
1546 
1547 			clip3_0_7 = _mm_max_epi16(_mm_min_epi16(sum_0_7, xmm_Max10bit), xmm_0);
1548 			clip3_8_15 = _mm_max_epi16(_mm_min_epi16(sum_8_15, xmm_Max10bit), xmm_0);
1549 			clip3_s0_s7 = _mm_max_epi16(_mm_min_epi16(sum_s0_s7, xmm_Max10bit), xmm_0);
1550 			clip3_s8_s15 = _mm_max_epi16(_mm_min_epi16(sum_s8_s15, xmm_Max10bit), xmm_0);
1551 
1552 			_mm_storeu_si128((__m128i*) reconPtr, clip3_0_7);
1553 			_mm_storeu_si128((__m128i*) (reconPtr + 8), clip3_8_15);
1554 			_mm_storeu_si128((__m128i*) (reconPtr + reconStride), clip3_s0_s7);
1555 			_mm_storeu_si128((__m128i*) (reconPtr + reconStride + 8), clip3_s8_s15);
1556 
1557 			predPtr += predStride << 1;
1558 			residualPtr += residualStride << 1;
1559 			reconPtr += reconStride << 1;
1560 		}
1561 	}
1562 	else if (width == 32){
1563         __m128i sum_0_7, sum_8_15, sum_16_23, sum_24_31, sum_s0_s7, sum_s8_s15, sum_s16_s23, sum_s24_s31;
1564         __m128i clip3_0_7, clip3_8_15, clip3_16_23, clip3_24_31, clip3_s0_s7, clip3_s8_s15, clip3_s16_s23, clip3_s24_s31;
1565 
1566 		for (y = 0; y < height; y += 2){
1567 
1568 			sum_0_7   = _mm_adds_epi16(_mm_loadu_si128((__m128i*)predPtr), _mm_loadu_si128((__m128i*)residualPtr));
1569 			sum_8_15  = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + 8)), _mm_loadu_si128((__m128i*)(residualPtr + 8)));
1570 			sum_16_23 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + 16)), _mm_loadu_si128((__m128i*)(residualPtr + 16)));
1571 			sum_24_31 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + 24)), _mm_loadu_si128((__m128i*)(residualPtr + 24)));
1572 
1573 			sum_s0_s7   = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + predStride)), _mm_loadu_si128((__m128i*)(residualPtr + residualStride)));
1574 			sum_s8_s15  = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + predStride + 8)), _mm_loadu_si128((__m128i*)(residualPtr + residualStride + 8)));
1575 			sum_s16_s23 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + predStride + 16)), _mm_loadu_si128((__m128i*)(residualPtr + residualStride + 16)));
1576 			sum_s24_s31 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + predStride + 24)), _mm_loadu_si128((__m128i*)(residualPtr + residualStride + 24)));
1577 
1578 			clip3_0_7   = _mm_max_epi16(_mm_min_epi16(sum_0_7, xmm_Max10bit), xmm_0);
1579 			clip3_8_15  = _mm_max_epi16(_mm_min_epi16(sum_8_15 , xmm_Max10bit), xmm_0);
1580 			clip3_16_23 = _mm_max_epi16(_mm_min_epi16(sum_16_23, xmm_Max10bit), xmm_0);
1581 			clip3_24_31 = _mm_max_epi16(_mm_min_epi16(sum_24_31, xmm_Max10bit), xmm_0);
1582 
1583 			clip3_s0_s7   = _mm_max_epi16(_mm_min_epi16(sum_s0_s7, xmm_Max10bit), xmm_0);
1584 			clip3_s8_s15  = _mm_max_epi16(_mm_min_epi16(sum_s8_s15, xmm_Max10bit), xmm_0);
1585 			clip3_s16_s23 = _mm_max_epi16(_mm_min_epi16(sum_s16_s23, xmm_Max10bit), xmm_0);
1586 			clip3_s24_s31 = _mm_max_epi16(_mm_min_epi16(sum_s24_s31, xmm_Max10bit), xmm_0);
1587 
1588 			_mm_storeu_si128((__m128i*) reconPtr,        clip3_0_7);
1589 			_mm_storeu_si128((__m128i*) (reconPtr + 8),  clip3_8_15);
1590 			_mm_storeu_si128((__m128i*) (reconPtr + 16), clip3_16_23);
1591 			_mm_storeu_si128((__m128i*) (reconPtr + 24), clip3_24_31);
1592 
1593 			_mm_storeu_si128((__m128i*) (reconPtr + reconStride),      clip3_s0_s7);
1594 			_mm_storeu_si128((__m128i*) (reconPtr + reconStride + 8),  clip3_s8_s15);
1595 			_mm_storeu_si128((__m128i*) (reconPtr + reconStride + 16), clip3_s16_s23);
1596 			_mm_storeu_si128((__m128i*) (reconPtr + reconStride + 24), clip3_s24_s31);
1597 
1598 			predPtr += predStride << 1;
1599 			residualPtr += residualStride << 1;
1600 			reconPtr += reconStride << 1;
1601 		}
1602 	}
1603 	else if (width == 64){ // Branch not tested because the maximum TU size was 32 at the time of development
1604 
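		// Width 64: unlike the narrower branches, this path processes one row per iteration,
		// since a full row already needs eight 128-bit vectors.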
1605         __m128i sum_0_7, sum_8_15, sum_16_23, sum_24_31, sum_32_39, sum_40_47, sum_48_55, sum_56_63;
1606         __m128i clip3_0_7, clip3_8_15, clip3_16_23, clip3_24_31, clip3_32_39, clip3_40_47, clip3_48_55, clip3_56_63;
1607 
1608 		for (y = 0; y < height; ++y){
1609 
1610 			sum_0_7   = _mm_adds_epi16(_mm_loadu_si128((__m128i*)predPtr), _mm_loadu_si128((__m128i*)residualPtr));
1611 			sum_8_15  = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + 8)), _mm_loadu_si128((__m128i*)(residualPtr + 8)));
1612 			sum_16_23 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + 16)), _mm_loadu_si128((__m128i*)(residualPtr + 16)));
1613 			sum_24_31 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + 24)), _mm_loadu_si128((__m128i*)(residualPtr + 24)));
1614 			sum_32_39 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + 32)), _mm_loadu_si128((__m128i*)(residualPtr + 32)));
1615 			sum_40_47 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + 40)), _mm_loadu_si128((__m128i*)(residualPtr + 40)));
1616 			sum_48_55 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + 48)), _mm_loadu_si128((__m128i*)(residualPtr + 48)));
1617 			sum_56_63 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + 56)), _mm_loadu_si128((__m128i*)(residualPtr + 56)));
1618 
1619 			clip3_0_7   = _mm_max_epi16(_mm_min_epi16(sum_0_7  , xmm_Max10bit), xmm_0);
1620 			clip3_8_15  = _mm_max_epi16(_mm_min_epi16(sum_8_15 , xmm_Max10bit), xmm_0);
1621 			clip3_16_23 = _mm_max_epi16(_mm_min_epi16(sum_16_23, xmm_Max10bit), xmm_0);
1622 			clip3_24_31 = _mm_max_epi16(_mm_min_epi16(sum_24_31, xmm_Max10bit), xmm_0);
1623 			clip3_32_39 = _mm_max_epi16(_mm_min_epi16(sum_32_39, xmm_Max10bit), xmm_0);
1624 			clip3_40_47 = _mm_max_epi16(_mm_min_epi16(sum_40_47, xmm_Max10bit), xmm_0);
1625 			clip3_48_55 = _mm_max_epi16(_mm_min_epi16(sum_48_55, xmm_Max10bit), xmm_0);
1626 			clip3_56_63 = _mm_max_epi16(_mm_min_epi16(sum_56_63, xmm_Max10bit), xmm_0);
1627 
1628 			_mm_storeu_si128((__m128i*) reconPtr,        clip3_0_7  );
1629 			_mm_storeu_si128((__m128i*) (reconPtr + 8),  clip3_8_15 );
1630 			_mm_storeu_si128((__m128i*) (reconPtr + 16), clip3_16_23);
1631 			_mm_storeu_si128((__m128i*) (reconPtr + 24), clip3_24_31);
1632 			_mm_storeu_si128((__m128i*) (reconPtr + 32), clip3_32_39);
1633 			_mm_storeu_si128((__m128i*) (reconPtr + 40), clip3_40_47);
1634 			_mm_storeu_si128((__m128i*) (reconPtr + 48), clip3_48_55);
1635 			_mm_storeu_si128((__m128i*) (reconPtr + 56), clip3_56_63);
1636 
1637 			predPtr += predStride;
1638 			residualPtr += residualStride;
1639 			reconPtr += reconStride;
1640 		}
1641 	}
1642 	else
1643 	{
1644 		EB_U32 predStrideDiff = 2 * predStride;
1645 		EB_U32 residualStrideDiff = 2 * residualStride;
1646 		EB_U32 reconStrideDiff = 2 * reconStride;
1647 		predStrideDiff -= width;
1648 		residualStrideDiff -= width;
1649 		reconStrideDiff -= width;
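		// The *StrideDiff values step each pointer from the end of one row pair to the start of the next.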
1650 
1651 		if (!(width & 7)){
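			// Width is a multiple of 8: 8 columns and 2 rows per inner iteration.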
1652 
1653             __m128i xmm_sum_0_7, xmm_sum_s0_s7, xmm_clip3_0_7, xmm_clip3_s0_s7;
1654 
1655 			for (x = 0; x < height; x += 2){
1656 				for (y = 0; y < width; y += 8){
1657 
1658 					xmm_sum_0_7 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)predPtr), _mm_loadu_si128((__m128i*)residualPtr));
1659 					xmm_sum_s0_s7 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + predStride)), _mm_loadu_si128((__m128i*)(residualPtr + residualStride )));
1660 
1661 					xmm_clip3_0_7 = _mm_max_epi16(_mm_min_epi16(xmm_sum_0_7, xmm_Max10bit), xmm_0);
1662 					xmm_clip3_s0_s7 = _mm_max_epi16(_mm_min_epi16(xmm_sum_s0_s7, xmm_Max10bit), xmm_0);
1663 
1664 					_mm_storeu_si128((__m128i*) reconPtr, xmm_clip3_0_7);
1665 					_mm_storeu_si128((__m128i*) (reconPtr + reconStride), xmm_clip3_s0_s7);
1666 
1667 					predPtr += 8;
1668 					residualPtr += 8;
1669 					reconPtr += 8;
1670 				}
1671 				predPtr += predStrideDiff;
1672 				residualPtr +=  residualStrideDiff;
1673 				reconPtr +=  reconStrideDiff;
1674 			}
1675 		}
1676 		else{
1677             __m128i xmm_sum_0_7, xmm_sum_s0_s7,  xmm_clip3_0_3, xmm_clip3_s0_s3;
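			// Width is a multiple of 4 (but not 8): the 128-bit loads fetch 8 samples,
			// but only the low 4 clipped results are stored per row via _mm_storel_epi64.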
1678 			for (x = 0; x < height; x += 2){
1679 				for (y = 0; y < width; y += 4){
1680 
1681 					xmm_sum_0_7 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)predPtr), _mm_loadu_si128((__m128i*)residualPtr));
1682 					xmm_sum_s0_s7 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(predPtr + predStride)), _mm_loadu_si128((__m128i*)(residualPtr + residualStride)));
1683 
1684 					xmm_clip3_0_3 = _mm_max_epi16(_mm_min_epi16(xmm_sum_0_7, xmm_Max10bit), xmm_0);
1685 					xmm_clip3_s0_s3 = _mm_max_epi16(_mm_min_epi16(xmm_sum_s0_s7, xmm_Max10bit), xmm_0);
1686 
1687 					_mm_storel_epi64((__m128i*) reconPtr, xmm_clip3_0_3);
1688 					_mm_storel_epi64((__m128i*) (reconPtr + reconStride), xmm_clip3_s0_s3);
1689 
1690 					predPtr += 4;
1691 					residualPtr += 4;
1692 					reconPtr += 4;
1693 				}
1694 				predPtr +=  predStrideDiff;
1695 				residualPtr +=  residualStrideDiff;
1696 				reconPtr += reconStrideDiff;
1697 			}
1698 		}
1699 	}
1700 	return;
1701 }
1702 
1703 
1704