1 /*
2 * Copyright(c) 2018 Intel Corporation
3 * SPDX-License-Identifier: BSD-2-Clause-Patent
4 */
5 
6 #include "EbComputeSAD_AVX2.h"
7 #include "EbDefinitions.h"
8 #include "immintrin.h"
9 
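/* Fallback for toolchains that do not provide _mm256_setr_m128i: build a
 * 256-bit value with lo in the low 128-bit lane and hi in the high lane. */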
10 #ifndef _mm256_setr_m128i
11 #define _mm256_setr_m128i(/* __m128i */ hi, /* __m128i */ lo) \
12     _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
13 #endif
14 
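/* Used by the 32-bit overflow paths below: extract the k-th 32-bit SAD from
 * vector s and, if it beats the running minimum, record it together with its
 * horizontal (j + offset + k) and vertical (i) position in the search area. */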
15 #define UPDATE_BEST(s, k, offset) \
16   temSum1 = _mm_extract_epi32(s, k); \
17   if (temSum1 < lowSum) { \
18     lowSum = temSum1; \
19     xBest = j + offset + k; \
20     yBest = i; \
21   }
22 
23 /*******************************************************************************
24  * Requirement: width   = 4, 8, 16, 24, 32, 48 or 64
25  * Requirement: height <= 64
26  * Requirement: height % 2 = 0 when width = 4 or 8
27 *******************************************************************************/
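/* Exhaustive search over the searchAreaWidth x searchAreaHeight window:
 * _mm256_mpsadbw_epu8 evaluates eight horizontal candidates per call, the
 * 16-bit partial SADs are accumulated with saturation, and the smallest SAD
 * together with its (x, y) offset is returned through the output pointers. */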
28 void SadLoopKernel_AVX2_INTRIN(
29     EB_U8  *src,                            // input parameter, source samples Ptr
30     EB_U32  srcStride,                      // input parameter, source stride
31     EB_U8  *ref,                            // input parameter, reference samples Ptr
32     EB_U32  refStride,                      // input parameter, reference stride
33     EB_U32  height,                         // input parameter, block height (M)
34     EB_U32  width,                          // input parameter, block width (N)
35     EB_U64 *bestSad,                        // output parameter, SAD of the best matching position
36     EB_S16 *xSearchCenter,                  // input/output parameter, x search center, updated to the best match
37     EB_S16 *ySearchCenter,                  // input/output parameter, y search center, updated to the best match
38     EB_U32  srcStrideRaw,                   // input parameter, source stride (no line skipping)
39     EB_S16  searchAreaWidth,                // input parameter, search area width
40     EB_S16  searchAreaHeight)               // input parameter, search area height
41 {
42   EB_S16 xBest = *xSearchCenter, yBest = *ySearchCenter;
43   EB_U32 lowSum = 0xffffff;
44   EB_U32 temSum1 = 0;
45   EB_S16 i, j;
46   EB_U32 k, l;
47   EB_U32 leftover = searchAreaWidth & 7;
48   const EB_U8 *pRef, *pSrc;
49   __m128i s0, s1, s2, s3, s4, s5, s6, s8 = _mm_set1_epi32(-1);
50   __m256i ss0, ss1, ss2, ss3, ss4, ss5, ss6, ss7, ss8;
51 
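  /* Mask for the last, partial group of candidates when searchAreaWidth is
   * not a multiple of 8: each 2-byte shift clears one low 16-bit lane, so the
   * first 'leftover' lanes end up 0 and the remaining lanes stay 0xFFFF.
   * ORing the mask into a SAD vector makes _mm_minpos_epu16 skip the
   * candidates that fall outside the search area. */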
52   if (leftover) {
53     for (k=0; k<leftover; k++) {
54       s8 = _mm_slli_si128(s8, 2);
55     }
56   }
57 
58   switch (width) {
59   case 4:
60 
61     if (!(height % 4)) {
62       EB_U32 srcStrideT = 3 * srcStride;
63       EB_U32 refStrideT = 3 * refStride;
64       for (i=0; i<searchAreaHeight; i++) {
65         for (j=0; j<=searchAreaWidth-8; j+=8) {
66           pSrc = src;
67           pRef = ref + j;
68           ss3 = ss5 = _mm256_setzero_si256();
69           for (k=0; k<height; k+=4) {
70             ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(pRef))), _mm_loadu_si128((__m128i*)(pRef + 2 * refStride)), 0x1);
71             ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(pRef + refStride))), _mm_loadu_si128((__m128i*)(pRef + refStrideT)), 0x1);
72             ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi64(_mm_cvtsi32_si128(*(EB_U32 *)pSrc), _mm_cvtsi32_si128(*(EB_U32 *)(pSrc + srcStride)))), _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(EB_U32 *)(pSrc + 2 * srcStride)), _mm_cvtsi32_si128(*(EB_U32 *)(pSrc + srcStrideT))), 0x1);
73             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
74             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
75             pSrc += srcStride << 2;
76             pRef += refStride << 2;
77           }
78           ss3 = _mm256_adds_epu16(ss3, ss5);
79           s3 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
80           s3 = _mm_minpos_epu16(s3);
81           temSum1 = _mm_extract_epi16(s3, 0);
82           if (temSum1 < lowSum) {
83             lowSum = temSum1;
84             xBest = (EB_S16)(j + _mm_extract_epi16(s3, 1));
85             yBest = i;
86           }
87         }
88 
89         if (leftover) {
90           pSrc = src;
91           pRef = ref + j;
92           ss3 = ss5 = _mm256_setzero_si256();
93           for (k=0; k<height; k+=4) {
94             ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)pRef)), _mm_loadu_si128((__m128i*)(pRef + 2 * refStride)), 0x1);
95             ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(pRef + refStride))), _mm_loadu_si128((__m128i*)(pRef + refStrideT)), 0x1);
96             ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi64(_mm_cvtsi32_si128(*(EB_U32 *)pSrc), _mm_cvtsi32_si128(*(EB_U32 *)(pSrc + srcStride)))), _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(EB_U32 *)(pSrc + 2 * srcStride)), _mm_cvtsi32_si128(*(EB_U32 *)(pSrc + srcStrideT))), 0x1);
97             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
98             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
99             pSrc += srcStride << 2;
100             pRef += refStride << 2;
101           }
102           ss3 = _mm256_adds_epu16(ss3, ss5);
103           s3 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
104           s3 = _mm_or_si128(s3, s8);
105           s3 = _mm_minpos_epu16(s3);
106           temSum1 = _mm_extract_epi16(s3, 0);
107           if (temSum1 < lowSum) {
108             lowSum = temSum1;
109             xBest = (EB_S16)(j + _mm_extract_epi16(s3, 1));
110             yBest = i;
111           }
112         }
113         ref += srcStrideRaw;
114       }
115     }
116     else {
117       for (i=0; i<searchAreaHeight; i++) {
118         for (j=0; j<=searchAreaWidth-8; j+=8) {
119           pSrc = src;
120           pRef = ref + j;
121           s3 = _mm_setzero_si128();
122           for (k=0; k<height; k+=2) {
123             s0 = _mm_loadu_si128((__m128i*)pRef);
124             s1 = _mm_loadu_si128((__m128i*)(pRef+refStride));
125             s2 = _mm_cvtsi32_si128(*(EB_U32 *)pSrc);
126             s5 = _mm_cvtsi32_si128(*(EB_U32 *)(pSrc+srcStride));
127             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
128             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
129             pSrc += srcStride << 1;
130             pRef += refStride << 1;
131           }
132           s3 = _mm_minpos_epu16(s3);
133           temSum1 = _mm_extract_epi16(s3, 0);
134           if (temSum1 < lowSum) {
135             lowSum = temSum1;
136             xBest = (EB_S16)(j + _mm_extract_epi16(s3, 1));
137             yBest = i;
138           }
139         }
140 
141         if (leftover) {
142           pSrc = src;
143           pRef = ref + j;
144           s3 = _mm_setzero_si128();
145           for (k=0; k<height; k+=2) {
146             s0 = _mm_loadu_si128((__m128i*)pRef);
147             s1 = _mm_loadu_si128((__m128i*)(pRef+refStride));
148             s2 = _mm_cvtsi32_si128(*(EB_U32 *)pSrc);
149             s5 = _mm_cvtsi32_si128(*(EB_U32 *)(pSrc+srcStride));
150             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
151             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
152             pSrc += srcStride << 1;
153             pRef += refStride << 1;
154           }
155           s3 = _mm_or_si128(s3, s8);
156           s3 = _mm_minpos_epu16(s3);
157           temSum1 = _mm_extract_epi16(s3, 0);
158           if (temSum1 < lowSum) {
159             lowSum = temSum1;
160             xBest = (EB_S16)(j + _mm_extract_epi16(s3, 1));
161             yBest = i;
162           }
163         }
164         ref += srcStrideRaw;
165       }
166     }
167 
168     break;
169 
170   case 8:
171     if (!(height % 4)) {
172       EB_U32 srcStrideT = 3 * srcStride;
173       EB_U32 refStrideT = 3 * refStride;
174       for (i=0; i<searchAreaHeight; i++) {
175         for (j=0; j<=searchAreaWidth-8; j+=8) {
176           pSrc = src;
177           pRef = ref + j;
178           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
179           for (k=0; k<height; k+=4) {
180             ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)pRef)), _mm_loadu_si128((__m128i*)(pRef + 2 * refStride)), 0x1);
181             ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(pRef + refStride))), _mm_loadu_si128((__m128i*)(pRef + refStrideT)), 0x1);
182             ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)pSrc), _mm_loadl_epi64((__m128i*)(pSrc + srcStride)))), _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)(pSrc + 2 * srcStride)), _mm_loadl_epi64((__m128i*)(pSrc + srcStrideT))), 0x1);
183             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
184             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
185             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
186             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
187             pSrc += srcStride << 2;
188             pRef += refStride << 2;
189           }
190           ss3 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
191           s3 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
192           s3 = _mm_minpos_epu16(s3);
193           temSum1 = _mm_extract_epi16(s3, 0);
194           if (temSum1 < lowSum) {
195             lowSum = temSum1;
196             xBest = (EB_S16)(j + _mm_extract_epi16(s3, 1));
197             yBest = i;
198           }
199         }
200 
201         if (leftover) {
202           pSrc = src;
203           pRef = ref + j;
204           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
205           for (k=0; k<height; k+=4) {
206             ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)pRef)),             _mm_loadu_si128((__m128i*)(pRef+2*refStride)), 0x1);
207             ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(pRef + refStride))), _mm_loadu_si128((__m128i*)(pRef + refStrideT)), 0x1);
208             ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)pSrc), _mm_loadl_epi64((__m128i*)(pSrc + srcStride)))), _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)(pSrc + 2 * srcStride)), _mm_loadl_epi64((__m128i*)(pSrc + srcStrideT))), 0x1);
209             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
210             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
211             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
212             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
213             pSrc += srcStride << 2;
214             pRef += refStride << 2;
215           }
216           ss3 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
217           s3 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
218           s3 = _mm_or_si128(s3, s8);
219           s3 = _mm_minpos_epu16(s3);
220           temSum1 = _mm_extract_epi16(s3, 0);
221           if (temSum1 < lowSum) {
222             lowSum = temSum1;
223             xBest = (EB_S16)(j + _mm_extract_epi16(s3, 1));
224             yBest = i;
225           }
226         }
227         ref += srcStrideRaw;
228       }
229     }
230     else {
231       for (i=0; i<searchAreaHeight; i++) {
232         for (j=0; j<=searchAreaWidth-8; j+=8) {
233           pSrc = src;
234           pRef = ref + j;
235           s3 = s4 = _mm_setzero_si128();
236           for (k=0; k<height; k+=2) {
237             s0 = _mm_loadu_si128((__m128i*)pRef);
238             s1 = _mm_loadu_si128((__m128i*)(pRef+refStride));
239             s2 = _mm_loadl_epi64((__m128i*)pSrc);
240             s5 = _mm_loadl_epi64((__m128i*)(pSrc+srcStride));
241             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
242             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
243             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
244             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));
245             pSrc += srcStride << 1;
246             pRef += refStride << 1;
247           }
248           s3 = _mm_adds_epu16(s3, s4);
249           s3 = _mm_minpos_epu16(s3);
250           temSum1 = _mm_extract_epi16(s3, 0);
251           if (temSum1 < lowSum) {
252             lowSum = temSum1;
253             xBest = (EB_S16)(j + _mm_extract_epi16(s3, 1));
254             yBest = i;
255           }
256         }
257 
258         if (leftover) {
259           pSrc = src;
260           pRef = ref + j;
261           s3 = s4 = _mm_setzero_si128();
262           for (k=0; k<height; k+=2) {
263             s0 = _mm_loadu_si128((__m128i*)pRef);
264             s1 = _mm_loadu_si128((__m128i*)(pRef+refStride));
265             s2 = _mm_loadl_epi64((__m128i*)pSrc);
266             s5 = _mm_loadl_epi64((__m128i*)(pSrc+srcStride));
267             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
268             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
269             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
270             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));
271             pSrc += srcStride << 1;
272             pRef += refStride << 1;
273           }
274           s3 = _mm_adds_epu16(s3, s4);
275           s3 = _mm_or_si128(s3, s8);
276           s3 = _mm_minpos_epu16(s3);
277           temSum1 = _mm_extract_epi16(s3, 0);
278           if (temSum1 < lowSum) {
279             lowSum = temSum1;
280             xBest = (EB_S16)(j + _mm_extract_epi16(s3, 1));
281             yBest = i;
282           }
283         }
284         ref += srcStrideRaw;
285       }
286     }
287     break;
288 
289   case 16:
290     if (height <= 16) {
291       for (i=0; i<searchAreaHeight; i++) {
292         for (j=0; j<=searchAreaWidth-8; j+=8) {
293           pSrc = src;
294           pRef = ref + j;
295           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
296           for (k=0; k<height; k+=2) {
297             ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)pRef)), _mm_loadu_si128((__m128i*)(pRef + refStride)), 0x1);
298             ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(pRef + 8))), _mm_loadu_si128((__m128i*)(pRef + refStride + 8)), 0x1);
299             ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)pSrc)), _mm_loadu_si128((__m128i*)(pSrc + srcStride)), 0x1);
300             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
301             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
302             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
303             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
304             pSrc += 2 * srcStride;
305             pRef += 2 * refStride;
306           }
307           ss3 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
308           s3 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
309           s3 = _mm_minpos_epu16(s3);
310           temSum1 = _mm_extract_epi16(s3, 0);
311           if (temSum1 < lowSum) {
312             lowSum = temSum1;
313             xBest = (EB_S16)(j + _mm_extract_epi16(s3, 1));
314             yBest = i;
315           }
316         }
317 
318         if (leftover) {
319           pSrc = src;
320           pRef = ref + j;
321           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
322           for (k=0; k<height; k+=2) {
323             ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)pRef)), _mm_loadu_si128((__m128i*)(pRef + refStride)), 0x1);
324             ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(pRef + 8))), _mm_loadu_si128((__m128i*)(pRef + refStride + 8)), 0x1);
325             ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)pSrc)), _mm_loadu_si128((__m128i*)(pSrc + srcStride)), 0x1);
326             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
327             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
328             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
329             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
330             pSrc += 2 * srcStride;
331             pRef += 2 * refStride;
332           }
333           ss3 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
334           s3 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
335           s3 = _mm_or_si128(s3, s8);
336           s3 = _mm_minpos_epu16(s3);
337           temSum1 = _mm_extract_epi16(s3, 0);
338           if (temSum1 < lowSum) {
339             lowSum = temSum1;
340             xBest = (EB_S16)(j + _mm_extract_epi16(s3, 1));
341             yBest = i;
342           }
343         }
344         ref += srcStrideRaw;
345       }
346     }
347     else if (height <= 32) {
348       for (i=0; i<searchAreaHeight; i++) {
349         for (j=0; j<=searchAreaWidth-8; j+=8) {
350           pSrc = src;
351           pRef = ref + j;
352           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
353           for (k=0; k<height; k+=2) {
354             ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)pRef)), _mm_loadu_si128((__m128i*)(pRef + refStride)), 0x1);
355             ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(pRef + 8))), _mm_loadu_si128((__m128i*)(pRef + refStride + 8)), 0x1);
356             ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)pSrc)), _mm_loadu_si128((__m128i*)(pSrc + srcStride)), 0x1);
357             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
358             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
359             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
360             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
361             pSrc += 2 * srcStride;
362             pRef += 2 * refStride;
363           }
364           ss3 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
365           s3 = _mm256_extracti128_si256(ss3, 0);
366           s5 = _mm256_extracti128_si256(ss3, 1);
367           s4 = _mm_minpos_epu16(s3);
368           s6 = _mm_minpos_epu16(s5);
369           s4 = _mm_unpacklo_epi16(s4, s4);
370           s4 = _mm_unpacklo_epi32(s4, s4);
371           s4 = _mm_unpacklo_epi64(s4, s4);
372           s6 = _mm_unpacklo_epi16(s6, s6);
373           s6 = _mm_unpacklo_epi32(s6, s6);
374           s6 = _mm_unpacklo_epi64(s6, s6);
375           s3 = _mm_sub_epi16(s3, s4);
376           s5 = _mm_adds_epu16(s5, s3);
377           s5 = _mm_sub_epi16(s5, s6);
378           s5 = _mm_minpos_epu16(s5);
379           temSum1  = _mm_extract_epi16(s5, 0);
380           temSum1 += _mm_extract_epi16(s4, 0);
381           temSum1 += _mm_extract_epi16(s6, 0);
382           if (temSum1 < lowSum) {
383             lowSum = temSum1;
384             xBest = (EB_S16)(j + _mm_extract_epi16(s5, 1));
385             yBest = i;
386           }
387         }
388 
389         if (leftover) {
390           pSrc = src;
391           pRef = ref + j;
392           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
393           for (k=0; k<height; k+=2) {
394             ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)pRef)), _mm_loadu_si128((__m128i*)(pRef + refStride)), 0x1);
395             ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(pRef + 8))), _mm_loadu_si128((__m128i*)(pRef + refStride + 8)), 0x1);
396             ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)pSrc)), _mm_loadu_si128((__m128i*)(pSrc + srcStride)), 0x1);
397             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
398             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
399             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
400             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
401             pSrc += 2 * srcStride;
402             pRef += 2 * refStride;
403           }
404           ss3 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
405           s3 = _mm256_extracti128_si256(ss3, 0);
406           s5 = _mm256_extracti128_si256(ss3, 1);
407           s3 = _mm_or_si128(s3, s8);
408           s5 = _mm_or_si128(s5, s8);
409           s4 = _mm_minpos_epu16(s3);
410           s6 = _mm_minpos_epu16(s5);
411           s4 = _mm_unpacklo_epi16(s4, s4);
412           s4 = _mm_unpacklo_epi32(s4, s4);
413           s4 = _mm_unpacklo_epi64(s4, s4);
414           s6 = _mm_unpacklo_epi16(s6, s6);
415           s6 = _mm_unpacklo_epi32(s6, s6);
416           s6 = _mm_unpacklo_epi64(s6, s6);
417           s3 = _mm_sub_epi16(s3, s4);
418           s5 = _mm_adds_epu16(s5, s3);
419           s5 = _mm_sub_epi16(s5, s6);
420           s5 = _mm_minpos_epu16(s5);
421           temSum1  = _mm_extract_epi16(s5, 0);
422           temSum1 += _mm_extract_epi16(s4, 0);
423           temSum1 += _mm_extract_epi16(s6, 0);
424           if (temSum1 < lowSum) {
425             lowSum = temSum1;
426             xBest = (EB_S16)(j + _mm_extract_epi16(s5, 1));
427             yBest = i;
428           }
429         }
430         ref += srcStrideRaw;
431       }
432     }
433     else {
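      /* Taller blocks can saturate the 16-bit accumulators at 0xFFFF; when
       * _mm_minpos_epu16 reports 0xFFFF the sums are widened to 32 bits and
       * every candidate is re-checked individually. */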
434       for (i=0; i<searchAreaHeight; i++) {
435         for (j=0; j<=searchAreaWidth-8; j+=8) {
436           pSrc = src;
437           pRef = ref + j;
438           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
439           for (k=0; k<height; k+=2) {
440             ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)pRef)), _mm_loadu_si128((__m128i*)(pRef + refStride)), 0x1);
441             ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(pRef + 8))), _mm_loadu_si128((__m128i*)(pRef + refStride + 8)), 0x1);
442             ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)pSrc)), _mm_loadu_si128((__m128i*)(pSrc + srcStride)), 0x1);
443             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
444             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
445             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
446             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
447             pSrc += 2 * srcStride;
448             pRef += 2 * refStride;
449           }
450           ss3 = _mm256_adds_epu16(ss3, ss4);
451           ss5 = _mm256_adds_epu16(ss5, ss6);
452           ss0 = _mm256_adds_epu16(ss3, ss5);
453           s0 = _mm_adds_epu16(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
454           s0 = _mm_minpos_epu16(s0);
455           temSum1 = _mm_extract_epi16(s0, 0);
456           if (temSum1 < lowSum) {
457             if (temSum1 != 0xFFFF) { // no overflow
458               lowSum = temSum1;
459               xBest = (EB_S16)(j + _mm_extract_epi16(s0, 1));
460               yBest = i;
461             }
462             else {
463               ss4 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
464               ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
465               ss6 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
466               ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
467               ss4 = _mm256_add_epi32(ss4, ss6);
468               ss3 = _mm256_add_epi32(ss3, ss5);
469               s0 = _mm_add_epi32(_mm256_extracti128_si256(ss4, 0), _mm256_extracti128_si256(ss4, 1));
470               s3 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
471               UPDATE_BEST(s0, 0, 0);
472               UPDATE_BEST(s0, 1, 0);
473               UPDATE_BEST(s0, 2, 0);
474               UPDATE_BEST(s0, 3, 0);
475               UPDATE_BEST(s3, 0, 4);
476               UPDATE_BEST(s3, 1, 4);
477               UPDATE_BEST(s3, 2, 4);
478               UPDATE_BEST(s3, 3, 4);
479             }
480           }
481         }
482 
483         if (leftover) {
484           pSrc = src;
485           pRef = ref + j;
486           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
487           for (k=0; k<height; k+=2) {
488             ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)pRef)), _mm_loadu_si128((__m128i*)(pRef + refStride)), 0x1);
489             ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(pRef + 8))), _mm_loadu_si128((__m128i*)(pRef + refStride + 8)), 0x1);
490             ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)pSrc)), _mm_loadu_si128((__m128i*)(pSrc + srcStride)), 0x1);
491             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
492             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
493             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
494             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
495             pSrc += 2 * srcStride;
496             pRef += 2 * refStride;
497           }
498           ss3 = _mm256_adds_epu16(ss3, ss4);
499           ss5 = _mm256_adds_epu16(ss5, ss6);
500           ss0 = _mm256_adds_epu16(ss3, ss5);
501           s0 = _mm_adds_epu16(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
502           s0 = _mm_or_si128(s0, s8);
503           s0 = _mm_minpos_epu16(s0);
504           temSum1 = _mm_extract_epi16(s0, 0);
505           if (temSum1 < lowSum) {
506             if (temSum1 != 0xFFFF) { // no overflow
507               lowSum = temSum1;
508               xBest = (EB_S16)(j + _mm_extract_epi16(s0, 1));
509               yBest = i;
510             }
511             else {
512               ss4 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
513               ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
514               ss6 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
515               ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
516               ss4 = _mm256_add_epi32(ss4, ss6);
517               ss3 = _mm256_add_epi32(ss3, ss5);
518               s0 = _mm_add_epi32(_mm256_extracti128_si256(ss4, 0), _mm256_extracti128_si256(ss4, 1));
519               s3 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
520               k = leftover;
521               while (k > 0) {
522                 for (l=0; l < 4 && k; l++, k--) {
523                   temSum1 = _mm_extract_epi32(s0, 0);
524                   s0 = _mm_srli_si128(s0, 4);
525                   if (temSum1 < lowSum) {
526                     lowSum = temSum1;
527                     xBest = (EB_S16)(j + leftover - k);
528                     yBest = i;
529                   }
530                 }
531                 s0 = s3;
532               }
533             }
534           }
535         }
536         ref += srcStrideRaw;
537       }
538     }
539     break;
540 
541   case 24:
542     if (height <= 16) {
543       for (i=0; i<searchAreaHeight; i++) {
544         for (j=0; j<=searchAreaWidth-8; j+=8) {
545           pSrc = src;
546           pRef = ref + j;
547           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
548           for (k=0; k<height; k++) {
549             ss0 = _mm256_loadu_si256((__m256i*)pRef);
550             ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
551             ss2 = _mm256_loadu_si256((__m256i *)pSrc);
552             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
553             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
554             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
555             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
556             pSrc += srcStride;
557             pRef += refStride;
558           }
559           ss3 = _mm256_adds_epu16(ss3, ss4);
560           ss5 = _mm256_adds_epu16(ss5, ss6);
561           s3 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
562           s5 = _mm256_extracti128_si256(ss5, 0);
563           s4 = _mm_minpos_epu16(s3);
564           s6 = _mm_minpos_epu16(s5);
565           s4 = _mm_unpacklo_epi16(s4, s4);
566           s4 = _mm_unpacklo_epi32(s4, s4);
567           s4 = _mm_unpacklo_epi64(s4, s4);
568           s6 = _mm_unpacklo_epi16(s6, s6);
569           s6 = _mm_unpacklo_epi32(s6, s6);
570           s6 = _mm_unpacklo_epi64(s6, s6);
571           s3 = _mm_sub_epi16(s3, s4);
572           s5 = _mm_adds_epu16(s5, s3);
573           s5 = _mm_sub_epi16(s5, s6);
574           s5 = _mm_minpos_epu16(s5);
575           temSum1  = _mm_extract_epi16(s5, 0);
576           temSum1 += _mm_extract_epi16(s4, 0);
577           temSum1 += _mm_extract_epi16(s6, 0);
578           if (temSum1 < lowSum) {
579             lowSum = temSum1;
580             xBest = (EB_S16)(j + _mm_extract_epi16(s5, 1));
581             yBest = i;
582           }
583         }
584 
585         if (leftover) {
586           pSrc = src;
587           pRef = ref + j;
588           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
589           for (k=0; k<height; k++) {
590             ss0 = _mm256_loadu_si256((__m256i*)pRef);
591             ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
592             ss2 = _mm256_loadu_si256((__m256i *)pSrc);
593             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
594             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
595             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
596             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
597             pSrc += srcStride;
598             pRef += refStride;
599           }
600           ss3 = _mm256_adds_epu16(ss3, ss4);
601           ss5 = _mm256_adds_epu16(ss5, ss6);
602           s3 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
603           s5 = _mm256_extracti128_si256(ss5, 0);
604           s3 = _mm_or_si128(s3, s8);
605           s5 = _mm_or_si128(s5, s8);
606           s4 = _mm_minpos_epu16(s3);
607           s6 = _mm_minpos_epu16(s5);
608           s4 = _mm_unpacklo_epi16(s4, s4);
609           s4 = _mm_unpacklo_epi32(s4, s4);
610           s4 = _mm_unpacklo_epi64(s4, s4);
611           s6 = _mm_unpacklo_epi16(s6, s6);
612           s6 = _mm_unpacklo_epi32(s6, s6);
613           s6 = _mm_unpacklo_epi64(s6, s6);
614           s3 = _mm_sub_epi16(s3, s4);
615           s5 = _mm_adds_epu16(s5, s3);
616           s5 = _mm_sub_epi16(s5, s6);
617           s5 = _mm_minpos_epu16(s5);
618           temSum1  = _mm_extract_epi16(s5, 0);
619           temSum1 += _mm_extract_epi16(s4, 0);
620           temSum1 += _mm_extract_epi16(s6, 0);
621           if (temSum1 < lowSum) {
622             lowSum = temSum1;
623             xBest = (EB_S16)(j + _mm_extract_epi16(s5, 1));
624             yBest = i;
625           }
626         }
627         ref += srcStrideRaw;
628       }
629     }
630     else {
631       for (i=0; i<searchAreaHeight; i++) {
632         for (j=0; j<=searchAreaWidth-8; j+=8) {
633           pSrc = src;
634           pRef = ref + j;
635           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
636           for (k=0; k<height; k++) {
637             ss0 = _mm256_loadu_si256((__m256i*)pRef);
638             ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
639             ss2 = _mm256_loadu_si256((__m256i *)pSrc);
640             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
641             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
642             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
643             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
644             pSrc += srcStride;
645             pRef += refStride;
646           }
647           ss3 = _mm256_adds_epu16(ss3, ss4);
648           ss5 = _mm256_adds_epu16(ss5, ss6);
649           s3 = _mm256_extracti128_si256(ss3, 0);
650           s4 = _mm256_extracti128_si256(ss3, 1);
651           s5 = _mm256_extracti128_si256(ss5, 0);
652           s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), s5);
653           s0 = _mm_minpos_epu16(s0);
654           temSum1 = _mm_extract_epi16(s0, 0);
655           if (temSum1 < lowSum) {
656             if (temSum1 != 0xFFFF) { // no overflow
657               lowSum = temSum1;
658               xBest = (EB_S16)(j + _mm_extract_epi16(s0, 1));
659               yBest = i;
660             }
661             else {
662               s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
663               s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
664               s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
665               s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
666               s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
667               s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
668               s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), s2);
669               s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), s5);
670               UPDATE_BEST(s0, 0, 0);
671               UPDATE_BEST(s0, 1, 0);
672               UPDATE_BEST(s0, 2, 0);
673               UPDATE_BEST(s0, 3, 0);
674               UPDATE_BEST(s3, 0, 4);
675               UPDATE_BEST(s3, 1, 4);
676               UPDATE_BEST(s3, 2, 4);
677               UPDATE_BEST(s3, 3, 4);
678             }
679           }
680         }
681 
682         if (leftover) {
683           pSrc = src;
684           pRef = ref + j;
685           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
686           for (k=0; k<height; k++) {
687             ss0 = _mm256_loadu_si256((__m256i*)pRef);
688             ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
689             ss2 = _mm256_loadu_si256((__m256i *)pSrc);
690             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
691             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
692             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
693             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
694             pSrc += srcStride;
695             pRef += refStride;
696           }
697           ss3 = _mm256_adds_epu16(ss3, ss4);
698           ss5 = _mm256_adds_epu16(ss5, ss6);
699           s3 = _mm256_extracti128_si256(ss3, 0);
700           s4 = _mm256_extracti128_si256(ss3, 1);
701           s5 = _mm256_extracti128_si256(ss5, 0);
702           s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), s5);
703           s0 = _mm_or_si128(s0, s8);
704           s0 = _mm_minpos_epu16(s0);
705           temSum1 = _mm_extract_epi16(s0, 0);
706           if (temSum1 < lowSum) {
707             if (temSum1 != 0xFFFF) { // no overflow
708               lowSum = temSum1;
709               xBest = (EB_S16)(j + _mm_extract_epi16(s0, 1));
710               yBest = i;
711             }
712             else {
713               s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
714               s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
715               s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
716               s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
717               s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
718               s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
719               s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), s2);
720               s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), s5);
721               k = leftover;
722               while (k > 0) {
723                 for (l=0; l < 4 && k; l++, k--) {
724                   temSum1 = _mm_extract_epi32(s0, 0);
725                   s0 = _mm_srli_si128(s0, 4);
726                   if (temSum1 < lowSum) {
727                     lowSum = temSum1;
728                     xBest = (EB_S16)(j + leftover - k);
729                     yBest = i;
730                   }
731                 }
732                 s0 = s3;
733               }
734             }
735           }
736         }
737         ref += srcStrideRaw;
738       }
739     }
740     break;
741 
742   case 32:
743     if (height <= 16) {
744       for (i=0; i<searchAreaHeight; i++) {
745         for (j=0; j<=searchAreaWidth-8; j+=8) {
746           pSrc = src;
747           pRef = ref + j;
748           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
749           for (k=0; k<height; k++) {
750             ss0 = _mm256_loadu_si256((__m256i*)pRef);
751             ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
752             ss2 = _mm256_loadu_si256((__m256i *)pSrc);
753             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
754             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
755             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
756             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
757             pSrc += srcStride;
758             pRef += refStride;
759           }
760           ss3 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
761           s3 = _mm256_extracti128_si256(ss3, 0);
762           s5 = _mm256_extracti128_si256(ss3, 1);
763           s4 = _mm_minpos_epu16(s3);
764           s6 = _mm_minpos_epu16(s5);
765           s4 = _mm_unpacklo_epi16(s4, s4);
766           s4 = _mm_unpacklo_epi32(s4, s4);
767           s4 = _mm_unpacklo_epi64(s4, s4);
768           s6 = _mm_unpacklo_epi16(s6, s6);
769           s6 = _mm_unpacklo_epi32(s6, s6);
770           s6 = _mm_unpacklo_epi64(s6, s6);
771           s3 = _mm_sub_epi16(s3, s4);
772           s5 = _mm_adds_epu16(s5, s3);
773           s5 = _mm_sub_epi16(s5, s6);
774           s5 = _mm_minpos_epu16(s5);
775           temSum1  = _mm_extract_epi16(s5, 0);
776           temSum1 += _mm_extract_epi16(s4, 0);
777           temSum1 += _mm_extract_epi16(s6, 0);
778           if (temSum1 < lowSum) {
779             lowSum = temSum1;
780             xBest = (EB_S16)(j + _mm_extract_epi16(s5, 1));
781             yBest = i;
782           }
783         }
784 
785         if (leftover) {
786           pSrc = src;
787           pRef = ref + j;
788           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
789           for (k=0; k<height; k++) {
790             ss0 = _mm256_loadu_si256((__m256i*)pRef);
791             ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
792             ss2 = _mm256_loadu_si256((__m256i *)pSrc);
793             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
794             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
795             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
796             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
797             pSrc += srcStride;
798             pRef += refStride;
799           }
800           ss3 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
801           s3 = _mm256_extracti128_si256(ss3, 0);
802           s5 = _mm256_extracti128_si256(ss3, 1);
803           s3 = _mm_or_si128(s3, s8);
804           s5 = _mm_or_si128(s5, s8);
805           s4 = _mm_minpos_epu16(s3);
806           s6 = _mm_minpos_epu16(s5);
807           s4 = _mm_unpacklo_epi16(s4, s4);
808           s4 = _mm_unpacklo_epi32(s4, s4);
809           s4 = _mm_unpacklo_epi64(s4, s4);
810           s6 = _mm_unpacklo_epi16(s6, s6);
811           s6 = _mm_unpacklo_epi32(s6, s6);
812           s6 = _mm_unpacklo_epi64(s6, s6);
813           s3 = _mm_sub_epi16(s3, s4);
814           s5 = _mm_adds_epu16(s5, s3);
815           s5 = _mm_sub_epi16(s5, s6);
816           s5 = _mm_minpos_epu16(s5);
817           temSum1  = _mm_extract_epi16(s5, 0);
818           temSum1 += _mm_extract_epi16(s4, 0);
819           temSum1 += _mm_extract_epi16(s6, 0);
820           if (temSum1 < lowSum) {
821             lowSum = temSum1;
822             xBest = (EB_S16)(j + _mm_extract_epi16(s5, 1));
823             yBest = i;
824           }
825         }
826         ref += srcStrideRaw;
827       }
828     }
829     else if (height <= 32) {
830       for (i=0; i<searchAreaHeight; i++) {
831         for (j=0; j<=searchAreaWidth-8; j+=8) {
832           pSrc = src;
833           pRef = ref + j;
834           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
835           for (k=0; k<height; k++) {
836             ss0 = _mm256_loadu_si256((__m256i*)pRef);
837             ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
838             ss2 = _mm256_loadu_si256((__m256i *)pSrc);
839             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
840             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
841             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
842             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
843             pSrc += srcStride;
844             pRef += refStride;
845           }
846           ss3 = _mm256_adds_epu16(ss3, ss4);
847           ss5 = _mm256_adds_epu16(ss5, ss6);
848           ss6 = _mm256_adds_epu16(ss3, ss5);
849           s3 = _mm256_extracti128_si256(ss6, 0);
850           s4 = _mm256_extracti128_si256(ss6, 1);
851           s0 = _mm_adds_epu16(s3, s4);
852           s0 = _mm_minpos_epu16(s0);
853           temSum1 = _mm_extract_epi16(s0, 0);
854           if (temSum1 < lowSum) {
855             if (temSum1 != 0xFFFF) { // no overflow
856               lowSum = temSum1;
857               xBest = (EB_S16)(j + _mm_extract_epi16(s0, 1));
858               yBest = i;
859             }
860             else {
861               ss4 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
862               ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
863               ss6 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
864               ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
865               ss4 = _mm256_add_epi32(ss4, ss6);
866               ss3 = _mm256_add_epi32(ss3, ss5);
867               s0 = _mm_add_epi32(_mm256_extracti128_si256(ss4, 0), _mm256_extracti128_si256(ss4, 1));
868               s3 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
869               UPDATE_BEST(s0, 0, 0);
870               UPDATE_BEST(s0, 1, 0);
871               UPDATE_BEST(s0, 2, 0);
872               UPDATE_BEST(s0, 3, 0);
873               UPDATE_BEST(s3, 0, 4);
874               UPDATE_BEST(s3, 1, 4);
875               UPDATE_BEST(s3, 2, 4);
876               UPDATE_BEST(s3, 3, 4);
877             }
878           }
879         }
880 
881         if (leftover) {
882           pSrc = src;
883           pRef = ref + j;
884           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
885           for (k=0; k<height; k++) {
886             ss0 = _mm256_loadu_si256((__m256i*)pRef);
887             ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
888             ss2 = _mm256_loadu_si256((__m256i *)pSrc);
889             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
890             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
891             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
892             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
893             pSrc += srcStride;
894             pRef += refStride;
895           }
896           ss3 = _mm256_adds_epu16(ss3, ss4);
897           ss5 = _mm256_adds_epu16(ss5, ss6);
898           ss6 = _mm256_adds_epu16(ss3, ss5);
899           s3 = _mm256_extracti128_si256(ss6, 0);
900           s4 = _mm256_extracti128_si256(ss6, 1);
901           s0 = _mm_adds_epu16(s3, s4);
902           //s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
903           s0 = _mm_or_si128(s0, s8);
904           s0 = _mm_minpos_epu16(s0);
905           temSum1 = _mm_extract_epi16(s0, 0);
906           if (temSum1 < lowSum) {
907             if (temSum1 != 0xFFFF) { // no overflow
908               lowSum = temSum1;
909               xBest = (EB_S16)(j + _mm_extract_epi16(s0, 1));
910               yBest = i;
911             }
912             else {
913               ss4 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
914               ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
915               ss6 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
916               ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
917               ss4 = _mm256_add_epi32(ss4, ss6);
918               ss3 = _mm256_add_epi32(ss3, ss5);
919               s0 = _mm_add_epi32(_mm256_extracti128_si256(ss4, 0), _mm256_extracti128_si256(ss4, 1));
920               s3 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
921               k = leftover;
922               while (k > 0) {
923                 for (l=0; l < 4 && k; l++, k--) {
924                   temSum1 = _mm_extract_epi32(s0, 0);
925                   s0 = _mm_srli_si128(s0, 4);
926                   if (temSum1 < lowSum) {
927                     lowSum = temSum1;
928                     xBest = (EB_S16)(j + leftover - k);
929                     yBest = i;
930                   }
931                 }
932                 s0 = s3;
933               }
934             }
935           }
936         }
937         ref += srcStrideRaw;
938       }
939     }
940     else {
941       for (i=0; i<searchAreaHeight; i++) {
942         for (j=0; j<=searchAreaWidth-8; j+=8) {
943           pSrc = src;
944           pRef = ref + j;
945           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
946           for (k=0; k<height; k++) {
947             ss0 = _mm256_loadu_si256((__m256i*)pRef);
948             ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
949             ss2 = _mm256_loadu_si256((__m256i *)pSrc);
950             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
951             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
952             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
953             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
954             pSrc += srcStride;
955             pRef += refStride;
956           }
957           ss7 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
958           s3 = _mm256_extracti128_si256(ss7, 0);
959           s4 = _mm256_extracti128_si256(ss7, 1);
960           s0 = _mm_adds_epu16(s3, s4);
961           s0 = _mm_minpos_epu16(s0);
962           temSum1 = _mm_extract_epi16(s0, 0);
963           if (temSum1 < lowSum) {
964             if (temSum1 != 0xFFFF) { // no overflow
965               lowSum = temSum1;
966               xBest = (EB_S16)(j + _mm_extract_epi16(s0, 1));
967               yBest = i;
968             }
969             else {
970               ss0 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
971               ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
972               ss1 = _mm256_unpacklo_epi16(ss4, _mm256_setzero_si256());
973               ss4 = _mm256_unpackhi_epi16(ss4, _mm256_setzero_si256());
974               ss2 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
975               ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
976               ss7 = _mm256_unpacklo_epi16(ss6, _mm256_setzero_si256());
977               ss6 = _mm256_unpackhi_epi16(ss6, _mm256_setzero_si256());
978               ss0 = _mm256_add_epi32(_mm256_add_epi32(ss0, ss1), _mm256_add_epi32(ss2, ss7));
979               ss3 = _mm256_add_epi32(_mm256_add_epi32(ss3, ss4), _mm256_add_epi32(ss5, ss6));
980               s0 = _mm_add_epi32(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
981               s3 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
982               UPDATE_BEST(s0, 0, 0);
983               UPDATE_BEST(s0, 1, 0);
984               UPDATE_BEST(s0, 2, 0);
985               UPDATE_BEST(s0, 3, 0);
986               UPDATE_BEST(s3, 0, 4);
987               UPDATE_BEST(s3, 1, 4);
988               UPDATE_BEST(s3, 2, 4);
989               UPDATE_BEST(s3, 3, 4);
990             }
991           }
992         }
993 
994         if (leftover) {
995           pSrc = src;
996           pRef = ref + j;
997           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
998           for (k=0; k<height; k++) {
999             ss0 = _mm256_loadu_si256((__m256i*)pRef);
1000             ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
1001             ss2 = _mm256_loadu_si256((__m256i *)pSrc);
1002             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1003             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1004             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1005             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1006             pSrc += srcStride;
1007             pRef += refStride;
1008           }
1009           ss7 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
1010           s3 = _mm256_extracti128_si256(ss7, 0);
1011           s4 = _mm256_extracti128_si256(ss7, 1);
1012           s0 = _mm_adds_epu16(s3, s4);
1013           s0 = _mm_minpos_epu16(s0);
1014           temSum1 = _mm_extract_epi16(s0, 0);
1015           if (temSum1 < lowSum) {
1016             if (temSum1 != 0xFFFF) { // no overflow
1017               lowSum = temSum1;
1018               xBest = (EB_S16)(j + _mm_extract_epi16(s0, 1));
1019               yBest = i;
1020             }
1021             else {
1022               ss0 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
1023               ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
1024               ss1 = _mm256_unpacklo_epi16(ss4, _mm256_setzero_si256());
1025               ss4 = _mm256_unpackhi_epi16(ss4, _mm256_setzero_si256());
1026               ss2 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
1027               ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
1028               ss7 = _mm256_unpacklo_epi16(ss6, _mm256_setzero_si256());
1029               ss6 = _mm256_unpackhi_epi16(ss6, _mm256_setzero_si256());
1030               ss0 = _mm256_add_epi32(_mm256_add_epi32(ss0, ss1), _mm256_add_epi32(ss2, ss7));
1031               ss3 = _mm256_add_epi32(_mm256_add_epi32(ss3, ss4), _mm256_add_epi32(ss5, ss6));
1032               s0 = _mm_add_epi32(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1033               s3 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1034               k = leftover;
1035               while (k > 0) {
1036                 for (l=0; l < 4 && k; l++, k--) {
1037                   temSum1 = _mm_extract_epi32(s0, 0);
1038                   s0 = _mm_srli_si128(s0, 4);
1039                   if (temSum1 < lowSum) {
1040                     lowSum = temSum1;
1041                     xBest = (EB_S16)(j + leftover - k);
1042                     yBest = i;
1043                   }
1044                 }
1045                 s0 = s3;
1046               }
1047             }
1048           }
1049         }
1050         ref += srcStrideRaw;
1051       }
1052     }
1053     break;
1054 
1055   case 48:
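    /* 48-wide blocks: columns 0-31 are processed with 256-bit registers
     * (ss0-ss6) and columns 32-47 with 128-bit registers (s0-s6); the two
     * partial sums are combined before the minimum search. */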
1056     if (height <= 32) {
1057       for (i=0; i<searchAreaHeight; i++) {
1058         for (j=0; j<=searchAreaWidth-8; j+=8) {
1059           pSrc = src;
1060           pRef = ref + j;
1061           s3 = s4 = s5 = s6 = _mm_setzero_si128();
1062           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
1063           for (k=0; k<height; k++) {
1064             ss0 = _mm256_loadu_si256((__m256i*)pRef);
1065             ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
1066             ss2 = _mm256_loadu_si256((__m256i *)pSrc);
1067             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1068             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1069             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1070             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1071             s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
1072             s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
1073             s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
1074             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1075             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1076             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1077             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1078             pSrc += srcStride;
1079             pRef += refStride;
1080           }
1081           s3 = _mm_adds_epu16(s3, s4);
1082           s5 = _mm_adds_epu16(s5, s6);
1083           s0 = _mm_adds_epu16(s3, s5);
1084           ss3 = _mm256_adds_epu16(ss3, ss4);
1085           ss5 = _mm256_adds_epu16(ss5, ss6);
1086           ss6 = _mm256_adds_epu16(ss3, ss5);
1087           s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm256_extracti128_si256(ss6, 0), _mm256_extracti128_si256(ss6, 1)));
1088           s0 = _mm_minpos_epu16(s0);
1089           temSum1 = _mm_extract_epi16(s0, 0);
1090           if (temSum1 < lowSum) {
1091             if (temSum1 != 0xFFFF) { // no overflow
1092               lowSum = temSum1;
1093               xBest = (EB_S16)(j + _mm_extract_epi16(s0, 1));
1094               yBest = i;
1095             }
1096             else {
1097               ss4 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
1098               ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
1099               ss6 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
1100               ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
1101               ss4 = _mm256_add_epi32(ss4, ss6);
1102               ss3 = _mm256_add_epi32(ss3, ss5);
1103               s0 = _mm_add_epi32(_mm256_extracti128_si256(ss4, 0), _mm256_extracti128_si256(ss4, 1));
1104               s1 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1105               s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s3, _mm_setzero_si128()));
1106               s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s5, _mm_setzero_si128()));
1107               s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s3, _mm_setzero_si128()));
1108               s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s5, _mm_setzero_si128()));
1109               UPDATE_BEST(s0, 0, 0);
1110               UPDATE_BEST(s0, 1, 0);
1111               UPDATE_BEST(s0, 2, 0);
1112               UPDATE_BEST(s0, 3, 0);
1113               UPDATE_BEST(s1, 0, 4);
1114               UPDATE_BEST(s1, 1, 4);
1115               UPDATE_BEST(s1, 2, 4);
1116               UPDATE_BEST(s1, 3, 4);
1117             }
1118           }
1119         }
1120 
1121         if (leftover) {
1122           pSrc = src;
1123           pRef = ref + j;
1124           s3 = s4 = s5 = s6 = _mm_setzero_si128();
1125           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
1126           for (k=0; k<height; k++) {
1127             ss0 = _mm256_loadu_si256((__m256i*)pRef);
1128             ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
1129             ss2 = _mm256_loadu_si256((__m256i *)pSrc);
1130             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1131             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1132             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1133             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1134             s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
1135             s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
1136             s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
1137             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1138             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1139             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1140             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1141             pSrc += srcStride;
1142             pRef += refStride;
1143           }
1144           s3 = _mm_adds_epu16(s3, s4);
1145           s5 = _mm_adds_epu16(s5, s6);
1146           s0 = _mm_adds_epu16(s3, s5);
1147           ss3 = _mm256_adds_epu16(ss3, ss4);
1148           ss5 = _mm256_adds_epu16(ss5, ss6);
1149           ss6 = _mm256_adds_epu16(ss3, ss5);
1150           s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm256_extracti128_si256(ss6, 0), _mm256_extracti128_si256(ss6, 1)));
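          /* s8 holds 0xFFFF in every 16-bit lane beyond the `leftover`
             valid search positions (see its setup at the top of the
             function); OR-ing it in makes minpos ignore those lanes. */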
1151           s0 = _mm_or_si128(s0, s8);
1152           s0 = _mm_minpos_epu16(s0);
1153           temSum1 = _mm_extract_epi16(s0, 0);
1154           if (temSum1 < lowSum) {
1155             if (temSum1 != 0xFFFF) { // no overflow
1156               lowSum = temSum1;
1157               xBest = (EB_S16)(j + _mm_extract_epi16(s0, 1));
1158               yBest = i;
1159             }
1160             else {
1161               ss4 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
1162               ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
1163               ss6 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
1164               ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
1165               ss4 = _mm256_add_epi32(ss4, ss6);
1166               ss3 = _mm256_add_epi32(ss3, ss5);
1167               s0 = _mm_add_epi32(_mm256_extracti128_si256(ss4, 0), _mm256_extracti128_si256(ss4, 1));
1168               s1 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1169               s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s3, _mm_setzero_si128()));
1170               s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s5, _mm_setzero_si128()));
1171               s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s3, _mm_setzero_si128()));
1172               s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s5, _mm_setzero_si128()));
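              /* Scan the widened 32-bit sums one position at a time: the
                 first four candidates are pulled out of s0, the remaining
                 ones out of s1, stopping after `leftover` positions. */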
1173               k = leftover;
1174               while (k > 0) {
1175                 for (l=0; l < 4 && k; l++, k--) {
1176                   temSum1 = _mm_extract_epi32(s0, 0);
1177                   s0 = _mm_srli_si128(s0, 4);
1178                   if (temSum1 < lowSum) {
1179                     lowSum = temSum1;
1180                     xBest = (EB_S16)(j + leftover - k);
1181                     yBest = i;
1182                   }
1183                 }
1184                 s0 = s1;
1185               }
1186             }
1187           }
1188         }
1189         ref += srcStrideRaw;
1190       }
1191     }
1192     else {
1193       for (i=0; i<searchAreaHeight; i++) {
1194         for (j=0; j<=searchAreaWidth-8; j+=8) {
1195           pSrc = src;
1196           pRef = ref + j;
1197           s3 = s4 = s5 = s6 = _mm_setzero_si128();
1198           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
1199           for (k=0; k<height; k++) {
1200             ss0 = _mm256_loadu_si256((__m256i*)pRef);
1201             ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
1202             ss2 = _mm256_loadu_si256((__m256i *)pSrc);
1203             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1204             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1205             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1206             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1207             s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
1208             s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
1209             s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
1210             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1211             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1212             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1213             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1214             pSrc += srcStride;
1215             pRef += refStride;
1216           }
1217           s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
1218           ss7 = _mm256_adds_epu16(ss3, ss4);
1219           ss8 = _mm256_adds_epu16(ss5, ss6);
1220           ss7 = _mm256_adds_epu16(ss7, ss8);
1221           s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm256_extracti128_si256(ss7, 0), _mm256_extracti128_si256(ss7, 1)));
1222           s0 = _mm_minpos_epu16(s0);
1223           temSum1 = _mm_extract_epi16(s0, 0);
1224           if (temSum1 < lowSum) {
1225             if (temSum1 != 0xFFFF) { // no overflow
1226               lowSum = temSum1;
1227               xBest = (EB_S16)(j + _mm_extract_epi16(s0, 1));
1228               yBest = i;
1229             }
1230             else {
1231               ss0 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
1232               ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
1233               ss1 = _mm256_unpacklo_epi16(ss4, _mm256_setzero_si256());
1234               ss4 = _mm256_unpackhi_epi16(ss4, _mm256_setzero_si256());
1235               ss2 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
1236               ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
1237               ss7 = _mm256_unpacklo_epi16(ss6, _mm256_setzero_si256());
1238               ss6 = _mm256_unpackhi_epi16(ss6, _mm256_setzero_si256());
1239               ss0 = _mm256_add_epi32(_mm256_add_epi32(ss0, ss1), _mm256_add_epi32(ss2, ss7));
1240               ss3 = _mm256_add_epi32(_mm256_add_epi32(ss3, ss4), _mm256_add_epi32(ss5, ss6));
1241               s0 = _mm_add_epi32(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1242               s1 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1243               s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s3, _mm_setzero_si128()));
1244               s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s4, _mm_setzero_si128()));
1245               s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s5, _mm_setzero_si128()));
1246               s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s6, _mm_setzero_si128()));
1247               s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s3, _mm_setzero_si128()));
1248               s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s4, _mm_setzero_si128()));
1249               s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s5, _mm_setzero_si128()));
1250               s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s6, _mm_setzero_si128()));
1251               UPDATE_BEST(s0, 0, 0);
1252               UPDATE_BEST(s0, 1, 0);
1253               UPDATE_BEST(s0, 2, 0);
1254               UPDATE_BEST(s0, 3, 0);
1255               UPDATE_BEST(s1, 0, 4);
1256               UPDATE_BEST(s1, 1, 4);
1257               UPDATE_BEST(s1, 2, 4);
1258               UPDATE_BEST(s1, 3, 4);
1259             }
1260           }
1261         }
1262 
1263         if (leftover) {
1264           pSrc = src;
1265           pRef = ref + j;
1266           s3 = s4 = s5 = s6 = _mm_setzero_si128();
1267           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
1268           for (k=0; k<height; k++) {
1269             ss0 = _mm256_loadu_si256((__m256i*)pRef);
1270             ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
1271             ss2 = _mm256_loadu_si256((__m256i *)pSrc);
1272             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1273             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1274             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1275             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1276             s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
1277             s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
1278             s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
1279             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1280             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1281             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1282             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1283             pSrc += srcStride;
1284             pRef += refStride;
1285           }
1286           s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
1287           ss7 = _mm256_adds_epu16(ss3, ss4);
1288           ss8 = _mm256_adds_epu16(ss5, ss6);
1289           ss7 = _mm256_adds_epu16(ss7, ss8);
1290           s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm256_extracti128_si256(ss7, 0), _mm256_extracti128_si256(ss7, 1)));
1291           s0 = _mm_or_si128(s0, s8);
1292           s0 = _mm_minpos_epu16(s0);
1293           temSum1 = _mm_extract_epi16(s0, 0);
1294           if (temSum1 < lowSum) {
1295             if (temSum1 != 0xFFFF) { // no overflow
1296               lowSum = temSum1;
1297               xBest = (EB_S16)(j + _mm_extract_epi16(s0, 1));
1298               yBest = i;
1299             }
1300             else {
1301               ss0 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
1302               ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
1303               ss1 = _mm256_unpacklo_epi16(ss4, _mm256_setzero_si256());
1304               ss4 = _mm256_unpackhi_epi16(ss4, _mm256_setzero_si256());
1305               ss2 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
1306               ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
1307               ss7 = _mm256_unpacklo_epi16(ss6, _mm256_setzero_si256());
1308               ss6 = _mm256_unpackhi_epi16(ss6, _mm256_setzero_si256());
1309               ss0 = _mm256_add_epi32(_mm256_add_epi32(ss0, ss1), _mm256_add_epi32(ss2, ss7));
1310               ss3 = _mm256_add_epi32(_mm256_add_epi32(ss3, ss4), _mm256_add_epi32(ss5, ss6));
1311               s0 = _mm_add_epi32(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1312               s1 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1313               s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s3, _mm_setzero_si128()));
1314               s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s4, _mm_setzero_si128()));
1315               s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s5, _mm_setzero_si128()));
1316               s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s6, _mm_setzero_si128()));
1317               s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s3, _mm_setzero_si128()));
1318               s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s4, _mm_setzero_si128()));
1319               s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s5, _mm_setzero_si128()));
1320               s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s6, _mm_setzero_si128()));
1321               k = leftover;
1322               while (k > 0) {
1323                 for (l=0; l < 4 && k; l++, k--) {
1324                   temSum1 = _mm_extract_epi32(s0, 0);
1325                   s0 = _mm_srli_si128(s0, 4);
1326                   if (temSum1 < lowSum) {
1327                     lowSum = temSum1;
1328                     xBest = (EB_S16)(j + leftover - k);
1329                     yBest = i;
1330                   }
1331                 }
1332                 s0 = s1;
1333               }
1334             }
1335           }
1336         }
1337         ref += srcStrideRaw;
1338       }
1339     }
1340     break;
1341 
1342   case 64:
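    /* Each 64-wide row needs eight mpsadbw results.  For heights up to 32 the
       two 32-byte halves of a row can share the four 16-bit accumulators
       ss3..ss6 without saturating (2 * 1020 * 32 < 0xFFFF); taller blocks use
       eight accumulators (ss3..ss10) so each lane stays below 0xFFFF. */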
1343     if (height <= 32) {
1344       for (i=0; i<searchAreaHeight; i++) {
1345         for (j=0; j<=searchAreaWidth-8; j+=8) {
1346           pSrc = src;
1347           pRef = ref + j;
1348           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
1349           for (k=0; k<height; k++) {
1350             ss0 = _mm256_loadu_si256((__m256i*)pRef);
1351             ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
1352             ss2 = _mm256_loadu_si256((__m256i *)pSrc);
1353             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1354             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1355             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1356             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1357             ss0 = _mm256_loadu_si256((__m256i*)(pRef + 32));
1358             ss1 = _mm256_loadu_si256((__m256i*)(pRef + 40));
1359             ss2 = _mm256_loadu_si256((__m256i *)(pSrc + 32));
1360             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1361             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1362             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1363             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1364             pSrc += srcStride;
1365             pRef += refStride;
1366           }
1367           ss7 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
1368           s3 = _mm256_extracti128_si256(ss7, 0);
1369           s4 = _mm256_extracti128_si256(ss7, 1);
1370           s0 = _mm_adds_epu16(s3, s4);
1371           s0 = _mm_minpos_epu16(s0);
1372           temSum1 = _mm_extract_epi16(s0, 0);
1373           if (temSum1 < lowSum) {
1374             if (temSum1 != 0xFFFF) { // no overflow
1375               lowSum = temSum1;
1376               xBest = (EB_S16)(j + _mm_extract_epi16(s0, 1));
1377               yBest = i;
1378             }
1379             else {
1380               ss0 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
1381               ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
1382               ss1 = _mm256_unpacklo_epi16(ss4, _mm256_setzero_si256());
1383               ss4 = _mm256_unpackhi_epi16(ss4, _mm256_setzero_si256());
1384               ss2 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
1385               ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
1386               ss7 = _mm256_unpacklo_epi16(ss6, _mm256_setzero_si256());
1387               ss6 = _mm256_unpackhi_epi16(ss6, _mm256_setzero_si256());
1388               ss0 = _mm256_add_epi32(_mm256_add_epi32(ss0, ss1), _mm256_add_epi32(ss2, ss7));
1389               ss3 = _mm256_add_epi32(_mm256_add_epi32(ss3, ss4), _mm256_add_epi32(ss5, ss6));
1390               s0 = _mm_add_epi32(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1391               s3 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1392               UPDATE_BEST(s0, 0, 0);
1393               UPDATE_BEST(s0, 1, 0);
1394               UPDATE_BEST(s0, 2, 0);
1395               UPDATE_BEST(s0, 3, 0);
1396               UPDATE_BEST(s3, 0, 4);
1397               UPDATE_BEST(s3, 1, 4);
1398               UPDATE_BEST(s3, 2, 4);
1399               UPDATE_BEST(s3, 3, 4);
1400             }
1401           }
1402         }
1403 
1404         if (leftover) {
1405           pSrc = src;
1406           pRef = ref + j;
1407           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
1408           for (k=0; k<height; k++) {
1409             ss0 = _mm256_loadu_si256((__m256i*)pRef);
1410             ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
1411             ss2 = _mm256_loadu_si256((__m256i *)pSrc);
1412             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1413             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1414             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1415             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1416             ss0 = _mm256_loadu_si256((__m256i*)(pRef + 32));
1417             ss1 = _mm256_loadu_si256((__m256i*)(pRef + 40));
1418             ss2 = _mm256_loadu_si256((__m256i *)(pSrc + 32));
1419             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1420             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1421             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1422             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1423             pSrc += srcStride;
1424             pRef += refStride;
1425           }
1426           ss7 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
1427           s3 = _mm256_extracti128_si256(ss7, 0);
1428           s4 = _mm256_extracti128_si256(ss7, 1);
1429           s0 = _mm_adds_epu16(s3, s4);
1430           s0 = _mm_or_si128(s0, s8);
1431           s0 = _mm_minpos_epu16(s0);
1432           temSum1 = _mm_extract_epi16(s0, 0);
1433           if (temSum1 < lowSum) {
1434             if (temSum1 != 0xFFFF) { // no overflow
1435               lowSum = temSum1;
1436               xBest = (EB_S16)(j + _mm_extract_epi16(s0, 1));
1437               yBest = i;
1438             }
1439             else {
1440               ss0 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
1441               ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
1442               ss1 = _mm256_unpacklo_epi16(ss4, _mm256_setzero_si256());
1443               ss4 = _mm256_unpackhi_epi16(ss4, _mm256_setzero_si256());
1444               ss2 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
1445               ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
1446               ss7 = _mm256_unpacklo_epi16(ss6, _mm256_setzero_si256());
1447               ss6 = _mm256_unpackhi_epi16(ss6, _mm256_setzero_si256());
1448               ss0 = _mm256_add_epi32(_mm256_add_epi32(ss0, ss1), _mm256_add_epi32(ss2, ss7));
1449               ss3 = _mm256_add_epi32(_mm256_add_epi32(ss3, ss4), _mm256_add_epi32(ss5, ss6));
1450               s0 = _mm_add_epi32(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1451               s3 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1452               k = leftover;
1453               while (k > 0) {
1454                 for (l=0; l < 4 && k; l++, k--) {
1455                   temSum1 = _mm_extract_epi32(s0, 0);
1456                   s0 = _mm_srli_si128(s0, 4);
1457                   if (temSum1 < lowSum) {
1458                     lowSum = temSum1;
1459                     xBest = (EB_S16)(j + leftover - k);
1460                     yBest = i;
1461                   }
1462                 }
1463                 s0 = s3;
1464               }
1465             }
1466           }
1467         }
1468         ref += srcStrideRaw;
1469       }
1470     }
1471     else {
1472       __m256i ss9, ss10;
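      /* Extra accumulators for the tall-block path: ss3..ss6 take columns
         0..31 of each row, ss7..ss10 take columns 32..63. */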
1473       for (i=0; i<searchAreaHeight; i++) {
1474         for (j=0; j<=searchAreaWidth-8; j+=8) {
1475           pSrc = src;
1476           pRef = ref + j;
1477           ss3 = ss4 = ss5 = ss6 = ss7 = ss8 = ss9 = ss10 = _mm256_setzero_si256();
1478           for (k=0; k<height; k++) {
1479             ss0 = _mm256_loadu_si256((__m256i*)pRef);
1480             ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
1481             ss2 = _mm256_loadu_si256((__m256i *)pSrc);
1482             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1483             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1484             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1485             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1486             ss0 = _mm256_loadu_si256((__m256i*)(pRef + 32));
1487             ss1 = _mm256_loadu_si256((__m256i*)(pRef + 40));
1488             ss2 = _mm256_loadu_si256((__m256i *)(pSrc + 32));
1489             ss7 = _mm256_adds_epu16(ss7, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1490             ss8 = _mm256_adds_epu16(ss8, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1491             ss9 = _mm256_adds_epu16(ss9, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1492             ss10 = _mm256_adds_epu16(ss10, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1493             pSrc += srcStride;
1494             pRef += refStride;
1495           }
1496           ss0 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
1497           ss0 = _mm256_adds_epu16(ss0, _mm256_adds_epu16(_mm256_adds_epu16(ss7, ss8), _mm256_adds_epu16(ss9, ss10)));
1498           s0 = _mm_adds_epu16(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1499           s0 = _mm_minpos_epu16(s0);
1500           temSum1 = _mm_extract_epi16(s0, 0);
1501           if (temSum1 < lowSum) {
1502             if (temSum1 != 0xFFFF) { // no overflow
1503               lowSum = temSum1;
1504               xBest = (EB_S16)(j + _mm_extract_epi16(s0, 1));
1505               yBest = i;
1506             }
1507             else {
1508               ss0 = _mm256_add_epi32(_mm256_add_epi32(_mm256_unpacklo_epi16(ss3, _mm256_setzero_si256()), _mm256_unpacklo_epi16(ss4, _mm256_setzero_si256())), _mm256_add_epi32(_mm256_unpacklo_epi16(ss5, _mm256_setzero_si256()), _mm256_unpacklo_epi16(ss6, _mm256_setzero_si256())));
1509               ss1 = _mm256_add_epi32(_mm256_add_epi32(_mm256_unpackhi_epi16(ss3, _mm256_setzero_si256()), _mm256_unpackhi_epi16(ss4, _mm256_setzero_si256())), _mm256_add_epi32(_mm256_unpackhi_epi16(ss5, _mm256_setzero_si256()), _mm256_unpackhi_epi16(ss6, _mm256_setzero_si256())));
1510               ss2 = _mm256_add_epi32(_mm256_add_epi32(_mm256_unpacklo_epi16(ss7, _mm256_setzero_si256()), _mm256_unpacklo_epi16(ss8, _mm256_setzero_si256())), _mm256_add_epi32(_mm256_unpacklo_epi16(ss9, _mm256_setzero_si256()), _mm256_unpacklo_epi16(ss10, _mm256_setzero_si256())));
1511               ss3 = _mm256_add_epi32(_mm256_add_epi32(_mm256_unpackhi_epi16(ss7, _mm256_setzero_si256()), _mm256_unpackhi_epi16(ss8, _mm256_setzero_si256())), _mm256_add_epi32(_mm256_unpackhi_epi16(ss9, _mm256_setzero_si256()), _mm256_unpackhi_epi16(ss10, _mm256_setzero_si256())));
1512               ss0 = _mm256_add_epi32(ss0, ss2);
1513               ss1 = _mm256_add_epi32(ss1, ss3);
1514               s0 = _mm_add_epi32(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1515               s3 = _mm_add_epi32(_mm256_extracti128_si256(ss1, 0), _mm256_extracti128_si256(ss1, 1));
1516               UPDATE_BEST(s0, 0, 0);
1517               UPDATE_BEST(s0, 1, 0);
1518               UPDATE_BEST(s0, 2, 0);
1519               UPDATE_BEST(s0, 3, 0);
1520               UPDATE_BEST(s3, 0, 4);
1521               UPDATE_BEST(s3, 1, 4);
1522               UPDATE_BEST(s3, 2, 4);
1523               UPDATE_BEST(s3, 3, 4);
1524             }
1525           }
1526         }
1527 
1528         if (leftover) {
1529           pSrc = src;
1530           pRef = ref + j;
1531           ss3 = ss4 = ss5 = ss6 = ss7 = ss8 = ss9 = ss10 = _mm256_setzero_si256();
1532           for (k=0; k<height; k++) {
1533             ss0 = _mm256_loadu_si256((__m256i*)pRef);
1534             ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
1535             ss2 = _mm256_loadu_si256((__m256i *)pSrc);
1536             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1537             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1538             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1539             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1540             ss0 = _mm256_loadu_si256((__m256i*)(pRef + 32));
1541             ss1 = _mm256_loadu_si256((__m256i*)(pRef + 40));
1542             ss2 = _mm256_loadu_si256((__m256i *)(pSrc + 32));
1543             ss7 = _mm256_adds_epu16(ss7, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1544             ss8 = _mm256_adds_epu16(ss8, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1545             ss9 = _mm256_adds_epu16(ss9, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1546             ss10 = _mm256_adds_epu16(ss10, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1547             pSrc += srcStride;
1548             pRef += refStride;
1549           }
1550           ss0 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
1551           ss0 = _mm256_adds_epu16(ss0, _mm256_adds_epu16(_mm256_adds_epu16(ss7, ss8), _mm256_adds_epu16(ss9, ss10)));
1552           s0 = _mm_adds_epu16(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1553           s0 = _mm_or_si128(s0, s8);
1554           s0 = _mm_minpos_epu16(s0);
1555           temSum1 = _mm_extract_epi16(s0, 0);
1556           if (temSum1 < lowSum) {
1557             if (temSum1 != 0xFFFF) { // no overflow
1558               lowSum = temSum1;
1559               xBest = (EB_S16)(j + _mm_extract_epi16(s0, 1));
1560               yBest = i;
1561             }
1562             else {
1563               ss0 = _mm256_add_epi32(_mm256_add_epi32(_mm256_unpacklo_epi16(ss3, _mm256_setzero_si256()), _mm256_unpacklo_epi16(ss4, _mm256_setzero_si256())), _mm256_add_epi32(_mm256_unpacklo_epi16(ss5, _mm256_setzero_si256()), _mm256_unpacklo_epi16(ss6, _mm256_setzero_si256())));
1564               ss1 = _mm256_add_epi32(_mm256_add_epi32(_mm256_unpackhi_epi16(ss3, _mm256_setzero_si256()), _mm256_unpackhi_epi16(ss4, _mm256_setzero_si256())), _mm256_add_epi32(_mm256_unpackhi_epi16(ss5, _mm256_setzero_si256()), _mm256_unpackhi_epi16(ss6, _mm256_setzero_si256())));
1565               ss2 = _mm256_add_epi32(_mm256_add_epi32(_mm256_unpacklo_epi16(ss7, _mm256_setzero_si256()), _mm256_unpacklo_epi16(ss8, _mm256_setzero_si256())), _mm256_add_epi32(_mm256_unpacklo_epi16(ss9, _mm256_setzero_si256()), _mm256_unpacklo_epi16(ss10, _mm256_setzero_si256())));
1566               ss3 = _mm256_add_epi32(_mm256_add_epi32(_mm256_unpackhi_epi16(ss7, _mm256_setzero_si256()), _mm256_unpackhi_epi16(ss8, _mm256_setzero_si256())), _mm256_add_epi32(_mm256_unpackhi_epi16(ss9, _mm256_setzero_si256()), _mm256_unpackhi_epi16(ss10, _mm256_setzero_si256())));
1567               ss0 = _mm256_add_epi32(ss0, ss2);
1568               ss1 = _mm256_add_epi32(ss1, ss3);
1569               s0 = _mm_add_epi32(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1570               s3 = _mm_add_epi32(_mm256_extracti128_si256(ss1, 0), _mm256_extracti128_si256(ss1, 1));
1571               k = leftover;
1572               while (k > 0) {
1573                 for (l=0; l < 4 && k; l++, k--) {
1574                   temSum1 = _mm_extract_epi32(s0, 0);
1575                   s0 = _mm_srli_si128(s0, 4);
1576                   if (temSum1 < lowSum) {
1577                     lowSum = temSum1;
1578                     xBest = (EB_S16)(j + leftover - k);
1579                     yBest = i;
1580                   }
1581                 }
1582                 s0 = s3;
1583               }
1584             }
1585           }
1586         }
1587         ref += srcStrideRaw;
1588       }
1589     }
1590     break;
1591 
1592   default:
1593     break;
1594   }
1595 
1596 
1597   *bestSad = lowSum;
1598   *xSearchCenter = xBest;
1599   *ySearchCenter = yBest;
1600 }
1601 
1602 /*******************************************************************************
1603 * Requirement: height % 2 = 0
1604 *******************************************************************************/
1605 EB_U32 Compute24xMSad_AVX2_INTRIN(
1606 	EB_U8  *src,        // input parameter, source samples Ptr
1607 	EB_U32  srcStride,  // input parameter, source stride
1608 	EB_U8  *ref,        // input parameter, reference samples Ptr
1609 	EB_U32  refStride,  // input parameter, reference stride
1610 	EB_U32  height,     // input parameter, block height (M)
1611 	EB_U32  width)     // input parameter, block width (N)
1612 {
1613 	__m128i xmm0, xmm1;
1614 	__m256i ymm0, ymm1;
1615 	EB_U32 y;
1616 	(void)width;
1617 
1618 	ymm0 = ymm1 = _mm256_setzero_si256();
1619 	for (y = 0; y < height; y += 2) {
1620 		ymm0 = _mm256_add_epi32(ymm0, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)src), _mm256_loadu_si256((__m256i *)ref)));
1621 		ymm1 = _mm256_add_epi32(ymm1, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)(src + srcStride)), _mm256_loadu_si256((__m256i *)(ref + refStride))));
1622 		src += srcStride << 1;
1623 		ref += refStride << 1;
1624 	}
1625 	xmm0 = _mm_add_epi32(_mm256_extracti128_si256(ymm0, 0), _mm256_extracti128_si256(ymm1, 0));
1626 	xmm1 = _mm_add_epi32(_mm256_extracti128_si256(ymm0, 1), _mm256_extracti128_si256(ymm1, 1));
1627 	xmm0 = _mm_add_epi32(_mm_add_epi32(xmm0, _mm_srli_si128(xmm0, 8)), xmm1);
1628 	return (EB_U32)_mm_cvtsi128_si32(xmm0);
1629 }
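/* Illustrative call (hypothetical pointers and strides): SAD of a 24x16 block.
 * The unused width argument is still passed to match the kernel signature.
 *
 *   EB_U32 sad = Compute24xMSad_AVX2_INTRIN(srcPtr, srcStride,
 *                                           refPtr, refStride,
 *                                           16, 24);
 */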
1630 
1631 /*******************************************************************************
1632 * Requirement: height % 2 = 0
1633 *******************************************************************************/
1634 EB_U32 Compute32xMSad_AVX2_INTRIN(
1635 	EB_U8  *src,        // input parameter, source samples Ptr
1636 	EB_U32  srcStride,  // input parameter, source stride
1637 	EB_U8  *ref,        // input parameter, reference samples Ptr
1638 	EB_U32  refStride,  // input parameter, reference stride
1639 	EB_U32  height,     // input parameter, block height (M)
1640 	EB_U32  width)     // input parameter, block width (N)
1641 {
1642 	__m128i xmm0;
1643 	__m256i ymm0, ymm1;
1644 	EB_U32 y;
1645 	(void)width;
1646 
1647 	ymm0 = ymm1 = _mm256_setzero_si256();
1648 	for (y = 0; y < height; y += 2) {
1649 		ymm0 = _mm256_add_epi32(ymm0, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)src), _mm256_loadu_si256((__m256i *)ref)));
1650 		ymm1 = _mm256_add_epi32(ymm1, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)(src + srcStride)), _mm256_loadu_si256((__m256i *)(ref + refStride))));
1651 		src += srcStride << 1;
1652 		ref += refStride << 1;
1653 	}
1654 	ymm0 = _mm256_add_epi32(ymm0, ymm1);
1655 	xmm0 = _mm_add_epi32(_mm256_castsi256_si128(ymm0), _mm256_extracti128_si256(ymm0, 1));
1656 	xmm0 = _mm_add_epi32(xmm0, _mm_srli_si128(xmm0, 8));
1657 	return (EB_U32)_mm_cvtsi128_si32(xmm0);
1658 }
1659 
1660 /*******************************************************************************
1661 * Requirement: height % 2 = 0
1662 *******************************************************************************/
1663 EB_U32 Compute40xMSad_AVX2_INTRIN(
1664         EB_U8  *src,        // input parameter, source samples Ptr
1665         EB_U32  srcStride,  // input parameter, source stride
1666         EB_U8  *ref,        // input parameter, reference samples Ptr
1667         EB_U32  refStride,  // input parameter, reference stride
1668         EB_U32  height,     // input parameter, block height (M)
1669         EB_U32  width)     // input parameter, block width (N)
1670 {
1671         __m128i xmm0, xmm1;
1672         __m256i ymm0, ymm1;
1673         EB_U32 y;
1674         (void)width;
1675 
1676         ymm0 = ymm1 = _mm256_setzero_si256();
1677         xmm0 = xmm1 = _mm_setzero_si128();
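        /* A 40-wide row is handled as one 32-byte AVX2 SAD plus one 8-byte
           SSE SAD (loadl_epi64) for the remaining columns 32..39. */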
1678         for (y = 0; y < height; y += 2) {
1679                 ymm0 = _mm256_add_epi32(ymm0, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)src), _mm256_loadu_si256((__m256i*)ref)));
1680                 xmm0 = _mm_add_epi16(xmm0, _mm_sad_epu8(_mm_loadl_epi64((__m128i*)(src + 32)), _mm_loadl_epi64((__m128i*)(ref + 32))));
1681                 ymm1 = _mm256_add_epi32(ymm1, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)(src + srcStride)), _mm256_loadu_si256((__m256i*)(ref + refStride))));
1682                 xmm1 = _mm_add_epi16(xmm1, _mm_sad_epu8(_mm_loadl_epi64((__m128i*)(src + srcStride + 32)), _mm_loadl_epi64((__m128i*)(ref + refStride + 32))));
1683 
1684                 src += srcStride << 1;
1685                 ref += refStride << 1;
1686         }
1687         ymm0 = _mm256_add_epi32(ymm0, ymm1);
1688         xmm0 = _mm_add_epi32(xmm0, xmm1);
1689         xmm0 = _mm_add_epi32(xmm0, _mm_add_epi32(_mm256_extracti128_si256(ymm0, 0), _mm256_extracti128_si256(ymm0, 1)));
1690         xmm0 = _mm_add_epi32(xmm0, _mm_srli_si128(xmm0, 8));
1691         return _mm_extract_epi32(xmm0, 0);
1692 }
1693 
1694 /*******************************************************************************
1695 * Requirement: height % 2 = 0
1696 *******************************************************************************/
1697 EB_U32 Compute48xMSad_AVX2_INTRIN(
1698 	EB_U8  *src,        // input parameter, source samples Ptr
1699 	EB_U32  srcStride,  // input parameter, source stride
1700 	EB_U8  *ref,        // input parameter, reference samples Ptr
1701 	EB_U32  refStride,  // input parameter, reference stride
1702 	EB_U32  height,     // input parameter, block height (M)
1703 	EB_U32  width)     // input parameter, block width (N)
1704 {
1705 	__m128i xmm0, xmm1;
1706 	__m256i ymm0, ymm1;
1707 	EB_U32 y;
1708 	(void)width;
1709 
1710 	ymm0 = ymm1 = _mm256_setzero_si256();
1711 	xmm0 = xmm1 = _mm_setzero_si128();
1712 	for (y = 0; y < height; y += 2) {
1713 		ymm0 = _mm256_add_epi32(ymm0, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)src), _mm256_loadu_si256((__m256i*)ref)));
1714 		xmm0 = _mm_add_epi32(xmm0, _mm_sad_epu8(_mm_loadu_si128((__m128i*)(src + 32)), _mm_loadu_si128((__m128i*)(ref + 32))));
1715 		ymm1 = _mm256_add_epi32(ymm1, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)(src + srcStride)), _mm256_loadu_si256((__m256i*)(ref + refStride))));
1716 		xmm1 = _mm_add_epi32(xmm1, _mm_sad_epu8(_mm_loadu_si128((__m128i*)(src + srcStride + 32)), _mm_loadu_si128((__m128i*)(ref + refStride + 32))));
1717 		src += srcStride << 1;
1718 		ref += refStride << 1;
1719 	}
1720 	ymm0 = _mm256_add_epi32(ymm0, ymm1);
1721 	xmm0 = _mm_add_epi32(xmm0, xmm1);
1722 	xmm0 = _mm_add_epi32(xmm0, _mm_add_epi32(_mm256_extracti128_si256(ymm0, 0), _mm256_extracti128_si256(ymm0, 1)));
1723 	xmm0 = _mm_add_epi32(xmm0, _mm_srli_si128(xmm0, 8));
1724 	return _mm_extract_epi32(xmm0, 0);
1725 }
1726 
1727 /*******************************************************************************
1728 * Requirement: height % 2 = 0
1729 *******************************************************************************/
1730 EB_U32 Compute56xMSad_AVX2_INTRIN(
1731         EB_U8  *src,        // input parameter, source samples Ptr
1732         EB_U32  srcStride,  // input parameter, source stride
1733         EB_U8  *ref,        // input parameter, reference samples Ptr
1734         EB_U32  refStride,  // input parameter, reference stride
1735         EB_U32  height,     // input parameter, block height (M)
1736         EB_U32  width)     // input parameter, block width (N)
1737 {
1738         __m128i xmm0, xmm1;
1739         __m256i ymm0, ymm1, ymm2, ymm3;
1740         EB_U32 y;
1741         (void)width;
1742 
1743         ymm0 = ymm1 = ymm2 = ymm3 = _mm256_setzero_si256();
1744         for (y = 0; y < height; y += 2) {
1745                 ymm0 = _mm256_add_epi32(ymm0, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)src), _mm256_loadu_si256((__m256i*)ref)));
1746                 ymm1 = _mm256_add_epi32(ymm1, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)(src + 32)), _mm256_loadu_si256((__m256i*)(ref + 32))));
1747                 ymm2 = _mm256_add_epi32(ymm2, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)(src + srcStride)), _mm256_loadu_si256((__m256i*)(ref + refStride))));
1748                 ymm3 = _mm256_add_epi32(ymm3, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)(src + srcStride + 32)), _mm256_loadu_si256((__m256i*)(ref + refStride + 32))));
1749 
1750                 src += srcStride << 1;
1751                 ref += refStride << 1;
1752         }
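        /* The loads above cover bytes 0..63 of each row although the block is
           only 56 wide; the reduction below adds only the qword sums for
           bytes 0..55 and leaves the over-read bytes 56..63 (the high qword
           of the upper lane of ymm1/ymm3) out of the final total. */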
1753         ymm0 = _mm256_add_epi32(ymm0, ymm2);
1754         xmm0 = _mm_add_epi32(_mm256_extracti128_si256(ymm0, 0), _mm256_extracti128_si256(ymm0, 1));
1755 
1756         xmm0 = _mm_add_epi32(xmm0, _mm_add_epi32(_mm256_extracti128_si256(ymm1, 0), _mm256_extracti128_si256(ymm3, 0)));
1757         xmm1 = _mm_add_epi32(_mm256_extracti128_si256(ymm1, 1), _mm256_extracti128_si256(ymm3, 1));
1758 
1759         xmm0 = _mm_add_epi32(_mm_add_epi32(xmm0, _mm_srli_si128(xmm0, 8)), xmm1);
1760         return _mm_extract_epi32(xmm0, 0);
1761 }
1762 
1763 /*******************************************************************************
1764 * Requirement: height % 2 = 0
1765 *******************************************************************************/
1766 EB_U32 Compute64xMSad_AVX2_INTRIN(
1767 	EB_U8  *src,        // input parameter, source samples Ptr
1768 	EB_U32  srcStride,  // input parameter, source stride
1769 	EB_U8  *ref,        // input parameter, reference samples Ptr
1770 	EB_U32  refStride,  // input parameter, reference stride
1771 	EB_U32  height,     // input parameter, block height (M)
1772 	EB_U32  width)     // input parameter, block width (N)
1773 {
1774 	__m128i xmm0;
1775 	__m256i ymm0, ymm1, ymm2, ymm3;
1776 	EB_U32 y;
1777 	(void)width;
1778 
1779 	ymm0 = ymm1 = ymm2 = ymm3 = _mm256_setzero_si256();
1780 	for (y = 0; y < height; y += 2) {
1781 		ymm0 = _mm256_add_epi32(ymm0, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)src), _mm256_loadu_si256((__m256i*)ref)));
1782 		ymm1 = _mm256_add_epi32(ymm1, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)(src + 32)), _mm256_loadu_si256((__m256i*)(ref + 32))));
1783 		ymm2 = _mm256_add_epi32(ymm2, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)(src + srcStride)), _mm256_loadu_si256((__m256i*)(ref + refStride))));
1784 		ymm3 = _mm256_add_epi32(ymm3, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)(src + srcStride + 32)), _mm256_loadu_si256((__m256i*)(ref + refStride + 32))));
1785 		src += srcStride << 1;
1786 		ref += refStride << 1;
1787 	}
1788 	ymm0 = _mm256_add_epi32(_mm256_add_epi32(ymm0, ymm1), _mm256_add_epi32(ymm2, ymm3));
1789 	xmm0 = _mm_add_epi32(_mm256_castsi256_si128(ymm0), _mm256_extracti128_si256(ymm0, 1));
1790 	xmm0 = _mm_add_epi32(xmm0, _mm_srli_si128(xmm0, 8));
1791 	return _mm_extract_epi32(xmm0, 0);
1792 }
1793 
1794 #ifdef NON_AVX512_SUPPORT
1795 
1796 void GetEightHorizontalSearchPointResults_8x8_16x16_PU_AVX2_INTRIN(
1797     EB_U8   *src,
1798     EB_U32   srcStride,
1799     EB_U8   *ref,
1800     EB_U32   refStride,
1801     EB_U32  *pBestSad8x8,
1802     EB_U32  *pBestMV8x8,
1803     EB_U32  *pBestSad16x16,
1804     EB_U32  *pBestMV16x16,
1805     EB_U32   mv,
1806     EB_U16  *pSad16x16)
1807 {
1808 
1809     EB_S16 xMv, yMv;
1810     __m128i s3;
1811     __m128i sad_0, sad_1, sad_2, sad_3;
1812     __m256i ss0, ss1, ss2, ss3, ss4;
1813     EB_U32 temSum;
1814 
1815 
1816     /*
1817     -------------------------------------   -----------------------------------
1818     | 8x8_00 | 8x8_01 | 8x8_04 | 8x8_05 |   8x8_16 | 8x8_17 | 8x8_20 | 8x8_21 |
1819     -------------------------------------   -----------------------------------
1820     | 8x8_02 | 8x8_03 | 8x8_06 | 8x8_07 |   8x8_18 | 8x8_19 | 8x8_22 | 8x8_23 |
1821     -----------------------   -----------   ----------------------   ----------
1822     | 8x8_08 | 8x8_09 | 8x8_12 | 8x8_13 |   8x8_24 | 8x8_25 | 8x8_28 | 8x8_29 |
1823     ----------------------    -----------   ---------------------    ----------
1824     | 8x8_10 | 8x8_11 | 8x8_14 | 8x8_15 |   8x8_26 | 8x8_27 | 8x8_30 | 8x8_31 |
1825     -------------------------------------   -----------------------------------
1826 
1827     -------------------------------------   -----------------------------------
1828     | 8x8_32 | 8x8_33 | 8x8_36 | 8x8_37 |   8x8_48 | 8x8_49 | 8x8_52 | 8x8_53 |
1829     -------------------------------------   -----------------------------------
1830     | 8x8_34 | 8x8_35 | 8x8_38 | 8x8_39 |   8x8_50 | 8x8_51 | 8x8_54 | 8x8_55 |
1831     -----------------------   -----------   ----------------------   ----------
1832     | 8x8_40 | 8x8_41 | 8x8_44 | 8x8_45 |   8x8_56 | 8x8_57 | 8x8_60 | 8x8_61 |
1833     ----------------------    -----------   ---------------------    ----------
1834     | 8x8_42 | 8x8_43 | 8x8_46 | 8x8_47 |   8x8_58 | 8x8_59 | 8x8_62 | 8x8_63 |
1835     -------------------------------------   -----------------------------------
1836     */
1837 
1838     /*
1839     ----------------------    ----------------------
1840     |  16x16_0  |  16x16_1  |  16x16_4  |  16x16_5  |
1841     ----------------------    ----------------------
1842     |  16x16_2  |  16x16_3  |  16x16_6  |  16x16_7  |
1843     -----------------------   -----------------------
1844     |  16x16_8  |  16x16_9  |  16x16_12 |  16x16_13 |
1845     ----------------------    ----------------------
1846     |  16x16_10 |  16x16_11 |  16x16_14 |  16x16_15 |
1847     -----------------------   -----------------------
1848     */
1849 
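    /* Each 8x8 SAD below is built from four of the block's eight rows (rows
       0, 2, 4 and 6, stepped via 2 * stride), and mpsadbw evaluates eight
       consecutive horizontal reference positions per row.  The subsampled
       sums are doubled (<< 1) before they are compared against the best-SAD
       arrays further down. */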
1850     //8x8_0 & 8x8_1
1851     ss0 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)ref), _mm_loadu_si128((__m128i*)(ref + 2 * refStride)));
1852     ss1 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)(ref + 8)), _mm_loadu_si128((__m128i*)(ref + 2 * refStride + 8)));
1853     ss2 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)src), _mm_loadu_si128((__m128i*)(src + 2 * srcStride)));
1854     ss3 = _mm256_mpsadbw_epu8(ss0, ss2, 0);
1855     ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1856     ss4 = _mm256_mpsadbw_epu8(ss1, ss2, 18);                         // 010 010
1857     ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1858     src += srcStride * 4;
1859     ref += refStride * 4;
1860     ss0 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)ref), _mm_loadu_si128((__m128i*)(ref + 2 * refStride)));
1861     ss1 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)(ref + 8)), _mm_loadu_si128((__m128i*)(ref + 2 * refStride + 8)));
1862     ss2 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)src), _mm_loadu_si128((__m128i*)(src + 2 * srcStride)));
1863     ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1864     ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1865     ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1866     ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1867     sad_0 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1868     sad_1 = _mm_adds_epu16(_mm256_extracti128_si256(ss4, 0), _mm256_extracti128_si256(ss4, 1));
1869 
1870     //8x8_2 & 8x8_3
1871     src += srcStride * 4;
1872     ref += refStride * 4;
1873     ss0 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)ref), _mm_loadu_si128((__m128i*)(ref + 2 * refStride)));
1874     ss1 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)(ref + 8)), _mm_loadu_si128((__m128i*)(ref + 2 * refStride + 8)));
1875     ss2 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)src), _mm_loadu_si128((__m128i*)(src + 2 * srcStride)));
1876     ss3 = _mm256_mpsadbw_epu8(ss0, ss2, 0);
1877     ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1878     ss4 = _mm256_mpsadbw_epu8(ss1, ss2, 18);                         // 010 010
1879     ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1880 
1881     src += srcStride * 4;
1882     ref += refStride * 4;
1883     ss0 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)ref), _mm_loadu_si128((__m128i*)(ref + 2 * refStride)));
1884     ss1 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)(ref + 8)), _mm_loadu_si128((__m128i*)(ref + 2 * refStride + 8)));
1885     ss2 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)src), _mm_loadu_si128((__m128i*)(src + 2 * srcStride)));
1886     ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1887     ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1888     ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1889     ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1890     sad_2 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1891     sad_3 = _mm_adds_epu16(_mm256_extracti128_si256(ss4, 0), _mm256_extracti128_si256(ss4, 1));
1892 
1893     //16x16
1894     s3 = _mm_adds_epu16(_mm_adds_epu16(sad_0, sad_1), _mm_adds_epu16(sad_2, sad_3));
1895     //store the 8 search-point SADs for this 16x16 block (written to pSad16x16)
1896     _mm_store_si128((__m128i*)pSad16x16, s3);
1897     //find the best for 16x16
1898     s3 = _mm_minpos_epu16(s3);
1899     temSum = _mm_extract_epi16(s3, 0) << 1;
1900     if (temSum <  pBestSad16x16[0]) {
1901         pBestSad16x16[0] = temSum;
1902         xMv = _MVXT(mv) + (EB_S16)(_mm_extract_epi16(s3, 1) * 4);
1903         yMv = _MVYT(mv);
1904         pBestMV16x16[0] = ((EB_U16)yMv << 16) | ((EB_U16)xMv);
1905     }
1906 
1907     //find the best for 8x8_0, 8x8_1, 8x8_2 & 8x8_3
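    /* The four 8x8 minpos results (SAD in lane 0, position in lane 1) are
       packed into one vector, the SADs are doubled and the positions scaled
       by 4, then a compare/blend against pBestSad8x8 / pBestMV8x8 updates all
       four blocks at once. */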
1908     sad_0 = _mm_minpos_epu16(sad_0);
1909     sad_1 = _mm_minpos_epu16(sad_1);
1910     sad_2 = _mm_minpos_epu16(sad_2);
1911     sad_3 = _mm_minpos_epu16(sad_3);
1912     sad_0 = _mm_unpacklo_epi16(sad_0, sad_1);
1913     sad_2 = _mm_unpacklo_epi16(sad_2, sad_3);
1914     sad_0 = _mm_unpacklo_epi32(sad_0, sad_2);
1915     sad_1 = _mm_unpackhi_epi16(sad_0, _mm_setzero_si128());
1916     sad_0 = _mm_unpacklo_epi16(sad_0, _mm_setzero_si128());
1917     sad_0 = _mm_slli_epi32(sad_0, 1);
1918     sad_1 = _mm_slli_epi16(sad_1, 2);
1919     sad_2 = _mm_loadu_si128((__m128i*)pBestSad8x8);
1920     s3 = _mm_cmpgt_epi32(sad_2, sad_0);
1921     sad_0 = _mm_min_epu32(sad_0, sad_2);
1922     _mm_storeu_si128((__m128i*)pBestSad8x8, sad_0);
1923     sad_3 = _mm_loadu_si128((__m128i*)pBestMV8x8);
1924     sad_3 = _mm_andnot_si128(s3, sad_3);
1925     sad_2 = _mm_set1_epi32(mv);
1926     sad_2 = _mm_add_epi16(sad_2, sad_1);
1927     sad_2 = _mm_and_si128(sad_2, s3);
1928     sad_2 = _mm_or_si128(sad_2, sad_3);
1929     _mm_storeu_si128((__m128i*)pBestMV8x8, sad_2);
1930 
1931 }
1932 void GetEightHorizontalSearchPointResults_32x32_64x64_PU_AVX2_INTRIN(
1933     EB_U16  *pSad16x16,
1934     EB_U32  *pBestSad32x32,
1935     EB_U32  *pBestSad64x64,
1936     EB_U32  *pBestMV32x32,
1937     EB_U32  *pBestMV64x64,
1938     EB_U32   mv)
1939 {
1940     EB_S16 xMv, yMv;
1941     EB_U32 temSum, bestMV64x64;
1942 
1943     __m128i s0, s1, s2, s3, s4, s5, s6, s7, sad_0, sad_1;
1944     __m128i sad_00, sad_01, sad_10, sad_11, sad_20, sad_21, sad_30, sad_31;
1945     __m256i ss0, ss1, ss2, ss3, ss4, ss5, ss6, ss7;
1946 
1947     s0 = _mm_setzero_si128();
1948     s1 = _mm_setzero_si128();
1949     s2 = _mm_setzero_si128();
1950     s3 = _mm_setzero_si128();
1951     s4 = _mm_setzero_si128();
1952     s5 = _mm_setzero_si128();
1953     s6 = _mm_setzero_si128();
1954     s7 = _mm_setzero_si128();
1955     sad_0 = _mm_setzero_si128();
1956     sad_1 = _mm_setzero_si128();
1957 
1958     sad_00 = _mm_setzero_si128();
1959     sad_01 = _mm_setzero_si128();
1960     sad_10 = _mm_setzero_si128();
1961     sad_11 = _mm_setzero_si128();
1962     sad_20 = _mm_setzero_si128();
1963     sad_21 = _mm_setzero_si128();
1964     sad_30 = _mm_setzero_si128();
1965     sad_31 = _mm_setzero_si128();
1966 
1967     ss0 = _mm256_setzero_si256();
1968     ss1 = _mm256_setzero_si256();
1969     ss2 = _mm256_setzero_si256();
1970     ss3 = _mm256_setzero_si256();
1971     ss4 = _mm256_setzero_si256();
1972     ss5 = _mm256_setzero_si256();
1973     ss6 = _mm256_setzero_si256();
1974     ss7 = _mm256_setzero_si256();
1975 
1976 
1977     /*--------------------
1978     |  32x32_0  |  32x32_1
1979     ----------------------
1980     |  32x32_2  |  32x32_3
1981     ----------------------*/
1982 
1983 
1984     /*  data ordering in pSad16x16 buffer
1985 
1986     Search    Search            Search
1987     Point 0   Point 1           Point 7
1988     ---------------------------------------
1989     16x16_0    |    x    |    x    | ...... |    x    |
1990     ---------------------------------------
1991     16x16_1    |    x    |    x    | ...... |    x    |
1992 
1993     16x16_n    |    x    |    x    | ...... |    x    |
1994 
1995     ---------------------------------------
1996     16x16_15   |    x    |    x    | ...... |    x    |
1997     ---------------------------------------
1998     */
1999 
2001 
2002     //32x32_0
2003     s0 = _mm_loadu_si128((__m128i*)(pSad16x16 + 0 * 8));
2004     s1 = _mm_loadu_si128((__m128i*)(pSad16x16 + 1 * 8));
2005     s2 = _mm_loadu_si128((__m128i*)(pSad16x16 + 2 * 8));
2006     s3 = _mm_loadu_si128((__m128i*)(pSad16x16 + 3 * 8));
2007 
2008     s4 = _mm_unpackhi_epi16(s0, _mm_setzero_si128());
2009     s5 = _mm_unpacklo_epi16(s0, _mm_setzero_si128());
2010     s6 = _mm_unpackhi_epi16(s1, _mm_setzero_si128());
2011     s7 = _mm_unpacklo_epi16(s1, _mm_setzero_si128());
2012     s0 = _mm_add_epi32(s4, s6);
2013     s1 = _mm_add_epi32(s5, s7);
2014 
2015     s4 = _mm_unpackhi_epi16(s2, _mm_setzero_si128());
2016     s5 = _mm_unpacklo_epi16(s2, _mm_setzero_si128());
2017     s6 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
2018     s7 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
2019     s2 = _mm_add_epi32(s4, s6);
2020     s3 = _mm_add_epi32(s5, s7);
2021 
2022     sad_01 = _mm_add_epi32(s0, s2);
2023     sad_00 = _mm_add_epi32(s1, s3);
2024 
2025     //32x32_1
2026     s0 = _mm_loadu_si128((__m128i*)(pSad16x16 + 4 * 8));
2027     s1 = _mm_loadu_si128((__m128i*)(pSad16x16 + 5 * 8));
2028     s2 = _mm_loadu_si128((__m128i*)(pSad16x16 + 6 * 8));
2029     s3 = _mm_loadu_si128((__m128i*)(pSad16x16 + 7 * 8));
2030 
2031     s4 = _mm_unpackhi_epi16(s0, _mm_setzero_si128());
2032     s5 = _mm_unpacklo_epi16(s0, _mm_setzero_si128());
2033     s6 = _mm_unpackhi_epi16(s1, _mm_setzero_si128());
2034     s7 = _mm_unpacklo_epi16(s1, _mm_setzero_si128());
2035     s0 = _mm_add_epi32(s4, s6);
2036     s1 = _mm_add_epi32(s5, s7);
2037 
2038     s4 = _mm_unpackhi_epi16(s2, _mm_setzero_si128());
2039     s5 = _mm_unpacklo_epi16(s2, _mm_setzero_si128());
2040     s6 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
2041     s7 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
2042     s2 = _mm_add_epi32(s4, s6);
2043     s3 = _mm_add_epi32(s5, s7);
2044 
2045     sad_11 = _mm_add_epi32(s0, s2);
2046     sad_10 = _mm_add_epi32(s1, s3);
2047 
2048     //32x32_2
2049     s0 = _mm_loadu_si128((__m128i*)(pSad16x16 + 8 * 8));
2050     s1 = _mm_loadu_si128((__m128i*)(pSad16x16 + 9 * 8));
2051     s2 = _mm_loadu_si128((__m128i*)(pSad16x16 + 10 * 8));
2052     s3 = _mm_loadu_si128((__m128i*)(pSad16x16 + 11 * 8));
2053 
2054     s4 = _mm_unpackhi_epi16(s0, _mm_setzero_si128());
2055     s5 = _mm_unpacklo_epi16(s0, _mm_setzero_si128());
2056     s6 = _mm_unpackhi_epi16(s1, _mm_setzero_si128());
2057     s7 = _mm_unpacklo_epi16(s1, _mm_setzero_si128());
2058     s0 = _mm_add_epi32(s4, s6);
2059     s1 = _mm_add_epi32(s5, s7);
2060 
2061     s4 = _mm_unpackhi_epi16(s2, _mm_setzero_si128());
2062     s5 = _mm_unpacklo_epi16(s2, _mm_setzero_si128());
2063     s6 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
2064     s7 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
2065     s2 = _mm_add_epi32(s4, s6);
2066     s3 = _mm_add_epi32(s5, s7);
2067 
2068     sad_21 = _mm_add_epi32(s0, s2);
2069     sad_20 = _mm_add_epi32(s1, s3);
2070 
2071     //32x32_3
2072     s0 = _mm_loadu_si128((__m128i*)(pSad16x16 + 12 * 8));
2073     s1 = _mm_loadu_si128((__m128i*)(pSad16x16 + 13 * 8));
2074     s2 = _mm_loadu_si128((__m128i*)(pSad16x16 + 14 * 8));
2075     s3 = _mm_loadu_si128((__m128i*)(pSad16x16 + 15 * 8));
2076 
2077     s4 = _mm_unpackhi_epi16(s0, _mm_setzero_si128());
2078     s5 = _mm_unpacklo_epi16(s0, _mm_setzero_si128());
2079     s6 = _mm_unpackhi_epi16(s1, _mm_setzero_si128());
2080     s7 = _mm_unpacklo_epi16(s1, _mm_setzero_si128());
2081     s0 = _mm_add_epi32(s4, s6);
2082     s1 = _mm_add_epi32(s5, s7);
2083 
2084     s4 = _mm_unpackhi_epi16(s2, _mm_setzero_si128());
2085     s5 = _mm_unpacklo_epi16(s2, _mm_setzero_si128());
2086     s6 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
2087     s7 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
2088     s2 = _mm_add_epi32(s4, s6);
2089     s3 = _mm_add_epi32(s5, s7);
2090 
2091     sad_31 = _mm_add_epi32(s0, s2);
2092     sad_30 = _mm_add_epi32(s1, s3);
2093 
2094     sad_0 = _mm_add_epi32(_mm_add_epi32(sad_00, sad_10), _mm_add_epi32(sad_20, sad_30));
2095     sad_1 = _mm_add_epi32(_mm_add_epi32(sad_01, sad_11), _mm_add_epi32(sad_21, sad_31));
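    /* The 16x16 sums in pSad16x16 were produced from every other row, so the
       64x64 totals are doubled here before being compared against
       pBestSad64x64. */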
2096     sad_0 = _mm_slli_epi32(sad_0, 1);
2097     sad_1 = _mm_slli_epi32(sad_1, 1);
2098 
2099 
2100     bestMV64x64 = 0xff;
2101 
2102     //sad_0
2103     temSum = _mm_extract_epi32(sad_0, 0);
2104     if (temSum <= pBestSad64x64[0]) {
2105         pBestSad64x64[0] = temSum;
2106 		bestMV64x64 = 0;
2107     }
2108     temSum = _mm_extract_epi32(sad_0, 1);
2109     if (temSum <= pBestSad64x64[0]) {
2110         pBestSad64x64[0] = temSum;
2111         bestMV64x64 = 1 * 4;
2112     }
2113     temSum = _mm_extract_epi32(sad_0, 2);
2114     if (temSum <= pBestSad64x64[0]) {
2115         pBestSad64x64[0] = temSum;
2116         bestMV64x64 = 2 * 4;
2117     }
2118     temSum = _mm_extract_epi32(sad_0, 3);
2119     if (temSum <= pBestSad64x64[0]) {
2120         pBestSad64x64[0] = temSum;
2121         bestMV64x64 = 3 * 4;
2122     }
2123 
2124     //sad_1
2125     temSum = _mm_extract_epi32(sad_1, 0);
2126     if (temSum <= pBestSad64x64[0]) {
2127         pBestSad64x64[0] = temSum;
2128         bestMV64x64 = 4 * 4;
2129     }
2130     temSum = _mm_extract_epi32(sad_1, 1);
2131     if (temSum <= pBestSad64x64[0]) {
2132         pBestSad64x64[0] = temSum;
2133         bestMV64x64 = 5 * 4;
2134     }
2135     temSum = _mm_extract_epi32(sad_1, 2);
2136     if (temSum <= pBestSad64x64[0]) {
2137         pBestSad64x64[0] = temSum;
2138         bestMV64x64 = 6 * 4;
2139     }
2140     temSum = _mm_extract_epi32(sad_1, 3);
2141     if (temSum <= pBestSad64x64[0]) {
2142         pBestSad64x64[0] = temSum;
2143         bestMV64x64 = 7 * 4;
2144     }
2145 
2146     if (bestMV64x64 != 0xff) {
2147         xMv = _MVXT(mv) + (EB_S16)bestMV64x64;  yMv = _MVYT(mv);
2148         pBestMV64x64[0] = ((EB_U16)yMv << 16) | ((EB_U16)xMv);
2149     }
2150 
2151     // ****CODE PAST HERE IS BUGGY FOR GCC****
2152 
2153     // XY
2154     // X: 32x32 block [0..3]
2155     // Y: Search position [0..7]
2156     ss0 = _mm256_setr_m128i(sad_00, sad_01); // 07 06 05 04  03 02 01 00
2157     ss1 = _mm256_setr_m128i(sad_10, sad_11); // 17 16 15 14  13 12 11 10
2158     ss2 = _mm256_setr_m128i(sad_20, sad_21); // 27 26 25 24  23 22 21 20
2159     ss3 = _mm256_setr_m128i(sad_30, sad_31); // 37 36 35 34  33 32 31 30
2160     ss4 = _mm256_unpacklo_epi32(ss0, ss1);   // 15 05 14 04  11 01 10 00
2161     ss5 = _mm256_unpacklo_epi32(ss2, ss3);   // 35 25 34 24  31 21 30 20
2162     ss6 = _mm256_unpackhi_epi32(ss0, ss1);   // 17 07 16 06  13 03 12 02
2163     ss7 = _mm256_unpackhi_epi32(ss2, ss3);   // 37 27 36 26  33 23 32 22
2164     ss0 = _mm256_unpacklo_epi64(ss4, ss5);   // 34 24 14 04  30 20 10 00
2165     ss1 = _mm256_unpackhi_epi64(ss4, ss5);   // 35 25 15 05  31 21 11 01
2166     ss2 = _mm256_unpacklo_epi64(ss6, ss7);   // 36 26 16 06  32 22 12 02
2167     ss3 = _mm256_unpackhi_epi64(ss6, ss7);   // 37 27 17 07  33 23 13 03
2168 
2169     //   ss4   |  ss5-2  |                ss6        |
2170     // a0 : a1 | a2 : a3 | min(a0, a1) : min(a2, a3) |    | (ss4 & !ss6) | ((ss5-2) & ss6) | ((ss4 & !ss6) | ((ss5-2) & ss6)) |
2171     // > (-1)  | >  (-3) |         >     (-1)        | a3 |       0      |       -3        |              -3                  |
2172     // > (-1)  | >  (-3) |         <=     (0)        | a1 |      -1      |        0        |              -1                  |
2173     // > (-1)  | <= (-2) |         >     (-1)        | a2 |       0      |       -2        |              -2                  |
2174     // > (-1)  | <= (-2) |         <=     (0)        | a1 |      -1      |        0        |              -1                  |
2175     // <= (0)  | >  (-3) |         >     (-1)        | a3 |       0      |       -3        |              -3                  |
2176     // <= (0)  | >  (-3) |         <=     (0)        | a0 |       0      |        0        |               0                  |
2177     // <= (0)  | <= (-2) |         >     (-1)        | a2 |       0      |       -2        |              -2                  |
2178     // <= (0)  | <= (-2) |         <=     (0)        | a0 |       0      |        0        |               0                  |
2179 
2180     // *** 8 search points per position ***
2181 
2182     // ss0: Search Pos 0,4 for blocks 0,1,2,3
2183     // ss1: Search Pos 1,5 for blocks 0,1,2,3
2184     // ss2: Search Pos 2,6 for blocks 0,1,2,3
2185     // ss3: Search Pos 3,7 for blocks 0,1,2,3
2186 
2187     ss4 = _mm256_cmpgt_epi32(ss0, ss1);
2188     // not different printf("%d\n", _mm_extract_epi32(_mm256_extracti128_si256(ss4, 0), 0)); // DEBUG
2189     //ss4 = _mm256_or_si256(_mm256_cmpgt_epi32(ss0, ss1), _mm256_cmpeq_epi32(ss0, ss1));
2190     ss0 = _mm256_min_epi32(ss0, ss1);
2191     ss5 = _mm256_cmpgt_epi32(ss2, ss3);
2192     //ss5 = _mm256_or_si256(_mm256_cmpgt_epi32(ss2, ss3), _mm256_cmpeq_epi32(ss2, ss3));
2193     ss2 = _mm256_min_epi32(ss2, ss3);
2194     ss5 = _mm256_sub_epi32(ss5, _mm256_set1_epi32(2)); // ss5-2
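    // ss5 is 0 or -1 per lane; subtracting 2 maps it to -2 or -3 so that, after the combine
    // below, each lane carries a unique code (0, -1, -2, -3) identifying the winning search
    // position within its group of four (see the truth table above).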
2195 
2196 
2197     // *** 4 search points per position ***
2198     ss6 = _mm256_cmpgt_epi32(ss0, ss2);
2199     //ss6 = _mm256_or_si256(_mm256_cmpgt_epi32(ss0, ss2), _mm256_cmpeq_epi32(ss0, ss2));
2200     ss0 = _mm256_min_epi32(ss0, ss2);
2201     ss5 = _mm256_and_si256(ss5, ss6); // (ss5-2) & ss6
2202     ss4 = _mm256_andnot_si256(ss6, ss4); // ss4 & !ss6
2203     ss4 = _mm256_or_si256(ss4, ss5); // (ss4 & !ss6) | ((ss5-2) & ss6)
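    // At this point ss4 holds, per lane, the negated index (0..-3) of the search position
    // whose minimum SAD is now stored in ss0.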
2204 
2205     // *** 2 search points per position ***
2206     s0 = _mm_setzero_si128();
2207     s1 = _mm_setzero_si128();
2208     s2 = _mm_setzero_si128();
2209     s3 = _mm_setzero_si128();
2210     s4 = _mm_setzero_si128();
2211     s5 = _mm_setzero_si128();
2212     s6 = _mm_setzero_si128();
2213     s7 = _mm_setzero_si128();
2214 
2215     // ss0 = 8 SADs, two search points for each 32x32
2216     // ss4 = 8 MVs, two search points for each 32x32
2217     //
2218     // XY
2219     // X: 32x32 block [0..3]
2220     // Y: search position [0..1]
2221     // Format: 00 10 20 30  01 11 21 31
2222 
2223     // Each 128 bits contains 4 32x32 32bit block results
2224 #ifdef __GNUC__
2225     // SAD
2226     s0 = _mm256_extracti128_si256(ss0, 1);
2227     s1 = _mm256_extracti128_si256(ss0, 0);
2228     // MV
2229     s2 = _mm256_extracti128_si256(ss4, 1);
2230     s3 = _mm256_extracti128_si256(ss4, 0);
2231 #else
2232     // SAD
2233     s0 = _mm256_extracti128_si256(ss0, 0);
2234     s1 = _mm256_extracti128_si256(ss0, 1);
2235     // MV
2236     s2 = _mm256_extracti128_si256(ss4, 0);
2237     s3 = _mm256_extracti128_si256(ss4, 1);
2238 #endif
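    // The two branches above differ only in which 128-bit lane is treated as the low half;
    // this works around the GCC/MSVC result mismatch noted earlier.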
2239 
2240     //// Should be fine
2241     //printf("sad0 %d, %d, %d, %d\n", _mm_extract_epi32(s0, 0), _mm_extract_epi32(s0, 1), _mm_extract_epi32(s0, 2), _mm_extract_epi32(s0, 3)); // DEBUG
2242     //printf("sad1 %d, %d, %d, %d\n", _mm_extract_epi32(s1, 0), _mm_extract_epi32(s1, 1), _mm_extract_epi32(s1, 2), _mm_extract_epi32(s1, 3)); // DEBUG
2243     //printf("mv0 %d, %d, %d, %d\n", _mm_extract_epi32(s2, 0), _mm_extract_epi32(s2, 1), _mm_extract_epi32(s2, 2), _mm_extract_epi32(s2, 3)); // DEBUG
2244     //printf("mv1 %d, %d, %d, %d\n", _mm_extract_epi32(s3, 0), _mm_extract_epi32(s3, 1), _mm_extract_epi32(s3, 2), _mm_extract_epi32(s3, 3)); // DEBUG
2245 
2246 
2247     // Choose the better of the two search points per block; s4 holds the comparison mask used to select the matching MVs below
2248     s4 = _mm_cmpgt_epi32(s0, s1);
2249 
2250     // DIFFERENT BETWEEN VS AND GCC
2251     // printf("%d, %d, %d, %d\n", _mm_extract_epi32(s4, 0), _mm_extract_epi32(s4, 1), _mm_extract_epi32(s4, 2), _mm_extract_epi32(s4, 3)); // DEBUG
2252 
2253     //s4 = _mm_or_si128(_mm_cmpgt_epi32(s0, s1), _mm_cmpeq_epi32(s0, s1));
2254     s0 = _mm_min_epi32(s0, s1);
2255 
2256 
2257 
2258     // Extract the per-block MVs into s2
2259     s3 = _mm_sub_epi32(s3, _mm_set1_epi32(4)); // s3-4
2260     // Remove the MVs that are not used from s2
2261     s2 = _mm_andnot_si128(s4, s2);
2262     // Remove the MVs that are not used from s3 (the inverse of the s2 operation above)
2263     s3 = _mm_and_si128(s4, s3);
2264     // Combine the remaining candidates into s2
2265     s2 = _mm_or_si128(s2, s3);
2266     // Convert the MVs into the encoder's format
2267     s2 = _mm_sub_epi32(_mm_setzero_si128(), s2);
2268     s2 = _mm_slli_epi32(s2, 2); // mv info
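    // Illustrative example: a winning lane value of -3 (search position 3) becomes 3 after the
    // subtract from zero, then 12 after << 2, matching the k * 4 offsets used in the 64x64 updates above.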
2269 
2270 
2271     // ***SAD***
2272     // s0: current SAD candidates for each 32x32
2273     // s1: best SADs for each 32x32
2274 
2275     // << 1 to compensate for computing the SAD on every other line
2276     s0 = _mm_slli_epi32(s0, 1); // best sad info
2277     // Load the best SADs
2278     s1 = _mm_loadu_si128((__m128i*)pBestSad32x32);
2279 
2280     // Determine which candidates are better than the current best SADs.
2281     // s4 is used to determine the MVs of the new best SADs
2282     s4 = _mm_cmpgt_epi32(s1, s0);
2283     // not different printf("%d, %d, %d, %d\n", _mm_extract_epi32(s4, 0), _mm_extract_epi32(s4, 1), _mm_extract_epi32(s4, 2), _mm_extract_epi32(s4, 3)); // DEBUG
2284     //s4 = _mm_or_si128(_mm_cmpgt_epi32(s1, s0), _mm_cmpeq_epi32(s1, s0));
2285     // Combine the old and new minimum SADs
2286     s0 = _mm_min_epu32(s0, s1);
2287     // Store the new best SADs back to memory
2288     _mm_storeu_si128((__m128i*)pBestSad32x32, s0);
2289 
2290 
2291     // ***Motion Vectors***
2292     // Load the best MVs
2293     // s3: candidate MVs
2294     // s4: results of comparing the SADs
2295     // s5: previous best MVs
2296 
2297     // Load the previous best MVs
2298     s5 = _mm_loadu_si128((__m128i*)pBestMV32x32);
2299     // Remove the MVs that are being replaced
2300     s5 = _mm_andnot_si128(s4, s5);
2301     // Set s3 to the base MV
2302     s3 = _mm_set1_epi32(mv);
2303     // Add the candidate MVs to the base MV
2304     s3 = _mm_add_epi16(s3, s2);
2305     // Remove the non-candidates
2306     s3 = _mm_and_si128(s3, s4);
2307     // Combine the remaining candidates with the remaining best MVs
2308     s3 = _mm_or_si128(s3, s5);
2309     // Store back to memory
2310     _mm_storeu_si128((__m128i*)pBestMV32x32, s3);
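    // The cmpgt/min followed by andnot/and/or sequence above is the branchless SIMD form of
    //   if (candidateSad < bestSad) { bestSad = candidateSad; bestMv = baseMv + candidateMvOffset; }
    // applied to all four 32x32 blocks at once.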
2311 }
2312 
2313 #endif
2314