1 /*
2 * Copyright(c) 2019 Intel Corporation
3 * SPDX-License-Identifier: BSD-2-Clause-Patent
4 */
5 
6 #include "EbComputeSAD_AVX2.h"
7 #include <stdint.h>
8 #include "EbMemory_AVX2.h"
9 #include <immintrin.h>
10 
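/*
 * UPDATE_BEST(s, k, offset): extract the k-th 32-bit candidate SAD from
 * vector s and, when it improves on low_sum, record it together with its
 * search position (j + offset + k, i).
 */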
11 #define UPDATE_BEST(s, k, offset) \
12   tem_sum1 = _mm_extract_epi32(s, k); \
13   if (tem_sum1 < low_sum) { \
14     low_sum = tem_sum1; \
15     x_best = j + offset + k; \
16     y_best = i; \
17   }
18 
19 /*******************************************************************************
20  * Requirement: width   = 4, 8, 16, 24, 32, 48 or 64
21  * Requirement: height <= 64
22  * Requirement: height % 2 = 0 when width = 4 or 8
23 *******************************************************************************/
24 void eb_vp9_sad_loop_kernel_avx2_intrin(
25     uint8_t  *src,                             // input parameter, source samples Ptr
26     uint32_t  src_stride,                      // input parameter, source stride
27     uint8_t  *ref,                             // input parameter, reference samples Ptr
28     uint32_t  ref_stride,                      // input parameter, reference stride
29     uint32_t  height,                          // input parameter, block height (M)
30     uint32_t  width,                           // input parameter, block width (N)
31     uint64_t *best_sad,                        // output parameter, lowest SAD found over the search area
32     int16_t *x_search_center,                  // in/out parameter, horizontal offset of the best match
33     int16_t *y_search_center,                  // in/out parameter, vertical offset of the best match
34     uint32_t  src_stride_raw,                  // input parameter, stride used to step the search region to the next row (no line skipping)
35     int16_t  search_area_width,
36     int16_t  search_area_height)
37 {
38   int16_t x_best = *x_search_center, y_best = *y_search_center;
39   uint32_t low_sum = 0xffffff;
40   uint32_t tem_sum1 = 0;
41   int16_t i, j;
42   uint32_t k, l;
43   uint32_t leftover = search_area_width & 7;
44   const uint8_t *p_ref, *p_src;
45   __m128i s0, s1, s2, s3, s4, s5, s6, s8 = _mm_set1_epi32(-1);
46   __m256i ss0, ss1, ss2, ss3, ss4, ss5, ss6, ss7, ss8;
47 
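  /*
   * When search_area_width is not a multiple of 8, build a mask whose low
   * 'leftover' 16-bit lanes are 0 and whose remaining lanes are 0xFFFF.
   * OR-ing the packed candidate SADs with s8 before _mm_minpos_epu16() pushes
   * the out-of-range candidates to the maximum so they can never win.
   */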
48   if (leftover) {
49     for (k=0; k<leftover; k++) {
50       s8 = _mm_slli_si128(s8, 2);
51     }
52   }
53 
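  /*
   * Each (V)MPSADBW computes, for 8 consecutive horizontal candidate
   * positions, the SAD of one 4-byte source group against the reference row.
   * The immediate's bits [1:0] (bits [4:3] for the upper lane of the 256-bit
   * form) select the 32-bit source group and bit [2] (bit [5]) adds a 4-byte
   * offset into the reference, e.g. 45 = 0b101101 and 18 = 0b010010 below.
   * The per-candidate 16-bit sums are accumulated over groups and rows and
   * reduced with _mm_minpos_epu16() to find the best of the 8 positions.
   */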
54   switch (width) {
55   case 4:
56 
57     if (!(height % 4)) {
58       uint32_t srcStrideT = 3 * src_stride;
59       uint32_t refStrideT = 3 * ref_stride;
60       for (i=0; i<search_area_height; i++) {
61         for (j=0; j<=search_area_width-8; j+=8) {
62           p_src = src;
63           p_ref = ref + j;
64           ss3 = ss5 = _mm256_setzero_si256();
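          /*
           * Process four rows per iteration: two 4-byte source rows are packed
           * into each 128-bit lane of ss2, while ss0 holds reference rows 0/2
           * and ss1 holds reference rows 1/3.
           */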
65           for (k=0; k<height; k+=4) {
66             ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)p_ref)), _mm_loadu_si128((__m128i*)(p_ref + 2 * ref_stride)), 0x1);
67             ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(p_ref + ref_stride))), _mm_loadu_si128((__m128i*)(p_ref + refStrideT)), 0x1);
68             ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi64(_mm_cvtsi32_si128(*(uint32_t *)p_src), _mm_cvtsi32_si128(*(uint32_t *)(p_src + src_stride)))), _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(uint32_t *)(p_src + 2 * src_stride)), _mm_cvtsi32_si128(*(uint32_t *)(p_src + srcStrideT))), 0x1);
69             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
70             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
71             p_src += src_stride << 2;
72             p_ref += ref_stride << 2;
73           }
74           ss3 = _mm256_adds_epu16(ss3, ss5);
75           s3 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
76           s3 = _mm_minpos_epu16(s3);
77           tem_sum1 = _mm_extract_epi16(s3, 0);
78           if (tem_sum1 < low_sum) {
79             low_sum = tem_sum1;
80             x_best = (int16_t)(j + _mm_extract_epi16(s3, 1));
81             y_best = i;
82           }
83         }
84 
85         if (leftover) {
86           p_src = src;
87           p_ref = ref + j;
88           ss3 = ss5 = _mm256_setzero_si256();
89           for (k=0; k<height; k+=4) {
90             ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)p_ref)), _mm_loadu_si128((__m128i*)(p_ref + 2 * ref_stride)), 0x1);
91             ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(p_ref + ref_stride))), _mm_loadu_si128((__m128i*)(p_ref + refStrideT)), 0x1);
92             ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi64(_mm_cvtsi32_si128(*(uint32_t *)p_src), _mm_cvtsi32_si128(*(uint32_t *)(p_src + src_stride)))), _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(uint32_t *)(p_src + 2 * src_stride)), _mm_cvtsi32_si128(*(uint32_t *)(p_src + srcStrideT))), 0x1);
93             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
94             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
95             p_src += src_stride << 2;
96             p_ref += ref_stride << 2;
97           }
98           ss3 = _mm256_adds_epu16(ss3, ss5);
99           s3 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
100           s3 = _mm_or_si128(s3, s8);
101           s3 = _mm_minpos_epu16(s3);
102           tem_sum1 = _mm_extract_epi16(s3, 0);
103           if (tem_sum1 < low_sum) {
104             low_sum = tem_sum1;
105             x_best = (int16_t)(j + _mm_extract_epi16(s3, 1));
106             y_best = i;
107           }
108         }
109         ref += src_stride_raw;
110       }
111     }
112     else {
113       for (i=0; i<search_area_height; i++) {
114         for (j=0; j<=search_area_width-8; j+=8) {
115           p_src = src;
116           p_ref = ref + j;
117           s3 = _mm_setzero_si128();
118           for (k=0; k<height; k+=2) {
119             s0 = _mm_loadu_si128((__m128i*)p_ref);
120             s1 = _mm_loadu_si128((__m128i*)(p_ref+ref_stride));
121             s2 = _mm_cvtsi32_si128(*(uint32_t *)p_src);
122             s5 = _mm_cvtsi32_si128(*(uint32_t *)(p_src+src_stride));
123             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
124             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
125             p_src += src_stride << 1;
126             p_ref += ref_stride << 1;
127           }
128           s3 = _mm_minpos_epu16(s3);
129           tem_sum1 = _mm_extract_epi16(s3, 0);
130           if (tem_sum1 < low_sum) {
131             low_sum = tem_sum1;
132             x_best = (int16_t)(j + _mm_extract_epi16(s3, 1));
133             y_best = i;
134           }
135         }
136 
137         if (leftover) {
138           p_src = src;
139           p_ref = ref + j;
140           s3 = _mm_setzero_si128();
141           for (k=0; k<height; k+=2) {
142             s0 = _mm_loadu_si128((__m128i*)p_ref);
143             s1 = _mm_loadu_si128((__m128i*)(p_ref+ref_stride));
144             s2 = _mm_cvtsi32_si128(*(uint32_t *)p_src);
145             s5 = _mm_cvtsi32_si128(*(uint32_t *)(p_src+src_stride));
146             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
147             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
148             p_src += src_stride << 1;
149             p_ref += ref_stride << 1;
150           }
151           s3 = _mm_or_si128(s3, s8);
152           s3 = _mm_minpos_epu16(s3);
153           tem_sum1 = _mm_extract_epi16(s3, 0);
154           if (tem_sum1 < low_sum) {
155             low_sum = tem_sum1;
156             x_best = (int16_t)(j + _mm_extract_epi16(s3, 1));
157             y_best = i;
158           }
159         }
160         ref += src_stride_raw;
161       }
162     }
163 
164     break;
165 
166   case 8:
167     if (!(height % 4)) {
168       uint32_t srcStrideT = 3 * src_stride;
169       uint32_t refStrideT = 3 * ref_stride;
170       for (i=0; i<search_area_height; i++) {
171         for (j=0; j<=search_area_width-8; j+=8) {
172           p_src = src;
173           p_ref = ref + j;
174           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
175           for (k=0; k<height; k+=4) {
176             ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)p_ref)), _mm_loadu_si128((__m128i*)(p_ref + 2 * ref_stride)), 0x1);
177             ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(p_ref + ref_stride))), _mm_loadu_si128((__m128i*)(p_ref + refStrideT)), 0x1);
178             ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)p_src), _mm_loadl_epi64((__m128i*)(p_src + src_stride)))), _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)(p_src + 2 * src_stride)), _mm_loadl_epi64((__m128i*)(p_src + srcStrideT))), 0x1);
179             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
180             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
181             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
182             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
183             p_src += src_stride << 2;
184             p_ref += ref_stride << 2;
185           }
186           ss3 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
187           s3 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
188           s3 = _mm_minpos_epu16(s3);
189           tem_sum1 = _mm_extract_epi16(s3, 0);
190           if (tem_sum1 < low_sum) {
191             low_sum = tem_sum1;
192             x_best = (int16_t)(j + _mm_extract_epi16(s3, 1));
193             y_best = i;
194           }
195         }
196 
197         if (leftover) {
198           p_src = src;
199           p_ref = ref + j;
200           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
201           for (k=0; k<height; k+=4) {
202             ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)p_ref)),             _mm_loadu_si128((__m128i*)(p_ref+2*ref_stride)), 0x1);
203             ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(p_ref + ref_stride))), _mm_loadu_si128((__m128i*)(p_ref + refStrideT)), 0x1);
204             ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)p_src), _mm_loadl_epi64((__m128i*)(p_src + src_stride)))), _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)(p_src + 2 * src_stride)), _mm_loadl_epi64((__m128i*)(p_src + srcStrideT))), 0x1);
205             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
206             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
207             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
208             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
209             p_src += src_stride << 2;
210             p_ref += ref_stride << 2;
211           }
212           ss3 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
213           s3 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
214           s3 = _mm_or_si128(s3, s8);
215           s3 = _mm_minpos_epu16(s3);
216           tem_sum1 = _mm_extract_epi16(s3, 0);
217           if (tem_sum1 < low_sum) {
218             low_sum = tem_sum1;
219             x_best = (int16_t)(j + _mm_extract_epi16(s3, 1));
220             y_best = i;
221           }
222         }
223         ref += src_stride_raw;
224       }
225     }
226     else {
227       for (i=0; i<search_area_height; i++) {
228         for (j=0; j<=search_area_width-8; j+=8) {
229           p_src = src;
230           p_ref = ref + j;
231           s3 = s4 = _mm_setzero_si128();
232           for (k=0; k<height; k+=2) {
233             s0 = _mm_loadu_si128((__m128i*)p_ref);
234             s1 = _mm_loadu_si128((__m128i*)(p_ref+ref_stride));
235             s2 = _mm_loadl_epi64((__m128i*)p_src);
236             s5 = _mm_loadl_epi64((__m128i*)(p_src+src_stride));
237             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
238             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
239             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
240             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));
241             p_src += src_stride << 1;
242             p_ref += ref_stride << 1;
243           }
244           s3 = _mm_adds_epu16(s3, s4);
245           s3 = _mm_minpos_epu16(s3);
246           tem_sum1 = _mm_extract_epi16(s3, 0);
247           if (tem_sum1 < low_sum) {
248             low_sum = tem_sum1;
249             x_best = (int16_t)(j + _mm_extract_epi16(s3, 1));
250             y_best = i;
251           }
252         }
253 
254         if (leftover) {
255           p_src = src;
256           p_ref = ref + j;
257           s3 = s4 = _mm_setzero_si128();
258           for (k=0; k<height; k+=2) {
259             s0 = _mm_loadu_si128((__m128i*)p_ref);
260             s1 = _mm_loadu_si128((__m128i*)(p_ref+ref_stride));
261             s2 = _mm_loadl_epi64((__m128i*)p_src);
262             s5 = _mm_loadl_epi64((__m128i*)(p_src+src_stride));
263             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
264             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
265             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
266             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));
267             p_src += src_stride << 1;
268             p_ref += ref_stride << 1;
269           }
270           s3 = _mm_adds_epu16(s3, s4);
271           s3 = _mm_or_si128(s3, s8);
272           s3 = _mm_minpos_epu16(s3);
273           tem_sum1 = _mm_extract_epi16(s3, 0);
274           if (tem_sum1 < low_sum) {
275             low_sum = tem_sum1;
276             x_best = (int16_t)(j + _mm_extract_epi16(s3, 1));
277             y_best = i;
278           }
279         }
280         ref += src_stride_raw;
281       }
282     }
283     break;
284 
285   case 16:
286     if (height <= 16) {
287       for (i=0; i<search_area_height; i++) {
288         for (j=0; j<=search_area_width-8; j+=8) {
289           p_src = src;
290           p_ref = ref + j;
291           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
292           for (k=0; k<height; k+=2) {
293             ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)p_ref)), _mm_loadu_si128((__m128i*)(p_ref + ref_stride)), 0x1);
294             ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(p_ref + 8))), _mm_loadu_si128((__m128i*)(p_ref + ref_stride + 8)), 0x1);
295             ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)p_src)), _mm_loadu_si128((__m128i*)(p_src + src_stride)), 0x1);
296             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
297             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
298             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
299             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
300             p_src += 2 * src_stride;
301             p_ref += 2 * ref_stride;
302           }
303           ss3 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
304           s3 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
305           s3 = _mm_minpos_epu16(s3);
306           tem_sum1 = _mm_extract_epi16(s3, 0);
307           if (tem_sum1 < low_sum) {
308             low_sum = tem_sum1;
309             x_best = (int16_t)(j + _mm_extract_epi16(s3, 1));
310             y_best = i;
311           }
312         }
313 
314         if (leftover) {
315           p_src = src;
316           p_ref = ref + j;
317           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
318           for (k=0; k<height; k+=2) {
319             ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)p_ref)), _mm_loadu_si128((__m128i*)(p_ref + ref_stride)), 0x1);
320             ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(p_ref + 8))), _mm_loadu_si128((__m128i*)(p_ref + ref_stride + 8)), 0x1);
321             ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)p_src)), _mm_loadu_si128((__m128i*)(p_src + src_stride)), 0x1);
322             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
323             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
324             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
325             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
326             p_src += 2 * src_stride;
327             p_ref += 2 * ref_stride;
328           }
329           ss3 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
330           s3 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
331           s3 = _mm_or_si128(s3, s8);
332           s3 = _mm_minpos_epu16(s3);
333           tem_sum1 = _mm_extract_epi16(s3, 0);
334           if (tem_sum1 < low_sum) {
335             low_sum = tem_sum1;
336             x_best = (int16_t)(j + _mm_extract_epi16(s3, 1));
337             y_best = i;
338           }
339         }
340         ref += src_stride_raw;
341       }
342     }
343     else if (height <= 32) {
344       for (i=0; i<search_area_height; i++) {
345         for (j=0; j<=search_area_width-8; j+=8) {
346           p_src = src;
347           p_ref = ref + j;
348           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
349           for (k=0; k<height; k+=2) {
350             ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)p_ref)), _mm_loadu_si128((__m128i*)(p_ref + ref_stride)), 0x1);
351             ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(p_ref + 8))), _mm_loadu_si128((__m128i*)(p_ref + ref_stride + 8)), 0x1);
352             ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)p_src)), _mm_loadu_si128((__m128i*)(p_src + src_stride)), 0x1);
353             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
354             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
355             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
356             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
357             p_src += 2 * src_stride;
358             p_ref += 2 * ref_stride;
359           }
360           ss3 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
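          /*
           * The two 128-bit lanes hold the even-row and odd-row accumulations;
           * adding them directly could saturate at 16 bits. Take each lane's
           * minimum first, add only the residuals, and recover the winning SAD
           * as the residual minimum plus both lane minima.
           */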
361           s3 = _mm256_extracti128_si256(ss3, 0);
362           s5 = _mm256_extracti128_si256(ss3, 1);
363           s4 = _mm_minpos_epu16(s3);
364           s6 = _mm_minpos_epu16(s5);
365           s4 = _mm_unpacklo_epi16(s4, s4);
366           s4 = _mm_unpacklo_epi32(s4, s4);
367           s4 = _mm_unpacklo_epi64(s4, s4);
368           s6 = _mm_unpacklo_epi16(s6, s6);
369           s6 = _mm_unpacklo_epi32(s6, s6);
370           s6 = _mm_unpacklo_epi64(s6, s6);
371           s3 = _mm_sub_epi16(s3, s4);
372           s5 = _mm_adds_epu16(s5, s3);
373           s5 = _mm_sub_epi16(s5, s6);
374           s5 = _mm_minpos_epu16(s5);
375           tem_sum1  = _mm_extract_epi16(s5, 0);
376           tem_sum1 += _mm_extract_epi16(s4, 0);
377           tem_sum1 += _mm_extract_epi16(s6, 0);
378           if (tem_sum1 < low_sum) {
379             low_sum = tem_sum1;
380             x_best = (int16_t)(j + _mm_extract_epi16(s5, 1));
381             y_best = i;
382           }
383         }
384 
385         if (leftover) {
386           p_src = src;
387           p_ref = ref + j;
388           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
389           for (k=0; k<height; k+=2) {
390             ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)p_ref)), _mm_loadu_si128((__m128i*)(p_ref + ref_stride)), 0x1);
391             ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(p_ref + 8))), _mm_loadu_si128((__m128i*)(p_ref + ref_stride + 8)), 0x1);
392             ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)p_src)), _mm_loadu_si128((__m128i*)(p_src + src_stride)), 0x1);
393             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
394             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
395             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
396             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
397             p_src += 2 * src_stride;
398             p_ref += 2 * ref_stride;
399           }
400           ss3 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
401           s3 = _mm256_extracti128_si256(ss3, 0);
402           s5 = _mm256_extracti128_si256(ss3, 1);
403           s3 = _mm_or_si128(s3, s8);
404           s5 = _mm_or_si128(s5, s8);
405           s4 = _mm_minpos_epu16(s3);
406           s6 = _mm_minpos_epu16(s5);
407           s4 = _mm_unpacklo_epi16(s4, s4);
408           s4 = _mm_unpacklo_epi32(s4, s4);
409           s4 = _mm_unpacklo_epi64(s4, s4);
410           s6 = _mm_unpacklo_epi16(s6, s6);
411           s6 = _mm_unpacklo_epi32(s6, s6);
412           s6 = _mm_unpacklo_epi64(s6, s6);
413           s3 = _mm_sub_epi16(s3, s4);
414           s5 = _mm_adds_epu16(s5, s3);
415           s5 = _mm_sub_epi16(s5, s6);
416           s5 = _mm_minpos_epu16(s5);
417           tem_sum1  = _mm_extract_epi16(s5, 0);
418           tem_sum1 += _mm_extract_epi16(s4, 0);
419           tem_sum1 += _mm_extract_epi16(s6, 0);
420           if (tem_sum1 < low_sum) {
421             low_sum = tem_sum1;
422             x_best = (int16_t)(j + _mm_extract_epi16(s5, 1));
423             y_best = i;
424           }
425         }
426         ref += src_stride_raw;
427       }
428     }
429     else {
430       for (i=0; i<search_area_height; i++) {
431         for (j=0; j<=search_area_width-8; j+=8) {
432           p_src = src;
433           p_ref = ref + j;
434           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
435           for (k=0; k<height; k+=2) {
436             ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)p_ref)), _mm_loadu_si128((__m128i*)(p_ref + ref_stride)), 0x1);
437             ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(p_ref + 8))), _mm_loadu_si128((__m128i*)(p_ref + ref_stride + 8)), 0x1);
438             ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)p_src)), _mm_loadu_si128((__m128i*)(p_src + src_stride)), 0x1);
439             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
440             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
441             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
442             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
443             p_src += 2 * src_stride;
444             p_ref += 2 * ref_stride;
445           }
446           ss3 = _mm256_adds_epu16(ss3, ss4);
447           ss5 = _mm256_adds_epu16(ss5, ss6);
448           ss0 = _mm256_adds_epu16(ss3, ss5);
449           s0 = _mm_adds_epu16(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
450           s0 = _mm_minpos_epu16(s0);
451           tem_sum1 = _mm_extract_epi16(s0, 0);
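          /*
           * The 16-bit accumulators saturate, so a minimum of 0xFFFF is
           * ambiguous: in that case widen the candidate sums to 32 bits and
           * re-check all eight positions with UPDATE_BEST.
           */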
452           if (tem_sum1 < low_sum) {
453             if (tem_sum1 != 0xFFFF) { // no overflow
454               low_sum = tem_sum1;
455               x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
456               y_best = i;
457             }
458             else {
459               ss4 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
460               ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
461               ss6 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
462               ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
463               ss4 = _mm256_add_epi32(ss4, ss6);
464               ss3 = _mm256_add_epi32(ss3, ss5);
465               s0 = _mm_add_epi32(_mm256_extracti128_si256(ss4, 0), _mm256_extracti128_si256(ss4, 1));
466               s3 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
467               UPDATE_BEST(s0, 0, 0);
468               UPDATE_BEST(s0, 1, 0);
469               UPDATE_BEST(s0, 2, 0);
470               UPDATE_BEST(s0, 3, 0);
471               UPDATE_BEST(s3, 0, 4);
472               UPDATE_BEST(s3, 1, 4);
473               UPDATE_BEST(s3, 2, 4);
474               UPDATE_BEST(s3, 3, 4);
475             }
476           }
477         }
478 
479         if (leftover) {
480           p_src = src;
481           p_ref = ref + j;
482           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
483           for (k=0; k<height; k+=2) {
484             ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)p_ref)), _mm_loadu_si128((__m128i*)(p_ref + ref_stride)), 0x1);
485             ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(p_ref + 8))), _mm_loadu_si128((__m128i*)(p_ref + ref_stride + 8)), 0x1);
486             ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)p_src)), _mm_loadu_si128((__m128i*)(p_src + src_stride)), 0x1);
487             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
488             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
489             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
490             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
491             p_src += 2 * src_stride;
492             p_ref += 2 * ref_stride;
493           }
494           ss3 = _mm256_adds_epu16(ss3, ss4);
495           ss5 = _mm256_adds_epu16(ss5, ss6);
496           ss0 = _mm256_adds_epu16(ss3, ss5);
497           s0 = _mm_adds_epu16(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
498           s0 = _mm_or_si128(s0, s8);
499           s0 = _mm_minpos_epu16(s0);
500           tem_sum1 = _mm_extract_epi16(s0, 0);
501           if (tem_sum1 < low_sum) {
502             if (tem_sum1 != 0xFFFF) { // no overflow
503               low_sum = tem_sum1;
504               x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
505               y_best = i;
506             }
507             else {
508               ss4 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
509               ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
510               ss6 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
511               ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
512               ss4 = _mm256_add_epi32(ss4, ss6);
513               ss3 = _mm256_add_epi32(ss3, ss5);
514               s0 = _mm_add_epi32(_mm256_extracti128_si256(ss4, 0), _mm256_extracti128_si256(ss4, 1));
515               s3 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
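              /*
               * Walk the leftover candidates one widened 32-bit sum at a time;
               * the first four come from s0, the rest from s3.
               */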
516               k = leftover;
517               while (k > 0) {
518                 for (l=0; l < 4 && k; l++, k--) {
519                   tem_sum1 = _mm_extract_epi32(s0, 0);
520                   s0 = _mm_srli_si128(s0, 4);
521                   if (tem_sum1 < low_sum) {
522                     low_sum = tem_sum1;
523                     x_best = (int16_t)(j + leftover - k);
524                     y_best = i;
525                   }
526                 }
527                 s0 = s3;
528               }
529             }
530           }
531         }
532         ref += src_stride_raw;
533       }
534     }
535     break;
536 
537   case 24:
538     if (height <= 16) {
539       for (i=0; i<search_area_height; i++) {
540         for (j=0; j<=search_area_width-8; j+=8) {
541           p_src = src;
542           p_ref = ref + j;
543           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
544           for (k=0; k<height; k++) {
545             ss0 = _mm256_loadu_si256((__m256i*)p_ref);
546             ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
547             ss2 = _mm256_loadu_si256((__m256i *)p_src);
548             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
549             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
550             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
551             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
552             p_src += src_stride;
553             p_ref += ref_stride;
554           }
555           ss3 = _mm256_adds_epu16(ss3, ss4);
556           ss5 = _mm256_adds_epu16(ss5, ss6);
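          /*
           * Width 24: both lanes of ss3/ss4 cover valid columns, but the upper
           * lane of ss5/ss6 depends on source bytes past column 23, so only
           * its lower 128 bits are kept.
           */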
557           s3 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
558           s5 = _mm256_extracti128_si256(ss5, 0);
559           s4 = _mm_minpos_epu16(s3);
560           s6 = _mm_minpos_epu16(s5);
561           s4 = _mm_unpacklo_epi16(s4, s4);
562           s4 = _mm_unpacklo_epi32(s4, s4);
563           s4 = _mm_unpacklo_epi64(s4, s4);
564           s6 = _mm_unpacklo_epi16(s6, s6);
565           s6 = _mm_unpacklo_epi32(s6, s6);
566           s6 = _mm_unpacklo_epi64(s6, s6);
567           s3 = _mm_sub_epi16(s3, s4);
568           s5 = _mm_adds_epu16(s5, s3);
569           s5 = _mm_sub_epi16(s5, s6);
570           s5 = _mm_minpos_epu16(s5);
571           tem_sum1  = _mm_extract_epi16(s5, 0);
572           tem_sum1 += _mm_extract_epi16(s4, 0);
573           tem_sum1 += _mm_extract_epi16(s6, 0);
574           if (tem_sum1 < low_sum) {
575             low_sum = tem_sum1;
576             x_best = (int16_t)(j + _mm_extract_epi16(s5, 1));
577             y_best = i;
578           }
579         }
580 
581         if (leftover) {
582           p_src = src;
583           p_ref = ref + j;
584           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
585           for (k=0; k<height; k++) {
586             ss0 = _mm256_loadu_si256((__m256i*)p_ref);
587             ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
588             ss2 = _mm256_loadu_si256((__m256i *)p_src);
589             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
590             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
591             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
592             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
593             p_src += src_stride;
594             p_ref += ref_stride;
595           }
596           ss3 = _mm256_adds_epu16(ss3, ss4);
597           ss5 = _mm256_adds_epu16(ss5, ss6);
598           s3 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
599           s5 = _mm256_extracti128_si256(ss5, 0);
600           s3 = _mm_or_si128(s3, s8);
601           s5 = _mm_or_si128(s5, s8);
602           s4 = _mm_minpos_epu16(s3);
603           s6 = _mm_minpos_epu16(s5);
604           s4 = _mm_unpacklo_epi16(s4, s4);
605           s4 = _mm_unpacklo_epi32(s4, s4);
606           s4 = _mm_unpacklo_epi64(s4, s4);
607           s6 = _mm_unpacklo_epi16(s6, s6);
608           s6 = _mm_unpacklo_epi32(s6, s6);
609           s6 = _mm_unpacklo_epi64(s6, s6);
610           s3 = _mm_sub_epi16(s3, s4);
611           s5 = _mm_adds_epu16(s5, s3);
612           s5 = _mm_sub_epi16(s5, s6);
613           s5 = _mm_minpos_epu16(s5);
614           tem_sum1  = _mm_extract_epi16(s5, 0);
615           tem_sum1 += _mm_extract_epi16(s4, 0);
616           tem_sum1 += _mm_extract_epi16(s6, 0);
617           if (tem_sum1 < low_sum) {
618             low_sum = tem_sum1;
619             x_best = (int16_t)(j + _mm_extract_epi16(s5, 1));
620             y_best = i;
621           }
622         }
623         ref += src_stride_raw;
624       }
625     }
626     else {
627       for (i=0; i<search_area_height; i++) {
628         for (j=0; j<=search_area_width-8; j+=8) {
629           p_src = src;
630           p_ref = ref + j;
631           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
632           for (k=0; k<height; k++) {
633             ss0 = _mm256_loadu_si256((__m256i*)p_ref);
634             ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
635             ss2 = _mm256_loadu_si256((__m256i *)p_src);
636             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
637             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
638             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
639             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
640             p_src += src_stride;
641             p_ref += ref_stride;
642           }
643           ss3 = _mm256_adds_epu16(ss3, ss4);
644           ss5 = _mm256_adds_epu16(ss5, ss6);
645           s3 = _mm256_extracti128_si256(ss3, 0);
646           s4 = _mm256_extracti128_si256(ss3, 1);
647           s5 = _mm256_extracti128_si256(ss5, 0);
648           s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), s5);
649           s0 = _mm_minpos_epu16(s0);
650           tem_sum1 = _mm_extract_epi16(s0, 0);
651           if (tem_sum1 < low_sum) {
652             if (tem_sum1 != 0xFFFF) { // no overflow
653               low_sum = tem_sum1;
654               x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
655               y_best = i;
656             }
657             else {
658               s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
659               s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
660               s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
661               s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
662               s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
663               s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
664               s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), s2);
665               s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), s5);
666               UPDATE_BEST(s0, 0, 0);
667               UPDATE_BEST(s0, 1, 0);
668               UPDATE_BEST(s0, 2, 0);
669               UPDATE_BEST(s0, 3, 0);
670               UPDATE_BEST(s3, 0, 4);
671               UPDATE_BEST(s3, 1, 4);
672               UPDATE_BEST(s3, 2, 4);
673               UPDATE_BEST(s3, 3, 4);
674             }
675           }
676         }
677 
678         if (leftover) {
679           p_src = src;
680           p_ref = ref + j;
681           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
682           for (k=0; k<height; k++) {
683             ss0 = _mm256_loadu_si256((__m256i*)p_ref);
684             ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
685             ss2 = _mm256_loadu_si256((__m256i *)p_src);
686             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
687             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
688             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
689             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
690             p_src += src_stride;
691             p_ref += ref_stride;
692           }
693           ss3 = _mm256_adds_epu16(ss3, ss4);
694           ss5 = _mm256_adds_epu16(ss5, ss6);
695           s3 = _mm256_extracti128_si256(ss3, 0);
696           s4 = _mm256_extracti128_si256(ss3, 1);
697           s5 = _mm256_extracti128_si256(ss5, 0);
698           s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), s5);
699           s0 = _mm_or_si128(s0, s8);
700           s0 = _mm_minpos_epu16(s0);
701           tem_sum1 = _mm_extract_epi16(s0, 0);
702           if (tem_sum1 < low_sum) {
703             if (tem_sum1 != 0xFFFF) { // no overflow
704               low_sum = tem_sum1;
705               x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
706               y_best = i;
707             }
708             else {
709               s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
710               s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
711               s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
712               s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
713               s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
714               s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
715               s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), s2);
716               s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), s5);
717               k = leftover;
718               while (k > 0) {
719                 for (l=0; l < 4 && k; l++, k--) {
720                   tem_sum1 = _mm_extract_epi32(s0, 0);
721                   s0 = _mm_srli_si128(s0, 4);
722                   if (tem_sum1 < low_sum) {
723                     low_sum = tem_sum1;
724                     x_best = (int16_t)(j + leftover - k);
725                     y_best = i;
726                   }
727                 }
728                 s0 = s3;
729               }
730             }
731           }
732         }
733         ref += src_stride_raw;
734       }
735     }
736     break;
737 
738   case 32:
739     if (height <= 16) {
740       for (i=0; i<search_area_height; i++) {
741         for (j=0; j<=search_area_width-8; j+=8) {
742           p_src = src;
743           p_ref = ref + j;
744           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
745           for (k=0; k<height; k++) {
746             ss0 = _mm256_loadu_si256((__m256i*)p_ref);
747             ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
748             ss2 = _mm256_loadu_si256((__m256i *)p_src);
749             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
750             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
751             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
752             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
753             p_src += src_stride;
754             p_ref += ref_stride;
755           }
756           ss3 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
757           s3 = _mm256_extracti128_si256(ss3, 0);
758           s5 = _mm256_extracti128_si256(ss3, 1);
759           s4 = _mm_minpos_epu16(s3);
760           s6 = _mm_minpos_epu16(s5);
761           s4 = _mm_unpacklo_epi16(s4, s4);
762           s4 = _mm_unpacklo_epi32(s4, s4);
763           s4 = _mm_unpacklo_epi64(s4, s4);
764           s6 = _mm_unpacklo_epi16(s6, s6);
765           s6 = _mm_unpacklo_epi32(s6, s6);
766           s6 = _mm_unpacklo_epi64(s6, s6);
767           s3 = _mm_sub_epi16(s3, s4);
768           s5 = _mm_adds_epu16(s5, s3);
769           s5 = _mm_sub_epi16(s5, s6);
770           s5 = _mm_minpos_epu16(s5);
771           tem_sum1  = _mm_extract_epi16(s5, 0);
772           tem_sum1 += _mm_extract_epi16(s4, 0);
773           tem_sum1 += _mm_extract_epi16(s6, 0);
774           if (tem_sum1 < low_sum) {
775             low_sum = tem_sum1;
776             x_best = (int16_t)(j + _mm_extract_epi16(s5, 1));
777             y_best = i;
778           }
779         }
780 
781         if (leftover) {
782           p_src = src;
783           p_ref = ref + j;
784           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
785           for (k=0; k<height; k++) {
786             ss0 = _mm256_loadu_si256((__m256i*)p_ref);
787             ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
788             ss2 = _mm256_loadu_si256((__m256i *)p_src);
789             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
790             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
791             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
792             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
793             p_src += src_stride;
794             p_ref += ref_stride;
795           }
796           ss3 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
797           s3 = _mm256_extracti128_si256(ss3, 0);
798           s5 = _mm256_extracti128_si256(ss3, 1);
799           s3 = _mm_or_si128(s3, s8);
800           s5 = _mm_or_si128(s5, s8);
801           s4 = _mm_minpos_epu16(s3);
802           s6 = _mm_minpos_epu16(s5);
803           s4 = _mm_unpacklo_epi16(s4, s4);
804           s4 = _mm_unpacklo_epi32(s4, s4);
805           s4 = _mm_unpacklo_epi64(s4, s4);
806           s6 = _mm_unpacklo_epi16(s6, s6);
807           s6 = _mm_unpacklo_epi32(s6, s6);
808           s6 = _mm_unpacklo_epi64(s6, s6);
809           s3 = _mm_sub_epi16(s3, s4);
810           s5 = _mm_adds_epu16(s5, s3);
811           s5 = _mm_sub_epi16(s5, s6);
812           s5 = _mm_minpos_epu16(s5);
813           tem_sum1  = _mm_extract_epi16(s5, 0);
814           tem_sum1 += _mm_extract_epi16(s4, 0);
815           tem_sum1 += _mm_extract_epi16(s6, 0);
816           if (tem_sum1 < low_sum) {
817             low_sum = tem_sum1;
818             x_best = (int16_t)(j + _mm_extract_epi16(s5, 1));
819             y_best = i;
820           }
821         }
822         ref += src_stride_raw;
823       }
824     }
825     else if (height <= 32) {
826       for (i=0; i<search_area_height; i++) {
827         for (j=0; j<=search_area_width-8; j+=8) {
828           p_src = src;
829           p_ref = ref + j;
830           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
831           for (k=0; k<height; k++) {
832             ss0 = _mm256_loadu_si256((__m256i*)p_ref);
833             ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
834             ss2 = _mm256_loadu_si256((__m256i *)p_src);
835             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
836             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
837             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
838             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
839             p_src += src_stride;
840             p_ref += ref_stride;
841           }
842           ss3 = _mm256_adds_epu16(ss3, ss4);
843           ss5 = _mm256_adds_epu16(ss5, ss6);
844           ss6 = _mm256_adds_epu16(ss3, ss5);
845           s3 = _mm256_extracti128_si256(ss6, 0);
846           s4 = _mm256_extracti128_si256(ss6, 1);
847           s0 = _mm_adds_epu16(s3, s4);
848           s0 = _mm_minpos_epu16(s0);
849           tem_sum1 = _mm_extract_epi16(s0, 0);
850           if (tem_sum1 < low_sum) {
851             if (tem_sum1 != 0xFFFF) { // no overflow
852               low_sum = tem_sum1;
853               x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
854               y_best = i;
855             }
856             else {
857               ss4 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
858               ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
859               ss6 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
860               ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
861               ss4 = _mm256_add_epi32(ss4, ss6);
862               ss3 = _mm256_add_epi32(ss3, ss5);
863               s0 = _mm_add_epi32(_mm256_extracti128_si256(ss4, 0), _mm256_extracti128_si256(ss4, 1));
864               s3 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
865               UPDATE_BEST(s0, 0, 0);
866               UPDATE_BEST(s0, 1, 0);
867               UPDATE_BEST(s0, 2, 0);
868               UPDATE_BEST(s0, 3, 0);
869               UPDATE_BEST(s3, 0, 4);
870               UPDATE_BEST(s3, 1, 4);
871               UPDATE_BEST(s3, 2, 4);
872               UPDATE_BEST(s3, 3, 4);
873             }
874           }
875         }
876 
877         if (leftover) {
878           p_src = src;
879           p_ref = ref + j;
880           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
881           for (k=0; k<height; k++) {
882             ss0 = _mm256_loadu_si256((__m256i*)p_ref);
883             ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
884             ss2 = _mm256_loadu_si256((__m256i *)p_src);
885             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
886             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
887             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
888             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
889             p_src += src_stride;
890             p_ref += ref_stride;
891           }
892           ss3 = _mm256_adds_epu16(ss3, ss4);
893           ss5 = _mm256_adds_epu16(ss5, ss6);
894           ss6 = _mm256_adds_epu16(ss3, ss5);
895           s3 = _mm256_extracti128_si256(ss6, 0);
896           s4 = _mm256_extracti128_si256(ss6, 1);
897           s0 = _mm_adds_epu16(s3, s4);
898           //s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
899           s0 = _mm_or_si128(s0, s8);
900           s0 = _mm_minpos_epu16(s0);
901           tem_sum1 = _mm_extract_epi16(s0, 0);
902           if (tem_sum1 < low_sum) {
903             if (tem_sum1 != 0xFFFF) { // no overflow
904               low_sum = tem_sum1;
905               x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
906               y_best = i;
907             }
908             else {
909               ss4 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
910               ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
911               ss6 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
912               ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
913               ss4 = _mm256_add_epi32(ss4, ss6);
914               ss3 = _mm256_add_epi32(ss3, ss5);
915               s0 = _mm_add_epi32(_mm256_extracti128_si256(ss4, 0), _mm256_extracti128_si256(ss4, 1));
916               s3 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
917               k = leftover;
918               while (k > 0) {
919                 for (l=0; l < 4 && k; l++, k--) {
920                   tem_sum1 = _mm_extract_epi32(s0, 0);
921                   s0 = _mm_srli_si128(s0, 4);
922                   if (tem_sum1 < low_sum) {
923                     low_sum = tem_sum1;
924                     x_best = (int16_t)(j + leftover - k);
925                     y_best = i;
926                   }
927                 }
928                 s0 = s3;
929               }
930             }
931           }
932         }
933         ref += src_stride_raw;
934       }
935     }
936     else {
937       for (i=0; i<search_area_height; i++) {
938         for (j=0; j<=search_area_width-8; j+=8) {
939           p_src = src;
940           p_ref = ref + j;
941           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
942           for (k=0; k<height; k++) {
943             ss0 = _mm256_loadu_si256((__m256i*)p_ref);
944             ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
945             ss2 = _mm256_loadu_si256((__m256i *)p_src);
946             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
947             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
948             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
949             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
950             p_src += src_stride;
951             p_ref += ref_stride;
952           }
953           ss7 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
954           s3 = _mm256_extracti128_si256(ss7, 0);
955           s4 = _mm256_extracti128_si256(ss7, 1);
956           s0 = _mm_adds_epu16(s3, s4);
957           s0 = _mm_minpos_epu16(s0);
958           tem_sum1 = _mm_extract_epi16(s0, 0);
959           if (tem_sum1 < low_sum) {
960             if (tem_sum1 != 0xFFFF) { // no overflow
961               low_sum = tem_sum1;
962               x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
963               y_best = i;
964             }
965             else {
966               ss0 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
967               ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
968               ss1 = _mm256_unpacklo_epi16(ss4, _mm256_setzero_si256());
969               ss4 = _mm256_unpackhi_epi16(ss4, _mm256_setzero_si256());
970               ss2 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
971               ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
972               ss7 = _mm256_unpacklo_epi16(ss6, _mm256_setzero_si256());
973               ss6 = _mm256_unpackhi_epi16(ss6, _mm256_setzero_si256());
974               ss0 = _mm256_add_epi32(_mm256_add_epi32(ss0, ss1), _mm256_add_epi32(ss2, ss7));
975               ss3 = _mm256_add_epi32(_mm256_add_epi32(ss3, ss4), _mm256_add_epi32(ss5, ss6));
976               s0 = _mm_add_epi32(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
977               s3 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
978               UPDATE_BEST(s0, 0, 0);
979               UPDATE_BEST(s0, 1, 0);
980               UPDATE_BEST(s0, 2, 0);
981               UPDATE_BEST(s0, 3, 0);
982               UPDATE_BEST(s3, 0, 4);
983               UPDATE_BEST(s3, 1, 4);
984               UPDATE_BEST(s3, 2, 4);
985               UPDATE_BEST(s3, 3, 4);
986             }
987           }
988         }
989 
990         if (leftover) {
991           p_src = src;
992           p_ref = ref + j;
993           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
994           for (k=0; k<height; k++) {
995             ss0 = _mm256_loadu_si256((__m256i*)p_ref);
996             ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
997             ss2 = _mm256_loadu_si256((__m256i *)p_src);
998             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
999             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1000             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1001             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1002             p_src += src_stride;
1003             p_ref += ref_stride;
1004           }
1005           ss7 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
1006           s3 = _mm256_extracti128_si256(ss7, 0);
1007           s4 = _mm256_extracti128_si256(ss7, 1);
1008           s0 = _mm_adds_epu16(s3, s4);
1009           s0 = _mm_minpos_epu16(s0);
1010           tem_sum1 = _mm_extract_epi16(s0, 0);
1011           if (tem_sum1 < low_sum) {
1012             if (tem_sum1 != 0xFFFF) { // no overflow
1013               low_sum = tem_sum1;
1014               x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
1015               y_best = i;
1016             }
1017             else {
1018               ss0 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
1019               ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
1020               ss1 = _mm256_unpacklo_epi16(ss4, _mm256_setzero_si256());
1021               ss4 = _mm256_unpackhi_epi16(ss4, _mm256_setzero_si256());
1022               ss2 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
1023               ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
1024               ss7 = _mm256_unpacklo_epi16(ss6, _mm256_setzero_si256());
1025               ss6 = _mm256_unpackhi_epi16(ss6, _mm256_setzero_si256());
1026               ss0 = _mm256_add_epi32(_mm256_add_epi32(ss0, ss1), _mm256_add_epi32(ss2, ss7));
1027               ss3 = _mm256_add_epi32(_mm256_add_epi32(ss3, ss4), _mm256_add_epi32(ss5, ss6));
1028               s0 = _mm_add_epi32(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1029               s3 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1030               k = leftover;
1031               while (k > 0) {
1032                 for (l=0; l < 4 && k; l++, k--) {
1033                   tem_sum1 = _mm_extract_epi32(s0, 0);
1034                   s0 = _mm_srli_si128(s0, 4);
1035                   if (tem_sum1 < low_sum) {
1036                     low_sum = tem_sum1;
1037                     x_best = (int16_t)(j + leftover - k);
1038                     y_best = i;
1039                   }
1040                 }
1041                 s0 = s3;
1042               }
1043             }
1044           }
1045         }
1046         ref += src_stride_raw;
1047       }
1048     }
1049     break;
1050 
1051   case 48:
1052     if (height <= 32) {
1053       for (i=0; i<search_area_height; i++) {
1054         for (j=0; j<=search_area_width-8; j+=8) {
1055           p_src = src;
1056           p_ref = ref + j;
1057           s3 = s4 = s5 = s6 = _mm_setzero_si128();
1058           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
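          /*
           * Width 48: columns 0-31 are accumulated in the 256-bit registers
           * (ss3..ss6) and columns 32-47 in the 128-bit registers (s3..s6);
           * the two partial results are combined before minpos.
           */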
1059           for (k=0; k<height; k++) {
1060             ss0 = _mm256_loadu_si256((__m256i*)p_ref);
1061             ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
1062             ss2 = _mm256_loadu_si256((__m256i *)p_src);
1063             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1064             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1065             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1066             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1067             s0 = _mm_loadu_si128((__m128i*)(p_ref + 32));
1068             s1 = _mm_loadu_si128((__m128i*)(p_ref + 40));
1069             s2 = _mm_loadu_si128((__m128i*)(p_src + 32));
1070             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1071             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1072             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1073             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1074             p_src += src_stride;
1075             p_ref += ref_stride;
1076           }
1077           s3 = _mm_adds_epu16(s3, s4);
1078           s5 = _mm_adds_epu16(s5, s6);
1079           s0 = _mm_adds_epu16(s3, s5);
1080           ss3 = _mm256_adds_epu16(ss3, ss4);
1081           ss5 = _mm256_adds_epu16(ss5, ss6);
1082           ss6 = _mm256_adds_epu16(ss3, ss5);
1083           s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm256_extracti128_si256(ss6, 0), _mm256_extracti128_si256(ss6, 1)));
1084           s0 = _mm_minpos_epu16(s0);
1085           tem_sum1 = _mm_extract_epi16(s0, 0);
1086           if (tem_sum1 < low_sum) {
1087             if (tem_sum1 != 0xFFFF) { // no overflow
1088               low_sum = tem_sum1;
1089               x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
1090               y_best = i;
1091             }
1092             else {
1093               ss4 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
1094               ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
1095               ss6 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
1096               ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
1097               ss4 = _mm256_add_epi32(ss4, ss6);
1098               ss3 = _mm256_add_epi32(ss3, ss5);
1099               s0 = _mm_add_epi32(_mm256_extracti128_si256(ss4, 0), _mm256_extracti128_si256(ss4, 1));
1100               s1 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1101               s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s3, _mm_setzero_si128()));
1102               s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s5, _mm_setzero_si128()));
1103               s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s3, _mm_setzero_si128()));
1104               s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s5, _mm_setzero_si128()));
1105               UPDATE_BEST(s0, 0, 0);
1106               UPDATE_BEST(s0, 1, 0);
1107               UPDATE_BEST(s0, 2, 0);
1108               UPDATE_BEST(s0, 3, 0);
1109               UPDATE_BEST(s1, 0, 4);
1110               UPDATE_BEST(s1, 1, 4);
1111               UPDATE_BEST(s1, 2, 4);
1112               UPDATE_BEST(s1, 3, 4);
1113             }
1114           }
1115         }
1116 
1117         if (leftover) {
1118           p_src = src;
1119           p_ref = ref + j;
1120           s3 = s4 = s5 = s6 = _mm_setzero_si128();
1121           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
1122           for (k=0; k<height; k++) {
1123             ss0 = _mm256_loadu_si256((__m256i*)p_ref);
1124             ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
1125             ss2 = _mm256_loadu_si256((__m256i *)p_src);
1126             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1127             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1128             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1129             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1130             s0 = _mm_loadu_si128((__m128i*)(p_ref + 32));
1131             s1 = _mm_loadu_si128((__m128i*)(p_ref + 40));
1132             s2 = _mm_loadu_si128((__m128i*)(p_src + 32));
1133             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1134             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1135             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1136             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1137             p_src += src_stride;
1138             p_ref += ref_stride;
1139           }
1140           s3 = _mm_adds_epu16(s3, s4);
1141           s5 = _mm_adds_epu16(s5, s6);
1142           s0 = _mm_adds_epu16(s3, s5);
1143           ss3 = _mm256_adds_epu16(ss3, ss4);
1144           ss5 = _mm256_adds_epu16(ss5, ss6);
1145           ss6 = _mm256_adds_epu16(ss3, ss5);
1146           s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm256_extracti128_si256(ss6, 0), _mm256_extracti128_si256(ss6, 1)));
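          /* s8 holds 0xFFFF in the lanes beyond the leftover search positions,
           * so OR-ing it in keeps those lanes from winning the minimum search. */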
1147           s0 = _mm_or_si128(s0, s8);
1148           s0 = _mm_minpos_epu16(s0);
1149           tem_sum1 = _mm_extract_epi16(s0, 0);
1150           if (tem_sum1 < low_sum) {
1151             if (tem_sum1 != 0xFFFF) { // no overflow
1152               low_sum = tem_sum1;
1153               x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
1154               y_best = i;
1155             }
1156             else {
1157               ss4 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
1158               ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
1159               ss6 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
1160               ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
1161               ss4 = _mm256_add_epi32(ss4, ss6);
1162               ss3 = _mm256_add_epi32(ss3, ss5);
1163               s0 = _mm_add_epi32(_mm256_extracti128_si256(ss4, 0), _mm256_extracti128_si256(ss4, 1));
1164               s1 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1165               s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s3, _mm_setzero_si128()));
1166               s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s5, _mm_setzero_si128()));
1167               s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s3, _mm_setzero_si128()));
1168               s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s5, _mm_setzero_si128()));
1169               k = leftover;
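              /* Walk the up-to-eight exact 32-bit sums (low four in s0, high
               * four in s1), but only as many as the leftover search width. */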
1170               while (k > 0) {
1171                 for (l=0; l < 4 && k; l++, k--) {
1172                   tem_sum1 = _mm_extract_epi32(s0, 0);
1173                   s0 = _mm_srli_si128(s0, 4);
1174                   if (tem_sum1 < low_sum) {
1175                     low_sum = tem_sum1;
1176                     x_best = (int16_t)(j + leftover - k);
1177                     y_best = i;
1178                   }
1179                 }
1180                 s0 = s1;
1181               }
1182             }
1183           }
1184         }
1185         ref += src_stride_raw;
1186       }
1187     }
1188     else {
1189       for (i=0; i<search_area_height; i++) {
1190         for (j=0; j<=search_area_width-8; j+=8) {
1191           p_src = src;
1192           p_ref = ref + j;
1193           s3 = s4 = s5 = s6 = _mm_setzero_si128();
1194           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
1195           for (k=0; k<height; k++) {
1196             ss0 = _mm256_loadu_si256((__m256i*)p_ref);
1197             ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
1198             ss2 = _mm256_loadu_si256((__m256i *)p_src);
1199             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1200             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1201             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1202             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1203             s0 = _mm_loadu_si128((__m128i*)(p_ref + 32));
1204             s1 = _mm_loadu_si128((__m128i*)(p_ref + 40));
1205             s2 = _mm_loadu_si128((__m128i*)(p_src + 32));
1206             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1207             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1208             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1209             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1210             p_src += src_stride;
1211             p_ref += ref_stride;
1212           }
1213           s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
1214           ss7 = _mm256_adds_epu16(ss3, ss4);
1215           ss8 = _mm256_adds_epu16(ss5, ss6);
1216           ss7 = _mm256_adds_epu16(ss7, ss8);
1217           s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm256_extracti128_si256(ss7, 0), _mm256_extracti128_si256(ss7, 1)));
1218           s0 = _mm_minpos_epu16(s0);
1219           tem_sum1 = _mm_extract_epi16(s0, 0);
1220           if (tem_sum1 < low_sum) {
1221             if (tem_sum1 != 0xFFFF) { // no overflow
1222               low_sum = tem_sum1;
1223               x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
1224               y_best = i;
1225             }
1226             else {
1227               ss0 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
1228               ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
1229               ss1 = _mm256_unpacklo_epi16(ss4, _mm256_setzero_si256());
1230               ss4 = _mm256_unpackhi_epi16(ss4, _mm256_setzero_si256());
1231               ss2 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
1232               ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
1233               ss7 = _mm256_unpacklo_epi16(ss6, _mm256_setzero_si256());
1234               ss6 = _mm256_unpackhi_epi16(ss6, _mm256_setzero_si256());
1235               ss0 = _mm256_add_epi32(_mm256_add_epi32(ss0, ss1), _mm256_add_epi32(ss2, ss7));
1236               ss3 = _mm256_add_epi32(_mm256_add_epi32(ss3, ss4), _mm256_add_epi32(ss5, ss6));
1237               s0 = _mm_add_epi32(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1238               s1 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1239               s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s3, _mm_setzero_si128()));
1240               s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s4, _mm_setzero_si128()));
1241               s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s5, _mm_setzero_si128()));
1242               s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s6, _mm_setzero_si128()));
1243               s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s3, _mm_setzero_si128()));
1244               s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s4, _mm_setzero_si128()));
1245               s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s5, _mm_setzero_si128()));
1246               s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s6, _mm_setzero_si128()));
1247               UPDATE_BEST(s0, 0, 0);
1248               UPDATE_BEST(s0, 1, 0);
1249               UPDATE_BEST(s0, 2, 0);
1250               UPDATE_BEST(s0, 3, 0);
1251               UPDATE_BEST(s1, 0, 4);
1252               UPDATE_BEST(s1, 1, 4);
1253               UPDATE_BEST(s1, 2, 4);
1254               UPDATE_BEST(s1, 3, 4);
1255             }
1256           }
1257         }
1258 
1259         if (leftover) {
1260           p_src = src;
1261           p_ref = ref + j;
1262           s3 = s4 = s5 = s6 = _mm_setzero_si128();
1263           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
1264           for (k=0; k<height; k++) {
1265             ss0 = _mm256_loadu_si256((__m256i*)p_ref);
1266             ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
1267             ss2 = _mm256_loadu_si256((__m256i *)p_src);
1268             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1269             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1270             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1271             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1272             s0 = _mm_loadu_si128((__m128i*)(p_ref + 32));
1273             s1 = _mm_loadu_si128((__m128i*)(p_ref + 40));
1274             s2 = _mm_loadu_si128((__m128i*)(p_src + 32));
1275             s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1276             s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1277             s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1278             s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1279             p_src += src_stride;
1280             p_ref += ref_stride;
1281           }
1282           s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
1283           ss7 = _mm256_adds_epu16(ss3, ss4);
1284           ss8 = _mm256_adds_epu16(ss5, ss6);
1285           ss7 = _mm256_adds_epu16(ss7, ss8);
1286           s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm256_extracti128_si256(ss7, 0), _mm256_extracti128_si256(ss7, 1)));
1287           s0 = _mm_or_si128(s0, s8);
1288           s0 = _mm_minpos_epu16(s0);
1289           tem_sum1 = _mm_extract_epi16(s0, 0);
1290           if (tem_sum1 < low_sum) {
1291             if (tem_sum1 != 0xFFFF) { // no overflow
1292               low_sum = tem_sum1;
1293               x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
1294               y_best = i;
1295             }
1296             else {
1297               ss0 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
1298               ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
1299               ss1 = _mm256_unpacklo_epi16(ss4, _mm256_setzero_si256());
1300               ss4 = _mm256_unpackhi_epi16(ss4, _mm256_setzero_si256());
1301               ss2 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
1302               ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
1303               ss7 = _mm256_unpacklo_epi16(ss6, _mm256_setzero_si256());
1304               ss6 = _mm256_unpackhi_epi16(ss6, _mm256_setzero_si256());
1305               ss0 = _mm256_add_epi32(_mm256_add_epi32(ss0, ss1), _mm256_add_epi32(ss2, ss7));
1306               ss3 = _mm256_add_epi32(_mm256_add_epi32(ss3, ss4), _mm256_add_epi32(ss5, ss6));
1307               s0 = _mm_add_epi32(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1308               s1 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1309               s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s3, _mm_setzero_si128()));
1310               s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s4, _mm_setzero_si128()));
1311               s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s5, _mm_setzero_si128()));
1312               s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s6, _mm_setzero_si128()));
1313               s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s3, _mm_setzero_si128()));
1314               s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s4, _mm_setzero_si128()));
1315               s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s5, _mm_setzero_si128()));
1316               s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s6, _mm_setzero_si128()));
1317               k = leftover;
1318               while (k > 0) {
1319                 for (l=0; l < 4 && k; l++, k--) {
1320                   tem_sum1 = _mm_extract_epi32(s0, 0);
1321                   s0 = _mm_srli_si128(s0, 4);
1322                   if (tem_sum1 < low_sum) {
1323                     low_sum = tem_sum1;
1324                     x_best = (int16_t)(j + leftover - k);
1325                     y_best = i;
1326                   }
1327                 }
1328                 s0 = s1;
1329               }
1330             }
1331           }
1332         }
1333         ref += src_stride_raw;
1334       }
1335     }
1336     break;
1337 
1338   case 64:
1339     if (height <= 32) {
1340       for (i=0; i<search_area_height; i++) {
1341         for (j=0; j<=search_area_width-8; j+=8) {
1342           p_src = src;
1343           p_ref = ref + j;
1344           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
1345           for (k=0; k<height; k++) {
1346             ss0 = _mm256_loadu_si256((__m256i*)p_ref);
1347             ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
1348             ss2 = _mm256_loadu_si256((__m256i *)p_src);
1349             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1350             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1351             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1352             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1353             ss0 = _mm256_loadu_si256((__m256i*)(p_ref + 32));
1354             ss1 = _mm256_loadu_si256((__m256i*)(p_ref + 40));
1355             ss2 = _mm256_loadu_si256((__m256i *)(p_src + 32));
1356             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1357             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1358             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1359             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1360             p_src += src_stride;
1361             p_ref += ref_stride;
1362           }
1363           ss7 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
1364           s3 = _mm256_extracti128_si256(ss7, 0);
1365           s4 = _mm256_extracti128_si256(ss7, 1);
1366           s0 = _mm_adds_epu16(s3, s4);
1367           s0 = _mm_minpos_epu16(s0);
1368           tem_sum1 = _mm_extract_epi16(s0, 0);
1369           if (tem_sum1 < low_sum) {
1370             if (tem_sum1 != 0xFFFF) { // no overflow
1371               low_sum = tem_sum1;
1372               x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
1373               y_best = i;
1374             }
1375             else {
1376               ss0 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
1377               ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
1378               ss1 = _mm256_unpacklo_epi16(ss4, _mm256_setzero_si256());
1379               ss4 = _mm256_unpackhi_epi16(ss4, _mm256_setzero_si256());
1380               ss2 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
1381               ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
1382               ss7 = _mm256_unpacklo_epi16(ss6, _mm256_setzero_si256());
1383               ss6 = _mm256_unpackhi_epi16(ss6, _mm256_setzero_si256());
1384               ss0 = _mm256_add_epi32(_mm256_add_epi32(ss0, ss1), _mm256_add_epi32(ss2, ss7));
1385               ss3 = _mm256_add_epi32(_mm256_add_epi32(ss3, ss4), _mm256_add_epi32(ss5, ss6));
1386               s0 = _mm_add_epi32(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1387               s3 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1388               UPDATE_BEST(s0, 0, 0);
1389               UPDATE_BEST(s0, 1, 0);
1390               UPDATE_BEST(s0, 2, 0);
1391               UPDATE_BEST(s0, 3, 0);
1392               UPDATE_BEST(s3, 0, 4);
1393               UPDATE_BEST(s3, 1, 4);
1394               UPDATE_BEST(s3, 2, 4);
1395               UPDATE_BEST(s3, 3, 4);
1396             }
1397           }
1398         }
1399 
1400         if (leftover) {
1401           p_src = src;
1402           p_ref = ref + j;
1403           ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
1404           for (k=0; k<height; k++) {
1405             ss0 = _mm256_loadu_si256((__m256i*)p_ref);
1406             ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
1407             ss2 = _mm256_loadu_si256((__m256i *)p_src);
1408             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1409             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1410             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1411             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1412             ss0 = _mm256_loadu_si256((__m256i*)(p_ref + 32));
1413             ss1 = _mm256_loadu_si256((__m256i*)(p_ref + 40));
1414             ss2 = _mm256_loadu_si256((__m256i *)(p_src + 32));
1415             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1416             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1417             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1418             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1419             p_src += src_stride;
1420             p_ref += ref_stride;
1421           }
1422           ss7 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
1423           s3 = _mm256_extracti128_si256(ss7, 0);
1424           s4 = _mm256_extracti128_si256(ss7, 1);
1425           s0 = _mm_adds_epu16(s3, s4);
1426           s0 = _mm_or_si128(s0, s8);
1427           s0 = _mm_minpos_epu16(s0);
1428           tem_sum1 = _mm_extract_epi16(s0, 0);
1429           if (tem_sum1 < low_sum) {
1430             if (tem_sum1 != 0xFFFF) { // no overflow
1431               low_sum = tem_sum1;
1432               x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
1433               y_best = i;
1434             }
1435             else {
1436               ss0 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
1437               ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
1438               ss1 = _mm256_unpacklo_epi16(ss4, _mm256_setzero_si256());
1439               ss4 = _mm256_unpackhi_epi16(ss4, _mm256_setzero_si256());
1440               ss2 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
1441               ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
1442               ss7 = _mm256_unpacklo_epi16(ss6, _mm256_setzero_si256());
1443               ss6 = _mm256_unpackhi_epi16(ss6, _mm256_setzero_si256());
1444               ss0 = _mm256_add_epi32(_mm256_add_epi32(ss0, ss1), _mm256_add_epi32(ss2, ss7));
1445               ss3 = _mm256_add_epi32(_mm256_add_epi32(ss3, ss4), _mm256_add_epi32(ss5, ss6));
1446               s0 = _mm_add_epi32(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1447               s3 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1448               k = leftover;
1449               while (k > 0) {
1450                 for (l=0; l < 4 && k; l++, k--) {
1451                   tem_sum1 = _mm_extract_epi32(s0, 0);
1452                   s0 = _mm_srli_si128(s0, 4);
1453                   if (tem_sum1 < low_sum) {
1454                     low_sum = tem_sum1;
1455                     x_best = (int16_t)(j + leftover - k);
1456                     y_best = i;
1457                   }
1458                 }
1459                 s0 = s3;
1460               }
1461             }
1462           }
1463         }
1464         ref += src_stride_raw;
1465       }
1466     }
1467     else {
1468       __m256i ss9, ss10;
1469       for (i=0; i<search_area_height; i++) {
1470         for (j=0; j<=search_area_width-8; j+=8) {
1471           p_src = src;
1472           p_ref = ref + j;
1473           ss3 = ss4 = ss5 = ss6 = ss7 = ss8 = ss9 = ss10 = _mm256_setzero_si256();
1474           for (k=0; k<height; k++) {
1475             ss0 = _mm256_loadu_si256((__m256i*)p_ref);
1476             ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
1477             ss2 = _mm256_loadu_si256((__m256i *)p_src);
1478             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1479             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1480             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1481             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1482             ss0 = _mm256_loadu_si256((__m256i*)(p_ref + 32));
1483             ss1 = _mm256_loadu_si256((__m256i*)(p_ref + 40));
1484             ss2 = _mm256_loadu_si256((__m256i *)(p_src + 32));
1485             ss7 = _mm256_adds_epu16(ss7, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1486             ss8 = _mm256_adds_epu16(ss8, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1487             ss9 = _mm256_adds_epu16(ss9, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1488             ss10 = _mm256_adds_epu16(ss10, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1489             p_src += src_stride;
1490             p_ref += ref_stride;
1491           }
1492           ss0 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
1493           ss0 = _mm256_adds_epu16(ss0, _mm256_adds_epu16(_mm256_adds_epu16(ss7, ss8), _mm256_adds_epu16(ss9, ss10)));
1494           s0 = _mm_adds_epu16(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1495           s0 = _mm_minpos_epu16(s0);
1496           tem_sum1 = _mm_extract_epi16(s0, 0);
1497           if (tem_sum1 < low_sum) {
1498             if (tem_sum1 != 0xFFFF) { // no overflow
1499               low_sum = tem_sum1;
1500               x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
1501               y_best = i;
1502             }
1503             else {
1504               ss0 = _mm256_add_epi32(_mm256_add_epi32(_mm256_unpacklo_epi16(ss3, _mm256_setzero_si256()), _mm256_unpacklo_epi16(ss4, _mm256_setzero_si256())), _mm256_add_epi32(_mm256_unpacklo_epi16(ss5, _mm256_setzero_si256()), _mm256_unpacklo_epi16(ss6, _mm256_setzero_si256())));
1505               ss1 = _mm256_add_epi32(_mm256_add_epi32(_mm256_unpackhi_epi16(ss3, _mm256_setzero_si256()), _mm256_unpackhi_epi16(ss4, _mm256_setzero_si256())), _mm256_add_epi32(_mm256_unpackhi_epi16(ss5, _mm256_setzero_si256()), _mm256_unpackhi_epi16(ss6, _mm256_setzero_si256())));
1506               ss2 = _mm256_add_epi32(_mm256_add_epi32(_mm256_unpacklo_epi16(ss7, _mm256_setzero_si256()), _mm256_unpacklo_epi16(ss8, _mm256_setzero_si256())), _mm256_add_epi32(_mm256_unpacklo_epi16(ss9, _mm256_setzero_si256()), _mm256_unpacklo_epi16(ss10, _mm256_setzero_si256())));
1507               ss3 = _mm256_add_epi32(_mm256_add_epi32(_mm256_unpackhi_epi16(ss7, _mm256_setzero_si256()), _mm256_unpackhi_epi16(ss8, _mm256_setzero_si256())), _mm256_add_epi32(_mm256_unpackhi_epi16(ss9, _mm256_setzero_si256()), _mm256_unpackhi_epi16(ss10, _mm256_setzero_si256())));
1508               ss0 = _mm256_add_epi32(ss0, ss2);
1509               ss1 = _mm256_add_epi32(ss1, ss3);
1510               s0 = _mm_add_epi32(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1511               s3 = _mm_add_epi32(_mm256_extracti128_si256(ss1, 0), _mm256_extracti128_si256(ss1, 1));
1512               UPDATE_BEST(s0, 0, 0);
1513               UPDATE_BEST(s0, 1, 0);
1514               UPDATE_BEST(s0, 2, 0);
1515               UPDATE_BEST(s0, 3, 0);
1516               UPDATE_BEST(s3, 0, 4);
1517               UPDATE_BEST(s3, 1, 4);
1518               UPDATE_BEST(s3, 2, 4);
1519               UPDATE_BEST(s3, 3, 4);
1520             }
1521           }
1522         }
1523 
1524         if (leftover) {
1525           p_src = src;
1526           p_ref = ref + j;
1527           ss3 = ss4 = ss5 = ss6 = ss7 = ss8 = ss9 = ss10 = _mm256_setzero_si256();
1528           for (k=0; k<height; k++) {
1529             ss0 = _mm256_loadu_si256((__m256i*)p_ref);
1530             ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
1531             ss2 = _mm256_loadu_si256((__m256i *)p_src);
1532             ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1533             ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1534             ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1535             ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1536             ss0 = _mm256_loadu_si256((__m256i*)(p_ref + 32));
1537             ss1 = _mm256_loadu_si256((__m256i*)(p_ref + 40));
1538             ss2 = _mm256_loadu_si256((__m256i *)(p_src + 32));
1539             ss7 = _mm256_adds_epu16(ss7, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1540             ss8 = _mm256_adds_epu16(ss8, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1541             ss9 = _mm256_adds_epu16(ss9, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1542             ss10 = _mm256_adds_epu16(ss10, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1543             p_src += src_stride;
1544             p_ref += ref_stride;
1545           }
1546           ss0 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
1547           ss0 = _mm256_adds_epu16(ss0, _mm256_adds_epu16(_mm256_adds_epu16(ss7, ss8), _mm256_adds_epu16(ss9, ss10)));
1548           s0 = _mm_adds_epu16(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1549           s0 = _mm_or_si128(s0, s8);
1550           s0 = _mm_minpos_epu16(s0);
1551           tem_sum1 = _mm_extract_epi16(s0, 0);
1552           if (tem_sum1 < low_sum) {
1553             if (tem_sum1 != 0xFFFF) { // no overflow
1554               low_sum = tem_sum1;
1555               x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
1556               y_best = i;
1557             }
1558             else {
1559               ss0 = _mm256_add_epi32(_mm256_add_epi32(_mm256_unpacklo_epi16(ss3, _mm256_setzero_si256()), _mm256_unpacklo_epi16(ss4, _mm256_setzero_si256())), _mm256_add_epi32(_mm256_unpacklo_epi16(ss5, _mm256_setzero_si256()), _mm256_unpacklo_epi16(ss6, _mm256_setzero_si256())));
1560               ss1 = _mm256_add_epi32(_mm256_add_epi32(_mm256_unpackhi_epi16(ss3, _mm256_setzero_si256()), _mm256_unpackhi_epi16(ss4, _mm256_setzero_si256())), _mm256_add_epi32(_mm256_unpackhi_epi16(ss5, _mm256_setzero_si256()), _mm256_unpackhi_epi16(ss6, _mm256_setzero_si256())));
1561               ss2 = _mm256_add_epi32(_mm256_add_epi32(_mm256_unpacklo_epi16(ss7, _mm256_setzero_si256()), _mm256_unpacklo_epi16(ss8, _mm256_setzero_si256())), _mm256_add_epi32(_mm256_unpacklo_epi16(ss9, _mm256_setzero_si256()), _mm256_unpacklo_epi16(ss10, _mm256_setzero_si256())));
1562               ss3 = _mm256_add_epi32(_mm256_add_epi32(_mm256_unpackhi_epi16(ss7, _mm256_setzero_si256()), _mm256_unpackhi_epi16(ss8, _mm256_setzero_si256())), _mm256_add_epi32(_mm256_unpackhi_epi16(ss9, _mm256_setzero_si256()), _mm256_unpackhi_epi16(ss10, _mm256_setzero_si256())));
1563               ss0 = _mm256_add_epi32(ss0, ss2);
1564               ss1 = _mm256_add_epi32(ss1, ss3);
1565               s0 = _mm_add_epi32(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1566               s3 = _mm_add_epi32(_mm256_extracti128_si256(ss1, 0), _mm256_extracti128_si256(ss1, 1));
1567               k = leftover;
1568               while (k > 0) {
1569                 for (l=0; l < 4 && k; l++, k--) {
1570                   tem_sum1 = _mm_extract_epi32(s0, 0);
1571                   s0 = _mm_srli_si128(s0, 4);
1572                   if (tem_sum1 < low_sum) {
1573                     low_sum = tem_sum1;
1574                     x_best = (int16_t)(j + leftover - k);
1575                     y_best = i;
1576                   }
1577                 }
1578                 s0 = s3;
1579               }
1580             }
1581           }
1582         }
1583         ref += src_stride_raw;
1584       }
1585     }
1586     break;
1587 
1588   default:
1589     break;
1590   }
1591 
1592   *best_sad = low_sum;
1593   *x_search_center = x_best;
1594   *y_search_center = y_best;
1595 }
1596 
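/* Illustrative usage sketch of the loop kernel above (not part of the library;
 * the buffer names, strides and search-area sizes below are hypothetical):
 *
 *   uint64_t best_sad = UINT64_MAX;
 *   int16_t  x_center = 0, y_center = 0;
 *   // 16x16 source block searched over a 64x32 grid of candidate positions.
 *   eb_vp9_sad_loop_kernel_avx2_intrin(
 *       src_block, src_stride,
 *       ref_search_area, ref_stride,
 *       16, 16,
 *       &best_sad, &x_center, &y_center,
 *       ref_stride,          // advance one reference row per vertical step
 *       64, 32);
 *   // On return, best_sad holds the smallest SAD found and
 *   // (x_center, y_center) the displacement, relative to ref_search_area,
 *   // at which it occurred.
 */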
1597 /*******************************************************************************
1598 * Requirement: height % 4 = 0
1599 *******************************************************************************/
1600 uint32_t compute4x_m_sad_avx2_intrin(
1601     uint8_t  *src,         // input parameter, source samples Ptr
1602     uint32_t  src_stride,  // input parameter, source stride
1603     uint8_t  *ref,         // input parameter, reference samples Ptr
1604     uint32_t  ref_stride,  // input parameter, reference stride
1605     uint32_t  height,      // input parameter, block height (M)
1606     uint32_t  width)       // input parameter, block width (N)
1607 {
1608     __m128i xmm0;
1609     __m256i ymm = _mm256_setzero_si256();
1610     uint32_t y;
1611     (void)width;
1612 
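    /* load8bit_4x4_avx2 gathers four 4-byte rows into one 256-bit register and
     * _mm256_sad_epu8 accumulates their absolute differences; the reduction
     * below adds the two 128-bit halves and returns the low 32 bits, so the
     * per-half partial sums are expected in the low 64-bit lanes. */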
1613     for (y = 0; y < height; y += 4) {
1614         const __m256i src0123 = load8bit_4x4_avx2(src, src_stride);
1615         const __m256i ref0123 = load8bit_4x4_avx2(ref, ref_stride);
1616         ymm = _mm256_add_epi32(ymm, _mm256_sad_epu8(src0123, ref0123));
1617         src += src_stride << 2;
1618         ref += ref_stride << 2;
1619     }
1620 
1621     xmm0 = _mm_add_epi32(_mm256_castsi256_si128(ymm),
1622         _mm256_extracti128_si256(ymm, 1));
1623 
1624     return (uint32_t)_mm_cvtsi128_si32(xmm0);
1625 }
1626 
1627 /*******************************************************************************
1628 * Requirement: height % 4 = 0
1629 *******************************************************************************/
1630 uint32_t compute8x_m_sad_avx2_intrin(
1631     uint8_t  *src,         // input parameter, source samples Ptr
1632     uint32_t  src_stride,  // input parameter, source stride
1633     uint8_t  *ref,         // input parameter, reference samples Ptr
1634     uint32_t  ref_stride,  // input parameter, reference stride
1635     uint32_t  height,      // input parameter, block height (M)
1636     uint32_t  width)       // input parameter, block width (N)
1637 {
1638     __m128i xmm0;
1639     __m256i ymm = _mm256_setzero_si256();
1640     uint32_t y;
1641     (void)width;
1642 
1643     for (y = 0; y < height; y += 4) {
1644         const __m256i src0123 = load8bit_8x4_avx2(src, src_stride);
1645         const __m256i ref0123 = load8bit_8x4_avx2(ref, ref_stride);
1646         ymm = _mm256_add_epi32(ymm, _mm256_sad_epu8(src0123, ref0123));
1647         src += src_stride << 2;
1648         ref += ref_stride << 2;
1649     }
1650 
1651     xmm0 = _mm_add_epi32(_mm256_castsi256_si128(ymm),
1652                          _mm256_extracti128_si256(ymm, 1));
1653     xmm0 = _mm_add_epi32(xmm0, _mm_srli_si128(xmm0, 8));
1654 
1655     return (uint32_t)_mm_cvtsi128_si32(xmm0);
1656 }
1657 
1658 static __m256i Compute16x2Sad_Kernel(const uint8_t *const src,
1659     const uint32_t src_stride, const uint8_t *const ref,
1660     const uint32_t ref_stride, const __m256i ymm)
1661 {
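    /* Two 16-byte rows are packed into one 256-bit register per operand, so a
     * single _mm256_sad_epu8 covers a 16x2 area; the running sums stay split
     * across the four 64-bit lanes until the caller reduces them. */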
1662     const __m256i src01 = load8bit_16x2_unaligned_avx2(src, src_stride);
1663     const __m256i ref01 = load8bit_16x2_unaligned_avx2(ref, ref_stride);
1664     return _mm256_add_epi32(ymm, _mm256_sad_epu8(src01, ref01));
1665 }
1666 
1667 /*******************************************************************************
1668 * Requirement: height % 4 = 0
1669 *******************************************************************************/
1670 uint32_t compute16x_m_sad_avx2_intrin(
1671     uint8_t  *src,         // input parameter, source samples Ptr
1672     uint32_t  src_stride,  // input parameter, source stride
1673     uint8_t  *ref,         // input parameter, reference samples Ptr
1674     uint32_t  ref_stride,  // input parameter, reference stride
1675     uint32_t  height,      // input parameter, block height (M)
1676     uint32_t  width)       // input parameter, block width (N)
1677 {
1678     __m128i xmm0;
1679     __m256i ymm = _mm256_setzero_si256();
1680     uint32_t y;
1681     (void)width;
1682 
1683     for (y = 0; y < height; y += 4) {
1684         ymm = Compute16x2Sad_Kernel(src + 0 * src_stride, src_stride,
1685             ref + 0 * ref_stride, ref_stride, ymm);
1686         ymm = Compute16x2Sad_Kernel(src + 2 * src_stride, src_stride,
1687             ref + 2 * ref_stride, ref_stride, ymm);
1688         src += src_stride << 2;
1689         ref += ref_stride << 2;
1690     }
1691 
1692     xmm0 = _mm_add_epi32(_mm256_castsi256_si128(ymm),
1693                          _mm256_extracti128_si256(ymm, 1));
1694     xmm0 = _mm_add_epi32(xmm0, _mm_srli_si128(xmm0, 8));
1695 
1696     return (uint32_t)_mm_cvtsi128_si32(xmm0);
1697 }
1698 
1699 /*******************************************************************************
1700 * Requirement: height % 2 = 0
1701 *******************************************************************************/
1702 uint32_t compute24x_m_sad_avx2_intrin(
1703     uint8_t  *src,         // input parameter, source samples Ptr
1704     uint32_t  src_stride,  // input parameter, source stride
1705     uint8_t  *ref,         // input parameter, reference samples Ptr
1706     uint32_t  ref_stride,  // input parameter, reference stride
1707     uint32_t  height,      // input parameter, block height (M)
1708     uint32_t  width)       // input parameter, block width (N)
1709 {
1710     __m128i xmm0, xmm1;
1711     __m256i ymm0, ymm1;
1712     uint32_t y;
1713     (void)width;
1714 
1715     ymm0 = ymm1 = _mm256_setzero_si256();
1716     for (y = 0; y < height; y += 2) {
1717         ymm0 = _mm256_add_epi32(ymm0, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)src), _mm256_loadu_si256((__m256i *)ref)));
1718         ymm1 = _mm256_add_epi32(ymm1, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)(src + src_stride)), _mm256_loadu_si256((__m256i *)(ref + ref_stride))));
1719         src += src_stride << 1;
1720         ref += ref_stride << 1;
1721     }
1722     xmm0 = _mm_add_epi32(_mm256_extracti128_si256(ymm0, 0), _mm256_extracti128_si256(ymm1, 0));
1723     xmm1 = _mm_add_epi32(_mm256_extracti128_si256(ymm0, 1), _mm256_extracti128_si256(ymm1, 1));
1724     xmm0 = _mm_add_epi32(_mm_add_epi32(xmm0, _mm_srli_si128(xmm0, 8)), xmm1);
1725     return (uint32_t)_mm_cvtsi128_si32(xmm0);
1726 }
1727 
1728 /*******************************************************************************
1729 * Requirement: height % 2 = 0
1730 *******************************************************************************/
1731 uint32_t compute32x_m_sad_avx2_intrin(
1732     uint8_t  *src,         // input parameter, source samples Ptr
1733     uint32_t  src_stride,  // input parameter, source stride
1734     uint8_t  *ref,         // input parameter, reference samples Ptr
1735     uint32_t  ref_stride,  // input parameter, reference stride
1736     uint32_t  height,      // input parameter, block height (M)
1737     uint32_t  width)       // input parameter, block width (N)
1738 {
1739     __m128i xmm0;
1740     __m256i ymm0, ymm1;
1741     uint32_t y;
1742     (void)width;
1743 
1744     ymm0 = ymm1 = _mm256_setzero_si256();
1745     for (y = 0; y < height; y += 2) {
1746         ymm0 = _mm256_add_epi32(ymm0, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)src), _mm256_loadu_si256((__m256i *)ref)));
1747         ymm1 = _mm256_add_epi32(ymm1, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)(src + src_stride)), _mm256_loadu_si256((__m256i *)(ref + ref_stride))));
1748         src += src_stride << 1;
1749         ref += ref_stride << 1;
1750     }
1751     ymm0 = _mm256_add_epi32(ymm0, ymm1);
1752     xmm0 = _mm_add_epi32(_mm256_castsi256_si128(ymm0), _mm256_extracti128_si256(ymm0, 1));
1753     xmm0 = _mm_add_epi32(xmm0, _mm_srli_si128(xmm0, 8));
1754     return (uint32_t)_mm_cvtsi128_si32(xmm0);
1755 }
1756 
1757 /*******************************************************************************
1758 * Requirement: height % 2 = 0
1759 *******************************************************************************/
1760 uint32_t compute48x_m_sad_avx2_intrin(
1761     uint8_t  *src,         // input parameter, source samples Ptr
1762     uint32_t  src_stride,  // input parameter, source stride
1763     uint8_t  *ref,         // input parameter, reference samples Ptr
1764     uint32_t  ref_stride,  // input parameter, reference stride
1765     uint32_t  height,      // input parameter, block height (M)
1766     uint32_t  width)       // input parameter, block width (N)
1767 {
1768     __m128i xmm0, xmm1;
1769     __m256i ymm0, ymm1;
1770     uint32_t y;
1771     (void)width;
1772 
1773     ymm0 = ymm1 = _mm256_setzero_si256();
1774     xmm0 = xmm1 = _mm_setzero_si128();
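    /* Each row is 48 bytes: the first 32 are handled with a 256-bit SAD and
     * the remaining 16 with a 128-bit SAD; two rows are processed per
     * iteration using separate accumulators to shorten dependency chains. */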
1775     for (y = 0; y < height; y += 2) {
1776         ymm0 = _mm256_add_epi32(ymm0, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)src), _mm256_loadu_si256((__m256i*)ref)));
1777         xmm0 = _mm_add_epi32(xmm0, _mm_sad_epu8(_mm_loadu_si128((__m128i*)(src + 32)), _mm_loadu_si128((__m128i*)(ref + 32))));
1778         ymm1 = _mm256_add_epi32(ymm1, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)(src + src_stride)), _mm256_loadu_si256((__m256i*)(ref + ref_stride))));
1779         xmm1 = _mm_add_epi32(xmm1, _mm_sad_epu8(_mm_loadu_si128((__m128i*)(src + src_stride + 32)), _mm_loadu_si128((__m128i*)(ref + ref_stride + 32))));
1780         src += src_stride << 1;
1781         ref += ref_stride << 1;
1782     }
1783     ymm0 = _mm256_add_epi32(ymm0, ymm1);
1784     xmm0 = _mm_add_epi32(xmm0, xmm1);
1785     xmm0 = _mm_add_epi32(xmm0, _mm_add_epi32(_mm256_extracti128_si256(ymm0, 0), _mm256_extracti128_si256(ymm0, 1)));
1786     xmm0 = _mm_add_epi32(xmm0, _mm_srli_si128(xmm0, 8));
1787     return _mm_extract_epi32(xmm0, 0);
1788 }
1789 
1790 /*******************************************************************************
1791 * Requirement: height % 2 = 0
1792 *******************************************************************************/
1793 uint32_t compute64x_m_sad_avx2_intrin(
1794     uint8_t  *src,         // input parameter, source samples Ptr
1795     uint32_t  src_stride,  // input parameter, source stride
1796     uint8_t  *ref,         // input parameter, reference samples Ptr
1797     uint32_t  ref_stride,  // input parameter, reference stride
1798     uint32_t  height,      // input parameter, block height (M)
1799     uint32_t  width)       // input parameter, block width (N)
1800 {
1801     __m128i xmm0;
1802     __m256i ymm0, ymm1, ymm2, ymm3;
1803     uint32_t y;
1804     (void)width;
1805 
1806     ymm0 = ymm1 = ymm2 = ymm3 = _mm256_setzero_si256();
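    /* Two 64-byte rows per iteration: four 256-bit SADs feed four independent
     * accumulators, which are merged and reduced to a single 32-bit total
     * after the loop. */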
1807     for (y = 0; y < height; y += 2) {
1808         ymm0 = _mm256_add_epi32(ymm0, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)src), _mm256_loadu_si256((__m256i*)ref)));
1809         ymm1 = _mm256_add_epi32(ymm1, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)(src + 32)), _mm256_loadu_si256((__m256i*)(ref + 32))));
1810         ymm2 = _mm256_add_epi32(ymm2, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)(src + src_stride)), _mm256_loadu_si256((__m256i*)(ref + ref_stride))));
1811         ymm3 = _mm256_add_epi32(ymm3, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)(src + src_stride + 32)), _mm256_loadu_si256((__m256i*)(ref + ref_stride + 32))));
1812         src += src_stride << 1;
1813         ref += ref_stride << 1;
1814     }
1815     ymm0 = _mm256_add_epi32(_mm256_add_epi32(ymm0, ymm1), _mm256_add_epi32(ymm2, ymm3));
1816     xmm0 = _mm_add_epi32(_mm256_castsi256_si128(ymm0), _mm256_extracti128_si256(ymm0, 1));
1817     xmm0 = _mm_add_epi32(xmm0, _mm_srli_si128(xmm0, 8));
1818     return _mm_extract_epi32(xmm0, 0);
1819 }
1820 
1821 #ifdef DISABLE_AVX512
1822 
1823 void eb_vp9_get_eight_horizontal_search_point_results_8x8_16x16_pu_avx2_intrin(
1824     uint8_t   *src,
1825     uint32_t   src_stride,
1826     uint8_t   *ref,
1827     uint32_t   ref_stride,
1828     uint32_t  *p_best_sad8x8,
1829     uint32_t  *p_best_mv8x8,
1830     uint32_t  *p_best_sad16x16,
1831     uint32_t  *p_best_mv16x16,
1832     uint32_t   mv,
1833     uint16_t  *p_sad16x16)
1834 {
1835 
1836     int16_t x_mv, y_mv;
1837     __m128i s3;
1838     __m128i sad_0, sad_1, sad_2, sad_3;
1839     __m256i ss0, ss1, ss2, ss3, ss4;
1840     uint32_t tem_sum;
1841 
1842     /*
1843     -------------------------------------   -----------------------------------
1844     | 8x8_00 | 8x8_01 | 8x8_04 | 8x8_05 |   8x8_16 | 8x8_17 | 8x8_20 | 8x8_21 |
1845     -------------------------------------   -----------------------------------
1846     | 8x8_02 | 8x8_03 | 8x8_06 | 8x8_07 |   8x8_18 | 8x8_19 | 8x8_22 | 8x8_23 |
1847     -----------------------   -----------   ----------------------   ----------
1848     | 8x8_08 | 8x8_09 | 8x8_12 | 8x8_13 |   8x8_24 | 8x8_25 | 8x8_28 | 8x8_29 |
1849     ----------------------    -----------   ---------------------    ----------
1850     | 8x8_10 | 8x8_11 | 8x8_14 | 8x8_15 |   8x8_26 | 8x8_27 | 8x8_30 | 8x8_31 |
1851     -------------------------------------   -----------------------------------
1852 
1853     -------------------------------------   -----------------------------------
1854     | 8x8_32 | 8x8_33 | 8x8_36 | 8x8_37 |   8x8_48 | 8x8_49 | 8x8_52 | 8x8_53 |
1855     -------------------------------------   -----------------------------------
1856     | 8x8_34 | 8x8_35 | 8x8_38 | 8x8_39 |   8x8_50 | 8x8_51 | 8x8_54 | 8x8_55 |
1857     -----------------------   -----------   ----------------------   ----------
1858     | 8x8_40 | 8x8_41 | 8x8_44 | 8x8_45 |   8x8_56 | 8x8_57 | 8x8_60 | 8x8_61 |
1859     ----------------------    -----------   ---------------------    ----------
1860     | 8x8_42 | 8x8_43 | 8x8_46 | 8x8_47 |   8x8_58 | 8x8_59 | 8x8_62 | 8x8_63 |
1861     -------------------------------------   -----------------------------------
1862     */
1863 
1864     /*
1865     ----------------------    ----------------------
1866     |  16x16_0  |  16x16_1  |  16x16_4  |  16x16_5  |
1867     ----------------------    ----------------------
1868     |  16x16_2  |  16x16_3  |  16x16_6  |  16x16_7  |
1869     -----------------------   -----------------------
1870     |  16x16_8  |  16x16_9  |  16x16_12 |  16x16_13 |
1871     ----------------------    ----------------------
1872     |  16x16_10 |  16x16_11 |  16x16_14 |  16x16_15 |
1873     -----------------------   -----------------------
1874     */
1875 
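    /* Each pair of adjacent 8x8 blocks is evaluated on every other row (four
     * of its eight rows), so the accumulated SADs are half of a full SAD;
     * the comparisons below compensate by shifting the sums left by one. */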
1876     //8x8_0 & 8x8_1
1877     ss0 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)ref), _mm_loadu_si128((__m128i*)(ref + 2 * ref_stride)));
1878     ss1 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)(ref + 8)), _mm_loadu_si128((__m128i*)(ref + 2 * ref_stride + 8)));
1879     ss2 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)src), _mm_loadu_si128((__m128i*)(src + 2 * src_stride)));
1880     ss3 = _mm256_mpsadbw_epu8(ss0, ss2, 0);
1881     ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1882     ss4 = _mm256_mpsadbw_epu8(ss1, ss2, 18);                         // 010 010
1883     ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1884     src += src_stride * 4;
1885     ref += ref_stride * 4;
1886     ss0 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)ref), _mm_loadu_si128((__m128i*)(ref + 2 * ref_stride)));
1887     ss1 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)(ref + 8)), _mm_loadu_si128((__m128i*)(ref + 2 * ref_stride + 8)));
1888     ss2 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)src), _mm_loadu_si128((__m128i*)(src + 2 * src_stride)));
1889     ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1890     ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1891     ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1892     ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1893     sad_0 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1894     sad_1 = _mm_adds_epu16(_mm256_extracti128_si256(ss4, 0), _mm256_extracti128_si256(ss4, 1));
1895 
1896     //8x8_2 & 8x8_3
1897     src += src_stride * 4;
1898     ref += ref_stride * 4;
1899     ss0 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)ref), _mm_loadu_si128((__m128i*)(ref + 2 * ref_stride)));
1900     ss1 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)(ref + 8)), _mm_loadu_si128((__m128i*)(ref + 2 * ref_stride + 8)));
1901     ss2 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)src), _mm_loadu_si128((__m128i*)(src + 2 * src_stride)));
1902     ss3 = _mm256_mpsadbw_epu8(ss0, ss2, 0);
1903     ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1904     ss4 = _mm256_mpsadbw_epu8(ss1, ss2, 18);                         // 010 010
1905     ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1906 
1907     src += src_stride * 4;
1908     ref += ref_stride * 4;
1909     ss0 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)ref), _mm_loadu_si128((__m128i*)(ref + 2 * ref_stride)));
1910     ss1 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)(ref + 8)), _mm_loadu_si128((__m128i*)(ref + 2 * ref_stride + 8)));
1911     ss2 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)src), _mm_loadu_si128((__m128i*)(src + 2 * src_stride)));
1912     ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1913     ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1914     ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1915     ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1916     sad_2 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1917     sad_3 = _mm_adds_epu16(_mm256_extracti128_si256(ss4, 0), _mm256_extracti128_si256(ss4, 1));
1918 
1919     //16x16
1920     s3 = _mm_adds_epu16(_mm_adds_epu16(sad_0, sad_1), _mm_adds_epu16(sad_2, sad_3));
1921     // store the eight 16x16 SADs (one per horizontal search point)
1922     _mm_store_si128((__m128i*)p_sad16x16, s3);
1923     //find the best for 16x16
1924     s3 = _mm_minpos_epu16(s3);
1925     tem_sum = _mm_extract_epi16(s3, 0) << 1;
1926     if (tem_sum <  p_best_sad16x16[0]) {
1927         p_best_sad16x16[0] = tem_sum;
1928         x_mv = _MVXT(mv) + (int16_t)(_mm_extract_epi16(s3, 1) * 4);
1929         y_mv = _MVYT(mv);
1930         p_best_mv16x16[0] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
1931     }
1932 
1933     //find the best for 8x8_0, 8x8_1, 8x8_2 & 8x8_3
1934     sad_0 = _mm_minpos_epu16(sad_0);
1935     sad_1 = _mm_minpos_epu16(sad_1);
1936     sad_2 = _mm_minpos_epu16(sad_2);
1937     sad_3 = _mm_minpos_epu16(sad_3);
1938     sad_0 = _mm_unpacklo_epi16(sad_0, sad_1);
1939     sad_2 = _mm_unpacklo_epi16(sad_2, sad_3);
1940     sad_0 = _mm_unpacklo_epi32(sad_0, sad_2);
1941     sad_1 = _mm_unpackhi_epi16(sad_0, _mm_setzero_si128());
1942     sad_0 = _mm_unpacklo_epi16(sad_0, _mm_setzero_si128());
1943     sad_0 = _mm_slli_epi32(sad_0, 1);
1944     sad_1 = _mm_slli_epi16(sad_1, 2);
1945     sad_2 = _mm_loadu_si128((__m128i*)p_best_sad8x8);
1946     s3 = _mm_cmpgt_epi32(sad_2, sad_0);
1947     sad_0 = _mm_min_epu32(sad_0, sad_2);
1948     _mm_storeu_si128((__m128i*)p_best_sad8x8, sad_0);
1949     sad_3 = _mm_loadu_si128((__m128i*)p_best_mv8x8);
1950     sad_3 = _mm_andnot_si128(s3, sad_3);
1951     sad_2 = _mm_set1_epi32(mv);
1952     sad_2 = _mm_add_epi16(sad_2, sad_1);
1953     sad_2 = _mm_and_si128(sad_2, s3);
1954     sad_2 = _mm_or_si128(sad_2, sad_3);
1955     _mm_storeu_si128((__m128i*)p_best_mv8x8, sad_2);
1956 
1957 }
1958 void eb_vp9_get_eight_horizontal_search_point_results_32x32_64x64_pu_avx2_intrin(
1959     uint16_t  *p_sad16x16,
1960     uint32_t  *p_best_sad32x32,
1961     uint32_t  *p_best_sad64x64,
1962     uint32_t  *p_best_mv32x32,
1963     uint32_t  *p_best_mv64x64,
1964     uint32_t   mv)
1965 {
1966     int16_t x_mv, y_mv;
1967     uint32_t tem_sum, best_sad64x64, best_mv_64x64;
1968     __m128i s0, s1, s2, s3, s4, s5, s6, s7, sad_0, sad_1;
1969     __m128i sad_00, sad_01, sad_10, sad_11, sad_20, sad_21, sad_30, sad_31;
1970     __m256i ss0, ss1, ss2, ss3, ss4, ss5, ss6, ss7;
1971 
1972     s0 = _mm_setzero_si128();
1973     s1 = _mm_setzero_si128();
1974     s2 = _mm_setzero_si128();
1975     s3 = _mm_setzero_si128();
1976     s4 = _mm_setzero_si128();
1977     s5 = _mm_setzero_si128();
1978     s6 = _mm_setzero_si128();
1979     s7 = _mm_setzero_si128();
1980     sad_0 = _mm_setzero_si128();
1981     sad_1 = _mm_setzero_si128();
1982 
1983     sad_00 = _mm_setzero_si128();
1984     sad_01 = _mm_setzero_si128();
1985     sad_10 = _mm_setzero_si128();
1986     sad_11 = _mm_setzero_si128();
1987     sad_20 = _mm_setzero_si128();
1988     sad_21 = _mm_setzero_si128();
1989     sad_30 = _mm_setzero_si128();
1990     sad_31 = _mm_setzero_si128();
1991 
1992     ss0 = _mm256_setzero_si256();
1993     ss1 = _mm256_setzero_si256();
1994     ss2 = _mm256_setzero_si256();
1995     ss3 = _mm256_setzero_si256();
1996     ss4 = _mm256_setzero_si256();
1997     ss5 = _mm256_setzero_si256();
1998     ss6 = _mm256_setzero_si256();
1999     ss7 = _mm256_setzero_si256();
2000 
2001     /*--------------------
2002     |  32x32_0  |  32x32_1
2003     ----------------------
2004     |  32x32_2  |  32x32_3
2005     ----------------------*/
2006 
2007     /*  data ordering in p_sad16x16 buffer
2008 
2009     Search    Search            Search
2010     Point 0   Point 1           Point 7
2011     ---------------------------------------
2012     16x16_0    |    x    |    x    | ...... |    x    |
2013     ---------------------------------------
2014     16x16_1    |    x    |    x    | ...... |    x    |
2015 
2016     16x16_n    |    x    |    x    | ...... |    x    |
2017 
2018     ---------------------------------------
2019     16x16_15   |    x    |    x    | ...... |    x    |
2020     ---------------------------------------
2021     */
2022 
2023     //    __m128i Zero = _mm_setzero_si128();
2024 
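    /* Each 32x32 SAD is the sum of four 16x16 SADs taken from p_sad16x16,
     * widened from 16 to 32 bits before adding; the four 32x32 sums are then
     * combined into the 64x64 totals. Results are doubled (<< 1) because the
     * 16x16 SADs were accumulated on every other row. */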
2025     //32x32_0
2026     s0 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 0 * 8));
2027     s1 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 1 * 8));
2028     s2 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 2 * 8));
2029     s3 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 3 * 8));
2030 
2031     s4 = _mm_unpackhi_epi16(s0, _mm_setzero_si128());
2032     s5 = _mm_unpacklo_epi16(s0, _mm_setzero_si128());
2033     s6 = _mm_unpackhi_epi16(s1, _mm_setzero_si128());
2034     s7 = _mm_unpacklo_epi16(s1, _mm_setzero_si128());
2035     s0 = _mm_add_epi32(s4, s6);
2036     s1 = _mm_add_epi32(s5, s7);
2037 
2038     s4 = _mm_unpackhi_epi16(s2, _mm_setzero_si128());
2039     s5 = _mm_unpacklo_epi16(s2, _mm_setzero_si128());
2040     s6 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
2041     s7 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
2042     s2 = _mm_add_epi32(s4, s6);
2043     s3 = _mm_add_epi32(s5, s7);
2044 
2045     sad_01 = _mm_add_epi32(s0, s2);
2046     sad_00 = _mm_add_epi32(s1, s3);
2047 
2048     //32x32_1
2049     s0 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 4 * 8));
2050     s1 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 5 * 8));
2051     s2 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 6 * 8));
2052     s3 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 7 * 8));
2053 
2054     s4 = _mm_unpackhi_epi16(s0, _mm_setzero_si128());
2055     s5 = _mm_unpacklo_epi16(s0, _mm_setzero_si128());
2056     s6 = _mm_unpackhi_epi16(s1, _mm_setzero_si128());
2057     s7 = _mm_unpacklo_epi16(s1, _mm_setzero_si128());
2058     s0 = _mm_add_epi32(s4, s6);
2059     s1 = _mm_add_epi32(s5, s7);
2060 
2061     s4 = _mm_unpackhi_epi16(s2, _mm_setzero_si128());
2062     s5 = _mm_unpacklo_epi16(s2, _mm_setzero_si128());
2063     s6 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
2064     s7 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
2065     s2 = _mm_add_epi32(s4, s6);
2066     s3 = _mm_add_epi32(s5, s7);
2067 
2068     sad_11 = _mm_add_epi32(s0, s2);
2069     sad_10 = _mm_add_epi32(s1, s3);
2070 
2071     //32x32_2
2072     s0 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 8 * 8));
2073     s1 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 9 * 8));
2074     s2 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 10 * 8));
2075     s3 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 11 * 8));
2076 
2077     s4 = _mm_unpackhi_epi16(s0, _mm_setzero_si128());
2078     s5 = _mm_unpacklo_epi16(s0, _mm_setzero_si128());
2079     s6 = _mm_unpackhi_epi16(s1, _mm_setzero_si128());
2080     s7 = _mm_unpacklo_epi16(s1, _mm_setzero_si128());
2081     s0 = _mm_add_epi32(s4, s6);
2082     s1 = _mm_add_epi32(s5, s7);
2083 
2084     s4 = _mm_unpackhi_epi16(s2, _mm_setzero_si128());
2085     s5 = _mm_unpacklo_epi16(s2, _mm_setzero_si128());
2086     s6 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
2087     s7 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
2088     s2 = _mm_add_epi32(s4, s6);
2089     s3 = _mm_add_epi32(s5, s7);
2090 
2091     sad_21 = _mm_add_epi32(s0, s2);
2092     sad_20 = _mm_add_epi32(s1, s3);
2093 
2094     //32x32_3
2095     s0 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 12 * 8));
2096     s1 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 13 * 8));
2097     s2 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 14 * 8));
2098     s3 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 15 * 8));
2099 
2100     s4 = _mm_unpackhi_epi16(s0, _mm_setzero_si128());
2101     s5 = _mm_unpacklo_epi16(s0, _mm_setzero_si128());
2102     s6 = _mm_unpackhi_epi16(s1, _mm_setzero_si128());
2103     s7 = _mm_unpacklo_epi16(s1, _mm_setzero_si128());
2104     s0 = _mm_add_epi32(s4, s6);
2105     s1 = _mm_add_epi32(s5, s7);
2106 
2107     s4 = _mm_unpackhi_epi16(s2, _mm_setzero_si128());
2108     s5 = _mm_unpacklo_epi16(s2, _mm_setzero_si128());
2109     s6 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
2110     s7 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
2111     s2 = _mm_add_epi32(s4, s6);
2112     s3 = _mm_add_epi32(s5, s7);
2113 
2114     sad_31 = _mm_add_epi32(s0, s2);
2115     sad_30 = _mm_add_epi32(s1, s3);
2116 
2117     sad_0 = _mm_add_epi32(_mm_add_epi32(sad_00, sad_10), _mm_add_epi32(sad_20, sad_30));
2118     sad_1 = _mm_add_epi32(_mm_add_epi32(sad_01, sad_11), _mm_add_epi32(sad_21, sad_31));
2119     sad_0 = _mm_slli_epi32(sad_0, 1);
2120     sad_1 = _mm_slli_epi32(sad_1, 1);

    best_sad64x64 = p_best_sad64x64[0];
    best_mv_64x64 = 0;
    // sad_0
    tem_sum = _mm_extract_epi32(sad_0, 0);
    if (tem_sum < best_sad64x64) {
        best_sad64x64 = tem_sum;
    }
    tem_sum = _mm_extract_epi32(sad_0, 1);
    if (tem_sum < best_sad64x64) {
        best_sad64x64 = tem_sum;
        best_mv_64x64 = 1 * 4;
    }
    tem_sum = _mm_extract_epi32(sad_0, 2);
    if (tem_sum < best_sad64x64) {
        best_sad64x64 = tem_sum;
        best_mv_64x64 = 2 * 4;
    }
    tem_sum = _mm_extract_epi32(sad_0, 3);
    if (tem_sum < best_sad64x64) {
        best_sad64x64 = tem_sum;
        best_mv_64x64 = 3 * 4;
    }

    // sad_1
    tem_sum = _mm_extract_epi32(sad_1, 0);
    if (tem_sum < best_sad64x64) {
        best_sad64x64 = tem_sum;
        best_mv_64x64 = 4 * 4;
    }
    tem_sum = _mm_extract_epi32(sad_1, 1);
    if (tem_sum < best_sad64x64) {
        best_sad64x64 = tem_sum;
        best_mv_64x64 = 5 * 4;
    }
    tem_sum = _mm_extract_epi32(sad_1, 2);
    if (tem_sum < best_sad64x64) {
        best_sad64x64 = tem_sum;
        best_mv_64x64 = 6 * 4;
    }
    tem_sum = _mm_extract_epi32(sad_1, 3);
    if (tem_sum < best_sad64x64) {
        best_sad64x64 = tem_sum;
        best_mv_64x64 = 7 * 4;
    }
    if (p_best_sad64x64[0] != best_sad64x64) {
        p_best_sad64x64[0] = best_sad64x64;
        x_mv = _MVXT(mv) + (int16_t)best_mv_64x64;  y_mv = _MVYT(mv);
        p_best_mv64x64[0] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
    }
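    // Packed MV format used here: x in the low 16 bits, y in the high 16 bits.
    // The eight candidates are 4 columns apart, so best_mv_64x64 is simply
    // (search index * 4). Scalar view of the update above (illustrative):
    //
    //   if (best_sad64x64 < p_best_sad64x64[0]) {
    //       p_best_sad64x64[0] = best_sad64x64;
    //       p_best_mv64x64[0]  = ((uint16_t)_MVYT(mv) << 16) |
    //                            (uint16_t)(_MVXT(mv) + best_mv_64x64);
    //   }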

    // ****CODE PAST HERE IS BUGGY FOR GCC****
    // (results differed between GCC and MSVC builds; see the DEBUG notes and the
    // __GNUC__ lane-swap workaround below)

    // XY
    // X: 32x32 block [0..3]
    // Y: Search position [0..7]
    ss0 = _mm256_setr_m128i(sad_00, sad_01); // 07 06 05 04  03 02 01 00
    ss1 = _mm256_setr_m128i(sad_10, sad_11); // 17 16 15 14  13 12 11 10
    ss2 = _mm256_setr_m128i(sad_20, sad_21); // 27 26 25 24  23 22 21 20
    ss3 = _mm256_setr_m128i(sad_30, sad_31); // 37 36 35 34  33 32 31 30
    ss4 = _mm256_unpacklo_epi32(ss0, ss1);   // 15 05 14 04  11 01 10 00
    ss5 = _mm256_unpacklo_epi32(ss2, ss3);   // 35 25 34 24  31 21 30 20
    ss6 = _mm256_unpackhi_epi32(ss0, ss1);   // 17 07 16 06  13 03 12 02
    ss7 = _mm256_unpackhi_epi32(ss2, ss3);   // 37 27 36 26  33 23 32 22
    ss0 = _mm256_unpacklo_epi64(ss4, ss5);   // 34 24 14 04  30 20 10 00
    ss1 = _mm256_unpackhi_epi64(ss4, ss5);   // 35 25 15 05  31 21 11 01
    ss2 = _mm256_unpacklo_epi64(ss6, ss7);   // 36 26 16 06  32 22 12 02
    ss3 = _mm256_unpackhi_epi64(ss6, ss7);   // 37 27 17 07  33 23 13 03

    //   ss4   |  ss5-2  |                ss6        |
    // a0 : a1 | a2 : a3 | min(a0, a1) : min(a2, a3) |    | (ss4 & !ss6) | ((ss5-2) & ss6) | ((ss4 & !ss6) | ((ss5-2) & ss6)) |
    // > (-1)  | >  (-3) |         >     (-1)        | a3 |       0      |       -3        |              -3                  |
    // > (-1)  | >  (-3) |         <=     (0)        | a1 |      -1      |        0        |              -1                  |
    // > (-1)  | <= (-2) |         >     (-1)        | a2 |       0      |       -2        |              -2                  |
    // > (-1)  | <= (-2) |         <=     (0)        | a1 |      -1      |        0        |              -1                  |
    // <= (0)  | >  (-3) |         >     (-1)        | a3 |       0      |       -3        |              -3                  |
    // <= (0)  | >  (-3) |         <=     (0)        | a0 |       0      |        0        |               0                  |
    // <= (0)  | <= (-2) |         >     (-1)        | a2 |       0      |       -2        |              -2                  |
    // <= (0)  | <= (-2) |         <=     (0)        | a0 |       0      |        0        |               0                  |

    // *** 8 search points per position ***

    // ss0: Search Pos 0,4 for blocks 0,1,2,3
    // ss1: Search Pos 1,5 for blocks 0,1,2,3
    // ss2: Search Pos 2,6 for blocks 0,1,2,3
    // ss3: Search Pos 3,7 for blocks 0,1,2,3

    ss4 = _mm256_cmpgt_epi32(ss0, ss1);
    // not different SVT_LOG("%d\n", _mm_extract_epi32(_mm256_extracti128_si256(ss4, 0), 0)); // DEBUG
    //ss4 = _mm256_or_si256(_mm256_cmpgt_epi32(ss0, ss1), _mm256_cmpeq_epi32(ss0, ss1));
    ss0 = _mm256_min_epi32(ss0, ss1);
    ss5 = _mm256_cmpgt_epi32(ss2, ss3);
    //ss5 = _mm256_or_si256(_mm256_cmpgt_epi32(ss2, ss3), _mm256_cmpeq_epi32(ss2, ss3));
    ss2 = _mm256_min_epi32(ss2, ss3);
    ss5 = _mm256_sub_epi32(ss5, _mm256_set1_epi32(2)); // ss5-2

    // *** 4 search points per position ***
    ss6 = _mm256_cmpgt_epi32(ss0, ss2);
    //ss6 = _mm256_or_si256(_mm256_cmpgt_epi32(ss0, ss2), _mm256_cmpeq_epi32(ss0, ss2));
    ss0 = _mm256_min_epi32(ss0, ss2);
    ss5 = _mm256_and_si256(ss5, ss6); // (ss5-2) & ss6
    ss4 = _mm256_andnot_si256(ss6, ss4); // ss4 & !ss6
    ss4 = _mm256_or_si256(ss4, ss5); // (ss4 & !ss6) | ((ss5-2) & ss6)
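
    // What the mask arithmetic above computes, per 32-bit lane: ss0 becomes
    // min(a0, a1, a2, a3) and ss4 becomes minus the index of that minimum,
    // encoded as 0 / -1 / -2 / -3. Rough scalar equivalent (illustrative):
    //
    //   m01 = (a0 > a1) ? -1 : 0;            // ss4
    //   m23 = ((a2 > a3) ? -1 : 0) - 2;      // ss5 - 2  ->  -2 or -3
    //   lo  = (a0 < a1) ? a0 : a1;
    //   hi  = (a2 < a3) ? a2 : a3;
    //   m   = (lo > hi) ? -1 : 0;            // ss6
    //   best   = (lo < hi) ? lo : hi;        // ss0
    //   negidx = (m01 & ~m) | (m23 & m);     // ss4 == -(argmin of a0..a3)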

    // *** 2 search points per position ***
    s0 = _mm_setzero_si128();
    s1 = _mm_setzero_si128();
    s2 = _mm_setzero_si128();
    s3 = _mm_setzero_si128();
    s4 = _mm_setzero_si128();
    s5 = _mm_setzero_si128();
    s6 = _mm_setzero_si128();
    s7 = _mm_setzero_si128();

    // ss0 = 8 SADs, two search points for each 32x32
    // ss4 = 8 MVs, two search points for each 32x32
    //
    // XY
    // X: 32x32 block [0..3]
    // Y: search position [0..1]
    // Format: 00 10 20 30  01 11 21 31

    // Each 128-bit lane holds the 32-bit results for the four 32x32 blocks
#ifdef __GNUC__
    // SAD
    s0 = _mm256_extracti128_si256(ss0, 1);
    s1 = _mm256_extracti128_si256(ss0, 0);
    // MV
    s2 = _mm256_extracti128_si256(ss4, 1);
    s3 = _mm256_extracti128_si256(ss4, 0);
#else
    // SAD
    s0 = _mm256_extracti128_si256(ss0, 0);
    s1 = _mm256_extracti128_si256(ss0, 1);
    // MV
    s2 = _mm256_extracti128_si256(ss4, 0);
    s3 = _mm256_extracti128_si256(ss4, 1);
#endif
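
    // The two branches above differ only in which 128-bit lane is read first,
    // compensating for the compiler-dependent lane ordering observed in the
    // DEBUG traces below. A plausible (unconfirmed) explanation is that
    // _mm256_setr_m128i was not provided by the GCC versions in use and was
    // emulated with the lane order of _mm256_set_m128i; treat this as a best
    // guess rather than a verified root cause.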

    //// Should be fine
    //SVT_LOG("sad0 %d, %d, %d, %d\n", _mm_extract_epi32(s0, 0), _mm_extract_epi32(s0, 1), _mm_extract_epi32(s0, 2), _mm_extract_epi32(s0, 3)); // DEBUG
    //SVT_LOG("sad1 %d, %d, %d, %d\n", _mm_extract_epi32(s1, 0), _mm_extract_epi32(s1, 1), _mm_extract_epi32(s1, 2), _mm_extract_epi32(s1, 3)); // DEBUG
    //SVT_LOG("mv0 %d, %d, %d, %d\n", _mm_extract_epi32(s2, 0), _mm_extract_epi32(s2, 1), _mm_extract_epi32(s2, 2), _mm_extract_epi32(s2, 3)); // DEBUG
    //SVT_LOG("mv1 %d, %d, %d, %d\n", _mm_extract_epi32(s3, 0), _mm_extract_epi32(s3, 1), _mm_extract_epi32(s3, 2), _mm_extract_epi32(s3, 3)); // DEBUG

    // Choose the better of the two remaining candidates per 32x32; s4 holds the comparison mask
    s4 = _mm_cmpgt_epi32(s0, s1);

    // DIFFERENT BETWEEN VS AND GCC
    // SVT_LOG("%d, %d, %d, %d\n", _mm_extract_epi32(s4, 0), _mm_extract_epi32(s4, 1), _mm_extract_epi32(s4, 2), _mm_extract_epi32(s4, 3)); // DEBUG

    //s4 = _mm_or_si128(_mm_cmpgt_epi32(s0, s1), _mm_cmpeq_epi32(s0, s1));
    s0 = _mm_min_epi32(s0, s1);

    // Select, per 32x32 block, the MV code of the winning search-point group into s2
    s3 = _mm_sub_epi32(s3, _mm_set1_epi32(4)); // s3 - 4
    // Remove the MVs that are not used from s2
    s2 = _mm_andnot_si128(s4, s2);
    // Remove the MVs that are not used from s3 (the inverse of the s2 operation above)
    s3 = _mm_and_si128(s4, s3);
    // Combine the remaining candidates into s2
    s2 = _mm_or_si128(s2, s3);
    // Convert the MVs into the encoder's format
    s2 = _mm_sub_epi32(_mm_setzero_si128(), s2);
    s2 = _mm_slli_epi32(s2, 2); // mv info
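
    // At this point each 32-bit lane of s2 holds (winning search index) * 4,
    // i.e. the x offset in pels from the base MV to the best candidate (the 8
    // candidates are 4 columns apart, matching best_mv_64x64 above).
    // Scalar view (illustrative): x_offset[blk] = 4 * argmin_pos(sad32x32[blk][0..7]);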

    // ***SAD***
    // s0: current SAD candidates for each 32x32
    // s1: best SADs for each 32x32

    // << 1 to compensate for every other line
    s0 = _mm_slli_epi32(s0, 1); // best sad info
    // Load the previous best SADs
    s1 = _mm_loadu_si128((__m128i*)p_best_sad32x32);

    // Determine which candidates are better than the current best SADs;
    // s4 is later used to select the MVs of the new best SADs
    s4 = _mm_cmpgt_epi32(s1, s0);
    // not different SVT_LOG("%d, %d, %d, %d\n", _mm_extract_epi32(s4, 0), _mm_extract_epi32(s4, 1), _mm_extract_epi32(s4, 2), _mm_extract_epi32(s4, 3)); // DEBUG
    //s4 = _mm_or_si128(_mm_cmpgt_epi32(s1, s0), _mm_cmpeq_epi32(s1, s0));
    // Keep the minimum of the old and new SADs
    s0 = _mm_min_epu32(s0, s1);
    // Store the new best SADs back to memory
    _mm_storeu_si128((__m128i*)p_best_sad32x32, s0);
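
    // Branchless best-SAD update pattern used here, in rough scalar form
    // (illustrative names only):
    //
    //   for (blk = 0; blk < 4; blk++) {
    //       uint32_t cand = 2 * best_of_8[blk];              // s0 after the << 1
    //       replace[blk]  = (p_best_sad32x32[blk] > cand);   // s4 mask
    //       p_best_sad32x32[blk] = replace[blk] ? cand : p_best_sad32x32[blk];
    //   }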

    // ***Motion Vectors***
    // Load the best MVs
    // s3: candidate MVs
    // s4: results of comparing the SADs
    // s5: previous best MVs

    // Load the previous best MVs
    s5 = _mm_loadu_si128((__m128i*)p_best_mv32x32);
    // Remove the MVs that are being replaced
    s5 = _mm_andnot_si128(s4, s5);
    // Set s3 to the base MV
    s3 = _mm_set1_epi32(mv);
    // Add the candidate MV offsets to the base MV
    s3 = _mm_add_epi16(s3, s2);
    // Remove the non-candidates
    s3 = _mm_and_si128(s3, s4);
    // Combine the remaining candidates with the remaining best MVs
    s3 = _mm_or_si128(s3, s5);
    // Store back to memory
    _mm_storeu_si128((__m128i*)p_best_mv32x32, s3);
}
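
/*
 * Illustrative scalar sketch (not part of the original code and not compiled)
 * of what the kernel above computes from the 16-bit 16x16 SADs: per-position
 * 32x32 and 64x64 SADs, doubled to account for the skipped lines, followed by
 * the predicated best-SAD / best-MV updates. The function name, parameter
 * types and array shapes below are hypothetical; the real kernel keeps
 * everything in SSE/AVX2 registers.
 */
#if 0
static void get_eight_search_point_results_scalar_sketch(
    const uint16_t *p_sad16x16,      /* hypothetical: [16 blocks][8 positions] */
    uint32_t *p_best_sad32x32, uint32_t *p_best_mv32x32,
    uint32_t *p_best_sad64x64, uint32_t *p_best_mv64x64,
    uint32_t  mv)                    /* packed base MV: (y << 16) | x          */
{
    uint32_t sad32[4][8], sad64[8];
    int      blk, pos, b;

    /* Sum the four 16x16 SADs of each 32x32 block, per search position */
    for (blk = 0; blk < 4; blk++)
        for (pos = 0; pos < 8; pos++) {
            uint32_t sum = 0;
            for (b = 0; b < 4; b++)
                sum += p_sad16x16[(4 * blk + b) * 8 + pos];
            sad32[blk][pos] = sum;
        }

    /* 64x64 SADs, doubled to compensate for every-other-line accumulation */
    for (pos = 0; pos < 8; pos++)
        sad64[pos] = 2 * (sad32[0][pos] + sad32[1][pos] +
                          sad32[2][pos] + sad32[3][pos]);

    /* Best 32x32 update: earliest position wins ties, x step is 4 columns */
    for (blk = 0; blk < 4; blk++) {
        int best_pos = 0;
        for (pos = 1; pos < 8; pos++)
            if (sad32[blk][pos] < sad32[blk][best_pos]) best_pos = pos;
        if (2 * sad32[blk][best_pos] < p_best_sad32x32[blk]) {
            p_best_sad32x32[blk] = 2 * sad32[blk][best_pos];
            p_best_mv32x32[blk]  = ((uint16_t)_MVYT(mv) << 16) |
                                   (uint16_t)(_MVXT(mv) + 4 * best_pos);
        }
    }

    /* Best 64x64 update */
    {
        int best_pos = 0;
        for (pos = 1; pos < 8; pos++)
            if (sad64[pos] < sad64[best_pos]) best_pos = pos;
        if (sad64[best_pos] < p_best_sad64x64[0]) {
            p_best_sad64x64[0] = sad64[best_pos];
            p_best_mv64x64[0]  = ((uint16_t)_MVYT(mv) << 16) |
                                 (uint16_t)(_MVXT(mv) + 4 * best_pos);
        }
    }
}
#endif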

#endif