/*
* Copyright(c) 2019 Intel Corporation
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/

#include "EbComputeSAD_SSE4_1.h"
#include "EbDefinitions.h"
#include "smmintrin.h"
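/* Scalar fallback used on the 32-bit overflow paths below: extracts one of
   the four packed 32-bit SADs from xmm register s and, if it beats the best
   SAD seen so far, records it together with its search position. k is the
   lane index and offset rebases the lane onto the search column j. */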
#define UPDATE_BEST(s, k, offset) \
  tem_sum = _mm_extract_epi32(s, k); \
  if (tem_sum < low_sum) { \
    low_sum = tem_sum; \
    x_best = j + offset + k; \
    y_best = i; \
  }

/*******************************************************************************
* Requirement: width   = 4, 8, 16, 24, 32, 48 or 64
* Requirement: height <= 64
* Requirement: height % 2 = 0 when width = 4 or 8
*******************************************************************************/
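/*
 * Implementation note: each _mm_mpsadbw_epu8(ref, src, imm) produces eight
 * 16-bit SADs of one 4-byte group of src against eight consecutive 4-byte
 * windows of ref, i.e. one instruction evaluates 8 horizontal search
 * positions at once. Pairing immediates (0, 5) covers an 8-pixel-wide row;
 * (0, 5) against ref plus (2, 7) against ref + 8 covers 16 pixels. Row
 * results are accumulated with saturating 16-bit adds and the best of the
 * 8 positions is found with _mm_minpos_epu16; wider/taller blocks add
 * 32-bit widening steps where the 16-bit accumulators could saturate.
 */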
void eb_vp9_sad_loop_kernel_sse4_1_hme_l0_intrin(
    uint8_t  *src,                       // input parameter, source samples pointer
    uint32_t  src_stride,                // input parameter, source stride
    uint8_t  *ref,                       // input parameter, reference samples pointer
    uint32_t  ref_stride,                // input parameter, reference stride
    uint32_t  height,                    // input parameter, block height (M)
    uint32_t  width,                     // input parameter, block width (N)
    uint64_t *best_sad,
    int16_t  *x_search_center,
    int16_t  *y_search_center,
    uint32_t  src_stride_raw,            // input parameter, source stride (no line skipping)
    int16_t   search_area_width,
    int16_t   search_area_height)
{
    int16_t x_best = *x_search_center, y_best = *y_search_center;
    uint32_t low_sum = 0xffffff;
    uint32_t tem_sum = 0;
    int16_t i, j;
    uint32_t k, l;
    const uint8_t *p_ref, *p_src;
    __m128i s0, s1, s2, s3, s4, s5, s6, s7, s9, s10, s11;

    switch (width) {
    case 4:
        for (i = 0; i < search_area_height; i++) {
            for (j = 0; j <= search_area_width - 8; j += 8) {
                p_src = src;
                p_ref = ref + j;
                s3 = _mm_setzero_si128();
                for (k = 0; k < height; k += 2) {
                    s0 = _mm_loadu_si128((__m128i*)p_ref);
                    s1 = _mm_loadu_si128((__m128i*)(p_ref + ref_stride));
                    s2 = _mm_cvtsi32_si128(*(uint32_t *)p_src);
                    s5 = _mm_cvtsi32_si128(*(uint32_t *)(p_src + src_stride));
                    s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                    s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
                    p_src += src_stride << 1;
                    p_ref += ref_stride << 1;
                }
                s3 = _mm_minpos_epu16(s3);
                tem_sum = _mm_extract_epi16(s3, 0);
                if (tem_sum < low_sum) {
                    low_sum = tem_sum;
                    x_best = (int16_t)(j + _mm_extract_epi16(s3, 1));
                    y_best = i;
                }
            }
            ref += src_stride_raw;
        }
        break;

    case 8:
        for (i = 0; i < search_area_height; i++) {
            for (j = 0; j <= search_area_width - 8; j += 8) {
                p_src = src;
                p_ref = ref + j;
                s3 = s4 = _mm_setzero_si128();
                for (k = 0; k < height; k += 2) {
                    s0 = _mm_loadu_si128((__m128i*)p_ref);
                    s1 = _mm_loadu_si128((__m128i*)(p_ref + ref_stride));
                    s2 = _mm_loadl_epi64((__m128i*)p_src);
                    s5 = _mm_loadl_epi64((__m128i*)(p_src + src_stride));
                    s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                    s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                    s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
                    s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));
                    p_src += src_stride << 1;
                    p_ref += ref_stride << 1;
                }
                s3 = _mm_adds_epu16(s3, s4);
                s3 = _mm_minpos_epu16(s3);
                tem_sum = _mm_extract_epi16(s3, 0);
                if (tem_sum < low_sum) {
                    low_sum = tem_sum;
                    x_best = (int16_t)(j + _mm_extract_epi16(s3, 1));
                    y_best = i;
                }
            }

            ref += src_stride_raw;
        }
        break;

    case 16:
        if (height <= 16) {
            for (i = 0; i < search_area_height; i++) {
                for (j = 0; j <= search_area_width - 16; j += 16) {
                    p_src = src;
                    p_ref = ref + j;
                    s3 = s4 = s5 = s6 = _mm_setzero_si128();
                    s7 = s9 = s10 = s11 = _mm_setzero_si128();
                    for (k = 0; k < height; k++) {
                        s0 = _mm_loadu_si128((__m128i*)p_ref);
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 8));
                        s2 = _mm_loadu_si128((__m128i*)p_src);
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 16));
                        s7 = _mm_adds_epu16(s7, _mm_mpsadbw_epu8(s1, s2, 0));
                        s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 5));
                        s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 2));
                        s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 7));
                        p_src += src_stride;
                        p_ref += ref_stride;
                    }
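                    /* This branch evaluates 16 positions per iteration
                       (j += 16): s3..s6 hold the SADs for columns j..j+7,
                       s7/s11/s9/s10 those for columns j+8..j+15. */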
                    s3 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
                    s3 = _mm_minpos_epu16(s3);
                    tem_sum = _mm_extract_epi16(s3, 0);
                    if (tem_sum < low_sum) {
                        low_sum = tem_sum;
                        x_best = (int16_t)(j + _mm_extract_epi16(s3, 1));
                        y_best = i;
                    }

                    s7 = _mm_adds_epu16(_mm_adds_epu16(s7, s11), _mm_adds_epu16(s9, s10));
                    s7 = _mm_minpos_epu16(s7);
                    tem_sum = _mm_extract_epi16(s7, 0);
                    if (tem_sum < low_sum) {
                        low_sum = tem_sum;
                        x_best = (int16_t)(j + 8 + _mm_extract_epi16(s7, 1));
                        y_best = i;
                    }
                }

                ref += src_stride_raw;
            }
        }
        else if (height <= 32) {
            for (i = 0; i < search_area_height; i++) {
                for (j = 0; j <= search_area_width - 8; j += 8) {
                    p_src = src;
                    p_ref = ref + j;
                    s3 = s4 = s5 = s6 = _mm_setzero_si128();
                    for (k = 0; k < height; k++) {
                        s0 = _mm_loadu_si128((__m128i*)p_ref);
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 8));
                        s2 = _mm_loadu_si128((__m128i*)p_src);
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        p_src += src_stride;
                        p_ref += ref_stride;
                    }
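                    /* The two 8-column halves could together exceed 16 bits
                       (up to 2 * 8 * 32 * 255). Subtract each half's broadcast
                       minimum before combining so the packed sum stays in
                       range, then add the two scalar minima back to the
                       winning value. */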
                    s3 = _mm_adds_epu16(s3, s4);
                    s5 = _mm_adds_epu16(s5, s6);
                    s4 = _mm_minpos_epu16(s3);
                    s6 = _mm_minpos_epu16(s5);
                    s4 = _mm_unpacklo_epi16(s4, s4);
                    s4 = _mm_unpacklo_epi32(s4, s4);
                    s4 = _mm_unpacklo_epi64(s4, s4);
                    s6 = _mm_unpacklo_epi16(s6, s6);
                    s6 = _mm_unpacklo_epi32(s6, s6);
                    s6 = _mm_unpacklo_epi64(s6, s6);
                    s3 = _mm_sub_epi16(s3, s4);
                    s5 = _mm_adds_epu16(s5, s3);
                    s5 = _mm_sub_epi16(s5, s6);
                    s5 = _mm_minpos_epu16(s5);
                    tem_sum = _mm_extract_epi16(s5, 0);
                    tem_sum += _mm_extract_epi16(s4, 0);
                    tem_sum += _mm_extract_epi16(s6, 0);
                    if (tem_sum < low_sum) {
                        low_sum = tem_sum;
                        x_best = (int16_t)(j + _mm_extract_epi16(s5, 1));
                        y_best = i;
                    }
                }

                ref += src_stride_raw;
            }
        }
        else {
            for (i = 0; i < search_area_height; i++) {
                for (j = 0; j <= search_area_width - 8; j += 8) {
                    p_src = src;
                    p_ref = ref + j;
                    s3 = s4 = s5 = s6 = _mm_setzero_si128();
                    for (k = 0; k < height; k++) {
                        s0 = _mm_loadu_si128((__m128i*)p_ref);
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 8));
                        s2 = _mm_loadu_si128((__m128i*)p_src);
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        p_src += src_stride;
                        p_ref += ref_stride;
                    }
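                    /* Saturating 16-bit accumulation clamps at 0xFFFF, so a
                       minpos result of 0xFFFF is ambiguous. On that path the
                       four accumulators are widened to 32 bits and all eight
                       positions are rechecked exactly via UPDATE_BEST. */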
                    s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
                    s0 = _mm_minpos_epu16(s0);
                    tem_sum = _mm_extract_epi16(s0, 0);
                    if (tem_sum < low_sum) {
                        if (tem_sum != 0xFFFF) { // no overflow
                            low_sum = tem_sum;
                            x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
                            y_best = i;
                        }
                        else {
                            s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
                            s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
                            s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
                            s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
                            s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
                            s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
                            s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
                            s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
                            s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
                            s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
                            UPDATE_BEST(s0, 0, 0);
                            UPDATE_BEST(s0, 1, 0);
                            UPDATE_BEST(s0, 2, 0);
                            UPDATE_BEST(s0, 3, 0);
                            UPDATE_BEST(s3, 0, 4);
                            UPDATE_BEST(s3, 1, 4);
                            UPDATE_BEST(s3, 2, 4);
                            UPDATE_BEST(s3, 3, 4);
                        }
                    }
                }
                ref += src_stride_raw;
            }
        }
        break;

    case 24:
        if (height <= 16) {
            for (i = 0; i < search_area_height; i++) {
                for (j = 0; j <= search_area_width - 8; j += 8) {
                    p_src = src;
                    p_ref = ref + j;
                    s3 = s4 = s5 = s6 = _mm_setzero_si128();
                    for (k = 0; k < height; k++) {
                        s0 = _mm_loadu_si128((__m128i*)p_ref);
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 8));
                        s2 = _mm_loadu_si128((__m128i*)p_src);
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 16));
                        s2 = _mm_loadl_epi64((__m128i*)(p_src + 16));
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        p_src += src_stride;
                        p_ref += ref_stride;
                    }
                    s3 = _mm_adds_epu16(s3, s4);
                    s5 = _mm_adds_epu16(s5, s6);
                    s4 = _mm_minpos_epu16(s3);
                    s6 = _mm_minpos_epu16(s5);
                    s4 = _mm_unpacklo_epi16(s4, s4);
                    s4 = _mm_unpacklo_epi32(s4, s4);
                    s4 = _mm_unpacklo_epi64(s4, s4);
                    s6 = _mm_unpacklo_epi16(s6, s6);
                    s6 = _mm_unpacklo_epi32(s6, s6);
                    s6 = _mm_unpacklo_epi64(s6, s6);
                    s3 = _mm_sub_epi16(s3, s4);
                    s5 = _mm_adds_epu16(s5, s3);
                    s5 = _mm_sub_epi16(s5, s6);
                    s5 = _mm_minpos_epu16(s5);
                    tem_sum = _mm_extract_epi16(s5, 0);
                    tem_sum += _mm_extract_epi16(s4, 0);
                    tem_sum += _mm_extract_epi16(s6, 0);
                    if (tem_sum < low_sum) {
                        low_sum = tem_sum;
                        x_best = (int16_t)(j + _mm_extract_epi16(s5, 1));
                        y_best = i;
                    }
                }
                ref += src_stride_raw;
            }
        }
        else {
            for (i = 0; i < search_area_height; i++) {
                for (j = 0; j <= search_area_width - 8; j += 8) {
                    p_src = src;
                    p_ref = ref + j;
                    s3 = s4 = s5 = s6 = _mm_setzero_si128();
                    for (k = 0; k < height; k++) {
                        s0 = _mm_loadu_si128((__m128i*)p_ref);
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 8));
                        s2 = _mm_loadu_si128((__m128i*)p_src);
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 16));
                        s2 = _mm_loadl_epi64((__m128i*)(p_src + 16));
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        p_src += src_stride;
                        p_ref += ref_stride;
                    }
                    s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
                    s0 = _mm_minpos_epu16(s0);
                    tem_sum = _mm_extract_epi16(s0, 0);
                    if (tem_sum < low_sum) {
                        if (tem_sum != 0xFFFF) { // no overflow
                            low_sum = tem_sum;
                            x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
                            y_best = i;
                        }
                        else {
                            s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
                            s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
                            s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
                            s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
                            s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
                            s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
                            s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
                            s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
                            s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
                            s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
                            UPDATE_BEST(s0, 0, 0);
                            UPDATE_BEST(s0, 1, 0);
                            UPDATE_BEST(s0, 2, 0);
                            UPDATE_BEST(s0, 3, 0);
                            UPDATE_BEST(s3, 0, 4);
                            UPDATE_BEST(s3, 1, 4);
                            UPDATE_BEST(s3, 2, 4);
                            UPDATE_BEST(s3, 3, 4);
                        }
                    }
                }

                ref += src_stride_raw;
            }
        }
        break;

    case 32:
        if (height <= 16) {
            for (i = 0; i < search_area_height; i++) {
                for (j = 0; j <= search_area_width - 8; j += 8) {
                    p_src = src;
                    p_ref = ref + j;
                    s3 = s4 = s5 = s6 = _mm_setzero_si128();
                    for (k = 0; k < height; k++) {
                        s0 = _mm_loadu_si128((__m128i*)p_ref);
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 8));
                        s2 = _mm_loadu_si128((__m128i*)p_src);
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 16));
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 24));
                        s2 = _mm_loadu_si128((__m128i*)(p_src + 16));
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        p_src += src_stride;
                        p_ref += ref_stride;
                    }
                    s3 = _mm_adds_epu16(s3, s4);
                    s5 = _mm_adds_epu16(s5, s6);
                    s4 = _mm_minpos_epu16(s3);
                    s6 = _mm_minpos_epu16(s5);
                    s4 = _mm_unpacklo_epi16(s4, s4);
                    s4 = _mm_unpacklo_epi32(s4, s4);
                    s4 = _mm_unpacklo_epi64(s4, s4);
                    s6 = _mm_unpacklo_epi16(s6, s6);
                    s6 = _mm_unpacklo_epi32(s6, s6);
                    s6 = _mm_unpacklo_epi64(s6, s6);
                    s3 = _mm_sub_epi16(s3, s4);
                    s5 = _mm_adds_epu16(s5, s3);
                    s5 = _mm_sub_epi16(s5, s6);
                    s5 = _mm_minpos_epu16(s5);
                    tem_sum = _mm_extract_epi16(s5, 0);
                    tem_sum += _mm_extract_epi16(s4, 0);
                    tem_sum += _mm_extract_epi16(s6, 0);
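                    /* Mask appears defensive; _mm_extract_epi16 already
                       zero-extends its 16-bit result. */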
                    tem_sum &= 0x0000FFFF;
                    if (tem_sum < low_sum) {
                        low_sum = tem_sum;
                        x_best = (int16_t)(j + _mm_extract_epi16(s5, 1));
                        y_best = i;
                    }
                }

                ref += src_stride_raw;
            }
        }
        else if (height <= 32) {
            for (i = 0; i < search_area_height; i++) {
                for (j = 0; j <= search_area_width - 8; j += 8) {
                    p_src = src;
                    p_ref = ref + j;
                    s3 = s4 = s5 = s6 = _mm_setzero_si128();
                    for (k = 0; k < height; k++) {
                        s0 = _mm_loadu_si128((__m128i*)p_ref);
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 8));
                        s2 = _mm_loadu_si128((__m128i*)p_src);
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 16));
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 24));
                        s2 = _mm_loadu_si128((__m128i*)(p_src + 16));
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        p_src += src_stride;
                        p_ref += ref_stride;
                    }
                    s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
                    s0 = _mm_minpos_epu16(s0);
                    tem_sum = _mm_extract_epi16(s0, 0);
                    tem_sum &= 0x0000FFFF;
                    if (tem_sum < low_sum) {
                        if (tem_sum != 0xFFFF) { // no overflow
                            low_sum = tem_sum;
                            x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
                            y_best = i;
                        }
                        else {
                            s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
                            s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
                            s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
                            s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
                            s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
                            s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
                            s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
                            s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
                            s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
                            s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
                            UPDATE_BEST(s0, 0, 0);
                            UPDATE_BEST(s0, 1, 0);
                            UPDATE_BEST(s0, 2, 0);
                            UPDATE_BEST(s0, 3, 0);
                            UPDATE_BEST(s3, 0, 4);
                            UPDATE_BEST(s3, 1, 4);
                            UPDATE_BEST(s3, 2, 4);
                            UPDATE_BEST(s3, 3, 4);
                        }
                    }
                }
                ref += src_stride_raw;
            }
        }
        else {
            __m128i s9, s10, s11, s12;
            for (i = 0; i < search_area_height; i++) {
                for (j = 0; j <= search_area_width - 8; j += 8) {
                    p_src = src;
                    p_ref = ref + j;
                    s3 = s4 = s5 = s6 = _mm_setzero_si128();
                    for (k = 0; k < (height >> 1); k++) {
                        s0 = _mm_loadu_si128((__m128i*)p_ref);
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 8));
                        s2 = _mm_loadu_si128((__m128i*)p_src);
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 16));
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 24));
                        s2 = _mm_loadu_si128((__m128i*)(p_src + 16));
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        p_src += src_stride;
                        p_ref += ref_stride;
                    }
                    s9 = s10 = s11 = s12 = _mm_setzero_si128();
                    for (; k < height; k++) {
                        s0 = _mm_loadu_si128((__m128i*)p_ref);
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 8));
                        s2 = _mm_loadu_si128((__m128i*)p_src);
                        s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
                        s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
                        s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
                        s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 16));
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 24));
                        s2 = _mm_loadu_si128((__m128i*)(p_src + 16));
                        s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
                        s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
                        s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
                        s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
                        p_src += src_stride;
                        p_ref += ref_stride;
                    }
                    s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
                    s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
                    s0 = _mm_minpos_epu16(s0);
                    tem_sum = _mm_extract_epi16(s0, 0);
                    tem_sum &= 0x0000FFFF;
                    if (tem_sum < low_sum) {
                        if (tem_sum != 0xFFFF) { // no overflow
                            low_sum = tem_sum;
                            x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
                            y_best = i;
                        }
                        else {
                            s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
                            s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
                            s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
                            s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
                            s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
                            s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
                            s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
                            s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
                            s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
                            s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
                            s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
                            s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
                            s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
                            s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
                            s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
                            s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
                            s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
                            s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
                            s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
                            s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
                            UPDATE_BEST(s0, 0, 0);
                            UPDATE_BEST(s0, 1, 0);
                            UPDATE_BEST(s0, 2, 0);
                            UPDATE_BEST(s0, 3, 0);
                            UPDATE_BEST(s3, 0, 4);
                            UPDATE_BEST(s3, 1, 4);
                            UPDATE_BEST(s3, 2, 4);
                            UPDATE_BEST(s3, 3, 4);
                        }
                    }
                }

                ref += src_stride_raw;
            }
        }
        break;

    case 48:
        if (height <= 32) {
            __m128i s9, s10, s11, s12;
            for (i = 0; i < search_area_height; i++) {
                for (j = 0; j <= search_area_width - 8; j += 8) {
                    p_src = src;
                    p_ref = ref + j;
                    s3 = s4 = s5 = s6 = _mm_setzero_si128();
                    for (k = 0; k < (height >> 1); k++) {
                        s0 = _mm_loadu_si128((__m128i*)p_ref);
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 8));
                        s2 = _mm_loadu_si128((__m128i*)p_src);
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 16));
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 24));
                        s2 = _mm_loadu_si128((__m128i*)(p_src + 16));
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 32));
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 40));
                        s2 = _mm_loadu_si128((__m128i*)(p_src + 32));
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        p_src += src_stride;
                        p_ref += ref_stride;
                    }
                    s9 = s10 = s11 = s12 = _mm_setzero_si128();
                    for (; k < height; k++) {
                        s0 = _mm_loadu_si128((__m128i*)p_ref);
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 8));
                        s2 = _mm_loadu_si128((__m128i*)p_src);
                        s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
                        s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
                        s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
                        s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 16));
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 24));
                        s2 = _mm_loadu_si128((__m128i*)(p_src + 16));
                        s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
                        s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
                        s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
                        s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 32));
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 40));
                        s2 = _mm_loadu_si128((__m128i*)(p_src + 32));
                        s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
                        s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
                        s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
                        s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
                        p_src += src_stride;
                        p_ref += ref_stride;
                    }
                    s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
                    s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
                    s0 = _mm_minpos_epu16(s0);
                    tem_sum = _mm_extract_epi16(s0, 0);
                    tem_sum &= 0x0000FFFF;
                    if (tem_sum < low_sum) {
                        if (tem_sum != 0xFFFF) { // no overflow
                            low_sum = tem_sum;
                            x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
                            y_best = i;
                        }
                        else {
                            s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
                            s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
                            s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
                            s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
                            s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
                            s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
                            s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
                            s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
                            s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
                            s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
                            s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
                            s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
                            s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
                            s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
                            s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
                            s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
                            s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
                            s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
                            s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
                            s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
                            UPDATE_BEST(s0, 0, 0);
                            UPDATE_BEST(s0, 1, 0);
                            UPDATE_BEST(s0, 2, 0);
                            UPDATE_BEST(s0, 3, 0);
                            UPDATE_BEST(s3, 0, 4);
                            UPDATE_BEST(s3, 1, 4);
                            UPDATE_BEST(s3, 2, 4);
                            UPDATE_BEST(s3, 3, 4);
                        }
                    }
                }

                ref += src_stride_raw;
            }
        }
        else {
            __m128i s9, s10;
            for (i = 0; i < search_area_height; i++) {
                for (j = 0; j <= search_area_width - 8; j += 8) {
                    p_src = src;
                    p_ref = ref + j;
                    s9 = s10 = _mm_setzero_si128();
                    k = 0;
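                    /* Rows are processed in chunks of at most 21: each row
                       adds up to 3 * 4 * 255 = 3060 per lane, and 21 * 3060 =
                       64260 still fits in a 16-bit lane. After each chunk the
                       partial sums are widened into the 32-bit accumulators
                       s9/s10. */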
                    while (k < height) {
                        s3 = s4 = s5 = s6 = _mm_setzero_si128();
                        for (l = 0; l < 21 && k < height; k++, l++) {
                            s0 = _mm_loadu_si128((__m128i*)p_ref);
                            s1 = _mm_loadu_si128((__m128i*)(p_ref + 8));
                            s2 = _mm_loadu_si128((__m128i*)p_src);
                            s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                            s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                            s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                            s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                            s0 = _mm_loadu_si128((__m128i*)(p_ref + 16));
                            s1 = _mm_loadu_si128((__m128i*)(p_ref + 24));
                            s2 = _mm_loadu_si128((__m128i*)(p_src + 16));
                            s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                            s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                            s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                            s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                            s0 = _mm_loadu_si128((__m128i*)(p_ref + 32));
                            s1 = _mm_loadu_si128((__m128i*)(p_ref + 40));
                            s2 = _mm_loadu_si128((__m128i*)(p_src + 32));
                            s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                            s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                            s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                            s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                            p_src += src_stride;
                            p_ref += ref_stride;
                        }
                        s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
                        s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
                        s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
                        s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
                        s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
                        s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
                        s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
                        s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
                        s9 = _mm_add_epi32(s9, _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7)));
                        s10 = _mm_add_epi32(s10, _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6)));
                    }
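                    /* s9/s10 now hold exact 32-bit totals for the 8 search
                       positions; pack with unsigned saturation so minpos can
                       scan them, and fall back to the exact 32-bit values if
                       the pack saturated. */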
                    s0 = _mm_packus_epi32(s9, s10);
                    s0 = _mm_minpos_epu16(s0);
                    tem_sum = _mm_extract_epi16(s0, 0);
                    tem_sum &= 0x0000FFFF;
                    if (tem_sum < low_sum) {
                        if (tem_sum != 0xFFFF) { // no overflow
                            low_sum = tem_sum;
                            x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
                            y_best = i;
                        }
                        else {
                            UPDATE_BEST(s9, 0, 0);
                            UPDATE_BEST(s9, 1, 0);
                            UPDATE_BEST(s9, 2, 0);
                            UPDATE_BEST(s9, 3, 0);
                            UPDATE_BEST(s10, 0, 4);
                            UPDATE_BEST(s10, 1, 4);
                            UPDATE_BEST(s10, 2, 4);
                            UPDATE_BEST(s10, 3, 4);
                        }
                    }
                }

                ref += src_stride_raw;
            }
        }
        break;

    case 64:
        if (height <= 32) {
            __m128i s9, s10, s11, s12;
            for (i = 0; i < search_area_height; i++) {
                for (j = 0; j <= search_area_width - 8; j += 8) {
                    p_src = src;
                    p_ref = ref + j;
                    s3 = s4 = s5 = s6 = _mm_setzero_si128();
                    for (k = 0; k < (height >> 1); k++) {
                        s0 = _mm_loadu_si128((__m128i*)p_ref);
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 8));
                        s2 = _mm_loadu_si128((__m128i*)p_src);
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 16));
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 24));
                        s2 = _mm_loadu_si128((__m128i*)(p_src + 16));
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 32));
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 40));
                        s2 = _mm_loadu_si128((__m128i*)(p_src + 32));
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 48));
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 56));
                        s2 = _mm_loadu_si128((__m128i*)(p_src + 48));
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        p_src += src_stride;
                        p_ref += ref_stride;
                    }
                    s9 = s10 = s11 = s12 = _mm_setzero_si128();
                    for (; k < height; k++) {
                        s0 = _mm_loadu_si128((__m128i*)p_ref);
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 8));
                        s2 = _mm_loadu_si128((__m128i*)p_src);
                        s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
                        s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
                        s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
                        s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 16));
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 24));
                        s2 = _mm_loadu_si128((__m128i*)(p_src + 16));
                        s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
                        s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
                        s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
                        s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 32));
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 40));
                        s2 = _mm_loadu_si128((__m128i*)(p_src + 32));
                        s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
                        s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
                        s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
                        s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 48));
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 56));
                        s2 = _mm_loadu_si128((__m128i*)(p_src + 48));
                        s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
                        s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
                        s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
                        s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
                        p_src += src_stride;
                        p_ref += ref_stride;
                    }
                    s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
                    s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
                    s0 = _mm_minpos_epu16(s0);
                    tem_sum = _mm_extract_epi16(s0, 0);
                    tem_sum &= 0x0000FFFF;
                    if (tem_sum < low_sum) {
                        if (tem_sum != 0xFFFF) { // no overflow
                            low_sum = tem_sum;
                            x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
                            y_best = i;
                        }
                        else {
                            s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
                            s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
                            s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
                            s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
                            s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
                            s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
                            s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
                            s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
                            s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
                            s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
                            s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
                            s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
                            s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
                            s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
                            s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
                            s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
                            s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
                            s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
                            s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
                            s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
                            UPDATE_BEST(s0, 0, 0);
                            UPDATE_BEST(s0, 1, 0);
                            UPDATE_BEST(s0, 2, 0);
                            UPDATE_BEST(s0, 3, 0);
                            UPDATE_BEST(s3, 0, 4);
                            UPDATE_BEST(s3, 1, 4);
                            UPDATE_BEST(s3, 2, 4);
                            UPDATE_BEST(s3, 3, 4);
                        }
                    }
                }

                ref += src_stride_raw;
            }
        }
        else {
            __m128i s9, s10;
            for (i = 0; i < search_area_height; i++) {
                for (j = 0; j <= search_area_width - 8; j += 8) {
                    p_src = src;
                    p_ref = ref + j;
                    s9 = s10 = _mm_setzero_si128();
                    k = 0;
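                    /* Same chunking as the 48-wide case: each row adds up to
                       4 * 4 * 255 = 4080 per lane, and 16 * 4080 = 65280 still
                       fits in 16 bits. */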
                    while (k < height) {
                        s3 = s4 = s5 = s6 = _mm_setzero_si128();
                        for (l = 0; l < 16 && k < height; k++, l++) {
                            s0 = _mm_loadu_si128((__m128i*)p_ref);
                            s1 = _mm_loadu_si128((__m128i*)(p_ref + 8));
                            s2 = _mm_loadu_si128((__m128i*)p_src);
                            s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                            s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                            s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                            s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                            s0 = _mm_loadu_si128((__m128i*)(p_ref + 16));
                            s1 = _mm_loadu_si128((__m128i*)(p_ref + 24));
                            s2 = _mm_loadu_si128((__m128i*)(p_src + 16));
                            s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                            s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                            s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                            s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                            s0 = _mm_loadu_si128((__m128i*)(p_ref + 32));
                            s1 = _mm_loadu_si128((__m128i*)(p_ref + 40));
                            s2 = _mm_loadu_si128((__m128i*)(p_src + 32));
                            s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                            s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                            s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                            s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                            s0 = _mm_loadu_si128((__m128i*)(p_ref + 48));
                            s1 = _mm_loadu_si128((__m128i*)(p_ref + 56));
                            s2 = _mm_loadu_si128((__m128i*)(p_src + 48));
                            s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                            s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                            s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                            s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                            p_src += src_stride;
                            p_ref += ref_stride;
                        }
                        s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
                        s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
                        s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
                        s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
                        s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
                        s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
                        s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
                        s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
                        s9 = _mm_add_epi32(s9, _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7)));
                        s10 = _mm_add_epi32(s10, _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6)));
                    }
                    s0 = _mm_packus_epi32(s9, s10);
                    s0 = _mm_minpos_epu16(s0);
                    tem_sum = _mm_extract_epi16(s0, 0);
                    tem_sum &= 0x0000FFFF;
                    if (tem_sum < low_sum) {
                        if (tem_sum != 0xFFFF) { // no overflow
                            low_sum = tem_sum;
                            x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
                            y_best = i;
                        }
                        else {
                            UPDATE_BEST(s9, 0, 0);
                            UPDATE_BEST(s9, 1, 0);
                            UPDATE_BEST(s9, 2, 0);
                            UPDATE_BEST(s9, 3, 0);
                            UPDATE_BEST(s10, 0, 4);
                            UPDATE_BEST(s10, 1, 4);
                            UPDATE_BEST(s10, 2, 4);
                            UPDATE_BEST(s10, 3, 4);
                        }
                    }
                }

                ref += src_stride_raw;
            }
        }
        break;

    default:
        break;
    }

    *best_sad = low_sum;
    *x_search_center = x_best;
    *y_search_center = y_best;
}
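
/*
 * Usage sketch (hypothetical values; the call shape follows the signature
 * above): scan a 64x32 search area around ref for the best-matching 16x16
 * block, stepping the search one reference row at a time.
 *
 *   uint64_t best_sad = 0;
 *   int16_t  x_center = 0, y_center = 0;
 *   eb_vp9_sad_loop_kernel_sse4_1_hme_l0_intrin(
 *       src, src_stride, ref, ref_stride,
 *       16, 16,                          // height, width of the block
 *       &best_sad, &x_center, &y_center,
 *       ref_stride,                      // vertical step between search rows
 *       64, 32);                         // search area width, height
 */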

/*******************************************
 * eb_vp9_get_eight_horizontal_search_point_results_8x8_16x16_pu
 *******************************************/
void eb_vp9_get_eight_horizontal_search_point_results_8x8_16x16_pu_sse41_intrin(
    uint8_t   *src,                           // input parameter, source samples Ptr
    uint32_t   src_stride,                    // input parameter, source stride
    uint8_t   *ref,                           // input parameter, reference samples Ptr
    uint32_t   ref_stride,                    // input parameter, reference stride
    uint32_t  *p_best_sad8x8,                 // input/output parameter, best 8x8 SAD per block
    uint32_t  *p_best_mv8x8,                  // input/output parameter, best 8x8 MV per block
    uint32_t  *p_best_sad16x16,               // input/output parameter, best 16x16 SAD
    uint32_t  *p_best_mv16x16,                // input/output parameter, best 16x16 MV
    uint32_t   mv,                            // input parameter, packed candidate MV (x in low 16 bits, y in high 16 bits)
    uint16_t  *p_sad16x16)                    // output parameter, the eight 16x16 SADs
{
    int16_t x_mv, y_mv;
    const uint8_t *p_ref, *p_src;
    __m128i s0, s1, s2, s3, s4, s5;
    __m128i sad_0, sad_1, sad_2, sad_3;
    uint32_t tem_sum;

    /*
    -------------------------------------   -------------------------------------
    | 8x8_00 | 8x8_01 | 8x8_04 | 8x8_05 |   | 8x8_16 | 8x8_17 | 8x8_20 | 8x8_21 |
    -------------------------------------   -------------------------------------
    | 8x8_02 | 8x8_03 | 8x8_06 | 8x8_07 |   | 8x8_18 | 8x8_19 | 8x8_22 | 8x8_23 |
    -------------------------------------   -------------------------------------
    | 8x8_08 | 8x8_09 | 8x8_12 | 8x8_13 |   | 8x8_24 | 8x8_25 | 8x8_28 | 8x8_29 |
    -------------------------------------   -------------------------------------
    | 8x8_10 | 8x8_11 | 8x8_14 | 8x8_15 |   | 8x8_26 | 8x8_27 | 8x8_30 | 8x8_31 |
    -------------------------------------   -------------------------------------

    -------------------------------------   -------------------------------------
    | 8x8_32 | 8x8_33 | 8x8_36 | 8x8_37 |   | 8x8_48 | 8x8_49 | 8x8_52 | 8x8_53 |
    -------------------------------------   -------------------------------------
    | 8x8_34 | 8x8_35 | 8x8_38 | 8x8_39 |   | 8x8_50 | 8x8_51 | 8x8_54 | 8x8_55 |
    -------------------------------------   -------------------------------------
    | 8x8_40 | 8x8_41 | 8x8_44 | 8x8_45 |   | 8x8_56 | 8x8_57 | 8x8_60 | 8x8_61 |
    -------------------------------------   -------------------------------------
    | 8x8_42 | 8x8_43 | 8x8_46 | 8x8_47 |   | 8x8_58 | 8x8_59 | 8x8_62 | 8x8_63 |
    -------------------------------------   -------------------------------------
    */

    /*
    -------------------------------------------------
    |  16x16_0  |  16x16_1  |  16x16_4  |  16x16_5  |
    -------------------------------------------------
    |  16x16_2  |  16x16_3  |  16x16_6  |  16x16_7  |
    -------------------------------------------------
    |  16x16_8  |  16x16_9  |  16x16_12 |  16x16_13 |
    -------------------------------------------------
    |  16x16_10 |  16x16_11 |  16x16_14 |  16x16_15 |
    -------------------------------------------------
    */
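
    /*
     * Note on the intrinsic pattern used throughout this function:
     * _mm_mpsadbw_epu8(ref, src, imm) produces eight 16-bit SADs of one 4-byte
     * src group against eight consecutive byte offsets of ref (imm bits [1:0]
     * select the 32-bit src group, bit 2 selects the ref base offset, 0 or 4).
     * Pairing imm = 0 with imm = 5 therefore covers the full 8-byte block
     * width at eight horizontal search positions in two calls.
     */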

    //8x8_0
    {
        p_src = src;
        p_ref = ref;
        s3 = s4 = _mm_setzero_si128();

        // rows 0 and 2 of the 8x8 block (odd rows are skipped throughout)
        s0 = _mm_loadu_si128((__m128i*)p_ref);
        s1 = _mm_loadu_si128((__m128i*)(p_ref + ref_stride * 2));
        s2 = _mm_loadl_epi64((__m128i*)p_src);
        s5 = _mm_loadl_epi64((__m128i*)(p_src + src_stride * 2));
        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));

        p_src += src_stride * 4;
        p_ref += ref_stride * 4;

        // rows 4 and 6
        s0 = _mm_loadu_si128((__m128i*)p_ref);
        s1 = _mm_loadu_si128((__m128i*)(p_ref + ref_stride * 2));
        s2 = _mm_loadl_epi64((__m128i*)p_src);
        s5 = _mm_loadl_epi64((__m128i*)(p_src + src_stride * 2));
        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));

        //final 8x4 SAD
        sad_0 = _mm_adds_epu16(s3, s4);

        //find the best for 8x8_0: the 8x4 SAD is doubled to estimate the 8x8 SAD
        s3 = _mm_minpos_epu16(sad_0);
        tem_sum = _mm_extract_epi16(s3, 0);
        if (2 * tem_sum < p_best_sad8x8[0]) {
            p_best_sad8x8[0] = 2 * tem_sum;
            // lane index is the full-pel x offset; x4 converts it to the MV unit
            x_mv = _MVXT(mv) + (int16_t)(_mm_extract_epi16(s3, 1) * 4);
            y_mv = _MVYT(mv);
            p_best_mv8x8[0] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
        }
    }
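
    /*
     * Scalar sketch of what one lane of sad_0 above holds (illustrative only;
     * "pos" stands for that lane's horizontal search offset, 0..7):
     *
     *   uint32_t sad = 0;
     *   for (uint32_t row = 0; row < 8; row += 2)   // rows 0, 2, 4, 6 only
     *       for (uint32_t col = 0; col < 8; col++)
     *           sad += abs((int)src[row * src_stride + col] -
     *                      (int)ref[row * ref_stride + col + pos]);
     *   // 2 * sad is what gets compared against p_best_sad8x8[0]
     */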

    //8x8_1
    {
        p_src = src + 8;
        p_ref = ref + 8;
        s3 = s4 = _mm_setzero_si128();

        s0 = _mm_loadu_si128((__m128i*)p_ref);
        s1 = _mm_loadu_si128((__m128i*)(p_ref + ref_stride * 2));
        s2 = _mm_loadl_epi64((__m128i*)p_src);
        s5 = _mm_loadl_epi64((__m128i*)(p_src + src_stride * 2));
        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));

        p_src += src_stride * 4;
        p_ref += ref_stride * 4;

        s0 = _mm_loadu_si128((__m128i*)p_ref);
        s1 = _mm_loadu_si128((__m128i*)(p_ref + ref_stride * 2));
        s2 = _mm_loadl_epi64((__m128i*)p_src);
        s5 = _mm_loadl_epi64((__m128i*)(p_src + src_stride * 2));
        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));

        //final 8x4 SAD
        sad_1 = _mm_adds_epu16(s3, s4);

        //find the best for 8x8_1
        s3 = _mm_minpos_epu16(sad_1);
        tem_sum = _mm_extract_epi16(s3, 0);
        if (2 * tem_sum < p_best_sad8x8[1]) {
            p_best_sad8x8[1] = 2 * tem_sum;
            x_mv = _MVXT(mv) + (int16_t)(_mm_extract_epi16(s3, 1) * 4);
            y_mv = _MVYT(mv);
            p_best_mv8x8[1] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
        }
    }

    //8x8_2
    {
        p_src = src + 8 * src_stride;
        p_ref = ref + 8 * ref_stride;
        s3 = s4 = _mm_setzero_si128();

        s0 = _mm_loadu_si128((__m128i*)p_ref);
        s1 = _mm_loadu_si128((__m128i*)(p_ref + ref_stride * 2));
        s2 = _mm_loadl_epi64((__m128i*)p_src);
        s5 = _mm_loadl_epi64((__m128i*)(p_src + src_stride * 2));
        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));

        p_src += src_stride * 4;
        p_ref += ref_stride * 4;

        s0 = _mm_loadu_si128((__m128i*)p_ref);
        s1 = _mm_loadu_si128((__m128i*)(p_ref + ref_stride * 2));
        s2 = _mm_loadl_epi64((__m128i*)p_src);
        s5 = _mm_loadl_epi64((__m128i*)(p_src + src_stride * 2));
        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));

        //final 8x4 SAD
        sad_2 = _mm_adds_epu16(s3, s4);

        //find the best for 8x8_2
        s3 = _mm_minpos_epu16(sad_2);
        tem_sum = _mm_extract_epi16(s3, 0);
        if (2 * tem_sum < p_best_sad8x8[2]) {
            p_best_sad8x8[2] = 2 * tem_sum;
            x_mv = _MVXT(mv) + (int16_t)(_mm_extract_epi16(s3, 1) * 4);
            y_mv = _MVYT(mv);
            p_best_mv8x8[2] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
        }
    }

    //8x8_3
    {
        p_src = src + 8 + 8 * src_stride;
        p_ref = ref + 8 + 8 * ref_stride;
        s3 = s4 = _mm_setzero_si128();

        s0 = _mm_loadu_si128((__m128i*)p_ref);
        s1 = _mm_loadu_si128((__m128i*)(p_ref + ref_stride * 2));
        s2 = _mm_loadl_epi64((__m128i*)p_src);
        s5 = _mm_loadl_epi64((__m128i*)(p_src + src_stride * 2));
        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));

        p_src += src_stride * 4;
        p_ref += ref_stride * 4;

        s0 = _mm_loadu_si128((__m128i*)p_ref);
        s1 = _mm_loadu_si128((__m128i*)(p_ref + ref_stride * 2));
        s2 = _mm_loadl_epi64((__m128i*)p_src);
        s5 = _mm_loadl_epi64((__m128i*)(p_src + src_stride * 2));
        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));

        //final 8x4 SAD
        sad_3 = _mm_adds_epu16(s3, s4);

        //find the best for 8x8_3
        s3 = _mm_minpos_epu16(sad_3);
        tem_sum = _mm_extract_epi16(s3, 0);
        if (2 * tem_sum < p_best_sad8x8[3]) {
            p_best_sad8x8[3] = 2 * tem_sum;
            x_mv = _MVXT(mv) + (int16_t)(_mm_extract_epi16(s3, 1) * 4);
            y_mv = _MVYT(mv);
            p_best_mv8x8[3] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
        }
    }
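
    /*
     * The four 8x4 SAD vectors computed above are reused below: summing
     * sad_0..sad_3 lane-by-lane yields the subsampled 16x16 SAD at the same
     * eight horizontal search positions, so no pixels are re-read.
     */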

    //16x16
    {
        s0 = _mm_adds_epu16(sad_0, sad_1);
        s1 = _mm_adds_epu16(sad_2, sad_3);
        s3 = _mm_adds_epu16(s0, s1);
        //store the eight SADs (16x8 subsampled SADs) for the caller
        _mm_store_si128((__m128i*)p_sad16x16, s3);
        //find the best for 16x16
        s3 = _mm_minpos_epu16(s3);
        tem_sum = _mm_extract_epi16(s3, 0);
        if (2 * tem_sum < p_best_sad16x16[0]) {
            p_best_sad16x16[0] = 2 * tem_sum;
            x_mv = _MVXT(mv) + (int16_t)(_mm_extract_epi16(s3, 1) * 4);
            y_mv = _MVYT(mv);
            p_best_mv16x16[0] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
        }
    }
}