/*
 * Copyright(c) 2019 Intel Corporation
 * SPDX-License-Identifier: BSD-2-Clause-Patent
 */

#include "EbComputeSAD_SSE4_1.h"
#include "EbDefinitions.h"
#include "smmintrin.h"

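/* UPDATE_BEST reads the 32-bit SAD in lane k of vector s and, when it beats
 * the running minimum, records it together with its search position
 * (j + offset + k, i). It serves the 32-bit fallback paths below, which are
 * taken when the saturating 16-bit accumulators have clamped at 0xFFFF. */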
#define UPDATE_BEST(s, k, offset) \
    tem_sum = _mm_extract_epi32(s, k); \
    if (tem_sum < low_sum) { \
        low_sum = tem_sum; \
        x_best = j + offset + k; \
        y_best = i; \
    }

/*******************************************************************************
* Requirement: width = 4, 8, 16, 24, 32, 48 or 64
* Requirement: height <= 64
* Requirement: height % 2 = 0 when width = 4 or 8
*******************************************************************************/
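/*
 * The kernels below rely on _mm_mpsadbw_epu8(a, b, imm), which returns eight
 * 16-bit SADs between one 4-byte group of b (chosen by imm[1:0]) and the eight
 * overlapping 4-byte windows of a starting at byte offset imm[2] * 4. Summing
 * the results for imm = 0 and imm = 5 gives the SADs of an 8-byte source row
 * against eight consecutive horizontal reference positions; wider rows add the
 * immediate pair (2, 7) against the next 16 reference bytes, and so on. Row
 * results are accumulated with saturating 16-bit adds, and _mm_minpos_epu16
 * returns the smallest SAD together with its lane index in one instruction.
 *
 * As a scalar sketch (illustration only, not part of the build), each vector
 * pass over one search row is equivalent to:
 *
 *     for (pos = 0; pos < 8; pos++) {          // eight horizontal candidates
 *         sad = 0;
 *         for (y = 0; y < height; y++)
 *             for (x = 0; x < width; x++)
 *                 sad += abs((int)src[y * src_stride + x] -
 *                            (int)ref[y * ref_stride + j + pos + x]);
 *         // keep the smallest sad and its position (j + pos, i)
 *     }
 */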
void eb_vp9_sad_loop_kernel_sse4_1_hme_l0_intrin(
    uint8_t  *src,            // input parameter, source samples Ptr
    uint32_t  src_stride,     // input parameter, source stride
    uint8_t  *ref,            // input parameter, reference samples Ptr
    uint32_t  ref_stride,     // input parameter, reference stride
    uint32_t  height,         // input parameter, block height (M)
    uint32_t  width,          // input parameter, block width (N)
    uint64_t *best_sad,
    int16_t  *x_search_center,
    int16_t  *y_search_center,
    uint32_t  src_stride_raw, // input parameter, source stride (no line skipping)
    int16_t   search_area_width,
    int16_t   search_area_height)
{
    int16_t x_best = *x_search_center, y_best = *y_search_center;
    uint32_t low_sum = 0xffffff;
    uint32_t tem_sum = 0;
    int16_t i, j;
    uint32_t k, l;
    const uint8_t *p_ref, *p_src;
    __m128i s0, s1, s2, s3, s4, s5, s6, s7, s9, s10, s11;

    switch (width) {
    case 4:
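        // width 4: a single MPSADBW covers all eight horizontal positions per
        // row, so only accumulator s3 is needed; rows are processed in pairs.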
        for (i = 0; i < search_area_height; i++) {
            for (j = 0; j <= search_area_width - 8; j += 8) {
                p_src = src;
                p_ref = ref + j;
                s3 = _mm_setzero_si128();
                for (k = 0; k < height; k += 2) {
                    s0 = _mm_loadu_si128((__m128i*)p_ref);
                    s1 = _mm_loadu_si128((__m128i*)(p_ref + ref_stride));
                    s2 = _mm_cvtsi32_si128(*(uint32_t *)p_src);
                    s5 = _mm_cvtsi32_si128(*(uint32_t *)(p_src + src_stride));
                    s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                    s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
                    p_src += src_stride << 1;
                    p_ref += ref_stride << 1;
                }
                s3 = _mm_minpos_epu16(s3);
                tem_sum = _mm_extract_epi16(s3, 0);
                if (tem_sum < low_sum) {
                    low_sum = tem_sum;
                    x_best = (int16_t)(j + _mm_extract_epi16(s3, 1));
                    y_best = i;
                }
            }
            ref += src_stride_raw;
        }
        break;

    case 8:
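        // width 8: immediates 0 and 5 pair source bytes 0-3 and 4-7 with the
        // same eight reference positions, so s3 + s4 is the full 8-wide SAD.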
        for (i = 0; i < search_area_height; i++) {
            for (j = 0; j <= search_area_width - 8; j += 8) {
                p_src = src;
                p_ref = ref + j;
                s3 = s4 = _mm_setzero_si128();
                for (k = 0; k < height; k += 2) {
                    s0 = _mm_loadu_si128((__m128i*)p_ref);
                    s1 = _mm_loadu_si128((__m128i*)(p_ref + ref_stride));
                    s2 = _mm_loadl_epi64((__m128i*)p_src);
                    s5 = _mm_loadl_epi64((__m128i*)(p_src + src_stride));
                    s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                    s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                    s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
                    s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));
                    p_src += src_stride << 1;
                    p_ref += ref_stride << 1;
                }
                s3 = _mm_adds_epu16(s3, s4);
                s3 = _mm_minpos_epu16(s3);
                tem_sum = _mm_extract_epi16(s3, 0);
                if (tem_sum < low_sum) {
                    low_sum = tem_sum;
                    x_best = (int16_t)(j + _mm_extract_epi16(s3, 1));
                    y_best = i;
                }
            }

            ref += src_stride_raw;
        }
        break;

    case 16:
        if (height <= 16) {
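            // height <= 16: row SADs stay well below 16-bit saturation, so a
            // second set of accumulators (s7, s9, s10, s11) evaluates positions
            // j+8..j+15 in the same pass and the j loop advances by 16.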
            for (i = 0; i < search_area_height; i++) {
                for (j = 0; j <= search_area_width - 16; j += 16) {
                    p_src = src;
                    p_ref = ref + j;
                    s3 = s4 = s5 = s6 = _mm_setzero_si128();
                    s7 = s9 = s10 = s11 = _mm_setzero_si128();
                    for (k = 0; k < height; k++) {
                        s0 = _mm_loadu_si128((__m128i*)p_ref);
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 8));
                        s2 = _mm_loadu_si128((__m128i*)p_src);
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 16));
                        s7 = _mm_adds_epu16(s7, _mm_mpsadbw_epu8(s1, s2, 0));
                        s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 5));
                        s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 2));
                        s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 7));
                        p_src += src_stride;
                        p_ref += ref_stride;
                    }
                    s3 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
                    s3 = _mm_minpos_epu16(s3);
                    tem_sum = _mm_extract_epi16(s3, 0);
                    if (tem_sum < low_sum) {
                        low_sum = tem_sum;
                        x_best = (int16_t)(j + _mm_extract_epi16(s3, 1));
                        y_best = i;
                    }

                    s7 = _mm_adds_epu16(_mm_adds_epu16(s7, s11), _mm_adds_epu16(s9, s10));
                    s7 = _mm_minpos_epu16(s7);
                    tem_sum = _mm_extract_epi16(s7, 0);
                    if (tem_sum < low_sum) {
                        low_sum = tem_sum;
                        x_best = (int16_t)(j + 8 + _mm_extract_epi16(s7, 1));
                        y_best = i;
                    }
                }

                ref += src_stride_raw;
            }
        }
        else if (height <= 32) {
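            // height <= 32: the two accumulator pairs are combined without
            // losing the minimum to saturation by subtracting each pair's
            // broadcast minimum before the cross-pair add, then adding the two
            // scalar minima back to the min-pos result.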
            for (i = 0; i < search_area_height; i++) {
                for (j = 0; j <= search_area_width - 8; j += 8) {
                    p_src = src;
                    p_ref = ref + j;
                    s3 = s4 = s5 = s6 = _mm_setzero_si128();
                    for (k = 0; k < height; k++) {
                        s0 = _mm_loadu_si128((__m128i*)p_ref);
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 8));
                        s2 = _mm_loadu_si128((__m128i*)p_src);
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        p_src += src_stride;
                        p_ref += ref_stride;
                    }
                    s3 = _mm_adds_epu16(s3, s4);
                    s5 = _mm_adds_epu16(s5, s6);
                    s4 = _mm_minpos_epu16(s3);
                    s6 = _mm_minpos_epu16(s5);
                    s4 = _mm_unpacklo_epi16(s4, s4);
                    s4 = _mm_unpacklo_epi32(s4, s4);
                    s4 = _mm_unpacklo_epi64(s4, s4);
                    s6 = _mm_unpacklo_epi16(s6, s6);
                    s6 = _mm_unpacklo_epi32(s6, s6);
                    s6 = _mm_unpacklo_epi64(s6, s6);
                    s3 = _mm_sub_epi16(s3, s4);
                    s5 = _mm_adds_epu16(s5, s3);
                    s5 = _mm_sub_epi16(s5, s6);
                    s5 = _mm_minpos_epu16(s5);
                    tem_sum = _mm_extract_epi16(s5, 0);
                    tem_sum += _mm_extract_epi16(s4, 0);
                    tem_sum += _mm_extract_epi16(s6, 0);
                    if (tem_sum < low_sum) {
                        low_sum = tem_sum;
                        x_best = (int16_t)(j + _mm_extract_epi16(s5, 1));
                        y_best = i;
                    }
                }

                ref += src_stride_raw;
            }
        }
        else {
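            // height > 32: the saturating adds can clamp at 0xFFFF; when the
            // minimum equals 0xFFFF, exact 32-bit sums are rebuilt by widening
            // the accumulators and all eight positions are rescanned with
            // UPDATE_BEST.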
            for (i = 0; i < search_area_height; i++) {
                for (j = 0; j <= search_area_width - 8; j += 8) {
                    p_src = src;
                    p_ref = ref + j;
                    s3 = s4 = s5 = s6 = _mm_setzero_si128();
                    for (k = 0; k < height; k++) {
                        s0 = _mm_loadu_si128((__m128i*)p_ref);
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 8));
                        s2 = _mm_loadu_si128((__m128i*)p_src);
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        p_src += src_stride;
                        p_ref += ref_stride;
                    }
                    s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
                    s0 = _mm_minpos_epu16(s0);
                    tem_sum = _mm_extract_epi16(s0, 0);
                    if (tem_sum < low_sum) {
                        if (tem_sum != 0xFFFF) { // no overflow
                            low_sum = tem_sum;
                            x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
                            y_best = i;
                        }
                        else {
                            s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
                            s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
                            s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
                            s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
                            s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
                            s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
                            s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
                            s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
                            s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
                            s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
                            UPDATE_BEST(s0, 0, 0);
                            UPDATE_BEST(s0, 1, 0);
                            UPDATE_BEST(s0, 2, 0);
                            UPDATE_BEST(s0, 3, 0);
                            UPDATE_BEST(s3, 0, 4);
                            UPDATE_BEST(s3, 1, 4);
                            UPDATE_BEST(s3, 2, 4);
                            UPDATE_BEST(s3, 3, 4);
                        }
                    }
                }
                ref += src_stride_raw;
            }
        }
        break;

    case 24:
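        // width 24: the first 16 bytes are handled exactly as in the 16-wide
        // case (s3..s6); the trailing 8 bytes reuse immediates 0 and 5 against
        // the reference bytes at offset 16 and fold into s3/s4.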
        if (height <= 16) {
            for (i = 0; i < search_area_height; i++) {
                for (j = 0; j <= search_area_width - 8; j += 8) {
                    p_src = src;
                    p_ref = ref + j;
                    s3 = s4 = s5 = s6 = _mm_setzero_si128();
                    for (k = 0; k < height; k++) {
                        s0 = _mm_loadu_si128((__m128i*)p_ref);
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 8));
                        s2 = _mm_loadu_si128((__m128i*)p_src);
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 16));
                        s2 = _mm_loadl_epi64((__m128i*)(p_src + 16));
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        p_src += src_stride;
                        p_ref += ref_stride;
                    }
                    s3 = _mm_adds_epu16(s3, s4);
                    s5 = _mm_adds_epu16(s5, s6);
                    s4 = _mm_minpos_epu16(s3);
                    s6 = _mm_minpos_epu16(s5);
                    s4 = _mm_unpacklo_epi16(s4, s4);
                    s4 = _mm_unpacklo_epi32(s4, s4);
                    s4 = _mm_unpacklo_epi64(s4, s4);
                    s6 = _mm_unpacklo_epi16(s6, s6);
                    s6 = _mm_unpacklo_epi32(s6, s6);
                    s6 = _mm_unpacklo_epi64(s6, s6);
                    s3 = _mm_sub_epi16(s3, s4);
                    s5 = _mm_adds_epu16(s5, s3);
                    s5 = _mm_sub_epi16(s5, s6);
                    s5 = _mm_minpos_epu16(s5);
                    tem_sum = _mm_extract_epi16(s5, 0);
                    tem_sum += _mm_extract_epi16(s4, 0);
                    tem_sum += _mm_extract_epi16(s6, 0);
                    if (tem_sum < low_sum) {
                        low_sum = tem_sum;
                        x_best = (int16_t)(j + _mm_extract_epi16(s5, 1));
                        y_best = i;
                    }
                }
                ref += src_stride_raw;
            }
        }
        else {
            for (i = 0; i < search_area_height; i++) {
                for (j = 0; j <= search_area_width - 8; j += 8) {
                    p_src = src;
                    p_ref = ref + j;
                    s3 = s4 = s5 = s6 = _mm_setzero_si128();
                    for (k = 0; k < height; k++) {
                        s0 = _mm_loadu_si128((__m128i*)p_ref);
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 8));
                        s2 = _mm_loadu_si128((__m128i*)p_src);
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 16));
                        s2 = _mm_loadl_epi64((__m128i*)(p_src + 16));
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        p_src += src_stride;
                        p_ref += ref_stride;
                    }
                    s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
                    s0 = _mm_minpos_epu16(s0);
                    tem_sum = _mm_extract_epi16(s0, 0);
                    if (tem_sum < low_sum) {
                        if (tem_sum != 0xFFFF) { // no overflow
                            low_sum = tem_sum;
                            x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
                            y_best = i;
                        }
                        else {
                            s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
                            s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
                            s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
                            s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
                            s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
                            s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
                            s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
                            s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
                            s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
                            s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
                            UPDATE_BEST(s0, 0, 0);
                            UPDATE_BEST(s0, 1, 0);
                            UPDATE_BEST(s0, 2, 0);
                            UPDATE_BEST(s0, 3, 0);
                            UPDATE_BEST(s3, 0, 4);
                            UPDATE_BEST(s3, 1, 4);
                            UPDATE_BEST(s3, 2, 4);
                            UPDATE_BEST(s3, 3, 4);
                        }
                    }
                }

                ref += src_stride_raw;
            }
        }
        break;

    case 32:
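        // width 32: two 16-byte segments per row, four MPSADBW results per
        // accumulator per row; the three branches differ only in how 16-bit
        // saturation is avoided as height grows.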
        if (height <= 16) {
            for (i = 0; i < search_area_height; i++) {
                for (j = 0; j <= search_area_width - 8; j += 8) {
                    p_src = src;
                    p_ref = ref + j;
                    s3 = s4 = s5 = s6 = _mm_setzero_si128();
                    for (k = 0; k < height; k++) {
                        s0 = _mm_loadu_si128((__m128i*)p_ref);
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 8));
                        s2 = _mm_loadu_si128((__m128i*)p_src);
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 16));
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 24));
                        s2 = _mm_loadu_si128((__m128i*)(p_src + 16));
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        p_src += src_stride;
                        p_ref += ref_stride;
                    }
                    s3 = _mm_adds_epu16(s3, s4);
                    s5 = _mm_adds_epu16(s5, s6);
                    s4 = _mm_minpos_epu16(s3);
                    s6 = _mm_minpos_epu16(s5);
                    s4 = _mm_unpacklo_epi16(s4, s4);
                    s4 = _mm_unpacklo_epi32(s4, s4);
                    s4 = _mm_unpacklo_epi64(s4, s4);
                    s6 = _mm_unpacklo_epi16(s6, s6);
                    s6 = _mm_unpacklo_epi32(s6, s6);
                    s6 = _mm_unpacklo_epi64(s6, s6);
                    s3 = _mm_sub_epi16(s3, s4);
                    s5 = _mm_adds_epu16(s5, s3);
                    s5 = _mm_sub_epi16(s5, s6);
                    s5 = _mm_minpos_epu16(s5);
                    tem_sum = _mm_extract_epi16(s5, 0);
                    tem_sum += _mm_extract_epi16(s4, 0);
                    tem_sum += _mm_extract_epi16(s6, 0);
                    tem_sum &= 0x0000FFFF;
                    if (tem_sum < low_sum) {
                        low_sum = tem_sum;
                        x_best = (int16_t)(j + _mm_extract_epi16(s5, 1));
                        y_best = i;
                    }
                }

                ref += src_stride_raw;
            }
        }
        else if (height <= 32) {
            for (i = 0; i < search_area_height; i++) {
                for (j = 0; j <= search_area_width - 8; j += 8) {
                    p_src = src;
                    p_ref = ref + j;
                    s3 = s4 = s5 = s6 = _mm_setzero_si128();
                    for (k = 0; k < height; k++) {
                        s0 = _mm_loadu_si128((__m128i*)p_ref);
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 8));
                        s2 = _mm_loadu_si128((__m128i*)p_src);
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 16));
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 24));
                        s2 = _mm_loadu_si128((__m128i*)(p_src + 16));
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        p_src += src_stride;
                        p_ref += ref_stride;
                    }
                    s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
                    s0 = _mm_minpos_epu16(s0);
                    tem_sum = _mm_extract_epi16(s0, 0);
                    tem_sum &= 0x0000FFFF;
                    if (tem_sum < low_sum) {
                        if (tem_sum != 0xFFFF) { // no overflow
                            low_sum = tem_sum;
                            x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
                            y_best = i;
                        }
                        else {
                            s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
                            s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
                            s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
                            s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
                            s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
                            s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
                            s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
                            s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
                            s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
                            s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
                            UPDATE_BEST(s0, 0, 0);
                            UPDATE_BEST(s0, 1, 0);
                            UPDATE_BEST(s0, 2, 0);
                            UPDATE_BEST(s0, 3, 0);
                            UPDATE_BEST(s3, 0, 4);
                            UPDATE_BEST(s3, 1, 4);
                            UPDATE_BEST(s3, 2, 4);
                            UPDATE_BEST(s3, 3, 4);
                        }
                    }
                }
                ref += src_stride_raw;
            }
        }
        else {
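            // height > 32: the rows are split between two accumulator sets
            // (s3..s6 and s9..s12) so each set accumulates at most half the
            // rows before the halves are combined.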
            __m128i s9, s10, s11, s12;
            for (i = 0; i < search_area_height; i++) {
                for (j = 0; j <= search_area_width - 8; j += 8) {
                    p_src = src;
                    p_ref = ref + j;
                    s3 = s4 = s5 = s6 = _mm_setzero_si128();
                    for (k = 0; k < (height >> 1); k++) {
                        s0 = _mm_loadu_si128((__m128i*)p_ref);
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 8));
                        s2 = _mm_loadu_si128((__m128i*)p_src);
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 16));
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 24));
                        s2 = _mm_loadu_si128((__m128i*)(p_src + 16));
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        p_src += src_stride;
                        p_ref += ref_stride;
                    }
                    s9 = s10 = s11 = s12 = _mm_setzero_si128();
                    for (; k < height; k++) {
                        s0 = _mm_loadu_si128((__m128i*)p_ref);
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 8));
                        s2 = _mm_loadu_si128((__m128i*)p_src);
                        s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
                        s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
                        s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
                        s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 16));
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 24));
                        s2 = _mm_loadu_si128((__m128i*)(p_src + 16));
                        s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
                        s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
                        s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
                        s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
                        p_src += src_stride;
                        p_ref += ref_stride;
                    }
                    s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
                    s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
                    s0 = _mm_minpos_epu16(s0);
                    tem_sum = _mm_extract_epi16(s0, 0);
                    tem_sum &= 0x0000FFFF;
                    if (tem_sum < low_sum) {
                        if (tem_sum != 0xFFFF) { // no overflow
                            low_sum = tem_sum;
                            x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
                            y_best = i;
                        }
                        else {
                            s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
                            s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
                            s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
                            s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
                            s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
                            s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
                            s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
                            s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
                            s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
                            s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
                            s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
                            s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
                            s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
                            s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
                            s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
                            s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
                            s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
                            s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
                            s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
                            s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
                            UPDATE_BEST(s0, 0, 0);
                            UPDATE_BEST(s0, 1, 0);
                            UPDATE_BEST(s0, 2, 0);
                            UPDATE_BEST(s0, 3, 0);
                            UPDATE_BEST(s3, 0, 4);
                            UPDATE_BEST(s3, 1, 4);
                            UPDATE_BEST(s3, 2, 4);
                            UPDATE_BEST(s3, 3, 4);
                        }
                    }
                }

                ref += src_stride_raw;
            }
        }
        break;

    case 48:
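        // width 48: three 16-byte segments per row; for height <= 32 the rows
        // are again split across two accumulator sets before combining.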
        if (height <= 32) {
            __m128i s9, s10, s11, s12;
            for (i = 0; i < search_area_height; i++) {
                for (j = 0; j <= search_area_width - 8; j += 8) {
                    p_src = src;
                    p_ref = ref + j;
                    s3 = s4 = s5 = s6 = _mm_setzero_si128();
                    for (k = 0; k < (height >> 1); k++) {
                        s0 = _mm_loadu_si128((__m128i*)p_ref);
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 8));
                        s2 = _mm_loadu_si128((__m128i*)p_src);
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 16));
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 24));
                        s2 = _mm_loadu_si128((__m128i*)(p_src + 16));
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 32));
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 40));
                        s2 = _mm_loadu_si128((__m128i*)(p_src + 32));
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        p_src += src_stride;
                        p_ref += ref_stride;
                    }
                    s9 = s10 = s11 = s12 = _mm_setzero_si128();
                    for (; k < height; k++) {
                        s0 = _mm_loadu_si128((__m128i*)p_ref);
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 8));
                        s2 = _mm_loadu_si128((__m128i*)p_src);
                        s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
                        s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
                        s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
                        s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 16));
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 24));
                        s2 = _mm_loadu_si128((__m128i*)(p_src + 16));
                        s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
                        s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
                        s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
                        s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 32));
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 40));
                        s2 = _mm_loadu_si128((__m128i*)(p_src + 32));
                        s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
                        s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
                        s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
                        s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
                        p_src += src_stride;
                        p_ref += ref_stride;
                    }
                    s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
                    s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
                    s0 = _mm_minpos_epu16(s0);
                    tem_sum = _mm_extract_epi16(s0, 0);
                    tem_sum &= 0x0000FFFF;
                    if (tem_sum < low_sum) {
                        if (tem_sum != 0xFFFF) { // no overflow
                            low_sum = tem_sum;
                            x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
                            y_best = i;
                        }
                        else {
                            s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
                            s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
                            s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
                            s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
                            s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
                            s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
                            s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
                            s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
                            s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
                            s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
                            s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
                            s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
                            s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
                            s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
                            s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
                            s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
                            s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
                            s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
                            s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
                            s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
                            UPDATE_BEST(s0, 0, 0);
                            UPDATE_BEST(s0, 1, 0);
                            UPDATE_BEST(s0, 2, 0);
                            UPDATE_BEST(s0, 3, 0);
                            UPDATE_BEST(s3, 0, 4);
                            UPDATE_BEST(s3, 1, 4);
                            UPDATE_BEST(s3, 2, 4);
                            UPDATE_BEST(s3, 3, 4);
                        }
                    }
                }

                ref += src_stride_raw;
            }
        }
        else {
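            // height > 32: each row adds three partial SADs of at most
            // 4 * 255 = 1020 per position, so 16-bit accumulation is safe for
            // at most 21 rows (21 * 3 * 1020 = 64260 < 65535). Rows are
            // therefore consumed in 21-row chunks whose totals are widened
            // into the 32-bit sums s9/s10; _mm_packus_epi32 saturates those
            // back to 16 bits for min-pos, and a 0xFFFF result falls back to
            // the exact 32-bit scan.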
            __m128i s9, s10;
            for (i = 0; i < search_area_height; i++) {
                for (j = 0; j <= search_area_width - 8; j += 8) {
                    p_src = src;
                    p_ref = ref + j;
                    s9 = s10 = _mm_setzero_si128();
                    k = 0;
                    while (k < height) {
                        s3 = s4 = s5 = s6 = _mm_setzero_si128();
                        for (l = 0; l < 21 && k < height; k++, l++) {
                            s0 = _mm_loadu_si128((__m128i*)p_ref);
                            s1 = _mm_loadu_si128((__m128i*)(p_ref + 8));
                            s2 = _mm_loadu_si128((__m128i*)p_src);
                            s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                            s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                            s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                            s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                            s0 = _mm_loadu_si128((__m128i*)(p_ref + 16));
                            s1 = _mm_loadu_si128((__m128i*)(p_ref + 24));
                            s2 = _mm_loadu_si128((__m128i*)(p_src + 16));
                            s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                            s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                            s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                            s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                            s0 = _mm_loadu_si128((__m128i*)(p_ref + 32));
                            s1 = _mm_loadu_si128((__m128i*)(p_ref + 40));
                            s2 = _mm_loadu_si128((__m128i*)(p_src + 32));
                            s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                            s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                            s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                            s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                            p_src += src_stride;
                            p_ref += ref_stride;
                        }
                        s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
                        s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
                        s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
                        s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
                        s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
                        s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
                        s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
                        s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
                        s9 = _mm_add_epi32(s9, _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7)));
                        s10 = _mm_add_epi32(s10, _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6)));
                    }
                    s0 = _mm_packus_epi32(s9, s10);
                    s0 = _mm_minpos_epu16(s0);
                    tem_sum = _mm_extract_epi16(s0, 0);
                    tem_sum &= 0x0000FFFF;
                    if (tem_sum < low_sum) {
                        if (tem_sum != 0xFFFF) { // no overflow
                            low_sum = tem_sum;
                            x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
                            y_best = i;
                        }
                        else {
                            UPDATE_BEST(s9, 0, 0);
                            UPDATE_BEST(s9, 1, 0);
                            UPDATE_BEST(s9, 2, 0);
                            UPDATE_BEST(s9, 3, 0);
                            UPDATE_BEST(s10, 0, 4);
                            UPDATE_BEST(s10, 1, 4);
                            UPDATE_BEST(s10, 2, 4);
                            UPDATE_BEST(s10, 3, 4);
                        }
                    }
                }

                ref += src_stride_raw;
            }
        }
        break;

    case 64:
        if (height <= 32) {
            __m128i s9, s10, s11, s12;
            for (i = 0; i < search_area_height; i++) {
                for (j = 0; j <= search_area_width - 8; j += 8) {
                    p_src = src;
                    p_ref = ref + j;
                    s3 = s4 = s5 = s6 = _mm_setzero_si128();
                    for (k = 0; k < (height >> 1); k++) {
                        s0 = _mm_loadu_si128((__m128i*)p_ref);
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 8));
                        s2 = _mm_loadu_si128((__m128i*)p_src);
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 16));
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 24));
                        s2 = _mm_loadu_si128((__m128i*)(p_src + 16));
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 32));
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 40));
                        s2 = _mm_loadu_si128((__m128i*)(p_src + 32));
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 48));
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 56));
                        s2 = _mm_loadu_si128((__m128i*)(p_src + 48));
                        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                        s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                        s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                        p_src += src_stride;
                        p_ref += ref_stride;
                    }
                    s9 = s10 = s11 = s12 = _mm_setzero_si128();
                    for (; k < height; k++) {
                        s0 = _mm_loadu_si128((__m128i*)p_ref);
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 8));
                        s2 = _mm_loadu_si128((__m128i*)p_src);
                        s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
                        s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
                        s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
                        s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 16));
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 24));
                        s2 = _mm_loadu_si128((__m128i*)(p_src + 16));
                        s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
                        s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
                        s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
                        s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 32));
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 40));
                        s2 = _mm_loadu_si128((__m128i*)(p_src + 32));
                        s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
                        s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
                        s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
                        s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
                        s0 = _mm_loadu_si128((__m128i*)(p_ref + 48));
                        s1 = _mm_loadu_si128((__m128i*)(p_ref + 56));
                        s2 = _mm_loadu_si128((__m128i*)(p_src + 48));
                        s9 = _mm_adds_epu16(s9, _mm_mpsadbw_epu8(s0, s2, 0));
                        s10 = _mm_adds_epu16(s10, _mm_mpsadbw_epu8(s0, s2, 5));
                        s11 = _mm_adds_epu16(s11, _mm_mpsadbw_epu8(s1, s2, 2));
                        s12 = _mm_adds_epu16(s12, _mm_mpsadbw_epu8(s1, s2, 7));
                        p_src += src_stride;
                        p_ref += ref_stride;
                    }
                    s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
                    s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm_adds_epu16(s9, s10), _mm_adds_epu16(s11, s12)));
                    s0 = _mm_minpos_epu16(s0);
                    tem_sum = _mm_extract_epi16(s0, 0);
                    tem_sum &= 0x0000FFFF;
                    if (tem_sum < low_sum) {
                        if (tem_sum != 0xFFFF) { // no overflow
                            low_sum = tem_sum;
                            x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
                            y_best = i;
                        }
                        else {
                            s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
                            s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
                            s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
                            s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
                            s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
                            s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
                            s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
                            s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
                            s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7));
                            s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6));
                            s1 = _mm_unpacklo_epi16(s9, _mm_setzero_si128());
                            s9 = _mm_unpackhi_epi16(s9, _mm_setzero_si128());
                            s2 = _mm_unpacklo_epi16(s10, _mm_setzero_si128());
                            s10 = _mm_unpackhi_epi16(s10, _mm_setzero_si128());
                            s4 = _mm_unpacklo_epi16(s11, _mm_setzero_si128());
                            s11 = _mm_unpackhi_epi16(s11, _mm_setzero_si128());
                            s5 = _mm_unpacklo_epi16(s12, _mm_setzero_si128());
                            s12 = _mm_unpackhi_epi16(s12, _mm_setzero_si128());
                            s0 = _mm_add_epi32(s0, _mm_add_epi32(_mm_add_epi32(s1, s2), _mm_add_epi32(s4, s5)));
                            s3 = _mm_add_epi32(s3, _mm_add_epi32(_mm_add_epi32(s9, s10), _mm_add_epi32(s11, s12)));
                            UPDATE_BEST(s0, 0, 0);
                            UPDATE_BEST(s0, 1, 0);
                            UPDATE_BEST(s0, 2, 0);
                            UPDATE_BEST(s0, 3, 0);
                            UPDATE_BEST(s3, 0, 4);
                            UPDATE_BEST(s3, 1, 4);
                            UPDATE_BEST(s3, 2, 4);
                            UPDATE_BEST(s3, 3, 4);
                        }
                    }
                }

                ref += src_stride_raw;
            }
        }
        else {
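            // height > 32: with four segments per row (4 * 1020 = 4080 per
            // position), chunks are capped at 16 rows
            // (16 * 4080 = 65280 < 65535) before widening into s9/s10.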
            __m128i s9, s10;
            for (i = 0; i < search_area_height; i++) {
                for (j = 0; j <= search_area_width - 8; j += 8) {
                    p_src = src;
                    p_ref = ref + j;
                    s9 = s10 = _mm_setzero_si128();
                    k = 0;
                    while (k < height) {
                        s3 = s4 = s5 = s6 = _mm_setzero_si128();
                        for (l = 0; l < 16 && k < height; k++, l++) {
                            s0 = _mm_loadu_si128((__m128i*)p_ref);
                            s1 = _mm_loadu_si128((__m128i*)(p_ref + 8));
                            s2 = _mm_loadu_si128((__m128i*)p_src);
                            s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                            s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                            s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                            s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                            s0 = _mm_loadu_si128((__m128i*)(p_ref + 16));
                            s1 = _mm_loadu_si128((__m128i*)(p_ref + 24));
                            s2 = _mm_loadu_si128((__m128i*)(p_src + 16));
                            s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                            s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                            s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                            s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                            s0 = _mm_loadu_si128((__m128i*)(p_ref + 32));
                            s1 = _mm_loadu_si128((__m128i*)(p_ref + 40));
                            s2 = _mm_loadu_si128((__m128i*)(p_src + 32));
                            s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                            s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                            s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                            s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                            s0 = _mm_loadu_si128((__m128i*)(p_ref + 48));
                            s1 = _mm_loadu_si128((__m128i*)(p_ref + 56));
                            s2 = _mm_loadu_si128((__m128i*)(p_src + 48));
                            s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
                            s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
                            s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
                            s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
                            p_src += src_stride;
                            p_ref += ref_stride;
                        }
                        s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
                        s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
                        s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
                        s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
                        s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
                        s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
                        s7 = _mm_unpacklo_epi16(s6, _mm_setzero_si128());
                        s6 = _mm_unpackhi_epi16(s6, _mm_setzero_si128());
                        s9 = _mm_add_epi32(s9, _mm_add_epi32(_mm_add_epi32(s0, s1), _mm_add_epi32(s2, s7)));
                        s10 = _mm_add_epi32(s10, _mm_add_epi32(_mm_add_epi32(s3, s4), _mm_add_epi32(s5, s6)));
                    }
                    s0 = _mm_packus_epi32(s9, s10);
                    s0 = _mm_minpos_epu16(s0);
                    tem_sum = _mm_extract_epi16(s0, 0);
                    tem_sum &= 0x0000FFFF;
                    if (tem_sum < low_sum) {
                        if (tem_sum != 0xFFFF) { // no overflow
                            low_sum = tem_sum;
                            x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
                            y_best = i;
                        }
                        else {
                            UPDATE_BEST(s9, 0, 0);
                            UPDATE_BEST(s9, 1, 0);
                            UPDATE_BEST(s9, 2, 0);
                            UPDATE_BEST(s9, 3, 0);
                            UPDATE_BEST(s10, 0, 4);
                            UPDATE_BEST(s10, 1, 4);
                            UPDATE_BEST(s10, 2, 4);
                            UPDATE_BEST(s10, 3, 4);
                        }
                    }
                }

                ref += src_stride_raw;
            }
        }
        break;

    default:
        break;
    }

    *best_sad = low_sum;
    *x_search_center = x_best;
    *y_search_center = y_best;
}

/*******************************************
* eb_vp9_get_eight_horizontal_search_point_results_8x8_16x16_pu
*******************************************/
void eb_vp9_get_eight_horizontal_search_point_results_8x8_16x16_pu_sse41_intrin(
    uint8_t  *src,
    uint32_t  src_stride,
    uint8_t  *ref,
    uint32_t  ref_stride,
    uint32_t *p_best_sad8x8,
    uint32_t *p_best_mv8x8,
    uint32_t *p_best_sad16x16,
    uint32_t *p_best_mv16x16,
    uint32_t  mv,
    uint16_t *p_sad16x16)
{
    int16_t x_mv, y_mv;
    const uint8_t *p_ref, *p_src;
    __m128i s0, s1, s2, s3, s4, s5;
    __m128i sad_0, sad_1, sad_2, sad_3;
    uint32_t tem_sum;

    /*
    -------------------------------------------------------------------------
    | 8x8_00 | 8x8_01 | 8x8_04 | 8x8_05 | 8x8_16 | 8x8_17 | 8x8_20 | 8x8_21 |
    -------------------------------------------------------------------------
    | 8x8_02 | 8x8_03 | 8x8_06 | 8x8_07 | 8x8_18 | 8x8_19 | 8x8_22 | 8x8_23 |
    -------------------------------------------------------------------------
    | 8x8_08 | 8x8_09 | 8x8_12 | 8x8_13 | 8x8_24 | 8x8_25 | 8x8_28 | 8x8_29 |
    -------------------------------------------------------------------------
    | 8x8_10 | 8x8_11 | 8x8_14 | 8x8_15 | 8x8_26 | 8x8_27 | 8x8_30 | 8x8_31 |
    -------------------------------------------------------------------------

    -------------------------------------------------------------------------
    | 8x8_32 | 8x8_33 | 8x8_36 | 8x8_37 | 8x8_48 | 8x8_49 | 8x8_52 | 8x8_53 |
    -------------------------------------------------------------------------
    | 8x8_34 | 8x8_35 | 8x8_38 | 8x8_39 | 8x8_50 | 8x8_51 | 8x8_54 | 8x8_55 |
    -------------------------------------------------------------------------
    | 8x8_40 | 8x8_41 | 8x8_44 | 8x8_45 | 8x8_56 | 8x8_57 | 8x8_60 | 8x8_61 |
    -------------------------------------------------------------------------
    | 8x8_42 | 8x8_43 | 8x8_46 | 8x8_47 | 8x8_58 | 8x8_59 | 8x8_62 | 8x8_63 |
    -------------------------------------------------------------------------
    */

    /*
    ---------------------------------------------
    | 16x16_0  | 16x16_1  | 16x16_4  | 16x16_5  |
    ---------------------------------------------
    | 16x16_2  | 16x16_3  | 16x16_6  | 16x16_7  |
    ---------------------------------------------
    | 16x16_8  | 16x16_9  | 16x16_12 | 16x16_13 |
    ---------------------------------------------
    | 16x16_10 | 16x16_11 | 16x16_14 | 16x16_15 |
    ---------------------------------------------
    */
    //8x8_0
    {
        p_src = src;
        p_ref = ref;
        s3 = s4 = _mm_setzero_si128();

        s0 = _mm_loadu_si128((__m128i*)p_ref);
        s1 = _mm_loadu_si128((__m128i*)(p_ref + ref_stride * 2));
        s2 = _mm_loadl_epi64((__m128i*)p_src);
        s5 = _mm_loadl_epi64((__m128i*)(p_src + src_stride * 2));
        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));

        p_src += src_stride * 4;
        p_ref += ref_stride * 4;

        s0 = _mm_loadu_si128((__m128i*)p_ref);
        s1 = _mm_loadu_si128((__m128i*)(p_ref + ref_stride * 2));
        s2 = _mm_loadl_epi64((__m128i*)p_src);
        s5 = _mm_loadl_epi64((__m128i*)(p_src + src_stride * 2));
        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));

        //final 8x4 SAD
        sad_0 = _mm_adds_epu16(s3, s4);

        //find the best for 8x8_0
        s3 = _mm_minpos_epu16(sad_0);
        tem_sum = _mm_extract_epi16(s3, 0);
        if (2 * tem_sum < p_best_sad8x8[0]) {
            p_best_sad8x8[0] = 2 * tem_sum;
            x_mv = _MVXT(mv) + (int16_t)(_mm_extract_epi16(s3, 1) * 4);
            y_mv = _MVYT(mv);
            p_best_mv8x8[0] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
        }
    }

    //8x8_1
    {
        p_src = src + 8;
        p_ref = ref + 8;
        s3 = s4 = _mm_setzero_si128();

        s0 = _mm_loadu_si128((__m128i*)p_ref);
        s1 = _mm_loadu_si128((__m128i*)(p_ref + ref_stride * 2));
        s2 = _mm_loadl_epi64((__m128i*)p_src);
        s5 = _mm_loadl_epi64((__m128i*)(p_src + src_stride * 2));
        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));

        p_src += src_stride * 4;
        p_ref += ref_stride * 4;

        s0 = _mm_loadu_si128((__m128i*)p_ref);
        s1 = _mm_loadu_si128((__m128i*)(p_ref + ref_stride * 2));
        s2 = _mm_loadl_epi64((__m128i*)p_src);
        s5 = _mm_loadl_epi64((__m128i*)(p_src + src_stride * 2));
        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));

        //final 8x4 SAD
        sad_1 = _mm_adds_epu16(s3, s4);

        //find the best for 8x8_1
        s3 = _mm_minpos_epu16(sad_1);
        tem_sum = _mm_extract_epi16(s3, 0);
        if (2 * tem_sum < p_best_sad8x8[1]) {
            p_best_sad8x8[1] = 2 * tem_sum;
            x_mv = _MVXT(mv) + (int16_t)(_mm_extract_epi16(s3, 1) * 4);
            y_mv = _MVYT(mv);
            p_best_mv8x8[1] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
        }
    }

    //8x8_2
    {
        p_src = src + 8 * src_stride;
        p_ref = ref + 8 * ref_stride;
        s3 = s4 = _mm_setzero_si128();

        s0 = _mm_loadu_si128((__m128i*)p_ref);
        s1 = _mm_loadu_si128((__m128i*)(p_ref + ref_stride * 2));
        s2 = _mm_loadl_epi64((__m128i*)p_src);
        s5 = _mm_loadl_epi64((__m128i*)(p_src + src_stride * 2));
        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));

        p_src += src_stride * 4;
        p_ref += ref_stride * 4;

        s0 = _mm_loadu_si128((__m128i*)p_ref);
        s1 = _mm_loadu_si128((__m128i*)(p_ref + ref_stride * 2));
        s2 = _mm_loadl_epi64((__m128i*)p_src);
        s5 = _mm_loadl_epi64((__m128i*)(p_src + src_stride * 2));
        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));

        //final 8x4 SAD
        sad_2 = _mm_adds_epu16(s3, s4);

        //find the best for 8x8_2
        s3 = _mm_minpos_epu16(sad_2);
        tem_sum = _mm_extract_epi16(s3, 0);
        if (2 * tem_sum < p_best_sad8x8[2]) {
            p_best_sad8x8[2] = 2 * tem_sum;
            x_mv = _MVXT(mv) + (int16_t)(_mm_extract_epi16(s3, 1) * 4);
            y_mv = _MVYT(mv);
            p_best_mv8x8[2] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
        }
    }

    //8x8_3
    {
        p_src = src + 8 + 8 * src_stride;
        p_ref = ref + 8 + 8 * ref_stride;
        s3 = s4 = _mm_setzero_si128();

        s0 = _mm_loadu_si128((__m128i*)p_ref);
        s1 = _mm_loadu_si128((__m128i*)(p_ref + ref_stride * 2));
        s2 = _mm_loadl_epi64((__m128i*)p_src);
        s5 = _mm_loadl_epi64((__m128i*)(p_src + src_stride * 2));
        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));

        p_src += src_stride * 4;
        p_ref += ref_stride * 4;

        s0 = _mm_loadu_si128((__m128i*)p_ref);
        s1 = _mm_loadu_si128((__m128i*)(p_ref + ref_stride * 2));
        s2 = _mm_loadl_epi64((__m128i*)p_src);
        s5 = _mm_loadl_epi64((__m128i*)(p_src + src_stride * 2));
        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
        s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
        s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));

        //final 8x4 SAD
        sad_3 = _mm_adds_epu16(s3, s4);

        //find the best for 8x8_3
        s3 = _mm_minpos_epu16(sad_3);
        tem_sum = _mm_extract_epi16(s3, 0);
        if (2 * tem_sum < p_best_sad8x8[3]) {
            p_best_sad8x8[3] = 2 * tem_sum;
            x_mv = _MVXT(mv) + (int16_t)(_mm_extract_epi16(s3, 1) * 4);
            y_mv = _MVYT(mv);
            p_best_mv8x8[3] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
        }
    }

    //16x16
    {
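        // The 16x16 SAD at each of the eight positions is the sum of the four
        // 8x4 sub-block SADs computed above; the eight sums are also exported
        // through p_sad16x16 for reuse by the caller (e.g. when building
        // larger-block SADs).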
        s0 = _mm_adds_epu16(sad_0, sad_1);
        s1 = _mm_adds_epu16(sad_2, sad_3);
        s3 = _mm_adds_epu16(s0, s1);
        //store the eight 16x8 SADs
        _mm_store_si128((__m128i*)p_sad16x16, s3);
        //find the best for 16x16
        s3 = _mm_minpos_epu16(s3);
        tem_sum = _mm_extract_epi16(s3, 0);
        if (2 * tem_sum < p_best_sad16x16[0]) {
            p_best_sad16x16[0] = 2 * tem_sum;
            x_mv = _MVXT(mv) + (int16_t)(_mm_extract_epi16(s3, 1) * 4);
            y_mv = _MVYT(mv);
            p_best_mv16x16[0] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
        }
    }
}