1 /*
2 * Copyright(c) 2019 Intel Corporation
3 * SPDX-License-Identifier: BSD-2-Clause-Patent
4 */
5
6 #include "EbComputeSAD_AVX2.h"
7 #include "stdint.h"
8 #include "EbMemory_AVX2.h"
9 #include "immintrin.h"
10
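/*
* UPDATE_BEST extracts the 32-bit SAD in lane k of vector s and, if it beats the
* current minimum, records it together with its search position (x = j + offset + k,
* y = i). It is used on the 32-bit fallback path after the 16-bit accumulators saturate.
*/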
11 #define UPDATE_BEST(s, k, offset) \
12 tem_sum1 = _mm_extract_epi32(s, k); \
13 if (tem_sum1 < low_sum) { \
14 low_sum = tem_sum1; \
15 x_best = j + offset + k; \
16 y_best = i; \
17 }
18
19 /*******************************************************************************
20 * Requirement: width = 4, 8, 16, 24, 32, 48 or 64
21 * Requirement: height <= 64
22 * Requirement: height % 2 = 0 when width = 4 or 8
23 *******************************************************************************/
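/*******************************************************************************
* Strategy: candidate positions are tested eight at a time per search row.
* MPSADBW matches a 4-byte group of the source block against eight consecutive
* horizontal offsets of the reference row in one instruction; the per-width
* cases below combine several such calls to cover the full block width and
* accumulate column SADs with 16-bit saturating adds. When a lane saturates at
* 0xFFFF, the larger cases recompute exact sums on a 32-bit path. The lowest
* SAD and its (x, y) offset are tracked in low_sum / x_best / y_best and
* reported through best_sad, x_search_center and y_search_center.
*
* Illustrative call (a sketch only; the variable names below are hypothetical):
*   uint64_t best = UINT64_MAX;
*   int16_t  mv_x = 0, mv_y = 0;
*   eb_vp9_sad_loop_kernel_avx2_intrin(src_ptr, src_stride, ref_ptr, ref_stride,
*                                      16, 16, &best, &mv_x, &mv_y,
*                                      stride_raw, 64, 64);
*******************************************************************************/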
24 void eb_vp9_sad_loop_kernel_avx2_intrin(
25 uint8_t *src, // input parameter, source samples Ptr
26 uint32_t src_stride, // input parameter, source stride
27 uint8_t *ref, // input parameter, reference samples Ptr
28 uint32_t ref_stride, // input parameter, reference stride
29 uint32_t height, // input parameter, block height (M)
30 uint32_t width, // input parameter, block width (N)
31 uint64_t *best_sad,
32 int16_t *x_search_center,
33 int16_t *y_search_center,
34 uint32_t src_stride_raw, // input parameter, source stride (no line skipping)
35 int16_t search_area_width,
36 int16_t search_area_height)
37 {
38 int16_t x_best = *x_search_center, y_best = *y_search_center;
39 uint32_t low_sum = 0xffffff;
40 uint32_t tem_sum1 = 0;
41 int16_t i, j;
42 uint32_t k, l;
43 uint32_t leftover = search_area_width & 7;
44 const uint8_t *p_ref, *p_src;
45 __m128i s0, s1, s2, s3, s4, s5, s6, s8 = _mm_set1_epi32(-1);
46 __m256i ss0, ss1, ss2, ss3, ss4, ss5, ss6, ss7, ss8;
47
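// Tail mask for partial search rows: shifting the all-ones vector left by two
// bytes per leftover position clears the low 'leftover' 16-bit lanes and keeps
// 0xFFFF in the rest, so OR-ing it into a SAD vector makes _mm_minpos_epu16
// ignore candidates beyond the search area width.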
48 if (leftover) {
49 for (k=0; k<leftover; k++) {
50 s8 = _mm_slli_si128(s8, 2);
51 }
52 }
53
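// MPSADBW immediate encoding (per 128-bit lane): bits [1:0] select which 4-byte
// group of the source operand is compared, bit [2] selects a +0 or +4 byte
// offset into the reference operand. In the 256-bit form bits [2:0] control the
// low lane and bits [5:3] the high lane, so 45 (0b101101), 18 (0b010010) and
// 63 (0b111111) apply the same selection to both lanes.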
54 switch (width) {
55 case 4:
56
57 if (!(height % 4)) {
58 uint32_t srcStrideT = 3 * src_stride;
59 uint32_t refStrideT = 3 * ref_stride;
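// Four rows per iteration: ss0/ss1 each hold two reference rows (one per
// 128-bit lane) and ss2 packs the four 4-byte source rows, so every MPSADBW
// covers two rows of the block at once.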
60 for (i=0; i<search_area_height; i++) {
61 for (j=0; j<=search_area_width-8; j+=8) {
62 p_src = src;
63 p_ref = ref + j;
64 ss3 = ss5 = _mm256_setzero_si256();
65 for (k=0; k<height; k+=4) {
66 ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)p_ref)), _mm_loadu_si128((__m128i*)(p_ref + 2 * ref_stride)), 0x1);
67 ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(p_ref + ref_stride))), _mm_loadu_si128((__m128i*)(p_ref + refStrideT)), 0x1);
68 ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi64(_mm_cvtsi32_si128(*(uint32_t *)p_src), _mm_cvtsi32_si128(*(uint32_t *)(p_src + src_stride)))), _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(uint32_t *)(p_src + 2 * src_stride)), _mm_cvtsi32_si128(*(uint32_t *)(p_src + srcStrideT))), 0x1);
69 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
70 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
71 p_src += src_stride << 2;
72 p_ref += ref_stride << 2;
73 }
74 ss3 = _mm256_adds_epu16(ss3, ss5);
75 s3 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
76 s3 = _mm_minpos_epu16(s3);
77 tem_sum1 = _mm_extract_epi16(s3, 0);
78 if (tem_sum1 < low_sum) {
79 low_sum = tem_sum1;
80 x_best = (int16_t)(j + _mm_extract_epi16(s3, 1));
81 y_best = i;
82 }
83 }
84
85 if (leftover) {
86 p_src = src;
87 p_ref = ref + j;
88 ss3 = ss5 = _mm256_setzero_si256();
89 for (k=0; k<height; k+=4) {
90 ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)p_ref)), _mm_loadu_si128((__m128i*)(p_ref + 2 * ref_stride)), 0x1);
91 ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(p_ref + ref_stride))), _mm_loadu_si128((__m128i*)(p_ref + refStrideT)), 0x1);
92 ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi64(_mm_cvtsi32_si128(*(uint32_t *)p_src), _mm_cvtsi32_si128(*(uint32_t *)(p_src + src_stride)))), _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(uint32_t *)(p_src + 2 * src_stride)), _mm_cvtsi32_si128(*(uint32_t *)(p_src + srcStrideT))), 0x1);
93 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
94 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
95 p_src += src_stride << 2;
96 p_ref += ref_stride << 2;
97 }
98 ss3 = _mm256_adds_epu16(ss3, ss5);
99 s3 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
100 s3 = _mm_or_si128(s3, s8);
101 s3 = _mm_minpos_epu16(s3);
102 tem_sum1 = _mm_extract_epi16(s3, 0);
103 if (tem_sum1 < low_sum) {
104 low_sum = tem_sum1;
105 x_best = (int16_t)(j + _mm_extract_epi16(s3, 1));
106 y_best = i;
107 }
108 }
109 ref += src_stride_raw;
110 }
111 }
112 else {
113 for (i=0; i<search_area_height; i++) {
114 for (j=0; j<=search_area_width-8; j+=8) {
115 p_src = src;
116 p_ref = ref + j;
117 s3 = _mm_setzero_si128();
118 for (k=0; k<height; k+=2) {
119 s0 = _mm_loadu_si128((__m128i*)p_ref);
120 s1 = _mm_loadu_si128((__m128i*)(p_ref+ref_stride));
121 s2 = _mm_cvtsi32_si128(*(uint32_t *)p_src);
122 s5 = _mm_cvtsi32_si128(*(uint32_t *)(p_src+src_stride));
123 s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
124 s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
125 p_src += src_stride << 1;
126 p_ref += ref_stride << 1;
127 }
128 s3 = _mm_minpos_epu16(s3);
129 tem_sum1 = _mm_extract_epi16(s3, 0);
130 if (tem_sum1 < low_sum) {
131 low_sum = tem_sum1;
132 x_best = (int16_t)(j + _mm_extract_epi16(s3, 1));
133 y_best = i;
134 }
135 }
136
137 if (leftover) {
138 p_src = src;
139 p_ref = ref + j;
140 s3 = _mm_setzero_si128();
141 for (k=0; k<height; k+=2) {
142 s0 = _mm_loadu_si128((__m128i*)p_ref);
143 s1 = _mm_loadu_si128((__m128i*)(p_ref+ref_stride));
144 s2 = _mm_cvtsi32_si128(*(uint32_t *)p_src);
145 s5 = _mm_cvtsi32_si128(*(uint32_t *)(p_src+src_stride));
146 s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
147 s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
148 p_src += src_stride << 1;
149 p_ref += ref_stride << 1;
150 }
151 s3 = _mm_or_si128(s3, s8);
152 s3 = _mm_minpos_epu16(s3);
153 tem_sum1 = _mm_extract_epi16(s3, 0);
154 if (tem_sum1 < low_sum) {
155 low_sum = tem_sum1;
156 x_best = (int16_t)(j + _mm_extract_epi16(s3, 1));
157 y_best = i;
158 }
159 }
160 ref += src_stride_raw;
161 }
162 }
163
164 break;
165
166 case 8:
167 if (!(height % 4)) {
168 uint32_t srcStrideT = 3 * src_stride;
169 uint32_t refStrideT = 3 * ref_stride;
170 for (i=0; i<search_area_height; i++) {
171 for (j=0; j<=search_area_width-8; j+=8) {
172 p_src = src;
173 p_ref = ref + j;
174 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
175 for (k=0; k<height; k+=4) {
176 ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)p_ref)), _mm_loadu_si128((__m128i*)(p_ref + 2 * ref_stride)), 0x1);
177 ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(p_ref + ref_stride))), _mm_loadu_si128((__m128i*)(p_ref + refStrideT)), 0x1);
178 ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)p_src), _mm_loadl_epi64((__m128i*)(p_src + src_stride)))), _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)(p_src + 2 * src_stride)), _mm_loadl_epi64((__m128i*)(p_src + srcStrideT))), 0x1);
179 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
180 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
181 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
182 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
183 p_src += src_stride << 2;
184 p_ref += ref_stride << 2;
185 }
186 ss3 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
187 s3 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
188 s3 = _mm_minpos_epu16(s3);
189 tem_sum1 = _mm_extract_epi16(s3, 0);
190 if (tem_sum1 < low_sum) {
191 low_sum = tem_sum1;
192 x_best = (int16_t)(j + _mm_extract_epi16(s3, 1));
193 y_best = i;
194 }
195 }
196
197 if (leftover) {
198 p_src = src;
199 p_ref = ref + j;
200 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
201 for (k=0; k<height; k+=4) {
202 ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)p_ref)), _mm_loadu_si128((__m128i*)(p_ref+2*ref_stride)), 0x1);
203 ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(p_ref + ref_stride))), _mm_loadu_si128((__m128i*)(p_ref + refStrideT)), 0x1);
204 ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)p_src), _mm_loadl_epi64((__m128i*)(p_src + src_stride)))), _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)(p_src + 2 * src_stride)), _mm_loadl_epi64((__m128i*)(p_src + srcStrideT))), 0x1);
205 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
206 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
207 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
208 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
209 p_src += src_stride << 2;
210 p_ref += ref_stride << 2;
211 }
212 ss3 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
213 s3 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
214 s3 = _mm_or_si128(s3, s8);
215 s3 = _mm_minpos_epu16(s3);
216 tem_sum1 = _mm_extract_epi16(s3, 0);
217 if (tem_sum1 < low_sum) {
218 low_sum = tem_sum1;
219 x_best = (int16_t)(j + _mm_extract_epi16(s3, 1));
220 y_best = i;
221 }
222 }
223 ref += src_stride_raw;
224 }
225 }
226 else {
227 for (i=0; i<search_area_height; i++) {
228 for (j=0; j<=search_area_width-8; j+=8) {
229 p_src = src;
230 p_ref = ref + j;
231 s3 = s4 = _mm_setzero_si128();
232 for (k=0; k<height; k+=2) {
233 s0 = _mm_loadu_si128((__m128i*)p_ref);
234 s1 = _mm_loadu_si128((__m128i*)(p_ref+ref_stride));
235 s2 = _mm_loadl_epi64((__m128i*)p_src);
236 s5 = _mm_loadl_epi64((__m128i*)(p_src+src_stride));
237 s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
238 s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
239 s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
240 s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));
241 p_src += src_stride << 1;
242 p_ref += ref_stride << 1;
243 }
244 s3 = _mm_adds_epu16(s3, s4);
245 s3 = _mm_minpos_epu16(s3);
246 tem_sum1 = _mm_extract_epi16(s3, 0);
247 if (tem_sum1 < low_sum) {
248 low_sum = tem_sum1;
249 x_best = (int16_t)(j + _mm_extract_epi16(s3, 1));
250 y_best = i;
251 }
252 }
253
254 if (leftover) {
255 p_src = src;
256 p_ref = ref + j;
257 s3 = s4 = _mm_setzero_si128();
258 for (k=0; k<height; k+=2) {
259 s0 = _mm_loadu_si128((__m128i*)p_ref);
260 s1 = _mm_loadu_si128((__m128i*)(p_ref+ref_stride));
261 s2 = _mm_loadl_epi64((__m128i*)p_src);
262 s5 = _mm_loadl_epi64((__m128i*)(p_src+src_stride));
263 s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
264 s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
265 s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
266 s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));
267 p_src += src_stride << 1;
268 p_ref += ref_stride << 1;
269 }
270 s3 = _mm_adds_epu16(s3, s4);
271 s3 = _mm_or_si128(s3, s8);
272 s3 = _mm_minpos_epu16(s3);
273 tem_sum1 = _mm_extract_epi16(s3, 0);
274 if (tem_sum1 < low_sum) {
275 low_sum = tem_sum1;
276 x_best = (int16_t)(j + _mm_extract_epi16(s3, 1));
277 y_best = i;
278 }
279 }
280 ref += src_stride_raw;
281 }
282 }
283 break;
284
285 case 16:
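// Width 16 branches on block height because the 16-bit saturating accumulators
// top out at 0xFFFF: up to 16 rows the sums still fit, up to 32 rows the two
// lanes are combined with the min-subtraction trick below, and taller blocks
// fall back to exact 32-bit sums whenever a lane saturates.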
286 if (height <= 16) {
287 for (i=0; i<search_area_height; i++) {
288 for (j=0; j<=search_area_width-8; j+=8) {
289 p_src = src;
290 p_ref = ref + j;
291 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
292 for (k=0; k<height; k+=2) {
293 ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)p_ref)), _mm_loadu_si128((__m128i*)(p_ref + ref_stride)), 0x1);
294 ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(p_ref + 8))), _mm_loadu_si128((__m128i*)(p_ref + ref_stride + 8)), 0x1);
295 ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)p_src)), _mm_loadu_si128((__m128i*)(p_src + src_stride)), 0x1);
296 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
297 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
298 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
299 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
300 p_src += 2 * src_stride;
301 p_ref += 2 * ref_stride;
302 }
303 ss3 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
304 s3 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
305 s3 = _mm_minpos_epu16(s3);
306 tem_sum1 = _mm_extract_epi16(s3, 0);
307 if (tem_sum1 < low_sum) {
308 low_sum = tem_sum1;
309 x_best = (int16_t)(j + _mm_extract_epi16(s3, 1));
310 y_best = i;
311 }
312 }
313
314 if (leftover) {
315 p_src = src;
316 p_ref = ref + j;
317 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
318 for (k=0; k<height; k+=2) {
319 ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)p_ref)), _mm_loadu_si128((__m128i*)(p_ref + ref_stride)), 0x1);
320 ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(p_ref + 8))), _mm_loadu_si128((__m128i*)(p_ref + ref_stride + 8)), 0x1);
321 ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)p_src)), _mm_loadu_si128((__m128i*)(p_src + src_stride)), 0x1);
322 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
323 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
324 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
325 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
326 p_src += 2 * src_stride;
327 p_ref += 2 * ref_stride;
328 }
329 ss3 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
330 s3 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
331 s3 = _mm_or_si128(s3, s8);
332 s3 = _mm_minpos_epu16(s3);
333 tem_sum1 = _mm_extract_epi16(s3, 0);
334 if (tem_sum1 < low_sum) {
335 low_sum = tem_sum1;
336 x_best = (int16_t)(j + _mm_extract_epi16(s3, 1));
337 y_best = i;
338 }
339 }
340 ref += src_stride_raw;
341 }
342 }
343 else if (height <= 32) {
344 for (i=0; i<search_area_height; i++) {
345 for (j=0; j<=search_area_width-8; j+=8) {
346 p_src = src;
347 p_ref = ref + j;
348 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
349 for (k=0; k<height; k+=2) {
350 ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)p_ref)), _mm_loadu_si128((__m128i*)(p_ref + ref_stride)), 0x1);
351 ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(p_ref + 8))), _mm_loadu_si128((__m128i*)(p_ref + ref_stride + 8)), 0x1);
352 ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)p_src)), _mm_loadu_si128((__m128i*)(p_src + src_stride)), 0x1);
353 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
354 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
355 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
356 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
357 p_src += 2 * src_stride;
358 p_ref += 2 * ref_stride;
359 }
360 ss3 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
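// Combine the two 128-bit lanes without saturating: subtract each lane's own
// minimum (broadcast in s4/s6), add the reduced lanes, take minpos of the
// result and add the two lane minimums back in, giving the minimum of
// (lane0 + lane1) over all eight positions.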
361 s3 = _mm256_extracti128_si256(ss3, 0);
362 s5 = _mm256_extracti128_si256(ss3, 1);
363 s4 = _mm_minpos_epu16(s3);
364 s6 = _mm_minpos_epu16(s5);
365 s4 = _mm_unpacklo_epi16(s4, s4);
366 s4 = _mm_unpacklo_epi32(s4, s4);
367 s4 = _mm_unpacklo_epi64(s4, s4);
368 s6 = _mm_unpacklo_epi16(s6, s6);
369 s6 = _mm_unpacklo_epi32(s6, s6);
370 s6 = _mm_unpacklo_epi64(s6, s6);
371 s3 = _mm_sub_epi16(s3, s4);
372 s5 = _mm_adds_epu16(s5, s3);
373 s5 = _mm_sub_epi16(s5, s6);
374 s5 = _mm_minpos_epu16(s5);
375 tem_sum1 = _mm_extract_epi16(s5, 0);
376 tem_sum1 += _mm_extract_epi16(s4, 0);
377 tem_sum1 += _mm_extract_epi16(s6, 0);
378 if (tem_sum1 < low_sum) {
379 low_sum = tem_sum1;
380 x_best = (int16_t)(j + _mm_extract_epi16(s5, 1));
381 y_best = i;
382 }
383 }
384
385 if (leftover) {
386 p_src = src;
387 p_ref = ref + j;
388 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
389 for (k=0; k<height; k+=2) {
390 ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)p_ref)), _mm_loadu_si128((__m128i*)(p_ref + ref_stride)), 0x1);
391 ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(p_ref + 8))), _mm_loadu_si128((__m128i*)(p_ref + ref_stride + 8)), 0x1);
392 ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)p_src)), _mm_loadu_si128((__m128i*)(p_src + src_stride)), 0x1);
393 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
394 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
395 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
396 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
397 p_src += 2 * src_stride;
398 p_ref += 2 * ref_stride;
399 }
400 ss3 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
401 s3 = _mm256_extracti128_si256(ss3, 0);
402 s5 = _mm256_extracti128_si256(ss3, 1);
403 s3 = _mm_or_si128(s3, s8);
404 s5 = _mm_or_si128(s5, s8);
405 s4 = _mm_minpos_epu16(s3);
406 s6 = _mm_minpos_epu16(s5);
407 s4 = _mm_unpacklo_epi16(s4, s4);
408 s4 = _mm_unpacklo_epi32(s4, s4);
409 s4 = _mm_unpacklo_epi64(s4, s4);
410 s6 = _mm_unpacklo_epi16(s6, s6);
411 s6 = _mm_unpacklo_epi32(s6, s6);
412 s6 = _mm_unpacklo_epi64(s6, s6);
413 s3 = _mm_sub_epi16(s3, s4);
414 s5 = _mm_adds_epu16(s5, s3);
415 s5 = _mm_sub_epi16(s5, s6);
416 s5 = _mm_minpos_epu16(s5);
417 tem_sum1 = _mm_extract_epi16(s5, 0);
418 tem_sum1 += _mm_extract_epi16(s4, 0);
419 tem_sum1 += _mm_extract_epi16(s6, 0);
420 if (tem_sum1 < low_sum) {
421 low_sum = tem_sum1;
422 x_best = (int16_t)(j + _mm_extract_epi16(s5, 1));
423 y_best = i;
424 }
425 }
426 ref += src_stride_raw;
427 }
428 }
429 else {
430 for (i=0; i<search_area_height; i++) {
431 for (j=0; j<=search_area_width-8; j+=8) {
432 p_src = src;
433 p_ref = ref + j;
434 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
435 for (k=0; k<height; k+=2) {
436 ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)p_ref)), _mm_loadu_si128((__m128i*)(p_ref + ref_stride)), 0x1);
437 ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(p_ref + 8))), _mm_loadu_si128((__m128i*)(p_ref + ref_stride + 8)), 0x1);
438 ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)p_src)), _mm_loadu_si128((__m128i*)(p_src + src_stride)), 0x1);
439 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
440 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
441 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
442 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
443 p_src += 2 * src_stride;
444 p_ref += 2 * ref_stride;
445 }
446 ss3 = _mm256_adds_epu16(ss3, ss4);
447 ss5 = _mm256_adds_epu16(ss5, ss6);
448 ss0 = _mm256_adds_epu16(ss3, ss5);
449 s0 = _mm_adds_epu16(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
450 s0 = _mm_minpos_epu16(s0);
451 tem_sum1 = _mm_extract_epi16(s0, 0);
452 if (tem_sum1 < low_sum) {
453 if (tem_sum1 != 0xFFFF) { // no overflow
454 low_sum = tem_sum1;
455 x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
456 y_best = i;
457 }
458 else {
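// A lane saturated at 0xFFFF, so the 16-bit sums are not exact: widen the
// accumulators to 32 bits by unpacking against zero, reduce the two 256-bit
// lanes, and scan all eight exact sums with UPDATE_BEST.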
459 ss4 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
460 ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
461 ss6 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
462 ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
463 ss4 = _mm256_add_epi32(ss4, ss6);
464 ss3 = _mm256_add_epi32(ss3, ss5);
465 s0 = _mm_add_epi32(_mm256_extracti128_si256(ss4, 0), _mm256_extracti128_si256(ss4, 1));
466 s3 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
467 UPDATE_BEST(s0, 0, 0);
468 UPDATE_BEST(s0, 1, 0);
469 UPDATE_BEST(s0, 2, 0);
470 UPDATE_BEST(s0, 3, 0);
471 UPDATE_BEST(s3, 0, 4);
472 UPDATE_BEST(s3, 1, 4);
473 UPDATE_BEST(s3, 2, 4);
474 UPDATE_BEST(s3, 3, 4);
475 }
476 }
477 }
478
479 if (leftover) {
480 p_src = src;
481 p_ref = ref + j;
482 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
483 for (k=0; k<height; k+=2) {
484 ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)p_ref)), _mm_loadu_si128((__m128i*)(p_ref + ref_stride)), 0x1);
485 ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(p_ref + 8))), _mm_loadu_si128((__m128i*)(p_ref + ref_stride + 8)), 0x1);
486 ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)p_src)), _mm_loadu_si128((__m128i*)(p_src + src_stride)), 0x1);
487 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
488 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
489 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
490 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
491 p_src += 2 * src_stride;
492 p_ref += 2 * ref_stride;
493 }
494 ss3 = _mm256_adds_epu16(ss3, ss4);
495 ss5 = _mm256_adds_epu16(ss5, ss6);
496 ss0 = _mm256_adds_epu16(ss3, ss5);
497 s0 = _mm_adds_epu16(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
498 s0 = _mm_or_si128(s0, s8);
499 s0 = _mm_minpos_epu16(s0);
500 tem_sum1 = _mm_extract_epi16(s0, 0);
501 if (tem_sum1 < low_sum) {
502 if (tem_sum1 != 0xFFFF) { // no overflow
503 low_sum = tem_sum1;
504 x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
505 y_best = i;
506 }
507 else {
508 ss4 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
509 ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
510 ss6 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
511 ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
512 ss4 = _mm256_add_epi32(ss4, ss6);
513 ss3 = _mm256_add_epi32(ss3, ss5);
514 s0 = _mm_add_epi32(_mm256_extracti128_si256(ss4, 0), _mm256_extracti128_si256(ss4, 1));
515 s3 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
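// Same 32-bit widening as the full-width path, but only the first 'leftover'
// positions are valid: walk them one sum at a time, switching from s0
// (positions 0-3) to s3 (positions 4-7) after four iterations.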
516 k = leftover;
517 while (k > 0) {
518 for (l=0; l < 4 && k; l++, k--) {
519 tem_sum1 = _mm_extract_epi32(s0, 0);
520 s0 = _mm_srli_si128(s0, 4);
521 if (tem_sum1 < low_sum) {
522 low_sum = tem_sum1;
523 x_best = (int16_t)(j + leftover - k);
524 y_best = i;
525 }
526 }
527 s0 = s3;
528 }
529 }
530 }
531 }
532 ref += src_stride_raw;
533 }
534 }
535 break;
536
537 case 24:
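// Width 24 uses full 32-byte loads: ss3/ss4 cover source bytes 0-7 and 16-23
// (both lanes kept), while only the low lane of ss5/ss6 (source bytes 8-15) is
// used, because the high lane would correspond to bytes 24-31, outside the
// 24-wide block.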
538 if (height <= 16) {
539 for (i=0; i<search_area_height; i++) {
540 for (j=0; j<=search_area_width-8; j+=8) {
541 p_src = src;
542 p_ref = ref + j;
543 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
544 for (k=0; k<height; k++) {
545 ss0 = _mm256_loadu_si256((__m256i*)p_ref);
546 ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
547 ss2 = _mm256_loadu_si256((__m256i *)p_src);
548 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
549 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
550 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
551 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
552 p_src += src_stride;
553 p_ref += ref_stride;
554 }
555 ss3 = _mm256_adds_epu16(ss3, ss4);
556 ss5 = _mm256_adds_epu16(ss5, ss6);
557 s3 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
558 s5 = _mm256_extracti128_si256(ss5, 0);
559 s4 = _mm_minpos_epu16(s3);
560 s6 = _mm_minpos_epu16(s5);
561 s4 = _mm_unpacklo_epi16(s4, s4);
562 s4 = _mm_unpacklo_epi32(s4, s4);
563 s4 = _mm_unpacklo_epi64(s4, s4);
564 s6 = _mm_unpacklo_epi16(s6, s6);
565 s6 = _mm_unpacklo_epi32(s6, s6);
566 s6 = _mm_unpacklo_epi64(s6, s6);
567 s3 = _mm_sub_epi16(s3, s4);
568 s5 = _mm_adds_epu16(s5, s3);
569 s5 = _mm_sub_epi16(s5, s6);
570 s5 = _mm_minpos_epu16(s5);
571 tem_sum1 = _mm_extract_epi16(s5, 0);
572 tem_sum1 += _mm_extract_epi16(s4, 0);
573 tem_sum1 += _mm_extract_epi16(s6, 0);
574 if (tem_sum1 < low_sum) {
575 low_sum = tem_sum1;
576 x_best = (int16_t)(j + _mm_extract_epi16(s5, 1));
577 y_best = i;
578 }
579 }
580
581 if (leftover) {
582 p_src = src;
583 p_ref = ref + j;
584 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
585 for (k=0; k<height; k++) {
586 ss0 = _mm256_loadu_si256((__m256i*)p_ref);
587 ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
588 ss2 = _mm256_loadu_si256((__m256i *)p_src);
589 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
590 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
591 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
592 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
593 p_src += src_stride;
594 p_ref += ref_stride;
595 }
596 ss3 = _mm256_adds_epu16(ss3, ss4);
597 ss5 = _mm256_adds_epu16(ss5, ss6);
598 s3 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
599 s5 = _mm256_extracti128_si256(ss5, 0);
600 s3 = _mm_or_si128(s3, s8);
601 s5 = _mm_or_si128(s5, s8);
602 s4 = _mm_minpos_epu16(s3);
603 s6 = _mm_minpos_epu16(s5);
604 s4 = _mm_unpacklo_epi16(s4, s4);
605 s4 = _mm_unpacklo_epi32(s4, s4);
606 s4 = _mm_unpacklo_epi64(s4, s4);
607 s6 = _mm_unpacklo_epi16(s6, s6);
608 s6 = _mm_unpacklo_epi32(s6, s6);
609 s6 = _mm_unpacklo_epi64(s6, s6);
610 s3 = _mm_sub_epi16(s3, s4);
611 s5 = _mm_adds_epu16(s5, s3);
612 s5 = _mm_sub_epi16(s5, s6);
613 s5 = _mm_minpos_epu16(s5);
614 tem_sum1 = _mm_extract_epi16(s5, 0);
615 tem_sum1 += _mm_extract_epi16(s4, 0);
616 tem_sum1 += _mm_extract_epi16(s6, 0);
617 if (tem_sum1 < low_sum) {
618 low_sum = tem_sum1;
619 x_best = (int16_t)(j + _mm_extract_epi16(s5, 1));
620 y_best = i;
621 }
622 }
623 ref += src_stride_raw;
624 }
625 }
626 else {
627 for (i=0; i<search_area_height; i++) {
628 for (j=0; j<=search_area_width-8; j+=8) {
629 p_src = src;
630 p_ref = ref + j;
631 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
632 for (k=0; k<height; k++) {
633 ss0 = _mm256_loadu_si256((__m256i*)p_ref);
634 ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
635 ss2 = _mm256_loadu_si256((__m256i *)p_src);
636 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
637 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
638 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
639 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
640 p_src += src_stride;
641 p_ref += ref_stride;
642 }
643 ss3 = _mm256_adds_epu16(ss3, ss4);
644 ss5 = _mm256_adds_epu16(ss5, ss6);
645 s3 = _mm256_extracti128_si256(ss3, 0);
646 s4 = _mm256_extracti128_si256(ss3, 1);
647 s5 = _mm256_extracti128_si256(ss5, 0);
648 s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), s5);
649 s0 = _mm_minpos_epu16(s0);
650 tem_sum1 = _mm_extract_epi16(s0, 0);
651 if (tem_sum1 < low_sum) {
652 if (tem_sum1 != 0xFFFF) { // no overflow
653 low_sum = tem_sum1;
654 x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
655 y_best = i;
656 }
657 else {
658 s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
659 s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
660 s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
661 s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
662 s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
663 s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
664 s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), s2);
665 s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), s5);
666 UPDATE_BEST(s0, 0, 0);
667 UPDATE_BEST(s0, 1, 0);
668 UPDATE_BEST(s0, 2, 0);
669 UPDATE_BEST(s0, 3, 0);
670 UPDATE_BEST(s3, 0, 4);
671 UPDATE_BEST(s3, 1, 4);
672 UPDATE_BEST(s3, 2, 4);
673 UPDATE_BEST(s3, 3, 4);
674 }
675 }
676 }
677
678 if (leftover) {
679 p_src = src;
680 p_ref = ref + j;
681 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
682 for (k=0; k<height; k++) {
683 ss0 = _mm256_loadu_si256((__m256i*)p_ref);
684 ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
685 ss2 = _mm256_loadu_si256((__m256i *)p_src);
686 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
687 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
688 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
689 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
690 p_src += src_stride;
691 p_ref += ref_stride;
692 }
693 ss3 = _mm256_adds_epu16(ss3, ss4);
694 ss5 = _mm256_adds_epu16(ss5, ss6);
695 s3 = _mm256_extracti128_si256(ss3, 0);
696 s4 = _mm256_extracti128_si256(ss3, 1);
697 s5 = _mm256_extracti128_si256(ss5, 0);
698 s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), s5);
699 s0 = _mm_or_si128(s0, s8);
700 s0 = _mm_minpos_epu16(s0);
701 tem_sum1 = _mm_extract_epi16(s0, 0);
702 if (tem_sum1 < low_sum) {
703 if (tem_sum1 != 0xFFFF) { // no overflow
704 low_sum = tem_sum1;
705 x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
706 y_best = i;
707 }
708 else {
709 s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
710 s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
711 s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
712 s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
713 s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
714 s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
715 s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), s2);
716 s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), s5);
717 k = leftover;
718 while (k > 0) {
719 for (l=0; l < 4 && k; l++, k--) {
720 tem_sum1 = _mm_extract_epi32(s0, 0);
721 s0 = _mm_srli_si128(s0, 4);
722 if (tem_sum1 < low_sum) {
723 low_sum = tem_sum1;
724 x_best = (int16_t)(j + leftover - k);
725 y_best = i;
726 }
727 }
728 s0 = s3;
729 }
730 }
731 }
732 }
733 ref += src_stride_raw;
734 }
735 }
736 break;
737
738 case 32:
739 if (height <= 16) {
740 for (i=0; i<search_area_height; i++) {
741 for (j=0; j<=search_area_width-8; j+=8) {
742 p_src = src;
743 p_ref = ref + j;
744 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
745 for (k=0; k<height; k++) {
746 ss0 = _mm256_loadu_si256((__m256i*)p_ref);
747 ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
748 ss2 = _mm256_loadu_si256((__m256i *)p_src);
749 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
750 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
751 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
752 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
753 p_src += src_stride;
754 p_ref += ref_stride;
755 }
756 ss3 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
757 s3 = _mm256_extracti128_si256(ss3, 0);
758 s5 = _mm256_extracti128_si256(ss3, 1);
759 s4 = _mm_minpos_epu16(s3);
760 s6 = _mm_minpos_epu16(s5);
761 s4 = _mm_unpacklo_epi16(s4, s4);
762 s4 = _mm_unpacklo_epi32(s4, s4);
763 s4 = _mm_unpacklo_epi64(s4, s4);
764 s6 = _mm_unpacklo_epi16(s6, s6);
765 s6 = _mm_unpacklo_epi32(s6, s6);
766 s6 = _mm_unpacklo_epi64(s6, s6);
767 s3 = _mm_sub_epi16(s3, s4);
768 s5 = _mm_adds_epu16(s5, s3);
769 s5 = _mm_sub_epi16(s5, s6);
770 s5 = _mm_minpos_epu16(s5);
771 tem_sum1 = _mm_extract_epi16(s5, 0);
772 tem_sum1 += _mm_extract_epi16(s4, 0);
773 tem_sum1 += _mm_extract_epi16(s6, 0);
774 if (tem_sum1 < low_sum) {
775 low_sum = tem_sum1;
776 x_best = (int16_t)(j + _mm_extract_epi16(s5, 1));
777 y_best = i;
778 }
779 }
780
781 if (leftover) {
782 p_src = src;
783 p_ref = ref + j;
784 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
785 for (k=0; k<height; k++) {
786 ss0 = _mm256_loadu_si256((__m256i*)p_ref);
787 ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
788 ss2 = _mm256_loadu_si256((__m256i *)p_src);
789 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
790 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
791 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
792 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
793 p_src += src_stride;
794 p_ref += ref_stride;
795 }
796 ss3 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
797 s3 = _mm256_extracti128_si256(ss3, 0);
798 s5 = _mm256_extracti128_si256(ss3, 1);
799 s3 = _mm_or_si128(s3, s8);
800 s5 = _mm_or_si128(s5, s8);
801 s4 = _mm_minpos_epu16(s3);
802 s6 = _mm_minpos_epu16(s5);
803 s4 = _mm_unpacklo_epi16(s4, s4);
804 s4 = _mm_unpacklo_epi32(s4, s4);
805 s4 = _mm_unpacklo_epi64(s4, s4);
806 s6 = _mm_unpacklo_epi16(s6, s6);
807 s6 = _mm_unpacklo_epi32(s6, s6);
808 s6 = _mm_unpacklo_epi64(s6, s6);
809 s3 = _mm_sub_epi16(s3, s4);
810 s5 = _mm_adds_epu16(s5, s3);
811 s5 = _mm_sub_epi16(s5, s6);
812 s5 = _mm_minpos_epu16(s5);
813 tem_sum1 = _mm_extract_epi16(s5, 0);
814 tem_sum1 += _mm_extract_epi16(s4, 0);
815 tem_sum1 += _mm_extract_epi16(s6, 0);
816 if (tem_sum1 < low_sum) {
817 low_sum = tem_sum1;
818 x_best = (int16_t)(j + _mm_extract_epi16(s5, 1));
819 y_best = i;
820 }
821 }
822 ref += src_stride_raw;
823 }
824 }
825 else if (height <= 32) {
826 for (i=0; i<search_area_height; i++) {
827 for (j=0; j<=search_area_width-8; j+=8) {
828 p_src = src;
829 p_ref = ref + j;
830 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
831 for (k=0; k<height; k++) {
832 ss0 = _mm256_loadu_si256((__m256i*)p_ref);
833 ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
834 ss2 = _mm256_loadu_si256((__m256i *)p_src);
835 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
836 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
837 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
838 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
839 p_src += src_stride;
840 p_ref += ref_stride;
841 }
842 ss3 = _mm256_adds_epu16(ss3, ss4);
843 ss5 = _mm256_adds_epu16(ss5, ss6);
844 ss6 = _mm256_adds_epu16(ss3, ss5);
845 s3 = _mm256_extracti128_si256(ss6, 0);
846 s4 = _mm256_extracti128_si256(ss6, 1);
847 s0 = _mm_adds_epu16(s3, s4);
848 s0 = _mm_minpos_epu16(s0);
849 tem_sum1 = _mm_extract_epi16(s0, 0);
850 if (tem_sum1 < low_sum) {
851 if (tem_sum1 != 0xFFFF) { // no overflow
852 low_sum = tem_sum1;
853 x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
854 y_best = i;
855 }
856 else {
857 ss4 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
858 ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
859 ss6 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
860 ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
861 ss4 = _mm256_add_epi32(ss4, ss6);
862 ss3 = _mm256_add_epi32(ss3, ss5);
863 s0 = _mm_add_epi32(_mm256_extracti128_si256(ss4, 0), _mm256_extracti128_si256(ss4, 1));
864 s3 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
865 UPDATE_BEST(s0, 0, 0);
866 UPDATE_BEST(s0, 1, 0);
867 UPDATE_BEST(s0, 2, 0);
868 UPDATE_BEST(s0, 3, 0);
869 UPDATE_BEST(s3, 0, 4);
870 UPDATE_BEST(s3, 1, 4);
871 UPDATE_BEST(s3, 2, 4);
872 UPDATE_BEST(s3, 3, 4);
873 }
874 }
875 }
876
877 if (leftover) {
878 p_src = src;
879 p_ref = ref + j;
880 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
881 for (k=0; k<height; k++) {
882 ss0 = _mm256_loadu_si256((__m256i*)p_ref);
883 ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
884 ss2 = _mm256_loadu_si256((__m256i *)p_src);
885 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
886 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
887 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
888 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
889 p_src += src_stride;
890 p_ref += ref_stride;
891 }
892 ss3 = _mm256_adds_epu16(ss3, ss4);
893 ss5 = _mm256_adds_epu16(ss5, ss6);
894 ss6 = _mm256_adds_epu16(ss3, ss5);
895 s3 = _mm256_extracti128_si256(ss6, 0);
896 s4 = _mm256_extracti128_si256(ss6, 1);
897 s0 = _mm_adds_epu16(s3, s4);
898 //s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
899 s0 = _mm_or_si128(s0, s8);
900 s0 = _mm_minpos_epu16(s0);
901 tem_sum1 = _mm_extract_epi16(s0, 0);
902 if (tem_sum1 < low_sum) {
903 if (tem_sum1 != 0xFFFF) { // no overflow
904 low_sum = tem_sum1;
905 x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
906 y_best = i;
907 }
908 else {
909 ss4 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
910 ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
911 ss6 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
912 ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
913 ss4 = _mm256_add_epi32(ss4, ss6);
914 ss3 = _mm256_add_epi32(ss3, ss5);
915 s0 = _mm_add_epi32(_mm256_extracti128_si256(ss4, 0), _mm256_extracti128_si256(ss4, 1));
916 s3 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
917 k = leftover;
918 while (k > 0) {
919 for (l=0; l < 4 && k; l++, k--) {
920 tem_sum1 = _mm_extract_epi32(s0, 0);
921 s0 = _mm_srli_si128(s0, 4);
922 if (tem_sum1 < low_sum) {
923 low_sum = tem_sum1;
924 x_best = (int16_t)(j + leftover - k);
925 y_best = i;
926 }
927 }
928 s0 = s3;
929 }
930 }
931 }
932 }
933 ref += src_stride_raw;
934 }
935 }
936 else {
937 for (i=0; i<search_area_height; i++) {
938 for (j=0; j<=search_area_width-8; j+=8) {
939 p_src = src;
940 p_ref = ref + j;
941 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
942 for (k=0; k<height; k++) {
943 ss0 = _mm256_loadu_si256((__m256i*)p_ref);
944 ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
945 ss2 = _mm256_loadu_si256((__m256i *)p_src);
946 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
947 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
948 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
949 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
950 p_src += src_stride;
951 p_ref += ref_stride;
952 }
953 ss7 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
954 s3 = _mm256_extracti128_si256(ss7, 0);
955 s4 = _mm256_extracti128_si256(ss7, 1);
956 s0 = _mm_adds_epu16(s3, s4);
957 s0 = _mm_minpos_epu16(s0);
958 tem_sum1 = _mm_extract_epi16(s0, 0);
959 if (tem_sum1 < low_sum) {
960 if (tem_sum1 != 0xFFFF) { // no overflow
961 low_sum = tem_sum1;
962 x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
963 y_best = i;
964 }
965 else {
966 ss0 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
967 ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
968 ss1 = _mm256_unpacklo_epi16(ss4, _mm256_setzero_si256());
969 ss4 = _mm256_unpackhi_epi16(ss4, _mm256_setzero_si256());
970 ss2 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
971 ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
972 ss7 = _mm256_unpacklo_epi16(ss6, _mm256_setzero_si256());
973 ss6 = _mm256_unpackhi_epi16(ss6, _mm256_setzero_si256());
974 ss0 = _mm256_add_epi32(_mm256_add_epi32(ss0, ss1), _mm256_add_epi32(ss2, ss7));
975 ss3 = _mm256_add_epi32(_mm256_add_epi32(ss3, ss4), _mm256_add_epi32(ss5, ss6));
976 s0 = _mm_add_epi32(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
977 s3 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
978 UPDATE_BEST(s0, 0, 0);
979 UPDATE_BEST(s0, 1, 0);
980 UPDATE_BEST(s0, 2, 0);
981 UPDATE_BEST(s0, 3, 0);
982 UPDATE_BEST(s3, 0, 4);
983 UPDATE_BEST(s3, 1, 4);
984 UPDATE_BEST(s3, 2, 4);
985 UPDATE_BEST(s3, 3, 4);
986 }
987 }
988 }
989
990 if (leftover) {
991 p_src = src;
992 p_ref = ref + j;
993 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
994 for (k=0; k<height; k++) {
995 ss0 = _mm256_loadu_si256((__m256i*)p_ref);
996 ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
997 ss2 = _mm256_loadu_si256((__m256i *)p_src);
998 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
999 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1000 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1001 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1002 p_src += src_stride;
1003 p_ref += ref_stride;
1004 }
1005 ss7 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
1006 s3 = _mm256_extracti128_si256(ss7, 0);
1007 s4 = _mm256_extracti128_si256(ss7, 1);
1008 s0 = _mm_adds_epu16(s3, s4);
1009 s0 = _mm_minpos_epu16(s0);
1010 tem_sum1 = _mm_extract_epi16(s0, 0);
1011 if (tem_sum1 < low_sum) {
1012 if (tem_sum1 != 0xFFFF) { // no overflow
1013 low_sum = tem_sum1;
1014 x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
1015 y_best = i;
1016 }
1017 else {
1018 ss0 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
1019 ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
1020 ss1 = _mm256_unpacklo_epi16(ss4, _mm256_setzero_si256());
1021 ss4 = _mm256_unpackhi_epi16(ss4, _mm256_setzero_si256());
1022 ss2 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
1023 ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
1024 ss7 = _mm256_unpacklo_epi16(ss6, _mm256_setzero_si256());
1025 ss6 = _mm256_unpackhi_epi16(ss6, _mm256_setzero_si256());
1026 ss0 = _mm256_add_epi32(_mm256_add_epi32(ss0, ss1), _mm256_add_epi32(ss2, ss7));
1027 ss3 = _mm256_add_epi32(_mm256_add_epi32(ss3, ss4), _mm256_add_epi32(ss5, ss6));
1028 s0 = _mm_add_epi32(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1029 s3 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1030 k = leftover;
1031 while (k > 0) {
1032 for (l=0; l < 4 && k; l++, k--) {
1033 tem_sum1 = _mm_extract_epi32(s0, 0);
1034 s0 = _mm_srli_si128(s0, 4);
1035 if (tem_sum1 < low_sum) {
1036 low_sum = tem_sum1;
1037 x_best = (int16_t)(j + leftover - k);
1038 y_best = i;
1039 }
1040 }
1041 s0 = s3;
1042 }
1043 }
1044 }
1045 }
1046 ref += src_stride_raw;
1047 }
1048 }
1049 break;
1050
1051 case 48:
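// Width 48 mixes register widths: the first 32 bytes of each row go through the
// 256-bit accumulators (ss3-ss6) and the remaining 16 bytes through the 128-bit
// accumulators (s3-s6); both partial sums are combined before minpos.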
1052 if (height <= 32) {
1053 for (i=0; i<search_area_height; i++) {
1054 for (j=0; j<=search_area_width-8; j+=8) {
1055 p_src = src;
1056 p_ref = ref + j;
1057 s3 = s4 = s5 = s6 = _mm_setzero_si128();
1058 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
1059 for (k=0; k<height; k++) {
1060 ss0 = _mm256_loadu_si256((__m256i*)p_ref);
1061 ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
1062 ss2 = _mm256_loadu_si256((__m256i *)p_src);
1063 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1064 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1065 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1066 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1067 s0 = _mm_loadu_si128((__m128i*)(p_ref + 32));
1068 s1 = _mm_loadu_si128((__m128i*)(p_ref + 40));
1069 s2 = _mm_loadu_si128((__m128i*)(p_src + 32));
1070 s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1071 s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1072 s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1073 s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1074 p_src += src_stride;
1075 p_ref += ref_stride;
1076 }
1077 s3 = _mm_adds_epu16(s3, s4);
1078 s5 = _mm_adds_epu16(s5, s6);
1079 s0 = _mm_adds_epu16(s3, s5);
1080 ss3 = _mm256_adds_epu16(ss3, ss4);
1081 ss5 = _mm256_adds_epu16(ss5, ss6);
1082 ss6 = _mm256_adds_epu16(ss3, ss5);
1083 s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm256_extracti128_si256(ss6, 0), _mm256_extracti128_si256(ss6, 1)));
1084 s0 = _mm_minpos_epu16(s0);
1085 tem_sum1 = _mm_extract_epi16(s0, 0);
1086 if (tem_sum1 < low_sum) {
1087 if (tem_sum1 != 0xFFFF) { // no overflow
1088 low_sum = tem_sum1;
1089 x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
1090 y_best = i;
1091 }
1092 else {
1093 ss4 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
1094 ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
1095 ss6 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
1096 ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
1097 ss4 = _mm256_add_epi32(ss4, ss6);
1098 ss3 = _mm256_add_epi32(ss3, ss5);
1099 s0 = _mm_add_epi32(_mm256_extracti128_si256(ss4, 0), _mm256_extracti128_si256(ss4, 1));
1100 s1 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1101 s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s3, _mm_setzero_si128()));
1102 s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s5, _mm_setzero_si128()));
1103 s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s3, _mm_setzero_si128()));
1104 s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s5, _mm_setzero_si128()));
1105 UPDATE_BEST(s0, 0, 0);
1106 UPDATE_BEST(s0, 1, 0);
1107 UPDATE_BEST(s0, 2, 0);
1108 UPDATE_BEST(s0, 3, 0);
1109 UPDATE_BEST(s1, 0, 4);
1110 UPDATE_BEST(s1, 1, 4);
1111 UPDATE_BEST(s1, 2, 4);
1112 UPDATE_BEST(s1, 3, 4);
1113 }
1114 }
1115 }
1116
1117 if (leftover) {
1118 p_src = src;
1119 p_ref = ref + j;
1120 s3 = s4 = s5 = s6 = _mm_setzero_si128();
1121 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
1122 for (k=0; k<height; k++) {
1123 ss0 = _mm256_loadu_si256((__m256i*)p_ref);
1124 ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
1125 ss2 = _mm256_loadu_si256((__m256i *)p_src);
1126 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1127 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1128 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1129 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1130 s0 = _mm_loadu_si128((__m128i*)(p_ref + 32));
1131 s1 = _mm_loadu_si128((__m128i*)(p_ref + 40));
1132 s2 = _mm_loadu_si128((__m128i*)(p_src + 32));
1133 s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1134 s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1135 s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1136 s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1137 p_src += src_stride;
1138 p_ref += ref_stride;
1139 }
1140 s3 = _mm_adds_epu16(s3, s4);
1141 s5 = _mm_adds_epu16(s5, s6);
1142 s0 = _mm_adds_epu16(s3, s5);
1143 ss3 = _mm256_adds_epu16(ss3, ss4);
1144 ss5 = _mm256_adds_epu16(ss5, ss6);
1145 ss6 = _mm256_adds_epu16(ss3, ss5);
1146 s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm256_extracti128_si256(ss6, 0), _mm256_extracti128_si256(ss6, 1)));
1147 s0 = _mm_or_si128(s0, s8);
1148 s0 = _mm_minpos_epu16(s0);
1149 tem_sum1 = _mm_extract_epi16(s0, 0);
1150 if (tem_sum1 < low_sum) {
1151 if (tem_sum1 != 0xFFFF) { // no overflow
1152 low_sum = tem_sum1;
1153 x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
1154 y_best = i;
1155 }
1156 else {
1157 ss4 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
1158 ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
1159 ss6 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
1160 ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
1161 ss4 = _mm256_add_epi32(ss4, ss6);
1162 ss3 = _mm256_add_epi32(ss3, ss5);
1163 s0 = _mm_add_epi32(_mm256_extracti128_si256(ss4, 0), _mm256_extracti128_si256(ss4, 1));
1164 s1 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1165 s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s3, _mm_setzero_si128()));
1166 s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s5, _mm_setzero_si128()));
1167 s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s3, _mm_setzero_si128()));
1168 s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s5, _mm_setzero_si128()));
1169 k = leftover;
1170 while (k > 0) {
1171 for (l=0; l < 4 && k; l++, k--) {
1172 tem_sum1 = _mm_extract_epi32(s0, 0);
1173 s0 = _mm_srli_si128(s0, 4);
1174 if (tem_sum1 < low_sum) {
1175 low_sum = tem_sum1;
1176 x_best = (int16_t)(j + leftover - k);
1177 y_best = i;
1178 }
1179 }
1180 s0 = s1;
1181 }
1182 }
1183 }
1184 }
1185 ref += src_stride_raw;
1186 }
1187 }
1188 else {
1189 for (i=0; i<search_area_height; i++) {
1190 for (j=0; j<=search_area_width-8; j+=8) {
1191 p_src = src;
1192 p_ref = ref + j;
1193 s3 = s4 = s5 = s6 = _mm_setzero_si128();
1194 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
1195 for (k=0; k<height; k++) {
1196 ss0 = _mm256_loadu_si256((__m256i*)p_ref);
1197 ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
1198 ss2 = _mm256_loadu_si256((__m256i *)p_src);
1199 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1200 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1201 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1202 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1203 s0 = _mm_loadu_si128((__m128i*)(p_ref + 32));
1204 s1 = _mm_loadu_si128((__m128i*)(p_ref + 40));
1205 s2 = _mm_loadu_si128((__m128i*)(p_src + 32));
1206 s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1207 s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1208 s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1209 s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1210 p_src += src_stride;
1211 p_ref += ref_stride;
1212 }
1213 s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
1214 ss7 = _mm256_adds_epu16(ss3, ss4);
1215 ss8 = _mm256_adds_epu16(ss5, ss6);
1216 ss7 = _mm256_adds_epu16(ss7, ss8);
1217 s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm256_extracti128_si256(ss7, 0), _mm256_extracti128_si256(ss7, 1)));
1218 s0 = _mm_minpos_epu16(s0);
1219 tem_sum1 = _mm_extract_epi16(s0, 0);
1220 if (tem_sum1 < low_sum) {
1221 if (tem_sum1 != 0xFFFF) { // no overflow
1222 low_sum = tem_sum1;
1223 x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
1224 y_best = i;
1225 }
1226 else {
1227 ss0 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
1228 ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
1229 ss1 = _mm256_unpacklo_epi16(ss4, _mm256_setzero_si256());
1230 ss4 = _mm256_unpackhi_epi16(ss4, _mm256_setzero_si256());
1231 ss2 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
1232 ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
1233 ss7 = _mm256_unpacklo_epi16(ss6, _mm256_setzero_si256());
1234 ss6 = _mm256_unpackhi_epi16(ss6, _mm256_setzero_si256());
1235 ss0 = _mm256_add_epi32(_mm256_add_epi32(ss0, ss1), _mm256_add_epi32(ss2, ss7));
1236 ss3 = _mm256_add_epi32(_mm256_add_epi32(ss3, ss4), _mm256_add_epi32(ss5, ss6));
1237 s0 = _mm_add_epi32(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1238 s1 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1239 s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s3, _mm_setzero_si128()));
1240 s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s4, _mm_setzero_si128()));
1241 s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s5, _mm_setzero_si128()));
1242 s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s6, _mm_setzero_si128()));
1243 s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s3, _mm_setzero_si128()));
1244 s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s4, _mm_setzero_si128()));
1245 s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s5, _mm_setzero_si128()));
1246 s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s6, _mm_setzero_si128()));
1247 UPDATE_BEST(s0, 0, 0);
1248 UPDATE_BEST(s0, 1, 0);
1249 UPDATE_BEST(s0, 2, 0);
1250 UPDATE_BEST(s0, 3, 0);
1251 UPDATE_BEST(s1, 0, 4);
1252 UPDATE_BEST(s1, 1, 4);
1253 UPDATE_BEST(s1, 2, 4);
1254 UPDATE_BEST(s1, 3, 4);
1255 }
1256 }
1257 }
1258
1259 if (leftover) {
1260 p_src = src;
1261 p_ref = ref + j;
1262 s3 = s4 = s5 = s6 = _mm_setzero_si128();
1263 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
1264 for (k=0; k<height; k++) {
1265 ss0 = _mm256_loadu_si256((__m256i*)p_ref);
1266 ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
1267 ss2 = _mm256_loadu_si256((__m256i *)p_src);
1268 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1269 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1270 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1271 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1272 s0 = _mm_loadu_si128((__m128i*)(p_ref + 32));
1273 s1 = _mm_loadu_si128((__m128i*)(p_ref + 40));
1274 s2 = _mm_loadu_si128((__m128i*)(p_src + 32));
1275 s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1276 s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1277 s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1278 s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1279 p_src += src_stride;
1280 p_ref += ref_stride;
1281 }
1282 s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
1283 ss7 = _mm256_adds_epu16(ss3, ss4);
1284 ss8 = _mm256_adds_epu16(ss5, ss6);
1285 ss7 = _mm256_adds_epu16(ss7, ss8);
1286 s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm256_extracti128_si256(ss7, 0), _mm256_extracti128_si256(ss7, 1)));
1287 s0 = _mm_or_si128(s0, s8);
1288 s0 = _mm_minpos_epu16(s0);
1289 tem_sum1 = _mm_extract_epi16(s0, 0);
1290 if (tem_sum1 < low_sum) {
1291 if (tem_sum1 != 0xFFFF) { // no overflow
1292 low_sum = tem_sum1;
1293 x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
1294 y_best = i;
1295 }
1296 else {
1297 ss0 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
1298 ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
1299 ss1 = _mm256_unpacklo_epi16(ss4, _mm256_setzero_si256());
1300 ss4 = _mm256_unpackhi_epi16(ss4, _mm256_setzero_si256());
1301 ss2 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
1302 ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
1303 ss7 = _mm256_unpacklo_epi16(ss6, _mm256_setzero_si256());
1304 ss6 = _mm256_unpackhi_epi16(ss6, _mm256_setzero_si256());
1305 ss0 = _mm256_add_epi32(_mm256_add_epi32(ss0, ss1), _mm256_add_epi32(ss2, ss7));
1306 ss3 = _mm256_add_epi32(_mm256_add_epi32(ss3, ss4), _mm256_add_epi32(ss5, ss6));
1307 s0 = _mm_add_epi32(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1308 s1 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1309 s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s3, _mm_setzero_si128()));
1310 s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s4, _mm_setzero_si128()));
1311 s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s5, _mm_setzero_si128()));
1312 s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s6, _mm_setzero_si128()));
1313 s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s3, _mm_setzero_si128()));
1314 s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s4, _mm_setzero_si128()));
1315 s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s5, _mm_setzero_si128()));
1316 s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s6, _mm_setzero_si128()));
1317 k = leftover;
1318 while (k > 0) {
1319 for (l=0; l < 4 && k; l++, k--) {
1320 tem_sum1 = _mm_extract_epi32(s0, 0);
1321 s0 = _mm_srli_si128(s0, 4);
1322 if (tem_sum1 < low_sum) {
1323 low_sum = tem_sum1;
1324 x_best = (int16_t)(j + leftover - k);
1325 y_best = i;
1326 }
1327 }
1328 s0 = s1;
1329 }
1330 }
1331 }
1332 }
1333 ref += src_stride_raw;
1334 }
1335 }
1336 break;
1337
1338 case 64:
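// Width 64 processes each row as two 32-byte halves accumulated into the same
// ss3-ss6 registers; the larger totals make the 0xFFFF saturation check and the
// exact 32-bit fallback below essential for taller blocks.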
1339 if (height <= 32) {
1340 for (i=0; i<search_area_height; i++) {
1341 for (j=0; j<=search_area_width-8; j+=8) {
1342 p_src = src;
1343 p_ref = ref + j;
1344 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
1345 for (k=0; k<height; k++) {
1346 ss0 = _mm256_loadu_si256((__m256i*)p_ref);
1347 ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
1348 ss2 = _mm256_loadu_si256((__m256i *)p_src);
1349 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1350 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1351 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1352 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1353 ss0 = _mm256_loadu_si256((__m256i*)(p_ref + 32));
1354 ss1 = _mm256_loadu_si256((__m256i*)(p_ref + 40));
1355 ss2 = _mm256_loadu_si256((__m256i *)(p_src + 32));
1356 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1357 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1358 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1359 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1360 p_src += src_stride;
1361 p_ref += ref_stride;
1362 }
1363 ss7 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
1364 s3 = _mm256_extracti128_si256(ss7, 0);
1365 s4 = _mm256_extracti128_si256(ss7, 1);
1366 s0 = _mm_adds_epu16(s3, s4);
1367 s0 = _mm_minpos_epu16(s0);
1368 tem_sum1 = _mm_extract_epi16(s0, 0);
1369 if (tem_sum1 < low_sum) {
1370 if (tem_sum1 != 0xFFFF) { // no overflow
1371 low_sum = tem_sum1;
1372 x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
1373 y_best = i;
1374 }
1375 else {
1376 ss0 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
1377 ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
1378 ss1 = _mm256_unpacklo_epi16(ss4, _mm256_setzero_si256());
1379 ss4 = _mm256_unpackhi_epi16(ss4, _mm256_setzero_si256());
1380 ss2 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
1381 ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
1382 ss7 = _mm256_unpacklo_epi16(ss6, _mm256_setzero_si256());
1383 ss6 = _mm256_unpackhi_epi16(ss6, _mm256_setzero_si256());
1384 ss0 = _mm256_add_epi32(_mm256_add_epi32(ss0, ss1), _mm256_add_epi32(ss2, ss7));
1385 ss3 = _mm256_add_epi32(_mm256_add_epi32(ss3, ss4), _mm256_add_epi32(ss5, ss6));
1386 s0 = _mm_add_epi32(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1387 s3 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1388 UPDATE_BEST(s0, 0, 0);
1389 UPDATE_BEST(s0, 1, 0);
1390 UPDATE_BEST(s0, 2, 0);
1391 UPDATE_BEST(s0, 3, 0);
1392 UPDATE_BEST(s3, 0, 4);
1393 UPDATE_BEST(s3, 1, 4);
1394 UPDATE_BEST(s3, 2, 4);
1395 UPDATE_BEST(s3, 3, 4);
1396 }
1397 }
1398 }
1399
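// Handle the remaining (search_area_width & 7) offsets of this search row.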
1400 if (leftover) {
1401 p_src = src;
1402 p_ref = ref + j;
1403 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
1404 for (k=0; k<height; k++) {
1405 ss0 = _mm256_loadu_si256((__m256i*)p_ref);
1406 ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
1407 ss2 = _mm256_loadu_si256((__m256i *)p_src);
1408 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1409 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1410 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1411 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1412 ss0 = _mm256_loadu_si256((__m256i*)(p_ref + 32));
1413 ss1 = _mm256_loadu_si256((__m256i*)(p_ref + 40));
1414 ss2 = _mm256_loadu_si256((__m256i *)(p_src + 32));
1415 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1416 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1417 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1418 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1419 p_src += src_stride;
1420 p_ref += ref_stride;
1421 }
1422 ss7 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
1423 s3 = _mm256_extracti128_si256(ss7, 0);
1424 s4 = _mm256_extracti128_si256(ss7, 1);
1425 s0 = _mm_adds_epu16(s3, s4);
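// s8 is 0xFFFF in the lanes beyond 'leftover', so invalid offsets cannot win the minpos below.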
1426 s0 = _mm_or_si128(s0, s8);
1427 s0 = _mm_minpos_epu16(s0);
1428 tem_sum1 = _mm_extract_epi16(s0, 0);
1429 if (tem_sum1 < low_sum) {
1430 if (tem_sum1 != 0xFFFF) { // no overflow
1431 low_sum = tem_sum1;
1432 x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
1433 y_best = i;
1434 }
1435 else {
1436 ss0 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
1437 ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
1438 ss1 = _mm256_unpacklo_epi16(ss4, _mm256_setzero_si256());
1439 ss4 = _mm256_unpackhi_epi16(ss4, _mm256_setzero_si256());
1440 ss2 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
1441 ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
1442 ss7 = _mm256_unpacklo_epi16(ss6, _mm256_setzero_si256());
1443 ss6 = _mm256_unpackhi_epi16(ss6, _mm256_setzero_si256());
1444 ss0 = _mm256_add_epi32(_mm256_add_epi32(ss0, ss1), _mm256_add_epi32(ss2, ss7));
1445 ss3 = _mm256_add_epi32(_mm256_add_epi32(ss3, ss4), _mm256_add_epi32(ss5, ss6));
1446 s0 = _mm_add_epi32(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1447 s3 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
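// Scan the widened 32-bit sums lane by lane, but only the 'leftover' valid offsets.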
1448 k = leftover;
1449 while (k > 0) {
1450 for (l=0; l < 4 && k; l++, k--) {
1451 tem_sum1 = _mm_extract_epi32(s0, 0);
1452 s0 = _mm_srli_si128(s0, 4);
1453 if (tem_sum1 < low_sum) {
1454 low_sum = tem_sum1;
1455 x_best = (int16_t)(j + leftover - k);
1456 y_best = i;
1457 }
1458 }
1459 s0 = s3;
1460 }
1461 }
1462 }
1463 }
1464 ref += src_stride_raw;
1465 }
1466 }
1467 else {
1468 __m256i ss9, ss10;
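// height > 32: use separate accumulators (ss7..ss10) for the second 32-byte half to postpone 16-bit saturation.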
1469 for (i=0; i<search_area_height; i++) {
1470 for (j=0; j<=search_area_width-8; j+=8) {
1471 p_src = src;
1472 p_ref = ref + j;
1473 ss3 = ss4 = ss5 = ss6 = ss7 = ss8 = ss9 = ss10 = _mm256_setzero_si256();
1474 for (k=0; k<height; k++) {
1475 ss0 = _mm256_loadu_si256((__m256i*)p_ref);
1476 ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
1477 ss2 = _mm256_loadu_si256((__m256i *)p_src);
1478 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1479 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1480 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1481 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1482 ss0 = _mm256_loadu_si256((__m256i*)(p_ref + 32));
1483 ss1 = _mm256_loadu_si256((__m256i*)(p_ref + 40));
1484 ss2 = _mm256_loadu_si256((__m256i *)(p_src + 32));
1485 ss7 = _mm256_adds_epu16(ss7, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1486 ss8 = _mm256_adds_epu16(ss8, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1487 ss9 = _mm256_adds_epu16(ss9, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1488 ss10 = _mm256_adds_epu16(ss10, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1489 p_src += src_stride;
1490 p_ref += ref_stride;
1491 }
1492 ss0 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
1493 ss0 = _mm256_adds_epu16(ss0, _mm256_adds_epu16(_mm256_adds_epu16(ss7, ss8), _mm256_adds_epu16(ss9, ss10)));
1494 s0 = _mm_adds_epu16(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1495 s0 = _mm_minpos_epu16(s0);
1496 tem_sum1 = _mm_extract_epi16(s0, 0);
1497 if (tem_sum1 < low_sum) {
1498 if (tem_sum1 != 0xFFFF) { // no overflow
1499 low_sum = tem_sum1;
1500 x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
1501 y_best = i;
1502 }
1503 else {
1504 ss0 = _mm256_add_epi32(_mm256_add_epi32(_mm256_unpacklo_epi16(ss3, _mm256_setzero_si256()), _mm256_unpacklo_epi16(ss4, _mm256_setzero_si256())), _mm256_add_epi32(_mm256_unpacklo_epi16(ss5, _mm256_setzero_si256()), _mm256_unpacklo_epi16(ss6, _mm256_setzero_si256())));
1505 ss1 = _mm256_add_epi32(_mm256_add_epi32(_mm256_unpackhi_epi16(ss3, _mm256_setzero_si256()), _mm256_unpackhi_epi16(ss4, _mm256_setzero_si256())), _mm256_add_epi32(_mm256_unpackhi_epi16(ss5, _mm256_setzero_si256()), _mm256_unpackhi_epi16(ss6, _mm256_setzero_si256())));
1506 ss2 = _mm256_add_epi32(_mm256_add_epi32(_mm256_unpacklo_epi16(ss7, _mm256_setzero_si256()), _mm256_unpacklo_epi16(ss8, _mm256_setzero_si256())), _mm256_add_epi32(_mm256_unpacklo_epi16(ss9, _mm256_setzero_si256()), _mm256_unpacklo_epi16(ss10, _mm256_setzero_si256())));
1507 ss3 = _mm256_add_epi32(_mm256_add_epi32(_mm256_unpackhi_epi16(ss7, _mm256_setzero_si256()), _mm256_unpackhi_epi16(ss8, _mm256_setzero_si256())), _mm256_add_epi32(_mm256_unpackhi_epi16(ss9, _mm256_setzero_si256()), _mm256_unpackhi_epi16(ss10, _mm256_setzero_si256())));
1508 ss0 = _mm256_add_epi32(ss0, ss2);
1509 ss1 = _mm256_add_epi32(ss1, ss3);
1510 s0 = _mm_add_epi32(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1511 s3 = _mm_add_epi32(_mm256_extracti128_si256(ss1, 0), _mm256_extracti128_si256(ss1, 1));
1512 UPDATE_BEST(s0, 0, 0);
1513 UPDATE_BEST(s0, 1, 0);
1514 UPDATE_BEST(s0, 2, 0);
1515 UPDATE_BEST(s0, 3, 0);
1516 UPDATE_BEST(s3, 0, 4);
1517 UPDATE_BEST(s3, 1, 4);
1518 UPDATE_BEST(s3, 2, 4);
1519 UPDATE_BEST(s3, 3, 4);
1520 }
1521 }
1522 }
1523
1524 if (leftover) {
1525 p_src = src;
1526 p_ref = ref + j;
1527 ss3 = ss4 = ss5 = ss6 = ss7 = ss8 = ss9 = ss10 = _mm256_setzero_si256();
1528 for (k=0; k<height; k++) {
1529 ss0 = _mm256_loadu_si256((__m256i*)p_ref);
1530 ss1 = _mm256_loadu_si256((__m256i*)(p_ref+8));
1531 ss2 = _mm256_loadu_si256((__m256i *)p_src);
1532 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1533 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1534 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1535 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1536 ss0 = _mm256_loadu_si256((__m256i*)(p_ref + 32));
1537 ss1 = _mm256_loadu_si256((__m256i*)(p_ref + 40));
1538 ss2 = _mm256_loadu_si256((__m256i *)(p_src + 32));
1539 ss7 = _mm256_adds_epu16(ss7, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1540 ss8 = _mm256_adds_epu16(ss8, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1541 ss9 = _mm256_adds_epu16(ss9, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1542 ss10 = _mm256_adds_epu16(ss10, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1543 p_src += src_stride;
1544 p_ref += ref_stride;
1545 }
1546 ss0 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
1547 ss0 = _mm256_adds_epu16(ss0, _mm256_adds_epu16(_mm256_adds_epu16(ss7, ss8), _mm256_adds_epu16(ss9, ss10)));
1548 s0 = _mm_adds_epu16(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1549 s0 = _mm_or_si128(s0, s8);
1550 s0 = _mm_minpos_epu16(s0);
1551 tem_sum1 = _mm_extract_epi16(s0, 0);
1552 if (tem_sum1 < low_sum) {
1553 if (tem_sum1 != 0xFFFF) { // no overflow
1554 low_sum = tem_sum1;
1555 x_best = (int16_t)(j + _mm_extract_epi16(s0, 1));
1556 y_best = i;
1557 }
1558 else {
1559 ss0 = _mm256_add_epi32(_mm256_add_epi32(_mm256_unpacklo_epi16(ss3, _mm256_setzero_si256()), _mm256_unpacklo_epi16(ss4, _mm256_setzero_si256())), _mm256_add_epi32(_mm256_unpacklo_epi16(ss5, _mm256_setzero_si256()), _mm256_unpacklo_epi16(ss6, _mm256_setzero_si256())));
1560 ss1 = _mm256_add_epi32(_mm256_add_epi32(_mm256_unpackhi_epi16(ss3, _mm256_setzero_si256()), _mm256_unpackhi_epi16(ss4, _mm256_setzero_si256())), _mm256_add_epi32(_mm256_unpackhi_epi16(ss5, _mm256_setzero_si256()), _mm256_unpackhi_epi16(ss6, _mm256_setzero_si256())));
1561 ss2 = _mm256_add_epi32(_mm256_add_epi32(_mm256_unpacklo_epi16(ss7, _mm256_setzero_si256()), _mm256_unpacklo_epi16(ss8, _mm256_setzero_si256())), _mm256_add_epi32(_mm256_unpacklo_epi16(ss9, _mm256_setzero_si256()), _mm256_unpacklo_epi16(ss10, _mm256_setzero_si256())));
1562 ss3 = _mm256_add_epi32(_mm256_add_epi32(_mm256_unpackhi_epi16(ss7, _mm256_setzero_si256()), _mm256_unpackhi_epi16(ss8, _mm256_setzero_si256())), _mm256_add_epi32(_mm256_unpackhi_epi16(ss9, _mm256_setzero_si256()), _mm256_unpackhi_epi16(ss10, _mm256_setzero_si256())));
1563 ss0 = _mm256_add_epi32(ss0, ss2);
1564 ss1 = _mm256_add_epi32(ss1, ss3);
1565 s0 = _mm_add_epi32(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1566 s3 = _mm_add_epi32(_mm256_extracti128_si256(ss1, 0), _mm256_extracti128_si256(ss1, 1));
1567 k = leftover;
1568 while (k > 0) {
1569 for (l=0; l < 4 && k; l++, k--) {
1570 tem_sum1 = _mm_extract_epi32(s0, 0);
1571 s0 = _mm_srli_si128(s0, 4);
1572 if (tem_sum1 < low_sum) {
1573 low_sum = tem_sum1;
1574 x_best = (int16_t)(j + leftover - k);
1575 y_best = i;
1576 }
1577 }
1578 s0 = s3;
1579 }
1580 }
1581 }
1582 }
1583 ref += src_stride_raw;
1584 }
1585 }
1586 break;
1587
1588 default:
1589 break;
1590 }
1591
1592 *best_sad = low_sum;
1593 *x_search_center = x_best;
1594 *y_search_center = y_best;
1595 }
1596
1597 /*******************************************************************************
1598 * Requirement: height % 4 = 0
1599 *******************************************************************************/
1600 uint32_t compute4x_m_sad_avx2_intrin(
1601 uint8_t *src, // input parameter, source samples Ptr
1602 uint32_t src_stride, // input parameter, source stride
1603 uint8_t *ref, // input parameter, reference samples Ptr
1604 uint32_t ref_stride, // input parameter, reference stride
1605 uint32_t height, // input parameter, block height (M)
1606 uint32_t width) // input parameter, block width (N)
1607 {
1608 __m128i xmm0;
1609 __m256i ymm = _mm256_setzero_si256();
1610 uint32_t y;
1611 (void)width;
1612
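// Accumulate _mm256_sad_epu8 over four 4-pixel rows per iteration; the two 128-bit halves are summed at the end.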
1613 for (y = 0; y < height; y += 4) {
1614 const __m256i src0123 = load8bit_4x4_avx2(src, src_stride);
1615 const __m256i ref0123 = load8bit_4x4_avx2(ref, ref_stride);
1616 ymm = _mm256_add_epi32(ymm, _mm256_sad_epu8(src0123, ref0123));
1617 src += src_stride << 2;
1618 ref += ref_stride << 2;
1619 }
1620
1621 xmm0 = _mm_add_epi32(_mm256_castsi256_si128(ymm),
1622 _mm256_extracti128_si256(ymm, 1));
1623
1624 return (uint32_t)_mm_cvtsi128_si32(xmm0);
1625 }
1626
1627 /*******************************************************************************
1628 * Requirement: height % 4 = 0
1629 *******************************************************************************/
1630 uint32_t compute8x_m_sad_avx2_intrin(
1631 uint8_t *src, // input parameter, source samples Ptr
1632 uint32_t src_stride, // input parameter, source stride
1633 uint8_t *ref, // input parameter, reference samples Ptr
1634 uint32_t ref_stride, // input parameter, reference stride
1635 uint32_t height, // input parameter, block height (M)
1636 uint32_t width) // input parameter, block width (N)
1637 {
1638 __m128i xmm0;
1639 __m256i ymm = _mm256_setzero_si256();
1640 uint32_t y;
1641 (void)width;
1642
1643 for (y = 0; y < height; y += 4) {
1644 const __m256i src0123 = load8bit_8x4_avx2(src, src_stride);
1645 const __m256i ref0123 = load8bit_8x4_avx2(ref, ref_stride);
1646 ymm = _mm256_add_epi32(ymm, _mm256_sad_epu8(src0123, ref0123));
1647 src += src_stride << 2;
1648 ref += ref_stride << 2;
1649 }
1650
1651 xmm0 = _mm_add_epi32(_mm256_castsi256_si128(ymm),
1652 _mm256_extracti128_si256(ymm, 1));
1653 xmm0 = _mm_add_epi32(xmm0, _mm_srli_si128(xmm0, 8));
1654
1655 return (uint32_t)_mm_cvtsi128_si32(xmm0);
1656 }
1657
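// Adds the SAD of two 16-pixel rows to the running 64-bit lane sums in ymm.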
1658 static __m256i Compute16x2Sad_Kernel(const uint8_t *const src,
1659 const uint32_t src_stride, const uint8_t *const ref,
1660 const uint32_t ref_stride, const __m256i ymm)
1661 {
1662 const __m256i src01 = load8bit_16x2_unaligned_avx2(src, src_stride);
1663 const __m256i ref01 = load8bit_16x2_unaligned_avx2(ref, ref_stride);
1664 return _mm256_add_epi32(ymm, _mm256_sad_epu8(src01, ref01));
1665 }
1666
1667 /*******************************************************************************
1668 * Requirement: height % 4 = 0
1669 *******************************************************************************/
1670 uint32_t compute16x_m_sad_avx2_intrin(
1671 uint8_t *src, // input parameter, source samples Ptr
1672 uint32_t src_stride, // input parameter, source stride
1673 uint8_t *ref, // input parameter, reference samples Ptr
1674 uint32_t ref_stride, // input parameter, reference stride
1675 uint32_t height, // input parameter, block height (M)
1676 uint32_t width) // input parameter, block width (N)
1677 {
1678 __m128i xmm0;
1679 __m256i ymm = _mm256_setzero_si256();
1680 uint32_t y;
1681 (void)width;
1682
1683 for (y = 0; y < height; y += 4) {
1684 ymm = Compute16x2Sad_Kernel(src + 0 * src_stride, src_stride,
1685 ref + 0 * ref_stride, ref_stride, ymm);
1686 ymm = Compute16x2Sad_Kernel(src + 2 * src_stride, src_stride,
1687 ref + 2 * ref_stride, ref_stride, ymm);
1688 src += src_stride << 2;
1689 ref += ref_stride << 2;
1690 }
1691
1692 xmm0 = _mm_add_epi32(_mm256_castsi256_si128(ymm),
1693 _mm256_extracti128_si256(ymm, 1));
1694 xmm0 = _mm_add_epi32(xmm0, _mm_srli_si128(xmm0, 8));
1695
1696 return (uint32_t)_mm_cvtsi128_si32(xmm0);
1697 }
1698
1699 /*******************************************************************************
1700 * Requirement: height % 2 = 0
1701 *******************************************************************************/
1702 uint32_t compute24x_m_sad_avx2_intrin(
1703 uint8_t *src, // input parameter, source samples Ptr
1704 uint32_t src_stride, // input parameter, source stride
1705 uint8_t *ref, // input parameter, reference samples Ptr
1706 uint32_t ref_stride, // input parameter, reference stride
1707 uint32_t height, // input parameter, block height (M)
1708 uint32_t width) // input parameter, block width (N)
1709 {
1710 __m128i xmm0, xmm1;
1711 __m256i ymm0, ymm1;
1712 uint32_t y;
1713 (void)width;
1714
1715 ymm0 = ymm1 = _mm256_setzero_si256();
1716 for (y = 0; y < height; y += 2) {
1717 ymm0 = _mm256_add_epi32(ymm0, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)src), _mm256_loadu_si256((__m256i *)ref)));
1718 ymm1 = _mm256_add_epi32(ymm1, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)(src + src_stride)), _mm256_loadu_si256((__m256i *)(ref + ref_stride))));
1719 src += src_stride << 1;
1720 ref += ref_stride << 1;
1721 }
1722 xmm0 = _mm_add_epi32(_mm256_extracti128_si256(ymm0, 0), _mm256_extracti128_si256(ymm1, 0));
1723 xmm1 = _mm_add_epi32(_mm256_extracti128_si256(ymm0, 1), _mm256_extracti128_si256(ymm1, 1));
1724 xmm0 = _mm_add_epi32(_mm_add_epi32(xmm0, _mm_srli_si128(xmm0, 8)), xmm1);
1725 return (uint32_t)_mm_cvtsi128_si32(xmm0);
1726 }
1727
1728 /*******************************************************************************
1729 * Requirement: height % 2 = 0
1730 *******************************************************************************/
1731 uint32_t compute32x_m_sad_avx2_intrin(
1732 uint8_t *src, // input parameter, source samples Ptr
1733 uint32_t src_stride, // input parameter, source stride
1734 uint8_t *ref, // input parameter, reference samples Ptr
1735 uint32_t ref_stride, // input parameter, reference stride
1736 uint32_t height, // input parameter, block height (M)
1737 uint32_t width) // input parameter, block width (N)
1738 {
1739 __m128i xmm0;
1740 __m256i ymm0, ymm1;
1741 uint32_t y;
1742 (void)width;
1743
1744 ymm0 = ymm1 = _mm256_setzero_si256();
1745 for (y = 0; y < height; y += 2) {
1746 ymm0 = _mm256_add_epi32(ymm0, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)src), _mm256_loadu_si256((__m256i *)ref)));
1747 ymm1 = _mm256_add_epi32(ymm1, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)(src + src_stride)), _mm256_loadu_si256((__m256i *)(ref + ref_stride))));
1748 src += src_stride << 1;
1749 ref += ref_stride << 1;
1750 }
1751 ymm0 = _mm256_add_epi32(ymm0, ymm1);
1752 xmm0 = _mm_add_epi32(_mm256_castsi256_si128(ymm0), _mm256_extracti128_si256(ymm0, 1));
1753 xmm0 = _mm_add_epi32(xmm0, _mm_srli_si128(xmm0, 8));
1754 return (uint32_t)_mm_cvtsi128_si32(xmm0);
1755 }
1756
1757 /*******************************************************************************
1758 * Requirement: height % 2 = 0
1759 *******************************************************************************/
1760 uint32_t compute48x_m_sad_avx2_intrin(
1761 uint8_t *src, // input parameter, source samples Ptr
1762 uint32_t src_stride, // input parameter, source stride
1763 uint8_t *ref, // input parameter, reference samples Ptr
1764 uint32_t ref_stride, // input parameter, reference stride
1765 uint32_t height, // input parameter, block height (M)
1766 uint32_t width) // input parameter, block width (N)
1767 {
1768 __m128i xmm0, xmm1;
1769 __m256i ymm0, ymm1;
1770 uint32_t y;
1771 (void)width;
1772
1773 ymm0 = ymm1 = _mm256_setzero_si256();
1774 xmm0 = xmm1 = _mm_setzero_si128();
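// 48-wide rows: one 32-byte AVX2 SAD plus one 16-byte SSE SAD per row, two rows per iteration.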
1775 for (y = 0; y < height; y += 2) {
1776 ymm0 = _mm256_add_epi32(ymm0, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)src), _mm256_loadu_si256((__m256i*)ref)));
1777 xmm0 = _mm_add_epi32(xmm0, _mm_sad_epu8(_mm_loadu_si128((__m128i*)(src + 32)), _mm_loadu_si128((__m128i*)(ref + 32))));
1778 ymm1 = _mm256_add_epi32(ymm1, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)(src + src_stride)), _mm256_loadu_si256((__m256i*)(ref + ref_stride))));
1779 xmm1 = _mm_add_epi32(xmm1, _mm_sad_epu8(_mm_loadu_si128((__m128i*)(src + src_stride + 32)), _mm_loadu_si128((__m128i*)(ref + ref_stride + 32))));
1780 src += src_stride << 1;
1781 ref += ref_stride << 1;
1782 }
1783 ymm0 = _mm256_add_epi32(ymm0, ymm1);
1784 xmm0 = _mm_add_epi32(xmm0, xmm1);
1785 xmm0 = _mm_add_epi32(xmm0, _mm_add_epi32(_mm256_extracti128_si256(ymm0, 0), _mm256_extracti128_si256(ymm0, 1)));
1786 xmm0 = _mm_add_epi32(xmm0, _mm_srli_si128(xmm0, 8));
1787 return _mm_extract_epi32(xmm0, 0);
1788 }
1789
1790 /*******************************************************************************
1791 * Requirement: height % 2 = 0
1792 *******************************************************************************/
1793 uint32_t compute64x_m_sad_avx2_intrin(
1794 uint8_t *src, // input parameter, source samples Ptr
1795 uint32_t src_stride, // input parameter, source stride
1796 uint8_t *ref, // input parameter, reference samples Ptr
1797 uint32_t ref_stride, // input parameter, reference stride
1798 uint32_t height, // input parameter, block height (M)
1799 uint32_t width) // input parameter, block width (N)
1800 {
1801 __m128i xmm0;
1802 __m256i ymm0, ymm1, ymm2, ymm3;
1803 uint32_t y;
1804 (void)width;
1805
1806 ymm0 = ymm1 = ymm2 = ymm3 = _mm256_setzero_si256();
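// 64-wide rows: two 32-byte SADs per row, two rows per iteration, spread over four accumulators.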
1807 for (y = 0; y < height; y += 2) {
1808 ymm0 = _mm256_add_epi32(ymm0, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)src), _mm256_loadu_si256((__m256i*)ref)));
1809 ymm1 = _mm256_add_epi32(ymm1, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)(src + 32)), _mm256_loadu_si256((__m256i*)(ref + 32))));
1810 ymm2 = _mm256_add_epi32(ymm2, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)(src + src_stride)), _mm256_loadu_si256((__m256i*)(ref + ref_stride))));
1811 ymm3 = _mm256_add_epi32(ymm3, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)(src + src_stride + 32)), _mm256_loadu_si256((__m256i*)(ref + ref_stride + 32))));
1812 src += src_stride << 1;
1813 ref += ref_stride << 1;
1814 }
1815 ymm0 = _mm256_add_epi32(_mm256_add_epi32(ymm0, ymm1), _mm256_add_epi32(ymm2, ymm3));
1816 xmm0 = _mm_add_epi32(_mm256_castsi256_si128(ymm0), _mm256_extracti128_si256(ymm0, 1));
1817 xmm0 = _mm_add_epi32(xmm0, _mm_srli_si128(xmm0, 8));
1818 return _mm_extract_epi32(xmm0, 0);
1819 }
1820
1821 #ifdef DISABLE_AVX512
1822
1823 void eb_vp9_get_eight_horizontal_search_point_results_8x8_16x16_pu_avx2_intrin(
1824 uint8_t *src,
1825 uint32_t src_stride,
1826 uint8_t *ref,
1827 uint32_t ref_stride,
1828 uint32_t *p_best_sad8x8,
1829 uint32_t *p_best_mv8x8,
1830 uint32_t *p_best_sad16x16,
1831 uint32_t *p_best_mv16x16,
1832 uint32_t mv,
1833 uint16_t *p_sad16x16)
1834 {
1835
1836 int16_t x_mv, y_mv;
1837 __m128i s3;
1838 __m128i sad_0, sad_1, sad_2, sad_3;
1839 __m256i ss0, ss1, ss2, ss3, ss4;
1840 uint32_t tem_sum;
1841
1842 /*
1843 ------------------------------------- -----------------------------------
1844 | 8x8_00 | 8x8_01 | 8x8_04 | 8x8_05 | 8x8_16 | 8x8_17 | 8x8_20 | 8x8_21 |
1845 ------------------------------------- -----------------------------------
1846 | 8x8_02 | 8x8_03 | 8x8_06 | 8x8_07 | 8x8_18 | 8x8_19 | 8x8_22 | 8x8_23 |
1847 ----------------------- ----------- ---------------------- ----------
1848 | 8x8_08 | 8x8_09 | 8x8_12 | 8x8_13 | 8x8_24 | 8x8_25 | 8x8_28 | 8x8_29 |
1849 ---------------------- ----------- --------------------- ----------
1850 | 8x8_10 | 8x8_11 | 8x8_14 | 8x8_15 | 8x8_26 | 8x8_27 | 8x8_30 | 8x8_31 |
1851 ------------------------------------- -----------------------------------
1852
1853 ------------------------------------- -----------------------------------
1854 | 8x8_32 | 8x8_33 | 8x8_36 | 8x8_37 | 8x8_48 | 8x8_49 | 8x8_52 | 8x8_53 |
1855 ------------------------------------- -----------------------------------
1856 | 8x8_34 | 8x8_35 | 8x8_38 | 8x8_39 | 8x8_50 | 8x8_51 | 8x8_54 | 8x8_55 |
1857 ----------------------- ----------- ---------------------- ----------
1858 | 8x8_40 | 8x8_41 | 8x8_44 | 8x8_45 | 8x8_56 | 8x8_57 | 8x8_60 | 8x8_61 |
1859 ---------------------- ----------- --------------------- ----------
1860 | 8x8_42 | 8x8_43 | 8x8_46 | 8x8_47 | 8x8_58 | 8x8_59 | 8x8_62 | 8x8_63 |
1861 ------------------------------------- -----------------------------------
1862 */
1863
1864 /*
1865 ---------------------- ----------------------
1866 | 16x16_0 | 16x16_1 | 16x16_4 | 16x16_5 |
1867 ---------------------- ----------------------
1868 | 16x16_2 | 16x16_3 | 16x16_6 | 16x16_7 |
1869 ----------------------- -----------------------
1870 | 16x16_8 | 16x16_9 | 16x16_12 | 16x16_13 |
1871 ---------------------- ----------------------
1872 | 16x16_10 | 16x16_11 | 16x16_14 | 16x16_15 |
1873 ----------------------- -----------------------
1874 */
1875
1876 //8x8_0 & 8x8_1
1877 ss0 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)ref), _mm_loadu_si128((__m128i*)(ref + 2 * ref_stride)));
1878 ss1 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)(ref + 8)), _mm_loadu_si128((__m128i*)(ref + 2 * ref_stride + 8)));
1879 ss2 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)src), _mm_loadu_si128((__m128i*)(src + 2 * src_stride)));
1880 ss3 = _mm256_mpsadbw_epu8(ss0, ss2, 0);
1881 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1882 ss4 = _mm256_mpsadbw_epu8(ss1, ss2, 18); // 010 010
1883 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1884 src += src_stride * 4;
1885 ref += ref_stride * 4;
1886 ss0 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)ref), _mm_loadu_si128((__m128i*)(ref + 2 * ref_stride)));
1887 ss1 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)(ref + 8)), _mm_loadu_si128((__m128i*)(ref + 2 * ref_stride + 8)));
1888 ss2 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)src), _mm_loadu_si128((__m128i*)(src + 2 * src_stride)));
1889 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1890 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1891 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1892 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1893 sad_0 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1894 sad_1 = _mm_adds_epu16(_mm256_extracti128_si256(ss4, 0), _mm256_extracti128_si256(ss4, 1));
1895
1896 //8x8_2 & 8x8_3
1897 src += src_stride * 4;
1898 ref += ref_stride * 4;
1899 ss0 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)ref), _mm_loadu_si128((__m128i*)(ref + 2 * ref_stride)));
1900 ss1 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)(ref + 8)), _mm_loadu_si128((__m128i*)(ref + 2 * ref_stride + 8)));
1901 ss2 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)src), _mm_loadu_si128((__m128i*)(src + 2 * src_stride)));
1902 ss3 = _mm256_mpsadbw_epu8(ss0, ss2, 0);
1903 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1904 ss4 = _mm256_mpsadbw_epu8(ss1, ss2, 18); // 010 010
1905 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1906
1907 src += src_stride * 4;
1908 ref += ref_stride * 4;
1909 ss0 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)ref), _mm_loadu_si128((__m128i*)(ref + 2 * ref_stride)));
1910 ss1 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)(ref + 8)), _mm_loadu_si128((__m128i*)(ref + 2 * ref_stride + 8)));
1911 ss2 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)src), _mm_loadu_si128((__m128i*)(src + 2 * src_stride)));
1912 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1913 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1914 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1915 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1916 sad_2 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1917 sad_3 = _mm_adds_epu16(_mm256_extracti128_si256(ss4, 0), _mm256_extracti128_si256(ss4, 1));
1918
1919 //16x16
1920 s3 = _mm_adds_epu16(_mm_adds_epu16(sad_0, sad_1), _mm_adds_epu16(sad_2, sad_3));
1921 //store the 8 SADs (16x8 SADs)
1922 _mm_store_si128((__m128i*)p_sad16x16, s3);
1923 //find the best for 16x16
1924 s3 = _mm_minpos_epu16(s3);
1925 tem_sum = _mm_extract_epi16(s3, 0) << 1;
1926 if (tem_sum < p_best_sad16x16[0]) {
1927 p_best_sad16x16[0] = tem_sum;
1928 x_mv = _MVXT(mv) + (int16_t)(_mm_extract_epi16(s3, 1) * 4);
1929 y_mv = _MVYT(mv);
1930 p_best_mv16x16[0] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
1931 }
1932
1933 //find the best for 8x8_0, 8x8_1, 8x8_2 & 8x8_3
1934 sad_0 = _mm_minpos_epu16(sad_0);
1935 sad_1 = _mm_minpos_epu16(sad_1);
1936 sad_2 = _mm_minpos_epu16(sad_2);
1937 sad_3 = _mm_minpos_epu16(sad_3);
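// Pack the four (min, index) pairs so sad_0 holds the 32-bit minima and sad_1 the column indices; SADs are scaled by 2 (every other line summed) and indices by 4 to form MV x offsets.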
1938 sad_0 = _mm_unpacklo_epi16(sad_0, sad_1);
1939 sad_2 = _mm_unpacklo_epi16(sad_2, sad_3);
1940 sad_0 = _mm_unpacklo_epi32(sad_0, sad_2);
1941 sad_1 = _mm_unpackhi_epi16(sad_0, _mm_setzero_si128());
1942 sad_0 = _mm_unpacklo_epi16(sad_0, _mm_setzero_si128());
1943 sad_0 = _mm_slli_epi32(sad_0, 1);
1944 sad_1 = _mm_slli_epi16(sad_1, 2);
1945 sad_2 = _mm_loadu_si128((__m128i*)p_best_sad8x8);
1946 s3 = _mm_cmpgt_epi32(sad_2, sad_0);
1947 sad_0 = _mm_min_epu32(sad_0, sad_2);
1948 _mm_storeu_si128((__m128i*)p_best_sad8x8, sad_0);
1949 sad_3 = _mm_loadu_si128((__m128i*)p_best_mv8x8);
1950 sad_3 = _mm_andnot_si128(s3, sad_3);
1951 sad_2 = _mm_set1_epi32(mv);
1952 sad_2 = _mm_add_epi16(sad_2, sad_1);
1953 sad_2 = _mm_and_si128(sad_2, s3);
1954 sad_2 = _mm_or_si128(sad_2, sad_3);
1955 _mm_storeu_si128((__m128i*)p_best_mv8x8, sad_2);
1956
1957 }
1958 void eb_vp9_get_eight_horizontal_search_point_results_32x32_64x64_pu_avx2_intrin(
1959 uint16_t *p_sad16x16,
1960 uint32_t *p_best_sad32x32,
1961 uint32_t *p_best_sad64x64,
1962 uint32_t *p_best_mv32x32,
1963 uint32_t *p_best_mv64x64,
1964 uint32_t mv)
1965 {
1966 int16_t x_mv, y_mv;
1967 uint32_t tem_sum, best_sad64x64, best_mv_64x64;
1968 __m128i s0, s1, s2, s3, s4, s5, s6, s7, sad_0, sad_1;
1969 __m128i sad_00, sad_01, sad_10, sad_11, sad_20, sad_21, sad_30, sad_31;
1970 __m256i ss0, ss1, ss2, ss3, ss4, ss5, ss6, ss7;
1971
1972 s0 = _mm_setzero_si128();
1973 s1 = _mm_setzero_si128();
1974 s2 = _mm_setzero_si128();
1975 s3 = _mm_setzero_si128();
1976 s4 = _mm_setzero_si128();
1977 s5 = _mm_setzero_si128();
1978 s6 = _mm_setzero_si128();
1979 s7 = _mm_setzero_si128();
1980 sad_0 = _mm_setzero_si128();
1981 sad_1 = _mm_setzero_si128();
1982
1983 sad_00 = _mm_setzero_si128();
1984 sad_01 = _mm_setzero_si128();
1985 sad_10 = _mm_setzero_si128();
1986 sad_11 = _mm_setzero_si128();
1987 sad_20 = _mm_setzero_si128();
1988 sad_21 = _mm_setzero_si128();
1989 sad_30 = _mm_setzero_si128();
1990 sad_31 = _mm_setzero_si128();
1991
1992 ss0 = _mm256_setzero_si256();
1993 ss1 = _mm256_setzero_si256();
1994 ss2 = _mm256_setzero_si256();
1995 ss3 = _mm256_setzero_si256();
1996 ss4 = _mm256_setzero_si256();
1997 ss5 = _mm256_setzero_si256();
1998 ss6 = _mm256_setzero_si256();
1999 ss7 = _mm256_setzero_si256();
2000
2001 /*--------------------
2002 | 32x32_0 | 32x32_1
2003 ----------------------
2004 | 32x32_2 | 32x32_3
2005 ----------------------*/
2006
2007 /* data ordering in p_sad16x16 buffer
2008
2009 Search Search Search
2010 Point 0 Point 1 Point 7
2011 ---------------------------------------
2012 16x16_0 | x | x | ...... | x |
2013 ---------------------------------------
2014 16x16_1 | x | x | ...... | x |
2015
2016 16x16_n | x | x | ...... | x |
2017
2018 ---------------------------------------
2019 16x16_15 | x | x | ...... | x |
2020 ---------------------------------------
2021 */
2022
2023 // __m128i Zero = _mm_setzero_si128();
2024
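// Each 32x32 SAD is the sum of its four 16x16 SADs; unpacklo/unpackhi separate search points 0..3 from 4..7.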
2025 //32x32_0
2026 s0 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 0 * 8));
2027 s1 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 1 * 8));
2028 s2 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 2 * 8));
2029 s3 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 3 * 8));
2030
2031 s4 = _mm_unpackhi_epi16(s0, _mm_setzero_si128());
2032 s5 = _mm_unpacklo_epi16(s0, _mm_setzero_si128());
2033 s6 = _mm_unpackhi_epi16(s1, _mm_setzero_si128());
2034 s7 = _mm_unpacklo_epi16(s1, _mm_setzero_si128());
2035 s0 = _mm_add_epi32(s4, s6);
2036 s1 = _mm_add_epi32(s5, s7);
2037
2038 s4 = _mm_unpackhi_epi16(s2, _mm_setzero_si128());
2039 s5 = _mm_unpacklo_epi16(s2, _mm_setzero_si128());
2040 s6 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
2041 s7 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
2042 s2 = _mm_add_epi32(s4, s6);
2043 s3 = _mm_add_epi32(s5, s7);
2044
2045 sad_01 = _mm_add_epi32(s0, s2);
2046 sad_00 = _mm_add_epi32(s1, s3);
2047
2048 //32x32_1
2049 s0 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 4 * 8));
2050 s1 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 5 * 8));
2051 s2 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 6 * 8));
2052 s3 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 7 * 8));
2053
2054 s4 = _mm_unpackhi_epi16(s0, _mm_setzero_si128());
2055 s5 = _mm_unpacklo_epi16(s0, _mm_setzero_si128());
2056 s6 = _mm_unpackhi_epi16(s1, _mm_setzero_si128());
2057 s7 = _mm_unpacklo_epi16(s1, _mm_setzero_si128());
2058 s0 = _mm_add_epi32(s4, s6);
2059 s1 = _mm_add_epi32(s5, s7);
2060
2061 s4 = _mm_unpackhi_epi16(s2, _mm_setzero_si128());
2062 s5 = _mm_unpacklo_epi16(s2, _mm_setzero_si128());
2063 s6 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
2064 s7 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
2065 s2 = _mm_add_epi32(s4, s6);
2066 s3 = _mm_add_epi32(s5, s7);
2067
2068 sad_11 = _mm_add_epi32(s0, s2);
2069 sad_10 = _mm_add_epi32(s1, s3);
2070
2071 //32x32_2
2072 s0 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 8 * 8));
2073 s1 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 9 * 8));
2074 s2 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 10 * 8));
2075 s3 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 11 * 8));
2076
2077 s4 = _mm_unpackhi_epi16(s0, _mm_setzero_si128());
2078 s5 = _mm_unpacklo_epi16(s0, _mm_setzero_si128());
2079 s6 = _mm_unpackhi_epi16(s1, _mm_setzero_si128());
2080 s7 = _mm_unpacklo_epi16(s1, _mm_setzero_si128());
2081 s0 = _mm_add_epi32(s4, s6);
2082 s1 = _mm_add_epi32(s5, s7);
2083
2084 s4 = _mm_unpackhi_epi16(s2, _mm_setzero_si128());
2085 s5 = _mm_unpacklo_epi16(s2, _mm_setzero_si128());
2086 s6 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
2087 s7 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
2088 s2 = _mm_add_epi32(s4, s6);
2089 s3 = _mm_add_epi32(s5, s7);
2090
2091 sad_21 = _mm_add_epi32(s0, s2);
2092 sad_20 = _mm_add_epi32(s1, s3);
2093
2094 //32x32_3
2095 s0 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 12 * 8));
2096 s1 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 13 * 8));
2097 s2 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 14 * 8));
2098 s3 = _mm_loadu_si128((__m128i*)(p_sad16x16 + 15 * 8));
2099
2100 s4 = _mm_unpackhi_epi16(s0, _mm_setzero_si128());
2101 s5 = _mm_unpacklo_epi16(s0, _mm_setzero_si128());
2102 s6 = _mm_unpackhi_epi16(s1, _mm_setzero_si128());
2103 s7 = _mm_unpacklo_epi16(s1, _mm_setzero_si128());
2104 s0 = _mm_add_epi32(s4, s6);
2105 s1 = _mm_add_epi32(s5, s7);
2106
2107 s4 = _mm_unpackhi_epi16(s2, _mm_setzero_si128());
2108 s5 = _mm_unpacklo_epi16(s2, _mm_setzero_si128());
2109 s6 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
2110 s7 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
2111 s2 = _mm_add_epi32(s4, s6);
2112 s3 = _mm_add_epi32(s5, s7);
2113
2114 sad_31 = _mm_add_epi32(s0, s2);
2115 sad_30 = _mm_add_epi32(s1, s3);
2116
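// 64x64 SAD per search point = sum of the four 32x32 SADs; << 1 compensates for summing every other line.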
2117 sad_0 = _mm_add_epi32(_mm_add_epi32(sad_00, sad_10), _mm_add_epi32(sad_20, sad_30));
2118 sad_1 = _mm_add_epi32(_mm_add_epi32(sad_01, sad_11), _mm_add_epi32(sad_21, sad_31));
2119 sad_0 = _mm_slli_epi32(sad_0, 1);
2120 sad_1 = _mm_slli_epi32(sad_1, 1);
2121
2122 best_sad64x64 = p_best_sad64x64[0];
2123 best_mv_64x64 = 0;
2124 //sad_0
2125 tem_sum = _mm_extract_epi32(sad_0, 0);
2126 if (tem_sum < best_sad64x64) {
2127 best_sad64x64 = tem_sum;
2128 }
2129 tem_sum = _mm_extract_epi32(sad_0, 1);
2130 if (tem_sum < best_sad64x64) {
2131 best_sad64x64 = tem_sum;
2132 best_mv_64x64 = 1 * 4;
2133 }
2134 tem_sum = _mm_extract_epi32(sad_0, 2);
2135 if (tem_sum < best_sad64x64) {
2136 best_sad64x64 = tem_sum;
2137 best_mv_64x64 = 2 * 4;
2138 }
2139 tem_sum = _mm_extract_epi32(sad_0, 3);
2140 if (tem_sum < best_sad64x64) {
2141 best_sad64x64 = tem_sum;
2142 best_mv_64x64 = 3 * 4;
2143 }
2144
2145 //sad_1
2146 tem_sum = _mm_extract_epi32(sad_1, 0);
2147 if (tem_sum < best_sad64x64) {
2148 best_sad64x64 = tem_sum;
2149 best_mv_64x64 = 4 * 4;
2150 }
2151 tem_sum = _mm_extract_epi32(sad_1, 1);
2152 if (tem_sum < best_sad64x64) {
2153 best_sad64x64 = tem_sum;
2154 best_mv_64x64 = 5 * 4;
2155 }
2156 tem_sum = _mm_extract_epi32(sad_1, 2);
2157 if (tem_sum < best_sad64x64) {
2158 best_sad64x64 = tem_sum;
2159 best_mv_64x64 = 6 * 4;
2160 }
2161 tem_sum = _mm_extract_epi32(sad_1, 3);
2162 if (tem_sum < best_sad64x64) {
2163 best_sad64x64 = tem_sum;
2164 best_mv_64x64 = 7 * 4;
2165 }
2166 if (p_best_sad64x64[0] != best_sad64x64) {
2167 p_best_sad64x64[0] = best_sad64x64;
2168 x_mv = _MVXT(mv) + (int16_t)best_mv_64x64; y_mv = _MVYT(mv);
2169 p_best_mv64x64[0] = ((uint16_t)y_mv << 16) | ((uint16_t)x_mv);
2170 }
2171
2172 // ****CODE PAST HERE IS BUGGY FOR GCC****
2173
2174 // XY
2175 // X: 32x32 block [0..3]
2176 // Y: Search position [0..7]
2177 ss0 = _mm256_setr_m128i(sad_00, sad_01); // 07 06 05 04 03 02 01 00
2178 ss1 = _mm256_setr_m128i(sad_10, sad_11); // 17 16 15 14 13 12 11 10
2179 ss2 = _mm256_setr_m128i(sad_20, sad_21); // 27 26 25 24 23 22 21 20
2180 ss3 = _mm256_setr_m128i(sad_30, sad_31); // 37 36 35 34 33 32 31 30
2181 ss4 = _mm256_unpacklo_epi32(ss0, ss1); // 15 05 14 04 11 01 10 00
2182 ss5 = _mm256_unpacklo_epi32(ss2, ss3); // 35 25 34 24 31 21 30 20
2183 ss6 = _mm256_unpackhi_epi32(ss0, ss1); // 17 07 16 06 13 03 12 02
2184 ss7 = _mm256_unpackhi_epi32(ss2, ss3); // 37 27 36 26 33 23 32 22
2185 ss0 = _mm256_unpacklo_epi64(ss4, ss5); // 34 24 14 04 30 20 10 00
2186 ss1 = _mm256_unpackhi_epi64(ss4, ss5); // 35 25 15 05 31 21 11 01
2187 ss2 = _mm256_unpacklo_epi64(ss6, ss7); // 36 26 16 06 32 22 12 02
2188 ss3 = _mm256_unpackhi_epi64(ss6, ss7); // 37 27 17 07 33 23 13 03
2189
2190 // ss4 | ss5-2 | ss6 |
2191 // a0 : a1 | a2 : a3 | min(a0, a1) : min(a2, a3) | | (ss4 & !ss6) | ((ss5-2) & ss6) | ((ss4 & !ss6) | ((ss5-2) & ss6)) |
2192 // > (-1) | > (-3) | > (-1) | a3 | 0 | -3 | -3 |
2193 // > (-1) | > (-3) | <= (0) | a1 | -1 | 0 | -1 |
2194 // > (-1) | <= (-2) | > (-1) | a2 | 0 | -2 | -2 |
2195 // > (-1) | <= (-2) | <= (0) | a1 | -1 | 0 | -1 |
2196 // <= (0) | > (-3) | > (-1) | a3 | 0 | -3 | -3 |
2197 // <= (0) | > (-3) | <= (0) | a0 | 0 | 0 | 0 |
2198 // <= (0) | <= (-2) | > (-1) | a2 | 0 | -2 | -2 |
2199 // <= (0) | <= (-2) | <= (0) | a0 | 0 | 0 | 0 |
2200
2201 // *** 8 search points per position ***
2202
2203 // ss0: Search Pos 0,4 for blocks 0,1,2,3
2204 // ss1: Search Pos 1,5 for blocks 0,1,2,3
2205 // ss2: Search Pos 2,6 for blocks 0,1,2,3
2206 // ss3: Search Pos 3,7 for blocks 0,1,2,3
2207
2208 ss4 = _mm256_cmpgt_epi32(ss0, ss1);
2209 // not different SVT_LOG("%d\n", _mm_extract_epi32(_mm256_extracti128_si256(ss4, 0), 0)); // DEBUG
2210 //ss4 = _mm256_or_si256(_mm256_cmpgt_epi32(ss0, ss1), _mm256_cmpeq_epi32(ss0, ss1));
2211 ss0 = _mm256_min_epi32(ss0, ss1);
2212 ss5 = _mm256_cmpgt_epi32(ss2, ss3);
2213 //ss5 = _mm256_or_si256(_mm256_cmpgt_epi32(ss2, ss3), _mm256_cmpeq_epi32(ss2, ss3));
2214 ss2 = _mm256_min_epi32(ss2, ss3);
2215 ss5 = _mm256_sub_epi32(ss5, _mm256_set1_epi32(2)); // ss5-2
2216
2217 // *** 4 search points per position ***
2218 ss6 = _mm256_cmpgt_epi32(ss0, ss2);
2219 //ss6 = _mm256_or_si256(_mm256_cmpgt_epi32(ss0, ss2), _mm256_cmpeq_epi32(ss0, ss2));
2220 ss0 = _mm256_min_epi32(ss0, ss2);
2221 ss5 = _mm256_and_si256(ss5, ss6); // (ss5-2) & ss6
2222 ss4 = _mm256_andnot_si256(ss6, ss4); // ss4 & !ss6
2223 ss4 = _mm256_or_si256(ss4, ss5); // (ss4 & !ss6) | ((ss5-2) & ss6)
2224
2225 // *** 2 search points per position ***
2226 s0 = _mm_setzero_si128();
2227 s1 = _mm_setzero_si128();
2228 s2 = _mm_setzero_si128();
2229 s3 = _mm_setzero_si128();
2230 s4 = _mm_setzero_si128();
2231 s5 = _mm_setzero_si128();
2232 s6 = _mm_setzero_si128();
2233 s7 = _mm_setzero_si128();
2234
2235 // ss0 = 8 SADs, two search points for each 32x32
2236 // ss4 = 8 MVs, two search points for each 32x32
2237 //
2238 // XY
2239 // X: 32x32 block [0..3]
2240 // Y: search position [0..1]
2241 // Format: 00 10 20 30 01 11 21 31
2242
2243 // Each 128 bits contains 4 32x32 32bit block results
2244 #ifdef __GNUC__
2245 // SAD
2246 s0 = _mm256_extracti128_si256(ss0, 1);
2247 s1 = _mm256_extracti128_si256(ss0, 0);
2248 // MV
2249 s2 = _mm256_extracti128_si256(ss4, 1);
2250 s3 = _mm256_extracti128_si256(ss4, 0);
2251 #else
2252 // SAD
2253 s0 = _mm256_extracti128_si256(ss0, 0);
2254 s1 = _mm256_extracti128_si256(ss0, 1);
2255 // MV
2256 s2 = _mm256_extracti128_si256(ss4, 0);
2257 s3 = _mm256_extracti128_si256(ss4, 1);
2258 #endif
2259
2260 //// Should be fine
2261 //SVT_LOG("sad0 %d, %d, %d, %d\n", _mm_extract_epi32(s0, 0), _mm_extract_epi32(s0, 1), _mm_extract_epi32(s0, 2), _mm_extract_epi32(s0, 3)); // DEBUG
2262 //SVT_LOG("sad1 %d, %d, %d, %d\n", _mm_extract_epi32(s1, 0), _mm_extract_epi32(s1, 1), _mm_extract_epi32(s1, 2), _mm_extract_epi32(s1, 3)); // DEBUG
2263 //SVT_LOG("mv0 %d, %d, %d, %d\n", _mm_extract_epi32(s2, 0), _mm_extract_epi32(s2, 1), _mm_extract_epi32(s2, 2), _mm_extract_epi32(s2, 3)); // DEBUG
2264 //SVT_LOG("mv1 %d, %d, %d, %d\n", _mm_extract_epi32(s3, 0), _mm_extract_epi32(s3, 1), _mm_extract_epi32(s3, 2), _mm_extract_epi32(s3, 3)); // DEBUG
2265
2266 // Choose the best MV out of the two, use s4 to hold results of min
2267 s4 = _mm_cmpgt_epi32(s0, s1);
2268
2269 // DIFFERENT BETWEEN VS AND GCC
2270 // SVT_LOG("%d, %d, %d, %d\n", _mm_extract_epi32(s4, 0), _mm_extract_epi32(s4, 1), _mm_extract_epi32(s4, 2), _mm_extract_epi32(s4, 3)); // DEBUG
2271
2272 //s4 = _mm_or_si128(_mm_cmpgt_epi32(s0, s1), _mm_cmpeq_epi32(s0, s1));
2273 s0 = _mm_min_epi32(s0, s1);
2274
2275 // Extract MV's based on the blocks to s2
2276 s3 = _mm_sub_epi32(s3, _mm_set1_epi32(4)); // s3-4
2277 // Remove the MV's are not used from s2
2278 s2 = _mm_andnot_si128(s4, s2);
2279 // Remove the MV's that are not used from s3 (inverse from s2 above operation)
2280 s3 = _mm_and_si128(s4, s3);
2281 // Combine the remaining candidates into s2
2282 s2 = _mm_or_si128(s2, s3);
2283 // Convert MV's into encoders format
2284 s2 = _mm_sub_epi32(_mm_setzero_si128(), s2);
2285 s2 = _mm_slli_epi32(s2, 2); // mv info
2286
2287 // ***SAD***
2288 // s0: current SAD candidates for each 32x32
2289 // s1: best SAD's for 32x32
2290
2291 // << 1 to compensate for every other line
2292 s0 = _mm_slli_epi32(s0, 1); // best sad info
2293 // Load best SAD's
2294 s1 = _mm_loadu_si128((__m128i*)p_best_sad32x32);
2295
2296 // Determine which candidates are better than the current best SAD's.
2297 // s4 is used to determine the MV's of the new best SAD's
2298 s4 = _mm_cmpgt_epi32(s1, s0);
2299 // not different SVT_LOG("%d, %d, %d, %d\n", _mm_extract_epi32(s4, 0), _mm_extract_epi32(s4, 1), _mm_extract_epi32(s4, 2), _mm_extract_epi32(s4, 3)); // DEBUG
2300 //s4 = _mm_or_si128(_mm_cmpgt_epi32(s1, s0), _mm_cmpeq_epi32(s1, s0));
2301 // Combine old and new min SAD's
2302 s0 = _mm_min_epu32(s0, s1);
2303 // Store new best SAD's back to memory
2304 _mm_storeu_si128((__m128i*)p_best_sad32x32, s0);
2305
2306 // ***Motion Vectors***
2307 // Load best MV's
2308 // s3: candidate MV's
2309 // s4: results of comparing SAD's
2310 // s5: previous best MV's
2311
2312 // Load previous best MV's
2313 s5 = _mm_loadu_si128((__m128i*)p_best_mv32x32);
2314 // Remove the MV's that are being replaced
2315 s5 = _mm_andnot_si128(s4, s5);
2316 // Set s3 to the base MV
2317 s3 = _mm_set1_epi32(mv);
2318 // Add candidate MV's to base MV
2319 s3 = _mm_add_epi16(s3, s2);
2320 // Remove non-candidate's
2321 s3 = _mm_and_si128(s3, s4);
2322 // Combine remaining candidates with remaining best MVs
2323 s3 = _mm_or_si128(s3, s5);
2324 // Store back to memory
2325 _mm_storeu_si128((__m128i*)p_best_mv32x32, s3);
2326 }
2327
2328 #endif
2329