1 /*
2 * Copyright(c) 2018 Intel Corporation
3 * SPDX-License-Identifier: BSD-2-Clause-Patent
4 */
5
6 #include "EbComputeSAD_AVX2.h"
7 #include "EbDefinitions.h"
8 #include "immintrin.h"
9
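/* Fallback for compilers that do not provide _mm256_setr_m128i: the expansion below
 * places 'lo' in the lower 128-bit lane and inserts 'hi' into the upper lane. */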
10 #ifndef _mm256_setr_m128i
11 #define _mm256_setr_m128i(/* __m128i */ hi, /* __m128i */ lo) \
12 _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
13 #endif
14
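/* UPDATE_BEST reads the exact 32-bit SAD in dword lane k of s and, when it beats the
 * running minimum, records the candidate at column j + offset + k of row i. */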
15 #define UPDATE_BEST(s, k, offset) \
16 temSum1 = _mm_extract_epi32(s, k); \
17 if (temSum1 < lowSum) { \
18 lowSum = temSum1; \
19 xBest = j + offset + k; \
20 yBest = i; \
21 }
22
23 /*******************************************************************************
24 * Requirement: width = 4, 8, 16, 24, 32, 48 or 64
25 * Requirement: height <= 64
26 * Requirement: height % 2 = 0 when width = 4 or 8
27 *******************************************************************************/
28 void SadLoopKernel_AVX2_INTRIN(
29 EB_U8 *src, // input parameter, source samples Ptr
30 EB_U32 srcStride, // input parameter, source stride
31 EB_U8 *ref, // input parameter, reference samples Ptr
32 EB_U32 refStride, // input parameter, reference stride
33 EB_U32 height, // input parameter, block height (M)
34 EB_U32 width, // input parameter, block width (N)
35 EB_U64 *bestSad, // output parameter, SAD of the best match found
36 EB_S16 *xSearchCenter, // in/out parameter, horizontal offset of the best match
37 EB_S16 *ySearchCenter, // in/out parameter, vertical offset of the best match
38 EB_U32 srcStrideRaw, // input parameter, source stride (no line skipping)
39 EB_S16 searchAreaWidth, // input parameter, search area width
40 EB_S16 searchAreaHeight) // input parameter, search area height
41 {
42 EB_S16 xBest = *xSearchCenter, yBest = *ySearchCenter;
43 EB_U32 lowSum = 0xffffff;
44 EB_U32 temSum1 = 0;
45 EB_S16 i, j;
46 EB_U32 k, l;
47 EB_U32 leftover = searchAreaWidth & 7;
48 const EB_U8 *pRef, *pSrc;
49 __m128i s0, s1, s2, s3, s4, s5, s6, s8 = _mm_set1_epi32(-1);
50 __m256i ss0, ss1, ss2, ss3, ss4, ss5, ss6, ss7, ss8;
51
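    /* Build a mask whose word lanes at index >= leftover are 0xFFFF: OR-ing it into a
     * packed-SAD vector makes _mm_minpos_epu16 ignore candidates beyond the search width. */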
52 if (leftover) {
53 for (k=0; k<leftover; k++) {
54 s8 = _mm_slli_si128(s8, 2);
55 }
56 }
57
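    /* Each (V)MPSADBW produces eight 16-bit SADs of one 4-byte source group against eight
     * consecutive reference offsets: imm[1:0] selects the source dword and imm[2] a 0- or
     * 4-byte reference offset (the 256-bit form repeats the selector in imm[5:3] for the
     * upper lane, hence the immediates 45/18/63 below). Accumulating all source dwords
     * gives the block SAD at 8 horizontal candidates at once; _mm_minpos_epu16 then picks
     * the smallest sum and its lane index. */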
58 switch (width) {
59 case 4:
60
61 if (!(height % 4)) {
62 EB_U32 srcStrideT = 3 * srcStride;
63 EB_U32 refStrideT = 3 * refStride;
64 for (i=0; i<searchAreaHeight; i++) {
65 for (j=0; j<=searchAreaWidth-8; j+=8) {
66 pSrc = src;
67 pRef = ref + j;
68 ss3 = ss5 = _mm256_setzero_si256();
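                    /* 4 rows per iteration: ss0/ss1 hold reference rows 0,2 and 1,3 (one row per
                     * 128-bit lane) and ss2 packs the four 4-byte source rows into dwords 0 and 2
                     * of each lane, so each mpsadbw matches one source row against 8 offsets. */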
69 for (k=0; k<height; k+=4) {
70 ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(pRef))), _mm_loadu_si128((__m128i*)(pRef + 2 * refStride)), 0x1);
71 ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(pRef + refStride))), _mm_loadu_si128((__m128i*)(pRef + refStrideT)), 0x1);
72 ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi64(_mm_cvtsi32_si128(*(EB_U32 *)pSrc), _mm_cvtsi32_si128(*(EB_U32 *)(pSrc + srcStride)))), _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(EB_U32 *)(pSrc + 2 * srcStride)), _mm_cvtsi32_si128(*(EB_U32 *)(pSrc + srcStrideT))), 0x1);
73 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
74 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
75 pSrc += srcStride << 2;
76 pRef += refStride << 2;
77 }
78 ss3 = _mm256_adds_epu16(ss3, ss5);
79 s3 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
80 s3 = _mm_minpos_epu16(s3);
81 temSum1 = _mm_extract_epi16(s3, 0);
82 if (temSum1 < lowSum) {
83 lowSum = temSum1;
84 xBest = (EB_S16)(j + _mm_extract_epi16(s3, 1));
85 yBest = i;
86 }
87 }
88
89 if (leftover) {
90 pSrc = src;
91 pRef = ref + j;
92 ss3 = ss5 = _mm256_setzero_si256();
93 for (k=0; k<height; k+=4) {
94 ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)pRef)), _mm_loadu_si128((__m128i*)(pRef + 2 * refStride)), 0x1);
95 ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(pRef + refStride))), _mm_loadu_si128((__m128i*)(pRef + refStrideT)), 0x1);
96 ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi64(_mm_cvtsi32_si128(*(EB_U32 *)pSrc), _mm_cvtsi32_si128(*(EB_U32 *)(pSrc + srcStride)))), _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(EB_U32 *)(pSrc + 2 * srcStride)), _mm_cvtsi32_si128(*(EB_U32 *)(pSrc + srcStrideT))), 0x1);
97 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
98 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
99 pSrc += srcStride << 2;
100 pRef += refStride << 2;
101 }
102 ss3 = _mm256_adds_epu16(ss3, ss5);
103 s3 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
104 s3 = _mm_or_si128(s3, s8);
105 s3 = _mm_minpos_epu16(s3);
106 temSum1 = _mm_extract_epi16(s3, 0);
107 if (temSum1 < lowSum) {
108 lowSum = temSum1;
109 xBest = (EB_S16)(j + _mm_extract_epi16(s3, 1));
110 yBest = i;
111 }
112 }
113 ref += srcStrideRaw;
114 }
115 }
116 else {
117 for (i=0; i<searchAreaHeight; i++) {
118 for (j=0; j<=searchAreaWidth-8; j+=8) {
119 pSrc = src;
120 pRef = ref + j;
121 s3 = _mm_setzero_si128();
122 for (k=0; k<height; k+=2) {
123 s0 = _mm_loadu_si128((__m128i*)pRef);
124 s1 = _mm_loadu_si128((__m128i*)(pRef+refStride));
125 s2 = _mm_cvtsi32_si128(*(EB_U32 *)pSrc);
126 s5 = _mm_cvtsi32_si128(*(EB_U32 *)(pSrc+srcStride));
127 s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
128 s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
129 pSrc += srcStride << 1;
130 pRef += refStride << 1;
131 }
132 s3 = _mm_minpos_epu16(s3);
133 temSum1 = _mm_extract_epi16(s3, 0);
134 if (temSum1 < lowSum) {
135 lowSum = temSum1;
136 xBest = (EB_S16)(j + _mm_extract_epi16(s3, 1));
137 yBest = i;
138 }
139 }
140
141 if (leftover) {
142 pSrc = src;
143 pRef = ref + j;
144 s3 = _mm_setzero_si128();
145 for (k=0; k<height; k+=2) {
146 s0 = _mm_loadu_si128((__m128i*)pRef);
147 s1 = _mm_loadu_si128((__m128i*)(pRef+refStride));
148 s2 = _mm_cvtsi32_si128(*(EB_U32 *)pSrc);
149 s5 = _mm_cvtsi32_si128(*(EB_U32 *)(pSrc+srcStride));
150 s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
151 s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
152 pSrc += srcStride << 1;
153 pRef += refStride << 1;
154 }
155 s3 = _mm_or_si128(s3, s8);
156 s3 = _mm_minpos_epu16(s3);
157 temSum1 = _mm_extract_epi16(s3, 0);
158 if (temSum1 < lowSum) {
159 lowSum = temSum1;
160 xBest = (EB_S16)(j + _mm_extract_epi16(s3, 1));
161 yBest = i;
162 }
163 }
164 ref += srcStrideRaw;
165 }
166 }
167
168 break;
169
170 case 8:
171 if (!(height % 4)) {
172 EB_U32 srcStrideT = 3 * srcStride;
173 EB_U32 refStrideT = 3 * refStride;
174 for (i=0; i<searchAreaHeight; i++) {
175 for (j=0; j<=searchAreaWidth-8; j+=8) {
176 pSrc = src;
177 pRef = ref + j;
178 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
179 for (k=0; k<height; k+=4) {
180 ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)pRef)), _mm_loadu_si128((__m128i*)(pRef + 2 * refStride)), 0x1);
181 ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(pRef + refStride))), _mm_loadu_si128((__m128i*)(pRef + refStrideT)), 0x1);
182 ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)pSrc), _mm_loadl_epi64((__m128i*)(pSrc + srcStride)))), _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)(pSrc + 2 * srcStride)), _mm_loadl_epi64((__m128i*)(pSrc + srcStrideT))), 0x1);
183 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
184 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
185 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
186 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
187 pSrc += srcStride << 2;
188 pRef += refStride << 2;
189 }
190 ss3 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
191 s3 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
192 s3 = _mm_minpos_epu16(s3);
193 temSum1 = _mm_extract_epi16(s3, 0);
194 if (temSum1 < lowSum) {
195 lowSum = temSum1;
196 xBest = (EB_S16)(j + _mm_extract_epi16(s3, 1));
197 yBest = i;
198 }
199 }
200
201 if (leftover) {
202 pSrc = src;
203 pRef = ref + j;
204 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
205 for (k=0; k<height; k+=4) {
206 ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)pRef)), _mm_loadu_si128((__m128i*)(pRef+2*refStride)), 0x1);
207 ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(pRef + refStride))), _mm_loadu_si128((__m128i*)(pRef + refStrideT)), 0x1);
208 ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)pSrc), _mm_loadl_epi64((__m128i*)(pSrc + srcStride)))), _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i*)(pSrc + 2 * srcStride)), _mm_loadl_epi64((__m128i*)(pSrc + srcStrideT))), 0x1);
209 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
210 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
211 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
212 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
213 pSrc += srcStride << 2;
214 pRef += refStride << 2;
215 }
216 ss3 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
217 s3 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
218 s3 = _mm_or_si128(s3, s8);
219 s3 = _mm_minpos_epu16(s3);
220 temSum1 = _mm_extract_epi16(s3, 0);
221 if (temSum1 < lowSum) {
222 lowSum = temSum1;
223 xBest = (EB_S16)(j + _mm_extract_epi16(s3, 1));
224 yBest = i;
225 }
226 }
227 ref += srcStrideRaw;
228 }
229 }
230 else {
231 for (i=0; i<searchAreaHeight; i++) {
232 for (j=0; j<=searchAreaWidth-8; j+=8) {
233 pSrc = src;
234 pRef = ref + j;
235 s3 = s4 = _mm_setzero_si128();
236 for (k=0; k<height; k+=2) {
237 s0 = _mm_loadu_si128((__m128i*)pRef);
238 s1 = _mm_loadu_si128((__m128i*)(pRef+refStride));
239 s2 = _mm_loadl_epi64((__m128i*)pSrc);
240 s5 = _mm_loadl_epi64((__m128i*)(pSrc+srcStride));
241 s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
242 s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
243 s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
244 s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));
245 pSrc += srcStride << 1;
246 pRef += refStride << 1;
247 }
248 s3 = _mm_adds_epu16(s3, s4);
249 s3 = _mm_minpos_epu16(s3);
250 temSum1 = _mm_extract_epi16(s3, 0);
251 if (temSum1 < lowSum) {
252 lowSum = temSum1;
253 xBest = (EB_S16)(j + _mm_extract_epi16(s3, 1));
254 yBest = i;
255 }
256 }
257
258 if (leftover) {
259 pSrc = src;
260 pRef = ref + j;
261 s3 = s4 = _mm_setzero_si128();
262 for (k=0; k<height; k+=2) {
263 s0 = _mm_loadu_si128((__m128i*)pRef);
264 s1 = _mm_loadu_si128((__m128i*)(pRef+refStride));
265 s2 = _mm_loadl_epi64((__m128i*)pSrc);
266 s5 = _mm_loadl_epi64((__m128i*)(pSrc+srcStride));
267 s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
268 s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
269 s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s1, s5, 0));
270 s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s1, s5, 5));
271 pSrc += srcStride << 1;
272 pRef += refStride << 1;
273 }
274 s3 = _mm_adds_epu16(s3, s4);
275 s3 = _mm_or_si128(s3, s8);
276 s3 = _mm_minpos_epu16(s3);
277 temSum1 = _mm_extract_epi16(s3, 0);
278 if (temSum1 < lowSum) {
279 lowSum = temSum1;
280 xBest = (EB_S16)(j + _mm_extract_epi16(s3, 1));
281 yBest = i;
282 }
283 }
284 ref += srcStrideRaw;
285 }
286 }
287 break;
288
289 case 16:
290 if (height <= 16) {
291 for (i=0; i<searchAreaHeight; i++) {
292 for (j=0; j<=searchAreaWidth-8; j+=8) {
293 pSrc = src;
294 pRef = ref + j;
295 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
296 for (k=0; k<height; k+=2) {
297 ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)pRef)), _mm_loadu_si128((__m128i*)(pRef + refStride)), 0x1);
298 ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(pRef + 8))), _mm_loadu_si128((__m128i*)(pRef + refStride + 8)), 0x1);
299 ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)pSrc)), _mm_loadu_si128((__m128i*)(pSrc + srcStride)), 0x1);
300 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
301 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
302 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
303 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
304 pSrc += 2 * srcStride;
305 pRef += 2 * refStride;
306 }
307 ss3 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
308 s3 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
309 s3 = _mm_minpos_epu16(s3);
310 temSum1 = _mm_extract_epi16(s3, 0);
311 if (temSum1 < lowSum) {
312 lowSum = temSum1;
313 xBest = (EB_S16)(j + _mm_extract_epi16(s3, 1));
314 yBest = i;
315 }
316 }
317
318 if (leftover) {
319 pSrc = src;
320 pRef = ref + j;
321 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
322 for (k=0; k<height; k+=2) {
323 ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)pRef)), _mm_loadu_si128((__m128i*)(pRef + refStride)), 0x1);
324 ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(pRef + 8))), _mm_loadu_si128((__m128i*)(pRef + refStride + 8)), 0x1);
325 ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)pSrc)), _mm_loadu_si128((__m128i*)(pSrc + srcStride)), 0x1);
326 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
327 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
328 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
329 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
330 pSrc += 2 * srcStride;
331 pRef += 2 * refStride;
332 }
333 ss3 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
334 s3 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
335 s3 = _mm_or_si128(s3, s8);
336 s3 = _mm_minpos_epu16(s3);
337 temSum1 = _mm_extract_epi16(s3, 0);
338 if (temSum1 < lowSum) {
339 lowSum = temSum1;
340 xBest = (EB_S16)(j + _mm_extract_epi16(s3, 1));
341 yBest = i;
342 }
343 }
344 ref += srcStrideRaw;
345 }
346 }
347 else if (height <= 32) {
348 for (i=0; i<searchAreaHeight; i++) {
349 for (j=0; j<=searchAreaWidth-8; j+=8) {
350 pSrc = src;
351 pRef = ref + j;
352 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
353 for (k=0; k<height; k+=2) {
354 ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)pRef)), _mm_loadu_si128((__m128i*)(pRef + refStride)), 0x1);
355 ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(pRef + 8))), _mm_loadu_si128((__m128i*)(pRef + refStride + 8)), 0x1);
356 ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)pSrc)), _mm_loadu_si128((__m128i*)(pSrc + srcStride)), 0x1);
357 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
358 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
359 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
360 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
361 pSrc += 2 * srcStride;
362 pRef += 2 * refStride;
363 }
364 ss3 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
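                    /* Combine the two 128-bit halves without overflowing 16 bits: subtract each
                     * half's own minimum before adding, take the minimum of the debiased sum, then
                     * add the two per-half minima back to recover the true SAD. */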
365 s3 = _mm256_extracti128_si256(ss3, 0);
366 s5 = _mm256_extracti128_si256(ss3, 1);
367 s4 = _mm_minpos_epu16(s3);
368 s6 = _mm_minpos_epu16(s5);
369 s4 = _mm_unpacklo_epi16(s4, s4);
370 s4 = _mm_unpacklo_epi32(s4, s4);
371 s4 = _mm_unpacklo_epi64(s4, s4);
372 s6 = _mm_unpacklo_epi16(s6, s6);
373 s6 = _mm_unpacklo_epi32(s6, s6);
374 s6 = _mm_unpacklo_epi64(s6, s6);
375 s3 = _mm_sub_epi16(s3, s4);
376 s5 = _mm_adds_epu16(s5, s3);
377 s5 = _mm_sub_epi16(s5, s6);
378 s5 = _mm_minpos_epu16(s5);
379 temSum1 = _mm_extract_epi16(s5, 0);
380 temSum1 += _mm_extract_epi16(s4, 0);
381 temSum1 += _mm_extract_epi16(s6, 0);
382 if (temSum1 < lowSum) {
383 lowSum = temSum1;
384 xBest = (EB_S16)(j + _mm_extract_epi16(s5, 1));
385 yBest = i;
386 }
387 }
388
389 if (leftover) {
390 pSrc = src;
391 pRef = ref + j;
392 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
393 for (k=0; k<height; k+=2) {
394 ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)pRef)), _mm_loadu_si128((__m128i*)(pRef + refStride)), 0x1);
395 ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(pRef + 8))), _mm_loadu_si128((__m128i*)(pRef + refStride + 8)), 0x1);
396 ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)pSrc)), _mm_loadu_si128((__m128i*)(pSrc + srcStride)), 0x1);
397 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
398 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
399 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
400 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
401 pSrc += 2 * srcStride;
402 pRef += 2 * refStride;
403 }
404 ss3 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
405 s3 = _mm256_extracti128_si256(ss3, 0);
406 s5 = _mm256_extracti128_si256(ss3, 1);
407 s3 = _mm_or_si128(s3, s8);
408 s5 = _mm_or_si128(s5, s8);
409 s4 = _mm_minpos_epu16(s3);
410 s6 = _mm_minpos_epu16(s5);
411 s4 = _mm_unpacklo_epi16(s4, s4);
412 s4 = _mm_unpacklo_epi32(s4, s4);
413 s4 = _mm_unpacklo_epi64(s4, s4);
414 s6 = _mm_unpacklo_epi16(s6, s6);
415 s6 = _mm_unpacklo_epi32(s6, s6);
416 s6 = _mm_unpacklo_epi64(s6, s6);
417 s3 = _mm_sub_epi16(s3, s4);
418 s5 = _mm_adds_epu16(s5, s3);
419 s5 = _mm_sub_epi16(s5, s6);
420 s5 = _mm_minpos_epu16(s5);
421 temSum1 = _mm_extract_epi16(s5, 0);
422 temSum1 += _mm_extract_epi16(s4, 0);
423 temSum1 += _mm_extract_epi16(s6, 0);
424 if (temSum1 < lowSum) {
425 lowSum = temSum1;
426 xBest = (EB_S16)(j + _mm_extract_epi16(s5, 1));
427 yBest = i;
428 }
429 }
430 ref += srcStrideRaw;
431 }
432 }
433 else {
434 for (i=0; i<searchAreaHeight; i++) {
435 for (j=0; j<=searchAreaWidth-8; j+=8) {
436 pSrc = src;
437 pRef = ref + j;
438 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
439 for (k=0; k<height; k+=2) {
440 ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)pRef)), _mm_loadu_si128((__m128i*)(pRef + refStride)), 0x1);
441 ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(pRef + 8))), _mm_loadu_si128((__m128i*)(pRef + refStride + 8)), 0x1);
442 ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)pSrc)), _mm_loadu_si128((__m128i*)(pSrc + srcStride)), 0x1);
443 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
444 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
445 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
446 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
447 pSrc += 2 * srcStride;
448 pRef += 2 * refStride;
449 }
450 ss3 = _mm256_adds_epu16(ss3, ss4);
451 ss5 = _mm256_adds_epu16(ss5, ss6);
452 ss0 = _mm256_adds_epu16(ss3, ss5);
453 s0 = _mm_adds_epu16(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
454 s0 = _mm_minpos_epu16(s0);
455 temSum1 = _mm_extract_epi16(s0, 0);
456 if (temSum1 < lowSum) {
457 if (temSum1 != 0xFFFF) { // no overflow
458 lowSum = temSum1;
459 xBest = (EB_S16)(j + _mm_extract_epi16(s0, 1));
460 yBest = i;
461 }
462 else {
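                            /* The saturated 16-bit minimum clipped at 0xFFFF, so the packed sums are
                             * unreliable; widen to 32-bit accumulators and test all eight candidates exactly. */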
463 ss4 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
464 ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
465 ss6 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
466 ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
467 ss4 = _mm256_add_epi32(ss4, ss6);
468 ss3 = _mm256_add_epi32(ss3, ss5);
469 s0 = _mm_add_epi32(_mm256_extracti128_si256(ss4, 0), _mm256_extracti128_si256(ss4, 1));
470 s3 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
471 UPDATE_BEST(s0, 0, 0);
472 UPDATE_BEST(s0, 1, 0);
473 UPDATE_BEST(s0, 2, 0);
474 UPDATE_BEST(s0, 3, 0);
475 UPDATE_BEST(s3, 0, 4);
476 UPDATE_BEST(s3, 1, 4);
477 UPDATE_BEST(s3, 2, 4);
478 UPDATE_BEST(s3, 3, 4);
479 }
480 }
481 }
482
483 if (leftover) {
484 pSrc = src;
485 pRef = ref + j;
486 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
487 for (k=0; k<height; k+=2) {
488 ss0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)pRef)), _mm_loadu_si128((__m128i*)(pRef + refStride)), 0x1);
489 ss1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)(pRef + 8))), _mm_loadu_si128((__m128i*)(pRef + refStride + 8)), 0x1);
490 ss2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)pSrc)), _mm_loadu_si128((__m128i*)(pSrc + srcStride)), 0x1);
491 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
492 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
493 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
494 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
495 pSrc += 2 * srcStride;
496 pRef += 2 * refStride;
497 }
498 ss3 = _mm256_adds_epu16(ss3, ss4);
499 ss5 = _mm256_adds_epu16(ss5, ss6);
500 ss0 = _mm256_adds_epu16(ss3, ss5);
501 s0 = _mm_adds_epu16(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
502 s0 = _mm_or_si128(s0, s8);
503 s0 = _mm_minpos_epu16(s0);
504 temSum1 = _mm_extract_epi16(s0, 0);
505 if (temSum1 < lowSum) {
506 if (temSum1 != 0xFFFF) { // no overflow
507 lowSum = temSum1;
508 xBest = (EB_S16)(j + _mm_extract_epi16(s0, 1));
509 yBest = i;
510 }
511 else {
512 ss4 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
513 ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
514 ss6 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
515 ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
516 ss4 = _mm256_add_epi32(ss4, ss6);
517 ss3 = _mm256_add_epi32(ss3, ss5);
518 s0 = _mm_add_epi32(_mm256_extracti128_si256(ss4, 0), _mm256_extracti128_si256(ss4, 1));
519 s3 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
520 k = leftover;
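                            /* Scan the leftover candidates with exact 32-bit sums: s0 holds
                             * positions 0-3 and s3 positions 4-7. */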
521 while (k > 0) {
522 for (l=0; l < 4 && k; l++, k--) {
523 temSum1 = _mm_extract_epi32(s0, 0);
524 s0 = _mm_srli_si128(s0, 4);
525 if (temSum1 < lowSum) {
526 lowSum = temSum1;
527 xBest = (EB_S16)(j + leftover - k);
528 yBest = i;
529 }
530 }
531 s0 = s3;
532 }
533 }
534 }
535 }
536 ref += srcStrideRaw;
537 }
538 }
539 break;
540
541 case 24:
542 if (height <= 16) {
543 for (i=0; i<searchAreaHeight; i++) {
544 for (j=0; j<=searchAreaWidth-8; j+=8) {
545 pSrc = src;
546 pRef = ref + j;
547 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
548 for (k=0; k<height; k++) {
549 ss0 = _mm256_loadu_si256((__m256i*)pRef);
550 ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
551 ss2 = _mm256_loadu_si256((__m256i *)pSrc);
552 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
553 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
554 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
555 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
556 pSrc += srcStride;
557 pRef += refStride;
558 }
559 ss3 = _mm256_adds_epu16(ss3, ss4);
560 ss5 = _mm256_adds_epu16(ss5, ss6);
561 s3 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
562 s5 = _mm256_extracti128_si256(ss5, 0);
563 s4 = _mm_minpos_epu16(s3);
564 s6 = _mm_minpos_epu16(s5);
565 s4 = _mm_unpacklo_epi16(s4, s4);
566 s4 = _mm_unpacklo_epi32(s4, s4);
567 s4 = _mm_unpacklo_epi64(s4, s4);
568 s6 = _mm_unpacklo_epi16(s6, s6);
569 s6 = _mm_unpacklo_epi32(s6, s6);
570 s6 = _mm_unpacklo_epi64(s6, s6);
571 s3 = _mm_sub_epi16(s3, s4);
572 s5 = _mm_adds_epu16(s5, s3);
573 s5 = _mm_sub_epi16(s5, s6);
574 s5 = _mm_minpos_epu16(s5);
575 temSum1 = _mm_extract_epi16(s5, 0);
576 temSum1 += _mm_extract_epi16(s4, 0);
577 temSum1 += _mm_extract_epi16(s6, 0);
578 if (temSum1 < lowSum) {
579 lowSum = temSum1;
580 xBest = (EB_S16)(j + _mm_extract_epi16(s5, 1));
581 yBest = i;
582 }
583 }
584
585 if (leftover) {
586 pSrc = src;
587 pRef = ref + j;
588 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
589 for (k=0; k<height; k++) {
590 ss0 = _mm256_loadu_si256((__m256i*)pRef);
591 ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
592 ss2 = _mm256_loadu_si256((__m256i *)pSrc);
593 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
594 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
595 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
596 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
597 pSrc += srcStride;
598 pRef += refStride;
599 }
600 ss3 = _mm256_adds_epu16(ss3, ss4);
601 ss5 = _mm256_adds_epu16(ss5, ss6);
602 s3 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
603 s5 = _mm256_extracti128_si256(ss5, 0);
604 s3 = _mm_or_si128(s3, s8);
605 s5 = _mm_or_si128(s5, s8);
606 s4 = _mm_minpos_epu16(s3);
607 s6 = _mm_minpos_epu16(s5);
608 s4 = _mm_unpacklo_epi16(s4, s4);
609 s4 = _mm_unpacklo_epi32(s4, s4);
610 s4 = _mm_unpacklo_epi64(s4, s4);
611 s6 = _mm_unpacklo_epi16(s6, s6);
612 s6 = _mm_unpacklo_epi32(s6, s6);
613 s6 = _mm_unpacklo_epi64(s6, s6);
614 s3 = _mm_sub_epi16(s3, s4);
615 s5 = _mm_adds_epu16(s5, s3);
616 s5 = _mm_sub_epi16(s5, s6);
617 s5 = _mm_minpos_epu16(s5);
618 temSum1 = _mm_extract_epi16(s5, 0);
619 temSum1 += _mm_extract_epi16(s4, 0);
620 temSum1 += _mm_extract_epi16(s6, 0);
621 if (temSum1 < lowSum) {
622 lowSum = temSum1;
623 xBest = (EB_S16)(j + _mm_extract_epi16(s5, 1));
624 yBest = i;
625 }
626 }
627 ref += srcStrideRaw;
628 }
629 }
630 else {
631 for (i=0; i<searchAreaHeight; i++) {
632 for (j=0; j<=searchAreaWidth-8; j+=8) {
633 pSrc = src;
634 pRef = ref + j;
635 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
636 for (k=0; k<height; k++) {
637 ss0 = _mm256_loadu_si256((__m256i*)pRef);
638 ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
639 ss2 = _mm256_loadu_si256((__m256i *)pSrc);
640 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
641 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
642 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
643 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
644 pSrc += srcStride;
645 pRef += refStride;
646 }
647 ss3 = _mm256_adds_epu16(ss3, ss4);
648 ss5 = _mm256_adds_epu16(ss5, ss6);
649 s3 = _mm256_extracti128_si256(ss3, 0);
650 s4 = _mm256_extracti128_si256(ss3, 1);
651 s5 = _mm256_extracti128_si256(ss5, 0);
652 s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), s5);
653 s0 = _mm_minpos_epu16(s0);
654 temSum1 = _mm_extract_epi16(s0, 0);
655 if (temSum1 < lowSum) {
656 if (temSum1 != 0xFFFF) { // no overflow
657 lowSum = temSum1;
658 xBest = (EB_S16)(j + _mm_extract_epi16(s0, 1));
659 yBest = i;
660 }
661 else {
662 s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
663 s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
664 s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
665 s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
666 s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
667 s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
668 s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), s2);
669 s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), s5);
670 UPDATE_BEST(s0, 0, 0);
671 UPDATE_BEST(s0, 1, 0);
672 UPDATE_BEST(s0, 2, 0);
673 UPDATE_BEST(s0, 3, 0);
674 UPDATE_BEST(s3, 0, 4);
675 UPDATE_BEST(s3, 1, 4);
676 UPDATE_BEST(s3, 2, 4);
677 UPDATE_BEST(s3, 3, 4);
678 }
679 }
680 }
681
682 if (leftover) {
683 pSrc = src;
684 pRef = ref + j;
685 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
686 for (k=0; k<height; k++) {
687 ss0 = _mm256_loadu_si256((__m256i*)pRef);
688 ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
689 ss2 = _mm256_loadu_si256((__m256i *)pSrc);
690 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
691 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
692 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
693 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
694 pSrc += srcStride;
695 pRef += refStride;
696 }
697 ss3 = _mm256_adds_epu16(ss3, ss4);
698 ss5 = _mm256_adds_epu16(ss5, ss6);
699 s3 = _mm256_extracti128_si256(ss3, 0);
700 s4 = _mm256_extracti128_si256(ss3, 1);
701 s5 = _mm256_extracti128_si256(ss5, 0);
702 s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), s5);
703 s0 = _mm_or_si128(s0, s8);
704 s0 = _mm_minpos_epu16(s0);
705 temSum1 = _mm_extract_epi16(s0, 0);
706 if (temSum1 < lowSum) {
707 if (temSum1 != 0xFFFF) { // no overflow
708 lowSum = temSum1;
709 xBest = (EB_S16)(j + _mm_extract_epi16(s0, 1));
710 yBest = i;
711 }
712 else {
713 s0 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
714 s3 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
715 s1 = _mm_unpacklo_epi16(s4, _mm_setzero_si128());
716 s4 = _mm_unpackhi_epi16(s4, _mm_setzero_si128());
717 s2 = _mm_unpacklo_epi16(s5, _mm_setzero_si128());
718 s5 = _mm_unpackhi_epi16(s5, _mm_setzero_si128());
719 s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), s2);
720 s3 = _mm_add_epi32(_mm_add_epi32(s3, s4), s5);
721 k = leftover;
722 while (k > 0) {
723 for (l=0; l < 4 && k; l++, k--) {
724 temSum1 = _mm_extract_epi32(s0, 0);
725 s0 = _mm_srli_si128(s0, 4);
726 if (temSum1 < lowSum) {
727 lowSum = temSum1;
728 xBest = (EB_S16)(j + leftover - k);
729 yBest = i;
730 }
731 }
732 s0 = s3;
733 }
734 }
735 }
736 }
737 ref += srcStrideRaw;
738 }
739 }
740 break;
741
742 case 32:
743 if (height <= 16) {
744 for (i=0; i<searchAreaHeight; i++) {
745 for (j=0; j<=searchAreaWidth-8; j+=8) {
746 pSrc = src;
747 pRef = ref + j;
748 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
749 for (k=0; k<height; k++) {
750 ss0 = _mm256_loadu_si256((__m256i*)pRef);
751 ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
752 ss2 = _mm256_loadu_si256((__m256i *)pSrc);
753 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
754 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
755 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
756 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
757 pSrc += srcStride;
758 pRef += refStride;
759 }
760 ss3 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
761 s3 = _mm256_extracti128_si256(ss3, 0);
762 s5 = _mm256_extracti128_si256(ss3, 1);
763 s4 = _mm_minpos_epu16(s3);
764 s6 = _mm_minpos_epu16(s5);
765 s4 = _mm_unpacklo_epi16(s4, s4);
766 s4 = _mm_unpacklo_epi32(s4, s4);
767 s4 = _mm_unpacklo_epi64(s4, s4);
768 s6 = _mm_unpacklo_epi16(s6, s6);
769 s6 = _mm_unpacklo_epi32(s6, s6);
770 s6 = _mm_unpacklo_epi64(s6, s6);
771 s3 = _mm_sub_epi16(s3, s4);
772 s5 = _mm_adds_epu16(s5, s3);
773 s5 = _mm_sub_epi16(s5, s6);
774 s5 = _mm_minpos_epu16(s5);
775 temSum1 = _mm_extract_epi16(s5, 0);
776 temSum1 += _mm_extract_epi16(s4, 0);
777 temSum1 += _mm_extract_epi16(s6, 0);
778 if (temSum1 < lowSum) {
779 lowSum = temSum1;
780 xBest = (EB_S16)(j + _mm_extract_epi16(s5, 1));
781 yBest = i;
782 }
783 }
784
785 if (leftover) {
786 pSrc = src;
787 pRef = ref + j;
788 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
789 for (k=0; k<height; k++) {
790 ss0 = _mm256_loadu_si256((__m256i*)pRef);
791 ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
792 ss2 = _mm256_loadu_si256((__m256i *)pSrc);
793 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
794 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
795 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
796 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
797 pSrc += srcStride;
798 pRef += refStride;
799 }
800 ss3 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
801 s3 = _mm256_extracti128_si256(ss3, 0);
802 s5 = _mm256_extracti128_si256(ss3, 1);
803 s3 = _mm_or_si128(s3, s8);
804 s5 = _mm_or_si128(s5, s8);
805 s4 = _mm_minpos_epu16(s3);
806 s6 = _mm_minpos_epu16(s5);
807 s4 = _mm_unpacklo_epi16(s4, s4);
808 s4 = _mm_unpacklo_epi32(s4, s4);
809 s4 = _mm_unpacklo_epi64(s4, s4);
810 s6 = _mm_unpacklo_epi16(s6, s6);
811 s6 = _mm_unpacklo_epi32(s6, s6);
812 s6 = _mm_unpacklo_epi64(s6, s6);
813 s3 = _mm_sub_epi16(s3, s4);
814 s5 = _mm_adds_epu16(s5, s3);
815 s5 = _mm_sub_epi16(s5, s6);
816 s5 = _mm_minpos_epu16(s5);
817 temSum1 = _mm_extract_epi16(s5, 0);
818 temSum1 += _mm_extract_epi16(s4, 0);
819 temSum1 += _mm_extract_epi16(s6, 0);
820 if (temSum1 < lowSum) {
821 lowSum = temSum1;
822 xBest = (EB_S16)(j + _mm_extract_epi16(s5, 1));
823 yBest = i;
824 }
825 }
826 ref += srcStrideRaw;
827 }
828 }
829 else if (height <= 32) {
830 for (i=0; i<searchAreaHeight; i++) {
831 for (j=0; j<=searchAreaWidth-8; j+=8) {
832 pSrc = src;
833 pRef = ref + j;
834 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
835 for (k=0; k<height; k++) {
836 ss0 = _mm256_loadu_si256((__m256i*)pRef);
837 ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
838 ss2 = _mm256_loadu_si256((__m256i *)pSrc);
839 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
840 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
841 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
842 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
843 pSrc += srcStride;
844 pRef += refStride;
845 }
846 ss3 = _mm256_adds_epu16(ss3, ss4);
847 ss5 = _mm256_adds_epu16(ss5, ss6);
848 ss6 = _mm256_adds_epu16(ss3, ss5);
849 s3 = _mm256_extracti128_si256(ss6, 0);
850 s4 = _mm256_extracti128_si256(ss6, 1);
851 s0 = _mm_adds_epu16(s3, s4);
852 s0 = _mm_minpos_epu16(s0);
853 temSum1 = _mm_extract_epi16(s0, 0);
854 if (temSum1 < lowSum) {
855 if (temSum1 != 0xFFFF) { // no overflow
856 lowSum = temSum1;
857 xBest = (EB_S16)(j + _mm_extract_epi16(s0, 1));
858 yBest = i;
859 }
860 else {
861 ss4 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
862 ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
863 ss6 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
864 ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
865 ss4 = _mm256_add_epi32(ss4, ss6);
866 ss3 = _mm256_add_epi32(ss3, ss5);
867 s0 = _mm_add_epi32(_mm256_extracti128_si256(ss4, 0), _mm256_extracti128_si256(ss4, 1));
868 s3 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
869 UPDATE_BEST(s0, 0, 0);
870 UPDATE_BEST(s0, 1, 0);
871 UPDATE_BEST(s0, 2, 0);
872 UPDATE_BEST(s0, 3, 0);
873 UPDATE_BEST(s3, 0, 4);
874 UPDATE_BEST(s3, 1, 4);
875 UPDATE_BEST(s3, 2, 4);
876 UPDATE_BEST(s3, 3, 4);
877 }
878 }
879 }
880
881 if (leftover) {
882 pSrc = src;
883 pRef = ref + j;
884 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
885 for (k=0; k<height; k++) {
886 ss0 = _mm256_loadu_si256((__m256i*)pRef);
887 ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
888 ss2 = _mm256_loadu_si256((__m256i *)pSrc);
889 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
890 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
891 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
892 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
893 pSrc += srcStride;
894 pRef += refStride;
895 }
896 ss3 = _mm256_adds_epu16(ss3, ss4);
897 ss5 = _mm256_adds_epu16(ss5, ss6);
898 ss6 = _mm256_adds_epu16(ss3, ss5);
899 s3 = _mm256_extracti128_si256(ss6, 0);
900 s4 = _mm256_extracti128_si256(ss6, 1);
901 s0 = _mm_adds_epu16(s3, s4);
902 //s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
903 s0 = _mm_or_si128(s0, s8);
904 s0 = _mm_minpos_epu16(s0);
905 temSum1 = _mm_extract_epi16(s0, 0);
906 if (temSum1 < lowSum) {
907 if (temSum1 != 0xFFFF) { // no overflow
908 lowSum = temSum1;
909 xBest = (EB_S16)(j + _mm_extract_epi16(s0, 1));
910 yBest = i;
911 }
912 else {
913 ss4 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
914 ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
915 ss6 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
916 ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
917 ss4 = _mm256_add_epi32(ss4, ss6);
918 ss3 = _mm256_add_epi32(ss3, ss5);
919 s0 = _mm_add_epi32(_mm256_extracti128_si256(ss4, 0), _mm256_extracti128_si256(ss4, 1));
920 s3 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
921 k = leftover;
922 while (k > 0) {
923 for (l=0; l < 4 && k; l++, k--) {
924 temSum1 = _mm_extract_epi32(s0, 0);
925 s0 = _mm_srli_si128(s0, 4);
926 if (temSum1 < lowSum) {
927 lowSum = temSum1;
928 xBest = (EB_S16)(j + leftover - k);
929 yBest = i;
930 }
931 }
932 s0 = s3;
933 }
934 }
935 }
936 }
937 ref += srcStrideRaw;
938 }
939 }
940 else {
941 for (i=0; i<searchAreaHeight; i++) {
942 for (j=0; j<=searchAreaWidth-8; j+=8) {
943 pSrc = src;
944 pRef = ref + j;
945 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
946 for (k=0; k<height; k++) {
947 ss0 = _mm256_loadu_si256((__m256i*)pRef);
948 ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
949 ss2 = _mm256_loadu_si256((__m256i *)pSrc);
950 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
951 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
952 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
953 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
954 pSrc += srcStride;
955 pRef += refStride;
956 }
957 ss7 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
958 s3 = _mm256_extracti128_si256(ss7, 0);
959 s4 = _mm256_extracti128_si256(ss7, 1);
960 s0 = _mm_adds_epu16(s3, s4);
961 s0 = _mm_minpos_epu16(s0);
962 temSum1 = _mm_extract_epi16(s0, 0);
963 if (temSum1 < lowSum) {
964 if (temSum1 != 0xFFFF) { // no overflow
965 lowSum = temSum1;
966 xBest = (EB_S16)(j + _mm_extract_epi16(s0, 1));
967 yBest = i;
968 }
969 else {
970 ss0 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
971 ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
972 ss1 = _mm256_unpacklo_epi16(ss4, _mm256_setzero_si256());
973 ss4 = _mm256_unpackhi_epi16(ss4, _mm256_setzero_si256());
974 ss2 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
975 ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
976 ss7 = _mm256_unpacklo_epi16(ss6, _mm256_setzero_si256());
977 ss6 = _mm256_unpackhi_epi16(ss6, _mm256_setzero_si256());
978 ss0 = _mm256_add_epi32(_mm256_add_epi32(ss0, ss1), _mm256_add_epi32(ss2, ss7));
979 ss3 = _mm256_add_epi32(_mm256_add_epi32(ss3, ss4), _mm256_add_epi32(ss5, ss6));
980 s0 = _mm_add_epi32(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
981 s3 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
982 UPDATE_BEST(s0, 0, 0);
983 UPDATE_BEST(s0, 1, 0);
984 UPDATE_BEST(s0, 2, 0);
985 UPDATE_BEST(s0, 3, 0);
986 UPDATE_BEST(s3, 0, 4);
987 UPDATE_BEST(s3, 1, 4);
988 UPDATE_BEST(s3, 2, 4);
989 UPDATE_BEST(s3, 3, 4);
990 }
991 }
992 }
993
994 if (leftover) {
995 pSrc = src;
996 pRef = ref + j;
997 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
998 for (k=0; k<height; k++) {
999 ss0 = _mm256_loadu_si256((__m256i*)pRef);
1000 ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
1001 ss2 = _mm256_loadu_si256((__m256i *)pSrc);
1002 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1003 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1004 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1005 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1006 pSrc += srcStride;
1007 pRef += refStride;
1008 }
1009 ss7 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
1010 s3 = _mm256_extracti128_si256(ss7, 0);
1011 s4 = _mm256_extracti128_si256(ss7, 1);
1012 s0 = _mm_adds_epu16(s3, s4);
1013 s0 = _mm_minpos_epu16(s0);
1014 temSum1 = _mm_extract_epi16(s0, 0);
1015 if (temSum1 < lowSum) {
1016 if (temSum1 != 0xFFFF) { // no overflow
1017 lowSum = temSum1;
1018 xBest = (EB_S16)(j + _mm_extract_epi16(s0, 1));
1019 yBest = i;
1020 }
1021 else {
1022 ss0 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
1023 ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
1024 ss1 = _mm256_unpacklo_epi16(ss4, _mm256_setzero_si256());
1025 ss4 = _mm256_unpackhi_epi16(ss4, _mm256_setzero_si256());
1026 ss2 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
1027 ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
1028 ss7 = _mm256_unpacklo_epi16(ss6, _mm256_setzero_si256());
1029 ss6 = _mm256_unpackhi_epi16(ss6, _mm256_setzero_si256());
1030 ss0 = _mm256_add_epi32(_mm256_add_epi32(ss0, ss1), _mm256_add_epi32(ss2, ss7));
1031 ss3 = _mm256_add_epi32(_mm256_add_epi32(ss3, ss4), _mm256_add_epi32(ss5, ss6));
1032 s0 = _mm_add_epi32(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1033 s3 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1034 k = leftover;
1035 while (k > 0) {
1036 for (l=0; l < 4 && k; l++, k--) {
1037 temSum1 = _mm_extract_epi32(s0, 0);
1038 s0 = _mm_srli_si128(s0, 4);
1039 if (temSum1 < lowSum) {
1040 lowSum = temSum1;
1041 xBest = (EB_S16)(j + leftover - k);
1042 yBest = i;
1043 }
1044 }
1045 s0 = s3;
1046 }
1047 }
1048 }
1049 }
1050 ref += srcStrideRaw;
1051 }
1052 }
1053 break;
1054
1055 case 48:
1056 if (height <= 32) {
1057 for (i=0; i<searchAreaHeight; i++) {
1058 for (j=0; j<=searchAreaWidth-8; j+=8) {
1059 pSrc = src;
1060 pRef = ref + j;
1061 s3 = s4 = s5 = s6 = _mm_setzero_si128();
1062 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
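                    /* Width 48: the first 32 columns go through 256-bit mpsadbw (ss3-ss6) and the
                     * remaining 16 columns through 128-bit mpsadbw (s3-s6); the two sets of partial
                     * sums are merged after the row loop. */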
1063 for (k=0; k<height; k++) {
1064 ss0 = _mm256_loadu_si256((__m256i*)pRef);
1065 ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
1066 ss2 = _mm256_loadu_si256((__m256i *)pSrc);
1067 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1068 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1069 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1070 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1071 s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
1072 s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
1073 s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
1074 s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1075 s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1076 s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1077 s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1078 pSrc += srcStride;
1079 pRef += refStride;
1080 }
1081 s3 = _mm_adds_epu16(s3, s4);
1082 s5 = _mm_adds_epu16(s5, s6);
1083 s0 = _mm_adds_epu16(s3, s5);
1084 ss3 = _mm256_adds_epu16(ss3, ss4);
1085 ss5 = _mm256_adds_epu16(ss5, ss6);
1086 ss6 = _mm256_adds_epu16(ss3, ss5);
1087 s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm256_extracti128_si256(ss6, 0), _mm256_extracti128_si256(ss6, 1)));
1088 s0 = _mm_minpos_epu16(s0);
1089 temSum1 = _mm_extract_epi16(s0, 0);
1090 if (temSum1 < lowSum) {
1091 if (temSum1 != 0xFFFF) { // no overflow
1092 lowSum = temSum1;
1093 xBest = (EB_S16)(j + _mm_extract_epi16(s0, 1));
1094 yBest = i;
1095 }
1096 else {
1097 ss4 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
1098 ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
1099 ss6 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
1100 ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
1101 ss4 = _mm256_add_epi32(ss4, ss6);
1102 ss3 = _mm256_add_epi32(ss3, ss5);
1103 s0 = _mm_add_epi32(_mm256_extracti128_si256(ss4, 0), _mm256_extracti128_si256(ss4, 1));
1104 s1 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1105 s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s3, _mm_setzero_si128()));
1106 s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s5, _mm_setzero_si128()));
1107 s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s3, _mm_setzero_si128()));
1108 s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s5, _mm_setzero_si128()));
1109 UPDATE_BEST(s0, 0, 0);
1110 UPDATE_BEST(s0, 1, 0);
1111 UPDATE_BEST(s0, 2, 0);
1112 UPDATE_BEST(s0, 3, 0);
1113 UPDATE_BEST(s1, 0, 4);
1114 UPDATE_BEST(s1, 1, 4);
1115 UPDATE_BEST(s1, 2, 4);
1116 UPDATE_BEST(s1, 3, 4);
1117 }
1118 }
1119 }
1120
1121 if (leftover) {
1122 pSrc = src;
1123 pRef = ref + j;
1124 s3 = s4 = s5 = s6 = _mm_setzero_si128();
1125 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
1126 for (k=0; k<height; k++) {
1127 ss0 = _mm256_loadu_si256((__m256i*)pRef);
1128 ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
1129 ss2 = _mm256_loadu_si256((__m256i *)pSrc);
1130 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1131 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1132 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1133 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1134 s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
1135 s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
1136 s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
1137 s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1138 s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1139 s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1140 s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1141 pSrc += srcStride;
1142 pRef += refStride;
1143 }
1144 s3 = _mm_adds_epu16(s3, s4);
1145 s5 = _mm_adds_epu16(s5, s6);
1146 s0 = _mm_adds_epu16(s3, s5);
1147 ss3 = _mm256_adds_epu16(ss3, ss4);
1148 ss5 = _mm256_adds_epu16(ss5, ss6);
1149 ss6 = _mm256_adds_epu16(ss3, ss5);
1150 s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm256_extracti128_si256(ss6, 0), _mm256_extracti128_si256(ss6, 1)));
1151 s0 = _mm_or_si128(s0, s8);
1152 s0 = _mm_minpos_epu16(s0);
1153 temSum1 = _mm_extract_epi16(s0, 0);
1154 if (temSum1 < lowSum) {
1155 if (temSum1 != 0xFFFF) { // no overflow
1156 lowSum = temSum1;
1157 xBest = (EB_S16)(j + _mm_extract_epi16(s0, 1));
1158 yBest = i;
1159 }
1160 else {
1161 ss4 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
1162 ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
1163 ss6 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
1164 ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
1165 ss4 = _mm256_add_epi32(ss4, ss6);
1166 ss3 = _mm256_add_epi32(ss3, ss5);
1167 s0 = _mm_add_epi32(_mm256_extracti128_si256(ss4, 0), _mm256_extracti128_si256(ss4, 1));
1168 s1 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1169 s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s3, _mm_setzero_si128()));
1170 s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s5, _mm_setzero_si128()));
1171 s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s3, _mm_setzero_si128()));
1172 s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s5, _mm_setzero_si128()));
1173 k = leftover;
1174 while (k > 0) {
1175 for (l=0; l < 4 && k; l++, k--) {
1176 temSum1 = _mm_extract_epi32(s0, 0);
1177 s0 = _mm_srli_si128(s0, 4);
1178 if (temSum1 < lowSum) {
1179 lowSum = temSum1;
1180 xBest = (EB_S16)(j + leftover - k);
1181 yBest = i;
1182 }
1183 }
1184 s0 = s1;
1185 }
1186 }
1187 }
1188 }
1189 ref += srcStrideRaw;
1190 }
1191 }
1192 else {
1193 for (i=0; i<searchAreaHeight; i++) {
1194 for (j=0; j<=searchAreaWidth-8; j+=8) {
1195 pSrc = src;
1196 pRef = ref + j;
1197 s3 = s4 = s5 = s6 = _mm_setzero_si128();
1198 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
1199 for (k=0; k<height; k++) {
1200 ss0 = _mm256_loadu_si256((__m256i*)pRef);
1201 ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
1202 ss2 = _mm256_loadu_si256((__m256i *)pSrc);
1203 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1204 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1205 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1206 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1207 s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
1208 s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
1209 s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
1210 s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1211 s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1212 s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1213 s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1214 pSrc += srcStride;
1215 pRef += refStride;
1216 }
1217 s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
1218 ss7 = _mm256_adds_epu16(ss3, ss4);
1219 ss8 = _mm256_adds_epu16(ss5, ss6);
1220 ss7 = _mm256_adds_epu16(ss7, ss8);
1221 s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm256_extracti128_si256(ss7, 0), _mm256_extracti128_si256(ss7, 1)));
1222 s0 = _mm_minpos_epu16(s0);
1223 temSum1 = _mm_extract_epi16(s0, 0);
1224 if (temSum1 < lowSum) {
1225 if (temSum1 != 0xFFFF) { // no overflow
1226 lowSum = temSum1;
1227 xBest = (EB_S16)(j + _mm_extract_epi16(s0, 1));
1228 yBest = i;
1229 }
1230 else {
1231 ss0 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
1232 ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
1233 ss1 = _mm256_unpacklo_epi16(ss4, _mm256_setzero_si256());
1234 ss4 = _mm256_unpackhi_epi16(ss4, _mm256_setzero_si256());
1235 ss2 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
1236 ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
1237 ss7 = _mm256_unpacklo_epi16(ss6, _mm256_setzero_si256());
1238 ss6 = _mm256_unpackhi_epi16(ss6, _mm256_setzero_si256());
1239 ss0 = _mm256_add_epi32(_mm256_add_epi32(ss0, ss1), _mm256_add_epi32(ss2, ss7));
1240 ss3 = _mm256_add_epi32(_mm256_add_epi32(ss3, ss4), _mm256_add_epi32(ss5, ss6));
1241 s0 = _mm_add_epi32(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1242 s1 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1243 s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s3, _mm_setzero_si128()));
1244 s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s4, _mm_setzero_si128()));
1245 s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s5, _mm_setzero_si128()));
1246 s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s6, _mm_setzero_si128()));
1247 s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s3, _mm_setzero_si128()));
1248 s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s4, _mm_setzero_si128()));
1249 s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s5, _mm_setzero_si128()));
1250 s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s6, _mm_setzero_si128()));
1251 UPDATE_BEST(s0, 0, 0);
1252 UPDATE_BEST(s0, 1, 0);
1253 UPDATE_BEST(s0, 2, 0);
1254 UPDATE_BEST(s0, 3, 0);
1255 UPDATE_BEST(s1, 0, 4);
1256 UPDATE_BEST(s1, 1, 4);
1257 UPDATE_BEST(s1, 2, 4);
1258 UPDATE_BEST(s1, 3, 4);
1259 }
1260 }
1261 }
1262
1263 if (leftover) {
1264 pSrc = src;
1265 pRef = ref + j;
1266 s3 = s4 = s5 = s6 = _mm_setzero_si128();
1267 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
1268 for (k=0; k<height; k++) {
1269 ss0 = _mm256_loadu_si256((__m256i*)pRef);
1270 ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
1271 ss2 = _mm256_loadu_si256((__m256i *)pSrc);
1272 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1273 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1274 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1275 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1276 s0 = _mm_loadu_si128((__m128i*)(pRef + 32));
1277 s1 = _mm_loadu_si128((__m128i*)(pRef + 40));
1278 s2 = _mm_loadu_si128((__m128i*)(pSrc + 32));
1279 s3 = _mm_adds_epu16(s3, _mm_mpsadbw_epu8(s0, s2, 0));
1280 s4 = _mm_adds_epu16(s4, _mm_mpsadbw_epu8(s0, s2, 5));
1281 s5 = _mm_adds_epu16(s5, _mm_mpsadbw_epu8(s1, s2, 2));
1282 s6 = _mm_adds_epu16(s6, _mm_mpsadbw_epu8(s1, s2, 7));
1283 pSrc += srcStride;
1284 pRef += refStride;
1285 }
1286 s0 = _mm_adds_epu16(_mm_adds_epu16(s3, s4), _mm_adds_epu16(s5, s6));
1287 ss7 = _mm256_adds_epu16(ss3, ss4);
1288 ss8 = _mm256_adds_epu16(ss5, ss6);
1289 ss7 = _mm256_adds_epu16(ss7, ss8);
1290 s0 = _mm_adds_epu16(s0, _mm_adds_epu16(_mm256_extracti128_si256(ss7, 0), _mm256_extracti128_si256(ss7, 1)));
1291 s0 = _mm_or_si128(s0, s8);
1292 s0 = _mm_minpos_epu16(s0);
1293 temSum1 = _mm_extract_epi16(s0, 0);
1294 if (temSum1 < lowSum) {
1295 if (temSum1 != 0xFFFF) { // no overflow
1296 lowSum = temSum1;
1297 xBest = (EB_S16)(j + _mm_extract_epi16(s0, 1));
1298 yBest = i;
1299 }
1300 else {
1301 ss0 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
1302 ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
1303 ss1 = _mm256_unpacklo_epi16(ss4, _mm256_setzero_si256());
1304 ss4 = _mm256_unpackhi_epi16(ss4, _mm256_setzero_si256());
1305 ss2 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
1306 ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
1307 ss7 = _mm256_unpacklo_epi16(ss6, _mm256_setzero_si256());
1308 ss6 = _mm256_unpackhi_epi16(ss6, _mm256_setzero_si256());
1309 ss0 = _mm256_add_epi32(_mm256_add_epi32(ss0, ss1), _mm256_add_epi32(ss2, ss7));
1310 ss3 = _mm256_add_epi32(_mm256_add_epi32(ss3, ss4), _mm256_add_epi32(ss5, ss6));
1311 s0 = _mm_add_epi32(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1312 s1 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1313 s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s3, _mm_setzero_si128()));
1314 s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s4, _mm_setzero_si128()));
1315 s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s5, _mm_setzero_si128()));
1316 s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(s6, _mm_setzero_si128()));
1317 s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s3, _mm_setzero_si128()));
1318 s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s4, _mm_setzero_si128()));
1319 s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s5, _mm_setzero_si128()));
1320 s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(s6, _mm_setzero_si128()));
1321 k = leftover;
1322 while (k > 0) {
1323 for (l=0; l < 4 && k; l++, k--) {
1324 temSum1 = _mm_extract_epi32(s0, 0);
1325 s0 = _mm_srli_si128(s0, 4);
1326 if (temSum1 < lowSum) {
1327 lowSum = temSum1;
1328 xBest = (EB_S16)(j + leftover - k);
1329 yBest = i;
1330 }
1331 }
1332 s0 = s1;
1333 }
1334 }
1335 }
1336 }
1337 ref += srcStrideRaw;
1338 }
1339 }
1340 break;
1341
1342 case 64:
1343 if (height <= 32) {
1344 for (i=0; i<searchAreaHeight; i++) {
1345 for (j=0; j<=searchAreaWidth-8; j+=8) {
1346 pSrc = src;
1347 pRef = ref + j;
1348 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
1349 for (k=0; k<height; k++) {
1350 ss0 = _mm256_loadu_si256((__m256i*)pRef);
1351 ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
1352 ss2 = _mm256_loadu_si256((__m256i *)pSrc);
1353 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1354 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1355 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1356 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1357 ss0 = _mm256_loadu_si256((__m256i*)(pRef + 32));
1358 ss1 = _mm256_loadu_si256((__m256i*)(pRef + 40));
1359 ss2 = _mm256_loadu_si256((__m256i *)(pSrc + 32));
1360 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1361 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1362 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1363 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1364 pSrc += srcStride;
1365 pRef += refStride;
1366 }
1367 ss7 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
1368 s3 = _mm256_extracti128_si256(ss7, 0);
1369 s4 = _mm256_extracti128_si256(ss7, 1);
1370 s0 = _mm_adds_epu16(s3, s4);
1371 s0 = _mm_minpos_epu16(s0);
1372 temSum1 = _mm_extract_epi16(s0, 0);
1373 if (temSum1 < lowSum) {
1374 if (temSum1 != 0xFFFF) { // no overflow
1375 lowSum = temSum1;
1376 xBest = (EB_S16)(j + _mm_extract_epi16(s0, 1));
1377 yBest = i;
1378 }
1379 else {
1380 ss0 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
1381 ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
1382 ss1 = _mm256_unpacklo_epi16(ss4, _mm256_setzero_si256());
1383 ss4 = _mm256_unpackhi_epi16(ss4, _mm256_setzero_si256());
1384 ss2 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
1385 ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
1386 ss7 = _mm256_unpacklo_epi16(ss6, _mm256_setzero_si256());
1387 ss6 = _mm256_unpackhi_epi16(ss6, _mm256_setzero_si256());
1388 ss0 = _mm256_add_epi32(_mm256_add_epi32(ss0, ss1), _mm256_add_epi32(ss2, ss7));
1389 ss3 = _mm256_add_epi32(_mm256_add_epi32(ss3, ss4), _mm256_add_epi32(ss5, ss6));
1390 s0 = _mm_add_epi32(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1391 s3 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1392 UPDATE_BEST(s0, 0, 0);
1393 UPDATE_BEST(s0, 1, 0);
1394 UPDATE_BEST(s0, 2, 0);
1395 UPDATE_BEST(s0, 3, 0);
1396 UPDATE_BEST(s3, 0, 4);
1397 UPDATE_BEST(s3, 1, 4);
1398 UPDATE_BEST(s3, 2, 4);
1399 UPDATE_BEST(s3, 3, 4);
1400 }
1401 }
1402 }
1403
1404 if (leftover) {
1405 pSrc = src;
1406 pRef = ref + j;
1407 ss3 = ss4 = ss5 = ss6 = _mm256_setzero_si256();
1408 for (k=0; k<height; k++) {
1409 ss0 = _mm256_loadu_si256((__m256i*)pRef);
1410 ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
1411 ss2 = _mm256_loadu_si256((__m256i *)pSrc);
1412 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1413 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1414 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1415 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1416 ss0 = _mm256_loadu_si256((__m256i*)(pRef + 32));
1417 ss1 = _mm256_loadu_si256((__m256i*)(pRef + 40));
1418 ss2 = _mm256_loadu_si256((__m256i *)(pSrc + 32));
1419 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1420 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1421 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1422 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1423 pSrc += srcStride;
1424 pRef += refStride;
1425 }
1426 ss7 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
1427 s3 = _mm256_extracti128_si256(ss7, 0);
1428 s4 = _mm256_extracti128_si256(ss7, 1);
1429 s0 = _mm_adds_epu16(s3, s4);
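// s8 carries 0xFFFF in the lanes beyond the leftover search positions, so OR-ing it in
// keeps _mm_minpos_epu16 from selecting a column outside the search area.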
1430 s0 = _mm_or_si128(s0, s8);
1431 s0 = _mm_minpos_epu16(s0);
1432 temSum1 = _mm_extract_epi16(s0, 0);
1433 if (temSum1 < lowSum) {
1434 if (temSum1 != 0xFFFF) { // no overflow
1435 lowSum = temSum1;
1436 xBest = (EB_S16)(j + _mm_extract_epi16(s0, 1));
1437 yBest = i;
1438 }
1439 else {
1440 ss0 = _mm256_unpacklo_epi16(ss3, _mm256_setzero_si256());
1441 ss3 = _mm256_unpackhi_epi16(ss3, _mm256_setzero_si256());
1442 ss1 = _mm256_unpacklo_epi16(ss4, _mm256_setzero_si256());
1443 ss4 = _mm256_unpackhi_epi16(ss4, _mm256_setzero_si256());
1444 ss2 = _mm256_unpacklo_epi16(ss5, _mm256_setzero_si256());
1445 ss5 = _mm256_unpackhi_epi16(ss5, _mm256_setzero_si256());
1446 ss7 = _mm256_unpacklo_epi16(ss6, _mm256_setzero_si256());
1447 ss6 = _mm256_unpackhi_epi16(ss6, _mm256_setzero_si256());
1448 ss0 = _mm256_add_epi32(_mm256_add_epi32(ss0, ss1), _mm256_add_epi32(ss2, ss7));
1449 ss3 = _mm256_add_epi32(_mm256_add_epi32(ss3, ss4), _mm256_add_epi32(ss5, ss6));
1450 s0 = _mm_add_epi32(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1451 s3 = _mm_add_epi32(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1452 k = leftover;
1453 while (k > 0) {
1454 for (l=0; l < 4 && k; l++, k--) {
1455 temSum1 = _mm_extract_epi32(s0, 0);
1456 s0 = _mm_srli_si128(s0, 4);
1457 if (temSum1 < lowSum) {
1458 lowSum = temSum1;
1459 xBest = (EB_S16)(j + leftover - k);
1460 yBest = i;
1461 }
1462 }
1463 s0 = s3;
1464 }
1465 }
1466 }
1467 }
1468 ref += srcStrideRaw;
1469 }
1470 }
1471 else {
1472 __m256i ss9, ss10;
1473 for (i=0; i<searchAreaHeight; i++) {
1474 for (j=0; j<=searchAreaWidth-8; j+=8) {
1475 pSrc = src;
1476 pRef = ref + j;
1477 ss3 = ss4 = ss5 = ss6 = ss7 = ss8 = ss9 = ss10 = _mm256_setzero_si256();
1478 for (k=0; k<height; k++) {
1479 ss0 = _mm256_loadu_si256((__m256i*)pRef);
1480 ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
1481 ss2 = _mm256_loadu_si256((__m256i *)pSrc);
1482 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1483 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1484 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1485 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1486 ss0 = _mm256_loadu_si256((__m256i*)(pRef + 32));
1487 ss1 = _mm256_loadu_si256((__m256i*)(pRef + 40));
1488 ss2 = _mm256_loadu_si256((__m256i *)(pSrc + 32));
1489 ss7 = _mm256_adds_epu16(ss7, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1490 ss8 = _mm256_adds_epu16(ss8, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1491 ss9 = _mm256_adds_epu16(ss9, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1492 ss10 = _mm256_adds_epu16(ss10, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1493 pSrc += srcStride;
1494 pRef += refStride;
1495 }
1496 ss0 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
1497 ss0 = _mm256_adds_epu16(ss0, _mm256_adds_epu16(_mm256_adds_epu16(ss7, ss8), _mm256_adds_epu16(ss9, ss10)));
1498 s0 = _mm_adds_epu16(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1499 s0 = _mm_minpos_epu16(s0);
1500 temSum1 = _mm_extract_epi16(s0, 0);
1501 if (temSum1 < lowSum) {
1502 if (temSum1 != 0xFFFF) { // no overflow
1503 lowSum = temSum1;
1504 xBest = (EB_S16)(j + _mm_extract_epi16(s0, 1));
1505 yBest = i;
1506 }
1507 else {
1508 ss0 = _mm256_add_epi32(_mm256_add_epi32(_mm256_unpacklo_epi16(ss3, _mm256_setzero_si256()), _mm256_unpacklo_epi16(ss4, _mm256_setzero_si256())), _mm256_add_epi32(_mm256_unpacklo_epi16(ss5, _mm256_setzero_si256()), _mm256_unpacklo_epi16(ss6, _mm256_setzero_si256())));
1509 ss1 = _mm256_add_epi32(_mm256_add_epi32(_mm256_unpackhi_epi16(ss3, _mm256_setzero_si256()), _mm256_unpackhi_epi16(ss4, _mm256_setzero_si256())), _mm256_add_epi32(_mm256_unpackhi_epi16(ss5, _mm256_setzero_si256()), _mm256_unpackhi_epi16(ss6, _mm256_setzero_si256())));
1510 ss2 = _mm256_add_epi32(_mm256_add_epi32(_mm256_unpacklo_epi16(ss7, _mm256_setzero_si256()), _mm256_unpacklo_epi16(ss8, _mm256_setzero_si256())), _mm256_add_epi32(_mm256_unpacklo_epi16(ss9, _mm256_setzero_si256()), _mm256_unpacklo_epi16(ss10, _mm256_setzero_si256())));
1511 ss3 = _mm256_add_epi32(_mm256_add_epi32(_mm256_unpackhi_epi16(ss7, _mm256_setzero_si256()), _mm256_unpackhi_epi16(ss8, _mm256_setzero_si256())), _mm256_add_epi32(_mm256_unpackhi_epi16(ss9, _mm256_setzero_si256()), _mm256_unpackhi_epi16(ss10, _mm256_setzero_si256())));
1512 ss0 = _mm256_add_epi32(ss0, ss2);
1513 ss1 = _mm256_add_epi32(ss1, ss3);
1514 s0 = _mm_add_epi32(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1515 s3 = _mm_add_epi32(_mm256_extracti128_si256(ss1, 0), _mm256_extracti128_si256(ss1, 1));
1516 UPDATE_BEST(s0, 0, 0);
1517 UPDATE_BEST(s0, 1, 0);
1518 UPDATE_BEST(s0, 2, 0);
1519 UPDATE_BEST(s0, 3, 0);
1520 UPDATE_BEST(s3, 0, 4);
1521 UPDATE_BEST(s3, 1, 4);
1522 UPDATE_BEST(s3, 2, 4);
1523 UPDATE_BEST(s3, 3, 4);
1524 }
1525 }
1526 }
1527
1528 if (leftover) {
1529 pSrc = src;
1530 pRef = ref + j;
1531 ss3 = ss4 = ss5 = ss6 = ss7 = ss8 = ss9 = ss10 = _mm256_setzero_si256();
1532 for (k=0; k<height; k++) {
1533 ss0 = _mm256_loadu_si256((__m256i*)pRef);
1534 ss1 = _mm256_loadu_si256((__m256i*)(pRef+8));
1535 ss2 = _mm256_loadu_si256((__m256i *)pSrc);
1536 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1537 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1538 ss5 = _mm256_adds_epu16(ss5, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1539 ss6 = _mm256_adds_epu16(ss6, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1540 ss0 = _mm256_loadu_si256((__m256i*)(pRef + 32));
1541 ss1 = _mm256_loadu_si256((__m256i*)(pRef + 40));
1542 ss2 = _mm256_loadu_si256((__m256i *)(pSrc + 32));
1543 ss7 = _mm256_adds_epu16(ss7, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1544 ss8 = _mm256_adds_epu16(ss8, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1545 ss9 = _mm256_adds_epu16(ss9, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1546 ss10 = _mm256_adds_epu16(ss10, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1547 pSrc += srcStride;
1548 pRef += refStride;
1549 }
1550 ss0 = _mm256_adds_epu16(_mm256_adds_epu16(ss3, ss4), _mm256_adds_epu16(ss5, ss6));
1551 ss0 = _mm256_adds_epu16(ss0, _mm256_adds_epu16(_mm256_adds_epu16(ss7, ss8), _mm256_adds_epu16(ss9, ss10)));
1552 s0 = _mm_adds_epu16(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1553 s0 = _mm_or_si128(s0, s8);
1554 s0 = _mm_minpos_epu16(s0);
1555 temSum1 = _mm_extract_epi16(s0, 0);
1556 if (temSum1 < lowSum) {
1557 if (temSum1 != 0xFFFF) { // no overflow
1558 lowSum = temSum1;
1559 xBest = (EB_S16)(j + _mm_extract_epi16(s0, 1));
1560 yBest = i;
1561 }
1562 else {
1563 ss0 = _mm256_add_epi32(_mm256_add_epi32(_mm256_unpacklo_epi16(ss3, _mm256_setzero_si256()), _mm256_unpacklo_epi16(ss4, _mm256_setzero_si256())), _mm256_add_epi32(_mm256_unpacklo_epi16(ss5, _mm256_setzero_si256()), _mm256_unpacklo_epi16(ss6, _mm256_setzero_si256())));
1564 ss1 = _mm256_add_epi32(_mm256_add_epi32(_mm256_unpackhi_epi16(ss3, _mm256_setzero_si256()), _mm256_unpackhi_epi16(ss4, _mm256_setzero_si256())), _mm256_add_epi32(_mm256_unpackhi_epi16(ss5, _mm256_setzero_si256()), _mm256_unpackhi_epi16(ss6, _mm256_setzero_si256())));
1565 ss2 = _mm256_add_epi32(_mm256_add_epi32(_mm256_unpacklo_epi16(ss7, _mm256_setzero_si256()), _mm256_unpacklo_epi16(ss8, _mm256_setzero_si256())), _mm256_add_epi32(_mm256_unpacklo_epi16(ss9, _mm256_setzero_si256()), _mm256_unpacklo_epi16(ss10, _mm256_setzero_si256())));
1566 ss3 = _mm256_add_epi32(_mm256_add_epi32(_mm256_unpackhi_epi16(ss7, _mm256_setzero_si256()), _mm256_unpackhi_epi16(ss8, _mm256_setzero_si256())), _mm256_add_epi32(_mm256_unpackhi_epi16(ss9, _mm256_setzero_si256()), _mm256_unpackhi_epi16(ss10, _mm256_setzero_si256())));
1567 ss0 = _mm256_add_epi32(ss0, ss2);
1568 ss1 = _mm256_add_epi32(ss1, ss3);
1569 s0 = _mm_add_epi32(_mm256_extracti128_si256(ss0, 0), _mm256_extracti128_si256(ss0, 1));
1570 s3 = _mm_add_epi32(_mm256_extracti128_si256(ss1, 0), _mm256_extracti128_si256(ss1, 1));
1571 k = leftover;
1572 while (k > 0) {
1573 for (l=0; l < 4 && k; l++, k--) {
1574 temSum1 = _mm_extract_epi32(s0, 0);
1575 s0 = _mm_srli_si128(s0, 4);
1576 if (temSum1 < lowSum) {
1577 lowSum = temSum1;
1578 xBest = (EB_S16)(j + leftover - k);
1579 yBest = i;
1580 }
1581 }
1582 s0 = s3;
1583 }
1584 }
1585 }
1586 }
1587 ref += srcStrideRaw;
1588 }
1589 }
1590 break;
1591
1592 default:
1593 break;
1594 }
1595
1596
1597 *bestSad = lowSum;
1598 *xSearchCenter = xBest;
1599 *ySearchCenter = yBest;
1600 }
1601
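/*
 * A minimal, unoptimized scalar sketch of what the AVX2 loop kernel above computes, kept
 * here only as a readability aid. The name SadLoopKernel_ScalarSketch is illustrative and
 * not part of the library API; it assumes the same parameter convention (strided source
 * and reference, a search area scanned left to right and top to bottom, results written
 * back through the output pointers).
 */
static inline void SadLoopKernel_ScalarSketch(
    EB_U8  *src, EB_U32 srcStride,
    EB_U8  *ref, EB_U32 refStride,
    EB_U32  height, EB_U32 width,
    EB_U64 *bestSad, EB_S16 *xSearchCenter, EB_S16 *ySearchCenter,
    EB_U32  srcStrideRaw, EB_S16 searchAreaWidth, EB_S16 searchAreaHeight)
{
    EB_S16 xBest = *xSearchCenter, yBest = *ySearchCenter;
    EB_U32 lowSum = 0xffffff;
    EB_S16 i, j;
    EB_U32 k, m;

    for (i = 0; i < searchAreaHeight; i++) {
        for (j = 0; j < searchAreaWidth; j++) {
            EB_U32 sad = 0;
            for (k = 0; k < height; k++) {
                for (m = 0; m < width; m++) {
                    int diff = (int)src[k * srcStride + m] - (int)ref[j + k * refStride + m];
                    sad += (EB_U32)(diff < 0 ? -diff : diff);
                }
            }
            if (sad < lowSum) {   // keep the first (left-most, top-most) minimum
                lowSum = sad;
                xBest  = j;
                yBest  = i;
            }
        }
        ref += srcStrideRaw;      // step the search window one row down
    }

    *bestSad       = lowSum;
    *xSearchCenter = xBest;
    *ySearchCenter = yBest;
}
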
1602 /*******************************************************************************
1603 * Requirement: height % 2 = 0
1604 *******************************************************************************/
1605 EB_U32 Compute24xMSad_AVX2_INTRIN(
1606 EB_U8 *src, // input parameter, source samples Ptr
1607 EB_U32 srcStride, // input parameter, source stride
1608 EB_U8 *ref, // input parameter, reference samples Ptr
1609 EB_U32 refStride, // input parameter, reference stride
1610 EB_U32 height, // input parameter, block height (M)
1611 EB_U32 width) // input parameter, block width (N)
1612 {
1613 __m128i xmm0, xmm1;
1614 __m256i ymm0, ymm1;
1615 EB_U32 y;
1616 (void)width;
1617
1618 ymm0 = ymm1 = _mm256_setzero_si256();
1619 for (y = 0; y < height; y += 2) {
1620 ymm0 = _mm256_add_epi32(ymm0, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)src), _mm256_loadu_si256((__m256i *)ref)));
1621 ymm1 = _mm256_add_epi32(ymm1, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)(src + srcStride)), _mm256_loadu_si256((__m256i *)(ref + refStride))));
1622 src += srcStride << 1;
1623 ref += refStride << 1;
1624 }
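// Each 256-bit SAD yields four 64-bit partial sums (bytes 0-7, 8-15, 16-23 and 24-31).
// The reduction below adds only the first three into element 0, so the bytes loaded
// beyond the 24-pixel block width never reach the returned value.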
1625 xmm0 = _mm_add_epi32(_mm256_extracti128_si256(ymm0, 0), _mm256_extracti128_si256(ymm1, 0));
1626 xmm1 = _mm_add_epi32(_mm256_extracti128_si256(ymm0, 1), _mm256_extracti128_si256(ymm1, 1));
1627 xmm0 = _mm_add_epi32(_mm_add_epi32(xmm0, _mm_srli_si128(xmm0, 8)), xmm1);
1628 return (EB_U32)_mm_cvtsi128_si32(xmm0);
1629 }
1630
1631 /*******************************************************************************
1632 * Requirement: height % 2 = 0
1633 *******************************************************************************/
1634 EB_U32 Compute32xMSad_AVX2_INTRIN(
1635 EB_U8 *src, // input parameter, source samples Ptr
1636 EB_U32 srcStride, // input parameter, source stride
1637 EB_U8 *ref, // input parameter, reference samples Ptr
1638 EB_U32 refStride, // input parameter, reference stride
1639 EB_U32 height, // input parameter, block height (M)
1640 EB_U32 width) // input parameter, block width (N)
1641 {
1642 __m128i xmm0;
1643 __m256i ymm0, ymm1;
1644 EB_U32 y;
1645 (void)width;
1646
1647 ymm0 = ymm1 = _mm256_setzero_si256();
1648 for (y = 0; y < height; y += 2) {
1649 ymm0 = _mm256_add_epi32(ymm0, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)src), _mm256_loadu_si256((__m256i *)ref)));
1650 ymm1 = _mm256_add_epi32(ymm1, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)(src + srcStride)), _mm256_loadu_si256((__m256i *)(ref + refStride))));
1651 src += srcStride << 1;
1652 ref += refStride << 1;
1653 }
1654 ymm0 = _mm256_add_epi32(ymm0, ymm1);
1655 xmm0 = _mm_add_epi32(_mm256_castsi256_si128(ymm0), _mm256_extracti128_si256(ymm0, 1));
1656 xmm0 = _mm_add_epi32(xmm0, _mm_srli_si128(xmm0, 8));
1657 return (EB_U32) /*xmm0.m128i_i64[0];*/ _mm_cvtsi128_si32(xmm0);
1658 }
1659
1660 /*******************************************************************************
1661 * Requirement: height % 2 = 0
1662 *******************************************************************************/
1663 EB_U32 Compute40xMSad_AVX2_INTRIN(
1664 EB_U8 *src, // input parameter, source samples Ptr
1665 EB_U32 srcStride, // input parameter, source stride
1666 EB_U8 *ref, // input parameter, reference samples Ptr
1667 EB_U32 refStride, // input parameter, reference stride
1668 EB_U32 height, // input parameter, block height (M)
1669 EB_U32 width) // input parameter, block width (N)
1670 {
1671 __m128i xmm0, xmm1;
1672 __m256i ymm0, ymm1;
1673 EB_U32 y;
1674 (void)width;
1675
1676 ymm0 = ymm1 = _mm256_setzero_si256();
1677 xmm0 = xmm1 = _mm_setzero_si128();
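// Each 40-byte row is split into a 32-byte piece handled by a 256-bit SAD and an
// 8-byte tail handled by a 64-bit load and a 128-bit SAD, accumulated separately.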
1678 for (y = 0; y < height; y += 2) {
1679 ymm0 = _mm256_add_epi32(ymm0, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)src), _mm256_loadu_si256((__m256i*)ref)));
1680 xmm0 = _mm_add_epi32(xmm0, _mm_sad_epu8(_mm_loadl_epi64((__m128i*)(src + 32)), _mm_loadl_epi64((__m128i*)(ref + 32))));
1681 ymm1 = _mm256_add_epi32(ymm1, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)(src + srcStride)), _mm256_loadu_si256((__m256i*)(ref + refStride))));
1682 xmm1 = _mm_add_epi32(xmm1, _mm_sad_epu8(_mm_loadl_epi64((__m128i*)(src + srcStride + 32)), _mm_loadl_epi64((__m128i*)(ref + refStride + 32))));
1683
1684 src += srcStride << 1;
1685 ref += refStride << 1;
1686 }
1687 ymm0 = _mm256_add_epi32(ymm0, ymm1);
1688 xmm0 = _mm_add_epi32(xmm0, xmm1);
1689 xmm0 = _mm_add_epi32(xmm0, _mm_add_epi32(_mm256_extracti128_si256(ymm0, 0), _mm256_extracti128_si256(ymm0, 1)));
1690 xmm0 = _mm_add_epi32(xmm0, _mm_srli_si128(xmm0, 8));
1691 return _mm_extract_epi32(xmm0, 0);
1692 }
1693
1694 /*******************************************************************************
1695 * Requirement: height % 2 = 0
1696 *******************************************************************************/
1697 EB_U32 Compute48xMSad_AVX2_INTRIN(
1698 EB_U8 *src, // input parameter, source samples Ptr
1699 EB_U32 srcStride, // input parameter, source stride
1700 EB_U8 *ref, // input parameter, reference samples Ptr
1701 EB_U32 refStride, // input parameter, reference stride
1702 EB_U32 height, // input parameter, block height (M)
1703 EB_U32 width) // input parameter, block width (N)
1704 {
1705 __m128i xmm0, xmm1;
1706 __m256i ymm0, ymm1;
1707 EB_U32 y;
1708 (void)width;
1709
1710 ymm0 = ymm1 = _mm256_setzero_si256();
1711 xmm0 = xmm1 = _mm_setzero_si128();
1712 for (y = 0; y < height; y += 2) {
1713 ymm0 = _mm256_add_epi32(ymm0, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)src), _mm256_loadu_si256((__m256i*)ref)));
1714 xmm0 = _mm_add_epi32(xmm0, _mm_sad_epu8(_mm_loadu_si128((__m128i*)(src + 32)), _mm_loadu_si128((__m128i*)(ref + 32))));
1715 ymm1 = _mm256_add_epi32(ymm1, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)(src + srcStride)), _mm256_loadu_si256((__m256i*)(ref + refStride))));
1716 xmm1 = _mm_add_epi32(xmm1, _mm_sad_epu8(_mm_loadu_si128((__m128i*)(src + srcStride + 32)), _mm_loadu_si128((__m128i*)(ref + refStride + 32))));
1717 src += srcStride << 1;
1718 ref += refStride << 1;
1719 }
1720 ymm0 = _mm256_add_epi32(ymm0, ymm1);
1721 xmm0 = _mm_add_epi32(xmm0, xmm1);
1722 xmm0 = _mm_add_epi32(xmm0, _mm_add_epi32(_mm256_extracti128_si256(ymm0, 0), _mm256_extracti128_si256(ymm0, 1)));
1723 xmm0 = _mm_add_epi32(xmm0, _mm_srli_si128(xmm0, 8));
1724 return _mm_extract_epi32(xmm0, 0);
1725 }
1726
1727 /*******************************************************************************
1728 * Requirement: height % 2 = 0
1729 *******************************************************************************/
1730 EB_U32 Compute56xMSad_AVX2_INTRIN(
1731 EB_U8 *src, // input parameter, source samples Ptr
1732 EB_U32 srcStride, // input parameter, source stride
1733 EB_U8 *ref, // input parameter, reference samples Ptr
1734 EB_U32 refStride, // input parameter, reference stride
1735 EB_U32 height, // input parameter, block height (M)
1736 EB_U32 width) // input parameter, block width (N)
1737 {
1738 __m128i xmm0, xmm1;
1739 __m256i ymm0, ymm1, ymm2, ymm3;
1740 EB_U32 y;
1741 (void)width;
1742
1743 ymm0 = ymm1 = ymm2 = ymm3 = _mm256_setzero_si256();
1744 for (y = 0; y < height; y += 2) {
1745 ymm0 = _mm256_add_epi32(ymm0, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)src), _mm256_loadu_si256((__m256i*)ref)));
1746 ymm1 = _mm256_add_epi32(ymm1, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)(src + 32)), _mm256_loadu_si256((__m256i*)(ref + 32))));
1747 ymm2 = _mm256_add_epi32(ymm2, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)(src + srcStride)), _mm256_loadu_si256((__m256i*)(ref + refStride))));
1748 ymm3 = _mm256_add_epi32(ymm3, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)(src + srcStride + 32)), _mm256_loadu_si256((__m256i*)(ref + refStride + 32))));
1749
1750 src += srcStride << 1;
1751 ref += refStride << 1;
1752 }
1753 ymm0 = _mm256_add_epi32(ymm0, ymm2);
1754 xmm0 = _mm_add_epi32(_mm256_extracti128_si256(ymm0, 0), _mm256_extracti128_si256(ymm0, 1));
1755
1756 xmm0 = _mm_add_epi32(xmm0, _mm_add_epi32(_mm256_extracti128_si256(ymm1, 0), _mm256_extracti128_si256(ymm3, 0)));
1757 xmm1 = _mm_add_epi32(_mm256_extracti128_si256(ymm1, 1), _mm256_extracti128_si256(ymm3, 1));
1758
1759 xmm0 = _mm_add_epi32(_mm_add_epi32(xmm0, _mm_srli_si128(xmm0, 8)), xmm1);
1760 return _mm_extract_epi32(xmm0, 0);
1761 }
1762
1763 /*******************************************************************************
1764 * Requirement: height % 2 = 0
1765 *******************************************************************************/
1766 EB_U32 Compute64xMSad_AVX2_INTRIN(
1767 EB_U8 *src, // input parameter, source samples Ptr
1768 EB_U32 srcStride, // input parameter, source stride
1769 EB_U8 *ref, // input parameter, reference samples Ptr
1770 EB_U32 refStride, // input parameter, reference stride
1771 EB_U32 height, // input parameter, block height (M)
1772 EB_U32 width) // input parameter, block width (N)
1773 {
1774 __m128i xmm0;
1775 __m256i ymm0, ymm1, ymm2, ymm3;
1776 EB_U32 y;
1777 (void)width;
1778
1779 ymm0 = ymm1 = ymm2 = ymm3 = _mm256_setzero_si256();
1780 for (y = 0; y < height; y += 2) {
1781 ymm0 = _mm256_add_epi32(ymm0, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)src), _mm256_loadu_si256((__m256i*)ref)));
1782 ymm1 = _mm256_add_epi32(ymm1, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)(src + 32)), _mm256_loadu_si256((__m256i*)(ref + 32))));
1783 ymm2 = _mm256_add_epi32(ymm2, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)(src + srcStride)), _mm256_loadu_si256((__m256i*)(ref + refStride))));
1784 ymm3 = _mm256_add_epi32(ymm3, _mm256_sad_epu8(_mm256_loadu_si256((__m256i*)(src + srcStride + 32)), _mm256_loadu_si256((__m256i*)(ref + refStride + 32))));
1785 src += srcStride << 1;
1786 ref += refStride << 1;
1787 }
1788 ymm0 = _mm256_add_epi32(_mm256_add_epi32(ymm0, ymm1), _mm256_add_epi32(ymm2, ymm3));
1789 xmm0 = _mm_add_epi32(_mm256_castsi256_si128(ymm0), _mm256_extracti128_si256(ymm0, 1));
1790 xmm0 = _mm_add_epi32(xmm0, _mm_srli_si128(xmm0, 8));
1791 return _mm_extract_epi32(xmm0, 0);
1792 }
1793
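/*
 * The ComputeNxMSad kernels above all evaluate the same plain block SAD; only the
 * load/accumulate pattern changes with the block width. A scalar sketch of that
 * computation (the name ComputeSad_ScalarSketch is illustrative, not part of the library):
 */
static inline EB_U32 ComputeSad_ScalarSketch(
    EB_U8 *src, EB_U32 srcStride,
    EB_U8 *ref, EB_U32 refStride,
    EB_U32 height, EB_U32 width)
{
    EB_U32 x, y, sad = 0;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            int diff = (int)src[x] - (int)ref[x];
            sad += (EB_U32)(diff < 0 ? -diff : diff);
        }
        src += srcStride;
        ref += refStride;
    }
    return sad;
}
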
1794 #ifdef NON_AVX512_SUPPORT
1795
1796 void GetEightHorizontalSearchPointResults_8x8_16x16_PU_AVX2_INTRIN(
1797 EB_U8 *src,
1798 EB_U32 srcStride,
1799 EB_U8 *ref,
1800 EB_U32 refStride,
1801 EB_U32 *pBestSad8x8,
1802 EB_U32 *pBestMV8x8,
1803 EB_U32 *pBestSad16x16,
1804 EB_U32 *pBestMV16x16,
1805 EB_U32 mv,
1806 EB_U16 *pSad16x16)
1807 {
1808
1809 EB_S16 xMv, yMv;
1810 __m128i s3;
1811 __m128i sad_0, sad_1, sad_2, sad_3;
1812 __m256i ss0, ss1, ss2, ss3, ss4;
1813 EB_U32 temSum;
1814
1815
1816 /*
1817 ------------------------------------- -----------------------------------
1818 | 8x8_00 | 8x8_01 | 8x8_04 | 8x8_05 | 8x8_16 | 8x8_17 | 8x8_20 | 8x8_21 |
1819 ------------------------------------- -----------------------------------
1820 | 8x8_02 | 8x8_03 | 8x8_06 | 8x8_07 | 8x8_18 | 8x8_19 | 8x8_22 | 8x8_23 |
1821 ----------------------- ----------- ---------------------- ----------
1822 | 8x8_08 | 8x8_09 | 8x8_12 | 8x8_13 | 8x8_24 | 8x8_25 | 8x8_28 | 8x8_29 |
1823 ---------------------- ----------- --------------------- ----------
1824 | 8x8_10 | 8x8_11 | 8x8_14 | 8x8_15 | 8x8_26 | 8x8_27 | 8x8_30 | 8x8_31 |
1825 ------------------------------------- -----------------------------------
1826
1827 ------------------------------------- -----------------------------------
1828 | 8x8_32 | 8x8_33 | 8x8_36 | 8x8_37 | 8x8_48 | 8x8_49 | 8x8_52 | 8x8_53 |
1829 ------------------------------------- -----------------------------------
1830 | 8x8_34 | 8x8_35 | 8x8_38 | 8x8_39 | 8x8_50 | 8x8_51 | 8x8_54 | 8x8_55 |
1831 ----------------------- ----------- ---------------------- ----------
1832 | 8x8_40 | 8x8_41 | 8x8_44 | 8x8_45 | 8x8_56 | 8x8_57 | 8x8_60 | 8x8_61 |
1833 ---------------------- ----------- --------------------- ----------
1834 | 8x8_42 | 8x8_43 | 8x8_46 | 8x8_47 | 8x8_58 | 8x8_59 | 8x8_62 | 8x8_63 |
1835 ------------------------------------- -----------------------------------
1836 */
1837
1838 /*
1839 ---------------------- ----------------------
1840 | 16x16_0 | 16x16_1 | 16x16_4 | 16x16_5 |
1841 ---------------------- ----------------------
1842 | 16x16_2 | 16x16_3 | 16x16_6 | 16x16_7 |
1843 ----------------------- -----------------------
1844 | 16x16_8 | 16x16_9 | 16x16_12 | 16x16_13 |
1845 ---------------------- ----------------------
1846 | 16x16_10 | 16x16_11 | 16x16_14 | 16x16_15 |
1847 ----------------------- -----------------------
1848 */
1849
1850 //8x8_0 & 8x8_1
1851 ss0 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)ref), _mm_loadu_si128((__m128i*)(ref + 2 * refStride)));
1852 ss1 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)(ref + 8)), _mm_loadu_si128((__m128i*)(ref + 2 * refStride + 8)));
1853 ss2 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)src), _mm_loadu_si128((__m128i*)(src + 2 * srcStride)));
1854 ss3 = _mm256_mpsadbw_epu8(ss0, ss2, 0);
1855 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1856 ss4 = _mm256_mpsadbw_epu8(ss1, ss2, 18); // 010 010
1857 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1858 src += srcStride * 4;
1859 ref += refStride * 4;
1860 ss0 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)ref), _mm_loadu_si128((__m128i*)(ref + 2 * refStride)));
1861 ss1 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)(ref + 8)), _mm_loadu_si128((__m128i*)(ref + 2 * refStride + 8)));
1862 ss2 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)src), _mm_loadu_si128((__m128i*)(src + 2 * srcStride)));
1863 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1864 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1865 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1866 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1867 sad_0 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1868 sad_1 = _mm_adds_epu16(_mm256_extracti128_si256(ss4, 0), _mm256_extracti128_si256(ss4, 1));
1869
1870 //8x8_2 & 8x8_3
1871 src += srcStride * 4;
1872 ref += refStride * 4;
1873 ss0 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)ref), _mm_loadu_si128((__m128i*)(ref + 2 * refStride)));
1874 ss1 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)(ref + 8)), _mm_loadu_si128((__m128i*)(ref + 2 * refStride + 8)));
1875 ss2 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)src), _mm_loadu_si128((__m128i*)(src + 2 * srcStride)));
1876 ss3 = _mm256_mpsadbw_epu8(ss0, ss2, 0);
1877 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1878 ss4 = _mm256_mpsadbw_epu8(ss1, ss2, 18); // 010 010
1879 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1880
1881 src += srcStride * 4;
1882 ref += refStride * 4;
1883 ss0 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)ref), _mm_loadu_si128((__m128i*)(ref + 2 * refStride)));
1884 ss1 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)(ref + 8)), _mm_loadu_si128((__m128i*)(ref + 2 * refStride + 8)));
1885 ss2 = _mm256_setr_m128i(_mm_loadu_si128((__m128i*)src), _mm_loadu_si128((__m128i*)(src + 2 * srcStride)));
1886 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 0));
1887 ss3 = _mm256_adds_epu16(ss3, _mm256_mpsadbw_epu8(ss0, ss2, 45)); // 101 101
1888 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss1, ss2, 18)); // 010 010
1889 ss4 = _mm256_adds_epu16(ss4, _mm256_mpsadbw_epu8(ss1, ss2, 63)); // 111 111
1890 sad_2 = _mm_adds_epu16(_mm256_extracti128_si256(ss3, 0), _mm256_extracti128_si256(ss3, 1));
1891 sad_3 = _mm_adds_epu16(_mm256_extracti128_si256(ss4, 0), _mm256_extracti128_si256(ss4, 1));
1892
1893 //16x16
1894 s3 = _mm_adds_epu16(_mm_adds_epu16(sad_0, sad_1), _mm_adds_epu16(sad_2, sad_3));
1895 //store the 8 SADs (the 16x16 SADs at the 8 search points)
1896 _mm_store_si128((__m128i*)pSad16x16, s3);
1897 //find the best for 16x16
1898 s3 = _mm_minpos_epu16(s3);
1899 temSum = _mm_extract_epi16(s3, 0) << 1;
1900 if (temSum < pBestSad16x16[0]) {
1901 pBestSad16x16[0] = temSum;
1902 xMv = _MVXT(mv) + (EB_S16)(_mm_extract_epi16(s3, 1) * 4);
1903 yMv = _MVYT(mv);
1904 pBestMV16x16[0] = ((EB_U16)yMv << 16) | ((EB_U16)xMv);
1905 }
1906
1907 //find the best for 8x8_0, 8x8_1, 8x8_2 & 8x8_3
1908 sad_0 = _mm_minpos_epu16(sad_0);
1909 sad_1 = _mm_minpos_epu16(sad_1);
1910 sad_2 = _mm_minpos_epu16(sad_2);
1911 sad_3 = _mm_minpos_epu16(sad_3);
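// Pack the four (minimum, index) pairs from _mm_minpos_epu16, widen them to 32 bits
// (sad_0 = the four minima, sad_1 = the four column indices), scale the minima by 2 to
// compensate for the skipped rows and the indices by 4 to form the MV x-offset (matching
// the * 4 used for the 16x16 update above), then compare-and-blend branchlessly against
// the stored best SADs and MVs of the four 8x8 blocks.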
1912 sad_0 = _mm_unpacklo_epi16(sad_0, sad_1);
1913 sad_2 = _mm_unpacklo_epi16(sad_2, sad_3);
1914 sad_0 = _mm_unpacklo_epi32(sad_0, sad_2);
1915 sad_1 = _mm_unpackhi_epi16(sad_0, _mm_setzero_si128());
1916 sad_0 = _mm_unpacklo_epi16(sad_0, _mm_setzero_si128());
1917 sad_0 = _mm_slli_epi32(sad_0, 1);
1918 sad_1 = _mm_slli_epi16(sad_1, 2);
1919 sad_2 = _mm_loadu_si128((__m128i*)pBestSad8x8);
1920 s3 = _mm_cmpgt_epi32(sad_2, sad_0);
1921 sad_0 = _mm_min_epu32(sad_0, sad_2);
1922 _mm_storeu_si128((__m128i*)pBestSad8x8, sad_0);
1923 sad_3 = _mm_loadu_si128((__m128i*)pBestMV8x8);
1924 sad_3 = _mm_andnot_si128(s3, sad_3);
1925 sad_2 = _mm_set1_epi32(mv);
1926 sad_2 = _mm_add_epi16(sad_2, sad_1);
1927 sad_2 = _mm_and_si128(sad_2, s3);
1928 sad_2 = _mm_or_si128(sad_2, sad_3);
1929 _mm_storeu_si128((__m128i*)pBestMV8x8, sad_2);
1930
1931 }
1932 void GetEightHorizontalSearchPointResults_32x32_64x64_PU_AVX2_INTRIN(
1933 EB_U16 *pSad16x16,
1934 EB_U32 *pBestSad32x32,
1935 EB_U32 *pBestSad64x64,
1936 EB_U32 *pBestMV32x32,
1937 EB_U32 *pBestMV64x64,
1938 EB_U32 mv)
1939 {
1940 EB_S16 xMv, yMv;
1941 EB_U32 temSum, bestMV64x64;
1942
1943 __m128i s0, s1, s2, s3, s4, s5, s6, s7, sad_0, sad_1;
1944 __m128i sad_00, sad_01, sad_10, sad_11, sad_20, sad_21, sad_30, sad_31;
1945 __m256i ss0, ss1, ss2, ss3, ss4, ss5, ss6, ss7;
1946
1947 s0 = _mm_setzero_si128();
1948 s1 = _mm_setzero_si128();
1949 s2 = _mm_setzero_si128();
1950 s3 = _mm_setzero_si128();
1951 s4 = _mm_setzero_si128();
1952 s5 = _mm_setzero_si128();
1953 s6 = _mm_setzero_si128();
1954 s7 = _mm_setzero_si128();
1955 sad_0 = _mm_setzero_si128();
1956 sad_1 = _mm_setzero_si128();
1957
1958 sad_00 = _mm_setzero_si128();
1959 sad_01 = _mm_setzero_si128();
1960 sad_10 = _mm_setzero_si128();
1961 sad_11 = _mm_setzero_si128();
1962 sad_20 = _mm_setzero_si128();
1963 sad_21 = _mm_setzero_si128();
1964 sad_30 = _mm_setzero_si128();
1965 sad_31 = _mm_setzero_si128();
1966
1967 ss0 = _mm256_setzero_si256();
1968 ss1 = _mm256_setzero_si256();
1969 ss2 = _mm256_setzero_si256();
1970 ss3 = _mm256_setzero_si256();
1971 ss4 = _mm256_setzero_si256();
1972 ss5 = _mm256_setzero_si256();
1973 ss6 = _mm256_setzero_si256();
1974 ss7 = _mm256_setzero_si256();
1975
1976
1977 /*--------------------
1978 | 32x32_0 | 32x32_1
1979 ----------------------
1980 | 32x32_2 | 32x32_3
1981 ----------------------*/
1982
1983
1984 /* data ordering in pSad16x16 buffer
1985
1986 Search Search Search
1987 Point 0 Point 1 Point 7
1988 ---------------------------------------
1989 16x16_0 | x | x | ...... | x |
1990 ---------------------------------------
1991 16x16_1 | x | x | ...... | x |
1992
1993 16x16_n | x | x | ...... | x |
1994
1995 ---------------------------------------
1996 16x16_15 | x | x | ...... | x |
1997 ---------------------------------------
1998 */
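// i.e. the SAD of 16x16 block b at horizontal search point p is pSad16x16[b * 8 + p],
// which is why each block is loaded below as one 128-bit vector at pSad16x16 + b * 8.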
1999
2000 // __m128i Zero = _mm_setzero_si128();
2001
2002 //32x32_0
2003 s0 = _mm_loadu_si128((__m128i*)(pSad16x16 + 0 * 8));
2004 s1 = _mm_loadu_si128((__m128i*)(pSad16x16 + 1 * 8));
2005 s2 = _mm_loadu_si128((__m128i*)(pSad16x16 + 2 * 8));
2006 s3 = _mm_loadu_si128((__m128i*)(pSad16x16 + 3 * 8));
2007
2008 s4 = _mm_unpackhi_epi16(s0, _mm_setzero_si128());
2009 s5 = _mm_unpacklo_epi16(s0, _mm_setzero_si128());
2010 s6 = _mm_unpackhi_epi16(s1, _mm_setzero_si128());
2011 s7 = _mm_unpacklo_epi16(s1, _mm_setzero_si128());
2012 s0 = _mm_add_epi32(s4, s6);
2013 s1 = _mm_add_epi32(s5, s7);
2014
2015 s4 = _mm_unpackhi_epi16(s2, _mm_setzero_si128());
2016 s5 = _mm_unpacklo_epi16(s2, _mm_setzero_si128());
2017 s6 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
2018 s7 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
2019 s2 = _mm_add_epi32(s4, s6);
2020 s3 = _mm_add_epi32(s5, s7);
2021
2022 sad_01 = _mm_add_epi32(s0, s2);
2023 sad_00 = _mm_add_epi32(s1, s3);
2024
2025 //32x32_1
2026 s0 = _mm_loadu_si128((__m128i*)(pSad16x16 + 4 * 8));
2027 s1 = _mm_loadu_si128((__m128i*)(pSad16x16 + 5 * 8));
2028 s2 = _mm_loadu_si128((__m128i*)(pSad16x16 + 6 * 8));
2029 s3 = _mm_loadu_si128((__m128i*)(pSad16x16 + 7 * 8));
2030
2031 s4 = _mm_unpackhi_epi16(s0, _mm_setzero_si128());
2032 s5 = _mm_unpacklo_epi16(s0, _mm_setzero_si128());
2033 s6 = _mm_unpackhi_epi16(s1, _mm_setzero_si128());
2034 s7 = _mm_unpacklo_epi16(s1, _mm_setzero_si128());
2035 s0 = _mm_add_epi32(s4, s6);
2036 s1 = _mm_add_epi32(s5, s7);
2037
2038 s4 = _mm_unpackhi_epi16(s2, _mm_setzero_si128());
2039 s5 = _mm_unpacklo_epi16(s2, _mm_setzero_si128());
2040 s6 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
2041 s7 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
2042 s2 = _mm_add_epi32(s4, s6);
2043 s3 = _mm_add_epi32(s5, s7);
2044
2045 sad_11 = _mm_add_epi32(s0, s2);
2046 sad_10 = _mm_add_epi32(s1, s3);
2047
2048 //32x32_2
2049 s0 = _mm_loadu_si128((__m128i*)(pSad16x16 + 8 * 8));
2050 s1 = _mm_loadu_si128((__m128i*)(pSad16x16 + 9 * 8));
2051 s2 = _mm_loadu_si128((__m128i*)(pSad16x16 + 10 * 8));
2052 s3 = _mm_loadu_si128((__m128i*)(pSad16x16 + 11 * 8));
2053
2054 s4 = _mm_unpackhi_epi16(s0, _mm_setzero_si128());
2055 s5 = _mm_unpacklo_epi16(s0, _mm_setzero_si128());
2056 s6 = _mm_unpackhi_epi16(s1, _mm_setzero_si128());
2057 s7 = _mm_unpacklo_epi16(s1, _mm_setzero_si128());
2058 s0 = _mm_add_epi32(s4, s6);
2059 s1 = _mm_add_epi32(s5, s7);
2060
2061 s4 = _mm_unpackhi_epi16(s2, _mm_setzero_si128());
2062 s5 = _mm_unpacklo_epi16(s2, _mm_setzero_si128());
2063 s6 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
2064 s7 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
2065 s2 = _mm_add_epi32(s4, s6);
2066 s3 = _mm_add_epi32(s5, s7);
2067
2068 sad_21 = _mm_add_epi32(s0, s2);
2069 sad_20 = _mm_add_epi32(s1, s3);
2070
2071 //32x32_3
2072 s0 = _mm_loadu_si128((__m128i*)(pSad16x16 + 12 * 8));
2073 s1 = _mm_loadu_si128((__m128i*)(pSad16x16 + 13 * 8));
2074 s2 = _mm_loadu_si128((__m128i*)(pSad16x16 + 14 * 8));
2075 s3 = _mm_loadu_si128((__m128i*)(pSad16x16 + 15 * 8));
2076
2077 s4 = _mm_unpackhi_epi16(s0, _mm_setzero_si128());
2078 s5 = _mm_unpacklo_epi16(s0, _mm_setzero_si128());
2079 s6 = _mm_unpackhi_epi16(s1, _mm_setzero_si128());
2080 s7 = _mm_unpacklo_epi16(s1, _mm_setzero_si128());
2081 s0 = _mm_add_epi32(s4, s6);
2082 s1 = _mm_add_epi32(s5, s7);
2083
2084 s4 = _mm_unpackhi_epi16(s2, _mm_setzero_si128());
2085 s5 = _mm_unpacklo_epi16(s2, _mm_setzero_si128());
2086 s6 = _mm_unpackhi_epi16(s3, _mm_setzero_si128());
2087 s7 = _mm_unpacklo_epi16(s3, _mm_setzero_si128());
2088 s2 = _mm_add_epi32(s4, s6);
2089 s3 = _mm_add_epi32(s5, s7);
2090
2091 sad_31 = _mm_add_epi32(s0, s2);
2092 sad_30 = _mm_add_epi32(s1, s3);
2093
2094 sad_0 = _mm_add_epi32(_mm_add_epi32(sad_00, sad_10), _mm_add_epi32(sad_20, sad_30));
2095 sad_1 = _mm_add_epi32(_mm_add_epi32(sad_01, sad_11), _mm_add_epi32(sad_21, sad_31));
2096 sad_0 = _mm_slli_epi32(sad_0, 1);
2097 sad_1 = _mm_slli_epi32(sad_1, 1);
2098
2099
2100 bestMV64x64 = 0xff;
2101
2102 //sad_0
2103 temSum = _mm_extract_epi32(sad_0, 0);
2104 if (temSum <= pBestSad64x64[0]) {
2105 pBestSad64x64[0] = temSum;
2106 bestMV64x64 = 0;
2107 }
2108 temSum = _mm_extract_epi32(sad_0, 1);
2109 if (temSum <= pBestSad64x64[0]) {
2110 pBestSad64x64[0] = temSum;
2111 bestMV64x64 = 1 * 4;
2112 }
2113 temSum = _mm_extract_epi32(sad_0, 2);
2114 if (temSum <= pBestSad64x64[0]) {
2115 pBestSad64x64[0] = temSum;
2116 bestMV64x64 = 2 * 4;
2117 }
2118 temSum = _mm_extract_epi32(sad_0, 3);
2119 if (temSum <= pBestSad64x64[0]) {
2120 pBestSad64x64[0] = temSum;
2121 bestMV64x64 = 3 * 4;
2122 }
2123
2124 //sad_1
2125 temSum = _mm_extract_epi32(sad_1, 0);
2126 if (temSum <= pBestSad64x64[0]) {
2127 pBestSad64x64[0] = temSum;
2128 bestMV64x64 = 4 * 4;
2129 }
2130 temSum = _mm_extract_epi32(sad_1, 1);
2131 if (temSum <= pBestSad64x64[0]) {
2132 pBestSad64x64[0] = temSum;
2133 bestMV64x64 = 5 * 4;
2134 }
2135 temSum = _mm_extract_epi32(sad_1, 2);
2136 if (temSum <= pBestSad64x64[0]) {
2137 pBestSad64x64[0] = temSum;
2138 bestMV64x64 = 6 * 4;
2139 }
2140 temSum = _mm_extract_epi32(sad_1, 3);
2141 if (temSum <= pBestSad64x64[0]) {
2142 pBestSad64x64[0] = temSum;
2143 bestMV64x64 = 7 * 4;
2144 }
2145
2146 if (bestMV64x64 != 0xff) {
2147 xMv = _MVXT(mv) + (EB_S16)bestMV64x64; yMv = _MVYT(mv);
2148 pBestMV64x64[0] = ((EB_U16)yMv << 16) | ((EB_U16)xMv);
2149 }
2150
2151 // ****CODE PAST HERE IS BUGGY FOR GCC****
2152
2153 // XY
2154 // X: 32x32 block [0..3]
2155 // Y: Search position [0..7]
2156 ss0 = _mm256_setr_m128i(sad_00, sad_01); // 07 06 05 04 03 02 01 00
2157 ss1 = _mm256_setr_m128i(sad_10, sad_11); // 17 16 15 14 13 12 11 10
2158 ss2 = _mm256_setr_m128i(sad_20, sad_21); // 27 26 25 24 23 22 21 20
2159 ss3 = _mm256_setr_m128i(sad_30, sad_31); // 37 36 35 34 33 32 31 30
2160 ss4 = _mm256_unpacklo_epi32(ss0, ss1); // 15 05 14 04 11 01 10 00
2161 ss5 = _mm256_unpacklo_epi32(ss2, ss3); // 35 25 34 24 31 21 30 20
2162 ss6 = _mm256_unpackhi_epi32(ss0, ss1); // 17 07 16 06 13 03 12 02
2163 ss7 = _mm256_unpackhi_epi32(ss2, ss3); // 37 27 36 26 33 23 32 22
2164 ss0 = _mm256_unpacklo_epi64(ss4, ss5); // 34 24 14 04 30 20 10 00
2165 ss1 = _mm256_unpackhi_epi64(ss4, ss5); // 35 25 15 05 31 21 11 01
2166 ss2 = _mm256_unpacklo_epi64(ss6, ss7); // 36 26 16 06 32 22 12 02
2167 ss3 = _mm256_unpackhi_epi64(ss6, ss7); // 37 27 17 07 33 23 13 03
2168
2169 // ss4 | ss5-2 | ss6 |
2170 // a0 : a1 | a2 : a3 | min(a0, a1) : min(a2, a3) | | (ss4 & !ss6) | ((ss5-2) & ss6) | ((ss4 & !ss6) | ((ss5-2) & ss6)) |
2171 // > (-1) | > (-3) | > (-1) | a3 | 0 | -3 | -3 |
2172 // > (-1) | > (-3) | <= (0) | a1 | -1 | 0 | -1 |
2173 // > (-1) | <= (-2) | > (-1) | a2 | 0 | -2 | -2 |
2174 // > (-1) | <= (-2) | <= (0) | a1 | -1 | 0 | -1 |
2175 // <= (0) | > (-3) | > (-1) | a3 | 0 | -3 | -3 |
2176 // <= (0) | > (-3) | <= (0) | a0 | 0 | 0 | 0 |
2177 // <= (0) | <= (-2) | > (-1) | a2 | 0 | -2 | -2 |
2178 // <= (0) | <= (-2) | <= (0) | a0 | 0 | 0 | 0 |
2179
2180 // *** 8 search points per position ***
2181
2182 // ss0: Search Pos 0,4 for blocks 0,1,2,3
2183 // ss1: Search Pos 1,5 for blocks 0,1,2,3
2184 // ss2: Search Pos 2,6 for blocks 0,1,2,3
2185 // ss3: Search Pos 3,7 for blocks 0,1,2,3
2186
2187 ss4 = _mm256_cmpgt_epi32(ss0, ss1);
2188 // not different printf("%d\n", _mm_extract_epi32(_mm256_extracti128_si256(ss4, 0), 0)); // DEBUG
2189 //ss4 = _mm256_or_si256(_mm256_cmpgt_epi32(ss0, ss1), _mm256_cmpeq_epi32(ss0, ss1));
2190 ss0 = _mm256_min_epi32(ss0, ss1);
2191 ss5 = _mm256_cmpgt_epi32(ss2, ss3);
2192 //ss5 = _mm256_or_si256(_mm256_cmpgt_epi32(ss2, ss3), _mm256_cmpeq_epi32(ss2, ss3));
2193 ss2 = _mm256_min_epi32(ss2, ss3);
2194 ss5 = _mm256_sub_epi32(ss5, _mm256_set1_epi32(2)); // ss5-2
2195
2196
2197 // *** 4 search points per position ***
2198 ss6 = _mm256_cmpgt_epi32(ss0, ss2);
2199 //ss6 = _mm256_or_si256(_mm256_cmpgt_epi32(ss0, ss2), _mm256_cmpeq_epi32(ss0, ss2));
2200 ss0 = _mm256_min_epi32(ss0, ss2);
2201 ss5 = _mm256_and_si256(ss5, ss6); // (ss5-2) & ss6
2202 ss4 = _mm256_andnot_si256(ss6, ss4); // ss4 & !ss6
2203 ss4 = _mm256_or_si256(ss4, ss5); // (ss4 & !ss6) | ((ss5-2) & ss6)
2204
2205 // *** 2 search points per position ***
2206 s0 = _mm_setzero_si128();
2207 s1 = _mm_setzero_si128();
2208 s2 = _mm_setzero_si128();
2209 s3 = _mm_setzero_si128();
2210 s4 = _mm_setzero_si128();
2211 s5 = _mm_setzero_si128();
2212 s6 = _mm_setzero_si128();
2213 s7 = _mm_setzero_si128();
2214
2215 // ss0 = 8 SADs, two search points for each 32x32
2216 // ss4 = 8 MVs, two search points for each 32x32
2217 //
2218 // XY
2219 // X: 32x32 block [0..3]
2220 // Y: search position [0..1]
2221 // Format: 00 10 20 30 01 11 21 31
2222
2223 // Each 128 bits contains 4 32x32 32bit block results
2224 #ifdef __GNUC__
2225 // SAD
2226 s0 = _mm256_extracti128_si256(ss0, 1);
2227 s1 = _mm256_extracti128_si256(ss0, 0);
2228 // MV
2229 s2 = _mm256_extracti128_si256(ss4, 1);
2230 s3 = _mm256_extracti128_si256(ss4, 0);
2231 #else
2232 // SAD
2233 s0 = _mm256_extracti128_si256(ss0, 0);
2234 s1 = _mm256_extracti128_si256(ss0, 1);
2235 // MV
2236 s2 = _mm256_extracti128_si256(ss4, 0);
2237 s3 = _mm256_extracti128_si256(ss4, 1);
2238 #endif
2239
2240 //// Should be fine
2241 //printf("sad0 %d, %d, %d, %d\n", _mm_extract_epi32(s0, 0), _mm_extract_epi32(s0, 1), _mm_extract_epi32(s0, 2), _mm_extract_epi32(s0, 3)); // DEBUG
2242 //printf("sad1 %d, %d, %d, %d\n", _mm_extract_epi32(s1, 0), _mm_extract_epi32(s1, 1), _mm_extract_epi32(s1, 2), _mm_extract_epi32(s1, 3)); // DEBUG
2243 //printf("mv0 %d, %d, %d, %d\n", _mm_extract_epi32(s2, 0), _mm_extract_epi32(s2, 1), _mm_extract_epi32(s2, 2), _mm_extract_epi32(s2, 3)); // DEBUG
2244 //printf("mv1 %d, %d, %d, %d\n", _mm_extract_epi32(s3, 0), _mm_extract_epi32(s3, 1), _mm_extract_epi32(s3, 2), _mm_extract_epi32(s3, 3)); // DEBUG
2245
2246
2247 // Choose the best MV out of the two, use s4 to hold results of min
2248 s4 = _mm_cmpgt_epi32(s0, s1);
2249
2250 // DIFFERENT BETWEEN VS AND GCC
2251 // printf("%d, %d, %d, %d\n", _mm_extract_epi32(s4, 0), _mm_extract_epi32(s4, 1), _mm_extract_epi32(s4, 2), _mm_extract_epi32(s4, 3)); // DEBUG
2252
2253 //s4 = _mm_or_si128(_mm_cmpgt_epi32(s0, s1), _mm_cmpeq_epi32(s0, s1));
2254 s0 = _mm_min_epi32(s0, s1);
2255
2256
2257
2258 // Extract MV's based on the blocks to s2
2259 s3 = _mm_sub_epi32(s3, _mm_set1_epi32(4)); // s3-4
2260 // Remove the MV's that are not used from s2
2261 s2 = _mm_andnot_si128(s4, s2);
2262 // Remove the MV's that are not used from s3 (inverse from s2 above operation)
2263 s3 = _mm_and_si128(s4, s3);
2264 // Combine the remaining candidates into s2
2265 s2 = _mm_or_si128(s2, s3);
2266 // Convert MV's into encoders format
2267 s2 = _mm_sub_epi32(_mm_setzero_si128(), s2);
2268 s2 = _mm_slli_epi32(s2, 2); // mv info
2269
2270
2271 // ***SAD***
2272 // s0: current SAD candidates for each 32x32
2273 // s1: best SAD's for 32x32
2274
2275 // << 1 to compensate for every other line
2276 s0 = _mm_slli_epi32(s0, 1); // best sad info
2277 // Load best SAD's
2278 s1 = _mm_loadu_si128((__m128i*)pBestSad32x32);
2279
2280 // Determine which candidates are better than the current best SAD's.
2281 // s4 is used to determine the MV's of the new best SAD's
2282 s4 = _mm_cmpgt_epi32(s1, s0);
2283 // not different printf("%d, %d, %d, %d\n", _mm_extract_epi32(s4, 0), _mm_extract_epi32(s4, 1), _mm_extract_epi32(s4, 2), _mm_extract_epi32(s4, 3)); // DEBUG
2284 //s4 = _mm_or_si128(_mm_cmpgt_epi32(s1, s0), _mm_cmpeq_epi32(s1, s0));
2285 // Combine old and new min SAD's
2286 s0 = _mm_min_epu32(s0, s1);
2287 // Store new best SAD's back to memory
2288 _mm_storeu_si128((__m128i*)pBestSad32x32, s0);
2289
2290
2291 // ***Motion Vectors***
2292 // Load best MV's
2293 // s3: candidate MV's
2294 // s4: results of comparing SAD's
2295 // s5: previous best MV's
2296
2297 // Load previous best MV's
2298 s5 = _mm_loadu_si128((__m128i*)pBestMV32x32);
2299 // Remove the MV's that are being replaced
2300 s5 = _mm_andnot_si128(s4, s5);
2301 // Set s3 to the base MV
2302 s3 = _mm_set1_epi32(mv);
2303 // Add candidate MV's to base MV
2304 s3 = _mm_add_epi16(s3, s2);
2305 // Remove non-candidates
2306 s3 = _mm_and_si128(s3, s4);
2307 // Combine remaining candidates with remaining best MVs
2308 s3 = _mm_or_si128(s3, s5);
2309 // Store back to memory
2310 _mm_storeu_si128((__m128i*)pBestMV32x32, s3);
2311 }
2312
2313 #endif
2314