1 /*
2 * Copyright(c) 2019 Intel Corporation
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
10 */
11
12 #include "EbDefinitions.h"
13
14 #if EN_AVX512_SUPPORT
15 #include <assert.h>
16 #include "EbComputeSAD_AVX2.h"
17 #include <immintrin.h>
18 #include "EbMemory_AVX2.h"
19 #include "transpose_avx2.h"
20 #include "EbUtility.h"
21 #include "EbComputeSAD_C.h"
22
sad64_kernel_avx512(const __m512i s,const uint8_t * const ref,__m512i * const sum)23 static INLINE void sad64_kernel_avx512(const __m512i s, const uint8_t *const ref,
24 __m512i *const sum) {
25 const __m512i r = _mm512_loadu_si512((__m512i *)ref);
26 *sum = _mm512_add_epi32(*sum, _mm512_sad_epu8(s, r));
27 }
28
sad64_avx512(const uint8_t * const src,const uint8_t * ref,__m512i * const sum)29 static INLINE void sad64_avx512(const uint8_t *const src, const uint8_t *ref, __m512i *const sum) {
30 const __m512i s = _mm512_loadu_si512((__m512i *)src);
31 sad64_kernel_avx512(s, ref, sum);
32 }
33
sad_final_avx512(const __m512i zmm)34 static INLINE uint32_t sad_final_avx512(const __m512i zmm) {
35 const __m256i zmm_L = _mm512_castsi512_si256(zmm);
36 const __m256i zmm_H = _mm512_extracti64x4_epi64(zmm, 1);
37 const __m256i ymm = _mm256_add_epi32(zmm_L, zmm_H);
38 const __m128i ymm_L = _mm256_castsi256_si128(ymm);
39 const __m128i ymm_H = _mm256_extracti128_si256(ymm, 1);
40 const __m128i xmm0 = _mm_add_epi32(ymm_L, ymm_H);
41 const __m128i xmm0_H = _mm_srli_si128(xmm0, 8);
42 const __m128i xmm1 = _mm_add_epi32(xmm0, xmm0_H);
43
44 return _mm_extract_epi32(xmm1, 0);
45 }
46
47 /*******************************************************************************
48 * Requirement: height % 2 = 0
49 *******************************************************************************/
compute64x_m_sad_avx512_intrin(const uint8_t * src,const uint32_t src_stride,const uint8_t * ref,const uint32_t ref_stride,const uint32_t height)50 static AOM_FORCE_INLINE uint32_t compute64x_m_sad_avx512_intrin(const uint8_t *src,
51 const uint32_t src_stride,
52 const uint8_t *ref,
53 const uint32_t ref_stride,
54 const uint32_t height) {
55 uint32_t y = height;
56 __m512i zmm = _mm512_setzero_si512();
57
58 do {
59 sad64_avx512(src + 0 * src_stride, ref + 0 * ref_stride, &zmm);
60 sad64_avx512(src + 1 * src_stride, ref + 1 * ref_stride, &zmm);
61 src += src_stride << 1;
62 ref += ref_stride << 1;
63 y -= 2;
64 } while (y);
65
66 return sad_final_avx512(zmm);
67 }
68
69 /*******************************************************************************
70 * Requirement: height % 2 = 0
71 *******************************************************************************/
72 SIMD_INLINE uint32_t
compute128x_m_sad_avx512_intrin(const uint8_t * src,const uint32_t src_stride,const uint8_t * ref,const uint32_t ref_stride,const uint32_t height)73 compute128x_m_sad_avx512_intrin(const uint8_t *src, // input parameter, source samples Ptr
74 const uint32_t src_stride, // input parameter, source stride
75 const uint8_t *ref, // input parameter, reference samples Ptr
76 const uint32_t ref_stride, // input parameter, reference stride
77 const uint32_t height) // input parameter, block height (M)
78 {
79 uint32_t y = height;
80 __m512i zmm = _mm512_setzero_si512();
81
82 do {
83 sad64_avx512(src + 0 * src_stride + 0 * 64, ref + 0 * ref_stride + 0 * 64, &zmm);
84 sad64_avx512(src + 0 * src_stride + 1 * 64, ref + 0 * ref_stride + 1 * 64, &zmm);
85 sad64_avx512(src + 1 * src_stride + 0 * 64, ref + 1 * ref_stride + 0 * 64, &zmm);
86 sad64_avx512(src + 1 * src_stride + 1 * 64, ref + 1 * ref_stride + 1 * 64, &zmm);
87 src += src_stride << 1;
88 ref += ref_stride << 1;
89 y -= 2;
90 } while (y);
91
92 return sad_final_avx512(zmm);
93 }
94
// 64x16 SAD: forward to the shared 64-wide kernel.
uint32_t svt_aom_sad64x16_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
                                 int ref_stride) {
    const uint32_t block_height = 16;
    return compute64x_m_sad_avx512_intrin(src_ptr, src_stride, ref_ptr, ref_stride, block_height);
}
99
// 64x32 SAD: forward to the shared 64-wide kernel.
uint32_t svt_aom_sad64x32_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
                                 int ref_stride) {
    const uint32_t block_height = 32;
    return compute64x_m_sad_avx512_intrin(src_ptr, src_stride, ref_ptr, ref_stride, block_height);
}
104
// 64x64 SAD: forward to the shared 64-wide kernel.
uint32_t svt_aom_sad64x64_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
                                 int ref_stride) {
    const uint32_t block_height = 64;
    return compute64x_m_sad_avx512_intrin(src_ptr, src_stride, ref_ptr, ref_stride, block_height);
}
109
// 64x128 SAD: forward to the shared 64-wide kernel.
uint32_t svt_aom_sad64x128_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
                                  int ref_stride) {
    const uint32_t block_height = 128;
    return compute64x_m_sad_avx512_intrin(src_ptr, src_stride, ref_ptr, ref_stride, block_height);
}
114
// 128x64 SAD: forward to the shared 128-wide kernel.
uint32_t svt_aom_sad128x64_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
                                  int ref_stride) {
    const uint32_t block_height = 64;
    return compute128x_m_sad_avx512_intrin(src_ptr, src_stride, ref_ptr, ref_stride, block_height);
}
119
// 128x128 SAD: forward to the shared 128-wide kernel.
uint32_t svt_aom_sad128x128_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
                                   int ref_stride) {
    const uint32_t block_height = 128;
    return compute128x_m_sad_avx512_intrin(src_ptr, src_stride, ref_ptr, ref_stride, block_height);
}
124
sad64_4d_avx512(const uint8_t * const src,const uint8_t * const ref_array[4],const uint32_t offset,__m512i sum[4])125 static INLINE void sad64_4d_avx512(const uint8_t *const src, const uint8_t *const ref_array[4],
126 const uint32_t offset, __m512i sum[4]) {
127 const __m512i s = _mm512_loadu_si512((__m512i *)(src + offset));
128 sad64_kernel_avx512(s, ref_array[0] + offset, &sum[0]);
129 sad64_kernel_avx512(s, ref_array[1] + offset, &sum[1]);
130 sad64_kernel_avx512(s, ref_array[2] + offset, &sum[2]);
131 sad64_kernel_avx512(s, ref_array[3] + offset, &sum[3]);
132 }
133
add_hi_lo_32_avx512(const __m512i src)134 static INLINE __m256i add_hi_lo_32_avx512(const __m512i src) {
135 const __m256i s0 = _mm512_castsi512_si256(src);
136 const __m256i s1 = _mm512_extracti64x4_epi64(src, 1);
137 return _mm256_add_epi32(s0, s1);
138 }
139
hadd_four_32_avx2(const __m256i src0,const __m256i src1,const __m256i src2,const __m256i src3)140 static INLINE __m128i hadd_four_32_avx2(const __m256i src0, const __m256i src1, const __m256i src2,
141 const __m256i src3) {
142 const __m256i s01 = _mm256_hadd_epi32(src0, src1); // 0 0 1 1 0 0 1 1
143 const __m256i s23 = _mm256_hadd_epi32(src2, src3); // 2 2 3 3 2 2 3 3
144 const __m256i s0123 = _mm256_hadd_epi32(s01, s23); // 0 1 2 3 0 1 2 3
145 const __m128i sum0 = _mm256_castsi256_si128(s0123); // 0 1 2 3
146 const __m128i sum1 = _mm256_extracti128_si256(s0123, 1); // 0 1 2 3
147 return _mm_add_epi32(sum0, sum1); // 0 1 2 3
148 }
149
hadd_four_32_avx512(const __m512i src0,const __m512i src1,const __m512i src2,const __m512i src3)150 static INLINE __m128i hadd_four_32_avx512(const __m512i src0, const __m512i src1,
151 const __m512i src2, const __m512i src3) {
152 __m256i s[4];
153
154 s[0] = add_hi_lo_32_avx512(src0);
155 s[1] = add_hi_lo_32_avx512(src1);
156 s[2] = add_hi_lo_32_avx512(src2);
157 s[3] = add_hi_lo_32_avx512(src3);
158
159 return hadd_four_32_avx2(s[0], s[1], s[2], s[3]);
160 }
161
compute128x_m_4d_sad_avx512_intrin(const uint8_t * src,const uint32_t src_stride,const uint8_t * const ref_array[4],const uint32_t ref_stride,uint32_t sad_array[4],const uint32_t height)162 SIMD_INLINE void compute128x_m_4d_sad_avx512_intrin(const uint8_t *src, const uint32_t src_stride,
163 const uint8_t *const ref_array[4],
164 const uint32_t ref_stride,
165 uint32_t sad_array[4], const uint32_t height) {
166 const uint8_t *ref[4];
167 uint32_t y = height;
168 __m512i zmm[4] = {0};
169
170 ref[0] = ref_array[0];
171 ref[1] = ref_array[1];
172 ref[2] = ref_array[2];
173 ref[3] = ref_array[3];
174
175 do {
176 sad64_4d_avx512(src, ref, 0 * 64, zmm);
177 sad64_4d_avx512(src, ref, 1 * 64, zmm);
178 src += src_stride;
179 ref[0] += ref_stride;
180 ref[1] += ref_stride;
181 ref[2] += ref_stride;
182 ref[3] += ref_stride;
183 } while (--y);
184
185 const __m128i sum = hadd_four_32_avx512(zmm[0], zmm[1], zmm[2], zmm[3]);
186 _mm_storeu_si128((__m128i *)sad_array, sum);
187 }
188
// 128x64 four-reference SAD: forward to the shared 128-wide 4d kernel.
void svt_aom_sad128x64x4d_avx512(const uint8_t *src, int src_stride,
                                 const uint8_t *const ref_array[4], int ref_stride,
                                 uint32_t sad_array[4]) {
    const uint32_t block_height = 64;
    compute128x_m_4d_sad_avx512_intrin(src, src_stride, ref_array, ref_stride, sad_array,
                                       block_height);
}
194
// 128x128 four-reference SAD: forward to the shared 128-wide 4d kernel.
void svt_aom_sad128x128x4d_avx512(const uint8_t *src, int src_stride,
                                  const uint8_t *const ref_array[4], int ref_stride,
                                  uint32_t sad_array[4]) {
    const uint32_t block_height = 128;
    compute128x_m_4d_sad_avx512_intrin(src, src_stride, ref_array, ref_stride, sad_array,
                                       block_height);
}
200
201 // =============================================================================
202
add16x8x2to32bit(const __m256i sads256[2],__m128i sads128[2])203 static INLINE void add16x8x2to32bit(const __m256i sads256[2], __m128i sads128[2]) {
204 const __m256i zero = _mm256_setzero_si256();
205 const __m256i sad256_0_lo = _mm256_unpacklo_epi16(sads256[0], zero);
206 const __m256i sad256_0_hi = _mm256_unpackhi_epi16(sads256[0], zero);
207 const __m256i sad256_1_lo = _mm256_unpacklo_epi16(sads256[1], zero);
208 const __m256i sad256_1_hi = _mm256_unpackhi_epi16(sads256[1], zero);
209 const __m256i sad256_lo = _mm256_add_epi32(sad256_0_lo, sad256_1_lo);
210 const __m256i sad256_hi = _mm256_add_epi32(sad256_0_hi, sad256_1_hi);
211 const __m128i sad128_ll = _mm256_castsi256_si128(sad256_lo);
212 const __m128i sad128_lh = _mm256_extracti128_si256(sad256_lo, 1);
213 const __m128i sad128_hl = _mm256_castsi256_si128(sad256_hi);
214 const __m128i sad128_hh = _mm256_extracti128_si256(sad256_hi, 1);
215 sads128[0] = _mm_add_epi32(sad128_ll, sad128_lh);
216 sads128[1] = _mm_add_epi32(sad128_hl, sad128_hh);
217 }
218
add16x8x3to32bit(const __m256i sads256[3],__m128i sads128[2])219 SIMD_INLINE void add16x8x3to32bit(const __m256i sads256[3], __m128i sads128[2]) {
220 const __m256i zero = _mm256_setzero_si256();
221 const __m256i sad256_0_lo = _mm256_unpacklo_epi16(sads256[0], zero);
222 const __m256i sad256_0_hi = _mm256_unpackhi_epi16(sads256[0], zero);
223 const __m256i sad256_1_lo = _mm256_unpacklo_epi16(sads256[1], zero);
224 const __m256i sad256_1_hi = _mm256_unpackhi_epi16(sads256[1], zero);
225 const __m256i sad256_2_lo = _mm256_unpacklo_epi16(sads256[2], zero);
226 const __m256i sad256_2_hi = _mm256_unpackhi_epi16(sads256[2], zero);
227 const __m256i sad256_01_lo = _mm256_add_epi32(sad256_0_lo, sad256_1_lo);
228 const __m256i sad256_01_hi = _mm256_add_epi32(sad256_0_hi, sad256_1_hi);
229 const __m256i sad256_lo = _mm256_add_epi32(sad256_01_lo, sad256_2_lo);
230 const __m256i sad256_hi = _mm256_add_epi32(sad256_01_hi, sad256_2_hi);
231 const __m128i sad128_ll = _mm256_castsi256_si128(sad256_lo);
232 const __m128i sad128_lh = _mm256_extracti128_si256(sad256_lo, 1);
233 const __m128i sad128_hl = _mm256_castsi256_si128(sad256_hi);
234 const __m128i sad128_hh = _mm256_extracti128_si256(sad256_hi, 1);
235 sads128[0] = _mm_add_epi32(sad128_ll, sad128_lh);
236 sads128[1] = _mm_add_epi32(sad128_hl, sad128_hh);
237 }
238
add16x8x4to32bit(const __m256i sads256[4],__m128i sads128[2])239 SIMD_INLINE void add16x8x4to32bit(const __m256i sads256[4], __m128i sads128[2]) {
240 const __m256i zero = _mm256_setzero_si256();
241 const __m256i sad256_0_lo = _mm256_unpacklo_epi16(sads256[0], zero);
242 const __m256i sad256_0_hi = _mm256_unpackhi_epi16(sads256[0], zero);
243 const __m256i sad256_1_lo = _mm256_unpacklo_epi16(sads256[1], zero);
244 const __m256i sad256_1_hi = _mm256_unpackhi_epi16(sads256[1], zero);
245 const __m256i sad256_2_lo = _mm256_unpacklo_epi16(sads256[2], zero);
246 const __m256i sad256_2_hi = _mm256_unpackhi_epi16(sads256[2], zero);
247 const __m256i sad256_3_lo = _mm256_unpacklo_epi16(sads256[3], zero);
248 const __m256i sad256_3_hi = _mm256_unpackhi_epi16(sads256[3], zero);
249 const __m256i sad256_01_lo = _mm256_add_epi32(sad256_0_lo, sad256_1_lo);
250 const __m256i sad256_01_hi = _mm256_add_epi32(sad256_0_hi, sad256_1_hi);
251 const __m256i sad256_23_lo = _mm256_add_epi32(sad256_2_lo, sad256_3_lo);
252 const __m256i sad256_23_hi = _mm256_add_epi32(sad256_2_hi, sad256_3_hi);
253 const __m256i sad256_lo = _mm256_add_epi32(sad256_01_lo, sad256_23_lo);
254 const __m256i sad256_hi = _mm256_add_epi32(sad256_01_hi, sad256_23_hi);
255 const __m128i sad128_ll = _mm256_castsi256_si128(sad256_lo);
256 const __m128i sad128_lh = _mm256_extracti128_si256(sad256_lo, 1);
257 const __m128i sad128_hl = _mm256_castsi256_si128(sad256_hi);
258 const __m128i sad128_hh = _mm256_extracti128_si256(sad256_hi, 1);
259 sads128[0] = _mm_add_epi32(sad128_ll, sad128_lh);
260 sads128[1] = _mm_add_epi32(sad128_hl, sad128_hh);
261 }
262
add16x8x6to32bit(const __m256i sads256[6],__m128i sads128[2])263 SIMD_INLINE void add16x8x6to32bit(const __m256i sads256[6], __m128i sads128[2]) {
264 const __m256i zero = _mm256_setzero_si256();
265 const __m256i sad256_0_lo = _mm256_unpacklo_epi16(sads256[0], zero);
266 const __m256i sad256_0_hi = _mm256_unpackhi_epi16(sads256[0], zero);
267 const __m256i sad256_1_lo = _mm256_unpacklo_epi16(sads256[1], zero);
268 const __m256i sad256_1_hi = _mm256_unpackhi_epi16(sads256[1], zero);
269 const __m256i sad256_2_lo = _mm256_unpacklo_epi16(sads256[2], zero);
270 const __m256i sad256_2_hi = _mm256_unpackhi_epi16(sads256[2], zero);
271 const __m256i sad256_3_lo = _mm256_unpacklo_epi16(sads256[3], zero);
272 const __m256i sad256_3_hi = _mm256_unpackhi_epi16(sads256[3], zero);
273 const __m256i sad256_4_lo = _mm256_unpacklo_epi16(sads256[4], zero);
274 const __m256i sad256_4_hi = _mm256_unpackhi_epi16(sads256[4], zero);
275 const __m256i sad256_5_lo = _mm256_unpacklo_epi16(sads256[5], zero);
276 const __m256i sad256_5_hi = _mm256_unpackhi_epi16(sads256[5], zero);
277 const __m256i sad256_01_lo = _mm256_add_epi32(sad256_0_lo, sad256_1_lo);
278 const __m256i sad256_01_hi = _mm256_add_epi32(sad256_0_hi, sad256_1_hi);
279 const __m256i sad256_23_lo = _mm256_add_epi32(sad256_2_lo, sad256_3_lo);
280 const __m256i sad256_23_hi = _mm256_add_epi32(sad256_2_hi, sad256_3_hi);
281 const __m256i sad256_45_lo = _mm256_add_epi32(sad256_4_lo, sad256_5_lo);
282 const __m256i sad256_45_hi = _mm256_add_epi32(sad256_4_hi, sad256_5_hi);
283 const __m256i sad256_0123_lo = _mm256_add_epi32(sad256_01_lo, sad256_23_lo);
284 const __m256i sad256_0123_hi = _mm256_add_epi32(sad256_01_hi, sad256_23_hi);
285 const __m256i sad256_lo = _mm256_add_epi32(sad256_0123_lo, sad256_45_lo);
286 const __m256i sad256_hi = _mm256_add_epi32(sad256_0123_hi, sad256_45_hi);
287 const __m128i sad128_ll = _mm256_castsi256_si128(sad256_lo);
288 const __m128i sad128_lh = _mm256_extracti128_si256(sad256_lo, 1);
289 const __m128i sad128_hl = _mm256_castsi256_si128(sad256_hi);
290 const __m128i sad128_hh = _mm256_extracti128_si256(sad256_hi, 1);
291 sads128[0] = _mm_add_epi32(sad128_ll, sad128_lh);
292 sads128[1] = _mm_add_epi32(sad128_hl, sad128_hh);
293 }
294
add16x8x8to32bit(const __m256i sads256[8],__m128i sads128[2])295 SIMD_INLINE void add16x8x8to32bit(const __m256i sads256[8], __m128i sads128[2]) {
296 const __m256i zero = _mm256_setzero_si256();
297 const __m256i sad256_0_lo = _mm256_unpacklo_epi16(sads256[0], zero);
298 const __m256i sad256_0_hi = _mm256_unpackhi_epi16(sads256[0], zero);
299 const __m256i sad256_1_lo = _mm256_unpacklo_epi16(sads256[1], zero);
300 const __m256i sad256_1_hi = _mm256_unpackhi_epi16(sads256[1], zero);
301 const __m256i sad256_2_lo = _mm256_unpacklo_epi16(sads256[2], zero);
302 const __m256i sad256_2_hi = _mm256_unpackhi_epi16(sads256[2], zero);
303 const __m256i sad256_3_lo = _mm256_unpacklo_epi16(sads256[3], zero);
304 const __m256i sad256_3_hi = _mm256_unpackhi_epi16(sads256[3], zero);
305 const __m256i sad256_4_lo = _mm256_unpacklo_epi16(sads256[4], zero);
306 const __m256i sad256_4_hi = _mm256_unpackhi_epi16(sads256[4], zero);
307 const __m256i sad256_5_lo = _mm256_unpacklo_epi16(sads256[5], zero);
308 const __m256i sad256_5_hi = _mm256_unpackhi_epi16(sads256[5], zero);
309 const __m256i sad256_6_lo = _mm256_unpacklo_epi16(sads256[6], zero);
310 const __m256i sad256_6_hi = _mm256_unpackhi_epi16(sads256[6], zero);
311 const __m256i sad256_7_lo = _mm256_unpacklo_epi16(sads256[7], zero);
312 const __m256i sad256_7_hi = _mm256_unpackhi_epi16(sads256[7], zero);
313 const __m256i sad256_01_lo = _mm256_add_epi32(sad256_0_lo, sad256_1_lo);
314 const __m256i sad256_01_hi = _mm256_add_epi32(sad256_0_hi, sad256_1_hi);
315 const __m256i sad256_23_lo = _mm256_add_epi32(sad256_2_lo, sad256_3_lo);
316 const __m256i sad256_23_hi = _mm256_add_epi32(sad256_2_hi, sad256_3_hi);
317 const __m256i sad256_45_lo = _mm256_add_epi32(sad256_4_lo, sad256_5_lo);
318 const __m256i sad256_45_hi = _mm256_add_epi32(sad256_4_hi, sad256_5_hi);
319 const __m256i sad256_67_lo = _mm256_add_epi32(sad256_6_lo, sad256_7_lo);
320 const __m256i sad256_67_hi = _mm256_add_epi32(sad256_6_hi, sad256_7_hi);
321 const __m256i sad256_0123_lo = _mm256_add_epi32(sad256_01_lo, sad256_23_lo);
322 const __m256i sad256_0123_hi = _mm256_add_epi32(sad256_01_hi, sad256_23_hi);
323 const __m256i sad256_4567_lo = _mm256_add_epi32(sad256_45_lo, sad256_67_lo);
324 const __m256i sad256_4567_hi = _mm256_add_epi32(sad256_45_hi, sad256_67_hi);
325 const __m256i sad256_lo = _mm256_add_epi32(sad256_0123_lo, sad256_4567_lo);
326 const __m256i sad256_hi = _mm256_add_epi32(sad256_0123_hi, sad256_4567_hi);
327 const __m128i sad128_ll = _mm256_castsi256_si128(sad256_lo);
328 const __m128i sad128_lh = _mm256_extracti128_si256(sad256_lo, 1);
329 const __m128i sad128_hl = _mm256_castsi256_si128(sad256_hi);
330 const __m128i sad128_hh = _mm256_extracti128_si256(sad256_hi, 1);
331 sads128[0] = _mm_add_epi32(sad128_ll, sad128_lh);
332 sads128[1] = _mm_add_epi32(sad128_hl, sad128_hh);
333 }
334
add16x16x2to32bit(const __m512i sads512[2],__m256i sads256[2])335 SIMD_INLINE void add16x16x2to32bit(const __m512i sads512[2], __m256i sads256[2]) {
336 const __m512i zero = _mm512_setzero_si512();
337
338 const __m512i sad512_0_lo = _mm512_unpacklo_epi16(sads512[0], zero);
339 const __m512i sad512_0_hi = _mm512_unpackhi_epi16(sads512[0], zero);
340 const __m512i sad512_1_lo = _mm512_unpacklo_epi16(sads512[1], zero);
341 const __m512i sad512_1_hi = _mm512_unpackhi_epi16(sads512[1], zero);
342
343 // 0 1 2 3 8 9 A b 0 1 2 3 8 9 A b
344 // 4 5 6 7 C D E F 4 5 6 7 C D E F
345 const __m512i sad512_lo = _mm512_add_epi32(sad512_0_lo, sad512_1_lo);
346 const __m512i sad512_hi = _mm512_add_epi32(sad512_0_hi, sad512_1_hi);
347
348 const __m256i sad256_ll = _mm512_castsi512_si256(sad512_lo);
349 const __m256i sad256_lh = _mm512_extracti64x4_epi64(sad512_lo, 1);
350 const __m256i sad256_hl = _mm512_castsi512_si256(sad512_hi);
351 const __m256i sad256_hh = _mm512_extracti64x4_epi64(sad512_hi, 1);
352
353 // 0 1 2 3 8 9 A b
354 // 4 5 6 7 C D E F
355 const __m256i sad256_0 = _mm256_add_epi32(sad256_ll, sad256_lh);
356 const __m256i sad256_1 = _mm256_add_epi32(sad256_hl, sad256_hh);
357
358 // 0 1 2 3 4 5 6 7
359 // 8 9 A b C D E F
360 sads256[0] = _mm256_unpacklo_epi128(sad256_0, sad256_1);
361 sads256[1] = _mm256_unpackhi_epi128(sad256_0, sad256_1);
362 }
363
add16x16x3to32bit(const __m512i sads512[3],__m256i sads256[2])364 SIMD_INLINE void add16x16x3to32bit(const __m512i sads512[3], __m256i sads256[2]) {
365 const __m512i zero = _mm512_setzero_si512();
366
367 const __m512i sad512_0_lo = _mm512_unpacklo_epi16(sads512[0], zero);
368 const __m512i sad512_0_hi = _mm512_unpackhi_epi16(sads512[0], zero);
369 const __m512i sad512_1_lo = _mm512_unpacklo_epi16(sads512[1], zero);
370 const __m512i sad512_1_hi = _mm512_unpackhi_epi16(sads512[1], zero);
371 const __m512i sad512_2_lo = _mm512_unpacklo_epi16(sads512[2], zero);
372 const __m512i sad512_2_hi = _mm512_unpackhi_epi16(sads512[2], zero);
373
374 const __m512i sad512_01_lo = _mm512_add_epi32(sad512_0_lo, sad512_1_lo);
375 const __m512i sad512_01_hi = _mm512_add_epi32(sad512_0_hi, sad512_1_hi);
376
377 // 0 1 2 3 8 9 A b 0 1 2 3 8 9 A b
378 // 4 5 6 7 C D E F 4 5 6 7 C D E F
379 const __m512i sad512_lo = _mm512_add_epi32(sad512_01_lo, sad512_2_lo);
380 const __m512i sad512_hi = _mm512_add_epi32(sad512_01_hi, sad512_2_hi);
381
382 const __m256i sad256_ll = _mm512_castsi512_si256(sad512_lo);
383 const __m256i sad256_lh = _mm512_extracti64x4_epi64(sad512_lo, 1);
384 const __m256i sad256_hl = _mm512_castsi512_si256(sad512_hi);
385 const __m256i sad256_hh = _mm512_extracti64x4_epi64(sad512_hi, 1);
386
387 // 0 1 2 3 8 9 A b
388 // 4 5 6 7 C D E F
389 const __m256i sad256_0 = _mm256_add_epi32(sad256_ll, sad256_lh);
390 const __m256i sad256_1 = _mm256_add_epi32(sad256_hl, sad256_hh);
391
392 // 0 1 2 3 4 5 6 7
393 // 8 9 A b C D E F
394 sads256[0] = _mm256_unpacklo_epi128(sad256_0, sad256_1);
395 sads256[1] = _mm256_unpackhi_epi128(sad256_0, sad256_1);
396 }
397
add16x16x4to32bit(const __m512i sads512[4],__m256i sads256[2])398 SIMD_INLINE void add16x16x4to32bit(const __m512i sads512[4], __m256i sads256[2]) {
399 // Don't call two add16x16x2to32bit(), which is slower.
400 const __m512i zero = _mm512_setzero_si512();
401
402 const __m512i sad512_0_lo = _mm512_unpacklo_epi16(sads512[0], zero);
403 const __m512i sad512_0_hi = _mm512_unpackhi_epi16(sads512[0], zero);
404 const __m512i sad512_1_lo = _mm512_unpacklo_epi16(sads512[1], zero);
405 const __m512i sad512_1_hi = _mm512_unpackhi_epi16(sads512[1], zero);
406 const __m512i sad512_2_lo = _mm512_unpacklo_epi16(sads512[2], zero);
407 const __m512i sad512_2_hi = _mm512_unpackhi_epi16(sads512[2], zero);
408 const __m512i sad512_3_lo = _mm512_unpacklo_epi16(sads512[3], zero);
409 const __m512i sad512_3_hi = _mm512_unpackhi_epi16(sads512[3], zero);
410
411 const __m512i sad512_01_lo = _mm512_add_epi32(sad512_0_lo, sad512_1_lo);
412 const __m512i sad512_01_hi = _mm512_add_epi32(sad512_0_hi, sad512_1_hi);
413 const __m512i sad512_23_lo = _mm512_add_epi32(sad512_2_lo, sad512_3_lo);
414 const __m512i sad512_23_hi = _mm512_add_epi32(sad512_2_hi, sad512_3_hi);
415
416 // 0 1 2 3 8 9 A b 0 1 2 3 8 9 A b
417 // 4 5 6 7 C D E F 4 5 6 7 C D E F
418 const __m512i sad512_lo = _mm512_add_epi32(sad512_01_lo, sad512_23_lo);
419 const __m512i sad512_hi = _mm512_add_epi32(sad512_01_hi, sad512_23_hi);
420
421 const __m256i sad256_ll = _mm512_castsi512_si256(sad512_lo);
422 const __m256i sad256_lh = _mm512_extracti64x4_epi64(sad512_lo, 1);
423 const __m256i sad256_hl = _mm512_castsi512_si256(sad512_hi);
424 const __m256i sad256_hh = _mm512_extracti64x4_epi64(sad512_hi, 1);
425
426 // 0 1 2 3 8 9 A b
427 // 4 5 6 7 C D E F
428 const __m256i sad256_0 = _mm256_add_epi32(sad256_ll, sad256_lh);
429 const __m256i sad256_1 = _mm256_add_epi32(sad256_hl, sad256_hh);
430
431 // 0 1 2 3 4 5 6 7
432 // 8 9 A b C D E F
433 sads256[0] = _mm256_unpacklo_epi128(sad256_0, sad256_1);
434 sads256[1] = _mm256_unpackhi_epi128(sad256_0, sad256_1);
435 }
436
add16x16x6to32bit(const __m512i sads512[6],__m256i sads256[2])437 SIMD_INLINE void add16x16x6to32bit(const __m512i sads512[6], __m256i sads256[2]) {
438 const __m512i zero = _mm512_setzero_si512();
439
440 const __m512i sad512_0_lo = _mm512_unpacklo_epi16(sads512[0], zero);
441 const __m512i sad512_0_hi = _mm512_unpackhi_epi16(sads512[0], zero);
442 const __m512i sad512_1_lo = _mm512_unpacklo_epi16(sads512[1], zero);
443 const __m512i sad512_1_hi = _mm512_unpackhi_epi16(sads512[1], zero);
444 const __m512i sad512_2_lo = _mm512_unpacklo_epi16(sads512[2], zero);
445 const __m512i sad512_2_hi = _mm512_unpackhi_epi16(sads512[2], zero);
446 const __m512i sad512_3_lo = _mm512_unpacklo_epi16(sads512[3], zero);
447 const __m512i sad512_3_hi = _mm512_unpackhi_epi16(sads512[3], zero);
448 const __m512i sad512_4_lo = _mm512_unpacklo_epi16(sads512[4], zero);
449 const __m512i sad512_4_hi = _mm512_unpackhi_epi16(sads512[4], zero);
450 const __m512i sad512_5_lo = _mm512_unpacklo_epi16(sads512[5], zero);
451 const __m512i sad512_5_hi = _mm512_unpackhi_epi16(sads512[5], zero);
452
453 const __m512i sad512_01_lo = _mm512_add_epi32(sad512_0_lo, sad512_1_lo);
454 const __m512i sad512_01_hi = _mm512_add_epi32(sad512_0_hi, sad512_1_hi);
455 const __m512i sad512_23_lo = _mm512_add_epi32(sad512_2_lo, sad512_3_lo);
456 const __m512i sad512_23_hi = _mm512_add_epi32(sad512_2_hi, sad512_3_hi);
457 const __m512i sad512_45_lo = _mm512_add_epi32(sad512_4_lo, sad512_5_lo);
458 const __m512i sad512_45_hi = _mm512_add_epi32(sad512_4_hi, sad512_5_hi);
459 const __m512i sad512_0123_lo = _mm512_add_epi32(sad512_01_lo, sad512_23_lo);
460 const __m512i sad512_0123_hi = _mm512_add_epi32(sad512_01_hi, sad512_23_hi);
461
462 // 0 1 2 3 8 9 A b 0 1 2 3 8 9 A b
463 // 4 5 6 7 C D E F 4 5 6 7 C D E F
464 const __m512i sad512_lo = _mm512_add_epi32(sad512_0123_lo, sad512_45_lo);
465 const __m512i sad512_hi = _mm512_add_epi32(sad512_0123_hi, sad512_45_hi);
466
467 const __m256i sad256_ll = _mm512_castsi512_si256(sad512_lo);
468 const __m256i sad256_lh = _mm512_extracti64x4_epi64(sad512_lo, 1);
469 const __m256i sad256_hl = _mm512_castsi512_si256(sad512_hi);
470 const __m256i sad256_hh = _mm512_extracti64x4_epi64(sad512_hi, 1);
471
472 // 0 1 2 3 8 9 A b
473 // 4 5 6 7 C D E F
474 const __m256i sad256_0 = _mm256_add_epi32(sad256_ll, sad256_lh);
475 const __m256i sad256_1 = _mm256_add_epi32(sad256_hl, sad256_hh);
476
477 // 0 1 2 3 4 5 6 7
478 // 8 9 A b C D E F
479 sads256[0] = _mm256_unpacklo_epi128(sad256_0, sad256_1);
480 sads256[1] = _mm256_unpackhi_epi128(sad256_0, sad256_1);
481 }
482
add16x16x8to32bit(const __m512i sads512[8],__m256i sads256[2])483 SIMD_INLINE void add16x16x8to32bit(const __m512i sads512[8], __m256i sads256[2]) {
484 // Don't call two add16x16x4to32bit(), which is slower.
485 const __m512i zero = _mm512_setzero_si512();
486
487 const __m512i sad512_0_lo = _mm512_unpacklo_epi16(sads512[0], zero);
488 const __m512i sad512_0_hi = _mm512_unpackhi_epi16(sads512[0], zero);
489 const __m512i sad512_1_lo = _mm512_unpacklo_epi16(sads512[1], zero);
490 const __m512i sad512_1_hi = _mm512_unpackhi_epi16(sads512[1], zero);
491 const __m512i sad512_2_lo = _mm512_unpacklo_epi16(sads512[2], zero);
492 const __m512i sad512_2_hi = _mm512_unpackhi_epi16(sads512[2], zero);
493 const __m512i sad512_3_lo = _mm512_unpacklo_epi16(sads512[3], zero);
494 const __m512i sad512_3_hi = _mm512_unpackhi_epi16(sads512[3], zero);
495 const __m512i sad512_4_lo = _mm512_unpacklo_epi16(sads512[4], zero);
496 const __m512i sad512_4_hi = _mm512_unpackhi_epi16(sads512[4], zero);
497 const __m512i sad512_5_lo = _mm512_unpacklo_epi16(sads512[5], zero);
498 const __m512i sad512_5_hi = _mm512_unpackhi_epi16(sads512[5], zero);
499 const __m512i sad512_6_lo = _mm512_unpacklo_epi16(sads512[6], zero);
500 const __m512i sad512_6_hi = _mm512_unpackhi_epi16(sads512[6], zero);
501 const __m512i sad512_7_lo = _mm512_unpacklo_epi16(sads512[7], zero);
502 const __m512i sad512_7_hi = _mm512_unpackhi_epi16(sads512[7], zero);
503
504 const __m512i sad512_01_lo = _mm512_add_epi32(sad512_0_lo, sad512_1_lo);
505 const __m512i sad512_01_hi = _mm512_add_epi32(sad512_0_hi, sad512_1_hi);
506 const __m512i sad512_23_lo = _mm512_add_epi32(sad512_2_lo, sad512_3_lo);
507 const __m512i sad512_23_hi = _mm512_add_epi32(sad512_2_hi, sad512_3_hi);
508 const __m512i sad512_45_lo = _mm512_add_epi32(sad512_4_lo, sad512_5_lo);
509 const __m512i sad512_45_hi = _mm512_add_epi32(sad512_4_hi, sad512_5_hi);
510 const __m512i sad512_67_lo = _mm512_add_epi32(sad512_6_lo, sad512_7_lo);
511 const __m512i sad512_67_hi = _mm512_add_epi32(sad512_6_hi, sad512_7_hi);
512 const __m512i sad512_0123_lo = _mm512_add_epi32(sad512_01_lo, sad512_23_lo);
513 const __m512i sad512_0123_hi = _mm512_add_epi32(sad512_01_hi, sad512_23_hi);
514 const __m512i sad512_4567_lo = _mm512_add_epi32(sad512_45_lo, sad512_67_lo);
515 const __m512i sad512_4567_hi = _mm512_add_epi32(sad512_45_hi, sad512_67_hi);
516
517 // 0 1 2 3 8 9 A b 0 1 2 3 8 9 A b
518 // 4 5 6 7 C D E F 4 5 6 7 C D E F
519 const __m512i sad512_lo = _mm512_add_epi32(sad512_0123_lo, sad512_4567_lo);
520 const __m512i sad512_hi = _mm512_add_epi32(sad512_0123_hi, sad512_4567_hi);
521
522 const __m256i sad256_ll = _mm512_castsi512_si256(sad512_lo);
523 const __m256i sad256_lh = _mm512_extracti64x4_epi64(sad512_lo, 1);
524 const __m256i sad256_hl = _mm512_castsi512_si256(sad512_hi);
525 const __m256i sad256_hh = _mm512_extracti64x4_epi64(sad512_hi, 1);
526
527 // 0 1 2 3 8 9 A b
528 // 4 5 6 7 C D E F
529 const __m256i sad256_0 = _mm256_add_epi32(sad256_ll, sad256_lh);
530 const __m256i sad256_1 = _mm256_add_epi32(sad256_hl, sad256_hh);
531
532 // 0 1 2 3 4 5 6 7
533 // 8 9 A b C D E F
534 sads256[0] = _mm256_unpacklo_epi128(sad256_0, sad256_1);
535 sads256[1] = _mm256_unpackhi_epi128(sad256_0, sad256_1);
536 }
537
// Returns the minimum of the lane-wise sum of two vectors of eight 16-bit
// SADs, using saturating adds. To delay hitting the 0xFFFF saturation clamp,
// each vector's own minimum is first subtracted from all of its lanes; the two
// subtracted minima are added back to the scalar result afterwards.
// *minpos receives the _mm_minpos_epu16() result of the adjusted sum
// (minimum value in lane 0, its lane index in lane 1).
// NOTE(review): the result is exact only while the adjusted sums do not
// themselves saturate in the winning lane -- confirm callers keep accumulated
// SADs small enough for this to hold.
static INLINE uint32_t saturate_add(const __m128i sum0, const __m128i sum1, __m128i *const minpos) {
    uint32_t min_val;
    __m128i min0, min1;

    // Horizontal minimum of each input (value lands in lane 0).
    const __m128i minpos0 = _mm_minpos_epu16(sum0);
    const __m128i minpos1 = _mm_minpos_epu16(sum1);
    // Broadcast each 16-bit minimum to all eight lanes.
    min0 = _mm_unpacklo_epi16(minpos0, minpos0);
    min0 = _mm_unpacklo_epi32(min0, min0);
    min0 = _mm_unpacklo_epi64(min0, min0);
    min1 = _mm_unpacklo_epi16(minpos1, minpos1);
    min1 = _mm_unpacklo_epi32(min1, min1);
    min1 = _mm_unpacklo_epi64(min1, min1);
    // Re-base both vectors at zero, then add with unsigned saturation.
    const __m128i t0 = _mm_sub_epi16(sum0, min0);
    const __m128i t1 = _mm_sub_epi16(sum1, min1);
    const __m128i sum = _mm_adds_epu16(t0, t1);
    *minpos = _mm_minpos_epu16(sum);
    min_val = _mm_extract_epi16(*minpos, 0);
    // Add the subtracted minima back to recover the true minimum sum.
    min_val += _mm_extract_epi16(minpos0, 0);
    min_val += _mm_extract_epi16(minpos1, 0);

    return min_val;
}
560
// Accumulates into the 16-bit lanes of *sum the SADs of a 4-byte source block
// (two consecutive rows) against eight consecutive horizontal reference
// positions. Source row 0 occupies the low 128-bit lane and row 1 the high
// lane; _mm256_mpsadbw_epu8 with imm 0 compares the first 4 source bytes of
// each lane against reference offsets 0..7 within the same lane. The
// saturating add keeps running totals clamped at 0xFFFF.
// NOTE(review): `*(uint32_t *)src` assumes unaligned 32-bit loads are
// acceptable and ignores strict aliasing, as elsewhere in this file --
// relies on the project's build flags.
static INLINE void sad_loop_kernel_4_avx2(const uint8_t *const src, const uint32_t src_stride,
                                          const uint8_t *const ref, const uint32_t ref_stride,
                                          __m256i *const sum) {
    // src row 0 -> low lane, src row 1 -> high lane (4 bytes each).
    const __m256i ss0 = _mm256_insertf128_si256(
        _mm256_castsi128_si256(_mm_cvtsi32_si128(*(uint32_t *)src)),
        _mm_cvtsi32_si128(*(uint32_t *)(src + src_stride)),
        1);
    // ref row 0 -> low lane, ref row 1 -> high lane (16 bytes each).
    const __m256i rr0 = _mm256_insertf128_si256(
        _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)ref)),
        _mm_loadu_si128((__m128i *)(ref + ref_stride)),
        1);
    *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr0, ss0, 0));
}
574
575 /*******************************************************************************
576 * Function helper adds to "sum" vector SAD's for block width of 4, uses AVX2 instructions
577 * Requirement: width = 4
578 * Requirement: height = 1
579 * Compute one line
580 *******************************************************************************/
sad_loop_kernel_4_oneline_avx2(const uint8_t * const src,const uint8_t * const ref,__m256i * const sum)581 static INLINE void sad_loop_kernel_4_oneline_avx2(const uint8_t *const src,
582 const uint8_t *const ref, __m256i *const sum) {
583 const __m256i ss0 = _mm256_insertf128_si256(
584 _mm256_castsi128_si256(_mm_cvtsi32_si128(*(uint32_t *)src)), _mm_setzero_si128(), 1);
585 const __m256i rr0 = _mm256_insertf128_si256(
586 _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)ref)), _mm_setzero_si128(), 1);
587 *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr0, ss0, 0));
588 }
589
sad_loop_kernel_4_sse4_1(const uint8_t * const src,const uint32_t src_stride,const uint8_t * const ref,const uint32_t ref_stride,__m128i * const sum)590 static INLINE void sad_loop_kernel_4_sse4_1(const uint8_t *const src, const uint32_t src_stride,
591 const uint8_t *const ref, const uint32_t ref_stride,
592 __m128i *const sum) {
593 const __m128i s0 = _mm_cvtsi32_si128(*(uint32_t *)src);
594 const __m128i s1 = _mm_cvtsi32_si128(*(uint32_t *)(src + src_stride));
595 const __m128i r0 = _mm_loadu_si128((__m128i *)ref);
596 const __m128i r1 = _mm_loadu_si128((__m128i *)(ref + ref_stride));
597 *sum = _mm_adds_epu16(*sum, _mm_mpsadbw_epu8(r0, s0, 0));
598 *sum = _mm_adds_epu16(*sum, _mm_mpsadbw_epu8(r1, s1, 0));
599 }
600
601 /*******************************************************************************
602 * Function helper adds to "sum" vector SAD's for block width of 4, uses sse4_1 instructions
603 * Requirement: width = 4
604 * Requirement: height = 1
605 * Compute one line
606 *******************************************************************************/
sad_loop_kernel_4_oneline_sse4_1(const uint8_t * const src,const uint8_t * const ref,__m128i * const sum)607 static INLINE void sad_loop_kernel_4_oneline_sse4_1(const uint8_t *const src,
608 const uint8_t *const ref, __m128i *const sum) {
609 const __m128i s0 = _mm_cvtsi32_si128(*(uint32_t *)src);
610 const __m128i r0 = _mm_loadu_si128((__m128i *)ref);
611 *sum = _mm_adds_epu16(*sum, _mm_mpsadbw_epu8(r0, s0, 0));
612 }
613
sad_loop_kernel_8_avx2(const uint8_t * const src,const uint32_t src_stride,const uint8_t * const ref,const uint32_t ref_stride,__m256i * const sum)614 static INLINE void sad_loop_kernel_8_avx2(const uint8_t *const src, const uint32_t src_stride,
615 const uint8_t *const ref, const uint32_t ref_stride,
616 __m256i *const sum) {
617 const __m256i ss0 = _mm256_insertf128_si256(
618 _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)src)),
619 _mm_loadl_epi64((__m128i *)(src + 1 * src_stride)),
620 1);
621 const __m256i rr0 = _mm256_insertf128_si256(
622 _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)ref)),
623 _mm_loadu_si128((__m128i *)(ref + 1 * ref_stride)),
624 1);
625 *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr0, ss0, (0 << 3) | 0)); // 000 000
626 *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr0, ss0, (5 << 3) | 5)); // 101 101
627 }
628
629 /*******************************************************************************
630 * Function helper adds to "sum" vector SAD's for block width of 8, uses AVX2 instructions
631 * Requirement: width = 8
632 * Requirement: height = 1
633 * Compute one line
634 *******************************************************************************/
sad_loop_kernel_8_oneline_avx2(const uint8_t * const src,const uint8_t * const ref,__m256i * const sum)635 static INLINE void sad_loop_kernel_8_oneline_avx2(const uint8_t *const src,
636 const uint8_t *const ref, __m256i *const sum) {
637 const __m256i ss0 = _mm256_insertf128_si256(
638 _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)src)), _mm_setzero_si128(), 1);
639 const __m256i rr0 = _mm256_insertf128_si256(
640 _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)ref)), _mm_setzero_si128(), 1);
641 *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr0, ss0, (0 << 3) | 0)); // 000 000
642 *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr0, ss0, (5 << 3) | 5)); // 101 101
643 }
644
645 /*******************************************************************************
646 * Function helper adds to "sums" vectors SAD's for block width of 12, uses AVX512 instructions
647 * Requirement: width = 12
648 * Requirement: height = 2
649 *******************************************************************************/
sad_loop_kernel_12_avx512(const uint8_t * const src,const uint32_t src_stride,const uint8_t * const ref,const uint32_t ref_stride,__m512i * const sum)650 SIMD_INLINE void sad_loop_kernel_12_avx512(const uint8_t *const src, const uint32_t src_stride,
651 const uint8_t *const ref, const uint32_t ref_stride,
652 __m512i *const sum) {
653 const __m128i s0 = _mm_loadu_si128((__m128i *)src);
654 const __m128i s1 = _mm_loadu_si128((__m128i *)(src + src_stride));
655 const __m256i s01 = _mm256_insertf128_si256(_mm256_castsi128_si256(s0), s1, 1);
656 const __m512i s = _mm512_castsi256_si512(s01);
657 const __m512i ss0 = _mm512_permutexvar_epi32(
658 _mm512_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4), s);
659 const __m512i ss1 = _mm512_permutexvar_epi32(
660 _mm512_setr_epi32(1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5), s);
661 const __m512i ss2 = _mm512_permutexvar_epi32(
662 _mm512_setr_epi32(2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6), s);
663
664 const __m256i r0 = _mm256_loadu_si256((__m256i *)ref);
665 const __m256i r1 = _mm256_loadu_si256((__m256i *)(ref + ref_stride));
666 const __m512i r = _mm512_inserti64x4(_mm512_castsi256_si512(r0), r1, 1);
667 const __m512i rr0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 1, 2, 4, 5, 5, 6), r);
668 const __m512i rr1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(1, 2, 2, 3, 5, 6, 6, 7), r);
669
670 *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss0, rr0, 0x94));
671 *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss1, rr0, 0xE9));
672 *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss2, rr1, 0x94));
673 }
674
675 /*******************************************************************************
676 * Function helper adds to "sums" vectors SAD's for block width of 12, uses AVX512 instructions
677 * Requirement: width = 12
678 * Requirement: height = 1
679 * Compute one line
680 *******************************************************************************/
sad_loop_kernel_12_oneline_avx512(const uint8_t * const src,const uint8_t * const ref,__m512i * const sum)681 SIMD_INLINE void sad_loop_kernel_12_oneline_avx512(const uint8_t *const src,
682 const uint8_t *const ref, __m512i *const sum) {
683 const __m128i s0 = _mm_loadu_si128((__m128i *)src);
684 const __m256i s01 = _mm256_insertf128_si256(_mm256_castsi128_si256(s0), _mm_setzero_si128(), 1);
685 const __m512i s = _mm512_castsi256_si512(s01);
686 const __m512i ss0 = _mm512_permutexvar_epi32(
687 _mm512_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4), s);
688 const __m512i ss1 = _mm512_permutexvar_epi32(
689 _mm512_setr_epi32(1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5), s);
690 const __m512i ss2 = _mm512_permutexvar_epi32(
691 _mm512_setr_epi32(2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6), s);
692
693 const __m256i r0 = _mm256_loadu_si256((__m256i *)ref);
694 const __m512i r = _mm512_inserti64x4(_mm512_castsi256_si512(r0), _mm256_setzero_si256(), 1);
695 const __m512i rr0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 1, 2, 4, 5, 5, 6), r);
696 const __m512i rr1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(1, 2, 2, 3, 5, 6, 6, 7), r);
697
698 *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss0, rr0, 0x94));
699 *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss1, rr0, 0xE9));
700 *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss2, rr1, 0x94));
701 }
702
sad_loop_kernel_16_avx512(const uint8_t * const src,const uint32_t src_stride,const uint8_t * const ref,const uint32_t ref_stride,__m512i * const sum)703 SIMD_INLINE void sad_loop_kernel_16_avx512(const uint8_t *const src, const uint32_t src_stride,
704 const uint8_t *const ref, const uint32_t ref_stride,
705 __m512i *const sum) {
706 const __m128i s0 = _mm_loadu_si128((__m128i *)src);
707 const __m128i s1 = _mm_loadu_si128((__m128i *)(src + src_stride));
708 const __m256i s01 = _mm256_insertf128_si256(_mm256_castsi128_si256(s0), s1, 1);
709 const __m512i s = _mm512_castsi256_si512(s01);
710 const __m512i ss0 = _mm512_permutexvar_epi32(
711 _mm512_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4), s);
712 const __m512i ss1 = _mm512_permutexvar_epi32(
713 _mm512_setr_epi32(1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5), s);
714 const __m512i ss2 = _mm512_permutexvar_epi32(
715 _mm512_setr_epi32(2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6), s);
716 const __m512i ss3 = _mm512_permutexvar_epi32(
717 _mm512_setr_epi32(3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7), s);
718
719 const __m256i r0 = _mm256_loadu_si256((__m256i *)ref);
720 const __m256i r1 = _mm256_loadu_si256((__m256i *)(ref + ref_stride));
721 const __m512i r = _mm512_inserti64x4(_mm512_castsi256_si512(r0), r1, 1);
722 const __m512i rr0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 1, 2, 4, 5, 5, 6), r);
723 const __m512i rr1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(1, 2, 2, 3, 5, 6, 6, 7), r);
724
725 *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss0, rr0, 0x94));
726 *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss1, rr0, 0xE9));
727 *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss2, rr1, 0x94));
728 *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss3, rr1, 0xE9));
729 }
730
731 /*******************************************************************************
732 * Function helper adds to "sums" vectors SAD's for block width of 16, uses AVX512 instructions
733 * Requirement: width = 16
734 * Requirement: height = 1
735 * Compute one line
736 *******************************************************************************/
sad_loop_kernel_16_oneline_avx512(const uint8_t * const src,const uint8_t * const ref,__m512i * const sum)737 SIMD_INLINE void sad_loop_kernel_16_oneline_avx512(const uint8_t *const src,
738 const uint8_t *const ref, __m512i *const sum) {
739 const __m128i s0 = _mm_loadu_si128((__m128i *)src);
740 const __m256i s01 = _mm256_insertf128_si256(_mm256_castsi128_si256(s0), _mm_setzero_si128(), 1);
741 const __m512i s = _mm512_castsi256_si512(s01);
742 const __m512i ss0 = _mm512_permutexvar_epi32(
743 _mm512_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4), s);
744 const __m512i ss1 = _mm512_permutexvar_epi32(
745 _mm512_setr_epi32(1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5), s);
746 const __m512i ss2 = _mm512_permutexvar_epi32(
747 _mm512_setr_epi32(2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6), s);
748 const __m512i ss3 = _mm512_permutexvar_epi32(
749 _mm512_setr_epi32(3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7), s);
750
751 const __m256i r0 = _mm256_loadu_si256((__m256i *)ref);
752 const __m512i r = _mm512_inserti64x4(_mm512_castsi256_si512(r0), _mm256_setzero_si256(), 1);
753 const __m512i rr0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 1, 2, 4, 5, 5, 6), r);
754 const __m512i rr1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(1, 2, 2, 3, 5, 6, 6, 7), r);
755
756 *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss0, rr0, 0x94));
757 *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss1, rr0, 0xE9));
758 *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss2, rr1, 0x94));
759 *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss3, rr1, 0xE9));
760 }
761
762 /*******************************************************************************
763 * Function helper adds to two elements "sums" vectors SAD's for block width of 12, uses AVX512 instructions
764 * Requirement: width = 12
765 * Requirement: height = 2
766 *******************************************************************************/
sad_loop_kernel_12_2sum_avx512(const uint8_t * const src,const uint32_t src_stride,const uint8_t * const ref,const uint32_t ref_stride,__m512i sum[2])767 SIMD_INLINE void sad_loop_kernel_12_2sum_avx512(const uint8_t *const src, const uint32_t src_stride,
768 const uint8_t *const ref, const uint32_t ref_stride,
769 __m512i sum[2]) {
770 const __m128i s0 = _mm_loadu_si128((__m128i *)src);
771 const __m128i s1 = _mm_loadu_si128((__m128i *)(src + src_stride));
772 const __m256i s01 = _mm256_insertf128_si256(_mm256_castsi128_si256(s0), s1, 1);
773 const __m512i s = _mm512_castsi256_si512(s01);
774 const __m512i ss0 = _mm512_permutexvar_epi32(
775 _mm512_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4), s);
776 const __m512i ss1 = _mm512_permutexvar_epi32(
777 _mm512_setr_epi32(1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5), s);
778 const __m512i ss2 = _mm512_permutexvar_epi32(
779 _mm512_setr_epi32(2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6), s);
780
781 const __m256i r0 = _mm256_loadu_si256((__m256i *)ref);
782 const __m256i r1 = _mm256_loadu_si256((__m256i *)(ref + ref_stride));
783 const __m512i r = _mm512_inserti64x4(_mm512_castsi256_si512(r0), r1, 1);
784 const __m512i rr0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 1, 2, 4, 5, 5, 6), r);
785 const __m512i rr1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(1, 2, 2, 3, 5, 6, 6, 7), r);
786
787 sum[0] = _mm512_adds_epu16(sum[0], _mm512_dbsad_epu8(ss0, rr0, 0x94));
788 sum[1] = _mm512_adds_epu16(sum[1], _mm512_dbsad_epu8(ss1, rr0, 0xE9));
789 sum[0] = _mm512_adds_epu16(sum[0], _mm512_dbsad_epu8(ss2, rr1, 0x94));
790 }
791
792 /*******************************************************************************
793 * Function helper adds to two elements "sums" vectors SAD's for block width of 12, uses AVX512 instructions
794 * Requirement: width = 12
795 * Requirement: height = 1
796 * Compute one line
797 *******************************************************************************/
sad_loop_kernel_12_2sum_oneline_avx512(const uint8_t * const src,const uint8_t * const ref,__m512i sum[2])798 SIMD_INLINE void sad_loop_kernel_12_2sum_oneline_avx512(const uint8_t *const src,
799 const uint8_t *const ref, __m512i sum[2]) {
800 const __m128i s0 = _mm_loadu_si128((__m128i *)src);
801 const __m256i s01 = _mm256_insertf128_si256(_mm256_castsi128_si256(s0), _mm_setzero_si128(), 1);
802 const __m512i s = _mm512_castsi256_si512(s01);
803 const __m512i ss0 = _mm512_permutexvar_epi32(
804 _mm512_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4), s);
805 const __m512i ss1 = _mm512_permutexvar_epi32(
806 _mm512_setr_epi32(1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5), s);
807 const __m512i ss2 = _mm512_permutexvar_epi32(
808 _mm512_setr_epi32(2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6), s);
809
810 const __m256i r0 = _mm256_loadu_si256((__m256i *)ref);
811 const __m512i r = _mm512_inserti64x4(_mm512_castsi256_si512(r0), _mm256_setzero_si256(), 1);
812 const __m512i rr0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 1, 2, 4, 5, 5, 6), r);
813 const __m512i rr1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(1, 2, 2, 3, 5, 6, 6, 7), r);
814
815 sum[0] = _mm512_adds_epu16(sum[0], _mm512_dbsad_epu8(ss0, rr0, 0x94));
816 sum[1] = _mm512_adds_epu16(sum[1], _mm512_dbsad_epu8(ss1, rr0, 0xE9));
817 sum[0] = _mm512_adds_epu16(sum[0], _mm512_dbsad_epu8(ss2, rr1, 0x94));
818 }
819
sad_loop_kernel_16_2sum_avx512(const uint8_t * const src,const uint32_t src_stride,const uint8_t * const ref,const uint32_t ref_stride,__m512i sum[2])820 SIMD_INLINE void sad_loop_kernel_16_2sum_avx512(const uint8_t *const src, const uint32_t src_stride,
821 const uint8_t *const ref, const uint32_t ref_stride,
822 __m512i sum[2]) {
823 const __m128i s0 = _mm_loadu_si128((__m128i *)src);
824 const __m128i s1 = _mm_loadu_si128((__m128i *)(src + src_stride));
825 const __m256i s01 = _mm256_insertf128_si256(_mm256_castsi128_si256(s0), s1, 1);
826 const __m512i s = _mm512_castsi256_si512(s01);
827 const __m512i ss0 = _mm512_permutexvar_epi32(
828 _mm512_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4), s);
829 const __m512i ss1 = _mm512_permutexvar_epi32(
830 _mm512_setr_epi32(1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5), s);
831 const __m512i ss2 = _mm512_permutexvar_epi32(
832 _mm512_setr_epi32(2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6), s);
833 const __m512i ss3 = _mm512_permutexvar_epi32(
834 _mm512_setr_epi32(3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7), s);
835
836 const __m256i r0 = _mm256_loadu_si256((__m256i *)ref);
837 const __m256i r1 = _mm256_loadu_si256((__m256i *)(ref + ref_stride));
838 const __m512i r = _mm512_inserti64x4(_mm512_castsi256_si512(r0), r1, 1);
839 const __m512i rr0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 1, 2, 4, 5, 5, 6), r);
840 const __m512i rr1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(1, 2, 2, 3, 5, 6, 6, 7), r);
841
842 sum[0] = _mm512_adds_epu16(sum[0], _mm512_dbsad_epu8(ss0, rr0, 0x94));
843 sum[1] = _mm512_adds_epu16(sum[1], _mm512_dbsad_epu8(ss1, rr0, 0xE9));
844 sum[0] = _mm512_adds_epu16(sum[0], _mm512_dbsad_epu8(ss2, rr1, 0x94));
845 sum[1] = _mm512_adds_epu16(sum[1], _mm512_dbsad_epu8(ss3, rr1, 0xE9));
846 }
847
848 /*******************************************************************************
849 * Function helper adds to two elements "sums" vectors SAD's for block width of 16, uses AVX512 instructions
850 * Requirement: width = 16
851 * Requirement: height = 1
852 * Compute one line
853 *******************************************************************************/
sad_loop_kernel_16_2sum_oneline_avx512(const uint8_t * const src,const uint8_t * const ref,__m512i sum[2])854 SIMD_INLINE void sad_loop_kernel_16_2sum_oneline_avx512(const uint8_t *const src,
855 const uint8_t *const ref, __m512i sum[2]) {
856 const __m128i s0 = _mm_loadu_si128((__m128i *)src);
857 const __m256i s01 = _mm256_insertf128_si256(_mm256_castsi128_si256(s0), _mm_setzero_si128(), 1);
858 const __m512i s = _mm512_castsi256_si512(s01);
859 const __m512i ss0 = _mm512_permutexvar_epi32(
860 _mm512_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4), s);
861 const __m512i ss1 = _mm512_permutexvar_epi32(
862 _mm512_setr_epi32(1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5), s);
863 const __m512i ss2 = _mm512_permutexvar_epi32(
864 _mm512_setr_epi32(2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6), s);
865 const __m512i ss3 = _mm512_permutexvar_epi32(
866 _mm512_setr_epi32(3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7), s);
867
868 const __m256i r0 = _mm256_loadu_si256((__m256i *)ref);
869 const __m512i r = _mm512_inserti64x4(_mm512_castsi256_si512(r0), _mm256_setzero_si256(), 1);
870 const __m512i rr0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 1, 2, 4, 5, 5, 6), r);
871 const __m512i rr1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(1, 2, 2, 3, 5, 6, 6, 7), r);
872
873 sum[0] = _mm512_adds_epu16(sum[0], _mm512_dbsad_epu8(ss0, rr0, 0x94));
874 sum[1] = _mm512_adds_epu16(sum[1], _mm512_dbsad_epu8(ss1, rr0, 0xE9));
875 sum[0] = _mm512_adds_epu16(sum[0], _mm512_dbsad_epu8(ss2, rr1, 0x94));
876 sum[1] = _mm512_adds_epu16(sum[1], _mm512_dbsad_epu8(ss3, rr1, 0xE9));
877 }
878
879 /*******************************************************************************
880 * Function helper adds to "sums" vectors SAD's for block width of 12, uses AVX2 instructions
881 * Requirement: width = 12
882 * Requirement: height = 2
883 *******************************************************************************/
sad_loop_kernel_12_avx2(const uint8_t * const src,const uint32_t src_stride,const uint8_t * const ref,const uint32_t ref_stride,__m256i * const sum)884 static INLINE void sad_loop_kernel_12_avx2(const uint8_t *const src, const uint32_t src_stride,
885 const uint8_t *const ref, const uint32_t ref_stride,
886 __m256i *const sum) {
887 const __m256i ss0 = _mm256_insertf128_si256(
888 _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src)),
889 _mm_loadu_si128((__m128i *)(src + src_stride)),
890 1);
891 const __m256i rr0 = _mm256_insertf128_si256(
892 _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)ref)),
893 _mm_loadu_si128((__m128i *)(ref + ref_stride)),
894 1);
895 const __m256i rr1 = _mm256_insertf128_si256(
896 _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(ref + 8))),
897 _mm_loadu_si128((__m128i *)(ref + ref_stride + 8)),
898 1);
899 *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr0, ss0, (0 << 3) | 0)); // 000 000
900 *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr0, ss0, (5 << 3) | 5)); // 101 101
901 *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr1, ss0, (2 << 3) | 2)); // 010 010
902 }
903
904 /*******************************************************************************
905 * Function helper adds to "sums" vectors SAD's for block width of 12, uses AVX2 instructions
906 * Requirement: width = 12
907 * Requirement: height = 1
908 * Compute one line
909 *******************************************************************************/
sad_loop_kernel_12_oneline_avx2(const uint8_t * const src,const uint8_t * const ref,__m256i * const sum)910 static INLINE void sad_loop_kernel_12_oneline_avx2(const uint8_t *const src,
911 const uint8_t *const ref, __m256i *const sum) {
912 const __m256i ss0 = _mm256_insertf128_si256(
913 _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src)), _mm_setzero_si128(), 1);
914 const __m256i rr0 = _mm256_insertf128_si256(
915 _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)ref)), _mm_setzero_si128(), 1);
916 const __m256i rr1 = _mm256_insertf128_si256(
917 _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(ref + 8))), _mm_setzero_si128(), 1);
918 *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr0, ss0, (0 << 3) | 0)); // 000 000
919 *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr0, ss0, (5 << 3) | 5)); // 101 101
920 *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr1, ss0, (2 << 3) | 2)); // 010 010
921 }
922
sad_loop_kernel_16_avx2(const uint8_t * const src,const uint32_t src_stride,const uint8_t * const ref,const uint32_t ref_stride,__m256i * const sum)923 static INLINE void sad_loop_kernel_16_avx2(const uint8_t *const src, const uint32_t src_stride,
924 const uint8_t *const ref, const uint32_t ref_stride,
925 __m256i *const sum) {
926 const __m256i ss0 = _mm256_insertf128_si256(
927 _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src)),
928 _mm_loadu_si128((__m128i *)(src + src_stride)),
929 1);
930 const __m256i rr0 = _mm256_insertf128_si256(
931 _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)ref)),
932 _mm_loadu_si128((__m128i *)(ref + ref_stride)),
933 1);
934 const __m256i rr1 = _mm256_insertf128_si256(
935 _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(ref + 8))),
936 _mm_loadu_si128((__m128i *)(ref + ref_stride + 8)),
937 1);
938 *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr0, ss0, (0 << 3) | 0)); // 000 000
939 *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr0, ss0, (5 << 3) | 5)); // 101 101
940 *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr1, ss0, (2 << 3) | 2)); // 010 010
941 *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr1, ss0, (7 << 3) | 7)); // 111 111
942 }
943
944 /*******************************************************************************
945 * Function helper adds to "sums" vectors SAD's for block width of 16, uses AVX512 instructions
946 * Requirement: width = 16
947 * Requirement: height = 1
948 * Compute one line
949 *******************************************************************************/
sad_loop_kernel_16_oneline_avx2(const uint8_t * const src,const uint8_t * const ref,__m256i * const sum)950 static INLINE void sad_loop_kernel_16_oneline_avx2(const uint8_t *const src,
951 const uint8_t *const ref, __m256i *const sum) {
952 const __m256i ss0 = _mm256_insertf128_si256(
953 _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src)), _mm_setzero_si128(), 1);
954 const __m256i rr0 = _mm256_insertf128_si256(
955 _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)ref)), _mm_setzero_si128(), 1);
956 const __m256i rr1 = _mm256_insertf128_si256(
957 _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(ref + 8))), _mm_setzero_si128(), 1);
958 *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr0, ss0, (0 << 3) | 0)); // 000 000
959 *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr0, ss0, (5 << 3) | 5)); // 101 101
960 *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr1, ss0, (2 << 3) | 2)); // 010 010
961 *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr1, ss0, (7 << 3) | 7)); // 111 111
962 }
963
964 /*******************************************************************************
965 * Function helper adds to two elements "sums" vectors SAD's for block width of 12, uses AVX2 instructions
966 * Requirement: width = 12
967 * Requirement: height = 2
968 *******************************************************************************/
sad_loop_kernel_12_2sum_avx2(const uint8_t * const src,const uint32_t src_stride,const uint8_t * const ref,const uint32_t ref_stride,__m256i sums[2])969 static INLINE void sad_loop_kernel_12_2sum_avx2(const uint8_t *const src, const uint32_t src_stride,
970 const uint8_t *const ref, const uint32_t ref_stride,
971 __m256i sums[2]) {
972 const __m256i ss0 = _mm256_insertf128_si256(
973 _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src)),
974 _mm_loadu_si128((__m128i *)(src + src_stride)),
975 1);
976 const __m256i rr0 = _mm256_insertf128_si256(
977 _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)ref)),
978 _mm_loadu_si128((__m128i *)(ref + ref_stride)),
979 1);
980 const __m256i rr1 = _mm256_insertf128_si256(
981 _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(ref + 8))),
982 _mm_loadu_si128((__m128i *)(ref + ref_stride + 8)),
983 1);
984 sums[0] = _mm256_adds_epu16(sums[0], _mm256_mpsadbw_epu8(rr0, ss0, (0 << 3) | 0)); // 000 000
985 sums[1] = _mm256_adds_epu16(sums[1], _mm256_mpsadbw_epu8(rr0, ss0, (5 << 3) | 5)); // 101 101
986 sums[0] = _mm256_adds_epu16(sums[0], _mm256_mpsadbw_epu8(rr1, ss0, (2 << 3) | 2)); // 010 010
987 }
988
989 /*******************************************************************************
990 * Function helper adds to two elements "sums" vectors SAD's for block width of 12, uses AVX2 instructions
991 * Requirement: width = 12
992 * Requirement: height = 1
993 * Compute one line
994 *******************************************************************************/
sad_loop_kernel_12_2sum_oneline_avx2(const uint8_t * const src,const uint8_t * const ref,__m256i sums[2])995 static INLINE void sad_loop_kernel_12_2sum_oneline_avx2(const uint8_t *const src,
996 const uint8_t *const ref, __m256i sums[2]) {
997 const __m256i ss0 = _mm256_insertf128_si256(
998 _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src)), _mm_setzero_si128(), 1);
999 const __m256i rr0 = _mm256_insertf128_si256(
1000 _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)ref)), _mm_setzero_si128(), 1);
1001 const __m256i rr1 = _mm256_insertf128_si256(
1002 _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(ref + 8))), _mm_setzero_si128(), 1);
1003 sums[0] = _mm256_adds_epu16(sums[0], _mm256_mpsadbw_epu8(rr0, ss0, (0 << 3) | 0)); // 000 000
1004 sums[1] = _mm256_adds_epu16(sums[1], _mm256_mpsadbw_epu8(rr0, ss0, (5 << 3) | 5)); // 101 101
1005 sums[0] = _mm256_adds_epu16(sums[0], _mm256_mpsadbw_epu8(rr1, ss0, (2 << 3) | 2)); // 010 010
1006 }
1007
sad_loop_kernel_16_2sum_avx2(const uint8_t * const src,const uint32_t src_stride,const uint8_t * const ref,const uint32_t ref_stride,__m256i sums[2])1008 static INLINE void sad_loop_kernel_16_2sum_avx2(const uint8_t *const src, const uint32_t src_stride,
1009 const uint8_t *const ref, const uint32_t ref_stride,
1010 __m256i sums[2]) {
1011 const __m256i ss0 = _mm256_insertf128_si256(
1012 _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src)),
1013 _mm_loadu_si128((__m128i *)(src + src_stride)),
1014 1);
1015 const __m256i rr0 = _mm256_insertf128_si256(
1016 _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)ref)),
1017 _mm_loadu_si128((__m128i *)(ref + ref_stride)),
1018 1);
1019 const __m256i rr1 = _mm256_insertf128_si256(
1020 _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(ref + 8))),
1021 _mm_loadu_si128((__m128i *)(ref + ref_stride + 8)),
1022 1);
1023 sums[0] = _mm256_adds_epu16(sums[0], _mm256_mpsadbw_epu8(rr0, ss0, (0 << 3) | 0)); // 000 000
1024 sums[1] = _mm256_adds_epu16(sums[1], _mm256_mpsadbw_epu8(rr0, ss0, (5 << 3) | 5)); // 101 101
1025 sums[0] = _mm256_adds_epu16(sums[0], _mm256_mpsadbw_epu8(rr1, ss0, (2 << 3) | 2)); // 010 010
1026 sums[1] = _mm256_adds_epu16(sums[1], _mm256_mpsadbw_epu8(rr1, ss0, (7 << 3) | 7)); // 111 111
1027 }
1028
1029 /*******************************************************************************
1030 * Function helper adds to two elements "sums" vectors SAD's for block width of 16, uses AVX2 instructions
1031 * Requirement: width = 16
1032 * Requirement: height = 1
1033 * Compute one line
1034 *******************************************************************************/
sad_loop_kernel_16_2sum_oneline_avx2(const uint8_t * const src,const uint8_t * const ref,__m256i sums[2])1035 static INLINE void sad_loop_kernel_16_2sum_oneline_avx2(const uint8_t *const src,
1036 const uint8_t *const ref, __m256i sums[2]) {
1037 const __m256i ss0 = _mm256_insertf128_si256(
1038 _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src)), _mm_setzero_si128(), 1);
1039 const __m256i rr0 = _mm256_insertf128_si256(
1040 _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)ref)), _mm_setzero_si128(), 1);
1041 const __m256i rr1 = _mm256_insertf128_si256(
1042 _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(ref + 8))), _mm_setzero_si128(), 1);
1043 sums[0] = _mm256_adds_epu16(sums[0], _mm256_mpsadbw_epu8(rr0, ss0, (0 << 3) | 0)); // 000 000
1044 sums[1] = _mm256_adds_epu16(sums[1], _mm256_mpsadbw_epu8(rr0, ss0, (5 << 3) | 5)); // 101 101
1045 sums[0] = _mm256_adds_epu16(sums[0], _mm256_mpsadbw_epu8(rr1, ss0, (2 << 3) | 2)); // 010 010
1046 sums[1] = _mm256_adds_epu16(sums[1], _mm256_mpsadbw_epu8(rr1, ss0, (7 << 3) | 7)); // 111 111
1047 }
1048
sad_loop_kernel_32_avx512(const uint8_t * const src,const uint8_t * const ref,__m512i * const sum)1049 SIMD_INLINE void sad_loop_kernel_32_avx512(const uint8_t *const src, const uint8_t *const ref,
1050 __m512i *const sum) {
1051 const __m256i s01 = _mm256_loadu_si256((__m256i *)src);
1052 const __m512i s = _mm512_castsi256_si512(s01);
1053 const __m512i ss0 = _mm512_permutexvar_epi32(
1054 _mm512_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4), s);
1055 const __m512i ss1 = _mm512_permutexvar_epi32(
1056 _mm512_setr_epi32(1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5), s);
1057 const __m512i ss2 = _mm512_permutexvar_epi32(
1058 _mm512_setr_epi32(2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6), s);
1059 const __m512i ss3 = _mm512_permutexvar_epi32(
1060 _mm512_setr_epi32(3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7), s);
1061
1062 const __m512i r = _mm512_loadu_si512((__m512i *)ref);
1063 const __m512i rr0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 1, 2, 2, 3, 3, 4), r);
1064 const __m512i rr1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(1, 2, 2, 3, 3, 4, 4, 5), r);
1065
1066 *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss0, rr0, 0x94));
1067 *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss1, rr0, 0xE9));
1068 *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss2, rr1, 0x94));
1069 *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss3, rr1, 0xE9));
1070 }
1071
/* Accumulate 16-bit SADs of one 32-pixel-wide source row against shifted
 * positions of the reference row, splitting the partial sums across two
 * saturating accumulators so that 0xFFFF saturation is reached later than
 * with a single accumulator (callers detect 0xFFFF and recompute exactly).
 * NOTE(review): the permute constants plus dbsad immediates (0x94/0xE9)
 * appear to cover 16 consecutive horizontal search offsets per call --
 * confirm against the callers' x stepping. */
SIMD_INLINE void sad_loop_kernel_32_2sum_avx512(const uint8_t *const src, const uint8_t *const ref,
                                                __m512i sums[2]) {
    // One 32-byte source row, placed in the low half of a 512-bit register.
    const __m256i s01 = _mm256_loadu_si256((__m256i *)src);
    const __m512i s = _mm512_castsi256_si512(s01);
    // ss0..ss3 replicate source dwords 0..3 across the low 256-bit half and
    // dwords 4..7 across the high half, one dword pair per dbsad call.
    const __m512i ss0 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4), s);
    const __m512i ss1 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5), s);
    const __m512i ss2 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6), s);
    const __m512i ss3 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7), s);

    // 64 reference bytes; rr0/rr1 are overlapping qword arrangements so each
    // dbsad compares its source dwords against the correctly shifted window.
    const __m512i r = _mm512_loadu_si512((__m512i *)ref);
    const __m512i rr0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 1, 2, 2, 3, 3, 4), r);
    const __m512i rr1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(1, 2, 2, 3, 3, 4, 4, 5), r);

    // Saturating 16-bit accumulation, alternated between the two sums.
    sums[0] = _mm512_adds_epu16(sums[0], _mm512_dbsad_epu8(ss0, rr0, 0x94));
    sums[1] = _mm512_adds_epu16(sums[1], _mm512_dbsad_epu8(ss1, rr0, 0xE9));
    sums[0] = _mm512_adds_epu16(sums[0], _mm512_dbsad_epu8(ss2, rr1, 0x94));
    sums[1] = _mm512_adds_epu16(sums[1], _mm512_dbsad_epu8(ss3, rr1, 0xE9));
}
1094
/* Same computation as sad_loop_kernel_32_2sum_avx512, but each of the four
 * dbsad results feeds its own 16-bit saturating accumulator, postponing
 * 0xFFFF saturation for larger blocks/taller loops. */
SIMD_INLINE void sad_loop_kernel_32_4sum_avx512(const uint8_t *const src, const uint8_t *const ref,
                                                __m512i sums[4]) {
    // One 32-byte source row in the low half of a 512-bit register.
    const __m256i s01 = _mm256_loadu_si256((__m256i *)src);
    const __m512i s = _mm512_castsi256_si512(s01);
    // Replicate source dwords 0..3 (low 256-bit half) / 4..7 (high half).
    const __m512i ss0 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4), s);
    const __m512i ss1 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5), s);
    const __m512i ss2 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6), s);
    const __m512i ss3 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7), s);

    // Overlapping qword arrangements of 64 reference bytes (see the 2sum
    // variant for the layout rationale).
    const __m512i r = _mm512_loadu_si512((__m512i *)ref);
    const __m512i rr0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 1, 2, 2, 3, 3, 4), r);
    const __m512i rr1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(1, 2, 2, 3, 3, 4, 4, 5), r);

    // One accumulator per dbsad result (4-way split).
    sums[0] = _mm512_adds_epu16(sums[0], _mm512_dbsad_epu8(ss0, rr0, 0x94));
    sums[1] = _mm512_adds_epu16(sums[1], _mm512_dbsad_epu8(ss1, rr0, 0xE9));
    sums[2] = _mm512_adds_epu16(sums[2], _mm512_dbsad_epu8(ss2, rr1, 0x94));
    sums[3] = _mm512_adds_epu16(sums[3], _mm512_dbsad_epu8(ss3, rr1, 0xE9));
}
1117
/* AVX2 path: accumulate 16-bit SADs of one 32-pixel-wide source row against
 * shifted reference positions with mpsadbw; all four partial results go into
 * a single saturating accumulator. The immediate selects the 4-byte source
 * group per 128-bit lane (bit patterns noted per line). */
static INLINE void sad_loop_kernel_32_avx2(const uint8_t *const src, const uint8_t *const ref,
                                           __m256i *const sum) {
    const __m256i ss0 = _mm256_loadu_si256((__m256i *)src);
    // Two reference loads 8 bytes apart cover every compared offset.
    const __m256i rr0 = _mm256_loadu_si256((__m256i *)ref);
    const __m256i rr1 = _mm256_loadu_si256((__m256i *)(ref + 8));
    *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr0, ss0, (0 << 3) | 0)); // 000 000
    *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr0, ss0, (5 << 3) | 5)); // 101 101
    *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr1, ss0, (2 << 3) | 2)); // 010 010
    *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr1, ss0, (7 << 3) | 7)); // 111 111
}
1128
/* As sad_loop_kernel_32_avx2, but the four mpsadbw results are split over two
 * saturating 16-bit accumulators to postpone 0xFFFF saturation. */
static INLINE void sad_loop_kernel_32_2sum_avx2(const uint8_t *const src, const uint8_t *const ref,
                                                __m256i sums[2]) {
    const __m256i ss0 = _mm256_loadu_si256((__m256i *)src);
    // Two reference loads 8 bytes apart cover every compared offset.
    const __m256i rr0 = _mm256_loadu_si256((__m256i *)ref);
    const __m256i rr1 = _mm256_loadu_si256((__m256i *)(ref + 8));
    sums[0] = _mm256_adds_epu16(sums[0], _mm256_mpsadbw_epu8(rr0, ss0, (0 << 3) | 0)); // 000 000
    sums[1] = _mm256_adds_epu16(sums[1], _mm256_mpsadbw_epu8(rr0, ss0, (5 << 3) | 5)); // 101 101
    sums[0] = _mm256_adds_epu16(sums[0], _mm256_mpsadbw_epu8(rr1, ss0, (2 << 3) | 2)); // 010 010
    sums[1] = _mm256_adds_epu16(sums[1], _mm256_mpsadbw_epu8(rr1, ss0, (7 << 3) | 7)); // 111 111
}
1139
/* As sad_loop_kernel_32_avx2, but each mpsadbw result feeds its own 16-bit
 * saturating accumulator (4-way split) to postpone saturation further. */
SIMD_INLINE void sad_loop_kernel_32_4sum_avx2(const uint8_t *const src, const uint8_t *const ref,
                                              __m256i sums[4]) {
    const __m256i ss0 = _mm256_loadu_si256((__m256i *)src);
    // Two reference loads 8 bytes apart cover every compared offset.
    const __m256i rr0 = _mm256_loadu_si256((__m256i *)ref);
    const __m256i rr1 = _mm256_loadu_si256((__m256i *)(ref + 8));
    sums[0] = _mm256_adds_epu16(sums[0], _mm256_mpsadbw_epu8(rr0, ss0, (0 << 3) | 0)); // 000 000
    sums[1] = _mm256_adds_epu16(sums[1], _mm256_mpsadbw_epu8(rr0, ss0, (5 << 3) | 5)); // 101 101
    sums[2] = _mm256_adds_epu16(sums[2], _mm256_mpsadbw_epu8(rr1, ss0, (2 << 3) | 2)); // 010 010
    sums[3] = _mm256_adds_epu16(sums[3], _mm256_mpsadbw_epu8(rr1, ss0, (7 << 3) | 7)); // 111 111
}
1150
sad_loop_kernel_64_2sum_avx2(const uint8_t * const src,const uint8_t * const ref,__m256i sums[2])1151 static INLINE void sad_loop_kernel_64_2sum_avx2(const uint8_t *const src, const uint8_t *const ref,
1152 __m256i sums[2]) {
1153 sad_loop_kernel_32_2sum_avx2(src + 0 * 32, ref + 0 * 32, sums);
1154 sad_loop_kernel_32_2sum_avx2(src + 1 * 32, ref + 1 * 32, sums);
1155 }
1156
sad_loop_kernel_64_4sum_avx2(const uint8_t * const src,const uint8_t * const ref,__m256i sums[4])1157 static INLINE void sad_loop_kernel_64_4sum_avx2(const uint8_t *const src, const uint8_t *const ref,
1158 __m256i sums[4]) {
1159 sad_loop_kernel_32_4sum_avx2(src + 0 * 32, ref + 0 * 32, sums);
1160 sad_loop_kernel_32_4sum_avx2(src + 1 * 32, ref + 1 * 32, sums);
1161 }
1162
sad_loop_kernel_64_8sum_avx2(const uint8_t * const src,const uint8_t * const ref,__m256i sums[8])1163 static INLINE void sad_loop_kernel_64_8sum_avx2(const uint8_t *const src, const uint8_t *const ref,
1164 __m256i sums[8]) {
1165 //sad_loop_kernel_32_4sum_avx2(src + 0 * 32, ref + 0 * 32, sums + 0);
1166 //sad_loop_kernel_32_4sum_avx2(src + 1 * 32, ref + 1 * 32, sums + 4);
1167 const __m256i ss0 = _mm256_loadu_si256((__m256i *)src);
1168 const __m256i ss1 = _mm256_loadu_si256((__m256i *)(src + 32));
1169 const __m256i rr0 = _mm256_loadu_si256((__m256i *)ref);
1170 const __m256i rr1 = _mm256_loadu_si256((__m256i *)(ref + 8));
1171 const __m256i rr2 = _mm256_loadu_si256((__m256i *)(ref + 32));
1172 const __m256i rr3 = _mm256_loadu_si256((__m256i *)(ref + 40));
1173
1174 sums[0] = _mm256_adds_epu16(sums[0], _mm256_mpsadbw_epu8(rr0, ss0, (0 << 3) | 0)); // 000 000
1175 sums[1] = _mm256_adds_epu16(sums[1], _mm256_mpsadbw_epu8(rr0, ss0, (5 << 3) | 5)); // 101 101
1176 sums[2] = _mm256_adds_epu16(sums[2], _mm256_mpsadbw_epu8(rr1, ss0, (2 << 3) | 2)); // 010 010
1177 sums[3] = _mm256_adds_epu16(sums[3], _mm256_mpsadbw_epu8(rr1, ss0, (7 << 3) | 7)); // 111 111
1178 sums[4] = _mm256_adds_epu16(sums[4], _mm256_mpsadbw_epu8(rr2, ss1, (0 << 3) | 0)); // 000 000
1179 sums[5] = _mm256_adds_epu16(sums[5], _mm256_mpsadbw_epu8(rr2, ss1, (5 << 3) | 5)); // 101 101
1180 sums[6] = _mm256_adds_epu16(sums[6], _mm256_mpsadbw_epu8(rr3, ss1, (2 << 3) | 2)); // 010 010
1181 sums[7] = _mm256_adds_epu16(sums[7], _mm256_mpsadbw_epu8(rr3, ss1, (7 << 3) | 7)); // 111 111
1182 }
1183
/* Update the running best SAD and its (x, y) location from 32-bit lane `idx`
 * of `sum`. `idx` must be a compile-time constant (immediate operand of
 * _mm_extract_epi32). NOTE: the macro reads the variable `y` from the
 * enclosing scope. Wrapped in do/while (0) so it expands as a single
 * statement (safe in unbraced if/else); arguments parenthesized against
 * operator-precedence surprises. */
#define UPDATE_BEST(sum, idx, offset, best_s, best_x, best_y)  \
    do {                                                       \
        const uint32_t sad = _mm_extract_epi32((sum), (idx));  \
                                                               \
        if (sad < (best_s)) {                                  \
            (best_s) = sad;                                    \
            (best_x) = (offset) + (idx);                       \
            (best_y) = y;                                      \
        }                                                      \
    } while (0)
1194
update_best_kernel(const uint32_t sad,const __m128i minpos,const int32_t x,const int32_t y,uint32_t * const best_s,int32_t * const best_x,int32_t * const best_y)1195 static INLINE void update_best_kernel(const uint32_t sad, const __m128i minpos, const int32_t x,
1196 const int32_t y, uint32_t *const best_s,
1197 int32_t *const best_x, int32_t *const best_y) {
1198 if (sad < *best_s) {
1199 const int32_t x_offset = _mm_extract_epi16(minpos, 1);
1200 *best_s = sad;
1201 *best_x = x + x_offset;
1202 *best_y = y;
1203 }
1204 }
1205
update_best(const __m128i sad,const int32_t x,const int32_t y,uint32_t * const best_s,int32_t * const best_x,int32_t * const best_y)1206 static INLINE void update_best(const __m128i sad, const int32_t x, const int32_t y,
1207 uint32_t *const best_s, int32_t *const best_x,
1208 int32_t *const best_y) {
1209 const __m128i minpos = _mm_minpos_epu16(sad);
1210 const uint32_t min0 = _mm_extract_epi16(minpos, 0);
1211
1212 if (min0 < *best_s) {
1213 const int32_t x_offset = _mm_extract_epi16(minpos, 1);
1214 *best_s = min0;
1215 *best_x = x + x_offset;
1216 *best_y = y;
1217 }
1218 }
1219
/* Scan eight exact 32-bit SADs (two 4-lane vectors) and update the best
 * candidate. Fully unrolled because the lane index of _mm_extract_epi32
 * (inside UPDATE_BEST) must be an immediate. UPDATE_BEST reads `y` from this
 * function's scope and writes through *best_s / *best_x / *best_y. */
SIMD_INLINE void update_8_best(const __m128i sads[2], const int32_t x, const int32_t y,
                               uint32_t *const best_s, int32_t *const best_x,
                               int32_t *const best_y) {
    UPDATE_BEST(sads[0], 0, x + 0, *best_s, *best_x, *best_y);
    UPDATE_BEST(sads[0], 1, x + 0, *best_s, *best_x, *best_y);
    UPDATE_BEST(sads[0], 2, x + 0, *best_s, *best_x, *best_y);
    UPDATE_BEST(sads[0], 3, x + 0, *best_s, *best_x, *best_y);
    UPDATE_BEST(sads[1], 0, x + 4, *best_s, *best_x, *best_y);
    UPDATE_BEST(sads[1], 1, x + 4, *best_s, *best_x, *best_y);
    UPDATE_BEST(sads[1], 2, x + 4, *best_s, *best_x, *best_y);
    UPDATE_BEST(sads[1], 3, x + 4, *best_s, *best_x, *best_y);
}
1232
update_small_pel(const __m256i sum256,const int32_t x,const int32_t y,uint32_t * const best_s,int32_t * const best_x,int32_t * const best_y)1233 static INLINE void update_small_pel(const __m256i sum256, const int32_t x, const int32_t y,
1234 uint32_t *const best_s, int32_t *const best_x,
1235 int32_t *const best_y) {
1236 const __m128i sum256_lo = _mm256_castsi256_si128(sum256);
1237 const __m128i sum256_hi = _mm256_extracti128_si256(sum256, 1);
1238 const __m128i sad = _mm_adds_epu16(sum256_lo, sum256_hi);
1239 update_best(sad, x, y, best_s, best_x, best_y);
1240 }
1241
update_some_pel(const __m256i sum256,const int32_t x,const int32_t y,uint32_t * const best_s,int32_t * const best_x,int32_t * const best_y)1242 static INLINE void update_some_pel(const __m256i sum256, const int32_t x, const int32_t y,
1243 uint32_t *const best_s, int32_t *const best_x,
1244 int32_t *const best_y) {
1245 const __m128i sum0 = _mm256_castsi256_si128(sum256);
1246 const __m128i sum1 = _mm256_extracti128_si256(sum256, 1);
1247 __m128i minpos;
1248 const uint32_t min0 = saturate_add(sum0, sum1, &minpos);
1249 update_best_kernel(min0, minpos, x, y, best_s, best_x, best_y);
1250 }
1251
update_256_pel(const __m512i sum512,const int32_t x,const int32_t y,uint32_t * const best_s,int32_t * const best_x,int32_t * const best_y)1252 static INLINE void update_256_pel(const __m512i sum512, const int32_t x, const int32_t y,
1253 uint32_t *const best_s, int32_t *const best_x,
1254 int32_t *const best_y) {
1255 const __m256i sum512_lo = _mm512_castsi512_si256(sum512);
1256 const __m256i sum512_hi = _mm512_extracti64x4_epi64(sum512, 1);
1257 const __m256i sum256 = _mm256_adds_epu16(sum512_lo, sum512_hi);
1258 const __m128i sum256_lo = _mm256_castsi256_si128(sum256);
1259 const __m128i sum256_hi = _mm256_extracti128_si256(sum256, 1);
1260 update_best(sum256_lo, x + 0, y, best_s, best_x, best_y);
1261 update_best(sum256_hi, x + 8, y, best_s, best_x, best_y);
1262 }
1263
update_384_pel(const __m512i sum512,const __m256i sums256[2],const int32_t x,const int32_t y,uint32_t * const best_s,int32_t * const best_x,int32_t * const best_y)1264 SIMD_INLINE void update_384_pel(const __m512i sum512, const __m256i sums256[2], const int32_t x,
1265 const int32_t y, uint32_t *const best_s, int32_t *const best_x,
1266 int32_t *const best_y) {
1267 const __m256i sum512_lo = _mm512_castsi512_si256(sum512);
1268 const __m256i sum512_hi = _mm512_extracti64x4_epi64(sum512, 1);
1269 const __m256i sad256 = _mm256_adds_epu16(sum512_lo, sum512_hi);
1270 const __m128i sad256_lo = _mm256_castsi256_si128(sad256);
1271 const __m128i sad256_hi = _mm256_extracti128_si256(sad256, 1);
1272
1273 const __m128i sum256_0_lo = _mm256_castsi256_si128(sums256[0]);
1274 const __m128i sum256_0_hi = _mm256_extracti128_si256(sums256[0], 1);
1275 const __m128i sad128_0 = _mm_adds_epu16(sum256_0_lo, sum256_0_hi);
1276
1277 const __m128i sum256_1_lo = _mm256_castsi256_si128(sums256[1]);
1278 const __m128i sum256_1_hi = _mm256_extracti128_si256(sums256[1], 1);
1279 const __m128i sad128_1 = _mm_adds_epu16(sum256_1_lo, sum256_1_hi);
1280
1281 __m128i minpos_lo, minpos_hi;
1282 const uint32_t min_lo = saturate_add(sad256_lo, sad128_0, &minpos_lo);
1283 update_best_kernel(min_lo, minpos_lo, x + 0, y, best_s, best_x, best_y);
1284 const uint32_t min_hi = saturate_add(sad256_hi, sad128_1, &minpos_hi);
1285 update_best_kernel(min_hi, minpos_hi, x + 8, y, best_s, best_x, best_y);
1286 }
1287
update_512_pel(const __m512i sum512,const int32_t x,const int32_t y,uint32_t * const best_s,int32_t * const best_x,int32_t * const best_y)1288 SIMD_INLINE void update_512_pel(const __m512i sum512, const int32_t x, const int32_t y,
1289 uint32_t *const best_s, int32_t *const best_x,
1290 int32_t *const best_y) {
1291 const __m256i sum512_lo = _mm512_castsi512_si256(sum512);
1292 const __m256i sum512_hi = _mm512_extracti64x4_epi64(sum512, 1);
1293 __m128i minpos_lo, minpos_hi;
1294
1295 const __m128i sum128_0 = _mm256_castsi256_si128(sum512_lo);
1296 const __m128i sum128_1 = _mm256_castsi256_si128(sum512_hi);
1297 const uint32_t min_lo = saturate_add(sum128_0, sum128_1, &minpos_lo);
1298 update_best_kernel(min_lo, minpos_lo, x + 0, y, best_s, best_x, best_y);
1299
1300 const __m128i sum128_2 = _mm256_extracti128_si256(sum512_lo, 1);
1301 const __m128i sum128_3 = _mm256_extracti128_si256(sum512_hi, 1);
1302 const uint32_t min_hi = saturate_add(sum128_2, sum128_3, &minpos_hi);
1303 update_best_kernel(min_hi, minpos_hi, x + 8, y, best_s, best_x, best_y);
1304 }
1305
/* Pick the best of 16 search positions for a 1024-pel block whose SADs are
 * held in two 16-bit saturating accumulators. If the winning lane saturated
 * (0xFFFF), exact 32-bit sums are rebuilt and rescanned. */
SIMD_INLINE void update_1024_pel(const __m512i sums512[2], const int32_t x, const int32_t y,
                                 uint32_t *const best_s, int32_t *const best_x,
                                 int32_t *const best_y) {
    // Fold both accumulators down to 16 16-bit SADs, one per position.
    const __m512i sum = _mm512_adds_epu16(sums512[0], sums512[1]);
    const __m256i sum_lo = _mm512_castsi512_si256(sum);
    const __m256i sum_hi = _mm512_extracti64x4_epi64(sum, 1);
    const __m256i sad = _mm256_adds_epu16(sum_lo, sum_hi);
    const __m128i sad_lo = _mm256_castsi256_si128(sad);
    const __m128i sad_hi = _mm256_extracti128_si256(sad, 1);
    // Minimum value (lane 0) and its index (lane 1) per 8-position group.
    const __m128i minpos_lo = _mm_minpos_epu16(sad_lo);
    const __m128i minpos_hi = _mm_minpos_epu16(sad_hi);
    const uint32_t min0 = _mm_extract_epi16(minpos_lo, 0);
    const uint32_t min1 = _mm_extract_epi16(minpos_hi, 0);
    uint32_t minmin, delta;
    __m128i minpos;

    // On a tie the low group wins, i.e. the smaller x offset is kept.
    if (min0 <= min1) {
        minmin = min0;
        delta = 0;
        minpos = minpos_lo;
    } else {
        minmin = min1;
        delta = 8;
        minpos = minpos_hi;
    }

    if (minmin < *best_s) {
        if (minmin != 0xFFFF) { // no overflow
            *best_s = minmin;
            *best_x = x + delta + _mm_extract_epi16(minpos, 1);
            *best_y = y;
        } else { // overflow
            __m256i sads256[2];
            __m128i sads128[2];

            // Re-derive exact 32-bit per-position sums (file-local helper),
            // then rescan all 16 positions scalar-wise.
            add16x16x2to32bit(sums512, sads256);

            sads128[0] = _mm256_castsi256_si128(sads256[0]);
            sads128[1] = _mm256_extracti128_si256(sads256[0], 1);
            update_8_best(sads128, x + 0, y, best_s, best_x, best_y);

            sads128[0] = _mm256_castsi256_si128(sads256[1]);
            sads128[1] = _mm256_extracti128_si256(sads256[1], 1);
            update_8_best(sads128, x + 8, y, best_s, best_x, best_y);
        }
    }
}
1353
/* 768-pel variant: pack the two 256-bit accumulators into one 512-bit vector
 * and reorder its 128-bit lanes so the position layout matches sums512[0],
 * then delegate to update_1024_pel.
 * NOTE(review): the (0,1,4,5,2,3,6,7) qword permute interleaves the halves of
 * sums256[0]/sums256[1] -- lane-order assumption inferred from the indices;
 * confirm against the accumulators' producers. */
SIMD_INLINE void update_768_pel(const __m512i sum512, const __m256i sums256[2], const int32_t x,
                                const int32_t y, uint32_t *const best_s, int32_t *const best_x,
                                int32_t *const best_y) {
    __m512i sums512[2];

    sums512[0] = sum512;
    sums512[1] = _mm512_inserti64x4(_mm512_castsi256_si512(sums256[0]), sums256[1], 1);
    sums512[1] = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), sums512[1]);
    update_1024_pel(sums512, x, y, best_s, best_x, best_y);
}
1364
/* Pick the best of 16 search positions for a 1536-pel block whose SADs are
 * held in three 16-bit saturating accumulators. If the winning lane saturated
 * (0xFFFF), exact 32-bit sums are rebuilt and rescanned. */
SIMD_INLINE void update_1536_pel(const __m512i sums512[3], const int32_t x, const int32_t y,
                                 uint32_t *const best_s, int32_t *const best_x,
                                 int32_t *const best_y) {
    // Fold the three accumulators down to 16 16-bit SADs, one per position.
    const __m512i sum01 = _mm512_adds_epu16(sums512[0], sums512[1]);
    const __m512i sum = _mm512_adds_epu16(sum01, sums512[2]);
    const __m256i sum_lo = _mm512_castsi512_si256(sum);
    const __m256i sum_hi = _mm512_extracti64x4_epi64(sum, 1);
    const __m256i sad = _mm256_adds_epu16(sum_lo, sum_hi);
    const __m128i sad_lo = _mm256_castsi256_si128(sad);
    const __m128i sad_hi = _mm256_extracti128_si256(sad, 1);
    // Minimum value (lane 0) and its index (lane 1) per 8-position group.
    const __m128i minpos_lo = _mm_minpos_epu16(sad_lo);
    const __m128i minpos_hi = _mm_minpos_epu16(sad_hi);
    const uint32_t min0 = _mm_extract_epi16(minpos_lo, 0);
    const uint32_t min1 = _mm_extract_epi16(minpos_hi, 0);
    uint32_t minmin, delta;
    __m128i minpos;

    // On a tie the low group wins, i.e. the smaller x offset is kept.
    if (min0 <= min1) {
        minmin = min0;
        delta = 0;
        minpos = minpos_lo;
    } else {
        minmin = min1;
        delta = 8;
        minpos = minpos_hi;
    }

    if (minmin < *best_s) {
        if (minmin != 0xFFFF) { // no overflow
            *best_s = minmin;
            *best_x = x + delta + _mm_extract_epi16(minpos, 1);
            *best_y = y;
        } else { // overflow
            __m256i sads256[2];
            __m128i sads128[2];

            // Re-derive exact 32-bit per-position sums (file-local helper),
            // then rescan all 16 positions scalar-wise.
            add16x16x3to32bit(sums512, sads256);

            sads128[0] = _mm256_castsi256_si128(sads256[0]);
            sads128[1] = _mm256_extracti128_si256(sads256[0], 1);
            update_8_best(sads128, x + 0, y, best_s, best_x, best_y);

            sads128[0] = _mm256_castsi256_si128(sads256[1]);
            sads128[1] = _mm256_extracti128_si256(sads256[1], 1);
            update_8_best(sads128, x + 8, y, best_s, best_x, best_y);
        }
    }
}
1413
/* Pick the best of 16 search positions for a 2048-pel block whose SADs are
 * held in four 16-bit saturating accumulators. If the winning lane saturated
 * (0xFFFF), exact 32-bit sums are rebuilt and rescanned. */
SIMD_INLINE void update_2048_pel(const __m512i sums512[4], const int32_t x, const int32_t y,
                                 uint32_t *const best_s, int32_t *const best_x,
                                 int32_t *const best_y) {
    // Fold the four accumulators down to 16 16-bit SADs, one per position.
    const __m512i sum01 = _mm512_adds_epu16(sums512[0], sums512[1]);
    const __m512i sum23 = _mm512_adds_epu16(sums512[2], sums512[3]);
    const __m512i sum = _mm512_adds_epu16(sum01, sum23);
    const __m256i sum_lo = _mm512_castsi512_si256(sum);
    const __m256i sum_hi = _mm512_extracti64x4_epi64(sum, 1);
    const __m256i sad = _mm256_adds_epu16(sum_lo, sum_hi);
    const __m128i sad_lo = _mm256_castsi256_si128(sad);
    const __m128i sad_hi = _mm256_extracti128_si256(sad, 1);
    // Minimum value (lane 0) and its index (lane 1) per 8-position group.
    const __m128i minpos_lo = _mm_minpos_epu16(sad_lo);
    const __m128i minpos_hi = _mm_minpos_epu16(sad_hi);
    const uint32_t min0 = _mm_extract_epi16(minpos_lo, 0);
    const uint32_t min1 = _mm_extract_epi16(minpos_hi, 0);
    uint32_t minmin, delta;
    __m128i minpos;

    // On a tie the low group wins, i.e. the smaller x offset is kept.
    if (min0 <= min1) {
        minmin = min0;
        delta = 0;
        minpos = minpos_lo;
    } else {
        minmin = min1;
        delta = 8;
        minpos = minpos_hi;
    }

    if (minmin < *best_s) {
        if (minmin != 0xFFFF) { // no overflow
            *best_s = minmin;
            *best_x = x + delta + _mm_extract_epi16(minpos, 1);
            *best_y = y;
        } else { // overflow
            __m256i sads256[2];
            __m128i sads128[2];

            // Re-derive exact 32-bit per-position sums (file-local helper),
            // then rescan all 16 positions scalar-wise.
            add16x16x4to32bit(sums512, sads256);

            sads128[0] = _mm256_castsi256_si128(sads256[0]);
            sads128[1] = _mm256_extracti128_si256(sads256[0], 1);
            update_8_best(sads128, x + 0, y, best_s, best_x, best_y);

            sads128[0] = _mm256_castsi256_si128(sads256[1]);
            sads128[1] = _mm256_extracti128_si256(sads256[1], 1);
            update_8_best(sads128, x + 8, y, best_s, best_x, best_y);
        }
    }
}
1463
update_leftover_small_pel(const __m256i sum256,const int32_t x,const int32_t y,const __m128i mask,uint32_t * const best_s,int32_t * const best_x,int32_t * const best_y)1464 static INLINE void update_leftover_small_pel(const __m256i sum256, const int32_t x, const int32_t y,
1465 const __m128i mask, uint32_t *const best_s,
1466 int32_t *const best_x, int32_t *const best_y) {
1467 const __m128i sum256_lo = _mm256_castsi256_si128(sum256);
1468 const __m128i sum256_hi = _mm256_extracti128_si256(sum256, 1);
1469 __m128i sad = _mm_adds_epu16(sum256_lo, sum256_hi);
1470 sad = _mm_or_si128(sad, mask);
1471 update_best(sad, x, y, best_s, best_x, best_y);
1472 }
1473
update_leftover_256_pel(const __m256i sum256,const int16_t search_area_width,const int32_t x,const int32_t y,const __m128i mask,uint32_t * const best_s,int32_t * const best_x,int32_t * const best_y)1474 static INLINE void update_leftover_256_pel(const __m256i sum256, const int16_t search_area_width,
1475 const int32_t x, const int32_t y, const __m128i mask,
1476 uint32_t *const best_s, int32_t *const best_x,
1477 int32_t *const best_y) {
1478 const __m128i sum256_lo = _mm256_castsi256_si128(sum256);
1479 const __m128i sum256_hi = _mm256_extracti128_si256(sum256, 1);
1480 __m128i sad = _mm_adds_epu16(sum256_lo, sum256_hi);
1481 if ((x + 8) > search_area_width) {
1482 sad = _mm_or_si128(sad, mask);
1483 }
1484 update_best(sad, x, y, best_s, best_x, best_y);
1485 }
1486
update_leftover_512_pel(const __m256i sum256,const int16_t search_area_width,const int32_t x,const int32_t y,const __m256i mask,uint32_t * const best_s,int32_t * const best_x,int32_t * const best_y)1487 static INLINE void update_leftover_512_pel(const __m256i sum256, const int16_t search_area_width,
1488 const int32_t x, const int32_t y, const __m256i mask,
1489 uint32_t *const best_s, int32_t *const best_x,
1490 int32_t *const best_y) {
1491 __m256i sum = sum256;
1492 if ((x + 8) > search_area_width) {
1493 sum = _mm256_or_si256(sum256, mask);
1494 }
1495 const __m128i sum0 = _mm256_castsi256_si128(sum);
1496 const __m128i sum1 = _mm256_extracti128_si256(sum, 1);
1497 __m128i minpos;
1498 const uint32_t min0 = saturate_add(sum0, sum1, &minpos);
1499 update_best_kernel(min0, minpos, x, y, best_s, best_x, best_y);
1500 }
1501
update_leftover8_1024_pel(const __m256i sums256[2],const int32_t x,const int32_t y,uint32_t * const best_s,int32_t * const best_x,int32_t * const best_y)1502 SIMD_INLINE void update_leftover8_1024_pel(const __m256i sums256[2], const int32_t x,
1503 const int32_t y, uint32_t *const best_s,
1504 int32_t *const best_x, int32_t *const best_y) {
1505 const __m256i sum256 = _mm256_adds_epu16(sums256[0], sums256[1]);
1506 const __m128i sum_lo = _mm256_castsi256_si128(sum256);
1507 const __m128i sum_hi = _mm256_extracti128_si256(sum256, 1);
1508 const __m128i sad = _mm_adds_epu16(sum_lo, sum_hi);
1509 const __m128i minpos = _mm_minpos_epu16(sad);
1510 const uint32_t min0 = _mm_extract_epi16(minpos, 0);
1511
1512 if (min0 < *best_s) {
1513 if (min0 != 0xFFFF) { // no overflow
1514 *best_s = min0;
1515 *best_x = x + _mm_extract_epi16(minpos, 1);
1516 *best_y = y;
1517 } else { // overflow
1518 __m128i sads[2];
1519 add16x8x2to32bit(sums256, sads);
1520 update_8_best(sads, x, y, best_s, best_x, best_y);
1521 }
1522 }
1523 }
1524
/* Leftover (< 8 valid positions) update for 1024-pel blocks: invalid tail
 * lanes are forced to 0xFFFF via `mask` so they never win. On saturation the
 * exact 32-bit sums are rescanned scalar-wise, bounded by `leftover` /
 * search_area_width. */
SIMD_INLINE void update_leftover_1024_pel(const __m256i sums256[2], const int16_t search_area_width,
                                          const int32_t x, const int32_t y, const uint32_t leftover,
                                          const __m128i mask, uint32_t *const best_s,
                                          int32_t *const best_x, int32_t *const best_y) {
    // Fold the two accumulators down to 8 16-bit SADs.
    const __m256i sum256 = _mm256_adds_epu16(sums256[0], sums256[1]);
    const __m128i sum_lo = _mm256_castsi256_si128(sum256);
    const __m128i sum_hi = _mm256_extracti128_si256(sum256, 1);
    const __m128i sad0 = _mm_adds_epu16(sum_lo, sum_hi);
    const __m128i sad1 = _mm_or_si128(sad0, mask); // disable out-of-range lanes
    const __m128i minpos = _mm_minpos_epu16(sad1);
    const uint32_t min0 = _mm_extract_epi16(minpos, 0);
    int32_t xx = x;

    if (min0 < *best_s) {
        if (min0 != 0xFFFF) { // no overflow
            *best_s = min0;
            *best_x = xx + _mm_extract_epi16(minpos, 1);
            *best_y = y;
        } else { // overflow
            // Up to the first 4 positions come from sads[0], the rest from sads[1].
            const int32_t num = xx + ((leftover < 4) ? leftover : 4);
            __m128i sads[2];

            // Exact 32-bit per-position sums (file-local helper).
            add16x8x2to32bit(sums256, sads);

            // Scan lane 0, shifting the remaining lanes down each iteration.
            do {
                UPDATE_BEST(sads[0], 0, xx, *best_s, *best_x, *best_y);
                sads[0] = _mm_srli_si128(sads[0], 4);
            } while (++xx < num);

            while (xx < search_area_width) {
                UPDATE_BEST(sads[1], 0, xx, *best_s, *best_x, *best_y);
                sads[1] = _mm_srli_si128(sads[1], 4);
                xx++;
            }
        }
    }
}
1562
update_leftover8_1536_pel(const __m256i sums256[3],const int32_t x,const int32_t y,uint32_t * const best_s,int32_t * const best_x,int32_t * const best_y)1563 SIMD_INLINE void update_leftover8_1536_pel(const __m256i sums256[3], const int32_t x,
1564 const int32_t y, uint32_t *const best_s,
1565 int32_t *const best_x, int32_t *const best_y) {
1566 const __m256i sum01 = _mm256_adds_epu16(sums256[0], sums256[1]);
1567 const __m256i sum256 = _mm256_adds_epu16(sum01, sums256[2]);
1568 const __m128i sum_lo = _mm256_castsi256_si128(sum256);
1569 const __m128i sum_hi = _mm256_extracti128_si256(sum256, 1);
1570 const __m128i sad = _mm_adds_epu16(sum_lo, sum_hi);
1571 const __m128i minpos = _mm_minpos_epu16(sad);
1572 const uint32_t min0 = _mm_extract_epi16(minpos, 0);
1573
1574 if (min0 < *best_s) {
1575 if (min0 != 0xFFFF) { // no overflow
1576 *best_s = min0;
1577 *best_x = x + _mm_extract_epi16(minpos, 1);
1578 *best_y = y;
1579 } else { // overflow
1580 __m128i sads[2];
1581 add16x8x3to32bit(sums256, sads);
1582 update_8_best(sads, x, y, best_s, best_x, best_y);
1583 }
1584 }
1585 }
1586
/* Leftover (< 8 valid positions) update for 1536-pel blocks: invalid tail
 * lanes are forced to 0xFFFF via `mask` so they never win. On saturation the
 * exact 32-bit sums are rescanned scalar-wise, bounded by `leftover` /
 * search_area_width. */
SIMD_INLINE void update_leftover_1536_pel(const __m256i sums256[3], const int16_t search_area_width,
                                          const int32_t x, const int32_t y, const uint32_t leftover,
                                          const __m128i mask, uint32_t *const best_s,
                                          int32_t *const best_x, int32_t *const best_y) {
    // Fold the three accumulators down to 8 16-bit SADs.
    const __m256i sum01 = _mm256_adds_epu16(sums256[0], sums256[1]);
    const __m256i sum256 = _mm256_adds_epu16(sum01, sums256[2]);
    const __m128i sum_lo = _mm256_castsi256_si128(sum256);
    const __m128i sum_hi = _mm256_extracti128_si256(sum256, 1);
    const __m128i sad0 = _mm_adds_epu16(sum_lo, sum_hi);
    const __m128i sad1 = _mm_or_si128(sad0, mask); // disable out-of-range lanes
    const __m128i minpos = _mm_minpos_epu16(sad1);
    const uint32_t min0 = _mm_extract_epi16(minpos, 0);
    int32_t xx = x;

    if (min0 < *best_s) {
        if (min0 != 0xFFFF) { // no overflow
            *best_s = min0;
            *best_x = xx + _mm_extract_epi16(minpos, 1);
            *best_y = y;
        } else { // overflow
            // Up to the first 4 positions come from sads[0], the rest from sads[1].
            const int32_t num = xx + ((leftover < 4) ? leftover : 4);
            __m128i sads[2];

            // Exact 32-bit per-position sums (file-local helper).
            add16x8x3to32bit(sums256, sads);

            // Scan lane 0, shifting the remaining lanes down each iteration.
            do {
                UPDATE_BEST(sads[0], 0, xx, *best_s, *best_x, *best_y);
                sads[0] = _mm_srli_si128(sads[0], 4);
            } while (++xx < num);

            while (xx < search_area_width) {
                UPDATE_BEST(sads[1], 0, xx, *best_s, *best_x, *best_y);
                sads[1] = _mm_srli_si128(sads[1], 4);
                xx++;
            }
        }
    }
}
1625
update_leftover8_2048_pel(const __m256i sums256[4],const int32_t x,const int32_t y,uint32_t * const best_s,int32_t * const best_x,int32_t * const best_y)1626 SIMD_INLINE void update_leftover8_2048_pel(const __m256i sums256[4], const int32_t x,
1627 const int32_t y, uint32_t *const best_s,
1628 int32_t *const best_x, int32_t *const best_y) {
1629 const __m256i sum01 = _mm256_adds_epu16(sums256[0], sums256[1]);
1630 const __m256i sum23 = _mm256_adds_epu16(sums256[2], sums256[3]);
1631 const __m256i sum256 = _mm256_adds_epu16(sum01, sum23);
1632 const __m128i sum_lo = _mm256_castsi256_si128(sum256);
1633 const __m128i sum_hi = _mm256_extracti128_si256(sum256, 1);
1634 const __m128i sad = _mm_adds_epu16(sum_lo, sum_hi);
1635 const __m128i minpos = _mm_minpos_epu16(sad);
1636 const uint32_t min0 = _mm_extract_epi16(minpos, 0);
1637
1638 if (min0 < *best_s) {
1639 if (min0 != 0xFFFF) { // no overflow
1640 *best_s = min0;
1641 *best_x = x + _mm_extract_epi16(minpos, 1);
1642 *best_y = y;
1643 } else { // overflow
1644 __m128i sads[2];
1645 add16x8x4to32bit(sums256, sads);
1646 update_8_best(sads, x, y, best_s, best_x, best_y);
1647 }
1648 }
1649 }
1650
/* Leftover (< 8 valid positions) update for 2048-pel blocks: invalid tail
 * lanes are forced to 0xFFFF via `mask` so they never win. On saturation the
 * exact 32-bit sums are rescanned scalar-wise, bounded by `leftover` /
 * search_area_width. */
SIMD_INLINE void update_leftover_2048_pel(const __m256i sums256[4], const int16_t search_area_width,
                                          const int32_t x, const int32_t y, const uint32_t leftover,
                                          const __m128i mask, uint32_t *const best_s,
                                          int32_t *const best_x, int32_t *const best_y) {
    // Fold the four accumulators down to 8 16-bit SADs.
    const __m256i sum01 = _mm256_adds_epu16(sums256[0], sums256[1]);
    const __m256i sum23 = _mm256_adds_epu16(sums256[2], sums256[3]);
    const __m256i sum256 = _mm256_adds_epu16(sum01, sum23);
    const __m128i sum_lo = _mm256_castsi256_si128(sum256);
    const __m128i sum_hi = _mm256_extracti128_si256(sum256, 1);
    const __m128i sad0 = _mm_adds_epu16(sum_lo, sum_hi);
    const __m128i sad1 = _mm_or_si128(sad0, mask); // disable out-of-range lanes
    const __m128i minpos = _mm_minpos_epu16(sad1);
    const uint32_t min0 = _mm_extract_epi16(minpos, 0);
    int32_t xx = x;

    if (min0 < *best_s) {
        if (min0 != 0xFFFF) { // no overflow
            *best_s = min0;
            *best_x = xx + _mm_extract_epi16(minpos, 1);
            *best_y = y;
        } else { // overflow
            // Up to the first 4 positions come from sads[0], the rest from sads[1].
            const int32_t num = xx + ((leftover < 4) ? leftover : 4);
            __m128i sads[2];

            // Exact 32-bit per-position sums (file-local helper).
            add16x8x4to32bit(sums256, sads);

            // Scan lane 0, shifting the remaining lanes down each iteration.
            do {
                UPDATE_BEST(sads[0], 0, xx, *best_s, *best_x, *best_y);
                sads[0] = _mm_srli_si128(sads[0], 4);
            } while (++xx < num);

            while (xx < search_area_width) {
                UPDATE_BEST(sads[1], 0, xx, *best_s, *best_x, *best_y);
                sads[1] = _mm_srli_si128(sads[1], 4);
                xx++;
            }
        }
    }
}
1690
/*******************************************************************************
 * Requirement: search_size <= 8
 * Returns "search_size" SADs, each summed over the full block height of the
 * 5th and 6th columns (zero-based columns 4 and 5)
 *******************************************************************************/
complement_4_to_6(uint8_t * ref,uint32_t ref_stride,uint8_t * src,uint32_t src_stride,uint32_t height,uint32_t search_size)1695 static INLINE __m128i complement_4_to_6(uint8_t *ref, uint32_t ref_stride, uint8_t *src,
1696 uint32_t src_stride, uint32_t height,
1697 uint32_t search_size) {
1698 __m128i sum;
1699 DECLARE_ALIGNED(16, uint16_t, tsum[8]);
1700 memset(tsum, 0, 8 * sizeof(uint16_t));
1701 for (uint32_t search_area = 0; search_area < search_size; search_area++) {
1702 for (uint32_t y = 0; y < height; y++) {
1703 tsum[search_area] += EB_ABS_DIFF(src[y * src_stride + 4], ref[y * ref_stride + 4]) +
1704 EB_ABS_DIFF(src[y * src_stride + 5], ref[y * ref_stride + 5]);
1705 }
1706 ref += 1;
1707 }
1708 sum = _mm_loadu_si128((__m128i *)tsum);
1709 return sum;
1710 }
1711
/*******************************************************************************
 * Requirement: block_height < 64
 * General version of SAD computation that supports any block width and height
 *******************************************************************************/
sad_loop_kernel_generalized_avx512(uint8_t * src,uint32_t src_stride,uint8_t * ref,uint32_t ref_stride,uint32_t block_height,uint32_t block_width,uint64_t * best_sad,int16_t * x_search_center,int16_t * y_search_center,uint32_t src_stride_raw,int16_t search_area_width,int16_t search_area_height)1716 void sad_loop_kernel_generalized_avx512(
1717 uint8_t * src, // input parameter, source samples Ptr
1718 uint32_t src_stride, // input parameter, source stride
1719 uint8_t * ref, // input parameter, reference samples Ptr
1720 uint32_t ref_stride, // input parameter, reference stride
1721 uint32_t block_height, // input parameter, block height (M)
1722 uint32_t block_width, // input parameter, block width (N)
1723 uint64_t *best_sad, int16_t *x_search_center, int16_t *y_search_center,
1724 uint32_t src_stride_raw, // input parameter, source stride (no line skipping)
1725 int16_t search_area_width, int16_t search_area_height) {
1726 int16_t i, j;
1727 uint32_t k, l;
1728 const uint8_t *p_ref, *p_src;
1729 uint32_t low_sum = 0xffffff;
1730 int32_t x_best = *x_search_center, y_best = *y_search_center;
1731 uint32_t leftover = search_area_width & 15;
1732
1733 __m128i leftover_mask = _mm_set1_epi32(-1);
1734 __m128i leftover_mask32b = _mm_set1_epi32(-1);
1735 if (leftover) {
1736 for (k = 0; k < (uint32_t)(search_area_width & 7); k++)
1737 leftover_mask = _mm_slli_si128(leftover_mask, 2);
1738 for (k = 0; k < (uint32_t)(search_area_width & 3); k++)
1739 leftover_mask32b = _mm_slli_si128(leftover_mask32b, 4);
1740 }
1741
1742 for (i = 0; i < search_area_height; i++) {
1743 for (j = 0; j < search_area_width; j += 16) {
1744 p_src = src;
1745 p_ref = ref + j;
1746
1747 __m512i sums512[8] = {_mm512_setzero_si512(),
1748 _mm512_setzero_si512(),
1749 _mm512_setzero_si512(),
1750 _mm512_setzero_si512(),
1751 _mm512_setzero_si512(),
1752 _mm512_setzero_si512(),
1753 _mm512_setzero_si512(),
1754 _mm512_setzero_si512()};
1755 __m256i sum256_1 = _mm256_setzero_si256();
1756 __m256i sum256_2 = _mm256_setzero_si256();
1757 __m256i sum256_3 = _mm256_setzero_si256();
1758 __m256i sum256_4 = _mm256_setzero_si256();
1759 __m256i sum256 = _mm256_setzero_si256();
1760 for (k = 0; k + 2 <= block_height; k += 2) {
1761 uint32_t width_calc = block_width;
1762 const uint8_t *temp_src = p_src;
1763 const uint8_t *temp_ref = p_ref;
1764 if (width_calc >= 32) {
1765 sad_loop_kernel_32_4sum_avx512(temp_src, temp_ref, &sums512[0]);
1766 sad_loop_kernel_32_4sum_avx512(
1767 temp_src + src_stride, temp_ref + ref_stride, &sums512[4]);
1768
1769 width_calc -= 32;
1770 temp_src += 32;
1771 temp_ref += 32;
1772 }
1773 if (width_calc >= 16) {
1774 sad_loop_kernel_16_2sum_avx512(
1775 temp_src, src_stride, temp_ref, ref_stride, &sums512[0]);
1776
1777 width_calc -= 16;
1778 temp_src += 16;
1779 temp_ref += 16;
1780 }
1781 if (width_calc >= 8) {
1782 sad_loop_kernel_8_avx2(temp_src, src_stride, temp_ref, ref_stride, &sum256_1);
1783 sad_loop_kernel_8_avx2(
1784 temp_src, src_stride, temp_ref + 8, ref_stride, &sum256_2);
1785
1786 width_calc -= 8;
1787 temp_src += 8;
1788 temp_ref += 8;
1789 }
1790 if (width_calc >= 4) {
1791 sad_loop_kernel_4_avx2(temp_src, src_stride, temp_ref, ref_stride, &sum256_3);
1792 sad_loop_kernel_4_avx2(
1793 temp_src, src_stride, temp_ref + 8, ref_stride, &sum256_4);
1794
1795 width_calc -= 4;
1796 temp_src += 4;
1797 temp_ref += 4;
1798 }
1799 if (width_calc > 0) {
1800 DECLARE_ALIGNED(16, uint16_t, tsum[16]);
1801 memset(tsum, 0, 16 * sizeof(uint16_t));
1802 for (uint32_t search_area = 0; search_area < 16; search_area++) {
1803 for (l = 0; l < width_calc; l++) {
1804 tsum[search_area] += EB_ABS_DIFF(temp_src[l], temp_ref[l]) +
1805 EB_ABS_DIFF(temp_src[src_stride + l], temp_ref[ref_stride + l]);
1806 }
1807 temp_ref += 1;
1808 }
1809 sum256 = _mm256_adds_epu16(sum256, _mm256_loadu_si256((__m256i *)tsum));
1810 }
1811 p_src += 2 * src_stride;
1812 p_ref += 2 * ref_stride;
1813 }
1814 //when height is not multiple of 2,then compute last line
1815 if (k < block_height) {
1816 uint32_t width_calc = block_width;
1817 const uint8_t *temp_src = p_src;
1818 const uint8_t *temp_ref = p_ref;
1819 if (width_calc >= 32) {
1820 sad_loop_kernel_32_4sum_avx512(temp_src, temp_ref, &sums512[0]);
1821
1822 width_calc -= 32;
1823 temp_src += 32;
1824 temp_ref += 32;
1825 }
1826 if (width_calc >= 16) {
1827 sad_loop_kernel_16_2sum_oneline_avx512(temp_src, temp_ref, &sums512[4]);
1828
1829 width_calc -= 16;
1830 temp_src += 16;
1831 temp_ref += 16;
1832 }
1833 if (width_calc >= 8) {
1834 sad_loop_kernel_8_oneline_avx2(temp_src, temp_ref, &sum256_1);
1835 sad_loop_kernel_8_oneline_avx2(temp_src, temp_ref + 8, &sum256_2);
1836
1837 width_calc -= 8;
1838 temp_src += 8;
1839 temp_ref += 8;
1840 }
1841 if (width_calc >= 4) {
1842 sad_loop_kernel_4_oneline_avx2(temp_src, temp_ref, &sum256_3);
1843 sad_loop_kernel_4_oneline_avx2(temp_src, temp_ref + 8, &sum256_4);
1844
1845 width_calc -= 4;
1846 temp_src += 4;
1847 temp_ref += 4;
1848 }
1849 if (width_calc > 0) {
1850 DECLARE_ALIGNED(16, uint16_t, tsum[16]);
1851 memset(tsum, 0, 16 * sizeof(uint16_t));
1852 for (uint32_t search_area = 0; search_area < 16; search_area++) {
1853 for (l = 0; l < width_calc; l++) {
1854 tsum[search_area] += EB_ABS_DIFF(temp_src[l], temp_ref[l]);
1855 }
1856 temp_ref += 1;
1857 }
1858 sum256 = _mm256_adds_epu16(sum256, _mm256_loadu_si256((__m256i *)tsum));
1859 }
1860 p_src += src_stride;
1861 p_ref += ref_stride;
1862 }
1863
1864 //update all
1865 __m512i sum512 = _mm512_inserti64x4(_mm512_castsi256_si512(sum256_1), sum256_2, 0x1);
1866 sum512 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), sum512);
1867 sums512[5] = _mm512_adds_epu16(sums512[5], sum512);
1868
1869 sum512 = _mm512_inserti64x4(_mm512_castsi256_si512(sum256_3), sum256_4, 0x1);
1870 sum512 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), sum512);
1871 sums512[6] = _mm512_adds_epu16(sums512[6], sum512);
1872
1873 sums512[7] = _mm512_adds_epu16(sums512[7],
1874 _mm512_inserti64x4(_mm512_setzero_si512(), sum256, 0));
1875
1876 const __m512i sum512_01 = _mm512_adds_epu16(sums512[0], sums512[1]);
1877 const __m512i sum512_23 = _mm512_adds_epu16(sums512[2], sums512[3]);
1878 const __m512i sum512_45 = _mm512_adds_epu16(sums512[4], sums512[5]);
1879 const __m512i sum512_67 = _mm512_adds_epu16(sums512[6], sums512[7]);
1880 const __m512i sum512_0123 = _mm512_adds_epu16(sum512_01, sum512_23);
1881 const __m512i sum512_4567 = _mm512_adds_epu16(sum512_45, sum512_67);
1882 sum512 = _mm512_adds_epu16(sum512_0123, sum512_4567);
1883 const __m256i sum_lo = _mm512_castsi512_si256(sum512);
1884 const __m256i sum_hi = _mm512_extracti64x4_epi64(sum512, 1);
1885 const __m256i sad = _mm256_adds_epu16(sum_lo, sum_hi);
1886 __m128i sad_lo = _mm256_castsi256_si128(sad);
1887 __m128i sad_hi = _mm256_extracti128_si256(sad, 1);
1888 if (leftover && (j + 16) >= search_area_width) {
1889 if (leftover < 8) {
1890 sad_lo = _mm_or_si128(sad_lo, leftover_mask);
1891 sad_hi = _mm_set1_epi32(-1);
1892 } else {
1893 sad_hi = _mm_or_si128(sad_hi, leftover_mask);
1894 }
1895 }
1896 const __m128i minpos_lo = _mm_minpos_epu16(sad_lo);
1897 const __m128i minpos_hi = _mm_minpos_epu16(sad_hi);
1898 const uint32_t min0 = _mm_extract_epi16(minpos_lo, 0);
1899 const uint32_t min1 = _mm_extract_epi16(minpos_hi, 0);
1900 uint32_t minmin, delta;
1901 __m128i minpos;
1902
1903 if (min0 <= min1) {
1904 minmin = min0;
1905 delta = 0;
1906 minpos = minpos_lo;
1907 } else {
1908 minmin = min1;
1909 delta = 8;
1910 minpos = minpos_hi;
1911 }
1912
1913 if (minmin < low_sum) {
1914 if (minmin != 0xFFFF) { // no overflow
1915 low_sum = minmin;
1916 x_best = j + delta + _mm_extract_epi16(minpos, 1);
1917 y_best = i;
1918 } else { // overflow
1919 __m256i sads256[2];
1920 __m128i sads128[4];
1921
1922 add16x16x8to32bit(sums512, sads256);
1923
1924 sads128[0] = _mm256_castsi256_si128(sads256[0]);
1925 sads128[1] = _mm256_extracti128_si256(sads256[0], 1);
1926 sads128[2] = _mm256_castsi256_si128(sads256[1]);
1927 sads128[3] = _mm256_extracti128_si256(sads256[1], 1);
1928 if (leftover && (j + 16) >= search_area_width) {
1929 if (leftover < 4) {
1930 sads128[0] = _mm_or_si128(sads128[0], leftover_mask32b);
1931 sads128[1] = sads128[2] = sads128[3] = _mm_set1_epi32(-1);
1932 } else if (leftover < 8) {
1933 sads128[1] = _mm_or_si128(sads128[1], leftover_mask32b);
1934 sads128[2] = sads128[3] = _mm_set1_epi32(-1);
1935 } else if (leftover < 12) {
1936 sads128[2] = _mm_or_si128(sads128[2], leftover_mask32b);
1937 sads128[3] = _mm_set1_epi32(-1);
1938 } else {
1939 sads128[3] = _mm_or_si128(sads128[3], leftover_mask32b);
1940 }
1941 }
1942
1943 update_8_best(&sads128[0], j + 0, i, &low_sum, &x_best, &y_best);
1944 update_8_best(&sads128[2], j + 8, i, &low_sum, &x_best, &y_best);
1945 }
1946 }
1947 }
1948 ref += src_stride_raw;
1949 }
1950
1951 *best_sad = low_sum;
1952 *x_search_center = (int16_t)x_best;
1953 *y_search_center = (int16_t)y_best;
1954 }
1955
1956 /*******************************************************************************
1957 * Requirement: width = 4, 6, 8, 12, 16, 24, 32, 48 or 64 to use SIMD
1958 * otherwise general/slower SIMD verison is used
1959 * Requirement: height <= 64
1960 * Requirement: height % 2 = 0
1961 *******************************************************************************/
svt_sad_loop_kernel_avx512_intrin(uint8_t * src,uint32_t src_stride,uint8_t * ref,uint32_t ref_stride,uint32_t height,uint32_t width,uint64_t * best_sad,int16_t * x_search_center,int16_t * y_search_center,uint32_t src_stride_raw,int16_t search_area_width,int16_t search_area_height)1962 void svt_sad_loop_kernel_avx512_intrin(
1963 uint8_t * src, // input parameter, source samples Ptr
1964 uint32_t src_stride, // input parameter, source stride
1965 uint8_t * ref, // input parameter, reference samples Ptr
1966 uint32_t ref_stride, // input parameter, reference stride
1967 uint32_t height, // input parameter, block height (M)
1968 uint32_t width, // input parameter, block width (N)
1969 uint64_t *best_sad, int16_t *x_search_center, int16_t *y_search_center,
1970 uint32_t src_stride_raw, // input parameter, source stride (no line skipping)
1971 int16_t search_area_width, int16_t search_area_height) {
1972 const uint32_t height2 = height >> 1;
1973 const uint8_t *s, *r;
1974 int32_t best_x = *x_search_center, best_y = *y_search_center;
1975 uint32_t best_s = 0xffffff;
1976 int32_t x, y;
1977 uint32_t h;
1978
1979 if (search_area_width == 8) {
1980 switch (width) {
1981 case 4:
1982 if (height <= 4) {
1983 y = 0;
1984 do {
1985 __m128i sum = _mm_setzero_si128();
1986
1987 s = src;
1988 r = ref;
1989
1990 h = height;
1991 while (h >= 2) {
1992 sad_loop_kernel_4_sse4_1(s, src_stride, r, ref_stride, &sum);
1993 s += 2 * src_stride;
1994 r += 2 * ref_stride;
1995 h -= 2;
1996 };
1997
1998 if (h) {
1999 sad_loop_kernel_4_oneline_sse4_1(s, r, &sum);
2000 }
2001
2002 update_best(sum, 0, y, &best_s, &best_x, &best_y);
2003 ref += src_stride_raw;
2004 } while (++y < search_area_height);
2005 } else {
2006 y = 0;
2007 do {
2008 __m256i sum256 = _mm256_setzero_si256();
2009
2010 s = src;
2011 r = ref;
2012
2013 h = height;
2014 while (h >= 2) {
2015 sad_loop_kernel_4_avx2(s, src_stride, r, ref_stride, &sum256);
2016 s += 2 * src_stride;
2017 r += 2 * ref_stride;
2018 h -= 2;
2019 };
2020
2021 if (h) {
2022 sad_loop_kernel_4_oneline_avx2(s, r, &sum256);
2023 }
2024
2025 update_small_pel(sum256, 0, y, &best_s, &best_x, &best_y);
2026 ref += src_stride_raw;
2027 } while (++y < search_area_height);
2028 }
2029 break;
2030
2031 case 6:
2032 if (height <= 4) {
2033 y = 0;
2034 do {
2035 __m128i sum = _mm_setzero_si128();
2036
2037 s = src;
2038 r = ref;
2039
2040 h = height;
2041 while (h >= 2) {
2042 sad_loop_kernel_4_sse4_1(s, src_stride, r, ref_stride, &sum);
2043 s += 2 * src_stride;
2044 r += 2 * ref_stride;
2045 h -= 2;
2046 };
2047
2048 if (h) {
2049 sad_loop_kernel_4_oneline_sse4_1(s, r, &sum);
2050 }
2051
2052 __m128i sum2 = complement_4_to_6(ref, ref_stride, src, src_stride, height, 8);
2053 sum = _mm_adds_epu16(sum, sum2);
2054
2055 update_best(sum, 0, y, &best_s, &best_x, &best_y);
2056 ref += src_stride_raw;
2057 } while (++y < search_area_height);
2058 } else {
2059 y = 0;
2060 do {
2061 __m256i sum256 = _mm256_setzero_si256();
2062
2063 s = src;
2064 r = ref;
2065
2066 h = height;
2067 while (h >= 2) {
2068 sad_loop_kernel_4_avx2(s, src_stride, r, ref_stride, &sum256);
2069 s += 2 * src_stride;
2070 r += 2 * ref_stride;
2071 h -= 2;
2072 };
2073
2074 if (h) {
2075 sad_loop_kernel_4_oneline_avx2(s, r, &sum256);
2076 }
2077
2078 __m128i sum = complement_4_to_6(ref, ref_stride, src, src_stride, height, 8);
2079 sum256 = _mm256_adds_epu16(
2080 sum256, _mm256_insertf128_si256(_mm256_setzero_si256(), sum, 0));
2081
2082 update_small_pel(sum256, 0, y, &best_s, &best_x, &best_y);
2083 ref += src_stride_raw;
2084 } while (++y < search_area_height);
2085 }
2086 break;
2087
2088 case 8:
2089 // Note: Tried _mm512_dbsad_epu8 but is even slower.
2090 y = 0;
2091 do {
2092 __m256i sum256 = _mm256_setzero_si256();
2093
2094 s = src;
2095 r = ref;
2096
2097 h = height;
2098 while (h >= 2) {
2099 sad_loop_kernel_8_avx2(s, src_stride, r, ref_stride, &sum256);
2100 s += 2 * src_stride;
2101 r += 2 * ref_stride;
2102 h -= 2;
2103 };
2104
2105 if (h) {
2106 sad_loop_kernel_8_oneline_avx2(s, r, &sum256);
2107 };
2108
2109 update_small_pel(sum256, 0, y, &best_s, &best_x, &best_y);
2110 ref += src_stride_raw;
2111 } while (++y < search_area_height);
2112 break;
2113
2114 case 12:
2115 if (height <= 16) {
2116 y = 0;
2117 do {
2118 __m256i sum256 = _mm256_setzero_si256();
2119
2120 s = src;
2121 r = ref;
2122
2123 h = height;
2124 while (h >= 2) {
2125 sad_loop_kernel_12_avx2(s, src_stride, r, ref_stride, &sum256);
2126 s += 2 * src_stride;
2127 r += 2 * ref_stride;
2128 h -= 2;
2129 };
2130
2131 if (h) {
2132 sad_loop_kernel_12_oneline_avx2(s, r, &sum256);
2133 }
2134
2135 update_small_pel(sum256, 0, y, &best_s, &best_x, &best_y);
2136 ref += src_stride_raw;
2137 } while (++y < search_area_height);
2138 } else if (height <= 32) {
2139 y = 0;
2140 do {
2141 __m256i sum256 = _mm256_setzero_si256();
2142
2143 s = src;
2144 r = ref;
2145
2146 h = height;
2147 while (h >= 2) {
2148 sad_loop_kernel_12_avx2(s, src_stride, r, ref_stride, &sum256);
2149 s += 2 * src_stride;
2150 r += 2 * ref_stride;
2151 h -= 2;
2152 };
2153
2154 if (h) {
2155 sad_loop_kernel_12_oneline_avx2(s, r, &sum256);
2156 }
2157
2158 update_some_pel(sum256, 0, y, &best_s, &best_x, &best_y);
2159 ref += src_stride_raw;
2160 } while (++y < search_area_height);
2161 } else {
2162 y = 0;
2163 do {
2164 __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
2165
2166 s = src;
2167 r = ref;
2168
2169 h = height;
2170 while (h >= 2) {
2171 sad_loop_kernel_12_2sum_avx2(s, src_stride, r, ref_stride, sums256);
2172 s += 2 * src_stride;
2173 r += 2 * ref_stride;
2174 h -= 2;
2175 };
2176
2177 if (h) {
2178 sad_loop_kernel_12_2sum_oneline_avx2(s, r, sums256);
2179 }
2180
2181 update_leftover8_1024_pel(sums256, 0, y, &best_s, &best_x, &best_y);
2182 ref += src_stride_raw;
2183 } while (++y < search_area_height);
2184 }
2185 break;
2186
2187 case 16:
2188 if (height <= 16) {
2189 y = 0;
2190 do {
2191 __m256i sum256 = _mm256_setzero_si256();
2192
2193 s = src;
2194 r = ref;
2195
2196 h = height;
2197 while (h >= 2) {
2198 sad_loop_kernel_16_avx2(s, src_stride, r, ref_stride, &sum256);
2199 s += 2 * src_stride;
2200 r += 2 * ref_stride;
2201 h -= 2;
2202 };
2203
2204 if (h) {
2205 sad_loop_kernel_16_oneline_avx2(s, r, &sum256);
2206 };
2207
2208 update_small_pel(sum256, 0, y, &best_s, &best_x, &best_y);
2209 ref += src_stride_raw;
2210 } while (++y < search_area_height);
2211 } else if (height <= 32) {
2212 y = 0;
2213 do {
2214 __m256i sum256 = _mm256_setzero_si256();
2215
2216 s = src;
2217 r = ref;
2218
2219 h = height;
2220 while (h >= 2) {
2221 sad_loop_kernel_16_avx2(s, src_stride, r, ref_stride, &sum256);
2222 s += 2 * src_stride;
2223 r += 2 * ref_stride;
2224 h -= 2;
2225 };
2226
2227 if (h) {
2228 sad_loop_kernel_16_oneline_avx2(s, r, &sum256);
2229 };
2230
2231 update_some_pel(sum256, 0, y, &best_s, &best_x, &best_y);
2232 ref += src_stride_raw;
2233 } while (++y < search_area_height);
2234 } else {
2235 y = 0;
2236 do {
2237 __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
2238
2239 s = src;
2240 r = ref;
2241
2242 h = height;
2243 while (h >= 2) {
2244 sad_loop_kernel_16_2sum_avx2(s, src_stride, r, ref_stride, sums256);
2245 s += 2 * src_stride;
2246 r += 2 * ref_stride;
2247 h -= 2;
2248 };
2249
2250 if (h) {
2251 sad_loop_kernel_16_2sum_oneline_avx2(s, r, sums256);
2252 };
2253
2254 update_leftover8_1024_pel(sums256, 0, y, &best_s, &best_x, &best_y);
2255 ref += src_stride_raw;
2256 } while (++y < search_area_height);
2257 }
2258 break;
2259
2260 case 24:
2261 if (height <= 16) {
2262 y = 0;
2263 do {
2264 __m256i sum256 = _mm256_setzero_si256();
2265
2266 s = src;
2267 r = ref;
2268
2269 h = height;
2270 while (h >= 2) {
2271 sad_loop_kernel_16_avx2(s, src_stride, r, ref_stride, &sum256);
2272 sad_loop_kernel_8_avx2(s + 16, src_stride, r + 16, ref_stride, &sum256);
2273 s += 2 * src_stride;
2274 r += 2 * ref_stride;
2275 h -= 2;
2276 };
2277
2278 if (h) {
2279 sad_loop_kernel_16_oneline_avx2(s, r, &sum256);
2280 sad_loop_kernel_8_oneline_avx2(s + 16, r + 16, &sum256);
2281 }
2282
2283 update_some_pel(sum256, 0, y, &best_s, &best_x, &best_y);
2284 ref += src_stride_raw;
2285 } while (++y < search_area_height);
2286 } else {
2287 y = 0;
2288 do {
2289 __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
2290
2291 s = src;
2292 r = ref;
2293
2294 h = height;
2295 while (h >= 2) {
2296 sad_loop_kernel_16_avx2(s, src_stride, r, ref_stride, &sums256[0]);
2297 sad_loop_kernel_8_avx2(s + 16, src_stride, r + 16, ref_stride, &sums256[1]);
2298 s += 2 * src_stride;
2299 r += 2 * ref_stride;
2300 h -= 2;
2301 };
2302
2303 if (h) {
2304 sad_loop_kernel_16_oneline_avx2(s, r, &sums256[0]);
2305 sad_loop_kernel_8_oneline_avx2(s + 16, r + 16, &sums256[1]);
2306 }
2307
2308 update_leftover8_1024_pel(sums256, 0, y, &best_s, &best_x, &best_y);
2309 ref += src_stride_raw;
2310 } while (++y < search_area_height);
2311 }
2312 break;
2313
2314 case 32:
2315 if (height <= 8) {
2316 y = 0;
2317 do {
2318 __m256i sum256 = _mm256_setzero_si256();
2319
2320 s = src;
2321 r = ref;
2322
2323 h = height;
2324 do {
2325 sad_loop_kernel_32_avx2(s, r, &sum256);
2326 s += src_stride;
2327 r += ref_stride;
2328 } while (--h);
2329
2330 update_small_pel(sum256, 0, y, &best_s, &best_x, &best_y);
2331 ref += src_stride_raw;
2332 } while (++y < search_area_height);
2333 } else if (height <= 16) {
2334 y = 0;
2335 do {
2336 __m256i sum256 = _mm256_setzero_si256();
2337
2338 s = src;
2339 r = ref;
2340
2341 h = height;
2342 do {
2343 sad_loop_kernel_32_avx2(s, r, &sum256);
2344 s += src_stride;
2345 r += ref_stride;
2346 } while (--h);
2347
2348 update_some_pel(sum256, 0, y, &best_s, &best_x, &best_y);
2349 ref += src_stride_raw;
2350 } while (++y < search_area_height);
2351 } else if (height <= 32) {
2352 y = 0;
2353 do {
2354 __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
2355
2356 s = src;
2357 r = ref;
2358
2359 h = height;
2360 do {
2361 sad_loop_kernel_32_2sum_avx2(s, r, sums256);
2362 s += src_stride;
2363 r += ref_stride;
2364 } while (--h);
2365
2366 update_leftover8_1024_pel(sums256, 0, y, &best_s, &best_x, &best_y);
2367 ref += src_stride_raw;
2368 } while (++y < search_area_height);
2369 } else {
2370 y = 0;
2371 do {
2372 __m256i sums256[4] = {_mm256_setzero_si256(),
2373 _mm256_setzero_si256(),
2374 _mm256_setzero_si256(),
2375 _mm256_setzero_si256()};
2376
2377 s = src;
2378 r = ref;
2379
2380 h = height;
2381 do {
2382 sad_loop_kernel_32_4sum_avx2(s, r, sums256);
2383 s += src_stride;
2384 r += ref_stride;
2385 } while (--h);
2386
2387 update_leftover8_2048_pel(sums256, 0, y, &best_s, &best_x, &best_y);
2388 ref += src_stride_raw;
2389 } while (++y < search_area_height);
2390 }
2391 break;
2392
2393 case 48:
2394 if (height <= 32) {
2395 y = 0;
2396 do {
2397 __m256i sums256[3] = {
2398 _mm256_setzero_si256(), _mm256_setzero_si256(), _mm256_setzero_si256()};
2399
2400 s = src;
2401 r = ref;
2402
2403 h = height2;
2404 do {
2405 sad_loop_kernel_32_2sum_avx2(s, r, sums256);
2406 sad_loop_kernel_32_2sum_avx2(s + src_stride, r + ref_stride, sums256);
2407 sad_loop_kernel_16_avx2(
2408 s + 32, src_stride, r + 32, ref_stride, &sums256[2]);
2409 s += 2 * src_stride;
2410 r += 2 * ref_stride;
2411 } while (--h);
2412
2413 update_leftover8_1536_pel(sums256, 0, y, &best_s, &best_x, &best_y);
2414 ref += src_stride_raw;
2415 } while (++y < search_area_height);
2416 } else {
2417 y = 0;
2418 do {
2419 __m256i sums256[6] = {_mm256_setzero_si256(),
2420 _mm256_setzero_si256(),
2421 _mm256_setzero_si256(),
2422 _mm256_setzero_si256(),
2423 _mm256_setzero_si256(),
2424 _mm256_setzero_si256()};
2425
2426 s = src;
2427 r = ref;
2428
2429 h = height2;
2430 do {
2431 sad_loop_kernel_32_4sum_avx2(s, r, sums256);
2432 sad_loop_kernel_32_4sum_avx2(s + src_stride, r + ref_stride, sums256);
2433 sad_loop_kernel_16_2sum_avx2(
2434 s + 32, src_stride, r + 32, ref_stride, &sums256[4]);
2435 s += 2 * src_stride;
2436 r += 2 * ref_stride;
2437 } while (--h);
2438
2439 const __m256i sum01 = _mm256_adds_epu16(sums256[0], sums256[1]);
2440 const __m256i sum23 = _mm256_adds_epu16(sums256[2], sums256[3]);
2441 const __m256i sum45 = _mm256_adds_epu16(sums256[4], sums256[5]);
2442 const __m256i sum0123 = _mm256_adds_epu16(sum01, sum23);
2443 const __m256i sum256 = _mm256_adds_epu16(sum0123, sum45);
2444 const __m128i sum_lo = _mm256_castsi256_si128(sum256);
2445 const __m128i sum_hi = _mm256_extracti128_si256(sum256, 1);
2446 const __m128i sad = _mm_adds_epu16(sum_lo, sum_hi);
2447 const __m128i minpos = _mm_minpos_epu16(sad);
2448 const uint32_t min0 = _mm_extract_epi16(minpos, 0);
2449
2450 if (min0 < best_s) {
2451 if (min0 != 0xFFFF) { // no overflow
2452 best_s = min0;
2453 best_x = _mm_extract_epi16(minpos, 1);
2454 best_y = y;
2455 } else { // overflow
2456 __m128i sads[2];
2457
2458 add16x8x6to32bit(sums256, sads);
2459 update_8_best(sads, 0, y, &best_s, &best_x, &best_y);
2460 }
2461 }
2462
2463 ref += src_stride_raw;
2464 } while (++y < search_area_height);
2465 }
2466 break;
2467
2468 case 64:
2469 if (height <= 16) {
2470 y = 0;
2471 do {
2472 __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
2473
2474 s = src;
2475 r = ref;
2476
2477 h = height;
2478 do {
2479 sad_loop_kernel_64_2sum_avx2(s, r, sums256);
2480 s += src_stride;
2481 r += ref_stride;
2482 } while (--h);
2483
2484 update_leftover8_1024_pel(sums256, 0, y, &best_s, &best_x, &best_y);
2485 ref += src_stride_raw;
2486 } while (++y < search_area_height);
2487 } else if (height <= 32) {
2488 y = 0;
2489 do {
2490 __m256i sums256[4] = {_mm256_setzero_si256(),
2491 _mm256_setzero_si256(),
2492 _mm256_setzero_si256(),
2493 _mm256_setzero_si256()};
2494
2495 s = src;
2496 r = ref;
2497
2498 h = height;
2499 do {
2500 sad_loop_kernel_64_4sum_avx2(s, r, sums256);
2501 s += src_stride;
2502 r += ref_stride;
2503 } while (--h);
2504
2505 update_leftover8_2048_pel(sums256, 0, y, &best_s, &best_x, &best_y);
2506 ref += src_stride_raw;
2507 } while (++y < search_area_height);
2508 } else {
2509 y = 0;
2510 do {
2511 __m256i sums256[8] = {_mm256_setzero_si256(),
2512 _mm256_setzero_si256(),
2513 _mm256_setzero_si256(),
2514 _mm256_setzero_si256(),
2515 _mm256_setzero_si256(),
2516 _mm256_setzero_si256(),
2517 _mm256_setzero_si256(),
2518 _mm256_setzero_si256()};
2519
2520 s = src;
2521 r = ref;
2522
2523 h = height;
2524 do {
2525 sad_loop_kernel_64_8sum_avx2(s, r, sums256);
2526 s += src_stride;
2527 r += ref_stride;
2528 } while (--h);
2529
2530 const __m256i sum01 = _mm256_adds_epu16(sums256[0], sums256[1]);
2531 const __m256i sum23 = _mm256_adds_epu16(sums256[2], sums256[3]);
2532 const __m256i sum45 = _mm256_adds_epu16(sums256[4], sums256[5]);
2533 const __m256i sum67 = _mm256_adds_epu16(sums256[6], sums256[7]);
2534 const __m256i sum0123 = _mm256_adds_epu16(sum01, sum23);
2535 const __m256i sum4567 = _mm256_adds_epu16(sum45, sum67);
2536 const __m256i sum256 = _mm256_adds_epu16(sum0123, sum4567);
2537 const __m128i sum_lo = _mm256_castsi256_si128(sum256);
2538 const __m128i sum_hi = _mm256_extracti128_si256(sum256, 1);
2539 const __m128i sad = _mm_adds_epu16(sum_lo, sum_hi);
2540 const __m128i minpos = _mm_minpos_epu16(sad);
2541 const uint32_t min0 = _mm_extract_epi16(minpos, 0);
2542
2543 if (min0 < best_s) {
2544 if (min0 != 0xFFFF) { // no overflow
2545 best_s = min0;
2546 best_x = _mm_extract_epi16(minpos, 1);
2547 best_y = y;
2548 } else { // overflow
2549 __m128i sads[2];
2550
2551 add16x8x8to32bit(sums256, sads);
2552 update_8_best(sads, 0, y, &best_s, &best_x, &best_y);
2553 }
2554 }
2555
2556 ref += src_stride_raw;
2557 } while (++y < search_area_height);
2558 }
2559 break;
2560
2561 default:
2562 sad_loop_kernel_generalized_avx512(src,
2563 src_stride,
2564 ref,
2565 ref_stride,
2566 height,
2567 width,
2568 best_sad,
2569 x_search_center,
2570 y_search_center,
2571 src_stride_raw,
2572 search_area_width,
2573 search_area_height);
2574 return;
2575 }
2576 } else if (search_area_width == 16) {
2577 switch (width) {
2578 case 4:
2579 if (height <= 4) {
2580 y = 0;
2581 do {
2582 {
2583 __m128i sum = _mm_setzero_si128();
2584
2585 s = src;
2586 r = ref;
2587
2588 h = height;
2589 while (h >= 2) {
2590 sad_loop_kernel_4_sse4_1(s, src_stride, r, ref_stride, &sum);
2591 s += 2 * src_stride;
2592 r += 2 * ref_stride;
2593 h -= 2;
2594 };
2595
2596 if (h) {
2597 sad_loop_kernel_4_oneline_sse4_1(s, r, &sum);
2598 }
2599
2600 update_best(sum, 0, y, &best_s, &best_x, &best_y);
2601 }
2602
2603 {
2604 __m128i sum = _mm_setzero_si128();
2605
2606 s = src;
2607 r = ref + 8;
2608
2609 h = height;
2610 while (h >= 2) {
2611 sad_loop_kernel_4_sse4_1(s, src_stride, r, ref_stride, &sum);
2612 s += 2 * src_stride;
2613 r += 2 * ref_stride;
2614 h -= 2;
2615 };
2616
2617 if (h) {
2618 sad_loop_kernel_4_oneline_sse4_1(s, r, &sum);
2619 }
2620
2621 update_best(sum, 8, y, &best_s, &best_x, &best_y);
2622 }
2623
2624 ref += src_stride_raw;
2625 } while (++y < search_area_height);
2626 } else {
2627 y = 0;
2628 do {
2629 {
2630 __m256i sum256 = _mm256_setzero_si256();
2631
2632 s = src;
2633 r = ref;
2634
2635 h = height;
2636 while (h >= 2) {
2637 sad_loop_kernel_4_avx2(s, src_stride, r, ref_stride, &sum256);
2638 s += 2 * src_stride;
2639 r += 2 * ref_stride;
2640 h -= 2;
2641 };
2642
2643 if (h) {
2644 sad_loop_kernel_4_oneline_avx2(s, r, &sum256);
2645 }
2646
2647 update_small_pel(sum256, 0, y, &best_s, &best_x, &best_y);
2648 }
2649
2650 {
2651 __m256i sum256 = _mm256_setzero_si256();
2652
2653 s = src;
2654 r = ref + 8;
2655
2656 h = height;
2657 while (h >= 2) {
2658 sad_loop_kernel_4_avx2(s, src_stride, r, ref_stride, &sum256);
2659 s += 2 * src_stride;
2660 r += 2 * ref_stride;
2661 h -= 2;
2662 };
2663
2664 if (h) {
2665 sad_loop_kernel_4_oneline_avx2(s, r, &sum256);
2666 }
2667
2668 update_small_pel(sum256, 8, y, &best_s, &best_x, &best_y);
2669 }
2670
2671 ref += src_stride_raw;
2672 } while (++y < search_area_height);
2673 }
2674 break;
2675
2676 case 6:
2677 if (height <= 4) {
2678 y = 0;
2679 do {
2680 {
2681 __m128i sum = _mm_setzero_si128();
2682
2683 s = src;
2684 r = ref;
2685
2686 h = height;
2687 while (h >= 2) {
2688 sad_loop_kernel_4_sse4_1(s, src_stride, r, ref_stride, &sum);
2689 s += 2 * src_stride;
2690 r += 2 * ref_stride;
2691 h -= 2;
2692 };
2693
2694 if (h) {
2695 sad_loop_kernel_4_oneline_sse4_1(s, r, &sum);
2696 }
2697
2698 __m128i sum2 = complement_4_to_6(
2699 ref, ref_stride, src, src_stride, height, 8);
2700 sum = _mm_adds_epu16(sum, sum2);
2701
2702 update_best(sum, 0, y, &best_s, &best_x, &best_y);
2703 }
2704
2705 {
2706 __m128i sum = _mm_setzero_si128();
2707
2708 s = src;
2709 r = ref + 8;
2710
2711 h = height;
2712 while (h >= 2) {
2713 sad_loop_kernel_4_sse4_1(s, src_stride, r, ref_stride, &sum);
2714 s += 2 * src_stride;
2715 r += 2 * ref_stride;
2716 h -= 2;
2717 };
2718
2719 if (h) {
2720 sad_loop_kernel_4_oneline_sse4_1(s, r, &sum);
2721 }
2722
2723 __m128i sum2 = complement_4_to_6(
2724 ref + 8, ref_stride, src, src_stride, height, 8);
2725 sum = _mm_adds_epu16(sum, sum2);
2726
2727 update_best(sum, 8, y, &best_s, &best_x, &best_y);
2728 }
2729
2730 ref += src_stride_raw;
2731 } while (++y < search_area_height);
2732 } else {
2733 y = 0;
2734 do {
2735 {
2736 __m256i sum256 = _mm256_setzero_si256();
2737
2738 s = src;
2739 r = ref;
2740
2741 h = height;
2742 while (h >= 2) {
2743 sad_loop_kernel_4_avx2(s, src_stride, r, ref_stride, &sum256);
2744 s += 2 * src_stride;
2745 r += 2 * ref_stride;
2746 h -= 2;
2747 };
2748
2749 if (h) {
2750 sad_loop_kernel_4_oneline_avx2(s, r, &sum256);
2751 }
2752
2753 __m128i sum = complement_4_to_6(
2754 ref, ref_stride, src, src_stride, height, 8);
2755 sum256 = _mm256_adds_epu16(
2756 sum256, _mm256_insertf128_si256(_mm256_setzero_si256(), sum, 0));
2757
2758 update_small_pel(sum256, 0, y, &best_s, &best_x, &best_y);
2759 }
2760
2761 {
2762 __m256i sum256 = _mm256_setzero_si256();
2763
2764 s = src;
2765 r = ref + 8;
2766
2767 h = height;
2768 while (h >= 2) {
2769 sad_loop_kernel_4_avx2(s, src_stride, r, ref_stride, &sum256);
2770 s += 2 * src_stride;
2771 r += 2 * ref_stride;
2772 h -= 2;
2773 };
2774
2775 if (h) {
2776 sad_loop_kernel_4_oneline_avx2(s, r, &sum256);
2777 }
2778
2779 __m128i sum = complement_4_to_6(
2780 ref + 8, ref_stride, src, src_stride, height, 8);
2781 sum256 = _mm256_adds_epu16(
2782 sum256, _mm256_insertf128_si256(_mm256_setzero_si256(), sum, 0));
2783
2784 update_small_pel(sum256, 8, y, &best_s, &best_x, &best_y);
2785 }
2786
2787 ref += src_stride_raw;
2788 } while (++y < search_area_height);
2789 }
2790 break;
2791
2792 case 8:
2793 // Note: Tried _mm512_dbsad_epu8 but is even slower.
2794 y = 0;
2795 do {
2796 {
2797 __m256i sum256 = _mm256_setzero_si256();
2798
2799 s = src;
2800 r = ref;
2801
2802 h = height;
2803 while (h >= 2) {
2804 sad_loop_kernel_8_avx2(s, src_stride, r, ref_stride, &sum256);
2805 s += 2 * src_stride;
2806 r += 2 * ref_stride;
2807 h -= 2;
2808 };
2809
2810 if (h) {
2811 sad_loop_kernel_8_oneline_avx2(s, r, &sum256);
2812 };
2813
2814 update_small_pel(sum256, 0, y, &best_s, &best_x, &best_y);
2815 }
2816
2817 {
2818 __m256i sum256 = _mm256_setzero_si256();
2819
2820 s = src;
2821 r = ref + 8;
2822
2823 h = height;
2824 while (h >= 2) {
2825 sad_loop_kernel_8_avx2(s, src_stride, r, ref_stride, &sum256);
2826 s += 2 * src_stride;
2827 r += 2 * ref_stride;
2828 h -= 2;
2829 };
2830
2831 if (h) {
2832 sad_loop_kernel_8_oneline_avx2(s, r, &sum256);
2833 };
2834
2835 update_small_pel(sum256, 8, y, &best_s, &best_x, &best_y);
2836 }
2837
2838 ref += src_stride_raw;
2839 } while (++y < search_area_height);
2840 break;
2841
2842 case 12:
2843 if (height <= 16) {
2844 y = 0;
2845 do {
2846 __m512i sum512 = _mm512_setzero_si512();
2847
2848 s = src;
2849 r = ref;
2850
2851 h = height;
2852 while (h >= 2) {
2853 sad_loop_kernel_12_avx512(s, src_stride, r, ref_stride, &sum512);
2854 s += 2 * src_stride;
2855 r += 2 * ref_stride;
2856 h -= 2;
2857 };
2858
2859 if (h) {
2860 sad_loop_kernel_12_oneline_avx512(s, r, &sum512);
2861 }
2862
2863 update_256_pel(sum512, 0, y, &best_s, &best_x, &best_y);
2864 ref += src_stride_raw;
2865 } while (++y < search_area_height);
2866 } else if (height <= 32) {
2867 y = 0;
2868 do {
2869 __m512i sum512 = _mm512_setzero_si512();
2870
2871 s = src;
2872 r = ref;
2873
2874 h = height;
2875 while (h >= 2) {
2876 sad_loop_kernel_12_avx512(s, src_stride, r, ref_stride, &sum512);
2877 s += 2 * src_stride;
2878 r += 2 * ref_stride;
2879 h -= 2;
2880 };
2881
2882 if (h) {
2883 sad_loop_kernel_12_oneline_avx512(s, r, &sum512);
2884 }
2885
2886 update_512_pel(sum512, 0, y, &best_s, &best_x, &best_y);
2887 ref += src_stride_raw;
2888 } while (++y < search_area_height);
2889 } else {
2890 y = 0;
2891 do {
2892 __m512i sums512[2] = {_mm512_setzero_si512(), _mm512_setzero_si512()};
2893
2894 s = src;
2895 r = ref;
2896
2897 h = height;
2898 while (h >= 2) {
2899 sad_loop_kernel_12_2sum_avx512(s, src_stride, r, ref_stride, sums512);
2900 s += 2 * src_stride;
2901 r += 2 * ref_stride;
2902 h -= 2;
2903 };
2904
2905 if (h) {
2906 sad_loop_kernel_12_2sum_oneline_avx512(s, r, sums512);
2907 }
2908
2909 update_1024_pel(sums512, 0, y, &best_s, &best_x, &best_y);
2910 ref += src_stride_raw;
2911 } while (++y < search_area_height);
2912 }
2913 break;
2914
2915 case 16:
2916 if (height <= 16) {
2917 y = 0;
2918 do {
2919 __m512i sum512 = _mm512_setzero_si512();
2920
2921 s = src;
2922 r = ref;
2923
2924 h = height;
2925 while (h >= 2) {
2926 sad_loop_kernel_16_avx512(s, src_stride, r, ref_stride, &sum512);
2927 s += 2 * src_stride;
2928 r += 2 * ref_stride;
2929 h -= 2;
2930 };
2931
2932 if (h) {
2933 sad_loop_kernel_16_oneline_avx512(s, r, &sum512);
2934 };
2935
2936 update_256_pel(sum512, 0, y, &best_s, &best_x, &best_y);
2937 ref += src_stride_raw;
2938 } while (++y < search_area_height);
2939 } else if (height <= 32) {
2940 y = 0;
2941 do {
2942 __m512i sum512 = _mm512_setzero_si512();
2943
2944 s = src;
2945 r = ref;
2946
2947 h = height;
2948 while (h >= 2) {
2949 sad_loop_kernel_16_avx512(s, src_stride, r, ref_stride, &sum512);
2950 s += 2 * src_stride;
2951 r += 2 * ref_stride;
2952 h -= 2;
2953 };
2954
2955 if (h) {
2956 sad_loop_kernel_16_oneline_avx512(s, r, &sum512);
2957 };
2958
2959 update_512_pel(sum512, 0, y, &best_s, &best_x, &best_y);
2960 ref += src_stride_raw;
2961 } while (++y < search_area_height);
2962 } else {
2963 y = 0;
2964 do {
2965 __m512i sums512[2] = {_mm512_setzero_si512(), _mm512_setzero_si512()};
2966
2967 s = src;
2968 r = ref;
2969
2970 h = height;
2971 while (h >= 2) {
2972 sad_loop_kernel_16_2sum_avx512(s, src_stride, r, ref_stride, sums512);
2973 s += 2 * src_stride;
2974 r += 2 * ref_stride;
2975 h -= 2;
2976 };
2977
2978 if (h) {
2979 sad_loop_kernel_16_2sum_oneline_avx512(s, r, sums512);
2980 };
2981
2982 update_1024_pel(sums512, 0, y, &best_s, &best_x, &best_y);
2983 ref += src_stride_raw;
2984 } while (++y < search_area_height);
2985 }
2986 break;
2987
2988 case 24:
2989 if (height <= 16) {
2990 y = 0;
2991 do {
2992 __m512i sum512 = _mm512_setzero_si512();
2993 __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
2994
2995 s = src;
2996 r = ref;
2997
2998 h = height;
2999 while (h >= 2) {
3000 sad_loop_kernel_16_avx512(s, src_stride, r, ref_stride, &sum512);
3001 sad_loop_kernel_8_avx2(s + 16, src_stride, r + 16, ref_stride, &sums256[0]);
3002 sad_loop_kernel_8_avx2(s + 16, src_stride, r + 24, ref_stride, &sums256[1]);
3003 s += 2 * src_stride;
3004 r += 2 * ref_stride;
3005 h -= 2;
3006 };
3007
3008 if (h) {
3009 sad_loop_kernel_16_oneline_avx512(s, r, &sum512);
3010 sad_loop_kernel_8_oneline_avx2(s + 16, r + 16, &sums256[0]);
3011 sad_loop_kernel_8_oneline_avx2(s + 16, r + 24, &sums256[1]);
3012 }
3013
3014 update_384_pel(sum512, sums256, 0, y, &best_s, &best_x, &best_y);
3015 ref += src_stride_raw;
3016 } while (++y < search_area_height);
3017 } else {
3018 y = 0;
3019 do {
3020 __m512i sum512 = _mm512_setzero_si512();
3021 __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
3022
3023 s = src;
3024 r = ref;
3025
3026 h = height;
3027 while (h >= 2) {
3028 sad_loop_kernel_16_avx512(s, src_stride, r, ref_stride, &sum512);
3029 sad_loop_kernel_8_avx2(s + 16, src_stride, r + 16, ref_stride, &sums256[0]);
3030 sad_loop_kernel_8_avx2(s + 16, src_stride, r + 24, ref_stride, &sums256[1]);
3031 s += 2 * src_stride;
3032 r += 2 * ref_stride;
3033 h -= 2;
3034 };
3035
3036 if (h) {
3037 sad_loop_kernel_16_oneline_avx512(s, r, &sum512);
3038 sad_loop_kernel_8_oneline_avx2(s + 16, r + 16, &sums256[0]);
3039 sad_loop_kernel_8_oneline_avx2(s + 16, r + 24, &sums256[1]);
3040 }
3041
3042 update_768_pel(sum512, sums256, 0, y, &best_s, &best_x, &best_y);
3043 ref += src_stride_raw;
3044 } while (++y < search_area_height);
3045 }
3046 break;
3047
3048 case 32:
3049 if (height <= 16) {
3050 y = 0;
3051 do {
3052 __m512i sum512 = _mm512_setzero_si512();
3053
3054 s = src;
3055 r = ref;
3056
3057 // Note: faster than looping 2 rows.
3058 h = height;
3059 do {
3060 sad_loop_kernel_32_avx512(s, r, &sum512);
3061 s += src_stride;
3062 r += ref_stride;
3063 } while (--h);
3064
3065 update_512_pel(sum512, 0, y, &best_s, &best_x, &best_y);
3066 ref += src_stride_raw;
3067 } while (++y < search_area_height);
3068 } else if (height <= 32) {
3069 y = 0;
3070 do {
3071 __m512i sums512[2] = {_mm512_setzero_si512(), _mm512_setzero_si512()};
3072
3073 s = src;
3074 r = ref;
3075
3076 h = height;
3077 do {
3078 sad_loop_kernel_32_2sum_avx512(s, r, sums512);
3079 s += src_stride;
3080 r += ref_stride;
3081 } while (--h);
3082
3083 update_1024_pel(sums512, 0, y, &best_s, &best_x, &best_y);
3084 ref += src_stride_raw;
3085 } while (++y < search_area_height);
3086 } else {
3087 y = 0;
3088 do {
3089 __m512i sums512[4] = {_mm512_setzero_si512(),
3090 _mm512_setzero_si512(),
3091 _mm512_setzero_si512(),
3092 _mm512_setzero_si512()};
3093
3094 s = src;
3095 r = ref;
3096
3097 h = height;
3098 do {
3099 sad_loop_kernel_32_4sum_avx512(s, r, sums512);
3100 s += src_stride;
3101 r += ref_stride;
3102 } while (--h);
3103
3104 update_2048_pel(sums512, 0, y, &best_s, &best_x, &best_y);
3105 ref += src_stride_raw;
3106 } while (++y < search_area_height);
3107 }
3108 break;
3109
3110 case 48:
3111 if (height <= 32) {
3112 y = 0;
3113 do {
3114 __m512i sums512[3] = {
3115 _mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512()};
3116
3117 s = src;
3118 r = ref;
3119
3120 h = height2;
3121 do {
3122 sad_loop_kernel_32_2sum_avx512(s, r, sums512);
3123 sad_loop_kernel_32_2sum_avx512(s + src_stride, r + ref_stride, sums512);
3124 sad_loop_kernel_16_avx512(
3125 s + 32, src_stride, r + 32, ref_stride, &sums512[2]);
3126 s += 2 * src_stride;
3127 r += 2 * ref_stride;
3128 } while (--h);
3129
3130 update_1536_pel(sums512, 0, y, &best_s, &best_x, &best_y);
3131 ref += src_stride_raw;
3132 } while (++y < search_area_height);
3133 } else {
3134 y = 0;
3135 do {
3136 __m512i sums512[6] = {_mm512_setzero_si512(),
3137 _mm512_setzero_si512(),
3138 _mm512_setzero_si512(),
3139 _mm512_setzero_si512(),
3140 _mm512_setzero_si512(),
3141 _mm512_setzero_si512()};
3142
3143 s = src;
3144 r = ref;
3145
3146 h = height2;
3147 do {
3148 sad_loop_kernel_32_4sum_avx512(s, r, sums512);
3149 sad_loop_kernel_32_4sum_avx512(s + src_stride, r + ref_stride, sums512);
3150 sad_loop_kernel_16_2sum_avx512(
3151 s + 32, src_stride, r + 32, ref_stride, &sums512[4]);
3152 s += 2 * src_stride;
3153 r += 2 * ref_stride;
3154 } while (--h);
3155
3156 const __m512i sum512_01 = _mm512_adds_epu16(sums512[0], sums512[1]);
3157 const __m512i sum512_23 = _mm512_adds_epu16(sums512[2], sums512[3]);
3158 const __m512i sum512_45 = _mm512_adds_epu16(sums512[4], sums512[5]);
3159 const __m512i sum512_0123 = _mm512_adds_epu16(sum512_01, sum512_23);
3160 const __m512i sum512 = _mm512_adds_epu16(sum512_0123, sum512_45);
3161 const __m256i sum_lo = _mm512_castsi512_si256(sum512);
3162 const __m256i sum_hi = _mm512_extracti64x4_epi64(sum512, 1);
3163 const __m256i sad = _mm256_adds_epu16(sum_lo, sum_hi);
3164 const __m128i sad_lo = _mm256_castsi256_si128(sad);
3165 const __m128i sad_hi = _mm256_extracti128_si256(sad, 1);
3166 const __m128i minpos_lo = _mm_minpos_epu16(sad_lo);
3167 const __m128i minpos_hi = _mm_minpos_epu16(sad_hi);
3168 const uint32_t min0 = _mm_extract_epi16(minpos_lo, 0);
3169 const uint32_t min1 = _mm_extract_epi16(minpos_hi, 0);
3170 uint32_t minmin, delta;
3171 __m128i minpos;
3172
3173 if (min0 <= min1) {
3174 minmin = min0;
3175 delta = 0;
3176 minpos = minpos_lo;
3177 } else {
3178 minmin = min1;
3179 delta = 8;
3180 minpos = minpos_hi;
3181 }
3182
3183 if (minmin < best_s) {
3184 if (minmin != 0xFFFF) { // no overflow
3185 best_s = minmin;
3186 best_x = delta + _mm_extract_epi16(minpos, 1);
3187 best_y = y;
3188 } else { // overflow
3189 __m256i sads256[2];
3190 __m128i sads128[2];
3191
3192 add16x16x6to32bit(sums512, sads256);
3193
3194 sads128[0] = _mm256_castsi256_si128(sads256[0]);
3195 sads128[1] = _mm256_extracti128_si256(sads256[0], 1);
3196 update_8_best(sads128, 0, y, &best_s, &best_x, &best_y);
3197
3198 sads128[0] = _mm256_castsi256_si128(sads256[1]);
3199 sads128[1] = _mm256_extracti128_si256(sads256[1], 1);
3200 update_8_best(sads128, 8, y, &best_s, &best_x, &best_y);
3201 }
3202 }
3203
3204 ref += src_stride_raw;
3205 } while (++y < search_area_height);
3206 }
3207 break;
3208
3209 case 64:
3210 if (height <= 16) {
3211 y = 0;
3212 do {
3213 __m512i sums512[2] = {_mm512_setzero_si512(), _mm512_setzero_si512()};
3214
3215 s = src;
3216 r = ref;
3217
3218 h = height;
3219 do {
3220 sad_loop_kernel_32_2sum_avx512(s + 0 * 32, r + 0 * 32, sums512);
3221 sad_loop_kernel_32_2sum_avx512(s + 1 * 32, r + 1 * 32, sums512);
3222 s += src_stride;
3223 r += ref_stride;
3224 } while (--h);
3225
3226 update_1024_pel(sums512, 0, y, &best_s, &best_x, &best_y);
3227 ref += src_stride_raw;
3228 } while (++y < search_area_height);
3229 } else if (height <= 32) {
3230 y = 0;
3231 do {
3232 __m512i sums512[4] = {_mm512_setzero_si512(),
3233 _mm512_setzero_si512(),
3234 _mm512_setzero_si512(),
3235 _mm512_setzero_si512()};
3236
3237 s = src;
3238 r = ref;
3239
3240 h = height;
3241 do {
3242 sad_loop_kernel_32_4sum_avx512(s + 0 * 32, r + 0 * 32, sums512);
3243 sad_loop_kernel_32_4sum_avx512(s + 1 * 32, r + 1 * 32, sums512);
3244 s += src_stride;
3245 r += ref_stride;
3246 } while (--h);
3247
3248 update_2048_pel(sums512, 0, y, &best_s, &best_x, &best_y);
3249 ref += src_stride_raw;
3250 } while (++y < search_area_height);
3251 } else {
3252 y = 0;
3253 do {
3254 __m512i sums512[8] = {_mm512_setzero_si512(),
3255 _mm512_setzero_si512(),
3256 _mm512_setzero_si512(),
3257 _mm512_setzero_si512(),
3258 _mm512_setzero_si512(),
3259 _mm512_setzero_si512(),
3260 _mm512_setzero_si512(),
3261 _mm512_setzero_si512()};
3262
3263 s = src;
3264 r = ref;
3265
3266 h = height;
3267 do {
3268 sad_loop_kernel_32_4sum_avx512(s + 0 * 32, r + 0 * 32, sums512 + 0);
3269 sad_loop_kernel_32_4sum_avx512(s + 1 * 32, r + 1 * 32, sums512 + 4);
3270 s += src_stride;
3271 r += ref_stride;
3272 } while (--h);
3273
3274 const __m512i sum512_01 = _mm512_adds_epu16(sums512[0], sums512[1]);
3275 const __m512i sum512_23 = _mm512_adds_epu16(sums512[2], sums512[3]);
3276 const __m512i sum512_45 = _mm512_adds_epu16(sums512[4], sums512[5]);
3277 const __m512i sum512_67 = _mm512_adds_epu16(sums512[6], sums512[7]);
3278 const __m512i sum512_0123 = _mm512_adds_epu16(sum512_01, sum512_23);
3279 const __m512i sum512_4567 = _mm512_adds_epu16(sum512_45, sum512_67);
3280 const __m512i sum512 = _mm512_adds_epu16(sum512_0123, sum512_4567);
3281 const __m256i sum_lo = _mm512_castsi512_si256(sum512);
3282 const __m256i sum_hi = _mm512_extracti64x4_epi64(sum512, 1);
3283 const __m256i sad = _mm256_adds_epu16(sum_lo, sum_hi);
3284 const __m128i sad_lo = _mm256_castsi256_si128(sad);
3285 const __m128i sad_hi = _mm256_extracti128_si256(sad, 1);
3286 const __m128i minpos_lo = _mm_minpos_epu16(sad_lo);
3287 const __m128i minpos_hi = _mm_minpos_epu16(sad_hi);
3288 const uint32_t min0 = _mm_extract_epi16(minpos_lo, 0);
3289 const uint32_t min1 = _mm_extract_epi16(minpos_hi, 0);
3290 uint32_t minmin, delta;
3291 __m128i minpos;
3292
3293 if (min0 <= min1) {
3294 minmin = min0;
3295 delta = 0;
3296 minpos = minpos_lo;
3297 } else {
3298 minmin = min1;
3299 delta = 8;
3300 minpos = minpos_hi;
3301 }
3302
3303 if (minmin < best_s) {
3304 if (minmin != 0xFFFF) { // no overflow
3305 best_s = minmin;
3306 best_x = delta + _mm_extract_epi16(minpos, 1);
3307 best_y = y;
3308 } else { // overflow
3309 __m256i sads256[2];
3310 __m128i sads128[2];
3311
3312 add16x16x8to32bit(sums512, sads256);
3313
3314 sads128[0] = _mm256_castsi256_si128(sads256[0]);
3315 sads128[1] = _mm256_extracti128_si256(sads256[0], 1);
3316 update_8_best(sads128, 0, y, &best_s, &best_x, &best_y);
3317
3318 sads128[0] = _mm256_castsi256_si128(sads256[1]);
3319 sads128[1] = _mm256_extracti128_si256(sads256[1], 1);
3320 update_8_best(sads128, 8, y, &best_s, &best_x, &best_y);
3321 }
3322 }
3323
3324 ref += src_stride_raw;
3325 } while (++y < search_area_height);
3326 }
3327 break;
3328
3329 default:
3330 sad_loop_kernel_generalized_avx512(src,
3331 src_stride,
3332 ref,
3333 ref_stride,
3334 height,
3335 width,
3336 best_sad,
3337 x_search_center,
3338 y_search_center,
3339 src_stride_raw,
3340 search_area_width,
3341 search_area_height);
3342 return;
3343 }
3344 } else {
3345 const uint32_t leftover = search_area_width & 7;
3346 __m128i mask128;
3347 __m256i mask256;
3348
3349 mask128 = _mm_set1_epi32(-1);
3350 for (x = 0; x < (int32_t)leftover; x++) { mask128 = _mm_slli_si128(mask128, 2); }
3351 mask256 = _mm256_insertf128_si256(_mm256_castsi128_si256(mask128), mask128, 1);
3352
3353 switch (width) {
3354 case 4:
3355 if (height <= 4) {
3356 y = 0;
3357 do {
3358 for (x = 0; x <= search_area_width - 8; x += 8) {
3359 __m128i sum = _mm_setzero_si128();
3360
3361 s = src;
3362 r = ref + x;
3363
3364 h = height;
3365 while (h >= 2) {
3366 sad_loop_kernel_4_sse4_1(s, src_stride, r, ref_stride, &sum);
3367 s += 2 * src_stride;
3368 r += 2 * ref_stride;
3369 h -= 2;
3370 };
3371
3372 if (h) {
3373 sad_loop_kernel_4_oneline_sse4_1(s, r, &sum);
3374 }
3375
3376 update_best(sum, x, y, &best_s, &best_x, &best_y);
3377 }
3378
3379 if (leftover) {
3380 __m128i sum = _mm_setzero_si128();
3381
3382 s = src;
3383 r = ref + x;
3384
3385 h = height;
3386 while (h >= 2) {
3387 sad_loop_kernel_4_sse4_1(s, src_stride, r, ref_stride, &sum);
3388 s += 2 * src_stride;
3389 r += 2 * ref_stride;
3390 h -= 2;
3391 };
3392
3393 if (h) {
3394 sad_loop_kernel_4_oneline_sse4_1(s, r, &sum);
3395 }
3396
3397 sum = _mm_or_si128(sum, mask128);
3398 update_best(sum, x, y, &best_s, &best_x, &best_y);
3399 }
3400
3401 ref += src_stride_raw;
3402 } while (++y < search_area_height);
3403 } else {
3404 y = 0;
3405 do {
3406 for (x = 0; x <= search_area_width - 8; x += 8) {
3407 __m256i sum256 = _mm256_setzero_si256();
3408
3409 s = src;
3410 r = ref + x;
3411
3412 h = height;
3413 while (h >= 2) {
3414 sad_loop_kernel_4_avx2(s, src_stride, r, ref_stride, &sum256);
3415 s += 2 * src_stride;
3416 r += 2 * ref_stride;
3417 h -= 2;
3418 };
3419
3420 if (h) {
3421 sad_loop_kernel_4_oneline_avx2(s, r, &sum256);
3422 }
3423
3424 update_small_pel(sum256, x, y, &best_s, &best_x, &best_y);
3425 }
3426
3427 if (leftover) {
3428 __m256i sum256 = _mm256_setzero_si256();
3429
3430 s = src;
3431 r = ref + x;
3432
3433 h = height;
3434 while (h >= 2) {
3435 sad_loop_kernel_4_avx2(s, src_stride, r, ref_stride, &sum256);
3436 s += 2 * src_stride;
3437 r += 2 * ref_stride;
3438 h -= 2;
3439 };
3440 if (h) {
3441 sad_loop_kernel_4_oneline_avx2(s, r, &sum256);
3442 }
3443
3444 update_leftover_small_pel(sum256, x, y, mask128, &best_s, &best_x, &best_y);
3445 }
3446
3447 ref += src_stride_raw;
3448 } while (++y < search_area_height);
3449 }
3450 break;
3451
3452 case 6:
3453 if (height <= 4) {
3454 y = 0;
3455 do {
3456 for (x = 0; x <= search_area_width - 8; x += 8) {
3457 __m128i sum = _mm_setzero_si128();
3458
3459 s = src;
3460 r = ref + x;
3461
3462 h = height;
3463 while (h >= 2) {
3464 sad_loop_kernel_4_sse4_1(s, src_stride, r, ref_stride, &sum);
3465 s += 2 * src_stride;
3466 r += 2 * ref_stride;
3467 h -= 2;
3468 };
3469
3470 if (h) {
3471 sad_loop_kernel_4_oneline_sse4_1(s, r, &sum);
3472 }
3473
3474 __m128i sum2 = complement_4_to_6(
3475 ref + x, ref_stride, src, src_stride, height, 8);
3476 sum = _mm_adds_epu16(sum, sum2);
3477
3478 update_best(sum, x, y, &best_s, &best_x, &best_y);
3479 }
3480
3481 if (leftover) {
3482 __m128i sum = _mm_setzero_si128();
3483
3484 s = src;
3485 r = ref + x;
3486
3487 h = height;
3488 while (h >= 2) {
3489 sad_loop_kernel_4_sse4_1(s, src_stride, r, ref_stride, &sum);
3490 s += 2 * src_stride;
3491 r += 2 * ref_stride;
3492 h -= 2;
3493 };
3494
3495 if (h) {
3496 sad_loop_kernel_4_oneline_sse4_1(s, r, &sum);
3497 }
3498
3499 sum = _mm_or_si128(sum, mask128);
3500
3501 __m128i sum2 = complement_4_to_6(
3502 ref + x, ref_stride, src, src_stride, height, leftover);
3503 sum = _mm_adds_epu16(sum, sum2);
3504
3505 update_best(sum, x, y, &best_s, &best_x, &best_y);
3506 }
3507
3508 ref += src_stride_raw;
3509 } while (++y < search_area_height);
3510 } else {
3511 y = 0;
3512 do {
3513 for (x = 0; x <= search_area_width - 8; x += 8) {
3514 __m256i sum256 = _mm256_setzero_si256();
3515
3516 s = src;
3517 r = ref + x;
3518
3519 h = height;
3520 while (h >= 2) {
3521 sad_loop_kernel_4_avx2(s, src_stride, r, ref_stride, &sum256);
3522 s += 2 * src_stride;
3523 r += 2 * ref_stride;
3524 h -= 2;
3525 };
3526
3527 if (h) {
3528 sad_loop_kernel_4_oneline_avx2(s, r, &sum256);
3529 }
3530
3531 __m128i sum = complement_4_to_6(
3532 ref + x, ref_stride, src, src_stride, height, 8);
3533 sum256 = _mm256_adds_epu16(
3534 sum256, _mm256_insertf128_si256(_mm256_setzero_si256(), sum, 0));
3535
3536 update_small_pel(sum256, x, y, &best_s, &best_x, &best_y);
3537 }
3538
3539 if (leftover) {
3540 __m256i sum256 = _mm256_setzero_si256();
3541
3542 s = src;
3543 r = ref + x;
3544
3545 h = height;
3546 while (h >= 2) {
3547 sad_loop_kernel_4_avx2(s, src_stride, r, ref_stride, &sum256);
3548 s += 2 * src_stride;
3549 r += 2 * ref_stride;
3550 h -= 2;
3551 };
3552
3553 if (h) {
3554 sad_loop_kernel_4_oneline_avx2(s, r, &sum256);
3555 }
3556
3557 __m128i sum = complement_4_to_6(
3558 ref + x, ref_stride, src, src_stride, height, leftover);
3559 sum256 = _mm256_adds_epu16(
3560 sum256, _mm256_insertf128_si256(_mm256_setzero_si256(), sum, 0));
3561
3562 update_leftover_small_pel(sum256, x, y, mask128, &best_s, &best_x, &best_y);
3563 }
3564
3565 ref += src_stride_raw;
3566 } while (++y < search_area_height);
3567 }
3568 break;
3569
3570 case 8:
3571 // Note: Tried _mm512_dbsad_epu8 but is even slower.
3572 y = 0;
3573 do {
3574 for (x = 0; x <= search_area_width - 8; x += 8) {
3575 __m256i sum256 = _mm256_setzero_si256();
3576
3577 s = src;
3578 r = ref + x;
3579
3580 h = height;
3581 while (h >= 2) {
3582 sad_loop_kernel_8_avx2(s, src_stride, r, ref_stride, &sum256);
3583 s += 2 * src_stride;
3584 r += 2 * ref_stride;
3585 h -= 2;
3586 };
3587
3588 if (h) {
3589 sad_loop_kernel_8_oneline_avx2(s, r, &sum256);
3590 };
3591
3592 update_small_pel(sum256, x, y, &best_s, &best_x, &best_y);
3593 }
3594
3595 if (leftover) {
3596 __m256i sum256 = _mm256_setzero_si256();
3597
3598 s = src;
3599 r = ref + x;
3600
3601 h = height;
3602 while (h >= 2) {
3603 sad_loop_kernel_8_avx2(s, src_stride, r, ref_stride, &sum256);
3604 s += 2 * src_stride;
3605 r += 2 * ref_stride;
3606 h -= 2;
3607 };
3608
3609 if (h) {
3610 sad_loop_kernel_8_oneline_avx2(s, r, &sum256);
3611 };
3612
3613 update_leftover_small_pel(sum256, x, y, mask128, &best_s, &best_x, &best_y);
3614 }
3615
3616 ref += src_stride_raw;
3617 } while (++y < search_area_height);
3618 break;
3619
3620 case 12:
3621 if (height <= 16) {
3622 y = 0;
3623 do {
3624 for (x = 0; x <= search_area_width - 16; x += 16) {
3625 __m512i sum512 = _mm512_setzero_si512();
3626
3627 s = src;
3628 r = ref + x;
3629
3630 h = height;
3631 while (h >= 2) {
3632 sad_loop_kernel_12_avx512(s, src_stride, r, ref_stride, &sum512);
3633 s += 2 * src_stride;
3634 r += 2 * ref_stride;
3635 h -= 2;
3636 };
3637
3638 if (h) {
3639 sad_loop_kernel_12_oneline_avx512(s, r, &sum512);
3640 }
3641
3642 update_256_pel(sum512, x, y, &best_s, &best_x, &best_y);
3643 }
3644
3645 // leftover
3646 for (; x < search_area_width; x += 8) {
3647 __m256i sum256 = _mm256_setzero_si256();
3648
3649 s = src;
3650 r = ref + x;
3651
3652 h = height;
3653 while (h >= 2) {
3654 sad_loop_kernel_12_avx2(s, src_stride, r, ref_stride, &sum256);
3655 s += 2 * src_stride;
3656 r += 2 * ref_stride;
3657 h -= 2;
3658 };
3659
3660 if (h) {
3661 sad_loop_kernel_12_oneline_avx2(s, r, &sum256);
3662 }
3663
3664 update_leftover_256_pel(
3665 sum256, search_area_width, x, y, mask128, &best_s, &best_x, &best_y);
3666 }
3667
3668 ref += src_stride_raw;
3669 } while (++y < search_area_height);
3670 } else if (height <= 32) {
3671 y = 0;
3672 do {
3673 for (x = 0; x <= search_area_width - 16; x += 16) {
3674 __m512i sum512 = _mm512_setzero_si512();
3675
3676 s = src;
3677 r = ref + x;
3678
3679 h = height;
3680 while (h >= 2) {
3681 sad_loop_kernel_12_avx512(s, src_stride, r, ref_stride, &sum512);
3682 s += 2 * src_stride;
3683 r += 2 * ref_stride;
3684 h -= 2;
3685 };
3686
3687 if (h) {
3688 sad_loop_kernel_12_oneline_avx512(s, r, &sum512);
3689 }
3690
3691 update_512_pel(sum512, x, y, &best_s, &best_x, &best_y);
3692 }
3693
3694 // leftover
3695 for (; x < search_area_width; x += 8) {
3696 __m256i sum256 = _mm256_setzero_si256();
3697
3698 s = src;
3699 r = ref + x;
3700
3701 h = height;
3702 while (h >= 2) {
3703 sad_loop_kernel_12_avx2(s, src_stride, r, ref_stride, &sum256);
3704 s += 2 * src_stride;
3705 r += 2 * ref_stride;
3706 h -= 2;
3707 };
3708
3709 if (h) {
3710 sad_loop_kernel_12_oneline_avx2(s, r, &sum256);
3711 }
3712
3713 update_leftover_512_pel(
3714 sum256, search_area_width, x, y, mask256, &best_s, &best_x, &best_y);
3715 }
3716
3717 ref += src_stride_raw;
3718 } while (++y < search_area_height);
3719 } else {
3720 const uint32_t leftover16 = search_area_width & 15;
3721
3722 y = 0;
3723 do {
3724 for (x = 0; x <= search_area_width - 16; x += 16) {
3725 __m512i sums512[2] = {_mm512_setzero_si512(), _mm512_setzero_si512()};
3726
3727 s = src;
3728 r = ref + x;
3729
3730 h = height;
3731 while (h >= 2) {
3732 sad_loop_kernel_12_2sum_avx512(s, src_stride, r, ref_stride, sums512);
3733 s += 2 * src_stride;
3734 r += 2 * ref_stride;
3735 h -= 2;
3736 };
3737
3738 if (h) {
3739 sad_loop_kernel_12_2sum_oneline_avx512(s, r, sums512);
3740 }
3741
3742 update_1024_pel(sums512, x, y, &best_s, &best_x, &best_y);
3743 }
3744
3745 if (leftover16 >= 8) {
3746 __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
3747
3748 s = src;
3749 r = ref + x;
3750
3751 h = height;
3752 while (h >= 2) {
3753 sad_loop_kernel_12_2sum_avx2(s, src_stride, r, ref_stride, sums256);
3754 s += 2 * src_stride;
3755 r += 2 * ref_stride;
3756 h -= 2;
3757 };
3758
3759 if (h) {
3760 sad_loop_kernel_12_2sum_oneline_avx2(s, r, sums256);
3761 }
3762
3763 update_leftover8_1024_pel(sums256, x, y, &best_s, &best_x, &best_y);
3764 x += 8;
3765 }
3766
3767 if (leftover) {
3768 __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
3769
3770 s = src;
3771 r = ref + x;
3772
3773 h = height;
3774 while (h >= 2) {
3775 sad_loop_kernel_12_2sum_avx2(s, src_stride, r, ref_stride, sums256);
3776 s += 2 * src_stride;
3777 r += 2 * ref_stride;
3778 h -= 2;
3779 };
3780
3781 if (h) {
3782 sad_loop_kernel_12_2sum_oneline_avx2(s, r, sums256);
3783 }
3784
3785 update_leftover_1024_pel(sums256,
3786 search_area_width,
3787 x,
3788 y,
3789 leftover,
3790 mask128,
3791 &best_s,
3792 &best_x,
3793 &best_y);
3794 }
3795
3796 ref += src_stride_raw;
3797 } while (++y < search_area_height);
3798 }
3799 break;
3800
3801 case 16:
3802 if (height <= 16) {
3803 y = 0;
3804 do {
3805 for (x = 0; x <= search_area_width - 16; x += 16) {
3806 __m512i sum512 = _mm512_setzero_si512();
3807
3808 s = src;
3809 r = ref + x;
3810
3811 h = height;
3812 while (h >= 2) {
3813 sad_loop_kernel_16_avx512(s, src_stride, r, ref_stride, &sum512);
3814 s += 2 * src_stride;
3815 r += 2 * ref_stride;
3816 h -= 2;
3817 };
3818
3819 if (h) {
3820 sad_loop_kernel_16_oneline_avx512(s, r, &sum512);
3821 };
3822
3823 update_256_pel(sum512, x, y, &best_s, &best_x, &best_y);
3824 }
3825
3826 // leftover
3827 for (; x < search_area_width; x += 8) {
3828 __m256i sum256 = _mm256_setzero_si256();
3829
3830 s = src;
3831 r = ref + x;
3832
3833 h = height;
3834 while (h >= 2) {
3835 sad_loop_kernel_16_avx2(s, src_stride, r, ref_stride, &sum256);
3836 s += 2 * src_stride;
3837 r += 2 * ref_stride;
3838 h -= 2;
3839 }
3840
3841 if (h) {
3842 sad_loop_kernel_16_oneline_avx2(s, r, &sum256);
3843 }
3844
3845 update_leftover_256_pel(
3846 sum256, search_area_width, x, y, mask128, &best_s, &best_x, &best_y);
3847 }
3848
3849 ref += src_stride_raw;
3850 } while (++y < search_area_height);
3851 } else if (height <= 32) {
3852 y = 0;
3853 do {
3854 for (x = 0; x <= search_area_width - 16; x += 16) {
3855 __m512i sum512 = _mm512_setzero_si512();
3856
3857 s = src;
3858 r = ref + x;
3859
3860 h = height;
3861 while (h >= 2) {
3862 sad_loop_kernel_16_avx512(s, src_stride, r, ref_stride, &sum512);
3863 s += 2 * src_stride;
3864 r += 2 * ref_stride;
3865 h -= 2;
3866 }
3867
3868 if (h) {
3869 sad_loop_kernel_16_oneline_avx512(s, r, &sum512);
3870 }
3871
3872 update_512_pel(sum512, x, y, &best_s, &best_x, &best_y);
3873 }
3874
3875 // leftover
3876 for (; x < search_area_width; x += 8) {
3877 __m256i sum256 = _mm256_setzero_si256();
3878
3879 s = src;
3880 r = ref + x;
3881
3882 h = height;
3883 while (h >= 2) {
3884 sad_loop_kernel_16_avx2(s, src_stride, r, ref_stride, &sum256);
3885 s += 2 * src_stride;
3886 r += 2 * ref_stride;
3887 h -= 2;
3888 }
3889
3890 if (h) {
3891 sad_loop_kernel_16_oneline_avx2(s, r, &sum256);
3892 }
3893
3894 update_leftover_512_pel(
3895 sum256, search_area_width, x, y, mask256, &best_s, &best_x, &best_y);
3896 }
3897
3898 ref += src_stride_raw;
3899 } while (++y < search_area_height);
3900 } else {
3901 const uint32_t leftover16 = search_area_width & 15;
3902
3903 y = 0;
3904 do {
3905 for (x = 0; x <= search_area_width - 16; x += 16) {
3906 __m512i sums512[2] = {_mm512_setzero_si512(), _mm512_setzero_si512()};
3907
3908 s = src;
3909 r = ref + x;
3910
3911 h = height;
3912 while (h >= 2) {
3913 sad_loop_kernel_16_2sum_avx512(s, src_stride, r, ref_stride, sums512);
3914 s += 2 * src_stride;
3915 r += 2 * ref_stride;
3916 h -= 2;
3917 }
3918
3919 if (h) {
3920 sad_loop_kernel_16_2sum_oneline_avx512(s, r, sums512);
3921 }
3922
3923 update_1024_pel(sums512, x, y, &best_s, &best_x, &best_y);
3924 }
3925
3926 if (leftover16 >= 8) {
3927 __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
3928
3929 s = src;
3930 r = ref + x;
3931
3932 h = height;
3933 while (h >= 2) {
3934 sad_loop_kernel_16_2sum_avx2(s, src_stride, r, ref_stride, sums256);
3935 s += 2 * src_stride;
3936 r += 2 * ref_stride;
3937 h -= 2;
3938 }
3939
3940 if (h) {
3941 sad_loop_kernel_16_2sum_oneline_avx2(s, r, sums256);
3942 }
3943
3944 update_leftover8_1024_pel(sums256, x, y, &best_s, &best_x, &best_y);
3945 x += 8;
3946 }
3947
3948 if (leftover) {
3949 __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
3950
3951 s = src;
3952 r = ref + x;
3953
3954 h = height;
3955 while (h >= 2) {
3956 sad_loop_kernel_16_2sum_avx2(s, src_stride, r, ref_stride, sums256);
3957 s += 2 * src_stride;
3958 r += 2 * ref_stride;
3959 h -= 2;
3960 }
3961
3962 if (h) {
3963 sad_loop_kernel_16_2sum_oneline_avx2(s, r, sums256);
3964 }
3965
3966 update_leftover_1024_pel(sums256,
3967 search_area_width,
3968 x,
3969 y,
3970 leftover,
3971 mask128,
3972 &best_s,
3973 &best_x,
3974 &best_y);
3975 }
3976
3977 ref += src_stride_raw;
3978 } while (++y < search_area_height);
3979 }
3980 break;
3981
3982 case 24:
3983 if (height <= 16) {
3984 y = 0;
3985 do {
3986 for (x = 0; x <= search_area_width - 16; x += 16) {
3987 __m512i sum512 = _mm512_setzero_si512();
3988 __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
3989
3990 s = src;
3991 r = ref + x;
3992
3993 h = height;
3994 while (h >= 2) {
3995 sad_loop_kernel_16_avx512(s, src_stride, r, ref_stride, &sum512);
3996 sad_loop_kernel_8_avx2(
3997 s + 16, src_stride, r + 16, ref_stride, &sums256[0]);
3998 sad_loop_kernel_8_avx2(
3999 s + 16, src_stride, r + 24, ref_stride, &sums256[1]);
4000 s += 2 * src_stride;
4001 r += 2 * ref_stride;
4002 h -= 2;
4003 };
4004
4005 if (h) {
4006 sad_loop_kernel_16_oneline_avx512(s, r, &sum512);
4007 sad_loop_kernel_8_oneline_avx2(s + 16, r + 16, &sums256[0]);
4008 sad_loop_kernel_8_oneline_avx2(s + 16, r + 24, &sums256[1]);
4009 }
4010
4011 update_384_pel(sum512, sums256, x, y, &best_s, &best_x, &best_y);
4012 }
4013
4014 // leftover
4015 for (; x < search_area_width; x += 8) {
4016 __m256i sum256 = _mm256_setzero_si256();
4017
4018 s = src;
4019 r = ref + x;
4020
4021 h = height;
4022 while (h >= 2) {
4023 sad_loop_kernel_16_avx2(s, src_stride, r, ref_stride, &sum256);
4024 sad_loop_kernel_8_avx2(s + 16, src_stride, r + 16, ref_stride, &sum256);
4025 s += 2 * src_stride;
4026 r += 2 * ref_stride;
4027 h -= 2;
4028 };
4029
4030 if (h) {
4031 sad_loop_kernel_16_oneline_avx2(s, r, &sum256);
4032 sad_loop_kernel_8_oneline_avx2(s + 16, r + 16, &sum256);
4033 }
4034
4035 update_leftover_512_pel(
4036 sum256, search_area_width, x, y, mask256, &best_s, &best_x, &best_y);
4037 }
4038
4039 ref += src_stride_raw;
4040 } while (++y < search_area_height);
4041 } else {
4042 const uint32_t leftover16 = search_area_width & 15;
4043
4044 y = 0;
4045 do {
4046 for (x = 0; x <= search_area_width - 16; x += 16) {
4047 __m512i sum512 = _mm512_setzero_si512();
4048 __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
4049
4050 s = src;
4051 r = ref + x;
4052
4053 h = height;
4054 while (h >= 2) {
4055 sad_loop_kernel_16_avx512(s, src_stride, r, ref_stride, &sum512);
4056 sad_loop_kernel_8_avx2(
4057 s + 16, src_stride, r + 16, ref_stride, &sums256[0]);
4058 sad_loop_kernel_8_avx2(
4059 s + 16, src_stride, r + 24, ref_stride, &sums256[1]);
4060 s += 2 * src_stride;
4061 r += 2 * ref_stride;
4062 h -= 2;
4063 };
4064
4065 if (h) {
4066 sad_loop_kernel_16_oneline_avx512(s, r, &sum512);
4067 sad_loop_kernel_8_oneline_avx2(s + 16, r + 16, &sums256[0]);
4068 sad_loop_kernel_8_oneline_avx2(s + 16, r + 24, &sums256[1]);
4069 }
4070
4071 update_768_pel(sum512, sums256, x, y, &best_s, &best_x, &best_y);
4072 }
4073
4074 // leftover
4075 if (leftover16 >= 8) {
4076 __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
4077
4078 s = src;
4079 r = ref + x;
4080
4081 h = height;
4082 while (h >= 2) {
4083 sad_loop_kernel_16_avx2(s, src_stride, r, ref_stride, &sums256[0]);
4084 sad_loop_kernel_8_avx2(
4085 s + 16, src_stride, r + 16, ref_stride, &sums256[1]);
4086 s += 2 * src_stride;
4087 r += 2 * ref_stride;
4088 h -= 2;
4089 };
4090
4091 if (h) {
4092 sad_loop_kernel_16_oneline_avx2(s, r, &sums256[0]);
4093 sad_loop_kernel_8_oneline_avx2(s + 16, r + 16, &sums256[1]);
4094 }
4095
4096 update_leftover8_1024_pel(sums256, x, y, &best_s, &best_x, &best_y);
4097
4098 x += 8;
4099 }
4100
4101 if (leftover) {
4102 __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
4103
4104 s = src;
4105 r = ref + x;
4106
4107 h = height;
4108 while (h >= 2) {
4109 sad_loop_kernel_16_avx2(s, src_stride, r, ref_stride, &sums256[0]);
4110 sad_loop_kernel_8_avx2(
4111 s + 16, src_stride, r + 16, ref_stride, &sums256[1]);
4112 s += 2 * src_stride;
4113 r += 2 * ref_stride;
4114 h -= 2;
4115 };
4116
4117 if (h) {
4118 sad_loop_kernel_16_oneline_avx2(s, r, &sums256[0]);
4119 sad_loop_kernel_8_oneline_avx2(s + 16, r + 16, &sums256[1]);
4120 }
4121
4122 update_leftover_1024_pel(sums256,
4123 search_area_width,
4124 x,
4125 y,
4126 leftover,
4127 mask128,
4128 &best_s,
4129 &best_x,
4130 &best_y);
4131 }
4132
4133 ref += src_stride_raw;
4134 } while (++y < search_area_height);
4135 }
4136 break;
4137
4138 case 32:
4139 if (height <= 16) {
4140 y = 0;
4141 do {
4142 for (x = 0; x <= search_area_width - 16; x += 16) {
4143 __m512i sum512 = _mm512_setzero_si512();
4144
4145 s = src;
4146 r = ref + x;
4147
4148 // Note: faster than looping 2 rows.
4149 h = height;
4150 do {
4151 sad_loop_kernel_32_avx512(s, r, &sum512);
4152 s += src_stride;
4153 r += ref_stride;
4154 } while (--h);
4155
4156 update_512_pel(sum512, x, y, &best_s, &best_x, &best_y);
4157 }
4158
4159 // leftover
4160 for (; x < search_area_width; x += 8) {
4161 __m256i sum256 = _mm256_setzero_si256();
4162
4163 s = src;
4164 r = ref + x;
4165
4166 h = height;
4167 do {
4168 sad_loop_kernel_32_avx2(s, r, &sum256);
4169 s += src_stride;
4170 r += ref_stride;
4171 } while (--h);
4172
4173 update_leftover_512_pel(
4174 sum256, search_area_width, x, y, mask256, &best_s, &best_x, &best_y);
4175 }
4176
4177 ref += src_stride_raw;
4178 } while (++y < search_area_height);
4179 } else if (height <= 32) {
4180 const uint32_t leftover16 = search_area_width & 15;
4181
4182 y = 0;
4183 do {
4184 for (x = 0; x <= search_area_width - 16; x += 16) {
4185 __m512i sums512[2] = {_mm512_setzero_si512(), _mm512_setzero_si512()};
4186
4187 s = src;
4188 r = ref + x;
4189
4190 h = height;
4191 do {
4192 sad_loop_kernel_32_2sum_avx512(s, r, sums512);
4193 s += src_stride;
4194 r += ref_stride;
4195 } while (--h);
4196
4197 update_1024_pel(sums512, x, y, &best_s, &best_x, &best_y);
4198 }
4199
4200 if (leftover16 >= 8) {
4201 __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
4202
4203 s = src;
4204 r = ref + x;
4205
4206 h = height;
4207 do {
4208 sad_loop_kernel_32_2sum_avx2(s, r, sums256);
4209 s += src_stride;
4210 r += ref_stride;
4211 } while (--h);
4212
4213 update_leftover8_1024_pel(sums256, x, y, &best_s, &best_x, &best_y);
4214 x += 8;
4215 }
4216
4217 if (leftover) {
4218 __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
4219
4220 s = src;
4221 r = ref + x;
4222
4223 h = height;
4224 do {
4225 sad_loop_kernel_32_2sum_avx2(s, r, sums256);
4226 s += src_stride;
4227 r += ref_stride;
4228 } while (--h);
4229
4230 update_leftover_1024_pel(sums256,
4231 search_area_width,
4232 x,
4233 y,
4234 leftover,
4235 mask128,
4236 &best_s,
4237 &best_x,
4238 &best_y);
4239 }
4240
4241 ref += src_stride_raw;
4242 } while (++y < search_area_height);
4243 } else {
4244 const uint32_t leftover16 = search_area_width & 15;
4245
4246 y = 0;
4247 do {
4248 for (x = 0; x <= search_area_width - 16; x += 16) {
4249 __m512i sums512[4] = {_mm512_setzero_si512(),
4250 _mm512_setzero_si512(),
4251 _mm512_setzero_si512(),
4252 _mm512_setzero_si512()};
4253
4254 s = src;
4255 r = ref + x;
4256
4257 h = height;
4258 do {
4259 sad_loop_kernel_32_4sum_avx512(s, r, sums512);
4260 s += src_stride;
4261 r += ref_stride;
4262 } while (--h);
4263
4264 update_2048_pel(sums512, x, y, &best_s, &best_x, &best_y);
4265 }
4266
4267 if (leftover16 >= 8) {
4268 __m256i sums256[4] = {_mm256_setzero_si256(),
4269 _mm256_setzero_si256(),
4270 _mm256_setzero_si256(),
4271 _mm256_setzero_si256()};
4272
4273 s = src;
4274 r = ref + x;
4275
4276 h = height;
4277 do {
4278 sad_loop_kernel_32_4sum_avx2(s, r, sums256);
4279 s += src_stride;
4280 r += ref_stride;
4281 } while (--h);
4282
4283 update_leftover8_2048_pel(sums256, x, y, &best_s, &best_x, &best_y);
4284 x += 8;
4285 }
4286
4287 if (leftover) {
4288 __m256i sums256[4] = {_mm256_setzero_si256(),
4289 _mm256_setzero_si256(),
4290 _mm256_setzero_si256(),
4291 _mm256_setzero_si256()};
4292
4293 s = src;
4294 r = ref + x;
4295
4296 h = height;
4297 do {
4298 sad_loop_kernel_32_4sum_avx2(s, r, sums256);
4299 s += src_stride;
4300 r += ref_stride;
4301 } while (--h);
4302
4303 update_leftover_2048_pel(sums256,
4304 search_area_width,
4305 x,
4306 y,
4307 leftover,
4308 mask128,
4309 &best_s,
4310 &best_x,
4311 &best_y);
4312 }
4313
4314 ref += src_stride_raw;
4315 } while (++y < search_area_height);
4316 }
4317 break;
4318
4319 case 48:
4320 if (height <= 32) {
4321 const uint32_t leftover16 = search_area_width & 15;
4322
4323 y = 0;
4324 do {
4325 for (x = 0; x <= search_area_width - 16; x += 16) {
4326 __m512i sums512[3] = {
4327 _mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512()};
4328
4329 s = src;
4330 r = ref + x;
4331
4332 h = height2;
4333 do {
4334 sad_loop_kernel_32_2sum_avx512(s, r, sums512);
4335 sad_loop_kernel_32_2sum_avx512(s + src_stride, r + ref_stride, sums512);
4336 sad_loop_kernel_16_avx512(
4337 s + 32, src_stride, r + 32, ref_stride, &sums512[2]);
4338 s += 2 * src_stride;
4339 r += 2 * ref_stride;
4340 } while (--h);
4341
4342 update_1536_pel(sums512, x, y, &best_s, &best_x, &best_y);
4343 }
4344
4345 if (leftover16 >= 8) {
4346 __m256i sums256[3] = {
4347 _mm256_setzero_si256(), _mm256_setzero_si256(), _mm256_setzero_si256()};
4348
4349 s = src;
4350 r = ref + x;
4351
4352 h = height2;
4353 do {
4354 sad_loop_kernel_32_2sum_avx2(s, r, sums256);
4355 sad_loop_kernel_32_2sum_avx2(s + src_stride, r + ref_stride, sums256);
4356 sad_loop_kernel_16_avx2(
4357 s + 32, src_stride, r + 32, ref_stride, &sums256[2]);
4358 s += 2 * src_stride;
4359 r += 2 * ref_stride;
4360 } while (--h);
4361
4362 update_leftover8_1536_pel(sums256, x, y, &best_s, &best_x, &best_y);
4363 x += 8;
4364 }
4365
4366 if (leftover) {
4367 __m256i sums256[3] = {
4368 _mm256_setzero_si256(), _mm256_setzero_si256(), _mm256_setzero_si256()};
4369
4370 s = src;
4371 r = ref + x;
4372
4373 h = height2;
4374 do {
4375 sad_loop_kernel_32_2sum_avx2(s, r, sums256);
4376 sad_loop_kernel_32_2sum_avx2(s + src_stride, r + ref_stride, sums256);
4377 sad_loop_kernel_16_avx2(
4378 s + 32, src_stride, r + 32, ref_stride, &sums256[2]);
4379 s += 2 * src_stride;
4380 r += 2 * ref_stride;
4381 } while (--h);
4382
4383 update_leftover_1536_pel(sums256,
4384 search_area_width,
4385 x,
4386 y,
4387 leftover,
4388 mask128,
4389 &best_s,
4390 &best_x,
4391 &best_y);
4392 }
4393
4394 ref += src_stride_raw;
4395 } while (++y < search_area_height);
4396 } else {
4397 const uint32_t leftover16 = search_area_width & 15;
4398
4399 y = 0;
4400 do {
4401 for (x = 0; x <= search_area_width - 16; x += 16) {
4402 __m512i sums512[6] = {_mm512_setzero_si512(),
4403 _mm512_setzero_si512(),
4404 _mm512_setzero_si512(),
4405 _mm512_setzero_si512(),
4406 _mm512_setzero_si512(),
4407 _mm512_setzero_si512()};
4408
4409 s = src;
4410 r = ref + x;
4411
4412 h = height2;
4413 do {
4414 sad_loop_kernel_32_4sum_avx512(s, r, sums512);
4415 sad_loop_kernel_32_4sum_avx512(s + src_stride, r + ref_stride, sums512);
4416 sad_loop_kernel_16_2sum_avx512(
4417 s + 32, src_stride, r + 32, ref_stride, &sums512[4]);
4418 s += 2 * src_stride;
4419 r += 2 * ref_stride;
4420 } while (--h);
4421
4422 const __m512i sum512_01 = _mm512_adds_epu16(sums512[0], sums512[1]);
4423 const __m512i sum512_23 = _mm512_adds_epu16(sums512[2], sums512[3]);
4424 const __m512i sum512_45 = _mm512_adds_epu16(sums512[4], sums512[5]);
4425 const __m512i sum512_0123 = _mm512_adds_epu16(sum512_01, sum512_23);
4426 const __m512i sum512 = _mm512_adds_epu16(sum512_0123, sum512_45);
4427 const __m256i sum_lo = _mm512_castsi512_si256(sum512);
4428 const __m256i sum_hi = _mm512_extracti64x4_epi64(sum512, 1);
4429 const __m256i sad = _mm256_adds_epu16(sum_lo, sum_hi);
4430 const __m128i sad_lo = _mm256_castsi256_si128(sad);
4431 const __m128i sad_hi = _mm256_extracti128_si256(sad, 1);
4432 const __m128i minpos_lo = _mm_minpos_epu16(sad_lo);
4433 const __m128i minpos_hi = _mm_minpos_epu16(sad_hi);
4434 const uint32_t min0 = _mm_extract_epi16(minpos_lo, 0);
4435 const uint32_t min1 = _mm_extract_epi16(minpos_hi, 0);
4436 uint32_t minmin, delta;
4437 __m128i minpos;
4438
4439 if (min0 <= min1) {
4440 minmin = min0;
4441 delta = 0;
4442 minpos = minpos_lo;
4443 } else {
4444 minmin = min1;
4445 delta = 8;
4446 minpos = minpos_hi;
4447 }
4448
4449 if (minmin < best_s) {
4450 if (minmin != 0xFFFF) { // no overflow
4451 best_s = minmin;
4452 best_x = x + delta + _mm_extract_epi16(minpos, 1);
4453 best_y = y;
4454 } else { // overflow
4455 __m256i sads256[2];
4456 __m128i sads128[2];
4457
4458 add16x16x6to32bit(sums512, sads256);
4459
4460 sads128[0] = _mm256_castsi256_si128(sads256[0]);
4461 sads128[1] = _mm256_extracti128_si256(sads256[0], 1);
4462 update_8_best(sads128, x + 0, y, &best_s, &best_x, &best_y);
4463
4464 sads128[0] = _mm256_castsi256_si128(sads256[1]);
4465 sads128[1] = _mm256_extracti128_si256(sads256[1], 1);
4466 update_8_best(sads128, x + 8, y, &best_s, &best_x, &best_y);
4467 }
4468 }
4469 }
4470
4471 if (leftover16 >= 8) {
4472 __m256i sums256[6] = {_mm256_setzero_si256(),
4473 _mm256_setzero_si256(),
4474 _mm256_setzero_si256(),
4475 _mm256_setzero_si256(),
4476 _mm256_setzero_si256(),
4477 _mm256_setzero_si256()};
4478
4479 s = src;
4480 r = ref + x;
4481
4482 h = height2;
4483 do {
4484 sad_loop_kernel_32_4sum_avx2(s, r, sums256);
4485 sad_loop_kernel_32_4sum_avx2(s + src_stride, r + ref_stride, sums256);
4486 sad_loop_kernel_16_2sum_avx2(
4487 s + 32, src_stride, r + 32, ref_stride, &sums256[4]);
4488 s += 2 * src_stride;
4489 r += 2 * ref_stride;
4490 } while (--h);
4491
4492 const __m256i sum01 = _mm256_adds_epu16(sums256[0], sums256[1]);
4493 const __m256i sum23 = _mm256_adds_epu16(sums256[2], sums256[3]);
4494 const __m256i sum45 = _mm256_adds_epu16(sums256[4], sums256[5]);
4495 const __m256i sum0123 = _mm256_adds_epu16(sum01, sum23);
4496 const __m256i sum256 = _mm256_adds_epu16(sum0123, sum45);
4497 const __m128i sum_lo = _mm256_castsi256_si128(sum256);
4498 const __m128i sum_hi = _mm256_extracti128_si256(sum256, 1);
4499 const __m128i sad = _mm_adds_epu16(sum_lo, sum_hi);
4500 const __m128i minpos = _mm_minpos_epu16(sad);
4501 const uint32_t min0 = _mm_extract_epi16(minpos, 0);
4502
4503 if (min0 < best_s) {
4504 if (min0 != 0xFFFF) { // no overflow
4505 best_s = min0;
4506 best_x = x + _mm_extract_epi16(minpos, 1);
4507 best_y = y;
4508 } else { // overflow
4509 __m128i sads[2];
4510
4511 add16x8x6to32bit(sums256, sads);
4512 update_8_best(sads, x, y, &best_s, &best_x, &best_y);
4513 }
4514 }
4515
4516 x += 8;
4517 }
4518
4519 if (leftover) {
4520 __m256i sums256[6] = {_mm256_setzero_si256(),
4521 _mm256_setzero_si256(),
4522 _mm256_setzero_si256(),
4523 _mm256_setzero_si256(),
4524 _mm256_setzero_si256(),
4525 _mm256_setzero_si256()};
4526
4527 s = src;
4528 r = ref + x;
4529
4530 h = height2;
4531 do {
4532 sad_loop_kernel_32_4sum_avx2(s, r, sums256);
4533 sad_loop_kernel_32_4sum_avx2(s + src_stride, r + ref_stride, sums256);
4534 sad_loop_kernel_16_2sum_avx2(
4535 s + 32, src_stride, r + 32, ref_stride, &sums256[4]);
4536 s += 2 * src_stride;
4537 r += 2 * ref_stride;
4538 } while (--h);
4539
4540 const __m256i sum01 = _mm256_adds_epu16(sums256[0], sums256[1]);
4541 const __m256i sum23 = _mm256_adds_epu16(sums256[2], sums256[3]);
4542 const __m256i sum45 = _mm256_adds_epu16(sums256[4], sums256[5]);
4543 const __m256i sum0123 = _mm256_adds_epu16(sum01, sum23);
4544 const __m256i sum256 = _mm256_adds_epu16(sum0123, sum45);
4545 const __m128i sum_lo = _mm256_castsi256_si128(sum256);
4546 const __m128i sum_hi = _mm256_extracti128_si256(sum256, 1);
4547 const __m128i sad0 = _mm_adds_epu16(sum_lo, sum_hi);
4548 const __m128i sad1 = _mm_or_si128(sad0, mask128);
4549 const __m128i minpos = _mm_minpos_epu16(sad1);
4550 const uint32_t min0 = _mm_extract_epi16(minpos, 0);
4551
4552 if (min0 < best_s) {
4553 if (min0 != 0xFFFF) { // no overflow
4554 best_s = min0;
4555 best_x = x + _mm_extract_epi16(minpos, 1);
4556 best_y = y;
4557 } else { // overflow
4558 const int32_t num = x + ((leftover < 4) ? leftover : 4);
4559 __m128i sads[2];
4560
4561 add16x8x6to32bit(sums256, sads);
4562
4563 do {
4564 UPDATE_BEST(sads[0], 0, x, best_s, best_x, best_y);
4565 sads[0] = _mm_srli_si128(sads[0], 4);
4566 } while (++x < num);
4567
4568 while (x < search_area_width) {
4569 UPDATE_BEST(sads[1], 0, x, best_s, best_x, best_y);
4570 sads[1] = _mm_srli_si128(sads[1], 4);
4571 x++;
4572 }
4573 }
4574 }
4575 }
4576
4577 ref += src_stride_raw;
4578 } while (++y < search_area_height);
4579 }
4580 break;
4581
4582 case 64:
4583 if (height <= 16) {
4584 const uint32_t leftover16 = search_area_width & 15;
4585
4586 y = 0;
4587 do {
4588 for (x = 0; x <= search_area_width - 16; x += 16) {
4589 __m512i sums512[2] = {_mm512_setzero_si512(), _mm512_setzero_si512()};
4590
4591 s = src;
4592 r = ref + x;
4593
4594 h = height;
4595 do {
4596 sad_loop_kernel_32_2sum_avx512(s + 0 * 32, r + 0 * 32, sums512);
4597 sad_loop_kernel_32_2sum_avx512(s + 1 * 32, r + 1 * 32, sums512);
4598 s += src_stride;
4599 r += ref_stride;
4600 } while (--h);
4601
4602 update_1024_pel(sums512, x, y, &best_s, &best_x, &best_y);
4603 }
4604
4605 if (leftover16 >= 8) {
4606 __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
4607
4608 s = src;
4609 r = ref + x;
4610
4611 h = height;
4612 do {
4613 sad_loop_kernel_64_2sum_avx2(s, r, sums256);
4614 s += src_stride;
4615 r += ref_stride;
4616 } while (--h);
4617
4618 update_leftover8_1024_pel(sums256, x, y, &best_s, &best_x, &best_y);
4619 x += 8;
4620 }
4621
4622 if (leftover) {
4623 __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
4624
4625 s = src;
4626 r = ref + x;
4627
4628 h = height;
4629 do {
4630 sad_loop_kernel_64_2sum_avx2(s, r, sums256);
4631 s += src_stride;
4632 r += ref_stride;
4633 } while (--h);
4634
4635 update_leftover_1024_pel(sums256,
4636 search_area_width,
4637 x,
4638 y,
4639 leftover,
4640 mask128,
4641 &best_s,
4642 &best_x,
4643 &best_y);
4644 }
4645
4646 ref += src_stride_raw;
4647 } while (++y < search_area_height);
4648 } else if (height <= 32) {
4649 const uint32_t leftover16 = search_area_width & 15;
4650
4651 y = 0;
4652 do {
4653 for (x = 0; x <= search_area_width - 16; x += 16) {
4654 __m512i sums512[4] = {_mm512_setzero_si512(),
4655 _mm512_setzero_si512(),
4656 _mm512_setzero_si512(),
4657 _mm512_setzero_si512()};
4658
4659 s = src;
4660 r = ref + x;
4661
4662 h = height;
4663 do {
4664 sad_loop_kernel_32_4sum_avx512(s + 0 * 32, r + 0 * 32, sums512);
4665 sad_loop_kernel_32_4sum_avx512(s + 1 * 32, r + 1 * 32, sums512);
4666 s += src_stride;
4667 r += ref_stride;
4668 } while (--h);
4669
4670 update_2048_pel(sums512, x, y, &best_s, &best_x, &best_y);
4671 }
4672
4673 if (leftover16 >= 8) {
4674 __m256i sums256[4] = {_mm256_setzero_si256(),
4675 _mm256_setzero_si256(),
4676 _mm256_setzero_si256(),
4677 _mm256_setzero_si256()};
4678
4679 s = src;
4680 r = ref + x;
4681
4682 h = height;
4683 do {
4684 sad_loop_kernel_64_4sum_avx2(s, r, sums256);
4685 s += src_stride;
4686 r += ref_stride;
4687 } while (--h);
4688
4689 update_leftover8_2048_pel(sums256, x, y, &best_s, &best_x, &best_y);
4690 x += 8;
4691 }
4692
4693 if (leftover) {
4694 __m256i sums256[4] = {_mm256_setzero_si256(),
4695 _mm256_setzero_si256(),
4696 _mm256_setzero_si256(),
4697 _mm256_setzero_si256()};
4698
4699 s = src;
4700 r = ref + x;
4701
4702 h = height;
4703 do {
4704 sad_loop_kernel_64_4sum_avx2(s, r, sums256);
4705 s += src_stride;
4706 r += ref_stride;
4707 } while (--h);
4708
4709 update_leftover_2048_pel(sums256,
4710 search_area_width,
4711 x,
4712 y,
4713 leftover,
4714 mask128,
4715 &best_s,
4716 &best_x,
4717 &best_y);
4718 }
4719
4720 ref += src_stride_raw;
4721 } while (++y < search_area_height);
4722 } else {
4723 const uint32_t leftover16 = search_area_width & 15;
4724
4725 y = 0;
4726 do {
4727 for (x = 0; x <= search_area_width - 16; x += 16) {
4728 __m512i sums512[8] = {_mm512_setzero_si512(),
4729 _mm512_setzero_si512(),
4730 _mm512_setzero_si512(),
4731 _mm512_setzero_si512(),
4732 _mm512_setzero_si512(),
4733 _mm512_setzero_si512(),
4734 _mm512_setzero_si512(),
4735 _mm512_setzero_si512()};
4736
4737 s = src;
4738 r = ref + x;
4739
4740 h = height;
4741 do {
4742 sad_loop_kernel_32_4sum_avx512(s + 0 * 32, r + 0 * 32, sums512 + 0);
4743 sad_loop_kernel_32_4sum_avx512(s + 1 * 32, r + 1 * 32, sums512 + 4);
4744 s += src_stride;
4745 r += ref_stride;
4746 } while (--h);
4747
4748 const __m512i sum512_01 = _mm512_adds_epu16(sums512[0], sums512[1]);
4749 const __m512i sum512_23 = _mm512_adds_epu16(sums512[2], sums512[3]);
4750 const __m512i sum512_45 = _mm512_adds_epu16(sums512[4], sums512[5]);
4751 const __m512i sum512_67 = _mm512_adds_epu16(sums512[6], sums512[7]);
4752 const __m512i sum512_0123 = _mm512_adds_epu16(sum512_01, sum512_23);
4753 const __m512i sum512_4567 = _mm512_adds_epu16(sum512_45, sum512_67);
4754 const __m512i sum512 = _mm512_adds_epu16(sum512_0123, sum512_4567);
4755 const __m256i sum_lo = _mm512_castsi512_si256(sum512);
4756 const __m256i sum_hi = _mm512_extracti64x4_epi64(sum512, 1);
4757 const __m256i sad = _mm256_adds_epu16(sum_lo, sum_hi);
4758 const __m128i sad_lo = _mm256_castsi256_si128(sad);
4759 const __m128i sad_hi = _mm256_extracti128_si256(sad, 1);
4760 const __m128i minpos_lo = _mm_minpos_epu16(sad_lo);
4761 const __m128i minpos_hi = _mm_minpos_epu16(sad_hi);
4762 const uint32_t min0 = _mm_extract_epi16(minpos_lo, 0);
4763 const uint32_t min1 = _mm_extract_epi16(minpos_hi, 0);
4764 uint32_t minmin, delta;
4765 __m128i minpos;
4766
4767 if (min0 <= min1) {
4768 minmin = min0;
4769 delta = 0;
4770 minpos = minpos_lo;
4771 } else {
4772 minmin = min1;
4773 delta = 8;
4774 minpos = minpos_hi;
4775 }
4776
4777 if (minmin < best_s) {
4778 if (minmin != 0xFFFF) { // no overflow
4779 best_s = minmin;
4780 best_x = x + delta + _mm_extract_epi16(minpos, 1);
4781 best_y = y;
4782 } else { // overflow
4783 __m256i sads256[2];
4784 __m128i sads128[2];
4785
4786 add16x16x8to32bit(sums512, sads256);
4787
4788 sads128[0] = _mm256_castsi256_si128(sads256[0]);
4789 sads128[1] = _mm256_extracti128_si256(sads256[0], 1);
4790 update_8_best(sads128, x + 0, y, &best_s, &best_x, &best_y);
4791
4792 sads128[0] = _mm256_castsi256_si128(sads256[1]);
4793 sads128[1] = _mm256_extracti128_si256(sads256[1], 1);
4794 update_8_best(sads128, x + 8, y, &best_s, &best_x, &best_y);
4795 }
4796 }
4797 }
4798
4799 if (leftover16 >= 8) {
4800 __m256i sums256[8] = {_mm256_setzero_si256(),
4801 _mm256_setzero_si256(),
4802 _mm256_setzero_si256(),
4803 _mm256_setzero_si256(),
4804 _mm256_setzero_si256(),
4805 _mm256_setzero_si256(),
4806 _mm256_setzero_si256(),
4807 _mm256_setzero_si256()};
4808
4809 s = src;
4810 r = ref + x;
4811
4812 h = height;
4813 do {
4814 sad_loop_kernel_64_8sum_avx2(s, r, sums256);
4815 s += src_stride;
4816 r += ref_stride;
4817 } while (--h);
4818
4819 const __m256i sum01 = _mm256_adds_epu16(sums256[0], sums256[1]);
4820 const __m256i sum23 = _mm256_adds_epu16(sums256[2], sums256[3]);
4821 const __m256i sum45 = _mm256_adds_epu16(sums256[4], sums256[5]);
4822 const __m256i sum67 = _mm256_adds_epu16(sums256[6], sums256[7]);
4823 const __m256i sum0123 = _mm256_adds_epu16(sum01, sum23);
4824 const __m256i sum4567 = _mm256_adds_epu16(sum45, sum67);
4825 const __m256i sum256 = _mm256_adds_epu16(sum0123, sum4567);
4826 const __m128i sum_lo = _mm256_castsi256_si128(sum256);
4827 const __m128i sum_hi = _mm256_extracti128_si256(sum256, 1);
4828 const __m128i sad = _mm_adds_epu16(sum_lo, sum_hi);
4829 const __m128i minpos = _mm_minpos_epu16(sad);
4830 const uint32_t min0 = _mm_extract_epi16(minpos, 0);
4831
4832 if (min0 < best_s) {
4833 if (min0 != 0xFFFF) { // no overflow
4834 best_s = min0;
4835 best_x = x + _mm_extract_epi16(minpos, 1);
4836 best_y = y;
4837 } else { // overflow
4838 __m128i sads[2];
4839
4840 add16x8x8to32bit(sums256, sads);
4841 update_8_best(sads, x, y, &best_s, &best_x, &best_y);
4842 }
4843 }
4844
4845 x += 8;
4846 }
4847
4848 if (leftover) {
4849 __m256i sums256[8] = {_mm256_setzero_si256(),
4850 _mm256_setzero_si256(),
4851 _mm256_setzero_si256(),
4852 _mm256_setzero_si256(),
4853 _mm256_setzero_si256(),
4854 _mm256_setzero_si256(),
4855 _mm256_setzero_si256(),
4856 _mm256_setzero_si256()};
4857
4858 s = src;
4859 r = ref + x;
4860
4861 h = height;
4862 do {
4863 sad_loop_kernel_64_8sum_avx2(s, r, sums256);
4864 s += src_stride;
4865 r += ref_stride;
4866 } while (--h);
4867
4868 const __m256i sum01 = _mm256_adds_epu16(sums256[0], sums256[1]);
4869 const __m256i sum23 = _mm256_adds_epu16(sums256[2], sums256[3]);
4870 const __m256i sum45 = _mm256_adds_epu16(sums256[4], sums256[5]);
4871 const __m256i sum67 = _mm256_adds_epu16(sums256[6], sums256[7]);
4872 const __m256i sum0123 = _mm256_adds_epu16(sum01, sum23);
4873 const __m256i sum4567 = _mm256_adds_epu16(sum45, sum67);
4874 const __m256i sum256 = _mm256_adds_epu16(sum0123, sum4567);
4875 const __m128i sum_lo = _mm256_castsi256_si128(sum256);
4876 const __m128i sum_hi = _mm256_extracti128_si256(sum256, 1);
4877 const __m128i sad0 = _mm_adds_epu16(sum_lo, sum_hi);
4878 const __m128i sad1 = _mm_or_si128(sad0, mask128);
4879 const __m128i minpos = _mm_minpos_epu16(sad1);
4880 const uint32_t min0 = _mm_extract_epi16(minpos, 0);
4881
4882 if (min0 < best_s) {
4883 if (min0 != 0xFFFF) { // no overflow
4884 best_s = min0;
4885 best_x = x + _mm_extract_epi16(minpos, 1);
4886 best_y = y;
4887 } else { // overflow
4888 const int32_t num = x + ((leftover < 4) ? leftover : 4);
4889 __m128i sads[2];
4890
4891 add16x8x8to32bit(sums256, sads);
4892
4893 do {
4894 UPDATE_BEST(sads[0], 0, x, best_s, best_x, best_y);
4895 sads[0] = _mm_srli_si128(sads[0], 4);
4896 } while (++x < num);
4897
4898 while (x < search_area_width) {
4899 UPDATE_BEST(sads[1], 0, x, best_s, best_x, best_y);
4900 sads[1] = _mm_srli_si128(sads[1], 4);
4901 x++;
4902 }
4903 }
4904 }
4905 }
4906
4907 ref += src_stride_raw;
4908 } while (++y < search_area_height);
4909 }
4910 break;
4911
4912 default:
4913 sad_loop_kernel_generalized_avx512(src,
4914 src_stride,
4915 ref,
4916 ref_stride,
4917 height,
4918 width,
4919 best_sad,
4920 x_search_center,
4921 y_search_center,
4922 src_stride_raw,
4923 search_area_width,
4924 search_area_height);
4925 return;
4926 }
4927 }
4928
4929 *best_sad = best_s;
4930 *x_search_center = (int16_t)best_x;
4931 *y_search_center = (int16_t)best_y;
4932 }
4933 #endif // EN_AVX512_SUPPORT
4934