1 /*
2 * Copyright(c) 2019 Intel Corporation
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
10 */
11 
12 #include "EbDefinitions.h"
13 
14 #if EN_AVX512_SUPPORT
15 #include <assert.h>
16 #include "EbComputeSAD_AVX2.h"
17 #include <immintrin.h>
18 #include "EbMemory_AVX2.h"
19 #include "transpose_avx2.h"
20 #include "EbUtility.h"
21 #include "EbComputeSAD_C.h"
22 
sad64_kernel_avx512(const __m512i s,const uint8_t * const ref,__m512i * const sum)23 static INLINE void sad64_kernel_avx512(const __m512i s, const uint8_t *const ref,
24                                        __m512i *const sum) {
25     const __m512i r = _mm512_loadu_si512((__m512i *)ref);
26     *sum            = _mm512_add_epi32(*sum, _mm512_sad_epu8(s, r));
27 }
28 
sad64_avx512(const uint8_t * const src,const uint8_t * ref,__m512i * const sum)29 static INLINE void sad64_avx512(const uint8_t *const src, const uint8_t *ref, __m512i *const sum) {
30     const __m512i s = _mm512_loadu_si512((__m512i *)src);
31     sad64_kernel_avx512(s, ref, sum);
32 }
33 
sad_final_avx512(const __m512i zmm)34 static INLINE uint32_t sad_final_avx512(const __m512i zmm) {
35     const __m256i zmm_L  = _mm512_castsi512_si256(zmm);
36     const __m256i zmm_H  = _mm512_extracti64x4_epi64(zmm, 1);
37     const __m256i ymm    = _mm256_add_epi32(zmm_L, zmm_H);
38     const __m128i ymm_L  = _mm256_castsi256_si128(ymm);
39     const __m128i ymm_H  = _mm256_extracti128_si256(ymm, 1);
40     const __m128i xmm0   = _mm_add_epi32(ymm_L, ymm_H);
41     const __m128i xmm0_H = _mm_srli_si128(xmm0, 8);
42     const __m128i xmm1   = _mm_add_epi32(xmm0, xmm0_H);
43 
44     return _mm_extract_epi32(xmm1, 0);
45 }
46 
47 /*******************************************************************************
48 * Requirement: height % 2 = 0
49 *******************************************************************************/
compute64x_m_sad_avx512_intrin(const uint8_t * src,const uint32_t src_stride,const uint8_t * ref,const uint32_t ref_stride,const uint32_t height)50 static AOM_FORCE_INLINE uint32_t compute64x_m_sad_avx512_intrin(const uint8_t *src,
51                                                                 const uint32_t src_stride,
52                                                                 const uint8_t *ref,
53                                                                 const uint32_t ref_stride,
54                                                                 const uint32_t height) {
55     uint32_t y   = height;
56     __m512i  zmm = _mm512_setzero_si512();
57 
58     do {
59         sad64_avx512(src + 0 * src_stride, ref + 0 * ref_stride, &zmm);
60         sad64_avx512(src + 1 * src_stride, ref + 1 * ref_stride, &zmm);
61         src += src_stride << 1;
62         ref += ref_stride << 1;
63         y -= 2;
64     } while (y);
65 
66     return sad_final_avx512(zmm);
67 }
68 
69 /*******************************************************************************
70 * Requirement: height % 2 = 0
71 *******************************************************************************/
72 SIMD_INLINE uint32_t
compute128x_m_sad_avx512_intrin(const uint8_t * src,const uint32_t src_stride,const uint8_t * ref,const uint32_t ref_stride,const uint32_t height)73 compute128x_m_sad_avx512_intrin(const uint8_t *src, // input parameter, source samples Ptr
74                                 const uint32_t src_stride, // input parameter, source stride
75                                 const uint8_t *ref, // input parameter, reference samples Ptr
76                                 const uint32_t ref_stride, // input parameter, reference stride
77                                 const uint32_t height) // input parameter, block height (M)
78 {
79     uint32_t y   = height;
80     __m512i  zmm = _mm512_setzero_si512();
81 
82     do {
83         sad64_avx512(src + 0 * src_stride + 0 * 64, ref + 0 * ref_stride + 0 * 64, &zmm);
84         sad64_avx512(src + 0 * src_stride + 1 * 64, ref + 0 * ref_stride + 1 * 64, &zmm);
85         sad64_avx512(src + 1 * src_stride + 0 * 64, ref + 1 * ref_stride + 0 * 64, &zmm);
86         sad64_avx512(src + 1 * src_stride + 1 * 64, ref + 1 * ref_stride + 1 * 64, &zmm);
87         src += src_stride << 1;
88         ref += ref_stride << 1;
89         y -= 2;
90     } while (y);
91 
92     return sad_final_avx512(zmm);
93 }
94 
/* SAD of a 64x16 block (AVX-512 path); forwards to the shared 64-wide kernel. */
uint32_t svt_aom_sad64x16_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
                                 int ref_stride) {
    return compute64x_m_sad_avx512_intrin(src_ptr, src_stride, ref_ptr, ref_stride, 16);
}
99 
/* SAD of a 64x32 block (AVX-512 path); forwards to the shared 64-wide kernel. */
uint32_t svt_aom_sad64x32_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
                                 int ref_stride) {
    return compute64x_m_sad_avx512_intrin(src_ptr, src_stride, ref_ptr, ref_stride, 32);
}
104 
/* SAD of a 64x64 block (AVX-512 path); forwards to the shared 64-wide kernel. */
uint32_t svt_aom_sad64x64_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
                                 int ref_stride) {
    return compute64x_m_sad_avx512_intrin(src_ptr, src_stride, ref_ptr, ref_stride, 64);
}
109 
/* SAD of a 64x128 block (AVX-512 path); forwards to the shared 64-wide kernel. */
uint32_t svt_aom_sad64x128_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
                                  int ref_stride) {
    return compute64x_m_sad_avx512_intrin(src_ptr, src_stride, ref_ptr, ref_stride, 128);
}
114 
/* SAD of a 128x64 block (AVX-512 path); forwards to the shared 128-wide kernel. */
uint32_t svt_aom_sad128x64_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
                                  int ref_stride) {
    return compute128x_m_sad_avx512_intrin(src_ptr, src_stride, ref_ptr, ref_stride, 64);
}
119 
/* SAD of a 128x128 block (AVX-512 path); forwards to the shared 128-wide kernel. */
uint32_t svt_aom_sad128x128_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
                                   int ref_stride) {
    return compute128x_m_sad_avx512_intrin(src_ptr, src_stride, ref_ptr, ref_stride, 128);
}
124 
sad64_4d_avx512(const uint8_t * const src,const uint8_t * const ref_array[4],const uint32_t offset,__m512i sum[4])125 static INLINE void sad64_4d_avx512(const uint8_t *const src, const uint8_t *const ref_array[4],
126                                    const uint32_t offset, __m512i sum[4]) {
127     const __m512i s = _mm512_loadu_si512((__m512i *)(src + offset));
128     sad64_kernel_avx512(s, ref_array[0] + offset, &sum[0]);
129     sad64_kernel_avx512(s, ref_array[1] + offset, &sum[1]);
130     sad64_kernel_avx512(s, ref_array[2] + offset, &sum[2]);
131     sad64_kernel_avx512(s, ref_array[3] + offset, &sum[3]);
132 }
133 
add_hi_lo_32_avx512(const __m512i src)134 static INLINE __m256i add_hi_lo_32_avx512(const __m512i src) {
135     const __m256i s0 = _mm512_castsi512_si256(src);
136     const __m256i s1 = _mm512_extracti64x4_epi64(src, 1);
137     return _mm256_add_epi32(s0, s1);
138 }
139 
hadd_four_32_avx2(const __m256i src0,const __m256i src1,const __m256i src2,const __m256i src3)140 static INLINE __m128i hadd_four_32_avx2(const __m256i src0, const __m256i src1, const __m256i src2,
141                                         const __m256i src3) {
142     const __m256i s01   = _mm256_hadd_epi32(src0, src1); // 0 0 1 1  0 0 1 1
143     const __m256i s23   = _mm256_hadd_epi32(src2, src3); // 2 2 3 3  2 2 3 3
144     const __m256i s0123 = _mm256_hadd_epi32(s01, s23); // 0 1 2 3  0 1 2 3
145     const __m128i sum0  = _mm256_castsi256_si128(s0123); // 0 1 2 3
146     const __m128i sum1  = _mm256_extracti128_si256(s0123, 1); // 0 1 2 3
147     return _mm_add_epi32(sum0, sum1); // 0 1 2 3
148 }
149 
hadd_four_32_avx512(const __m512i src0,const __m512i src1,const __m512i src2,const __m512i src3)150 static INLINE __m128i hadd_four_32_avx512(const __m512i src0, const __m512i src1,
151                                           const __m512i src2, const __m512i src3) {
152     __m256i s[4];
153 
154     s[0] = add_hi_lo_32_avx512(src0);
155     s[1] = add_hi_lo_32_avx512(src1);
156     s[2] = add_hi_lo_32_avx512(src2);
157     s[3] = add_hi_lo_32_avx512(src3);
158 
159     return hadd_four_32_avx2(s[0], s[1], s[2], s[3]);
160 }
161 
compute128x_m_4d_sad_avx512_intrin(const uint8_t * src,const uint32_t src_stride,const uint8_t * const ref_array[4],const uint32_t ref_stride,uint32_t sad_array[4],const uint32_t height)162 SIMD_INLINE void compute128x_m_4d_sad_avx512_intrin(const uint8_t *src, const uint32_t src_stride,
163                                                     const uint8_t *const ref_array[4],
164                                                     const uint32_t       ref_stride,
165                                                     uint32_t sad_array[4], const uint32_t height) {
166     const uint8_t *ref[4];
167     uint32_t       y      = height;
168     __m512i        zmm[4] = {0};
169 
170     ref[0] = ref_array[0];
171     ref[1] = ref_array[1];
172     ref[2] = ref_array[2];
173     ref[3] = ref_array[3];
174 
175     do {
176         sad64_4d_avx512(src, ref, 0 * 64, zmm);
177         sad64_4d_avx512(src, ref, 1 * 64, zmm);
178         src += src_stride;
179         ref[0] += ref_stride;
180         ref[1] += ref_stride;
181         ref[2] += ref_stride;
182         ref[3] += ref_stride;
183     } while (--y);
184 
185     const __m128i sum = hadd_four_32_avx512(zmm[0], zmm[1], zmm[2], zmm[3]);
186     _mm_storeu_si128((__m128i *)sad_array, sum);
187 }
188 
/* 4-candidate SAD of a 128x64 block (AVX-512 path); one SAD per reference. */
void svt_aom_sad128x64x4d_avx512(const uint8_t *src, int src_stride,
                                 const uint8_t *const ref_array[4], int ref_stride,
                                 uint32_t sad_array[4]) {
    compute128x_m_4d_sad_avx512_intrin(src, src_stride, ref_array, ref_stride, sad_array, 64);
}
194 
/* 4-candidate SAD of a 128x128 block (AVX-512 path); one SAD per reference. */
void svt_aom_sad128x128x4d_avx512(const uint8_t *src, int src_stride,
                                  const uint8_t *const ref_array[4], int ref_stride,
                                  uint32_t sad_array[4]) {
    compute128x_m_4d_sad_avx512_intrin(src, src_stride, ref_array, ref_stride, sad_array, 128);
}
200 
201 // =============================================================================
202 
add16x8x2to32bit(const __m256i sads256[2],__m128i sads128[2])203 static INLINE void add16x8x2to32bit(const __m256i sads256[2], __m128i sads128[2]) {
204     const __m256i zero        = _mm256_setzero_si256();
205     const __m256i sad256_0_lo = _mm256_unpacklo_epi16(sads256[0], zero);
206     const __m256i sad256_0_hi = _mm256_unpackhi_epi16(sads256[0], zero);
207     const __m256i sad256_1_lo = _mm256_unpacklo_epi16(sads256[1], zero);
208     const __m256i sad256_1_hi = _mm256_unpackhi_epi16(sads256[1], zero);
209     const __m256i sad256_lo   = _mm256_add_epi32(sad256_0_lo, sad256_1_lo);
210     const __m256i sad256_hi   = _mm256_add_epi32(sad256_0_hi, sad256_1_hi);
211     const __m128i sad128_ll   = _mm256_castsi256_si128(sad256_lo);
212     const __m128i sad128_lh   = _mm256_extracti128_si256(sad256_lo, 1);
213     const __m128i sad128_hl   = _mm256_castsi256_si128(sad256_hi);
214     const __m128i sad128_hh   = _mm256_extracti128_si256(sad256_hi, 1);
215     sads128[0]                = _mm_add_epi32(sad128_ll, sad128_lh);
216     sads128[1]                = _mm_add_epi32(sad128_hl, sad128_hh);
217 }
218 
add16x8x3to32bit(const __m256i sads256[3],__m128i sads128[2])219 SIMD_INLINE void add16x8x3to32bit(const __m256i sads256[3], __m128i sads128[2]) {
220     const __m256i zero         = _mm256_setzero_si256();
221     const __m256i sad256_0_lo  = _mm256_unpacklo_epi16(sads256[0], zero);
222     const __m256i sad256_0_hi  = _mm256_unpackhi_epi16(sads256[0], zero);
223     const __m256i sad256_1_lo  = _mm256_unpacklo_epi16(sads256[1], zero);
224     const __m256i sad256_1_hi  = _mm256_unpackhi_epi16(sads256[1], zero);
225     const __m256i sad256_2_lo  = _mm256_unpacklo_epi16(sads256[2], zero);
226     const __m256i sad256_2_hi  = _mm256_unpackhi_epi16(sads256[2], zero);
227     const __m256i sad256_01_lo = _mm256_add_epi32(sad256_0_lo, sad256_1_lo);
228     const __m256i sad256_01_hi = _mm256_add_epi32(sad256_0_hi, sad256_1_hi);
229     const __m256i sad256_lo    = _mm256_add_epi32(sad256_01_lo, sad256_2_lo);
230     const __m256i sad256_hi    = _mm256_add_epi32(sad256_01_hi, sad256_2_hi);
231     const __m128i sad128_ll    = _mm256_castsi256_si128(sad256_lo);
232     const __m128i sad128_lh    = _mm256_extracti128_si256(sad256_lo, 1);
233     const __m128i sad128_hl    = _mm256_castsi256_si128(sad256_hi);
234     const __m128i sad128_hh    = _mm256_extracti128_si256(sad256_hi, 1);
235     sads128[0]                 = _mm_add_epi32(sad128_ll, sad128_lh);
236     sads128[1]                 = _mm_add_epi32(sad128_hl, sad128_hh);
237 }
238 
add16x8x4to32bit(const __m256i sads256[4],__m128i sads128[2])239 SIMD_INLINE void add16x8x4to32bit(const __m256i sads256[4], __m128i sads128[2]) {
240     const __m256i zero         = _mm256_setzero_si256();
241     const __m256i sad256_0_lo  = _mm256_unpacklo_epi16(sads256[0], zero);
242     const __m256i sad256_0_hi  = _mm256_unpackhi_epi16(sads256[0], zero);
243     const __m256i sad256_1_lo  = _mm256_unpacklo_epi16(sads256[1], zero);
244     const __m256i sad256_1_hi  = _mm256_unpackhi_epi16(sads256[1], zero);
245     const __m256i sad256_2_lo  = _mm256_unpacklo_epi16(sads256[2], zero);
246     const __m256i sad256_2_hi  = _mm256_unpackhi_epi16(sads256[2], zero);
247     const __m256i sad256_3_lo  = _mm256_unpacklo_epi16(sads256[3], zero);
248     const __m256i sad256_3_hi  = _mm256_unpackhi_epi16(sads256[3], zero);
249     const __m256i sad256_01_lo = _mm256_add_epi32(sad256_0_lo, sad256_1_lo);
250     const __m256i sad256_01_hi = _mm256_add_epi32(sad256_0_hi, sad256_1_hi);
251     const __m256i sad256_23_lo = _mm256_add_epi32(sad256_2_lo, sad256_3_lo);
252     const __m256i sad256_23_hi = _mm256_add_epi32(sad256_2_hi, sad256_3_hi);
253     const __m256i sad256_lo    = _mm256_add_epi32(sad256_01_lo, sad256_23_lo);
254     const __m256i sad256_hi    = _mm256_add_epi32(sad256_01_hi, sad256_23_hi);
255     const __m128i sad128_ll    = _mm256_castsi256_si128(sad256_lo);
256     const __m128i sad128_lh    = _mm256_extracti128_si256(sad256_lo, 1);
257     const __m128i sad128_hl    = _mm256_castsi256_si128(sad256_hi);
258     const __m128i sad128_hh    = _mm256_extracti128_si256(sad256_hi, 1);
259     sads128[0]                 = _mm_add_epi32(sad128_ll, sad128_lh);
260     sads128[1]                 = _mm_add_epi32(sad128_hl, sad128_hh);
261 }
262 
/* Zero-extend six vectors of 16-bit SADs to 32 bits, sum them element-wise
 * with a pairwise tree (short dependency chains for better scheduling), and
 * fold 256 -> 128 bits. sads128[0] receives the unpacklo lanes, sads128[1]
 * the unpackhi lanes. */
SIMD_INLINE void add16x8x6to32bit(const __m256i sads256[6], __m128i sads128[2]) {
    const __m256i zero           = _mm256_setzero_si256();
    /* Widen every input: interleave with zero to turn u16 lanes into u32. */
    const __m256i sad256_0_lo    = _mm256_unpacklo_epi16(sads256[0], zero);
    const __m256i sad256_0_hi    = _mm256_unpackhi_epi16(sads256[0], zero);
    const __m256i sad256_1_lo    = _mm256_unpacklo_epi16(sads256[1], zero);
    const __m256i sad256_1_hi    = _mm256_unpackhi_epi16(sads256[1], zero);
    const __m256i sad256_2_lo    = _mm256_unpacklo_epi16(sads256[2], zero);
    const __m256i sad256_2_hi    = _mm256_unpackhi_epi16(sads256[2], zero);
    const __m256i sad256_3_lo    = _mm256_unpacklo_epi16(sads256[3], zero);
    const __m256i sad256_3_hi    = _mm256_unpackhi_epi16(sads256[3], zero);
    const __m256i sad256_4_lo    = _mm256_unpacklo_epi16(sads256[4], zero);
    const __m256i sad256_4_hi    = _mm256_unpackhi_epi16(sads256[4], zero);
    const __m256i sad256_5_lo    = _mm256_unpacklo_epi16(sads256[5], zero);
    const __m256i sad256_5_hi    = _mm256_unpackhi_epi16(sads256[5], zero);
    /* Pairwise reduction tree: (0+1), (2+3), (4+5), then combine. */
    const __m256i sad256_01_lo   = _mm256_add_epi32(sad256_0_lo, sad256_1_lo);
    const __m256i sad256_01_hi   = _mm256_add_epi32(sad256_0_hi, sad256_1_hi);
    const __m256i sad256_23_lo   = _mm256_add_epi32(sad256_2_lo, sad256_3_lo);
    const __m256i sad256_23_hi   = _mm256_add_epi32(sad256_2_hi, sad256_3_hi);
    const __m256i sad256_45_lo   = _mm256_add_epi32(sad256_4_lo, sad256_5_lo);
    const __m256i sad256_45_hi   = _mm256_add_epi32(sad256_4_hi, sad256_5_hi);
    const __m256i sad256_0123_lo = _mm256_add_epi32(sad256_01_lo, sad256_23_lo);
    const __m256i sad256_0123_hi = _mm256_add_epi32(sad256_01_hi, sad256_23_hi);
    const __m256i sad256_lo      = _mm256_add_epi32(sad256_0123_lo, sad256_45_lo);
    const __m256i sad256_hi      = _mm256_add_epi32(sad256_0123_hi, sad256_45_hi);
    /* Fold 256 -> 128: add the two 128-bit halves of each widened sum. */
    const __m128i sad128_ll      = _mm256_castsi256_si128(sad256_lo);
    const __m128i sad128_lh      = _mm256_extracti128_si256(sad256_lo, 1);
    const __m128i sad128_hl      = _mm256_castsi256_si128(sad256_hi);
    const __m128i sad128_hh      = _mm256_extracti128_si256(sad256_hi, 1);
    sads128[0]                   = _mm_add_epi32(sad128_ll, sad128_lh);
    sads128[1]                   = _mm_add_epi32(sad128_hl, sad128_hh);
}
294 
/* Zero-extend eight vectors of 16-bit SADs to 32 bits, sum them element-wise
 * with a pairwise tree (short dependency chains for better scheduling), and
 * fold 256 -> 128 bits. sads128[0] receives the unpacklo lanes, sads128[1]
 * the unpackhi lanes. */
SIMD_INLINE void add16x8x8to32bit(const __m256i sads256[8], __m128i sads128[2]) {
    const __m256i zero           = _mm256_setzero_si256();
    /* Widen every input: interleave with zero to turn u16 lanes into u32. */
    const __m256i sad256_0_lo    = _mm256_unpacklo_epi16(sads256[0], zero);
    const __m256i sad256_0_hi    = _mm256_unpackhi_epi16(sads256[0], zero);
    const __m256i sad256_1_lo    = _mm256_unpacklo_epi16(sads256[1], zero);
    const __m256i sad256_1_hi    = _mm256_unpackhi_epi16(sads256[1], zero);
    const __m256i sad256_2_lo    = _mm256_unpacklo_epi16(sads256[2], zero);
    const __m256i sad256_2_hi    = _mm256_unpackhi_epi16(sads256[2], zero);
    const __m256i sad256_3_lo    = _mm256_unpacklo_epi16(sads256[3], zero);
    const __m256i sad256_3_hi    = _mm256_unpackhi_epi16(sads256[3], zero);
    const __m256i sad256_4_lo    = _mm256_unpacklo_epi16(sads256[4], zero);
    const __m256i sad256_4_hi    = _mm256_unpackhi_epi16(sads256[4], zero);
    const __m256i sad256_5_lo    = _mm256_unpacklo_epi16(sads256[5], zero);
    const __m256i sad256_5_hi    = _mm256_unpackhi_epi16(sads256[5], zero);
    const __m256i sad256_6_lo    = _mm256_unpacklo_epi16(sads256[6], zero);
    const __m256i sad256_6_hi    = _mm256_unpackhi_epi16(sads256[6], zero);
    const __m256i sad256_7_lo    = _mm256_unpacklo_epi16(sads256[7], zero);
    const __m256i sad256_7_hi    = _mm256_unpackhi_epi16(sads256[7], zero);
    /* Pairwise reduction tree of depth 3. */
    const __m256i sad256_01_lo   = _mm256_add_epi32(sad256_0_lo, sad256_1_lo);
    const __m256i sad256_01_hi   = _mm256_add_epi32(sad256_0_hi, sad256_1_hi);
    const __m256i sad256_23_lo   = _mm256_add_epi32(sad256_2_lo, sad256_3_lo);
    const __m256i sad256_23_hi   = _mm256_add_epi32(sad256_2_hi, sad256_3_hi);
    const __m256i sad256_45_lo   = _mm256_add_epi32(sad256_4_lo, sad256_5_lo);
    const __m256i sad256_45_hi   = _mm256_add_epi32(sad256_4_hi, sad256_5_hi);
    const __m256i sad256_67_lo   = _mm256_add_epi32(sad256_6_lo, sad256_7_lo);
    const __m256i sad256_67_hi   = _mm256_add_epi32(sad256_6_hi, sad256_7_hi);
    const __m256i sad256_0123_lo = _mm256_add_epi32(sad256_01_lo, sad256_23_lo);
    const __m256i sad256_0123_hi = _mm256_add_epi32(sad256_01_hi, sad256_23_hi);
    const __m256i sad256_4567_lo = _mm256_add_epi32(sad256_45_lo, sad256_67_lo);
    const __m256i sad256_4567_hi = _mm256_add_epi32(sad256_45_hi, sad256_67_hi);
    const __m256i sad256_lo      = _mm256_add_epi32(sad256_0123_lo, sad256_4567_lo);
    const __m256i sad256_hi      = _mm256_add_epi32(sad256_0123_hi, sad256_4567_hi);
    /* Fold 256 -> 128: add the two 128-bit halves of each widened sum. */
    const __m128i sad128_ll      = _mm256_castsi256_si128(sad256_lo);
    const __m128i sad128_lh      = _mm256_extracti128_si256(sad256_lo, 1);
    const __m128i sad128_hl      = _mm256_castsi256_si128(sad256_hi);
    const __m128i sad128_hh      = _mm256_extracti128_si256(sad256_hi, 1);
    sads128[0]                   = _mm_add_epi32(sad128_ll, sad128_lh);
    sads128[1]                   = _mm_add_epi32(sad128_hl, sad128_hh);
}
334 
add16x16x2to32bit(const __m512i sads512[2],__m256i sads256[2])335 SIMD_INLINE void add16x16x2to32bit(const __m512i sads512[2], __m256i sads256[2]) {
336     const __m512i zero = _mm512_setzero_si512();
337 
338     const __m512i sad512_0_lo = _mm512_unpacklo_epi16(sads512[0], zero);
339     const __m512i sad512_0_hi = _mm512_unpackhi_epi16(sads512[0], zero);
340     const __m512i sad512_1_lo = _mm512_unpacklo_epi16(sads512[1], zero);
341     const __m512i sad512_1_hi = _mm512_unpackhi_epi16(sads512[1], zero);
342 
343     // 0 1 2 3  8 9 A b   0 1 2 3  8 9 A b
344     // 4 5 6 7  C D E F   4 5 6 7  C D E F
345     const __m512i sad512_lo = _mm512_add_epi32(sad512_0_lo, sad512_1_lo);
346     const __m512i sad512_hi = _mm512_add_epi32(sad512_0_hi, sad512_1_hi);
347 
348     const __m256i sad256_ll = _mm512_castsi512_si256(sad512_lo);
349     const __m256i sad256_lh = _mm512_extracti64x4_epi64(sad512_lo, 1);
350     const __m256i sad256_hl = _mm512_castsi512_si256(sad512_hi);
351     const __m256i sad256_hh = _mm512_extracti64x4_epi64(sad512_hi, 1);
352 
353     // 0 1 2 3  8 9 A b
354     // 4 5 6 7  C D E F
355     const __m256i sad256_0 = _mm256_add_epi32(sad256_ll, sad256_lh);
356     const __m256i sad256_1 = _mm256_add_epi32(sad256_hl, sad256_hh);
357 
358     // 0 1 2 3  4 5 6 7
359     // 8 9 A b  C D E F
360     sads256[0] = _mm256_unpacklo_epi128(sad256_0, sad256_1);
361     sads256[1] = _mm256_unpackhi_epi128(sad256_0, sad256_1);
362 }
363 
add16x16x3to32bit(const __m512i sads512[3],__m256i sads256[2])364 SIMD_INLINE void add16x16x3to32bit(const __m512i sads512[3], __m256i sads256[2]) {
365     const __m512i zero = _mm512_setzero_si512();
366 
367     const __m512i sad512_0_lo = _mm512_unpacklo_epi16(sads512[0], zero);
368     const __m512i sad512_0_hi = _mm512_unpackhi_epi16(sads512[0], zero);
369     const __m512i sad512_1_lo = _mm512_unpacklo_epi16(sads512[1], zero);
370     const __m512i sad512_1_hi = _mm512_unpackhi_epi16(sads512[1], zero);
371     const __m512i sad512_2_lo = _mm512_unpacklo_epi16(sads512[2], zero);
372     const __m512i sad512_2_hi = _mm512_unpackhi_epi16(sads512[2], zero);
373 
374     const __m512i sad512_01_lo = _mm512_add_epi32(sad512_0_lo, sad512_1_lo);
375     const __m512i sad512_01_hi = _mm512_add_epi32(sad512_0_hi, sad512_1_hi);
376 
377     // 0 1 2 3  8 9 A b   0 1 2 3  8 9 A b
378     // 4 5 6 7  C D E F   4 5 6 7  C D E F
379     const __m512i sad512_lo = _mm512_add_epi32(sad512_01_lo, sad512_2_lo);
380     const __m512i sad512_hi = _mm512_add_epi32(sad512_01_hi, sad512_2_hi);
381 
382     const __m256i sad256_ll = _mm512_castsi512_si256(sad512_lo);
383     const __m256i sad256_lh = _mm512_extracti64x4_epi64(sad512_lo, 1);
384     const __m256i sad256_hl = _mm512_castsi512_si256(sad512_hi);
385     const __m256i sad256_hh = _mm512_extracti64x4_epi64(sad512_hi, 1);
386 
387     // 0 1 2 3  8 9 A b
388     // 4 5 6 7  C D E F
389     const __m256i sad256_0 = _mm256_add_epi32(sad256_ll, sad256_lh);
390     const __m256i sad256_1 = _mm256_add_epi32(sad256_hl, sad256_hh);
391 
392     // 0 1 2 3  4 5 6 7
393     // 8 9 A b  C D E F
394     sads256[0] = _mm256_unpacklo_epi128(sad256_0, sad256_1);
395     sads256[1] = _mm256_unpackhi_epi128(sad256_0, sad256_1);
396 }
397 
/* Zero-extend four vectors of 32 unsigned 16-bit SADs to 32 bits, sum them
 * element-wise with a pairwise tree, and fold 512 -> 256 bits. Output lane
 * order is ascending: sads256[0] = lanes 0..7, sads256[1] = lanes 8..15. */
SIMD_INLINE void add16x16x4to32bit(const __m512i sads512[4], __m256i sads256[2]) {
    // Don't call two add16x16x2to32bit(), which is slower.
    const __m512i zero = _mm512_setzero_si512();

    /* Widen every input: interleave with zero to turn u16 lanes into u32. */
    const __m512i sad512_0_lo = _mm512_unpacklo_epi16(sads512[0], zero);
    const __m512i sad512_0_hi = _mm512_unpackhi_epi16(sads512[0], zero);
    const __m512i sad512_1_lo = _mm512_unpacklo_epi16(sads512[1], zero);
    const __m512i sad512_1_hi = _mm512_unpackhi_epi16(sads512[1], zero);
    const __m512i sad512_2_lo = _mm512_unpacklo_epi16(sads512[2], zero);
    const __m512i sad512_2_hi = _mm512_unpackhi_epi16(sads512[2], zero);
    const __m512i sad512_3_lo = _mm512_unpacklo_epi16(sads512[3], zero);
    const __m512i sad512_3_hi = _mm512_unpackhi_epi16(sads512[3], zero);

    /* Pairwise reduction tree: (0+1), (2+3), then combine. */
    const __m512i sad512_01_lo = _mm512_add_epi32(sad512_0_lo, sad512_1_lo);
    const __m512i sad512_01_hi = _mm512_add_epi32(sad512_0_hi, sad512_1_hi);
    const __m512i sad512_23_lo = _mm512_add_epi32(sad512_2_lo, sad512_3_lo);
    const __m512i sad512_23_hi = _mm512_add_epi32(sad512_2_hi, sad512_3_hi);

    // 0 1 2 3  8 9 A b   0 1 2 3  8 9 A b
    // 4 5 6 7  C D E F   4 5 6 7  C D E F
    const __m512i sad512_lo = _mm512_add_epi32(sad512_01_lo, sad512_23_lo);
    const __m512i sad512_hi = _mm512_add_epi32(sad512_01_hi, sad512_23_hi);

    /* Fold 512 -> 256: add the two 256-bit halves of each widened sum. */
    const __m256i sad256_ll = _mm512_castsi512_si256(sad512_lo);
    const __m256i sad256_lh = _mm512_extracti64x4_epi64(sad512_lo, 1);
    const __m256i sad256_hl = _mm512_castsi512_si256(sad512_hi);
    const __m256i sad256_hh = _mm512_extracti64x4_epi64(sad512_hi, 1);

    // 0 1 2 3  8 9 A b
    // 4 5 6 7  C D E F
    const __m256i sad256_0 = _mm256_add_epi32(sad256_ll, sad256_lh);
    const __m256i sad256_1 = _mm256_add_epi32(sad256_hl, sad256_hh);

    // 0 1 2 3  4 5 6 7
    // 8 9 A b  C D E F
    sads256[0] = _mm256_unpacklo_epi128(sad256_0, sad256_1);
    sads256[1] = _mm256_unpackhi_epi128(sad256_0, sad256_1);
}
436 
/* Zero-extend six vectors of 32 unsigned 16-bit SADs to 32 bits, sum them
 * element-wise with a pairwise tree, and fold 512 -> 256 bits. Output lane
 * order is ascending: sads256[0] = lanes 0..7, sads256[1] = lanes 8..15. */
SIMD_INLINE void add16x16x6to32bit(const __m512i sads512[6], __m256i sads256[2]) {
    const __m512i zero = _mm512_setzero_si512();

    /* Widen every input: interleave with zero to turn u16 lanes into u32. */
    const __m512i sad512_0_lo = _mm512_unpacklo_epi16(sads512[0], zero);
    const __m512i sad512_0_hi = _mm512_unpackhi_epi16(sads512[0], zero);
    const __m512i sad512_1_lo = _mm512_unpacklo_epi16(sads512[1], zero);
    const __m512i sad512_1_hi = _mm512_unpackhi_epi16(sads512[1], zero);
    const __m512i sad512_2_lo = _mm512_unpacklo_epi16(sads512[2], zero);
    const __m512i sad512_2_hi = _mm512_unpackhi_epi16(sads512[2], zero);
    const __m512i sad512_3_lo = _mm512_unpacklo_epi16(sads512[3], zero);
    const __m512i sad512_3_hi = _mm512_unpackhi_epi16(sads512[3], zero);
    const __m512i sad512_4_lo = _mm512_unpacklo_epi16(sads512[4], zero);
    const __m512i sad512_4_hi = _mm512_unpackhi_epi16(sads512[4], zero);
    const __m512i sad512_5_lo = _mm512_unpacklo_epi16(sads512[5], zero);
    const __m512i sad512_5_hi = _mm512_unpackhi_epi16(sads512[5], zero);

    /* Pairwise reduction tree: (0+1), (2+3), (4+5), then combine. */
    const __m512i sad512_01_lo   = _mm512_add_epi32(sad512_0_lo, sad512_1_lo);
    const __m512i sad512_01_hi   = _mm512_add_epi32(sad512_0_hi, sad512_1_hi);
    const __m512i sad512_23_lo   = _mm512_add_epi32(sad512_2_lo, sad512_3_lo);
    const __m512i sad512_23_hi   = _mm512_add_epi32(sad512_2_hi, sad512_3_hi);
    const __m512i sad512_45_lo   = _mm512_add_epi32(sad512_4_lo, sad512_5_lo);
    const __m512i sad512_45_hi   = _mm512_add_epi32(sad512_4_hi, sad512_5_hi);
    const __m512i sad512_0123_lo = _mm512_add_epi32(sad512_01_lo, sad512_23_lo);
    const __m512i sad512_0123_hi = _mm512_add_epi32(sad512_01_hi, sad512_23_hi);

    // 0 1 2 3  8 9 A b   0 1 2 3  8 9 A b
    // 4 5 6 7  C D E F   4 5 6 7  C D E F
    const __m512i sad512_lo = _mm512_add_epi32(sad512_0123_lo, sad512_45_lo);
    const __m512i sad512_hi = _mm512_add_epi32(sad512_0123_hi, sad512_45_hi);

    /* Fold 512 -> 256: add the two 256-bit halves of each widened sum. */
    const __m256i sad256_ll = _mm512_castsi512_si256(sad512_lo);
    const __m256i sad256_lh = _mm512_extracti64x4_epi64(sad512_lo, 1);
    const __m256i sad256_hl = _mm512_castsi512_si256(sad512_hi);
    const __m256i sad256_hh = _mm512_extracti64x4_epi64(sad512_hi, 1);

    // 0 1 2 3  8 9 A b
    // 4 5 6 7  C D E F
    const __m256i sad256_0 = _mm256_add_epi32(sad256_ll, sad256_lh);
    const __m256i sad256_1 = _mm256_add_epi32(sad256_hl, sad256_hh);

    // 0 1 2 3  4 5 6 7
    // 8 9 A b  C D E F
    sads256[0] = _mm256_unpacklo_epi128(sad256_0, sad256_1);
    sads256[1] = _mm256_unpackhi_epi128(sad256_0, sad256_1);
}
482 
/* Zero-extend eight vectors of 32 unsigned 16-bit SADs to 32 bits, sum them
 * element-wise with a pairwise tree, and fold 512 -> 256 bits. Output lane
 * order is ascending: sads256[0] = lanes 0..7, sads256[1] = lanes 8..15. */
SIMD_INLINE void add16x16x8to32bit(const __m512i sads512[8], __m256i sads256[2]) {
    // Don't call two add16x16x4to32bit(), which is slower.
    const __m512i zero = _mm512_setzero_si512();

    /* Widen every input: interleave with zero to turn u16 lanes into u32. */
    const __m512i sad512_0_lo = _mm512_unpacklo_epi16(sads512[0], zero);
    const __m512i sad512_0_hi = _mm512_unpackhi_epi16(sads512[0], zero);
    const __m512i sad512_1_lo = _mm512_unpacklo_epi16(sads512[1], zero);
    const __m512i sad512_1_hi = _mm512_unpackhi_epi16(sads512[1], zero);
    const __m512i sad512_2_lo = _mm512_unpacklo_epi16(sads512[2], zero);
    const __m512i sad512_2_hi = _mm512_unpackhi_epi16(sads512[2], zero);
    const __m512i sad512_3_lo = _mm512_unpacklo_epi16(sads512[3], zero);
    const __m512i sad512_3_hi = _mm512_unpackhi_epi16(sads512[3], zero);
    const __m512i sad512_4_lo = _mm512_unpacklo_epi16(sads512[4], zero);
    const __m512i sad512_4_hi = _mm512_unpackhi_epi16(sads512[4], zero);
    const __m512i sad512_5_lo = _mm512_unpacklo_epi16(sads512[5], zero);
    const __m512i sad512_5_hi = _mm512_unpackhi_epi16(sads512[5], zero);
    const __m512i sad512_6_lo = _mm512_unpacklo_epi16(sads512[6], zero);
    const __m512i sad512_6_hi = _mm512_unpackhi_epi16(sads512[6], zero);
    const __m512i sad512_7_lo = _mm512_unpacklo_epi16(sads512[7], zero);
    const __m512i sad512_7_hi = _mm512_unpackhi_epi16(sads512[7], zero);

    /* Pairwise reduction tree of depth 3. */
    const __m512i sad512_01_lo   = _mm512_add_epi32(sad512_0_lo, sad512_1_lo);
    const __m512i sad512_01_hi   = _mm512_add_epi32(sad512_0_hi, sad512_1_hi);
    const __m512i sad512_23_lo   = _mm512_add_epi32(sad512_2_lo, sad512_3_lo);
    const __m512i sad512_23_hi   = _mm512_add_epi32(sad512_2_hi, sad512_3_hi);
    const __m512i sad512_45_lo   = _mm512_add_epi32(sad512_4_lo, sad512_5_lo);
    const __m512i sad512_45_hi   = _mm512_add_epi32(sad512_4_hi, sad512_5_hi);
    const __m512i sad512_67_lo   = _mm512_add_epi32(sad512_6_lo, sad512_7_lo);
    const __m512i sad512_67_hi   = _mm512_add_epi32(sad512_6_hi, sad512_7_hi);
    const __m512i sad512_0123_lo = _mm512_add_epi32(sad512_01_lo, sad512_23_lo);
    const __m512i sad512_0123_hi = _mm512_add_epi32(sad512_01_hi, sad512_23_hi);
    const __m512i sad512_4567_lo = _mm512_add_epi32(sad512_45_lo, sad512_67_lo);
    const __m512i sad512_4567_hi = _mm512_add_epi32(sad512_45_hi, sad512_67_hi);

    // 0 1 2 3  8 9 A b   0 1 2 3  8 9 A b
    // 4 5 6 7  C D E F   4 5 6 7  C D E F
    const __m512i sad512_lo = _mm512_add_epi32(sad512_0123_lo, sad512_4567_lo);
    const __m512i sad512_hi = _mm512_add_epi32(sad512_0123_hi, sad512_4567_hi);

    /* Fold 512 -> 256: add the two 256-bit halves of each widened sum. */
    const __m256i sad256_ll = _mm512_castsi512_si256(sad512_lo);
    const __m256i sad256_lh = _mm512_extracti64x4_epi64(sad512_lo, 1);
    const __m256i sad256_hl = _mm512_castsi512_si256(sad512_hi);
    const __m256i sad256_hh = _mm512_extracti64x4_epi64(sad512_hi, 1);

    // 0 1 2 3  8 9 A b
    // 4 5 6 7  C D E F
    const __m256i sad256_0 = _mm256_add_epi32(sad256_ll, sad256_lh);
    const __m256i sad256_1 = _mm256_add_epi32(sad256_hl, sad256_hh);

    // 0 1 2 3  4 5 6 7
    // 8 9 A b  C D E F
    sads256[0] = _mm256_unpacklo_epi128(sad256_0, sad256_1);
    sads256[1] = _mm256_unpackhi_epi128(sad256_0, sad256_1);
}
537 
/*******************************************************************************
* Combines two 8x16-bit saturating SAD accumulators and returns the minimum.
* To limit precision loss from adding two possibly-saturated 16-bit vectors,
* each vector's horizontal minimum is subtracted first, the remainders are
* added with unsigned saturation, and the two minima are added back to the
* scalar result. NOTE(review): if any lane saturated during accumulation the
* result is an approximation for that lane -- presumably acceptable since
* saturated lanes mark "too large" candidates; confirm against callers.
* On return, *minpos holds the minimum of the combined vector (value in bits
* 15:0, lane index in bits 18:16, per MINPOSUW semantics).
*******************************************************************************/
static INLINE uint32_t saturate_add(const __m128i sum0, const __m128i sum1, __m128i *const minpos) {
    uint32_t min_val;
    __m128i  min0, min1;

    /* Horizontal minimum of each input (value in word 0, index in word 1). */
    const __m128i minpos0 = _mm_minpos_epu16(sum0);
    const __m128i minpos1 = _mm_minpos_epu16(sum1);
    /* Broadcast each minimum value to all eight 16-bit lanes. */
    min0                  = _mm_unpacklo_epi16(minpos0, minpos0);
    min0                  = _mm_unpacklo_epi32(min0, min0);
    min0                  = _mm_unpacklo_epi64(min0, min0);
    min1                  = _mm_unpacklo_epi16(minpos1, minpos1);
    min1                  = _mm_unpacklo_epi32(min1, min1);
    min1                  = _mm_unpacklo_epi64(min1, min1);
    /* Subtract the per-vector minima before the saturating add so the sum
     * stays farther from the 0xFFFF ceiling. */
    const __m128i t0      = _mm_sub_epi16(sum0, min0);
    const __m128i t1      = _mm_sub_epi16(sum1, min1);
    const __m128i sum     = _mm_adds_epu16(t0, t1);
    *minpos               = _mm_minpos_epu16(sum);
    min_val               = _mm_extract_epi16(*minpos, 0);
    /* Add the subtracted minima back to recover the absolute SAD magnitude. */
    min_val += _mm_extract_epi16(minpos0, 0);
    min_val += _mm_extract_epi16(minpos1, 0);

    return min_val;
}
560 
sad_loop_kernel_4_avx2(const uint8_t * const src,const uint32_t src_stride,const uint8_t * const ref,const uint32_t ref_stride,__m256i * const sum)561 static INLINE void sad_loop_kernel_4_avx2(const uint8_t *const src, const uint32_t src_stride,
562                                           const uint8_t *const ref, const uint32_t ref_stride,
563                                           __m256i *const sum) {
564     const __m256i ss0 = _mm256_insertf128_si256(
565         _mm256_castsi128_si256(_mm_cvtsi32_si128(*(uint32_t *)src)),
566         _mm_cvtsi32_si128(*(uint32_t *)(src + src_stride)),
567         1);
568     const __m256i rr0 = _mm256_insertf128_si256(
569         _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)ref)),
570         _mm_loadu_si128((__m128i *)(ref + ref_stride)),
571         1);
572     *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr0, ss0, 0));
573 }
574 
575 /*******************************************************************************
576 * Function helper adds to "sum" vector SAD's for block width of 4, uses AVX2 instructions
577 * Requirement: width = 4
578 * Requirement: height = 1
579 * Compute one line
580 *******************************************************************************/
sad_loop_kernel_4_oneline_avx2(const uint8_t * const src,const uint8_t * const ref,__m256i * const sum)581 static INLINE void sad_loop_kernel_4_oneline_avx2(const uint8_t *const src,
582                                                   const uint8_t *const ref, __m256i *const sum) {
583     const __m256i ss0 = _mm256_insertf128_si256(
584         _mm256_castsi128_si256(_mm_cvtsi32_si128(*(uint32_t *)src)), _mm_setzero_si128(), 1);
585     const __m256i rr0 = _mm256_insertf128_si256(
586         _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)ref)), _mm_setzero_si128(), 1);
587     *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr0, ss0, 0));
588 }
589 
sad_loop_kernel_4_sse4_1(const uint8_t * const src,const uint32_t src_stride,const uint8_t * const ref,const uint32_t ref_stride,__m128i * const sum)590 static INLINE void sad_loop_kernel_4_sse4_1(const uint8_t *const src, const uint32_t src_stride,
591                                             const uint8_t *const ref, const uint32_t ref_stride,
592                                             __m128i *const sum) {
593     const __m128i s0 = _mm_cvtsi32_si128(*(uint32_t *)src);
594     const __m128i s1 = _mm_cvtsi32_si128(*(uint32_t *)(src + src_stride));
595     const __m128i r0 = _mm_loadu_si128((__m128i *)ref);
596     const __m128i r1 = _mm_loadu_si128((__m128i *)(ref + ref_stride));
597     *sum             = _mm_adds_epu16(*sum, _mm_mpsadbw_epu8(r0, s0, 0));
598     *sum             = _mm_adds_epu16(*sum, _mm_mpsadbw_epu8(r1, s1, 0));
599 }
600 
601 /*******************************************************************************
602 * Function helper adds to "sum" vector SAD's for block width of 4, uses sse4_1 instructions
603 * Requirement: width = 4
604 * Requirement: height = 1
605 * Compute one line
606 *******************************************************************************/
sad_loop_kernel_4_oneline_sse4_1(const uint8_t * const src,const uint8_t * const ref,__m128i * const sum)607 static INLINE void sad_loop_kernel_4_oneline_sse4_1(const uint8_t *const src,
608                                                     const uint8_t *const ref, __m128i *const sum) {
609     const __m128i s0 = _mm_cvtsi32_si128(*(uint32_t *)src);
610     const __m128i r0 = _mm_loadu_si128((__m128i *)ref);
611     *sum             = _mm_adds_epu16(*sum, _mm_mpsadbw_epu8(r0, s0, 0));
612 }
613 
sad_loop_kernel_8_avx2(const uint8_t * const src,const uint32_t src_stride,const uint8_t * const ref,const uint32_t ref_stride,__m256i * const sum)614 static INLINE void sad_loop_kernel_8_avx2(const uint8_t *const src, const uint32_t src_stride,
615                                           const uint8_t *const ref, const uint32_t ref_stride,
616                                           __m256i *const sum) {
617     const __m256i ss0 = _mm256_insertf128_si256(
618         _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)src)),
619         _mm_loadl_epi64((__m128i *)(src + 1 * src_stride)),
620         1);
621     const __m256i rr0 = _mm256_insertf128_si256(
622         _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)ref)),
623         _mm_loadu_si128((__m128i *)(ref + 1 * ref_stride)),
624         1);
625     *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr0, ss0, (0 << 3) | 0)); // 000 000
626     *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr0, ss0, (5 << 3) | 5)); // 101 101
627 }
628 
629 /*******************************************************************************
630 * Function helper adds to "sum" vector SAD's for block width of 8, uses AVX2 instructions
631 * Requirement: width = 8
632 * Requirement: height = 1
633 * Compute one line
634 *******************************************************************************/
sad_loop_kernel_8_oneline_avx2(const uint8_t * const src,const uint8_t * const ref,__m256i * const sum)635 static INLINE void sad_loop_kernel_8_oneline_avx2(const uint8_t *const src,
636                                                   const uint8_t *const ref, __m256i *const sum) {
637     const __m256i ss0 = _mm256_insertf128_si256(
638         _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)src)), _mm_setzero_si128(), 1);
639     const __m256i rr0 = _mm256_insertf128_si256(
640         _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)ref)), _mm_setzero_si128(), 1);
641     *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr0, ss0, (0 << 3) | 0)); // 000 000
642     *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr0, ss0, (5 << 3) | 5)); // 101 101
643 }
644 
645 /*******************************************************************************
646 * Function helper adds to "sums" vectors SAD's for block width of 12, uses AVX512 instructions
647 * Requirement: width = 12
648 * Requirement: height = 2
649 *******************************************************************************/
SIMD_INLINE void sad_loop_kernel_12_avx512(const uint8_t *const src, const uint32_t src_stride,
                                           const uint8_t *const ref, const uint32_t ref_stride,
                                           __m512i *const sum) {
    /* Pack the two source rows into the low 256 bits: row 0 in dwords 0..3,
     * row 1 in dwords 4..7. Only dwords 0..2 of each row are used (width 12). */
    const __m128i s0  = _mm_loadu_si128((__m128i *)src);
    const __m128i s1  = _mm_loadu_si128((__m128i *)(src + src_stride));
    const __m256i s01 = _mm256_insertf128_si256(_mm256_castsi128_si256(s0), s1, 1);
    const __m512i s   = _mm512_castsi256_si512(s01);
    /* ssN broadcasts source dword N: row 0's copy fills the low 8 dwords,
     * row 1's copy (index N + 4) fills the high 8 dwords. */
    const __m512i ss0 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4), s);
    const __m512i ss1 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5), s);
    const __m512i ss2 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6), s);

    /* Reference: row 0 in qwords 0..3, row 1 in qwords 4..7. rr0/rr1 lay out
     * overlapping 8-byte windows per row; rr1 is rr0 advanced by 8 bytes. */
    const __m256i r0  = _mm256_loadu_si256((__m256i *)ref);
    const __m256i r1  = _mm256_loadu_si256((__m256i *)(ref + ref_stride));
    const __m512i r   = _mm512_inserti64x4(_mm512_castsi256_si512(r0), r1, 1);
    const __m512i rr0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 1, 2, 4, 5, 5, 6), r);
    const __m512i rr1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(1, 2, 2, 3, 5, 6, 6, 7), r);

    /* _mm512_dbsad_epu8(a, b, imm): b's dwords are shuffled per 128-bit lane
     * by imm before byte-quadruplet SADs against a. imm 0x94 selects dwords
     * (0,1,1,2) and 0xE9 selects (1,2,2,3), i.e. reference windows shifted a
     * further 4 bytes; accumulation is saturating unsigned 16-bit. */
    *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss0, rr0, 0x94));
    *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss1, rr0, 0xE9));
    *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss2, rr1, 0x94));
}
674 
675 /*******************************************************************************
676 * Function helper adds to "sums" vectors SAD's for block width of 12, uses AVX512 instructions
677 * Requirement: width = 12
678 * Requirement: height = 1
679 * Compute one line
680 *******************************************************************************/
SIMD_INLINE void sad_loop_kernel_12_oneline_avx512(const uint8_t *const src,
                                                   const uint8_t *const ref, __m512i *const sum) {
    /* Single row in dwords 0..3; the row-1 slots (dwords 4..7) are zeroed so
     * the high halves of the dbsad results below are 0 (zeros vs zeros). */
    const __m128i s0  = _mm_loadu_si128((__m128i *)src);
    const __m256i s01 = _mm256_insertf128_si256(_mm256_castsi128_si256(s0), _mm_setzero_si128(), 1);
    const __m512i s   = _mm512_castsi256_si512(s01);
    /* ssN broadcasts source dword N across the low 8 dwords (high 8 are 0). */
    const __m512i ss0 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4), s);
    const __m512i ss1 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5), s);
    const __m512i ss2 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6), s);

    /* Reference row in qwords 0..3; rr0/rr1 are overlapping 8-byte windows,
     * with rr1 advanced 8 bytes relative to rr0. */
    const __m256i r0  = _mm256_loadu_si256((__m256i *)ref);
    const __m512i r   = _mm512_inserti64x4(_mm512_castsi256_si512(r0), _mm256_setzero_si256(), 1);
    const __m512i rr0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 1, 2, 4, 5, 5, 6), r);
    const __m512i rr1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(1, 2, 2, 3, 5, 6, 6, 7), r);

    /* dbsad imm 0x94 picks ref dwords (0,1,1,2), 0xE9 picks (1,2,2,3) within
     * each 128-bit lane; saturating 16-bit accumulation. */
    *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss0, rr0, 0x94));
    *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss1, rr0, 0xE9));
    *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss2, rr1, 0x94));
}
702 
/*******************************************************************************
* Function helper adds to "sum" vector SAD's for block width of 16, uses AVX512 instructions
* Requirement: width = 16
* Requirement: height = 2
*******************************************************************************/
SIMD_INLINE void sad_loop_kernel_16_avx512(const uint8_t *const src, const uint32_t src_stride,
                                           const uint8_t *const ref, const uint32_t ref_stride,
                                           __m512i *const sum) {
    /* Pack the two source rows into the low 256 bits: row 0 in dwords 0..3,
     * row 1 in dwords 4..7 (all four dwords used: width 16). */
    const __m128i s0  = _mm_loadu_si128((__m128i *)src);
    const __m128i s1  = _mm_loadu_si128((__m128i *)(src + src_stride));
    const __m256i s01 = _mm256_insertf128_si256(_mm256_castsi128_si256(s0), s1, 1);
    const __m512i s   = _mm512_castsi256_si512(s01);
    /* ssN broadcasts source dword N: row 0's copy in the low 8 dwords, row 1's
     * copy (index N + 4) in the high 8 dwords. */
    const __m512i ss0 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4), s);
    const __m512i ss1 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5), s);
    const __m512i ss2 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6), s);
    const __m512i ss3 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7), s);

    /* Reference: row 0 in qwords 0..3, row 1 in qwords 4..7. rr0/rr1 lay out
     * overlapping 8-byte windows per row; rr1 is rr0 advanced by 8 bytes. */
    const __m256i r0  = _mm256_loadu_si256((__m256i *)ref);
    const __m256i r1  = _mm256_loadu_si256((__m256i *)(ref + ref_stride));
    const __m512i r   = _mm512_inserti64x4(_mm512_castsi256_si512(r0), r1, 1);
    const __m512i rr0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 1, 2, 4, 5, 5, 6), r);
    const __m512i rr1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(1, 2, 2, 3, 5, 6, 6, 7), r);

    /* dbsad imm 0x94 picks ref dwords (0,1,1,2), 0xE9 picks (1,2,2,3) within
     * each 128-bit lane, so the four calls cover source dwords 0..3 against
     * successively shifted reference windows; saturating 16-bit accumulation. */
    *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss0, rr0, 0x94));
    *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss1, rr0, 0xE9));
    *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss2, rr1, 0x94));
    *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss3, rr1, 0xE9));
}
730 
731 /*******************************************************************************
732 * Function helper adds to "sums" vectors SAD's for block width of 16, uses AVX512 instructions
733 * Requirement: width = 16
734 * Requirement: height = 1
735 * Compute one line
736 *******************************************************************************/
SIMD_INLINE void sad_loop_kernel_16_oneline_avx512(const uint8_t *const src,
                                                   const uint8_t *const ref, __m512i *const sum) {
    /* Single row in dwords 0..3; the row-1 slots are zeroed so the high halves
     * of the dbsad results below contribute 0 (zeros vs zeros). */
    const __m128i s0  = _mm_loadu_si128((__m128i *)src);
    const __m256i s01 = _mm256_insertf128_si256(_mm256_castsi128_si256(s0), _mm_setzero_si128(), 1);
    const __m512i s   = _mm512_castsi256_si512(s01);
    /* ssN broadcasts source dword N across the low 8 dwords (high 8 are 0). */
    const __m512i ss0 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4), s);
    const __m512i ss1 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5), s);
    const __m512i ss2 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6), s);
    const __m512i ss3 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7), s);

    /* Reference row in qwords 0..3; rr0/rr1 are overlapping 8-byte windows,
     * rr1 advanced 8 bytes relative to rr0. */
    const __m256i r0  = _mm256_loadu_si256((__m256i *)ref);
    const __m512i r   = _mm512_inserti64x4(_mm512_castsi256_si512(r0), _mm256_setzero_si256(), 1);
    const __m512i rr0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 1, 2, 4, 5, 5, 6), r);
    const __m512i rr1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(1, 2, 2, 3, 5, 6, 6, 7), r);

    /* dbsad imm 0x94 picks ref dwords (0,1,1,2), 0xE9 picks (1,2,2,3) within
     * each 128-bit lane; saturating 16-bit accumulation. */
    *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss0, rr0, 0x94));
    *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss1, rr0, 0xE9));
    *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss2, rr1, 0x94));
    *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss3, rr1, 0xE9));
}
761 
762 /*******************************************************************************
763 * Function helper adds to two elements "sums" vectors SAD's for block width of 12, uses AVX512 instructions
764 * Requirement: width = 12
765 * Requirement: height = 2
766 *******************************************************************************/
SIMD_INLINE void sad_loop_kernel_12_2sum_avx512(const uint8_t *const src, const uint32_t src_stride,
                                                const uint8_t *const ref, const uint32_t ref_stride,
                                                __m512i sum[2]) {
    /* Same layout as sad_loop_kernel_12_avx512, but the partial SADs are split
     * over two accumulators -- presumably to delay 16-bit saturation; the
     * caller is responsible for combining sum[0] and sum[1]. */
    const __m128i s0  = _mm_loadu_si128((__m128i *)src);
    const __m128i s1  = _mm_loadu_si128((__m128i *)(src + src_stride));
    const __m256i s01 = _mm256_insertf128_si256(_mm256_castsi128_si256(s0), s1, 1);
    const __m512i s   = _mm512_castsi256_si512(s01);
    /* ssN broadcasts source dword N (row 0 low half, row 1 high half); only
     * dwords 0..2 of each row are used (width 12). */
    const __m512i ss0 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4), s);
    const __m512i ss1 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5), s);
    const __m512i ss2 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6), s);

    /* Overlapping 8-byte reference windows; rr1 is rr0 advanced by 8 bytes. */
    const __m256i r0  = _mm256_loadu_si256((__m256i *)ref);
    const __m256i r1  = _mm256_loadu_si256((__m256i *)(ref + ref_stride));
    const __m512i r   = _mm512_inserti64x4(_mm512_castsi256_si512(r0), r1, 1);
    const __m512i rr0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 1, 2, 4, 5, 5, 6), r);
    const __m512i rr1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(1, 2, 2, 3, 5, 6, 6, 7), r);

    /* dbsad imm 0x94 picks ref dwords (0,1,1,2), 0xE9 picks (1,2,2,3). */
    sum[0] = _mm512_adds_epu16(sum[0], _mm512_dbsad_epu8(ss0, rr0, 0x94));
    sum[1] = _mm512_adds_epu16(sum[1], _mm512_dbsad_epu8(ss1, rr0, 0xE9));
    sum[0] = _mm512_adds_epu16(sum[0], _mm512_dbsad_epu8(ss2, rr1, 0x94));
}
791 
792 /*******************************************************************************
793 * Function helper adds to two elements "sums" vectors SAD's for block width of 12, uses AVX512 instructions
794 * Requirement: width = 12
795 * Requirement: height = 1
796 * Compute one line
797 *******************************************************************************/
SIMD_INLINE void sad_loop_kernel_12_2sum_oneline_avx512(const uint8_t *const src,
                                                        const uint8_t *const ref, __m512i sum[2]) {
    /* Single-row variant; unused row-1 slots are zeroed so the high halves of
     * the dbsad results contribute 0. Partial SADs are split over two
     * accumulators, which the caller combines. */
    const __m128i s0  = _mm_loadu_si128((__m128i *)src);
    const __m256i s01 = _mm256_insertf128_si256(_mm256_castsi128_si256(s0), _mm_setzero_si128(), 1);
    const __m512i s   = _mm512_castsi256_si512(s01);
    /* ssN broadcasts source dword N across the low 8 dwords (high 8 are 0). */
    const __m512i ss0 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4), s);
    const __m512i ss1 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5), s);
    const __m512i ss2 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6), s);

    /* Overlapping 8-byte reference windows; rr1 is rr0 advanced by 8 bytes. */
    const __m256i r0  = _mm256_loadu_si256((__m256i *)ref);
    const __m512i r   = _mm512_inserti64x4(_mm512_castsi256_si512(r0), _mm256_setzero_si256(), 1);
    const __m512i rr0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 1, 2, 4, 5, 5, 6), r);
    const __m512i rr1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(1, 2, 2, 3, 5, 6, 6, 7), r);

    /* dbsad imm 0x94 picks ref dwords (0,1,1,2), 0xE9 picks (1,2,2,3). */
    sum[0] = _mm512_adds_epu16(sum[0], _mm512_dbsad_epu8(ss0, rr0, 0x94));
    sum[1] = _mm512_adds_epu16(sum[1], _mm512_dbsad_epu8(ss1, rr0, 0xE9));
    sum[0] = _mm512_adds_epu16(sum[0], _mm512_dbsad_epu8(ss2, rr1, 0x94));
}
819 
/*******************************************************************************
* Function helper adds to two elements "sums" vectors SAD's for block width of 16, uses AVX512 instructions
* Requirement: width = 16
* Requirement: height = 2
*******************************************************************************/
SIMD_INLINE void sad_loop_kernel_16_2sum_avx512(const uint8_t *const src, const uint32_t src_stride,
                                                const uint8_t *const ref, const uint32_t ref_stride,
                                                __m512i sum[2]) {
    /* Same layout as sad_loop_kernel_16_avx512, but partial SADs are split
     * over two accumulators (caller combines them) -- presumably to delay
     * 16-bit saturation. */
    const __m128i s0  = _mm_loadu_si128((__m128i *)src);
    const __m128i s1  = _mm_loadu_si128((__m128i *)(src + src_stride));
    const __m256i s01 = _mm256_insertf128_si256(_mm256_castsi128_si256(s0), s1, 1);
    const __m512i s   = _mm512_castsi256_si512(s01);
    /* ssN broadcasts source dword N: row 0 in the low half, row 1 (N + 4) in
     * the high half. */
    const __m512i ss0 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4), s);
    const __m512i ss1 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5), s);
    const __m512i ss2 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6), s);
    const __m512i ss3 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7), s);

    /* Overlapping 8-byte reference windows; rr1 is rr0 advanced by 8 bytes. */
    const __m256i r0  = _mm256_loadu_si256((__m256i *)ref);
    const __m256i r1  = _mm256_loadu_si256((__m256i *)(ref + ref_stride));
    const __m512i r   = _mm512_inserti64x4(_mm512_castsi256_si512(r0), r1, 1);
    const __m512i rr0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 1, 2, 4, 5, 5, 6), r);
    const __m512i rr1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(1, 2, 2, 3, 5, 6, 6, 7), r);

    /* dbsad imm 0x94 picks ref dwords (0,1,1,2), 0xE9 picks (1,2,2,3). */
    sum[0] = _mm512_adds_epu16(sum[0], _mm512_dbsad_epu8(ss0, rr0, 0x94));
    sum[1] = _mm512_adds_epu16(sum[1], _mm512_dbsad_epu8(ss1, rr0, 0xE9));
    sum[0] = _mm512_adds_epu16(sum[0], _mm512_dbsad_epu8(ss2, rr1, 0x94));
    sum[1] = _mm512_adds_epu16(sum[1], _mm512_dbsad_epu8(ss3, rr1, 0xE9));
}
847 
848 /*******************************************************************************
849 * Function helper adds to two elements "sums" vectors SAD's for block width of 16, uses AVX512 instructions
850 * Requirement: width = 16
851 * Requirement: height = 1
852 * Compute one line
853 *******************************************************************************/
SIMD_INLINE void sad_loop_kernel_16_2sum_oneline_avx512(const uint8_t *const src,
                                                        const uint8_t *const ref, __m512i sum[2]) {
    /* Single-row variant; unused row-1 slots are zeroed so the high halves of
     * the dbsad results contribute 0. Partial SADs are split over two
     * accumulators, which the caller combines. */
    const __m128i s0  = _mm_loadu_si128((__m128i *)src);
    const __m256i s01 = _mm256_insertf128_si256(_mm256_castsi128_si256(s0), _mm_setzero_si128(), 1);
    const __m512i s   = _mm512_castsi256_si512(s01);
    /* ssN broadcasts source dword N across the low 8 dwords (high 8 are 0). */
    const __m512i ss0 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4), s);
    const __m512i ss1 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5), s);
    const __m512i ss2 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6), s);
    const __m512i ss3 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7), s);

    /* Overlapping 8-byte reference windows; rr1 is rr0 advanced by 8 bytes. */
    const __m256i r0  = _mm256_loadu_si256((__m256i *)ref);
    const __m512i r   = _mm512_inserti64x4(_mm512_castsi256_si512(r0), _mm256_setzero_si256(), 1);
    const __m512i rr0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 1, 2, 4, 5, 5, 6), r);
    const __m512i rr1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(1, 2, 2, 3, 5, 6, 6, 7), r);

    /* dbsad imm 0x94 picks ref dwords (0,1,1,2), 0xE9 picks (1,2,2,3). */
    sum[0] = _mm512_adds_epu16(sum[0], _mm512_dbsad_epu8(ss0, rr0, 0x94));
    sum[1] = _mm512_adds_epu16(sum[1], _mm512_dbsad_epu8(ss1, rr0, 0xE9));
    sum[0] = _mm512_adds_epu16(sum[0], _mm512_dbsad_epu8(ss2, rr1, 0x94));
    sum[1] = _mm512_adds_epu16(sum[1], _mm512_dbsad_epu8(ss3, rr1, 0xE9));
}
878 
879 /*******************************************************************************
880 * Function helper adds to "sums" vectors SAD's for block width of 12, uses AVX2 instructions
881 * Requirement: width = 12
882 * Requirement: height = 2
883 *******************************************************************************/
sad_loop_kernel_12_avx2(const uint8_t * const src,const uint32_t src_stride,const uint8_t * const ref,const uint32_t ref_stride,__m256i * const sum)884 static INLINE void sad_loop_kernel_12_avx2(const uint8_t *const src, const uint32_t src_stride,
885                                            const uint8_t *const ref, const uint32_t ref_stride,
886                                            __m256i *const sum) {
887     const __m256i ss0 = _mm256_insertf128_si256(
888         _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src)),
889         _mm_loadu_si128((__m128i *)(src + src_stride)),
890         1);
891     const __m256i rr0 = _mm256_insertf128_si256(
892         _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)ref)),
893         _mm_loadu_si128((__m128i *)(ref + ref_stride)),
894         1);
895     const __m256i rr1 = _mm256_insertf128_si256(
896         _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(ref + 8))),
897         _mm_loadu_si128((__m128i *)(ref + ref_stride + 8)),
898         1);
899     *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr0, ss0, (0 << 3) | 0)); // 000 000
900     *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr0, ss0, (5 << 3) | 5)); // 101 101
901     *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr1, ss0, (2 << 3) | 2)); // 010 010
902 }
903 
904 /*******************************************************************************
905 * Function helper adds to "sums" vectors SAD's for block width of 12, uses AVX2 instructions
906 * Requirement: width = 12
907 * Requirement: height = 1
908 * Compute one line
909 *******************************************************************************/
sad_loop_kernel_12_oneline_avx2(const uint8_t * const src,const uint8_t * const ref,__m256i * const sum)910 static INLINE void sad_loop_kernel_12_oneline_avx2(const uint8_t *const src,
911                                                    const uint8_t *const ref, __m256i *const sum) {
912     const __m256i ss0 = _mm256_insertf128_si256(
913         _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src)), _mm_setzero_si128(), 1);
914     const __m256i rr0 = _mm256_insertf128_si256(
915         _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)ref)), _mm_setzero_si128(), 1);
916     const __m256i rr1 = _mm256_insertf128_si256(
917         _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(ref + 8))), _mm_setzero_si128(), 1);
918     *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr0, ss0, (0 << 3) | 0)); // 000 000
919     *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr0, ss0, (5 << 3) | 5)); // 101 101
920     *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr1, ss0, (2 << 3) | 2)); // 010 010
921 }
922 
sad_loop_kernel_16_avx2(const uint8_t * const src,const uint32_t src_stride,const uint8_t * const ref,const uint32_t ref_stride,__m256i * const sum)923 static INLINE void sad_loop_kernel_16_avx2(const uint8_t *const src, const uint32_t src_stride,
924                                            const uint8_t *const ref, const uint32_t ref_stride,
925                                            __m256i *const sum) {
926     const __m256i ss0 = _mm256_insertf128_si256(
927         _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src)),
928         _mm_loadu_si128((__m128i *)(src + src_stride)),
929         1);
930     const __m256i rr0 = _mm256_insertf128_si256(
931         _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)ref)),
932         _mm_loadu_si128((__m128i *)(ref + ref_stride)),
933         1);
934     const __m256i rr1 = _mm256_insertf128_si256(
935         _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(ref + 8))),
936         _mm_loadu_si128((__m128i *)(ref + ref_stride + 8)),
937         1);
938     *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr0, ss0, (0 << 3) | 0)); // 000 000
939     *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr0, ss0, (5 << 3) | 5)); // 101 101
940     *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr1, ss0, (2 << 3) | 2)); // 010 010
941     *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr1, ss0, (7 << 3) | 7)); // 111 111
942 }
943 
944 /*******************************************************************************
* Function helper adds to "sum" vector SAD's for block width of 16, uses AVX2 instructions
946 * Requirement: width = 16
947 * Requirement: height = 1
948 * Compute one line
949 *******************************************************************************/
sad_loop_kernel_16_oneline_avx2(const uint8_t * const src,const uint8_t * const ref,__m256i * const sum)950 static INLINE void sad_loop_kernel_16_oneline_avx2(const uint8_t *const src,
951                                                    const uint8_t *const ref, __m256i *const sum) {
952     const __m256i ss0 = _mm256_insertf128_si256(
953         _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src)), _mm_setzero_si128(), 1);
954     const __m256i rr0 = _mm256_insertf128_si256(
955         _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)ref)), _mm_setzero_si128(), 1);
956     const __m256i rr1 = _mm256_insertf128_si256(
957         _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(ref + 8))), _mm_setzero_si128(), 1);
958     *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr0, ss0, (0 << 3) | 0)); // 000 000
959     *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr0, ss0, (5 << 3) | 5)); // 101 101
960     *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr1, ss0, (2 << 3) | 2)); // 010 010
961     *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr1, ss0, (7 << 3) | 7)); // 111 111
962 }
963 
964 /*******************************************************************************
965 * Function helper adds to two elements "sums" vectors SAD's for block width of 12, uses AVX2 instructions
966 * Requirement: width = 12
967 * Requirement: height = 2
968 *******************************************************************************/
sad_loop_kernel_12_2sum_avx2(const uint8_t * const src,const uint32_t src_stride,const uint8_t * const ref,const uint32_t ref_stride,__m256i sums[2])969 static INLINE void sad_loop_kernel_12_2sum_avx2(const uint8_t *const src, const uint32_t src_stride,
970                                                 const uint8_t *const ref, const uint32_t ref_stride,
971                                                 __m256i sums[2]) {
972     const __m256i ss0 = _mm256_insertf128_si256(
973         _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src)),
974         _mm_loadu_si128((__m128i *)(src + src_stride)),
975         1);
976     const __m256i rr0 = _mm256_insertf128_si256(
977         _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)ref)),
978         _mm_loadu_si128((__m128i *)(ref + ref_stride)),
979         1);
980     const __m256i rr1 = _mm256_insertf128_si256(
981         _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(ref + 8))),
982         _mm_loadu_si128((__m128i *)(ref + ref_stride + 8)),
983         1);
984     sums[0] = _mm256_adds_epu16(sums[0], _mm256_mpsadbw_epu8(rr0, ss0, (0 << 3) | 0)); // 000 000
985     sums[1] = _mm256_adds_epu16(sums[1], _mm256_mpsadbw_epu8(rr0, ss0, (5 << 3) | 5)); // 101 101
986     sums[0] = _mm256_adds_epu16(sums[0], _mm256_mpsadbw_epu8(rr1, ss0, (2 << 3) | 2)); // 010 010
987 }
988 
989 /*******************************************************************************
990 * Function helper adds to two elements "sums" vectors SAD's for block width of 12, uses AVX2 instructions
991 * Requirement: width = 12
992 * Requirement: height = 1
993 * Compute one line
994 *******************************************************************************/
sad_loop_kernel_12_2sum_oneline_avx2(const uint8_t * const src,const uint8_t * const ref,__m256i sums[2])995 static INLINE void sad_loop_kernel_12_2sum_oneline_avx2(const uint8_t *const src,
996                                                         const uint8_t *const ref, __m256i sums[2]) {
997     const __m256i ss0 = _mm256_insertf128_si256(
998         _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src)), _mm_setzero_si128(), 1);
999     const __m256i rr0 = _mm256_insertf128_si256(
1000         _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)ref)), _mm_setzero_si128(), 1);
1001     const __m256i rr1 = _mm256_insertf128_si256(
1002         _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(ref + 8))), _mm_setzero_si128(), 1);
1003     sums[0] = _mm256_adds_epu16(sums[0], _mm256_mpsadbw_epu8(rr0, ss0, (0 << 3) | 0)); // 000 000
1004     sums[1] = _mm256_adds_epu16(sums[1], _mm256_mpsadbw_epu8(rr0, ss0, (5 << 3) | 5)); // 101 101
1005     sums[0] = _mm256_adds_epu16(sums[0], _mm256_mpsadbw_epu8(rr1, ss0, (2 << 3) | 2)); // 010 010
1006 }
1007 
/*******************************************************************************
* Function helper adds to two elements "sums" vectors SAD's for block width of 16, uses AVX2 instructions
* Requirement: width = 16
* Requirement: height = 2
* Accumulates saturating 16-bit partial SADs for 8 consecutive horizontal
* search positions; two source rows are processed at once (one per lane).
*******************************************************************************/
static INLINE void sad_loop_kernel_16_2sum_avx2(const uint8_t *const src, const uint32_t src_stride,
                                                const uint8_t *const ref, const uint32_t ref_stride,
                                                __m256i sums[2]) {
    /* Row 0 in the low 128-bit lane, row 1 in the high lane. */
    const __m256i ss0 = _mm256_insertf128_si256(
        _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src)),
        _mm_loadu_si128((__m128i *)(src + src_stride)),
        1);
    /* Reference rows at offsets 0 and +8 to cover all 8 candidate alignments. */
    const __m256i rr0 = _mm256_insertf128_si256(
        _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)ref)),
        _mm_loadu_si128((__m128i *)(ref + ref_stride)),
        1);
    const __m256i rr1 = _mm256_insertf128_si256(
        _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(ref + 8))),
        _mm_loadu_si128((__m128i *)(ref + ref_stride + 8)),
        1);
    /* Width 16 = four 4-byte source groups -> four mpsadbw calls, split
     * across two accumulators to defer 16-bit saturation. */
    sums[0] = _mm256_adds_epu16(sums[0], _mm256_mpsadbw_epu8(rr0, ss0, (0 << 3) | 0)); // 000 000
    sums[1] = _mm256_adds_epu16(sums[1], _mm256_mpsadbw_epu8(rr0, ss0, (5 << 3) | 5)); // 101 101
    sums[0] = _mm256_adds_epu16(sums[0], _mm256_mpsadbw_epu8(rr1, ss0, (2 << 3) | 2)); // 010 010
    sums[1] = _mm256_adds_epu16(sums[1], _mm256_mpsadbw_epu8(rr1, ss0, (7 << 3) | 7)); // 111 111
}
1028 
1029 /*******************************************************************************
1030 * Function helper adds to two elements "sums" vectors SAD's for block width of 16, uses AVX2 instructions
1031 * Requirement: width = 16
1032 * Requirement: height = 1
1033 * Compute one line
1034 *******************************************************************************/
sad_loop_kernel_16_2sum_oneline_avx2(const uint8_t * const src,const uint8_t * const ref,__m256i sums[2])1035 static INLINE void sad_loop_kernel_16_2sum_oneline_avx2(const uint8_t *const src,
1036                                                         const uint8_t *const ref, __m256i sums[2]) {
1037     const __m256i ss0 = _mm256_insertf128_si256(
1038         _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src)), _mm_setzero_si128(), 1);
1039     const __m256i rr0 = _mm256_insertf128_si256(
1040         _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)ref)), _mm_setzero_si128(), 1);
1041     const __m256i rr1 = _mm256_insertf128_si256(
1042         _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(ref + 8))), _mm_setzero_si128(), 1);
1043     sums[0] = _mm256_adds_epu16(sums[0], _mm256_mpsadbw_epu8(rr0, ss0, (0 << 3) | 0)); // 000 000
1044     sums[1] = _mm256_adds_epu16(sums[1], _mm256_mpsadbw_epu8(rr0, ss0, (5 << 3) | 5)); // 101 101
1045     sums[0] = _mm256_adds_epu16(sums[0], _mm256_mpsadbw_epu8(rr1, ss0, (2 << 3) | 2)); // 010 010
1046     sums[1] = _mm256_adds_epu16(sums[1], _mm256_mpsadbw_epu8(rr1, ss0, (7 << 3) | 7)); // 111 111
1047 }
1048 
/* Accumulate saturating 16-bit SADs for one 32-wide source row against 8
 * consecutive horizontal positions of ref, using AVX-512 VDBPSADBW.
 * Results for positions 0..7 land in *sum (16-bit lanes, saturating). */
SIMD_INLINE void sad_loop_kernel_32_avx512(const uint8_t *const src, const uint8_t *const ref,
                                           __m512i *const sum) {
    const __m256i s01 = _mm256_loadu_si256((__m256i *)src);
    const __m512i s   = _mm512_castsi256_si512(s01);
    /* Broadcast each 4-byte source group across a 256-bit half: groups 0..3
     * fill the low half, groups 4..7 the high half. */
    const __m512i ss0 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4), s);
    const __m512i ss1 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5), s);
    const __m512i ss2 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6), s);
    const __m512i ss3 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7), s);

    /* Overlapping 8-byte windows of ref so dbsad can slide over all 8
     * candidate positions for each source group. */
    const __m512i r   = _mm512_loadu_si512((__m512i *)ref);
    const __m512i rr0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 1, 2, 2, 3, 3, 4), r);
    const __m512i rr1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(1, 2, 2, 3, 3, 4, 4, 5), r);

    /* imm8 0x94 / 0xE9 choose which 32-bit ref lanes feed each quadruplet
     * (see VDBPSADBW); together the four calls cover source groups 0..7. */
    *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss0, rr0, 0x94));
    *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss1, rr0, 0xE9));
    *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss2, rr1, 0x94));
    *sum = _mm512_adds_epu16(*sum, _mm512_dbsad_epu8(ss3, rr1, 0xE9));
}
1071 
/* Same computation as sad_loop_kernel_32_avx512, but splits the four dbsad
 * accumulations across two accumulators to defer 16-bit saturation
 * (useful for taller blocks where a single accumulator could saturate). */
SIMD_INLINE void sad_loop_kernel_32_2sum_avx512(const uint8_t *const src, const uint8_t *const ref,
                                                __m512i sums[2]) {
    const __m256i s01 = _mm256_loadu_si256((__m256i *)src);
    const __m512i s   = _mm512_castsi256_si512(s01);
    /* Broadcast each 4-byte source group across a 256-bit half. */
    const __m512i ss0 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4), s);
    const __m512i ss1 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5), s);
    const __m512i ss2 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6), s);
    const __m512i ss3 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7), s);

    /* Overlapping 8-byte ref windows for the sliding-window dbsad. */
    const __m512i r   = _mm512_loadu_si512((__m512i *)ref);
    const __m512i rr0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 1, 2, 2, 3, 3, 4), r);
    const __m512i rr1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(1, 2, 2, 3, 3, 4, 4, 5), r);

    sums[0] = _mm512_adds_epu16(sums[0], _mm512_dbsad_epu8(ss0, rr0, 0x94));
    sums[1] = _mm512_adds_epu16(sums[1], _mm512_dbsad_epu8(ss1, rr0, 0xE9));
    sums[0] = _mm512_adds_epu16(sums[0], _mm512_dbsad_epu8(ss2, rr1, 0x94));
    sums[1] = _mm512_adds_epu16(sums[1], _mm512_dbsad_epu8(ss3, rr1, 0xE9));
}
1094 
/* Same computation as sad_loop_kernel_32_avx512, but each of the four dbsad
 * results goes to its own accumulator — maximum headroom before the 16-bit
 * saturating adds can clip (used for the tallest blocks). */
SIMD_INLINE void sad_loop_kernel_32_4sum_avx512(const uint8_t *const src, const uint8_t *const ref,
                                                __m512i sums[4]) {
    const __m256i s01 = _mm256_loadu_si256((__m256i *)src);
    const __m512i s   = _mm512_castsi256_si512(s01);
    /* Broadcast each 4-byte source group across a 256-bit half. */
    const __m512i ss0 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4), s);
    const __m512i ss1 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5), s);
    const __m512i ss2 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6), s);
    const __m512i ss3 = _mm512_permutexvar_epi32(
        _mm512_setr_epi32(3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7), s);

    /* Overlapping 8-byte ref windows for the sliding-window dbsad. */
    const __m512i r   = _mm512_loadu_si512((__m512i *)ref);
    const __m512i rr0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 1, 2, 2, 3, 3, 4), r);
    const __m512i rr1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(1, 2, 2, 3, 3, 4, 4, 5), r);

    sums[0] = _mm512_adds_epu16(sums[0], _mm512_dbsad_epu8(ss0, rr0, 0x94));
    sums[1] = _mm512_adds_epu16(sums[1], _mm512_dbsad_epu8(ss1, rr0, 0xE9));
    sums[2] = _mm512_adds_epu16(sums[2], _mm512_dbsad_epu8(ss2, rr1, 0x94));
    sums[3] = _mm512_adds_epu16(sums[3], _mm512_dbsad_epu8(ss3, rr1, 0xE9));
}
1117 
/* Accumulate saturating 16-bit SADs for one 32-wide source row against 8
 * consecutive horizontal ref positions into a single AVX2 accumulator.
 * ref is loaded at offsets 0 and +8 so mpsadbw covers all 8 alignments. */
static INLINE void sad_loop_kernel_32_avx2(const uint8_t *const src, const uint8_t *const ref,
                                           __m256i *const sum) {
    const __m256i ss0 = _mm256_loadu_si256((__m256i *)src);
    const __m256i rr0 = _mm256_loadu_si256((__m256i *)ref);
    const __m256i rr1 = _mm256_loadu_si256((__m256i *)(ref + 8));
    /* imm8 selects the 4-byte source group per 128-bit lane (bit patterns in
     * the comments are high-lane / low-lane). */
    *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr0, ss0, (0 << 3) | 0)); // 000 000
    *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr0, ss0, (5 << 3) | 5)); // 101 101
    *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr1, ss0, (2 << 3) | 2)); // 010 010
    *sum = _mm256_adds_epu16(*sum, _mm256_mpsadbw_epu8(rr1, ss0, (7 << 3) | 7)); // 111 111
}
1128 
/* Same computation as sad_loop_kernel_32_avx2, split across two accumulators
 * to defer 16-bit saturation for taller blocks. */
static INLINE void sad_loop_kernel_32_2sum_avx2(const uint8_t *const src, const uint8_t *const ref,
                                                __m256i sums[2]) {
    const __m256i ss0 = _mm256_loadu_si256((__m256i *)src);
    const __m256i rr0 = _mm256_loadu_si256((__m256i *)ref);
    const __m256i rr1 = _mm256_loadu_si256((__m256i *)(ref + 8));
    sums[0] = _mm256_adds_epu16(sums[0], _mm256_mpsadbw_epu8(rr0, ss0, (0 << 3) | 0)); // 000 000
    sums[1] = _mm256_adds_epu16(sums[1], _mm256_mpsadbw_epu8(rr0, ss0, (5 << 3) | 5)); // 101 101
    sums[0] = _mm256_adds_epu16(sums[0], _mm256_mpsadbw_epu8(rr1, ss0, (2 << 3) | 2)); // 010 010
    sums[1] = _mm256_adds_epu16(sums[1], _mm256_mpsadbw_epu8(rr1, ss0, (7 << 3) | 7)); // 111 111
}
1139 
/* Same computation as sad_loop_kernel_32_avx2, with one accumulator per
 * mpsadbw result — maximum headroom before 16-bit saturation. */
SIMD_INLINE void sad_loop_kernel_32_4sum_avx2(const uint8_t *const src, const uint8_t *const ref,
                                              __m256i sums[4]) {
    const __m256i ss0 = _mm256_loadu_si256((__m256i *)src);
    const __m256i rr0 = _mm256_loadu_si256((__m256i *)ref);
    const __m256i rr1 = _mm256_loadu_si256((__m256i *)(ref + 8));
    sums[0] = _mm256_adds_epu16(sums[0], _mm256_mpsadbw_epu8(rr0, ss0, (0 << 3) | 0)); // 000 000
    sums[1] = _mm256_adds_epu16(sums[1], _mm256_mpsadbw_epu8(rr0, ss0, (5 << 3) | 5)); // 101 101
    sums[2] = _mm256_adds_epu16(sums[2], _mm256_mpsadbw_epu8(rr1, ss0, (2 << 3) | 2)); // 010 010
    sums[3] = _mm256_adds_epu16(sums[3], _mm256_mpsadbw_epu8(rr1, ss0, (7 << 3) | 7)); // 111 111
}
1150 
sad_loop_kernel_64_2sum_avx2(const uint8_t * const src,const uint8_t * const ref,__m256i sums[2])1151 static INLINE void sad_loop_kernel_64_2sum_avx2(const uint8_t *const src, const uint8_t *const ref,
1152                                                 __m256i sums[2]) {
1153     sad_loop_kernel_32_2sum_avx2(src + 0 * 32, ref + 0 * 32, sums);
1154     sad_loop_kernel_32_2sum_avx2(src + 1 * 32, ref + 1 * 32, sums);
1155 }
1156 
sad_loop_kernel_64_4sum_avx2(const uint8_t * const src,const uint8_t * const ref,__m256i sums[4])1157 static INLINE void sad_loop_kernel_64_4sum_avx2(const uint8_t *const src, const uint8_t *const ref,
1158                                                 __m256i sums[4]) {
1159     sad_loop_kernel_32_4sum_avx2(src + 0 * 32, ref + 0 * 32, sums);
1160     sad_loop_kernel_32_4sum_avx2(src + 1 * 32, ref + 1 * 32, sums);
1161 }
1162 
/* 64-wide row with one accumulator per mpsadbw result (8 total).
 * Fully unrolled rather than calling the 32-wide 4-sum kernel twice, so the
 * compiler can interleave the loads and SAD operations of both halves. */
static INLINE void sad_loop_kernel_64_8sum_avx2(const uint8_t *const src, const uint8_t *const ref,
                                                __m256i sums[8]) {
    /* First 32 bytes: ref at offsets 0 and +8 for the sliding windows. */
    const __m256i ss0 = _mm256_loadu_si256((__m256i *)src);
    const __m256i ss1 = _mm256_loadu_si256((__m256i *)(src + 32));
    const __m256i rr0 = _mm256_loadu_si256((__m256i *)ref);
    const __m256i rr1 = _mm256_loadu_si256((__m256i *)(ref + 8));
    /* Second 32 bytes: same pattern at +32 / +40. */
    const __m256i rr2 = _mm256_loadu_si256((__m256i *)(ref + 32));
    const __m256i rr3 = _mm256_loadu_si256((__m256i *)(ref + 40));

    sums[0] = _mm256_adds_epu16(sums[0], _mm256_mpsadbw_epu8(rr0, ss0, (0 << 3) | 0)); // 000 000
    sums[1] = _mm256_adds_epu16(sums[1], _mm256_mpsadbw_epu8(rr0, ss0, (5 << 3) | 5)); // 101 101
    sums[2] = _mm256_adds_epu16(sums[2], _mm256_mpsadbw_epu8(rr1, ss0, (2 << 3) | 2)); // 010 010
    sums[3] = _mm256_adds_epu16(sums[3], _mm256_mpsadbw_epu8(rr1, ss0, (7 << 3) | 7)); // 111 111
    sums[4] = _mm256_adds_epu16(sums[4], _mm256_mpsadbw_epu8(rr2, ss1, (0 << 3) | 0)); // 000 000
    sums[5] = _mm256_adds_epu16(sums[5], _mm256_mpsadbw_epu8(rr2, ss1, (5 << 3) | 5)); // 101 101
    sums[6] = _mm256_adds_epu16(sums[6], _mm256_mpsadbw_epu8(rr3, ss1, (2 << 3) | 2)); // 010 010
    sums[7] = _mm256_adds_epu16(sums[7], _mm256_mpsadbw_epu8(rr3, ss1, (7 << 3) | 7)); // 111 111
}
1183 
/* Extract 32-bit SAD lane `idx` from `sum` and update the running best
 * (score, x-offset, y) when it is strictly smaller.
 * NOTE: reads the variable `y` from the invoking scope.
 * Wrapped in do/while(0) so the expansion is a single statement that is safe
 * inside unbraced if/else bodies and does not leave a stray empty statement
 * after the call-site semicolon. Lvalue parameters are parenthesized so
 * expression arguments (e.g. `*best_s`) expand safely. */
#define UPDATE_BEST(sum, idx, offset, best_s, best_x, best_y) \
    do {                                                      \
        const uint32_t sad = _mm_extract_epi32(sum, idx);     \
                                                              \
        if (sad < (best_s)) {                                 \
            (best_s) = sad;                                   \
            (best_x) = (offset) + (idx);                      \
            (best_y) = y;                                     \
        }                                                     \
    } while (0)
1194 
update_best_kernel(const uint32_t sad,const __m128i minpos,const int32_t x,const int32_t y,uint32_t * const best_s,int32_t * const best_x,int32_t * const best_y)1195 static INLINE void update_best_kernel(const uint32_t sad, const __m128i minpos, const int32_t x,
1196                                       const int32_t y, uint32_t *const best_s,
1197                                       int32_t *const best_x, int32_t *const best_y) {
1198     if (sad < *best_s) {
1199         const int32_t x_offset = _mm_extract_epi16(minpos, 1);
1200         *best_s                = sad;
1201         *best_x                = x + x_offset;
1202         *best_y                = y;
1203     }
1204 }
1205 
update_best(const __m128i sad,const int32_t x,const int32_t y,uint32_t * const best_s,int32_t * const best_x,int32_t * const best_y)1206 static INLINE void update_best(const __m128i sad, const int32_t x, const int32_t y,
1207                                uint32_t *const best_s, int32_t *const best_x,
1208                                int32_t *const best_y) {
1209     const __m128i  minpos = _mm_minpos_epu16(sad);
1210     const uint32_t min0   = _mm_extract_epi16(minpos, 0);
1211 
1212     if (min0 < *best_s) {
1213         const int32_t x_offset = _mm_extract_epi16(minpos, 1);
1214         *best_s                = min0;
1215         *best_x                = x + x_offset;
1216         *best_y                = y;
1217     }
1218 }
1219 
/* Scalar fallback: compare all 8 candidate positions held as 32-bit SADs in
 * sads[0] (positions x..x+3) and sads[1] (positions x+4..x+7) against the
 * running best. Used on the overflow path where 16-bit minpos cannot be
 * trusted. Relies on UPDATE_BEST reading `y` from this scope. */
SIMD_INLINE void update_8_best(const __m128i sads[2], const int32_t x, const int32_t y,
                               uint32_t *const best_s, int32_t *const best_x,
                               int32_t *const best_y) {
    UPDATE_BEST(sads[0], 0, x + 0, *best_s, *best_x, *best_y);
    UPDATE_BEST(sads[0], 1, x + 0, *best_s, *best_x, *best_y);
    UPDATE_BEST(sads[0], 2, x + 0, *best_s, *best_x, *best_y);
    UPDATE_BEST(sads[0], 3, x + 0, *best_s, *best_x, *best_y);
    UPDATE_BEST(sads[1], 0, x + 4, *best_s, *best_x, *best_y);
    UPDATE_BEST(sads[1], 1, x + 4, *best_s, *best_x, *best_y);
    UPDATE_BEST(sads[1], 2, x + 4, *best_s, *best_x, *best_y);
    UPDATE_BEST(sads[1], 3, x + 4, *best_s, *best_x, *best_y);
}
1232 
update_small_pel(const __m256i sum256,const int32_t x,const int32_t y,uint32_t * const best_s,int32_t * const best_x,int32_t * const best_y)1233 static INLINE void update_small_pel(const __m256i sum256, const int32_t x, const int32_t y,
1234                                     uint32_t *const best_s, int32_t *const best_x,
1235                                     int32_t *const best_y) {
1236     const __m128i sum256_lo = _mm256_castsi256_si128(sum256);
1237     const __m128i sum256_hi = _mm256_extracti128_si256(sum256, 1);
1238     const __m128i sad       = _mm_adds_epu16(sum256_lo, sum256_hi);
1239     update_best(sad, x, y, best_s, best_x, best_y);
1240 }
1241 
update_some_pel(const __m256i sum256,const int32_t x,const int32_t y,uint32_t * const best_s,int32_t * const best_x,int32_t * const best_y)1242 static INLINE void update_some_pel(const __m256i sum256, const int32_t x, const int32_t y,
1243                                    uint32_t *const best_s, int32_t *const best_x,
1244                                    int32_t *const best_y) {
1245     const __m128i  sum0 = _mm256_castsi256_si128(sum256);
1246     const __m128i  sum1 = _mm256_extracti128_si256(sum256, 1);
1247     __m128i        minpos;
1248     const uint32_t min0 = saturate_add(sum0, sum1, &minpos);
1249     update_best_kernel(min0, minpos, x, y, best_s, best_x, best_y);
1250 }
1251 
update_256_pel(const __m512i sum512,const int32_t x,const int32_t y,uint32_t * const best_s,int32_t * const best_x,int32_t * const best_y)1252 static INLINE void update_256_pel(const __m512i sum512, const int32_t x, const int32_t y,
1253                                   uint32_t *const best_s, int32_t *const best_x,
1254                                   int32_t *const best_y) {
1255     const __m256i sum512_lo = _mm512_castsi512_si256(sum512);
1256     const __m256i sum512_hi = _mm512_extracti64x4_epi64(sum512, 1);
1257     const __m256i sum256    = _mm256_adds_epu16(sum512_lo, sum512_hi);
1258     const __m128i sum256_lo = _mm256_castsi256_si128(sum256);
1259     const __m128i sum256_hi = _mm256_extracti128_si256(sum256, 1);
1260     update_best(sum256_lo, x + 0, y, best_s, best_x, best_y);
1261     update_best(sum256_hi, x + 8, y, best_s, best_x, best_y);
1262 }
1263 
/* Combine one 512-bit accumulator with two 256-bit accumulators (a 384-pel
 * block's partial sums) and update the running best for 16 positions.
 * The low/high 128-bit pieces of the folded 512-bit sum are paired with the
 * folded sums256[0]/sums256[1] respectively — the pairing order is what maps
 * lanes to search positions x..x+7 and x+8..x+15. */
SIMD_INLINE void update_384_pel(const __m512i sum512, const __m256i sums256[2], const int32_t x,
                                const int32_t y, uint32_t *const best_s, int32_t *const best_x,
                                int32_t *const best_y) {
    /* Fold the 512-bit accumulator down to 16 x 16-bit lanes. */
    const __m256i sum512_lo = _mm512_castsi512_si256(sum512);
    const __m256i sum512_hi = _mm512_extracti64x4_epi64(sum512, 1);
    const __m256i sad256    = _mm256_adds_epu16(sum512_lo, sum512_hi);
    const __m128i sad256_lo = _mm256_castsi256_si128(sad256);
    const __m128i sad256_hi = _mm256_extracti128_si256(sad256, 1);

    /* Fold each 256-bit accumulator down to 8 x 16-bit lanes. */
    const __m128i sum256_0_lo = _mm256_castsi256_si128(sums256[0]);
    const __m128i sum256_0_hi = _mm256_extracti128_si256(sums256[0], 1);
    const __m128i sad128_0    = _mm_adds_epu16(sum256_0_lo, sum256_0_hi);

    const __m128i sum256_1_lo = _mm256_castsi256_si128(sums256[1]);
    const __m128i sum256_1_hi = _mm256_extracti128_si256(sums256[1], 1);
    const __m128i sad128_1    = _mm_adds_epu16(sum256_1_lo, sum256_1_hi);

    /* saturate_add returns the 16-bit min and its position for each group of 8. */
    __m128i        minpos_lo, minpos_hi;
    const uint32_t min_lo = saturate_add(sad256_lo, sad128_0, &minpos_lo);
    update_best_kernel(min_lo, minpos_lo, x + 0, y, best_s, best_x, best_y);
    const uint32_t min_hi = saturate_add(sad256_hi, sad128_1, &minpos_hi);
    update_best_kernel(min_hi, minpos_hi, x + 8, y, best_s, best_x, best_y);
}
1287 
/* Reduce a 512-bit accumulator for a 512-pel block and update the running
 * best for 16 positions. The 128-bit pieces are paired low-with-low and
 * high-with-high across the two 256-bit halves — this cross pairing is what
 * maps lanes to positions x..x+7 and x+8..x+15. */
SIMD_INLINE void update_512_pel(const __m512i sum512, const int32_t x, const int32_t y,
                                uint32_t *const best_s, int32_t *const best_x,
                                int32_t *const best_y) {
    const __m256i sum512_lo = _mm512_castsi512_si256(sum512);
    const __m256i sum512_hi = _mm512_extracti64x4_epi64(sum512, 1);
    __m128i       minpos_lo, minpos_hi;

    /* Positions x..x+7: low 128 bits of each 256-bit half. */
    const __m128i  sum128_0 = _mm256_castsi256_si128(sum512_lo);
    const __m128i  sum128_1 = _mm256_castsi256_si128(sum512_hi);
    const uint32_t min_lo   = saturate_add(sum128_0, sum128_1, &minpos_lo);
    update_best_kernel(min_lo, minpos_lo, x + 0, y, best_s, best_x, best_y);

    /* Positions x+8..x+15: high 128 bits of each 256-bit half. */
    const __m128i  sum128_2 = _mm256_extracti128_si256(sum512_lo, 1);
    const __m128i  sum128_3 = _mm256_extracti128_si256(sum512_hi, 1);
    const uint32_t min_hi   = saturate_add(sum128_2, sum128_3, &minpos_hi);
    update_best_kernel(min_hi, minpos_hi, x + 8, y, best_s, best_x, best_y);
}
1305 
/* Combine two 512-bit accumulators (a 1024-pel block's partial sums), find
 * the minimum over 16 positions, and update the running best. Because the
 * adds saturate at 16 bits, a minimum of 0xFFFF means the true SADs may have
 * clipped; in that case the sums are re-expanded to 32 bits via
 * add16x16x2to32bit and all 16 positions are compared exactly. */
SIMD_INLINE void update_1024_pel(const __m512i sums512[2], const int32_t x, const int32_t y,
                                 uint32_t *const best_s, int32_t *const best_x,
                                 int32_t *const best_y) {
    /* Saturating reduction: 2x512 -> 512 -> 256 -> two 128-bit groups of 8. */
    const __m512i  sum       = _mm512_adds_epu16(sums512[0], sums512[1]);
    const __m256i  sum_lo    = _mm512_castsi512_si256(sum);
    const __m256i  sum_hi    = _mm512_extracti64x4_epi64(sum, 1);
    const __m256i  sad       = _mm256_adds_epu16(sum_lo, sum_hi);
    const __m128i  sad_lo    = _mm256_castsi256_si128(sad);
    const __m128i  sad_hi    = _mm256_extracti128_si256(sad, 1);
    const __m128i  minpos_lo = _mm_minpos_epu16(sad_lo);
    const __m128i  minpos_hi = _mm_minpos_epu16(sad_hi);
    const uint32_t min0      = _mm_extract_epi16(minpos_lo, 0);
    const uint32_t min1      = _mm_extract_epi16(minpos_hi, 0);
    uint32_t       minmin, delta;
    __m128i        minpos;

    /* Pick the smaller of the two group minimums; delta is the x offset of
     * the winning group (<= keeps the lower-x candidate on ties). */
    if (min0 <= min1) {
        minmin = min0;
        delta  = 0;
        minpos = minpos_lo;
    } else {
        minmin = min1;
        delta  = 8;
        minpos = minpos_hi;
    }

    if (minmin < *best_s) {
        if (minmin != 0xFFFF) { // no overflow
            *best_s = minmin;
            *best_x = x + delta + _mm_extract_epi16(minpos, 1);
            *best_y = y;
        } else { // overflow
            /* Saturated: redo the reduction in 32 bits and compare all 16
             * positions with exact values. */
            __m256i sads256[2];
            __m128i sads128[2];

            add16x16x2to32bit(sums512, sads256);

            sads128[0] = _mm256_castsi256_si128(sads256[0]);
            sads128[1] = _mm256_extracti128_si256(sads256[0], 1);
            update_8_best(sads128, x + 0, y, best_s, best_x, best_y);

            sads128[0] = _mm256_castsi256_si128(sads256[1]);
            sads128[1] = _mm256_extracti128_si256(sads256[1], 1);
            update_8_best(sads128, x + 8, y, best_s, best_x, best_y);
        }
    }
}
1353 
/* 768-pel variant: pack the two 256-bit accumulators into a second 512-bit
 * vector with the same lane layout as sum512 (the epi64 permute interleaves
 * the 128-bit pieces into matching positions), then reuse the 1024-pel path
 * including its 32-bit overflow fallback. */
SIMD_INLINE void update_768_pel(const __m512i sum512, const __m256i sums256[2], const int32_t x,
                                const int32_t y, uint32_t *const best_s, int32_t *const best_x,
                                int32_t *const best_y) {
    __m512i sums512[2];

    sums512[0] = sum512;
    sums512[1] = _mm512_inserti64x4(_mm512_castsi256_si512(sums256[0]), sums256[1], 1);
    /* Reorder 128-bit pieces so sums512[1] lanes line up with sums512[0]. */
    sums512[1] = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), sums512[1]);
    update_1024_pel(sums512, x, y, best_s, best_x, best_y);
}
1364 
/* Combine three 512-bit accumulators (a 1536-pel block's partial sums), find
 * the minimum over 16 positions, and update the running best. As in
 * update_1024_pel, a 0xFFFF minimum signals 16-bit saturation and triggers an
 * exact 32-bit recomputation via add16x16x3to32bit. */
SIMD_INLINE void update_1536_pel(const __m512i sums512[3], const int32_t x, const int32_t y,
                                 uint32_t *const best_s, int32_t *const best_x,
                                 int32_t *const best_y) {
    /* Saturating reduction: 3x512 -> 512 -> 256 -> two 128-bit groups of 8. */
    const __m512i  sum01     = _mm512_adds_epu16(sums512[0], sums512[1]);
    const __m512i  sum       = _mm512_adds_epu16(sum01, sums512[2]);
    const __m256i  sum_lo    = _mm512_castsi512_si256(sum);
    const __m256i  sum_hi    = _mm512_extracti64x4_epi64(sum, 1);
    const __m256i  sad       = _mm256_adds_epu16(sum_lo, sum_hi);
    const __m128i  sad_lo    = _mm256_castsi256_si128(sad);
    const __m128i  sad_hi    = _mm256_extracti128_si256(sad, 1);
    const __m128i  minpos_lo = _mm_minpos_epu16(sad_lo);
    const __m128i  minpos_hi = _mm_minpos_epu16(sad_hi);
    const uint32_t min0      = _mm_extract_epi16(minpos_lo, 0);
    const uint32_t min1      = _mm_extract_epi16(minpos_hi, 0);
    uint32_t       minmin, delta;
    __m128i        minpos;

    /* Pick the smaller group minimum; delta is the winning group's x offset. */
    if (min0 <= min1) {
        minmin = min0;
        delta  = 0;
        minpos = minpos_lo;
    } else {
        minmin = min1;
        delta  = 8;
        minpos = minpos_hi;
    }

    if (minmin < *best_s) {
        if (minmin != 0xFFFF) { // no overflow
            *best_s = minmin;
            *best_x = x + delta + _mm_extract_epi16(minpos, 1);
            *best_y = y;
        } else { // overflow
            /* Saturated: redo the reduction in 32 bits, exact comparison. */
            __m256i sads256[2];
            __m128i sads128[2];

            add16x16x3to32bit(sums512, sads256);

            sads128[0] = _mm256_castsi256_si128(sads256[0]);
            sads128[1] = _mm256_extracti128_si256(sads256[0], 1);
            update_8_best(sads128, x + 0, y, best_s, best_x, best_y);

            sads128[0] = _mm256_castsi256_si128(sads256[1]);
            sads128[1] = _mm256_extracti128_si256(sads256[1], 1);
            update_8_best(sads128, x + 8, y, best_s, best_x, best_y);
        }
    }
}
1413 
/* Combine four 512-bit accumulators (a 2048-pel block's partial sums), find
 * the minimum over 16 positions, and update the running best. As in
 * update_1024_pel, a 0xFFFF minimum signals 16-bit saturation and triggers an
 * exact 32-bit recomputation via add16x16x4to32bit. */
SIMD_INLINE void update_2048_pel(const __m512i sums512[4], const int32_t x, const int32_t y,
                                 uint32_t *const best_s, int32_t *const best_x,
                                 int32_t *const best_y) {
    /* Saturating tree reduction: 4x512 -> 512 -> 256 -> two 128-bit groups. */
    const __m512i  sum01     = _mm512_adds_epu16(sums512[0], sums512[1]);
    const __m512i  sum23     = _mm512_adds_epu16(sums512[2], sums512[3]);
    const __m512i  sum       = _mm512_adds_epu16(sum01, sum23);
    const __m256i  sum_lo    = _mm512_castsi512_si256(sum);
    const __m256i  sum_hi    = _mm512_extracti64x4_epi64(sum, 1);
    const __m256i  sad       = _mm256_adds_epu16(sum_lo, sum_hi);
    const __m128i  sad_lo    = _mm256_castsi256_si128(sad);
    const __m128i  sad_hi    = _mm256_extracti128_si256(sad, 1);
    const __m128i  minpos_lo = _mm_minpos_epu16(sad_lo);
    const __m128i  minpos_hi = _mm_minpos_epu16(sad_hi);
    const uint32_t min0      = _mm_extract_epi16(minpos_lo, 0);
    const uint32_t min1      = _mm_extract_epi16(minpos_hi, 0);
    uint32_t       minmin, delta;
    __m128i        minpos;

    /* Pick the smaller group minimum; delta is the winning group's x offset. */
    if (min0 <= min1) {
        minmin = min0;
        delta  = 0;
        minpos = minpos_lo;
    } else {
        minmin = min1;
        delta  = 8;
        minpos = minpos_hi;
    }

    if (minmin < *best_s) {
        if (minmin != 0xFFFF) { // no overflow
            *best_s = minmin;
            *best_x = x + delta + _mm_extract_epi16(minpos, 1);
            *best_y = y;
        } else { // overflow
            /* Saturated: redo the reduction in 32 bits, exact comparison. */
            __m256i sads256[2];
            __m128i sads128[2];

            add16x16x4to32bit(sums512, sads256);

            sads128[0] = _mm256_castsi256_si128(sads256[0]);
            sads128[1] = _mm256_extracti128_si256(sads256[0], 1);
            update_8_best(sads128, x + 0, y, best_s, best_x, best_y);

            sads128[0] = _mm256_castsi256_si128(sads256[1]);
            sads128[1] = _mm256_extracti128_si256(sads256[1], 1);
            update_8_best(sads128, x + 8, y, best_s, best_x, best_y);
        }
    }
}
1463 
update_leftover_small_pel(const __m256i sum256,const int32_t x,const int32_t y,const __m128i mask,uint32_t * const best_s,int32_t * const best_x,int32_t * const best_y)1464 static INLINE void update_leftover_small_pel(const __m256i sum256, const int32_t x, const int32_t y,
1465                                              const __m128i mask, uint32_t *const best_s,
1466                                              int32_t *const best_x, int32_t *const best_y) {
1467     const __m128i sum256_lo = _mm256_castsi256_si128(sum256);
1468     const __m128i sum256_hi = _mm256_extracti128_si256(sum256, 1);
1469     __m128i       sad       = _mm_adds_epu16(sum256_lo, sum256_hi);
1470     sad                     = _mm_or_si128(sad, mask);
1471     update_best(sad, x, y, best_s, best_x, best_y);
1472 }
1473 
update_leftover_256_pel(const __m256i sum256,const int16_t search_area_width,const int32_t x,const int32_t y,const __m128i mask,uint32_t * const best_s,int32_t * const best_x,int32_t * const best_y)1474 static INLINE void update_leftover_256_pel(const __m256i sum256, const int16_t search_area_width,
1475                                            const int32_t x, const int32_t y, const __m128i mask,
1476                                            uint32_t *const best_s, int32_t *const best_x,
1477                                            int32_t *const best_y) {
1478     const __m128i sum256_lo = _mm256_castsi256_si128(sum256);
1479     const __m128i sum256_hi = _mm256_extracti128_si256(sum256, 1);
1480     __m128i       sad       = _mm_adds_epu16(sum256_lo, sum256_hi);
1481     if ((x + 8) > search_area_width) {
1482         sad = _mm_or_si128(sad, mask);
1483     }
1484     update_best(sad, x, y, best_s, best_x, best_y);
1485 }
1486 
update_leftover_512_pel(const __m256i sum256,const int16_t search_area_width,const int32_t x,const int32_t y,const __m256i mask,uint32_t * const best_s,int32_t * const best_x,int32_t * const best_y)1487 static INLINE void update_leftover_512_pel(const __m256i sum256, const int16_t search_area_width,
1488                                            const int32_t x, const int32_t y, const __m256i mask,
1489                                            uint32_t *const best_s, int32_t *const best_x,
1490                                            int32_t *const best_y) {
1491     __m256i sum = sum256;
1492     if ((x + 8) > search_area_width) {
1493         sum = _mm256_or_si256(sum256, mask);
1494     }
1495     const __m128i  sum0 = _mm256_castsi256_si128(sum);
1496     const __m128i  sum1 = _mm256_extracti128_si256(sum, 1);
1497     __m128i        minpos;
1498     const uint32_t min0 = saturate_add(sum0, sum1, &minpos);
1499     update_best_kernel(min0, minpos, x, y, best_s, best_x, best_y);
1500 }
1501 
update_leftover8_1024_pel(const __m256i sums256[2],const int32_t x,const int32_t y,uint32_t * const best_s,int32_t * const best_x,int32_t * const best_y)1502 SIMD_INLINE void update_leftover8_1024_pel(const __m256i sums256[2], const int32_t x,
1503                                            const int32_t y, uint32_t *const best_s,
1504                                            int32_t *const best_x, int32_t *const best_y) {
1505     const __m256i  sum256 = _mm256_adds_epu16(sums256[0], sums256[1]);
1506     const __m128i  sum_lo = _mm256_castsi256_si128(sum256);
1507     const __m128i  sum_hi = _mm256_extracti128_si256(sum256, 1);
1508     const __m128i  sad    = _mm_adds_epu16(sum_lo, sum_hi);
1509     const __m128i  minpos = _mm_minpos_epu16(sad);
1510     const uint32_t min0   = _mm_extract_epi16(minpos, 0);
1511 
1512     if (min0 < *best_s) {
1513         if (min0 != 0xFFFF) { // no overflow
1514             *best_s = min0;
1515             *best_x = x + _mm_extract_epi16(minpos, 1);
1516             *best_y = y;
1517         } else { // overflow
1518             __m128i sads[2];
1519             add16x8x2to32bit(sums256, sads);
1520             update_8_best(sads, x, y, best_s, best_x, best_y);
1521         }
1522     }
1523 }
1524 
update_leftover_1024_pel(const __m256i sums256[2],const int16_t search_area_width,const int32_t x,const int32_t y,const uint32_t leftover,const __m128i mask,uint32_t * const best_s,int32_t * const best_x,int32_t * const best_y)1525 SIMD_INLINE void update_leftover_1024_pel(const __m256i sums256[2], const int16_t search_area_width,
1526                                           const int32_t x, const int32_t y, const uint32_t leftover,
1527                                           const __m128i mask, uint32_t *const best_s,
1528                                           int32_t *const best_x, int32_t *const best_y) {
1529     const __m256i  sum256 = _mm256_adds_epu16(sums256[0], sums256[1]);
1530     const __m128i  sum_lo = _mm256_castsi256_si128(sum256);
1531     const __m128i  sum_hi = _mm256_extracti128_si256(sum256, 1);
1532     const __m128i  sad0   = _mm_adds_epu16(sum_lo, sum_hi);
1533     const __m128i  sad1   = _mm_or_si128(sad0, mask);
1534     const __m128i  minpos = _mm_minpos_epu16(sad1);
1535     const uint32_t min0   = _mm_extract_epi16(minpos, 0);
1536     int32_t        xx     = x;
1537 
1538     if (min0 < *best_s) {
1539         if (min0 != 0xFFFF) { // no overflow
1540             *best_s = min0;
1541             *best_x = xx + _mm_extract_epi16(minpos, 1);
1542             *best_y = y;
1543         } else { // overflow
1544             const int32_t num = xx + ((leftover < 4) ? leftover : 4);
1545             __m128i       sads[2];
1546 
1547             add16x8x2to32bit(sums256, sads);
1548 
1549             do {
1550                 UPDATE_BEST(sads[0], 0, xx, *best_s, *best_x, *best_y);
1551                 sads[0] = _mm_srli_si128(sads[0], 4);
1552             } while (++xx < num);
1553 
1554             while (xx < search_area_width) {
1555                 UPDATE_BEST(sads[1], 0, xx, *best_s, *best_x, *best_y);
1556                 sads[1] = _mm_srli_si128(sads[1], 4);
1557                 xx++;
1558             }
1559         }
1560     }
1561 }
1562 
update_leftover8_1536_pel(const __m256i sums256[3],const int32_t x,const int32_t y,uint32_t * const best_s,int32_t * const best_x,int32_t * const best_y)1563 SIMD_INLINE void update_leftover8_1536_pel(const __m256i sums256[3], const int32_t x,
1564                                            const int32_t y, uint32_t *const best_s,
1565                                            int32_t *const best_x, int32_t *const best_y) {
1566     const __m256i  sum01  = _mm256_adds_epu16(sums256[0], sums256[1]);
1567     const __m256i  sum256 = _mm256_adds_epu16(sum01, sums256[2]);
1568     const __m128i  sum_lo = _mm256_castsi256_si128(sum256);
1569     const __m128i  sum_hi = _mm256_extracti128_si256(sum256, 1);
1570     const __m128i  sad    = _mm_adds_epu16(sum_lo, sum_hi);
1571     const __m128i  minpos = _mm_minpos_epu16(sad);
1572     const uint32_t min0   = _mm_extract_epi16(minpos, 0);
1573 
1574     if (min0 < *best_s) {
1575         if (min0 != 0xFFFF) { // no overflow
1576             *best_s = min0;
1577             *best_x = x + _mm_extract_epi16(minpos, 1);
1578             *best_y = y;
1579         } else { // overflow
1580             __m128i sads[2];
1581             add16x8x3to32bit(sums256, sads);
1582             update_8_best(sads, x, y, best_s, best_x, best_y);
1583         }
1584     }
1585 }
1586 
update_leftover_1536_pel(const __m256i sums256[3],const int16_t search_area_width,const int32_t x,const int32_t y,const uint32_t leftover,const __m128i mask,uint32_t * const best_s,int32_t * const best_x,int32_t * const best_y)1587 SIMD_INLINE void update_leftover_1536_pel(const __m256i sums256[3], const int16_t search_area_width,
1588                                           const int32_t x, const int32_t y, const uint32_t leftover,
1589                                           const __m128i mask, uint32_t *const best_s,
1590                                           int32_t *const best_x, int32_t *const best_y) {
1591     const __m256i  sum01  = _mm256_adds_epu16(sums256[0], sums256[1]);
1592     const __m256i  sum256 = _mm256_adds_epu16(sum01, sums256[2]);
1593     const __m128i  sum_lo = _mm256_castsi256_si128(sum256);
1594     const __m128i  sum_hi = _mm256_extracti128_si256(sum256, 1);
1595     const __m128i  sad0   = _mm_adds_epu16(sum_lo, sum_hi);
1596     const __m128i  sad1   = _mm_or_si128(sad0, mask);
1597     const __m128i  minpos = _mm_minpos_epu16(sad1);
1598     const uint32_t min0   = _mm_extract_epi16(minpos, 0);
1599     int32_t        xx     = x;
1600 
1601     if (min0 < *best_s) {
1602         if (min0 != 0xFFFF) { // no overflow
1603             *best_s = min0;
1604             *best_x = xx + _mm_extract_epi16(minpos, 1);
1605             *best_y = y;
1606         } else { // overflow
1607             const int32_t num = xx + ((leftover < 4) ? leftover : 4);
1608             __m128i       sads[2];
1609 
1610             add16x8x3to32bit(sums256, sads);
1611 
1612             do {
1613                 UPDATE_BEST(sads[0], 0, xx, *best_s, *best_x, *best_y);
1614                 sads[0] = _mm_srli_si128(sads[0], 4);
1615             } while (++xx < num);
1616 
1617             while (xx < search_area_width) {
1618                 UPDATE_BEST(sads[1], 0, xx, *best_s, *best_x, *best_y);
1619                 sads[1] = _mm_srli_si128(sads[1], 4);
1620                 xx++;
1621             }
1622         }
1623     }
1624 }
1625 
update_leftover8_2048_pel(const __m256i sums256[4],const int32_t x,const int32_t y,uint32_t * const best_s,int32_t * const best_x,int32_t * const best_y)1626 SIMD_INLINE void update_leftover8_2048_pel(const __m256i sums256[4], const int32_t x,
1627                                            const int32_t y, uint32_t *const best_s,
1628                                            int32_t *const best_x, int32_t *const best_y) {
1629     const __m256i  sum01  = _mm256_adds_epu16(sums256[0], sums256[1]);
1630     const __m256i  sum23  = _mm256_adds_epu16(sums256[2], sums256[3]);
1631     const __m256i  sum256 = _mm256_adds_epu16(sum01, sum23);
1632     const __m128i  sum_lo = _mm256_castsi256_si128(sum256);
1633     const __m128i  sum_hi = _mm256_extracti128_si256(sum256, 1);
1634     const __m128i  sad    = _mm_adds_epu16(sum_lo, sum_hi);
1635     const __m128i  minpos = _mm_minpos_epu16(sad);
1636     const uint32_t min0   = _mm_extract_epi16(minpos, 0);
1637 
1638     if (min0 < *best_s) {
1639         if (min0 != 0xFFFF) { // no overflow
1640             *best_s = min0;
1641             *best_x = x + _mm_extract_epi16(minpos, 1);
1642             *best_y = y;
1643         } else { // overflow
1644             __m128i sads[2];
1645             add16x8x4to32bit(sums256, sads);
1646             update_8_best(sads, x, y, best_s, best_x, best_y);
1647         }
1648     }
1649 }
1650 
update_leftover_2048_pel(const __m256i sums256[4],const int16_t search_area_width,const int32_t x,const int32_t y,const uint32_t leftover,const __m128i mask,uint32_t * const best_s,int32_t * const best_x,int32_t * const best_y)1651 SIMD_INLINE void update_leftover_2048_pel(const __m256i sums256[4], const int16_t search_area_width,
1652                                           const int32_t x, const int32_t y, const uint32_t leftover,
1653                                           const __m128i mask, uint32_t *const best_s,
1654                                           int32_t *const best_x, int32_t *const best_y) {
1655     const __m256i  sum01  = _mm256_adds_epu16(sums256[0], sums256[1]);
1656     const __m256i  sum23  = _mm256_adds_epu16(sums256[2], sums256[3]);
1657     const __m256i  sum256 = _mm256_adds_epu16(sum01, sum23);
1658     const __m128i  sum_lo = _mm256_castsi256_si128(sum256);
1659     const __m128i  sum_hi = _mm256_extracti128_si256(sum256, 1);
1660     const __m128i  sad0   = _mm_adds_epu16(sum_lo, sum_hi);
1661     const __m128i  sad1   = _mm_or_si128(sad0, mask);
1662     const __m128i  minpos = _mm_minpos_epu16(sad1);
1663     const uint32_t min0   = _mm_extract_epi16(minpos, 0);
1664     int32_t        xx     = x;
1665 
1666     if (min0 < *best_s) {
1667         if (min0 != 0xFFFF) { // no overflow
1668             *best_s = min0;
1669             *best_x = xx + _mm_extract_epi16(minpos, 1);
1670             *best_y = y;
1671         } else { // overflow
1672             const int32_t num = xx + ((leftover < 4) ? leftover : 4);
1673             __m128i       sads[2];
1674 
1675             add16x8x4to32bit(sums256, sads);
1676 
1677             do {
1678                 UPDATE_BEST(sads[0], 0, xx, *best_s, *best_x, *best_y);
1679                 sads[0] = _mm_srli_si128(sads[0], 4);
1680             } while (++xx < num);
1681 
1682             while (xx < search_area_width) {
1683                 UPDATE_BEST(sads[1], 0, xx, *best_s, *best_x, *best_y);
1684                 sads[1] = _mm_srli_si128(sads[1], 4);
1685                 xx++;
1686             }
1687         }
1688     }
1689 }
1690 
1691 /*******************************************************************************
1692 * Requirement: search_size <= 8,
1693 * Returns "search_size" of SAD's for all height in 5th and 6th col
1694 *******************************************************************************/
complement_4_to_6(uint8_t * ref,uint32_t ref_stride,uint8_t * src,uint32_t src_stride,uint32_t height,uint32_t search_size)1695 static INLINE __m128i complement_4_to_6(uint8_t *ref, uint32_t ref_stride, uint8_t *src,
1696                                         uint32_t src_stride, uint32_t height,
1697                                         uint32_t search_size) {
1698     __m128i sum;
1699     DECLARE_ALIGNED(16, uint16_t, tsum[8]);
1700     memset(tsum, 0, 8 * sizeof(uint16_t));
1701     for (uint32_t search_area = 0; search_area < search_size; search_area++) {
1702         for (uint32_t y = 0; y < height; y++) {
1703             tsum[search_area] += EB_ABS_DIFF(src[y * src_stride + 4], ref[y * ref_stride + 4]) +
1704                 EB_ABS_DIFF(src[y * src_stride + 5], ref[y * ref_stride + 5]);
1705         }
1706         ref += 1;
1707     }
1708     sum = _mm_loadu_si128((__m128i *)tsum);
1709     return sum;
1710 }
1711 
1712 /*******************************************************************************
1713  * Requirement: block_height < 64
1714  * General version for SAD computing that support any block width and height
1715 *******************************************************************************/
sad_loop_kernel_generalized_avx512(uint8_t * src,uint32_t src_stride,uint8_t * ref,uint32_t ref_stride,uint32_t block_height,uint32_t block_width,uint64_t * best_sad,int16_t * x_search_center,int16_t * y_search_center,uint32_t src_stride_raw,int16_t search_area_width,int16_t search_area_height)1716 void sad_loop_kernel_generalized_avx512(
1717     uint8_t * src, // input parameter, source samples Ptr
1718     uint32_t  src_stride, // input parameter, source stride
1719     uint8_t * ref, // input parameter, reference samples Ptr
1720     uint32_t  ref_stride, // input parameter, reference stride
1721     uint32_t  block_height, // input parameter, block height (M)
1722     uint32_t  block_width, // input parameter, block width (N)
1723     uint64_t *best_sad, int16_t *x_search_center, int16_t *y_search_center,
1724     uint32_t src_stride_raw, // input parameter, source stride (no line skipping)
1725     int16_t search_area_width, int16_t search_area_height) {
1726     int16_t        i, j;
1727     uint32_t       k, l;
1728     const uint8_t *p_ref, *p_src;
1729     uint32_t       low_sum = 0xffffff;
1730     int32_t        x_best = *x_search_center, y_best = *y_search_center;
1731     uint32_t       leftover = search_area_width & 15;
1732 
1733     __m128i leftover_mask    = _mm_set1_epi32(-1);
1734     __m128i leftover_mask32b = _mm_set1_epi32(-1);
1735     if (leftover) {
1736         for (k = 0; k < (uint32_t)(search_area_width & 7); k++)
1737             leftover_mask = _mm_slli_si128(leftover_mask, 2);
1738         for (k = 0; k < (uint32_t)(search_area_width & 3); k++)
1739             leftover_mask32b = _mm_slli_si128(leftover_mask32b, 4);
1740     }
1741 
1742     for (i = 0; i < search_area_height; i++) {
1743         for (j = 0; j < search_area_width; j += 16) {
1744             p_src = src;
1745             p_ref = ref + j;
1746 
1747             __m512i sums512[8] = {_mm512_setzero_si512(),
1748                                   _mm512_setzero_si512(),
1749                                   _mm512_setzero_si512(),
1750                                   _mm512_setzero_si512(),
1751                                   _mm512_setzero_si512(),
1752                                   _mm512_setzero_si512(),
1753                                   _mm512_setzero_si512(),
1754                                   _mm512_setzero_si512()};
1755             __m256i sum256_1   = _mm256_setzero_si256();
1756             __m256i sum256_2   = _mm256_setzero_si256();
1757             __m256i sum256_3   = _mm256_setzero_si256();
1758             __m256i sum256_4   = _mm256_setzero_si256();
1759             __m256i sum256     = _mm256_setzero_si256();
1760             for (k = 0; k + 2 <= block_height; k += 2) {
1761                 uint32_t       width_calc = block_width;
1762                 const uint8_t *temp_src   = p_src;
1763                 const uint8_t *temp_ref   = p_ref;
1764                 if (width_calc >= 32) {
1765                     sad_loop_kernel_32_4sum_avx512(temp_src, temp_ref, &sums512[0]);
1766                     sad_loop_kernel_32_4sum_avx512(
1767                         temp_src + src_stride, temp_ref + ref_stride, &sums512[4]);
1768 
1769                     width_calc -= 32;
1770                     temp_src += 32;
1771                     temp_ref += 32;
1772                 }
1773                 if (width_calc >= 16) {
1774                     sad_loop_kernel_16_2sum_avx512(
1775                         temp_src, src_stride, temp_ref, ref_stride, &sums512[0]);
1776 
1777                     width_calc -= 16;
1778                     temp_src += 16;
1779                     temp_ref += 16;
1780                 }
1781                 if (width_calc >= 8) {
1782                     sad_loop_kernel_8_avx2(temp_src, src_stride, temp_ref, ref_stride, &sum256_1);
1783                     sad_loop_kernel_8_avx2(
1784                         temp_src, src_stride, temp_ref + 8, ref_stride, &sum256_2);
1785 
1786                     width_calc -= 8;
1787                     temp_src += 8;
1788                     temp_ref += 8;
1789                 }
1790                 if (width_calc >= 4) {
1791                     sad_loop_kernel_4_avx2(temp_src, src_stride, temp_ref, ref_stride, &sum256_3);
1792                     sad_loop_kernel_4_avx2(
1793                         temp_src, src_stride, temp_ref + 8, ref_stride, &sum256_4);
1794 
1795                     width_calc -= 4;
1796                     temp_src += 4;
1797                     temp_ref += 4;
1798                 }
1799                 if (width_calc > 0) {
1800                     DECLARE_ALIGNED(16, uint16_t, tsum[16]);
1801                     memset(tsum, 0, 16 * sizeof(uint16_t));
1802                     for (uint32_t search_area = 0; search_area < 16; search_area++) {
1803                         for (l = 0; l < width_calc; l++) {
1804                             tsum[search_area] += EB_ABS_DIFF(temp_src[l], temp_ref[l]) +
1805                                 EB_ABS_DIFF(temp_src[src_stride + l], temp_ref[ref_stride + l]);
1806                         }
1807                         temp_ref += 1;
1808                     }
1809                     sum256 = _mm256_adds_epu16(sum256, _mm256_loadu_si256((__m256i *)tsum));
1810                 }
1811                 p_src += 2 * src_stride;
1812                 p_ref += 2 * ref_stride;
1813             }
1814             //when height is not multiple of 2,then compute last line
1815             if (k < block_height) {
1816                 uint32_t       width_calc = block_width;
1817                 const uint8_t *temp_src   = p_src;
1818                 const uint8_t *temp_ref   = p_ref;
1819                 if (width_calc >= 32) {
1820                     sad_loop_kernel_32_4sum_avx512(temp_src, temp_ref, &sums512[0]);
1821 
1822                     width_calc -= 32;
1823                     temp_src += 32;
1824                     temp_ref += 32;
1825                 }
1826                 if (width_calc >= 16) {
1827                     sad_loop_kernel_16_2sum_oneline_avx512(temp_src, temp_ref, &sums512[4]);
1828 
1829                     width_calc -= 16;
1830                     temp_src += 16;
1831                     temp_ref += 16;
1832                 }
1833                 if (width_calc >= 8) {
1834                     sad_loop_kernel_8_oneline_avx2(temp_src, temp_ref, &sum256_1);
1835                     sad_loop_kernel_8_oneline_avx2(temp_src, temp_ref + 8, &sum256_2);
1836 
1837                     width_calc -= 8;
1838                     temp_src += 8;
1839                     temp_ref += 8;
1840                 }
1841                 if (width_calc >= 4) {
1842                     sad_loop_kernel_4_oneline_avx2(temp_src, temp_ref, &sum256_3);
1843                     sad_loop_kernel_4_oneline_avx2(temp_src, temp_ref + 8, &sum256_4);
1844 
1845                     width_calc -= 4;
1846                     temp_src += 4;
1847                     temp_ref += 4;
1848                 }
1849                 if (width_calc > 0) {
1850                     DECLARE_ALIGNED(16, uint16_t, tsum[16]);
1851                     memset(tsum, 0, 16 * sizeof(uint16_t));
1852                     for (uint32_t search_area = 0; search_area < 16; search_area++) {
1853                         for (l = 0; l < width_calc; l++) {
1854                             tsum[search_area] += EB_ABS_DIFF(temp_src[l], temp_ref[l]);
1855                         }
1856                         temp_ref += 1;
1857                     }
1858                     sum256 = _mm256_adds_epu16(sum256, _mm256_loadu_si256((__m256i *)tsum));
1859                 }
1860                 p_src += src_stride;
1861                 p_ref += ref_stride;
1862             }
1863 
1864             //update all
1865             __m512i sum512 = _mm512_inserti64x4(_mm512_castsi256_si512(sum256_1), sum256_2, 0x1);
1866             sum512 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), sum512);
1867             sums512[5] = _mm512_adds_epu16(sums512[5], sum512);
1868 
1869             sum512 = _mm512_inserti64x4(_mm512_castsi256_si512(sum256_3), sum256_4, 0x1);
1870             sum512 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), sum512);
1871             sums512[6] = _mm512_adds_epu16(sums512[6], sum512);
1872 
1873             sums512[7] = _mm512_adds_epu16(sums512[7],
1874                                            _mm512_inserti64x4(_mm512_setzero_si512(), sum256, 0));
1875 
1876             const __m512i sum512_01   = _mm512_adds_epu16(sums512[0], sums512[1]);
1877             const __m512i sum512_23   = _mm512_adds_epu16(sums512[2], sums512[3]);
1878             const __m512i sum512_45   = _mm512_adds_epu16(sums512[4], sums512[5]);
1879             const __m512i sum512_67   = _mm512_adds_epu16(sums512[6], sums512[7]);
1880             const __m512i sum512_0123 = _mm512_adds_epu16(sum512_01, sum512_23);
1881             const __m512i sum512_4567 = _mm512_adds_epu16(sum512_45, sum512_67);
1882             sum512                    = _mm512_adds_epu16(sum512_0123, sum512_4567);
1883             const __m256i sum_lo      = _mm512_castsi512_si256(sum512);
1884             const __m256i sum_hi      = _mm512_extracti64x4_epi64(sum512, 1);
1885             const __m256i sad         = _mm256_adds_epu16(sum_lo, sum_hi);
1886             __m128i       sad_lo      = _mm256_castsi256_si128(sad);
1887             __m128i       sad_hi      = _mm256_extracti128_si256(sad, 1);
1888             if (leftover && (j + 16) >= search_area_width) {
1889                 if (leftover < 8) {
1890                     sad_lo = _mm_or_si128(sad_lo, leftover_mask);
1891                     sad_hi = _mm_set1_epi32(-1);
1892                 } else {
1893                     sad_hi = _mm_or_si128(sad_hi, leftover_mask);
1894                 }
1895             }
1896             const __m128i  minpos_lo = _mm_minpos_epu16(sad_lo);
1897             const __m128i  minpos_hi = _mm_minpos_epu16(sad_hi);
1898             const uint32_t min0      = _mm_extract_epi16(minpos_lo, 0);
1899             const uint32_t min1      = _mm_extract_epi16(minpos_hi, 0);
1900             uint32_t       minmin, delta;
1901             __m128i        minpos;
1902 
1903             if (min0 <= min1) {
1904                 minmin = min0;
1905                 delta  = 0;
1906                 minpos = minpos_lo;
1907             } else {
1908                 minmin = min1;
1909                 delta  = 8;
1910                 minpos = minpos_hi;
1911             }
1912 
1913             if (minmin < low_sum) {
1914                 if (minmin != 0xFFFF) { // no overflow
1915                     low_sum = minmin;
1916                     x_best  = j + delta + _mm_extract_epi16(minpos, 1);
1917                     y_best  = i;
1918                 } else { // overflow
1919                     __m256i sads256[2];
1920                     __m128i sads128[4];
1921 
1922                     add16x16x8to32bit(sums512, sads256);
1923 
1924                     sads128[0] = _mm256_castsi256_si128(sads256[0]);
1925                     sads128[1] = _mm256_extracti128_si256(sads256[0], 1);
1926                     sads128[2] = _mm256_castsi256_si128(sads256[1]);
1927                     sads128[3] = _mm256_extracti128_si256(sads256[1], 1);
1928                     if (leftover && (j + 16) >= search_area_width) {
1929                         if (leftover < 4) {
1930                             sads128[0] = _mm_or_si128(sads128[0], leftover_mask32b);
1931                             sads128[1] = sads128[2] = sads128[3] = _mm_set1_epi32(-1);
1932                         } else if (leftover < 8) {
1933                             sads128[1] = _mm_or_si128(sads128[1], leftover_mask32b);
1934                             sads128[2] = sads128[3] = _mm_set1_epi32(-1);
1935                         } else if (leftover < 12) {
1936                             sads128[2] = _mm_or_si128(sads128[2], leftover_mask32b);
1937                             sads128[3] = _mm_set1_epi32(-1);
1938                         } else {
1939                             sads128[3] = _mm_or_si128(sads128[3], leftover_mask32b);
1940                         }
1941                     }
1942 
1943                     update_8_best(&sads128[0], j + 0, i, &low_sum, &x_best, &y_best);
1944                     update_8_best(&sads128[2], j + 8, i, &low_sum, &x_best, &y_best);
1945                 }
1946             }
1947         }
1948         ref += src_stride_raw;
1949     }
1950 
1951     *best_sad        = low_sum;
1952     *x_search_center = (int16_t)x_best;
1953     *y_search_center = (int16_t)y_best;
1954 }
1955 
1956 /*******************************************************************************
1957 * Requirement: width   = 4, 6, 8, 12, 16, 24, 32, 48 or 64 to use SIMD
1958 * otherwise general/slower SIMD verison is used
1959 * Requirement: height <= 64
1960 * Requirement: height % 2 = 0
1961 *******************************************************************************/
svt_sad_loop_kernel_avx512_intrin(uint8_t * src,uint32_t src_stride,uint8_t * ref,uint32_t ref_stride,uint32_t height,uint32_t width,uint64_t * best_sad,int16_t * x_search_center,int16_t * y_search_center,uint32_t src_stride_raw,int16_t search_area_width,int16_t search_area_height)1962 void svt_sad_loop_kernel_avx512_intrin(
1963     uint8_t * src, // input parameter, source samples Ptr
1964     uint32_t  src_stride, // input parameter, source stride
1965     uint8_t * ref, // input parameter, reference samples Ptr
1966     uint32_t  ref_stride, // input parameter, reference stride
1967     uint32_t  height, // input parameter, block height (M)
1968     uint32_t  width, // input parameter, block width (N)
1969     uint64_t *best_sad, int16_t *x_search_center, int16_t *y_search_center,
1970     uint32_t src_stride_raw, // input parameter, source stride (no line skipping)
1971     int16_t search_area_width, int16_t search_area_height) {
1972     const uint32_t height2 = height >> 1;
1973     const uint8_t *s, *r;
1974     int32_t        best_x = *x_search_center, best_y = *y_search_center;
1975     uint32_t       best_s = 0xffffff;
1976     int32_t        x, y;
1977     uint32_t       h;
1978 
1979     if (search_area_width == 8) {
1980         switch (width) {
1981         case 4:
1982             if (height <= 4) {
1983                 y = 0;
1984                 do {
1985                     __m128i sum = _mm_setzero_si128();
1986 
1987                     s = src;
1988                     r = ref;
1989 
1990                     h = height;
1991                     while (h >= 2) {
1992                         sad_loop_kernel_4_sse4_1(s, src_stride, r, ref_stride, &sum);
1993                         s += 2 * src_stride;
1994                         r += 2 * ref_stride;
1995                         h -= 2;
1996                     };
1997 
1998                     if (h) {
1999                         sad_loop_kernel_4_oneline_sse4_1(s, r, &sum);
2000                     }
2001 
2002                     update_best(sum, 0, y, &best_s, &best_x, &best_y);
2003                     ref += src_stride_raw;
2004                 } while (++y < search_area_height);
2005             } else {
2006                 y = 0;
2007                 do {
2008                     __m256i sum256 = _mm256_setzero_si256();
2009 
2010                     s = src;
2011                     r = ref;
2012 
2013                     h = height;
2014                     while (h >= 2) {
2015                         sad_loop_kernel_4_avx2(s, src_stride, r, ref_stride, &sum256);
2016                         s += 2 * src_stride;
2017                         r += 2 * ref_stride;
2018                         h -= 2;
2019                     };
2020 
2021                     if (h) {
2022                         sad_loop_kernel_4_oneline_avx2(s, r, &sum256);
2023                     }
2024 
2025                     update_small_pel(sum256, 0, y, &best_s, &best_x, &best_y);
2026                     ref += src_stride_raw;
2027                 } while (++y < search_area_height);
2028             }
2029             break;
2030 
2031         case 6:
2032             if (height <= 4) {
2033                 y = 0;
2034                 do {
2035                     __m128i sum = _mm_setzero_si128();
2036 
2037                     s = src;
2038                     r = ref;
2039 
2040                     h = height;
2041                     while (h >= 2) {
2042                         sad_loop_kernel_4_sse4_1(s, src_stride, r, ref_stride, &sum);
2043                         s += 2 * src_stride;
2044                         r += 2 * ref_stride;
2045                         h -= 2;
2046                     };
2047 
2048                     if (h) {
2049                         sad_loop_kernel_4_oneline_sse4_1(s, r, &sum);
2050                     }
2051 
2052                     __m128i sum2 = complement_4_to_6(ref, ref_stride, src, src_stride, height, 8);
2053                     sum          = _mm_adds_epu16(sum, sum2);
2054 
2055                     update_best(sum, 0, y, &best_s, &best_x, &best_y);
2056                     ref += src_stride_raw;
2057                 } while (++y < search_area_height);
2058             } else {
2059                 y = 0;
2060                 do {
2061                     __m256i sum256 = _mm256_setzero_si256();
2062 
2063                     s = src;
2064                     r = ref;
2065 
2066                     h = height;
2067                     while (h >= 2) {
2068                         sad_loop_kernel_4_avx2(s, src_stride, r, ref_stride, &sum256);
2069                         s += 2 * src_stride;
2070                         r += 2 * ref_stride;
2071                         h -= 2;
2072                     };
2073 
2074                     if (h) {
2075                         sad_loop_kernel_4_oneline_avx2(s, r, &sum256);
2076                     }
2077 
2078                     __m128i sum = complement_4_to_6(ref, ref_stride, src, src_stride, height, 8);
2079                     sum256      = _mm256_adds_epu16(
2080                         sum256, _mm256_insertf128_si256(_mm256_setzero_si256(), sum, 0));
2081 
2082                     update_small_pel(sum256, 0, y, &best_s, &best_x, &best_y);
2083                     ref += src_stride_raw;
2084                 } while (++y < search_area_height);
2085             }
2086             break;
2087 
2088         case 8:
2089             // Note: Tried _mm512_dbsad_epu8 but is even slower.
2090             y = 0;
2091             do {
2092                 __m256i sum256 = _mm256_setzero_si256();
2093 
2094                 s = src;
2095                 r = ref;
2096 
2097                 h = height;
2098                 while (h >= 2) {
2099                     sad_loop_kernel_8_avx2(s, src_stride, r, ref_stride, &sum256);
2100                     s += 2 * src_stride;
2101                     r += 2 * ref_stride;
2102                     h -= 2;
2103                 };
2104 
2105                 if (h) {
2106                     sad_loop_kernel_8_oneline_avx2(s, r, &sum256);
2107                 };
2108 
2109                 update_small_pel(sum256, 0, y, &best_s, &best_x, &best_y);
2110                 ref += src_stride_raw;
2111             } while (++y < search_area_height);
2112             break;
2113 
2114         case 12:
2115             if (height <= 16) {
2116                 y = 0;
2117                 do {
2118                     __m256i sum256 = _mm256_setzero_si256();
2119 
2120                     s = src;
2121                     r = ref;
2122 
2123                     h = height;
2124                     while (h >= 2) {
2125                         sad_loop_kernel_12_avx2(s, src_stride, r, ref_stride, &sum256);
2126                         s += 2 * src_stride;
2127                         r += 2 * ref_stride;
2128                         h -= 2;
2129                     };
2130 
2131                     if (h) {
2132                         sad_loop_kernel_12_oneline_avx2(s, r, &sum256);
2133                     }
2134 
2135                     update_small_pel(sum256, 0, y, &best_s, &best_x, &best_y);
2136                     ref += src_stride_raw;
2137                 } while (++y < search_area_height);
2138             } else if (height <= 32) {
2139                 y = 0;
2140                 do {
2141                     __m256i sum256 = _mm256_setzero_si256();
2142 
2143                     s = src;
2144                     r = ref;
2145 
2146                     h = height;
2147                     while (h >= 2) {
2148                         sad_loop_kernel_12_avx2(s, src_stride, r, ref_stride, &sum256);
2149                         s += 2 * src_stride;
2150                         r += 2 * ref_stride;
2151                         h -= 2;
2152                     };
2153 
2154                     if (h) {
2155                         sad_loop_kernel_12_oneline_avx2(s, r, &sum256);
2156                     }
2157 
2158                     update_some_pel(sum256, 0, y, &best_s, &best_x, &best_y);
2159                     ref += src_stride_raw;
2160                 } while (++y < search_area_height);
2161             } else {
2162                 y = 0;
2163                 do {
2164                     __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
2165 
2166                     s = src;
2167                     r = ref;
2168 
2169                     h = height;
2170                     while (h >= 2) {
2171                         sad_loop_kernel_12_2sum_avx2(s, src_stride, r, ref_stride, sums256);
2172                         s += 2 * src_stride;
2173                         r += 2 * ref_stride;
2174                         h -= 2;
2175                     };
2176 
2177                     if (h) {
2178                         sad_loop_kernel_12_2sum_oneline_avx2(s, r, sums256);
2179                     }
2180 
2181                     update_leftover8_1024_pel(sums256, 0, y, &best_s, &best_x, &best_y);
2182                     ref += src_stride_raw;
2183                 } while (++y < search_area_height);
2184             }
2185             break;
2186 
2187         case 16:
2188             if (height <= 16) {
2189                 y = 0;
2190                 do {
2191                     __m256i sum256 = _mm256_setzero_si256();
2192 
2193                     s = src;
2194                     r = ref;
2195 
2196                     h = height;
2197                     while (h >= 2) {
2198                         sad_loop_kernel_16_avx2(s, src_stride, r, ref_stride, &sum256);
2199                         s += 2 * src_stride;
2200                         r += 2 * ref_stride;
2201                         h -= 2;
2202                     };
2203 
2204                     if (h) {
2205                         sad_loop_kernel_16_oneline_avx2(s, r, &sum256);
2206                     };
2207 
2208                     update_small_pel(sum256, 0, y, &best_s, &best_x, &best_y);
2209                     ref += src_stride_raw;
2210                 } while (++y < search_area_height);
2211             } else if (height <= 32) {
2212                 y = 0;
2213                 do {
2214                     __m256i sum256 = _mm256_setzero_si256();
2215 
2216                     s = src;
2217                     r = ref;
2218 
2219                     h = height;
2220                     while (h >= 2) {
2221                         sad_loop_kernel_16_avx2(s, src_stride, r, ref_stride, &sum256);
2222                         s += 2 * src_stride;
2223                         r += 2 * ref_stride;
2224                         h -= 2;
2225                     };
2226 
2227                     if (h) {
2228                         sad_loop_kernel_16_oneline_avx2(s, r, &sum256);
2229                     };
2230 
2231                     update_some_pel(sum256, 0, y, &best_s, &best_x, &best_y);
2232                     ref += src_stride_raw;
2233                 } while (++y < search_area_height);
2234             } else {
2235                 y = 0;
2236                 do {
2237                     __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
2238 
2239                     s = src;
2240                     r = ref;
2241 
2242                     h = height;
2243                     while (h >= 2) {
2244                         sad_loop_kernel_16_2sum_avx2(s, src_stride, r, ref_stride, sums256);
2245                         s += 2 * src_stride;
2246                         r += 2 * ref_stride;
2247                         h -= 2;
2248                     };
2249 
2250                     if (h) {
2251                         sad_loop_kernel_16_2sum_oneline_avx2(s, r, sums256);
2252                     };
2253 
2254                     update_leftover8_1024_pel(sums256, 0, y, &best_s, &best_x, &best_y);
2255                     ref += src_stride_raw;
2256                 } while (++y < search_area_height);
2257             }
2258             break;
2259 
2260         case 24:
2261             if (height <= 16) {
2262                 y = 0;
2263                 do {
2264                     __m256i sum256 = _mm256_setzero_si256();
2265 
2266                     s = src;
2267                     r = ref;
2268 
2269                     h = height;
2270                     while (h >= 2) {
2271                         sad_loop_kernel_16_avx2(s, src_stride, r, ref_stride, &sum256);
2272                         sad_loop_kernel_8_avx2(s + 16, src_stride, r + 16, ref_stride, &sum256);
2273                         s += 2 * src_stride;
2274                         r += 2 * ref_stride;
2275                         h -= 2;
2276                     };
2277 
2278                     if (h) {
2279                         sad_loop_kernel_16_oneline_avx2(s, r, &sum256);
2280                         sad_loop_kernel_8_oneline_avx2(s + 16, r + 16, &sum256);
2281                     }
2282 
2283                     update_some_pel(sum256, 0, y, &best_s, &best_x, &best_y);
2284                     ref += src_stride_raw;
2285                 } while (++y < search_area_height);
2286             } else {
2287                 y = 0;
2288                 do {
2289                     __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
2290 
2291                     s = src;
2292                     r = ref;
2293 
2294                     h = height;
2295                     while (h >= 2) {
2296                         sad_loop_kernel_16_avx2(s, src_stride, r, ref_stride, &sums256[0]);
2297                         sad_loop_kernel_8_avx2(s + 16, src_stride, r + 16, ref_stride, &sums256[1]);
2298                         s += 2 * src_stride;
2299                         r += 2 * ref_stride;
2300                         h -= 2;
2301                     };
2302 
2303                     if (h) {
2304                         sad_loop_kernel_16_oneline_avx2(s, r, &sums256[0]);
2305                         sad_loop_kernel_8_oneline_avx2(s + 16, r + 16, &sums256[1]);
2306                     }
2307 
2308                     update_leftover8_1024_pel(sums256, 0, y, &best_s, &best_x, &best_y);
2309                     ref += src_stride_raw;
2310                 } while (++y < search_area_height);
2311             }
2312             break;
2313 
2314         case 32:
2315             if (height <= 8) {
2316                 y = 0;
2317                 do {
2318                     __m256i sum256 = _mm256_setzero_si256();
2319 
2320                     s = src;
2321                     r = ref;
2322 
2323                     h = height;
2324                     do {
2325                         sad_loop_kernel_32_avx2(s, r, &sum256);
2326                         s += src_stride;
2327                         r += ref_stride;
2328                     } while (--h);
2329 
2330                     update_small_pel(sum256, 0, y, &best_s, &best_x, &best_y);
2331                     ref += src_stride_raw;
2332                 } while (++y < search_area_height);
2333             } else if (height <= 16) {
2334                 y = 0;
2335                 do {
2336                     __m256i sum256 = _mm256_setzero_si256();
2337 
2338                     s = src;
2339                     r = ref;
2340 
2341                     h = height;
2342                     do {
2343                         sad_loop_kernel_32_avx2(s, r, &sum256);
2344                         s += src_stride;
2345                         r += ref_stride;
2346                     } while (--h);
2347 
2348                     update_some_pel(sum256, 0, y, &best_s, &best_x, &best_y);
2349                     ref += src_stride_raw;
2350                 } while (++y < search_area_height);
2351             } else if (height <= 32) {
2352                 y = 0;
2353                 do {
2354                     __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
2355 
2356                     s = src;
2357                     r = ref;
2358 
2359                     h = height;
2360                     do {
2361                         sad_loop_kernel_32_2sum_avx2(s, r, sums256);
2362                         s += src_stride;
2363                         r += ref_stride;
2364                     } while (--h);
2365 
2366                     update_leftover8_1024_pel(sums256, 0, y, &best_s, &best_x, &best_y);
2367                     ref += src_stride_raw;
2368                 } while (++y < search_area_height);
2369             } else {
2370                 y = 0;
2371                 do {
2372                     __m256i sums256[4] = {_mm256_setzero_si256(),
2373                                           _mm256_setzero_si256(),
2374                                           _mm256_setzero_si256(),
2375                                           _mm256_setzero_si256()};
2376 
2377                     s = src;
2378                     r = ref;
2379 
2380                     h = height;
2381                     do {
2382                         sad_loop_kernel_32_4sum_avx2(s, r, sums256);
2383                         s += src_stride;
2384                         r += ref_stride;
2385                     } while (--h);
2386 
2387                     update_leftover8_2048_pel(sums256, 0, y, &best_s, &best_x, &best_y);
2388                     ref += src_stride_raw;
2389                 } while (++y < search_area_height);
2390             }
2391             break;
2392 
2393         case 48:
2394             if (height <= 32) {
2395                 y = 0;
2396                 do {
2397                     __m256i sums256[3] = {
2398                         _mm256_setzero_si256(), _mm256_setzero_si256(), _mm256_setzero_si256()};
2399 
2400                     s = src;
2401                     r = ref;
2402 
2403                     h = height2;
2404                     do {
2405                         sad_loop_kernel_32_2sum_avx2(s, r, sums256);
2406                         sad_loop_kernel_32_2sum_avx2(s + src_stride, r + ref_stride, sums256);
2407                         sad_loop_kernel_16_avx2(
2408                             s + 32, src_stride, r + 32, ref_stride, &sums256[2]);
2409                         s += 2 * src_stride;
2410                         r += 2 * ref_stride;
2411                     } while (--h);
2412 
2413                     update_leftover8_1536_pel(sums256, 0, y, &best_s, &best_x, &best_y);
2414                     ref += src_stride_raw;
2415                 } while (++y < search_area_height);
2416             } else {
2417                 y = 0;
2418                 do {
2419                     __m256i sums256[6] = {_mm256_setzero_si256(),
2420                                           _mm256_setzero_si256(),
2421                                           _mm256_setzero_si256(),
2422                                           _mm256_setzero_si256(),
2423                                           _mm256_setzero_si256(),
2424                                           _mm256_setzero_si256()};
2425 
2426                     s = src;
2427                     r = ref;
2428 
2429                     h = height2;
2430                     do {
2431                         sad_loop_kernel_32_4sum_avx2(s, r, sums256);
2432                         sad_loop_kernel_32_4sum_avx2(s + src_stride, r + ref_stride, sums256);
2433                         sad_loop_kernel_16_2sum_avx2(
2434                             s + 32, src_stride, r + 32, ref_stride, &sums256[4]);
2435                         s += 2 * src_stride;
2436                         r += 2 * ref_stride;
2437                     } while (--h);
2438 
2439                     const __m256i  sum01   = _mm256_adds_epu16(sums256[0], sums256[1]);
2440                     const __m256i  sum23   = _mm256_adds_epu16(sums256[2], sums256[3]);
2441                     const __m256i  sum45   = _mm256_adds_epu16(sums256[4], sums256[5]);
2442                     const __m256i  sum0123 = _mm256_adds_epu16(sum01, sum23);
2443                     const __m256i  sum256  = _mm256_adds_epu16(sum0123, sum45);
2444                     const __m128i  sum_lo  = _mm256_castsi256_si128(sum256);
2445                     const __m128i  sum_hi  = _mm256_extracti128_si256(sum256, 1);
2446                     const __m128i  sad     = _mm_adds_epu16(sum_lo, sum_hi);
2447                     const __m128i  minpos  = _mm_minpos_epu16(sad);
2448                     const uint32_t min0    = _mm_extract_epi16(minpos, 0);
2449 
2450                     if (min0 < best_s) {
2451                         if (min0 != 0xFFFF) { // no overflow
2452                             best_s = min0;
2453                             best_x = _mm_extract_epi16(minpos, 1);
2454                             best_y = y;
2455                         } else { // overflow
2456                             __m128i sads[2];
2457 
2458                             add16x8x6to32bit(sums256, sads);
2459                             update_8_best(sads, 0, y, &best_s, &best_x, &best_y);
2460                         }
2461                     }
2462 
2463                     ref += src_stride_raw;
2464                 } while (++y < search_area_height);
2465             }
2466             break;
2467 
2468         case 64:
2469             if (height <= 16) {
2470                 y = 0;
2471                 do {
2472                     __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
2473 
2474                     s = src;
2475                     r = ref;
2476 
2477                     h = height;
2478                     do {
2479                         sad_loop_kernel_64_2sum_avx2(s, r, sums256);
2480                         s += src_stride;
2481                         r += ref_stride;
2482                     } while (--h);
2483 
2484                     update_leftover8_1024_pel(sums256, 0, y, &best_s, &best_x, &best_y);
2485                     ref += src_stride_raw;
2486                 } while (++y < search_area_height);
2487             } else if (height <= 32) {
2488                 y = 0;
2489                 do {
2490                     __m256i sums256[4] = {_mm256_setzero_si256(),
2491                                           _mm256_setzero_si256(),
2492                                           _mm256_setzero_si256(),
2493                                           _mm256_setzero_si256()};
2494 
2495                     s = src;
2496                     r = ref;
2497 
2498                     h = height;
2499                     do {
2500                         sad_loop_kernel_64_4sum_avx2(s, r, sums256);
2501                         s += src_stride;
2502                         r += ref_stride;
2503                     } while (--h);
2504 
2505                     update_leftover8_2048_pel(sums256, 0, y, &best_s, &best_x, &best_y);
2506                     ref += src_stride_raw;
2507                 } while (++y < search_area_height);
2508             } else {
2509                 y = 0;
2510                 do {
2511                     __m256i sums256[8] = {_mm256_setzero_si256(),
2512                                           _mm256_setzero_si256(),
2513                                           _mm256_setzero_si256(),
2514                                           _mm256_setzero_si256(),
2515                                           _mm256_setzero_si256(),
2516                                           _mm256_setzero_si256(),
2517                                           _mm256_setzero_si256(),
2518                                           _mm256_setzero_si256()};
2519 
2520                     s = src;
2521                     r = ref;
2522 
2523                     h = height;
2524                     do {
2525                         sad_loop_kernel_64_8sum_avx2(s, r, sums256);
2526                         s += src_stride;
2527                         r += ref_stride;
2528                     } while (--h);
2529 
2530                     const __m256i  sum01   = _mm256_adds_epu16(sums256[0], sums256[1]);
2531                     const __m256i  sum23   = _mm256_adds_epu16(sums256[2], sums256[3]);
2532                     const __m256i  sum45   = _mm256_adds_epu16(sums256[4], sums256[5]);
2533                     const __m256i  sum67   = _mm256_adds_epu16(sums256[6], sums256[7]);
2534                     const __m256i  sum0123 = _mm256_adds_epu16(sum01, sum23);
2535                     const __m256i  sum4567 = _mm256_adds_epu16(sum45, sum67);
2536                     const __m256i  sum256  = _mm256_adds_epu16(sum0123, sum4567);
2537                     const __m128i  sum_lo  = _mm256_castsi256_si128(sum256);
2538                     const __m128i  sum_hi  = _mm256_extracti128_si256(sum256, 1);
2539                     const __m128i  sad     = _mm_adds_epu16(sum_lo, sum_hi);
2540                     const __m128i  minpos  = _mm_minpos_epu16(sad);
2541                     const uint32_t min0    = _mm_extract_epi16(minpos, 0);
2542 
2543                     if (min0 < best_s) {
2544                         if (min0 != 0xFFFF) { // no overflow
2545                             best_s = min0;
2546                             best_x = _mm_extract_epi16(minpos, 1);
2547                             best_y = y;
2548                         } else { // overflow
2549                             __m128i sads[2];
2550 
2551                             add16x8x8to32bit(sums256, sads);
2552                             update_8_best(sads, 0, y, &best_s, &best_x, &best_y);
2553                         }
2554                     }
2555 
2556                     ref += src_stride_raw;
2557                 } while (++y < search_area_height);
2558             }
2559             break;
2560 
2561         default:
2562             sad_loop_kernel_generalized_avx512(src,
2563                                                src_stride,
2564                                                ref,
2565                                                ref_stride,
2566                                                height,
2567                                                width,
2568                                                best_sad,
2569                                                x_search_center,
2570                                                y_search_center,
2571                                                src_stride_raw,
2572                                                search_area_width,
2573                                                search_area_height);
2574             return;
2575         }
2576     } else if (search_area_width == 16) {
2577         switch (width) {
2578         case 4:
2579             if (height <= 4) {
2580                 y = 0;
2581                 do {
2582                     {
2583                         __m128i sum = _mm_setzero_si128();
2584 
2585                         s = src;
2586                         r = ref;
2587 
2588                         h = height;
2589                         while (h >= 2) {
2590                             sad_loop_kernel_4_sse4_1(s, src_stride, r, ref_stride, &sum);
2591                             s += 2 * src_stride;
2592                             r += 2 * ref_stride;
2593                             h -= 2;
2594                         };
2595 
2596                         if (h) {
2597                             sad_loop_kernel_4_oneline_sse4_1(s, r, &sum);
2598                         }
2599 
2600                         update_best(sum, 0, y, &best_s, &best_x, &best_y);
2601                     }
2602 
2603                     {
2604                         __m128i sum = _mm_setzero_si128();
2605 
2606                         s = src;
2607                         r = ref + 8;
2608 
2609                         h = height;
2610                         while (h >= 2) {
2611                             sad_loop_kernel_4_sse4_1(s, src_stride, r, ref_stride, &sum);
2612                             s += 2 * src_stride;
2613                             r += 2 * ref_stride;
2614                             h -= 2;
2615                         };
2616 
2617                         if (h) {
2618                             sad_loop_kernel_4_oneline_sse4_1(s, r, &sum);
2619                         }
2620 
2621                         update_best(sum, 8, y, &best_s, &best_x, &best_y);
2622                     }
2623 
2624                     ref += src_stride_raw;
2625                 } while (++y < search_area_height);
2626             } else {
2627                 y = 0;
2628                 do {
2629                     {
2630                         __m256i sum256 = _mm256_setzero_si256();
2631 
2632                         s = src;
2633                         r = ref;
2634 
2635                         h = height;
2636                         while (h >= 2) {
2637                             sad_loop_kernel_4_avx2(s, src_stride, r, ref_stride, &sum256);
2638                             s += 2 * src_stride;
2639                             r += 2 * ref_stride;
2640                             h -= 2;
2641                         };
2642 
2643                         if (h) {
2644                             sad_loop_kernel_4_oneline_avx2(s, r, &sum256);
2645                         }
2646 
2647                         update_small_pel(sum256, 0, y, &best_s, &best_x, &best_y);
2648                     }
2649 
2650                     {
2651                         __m256i sum256 = _mm256_setzero_si256();
2652 
2653                         s = src;
2654                         r = ref + 8;
2655 
2656                         h = height;
2657                         while (h >= 2) {
2658                             sad_loop_kernel_4_avx2(s, src_stride, r, ref_stride, &sum256);
2659                             s += 2 * src_stride;
2660                             r += 2 * ref_stride;
2661                             h -= 2;
2662                         };
2663 
2664                         if (h) {
2665                             sad_loop_kernel_4_oneline_avx2(s, r, &sum256);
2666                         }
2667 
2668                         update_small_pel(sum256, 8, y, &best_s, &best_x, &best_y);
2669                     }
2670 
2671                     ref += src_stride_raw;
2672                 } while (++y < search_area_height);
2673             }
2674             break;
2675 
2676         case 6:
2677             if (height <= 4) {
2678                 y = 0;
2679                 do {
2680                     {
2681                         __m128i sum = _mm_setzero_si128();
2682 
2683                         s = src;
2684                         r = ref;
2685 
2686                         h = height;
2687                         while (h >= 2) {
2688                             sad_loop_kernel_4_sse4_1(s, src_stride, r, ref_stride, &sum);
2689                             s += 2 * src_stride;
2690                             r += 2 * ref_stride;
2691                             h -= 2;
2692                         };
2693 
2694                         if (h) {
2695                             sad_loop_kernel_4_oneline_sse4_1(s, r, &sum);
2696                         }
2697 
2698                         __m128i sum2 = complement_4_to_6(
2699                             ref, ref_stride, src, src_stride, height, 8);
2700                         sum = _mm_adds_epu16(sum, sum2);
2701 
2702                         update_best(sum, 0, y, &best_s, &best_x, &best_y);
2703                     }
2704 
2705                     {
2706                         __m128i sum = _mm_setzero_si128();
2707 
2708                         s = src;
2709                         r = ref + 8;
2710 
2711                         h = height;
2712                         while (h >= 2) {
2713                             sad_loop_kernel_4_sse4_1(s, src_stride, r, ref_stride, &sum);
2714                             s += 2 * src_stride;
2715                             r += 2 * ref_stride;
2716                             h -= 2;
2717                         };
2718 
2719                         if (h) {
2720                             sad_loop_kernel_4_oneline_sse4_1(s, r, &sum);
2721                         }
2722 
2723                         __m128i sum2 = complement_4_to_6(
2724                             ref + 8, ref_stride, src, src_stride, height, 8);
2725                         sum = _mm_adds_epu16(sum, sum2);
2726 
2727                         update_best(sum, 8, y, &best_s, &best_x, &best_y);
2728                     }
2729 
2730                     ref += src_stride_raw;
2731                 } while (++y < search_area_height);
2732             } else {
2733                 y = 0;
2734                 do {
2735                     {
2736                         __m256i sum256 = _mm256_setzero_si256();
2737 
2738                         s = src;
2739                         r = ref;
2740 
2741                         h = height;
2742                         while (h >= 2) {
2743                             sad_loop_kernel_4_avx2(s, src_stride, r, ref_stride, &sum256);
2744                             s += 2 * src_stride;
2745                             r += 2 * ref_stride;
2746                             h -= 2;
2747                         };
2748 
2749                         if (h) {
2750                             sad_loop_kernel_4_oneline_avx2(s, r, &sum256);
2751                         }
2752 
2753                         __m128i sum = complement_4_to_6(
2754                             ref, ref_stride, src, src_stride, height, 8);
2755                         sum256 = _mm256_adds_epu16(
2756                             sum256, _mm256_insertf128_si256(_mm256_setzero_si256(), sum, 0));
2757 
2758                         update_small_pel(sum256, 0, y, &best_s, &best_x, &best_y);
2759                     }
2760 
2761                     {
2762                         __m256i sum256 = _mm256_setzero_si256();
2763 
2764                         s = src;
2765                         r = ref + 8;
2766 
2767                         h = height;
2768                         while (h >= 2) {
2769                             sad_loop_kernel_4_avx2(s, src_stride, r, ref_stride, &sum256);
2770                             s += 2 * src_stride;
2771                             r += 2 * ref_stride;
2772                             h -= 2;
2773                         };
2774 
2775                         if (h) {
2776                             sad_loop_kernel_4_oneline_avx2(s, r, &sum256);
2777                         }
2778 
2779                         __m128i sum = complement_4_to_6(
2780                             ref + 8, ref_stride, src, src_stride, height, 8);
2781                         sum256 = _mm256_adds_epu16(
2782                             sum256, _mm256_insertf128_si256(_mm256_setzero_si256(), sum, 0));
2783 
2784                         update_small_pel(sum256, 8, y, &best_s, &best_x, &best_y);
2785                     }
2786 
2787                     ref += src_stride_raw;
2788                 } while (++y < search_area_height);
2789             }
2790             break;
2791 
2792         case 8:
2793             // Note: Tried _mm512_dbsad_epu8 but is even slower.
2794             y = 0;
2795             do {
2796                 {
2797                     __m256i sum256 = _mm256_setzero_si256();
2798 
2799                     s = src;
2800                     r = ref;
2801 
2802                     h = height;
2803                     while (h >= 2) {
2804                         sad_loop_kernel_8_avx2(s, src_stride, r, ref_stride, &sum256);
2805                         s += 2 * src_stride;
2806                         r += 2 * ref_stride;
2807                         h -= 2;
2808                     };
2809 
2810                     if (h) {
2811                         sad_loop_kernel_8_oneline_avx2(s, r, &sum256);
2812                     };
2813 
2814                     update_small_pel(sum256, 0, y, &best_s, &best_x, &best_y);
2815                 }
2816 
2817                 {
2818                     __m256i sum256 = _mm256_setzero_si256();
2819 
2820                     s = src;
2821                     r = ref + 8;
2822 
2823                     h = height;
2824                     while (h >= 2) {
2825                         sad_loop_kernel_8_avx2(s, src_stride, r, ref_stride, &sum256);
2826                         s += 2 * src_stride;
2827                         r += 2 * ref_stride;
2828                         h -= 2;
2829                     };
2830 
2831                     if (h) {
2832                         sad_loop_kernel_8_oneline_avx2(s, r, &sum256);
2833                     };
2834 
2835                     update_small_pel(sum256, 8, y, &best_s, &best_x, &best_y);
2836                 }
2837 
2838                 ref += src_stride_raw;
2839             } while (++y < search_area_height);
2840             break;
2841 
2842         case 12:
2843             if (height <= 16) {
2844                 y = 0;
2845                 do {
2846                     __m512i sum512 = _mm512_setzero_si512();
2847 
2848                     s = src;
2849                     r = ref;
2850 
2851                     h = height;
2852                     while (h >= 2) {
2853                         sad_loop_kernel_12_avx512(s, src_stride, r, ref_stride, &sum512);
2854                         s += 2 * src_stride;
2855                         r += 2 * ref_stride;
2856                         h -= 2;
2857                     };
2858 
2859                     if (h) {
2860                         sad_loop_kernel_12_oneline_avx512(s, r, &sum512);
2861                     }
2862 
2863                     update_256_pel(sum512, 0, y, &best_s, &best_x, &best_y);
2864                     ref += src_stride_raw;
2865                 } while (++y < search_area_height);
2866             } else if (height <= 32) {
2867                 y = 0;
2868                 do {
2869                     __m512i sum512 = _mm512_setzero_si512();
2870 
2871                     s = src;
2872                     r = ref;
2873 
2874                     h = height;
2875                     while (h >= 2) {
2876                         sad_loop_kernel_12_avx512(s, src_stride, r, ref_stride, &sum512);
2877                         s += 2 * src_stride;
2878                         r += 2 * ref_stride;
2879                         h -= 2;
2880                     };
2881 
2882                     if (h) {
2883                         sad_loop_kernel_12_oneline_avx512(s, r, &sum512);
2884                     }
2885 
2886                     update_512_pel(sum512, 0, y, &best_s, &best_x, &best_y);
2887                     ref += src_stride_raw;
2888                 } while (++y < search_area_height);
2889             } else {
2890                 y = 0;
2891                 do {
2892                     __m512i sums512[2] = {_mm512_setzero_si512(), _mm512_setzero_si512()};
2893 
2894                     s = src;
2895                     r = ref;
2896 
2897                     h = height;
2898                     while (h >= 2) {
2899                         sad_loop_kernel_12_2sum_avx512(s, src_stride, r, ref_stride, sums512);
2900                         s += 2 * src_stride;
2901                         r += 2 * ref_stride;
2902                         h -= 2;
2903                     };
2904 
2905                     if (h) {
2906                         sad_loop_kernel_12_2sum_oneline_avx512(s, r, sums512);
2907                     }
2908 
2909                     update_1024_pel(sums512, 0, y, &best_s, &best_x, &best_y);
2910                     ref += src_stride_raw;
2911                 } while (++y < search_area_height);
2912             }
2913             break;
2914 
2915         case 16:
2916             if (height <= 16) {
2917                 y = 0;
2918                 do {
2919                     __m512i sum512 = _mm512_setzero_si512();
2920 
2921                     s = src;
2922                     r = ref;
2923 
2924                     h = height;
2925                     while (h >= 2) {
2926                         sad_loop_kernel_16_avx512(s, src_stride, r, ref_stride, &sum512);
2927                         s += 2 * src_stride;
2928                         r += 2 * ref_stride;
2929                         h -= 2;
2930                     };
2931 
2932                     if (h) {
2933                         sad_loop_kernel_16_oneline_avx512(s, r, &sum512);
2934                     };
2935 
2936                     update_256_pel(sum512, 0, y, &best_s, &best_x, &best_y);
2937                     ref += src_stride_raw;
2938                 } while (++y < search_area_height);
2939             } else if (height <= 32) {
2940                 y = 0;
2941                 do {
2942                     __m512i sum512 = _mm512_setzero_si512();
2943 
2944                     s = src;
2945                     r = ref;
2946 
2947                     h = height;
2948                     while (h >= 2) {
2949                         sad_loop_kernel_16_avx512(s, src_stride, r, ref_stride, &sum512);
2950                         s += 2 * src_stride;
2951                         r += 2 * ref_stride;
2952                         h -= 2;
2953                     };
2954 
2955                     if (h) {
2956                         sad_loop_kernel_16_oneline_avx512(s, r, &sum512);
2957                     };
2958 
2959                     update_512_pel(sum512, 0, y, &best_s, &best_x, &best_y);
2960                     ref += src_stride_raw;
2961                 } while (++y < search_area_height);
2962             } else {
2963                 y = 0;
2964                 do {
2965                     __m512i sums512[2] = {_mm512_setzero_si512(), _mm512_setzero_si512()};
2966 
2967                     s = src;
2968                     r = ref;
2969 
2970                     h = height;
2971                     while (h >= 2) {
2972                         sad_loop_kernel_16_2sum_avx512(s, src_stride, r, ref_stride, sums512);
2973                         s += 2 * src_stride;
2974                         r += 2 * ref_stride;
2975                         h -= 2;
2976                     };
2977 
2978                     if (h) {
2979                         sad_loop_kernel_16_2sum_oneline_avx512(s, r, sums512);
2980                     };
2981 
2982                     update_1024_pel(sums512, 0, y, &best_s, &best_x, &best_y);
2983                     ref += src_stride_raw;
2984                 } while (++y < search_area_height);
2985             }
2986             break;
2987 
2988         case 24:
2989             if (height <= 16) {
2990                 y = 0;
2991                 do {
2992                     __m512i sum512     = _mm512_setzero_si512();
2993                     __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
2994 
2995                     s = src;
2996                     r = ref;
2997 
2998                     h = height;
2999                     while (h >= 2) {
3000                         sad_loop_kernel_16_avx512(s, src_stride, r, ref_stride, &sum512);
3001                         sad_loop_kernel_8_avx2(s + 16, src_stride, r + 16, ref_stride, &sums256[0]);
3002                         sad_loop_kernel_8_avx2(s + 16, src_stride, r + 24, ref_stride, &sums256[1]);
3003                         s += 2 * src_stride;
3004                         r += 2 * ref_stride;
3005                         h -= 2;
3006                     };
3007 
3008                     if (h) {
3009                         sad_loop_kernel_16_oneline_avx512(s, r, &sum512);
3010                         sad_loop_kernel_8_oneline_avx2(s + 16, r + 16, &sums256[0]);
3011                         sad_loop_kernel_8_oneline_avx2(s + 16, r + 24, &sums256[1]);
3012                     }
3013 
3014                     update_384_pel(sum512, sums256, 0, y, &best_s, &best_x, &best_y);
3015                     ref += src_stride_raw;
3016                 } while (++y < search_area_height);
3017             } else {
3018                 y = 0;
3019                 do {
3020                     __m512i sum512     = _mm512_setzero_si512();
3021                     __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
3022 
3023                     s = src;
3024                     r = ref;
3025 
3026                     h = height;
3027                     while (h >= 2) {
3028                         sad_loop_kernel_16_avx512(s, src_stride, r, ref_stride, &sum512);
3029                         sad_loop_kernel_8_avx2(s + 16, src_stride, r + 16, ref_stride, &sums256[0]);
3030                         sad_loop_kernel_8_avx2(s + 16, src_stride, r + 24, ref_stride, &sums256[1]);
3031                         s += 2 * src_stride;
3032                         r += 2 * ref_stride;
3033                         h -= 2;
3034                     };
3035 
3036                     if (h) {
3037                         sad_loop_kernel_16_oneline_avx512(s, r, &sum512);
3038                         sad_loop_kernel_8_oneline_avx2(s + 16, r + 16, &sums256[0]);
3039                         sad_loop_kernel_8_oneline_avx2(s + 16, r + 24, &sums256[1]);
3040                     }
3041 
3042                     update_768_pel(sum512, sums256, 0, y, &best_s, &best_x, &best_y);
3043                     ref += src_stride_raw;
3044                 } while (++y < search_area_height);
3045             }
3046             break;
3047 
        // Width 32: one full zmm row per kernel call; the number of 16-bit
        // accumulator registers (1, 2 or 4) grows with the block area to
        // delay saturation before the update_*_pel() reduction.
        case 32:
            if (height <= 16) {
                y = 0;
                do {
                    __m512i sum512 = _mm512_setzero_si512();

                    s = src;
                    r = ref;

                    // Note: faster than looping 2 rows.
                    h = height;
                    do {
                        sad_loop_kernel_32_avx512(s, r, &sum512);
                        s += src_stride;
                        r += ref_stride;
                    } while (--h);

                    update_512_pel(sum512, 0, y, &best_s, &best_x, &best_y);
                    ref += src_stride_raw;
                } while (++y < search_area_height);
            } else if (height <= 32) {
                y = 0;
                do {
                    // Two accumulators: up to 32x32 = 1024 pels per position.
                    __m512i sums512[2] = {_mm512_setzero_si512(), _mm512_setzero_si512()};

                    s = src;
                    r = ref;

                    h = height;
                    do {
                        sad_loop_kernel_32_2sum_avx512(s, r, sums512);
                        s += src_stride;
                        r += ref_stride;
                    } while (--h);

                    update_1024_pel(sums512, 0, y, &best_s, &best_x, &best_y);
                    ref += src_stride_raw;
                } while (++y < search_area_height);
            } else {
                y = 0;
                do {
                    // Four accumulators for the tallest blocks.
                    __m512i sums512[4] = {_mm512_setzero_si512(),
                                          _mm512_setzero_si512(),
                                          _mm512_setzero_si512(),
                                          _mm512_setzero_si512()};

                    s = src;
                    r = ref;

                    h = height;
                    do {
                        sad_loop_kernel_32_4sum_avx512(s, r, sums512);
                        s += src_stride;
                        r += ref_stride;
                    } while (--h);

                    update_2048_pel(sums512, 0, y, &best_s, &best_x, &best_y);
                    ref += src_stride_raw;
                } while (++y < search_area_height);
            }
            break;
3109 
        // Width 48 = 32 + 16: the 32-column kernels and a 16-column kernel
        // accumulate into separate registers; each loop iteration consumes
        // two rows.
        case 48:
            if (height <= 32) {
                y = 0;
                do {
                    // sums512[0..1]: 16-bit SADs of the first 32 columns,
                    // sums512[2]:    16-bit SADs of the last 16 columns.
                    __m512i sums512[3] = {
                        _mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512()};

                    s = src;
                    r = ref;

                    // height2 is presumably height / 2 (set before this view):
                    // two rows are processed per iteration.
                    h = height2;
                    do {
                        sad_loop_kernel_32_2sum_avx512(s, r, sums512);
                        sad_loop_kernel_32_2sum_avx512(s + src_stride, r + ref_stride, sums512);
                        sad_loop_kernel_16_avx512(
                            s + 32, src_stride, r + 32, ref_stride, &sums512[2]);
                        s += 2 * src_stride;
                        r += 2 * ref_stride;
                    } while (--h);

                    update_1536_pel(sums512, 0, y, &best_s, &best_x, &best_y);
                    ref += src_stride_raw;
                } while (++y < search_area_height);
            } else {
                y = 0;
                do {
                    // sums512[0..3]: first 32 columns, sums512[4..5]: last 16.
                    __m512i sums512[6] = {_mm512_setzero_si512(),
                                          _mm512_setzero_si512(),
                                          _mm512_setzero_si512(),
                                          _mm512_setzero_si512(),
                                          _mm512_setzero_si512(),
                                          _mm512_setzero_si512()};

                    s = src;
                    r = ref;

                    h = height2;
                    do {
                        sad_loop_kernel_32_4sum_avx512(s, r, sums512);
                        sad_loop_kernel_32_4sum_avx512(s + src_stride, r + ref_stride, sums512);
                        sad_loop_kernel_16_2sum_avx512(
                            s + 32, src_stride, r + 32, ref_stride, &sums512[4]);
                        s += 2 * src_stride;
                        r += 2 * ref_stride;
                    } while (--h);

                    // Reduce the six accumulators with saturating 16-bit adds,
                    // then fold 32 lanes down to two groups of 8 positions.
                    const __m512i  sum512_01   = _mm512_adds_epu16(sums512[0], sums512[1]);
                    const __m512i  sum512_23   = _mm512_adds_epu16(sums512[2], sums512[3]);
                    const __m512i  sum512_45   = _mm512_adds_epu16(sums512[4], sums512[5]);
                    const __m512i  sum512_0123 = _mm512_adds_epu16(sum512_01, sum512_23);
                    const __m512i  sum512      = _mm512_adds_epu16(sum512_0123, sum512_45);
                    const __m256i  sum_lo      = _mm512_castsi512_si256(sum512);
                    const __m256i  sum_hi      = _mm512_extracti64x4_epi64(sum512, 1);
                    const __m256i  sad         = _mm256_adds_epu16(sum_lo, sum_hi);
                    const __m128i  sad_lo      = _mm256_castsi256_si128(sad);
                    const __m128i  sad_hi      = _mm256_extracti128_si256(sad, 1);
                    // minpos: lane 0 = minimum 16-bit SAD, lane 1 = its index.
                    const __m128i  minpos_lo   = _mm_minpos_epu16(sad_lo);
                    const __m128i  minpos_hi   = _mm_minpos_epu16(sad_hi);
                    const uint32_t min0        = _mm_extract_epi16(minpos_lo, 0);
                    const uint32_t min1        = _mm_extract_epi16(minpos_hi, 0);
                    uint32_t       minmin, delta;
                    __m128i        minpos;

                    if (min0 <= min1) {
                        minmin = min0;
                        delta  = 0;
                        minpos = minpos_lo;
                    } else {
                        minmin = min1;
                        delta  = 8;
                        minpos = minpos_hi;
                    }

                    if (minmin < best_s) {
                        if (minmin != 0xFFFF) { // no overflow
                            best_s = minmin;
                            best_x = delta + _mm_extract_epi16(minpos, 1);
                            best_y = y;
                        } else { // overflow
                            // 0xFFFF may be a saturated value: redo the
                            // reduction in 32 bits and rescan all 16 positions.
                            __m256i sads256[2];
                            __m128i sads128[2];

                            add16x16x6to32bit(sums512, sads256);

                            sads128[0] = _mm256_castsi256_si128(sads256[0]);
                            sads128[1] = _mm256_extracti128_si256(sads256[0], 1);
                            update_8_best(sads128, 0, y, &best_s, &best_x, &best_y);

                            sads128[0] = _mm256_castsi256_si128(sads256[1]);
                            sads128[1] = _mm256_extracti128_si256(sads256[1], 1);
                            update_8_best(sads128, 8, y, &best_s, &best_x, &best_y);
                        }
                    }

                    ref += src_stride_raw;
                } while (++y < search_area_height);
            }
            break;
3208 
        // Width 64 = 2 x 32: two 32-column kernel calls per row. Accumulator
        // count (2, 4 or 8 zmm) grows with the block area to delay saturation
        // of the 16-bit sums.
        case 64:
            if (height <= 16) {
                y = 0;
                do {
                    __m512i sums512[2] = {_mm512_setzero_si512(), _mm512_setzero_si512()};

                    s = src;
                    r = ref;

                    h = height;
                    do {
                        sad_loop_kernel_32_2sum_avx512(s + 0 * 32, r + 0 * 32, sums512);
                        sad_loop_kernel_32_2sum_avx512(s + 1 * 32, r + 1 * 32, sums512);
                        s += src_stride;
                        r += ref_stride;
                    } while (--h);

                    update_1024_pel(sums512, 0, y, &best_s, &best_x, &best_y);
                    ref += src_stride_raw;
                } while (++y < search_area_height);
            } else if (height <= 32) {
                y = 0;
                do {
                    __m512i sums512[4] = {_mm512_setzero_si512(),
                                          _mm512_setzero_si512(),
                                          _mm512_setzero_si512(),
                                          _mm512_setzero_si512()};

                    s = src;
                    r = ref;

                    h = height;
                    do {
                        sad_loop_kernel_32_4sum_avx512(s + 0 * 32, r + 0 * 32, sums512);
                        sad_loop_kernel_32_4sum_avx512(s + 1 * 32, r + 1 * 32, sums512);
                        s += src_stride;
                        r += ref_stride;
                    } while (--h);

                    update_2048_pel(sums512, 0, y, &best_s, &best_x, &best_y);
                    ref += src_stride_raw;
                } while (++y < search_area_height);
            } else {
                y = 0;
                do {
                    // sums512[0..3]: left 32 columns, sums512[4..7]: right 32.
                    __m512i sums512[8] = {_mm512_setzero_si512(),
                                          _mm512_setzero_si512(),
                                          _mm512_setzero_si512(),
                                          _mm512_setzero_si512(),
                                          _mm512_setzero_si512(),
                                          _mm512_setzero_si512(),
                                          _mm512_setzero_si512(),
                                          _mm512_setzero_si512()};

                    s = src;
                    r = ref;

                    h = height;
                    do {
                        sad_loop_kernel_32_4sum_avx512(s + 0 * 32, r + 0 * 32, sums512 + 0);
                        sad_loop_kernel_32_4sum_avx512(s + 1 * 32, r + 1 * 32, sums512 + 4);
                        s += src_stride;
                        r += ref_stride;
                    } while (--h);

                    // Reduce the eight accumulators with saturating 16-bit
                    // adds, then fold 32 lanes down to two groups of 8.
                    const __m512i  sum512_01   = _mm512_adds_epu16(sums512[0], sums512[1]);
                    const __m512i  sum512_23   = _mm512_adds_epu16(sums512[2], sums512[3]);
                    const __m512i  sum512_45   = _mm512_adds_epu16(sums512[4], sums512[5]);
                    const __m512i  sum512_67   = _mm512_adds_epu16(sums512[6], sums512[7]);
                    const __m512i  sum512_0123 = _mm512_adds_epu16(sum512_01, sum512_23);
                    const __m512i  sum512_4567 = _mm512_adds_epu16(sum512_45, sum512_67);
                    const __m512i  sum512      = _mm512_adds_epu16(sum512_0123, sum512_4567);
                    const __m256i  sum_lo      = _mm512_castsi512_si256(sum512);
                    const __m256i  sum_hi      = _mm512_extracti64x4_epi64(sum512, 1);
                    const __m256i  sad         = _mm256_adds_epu16(sum_lo, sum_hi);
                    const __m128i  sad_lo      = _mm256_castsi256_si128(sad);
                    const __m128i  sad_hi      = _mm256_extracti128_si256(sad, 1);
                    // minpos: lane 0 = minimum 16-bit SAD, lane 1 = its index.
                    const __m128i  minpos_lo   = _mm_minpos_epu16(sad_lo);
                    const __m128i  minpos_hi   = _mm_minpos_epu16(sad_hi);
                    const uint32_t min0        = _mm_extract_epi16(minpos_lo, 0);
                    const uint32_t min1        = _mm_extract_epi16(minpos_hi, 0);
                    uint32_t       minmin, delta;
                    __m128i        minpos;

                    if (min0 <= min1) {
                        minmin = min0;
                        delta  = 0;
                        minpos = minpos_lo;
                    } else {
                        minmin = min1;
                        delta  = 8;
                        minpos = minpos_hi;
                    }

                    if (minmin < best_s) {
                        if (minmin != 0xFFFF) { // no overflow
                            best_s = minmin;
                            best_x = delta + _mm_extract_epi16(minpos, 1);
                            best_y = y;
                        } else { // overflow
                            // 0xFFFF may be a saturated value: redo the
                            // reduction in 32 bits and rescan all 16 positions.
                            __m256i sads256[2];
                            __m128i sads128[2];

                            add16x16x8to32bit(sums512, sads256);

                            sads128[0] = _mm256_castsi256_si128(sads256[0]);
                            sads128[1] = _mm256_extracti128_si256(sads256[0], 1);
                            update_8_best(sads128, 0, y, &best_s, &best_x, &best_y);

                            sads128[0] = _mm256_castsi256_si128(sads256[1]);
                            sads128[1] = _mm256_extracti128_si256(sads256[1], 1);
                            update_8_best(sads128, 8, y, &best_s, &best_x, &best_y);
                        }
                    }

                    ref += src_stride_raw;
                } while (++y < search_area_height);
            }
            break;
3328 
        default:
            // Widths without a dedicated kernel fall back to the generalized
            // AVX-512 implementation, which updates best_sad / x_search_center
            // / y_search_center itself — hence the early return instead of
            // break.
            sad_loop_kernel_generalized_avx512(src,
                                               src_stride,
                                               ref,
                                               ref_stride,
                                               height,
                                               width,
                                               best_sad,
                                               x_search_center,
                                               y_search_center,
                                               src_stride_raw,
                                               search_area_width,
                                               search_area_height);
            return;
3343         }
3344     } else {
        // search_area_width is not a multiple of 8: build a mask whose 16-bit
        // lanes beyond the 'leftover' valid positions are 0xFFFF. OR-ing it
        // into a SAD vector forces the invalid positions to the maximum value
        // so they can never be picked as the best SAD.
        const uint32_t leftover = search_area_width & 7;
        __m128i        mask128;
        __m256i        mask256;

        mask128 = _mm_set1_epi32(-1);
        // Each byte-shift feeds 2 zero bytes in at the bottom: after
        // 'leftover' iterations lanes 0..leftover-1 are 0 (kept) and the
        // remaining lanes are 0xFFFF (masked out).
        for (x = 0; x < (int32_t)leftover; x++) { mask128 = _mm_slli_si128(mask128, 2); }
        mask256 = _mm256_insertf128_si256(_mm256_castsi128_si256(mask128), mask128, 1);
3352 
3353         switch (width) {
3354         case 4:
3355             if (height <= 4) {
3356                 y = 0;
3357                 do {
3358                     for (x = 0; x <= search_area_width - 8; x += 8) {
3359                         __m128i sum = _mm_setzero_si128();
3360 
3361                         s = src;
3362                         r = ref + x;
3363 
3364                         h = height;
3365                         while (h >= 2) {
3366                             sad_loop_kernel_4_sse4_1(s, src_stride, r, ref_stride, &sum);
3367                             s += 2 * src_stride;
3368                             r += 2 * ref_stride;
3369                             h -= 2;
3370                         };
3371 
3372                         if (h) {
3373                             sad_loop_kernel_4_oneline_sse4_1(s, r, &sum);
3374                         }
3375 
3376                         update_best(sum, x, y, &best_s, &best_x, &best_y);
3377                     }
3378 
3379                     if (leftover) {
3380                         __m128i sum = _mm_setzero_si128();
3381 
3382                         s = src;
3383                         r = ref + x;
3384 
3385                         h = height;
3386                         while (h >= 2) {
3387                             sad_loop_kernel_4_sse4_1(s, src_stride, r, ref_stride, &sum);
3388                             s += 2 * src_stride;
3389                             r += 2 * ref_stride;
3390                             h -= 2;
3391                         };
3392 
3393                         if (h) {
3394                             sad_loop_kernel_4_oneline_sse4_1(s, r, &sum);
3395                         }
3396 
3397                         sum = _mm_or_si128(sum, mask128);
3398                         update_best(sum, x, y, &best_s, &best_x, &best_y);
3399                     }
3400 
3401                     ref += src_stride_raw;
3402                 } while (++y < search_area_height);
3403             } else {
3404                 y = 0;
3405                 do {
3406                     for (x = 0; x <= search_area_width - 8; x += 8) {
3407                         __m256i sum256 = _mm256_setzero_si256();
3408 
3409                         s = src;
3410                         r = ref + x;
3411 
3412                         h = height;
3413                         while (h >= 2) {
3414                             sad_loop_kernel_4_avx2(s, src_stride, r, ref_stride, &sum256);
3415                             s += 2 * src_stride;
3416                             r += 2 * ref_stride;
3417                             h -= 2;
3418                         };
3419 
3420                         if (h) {
3421                             sad_loop_kernel_4_oneline_avx2(s, r, &sum256);
3422                         }
3423 
3424                         update_small_pel(sum256, x, y, &best_s, &best_x, &best_y);
3425                     }
3426 
3427                     if (leftover) {
3428                         __m256i sum256 = _mm256_setzero_si256();
3429 
3430                         s = src;
3431                         r = ref + x;
3432 
3433                         h = height;
3434                         while (h >= 2) {
3435                             sad_loop_kernel_4_avx2(s, src_stride, r, ref_stride, &sum256);
3436                             s += 2 * src_stride;
3437                             r += 2 * ref_stride;
3438                             h -= 2;
3439                         };
3440                         if (h) {
3441                             sad_loop_kernel_4_oneline_avx2(s, r, &sum256);
3442                         }
3443 
3444                         update_leftover_small_pel(sum256, x, y, mask128, &best_s, &best_x, &best_y);
3445                     }
3446 
3447                     ref += src_stride_raw;
3448                 } while (++y < search_area_height);
3449             }
3450             break;
3451 
3452         case 6:
3453             if (height <= 4) {
3454                 y = 0;
3455                 do {
3456                     for (x = 0; x <= search_area_width - 8; x += 8) {
3457                         __m128i sum = _mm_setzero_si128();
3458 
3459                         s = src;
3460                         r = ref + x;
3461 
3462                         h = height;
3463                         while (h >= 2) {
3464                             sad_loop_kernel_4_sse4_1(s, src_stride, r, ref_stride, &sum);
3465                             s += 2 * src_stride;
3466                             r += 2 * ref_stride;
3467                             h -= 2;
3468                         };
3469 
3470                         if (h) {
3471                             sad_loop_kernel_4_oneline_sse4_1(s, r, &sum);
3472                         }
3473 
3474                         __m128i sum2 = complement_4_to_6(
3475                             ref + x, ref_stride, src, src_stride, height, 8);
3476                         sum = _mm_adds_epu16(sum, sum2);
3477 
3478                         update_best(sum, x, y, &best_s, &best_x, &best_y);
3479                     }
3480 
3481                     if (leftover) {
3482                         __m128i sum = _mm_setzero_si128();
3483 
3484                         s = src;
3485                         r = ref + x;
3486 
3487                         h = height;
3488                         while (h >= 2) {
3489                             sad_loop_kernel_4_sse4_1(s, src_stride, r, ref_stride, &sum);
3490                             s += 2 * src_stride;
3491                             r += 2 * ref_stride;
3492                             h -= 2;
3493                         };
3494 
3495                         if (h) {
3496                             sad_loop_kernel_4_oneline_sse4_1(s, r, &sum);
3497                         }
3498 
3499                         sum = _mm_or_si128(sum, mask128);
3500 
3501                         __m128i sum2 = complement_4_to_6(
3502                             ref + x, ref_stride, src, src_stride, height, leftover);
3503                         sum = _mm_adds_epu16(sum, sum2);
3504 
3505                         update_best(sum, x, y, &best_s, &best_x, &best_y);
3506                     }
3507 
3508                     ref += src_stride_raw;
3509                 } while (++y < search_area_height);
3510             } else {
3511                 y = 0;
3512                 do {
3513                     for (x = 0; x <= search_area_width - 8; x += 8) {
3514                         __m256i sum256 = _mm256_setzero_si256();
3515 
3516                         s = src;
3517                         r = ref + x;
3518 
3519                         h = height;
3520                         while (h >= 2) {
3521                             sad_loop_kernel_4_avx2(s, src_stride, r, ref_stride, &sum256);
3522                             s += 2 * src_stride;
3523                             r += 2 * ref_stride;
3524                             h -= 2;
3525                         };
3526 
3527                         if (h) {
3528                             sad_loop_kernel_4_oneline_avx2(s, r, &sum256);
3529                         }
3530 
3531                         __m128i sum = complement_4_to_6(
3532                             ref + x, ref_stride, src, src_stride, height, 8);
3533                         sum256 = _mm256_adds_epu16(
3534                             sum256, _mm256_insertf128_si256(_mm256_setzero_si256(), sum, 0));
3535 
3536                         update_small_pel(sum256, x, y, &best_s, &best_x, &best_y);
3537                     }
3538 
3539                     if (leftover) {
3540                         __m256i sum256 = _mm256_setzero_si256();
3541 
3542                         s = src;
3543                         r = ref + x;
3544 
3545                         h = height;
3546                         while (h >= 2) {
3547                             sad_loop_kernel_4_avx2(s, src_stride, r, ref_stride, &sum256);
3548                             s += 2 * src_stride;
3549                             r += 2 * ref_stride;
3550                             h -= 2;
3551                         };
3552 
3553                         if (h) {
3554                             sad_loop_kernel_4_oneline_avx2(s, r, &sum256);
3555                         }
3556 
3557                         __m128i sum = complement_4_to_6(
3558                             ref + x, ref_stride, src, src_stride, height, leftover);
3559                         sum256 = _mm256_adds_epu16(
3560                             sum256, _mm256_insertf128_si256(_mm256_setzero_si256(), sum, 0));
3561 
3562                         update_leftover_small_pel(sum256, x, y, mask128, &best_s, &best_x, &best_y);
3563                     }
3564 
3565                     ref += src_stride_raw;
3566                 } while (++y < search_area_height);
3567             }
3568             break;
3569 
3570         case 8:
3571             // Note: Tried _mm512_dbsad_epu8 but is even slower.
3572             y = 0;
3573             do {
3574                 for (x = 0; x <= search_area_width - 8; x += 8) {
3575                     __m256i sum256 = _mm256_setzero_si256();
3576 
3577                     s = src;
3578                     r = ref + x;
3579 
3580                     h = height;
3581                     while (h >= 2) {
3582                         sad_loop_kernel_8_avx2(s, src_stride, r, ref_stride, &sum256);
3583                         s += 2 * src_stride;
3584                         r += 2 * ref_stride;
3585                         h -= 2;
3586                     };
3587 
3588                     if (h) {
3589                         sad_loop_kernel_8_oneline_avx2(s, r, &sum256);
3590                     };
3591 
3592                     update_small_pel(sum256, x, y, &best_s, &best_x, &best_y);
3593                 }
3594 
3595                 if (leftover) {
3596                     __m256i sum256 = _mm256_setzero_si256();
3597 
3598                     s = src;
3599                     r = ref + x;
3600 
3601                     h = height;
3602                     while (h >= 2) {
3603                         sad_loop_kernel_8_avx2(s, src_stride, r, ref_stride, &sum256);
3604                         s += 2 * src_stride;
3605                         r += 2 * ref_stride;
3606                         h -= 2;
3607                     };
3608 
3609                     if (h) {
3610                         sad_loop_kernel_8_oneline_avx2(s, r, &sum256);
3611                     };
3612 
3613                     update_leftover_small_pel(sum256, x, y, mask128, &best_s, &best_x, &best_y);
3614                 }
3615 
3616                 ref += src_stride_raw;
3617             } while (++y < search_area_height);
3618             break;
3619 
3620         case 12:
3621             if (height <= 16) {
3622                 y = 0;
3623                 do {
3624                     for (x = 0; x <= search_area_width - 16; x += 16) {
3625                         __m512i sum512 = _mm512_setzero_si512();
3626 
3627                         s = src;
3628                         r = ref + x;
3629 
3630                         h = height;
3631                         while (h >= 2) {
3632                             sad_loop_kernel_12_avx512(s, src_stride, r, ref_stride, &sum512);
3633                             s += 2 * src_stride;
3634                             r += 2 * ref_stride;
3635                             h -= 2;
3636                         };
3637 
3638                         if (h) {
3639                             sad_loop_kernel_12_oneline_avx512(s, r, &sum512);
3640                         }
3641 
3642                         update_256_pel(sum512, x, y, &best_s, &best_x, &best_y);
3643                     }
3644 
3645                     // leftover
3646                     for (; x < search_area_width; x += 8) {
3647                         __m256i sum256 = _mm256_setzero_si256();
3648 
3649                         s = src;
3650                         r = ref + x;
3651 
3652                         h = height;
3653                         while (h >= 2) {
3654                             sad_loop_kernel_12_avx2(s, src_stride, r, ref_stride, &sum256);
3655                             s += 2 * src_stride;
3656                             r += 2 * ref_stride;
3657                             h -= 2;
3658                         };
3659 
3660                         if (h) {
3661                             sad_loop_kernel_12_oneline_avx2(s, r, &sum256);
3662                         }
3663 
3664                         update_leftover_256_pel(
3665                             sum256, search_area_width, x, y, mask128, &best_s, &best_x, &best_y);
3666                     }
3667 
3668                     ref += src_stride_raw;
3669                 } while (++y < search_area_height);
3670             } else if (height <= 32) {
3671                 y = 0;
3672                 do {
3673                     for (x = 0; x <= search_area_width - 16; x += 16) {
3674                         __m512i sum512 = _mm512_setzero_si512();
3675 
3676                         s = src;
3677                         r = ref + x;
3678 
3679                         h = height;
3680                         while (h >= 2) {
3681                             sad_loop_kernel_12_avx512(s, src_stride, r, ref_stride, &sum512);
3682                             s += 2 * src_stride;
3683                             r += 2 * ref_stride;
3684                             h -= 2;
3685                         };
3686 
3687                         if (h) {
3688                             sad_loop_kernel_12_oneline_avx512(s, r, &sum512);
3689                         }
3690 
3691                         update_512_pel(sum512, x, y, &best_s, &best_x, &best_y);
3692                     }
3693 
3694                     // leftover
3695                     for (; x < search_area_width; x += 8) {
3696                         __m256i sum256 = _mm256_setzero_si256();
3697 
3698                         s = src;
3699                         r = ref + x;
3700 
3701                         h = height;
3702                         while (h >= 2) {
3703                             sad_loop_kernel_12_avx2(s, src_stride, r, ref_stride, &sum256);
3704                             s += 2 * src_stride;
3705                             r += 2 * ref_stride;
3706                             h -= 2;
3707                         };
3708 
3709                         if (h) {
3710                             sad_loop_kernel_12_oneline_avx2(s, r, &sum256);
3711                         }
3712 
3713                         update_leftover_512_pel(
3714                             sum256, search_area_width, x, y, mask256, &best_s, &best_x, &best_y);
3715                     }
3716 
3717                     ref += src_stride_raw;
3718                 } while (++y < search_area_height);
3719             } else {
3720                 const uint32_t leftover16 = search_area_width & 15;
3721 
3722                 y = 0;
3723                 do {
3724                     for (x = 0; x <= search_area_width - 16; x += 16) {
3725                         __m512i sums512[2] = {_mm512_setzero_si512(), _mm512_setzero_si512()};
3726 
3727                         s = src;
3728                         r = ref + x;
3729 
3730                         h = height;
3731                         while (h >= 2) {
3732                             sad_loop_kernel_12_2sum_avx512(s, src_stride, r, ref_stride, sums512);
3733                             s += 2 * src_stride;
3734                             r += 2 * ref_stride;
3735                             h -= 2;
3736                         };
3737 
3738                         if (h) {
3739                             sad_loop_kernel_12_2sum_oneline_avx512(s, r, sums512);
3740                         }
3741 
3742                         update_1024_pel(sums512, x, y, &best_s, &best_x, &best_y);
3743                     }
3744 
3745                     if (leftover16 >= 8) {
3746                         __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
3747 
3748                         s = src;
3749                         r = ref + x;
3750 
3751                         h = height;
3752                         while (h >= 2) {
3753                             sad_loop_kernel_12_2sum_avx2(s, src_stride, r, ref_stride, sums256);
3754                             s += 2 * src_stride;
3755                             r += 2 * ref_stride;
3756                             h -= 2;
3757                         };
3758 
3759                         if (h) {
3760                             sad_loop_kernel_12_2sum_oneline_avx2(s, r, sums256);
3761                         }
3762 
3763                         update_leftover8_1024_pel(sums256, x, y, &best_s, &best_x, &best_y);
3764                         x += 8;
3765                     }
3766 
3767                     if (leftover) {
3768                         __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
3769 
3770                         s = src;
3771                         r = ref + x;
3772 
3773                         h = height;
3774                         while (h >= 2) {
3775                             sad_loop_kernel_12_2sum_avx2(s, src_stride, r, ref_stride, sums256);
3776                             s += 2 * src_stride;
3777                             r += 2 * ref_stride;
3778                             h -= 2;
3779                         };
3780 
3781                         if (h) {
3782                             sad_loop_kernel_12_2sum_oneline_avx2(s, r, sums256);
3783                         }
3784 
3785                         update_leftover_1024_pel(sums256,
3786                                                  search_area_width,
3787                                                  x,
3788                                                  y,
3789                                                  leftover,
3790                                                  mask128,
3791                                                  &best_s,
3792                                                  &best_x,
3793                                                  &best_y);
3794                     }
3795 
3796                     ref += src_stride_raw;
3797                 } while (++y < search_area_height);
3798             }
3799             break;
3800 
3801         case 16:
3802             if (height <= 16) {
3803                 y = 0;
3804                 do {
3805                     for (x = 0; x <= search_area_width - 16; x += 16) {
3806                         __m512i sum512 = _mm512_setzero_si512();
3807 
3808                         s = src;
3809                         r = ref + x;
3810 
3811                         h = height;
3812                         while (h >= 2) {
3813                             sad_loop_kernel_16_avx512(s, src_stride, r, ref_stride, &sum512);
3814                             s += 2 * src_stride;
3815                             r += 2 * ref_stride;
3816                             h -= 2;
3817                         };
3818 
3819                         if (h) {
3820                             sad_loop_kernel_16_oneline_avx512(s, r, &sum512);
3821                         };
3822 
3823                         update_256_pel(sum512, x, y, &best_s, &best_x, &best_y);
3824                     }
3825 
3826                     // leftover
3827                     for (; x < search_area_width; x += 8) {
3828                         __m256i sum256 = _mm256_setzero_si256();
3829 
3830                         s = src;
3831                         r = ref + x;
3832 
3833                         h = height;
3834                         while (h >= 2) {
3835                             sad_loop_kernel_16_avx2(s, src_stride, r, ref_stride, &sum256);
3836                             s += 2 * src_stride;
3837                             r += 2 * ref_stride;
3838                             h -= 2;
3839                         }
3840 
3841                         if (h) {
3842                             sad_loop_kernel_16_oneline_avx2(s, r, &sum256);
3843                         }
3844 
3845                         update_leftover_256_pel(
3846                             sum256, search_area_width, x, y, mask128, &best_s, &best_x, &best_y);
3847                     }
3848 
3849                     ref += src_stride_raw;
3850                 } while (++y < search_area_height);
3851             } else if (height <= 32) {
3852                 y = 0;
3853                 do {
3854                     for (x = 0; x <= search_area_width - 16; x += 16) {
3855                         __m512i sum512 = _mm512_setzero_si512();
3856 
3857                         s = src;
3858                         r = ref + x;
3859 
3860                         h = height;
3861                         while (h >= 2) {
3862                             sad_loop_kernel_16_avx512(s, src_stride, r, ref_stride, &sum512);
3863                             s += 2 * src_stride;
3864                             r += 2 * ref_stride;
3865                             h -= 2;
3866                         }
3867 
3868                         if (h) {
3869                             sad_loop_kernel_16_oneline_avx512(s, r, &sum512);
3870                         }
3871 
3872                         update_512_pel(sum512, x, y, &best_s, &best_x, &best_y);
3873                     }
3874 
3875                     // leftover
3876                     for (; x < search_area_width; x += 8) {
3877                         __m256i sum256 = _mm256_setzero_si256();
3878 
3879                         s = src;
3880                         r = ref + x;
3881 
3882                         h = height;
3883                         while (h >= 2) {
3884                             sad_loop_kernel_16_avx2(s, src_stride, r, ref_stride, &sum256);
3885                             s += 2 * src_stride;
3886                             r += 2 * ref_stride;
3887                             h -= 2;
3888                         }
3889 
3890                         if (h) {
3891                             sad_loop_kernel_16_oneline_avx2(s, r, &sum256);
3892                         }
3893 
3894                         update_leftover_512_pel(
3895                             sum256, search_area_width, x, y, mask256, &best_s, &best_x, &best_y);
3896                     }
3897 
3898                     ref += src_stride_raw;
3899                 } while (++y < search_area_height);
3900             } else {
3901                 const uint32_t leftover16 = search_area_width & 15;
3902 
3903                 y = 0;
3904                 do {
3905                     for (x = 0; x <= search_area_width - 16; x += 16) {
3906                         __m512i sums512[2] = {_mm512_setzero_si512(), _mm512_setzero_si512()};
3907 
3908                         s = src;
3909                         r = ref + x;
3910 
3911                         h = height;
3912                         while (h >= 2) {
3913                             sad_loop_kernel_16_2sum_avx512(s, src_stride, r, ref_stride, sums512);
3914                             s += 2 * src_stride;
3915                             r += 2 * ref_stride;
3916                             h -= 2;
3917                         }
3918 
3919                         if (h) {
3920                             sad_loop_kernel_16_2sum_oneline_avx512(s, r, sums512);
3921                         }
3922 
3923                         update_1024_pel(sums512, x, y, &best_s, &best_x, &best_y);
3924                     }
3925 
3926                     if (leftover16 >= 8) {
3927                         __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
3928 
3929                         s = src;
3930                         r = ref + x;
3931 
3932                         h = height;
3933                         while (h >= 2) {
3934                             sad_loop_kernel_16_2sum_avx2(s, src_stride, r, ref_stride, sums256);
3935                             s += 2 * src_stride;
3936                             r += 2 * ref_stride;
3937                             h -= 2;
3938                         }
3939 
3940                         if (h) {
3941                             sad_loop_kernel_16_2sum_oneline_avx2(s, r, sums256);
3942                         }
3943 
3944                         update_leftover8_1024_pel(sums256, x, y, &best_s, &best_x, &best_y);
3945                         x += 8;
3946                     }
3947 
3948                     if (leftover) {
3949                         __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
3950 
3951                         s = src;
3952                         r = ref + x;
3953 
3954                         h = height;
3955                         while (h >= 2) {
3956                             sad_loop_kernel_16_2sum_avx2(s, src_stride, r, ref_stride, sums256);
3957                             s += 2 * src_stride;
3958                             r += 2 * ref_stride;
3959                             h -= 2;
3960                         }
3961 
3962                         if (h) {
3963                             sad_loop_kernel_16_2sum_oneline_avx2(s, r, sums256);
3964                         }
3965 
3966                         update_leftover_1024_pel(sums256,
3967                                                  search_area_width,
3968                                                  x,
3969                                                  y,
3970                                                  leftover,
3971                                                  mask128,
3972                                                  &best_s,
3973                                                  &best_x,
3974                                                  &best_y);
3975                     }
3976 
3977                     ref += src_stride_raw;
3978                 } while (++y < search_area_height);
3979             }
3980             break;
3981 
3982         case 24:
3983             if (height <= 16) {
3984                 y = 0;
3985                 do {
3986                     for (x = 0; x <= search_area_width - 16; x += 16) {
3987                         __m512i sum512     = _mm512_setzero_si512();
3988                         __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
3989 
3990                         s = src;
3991                         r = ref + x;
3992 
3993                         h = height;
3994                         while (h >= 2) {
3995                             sad_loop_kernel_16_avx512(s, src_stride, r, ref_stride, &sum512);
3996                             sad_loop_kernel_8_avx2(
3997                                 s + 16, src_stride, r + 16, ref_stride, &sums256[0]);
3998                             sad_loop_kernel_8_avx2(
3999                                 s + 16, src_stride, r + 24, ref_stride, &sums256[1]);
4000                             s += 2 * src_stride;
4001                             r += 2 * ref_stride;
4002                             h -= 2;
4003                         };
4004 
4005                         if (h) {
4006                             sad_loop_kernel_16_oneline_avx512(s, r, &sum512);
4007                             sad_loop_kernel_8_oneline_avx2(s + 16, r + 16, &sums256[0]);
4008                             sad_loop_kernel_8_oneline_avx2(s + 16, r + 24, &sums256[1]);
4009                         }
4010 
4011                         update_384_pel(sum512, sums256, x, y, &best_s, &best_x, &best_y);
4012                     }
4013 
4014                     // leftover
4015                     for (; x < search_area_width; x += 8) {
4016                         __m256i sum256 = _mm256_setzero_si256();
4017 
4018                         s = src;
4019                         r = ref + x;
4020 
4021                         h = height;
4022                         while (h >= 2) {
4023                             sad_loop_kernel_16_avx2(s, src_stride, r, ref_stride, &sum256);
4024                             sad_loop_kernel_8_avx2(s + 16, src_stride, r + 16, ref_stride, &sum256);
4025                             s += 2 * src_stride;
4026                             r += 2 * ref_stride;
4027                             h -= 2;
4028                         };
4029 
4030                         if (h) {
4031                             sad_loop_kernel_16_oneline_avx2(s, r, &sum256);
4032                             sad_loop_kernel_8_oneline_avx2(s + 16, r + 16, &sum256);
4033                         }
4034 
4035                         update_leftover_512_pel(
4036                             sum256, search_area_width, x, y, mask256, &best_s, &best_x, &best_y);
4037                     }
4038 
4039                     ref += src_stride_raw;
4040                 } while (++y < search_area_height);
4041             } else {
4042                 const uint32_t leftover16 = search_area_width & 15;
4043 
4044                 y = 0;
4045                 do {
4046                     for (x = 0; x <= search_area_width - 16; x += 16) {
4047                         __m512i sum512     = _mm512_setzero_si512();
4048                         __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
4049 
4050                         s = src;
4051                         r = ref + x;
4052 
4053                         h = height;
4054                         while (h >= 2) {
4055                             sad_loop_kernel_16_avx512(s, src_stride, r, ref_stride, &sum512);
4056                             sad_loop_kernel_8_avx2(
4057                                 s + 16, src_stride, r + 16, ref_stride, &sums256[0]);
4058                             sad_loop_kernel_8_avx2(
4059                                 s + 16, src_stride, r + 24, ref_stride, &sums256[1]);
4060                             s += 2 * src_stride;
4061                             r += 2 * ref_stride;
4062                             h -= 2;
4063                         };
4064 
4065                         if (h) {
4066                             sad_loop_kernel_16_oneline_avx512(s, r, &sum512);
4067                             sad_loop_kernel_8_oneline_avx2(s + 16, r + 16, &sums256[0]);
4068                             sad_loop_kernel_8_oneline_avx2(s + 16, r + 24, &sums256[1]);
4069                         }
4070 
4071                         update_768_pel(sum512, sums256, x, y, &best_s, &best_x, &best_y);
4072                     }
4073 
4074                     // leftover
4075                     if (leftover16 >= 8) {
4076                         __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
4077 
4078                         s = src;
4079                         r = ref + x;
4080 
4081                         h = height;
4082                         while (h >= 2) {
4083                             sad_loop_kernel_16_avx2(s, src_stride, r, ref_stride, &sums256[0]);
4084                             sad_loop_kernel_8_avx2(
4085                                 s + 16, src_stride, r + 16, ref_stride, &sums256[1]);
4086                             s += 2 * src_stride;
4087                             r += 2 * ref_stride;
4088                             h -= 2;
4089                         };
4090 
4091                         if (h) {
4092                             sad_loop_kernel_16_oneline_avx2(s, r, &sums256[0]);
4093                             sad_loop_kernel_8_oneline_avx2(s + 16, r + 16, &sums256[1]);
4094                         }
4095 
4096                         update_leftover8_1024_pel(sums256, x, y, &best_s, &best_x, &best_y);
4097 
4098                         x += 8;
4099                     }
4100 
4101                     if (leftover) {
4102                         __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
4103 
4104                         s = src;
4105                         r = ref + x;
4106 
4107                         h = height;
4108                         while (h >= 2) {
4109                             sad_loop_kernel_16_avx2(s, src_stride, r, ref_stride, &sums256[0]);
4110                             sad_loop_kernel_8_avx2(
4111                                 s + 16, src_stride, r + 16, ref_stride, &sums256[1]);
4112                             s += 2 * src_stride;
4113                             r += 2 * ref_stride;
4114                             h -= 2;
4115                         };
4116 
4117                         if (h) {
4118                             sad_loop_kernel_16_oneline_avx2(s, r, &sums256[0]);
4119                             sad_loop_kernel_8_oneline_avx2(s + 16, r + 16, &sums256[1]);
4120                         }
4121 
4122                         update_leftover_1024_pel(sums256,
4123                                                  search_area_width,
4124                                                  x,
4125                                                  y,
4126                                                  leftover,
4127                                                  mask128,
4128                                                  &best_s,
4129                                                  &best_x,
4130                                                  &best_y);
4131                     }
4132 
4133                     ref += src_stride_raw;
4134                 } while (++y < search_area_height);
4135             }
4136             break;
4137 
4138         case 32:
4139             if (height <= 16) {
4140                 y = 0;
4141                 do {
4142                     for (x = 0; x <= search_area_width - 16; x += 16) {
4143                         __m512i sum512 = _mm512_setzero_si512();
4144 
4145                         s = src;
4146                         r = ref + x;
4147 
4148                         // Note: faster than looping 2 rows.
4149                         h = height;
4150                         do {
4151                             sad_loop_kernel_32_avx512(s, r, &sum512);
4152                             s += src_stride;
4153                             r += ref_stride;
4154                         } while (--h);
4155 
4156                         update_512_pel(sum512, x, y, &best_s, &best_x, &best_y);
4157                     }
4158 
4159                     // leftover
4160                     for (; x < search_area_width; x += 8) {
4161                         __m256i sum256 = _mm256_setzero_si256();
4162 
4163                         s = src;
4164                         r = ref + x;
4165 
4166                         h = height;
4167                         do {
4168                             sad_loop_kernel_32_avx2(s, r, &sum256);
4169                             s += src_stride;
4170                             r += ref_stride;
4171                         } while (--h);
4172 
4173                         update_leftover_512_pel(
4174                             sum256, search_area_width, x, y, mask256, &best_s, &best_x, &best_y);
4175                     }
4176 
4177                     ref += src_stride_raw;
4178                 } while (++y < search_area_height);
4179             } else if (height <= 32) {
4180                 const uint32_t leftover16 = search_area_width & 15;
4181 
4182                 y = 0;
4183                 do {
4184                     for (x = 0; x <= search_area_width - 16; x += 16) {
4185                         __m512i sums512[2] = {_mm512_setzero_si512(), _mm512_setzero_si512()};
4186 
4187                         s = src;
4188                         r = ref + x;
4189 
4190                         h = height;
4191                         do {
4192                             sad_loop_kernel_32_2sum_avx512(s, r, sums512);
4193                             s += src_stride;
4194                             r += ref_stride;
4195                         } while (--h);
4196 
4197                         update_1024_pel(sums512, x, y, &best_s, &best_x, &best_y);
4198                     }
4199 
4200                     if (leftover16 >= 8) {
4201                         __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
4202 
4203                         s = src;
4204                         r = ref + x;
4205 
4206                         h = height;
4207                         do {
4208                             sad_loop_kernel_32_2sum_avx2(s, r, sums256);
4209                             s += src_stride;
4210                             r += ref_stride;
4211                         } while (--h);
4212 
4213                         update_leftover8_1024_pel(sums256, x, y, &best_s, &best_x, &best_y);
4214                         x += 8;
4215                     }
4216 
4217                     if (leftover) {
4218                         __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
4219 
4220                         s = src;
4221                         r = ref + x;
4222 
4223                         h = height;
4224                         do {
4225                             sad_loop_kernel_32_2sum_avx2(s, r, sums256);
4226                             s += src_stride;
4227                             r += ref_stride;
4228                         } while (--h);
4229 
4230                         update_leftover_1024_pel(sums256,
4231                                                  search_area_width,
4232                                                  x,
4233                                                  y,
4234                                                  leftover,
4235                                                  mask128,
4236                                                  &best_s,
4237                                                  &best_x,
4238                                                  &best_y);
4239                     }
4240 
4241                     ref += src_stride_raw;
4242                 } while (++y < search_area_height);
4243             } else {
4244                 const uint32_t leftover16 = search_area_width & 15;
4245 
4246                 y = 0;
4247                 do {
4248                     for (x = 0; x <= search_area_width - 16; x += 16) {
4249                         __m512i sums512[4] = {_mm512_setzero_si512(),
4250                                               _mm512_setzero_si512(),
4251                                               _mm512_setzero_si512(),
4252                                               _mm512_setzero_si512()};
4253 
4254                         s = src;
4255                         r = ref + x;
4256 
4257                         h = height;
4258                         do {
4259                             sad_loop_kernel_32_4sum_avx512(s, r, sums512);
4260                             s += src_stride;
4261                             r += ref_stride;
4262                         } while (--h);
4263 
4264                         update_2048_pel(sums512, x, y, &best_s, &best_x, &best_y);
4265                     }
4266 
4267                     if (leftover16 >= 8) {
4268                         __m256i sums256[4] = {_mm256_setzero_si256(),
4269                                               _mm256_setzero_si256(),
4270                                               _mm256_setzero_si256(),
4271                                               _mm256_setzero_si256()};
4272 
4273                         s = src;
4274                         r = ref + x;
4275 
4276                         h = height;
4277                         do {
4278                             sad_loop_kernel_32_4sum_avx2(s, r, sums256);
4279                             s += src_stride;
4280                             r += ref_stride;
4281                         } while (--h);
4282 
4283                         update_leftover8_2048_pel(sums256, x, y, &best_s, &best_x, &best_y);
4284                         x += 8;
4285                     }
4286 
4287                     if (leftover) {
4288                         __m256i sums256[4] = {_mm256_setzero_si256(),
4289                                               _mm256_setzero_si256(),
4290                                               _mm256_setzero_si256(),
4291                                               _mm256_setzero_si256()};
4292 
4293                         s = src;
4294                         r = ref + x;
4295 
4296                         h = height;
4297                         do {
4298                             sad_loop_kernel_32_4sum_avx2(s, r, sums256);
4299                             s += src_stride;
4300                             r += ref_stride;
4301                         } while (--h);
4302 
4303                         update_leftover_2048_pel(sums256,
4304                                                  search_area_width,
4305                                                  x,
4306                                                  y,
4307                                                  leftover,
4308                                                  mask128,
4309                                                  &best_s,
4310                                                  &best_x,
4311                                                  &best_y);
4312                     }
4313 
4314                     ref += src_stride_raw;
4315                 } while (++y < search_area_height);
4316             }
4317             break;
4318 
4319         case 48:
4320             if (height <= 32) {
4321                 const uint32_t leftover16 = search_area_width & 15;
4322 
4323                 y = 0;
4324                 do {
4325                     for (x = 0; x <= search_area_width - 16; x += 16) {
4326                         __m512i sums512[3] = {
4327                             _mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512()};
4328 
4329                         s = src;
4330                         r = ref + x;
4331 
4332                         h = height2;
4333                         do {
4334                             sad_loop_kernel_32_2sum_avx512(s, r, sums512);
4335                             sad_loop_kernel_32_2sum_avx512(s + src_stride, r + ref_stride, sums512);
4336                             sad_loop_kernel_16_avx512(
4337                                 s + 32, src_stride, r + 32, ref_stride, &sums512[2]);
4338                             s += 2 * src_stride;
4339                             r += 2 * ref_stride;
4340                         } while (--h);
4341 
4342                         update_1536_pel(sums512, x, y, &best_s, &best_x, &best_y);
4343                     }
4344 
4345                     if (leftover16 >= 8) {
4346                         __m256i sums256[3] = {
4347                             _mm256_setzero_si256(), _mm256_setzero_si256(), _mm256_setzero_si256()};
4348 
4349                         s = src;
4350                         r = ref + x;
4351 
4352                         h = height2;
4353                         do {
4354                             sad_loop_kernel_32_2sum_avx2(s, r, sums256);
4355                             sad_loop_kernel_32_2sum_avx2(s + src_stride, r + ref_stride, sums256);
4356                             sad_loop_kernel_16_avx2(
4357                                 s + 32, src_stride, r + 32, ref_stride, &sums256[2]);
4358                             s += 2 * src_stride;
4359                             r += 2 * ref_stride;
4360                         } while (--h);
4361 
4362                         update_leftover8_1536_pel(sums256, x, y, &best_s, &best_x, &best_y);
4363                         x += 8;
4364                     }
4365 
4366                     if (leftover) {
4367                         __m256i sums256[3] = {
4368                             _mm256_setzero_si256(), _mm256_setzero_si256(), _mm256_setzero_si256()};
4369 
4370                         s = src;
4371                         r = ref + x;
4372 
4373                         h = height2;
4374                         do {
4375                             sad_loop_kernel_32_2sum_avx2(s, r, sums256);
4376                             sad_loop_kernel_32_2sum_avx2(s + src_stride, r + ref_stride, sums256);
4377                             sad_loop_kernel_16_avx2(
4378                                 s + 32, src_stride, r + 32, ref_stride, &sums256[2]);
4379                             s += 2 * src_stride;
4380                             r += 2 * ref_stride;
4381                         } while (--h);
4382 
4383                         update_leftover_1536_pel(sums256,
4384                                                  search_area_width,
4385                                                  x,
4386                                                  y,
4387                                                  leftover,
4388                                                  mask128,
4389                                                  &best_s,
4390                                                  &best_x,
4391                                                  &best_y);
4392                     }
4393 
4394                     ref += src_stride_raw;
4395                 } while (++y < search_area_height);
4396             } else {
4397                 const uint32_t leftover16 = search_area_width & 15;
4398 
4399                 y = 0;
4400                 do {
4401                     for (x = 0; x <= search_area_width - 16; x += 16) {
4402                         __m512i sums512[6] = {_mm512_setzero_si512(),
4403                                               _mm512_setzero_si512(),
4404                                               _mm512_setzero_si512(),
4405                                               _mm512_setzero_si512(),
4406                                               _mm512_setzero_si512(),
4407                                               _mm512_setzero_si512()};
4408 
4409                         s = src;
4410                         r = ref + x;
4411 
4412                         h = height2;
4413                         do {
4414                             sad_loop_kernel_32_4sum_avx512(s, r, sums512);
4415                             sad_loop_kernel_32_4sum_avx512(s + src_stride, r + ref_stride, sums512);
4416                             sad_loop_kernel_16_2sum_avx512(
4417                                 s + 32, src_stride, r + 32, ref_stride, &sums512[4]);
4418                             s += 2 * src_stride;
4419                             r += 2 * ref_stride;
4420                         } while (--h);
4421 
4422                         const __m512i  sum512_01   = _mm512_adds_epu16(sums512[0], sums512[1]);
4423                         const __m512i  sum512_23   = _mm512_adds_epu16(sums512[2], sums512[3]);
4424                         const __m512i  sum512_45   = _mm512_adds_epu16(sums512[4], sums512[5]);
4425                         const __m512i  sum512_0123 = _mm512_adds_epu16(sum512_01, sum512_23);
4426                         const __m512i  sum512      = _mm512_adds_epu16(sum512_0123, sum512_45);
4427                         const __m256i  sum_lo      = _mm512_castsi512_si256(sum512);
4428                         const __m256i  sum_hi      = _mm512_extracti64x4_epi64(sum512, 1);
4429                         const __m256i  sad         = _mm256_adds_epu16(sum_lo, sum_hi);
4430                         const __m128i  sad_lo      = _mm256_castsi256_si128(sad);
4431                         const __m128i  sad_hi      = _mm256_extracti128_si256(sad, 1);
4432                         const __m128i  minpos_lo   = _mm_minpos_epu16(sad_lo);
4433                         const __m128i  minpos_hi   = _mm_minpos_epu16(sad_hi);
4434                         const uint32_t min0        = _mm_extract_epi16(minpos_lo, 0);
4435                         const uint32_t min1        = _mm_extract_epi16(minpos_hi, 0);
4436                         uint32_t       minmin, delta;
4437                         __m128i        minpos;
4438 
4439                         if (min0 <= min1) {
4440                             minmin = min0;
4441                             delta  = 0;
4442                             minpos = minpos_lo;
4443                         } else {
4444                             minmin = min1;
4445                             delta  = 8;
4446                             minpos = minpos_hi;
4447                         }
4448 
4449                         if (minmin < best_s) {
4450                             if (minmin != 0xFFFF) { // no overflow
4451                                 best_s = minmin;
4452                                 best_x = x + delta + _mm_extract_epi16(minpos, 1);
4453                                 best_y = y;
4454                             } else { // overflow
4455                                 __m256i sads256[2];
4456                                 __m128i sads128[2];
4457 
4458                                 add16x16x6to32bit(sums512, sads256);
4459 
4460                                 sads128[0] = _mm256_castsi256_si128(sads256[0]);
4461                                 sads128[1] = _mm256_extracti128_si256(sads256[0], 1);
4462                                 update_8_best(sads128, x + 0, y, &best_s, &best_x, &best_y);
4463 
4464                                 sads128[0] = _mm256_castsi256_si128(sads256[1]);
4465                                 sads128[1] = _mm256_extracti128_si256(sads256[1], 1);
4466                                 update_8_best(sads128, x + 8, y, &best_s, &best_x, &best_y);
4467                             }
4468                         }
4469                     }
4470 
4471                     if (leftover16 >= 8) {
4472                         __m256i sums256[6] = {_mm256_setzero_si256(),
4473                                               _mm256_setzero_si256(),
4474                                               _mm256_setzero_si256(),
4475                                               _mm256_setzero_si256(),
4476                                               _mm256_setzero_si256(),
4477                                               _mm256_setzero_si256()};
4478 
4479                         s = src;
4480                         r = ref + x;
4481 
4482                         h = height2;
4483                         do {
4484                             sad_loop_kernel_32_4sum_avx2(s, r, sums256);
4485                             sad_loop_kernel_32_4sum_avx2(s + src_stride, r + ref_stride, sums256);
4486                             sad_loop_kernel_16_2sum_avx2(
4487                                 s + 32, src_stride, r + 32, ref_stride, &sums256[4]);
4488                             s += 2 * src_stride;
4489                             r += 2 * ref_stride;
4490                         } while (--h);
4491 
4492                         const __m256i  sum01   = _mm256_adds_epu16(sums256[0], sums256[1]);
4493                         const __m256i  sum23   = _mm256_adds_epu16(sums256[2], sums256[3]);
4494                         const __m256i  sum45   = _mm256_adds_epu16(sums256[4], sums256[5]);
4495                         const __m256i  sum0123 = _mm256_adds_epu16(sum01, sum23);
4496                         const __m256i  sum256  = _mm256_adds_epu16(sum0123, sum45);
4497                         const __m128i  sum_lo  = _mm256_castsi256_si128(sum256);
4498                         const __m128i  sum_hi  = _mm256_extracti128_si256(sum256, 1);
4499                         const __m128i  sad     = _mm_adds_epu16(sum_lo, sum_hi);
4500                         const __m128i  minpos  = _mm_minpos_epu16(sad);
4501                         const uint32_t min0    = _mm_extract_epi16(minpos, 0);
4502 
4503                         if (min0 < best_s) {
4504                             if (min0 != 0xFFFF) { // no overflow
4505                                 best_s = min0;
4506                                 best_x = x + _mm_extract_epi16(minpos, 1);
4507                                 best_y = y;
4508                             } else { // overflow
4509                                 __m128i sads[2];
4510 
4511                                 add16x8x6to32bit(sums256, sads);
4512                                 update_8_best(sads, x, y, &best_s, &best_x, &best_y);
4513                             }
4514                         }
4515 
4516                         x += 8;
4517                     }
4518 
4519                     if (leftover) {
4520                         __m256i sums256[6] = {_mm256_setzero_si256(),
4521                                               _mm256_setzero_si256(),
4522                                               _mm256_setzero_si256(),
4523                                               _mm256_setzero_si256(),
4524                                               _mm256_setzero_si256(),
4525                                               _mm256_setzero_si256()};
4526 
4527                         s = src;
4528                         r = ref + x;
4529 
4530                         h = height2;
4531                         do {
4532                             sad_loop_kernel_32_4sum_avx2(s, r, sums256);
4533                             sad_loop_kernel_32_4sum_avx2(s + src_stride, r + ref_stride, sums256);
4534                             sad_loop_kernel_16_2sum_avx2(
4535                                 s + 32, src_stride, r + 32, ref_stride, &sums256[4]);
4536                             s += 2 * src_stride;
4537                             r += 2 * ref_stride;
4538                         } while (--h);
4539 
4540                         const __m256i  sum01   = _mm256_adds_epu16(sums256[0], sums256[1]);
4541                         const __m256i  sum23   = _mm256_adds_epu16(sums256[2], sums256[3]);
4542                         const __m256i  sum45   = _mm256_adds_epu16(sums256[4], sums256[5]);
4543                         const __m256i  sum0123 = _mm256_adds_epu16(sum01, sum23);
4544                         const __m256i  sum256  = _mm256_adds_epu16(sum0123, sum45);
4545                         const __m128i  sum_lo  = _mm256_castsi256_si128(sum256);
4546                         const __m128i  sum_hi  = _mm256_extracti128_si256(sum256, 1);
4547                         const __m128i  sad0    = _mm_adds_epu16(sum_lo, sum_hi);
4548                         const __m128i  sad1    = _mm_or_si128(sad0, mask128);
4549                         const __m128i  minpos  = _mm_minpos_epu16(sad1);
4550                         const uint32_t min0    = _mm_extract_epi16(minpos, 0);
4551 
4552                         if (min0 < best_s) {
4553                             if (min0 != 0xFFFF) { // no overflow
4554                                 best_s = min0;
4555                                 best_x = x + _mm_extract_epi16(minpos, 1);
4556                                 best_y = y;
4557                             } else { // overflow
4558                                 const int32_t num = x + ((leftover < 4) ? leftover : 4);
4559                                 __m128i       sads[2];
4560 
4561                                 add16x8x6to32bit(sums256, sads);
4562 
4563                                 do {
4564                                     UPDATE_BEST(sads[0], 0, x, best_s, best_x, best_y);
4565                                     sads[0] = _mm_srli_si128(sads[0], 4);
4566                                 } while (++x < num);
4567 
4568                                 while (x < search_area_width) {
4569                                     UPDATE_BEST(sads[1], 0, x, best_s, best_x, best_y);
4570                                     sads[1] = _mm_srli_si128(sads[1], 4);
4571                                     x++;
4572                                 }
4573                             }
4574                         }
4575                     }
4576 
4577                     ref += src_stride_raw;
4578                 } while (++y < search_area_height);
4579             }
4580             break;
4581 
4582         case 64:
4583             if (height <= 16) {
4584                 const uint32_t leftover16 = search_area_width & 15;
4585 
4586                 y = 0;
4587                 do {
4588                     for (x = 0; x <= search_area_width - 16; x += 16) {
4589                         __m512i sums512[2] = {_mm512_setzero_si512(), _mm512_setzero_si512()};
4590 
4591                         s = src;
4592                         r = ref + x;
4593 
4594                         h = height;
4595                         do {
4596                             sad_loop_kernel_32_2sum_avx512(s + 0 * 32, r + 0 * 32, sums512);
4597                             sad_loop_kernel_32_2sum_avx512(s + 1 * 32, r + 1 * 32, sums512);
4598                             s += src_stride;
4599                             r += ref_stride;
4600                         } while (--h);
4601 
4602                         update_1024_pel(sums512, x, y, &best_s, &best_x, &best_y);
4603                     }
4604 
4605                     if (leftover16 >= 8) {
4606                         __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
4607 
4608                         s = src;
4609                         r = ref + x;
4610 
4611                         h = height;
4612                         do {
4613                             sad_loop_kernel_64_2sum_avx2(s, r, sums256);
4614                             s += src_stride;
4615                             r += ref_stride;
4616                         } while (--h);
4617 
4618                         update_leftover8_1024_pel(sums256, x, y, &best_s, &best_x, &best_y);
4619                         x += 8;
4620                     }
4621 
4622                     if (leftover) {
4623                         __m256i sums256[2] = {_mm256_setzero_si256(), _mm256_setzero_si256()};
4624 
4625                         s = src;
4626                         r = ref + x;
4627 
4628                         h = height;
4629                         do {
4630                             sad_loop_kernel_64_2sum_avx2(s, r, sums256);
4631                             s += src_stride;
4632                             r += ref_stride;
4633                         } while (--h);
4634 
4635                         update_leftover_1024_pel(sums256,
4636                                                  search_area_width,
4637                                                  x,
4638                                                  y,
4639                                                  leftover,
4640                                                  mask128,
4641                                                  &best_s,
4642                                                  &best_x,
4643                                                  &best_y);
4644                     }
4645 
4646                     ref += src_stride_raw;
4647                 } while (++y < search_area_height);
4648             } else if (height <= 32) {
4649                 const uint32_t leftover16 = search_area_width & 15;
4650 
4651                 y = 0;
4652                 do {
4653                     for (x = 0; x <= search_area_width - 16; x += 16) {
4654                         __m512i sums512[4] = {_mm512_setzero_si512(),
4655                                               _mm512_setzero_si512(),
4656                                               _mm512_setzero_si512(),
4657                                               _mm512_setzero_si512()};
4658 
4659                         s = src;
4660                         r = ref + x;
4661 
4662                         h = height;
4663                         do {
4664                             sad_loop_kernel_32_4sum_avx512(s + 0 * 32, r + 0 * 32, sums512);
4665                             sad_loop_kernel_32_4sum_avx512(s + 1 * 32, r + 1 * 32, sums512);
4666                             s += src_stride;
4667                             r += ref_stride;
4668                         } while (--h);
4669 
4670                         update_2048_pel(sums512, x, y, &best_s, &best_x, &best_y);
4671                     }
4672 
4673                     if (leftover16 >= 8) {
4674                         __m256i sums256[4] = {_mm256_setzero_si256(),
4675                                               _mm256_setzero_si256(),
4676                                               _mm256_setzero_si256(),
4677                                               _mm256_setzero_si256()};
4678 
4679                         s = src;
4680                         r = ref + x;
4681 
4682                         h = height;
4683                         do {
4684                             sad_loop_kernel_64_4sum_avx2(s, r, sums256);
4685                             s += src_stride;
4686                             r += ref_stride;
4687                         } while (--h);
4688 
4689                         update_leftover8_2048_pel(sums256, x, y, &best_s, &best_x, &best_y);
4690                         x += 8;
4691                     }
4692 
4693                     if (leftover) {
4694                         __m256i sums256[4] = {_mm256_setzero_si256(),
4695                                               _mm256_setzero_si256(),
4696                                               _mm256_setzero_si256(),
4697                                               _mm256_setzero_si256()};
4698 
4699                         s = src;
4700                         r = ref + x;
4701 
4702                         h = height;
4703                         do {
4704                             sad_loop_kernel_64_4sum_avx2(s, r, sums256);
4705                             s += src_stride;
4706                             r += ref_stride;
4707                         } while (--h);
4708 
4709                         update_leftover_2048_pel(sums256,
4710                                                  search_area_width,
4711                                                  x,
4712                                                  y,
4713                                                  leftover,
4714                                                  mask128,
4715                                                  &best_s,
4716                                                  &best_x,
4717                                                  &best_y);
4718                     }
4719 
4720                     ref += src_stride_raw;
4721                 } while (++y < search_area_height);
4722             } else {
4723                 const uint32_t leftover16 = search_area_width & 15;
4724 
4725                 y = 0;
4726                 do {
4727                     for (x = 0; x <= search_area_width - 16; x += 16) {
4728                         __m512i sums512[8] = {_mm512_setzero_si512(),
4729                                               _mm512_setzero_si512(),
4730                                               _mm512_setzero_si512(),
4731                                               _mm512_setzero_si512(),
4732                                               _mm512_setzero_si512(),
4733                                               _mm512_setzero_si512(),
4734                                               _mm512_setzero_si512(),
4735                                               _mm512_setzero_si512()};
4736 
4737                         s = src;
4738                         r = ref + x;
4739 
4740                         h = height;
4741                         do {
4742                             sad_loop_kernel_32_4sum_avx512(s + 0 * 32, r + 0 * 32, sums512 + 0);
4743                             sad_loop_kernel_32_4sum_avx512(s + 1 * 32, r + 1 * 32, sums512 + 4);
4744                             s += src_stride;
4745                             r += ref_stride;
4746                         } while (--h);
4747 
4748                         const __m512i  sum512_01   = _mm512_adds_epu16(sums512[0], sums512[1]);
4749                         const __m512i  sum512_23   = _mm512_adds_epu16(sums512[2], sums512[3]);
4750                         const __m512i  sum512_45   = _mm512_adds_epu16(sums512[4], sums512[5]);
4751                         const __m512i  sum512_67   = _mm512_adds_epu16(sums512[6], sums512[7]);
4752                         const __m512i  sum512_0123 = _mm512_adds_epu16(sum512_01, sum512_23);
4753                         const __m512i  sum512_4567 = _mm512_adds_epu16(sum512_45, sum512_67);
4754                         const __m512i  sum512      = _mm512_adds_epu16(sum512_0123, sum512_4567);
4755                         const __m256i  sum_lo      = _mm512_castsi512_si256(sum512);
4756                         const __m256i  sum_hi      = _mm512_extracti64x4_epi64(sum512, 1);
4757                         const __m256i  sad         = _mm256_adds_epu16(sum_lo, sum_hi);
4758                         const __m128i  sad_lo      = _mm256_castsi256_si128(sad);
4759                         const __m128i  sad_hi      = _mm256_extracti128_si256(sad, 1);
4760                         const __m128i  minpos_lo   = _mm_minpos_epu16(sad_lo);
4761                         const __m128i  minpos_hi   = _mm_minpos_epu16(sad_hi);
4762                         const uint32_t min0        = _mm_extract_epi16(minpos_lo, 0);
4763                         const uint32_t min1        = _mm_extract_epi16(minpos_hi, 0);
4764                         uint32_t       minmin, delta;
4765                         __m128i        minpos;
4766 
4767                         if (min0 <= min1) {
4768                             minmin = min0;
4769                             delta  = 0;
4770                             minpos = minpos_lo;
4771                         } else {
4772                             minmin = min1;
4773                             delta  = 8;
4774                             minpos = minpos_hi;
4775                         }
4776 
4777                         if (minmin < best_s) {
4778                             if (minmin != 0xFFFF) { // no overflow
4779                                 best_s = minmin;
4780                                 best_x = x + delta + _mm_extract_epi16(minpos, 1);
4781                                 best_y = y;
4782                             } else { // overflow
4783                                 __m256i sads256[2];
4784                                 __m128i sads128[2];
4785 
4786                                 add16x16x8to32bit(sums512, sads256);
4787 
4788                                 sads128[0] = _mm256_castsi256_si128(sads256[0]);
4789                                 sads128[1] = _mm256_extracti128_si256(sads256[0], 1);
4790                                 update_8_best(sads128, x + 0, y, &best_s, &best_x, &best_y);
4791 
4792                                 sads128[0] = _mm256_castsi256_si128(sads256[1]);
4793                                 sads128[1] = _mm256_extracti128_si256(sads256[1], 1);
4794                                 update_8_best(sads128, x + 8, y, &best_s, &best_x, &best_y);
4795                             }
4796                         }
4797                     }
4798 
4799                     if (leftover16 >= 8) {
4800                         __m256i sums256[8] = {_mm256_setzero_si256(),
4801                                               _mm256_setzero_si256(),
4802                                               _mm256_setzero_si256(),
4803                                               _mm256_setzero_si256(),
4804                                               _mm256_setzero_si256(),
4805                                               _mm256_setzero_si256(),
4806                                               _mm256_setzero_si256(),
4807                                               _mm256_setzero_si256()};
4808 
4809                         s = src;
4810                         r = ref + x;
4811 
4812                         h = height;
4813                         do {
4814                             sad_loop_kernel_64_8sum_avx2(s, r, sums256);
4815                             s += src_stride;
4816                             r += ref_stride;
4817                         } while (--h);
4818 
4819                         const __m256i  sum01   = _mm256_adds_epu16(sums256[0], sums256[1]);
4820                         const __m256i  sum23   = _mm256_adds_epu16(sums256[2], sums256[3]);
4821                         const __m256i  sum45   = _mm256_adds_epu16(sums256[4], sums256[5]);
4822                         const __m256i  sum67   = _mm256_adds_epu16(sums256[6], sums256[7]);
4823                         const __m256i  sum0123 = _mm256_adds_epu16(sum01, sum23);
4824                         const __m256i  sum4567 = _mm256_adds_epu16(sum45, sum67);
4825                         const __m256i  sum256  = _mm256_adds_epu16(sum0123, sum4567);
4826                         const __m128i  sum_lo  = _mm256_castsi256_si128(sum256);
4827                         const __m128i  sum_hi  = _mm256_extracti128_si256(sum256, 1);
4828                         const __m128i  sad     = _mm_adds_epu16(sum_lo, sum_hi);
4829                         const __m128i  minpos  = _mm_minpos_epu16(sad);
4830                         const uint32_t min0    = _mm_extract_epi16(minpos, 0);
4831 
4832                         if (min0 < best_s) {
4833                             if (min0 != 0xFFFF) { // no overflow
4834                                 best_s = min0;
4835                                 best_x = x + _mm_extract_epi16(minpos, 1);
4836                                 best_y = y;
4837                             } else { // overflow
4838                                 __m128i sads[2];
4839 
4840                                 add16x8x8to32bit(sums256, sads);
4841                                 update_8_best(sads, x, y, &best_s, &best_x, &best_y);
4842                             }
4843                         }
4844 
4845                         x += 8;
4846                     }
4847 
4848                     if (leftover) {
4849                         __m256i sums256[8] = {_mm256_setzero_si256(),
4850                                               _mm256_setzero_si256(),
4851                                               _mm256_setzero_si256(),
4852                                               _mm256_setzero_si256(),
4853                                               _mm256_setzero_si256(),
4854                                               _mm256_setzero_si256(),
4855                                               _mm256_setzero_si256(),
4856                                               _mm256_setzero_si256()};
4857 
4858                         s = src;
4859                         r = ref + x;
4860 
4861                         h = height;
4862                         do {
4863                             sad_loop_kernel_64_8sum_avx2(s, r, sums256);
4864                             s += src_stride;
4865                             r += ref_stride;
4866                         } while (--h);
4867 
4868                         const __m256i  sum01   = _mm256_adds_epu16(sums256[0], sums256[1]);
4869                         const __m256i  sum23   = _mm256_adds_epu16(sums256[2], sums256[3]);
4870                         const __m256i  sum45   = _mm256_adds_epu16(sums256[4], sums256[5]);
4871                         const __m256i  sum67   = _mm256_adds_epu16(sums256[6], sums256[7]);
4872                         const __m256i  sum0123 = _mm256_adds_epu16(sum01, sum23);
4873                         const __m256i  sum4567 = _mm256_adds_epu16(sum45, sum67);
4874                         const __m256i  sum256  = _mm256_adds_epu16(sum0123, sum4567);
4875                         const __m128i  sum_lo  = _mm256_castsi256_si128(sum256);
4876                         const __m128i  sum_hi  = _mm256_extracti128_si256(sum256, 1);
4877                         const __m128i  sad0    = _mm_adds_epu16(sum_lo, sum_hi);
4878                         const __m128i  sad1    = _mm_or_si128(sad0, mask128);
4879                         const __m128i  minpos  = _mm_minpos_epu16(sad1);
4880                         const uint32_t min0    = _mm_extract_epi16(minpos, 0);
4881 
4882                         if (min0 < best_s) {
4883                             if (min0 != 0xFFFF) { // no overflow
4884                                 best_s = min0;
4885                                 best_x = x + _mm_extract_epi16(minpos, 1);
4886                                 best_y = y;
4887                             } else { // overflow
4888                                 const int32_t num = x + ((leftover < 4) ? leftover : 4);
4889                                 __m128i       sads[2];
4890 
4891                                 add16x8x8to32bit(sums256, sads);
4892 
4893                                 do {
4894                                     UPDATE_BEST(sads[0], 0, x, best_s, best_x, best_y);
4895                                     sads[0] = _mm_srli_si128(sads[0], 4);
4896                                 } while (++x < num);
4897 
4898                                 while (x < search_area_width) {
4899                                     UPDATE_BEST(sads[1], 0, x, best_s, best_x, best_y);
4900                                     sads[1] = _mm_srli_si128(sads[1], 4);
4901                                     x++;
4902                                 }
4903                             }
4904                         }
4905                     }
4906 
4907                     ref += src_stride_raw;
4908                 } while (++y < search_area_height);
4909             }
4910             break;
4911 
4912         default:
4913             sad_loop_kernel_generalized_avx512(src,
4914                                                src_stride,
4915                                                ref,
4916                                                ref_stride,
4917                                                height,
4918                                                width,
4919                                                best_sad,
4920                                                x_search_center,
4921                                                y_search_center,
4922                                                src_stride_raw,
4923                                                search_area_width,
4924                                                search_area_height);
4925             return;
4926         }
4927     }
4928 
4929     *best_sad        = best_s;
4930     *x_search_center = (int16_t)best_x;
4931     *y_search_center = (int16_t)best_y;
4932 }
4933 #endif // EN_AVX512_SUPPORT
4934