1 /*
2 * Copyright(c) 2019 Intel Corporation
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
10 */
11 
12 #include "EbDefinitions.h"
13 
14 #if EN_AVX512_SUPPORT
15 #include <immintrin.h>
16 #include "common_dsp_rtcd.h"
17 #include "convolve.h"
18 #include "convolve_avx2.h"
19 #include "convolve_avx512.h"
20 #include "EbMemory_SSE4_1.h"
21 
jnt_2d_comp_avg_round_32_avx512(const __m512i src[2])22 static INLINE __m512i jnt_2d_comp_avg_round_32_avx512(const __m512i src[2]) {
23     const __m512i round = _mm512_set1_epi32(1 << (COMPOUND_ROUND1_BITS - 1));
24     const __m512i dst0  = _mm512_add_epi32(src[0], round);
25     const __m512i dst1  = _mm512_add_epi32(src[1], round);
26     const __m512i d0    = _mm512_srai_epi32(dst0, COMPOUND_ROUND1_BITS);
27     const __m512i d1    = _mm512_srai_epi32(dst1, COMPOUND_ROUND1_BITS);
28     return _mm512_packs_epi32(d0, d1);
29 }
30 
jnt_2d_comp_avg_round_half_pel_avx512(const __m512i src)31 static INLINE __m512i jnt_2d_comp_avg_round_half_pel_avx512(const __m512i src) {
32     const __m512i round = _mm512_set1_epi16(1);
33     const __m512i dst   = _mm512_add_epi16(src, round);
34     return _mm512_srai_epi16(dst, 1);
35 }
36 
jnt_2d_comp_avg_round_pack_32_avx512(const __m512i res[2],const __m512i factor,const __m512i offset,const __m512i dst)37 static INLINE __m512i jnt_2d_comp_avg_round_pack_32_avx512(const __m512i res[2],
38                                                            const __m512i factor,
39                                                            const __m512i offset,
40                                                            const __m512i dst) {
41     const __m512i r = jnt_2d_comp_avg_round_32_avx512(res);
42     __m512i       d[2];
43 
44     d[0] = _mm512_unpacklo_epi16(dst, r);
45     d[1] = _mm512_unpackhi_epi16(dst, r);
46     d[0] = _mm512_madd_epi16(d[0], factor);
47     d[1] = _mm512_madd_epi16(d[1], factor);
48     d[0] = _mm512_add_epi32(d[0], offset);
49     d[1] = _mm512_add_epi32(d[1], offset);
50     d[0] = _mm512_srai_epi32(d[0], 8);
51     d[1] = _mm512_srai_epi32(d[1], 8);
52     return _mm512_packs_epi32(d[0], d[1]);
53 }
54 
jnt_2d_comp_avg_round_pack_half_pel_avx512(const __m512i res,const __m512i factor,const __m512i offset,const __m512i dst)55 static INLINE __m512i jnt_2d_comp_avg_round_pack_half_pel_avx512(const __m512i res,
56                                                                  const __m512i factor,
57                                                                  const __m512i offset,
58                                                                  const __m512i dst) {
59     const __m512i r = jnt_2d_comp_avg_round_half_pel_avx512(res);
60     __m512i       d[2];
61 
62     d[0] = _mm512_unpacklo_epi16(dst, r);
63     d[1] = _mm512_unpackhi_epi16(dst, r);
64     d[0] = _mm512_madd_epi16(d[0], factor);
65     d[1] = _mm512_madd_epi16(d[1], factor);
66     d[0] = _mm512_add_epi32(d[0], offset);
67     d[1] = _mm512_add_epi32(d[1], offset);
68     d[0] = _mm512_srai_epi32(d[0], 8);
69     d[1] = _mm512_srai_epi32(d[1], 8);
70     return _mm512_packs_epi32(d[0], d[1]);
71 }
72 
jnt_2d_comp_avg_round_store_32x2_avx512(const __m512i r0[2],const __m512i r1[2],const __m512i factor,const __m512i offset,const ConvBufType * const dst,const int32_t dst_stride,uint8_t * const dst8,const int32_t dst8_stride)73 SIMD_INLINE void jnt_2d_comp_avg_round_store_32x2_avx512(const __m512i r0[2], const __m512i r1[2],
74                                                          const __m512i factor, const __m512i offset,
75                                                          const ConvBufType *const dst,
76                                                          const int32_t            dst_stride,
77                                                          uint8_t *const           dst8,
78                                                          const int32_t            dst8_stride) {
79     __m512i d[2];
80 
81     d[0] = zz_loadu_512((dst + 0 * dst_stride));
82     d[1] = zz_loadu_512((dst + 1 * dst_stride));
83     d[0] = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), d[0]);
84     d[1] = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), d[1]);
85     d[0] = jnt_2d_comp_avg_round_pack_32_avx512(r0, factor, offset, d[0]);
86     d[1] = jnt_2d_comp_avg_round_pack_32_avx512(r1, factor, offset, d[1]);
87     xy_y_pack_store_32x2_avx512(d[0], d[1], dst8, dst8_stride);
88 }
89 
jnt_2d_comp_avg_round_store_64_avx512(const __m512i r0[2],const __m512i r1[2],const __m512i factor,const __m512i offset,const ConvBufType * const dst,uint8_t * const dst8)90 SIMD_INLINE void jnt_2d_comp_avg_round_store_64_avx512(const __m512i r0[2], const __m512i r1[2],
91                                                        const __m512i factor, const __m512i offset,
92                                                        const ConvBufType *const dst,
93                                                        uint8_t *const           dst8) {
94     __m512i d[2];
95 
96     jnt_loadu_u16_8x4x2_avx512(dst, 32, d);
97     d[0] = jnt_2d_comp_avg_round_pack_32_avx512(r0, factor, offset, d[0]);
98     d[1] = jnt_2d_comp_avg_round_pack_32_avx512(r1, factor, offset, d[1]);
99     convolve_store_64_avx512(d[0], d[1], dst8);
100 }
101 
jnt_2d_comp_avg_round_store_half_pel_32x2_avx512(const __m512i res[2],const __m512i factor,const __m512i offset,const ConvBufType * const dst,const int32_t dst_stride,uint8_t * const dst8,const int32_t dst8_stride)102 SIMD_INLINE void jnt_2d_comp_avg_round_store_half_pel_32x2_avx512(
103     const __m512i res[2], const __m512i factor, const __m512i offset, const ConvBufType *const dst,
104     const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
105     __m512i d[2];
106 
107     d[0] = zz_loadu_512((dst + 0 * dst_stride));
108     d[1] = zz_loadu_512((dst + 1 * dst_stride));
109     d[0] = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), d[0]);
110     d[1] = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), d[1]);
111     d[0] = jnt_2d_comp_avg_round_pack_half_pel_avx512(res[0], factor, offset, d[0]);
112     d[1] = jnt_2d_comp_avg_round_pack_half_pel_avx512(res[1], factor, offset, d[1]);
113     xy_y_pack_store_32x2_avx512(d[0], d[1], dst8, dst8_stride);
114 }
115 
jnt_2d_comp_avg_round_store_half_pel_64_avx512(const __m512i res[2],const __m512i factor,const __m512i offset,const ConvBufType * const dst,uint8_t * const dst8)116 SIMD_INLINE void jnt_2d_comp_avg_round_store_half_pel_64_avx512(const __m512i            res[2],
117                                                                 const __m512i            factor,
118                                                                 const __m512i            offset,
119                                                                 const ConvBufType *const dst,
120                                                                 uint8_t *const           dst8) {
121     __m512i d[2];
122 
123     jnt_loadu_u16_8x4x2_avx512(dst, 32, d);
124     d[0] = jnt_2d_comp_avg_round_pack_half_pel_avx512(res[0], factor, offset, d[0]);
125     d[1] = jnt_2d_comp_avg_round_pack_half_pel_avx512(res[1], factor, offset, d[1]);
126     convolve_store_64_avx512(d[0], d[1], dst8);
127 }
128 
jnt_2d_round_32_avx512(const __m512i src[2],const __m512i offset)129 static INLINE __m512i jnt_2d_round_32_avx512(const __m512i src[2], const __m512i offset) {
130     const __m512i dst0 = _mm512_add_epi32(src[0], offset);
131     const __m512i dst1 = _mm512_add_epi32(src[1], offset);
132     const __m512i d0   = _mm512_srai_epi32(dst0, COMPOUND_ROUND1_BITS);
133     const __m512i d1   = _mm512_srai_epi32(dst1, COMPOUND_ROUND1_BITS);
134     return _mm512_packs_epi32(d0, d1);
135 }
136 
jnt_2d_round_half_pel_avx512(const __m512i src,const __m512i offset)137 static INLINE __m512i jnt_2d_round_half_pel_avx512(const __m512i src, const __m512i offset) {
138     const __m512i dst0 = _mm512_add_epi16(src, offset);
139     return _mm512_srai_epi16(dst0, 1);
140 }
141 
jnt_2d_avg_round_store_32x2_avx512(const __m512i r0[2],const __m512i r1[2],const __m512i offset,const ConvBufType * const dst,const int32_t dst_stride,uint8_t * const dst8,const int32_t dst8_stride)142 SIMD_INLINE void jnt_2d_avg_round_store_32x2_avx512(const __m512i r0[2], const __m512i r1[2],
143                                                     const __m512i            offset,
144                                                     const ConvBufType *const dst,
145                                                     const int32_t dst_stride, uint8_t *const dst8,
146                                                     const int32_t dst8_stride) {
147     __m512i r[2], d[2];
148 
149     r[0] = jnt_2d_round_32_avx512(r0, offset);
150     r[1] = jnt_2d_round_32_avx512(r1, offset);
151     d[0] = zz_loadu_512((dst + 0 * dst_stride));
152     d[1] = zz_loadu_512((dst + 1 * dst_stride));
153     d[0] = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), d[0]);
154     d[1] = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), d[1]);
155     d[0] = jnt_avg_32_avx512(r[0], d[0]);
156     d[1] = jnt_avg_32_avx512(r[1], d[1]);
157     xy_y_pack_store_32x2_avx512(d[0], d[1], dst8, dst8_stride);
158 }
159 
jnt_2d_avg_round_store_64_avx512(const __m512i r0[2],const __m512i r1[2],const __m512i offset,const ConvBufType * const dst,uint8_t * const dst8)160 SIMD_INLINE void jnt_2d_avg_round_store_64_avx512(const __m512i r0[2], const __m512i r1[2],
161                                                   const __m512i            offset,
162                                                   const ConvBufType *const dst,
163                                                   uint8_t *const           dst8) {
164     __m512i r[2], d[2];
165 
166     r[0] = jnt_2d_round_32_avx512(r0, offset);
167     r[1] = jnt_2d_round_32_avx512(r1, offset);
168     jnt_loadu_u16_8x4x2_avx512(dst, 32, d);
169     d[0] = jnt_avg_32_avx512(r[0], d[0]);
170     d[1] = jnt_avg_32_avx512(r[1], d[1]);
171     convolve_store_64_avx512(d[0], d[1], dst8);
172 }
173 
jnt_2d_avg_round_store_half_pel_32x2_avx512(const __m512i res[2],const __m512i offset,const ConvBufType * const dst,const int32_t dst_stride,uint8_t * const dst8,const int32_t dst8_stride)174 static INLINE void jnt_2d_avg_round_store_half_pel_32x2_avx512(
175     const __m512i res[2], const __m512i offset, const ConvBufType *const dst,
176     const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
177     __m512i r[2], d[2];
178 
179     r[0] = jnt_2d_round_half_pel_avx512(res[0], offset);
180     r[1] = jnt_2d_round_half_pel_avx512(res[1], offset);
181     d[0] = zz_loadu_512((dst + 0 * dst_stride));
182     d[1] = zz_loadu_512((dst + 1 * dst_stride));
183     d[0] = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), d[0]);
184     d[1] = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), d[1]);
185     d[0] = jnt_avg_32_avx512(r[0], d[0]);
186     d[1] = jnt_avg_32_avx512(r[1], d[1]);
187     xy_y_pack_store_32x2_avx512(d[0], d[1], dst8, dst8_stride);
188 }
189 
jnt_2d_avg_round_store_half_pel_64_avx512(const __m512i res[2],const __m512i offset,const ConvBufType * const dst,uint8_t * const dst8)190 static INLINE void jnt_2d_avg_round_store_half_pel_64_avx512(const __m512i            res[2],
191                                                              const __m512i            offset,
192                                                              const ConvBufType *const dst,
193                                                              uint8_t *const           dst8) {
194     __m512i r[2], d[2];
195 
196     r[0] = jnt_2d_round_half_pel_avx512(res[0], offset);
197     r[1] = jnt_2d_round_half_pel_avx512(res[1], offset);
198     jnt_loadu_u16_8x4x2_avx512(dst, 32, d);
199     d[0] = jnt_avg_32_avx512(r[0], d[0]);
200     d[1] = jnt_avg_32_avx512(r[1], d[1]);
201     convolve_store_64_avx512(d[0], d[1], dst8);
202 }
203 
jnt_2d_no_avg_store_32x2_avx512(const __m512i src0,const __m512i src1,ConvBufType * const dst,const int32_t stride)204 static INLINE void jnt_2d_no_avg_store_32x2_avx512(const __m512i src0, const __m512i src1,
205                                                    ConvBufType *const dst, const int32_t stride) {
206     const __m512i d0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), src0);
207     const __m512i d1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), src1);
208     _mm512_storeu_si512((__m512i *)dst, d0);
209     _mm512_storeu_si512((__m512i *)(dst + stride), d1);
210 }
211 
jnt_2d_no_avg_round_store_32x2_avx512(const __m512i r0[2],const __m512i r1[2],const __m512i offset,ConvBufType * const dst,const int32_t stride)212 static INLINE void jnt_2d_no_avg_round_store_32x2_avx512(const __m512i r0[2], const __m512i r1[2],
213                                                          const __m512i      offset,
214                                                          ConvBufType *const dst,
215                                                          const int32_t      stride) {
216     const __m512i d0 = jnt_2d_round_32_avx512(r0, offset);
217     const __m512i d1 = jnt_2d_round_32_avx512(r1, offset);
218     jnt_2d_no_avg_store_32x2_avx512(d0, d1, dst, stride);
219 }
220 
jnt_2d_no_avg_round_store_64_avx512(const __m512i r0[2],const __m512i r1[2],const __m512i offset,ConvBufType * const dst)221 static INLINE void jnt_2d_no_avg_round_store_64_avx512(const __m512i r0[2], const __m512i r1[2],
222                                                        const __m512i      offset,
223                                                        ConvBufType *const dst) {
224     const __m512i d0 = jnt_2d_round_32_avx512(r0, offset);
225     const __m512i d1 = jnt_2d_round_32_avx512(r1, offset);
226     jnt_no_avg_store_32x2_avx512(d0, d1, dst, 32);
227 }
228 
jnt_2d_no_avg_round_store_half_pel_32x2_avx512(const __m512i res[2],const __m512i offset,ConvBufType * const dst,const int32_t stride)229 static INLINE void jnt_2d_no_avg_round_store_half_pel_32x2_avx512(const __m512i      res[2],
230                                                                   const __m512i      offset,
231                                                                   ConvBufType *const dst,
232                                                                   const int32_t      stride) {
233     const __m512i d0 = jnt_2d_round_half_pel_avx512(res[0], offset);
234     const __m512i d1 = jnt_2d_round_half_pel_avx512(res[1], offset);
235     jnt_2d_no_avg_store_32x2_avx512(d0, d1, dst, stride);
236 }
237 
jnt_2d_no_avg_round_store_half_pel_64_avx512(const __m512i res[2],const __m512i offset,ConvBufType * const dst)238 static INLINE void jnt_2d_no_avg_round_store_half_pel_64_avx512(const __m512i      res[2],
239                                                                 const __m512i      offset,
240                                                                 ConvBufType *const dst) {
241     const __m512i d0 = jnt_2d_round_half_pel_avx512(res[0], offset);
242     const __m512i d1 = jnt_2d_round_half_pel_avx512(res[1], offset);
243     jnt_no_avg_store_32x2_avx512(d0, d1, dst, 32);
244 }
245 
jnt_convolve_2d_hor_2tap_avx512(const uint8_t * src,const int32_t src_stride,const int32_t w,const int32_t h,const InterpFilterParams * filter_params_x,const int32_t subpel_x_q4,int16_t * const im_block)246 static void jnt_convolve_2d_hor_2tap_avx512(const uint8_t *src, const int32_t src_stride,
247                                             const int32_t w, const int32_t h,
248                                             const InterpFilterParams *filter_params_x,
249                                             const int32_t subpel_x_q4, int16_t *const im_block) {
250     const uint8_t *src_ptr = src;
251     int32_t        y       = h;
252     int16_t *      im      = im_block;
253 
254     if (w <= 8) {
255         __m128i coeffs_128;
256         prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4, &coeffs_128);
257 
258         if (w == 2) {
259             do {
260                 const __m128i r = x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, &coeffs_128);
261                 xy_x_round_store_2x2_sse2(r, im);
262                 src_ptr += 2 * src_stride;
263                 im += 2 * 2;
264                 y -= 2;
265             } while (y);
266         } else if (w == 4) {
267             do {
268                 const __m128i r = x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, &coeffs_128);
269                 xy_x_round_store_4x2_sse2(r, im);
270                 src_ptr += 2 * src_stride;
271                 im += 2 * 4;
272                 y -= 2;
273             } while (y);
274         } else {
275             assert(w == 8);
276 
277             do {
278                 __m128i r[2];
279 
280                 x_convolve_2tap_8x2_ssse3(src_ptr, src_stride, &coeffs_128, r);
281                 xy_x_round_store_8x2_sse2(r, im);
282                 src_ptr += 2 * src_stride;
283                 im += 2 * 8;
284                 y -= 2;
285             } while (y);
286         }
287     } else if (w == 16) {
288         __m256i coeffs_256;
289         prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, &coeffs_256);
290 
291         do {
292             __m256i r[2];
293 
294             x_convolve_2tap_16x2_avx2(src_ptr, src_stride, &coeffs_256, r);
295             xy_x_round_store_32_avx2(r, im);
296             src_ptr += 2 * src_stride;
297             im += 2 * 16;
298             y -= 2;
299         } while (y);
300     } else {
301         __m512i coeffs_512;
302         prepare_half_coeffs_2tap_avx512(filter_params_x, subpel_x_q4, &coeffs_512);
303 
304         if (w == 32) {
305             do {
306                 xy_x_2tap_32x2_avx512(src_ptr, src_stride, &coeffs_512, im);
307                 src_ptr += 2 * src_stride;
308                 im += 2 * 32;
309                 y -= 2;
310             } while (y);
311         } else if (w == 64) {
312             do {
313                 xy_x_2tap_64_avx512(src_ptr, &coeffs_512, im);
314                 src_ptr += src_stride;
315                 im += 64;
316             } while (--y);
317         } else {
318             assert(w == 128);
319 
320             do {
321                 xy_x_2tap_64_avx512(src_ptr + 0 * 64, &coeffs_512, im + 0 * 64);
322                 xy_x_2tap_64_avx512(src_ptr + 1 * 64, &coeffs_512, im + 1 * 64);
323                 src_ptr += src_stride;
324                 im += 128;
325             } while (--y);
326         }
327     }
328 }
329 
jnt_convolve_2d_hor_6tap_avx512(const uint8_t * src,const int32_t src_stride,const int32_t w,const int32_t h,const InterpFilterParams * filter_params_x,const int32_t subpel_x_q4,int16_t * const im_block)330 static void jnt_convolve_2d_hor_6tap_avx512(const uint8_t *src, const int32_t src_stride,
331                                             const int32_t w, const int32_t h,
332                                             const InterpFilterParams *filter_params_x,
333                                             const int32_t subpel_x_q4, int16_t *const im_block) {
334     const uint8_t *src_ptr = src - 2;
335     int32_t        y       = h;
336     int16_t *      im      = im_block;
337 
338     if (w <= 16) {
339         __m256i coeffs_256[3], filt_256[3];
340 
341         filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx);
342         filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx);
343         filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx);
344 
345         prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
346 
347         if (w == 8) {
348             do {
349                 const __m256i res = x_convolve_6tap_8x2_avx2(
350                     src_ptr, src_stride, coeffs_256, filt_256);
351                 xy_x_round_store_8x2_avx2(res, im);
352                 src_ptr += 2 * src_stride;
353                 im += 2 * 8;
354                 y -= 2;
355             } while (y);
356         } else {
357             assert(w == 16);
358 
359             do {
360                 __m256i r[2];
361 
362                 x_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256, r);
363                 xy_x_round_store_32_avx2(r, im);
364                 src_ptr += 2 * src_stride;
365                 im += 2 * 16;
366                 y -= 2;
367             } while (y);
368         }
369     } else {
370         __m512i coeffs_512[3], filt_512[3];
371 
372         filt_512[0] = zz_load_512(filt1_global_avx);
373         filt_512[1] = zz_load_512(filt2_global_avx);
374         filt_512[2] = zz_load_512(filt3_global_avx);
375 
376         prepare_half_coeffs_6tap_avx512(filter_params_x, subpel_x_q4, coeffs_512);
377         if (w == 32) {
378             do {
379                 xy_x_6tap_32x2_avx512(src_ptr, src_stride, coeffs_512, filt_512, im);
380                 src_ptr += 2 * src_stride;
381                 im += 2 * 32;
382                 y -= 2;
383             } while (y);
384         } else if (w == 64) {
385             do {
386                 xy_x_6tap_64_avx512(src_ptr, coeffs_512, filt_512, im);
387                 src_ptr += src_stride;
388                 im += 64;
389             } while (--y);
390         } else {
391             assert(w == 128);
392 
393             do {
394                 xy_x_6tap_64_avx512(src_ptr, coeffs_512, filt_512, im);
395                 xy_x_6tap_64_avx512(src_ptr + 64, coeffs_512, filt_512, im + 64);
396                 src_ptr += src_stride;
397                 im += 128;
398             } while (--y);
399         }
400     }
401 }
402 
jnt_convolve_2d_hor_8tap_avx512(const uint8_t * src,const int32_t src_stride,const int32_t w,const int32_t h,const InterpFilterParams * filter_params_x,const int32_t subpel_x_q4,int16_t * const im_block)403 static void jnt_convolve_2d_hor_8tap_avx512(const uint8_t *src, const int32_t src_stride,
404                                             const int32_t w, const int32_t h,
405                                             const InterpFilterParams *filter_params_x,
406                                             const int32_t subpel_x_q4, int16_t *const im_block) {
407     const uint8_t *src_ptr = src - 3;
408     int32_t        y       = h;
409     int16_t *      im      = im_block;
410 
411     if (w <= 16) {
412         __m256i coeffs_256[4], filt_256[4];
413 
414         filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx);
415         filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx);
416         filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx);
417         filt_256[3] = _mm256_loadu_si256((__m256i const *)filt4_global_avx);
418 
419         prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
420 
421         if (w == 8) {
422             do {
423                 const __m256i res = x_convolve_8tap_8x2_avx2(
424                     src_ptr, src_stride, coeffs_256, filt_256);
425                 xy_x_round_store_8x2_avx2(res, im);
426                 src_ptr += 2 * src_stride;
427                 im += 2 * 8;
428                 y -= 2;
429             } while (y);
430         } else {
431             assert(w == 16);
432 
433             do {
434                 __m256i r[2];
435 
436                 x_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256, r);
437                 xy_x_round_store_32_avx2(r, im);
438                 src_ptr += 2 * src_stride;
439                 im += 2 * 16;
440                 y -= 2;
441             } while (y);
442         }
443     } else {
444         __m512i coeffs_512[4], filt_512[4];
445 
446         filt_512[0] = zz_load_512(filt1_global_avx);
447         filt_512[1] = zz_load_512(filt2_global_avx);
448         filt_512[2] = zz_load_512(filt3_global_avx);
449         filt_512[3] = zz_load_512(filt4_global_avx);
450 
451         prepare_half_coeffs_8tap_avx512(filter_params_x, subpel_x_q4, coeffs_512);
452 
453         if (w == 32) {
454             do {
455                 xy_x_8tap_32x2_avx512(src_ptr, src_stride, coeffs_512, filt_512, im);
456                 src_ptr += 2 * src_stride;
457                 im += 2 * 32;
458                 y -= 2;
459             } while (y);
460         } else if (w == 64) {
461             do {
462                 xy_x_8tap_64_avx512(src_ptr, coeffs_512, filt_512, im);
463                 src_ptr += src_stride;
464                 im += 64;
465             } while (--y);
466         } else {
467             assert(w == 128);
468 
469             do {
470                 xy_x_8tap_64_avx512(src_ptr, coeffs_512, filt_512, im);
471                 xy_x_8tap_64_avx512(src_ptr + 64, coeffs_512, filt_512, im + 64);
472                 src_ptr += src_stride;
473                 im += 128;
474             } while (--y);
475         }
476     }
477 }
478 
jnt_convolve_2d_ver_2tap_avx512(const int16_t * const im_block,const int32_t w,const int32_t h,const InterpFilterParams * const filter_params_y,const int32_t subpel_y_q4,const ConvolveParams * const conv_params,uint8_t * dst8,const int32_t dst8_stride)479 static void jnt_convolve_2d_ver_2tap_avx512(const int16_t *const im_block, const int32_t w,
480                                             const int32_t                   h,
481                                             const InterpFilterParams *const filter_params_y,
482                                             const int32_t                   subpel_y_q4,
483                                             const ConvolveParams *const conv_params, uint8_t *dst8,
484                                             const int32_t dst8_stride) {
485     const int32_t  dst_stride     = conv_params->dst_stride;
486     const int32_t  bd             = 8;
487     const int32_t  round_0        = 3;
488     const int16_t *im             = im_block;
489     const int32_t  round_1        = COMPOUND_ROUND1_BITS;
490     const int32_t  offset_bits    = bd + 2 * FILTER_BITS - round_0; // 19
491     const int32_t  round_bits     = 2 * FILTER_BITS - round_0 - round_1; // 4
492     const int32_t  round_offset   = 1 << (offset_bits - round_1);
493     const int32_t  factor         = conv_params->fwd_offset | (conv_params->bck_offset << 16);
494     const int32_t offset_comp_avg = (round_offset + (round_offset >> 1)) * conv_params->bck_offset -
495         (round_offset << DIST_PRECISION_BITS) - (round_offset << (DIST_PRECISION_BITS - 1)) +
496         (1 << (round_bits + DIST_PRECISION_BITS - 1));
497     const int32_t offset_avg = (1 << (round_1 - 1)) + (1 << (round_bits + round_1)) -
498         (1 << offset_bits) - (1 << (offset_bits - 1));
499     const int32_t offset_no_avg = (1 << (round_1 - 1)) + (1 << offset_bits) +
500         (1 << (offset_bits - 1));
501     ConvBufType *dst = conv_params->dst;
502     int32_t      y   = h;
503 
504     if (w <= 4) {
505         const __m128i factor_128          = _mm_set1_epi32(factor);
506         const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
507         const __m128i offset_avg_128      = _mm_set1_epi32(offset_avg);
508         const __m128i offset_no_avg_128   = _mm_set1_epi32(offset_no_avg);
509         __m128i       coeffs_128;
510 
511         prepare_coeffs_2tap_sse2(filter_params_y, subpel_y_q4, &coeffs_128);
512 
513         if (w == 2) {
514             __m128i s_32[2];
515 
516             s_32[0] = _mm_cvtsi32_si128(*(int32_t *)im);
517 
518             if (conv_params->do_average) {
519                 if (conv_params->use_jnt_comp_avg) {
520                     do {
521                         const __m128i res = xy_y_convolve_2tap_2x2_sse2(im, s_32, &coeffs_128);
522                         jnt_2d_comp_avg_round_store_2x2_sse2(res,
523                                                              factor_128,
524                                                              offset_comp_avg_128,
525                                                              dst,
526                                                              dst_stride,
527                                                              dst8,
528                                                              dst8_stride);
529                         im += 2 * 2;
530                         dst += 2 * dst_stride;
531                         dst8 += 2 * dst8_stride;
532                         y -= 2;
533                     } while (y);
534                 } else {
535                     do {
536                         const __m128i res = xy_y_convolve_2tap_2x2_sse2(im, s_32, &coeffs_128);
537                         jnt_2d_avg_round_store_2x2_sse2(
538                             res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
539                         im += 2 * 2;
540                         dst += 2 * dst_stride;
541                         dst8 += 2 * dst8_stride;
542                         y -= 2;
543                     } while (y);
544                 }
545             } else {
546                 do {
547                     const __m128i res = xy_y_convolve_2tap_2x2_sse2(im, s_32, &coeffs_128);
548                     jnt_2d_no_avg_round_store_2x2_sse2(res, offset_no_avg_128, dst, dst_stride);
549                     im += 2 * 2;
550                     dst += 2 * dst_stride;
551                     y -= 2;
552                 } while (y);
553             }
554         } else {
555             __m128i s_64[2], r[2];
556 
557             assert(w == 4);
558 
559             s_64[0] = _mm_loadl_epi64((__m128i *)im);
560 
561             if (conv_params->do_average) {
562                 if (conv_params->use_jnt_comp_avg) {
563                     do {
564                         xy_y_convolve_2tap_4x2_sse2(im, s_64, &coeffs_128, r);
565                         jnt_2d_comp_avg_round_store_4x2_sse2(
566                             r, factor_128, offset_comp_avg_128, dst, dst_stride, dst8, dst8_stride);
567                         im += 2 * 4;
568                         dst += 2 * dst_stride;
569                         dst8 += 2 * dst8_stride;
570                         y -= 2;
571                     } while (y);
572                 } else {
573                     do {
574                         xy_y_convolve_2tap_4x2_sse2(im, s_64, &coeffs_128, r);
575                         jnt_2d_avg_round_store_4x2_sse2(
576                             r, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
577                         im += 2 * 4;
578                         dst += 2 * dst_stride;
579                         dst8 += 2 * dst8_stride;
580                         y -= 2;
581                     } while (y);
582                 }
583             } else {
584                 do {
585                     xy_y_convolve_2tap_4x2_sse2(im, s_64, &coeffs_128, r);
586                     jnt_2d_no_avg_round_store_4x2_sse2(r, offset_no_avg_128, dst, dst_stride);
587                     im += 2 * 4;
588                     dst += 2 * dst_stride;
589                     y -= 2;
590                 } while (y);
591             }
592         }
593     } else if (w <= 16) {
594         const __m256i factor_256          = _mm256_set1_epi32(factor);
595         const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
596         const __m256i offset_avg_256      = _mm256_set1_epi32(offset_avg);
597         const __m256i offset_no_avg_256   = _mm256_set1_epi32(offset_no_avg);
598         __m256i       coeffs_256;
599 
600         prepare_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, &coeffs_256);
601 
602         if (w == 8) {
603             __m128i s_128[2];
604             __m256i r[2];
605 
606             s_128[0] = _mm_loadu_si128((__m128i *)im);
607 
608             if (conv_params->do_average) {
609                 if (conv_params->use_jnt_comp_avg) {
610                     do {
611                         xy_y_convolve_2tap_8x2_avx2(im, s_128, &coeffs_256, r);
612                         jnt_2d_comp_avg_round_store_8x2_avx2(
613                             r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
614                         im += 2 * 8;
615                         dst += 2 * dst_stride;
616                         dst8 += 2 * dst8_stride;
617                         y -= 2;
618                     } while (y);
619                 } else {
620                     do {
621                         xy_y_convolve_2tap_8x2_avx2(im, s_128, &coeffs_256, r);
622                         jnt_2d_avg_round_store_8x2_avx2(
623                             r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
624                         im += 2 * 8;
625                         dst += 2 * dst_stride;
626                         dst8 += 2 * dst8_stride;
627                         y -= 2;
628                     } while (y);
629                 }
630             } else {
631                 do {
632                     xy_y_convolve_2tap_8x2_avx2(im, s_128, &coeffs_256, r);
633                     jnt_2d_no_avg_round_store_8x2_avx2(r, offset_no_avg_256, dst, dst_stride);
634                     im += 2 * 8;
635                     dst += 2 * dst_stride;
636                     y -= 2;
637                 } while (y);
638             }
639         } else {
640             __m256i s_256[2], r[4];
641 
642             assert(w == 16);
643 
644             s_256[0] = _mm256_loadu_si256((__m256i *)im);
645 
646             if (conv_params->do_average) {
647                 if (conv_params->use_jnt_comp_avg) {
648                     do {
649                         xy_y_convolve_2tap_16x2_avx2(im, s_256, &coeffs_256, r);
650                         jnt_2d_comp_avg_round_store_16x2_avx2(
651                             r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
652                         im += 2 * 16;
653                         dst += 2 * dst_stride;
654                         dst8 += 2 * dst8_stride;
655                         y -= 2;
656                     } while (y);
657                 } else {
658                     do {
659                         xy_y_convolve_2tap_16x2_avx2(im, s_256, &coeffs_256, r);
660                         jnt_2d_avg_round_store_16x2_avx2(
661                             r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
662                         im += 2 * 16;
663                         dst += 2 * dst_stride;
664                         dst8 += 2 * dst8_stride;
665                         y -= 2;
666                     } while (y);
667                 }
668             } else {
669                 do {
670                     xy_y_convolve_2tap_16x2_avx2(im, s_256, &coeffs_256, r);
671                     jnt_2d_no_avg_round_store_16x2_avx2(r, offset_no_avg_256, dst, dst_stride);
672                     im += 2 * 16;
673                     dst += 2 * dst_stride;
674                     y -= 2;
675                 } while (y);
676             }
677         }
678     } else {
679         const __m512i factor_512          = _mm512_set1_epi32(factor);
680         const __m512i offset_comp_avg_512 = _mm512_set1_epi32(offset_comp_avg);
681         const __m512i offset_avg_512      = _mm512_set1_epi32(offset_avg);
682         const __m512i offset_no_avg_512   = _mm512_set1_epi32(offset_no_avg);
683         __m512i       coeffs_512;
684 
685         prepare_coeffs_2tap_avx512(filter_params_y, subpel_y_q4, &coeffs_512);
686 
687         if (w == 32) {
688             __m512i s_512[2], r[4];
689 
690             s_512[0] = loadu_s16_16x2_avx512(im, 32);
691 
692             if (conv_params->do_average) {
693                 if (conv_params->use_jnt_comp_avg) {
694                     do {
695                         xy_y_convolve_2tap_32x2_avx512(im, s_512, &coeffs_512, r);
696                         jnt_2d_comp_avg_round_store_32x2_avx512(r + 0,
697                                                                 r + 2,
698                                                                 factor_512,
699                                                                 offset_comp_avg_512,
700                                                                 dst,
701                                                                 dst_stride,
702                                                                 dst8,
703                                                                 dst8_stride);
704                         im += 2 * 32;
705                         dst += 2 * dst_stride;
706                         dst8 += 2 * dst8_stride;
707                         y -= 2;
708                     } while (y);
709                 } else {
710                     do {
711                         xy_y_convolve_2tap_32x2_avx512(im, s_512, &coeffs_512, r);
712                         jnt_2d_avg_round_store_32x2_avx512(
713                             r + 0, r + 2, offset_avg_512, dst, dst_stride, dst8, dst8_stride);
714                         im += 2 * 32;
715                         dst += 2 * dst_stride;
716                         dst8 += 2 * dst8_stride;
717                         y -= 2;
718                     } while (y);
719                 }
720             } else {
721                 do {
722                     xy_y_convolve_2tap_32x2_avx512(im, s_512, &coeffs_512, r);
723                     jnt_2d_no_avg_round_store_32x2_avx512(
724                         r + 0, r + 2, offset_no_avg_512, dst, dst_stride);
725                     im += 2 * 32;
726                     dst += 2 * dst_stride;
727                     y -= 2;
728                 } while (y);
729             }
730         } else if (w == 64) {
731             __m512i s_512[2][2], r[4];
732 
733             s_512[0][0] = zz_load_512(im + 0 * 32);
734             s_512[0][1] = zz_load_512(im + 1 * 32);
735 
736             if (conv_params->do_average) {
737                 if (conv_params->use_jnt_comp_avg) {
738                     do {
739                         xy_y_convolve_2tap_64_avx512(
740                             im + 2 * 32, s_512[0], s_512[1], &coeffs_512, r);
741                         jnt_2d_comp_avg_round_store_64_avx512(
742                             r + 0, r + 2, factor_512, offset_comp_avg_512, dst, dst8);
743 
744                         im += 2 * 64;
745 
746                         xy_y_convolve_2tap_64_avx512(
747                             im + 0 * 32, s_512[1], s_512[0], &coeffs_512, r);
748                         jnt_2d_comp_avg_round_store_64_avx512(r + 0,
749                                                               r + 2,
750                                                               factor_512,
751                                                               offset_comp_avg_512,
752                                                               dst + dst8_stride,
753                                                               dst8 + dst8_stride);
754 
755                         dst += 2 * dst_stride;
756                         dst8 += 2 * dst8_stride;
757                         y -= 2;
758                     } while (y);
759                 } else {
760                     do {
761                         xy_y_convolve_2tap_64_avx512(
762                             im + 1 * 64, s_512[0], s_512[1], &coeffs_512, r);
763                         jnt_2d_avg_round_store_64_avx512(r + 0, r + 2, offset_avg_512, dst, dst8);
764 
765                         im += 2 * 64;
766 
767                         xy_y_convolve_2tap_64_avx512(
768                             im + 0 * 64, s_512[1], s_512[0], &coeffs_512, r);
769                         jnt_2d_avg_round_store_64_avx512(
770                             r + 0, r + 2, offset_avg_512, dst + dst_stride, dst8 + dst8_stride);
771 
772                         dst += 2 * dst_stride;
773                         dst8 += 2 * dst8_stride;
774                         y -= 2;
775                     } while (y);
776                 }
777             } else {
778                 do {
779                     xy_y_convolve_2tap_64_avx512(im + 2 * 32, s_512[0], s_512[1], &coeffs_512, r);
780                     jnt_2d_no_avg_round_store_64_avx512(r + 0, r + 2, offset_no_avg_512, dst);
781 
782                     im += 2 * 64;
783 
784                     xy_y_convolve_2tap_64_avx512(im + 0 * 32, s_512[1], s_512[0], &coeffs_512, r);
785                     jnt_2d_no_avg_round_store_64_avx512(
786                         r + 0, r + 2, offset_no_avg_512, dst + dst_stride);
787 
788                     dst += 2 * dst_stride;
789                     y -= 2;
790                 } while (y);
791             }
792         } else {
793             __m512i s_512[2][4], r[4];
794 
795             assert(w == 128);
796 
797             load_16bit_4rows_avx512(im, 32, s_512[0]);
798 
799             if (conv_params->do_average) {
800                 if (conv_params->use_jnt_comp_avg) {
801                     do {
802                         xy_y_convolve_2tap_64_avx512(
803                             im + 2 * 64, s_512[0] + 0, s_512[1] + 0, &coeffs_512, r);
804                         jnt_2d_comp_avg_round_store_64_avx512(
805                             r + 0, r + 2, factor_512, offset_comp_avg_512, dst, dst8);
806 
807                         xy_y_convolve_2tap_64_avx512(
808                             im + 3 * 64, s_512[0] + 2, s_512[1] + 2, &coeffs_512, r);
809                         jnt_2d_comp_avg_round_store_64_avx512(r + 0,
810                                                               r + 2,
811                                                               factor_512,
812                                                               offset_comp_avg_512,
813                                                               dst + 1 * 64,
814                                                               dst8 + 1 * 64);
815 
816                         im += 2 * 128;
817 
818                         xy_y_convolve_2tap_64_avx512(
819                             im + 0 * 64, s_512[1] + 0, s_512[0] + 0, &coeffs_512, r);
820                         jnt_2d_comp_avg_round_store_64_avx512(r + 0,
821                                                               r + 2,
822                                                               factor_512,
823                                                               offset_comp_avg_512,
824                                                               dst + dst8_stride + 0 * 64,
825                                                               dst8 + dst8_stride + 0 * 64);
826 
827                         xy_y_convolve_2tap_64_avx512(
828                             im + 1 * 64, s_512[1] + 2, s_512[0] + 2, &coeffs_512, r);
829                         jnt_2d_comp_avg_round_store_64_avx512(r + 0,
830                                                               r + 2,
831                                                               factor_512,
832                                                               offset_comp_avg_512,
833                                                               dst + dst8_stride + 1 * 64,
834                                                               dst8 + dst8_stride + 1 * 64);
835 
836                         dst += 2 * dst_stride;
837                         dst8 += 2 * dst8_stride;
838                         y -= 2;
839                     } while (y);
840                 } else {
841                     do {
842                         xy_y_convolve_2tap_64_avx512(
843                             im + 2 * 64, s_512[0] + 0, s_512[1] + 0, &coeffs_512, r);
844                         jnt_2d_avg_round_store_64_avx512(
845                             r + 0, r + 2, offset_avg_512, dst + 0 * 64, dst8 + 0 * 64);
846 
847                         xy_y_convolve_2tap_64_avx512(
848                             im + 3 * 64, s_512[0] + 2, s_512[1] + 2, &coeffs_512, r);
849                         jnt_2d_avg_round_store_64_avx512(
850                             r + 0, r + 2, offset_avg_512, dst + 1 * 64, dst8 + 1 * 64);
851 
852                         im += 2 * 128;
853 
854                         xy_y_convolve_2tap_64_avx512(
855                             im + 0 * 64, s_512[1] + 0, s_512[0] + 0, &coeffs_512, r);
856                         jnt_2d_avg_round_store_64_avx512(r + 0,
857                                                          r + 2,
858                                                          offset_avg_512,
859                                                          dst + dst_stride + 0 * 64,
860                                                          dst8 + dst8_stride + 0 * 64);
861 
862                         xy_y_convolve_2tap_64_avx512(
863                             im + 1 * 64, s_512[1] + 2, s_512[0] + 2, &coeffs_512, r);
864                         jnt_2d_avg_round_store_64_avx512(r + 0,
865                                                          r + 2,
866                                                          offset_avg_512,
867                                                          dst + dst_stride + 1 * 64,
868                                                          dst8 + dst8_stride + 1 * 64);
869 
870                         dst += 2 * dst_stride;
871                         dst8 += 2 * dst8_stride;
872                         y -= 2;
873                     } while (y);
874                 }
875             } else {
876                 do {
877                     xy_y_convolve_2tap_64_avx512(
878                         im + 2 * 64, s_512[0] + 0, s_512[1] + 0, &coeffs_512, r);
879                     jnt_2d_no_avg_round_store_64_avx512(
880                         r + 0, r + 2, offset_no_avg_512, dst + 0 * 64);
881 
882                     xy_y_convolve_2tap_64_avx512(
883                         im + 3 * 64, s_512[0] + 2, s_512[1] + 2, &coeffs_512, r);
884                     jnt_2d_no_avg_round_store_64_avx512(
885                         r + 0, r + 2, offset_no_avg_512, dst + 1 * 64);
886 
887                     im += 2 * 128;
888 
889                     xy_y_convolve_2tap_64_avx512(
890                         im + 0 * 64, s_512[1] + 0, s_512[0] + 0, &coeffs_512, r);
891                     jnt_2d_no_avg_round_store_64_avx512(
892                         r + 0, r + 2, offset_no_avg_512, dst + dst_stride + 0 * 64);
893 
894                     xy_y_convolve_2tap_64_avx512(
895                         im + 1 * 64, s_512[1] + 2, s_512[0] + 2, &coeffs_512, r);
896                     jnt_2d_no_avg_round_store_64_avx512(
897                         r + 0, r + 2, offset_no_avg_512, dst + dst_stride + 1 * 64);
898 
899                     dst += 2 * dst_stride;
900                     y -= 2;
901                 } while (y);
902             }
903         }
904     }
905 }
906 
jnt_convolve_2d_ver_2tap_half_avx512(const int16_t * const im_block,const int32_t w,const int32_t h,const InterpFilterParams * const filter_params_y,const int32_t subpel_y_q4,const ConvolveParams * const conv_params,uint8_t * dst8,const int32_t dst8_stride)907 static void jnt_convolve_2d_ver_2tap_half_avx512(const int16_t *const im_block, const int32_t w,
908                                                  const int32_t                   h,
909                                                  const InterpFilterParams *const filter_params_y,
910                                                  const int32_t                   subpel_y_q4,
911                                                  const ConvolveParams *const     conv_params,
912                                                  uint8_t *dst8, const int32_t dst8_stride) {
913     const int32_t  dst_stride     = conv_params->dst_stride;
914     const int32_t  bd             = 8;
915     const int32_t  round_0        = 3;
916     const int16_t *im             = im_block;
917     const int32_t  round_1        = COMPOUND_ROUND1_BITS;
918     const int32_t  offset_bits    = bd + 2 * FILTER_BITS - round_0; // 19
919     const int32_t  round_bits     = 2 * FILTER_BITS - round_0 - round_1; // 4
920     const int32_t  round_offset   = 1 << (offset_bits - round_1);
921     const int32_t  factor         = conv_params->fwd_offset | (conv_params->bck_offset << 16);
922     const int32_t offset_comp_avg = (round_offset + (round_offset >> 1)) * conv_params->bck_offset -
923         (round_offset << DIST_PRECISION_BITS) - (round_offset << (DIST_PRECISION_BITS - 1)) +
924         (1 << (round_bits + DIST_PRECISION_BITS - 1));
925     const int32_t offset_avg = (1 << (round_1 - COMPOUND_ROUND1_BITS)) +
926         (1 << (round_bits + round_1 - COMPOUND_ROUND1_BITS + 1)) -
927         (1 << (offset_bits - COMPOUND_ROUND1_BITS + 1)) -
928         (1 << (offset_bits - COMPOUND_ROUND1_BITS));
929     const int32_t offset_no_avg = (1 << (round_1 - COMPOUND_ROUND1_BITS)) +
930         (1 << (offset_bits - COMPOUND_ROUND1_BITS + 1)) +
931         (1 << (offset_bits - COMPOUND_ROUND1_BITS));
932     ConvBufType *dst = conv_params->dst;
933     int32_t      y   = h;
934 
935     (void)filter_params_y;
936     (void)subpel_y_q4;
937 
938     if (w <= 4) {
939         const __m128i factor_128          = _mm_set1_epi32(factor);
940         const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
941         const __m128i offset_avg_128      = _mm_set1_epi16(offset_avg);
942         const __m128i offset_no_avg_128   = _mm_set1_epi16(offset_no_avg);
943 
944         if (w == 2) {
945             __m128i s_32[2];
946 
947             s_32[0] = _mm_cvtsi32_si128(*(int32_t *)im);
948 
949             if (conv_params->do_average) {
950                 if (conv_params->use_jnt_comp_avg) {
951                     do {
952                         const __m128i res = xy_y_convolve_2tap_2x2_half_pel_sse2(im, s_32);
953                         jnt_2d_comp_avg_round_store_half_pel_2x2_sse2(res,
954                                                                       factor_128,
955                                                                       offset_comp_avg_128,
956                                                                       dst,
957                                                                       dst_stride,
958                                                                       dst8,
959                                                                       dst8_stride);
960                         im += 2 * 2;
961                         dst += 2 * dst_stride;
962                         dst8 += 2 * dst8_stride;
963                         y -= 2;
964                     } while (y);
965                 } else {
966                     do {
967                         const __m128i res = xy_y_convolve_2tap_2x2_half_pel_sse2(im, s_32);
968                         jnt_2d_avg_round_store_half_pel_2x2_sse2(
969                             res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
970                         im += 2 * 2;
971                         dst += 2 * dst_stride;
972                         dst8 += 2 * dst8_stride;
973                         y -= 2;
974                     } while (y);
975                 }
976             } else {
977                 do {
978                     const __m128i res = xy_y_convolve_2tap_2x2_half_pel_sse2(im, s_32);
979                     jnt_2d_no_avg_round_store_half_pel_2x2_sse2(
980                         res, offset_no_avg_128, dst, dst_stride);
981                     im += 2 * 2;
982                     dst += 2 * dst_stride;
983                     y -= 2;
984                 } while (y);
985             }
986         } else {
987             __m128i s_64[2];
988 
989             assert(w == 4);
990 
991             s_64[0] = _mm_loadl_epi64((__m128i *)im);
992 
993             if (conv_params->do_average) {
994                 if (conv_params->use_jnt_comp_avg) {
995                     do {
996                         const __m128i res = xy_y_convolve_2tap_4x2_half_pel_sse2(im, s_64);
997                         jnt_2d_comp_avg_round_store_half_pel_4x2_sse2(res,
998                                                                       factor_128,
999                                                                       offset_comp_avg_128,
1000                                                                       dst,
1001                                                                       dst_stride,
1002                                                                       dst8,
1003                                                                       dst8_stride);
1004                         im += 2 * 4;
1005                         dst += 2 * dst_stride;
1006                         dst8 += 2 * dst8_stride;
1007                         y -= 2;
1008                     } while (y);
1009                 } else {
1010                     do {
1011                         const __m128i res = xy_y_convolve_2tap_4x2_half_pel_sse2(im, s_64);
1012                         jnt_2d_avg_round_store_half_pel_4x2_sse2(
1013                             res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
1014                         im += 2 * 4;
1015                         dst += 2 * dst_stride;
1016                         dst8 += 2 * dst8_stride;
1017                         y -= 2;
1018                     } while (y);
1019                 }
1020             } else {
1021                 do {
1022                     const __m128i res = xy_y_convolve_2tap_4x2_half_pel_sse2(im, s_64);
1023                     jnt_2d_no_avg_round_store_half_pel_4x2_sse2(
1024                         res, offset_no_avg_128, dst, dst_stride);
1025                     im += 2 * 4;
1026                     dst += 2 * dst_stride;
1027                     y -= 2;
1028                 } while (y);
1029             }
1030         }
1031     } else if (w <= 16) {
1032         const __m256i factor_256          = _mm256_set1_epi32(factor);
1033         const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
1034         const __m256i offset_avg_256      = _mm256_set1_epi16(offset_avg);
1035         const __m256i offset_no_avg_256   = _mm256_set1_epi16(offset_no_avg);
1036 
1037         if (w == 8) {
1038             __m128i s_128[2];
1039 
1040             s_128[0] = _mm_loadu_si128((__m128i *)im);
1041 
1042             if (conv_params->do_average) {
1043                 if (conv_params->use_jnt_comp_avg) {
1044                     do {
1045                         const __m256i res = xy_y_convolve_2tap_8x2_half_pel_avx2(im, s_128);
1046                         jnt_2d_comp_avg_round_store_half_pel_8x2_avx2(res,
1047                                                                       factor_256,
1048                                                                       offset_comp_avg_256,
1049                                                                       dst,
1050                                                                       dst_stride,
1051                                                                       dst8,
1052                                                                       dst8_stride);
1053                         im += 2 * 8;
1054                         dst += 2 * dst_stride;
1055                         dst8 += 2 * dst8_stride;
1056                         y -= 2;
1057                     } while (y);
1058                 } else {
1059                     do {
1060                         const __m256i res = xy_y_convolve_2tap_8x2_half_pel_avx2(im, s_128);
1061                         jnt_2d_avg_round_store_half_pel_8x2_avx2(
1062                             res, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
1063                         im += 2 * 8;
1064                         dst += 2 * dst_stride;
1065                         dst8 += 2 * dst8_stride;
1066                         y -= 2;
1067                     } while (y);
1068                 }
1069             } else {
1070                 do {
1071                     const __m256i res = xy_y_convolve_2tap_8x2_half_pel_avx2(im, s_128);
1072                     jnt_2d_no_avg_round_store_half_pel_8x2_avx2(
1073                         res, offset_no_avg_256, dst, dst_stride);
1074                     im += 2 * 8;
1075                     dst += 2 * dst_stride;
1076                     y -= 2;
1077                 } while (y);
1078             }
1079         } else {
1080             __m256i s_256[2], r[2];
1081 
1082             assert(w == 16);
1083 
1084             s_256[0] = _mm256_loadu_si256((__m256i *)im);
1085 
1086             if (conv_params->do_average) {
1087                 if (conv_params->use_jnt_comp_avg) {
1088                     do {
1089                         xy_y_convolve_2tap_16x2_half_pel_avx2(im, s_256, r);
1090                         jnt_2d_comp_avg_round_store_half_pel_16x2_avx2(
1091                             r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
1092                         im += 2 * 16;
1093                         dst += 2 * dst_stride;
1094                         dst8 += 2 * dst8_stride;
1095                         y -= 2;
1096                     } while (y);
1097                 } else {
1098                     do {
1099                         xy_y_convolve_2tap_16x2_half_pel_avx2(im, s_256, r);
1100                         jnt_2d_avg_round_store_half_pel_16x2_avx2(
1101                             r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
1102                         im += 2 * 16;
1103                         dst += 2 * dst_stride;
1104                         dst8 += 2 * dst8_stride;
1105                         y -= 2;
1106                     } while (y);
1107                 }
1108             } else {
1109                 do {
1110                     xy_y_convolve_2tap_16x2_half_pel_avx2(im, s_256, r);
1111                     jnt_2d_no_avg_round_store_half_pel_16x2_avx2(
1112                         r, offset_no_avg_256, dst, dst_stride);
1113                     im += 2 * 16;
1114                     dst += 2 * dst_stride;
1115                     y -= 2;
1116                 } while (y);
1117             }
1118         }
1119     } else {
1120         const __m512i factor_512          = _mm512_set1_epi32(factor);
1121         const __m512i offset_comp_avg_512 = _mm512_set1_epi32(offset_comp_avg);
1122         const __m512i offset_avg_512      = _mm512_set1_epi16(offset_avg);
1123         const __m512i offset_no_avg_512   = _mm512_set1_epi16(offset_no_avg);
1124 
1125         if (w == 32) {
1126             __m512i s_512[2], r[2];
1127 
1128             s_512[0] = loadu_s16_16x2_avx512(im, 32);
1129 
1130             if (conv_params->do_average) {
1131                 if (conv_params->use_jnt_comp_avg) {
1132                     do {
1133                         xy_y_convolve_2tap_half_pel_32x2_avx512(im + 16, s_512, r);
1134                         jnt_2d_comp_avg_round_store_half_pel_32x2_avx512(
1135                             r, factor_512, offset_comp_avg_512, dst, dst_stride, dst8, dst8_stride);
1136                         im += 2 * 32;
1137                         dst += 2 * dst_stride;
1138                         dst8 += 2 * dst8_stride;
1139                         y -= 2;
1140                     } while (y);
1141                 } else {
1142                     do {
1143                         xy_y_convolve_2tap_half_pel_32x2_avx512(im + 16, s_512, r);
1144                         jnt_2d_avg_round_store_half_pel_32x2_avx512(
1145                             r, offset_avg_512, dst, dst_stride, dst8, dst8_stride);
1146                         im += 2 * 32;
1147                         dst += 2 * dst_stride;
1148                         dst8 += 2 * dst8_stride;
1149                         y -= 2;
1150                     } while (y);
1151                 }
1152             } else {
1153                 do {
1154                     xy_y_convolve_2tap_half_pel_32x2_avx512(im + 16, s_512, r);
1155                     jnt_2d_no_avg_round_store_half_pel_32x2_avx512(
1156                         r, offset_no_avg_512, dst, dst_stride);
1157                     im += 2 * 32;
1158                     dst += 2 * dst_stride;
1159                     y -= 2;
1160                 } while (y);
1161             }
1162         } else if (w == 64) {
1163             __m512i s_512[2][2], r[2];
1164 
1165             s_512[0][0] = zz_load_512(im + 0 * 32);
1166             s_512[0][1] = zz_load_512(im + 1 * 32);
1167 
1168             if (conv_params->do_average) {
1169                 if (conv_params->use_jnt_comp_avg) {
1170                     do {
1171                         xy_y_convolve_2tap_half_pel_64_avx512(
1172                             im + 2 * 32, s_512[0] + 0, s_512[1] + 0, r);
1173                         jnt_2d_comp_avg_round_store_half_pel_64_avx512(
1174                             r, factor_512, offset_comp_avg_512, dst, dst8);
1175 
1176                         im += 2 * 64;
1177 
1178                         xy_y_convolve_2tap_half_pel_64_avx512(
1179                             im + 0 * 32, s_512[1] + 0, s_512[0] + 0, r);
1180                         jnt_2d_comp_avg_round_store_half_pel_64_avx512(r,
1181                                                                        factor_512,
1182                                                                        offset_comp_avg_512,
1183                                                                        dst + dst_stride,
1184                                                                        dst8 + dst8_stride);
1185 
1186                         dst += 2 * dst_stride;
1187                         dst8 += 2 * dst8_stride;
1188                         y -= 2;
1189                     } while (y);
1190                 } else {
1191                     do {
1192                         xy_y_convolve_2tap_half_pel_64_avx512(im + 1 * 64, s_512[0], s_512[1], r);
1193                         jnt_2d_avg_round_store_half_pel_64_avx512(r, offset_avg_512, dst, dst8);
1194 
1195                         im += 2 * 64;
1196 
1197                         xy_y_convolve_2tap_half_pel_64_avx512(im + 0 * 64, s_512[1], s_512[0], r);
1198                         jnt_2d_avg_round_store_half_pel_64_avx512(
1199                             r, offset_avg_512, dst + dst_stride, dst8 + dst8_stride);
1200 
1201                         dst += 2 * dst_stride;
1202                         dst8 += 2 * dst8_stride;
1203                         y -= 2;
1204                     } while (y);
1205                 }
1206             } else {
1207                 do {
1208                     xy_y_convolve_2tap_half_pel_64_avx512(im + 2 * 32, s_512[0], s_512[1], r);
1209                     jnt_2d_no_avg_round_store_half_pel_64_avx512(r, offset_no_avg_512, dst);
1210 
1211                     im += 2 * 64;
1212 
1213                     xy_y_convolve_2tap_half_pel_64_avx512(im + 0 * 32, s_512[1], s_512[0], r);
1214                     jnt_2d_no_avg_round_store_half_pel_64_avx512(
1215                         r, offset_no_avg_512, dst + dst_stride);
1216 
1217                     dst += 2 * dst_stride;
1218                     y -= 2;
1219                 } while (y);
1220             }
1221         } else {
1222             __m512i s_512[2][4], r[2];
1223 
1224             assert(w == 128);
1225 
1226             load_16bit_4rows_avx512(im, 32, s_512[0]);
1227 
1228             if (conv_params->do_average) {
1229                 if (conv_params->use_jnt_comp_avg) {
1230                     do {
1231                         xy_y_convolve_2tap_half_pel_64_avx512(
1232                             im + 4 * 32, s_512[0] + 0, s_512[1] + 0, r);
1233                         jnt_2d_comp_avg_round_store_half_pel_64_avx512(
1234                             r, factor_512, offset_comp_avg_512, dst + 0 * 32, dst8 + 0 * 32);
1235 
1236                         xy_y_convolve_2tap_half_pel_64_avx512(
1237                             im + 6 * 32, s_512[0] + 2, s_512[1] + 2, r);
1238                         jnt_2d_comp_avg_round_store_half_pel_64_avx512(
1239                             r, factor_512, offset_comp_avg_512, dst + 2 * 32, dst8 + 2 * 32);
1240 
1241                         im += 2 * 128;
1242 
1243                         xy_y_convolve_2tap_half_pel_64_avx512(
1244                             im + 0 * 32, s_512[1] + 0, s_512[0] + 0, r);
1245                         jnt_2d_comp_avg_round_store_half_pel_64_avx512(r,
1246                                                                        factor_512,
1247                                                                        offset_comp_avg_512,
1248                                                                        dst + dst_stride + 0 * 32,
1249                                                                        dst8 + dst8_stride + 0 * 32);
1250 
1251                         xy_y_convolve_2tap_half_pel_64_avx512(
1252                             im + 2 * 32, s_512[1] + 2, s_512[0] + 2, r);
1253                         jnt_2d_comp_avg_round_store_half_pel_64_avx512(r,
1254                                                                        factor_512,
1255                                                                        offset_comp_avg_512,
1256                                                                        dst + dst_stride + 2 * 32,
1257                                                                        dst8 + dst8_stride + 2 * 32);
1258 
1259                         dst += 2 * dst_stride;
1260                         dst8 += 2 * dst8_stride;
1261                         y -= 2;
1262                     } while (y);
1263                 } else {
1264                     do {
1265                         xy_y_convolve_2tap_half_pel_64_avx512(
1266                             im + 4 * 32, s_512[0] + 0, s_512[1] + 0, r);
1267                         jnt_2d_avg_round_store_half_pel_64_avx512(
1268                             r, offset_avg_512, dst + 0 * 32, dst8 + 0 * 32);
1269 
1270                         xy_y_convolve_2tap_half_pel_64_avx512(
1271                             im + 6 * 32, s_512[0] + 2, s_512[1] + 2, r);
1272                         jnt_2d_avg_round_store_half_pel_64_avx512(
1273                             r, offset_avg_512, dst + 2 * 32, dst8 + 2 * 32);
1274 
1275                         im += 2 * 128;
1276 
1277                         xy_y_convolve_2tap_half_pel_64_avx512(
1278                             im + 0 * 32, s_512[1] + 0, s_512[0] + 0, r);
1279                         jnt_2d_avg_round_store_half_pel_64_avx512(r,
1280                                                                   offset_avg_512,
1281                                                                   dst + dst_stride + 0 * 32,
1282                                                                   dst8 + dst8_stride + 0 * 32);
1283 
1284                         xy_y_convolve_2tap_half_pel_64_avx512(
1285                             im + 2 * 32, s_512[1] + 2, s_512[0] + 2, r);
1286                         jnt_2d_avg_round_store_half_pel_64_avx512(r,
1287                                                                   offset_avg_512,
1288                                                                   dst + dst_stride + 2 * 32,
1289                                                                   dst8 + dst8_stride + 2 * 32);
1290 
1291                         dst += 2 * dst_stride;
1292                         dst8 += 2 * dst8_stride;
1293                         y -= 2;
1294                     } while (y);
1295                 }
1296             } else {
1297                 do {
1298                     xy_y_convolve_2tap_half_pel_64_avx512(
1299                         im + 4 * 32, s_512[0] + 0, s_512[1] + 0, r);
1300                     jnt_2d_no_avg_round_store_half_pel_64_avx512(
1301                         r, offset_no_avg_512, dst + 0 * 32);
1302 
1303                     xy_y_convolve_2tap_half_pel_64_avx512(
1304                         im + 6 * 32, s_512[0] + 2, s_512[1] + 2, r);
1305                     jnt_2d_no_avg_round_store_half_pel_64_avx512(
1306                         r, offset_no_avg_512, dst + 2 * 32);
1307 
1308                     im += 2 * 128;
1309 
1310                     xy_y_convolve_2tap_half_pel_64_avx512(
1311                         im + 0 * 32, s_512[1] + 0, s_512[0] + 0, r);
1312                     jnt_2d_no_avg_round_store_half_pel_64_avx512(
1313                         r, offset_no_avg_512, dst + dst_stride + 0 * 32);
1314 
1315                     xy_y_convolve_2tap_half_pel_64_avx512(
1316                         im + 2 * 32, s_512[1] + 2, s_512[0] + 2, r);
1317                     jnt_2d_no_avg_round_store_half_pel_64_avx512(
1318                         r, offset_no_avg_512, dst + dst_stride + 2 * 32);
1319 
1320                     dst += 2 * dst_stride;
1321                     y -= 2;
1322                 } while (y);
1323             }
1324         }
1325     }
1326 }
1327 
jnt_convolve_2d_ver_6tap_avx512(const int16_t * const im_block,const int32_t w,const int32_t h,const InterpFilterParams * const filter_params_y,const int32_t subpel_y_q4,const ConvolveParams * const conv_params,uint8_t * dst8,const int32_t dst8_stride)1328 static void jnt_convolve_2d_ver_6tap_avx512(const int16_t *const im_block, const int32_t w,
1329                                             const int32_t                   h,
1330                                             const InterpFilterParams *const filter_params_y,
1331                                             const int32_t                   subpel_y_q4,
1332                                             const ConvolveParams *const conv_params, uint8_t *dst8,
1333                                             const int32_t dst8_stride) {
1334     const int32_t  dst_stride     = conv_params->dst_stride;
1335     const int32_t  bd             = 8;
1336     const int32_t  round_0        = 3;
1337     const int16_t *im             = im_block;
1338     const int32_t  round_1        = COMPOUND_ROUND1_BITS;
1339     const int32_t  offset_bits    = bd + 2 * FILTER_BITS - round_0; // 19
1340     const int32_t  round_bits     = 2 * FILTER_BITS - round_0 - round_1; // 4
1341     const int32_t  round_offset   = 1 << (offset_bits - round_1);
1342     const int32_t  factor         = conv_params->fwd_offset | (conv_params->bck_offset << 16);
1343     const int32_t offset_comp_avg = (round_offset + (round_offset >> 1)) * conv_params->bck_offset -
1344         (round_offset << DIST_PRECISION_BITS) - (round_offset << (DIST_PRECISION_BITS - 1)) +
1345         (1 << (round_bits + DIST_PRECISION_BITS - 1));
1346     const int32_t offset_avg = (1 << (round_1 - 1)) + (1 << (round_bits + round_1)) -
1347         (1 << offset_bits) - (1 << (offset_bits - 1));
1348     const int32_t offset_no_avg = (1 << (round_1 - 1)) + (1 << offset_bits) +
1349         (1 << (offset_bits - 1));
1350     int32_t      y   = h;
1351     ConvBufType *dst = conv_params->dst;
1352 
1353     if (w == 2) {
1354         const __m128i factor_128          = _mm_set1_epi32(factor);
1355         const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
1356         const __m128i offset_avg_128      = _mm_set1_epi32(offset_avg);
1357         const __m128i offset_no_avg_128   = _mm_set1_epi32(offset_no_avg);
1358         __m128i       coeffs_128[3], s_32[6], ss_128[3];
1359 
1360         prepare_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
1361 
1362         s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
1363         s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
1364         s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
1365         s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(im + 3 * 2));
1366         s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(im + 4 * 2));
1367 
1368         const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1369         const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
1370         const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1371         const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
1372 
1373         ss_128[0] = _mm_unpacklo_epi16(src01, src12);
1374         ss_128[1] = _mm_unpacklo_epi16(src23, src34);
1375 
1376         y = h;
1377 
1378         if (conv_params->do_average) {
1379             if (conv_params->use_jnt_comp_avg) {
1380                 do {
1381                     const __m128i res = xy_y_convolve_6tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
1382                     jnt_2d_comp_avg_round_store_2x2_sse2(
1383                         res, factor_128, offset_comp_avg_128, dst, dst_stride, dst8, dst8_stride);
1384                     im += 2 * 2;
1385                     dst += 2 * dst_stride;
1386                     dst8 += 2 * dst8_stride;
1387                     y -= 2;
1388                 } while (y);
1389             } else {
1390                 do {
1391                     const __m128i res = xy_y_convolve_6tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
1392                     jnt_2d_avg_round_store_2x2_sse2(
1393                         res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
1394                     im += 2 * 2;
1395                     dst += 2 * dst_stride;
1396                     dst8 += 2 * dst8_stride;
1397                     y -= 2;
1398                 } while (y);
1399             }
1400         } else {
1401             do {
1402                 const __m128i res = xy_y_convolve_6tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
1403                 jnt_2d_no_avg_round_store_2x2_sse2(res, offset_no_avg_128, dst, dst_stride);
1404                 im += 2 * 2;
1405                 dst += 2 * dst_stride;
1406                 y -= 2;
1407             } while (y);
1408         }
1409     } else if (w <= 16) {
1410         const __m256i factor_256          = _mm256_set1_epi32(factor);
1411         const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
1412         const __m256i offset_avg_256      = _mm256_set1_epi32(offset_avg);
1413         const __m256i offset_no_avg_256   = _mm256_set1_epi32(offset_no_avg);
1414         __m256i       coeffs_256[3];
1415 
1416         prepare_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
1417 
1418         if (w == 4) {
1419             __m128i s_64[6];
1420             __m256i s_256[6], ss_256[3];
1421 
1422             s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
1423             s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
1424             s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
1425             s_64[3] = _mm_loadl_epi64((__m128i *)(im + 3 * 4));
1426             s_64[4] = _mm_loadl_epi64((__m128i *)(im + 4 * 4));
1427 
1428             // Load lines a and b. Line a to lower 128, line b to upper 128
1429             s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
1430             s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
1431             s_256[2] = _mm256_setr_m128i(s_64[2], s_64[3]);
1432             s_256[3] = _mm256_setr_m128i(s_64[3], s_64[4]);
1433 
1434             ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1435             ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1436 
1437             y = h;
1438 
1439             if (conv_params->do_average) {
1440                 if (conv_params->use_jnt_comp_avg) {
1441                     do {
1442                         const __m256i res = xy_y_convolve_6tap_4x2_avx2(
1443                             im, s_64, ss_256, coeffs_256);
1444                         jnt_2d_comp_avg_round_store_4x2_avx2(res,
1445                                                              factor_256,
1446                                                              offset_comp_avg_256,
1447                                                              dst,
1448                                                              dst_stride,
1449                                                              dst8,
1450                                                              dst8_stride);
1451                         im += 2 * 4;
1452                         dst += 2 * dst_stride;
1453                         dst8 += 2 * dst8_stride;
1454                         y -= 2;
1455                     } while (y);
1456                 } else {
1457                     do {
1458                         const __m256i res = xy_y_convolve_6tap_4x2_avx2(
1459                             im, s_64, ss_256, coeffs_256);
1460                         jnt_2d_avg_round_store_4x2_avx2(
1461                             res, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
1462                         im += 2 * 4;
1463                         dst += 2 * dst_stride;
1464                         dst8 += 2 * dst8_stride;
1465                         y -= 2;
1466                     } while (y);
1467                 }
1468             } else {
1469                 do {
1470                     const __m256i res = xy_y_convolve_6tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
1471                     jnt_2d_no_avg_round_store_4x2_avx2(res, offset_no_avg_256, dst, dst_stride);
1472                     im += 2 * 4;
1473                     dst += 2 * dst_stride;
1474                     y -= 2;
1475                 } while (y);
1476             }
1477         } else if (w == 8) {
1478             __m256i s_256[6], r[2];
1479 
1480             s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
1481             s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
1482             s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 8));
1483             s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 8));
1484             y        = h;
1485 
1486             __m256i ss_256[6];
1487 
1488             ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1489             ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1490 
1491             ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1492             ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
1493 
1494             if (conv_params->do_average) {
1495                 if (conv_params->use_jnt_comp_avg) {
1496                     do {
1497                         xy_y_convolve_6tap_8x2_avx2(im, ss_256, coeffs_256, r);
1498                         jnt_2d_comp_avg_round_store_8x2_avx2(
1499                             r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
1500                         im += 2 * 8;
1501                         dst += 2 * dst_stride;
1502                         dst8 += 2 * dst8_stride;
1503                         y -= 2;
1504                     } while (y);
1505                 } else {
1506                     do {
1507                         xy_y_convolve_6tap_8x2_avx2(im, ss_256, coeffs_256, r);
1508                         jnt_2d_avg_round_store_8x2_avx2(
1509                             r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
1510                         im += 2 * 8;
1511                         dst += 2 * dst_stride;
1512                         dst8 += 2 * dst8_stride;
1513                         y -= 2;
1514                     } while (y);
1515                 }
1516             } else {
1517                 do {
1518                     xy_y_convolve_6tap_8x2_avx2(im, ss_256, coeffs_256, r);
1519                     jnt_2d_no_avg_round_store_8x2_avx2(r, offset_no_avg_256, dst, dst_stride);
1520                     im += 2 * 8;
1521                     dst += 2 * dst_stride;
1522                     y -= 2;
1523                 } while (y);
1524             }
1525         } else {
1526             __m256i s_256[6], ss_256[6], tt_256[6], r[4];
1527 
1528             assert(w == 16);
1529 
1530             loadu_unpack_16bit_5rows_avx2(im, 16, s_256, ss_256, tt_256);
1531             y = h;
1532 
1533             if (conv_params->do_average) {
1534                 if (conv_params->use_jnt_comp_avg) {
1535                     do {
1536                         xy_y_convolve_6tap_16x2_avx2(im, 16, s_256, ss_256, tt_256, coeffs_256, r);
1537                         jnt_2d_comp_avg_round_store_16x2_avx2(
1538                             r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
1539                         im += 2 * 16;
1540                         dst += 2 * dst_stride;
1541                         dst8 += 2 * dst8_stride;
1542                         y -= 2;
1543                     } while (y);
1544                 } else {
1545                     do {
1546                         xy_y_convolve_6tap_16x2_avx2(im, 16, s_256, ss_256, tt_256, coeffs_256, r);
1547                         jnt_2d_avg_round_store_16x2_avx2(
1548                             r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
1549                         im += 2 * 16;
1550                         dst += 2 * dst_stride;
1551                         dst8 += 2 * dst8_stride;
1552                         y -= 2;
1553                     } while (y);
1554                 }
1555             } else {
1556                 do {
1557                     xy_y_convolve_6tap_16x2_avx2(im, 16, s_256, ss_256, tt_256, coeffs_256, r);
1558                     jnt_2d_no_avg_round_store_16x2_avx2(r, offset_no_avg_256, dst, dst_stride);
1559                     im += 2 * 16;
1560                     dst += 2 * dst_stride;
1561                     y -= 2;
1562                 } while (y);
1563             }
1564         }
1565     } else {
1566         const __m512i factor_512          = _mm512_set1_epi32(factor);
1567         const __m512i offset_comp_avg_512 = _mm512_set1_epi32(offset_comp_avg);
1568         const __m512i offset_avg_512      = _mm512_set1_epi32(offset_avg);
1569         const __m512i offset_no_avg_512   = _mm512_set1_epi32(offset_no_avg);
1570         __m512i       coeffs_512[3];
1571 
1572         prepare_coeffs_6tap_avx512(filter_params_y, subpel_y_q4, coeffs_512);
1573 
1574         if (w == 32) {
1575             __m512i s_512[6], ss_512[6], tt_512[6], r[4];
1576 
1577             loadu_unpack_16bit_32x5_avx512(im, s_512, ss_512, tt_512);
1578 
1579             y = h;
1580 
1581             if (conv_params->do_average) {
1582                 if (conv_params->use_jnt_comp_avg) {
1583                     do {
1584                         xy_y_convolve_6tap_width32x2_avx512(
1585                             im, coeffs_512, s_512, ss_512, tt_512, r);
1586                         jnt_2d_comp_avg_round_store_32x2_avx512(r + 0,
1587                                                                 r + 2,
1588                                                                 factor_512,
1589                                                                 offset_comp_avg_512,
1590                                                                 dst,
1591                                                                 dst_stride,
1592                                                                 dst8,
1593                                                                 dst8_stride);
1594                         im += 2 * 32;
1595                         dst += 2 * dst_stride;
1596                         dst8 += 2 * dst8_stride;
1597                         y -= 2;
1598                     } while (y);
1599                 } else {
1600                     do {
1601                         xy_y_convolve_6tap_width32x2_avx512(
1602                             im, coeffs_512, s_512, ss_512, tt_512, r);
1603                         jnt_2d_avg_round_store_32x2_avx512(
1604                             r + 0, r + 2, offset_avg_512, dst, dst_stride, dst8, dst8_stride);
1605                         im += 2 * 32;
1606                         dst += 2 * dst_stride;
1607                         dst8 += 2 * dst8_stride;
1608                         y -= 2;
1609                     } while (y);
1610                 }
1611             } else {
1612                 do {
1613                     xy_y_convolve_6tap_width32x2_avx512(im, coeffs_512, s_512, ss_512, tt_512, r);
1614                     jnt_2d_no_avg_round_store_32x2_avx512(
1615                         r + 0, r + 2, offset_no_avg_512, dst, dst_stride);
1616                     im += 2 * 32;
1617                     dst += 2 * dst_stride;
1618                     y -= 2;
1619                 } while (y);
1620             }
1621         } else {
1622             __m512i s_512[2][6], ss_512[2][6], tt_512[2][6], r0[4], r1[4];
1623 
1624             assert(!(w % 64));
1625 
1626             int32_t x = 0;
1627             do {
1628                 const int16_t *s  = im + x;
1629                 ConvBufType *  d  = dst + x;
1630                 uint8_t *      d8 = dst8 + x;
1631 
1632                 loadu_unpack_16bit_5rows_avx512(s, w, s_512[0], ss_512[0], tt_512[0]);
1633                 loadu_unpack_16bit_5rows_avx512(s + 32, w, s_512[1], ss_512[1], tt_512[1]);
1634 
1635                 y = h;
1636 
1637                 if (conv_params->do_average) {
1638                     if (conv_params->use_jnt_comp_avg) {
1639                         do {
1640                             xy_y_convolve_6tap_32x2_avx512(
1641                                 s, w, s_512[0], ss_512[0], tt_512[0], coeffs_512, r0);
1642                             xy_y_convolve_6tap_32x2_avx512(
1643                                 s + 32, w, s_512[1], ss_512[1], tt_512[1], coeffs_512, r1);
1644                             jnt_2d_comp_avg_round_store_64_avx512(
1645                                 r0 + 0, r1 + 0, factor_512, offset_comp_avg_512, d, d8);
1646                             jnt_2d_comp_avg_round_store_64_avx512(r0 + 2,
1647                                                                   r1 + 2,
1648                                                                   factor_512,
1649                                                                   offset_comp_avg_512,
1650                                                                   d + dst_stride,
1651                                                                   d8 + dst8_stride);
1652                             s += 2 * w;
1653                             d += 2 * dst_stride;
1654                             d8 += 2 * dst8_stride;
1655                             y -= 2;
1656                         } while (y);
1657                     } else {
1658                         do {
1659                             xy_y_convolve_6tap_32x2_avx512(
1660                                 s, w, s_512[0], ss_512[0], tt_512[0], coeffs_512, r0);
1661                             xy_y_convolve_6tap_32x2_avx512(
1662                                 s + 32, w, s_512[1], ss_512[1], tt_512[1], coeffs_512, r1);
1663                             jnt_2d_avg_round_store_64_avx512(r0 + 0, r1 + 0, offset_avg_512, d, d8);
1664                             jnt_2d_avg_round_store_64_avx512(
1665                                 r0 + 2, r1 + 2, offset_avg_512, d + dst_stride, d8 + dst8_stride);
1666                             s += 2 * w;
1667                             d += 2 * dst_stride;
1668                             d8 += 2 * dst8_stride;
1669                             y -= 2;
1670                         } while (y);
1671                     }
1672                 } else {
1673                     do {
1674                         xy_y_convolve_6tap_32x2_avx512(
1675                             s, w, s_512[0], ss_512[0], tt_512[0], coeffs_512, r0);
1676                         xy_y_convolve_6tap_32x2_avx512(
1677                             s + 32, w, s_512[1], ss_512[1], tt_512[1], coeffs_512, r1);
1678                         jnt_2d_no_avg_round_store_64_avx512(r0 + 0, r1 + 0, offset_no_avg_512, d);
1679                         jnt_2d_no_avg_round_store_64_avx512(
1680                             r0 + 2, r1 + 2, offset_no_avg_512, d + dst_stride);
1681                         s += 2 * w;
1682                         d += 2 * dst_stride;
1683                         y -= 2;
1684                     } while (y);
1685                 }
1686 
1687                 x += 64;
1688             } while (x < w);
1689         }
1690     }
1691 }
1692 
jnt_convolve_2d_ver_8tap_avx512(const int16_t * const im_block,const int32_t w,const int32_t h,const InterpFilterParams * const filter_params_y,const int32_t subpel_y_q4,const ConvolveParams * const conv_params,uint8_t * dst8,const int32_t dst8_stride)1693 static void jnt_convolve_2d_ver_8tap_avx512(const int16_t *const im_block, const int32_t w,
1694                                             const int32_t                   h,
1695                                             const InterpFilterParams *const filter_params_y,
1696                                             const int32_t                   subpel_y_q4,
1697                                             const ConvolveParams *const conv_params, uint8_t *dst8,
1698                                             const int32_t dst8_stride) {
1699     const int32_t  dst_stride     = conv_params->dst_stride;
1700     const int32_t  bd             = 8;
1701     const int32_t  round_0        = 3;
1702     const int16_t *im             = im_block;
1703     const int32_t  round_1        = COMPOUND_ROUND1_BITS;
1704     const int32_t  offset_bits    = bd + 2 * FILTER_BITS - round_0; // 19
1705     const int32_t  round_bits     = 2 * FILTER_BITS - round_0 - round_1; // 4
1706     const int32_t  round_offset   = 1 << (offset_bits - round_1);
1707     const int32_t  factor         = conv_params->fwd_offset | (conv_params->bck_offset << 16);
1708     const int32_t offset_comp_avg = (round_offset + (round_offset >> 1)) * conv_params->bck_offset -
1709         (round_offset << DIST_PRECISION_BITS) - (round_offset << (DIST_PRECISION_BITS - 1)) +
1710         (1 << (round_bits + DIST_PRECISION_BITS - 1));
1711     const int32_t offset_avg = (1 << (round_1 - 1)) + (1 << (round_bits + round_1)) -
1712         (1 << offset_bits) - (1 << (offset_bits - 1));
1713     const int32_t offset_no_avg = (1 << (round_1 - 1)) + (1 << offset_bits) +
1714         (1 << (offset_bits - 1));
1715     int32_t      y   = h;
1716     ConvBufType *dst = conv_params->dst;
1717 
1718     if (w == 2) {
1719         const __m128i factor_128          = _mm_set1_epi32(factor);
1720         const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
1721         const __m128i offset_avg_128      = _mm_set1_epi32(offset_avg);
1722         const __m128i offset_no_avg_128   = _mm_set1_epi32(offset_no_avg);
1723         __m128i       coeffs_128[4], s_32[8], ss_128[4];
1724 
1725         prepare_coeffs_8tap_sse2(filter_params_y, subpel_y_q4, coeffs_128);
1726 
1727         s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
1728         s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
1729         s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
1730         s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(im + 3 * 2));
1731         s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(im + 4 * 2));
1732         s_32[5] = _mm_cvtsi32_si128(*(int32_t *)(im + 5 * 2));
1733         s_32[6] = _mm_cvtsi32_si128(*(int32_t *)(im + 6 * 2));
1734 
1735         const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1736         const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
1737         const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1738         const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
1739         const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1740         const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
1741 
1742         ss_128[0] = _mm_unpacklo_epi16(src01, src12);
1743         ss_128[1] = _mm_unpacklo_epi16(src23, src34);
1744         ss_128[2] = _mm_unpacklo_epi16(src45, src56);
1745 
1746         y = h;
1747 
1748         if (conv_params->do_average) {
1749             if (conv_params->use_jnt_comp_avg) {
1750                 do {
1751                     const __m128i res = xy_y_convolve_8tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
1752                     jnt_2d_comp_avg_round_store_2x2_sse2(
1753                         res, factor_128, offset_comp_avg_128, dst, dst_stride, dst8, dst8_stride);
1754                     im += 2 * 2;
1755                     dst += 2 * dst_stride;
1756                     dst8 += 2 * dst8_stride;
1757                     y -= 2;
1758                 } while (y);
1759             } else {
1760                 do {
1761                     const __m128i res = xy_y_convolve_8tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
1762                     jnt_2d_avg_round_store_2x2_sse2(
1763                         res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
1764                     im += 2 * 2;
1765                     dst += 2 * dst_stride;
1766                     dst8 += 2 * dst8_stride;
1767                     y -= 2;
1768                 } while (y);
1769             }
1770         } else {
1771             do {
1772                 const __m128i res = xy_y_convolve_8tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
1773                 jnt_2d_no_avg_round_store_2x2_sse2(res, offset_no_avg_128, dst, dst_stride);
1774                 im += 2 * 2;
1775                 dst += 2 * dst_stride;
1776                 y -= 2;
1777             } while (y);
1778         }
1779     } else if (w <= 16) {
1780         const __m256i factor_256          = _mm256_set1_epi32(factor);
1781         const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
1782         const __m256i offset_avg_256      = _mm256_set1_epi32(offset_avg);
1783         const __m256i offset_no_avg_256   = _mm256_set1_epi32(offset_no_avg);
1784         __m256i       coeffs_256[4];
1785 
1786         prepare_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
1787 
1788         if (w == 4) {
1789             __m128i s_64[8];
1790             __m256i s_256[8], ss_256[4];
1791 
1792             s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
1793             s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
1794             s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
1795             s_64[3] = _mm_loadl_epi64((__m128i *)(im + 3 * 4));
1796             s_64[4] = _mm_loadl_epi64((__m128i *)(im + 4 * 4));
1797             s_64[5] = _mm_loadl_epi64((__m128i *)(im + 5 * 4));
1798             s_64[6] = _mm_loadl_epi64((__m128i *)(im + 6 * 4));
1799 
1800             // Load lines a and b. Line a to lower 128, line b to upper 128
1801             s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
1802             s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
1803             s_256[2] = _mm256_setr_m128i(s_64[2], s_64[3]);
1804             s_256[3] = _mm256_setr_m128i(s_64[3], s_64[4]);
1805             s_256[4] = _mm256_setr_m128i(s_64[4], s_64[5]);
1806             s_256[5] = _mm256_setr_m128i(s_64[5], s_64[6]);
1807 
1808             ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1809             ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1810             ss_256[2] = _mm256_unpacklo_epi16(s_256[4], s_256[5]);
1811 
1812             y = h;
1813 
1814             if (conv_params->do_average) {
1815                 if (conv_params->use_jnt_comp_avg) {
1816                     do {
1817                         const __m256i res = xy_y_convolve_8tap_4x2_avx2(
1818                             im, s_64, ss_256, coeffs_256);
1819                         jnt_2d_comp_avg_round_store_4x2_avx2(res,
1820                                                              factor_256,
1821                                                              offset_comp_avg_256,
1822                                                              dst,
1823                                                              dst_stride,
1824                                                              dst8,
1825                                                              dst8_stride);
1826                         im += 2 * 4;
1827                         dst += 2 * dst_stride;
1828                         dst8 += 2 * dst8_stride;
1829                         y -= 2;
1830                     } while (y);
1831                 } else {
1832                     do {
1833                         const __m256i res = xy_y_convolve_8tap_4x2_avx2(
1834                             im, s_64, ss_256, coeffs_256);
1835                         jnt_2d_avg_round_store_4x2_avx2(
1836                             res, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
1837                         im += 2 * 4;
1838                         dst += 2 * dst_stride;
1839                         dst8 += 2 * dst8_stride;
1840                         y -= 2;
1841                     } while (y);
1842                 }
1843             } else {
1844                 do {
1845                     const __m256i res = xy_y_convolve_8tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
1846                     jnt_2d_no_avg_round_store_4x2_avx2(res, offset_no_avg_256, dst, dst_stride);
1847                     im += 2 * 4;
1848                     dst += 2 * dst_stride;
1849                     y -= 2;
1850                 } while (y);
1851             }
1852         } else if (w == 8) {
1853             __m256i s_256[8], r[2];
1854 
1855             s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
1856             s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
1857             s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 8));
1858             s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 8));
1859             s_256[4] = _mm256_loadu_si256((__m256i *)(im + 4 * 8));
1860             s_256[5] = _mm256_loadu_si256((__m256i *)(im + 5 * 8));
1861             y        = h;
1862 
1863             __m256i ss_256[8];
1864 
1865             convolve_8tap_unapck_avx2(s_256, ss_256);
1866 
1867             if (conv_params->do_average) {
1868                 if (conv_params->use_jnt_comp_avg) {
1869                     do {
1870                         xy_y_convolve_8tap_8x2_avx2(im, ss_256, coeffs_256, r);
1871                         jnt_2d_comp_avg_round_store_8x2_avx2(
1872                             r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
1873                         im += 2 * 8;
1874                         dst += 2 * dst_stride;
1875                         dst8 += 2 * dst8_stride;
1876                         y -= 2;
1877                     } while (y);
1878                 } else {
1879                     do {
1880                         xy_y_convolve_8tap_8x2_avx2(im, ss_256, coeffs_256, r);
1881                         jnt_2d_avg_round_store_8x2_avx2(
1882                             r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
1883                         im += 2 * 8;
1884                         dst += 2 * dst_stride;
1885                         dst8 += 2 * dst8_stride;
1886                         y -= 2;
1887                     } while (y);
1888                 }
1889             } else {
1890                 do {
1891                     xy_y_convolve_8tap_8x2_avx2(im, ss_256, coeffs_256, r);
1892                     jnt_2d_no_avg_round_store_8x2_avx2(r, offset_no_avg_256, dst, dst_stride);
1893                     im += 2 * 8;
1894                     dst += 2 * dst_stride;
1895                     y -= 2;
1896                 } while (y);
1897             }
1898         } else {
1899             __m256i s_256[8], ss_256[8], tt_256[8], r[4];
1900 
1901             assert(w == 16);
1902 
1903             load_16bit_7rows_avx2(im, 16, s_256);
1904             y = h;
1905 
1906             convolve_8tap_unapck_avx2(s_256, ss_256);
1907             convolve_8tap_unapck_avx2(s_256 + 1, tt_256);
1908 
1909             if (conv_params->do_average) {
1910                 if (conv_params->use_jnt_comp_avg) {
1911                     do {
1912                         xy_y_convolve_8tap_16x2_avx2(im, 16, coeffs_256, s_256, ss_256, tt_256, r);
1913                         jnt_2d_comp_avg_round_store_16x2_avx2(
1914                             r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
1915                         im += 2 * 16;
1916                         dst += 2 * dst_stride;
1917                         dst8 += 2 * dst8_stride;
1918                         y -= 2;
1919                     } while (y);
1920                 } else {
1921                     do {
1922                         xy_y_convolve_8tap_16x2_avx2(im, 16, coeffs_256, s_256, ss_256, tt_256, r);
1923                         jnt_2d_avg_round_store_16x2_avx2(
1924                             r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
1925                         im += 2 * 16;
1926                         dst += 2 * dst_stride;
1927                         dst8 += 2 * dst8_stride;
1928                         y -= 2;
1929                     } while (y);
1930                 }
1931             } else {
1932                 do {
1933                     xy_y_convolve_8tap_16x2_avx2(im, 16, coeffs_256, s_256, ss_256, tt_256, r);
1934                     jnt_2d_no_avg_round_store_16x2_avx2(r, offset_no_avg_256, dst, dst_stride);
1935                     im += 2 * 16;
1936                     dst += 2 * dst_stride;
1937                     y -= 2;
1938                 } while (y);
1939             }
1940         }
1941     } else {
1942         const __m512i factor_512          = _mm512_set1_epi32(factor);
1943         const __m512i offset_comp_avg_512 = _mm512_set1_epi32(offset_comp_avg);
1944         const __m512i offset_avg_512      = _mm512_set1_epi32(offset_avg);
1945         const __m512i offset_no_avg_512   = _mm512_set1_epi32(offset_no_avg);
1946         __m512i       coeffs_512[4];
1947 
1948         prepare_coeffs_8tap_avx512(filter_params_y, subpel_y_q4, coeffs_512);
1949 
1950         if (w == 32) {
1951             __m512i s_512[8], ss_512[8], tt_512[8], r[4];
1952 
1953             load_16bit_32x7_avx512(im, s_512);
1954             convolve_8tap_unapck_avx512(s_512, ss_512);
1955             convolve_8tap_unapck_avx512(s_512 + 1, tt_512);
1956 
1957             y = h;
1958 
1959             if (conv_params->do_average) {
1960                 if (conv_params->use_jnt_comp_avg) {
1961                     do {
1962                         xy_y_convolve_8tap_width32x2_avx512(
1963                             im, coeffs_512, s_512, ss_512, tt_512, r);
1964                         jnt_2d_comp_avg_round_store_32x2_avx512(r + 0,
1965                                                                 r + 2,
1966                                                                 factor_512,
1967                                                                 offset_comp_avg_512,
1968                                                                 dst,
1969                                                                 dst_stride,
1970                                                                 dst8,
1971                                                                 dst8_stride);
1972                         im += 2 * 32;
1973                         dst += 2 * dst_stride;
1974                         dst8 += 2 * dst8_stride;
1975                         y -= 2;
1976                     } while (y);
1977                 } else {
1978                     do {
1979                         xy_y_convolve_8tap_width32x2_avx512(
1980                             im, coeffs_512, s_512, ss_512, tt_512, r);
1981                         jnt_2d_avg_round_store_32x2_avx512(
1982                             r + 0, r + 2, offset_avg_512, dst, dst_stride, dst8, dst8_stride);
1983                         im += 2 * 32;
1984                         dst += 2 * dst_stride;
1985                         dst8 += 2 * dst8_stride;
1986                         y -= 2;
1987                     } while (y);
1988                 }
1989             } else {
1990                 do {
1991                     xy_y_convolve_8tap_width32x2_avx512(im, coeffs_512, s_512, ss_512, tt_512, r);
1992                     jnt_2d_no_avg_round_store_32x2_avx512(
1993                         r + 0, r + 2, offset_no_avg_512, dst, dst_stride);
1994                     im += 2 * 32;
1995                     dst += 2 * dst_stride;
1996                     y -= 2;
1997                 } while (y);
1998             }
1999         } else {
2000             __m512i s_512[2][8], ss_512[2][8], tt_512[2][8], r0[4], r1[4];
2001 
2002             assert(!(w % 64));
2003 
2004             int32_t x = 0;
2005             do {
2006                 const int16_t *s  = im + x;
2007                 ConvBufType *  d  = dst + x;
2008                 uint8_t *      d8 = dst8 + x;
2009 
2010                 load_16bit_7rows_avx512(s, w, s_512[0]);
2011                 convolve_8tap_unapck_avx512(s_512[0], ss_512[0]);
2012                 convolve_8tap_unapck_avx512(s_512[0] + 1, tt_512[0]);
2013 
2014                 load_16bit_7rows_avx512(s + 32, w, s_512[1]);
2015                 convolve_8tap_unapck_avx512(s_512[1], ss_512[1]);
2016                 convolve_8tap_unapck_avx512(s_512[1] + 1, tt_512[1]);
2017 
2018                 y = h;
2019 
2020                 if (conv_params->do_average) {
2021                     if (conv_params->use_jnt_comp_avg) {
2022                         do {
2023                             xy_y_convolve_8tap_32x2_avx512(
2024                                 s, w, coeffs_512, s_512[0], ss_512[0], tt_512[0], r0);
2025                             xy_y_convolve_8tap_32x2_avx512(
2026                                 s + 32, w, coeffs_512, s_512[1], ss_512[1], tt_512[1], r1);
2027                             jnt_2d_comp_avg_round_store_64_avx512(
2028                                 r0 + 0, r1 + 0, factor_512, offset_comp_avg_512, d, d8);
2029                             jnt_2d_comp_avg_round_store_64_avx512(r0 + 2,
2030                                                                   r1 + 2,
2031                                                                   factor_512,
2032                                                                   offset_comp_avg_512,
2033                                                                   d + dst_stride,
2034                                                                   d8 + dst8_stride);
2035                             s += 2 * w;
2036                             d += 2 * dst_stride;
2037                             d8 += 2 * dst8_stride;
2038                             y -= 2;
2039                         } while (y);
2040                     } else {
2041                         do {
2042                             xy_y_convolve_8tap_32x2_avx512(
2043                                 s, w, coeffs_512, s_512[0], ss_512[0], tt_512[0], r0);
2044                             xy_y_convolve_8tap_32x2_avx512(
2045                                 s + 32, w, coeffs_512, s_512[1], ss_512[1], tt_512[1], r1);
2046                             jnt_2d_avg_round_store_64_avx512(r0 + 0, r1 + 0, offset_avg_512, d, d8);
2047                             jnt_2d_avg_round_store_64_avx512(
2048                                 r0 + 2, r1 + 2, offset_avg_512, d + dst_stride, d8 + dst8_stride);
2049                             s += 2 * w;
2050                             d += 2 * dst_stride;
2051                             d8 += 2 * dst8_stride;
2052                             y -= 2;
2053                         } while (y);
2054                     }
2055                 } else {
2056                     do {
2057                         xy_y_convolve_8tap_32x2_avx512(
2058                             s, w, coeffs_512, s_512[0], ss_512[0], tt_512[0], r0);
2059                         xy_y_convolve_8tap_32x2_avx512(
2060                             s + 32, w, coeffs_512, s_512[1], ss_512[1], tt_512[1], r1);
2061                         jnt_2d_no_avg_round_store_64_avx512(r0 + 0, r1 + 0, offset_no_avg_512, d);
2062                         jnt_2d_no_avg_round_store_64_avx512(
2063                             r0 + 2, r1 + 2, offset_no_avg_512, d + dst_stride);
2064                         s += 2 * w;
2065                         d += 2 * dst_stride;
2066                         y -= 2;
2067                     } while (y);
2068                 }
2069 
2070                 x += 64;
2071             } while (x < w);
2072         }
2073     }
2074 }
2075 
// Horizontal-pass kernel for the jnt (dist-wtd compound) 2D convolve:
// filters a w x h region of 8-bit `src` with `filter_params_x`/`subpel_x_q4`
// and writes 16-bit intermediate rows into `im_block` for the vertical pass.
typedef void (*JntConvolve2dHorTapFunc)(const uint8_t *src, const int32_t src_stride,
                                        const int32_t w, const int32_t h,
                                        const InterpFilterParams *filter_params_x,
                                        const int32_t subpel_x_q4, int16_t *const im_block);
2080 
// Vertical-pass kernel for the jnt (dist-wtd compound) 2D convolve:
// filters the 16-bit intermediate `im_block` with `filter_params_y`/`subpel_y_q4`,
// then applies the compound rounding/averaging selected by `conv_params`
// (do_average / use_jnt_comp_avg), writing the final 8-bit output to `dst8`.
typedef void (*JntConvolve2dVerTapFunc)(const int16_t *const im_block, const int32_t w,
                                        const int32_t                   h,
                                        const InterpFilterParams *const filter_params_y,
                                        const int32_t                   subpel_y_q4,
                                        const ConvolveParams *const conv_params, uint8_t *dst8,
                                        const int32_t dst8_stride);
2087 
// Top-level AVX-512 jnt (dist-wtd compound) 2D convolve: runs a horizontal
// filter pass into a 16-bit intermediate buffer, then a vertical filter pass
// with compound rounding/averaging, dispatching to per-tap kernels through
// function-pointer tables indexed by the filter tap count.
void svt_av1_jnt_convolve_2d_avx512(const uint8_t *src, int32_t src_stride, uint8_t *dst8,
                                    int32_t dst8_stride, int32_t w, int32_t h,
                                    InterpFilterParams *filter_params_x,
                                    InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
                                    const int32_t subpel_y_q4, ConvolveParams *conv_params) {
    // Horizontal table is indexed directly by tap count (2/4/6/8); odd and
    // unsupported slots are NULL and must never be reached.
    static const JntConvolve2dHorTapFunc jnt_convolve_2d_hor_tap_func_table[MAX_FILTER_TAP + 1] = {
        NULL,
        NULL,
        jnt_convolve_2d_hor_2tap_avx512,
        NULL,
        jnt_convolve_2d_hor_4tap_avx2,
        NULL,
        jnt_convolve_2d_hor_6tap_avx512,
        NULL,
        jnt_convolve_2d_hor_8tap_avx512};
    // Vertical table is indexed by `tap_y - (subpel_y_q4 == 8)`: each tap
    // count occupies two consecutive slots so that the half-pel phase
    // (subpel_y_q4 == 8) can select a specialized variant. Only the 2-tap
    // case actually has a distinct half-pel kernel (slot 1); for 4/6/8 taps
    // both slots hold the same function.
    static const JntConvolve2dVerTapFunc jnt_convolve_2d_ver_tap_func_table[MAX_FILTER_TAP + 1] = {
        NULL,
        jnt_convolve_2d_ver_2tap_half_avx512,
        jnt_convolve_2d_ver_2tap_avx512,
        jnt_convolve_2d_ver_4tap_avx2,
        jnt_convolve_2d_ver_4tap_avx2,
        jnt_convolve_2d_ver_6tap_avx512,
        jnt_convolve_2d_ver_6tap_avx512,
        jnt_convolve_2d_ver_8tap_avx512,
        jnt_convolve_2d_ver_8tap_avx512};
    const int32_t  tap_x   = get_convolve_tap(filter_params_x->filter_ptr);
    const int32_t  tap_y   = get_convolve_tap(filter_params_y->filter_ptr);
    // Back `src` up by (tap_y / 2 - 1) rows so the vertical filter window is
    // centered on the output row (the offset is 0 for tap_y == MAX_FILTER_TAP - 6,
    // i.e. 2 taps, and grows with tap_y).
    const uint8_t *src_ptr = src + ((MAX_FILTER_TAP - tap_y) / 2 - 3) * src_stride;
    // Note: im_block is 8-pixel interlaced for width 32 and up, to avoid data
    //       permutation.
    DECLARE_ALIGNED(64, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);

    // The jnt rounding helpers above hard-code these shift amounts.
    assert(conv_params->round_0 == 3);
    assert(conv_params->round_1 == COMPOUND_ROUND1_BITS);

    // horizontal filter

    // Have to calculate 1 more row for small widths, since 2 lines are
    // calculated in each loop for them.
    const int32_t hh = h + tap_y - (w >= 64);

    jnt_convolve_2d_hor_tap_func_table[tap_x](
        src_ptr, src_stride, w, hh, filter_params_x, subpel_x_q4, im_block);

    // vertical filter
    jnt_convolve_2d_ver_tap_func_table[tap_y - (subpel_y_q4 == 8)](
        im_block, w, h, filter_params_y, subpel_y_q4, conv_params, dst8, dst8_stride);
}
2136 
2137 #endif // EN_AVX512_SUPPORT
2138