/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
 */
#include "EbDefinitions.h"

#if EN_AVX512_SUPPORT
#include <assert.h>
#include <immintrin.h>

#include "common_dsp_rtcd.h"
#include "convolve.h"
#include "convolve_avx2.h"
#include "convolve_avx512.h"
#include "synonyms.h"
#include "synonyms_avx2.h"
#include "synonyms_avx512.h"
#include "wiener_convolve_avx2.h"

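// Finishes the horizontal pass for one register of results: rounds the filtered
// value r by WIENER_ROUND0_BITS (using the round_h0 offset), adds in the source
// center-tap pixel selected by filt_center and scaled by
// 1 << (FILTER_BITS - WIENER_ROUND0_BITS), adds the round_h1 offset, and clamps
// the sum to [0, clamp_high].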
SIMD_INLINE __m512i wiener_clip_avx512(const __m512i s, const __m512i r, const __m512i filt_center,
                                       const __m512i round_h0, const __m512i round_h1,
                                       const __m512i clamp_high) {
    const int     round_0   = WIENER_ROUND0_BITS;
    const __m512i clamp_low = _mm512_setzero_si512();
    __m512i       res       = _mm512_srai_epi16(_mm512_add_epi16(r, round_h0), round_0);
    __m512i       data_0    = _mm512_shuffle_epi8(s, filt_center);
    data_0                  = _mm512_slli_epi16(data_0, FILTER_BITS - round_0);
    res                     = _mm512_add_epi16(res, data_0);
    res                     = _mm512_add_epi16(res, round_h1);
    res                     = _mm512_max_epi16(res, clamp_low);
    return _mm512_min_epi16(res, clamp_high);
}

static INLINE __m512i wiener_convolve_tap3_avx512(const __m512i s, const __m512i coeffs[2],
                                                  const __m512i filt[2], const __m512i filt_center,
                                                  const __m512i round_h0, const __m512i round_h1,
                                                  const __m512i clamp_high) {
    const __m512i res = x_convolve_4tap_avx512(s, coeffs, filt);
    return wiener_clip_avx512(s, res, filt_center, round_h0, round_h1, clamp_high);
}

SIMD_INLINE __m512i wiener_convolve_tap5_avx512(const __m512i s, const __m512i coeffs[3],
                                                const __m512i filt[3], const __m512i filt_center,
                                                const __m512i round_h0, const __m512i round_h1,
                                                const __m512i clamp_high) {
    const __m512i res = x_convolve_6tap_avx512(s, coeffs, filt);
    return wiener_clip_avx512(s, res, filt_center, round_h0, round_h1, clamp_high);
}

static INLINE __m512i wiener_convolve_tap7_avx512(const __m512i s, const __m512i coeffs[4],
                                                  const __m512i filt[4], const __m512i filt_center,
                                                  const __m512i round_h0, const __m512i round_h1,
                                                  const __m512i clamp_high) {
    const __m512i res = x_convolve_8tap_avx512(s, coeffs, filt);
    return wiener_clip_avx512(s, res, filt_center, round_h0, round_h1, clamp_high);
}

SIMD_INLINE __m512i wiener_convolve_h16x2_tap3_avx512(
    const uint8_t* src, const ptrdiff_t stride, const __m512i coeffs[2], const __m512i filt[2],
    const __m512i filt_center, const __m512i round_h0, const __m512i round_h1,
    const __m512i clamp_high) {
    const __m512i s = loadu_8bit_32x2_avx512(src, stride);
    return wiener_convolve_tap3_avx512(
        s, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
}

SIMD_INLINE __m512i wiener_convolve_h16x2_tap5_avx512(
    const uint8_t* src, const ptrdiff_t stride, const __m512i coeffs[3], const __m512i filt[3],
    const __m512i filt_center, const __m512i round_h0, const __m512i round_h1,
    const __m512i clamp_high) {
    const __m512i s = loadu_8bit_32x2_avx512(src, stride);
    return wiener_convolve_tap5_avx512(
        s, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
}

static INLINE __m512i wiener_convolve_h16x2_tap7_avx512(
    const uint8_t* src, const ptrdiff_t stride, const __m512i coeffs[4], const __m512i filt[4],
    const __m512i filt_center, const __m512i round_h0, const __m512i round_h1,
    const __m512i clamp_high) {
    const __m512i s = loadu_8bit_32x2_avx512(src, stride);
    return wiener_convolve_tap7_avx512(
        s, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
}

SIMD_INLINE __m512i wiener_convolve_h32_tap3_avx512(const uint8_t* src, const __m512i coeffs[2],
                                                    const __m512i filt[2],
                                                    const __m512i filt_center,
                                                    const __m512i round_h0, const __m512i round_h1,
                                                    const __m512i clamp_high) {
    const __m512i s = zz_loadu_512(src);
    return wiener_convolve_tap3_avx512(
        s, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
}

static INLINE __m512i wiener_convolve_h32_tap5_avx512(
    const uint8_t* src, const __m512i coeffs[3], const __m512i filt[3], const __m512i filt_center,
    const __m512i round_h0, const __m512i round_h1, const __m512i clamp_high) {
    const __m512i s = zz_loadu_512(src);
    return wiener_convolve_tap5_avx512(
        s, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
}

static INLINE __m512i wiener_convolve_h32_tap7_avx512(
    const uint8_t* src, const __m512i coeffs[4], const __m512i filt[4], const __m512i filt_center,
    const __m512i round_h0, const __m512i round_h1, const __m512i clamp_high) {
    const __m512i s = zz_loadu_512(src);
    return wiener_convolve_tap7_avx512(
        s, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
}

SIMD_INLINE void wiener_convolve_h32x2_tap3(const uint8_t* src, const ptrdiff_t stride,
                                            const __m512i coeffs[2], const __m512i filt[2],
                                            const __m512i filt_center, const __m512i round_h0,
                                            const __m512i round_h1, const __m512i clamp_high,
                                            __m512i* const dst0, __m512i* const dst1) {
    *dst0 = wiener_convolve_h16x2_tap3_avx512(
        src + 0, stride, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
    *dst1 = wiener_convolve_h16x2_tap3_avx512(
        src + 8, stride, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
}

SIMD_INLINE void wiener_convolve_h32x2_tap5(const uint8_t* src, const ptrdiff_t stride,
                                            const __m512i coeffs[3], const __m512i filt[3],
                                            const __m512i filt_center, const __m512i round_h0,
                                            const __m512i round_h1, const __m512i clamp_high,
                                            __m512i* const dst0, __m512i* const dst1) {
    *dst0 = wiener_convolve_h16x2_tap5_avx512(
        src + 0, stride, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
    *dst1 = wiener_convolve_h16x2_tap5_avx512(
        src + 8, stride, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
}

SIMD_INLINE void wiener_convolve_h32x2_tap7(const uint8_t* src, const ptrdiff_t stride,
                                            const __m512i coeffs[4], const __m512i filt[4],
                                            const __m512i filt_center, const __m512i round_h0,
                                            const __m512i round_h1, const __m512i clamp_high,
                                            __m512i* const dst0, __m512i* const dst1) {
    *dst0 = wiener_convolve_h16x2_tap7_avx512(
        src + 0, stride, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
    *dst1 = wiener_convolve_h16x2_tap7_avx512(
        src + 8, stride, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
}

static INLINE void wiener_convolve_h64_tap3(const uint8_t* src, const __m512i coeffs[2],
                                            const __m512i filt[2], const __m512i filt_center,
                                            const __m512i round_h0, const __m512i round_h1,
                                            const __m512i clamp_high, __m512i* const dst0,
                                            __m512i* const dst1) {
    *dst0 = wiener_convolve_h32_tap3_avx512(
        src + 0, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
    *dst1 = wiener_convolve_h32_tap3_avx512(
        src + 8, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
}

static INLINE void wiener_convolve_h64_tap5(const uint8_t* src, const __m512i coeffs[3],
                                            const __m512i filt[3], const __m512i filt_center,
                                            const __m512i round_h0, const __m512i round_h1,
                                            const __m512i clamp_high, __m512i* const dst0,
                                            __m512i* const dst1) {
    *dst0 = wiener_convolve_h32_tap5_avx512(
        src + 0, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
    *dst1 = wiener_convolve_h32_tap5_avx512(
        src + 8, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
}

static INLINE void wiener_convolve_h64_tap7_avx512(const uint8_t* src, const __m512i coeffs[4],
                                                   const __m512i filt[4], const __m512i filt_center,
                                                   const __m512i round_h0, const __m512i round_h1,
                                                   const __m512i clamp_high, __m512i* const dst0,
                                                   __m512i* const dst1) {
    *dst0 = wiener_convolve_h32_tap7_avx512(
        src + 0, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
    *dst1 = wiener_convolve_h32_tap7_avx512(
        src + 8, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
}

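// Rounds two registers of 32-bit vertical-filter accumulators by the second-stage
// shift (2 * FILTER_BITS - WIENER_ROUND0_BITS) and packs them to signed 16-bit
// with saturation.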
static INLINE __m512i round_store_avx512(const __m512i res0, const __m512i res1,
                                         const __m512i round_v) {
    const int     round_1 = 2 * FILTER_BITS - WIENER_ROUND0_BITS;
    const __m512i r0      = _mm512_srai_epi32(_mm512_add_epi32(res0, round_v), round_1);
    const __m512i r1      = _mm512_srai_epi32(_mm512_add_epi32(res1, round_v), round_1);
    return _mm512_packs_epi32(r0, r1);
}

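// Vertical-pass kernels. The Wiener filter taps are symmetric, so rows whose taps
// share a coefficient are summed first; this reduces the 3-tap column filter to a
// 2-tap multiply and the 5- and 7-tap column filters to 4-tap multiplies.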
SIMD_INLINE __m512i wiener_convolve_v_tap3_kernel_avx512(const __m512i coeffs[2],
                                                         const __m512i round_v,
                                                         const __m512i s[3]) {
    const __m512i s0 = _mm512_add_epi16(s[0], s[2]);
    __m512i       ss[2];
    ss[0]              = _mm512_unpacklo_epi16(s0, s[1]);
    ss[1]              = _mm512_unpackhi_epi16(s0, s[1]);
    const __m512i res0 = convolve16_2tap_avx512(&ss[0], coeffs);
    const __m512i res1 = convolve16_2tap_avx512(&ss[1], coeffs);
    return round_store_avx512(res0, res1, round_v);
}

SIMD_INLINE __m512i wiener_convolve_v_tap5_kernel_avx512(const __m512i coeffs[2],
                                                         const __m512i round_v,
                                                         const __m512i s[5]) {
    const __m512i s0 = _mm512_add_epi16(s[0], s[4]);
    const __m512i s1 = _mm512_add_epi16(s[1], s[3]);
    __m512i       ss[4];
    ss[0]              = _mm512_unpacklo_epi16(s0, s1);
    ss[1]              = _mm512_unpacklo_epi16(s[2], _mm512_setzero_si512());
    ss[2]              = _mm512_unpackhi_epi16(s0, s1);
    ss[3]              = _mm512_unpackhi_epi16(s[2], _mm512_setzero_si512());
    const __m512i res0 = convolve16_4tap_avx512(ss + 0, coeffs);
    const __m512i res1 = convolve16_4tap_avx512(ss + 2, coeffs);
    return round_store_avx512(res0, res1, round_v);
}

SIMD_INLINE __m512i wiener_convolve_v_tap7_kernel_avx512(const __m512i coeffs[2],
                                                         const __m512i round_v,
                                                         const __m512i s[7]) {
    const __m512i s0 = _mm512_add_epi16(s[0], s[6]);
    const __m512i s1 = _mm512_add_epi16(s[1], s[5]);
    const __m512i s2 = _mm512_add_epi16(s[2], s[4]);
    __m512i       ss[4];
    ss[0]              = _mm512_unpacklo_epi16(s0, s1);
    ss[1]              = _mm512_unpacklo_epi16(s2, s[3]);
    ss[2]              = _mm512_unpackhi_epi16(s0, s1);
    ss[3]              = _mm512_unpackhi_epi16(s2, s[3]);
    const __m512i res0 = convolve16_4tap_avx512(ss + 0, coeffs);
    const __m512i res1 = convolve16_4tap_avx512(ss + 2, coeffs);
    return round_store_avx512(res0, res1, round_v);
}

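// The v16x2 helpers below work on registers holding two rows of 16 intermediate
// values each, so the sliding row window advances by two rows per call; the v32
// helpers hold a single 32-value row per register and advance by one row.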
SIMD_INLINE __m512i wiener_convolve_v16x2_tap3(const __m512i coeffs[2], const __m512i round_v,
                                               __m512i s[3]) {
    const __m512i dst = wiener_convolve_v_tap3_kernel_avx512(coeffs, round_v, s);
    s[0]              = s[2];
    return dst;
}

SIMD_INLINE __m512i wiener_convolve_v16x2_tap5(const __m512i coeffs[2], const __m512i round_v,
                                               __m512i s[5]) {
    const __m512i dst = wiener_convolve_v_tap5_kernel_avx512(coeffs, round_v, s);
    s[0]              = s[2];
    s[1]              = s[3];
    s[2]              = s[4];
    return dst;
}

SIMD_INLINE __m512i wiener_convolve_v16x2_tap7(const __m512i coeffs[2], const __m512i round_v,
                                               __m512i s[7]) {
    const __m512i dst = wiener_convolve_v_tap7_kernel_avx512(coeffs, round_v, s);
    s[0]              = s[2];
    s[1]              = s[3];
    s[2]              = s[4];
    s[3]              = s[5];
    s[4]              = s[6];
    return dst;
}

SIMD_INLINE __m512i wiener_convolve_v32_tap3(const __m512i coeffs[2], const __m512i round_v,
                                             __m512i s[3]) {
    const __m512i dst = wiener_convolve_v_tap3_kernel_avx512(coeffs, round_v, s);
    s[0]              = s[1];
    s[1]              = s[2];
    return dst;
}

static INLINE __m512i wiener_convolve_v32_tap5(const __m512i coeffs[2], const __m512i round_v,
                                               __m512i s[5]) {
    const __m512i dst = wiener_convolve_v_tap5_kernel_avx512(coeffs, round_v, s);
    s[0]              = s[1];
    s[1]              = s[2];
    s[2]              = s[3];
    s[3]              = s[4];
    return dst;
}

static INLINE __m512i wiener_convolve_v32_tap7(const __m512i coeffs[2], const __m512i round_v,
                                               __m512i s[7]) {
    const __m512i dst = wiener_convolve_v_tap7_kernel_avx512(coeffs, round_v, s);
    s[0]              = s[1];
    s[1]              = s[2];
    s[2]              = s[3];
    s[3]              = s[4];
    s[4]              = s[5];
    s[5]              = s[6];
    return dst;
}

static INLINE void pack_store_32x2_avx512(const __m512i res0, const __m512i res1,
                                          uint8_t* const dst, const ptrdiff_t stride) {
    const __m512i d = _mm512_packus_epi16(res0, res1);
    storeu_u8_32x2_avx512(d, dst, stride);
}

// Note: If this function crashes on Windows, pay attention to the pointer
// filter_x, which may be overwritten by other instructions. This is a bug in
// the Visual Studio compiler. To work around it, adjust the relative positions
// of the following two instructions, or duplicate instruction 1 at several
// locations before coeffs_x is referenced.
// 1. const __m128i coeffs_x = xx_loadu_128(filter_x);
// 2. const int cnt_zero_coef = calc_zero_coef(filter_x, filter_y);
void svt_av1_wiener_convolve_add_src_avx512(const uint8_t* const src, const ptrdiff_t src_stride,
                                            uint8_t* const dst, const ptrdiff_t dst_stride,
                                            const int16_t* const filter_x,
                                            const int16_t* const filter_y, const int32_t w,
                                            const int32_t               h,
                                            const ConvolveParams* const conv_params) {
    const int32_t  bd            = 8;
    const int      center_tap    = (SUBPEL_TAPS - 1) / 2;
    const int      round_0       = WIENER_ROUND0_BITS;
    const int      round_1       = 2 * FILTER_BITS - WIENER_ROUND0_BITS;
    const int      cnt_zero_coef = calc_zero_coef(filter_x, filter_y);
    const uint8_t* src_ptr       = src - center_tap * src_stride - center_tap;
    const __m256i  round_h0      = _mm256_set1_epi16((1 << (round_0 - 1)));
    const __m512i  round_h0_512  = _mm512_set1_epi16((1 << (round_0 - 1)));
    const __m256i  round_h1      = _mm256_set1_epi16((1 << (bd + FILTER_BITS - round_0 - 1)));
    const __m512i  round_h1_512  = _mm512_set1_epi16((1 << (bd + FILTER_BITS - round_0 - 1)));
    const __m256i  round_v    = _mm256_set1_epi32((1 << (round_1 - 1)) - (1 << (bd + round_1 - 1)));
    const __m512i round_v_512 = _mm512_set1_epi32((1 << (round_1 - 1)) - (1 << (bd + round_1 - 1)));
    const __m256i clamp_high  = _mm256_set1_epi16(WIENER_CLAMP_LIMIT(round_0, bd) - 1);
    const __m512i clamp_high_512      = _mm512_set1_epi16(WIENER_CLAMP_LIMIT(round_0, bd) - 1);
    const __m128i zero_128            = _mm_setzero_si128();
    const __m128i offset_0            = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
    const __m128i coeffs_y            = _mm_add_epi16(xx_loadu_128(filter_y), offset_0);
    const __m256i filter_coeffs_y     = _mm256_broadcastsi128_si256(coeffs_y);
    const __m512i filter_coeffs_y_512 = svt_mm512_broadcast_i64x2(coeffs_y);
    int32_t       width               = w;
    uint8_t*      dst_ptr             = dst;
    __m256i       filt[4], coeffs_h[4], coeffs_v[2];
    __m512i       filt_512[4], coeffs_h_512[4], coeffs_v_512[2];

    (void)conv_params;
    assert(!(w % 8));
    assert(conv_params->round_0 == round_0);
    assert(conv_params->round_1 == round_1);

    filt[0]     = yy_load_256(filt1_global_avx);
    filt[1]     = yy_load_256(filt2_global_avx);
    filt_512[0] = zz_load_512(filt1_global_avx);
    filt_512[1] = zz_load_512(filt2_global_avx);

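    // Select the filter length from cnt_zero_coef: 0 -> full 7-tap, 1 -> 5-tap,
    // otherwise 3-tap. The shorter paths start one (respectively two) rows and
    // columns further into the source, skipping the zero outer taps.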
    if (!cnt_zero_coef) {
        const __m128i coeffs_x = xx_loadu_128(filter_x);
        if (width >= 32) {
            int32_t x = width & ~63;

            const __m512i filt_center_512 = zz_load_512(filt_center_tap7_global_avx);
            filt_512[2]                   = zz_load_512(filt3_global_avx);
            filt_512[3]                   = zz_load_512(filt4_global_avx);
            populate_coeffs_8tap_avx512(coeffs_x, coeffs_h_512);
            // coeffs 0 1 0 1 0 1 0 1
            coeffs_v_512[0] = _mm512_shuffle_epi32(filter_coeffs_y_512, 0x00);
            // coeffs 2 3 2 3 2 3 2 3
            coeffs_v_512[1] = _mm512_shuffle_epi32(filter_coeffs_y_512, 0x55);

            width -= x;
            while (x) {
                const uint8_t* src_p = src_ptr;
                __m512i        s[2][7];

                wiener_convolve_h64_tap7_avx512(src_p,
                                                coeffs_h_512,
                                                filt_512,
                                                filt_center_512,
                                                round_h0_512,
                                                round_h1_512,
                                                clamp_high_512,
                                                &s[0][0],
                                                &s[1][0]);
                src_p += src_stride;
                wiener_convolve_h64_tap7_avx512(src_p,
                                                coeffs_h_512,
                                                filt_512,
                                                filt_center_512,
                                                round_h0_512,
                                                round_h1_512,
                                                clamp_high_512,
                                                &s[0][1],
                                                &s[1][1]);
                src_p += src_stride;
                wiener_convolve_h64_tap7_avx512(src_p,
                                                coeffs_h_512,
                                                filt_512,
                                                filt_center_512,
                                                round_h0_512,
                                                round_h1_512,
                                                clamp_high_512,
                                                &s[0][2],
                                                &s[1][2]);
                src_p += src_stride;
                wiener_convolve_h64_tap7_avx512(src_p,
                                                coeffs_h_512,
                                                filt_512,
                                                filt_center_512,
                                                round_h0_512,
                                                round_h1_512,
                                                clamp_high_512,
                                                &s[0][3],
                                                &s[1][3]);
                src_p += src_stride;
                wiener_convolve_h64_tap7_avx512(src_p,
                                                coeffs_h_512,
                                                filt_512,
                                                filt_center_512,
                                                round_h0_512,
                                                round_h1_512,
                                                clamp_high_512,
                                                &s[0][4],
                                                &s[1][4]);
                src_p += src_stride;
                wiener_convolve_h64_tap7_avx512(src_p,
                                                coeffs_h_512,
                                                filt_512,
                                                filt_center_512,
                                                round_h0_512,
                                                round_h1_512,
                                                clamp_high_512,
                                                &s[0][5],
                                                &s[1][5]);
                src_p += src_stride;

                int y = 0;
                do {
                    wiener_convolve_h64_tap7_avx512(src_p,
                                                    coeffs_h_512,
                                                    filt_512,
                                                    filt_center_512,
                                                    round_h0_512,
                                                    round_h1_512,
                                                    clamp_high_512,
                                                    &s[0][6],
                                                    &s[1][6]);
                    src_p += src_stride;
                    const __m512i r0 = wiener_convolve_v32_tap7(coeffs_v_512, round_v_512, s[0]);
                    const __m512i r1 = wiener_convolve_v32_tap7(coeffs_v_512, round_v_512, s[1]);
                    convolve_store_64_avx512(r0, r1, dst_ptr + y * dst_stride);
                } while (++y < h);

                src_ptr += 64;
                dst_ptr += 64;
                x -= 64;
            }

            if (!width)
                return;

            x = width & ~31;
            if (x) {
                const uint8_t* src_p = src_ptr;
                uint8_t*       dst_p = dst_ptr;
                __m512i        s[2][7];

                wiener_convolve_h32x2_tap7(src_p,
                                           src_stride,
                                           coeffs_h_512,
                                           filt_512,
                                           filt_center_512,
                                           round_h0_512,
                                           round_h1_512,
                                           clamp_high_512,
                                           &s[0][0],
                                           &s[1][0]);
                src_p += 2 * src_stride;
                wiener_convolve_h32x2_tap7(src_p,
                                           src_stride,
                                           coeffs_h_512,
                                           filt_512,
                                           filt_center_512,
                                           round_h0_512,
                                           round_h1_512,
                                           clamp_high_512,
                                           &s[0][2],
                                           &s[1][2]);
                src_p += 2 * src_stride;
                wiener_convolve_h32x2_tap7(src_p,
                                           src_stride,
                                           coeffs_h_512,
                                           filt_512,
                                           filt_center_512,
                                           round_h0_512,
                                           round_h1_512,
                                           clamp_high_512,
                                           &s[0][4],
                                           &s[1][4]);
                src_p += 2 * src_stride;

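                // Each register holds two consecutive rows, so the odd window
                // entries are formed by pairing the upper half of one register
                // with the lower half of the next.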
                s[0][1] = _mm512_setr_m256i(_mm512_extracti64x4_epi64(s[0][0], 1),
                                            _mm512_castsi512_si256(s[0][2]));
                s[1][1] = _mm512_setr_m256i(_mm512_extracti64x4_epi64(s[1][0], 1),
                                            _mm512_castsi512_si256(s[1][2]));
                s[0][3] = _mm512_setr_m256i(_mm512_extracti64x4_epi64(s[0][2], 1),
                                            _mm512_castsi512_si256(s[0][4]));
                s[1][3] = _mm512_setr_m256i(_mm512_extracti64x4_epi64(s[1][2], 1),
                                            _mm512_castsi512_si256(s[1][4]));

                int y = h;
                do {
                    wiener_convolve_h32x2_tap7(src_p,
                                               src_stride,
                                               coeffs_h_512,
                                               filt_512,
                                               filt_center_512,
                                               round_h0_512,
                                               round_h1_512,
                                               clamp_high_512,
                                               &s[0][6],
                                               &s[1][6]);
                    s[0][5] = _mm512_setr_m256i(_mm512_extracti64x4_epi64(s[0][4], 1),
                                                _mm512_castsi512_si256(s[0][6]));
                    s[1][5] = _mm512_setr_m256i(_mm512_extracti64x4_epi64(s[1][4], 1),
                                                _mm512_castsi512_si256(s[1][6]));
                    src_p += 2 * src_stride;
                    const __m512i r0 = wiener_convolve_v16x2_tap7(coeffs_v_512, round_v_512, s[0]);
                    const __m512i r1 = wiener_convolve_v16x2_tap7(coeffs_v_512, round_v_512, s[1]);
                    if (y == 1) {
                        const __m512i d  = _mm512_packus_epi16(r0, r1);
                        const __m256i d0 = _mm512_castsi512_si256(d);
                        _mm256_storeu_si256((__m256i*)dst_p, d0);
                    } else {
                        pack_store_32x2_avx512(r0, r1, dst_p, dst_stride);
                    }

                    dst_p += 2 * dst_stride;
                    y -= 2;
                } while (y > 0);

                src_ptr += 32;
                dst_ptr += 32;
                width -= 32;
            }

            if (!width)
                return;
        }

        const __m256i filt_center = yy_load_256(filt_center_tap7_global_avx);
        filt[2]                   = yy_load_256(filt3_global_avx);
        filt[3]                   = yy_load_256(filt4_global_avx);
        populate_coeffs_8tap_avx2(coeffs_x, coeffs_h);
        // coeffs 0 1 0 1 0 1 0 1
        coeffs_v[0] = _mm256_shuffle_epi32(filter_coeffs_y, 0x00);
        // coeffs 2 3 2 3 2 3 2 3
        coeffs_v[1] = _mm256_shuffle_epi32(filter_coeffs_y, 0x55);

        if (width >= 16) {
            const uint8_t* src_p = src_ptr;
            uint8_t*       dst_p = dst_ptr;
            __m256i        s[2][7];

            wiener_convolve_h16x2_tap7(src_p,
                                       src_stride,
                                       coeffs_h,
                                       filt,
                                       filt_center,
                                       round_h0,
                                       round_h1,
                                       clamp_high,
                                       &s[0][0],
                                       &s[1][0]);
            src_p += 2 * src_stride;
            wiener_convolve_h16x2_tap7(src_p,
                                       src_stride,
                                       coeffs_h,
                                       filt,
                                       filt_center,
                                       round_h0,
                                       round_h1,
                                       clamp_high,
                                       &s[0][2],
                                       &s[1][2]);
            src_p += 2 * src_stride;
            wiener_convolve_h16x2_tap7(src_p,
                                       src_stride,
                                       coeffs_h,
                                       filt,
                                       filt_center,
                                       round_h0,
                                       round_h1,
                                       clamp_high,
                                       &s[0][4],
                                       &s[1][4]);
            src_p += 2 * src_stride;

            s[0][1] = _mm256_setr_m128i(_mm256_extracti128_si256(s[0][0], 1),
                                        _mm256_castsi256_si128(s[0][2]));
            s[1][1] = _mm256_setr_m128i(_mm256_extracti128_si256(s[1][0], 1),
                                        _mm256_castsi256_si128(s[1][2]));
            s[0][3] = _mm256_setr_m128i(_mm256_extracti128_si256(s[0][2], 1),
                                        _mm256_castsi256_si128(s[0][4]));
            s[1][3] = _mm256_setr_m128i(_mm256_extracti128_si256(s[1][2], 1),
                                        _mm256_castsi256_si128(s[1][4]));

            int y = h;
            do {
                wiener_convolve_h16x2_tap7(src_p,
                                           src_stride,
                                           coeffs_h,
                                           filt,
                                           filt_center,
                                           round_h0,
                                           round_h1,
                                           clamp_high,
                                           &s[0][6],
                                           &s[1][6]);
                s[0][5] = _mm256_setr_m128i(_mm256_extracti128_si256(s[0][4], 1),
                                            _mm256_castsi256_si128(s[0][6]));
                s[1][5] = _mm256_setr_m128i(_mm256_extracti128_si256(s[1][4], 1),
                                            _mm256_castsi256_si128(s[1][6]));
                src_p += 2 * src_stride;
                const __m256i r0 = wiener_convolve_v8x2_tap7(coeffs_v, round_v, s[0]);
                const __m256i r1 = wiener_convolve_v8x2_tap7(coeffs_v, round_v, s[1]);
                if (y == 1) {
                    const __m256i d  = _mm256_packus_epi16(r0, r1);
                    const __m128i d0 = _mm256_castsi256_si128(d);
                    _mm_storeu_si128((__m128i*)dst_p, d0);
                } else {
                    pack_store_16x2_avx2(r0, r1, dst_p, dst_stride);
                }

                dst_p += 2 * dst_stride;
                y -= 2;
            } while (y > 0);

            src_ptr += 16;
            dst_ptr += 16;
            width -= 16;
        }

        if (width) {
            const uint8_t* src_p = src_ptr;
            __m256i        s[7];

            assert(width == 8);

            s[0] = wiener_convolve_h8x2_tap7(
                src_p, src_stride, coeffs_h, filt, filt_center, round_h0, round_h1, clamp_high);
            src_p += 2 * src_stride;
            s[2] = wiener_convolve_h8x2_tap7(
                src_p, src_stride, coeffs_h, filt, filt_center, round_h0, round_h1, clamp_high);
            src_p += 2 * src_stride;
            s[4] = wiener_convolve_h8x2_tap7(
                src_p, src_stride, coeffs_h, filt, filt_center, round_h0, round_h1, clamp_high);
            src_p += 2 * src_stride;

            s[1] = _mm256_setr_m128i(_mm256_extracti128_si256(s[0], 1),
                                     _mm256_castsi256_si128(s[2]));
            s[3] = _mm256_setr_m128i(_mm256_extracti128_si256(s[2], 1),
                                     _mm256_castsi256_si128(s[4]));

            int y = h;
            do {
                s[6] = wiener_convolve_h8x2_tap7(
                    src_p, src_stride, coeffs_h, filt, filt_center, round_h0, round_h1, clamp_high);
                s[5] = _mm256_setr_m128i(_mm256_extracti128_si256(s[4], 1),
                                         _mm256_castsi256_si128(s[6]));
                src_p += 2 * src_stride;
                const __m256i r = wiener_convolve_v8x2_tap7(coeffs_v, round_v, s);
                if (y == 1) {
                    const __m256i d  = _mm256_packus_epi16(r, r);
                    const __m128i d0 = _mm256_castsi256_si128(d);
                    _mm_storel_epi64((__m128i*)dst_ptr, d0);
                } else {
                    pack_store_8x2_avx2(r, dst_ptr, dst_stride);
                }

                dst_ptr += 2 * dst_stride;
                y -= 2;
            } while (y > 0);
        }
    } else if (cnt_zero_coef == 1) {
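        // 5-tap path: the outermost horizontal and vertical taps are zero, so
        // start one row and one column further in and use the 5-tap kernels.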
        __m128i coeffs_x = xx_loadu_128(filter_x);
        src_ptr += src_stride + 1;

        if (width >= 32) {
            int32_t x = width & ~63;

            const __m512i filt_center_512 = zz_load_512(filt_center_tap5_global_avx);
            filt_512[2]                   = zz_load_512(filt3_global_avx);
            populate_coeffs_6tap_avx512(coeffs_x, coeffs_h_512);
            // coeffs 1 2 1 2 1 2 1 2
            coeffs_v_512[0] = _mm512_shuffle_epi8(filter_coeffs_y_512,
                                                  _mm512_set1_epi32(0x05040302u));
            // coeffs 3 4 3 4 3 4 3 4
            coeffs_v_512[1] = _mm512_shuffle_epi8(filter_coeffs_y_512,
                                                  _mm512_set1_epi32(0x09080706u));

            width -= x;
            while (x) {
                const uint8_t* src_p = src_ptr;
                __m512i        s[2][5];

                wiener_convolve_h64_tap5(src_p,
                                         coeffs_h_512,
                                         filt_512,
                                         filt_center_512,
                                         round_h0_512,
                                         round_h1_512,
                                         clamp_high_512,
                                         &s[0][0],
                                         &s[1][0]);
                src_p += src_stride;
                wiener_convolve_h64_tap5(src_p,
                                         coeffs_h_512,
                                         filt_512,
                                         filt_center_512,
                                         round_h0_512,
                                         round_h1_512,
                                         clamp_high_512,
                                         &s[0][1],
                                         &s[1][1]);
                src_p += src_stride;
                wiener_convolve_h64_tap5(src_p,
                                         coeffs_h_512,
                                         filt_512,
                                         filt_center_512,
                                         round_h0_512,
                                         round_h1_512,
                                         clamp_high_512,
                                         &s[0][2],
                                         &s[1][2]);
                src_p += src_stride;
                wiener_convolve_h64_tap5(src_p,
                                         coeffs_h_512,
                                         filt_512,
                                         filt_center_512,
                                         round_h0_512,
                                         round_h1_512,
                                         clamp_high_512,
                                         &s[0][3],
                                         &s[1][3]);
                src_p += src_stride;

                int y = 0;
                do {
                    wiener_convolve_h64_tap5(src_p,
                                             coeffs_h_512,
                                             filt_512,
                                             filt_center_512,
                                             round_h0_512,
                                             round_h1_512,
                                             clamp_high_512,
                                             &s[0][4],
                                             &s[1][4]);
                    src_p += src_stride;
                    const __m512i r0 = wiener_convolve_v32_tap5(coeffs_v_512, round_v_512, s[0]);
                    const __m512i r1 = wiener_convolve_v32_tap5(coeffs_v_512, round_v_512, s[1]);
                    convolve_store_64_avx512(r0, r1, dst_ptr + y * dst_stride);
                } while (++y < h);

                src_ptr += 64;
                dst_ptr += 64;
                x -= 64;
            }

            if (!width)
                return;

            x = width & ~31;
            if (x) {
                const uint8_t* src_p = src_ptr;
                uint8_t*       dst_p = dst_ptr;
                __m512i        s[2][5];

                wiener_convolve_h32x2_tap5(src_p,
                                           src_stride,
                                           coeffs_h_512,
                                           filt_512,
                                           filt_center_512,
                                           round_h0_512,
                                           round_h1_512,
                                           clamp_high_512,
                                           &s[0][0],
                                           &s[1][0]);
                src_p += 2 * src_stride;
                wiener_convolve_h32x2_tap5(src_p,
                                           src_stride,
                                           coeffs_h_512,
                                           filt_512,
                                           filt_center_512,
                                           round_h0_512,
                                           round_h1_512,
                                           clamp_high_512,
                                           &s[0][2],
                                           &s[1][2]);
                src_p += 2 * src_stride;

                s[0][1] = _mm512_setr_m256i(_mm512_extracti64x4_epi64(s[0][0], 1),
                                            _mm512_castsi512_si256(s[0][2]));
                s[1][1] = _mm512_setr_m256i(_mm512_extracti64x4_epi64(s[1][0], 1),
                                            _mm512_castsi512_si256(s[1][2]));

                int y = h;
                do {
                    wiener_convolve_h32x2_tap5(src_p,
                                               src_stride,
                                               coeffs_h_512,
                                               filt_512,
                                               filt_center_512,
                                               round_h0_512,
                                               round_h1_512,
                                               clamp_high_512,
                                               &s[0][4],
                                               &s[1][4]);
                    s[0][3] = _mm512_setr_m256i(_mm512_extracti64x4_epi64(s[0][2], 1),
                                                _mm512_castsi512_si256(s[0][4]));
                    s[1][3] = _mm512_setr_m256i(_mm512_extracti64x4_epi64(s[1][2], 1),
                                                _mm512_castsi512_si256(s[1][4]));
                    src_p += 2 * src_stride;
                    const __m512i r0 = wiener_convolve_v16x2_tap5(coeffs_v_512, round_v_512, s[0]);
                    const __m512i r1 = wiener_convolve_v16x2_tap5(coeffs_v_512, round_v_512, s[1]);
                    if (y == 1) {
                        const __m512i d  = _mm512_packus_epi16(r0, r1);
                        const __m256i d0 = _mm512_castsi512_si256(d);
                        _mm256_storeu_si256((__m256i*)dst_p, d0);
                    } else {
                        pack_store_32x2_avx512(r0, r1, dst_p, dst_stride);
                    }

                    dst_p += 2 * dst_stride;
                    y -= 2;
                } while (y > 0);

                src_ptr += 32;
                dst_ptr += 32;
                width -= 32;
            }

            if (!width)
                return;
        }

        const __m256i filt_center = yy_load_256(filt_center_tap5_global_avx);
        filt[2]                   = yy_load_256(filt3_global_avx);
        coeffs_x                  = xx_loadu_128(filter_x);
        populate_coeffs_6tap_avx2(coeffs_x, coeffs_h);
        // coeffs 1 2 1 2 1 2 1 2
        coeffs_v[0] = _mm256_shuffle_epi8(filter_coeffs_y, _mm256_set1_epi32(0x05040302u));
        // coeffs 3 4 3 4 3 4 3 4
        coeffs_v[1] = _mm256_shuffle_epi8(filter_coeffs_y, _mm256_set1_epi32(0x09080706u));

        if (width >= 16) {
            const uint8_t* src_p = src_ptr;
            uint8_t*       dst_p = dst_ptr;
            __m256i        s[2][5];

            wiener_convolve_h16x2_tap5(src_p,
                                       src_stride,
                                       coeffs_h,
                                       filt,
                                       filt_center,
                                       round_h0,
                                       round_h1,
                                       clamp_high,
                                       &s[0][0],
                                       &s[1][0]);
            src_p += 2 * src_stride;
            wiener_convolve_h16x2_tap5(src_p,
                                       src_stride,
                                       coeffs_h,
                                       filt,
                                       filt_center,
                                       round_h0,
                                       round_h1,
                                       clamp_high,
                                       &s[0][2],
                                       &s[1][2]);
            src_p += 2 * src_stride;

            s[0][1] = _mm256_setr_m128i(_mm256_extracti128_si256(s[0][0], 1),
                                        _mm256_castsi256_si128(s[0][2]));
            s[1][1] = _mm256_setr_m128i(_mm256_extracti128_si256(s[1][0], 1),
                                        _mm256_castsi256_si128(s[1][2]));

            int y = h;
            do {
                wiener_convolve_h16x2_tap5(src_p,
                                           src_stride,
                                           coeffs_h,
                                           filt,
                                           filt_center,
                                           round_h0,
                                           round_h1,
                                           clamp_high,
                                           &s[0][4],
                                           &s[1][4]);
                s[0][3] = _mm256_setr_m128i(_mm256_extracti128_si256(s[0][2], 1),
                                            _mm256_castsi256_si128(s[0][4]));
                s[1][3] = _mm256_setr_m128i(_mm256_extracti128_si256(s[1][2], 1),
                                            _mm256_castsi256_si128(s[1][4]));
                src_p += 2 * src_stride;
                const __m256i r0 = wiener_convolve_v8x2_tap5(coeffs_v, round_v, s[0]);
                const __m256i r1 = wiener_convolve_v8x2_tap5(coeffs_v, round_v, s[1]);
                if (y == 1) {
                    const __m256i d  = _mm256_packus_epi16(r0, r1);
                    const __m128i d0 = _mm256_castsi256_si128(d);
                    _mm_storeu_si128((__m128i*)dst_p, d0);
                } else {
                    pack_store_16x2_avx2(r0, r1, dst_p, dst_stride);
                }

                dst_p += 2 * dst_stride;
                y -= 2;
            } while (y > 0);

            src_ptr += 16;
            dst_ptr += 16;
            width -= 16;
        }

        if (width) {
            const uint8_t* src_p = src_ptr;
            __m256i        s[5];

            assert(width == 8);

            s[0] = wiener_convolve_h8x2_tap5(
                src_p, src_stride, coeffs_h, filt, filt_center, round_h0, round_h1, clamp_high);
            src_p += 2 * src_stride;
            s[2] = wiener_convolve_h8x2_tap5(
                src_p, src_stride, coeffs_h, filt, filt_center, round_h0, round_h1, clamp_high);
            src_p += 2 * src_stride;

            s[1] = _mm256_setr_m128i(_mm256_extracti128_si256(s[0], 1),
                                     _mm256_castsi256_si128(s[2]));

            int y = h;
            do {
                s[4] = wiener_convolve_h8x2_tap5(
                    src_p, src_stride, coeffs_h, filt, filt_center, round_h0, round_h1, clamp_high);
                s[3] = _mm256_setr_m128i(_mm256_extracti128_si256(s[2], 1),
                                         _mm256_castsi256_si128(s[4]));
                src_p += 2 * src_stride;
                const __m256i r = wiener_convolve_v8x2_tap5(coeffs_v, round_v, s);
                if (y == 1) {
                    const __m256i d  = _mm256_packus_epi16(r, r);
                    const __m128i d0 = _mm256_castsi256_si128(d);
                    _mm_storel_epi64((__m128i*)dst_ptr, d0);
                } else {
                    pack_store_8x2_avx2(r, dst_ptr, dst_stride);
                }

                dst_ptr += 2 * dst_stride;
                y -= 2;
            } while (y > 0);
        }
    } else {
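        // 3-tap path: the two outermost taps on each side are zero, so start
        // two rows and two columns further in and use the 3-tap kernels.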
949         const __m128i coeffs_x = xx_loadu_128(filter_x);
950         src_ptr += 2 * src_stride + 2;
951 
952         if (width >= 32) {
953             int32_t x = width & ~63;
954 
955             const __m512i filt_center_512 = zz_load_512(filt_center_tap3_global_avx);
956             populate_coeffs_4tap_avx512(coeffs_x, coeffs_h_512);
957             // coeffs 2 3 2 3 2 3 2 3
958             coeffs_v_512[0] = _mm512_shuffle_epi32(filter_coeffs_y_512, 0x55);
959             // coeffs 4 5 4 5 4 5 4 5
960             coeffs_v_512[1] = _mm512_shuffle_epi32(filter_coeffs_y_512, 0xaa);
961 
962             width -= x;
            while (x) {
                const uint8_t* src_p = src_ptr;
                __m512i        s[2][3];

                wiener_convolve_h64_tap3(src_p,
                                         coeffs_h_512,
                                         filt_512,
                                         filt_center_512,
                                         round_h0_512,
                                         round_h1_512,
                                         clamp_high_512,
                                         &s[0][0],
                                         &s[1][0]);
                src_p += src_stride;
                wiener_convolve_h64_tap3(src_p,
                                         coeffs_h_512,
                                         filt_512,
                                         filt_center_512,
                                         round_h0_512,
                                         round_h1_512,
                                         clamp_high_512,
                                         &s[0][1],
                                         &s[1][1]);
                src_p += src_stride;

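                // Rows 0 and 1 are filtered above; each pass filters one more row and stores a 64-pixel row.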
                int y = 0;
                do {
                    wiener_convolve_h64_tap3(src_p,
                                             coeffs_h_512,
                                             filt_512,
                                             filt_center_512,
                                             round_h0_512,
                                             round_h1_512,
                                             clamp_high_512,
                                             &s[0][2],
                                             &s[1][2]);
                    src_p += src_stride;
                    const __m512i r0 = wiener_convolve_v32_tap3(coeffs_v_512, round_v_512, s[0]);
                    const __m512i r1 = wiener_convolve_v32_tap3(coeffs_v_512, round_v_512, s[1]);
                    convolve_store_64_avx512(r0, r1, dst_ptr + y * dst_stride);
                } while (++y < h);

                src_ptr += 64;
                dst_ptr += 64;
                x -= 64;
            }

            if (!width)
                return;

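            // At most one 32-column block remains for the 512-bit path; filter it two rows per iteration.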
            x = width & ~31;
            if (x) {
                const uint8_t* src_p = src_ptr;
                uint8_t*       dst_p = dst_ptr;
                __m512i        s[2][3];

                wiener_convolve_h32x2_tap3(src_p,
                                           src_stride,
                                           coeffs_h_512,
                                           filt_512,
                                           filt_center_512,
                                           round_h0_512,
                                           round_h1_512,
                                           clamp_high_512,
                                           &s[0][0],
                                           &s[1][0]);
                src_p += 2 * src_stride;

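                // Two rows are produced per iteration; s[.][1], the middle entry of the 3-row window,
                // is rebuilt below from the upper half of s[.][0] and the lower half of s[.][2].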
                int y = h;
                do {
                    wiener_convolve_h32x2_tap3(src_p,
                                               src_stride,
                                               coeffs_h_512,
                                               filt_512,
                                               filt_center_512,
                                               round_h0_512,
                                               round_h1_512,
                                               clamp_high_512,
                                               &s[0][2],
                                               &s[1][2]);
                    s[0][1] = _mm512_setr_m256i(_mm512_extracti64x4_epi64(s[0][0], 1),
                                                _mm512_castsi512_si256(s[0][2]));
                    s[1][1] = _mm512_setr_m256i(_mm512_extracti64x4_epi64(s[1][0], 1),
                                                _mm512_castsi512_si256(s[1][2]));
                    src_p += 2 * src_stride;
                    const __m512i r0 = wiener_convolve_v16x2_tap3(coeffs_v_512, round_v_512, s[0]);
                    const __m512i r1 = wiener_convolve_v16x2_tap3(coeffs_v_512, round_v_512, s[1]);
                    if (y == 1) {
                        const __m512i d  = _mm512_packus_epi16(r0, r1);
                        const __m256i d0 = _mm512_castsi512_si256(d);
                        _mm256_storeu_si256((__m256i*)dst_p, d0);
                    } else {
                        pack_store_32x2_avx512(r0, r1, dst_p, dst_stride);
                    }

                    dst_p += 2 * dst_stride;
                    y -= 2;
                } while (y > 0);

                src_ptr += 32;
                dst_ptr += 32;
                width -= 32;
            }

            if (!width)
                return;
        }

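        // Remaining width (< 32 columns): fall back to the 256-bit kernels.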
        const __m256i filt_center = yy_load_256(filt_center_tap3_global_avx);
        populate_coeffs_4tap_avx2(coeffs_x, coeffs_h);
        // coeffs 2 3 2 3 2 3 2 3
        coeffs_v[0] = _mm256_shuffle_epi32(filter_coeffs_y, 0x55);
        // coeffs 4 5 4 5 4 5 4 5
        coeffs_v[1] = _mm256_shuffle_epi32(filter_coeffs_y, 0xaa);

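        // 16-column block: two rows per iteration, processed as two 8-column halves.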
        if (width >= 16) {
            const uint8_t* src_p = src_ptr;
            uint8_t*       dst_p = dst_ptr;
            __m256i        s[2][3];

            wiener_convolve_h16x2_tap3(src_p,
                                       src_stride,
                                       coeffs_h,
                                       filt,
                                       filt_center,
                                       round_h0,
                                       round_h1,
                                       clamp_high,
                                       &s[0][0],
                                       &s[1][0]);
            src_p += 2 * src_stride;

            int y = h;
            do {
                wiener_convolve_h16x2_tap3(src_p,
                                           src_stride,
                                           coeffs_h,
                                           filt,
                                           filt_center,
                                           round_h0,
                                           round_h1,
                                           clamp_high,
                                           &s[0][2],
                                           &s[1][2]);
                s[0][1] = _mm256_setr_m128i(_mm256_extracti128_si256(s[0][0], 1),
                                            _mm256_castsi256_si128(s[0][2]));
                s[1][1] = _mm256_setr_m128i(_mm256_extracti128_si256(s[1][0], 1),
                                            _mm256_castsi256_si128(s[1][2]));
                src_p += 2 * src_stride;
                const __m256i r0 = wiener_convolve_v8x2_tap3(coeffs_v, round_v, s[0]);
                const __m256i r1 = wiener_convolve_v8x2_tap3(coeffs_v, round_v, s[1]);
                if (y == 1) {
                    const __m256i d  = _mm256_packus_epi16(r0, r1);
                    const __m128i d0 = _mm256_castsi256_si128(d);
                    _mm_storeu_si128((__m128i*)dst_p, d0);
                } else {
                    pack_store_16x2_avx2(r0, r1, dst_p, dst_stride);
                }

                dst_p += 2 * dst_stride;
                y -= 2;
            } while (y > 0);

            src_ptr += 16;
            dst_ptr += 16;
            width -= 16;
        }

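        // Remaining 8-column strip of the 3-tap path.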
        if (width) {
            const uint8_t* src_p = src_ptr;
            __m256i        s[5];

            assert(width == 8);

            s[0] = wiener_convolve_h8x2_tap3(
                src_p, src_stride, coeffs_h, filt, filt_center, round_h0, round_h1, clamp_high);
            src_p += 2 * src_stride;

            int y = h;
            do {
                s[2] = wiener_convolve_h8x2_tap3(
                    src_p, src_stride, coeffs_h, filt, filt_center, round_h0, round_h1, clamp_high);
                s[1] = _mm256_setr_m128i(_mm256_extracti128_si256(s[0], 1),
                                         _mm256_castsi256_si128(s[2]));
                src_p += 2 * src_stride;
                const __m256i r = wiener_convolve_v8x2_tap3(coeffs_v, round_v, s);
                if (y == 1) {
                    const __m256i d  = _mm256_packus_epi16(r, r);
                    const __m128i d0 = _mm256_castsi256_si128(d);
                    _mm_storel_epi64((__m128i*)dst_ptr, d0);
                } else {
                    pack_store_8x2_avx2(r, dst_ptr, dst_stride);
                }

                dst_ptr += 2 * dst_stride;
                y -= 2;
            } while (y > 0);
        }
    }
}

#endif // EN_AVX512_SUPPORT