1 /*
2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
10 */
11 #include "EbDefinitions.h"
12
13 #if EN_AVX512_SUPPORT
14 #include <assert.h>
15 #include <immintrin.h>
16
17 #include "common_dsp_rtcd.h"
18 #include "convolve.h"
19 #include "convolve_avx2.h"
20 #include "convolve_avx512.h"
21 #include "synonyms.h"
22 #include "synonyms_avx2.h"
23 #include "synonyms_avx512.h"
24 #include "wiener_convolve_avx2.h"
25
wiener_clip_avx512(const __m512i s,const __m512i r,const __m512i filt_center,const __m512i round_h0,const __m512i round_h1,const __m512i clamp_high)26 SIMD_INLINE __m512i wiener_clip_avx512(const __m512i s, const __m512i r, const __m512i filt_center,
27 const __m512i round_h0, const __m512i round_h1,
28 const __m512i clamp_high) {
29 const int round_0 = WIENER_ROUND0_BITS;
30 const __m512i clamp_low = _mm512_setzero_si512();
31 __m512i res = _mm512_srai_epi16(_mm512_add_epi16(r, round_h0), round_0);
32 __m512i data_0 = _mm512_shuffle_epi8(s, filt_center);
33 data_0 = _mm512_slli_epi16(data_0, FILTER_BITS - round_0);
34 res = _mm512_add_epi16(res, data_0);
35 res = _mm512_add_epi16(res, round_h1);
36 res = _mm512_max_epi16(res, clamp_low);
37 return _mm512_min_epi16(res, clamp_high);
38 }
39
wiener_convolve_tap3_avx512(const __m512i s,const __m512i coeffs[2],const __m512i filt[2],const __m512i filt_center,const __m512i round_h0,const __m512i round_h1,const __m512i clamp_high)40 static INLINE __m512i wiener_convolve_tap3_avx512(const __m512i s, const __m512i coeffs[2],
41 const __m512i filt[2], const __m512i filt_center,
42 const __m512i round_h0, const __m512i round_h1,
43 const __m512i clamp_high) {
44 const __m512i res = x_convolve_4tap_avx512(s, coeffs, filt);
45 return wiener_clip_avx512(s, res, filt_center, round_h0, round_h1, clamp_high);
46 }
47
wiener_convolve_tap5_avx512(const __m512i s,const __m512i coeffs[3],const __m512i filt[3],const __m512i filt_center,const __m512i round_h0,const __m512i round_h1,const __m512i clamp_high)48 SIMD_INLINE __m512i wiener_convolve_tap5_avx512(const __m512i s, const __m512i coeffs[3],
49 const __m512i filt[3], const __m512i filt_center,
50 const __m512i round_h0, const __m512i round_h1,
51 const __m512i clamp_high) {
52 const __m512i res = x_convolve_6tap_avx512(s, coeffs, filt);
53 return wiener_clip_avx512(s, res, filt_center, round_h0, round_h1, clamp_high);
54 }
55
wiener_convolve_tap7_avx512(const __m512i s,const __m512i coeffs[4],const __m512i filt[4],const __m512i filt_center,const __m512i round_h0,const __m512i round_h1,const __m512i clamp_high)56 static INLINE __m512i wiener_convolve_tap7_avx512(const __m512i s, const __m512i coeffs[4],
57 const __m512i filt[4], const __m512i filt_center,
58 const __m512i round_h0, const __m512i round_h1,
59 const __m512i clamp_high) {
60 const __m512i res = x_convolve_8tap_avx512(s, coeffs, filt);
61 return wiener_clip_avx512(s, res, filt_center, round_h0, round_h1, clamp_high);
62 }
63
wiener_convolve_h16x2_tap3_avx512(const uint8_t * src,const ptrdiff_t stride,const __m512i coeffs[2],const __m512i filt[2],const __m512i filt_center,const __m512i round_h0,const __m512i round_h1,const __m512i clamp_high)64 SIMD_INLINE __m512i wiener_convolve_h16x2_tap3_avx512(
65 const uint8_t* src, const ptrdiff_t stride, const __m512i coeffs[2], const __m512i filt[2],
66 const __m512i filt_center, const __m512i round_h0, const __m512i round_h1,
67 const __m512i clamp_high) {
68 const __m512i s = loadu_8bit_32x2_avx512(src, stride);
69 return wiener_convolve_tap3_avx512(
70 s, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
71 }
72
wiener_convolve_h16x2_tap5_avx512(const uint8_t * src,const ptrdiff_t stride,const __m512i coeffs[3],const __m512i filt[3],const __m512i filt_center,const __m512i round_h0,const __m512i round_h1,const __m512i clamp_high)73 SIMD_INLINE __m512i wiener_convolve_h16x2_tap5_avx512(
74 const uint8_t* src, const ptrdiff_t stride, const __m512i coeffs[3], const __m512i filt[3],
75 const __m512i filt_center, const __m512i round_h0, const __m512i round_h1,
76 const __m512i clamp_high) {
77 const __m512i s = loadu_8bit_32x2_avx512(src, stride);
78 return wiener_convolve_tap5_avx512(
79 s, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
80 }
81
wiener_convolve_h16x2_tap7_avx512(const uint8_t * src,const ptrdiff_t stride,const __m512i coeffs[4],const __m512i filt[4],const __m512i filt_center,const __m512i round_h0,const __m512i round_h1,const __m512i clamp_high)82 static INLINE __m512i wiener_convolve_h16x2_tap7_avx512(
83 const uint8_t* src, const ptrdiff_t stride, const __m512i coeffs[4], const __m512i filt[4],
84 const __m512i filt_center, const __m512i round_h0, const __m512i round_h1,
85 const __m512i clamp_high) {
86 const __m512i s = loadu_8bit_32x2_avx512(src, stride);
87 return wiener_convolve_tap7_avx512(
88 s, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
89 }
90
wiener_convolve_h32_tap3_avx512(const uint8_t * src,const __m512i coeffs[2],const __m512i filt[2],const __m512i filt_center,const __m512i round_h0,const __m512i round_h1,const __m512i clamp_high)91 SIMD_INLINE __m512i wiener_convolve_h32_tap3_avx512(const uint8_t* src, const __m512i coeffs[2],
92 const __m512i filt[2],
93 const __m512i filt_center,
94 const __m512i round_h0, const __m512i round_h1,
95 const __m512i clamp_high) {
96 const __m512i s = zz_loadu_512(src);
97 return wiener_convolve_tap3_avx512(
98 s, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
99 }
100
wiener_convolve_h32_tap5_avx512(const uint8_t * src,const __m512i coeffs[3],const __m512i filt[3],const __m512i filt_center,const __m512i round_h0,const __m512i round_h1,const __m512i clamp_high)101 static INLINE __m512i wiener_convolve_h32_tap5_avx512(
102 const uint8_t* src, const __m512i coeffs[3], const __m512i filt[3], const __m512i filt_center,
103 const __m512i round_h0, const __m512i round_h1, const __m512i clamp_high) {
104 const __m512i s = zz_loadu_512(src);
105 return wiener_convolve_tap5_avx512(
106 s, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
107 }
108
wiener_convolve_h32_tap7_avx512(const uint8_t * src,const __m512i coeffs[4],const __m512i filt[4],const __m512i filt_center,const __m512i round_h0,const __m512i round_h1,const __m512i clamp_high)109 static INLINE __m512i wiener_convolve_h32_tap7_avx512(
110 const uint8_t* src, const __m512i coeffs[4], const __m512i filt[4], const __m512i filt_center,
111 const __m512i round_h0, const __m512i round_h1, const __m512i clamp_high) {
112 const __m512i s = zz_loadu_512(src);
113 return wiener_convolve_tap7_avx512(
114 s, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
115 }
116
wiener_convolve_h32x2_tap3(const uint8_t * src,const ptrdiff_t stride,const __m512i coeffs[2],const __m512i filt[2],const __m512i filt_center,const __m512i round_h0,const __m512i round_h1,const __m512i clamp_high,__m512i * const dst0,__m512i * const dst1)117 SIMD_INLINE void wiener_convolve_h32x2_tap3(const uint8_t* src, const ptrdiff_t stride,
118 const __m512i coeffs[2], const __m512i filt[2],
119 const __m512i filt_center, const __m512i round_h0,
120 const __m512i round_h1, const __m512i clamp_high,
121 __m512i* const dst0, __m512i* const dst1) {
122 *dst0 = wiener_convolve_h16x2_tap3_avx512(
123 src + 0, stride, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
124 *dst1 = wiener_convolve_h16x2_tap3_avx512(
125 src + 8, stride, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
126 }
127
wiener_convolve_h32x2_tap5(const uint8_t * src,const ptrdiff_t stride,const __m512i coeffs[3],const __m512i filt[3],const __m512i filt_center,const __m512i round_h0,const __m512i round_h1,const __m512i clamp_high,__m512i * const dst0,__m512i * const dst1)128 SIMD_INLINE void wiener_convolve_h32x2_tap5(const uint8_t* src, const ptrdiff_t stride,
129 const __m512i coeffs[3], const __m512i filt[3],
130 const __m512i filt_center, const __m512i round_h0,
131 const __m512i round_h1, const __m512i clamp_high,
132 __m512i* const dst0, __m512i* const dst1) {
133 *dst0 = wiener_convolve_h16x2_tap5_avx512(
134 src + 0, stride, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
135 *dst1 = wiener_convolve_h16x2_tap5_avx512(
136 src + 8, stride, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
137 }
138
wiener_convolve_h32x2_tap7(const uint8_t * src,const ptrdiff_t stride,const __m512i coeffs[4],const __m512i filt[4],const __m512i filt_center,const __m512i round_h0,const __m512i round_h1,const __m512i clamp_high,__m512i * const dst0,__m512i * const dst1)139 SIMD_INLINE void wiener_convolve_h32x2_tap7(const uint8_t* src, const ptrdiff_t stride,
140 const __m512i coeffs[4], const __m512i filt[4],
141 const __m512i filt_center, const __m512i round_h0,
142 const __m512i round_h1, const __m512i clamp_high,
143 __m512i* const dst0, __m512i* const dst1) {
144 *dst0 = wiener_convolve_h16x2_tap7_avx512(
145 src + 0, stride, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
146 *dst1 = wiener_convolve_h16x2_tap7_avx512(
147 src + 8, stride, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
148 }
149
wiener_convolve_h64_tap3(const uint8_t * src,const __m512i coeffs[2],const __m512i filt[2],const __m512i filt_center,const __m512i round_h0,const __m512i round_h1,const __m512i clamp_high,__m512i * const dst0,__m512i * const dst1)150 static INLINE void wiener_convolve_h64_tap3(const uint8_t* src, const __m512i coeffs[2],
151 const __m512i filt[2], const __m512i filt_center,
152 const __m512i round_h0, const __m512i round_h1,
153 const __m512i clamp_high, __m512i* const dst0,
154 __m512i* const dst1) {
155 *dst0 = wiener_convolve_h32_tap3_avx512(
156 src + 0, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
157 *dst1 = wiener_convolve_h32_tap3_avx512(
158 src + 8, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
159 }
160
wiener_convolve_h64_tap5(const uint8_t * src,const __m512i coeffs[3],const __m512i filt[3],const __m512i filt_center,const __m512i round_h0,const __m512i round_h1,const __m512i clamp_high,__m512i * const dst0,__m512i * const dst1)161 static INLINE void wiener_convolve_h64_tap5(const uint8_t* src, const __m512i coeffs[3],
162 const __m512i filt[3], const __m512i filt_center,
163 const __m512i round_h0, const __m512i round_h1,
164 const __m512i clamp_high, __m512i* const dst0,
165 __m512i* const dst1) {
166 *dst0 = wiener_convolve_h32_tap5_avx512(
167 src + 0, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
168 *dst1 = wiener_convolve_h32_tap5_avx512(
169 src + 8, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
170 }
171
wiener_convolve_h64_tap7_avx512(const uint8_t * src,const __m512i coeffs[4],const __m512i filt[4],const __m512i filt_center,const __m512i round_h0,const __m512i round_h1,const __m512i clamp_high,__m512i * const dst0,__m512i * const dst1)172 static INLINE void wiener_convolve_h64_tap7_avx512(const uint8_t* src, const __m512i coeffs[4],
173 const __m512i filt[4], const __m512i filt_center,
174 const __m512i round_h0, const __m512i round_h1,
175 const __m512i clamp_high, __m512i* const dst0,
176 __m512i* const dst1) {
177 *dst0 = wiener_convolve_h32_tap7_avx512(
178 src + 0, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
179 *dst1 = wiener_convolve_h32_tap7_avx512(
180 src + 8, coeffs, filt, filt_center, round_h0, round_h1, clamp_high);
181 }
182
round_store_avx512(const __m512i res0,const __m512i res1,const __m512i round_v)183 static INLINE __m512i round_store_avx512(const __m512i res0, const __m512i res1,
184 const __m512i round_v) {
185 const int round_1 = 2 * FILTER_BITS - WIENER_ROUND0_BITS;
186 const __m512i r0 = _mm512_srai_epi32(_mm512_add_epi32(res0, round_v), round_1);
187 const __m512i r1 = _mm512_srai_epi32(_mm512_add_epi32(res1, round_v), round_1);
188 return _mm512_packs_epi32(r0, r1);
189 }
190
wiener_convolve_v_tap3_kernel_avx512(const __m512i coeffs[2],const __m512i round_v,const __m512i s[3])191 SIMD_INLINE __m512i wiener_convolve_v_tap3_kernel_avx512(const __m512i coeffs[2],
192 const __m512i round_v,
193 const __m512i s[3]) {
194 const __m512i s0 = _mm512_add_epi16(s[0], s[2]);
195 __m512i ss[2];
196 ss[0] = _mm512_unpacklo_epi16(s0, s[1]);
197 ss[1] = _mm512_unpackhi_epi16(s0, s[1]);
198 const __m512i res0 = convolve16_2tap_avx512(&ss[0], coeffs);
199 const __m512i res1 = convolve16_2tap_avx512(&ss[1], coeffs);
200 return round_store_avx512(res0, res1, round_v);
201 }
202
wiener_convolve_v_tap5_kernel_avx512(const __m512i coeffs[2],const __m512i round_v,const __m512i s[5])203 SIMD_INLINE __m512i wiener_convolve_v_tap5_kernel_avx512(const __m512i coeffs[2],
204 const __m512i round_v,
205 const __m512i s[5]) {
206 const __m512i s0 = _mm512_add_epi16(s[0], s[4]);
207 const __m512i s1 = _mm512_add_epi16(s[1], s[3]);
208 __m512i ss[4];
209 ss[0] = _mm512_unpacklo_epi16(s0, s1);
210 ss[1] = _mm512_unpacklo_epi16(s[2], _mm512_setzero_si512());
211 ss[2] = _mm512_unpackhi_epi16(s0, s1);
212 ss[3] = _mm512_unpackhi_epi16(s[2], _mm512_setzero_si512());
213 const __m512i res0 = convolve16_4tap_avx512(ss + 0, coeffs);
214 const __m512i res1 = convolve16_4tap_avx512(ss + 2, coeffs);
215 return round_store_avx512(res0, res1, round_v);
216 }
217
wiener_convolve_v_tap7_kernel_avx512(const __m512i coeffs[2],const __m512i round_v,const __m512i s[7])218 SIMD_INLINE __m512i wiener_convolve_v_tap7_kernel_avx512(const __m512i coeffs[2],
219 const __m512i round_v,
220 const __m512i s[7]) {
221 const __m512i s0 = _mm512_add_epi16(s[0], s[6]);
222 const __m512i s1 = _mm512_add_epi16(s[1], s[5]);
223 const __m512i s2 = _mm512_add_epi16(s[2], s[4]);
224 __m512i ss[4];
225 ss[0] = _mm512_unpacklo_epi16(s0, s1);
226 ss[1] = _mm512_unpacklo_epi16(s2, s[3]);
227 ss[2] = _mm512_unpackhi_epi16(s0, s1);
228 ss[3] = _mm512_unpackhi_epi16(s2, s[3]);
229 const __m512i res0 = convolve16_4tap_avx512(ss + 0, coeffs);
230 const __m512i res1 = convolve16_4tap_avx512(ss + 2, coeffs);
231 return round_store_avx512(res0, res1, round_v);
232 }
233
wiener_convolve_v16x2_tap3(const __m512i coeffs[2],const __m512i round_v,__m512i s[3])234 SIMD_INLINE __m512i wiener_convolve_v16x2_tap3(const __m512i coeffs[2], const __m512i round_v,
235 __m512i s[3]) {
236 const __m512i dst = wiener_convolve_v_tap3_kernel_avx512(coeffs, round_v, s);
237 s[0] = s[2];
238 return dst;
239 }
240
wiener_convolve_v16x2_tap5(const __m512i coeffs[2],const __m512i round_v,__m512i s[5])241 SIMD_INLINE __m512i wiener_convolve_v16x2_tap5(const __m512i coeffs[2], const __m512i round_v,
242 __m512i s[5]) {
243 const __m512i dst = wiener_convolve_v_tap5_kernel_avx512(coeffs, round_v, s);
244 s[0] = s[2];
245 s[1] = s[3];
246 s[2] = s[4];
247 return dst;
248 }
249
wiener_convolve_v16x2_tap7(const __m512i coeffs[2],const __m512i round_v,__m512i s[7])250 SIMD_INLINE __m512i wiener_convolve_v16x2_tap7(const __m512i coeffs[2], const __m512i round_v,
251 __m512i s[7]) {
252 const __m512i dst = wiener_convolve_v_tap7_kernel_avx512(coeffs, round_v, s);
253 s[0] = s[2];
254 s[1] = s[3];
255 s[2] = s[4];
256 s[3] = s[5];
257 s[4] = s[6];
258 return dst;
259 }
260
wiener_convolve_v32_tap3(const __m512i coeffs[2],const __m512i round_v,__m512i s[3])261 SIMD_INLINE __m512i wiener_convolve_v32_tap3(const __m512i coeffs[2], const __m512i round_v,
262 __m512i s[3]) {
263 const __m512i dst = wiener_convolve_v_tap3_kernel_avx512(coeffs, round_v, s);
264 s[0] = s[1];
265 s[1] = s[2];
266 return dst;
267 }
268
wiener_convolve_v32_tap5(const __m512i coeffs[2],const __m512i round_v,__m512i s[5])269 static INLINE __m512i wiener_convolve_v32_tap5(const __m512i coeffs[2], const __m512i round_v,
270 __m512i s[5]) {
271 const __m512i dst = wiener_convolve_v_tap5_kernel_avx512(coeffs, round_v, s);
272 s[0] = s[1];
273 s[1] = s[2];
274 s[2] = s[3];
275 s[3] = s[4];
276 return dst;
277 }
278
wiener_convolve_v32_tap7(const __m512i coeffs[2],const __m512i round_v,__m512i s[7])279 static INLINE __m512i wiener_convolve_v32_tap7(const __m512i coeffs[2], const __m512i round_v,
280 __m512i s[7]) {
281 const __m512i dst = wiener_convolve_v_tap7_kernel_avx512(coeffs, round_v, s);
282 s[0] = s[1];
283 s[1] = s[2];
284 s[2] = s[3];
285 s[3] = s[4];
286 s[4] = s[5];
287 s[5] = s[6];
288 return dst;
289 }
290
pack_store_32x2_avx512(const __m512i res0,const __m512i res1,uint8_t * const dst,const ptrdiff_t stride)291 static INLINE void pack_store_32x2_avx512(const __m512i res0, const __m512i res1,
292 uint8_t* const dst, const ptrdiff_t stride) {
293 const __m512i d = _mm512_packus_epi16(res0, res1);
294 storeu_u8_32x2_avx512(d, dst, stride);
295 }
296
// Note: If this function crashes on Windows, pay attention to the pointer
// filter_x, which could be overwritten by other instructions. It's a bug in
// the Visual Studio compiler. To work around it, adjust the relative positions
// of the following 2 instructions, or even duplicate instruction 1 at several
// locations before coeffs_x is referenced.
// 1. const __m128i coeffs_x = xx_loadu_128(filter_x);
// 2. const int cnt_zero_coef = calc_zero_coef(filter_x, filter_y);
svt_av1_wiener_convolve_add_src_avx512(const uint8_t * const src,const ptrdiff_t src_stride,uint8_t * const dst,const ptrdiff_t dst_stride,const int16_t * const filter_x,const int16_t * const filter_y,const int32_t w,const int32_t h,const ConvolveParams * const conv_params)304 void svt_av1_wiener_convolve_add_src_avx512(const uint8_t* const src, const ptrdiff_t src_stride,
305 uint8_t* const dst, const ptrdiff_t dst_stride,
306 const int16_t* const filter_x,
307 const int16_t* const filter_y, const int32_t w,
308 const int32_t h,
309 const ConvolveParams* const conv_params) {
310 const int32_t bd = 8;
311 const int center_tap = (SUBPEL_TAPS - 1) / 2;
312 const int round_0 = WIENER_ROUND0_BITS;
313 const int round_1 = 2 * FILTER_BITS - WIENER_ROUND0_BITS;
314 const int cnt_zero_coef = calc_zero_coef(filter_x, filter_y);
315 const uint8_t* src_ptr = src - center_tap * src_stride - center_tap;
316 const __m256i round_h0 = _mm256_set1_epi16((1 << (round_0 - 1)));
317 const __m512i round_h0_512 = _mm512_set1_epi16((1 << (round_0 - 1)));
318 const __m256i round_h1 = _mm256_set1_epi16((1 << (bd + FILTER_BITS - round_0 - 1)));
319 const __m512i round_h1_512 = _mm512_set1_epi16((1 << (bd + FILTER_BITS - round_0 - 1)));
320 const __m256i round_v = _mm256_set1_epi32((1 << (round_1 - 1)) - (1 << (bd + round_1 - 1)));
321 const __m512i round_v_512 = _mm512_set1_epi32((1 << (round_1 - 1)) - (1 << (bd + round_1 - 1)));
322 const __m256i clamp_high = _mm256_set1_epi16(WIENER_CLAMP_LIMIT(round_0, bd) - 1);
323 const __m512i clamp_high_512 = _mm512_set1_epi16(WIENER_CLAMP_LIMIT(round_0, bd) - 1);
324 const __m128i zero_128 = _mm_setzero_si128();
325 const __m128i offset_0 = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
326 const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset_0);
327 const __m256i filter_coeffs_y = _mm256_broadcastsi128_si256(coeffs_y);
328 const __m512i filter_coeffs_y_512 = svt_mm512_broadcast_i64x2(coeffs_y);
329 int32_t width = w;
330 uint8_t* dst_ptr = dst;
331 __m256i filt[4], coeffs_h[4], coeffs_v[2];
332 __m512i filt_512[4], coeffs_h_512[4], coeffs_v_512[2];
333
334 (void)conv_params;
335 assert(!(w % 8));
336 assert(conv_params->round_0 == round_0);
337 assert(conv_params->round_1 == round_1);
338
339 filt[0] = yy_load_256(filt1_global_avx);
340 filt[1] = yy_load_256(filt2_global_avx);
341 filt_512[0] = zz_load_512(filt1_global_avx);
342 filt_512[1] = zz_load_512(filt2_global_avx);
343
344 if (!cnt_zero_coef) {
345 const __m128i coeffs_x = xx_loadu_128(filter_x);
346 if (width >= 32) {
347 int32_t x = width & ~63;
348
349 const __m512i filt_center_512 = zz_load_512(filt_center_tap7_global_avx);
350 filt_512[2] = zz_load_512(filt3_global_avx);
351 filt_512[3] = zz_load_512(filt4_global_avx);
352 populate_coeffs_8tap_avx512(coeffs_x, coeffs_h_512);
353 // coeffs 0 1 0 1 0 1 0 1
354 coeffs_v_512[0] = _mm512_shuffle_epi32(filter_coeffs_y_512, 0x00);
355 // coeffs 2 3 2 3 2 3 2 3
356 coeffs_v_512[1] = _mm512_shuffle_epi32(filter_coeffs_y_512, 0x55);
357
358 width -= x;
359 while (x) {
360 const uint8_t* src_p = src_ptr;
361 __m512i s[2][7];
362
363 wiener_convolve_h64_tap7_avx512(src_p,
364 coeffs_h_512,
365 filt_512,
366 filt_center_512,
367 round_h0_512,
368 round_h1_512,
369 clamp_high_512,
370 &s[0][0],
371 &s[1][0]);
372 src_p += src_stride;
373 wiener_convolve_h64_tap7_avx512(src_p,
374 coeffs_h_512,
375 filt_512,
376 filt_center_512,
377 round_h0_512,
378 round_h1_512,
379 clamp_high_512,
380 &s[0][1],
381 &s[1][1]);
382 src_p += src_stride;
383 wiener_convolve_h64_tap7_avx512(src_p,
384 coeffs_h_512,
385 filt_512,
386 filt_center_512,
387 round_h0_512,
388 round_h1_512,
389 clamp_high_512,
390 &s[0][2],
391 &s[1][2]);
392 src_p += src_stride;
393 wiener_convolve_h64_tap7_avx512(src_p,
394 coeffs_h_512,
395 filt_512,
396 filt_center_512,
397 round_h0_512,
398 round_h1_512,
399 clamp_high_512,
400 &s[0][3],
401 &s[1][3]);
402 src_p += src_stride;
403 wiener_convolve_h64_tap7_avx512(src_p,
404 coeffs_h_512,
405 filt_512,
406 filt_center_512,
407 round_h0_512,
408 round_h1_512,
409 clamp_high_512,
410 &s[0][4],
411 &s[1][4]);
412 src_p += src_stride;
413 wiener_convolve_h64_tap7_avx512(src_p,
414 coeffs_h_512,
415 filt_512,
416 filt_center_512,
417 round_h0_512,
418 round_h1_512,
419 clamp_high_512,
420 &s[0][5],
421 &s[1][5]);
422 src_p += src_stride;
423
424 int y = 0;
425 do {
426 wiener_convolve_h64_tap7_avx512(src_p,
427 coeffs_h_512,
428 filt_512,
429 filt_center_512,
430 round_h0_512,
431 round_h1_512,
432 clamp_high_512,
433 &s[0][6],
434 &s[1][6]);
435 src_p += src_stride;
436 const __m512i r0 = wiener_convolve_v32_tap7(coeffs_v_512, round_v_512, s[0]);
437 const __m512i r1 = wiener_convolve_v32_tap7(coeffs_v_512, round_v_512, s[1]);
438 convolve_store_64_avx512(r0, r1, dst_ptr + y * dst_stride);
439 } while (++y < h);
440
441 src_ptr += 64;
442 dst_ptr += 64;
443 x -= 64;
444 }
445
446 if (!width)
447 return;
448
449 x = width & ~31;
450 if (x) {
451 const uint8_t* src_p = src_ptr;
452 uint8_t* dst_p = dst_ptr;
453 __m512i s[2][7];
454
455 wiener_convolve_h32x2_tap7(src_p,
456 src_stride,
457 coeffs_h_512,
458 filt_512,
459 filt_center_512,
460 round_h0_512,
461 round_h1_512,
462 clamp_high_512,
463 &s[0][0],
464 &s[1][0]);
465 src_p += 2 * src_stride;
466 wiener_convolve_h32x2_tap7(src_p,
467 src_stride,
468 coeffs_h_512,
469 filt_512,
470 filt_center_512,
471 round_h0_512,
472 round_h1_512,
473 clamp_high_512,
474 &s[0][2],
475 &s[1][2]);
476 src_p += 2 * src_stride;
477 wiener_convolve_h32x2_tap7(src_p,
478 src_stride,
479 coeffs_h_512,
480 filt_512,
481 filt_center_512,
482 round_h0_512,
483 round_h1_512,
484 clamp_high_512,
485 &s[0][4],
486 &s[1][4]);
487 src_p += 2 * src_stride;
488
489 s[0][1] = _mm512_setr_m256i(_mm512_extracti64x4_epi64(s[0][0], 1),
490 _mm512_castsi512_si256(s[0][2]));
491 s[1][1] = _mm512_setr_m256i(_mm512_extracti64x4_epi64(s[1][0], 1),
492 _mm512_castsi512_si256(s[1][2]));
493 s[0][3] = _mm512_setr_m256i(_mm512_extracti64x4_epi64(s[0][2], 1),
494 _mm512_castsi512_si256(s[0][4]));
495 s[1][3] = _mm512_setr_m256i(_mm512_extracti64x4_epi64(s[1][2], 1),
496 _mm512_castsi512_si256(s[1][4]));
497
498 int y = h;
499 do {
500 wiener_convolve_h32x2_tap7(src_p,
501 src_stride,
502 coeffs_h_512,
503 filt_512,
504 filt_center_512,
505 round_h0_512,
506 round_h1_512,
507 clamp_high_512,
508 &s[0][6],
509 &s[1][6]);
510 s[0][5] = _mm512_setr_m256i(_mm512_extracti64x4_epi64(s[0][4], 1),
511 _mm512_castsi512_si256(s[0][6]));
512 s[1][5] = _mm512_setr_m256i(_mm512_extracti64x4_epi64(s[1][4], 1),
513 _mm512_castsi512_si256(s[1][6]));
514 src_p += 2 * src_stride;
515 const __m512i r0 = wiener_convolve_v16x2_tap7(coeffs_v_512, round_v_512, s[0]);
516 const __m512i r1 = wiener_convolve_v16x2_tap7(coeffs_v_512, round_v_512, s[1]);
517 if (y == 1) {
518 const __m512i d = _mm512_packus_epi16(r0, r1);
519 const __m256i d0 = _mm512_castsi512_si256(d);
520 _mm256_storeu_si256((__m256i*)dst_p, d0);
521 } else {
522 pack_store_32x2_avx512(r0, r1, dst_p, dst_stride);
523 }
524
525 dst_p += 2 * dst_stride;
526 y -= 2;
527 } while (y > 0);
528
529 src_ptr += 32;
530 dst_ptr += 32;
531 width -= 32;
532 }
533
534 if (!width)
535 return;
536 }
537
538 const __m256i filt_center = yy_load_256(filt_center_tap7_global_avx);
539 filt[2] = yy_load_256(filt3_global_avx);
540 filt[3] = yy_load_256(filt4_global_avx);
541 populate_coeffs_8tap_avx2(coeffs_x, coeffs_h);
542 // coeffs 0 1 0 1 0 1 0 1
543 coeffs_v[0] = _mm256_shuffle_epi32(filter_coeffs_y, 0x00);
544 // coeffs 2 3 2 3 2 3 2 3
545 coeffs_v[1] = _mm256_shuffle_epi32(filter_coeffs_y, 0x55);
546
547 if (width >= 16) {
548 const uint8_t* src_p = src_ptr;
549 uint8_t* dst_p = dst_ptr;
550 __m256i s[2][7];
551
552 wiener_convolve_h16x2_tap7(src_p,
553 src_stride,
554 coeffs_h,
555 filt,
556 filt_center,
557 round_h0,
558 round_h1,
559 clamp_high,
560 &s[0][0],
561 &s[1][0]);
562 src_p += 2 * src_stride;
563 wiener_convolve_h16x2_tap7(src_p,
564 src_stride,
565 coeffs_h,
566 filt,
567 filt_center,
568 round_h0,
569 round_h1,
570 clamp_high,
571 &s[0][2],
572 &s[1][2]);
573 src_p += 2 * src_stride;
574 wiener_convolve_h16x2_tap7(src_p,
575 src_stride,
576 coeffs_h,
577 filt,
578 filt_center,
579 round_h0,
580 round_h1,
581 clamp_high,
582 &s[0][4],
583 &s[1][4]);
584 src_p += 2 * src_stride;
585
586 s[0][1] = _mm256_setr_m128i(_mm256_extracti128_si256(s[0][0], 1),
587 _mm256_castsi256_si128(s[0][2]));
588 s[1][1] = _mm256_setr_m128i(_mm256_extracti128_si256(s[1][0], 1),
589 _mm256_castsi256_si128(s[1][2]));
590 s[0][3] = _mm256_setr_m128i(_mm256_extracti128_si256(s[0][2], 1),
591 _mm256_castsi256_si128(s[0][4]));
592 s[1][3] = _mm256_setr_m128i(_mm256_extracti128_si256(s[1][2], 1),
593 _mm256_castsi256_si128(s[1][4]));
594
595 int y = h;
596 do {
597 wiener_convolve_h16x2_tap7(src_p,
598 src_stride,
599 coeffs_h,
600 filt,
601 filt_center,
602 round_h0,
603 round_h1,
604 clamp_high,
605 &s[0][6],
606 &s[1][6]);
607 s[0][5] = _mm256_setr_m128i(_mm256_extracti128_si256(s[0][4], 1),
608 _mm256_castsi256_si128(s[0][6]));
609 s[1][5] = _mm256_setr_m128i(_mm256_extracti128_si256(s[1][4], 1),
610 _mm256_castsi256_si128(s[1][6]));
611 src_p += 2 * src_stride;
612 const __m256i r0 = wiener_convolve_v8x2_tap7(coeffs_v, round_v, s[0]);
613 const __m256i r1 = wiener_convolve_v8x2_tap7(coeffs_v, round_v, s[1]);
614 if (y == 1) {
615 const __m256i d = _mm256_packus_epi16(r0, r1);
616 const __m128i d0 = _mm256_castsi256_si128(d);
617 _mm_storeu_si128((__m128i*)dst_p, d0);
618 } else {
619 pack_store_16x2_avx2(r0, r1, dst_p, dst_stride);
620 }
621
622 dst_p += 2 * dst_stride;
623 y -= 2;
624 } while (y > 0);
625
626 src_ptr += 16;
627 dst_ptr += 16;
628 width -= 16;
629 }
630
631 if (width) {
632 const uint8_t* src_p = src_ptr;
633 __m256i s[7];
634
635 assert(width == 8);
636
637 s[0] = wiener_convolve_h8x2_tap7(
638 src_p, src_stride, coeffs_h, filt, filt_center, round_h0, round_h1, clamp_high);
639 src_p += 2 * src_stride;
640 s[2] = wiener_convolve_h8x2_tap7(
641 src_p, src_stride, coeffs_h, filt, filt_center, round_h0, round_h1, clamp_high);
642 src_p += 2 * src_stride;
643 s[4] = wiener_convolve_h8x2_tap7(
644 src_p, src_stride, coeffs_h, filt, filt_center, round_h0, round_h1, clamp_high);
645 src_p += 2 * src_stride;
646
647 s[1] = _mm256_setr_m128i(_mm256_extracti128_si256(s[0], 1),
648 _mm256_castsi256_si128(s[2]));
649 s[3] = _mm256_setr_m128i(_mm256_extracti128_si256(s[2], 1),
650 _mm256_castsi256_si128(s[4]));
651
652 int y = h;
653 do {
654 s[6] = wiener_convolve_h8x2_tap7(
655 src_p, src_stride, coeffs_h, filt, filt_center, round_h0, round_h1, clamp_high);
656 s[5] = _mm256_setr_m128i(_mm256_extracti128_si256(s[4], 1),
657 _mm256_castsi256_si128(s[6]));
658 src_p += 2 * src_stride;
659 const __m256i r = wiener_convolve_v8x2_tap7(coeffs_v, round_v, s);
660 if (y == 1) {
661 const __m256i d = _mm256_packus_epi16(r, r);
662 const __m128i d0 = _mm256_castsi256_si128(d);
663 _mm_storel_epi64((__m128i*)dst_ptr, d0);
664 } else {
665 pack_store_8x2_avx2(r, dst_ptr, dst_stride);
666 }
667
668 dst_ptr += 2 * dst_stride;
669 y -= 2;
670 } while (y > 0);
671 }
672 } else if (cnt_zero_coef == 1) {
673 __m128i coeffs_x = xx_loadu_128(filter_x);
674 src_ptr += src_stride + 1;
675
676 if (width >= 32) {
677 int32_t x = width & ~63;
678
679 const __m512i filt_center_512 = zz_load_512(filt_center_tap5_global_avx);
680 filt_512[2] = zz_load_512(filt3_global_avx);
681 populate_coeffs_6tap_avx512(coeffs_x, coeffs_h_512);
682 // coeffs 1 2 1 2 1 2 1 2
683 coeffs_v_512[0] = _mm512_shuffle_epi8(filter_coeffs_y_512,
684 _mm512_set1_epi32(0x05040302u));
685 // coeffs 3 4 3 4 3 4 3 4
686 coeffs_v_512[1] = _mm512_shuffle_epi8(filter_coeffs_y_512,
687 _mm512_set1_epi32(0x09080706u));
688
689 width -= x;
690 while (x) {
691 const uint8_t* src_p = src_ptr;
692 __m512i s[2][5];
693
694 wiener_convolve_h64_tap5(src_p,
695 coeffs_h_512,
696 filt_512,
697 filt_center_512,
698 round_h0_512,
699 round_h1_512,
700 clamp_high_512,
701 &s[0][0],
702 &s[1][0]);
703 src_p += src_stride;
704 wiener_convolve_h64_tap5(src_p,
705 coeffs_h_512,
706 filt_512,
707 filt_center_512,
708 round_h0_512,
709 round_h1_512,
710 clamp_high_512,
711 &s[0][1],
712 &s[1][1]);
713 src_p += src_stride;
714 wiener_convolve_h64_tap5(src_p,
715 coeffs_h_512,
716 filt_512,
717 filt_center_512,
718 round_h0_512,
719 round_h1_512,
720 clamp_high_512,
721 &s[0][2],
722 &s[1][2]);
723 src_p += src_stride;
724 wiener_convolve_h64_tap5(src_p,
725 coeffs_h_512,
726 filt_512,
727 filt_center_512,
728 round_h0_512,
729 round_h1_512,
730 clamp_high_512,
731 &s[0][3],
732 &s[1][3]);
733 src_p += src_stride;
734
735 int y = 0;
736 do {
737 wiener_convolve_h64_tap5(src_p,
738 coeffs_h_512,
739 filt_512,
740 filt_center_512,
741 round_h0_512,
742 round_h1_512,
743 clamp_high_512,
744 &s[0][4],
745 &s[1][4]);
746 src_p += src_stride;
747 const __m512i r0 = wiener_convolve_v32_tap5(coeffs_v_512, round_v_512, s[0]);
748 const __m512i r1 = wiener_convolve_v32_tap5(coeffs_v_512, round_v_512, s[1]);
749 convolve_store_64_avx512(r0, r1, dst_ptr + y * dst_stride);
750 } while (++y < h);
751
752 src_ptr += 64;
753 dst_ptr += 64;
754 x -= 64;
755 }
756
757 if (!width)
758 return;
759
760 x = width & ~31;
761 if (x) {
762 const uint8_t* src_p = src_ptr;
763 uint8_t* dst_p = dst_ptr;
764 __m512i s[2][5];
765
766 wiener_convolve_h32x2_tap5(src_p,
767 src_stride,
768 coeffs_h_512,
769 filt_512,
770 filt_center_512,
771 round_h0_512,
772 round_h1_512,
773 clamp_high_512,
774 &s[0][0],
775 &s[1][0]);
776 src_p += 2 * src_stride;
777 wiener_convolve_h32x2_tap5(src_p,
778 src_stride,
779 coeffs_h_512,
780 filt_512,
781 filt_center_512,
782 round_h0_512,
783 round_h1_512,
784 clamp_high_512,
785 &s[0][2],
786 &s[1][2]);
787 src_p += 2 * src_stride;
788
789 s[0][1] = _mm512_setr_m256i(_mm512_extracti64x4_epi64(s[0][0], 1),
790 _mm512_castsi512_si256(s[0][2]));
791 s[1][1] = _mm512_setr_m256i(_mm512_extracti64x4_epi64(s[1][0], 1),
792 _mm512_castsi512_si256(s[1][2]));
793
794 int y = h;
795 do {
796 wiener_convolve_h32x2_tap5(src_p,
797 src_stride,
798 coeffs_h_512,
799 filt_512,
800 filt_center_512,
801 round_h0_512,
802 round_h1_512,
803 clamp_high_512,
804 &s[0][4],
805 &s[1][4]);
806 s[0][3] = _mm512_setr_m256i(_mm512_extracti64x4_epi64(s[0][2], 1),
807 _mm512_castsi512_si256(s[0][4]));
808 s[1][3] = _mm512_setr_m256i(_mm512_extracti64x4_epi64(s[1][2], 1),
809 _mm512_castsi512_si256(s[1][4]));
810 src_p += 2 * src_stride;
811 const __m512i r0 = wiener_convolve_v16x2_tap5(coeffs_v_512, round_v_512, s[0]);
812 const __m512i r1 = wiener_convolve_v16x2_tap5(coeffs_v_512, round_v_512, s[1]);
813 if (y == 1) {
814 const __m512i d = _mm512_packus_epi16(r0, r1);
815 const __m256i d0 = _mm512_castsi512_si256(d);
816 _mm256_storeu_si256((__m256i*)dst_p, d0);
817 } else {
818 pack_store_32x2_avx512(r0, r1, dst_p, dst_stride);
819 }
820
821 dst_p += 2 * dst_stride;
822 y -= 2;
823 } while (y > 0);
824
825 src_ptr += 32;
826 dst_ptr += 32;
827 width -= 32;
828 }
829
830 if (!width)
831 return;
832 }
833
834 const __m256i filt_center = yy_load_256(filt_center_tap5_global_avx);
835 filt[2] = yy_load_256(filt3_global_avx);
836 coeffs_x = xx_loadu_128(filter_x);
837 populate_coeffs_6tap_avx2(coeffs_x, coeffs_h);
838 // coeffs 1 2 1 2 1 2 1 2
839 coeffs_v[0] = _mm256_shuffle_epi8(filter_coeffs_y, _mm256_set1_epi32(0x05040302u));
840 // coeffs 3 4 3 4 3 4 3 4
841 coeffs_v[1] = _mm256_shuffle_epi8(filter_coeffs_y, _mm256_set1_epi32(0x09080706u));
842
843 if (width >= 16) {
844 const uint8_t* src_p = src_ptr;
845 uint8_t* dst_p = dst_ptr;
846 __m256i s[2][5];
847
848 wiener_convolve_h16x2_tap5(src_p,
849 src_stride,
850 coeffs_h,
851 filt,
852 filt_center,
853 round_h0,
854 round_h1,
855 clamp_high,
856 &s[0][0],
857 &s[1][0]);
858 src_p += 2 * src_stride;
859 wiener_convolve_h16x2_tap5(src_p,
860 src_stride,
861 coeffs_h,
862 filt,
863 filt_center,
864 round_h0,
865 round_h1,
866 clamp_high,
867 &s[0][2],
868 &s[1][2]);
869 src_p += 2 * src_stride;
870
871 s[0][1] = _mm256_setr_m128i(_mm256_extracti128_si256(s[0][0], 1),
872 _mm256_castsi256_si128(s[0][2]));
873 s[1][1] = _mm256_setr_m128i(_mm256_extracti128_si256(s[1][0], 1),
874 _mm256_castsi256_si128(s[1][2]));
875
876 int y = h;
877 do {
878 wiener_convolve_h16x2_tap5(src_p,
879 src_stride,
880 coeffs_h,
881 filt,
882 filt_center,
883 round_h0,
884 round_h1,
885 clamp_high,
886 &s[0][4],
887 &s[1][4]);
888 s[0][3] = _mm256_setr_m128i(_mm256_extracti128_si256(s[0][2], 1),
889 _mm256_castsi256_si128(s[0][4]));
890 s[1][3] = _mm256_setr_m128i(_mm256_extracti128_si256(s[1][2], 1),
891 _mm256_castsi256_si128(s[1][4]));
892 src_p += 2 * src_stride;
893 const __m256i r0 = wiener_convolve_v8x2_tap5(coeffs_v, round_v, s[0]);
894 const __m256i r1 = wiener_convolve_v8x2_tap5(coeffs_v, round_v, s[1]);
895 if (y == 1) {
896 const __m256i d = _mm256_packus_epi16(r0, r1);
897 const __m128i d0 = _mm256_castsi256_si128(d);
898 _mm_storeu_si128((__m128i*)dst_p, d0);
899 } else {
900 pack_store_16x2_avx2(r0, r1, dst_p, dst_stride);
901 }
902
903 dst_p += 2 * dst_stride;
904 y -= 2;
905 } while (y > 0);
906
907 src_ptr += 16;
908 dst_ptr += 16;
909 width -= 16;
910 }
911
912 if (width) {
913 const uint8_t* src_p = src_ptr;
914 __m256i s[5];
915
916 assert(width == 8);
917
918 s[0] = wiener_convolve_h8x2_tap5(
919 src_p, src_stride, coeffs_h, filt, filt_center, round_h0, round_h1, clamp_high);
920 src_p += 2 * src_stride;
921 s[2] = wiener_convolve_h8x2_tap5(
922 src_p, src_stride, coeffs_h, filt, filt_center, round_h0, round_h1, clamp_high);
923 src_p += 2 * src_stride;
924
925 s[1] = _mm256_setr_m128i(_mm256_extracti128_si256(s[0], 1),
926 _mm256_castsi256_si128(s[2]));
927
928 int y = h;
929 do {
930 s[4] = wiener_convolve_h8x2_tap5(
931 src_p, src_stride, coeffs_h, filt, filt_center, round_h0, round_h1, clamp_high);
932 s[3] = _mm256_setr_m128i(_mm256_extracti128_si256(s[2], 1),
933 _mm256_castsi256_si128(s[4]));
934 src_p += 2 * src_stride;
935 const __m256i r = wiener_convolve_v8x2_tap5(coeffs_v, round_v, s);
936 if (y == 1) {
937 const __m256i d = _mm256_packus_epi16(r, r);
938 const __m128i d0 = _mm256_castsi256_si128(d);
939 _mm_storel_epi64((__m128i*)dst_ptr, d0);
940 } else {
941 pack_store_8x2_avx2(r, dst_ptr, dst_stride);
942 }
943
944 dst_ptr += 2 * dst_stride;
945 y -= 2;
946 } while (y > 0);
947 }
948 } else {
949 const __m128i coeffs_x = xx_loadu_128(filter_x);
950 src_ptr += 2 * src_stride + 2;
951
952 if (width >= 32) {
953 int32_t x = width & ~63;
954
955 const __m512i filt_center_512 = zz_load_512(filt_center_tap3_global_avx);
956 populate_coeffs_4tap_avx512(coeffs_x, coeffs_h_512);
957 // coeffs 2 3 2 3 2 3 2 3
958 coeffs_v_512[0] = _mm512_shuffle_epi32(filter_coeffs_y_512, 0x55);
959 // coeffs 4 5 4 5 4 5 4 5
960 coeffs_v_512[1] = _mm512_shuffle_epi32(filter_coeffs_y_512, 0xaa);
961
962 width -= x;
963 while (x) {
964 const uint8_t* src_p = src_ptr;
965 __m512i s[2][3];
966
967 wiener_convolve_h64_tap3(src_p,
968 coeffs_h_512,
969 filt_512,
970 filt_center_512,
971 round_h0_512,
972 round_h1_512,
973 clamp_high_512,
974 &s[0][0],
975 &s[1][0]);
976 src_p += src_stride;
977 wiener_convolve_h64_tap3(src_p,
978 coeffs_h_512,
979 filt_512,
980 filt_center_512,
981 round_h0_512,
982 round_h1_512,
983 clamp_high_512,
984 &s[0][1],
985 &s[1][1]);
986 src_p += src_stride;
987
988 int y = 0;
989 do {
990 wiener_convolve_h64_tap3(src_p,
991 coeffs_h_512,
992 filt_512,
993 filt_center_512,
994 round_h0_512,
995 round_h1_512,
996 clamp_high_512,
997 &s[0][2],
998 &s[1][2]);
999 src_p += src_stride;
1000 const __m512i r0 = wiener_convolve_v32_tap3(coeffs_v_512, round_v_512, s[0]);
1001 const __m512i r1 = wiener_convolve_v32_tap3(coeffs_v_512, round_v_512, s[1]);
1002 convolve_store_64_avx512(r0, r1, dst_ptr + y * dst_stride);
1003 } while (++y < h);
1004
1005 src_ptr += 64;
1006 dst_ptr += 64;
1007 x -= 64;
1008 }
1009
1010 if (!width)
1011 return;
1012
1013 x = width & ~31;
1014 if (x) {
1015 const uint8_t* src_p = src_ptr;
1016 uint8_t* dst_p = dst_ptr;
1017 __m512i s[2][3];
1018
1019 wiener_convolve_h32x2_tap3(src_p,
1020 src_stride,
1021 coeffs_h_512,
1022 filt_512,
1023 filt_center_512,
1024 round_h0_512,
1025 round_h1_512,
1026 clamp_high_512,
1027 &s[0][0],
1028 &s[1][0]);
1029 src_p += 2 * src_stride;
1030
1031 int y = h;
1032 do {
1033 wiener_convolve_h32x2_tap3(src_p,
1034 src_stride,
1035 coeffs_h_512,
1036 filt_512,
1037 filt_center_512,
1038 round_h0_512,
1039 round_h1_512,
1040 clamp_high_512,
1041 &s[0][2],
1042 &s[1][2]);
1043 s[0][1] = _mm512_setr_m256i(_mm512_extracti64x4_epi64(s[0][0], 1),
1044 _mm512_castsi512_si256(s[0][2]));
1045 s[1][1] = _mm512_setr_m256i(_mm512_extracti64x4_epi64(s[1][0], 1),
1046 _mm512_castsi512_si256(s[1][2]));
1047 src_p += 2 * src_stride;
1048 const __m512i r0 = wiener_convolve_v16x2_tap3(coeffs_v_512, round_v_512, s[0]);
1049 const __m512i r1 = wiener_convolve_v16x2_tap3(coeffs_v_512, round_v_512, s[1]);
1050 if (y == 1) {
1051 const __m512i d = _mm512_packus_epi16(r0, r1);
1052 const __m256i d0 = _mm512_castsi512_si256(d);
1053 _mm256_storeu_si256((__m256i*)dst_p, d0);
1054 } else {
1055 pack_store_32x2_avx512(r0, r1, dst_p, dst_stride);
1056 }
1057
1058 dst_p += 2 * dst_stride;
1059 y -= 2;
1060 } while (y > 0);
1061
1062 src_ptr += 32;
1063 dst_ptr += 32;
1064 width -= 32;
1065 }
1066
1067 if (!width)
1068 return;
1069 }
1070
1071 const __m256i filt_center = yy_load_256(filt_center_tap3_global_avx);
1072 populate_coeffs_4tap_avx2(coeffs_x, coeffs_h);
1073 // coeffs 2 3 2 3 2 3 2 3
1074 coeffs_v[0] = _mm256_shuffle_epi32(filter_coeffs_y, 0x55);
1075 // coeffs 4 5 4 5 4 5 4 5
1076 coeffs_v[1] = _mm256_shuffle_epi32(filter_coeffs_y, 0xaa);
1077
1078 if (width >= 16) {
1079 const uint8_t* src_p = src_ptr;
1080 uint8_t* dst_p = dst_ptr;
1081 __m256i s[2][3];
1082
1083 wiener_convolve_h16x2_tap3(src_p,
1084 src_stride,
1085 coeffs_h,
1086 filt,
1087 filt_center,
1088 round_h0,
1089 round_h1,
1090 clamp_high,
1091 &s[0][0],
1092 &s[1][0]);
1093 src_p += 2 * src_stride;
1094
1095 int y = h;
1096 do {
1097 wiener_convolve_h16x2_tap3(src_p,
1098 src_stride,
1099 coeffs_h,
1100 filt,
1101 filt_center,
1102 round_h0,
1103 round_h1,
1104 clamp_high,
1105 &s[0][2],
1106 &s[1][2]);
1107 s[0][1] = _mm256_setr_m128i(_mm256_extracti128_si256(s[0][0], 1),
1108 _mm256_castsi256_si128(s[0][2]));
1109 s[1][1] = _mm256_setr_m128i(_mm256_extracti128_si256(s[1][0], 1),
1110 _mm256_castsi256_si128(s[1][2]));
1111 src_p += 2 * src_stride;
1112 const __m256i r0 = wiener_convolve_v8x2_tap3(coeffs_v, round_v, s[0]);
1113 const __m256i r1 = wiener_convolve_v8x2_tap3(coeffs_v, round_v, s[1]);
1114 if (y == 1) {
1115 const __m256i d = _mm256_packus_epi16(r0, r1);
1116 const __m128i d0 = _mm256_castsi256_si128(d);
1117 _mm_storeu_si128((__m128i*)dst_p, d0);
1118 } else {
1119 pack_store_16x2_avx2(r0, r1, dst_p, dst_stride);
1120 }
1121
1122 dst_p += 2 * dst_stride;
1123 y -= 2;
1124 } while (y > 0);
1125
1126 src_ptr += 16;
1127 dst_ptr += 16;
1128 width -= 16;
1129 }
1130
1131 if (width) {
1132 const uint8_t* src_p = src_ptr;
1133 __m256i s[5];
1134
1135 assert(width == 8);
1136
1137 s[0] = wiener_convolve_h8x2_tap3(
1138 src_p, src_stride, coeffs_h, filt, filt_center, round_h0, round_h1, clamp_high);
1139 src_p += 2 * src_stride;
1140
1141 int y = h;
1142 do {
1143 s[2] = wiener_convolve_h8x2_tap3(
1144 src_p, src_stride, coeffs_h, filt, filt_center, round_h0, round_h1, clamp_high);
1145 s[1] = _mm256_setr_m128i(_mm256_extracti128_si256(s[0], 1),
1146 _mm256_castsi256_si128(s[2]));
1147 src_p += 2 * src_stride;
1148 const __m256i r = wiener_convolve_v8x2_tap3(coeffs_v, round_v, s);
1149 if (y == 1) {
1150 const __m256i d = _mm256_packus_epi16(r, r);
1151 const __m128i d0 = _mm256_castsi256_si128(d);
1152 _mm_storel_epi64((__m128i*)dst_ptr, d0);
1153 } else {
1154 pack_store_8x2_avx2(r, dst_ptr, dst_stride);
1155 }
1156
1157 dst_ptr += 2 * dst_stride;
1158 y -= 2;
1159 } while (y > 0);
1160 }
1161 }
1162 }
1163
1164 #endif // EN_AVX512_SUPPORT
1165