/*
 * Copyright(c) 2019 Intel Corporation
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at https://www.aomedia.org/license/software-license. If the Alliance
 * for Open Media Patent License 1.0 was not distributed with this source code
 * in the PATENTS file, you can obtain it at
 * https://www.aomedia.org/license/patent-license.
 */

#include "EbDefinitions.h"

#if EN_AVX512_SUPPORT
#include <immintrin.h>
#include "common_dsp_rtcd.h"
#include "convolve.h"
#include "convolve_avx2.h"
#include "convolve_avx512.h"
#include "EbMemory_SSE4_1.h"

static INLINE __m512i jnt_2d_comp_avg_round_32_avx512(const __m512i src[2]) {
    const __m512i round = _mm512_set1_epi32(1 << (COMPOUND_ROUND1_BITS - 1));
    const __m512i dst0 = _mm512_add_epi32(src[0], round);
    const __m512i dst1 = _mm512_add_epi32(src[1], round);
    const __m512i d0 = _mm512_srai_epi32(dst0, COMPOUND_ROUND1_BITS);
    const __m512i d1 = _mm512_srai_epi32(dst1, COMPOUND_ROUND1_BITS);
    return _mm512_packs_epi32(d0, d1);
}

static INLINE __m512i jnt_2d_comp_avg_round_half_pel_avx512(const __m512i src) {
    const __m512i round = _mm512_set1_epi16(1);
    const __m512i dst = _mm512_add_epi16(src, round);
    return _mm512_srai_epi16(dst, 1);
}

static INLINE __m512i jnt_2d_comp_avg_round_pack_32_avx512(const __m512i res[2],
                                                           const __m512i factor,
                                                           const __m512i offset,
                                                           const __m512i dst) {
    const __m512i r = jnt_2d_comp_avg_round_32_avx512(res);
    __m512i d[2];

    d[0] = _mm512_unpacklo_epi16(dst, r);
    d[1] = _mm512_unpackhi_epi16(dst, r);
    d[0] = _mm512_madd_epi16(d[0], factor);
    d[1] = _mm512_madd_epi16(d[1], factor);
    d[0] = _mm512_add_epi32(d[0], offset);
    d[1] = _mm512_add_epi32(d[1], offset);
    d[0] = _mm512_srai_epi32(d[0], 8);
    d[1] = _mm512_srai_epi32(d[1], 8);
    return _mm512_packs_epi32(d[0], d[1]);
}
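
/*
 * Illustrative scalar sketch (not compiled; the variable names here are
 * hypothetical) of what jnt_2d_comp_avg_round_pack_32_avx512() computes per
 * pixel, assuming "factor" packs fwd_offset into the low and bck_offset into
 * the high 16 bits of each 32-bit lane:
 *
 *   int32_t r = (res + (1 << (COMPOUND_ROUND1_BITS - 1))) >> COMPOUND_ROUND1_BITS;
 *   int32_t d = (dst * fwd_offset + r * bck_offset + offset) >> 8;
 *
 * _mm512_madd_epi16 on the (dst, r) interleave performs both multiplies and
 * their sum in one instruction; the shift by 8 is round_bits (4) plus
 * DIST_PRECISION_BITS (4), with the rounding term folded into "offset".
 */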

static INLINE __m512i jnt_2d_comp_avg_round_pack_half_pel_avx512(const __m512i res,
                                                                 const __m512i factor,
                                                                 const __m512i offset,
                                                                 const __m512i dst) {
    const __m512i r = jnt_2d_comp_avg_round_half_pel_avx512(res);
    __m512i d[2];

    d[0] = _mm512_unpacklo_epi16(dst, r);
    d[1] = _mm512_unpackhi_epi16(dst, r);
    d[0] = _mm512_madd_epi16(d[0], factor);
    d[1] = _mm512_madd_epi16(d[1], factor);
    d[0] = _mm512_add_epi32(d[0], offset);
    d[1] = _mm512_add_epi32(d[1], offset);
    d[0] = _mm512_srai_epi32(d[0], 8);
    d[1] = _mm512_srai_epi32(d[1], 8);
    return _mm512_packs_epi32(d[0], d[1]);
}

SIMD_INLINE void jnt_2d_comp_avg_round_store_32x2_avx512(const __m512i r0[2], const __m512i r1[2],
                                                         const __m512i factor, const __m512i offset,
                                                         const ConvBufType *const dst,
                                                         const int32_t dst_stride,
                                                         uint8_t *const dst8,
                                                         const int32_t dst8_stride) {
    __m512i d[2];

    d[0] = zz_loadu_512(dst + 0 * dst_stride);
    d[1] = zz_loadu_512(dst + 1 * dst_stride);
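    /* _mm512_packs_epi32 and the unpack operations work within 128-bit lanes,
     * so the convolve results arrive with their 128-bit blocks in 0,2,1,3
     * order. Permuting the linearly loaded reference rows into that same block
     * order lets the two operands be combined lane-for-lane before the final
     * pack and store. */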
    d[0] = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), d[0]);
    d[1] = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), d[1]);
    d[0] = jnt_2d_comp_avg_round_pack_32_avx512(r0, factor, offset, d[0]);
    d[1] = jnt_2d_comp_avg_round_pack_32_avx512(r1, factor, offset, d[1]);
    xy_y_pack_store_32x2_avx512(d[0], d[1], dst8, dst8_stride);
}

SIMD_INLINE void jnt_2d_comp_avg_round_store_64_avx512(const __m512i r0[2], const __m512i r1[2],
                                                       const __m512i factor, const __m512i offset,
                                                       const ConvBufType *const dst,
                                                       uint8_t *const dst8) {
    __m512i d[2];

    jnt_loadu_u16_8x4x2_avx512(dst, 32, d);
    d[0] = jnt_2d_comp_avg_round_pack_32_avx512(r0, factor, offset, d[0]);
    d[1] = jnt_2d_comp_avg_round_pack_32_avx512(r1, factor, offset, d[1]);
    convolve_store_64_avx512(d[0], d[1], dst8);
}

SIMD_INLINE void jnt_2d_comp_avg_round_store_half_pel_32x2_avx512(
    const __m512i res[2], const __m512i factor, const __m512i offset, const ConvBufType *const dst,
    const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
    __m512i d[2];

    d[0] = zz_loadu_512(dst + 0 * dst_stride);
    d[1] = zz_loadu_512(dst + 1 * dst_stride);
    d[0] = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), d[0]);
    d[1] = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), d[1]);
    d[0] = jnt_2d_comp_avg_round_pack_half_pel_avx512(res[0], factor, offset, d[0]);
    d[1] = jnt_2d_comp_avg_round_pack_half_pel_avx512(res[1], factor, offset, d[1]);
    xy_y_pack_store_32x2_avx512(d[0], d[1], dst8, dst8_stride);
}

SIMD_INLINE void jnt_2d_comp_avg_round_store_half_pel_64_avx512(const __m512i res[2],
                                                                const __m512i factor,
                                                                const __m512i offset,
                                                                const ConvBufType *const dst,
                                                                uint8_t *const dst8) {
    __m512i d[2];

    jnt_loadu_u16_8x4x2_avx512(dst, 32, d);
    d[0] = jnt_2d_comp_avg_round_pack_half_pel_avx512(res[0], factor, offset, d[0]);
    d[1] = jnt_2d_comp_avg_round_pack_half_pel_avx512(res[1], factor, offset, d[1]);
    convolve_store_64_avx512(d[0], d[1], dst8);
}

static INLINE __m512i jnt_2d_round_32_avx512(const __m512i src[2], const __m512i offset) {
    const __m512i dst0 = _mm512_add_epi32(src[0], offset);
    const __m512i dst1 = _mm512_add_epi32(src[1], offset);
    const __m512i d0 = _mm512_srai_epi32(dst0, COMPOUND_ROUND1_BITS);
    const __m512i d1 = _mm512_srai_epi32(dst1, COMPOUND_ROUND1_BITS);
    return _mm512_packs_epi32(d0, d1);
}

static INLINE __m512i jnt_2d_round_half_pel_avx512(const __m512i src, const __m512i offset) {
    const __m512i dst0 = _mm512_add_epi16(src, offset);
    return _mm512_srai_epi16(dst0, 1);
}

SIMD_INLINE void jnt_2d_avg_round_store_32x2_avx512(const __m512i r0[2], const __m512i r1[2],
                                                    const __m512i offset,
                                                    const ConvBufType *const dst,
                                                    const int32_t dst_stride, uint8_t *const dst8,
                                                    const int32_t dst8_stride) {
    __m512i r[2], d[2];

    r[0] = jnt_2d_round_32_avx512(r0, offset);
    r[1] = jnt_2d_round_32_avx512(r1, offset);
    d[0] = zz_loadu_512(dst + 0 * dst_stride);
    d[1] = zz_loadu_512(dst + 1 * dst_stride);
    d[0] = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), d[0]);
    d[1] = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), d[1]);
    d[0] = jnt_avg_32_avx512(r[0], d[0]);
    d[1] = jnt_avg_32_avx512(r[1], d[1]);
    xy_y_pack_store_32x2_avx512(d[0], d[1], dst8, dst8_stride);
}

SIMD_INLINE void jnt_2d_avg_round_store_64_avx512(const __m512i r0[2], const __m512i r1[2],
                                                  const __m512i offset,
                                                  const ConvBufType *const dst,
                                                  uint8_t *const dst8) {
    __m512i r[2], d[2];

    r[0] = jnt_2d_round_32_avx512(r0, offset);
    r[1] = jnt_2d_round_32_avx512(r1, offset);
    jnt_loadu_u16_8x4x2_avx512(dst, 32, d);
    d[0] = jnt_avg_32_avx512(r[0], d[0]);
    d[1] = jnt_avg_32_avx512(r[1], d[1]);
    convolve_store_64_avx512(d[0], d[1], dst8);
}

static INLINE void jnt_2d_avg_round_store_half_pel_32x2_avx512(
    const __m512i res[2], const __m512i offset, const ConvBufType *const dst,
    const int32_t dst_stride, uint8_t *const dst8, const int32_t dst8_stride) {
    __m512i r[2], d[2];

    r[0] = jnt_2d_round_half_pel_avx512(res[0], offset);
    r[1] = jnt_2d_round_half_pel_avx512(res[1], offset);
    d[0] = zz_loadu_512(dst + 0 * dst_stride);
    d[1] = zz_loadu_512(dst + 1 * dst_stride);
    d[0] = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), d[0]);
    d[1] = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), d[1]);
    d[0] = jnt_avg_32_avx512(r[0], d[0]);
    d[1] = jnt_avg_32_avx512(r[1], d[1]);
    xy_y_pack_store_32x2_avx512(d[0], d[1], dst8, dst8_stride);
}

static INLINE void jnt_2d_avg_round_store_half_pel_64_avx512(const __m512i res[2],
                                                             const __m512i offset,
                                                             const ConvBufType *const dst,
                                                             uint8_t *const dst8) {
    __m512i r[2], d[2];

    r[0] = jnt_2d_round_half_pel_avx512(res[0], offset);
    r[1] = jnt_2d_round_half_pel_avx512(res[1], offset);
    jnt_loadu_u16_8x4x2_avx512(dst, 32, d);
    d[0] = jnt_avg_32_avx512(r[0], d[0]);
    d[1] = jnt_avg_32_avx512(r[1], d[1]);
    convolve_store_64_avx512(d[0], d[1], dst8);
}

static INLINE void jnt_2d_no_avg_store_32x2_avx512(const __m512i src0, const __m512i src1,
                                                   ConvBufType *const dst, const int32_t stride) {
    const __m512i d0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), src0);
    const __m512i d1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), src1);
    _mm512_storeu_si512((__m512i *)dst, d0);
    _mm512_storeu_si512((__m512i *)(dst + stride), d1);
}

static INLINE void jnt_2d_no_avg_round_store_32x2_avx512(const __m512i r0[2], const __m512i r1[2],
                                                         const __m512i offset,
                                                         ConvBufType *const dst,
                                                         const int32_t stride) {
    const __m512i d0 = jnt_2d_round_32_avx512(r0, offset);
    const __m512i d1 = jnt_2d_round_32_avx512(r1, offset);
    jnt_2d_no_avg_store_32x2_avx512(d0, d1, dst, stride);
}

static INLINE void jnt_2d_no_avg_round_store_64_avx512(const __m512i r0[2], const __m512i r1[2],
                                                       const __m512i offset,
                                                       ConvBufType *const dst) {
    const __m512i d0 = jnt_2d_round_32_avx512(r0, offset);
    const __m512i d1 = jnt_2d_round_32_avx512(r1, offset);
    jnt_no_avg_store_32x2_avx512(d0, d1, dst, 32);
}

static INLINE void jnt_2d_no_avg_round_store_half_pel_32x2_avx512(const __m512i res[2],
                                                                  const __m512i offset,
                                                                  ConvBufType *const dst,
                                                                  const int32_t stride) {
    const __m512i d0 = jnt_2d_round_half_pel_avx512(res[0], offset);
    const __m512i d1 = jnt_2d_round_half_pel_avx512(res[1], offset);
    jnt_2d_no_avg_store_32x2_avx512(d0, d1, dst, stride);
}

static INLINE void jnt_2d_no_avg_round_store_half_pel_64_avx512(const __m512i res[2],
                                                                const __m512i offset,
                                                                ConvBufType *const dst) {
    const __m512i d0 = jnt_2d_round_half_pel_avx512(res[0], offset);
    const __m512i d1 = jnt_2d_round_half_pel_avx512(res[1], offset);
    jnt_no_avg_store_32x2_avx512(d0, d1, dst, 32);
}
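
/*
 * The helpers above come in three families, matching the three compound
 * prediction modes dispatched below: "comp_avg" applies the distance-weighted
 * blend (conv_params->use_jnt_comp_avg), "avg" averages with the already
 * stored reference (conv_params->do_average without jnt weights), and
 * "no_avg" just rounds the first pass and stores it into the ConvBufType
 * buffer for the second reference to pick up.
 */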

static void jnt_convolve_2d_hor_2tap_avx512(const uint8_t *src, const int32_t src_stride,
                                            const int32_t w, const int32_t h,
                                            const InterpFilterParams *filter_params_x,
                                            const int32_t subpel_x_q4, int16_t *const im_block) {
    const uint8_t *src_ptr = src;
    int32_t y = h;
    int16_t *im = im_block;

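    /* The horizontal pass writes w filtered 16-bit samples per row into
     * im_block with stride w; most branches below produce two rows per
     * iteration, the 64- and 128-wide ones a single row. */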
    if (w <= 8) {
        __m128i coeffs_128;
        prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4, &coeffs_128);

        if (w == 2) {
            do {
                const __m128i r = x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, &coeffs_128);
                xy_x_round_store_2x2_sse2(r, im);
                src_ptr += 2 * src_stride;
                im += 2 * 2;
                y -= 2;
            } while (y);
        } else if (w == 4) {
            do {
                const __m128i r = x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, &coeffs_128);
                xy_x_round_store_4x2_sse2(r, im);
                src_ptr += 2 * src_stride;
                im += 2 * 4;
                y -= 2;
            } while (y);
        } else {
            assert(w == 8);

            do {
                __m128i r[2];

                x_convolve_2tap_8x2_ssse3(src_ptr, src_stride, &coeffs_128, r);
                xy_x_round_store_8x2_sse2(r, im);
                src_ptr += 2 * src_stride;
                im += 2 * 8;
                y -= 2;
            } while (y);
        }
    } else if (w == 16) {
        __m256i coeffs_256;
        prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, &coeffs_256);

        do {
            __m256i r[2];

            x_convolve_2tap_16x2_avx2(src_ptr, src_stride, &coeffs_256, r);
            xy_x_round_store_32_avx2(r, im);
            src_ptr += 2 * src_stride;
            im += 2 * 16;
            y -= 2;
        } while (y);
    } else {
        __m512i coeffs_512;
        prepare_half_coeffs_2tap_avx512(filter_params_x, subpel_x_q4, &coeffs_512);

        if (w == 32) {
            do {
                xy_x_2tap_32x2_avx512(src_ptr, src_stride, &coeffs_512, im);
                src_ptr += 2 * src_stride;
                im += 2 * 32;
                y -= 2;
            } while (y);
        } else if (w == 64) {
            do {
                xy_x_2tap_64_avx512(src_ptr, &coeffs_512, im);
                src_ptr += src_stride;
                im += 64;
            } while (--y);
        } else {
            assert(w == 128);

            do {
                xy_x_2tap_64_avx512(src_ptr + 0 * 64, &coeffs_512, im + 0 * 64);
                xy_x_2tap_64_avx512(src_ptr + 1 * 64, &coeffs_512, im + 1 * 64);
                src_ptr += src_stride;
                im += 128;
            } while (--y);
        }
    }
}

static void jnt_convolve_2d_hor_6tap_avx512(const uint8_t *src, const int32_t src_stride,
                                            const int32_t w, const int32_t h,
                                            const InterpFilterParams *filter_params_x,
                                            const int32_t subpel_x_q4, int16_t *const im_block) {
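    /* Back the source pointer up by (taps / 2 - 1) = 2 columns so the 6-tap
     * window is centered on the output pixel. */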
    const uint8_t *src_ptr = src - 2;
    int32_t y = h;
    int16_t *im = im_block;

    if (w <= 16) {
        __m256i coeffs_256[3], filt_256[3];

        filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx);
        filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx);
        filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx);

        prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);

        if (w == 8) {
            do {
                const __m256i res = x_convolve_6tap_8x2_avx2(src_ptr, src_stride, coeffs_256, filt_256);
                xy_x_round_store_8x2_avx2(res, im);
                src_ptr += 2 * src_stride;
                im += 2 * 8;
                y -= 2;
            } while (y);
        } else {
            assert(w == 16);

            do {
                __m256i r[2];

                x_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256, r);
                xy_x_round_store_32_avx2(r, im);
                src_ptr += 2 * src_stride;
                im += 2 * 16;
                y -= 2;
            } while (y);
        }
    } else {
        __m512i coeffs_512[3], filt_512[3];

        filt_512[0] = zz_load_512(filt1_global_avx);
        filt_512[1] = zz_load_512(filt2_global_avx);
        filt_512[2] = zz_load_512(filt3_global_avx);

        prepare_half_coeffs_6tap_avx512(filter_params_x, subpel_x_q4, coeffs_512);

        if (w == 32) {
            do {
                xy_x_6tap_32x2_avx512(src_ptr, src_stride, coeffs_512, filt_512, im);
                src_ptr += 2 * src_stride;
                im += 2 * 32;
                y -= 2;
            } while (y);
        } else if (w == 64) {
            do {
                xy_x_6tap_64_avx512(src_ptr, coeffs_512, filt_512, im);
                src_ptr += src_stride;
                im += 64;
            } while (--y);
        } else {
            assert(w == 128);

            do {
                xy_x_6tap_64_avx512(src_ptr, coeffs_512, filt_512, im);
                xy_x_6tap_64_avx512(src_ptr + 64, coeffs_512, filt_512, im + 64);
                src_ptr += src_stride;
                im += 128;
            } while (--y);
        }
    }
}

static void jnt_convolve_2d_hor_8tap_avx512(const uint8_t *src, const int32_t src_stride,
                                            const int32_t w, const int32_t h,
                                            const InterpFilterParams *filter_params_x,
                                            const int32_t subpel_x_q4, int16_t *const im_block) {
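    /* Back the source pointer up by (taps / 2 - 1) = 3 columns to center the
     * 8-tap window. */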
    const uint8_t *src_ptr = src - 3;
    int32_t y = h;
    int16_t *im = im_block;

    if (w <= 16) {
        __m256i coeffs_256[4], filt_256[4];

        filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx);
        filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx);
        filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx);
        filt_256[3] = _mm256_loadu_si256((__m256i const *)filt4_global_avx);

        prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);

        if (w == 8) {
            do {
                const __m256i res = x_convolve_8tap_8x2_avx2(src_ptr, src_stride, coeffs_256, filt_256);
                xy_x_round_store_8x2_avx2(res, im);
                src_ptr += 2 * src_stride;
                im += 2 * 8;
                y -= 2;
            } while (y);
        } else {
            assert(w == 16);

            do {
                __m256i r[2];

                x_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256, r);
                xy_x_round_store_32_avx2(r, im);
                src_ptr += 2 * src_stride;
                im += 2 * 16;
                y -= 2;
            } while (y);
        }
    } else {
        __m512i coeffs_512[4], filt_512[4];

        filt_512[0] = zz_load_512(filt1_global_avx);
        filt_512[1] = zz_load_512(filt2_global_avx);
        filt_512[2] = zz_load_512(filt3_global_avx);
        filt_512[3] = zz_load_512(filt4_global_avx);

        prepare_half_coeffs_8tap_avx512(filter_params_x, subpel_x_q4, coeffs_512);

        if (w == 32) {
            do {
                xy_x_8tap_32x2_avx512(src_ptr, src_stride, coeffs_512, filt_512, im);
                src_ptr += 2 * src_stride;
                im += 2 * 32;
                y -= 2;
            } while (y);
        } else if (w == 64) {
            do {
                xy_x_8tap_64_avx512(src_ptr, coeffs_512, filt_512, im);
                src_ptr += src_stride;
                im += 64;
            } while (--y);
        } else {
            assert(w == 128);

            do {
                xy_x_8tap_64_avx512(src_ptr, coeffs_512, filt_512, im);
                xy_x_8tap_64_avx512(src_ptr + 64, coeffs_512, filt_512, im + 64);
                src_ptr += src_stride;
                im += 128;
            } while (--y);
        }
    }
}

static void jnt_convolve_2d_ver_2tap_avx512(const int16_t *const im_block, const int32_t w,
                                            const int32_t h,
                                            const InterpFilterParams *const filter_params_y,
                                            const int32_t subpel_y_q4,
                                            const ConvolveParams *const conv_params, uint8_t *dst8,
                                            const int32_t dst8_stride) {
    const int32_t dst_stride = conv_params->dst_stride;
    const int32_t bd = 8;
    const int32_t round_0 = 3;
    const int16_t *im = im_block;
    const int32_t round_1 = COMPOUND_ROUND1_BITS;
    const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0; // 19
    const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1; // 4
    const int32_t round_offset = 1 << (offset_bits - round_1);
    const int32_t factor = conv_params->fwd_offset | (conv_params->bck_offset << 16);
    const int32_t offset_comp_avg = (round_offset + (round_offset >> 1)) * conv_params->bck_offset -
        (round_offset << DIST_PRECISION_BITS) - (round_offset << (DIST_PRECISION_BITS - 1)) +
        (1 << (round_bits + DIST_PRECISION_BITS - 1));
    const int32_t offset_avg = (1 << (round_1 - 1)) + (1 << (round_bits + round_1)) -
        (1 << offset_bits) - (1 << (offset_bits - 1));
    const int32_t offset_no_avg = (1 << (round_1 - 1)) + (1 << offset_bits) +
        (1 << (offset_bits - 1));
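    /*
     * Note on the constants, a sketch using the AV1 values FILTER_BITS = 7,
     * COMPOUND_ROUND1_BITS = 7 and DIST_PRECISION_BITS = 4: offset_bits is
     * 8 + 14 - 3 = 19 and round_bits is 14 - 3 - 7 = 4, as the comments above
     * record. "factor" packs fwd_offset into the low and bck_offset into the
     * high 16 bits of each 32-bit lane, so a single madd_epi16 on interleaved
     * (dst, res) words computes dst * fwd_offset + res * bck_offset.
     * offset_comp_avg folds the removal of the compound offset bias and the
     * final rounding into one constant, which is why the comp-avg helpers can
     * finish with a single add followed by a shift of
     * round_bits + DIST_PRECISION_BITS = 8.
     */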
    ConvBufType *dst = conv_params->dst;
    int32_t y = h;

    if (w <= 4) {
        const __m128i factor_128 = _mm_set1_epi32(factor);
        const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
        const __m128i offset_avg_128 = _mm_set1_epi32(offset_avg);
        const __m128i offset_no_avg_128 = _mm_set1_epi32(offset_no_avg);
        __m128i coeffs_128;

        prepare_coeffs_2tap_sse2(filter_params_y, subpel_y_q4, &coeffs_128);

        if (w == 2) {
            __m128i s_32[2];

            s_32[0] = _mm_cvtsi32_si128(*(int32_t *)im);

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        const __m128i res = xy_y_convolve_2tap_2x2_sse2(im, s_32, &coeffs_128);
                        jnt_2d_comp_avg_round_store_2x2_sse2(
                            res, factor_128, offset_comp_avg_128, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 2;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        const __m128i res = xy_y_convolve_2tap_2x2_sse2(im, s_32, &coeffs_128);
                        jnt_2d_avg_round_store_2x2_sse2(
                            res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 2;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    const __m128i res = xy_y_convolve_2tap_2x2_sse2(im, s_32, &coeffs_128);
                    jnt_2d_no_avg_round_store_2x2_sse2(res, offset_no_avg_128, dst, dst_stride);
                    im += 2 * 2;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        } else {
            __m128i s_64[2], r[2];

            assert(w == 4);

            s_64[0] = _mm_loadl_epi64((__m128i *)im);

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        xy_y_convolve_2tap_4x2_sse2(im, s_64, &coeffs_128, r);
                        jnt_2d_comp_avg_round_store_4x2_sse2(
                            r, factor_128, offset_comp_avg_128, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 4;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        xy_y_convolve_2tap_4x2_sse2(im, s_64, &coeffs_128, r);
                        jnt_2d_avg_round_store_4x2_sse2(
                            r, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 4;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    xy_y_convolve_2tap_4x2_sse2(im, s_64, &coeffs_128, r);
                    jnt_2d_no_avg_round_store_4x2_sse2(r, offset_no_avg_128, dst, dst_stride);
                    im += 2 * 4;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        }
    } else if (w <= 16) {
        const __m256i factor_256 = _mm256_set1_epi32(factor);
        const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
        const __m256i offset_avg_256 = _mm256_set1_epi32(offset_avg);
        const __m256i offset_no_avg_256 = _mm256_set1_epi32(offset_no_avg);
        __m256i coeffs_256;

        prepare_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, &coeffs_256);

        if (w == 8) {
            __m128i s_128[2];
            __m256i r[2];

            s_128[0] = _mm_loadu_si128((__m128i *)im);

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        xy_y_convolve_2tap_8x2_avx2(im, s_128, &coeffs_256, r);
                        jnt_2d_comp_avg_round_store_8x2_avx2(
                            r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 8;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        xy_y_convolve_2tap_8x2_avx2(im, s_128, &coeffs_256, r);
                        jnt_2d_avg_round_store_8x2_avx2(
                            r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 8;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    xy_y_convolve_2tap_8x2_avx2(im, s_128, &coeffs_256, r);
                    jnt_2d_no_avg_round_store_8x2_avx2(r, offset_no_avg_256, dst, dst_stride);
                    im += 2 * 8;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        } else {
            __m256i s_256[2], r[4];

            assert(w == 16);

            s_256[0] = _mm256_loadu_si256((__m256i *)im);

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        xy_y_convolve_2tap_16x2_avx2(im, s_256, &coeffs_256, r);
                        jnt_2d_comp_avg_round_store_16x2_avx2(
                            r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 16;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        xy_y_convolve_2tap_16x2_avx2(im, s_256, &coeffs_256, r);
                        jnt_2d_avg_round_store_16x2_avx2(
                            r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 16;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    xy_y_convolve_2tap_16x2_avx2(im, s_256, &coeffs_256, r);
                    jnt_2d_no_avg_round_store_16x2_avx2(r, offset_no_avg_256, dst, dst_stride);
                    im += 2 * 16;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        }
    } else {
        const __m512i factor_512 = _mm512_set1_epi32(factor);
        const __m512i offset_comp_avg_512 = _mm512_set1_epi32(offset_comp_avg);
        const __m512i offset_avg_512 = _mm512_set1_epi32(offset_avg);
        const __m512i offset_no_avg_512 = _mm512_set1_epi32(offset_no_avg);
        __m512i coeffs_512;

        prepare_coeffs_2tap_avx512(filter_params_y, subpel_y_q4, &coeffs_512);

        if (w == 32) {
            __m512i s_512[2], r[4];

            s_512[0] = loadu_s16_16x2_avx512(im, 32);

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        xy_y_convolve_2tap_32x2_avx512(im, s_512, &coeffs_512, r);
                        jnt_2d_comp_avg_round_store_32x2_avx512(
                            r + 0, r + 2, factor_512, offset_comp_avg_512, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 32;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        xy_y_convolve_2tap_32x2_avx512(im, s_512, &coeffs_512, r);
                        jnt_2d_avg_round_store_32x2_avx512(
                            r + 0, r + 2, offset_avg_512, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 32;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    xy_y_convolve_2tap_32x2_avx512(im, s_512, &coeffs_512, r);
                    jnt_2d_no_avg_round_store_32x2_avx512(
                        r + 0, r + 2, offset_no_avg_512, dst, dst_stride);
                    im += 2 * 32;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        } else if (w == 64) {
            __m512i s_512[2][2], r[4];

            s_512[0][0] = zz_load_512(im + 0 * 32);
            s_512[0][1] = zz_load_512(im + 1 * 32);

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        xy_y_convolve_2tap_64_avx512(im + 2 * 32, s_512[0], s_512[1], &coeffs_512, r);
                        jnt_2d_comp_avg_round_store_64_avx512(
                            r + 0, r + 2, factor_512, offset_comp_avg_512, dst, dst8);

                        im += 2 * 64;

                        xy_y_convolve_2tap_64_avx512(im + 0 * 32, s_512[1], s_512[0], &coeffs_512, r);
                        jnt_2d_comp_avg_round_store_64_avx512(r + 0,
                                                              r + 2,
                                                              factor_512,
                                                              offset_comp_avg_512,
                                                              dst + dst_stride,
                                                              dst8 + dst8_stride);

                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        xy_y_convolve_2tap_64_avx512(im + 1 * 64, s_512[0], s_512[1], &coeffs_512, r);
                        jnt_2d_avg_round_store_64_avx512(r + 0, r + 2, offset_avg_512, dst, dst8);

                        im += 2 * 64;

                        xy_y_convolve_2tap_64_avx512(im + 0 * 64, s_512[1], s_512[0], &coeffs_512, r);
                        jnt_2d_avg_round_store_64_avx512(
                            r + 0, r + 2, offset_avg_512, dst + dst_stride, dst8 + dst8_stride);

                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    xy_y_convolve_2tap_64_avx512(im + 2 * 32, s_512[0], s_512[1], &coeffs_512, r);
                    jnt_2d_no_avg_round_store_64_avx512(r + 0, r + 2, offset_no_avg_512, dst);

                    im += 2 * 64;

                    xy_y_convolve_2tap_64_avx512(im + 0 * 32, s_512[1], s_512[0], &coeffs_512, r);
                    jnt_2d_no_avg_round_store_64_avx512(
                        r + 0, r + 2, offset_no_avg_512, dst + dst_stride);

                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        } else {
            __m512i s_512[2][4], r[4];

            assert(w == 128);

            load_16bit_4rows_avx512(im, 32, s_512[0]);

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        xy_y_convolve_2tap_64_avx512(
                            im + 2 * 64, s_512[0] + 0, s_512[1] + 0, &coeffs_512, r);
                        jnt_2d_comp_avg_round_store_64_avx512(
                            r + 0, r + 2, factor_512, offset_comp_avg_512, dst, dst8);

                        xy_y_convolve_2tap_64_avx512(
                            im + 3 * 64, s_512[0] + 2, s_512[1] + 2, &coeffs_512, r);
                        jnt_2d_comp_avg_round_store_64_avx512(
                            r + 0, r + 2, factor_512, offset_comp_avg_512, dst + 1 * 64, dst8 + 1 * 64);

                        im += 2 * 128;

                        xy_y_convolve_2tap_64_avx512(
                            im + 0 * 64, s_512[1] + 0, s_512[0] + 0, &coeffs_512, r);
                        jnt_2d_comp_avg_round_store_64_avx512(r + 0,
                                                              r + 2,
                                                              factor_512,
                                                              offset_comp_avg_512,
                                                              dst + dst_stride + 0 * 64,
                                                              dst8 + dst8_stride + 0 * 64);

                        xy_y_convolve_2tap_64_avx512(
                            im + 1 * 64, s_512[1] + 2, s_512[0] + 2, &coeffs_512, r);
                        jnt_2d_comp_avg_round_store_64_avx512(r + 0,
                                                              r + 2,
                                                              factor_512,
                                                              offset_comp_avg_512,
                                                              dst + dst_stride + 1 * 64,
                                                              dst8 + dst8_stride + 1 * 64);

                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        xy_y_convolve_2tap_64_avx512(
                            im + 2 * 64, s_512[0] + 0, s_512[1] + 0, &coeffs_512, r);
                        jnt_2d_avg_round_store_64_avx512(
                            r + 0, r + 2, offset_avg_512, dst + 0 * 64, dst8 + 0 * 64);

                        xy_y_convolve_2tap_64_avx512(
                            im + 3 * 64, s_512[0] + 2, s_512[1] + 2, &coeffs_512, r);
                        jnt_2d_avg_round_store_64_avx512(
                            r + 0, r + 2, offset_avg_512, dst + 1 * 64, dst8 + 1 * 64);

                        im += 2 * 128;

                        xy_y_convolve_2tap_64_avx512(
                            im + 0 * 64, s_512[1] + 0, s_512[0] + 0, &coeffs_512, r);
                        jnt_2d_avg_round_store_64_avx512(r + 0,
                                                         r + 2,
                                                         offset_avg_512,
                                                         dst + dst_stride + 0 * 64,
                                                         dst8 + dst8_stride + 0 * 64);

                        xy_y_convolve_2tap_64_avx512(
                            im + 1 * 64, s_512[1] + 2, s_512[0] + 2, &coeffs_512, r);
                        jnt_2d_avg_round_store_64_avx512(r + 0,
                                                         r + 2,
                                                         offset_avg_512,
                                                         dst + dst_stride + 1 * 64,
                                                         dst8 + dst8_stride + 1 * 64);

                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    xy_y_convolve_2tap_64_avx512(
                        im + 2 * 64, s_512[0] + 0, s_512[1] + 0, &coeffs_512, r);
                    jnt_2d_no_avg_round_store_64_avx512(
                        r + 0, r + 2, offset_no_avg_512, dst + 0 * 64);

                    xy_y_convolve_2tap_64_avx512(
                        im + 3 * 64, s_512[0] + 2, s_512[1] + 2, &coeffs_512, r);
                    jnt_2d_no_avg_round_store_64_avx512(
                        r + 0, r + 2, offset_no_avg_512, dst + 1 * 64);

                    im += 2 * 128;

                    xy_y_convolve_2tap_64_avx512(
                        im + 0 * 64, s_512[1] + 0, s_512[0] + 0, &coeffs_512, r);
                    jnt_2d_no_avg_round_store_64_avx512(
                        r + 0, r + 2, offset_no_avg_512, dst + dst_stride + 0 * 64);

                    xy_y_convolve_2tap_64_avx512(
                        im + 1 * 64, s_512[1] + 2, s_512[0] + 2, &coeffs_512, r);
                    jnt_2d_no_avg_round_store_64_avx512(
                        r + 0, r + 2, offset_no_avg_512, dst + dst_stride + 1 * 64);

                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        }
    }
}

static void jnt_convolve_2d_ver_2tap_half_avx512(const int16_t *const im_block, const int32_t w,
                                                 const int32_t h,
                                                 const InterpFilterParams *const filter_params_y,
                                                 const int32_t subpel_y_q4,
                                                 const ConvolveParams *const conv_params,
                                                 uint8_t *dst8, const int32_t dst8_stride) {
    const int32_t dst_stride = conv_params->dst_stride;
    const int32_t bd = 8;
    const int32_t round_0 = 3;
    const int16_t *im = im_block;
    const int32_t round_1 = COMPOUND_ROUND1_BITS;
    const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0; // 19
    const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1; // 4
    const int32_t round_offset = 1 << (offset_bits - round_1);
    const int32_t factor = conv_params->fwd_offset | (conv_params->bck_offset << 16);
    const int32_t offset_comp_avg = (round_offset + (round_offset >> 1)) * conv_params->bck_offset -
        (round_offset << DIST_PRECISION_BITS) - (round_offset << (DIST_PRECISION_BITS - 1)) +
        (1 << (round_bits + DIST_PRECISION_BITS - 1));
    const int32_t offset_avg = (1 << (round_1 - COMPOUND_ROUND1_BITS)) +
        (1 << (round_bits + round_1 - COMPOUND_ROUND1_BITS + 1)) -
        (1 << (offset_bits - COMPOUND_ROUND1_BITS + 1)) -
        (1 << (offset_bits - COMPOUND_ROUND1_BITS));
    const int32_t offset_no_avg = (1 << (round_1 - COMPOUND_ROUND1_BITS)) +
        (1 << (offset_bits - COMPOUND_ROUND1_BITS + 1)) +
        (1 << (offset_bits - COMPOUND_ROUND1_BITS));
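    /* Unlike the general 2-tap path, the half-pel vertical pass is a plain
     * average that stays in 16 bits, so offset_avg and offset_no_avg above are
     * the general constants divided by 1 << (COMPOUND_ROUND1_BITS - 1) and
     * broadcast with the epi16 variants of set1 below. */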
    ConvBufType *dst = conv_params->dst;
    int32_t y = h;

    (void)filter_params_y;
    (void)subpel_y_q4;

    if (w <= 4) {
        const __m128i factor_128 = _mm_set1_epi32(factor);
        const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
        const __m128i offset_avg_128 = _mm_set1_epi16(offset_avg);
        const __m128i offset_no_avg_128 = _mm_set1_epi16(offset_no_avg);

        if (w == 2) {
            __m128i s_32[2];

            s_32[0] = _mm_cvtsi32_si128(*(int32_t *)im);

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        const __m128i res = xy_y_convolve_2tap_2x2_half_pel_sse2(im, s_32);
                        jnt_2d_comp_avg_round_store_half_pel_2x2_sse2(
                            res, factor_128, offset_comp_avg_128, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 2;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        const __m128i res = xy_y_convolve_2tap_2x2_half_pel_sse2(im, s_32);
                        jnt_2d_avg_round_store_half_pel_2x2_sse2(
                            res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 2;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    const __m128i res = xy_y_convolve_2tap_2x2_half_pel_sse2(im, s_32);
                    jnt_2d_no_avg_round_store_half_pel_2x2_sse2(
                        res, offset_no_avg_128, dst, dst_stride);
                    im += 2 * 2;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        } else {
            __m128i s_64[2];

            assert(w == 4);

            s_64[0] = _mm_loadl_epi64((__m128i *)im);

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        const __m128i res = xy_y_convolve_2tap_4x2_half_pel_sse2(im, s_64);
                        jnt_2d_comp_avg_round_store_half_pel_4x2_sse2(
                            res, factor_128, offset_comp_avg_128, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 4;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        const __m128i res = xy_y_convolve_2tap_4x2_half_pel_sse2(im, s_64);
                        jnt_2d_avg_round_store_half_pel_4x2_sse2(
                            res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 4;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    const __m128i res = xy_y_convolve_2tap_4x2_half_pel_sse2(im, s_64);
                    jnt_2d_no_avg_round_store_half_pel_4x2_sse2(
                        res, offset_no_avg_128, dst, dst_stride);
                    im += 2 * 4;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        }
    } else if (w <= 16) {
        const __m256i factor_256 = _mm256_set1_epi32(factor);
        const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
        const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);
        const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);

        if (w == 8) {
            __m128i s_128[2];

            s_128[0] = _mm_loadu_si128((__m128i *)im);

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        const __m256i res = xy_y_convolve_2tap_8x2_half_pel_avx2(im, s_128);
                        jnt_2d_comp_avg_round_store_half_pel_8x2_avx2(
                            res, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 8;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        const __m256i res = xy_y_convolve_2tap_8x2_half_pel_avx2(im, s_128);
                        jnt_2d_avg_round_store_half_pel_8x2_avx2(
                            res, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 8;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    const __m256i res = xy_y_convolve_2tap_8x2_half_pel_avx2(im, s_128);
                    jnt_2d_no_avg_round_store_half_pel_8x2_avx2(
                        res, offset_no_avg_256, dst, dst_stride);
                    im += 2 * 8;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        } else {
            __m256i s_256[2], r[2];

            assert(w == 16);

            s_256[0] = _mm256_loadu_si256((__m256i *)im);

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        xy_y_convolve_2tap_16x2_half_pel_avx2(im, s_256, r);
                        jnt_2d_comp_avg_round_store_half_pel_16x2_avx2(
                            r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 16;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        xy_y_convolve_2tap_16x2_half_pel_avx2(im, s_256, r);
                        jnt_2d_avg_round_store_half_pel_16x2_avx2(
                            r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 16;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    xy_y_convolve_2tap_16x2_half_pel_avx2(im, s_256, r);
                    jnt_2d_no_avg_round_store_half_pel_16x2_avx2(
                        r, offset_no_avg_256, dst, dst_stride);
                    im += 2 * 16;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        }
    } else {
        const __m512i factor_512 = _mm512_set1_epi32(factor);
        const __m512i offset_comp_avg_512 = _mm512_set1_epi32(offset_comp_avg);
        const __m512i offset_avg_512 = _mm512_set1_epi16(offset_avg);
        const __m512i offset_no_avg_512 = _mm512_set1_epi16(offset_no_avg);

        if (w == 32) {
            __m512i s_512[2], r[2];

            s_512[0] = loadu_s16_16x2_avx512(im, 32);

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        xy_y_convolve_2tap_half_pel_32x2_avx512(im + 16, s_512, r);
                        jnt_2d_comp_avg_round_store_half_pel_32x2_avx512(
                            r, factor_512, offset_comp_avg_512, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 32;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        xy_y_convolve_2tap_half_pel_32x2_avx512(im + 16, s_512, r);
                        jnt_2d_avg_round_store_half_pel_32x2_avx512(
                            r, offset_avg_512, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 32;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    xy_y_convolve_2tap_half_pel_32x2_avx512(im + 16, s_512, r);
                    jnt_2d_no_avg_round_store_half_pel_32x2_avx512(
                        r, offset_no_avg_512, dst, dst_stride);
                    im += 2 * 32;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        } else if (w == 64) {
            __m512i s_512[2][2], r[2];

            s_512[0][0] = zz_load_512(im + 0 * 32);
            s_512[0][1] = zz_load_512(im + 1 * 32);

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        xy_y_convolve_2tap_half_pel_64_avx512(
                            im + 2 * 32, s_512[0] + 0, s_512[1] + 0, r);
                        jnt_2d_comp_avg_round_store_half_pel_64_avx512(
                            r, factor_512, offset_comp_avg_512, dst, dst8);

                        im += 2 * 64;

                        xy_y_convolve_2tap_half_pel_64_avx512(
                            im + 0 * 32, s_512[1] + 0, s_512[0] + 0, r);
                        jnt_2d_comp_avg_round_store_half_pel_64_avx512(r,
                                                                       factor_512,
                                                                       offset_comp_avg_512,
                                                                       dst + dst_stride,
                                                                       dst8 + dst8_stride);

                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        xy_y_convolve_2tap_half_pel_64_avx512(im + 1 * 64, s_512[0], s_512[1], r);
                        jnt_2d_avg_round_store_half_pel_64_avx512(r, offset_avg_512, dst, dst8);

                        im += 2 * 64;

                        xy_y_convolve_2tap_half_pel_64_avx512(im + 0 * 64, s_512[1], s_512[0], r);
                        jnt_2d_avg_round_store_half_pel_64_avx512(
                            r, offset_avg_512, dst + dst_stride, dst8 + dst8_stride);

                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    xy_y_convolve_2tap_half_pel_64_avx512(im + 2 * 32, s_512[0], s_512[1], r);
                    jnt_2d_no_avg_round_store_half_pel_64_avx512(r, offset_no_avg_512, dst);

                    im += 2 * 64;

                    xy_y_convolve_2tap_half_pel_64_avx512(im + 0 * 32, s_512[1], s_512[0], r);
                    jnt_2d_no_avg_round_store_half_pel_64_avx512(
                        r, offset_no_avg_512, dst + dst_stride);

                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        } else {
            __m512i s_512[2][4], r[2];

            assert(w == 128);

            load_16bit_4rows_avx512(im, 32, s_512[0]);

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        xy_y_convolve_2tap_half_pel_64_avx512(
                            im + 4 * 32, s_512[0] + 0, s_512[1] + 0, r);
                        jnt_2d_comp_avg_round_store_half_pel_64_avx512(
                            r, factor_512, offset_comp_avg_512, dst + 0 * 32, dst8 + 0 * 32);

                        xy_y_convolve_2tap_half_pel_64_avx512(
                            im + 6 * 32, s_512[0] + 2, s_512[1] + 2, r);
                        jnt_2d_comp_avg_round_store_half_pel_64_avx512(
                            r, factor_512, offset_comp_avg_512, dst + 2 * 32, dst8 + 2 * 32);

                        im += 2 * 128;

                        xy_y_convolve_2tap_half_pel_64_avx512(
                            im + 0 * 32, s_512[1] + 0, s_512[0] + 0, r);
                        jnt_2d_comp_avg_round_store_half_pel_64_avx512(r,
                                                                       factor_512,
                                                                       offset_comp_avg_512,
                                                                       dst + dst_stride + 0 * 32,
                                                                       dst8 + dst8_stride + 0 * 32);

                        xy_y_convolve_2tap_half_pel_64_avx512(
                            im + 2 * 32, s_512[1] + 2, s_512[0] + 2, r);
                        jnt_2d_comp_avg_round_store_half_pel_64_avx512(r,
                                                                       factor_512,
                                                                       offset_comp_avg_512,
                                                                       dst + dst_stride + 2 * 32,
                                                                       dst8 + dst8_stride + 2 * 32);

                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        xy_y_convolve_2tap_half_pel_64_avx512(
                            im + 4 * 32, s_512[0] + 0, s_512[1] + 0, r);
                        jnt_2d_avg_round_store_half_pel_64_avx512(
                            r, offset_avg_512, dst + 0 * 32, dst8 + 0 * 32);

                        xy_y_convolve_2tap_half_pel_64_avx512(
                            im + 6 * 32, s_512[0] + 2, s_512[1] + 2, r);
                        jnt_2d_avg_round_store_half_pel_64_avx512(
                            r, offset_avg_512, dst + 2 * 32, dst8 + 2 * 32);

                        im += 2 * 128;

                        xy_y_convolve_2tap_half_pel_64_avx512(
                            im + 0 * 32, s_512[1] + 0, s_512[0] + 0, r);
                        jnt_2d_avg_round_store_half_pel_64_avx512(r,
                                                                  offset_avg_512,
                                                                  dst + dst_stride + 0 * 32,
                                                                  dst8 + dst8_stride + 0 * 32);

                        xy_y_convolve_2tap_half_pel_64_avx512(
                            im + 2 * 32, s_512[1] + 2, s_512[0] + 2, r);
                        jnt_2d_avg_round_store_half_pel_64_avx512(r,
                                                                  offset_avg_512,
                                                                  dst + dst_stride + 2 * 32,
                                                                  dst8 + dst8_stride + 2 * 32);

                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    xy_y_convolve_2tap_half_pel_64_avx512(
                        im + 4 * 32, s_512[0] + 0, s_512[1] + 0, r);
                    jnt_2d_no_avg_round_store_half_pel_64_avx512(
                        r, offset_no_avg_512, dst + 0 * 32);

                    xy_y_convolve_2tap_half_pel_64_avx512(
                        im + 6 * 32, s_512[0] + 2, s_512[1] + 2, r);
                    jnt_2d_no_avg_round_store_half_pel_64_avx512(
                        r, offset_no_avg_512, dst + 2 * 32);

                    im += 2 * 128;

                    xy_y_convolve_2tap_half_pel_64_avx512(
                        im + 0 * 32, s_512[1] + 0, s_512[0] + 0, r);
                    jnt_2d_no_avg_round_store_half_pel_64_avx512(
                        r, offset_no_avg_512, dst + dst_stride + 0 * 32);

                    xy_y_convolve_2tap_half_pel_64_avx512(
                        im + 2 * 32, s_512[1] + 2, s_512[0] + 2, r);
                    jnt_2d_no_avg_round_store_half_pel_64_avx512(
                        r, offset_no_avg_512, dst + dst_stride + 2 * 32);

                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        }
    }
}

static void jnt_convolve_2d_ver_6tap_avx512(const int16_t *const im_block, const int32_t w,
                                            const int32_t h,
                                            const InterpFilterParams *const filter_params_y,
                                            const int32_t subpel_y_q4,
                                            const ConvolveParams *const conv_params, uint8_t *dst8,
                                            const int32_t dst8_stride) {
    const int32_t dst_stride = conv_params->dst_stride;
    const int32_t bd = 8;
    const int32_t round_0 = 3;
    const int16_t *im = im_block;
    const int32_t round_1 = COMPOUND_ROUND1_BITS;
    const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0; // 19
    const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1; // 4
    const int32_t round_offset = 1 << (offset_bits - round_1);
    const int32_t factor = conv_params->fwd_offset | (conv_params->bck_offset << 16);
    const int32_t offset_comp_avg = (round_offset + (round_offset >> 1)) * conv_params->bck_offset -
        (round_offset << DIST_PRECISION_BITS) - (round_offset << (DIST_PRECISION_BITS - 1)) +
        (1 << (round_bits + DIST_PRECISION_BITS - 1));
    const int32_t offset_avg = (1 << (round_1 - 1)) + (1 << (round_bits + round_1)) -
        (1 << offset_bits) - (1 << (offset_bits - 1));
    const int32_t offset_no_avg = (1 << (round_1 - 1)) + (1 << offset_bits) +
        (1 << (offset_bits - 1));
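    /* Same offset derivation as in jnt_convolve_2d_ver_2tap_avx512() above. */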
    int32_t y = h;
    ConvBufType *dst = conv_params->dst;

    if (w == 2) {
        const __m128i factor_128 = _mm_set1_epi32(factor);
        const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
        const __m128i offset_avg_128 = _mm_set1_epi32(offset_avg);
        const __m128i offset_no_avg_128 = _mm_set1_epi32(offset_no_avg);
        __m128i coeffs_128[3], s_32[6], ss_128[3];

        prepare_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);

        s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
        s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
        s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
        s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(im + 3 * 2));
        s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(im + 4 * 2));

        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
        const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
        const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);

        ss_128[0] = _mm_unpacklo_epi16(src01, src12);
        ss_128[1] = _mm_unpacklo_epi16(src23, src34);
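        /* Each ss register now interleaves vertically adjacent rows 16 bits at
         * a time, the layout madd_epi16 needs to apply two filter taps per
         * 32-bit lane in the 6-tap kernel. */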

        y = h;

        if (conv_params->do_average) {
            if (conv_params->use_jnt_comp_avg) {
                do {
                    const __m128i res = xy_y_convolve_6tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
                    jnt_2d_comp_avg_round_store_2x2_sse2(
                        res, factor_128, offset_comp_avg_128, dst, dst_stride, dst8, dst8_stride);
                    im += 2 * 2;
                    dst += 2 * dst_stride;
                    dst8 += 2 * dst8_stride;
                    y -= 2;
                } while (y);
            } else {
                do {
                    const __m128i res = xy_y_convolve_6tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
                    jnt_2d_avg_round_store_2x2_sse2(
                        res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
                    im += 2 * 2;
                    dst += 2 * dst_stride;
                    dst8 += 2 * dst8_stride;
                    y -= 2;
                } while (y);
            }
        } else {
            do {
                const __m128i res = xy_y_convolve_6tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
                jnt_2d_no_avg_round_store_2x2_sse2(res, offset_no_avg_128, dst, dst_stride);
                im += 2 * 2;
                dst += 2 * dst_stride;
                y -= 2;
            } while (y);
        }
    } else if (w <= 16) {
        const __m256i factor_256 = _mm256_set1_epi32(factor);
        const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
        const __m256i offset_avg_256 = _mm256_set1_epi32(offset_avg);
        const __m256i offset_no_avg_256 = _mm256_set1_epi32(offset_no_avg);
        __m256i coeffs_256[3];

        prepare_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);

        if (w == 4) {
            __m128i s_64[6];
            __m256i s_256[6], ss_256[3];

            s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
            s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
            s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
            s_64[3] = _mm_loadl_epi64((__m128i *)(im + 3 * 4));
            s_64[4] = _mm_loadl_epi64((__m128i *)(im + 4 * 4));

            // Load lines a and b. Line a to lower 128, line b to upper 128
            s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
            s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
            s_256[2] = _mm256_setr_m128i(s_64[2], s_64[3]);
            s_256[3] = _mm256_setr_m128i(s_64[3], s_64[4]);

            ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
            ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);

            y = h;

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        const __m256i res = xy_y_convolve_6tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
                        jnt_2d_comp_avg_round_store_4x2_avx2(
                            res, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 4;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        const __m256i res = xy_y_convolve_6tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
                        jnt_2d_avg_round_store_4x2_avx2(
                            res, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 4;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    const __m256i res = xy_y_convolve_6tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
                    jnt_2d_no_avg_round_store_4x2_avx2(res, offset_no_avg_256, dst, dst_stride);
                    im += 2 * 4;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        } else if (w == 8) {
            __m256i s_256[6], r[2];

            s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
            s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
            s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 8));
            s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 8));
            y = h;

            __m256i ss_256[6];

            ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
            ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);

            ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
            ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        xy_y_convolve_6tap_8x2_avx2(im, ss_256, coeffs_256, r);
                        jnt_2d_comp_avg_round_store_8x2_avx2(
                            r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 8;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        xy_y_convolve_6tap_8x2_avx2(im, ss_256, coeffs_256, r);
                        jnt_2d_avg_round_store_8x2_avx2(
                            r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 8;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    xy_y_convolve_6tap_8x2_avx2(im, ss_256, coeffs_256, r);
                    jnt_2d_no_avg_round_store_8x2_avx2(r, offset_no_avg_256, dst, dst_stride);
                    im += 2 * 8;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        } else {
            __m256i s_256[6], ss_256[6], tt_256[6], r[4];

            assert(w == 16);

            loadu_unpack_16bit_5rows_avx2(im, 16, s_256, ss_256, tt_256);
            y = h;

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        xy_y_convolve_6tap_16x2_avx2(im, 16, s_256, ss_256, tt_256, coeffs_256, r);
                        jnt_2d_comp_avg_round_store_16x2_avx2(
                            r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 16;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        xy_y_convolve_6tap_16x2_avx2(im, 16, s_256, ss_256, tt_256, coeffs_256, r);
                        jnt_2d_avg_round_store_16x2_avx2(
                            r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 16;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    xy_y_convolve_6tap_16x2_avx2(im, 16, s_256, ss_256, tt_256, coeffs_256, r);
                    jnt_2d_no_avg_round_store_16x2_avx2(r, offset_no_avg_256, dst, dst_stride);
                    im += 2 * 16;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        }
    } else {
        const __m512i factor_512 = _mm512_set1_epi32(factor);
        const __m512i offset_comp_avg_512 = _mm512_set1_epi32(offset_comp_avg);
        const __m512i offset_avg_512 = _mm512_set1_epi32(offset_avg);
        const __m512i offset_no_avg_512 = _mm512_set1_epi32(offset_no_avg);
        __m512i coeffs_512[3];

        prepare_coeffs_6tap_avx512(filter_params_y, subpel_y_q4, coeffs_512);

        if (w == 32) {
            __m512i s_512[6], ss_512[6], tt_512[6], r[4];

            loadu_unpack_16bit_32x5_avx512(im, s_512, ss_512, tt_512);

            y = h;

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        xy_y_convolve_6tap_width32x2_avx512(im, coeffs_512, s_512, ss_512, tt_512, r);
                        jnt_2d_comp_avg_round_store_32x2_avx512(
                            r + 0, r + 2, factor_512, offset_comp_avg_512, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 32;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        xy_y_convolve_6tap_width32x2_avx512(im, coeffs_512, s_512, ss_512, tt_512, r);
                        jnt_2d_avg_round_store_32x2_avx512(
                            r + 0, r + 2, offset_avg_512, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 32;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    xy_y_convolve_6tap_width32x2_avx512(im, coeffs_512, s_512, ss_512, tt_512, r);
                    jnt_2d_no_avg_round_store_32x2_avx512(
                        r + 0, r + 2, offset_no_avg_512, dst, dst_stride);
                    im += 2 * 32;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        } else {
            __m512i s_512[2][6], ss_512[2][6], tt_512[2][6], r0[4], r1[4];

            assert(!(w % 64));

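            /* Widths 64 and 128 are handled as 64-wide column strips, two rows
             * per iteration, so one inner loop serves both sizes. */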
1626 int32_t x = 0;
1627 do {
1628 const int16_t *s = im + x;
1629 ConvBufType * d = dst + x;
1630 uint8_t * d8 = dst8 + x;
1631
1632 loadu_unpack_16bit_5rows_avx512(s, w, s_512[0], ss_512[0], tt_512[0]);
1633 loadu_unpack_16bit_5rows_avx512(s + 32, w, s_512[1], ss_512[1], tt_512[1]);
1634
1635 y = h;
1636
1637 if (conv_params->do_average) {
1638 if (conv_params->use_jnt_comp_avg) {
1639 do {
1640 xy_y_convolve_6tap_32x2_avx512(
1641 s, w, s_512[0], ss_512[0], tt_512[0], coeffs_512, r0);
1642 xy_y_convolve_6tap_32x2_avx512(
1643 s + 32, w, s_512[1], ss_512[1], tt_512[1], coeffs_512, r1);
1644 jnt_2d_comp_avg_round_store_64_avx512(
1645 r0 + 0, r1 + 0, factor_512, offset_comp_avg_512, d, d8);
1646 jnt_2d_comp_avg_round_store_64_avx512(r0 + 2,
1647 r1 + 2,
1648 factor_512,
1649 offset_comp_avg_512,
1650 d + dst_stride,
1651 d8 + dst8_stride);
1652 s += 2 * w;
1653 d += 2 * dst_stride;
1654 d8 += 2 * dst8_stride;
1655 y -= 2;
1656 } while (y);
1657 } else {
1658 do {
1659 xy_y_convolve_6tap_32x2_avx512(
1660 s, w, s_512[0], ss_512[0], tt_512[0], coeffs_512, r0);
1661 xy_y_convolve_6tap_32x2_avx512(
1662 s + 32, w, s_512[1], ss_512[1], tt_512[1], coeffs_512, r1);
1663 jnt_2d_avg_round_store_64_avx512(r0 + 0, r1 + 0, offset_avg_512, d, d8);
1664 jnt_2d_avg_round_store_64_avx512(
1665 r0 + 2, r1 + 2, offset_avg_512, d + dst_stride, d8 + dst8_stride);
1666 s += 2 * w;
1667 d += 2 * dst_stride;
1668 d8 += 2 * dst8_stride;
1669 y -= 2;
1670 } while (y);
1671 }
1672 } else {
1673 do {
1674 xy_y_convolve_6tap_32x2_avx512(
1675 s, w, s_512[0], ss_512[0], tt_512[0], coeffs_512, r0);
1676 xy_y_convolve_6tap_32x2_avx512(
1677 s + 32, w, s_512[1], ss_512[1], tt_512[1], coeffs_512, r1);
1678 jnt_2d_no_avg_round_store_64_avx512(r0 + 0, r1 + 0, offset_no_avg_512, d);
1679 jnt_2d_no_avg_round_store_64_avx512(
1680 r0 + 2, r1 + 2, offset_no_avg_512, d + dst_stride);
1681 s += 2 * w;
1682 d += 2 * dst_stride;
1683 y -= 2;
1684 } while (y);
1685 }
1686
1687 x += 64;
1688 } while (x < w);
1689 }
1690 }
1691 }
1692
static void jnt_convolve_2d_ver_8tap_avx512(const int16_t *const im_block, const int32_t w,
                                            const int32_t h,
                                            const InterpFilterParams *const filter_params_y,
                                            const int32_t subpel_y_q4,
                                            const ConvolveParams *const conv_params, uint8_t *dst8,
                                            const int32_t dst8_stride) {
    const int32_t dst_stride = conv_params->dst_stride;
    const int32_t bd = 8;
    const int32_t round_0 = 3;
    const int16_t *im = im_block;
    const int32_t round_1 = COMPOUND_ROUND1_BITS;
    const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0; // 19
    const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1; // 4
    const int32_t round_offset = 1 << (offset_bits - round_1);
    const int32_t factor = conv_params->fwd_offset | (conv_params->bck_offset << 16);
    const int32_t offset_comp_avg = (round_offset + (round_offset >> 1)) * conv_params->bck_offset -
        (round_offset << DIST_PRECISION_BITS) - (round_offset << (DIST_PRECISION_BITS - 1)) +
        (1 << (round_bits + DIST_PRECISION_BITS - 1));
    const int32_t offset_avg = (1 << (round_1 - 1)) + (1 << (round_bits + round_1)) -
        (1 << offset_bits) - (1 << (offset_bits - 1));
    const int32_t offset_no_avg = (1 << (round_1 - 1)) + (1 << offset_bits) +
        (1 << (offset_bits - 1));
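    // The constants above fold the scalar compound rounding (see the C
    // reference, svt_av1_jnt_convolve_2d_c) into one madd/add/shift sequence
    // per store: 'factor' packs fwd_offset/bck_offset as a 16-bit pair for
    // madd_epi16, and each offset_* constant pre-combines the rounding bias,
    // the removal of the intermediate (1 << offset_bits) term, and the final
    // right shift for its averaging mode.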
    int32_t y = h;
    ConvBufType *dst = conv_params->dst;

    if (w == 2) {
        const __m128i factor_128 = _mm_set1_epi32(factor);
        const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
        const __m128i offset_avg_128 = _mm_set1_epi32(offset_avg);
        const __m128i offset_no_avg_128 = _mm_set1_epi32(offset_no_avg);
        __m128i coeffs_128[4], s_32[8], ss_128[4];

        prepare_coeffs_8tap_sse2(filter_params_y, subpel_y_q4, coeffs_128);

        s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
        s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
        s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
        s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(im + 3 * 2));
        s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(im + 4 * 2));
        s_32[5] = _mm_cvtsi32_si128(*(int32_t *)(im + 5 * 2));
        s_32[6] = _mm_cvtsi32_si128(*(int32_t *)(im + 6 * 2));

        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
        const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
        const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
        const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
        const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);

        ss_128[0] = _mm_unpacklo_epi16(src01, src12);
        ss_128[1] = _mm_unpacklo_epi16(src23, src34);
        ss_128[2] = _mm_unpacklo_epi16(src45, src56);
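        // Each ss_128[k] interleaves two adjacent rows at 16 bits, so every
        // 32-bit lane holds a vertical pixel pair that _mm_madd_epi16 can
        // multiply-accumulate against one packed coefficient pair; two output
        // rows are produced per loop iteration.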

        y = h;

        if (conv_params->do_average) {
            if (conv_params->use_jnt_comp_avg) {
                do {
                    const __m128i res = xy_y_convolve_8tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
                    jnt_2d_comp_avg_round_store_2x2_sse2(
                        res, factor_128, offset_comp_avg_128, dst, dst_stride, dst8, dst8_stride);
                    im += 2 * 2;
                    dst += 2 * dst_stride;
                    dst8 += 2 * dst8_stride;
                    y -= 2;
                } while (y);
            } else {
                do {
                    const __m128i res = xy_y_convolve_8tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
                    jnt_2d_avg_round_store_2x2_sse2(
                        res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
                    im += 2 * 2;
                    dst += 2 * dst_stride;
                    dst8 += 2 * dst8_stride;
                    y -= 2;
                } while (y);
            }
        } else {
            do {
                const __m128i res = xy_y_convolve_8tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
                jnt_2d_no_avg_round_store_2x2_sse2(res, offset_no_avg_128, dst, dst_stride);
                im += 2 * 2;
                dst += 2 * dst_stride;
                y -= 2;
            } while (y);
        }
    } else if (w <= 16) {
        const __m256i factor_256 = _mm256_set1_epi32(factor);
        const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
        const __m256i offset_avg_256 = _mm256_set1_epi32(offset_avg);
        const __m256i offset_no_avg_256 = _mm256_set1_epi32(offset_no_avg);
        __m256i coeffs_256[4];

        prepare_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);

        if (w == 4) {
            __m128i s_64[8];
            __m256i s_256[8], ss_256[4];

            s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
            s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
            s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
            s_64[3] = _mm_loadl_epi64((__m128i *)(im + 3 * 4));
            s_64[4] = _mm_loadl_epi64((__m128i *)(im + 4 * 4));
            s_64[5] = _mm_loadl_epi64((__m128i *)(im + 5 * 4));
            s_64[6] = _mm_loadl_epi64((__m128i *)(im + 6 * 4));

            // Load lines a and b: line a into the lower 128 bits, line b into
            // the upper 128 bits.
            s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
            s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
            s_256[2] = _mm256_setr_m128i(s_64[2], s_64[3]);
            s_256[3] = _mm256_setr_m128i(s_64[3], s_64[4]);
            s_256[4] = _mm256_setr_m128i(s_64[4], s_64[5]);
            s_256[5] = _mm256_setr_m128i(s_64[5], s_64[6]);

            ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
            ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
            ss_256[2] = _mm256_unpacklo_epi16(s_256[4], s_256[5]);
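            // With two rows packed per 256-bit register, each ss_256[k] again
            // pairs adjacent rows per 32-bit lane, so one _mm256_madd_epi16
            // covers both output rows of the 4-wide block at once.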

            y = h;

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        const __m256i res = xy_y_convolve_8tap_4x2_avx2(
                            im, s_64, ss_256, coeffs_256);
                        jnt_2d_comp_avg_round_store_4x2_avx2(res,
                                                             factor_256,
                                                             offset_comp_avg_256,
                                                             dst,
                                                             dst_stride,
                                                             dst8,
                                                             dst8_stride);
                        im += 2 * 4;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        const __m256i res = xy_y_convolve_8tap_4x2_avx2(
                            im, s_64, ss_256, coeffs_256);
                        jnt_2d_avg_round_store_4x2_avx2(
                            res, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 4;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    const __m256i res = xy_y_convolve_8tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
                    jnt_2d_no_avg_round_store_4x2_avx2(res, offset_no_avg_256, dst, dst_stride);
                    im += 2 * 4;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        } else if (w == 8) {
            __m256i s_256[8], r[2];

            s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
            s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
            s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 8));
            s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 8));
            s_256[4] = _mm256_loadu_si256((__m256i *)(im + 4 * 8));
            s_256[5] = _mm256_loadu_si256((__m256i *)(im + 5 * 8));
            y = h;

            __m256i ss_256[8];

            convolve_8tap_unapck_avx2(s_256, ss_256);
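            // Unpacks the preloaded rows into the interleaved row-pair form
            // the 8-tap madd kernels consume. ("unapck" is the helper's actual
            // spelling in the included convolve headers.)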

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        xy_y_convolve_8tap_8x2_avx2(im, ss_256, coeffs_256, r);
                        jnt_2d_comp_avg_round_store_8x2_avx2(
                            r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 8;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        xy_y_convolve_8tap_8x2_avx2(im, ss_256, coeffs_256, r);
                        jnt_2d_avg_round_store_8x2_avx2(
                            r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 8;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    xy_y_convolve_8tap_8x2_avx2(im, ss_256, coeffs_256, r);
                    jnt_2d_no_avg_round_store_8x2_avx2(r, offset_no_avg_256, dst, dst_stride);
                    im += 2 * 8;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        } else {
            __m256i s_256[8], ss_256[8], tt_256[8], r[4];

            assert(w == 16);

            load_16bit_7rows_avx2(im, 16, s_256);
            y = h;

            convolve_8tap_unapck_avx2(s_256, ss_256);
            convolve_8tap_unapck_avx2(s_256 + 1, tt_256);
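            // ss_256 pairs rows (0,1), (2,3), ... for the even output row of
            // each 2-row step; tt_256 starts one row later, pairing (1,2),
            // (3,4), ... for the odd output row.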

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        xy_y_convolve_8tap_16x2_avx2(im, 16, coeffs_256, s_256, ss_256, tt_256, r);
                        jnt_2d_comp_avg_round_store_16x2_avx2(
                            r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 16;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        xy_y_convolve_8tap_16x2_avx2(im, 16, coeffs_256, s_256, ss_256, tt_256, r);
                        jnt_2d_avg_round_store_16x2_avx2(
                            r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 16;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    xy_y_convolve_8tap_16x2_avx2(im, 16, coeffs_256, s_256, ss_256, tt_256, r);
                    jnt_2d_no_avg_round_store_16x2_avx2(r, offset_no_avg_256, dst, dst_stride);
                    im += 2 * 16;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        }
    } else {
        const __m512i factor_512 = _mm512_set1_epi32(factor);
        const __m512i offset_comp_avg_512 = _mm512_set1_epi32(offset_comp_avg);
        const __m512i offset_avg_512 = _mm512_set1_epi32(offset_avg);
        const __m512i offset_no_avg_512 = _mm512_set1_epi32(offset_no_avg);
        __m512i coeffs_512[4];

        prepare_coeffs_8tap_avx512(filter_params_y, subpel_y_q4, coeffs_512);

        if (w == 32) {
            __m512i s_512[8], ss_512[8], tt_512[8], r[4];

            load_16bit_32x7_avx512(im, s_512);
            convolve_8tap_unapck_avx512(s_512, ss_512);
            convolve_8tap_unapck_avx512(s_512 + 1, tt_512);

            y = h;

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        xy_y_convolve_8tap_width32x2_avx512(
                            im, coeffs_512, s_512, ss_512, tt_512, r);
                        jnt_2d_comp_avg_round_store_32x2_avx512(r + 0,
                                                                r + 2,
                                                                factor_512,
                                                                offset_comp_avg_512,
                                                                dst,
                                                                dst_stride,
                                                                dst8,
                                                                dst8_stride);
                        im += 2 * 32;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        xy_y_convolve_8tap_width32x2_avx512(
                            im, coeffs_512, s_512, ss_512, tt_512, r);
                        jnt_2d_avg_round_store_32x2_avx512(
                            r + 0, r + 2, offset_avg_512, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 32;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    xy_y_convolve_8tap_width32x2_avx512(im, coeffs_512, s_512, ss_512, tt_512, r);
                    jnt_2d_no_avg_round_store_32x2_avx512(
                        r + 0, r + 2, offset_no_avg_512, dst, dst_stride);
                    im += 2 * 32;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        } else {
            __m512i s_512[2][8], ss_512[2][8], tt_512[2][8], r0[4], r1[4];

            assert(!(w % 64));

            int32_t x = 0;
            do {
                const int16_t *s = im + x;
                ConvBufType *d = dst + x;
                uint8_t *d8 = dst8 + x;

                load_16bit_7rows_avx512(s, w, s_512[0]);
                convolve_8tap_unapck_avx512(s_512[0], ss_512[0]);
                convolve_8tap_unapck_avx512(s_512[0] + 1, tt_512[0]);

                load_16bit_7rows_avx512(s + 32, w, s_512[1]);
                convolve_8tap_unapck_avx512(s_512[1], ss_512[1]);
                convolve_8tap_unapck_avx512(s_512[1] + 1, tt_512[1]);

                y = h;

                if (conv_params->do_average) {
                    if (conv_params->use_jnt_comp_avg) {
                        do {
                            xy_y_convolve_8tap_32x2_avx512(
                                s, w, coeffs_512, s_512[0], ss_512[0], tt_512[0], r0);
                            xy_y_convolve_8tap_32x2_avx512(
                                s + 32, w, coeffs_512, s_512[1], ss_512[1], tt_512[1], r1);
                            jnt_2d_comp_avg_round_store_64_avx512(
                                r0 + 0, r1 + 0, factor_512, offset_comp_avg_512, d, d8);
                            jnt_2d_comp_avg_round_store_64_avx512(r0 + 2,
                                                                  r1 + 2,
                                                                  factor_512,
                                                                  offset_comp_avg_512,
                                                                  d + dst_stride,
                                                                  d8 + dst8_stride);
                            s += 2 * w;
                            d += 2 * dst_stride;
                            d8 += 2 * dst8_stride;
                            y -= 2;
                        } while (y);
                    } else {
                        do {
                            xy_y_convolve_8tap_32x2_avx512(
                                s, w, coeffs_512, s_512[0], ss_512[0], tt_512[0], r0);
                            xy_y_convolve_8tap_32x2_avx512(
                                s + 32, w, coeffs_512, s_512[1], ss_512[1], tt_512[1], r1);
                            jnt_2d_avg_round_store_64_avx512(r0 + 0, r1 + 0, offset_avg_512, d, d8);
                            jnt_2d_avg_round_store_64_avx512(
                                r0 + 2, r1 + 2, offset_avg_512, d + dst_stride, d8 + dst8_stride);
                            s += 2 * w;
                            d += 2 * dst_stride;
                            d8 += 2 * dst8_stride;
                            y -= 2;
                        } while (y);
                    }
                } else {
                    do {
                        xy_y_convolve_8tap_32x2_avx512(
                            s, w, coeffs_512, s_512[0], ss_512[0], tt_512[0], r0);
                        xy_y_convolve_8tap_32x2_avx512(
                            s + 32, w, coeffs_512, s_512[1], ss_512[1], tt_512[1], r1);
                        jnt_2d_no_avg_round_store_64_avx512(r0 + 0, r1 + 0, offset_no_avg_512, d);
                        jnt_2d_no_avg_round_store_64_avx512(
                            r0 + 2, r1 + 2, offset_no_avg_512, d + dst_stride);
                        s += 2 * w;
                        d += 2 * dst_stride;
                        y -= 2;
                    } while (y);
                }

                x += 64;
            } while (x < w);
        }
    }
}

typedef void (*JntConvolve2dHorTapFunc)(const uint8_t *src, const int32_t src_stride,
                                        const int32_t w, const int32_t h,
                                        const InterpFilterParams *filter_params_x,
                                        const int32_t subpel_x_q4, int16_t *const im_block);

typedef void (*JntConvolve2dVerTapFunc)(const int16_t *const im_block, const int32_t w,
                                        const int32_t h,
                                        const InterpFilterParams *const filter_params_y,
                                        const int32_t subpel_y_q4,
                                        const ConvolveParams *const conv_params, uint8_t *dst8,
                                        const int32_t dst8_stride);

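// The two function-pointer types model the two passes of the 2D filter: a
// horizontal pass that filters into the 16-bit im_block, then a vertical pass
// that rounds and stores into the compound buffer and/or the 8-bit output.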
void svt_av1_jnt_convolve_2d_avx512(const uint8_t *src, int32_t src_stride, uint8_t *dst8,
                                    int32_t dst8_stride, int32_t w, int32_t h,
                                    InterpFilterParams *filter_params_x,
                                    InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
                                    const int32_t subpel_y_q4, ConvolveParams *conv_params) {
    static const JntConvolve2dHorTapFunc jnt_convolve_2d_hor_tap_func_table[MAX_FILTER_TAP + 1] = {
        NULL,
        NULL,
        jnt_convolve_2d_hor_2tap_avx512,
        NULL,
        jnt_convolve_2d_hor_4tap_avx2,
        NULL,
        jnt_convolve_2d_hor_6tap_avx512,
        NULL,
        jnt_convolve_2d_hor_8tap_avx512};
    static const JntConvolve2dVerTapFunc jnt_convolve_2d_ver_tap_func_table[MAX_FILTER_TAP + 1] = {
        NULL,
        jnt_convolve_2d_ver_2tap_half_avx512,
        jnt_convolve_2d_ver_2tap_avx512,
        jnt_convolve_2d_ver_4tap_avx2,
        jnt_convolve_2d_ver_4tap_avx2,
        jnt_convolve_2d_ver_6tap_avx512,
        jnt_convolve_2d_ver_6tap_avx512,
        jnt_convolve_2d_ver_8tap_avx512,
        jnt_convolve_2d_ver_8tap_avx512};
    const int32_t tap_x = get_convolve_tap(filter_params_x->filter_ptr);
    const int32_t tap_y = get_convolve_tap(filter_params_y->filter_ptr);
    const uint8_t *src_ptr = src + ((MAX_FILTER_TAP - tap_y) / 2 - 3) * src_stride;
    // Note: im_block is 8-pixel interlaced for width 32 and up, to avoid data
    // permutation.
    DECLARE_ALIGNED(64, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);

    assert(conv_params->round_0 == 3);
    assert(conv_params->round_1 == COMPOUND_ROUND1_BITS);

    // horizontal filter

    // Compute one extra intermediate row for widths below 64, since those
    // paths process 2 rows per loop iteration.
    const int32_t hh = h + tap_y - (w >= 64);
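    // (The vertical pass needs h + tap_y - 1 intermediate rows; the extra row
    // for w < 64 simply keeps the 2-row loops within bounds.)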

    jnt_convolve_2d_hor_tap_func_table[tap_x](
        src_ptr, src_stride, w, hh, filter_params_x, subpel_x_q4, im_block);

    // vertical filter
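    // The vertical table is indexed by tap count, with tap_y - 1 selecting the
    // specialized half-pel kernel when subpel_y_q4 == 8; for the 4-, 6- and
    // 8-tap filters both entries point at the same function, so only the
    // 2-tap case actually dispatches differently.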
    jnt_convolve_2d_ver_tap_func_table[tap_y - (subpel_y_q4 == 8)](
        im_block, w, h, filter_params_y, subpel_y_q4, conv_params, dst8, dst8_stride);
}
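
// For reference, this entry point is expected to be wired in through the
// runtime CPU dispatch tables (a sketch only, assuming the usual RTCD setup
// in common_dsp_rtcd; the exact flag and pointer names may differ):
//
//     if (flags & HAS_AVX512F)
//         svt_av1_jnt_convolve_2d = svt_av1_jnt_convolve_2d_avx512;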

#endif // EN_AVX512_SUPPORT