/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
 */

#include <assert.h>
#include <immintrin.h>
#include "common_dsp_rtcd.h"
#include "convolve.h"
#include "convolve_avx2.h"
#include "EbDefinitions.h"
#include "EbMemory_SSE4_1.h"

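// Horizontal (x) 2-tap pass of the jointly-compound 2D convolution: filters a
// w x h block and stores the rounded 16-bit intermediate result into im_block
// for the vertical pass that follows.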
static void jnt_convolve_2d_hor_2tap_avx2(const uint8_t *src, const int32_t src_stride,
                                          const int32_t w, const int32_t h,
                                          const InterpFilterParams *filter_params_x,
                                          const int32_t subpel_x_q4, int16_t *const im_block) {
    const uint8_t *src_ptr = src;
    int32_t y = h;
    int16_t *im = im_block;

    if (w <= 8) {
        __m128i coeffs_128;
        prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4, &coeffs_128);

        if (w == 2) {
            do {
                const __m128i r = x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, &coeffs_128);
                xy_x_round_store_2x2_sse2(r, im);
                src_ptr += 2 * src_stride;
                im += 2 * 2;
                y -= 2;
            } while (y);
        } else if (w == 4) {
            do {
                const __m128i r = x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, &coeffs_128);
                xy_x_round_store_4x2_sse2(r, im);
                src_ptr += 2 * src_stride;
                im += 2 * 4;
                y -= 2;
            } while (y);
        } else {
            assert(w == 8);

            do {
                __m128i r[2];

                x_convolve_2tap_8x2_ssse3(src_ptr, src_stride, &coeffs_128, r);
                xy_x_round_store_8x2_sse2(r, im);
                src_ptr += 2 * src_stride;
                im += 2 * 8;
                y -= 2;
            } while (y);
        }
    } else {
        __m256i coeffs_256;
        prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, &coeffs_256);

        if (w == 16) {
            do {
                __m256i r[2];

                x_convolve_2tap_16x2_avx2(src_ptr, src_stride, &coeffs_256, r);
                xy_x_round_store_32_avx2(r, im);
                src_ptr += 2 * src_stride;
                im += 2 * 16;
                y -= 2;
            } while (y);
        } else if (w == 32) {
            do {
                xy_x_2tap_32_avx2(src_ptr, &coeffs_256, im);
                src_ptr += src_stride;
                im += 32;
            } while (--y);
        } else if (w == 64) {
            do {
                xy_x_2tap_32_avx2(src_ptr + 0 * 32, &coeffs_256, im + 0 * 32);
                xy_x_2tap_32_avx2(src_ptr + 1 * 32, &coeffs_256, im + 1 * 32);
                src_ptr += src_stride;
                im += 64;
            } while (--y);
        } else {
            assert(w == 128);

            do {
                xy_x_2tap_32_avx2(src_ptr + 0 * 32, &coeffs_256, im + 0 * 32);
                xy_x_2tap_32_avx2(src_ptr + 1 * 32, &coeffs_256, im + 1 * 32);
                xy_x_2tap_32_avx2(src_ptr + 2 * 32, &coeffs_256, im + 2 * 32);
                xy_x_2tap_32_avx2(src_ptr + 3 * 32, &coeffs_256, im + 3 * 32);
                src_ptr += src_stride;
                im += 128;
            } while (--y);
        }
    }
}

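// Horizontal 4-tap pass; this path only serves the 2- and 4-pixel-wide blocks.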
void jnt_convolve_2d_hor_4tap_avx2(const uint8_t *src, const int32_t src_stride, const int32_t w,
                                   const int32_t h, const InterpFilterParams *filter_params_x,
                                   const int32_t subpel_x_q4, int16_t *const im_block) {
    const uint8_t *src_ptr = src - 1;
    int32_t y = h;
    int16_t *im = im_block;
    __m128i coeffs_128[4];

    prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);

    if (w == 2) {
        do {
            const __m128i r = x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
            xy_x_round_store_2x2_sse2(r, im);
            src_ptr += 2 * src_stride;
            im += 2 * 2;
            y -= 2;
        } while (y);
    } else {
        assert(w == 4);

        do {
            const __m128i r = x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
            xy_x_round_store_4x2_sse2(r, im);
            src_ptr += 2 * src_stride;
            im += 2 * 4;
            y -= 2;
        } while (y);
    }
}

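// Horizontal 6-tap pass for widths 8 to 128; filt1..filt3_global_avx are the
// shuffle masks the x_convolve_6tap kernels use to gather each row's taps.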
static void jnt_convolve_2d_hor_6tap_avx2(const uint8_t *src, const int32_t src_stride,
                                          const int32_t w, const int32_t h,
                                          const InterpFilterParams *filter_params_x,
                                          const int32_t subpel_x_q4, int16_t *const im_block) {
    const uint8_t *src_ptr = src - 2;
    int32_t y = h;
    int16_t *im = im_block;
    __m256i coeffs_256[3], filt_256[3];

    filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx);
    filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx);
    filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx);

    prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);

    if (w == 8) {
        do {
            const __m256i res = x_convolve_6tap_8x2_avx2(src_ptr, src_stride, coeffs_256, filt_256);
            xy_x_round_store_8x2_avx2(res, im);
            src_ptr += 2 * src_stride;
            im += 2 * 8;
            y -= 2;
        } while (y);
    } else if (w == 16) {
        do {
            __m256i r[2];

            x_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256, r);
            xy_x_round_store_32_avx2(r, im);
            src_ptr += 2 * src_stride;
            im += 2 * 16;
            y -= 2;
        } while (y);
    } else if (w == 32) {
        do {
            xy_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
            src_ptr += src_stride;
            im += 32;
        } while (--y);
    } else if (w == 64) {
        do {
            xy_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
            xy_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
            src_ptr += src_stride;
            im += 64;
        } while (--y);
    } else {
        assert(w == 128);

        do {
            xy_x_6tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
            xy_x_6tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
            xy_x_6tap_32_avx2(src_ptr + 64, coeffs_256, filt_256, im + 64);
            xy_x_6tap_32_avx2(src_ptr + 96, coeffs_256, filt_256, im + 96);
            src_ptr += src_stride;
            im += 128;
        } while (--y);
    }
}

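// Horizontal 8-tap pass; same structure as the 6-tap pass with one more
// coefficient pair and shuffle mask (filt4_global_avx).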
static void jnt_convolve_2d_hor_8tap_avx2(const uint8_t *src, const int32_t src_stride,
                                          const int32_t w, const int32_t h,
                                          const InterpFilterParams *filter_params_x,
                                          const int32_t subpel_x_q4, int16_t *const im_block) {
    const uint8_t *src_ptr = src - 3;
    int32_t y = h;
    int16_t *im = im_block;
    __m256i coeffs_256[4], filt_256[4];

    filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx);
    filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx);
    filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx);
    filt_256[3] = _mm256_loadu_si256((__m256i const *)filt4_global_avx);

    prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);

    if (w == 8) {
        do {
            const __m256i res = x_convolve_8tap_8x2_avx2(src_ptr, src_stride, coeffs_256, filt_256);
            xy_x_round_store_8x2_avx2(res, im);
            src_ptr += 2 * src_stride;
            im += 2 * 8;
            y -= 2;
        } while (y);
    } else if (w == 16) {
        do {
            __m256i r[2];

            x_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, filt_256, r);
            xy_x_round_store_32_avx2(r, im);
            src_ptr += 2 * src_stride;
            im += 2 * 16;
            y -= 2;
        } while (y);
    } else if (w == 32) {
        do {
            xy_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
            src_ptr += src_stride;
            im += 32;
        } while (--y);
    } else if (w == 64) {
        do {
            xy_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
            xy_x_8tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
            src_ptr += src_stride;
            im += 64;
        } while (--y);
    } else {
        assert(w == 128);

        do {
            xy_x_8tap_32_avx2(src_ptr, coeffs_256, filt_256, im);
            xy_x_8tap_32_avx2(src_ptr + 32, coeffs_256, filt_256, im + 32);
            xy_x_8tap_32_avx2(src_ptr + 64, coeffs_256, filt_256, im + 64);
            xy_x_8tap_32_avx2(src_ptr + 96, coeffs_256, filt_256, im + 96);
            src_ptr += src_stride;
            im += 128;
        } while (--y);
    }
}

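// Vertical (y) 2-tap pass. Each width has three variants selected by
// conv_params: distance-weighted compound (use_jnt_comp_avg), plain compound
// average (do_average), or a first-pass store to the compound buffer only.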
static void jnt_convolve_2d_ver_2tap_avx2(const int16_t *const im_block, const int32_t w,
                                          const int32_t h,
                                          const InterpFilterParams *const filter_params_y,
                                          const int32_t subpel_y_q4,
                                          const ConvolveParams *const conv_params, uint8_t *dst8,
                                          const int32_t dst8_stride) {
    const int32_t dst_stride = conv_params->dst_stride;
    const int32_t bd = 8;
    const int32_t round_0 = 3;
    const int16_t *im = im_block;
    const int32_t round_1 = COMPOUND_ROUND1_BITS;
    const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0; // 19
    const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1; // 4
    const int32_t round_offset = 1 << (offset_bits - round_1);
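    // Pack {fwd_offset, bck_offset} into one 32-bit lane so a single madd can
    // apply both jnt weights; the offsets below fold the compound rounding and
    // bias terms into one constant per mode, as consumed by the
    // jnt_2d_*_round_store_* helpers.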
    const int32_t factor = conv_params->fwd_offset | (conv_params->bck_offset << 16);
    const int32_t offset_comp_avg = (round_offset + (round_offset >> 1)) * conv_params->bck_offset -
        (round_offset << DIST_PRECISION_BITS) - (round_offset << (DIST_PRECISION_BITS - 1)) +
        (1 << (round_bits + DIST_PRECISION_BITS - 1));
    const int32_t offset_avg = (1 << (round_1 - 1)) + (1 << (round_bits + round_1)) -
        (1 << offset_bits) - (1 << (offset_bits - 1));
    const int32_t offset_no_avg = (1 << (round_1 - 1)) + (1 << offset_bits) +
        (1 << (offset_bits - 1));
    ConvBufType *dst = conv_params->dst;
    int32_t y = h;

    if (w <= 4) {
        const __m128i factor_128 = _mm_set1_epi32(factor);
        const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
        const __m128i offset_avg_128 = _mm_set1_epi32(offset_avg);
        const __m128i offset_no_avg_128 = _mm_set1_epi32(offset_no_avg);
        __m128i coeffs_128;

        prepare_coeffs_2tap_sse2(filter_params_y, subpel_y_q4, &coeffs_128);

        if (w == 2) {
            __m128i s_32[2];

            s_32[0] = _mm_cvtsi32_si128(*(int32_t *)im);

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        const __m128i res = xy_y_convolve_2tap_2x2_sse2(im, s_32, &coeffs_128);
                        jnt_2d_comp_avg_round_store_2x2_sse2(
                            res, factor_128, offset_comp_avg_128, dst, dst_stride, dst8,
                            dst8_stride);
                        im += 2 * 2;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        const __m128i res = xy_y_convolve_2tap_2x2_sse2(im, s_32, &coeffs_128);
                        jnt_2d_avg_round_store_2x2_sse2(
                            res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 2;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    const __m128i res = xy_y_convolve_2tap_2x2_sse2(im, s_32, &coeffs_128);
                    jnt_2d_no_avg_round_store_2x2_sse2(res, offset_no_avg_128, dst, dst_stride);
                    im += 2 * 2;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        } else {
            __m128i s_64[2], r[2];

            assert(w == 4);

            s_64[0] = _mm_loadl_epi64((__m128i *)im);

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        xy_y_convolve_2tap_4x2_sse2(im, s_64, &coeffs_128, r);
                        jnt_2d_comp_avg_round_store_4x2_sse2(
                            r, factor_128, offset_comp_avg_128, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 4;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        xy_y_convolve_2tap_4x2_sse2(im, s_64, &coeffs_128, r);
                        jnt_2d_avg_round_store_4x2_sse2(
                            r, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 4;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    xy_y_convolve_2tap_4x2_sse2(im, s_64, &coeffs_128, r);
                    jnt_2d_no_avg_round_store_4x2_sse2(r, offset_no_avg_128, dst, dst_stride);
                    im += 2 * 4;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        }
    } else {
        const __m256i factor_256 = _mm256_set1_epi32(factor);
        const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
        const __m256i offset_avg_256 = _mm256_set1_epi32(offset_avg);
        const __m256i offset_no_avg_256 = _mm256_set1_epi32(offset_no_avg);
        __m256i coeffs_256;

        prepare_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, &coeffs_256);

        if (w == 8) {
            __m128i s_128[2];
            __m256i r[2];

            s_128[0] = _mm_loadu_si128((__m128i *)im);

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        xy_y_convolve_2tap_8x2_avx2(im, s_128, &coeffs_256, r);
                        jnt_2d_comp_avg_round_store_8x2_avx2(
                            r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 8;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        xy_y_convolve_2tap_8x2_avx2(im, s_128, &coeffs_256, r);
                        jnt_2d_avg_round_store_8x2_avx2(
                            r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 8;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    xy_y_convolve_2tap_8x2_avx2(im, s_128, &coeffs_256, r);
                    jnt_2d_no_avg_round_store_8x2_avx2(r, offset_no_avg_256, dst, dst_stride);
                    im += 2 * 8;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        } else if (w == 16) {
            __m256i s_256[2], r[4];

            s_256[0] = _mm256_loadu_si256((__m256i *)im);

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        xy_y_convolve_2tap_16x2_avx2(im, s_256, &coeffs_256, r);
                        jnt_2d_comp_avg_round_store_16x2_avx2(
                            r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 16;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        xy_y_convolve_2tap_16x2_avx2(im, s_256, &coeffs_256, r);
                        jnt_2d_avg_round_store_16x2_avx2(
                            r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 16;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    xy_y_convolve_2tap_16x2_avx2(im, s_256, &coeffs_256, r);
                    jnt_2d_no_avg_round_store_16x2_avx2(r, offset_no_avg_256, dst, dst_stride);
                    im += 2 * 16;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        } else if (w == 32) {
            __m256i s_256[2][2], r[4];

            s_256[0][0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
            s_256[0][1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        xy_y_convolve_2tap_32_avx2(im + 1 * 32, s_256[0], s_256[1], &coeffs_256, r);
                        jnt_2d_comp_avg_round_store_32_avx2(
                            r + 0, r + 2, factor_256, offset_comp_avg_256, dst, dst8);

                        xy_y_convolve_2tap_32_avx2(im + 2 * 32, s_256[1], s_256[0], &coeffs_256, r);
                        jnt_2d_comp_avg_round_store_32_avx2(
                            r + 0, r + 2, factor_256, offset_comp_avg_256, dst + dst_stride,
                            dst8 + dst8_stride);

                        im += 2 * 32;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        xy_y_convolve_2tap_32_avx2(im + 1 * 32, s_256[0], s_256[1], &coeffs_256, r);
                        jnt_2d_avg_round_store_32_avx2(r + 0, r + 2, offset_avg_256, dst, dst8);

                        xy_y_convolve_2tap_32_avx2(im + 2 * 32, s_256[1], s_256[0], &coeffs_256, r);
                        jnt_2d_avg_round_store_32_avx2(
                            r + 0, r + 2, offset_avg_256, dst + dst_stride, dst8 + dst8_stride);

                        im += 2 * 32;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    xy_y_convolve_2tap_32_avx2(im + 1 * 32, s_256[0], s_256[1], &coeffs_256, r);
                    jnt_2d_no_avg_round_store_32_avx2(r + 0, r + 2, offset_no_avg_256, dst);

                    xy_y_convolve_2tap_32_avx2(im + 2 * 32, s_256[1], s_256[0], &coeffs_256, r);
                    jnt_2d_no_avg_round_store_32_avx2(
                        r + 0, r + 2, offset_no_avg_256, dst + dst_stride);

                    im += 2 * 32;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        } else if (w == 64) {
            __m256i s_256[2][4];

            s_256[0][0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
            s_256[0][1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
            s_256[0][2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
            s_256[0][3] = _mm256_loadu_si256((__m256i *)(im + 3 * 16));

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        __m256i r[4];

                        xy_y_convolve_2tap_32_avx2(
                            im + 2 * 32, s_256[0] + 0, s_256[1] + 0, &coeffs_256, r);
                        jnt_2d_comp_avg_round_store_32_avx2(
                            r + 0, r + 2, factor_256, offset_comp_avg_256, dst, dst8);

                        xy_y_convolve_2tap_32_avx2(
                            im + 3 * 32, s_256[0] + 2, s_256[1] + 2, &coeffs_256, r);
                        jnt_2d_comp_avg_round_store_32_avx2(
                            r + 0, r + 2, factor_256, offset_comp_avg_256, dst + 32, dst8 + 32);
                        im += 2 * 64;

                        xy_y_convolve_2tap_32_avx2(
                            im + 0 * 32, s_256[1] + 0, s_256[0] + 0, &coeffs_256, r);
                        jnt_2d_comp_avg_round_store_32_avx2(
                            r + 0, r + 2, factor_256, offset_comp_avg_256, dst + dst_stride,
                            dst8 + dst8_stride);

                        xy_y_convolve_2tap_32_avx2(
                            im + 1 * 32, s_256[1] + 2, s_256[0] + 2, &coeffs_256, r);
                        jnt_2d_comp_avg_round_store_32_avx2(
                            r + 0, r + 2, factor_256, offset_comp_avg_256, dst + dst_stride + 32,
                            dst8 + dst8_stride + 32);

                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        __m256i r[4];

                        xy_y_convolve_2tap_32_avx2(
                            im + 2 * 32, s_256[0] + 0, s_256[1] + 0, &coeffs_256, r);
                        jnt_2d_avg_round_store_32_avx2(r + 0, r + 2, offset_avg_256, dst, dst8);

                        xy_y_convolve_2tap_32_avx2(
                            im + 3 * 32, s_256[0] + 2, s_256[1] + 2, &coeffs_256, r);
                        jnt_2d_avg_round_store_32_avx2(
                            r + 0, r + 2, offset_avg_256, dst + 32, dst8 + 32);
                        im += 2 * 64;

                        xy_y_convolve_2tap_32_avx2(
                            im + 0 * 32, s_256[1] + 0, s_256[0] + 0, &coeffs_256, r);
                        jnt_2d_avg_round_store_32_avx2(
                            r + 0, r + 2, offset_avg_256, dst + dst_stride, dst8 + dst8_stride);

                        xy_y_convolve_2tap_32_avx2(
                            im + 1 * 32, s_256[1] + 2, s_256[0] + 2, &coeffs_256, r);
                        jnt_2d_avg_round_store_32_avx2(
                            r + 0, r + 2, offset_avg_256, dst + dst_stride + 32,
                            dst8 + dst8_stride + 32);

                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    __m256i r[4];

                    xy_y_convolve_2tap_32_avx2(
                        im + 2 * 32, s_256[0] + 0, s_256[1] + 0, &coeffs_256, r);
                    jnt_2d_no_avg_round_store_32_avx2(r + 0, r + 2, offset_no_avg_256, dst);

                    xy_y_convolve_2tap_32_avx2(
                        im + 3 * 32, s_256[0] + 2, s_256[1] + 2, &coeffs_256, r);
                    jnt_2d_no_avg_round_store_32_avx2(r + 0, r + 2, offset_no_avg_256, dst + 32);
                    im += 2 * 64;

                    xy_y_convolve_2tap_32_avx2(
                        im + 0 * 32, s_256[1] + 0, s_256[0] + 0, &coeffs_256, r);
                    jnt_2d_no_avg_round_store_32_avx2(
                        r + 0, r + 2, offset_no_avg_256, dst + dst_stride);

                    xy_y_convolve_2tap_32_avx2(
                        im + 1 * 32, s_256[1] + 2, s_256[0] + 2, &coeffs_256, r);
                    jnt_2d_no_avg_round_store_32_avx2(
                        r + 0, r + 2, offset_no_avg_256, dst + dst_stride + 32);

                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        } else {
            __m256i s_256[2][8];

            assert(w == 128);

            load_16bit_8rows_avx2(im, 16, s_256[0]);

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        __m256i r[4];

                        xy_y_convolve_2tap_32_avx2(
                            im + 4 * 32, s_256[0] + 0, s_256[1] + 0, &coeffs_256, r);
                        jnt_2d_comp_avg_round_store_32_avx2(
                            r + 0, r + 2, factor_256, offset_comp_avg_256, dst, dst8);

                        xy_y_convolve_2tap_32_avx2(
                            im + 5 * 32, s_256[0] + 2, s_256[1] + 2, &coeffs_256, r);
                        jnt_2d_comp_avg_round_store_32_avx2(
                            r + 0, r + 2, factor_256, offset_comp_avg_256, dst + 1 * 32,
                            dst8 + 1 * 32);

                        xy_y_convolve_2tap_32_avx2(
                            im + 6 * 32, s_256[0] + 4, s_256[1] + 4, &coeffs_256, r);
                        jnt_2d_comp_avg_round_store_32_avx2(
                            r + 0, r + 2, factor_256, offset_comp_avg_256, dst + 2 * 32,
                            dst8 + 2 * 32);

                        xy_y_convolve_2tap_32_avx2(
                            im + 7 * 32, s_256[0] + 6, s_256[1] + 6, &coeffs_256, r);
                        jnt_2d_comp_avg_round_store_32_avx2(
                            r + 0, r + 2, factor_256, offset_comp_avg_256, dst + 3 * 32,
                            dst8 + 3 * 32);
                        im += 2 * 128;

                        xy_y_convolve_2tap_32_avx2(
                            im + 0 * 32, s_256[1] + 0, s_256[0] + 0, &coeffs_256, r);
                        jnt_2d_comp_avg_round_store_32_avx2(
                            r + 0, r + 2, factor_256, offset_comp_avg_256,
                            dst + dst_stride + 0 * 32, dst8 + dst8_stride + 0 * 32);

                        xy_y_convolve_2tap_32_avx2(
                            im + 1 * 32, s_256[1] + 2, s_256[0] + 2, &coeffs_256, r);
                        jnt_2d_comp_avg_round_store_32_avx2(
                            r + 0, r + 2, factor_256, offset_comp_avg_256,
                            dst + dst_stride + 1 * 32, dst8 + dst8_stride + 1 * 32);

                        xy_y_convolve_2tap_32_avx2(
                            im + 2 * 32, s_256[1] + 4, s_256[0] + 4, &coeffs_256, r);
                        jnt_2d_comp_avg_round_store_32_avx2(
                            r + 0, r + 2, factor_256, offset_comp_avg_256,
                            dst + dst_stride + 2 * 32, dst8 + dst8_stride + 2 * 32);

                        xy_y_convolve_2tap_32_avx2(
                            im + 3 * 32, s_256[1] + 6, s_256[0] + 6, &coeffs_256, r);
                        jnt_2d_comp_avg_round_store_32_avx2(
                            r + 0, r + 2, factor_256, offset_comp_avg_256,
                            dst + dst_stride + 3 * 32, dst8 + dst8_stride + 3 * 32);

                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        __m256i r[4];

                        xy_y_convolve_2tap_32_avx2(
                            im + 4 * 32, s_256[0] + 0, s_256[1] + 0, &coeffs_256, r);
                        jnt_2d_avg_round_store_32_avx2(
                            r + 0, r + 2, offset_avg_256, dst + 0 * 32, dst8 + 0 * 32);

                        xy_y_convolve_2tap_32_avx2(
                            im + 5 * 32, s_256[0] + 2, s_256[1] + 2, &coeffs_256, r);
                        jnt_2d_avg_round_store_32_avx2(
                            r + 0, r + 2, offset_avg_256, dst + 1 * 32, dst8 + 1 * 32);

                        xy_y_convolve_2tap_32_avx2(
                            im + 6 * 32, s_256[0] + 4, s_256[1] + 4, &coeffs_256, r);
                        jnt_2d_avg_round_store_32_avx2(
                            r + 0, r + 2, offset_avg_256, dst + 2 * 32, dst8 + 2 * 32);

                        xy_y_convolve_2tap_32_avx2(
                            im + 7 * 32, s_256[0] + 6, s_256[1] + 6, &coeffs_256, r);
                        jnt_2d_avg_round_store_32_avx2(
                            r + 0, r + 2, offset_avg_256, dst + 3 * 32, dst8 + 3 * 32);
                        im += 2 * 128;

                        xy_y_convolve_2tap_32_avx2(
                            im + 0 * 32, s_256[1] + 0, s_256[0] + 0, &coeffs_256, r);
                        jnt_2d_avg_round_store_32_avx2(
                            r + 0, r + 2, offset_avg_256, dst + dst_stride + 0 * 32,
                            dst8 + dst8_stride + 0 * 32);

                        xy_y_convolve_2tap_32_avx2(
                            im + 1 * 32, s_256[1] + 2, s_256[0] + 2, &coeffs_256, r);
                        jnt_2d_avg_round_store_32_avx2(
                            r + 0, r + 2, offset_avg_256, dst + dst_stride + 1 * 32,
                            dst8 + dst8_stride + 1 * 32);

                        xy_y_convolve_2tap_32_avx2(
                            im + 2 * 32, s_256[1] + 4, s_256[0] + 4, &coeffs_256, r);
                        jnt_2d_avg_round_store_32_avx2(
                            r + 0, r + 2, offset_avg_256, dst + dst_stride + 2 * 32,
                            dst8 + dst8_stride + 2 * 32);

                        xy_y_convolve_2tap_32_avx2(
                            im + 3 * 32, s_256[1] + 6, s_256[0] + 6, &coeffs_256, r);
                        jnt_2d_avg_round_store_32_avx2(
                            r + 0, r + 2, offset_avg_256, dst + dst_stride + 3 * 32,
                            dst8 + dst8_stride + 3 * 32);

                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    __m256i r[4];

                    xy_y_convolve_2tap_32_avx2(
                        im + 4 * 32, s_256[0] + 0, s_256[1] + 0, &coeffs_256, r);
                    jnt_2d_no_avg_round_store_32_avx2(
                        r + 0, r + 2, offset_no_avg_256, dst + 0 * 32);

                    xy_y_convolve_2tap_32_avx2(
                        im + 5 * 32, s_256[0] + 2, s_256[1] + 2, &coeffs_256, r);
                    jnt_2d_no_avg_round_store_32_avx2(
                        r + 0, r + 2, offset_no_avg_256, dst + 1 * 32);

                    xy_y_convolve_2tap_32_avx2(
                        im + 6 * 32, s_256[0] + 4, s_256[1] + 4, &coeffs_256, r);
                    jnt_2d_no_avg_round_store_32_avx2(
                        r + 0, r + 2, offset_no_avg_256, dst + 2 * 32);

                    xy_y_convolve_2tap_32_avx2(
                        im + 7 * 32, s_256[0] + 6, s_256[1] + 6, &coeffs_256, r);
                    jnt_2d_no_avg_round_store_32_avx2(
                        r + 0, r + 2, offset_no_avg_256, dst + 3 * 32);
                    im += 2 * 128;

                    xy_y_convolve_2tap_32_avx2(
                        im + 0 * 32, s_256[1] + 0, s_256[0] + 0, &coeffs_256, r);
                    jnt_2d_no_avg_round_store_32_avx2(
                        r + 0, r + 2, offset_no_avg_256, dst + dst_stride + 0 * 32);

                    xy_y_convolve_2tap_32_avx2(
                        im + 1 * 32, s_256[1] + 2, s_256[0] + 2, &coeffs_256, r);
                    jnt_2d_no_avg_round_store_32_avx2(
                        r + 0, r + 2, offset_no_avg_256, dst + dst_stride + 1 * 32);

                    xy_y_convolve_2tap_32_avx2(
                        im + 2 * 32, s_256[1] + 4, s_256[0] + 4, &coeffs_256, r);
                    jnt_2d_no_avg_round_store_32_avx2(
                        r + 0, r + 2, offset_no_avg_256, dst + dst_stride + 2 * 32);

                    xy_y_convolve_2tap_32_avx2(
                        im + 3 * 32, s_256[1] + 6, s_256[0] + 6, &coeffs_256, r);
                    jnt_2d_no_avg_round_store_32_avx2(
                        r + 0, r + 2, offset_no_avg_256, dst + dst_stride + 3 * 32);

                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        }
    }
}

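// Vertical 2-tap pass specialized for the half-pel offset, where the two taps
// are equal and the filter reduces to an average of adjacent rows, so
// filter_params_y and subpel_y_q4 go unused.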
static void jnt_convolve_2d_ver_2tap_half_avx2(const int16_t *const im_block, const int32_t w,
                                               const int32_t h,
                                               const InterpFilterParams *const filter_params_y,
                                               const int32_t subpel_y_q4,
                                               const ConvolveParams *const conv_params,
                                               uint8_t *dst8, const int32_t dst8_stride) {
    const int32_t dst_stride = conv_params->dst_stride;
    const int32_t bd = 8;
    const int32_t round_0 = 3;
    const int16_t *im = im_block;
    const int32_t round_1 = COMPOUND_ROUND1_BITS;
    const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0; // 19
    const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1; // 4
    const int32_t round_offset = 1 << (offset_bits - round_1);
    const int32_t factor = conv_params->fwd_offset | (conv_params->bck_offset << 16);
    const int32_t offset_comp_avg = (round_offset + (round_offset >> 1)) * conv_params->bck_offset -
        (round_offset << DIST_PRECISION_BITS) - (round_offset << (DIST_PRECISION_BITS - 1)) +
        (1 << (round_bits + DIST_PRECISION_BITS - 1));
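    // Note: unlike the general vertical path, these offsets appear pre-scaled
    // down by COMPOUND_ROUND1_BITS and are broadcast as 16-bit lanes, which
    // matches the cheaper 16-bit arithmetic of the half-pel kernels.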
    const int32_t offset_avg = (1 << (round_1 - COMPOUND_ROUND1_BITS)) +
        (1 << (round_bits + round_1 - COMPOUND_ROUND1_BITS + 1)) -
        (1 << (offset_bits - COMPOUND_ROUND1_BITS + 1)) -
        (1 << (offset_bits - COMPOUND_ROUND1_BITS));
    const int32_t offset_no_avg = (1 << (round_1 - COMPOUND_ROUND1_BITS)) +
        (1 << (offset_bits - COMPOUND_ROUND1_BITS + 1)) +
        (1 << (offset_bits - COMPOUND_ROUND1_BITS));
    ConvBufType *dst = conv_params->dst;
    int32_t y = h;

    (void)filter_params_y;
    (void)subpel_y_q4;

    if (w <= 4) {
        const __m128i factor_128 = _mm_set1_epi32(factor);
        const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
        const __m128i offset_avg_128 = _mm_set1_epi16(offset_avg);
        const __m128i offset_no_avg_128 = _mm_set1_epi16(offset_no_avg);

        if (w == 2) {
            __m128i s_32[2];

            s_32[0] = _mm_cvtsi32_si128(*(int32_t *)im);

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        const __m128i res = xy_y_convolve_2tap_2x2_half_pel_sse2(im, s_32);
                        jnt_2d_comp_avg_round_store_half_pel_2x2_sse2(
                            res, factor_128, offset_comp_avg_128, dst, dst_stride, dst8,
                            dst8_stride);
                        im += 2 * 2;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        const __m128i res = xy_y_convolve_2tap_2x2_half_pel_sse2(im, s_32);
                        jnt_2d_avg_round_store_half_pel_2x2_sse2(
                            res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 2;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    const __m128i res = xy_y_convolve_2tap_2x2_half_pel_sse2(im, s_32);
                    jnt_2d_no_avg_round_store_half_pel_2x2_sse2(
                        res, offset_no_avg_128, dst, dst_stride);
                    im += 2 * 2;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        } else {
            __m128i s_64[2];

            assert(w == 4);

            s_64[0] = _mm_loadl_epi64((__m128i *)im);

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        const __m128i res = xy_y_convolve_2tap_4x2_half_pel_sse2(im, s_64);
                        jnt_2d_comp_avg_round_store_half_pel_4x2_sse2(
                            res, factor_128, offset_comp_avg_128, dst, dst_stride, dst8,
                            dst8_stride);
                        im += 2 * 4;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        const __m128i res = xy_y_convolve_2tap_4x2_half_pel_sse2(im, s_64);
                        jnt_2d_avg_round_store_half_pel_4x2_sse2(
                            res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 4;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    const __m128i res = xy_y_convolve_2tap_4x2_half_pel_sse2(im, s_64);
                    jnt_2d_no_avg_round_store_half_pel_4x2_sse2(
                        res, offset_no_avg_128, dst, dst_stride);
                    im += 2 * 4;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        }
    } else {
        const __m256i factor_256 = _mm256_set1_epi32(factor);
        const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
        const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);
        const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);

        if (w == 8) {
            __m128i s_128[2];

            s_128[0] = _mm_loadu_si128((__m128i *)im);

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        const __m256i res = xy_y_convolve_2tap_8x2_half_pel_avx2(im, s_128);
                        jnt_2d_comp_avg_round_store_half_pel_8x2_avx2(
                            res, factor_256, offset_comp_avg_256, dst, dst_stride, dst8,
                            dst8_stride);
                        im += 2 * 8;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        const __m256i res = xy_y_convolve_2tap_8x2_half_pel_avx2(im, s_128);
                        jnt_2d_avg_round_store_half_pel_8x2_avx2(
                            res, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 8;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    const __m256i res = xy_y_convolve_2tap_8x2_half_pel_avx2(im, s_128);
                    jnt_2d_no_avg_round_store_half_pel_8x2_avx2(
                        res, offset_no_avg_256, dst, dst_stride);
                    im += 2 * 8;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        } else if (w == 16) {
            __m256i s_256[2], r[2];

            s_256[0] = _mm256_loadu_si256((__m256i *)im);

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        xy_y_convolve_2tap_16x2_half_pel_avx2(im, s_256, r);
                        jnt_2d_comp_avg_round_store_half_pel_16x2_avx2(
                            r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8,
                            dst8_stride);
                        im += 2 * 16;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        xy_y_convolve_2tap_16x2_half_pel_avx2(im, s_256, r);
                        jnt_2d_avg_round_store_half_pel_16x2_avx2(
                            r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 16;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    xy_y_convolve_2tap_16x2_half_pel_avx2(im, s_256, r);
                    jnt_2d_no_avg_round_store_half_pel_16x2_avx2(
                        r, offset_no_avg_256, dst, dst_stride);
                    im += 2 * 16;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        } else if (w == 32) {
            __m256i s_256[2][2], r[2];

            s_256[0][0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
            s_256[0][1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        xy_y_convolve_2tap_half_pel_32_avx2(im + 1 * 32, s_256[0], s_256[1], r);
                        jnt_2d_comp_avg_round_store_half_pel_32_avx2(
                            r, factor_256, offset_comp_avg_256, dst, dst8);

                        xy_y_convolve_2tap_half_pel_32_avx2(im + 2 * 32, s_256[1], s_256[0], r);
                        jnt_2d_comp_avg_round_store_half_pel_32_avx2(
                            r, factor_256, offset_comp_avg_256, dst + dst_stride,
                            dst8 + dst8_stride);

                        im += 2 * 32;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        xy_y_convolve_2tap_half_pel_32_avx2(im + 1 * 32, s_256[0], s_256[1], r);
                        jnt_2d_avg_round_store_half_pel_32_avx2(r, offset_avg_256, dst, dst8);

                        xy_y_convolve_2tap_half_pel_32_avx2(im + 2 * 32, s_256[1], s_256[0], r);
                        jnt_2d_avg_round_store_half_pel_32_avx2(
                            r, offset_avg_256, dst + dst_stride, dst8 + dst8_stride);

                        im += 2 * 32;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    xy_y_convolve_2tap_half_pel_32_avx2(im + 1 * 32, s_256[0], s_256[1], r);
                    jnt_2d_no_avg_round_store_half_pel_32_avx2(r, offset_no_avg_256, dst);

                    xy_y_convolve_2tap_half_pel_32_avx2(im + 2 * 32, s_256[1], s_256[0], r);
                    jnt_2d_no_avg_round_store_half_pel_32_avx2(
                        r, offset_no_avg_256, dst + dst_stride);

                    im += 2 * 32;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        } else if (w == 64) {
            __m256i s_256[2][4];

            s_256[0][0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
            s_256[0][1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
            s_256[0][2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));
            s_256[0][3] = _mm256_loadu_si256((__m256i *)(im + 3 * 16));

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        __m256i r[2];

                        xy_y_convolve_2tap_half_pel_32_avx2(
                            im + 2 * 32, s_256[0] + 0, s_256[1] + 0, r);
                        jnt_2d_comp_avg_round_store_half_pel_32_avx2(
                            r, factor_256, offset_comp_avg_256, dst, dst8);

                        xy_y_convolve_2tap_half_pel_32_avx2(
                            im + 3 * 32, s_256[0] + 2, s_256[1] + 2, r);
                        jnt_2d_comp_avg_round_store_half_pel_32_avx2(
                            r, factor_256, offset_comp_avg_256, dst + 32, dst8 + 32);
                        im += 2 * 64;

                        xy_y_convolve_2tap_half_pel_32_avx2(
                            im + 0 * 32, s_256[1] + 0, s_256[0] + 0, r);
                        jnt_2d_comp_avg_round_store_half_pel_32_avx2(
                            r, factor_256, offset_comp_avg_256, dst + dst_stride,
                            dst8 + dst8_stride);

                        xy_y_convolve_2tap_half_pel_32_avx2(
                            im + 1 * 32, s_256[1] + 2, s_256[0] + 2, r);
                        jnt_2d_comp_avg_round_store_half_pel_32_avx2(
                            r, factor_256, offset_comp_avg_256, dst + dst_stride + 32,
                            dst8 + dst8_stride + 32);

                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        __m256i r[2];

                        xy_y_convolve_2tap_half_pel_32_avx2(
                            im + 2 * 32, s_256[0] + 0, s_256[1] + 0, r);
                        jnt_2d_avg_round_store_half_pel_32_avx2(r, offset_avg_256, dst, dst8);

                        xy_y_convolve_2tap_half_pel_32_avx2(
                            im + 3 * 32, s_256[0] + 2, s_256[1] + 2, r);
                        jnt_2d_avg_round_store_half_pel_32_avx2(
                            r, offset_avg_256, dst + 32, dst8 + 32);
                        im += 2 * 64;

                        xy_y_convolve_2tap_half_pel_32_avx2(
                            im + 0 * 32, s_256[1] + 0, s_256[0] + 0, r);
                        jnt_2d_avg_round_store_half_pel_32_avx2(
                            r, offset_avg_256, dst + dst_stride, dst8 + dst8_stride);

                        xy_y_convolve_2tap_half_pel_32_avx2(
                            im + 1 * 32, s_256[1] + 2, s_256[0] + 2, r);
                        jnt_2d_avg_round_store_half_pel_32_avx2(
                            r, offset_avg_256, dst + dst_stride + 32, dst8 + dst8_stride + 32);

                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    __m256i r[2];

                    xy_y_convolve_2tap_half_pel_32_avx2(im + 2 * 32, s_256[0] + 0, s_256[1] + 0, r);
                    jnt_2d_no_avg_round_store_half_pel_32_avx2(r, offset_no_avg_256, dst);

                    xy_y_convolve_2tap_half_pel_32_avx2(im + 3 * 32, s_256[0] + 2, s_256[1] + 2, r);
                    jnt_2d_no_avg_round_store_half_pel_32_avx2(r, offset_no_avg_256, dst + 32);
                    im += 2 * 64;

                    xy_y_convolve_2tap_half_pel_32_avx2(im + 0 * 32, s_256[1] + 0, s_256[0] + 0, r);
                    jnt_2d_no_avg_round_store_half_pel_32_avx2(
                        r, offset_no_avg_256, dst + dst_stride);

                    xy_y_convolve_2tap_half_pel_32_avx2(im + 1 * 32, s_256[1] + 2, s_256[0] + 2, r);
                    jnt_2d_no_avg_round_store_half_pel_32_avx2(
                        r, offset_no_avg_256, dst + dst_stride + 32);

                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        } else {
            __m256i s_256[2][8];

            assert(w == 128);

            load_16bit_8rows_avx2(im, 16, s_256[0]);

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        __m256i r[2];

                        xy_y_convolve_2tap_half_pel_32_avx2(
                            im + 4 * 32, s_256[0] + 0, s_256[1] + 0, r);
                        jnt_2d_comp_avg_round_store_half_pel_32_avx2(
                            r, factor_256, offset_comp_avg_256, dst + 0 * 32, dst8 + 0 * 32);

                        xy_y_convolve_2tap_half_pel_32_avx2(
                            im + 5 * 32, s_256[0] + 2, s_256[1] + 2, r);
                        jnt_2d_comp_avg_round_store_half_pel_32_avx2(
                            r, factor_256, offset_comp_avg_256, dst + 1 * 32, dst8 + 1 * 32);

                        xy_y_convolve_2tap_half_pel_32_avx2(
                            im + 6 * 32, s_256[0] + 4, s_256[1] + 4, r);
                        jnt_2d_comp_avg_round_store_half_pel_32_avx2(
                            r, factor_256, offset_comp_avg_256, dst + 2 * 32, dst8 + 2 * 32);

                        xy_y_convolve_2tap_half_pel_32_avx2(
                            im + 7 * 32, s_256[0] + 6, s_256[1] + 6, r);
                        jnt_2d_comp_avg_round_store_half_pel_32_avx2(
                            r, factor_256, offset_comp_avg_256, dst + 3 * 32, dst8 + 3 * 32);
                        im += 2 * 128;

                        xy_y_convolve_2tap_half_pel_32_avx2(
                            im + 0 * 32, s_256[1] + 0, s_256[0] + 0, r);
                        jnt_2d_comp_avg_round_store_half_pel_32_avx2(
                            r, factor_256, offset_comp_avg_256, dst + dst_stride + 0 * 32,
                            dst8 + dst8_stride + 0 * 32);

                        xy_y_convolve_2tap_half_pel_32_avx2(
                            im + 1 * 32, s_256[1] + 2, s_256[0] + 2, r);
                        jnt_2d_comp_avg_round_store_half_pel_32_avx2(
                            r, factor_256, offset_comp_avg_256, dst + dst_stride + 1 * 32,
                            dst8 + dst8_stride + 1 * 32);

                        xy_y_convolve_2tap_half_pel_32_avx2(
                            im + 2 * 32, s_256[1] + 4, s_256[0] + 4, r);
                        jnt_2d_comp_avg_round_store_half_pel_32_avx2(
                            r, factor_256, offset_comp_avg_256, dst + dst_stride + 2 * 32,
                            dst8 + dst8_stride + 2 * 32);

                        xy_y_convolve_2tap_half_pel_32_avx2(
                            im + 3 * 32, s_256[1] + 6, s_256[0] + 6, r);
                        jnt_2d_comp_avg_round_store_half_pel_32_avx2(
                            r, factor_256, offset_comp_avg_256, dst + dst_stride + 3 * 32,
                            dst8 + dst8_stride + 3 * 32);

                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        __m256i r[2];

                        xy_y_convolve_2tap_half_pel_32_avx2(
                            im + 4 * 32, s_256[0] + 0, s_256[1] + 0, r);
                        jnt_2d_avg_round_store_half_pel_32_avx2(
                            r, offset_avg_256, dst + 0 * 32, dst8 + 0 * 32);

                        xy_y_convolve_2tap_half_pel_32_avx2(
                            im + 5 * 32, s_256[0] + 2, s_256[1] + 2, r);
                        jnt_2d_avg_round_store_half_pel_32_avx2(
                            r, offset_avg_256, dst + 1 * 32, dst8 + 1 * 32);

                        xy_y_convolve_2tap_half_pel_32_avx2(
                            im + 6 * 32, s_256[0] + 4, s_256[1] + 4, r);
                        jnt_2d_avg_round_store_half_pel_32_avx2(
                            r, offset_avg_256, dst + 2 * 32, dst8 + 2 * 32);

                        xy_y_convolve_2tap_half_pel_32_avx2(
                            im + 7 * 32, s_256[0] + 6, s_256[1] + 6, r);
                        jnt_2d_avg_round_store_half_pel_32_avx2(
                            r, offset_avg_256, dst + 3 * 32, dst8 + 3 * 32);
                        im += 2 * 128;

                        xy_y_convolve_2tap_half_pel_32_avx2(
                            im + 0 * 32, s_256[1] + 0, s_256[0] + 0, r);
                        jnt_2d_avg_round_store_half_pel_32_avx2(
                            r, offset_avg_256, dst + dst_stride + 0 * 32,
                            dst8 + dst8_stride + 0 * 32);

                        xy_y_convolve_2tap_half_pel_32_avx2(
                            im + 1 * 32, s_256[1] + 2, s_256[0] + 2, r);
                        jnt_2d_avg_round_store_half_pel_32_avx2(
                            r, offset_avg_256, dst + dst_stride + 1 * 32,
                            dst8 + dst8_stride + 1 * 32);

                        xy_y_convolve_2tap_half_pel_32_avx2(
                            im + 2 * 32, s_256[1] + 4, s_256[0] + 4, r);
                        jnt_2d_avg_round_store_half_pel_32_avx2(
                            r, offset_avg_256, dst + dst_stride + 2 * 32,
                            dst8 + dst8_stride + 2 * 32);

                        xy_y_convolve_2tap_half_pel_32_avx2(
                            im + 3 * 32, s_256[1] + 6, s_256[0] + 6, r);
                        jnt_2d_avg_round_store_half_pel_32_avx2(
                            r, offset_avg_256, dst + dst_stride + 3 * 32,
                            dst8 + dst8_stride + 3 * 32);

                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    __m256i r[2];

                    xy_y_convolve_2tap_half_pel_32_avx2(im + 4 * 32, s_256[0] + 0, s_256[1] + 0, r);
                    jnt_2d_no_avg_round_store_half_pel_32_avx2(r, offset_no_avg_256, dst + 0 * 32);

                    xy_y_convolve_2tap_half_pel_32_avx2(im + 5 * 32, s_256[0] + 2, s_256[1] + 2, r);
                    jnt_2d_no_avg_round_store_half_pel_32_avx2(r, offset_no_avg_256, dst + 1 * 32);

                    xy_y_convolve_2tap_half_pel_32_avx2(im + 6 * 32, s_256[0] + 4, s_256[1] + 4, r);
                    jnt_2d_no_avg_round_store_half_pel_32_avx2(r, offset_no_avg_256, dst + 2 * 32);

                    xy_y_convolve_2tap_half_pel_32_avx2(im + 7 * 32, s_256[0] + 6, s_256[1] + 6, r);
                    jnt_2d_no_avg_round_store_half_pel_32_avx2(r, offset_no_avg_256, dst + 3 * 32);
                    im += 2 * 128;

                    xy_y_convolve_2tap_half_pel_32_avx2(im + 0 * 32, s_256[1] + 0, s_256[0] + 0, r);
                    jnt_2d_no_avg_round_store_half_pel_32_avx2(
                        r, offset_no_avg_256, dst + dst_stride + 0 * 32);

                    xy_y_convolve_2tap_half_pel_32_avx2(im + 1 * 32, s_256[1] + 2, s_256[0] + 2, r);
                    jnt_2d_no_avg_round_store_half_pel_32_avx2(
                        r, offset_no_avg_256, dst + dst_stride + 1 * 32);

                    xy_y_convolve_2tap_half_pel_32_avx2(im + 2 * 32, s_256[1] + 4, s_256[0] + 4, r);
                    jnt_2d_no_avg_round_store_half_pel_32_avx2(
                        r, offset_no_avg_256, dst + dst_stride + 2 * 32);

                    xy_y_convolve_2tap_half_pel_32_avx2(im + 3 * 32, s_256[1] + 6, s_256[0] + 6, r);
                    jnt_2d_no_avg_round_store_half_pel_32_avx2(
                        r, offset_no_avg_256, dst + dst_stride + 3 * 32);

                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        }
    }
}

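// Vertical 4-tap pass; handles block widths 2 through 16.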
void jnt_convolve_2d_ver_4tap_avx2(const int16_t *const im_block, const int32_t w, const int32_t h,
                                   const InterpFilterParams *const filter_params_y,
                                   const int32_t subpel_y_q4,
                                   const ConvolveParams *const conv_params, uint8_t *dst8,
                                   const int32_t dst8_stride) {
    const int32_t dst_stride = conv_params->dst_stride;
    const int32_t bd = 8;
    const int32_t round_0 = 3;
    const int16_t *im = im_block;
    const int32_t round_1 = COMPOUND_ROUND1_BITS;
    const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0; // 19
    const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1; // 4
    const int32_t round_offset = 1 << (offset_bits - round_1);
    const int32_t factor = conv_params->fwd_offset | (conv_params->bck_offset << 16);
    const int32_t offset_comp_avg = (round_offset + (round_offset >> 1)) * conv_params->bck_offset -
        (round_offset << DIST_PRECISION_BITS) - (round_offset << (DIST_PRECISION_BITS - 1)) +
        (1 << (round_bits + DIST_PRECISION_BITS - 1));
    const int32_t offset_avg = (1 << (round_1 - 1)) + (1 << (round_bits + round_1)) -
        (1 << offset_bits) - (1 << (offset_bits - 1));
    const int32_t offset_no_avg = (1 << (round_1 - 1)) + (1 << offset_bits) +
        (1 << (offset_bits - 1));
    int32_t y = h;
    ConvBufType *dst = conv_params->dst;

    if (w == 2) {
        const __m128i factor_128 = _mm_set1_epi32(factor);
        const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
        const __m128i offset_avg_128 = _mm_set1_epi32(offset_avg);
        const __m128i offset_no_avg_128 = _mm_set1_epi32(offset_no_avg);
        __m128i coeffs_128[4], s_32[4], ss_128[2];

        prepare_coeffs_4tap_sse2(filter_params_y, subpel_y_q4, coeffs_128);

        s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
        s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
        s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));

        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);

        ss_128[0] = _mm_unpacklo_epi16(src01, src12);

        if (conv_params->do_average) {
            if (conv_params->use_jnt_comp_avg) {
                do {
                    const __m128i res = xy_y_convolve_4tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
                    jnt_2d_comp_avg_round_store_2x2_sse2(
                        res, factor_128, offset_comp_avg_128, dst, dst_stride, dst8, dst8_stride);
                    im += 2 * 2;
                    dst += 2 * dst_stride;
                    dst8 += 2 * dst8_stride;
                    y -= 2;
                } while (y);
            } else {
                do {
                    const __m128i res = xy_y_convolve_4tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
                    jnt_2d_avg_round_store_2x2_sse2(
                        res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
                    im += 2 * 2;
                    dst += 2 * dst_stride;
                    dst8 += 2 * dst8_stride;
                    y -= 2;
                } while (y);
            }
        } else {
            do {
                const __m128i res = xy_y_convolve_4tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
                jnt_2d_no_avg_round_store_2x2_sse2(res, offset_no_avg_128, dst, dst_stride);
                im += 2 * 2;
                dst += 2 * dst_stride;
                y -= 2;
            } while (y);
        }
    } else {
        const __m256i factor_256 = _mm256_set1_epi32(factor);
        const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
        const __m256i offset_avg_256 = _mm256_set1_epi32(offset_avg);
        const __m256i offset_no_avg_256 = _mm256_set1_epi32(offset_no_avg);
        __m256i coeffs_256[4];

        prepare_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);

        if (w == 4) {
            __m128i s_64[4];
            __m256i s_256[2], ss_256[2];

            s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
            s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
            s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));

            // Load lines a and b. Line a to lower 128, line b to upper 128
            s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
            s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);

            ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        const __m256i res = xy_y_convolve_4tap_4x2_avx2(
                            im, s_64, ss_256, coeffs_256);
                        jnt_2d_comp_avg_round_store_4x2_avx2(
                            res, factor_256, offset_comp_avg_256, dst, dst_stride, dst8,
                            dst8_stride);
                        im += 2 * 4;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        const __m256i res = xy_y_convolve_4tap_4x2_avx2(
                            im, s_64, ss_256, coeffs_256);
                        jnt_2d_avg_round_store_4x2_avx2(
                            res, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 4;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    const __m256i res = xy_y_convolve_4tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
                    jnt_2d_no_avg_round_store_4x2_avx2(res, offset_no_avg_256, dst, dst_stride);
                    im += 2 * 4;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        } else if (w == 8) {
            __m256i s_256[4], r[2];

            s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
            s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));

            __m256i ss_256[4];

            ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
            ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        xy_y_convolve_4tap_8x2_avx2(im, ss_256, coeffs_256, r);
                        jnt_2d_comp_avg_round_store_8x2_avx2(
                            r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 8;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        xy_y_convolve_4tap_8x2_avx2(im, ss_256, coeffs_256, r);
                        jnt_2d_avg_round_store_8x2_avx2(
                            r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 8;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    xy_y_convolve_4tap_8x2_avx2(im, ss_256, coeffs_256, r);
                    jnt_2d_no_avg_round_store_8x2_avx2(r, offset_no_avg_256, dst, dst_stride);
                    im += 2 * 8;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        } else {
            __m256i s_256[5];

            assert(w == 16);

            s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 16));
            s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 16));
            s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 16));

            __m256i ss_256[4], tt_256[4], r[4];

            ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
            ss_256[2] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);

            tt_256[0] = _mm256_unpacklo_epi16(s_256[1], s_256[2]);
            tt_256[2] = _mm256_unpackhi_epi16(s_256[1], s_256[2]);

            if (conv_params->do_average) {
                if (conv_params->use_jnt_comp_avg) {
                    do {
                        xy_y_convolve_4tap_16x2_avx2(im, s_256, ss_256, tt_256, coeffs_256, r);
                        jnt_2d_comp_avg_round_store_16x2_avx2(
                            r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 16;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    do {
                        xy_y_convolve_4tap_16x2_avx2(im, s_256, ss_256, tt_256, coeffs_256, r);
                        jnt_2d_avg_round_store_16x2_avx2(
                            r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                        im += 2 * 16;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                do {
                    xy_y_convolve_4tap_16x2_avx2(im, s_256, ss_256, tt_256, coeffs_256, r);
                    jnt_2d_no_avg_round_store_16x2_avx2(r, offset_no_avg_256, dst, dst_stride);
                    im += 2 * 16;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        }
    }
}

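// Vertical 6-tap pass. Widths 2 to 16 keep a sliding window of rows in
// registers; wider blocks fall through to the 32-wide column loop below.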
static void jnt_convolve_2d_ver_6tap_avx2(const int16_t *const im_block, const int32_t w,
                                          const int32_t h,
                                          const InterpFilterParams *const filter_params_y,
                                          const int32_t subpel_y_q4,
                                          const ConvolveParams *const conv_params, uint8_t *dst8,
                                          const int32_t dst8_stride) {
    const int32_t dst_stride = conv_params->dst_stride;
    const int32_t bd = 8;
    const int32_t round_0 = 3;
    const int16_t *im = im_block;
    const int32_t round_1 = COMPOUND_ROUND1_BITS;
    const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0; // 19
    const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1; // 4
    const int32_t round_offset = 1 << (offset_bits - round_1);
    const int32_t factor = conv_params->fwd_offset | (conv_params->bck_offset << 16);
    const int32_t offset_comp_avg = (round_offset + (round_offset >> 1)) * conv_params->bck_offset -
        (round_offset << DIST_PRECISION_BITS) - (round_offset << (DIST_PRECISION_BITS - 1)) +
        (1 << (round_bits + DIST_PRECISION_BITS - 1));
    const int32_t offset_avg = (1 << (round_1 - 1)) + (1 << (round_bits + round_1)) -
        (1 << offset_bits) - (1 << (offset_bits - 1));
    const int32_t offset_no_avg = (1 << (round_1 - 1)) + (1 << offset_bits) +
        (1 << (offset_bits - 1));
    ConvBufType *dst = conv_params->dst;

    if (w == 2) {
        const __m128i factor_128 = _mm_set1_epi32(factor);
        const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
        const __m128i offset_avg_128 = _mm_set1_epi32(offset_avg);
        const __m128i offset_no_avg_128 = _mm_set1_epi32(offset_no_avg);
        __m128i coeffs_128[3], s_32[6], ss_128[3];

        prepare_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);

        s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
        s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
        s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
        s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(im + 3 * 2));
        s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(im + 4 * 2));

        const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
        const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
        const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
        const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);

        ss_128[0] = _mm_unpacklo_epi16(src01, src12);
        ss_128[1] = _mm_unpacklo_epi16(src23, src34);

        int32_t y = h;

        if (conv_params->do_average) {
            if (conv_params->use_jnt_comp_avg) {
                do {
                    const __m128i res = xy_y_convolve_6tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
                    jnt_2d_comp_avg_round_store_2x2_sse2(
                        res, factor_128, offset_comp_avg_128, dst, dst_stride, dst8, dst8_stride);
                    im += 2 * 2;
                    dst += 2 * dst_stride;
                    dst8 += 2 * dst8_stride;
                    y -= 2;
                } while (y);
            } else {
                do {
                    const __m128i res = xy_y_convolve_6tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
                    jnt_2d_avg_round_store_2x2_sse2(
                        res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
                    im += 2 * 2;
                    dst += 2 * dst_stride;
                    dst8 += 2 * dst8_stride;
                    y -= 2;
                } while (y);
            }
        } else {
            do {
                const __m128i res = xy_y_convolve_6tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
                jnt_2d_no_avg_round_store_2x2_sse2(res, offset_no_avg_128, dst, dst_stride);
                im += 2 * 2;
                dst += 2 * dst_stride;
                y -= 2;
            } while (y);
        }
1645 } else {
1646 const __m256i factor_256 = _mm256_set1_epi32(factor);
1647 const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
1648 const __m256i offset_avg_256 = _mm256_set1_epi32(offset_avg);
1649 const __m256i offset_no_avg_256 = _mm256_set1_epi32(offset_no_avg);
1650 __m256i coeffs_256[3];
1651
1652 prepare_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
1653
1654 if (w == 4) {
1655 __m128i s_64[6];
1656 __m256i s_256[6], ss_256[3];
1657
1658 s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
1659 s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
1660 s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
1661 s_64[3] = _mm_loadl_epi64((__m128i *)(im + 3 * 4));
1662 s_64[4] = _mm_loadl_epi64((__m128i *)(im + 4 * 4));
1663
1664 // Load lines a and b. Line a to lower 128, line b to upper 128
1665 s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
1666 s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
1667 s_256[2] = _mm256_setr_m128i(s_64[2], s_64[3]);
1668 s_256[3] = _mm256_setr_m128i(s_64[3], s_64[4]);
1669
1670 ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1671 ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1672
1673 int32_t y = h;
1674
1675 if (conv_params->do_average) {
1676 if (conv_params->use_jnt_comp_avg) {
1677 do {
1678 const __m256i res = xy_y_convolve_6tap_4x2_avx2(
1679 im, s_64, ss_256, coeffs_256);
1680 jnt_2d_comp_avg_round_store_4x2_avx2(res,
1681 factor_256,
1682 offset_comp_avg_256,
1683 dst,
1684 dst_stride,
1685 dst8,
1686 dst8_stride);
1687 im += 2 * 4;
1688 dst += 2 * dst_stride;
1689 dst8 += 2 * dst8_stride;
1690 y -= 2;
1691 } while (y);
1692 } else {
1693 do {
1694 const __m256i res = xy_y_convolve_6tap_4x2_avx2(
1695 im, s_64, ss_256, coeffs_256);
1696 jnt_2d_avg_round_store_4x2_avx2(
1697 res, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
1698 im += 2 * 4;
1699 dst += 2 * dst_stride;
1700 dst8 += 2 * dst8_stride;
1701 y -= 2;
1702 } while (y);
1703 }
1704 } else {
1705 do {
1706 const __m256i res = xy_y_convolve_6tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
1707 jnt_2d_no_avg_round_store_4x2_avx2(res, offset_no_avg_256, dst, dst_stride);
1708 im += 2 * 4;
1709 dst += 2 * dst_stride;
1710 y -= 2;
1711 } while (y);
1712 }
1713 } else if (w == 8) {
1714 __m256i s_256[6], r[2];
1715
1716 s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
1717 s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
1718 s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 8));
1719 s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 8));
1720 int32_t y = h;
1721
1722 __m256i ss_256[6];
1723
1724 ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1725 ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1726
1727 ss_256[3] = _mm256_unpackhi_epi16(s_256[0], s_256[1]);
1728 ss_256[4] = _mm256_unpackhi_epi16(s_256[2], s_256[3]);
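// ss_256 slot layout for the 6-tap kernel: [0..2] hold the low 16-bit
// interleavings and [3..5] the high ones. Slots [2] and [5] are deliberately
// left unset here; xy_y_convolve_6tap_8x2_avx2 fills them from the two newest
// rows each iteration and then rotates the window down.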
1729
1730 if (conv_params->do_average) {
1731 if (conv_params->use_jnt_comp_avg) {
1732 do {
1733 xy_y_convolve_6tap_8x2_avx2(im, ss_256, coeffs_256, r);
1734 jnt_2d_comp_avg_round_store_8x2_avx2(
1735 r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
1736 im += 2 * 8;
1737 dst += 2 * dst_stride;
1738 dst8 += 2 * dst8_stride;
1739 y -= 2;
1740 } while (y);
1741 } else {
1742 do {
1743 xy_y_convolve_6tap_8x2_avx2(im, ss_256, coeffs_256, r);
1744 jnt_2d_avg_round_store_8x2_avx2(
1745 r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
1746 im += 2 * 8;
1747 dst += 2 * dst_stride;
1748 dst8 += 2 * dst8_stride;
1749 y -= 2;
1750 } while (y);
1751 }
1752 } else {
1753 do {
1754 xy_y_convolve_6tap_8x2_avx2(im, ss_256, coeffs_256, r);
1755 jnt_2d_no_avg_round_store_8x2_avx2(r, offset_no_avg_256, dst, dst_stride);
1756 im += 2 * 8;
1757 dst += 2 * dst_stride;
1758 y -= 2;
1759 } while (y);
1760 }
1761 } else if (w == 16) {
1762 __m256i s_256[6], ss_256[6], tt_256[6], r[4];
1763
1764 loadu_unpack_16bit_5rows_avx2(im, 16, s_256, ss_256, tt_256);
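// For the 16-wide path two unpacked pipelines are kept (a reading of the
// helper, not a spec): ss_256 interleaves row pairs (0,1),(2,3),... and
// tt_256 the shifted pairs (1,2),(3,4),..., one pipeline per output row of
// the 2-row iteration, so both results come from plain madds with no
// cross-lane shuffles.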
1765 int32_t y = h;
1766
1767 if (conv_params->do_average) {
1768 if (conv_params->use_jnt_comp_avg) {
1769 do {
1770 xy_y_convolve_6tap_16x2_avx2(im, 16, s_256, ss_256, tt_256, coeffs_256, r);
1771 jnt_2d_comp_avg_round_store_16x2_avx2(
1772 r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
1773 im += 2 * 16;
1774 dst += 2 * dst_stride;
1775 dst8 += 2 * dst8_stride;
1776 y -= 2;
1777 } while (y);
1778 } else {
1779 do {
1780 xy_y_convolve_6tap_16x2_avx2(im, 16, s_256, ss_256, tt_256, coeffs_256, r);
1781 jnt_2d_avg_round_store_16x2_avx2(
1782 r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
1783 im += 2 * 16;
1784 dst += 2 * dst_stride;
1785 dst8 += 2 * dst8_stride;
1786 y -= 2;
1787 } while (y);
1788 }
1789 } else {
1790 do {
1791 xy_y_convolve_6tap_16x2_avx2(im, 16, s_256, ss_256, tt_256, coeffs_256, r);
1792 jnt_2d_no_avg_round_store_16x2_avx2(r, offset_no_avg_256, dst, dst_stride);
1793 im += 2 * 16;
1794 dst += 2 * dst_stride;
1795 y -= 2;
1796 } while (y);
1797 }
1798 } else {
1799 __m256i s_256[2][6], ss_256[2][6], tt_256[2][6], r0[4], r1[4];
1800
1801 assert(!(w % 32));
1802
1803 int32_t x = 0;
1804 do {
1805 const int16_t *s = im + x;
1806 ConvBufType * d = dst + x;
1807 uint8_t * d8 = dst8 + x;
1808
1809 loadu_unpack_16bit_5rows_avx2(s, w, s_256[0], ss_256[0], tt_256[0]);
1810 loadu_unpack_16bit_5rows_avx2(s + 16, w, s_256[1], ss_256[1], tt_256[1]);
1811
1812 int32_t y = h;
1813
1814 if (conv_params->do_average) {
1815 if (conv_params->use_jnt_comp_avg) {
1816 do {
1817 xy_y_convolve_6tap_16x2_avx2(
1818 s, w, s_256[0], ss_256[0], tt_256[0], coeffs_256, r0);
1819 xy_y_convolve_6tap_16x2_avx2(
1820 s + 16, w, s_256[1], ss_256[1], tt_256[1], coeffs_256, r1);
1821 jnt_2d_comp_avg_round_store_32_avx2(
1822 r0 + 0, r1 + 0, factor_256, offset_comp_avg_256, d, d8);
1823 jnt_2d_comp_avg_round_store_32_avx2(r0 + 2,
1824 r1 + 2,
1825 factor_256,
1826 offset_comp_avg_256,
1827 d + dst_stride,
1828 d8 + dst8_stride);
1829 s += 2 * w;
1830 d += 2 * dst_stride;
1831 d8 += 2 * dst8_stride;
1832 y -= 2;
1833 } while (y);
1834 } else {
1835 do {
1836 xy_y_convolve_6tap_16x2_avx2(
1837 s, w, s_256[0], ss_256[0], tt_256[0], coeffs_256, r0);
1838 xy_y_convolve_6tap_16x2_avx2(
1839 s + 16, w, s_256[1], ss_256[1], tt_256[1], coeffs_256, r1);
1840 jnt_2d_avg_round_store_32_avx2(r0 + 0, r1 + 0, offset_avg_256, d, d8);
1841 jnt_2d_avg_round_store_32_avx2(
1842 r0 + 2, r1 + 2, offset_avg_256, d + dst_stride, d8 + dst8_stride);
1843 s += 2 * w;
1844 d += 2 * dst_stride;
1845 d8 += 2 * dst8_stride;
1846 y -= 2;
1847 } while (y);
1848 }
1849 } else {
1850 do {
1851 xy_y_convolve_6tap_16x2_avx2(
1852 s, w, s_256[0], ss_256[0], tt_256[0], coeffs_256, r0);
1853 xy_y_convolve_6tap_16x2_avx2(
1854 s + 16, w, s_256[1], ss_256[1], tt_256[1], coeffs_256, r1);
1855 jnt_2d_no_avg_round_store_32_avx2(r0 + 0, r1 + 0, offset_no_avg_256, d);
1856 jnt_2d_no_avg_round_store_32_avx2(
1857 r0 + 2, r1 + 2, offset_no_avg_256, d + dst_stride);
1858 s += 2 * w;
1859 d += 2 * dst_stride;
1860 y -= 2;
1861 } while (y);
1862 }
1863
1864 x += 32;
1865 } while (x < w);
1866 }
1867 }
1868 }
1869
1870 static void jnt_convolve_2d_ver_8tap_avx2(const int16_t *const im_block, const int32_t w,
1871 const int32_t h,
1872 const InterpFilterParams *const filter_params_y,
1873 const int32_t subpel_y_q4,
1874 const ConvolveParams *const conv_params, uint8_t *dst8,
1875 const int32_t dst8_stride) {
1876 const int32_t dst_stride = conv_params->dst_stride;
1877 const int32_t bd = 8;
1878 const int32_t round_0 = 3;
1879 const int16_t *im = im_block;
1880 const int32_t round_1 = COMPOUND_ROUND1_BITS;
1881 const int32_t offset_bits = bd + 2 * FILTER_BITS - round_0; // 19
1882 const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1; // 4
1883 const int32_t round_offset = 1 << (offset_bits - round_1);
1884 const int32_t factor = conv_params->fwd_offset | (conv_params->bck_offset << 16);
1885 const int32_t offset_comp_avg = (round_offset + (round_offset >> 1)) * conv_params->bck_offset -
1886 (round_offset << DIST_PRECISION_BITS) - (round_offset << (DIST_PRECISION_BITS - 1)) +
1887 (1 << (round_bits + DIST_PRECISION_BITS - 1));
1888 const int32_t offset_avg = (1 << (round_1 - 1)) + (1 << (round_bits + round_1)) -
1889 (1 << offset_bits) - (1 << (offset_bits - 1));
1890 const int32_t offset_no_avg = (1 << (round_1 - 1)) + (1 << offset_bits) +
1891 (1 << (offset_bits - 1));
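// These constants fold the compound rounding chain into one madd + shift per
// pixel: factor packs (fwd_offset, bck_offset) so _mm_madd_epi16 on
// interleaved (CONV_BUF, new) 16-bit values yields fwd*old + bck*new in a
// single instruction, while offset_comp_avg pre-combines the intermediate
// bias removal with the final rounding constant. A rough scalar sketch of
// the reference semantics being matched (following the AV1 reference
// convolve, with clip_pixel() and ROUND_POWER_OF_TWO() as in the spec code;
// a sketch, not this kernel's literal instruction sequence):
//
//   int32_t tmp = dst[x] * fwd_offset + res * bck_offset; // res: this pass
//   tmp >>= DIST_PRECISION_BITS;
//   tmp -= round_offset + (round_offset >> 1); // bias added when stored
//   dst8[x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
//
// Since fwd_offset + bck_offset == 1 << DIST_PRECISION_BITS, the bias that
// was baked into the CONV_BUF rows survives the weighted blend unchanged and
// can be removed by a constant subtraction.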
1892 ConvBufType *dst = conv_params->dst;
1893
1894 if (w == 2) {
1895 const __m128i factor_128 = _mm_set1_epi32(factor);
1896 const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
1897 const __m128i offset_avg_128 = _mm_set1_epi32(offset_avg);
1898 const __m128i offset_no_avg_128 = _mm_set1_epi32(offset_no_avg);
1899 __m128i coeffs_128[4], s_32[8], ss_128[4];
1900
1901 prepare_coeffs_8tap_sse2(filter_params_y, subpel_y_q4, coeffs_128);
1902
1903 s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(im + 0 * 2));
1904 s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(im + 1 * 2));
1905 s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(im + 2 * 2));
1906 s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(im + 3 * 2));
1907 s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(im + 4 * 2));
1908 s_32[5] = _mm_cvtsi32_si128(*(int32_t *)(im + 5 * 2));
1909 s_32[6] = _mm_cvtsi32_si128(*(int32_t *)(im + 6 * 2));
1910
1911 const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1912 const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
1913 const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1914 const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
1915 const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1916 const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
1917
1918 ss_128[0] = _mm_unpacklo_epi16(src01, src12);
1919 ss_128[1] = _mm_unpacklo_epi16(src23, src34);
1920 ss_128[2] = _mm_unpacklo_epi16(src45, src56);
1921
1922 int32_t y = h;
1923
1924 if (conv_params->do_average) {
1925 if (conv_params->use_jnt_comp_avg) {
1926 do {
1927 const __m128i res = xy_y_convolve_8tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
1928 jnt_2d_comp_avg_round_store_2x2_sse2(
1929 res, factor_128, offset_comp_avg_128, dst, dst_stride, dst8, dst8_stride);
1930 im += 2 * 2;
1931 dst += 2 * dst_stride;
1932 dst8 += 2 * dst8_stride;
1933 y -= 2;
1934 } while (y);
1935 } else {
1936 do {
1937 const __m128i res = xy_y_convolve_8tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
1938 jnt_2d_avg_round_store_2x2_sse2(
1939 res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
1940 im += 2 * 2;
1941 dst += 2 * dst_stride;
1942 dst8 += 2 * dst8_stride;
1943 y -= 2;
1944 } while (y);
1945 }
1946 } else {
1947 do {
1948 const __m128i res = xy_y_convolve_8tap_2x2_sse2(im, s_32, ss_128, coeffs_128);
1949 jnt_2d_no_avg_round_store_2x2_sse2(res, offset_no_avg_128, dst, dst_stride);
1950 im += 2 * 2;
1951 dst += 2 * dst_stride;
1952 y -= 2;
1953 } while (y);
1954 }
1955 } else {
1956 const __m256i factor_256 = _mm256_set1_epi32(factor);
1957 const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
1958 const __m256i offset_avg_256 = _mm256_set1_epi32(offset_avg);
1959 const __m256i offset_no_avg_256 = _mm256_set1_epi32(offset_no_avg);
1960 __m256i coeffs_256[4];
1961
1962 prepare_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
1963
1964 if (w == 4) {
1965 __m128i s_64[8];
1966 __m256i s_256[8], ss_256[4];
1967
1968 s_64[0] = _mm_loadl_epi64((__m128i *)(im + 0 * 4));
1969 s_64[1] = _mm_loadl_epi64((__m128i *)(im + 1 * 4));
1970 s_64[2] = _mm_loadl_epi64((__m128i *)(im + 2 * 4));
1971 s_64[3] = _mm_loadl_epi64((__m128i *)(im + 3 * 4));
1972 s_64[4] = _mm_loadl_epi64((__m128i *)(im + 4 * 4));
1973 s_64[5] = _mm_loadl_epi64((__m128i *)(im + 5 * 4));
1974 s_64[6] = _mm_loadl_epi64((__m128i *)(im + 6 * 4));
1975
1976 // Load lines a and b. Line a to lower 128, line b to upper 128
1977 s_256[0] = _mm256_setr_m128i(s_64[0], s_64[1]);
1978 s_256[1] = _mm256_setr_m128i(s_64[1], s_64[2]);
1979 s_256[2] = _mm256_setr_m128i(s_64[2], s_64[3]);
1980 s_256[3] = _mm256_setr_m128i(s_64[3], s_64[4]);
1981 s_256[4] = _mm256_setr_m128i(s_64[4], s_64[5]);
1982 s_256[5] = _mm256_setr_m128i(s_64[5], s_64[6]);
1983
1984 ss_256[0] = _mm256_unpacklo_epi16(s_256[0], s_256[1]);
1985 ss_256[1] = _mm256_unpacklo_epi16(s_256[2], s_256[3]);
1986 ss_256[2] = _mm256_unpacklo_epi16(s_256[4], s_256[5]);
1987
1988 int32_t y = h;
1989
1990 if (conv_params->do_average) {
1991 if (conv_params->use_jnt_comp_avg) {
1992 do {
1993 const __m256i res = xy_y_convolve_8tap_4x2_avx2(
1994 im, s_64, ss_256, coeffs_256);
1995 jnt_2d_comp_avg_round_store_4x2_avx2(res,
1996 factor_256,
1997 offset_comp_avg_256,
1998 dst,
1999 dst_stride,
2000 dst8,
2001 dst8_stride);
2002 im += 2 * 4;
2003 dst += 2 * dst_stride;
2004 dst8 += 2 * dst8_stride;
2005 y -= 2;
2006 } while (y);
2007 } else {
2008 do {
2009 const __m256i res = xy_y_convolve_8tap_4x2_avx2(
2010 im, s_64, ss_256, coeffs_256);
2011 jnt_2d_avg_round_store_4x2_avx2(
2012 res, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
2013 im += 2 * 4;
2014 dst += 2 * dst_stride;
2015 dst8 += 2 * dst8_stride;
2016 y -= 2;
2017 } while (y);
2018 }
2019 } else {
2020 do {
2021 const __m256i res = xy_y_convolve_8tap_4x2_avx2(im, s_64, ss_256, coeffs_256);
2022 jnt_2d_no_avg_round_store_4x2_avx2(res, offset_no_avg_256, dst, dst_stride);
2023 im += 2 * 4;
2024 dst += 2 * dst_stride;
2025 y -= 2;
2026 } while (y);
2027 }
2028 } else if (w == 8) {
2029 __m256i s_256[8], r[2];
2030
2031 s_256[0] = _mm256_loadu_si256((__m256i *)(im + 0 * 8));
2032 s_256[1] = _mm256_loadu_si256((__m256i *)(im + 1 * 8));
2033 s_256[2] = _mm256_loadu_si256((__m256i *)(im + 2 * 8));
2034 s_256[3] = _mm256_loadu_si256((__m256i *)(im + 3 * 8));
2035 s_256[4] = _mm256_loadu_si256((__m256i *)(im + 4 * 8));
2036 s_256[5] = _mm256_loadu_si256((__m256i *)(im + 5 * 8));
2037 int32_t y = h;
2038
2039 __m256i ss_256[8];
2040
2041 convolve_8tap_unapck_avx2(s_256, ss_256);
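// convolve_8tap_unapck_avx2 (spelling follows the helper's declaration in
// convolve_avx2.h) interleaves the six preloaded rows into the lo/hi madd
// operand pairs; the slot for the newest row pair is filled each iteration
// inside xy_y_convolve_8tap_8x2_avx2.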
2042
2043 if (conv_params->do_average) {
2044 if (conv_params->use_jnt_comp_avg) {
2045 do {
2046 xy_y_convolve_8tap_8x2_avx2(im, ss_256, coeffs_256, r);
2047 jnt_2d_comp_avg_round_store_8x2_avx2(
2048 r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
2049 im += 2 * 8;
2050 dst += 2 * dst_stride;
2051 dst8 += 2 * dst8_stride;
2052 y -= 2;
2053 } while (y);
2054 } else {
2055 do {
2056 xy_y_convolve_8tap_8x2_avx2(im, ss_256, coeffs_256, r);
2057 jnt_2d_avg_round_store_8x2_avx2(
2058 r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
2059 im += 2 * 8;
2060 dst += 2 * dst_stride;
2061 dst8 += 2 * dst8_stride;
2062 y -= 2;
2063 } while (y);
2064 }
2065 } else {
2066 do {
2067 xy_y_convolve_8tap_8x2_avx2(im, ss_256, coeffs_256, r);
2068 jnt_2d_no_avg_round_store_8x2_avx2(r, offset_no_avg_256, dst, dst_stride);
2069 im += 2 * 8;
2070 dst += 2 * dst_stride;
2071 y -= 2;
2072 } while (y);
2073 }
2074 } else if (w == 16) {
2075 __m256i s_256[8], ss_256[8], tt_256[8], r[4];
2076
2077 load_16bit_7rows_avx2(im, 16, s_256);
2078 int32_t y = h;
2079
2080 convolve_8tap_unapck_avx2(s_256, ss_256);
2081 convolve_8tap_unapck_avx2(s_256 + 1, tt_256);
2082
2083 if (conv_params->do_average) {
2084 if (conv_params->use_jnt_comp_avg) {
2085 do {
2086 xy_y_convolve_8tap_16x2_avx2(im, 16, coeffs_256, s_256, ss_256, tt_256, r);
2087 jnt_2d_comp_avg_round_store_16x2_avx2(
2088 r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
2089 im += 2 * 16;
2090 dst += 2 * dst_stride;
2091 dst8 += 2 * dst8_stride;
2092 y -= 2;
2093 } while (y);
2094 } else {
2095 do {
2096 xy_y_convolve_8tap_16x2_avx2(im, 16, coeffs_256, s_256, ss_256, tt_256, r);
2097 jnt_2d_avg_round_store_16x2_avx2(
2098 r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
2099 im += 2 * 16;
2100 dst += 2 * dst_stride;
2101 dst8 += 2 * dst8_stride;
2102 y -= 2;
2103 } while (y);
2104 }
2105 } else {
2106 do {
2107 xy_y_convolve_8tap_16x2_avx2(im, 16, coeffs_256, s_256, ss_256, tt_256, r);
2108 jnt_2d_no_avg_round_store_16x2_avx2(r, offset_no_avg_256, dst, dst_stride);
2109 im += 2 * 16;
2110 dst += 2 * dst_stride;
2111 y -= 2;
2112 } while (y);
2113 }
2114 } else {
2115 __m256i s_256[2][8], r0[4], r1[4], ss_256[2][8], tt_256[2][8];
2116
2117 assert(!(w % 32));
2118
2119 int32_t x = 0;
2120 do {
2121 const int16_t *s = im + x;
2122 ConvBufType * d = dst + x;
2123 uint8_t * d8 = dst8 + x;
2124
2125 load_16bit_7rows_avx2(s, w, s_256[0]);
2126 convolve_8tap_unapck_avx2(s_256[0], ss_256[0]);
2127 convolve_8tap_unapck_avx2(s_256[0] + 1, tt_256[0]);
2128
2129 load_16bit_7rows_avx2(s + 16, w, s_256[1]);
2130 convolve_8tap_unapck_avx2(s_256[1], ss_256[1]);
2131 convolve_8tap_unapck_avx2(s_256[1] + 1, tt_256[1]);
2132
2133 int32_t y = h;
2134
2135 if (conv_params->do_average) {
2136 if (conv_params->use_jnt_comp_avg) {
2137 do {
2138 xy_y_convolve_8tap_16x2_avx2(
2139 s, w, coeffs_256, s_256[0], ss_256[0], tt_256[0], r0);
2140 xy_y_convolve_8tap_16x2_avx2(
2141 s + 16, w, coeffs_256, s_256[1], ss_256[1], tt_256[1], r1);
2142 jnt_2d_comp_avg_round_store_32_avx2(
2143 r0 + 0, r1 + 0, factor_256, offset_comp_avg_256, d, d8);
2144 jnt_2d_comp_avg_round_store_32_avx2(r0 + 2,
2145 r1 + 2,
2146 factor_256,
2147 offset_comp_avg_256,
2148 d + dst_stride,
2149 d8 + dst8_stride);
2150 s += 2 * w;
2151 d += 2 * dst_stride;
2152 d8 += 2 * dst8_stride;
2153 y -= 2;
2154 } while (y);
2155 } else {
2156 do {
2157 xy_y_convolve_8tap_16x2_avx2(
2158 s, w, coeffs_256, s_256[0], ss_256[0], tt_256[0], r0);
2159 xy_y_convolve_8tap_16x2_avx2(
2160 s + 16, w, coeffs_256, s_256[1], ss_256[1], tt_256[1], r1);
2161 jnt_2d_avg_round_store_32_avx2(r0 + 0, r1 + 0, offset_avg_256, d, d8);
2162 jnt_2d_avg_round_store_32_avx2(
2163 r0 + 2, r1 + 2, offset_avg_256, d + dst_stride, d8 + dst8_stride);
2164 s += 2 * w;
2165 d += 2 * dst_stride;
2166 d8 += 2 * dst8_stride;
2167 y -= 2;
2168 } while (y);
2169 }
2170 } else {
2171 do {
2172 xy_y_convolve_8tap_16x2_avx2(
2173 s, w, coeffs_256, s_256[0], ss_256[0], tt_256[0], r0);
2174 xy_y_convolve_8tap_16x2_avx2(
2175 s + 16, w, coeffs_256, s_256[1], ss_256[1], tt_256[1], r1);
2176 jnt_2d_no_avg_round_store_32_avx2(r0 + 0, r1 + 0, offset_no_avg_256, d);
2177 jnt_2d_no_avg_round_store_32_avx2(
2178 r0 + 2, r1 + 2, offset_no_avg_256, d + dst_stride);
2179 s += 2 * w;
2180 d += 2 * dst_stride;
2181 y -= 2;
2182 } while (y);
2183 }
2184
2185 x += 32;
2186 } while (x < w);
2187 }
2188 }
2189 }
2190
2191 typedef void (*JntConvolve2dHorTapFunc)(const uint8_t *src, const int32_t src_stride,
2192 const int32_t w, const int32_t h,
2193 const InterpFilterParams *filter_params_x,
2194 const int32_t subpel_x_q4, int16_t *const im_block);
2195
2196 typedef void (*JntConvolve2dVerTapFunc)(const int16_t *const im_block, const int32_t w,
2197 const int32_t h,
2198 const InterpFilterParams *const filter_params_y,
2199 const int32_t subpel_y_q4,
2200 const ConvolveParams *const conv_params, uint8_t *dst8,
2201 const int32_t dst8_stride);
2202
2203 void svt_av1_jnt_convolve_2d_avx2(const uint8_t *src, int32_t src_stride, uint8_t *dst8,
2204 int32_t dst8_stride, int32_t w, int32_t h,
2205 InterpFilterParams *filter_params_x,
2206 InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
2207 const int32_t subpel_y_q4, ConvolveParams *conv_params) {
2208 static const JntConvolve2dHorTapFunc jnt_convolve_2d_hor_tap_func_table[MAX_FILTER_TAP + 1] = {
2209 NULL,
2210 NULL,
2211 jnt_convolve_2d_hor_2tap_avx2,
2212 NULL,
2213 jnt_convolve_2d_hor_4tap_avx2,
2214 NULL,
2215 jnt_convolve_2d_hor_6tap_avx2,
2216 NULL,
2217 jnt_convolve_2d_hor_8tap_avx2};
2218 static const JntConvolve2dVerTapFunc jnt_convolve_2d_ver_tap_func_table[MAX_FILTER_TAP + 1] = {
2219 NULL,
2220 jnt_convolve_2d_ver_2tap_half_avx2,
2221 jnt_convolve_2d_ver_2tap_avx2,
2222 jnt_convolve_2d_ver_4tap_avx2,
2223 jnt_convolve_2d_ver_4tap_avx2,
2224 jnt_convolve_2d_ver_6tap_avx2,
2225 jnt_convolve_2d_ver_6tap_avx2,
2226 jnt_convolve_2d_ver_8tap_avx2,
2227 jnt_convolve_2d_ver_8tap_avx2};
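// Both tables are indexed by tap count. The vertical table duplicates each
// even-tap kernel at the preceding odd index so that the half-pel adjustment
// below (tap_y - (subpel_y_q4 == 8)) routes the 2-tap half-pel case to the
// dedicated averaging kernel while leaving every other tap count unchanged.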
2228 const int32_t tap_x = get_convolve_tap(filter_params_x->filter_ptr);
2229 const int32_t tap_y = get_convolve_tap(filter_params_y->filter_ptr);
2230 const uint8_t *src_ptr = src + ((MAX_FILTER_TAP - tap_y) / 2 - 3) * src_stride;
2231 // Note: im_block is stored 8-pixel interleaved for widths 32 and up, to
2232 // avoid data permutation between the horizontal and vertical passes.
2233 DECLARE_ALIGNED(64, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
2234
2235 assert(conv_params->round_0 == 3);
2236 assert(conv_params->round_1 == COMPOUND_ROUND1_BITS);
2237
2238 // horizontal filter
2239
2240 // The vertical pass consumes h + tap_y - 1 intermediate rows; widths below
2241 // 32 are filtered two rows per loop iteration, so compute one extra row for them.
2242 const int32_t hh = h + tap_y - (w >= 32);
2243
2244 jnt_convolve_2d_hor_tap_func_table[tap_x](
2245 src_ptr, src_stride, w, hh, filter_params_x, subpel_x_q4, im_block);
2246
2247 // vertical filter
2248 jnt_convolve_2d_ver_tap_func_table[tap_y - (subpel_y_q4 == 8)](
2249 im_block, w, h, filter_params_y, subpel_y_q4, conv_params, dst8, dst8_stride);
2250 }
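#if 0
// Illustrative only: a minimal sketch of the two-pass compound call pattern
// this dispatcher serves. Everything here beyond the fields already used
// above (do_average, use_jnt_comp_avg, dst, dst_stride, fwd_offset,
// bck_offset) is an assumption for the example, not SVT-AV1's actual call
// site.
static void jnt_predict_pair_sketch(const uint8_t *ref0, const uint8_t *ref1,
                                    int32_t src_stride, uint8_t *dst8,
                                    int32_t dst8_stride, int32_t w, int32_t h,
                                    InterpFilterParams *fp_x, InterpFilterParams *fp_y,
                                    int32_t subpel_x_q4, int32_t subpel_y_q4,
                                    ConvolveParams *cp) {
    // Pass 1: filter the first reference into the CONV_BUF intermediate.
    cp->do_average = 0;
    svt_av1_jnt_convolve_2d_avx2(ref0, src_stride, dst8, dst8_stride, w, h,
                                 fp_x, fp_y, subpel_x_q4, subpel_y_q4, cp);
    // Pass 2: filter the second reference and blend it against the stored
    // rows, distance-weighted when use_jnt_comp_avg is set.
    cp->do_average = 1;
    svt_av1_jnt_convolve_2d_avx2(ref1, src_stride, dst8, dst8_stride, w, h,
                                 fp_x, fp_y, subpel_x_q4, subpel_y_q4, cp);
}
#endif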
2251