1 /*
2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
10 */
11
12 #include <immintrin.h>
13 #include "common_dsp_rtcd.h"
14 #include "convolve.h"
15 #include "convolve_avx2.h"
16 #include "EbDefinitions.h"
17 #include "EbMemory_SSE4_1.h"
18
jnt_y_comp_avg_2tap_32_avx2(const uint8_t * const src,const __m256i * const coeffs,const __m256i factor,const __m256i offset,const __m256i s0,__m256i * const s1,ConvBufType * const dst,uint8_t * const dst8)19 SIMD_INLINE void jnt_y_comp_avg_2tap_32_avx2(const uint8_t *const src, const __m256i *const coeffs,
20 const __m256i factor, const __m256i offset,
21 const __m256i s0, __m256i *const s1,
22 ConvBufType *const dst, uint8_t *const dst8) {
23 __m256i r[2];
24
25 y_convolve_2tap_32_avx2(src, coeffs, s0, s1, r);
26 jnt_comp_avg_round_store_32_avx2(r, factor, offset, dst, dst8);
27 }
28
jnt_y_avg_2tap_32_avx2(const uint8_t * const src,const __m256i * const coeffs,const __m256i offset,const __m256i s0,__m256i * const s1,const ConvBufType * const dst,uint8_t * const dst8)29 static INLINE void jnt_y_avg_2tap_32_avx2(const uint8_t *const src, const __m256i *const coeffs,
30 const __m256i offset, const __m256i s0, __m256i *const s1,
31 const ConvBufType *const dst, uint8_t *const dst8) {
32 __m256i r[2];
33
34 y_convolve_2tap_32_avx2(src, coeffs, s0, s1, r);
35 jnt_avg_round_store_32_avx2(r, offset, dst, dst8);
36 }
37
jnt_y_no_avg_2tap_32_avx2(const uint8_t * const src,const __m256i * const coeffs,const __m256i offset,const __m256i s0,__m256i * const s1,ConvBufType * const dst)38 static INLINE void jnt_y_no_avg_2tap_32_avx2(const uint8_t *const src, const __m256i *const coeffs,
39 const __m256i offset, const __m256i s0,
40 __m256i *const s1, ConvBufType *const dst) {
41 __m256i r[2];
42
43 y_convolve_2tap_32_avx2(src, coeffs, s0, s1, r);
44 jnt_no_avg_round_store_32_avx2(r, offset, dst);
45 }
46
/*
 * Vertical (y-direction) 2-tap subpel convolution for jnt/compound
 * prediction, AVX2, 8-bit source (bd = 8).
 *
 * Dispatches on conv_params into three store paths:
 *   - do_average && use_jnt_comp_avg: distance-weighted compound average
 *     using fwd_offset/bck_offset packed into `factor`;
 *   - do_average only: plain average with the ConvBufType buffer;
 *   - otherwise: store 16-bit intermediates into conv_params->dst.
 * Within each path, the width w selects a specialized kernel:
 * w <= 4 uses 128-bit SSE code, w = 8/16/32/64/128 uses 256-bit AVX2 code.
 *
 * Every loop iteration produces two output rows (y -= 2), so h must be
 * even.  For w >= 32, source rows are kept in s_256 and ping-ponged
 * between index [0] and index [1] so each row is loaded only once.
 */
static void jnt_convolve_y_2tap_avx2(const uint8_t *const src, const int32_t src_stride,
                                     uint8_t *dst8, const int32_t dst8_stride, const int32_t w,
                                     const int32_t h,
                                     const InterpFilterParams *const filter_params_y,
                                     const int32_t subpel_y_q4,
                                     const ConvolveParams *const conv_params) {
    const uint8_t *src_ptr = src;
    const int32_t dst_stride = conv_params->dst_stride;
    const int32_t round_0 = 3;
    const int32_t round_1 = COMPOUND_ROUND1_BITS;
    const int32_t bits = FILTER_BITS - round_0;
    const int32_t bd = 8;
    // Precision bookkeeping for the compound rounding offsets below.
    const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1;
    const int32_t offset_bits = bd + round_bits;
    const int32_t round_offset = (1 << offset_bits) + (1 << (offset_bits - 1));
    ConvBufType * dst = conv_params->dst;
    int32_t y = h;
    __m128i coeffs_128[4];  // filter taps for the 128-bit (w <= 4) kernels
    __m256i coeffs_256[4];  // filter taps for the 256-bit (w >= 8) kernels

    if (conv_params->do_average) {
        if (conv_params->use_jnt_comp_avg) {
            // Distance-weighted compound path: pack both weights into one
            // 32-bit lane (fwd in low 16 bits, bck in high 16 bits).
            const int32_t factor = conv_params->fwd_offset | (conv_params->bck_offset << 16);
            const int32_t offset_comp_avg = round_offset * conv_params->bck_offset +
                (1 << (round_bits + DIST_PRECISION_BITS - 1)) -
                (round_offset << DIST_PRECISION_BITS);

            if (w <= 4) {
                const __m128i factor_128 = _mm_set1_epi32(factor);
                const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);

                prepare_half_coeffs_2tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);

                if (w == 2) {
                    __m128i s_16[2];

                    // Preload row 0 (2 pixels = 16 bits).
                    s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);

                    do {
                        const __m128i res = y_convolve_2tap_2x2_ssse3(
                            src_ptr, src_stride, coeffs_128, s_16);
                        jnt_comp_avg_round_store_2x2_sse2(res,
                                                          factor_128,
                                                          offset_comp_avg_128,
                                                          dst,
                                                          dst_stride,
                                                          dst8,
                                                          dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    __m128i s_32[2];

                    assert(w == 4);

                    // Preload row 0 (4 pixels = 32 bits).
                    s_32[0] = _mm_cvtsi32_si128(*(int32_t *)src_ptr);

                    do {
                        const __m128i res = y_convolve_2tap_4x2_ssse3(
                            src_ptr, src_stride, coeffs_128, s_32);
                        jnt_comp_avg_round_store_4x2_sse2(res,
                                                          factor_128,
                                                          offset_comp_avg_128,
                                                          dst,
                                                          dst_stride,
                                                          dst8,
                                                          dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                const __m256i factor_256 = _mm256_set1_epi32(factor);
                const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);

                prepare_half_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);

                if (w == 8) {
                    __m128i s_64[2];

                    // Preload row 0 (8 pixels = 64 bits).
                    s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);

                    do {
                        const __m256i res = y_convolve_2tap_8x2_avx2(
                            src_ptr, src_stride, coeffs_256, s_64);
                        jnt_comp_avg_round_store_8x2_avx2(res,
                                                          factor_256,
                                                          offset_comp_avg_256,
                                                          dst,
                                                          dst_stride,
                                                          dst8,
                                                          dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else if (w == 16) {
                    __m128i s_128[2];
                    __m256i r[2];

                    // Preload row 0 (16 pixels = 128 bits).
                    s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);

                    do {
                        y_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128, r);
                        jnt_comp_avg_round_store_16x2_avx2(
                            r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else if (w == 32) {
                    // s_256[0]/s_256[1] alternate roles each half-iteration so
                    // each source row is loaded exactly once.
                    __m256i s_256[2];

                    s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);

                    do {
                        jnt_y_comp_avg_2tap_32_avx2(src_ptr + src_stride,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    s_256[0],
                                                    &s_256[1],
                                                    dst,
                                                    dst8);
                        jnt_y_comp_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    s_256[1],
                                                    &s_256[0],
                                                    dst + dst_stride,
                                                    dst8 + dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else if (w == 64) {
                    // Two 32-wide columns per row; first index ping-pongs rows,
                    // second index selects the 32-pixel column.
                    __m256i s_256[2][2];

                    s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
                    s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));

                    do {
                        jnt_y_comp_avg_2tap_32_avx2(src_ptr + src_stride,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    s_256[0][0],
                                                    &s_256[1][0],
                                                    dst,
                                                    dst8);
                        jnt_y_comp_avg_2tap_32_avx2(src_ptr + src_stride + 32,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    s_256[0][1],
                                                    &s_256[1][1],
                                                    dst + 32,
                                                    dst8 + 32);
                        jnt_y_comp_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    s_256[1][0],
                                                    &s_256[0][0],
                                                    dst + dst_stride,
                                                    dst8 + dst8_stride);
                        jnt_y_comp_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 32,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    s_256[1][1],
                                                    &s_256[0][1],
                                                    dst + dst_stride + 32,
                                                    dst8 + dst8_stride + 32);

                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    // Four 32-wide columns per row (w == 128).
                    __m256i s_256[2][4];

                    assert(w == 128);

                    s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
                    s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
                    s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
                    s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));

                    do {
                        jnt_y_comp_avg_2tap_32_avx2(src_ptr + src_stride,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    s_256[0][0],
                                                    &s_256[1][0],
                                                    dst,
                                                    dst8);
                        jnt_y_comp_avg_2tap_32_avx2(src_ptr + src_stride + 1 * 32,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    s_256[0][1],
                                                    &s_256[1][1],
                                                    dst + 1 * 32,
                                                    dst8 + 1 * 32);
                        jnt_y_comp_avg_2tap_32_avx2(src_ptr + src_stride + 2 * 32,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    s_256[0][2],
                                                    &s_256[1][2],
                                                    dst + 2 * 32,
                                                    dst8 + 2 * 32);
                        jnt_y_comp_avg_2tap_32_avx2(src_ptr + src_stride + 3 * 32,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    s_256[0][3],
                                                    &s_256[1][3],
                                                    dst + 3 * 32,
                                                    dst8 + 3 * 32);
                        jnt_y_comp_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    s_256[1][0],
                                                    &s_256[0][0],
                                                    dst + dst_stride,
                                                    dst8 + dst8_stride);
                        jnt_y_comp_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 1 * 32,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    s_256[1][1],
                                                    &s_256[0][1],
                                                    dst + dst_stride + 1 * 32,
                                                    dst8 + dst8_stride + 1 * 32);
                        jnt_y_comp_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 2 * 32,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    s_256[1][2],
                                                    &s_256[0][2],
                                                    dst + dst_stride + 2 * 32,
                                                    dst8 + dst8_stride + 2 * 32);
                        jnt_y_comp_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 3 * 32,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    s_256[1][3],
                                                    &s_256[0][3],
                                                    dst + dst_stride + 3 * 32,
                                                    dst8 + dst8_stride + 3 * 32);

                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            }
        } else {
            // Plain (equal-weight) compound average path.
            const int16_t offset_avg = (1 << (FILTER_BITS - 1)) + (1 << (round_1 - bits - 2)) -
                (round_offset << (round_1 - bits - 1));

            if (w <= 4) {
                const __m128i offset_avg_128 = _mm_set1_epi16(offset_avg);

                prepare_half_coeffs_2tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);

                if (w == 2) {
                    __m128i s_16[2];

                    s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);

                    do {
                        const __m128i res = y_convolve_2tap_2x2_ssse3(
                            src_ptr, src_stride, coeffs_128, s_16);
                        jnt_avg_round_store_2x2_sse2(
                            res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    __m128i s_32[2];

                    assert(w == 4);

                    s_32[0] = _mm_cvtsi32_si128(*(int32_t *)src_ptr);

                    do {
                        const __m128i res = y_convolve_2tap_4x2_ssse3(
                            src_ptr, src_stride, coeffs_128, s_32);
                        jnt_avg_round_store_4x2_sse2(
                            res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);

                prepare_half_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);

                if (w == 8) {
                    __m128i s_64[2];

                    s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);

                    do {
                        const __m256i res = y_convolve_2tap_8x2_avx2(
                            src_ptr, src_stride, coeffs_256, s_64);
                        jnt_avg_round_store_8x2_avx2(
                            res, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else if (w == 16) {
                    __m128i s_128[2];
                    __m256i r[2];

                    s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);

                    do {
                        y_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128, r);
                        jnt_avg_round_store_16x2_avx2(
                            r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else if (w == 32) {
                    // Row ping-pong as in the comp-avg w == 32 case.
                    __m256i s_256[2];

                    s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);

                    do {
                        jnt_y_avg_2tap_32_avx2(src_ptr + src_stride,
                                               coeffs_256,
                                               offset_avg_256,
                                               s_256[0],
                                               &s_256[1],
                                               dst,
                                               dst8);
                        jnt_y_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
                                               coeffs_256,
                                               offset_avg_256,
                                               s_256[1],
                                               &s_256[0],
                                               dst + dst_stride,
                                               dst8 + dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else if (w == 64) {
                    __m256i s_256[2][2];

                    s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
                    s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));

                    do {
                        jnt_y_avg_2tap_32_avx2(src_ptr + src_stride,
                                               coeffs_256,
                                               offset_avg_256,
                                               s_256[0][0],
                                               &s_256[1][0],
                                               dst,
                                               dst8);
                        jnt_y_avg_2tap_32_avx2(src_ptr + src_stride + 32,
                                               coeffs_256,
                                               offset_avg_256,
                                               s_256[0][1],
                                               &s_256[1][1],
                                               dst + 32,
                                               dst8 + 32);
                        jnt_y_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
                                               coeffs_256,
                                               offset_avg_256,
                                               s_256[1][0],
                                               &s_256[0][0],
                                               dst + dst_stride,
                                               dst8 + dst8_stride);
                        jnt_y_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 32,
                                               coeffs_256,
                                               offset_avg_256,
                                               s_256[1][1],
                                               &s_256[0][1],
                                               dst + dst_stride + 32,
                                               dst8 + dst8_stride + 32);

                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    __m256i s_256[2][4];

                    assert(w == 128);

                    s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
                    s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
                    s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
                    s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));

                    do {
                        jnt_y_avg_2tap_32_avx2(src_ptr + src_stride,
                                               coeffs_256,
                                               offset_avg_256,
                                               s_256[0][0],
                                               &s_256[1][0],
                                               dst,
                                               dst8);
                        jnt_y_avg_2tap_32_avx2(src_ptr + src_stride + 1 * 32,
                                               coeffs_256,
                                               offset_avg_256,
                                               s_256[0][1],
                                               &s_256[1][1],
                                               dst + 1 * 32,
                                               dst8 + 1 * 32);
                        jnt_y_avg_2tap_32_avx2(src_ptr + src_stride + 2 * 32,
                                               coeffs_256,
                                               offset_avg_256,
                                               s_256[0][2],
                                               &s_256[1][2],
                                               dst + 2 * 32,
                                               dst8 + 2 * 32);
                        jnt_y_avg_2tap_32_avx2(src_ptr + src_stride + 3 * 32,
                                               coeffs_256,
                                               offset_avg_256,
                                               s_256[0][3],
                                               &s_256[1][3],
                                               dst + 3 * 32,
                                               dst8 + 3 * 32);
                        jnt_y_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
                                               coeffs_256,
                                               offset_avg_256,
                                               s_256[1][0],
                                               &s_256[0][0],
                                               dst + dst_stride,
                                               dst8 + dst8_stride);
                        jnt_y_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 1 * 32,
                                               coeffs_256,
                                               offset_avg_256,
                                               s_256[1][1],
                                               &s_256[0][1],
                                               dst + dst_stride + 1 * 32,
                                               dst8 + dst8_stride + 1 * 32);
                        jnt_y_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 2 * 32,
                                               coeffs_256,
                                               offset_avg_256,
                                               s_256[1][2],
                                               &s_256[0][2],
                                               dst + dst_stride + 2 * 32,
                                               dst8 + dst8_stride + 2 * 32);
                        jnt_y_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 3 * 32,
                                               coeffs_256,
                                               offset_avg_256,
                                               s_256[1][3],
                                               &s_256[0][3],
                                               dst + dst_stride + 3 * 32,
                                               dst8 + dst8_stride + 3 * 32);

                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            }
        }
    } else {
        // No-average path: only 16-bit intermediates are written to dst;
        // dst8 is not touched here.
        const int16_t offset_no_avg = (round_offset << (round_1 - bits - 1)) +
            (1 << (round_1 - bits - 2));

        if (w <= 4) {
            const __m128i offset_no_avg_128 = _mm_set1_epi16(offset_no_avg);

            prepare_half_coeffs_2tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);

            if (w == 2) {
                __m128i s_16[2];

                s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);

                do {
                    const __m128i res = y_convolve_2tap_2x2_ssse3(
                        src_ptr, src_stride, coeffs_128, s_16);
                    jnt_no_avg_round_store_2x2_sse2(res, offset_no_avg_128, dst, dst_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            } else {
                __m128i s_32[2];

                assert(w == 4);

                s_32[0] = _mm_cvtsi32_si128(*(int32_t *)src_ptr);

                do {
                    const __m128i res = y_convolve_2tap_4x2_ssse3(
                        src_ptr, src_stride, coeffs_128, s_32);
                    jnt_no_avg_round_store_4x2_sse2(res, offset_no_avg_128, dst, dst_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        } else {
            const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);

            prepare_half_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);

            if (w == 8) {
                __m128i s_64[2];

                s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);

                do {
                    const __m256i res = y_convolve_2tap_8x2_avx2(
                        src_ptr, src_stride, coeffs_256, s_64);
                    jnt_no_avg_round_store_8x2_avx2(res, offset_no_avg_256, dst, dst_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            } else if (w == 16) {
                __m128i s_128[2];
                __m256i r[2];

                s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);

                do {
                    y_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128, r);
                    jnt_no_avg_round_store_16x2_avx2(r, offset_no_avg_256, dst, dst_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            } else if (w == 32) {
                // Row ping-pong as in the averaging w == 32 cases.
                __m256i s_256[2];

                s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);

                do {
                    jnt_y_no_avg_2tap_32_avx2(src_ptr + src_stride,
                                              coeffs_256,
                                              offset_no_avg_256,
                                              s_256[0],
                                              &s_256[1],
                                              dst);
                    jnt_y_no_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
                                              coeffs_256,
                                              offset_no_avg_256,
                                              s_256[1],
                                              &s_256[0],
                                              dst + dst_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            } else if (w == 64) {
                __m256i s_256[2][2];

                s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
                s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));

                do {
                    jnt_y_no_avg_2tap_32_avx2(src_ptr + src_stride,
                                              coeffs_256,
                                              offset_no_avg_256,
                                              s_256[0][0],
                                              &s_256[1][0],
                                              dst);
                    jnt_y_no_avg_2tap_32_avx2(src_ptr + src_stride + 32,
                                              coeffs_256,
                                              offset_no_avg_256,
                                              s_256[0][1],
                                              &s_256[1][1],
                                              dst + 32);
                    jnt_y_no_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
                                              coeffs_256,
                                              offset_no_avg_256,
                                              s_256[1][0],
                                              &s_256[0][0],
                                              dst + dst_stride);
                    jnt_y_no_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 32,
                                              coeffs_256,
                                              offset_no_avg_256,
                                              s_256[1][1],
                                              &s_256[0][1],
                                              dst + dst_stride + 32);

                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            } else {
                __m256i s_256[2][4];

                assert(w == 128);

                s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
                s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
                s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
                s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));

                do {
                    jnt_y_no_avg_2tap_32_avx2(src_ptr + src_stride,
                                              coeffs_256,
                                              offset_no_avg_256,
                                              s_256[0][0],
                                              &s_256[1][0],
                                              dst);
                    jnt_y_no_avg_2tap_32_avx2(src_ptr + src_stride + 1 * 32,
                                              coeffs_256,
                                              offset_no_avg_256,
                                              s_256[0][1],
                                              &s_256[1][1],
                                              dst + 1 * 32);
                    jnt_y_no_avg_2tap_32_avx2(src_ptr + src_stride + 2 * 32,
                                              coeffs_256,
                                              offset_no_avg_256,
                                              s_256[0][2],
                                              &s_256[1][2],
                                              dst + 2 * 32);
                    jnt_y_no_avg_2tap_32_avx2(src_ptr + src_stride + 3 * 32,
                                              coeffs_256,
                                              offset_no_avg_256,
                                              s_256[0][3],
                                              &s_256[1][3],
                                              dst + 3 * 32);
                    jnt_y_no_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
                                              coeffs_256,
                                              offset_no_avg_256,
                                              s_256[1][0],
                                              &s_256[0][0],
                                              dst + dst_stride);
                    jnt_y_no_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 1 * 32,
                                              coeffs_256,
                                              offset_no_avg_256,
                                              s_256[1][1],
                                              &s_256[0][1],
                                              dst + dst_stride + 1 * 32);
                    jnt_y_no_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 2 * 32,
                                              coeffs_256,
                                              offset_no_avg_256,
                                              s_256[1][2],
                                              &s_256[0][2],
                                              dst + dst_stride + 2 * 32);
                    jnt_y_no_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 3 * 32,
                                              coeffs_256,
                                              offset_no_avg_256,
                                              s_256[1][3],
                                              &s_256[0][3],
                                              dst + dst_stride + 3 * 32);

                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        }
    }
}
733
jnt_convolve_y_4tap_avx2(const uint8_t * const src,const int32_t src_stride,uint8_t * dst8,const int32_t dst8_stride,const int32_t w,const int32_t h,const InterpFilterParams * const filter_params_y,const int32_t subpel_y_q4,const ConvolveParams * const conv_params)734 void jnt_convolve_y_4tap_avx2(const uint8_t *const src, const int32_t src_stride, uint8_t *dst8,
735 const int32_t dst8_stride, const int32_t w, const int32_t h,
736 const InterpFilterParams *const filter_params_y,
737 const int32_t subpel_y_q4, const ConvolveParams *const conv_params) {
738 const uint8_t *src_ptr = src - src_stride;
739 const int32_t dst_stride = conv_params->dst_stride;
740 const int32_t round_0 = 3;
741 const int32_t round_1 = COMPOUND_ROUND1_BITS;
742 const int32_t bits = FILTER_BITS - round_0;
743 const int32_t bd = 8;
744 const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1;
745 const int32_t offset_bits = bd + round_bits;
746 const int32_t round_offset = (1 << offset_bits) + (1 << (offset_bits - 1));
747 ConvBufType * dst = conv_params->dst;
748 int32_t y = h;
749 __m128i coeffs_128[4];
750 __m256i coeffs_256[4];
751
752 if (conv_params->do_average) {
753 if (conv_params->use_jnt_comp_avg) {
754 const int32_t factor = conv_params->fwd_offset | (conv_params->bck_offset << 16);
755 const int32_t offset_comp_avg = round_offset * conv_params->bck_offset +
756 (1 << (round_bits + DIST_PRECISION_BITS - 1)) -
757 (round_offset << DIST_PRECISION_BITS);
758
759 if (w <= 4) {
760 const __m128i factor_128 = _mm_set1_epi32(factor);
761 const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
762
763 prepare_half_coeffs_4tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
764
765 if (w == 2) {
766 __m128i s_16[4], ss_128[2];
767
768 s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
769 s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
770 s_16[2] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
771
772 const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
773 const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
774
775 ss_128[0] = _mm_unpacklo_epi8(src01, src12);
776
777 do {
778 src_ptr += 2 * src_stride;
779 const __m128i res = y_convolve_4tap_2x2_ssse3(
780 src_ptr, src_stride, coeffs_128, s_16, ss_128);
781 jnt_comp_avg_round_store_2x2_sse2(res,
782 factor_128,
783 offset_comp_avg_128,
784 dst,
785 dst_stride,
786 dst8,
787 dst8_stride);
788 ss_128[0] = ss_128[1];
789 dst += 2 * dst_stride;
790 dst8 += 2 * dst8_stride;
791 y -= 2;
792 } while (y);
793 } else {
794 __m128i s_32[4], ss_128[2];
795
796 assert(w == 4);
797
798 s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
799 s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
800 s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
801
802 const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
803 const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
804
805 ss_128[0] = _mm_unpacklo_epi8(src01, src12);
806
807 do {
808 src_ptr += 2 * src_stride;
809 const __m128i res = y_convolve_4tap_4x2_ssse3(
810 src_ptr, src_stride, coeffs_128, s_32, ss_128);
811 jnt_comp_avg_round_store_4x2_sse2(res,
812 factor_128,
813 offset_comp_avg_128,
814 dst,
815 dst_stride,
816 dst8,
817 dst8_stride);
818 ss_128[0] = ss_128[1];
819 dst += 2 * dst_stride;
820 dst8 += 2 * dst8_stride;
821 y -= 2;
822 } while (y);
823 }
824 } else {
825 const __m256i factor_256 = _mm256_set1_epi32(factor);
826 const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
827
828 prepare_half_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
829
830 if (w == 8) {
831 __m128i s_64[4];
832 __m256i ss_256[2];
833
834 s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
835 s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
836 s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
837
838 // Load lines a and b. Line a to lower 128, line b to upper
839 // 128
840 const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
841 const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
842
843 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
844
845 do {
846 src_ptr += 2 * src_stride;
847 const __m256i res = y_convolve_4tap_8x2_avx2(
848 src_ptr, src_stride, coeffs_256, s_64, ss_256);
849 jnt_comp_avg_round_store_8x2_avx2(res,
850 factor_256,
851 offset_comp_avg_256,
852 dst,
853 dst_stride,
854 dst8,
855 dst8_stride);
856 ss_256[0] = ss_256[1];
857 dst += 2 * dst_stride;
858 dst8 += 2 * dst8_stride;
859 y -= 2;
860 } while (y);
861 } else {
862 __m128i s_128[4];
863 __m256i ss_256[4], r[2];
864
865 assert(w == 16);
866
867 s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
868 s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
869 s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
870
871 // Load lines a and b. Line a to lower 128, line b to upper
872 // 128
873 const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
874 const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
875
876 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
877 ss_256[2] = _mm256_unpackhi_epi8(src01, src12);
878
879 do {
880 src_ptr += 2 * src_stride;
881 y_convolve_4tap_16x2_avx2(
882 src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
883 jnt_comp_avg_round_store_16x2_avx2(
884 r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
885 ss_256[0] = ss_256[1];
886 ss_256[2] = ss_256[3];
887 dst += 2 * dst_stride;
888 dst8 += 2 * dst8_stride;
889 y -= 2;
890 } while (y);
891 }
892 }
893 } else {
894 const int16_t offset_avg = (1 << (FILTER_BITS - 1)) + (1 << (round_1 - bits - 2)) -
895 (round_offset << (round_1 - bits - 1));
896
897 if (w <= 4) {
898 const __m128i offset_avg_128 = _mm_set1_epi16(offset_avg);
899
900 prepare_half_coeffs_4tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
901
902 if (w == 2) {
903 __m128i s_16[4], ss_128[2];
904
905 s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
906 s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
907 s_16[2] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
908
909 const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
910 const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
911
912 ss_128[0] = _mm_unpacklo_epi8(src01, src12);
913
914 do {
915 src_ptr += 2 * src_stride;
916 const __m128i res = y_convolve_4tap_2x2_ssse3(
917 src_ptr, src_stride, coeffs_128, s_16, ss_128);
918 jnt_avg_round_store_2x2_sse2(
919 res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
920 ss_128[0] = ss_128[1];
921 dst += 2 * dst_stride;
922 dst8 += 2 * dst8_stride;
923 y -= 2;
924 } while (y);
925 } else {
926 __m128i s_32[4], ss_128[2];
927
928 assert(w == 4);
929
930 s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
931 s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
932 s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
933
934 const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
935 const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
936
937 ss_128[0] = _mm_unpacklo_epi8(src01, src12);
938
939 do {
940 src_ptr += 2 * src_stride;
941 const __m128i res = y_convolve_4tap_4x2_ssse3(
942 src_ptr, src_stride, coeffs_128, s_32, ss_128);
943 jnt_avg_round_store_4x2_sse2(
944 res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
945 ss_128[0] = ss_128[1];
946 dst += 2 * dst_stride;
947 dst8 += 2 * dst8_stride;
948 y -= 2;
949 } while (y);
950 }
951 } else {
952 const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);
953
954 prepare_half_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
955
956 if (w == 8) {
957 __m128i s_64[4];
958 __m256i ss_256[2];
959
960 s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
961 s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
962 s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
963
964 // Load lines a and b. Line a to lower 128, line b to upper
965 // 128
966 const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
967 const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
968
969 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
970
971 do {
972 src_ptr += 2 * src_stride;
973 const __m256i res = y_convolve_4tap_8x2_avx2(
974 src_ptr, src_stride, coeffs_256, s_64, ss_256);
975 jnt_avg_round_store_8x2_avx2(
976 res, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
977 ss_256[0] = ss_256[1];
978 dst += 2 * dst_stride;
979 dst8 += 2 * dst8_stride;
980 y -= 2;
981 } while (y);
982 } else {
983 __m128i s_128[4];
984 __m256i ss_256[4], r[2];
985
986 assert(w == 16);
987
988 s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
989 s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
990 s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
991
992 // Load lines a and b. Line a to lower 128, line b to upper
993 // 128
994 const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
995 const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
996
997 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
998 ss_256[2] = _mm256_unpackhi_epi8(src01, src12);
999
1000 do {
1001 src_ptr += 2 * src_stride;
1002 y_convolve_4tap_16x2_avx2(
1003 src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
1004 jnt_avg_round_store_16x2_avx2(
1005 r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
1006 ss_256[0] = ss_256[1];
1007 ss_256[2] = ss_256[3];
1008 dst += 2 * dst_stride;
1009 dst8 += 2 * dst8_stride;
1010 y -= 2;
1011 } while (y);
1012 }
1013 }
1014 }
1015 } else {
1016 const int16_t offset_no_avg = (round_offset << (round_1 - bits - 1)) +
1017 (1 << (round_1 - bits - 2));
1018
1019 if (w <= 4) {
1020 const __m128i offset_no_avg_128 = _mm_set1_epi16(offset_no_avg);
1021
1022 prepare_half_coeffs_4tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
1023
1024 if (w == 2) {
1025 __m128i s_16[4], ss_128[2];
1026
1027 s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
1028 s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
1029 s_16[2] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
1030
1031 const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
1032 const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
1033
1034 ss_128[0] = _mm_unpacklo_epi8(src01, src12);
1035
1036 do {
1037 src_ptr += 2 * src_stride;
1038 const __m128i res = y_convolve_4tap_2x2_ssse3(
1039 src_ptr, src_stride, coeffs_128, s_16, ss_128);
1040 jnt_no_avg_round_store_2x2_sse2(res, offset_no_avg_128, dst, dst_stride);
1041 ss_128[0] = ss_128[1];
1042 dst += 2 * dst_stride;
1043 y -= 2;
1044 } while (y);
1045 } else {
1046 __m128i s_32[4], ss_128[2];
1047
1048 assert(w == 4);
1049
1050 s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
1051 s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
1052 s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
1053
1054 const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1055 const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
1056
1057 ss_128[0] = _mm_unpacklo_epi8(src01, src12);
1058
1059 do {
1060 src_ptr += 2 * src_stride;
1061 const __m128i res = y_convolve_4tap_4x2_ssse3(
1062 src_ptr, src_stride, coeffs_128, s_32, ss_128);
1063 jnt_no_avg_round_store_4x2_sse2(res, offset_no_avg_128, dst, dst_stride);
1064 ss_128[0] = ss_128[1];
1065 dst += 2 * dst_stride;
1066 y -= 2;
1067 } while (y);
1068 }
1069 } else {
1070 const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);
1071
1072 prepare_half_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
1073
1074 if (w == 8) {
1075 __m128i s_64[4];
1076 __m256i ss_256[2];
1077
1078 s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
1079 s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
1080 s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
1081
1082 // Load lines a and b. Line a to lower 128, line b to upper 128
1083 const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
1084 const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
1085
1086 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
1087
1088 do {
1089 src_ptr += 2 * src_stride;
1090 const __m256i res = y_convolve_4tap_8x2_avx2(
1091 src_ptr, src_stride, coeffs_256, s_64, ss_256);
1092 jnt_no_avg_round_store_8x2_avx2(res, offset_no_avg_256, dst, dst_stride);
1093 ss_256[0] = ss_256[1];
1094 dst += 2 * dst_stride;
1095 y -= 2;
1096 } while (y);
1097 } else {
1098 __m128i s_128[4];
1099 __m256i ss_256[4], r[2];
1100
1101 assert(w == 16);
1102
1103 s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
1104 s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
1105 s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
1106
1107 // Load lines a and b. Line a to lower 128, line b to upper 128
1108 const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
1109 const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
1110
1111 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
1112 ss_256[2] = _mm256_unpackhi_epi8(src01, src12);
1113
1114 do {
1115 src_ptr += 2 * src_stride;
1116 y_convolve_4tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
1117 jnt_no_avg_round_store_16x2_avx2(r, offset_no_avg_256, dst, dst_stride);
1118 ss_256[0] = ss_256[1];
1119 ss_256[2] = ss_256[3];
1120 dst += 2 * dst_stride;
1121 y -= 2;
1122 } while (y);
1123 }
1124 }
1125 }
1126 }
1127
jnt_convolve_y_6tap_avx2(const uint8_t * const src,const int32_t src_stride,uint8_t * dst8,const int32_t dst8_stride,const int32_t w,const int32_t h,const InterpFilterParams * const filter_params_y,const int32_t subpel_y_q4,const ConvolveParams * const conv_params)1128 static void jnt_convolve_y_6tap_avx2(const uint8_t *const src, const int32_t src_stride,
1129 uint8_t *dst8, const int32_t dst8_stride, const int32_t w,
1130 const int32_t h,
1131 const InterpFilterParams *const filter_params_y,
1132 const int32_t subpel_y_q4,
1133 const ConvolveParams *const conv_params) {
1134 const uint8_t *src_ptr = src - 2 * src_stride;
1135 const int32_t dst_stride = conv_params->dst_stride;
1136 const int32_t round_0 = 3;
1137 const int32_t round_1 = COMPOUND_ROUND1_BITS;
1138 const int32_t bits = FILTER_BITS - round_0;
1139 const int32_t bd = 8;
1140 const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1;
1141 const int32_t offset_bits = bd + round_bits;
1142 const int32_t round_offset = (1 << offset_bits) + (1 << (offset_bits - 1));
1143 ConvBufType * dst = conv_params->dst;
1144 int32_t x;
1145 __m128i coeffs_128[4];
1146 __m256i coeffs_256[4];
1147
1148 if (conv_params->do_average) {
1149 if (conv_params->use_jnt_comp_avg) {
1150 const int32_t factor = conv_params->fwd_offset | (conv_params->bck_offset << 16);
1151 const int32_t offset_comp_avg = round_offset * conv_params->bck_offset +
1152 (1 << (round_bits + DIST_PRECISION_BITS - 1)) -
1153 (round_offset << DIST_PRECISION_BITS);
1154
1155 if (w <= 4) {
1156 const __m128i factor_128 = _mm_set1_epi32(factor);
1157 const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
1158
1159 prepare_half_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
1160
1161 int32_t y = h;
1162
1163 if (w == 2) {
1164 __m128i s_16[6], ss_128[3];
1165
1166 s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
1167 s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
1168 s_16[2] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
1169 s_16[3] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 3 * src_stride));
1170 s_16[4] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 4 * src_stride));
1171
1172 const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
1173 const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
1174 const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
1175 const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
1176
1177 ss_128[0] = _mm_unpacklo_epi8(src01, src12);
1178 ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1179
1180 do {
1181 src_ptr += 2 * src_stride;
1182 const __m128i res = y_convolve_6tap_2x2_ssse3(
1183 src_ptr, src_stride, coeffs_128, s_16, ss_128);
1184 jnt_comp_avg_round_store_2x2_sse2(res,
1185 factor_128,
1186 offset_comp_avg_128,
1187 dst,
1188 dst_stride,
1189 dst8,
1190 dst8_stride);
1191 ss_128[0] = ss_128[1];
1192 ss_128[1] = ss_128[2];
1193 dst += 2 * dst_stride;
1194 dst8 += 2 * dst8_stride;
1195 y -= 2;
1196 } while (y);
1197 } else {
1198 __m128i s_32[6], ss_128[3];
1199
1200 assert(w == 4);
1201
1202 s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
1203 s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
1204 s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
1205 s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 3 * src_stride));
1206 s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 4 * src_stride));
1207
1208 const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1209 const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
1210 const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1211 const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
1212
1213 ss_128[0] = _mm_unpacklo_epi8(src01, src12);
1214 ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1215
1216 do {
1217 src_ptr += 2 * src_stride;
1218 const __m128i res = y_convolve_6tap_4x2_ssse3(
1219 src_ptr, src_stride, coeffs_128, s_32, ss_128);
1220 jnt_comp_avg_round_store_4x2_sse2(res,
1221 factor_128,
1222 offset_comp_avg_128,
1223 dst,
1224 dst_stride,
1225 dst8,
1226 dst8_stride);
1227 ss_128[0] = ss_128[1];
1228 ss_128[1] = ss_128[2];
1229 dst += 2 * dst_stride;
1230 dst8 += 2 * dst8_stride;
1231 y -= 2;
1232 } while (y);
1233 }
1234 } else {
1235 const __m256i factor_256 = _mm256_set1_epi32(factor);
1236 const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
1237
1238 prepare_half_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
1239
1240 if (w == 8) {
1241 __m128i s_64[6];
1242 __m256i ss_256[3];
1243
1244 s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
1245 s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
1246 s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
1247 s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
1248 s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
1249
1250 // Load lines a and b. Line a to lower 128, line b to upper
1251 // 128
1252 const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
1253 const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
1254 const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
1255 const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
1256
1257 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
1258 ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1259
1260 int32_t y = h;
1261 do {
1262 src_ptr += 2 * src_stride;
1263 const __m256i res = y_convolve_6tap_8x2_avx2(
1264 src_ptr, src_stride, coeffs_256, s_64, ss_256);
1265 jnt_comp_avg_round_store_8x2_avx2(res,
1266 factor_256,
1267 offset_comp_avg_256,
1268 dst,
1269 dst_stride,
1270 dst8,
1271 dst8_stride);
1272 ss_256[0] = ss_256[1];
1273 ss_256[1] = ss_256[2];
1274 dst += 2 * dst_stride;
1275 dst8 += 2 * dst8_stride;
1276 y -= 2;
1277 } while (y);
1278 } else if (w == 16) {
1279 __m128i s_128[6];
1280 __m256i ss_256[6], r[2];
1281
1282 s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
1283 s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
1284 s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
1285 s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
1286 s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
1287
1288 // Load lines a and b. Line a to lower 128, line b to upper
1289 // 128
1290 const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
1291 const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
1292 const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
1293 const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
1294
1295 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
1296 ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1297
1298 ss_256[3] = _mm256_unpackhi_epi8(src01, src12);
1299 ss_256[4] = _mm256_unpackhi_epi8(src23, src34);
1300
1301 int32_t y = h;
1302 do {
1303 src_ptr += 2 * src_stride;
1304 y_convolve_6tap_16x2_avx2(
1305 src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
1306 jnt_comp_avg_round_store_16x2_avx2(
1307 r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
1308 ss_256[0] = ss_256[1];
1309 ss_256[1] = ss_256[2];
1310 ss_256[3] = ss_256[4];
1311 ss_256[4] = ss_256[5];
1312 dst += 2 * dst_stride;
1313 dst8 += 2 * dst8_stride;
1314 y -= 2;
1315 } while (y);
1316 } else {
1317 __m256i s_256[6], ss_256[6], tt_256[6], r[4];
1318
1319 assert(!(w % 32));
1320
1321 x = 0;
1322 do {
1323 const uint8_t *s = src_ptr + x;
1324 ConvBufType * d = dst + x;
1325 uint8_t * d8 = dst8 + x;
1326
1327 s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
1328 s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
1329 s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
1330 s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
1331 s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
1332
1333 ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1334 ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
1335 ss_256[3] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
1336 ss_256[4] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
1337
1338 tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
1339 tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
1340 tt_256[3] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
1341 tt_256[4] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
1342
1343 int32_t y = h;
1344 do {
1345 s += 2 * src_stride;
1346 y_convolve_6tap_32x2_avx2(
1347 s, src_stride, coeffs_256, s_256, ss_256, tt_256, r);
1348 jnt_comp_avg_round_store_32_avx2(
1349 r, factor_256, offset_comp_avg_256, d, d8);
1350 jnt_comp_avg_round_store_32_avx2(r + 2,
1351 factor_256,
1352 offset_comp_avg_256,
1353 d + dst_stride,
1354 d8 + dst8_stride);
1355
1356 ss_256[0] = ss_256[1];
1357 ss_256[1] = ss_256[2];
1358 ss_256[3] = ss_256[4];
1359 ss_256[4] = ss_256[5];
1360
1361 tt_256[0] = tt_256[1];
1362 tt_256[1] = tt_256[2];
1363 tt_256[3] = tt_256[4];
1364 tt_256[4] = tt_256[5];
1365 d += 2 * dst_stride;
1366 d8 += 2 * dst8_stride;
1367 y -= 2;
1368 } while (y);
1369
1370 x += 32;
1371 } while (x < w);
1372 }
1373 }
1374 } else {
1375 const int16_t offset_avg = (1 << (FILTER_BITS - 1)) + (1 << (round_1 - bits - 2)) -
1376 (round_offset << (round_1 - bits - 1));
1377
1378 if (w <= 4) {
1379 const __m128i offset_avg_128 = _mm_set1_epi16(offset_avg);
1380
1381 prepare_half_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
1382
1383 int32_t y = h;
1384
1385 if (w == 2) {
1386 __m128i s_16[6], ss_128[3];
1387
1388 s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
1389 s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
1390 s_16[2] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
1391 s_16[3] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 3 * src_stride));
1392 s_16[4] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 4 * src_stride));
1393
1394 const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
1395 const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
1396 const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
1397 const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
1398
1399 ss_128[0] = _mm_unpacklo_epi8(src01, src12);
1400 ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1401
1402 do {
1403 src_ptr += 2 * src_stride;
1404 const __m128i res = y_convolve_6tap_2x2_ssse3(
1405 src_ptr, src_stride, coeffs_128, s_16, ss_128);
1406 jnt_avg_round_store_2x2_sse2(
1407 res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
1408 ss_128[0] = ss_128[1];
1409 ss_128[1] = ss_128[2];
1410 dst += 2 * dst_stride;
1411 dst8 += 2 * dst8_stride;
1412 y -= 2;
1413 } while (y);
1414 } else {
1415 __m128i s_32[6], ss_128[3];
1416
1417 assert(w == 4);
1418
1419 s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
1420 s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
1421 s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
1422 s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 3 * src_stride));
1423 s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 4 * src_stride));
1424
1425 const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1426 const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
1427 const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1428 const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
1429
1430 ss_128[0] = _mm_unpacklo_epi8(src01, src12);
1431 ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1432
1433 do {
1434 src_ptr += 2 * src_stride;
1435 const __m128i res = y_convolve_6tap_4x2_ssse3(
1436 src_ptr, src_stride, coeffs_128, s_32, ss_128);
1437 jnt_avg_round_store_4x2_sse2(
1438 res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
1439 ss_128[0] = ss_128[1];
1440 ss_128[1] = ss_128[2];
1441 dst += 2 * dst_stride;
1442 dst8 += 2 * dst8_stride;
1443 y -= 2;
1444 } while (y);
1445 }
1446 } else {
1447 const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);
1448
1449 prepare_half_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
1450
1451 if (w == 8) {
1452 __m128i s_64[6];
1453 __m256i ss_256[3];
1454
1455 s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
1456 s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
1457 s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
1458 s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
1459 s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
1460
1461 // Load lines a and b. Line a to lower 128, line b to upper
1462 // 128
1463 const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
1464 const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
1465 const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
1466 const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
1467
1468 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
1469 ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1470
1471 int32_t y = h;
1472 do {
1473 src_ptr += 2 * src_stride;
1474 const __m256i res = y_convolve_6tap_8x2_avx2(
1475 src_ptr, src_stride, coeffs_256, s_64, ss_256);
1476 jnt_avg_round_store_8x2_avx2(
1477 res, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
1478 ss_256[0] = ss_256[1];
1479 ss_256[1] = ss_256[2];
1480 dst += 2 * dst_stride;
1481 dst8 += 2 * dst8_stride;
1482 y -= 2;
1483 } while (y);
1484 } else if (w == 16) {
1485 __m128i s_128[6];
1486 __m256i ss_256[6], r[2];
1487
1488 s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
1489 s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
1490 s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
1491 s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
1492 s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
1493
1494 // Load lines a and b. Line a to lower 128, line b to upper
1495 // 128
1496 const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
1497 const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
1498 const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
1499 const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
1500
1501 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
1502 ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1503
1504 ss_256[3] = _mm256_unpackhi_epi8(src01, src12);
1505 ss_256[4] = _mm256_unpackhi_epi8(src23, src34);
1506
1507 int32_t y = h;
1508 do {
1509 src_ptr += 2 * src_stride;
1510 y_convolve_6tap_16x2_avx2(
1511 src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
1512 jnt_avg_round_store_16x2_avx2(
1513 r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
1514 ss_256[0] = ss_256[1];
1515 ss_256[1] = ss_256[2];
1516 ss_256[3] = ss_256[4];
1517 ss_256[4] = ss_256[5];
1518 dst += 2 * dst_stride;
1519 dst8 += 2 * dst8_stride;
1520 y -= 2;
1521 } while (y);
1522 } else {
1523 __m256i s_256[6], ss_256[6], tt_256[6], r[4];
1524
1525 assert(!(w % 32));
1526
1527 x = 0;
1528 do {
1529 const uint8_t *s = src_ptr + x;
1530 ConvBufType * d = dst + x;
1531 uint8_t * d8 = dst8 + x;
1532
1533 s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
1534 s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
1535 s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
1536 s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
1537 s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
1538
1539 ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1540 ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
1541 ss_256[3] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
1542 ss_256[4] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
1543
1544 tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
1545 tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
1546 tt_256[3] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
1547 tt_256[4] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
1548
1549 int32_t y = h;
1550 do {
1551 s += 2 * src_stride;
1552 y_convolve_6tap_32x2_avx2(
1553 s, src_stride, coeffs_256, s_256, ss_256, tt_256, r);
1554 jnt_avg_round_store_32_avx2(r, offset_avg_256, d, d8);
1555 jnt_avg_round_store_32_avx2(
1556 r + 2, offset_avg_256, d + dst_stride, d8 + dst8_stride);
1557
1558 ss_256[0] = ss_256[1];
1559 ss_256[1] = ss_256[2];
1560 ss_256[3] = ss_256[4];
1561 ss_256[4] = ss_256[5];
1562
1563 tt_256[0] = tt_256[1];
1564 tt_256[1] = tt_256[2];
1565 tt_256[3] = tt_256[4];
1566 tt_256[4] = tt_256[5];
1567 d += 2 * dst_stride;
1568 d8 += 2 * dst8_stride;
1569 y -= 2;
1570 } while (y);
1571
1572 x += 32;
1573 } while (x < w);
1574 }
1575 }
1576 }
1577 } else {
1578 const int16_t offset_no_avg = (round_offset << (round_1 - bits - 1)) +
1579 (1 << (round_1 - bits - 2));
1580
1581 if (w <= 4) {
1582 const __m128i offset_no_avg_128 = _mm_set1_epi16(offset_no_avg);
1583
1584 prepare_half_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
1585
1586 int32_t y = h;
1587
1588 if (w == 2) {
1589 __m128i s_16[6], ss_128[3];
1590
1591 s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
1592 s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
1593 s_16[2] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
1594 s_16[3] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 3 * src_stride));
1595 s_16[4] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 4 * src_stride));
1596
1597 const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
1598 const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
1599 const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
1600 const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
1601
1602 ss_128[0] = _mm_unpacklo_epi8(src01, src12);
1603 ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1604
1605 do {
1606 src_ptr += 2 * src_stride;
1607 const __m128i res = y_convolve_6tap_2x2_ssse3(
1608 src_ptr, src_stride, coeffs_128, s_16, ss_128);
1609 jnt_no_avg_round_store_2x2_sse2(res, offset_no_avg_128, dst, dst_stride);
1610 ss_128[0] = ss_128[1];
1611 ss_128[1] = ss_128[2];
1612 dst += 2 * dst_stride;
1613 y -= 2;
1614 } while (y);
1615 } else {
1616 __m128i s_32[6], ss_128[3];
1617
1618 assert(w == 4);
1619
1620 s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
1621 s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
1622 s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
1623 s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 3 * src_stride));
1624 s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 4 * src_stride));
1625
1626 const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1627 const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
1628 const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1629 const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
1630
1631 ss_128[0] = _mm_unpacklo_epi8(src01, src12);
1632 ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1633
1634 do {
1635 src_ptr += 2 * src_stride;
1636 const __m128i res = y_convolve_6tap_4x2_ssse3(
1637 src_ptr, src_stride, coeffs_128, s_32, ss_128);
1638 jnt_no_avg_round_store_4x2_sse2(res, offset_no_avg_128, dst, dst_stride);
1639 ss_128[0] = ss_128[1];
1640 ss_128[1] = ss_128[2];
1641 dst += 2 * dst_stride;
1642 y -= 2;
1643 } while (y);
1644 }
1645 } else {
1646 const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);
1647
1648 prepare_half_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
1649
1650 if (w == 8) {
1651 __m128i s_64[6];
1652 __m256i ss_256[3];
1653
1654 s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
1655 s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
1656 s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
1657 s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
1658 s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
1659
1660 // Load lines a and b. Line a to lower 128, line b to upper 128
1661 const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
1662 const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
1663 const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
1664 const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
1665
1666 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
1667 ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1668
1669 int32_t y = h;
1670 do {
1671 src_ptr += 2 * src_stride;
1672 const __m256i res = y_convolve_6tap_8x2_avx2(
1673 src_ptr, src_stride, coeffs_256, s_64, ss_256);
1674 jnt_no_avg_round_store_8x2_avx2(res, offset_no_avg_256, dst, dst_stride);
1675 ss_256[0] = ss_256[1];
1676 ss_256[1] = ss_256[2];
1677 dst += 2 * dst_stride;
1678 y -= 2;
1679 } while (y);
1680 } else if (w == 16) {
1681 __m128i s_128[6];
1682 __m256i ss_256[6], r[2];
1683
1684 s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
1685 s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
1686 s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
1687 s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
1688 s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
1689
1690 // Load lines a and b. Line a to lower 128, line b to upper 128
1691 const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
1692 const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
1693 const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
1694 const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
1695
1696 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
1697 ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1698
1699 ss_256[3] = _mm256_unpackhi_epi8(src01, src12);
1700 ss_256[4] = _mm256_unpackhi_epi8(src23, src34);
1701
1702 int32_t y = h;
1703 do {
1704 src_ptr += 2 * src_stride;
1705 y_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
1706 jnt_no_avg_round_store_16x2_avx2(r, offset_no_avg_256, dst, dst_stride);
1707 ss_256[0] = ss_256[1];
1708 ss_256[1] = ss_256[2];
1709 ss_256[3] = ss_256[4];
1710 ss_256[4] = ss_256[5];
1711 dst += 2 * dst_stride;
1712 y -= 2;
1713 } while (y);
1714 } else {
1715 __m256i s_256[6], ss_256[6], tt_256[6], r[4];
1716
1717 assert(!(w % 32));
1718
1719 x = 0;
1720 do {
1721 const uint8_t *s = src_ptr + x;
1722 ConvBufType * d = dst + x;
1723
1724 s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
1725 s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
1726 s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
1727 s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
1728 s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
1729
1730 ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1731 ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
1732 ss_256[3] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
1733 ss_256[4] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
1734
1735 tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
1736 tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
1737 tt_256[3] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
1738 tt_256[4] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
1739
1740 int32_t y = h;
1741 do {
1742 s += 2 * src_stride;
1743 y_convolve_6tap_32x2_avx2(
1744 s, src_stride, coeffs_256, s_256, ss_256, tt_256, r);
1745 jnt_no_avg_round_store_32_avx2(r, offset_no_avg_256, d);
1746 jnt_no_avg_round_store_32_avx2(r + 2, offset_no_avg_256, d + dst_stride);
1747
1748 ss_256[0] = ss_256[1];
1749 ss_256[1] = ss_256[2];
1750 ss_256[3] = ss_256[4];
1751 ss_256[4] = ss_256[5];
1752
1753 tt_256[0] = tt_256[1];
1754 tt_256[1] = tt_256[2];
1755 tt_256[3] = tt_256[4];
1756 tt_256[4] = tt_256[5];
1757 d += 2 * dst_stride;
1758 y -= 2;
1759 } while (y);
1760
1761 x += 32;
1762 } while (x < w);
1763 }
1764 }
1765 }
1766 }
1767
jnt_convolve_y_8tap_avx2(const uint8_t * const src,const int32_t src_stride,uint8_t * dst8,const int32_t dst8_stride,const int32_t w,const int32_t h,const InterpFilterParams * const filter_params_y,const int32_t subpel_y_q4,const ConvolveParams * const conv_params)1768 static void jnt_convolve_y_8tap_avx2(const uint8_t *const src, const int32_t src_stride,
1769 uint8_t *dst8, const int32_t dst8_stride, const int32_t w,
1770 const int32_t h,
1771 const InterpFilterParams *const filter_params_y,
1772 const int32_t subpel_y_q4,
1773 const ConvolveParams *const conv_params) {
1774 const uint8_t *src_ptr = src - 3 * src_stride;
1775 const int32_t dst_stride = conv_params->dst_stride;
1776 const int32_t round_0 = 3;
1777 const int32_t round_1 = COMPOUND_ROUND1_BITS;
1778 const int32_t bits = FILTER_BITS - round_0;
1779 const int32_t bd = 8;
1780 const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1;
1781 const int32_t offset_bits = bd + round_bits;
1782 const int32_t round_offset = (1 << offset_bits) + (1 << (offset_bits - 1));
1783 ConvBufType * dst = conv_params->dst;
1784 int32_t x;
1785 __m128i coeffs_128[4];
1786 __m256i coeffs_256[4];
1787
1788 if (conv_params->do_average) {
1789 if (conv_params->use_jnt_comp_avg) {
1790 const int32_t factor = conv_params->fwd_offset | (conv_params->bck_offset << 16);
1791 const int32_t offset_comp_avg = round_offset * conv_params->bck_offset +
1792 (1 << (round_bits + DIST_PRECISION_BITS - 1)) -
1793 (round_offset << DIST_PRECISION_BITS);
1794
1795 if (w <= 4) {
1796 const __m128i factor_128 = _mm_set1_epi32(factor);
1797 const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
1798
1799 prepare_half_coeffs_8tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
1800
1801 int32_t y = h;
1802
1803 if (w == 2) {
1804 __m128i s_16[8], ss_128[4];
1805
1806 s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
1807 s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
1808 s_16[2] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
1809 s_16[3] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 3 * src_stride));
1810 s_16[4] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 4 * src_stride));
1811 s_16[5] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 5 * src_stride));
1812 s_16[6] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 6 * src_stride));
1813
1814 const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
1815 const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
1816 const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
1817 const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
1818 const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
1819 const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[6]);
1820
1821 ss_128[0] = _mm_unpacklo_epi8(src01, src12);
1822 ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1823 ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1824
1825 do {
1826 const __m128i res = y_convolve_8tap_2x2_ssse3(
1827 src_ptr, src_stride, coeffs_128, s_16, ss_128);
1828 jnt_comp_avg_round_store_2x2_sse2(res,
1829 factor_128,
1830 offset_comp_avg_128,
1831 dst,
1832 dst_stride,
1833 dst8,
1834 dst8_stride);
1835 ss_128[0] = ss_128[1];
1836 ss_128[1] = ss_128[2];
1837 ss_128[2] = ss_128[3];
1838 src_ptr += 2 * src_stride;
1839 dst += 2 * dst_stride;
1840 dst8 += 2 * dst8_stride;
1841 y -= 2;
1842 } while (y);
1843 } else {
1844 __m128i s_32[8], ss_128[4];
1845
1846 assert(w == 4);
1847
1848 s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
1849 s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
1850 s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
1851 s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 3 * src_stride));
1852 s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 4 * src_stride));
1853 s_32[5] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 5 * src_stride));
1854 s_32[6] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 6 * src_stride));
1855
1856 const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1857 const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
1858 const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1859 const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
1860 const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1861 const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
1862
1863 ss_128[0] = _mm_unpacklo_epi8(src01, src12);
1864 ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1865 ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1866
1867 do {
1868 const __m128i res = y_convolve_8tap_4x2_ssse3(
1869 src_ptr, src_stride, coeffs_128, s_32, ss_128);
1870 jnt_comp_avg_round_store_4x2_sse2(res,
1871 factor_128,
1872 offset_comp_avg_128,
1873 dst,
1874 dst_stride,
1875 dst8,
1876 dst8_stride);
1877 ss_128[0] = ss_128[1];
1878 ss_128[1] = ss_128[2];
1879 ss_128[2] = ss_128[3];
1880 src_ptr += 2 * src_stride;
1881 dst += 2 * dst_stride;
1882 dst8 += 2 * dst8_stride;
1883 y -= 2;
1884 } while (y);
1885 }
1886 } else {
1887 const __m256i factor_256 = _mm256_set1_epi32(factor);
1888 const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
1889
1890 prepare_half_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
1891
1892 if (w == 8) {
1893 __m128i s_64[8];
1894 __m256i ss_256[4];
1895
1896 s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
1897 s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
1898 s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
1899 s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
1900 s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
1901 s_64[5] = _mm_loadl_epi64((__m128i *)(src_ptr + 5 * src_stride));
1902 s_64[6] = _mm_loadl_epi64((__m128i *)(src_ptr + 6 * src_stride));
1903
1904 // Load lines a and b. Line a to lower 128, line b to upper
1905 // 128
1906 const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
1907 const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
1908 const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
1909 const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
1910 const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
1911 const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[6]);
1912
1913 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
1914 ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1915 ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1916
1917 int32_t y = h;
1918 do {
1919 const __m256i res = y_convolve_8tap_8x2_avx2(
1920 src_ptr, src_stride, coeffs_256, s_64, ss_256);
1921 jnt_comp_avg_round_store_8x2_avx2(res,
1922 factor_256,
1923 offset_comp_avg_256,
1924 dst,
1925 dst_stride,
1926 dst8,
1927 dst8_stride);
1928 ss_256[0] = ss_256[1];
1929 ss_256[1] = ss_256[2];
1930 ss_256[2] = ss_256[3];
1931 src_ptr += 2 * src_stride;
1932 dst += 2 * dst_stride;
1933 dst8 += 2 * dst8_stride;
1934 y -= 2;
1935 } while (y);
1936 } else if (w == 16) {
1937 __m128i s_128[8];
1938 __m256i ss_256[8], r[2];
1939
1940 s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
1941 s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
1942 s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
1943 s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
1944 s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
1945 s_128[5] = _mm_loadu_si128((__m128i *)(src_ptr + 5 * src_stride));
1946 s_128[6] = _mm_loadu_si128((__m128i *)(src_ptr + 6 * src_stride));
1947
1948 // Load lines a and b. Line a to lower 128, line b to upper
1949 // 128
1950 const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
1951 const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
1952 const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
1953 const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
1954 const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
1955 const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[6]);
1956
1957 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
1958 ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1959 ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1960
1961 ss_256[4] = _mm256_unpackhi_epi8(src01, src12);
1962 ss_256[5] = _mm256_unpackhi_epi8(src23, src34);
1963 ss_256[6] = _mm256_unpackhi_epi8(src45, src56);
1964
1965 int32_t y = h;
1966 do {
1967 y_convolve_8tap_16x2_avx2(
1968 src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
1969 jnt_comp_avg_round_store_16x2_avx2(
1970 r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
1971 ss_256[0] = ss_256[1];
1972 ss_256[1] = ss_256[2];
1973 ss_256[2] = ss_256[3];
1974 ss_256[4] = ss_256[5];
1975 ss_256[5] = ss_256[6];
1976 ss_256[6] = ss_256[7];
1977 src_ptr += 2 * src_stride;
1978 dst += 2 * dst_stride;
1979 dst8 += 2 * dst8_stride;
1980 y -= 2;
1981 } while (y);
1982 } else {
1983 __m256i s_256[8], ss_256[8], tt_256[8], r[4];
1984
1985 assert(!(w % 32));
1986
1987 x = 0;
1988 do {
1989 const uint8_t *s = src_ptr + x;
1990 ConvBufType * d = dst + x;
1991 uint8_t * d8 = dst8 + x;
1992
1993 s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
1994 s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
1995 s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
1996 s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
1997 s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
1998 s_256[5] = _mm256_loadu_si256((__m256i *)(s + 5 * src_stride));
1999 s_256[6] = _mm256_loadu_si256((__m256i *)(s + 6 * src_stride));
2000
2001 ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2002 ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
2003 ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
2004 ss_256[4] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2005 ss_256[5] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
2006 ss_256[6] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
2007
2008 tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2009 tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
2010 tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[6]);
2011 tt_256[4] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2012 tt_256[5] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
2013 tt_256[6] = _mm256_unpackhi_epi8(s_256[5], s_256[6]);
2014
2015 int32_t y = h;
2016 do {
2017 y_convolve_8tap_32x2_avx2(
2018 s, src_stride, coeffs_256, s_256, ss_256, tt_256, r);
2019 jnt_comp_avg_round_store_32_avx2(
2020 r, factor_256, offset_comp_avg_256, d, d8);
2021 jnt_comp_avg_round_store_32_avx2(r + 2,
2022 factor_256,
2023 offset_comp_avg_256,
2024 d + dst_stride,
2025 d8 + dst8_stride);
2026
2027 ss_256[0] = ss_256[1];
2028 ss_256[1] = ss_256[2];
2029 ss_256[2] = ss_256[3];
2030 ss_256[4] = ss_256[5];
2031 ss_256[5] = ss_256[6];
2032 ss_256[6] = ss_256[7];
2033
2034 tt_256[0] = tt_256[1];
2035 tt_256[1] = tt_256[2];
2036 tt_256[2] = tt_256[3];
2037 tt_256[4] = tt_256[5];
2038 tt_256[5] = tt_256[6];
2039 tt_256[6] = tt_256[7];
2040 s += 2 * src_stride;
2041 d += 2 * dst_stride;
2042 d8 += 2 * dst8_stride;
2043 y -= 2;
2044 } while (y);
2045
2046 x += 32;
2047 } while (x < w);
2048 }
2049 }
2050 } else {
2051 const int16_t offset_avg = (1 << (FILTER_BITS - 1)) + (1 << (round_1 - bits - 2)) -
2052 (round_offset << (round_1 - bits - 1));
2053
2054 if (w <= 4) {
2055 const __m128i offset_avg_128 = _mm_set1_epi16(offset_avg);
2056
2057 prepare_half_coeffs_8tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2058
2059 int32_t y = h;
2060
2061 if (w == 2) {
2062 __m128i s_16[8], ss_128[4];
2063
2064 s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
2065 s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
2066 s_16[2] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
2067 s_16[3] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 3 * src_stride));
2068 s_16[4] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 4 * src_stride));
2069 s_16[5] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 5 * src_stride));
2070 s_16[6] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 6 * src_stride));
2071
2072 const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2073 const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2074 const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
2075 const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
2076 const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
2077 const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[6]);
2078
2079 ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2080 ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2081 ss_128[2] = _mm_unpacklo_epi8(src45, src56);
2082
2083 do {
2084 const __m128i res = y_convolve_8tap_2x2_ssse3(
2085 src_ptr, src_stride, coeffs_128, s_16, ss_128);
2086 jnt_avg_round_store_2x2_sse2(
2087 res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
2088 ss_128[0] = ss_128[1];
2089 ss_128[1] = ss_128[2];
2090 ss_128[2] = ss_128[3];
2091 src_ptr += 2 * src_stride;
2092 dst += 2 * dst_stride;
2093 dst8 += 2 * dst8_stride;
2094 y -= 2;
2095 } while (y);
2096 } else {
2097 __m128i s_32[8], ss_128[4];
2098
2099 assert(w == 4);
2100
2101 s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
2102 s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
2103 s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
2104 s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 3 * src_stride));
2105 s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 4 * src_stride));
2106 s_32[5] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 5 * src_stride));
2107 s_32[6] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 6 * src_stride));
2108
2109 const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2110 const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2111 const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
2112 const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
2113 const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
2114 const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
2115
2116 ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2117 ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2118 ss_128[2] = _mm_unpacklo_epi8(src45, src56);
2119
2120 do {
2121 const __m128i res = y_convolve_8tap_4x2_ssse3(
2122 src_ptr, src_stride, coeffs_128, s_32, ss_128);
2123 jnt_avg_round_store_4x2_sse2(
2124 res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
2125 ss_128[0] = ss_128[1];
2126 ss_128[1] = ss_128[2];
2127 ss_128[2] = ss_128[3];
2128 src_ptr += 2 * src_stride;
2129 dst += 2 * dst_stride;
2130 dst8 += 2 * dst8_stride;
2131 y -= 2;
2132 } while (y);
2133 }
2134 } else {
2135 const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);
2136
2137 prepare_half_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2138
2139 if (w == 8) {
2140 __m128i s_64[8];
2141 __m256i ss_256[4];
2142
2143 s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2144 s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2145 s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2146 s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
2147 s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
2148 s_64[5] = _mm_loadl_epi64((__m128i *)(src_ptr + 5 * src_stride));
2149 s_64[6] = _mm_loadl_epi64((__m128i *)(src_ptr + 6 * src_stride));
2150
2151 // Load lines a and b. Line a to lower 128, line b to upper
2152 // 128
2153 const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2154 const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2155 const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
2156 const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
2157 const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
2158 const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[6]);
2159
2160 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2161 ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2162 ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
2163
2164 int32_t y = h;
2165 do {
2166 const __m256i res = y_convolve_8tap_8x2_avx2(
2167 src_ptr, src_stride, coeffs_256, s_64, ss_256);
2168 jnt_avg_round_store_8x2_avx2(
2169 res, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
2170 ss_256[0] = ss_256[1];
2171 ss_256[1] = ss_256[2];
2172 ss_256[2] = ss_256[3];
2173 src_ptr += 2 * src_stride;
2174 dst += 2 * dst_stride;
2175 dst8 += 2 * dst8_stride;
2176 y -= 2;
2177 } while (y);
2178 } else if (w == 16) {
2179 __m128i s_128[8];
2180 __m256i ss_256[8], r[2];
2181
2182 s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2183 s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2184 s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2185 s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
2186 s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
2187 s_128[5] = _mm_loadu_si128((__m128i *)(src_ptr + 5 * src_stride));
2188 s_128[6] = _mm_loadu_si128((__m128i *)(src_ptr + 6 * src_stride));
2189
2190 // Load lines a and b. Line a to lower 128, line b to upper
2191 // 128
2192 const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2193 const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2194 const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
2195 const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
2196 const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
2197 const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[6]);
2198
2199 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2200 ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2201 ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
2202
2203 ss_256[4] = _mm256_unpackhi_epi8(src01, src12);
2204 ss_256[5] = _mm256_unpackhi_epi8(src23, src34);
2205 ss_256[6] = _mm256_unpackhi_epi8(src45, src56);
2206
2207 int32_t y = h;
2208 do {
2209 y_convolve_8tap_16x2_avx2(
2210 src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
2211 jnt_avg_round_store_16x2_avx2(
2212 r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
2213 ss_256[0] = ss_256[1];
2214 ss_256[1] = ss_256[2];
2215 ss_256[2] = ss_256[3];
2216 ss_256[4] = ss_256[5];
2217 ss_256[5] = ss_256[6];
2218 ss_256[6] = ss_256[7];
2219 src_ptr += 2 * src_stride;
2220 dst += 2 * dst_stride;
2221 dst8 += 2 * dst8_stride;
2222 y -= 2;
2223 } while (y);
2224 } else {
2225 __m256i s_256[8], ss_256[8], tt_256[8], r[4];
2226
2227 assert(!(w % 32));
2228
2229 x = 0;
2230 do {
2231 const uint8_t *s = src_ptr + x;
2232 ConvBufType * d = dst + x;
2233 uint8_t * d8 = dst8 + x;
2234
2235 s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2236 s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2237 s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2238 s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
2239 s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
2240 s_256[5] = _mm256_loadu_si256((__m256i *)(s + 5 * src_stride));
2241 s_256[6] = _mm256_loadu_si256((__m256i *)(s + 6 * src_stride));
2242
2243 ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2244 ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
2245 ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
2246 ss_256[4] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2247 ss_256[5] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
2248 ss_256[6] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
2249
2250 tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2251 tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
2252 tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[6]);
2253 tt_256[4] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2254 tt_256[5] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
2255 tt_256[6] = _mm256_unpackhi_epi8(s_256[5], s_256[6]);
2256
2257 int32_t y = h;
2258 do {
2259 y_convolve_8tap_32x2_avx2(
2260 s, src_stride, coeffs_256, s_256, ss_256, tt_256, r);
2261 jnt_avg_round_store_32_avx2(r, offset_avg_256, d, d8);
2262 jnt_avg_round_store_32_avx2(
2263 r + 2, offset_avg_256, d + dst_stride, d8 + dst8_stride);
2264
2265 ss_256[0] = ss_256[1];
2266 ss_256[1] = ss_256[2];
2267 ss_256[2] = ss_256[3];
2268 ss_256[4] = ss_256[5];
2269 ss_256[5] = ss_256[6];
2270 ss_256[6] = ss_256[7];
2271
2272 tt_256[0] = tt_256[1];
2273 tt_256[1] = tt_256[2];
2274 tt_256[2] = tt_256[3];
2275 tt_256[4] = tt_256[5];
2276 tt_256[5] = tt_256[6];
2277 tt_256[6] = tt_256[7];
2278 s += 2 * src_stride;
2279 d += 2 * dst_stride;
2280 d8 += 2 * dst8_stride;
2281 y -= 2;
2282 } while (y);
2283
2284 x += 32;
2285 } while (x < w);
2286 }
2287 }
2288 }
2289 } else {
2290 const int16_t offset_no_avg = (round_offset << (round_1 - bits - 1)) +
2291 (1 << (round_1 - bits - 2));
2292
2293 if (w <= 4) {
2294 const __m128i offset_no_avg_128 = _mm_set1_epi16(offset_no_avg);
2295
2296 prepare_half_coeffs_8tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
2297
2298 int32_t y = h;
2299
2300 if (w == 2) {
2301 __m128i s_16[8], ss_128[4];
2302
2303 s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
2304 s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
2305 s_16[2] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
2306 s_16[3] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 3 * src_stride));
2307 s_16[4] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 4 * src_stride));
2308 s_16[5] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 5 * src_stride));
2309 s_16[6] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 6 * src_stride));
2310
2311 const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
2312 const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
2313 const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
2314 const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
2315 const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
2316 const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[6]);
2317
2318 ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2319 ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2320 ss_128[2] = _mm_unpacklo_epi8(src45, src56);
2321
2322 do {
2323 const __m128i res = y_convolve_8tap_2x2_ssse3(
2324 src_ptr, src_stride, coeffs_128, s_16, ss_128);
2325 jnt_no_avg_round_store_2x2_sse2(res, offset_no_avg_128, dst, dst_stride);
2326 ss_128[0] = ss_128[1];
2327 ss_128[1] = ss_128[2];
2328 ss_128[2] = ss_128[3];
2329 src_ptr += 2 * src_stride;
2330 dst += 2 * dst_stride;
2331 y -= 2;
2332 } while (y);
2333 } else {
2334 __m128i s_32[8], ss_128[4];
2335
2336 assert(w == 4);
2337
2338 s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
2339 s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
2340 s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
2341 s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 3 * src_stride));
2342 s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 4 * src_stride));
2343 s_32[5] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 5 * src_stride));
2344 s_32[6] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 6 * src_stride));
2345
2346 const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
2347 const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
2348 const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
2349 const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
2350 const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
2351 const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
2352
2353 ss_128[0] = _mm_unpacklo_epi8(src01, src12);
2354 ss_128[1] = _mm_unpacklo_epi8(src23, src34);
2355 ss_128[2] = _mm_unpacklo_epi8(src45, src56);
2356
2357 do {
2358 const __m128i res = y_convolve_8tap_4x2_ssse3(
2359 src_ptr, src_stride, coeffs_128, s_32, ss_128);
2360 jnt_no_avg_round_store_4x2_sse2(res, offset_no_avg_128, dst, dst_stride);
2361 ss_128[0] = ss_128[1];
2362 ss_128[1] = ss_128[2];
2363 ss_128[2] = ss_128[3];
2364 src_ptr += 2 * src_stride;
2365 dst += 2 * dst_stride;
2366 y -= 2;
2367 } while (y);
2368 }
2369 } else {
2370 const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);
2371
2372 prepare_half_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
2373
2374 if (w == 8) {
2375 __m128i s_64[8];
2376 __m256i ss_256[4];
2377
2378 s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
2379 s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
2380 s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
2381 s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
2382 s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
2383 s_64[5] = _mm_loadl_epi64((__m128i *)(src_ptr + 5 * src_stride));
2384 s_64[6] = _mm_loadl_epi64((__m128i *)(src_ptr + 6 * src_stride));
2385
2386 // Load lines a and b. Line a to lower 128, line b to upper 128
2387 const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
2388 const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
2389 const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
2390 const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
2391 const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
2392 const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[6]);
2393
2394 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2395 ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2396 ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
2397
2398 int32_t y = h;
2399 do {
2400 const __m256i res = y_convolve_8tap_8x2_avx2(
2401 src_ptr, src_stride, coeffs_256, s_64, ss_256);
2402 jnt_no_avg_round_store_8x2_avx2(res, offset_no_avg_256, dst, dst_stride);
2403 ss_256[0] = ss_256[1];
2404 ss_256[1] = ss_256[2];
2405 ss_256[2] = ss_256[3];
2406 src_ptr += 2 * src_stride;
2407 dst += 2 * dst_stride;
2408 y -= 2;
2409 } while (y);
2410 } else if (w == 16) {
2411 __m128i s_128[8];
2412 __m256i ss_256[8], r[2];
2413
2414 s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
2415 s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
2416 s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
2417 s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
2418 s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
2419 s_128[5] = _mm_loadu_si128((__m128i *)(src_ptr + 5 * src_stride));
2420 s_128[6] = _mm_loadu_si128((__m128i *)(src_ptr + 6 * src_stride));
2421
2422 // Load lines a and b. Line a to lower 128, line b to upper 128
2423 const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
2424 const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
2425 const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
2426 const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
2427 const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
2428 const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[6]);
2429
2430 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
2431 ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
2432 ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
2433
2434 ss_256[4] = _mm256_unpackhi_epi8(src01, src12);
2435 ss_256[5] = _mm256_unpackhi_epi8(src23, src34);
2436 ss_256[6] = _mm256_unpackhi_epi8(src45, src56);
2437
2438 int32_t y = h;
2439 do {
2440 y_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
2441 jnt_no_avg_round_store_16x2_avx2(r, offset_no_avg_256, dst, dst_stride);
2442 ss_256[0] = ss_256[1];
2443 ss_256[1] = ss_256[2];
2444 ss_256[2] = ss_256[3];
2445 ss_256[4] = ss_256[5];
2446 ss_256[5] = ss_256[6];
2447 ss_256[6] = ss_256[7];
2448 src_ptr += 2 * src_stride;
2449 dst += 2 * dst_stride;
2450 y -= 2;
2451 } while (y);
2452 } else {
2453 __m256i s_256[8], ss_256[8], tt_256[8], r[4];
2454
2455 assert(!(w % 32));
2456
2457 x = 0;
2458 do {
2459 const uint8_t *s = src_ptr + x;
2460 ConvBufType * d = dst + x;
2461
2462 s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
2463 s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
2464 s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
2465 s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
2466 s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
2467 s_256[5] = _mm256_loadu_si256((__m256i *)(s + 5 * src_stride));
2468 s_256[6] = _mm256_loadu_si256((__m256i *)(s + 6 * src_stride));
2469
2470 ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2471 ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
2472 ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
2473 ss_256[4] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2474 ss_256[5] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
2475 ss_256[6] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
2476
2477 tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2478 tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
2479 tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[6]);
2480 tt_256[4] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2481 tt_256[5] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
2482 tt_256[6] = _mm256_unpackhi_epi8(s_256[5], s_256[6]);
2483
2484 int32_t y = h;
2485 do {
2486 y_convolve_8tap_32x2_avx2(
2487 s, src_stride, coeffs_256, s_256, ss_256, tt_256, r);
2488 jnt_no_avg_round_store_32_avx2(r, offset_no_avg_256, d);
2489 jnt_no_avg_round_store_32_avx2(r + 2, offset_no_avg_256, d + dst_stride);
2490
2491 ss_256[0] = ss_256[1];
2492 ss_256[1] = ss_256[2];
2493 ss_256[2] = ss_256[3];
2494 ss_256[4] = ss_256[5];
2495 ss_256[5] = ss_256[6];
2496 ss_256[6] = ss_256[7];
2497
2498 tt_256[0] = tt_256[1];
2499 tt_256[1] = tt_256[2];
2500 tt_256[2] = tt_256[3];
2501 tt_256[4] = tt_256[5];
2502 tt_256[5] = tt_256[6];
2503 tt_256[6] = tt_256[7];
2504 s += 2 * src_stride;
2505 d += 2 * dst_stride;
2506 y -= 2;
2507 } while (y);
2508
2509 x += 32;
2510 } while (x < w);
2511 }
2512 }
2513 }
2514 }
2515
// Function-pointer type for the jointly-compound (jnt) vertical convolve
// kernels. One implementation exists per supported filter tap count
// (2/4/6/8); svt_av1_jnt_convolve_y_avx2 selects among them via a table
// indexed by the tap count.
typedef void (*JntConvolveYTapFunc)(const uint8_t *const src, const int32_t src_stride,
                                    uint8_t *dst8, const int32_t dst8_stride, const int32_t w,
                                    const int32_t h,
                                    const InterpFilterParams *const filter_params_y,
                                    const int32_t subpel_y_q4,
                                    const ConvolveParams *const conv_params);
2522
/*
 * Jointly-compound vertical convolution, AVX2 entry point.
 *
 * Dispatches to the tap-specific kernel (2/4/6/8 taps) based on the vertical
 * filter. The horizontal filter parameters and subpel_x_q4 are unused in the
 * vertical-only path. Output is written both to conv_params->dst (compound
 * buffer) and/or dst8 depending on conv_params, exactly as the tap kernels
 * implement it.
 */
void svt_av1_jnt_convolve_y_avx2(const uint8_t *src, int32_t src_stride, uint8_t *dst8,
                                 int32_t dst8_stride, int32_t w, int32_t h,
                                 InterpFilterParams *filter_params_x,
                                 InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
                                 const int32_t subpel_y_q4, ConvolveParams *conv_params) {
    // Indexed by tap count; odd tap counts are invalid and map to NULL.
    static const JntConvolveYTapFunc jnt_convolve_y_tap_func_table[MAX_FILTER_TAP + 1] = {
        NULL,
        NULL,
        jnt_convolve_y_2tap_avx2,
        NULL,
        jnt_convolve_y_4tap_avx2,
        NULL,
        jnt_convolve_y_6tap_avx2,
        NULL,
        jnt_convolve_y_8tap_avx2};
    const int32_t tap_y = get_convolve_tap(filter_params_y->filter_ptr);

    (void)filter_params_x;
    (void)subpel_x_q4;

    assert(conv_params->round_0 == 3);
    assert(conv_params->round_1 == COMPOUND_ROUND1_BITS);
    // Guard the table lookup: a bad tap count would otherwise dereference a
    // NULL function pointer silently.
    assert(tap_y >= 2 && tap_y <= MAX_FILTER_TAP && !(tap_y & 1));
    assert(jnt_convolve_y_tap_func_table[tap_y] != NULL);

    jnt_convolve_y_tap_func_table[tap_y](
        src, src_stride, dst8, dst8_stride, w, h, filter_params_y, subpel_y_q4, conv_params);
}
2549
2550 // =============================================================================
2551
jnt_copy_avg_32_avx2(const uint8_t * const src,const __m256i offset_avg_256,const ConvBufType * const dst,uint8_t * const dst8)2552 static INLINE void jnt_copy_avg_32_avx2(const uint8_t *const src, const __m256i offset_avg_256,
2553 const ConvBufType *const dst, uint8_t *const dst8) {
2554 __m256i res[2];
2555 jnt_copy_load_src_32_avx2(src, res);
2556 jnt_copy_avg_round_store_32_avx2(res, offset_avg_256, dst, dst8);
2557 }
2558
jnt_copy_no_avg_32_avx2(const uint8_t * const src,const __m256i offset_no_avg_256,const ConvBufType * const dst)2559 static INLINE void jnt_copy_no_avg_32_avx2(const uint8_t *const src,
2560 const __m256i offset_no_avg_256,
2561 const ConvBufType *const dst) {
2562 __m256i d[2];
2563 jnt_copy_load_src_32_avx2(src, d);
2564 d[0] = _mm256_add_epi16(d[0], offset_no_avg_256);
2565 d[1] = _mm256_add_epi16(d[1], offset_no_avg_256);
2566 _mm256_storeu_si256((__m256i *)(dst + 0 * 16), d[0]);
2567 _mm256_storeu_si256((__m256i *)(dst + 1 * 16), d[1]);
2568 }
2569
svt_av1_jnt_convolve_2d_copy_avx2(const uint8_t * src,int32_t src_stride,uint8_t * dst8,int32_t dst8_stride,int32_t w,int32_t h,InterpFilterParams * filter_params_x,InterpFilterParams * filter_params_y,const int32_t subpel_x_q4,const int32_t subpel_y_q4,ConvolveParams * conv_params)2570 void svt_av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int32_t src_stride, uint8_t *dst8,
2571 int32_t dst8_stride, int32_t w, int32_t h,
2572 InterpFilterParams *filter_params_x,
2573 InterpFilterParams *filter_params_y,
2574 const int32_t subpel_x_q4, const int32_t subpel_y_q4,
2575 ConvolveParams *conv_params) {
2576 const int32_t round_0 = 3;
2577 const int32_t round_1 = COMPOUND_ROUND1_BITS;
2578 const int32_t bits = 2 * FILTER_BITS - round_0 - round_1;
2579 const int32_t bd = 8;
2580 const int32_t offset_bits = bd + bits;
2581 const int32_t round_offset = (1 << offset_bits) + (1 << (offset_bits - 1));
2582 ConvBufType * dst = conv_params->dst;
2583 int32_t dst_stride = conv_params->dst_stride;
2584
2585 (void)filter_params_x;
2586 (void)filter_params_y;
2587 (void)subpel_x_q4;
2588 (void)subpel_y_q4;
2589
2590 if (conv_params->do_average) {
2591 if (conv_params->use_jnt_comp_avg) {
2592 const int32_t factor = conv_params->fwd_offset | (conv_params->bck_offset << 16);
2593 const int32_t offset_comp_avg = round_offset * conv_params->bck_offset +
2594 (1 << (bits + DIST_PRECISION_BITS - 1)) - (round_offset << DIST_PRECISION_BITS);
2595
2596 if (w <= 4) {
2597 const __m128i factor_128 = _mm_set1_epi32(factor);
2598 const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
2599
2600 if (w == 2) {
2601 do {
2602 const __m128i res = jnt_copy_load_src_2x2_sse2(src, src_stride);
2603 jnt_comp_avg_round_store_2x2_kernel_sse2(res,
2604 factor_128,
2605 offset_comp_avg_128,
2606 dst,
2607 dst_stride,
2608 dst8,
2609 dst8_stride);
2610 src += 2 * src_stride;
2611 dst += 2 * dst_stride;
2612 dst8 += 2 * dst8_stride;
2613 h -= 2;
2614 } while (h);
2615 } else {
2616 assert(w == 4);
2617
2618 do {
2619 const __m128i res = jnt_copy_load_src_4x2_sse4_1(src, src_stride);
2620 jnt_comp_avg_round_store_4x2_kernel_sse2(res,
2621 factor_128,
2622 offset_comp_avg_128,
2623 dst,
2624 dst_stride,
2625 dst8,
2626 dst8_stride);
2627 src += 2 * src_stride;
2628 dst += 2 * dst_stride;
2629 dst8 += 2 * dst8_stride;
2630 h -= 2;
2631 } while (h);
2632 }
2633 } else {
2634 const __m256i factor_256 = _mm256_set1_epi32(factor);
2635 const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
2636
2637 if (w == 8) {
2638 do {
2639 const __m256i res = jnt_copy_load_src_8x2_avx2(src, src_stride);
2640 jnt_comp_avg_round_store_8x2_kernel_avx2(res,
2641 factor_256,
2642 offset_comp_avg_256,
2643 dst,
2644 dst_stride,
2645 dst8,
2646 dst8_stride);
2647 src += 2 * src_stride;
2648 dst += 2 * dst_stride;
2649 dst8 += 2 * dst8_stride;
2650 h -= 2;
2651 } while (h);
2652 } else if (w == 16) {
2653 do {
2654 __m256i res[2];
2655 res[0] = jnt_copy_load_src_16_avx2(src);
2656 res[1] = jnt_copy_load_src_16_avx2(src + src_stride);
2657 jnt_comp_avg_round_store_16x2_kernel_avx2(res,
2658 factor_256,
2659 offset_comp_avg_256,
2660 dst,
2661 dst_stride,
2662 dst8,
2663 dst8_stride);
2664 src += 2 * src_stride;
2665 dst += 2 * dst_stride;
2666 dst8 += 2 * dst8_stride;
2667 h -= 2;
2668 } while (h);
2669 } else if (w == 32) {
2670 do {
2671 jnt_copy_comp_avg_32_avx2(src, factor_256, offset_comp_avg_256, dst, dst8);
2672 src += src_stride;
2673 dst += dst_stride;
2674 dst8 += dst8_stride;
2675 } while (--h);
2676 } else if (w == 64) {
2677 do {
2678 jnt_copy_comp_avg_32_avx2(src + 0 * 32,
2679 factor_256,
2680 offset_comp_avg_256,
2681 dst + 0 * 32,
2682 dst8 + 0 * 32);
2683 jnt_copy_comp_avg_32_avx2(src + 1 * 32,
2684 factor_256,
2685 offset_comp_avg_256,
2686 dst + 1 * 32,
2687 dst8 + 1 * 32);
2688 src += src_stride;
2689 dst += dst_stride;
2690 dst8 += dst8_stride;
2691 } while (--h);
2692 } else {
2693 assert(w == 128);
2694
2695 do {
2696 jnt_copy_comp_avg_32_avx2(src + 0 * 32,
2697 factor_256,
2698 offset_comp_avg_256,
2699 dst + 0 * 32,
2700 dst8 + 0 * 32);
2701 jnt_copy_comp_avg_32_avx2(src + 1 * 32,
2702 factor_256,
2703 offset_comp_avg_256,
2704 dst + 1 * 32,
2705 dst8 + 1 * 32);
2706 jnt_copy_comp_avg_32_avx2(src + 2 * 32,
2707 factor_256,
2708 offset_comp_avg_256,
2709 dst + 2 * 32,
2710 dst8 + 2 * 32);
2711 jnt_copy_comp_avg_32_avx2(src + 3 * 32,
2712 factor_256,
2713 offset_comp_avg_256,
2714 dst + 3 * 32,
2715 dst8 + 3 * 32);
2716 src += src_stride;
2717 dst += dst_stride;
2718 dst8 += dst8_stride;
2719 } while (--h);
2720 }
2721 }
2722 } else {
2723 const int16_t offset_avg = (1 << bits) - round_offset;
2724
2725 if (w <= 4) {
2726 const __m128i offset_avg_128 = _mm_set1_epi16(offset_avg);
2727
2728 if (w == 2) {
2729 do {
2730 const __m128i res = jnt_copy_load_src_2x2_sse2(src, src_stride);
2731 jnt_copy_avg_round_store_2x2_sse2(
2732 res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
2733 src += 2 * src_stride;
2734 dst += 2 * dst_stride;
2735 dst8 += 2 * dst8_stride;
2736 h -= 2;
2737 } while (h);
2738 } else {
2739 assert(w == 4);
2740
2741 do {
2742 const __m128i res = jnt_copy_load_src_4x2_sse4_1(src, src_stride);
2743 jnt_copy_avg_round_store_4x2_sse2(
2744 res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
2745 src += 2 * src_stride;
2746 dst += 2 * dst_stride;
2747 dst8 += 2 * dst8_stride;
2748 h -= 2;
2749 } while (h);
2750 }
2751 } else {
2752 const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);
2753
2754 if (w == 8) {
2755 do {
2756 const __m256i res = jnt_copy_load_src_8x2_avx2(src, src_stride);
2757 jnt_copy_avg_round_store_8x2_avx2(
2758 res, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
2759 src += 2 * src_stride;
2760 dst += 2 * dst_stride;
2761 dst8 += 2 * dst8_stride;
2762 h -= 2;
2763 } while (h);
2764 } else if (w == 16) {
2765 do {
2766 __m256i res[2];
2767 res[0] = jnt_copy_load_src_16_avx2(src);
2768 res[1] = jnt_copy_load_src_16_avx2(src + src_stride);
2769 jnt_copy_avg_round_store_16x2_avx2(
2770 res, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
2771 src += 2 * src_stride;
2772 dst += 2 * dst_stride;
2773 dst8 += 2 * dst8_stride;
2774 h -= 2;
2775 } while (h);
2776 } else if (w == 32) {
2777 do {
2778 jnt_copy_avg_32_avx2(src, offset_avg_256, dst, dst8);
2779 src += src_stride;
2780 dst += dst_stride;
2781 dst8 += dst8_stride;
2782 } while (--h);
2783 } else if (w == 64) {
2784 do {
2785 jnt_copy_avg_32_avx2(
2786 src + 0 * 32, offset_avg_256, dst + 0 * 32, dst8 + 0 * 32);
2787 jnt_copy_avg_32_avx2(
2788 src + 1 * 32, offset_avg_256, dst + 1 * 32, dst8 + 1 * 32);
2789 src += src_stride;
2790 dst += dst_stride;
2791 dst8 += dst8_stride;
2792 } while (--h);
2793 } else {
2794 assert(w == 128);
2795
2796 do {
2797 jnt_copy_avg_32_avx2(
2798 src + 0 * 32, offset_avg_256, dst + 0 * 32, dst8 + 0 * 32);
2799 jnt_copy_avg_32_avx2(
2800 src + 1 * 32, offset_avg_256, dst + 1 * 32, dst8 + 1 * 32);
2801 jnt_copy_avg_32_avx2(
2802 src + 2 * 32, offset_avg_256, dst + 2 * 32, dst8 + 2 * 32);
2803 jnt_copy_avg_32_avx2(
2804 src + 3 * 32, offset_avg_256, dst + 3 * 32, dst8 + 3 * 32);
2805 src += src_stride;
2806 dst += dst_stride;
2807 dst8 += dst8_stride;
2808 } while (--h);
2809 }
2810 }
2811 }
2812 } else {
2813 const int32_t offset_no_avg = (1 << offset_bits) + (1 << (offset_bits - 1));
2814
2815 if (w <= 4) {
2816 const __m128i offset_no_avg_128 = _mm_set1_epi16(offset_no_avg);
2817
2818 if (w == 2) {
2819 do {
2820 const __m128i res = jnt_copy_load_src_2x2_sse2(src, src_stride);
2821 const __m128i r = _mm_add_epi16(res, offset_no_avg_128);
2822 *(uint32_t *)dst = _mm_cvtsi128_si32(r);
2823 *(uint32_t *)(dst + dst_stride) = _mm_extract_epi32(r, 1);
2824 src += 2 * src_stride;
2825 dst += 2 * dst_stride;
2826 h -= 2;
2827 } while (h);
2828 } else {
2829 assert(w == 4);
2830
2831 do {
2832 const __m128i res = jnt_copy_load_src_4x2_sse4_1(src, src_stride);
2833 const __m128i r = _mm_add_epi16(res, offset_no_avg_128);
2834 store_u16_4x2_sse2(r, dst, dst_stride);
2835 src += 2 * src_stride;
2836 dst += 2 * dst_stride;
2837 h -= 2;
2838 } while (h);
2839 }
2840 } else {
2841 const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);
2842
2843 if (w == 8) {
2844 do {
2845 const __m256i res = jnt_copy_load_src_8x2_avx2(src, src_stride);
2846 const __m256i r = _mm256_add_epi16(res, offset_no_avg_256);
2847 storeu_u16_8x2_avx2(r, dst, dst_stride);
2848 src += 2 * src_stride;
2849 dst += 2 * dst_stride;
2850 h -= 2;
2851 } while (h);
2852 } else if (w == 16) {
2853 do {
2854 __m256i d[2];
2855 d[0] = jnt_copy_load_src_16_avx2(src);
2856 d[1] = jnt_copy_load_src_16_avx2(src + src_stride);
2857 d[0] = _mm256_add_epi16(d[0], offset_no_avg_256);
2858 d[1] = _mm256_add_epi16(d[1], offset_no_avg_256);
2859 _mm256_storeu_si256((__m256i *)(dst + 0 * dst_stride), d[0]);
2860 _mm256_storeu_si256((__m256i *)(dst + 1 * dst_stride), d[1]);
2861 src += 2 * src_stride;
2862 dst += 2 * dst_stride;
2863 h -= 2;
2864 } while (h);
2865 } else if (w == 32) {
2866 do {
2867 jnt_copy_no_avg_32_avx2(src, offset_no_avg_256, dst);
2868 src += src_stride;
2869 dst += dst_stride;
2870 } while (--h);
2871 } else if (w == 64) {
2872 do {
2873 jnt_copy_no_avg_32_avx2(src + 0 * 32, offset_no_avg_256, dst + 0 * 32);
2874 jnt_copy_no_avg_32_avx2(src + 1 * 32, offset_no_avg_256, dst + 1 * 32);
2875 src += src_stride;
2876 dst += dst_stride;
2877 } while (--h);
2878 } else {
2879 assert(w == 128);
2880
2881 do {
2882 jnt_copy_no_avg_32_avx2(src + 0 * 32, offset_no_avg_256, dst + 0 * 32);
2883 jnt_copy_no_avg_32_avx2(src + 1 * 32, offset_no_avg_256, dst + 1 * 32);
2884 jnt_copy_no_avg_32_avx2(src + 2 * 32, offset_no_avg_256, dst + 2 * 32);
2885 jnt_copy_no_avg_32_avx2(src + 3 * 32, offset_no_avg_256, dst + 3 * 32);
2886 src += src_stride;
2887 dst += dst_stride;
2888 } while (--h);
2889 }
2890 }
2891 }
2892 }
2893
2894 // =============================================================================
2895
jnt_x_comp_avg_2tap_32_avx2(const uint8_t * const src,const __m256i * const coeffs,const __m256i factor,const __m256i offset,ConvBufType * const dst,uint8_t * const dst8)2896 SIMD_INLINE void jnt_x_comp_avg_2tap_32_avx2(const uint8_t *const src, const __m256i *const coeffs,
2897 const __m256i factor, const __m256i offset,
2898 ConvBufType *const dst, uint8_t *const dst8) {
2899 __m256i r[2];
2900
2901 x_convolve_2tap_32_avx2(src, coeffs, r);
2902 jnt_comp_avg_round_store_32_avx2(r, factor, offset, dst, dst8);
2903 }
2904
jnt_x_avg_2tap_32_avx2(const uint8_t * const src,const __m256i * const coeffs,const __m256i offset,const ConvBufType * const dst,uint8_t * const dst8)2905 static INLINE void jnt_x_avg_2tap_32_avx2(const uint8_t *const src, const __m256i *const coeffs,
2906 const __m256i offset, const ConvBufType *const dst,
2907 uint8_t *const dst8) {
2908 __m256i r[2];
2909
2910 x_convolve_2tap_32_avx2(src, coeffs, r);
2911 jnt_avg_round_store_32_avx2(r, offset, dst, dst8);
2912 }
2913
jnt_x_no_avg_2tap_32_avx2(const uint8_t * const src,const __m256i * const coeffs,const __m256i offset,ConvBufType * const dst)2914 static INLINE void jnt_x_no_avg_2tap_32_avx2(const uint8_t *const src, const __m256i *const coeffs,
2915 const __m256i offset, ConvBufType *const dst) {
2916 __m256i r[2];
2917
2918 x_convolve_2tap_32_avx2(src, coeffs, r);
2919 jnt_no_avg_round_store_32_avx2(r, offset, dst);
2920 }
2921
jnt_x_comp_avg_6tap_16x2_avx2(const uint8_t * const src,const int32_t src_stride,const __m256i coeffs[3],const __m256i filt[3],const __m256i factor,const __m256i offset,ConvBufType * const dst,const int32_t dst_stride,uint8_t * const dst8,const int32_t dst8_stride)2922 SIMD_INLINE void jnt_x_comp_avg_6tap_16x2_avx2(const uint8_t *const src, const int32_t src_stride,
2923 const __m256i coeffs[3], const __m256i filt[3],
2924 const __m256i factor, const __m256i offset,
2925 ConvBufType *const dst, const int32_t dst_stride,
2926 uint8_t *const dst8, const int32_t dst8_stride) {
2927 __m256i r[2];
2928
2929 x_convolve_6tap_16x2_avx2(src, src_stride, coeffs, filt, r);
2930 jnt_comp_avg_round_store_16x2_avx2(r, factor, offset, dst, dst_stride, dst8, dst8_stride);
2931 }
2932
jnt_x_avg_6tap_16x2_avx2(const uint8_t * const src,const int32_t src_stride,const __m256i coeffs[3],const __m256i filt[3],const __m256i offset,ConvBufType * const dst,const int32_t dst_stride,uint8_t * const dst8,const int32_t dst8_stride)2933 SIMD_INLINE void jnt_x_avg_6tap_16x2_avx2(const uint8_t *const src, const int32_t src_stride,
2934 const __m256i coeffs[3], const __m256i filt[3],
2935 const __m256i offset, ConvBufType *const dst,
2936 const int32_t dst_stride, uint8_t *const dst8,
2937 const int32_t dst8_stride) {
2938 __m256i r[2];
2939
2940 x_convolve_6tap_16x2_avx2(src, src_stride, coeffs, filt, r);
2941 jnt_avg_round_store_16x2_avx2(r, offset, dst, dst_stride, dst8, dst8_stride);
2942 }
2943
jnt_x_no_avg_6tap_16x2_avx2(const uint8_t * const src,const int32_t src_stride,const __m256i coeffs[3],const __m256i filt[3],const __m256i offset,ConvBufType * const dst,const int32_t dst_stride)2944 SIMD_INLINE void jnt_x_no_avg_6tap_16x2_avx2(const uint8_t *const src, const int32_t src_stride,
2945 const __m256i coeffs[3], const __m256i filt[3],
2946 const __m256i offset, ConvBufType *const dst,
2947 const int32_t dst_stride) {
2948 __m256i r[2];
2949
2950 x_convolve_6tap_16x2_avx2(src, src_stride, coeffs, filt, r);
2951 jnt_no_avg_round_store_16x2_avx2(r, offset, dst, dst_stride);
2952 }
2953
jnt_x_comp_avg_6tap_32_avx2(const uint8_t * const src,const __m256i coeffs[3],const __m256i filt[3],const __m256i factor,const __m256i offset,ConvBufType * const dst,uint8_t * const dst8)2954 SIMD_INLINE void jnt_x_comp_avg_6tap_32_avx2(const uint8_t *const src, const __m256i coeffs[3],
2955 const __m256i filt[3], const __m256i factor,
2956 const __m256i offset, ConvBufType *const dst,
2957 uint8_t *const dst8) {
2958 __m256i r[2];
2959
2960 x_convolve_6tap_32_avx2(src, coeffs, filt, r);
2961 jnt_comp_avg_round_store_32_avx2(r, factor, offset, dst, dst8);
2962 }
2963
jnt_x_avg_6tap_32_avx2(const uint8_t * const src,const __m256i coeffs[3],const __m256i filt[3],const __m256i offset,ConvBufType * const dst,uint8_t * const dst8)2964 SIMD_INLINE void jnt_x_avg_6tap_32_avx2(const uint8_t *const src, const __m256i coeffs[3],
2965 const __m256i filt[3], const __m256i offset,
2966 ConvBufType *const dst, uint8_t *const dst8) {
2967 __m256i r[2];
2968
2969 x_convolve_6tap_32_avx2(src, coeffs, filt, r);
2970 jnt_avg_round_store_32_avx2(r, offset, dst, dst8);
2971 }
2972
jnt_x_no_avg_6tap_32_avx2(const uint8_t * const src,const __m256i coeffs[3],const __m256i filt[3],const __m256i offset,ConvBufType * const dst)2973 SIMD_INLINE void jnt_x_no_avg_6tap_32_avx2(const uint8_t *const src, const __m256i coeffs[3],
2974 const __m256i filt[3], const __m256i offset,
2975 ConvBufType *const dst) {
2976 __m256i r[2];
2977
2978 x_convolve_6tap_32_avx2(src, coeffs, filt, r);
2979 jnt_no_avg_round_store_32_avx2(r, offset, dst);
2980 }
2981
jnt_x_comp_avg_8tap_16x2_avx2(const uint8_t * const src,const int32_t src_stride,const __m256i coeffs[4],const __m256i filt[4],const __m256i factor,const __m256i offset,ConvBufType * const dst,const int32_t dst_stride,uint8_t * const dst8,const int32_t dst8_stride)2982 static INLINE void jnt_x_comp_avg_8tap_16x2_avx2(const uint8_t *const src, const int32_t src_stride,
2983 const __m256i coeffs[4], const __m256i filt[4],
2984 const __m256i factor, const __m256i offset,
2985 ConvBufType *const dst, const int32_t dst_stride,
2986 uint8_t *const dst8, const int32_t dst8_stride) {
2987 __m256i r[2];
2988
2989 x_convolve_8tap_16x2_avx2(src, src_stride, coeffs, filt, r);
2990 jnt_comp_avg_round_store_16x2_avx2(r, factor, offset, dst, dst_stride, dst8, dst8_stride);
2991 }
2992
jnt_x_comp_avg_8tap_32_avx2(const uint8_t * const src,const __m256i coeffs[4],const __m256i filt[4],const __m256i factor,const __m256i offset,ConvBufType * const dst,uint8_t * const dst8)2993 SIMD_INLINE void jnt_x_comp_avg_8tap_32_avx2(const uint8_t *const src, const __m256i coeffs[4],
2994 const __m256i filt[4], const __m256i factor,
2995 const __m256i offset, ConvBufType *const dst,
2996 uint8_t *const dst8) {
2997 __m256i r[2];
2998
2999 x_convolve_8tap_32_avx2(src, coeffs, filt, r);
3000 jnt_comp_avg_round_store_32_avx2(r, factor, offset, dst, dst8);
3001 }
3002
jnt_x_avg_8tap_16x2_avx2(const uint8_t * const src,const int32_t src_stride,const __m256i coeffs[4],const __m256i filt[4],const __m256i offset,ConvBufType * const dst,const int32_t dst_stride,uint8_t * const dst8,const int32_t dst8_stride)3003 SIMD_INLINE void jnt_x_avg_8tap_16x2_avx2(const uint8_t *const src, const int32_t src_stride,
3004 const __m256i coeffs[4], const __m256i filt[4],
3005 const __m256i offset, ConvBufType *const dst,
3006 const int32_t dst_stride, uint8_t *const dst8,
3007 const int32_t dst8_stride) {
3008 __m256i r[2];
3009
3010 x_convolve_8tap_16x2_avx2(src, src_stride, coeffs, filt, r);
3011 jnt_avg_round_store_16x2_avx2(r, offset, dst, dst_stride, dst8, dst8_stride);
3012 }
3013
jnt_x_avg_8tap_32_avx2(const uint8_t * const src,const __m256i coeffs[4],const __m256i filt[4],const __m256i offset,ConvBufType * const dst,uint8_t * const dst8)3014 SIMD_INLINE void jnt_x_avg_8tap_32_avx2(const uint8_t *const src, const __m256i coeffs[4],
3015 const __m256i filt[4], const __m256i offset,
3016 ConvBufType *const dst, uint8_t *const dst8) {
3017 __m256i r[2];
3018
3019 x_convolve_8tap_32_avx2(src, coeffs, filt, r);
3020 jnt_avg_round_store_32_avx2(r, offset, dst, dst8);
3021 }
3022
jnt_x_no_avg_8tap_16x2_avx2(const uint8_t * const src,const int32_t src_stride,const __m256i coeffs[4],const __m256i filt[4],const __m256i offset,ConvBufType * const dst,const int32_t dst_stride)3023 static INLINE void jnt_x_no_avg_8tap_16x2_avx2(const uint8_t *const src, const int32_t src_stride,
3024 const __m256i coeffs[4], const __m256i filt[4],
3025 const __m256i offset, ConvBufType *const dst,
3026 const int32_t dst_stride) {
3027 __m256i r[2];
3028
3029 x_convolve_8tap_16x2_avx2(src, src_stride, coeffs, filt, r);
3030 jnt_no_avg_round_store_16x2_avx2(r, offset, dst, dst_stride);
3031 }
3032
jnt_x_no_avg_8tap_32_avx2(const uint8_t * const src,const __m256i coeffs[4],const __m256i filt[4],const __m256i offset,ConvBufType * const dst)3033 SIMD_INLINE void jnt_x_no_avg_8tap_32_avx2(const uint8_t *const src, const __m256i coeffs[4],
3034 const __m256i filt[4], const __m256i offset,
3035 ConvBufType *const dst) {
3036 __m256i r[2];
3037
3038 x_convolve_8tap_32_avx2(src, coeffs, filt, r);
3039 jnt_no_avg_round_store_32_avx2(r, offset, dst);
3040 }
3041
// Jointly-compound (dist_wtd) horizontal 2-tap sub-pel convolution, 8-bit
// input, AVX2/SSE paths. Dispatches on three modes and on block width w
// (2/4 use 128-bit code, 8..128 use 256-bit code):
//   - do_average && use_jnt_comp_avg: weighted blend of the new prediction
//     with the 16-bit compound buffer conv_params->dst, 8-bit result to dst8.
//   - do_average only: equal-weight blend with conv_params->dst, result to
//     dst8.
//   - neither: write the rounded 16-bit intermediate to conv_params->dst.
// The pointer/stride bookkeeping differs per branch: 2x/4x/8x/16x paths
// process two rows per iteration (y -= 2), wider paths one row (--y).
static void jnt_convolve_x_2tap_avx2(const uint8_t *const src, const int32_t src_stride,
                                     uint8_t *dst8, const int32_t dst8_stride, const int32_t w,
                                     const int32_t h,
                                     const InterpFilterParams *const filter_params_x,
                                     const int32_t subpel_x_q4,
                                     const ConvolveParams *const conv_params) {
    const uint8_t *src_ptr    = src;
    const int32_t  dst_stride = conv_params->dst_stride;
    // Rounding parameters for the 8-bit compound pipeline.
    const int32_t round_0      = 3;
    const int32_t round_1      = COMPOUND_ROUND1_BITS;
    const int32_t bits         = FILTER_BITS - round_1;
    const int32_t bd           = 8;
    const int32_t round_bits   = 2 * FILTER_BITS - round_0 - round_1;
    const int32_t offset_bits  = bd + round_bits;
    // Base offset added to the intermediate so it stays non-negative.
    const int32_t round_offset = (1 << offset_bits) + (1 << (offset_bits - 1));
    ConvBufType * dst          = conv_params->dst;
    int32_t       y            = h;
    __m128i       coeffs_128[4];
    __m256i       coeffs_256[4];

    if (conv_params->do_average) {
        if (conv_params->use_jnt_comp_avg) {
            // Pack both distance weights into one 32-bit lane: fwd_offset in
            // the low 16 bits, bck_offset in the high 16 bits (for madd use).
            const int32_t factor          = conv_params->fwd_offset | (conv_params->bck_offset << 16);
            // Pre-combined rounding/offset term for the weighted average.
            const int32_t offset_comp_avg = round_offset * conv_params->bck_offset +
                (1 << (round_bits + DIST_PRECISION_BITS - 1)) -
                (round_offset << DIST_PRECISION_BITS);

            if (w <= 4) {
                const __m128i factor_128          = _mm_set1_epi32(factor);
                const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);

                prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);

                if (w == 2) {
                    do {
                        const __m128i res = x_convolve_2tap_2x2_sse4_1(
                            src_ptr, src_stride, coeffs_128);
                        jnt_comp_avg_round_store_2x2_sse2(res,
                                                          factor_128,
                                                          offset_comp_avg_128,
                                                          dst,
                                                          dst_stride,
                                                          dst8,
                                                          dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    assert(w == 4);

                    do {
                        const __m128i res = x_convolve_2tap_4x2_ssse3(
                            src_ptr, src_stride, coeffs_128);
                        jnt_comp_avg_round_store_4x2_sse2(res,
                                                          factor_128,
                                                          offset_comp_avg_128,
                                                          dst,
                                                          dst_stride,
                                                          dst8,
                                                          dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                const __m256i factor_256          = _mm256_set1_epi32(factor);
                const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
                __m256i       r[2]; // used by the w == 16 path below

                prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);

                if (w == 8) {
                    do {
                        const __m256i res = x_convolve_2tap_8x2_avx2(
                            src_ptr, src_stride, coeffs_256);
                        jnt_comp_avg_round_store_8x2_avx2(res,
                                                          factor_256,
                                                          offset_comp_avg_256,
                                                          dst,
                                                          dst_stride,
                                                          dst8,
                                                          dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else if (w == 16) {
                    do {
                        x_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, r);
                        jnt_comp_avg_round_store_16x2_avx2(
                            r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else if (w == 32) {
                    do {
                        jnt_x_comp_avg_2tap_32_avx2(
                            src_ptr, coeffs_256, factor_256, offset_comp_avg_256, dst, dst8);
                        src_ptr += src_stride;
                        dst += dst_stride;
                        dst8 += dst8_stride;
                    } while (--y);
                } else if (w == 64) {
                    // One row per iteration, in two 32-pixel chunks.
                    do {
                        jnt_x_comp_avg_2tap_32_avx2(
                            src_ptr, coeffs_256, factor_256, offset_comp_avg_256, dst, dst8);
                        jnt_x_comp_avg_2tap_32_avx2(src_ptr + 32,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    dst + 32,
                                                    dst8 + 32);
                        src_ptr += src_stride;
                        dst += dst_stride;
                        dst8 += dst8_stride;
                    } while (--y);
                } else {
                    assert(w == 128);

                    // One row per iteration, in four 32-pixel chunks.
                    do {
                        jnt_x_comp_avg_2tap_32_avx2(
                            src_ptr, coeffs_256, factor_256, offset_comp_avg_256, dst, dst8);
                        jnt_x_comp_avg_2tap_32_avx2(src_ptr + 1 * 32,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    dst + 1 * 32,
                                                    dst8 + 1 * 32);
                        jnt_x_comp_avg_2tap_32_avx2(src_ptr + 2 * 32,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    dst + 2 * 32,
                                                    dst8 + 2 * 32);
                        jnt_x_comp_avg_2tap_32_avx2(src_ptr + 3 * 32,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    dst + 3 * 32,
                                                    dst8 + 3 * 32);
                        src_ptr += src_stride;
                        dst += dst_stride;
                        dst8 += dst8_stride;
                    } while (--y);
                }
            }
        } else {
            // Equal-weight average: pre-folded rounding constant for the
            // (prediction + dst) >> 1 style blend done in the store helpers.
            const int16_t offset_avg = (1 << (FILTER_BITS - 1)) + (1 << (round_0 - bits - 2)) -
                (round_offset << (round_0 - bits - 1));

            if (w <= 4) {
                const __m128i offset_avg_128 = _mm_set1_epi16(offset_avg);

                prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);

                if (w == 2) {
                    do {
                        const __m128i res = x_convolve_2tap_2x2_sse4_1(
                            src_ptr, src_stride, coeffs_128);
                        jnt_avg_round_store_2x2_sse2(
                            res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    assert(w == 4);

                    do {
                        const __m128i res = x_convolve_2tap_4x2_ssse3(
                            src_ptr, src_stride, coeffs_128);
                        jnt_avg_round_store_4x2_sse2(
                            res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);
                __m256i       r[2]; // used by the w == 16 path below

                prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);

                if (w == 8) {
                    do {
                        const __m256i res = x_convolve_2tap_8x2_avx2(
                            src_ptr, src_stride, coeffs_256);
                        jnt_avg_round_store_8x2_avx2(
                            res, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else if (w == 16) {
                    do {
                        x_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, r);
                        jnt_avg_round_store_16x2_avx2(
                            r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else if (w == 32) {
                    do {
                        jnt_x_avg_2tap_32_avx2(src_ptr, coeffs_256, offset_avg_256, dst, dst8);
                        src_ptr += src_stride;
                        dst += dst_stride;
                        dst8 += dst8_stride;
                    } while (--y);
                } else if (w == 64) {
                    do {
                        jnt_x_avg_2tap_32_avx2(src_ptr, coeffs_256, offset_avg_256, dst, dst8);
                        jnt_x_avg_2tap_32_avx2(
                            src_ptr + 32, coeffs_256, offset_avg_256, dst + 32, dst8 + 32);
                        src_ptr += src_stride;
                        dst += dst_stride;
                        dst8 += dst8_stride;
                    } while (--y);
                } else {
                    assert(w == 128);

                    do {
                        jnt_x_avg_2tap_32_avx2(src_ptr, coeffs_256, offset_avg_256, dst, dst8);
                        jnt_x_avg_2tap_32_avx2(src_ptr + 1 * 32,
                                               coeffs_256,
                                               offset_avg_256,
                                               dst + 1 * 32,
                                               dst8 + 1 * 32);
                        jnt_x_avg_2tap_32_avx2(src_ptr + 2 * 32,
                                               coeffs_256,
                                               offset_avg_256,
                                               dst + 2 * 32,
                                               dst8 + 2 * 32);
                        jnt_x_avg_2tap_32_avx2(src_ptr + 3 * 32,
                                               coeffs_256,
                                               offset_avg_256,
                                               dst + 3 * 32,
                                               dst8 + 3 * 32);
                        src_ptr += src_stride;
                        dst += dst_stride;
                        dst8 += dst8_stride;
                    } while (--y);
                }
            }
        }
    } else {
        // First pass only: store the offset 16-bit intermediate; dst8 is not
        // written in this mode.
        const int16_t offset_no_avg = (round_offset << (round_0 - bits - 1)) +
            (1 << (round_0 - bits - 2));

        if (w <= 4) {
            const __m128i offset_no_avg_128 = _mm_set1_epi16(offset_no_avg);

            prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);

            if (w == 2) {
                do {
                    const __m128i res = x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, coeffs_128);
                    jnt_no_avg_round_store_2x2_sse2(res, offset_no_avg_128, dst, dst_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            } else {
                assert(w == 4);

                do {
                    const __m128i res = x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
                    jnt_no_avg_round_store_4x2_sse2(res, offset_no_avg_128, dst, dst_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        } else {
            const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);
            __m256i       r[2]; // used by the w == 16 path below

            prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);

            if (w == 8) {
                do {
                    const __m256i res = x_convolve_2tap_8x2_avx2(src_ptr, src_stride, coeffs_256);
                    jnt_no_avg_round_store_8x2_avx2(res, offset_no_avg_256, dst, dst_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            } else if (w == 16) {
                do {
                    x_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, r);
                    jnt_no_avg_round_store_16x2_avx2(r, offset_no_avg_256, dst, dst_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            } else if (w == 32) {
                do {
                    jnt_x_no_avg_2tap_32_avx2(src_ptr, coeffs_256, offset_no_avg_256, dst);
                    src_ptr += src_stride;
                    dst += dst_stride;
                } while (--y);
            } else if (w == 64) {
                do {
                    jnt_x_no_avg_2tap_32_avx2(src_ptr, coeffs_256, offset_no_avg_256, dst);
                    jnt_x_no_avg_2tap_32_avx2(
                        src_ptr + 32, coeffs_256, offset_no_avg_256, dst + 32);
                    src_ptr += src_stride;
                    dst += dst_stride;
                } while (--y);
            } else {
                assert(w == 128);

                do {
                    jnt_x_no_avg_2tap_32_avx2(src_ptr, coeffs_256, offset_no_avg_256, dst);
                    jnt_x_no_avg_2tap_32_avx2(
                        src_ptr + 1 * 32, coeffs_256, offset_no_avg_256, dst + 1 * 32);
                    jnt_x_no_avg_2tap_32_avx2(
                        src_ptr + 2 * 32, coeffs_256, offset_no_avg_256, dst + 2 * 32);
                    jnt_x_no_avg_2tap_32_avx2(
                        src_ptr + 3 * 32, coeffs_256, offset_no_avg_256, dst + 3 * 32);
                    src_ptr += src_stride;
                    dst += dst_stride;
                } while (--y);
            }
        }
    }
}
3381
// Jointly-compound (dist_wtd) horizontal 4-tap sub-pel convolution, 8-bit
// input, SSSE3. Only handles the small widths w == 2 and w == 4 (two rows per
// iteration). Same three-mode dispatch as the 2-tap version:
//   - do_average && use_jnt_comp_avg: weighted blend with conv_params->dst,
//     8-bit result to dst8;
//   - do_average only: equal-weight blend with conv_params->dst, result to
//     dst8;
//   - neither: rounded 16-bit intermediate written to conv_params->dst only.
void jnt_convolve_x_4tap_ssse3(const uint8_t *const src, const int32_t src_stride, uint8_t *dst8,
                               const int32_t dst8_stride, const int32_t w, const int32_t h,
                               const InterpFilterParams *const filter_params_x,
                               const int32_t subpel_x_q4, const ConvolveParams *const conv_params) {
    // Back the source pointer up one pixel so the 4-tap window covers the
    // samples to the left of each output position.
    const uint8_t *src_ptr    = src - 1;
    const int32_t  dst_stride = conv_params->dst_stride;
    // Rounding parameters for the 8-bit compound pipeline.
    const int32_t round_0      = 3;
    const int32_t round_1      = COMPOUND_ROUND1_BITS;
    const int32_t bits         = FILTER_BITS - round_1;
    const int32_t bd           = 8;
    const int32_t round_bits   = 2 * FILTER_BITS - round_0 - round_1;
    const int32_t offset_bits  = bd + round_bits;
    // Base offset added to the intermediate so it stays non-negative.
    const int32_t round_offset = (1 << offset_bits) + (1 << (offset_bits - 1));
    ConvBufType * dst          = conv_params->dst;
    int32_t       y            = h;
    __m128i       coeffs_128[4];

    prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);

    if (conv_params->do_average) {
        if (conv_params->use_jnt_comp_avg) {
            // fwd_offset in the low 16 bits, bck_offset in the high 16 bits.
            const int32_t factor          = conv_params->fwd_offset | (conv_params->bck_offset << 16);
            // Pre-combined rounding/offset term for the weighted average.
            const int32_t offset_comp_avg = round_offset * conv_params->bck_offset +
                (1 << (round_bits + DIST_PRECISION_BITS - 1)) -
                (round_offset << DIST_PRECISION_BITS);
            const __m128i factor_128          = _mm_set1_epi32(factor);
            const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);

            if (w == 2) {
                do {
                    const __m128i res = x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
                    jnt_comp_avg_round_store_2x2_sse2(
                        res, factor_128, offset_comp_avg_128, dst, dst_stride, dst8, dst8_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    dst8 += 2 * dst8_stride;
                    y -= 2;
                } while (y);
            } else {
                assert(w == 4);

                do {
                    const __m128i res = x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
                    jnt_comp_avg_round_store_4x2_sse2(
                        res, factor_128, offset_comp_avg_128, dst, dst_stride, dst8, dst8_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    dst8 += 2 * dst8_stride;
                    y -= 2;
                } while (y);
            }
        } else {
            // Equal-weight average: pre-folded rounding constant for the
            // blend done in the store helpers.
            const int16_t offset_avg = (1 << (FILTER_BITS - 1)) + (1 << (round_0 - bits - 2)) -
                (round_offset << (round_0 - bits - 1));
            const __m128i offset_avg_128 = _mm_set1_epi16(offset_avg);

            if (w == 2) {
                do {
                    const __m128i res = x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
                    jnt_avg_round_store_2x2_sse2(
                        res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    dst8 += 2 * dst8_stride;
                    y -= 2;
                } while (y);
            } else {
                assert(w == 4);

                do {
                    const __m128i res = x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
                    jnt_avg_round_store_4x2_sse2(
                        res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    dst8 += 2 * dst8_stride;
                    y -= 2;
                } while (y);
            }
        }
    } else {
        // First pass only: store the offset 16-bit intermediate; dst8 is not
        // written in this mode.
        const int16_t offset_no_avg = (round_offset << (round_0 - bits - 1)) +
            (1 << (round_0 - bits - 2));
        const __m128i offset_no_avg_128 = _mm_set1_epi16(offset_no_avg);

        if (w == 2) {
            do {
                const __m128i res = x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
                jnt_no_avg_round_store_2x2_sse2(res, offset_no_avg_128, dst, dst_stride);
                src_ptr += 2 * src_stride;
                dst += 2 * dst_stride;
                y -= 2;
            } while (y);
        } else {
            assert(w == 4);

            do {
                const __m128i res = x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
                jnt_no_avg_round_store_4x2_sse2(res, offset_no_avg_128, dst, dst_stride);
                src_ptr += 2 * src_stride;
                dst += 2 * dst_stride;
                y -= 2;
            } while (y);
        }
    }
}
3488
jnt_convolve_x_6tap_avx2(const uint8_t * const src,const int32_t src_stride,uint8_t * dst8,const int32_t dst8_stride,const int32_t w,const int32_t h,const InterpFilterParams * const filter_params_x,const int32_t subpel_x_q4,const ConvolveParams * const conv_params)3489 static void jnt_convolve_x_6tap_avx2(const uint8_t *const src, const int32_t src_stride,
3490 uint8_t *dst8, const int32_t dst8_stride, const int32_t w,
3491 const int32_t h,
3492 const InterpFilterParams *const filter_params_x,
3493 const int32_t subpel_x_q4,
3494 const ConvolveParams *const conv_params) {
3495 const uint8_t *src_ptr = src - 2;
3496 const int32_t dst_stride = conv_params->dst_stride;
3497 const int32_t round_0 = 3;
3498 const int32_t round_1 = COMPOUND_ROUND1_BITS;
3499 const int32_t bits = FILTER_BITS - round_1;
3500 const int32_t bd = 8;
3501 const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1;
3502 const int32_t offset_bits = bd + round_bits;
3503 const int32_t round_offset = (1 << offset_bits) + (1 << (offset_bits - 1));
3504 ConvBufType * dst = conv_params->dst;
3505 int32_t y = h;
3506 __m256i coeffs_256[3], filt_256[3];
3507
3508 filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx);
3509 filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx);
3510 filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx);
3511
3512 prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3513
3514 if (conv_params->do_average) {
3515 if (conv_params->use_jnt_comp_avg) {
3516 const int32_t factor = conv_params->fwd_offset | (conv_params->bck_offset << 16);
3517 const int32_t offset_comp_avg = round_offset * conv_params->bck_offset +
3518 (1 << (round_bits + DIST_PRECISION_BITS - 1)) -
3519 (round_offset << DIST_PRECISION_BITS);
3520 const __m256i factor_256 = _mm256_set1_epi32(factor);
3521 const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
3522
3523 if (w == 8) {
3524 do {
3525 const __m256i res = x_convolve_6tap_8x2_avx2(
3526 src_ptr, src_stride, coeffs_256, filt_256);
3527 jnt_comp_avg_round_store_8x2_avx2(
3528 res, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
3529 src_ptr += 2 * src_stride;
3530 dst += 2 * dst_stride;
3531 dst8 += 2 * dst8_stride;
3532 y -= 2;
3533 } while (y);
3534 } else if (w == 16) {
3535 do {
3536 jnt_x_comp_avg_6tap_16x2_avx2(src_ptr,
3537 src_stride,
3538 coeffs_256,
3539 filt_256,
3540 factor_256,
3541 offset_comp_avg_256,
3542 dst,
3543 dst_stride,
3544 dst8,
3545 dst8_stride);
3546 src_ptr += 2 * src_stride;
3547 dst += 2 * dst_stride;
3548 dst8 += 2 * dst8_stride;
3549 y -= 2;
3550 } while (y);
3551 } else if (w == 32) {
3552 do {
3553 jnt_x_comp_avg_6tap_32_avx2(
3554 src_ptr, coeffs_256, filt_256, factor_256, offset_comp_avg_256, dst, dst8);
3555 src_ptr += src_stride;
3556 dst += dst_stride;
3557 dst8 += dst8_stride;
3558 } while (--y);
3559 } else if (w == 64) {
3560 do {
3561 jnt_x_comp_avg_6tap_32_avx2(
3562 src_ptr, coeffs_256, filt_256, factor_256, offset_comp_avg_256, dst, dst8);
3563 jnt_x_comp_avg_6tap_32_avx2(src_ptr + 32,
3564 coeffs_256,
3565 filt_256,
3566 factor_256,
3567 offset_comp_avg_256,
3568 dst + 32,
3569 dst8 + 32);
3570 src_ptr += src_stride;
3571 dst += dst_stride;
3572 dst8 += dst8_stride;
3573 } while (--y);
3574 } else {
3575 assert(w == 128);
3576
3577 do {
3578 jnt_x_comp_avg_6tap_32_avx2(
3579 src_ptr, coeffs_256, filt_256, factor_256, offset_comp_avg_256, dst, dst8);
3580 jnt_x_comp_avg_6tap_32_avx2(src_ptr + 1 * 32,
3581 coeffs_256,
3582 filt_256,
3583 factor_256,
3584 offset_comp_avg_256,
3585 dst + 1 * 32,
3586 dst8 + 1 * 32);
3587 jnt_x_comp_avg_6tap_32_avx2(src_ptr + 2 * 32,
3588 coeffs_256,
3589 filt_256,
3590 factor_256,
3591 offset_comp_avg_256,
3592 dst + 2 * 32,
3593 dst8 + 2 * 32);
3594 jnt_x_comp_avg_6tap_32_avx2(src_ptr + 3 * 32,
3595 coeffs_256,
3596 filt_256,
3597 factor_256,
3598 offset_comp_avg_256,
3599 dst + 3 * 32,
3600 dst8 + 3 * 32);
3601 src_ptr += src_stride;
3602 dst += dst_stride;
3603 dst8 += dst8_stride;
3604 } while (--y);
3605 }
3606 } else {
3607 const int16_t offset_avg = (1 << (FILTER_BITS - 1)) + (1 << (round_0 - bits - 2)) -
3608 (round_offset << (round_0 - bits - 1));
3609 const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);
3610
3611 if (w == 8) {
3612 do {
3613 const __m256i res = x_convolve_6tap_8x2_avx2(
3614 src_ptr, src_stride, coeffs_256, filt_256);
3615 jnt_avg_round_store_8x2_avx2(
3616 res, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
3617 src_ptr += 2 * src_stride;
3618 dst += 2 * dst_stride;
3619 dst8 += 2 * dst8_stride;
3620 y -= 2;
3621 } while (y);
3622 } else if (w == 16) {
3623 do {
3624 jnt_x_avg_6tap_16x2_avx2(src_ptr,
3625 src_stride,
3626 coeffs_256,
3627 filt_256,
3628 offset_avg_256,
3629 dst,
3630 dst_stride,
3631 dst8,
3632 dst8_stride);
3633 src_ptr += 2 * src_stride;
3634 dst += 2 * dst_stride;
3635 dst8 += 2 * dst8_stride;
3636 y -= 2;
3637 } while (y);
3638 } else if (w == 32) {
3639 do {
3640 jnt_x_avg_6tap_32_avx2(
3641 src_ptr, coeffs_256, filt_256, offset_avg_256, dst, dst8);
3642 src_ptr += src_stride;
3643 dst += dst_stride;
3644 dst8 += dst8_stride;
3645 } while (--y);
3646 } else if (w == 64) {
3647 do {
3648 jnt_x_avg_6tap_32_avx2(
3649 src_ptr, coeffs_256, filt_256, offset_avg_256, dst, dst8);
3650 jnt_x_avg_6tap_32_avx2(
3651 src_ptr + 32, coeffs_256, filt_256, offset_avg_256, dst + 32, dst8 + 32);
3652 src_ptr += src_stride;
3653 dst += dst_stride;
3654 dst8 += dst8_stride;
3655 } while (--y);
3656 } else {
3657 assert(w == 128);
3658
3659 do {
3660 jnt_x_avg_6tap_32_avx2(
3661 src_ptr, coeffs_256, filt_256, offset_avg_256, dst, dst8);
3662 jnt_x_avg_6tap_32_avx2(src_ptr + 1 * 32,
3663 coeffs_256,
3664 filt_256,
3665 offset_avg_256,
3666 dst + 1 * 32,
3667 dst8 + 1 * 32);
3668 jnt_x_avg_6tap_32_avx2(src_ptr + 2 * 32,
3669 coeffs_256,
3670 filt_256,
3671 offset_avg_256,
3672 dst + 2 * 32,
3673 dst8 + 2 * 32);
3674 jnt_x_avg_6tap_32_avx2(src_ptr + 3 * 32,
3675 coeffs_256,
3676 filt_256,
3677 offset_avg_256,
3678 dst + 3 * 32,
3679 dst8 + 3 * 32);
3680 src_ptr += src_stride;
3681 dst += dst_stride;
3682 dst8 += dst8_stride;
3683 } while (--y);
3684 }
3685 }
3686 } else {
3687 const int16_t offset_no_avg = (round_offset << (round_0 - bits - 1)) +
3688 (1 << (round_0 - bits - 2));
3689 const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);
3690
3691 if (w == 8) {
3692 do {
3693 const __m256i res = x_convolve_6tap_8x2_avx2(
3694 src_ptr, src_stride, coeffs_256, filt_256);
3695 jnt_no_avg_round_store_8x2_avx2(res, offset_no_avg_256, dst, dst_stride);
3696 src_ptr += 2 * src_stride;
3697 dst += 2 * dst_stride;
3698 y -= 2;
3699 } while (y);
3700 } else if (w == 16) {
3701 do {
3702 jnt_x_no_avg_6tap_16x2_avx2(
3703 src_ptr, src_stride, coeffs_256, filt_256, offset_no_avg_256, dst, dst_stride);
3704 src_ptr += 2 * src_stride;
3705 dst += 2 * dst_stride;
3706 y -= 2;
3707 } while (y);
3708 } else if (w == 32) {
3709 do {
3710 jnt_x_no_avg_6tap_32_avx2(src_ptr, coeffs_256, filt_256, offset_no_avg_256, dst);
3711 src_ptr += src_stride;
3712 dst += dst_stride;
3713 } while (--y);
3714 } else if (w == 64) {
3715 do {
3716 jnt_x_no_avg_6tap_32_avx2(src_ptr, coeffs_256, filt_256, offset_no_avg_256, dst);
3717 jnt_x_no_avg_6tap_32_avx2(
3718 src_ptr + 32, coeffs_256, filt_256, offset_no_avg_256, dst + 32);
3719 src_ptr += src_stride;
3720 dst += dst_stride;
3721 } while (--y);
3722 } else {
3723 assert(w == 128);
3724
3725 do {
3726 jnt_x_no_avg_6tap_32_avx2(src_ptr, coeffs_256, filt_256, offset_no_avg_256, dst);
3727 jnt_x_no_avg_6tap_32_avx2(
3728 src_ptr + 1 * 32, coeffs_256, filt_256, offset_no_avg_256, dst + 1 * 32);
3729 jnt_x_no_avg_6tap_32_avx2(
3730 src_ptr + 2 * 32, coeffs_256, filt_256, offset_no_avg_256, dst + 2 * 32);
3731 jnt_x_no_avg_6tap_32_avx2(
3732 src_ptr + 3 * 32, coeffs_256, filt_256, offset_no_avg_256, dst + 3 * 32);
3733 src_ptr += src_stride;
3734 dst += dst_stride;
3735 } while (--y);
3736 }
3737 }
3738 }
3739
jnt_convolve_x_8tap_avx2(const uint8_t * const src,const int32_t src_stride,uint8_t * dst8,const int32_t dst8_stride,const int32_t w,const int32_t h,const InterpFilterParams * const filter_params_x,const int32_t subpel_x_q4,const ConvolveParams * const conv_params)3740 static void jnt_convolve_x_8tap_avx2(const uint8_t *const src, const int32_t src_stride,
3741 uint8_t *dst8, const int32_t dst8_stride, const int32_t w,
3742 const int32_t h,
3743 const InterpFilterParams *const filter_params_x,
3744 const int32_t subpel_x_q4,
3745 const ConvolveParams *const conv_params) {
3746 const uint8_t *src_ptr = src - 3;
3747 const int32_t dst_stride = conv_params->dst_stride;
3748 const int32_t round_0 = 3;
3749 const int32_t round_1 = COMPOUND_ROUND1_BITS;
3750 const int32_t bits = FILTER_BITS - round_1;
3751 const int32_t bd = 8;
3752 const int32_t round_bits = 2 * FILTER_BITS - round_0 - round_1;
3753 const int32_t offset_bits = bd + round_bits;
3754 const int32_t round_offset = (1 << offset_bits) + (1 << (offset_bits - 1));
3755 ConvBufType * dst = conv_params->dst;
3756 int32_t y = h;
3757 __m256i coeffs_256[4], filt_256[4];
3758
3759 filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx);
3760 filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx);
3761 filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx);
3762 filt_256[3] = _mm256_loadu_si256((__m256i const *)filt4_global_avx);
3763
3764 prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);
3765
3766 if (conv_params->do_average) {
3767 if (conv_params->use_jnt_comp_avg) {
3768 const int32_t factor = conv_params->fwd_offset | (conv_params->bck_offset << 16);
3769 const int32_t offset_comp_avg = round_offset * conv_params->bck_offset +
3770 (1 << (round_bits + DIST_PRECISION_BITS - 1)) -
3771 (round_offset << DIST_PRECISION_BITS);
3772 const __m256i factor_256 = _mm256_set1_epi32(factor);
3773 const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
3774
3775 if (w == 8) {
3776 do {
3777 const __m256i res = x_convolve_8tap_8x2_avx2(
3778 src_ptr, src_stride, coeffs_256, filt_256);
3779 jnt_comp_avg_round_store_8x2_avx2(
3780 res, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
3781 src_ptr += 2 * src_stride;
3782 dst += 2 * dst_stride;
3783 dst8 += 2 * dst8_stride;
3784 y -= 2;
3785 } while (y);
3786 } else if (w == 16) {
3787 do {
3788 jnt_x_comp_avg_8tap_16x2_avx2(src_ptr,
3789 src_stride,
3790 coeffs_256,
3791 filt_256,
3792 factor_256,
3793 offset_comp_avg_256,
3794 dst,
3795 dst_stride,
3796 dst8,
3797 dst8_stride);
3798 src_ptr += 2 * src_stride;
3799 dst += 2 * dst_stride;
3800 dst8 += 2 * dst8_stride;
3801 y -= 2;
3802 } while (y);
3803 } else if (w == 32) {
3804 do {
3805 jnt_x_comp_avg_8tap_32_avx2(
3806 src_ptr, coeffs_256, filt_256, factor_256, offset_comp_avg_256, dst, dst8);
3807 src_ptr += src_stride;
3808 dst += dst_stride;
3809 dst8 += dst8_stride;
3810 } while (--y);
3811 } else if (w == 64) {
3812 do {
3813 jnt_x_comp_avg_8tap_32_avx2(
3814 src_ptr, coeffs_256, filt_256, factor_256, offset_comp_avg_256, dst, dst8);
3815 jnt_x_comp_avg_8tap_32_avx2(src_ptr + 32,
3816 coeffs_256,
3817 filt_256,
3818 factor_256,
3819 offset_comp_avg_256,
3820 dst + 32,
3821 dst8 + 32);
3822 src_ptr += src_stride;
3823 dst += dst_stride;
3824 dst8 += dst8_stride;
3825 } while (--y);
3826 } else {
3827 assert(w == 128);
3828
3829 do {
3830 jnt_x_comp_avg_8tap_32_avx2(
3831 src_ptr, coeffs_256, filt_256, factor_256, offset_comp_avg_256, dst, dst8);
3832 jnt_x_comp_avg_8tap_32_avx2(src_ptr + 1 * 32,
3833 coeffs_256,
3834 filt_256,
3835 factor_256,
3836 offset_comp_avg_256,
3837 dst + 1 * 32,
3838 dst8 + 1 * 32);
3839 jnt_x_comp_avg_8tap_32_avx2(src_ptr + 2 * 32,
3840 coeffs_256,
3841 filt_256,
3842 factor_256,
3843 offset_comp_avg_256,
3844 dst + 2 * 32,
3845 dst8 + 2 * 32);
3846 jnt_x_comp_avg_8tap_32_avx2(src_ptr + 3 * 32,
3847 coeffs_256,
3848 filt_256,
3849 factor_256,
3850 offset_comp_avg_256,
3851 dst + 3 * 32,
3852 dst8 + 3 * 32);
3853 src_ptr += src_stride;
3854 dst += dst_stride;
3855 dst8 += dst8_stride;
3856 } while (--y);
3857 }
3858 } else {
3859 const int16_t offset_avg = (1 << (FILTER_BITS - 1)) + (1 << (round_0 - bits - 2)) -
3860 (round_offset << (round_0 - bits - 1));
3861 const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);
3862
3863 if (w == 8) {
3864 do {
3865 const __m256i res = x_convolve_8tap_8x2_avx2(
3866 src_ptr, src_stride, coeffs_256, filt_256);
3867 jnt_avg_round_store_8x2_avx2(
3868 res, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
3869 src_ptr += 2 * src_stride;
3870 dst += 2 * dst_stride;
3871 dst8 += 2 * dst8_stride;
3872 y -= 2;
3873 } while (y);
3874 } else if (w == 16) {
3875 do {
3876 jnt_x_avg_8tap_16x2_avx2(src_ptr,
3877 src_stride,
3878 coeffs_256,
3879 filt_256,
3880 offset_avg_256,
3881 dst,
3882 dst_stride,
3883 dst8,
3884 dst8_stride);
3885 src_ptr += 2 * src_stride;
3886 dst += 2 * dst_stride;
3887 dst8 += 2 * dst8_stride;
3888 y -= 2;
3889 } while (y);
3890 } else if (w == 32) {
3891 do {
3892 jnt_x_avg_8tap_32_avx2(
3893 src_ptr, coeffs_256, filt_256, offset_avg_256, dst, dst8);
3894 src_ptr += src_stride;
3895 dst += dst_stride;
3896 dst8 += dst8_stride;
3897 } while (--y);
3898 } else if (w == 64) {
3899 do {
3900 jnt_x_avg_8tap_32_avx2(
3901 src_ptr, coeffs_256, filt_256, offset_avg_256, dst, dst8);
3902 jnt_x_avg_8tap_32_avx2(
3903 src_ptr + 32, coeffs_256, filt_256, offset_avg_256, dst + 32, dst8 + 32);
3904 src_ptr += src_stride;
3905 dst += dst_stride;
3906 dst8 += dst8_stride;
3907 } while (--y);
3908 } else {
3909 assert(w == 128);
3910
3911 do {
3912 jnt_x_avg_8tap_32_avx2(
3913 src_ptr, coeffs_256, filt_256, offset_avg_256, dst, dst8);
3914 jnt_x_avg_8tap_32_avx2(src_ptr + 1 * 32,
3915 coeffs_256,
3916 filt_256,
3917 offset_avg_256,
3918 dst + 1 * 32,
3919 dst8 + 1 * 32);
3920 jnt_x_avg_8tap_32_avx2(src_ptr + 2 * 32,
3921 coeffs_256,
3922 filt_256,
3923 offset_avg_256,
3924 dst + 2 * 32,
3925 dst8 + 2 * 32);
3926 jnt_x_avg_8tap_32_avx2(src_ptr + 3 * 32,
3927 coeffs_256,
3928 filt_256,
3929 offset_avg_256,
3930 dst + 3 * 32,
3931 dst8 + 3 * 32);
3932 src_ptr += src_stride;
3933 dst += dst_stride;
3934 dst8 += dst8_stride;
3935 } while (--y);
3936 }
3937 }
3938 } else {
3939 const int16_t offset_no_avg = (round_offset << (round_0 - bits - 1)) +
3940 (1 << (round_0 - bits - 2));
3941 const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);
3942
3943 if (w == 8) {
3944 do {
3945 const __m256i res = x_convolve_8tap_8x2_avx2(
3946 src_ptr, src_stride, coeffs_256, filt_256);
3947 jnt_no_avg_round_store_8x2_avx2(res, offset_no_avg_256, dst, dst_stride);
3948 src_ptr += 2 * src_stride;
3949 dst += 2 * dst_stride;
3950 y -= 2;
3951 } while (y);
3952 } else if (w == 16) {
3953 do {
3954 jnt_x_no_avg_8tap_16x2_avx2(
3955 src_ptr, src_stride, coeffs_256, filt_256, offset_no_avg_256, dst, dst_stride);
3956 src_ptr += 2 * src_stride;
3957 dst += 2 * dst_stride;
3958 y -= 2;
3959 } while (y);
3960 } else if (w == 32) {
3961 do {
3962 jnt_x_no_avg_8tap_32_avx2(src_ptr, coeffs_256, filt_256, offset_no_avg_256, dst);
3963 src_ptr += src_stride;
3964 dst += dst_stride;
3965 } while (--y);
3966 } else if (w == 64) {
3967 do {
3968 jnt_x_no_avg_8tap_32_avx2(src_ptr, coeffs_256, filt_256, offset_no_avg_256, dst);
3969 jnt_x_no_avg_8tap_32_avx2(
3970 src_ptr + 32, coeffs_256, filt_256, offset_no_avg_256, dst + 32);
3971 src_ptr += src_stride;
3972 dst += dst_stride;
3973 } while (--y);
3974 } else {
3975 assert(w == 128);
3976
3977 do {
3978 jnt_x_no_avg_8tap_32_avx2(src_ptr, coeffs_256, filt_256, offset_no_avg_256, dst);
3979 jnt_x_no_avg_8tap_32_avx2(
3980 src_ptr + 1 * 32, coeffs_256, filt_256, offset_no_avg_256, dst + 1 * 32);
3981 jnt_x_no_avg_8tap_32_avx2(
3982 src_ptr + 2 * 32, coeffs_256, filt_256, offset_no_avg_256, dst + 2 * 32);
3983 jnt_x_no_avg_8tap_32_avx2(
3984 src_ptr + 3 * 32, coeffs_256, filt_256, offset_no_avg_256, dst + 3 * 32);
3985 src_ptr += src_stride;
3986 dst += dst_stride;
3987 } while (--y);
3988 }
3989 }
3990 }
3991
/* Common signature of the per-tap-count jnt (compound) horizontal convolve
 * kernels, used by svt_av1_jnt_convolve_x_avx2's dispatch table below. */
typedef void (*JntConvolveXTapFunc)(const uint8_t *const src, const int32_t src_stride,
                                    uint8_t *dst8, const int32_t dst8_stride, const int32_t w,
                                    const int32_t h,
                                    const InterpFilterParams *const filter_params_x,
                                    const int32_t subpel_x_q4,
                                    const ConvolveParams *const conv_params);
3998
/*
 * Public entry point for jnt/compound horizontal convolution (AVX2 path).
 * Selects the kernel matching the x-filter's tap count and forwards all
 * arguments. The vertical filter parameters are unused on this x-only path.
 */
void svt_av1_jnt_convolve_x_avx2(const uint8_t *src, int32_t src_stride, uint8_t *dst8,
                                 int32_t dst8_stride, int32_t w, int32_t h,
                                 InterpFilterParams *filter_params_x,
                                 InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
                                 const int32_t subpel_y_q4, ConvolveParams *conv_params) {
    /* Indexed by tap count; slots for unsupported counts stay NULL. */
    static const JntConvolveXTapFunc jnt_convolve_x_tap_func_table[MAX_FILTER_TAP + 1] = {
        [2] = jnt_convolve_x_2tap_avx2,
        [4] = jnt_convolve_x_4tap_ssse3,
        [6] = jnt_convolve_x_6tap_avx2,
        [8] = jnt_convolve_x_8tap_avx2,
    };

    (void)filter_params_y;
    (void)subpel_y_q4;

    /* The kernels bake these rounding parameters in as constants. */
    assert(conv_params->round_0 == 3);
    assert(conv_params->round_1 == COMPOUND_ROUND1_BITS);

    const int32_t tap_x = get_convolve_tap(filter_params_x->filter_ptr);
    jnt_convolve_x_tap_func_table[tap_x](
        src, src_stride, dst8, dst8_stride, w, h, filter_params_x, subpel_x_q4, conv_params);
}
4025