/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
 */

#include <immintrin.h>
#include "common_dsp_rtcd.h"
#include "convolve.h"
#include "convolve_avx2.h"
#include "EbDefinitions.h"
#include "EbMemory_SSE4_1.h"

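// Each helper below convolves one 32-pixel row vertically with a 2-tap filter,
// then hands the result to the matching jnt round/store routine: comp_avg
// applies the distance weights, avg blends with a plain average, and no_avg
// just stores the 16-bit intermediate. The previously loaded source row comes
// in through s0 and the newly loaded row is returned through s1, so each row
// is read from memory only once.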
SIMD_INLINE void jnt_y_comp_avg_2tap_32_avx2(const uint8_t *const src, const __m256i *const coeffs,
                                             const __m256i factor, const __m256i offset,
                                             const __m256i s0, __m256i *const s1,
                                             ConvBufType *const dst, uint8_t *const dst8) {
    __m256i r[2];

    y_convolve_2tap_32_avx2(src, coeffs, s0, s1, r);
    jnt_comp_avg_round_store_32_avx2(r, factor, offset, dst, dst8);
}

static INLINE void jnt_y_avg_2tap_32_avx2(const uint8_t *const src, const __m256i *const coeffs,
                                          const __m256i offset, const __m256i s0, __m256i *const s1,
                                          const ConvBufType *const dst, uint8_t *const dst8) {
    __m256i r[2];

    y_convolve_2tap_32_avx2(src, coeffs, s0, s1, r);
    jnt_avg_round_store_32_avx2(r, offset, dst, dst8);
}

static INLINE void jnt_y_no_avg_2tap_32_avx2(const uint8_t *const src, const __m256i *const coeffs,
                                             const __m256i offset, const __m256i s0,
                                             __m256i *const s1, ConvBufType *const dst) {
    __m256i r[2];

    y_convolve_2tap_32_avx2(src, coeffs, s0, s1, r);
    jnt_no_avg_round_store_32_avx2(r, offset, dst);
}

static void jnt_convolve_y_2tap_avx2(const uint8_t *const src, const int32_t src_stride,
                                     uint8_t *dst8, const int32_t dst8_stride, const int32_t w,
                                     const int32_t                   h,
                                     const InterpFilterParams *const filter_params_y,
                                     const int32_t                   subpel_y_q4,
                                     const ConvolveParams *const     conv_params) {
    const uint8_t *src_ptr      = src;
    const int32_t  dst_stride   = conv_params->dst_stride;
    const int32_t  round_0      = 3;
    const int32_t  round_1      = COMPOUND_ROUND1_BITS;
    const int32_t  bits         = FILTER_BITS - round_0;
    const int32_t  bd           = 8;
    const int32_t  round_bits   = 2 * FILTER_BITS - round_0 - round_1;
    const int32_t  offset_bits  = bd + round_bits;
    const int32_t  round_offset = (1 << offset_bits) + (1 << (offset_bits - 1));
    ConvBufType *  dst          = conv_params->dst;
    int32_t        y            = h;
    __m128i        coeffs_128[4];
    __m256i        coeffs_256[4];

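    // Three store flavors follow: distance-weighted compound (do_average with
    // use_jnt_comp_avg set), plain compound average (do_average only), and no
    // averaging, which stores the first prediction pass to the 16-bit compound
    // buffer. All row loops below handle two rows per iteration, so h is
    // assumed to be even.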
    if (conv_params->do_average) {
        if (conv_params->use_jnt_comp_avg) {
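            // fwd_offset goes in the low 16 bits and bck_offset in the high
            // 16 bits, presumably so the jnt_comp_avg round/store helpers can
            // apply both distance weights with a single 16-bit multiply-add.
            // offset_comp_avg folds the compound offset removal and the final
            // rounding into one constant.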
            const int32_t factor = conv_params->fwd_offset | (conv_params->bck_offset << 16);
            const int32_t offset_comp_avg = round_offset * conv_params->bck_offset +
                (1 << (round_bits + DIST_PRECISION_BITS - 1)) -
                (round_offset << DIST_PRECISION_BITS);

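            // Blocks up to 4 pixels wide take the 128-bit SSE path; wider
            // blocks use full 256-bit AVX2 registers.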
            if (w <= 4) {
                const __m128i factor_128          = _mm_set1_epi32(factor);
                const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);

                prepare_half_coeffs_2tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);

                if (w == 2) {
                    __m128i s_16[2];

                    s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);

                    do {
                        const __m128i res = y_convolve_2tap_2x2_ssse3(
                            src_ptr, src_stride, coeffs_128, s_16);
                        jnt_comp_avg_round_store_2x2_sse2(res,
                                                          factor_128,
                                                          offset_comp_avg_128,
                                                          dst,
                                                          dst_stride,
                                                          dst8,
                                                          dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    __m128i s_32[2];

                    assert(w == 4);

                    s_32[0] = _mm_cvtsi32_si128(*(int32_t *)src_ptr);

                    do {
                        const __m128i res = y_convolve_2tap_4x2_ssse3(
                            src_ptr, src_stride, coeffs_128, s_32);
                        jnt_comp_avg_round_store_4x2_sse2(res,
                                                          factor_128,
                                                          offset_comp_avg_128,
                                                          dst,
                                                          dst_stride,
                                                          dst8,
                                                          dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                const __m256i factor_256          = _mm256_set1_epi32(factor);
                const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);

                prepare_half_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);

                if (w == 8) {
                    __m128i s_64[2];

                    s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);

                    do {
                        const __m256i res = y_convolve_2tap_8x2_avx2(
                            src_ptr, src_stride, coeffs_256, s_64);
                        jnt_comp_avg_round_store_8x2_avx2(res,
                                                          factor_256,
                                                          offset_comp_avg_256,
                                                          dst,
                                                          dst_stride,
                                                          dst8,
                                                          dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else if (w == 16) {
                    __m128i s_128[2];
                    __m256i r[2];

                    s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);

                    do {
                        y_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128, r);
                        jnt_comp_avg_round_store_16x2_avx2(
                            r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else if (w == 32) {
                    __m256i s_256[2];

                    s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);

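                    // s_256[0] and s_256[1] ping-pong across iterations: each
                    // call uses the row loaded by the previous call as its top
                    // tap and loads the next row in its place.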
                    do {
                        jnt_y_comp_avg_2tap_32_avx2(src_ptr + src_stride,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    s_256[0],
                                                    &s_256[1],
                                                    dst,
                                                    dst8);
                        jnt_y_comp_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    s_256[1],
                                                    &s_256[0],
                                                    dst + dst_stride,
                                                    dst8 + dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else if (w == 64) {
                    __m256i s_256[2][2];

                    s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
                    s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));

                    do {
                        jnt_y_comp_avg_2tap_32_avx2(src_ptr + src_stride,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    s_256[0][0],
                                                    &s_256[1][0],
                                                    dst,
                                                    dst8);
                        jnt_y_comp_avg_2tap_32_avx2(src_ptr + src_stride + 32,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    s_256[0][1],
                                                    &s_256[1][1],
                                                    dst + 32,
                                                    dst8 + 32);
                        jnt_y_comp_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    s_256[1][0],
                                                    &s_256[0][0],
                                                    dst + dst_stride,
                                                    dst8 + dst8_stride);
                        jnt_y_comp_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 32,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    s_256[1][1],
                                                    &s_256[0][1],
                                                    dst + dst_stride + 32,
                                                    dst8 + dst8_stride + 32);

                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    __m256i s_256[2][4];

                    assert(w == 128);

                    s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
                    s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
                    s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
                    s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));

                    do {
                        jnt_y_comp_avg_2tap_32_avx2(src_ptr + src_stride,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    s_256[0][0],
                                                    &s_256[1][0],
                                                    dst,
                                                    dst8);
                        jnt_y_comp_avg_2tap_32_avx2(src_ptr + src_stride + 1 * 32,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    s_256[0][1],
                                                    &s_256[1][1],
                                                    dst + 1 * 32,
                                                    dst8 + 1 * 32);
                        jnt_y_comp_avg_2tap_32_avx2(src_ptr + src_stride + 2 * 32,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    s_256[0][2],
                                                    &s_256[1][2],
                                                    dst + 2 * 32,
                                                    dst8 + 2 * 32);
                        jnt_y_comp_avg_2tap_32_avx2(src_ptr + src_stride + 3 * 32,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    s_256[0][3],
                                                    &s_256[1][3],
                                                    dst + 3 * 32,
                                                    dst8 + 3 * 32);
                        jnt_y_comp_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    s_256[1][0],
                                                    &s_256[0][0],
                                                    dst + dst_stride,
                                                    dst8 + dst8_stride);
                        jnt_y_comp_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 1 * 32,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    s_256[1][1],
                                                    &s_256[0][1],
                                                    dst + dst_stride + 1 * 32,
                                                    dst8 + dst8_stride + 1 * 32);
                        jnt_y_comp_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 2 * 32,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    s_256[1][2],
                                                    &s_256[0][2],
                                                    dst + dst_stride + 2 * 32,
                                                    dst8 + dst8_stride + 2 * 32);
                        jnt_y_comp_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 3 * 32,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    s_256[1][3],
                                                    &s_256[0][3],
                                                    dst + dst_stride + 3 * 32,
                                                    dst8 + dst8_stride + 3 * 32);

                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            }
        } else {
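            // Rounding constant for the plain compound average; it appears to
            // fold the compound offset removal and the rounding of the final
            // (dst + res) average into a single term.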
            const int16_t offset_avg = (1 << (FILTER_BITS - 1)) + (1 << (round_1 - bits - 2)) -
                (round_offset << (round_1 - bits - 1));

            if (w <= 4) {
                const __m128i offset_avg_128 = _mm_set1_epi16(offset_avg);

                prepare_half_coeffs_2tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);

                if (w == 2) {
                    __m128i s_16[2];

                    s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);

                    do {
                        const __m128i res = y_convolve_2tap_2x2_ssse3(
                            src_ptr, src_stride, coeffs_128, s_16);
                        jnt_avg_round_store_2x2_sse2(
                            res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    __m128i s_32[2];

                    assert(w == 4);

                    s_32[0] = _mm_cvtsi32_si128(*(int32_t *)src_ptr);

                    do {
                        const __m128i res = y_convolve_2tap_4x2_ssse3(
                            src_ptr, src_stride, coeffs_128, s_32);
                        jnt_avg_round_store_4x2_sse2(
                            res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);

                prepare_half_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);

                if (w == 8) {
                    __m128i s_64[2];

                    s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);

                    do {
                        const __m256i res = y_convolve_2tap_8x2_avx2(
                            src_ptr, src_stride, coeffs_256, s_64);
                        jnt_avg_round_store_8x2_avx2(
                            res, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else if (w == 16) {
                    __m128i s_128[2];
                    __m256i r[2];

                    s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);

                    do {
                        y_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128, r);
                        jnt_avg_round_store_16x2_avx2(
                            r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else if (w == 32) {
                    __m256i s_256[2];

                    s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);

                    do {
                        jnt_y_avg_2tap_32_avx2(src_ptr + src_stride,
                                               coeffs_256,
                                               offset_avg_256,
                                               s_256[0],
                                               &s_256[1],
                                               dst,
                                               dst8);
                        jnt_y_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
                                               coeffs_256,
                                               offset_avg_256,
                                               s_256[1],
                                               &s_256[0],
                                               dst + dst_stride,
                                               dst8 + dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else if (w == 64) {
                    __m256i s_256[2][2];

                    s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
                    s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));

                    do {
                        jnt_y_avg_2tap_32_avx2(src_ptr + src_stride,
                                               coeffs_256,
                                               offset_avg_256,
                                               s_256[0][0],
                                               &s_256[1][0],
                                               dst,
                                               dst8);
                        jnt_y_avg_2tap_32_avx2(src_ptr + src_stride + 32,
                                               coeffs_256,
                                               offset_avg_256,
                                               s_256[0][1],
                                               &s_256[1][1],
                                               dst + 32,
                                               dst8 + 32);
                        jnt_y_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
                                               coeffs_256,
                                               offset_avg_256,
                                               s_256[1][0],
                                               &s_256[0][0],
                                               dst + dst_stride,
                                               dst8 + dst8_stride);
                        jnt_y_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 32,
                                               coeffs_256,
                                               offset_avg_256,
                                               s_256[1][1],
                                               &s_256[0][1],
                                               dst + dst_stride + 32,
                                               dst8 + dst8_stride + 32);

                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    __m256i s_256[2][4];

                    assert(w == 128);

                    s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
                    s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
                    s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
                    s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));

                    do {
                        jnt_y_avg_2tap_32_avx2(src_ptr + src_stride,
                                               coeffs_256,
                                               offset_avg_256,
                                               s_256[0][0],
                                               &s_256[1][0],
                                               dst,
                                               dst8);
                        jnt_y_avg_2tap_32_avx2(src_ptr + src_stride + 1 * 32,
                                               coeffs_256,
                                               offset_avg_256,
                                               s_256[0][1],
                                               &s_256[1][1],
                                               dst + 1 * 32,
                                               dst8 + 1 * 32);
                        jnt_y_avg_2tap_32_avx2(src_ptr + src_stride + 2 * 32,
                                               coeffs_256,
                                               offset_avg_256,
                                               s_256[0][2],
                                               &s_256[1][2],
                                               dst + 2 * 32,
                                               dst8 + 2 * 32);
                        jnt_y_avg_2tap_32_avx2(src_ptr + src_stride + 3 * 32,
                                               coeffs_256,
                                               offset_avg_256,
                                               s_256[0][3],
                                               &s_256[1][3],
                                               dst + 3 * 32,
                                               dst8 + 3 * 32);
                        jnt_y_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
                                               coeffs_256,
                                               offset_avg_256,
                                               s_256[1][0],
                                               &s_256[0][0],
                                               dst + dst_stride,
                                               dst8 + dst8_stride);
                        jnt_y_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 1 * 32,
                                               coeffs_256,
                                               offset_avg_256,
                                               s_256[1][1],
                                               &s_256[0][1],
                                               dst + dst_stride + 1 * 32,
                                               dst8 + dst8_stride + 1 * 32);
                        jnt_y_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 2 * 32,
                                               coeffs_256,
                                               offset_avg_256,
                                               s_256[1][2],
                                               &s_256[0][2],
                                               dst + dst_stride + 2 * 32,
                                               dst8 + dst8_stride + 2 * 32);
                        jnt_y_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 3 * 32,
                                               coeffs_256,
                                               offset_avg_256,
                                               s_256[1][3],
                                               &s_256[0][3],
                                               dst + dst_stride + 3 * 32,
                                               dst8 + dst8_stride + 3 * 32);

                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            }
        }
    } else {
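        // No averaging: this is the first prediction pass, so results are only
        // offset and stored to the 16-bit compound buffer; dst8 is not written.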
        const int16_t offset_no_avg = (round_offset << (round_1 - bits - 1)) +
            (1 << (round_1 - bits - 2));

        if (w <= 4) {
            const __m128i offset_no_avg_128 = _mm_set1_epi16(offset_no_avg);

            prepare_half_coeffs_2tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);

            if (w == 2) {
                __m128i s_16[2];

                s_16[0] = _mm_cvtsi32_si128(*(int16_t *)src_ptr);

                do {
                    const __m128i res = y_convolve_2tap_2x2_ssse3(
                        src_ptr, src_stride, coeffs_128, s_16);
                    jnt_no_avg_round_store_2x2_sse2(res, offset_no_avg_128, dst, dst_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            } else {
                __m128i s_32[2];

                assert(w == 4);

                s_32[0] = _mm_cvtsi32_si128(*(int32_t *)src_ptr);

                do {
                    const __m128i res = y_convolve_2tap_4x2_ssse3(
                        src_ptr, src_stride, coeffs_128, s_32);
                    jnt_no_avg_round_store_4x2_sse2(res, offset_no_avg_128, dst, dst_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        } else {
            const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);

            prepare_half_coeffs_2tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);

            if (w == 8) {
                __m128i s_64[2];

                s_64[0] = _mm_loadl_epi64((__m128i *)src_ptr);

                do {
                    const __m256i res = y_convolve_2tap_8x2_avx2(
                        src_ptr, src_stride, coeffs_256, s_64);
                    jnt_no_avg_round_store_8x2_avx2(res, offset_no_avg_256, dst, dst_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            } else if (w == 16) {
                __m128i s_128[2];
                __m256i r[2];

                s_128[0] = _mm_loadu_si128((__m128i *)src_ptr);

                do {
                    y_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128, r);
                    jnt_no_avg_round_store_16x2_avx2(r, offset_no_avg_256, dst, dst_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            } else if (w == 32) {
                __m256i s_256[2];

                s_256[0] = _mm256_loadu_si256((__m256i *)src_ptr);

                do {
                    jnt_y_no_avg_2tap_32_avx2(src_ptr + src_stride,
                                              coeffs_256,
                                              offset_no_avg_256,
                                              s_256[0],
                                              &s_256[1],
                                              dst);
                    jnt_y_no_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
                                              coeffs_256,
                                              offset_no_avg_256,
                                              s_256[1],
                                              &s_256[0],
                                              dst + dst_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            } else if (w == 64) {
                __m256i s_256[2][2];

                s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
                s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));

                do {
                    jnt_y_no_avg_2tap_32_avx2(src_ptr + src_stride,
                                              coeffs_256,
                                              offset_no_avg_256,
                                              s_256[0][0],
                                              &s_256[1][0],
                                              dst);
                    jnt_y_no_avg_2tap_32_avx2(src_ptr + src_stride + 32,
                                              coeffs_256,
                                              offset_no_avg_256,
                                              s_256[0][1],
                                              &s_256[1][1],
                                              dst + 32);
                    jnt_y_no_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
                                              coeffs_256,
                                              offset_no_avg_256,
                                              s_256[1][0],
                                              &s_256[0][0],
                                              dst + dst_stride);
                    jnt_y_no_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 32,
                                              coeffs_256,
                                              offset_no_avg_256,
                                              s_256[1][1],
                                              &s_256[0][1],
                                              dst + dst_stride + 32);

                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            } else {
                __m256i s_256[2][4];

                assert(w == 128);

                s_256[0][0] = _mm256_loadu_si256((__m256i *)(src_ptr + 0 * 32));
                s_256[0][1] = _mm256_loadu_si256((__m256i *)(src_ptr + 1 * 32));
                s_256[0][2] = _mm256_loadu_si256((__m256i *)(src_ptr + 2 * 32));
                s_256[0][3] = _mm256_loadu_si256((__m256i *)(src_ptr + 3 * 32));

                do {
                    jnt_y_no_avg_2tap_32_avx2(src_ptr + src_stride,
                                              coeffs_256,
                                              offset_no_avg_256,
                                              s_256[0][0],
                                              &s_256[1][0],
                                              dst);
                    jnt_y_no_avg_2tap_32_avx2(src_ptr + src_stride + 1 * 32,
                                              coeffs_256,
                                              offset_no_avg_256,
                                              s_256[0][1],
                                              &s_256[1][1],
                                              dst + 1 * 32);
                    jnt_y_no_avg_2tap_32_avx2(src_ptr + src_stride + 2 * 32,
                                              coeffs_256,
                                              offset_no_avg_256,
                                              s_256[0][2],
                                              &s_256[1][2],
                                              dst + 2 * 32);
                    jnt_y_no_avg_2tap_32_avx2(src_ptr + src_stride + 3 * 32,
                                              coeffs_256,
                                              offset_no_avg_256,
                                              s_256[0][3],
                                              &s_256[1][3],
                                              dst + 3 * 32);
                    jnt_y_no_avg_2tap_32_avx2(src_ptr + 2 * src_stride,
                                              coeffs_256,
                                              offset_no_avg_256,
                                              s_256[1][0],
                                              &s_256[0][0],
                                              dst + dst_stride);
                    jnt_y_no_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 1 * 32,
                                              coeffs_256,
                                              offset_no_avg_256,
                                              s_256[1][1],
                                              &s_256[0][1],
                                              dst + dst_stride + 1 * 32);
                    jnt_y_no_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 2 * 32,
                                              coeffs_256,
                                              offset_no_avg_256,
                                              s_256[1][2],
                                              &s_256[0][2],
                                              dst + dst_stride + 2 * 32);
                    jnt_y_no_avg_2tap_32_avx2(src_ptr + 2 * src_stride + 3 * 32,
                                              coeffs_256,
                                              offset_no_avg_256,
                                              s_256[1][3],
                                              &s_256[0][3],
                                              dst + dst_stride + 3 * 32);

                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        }
    }
}

void jnt_convolve_y_4tap_avx2(const uint8_t *const src, const int32_t src_stride, uint8_t *dst8,
                              const int32_t dst8_stride, const int32_t w, const int32_t h,
                              const InterpFilterParams *const filter_params_y,
                              const int32_t subpel_y_q4, const ConvolveParams *const conv_params) {
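    // Start one row above the first output row so that the 4-tap window
    // (source rows -1..+2 relative to each output row) is vertically centered.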
    const uint8_t *src_ptr      = src - src_stride;
    const int32_t  dst_stride   = conv_params->dst_stride;
    const int32_t  round_0      = 3;
    const int32_t  round_1      = COMPOUND_ROUND1_BITS;
    const int32_t  bits         = FILTER_BITS - round_0;
    const int32_t  bd           = 8;
    const int32_t  round_bits   = 2 * FILTER_BITS - round_0 - round_1;
    const int32_t  offset_bits  = bd + round_bits;
    const int32_t  round_offset = (1 << offset_bits) + (1 << (offset_bits - 1));
    ConvBufType *  dst          = conv_params->dst;
    int32_t        y            = h;
    __m128i        coeffs_128[4];
    __m256i        coeffs_256[4];

    if (conv_params->do_average) {
        if (conv_params->use_jnt_comp_avg) {
            const int32_t factor = conv_params->fwd_offset | (conv_params->bck_offset << 16);
            const int32_t offset_comp_avg = round_offset * conv_params->bck_offset +
                (1 << (round_bits + DIST_PRECISION_BITS - 1)) -
                (round_offset << DIST_PRECISION_BITS);

            if (w <= 4) {
                const __m128i factor_128          = _mm_set1_epi32(factor);
                const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);

                prepare_half_coeffs_4tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);

                if (w == 2) {
                    __m128i s_16[4], ss_128[2];

                    s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
                    s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
                    s_16[2] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));

                    const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
                    const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);

                    ss_128[0] = _mm_unpacklo_epi8(src01, src12);

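                    // Four source rows feed each output row; rows 0..2 were
                    // interleaved above to prime the pipeline, and each
                    // iteration loads two new rows and shifts the interleaved
                    // pairs down by one position.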
                    do {
                        src_ptr += 2 * src_stride;
                        const __m128i res = y_convolve_4tap_2x2_ssse3(
                            src_ptr, src_stride, coeffs_128, s_16, ss_128);
                        jnt_comp_avg_round_store_2x2_sse2(res,
                                                          factor_128,
                                                          offset_comp_avg_128,
                                                          dst,
                                                          dst_stride,
                                                          dst8,
                                                          dst8_stride);
                        ss_128[0] = ss_128[1];
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    __m128i s_32[4], ss_128[2];

                    assert(w == 4);

                    s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
                    s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
                    s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));

                    const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
                    const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);

                    ss_128[0] = _mm_unpacklo_epi8(src01, src12);

                    do {
                        src_ptr += 2 * src_stride;
                        const __m128i res = y_convolve_4tap_4x2_ssse3(
                            src_ptr, src_stride, coeffs_128, s_32, ss_128);
                        jnt_comp_avg_round_store_4x2_sse2(res,
                                                          factor_128,
                                                          offset_comp_avg_128,
                                                          dst,
                                                          dst_stride,
                                                          dst8,
                                                          dst8_stride);
                        ss_128[0] = ss_128[1];
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                const __m256i factor_256          = _mm256_set1_epi32(factor);
                const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);

                prepare_half_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);

                if (w == 8) {
                    __m128i s_64[4];
                    __m256i ss_256[2];

                    s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
                    s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
                    s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));

                    // Load lines a and b. Line a to lower 128, line b to upper
                    // 128
                    const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
                    const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);

                    ss_256[0] = _mm256_unpacklo_epi8(src01, src12);

                    do {
                        src_ptr += 2 * src_stride;
                        const __m256i res = y_convolve_4tap_8x2_avx2(
                            src_ptr, src_stride, coeffs_256, s_64, ss_256);
                        jnt_comp_avg_round_store_8x2_avx2(res,
                                                          factor_256,
                                                          offset_comp_avg_256,
                                                          dst,
                                                          dst_stride,
                                                          dst8,
                                                          dst8_stride);
                        ss_256[0] = ss_256[1];
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    __m128i s_128[4];
                    __m256i ss_256[4], r[2];

                    assert(w == 16);

                    s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
                    s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
                    s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));

                    // Load lines a and b. Line a to lower 128, line b to upper
                    // 128
                    const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
                    const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);

                    ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
                    ss_256[2] = _mm256_unpackhi_epi8(src01, src12);

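                    // 16-wide rows need both byte interleaves: ss_256[0]/[1]
                    // carry the low halves and ss_256[2]/[3] the high halves
                    // of adjacent row pairs; entries [1] and [3] are
                    // presumably refreshed inside the convolve call each
                    // iteration.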
                    do {
                        src_ptr += 2 * src_stride;
                        y_convolve_4tap_16x2_avx2(
                            src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
                        jnt_comp_avg_round_store_16x2_avx2(
                            r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
                        ss_256[0] = ss_256[1];
                        ss_256[2] = ss_256[3];
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            }
        } else {
            const int16_t offset_avg = (1 << (FILTER_BITS - 1)) + (1 << (round_1 - bits - 2)) -
                (round_offset << (round_1 - bits - 1));

            if (w <= 4) {
                const __m128i offset_avg_128 = _mm_set1_epi16(offset_avg);

                prepare_half_coeffs_4tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);

                if (w == 2) {
                    __m128i s_16[4], ss_128[2];

                    s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
                    s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
                    s_16[2] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));

                    const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
                    const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);

                    ss_128[0] = _mm_unpacklo_epi8(src01, src12);

                    do {
                        src_ptr += 2 * src_stride;
                        const __m128i res = y_convolve_4tap_2x2_ssse3(
                            src_ptr, src_stride, coeffs_128, s_16, ss_128);
                        jnt_avg_round_store_2x2_sse2(
                            res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
                        ss_128[0] = ss_128[1];
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    __m128i s_32[4], ss_128[2];

                    assert(w == 4);

                    s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
                    s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
                    s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));

                    const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
                    const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);

                    ss_128[0] = _mm_unpacklo_epi8(src01, src12);

                    do {
                        src_ptr += 2 * src_stride;
                        const __m128i res = y_convolve_4tap_4x2_ssse3(
                            src_ptr, src_stride, coeffs_128, s_32, ss_128);
                        jnt_avg_round_store_4x2_sse2(
                            res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
                        ss_128[0] = ss_128[1];
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);

                prepare_half_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);

                if (w == 8) {
                    __m128i s_64[4];
                    __m256i ss_256[2];

                    s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
                    s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
                    s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));

                    // Load lines a and b. Line a to lower 128, line b to upper
                    // 128
                    const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
                    const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);

                    ss_256[0] = _mm256_unpacklo_epi8(src01, src12);

                    do {
                        src_ptr += 2 * src_stride;
                        const __m256i res = y_convolve_4tap_8x2_avx2(
                            src_ptr, src_stride, coeffs_256, s_64, ss_256);
                        jnt_avg_round_store_8x2_avx2(
                            res, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                        ss_256[0] = ss_256[1];
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    __m128i s_128[4];
                    __m256i ss_256[4], r[2];

                    assert(w == 16);

                    s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
                    s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
                    s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));

                    // Load lines a and b. Line a to lower 128, line b to upper
                    // 128
                    const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
                    const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);

                    ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
                    ss_256[2] = _mm256_unpackhi_epi8(src01, src12);

                    do {
                        src_ptr += 2 * src_stride;
                        y_convolve_4tap_16x2_avx2(
                            src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
                        jnt_avg_round_store_16x2_avx2(
                            r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                        ss_256[0] = ss_256[1];
                        ss_256[2] = ss_256[3];
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            }
        }
    } else {
        const int16_t offset_no_avg = (round_offset << (round_1 - bits - 1)) +
            (1 << (round_1 - bits - 2));

        if (w <= 4) {
            const __m128i offset_no_avg_128 = _mm_set1_epi16(offset_no_avg);

            prepare_half_coeffs_4tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);

            if (w == 2) {
                __m128i s_16[4], ss_128[2];

                s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
                s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
                s_16[2] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));

                const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
                const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);

                ss_128[0] = _mm_unpacklo_epi8(src01, src12);

                do {
                    src_ptr += 2 * src_stride;
                    const __m128i res = y_convolve_4tap_2x2_ssse3(
                        src_ptr, src_stride, coeffs_128, s_16, ss_128);
                    jnt_no_avg_round_store_2x2_sse2(res, offset_no_avg_128, dst, dst_stride);
                    ss_128[0] = ss_128[1];
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            } else {
                __m128i s_32[4], ss_128[2];

                assert(w == 4);

                s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
                s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
                s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));

                const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
                const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);

                ss_128[0] = _mm_unpacklo_epi8(src01, src12);

                do {
                    src_ptr += 2 * src_stride;
                    const __m128i res = y_convolve_4tap_4x2_ssse3(
                        src_ptr, src_stride, coeffs_128, s_32, ss_128);
                    jnt_no_avg_round_store_4x2_sse2(res, offset_no_avg_128, dst, dst_stride);
                    ss_128[0] = ss_128[1];
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        } else {
            const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);

            prepare_half_coeffs_4tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);

            if (w == 8) {
                __m128i s_64[4];
                __m256i ss_256[2];

                s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
                s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
                s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));

                // Load lines a and b. Line a to lower 128, line b to upper 128
                const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
                const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);

                ss_256[0] = _mm256_unpacklo_epi8(src01, src12);

                do {
                    src_ptr += 2 * src_stride;
                    const __m256i res = y_convolve_4tap_8x2_avx2(
                        src_ptr, src_stride, coeffs_256, s_64, ss_256);
                    jnt_no_avg_round_store_8x2_avx2(res, offset_no_avg_256, dst, dst_stride);
                    ss_256[0] = ss_256[1];
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            } else {
                __m128i s_128[4];
                __m256i ss_256[4], r[2];

                assert(w == 16);

                s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
                s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
1105                 s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
1106 
1107                 // Load lines a and b. Line a to lower 128, line b to upper 128
1108                 const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
1109                 const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
1110 
1111                 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
1112                 ss_256[2] = _mm256_unpackhi_epi8(src01, src12);
1113 
1114                 do {
1115                     src_ptr += 2 * src_stride;
1116                     y_convolve_4tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
1117                     jnt_no_avg_round_store_16x2_avx2(r, offset_no_avg_256, dst, dst_stride);
1118                     ss_256[0] = ss_256[1];
1119                     ss_256[2] = ss_256[3];
1120                     dst += 2 * dst_stride;
1121                     y -= 2;
1122                 } while (y);
1123             }
1124         }
1125     }
1126 }
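
/*
 * For reference, a scalar sketch of what the three jnt_*_round_store flavors
 * used by these kernels compute per pixel (illustrative only -- the vector
 * helpers work with halved coefficients and pre-folded offsets, but the
 * arithmetic below is the intended result):
 *
 *   int32_t res = convolve(src) + round_offset;    // filtered, rounded, offset
 *   if (!do_average) {
 *       dst[x] = res;                              // first pass: store only
 *   } else if (use_jnt_comp_avg) {                 // distance-weighted blend
 *       int32_t tmp = dst[x] * fwd_offset + res * bck_offset;
 *       tmp >>= DIST_PRECISION_BITS;
 *       dst8[x] = clip_pixel(ROUND_POWER_OF_TWO(tmp - round_offset, round_bits));
 *   } else {                                       // equal-weight average
 *       int32_t tmp = (dst[x] + res) >> 1;
 *       dst8[x] = clip_pixel(ROUND_POWER_OF_TWO(tmp - round_offset, round_bits));
 *   }
 */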
1127 
1128 static void jnt_convolve_y_6tap_avx2(const uint8_t *const src, const int32_t src_stride,
1129                                      uint8_t *dst8, const int32_t dst8_stride, const int32_t w,
1130                                      const int32_t                   h,
1131                                      const InterpFilterParams *const filter_params_y,
1132                                      const int32_t                   subpel_y_q4,
1133                                      const ConvolveParams *const     conv_params) {
1134     const uint8_t *src_ptr      = src - 2 * src_stride;
1135     const int32_t  dst_stride   = conv_params->dst_stride;
1136     const int32_t  round_0      = 3;
1137     const int32_t  round_1      = COMPOUND_ROUND1_BITS;
1138     const int32_t  bits         = FILTER_BITS - round_0;
1139     const int32_t  bd           = 8;
1140     const int32_t  round_bits   = 2 * FILTER_BITS - round_0 - round_1;
1141     const int32_t  offset_bits  = bd + round_bits;
1142     const int32_t  round_offset = (1 << offset_bits) + (1 << (offset_bits - 1));
1143     ConvBufType *  dst          = conv_params->dst;
1144     int32_t        x;
1145     __m128i        coeffs_128[4];
1146     __m256i        coeffs_256[4];
1147 
1148     if (conv_params->do_average) {
1149         if (conv_params->use_jnt_comp_avg) {
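            // factor packs fwd_offset into the low and bck_offset into the high
            // 16 bits of each 32-bit lane, so a single pmaddwd can apply both
            // distance weights to interleaved (reference, result) pairs;
            // offset_comp_avg folds round_offset and the rounding bias into one
            // additive constant.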
1150             const int32_t factor = conv_params->fwd_offset | (conv_params->bck_offset << 16);
1151             const int32_t offset_comp_avg = round_offset * conv_params->bck_offset +
1152                 (1 << (round_bits + DIST_PRECISION_BITS - 1)) -
1153                 (round_offset << DIST_PRECISION_BITS);
1154 
1155             if (w <= 4) {
1156                 const __m128i factor_128          = _mm_set1_epi32(factor);
1157                 const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
1158 
1159                 prepare_half_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
1160 
1161                 int32_t y = h;
1162 
1163                 if (w == 2) {
1164                     __m128i s_16[6], ss_128[3];
1165 
1166                     s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
1167                     s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
1168                     s_16[2] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
1169                     s_16[3] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 3 * src_stride));
1170                     s_16[4] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 4 * src_stride));
1171 
1172                     const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
1173                     const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
1174                     const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
1175                     const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
1176 
1177                     ss_128[0] = _mm_unpacklo_epi8(src01, src12);
1178                     ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1179 
1180                     do {
1181                         src_ptr += 2 * src_stride;
1182                         const __m128i res = y_convolve_6tap_2x2_ssse3(
1183                             src_ptr, src_stride, coeffs_128, s_16, ss_128);
1184                         jnt_comp_avg_round_store_2x2_sse2(res,
1185                                                           factor_128,
1186                                                           offset_comp_avg_128,
1187                                                           dst,
1188                                                           dst_stride,
1189                                                           dst8,
1190                                                           dst8_stride);
1191                         ss_128[0] = ss_128[1];
1192                         ss_128[1] = ss_128[2];
1193                         dst += 2 * dst_stride;
1194                         dst8 += 2 * dst8_stride;
1195                         y -= 2;
1196                     } while (y);
1197                 } else {
1198                     __m128i s_32[6], ss_128[3];
1199 
1200                     assert(w == 4);
1201 
1202                     s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
1203                     s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
1204                     s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
1205                     s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 3 * src_stride));
1206                     s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 4 * src_stride));
1207 
1208                     const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1209                     const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
1210                     const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1211                     const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
1212 
1213                     ss_128[0] = _mm_unpacklo_epi8(src01, src12);
1214                     ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1215 
1216                     do {
1217                         src_ptr += 2 * src_stride;
1218                         const __m128i res = y_convolve_6tap_4x2_ssse3(
1219                             src_ptr, src_stride, coeffs_128, s_32, ss_128);
1220                         jnt_comp_avg_round_store_4x2_sse2(res,
1221                                                           factor_128,
1222                                                           offset_comp_avg_128,
1223                                                           dst,
1224                                                           dst_stride,
1225                                                           dst8,
1226                                                           dst8_stride);
1227                         ss_128[0] = ss_128[1];
1228                         ss_128[1] = ss_128[2];
1229                         dst += 2 * dst_stride;
1230                         dst8 += 2 * dst8_stride;
1231                         y -= 2;
1232                     } while (y);
1233                 }
1234             } else {
1235                 const __m256i factor_256          = _mm256_set1_epi32(factor);
1236                 const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
1237 
1238                 prepare_half_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
1239 
1240                 if (w == 8) {
1241                     __m128i s_64[6];
1242                     __m256i ss_256[3];
1243 
1244                     s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
1245                     s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
1246                     s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
1247                     s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
1248                     s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
1249 
1250                     // Load lines a and b. Line a to lower 128, line b to upper
1251                     // 128
1252                     const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
1253                     const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
1254                     const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
1255                     const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
1256 
1257                     ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
1258                     ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1259 
1260                     int32_t y = h;
1261                     do {
1262                         src_ptr += 2 * src_stride;
1263                         const __m256i res = y_convolve_6tap_8x2_avx2(
1264                             src_ptr, src_stride, coeffs_256, s_64, ss_256);
1265                         jnt_comp_avg_round_store_8x2_avx2(res,
1266                                                           factor_256,
1267                                                           offset_comp_avg_256,
1268                                                           dst,
1269                                                           dst_stride,
1270                                                           dst8,
1271                                                           dst8_stride);
1272                         ss_256[0] = ss_256[1];
1273                         ss_256[1] = ss_256[2];
1274                         dst += 2 * dst_stride;
1275                         dst8 += 2 * dst8_stride;
1276                         y -= 2;
1277                     } while (y);
1278                 } else if (w == 16) {
1279                     __m128i s_128[6];
1280                     __m256i ss_256[6], r[2];
1281 
1282                     s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
1283                     s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
1284                     s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
1285                     s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
1286                     s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
1287 
1288                     // Load lines a and b. Line a to lower 128, line b to upper
1289                     // 128
1290                     const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
1291                     const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
1292                     const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
1293                     const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
1294 
1295                     ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
1296                     ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1297 
1298                     ss_256[3] = _mm256_unpackhi_epi8(src01, src12);
1299                     ss_256[4] = _mm256_unpackhi_epi8(src23, src34);
1300 
1301                     int32_t y = h;
1302                     do {
1303                         src_ptr += 2 * src_stride;
1304                         y_convolve_6tap_16x2_avx2(
1305                             src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
1306                         jnt_comp_avg_round_store_16x2_avx2(
1307                             r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
1308                         ss_256[0] = ss_256[1];
1309                         ss_256[1] = ss_256[2];
1310                         ss_256[3] = ss_256[4];
1311                         ss_256[4] = ss_256[5];
1312                         dst += 2 * dst_stride;
1313                         dst8 += 2 * dst8_stride;
1314                         y -= 2;
1315                     } while (y);
1316                 } else {
1317                     __m256i s_256[6], ss_256[6], tt_256[6], r[4];
1318 
1319                     assert(!(w % 32));
1320 
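                    // w >= 32: process 32 columns by 2 rows per step. ss_256
                    // holds the interleaved row pairs feeding the first output
                    // row and tt_256 those feeding the second, so each step
                    // reuses the unpacking done on the previous one.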
1321                     x = 0;
1322                     do {
1323                         const uint8_t *s  = src_ptr + x;
1324                         ConvBufType *  d  = dst + x;
1325                         uint8_t *      d8 = dst8 + x;
1326 
1327                         s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
1328                         s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
1329                         s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
1330                         s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
1331                         s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
1332 
1333                         ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1334                         ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
1335                         ss_256[3] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
1336                         ss_256[4] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
1337 
1338                         tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
1339                         tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
1340                         tt_256[3] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
1341                         tt_256[4] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
1342 
1343                         int32_t y = h;
1344                         do {
1345                             s += 2 * src_stride;
1346                             y_convolve_6tap_32x2_avx2(
1347                                 s, src_stride, coeffs_256, s_256, ss_256, tt_256, r);
1348                             jnt_comp_avg_round_store_32_avx2(
1349                                 r, factor_256, offset_comp_avg_256, d, d8);
1350                             jnt_comp_avg_round_store_32_avx2(r + 2,
1351                                                              factor_256,
1352                                                              offset_comp_avg_256,
1353                                                              d + dst_stride,
1354                                                              d8 + dst8_stride);
1355 
1356                             ss_256[0] = ss_256[1];
1357                             ss_256[1] = ss_256[2];
1358                             ss_256[3] = ss_256[4];
1359                             ss_256[4] = ss_256[5];
1360 
1361                             tt_256[0] = tt_256[1];
1362                             tt_256[1] = tt_256[2];
1363                             tt_256[3] = tt_256[4];
1364                             tt_256[4] = tt_256[5];
1365                             d += 2 * dst_stride;
1366                             d8 += 2 * dst8_stride;
1367                             y -= 2;
1368                         } while (y);
1369 
1370                         x += 32;
1371                     } while (x < w);
1372                 }
1373             }
1374         } else {
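            // Equal-weight compound average: the second pass computes
            // (reference + result) >> 1, with the compound offset removal and
            // the final rounding folded into the single offset_avg constant.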
1375             const int16_t offset_avg = (1 << (FILTER_BITS - 1)) + (1 << (round_1 - bits - 2)) -
1376                 (round_offset << (round_1 - bits - 1));
1377 
1378             if (w <= 4) {
1379                 const __m128i offset_avg_128 = _mm_set1_epi16(offset_avg);
1380 
1381                 prepare_half_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
1382 
1383                 int32_t y = h;
1384 
1385                 if (w == 2) {
1386                     __m128i s_16[6], ss_128[3];
1387 
1388                     s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
1389                     s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
1390                     s_16[2] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
1391                     s_16[3] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 3 * src_stride));
1392                     s_16[4] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 4 * src_stride));
1393 
1394                     const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
1395                     const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
1396                     const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
1397                     const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
1398 
1399                     ss_128[0] = _mm_unpacklo_epi8(src01, src12);
1400                     ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1401 
1402                     do {
1403                         src_ptr += 2 * src_stride;
1404                         const __m128i res = y_convolve_6tap_2x2_ssse3(
1405                             src_ptr, src_stride, coeffs_128, s_16, ss_128);
1406                         jnt_avg_round_store_2x2_sse2(
1407                             res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
1408                         ss_128[0] = ss_128[1];
1409                         ss_128[1] = ss_128[2];
1410                         dst += 2 * dst_stride;
1411                         dst8 += 2 * dst8_stride;
1412                         y -= 2;
1413                     } while (y);
1414                 } else {
1415                     __m128i s_32[6], ss_128[3];
1416 
1417                     assert(w == 4);
1418 
1419                     s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
1420                     s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
1421                     s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
1422                     s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 3 * src_stride));
1423                     s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 4 * src_stride));
1424 
1425                     const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1426                     const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
1427                     const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1428                     const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
1429 
1430                     ss_128[0] = _mm_unpacklo_epi8(src01, src12);
1431                     ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1432 
1433                     do {
1434                         src_ptr += 2 * src_stride;
1435                         const __m128i res = y_convolve_6tap_4x2_ssse3(
1436                             src_ptr, src_stride, coeffs_128, s_32, ss_128);
1437                         jnt_avg_round_store_4x2_sse2(
1438                             res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
1439                         ss_128[0] = ss_128[1];
1440                         ss_128[1] = ss_128[2];
1441                         dst += 2 * dst_stride;
1442                         dst8 += 2 * dst8_stride;
1443                         y -= 2;
1444                     } while (y);
1445                 }
1446             } else {
1447                 const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);
1448 
1449                 prepare_half_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
1450 
1451                 if (w == 8) {
1452                     __m128i s_64[6];
1453                     __m256i ss_256[3];
1454 
1455                     s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
1456                     s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
1457                     s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
1458                     s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
1459                     s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
1460 
1461                     // Load lines a and b. Line a to lower 128, line b to upper
1462                     // 128
1463                     const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
1464                     const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
1465                     const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
1466                     const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
1467 
1468                     ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
1469                     ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1470 
1471                     int32_t y = h;
1472                     do {
1473                         src_ptr += 2 * src_stride;
1474                         const __m256i res = y_convolve_6tap_8x2_avx2(
1475                             src_ptr, src_stride, coeffs_256, s_64, ss_256);
1476                         jnt_avg_round_store_8x2_avx2(
1477                             res, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
1478                         ss_256[0] = ss_256[1];
1479                         ss_256[1] = ss_256[2];
1480                         dst += 2 * dst_stride;
1481                         dst8 += 2 * dst8_stride;
1482                         y -= 2;
1483                     } while (y);
1484                 } else if (w == 16) {
1485                     __m128i s_128[6];
1486                     __m256i ss_256[6], r[2];
1487 
1488                     s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
1489                     s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
1490                     s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
1491                     s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
1492                     s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
1493 
1494                     // Load lines a and b. Line a to lower 128, line b to upper
1495                     // 128
1496                     const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
1497                     const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
1498                     const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
1499                     const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
1500 
1501                     ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
1502                     ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1503 
1504                     ss_256[3] = _mm256_unpackhi_epi8(src01, src12);
1505                     ss_256[4] = _mm256_unpackhi_epi8(src23, src34);
1506 
1507                     int32_t y = h;
1508                     do {
1509                         src_ptr += 2 * src_stride;
1510                         y_convolve_6tap_16x2_avx2(
1511                             src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
1512                         jnt_avg_round_store_16x2_avx2(
1513                             r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
1514                         ss_256[0] = ss_256[1];
1515                         ss_256[1] = ss_256[2];
1516                         ss_256[3] = ss_256[4];
1517                         ss_256[4] = ss_256[5];
1518                         dst += 2 * dst_stride;
1519                         dst8 += 2 * dst8_stride;
1520                         y -= 2;
1521                     } while (y);
1522                 } else {
1523                     __m256i s_256[6], ss_256[6], tt_256[6], r[4];
1524 
1525                     assert(!(w % 32));
1526 
1527                     x = 0;
1528                     do {
1529                         const uint8_t *s  = src_ptr + x;
1530                         ConvBufType *  d  = dst + x;
1531                         uint8_t *      d8 = dst8 + x;
1532 
1533                         s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
1534                         s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
1535                         s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
1536                         s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
1537                         s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
1538 
1539                         ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1540                         ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
1541                         ss_256[3] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
1542                         ss_256[4] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
1543 
1544                         tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
1545                         tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
1546                         tt_256[3] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
1547                         tt_256[4] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
1548 
1549                         int32_t y = h;
1550                         do {
1551                             s += 2 * src_stride;
1552                             y_convolve_6tap_32x2_avx2(
1553                                 s, src_stride, coeffs_256, s_256, ss_256, tt_256, r);
1554                             jnt_avg_round_store_32_avx2(r, offset_avg_256, d, d8);
1555                             jnt_avg_round_store_32_avx2(
1556                                 r + 2, offset_avg_256, d + dst_stride, d8 + dst8_stride);
1557 
1558                             ss_256[0] = ss_256[1];
1559                             ss_256[1] = ss_256[2];
1560                             ss_256[3] = ss_256[4];
1561                             ss_256[4] = ss_256[5];
1562 
1563                             tt_256[0] = tt_256[1];
1564                             tt_256[1] = tt_256[2];
1565                             tt_256[3] = tt_256[4];
1566                             tt_256[4] = tt_256[5];
1567                             d += 2 * dst_stride;
1568                             d8 += 2 * dst8_stride;
1569                             y -= 2;
1570                         } while (y);
1571 
1572                         x += 32;
1573                     } while (x < w);
1574                 }
1575             }
1576         }
1577     } else {
1578         const int16_t offset_no_avg = (round_offset << (round_1 - bits - 1)) +
1579             (1 << (round_1 - bits - 2));
1580 
1581         if (w <= 4) {
1582             const __m128i offset_no_avg_128 = _mm_set1_epi16(offset_no_avg);
1583 
1584             prepare_half_coeffs_6tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
1585 
1586             int32_t y = h;
1587 
1588             if (w == 2) {
1589                 __m128i s_16[6], ss_128[3];
1590 
1591                 s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
1592                 s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
1593                 s_16[2] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
1594                 s_16[3] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 3 * src_stride));
1595                 s_16[4] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 4 * src_stride));
1596 
1597                 const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
1598                 const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
1599                 const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
1600                 const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
1601 
1602                 ss_128[0] = _mm_unpacklo_epi8(src01, src12);
1603                 ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1604 
1605                 do {
1606                     src_ptr += 2 * src_stride;
1607                     const __m128i res = y_convolve_6tap_2x2_ssse3(
1608                         src_ptr, src_stride, coeffs_128, s_16, ss_128);
1609                     jnt_no_avg_round_store_2x2_sse2(res, offset_no_avg_128, dst, dst_stride);
1610                     ss_128[0] = ss_128[1];
1611                     ss_128[1] = ss_128[2];
1612                     dst += 2 * dst_stride;
1613                     y -= 2;
1614                 } while (y);
1615             } else {
1616                 __m128i s_32[6], ss_128[3];
1617 
1618                 assert(w == 4);
1619 
1620                 s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
1621                 s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
1622                 s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
1623                 s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 3 * src_stride));
1624                 s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 4 * src_stride));
1625 
1626                 const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1627                 const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
1628                 const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1629                 const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
1630 
1631                 ss_128[0] = _mm_unpacklo_epi8(src01, src12);
1632                 ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1633 
1634                 do {
1635                     src_ptr += 2 * src_stride;
1636                     const __m128i res = y_convolve_6tap_4x2_ssse3(
1637                         src_ptr, src_stride, coeffs_128, s_32, ss_128);
1638                     jnt_no_avg_round_store_4x2_sse2(res, offset_no_avg_128, dst, dst_stride);
1639                     ss_128[0] = ss_128[1];
1640                     ss_128[1] = ss_128[2];
1641                     dst += 2 * dst_stride;
1642                     y -= 2;
1643                 } while (y);
1644             }
1645         } else {
1646             const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);
1647 
1648             prepare_half_coeffs_6tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
1649 
1650             if (w == 8) {
1651                 __m128i s_64[6];
1652                 __m256i ss_256[3];
1653 
1654                 s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
1655                 s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
1656                 s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
1657                 s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
1658                 s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
1659 
1660                 // Load lines a and b. Line a to lower 128, line b to upper 128
1661                 const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
1662                 const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
1663                 const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
1664                 const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
1665 
1666                 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
1667                 ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1668 
1669                 int32_t y = h;
1670                 do {
1671                     src_ptr += 2 * src_stride;
1672                     const __m256i res = y_convolve_6tap_8x2_avx2(
1673                         src_ptr, src_stride, coeffs_256, s_64, ss_256);
1674                     jnt_no_avg_round_store_8x2_avx2(res, offset_no_avg_256, dst, dst_stride);
1675                     ss_256[0] = ss_256[1];
1676                     ss_256[1] = ss_256[2];
1677                     dst += 2 * dst_stride;
1678                     y -= 2;
1679                 } while (y);
1680             } else if (w == 16) {
1681                 __m128i s_128[6];
1682                 __m256i ss_256[6], r[2];
1683 
1684                 s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
1685                 s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
1686                 s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
1687                 s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
1688                 s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
1689 
1690                 // Load lines a and b. Line a to lower 128, line b to upper 128
1691                 const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
1692                 const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
1693                 const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
1694                 const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
1695 
1696                 ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
1697                 ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1698 
1699                 ss_256[3] = _mm256_unpackhi_epi8(src01, src12);
1700                 ss_256[4] = _mm256_unpackhi_epi8(src23, src34);
1701 
1702                 int32_t y = h;
1703                 do {
1704                     src_ptr += 2 * src_stride;
1705                     y_convolve_6tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
1706                     jnt_no_avg_round_store_16x2_avx2(r, offset_no_avg_256, dst, dst_stride);
1707                     ss_256[0] = ss_256[1];
1708                     ss_256[1] = ss_256[2];
1709                     ss_256[3] = ss_256[4];
1710                     ss_256[4] = ss_256[5];
1711                     dst += 2 * dst_stride;
1712                     y -= 2;
1713                 } while (y);
1714             } else {
1715                 __m256i s_256[6], ss_256[6], tt_256[6], r[4];
1716 
1717                 assert(!(w % 32));
1718 
1719                 x = 0;
1720                 do {
1721                     const uint8_t *s = src_ptr + x;
1722                     ConvBufType *  d = dst + x;
1723 
1724                     s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
1725                     s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
1726                     s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
1727                     s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
1728                     s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
1729 
1730                     ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
1731                     ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
1732                     ss_256[3] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
1733                     ss_256[4] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
1734 
1735                     tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
1736                     tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
1737                     tt_256[3] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
1738                     tt_256[4] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
1739 
1740                     int32_t y = h;
1741                     do {
1742                         s += 2 * src_stride;
1743                         y_convolve_6tap_32x2_avx2(
1744                             s, src_stride, coeffs_256, s_256, ss_256, tt_256, r);
1745                         jnt_no_avg_round_store_32_avx2(r, offset_no_avg_256, d);
1746                         jnt_no_avg_round_store_32_avx2(r + 2, offset_no_avg_256, d + dst_stride);
1747 
1748                         ss_256[0] = ss_256[1];
1749                         ss_256[1] = ss_256[2];
1750                         ss_256[3] = ss_256[4];
1751                         ss_256[4] = ss_256[5];
1752 
1753                         tt_256[0] = tt_256[1];
1754                         tt_256[1] = tt_256[2];
1755                         tt_256[3] = tt_256[4];
1756                         tt_256[4] = tt_256[5];
1757                         d += 2 * dst_stride;
1758                         y -= 2;
1759                     } while (y);
1760 
1761                     x += 32;
1762                 } while (x < w);
1763             }
1764         }
1765     }
1766 }
1767 
1768 static void jnt_convolve_y_8tap_avx2(const uint8_t *const src, const int32_t src_stride,
1769                                      uint8_t *dst8, const int32_t dst8_stride, const int32_t w,
1770                                      const int32_t                   h,
1771                                      const InterpFilterParams *const filter_params_y,
1772                                      const int32_t                   subpel_y_q4,
1773                                      const ConvolveParams *const     conv_params) {
1774     const uint8_t *src_ptr      = src - 3 * src_stride;
1775     const int32_t  dst_stride   = conv_params->dst_stride;
1776     const int32_t  round_0      = 3;
1777     const int32_t  round_1      = COMPOUND_ROUND1_BITS;
1778     const int32_t  bits         = FILTER_BITS - round_0;
1779     const int32_t  bd           = 8;
1780     const int32_t  round_bits   = 2 * FILTER_BITS - round_0 - round_1;
1781     const int32_t  offset_bits  = bd + round_bits;
1782     const int32_t  round_offset = (1 << offset_bits) + (1 << (offset_bits - 1));
1783     ConvBufType *  dst          = conv_params->dst;
1784     int32_t        x;
1785     __m128i        coeffs_128[4];
1786     __m256i        coeffs_256[4];
1787 
1788     if (conv_params->do_average) {
1789         if (conv_params->use_jnt_comp_avg) {
1790             const int32_t factor = conv_params->fwd_offset | (conv_params->bck_offset << 16);
1791             const int32_t offset_comp_avg = round_offset * conv_params->bck_offset +
1792                 (1 << (round_bits + DIST_PRECISION_BITS - 1)) -
1793                 (round_offset << DIST_PRECISION_BITS);
1794 
1795             if (w <= 4) {
1796                 const __m128i factor_128          = _mm_set1_epi32(factor);
1797                 const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);
1798 
1799                 prepare_half_coeffs_8tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);
1800 
1801                 int32_t y = h;
1802 
1803                 if (w == 2) {
1804                     __m128i s_16[8], ss_128[4];
1805 
1806                     s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
1807                     s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
1808                     s_16[2] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
1809                     s_16[3] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 3 * src_stride));
1810                     s_16[4] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 4 * src_stride));
1811                     s_16[5] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 5 * src_stride));
1812                     s_16[6] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 6 * src_stride));
1813 
1814                     const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
1815                     const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
1816                     const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
1817                     const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
1818                     const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
1819                     const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[6]);
1820 
1821                     ss_128[0] = _mm_unpacklo_epi8(src01, src12);
1822                     ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1823                     ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1824 
1825                     do {
1826                         const __m128i res = y_convolve_8tap_2x2_ssse3(
1827                             src_ptr, src_stride, coeffs_128, s_16, ss_128);
1828                         jnt_comp_avg_round_store_2x2_sse2(res,
1829                                                           factor_128,
1830                                                           offset_comp_avg_128,
1831                                                           dst,
1832                                                           dst_stride,
1833                                                           dst8,
1834                                                           dst8_stride);
1835                         ss_128[0] = ss_128[1];
1836                         ss_128[1] = ss_128[2];
1837                         ss_128[2] = ss_128[3];
1838                         src_ptr += 2 * src_stride;
1839                         dst += 2 * dst_stride;
1840                         dst8 += 2 * dst8_stride;
1841                         y -= 2;
1842                     } while (y);
1843                 } else {
1844                     __m128i s_32[8], ss_128[4];
1845 
1846                     assert(w == 4);
1847 
1848                     s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
1849                     s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
1850                     s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
1851                     s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 3 * src_stride));
1852                     s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 4 * src_stride));
1853                     s_32[5] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 5 * src_stride));
1854                     s_32[6] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 6 * src_stride));
1855 
1856                     const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
1857                     const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
1858                     const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
1859                     const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
1860                     const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
1861                     const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);
1862 
1863                     ss_128[0] = _mm_unpacklo_epi8(src01, src12);
1864                     ss_128[1] = _mm_unpacklo_epi8(src23, src34);
1865                     ss_128[2] = _mm_unpacklo_epi8(src45, src56);
1866 
1867                     do {
1868                         const __m128i res = y_convolve_8tap_4x2_ssse3(
1869                             src_ptr, src_stride, coeffs_128, s_32, ss_128);
1870                         jnt_comp_avg_round_store_4x2_sse2(res,
1871                                                           factor_128,
1872                                                           offset_comp_avg_128,
1873                                                           dst,
1874                                                           dst_stride,
1875                                                           dst8,
1876                                                           dst8_stride);
1877                         ss_128[0] = ss_128[1];
1878                         ss_128[1] = ss_128[2];
1879                         ss_128[2] = ss_128[3];
1880                         src_ptr += 2 * src_stride;
1881                         dst += 2 * dst_stride;
1882                         dst8 += 2 * dst8_stride;
1883                         y -= 2;
1884                     } while (y);
1885                 }
1886             } else {
1887                 const __m256i factor_256          = _mm256_set1_epi32(factor);
1888                 const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
1889 
1890                 prepare_half_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);
1891 
1892                 if (w == 8) {
1893                     __m128i s_64[8];
1894                     __m256i ss_256[4];
1895 
1896                     s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
1897                     s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
1898                     s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
1899                     s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
1900                     s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
1901                     s_64[5] = _mm_loadl_epi64((__m128i *)(src_ptr + 5 * src_stride));
1902                     s_64[6] = _mm_loadl_epi64((__m128i *)(src_ptr + 6 * src_stride));
1903 
1904                     // Load lines a and b. Line a to lower 128, line b to upper
1905                     // 128
1906                     const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
1907                     const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
1908                     const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
1909                     const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
1910                     const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
1911                     const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[6]);
1912 
1913                     ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
1914                     ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1915                     ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1916 
1917                     int32_t y = h;
1918                     do {
1919                         const __m256i res = y_convolve_8tap_8x2_avx2(
1920                             src_ptr, src_stride, coeffs_256, s_64, ss_256);
1921                         jnt_comp_avg_round_store_8x2_avx2(res,
1922                                                           factor_256,
1923                                                           offset_comp_avg_256,
1924                                                           dst,
1925                                                           dst_stride,
1926                                                           dst8,
1927                                                           dst8_stride);
1928                         ss_256[0] = ss_256[1];
1929                         ss_256[1] = ss_256[2];
1930                         ss_256[2] = ss_256[3];
1931                         src_ptr += 2 * src_stride;
1932                         dst += 2 * dst_stride;
1933                         dst8 += 2 * dst8_stride;
1934                         y -= 2;
1935                     } while (y);
1936                 } else if (w == 16) {
1937                     __m128i s_128[8];
1938                     __m256i ss_256[8], r[2];
1939 
1940                     s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
1941                     s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
1942                     s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
1943                     s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
1944                     s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
1945                     s_128[5] = _mm_loadu_si128((__m128i *)(src_ptr + 5 * src_stride));
1946                     s_128[6] = _mm_loadu_si128((__m128i *)(src_ptr + 6 * src_stride));
1947 
1948                     // Load lines a and b. Line a to lower 128, line b to upper
1949                     // 128
1950                     const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
1951                     const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
1952                     const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
1953                     const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
1954                     const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
1955                     const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[6]);
1956 
1957                     ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
1958                     ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
1959                     ss_256[2] = _mm256_unpacklo_epi8(src45, src56);
1960 
1961                     ss_256[4] = _mm256_unpackhi_epi8(src01, src12);
1962                     ss_256[5] = _mm256_unpackhi_epi8(src23, src34);
1963                     ss_256[6] = _mm256_unpackhi_epi8(src45, src56);
1964 
1965                     int32_t y = h;
1966                     do {
1967                         y_convolve_8tap_16x2_avx2(
1968                             src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
1969                         jnt_comp_avg_round_store_16x2_avx2(
1970                             r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
1971                         ss_256[0] = ss_256[1];
1972                         ss_256[1] = ss_256[2];
1973                         ss_256[2] = ss_256[3];
1974                         ss_256[4] = ss_256[5];
1975                         ss_256[5] = ss_256[6];
1976                         ss_256[6] = ss_256[7];
1977                         src_ptr += 2 * src_stride;
1978                         dst += 2 * dst_stride;
1979                         dst8 += 2 * dst8_stride;
1980                         y -= 2;
1981                     } while (y);
1982                 } else {
1983                     __m256i s_256[8], ss_256[8], tt_256[8], r[4];
1984 
1985                     assert(!(w % 32));
1986 
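                    // Same dual-window scheme as the 6-tap w >= 32 path, widened
                    // to three row-pair terms per phase (ss_256[0..2]/[4..6] and
                    // the tt_256 counterparts) for the 8-tap filter.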
1987                     x = 0;
1988                     do {
1989                         const uint8_t *s  = src_ptr + x;
1990                         ConvBufType *  d  = dst + x;
1991                         uint8_t *      d8 = dst8 + x;
1992 
1993                         s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
1994                         s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
1995                         s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
1996                         s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
1997                         s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
1998                         s_256[5] = _mm256_loadu_si256((__m256i *)(s + 5 * src_stride));
1999                         s_256[6] = _mm256_loadu_si256((__m256i *)(s + 6 * src_stride));
2000 
2001                         ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
2002                         ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
2003                         ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
2004                         ss_256[4] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
2005                         ss_256[5] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
2006                         ss_256[6] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);
2007 
2008                         tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
2009                         tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
2010                         tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[6]);
2011                         tt_256[4] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
2012                         tt_256[5] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
2013                         tt_256[6] = _mm256_unpackhi_epi8(s_256[5], s_256[6]);
2014 
2015                         int32_t y = h;
2016                         do {
2017                             y_convolve_8tap_32x2_avx2(
2018                                 s, src_stride, coeffs_256, s_256, ss_256, tt_256, r);
2019                             jnt_comp_avg_round_store_32_avx2(
2020                                 r, factor_256, offset_comp_avg_256, d, d8);
2021                             jnt_comp_avg_round_store_32_avx2(r + 2,
2022                                                              factor_256,
2023                                                              offset_comp_avg_256,
2024                                                              d + dst_stride,
2025                                                              d8 + dst8_stride);
2026 
2027                             ss_256[0] = ss_256[1];
2028                             ss_256[1] = ss_256[2];
2029                             ss_256[2] = ss_256[3];
2030                             ss_256[4] = ss_256[5];
2031                             ss_256[5] = ss_256[6];
2032                             ss_256[6] = ss_256[7];
2033 
2034                             tt_256[0] = tt_256[1];
2035                             tt_256[1] = tt_256[2];
2036                             tt_256[2] = tt_256[3];
2037                             tt_256[4] = tt_256[5];
2038                             tt_256[5] = tt_256[6];
2039                             tt_256[6] = tt_256[7];
2040                             s += 2 * src_stride;
2041                             d += 2 * dst_stride;
2042                             d8 += 2 * dst8_stride;
2043                             y -= 2;
2044                         } while (y);
2045 
2046                         x += 32;
2047                     } while (x < w);
2048                 }
2049             }
2050         } else {
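            // Plain (equal-weight) compound average. offset_avg folds the
            // filter rounding bias and the removal of round_offset into one
            // epi16 constant, so the jnt_avg_round_store_* helpers only need
            // a single add before their final shift; the constant is meant to
            // match the scalar compound-average rounding.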
            const int16_t offset_avg = (1 << (FILTER_BITS - 1)) + (1 << (round_1 - bits - 2)) -
                (round_offset << (round_1 - bits - 1));

            if (w <= 4) {
                const __m128i offset_avg_128 = _mm_set1_epi16(offset_avg);

                prepare_half_coeffs_8tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);

                int32_t y = h;

                if (w == 2) {
                    __m128i s_16[8], ss_128[4];

                    s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
                    s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
                    s_16[2] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
                    s_16[3] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 3 * src_stride));
                    s_16[4] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 4 * src_stride));
                    s_16[5] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 5 * src_stride));
                    s_16[6] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 6 * src_stride));

                    const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
                    const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
                    const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
                    const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
                    const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
                    const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[6]);

                    ss_128[0] = _mm_unpacklo_epi8(src01, src12);
                    ss_128[1] = _mm_unpacklo_epi8(src23, src34);
                    ss_128[2] = _mm_unpacklo_epi8(src45, src56);

                    do {
                        const __m128i res = y_convolve_8tap_2x2_ssse3(
                            src_ptr, src_stride, coeffs_128, s_16, ss_128);
                        jnt_avg_round_store_2x2_sse2(
                            res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
                        ss_128[0] = ss_128[1];
                        ss_128[1] = ss_128[2];
                        ss_128[2] = ss_128[3];
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    __m128i s_32[8], ss_128[4];

                    assert(w == 4);

                    s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
                    s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
                    s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
                    s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 3 * src_stride));
                    s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 4 * src_stride));
                    s_32[5] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 5 * src_stride));
                    s_32[6] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 6 * src_stride));

                    const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
                    const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
                    const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
                    const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
                    const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
                    const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);

                    ss_128[0] = _mm_unpacklo_epi8(src01, src12);
                    ss_128[1] = _mm_unpacklo_epi8(src23, src34);
                    ss_128[2] = _mm_unpacklo_epi8(src45, src56);

                    do {
                        const __m128i res = y_convolve_8tap_4x2_ssse3(
                            src_ptr, src_stride, coeffs_128, s_32, ss_128);
                        jnt_avg_round_store_4x2_sse2(
                            res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
                        ss_128[0] = ss_128[1];
                        ss_128[1] = ss_128[2];
                        ss_128[2] = ss_128[3];
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);

                prepare_half_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);

                if (w == 8) {
                    __m128i s_64[8];
                    __m256i ss_256[4];

                    s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
                    s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
                    s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
                    s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
                    s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
                    s_64[5] = _mm_loadl_epi64((__m128i *)(src_ptr + 5 * src_stride));
                    s_64[6] = _mm_loadl_epi64((__m128i *)(src_ptr + 6 * src_stride));

                    // Load lines a and b. Line a to lower 128, line b to upper 128
                    const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
                    const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
                    const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
                    const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
                    const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
                    const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[6]);

                    ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
                    ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
                    ss_256[2] = _mm256_unpacklo_epi8(src45, src56);

                    int32_t y = h;
                    do {
                        const __m256i res = y_convolve_8tap_8x2_avx2(
                            src_ptr, src_stride, coeffs_256, s_64, ss_256);
                        jnt_avg_round_store_8x2_avx2(
                            res, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                        ss_256[0] = ss_256[1];
                        ss_256[1] = ss_256[2];
                        ss_256[2] = ss_256[3];
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else if (w == 16) {
                    __m128i s_128[8];
                    __m256i ss_256[8], r[2];

                    s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
                    s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
                    s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
                    s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
                    s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
                    s_128[5] = _mm_loadu_si128((__m128i *)(src_ptr + 5 * src_stride));
                    s_128[6] = _mm_loadu_si128((__m128i *)(src_ptr + 6 * src_stride));

                    // Load lines a and b. Line a to lower 128, line b to upper 128
                    const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
                    const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
                    const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
                    const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
                    const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
                    const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[6]);

                    ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
                    ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
                    ss_256[2] = _mm256_unpacklo_epi8(src45, src56);

                    ss_256[4] = _mm256_unpackhi_epi8(src01, src12);
                    ss_256[5] = _mm256_unpackhi_epi8(src23, src34);
                    ss_256[6] = _mm256_unpackhi_epi8(src45, src56);

                    int32_t y = h;
                    do {
                        y_convolve_8tap_16x2_avx2(
                            src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
                        jnt_avg_round_store_16x2_avx2(
                            r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                        ss_256[0] = ss_256[1];
                        ss_256[1] = ss_256[2];
                        ss_256[2] = ss_256[3];
                        ss_256[4] = ss_256[5];
                        ss_256[5] = ss_256[6];
                        ss_256[6] = ss_256[7];
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
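                    // Multiple-of-32 path: s_256 keeps the seven most recent
                    // source rows; ss_256 holds the byte-interleaved row pairs
                    // (0,1), (2,3), (4,5) and tt_256 the pairs (1,2), (3,4),
                    // (5,6), so the inner loop can emit two output rows per
                    // iteration and then just slides both windows down.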
                    __m256i s_256[8], ss_256[8], tt_256[8], r[4];

                    assert(!(w % 32));

                    x = 0;
                    do {
                        const uint8_t *s  = src_ptr + x;
                        ConvBufType *  d  = dst + x;
                        uint8_t *      d8 = dst8 + x;

                        s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
                        s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
                        s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
                        s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
                        s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
                        s_256[5] = _mm256_loadu_si256((__m256i *)(s + 5 * src_stride));
                        s_256[6] = _mm256_loadu_si256((__m256i *)(s + 6 * src_stride));

                        ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
                        ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
                        ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
                        ss_256[4] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
                        ss_256[5] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
                        ss_256[6] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);

                        tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
                        tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
                        tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[6]);
                        tt_256[4] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
                        tt_256[5] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
                        tt_256[6] = _mm256_unpackhi_epi8(s_256[5], s_256[6]);

                        int32_t y = h;
                        do {
                            y_convolve_8tap_32x2_avx2(
                                s, src_stride, coeffs_256, s_256, ss_256, tt_256, r);
                            jnt_avg_round_store_32_avx2(r, offset_avg_256, d, d8);
                            jnt_avg_round_store_32_avx2(
                                r + 2, offset_avg_256, d + dst_stride, d8 + dst8_stride);

                            ss_256[0] = ss_256[1];
                            ss_256[1] = ss_256[2];
                            ss_256[2] = ss_256[3];
                            ss_256[4] = ss_256[5];
                            ss_256[5] = ss_256[6];
                            ss_256[6] = ss_256[7];

                            tt_256[0] = tt_256[1];
                            tt_256[1] = tt_256[2];
                            tt_256[2] = tt_256[3];
                            tt_256[4] = tt_256[5];
                            tt_256[5] = tt_256[6];
                            tt_256[6] = tt_256[7];
                            s += 2 * src_stride;
                            d += 2 * dst_stride;
                            d8 += 2 * dst8_stride;
                            y -= 2;
                        } while (y);

                        x += 32;
                    } while (x < w);
                }
            }
        }
    } else {
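        // do_average == 0: first pass of the compound prediction. The
        // filtered rows are only offset and written to the intermediate
        // ConvBufType buffer; a later pass averages the second prediction
        // against them.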
        const int16_t offset_no_avg = (round_offset << (round_1 - bits - 1)) +
            (1 << (round_1 - bits - 2));

        if (w <= 4) {
            const __m128i offset_no_avg_128 = _mm_set1_epi16(offset_no_avg);

            prepare_half_coeffs_8tap_ssse3(filter_params_y, subpel_y_q4, coeffs_128);

            int32_t y = h;

            if (w == 2) {
                __m128i s_16[8], ss_128[4];

                s_16[0] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 0 * src_stride));
                s_16[1] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 1 * src_stride));
                s_16[2] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 2 * src_stride));
                s_16[3] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 3 * src_stride));
                s_16[4] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 4 * src_stride));
                s_16[5] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 5 * src_stride));
                s_16[6] = _mm_cvtsi32_si128(*(int16_t *)(src_ptr + 6 * src_stride));

                const __m128i src01 = _mm_unpacklo_epi16(s_16[0], s_16[1]);
                const __m128i src12 = _mm_unpacklo_epi16(s_16[1], s_16[2]);
                const __m128i src23 = _mm_unpacklo_epi16(s_16[2], s_16[3]);
                const __m128i src34 = _mm_unpacklo_epi16(s_16[3], s_16[4]);
                const __m128i src45 = _mm_unpacklo_epi16(s_16[4], s_16[5]);
                const __m128i src56 = _mm_unpacklo_epi16(s_16[5], s_16[6]);

                ss_128[0] = _mm_unpacklo_epi8(src01, src12);
                ss_128[1] = _mm_unpacklo_epi8(src23, src34);
                ss_128[2] = _mm_unpacklo_epi8(src45, src56);

                do {
                    const __m128i res = y_convolve_8tap_2x2_ssse3(
                        src_ptr, src_stride, coeffs_128, s_16, ss_128);
                    jnt_no_avg_round_store_2x2_sse2(res, offset_no_avg_128, dst, dst_stride);
                    ss_128[0] = ss_128[1];
                    ss_128[1] = ss_128[2];
                    ss_128[2] = ss_128[3];
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            } else {
                __m128i s_32[8], ss_128[4];

                assert(w == 4);

                s_32[0] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 0 * src_stride));
                s_32[1] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 1 * src_stride));
                s_32[2] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 2 * src_stride));
                s_32[3] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 3 * src_stride));
                s_32[4] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 4 * src_stride));
                s_32[5] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 5 * src_stride));
                s_32[6] = _mm_cvtsi32_si128(*(int32_t *)(src_ptr + 6 * src_stride));

                const __m128i src01 = _mm_unpacklo_epi32(s_32[0], s_32[1]);
                const __m128i src12 = _mm_unpacklo_epi32(s_32[1], s_32[2]);
                const __m128i src23 = _mm_unpacklo_epi32(s_32[2], s_32[3]);
                const __m128i src34 = _mm_unpacklo_epi32(s_32[3], s_32[4]);
                const __m128i src45 = _mm_unpacklo_epi32(s_32[4], s_32[5]);
                const __m128i src56 = _mm_unpacklo_epi32(s_32[5], s_32[6]);

                ss_128[0] = _mm_unpacklo_epi8(src01, src12);
                ss_128[1] = _mm_unpacklo_epi8(src23, src34);
                ss_128[2] = _mm_unpacklo_epi8(src45, src56);

                do {
                    const __m128i res = y_convolve_8tap_4x2_ssse3(
                        src_ptr, src_stride, coeffs_128, s_32, ss_128);
                    jnt_no_avg_round_store_4x2_sse2(res, offset_no_avg_128, dst, dst_stride);
                    ss_128[0] = ss_128[1];
                    ss_128[1] = ss_128[2];
                    ss_128[2] = ss_128[3];
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        } else {
            const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);

            prepare_half_coeffs_8tap_avx2(filter_params_y, subpel_y_q4, coeffs_256);

            if (w == 8) {
                __m128i s_64[8];
                __m256i ss_256[4];

                s_64[0] = _mm_loadl_epi64((__m128i *)(src_ptr + 0 * src_stride));
                s_64[1] = _mm_loadl_epi64((__m128i *)(src_ptr + 1 * src_stride));
                s_64[2] = _mm_loadl_epi64((__m128i *)(src_ptr + 2 * src_stride));
                s_64[3] = _mm_loadl_epi64((__m128i *)(src_ptr + 3 * src_stride));
                s_64[4] = _mm_loadl_epi64((__m128i *)(src_ptr + 4 * src_stride));
                s_64[5] = _mm_loadl_epi64((__m128i *)(src_ptr + 5 * src_stride));
                s_64[6] = _mm_loadl_epi64((__m128i *)(src_ptr + 6 * src_stride));

                // Load lines a and b. Line a to lower 128, line b to upper 128
                const __m256i src01 = _mm256_setr_m128i(s_64[0], s_64[1]);
                const __m256i src12 = _mm256_setr_m128i(s_64[1], s_64[2]);
                const __m256i src23 = _mm256_setr_m128i(s_64[2], s_64[3]);
                const __m256i src34 = _mm256_setr_m128i(s_64[3], s_64[4]);
                const __m256i src45 = _mm256_setr_m128i(s_64[4], s_64[5]);
                const __m256i src56 = _mm256_setr_m128i(s_64[5], s_64[6]);

                ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
                ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
                ss_256[2] = _mm256_unpacklo_epi8(src45, src56);

                int32_t y = h;
                do {
                    const __m256i res = y_convolve_8tap_8x2_avx2(
                        src_ptr, src_stride, coeffs_256, s_64, ss_256);
                    jnt_no_avg_round_store_8x2_avx2(res, offset_no_avg_256, dst, dst_stride);
                    ss_256[0] = ss_256[1];
                    ss_256[1] = ss_256[2];
                    ss_256[2] = ss_256[3];
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            } else if (w == 16) {
                __m128i s_128[8];
                __m256i ss_256[8], r[2];

                s_128[0] = _mm_loadu_si128((__m128i *)(src_ptr + 0 * src_stride));
                s_128[1] = _mm_loadu_si128((__m128i *)(src_ptr + 1 * src_stride));
                s_128[2] = _mm_loadu_si128((__m128i *)(src_ptr + 2 * src_stride));
                s_128[3] = _mm_loadu_si128((__m128i *)(src_ptr + 3 * src_stride));
                s_128[4] = _mm_loadu_si128((__m128i *)(src_ptr + 4 * src_stride));
                s_128[5] = _mm_loadu_si128((__m128i *)(src_ptr + 5 * src_stride));
                s_128[6] = _mm_loadu_si128((__m128i *)(src_ptr + 6 * src_stride));

                // Load lines a and b. Line a to lower 128, line b to upper 128
                const __m256i src01 = _mm256_setr_m128i(s_128[0], s_128[1]);
                const __m256i src12 = _mm256_setr_m128i(s_128[1], s_128[2]);
                const __m256i src23 = _mm256_setr_m128i(s_128[2], s_128[3]);
                const __m256i src34 = _mm256_setr_m128i(s_128[3], s_128[4]);
                const __m256i src45 = _mm256_setr_m128i(s_128[4], s_128[5]);
                const __m256i src56 = _mm256_setr_m128i(s_128[5], s_128[6]);

                ss_256[0] = _mm256_unpacklo_epi8(src01, src12);
                ss_256[1] = _mm256_unpacklo_epi8(src23, src34);
                ss_256[2] = _mm256_unpacklo_epi8(src45, src56);

                ss_256[4] = _mm256_unpackhi_epi8(src01, src12);
                ss_256[5] = _mm256_unpackhi_epi8(src23, src34);
                ss_256[6] = _mm256_unpackhi_epi8(src45, src56);

                int32_t y = h;
                do {
                    y_convolve_8tap_16x2_avx2(src_ptr, src_stride, coeffs_256, s_128, ss_256, r);
                    jnt_no_avg_round_store_16x2_avx2(r, offset_no_avg_256, dst, dst_stride);
                    ss_256[0] = ss_256[1];
                    ss_256[1] = ss_256[2];
                    ss_256[2] = ss_256[3];
                    ss_256[4] = ss_256[5];
                    ss_256[5] = ss_256[6];
                    ss_256[6] = ss_256[7];
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            } else {
                __m256i s_256[8], ss_256[8], tt_256[8], r[4];

                assert(!(w % 32));

                x = 0;
                do {
                    const uint8_t *s = src_ptr + x;
                    ConvBufType *  d = dst + x;

                    s_256[0] = _mm256_loadu_si256((__m256i *)(s + 0 * src_stride));
                    s_256[1] = _mm256_loadu_si256((__m256i *)(s + 1 * src_stride));
                    s_256[2] = _mm256_loadu_si256((__m256i *)(s + 2 * src_stride));
                    s_256[3] = _mm256_loadu_si256((__m256i *)(s + 3 * src_stride));
                    s_256[4] = _mm256_loadu_si256((__m256i *)(s + 4 * src_stride));
                    s_256[5] = _mm256_loadu_si256((__m256i *)(s + 5 * src_stride));
                    s_256[6] = _mm256_loadu_si256((__m256i *)(s + 6 * src_stride));

                    ss_256[0] = _mm256_unpacklo_epi8(s_256[0], s_256[1]);
                    ss_256[1] = _mm256_unpacklo_epi8(s_256[2], s_256[3]);
                    ss_256[2] = _mm256_unpacklo_epi8(s_256[4], s_256[5]);
                    ss_256[4] = _mm256_unpackhi_epi8(s_256[0], s_256[1]);
                    ss_256[5] = _mm256_unpackhi_epi8(s_256[2], s_256[3]);
                    ss_256[6] = _mm256_unpackhi_epi8(s_256[4], s_256[5]);

                    tt_256[0] = _mm256_unpacklo_epi8(s_256[1], s_256[2]);
                    tt_256[1] = _mm256_unpacklo_epi8(s_256[3], s_256[4]);
                    tt_256[2] = _mm256_unpacklo_epi8(s_256[5], s_256[6]);
                    tt_256[4] = _mm256_unpackhi_epi8(s_256[1], s_256[2]);
                    tt_256[5] = _mm256_unpackhi_epi8(s_256[3], s_256[4]);
                    tt_256[6] = _mm256_unpackhi_epi8(s_256[5], s_256[6]);

                    int32_t y = h;
                    do {
                        y_convolve_8tap_32x2_avx2(
                            s, src_stride, coeffs_256, s_256, ss_256, tt_256, r);
                        jnt_no_avg_round_store_32_avx2(r, offset_no_avg_256, d);
                        jnt_no_avg_round_store_32_avx2(r + 2, offset_no_avg_256, d + dst_stride);

                        ss_256[0] = ss_256[1];
                        ss_256[1] = ss_256[2];
                        ss_256[2] = ss_256[3];
                        ss_256[4] = ss_256[5];
                        ss_256[5] = ss_256[6];
                        ss_256[6] = ss_256[7];

                        tt_256[0] = tt_256[1];
                        tt_256[1] = tt_256[2];
                        tt_256[2] = tt_256[3];
                        tt_256[4] = tt_256[5];
                        tt_256[5] = tt_256[6];
                        tt_256[6] = tt_256[7];
                        s += 2 * src_stride;
                        d += 2 * dst_stride;
                        y -= 2;
                    } while (y);

                    x += 32;
                } while (x < w);
            }
        }
    }
}

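// Dispatch table type for the vertical jnt kernels, indexed by filter tap
// count; odd tap counts never occur, so their slots stay NULL below.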
typedef void (*JntConvolveYTapFunc)(const uint8_t *const src, const int32_t src_stride,
                                    uint8_t *dst8, const int32_t dst8_stride, const int32_t w,
                                    const int32_t                   h,
                                    const InterpFilterParams *const filter_params_y,
                                    const int32_t                   subpel_y_q4,
                                    const ConvolveParams *const     conv_params);

void svt_av1_jnt_convolve_y_avx2(const uint8_t *src, int32_t src_stride, uint8_t *dst8,
                                 int32_t dst8_stride, int32_t w, int32_t h,
                                 InterpFilterParams *filter_params_x,
                                 InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
                                 const int32_t subpel_y_q4, ConvolveParams *conv_params) {
    static const JntConvolveYTapFunc jnt_convolve_y_tap_func_table[MAX_FILTER_TAP + 1] = {
        NULL,
        NULL,
        jnt_convolve_y_2tap_avx2,
        NULL,
        jnt_convolve_y_4tap_avx2,
        NULL,
        jnt_convolve_y_6tap_avx2,
        NULL,
        jnt_convolve_y_8tap_avx2};
    const int32_t tap_y = get_convolve_tap(filter_params_y->filter_ptr);

    (void)filter_params_x;
    (void)subpel_x_q4;

    assert(conv_params->round_0 == 3);
    assert(conv_params->round_1 == COMPOUND_ROUND1_BITS);

    jnt_convolve_y_tap_func_table[tap_y](
        src, src_stride, dst8, dst8_stride, w, h, filter_params_y, subpel_y_q4, conv_params);
}

// =============================================================================
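
// 2D copy path (fullpel in both directions): no filtering is required, so
// the source pixels are only scaled and offset into the compound buffer,
// and optionally averaged back to 8 bits.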

SIMD_INLINE void jnt_copy_avg_32_avx2(const uint8_t *const src, const __m256i offset_avg_256,
                                      const ConvBufType *const dst, uint8_t *const dst8) {
    __m256i res[2];
    jnt_copy_load_src_32_avx2(src, res);
    jnt_copy_avg_round_store_32_avx2(res, offset_avg_256, dst, dst8);
}

static INLINE void jnt_copy_no_avg_32_avx2(const uint8_t *const     src,
                                           const __m256i            offset_no_avg_256,
                                           const ConvBufType *const dst) {
    __m256i d[2];
    jnt_copy_load_src_32_avx2(src, d);
    d[0] = _mm256_add_epi16(d[0], offset_no_avg_256);
    d[1] = _mm256_add_epi16(d[1], offset_no_avg_256);
    _mm256_storeu_si256((__m256i *)(dst + 0 * 16), d[0]);
    _mm256_storeu_si256((__m256i *)(dst + 1 * 16), d[1]);
}

void svt_av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int32_t src_stride, uint8_t *dst8,
                                       int32_t dst8_stride, int32_t w, int32_t h,
                                       InterpFilterParams *filter_params_x,
                                       InterpFilterParams *filter_params_y,
                                       const int32_t subpel_x_q4, const int32_t subpel_y_q4,
                                       ConvolveParams *conv_params) {
    const int32_t round_0      = 3;
    const int32_t round_1      = COMPOUND_ROUND1_BITS;
    const int32_t bits         = 2 * FILTER_BITS - round_0 - round_1;
    const int32_t bd           = 8;
    const int32_t offset_bits  = bd + bits;
    const int32_t round_offset = (1 << offset_bits) + (1 << (offset_bits - 1));
    ConvBufType * dst          = conv_params->dst;
    int32_t       dst_stride   = conv_params->dst_stride;

    (void)filter_params_x;
    (void)filter_params_y;
    (void)subpel_x_q4;
    (void)subpel_y_q4;

    if (conv_params->do_average) {
        if (conv_params->use_jnt_comp_avg) {
            const int32_t factor = conv_params->fwd_offset | (conv_params->bck_offset << 16);
            const int32_t offset_comp_avg = round_offset * conv_params->bck_offset +
                (1 << (bits + DIST_PRECISION_BITS - 1)) - (round_offset << DIST_PRECISION_BITS);
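
            // factor packs fwd_offset (low 16 bits) and bck_offset (high 16
            // bits) into each 32-bit lane so the comp-avg store helpers can
            // apply both distance weights at once (presumably via one madd
            // per word pair); offset_comp_avg pre-biases the weighted sum so
            // a single shift completes the DIST_PRECISION_BITS rounding.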

            if (w <= 4) {
                const __m128i factor_128          = _mm_set1_epi32(factor);
                const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);

                if (w == 2) {
                    do {
                        const __m128i res = jnt_copy_load_src_2x2_sse2(src, src_stride);
                        jnt_comp_avg_round_store_2x2_kernel_sse2(res,
                                                                 factor_128,
                                                                 offset_comp_avg_128,
                                                                 dst,
                                                                 dst_stride,
                                                                 dst8,
                                                                 dst8_stride);
                        src += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        h -= 2;
                    } while (h);
                } else {
                    assert(w == 4);

                    do {
                        const __m128i res = jnt_copy_load_src_4x2_sse4_1(src, src_stride);
                        jnt_comp_avg_round_store_4x2_kernel_sse2(res,
                                                                 factor_128,
                                                                 offset_comp_avg_128,
                                                                 dst,
                                                                 dst_stride,
                                                                 dst8,
                                                                 dst8_stride);
                        src += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        h -= 2;
                    } while (h);
                }
            } else {
                const __m256i factor_256          = _mm256_set1_epi32(factor);
                const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);

                if (w == 8) {
                    do {
                        const __m256i res = jnt_copy_load_src_8x2_avx2(src, src_stride);
                        jnt_comp_avg_round_store_8x2_kernel_avx2(res,
                                                                 factor_256,
                                                                 offset_comp_avg_256,
                                                                 dst,
                                                                 dst_stride,
                                                                 dst8,
                                                                 dst8_stride);
                        src += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        h -= 2;
                    } while (h);
                } else if (w == 16) {
                    do {
                        __m256i res[2];
                        res[0] = jnt_copy_load_src_16_avx2(src);
                        res[1] = jnt_copy_load_src_16_avx2(src + src_stride);
                        jnt_comp_avg_round_store_16x2_kernel_avx2(res,
                                                                  factor_256,
                                                                  offset_comp_avg_256,
                                                                  dst,
                                                                  dst_stride,
                                                                  dst8,
                                                                  dst8_stride);
                        src += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        h -= 2;
                    } while (h);
                } else if (w == 32) {
                    do {
                        jnt_copy_comp_avg_32_avx2(src, factor_256, offset_comp_avg_256, dst, dst8);
                        src += src_stride;
                        dst += dst_stride;
                        dst8 += dst8_stride;
                    } while (--h);
                } else if (w == 64) {
                    do {
                        jnt_copy_comp_avg_32_avx2(src + 0 * 32,
                                                  factor_256,
                                                  offset_comp_avg_256,
                                                  dst + 0 * 32,
                                                  dst8 + 0 * 32);
                        jnt_copy_comp_avg_32_avx2(src + 1 * 32,
                                                  factor_256,
                                                  offset_comp_avg_256,
                                                  dst + 1 * 32,
                                                  dst8 + 1 * 32);
                        src += src_stride;
                        dst += dst_stride;
                        dst8 += dst8_stride;
                    } while (--h);
                } else {
                    assert(w == 128);

                    do {
                        jnt_copy_comp_avg_32_avx2(src + 0 * 32,
                                                  factor_256,
                                                  offset_comp_avg_256,
                                                  dst + 0 * 32,
                                                  dst8 + 0 * 32);
                        jnt_copy_comp_avg_32_avx2(src + 1 * 32,
                                                  factor_256,
                                                  offset_comp_avg_256,
                                                  dst + 1 * 32,
                                                  dst8 + 1 * 32);
                        jnt_copy_comp_avg_32_avx2(src + 2 * 32,
                                                  factor_256,
                                                  offset_comp_avg_256,
                                                  dst + 2 * 32,
                                                  dst8 + 2 * 32);
                        jnt_copy_comp_avg_32_avx2(src + 3 * 32,
                                                  factor_256,
                                                  offset_comp_avg_256,
                                                  dst + 3 * 32,
                                                  dst8 + 3 * 32);
                        src += src_stride;
                        dst += dst_stride;
                        dst8 += dst8_stride;
                    } while (--h);
                }
            }
        } else {
            const int16_t offset_avg = (1 << bits) - round_offset;

            if (w <= 4) {
                const __m128i offset_avg_128 = _mm_set1_epi16(offset_avg);

                if (w == 2) {
                    do {
                        const __m128i res = jnt_copy_load_src_2x2_sse2(src, src_stride);
                        jnt_copy_avg_round_store_2x2_sse2(
                            res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
                        src += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        h -= 2;
                    } while (h);
                } else {
                    assert(w == 4);

                    do {
                        const __m128i res = jnt_copy_load_src_4x2_sse4_1(src, src_stride);
                        jnt_copy_avg_round_store_4x2_sse2(
                            res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
                        src += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        h -= 2;
                    } while (h);
                }
            } else {
                const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);

                if (w == 8) {
                    do {
                        const __m256i res = jnt_copy_load_src_8x2_avx2(src, src_stride);
                        jnt_copy_avg_round_store_8x2_avx2(
                            res, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                        src += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        h -= 2;
                    } while (h);
                } else if (w == 16) {
                    do {
                        __m256i res[2];
                        res[0] = jnt_copy_load_src_16_avx2(src);
                        res[1] = jnt_copy_load_src_16_avx2(src + src_stride);
                        jnt_copy_avg_round_store_16x2_avx2(
                            res, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                        src += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        h -= 2;
                    } while (h);
                } else if (w == 32) {
                    do {
                        jnt_copy_avg_32_avx2(src, offset_avg_256, dst, dst8);
                        src += src_stride;
                        dst += dst_stride;
                        dst8 += dst8_stride;
                    } while (--h);
                } else if (w == 64) {
                    do {
                        jnt_copy_avg_32_avx2(
                            src + 0 * 32, offset_avg_256, dst + 0 * 32, dst8 + 0 * 32);
                        jnt_copy_avg_32_avx2(
                            src + 1 * 32, offset_avg_256, dst + 1 * 32, dst8 + 1 * 32);
                        src += src_stride;
                        dst += dst_stride;
                        dst8 += dst8_stride;
                    } while (--h);
                } else {
                    assert(w == 128);

                    do {
                        jnt_copy_avg_32_avx2(
                            src + 0 * 32, offset_avg_256, dst + 0 * 32, dst8 + 0 * 32);
                        jnt_copy_avg_32_avx2(
                            src + 1 * 32, offset_avg_256, dst + 1 * 32, dst8 + 1 * 32);
                        jnt_copy_avg_32_avx2(
                            src + 2 * 32, offset_avg_256, dst + 2 * 32, dst8 + 2 * 32);
                        jnt_copy_avg_32_avx2(
                            src + 3 * 32, offset_avg_256, dst + 3 * 32, dst8 + 3 * 32);
                        src += src_stride;
                        dst += dst_stride;
                        dst8 += dst8_stride;
                    } while (--h);
                }
            }
        }
    } else {
        const int32_t offset_no_avg = (1 << offset_bits) + (1 << (offset_bits - 1));

        if (w <= 4) {
            const __m128i offset_no_avg_128 = _mm_set1_epi16(offset_no_avg);

            if (w == 2) {
                do {
                    const __m128i res               = jnt_copy_load_src_2x2_sse2(src, src_stride);
                    const __m128i r                 = _mm_add_epi16(res, offset_no_avg_128);
                    *(uint32_t *)dst                = _mm_cvtsi128_si32(r);
                    *(uint32_t *)(dst + dst_stride) = _mm_extract_epi32(r, 1);
                    src += 2 * src_stride;
                    dst += 2 * dst_stride;
                    h -= 2;
                } while (h);
            } else {
                assert(w == 4);

                do {
                    const __m128i res = jnt_copy_load_src_4x2_sse4_1(src, src_stride);
                    const __m128i r   = _mm_add_epi16(res, offset_no_avg_128);
                    store_u16_4x2_sse2(r, dst, dst_stride);
                    src += 2 * src_stride;
                    dst += 2 * dst_stride;
                    h -= 2;
                } while (h);
            }
        } else {
            const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);

            if (w == 8) {
                do {
                    const __m256i res = jnt_copy_load_src_8x2_avx2(src, src_stride);
                    const __m256i r   = _mm256_add_epi16(res, offset_no_avg_256);
                    storeu_u16_8x2_avx2(r, dst, dst_stride);
                    src += 2 * src_stride;
                    dst += 2 * dst_stride;
                    h -= 2;
                } while (h);
            } else if (w == 16) {
                do {
                    __m256i d[2];
                    d[0] = jnt_copy_load_src_16_avx2(src);
                    d[1] = jnt_copy_load_src_16_avx2(src + src_stride);
                    d[0] = _mm256_add_epi16(d[0], offset_no_avg_256);
                    d[1] = _mm256_add_epi16(d[1], offset_no_avg_256);
                    _mm256_storeu_si256((__m256i *)(dst + 0 * dst_stride), d[0]);
                    _mm256_storeu_si256((__m256i *)(dst + 1 * dst_stride), d[1]);
                    src += 2 * src_stride;
                    dst += 2 * dst_stride;
                    h -= 2;
                } while (h);
            } else if (w == 32) {
                do {
                    jnt_copy_no_avg_32_avx2(src, offset_no_avg_256, dst);
                    src += src_stride;
                    dst += dst_stride;
                } while (--h);
            } else if (w == 64) {
                do {
                    jnt_copy_no_avg_32_avx2(src + 0 * 32, offset_no_avg_256, dst + 0 * 32);
                    jnt_copy_no_avg_32_avx2(src + 1 * 32, offset_no_avg_256, dst + 1 * 32);
                    src += src_stride;
                    dst += dst_stride;
                } while (--h);
            } else {
                assert(w == 128);

                do {
                    jnt_copy_no_avg_32_avx2(src + 0 * 32, offset_no_avg_256, dst + 0 * 32);
                    jnt_copy_no_avg_32_avx2(src + 1 * 32, offset_no_avg_256, dst + 1 * 32);
                    jnt_copy_no_avg_32_avx2(src + 2 * 32, offset_no_avg_256, dst + 2 * 32);
                    jnt_copy_no_avg_32_avx2(src + 3 * 32, offset_no_avg_256, dst + 3 * 32);
                    src += src_stride;
                    dst += dst_stride;
                } while (--h);
            }
        }
    }
}

// =============================================================================
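
// Horizontal (x) jnt convolve helpers. They mirror the vertical helpers
// above: convolve a row segment, then route the result through the shared
// comp-avg / avg / no-avg round-and-store helpers.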
2895 
jnt_x_comp_avg_2tap_32_avx2(const uint8_t * const src,const __m256i * const coeffs,const __m256i factor,const __m256i offset,ConvBufType * const dst,uint8_t * const dst8)2896 SIMD_INLINE void jnt_x_comp_avg_2tap_32_avx2(const uint8_t *const src, const __m256i *const coeffs,
2897                                              const __m256i factor, const __m256i offset,
2898                                              ConvBufType *const dst, uint8_t *const dst8) {
2899     __m256i r[2];
2900 
2901     x_convolve_2tap_32_avx2(src, coeffs, r);
2902     jnt_comp_avg_round_store_32_avx2(r, factor, offset, dst, dst8);
2903 }
2904 
jnt_x_avg_2tap_32_avx2(const uint8_t * const src,const __m256i * const coeffs,const __m256i offset,const ConvBufType * const dst,uint8_t * const dst8)2905 static INLINE void jnt_x_avg_2tap_32_avx2(const uint8_t *const src, const __m256i *const coeffs,
2906                                           const __m256i offset, const ConvBufType *const dst,
2907                                           uint8_t *const dst8) {
2908     __m256i r[2];
2909 
2910     x_convolve_2tap_32_avx2(src, coeffs, r);
2911     jnt_avg_round_store_32_avx2(r, offset, dst, dst8);
2912 }
2913 
jnt_x_no_avg_2tap_32_avx2(const uint8_t * const src,const __m256i * const coeffs,const __m256i offset,ConvBufType * const dst)2914 static INLINE void jnt_x_no_avg_2tap_32_avx2(const uint8_t *const src, const __m256i *const coeffs,
2915                                              const __m256i offset, ConvBufType *const dst) {
2916     __m256i r[2];
2917 
2918     x_convolve_2tap_32_avx2(src, coeffs, r);
2919     jnt_no_avg_round_store_32_avx2(r, offset, dst);
2920 }
2921 
jnt_x_comp_avg_6tap_16x2_avx2(const uint8_t * const src,const int32_t src_stride,const __m256i coeffs[3],const __m256i filt[3],const __m256i factor,const __m256i offset,ConvBufType * const dst,const int32_t dst_stride,uint8_t * const dst8,const int32_t dst8_stride)2922 SIMD_INLINE void jnt_x_comp_avg_6tap_16x2_avx2(const uint8_t *const src, const int32_t src_stride,
2923                                                const __m256i coeffs[3], const __m256i filt[3],
2924                                                const __m256i factor, const __m256i offset,
2925                                                ConvBufType *const dst, const int32_t dst_stride,
2926                                                uint8_t *const dst8, const int32_t dst8_stride) {
2927     __m256i r[2];
2928 
2929     x_convolve_6tap_16x2_avx2(src, src_stride, coeffs, filt, r);
2930     jnt_comp_avg_round_store_16x2_avx2(r, factor, offset, dst, dst_stride, dst8, dst8_stride);
2931 }
2932 
jnt_x_avg_6tap_16x2_avx2(const uint8_t * const src,const int32_t src_stride,const __m256i coeffs[3],const __m256i filt[3],const __m256i offset,ConvBufType * const dst,const int32_t dst_stride,uint8_t * const dst8,const int32_t dst8_stride)2933 SIMD_INLINE void jnt_x_avg_6tap_16x2_avx2(const uint8_t *const src, const int32_t src_stride,
2934                                           const __m256i coeffs[3], const __m256i filt[3],
2935                                           const __m256i offset, ConvBufType *const dst,
2936                                           const int32_t dst_stride, uint8_t *const dst8,
2937                                           const int32_t dst8_stride) {
2938     __m256i r[2];
2939 
2940     x_convolve_6tap_16x2_avx2(src, src_stride, coeffs, filt, r);
2941     jnt_avg_round_store_16x2_avx2(r, offset, dst, dst_stride, dst8, dst8_stride);
2942 }
2943 
jnt_x_no_avg_6tap_16x2_avx2(const uint8_t * const src,const int32_t src_stride,const __m256i coeffs[3],const __m256i filt[3],const __m256i offset,ConvBufType * const dst,const int32_t dst_stride)2944 SIMD_INLINE void jnt_x_no_avg_6tap_16x2_avx2(const uint8_t *const src, const int32_t src_stride,
2945                                              const __m256i coeffs[3], const __m256i filt[3],
2946                                              const __m256i offset, ConvBufType *const dst,
2947                                              const int32_t dst_stride) {
2948     __m256i r[2];
2949 
2950     x_convolve_6tap_16x2_avx2(src, src_stride, coeffs, filt, r);
2951     jnt_no_avg_round_store_16x2_avx2(r, offset, dst, dst_stride);
2952 }
2953 
jnt_x_comp_avg_6tap_32_avx2(const uint8_t * const src,const __m256i coeffs[3],const __m256i filt[3],const __m256i factor,const __m256i offset,ConvBufType * const dst,uint8_t * const dst8)2954 SIMD_INLINE void jnt_x_comp_avg_6tap_32_avx2(const uint8_t *const src, const __m256i coeffs[3],
2955                                              const __m256i filt[3], const __m256i factor,
2956                                              const __m256i offset, ConvBufType *const dst,
2957                                              uint8_t *const dst8) {
2958     __m256i r[2];
2959 
2960     x_convolve_6tap_32_avx2(src, coeffs, filt, r);
2961     jnt_comp_avg_round_store_32_avx2(r, factor, offset, dst, dst8);
2962 }
2963 
SIMD_INLINE void jnt_x_avg_6tap_32_avx2(const uint8_t *const src, const __m256i coeffs[3],
                                        const __m256i filt[3], const __m256i offset,
                                        ConvBufType *const dst, uint8_t *const dst8) {
    __m256i r[2];

    x_convolve_6tap_32_avx2(src, coeffs, filt, r);
    jnt_avg_round_store_32_avx2(r, offset, dst, dst8);
}

SIMD_INLINE void jnt_x_no_avg_6tap_32_avx2(const uint8_t *const src, const __m256i coeffs[3],
                                           const __m256i filt[3], const __m256i offset,
                                           ConvBufType *const dst) {
    __m256i r[2];

    x_convolve_6tap_32_avx2(src, coeffs, filt, r);
    jnt_no_avg_round_store_32_avx2(r, offset, dst);
}

static INLINE void jnt_x_comp_avg_8tap_16x2_avx2(const uint8_t *const src, const int32_t src_stride,
                                                 const __m256i coeffs[4], const __m256i filt[4],
                                                 const __m256i factor, const __m256i offset,
                                                 ConvBufType *const dst, const int32_t dst_stride,
                                                 uint8_t *const dst8, const int32_t dst8_stride) {
    __m256i r[2];

    x_convolve_8tap_16x2_avx2(src, src_stride, coeffs, filt, r);
    jnt_comp_avg_round_store_16x2_avx2(r, factor, offset, dst, dst_stride, dst8, dst8_stride);
}

SIMD_INLINE void jnt_x_comp_avg_8tap_32_avx2(const uint8_t *const src, const __m256i coeffs[4],
                                             const __m256i filt[4], const __m256i factor,
                                             const __m256i offset, ConvBufType *const dst,
                                             uint8_t *const dst8) {
    __m256i r[2];

    x_convolve_8tap_32_avx2(src, coeffs, filt, r);
    jnt_comp_avg_round_store_32_avx2(r, factor, offset, dst, dst8);
}

SIMD_INLINE void jnt_x_avg_8tap_16x2_avx2(const uint8_t *const src, const int32_t src_stride,
                                          const __m256i coeffs[4], const __m256i filt[4],
                                          const __m256i offset, ConvBufType *const dst,
                                          const int32_t dst_stride, uint8_t *const dst8,
                                          const int32_t dst8_stride) {
    __m256i r[2];

    x_convolve_8tap_16x2_avx2(src, src_stride, coeffs, filt, r);
    jnt_avg_round_store_16x2_avx2(r, offset, dst, dst_stride, dst8, dst8_stride);
}

SIMD_INLINE void jnt_x_avg_8tap_32_avx2(const uint8_t *const src, const __m256i coeffs[4],
                                        const __m256i filt[4], const __m256i offset,
                                        ConvBufType *const dst, uint8_t *const dst8) {
    __m256i r[2];

    x_convolve_8tap_32_avx2(src, coeffs, filt, r);
    jnt_avg_round_store_32_avx2(r, offset, dst, dst8);
}

static INLINE void jnt_x_no_avg_8tap_16x2_avx2(const uint8_t *const src, const int32_t src_stride,
                                               const __m256i coeffs[4], const __m256i filt[4],
                                               const __m256i offset, ConvBufType *const dst,
                                               const int32_t dst_stride) {
    __m256i r[2];

    x_convolve_8tap_16x2_avx2(src, src_stride, coeffs, filt, r);
    jnt_no_avg_round_store_16x2_avx2(r, offset, dst, dst_stride);
}

SIMD_INLINE void jnt_x_no_avg_8tap_32_avx2(const uint8_t *const src, const __m256i coeffs[4],
                                           const __m256i filt[4], const __m256i offset,
                                           ConvBufType *const dst) {
    __m256i r[2];

    x_convolve_8tap_32_avx2(src, coeffs, filt, r);
    jnt_no_avg_round_store_32_avx2(r, offset, dst);
}
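
/*
 * The jnt_x_{comp_avg,avg,no_avg}_*_avx2 helpers above share one pattern:
 * convolve a block of pixels horizontally, then hand the 16-bit results to
 * the matching round-and-store routine. "comp_avg" blends with the first
 * prediction pass using the explicit fwd/bck distance weights, "avg" blends
 * with equal weights, and "no_avg" stores the biased intermediate for the
 * second compound pass to pick up.
 */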
static void jnt_convolve_x_2tap_avx2(const uint8_t *const src, const int32_t src_stride,
                                     uint8_t *dst8, const int32_t dst8_stride, const int32_t w,
                                     const int32_t                   h,
                                     const InterpFilterParams *const filter_params_x,
                                     const int32_t                   subpel_x_q4,
                                     const ConvolveParams *const     conv_params) {
    const uint8_t *src_ptr      = src;
    const int32_t  dst_stride   = conv_params->dst_stride;
    const int32_t  round_0      = 3;
    const int32_t  round_1      = COMPOUND_ROUND1_BITS;
    const int32_t  bits         = FILTER_BITS - round_1;
    const int32_t  bd           = 8;
    const int32_t  round_bits   = 2 * FILTER_BITS - round_0 - round_1;
    const int32_t  offset_bits  = bd + round_bits;
    const int32_t  round_offset = (1 << offset_bits) + (1 << (offset_bits - 1));
    ConvBufType *  dst          = conv_params->dst;
    int32_t        y            = h;
    __m128i        coeffs_128[4];
    __m256i        coeffs_256[4];

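    /*
     * Three paths follow, selected by conv_params: distance-weighted
     * compound averaging (do_average && use_jnt_comp_avg), equal-weight
     * averaging (do_average only), and the first compound pass (neither),
     * which only writes biased intermediates to conv_params->dst. Within
     * each path the width picks the kernel: w <= 4 runs the 128-bit SSE
     * code, w == 8/16 processes two rows per iteration, and w == 32/64/128
     * is tiled into 32-pixel jnt_x_*_2tap_32_avx2 calls per row.
     */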
    if (conv_params->do_average) {
        if (conv_params->use_jnt_comp_avg) {
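            /*
             * factor packs the two 16-bit distance weights into each 32-bit
             * lane so the comp_avg store helpers can weight both predictions
             * with a single 16-bit multiply-add. offset_comp_avg folds the
             * removal of the round_offset bias and the final rounding term
             * into one constant; a rough scalar sketch (inferred from the
             * constants below, not copied from the helpers):
             *   out = (dst * fwd_offset + res * bck_offset + offset_comp_avg)
             *         >> (round_bits + DIST_PRECISION_BITS);
             */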
            const int32_t factor = conv_params->fwd_offset | (conv_params->bck_offset << 16);
            const int32_t offset_comp_avg = round_offset * conv_params->bck_offset +
                (1 << (round_bits + DIST_PRECISION_BITS - 1)) -
                (round_offset << DIST_PRECISION_BITS);

            if (w <= 4) {
                const __m128i factor_128          = _mm_set1_epi32(factor);
                const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);

                prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);

                if (w == 2) {
                    do {
                        const __m128i res = x_convolve_2tap_2x2_sse4_1(
                            src_ptr, src_stride, coeffs_128);
                        jnt_comp_avg_round_store_2x2_sse2(res,
                                                          factor_128,
                                                          offset_comp_avg_128,
                                                          dst,
                                                          dst_stride,
                                                          dst8,
                                                          dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    assert(w == 4);

                    do {
                        const __m128i res = x_convolve_2tap_4x2_ssse3(
                            src_ptr, src_stride, coeffs_128);
                        jnt_comp_avg_round_store_4x2_sse2(res,
                                                          factor_128,
                                                          offset_comp_avg_128,
                                                          dst,
                                                          dst_stride,
                                                          dst8,
                                                          dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                const __m256i factor_256          = _mm256_set1_epi32(factor);
                const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);
                __m256i       r[2];

                prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);

                if (w == 8) {
                    do {
                        const __m256i res = x_convolve_2tap_8x2_avx2(
                            src_ptr, src_stride, coeffs_256);
                        jnt_comp_avg_round_store_8x2_avx2(res,
                                                          factor_256,
                                                          offset_comp_avg_256,
                                                          dst,
                                                          dst_stride,
                                                          dst8,
                                                          dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else if (w == 16) {
                    do {
                        x_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, r);
                        jnt_comp_avg_round_store_16x2_avx2(
                            r, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else if (w == 32) {
                    do {
                        jnt_x_comp_avg_2tap_32_avx2(
                            src_ptr, coeffs_256, factor_256, offset_comp_avg_256, dst, dst8);
                        src_ptr += src_stride;
                        dst += dst_stride;
                        dst8 += dst8_stride;
                    } while (--y);
                } else if (w == 64) {
                    do {
                        jnt_x_comp_avg_2tap_32_avx2(
                            src_ptr, coeffs_256, factor_256, offset_comp_avg_256, dst, dst8);
                        jnt_x_comp_avg_2tap_32_avx2(src_ptr + 32,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    dst + 32,
                                                    dst8 + 32);
                        src_ptr += src_stride;
                        dst += dst_stride;
                        dst8 += dst8_stride;
                    } while (--y);
                } else {
                    assert(w == 128);

                    do {
                        jnt_x_comp_avg_2tap_32_avx2(
                            src_ptr, coeffs_256, factor_256, offset_comp_avg_256, dst, dst8);
                        jnt_x_comp_avg_2tap_32_avx2(src_ptr + 1 * 32,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    dst + 1 * 32,
                                                    dst8 + 1 * 32);
                        jnt_x_comp_avg_2tap_32_avx2(src_ptr + 2 * 32,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    dst + 2 * 32,
                                                    dst8 + 2 * 32);
                        jnt_x_comp_avg_2tap_32_avx2(src_ptr + 3 * 32,
                                                    coeffs_256,
                                                    factor_256,
                                                    offset_comp_avg_256,
                                                    dst + 3 * 32,
                                                    dst8 + 3 * 32);
                        src_ptr += src_stride;
                        dst += dst_stride;
                        dst8 += dst8_stride;
                    } while (--y);
                }
            }
        } else {
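            /*
             * Equal-weight average: every bias-removal and rounding term is
             * pre-folded into one 16-bit constant that jnt_avg_round_store_*
             * adds before the final shift, so the hot loops need no extra
             * arithmetic.
             */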
            const int16_t offset_avg = (1 << (FILTER_BITS - 1)) + (1 << (round_0 - bits - 2)) -
                (round_offset << (round_0 - bits - 1));

            if (w <= 4) {
                const __m128i offset_avg_128 = _mm_set1_epi16(offset_avg);

                prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);

                if (w == 2) {
                    do {
                        const __m128i res = x_convolve_2tap_2x2_sse4_1(
                            src_ptr, src_stride, coeffs_128);
                        jnt_avg_round_store_2x2_sse2(
                            res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else {
                    assert(w == 4);

                    do {
                        const __m128i res = x_convolve_2tap_4x2_ssse3(
                            src_ptr, src_stride, coeffs_128);
                        jnt_avg_round_store_4x2_sse2(
                            res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                }
            } else {
                const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);
                __m256i       r[2];

                prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);

                if (w == 8) {
                    do {
                        const __m256i res = x_convolve_2tap_8x2_avx2(
                            src_ptr, src_stride, coeffs_256);
                        jnt_avg_round_store_8x2_avx2(
                            res, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else if (w == 16) {
                    do {
                        x_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, r);
                        jnt_avg_round_store_16x2_avx2(
                            r, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                        src_ptr += 2 * src_stride;
                        dst += 2 * dst_stride;
                        dst8 += 2 * dst8_stride;
                        y -= 2;
                    } while (y);
                } else if (w == 32) {
                    do {
                        jnt_x_avg_2tap_32_avx2(src_ptr, coeffs_256, offset_avg_256, dst, dst8);
                        src_ptr += src_stride;
                        dst += dst_stride;
                        dst8 += dst8_stride;
                    } while (--y);
                } else if (w == 64) {
                    do {
                        jnt_x_avg_2tap_32_avx2(src_ptr, coeffs_256, offset_avg_256, dst, dst8);
                        jnt_x_avg_2tap_32_avx2(
                            src_ptr + 32, coeffs_256, offset_avg_256, dst + 32, dst8 + 32);
                        src_ptr += src_stride;
                        dst += dst_stride;
                        dst8 += dst8_stride;
                    } while (--y);
                } else {
                    assert(w == 128);

                    do {
                        jnt_x_avg_2tap_32_avx2(src_ptr, coeffs_256, offset_avg_256, dst, dst8);
                        jnt_x_avg_2tap_32_avx2(src_ptr + 1 * 32,
                                               coeffs_256,
                                               offset_avg_256,
                                               dst + 1 * 32,
                                               dst8 + 1 * 32);
                        jnt_x_avg_2tap_32_avx2(src_ptr + 2 * 32,
                                               coeffs_256,
                                               offset_avg_256,
                                               dst + 2 * 32,
                                               dst8 + 2 * 32);
                        jnt_x_avg_2tap_32_avx2(src_ptr + 3 * 32,
                                               coeffs_256,
                                               offset_avg_256,
                                               dst + 3 * 32,
                                               dst8 + 3 * 32);
                        src_ptr += src_stride;
                        dst += dst_stride;
                        dst8 += dst8_stride;
                    } while (--y);
                }
            }
        }
    } else {
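        /*
         * First compound pass: no blending yet. Convolve, add the positive
         * round_offset bias so the 16-bit intermediate never goes negative,
         * and store it to conv_params->dst for the second pass.
         */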
        const int16_t offset_no_avg = (round_offset << (round_0 - bits - 1)) +
            (1 << (round_0 - bits - 2));

        if (w <= 4) {
            const __m128i offset_no_avg_128 = _mm_set1_epi16(offset_no_avg);

            prepare_half_coeffs_2tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);

            if (w == 2) {
                do {
                    const __m128i res = x_convolve_2tap_2x2_sse4_1(src_ptr, src_stride, coeffs_128);
                    jnt_no_avg_round_store_2x2_sse2(res, offset_no_avg_128, dst, dst_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            } else {
                assert(w == 4);

                do {
                    const __m128i res = x_convolve_2tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
                    jnt_no_avg_round_store_4x2_sse2(res, offset_no_avg_128, dst, dst_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            }
        } else {
            const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);
            __m256i       r[2];

            prepare_half_coeffs_2tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);

            if (w == 8) {
                do {
                    const __m256i res = x_convolve_2tap_8x2_avx2(src_ptr, src_stride, coeffs_256);
                    jnt_no_avg_round_store_8x2_avx2(res, offset_no_avg_256, dst, dst_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            } else if (w == 16) {
                do {
                    x_convolve_2tap_16x2_avx2(src_ptr, src_stride, coeffs_256, r);
                    jnt_no_avg_round_store_16x2_avx2(r, offset_no_avg_256, dst, dst_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    y -= 2;
                } while (y);
            } else if (w == 32) {
                do {
                    jnt_x_no_avg_2tap_32_avx2(src_ptr, coeffs_256, offset_no_avg_256, dst);
                    src_ptr += src_stride;
                    dst += dst_stride;
                } while (--y);
            } else if (w == 64) {
                do {
                    jnt_x_no_avg_2tap_32_avx2(src_ptr, coeffs_256, offset_no_avg_256, dst);
                    jnt_x_no_avg_2tap_32_avx2(
                        src_ptr + 32, coeffs_256, offset_no_avg_256, dst + 32);
                    src_ptr += src_stride;
                    dst += dst_stride;
                } while (--y);
            } else {
                assert(w == 128);

                do {
                    jnt_x_no_avg_2tap_32_avx2(src_ptr, coeffs_256, offset_no_avg_256, dst);
                    jnt_x_no_avg_2tap_32_avx2(
                        src_ptr + 1 * 32, coeffs_256, offset_no_avg_256, dst + 1 * 32);
                    jnt_x_no_avg_2tap_32_avx2(
                        src_ptr + 2 * 32, coeffs_256, offset_no_avg_256, dst + 2 * 32);
                    jnt_x_no_avg_2tap_32_avx2(
                        src_ptr + 3 * 32, coeffs_256, offset_no_avg_256, dst + 3 * 32);
                    src_ptr += src_stride;
                    dst += dst_stride;
                } while (--y);
            }
        }
    }
}

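/*
 * 4-tap variant: the kernels only handle w == 2 and w == 4 (4-tap filters
 * are selected for small blocks), so the 128-bit SSSE3 paths suffice and no
 * AVX2 version is needed. The taps cover src[x - 1] .. src[x + 2], hence
 * the src - 1 start pointer.
 */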
void jnt_convolve_x_4tap_ssse3(const uint8_t *const src, const int32_t src_stride, uint8_t *dst8,
                               const int32_t dst8_stride, const int32_t w, const int32_t h,
                               const InterpFilterParams *const filter_params_x,
                               const int32_t subpel_x_q4, const ConvolveParams *const conv_params) {
    const uint8_t *src_ptr      = src - 1;
    const int32_t  dst_stride   = conv_params->dst_stride;
    const int32_t  round_0      = 3;
    const int32_t  round_1      = COMPOUND_ROUND1_BITS;
    const int32_t  bits         = FILTER_BITS - round_1;
    const int32_t  bd           = 8;
    const int32_t  round_bits   = 2 * FILTER_BITS - round_0 - round_1;
    const int32_t  offset_bits  = bd + round_bits;
    const int32_t  round_offset = (1 << offset_bits) + (1 << (offset_bits - 1));
    ConvBufType *  dst          = conv_params->dst;
    int32_t        y            = h;
    __m128i        coeffs_128[4];

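    /* Only the 128-bit path exists here, so the coefficients are prepared
     * once up front rather than per-width branch as in the 2-tap function. */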
    prepare_half_coeffs_4tap_ssse3(filter_params_x, subpel_x_q4, coeffs_128);

    if (conv_params->do_average) {
        if (conv_params->use_jnt_comp_avg) {
            const int32_t factor = conv_params->fwd_offset | (conv_params->bck_offset << 16);
            const int32_t offset_comp_avg = round_offset * conv_params->bck_offset +
                (1 << (round_bits + DIST_PRECISION_BITS - 1)) -
                (round_offset << DIST_PRECISION_BITS);
            const __m128i factor_128          = _mm_set1_epi32(factor);
            const __m128i offset_comp_avg_128 = _mm_set1_epi32(offset_comp_avg);

            if (w == 2) {
                do {
                    const __m128i res = x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
                    jnt_comp_avg_round_store_2x2_sse2(
                        res, factor_128, offset_comp_avg_128, dst, dst_stride, dst8, dst8_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    dst8 += 2 * dst8_stride;
                    y -= 2;
                } while (y);
            } else {
                assert(w == 4);

                do {
                    const __m128i res = x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
                    jnt_comp_avg_round_store_4x2_sse2(
                        res, factor_128, offset_comp_avg_128, dst, dst_stride, dst8, dst8_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    dst8 += 2 * dst8_stride;
                    y -= 2;
                } while (y);
            }
        } else {
            const int16_t offset_avg = (1 << (FILTER_BITS - 1)) + (1 << (round_0 - bits - 2)) -
                (round_offset << (round_0 - bits - 1));
            const __m128i offset_avg_128 = _mm_set1_epi16(offset_avg);

            if (w == 2) {
                do {
                    const __m128i res = x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
                    jnt_avg_round_store_2x2_sse2(
                        res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    dst8 += 2 * dst8_stride;
                    y -= 2;
                } while (y);
            } else {
                assert(w == 4);

                do {
                    const __m128i res = x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
                    jnt_avg_round_store_4x2_sse2(
                        res, offset_avg_128, dst, dst_stride, dst8, dst8_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    dst8 += 2 * dst8_stride;
                    y -= 2;
                } while (y);
            }
        }
    } else {
        const int16_t offset_no_avg = (round_offset << (round_0 - bits - 1)) +
            (1 << (round_0 - bits - 2));
        const __m128i offset_no_avg_128 = _mm_set1_epi16(offset_no_avg);

        if (w == 2) {
            do {
                const __m128i res = x_convolve_4tap_2x2_ssse3(src_ptr, src_stride, coeffs_128);
                jnt_no_avg_round_store_2x2_sse2(res, offset_no_avg_128, dst, dst_stride);
                src_ptr += 2 * src_stride;
                dst += 2 * dst_stride;
                y -= 2;
            } while (y);
        } else {
            assert(w == 4);

            do {
                const __m128i res = x_convolve_4tap_4x2_ssse3(src_ptr, src_stride, coeffs_128);
                jnt_no_avg_round_store_4x2_sse2(res, offset_no_avg_128, dst, dst_stride);
                src_ptr += 2 * src_stride;
                dst += 2 * dst_stride;
                y -= 2;
            } while (y);
        }
    }
}

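/*
 * 6-tap variant: the taps cover src[x - 2] .. src[x + 3], hence src - 2.
 * filt1..filt3_global_avx are the byte-shuffle patterns the pshufb-based
 * x_convolve_6tap_* kernels use to gather each tap pair; everything else
 * mirrors the 2-tap function above.
 */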
static void jnt_convolve_x_6tap_avx2(const uint8_t *const src, const int32_t src_stride,
                                     uint8_t *dst8, const int32_t dst8_stride, const int32_t w,
                                     const int32_t                   h,
                                     const InterpFilterParams *const filter_params_x,
                                     const int32_t                   subpel_x_q4,
                                     const ConvolveParams *const     conv_params) {
    const uint8_t *src_ptr      = src - 2;
    const int32_t  dst_stride   = conv_params->dst_stride;
    const int32_t  round_0      = 3;
    const int32_t  round_1      = COMPOUND_ROUND1_BITS;
    const int32_t  bits         = FILTER_BITS - round_1;
    const int32_t  bd           = 8;
    const int32_t  round_bits   = 2 * FILTER_BITS - round_0 - round_1;
    const int32_t  offset_bits  = bd + round_bits;
    const int32_t  round_offset = (1 << offset_bits) + (1 << (offset_bits - 1));
    ConvBufType *  dst          = conv_params->dst;
    int32_t        y            = h;
    __m256i        coeffs_256[3], filt_256[3];

    filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx);
    filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx);
    filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx);

    prepare_half_coeffs_6tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);

    if (conv_params->do_average) {
        if (conv_params->use_jnt_comp_avg) {
            const int32_t factor = conv_params->fwd_offset | (conv_params->bck_offset << 16);
            const int32_t offset_comp_avg = round_offset * conv_params->bck_offset +
                (1 << (round_bits + DIST_PRECISION_BITS - 1)) -
                (round_offset << DIST_PRECISION_BITS);
            const __m256i factor_256          = _mm256_set1_epi32(factor);
            const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);

            if (w == 8) {
                do {
                    const __m256i res = x_convolve_6tap_8x2_avx2(
                        src_ptr, src_stride, coeffs_256, filt_256);
                    jnt_comp_avg_round_store_8x2_avx2(
                        res, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    dst8 += 2 * dst8_stride;
                    y -= 2;
                } while (y);
            } else if (w == 16) {
                do {
                    jnt_x_comp_avg_6tap_16x2_avx2(src_ptr,
                                                  src_stride,
                                                  coeffs_256,
                                                  filt_256,
                                                  factor_256,
                                                  offset_comp_avg_256,
                                                  dst,
                                                  dst_stride,
                                                  dst8,
                                                  dst8_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    dst8 += 2 * dst8_stride;
                    y -= 2;
                } while (y);
            } else if (w == 32) {
                do {
                    jnt_x_comp_avg_6tap_32_avx2(
                        src_ptr, coeffs_256, filt_256, factor_256, offset_comp_avg_256, dst, dst8);
                    src_ptr += src_stride;
                    dst += dst_stride;
                    dst8 += dst8_stride;
                } while (--y);
            } else if (w == 64) {
                do {
                    jnt_x_comp_avg_6tap_32_avx2(
                        src_ptr, coeffs_256, filt_256, factor_256, offset_comp_avg_256, dst, dst8);
                    jnt_x_comp_avg_6tap_32_avx2(src_ptr + 32,
                                                coeffs_256,
                                                filt_256,
                                                factor_256,
                                                offset_comp_avg_256,
                                                dst + 32,
                                                dst8 + 32);
                    src_ptr += src_stride;
                    dst += dst_stride;
                    dst8 += dst8_stride;
                } while (--y);
            } else {
                assert(w == 128);

                do {
                    jnt_x_comp_avg_6tap_32_avx2(
                        src_ptr, coeffs_256, filt_256, factor_256, offset_comp_avg_256, dst, dst8);
                    jnt_x_comp_avg_6tap_32_avx2(src_ptr + 1 * 32,
                                                coeffs_256,
                                                filt_256,
                                                factor_256,
                                                offset_comp_avg_256,
                                                dst + 1 * 32,
                                                dst8 + 1 * 32);
                    jnt_x_comp_avg_6tap_32_avx2(src_ptr + 2 * 32,
                                                coeffs_256,
                                                filt_256,
                                                factor_256,
                                                offset_comp_avg_256,
                                                dst + 2 * 32,
                                                dst8 + 2 * 32);
                    jnt_x_comp_avg_6tap_32_avx2(src_ptr + 3 * 32,
                                                coeffs_256,
                                                filt_256,
                                                factor_256,
                                                offset_comp_avg_256,
                                                dst + 3 * 32,
                                                dst8 + 3 * 32);
                    src_ptr += src_stride;
                    dst += dst_stride;
                    dst8 += dst8_stride;
                } while (--y);
            }
        } else {
            const int16_t offset_avg = (1 << (FILTER_BITS - 1)) + (1 << (round_0 - bits - 2)) -
                (round_offset << (round_0 - bits - 1));
            const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);

            if (w == 8) {
                do {
                    const __m256i res = x_convolve_6tap_8x2_avx2(
                        src_ptr, src_stride, coeffs_256, filt_256);
                    jnt_avg_round_store_8x2_avx2(
                        res, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    dst8 += 2 * dst8_stride;
                    y -= 2;
                } while (y);
            } else if (w == 16) {
                do {
                    jnt_x_avg_6tap_16x2_avx2(src_ptr,
                                             src_stride,
                                             coeffs_256,
                                             filt_256,
                                             offset_avg_256,
                                             dst,
                                             dst_stride,
                                             dst8,
                                             dst8_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    dst8 += 2 * dst8_stride;
                    y -= 2;
                } while (y);
            } else if (w == 32) {
                do {
                    jnt_x_avg_6tap_32_avx2(
                        src_ptr, coeffs_256, filt_256, offset_avg_256, dst, dst8);
                    src_ptr += src_stride;
                    dst += dst_stride;
                    dst8 += dst8_stride;
                } while (--y);
            } else if (w == 64) {
                do {
                    jnt_x_avg_6tap_32_avx2(
                        src_ptr, coeffs_256, filt_256, offset_avg_256, dst, dst8);
                    jnt_x_avg_6tap_32_avx2(
                        src_ptr + 32, coeffs_256, filt_256, offset_avg_256, dst + 32, dst8 + 32);
                    src_ptr += src_stride;
                    dst += dst_stride;
                    dst8 += dst8_stride;
                } while (--y);
            } else {
                assert(w == 128);

                do {
                    jnt_x_avg_6tap_32_avx2(
                        src_ptr, coeffs_256, filt_256, offset_avg_256, dst, dst8);
                    jnt_x_avg_6tap_32_avx2(src_ptr + 1 * 32,
                                           coeffs_256,
                                           filt_256,
                                           offset_avg_256,
                                           dst + 1 * 32,
                                           dst8 + 1 * 32);
                    jnt_x_avg_6tap_32_avx2(src_ptr + 2 * 32,
                                           coeffs_256,
                                           filt_256,
                                           offset_avg_256,
                                           dst + 2 * 32,
                                           dst8 + 2 * 32);
                    jnt_x_avg_6tap_32_avx2(src_ptr + 3 * 32,
                                           coeffs_256,
                                           filt_256,
                                           offset_avg_256,
                                           dst + 3 * 32,
                                           dst8 + 3 * 32);
                    src_ptr += src_stride;
                    dst += dst_stride;
                    dst8 += dst8_stride;
                } while (--y);
            }
        }
    } else {
        const int16_t offset_no_avg = (round_offset << (round_0 - bits - 1)) +
            (1 << (round_0 - bits - 2));
        const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);

        if (w == 8) {
            do {
                const __m256i res = x_convolve_6tap_8x2_avx2(
                    src_ptr, src_stride, coeffs_256, filt_256);
                jnt_no_avg_round_store_8x2_avx2(res, offset_no_avg_256, dst, dst_stride);
                src_ptr += 2 * src_stride;
                dst += 2 * dst_stride;
                y -= 2;
            } while (y);
        } else if (w == 16) {
            do {
                jnt_x_no_avg_6tap_16x2_avx2(
                    src_ptr, src_stride, coeffs_256, filt_256, offset_no_avg_256, dst, dst_stride);
                src_ptr += 2 * src_stride;
                dst += 2 * dst_stride;
                y -= 2;
            } while (y);
        } else if (w == 32) {
            do {
                jnt_x_no_avg_6tap_32_avx2(src_ptr, coeffs_256, filt_256, offset_no_avg_256, dst);
                src_ptr += src_stride;
                dst += dst_stride;
            } while (--y);
        } else if (w == 64) {
            do {
                jnt_x_no_avg_6tap_32_avx2(src_ptr, coeffs_256, filt_256, offset_no_avg_256, dst);
                jnt_x_no_avg_6tap_32_avx2(
                    src_ptr + 32, coeffs_256, filt_256, offset_no_avg_256, dst + 32);
                src_ptr += src_stride;
                dst += dst_stride;
            } while (--y);
        } else {
            assert(w == 128);

            do {
                jnt_x_no_avg_6tap_32_avx2(src_ptr, coeffs_256, filt_256, offset_no_avg_256, dst);
                jnt_x_no_avg_6tap_32_avx2(
                    src_ptr + 1 * 32, coeffs_256, filt_256, offset_no_avg_256, dst + 1 * 32);
                jnt_x_no_avg_6tap_32_avx2(
                    src_ptr + 2 * 32, coeffs_256, filt_256, offset_no_avg_256, dst + 2 * 32);
                jnt_x_no_avg_6tap_32_avx2(
                    src_ptr + 3 * 32, coeffs_256, filt_256, offset_no_avg_256, dst + 3 * 32);
                src_ptr += src_stride;
                dst += dst_stride;
            } while (--y);
        }
    }
}

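/*
 * 8-tap variant: taps cover src[x - 3] .. src[x + 4] (hence src - 3) and a
 * fourth shuffle pattern is loaded; otherwise the structure is identical to
 * the 6-tap function above.
 */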
static void jnt_convolve_x_8tap_avx2(const uint8_t *const src, const int32_t src_stride,
                                     uint8_t *dst8, const int32_t dst8_stride, const int32_t w,
                                     const int32_t                   h,
                                     const InterpFilterParams *const filter_params_x,
                                     const int32_t                   subpel_x_q4,
                                     const ConvolveParams *const     conv_params) {
    const uint8_t *src_ptr      = src - 3;
    const int32_t  dst_stride   = conv_params->dst_stride;
    const int32_t  round_0      = 3;
    const int32_t  round_1      = COMPOUND_ROUND1_BITS;
    const int32_t  bits         = FILTER_BITS - round_1;
    const int32_t  bd           = 8;
    const int32_t  round_bits   = 2 * FILTER_BITS - round_0 - round_1;
    const int32_t  offset_bits  = bd + round_bits;
    const int32_t  round_offset = (1 << offset_bits) + (1 << (offset_bits - 1));
    ConvBufType *  dst          = conv_params->dst;
    int32_t        y            = h;
    __m256i        coeffs_256[4], filt_256[4];

    filt_256[0] = _mm256_loadu_si256((__m256i const *)filt1_global_avx);
    filt_256[1] = _mm256_loadu_si256((__m256i const *)filt2_global_avx);
    filt_256[2] = _mm256_loadu_si256((__m256i const *)filt3_global_avx);
    filt_256[3] = _mm256_loadu_si256((__m256i const *)filt4_global_avx);

    prepare_half_coeffs_8tap_avx2(filter_params_x, subpel_x_q4, coeffs_256);

    if (conv_params->do_average) {
        if (conv_params->use_jnt_comp_avg) {
            const int32_t factor = conv_params->fwd_offset | (conv_params->bck_offset << 16);
            const int32_t offset_comp_avg = round_offset * conv_params->bck_offset +
                (1 << (round_bits + DIST_PRECISION_BITS - 1)) -
                (round_offset << DIST_PRECISION_BITS);
            const __m256i factor_256          = _mm256_set1_epi32(factor);
            const __m256i offset_comp_avg_256 = _mm256_set1_epi32(offset_comp_avg);

            if (w == 8) {
                do {
                    const __m256i res = x_convolve_8tap_8x2_avx2(
                        src_ptr, src_stride, coeffs_256, filt_256);
                    jnt_comp_avg_round_store_8x2_avx2(
                        res, factor_256, offset_comp_avg_256, dst, dst_stride, dst8, dst8_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    dst8 += 2 * dst8_stride;
                    y -= 2;
                } while (y);
            } else if (w == 16) {
                do {
                    jnt_x_comp_avg_8tap_16x2_avx2(src_ptr,
                                                  src_stride,
                                                  coeffs_256,
                                                  filt_256,
                                                  factor_256,
                                                  offset_comp_avg_256,
                                                  dst,
                                                  dst_stride,
                                                  dst8,
                                                  dst8_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    dst8 += 2 * dst8_stride;
                    y -= 2;
                } while (y);
            } else if (w == 32) {
                do {
                    jnt_x_comp_avg_8tap_32_avx2(
                        src_ptr, coeffs_256, filt_256, factor_256, offset_comp_avg_256, dst, dst8);
                    src_ptr += src_stride;
                    dst += dst_stride;
                    dst8 += dst8_stride;
                } while (--y);
            } else if (w == 64) {
                do {
                    jnt_x_comp_avg_8tap_32_avx2(
                        src_ptr, coeffs_256, filt_256, factor_256, offset_comp_avg_256, dst, dst8);
                    jnt_x_comp_avg_8tap_32_avx2(src_ptr + 32,
                                                coeffs_256,
                                                filt_256,
                                                factor_256,
                                                offset_comp_avg_256,
                                                dst + 32,
                                                dst8 + 32);
                    src_ptr += src_stride;
                    dst += dst_stride;
                    dst8 += dst8_stride;
                } while (--y);
            } else {
                assert(w == 128);

                do {
                    jnt_x_comp_avg_8tap_32_avx2(
                        src_ptr, coeffs_256, filt_256, factor_256, offset_comp_avg_256, dst, dst8);
                    jnt_x_comp_avg_8tap_32_avx2(src_ptr + 1 * 32,
                                                coeffs_256,
                                                filt_256,
                                                factor_256,
                                                offset_comp_avg_256,
                                                dst + 1 * 32,
                                                dst8 + 1 * 32);
                    jnt_x_comp_avg_8tap_32_avx2(src_ptr + 2 * 32,
                                                coeffs_256,
                                                filt_256,
                                                factor_256,
                                                offset_comp_avg_256,
                                                dst + 2 * 32,
                                                dst8 + 2 * 32);
                    jnt_x_comp_avg_8tap_32_avx2(src_ptr + 3 * 32,
                                                coeffs_256,
                                                filt_256,
                                                factor_256,
                                                offset_comp_avg_256,
                                                dst + 3 * 32,
                                                dst8 + 3 * 32);
                    src_ptr += src_stride;
                    dst += dst_stride;
                    dst8 += dst8_stride;
                } while (--y);
            }
        } else {
            const int16_t offset_avg = (1 << (FILTER_BITS - 1)) + (1 << (round_0 - bits - 2)) -
                (round_offset << (round_0 - bits - 1));
            const __m256i offset_avg_256 = _mm256_set1_epi16(offset_avg);

            if (w == 8) {
                do {
                    const __m256i res = x_convolve_8tap_8x2_avx2(
                        src_ptr, src_stride, coeffs_256, filt_256);
                    jnt_avg_round_store_8x2_avx2(
                        res, offset_avg_256, dst, dst_stride, dst8, dst8_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    dst8 += 2 * dst8_stride;
                    y -= 2;
                } while (y);
            } else if (w == 16) {
                do {
                    jnt_x_avg_8tap_16x2_avx2(src_ptr,
                                             src_stride,
                                             coeffs_256,
                                             filt_256,
                                             offset_avg_256,
                                             dst,
                                             dst_stride,
                                             dst8,
                                             dst8_stride);
                    src_ptr += 2 * src_stride;
                    dst += 2 * dst_stride;
                    dst8 += 2 * dst8_stride;
                    y -= 2;
                } while (y);
            } else if (w == 32) {
                do {
                    jnt_x_avg_8tap_32_avx2(
                        src_ptr, coeffs_256, filt_256, offset_avg_256, dst, dst8);
                    src_ptr += src_stride;
                    dst += dst_stride;
                    dst8 += dst8_stride;
                } while (--y);
            } else if (w == 64) {
                do {
                    jnt_x_avg_8tap_32_avx2(
                        src_ptr, coeffs_256, filt_256, offset_avg_256, dst, dst8);
                    jnt_x_avg_8tap_32_avx2(
                        src_ptr + 32, coeffs_256, filt_256, offset_avg_256, dst + 32, dst8 + 32);
                    src_ptr += src_stride;
                    dst += dst_stride;
                    dst8 += dst8_stride;
                } while (--y);
            } else {
                assert(w == 128);

                do {
                    jnt_x_avg_8tap_32_avx2(
                        src_ptr, coeffs_256, filt_256, offset_avg_256, dst, dst8);
                    jnt_x_avg_8tap_32_avx2(src_ptr + 1 * 32,
                                           coeffs_256,
                                           filt_256,
                                           offset_avg_256,
                                           dst + 1 * 32,
                                           dst8 + 1 * 32);
                    jnt_x_avg_8tap_32_avx2(src_ptr + 2 * 32,
                                           coeffs_256,
                                           filt_256,
                                           offset_avg_256,
                                           dst + 2 * 32,
                                           dst8 + 2 * 32);
                    jnt_x_avg_8tap_32_avx2(src_ptr + 3 * 32,
                                           coeffs_256,
                                           filt_256,
                                           offset_avg_256,
                                           dst + 3 * 32,
                                           dst8 + 3 * 32);
                    src_ptr += src_stride;
                    dst += dst_stride;
                    dst8 += dst8_stride;
                } while (--y);
            }
        }
    } else {
        const int16_t offset_no_avg = (round_offset << (round_0 - bits - 1)) +
            (1 << (round_0 - bits - 2));
        const __m256i offset_no_avg_256 = _mm256_set1_epi16(offset_no_avg);

        if (w == 8) {
            do {
                const __m256i res = x_convolve_8tap_8x2_avx2(
                    src_ptr, src_stride, coeffs_256, filt_256);
                jnt_no_avg_round_store_8x2_avx2(res, offset_no_avg_256, dst, dst_stride);
                src_ptr += 2 * src_stride;
                dst += 2 * dst_stride;
                y -= 2;
            } while (y);
        } else if (w == 16) {
            do {
                jnt_x_no_avg_8tap_16x2_avx2(
                    src_ptr, src_stride, coeffs_256, filt_256, offset_no_avg_256, dst, dst_stride);
                src_ptr += 2 * src_stride;
                dst += 2 * dst_stride;
                y -= 2;
            } while (y);
        } else if (w == 32) {
            do {
                jnt_x_no_avg_8tap_32_avx2(src_ptr, coeffs_256, filt_256, offset_no_avg_256, dst);
                src_ptr += src_stride;
                dst += dst_stride;
            } while (--y);
        } else if (w == 64) {
            do {
                jnt_x_no_avg_8tap_32_avx2(src_ptr, coeffs_256, filt_256, offset_no_avg_256, dst);
                jnt_x_no_avg_8tap_32_avx2(
                    src_ptr + 32, coeffs_256, filt_256, offset_no_avg_256, dst + 32);
                src_ptr += src_stride;
                dst += dst_stride;
            } while (--y);
        } else {
            assert(w == 128);

            do {
                jnt_x_no_avg_8tap_32_avx2(src_ptr, coeffs_256, filt_256, offset_no_avg_256, dst);
                jnt_x_no_avg_8tap_32_avx2(
                    src_ptr + 1 * 32, coeffs_256, filt_256, offset_no_avg_256, dst + 1 * 32);
                jnt_x_no_avg_8tap_32_avx2(
                    src_ptr + 2 * 32, coeffs_256, filt_256, offset_no_avg_256, dst + 2 * 32);
                jnt_x_no_avg_8tap_32_avx2(
                    src_ptr + 3 * 32, coeffs_256, filt_256, offset_no_avg_256, dst + 3 * 32);
                src_ptr += src_stride;
                dst += dst_stride;
            } while (--y);
        }
    }
}

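/*
 * Dispatch: the table below is indexed directly by the tap count returned
 * by get_convolve_tap(), so only the even entries (2/4/6/8) are populated
 * and the NULL slots must be unreachable.
 */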
typedef void (*JntConvolveXTapFunc)(const uint8_t *const src, const int32_t src_stride,
                                    uint8_t *dst8, const int32_t dst8_stride, const int32_t w,
                                    const int32_t                   h,
                                    const InterpFilterParams *const filter_params_x,
                                    const int32_t                   subpel_x_q4,
                                    const ConvolveParams *const     conv_params);

void svt_av1_jnt_convolve_x_avx2(const uint8_t *src, int32_t src_stride, uint8_t *dst8,
                                 int32_t dst8_stride, int32_t w, int32_t h,
                                 InterpFilterParams *filter_params_x,
                                 InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
                                 const int32_t subpel_y_q4, ConvolveParams *conv_params) {
    static const JntConvolveXTapFunc jnt_convolve_x_tap_func_table[MAX_FILTER_TAP + 1] = {
        NULL,
        NULL,
        jnt_convolve_x_2tap_avx2,
        NULL,
        jnt_convolve_x_4tap_ssse3,
        NULL,
        jnt_convolve_x_6tap_avx2,
        NULL,
        jnt_convolve_x_8tap_avx2};
    const int32_t tap_x = get_convolve_tap(filter_params_x->filter_ptr);

    (void)filter_params_y;
    (void)subpel_y_q4;

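    /* The per-tap kernels hard-code round_0 = 3 and
     * round_1 = COMPOUND_ROUND1_BITS; make sure the caller agrees. */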
    assert(conv_params->round_0 == 3);
    assert(conv_params->round_1 == COMPOUND_ROUND1_BITS);

    jnt_convolve_x_tap_func_table[tap_x](
        src, src_stride, dst8, dst8_stride, w, h, filter_params_x, subpel_x_q4, conv_params);
}