1 /*
2 * Copyright(c) 2019 Intel Corporation
3 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
4 *
5 * This source code is subject to the terms of the BSD 2 Clause License and
6 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
7 * was not distributed with this source code in the LICENSE file, you can
8 * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
9 * Media Patent License 1.0 was not distributed with this source code in the
10 * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
11 */
12 
13 #include <assert.h>
14 #include "EbDefinitions.h"
15 #include "aom_dsp_rtcd.h"
16 #include "EbTransforms.h"
17 #include <immintrin.h>
18 #include "txfm_common_avx2.h"
19 
20 void av1_transform_config(TxType tx_type, TxSize tx_size, Txfm2dFlipCfg *cfg);
21 
22 typedef void (*FwdTransform1dAvx2)(const __m256i *in, __m256i *out, int8_t bit,
23                                    const int32_t num_cols);
24 
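// Transposes the 4x4 block of 32-bit values held in each 128-bit lane of
// x0..x3 into y0..y3 (two independent 4x4 transposes). Callers merge the
// lanes afterwards with _mm256_permute2x128_si256 to build larger transposes.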
25 #define TRANSPOSE_4X4_AVX2(x0, x1, x2, x3, y0, y1, y2, y3) \
26     do {                                                   \
27         __m256i u0, u1, u2, u3;                            \
28         u0 = _mm256_unpacklo_epi32(x0, x1);                \
29         u1 = _mm256_unpackhi_epi32(x0, x1);                \
30         u2 = _mm256_unpacklo_epi32(x2, x3);                \
31         u3 = _mm256_unpackhi_epi32(x2, x3);                \
32         y0 = _mm256_unpacklo_epi64(u0, u2);                \
33         y1 = _mm256_unpackhi_epi64(u0, u2);                \
34         y2 = _mm256_unpacklo_epi64(u1, u3);                \
35         y3 = _mm256_unpackhi_epi64(u1, u3);                \
36     } while (0)
37 
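// Transpose an 8x8 block of 32-bit coefficients held in eight __m256i rows.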
static INLINE void transpose_8x8_avx2(const __m256i *in, __m256i *out) {
39     __m256i out1[8];
40     TRANSPOSE_4X4_AVX2(in[0], in[1], in[2], in[3], out1[0], out1[1], out1[4], out1[5]);
41     TRANSPOSE_4X4_AVX2(in[4], in[5], in[6], in[7], out1[2], out1[3], out1[6], out1[7]);
42     out[0] = _mm256_permute2x128_si256(out1[0], out1[2], 0x20);
43     out[1] = _mm256_permute2x128_si256(out1[1], out1[3], 0x20);
44     out[2] = _mm256_permute2x128_si256(out1[4], out1[6], 0x20);
45     out[3] = _mm256_permute2x128_si256(out1[5], out1[7], 0x20);
46     out[4] = _mm256_permute2x128_si256(out1[0], out1[2], 0x31);
47     out[5] = _mm256_permute2x128_si256(out1[1], out1[3], 0x31);
48     out[6] = _mm256_permute2x128_si256(out1[4], out1[6], 0x31);
49     out[7] = _mm256_permute2x128_si256(out1[5], out1[7], 0x31);
50 }
51 
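// Transpose a 16x16 block of 32-bit coefficients stored as 32 __m256i,
// two registers per row (even index = left 8 columns, odd index = right 8).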
static INLINE void transpose_16x16_avx2(const __m256i *in, __m256i *out) {
53     __m256i temp[32];
54     TRANSPOSE_4X4_AVX2(in[0], in[2], in[4], in[6], temp[0], temp[2], temp[4], temp[6]);
55     TRANSPOSE_4X4_AVX2(in[8], in[10], in[12], in[14], temp[17], temp[19], temp[21], temp[23]);
56     TRANSPOSE_4X4_AVX2(in[1], in[3], in[5], in[7], temp[16], temp[18], temp[20], temp[22]);
57     TRANSPOSE_4X4_AVX2(in[9], in[11], in[13], in[15], temp[25], temp[27], temp[29], temp[31]);
58     TRANSPOSE_4X4_AVX2(in[16], in[18], in[20], in[22], temp[1], temp[3], temp[5], temp[7]);
59     TRANSPOSE_4X4_AVX2(in[24], in[26], in[28], in[30], temp[9], temp[11], temp[13], temp[15]);
60     TRANSPOSE_4X4_AVX2(in[17], in[19], in[21], in[23], temp[8], temp[10], temp[12], temp[14]);
61     TRANSPOSE_4X4_AVX2(in[25], in[27], in[29], in[31], temp[24], temp[26], temp[28], temp[30]);
62 
63     out[0]  = _mm256_permute2x128_si256(temp[0], temp[17], 0x20);
64     out[1]  = _mm256_permute2x128_si256(temp[1], temp[9], 0x20);
65     out[2]  = _mm256_permute2x128_si256(temp[2], temp[19], 0x20);
66     out[3]  = _mm256_permute2x128_si256(temp[3], temp[11], 0x20);
67     out[4]  = _mm256_permute2x128_si256(temp[4], temp[21], 0x20);
68     out[5]  = _mm256_permute2x128_si256(temp[5], temp[13], 0x20);
69     out[6]  = _mm256_permute2x128_si256(temp[6], temp[23], 0x20);
70     out[7]  = _mm256_permute2x128_si256(temp[7], temp[15], 0x20);
71     out[8]  = _mm256_permute2x128_si256(temp[0], temp[17], 0x31);
72     out[9]  = _mm256_permute2x128_si256(temp[1], temp[9], 0x31);
73     out[10] = _mm256_permute2x128_si256(temp[2], temp[19], 0x31);
74     out[11] = _mm256_permute2x128_si256(temp[3], temp[11], 0x31);
75     out[12] = _mm256_permute2x128_si256(temp[4], temp[21], 0x31);
76     out[13] = _mm256_permute2x128_si256(temp[5], temp[13], 0x31);
77     out[14] = _mm256_permute2x128_si256(temp[6], temp[23], 0x31);
78     out[15] = _mm256_permute2x128_si256(temp[7], temp[15], 0x31);
79     out[16] = _mm256_permute2x128_si256(temp[16], temp[25], 0x20);
80     out[17] = _mm256_permute2x128_si256(temp[8], temp[24], 0x20);
81     out[18] = _mm256_permute2x128_si256(temp[18], temp[27], 0x20);
82     out[19] = _mm256_permute2x128_si256(temp[10], temp[26], 0x20);
83     out[20] = _mm256_permute2x128_si256(temp[20], temp[29], 0x20);
84     out[21] = _mm256_permute2x128_si256(temp[12], temp[28], 0x20);
85     out[22] = _mm256_permute2x128_si256(temp[22], temp[31], 0x20);
86     out[23] = _mm256_permute2x128_si256(temp[14], temp[30], 0x20);
87     out[24] = _mm256_permute2x128_si256(temp[16], temp[25], 0x31);
88     out[25] = _mm256_permute2x128_si256(temp[8], temp[24], 0x31);
89     out[26] = _mm256_permute2x128_si256(temp[18], temp[27], 0x31);
90     out[27] = _mm256_permute2x128_si256(temp[10], temp[26], 0x31);
91     out[28] = _mm256_permute2x128_si256(temp[20], temp[29], 0x31);
92     out[29] = _mm256_permute2x128_si256(temp[12], temp[28], 0x31);
93     out[30] = _mm256_permute2x128_si256(temp[22], temp[31], 0x31);
94     out[31] = _mm256_permute2x128_si256(temp[14], temp[30], 0x31);
95 }
96 
static INLINE void transpose_32_8x8_avx2(int32_t stride, const __m256i *in, __m256i *out) {
98     __m256i out1[8];
99     __m256i temp0 = _mm256_unpacklo_epi32(in[0 * stride], in[2 * stride]);
100     __m256i temp1 = _mm256_unpackhi_epi32(in[0 * stride], in[2 * stride]);
101     __m256i temp2 = _mm256_unpacklo_epi32(in[1 * stride], in[3 * stride]);
102     __m256i temp3 = _mm256_unpackhi_epi32(in[1 * stride], in[3 * stride]);
103     __m256i temp4 = _mm256_unpacklo_epi32(in[4 * stride], in[6 * stride]);
104     __m256i temp5 = _mm256_unpackhi_epi32(in[4 * stride], in[6 * stride]);
105     __m256i temp6 = _mm256_unpacklo_epi32(in[5 * stride], in[7 * stride]);
106     __m256i temp7 = _mm256_unpackhi_epi32(in[5 * stride], in[7 * stride]);
107 
108     out1[0] = _mm256_unpacklo_epi32(temp0, temp2);
109     out1[1] = _mm256_unpackhi_epi32(temp0, temp2);
110     out1[4] = _mm256_unpacklo_epi32(temp1, temp3);
111     out1[5] = _mm256_unpackhi_epi32(temp1, temp3);
112     out1[2] = _mm256_unpacklo_epi32(temp4, temp6);
113     out1[3] = _mm256_unpackhi_epi32(temp4, temp6);
114     out1[6] = _mm256_unpacklo_epi32(temp5, temp7);
115     out1[7] = _mm256_unpackhi_epi32(temp5, temp7);
116 
117     out[0 * stride] = _mm256_permute2x128_si256(out1[0], out1[2], 0x20);
118     out[1 * stride] = _mm256_permute2x128_si256(out1[1], out1[3], 0x20);
119     out[2 * stride] = _mm256_permute2x128_si256(out1[4], out1[6], 0x20);
120     out[3 * stride] = _mm256_permute2x128_si256(out1[5], out1[7], 0x20);
121     out[4 * stride] = _mm256_permute2x128_si256(out1[0], out1[2], 0x31);
122     out[5 * stride] = _mm256_permute2x128_si256(out1[1], out1[3], 0x31);
123     out[6 * stride] = _mm256_permute2x128_si256(out1[4], out1[6], 0x31);
124     out[7 * stride] = _mm256_permute2x128_si256(out1[5], out1[7], 0x31);
125 }
126 
static INLINE void transpose_32_avx2(int32_t txfm_size, const __m256i *input, __m256i *output) {
128     const int32_t num_per_256 = 8;
129     const int32_t row_size    = txfm_size;
130     const int32_t col_size    = txfm_size / num_per_256;
131     int32_t       r, c;
132 
133     // transpose each 8x8 block internally
134     for (r = 0; r < row_size; r += 8) {
135         for (c = 0; c < col_size; c++) {
136             transpose_32_8x8_avx2(
137                 col_size, &input[r * col_size + c], &output[c * 8 * col_size + r / 8]);
138         }
139     }
140 }
141 
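// General transpose for a width x height block of 32-bit coefficients (both
// multiples of 8); each row occupies width/8 consecutive __m256i. Every 8x8
// tile is transposed with two TRANSPOSE_4X4_AVX2 calls plus lane permutes.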
static INLINE void transpose_8nx8n(const __m256i *input, __m256i *output, const int32_t width,
143                                    const int32_t height) {
144     const int32_t numcol = height >> 3;
145     const int32_t numrow = width >> 3;
146     __m256i       out1[8];
147     for (int32_t j = 0; j < numrow; j++) {
148         for (int32_t i = 0; i < numcol; i++) {
149             TRANSPOSE_4X4_AVX2(input[i * width + j + (numrow * 0)],
150                                input[i * width + j + (numrow * 1)],
151                                input[i * width + j + (numrow * 2)],
152                                input[i * width + j + (numrow * 3)],
153                                out1[0],
154                                out1[1],
155                                out1[4],
156                                out1[5]);
157             TRANSPOSE_4X4_AVX2(input[i * width + j + (numrow * 4)],
158                                input[i * width + j + (numrow * 5)],
159                                input[i * width + j + (numrow * 6)],
160                                input[i * width + j + (numrow * 7)],
161                                out1[2],
162                                out1[3],
163                                out1[6],
164                                out1[7]);
165             output[j * height + i + (numcol * 0)] = _mm256_permute2x128_si256(
166                 out1[0], out1[2], 0x20);
167             output[j * height + i + (numcol * 1)] = _mm256_permute2x128_si256(
168                 out1[1], out1[3], 0x20);
169             output[j * height + i + (numcol * 2)] = _mm256_permute2x128_si256(
170                 out1[4], out1[6], 0x20);
171             output[j * height + i + (numcol * 3)] = _mm256_permute2x128_si256(
172                 out1[5], out1[7], 0x20);
173             output[j * height + i + (numcol * 4)] = _mm256_permute2x128_si256(
174                 out1[0], out1[2], 0x31);
175             output[j * height + i + (numcol * 5)] = _mm256_permute2x128_si256(
176                 out1[1], out1[3], 0x31);
177             output[j * height + i + (numcol * 6)] = _mm256_permute2x128_si256(
178                 out1[4], out1[6], 0x31);
179             output[j * height + i + (numcol * 7)] = _mm256_permute2x128_si256(
180                 out1[5], out1[7], 0x31);
181         }
182     }
183 }
184 
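// Variant of transpose_8nx8n for the N2 (half-size output) paths: only the
// first half of the 8-row input groups is transposed (clamped to at least
// one group), since those paths do not use the remaining coefficients.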
static INLINE void transpose_8nx8n_N2_half(const __m256i *input, __m256i *output,
186                                            const int32_t width, const int32_t height) {
187     const int32_t numcol      = height >> 3;
188     const int32_t numrow      = width >> 3;
189     int32_t       calc_numcol = numcol >> 1;
190     if (!calc_numcol) {
191         calc_numcol = 1;
192     }
193 
194     __m256i out1[8];
195     for (int32_t j = 0; j < numrow; j++) {
196         for (int32_t i = 0; i < calc_numcol; i++) {
197             TRANSPOSE_4X4_AVX2(input[i * width + j + (numrow * 0)],
198                                input[i * width + j + (numrow * 1)],
199                                input[i * width + j + (numrow * 2)],
200                                input[i * width + j + (numrow * 3)],
201                                out1[0],
202                                out1[1],
203                                out1[4],
204                                out1[5]);
205             TRANSPOSE_4X4_AVX2(input[i * width + j + (numrow * 4)],
206                                input[i * width + j + (numrow * 5)],
207                                input[i * width + j + (numrow * 6)],
208                                input[i * width + j + (numrow * 7)],
209                                out1[2],
210                                out1[3],
211                                out1[6],
212                                out1[7]);
213             output[j * height + i + (numcol * 0)] = _mm256_permute2x128_si256(
214                 out1[0], out1[2], 0x20);
215             output[j * height + i + (numcol * 1)] = _mm256_permute2x128_si256(
216                 out1[1], out1[3], 0x20);
217             output[j * height + i + (numcol * 2)] = _mm256_permute2x128_si256(
218                 out1[4], out1[6], 0x20);
219             output[j * height + i + (numcol * 3)] = _mm256_permute2x128_si256(
220                 out1[5], out1[7], 0x20);
221             output[j * height + i + (numcol * 4)] = _mm256_permute2x128_si256(
222                 out1[0], out1[2], 0x31);
223             output[j * height + i + (numcol * 5)] = _mm256_permute2x128_si256(
224                 out1[1], out1[3], 0x31);
225             output[j * height + i + (numcol * 6)] = _mm256_permute2x128_si256(
226                 out1[4], out1[6], 0x31);
227             output[j * height + i + (numcol * 7)] = _mm256_permute2x128_si256(
228                 out1[5], out1[7], 0x31);
229         }
230     }
231 }
232 
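// N2 variant that transposes only the top-left quarter: the first half of
// both the row groups and the column groups (each clamped to at least one).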
static INLINE void transpose_8nx8n_N2_quad(const __m256i *input, __m256i *output,
234                                            const int32_t width, const int32_t height) {
235     const int32_t numcol = height >> 3;
236     const int32_t numrow = width >> 3;
237 
238     int32_t calc_numcol = numcol >> 1;
239     int32_t calc_numrow = numrow >> 1;
240     if (!calc_numcol) {
241         calc_numcol = 1;
242     }
243     if (!calc_numrow) {
244         calc_numrow = 1;
245     }
246 
247     __m256i out1[8];
248     for (int32_t j = 0; j < calc_numrow; j++) {
249         for (int32_t i = 0; i < calc_numcol; i++) {
250             TRANSPOSE_4X4_AVX2(input[i * width + j + (numrow * 0)],
251                                input[i * width + j + (numrow * 1)],
252                                input[i * width + j + (numrow * 2)],
253                                input[i * width + j + (numrow * 3)],
254                                out1[0],
255                                out1[1],
256                                out1[4],
257                                out1[5]);
258             TRANSPOSE_4X4_AVX2(input[i * width + j + (numrow * 4)],
259                                input[i * width + j + (numrow * 5)],
260                                input[i * width + j + (numrow * 6)],
261                                input[i * width + j + (numrow * 7)],
262                                out1[2],
263                                out1[3],
264                                out1[6],
265                                out1[7]);
266             output[j * height + i + (numcol * 0)] = _mm256_permute2x128_si256(
267                 out1[0], out1[2], 0x20);
268             output[j * height + i + (numcol * 1)] = _mm256_permute2x128_si256(
269                 out1[1], out1[3], 0x20);
270             output[j * height + i + (numcol * 2)] = _mm256_permute2x128_si256(
271                 out1[4], out1[6], 0x20);
272             output[j * height + i + (numcol * 3)] = _mm256_permute2x128_si256(
273                 out1[5], out1[7], 0x20);
274             output[j * height + i + (numcol * 4)] = _mm256_permute2x128_si256(
275                 out1[0], out1[2], 0x31);
276             output[j * height + i + (numcol * 5)] = _mm256_permute2x128_si256(
277                 out1[1], out1[3], 0x31);
278             output[j * height + i + (numcol * 6)] = _mm256_permute2x128_si256(
279                 out1[4], out1[6], 0x31);
280             output[j * height + i + (numcol * 7)] = _mm256_permute2x128_si256(
281                 out1[5], out1[7], 0x31);
282         }
283     }
284 }
static INLINE void transpose_4x8_avx2(const __m256i *in, __m256i *out) {
286     __m256i perm = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
287 
288     TRANSPOSE_4X4_AVX2(in[0], in[1], in[2], in[3], out[0], out[1], out[2], out[3]);
289     out[0] = _mm256_permutevar8x32_epi32(out[0], perm);
290     out[1] = _mm256_permutevar8x32_epi32(out[1], perm);
291     out[2] = _mm256_permutevar8x32_epi32(out[2], perm);
292     out[3] = _mm256_permutevar8x32_epi32(out[3], perm);
293 }
294 
static INLINE void transpose_4x16_avx2(const __m256i *in, __m256i *out) {
296     __m256i perm = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
297 
298     TRANSPOSE_4X4_AVX2(in[0], in[1], in[2], in[3], out[0], out[2], out[4], out[6]);
299     TRANSPOSE_4X4_AVX2(in[4], in[5], in[6], in[7], out[1], out[3], out[5], out[7]);
300 
301     out[0] = _mm256_permutevar8x32_epi32(out[0], perm);
302     out[1] = _mm256_permutevar8x32_epi32(out[1], perm);
303     out[2] = _mm256_permutevar8x32_epi32(out[2], perm);
304     out[3] = _mm256_permutevar8x32_epi32(out[3], perm);
305     out[4] = _mm256_permutevar8x32_epi32(out[4], perm);
306     out[5] = _mm256_permutevar8x32_epi32(out[5], perm);
307     out[6] = _mm256_permutevar8x32_epi32(out[6], perm);
308     out[7] = _mm256_permutevar8x32_epi32(out[7], perm);
309 }
310 
311 // Note:
312 //  rounding = 1 << (bit - 1)
static INLINE __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0, const __m256i *w1,
                                    const __m256i *n1, const __m256i *rounding, int32_t bit) {
315     __m256i x, y;
316 
317     x = _mm256_mullo_epi32(*w0, *n0);
318     y = _mm256_mullo_epi32(*w1, *n1);
319     x = _mm256_add_epi32(x, y);
320     x = _mm256_add_epi32(x, *rounding);
321     x = _mm256_srai_epi32(x, bit);
322     return x;
323 }
324 
static INLINE __m128i half_btf_small(const __m128i *w0, const __m128i *n0, const __m128i *w1,
326                                      const __m128i *n1, const __m128i *rounding, int32_t bit) {
327     __m128i x, y;
328 
329     x = _mm_mullo_epi32(*w0, *n0);
330     y = _mm_mullo_epi32(*w1, *n1);
331     x = _mm_add_epi32(x, y);
332     x = _mm_add_epi32(x, *rounding);
333     x = _mm_srai_epi32(x, bit);
334     return x;
335 }
336 
337 // out0 = in0*w0 + in1*w1
338 // out1 = -in1*w0 + in0*w1
339 #define btf_32_avx2_type0(w0, w1, in0, in1, out0, out1, bit)       \
340     do {                                                           \
341         const __m256i ww0    = _mm256_set1_epi32(w0);              \
342         const __m256i ww1    = _mm256_set1_epi32(w1);              \
343         const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0);       \
344         const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1);       \
345         out0                 = _mm256_add_epi32(in0_w0, in1_w1);   \
346         out0                 = av1_round_shift_32_avx2(out0, bit); \
347         const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1);       \
348         const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0);       \
349         out1                 = _mm256_sub_epi32(in0_w1, in1_w0);   \
350         out1                 = av1_round_shift_32_avx2(out1, bit); \
351     } while (0)
352 
353 // out0 = in0*w0 + in1*w1
354 // out1 = in1*w0 - in0*w1
355 #define btf_32_avx2_type1(w0, w1, in0, in1, out0, out1, bit) \
356     do { btf_32_avx2_type0(w1, w0, in1, in0, out0, out1, bit); } while (0)
357 
358 // out0 = in0*w0 + in1*w1
359 // out1 = -in1*w0 + in0*w1
360 #define btf_32_type0_avx2_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
361     do {                                                              \
362         const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0);          \
363         const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1);          \
364         out0                 = _mm256_add_epi32(in0_w0, in1_w1);      \
365         out0                 = _mm256_add_epi32(out0, r);             \
366         out0                 = _mm256_srai_epi32(out0, bit);          \
367         const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1);          \
368         const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0);          \
369         out1                 = _mm256_sub_epi32(in0_w1, in1_w0);      \
370         out1                 = _mm256_add_epi32(out1, r);             \
371         out1                 = _mm256_srai_epi32(out1, bit);          \
372     } while (0)
373 
374 // out0 = in0*w0 + in1*w1
375 // out1 = in1*w0 - in0*w1
376 #define btf_32_type1_avx2_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
377     do { btf_32_type0_avx2_new(ww1, ww0, in1, in0, out0, out1, r, bit); } while (0)
378 
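// Per-TX-size shift triplets (defined elsewhere): shift[0] is applied when
// loading the residuals and -shift[1] as the rounding right shift between
// the column and row transforms, as used in the functions below.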
379 static const int8_t *fwd_txfm_shift_ls[TX_SIZES_ALL] = {
380     fwd_shift_4x4,   fwd_shift_8x8,   fwd_shift_16x16, fwd_shift_32x32, fwd_shift_64x64,
381     fwd_shift_4x8,   fwd_shift_8x4,   fwd_shift_8x16,  fwd_shift_16x8,  fwd_shift_16x32,
382     fwd_shift_32x16, fwd_shift_32x64, fwd_shift_64x32, fwd_shift_4x16,  fwd_shift_16x4,
383     fwd_shift_8x32,  fwd_shift_32x8,  fwd_shift_16x64, fwd_shift_64x16,
384 };
385 
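// Load an 8x8 block of int16 residuals (optionally flipped vertically and/or
// horizontally), sign-extend to 32 bits and apply the pre-transform shift.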
static INLINE void load_buffer_8x8(const int16_t *input, __m256i *in, int32_t stride,
387                                    int32_t flipud, int32_t fliplr, int32_t shift) {
388     __m128i temp[8];
389     if (!flipud) {
390         temp[0] = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
391         temp[1] = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
392         temp[2] = _mm_loadu_si128((const __m128i *)(input + 2 * stride));
393         temp[3] = _mm_loadu_si128((const __m128i *)(input + 3 * stride));
394         temp[4] = _mm_loadu_si128((const __m128i *)(input + 4 * stride));
395         temp[5] = _mm_loadu_si128((const __m128i *)(input + 5 * stride));
396         temp[6] = _mm_loadu_si128((const __m128i *)(input + 6 * stride));
397         temp[7] = _mm_loadu_si128((const __m128i *)(input + 7 * stride));
398     } else {
399         temp[0] = _mm_loadu_si128((const __m128i *)(input + 7 * stride));
400         temp[1] = _mm_loadu_si128((const __m128i *)(input + 6 * stride));
401         temp[2] = _mm_loadu_si128((const __m128i *)(input + 5 * stride));
402         temp[3] = _mm_loadu_si128((const __m128i *)(input + 4 * stride));
403         temp[4] = _mm_loadu_si128((const __m128i *)(input + 3 * stride));
404         temp[5] = _mm_loadu_si128((const __m128i *)(input + 2 * stride));
405         temp[6] = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
406         temp[7] = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
407     }
408 
409     if (fliplr) {
410         temp[0] = mm_reverse_epi16(temp[0]);
411         temp[1] = mm_reverse_epi16(temp[1]);
412         temp[2] = mm_reverse_epi16(temp[2]);
413         temp[3] = mm_reverse_epi16(temp[3]);
414         temp[4] = mm_reverse_epi16(temp[4]);
415         temp[5] = mm_reverse_epi16(temp[5]);
416         temp[6] = mm_reverse_epi16(temp[6]);
417         temp[7] = mm_reverse_epi16(temp[7]);
418     }
419 
420     in[0] = _mm256_cvtepi16_epi32(temp[0]);
421     in[1] = _mm256_cvtepi16_epi32(temp[1]);
422     in[2] = _mm256_cvtepi16_epi32(temp[2]);
423     in[3] = _mm256_cvtepi16_epi32(temp[3]);
424     in[4] = _mm256_cvtepi16_epi32(temp[4]);
425     in[5] = _mm256_cvtepi16_epi32(temp[5]);
426     in[6] = _mm256_cvtepi16_epi32(temp[6]);
427     in[7] = _mm256_cvtepi16_epi32(temp[7]);
428 
429     in[0] = _mm256_slli_epi32(in[0], shift);
430     in[1] = _mm256_slli_epi32(in[1], shift);
431     in[2] = _mm256_slli_epi32(in[2], shift);
432     in[3] = _mm256_slli_epi32(in[3], shift);
433     in[4] = _mm256_slli_epi32(in[4], shift);
434     in[5] = _mm256_slli_epi32(in[5], shift);
435     in[6] = _mm256_slli_epi32(in[6], shift);
436     in[7] = _mm256_slli_epi32(in[7], shift);
437 }
438 
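// Same as load_buffer_8x8, but rows are written to the even register indices
// in[0], in[2], ..., in[14], matching the two-registers-per-row layout of the
// 16-wide buffers used by the N2 paths.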
static INLINE void load_buffer_8x8_N2(const int16_t *input, __m256i *in, int32_t stride,
440                                       int32_t flipud, int32_t fliplr, int32_t shift) {
441     __m128i temp[8];
442     if (!flipud) {
443         temp[0] = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
444         temp[1] = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
445         temp[2] = _mm_loadu_si128((const __m128i *)(input + 2 * stride));
446         temp[3] = _mm_loadu_si128((const __m128i *)(input + 3 * stride));
447         temp[4] = _mm_loadu_si128((const __m128i *)(input + 4 * stride));
448         temp[5] = _mm_loadu_si128((const __m128i *)(input + 5 * stride));
449         temp[6] = _mm_loadu_si128((const __m128i *)(input + 6 * stride));
450         temp[7] = _mm_loadu_si128((const __m128i *)(input + 7 * stride));
451     } else {
452         temp[0] = _mm_loadu_si128((const __m128i *)(input + 7 * stride));
453         temp[1] = _mm_loadu_si128((const __m128i *)(input + 6 * stride));
454         temp[2] = _mm_loadu_si128((const __m128i *)(input + 5 * stride));
455         temp[3] = _mm_loadu_si128((const __m128i *)(input + 4 * stride));
456         temp[4] = _mm_loadu_si128((const __m128i *)(input + 3 * stride));
457         temp[5] = _mm_loadu_si128((const __m128i *)(input + 2 * stride));
458         temp[6] = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
459         temp[7] = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
460     }
461 
462     if (fliplr) {
463         temp[0] = mm_reverse_epi16(temp[0]);
464         temp[1] = mm_reverse_epi16(temp[1]);
465         temp[2] = mm_reverse_epi16(temp[2]);
466         temp[3] = mm_reverse_epi16(temp[3]);
467         temp[4] = mm_reverse_epi16(temp[4]);
468         temp[5] = mm_reverse_epi16(temp[5]);
469         temp[6] = mm_reverse_epi16(temp[6]);
470         temp[7] = mm_reverse_epi16(temp[7]);
471     }
472 
473     in[0]  = _mm256_cvtepi16_epi32(temp[0]);
474     in[2]  = _mm256_cvtepi16_epi32(temp[1]);
475     in[4]  = _mm256_cvtepi16_epi32(temp[2]);
476     in[6]  = _mm256_cvtepi16_epi32(temp[3]);
477     in[8]  = _mm256_cvtepi16_epi32(temp[4]);
478     in[10] = _mm256_cvtepi16_epi32(temp[5]);
479     in[12] = _mm256_cvtepi16_epi32(temp[6]);
480     in[14] = _mm256_cvtepi16_epi32(temp[7]);
481 
482     in[0]  = _mm256_slli_epi32(in[0], shift);
483     in[2]  = _mm256_slli_epi32(in[2], shift);
484     in[4]  = _mm256_slli_epi32(in[4], shift);
485     in[6]  = _mm256_slli_epi32(in[6], shift);
486     in[8]  = _mm256_slli_epi32(in[8], shift);
487     in[10] = _mm256_slli_epi32(in[10], shift);
488     in[12] = _mm256_slli_epi32(in[12], shift);
489     in[14] = _mm256_slli_epi32(in[14], shift);
490 }
static INLINE void load_buffer_4x4_avx2(const int16_t *input, __m256i *in, int32_t stride,
492                                         int32_t flipud, int32_t fliplr, int32_t shift) {
493     if (!flipud) {
494         in[0] = _mm256_setr_epi64x(
495             *(uint64_t *)(input + 0 * stride), *(uint64_t *)(input + 1 * stride), 0, 0);
496         in[1] = _mm256_setr_epi64x(
497             *(uint64_t *)(input + 2 * stride), *(uint64_t *)(input + 3 * stride), 0, 0);
498     } else {
499         in[0] = _mm256_setr_epi64x(
500             *(uint64_t *)(input + 3 * stride), *(uint64_t *)(input + 2 * stride), 0, 0);
501         in[1] = _mm256_setr_epi64x(
502             *(uint64_t *)(input + 1 * stride), *(uint64_t *)(input + 0 * stride), 0, 0);
503     }
504 
505     if (fliplr) {
506         in[0] = _mm256_shufflelo_epi16(in[0], 0x1b);
507         in[0] = _mm256_shufflehi_epi16(in[0], 0x1b);
508         in[1] = _mm256_shufflelo_epi16(in[1], 0x1b);
509         in[1] = _mm256_shufflehi_epi16(in[1], 0x1b);
510     }
511 
512     in[0] = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(in[0]));
513     in[1] = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(in[1]));
514 
515     in[0] = _mm256_slli_epi32(in[0], shift);
516     in[1] = _mm256_slli_epi32(in[1], shift);
517 }
518 
static INLINE void load_buffer_4x8_avx2(const int16_t *input, __m256i *out, int32_t stride,
520                                         int32_t flipud, int32_t fliplr, int32_t shift) {
521     const int16_t *top_l = input;
522     const int16_t *bot_l = input + 4 * stride;
523 
524     if (flipud) {
525         load_buffer_4x4_avx2(bot_l, out, stride, flipud, fliplr, shift);
526         load_buffer_4x4_avx2(top_l, out + 2, stride, flipud, fliplr, shift);
527     } else {
528         load_buffer_4x4_avx2(top_l, out, stride, flipud, fliplr, shift);
529         load_buffer_4x4_avx2(bot_l, out + 2, stride, flipud, fliplr, shift);
530     }
531 }
532 
static INLINE void load_buffer_8x4_avx2(const int16_t *input, __m256i *out, int32_t stride,
534                                         int32_t flipud, int32_t fliplr, int32_t shift) {
535     const int16_t *top_l = input;
536     const int16_t *top_r = input + 4;
537 
538     if (fliplr) {
539         load_buffer_4x4_avx2(top_r, out, stride, flipud, fliplr, shift);
540         load_buffer_4x4_avx2(top_l, out + 2, stride, flipud, fliplr, shift);
541     } else {
542         load_buffer_4x4_avx2(top_l, out, stride, flipud, fliplr, shift);
543         load_buffer_4x4_avx2(top_r, out + 2, stride, flipud, fliplr, shift);
544     }
545 }
546 
static INLINE void load_buffer_4x16_avx2(const int16_t *input, __m256i *out, const int32_t stride,
548                                          const int32_t flipud, const int32_t fliplr,
549                                          const int32_t shift) {
550     const int16_t *top_l = input;
551     const int16_t *bot_l = input + 8 * stride;
552 
553     if (flipud) {
554         load_buffer_4x8_avx2(bot_l, out, stride, flipud, fliplr, shift);
555         load_buffer_4x8_avx2(top_l, out + 4, stride, flipud, fliplr, shift);
556     } else {
557         load_buffer_4x8_avx2(top_l, out, stride, flipud, fliplr, shift);
558         load_buffer_4x8_avx2(bot_l, out + 4, stride, flipud, fliplr, shift);
559     }
560 }
561 
static INLINE void load_buffer_16x4_avx2(const int16_t *input, __m256i *out, int32_t stride,
563                                          int32_t flipud, int32_t fliplr, int32_t shift) {
564     const int16_t *top_l = input;
565     const int16_t *top_r = input + 8;
566 
567     if (fliplr) {
568         load_buffer_8x4_avx2(top_r, out, stride, flipud, fliplr, shift);
569         load_buffer_8x4_avx2(top_l, out + 4, stride, flipud, fliplr, shift);
570     } else {
571         load_buffer_8x4_avx2(top_l, out, stride, flipud, fliplr, shift);
572         load_buffer_8x4_avx2(top_r, out + 4, stride, flipud, fliplr, shift);
573     }
574 }
575 
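// Rounding right shift of eight registers; callers pass -shift[1] from the
// fwd_txfm_shift_ls tables as the shift amount.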
static INLINE void col_txfm_8x8_rounding(__m256i *in, int32_t shift) {
577     const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
578 
579     in[0] = _mm256_add_epi32(in[0], rounding);
580     in[1] = _mm256_add_epi32(in[1], rounding);
581     in[2] = _mm256_add_epi32(in[2], rounding);
582     in[3] = _mm256_add_epi32(in[3], rounding);
583     in[4] = _mm256_add_epi32(in[4], rounding);
584     in[5] = _mm256_add_epi32(in[5], rounding);
585     in[6] = _mm256_add_epi32(in[6], rounding);
586     in[7] = _mm256_add_epi32(in[7], rounding);
587 
588     in[0] = _mm256_srai_epi32(in[0], shift);
589     in[1] = _mm256_srai_epi32(in[1], shift);
590     in[2] = _mm256_srai_epi32(in[2], shift);
591     in[3] = _mm256_srai_epi32(in[3], shift);
592     in[4] = _mm256_srai_epi32(in[4], shift);
593     in[5] = _mm256_srai_epi32(in[5], shift);
594     in[6] = _mm256_srai_epi32(in[6], shift);
595     in[7] = _mm256_srai_epi32(in[7], shift);
596 }
597 
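// 8-point identity transform: a plain scale by 2 (left shift by 1); bit is unused.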
static void fidtx8x8_avx2(const __m256i *in, __m256i *out, int8_t bit, int32_t col_num) {
599     (void)bit;
600     out[0] = _mm256_slli_epi32(in[0 * col_num], 1);
601     out[1] = _mm256_slli_epi32(in[1 * col_num], 1);
602     out[2] = _mm256_slli_epi32(in[2 * col_num], 1);
603     out[3] = _mm256_slli_epi32(in[3 * col_num], 1);
604     out[4] = _mm256_slli_epi32(in[4 * col_num], 1);
605     out[5] = _mm256_slli_epi32(in[5 * col_num], 1);
606     out[6] = _mm256_slli_epi32(in[6 * col_num], 1);
607     out[7] = _mm256_slli_epi32(in[7 * col_num], 1);
608 }
609 
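// Identity transform for a 16-point dimension: multiply by 2 * new_sqrt2
// (5793, i.e. sqrt(2) in Q12) with rounding, processing 8 * col_num registers.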
static INLINE void fidtx16x8_avx2(const __m256i *in, __m256i *out, int8_t bit, int32_t col_num) {
611     (void)bit;
612     const int32_t bits     = 12; // new_sqrt2_bits = 12
613     const int32_t sqrt     = 2 * 5793; // 2 * new_sqrt2
614     const __m256i newsqrt  = _mm256_set1_epi32(sqrt);
615     const __m256i rounding = _mm256_set1_epi32(1 << (bits - 1));
616     __m256i       temp;
617     int32_t       num_iters = 8 * col_num;
618     for (int32_t i = 0; i < num_iters; i++) {
619         temp   = _mm256_mullo_epi32(in[i], newsqrt);
620         temp   = _mm256_add_epi32(temp, rounding);
621         out[i] = _mm256_srai_epi32(temp, bits);
622     }
623 }
624 
static INLINE void write_buffer_4x8(const __m256i *res, int32_t *output) {
626     _mm256_storeu_si256((__m256i *)(output + 0 * 8), res[0]);
627     _mm256_storeu_si256((__m256i *)(output + 1 * 8), res[1]);
628     _mm256_storeu_si256((__m256i *)(output + 2 * 8), res[2]);
629     _mm256_storeu_si256((__m256i *)(output + 3 * 8), res[3]);
630 }
631 
static INLINE void write_buffer_8x8(const __m256i *res, int32_t *output) {
633     _mm256_storeu_si256((__m256i *)(output + 0 * 8), res[0]);
634     _mm256_storeu_si256((__m256i *)(output + 1 * 8), res[1]);
635     _mm256_storeu_si256((__m256i *)(output + 2 * 8), res[2]);
636     _mm256_storeu_si256((__m256i *)(output + 3 * 8), res[3]);
637 
638     _mm256_storeu_si256((__m256i *)(output + 4 * 8), res[4]);
639     _mm256_storeu_si256((__m256i *)(output + 5 * 8), res[5]);
640     _mm256_storeu_si256((__m256i *)(output + 6 * 8), res[6]);
641     _mm256_storeu_si256((__m256i *)(output + 7 * 8), res[7]);
642 }
643 
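// 8-point forward DCT. Each register holds one transform input per 32-bit
// lane, so a single call transforms 8 columns (or 8 rows after a transpose)
// at once; col_num is the register stride between successive inputs and bit
// selects the cosine table precision and rounding.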
static void fdct8x8_avx2(const __m256i *in, __m256i *out, int8_t bit, const int32_t col_num) {
645     const int32_t *cospi    = cospi_arr(bit);
646     const __m256i  cospi32  = _mm256_set1_epi32(cospi[32]);
647     const __m256i  cospim32 = _mm256_set1_epi32(-cospi[32]);
648     const __m256i  cospi48  = _mm256_set1_epi32(cospi[48]);
649     const __m256i  cospi16  = _mm256_set1_epi32(cospi[16]);
650     const __m256i  cospi56  = _mm256_set1_epi32(cospi[56]);
651     const __m256i  cospi8   = _mm256_set1_epi32(cospi[8]);
652     const __m256i  cospi24  = _mm256_set1_epi32(cospi[24]);
653     const __m256i  cospi40  = _mm256_set1_epi32(cospi[40]);
654     const __m256i  rnding   = _mm256_set1_epi32(1 << (bit - 1));
655     __m256i        u[8], v[8];
656 
657     // stage 0
658     // stage 1
659     u[0] = _mm256_add_epi32(in[0 * col_num], in[7 * col_num]);
660     v[7] = _mm256_sub_epi32(in[0 * col_num], in[7 * col_num]);
661     u[1] = _mm256_add_epi32(in[1 * col_num], in[6 * col_num]);
662     u[6] = _mm256_sub_epi32(in[1 * col_num], in[6 * col_num]);
663     u[2] = _mm256_add_epi32(in[2 * col_num], in[5 * col_num]);
664     u[5] = _mm256_sub_epi32(in[2 * col_num], in[5 * col_num]);
665     u[3] = _mm256_add_epi32(in[3 * col_num], in[4 * col_num]);
666     v[4] = _mm256_sub_epi32(in[3 * col_num], in[4 * col_num]);
667 
668     // stage 2
669     v[0] = _mm256_add_epi32(u[0], u[3]);
670     v[3] = _mm256_sub_epi32(u[0], u[3]);
671     v[1] = _mm256_add_epi32(u[1], u[2]);
672     v[2] = _mm256_sub_epi32(u[1], u[2]);
673 
674     v[5] = _mm256_mullo_epi32(u[5], cospim32);
675     v[6] = _mm256_mullo_epi32(u[6], cospi32);
676     v[5] = _mm256_add_epi32(v[5], v[6]);
677     v[5] = _mm256_add_epi32(v[5], rnding);
678     v[5] = _mm256_srai_epi32(v[5], bit);
679 
680     u[0] = _mm256_mullo_epi32(u[5], cospi32);
681     v[6] = _mm256_mullo_epi32(u[6], cospim32);
682     v[6] = _mm256_sub_epi32(u[0], v[6]);
683     v[6] = _mm256_add_epi32(v[6], rnding);
684     v[6] = _mm256_srai_epi32(v[6], bit);
685 
686     // stage 3
687     // type 0
688     v[0]             = _mm256_mullo_epi32(v[0], cospi32);
689     v[1]             = _mm256_mullo_epi32(v[1], cospi32);
690     u[0]             = _mm256_add_epi32(v[0], v[1]);
691     u[0]             = _mm256_add_epi32(u[0], rnding);
692     out[0 * col_num] = _mm256_srai_epi32(u[0], bit);
693 
694     u[1]             = _mm256_sub_epi32(v[0], v[1]);
695     u[1]             = _mm256_add_epi32(u[1], rnding);
696     out[4 * col_num] = _mm256_srai_epi32(u[1], bit);
697 
698     // type 1
699     v[0]             = _mm256_mullo_epi32(v[2], cospi48);
700     v[1]             = _mm256_mullo_epi32(v[3], cospi16);
701     u[2]             = _mm256_add_epi32(v[0], v[1]);
702     u[2]             = _mm256_add_epi32(u[2], rnding);
703     out[2 * col_num] = _mm256_srai_epi32(u[2], bit);
704 
705     v[0]             = _mm256_mullo_epi32(v[2], cospi16);
706     v[1]             = _mm256_mullo_epi32(v[3], cospi48);
707     u[3]             = _mm256_sub_epi32(v[1], v[0]);
708     u[3]             = _mm256_add_epi32(u[3], rnding);
709     out[6 * col_num] = _mm256_srai_epi32(u[3], bit);
710 
711     u[4] = _mm256_add_epi32(v[4], v[5]);
712     u[5] = _mm256_sub_epi32(v[4], v[5]);
713     u[6] = _mm256_sub_epi32(v[7], v[6]);
714     u[7] = _mm256_add_epi32(v[7], v[6]);
715 
716     // stage 4
717     // stage 5
718     v[0]             = _mm256_mullo_epi32(u[4], cospi56);
719     v[1]             = _mm256_mullo_epi32(u[7], cospi8);
720     v[0]             = _mm256_add_epi32(v[0], v[1]);
721     v[0]             = _mm256_add_epi32(v[0], rnding);
722     out[1 * col_num] = _mm256_srai_epi32(v[0], bit);
723 
724     v[0]             = _mm256_mullo_epi32(u[4], cospi8);
725     v[1]             = _mm256_mullo_epi32(u[7], cospi56);
726     v[0]             = _mm256_sub_epi32(v[1], v[0]);
727     v[0]             = _mm256_add_epi32(v[0], rnding);
728     out[7 * col_num] = _mm256_srai_epi32(v[0], bit);
729 
730     v[0]             = _mm256_mullo_epi32(u[5], cospi24);
731     v[1]             = _mm256_mullo_epi32(u[6], cospi40);
732     v[0]             = _mm256_add_epi32(v[0], v[1]);
733     v[0]             = _mm256_add_epi32(v[0], rnding);
734     out[5 * col_num] = _mm256_srai_epi32(v[0], bit);
735 
736     v[0]             = _mm256_mullo_epi32(u[5], cospi40);
737     v[1]             = _mm256_mullo_epi32(u[6], cospi24);
738     v[0]             = _mm256_sub_epi32(v[1], v[0]);
739     v[0]             = _mm256_add_epi32(v[0], rnding);
740     out[3 * col_num] = _mm256_srai_epi32(v[0], bit);
741 }
742 
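// 8-point forward ADST, same register layout and calling convention as
// fdct8x8_avx2; the permuted store order in stage 7 is the final ADST
// coefficient reordering.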
static void fadst8x8_avx2(const __m256i *in, __m256i *out, int8_t bit, const int32_t col_num) {
744     const int32_t *cospi    = cospi_arr(bit);
745     const __m256i  cospi32  = _mm256_set1_epi32(cospi[32]);
746     const __m256i  cospi16  = _mm256_set1_epi32(cospi[16]);
747     const __m256i  cospim16 = _mm256_set1_epi32(-cospi[16]);
748     const __m256i  cospi48  = _mm256_set1_epi32(cospi[48]);
749     const __m256i  cospim48 = _mm256_set1_epi32(-cospi[48]);
750     const __m256i  cospi4   = _mm256_set1_epi32(cospi[4]);
751     const __m256i  cospim4  = _mm256_set1_epi32(-cospi[4]);
752     const __m256i  cospi60  = _mm256_set1_epi32(cospi[60]);
753     const __m256i  cospi20  = _mm256_set1_epi32(cospi[20]);
754     const __m256i  cospim20 = _mm256_set1_epi32(-cospi[20]);
755     const __m256i  cospi44  = _mm256_set1_epi32(cospi[44]);
756     const __m256i  cospi28  = _mm256_set1_epi32(cospi[28]);
757     const __m256i  cospi36  = _mm256_set1_epi32(cospi[36]);
758     const __m256i  cospim36 = _mm256_set1_epi32(-cospi[36]);
759     const __m256i  cospi52  = _mm256_set1_epi32(cospi[52]);
760     const __m256i  cospim52 = _mm256_set1_epi32(-cospi[52]);
761     const __m256i  cospi12  = _mm256_set1_epi32(cospi[12]);
762     const __m256i  rnding   = _mm256_set1_epi32(1 << (bit - 1));
763     const __m256i  zero     = _mm256_setzero_si256();
764     __m256i        u0, u1, u2, u3, u4, u5, u6, u7;
765     __m256i        v0, v1, v2, v3, v4, v5, v6, v7;
766     __m256i        x, y;
767 
768     u0 = in[0 * col_num];
769     u1 = _mm256_sub_epi32(zero, in[7 * col_num]);
770     u2 = _mm256_sub_epi32(zero, in[3 * col_num]);
771     u3 = in[4 * col_num];
772     u4 = _mm256_sub_epi32(zero, in[1 * col_num]);
773     u5 = in[6 * col_num];
774     u6 = in[2 * col_num];
775     u7 = _mm256_sub_epi32(zero, in[5 * col_num]);
776 
777     // stage 2
778     v0 = u0;
779     v1 = u1;
780 
781     x  = _mm256_mullo_epi32(u2, cospi32);
782     y  = _mm256_mullo_epi32(u3, cospi32);
783     v2 = _mm256_add_epi32(x, y);
784     v2 = _mm256_add_epi32(v2, rnding);
785     v2 = _mm256_srai_epi32(v2, bit);
786 
787     v3 = _mm256_sub_epi32(x, y);
788     v3 = _mm256_add_epi32(v3, rnding);
789     v3 = _mm256_srai_epi32(v3, bit);
790 
791     v4 = u4;
792     v5 = u5;
793 
794     x  = _mm256_mullo_epi32(u6, cospi32);
795     y  = _mm256_mullo_epi32(u7, cospi32);
796     v6 = _mm256_add_epi32(x, y);
797     v6 = _mm256_add_epi32(v6, rnding);
798     v6 = _mm256_srai_epi32(v6, bit);
799 
800     v7 = _mm256_sub_epi32(x, y);
801     v7 = _mm256_add_epi32(v7, rnding);
802     v7 = _mm256_srai_epi32(v7, bit);
803 
804     // stage 3
805     u0 = _mm256_add_epi32(v0, v2);
806     u1 = _mm256_add_epi32(v1, v3);
807     u2 = _mm256_sub_epi32(v0, v2);
808     u3 = _mm256_sub_epi32(v1, v3);
809     u4 = _mm256_add_epi32(v4, v6);
810     u5 = _mm256_add_epi32(v5, v7);
811     u6 = _mm256_sub_epi32(v4, v6);
812     u7 = _mm256_sub_epi32(v5, v7);
813 
814     // stage 4
815     v0 = u0;
816     v1 = u1;
817     v2 = u2;
818     v3 = u3;
819 
820     x  = _mm256_mullo_epi32(u4, cospi16);
821     y  = _mm256_mullo_epi32(u5, cospi48);
822     v4 = _mm256_add_epi32(x, y);
823     v4 = _mm256_add_epi32(v4, rnding);
824     v4 = _mm256_srai_epi32(v4, bit);
825 
826     x  = _mm256_mullo_epi32(u4, cospi48);
827     y  = _mm256_mullo_epi32(u5, cospim16);
828     v5 = _mm256_add_epi32(x, y);
829     v5 = _mm256_add_epi32(v5, rnding);
830     v5 = _mm256_srai_epi32(v5, bit);
831 
832     x  = _mm256_mullo_epi32(u6, cospim48);
833     y  = _mm256_mullo_epi32(u7, cospi16);
834     v6 = _mm256_add_epi32(x, y);
835     v6 = _mm256_add_epi32(v6, rnding);
836     v6 = _mm256_srai_epi32(v6, bit);
837 
838     x  = _mm256_mullo_epi32(u6, cospi16);
839     y  = _mm256_mullo_epi32(u7, cospi48);
840     v7 = _mm256_add_epi32(x, y);
841     v7 = _mm256_add_epi32(v7, rnding);
842     v7 = _mm256_srai_epi32(v7, bit);
843 
844     // stage 5
845     u0 = _mm256_add_epi32(v0, v4);
846     u1 = _mm256_add_epi32(v1, v5);
847     u2 = _mm256_add_epi32(v2, v6);
848     u3 = _mm256_add_epi32(v3, v7);
849     u4 = _mm256_sub_epi32(v0, v4);
850     u5 = _mm256_sub_epi32(v1, v5);
851     u6 = _mm256_sub_epi32(v2, v6);
852     u7 = _mm256_sub_epi32(v3, v7);
853 
854     // stage 6
855     x  = _mm256_mullo_epi32(u0, cospi4);
856     y  = _mm256_mullo_epi32(u1, cospi60);
857     v0 = _mm256_add_epi32(x, y);
858     v0 = _mm256_add_epi32(v0, rnding);
859     v0 = _mm256_srai_epi32(v0, bit);
860 
861     x  = _mm256_mullo_epi32(u0, cospi60);
862     y  = _mm256_mullo_epi32(u1, cospim4);
863     v1 = _mm256_add_epi32(x, y);
864     v1 = _mm256_add_epi32(v1, rnding);
865     v1 = _mm256_srai_epi32(v1, bit);
866 
867     x  = _mm256_mullo_epi32(u2, cospi20);
868     y  = _mm256_mullo_epi32(u3, cospi44);
869     v2 = _mm256_add_epi32(x, y);
870     v2 = _mm256_add_epi32(v2, rnding);
871     v2 = _mm256_srai_epi32(v2, bit);
872 
873     x  = _mm256_mullo_epi32(u2, cospi44);
874     y  = _mm256_mullo_epi32(u3, cospim20);
875     v3 = _mm256_add_epi32(x, y);
876     v3 = _mm256_add_epi32(v3, rnding);
877     v3 = _mm256_srai_epi32(v3, bit);
878 
879     x  = _mm256_mullo_epi32(u4, cospi36);
880     y  = _mm256_mullo_epi32(u5, cospi28);
881     v4 = _mm256_add_epi32(x, y);
882     v4 = _mm256_add_epi32(v4, rnding);
883     v4 = _mm256_srai_epi32(v4, bit);
884 
885     x  = _mm256_mullo_epi32(u4, cospi28);
886     y  = _mm256_mullo_epi32(u5, cospim36);
887     v5 = _mm256_add_epi32(x, y);
888     v5 = _mm256_add_epi32(v5, rnding);
889     v5 = _mm256_srai_epi32(v5, bit);
890 
891     x  = _mm256_mullo_epi32(u6, cospi52);
892     y  = _mm256_mullo_epi32(u7, cospi12);
893     v6 = _mm256_add_epi32(x, y);
894     v6 = _mm256_add_epi32(v6, rnding);
895     v6 = _mm256_srai_epi32(v6, bit);
896 
897     x  = _mm256_mullo_epi32(u6, cospi12);
898     y  = _mm256_mullo_epi32(u7, cospim52);
899     v7 = _mm256_add_epi32(x, y);
900     v7 = _mm256_add_epi32(v7, rnding);
901     v7 = _mm256_srai_epi32(v7, bit);
902 
903     // stage 7
904     out[0 * col_num] = v1;
905     out[1 * col_num] = v6;
906     out[2 * col_num] = v3;
907     out[3 * col_num] = v4;
908     out[4 * col_num] = v5;
909     out[5 * col_num] = v2;
910     out[6 * col_num] = v7;
911     out[7 * col_num] = v0;
912 }
913 
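// 2D 8x8 forward transform: load with the pre-shift, column transform,
// rounding by -shift[1], transpose, row transform, transpose back, store.
// FLIPADST variants are realized by the flip flags passed to load_buffer_8x8;
// the identity-based types skip the transposes they do not need.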
void svt_av1_fwd_txfm2d_8x8_avx2(int16_t *input, int32_t *coeff, uint32_t stride, TxType tx_type,
915                                  uint8_t bd) {
916     __m256i       in[8], out[8];
917     const int8_t *shift   = fwd_txfm_shift_ls[TX_8X8];
918     const int32_t txw_idx = get_txw_idx(TX_8X8);
919     const int32_t txh_idx = get_txh_idx(TX_8X8);
920 
921     switch (tx_type) {
922     case DCT_DCT:
923         load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
924         fdct8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
925         col_txfm_8x8_rounding(out, -shift[1]);
926         transpose_8x8_avx2(out, in);
927         fdct8x8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
928         transpose_8x8_avx2(out, in);
929         write_buffer_8x8(in, coeff);
930         break;
931     case ADST_DCT:
932         load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
933         fadst8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
934         col_txfm_8x8_rounding(out, -shift[1]);
935         transpose_8x8_avx2(out, in);
936         fdct8x8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
937         transpose_8x8_avx2(out, in);
938         write_buffer_8x8(in, coeff);
939         break;
940     case DCT_ADST:
941         load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
942         fdct8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
943         col_txfm_8x8_rounding(out, -shift[1]);
944         transpose_8x8_avx2(out, in);
945         fadst8x8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
946         transpose_8x8_avx2(out, in);
947         write_buffer_8x8(in, coeff);
948         break;
949     case ADST_ADST:
950         load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
951         fadst8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
952         col_txfm_8x8_rounding(out, -shift[1]);
953         transpose_8x8_avx2(out, in);
954         fadst8x8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
955         transpose_8x8_avx2(out, in);
956         write_buffer_8x8(in, coeff);
957         break;
958     case FLIPADST_DCT:
959         load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
960         fadst8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
961         col_txfm_8x8_rounding(out, -shift[1]);
962         transpose_8x8_avx2(out, in);
963         fdct8x8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
964         transpose_8x8_avx2(out, in);
965         write_buffer_8x8(in, coeff);
966         break;
967     case DCT_FLIPADST:
968         load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
969         fdct8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
970         col_txfm_8x8_rounding(out, -shift[1]);
971         transpose_8x8_avx2(out, in);
972         fadst8x8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
973         transpose_8x8_avx2(out, in);
974         write_buffer_8x8(in, coeff);
975         break;
976     case FLIPADST_FLIPADST:
977         load_buffer_8x8(input, in, stride, 1, 1, shift[0]);
978         fadst8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
979         col_txfm_8x8_rounding(out, -shift[1]);
980         transpose_8x8_avx2(out, in);
981         fadst8x8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
982         transpose_8x8_avx2(out, in);
983         write_buffer_8x8(in, coeff);
984         break;
985     case ADST_FLIPADST:
986         load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
987         fadst8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
988         col_txfm_8x8_rounding(out, -shift[1]);
989         transpose_8x8_avx2(out, in);
990         fadst8x8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
991         transpose_8x8_avx2(out, in);
992         write_buffer_8x8(in, coeff);
993         break;
994     case FLIPADST_ADST:
995         load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
996         fadst8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
997         col_txfm_8x8_rounding(out, -shift[1]);
998         transpose_8x8_avx2(out, in);
999         fadst8x8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
1000         transpose_8x8_avx2(out, in);
1001         write_buffer_8x8(in, coeff);
1002         break;
1003     case IDTX:
1004         load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
1005         fidtx8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
1006         col_txfm_8x8_rounding(out, -shift[1]);
1007         fidtx8x8_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
1008         write_buffer_8x8(out, coeff);
1009         break;
1010     case V_DCT:
1011         load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
1012         fdct8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
1013         col_txfm_8x8_rounding(out, -shift[1]);
1014         fidtx8x8_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
1015         write_buffer_8x8(out, coeff);
1016         break;
1017     case H_DCT:
1018         load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
1019         fidtx8x8_avx2(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
1020         col_txfm_8x8_rounding(in, -shift[1]);
1021         transpose_8x8_avx2(in, out);
1022         fdct8x8_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
1023         transpose_8x8_avx2(in, out);
1024         write_buffer_8x8(out, coeff);
1025         break;
1026     case V_ADST:
1027         load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
1028         fadst8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
1029         col_txfm_8x8_rounding(out, -shift[1]);
1030         fidtx8x8_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
1031         write_buffer_8x8(out, coeff);
1032         break;
1033     case H_ADST:
1034         load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
1035         fidtx8x8_avx2(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
1036         col_txfm_8x8_rounding(in, -shift[1]);
1037         transpose_8x8_avx2(in, out);
1038         fadst8x8_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
1039         transpose_8x8_avx2(in, out);
1040         write_buffer_8x8(out, coeff);
1041         break;
1042     case V_FLIPADST:
1043         load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
1044         fadst8x8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
1045         col_txfm_8x8_rounding(out, -shift[1]);
1046         fidtx8x8_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
1047         write_buffer_8x8(out, coeff);
1048         break;
1049     case H_FLIPADST:
1050         load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
1051         fidtx8x8_avx2(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
1052         col_txfm_8x8_rounding(in, -shift[1]);
1053         transpose_8x8_avx2(in, out);
1054         fadst8x8_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
1055         transpose_8x8_avx2(in, out);
1056         write_buffer_8x8(out, coeff);
1057         break;
1058     default: assert(0);
1059     }
1060     (void)bd;
1061 }
1062 
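// Re-interleave four 8x8 register groups (top-left, top-right, bottom-left,
// bottom-right) into the row-major 16x16 layout of two registers per row.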
static INLINE void convert_8x8_to_16x16(const __m256i *in, __m256i *out) {
1064     int32_t row_index = 0;
1065     int32_t dst_index = 0;
1066     int32_t src_index = 0;
1067 
1068     // row 0, 1, .., 7
1069     do {
1070         out[dst_index]     = in[src_index];
1071         out[dst_index + 1] = in[src_index + 8];
1072         dst_index += 2;
1073         src_index += 1;
1074         row_index += 1;
1075     } while (row_index < 8);
1076 
1077     // row 8, 9, ..., 15
1078     src_index += 8;
1079     do {
1080         out[dst_index]     = in[src_index];
1081         out[dst_index + 1] = in[src_index + 8];
1082         dst_index += 2;
1083         src_index += 1;
1084         row_index += 1;
1085     } while (row_index < 16);
1086 }
1087 
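// Load a 16x16 int16 block as four 8x8 quadrants; flips are handled by
// swapping the quadrant pointers and flipping within each 8x8 load, then the
// quadrants are interleaved into the two-registers-per-row 16x16 layout.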
static INLINE void load_buffer_16x16(const int16_t *input, __m256i *out, int32_t stride,
1089                                      int32_t flipud, int32_t fliplr, int32_t shift) {
1090     __m256i in[32];
1091     // Load 4 8x8 blocks
1092     const int16_t *top_l = input;
1093     const int16_t *top_r = input + 8;
1094     const int16_t *bot_l = input + 8 * stride;
1095     const int16_t *bot_r = input + 8 * stride + 8;
1096 
1097     const int16_t *tmp;
1098 
1099     if (flipud) {
1100         // Swap left columns
1101         tmp   = top_l;
1102         top_l = bot_l;
1103         bot_l = tmp;
1104         // Swap right columns
1105         tmp   = top_r;
1106         top_r = bot_r;
1107         bot_r = tmp;
1108     }
1109 
1110     if (fliplr) {
1111         // Swap top rows
1112         tmp   = top_l;
1113         top_l = top_r;
1114         top_r = tmp;
1115         // Swap bottom rows
1116         tmp   = bot_l;
1117         bot_l = bot_r;
1118         bot_r = tmp;
1119     }
1120 
1121     // load first 8 columns
1122     load_buffer_8x8(top_l, &in[0], stride, flipud, fliplr, shift);
1123     load_buffer_8x8(bot_l, &in[16], stride, flipud, fliplr, shift);
1124 
1125     // load second 8 columns
1126     load_buffer_8x8(top_r, &in[8], stride, flipud, fliplr, shift);
1127     load_buffer_8x8(bot_r, &in[24], stride, flipud, fliplr, shift);
1128 
1129     convert_8x8_to_16x16(in, out);
1130 }
1131 
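// N2 loader: loads only the left 8 columns of all 16 rows (the odd,
// right-half registers are left untouched).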
static AOM_FORCE_INLINE void load_buffer_16x16_N2(const int16_t *input, __m256i *out,
1133                                                   int32_t stride, int32_t flipud, int32_t fliplr,
1134                                                   int32_t shift) {
1135     // Load 4 8x8 blocks
1136     const int16_t *top_l = input;
1137     const int16_t *top_r = input + 8;
1138     const int16_t *bot_l = input + 8 * stride;
1139     const int16_t *bot_r = input + 8 * stride + 8;
1140 
1141     const int16_t *tmp;
1142 
1143     if (flipud) {
1144         // Swap left columns
1145         tmp   = top_l;
1146         top_l = bot_l;
1147         bot_l = tmp;
1148         // Swap right columns
1149         tmp   = top_r;
1150         top_r = bot_r;
1151         bot_r = tmp;
1152     }
1153 
1154     if (fliplr) {
1155         // Swap top rows
1156         tmp   = top_l;
1157         top_l = top_r;
1158         top_r = tmp;
1159         // Swap bottom rows
1160         tmp   = bot_l;
1161         bot_l = bot_r;
1162         bot_r = tmp;
1163     }
1164 
1165     // load first 8 columns
1166     load_buffer_8x8_N2(top_l, out, stride, flipud, fliplr, shift);
1167     load_buffer_8x8_N2(bot_l, &out[16], stride, flipud, fliplr, shift);
1168 }
1169 
1170 static AOM_FORCE_INLINE void load_buffer_16x16_N2_H(const int16_t *input, __m256i *out,
1171                                                     int32_t stride, int32_t flipud, int32_t fliplr,
1172                                                     int32_t shift) {
1173     // Load 4 8x8 blocks
1174     const int16_t *top_l = input;
1175     const int16_t *top_r = input + 8;
1176     const int16_t *bot_l = input + 8 * stride;
1177     const int16_t *bot_r = input + 8 * stride + 8;
1178 
1179     const int16_t *tmp;
1180 
1181     if (flipud) {
1182         // Swap left columns
1183         tmp   = top_l;
1184         top_l = bot_l;
1185         bot_l = tmp;
1186         // Swap right columns
1187         tmp   = top_r;
1188         top_r = bot_r;
1189         bot_r = tmp;
1190     }
1191 
1192     if (fliplr) {
1193         // Swap top rows
1194         tmp   = top_l;
1195         top_l = top_r;
1196         top_r = tmp;
1197         // Swap bottom rows
1198         tmp   = bot_l;
1199         bot_l = bot_r;
1200         bot_r = tmp;
1201     }
1202 
1203     // load first 8 columns
1204     load_buffer_8x8_N2(top_l, out, stride, flipud, fliplr, shift);
1205 
1206     // load second 8 columns
1207     load_buffer_8x8_N2(top_r, &out[1], stride, flipud, fliplr, shift);
1208 }
1209 
1210 static AOM_FORCE_INLINE void load_buffer_16x16_N2_half(const int16_t *input, __m256i *out,
1211                                                        int32_t stride, int32_t flipud,
1212                                                        int32_t fliplr, int32_t shift) {
1213     // Load 4 8x8 blocks
1214     const int16_t *top_l = input;
1215     const int16_t *top_r = input + 8;
1216     const int16_t *bot_l = input + 8 * stride;
1217     const int16_t *bot_r = input + 8 * stride + 8;
1218 
1219     const int16_t *tmp;
1220 
1221     if (flipud) {
1222         // Swap left columns
1223         tmp   = top_l;
1224         top_l = bot_l;
1225         bot_l = tmp;
1226         // Swap right columns
1227         tmp   = top_r;
1228         top_r = bot_r;
1229         bot_r = tmp;
1230     }
1231 
1232     if (fliplr) {
1233         // Swap top rows
1234         tmp   = top_l;
1235         top_l = top_r;
1236         top_r = tmp;
1237         // Swap bottom rows
1238         tmp   = bot_l;
1239         bot_l = bot_r;
1240         bot_r = tmp;
1241     }
1242 
1243     // load first 8 columns
1244     load_buffer_8x8_N2(top_l, out, stride, flipud, fliplr, shift);
1245 }
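
// Apply the inter-stage rounding shift to a full 16x16 block (32 registers),
// reusing the 8x8 rounding helper on each group of eight registers.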
1246 static INLINE void col_txfm_16x16_rounding(__m256i *in, int32_t shift) {
1247     col_txfm_8x8_rounding(&in[0], shift);
1248     col_txfm_8x8_rounding(&in[8], shift);
1249     col_txfm_8x8_rounding(&in[16], shift);
1250     col_txfm_8x8_rounding(&in[24], shift);
1251 }
1252 
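// Identity transform for a 16-point dimension: every coefficient is scaled by
// 2 * sqrt(2) in Q12 fixed point (5793 ~= sqrt(2) * 2^12), i.e. roughly
//   out[i] = (in[i] * 2 * 5793 + (1 << 11)) >> 12;
// col_num is the number of registers per row, so 16 * col_num registers are
// processed; the `bit` argument is unused because this scaling is independent
// of the stage shift.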
1253 static void fidtx16x16_avx2(const __m256i *in, __m256i *out, int8_t bit, int32_t col_num) {
1254     (void)bit;
1255     const int32_t bits     = 12; // new_sqrt2_bits = 12
1256     const int32_t sqrt     = 2 * 5793; // 2 * new_sqrt2
1257     const __m256i newsqrt  = _mm256_set1_epi32(sqrt);
1258     const __m256i rounding = _mm256_set1_epi32(1 << (bits - 1));
1259     __m256i       temp;
1260     int32_t       num_iters = 16 * col_num;
1261     for (int32_t i = 0; i < num_iters; i++) {
1262         temp   = _mm256_mullo_epi32(in[i], newsqrt);
1263         temp   = _mm256_add_epi32(temp, rounding);
1264         out[i] = _mm256_srai_epi32(temp, bits);
1265     }
1266 }
1267 
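// Store 32 registers (16 rows of 16 int32 coefficients) to a contiguous 16x16
// output buffer: two 256-bit stores per row, res[2 * r] to output + 16 * r and
// res[2 * r + 1] to output + 16 * r + 8.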
1268 static INLINE void write_buffer_16x16(const __m256i *res, int32_t *output) {
1269     int32_t fact = -1, index = -1;
1270     for (int32_t i = 0; i < 8; i++) {
1271         _mm256_storeu_si256((__m256i *)(output + (++fact) * 16), res[++index]);
1272         _mm256_storeu_si256((__m256i *)(output + (fact)*16 + 8), res[++index]);
1273         _mm256_storeu_si256((__m256i *)(output + (++fact) * 16), res[++index]);
1274         _mm256_storeu_si256((__m256i *)(output + (fact)*16 + 8), res[++index]);
1275     }
1276 }
1277 
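// 4-point forward DCT, row pass. Each 256-bit register carries two 4-wide
// groups, so the initial permute2x128 gathers the working lanes, the 4-point
// butterflies follow the usual pattern
//   type0: out0 = (w*a + w*b + rnd) >> bit,   out1 = (w*a - w*b + rnd) >> bit
//   type1: out0 = (w0*a + w1*b + rnd) >> bit, out1 = (w0*b - w1*a + rnd) >> bit
// and the 4x4 transpose plus final permutes restore the caller's layout.
// A scalar sketch of one butterfly half, as a hypothetical helper for
// illustration only (the AVX2 code keeps the products in 32 bits):
//   static inline int32_t half_btf32(int32_t w0, int32_t a, int32_t w1,
//                                    int32_t b, int32_t bit) {
//       return (int32_t)(((int64_t)w0 * a + (int64_t)w1 * b +
//                         (1 << (bit - 1))) >> bit);
//   }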
1278 static INLINE void fdct4x8_row_avx2(__m256i *input, __m256i *output, int32_t bit,
1279                                     const int32_t num_col) {
1280     const int32_t *cospi   = cospi_arr(bit);
1281     const __m256i  cospi32 = _mm256_set1_epi32(cospi[32]);
1282     const __m256i  cospi48 = _mm256_set1_epi32(cospi[48]);
1283     const __m256i  cospi16 = _mm256_set1_epi32(cospi[16]);
1284     const __m256i  rnding  = _mm256_set1_epi32(1 << (bit - 1));
1285     __m256i        in[4];
1286     __m256i        out[4];
1287     __m256i        s0, s1, s2, s3;
1288     __m256i        u0, u1, u2, u3;
1289     __m256i        v0, v1, v2, v3;
1290     int32_t        endidx = 3 * num_col;
1291 
1292     in[0] = _mm256_permute2x128_si256(input[0], input[2], 0x20);
1293     in[1] = _mm256_permute2x128_si256(input[0], input[2], 0x31);
1294     in[2] = _mm256_permute2x128_si256(input[1], input[3], 0x20);
1295     in[3] = _mm256_permute2x128_si256(input[1], input[3], 0x31);
1296 
1297     s0 = _mm256_add_epi32(in[0], in[endidx]);
1298     s3 = _mm256_sub_epi32(in[0], in[endidx]);
1299     endidx -= num_col;
1300     s1 = _mm256_add_epi32(in[num_col], in[endidx]);
1301     s2 = _mm256_sub_epi32(in[num_col], in[endidx]);
1302 
1303     // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit);
1304     u0 = _mm256_mullo_epi32(s0, cospi32);
1305     u1 = _mm256_mullo_epi32(s1, cospi32);
1306     u2 = _mm256_add_epi32(u0, u1);
1307     v0 = _mm256_sub_epi32(u0, u1);
1308 
1309     u3 = _mm256_add_epi32(u2, rnding);
1310     v1 = _mm256_add_epi32(v0, rnding);
1311 
1312     u0 = _mm256_srai_epi32(u3, bit);
1313     u2 = _mm256_srai_epi32(v1, bit);
1314 
1315     // btf_32_sse4_1_type1(cospi48, cospi16, s[23], u[13], bit);
1316     v0 = _mm256_mullo_epi32(s2, cospi48);
1317     v1 = _mm256_mullo_epi32(s3, cospi16);
1318     v2 = _mm256_add_epi32(v0, v1);
1319 
1320     v3 = _mm256_add_epi32(v2, rnding);
1321     u1 = _mm256_srai_epi32(v3, bit);
1322 
1323     v0 = _mm256_mullo_epi32(s2, cospi16);
1324     v1 = _mm256_mullo_epi32(s3, cospi48);
1325     v2 = _mm256_sub_epi32(v1, v0);
1326 
1327     v3 = _mm256_add_epi32(v2, rnding);
1328     u3 = _mm256_srai_epi32(v3, bit);
1329 
1330     // Note: shift[1] and shift[2] are zeros
1331 
1332     // Transpose 4x4 32-bit
1333     v0 = _mm256_unpacklo_epi32(u0, u1);
1334     v1 = _mm256_unpackhi_epi32(u0, u1);
1335     v2 = _mm256_unpacklo_epi32(u2, u3);
1336     v3 = _mm256_unpackhi_epi32(u2, u3);
1337 
1338     out[0] = _mm256_unpacklo_epi64(v0, v2);
1339     out[1] = _mm256_unpackhi_epi64(v0, v2);
1340     out[2] = _mm256_unpacklo_epi64(v1, v3);
1341     out[3] = _mm256_unpackhi_epi64(v1, v3);
1342 
1343     output[0] = _mm256_permute2x128_si256(out[0], out[1], 0x20);
1344     output[1] = _mm256_permute2x128_si256(out[2], out[3], 0x20);
1345     output[2] = _mm256_permute2x128_si256(out[0], out[1], 0x31);
1346     output[3] = _mm256_permute2x128_si256(out[2], out[3], 0x31);
1347 }
1348 
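// Column-pass counterpart of fdct4x8_row_avx2(): the arithmetic is identical,
// only the initial cross-lane gather of the input is skipped because the
// column data already arrives in the expected lane order.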
1349 static INLINE void fdct4x8_col_avx2(__m256i *in, __m256i *output, int32_t bit,
1350                                     const int32_t num_col) {
1351     const int32_t *cospi   = cospi_arr(bit);
1352     const __m256i  cospi32 = _mm256_set1_epi32(cospi[32]);
1353     const __m256i  cospi48 = _mm256_set1_epi32(cospi[48]);
1354     const __m256i  cospi16 = _mm256_set1_epi32(cospi[16]);
1355     const __m256i  rnding  = _mm256_set1_epi32(1 << (bit - 1));
1356     __m256i        s0, s1, s2, s3;
1357     __m256i        u0, u1, u2, u3;
1358     __m256i        v0, v1, v2, v3;
1359     __m256i        out[4];
1360 
1361     int32_t endidx = 3 * num_col;
1362     s0             = _mm256_add_epi32(in[0], in[endidx]);
1363     s3             = _mm256_sub_epi32(in[0], in[endidx]);
1364     endidx -= num_col;
1365     s1 = _mm256_add_epi32(in[num_col], in[endidx]);
1366     s2 = _mm256_sub_epi32(in[num_col], in[endidx]);
1367 
1368     // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit);
1369     u0 = _mm256_mullo_epi32(s0, cospi32);
1370     u1 = _mm256_mullo_epi32(s1, cospi32);
1371     u2 = _mm256_add_epi32(u0, u1);
1372     v0 = _mm256_sub_epi32(u0, u1);
1373 
1374     u3 = _mm256_add_epi32(u2, rnding);
1375     v1 = _mm256_add_epi32(v0, rnding);
1376 
1377     u0 = _mm256_srai_epi32(u3, bit);
1378     u2 = _mm256_srai_epi32(v1, bit);
1379 
1380     // btf_32_sse4_1_type1(cospi48, cospi16, s[23], u[13], bit);
1381     v0 = _mm256_mullo_epi32(s2, cospi48);
1382     v1 = _mm256_mullo_epi32(s3, cospi16);
1383     v2 = _mm256_add_epi32(v0, v1);
1384 
1385     v3 = _mm256_add_epi32(v2, rnding);
1386     u1 = _mm256_srai_epi32(v3, bit);
1387 
1388     v0 = _mm256_mullo_epi32(s2, cospi16);
1389     v1 = _mm256_mullo_epi32(s3, cospi48);
1390     v2 = _mm256_sub_epi32(v1, v0);
1391 
1392     v3 = _mm256_add_epi32(v2, rnding);
1393     u3 = _mm256_srai_epi32(v3, bit);
1394 
1395     // Note: shift[1] and shift[2] are zeros
1396 
1397     // Transpose 4x4 32-bit
1398     v0 = _mm256_unpacklo_epi32(u0, u1);
1399     v1 = _mm256_unpackhi_epi32(u0, u1);
1400     v2 = _mm256_unpacklo_epi32(u2, u3);
1401     v3 = _mm256_unpackhi_epi32(u2, u3);
1402 
1403     out[0] = _mm256_unpacklo_epi64(v0, v2);
1404     out[1] = _mm256_unpackhi_epi64(v0, v2);
1405     out[2] = _mm256_unpacklo_epi64(v1, v3);
1406     out[3] = _mm256_unpackhi_epi64(v1, v3);
1407 
1408     output[0] = _mm256_permute2x128_si256(out[0], out[1], 0x20);
1409     output[1] = _mm256_permute2x128_si256(out[2], out[3], 0x20);
1410     output[2] = _mm256_permute2x128_si256(out[0], out[1], 0x31);
1411     output[3] = _mm256_permute2x128_si256(out[2], out[3], 0x31);
1412 }
1413 
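// 16-point forward DCT over four columns at a time: the __m256i input is
// re-read as sixteen __m128i rows (4 lanes each) and the staged DCT-16
// butterfly network is evaluated in 128-bit arithmetic, writing the
// coefficients to out[0..15] in natural order.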
1414 static INLINE void fdct16x4_avx2(__m256i *input, __m256i *output, int32_t bit) {
1415     __m128i *in  = (__m128i *)input;
1416     __m128i *out = (__m128i *)output;
1417 
1418     const int32_t *cospi    = cospi_arr(bit);
1419     const __m128i  cospi32  = _mm_set1_epi32(cospi[32]);
1420     const __m128i  cospim32 = _mm_set1_epi32(-cospi[32]);
1421     const __m128i  cospi48  = _mm_set1_epi32(cospi[48]);
1422     const __m128i  cospi16  = _mm_set1_epi32(cospi[16]);
1423     const __m128i  cospim48 = _mm_set1_epi32(-cospi[48]);
1424     const __m128i  cospim16 = _mm_set1_epi32(-cospi[16]);
1425     const __m128i  cospi56  = _mm_set1_epi32(cospi[56]);
1426     const __m128i  cospi8   = _mm_set1_epi32(cospi[8]);
1427     const __m128i  cospi24  = _mm_set1_epi32(cospi[24]);
1428     const __m128i  cospi40  = _mm_set1_epi32(cospi[40]);
1429     const __m128i  cospi60  = _mm_set1_epi32(cospi[60]);
1430     const __m128i  cospi4   = _mm_set1_epi32(cospi[4]);
1431     const __m128i  cospi28  = _mm_set1_epi32(cospi[28]);
1432     const __m128i  cospi36  = _mm_set1_epi32(cospi[36]);
1433     const __m128i  cospi44  = _mm_set1_epi32(cospi[44]);
1434     const __m128i  cospi20  = _mm_set1_epi32(cospi[20]);
1435     const __m128i  cospi12  = _mm_set1_epi32(cospi[12]);
1436     const __m128i  cospi52  = _mm_set1_epi32(cospi[52]);
1437     const __m128i  rnding   = _mm_set1_epi32(1 << (bit - 1));
1438     __m128i        u[16], v[16], x;
1439 
1440     // stage 0
1441     // stage 1
1442     u[0]  = _mm_add_epi32(in[0], in[15]);
1443     v[15] = _mm_sub_epi32(in[0], in[15]);
1444     u[1]  = _mm_add_epi32(in[1], in[14]);
1445     v[14] = _mm_sub_epi32(in[1], in[14]);
1446     u[2]  = _mm_add_epi32(in[2], in[13]);
1447     u[13] = _mm_sub_epi32(in[2], in[13]);
1448     u[3]  = _mm_add_epi32(in[3], in[12]);
1449     u[12] = _mm_sub_epi32(in[3], in[12]);
1450     u[4]  = _mm_add_epi32(in[4], in[11]);
1451     u[11] = _mm_sub_epi32(in[4], in[11]);
1452     u[5]  = _mm_add_epi32(in[5], in[10]);
1453     u[10] = _mm_sub_epi32(in[5], in[10]);
1454     u[6]  = _mm_add_epi32(in[6], in[9]);
1455     v[9]  = _mm_sub_epi32(in[6], in[9]);
1456     u[7]  = _mm_add_epi32(in[7], in[8]);
1457     v[8]  = _mm_sub_epi32(in[7], in[8]);
1458 
1459     // stage 2
1460     v[0] = _mm_add_epi32(u[0], u[7]);
1461     u[7] = _mm_sub_epi32(u[0], u[7]);
1462     v[1] = _mm_add_epi32(u[1], u[6]);
1463     v[6] = _mm_sub_epi32(u[1], u[6]);
1464     v[2] = _mm_add_epi32(u[2], u[5]);
1465     v[5] = _mm_sub_epi32(u[2], u[5]);
1466     v[3] = _mm_add_epi32(u[3], u[4]);
1467     u[4] = _mm_sub_epi32(u[3], u[4]);
1468 
1469     v[10] = _mm_mullo_epi32(u[10], cospim32);
1470     x     = _mm_mullo_epi32(u[13], cospi32);
1471     v[10] = _mm_add_epi32(v[10], x);
1472     v[10] = _mm_add_epi32(v[10], rnding);
1473     v[10] = _mm_srai_epi32(v[10], bit);
1474 
1475     v[13] = _mm_mullo_epi32(u[10], cospi32);
1476     x     = _mm_mullo_epi32(u[13], cospim32);
1477     v[13] = _mm_sub_epi32(v[13], x);
1478     v[13] = _mm_add_epi32(v[13], rnding);
1479     v[13] = _mm_srai_epi32(v[13], bit);
1480 
1481     v[11] = _mm_mullo_epi32(u[11], cospim32);
1482     x     = _mm_mullo_epi32(u[12], cospi32);
1483     v[11] = _mm_add_epi32(v[11], x);
1484     v[11] = _mm_add_epi32(v[11], rnding);
1485     v[11] = _mm_srai_epi32(v[11], bit);
1486 
1487     v[12] = _mm_mullo_epi32(u[11], cospi32);
1488     x     = _mm_mullo_epi32(u[12], cospim32);
1489     v[12] = _mm_sub_epi32(v[12], x);
1490     v[12] = _mm_add_epi32(v[12], rnding);
1491     v[12] = _mm_srai_epi32(v[12], bit);
1492 
1493     // stage 3
1494     u[0] = _mm_add_epi32(v[0], v[3]);
1495     u[3] = _mm_sub_epi32(v[0], v[3]);
1496     u[1] = _mm_add_epi32(v[1], v[2]);
1497     u[2] = _mm_sub_epi32(v[1], v[2]);
1498 
1499     u[5] = _mm_mullo_epi32(v[5], cospim32);
1500     x    = _mm_mullo_epi32(v[6], cospi32);
1501     u[5] = _mm_add_epi32(u[5], x);
1502     u[5] = _mm_add_epi32(u[5], rnding);
1503     u[5] = _mm_srai_epi32(u[5], bit);
1504 
1505     u[6] = _mm_mullo_epi32(v[5], cospi32);
1506     x    = _mm_mullo_epi32(v[6], cospim32);
1507     u[6] = _mm_sub_epi32(u[6], x);
1508     u[6] = _mm_add_epi32(u[6], rnding);
1509     u[6] = _mm_srai_epi32(u[6], bit);
1510 
1511     u[8]  = _mm_add_epi32(v[8], v[11]);
1512     v[11] = _mm_sub_epi32(v[8], v[11]);
1513     u[9]  = _mm_add_epi32(v[9], v[10]);
1514     u[10] = _mm_sub_epi32(v[9], v[10]);
1515     u[12] = _mm_sub_epi32(v[15], v[12]);
1516     v[15] = _mm_add_epi32(v[15], v[12]);
1517     u[13] = _mm_sub_epi32(v[14], v[13]);
1518     u[14] = _mm_add_epi32(v[14], v[13]);
1519 
1520     // stage 4
1521     u[0]   = _mm_mullo_epi32(u[0], cospi32);
1522     u[1]   = _mm_mullo_epi32(u[1], cospi32);
1523     v[0]   = _mm_add_epi32(u[0], u[1]);
1524     v[0]   = _mm_add_epi32(v[0], rnding);
1525     out[0] = _mm_srai_epi32(v[0], bit);
1526 
1527     v[1]   = _mm_sub_epi32(u[0], u[1]);
1528     v[1]   = _mm_add_epi32(v[1], rnding);
1529     out[8] = _mm_srai_epi32(v[1], bit);
1530 
1531     v[2]   = _mm_mullo_epi32(u[2], cospi48);
1532     x      = _mm_mullo_epi32(u[3], cospi16);
1533     v[2]   = _mm_add_epi32(v[2], x);
1534     v[2]   = _mm_add_epi32(v[2], rnding);
1535     out[4] = _mm_srai_epi32(v[2], bit);
1536 
1537     v[3]    = _mm_mullo_epi32(u[2], cospi16);
1538     x       = _mm_mullo_epi32(u[3], cospi48);
1539     v[3]    = _mm_sub_epi32(x, v[3]);
1540     v[3]    = _mm_add_epi32(v[3], rnding);
1541     out[12] = _mm_srai_epi32(v[3], bit);
1542 
1543     v[4] = _mm_add_epi32(u[4], u[5]);
1544     v[5] = _mm_sub_epi32(u[4], u[5]);
1545     v[6] = _mm_sub_epi32(u[7], u[6]);
1546     v[7] = _mm_add_epi32(u[7], u[6]);
1547     v[8] = u[8];
1548 
1549     v[9] = _mm_mullo_epi32(u[9], cospim16);
1550     x    = _mm_mullo_epi32(u[14], cospi48);
1551     v[9] = _mm_add_epi32(v[9], x);
1552     v[9] = _mm_add_epi32(v[9], rnding);
1553     v[9] = _mm_srai_epi32(v[9], bit);
1554 
1555     v[14] = _mm_mullo_epi32(u[9], cospi48);
1556     x     = _mm_mullo_epi32(u[14], cospim16);
1557     v[14] = _mm_sub_epi32(v[14], x);
1558     v[14] = _mm_add_epi32(v[14], rnding);
1559     v[14] = _mm_srai_epi32(v[14], bit);
1560 
1561     v[10] = _mm_mullo_epi32(u[10], cospim48);
1562     x     = _mm_mullo_epi32(u[13], cospim16);
1563     v[10] = _mm_add_epi32(v[10], x);
1564     v[10] = _mm_add_epi32(v[10], rnding);
1565     v[10] = _mm_srai_epi32(v[10], bit);
1566 
1567     v[13] = _mm_mullo_epi32(u[10], cospim16);
1568     x     = _mm_mullo_epi32(u[13], cospim48);
1569     v[13] = _mm_sub_epi32(v[13], x);
1570     v[13] = _mm_add_epi32(v[13], rnding);
1571     v[13] = _mm_srai_epi32(v[13], bit);
1572 
1573     v[12] = u[12];
1574 
1575     // stage 5
1576     u[4]   = _mm_mullo_epi32(v[4], cospi56);
1577     x      = _mm_mullo_epi32(v[7], cospi8);
1578     u[4]   = _mm_add_epi32(u[4], x);
1579     u[4]   = _mm_add_epi32(u[4], rnding);
1580     out[2] = _mm_srai_epi32(u[4], bit);
1581 
1582     u[7]    = _mm_mullo_epi32(v[4], cospi8);
1583     x       = _mm_mullo_epi32(v[7], cospi56);
1584     u[7]    = _mm_sub_epi32(x, u[7]);
1585     u[7]    = _mm_add_epi32(u[7], rnding);
1586     out[14] = _mm_srai_epi32(u[7], bit);
1587 
1588     u[5]    = _mm_mullo_epi32(v[5], cospi24);
1589     x       = _mm_mullo_epi32(v[6], cospi40);
1590     u[5]    = _mm_add_epi32(u[5], x);
1591     u[5]    = _mm_add_epi32(u[5], rnding);
1592     out[10] = _mm_srai_epi32(u[5], bit);
1593 
1594     u[6]   = _mm_mullo_epi32(v[5], cospi40);
1595     x      = _mm_mullo_epi32(v[6], cospi24);
1596     u[6]   = _mm_sub_epi32(x, u[6]);
1597     u[6]   = _mm_add_epi32(u[6], rnding);
1598     out[6] = _mm_srai_epi32(u[6], bit);
1599 
1600     u[8]  = _mm_add_epi32(v[8], v[9]);
1601     u[9]  = _mm_sub_epi32(v[8], v[9]);
1602     u[10] = _mm_sub_epi32(v[11], v[10]);
1603     u[11] = _mm_add_epi32(v[11], v[10]);
1604     u[12] = _mm_add_epi32(v[12], v[13]);
1605     u[13] = _mm_sub_epi32(v[12], v[13]);
1606     u[14] = _mm_sub_epi32(v[15], v[14]);
1607     u[15] = _mm_add_epi32(v[15], v[14]);
1608 
1609     // stage 6
1610     v[8]   = _mm_mullo_epi32(u[8], cospi60);
1611     x      = _mm_mullo_epi32(u[15], cospi4);
1612     v[8]   = _mm_add_epi32(v[8], x);
1613     v[8]   = _mm_add_epi32(v[8], rnding);
1614     out[1] = _mm_srai_epi32(v[8], bit);
1615 
1616     v[15]   = _mm_mullo_epi32(u[8], cospi4);
1617     x       = _mm_mullo_epi32(u[15], cospi60);
1618     v[15]   = _mm_sub_epi32(x, v[15]);
1619     v[15]   = _mm_add_epi32(v[15], rnding);
1620     out[15] = _mm_srai_epi32(v[15], bit);
1621 
1622     v[9]   = _mm_mullo_epi32(u[9], cospi28);
1623     x      = _mm_mullo_epi32(u[14], cospi36);
1624     v[9]   = _mm_add_epi32(v[9], x);
1625     v[9]   = _mm_add_epi32(v[9], rnding);
1626     out[9] = _mm_srai_epi32(v[9], bit);
1627 
1628     v[14]  = _mm_mullo_epi32(u[9], cospi36);
1629     x      = _mm_mullo_epi32(u[14], cospi28);
1630     v[14]  = _mm_sub_epi32(x, v[14]);
1631     v[14]  = _mm_add_epi32(v[14], rnding);
1632     out[7] = _mm_srai_epi32(v[14], bit);
1633 
1634     v[10]  = _mm_mullo_epi32(u[10], cospi44);
1635     x      = _mm_mullo_epi32(u[13], cospi20);
1636     v[10]  = _mm_add_epi32(v[10], x);
1637     v[10]  = _mm_add_epi32(v[10], rnding);
1638     out[5] = _mm_srai_epi32(v[10], bit);
1639 
1640     v[13]   = _mm_mullo_epi32(u[10], cospi20);
1641     x       = _mm_mullo_epi32(u[13], cospi44);
1642     v[13]   = _mm_sub_epi32(x, v[13]);
1643     v[13]   = _mm_add_epi32(v[13], rnding);
1644     out[11] = _mm_srai_epi32(v[13], bit);
1645 
1646     v[11]   = _mm_mullo_epi32(u[11], cospi12);
1647     x       = _mm_mullo_epi32(u[12], cospi52);
1648     v[11]   = _mm_add_epi32(v[11], x);
1649     v[11]   = _mm_add_epi32(v[11], rnding);
1650     out[13] = _mm_srai_epi32(v[11], bit);
1651 
1652     v[12]  = _mm_mullo_epi32(u[11], cospi52);
1653     x      = _mm_mullo_epi32(u[12], cospi12);
1654     v[12]  = _mm_sub_epi32(x, v[12]);
1655     v[12]  = _mm_add_epi32(v[12], rnding);
1656     out[3] = _mm_srai_epi32(v[12], bit);
1657 }
1658 
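// 8-point forward ADST over 4-wide __m128i lanes, one loop iteration per
// column group. Stage 1 applies the ADST input permutation and sign flips,
// stages 2-5 are cospi32/cospi16/cospi48 butterflies, and stage 6 multiplies
// by the odd cospi pairs (4/60, 20/44, 36/28, 52/12) and writes each result
// to its permuted output row.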
1659 static INLINE void fadst8x4_avx2(__m256i *input, __m256i *output, int32_t bit,
1660                                  const int32_t col_num) {
1661     __m128i *      in       = (__m128i *)input;
1662     __m128i *      out      = (__m128i *)output;
1663     const int32_t *cospi    = cospi_arr(bit);
1664     const __m128i  cospi32  = _mm_set1_epi32(cospi[32]);
1665     const __m128i  cospi16  = _mm_set1_epi32(cospi[16]);
1666     const __m128i  cospim16 = _mm_set1_epi32(-cospi[16]);
1667     const __m128i  cospi48  = _mm_set1_epi32(cospi[48]);
1668     const __m128i  cospim48 = _mm_set1_epi32(-cospi[48]);
1669     const __m128i  cospi4   = _mm_set1_epi32(cospi[4]);
1670     const __m128i  cospim4  = _mm_set1_epi32(-cospi[4]);
1671     const __m128i  cospi60  = _mm_set1_epi32(cospi[60]);
1672     const __m128i  cospi20  = _mm_set1_epi32(cospi[20]);
1673     const __m128i  cospim20 = _mm_set1_epi32(-cospi[20]);
1674     const __m128i  cospi44  = _mm_set1_epi32(cospi[44]);
1675     const __m128i  cospi28  = _mm_set1_epi32(cospi[28]);
1676     const __m128i  cospi36  = _mm_set1_epi32(cospi[36]);
1677     const __m128i  cospim36 = _mm_set1_epi32(-cospi[36]);
1678     const __m128i  cospi52  = _mm_set1_epi32(cospi[52]);
1679     const __m128i  cospim52 = _mm_set1_epi32(-cospi[52]);
1680     const __m128i  cospi12  = _mm_set1_epi32(cospi[12]);
1681     const __m128i  rnding   = _mm_set1_epi32(1 << (bit - 1));
1682     const __m128i  zero     = _mm_setzero_si128();
1683     __m128i        u0, u1, u2, u3, u4, u5, u6, u7;
1684     __m128i        v0, v1, v2, v3, v4, v5, v6, v7;
1685     __m128i        x, y;
1686     int32_t        col;
1687 
1688     // Note:
1689     //  Even column: 0, 2, ..., 14
1690     //  Odd column: 1, 3, ..., 15
1691     //  one even column plus one odd column constructs one row (8 coeffs)
1692     //  total we have 8 rows (8x8).
1693     for (col = 0; col < col_num; ++col) {
1694         // stage 0
1695         // stage 1
1696         u0 = in[col_num * 0 + col];
1697         u1 = _mm_sub_epi32(zero, in[col_num * 7 + col]);
1698         u2 = _mm_sub_epi32(zero, in[col_num * 3 + col]);
1699         u3 = in[col_num * 4 + col];
1700         u4 = _mm_sub_epi32(zero, in[col_num * 1 + col]);
1701         u5 = in[col_num * 6 + col];
1702         u6 = in[col_num * 2 + col];
1703         u7 = _mm_sub_epi32(zero, in[col_num * 5 + col]);
1704 
1705         // stage 2
1706         v0 = u0;
1707         v1 = u1;
1708 
1709         x  = _mm_mullo_epi32(u2, cospi32);
1710         y  = _mm_mullo_epi32(u3, cospi32);
1711         v2 = _mm_add_epi32(x, y);
1712         v2 = _mm_add_epi32(v2, rnding);
1713         v2 = _mm_srai_epi32(v2, bit);
1714 
1715         v3 = _mm_sub_epi32(x, y);
1716         v3 = _mm_add_epi32(v3, rnding);
1717         v3 = _mm_srai_epi32(v3, bit);
1718 
1719         v4 = u4;
1720         v5 = u5;
1721 
1722         x  = _mm_mullo_epi32(u6, cospi32);
1723         y  = _mm_mullo_epi32(u7, cospi32);
1724         v6 = _mm_add_epi32(x, y);
1725         v6 = _mm_add_epi32(v6, rnding);
1726         v6 = _mm_srai_epi32(v6, bit);
1727 
1728         v7 = _mm_sub_epi32(x, y);
1729         v7 = _mm_add_epi32(v7, rnding);
1730         v7 = _mm_srai_epi32(v7, bit);
1731 
1732         // stage 3
1733         u0 = _mm_add_epi32(v0, v2);
1734         u1 = _mm_add_epi32(v1, v3);
1735         u2 = _mm_sub_epi32(v0, v2);
1736         u3 = _mm_sub_epi32(v1, v3);
1737         u4 = _mm_add_epi32(v4, v6);
1738         u5 = _mm_add_epi32(v5, v7);
1739         u6 = _mm_sub_epi32(v4, v6);
1740         u7 = _mm_sub_epi32(v5, v7);
1741 
1742         // stage 4
1743         v0 = u0;
1744         v1 = u1;
1745         v2 = u2;
1746         v3 = u3;
1747 
1748         x  = _mm_mullo_epi32(u4, cospi16);
1749         y  = _mm_mullo_epi32(u5, cospi48);
1750         v4 = _mm_add_epi32(x, y);
1751         v4 = _mm_add_epi32(v4, rnding);
1752         v4 = _mm_srai_epi32(v4, bit);
1753 
1754         x  = _mm_mullo_epi32(u4, cospi48);
1755         y  = _mm_mullo_epi32(u5, cospim16);
1756         v5 = _mm_add_epi32(x, y);
1757         v5 = _mm_add_epi32(v5, rnding);
1758         v5 = _mm_srai_epi32(v5, bit);
1759 
1760         x  = _mm_mullo_epi32(u6, cospim48);
1761         y  = _mm_mullo_epi32(u7, cospi16);
1762         v6 = _mm_add_epi32(x, y);
1763         v6 = _mm_add_epi32(v6, rnding);
1764         v6 = _mm_srai_epi32(v6, bit);
1765 
1766         x  = _mm_mullo_epi32(u6, cospi16);
1767         y  = _mm_mullo_epi32(u7, cospi48);
1768         v7 = _mm_add_epi32(x, y);
1769         v7 = _mm_add_epi32(v7, rnding);
1770         v7 = _mm_srai_epi32(v7, bit);
1771 
1772         // stage 5
1773         u0 = _mm_add_epi32(v0, v4);
1774         u1 = _mm_add_epi32(v1, v5);
1775         u2 = _mm_add_epi32(v2, v6);
1776         u3 = _mm_add_epi32(v3, v7);
1777         u4 = _mm_sub_epi32(v0, v4);
1778         u5 = _mm_sub_epi32(v1, v5);
1779         u6 = _mm_sub_epi32(v2, v6);
1780         u7 = _mm_sub_epi32(v3, v7);
1781 
1782         // stage 6
1783         x                      = _mm_mullo_epi32(u0, cospi4);
1784         y                      = _mm_mullo_epi32(u1, cospi60);
1785         v0                     = _mm_add_epi32(x, y);
1786         v0                     = _mm_add_epi32(v0, rnding);
1787         out[col_num * 7 + col] = _mm_srai_epi32(v0, bit);
1788 
1789         x                      = _mm_mullo_epi32(u0, cospi60);
1790         y                      = _mm_mullo_epi32(u1, cospim4);
1791         v1                     = _mm_add_epi32(x, y);
1792         v1                     = _mm_add_epi32(v1, rnding);
1793         out[col_num * 0 + col] = _mm_srai_epi32(v1, bit);
1794 
1795         x                      = _mm_mullo_epi32(u2, cospi20);
1796         y                      = _mm_mullo_epi32(u3, cospi44);
1797         v2                     = _mm_add_epi32(x, y);
1798         v2                     = _mm_add_epi32(v2, rnding);
1799         out[col_num * 5 + col] = _mm_srai_epi32(v2, bit);
1800 
1801         x                      = _mm_mullo_epi32(u2, cospi44);
1802         y                      = _mm_mullo_epi32(u3, cospim20);
1803         v3                     = _mm_add_epi32(x, y);
1804         v3                     = _mm_add_epi32(v3, rnding);
1805         out[col_num * 2 + col] = _mm_srai_epi32(v3, bit);
1806 
1807         x                      = _mm_mullo_epi32(u4, cospi36);
1808         y                      = _mm_mullo_epi32(u5, cospi28);
1809         v4                     = _mm_add_epi32(x, y);
1810         v4                     = _mm_add_epi32(v4, rnding);
1811         out[col_num * 3 + col] = _mm_srai_epi32(v4, bit);
1812 
1813         x                      = _mm_mullo_epi32(u4, cospi28);
1814         y                      = _mm_mullo_epi32(u5, cospim36);
1815         v5                     = _mm_add_epi32(x, y);
1816         v5                     = _mm_add_epi32(v5, rnding);
1817         out[col_num * 4 + col] = _mm_srai_epi32(v5, bit);
1818 
1819         x                      = _mm_mullo_epi32(u6, cospi52);
1820         y                      = _mm_mullo_epi32(u7, cospi12);
1821         v6                     = _mm_add_epi32(x, y);
1822         v6                     = _mm_add_epi32(v6, rnding);
1823         out[col_num * 1 + col] = _mm_srai_epi32(v6, bit);
1824 
1825         x                      = _mm_mullo_epi32(u6, cospi12);
1826         y                      = _mm_mullo_epi32(u7, cospim52);
1827         v7                     = _mm_add_epi32(x, y);
1828         v7                     = _mm_add_epi32(v7, rnding);
1829         out[col_num * 6 + col] = _mm_srai_epi32(v7, bit);
1830     }
1831 }
1832 
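// 16-point forward ADST over four columns (__m128i lanes). half_btf_small()
// computes the usual butterfly half, (w0 * a + w1 * b + rnd) >> bit; the
// tmp[] registers hold negated and reused intermediates so the staged network
// fits in the available registers.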
1833 static INLINE void fadst16x4_avx2(__m256i *input, __m256i *output, int32_t bit) {
1834     __m128i *in  = (__m128i *)input;
1835     __m128i *out = (__m128i *)output;
1836 
1837     const int32_t *cospi    = cospi_arr(bit);
1838     const __m128i  cospi32  = _mm_set1_epi32(cospi[32]);
1839     const __m128i  cospi48  = _mm_set1_epi32(cospi[48]);
1840     const __m128i  cospi16  = _mm_set1_epi32(cospi[16]);
1841     const __m128i  cospim16 = _mm_set1_epi32(-cospi[16]);
1842     const __m128i  cospim48 = _mm_set1_epi32(-cospi[48]);
1843     const __m128i  cospi8   = _mm_set1_epi32(cospi[8]);
1844     const __m128i  cospi56  = _mm_set1_epi32(cospi[56]);
1845     const __m128i  cospim56 = _mm_set1_epi32(-cospi[56]);
1846     const __m128i  cospim8  = _mm_set1_epi32(-cospi[8]);
1847     const __m128i  cospi24  = _mm_set1_epi32(cospi[24]);
1848     const __m128i  cospim24 = _mm_set1_epi32(-cospi[24]);
1849     const __m128i  cospim40 = _mm_set1_epi32(-cospi[40]);
1850     const __m128i  cospi40  = _mm_set1_epi32(cospi[40]);
1851     const __m128i  cospi2   = _mm_set1_epi32(cospi[2]);
1852     const __m128i  cospi62  = _mm_set1_epi32(cospi[62]);
1853     const __m128i  cospim2  = _mm_set1_epi32(-cospi[2]);
1854     const __m128i  cospi10  = _mm_set1_epi32(cospi[10]);
1855     const __m128i  cospi54  = _mm_set1_epi32(cospi[54]);
1856     const __m128i  cospim10 = _mm_set1_epi32(-cospi[10]);
1857     const __m128i  cospi18  = _mm_set1_epi32(cospi[18]);
1858     const __m128i  cospi46  = _mm_set1_epi32(cospi[46]);
1859     const __m128i  cospim18 = _mm_set1_epi32(-cospi[18]);
1860     const __m128i  cospi26  = _mm_set1_epi32(cospi[26]);
1861     const __m128i  cospi38  = _mm_set1_epi32(cospi[38]);
1862     const __m128i  cospim26 = _mm_set1_epi32(-cospi[26]);
1863     const __m128i  cospi34  = _mm_set1_epi32(cospi[34]);
1864     const __m128i  cospi30  = _mm_set1_epi32(cospi[30]);
1865     const __m128i  cospim34 = _mm_set1_epi32(-cospi[34]);
1866     const __m128i  cospi42  = _mm_set1_epi32(cospi[42]);
1867     const __m128i  cospi22  = _mm_set1_epi32(cospi[22]);
1868     const __m128i  cospim42 = _mm_set1_epi32(-cospi[42]);
1869     const __m128i  cospi50  = _mm_set1_epi32(cospi[50]);
1870     const __m128i  cospi14  = _mm_set1_epi32(cospi[14]);
1871     const __m128i  cospim50 = _mm_set1_epi32(-cospi[50]);
1872     const __m128i  cospi58  = _mm_set1_epi32(cospi[58]);
1873     const __m128i  cospi6   = _mm_set1_epi32(cospi[6]);
1874     const __m128i  cospim58 = _mm_set1_epi32(-cospi[58]);
1875     const __m128i  rnding   = _mm_set1_epi32(1 << (bit - 1));
1876     const __m128i  zero     = _mm_setzero_si128();
1877 
1878     __m128i u[16], v[16], x, y;
1879     __m128i tmp[13];
1880 
1881     tmp[0] = _mm_sub_epi32(zero, in[15]);
1882     u[2]   = _mm_sub_epi32(zero, in[7]);
1883     tmp[1] = _mm_sub_epi32(zero, in[3]);
1884     u[7]   = _mm_sub_epi32(zero, in[11]);
1885     tmp[2] = _mm_sub_epi32(zero, in[1]);
1886     u[11]  = _mm_sub_epi32(zero, in[9]);
1887     tmp[3] = _mm_sub_epi32(zero, in[13]);
1888     u[14]  = _mm_sub_epi32(zero, in[5]);
1889 
1890     // stage 2
1891 
1892     x    = _mm_mullo_epi32(u[2], cospi32);
1893     y    = _mm_mullo_epi32(in[8], cospi32);
1894     v[2] = _mm_add_epi32(x, y);
1895     v[2] = _mm_add_epi32(v[2], rnding);
1896     v[2] = _mm_srai_epi32(v[2], bit);
1897 
1898     v[3] = _mm_sub_epi32(x, y);
1899     v[3] = _mm_add_epi32(v[3], rnding);
1900     v[3] = _mm_srai_epi32(v[3], bit);
1901 
1902     x    = _mm_mullo_epi32(in[4], cospi32);
1903     y    = _mm_mullo_epi32(u[7], cospi32);
1904     v[6] = _mm_add_epi32(x, y);
1905     v[6] = _mm_add_epi32(v[6], rnding);
1906     v[6] = _mm_srai_epi32(v[6], bit);
1907 
1908     v[7] = _mm_sub_epi32(x, y);
1909     v[7] = _mm_add_epi32(v[7], rnding);
1910     v[7] = _mm_srai_epi32(v[7], bit);
1911 
1912     x     = _mm_mullo_epi32(in[6], cospi32);
1913     y     = _mm_mullo_epi32(u[11], cospi32);
1914     v[10] = _mm_add_epi32(x, y);
1915     v[10] = _mm_add_epi32(v[10], rnding);
1916     v[10] = _mm_srai_epi32(v[10], bit);
1917 
1918     v[11] = _mm_sub_epi32(x, y);
1919     v[11] = _mm_add_epi32(v[11], rnding);
1920     v[11] = _mm_srai_epi32(v[11], bit);
1921 
1922     x     = _mm_mullo_epi32(u[14], cospi32);
1923     y     = _mm_mullo_epi32(in[10], cospi32);
1924     v[14] = _mm_add_epi32(x, y);
1925     v[14] = _mm_add_epi32(v[14], rnding);
1926     v[14] = _mm_srai_epi32(v[14], bit);
1927 
1928     v[15] = _mm_sub_epi32(x, y);
1929     v[15] = _mm_add_epi32(v[15], rnding);
1930     v[15] = _mm_srai_epi32(v[15], bit);
1931 
1932     // stage 3
1933     tmp[4] = _mm_add_epi32(in[0], v[2]);
1934     tmp[5] = _mm_add_epi32(tmp[0], v[3]);
1935     tmp[6] = _mm_sub_epi32(in[0], v[2]);
1936     tmp[0] = _mm_sub_epi32(tmp[0], v[3]);
1937     u[4]   = _mm_add_epi32(tmp[1], v[6]);
1938     u[5]   = _mm_add_epi32(in[12], v[7]);
1939     u[6]   = _mm_sub_epi32(tmp[1], v[6]);
1940     u[7]   = _mm_sub_epi32(in[12], v[7]);
1941     tmp[1] = _mm_add_epi32(tmp[2], v[10]);
1942     tmp[7] = _mm_add_epi32(in[14], v[11]);
1943     tmp[2] = _mm_sub_epi32(tmp[2], v[10]);
1944     tmp[8] = _mm_sub_epi32(in[14], v[11]);
1945     u[12]  = _mm_add_epi32(in[2], v[14]);
1946     u[13]  = _mm_add_epi32(tmp[3], v[15]);
1947     u[14]  = _mm_sub_epi32(in[2], v[14]);
1948     u[15]  = _mm_sub_epi32(tmp[3], v[15]);
1949 
1950     // stage 4
1951     v[4]  = half_btf_small(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit);
1952     v[5]  = half_btf_small(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit);
1953     v[6]  = half_btf_small(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit);
1954     v[7]  = half_btf_small(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit);
1955     v[12] = half_btf_small(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit);
1956     v[13] = half_btf_small(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit);
1957     v[14] = half_btf_small(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit);
1958     v[15] = half_btf_small(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit);
1959 
1960     // stage 5
1961     tmp[9]  = _mm_add_epi32(tmp[4], v[4]);
1962     tmp[10] = _mm_add_epi32(tmp[5], v[5]);
1963     tmp[11] = _mm_add_epi32(tmp[6], v[6]);
1964     tmp[12] = _mm_add_epi32(tmp[0], v[7]);
1965     tmp[4]  = _mm_sub_epi32(tmp[4], v[4]);
1966     tmp[5]  = _mm_sub_epi32(tmp[5], v[5]);
1967     tmp[6]  = _mm_sub_epi32(tmp[6], v[6]);
1968     tmp[0]  = _mm_sub_epi32(tmp[0], v[7]);
1969     u[8]    = _mm_add_epi32(tmp[1], v[12]);
1970     u[9]    = _mm_add_epi32(tmp[7], v[13]);
1971     u[10]   = _mm_add_epi32(tmp[2], v[14]);
1972     u[11]   = _mm_add_epi32(tmp[8], v[15]);
1973     u[12]   = _mm_sub_epi32(tmp[1], v[12]);
1974     u[13]   = _mm_sub_epi32(tmp[7], v[13]);
1975     u[14]   = _mm_sub_epi32(tmp[2], v[14]);
1976     u[15]   = _mm_sub_epi32(tmp[8], v[15]);
1977 
1978     // stage 6
1979     v[8]  = half_btf_small(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit);
1980     v[9]  = half_btf_small(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit);
1981     v[10] = half_btf_small(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit);
1982     v[11] = half_btf_small(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit);
1983     v[12] = half_btf_small(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit);
1984     v[13] = half_btf_small(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit);
1985     v[14] = half_btf_small(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit);
1986     v[15] = half_btf_small(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit);
1987 
1988     // stage 7
1989     u[0]  = _mm_add_epi32(tmp[9], v[8]);
1990     u[1]  = _mm_add_epi32(tmp[10], v[9]);
1991     u[2]  = _mm_add_epi32(tmp[11], v[10]);
1992     u[3]  = _mm_add_epi32(tmp[12], v[11]);
1993     u[4]  = _mm_add_epi32(tmp[4], v[12]);
1994     u[5]  = _mm_add_epi32(tmp[5], v[13]);
1995     u[6]  = _mm_add_epi32(tmp[6], v[14]);
1996     u[7]  = _mm_add_epi32(tmp[0], v[15]);
1997     u[8]  = _mm_sub_epi32(tmp[9], v[8]);
1998     u[9]  = _mm_sub_epi32(tmp[10], v[9]);
1999     u[10] = _mm_sub_epi32(tmp[11], v[10]);
2000     u[11] = _mm_sub_epi32(tmp[12], v[11]);
2001     u[12] = _mm_sub_epi32(tmp[4], v[12]);
2002     u[13] = _mm_sub_epi32(tmp[5], v[13]);
2003     u[14] = _mm_sub_epi32(tmp[6], v[14]);
2004     u[15] = _mm_sub_epi32(tmp[0], v[15]);
2005 
2006     // stage 8
2007     out[15] = half_btf_small(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit);
2008     out[0]  = half_btf_small(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit);
2009     out[13] = half_btf_small(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit);
2010     out[2]  = half_btf_small(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit);
2011     out[11] = half_btf_small(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit);
2012     out[4]  = half_btf_small(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit);
2013     out[9]  = half_btf_small(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit);
2014     out[6]  = half_btf_small(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit);
2015     out[7]  = half_btf_small(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit);
2016     out[8]  = half_btf_small(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit);
2017     out[5]  = half_btf_small(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit);
2018     out[10] = half_btf_small(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit);
2019     out[3]  = half_btf_small(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit);
2020     out[12] = half_btf_small(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit);
2021     out[1]  = half_btf_small(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit);
2022     out[14] = half_btf_small(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit);
2023 }
2024 
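// Full 16-point forward DCT along one dimension of a 16x16 block. col_num is
// the number of __m256i registers per row (16 columns / 8 lanes = 2 for the
// 16x16 case), so the outer loop runs the same butterfly network once per
// register column.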
2025 static void fdct16x16_avx2(const __m256i *in, __m256i *out, int8_t bit, const int32_t col_num) {
2026     const int32_t *cospi    = cospi_arr(bit);
2027     const __m256i  cospi32  = _mm256_set1_epi32(cospi[32]);
2028     const __m256i  cospim32 = _mm256_set1_epi32(-cospi[32]);
2029     const __m256i  cospi48  = _mm256_set1_epi32(cospi[48]);
2030     const __m256i  cospi16  = _mm256_set1_epi32(cospi[16]);
2031     const __m256i  cospim48 = _mm256_set1_epi32(-cospi[48]);
2032     const __m256i  cospim16 = _mm256_set1_epi32(-cospi[16]);
2033     const __m256i  cospi56  = _mm256_set1_epi32(cospi[56]);
2034     const __m256i  cospi8   = _mm256_set1_epi32(cospi[8]);
2035     const __m256i  cospi24  = _mm256_set1_epi32(cospi[24]);
2036     const __m256i  cospi40  = _mm256_set1_epi32(cospi[40]);
2037     const __m256i  cospi60  = _mm256_set1_epi32(cospi[60]);
2038     const __m256i  cospi4   = _mm256_set1_epi32(cospi[4]);
2039     const __m256i  cospi28  = _mm256_set1_epi32(cospi[28]);
2040     const __m256i  cospi36  = _mm256_set1_epi32(cospi[36]);
2041     const __m256i  cospi44  = _mm256_set1_epi32(cospi[44]);
2042     const __m256i  cospi20  = _mm256_set1_epi32(cospi[20]);
2043     const __m256i  cospi12  = _mm256_set1_epi32(cospi[12]);
2044     const __m256i  cospi52  = _mm256_set1_epi32(cospi[52]);
2045     const __m256i  rnding   = _mm256_set1_epi32(1 << (bit - 1));
2046     __m256i        u[16], v[16], x;
2047     int32_t        col;
2048 
2049     for (col = 0; col < col_num; ++col) {
2050         // stage 0
2051         // stage 1
2052         u[0]  = _mm256_add_epi32(in[0 * col_num + col], in[15 * col_num + col]);
2053         u[15] = _mm256_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]);
2054         u[1]  = _mm256_add_epi32(in[1 * col_num + col], in[14 * col_num + col]);
2055         u[14] = _mm256_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]);
2056         u[2]  = _mm256_add_epi32(in[2 * col_num + col], in[13 * col_num + col]);
2057         u[13] = _mm256_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]);
2058         u[3]  = _mm256_add_epi32(in[3 * col_num + col], in[12 * col_num + col]);
2059         u[12] = _mm256_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]);
2060         u[4]  = _mm256_add_epi32(in[4 * col_num + col], in[11 * col_num + col]);
2061         u[11] = _mm256_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]);
2062         u[5]  = _mm256_add_epi32(in[5 * col_num + col], in[10 * col_num + col]);
2063         u[10] = _mm256_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]);
2064         u[6]  = _mm256_add_epi32(in[6 * col_num + col], in[9 * col_num + col]);
2065         u[9]  = _mm256_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]);
2066         u[7]  = _mm256_add_epi32(in[7 * col_num + col], in[8 * col_num + col]);
2067         u[8]  = _mm256_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]);
2068 
2069         // stage 2
2070         v[0] = _mm256_add_epi32(u[0], u[7]);
2071         v[7] = _mm256_sub_epi32(u[0], u[7]);
2072         v[1] = _mm256_add_epi32(u[1], u[6]);
2073         v[6] = _mm256_sub_epi32(u[1], u[6]);
2074         v[2] = _mm256_add_epi32(u[2], u[5]);
2075         v[5] = _mm256_sub_epi32(u[2], u[5]);
2076         v[3] = _mm256_add_epi32(u[3], u[4]);
2077         v[4] = _mm256_sub_epi32(u[3], u[4]);
2078 
2079         v[10] = _mm256_mullo_epi32(u[10], cospim32);
2080         x     = _mm256_mullo_epi32(u[13], cospi32);
2081         v[10] = _mm256_add_epi32(v[10], x);
2082         v[10] = _mm256_add_epi32(v[10], rnding);
2083         v[10] = _mm256_srai_epi32(v[10], bit);
2084 
2085         v[13] = _mm256_mullo_epi32(u[10], cospi32);
2086         x     = _mm256_mullo_epi32(u[13], cospim32);
2087         v[13] = _mm256_sub_epi32(v[13], x);
2088         v[13] = _mm256_add_epi32(v[13], rnding);
2089         v[13] = _mm256_srai_epi32(v[13], bit);
2090 
2091         v[11] = _mm256_mullo_epi32(u[11], cospim32);
2092         x     = _mm256_mullo_epi32(u[12], cospi32);
2093         v[11] = _mm256_add_epi32(v[11], x);
2094         v[11] = _mm256_add_epi32(v[11], rnding);
2095         v[11] = _mm256_srai_epi32(v[11], bit);
2096 
2097         v[12] = _mm256_mullo_epi32(u[11], cospi32);
2098         x     = _mm256_mullo_epi32(u[12], cospim32);
2099         v[12] = _mm256_sub_epi32(v[12], x);
2100         v[12] = _mm256_add_epi32(v[12], rnding);
2101         v[12] = _mm256_srai_epi32(v[12], bit);
2102 
2103         // stage 3
2104         u[0] = _mm256_add_epi32(v[0], v[3]);
2105         u[3] = _mm256_sub_epi32(v[0], v[3]);
2106         u[1] = _mm256_add_epi32(v[1], v[2]);
2107         u[2] = _mm256_sub_epi32(v[1], v[2]);
2108 
2109         u[5] = _mm256_mullo_epi32(v[5], cospim32);
2110         x    = _mm256_mullo_epi32(v[6], cospi32);
2111         u[5] = _mm256_add_epi32(u[5], x);
2112         u[5] = _mm256_add_epi32(u[5], rnding);
2113         u[5] = _mm256_srai_epi32(u[5], bit);
2114 
2115         u[6] = _mm256_mullo_epi32(v[5], cospi32);
2116         x    = _mm256_mullo_epi32(v[6], cospim32);
2117         u[6] = _mm256_sub_epi32(u[6], x);
2118         u[6] = _mm256_add_epi32(u[6], rnding);
2119         u[6] = _mm256_srai_epi32(u[6], bit);
2120 
2121         u[11] = _mm256_sub_epi32(u[8], v[11]);
2122         u[8]  = _mm256_add_epi32(u[8], v[11]);
2123         u[10] = _mm256_sub_epi32(u[9], v[10]);
2124         u[9]  = _mm256_add_epi32(u[9], v[10]);
2125         u[12] = _mm256_sub_epi32(u[15], v[12]);
2126         u[15] = _mm256_add_epi32(u[15], v[12]);
2127         u[13] = _mm256_sub_epi32(u[14], v[13]);
2128         u[14] = _mm256_add_epi32(u[14], v[13]);
2129 
2130         // stage 4
2131         u[0]                   = _mm256_mullo_epi32(u[0], cospi32);
2132         u[1]                   = _mm256_mullo_epi32(u[1], cospi32);
2133         v[0]                   = _mm256_add_epi32(u[0], u[1]);
2134         v[0]                   = _mm256_add_epi32(v[0], rnding);
2135         out[0 * col_num + col] = _mm256_srai_epi32(v[0], bit);
2136 
2137         v[1]                   = _mm256_sub_epi32(u[0], u[1]);
2138         v[1]                   = _mm256_add_epi32(v[1], rnding);
2139         out[8 * col_num + col] = _mm256_srai_epi32(v[1], bit);
2140 
2141         v[2]                   = _mm256_mullo_epi32(u[2], cospi48);
2142         x                      = _mm256_mullo_epi32(u[3], cospi16);
2143         v[2]                   = _mm256_add_epi32(v[2], x);
2144         v[2]                   = _mm256_add_epi32(v[2], rnding);
2145         out[4 * col_num + col] = _mm256_srai_epi32(v[2], bit);
2146 
2147         v[3]                    = _mm256_mullo_epi32(u[2], cospi16);
2148         x                       = _mm256_mullo_epi32(u[3], cospi48);
2149         v[3]                    = _mm256_sub_epi32(x, v[3]);
2150         v[3]                    = _mm256_add_epi32(v[3], rnding);
2151         out[12 * col_num + col] = _mm256_srai_epi32(v[3], bit);
2152 
2153         v[5] = _mm256_sub_epi32(v[4], u[5]);
2154         v[4] = _mm256_add_epi32(v[4], u[5]);
2155         v[6] = _mm256_sub_epi32(v[7], u[6]);
2156         v[7] = _mm256_add_epi32(v[7], u[6]);
2157 
2158         v[9] = _mm256_mullo_epi32(u[9], cospim16);
2159         x    = _mm256_mullo_epi32(u[14], cospi48);
2160         v[9] = _mm256_add_epi32(v[9], x);
2161         v[9] = _mm256_add_epi32(v[9], rnding);
2162         v[9] = _mm256_srai_epi32(v[9], bit);
2163 
2164         v[14] = _mm256_mullo_epi32(u[9], cospi48);
2165         x     = _mm256_mullo_epi32(u[14], cospim16);
2166         v[14] = _mm256_sub_epi32(v[14], x);
2167         v[14] = _mm256_add_epi32(v[14], rnding);
2168         v[14] = _mm256_srai_epi32(v[14], bit);
2169 
2170         v[10] = _mm256_mullo_epi32(u[10], cospim48);
2171         x     = _mm256_mullo_epi32(u[13], cospim16);
2172         v[10] = _mm256_add_epi32(v[10], x);
2173         v[10] = _mm256_add_epi32(v[10], rnding);
2174         v[10] = _mm256_srai_epi32(v[10], bit);
2175 
2176         v[13] = _mm256_mullo_epi32(u[10], cospim16);
2177         x     = _mm256_mullo_epi32(u[13], cospim48);
2178         v[13] = _mm256_sub_epi32(v[13], x);
2179         v[13] = _mm256_add_epi32(v[13], rnding);
2180         v[13] = _mm256_srai_epi32(v[13], bit);
2181 
2182         // stage 5
2183         u[4]                   = _mm256_mullo_epi32(v[4], cospi56);
2184         x                      = _mm256_mullo_epi32(v[7], cospi8);
2185         u[4]                   = _mm256_add_epi32(u[4], x);
2186         u[4]                   = _mm256_add_epi32(u[4], rnding);
2187         out[2 * col_num + col] = _mm256_srai_epi32(u[4], bit);
2188 
2189         u[7]                    = _mm256_mullo_epi32(v[4], cospi8);
2190         x                       = _mm256_mullo_epi32(v[7], cospi56);
2191         u[7]                    = _mm256_sub_epi32(x, u[7]);
2192         u[7]                    = _mm256_add_epi32(u[7], rnding);
2193         out[14 * col_num + col] = _mm256_srai_epi32(u[7], bit);
2194 
2195         u[5]                    = _mm256_mullo_epi32(v[5], cospi24);
2196         x                       = _mm256_mullo_epi32(v[6], cospi40);
2197         u[5]                    = _mm256_add_epi32(u[5], x);
2198         u[5]                    = _mm256_add_epi32(u[5], rnding);
2199         out[10 * col_num + col] = _mm256_srai_epi32(u[5], bit);
2200 
2201         u[6]                   = _mm256_mullo_epi32(v[5], cospi40);
2202         x                      = _mm256_mullo_epi32(v[6], cospi24);
2203         u[6]                   = _mm256_sub_epi32(x, u[6]);
2204         u[6]                   = _mm256_add_epi32(u[6], rnding);
2205         out[6 * col_num + col] = _mm256_srai_epi32(u[6], bit);
2206 
2207         u[9]  = _mm256_sub_epi32(u[8], v[9]);
2208         u[8]  = _mm256_add_epi32(u[8], v[9]);
2209         u[10] = _mm256_sub_epi32(u[11], v[10]);
2210         u[11] = _mm256_add_epi32(u[11], v[10]);
2211         u[13] = _mm256_sub_epi32(u[12], v[13]);
2212         u[12] = _mm256_add_epi32(u[12], v[13]);
2213         u[14] = _mm256_sub_epi32(u[15], v[14]);
2214         u[15] = _mm256_add_epi32(u[15], v[14]);
2215 
2216         // stage 6
2217         v[8]                   = _mm256_mullo_epi32(u[8], cospi60);
2218         x                      = _mm256_mullo_epi32(u[15], cospi4);
2219         v[8]                   = _mm256_add_epi32(v[8], x);
2220         v[8]                   = _mm256_add_epi32(v[8], rnding);
2221         out[1 * col_num + col] = _mm256_srai_epi32(v[8], bit);
2222 
2223         v[15]                   = _mm256_mullo_epi32(u[8], cospi4);
2224         x                       = _mm256_mullo_epi32(u[15], cospi60);
2225         v[15]                   = _mm256_sub_epi32(x, v[15]);
2226         v[15]                   = _mm256_add_epi32(v[15], rnding);
2227         out[15 * col_num + col] = _mm256_srai_epi32(v[15], bit);
2228 
2229         v[9]                   = _mm256_mullo_epi32(u[9], cospi28);
2230         x                      = _mm256_mullo_epi32(u[14], cospi36);
2231         v[9]                   = _mm256_add_epi32(v[9], x);
2232         v[9]                   = _mm256_add_epi32(v[9], rnding);
2233         out[9 * col_num + col] = _mm256_srai_epi32(v[9], bit);
2234 
2235         v[14]                  = _mm256_mullo_epi32(u[9], cospi36);
2236         x                      = _mm256_mullo_epi32(u[14], cospi28);
2237         v[14]                  = _mm256_sub_epi32(x, v[14]);
2238         v[14]                  = _mm256_add_epi32(v[14], rnding);
2239         out[7 * col_num + col] = _mm256_srai_epi32(v[14], bit);
2240 
2241         v[10]                  = _mm256_mullo_epi32(u[10], cospi44);
2242         x                      = _mm256_mullo_epi32(u[13], cospi20);
2243         v[10]                  = _mm256_add_epi32(v[10], x);
2244         v[10]                  = _mm256_add_epi32(v[10], rnding);
2245         out[5 * col_num + col] = _mm256_srai_epi32(v[10], bit);
2246 
2247         v[13]                   = _mm256_mullo_epi32(u[10], cospi20);
2248         x                       = _mm256_mullo_epi32(u[13], cospi44);
2249         v[13]                   = _mm256_sub_epi32(x, v[13]);
2250         v[13]                   = _mm256_add_epi32(v[13], rnding);
2251         out[11 * col_num + col] = _mm256_srai_epi32(v[13], bit);
2252 
2253         v[11]                   = _mm256_mullo_epi32(u[11], cospi12);
2254         x                       = _mm256_mullo_epi32(u[12], cospi52);
2255         v[11]                   = _mm256_add_epi32(v[11], x);
2256         v[11]                   = _mm256_add_epi32(v[11], rnding);
2257         out[13 * col_num + col] = _mm256_srai_epi32(v[11], bit);
2258 
2259         v[12]                  = _mm256_mullo_epi32(u[11], cospi52);
2260         x                      = _mm256_mullo_epi32(u[12], cospi12);
2261         v[12]                  = _mm256_sub_epi32(x, v[12]);
2262         v[12]                  = _mm256_add_epi32(v[12], rnding);
2263         out[3 * col_num + col] = _mm256_srai_epi32(v[12], bit);
2264     }
2265 }
2266 
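// 4-point forward ADST, row pass, computed directly in the sinpi(k * pi / 9)
// basis returned by sinpi_arr(). Algebraically (using the identity
// sinpi1 + sinpi2 == sinpi4) the four outputs, before rounding and the
// >> bit shift, reduce to:
//   y0 = sinpi1*in0 + sinpi2*in1 + sinpi3*in2 + sinpi4*in3
//   y1 = sinpi3*(in0 + in1 - in3)
//   y2 = sinpi4*in0 - sinpi1*in1 - sinpi3*in2 + sinpi2*in3
//   y3 = sinpi2*in0 - sinpi4*in1 + sinpi3*in2 - sinpi1*in3
// The permute/unpack sequences at both ends only move data between the
// caller's layout and the 4-wide working layout.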
2267 static INLINE void fadst4x8_row_avx2(__m256i *input, __m256i *output, int32_t bit,
2268                                      const int32_t num_col) {
2269     const int32_t *sinpi  = sinpi_arr(bit);
2270     const __m256i  rnding = _mm256_set1_epi32(1 << (bit - 1));
2271     const __m256i  sinpi1 = _mm256_set1_epi32((int32_t)sinpi[1]);
2272     const __m256i  sinpi2 = _mm256_set1_epi32((int32_t)sinpi[2]);
2273     const __m256i  sinpi3 = _mm256_set1_epi32((int32_t)sinpi[3]);
2274     const __m256i  sinpi4 = _mm256_set1_epi32((int32_t)sinpi[4]);
2275     __m256i        t;
2276     __m256i        s0, s1, s2, s3, s4, s5, s6, s7;
2277     __m256i        x0, x1, x2, x3;
2278     __m256i        u0, u1, u2, u3;
2279     __m256i        v0, v1, v2, v3;
2280     __m256i        in[4];
2281     __m256i        out[4];
2282 
2283     in[0] = _mm256_permute2x128_si256(input[0], input[2], 0x20);
2284     in[1] = _mm256_permute2x128_si256(input[0], input[2], 0x31);
2285     in[2] = _mm256_permute2x128_si256(input[1], input[3], 0x20);
2286     in[3] = _mm256_permute2x128_si256(input[1], input[3], 0x31);
2287 
2288     int32_t idx = 0 * num_col;
2289     s0          = _mm256_mullo_epi32(in[idx], sinpi1);
2290     s1          = _mm256_mullo_epi32(in[idx], sinpi4);
2291     t           = _mm256_add_epi32(in[idx], in[idx + num_col]);
2292     idx += num_col;
2293     s2 = _mm256_mullo_epi32(in[idx], sinpi2);
2294     s3 = _mm256_mullo_epi32(in[idx], sinpi1);
2295     idx += num_col;
2296     s4 = _mm256_mullo_epi32(in[idx], sinpi3);
2297     idx += num_col;
2298     s5 = _mm256_mullo_epi32(in[idx], sinpi4);
2299     s6 = _mm256_mullo_epi32(in[idx], sinpi2);
2300     s7 = _mm256_sub_epi32(t, in[idx]);
2301 
2302     t  = _mm256_add_epi32(s0, s2);
2303     x0 = _mm256_add_epi32(t, s5);
2304     x1 = _mm256_mullo_epi32(s7, sinpi3);
2305     t  = _mm256_sub_epi32(s1, s3);
2306     x2 = _mm256_add_epi32(t, s6);
2307     x3 = s4;
2308 
2309     s0 = _mm256_add_epi32(x0, x3);
2310     s1 = x1;
2311     s2 = _mm256_sub_epi32(x2, x3);
2312     t  = _mm256_sub_epi32(x2, x0);
2313     s3 = _mm256_add_epi32(t, x3);
2314 
2315     u0 = _mm256_add_epi32(s0, rnding);
2316     u0 = _mm256_srai_epi32(u0, bit);
2317 
2318     u1 = _mm256_add_epi32(s1, rnding);
2319     u1 = _mm256_srai_epi32(u1, bit);
2320 
2321     u2 = _mm256_add_epi32(s2, rnding);
2322     u2 = _mm256_srai_epi32(u2, bit);
2323 
2324     u3 = _mm256_add_epi32(s3, rnding);
2325     u3 = _mm256_srai_epi32(u3, bit);
2326 
2327     v0 = _mm256_unpacklo_epi32(u0, u1);
2328     v1 = _mm256_unpackhi_epi32(u0, u1);
2329     v2 = _mm256_unpacklo_epi32(u2, u3);
2330     v3 = _mm256_unpackhi_epi32(u2, u3);
2331 
2332     out[0] = _mm256_unpacklo_epi64(v0, v2);
2333     out[1] = _mm256_unpackhi_epi64(v0, v2);
2334     out[2] = _mm256_unpacklo_epi64(v1, v3);
2335     out[3] = _mm256_unpackhi_epi64(v1, v3);
2336 
2337     output[0] = _mm256_permute2x128_si256(out[0], out[1], 0x20);
2338     output[1] = _mm256_permute2x128_si256(out[2], out[3], 0x20);
2339     output[2] = _mm256_permute2x128_si256(out[0], out[1], 0x31);
2340     output[3] = _mm256_permute2x128_si256(out[2], out[3], 0x31);
2341 }
2342 
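// Column-pass counterpart of fadst4x8_row_avx2(): same arithmetic, with the
// initial cross-lane gather of the input omitted.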
2343 static INLINE void fadst4x8_col_avx2(__m256i *in, __m256i *output, int32_t bit,
2344                                      const int32_t num_col) {
2345     const int32_t *sinpi  = sinpi_arr(bit);
2346     const __m256i  rnding = _mm256_set1_epi32(1 << (bit - 1));
2347     const __m256i  sinpi1 = _mm256_set1_epi32((int32_t)sinpi[1]);
2348     const __m256i  sinpi2 = _mm256_set1_epi32((int32_t)sinpi[2]);
2349     const __m256i  sinpi3 = _mm256_set1_epi32((int32_t)sinpi[3]);
2350     const __m256i  sinpi4 = _mm256_set1_epi32((int32_t)sinpi[4]);
2351     __m256i        t;
2352     __m256i        s0, s1, s2, s3, s4, s5, s6, s7;
2353     __m256i        x0, x1, x2, x3;
2354     __m256i        u0, u1, u2, u3;
2355     __m256i        v0, v1, v2, v3;
2356     __m256i        out[4];
2357 
2358     int32_t idx = 0 * num_col;
2359     s0          = _mm256_mullo_epi32(in[idx], sinpi1);
2360     s1          = _mm256_mullo_epi32(in[idx], sinpi4);
2361     t           = _mm256_add_epi32(in[idx], in[idx + num_col]);
2362     idx += num_col;
2363     s2 = _mm256_mullo_epi32(in[idx], sinpi2);
2364     s3 = _mm256_mullo_epi32(in[idx], sinpi1);
2365     idx += num_col;
2366     s4 = _mm256_mullo_epi32(in[idx], sinpi3);
2367     idx += num_col;
2368     s5 = _mm256_mullo_epi32(in[idx], sinpi4);
2369     s6 = _mm256_mullo_epi32(in[idx], sinpi2);
2370     s7 = _mm256_sub_epi32(t, in[idx]);
2371 
2372     t  = _mm256_add_epi32(s0, s2);
2373     x0 = _mm256_add_epi32(t, s5);
2374     x1 = _mm256_mullo_epi32(s7, sinpi3);
2375     t  = _mm256_sub_epi32(s1, s3);
2376     x2 = _mm256_add_epi32(t, s6);
2377     x3 = s4;
2378 
2379     s0 = _mm256_add_epi32(x0, x3);
2380     s1 = x1;
2381     s2 = _mm256_sub_epi32(x2, x3);
2382     t  = _mm256_sub_epi32(x2, x0);
2383     s3 = _mm256_add_epi32(t, x3);
2384 
2385     u0 = _mm256_add_epi32(s0, rnding);
2386     u0 = _mm256_srai_epi32(u0, bit);
2387 
2388     u1 = _mm256_add_epi32(s1, rnding);
2389     u1 = _mm256_srai_epi32(u1, bit);
2390 
2391     u2 = _mm256_add_epi32(s2, rnding);
2392     u2 = _mm256_srai_epi32(u2, bit);
2393 
2394     u3 = _mm256_add_epi32(s3, rnding);
2395     u3 = _mm256_srai_epi32(u3, bit);
2396 
2397     v0 = _mm256_unpacklo_epi32(u0, u1);
2398     v1 = _mm256_unpackhi_epi32(u0, u1);
2399     v2 = _mm256_unpacklo_epi32(u2, u3);
2400     v3 = _mm256_unpackhi_epi32(u2, u3);
2401 
2402     out[0] = _mm256_unpacklo_epi64(v0, v2);
2403     out[1] = _mm256_unpackhi_epi64(v0, v2);
2404     out[2] = _mm256_unpacklo_epi64(v1, v3);
2405     out[3] = _mm256_unpackhi_epi64(v1, v3);
2406 
2407     output[0] = _mm256_permute2x128_si256(out[0], out[1], 0x20);
2408     output[1] = _mm256_permute2x128_si256(out[2], out[3], 0x20);
2409     output[2] = _mm256_permute2x128_si256(out[0], out[1], 0x31);
2410     output[3] = _mm256_permute2x128_si256(out[2], out[3], 0x31);
2411 }
2412 
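/*
 * 8-point forward DCT on a 4-column tile: the __m256i buffers are viewed as
 * __m128i vectors (four 32-bit lanes each) and rows 0..7 are read from
 * in[0..7]. Each rotation adds the 1 << (bit - 1) rounding constant before
 * the arithmetic right shift by `bit`.
 */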
2413 static INLINE void fdct4x8_avx2(__m256i *input, __m256i *output, int32_t bit) {
2414     __m128i *      in       = (__m128i *)input;
2415     __m128i *      out      = (__m128i *)output;
2416     const int32_t *cospi    = cospi_arr(bit);
2417     const __m128i  cospi32  = _mm_set1_epi32(cospi[32]);
2418     const __m128i  cospim32 = _mm_set1_epi32(-cospi[32]);
2419     const __m128i  cospi48  = _mm_set1_epi32(cospi[48]);
2420     const __m128i  cospi16  = _mm_set1_epi32(cospi[16]);
2421     const __m128i  cospi56  = _mm_set1_epi32(cospi[56]);
2422     const __m128i  cospi8   = _mm_set1_epi32(cospi[8]);
2423     const __m128i  cospi24  = _mm_set1_epi32(cospi[24]);
2424     const __m128i  cospi40  = _mm_set1_epi32(cospi[40]);
2425     const __m128i  rnding   = _mm_set1_epi32(1 << (bit - 1));
2426     __m128i        u[8], v[8];
2427 
2428     // 8-point DCT
2429     // stage 0
2430     // stage 1
2431     u[0] = _mm_add_epi32(in[0], in[7]);
2432     v[7] = _mm_sub_epi32(in[0], in[7]); // v[7]
2433     u[1] = _mm_add_epi32(in[1], in[6]);
2434     u[6] = _mm_sub_epi32(in[1], in[6]);
2435     u[2] = _mm_add_epi32(in[2], in[5]);
2436     u[5] = _mm_sub_epi32(in[2], in[5]);
2437     u[3] = _mm_add_epi32(in[3], in[4]);
2438     v[4] = _mm_sub_epi32(in[3], in[4]); // v[4]
2439 
2440     // stage 2
2441     v[0] = _mm_add_epi32(u[0], u[3]);
2442     v[3] = _mm_sub_epi32(u[0], u[3]);
2443     v[1] = _mm_add_epi32(u[1], u[2]);
2444     v[2] = _mm_sub_epi32(u[1], u[2]);
2445 
2446     v[5] = _mm_mullo_epi32(u[5], cospim32);
2447     v[6] = _mm_mullo_epi32(u[6], cospi32);
2448     v[5] = _mm_add_epi32(v[5], v[6]);
2449     v[5] = _mm_add_epi32(v[5], rnding);
2450     v[5] = _mm_srai_epi32(v[5], bit);
2451 
2452     u[0] = _mm_mullo_epi32(u[5], cospi32);
2453     v[6] = _mm_mullo_epi32(u[6], cospim32);
2454     v[6] = _mm_sub_epi32(u[0], v[6]);
2455     v[6] = _mm_add_epi32(v[6], rnding);
2456     v[6] = _mm_srai_epi32(v[6], bit);
2457 
2458     // stage 3
2459     // type 0
2460     v[0]   = _mm_mullo_epi32(v[0], cospi32);
2461     v[1]   = _mm_mullo_epi32(v[1], cospi32);
2462     u[0]   = _mm_add_epi32(v[0], v[1]);
2463     u[0]   = _mm_add_epi32(u[0], rnding);
2464     out[0] = _mm_srai_epi32(u[0], bit);
2465 
2466     u[1]   = _mm_sub_epi32(v[0], v[1]);
2467     u[1]   = _mm_add_epi32(u[1], rnding);
2468     out[4] = _mm_srai_epi32(u[1], bit);
2469 
2470     // type 1
2471     v[0]   = _mm_mullo_epi32(v[2], cospi48);
2472     v[1]   = _mm_mullo_epi32(v[3], cospi16);
2473     u[2]   = _mm_add_epi32(v[0], v[1]);
2474     u[2]   = _mm_add_epi32(u[2], rnding);
2475     out[2] = _mm_srai_epi32(u[2], bit);
2476 
2477     v[0]   = _mm_mullo_epi32(v[2], cospi16);
2478     v[1]   = _mm_mullo_epi32(v[3], cospi48);
2479     u[3]   = _mm_sub_epi32(v[1], v[0]);
2480     u[3]   = _mm_add_epi32(u[3], rnding);
2481     out[6] = _mm_srai_epi32(u[3], bit);
2482 
2483     u[4] = _mm_add_epi32(v[4], v[5]);
2484     u[5] = _mm_sub_epi32(v[4], v[5]);
2485     u[6] = _mm_sub_epi32(v[7], v[6]);
2486     u[7] = _mm_add_epi32(v[7], v[6]);
2487 
2488     // stage 4
2489     // stage 5
2490     v[0]   = _mm_mullo_epi32(u[4], cospi56);
2491     v[1]   = _mm_mullo_epi32(u[7], cospi8);
2492     v[0]   = _mm_add_epi32(v[0], v[1]);
2493     v[0]   = _mm_add_epi32(v[0], rnding);
2494     out[1] = _mm_srai_epi32(v[0], bit); // buf0[4]
2495 
2496     v[0]   = _mm_mullo_epi32(u[4], cospi8);
2497     v[1]   = _mm_mullo_epi32(u[7], cospi56);
2498     v[0]   = _mm_sub_epi32(v[1], v[0]);
2499     v[0]   = _mm_add_epi32(v[0], rnding);
2500     out[7] = _mm_srai_epi32(v[0], bit); // buf0[7]
2501 
2502     v[0]   = _mm_mullo_epi32(u[5], cospi24);
2503     v[1]   = _mm_mullo_epi32(u[6], cospi40);
2504     v[0]   = _mm_add_epi32(v[0], v[1]);
2505     v[0]   = _mm_add_epi32(v[0], rnding);
2506     out[5] = _mm_srai_epi32(v[0], bit); // buf0[5]
2507 
2508     v[0]   = _mm_mullo_epi32(u[5], cospi40);
2509     v[1]   = _mm_mullo_epi32(u[6], cospi24);
2510     v[0]   = _mm_sub_epi32(v[1], v[0]);
2511     v[0]   = _mm_add_epi32(v[0], rnding);
2512     out[3] = _mm_srai_epi32(v[0], bit); // buf0[6]
2513 }
2514 
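/*
 * 16-point forward ADST over groups of eight 32-bit columns. `col_num` is the
 * number of such register groups per transform row; each iteration walks the
 * usual ADST16 stage sequence: sign flips, cospi[32] rotations, half
 * butterflies via half_btf_avx2, and the final alternating output
 * permutation.
 */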
2515 static void fadst16x16_avx2(const __m256i *in, __m256i *out, int8_t bit, const int32_t col_num) {
2516     const int32_t *cospi    = cospi_arr(bit);
2517     const __m256i  cospi32  = _mm256_set1_epi32(cospi[32]);
2518     const __m256i  cospi48  = _mm256_set1_epi32(cospi[48]);
2519     const __m256i  cospi16  = _mm256_set1_epi32(cospi[16]);
2520     const __m256i  cospim16 = _mm256_set1_epi32(-cospi[16]);
2521     const __m256i  cospim48 = _mm256_set1_epi32(-cospi[48]);
2522     const __m256i  cospi8   = _mm256_set1_epi32(cospi[8]);
2523     const __m256i  cospi56  = _mm256_set1_epi32(cospi[56]);
2524     const __m256i  cospim56 = _mm256_set1_epi32(-cospi[56]);
2525     const __m256i  cospim8  = _mm256_set1_epi32(-cospi[8]);
2526     const __m256i  cospi24  = _mm256_set1_epi32(cospi[24]);
2527     const __m256i  cospim24 = _mm256_set1_epi32(-cospi[24]);
2528     const __m256i  cospim40 = _mm256_set1_epi32(-cospi[40]);
2529     const __m256i  cospi40  = _mm256_set1_epi32(cospi[40]);
2530     const __m256i  cospi2   = _mm256_set1_epi32(cospi[2]);
2531     const __m256i  cospi62  = _mm256_set1_epi32(cospi[62]);
2532     const __m256i  cospim2  = _mm256_set1_epi32(-cospi[2]);
2533     const __m256i  cospi10  = _mm256_set1_epi32(cospi[10]);
2534     const __m256i  cospi54  = _mm256_set1_epi32(cospi[54]);
2535     const __m256i  cospim10 = _mm256_set1_epi32(-cospi[10]);
2536     const __m256i  cospi18  = _mm256_set1_epi32(cospi[18]);
2537     const __m256i  cospi46  = _mm256_set1_epi32(cospi[46]);
2538     const __m256i  cospim18 = _mm256_set1_epi32(-cospi[18]);
2539     const __m256i  cospi26  = _mm256_set1_epi32(cospi[26]);
2540     const __m256i  cospi38  = _mm256_set1_epi32(cospi[38]);
2541     const __m256i  cospim26 = _mm256_set1_epi32(-cospi[26]);
2542     const __m256i  cospi34  = _mm256_set1_epi32(cospi[34]);
2543     const __m256i  cospi30  = _mm256_set1_epi32(cospi[30]);
2544     const __m256i  cospim34 = _mm256_set1_epi32(-cospi[34]);
2545     const __m256i  cospi42  = _mm256_set1_epi32(cospi[42]);
2546     const __m256i  cospi22  = _mm256_set1_epi32(cospi[22]);
2547     const __m256i  cospim42 = _mm256_set1_epi32(-cospi[42]);
2548     const __m256i  cospi50  = _mm256_set1_epi32(cospi[50]);
2549     const __m256i  cospi14  = _mm256_set1_epi32(cospi[14]);
2550     const __m256i  cospim50 = _mm256_set1_epi32(-cospi[50]);
2551     const __m256i  cospi58  = _mm256_set1_epi32(cospi[58]);
2552     const __m256i  cospi6   = _mm256_set1_epi32(cospi[6]);
2553     const __m256i  cospim58 = _mm256_set1_epi32(-cospi[58]);
2554     const __m256i  rnding   = _mm256_set1_epi32(1 << (bit - 1));
2555     const __m256i  zero     = _mm256_setzero_si256();
2556 
2557     __m256i u[16], v[16], x, y;
2558     int32_t col;
2559 
2560     for (col = 0; col < col_num; ++col) {
2561         // stage 0
2562         // stage 1
2563         u[0]  = in[0 * col_num + col];
2564         u[1]  = _mm256_sub_epi32(zero, in[15 * col_num + col]);
2565         u[2]  = _mm256_sub_epi32(zero, in[7 * col_num + col]);
2566         u[3]  = in[8 * col_num + col];
2567         u[4]  = _mm256_sub_epi32(zero, in[3 * col_num + col]);
2568         u[5]  = in[12 * col_num + col];
2569         u[6]  = in[4 * col_num + col];
2570         u[7]  = _mm256_sub_epi32(zero, in[11 * col_num + col]);
2571         u[8]  = _mm256_sub_epi32(zero, in[1 * col_num + col]);
2572         u[9]  = in[14 * col_num + col];
2573         u[10] = in[6 * col_num + col];
2574         u[11] = _mm256_sub_epi32(zero, in[9 * col_num + col]);
2575         u[12] = in[2 * col_num + col];
2576         u[13] = _mm256_sub_epi32(zero, in[13 * col_num + col]);
2577         u[14] = _mm256_sub_epi32(zero, in[5 * col_num + col]);
2578         u[15] = in[10 * col_num + col];
2579 
2580         // stage 2
2581         v[0] = u[0];
2582         v[1] = u[1];
2583 
2584         x    = _mm256_mullo_epi32(u[2], cospi32);
2585         y    = _mm256_mullo_epi32(u[3], cospi32);
2586         v[2] = _mm256_add_epi32(x, y);
2587         v[2] = _mm256_add_epi32(v[2], rnding);
2588         v[2] = _mm256_srai_epi32(v[2], bit);
2589 
2590         v[3] = _mm256_sub_epi32(x, y);
2591         v[3] = _mm256_add_epi32(v[3], rnding);
2592         v[3] = _mm256_srai_epi32(v[3], bit);
2593 
2594         v[4] = u[4];
2595         v[5] = u[5];
2596 
2597         x    = _mm256_mullo_epi32(u[6], cospi32);
2598         y    = _mm256_mullo_epi32(u[7], cospi32);
2599         v[6] = _mm256_add_epi32(x, y);
2600         v[6] = _mm256_add_epi32(v[6], rnding);
2601         v[6] = _mm256_srai_epi32(v[6], bit);
2602 
2603         v[7] = _mm256_sub_epi32(x, y);
2604         v[7] = _mm256_add_epi32(v[7], rnding);
2605         v[7] = _mm256_srai_epi32(v[7], bit);
2606 
2607         v[8] = u[8];
2608         v[9] = u[9];
2609 
2610         x     = _mm256_mullo_epi32(u[10], cospi32);
2611         y     = _mm256_mullo_epi32(u[11], cospi32);
2612         v[10] = _mm256_add_epi32(x, y);
2613         v[10] = _mm256_add_epi32(v[10], rnding);
2614         v[10] = _mm256_srai_epi32(v[10], bit);
2615 
2616         v[11] = _mm256_sub_epi32(x, y);
2617         v[11] = _mm256_add_epi32(v[11], rnding);
2618         v[11] = _mm256_srai_epi32(v[11], bit);
2619 
2620         v[12] = u[12];
2621         v[13] = u[13];
2622 
2623         x     = _mm256_mullo_epi32(u[14], cospi32);
2624         y     = _mm256_mullo_epi32(u[15], cospi32);
2625         v[14] = _mm256_add_epi32(x, y);
2626         v[14] = _mm256_add_epi32(v[14], rnding);
2627         v[14] = _mm256_srai_epi32(v[14], bit);
2628 
2629         v[15] = _mm256_sub_epi32(x, y);
2630         v[15] = _mm256_add_epi32(v[15], rnding);
2631         v[15] = _mm256_srai_epi32(v[15], bit);
2632 
2633         // stage 3
2634         u[0]  = _mm256_add_epi32(v[0], v[2]);
2635         u[1]  = _mm256_add_epi32(v[1], v[3]);
2636         u[2]  = _mm256_sub_epi32(v[0], v[2]);
2637         u[3]  = _mm256_sub_epi32(v[1], v[3]);
2638         u[4]  = _mm256_add_epi32(v[4], v[6]);
2639         u[5]  = _mm256_add_epi32(v[5], v[7]);
2640         u[6]  = _mm256_sub_epi32(v[4], v[6]);
2641         u[7]  = _mm256_sub_epi32(v[5], v[7]);
2642         u[8]  = _mm256_add_epi32(v[8], v[10]);
2643         u[9]  = _mm256_add_epi32(v[9], v[11]);
2644         u[10] = _mm256_sub_epi32(v[8], v[10]);
2645         u[11] = _mm256_sub_epi32(v[9], v[11]);
2646         u[12] = _mm256_add_epi32(v[12], v[14]);
2647         u[13] = _mm256_add_epi32(v[13], v[15]);
2648         u[14] = _mm256_sub_epi32(v[12], v[14]);
2649         u[15] = _mm256_sub_epi32(v[13], v[15]);
2650 
2651         // stage 4
2652         v[0]  = u[0];
2653         v[1]  = u[1];
2654         v[2]  = u[2];
2655         v[3]  = u[3];
2656         v[4]  = half_btf_avx2(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit);
2657         v[5]  = half_btf_avx2(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit);
2658         v[6]  = half_btf_avx2(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit);
2659         v[7]  = half_btf_avx2(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit);
2660         v[8]  = u[8];
2661         v[9]  = u[9];
2662         v[10] = u[10];
2663         v[11] = u[11];
2664         v[12] = half_btf_avx2(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit);
2665         v[13] = half_btf_avx2(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit);
2666         v[14] = half_btf_avx2(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit);
2667         v[15] = half_btf_avx2(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit);
2668 
2669         // stage 5
2670         u[0]  = _mm256_add_epi32(v[0], v[4]);
2671         u[1]  = _mm256_add_epi32(v[1], v[5]);
2672         u[2]  = _mm256_add_epi32(v[2], v[6]);
2673         u[3]  = _mm256_add_epi32(v[3], v[7]);
2674         u[4]  = _mm256_sub_epi32(v[0], v[4]);
2675         u[5]  = _mm256_sub_epi32(v[1], v[5]);
2676         u[6]  = _mm256_sub_epi32(v[2], v[6]);
2677         u[7]  = _mm256_sub_epi32(v[3], v[7]);
2678         u[8]  = _mm256_add_epi32(v[8], v[12]);
2679         u[9]  = _mm256_add_epi32(v[9], v[13]);
2680         u[10] = _mm256_add_epi32(v[10], v[14]);
2681         u[11] = _mm256_add_epi32(v[11], v[15]);
2682         u[12] = _mm256_sub_epi32(v[8], v[12]);
2683         u[13] = _mm256_sub_epi32(v[9], v[13]);
2684         u[14] = _mm256_sub_epi32(v[10], v[14]);
2685         u[15] = _mm256_sub_epi32(v[11], v[15]);
2686 
2687         // stage 6
2688         v[0]  = u[0];
2689         v[1]  = u[1];
2690         v[2]  = u[2];
2691         v[3]  = u[3];
2692         v[4]  = u[4];
2693         v[5]  = u[5];
2694         v[6]  = u[6];
2695         v[7]  = u[7];
2696         v[8]  = half_btf_avx2(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit);
2697         v[9]  = half_btf_avx2(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit);
2698         v[10] = half_btf_avx2(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit);
2699         v[11] = half_btf_avx2(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit);
2700         v[12] = half_btf_avx2(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit);
2701         v[13] = half_btf_avx2(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit);
2702         v[14] = half_btf_avx2(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit);
2703         v[15] = half_btf_avx2(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit);
2704 
2705         // stage 7
2706         u[0]  = _mm256_add_epi32(v[0], v[8]);
2707         u[1]  = _mm256_add_epi32(v[1], v[9]);
2708         u[2]  = _mm256_add_epi32(v[2], v[10]);
2709         u[3]  = _mm256_add_epi32(v[3], v[11]);
2710         u[4]  = _mm256_add_epi32(v[4], v[12]);
2711         u[5]  = _mm256_add_epi32(v[5], v[13]);
2712         u[6]  = _mm256_add_epi32(v[6], v[14]);
2713         u[7]  = _mm256_add_epi32(v[7], v[15]);
2714         u[8]  = _mm256_sub_epi32(v[0], v[8]);
2715         u[9]  = _mm256_sub_epi32(v[1], v[9]);
2716         u[10] = _mm256_sub_epi32(v[2], v[10]);
2717         u[11] = _mm256_sub_epi32(v[3], v[11]);
2718         u[12] = _mm256_sub_epi32(v[4], v[12]);
2719         u[13] = _mm256_sub_epi32(v[5], v[13]);
2720         u[14] = _mm256_sub_epi32(v[6], v[14]);
2721         u[15] = _mm256_sub_epi32(v[7], v[15]);
2722 
2723         // stage 8
2724         v[0]  = half_btf_avx2(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit);
2725         v[1]  = half_btf_avx2(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit);
2726         v[2]  = half_btf_avx2(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit);
2727         v[3]  = half_btf_avx2(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit);
2728         v[4]  = half_btf_avx2(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit);
2729         v[5]  = half_btf_avx2(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit);
2730         v[6]  = half_btf_avx2(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit);
2731         v[7]  = half_btf_avx2(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit);
2732         v[8]  = half_btf_avx2(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit);
2733         v[9]  = half_btf_avx2(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit);
2734         v[10] = half_btf_avx2(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit);
2735         v[11] = half_btf_avx2(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit);
2736         v[12] = half_btf_avx2(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit);
2737         v[13] = half_btf_avx2(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit);
2738         v[14] = half_btf_avx2(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit);
2739         v[15] = half_btf_avx2(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit);
2740 
2741         // stage 9
2742         out[0 * col_num + col]  = v[1];
2743         out[1 * col_num + col]  = v[14];
2744         out[2 * col_num + col]  = v[3];
2745         out[3 * col_num + col]  = v[12];
2746         out[4 * col_num + col]  = v[5];
2747         out[5 * col_num + col]  = v[10];
2748         out[6 * col_num + col]  = v[7];
2749         out[7 * col_num + col]  = v[8];
2750         out[8 * col_num + col]  = v[9];
2751         out[9 * col_num + col]  = v[6];
2752         out[10 * col_num + col] = v[11];
2753         out[11 * col_num + col] = v[4];
2754         out[12 * col_num + col] = v[13];
2755         out[13 * col_num + col] = v[2];
2756         out[14 * col_num + col] = v[15];
2757         out[15 * col_num + col] = v[0];
2758     }
2759 }
2760 
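/*
 * 2-D 16x16 forward transform entry point. Every case follows the same
 * pattern: load (with vertical/horizontal flip for the FLIPADST variants) and
 * pre-shift by shift[0], column transform, intermediate rounding by shift[1],
 * transpose, row transform, transpose back, store. IDTX and the V_* hybrids
 * skip the transposes because their row pass is the identity transform.
 */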
2761 void svt_av1_fwd_txfm2d_16x16_avx2(int16_t *input, int32_t *coeff, uint32_t stride, TxType tx_type,
2762                                    uint8_t bd) {
2763     __m256i       in[32], out[32];
2764     const int8_t *shift   = fwd_txfm_shift_ls[TX_16X16];
2765     const int32_t txw_idx = get_txw_idx(TX_16X16);
2766     const int32_t txh_idx = get_txh_idx(TX_16X16);
2767     const int32_t col_num = 2;
2768     switch (tx_type) {
2769     case IDTX:
2770         load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
2771         fidtx16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
2772         col_txfm_16x16_rounding(out, -shift[1]);
2773         fidtx16x16_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
2774         write_buffer_16x16(out, coeff);
2775         break;
2776     case DCT_DCT:
2777         load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
2778         fdct16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
2779         col_txfm_16x16_rounding(out, -shift[1]);
2780         transpose_16x16_avx2(out, in);
2781         fdct16x16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
2782         transpose_16x16_avx2(out, in);
2783         write_buffer_16x16(in, coeff);
2784         break;
2785     case ADST_DCT:
2786         load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
2787         fadst16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
2788         col_txfm_16x16_rounding(out, -shift[1]);
2789         transpose_16x16_avx2(out, in);
2790         fdct16x16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
2791         transpose_16x16_avx2(out, in);
2792         write_buffer_16x16(in, coeff);
2793         break;
2794     case DCT_ADST:
2795         load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
2796         fdct16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
2797         col_txfm_16x16_rounding(out, -shift[1]);
2798         transpose_16x16_avx2(out, in);
2799         fadst16x16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
2800         transpose_16x16_avx2(out, in);
2801         write_buffer_16x16(in, coeff);
2802         break;
2803     case ADST_ADST:
2804         load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
2805         fadst16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
2806         col_txfm_16x16_rounding(out, -shift[1]);
2807         transpose_16x16_avx2(out, in);
2808         fadst16x16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
2809         transpose_16x16_avx2(out, in);
2810         write_buffer_16x16(in, coeff);
2811         break;
2812     case DCT_FLIPADST:
2813         load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
2814         fdct16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
2815         col_txfm_16x16_rounding(out, -shift[1]);
2816         transpose_16x16_avx2(out, in);
2817         fadst16x16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
2818         transpose_16x16_avx2(out, in);
2819         write_buffer_16x16(in, coeff);
2820         break;
2821     case FLIPADST_DCT:
2822         load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
2823         fadst16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
2824         col_txfm_16x16_rounding(out, -shift[1]);
2825         transpose_16x16_avx2(out, in);
2826         fdct16x16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
2827         transpose_16x16_avx2(out, in);
2828         write_buffer_16x16(in, coeff);
2829         break;
2830     case FLIPADST_FLIPADST:
2831         load_buffer_16x16(input, in, stride, 1, 1, shift[0]);
2832         fadst16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
2833         col_txfm_16x16_rounding(out, -shift[1]);
2834         transpose_16x16_avx2(out, in);
2835         fadst16x16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
2836         transpose_16x16_avx2(out, in);
2837         write_buffer_16x16(in, coeff);
2838         break;
2839     case ADST_FLIPADST:
2840         load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
2841         fadst16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
2842         col_txfm_16x16_rounding(out, -shift[1]);
2843         transpose_16x16_avx2(out, in);
2844         fadst16x16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
2845         transpose_16x16_avx2(out, in);
2846         write_buffer_16x16(in, coeff);
2847         break;
2848     case FLIPADST_ADST:
2849         load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
2850         fadst16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
2851         col_txfm_16x16_rounding(out, -shift[1]);
2852         transpose_16x16_avx2(out, in);
2853         fadst16x16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
2854         transpose_16x16_avx2(out, in);
2855         write_buffer_16x16(in, coeff);
2856         break;
2857     case V_DCT:
2858         load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
2859         fdct16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
2860         col_txfm_16x16_rounding(out, -shift[1]);
2861         fidtx16x16_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
2862         write_buffer_16x16(out, coeff);
2863         break;
2864     case H_DCT:
2865         load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
2866         fidtx16x16_avx2(in, in, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
2867         col_txfm_16x16_rounding(in, -shift[1]);
2868         transpose_16x16_avx2(in, out);
2869         fdct16x16_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
2870         transpose_16x16_avx2(in, out);
2871         write_buffer_16x16(out, coeff);
2872         break;
2873     case V_ADST:
2874         load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
2875         fadst16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
2876         col_txfm_16x16_rounding(out, -shift[1]);
2877         fidtx16x16_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
2878         write_buffer_16x16(out, coeff);
2879         break;
2880     case H_ADST:
2881         load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
2882         fidtx16x16_avx2(in, in, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
2883         col_txfm_16x16_rounding(in, -shift[1]);
2884         transpose_16x16_avx2(in, out);
2885         fadst16x16_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
2886         transpose_16x16_avx2(in, out);
2887         write_buffer_16x16(out, coeff);
2888         break;
2889     case V_FLIPADST:
2890         load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
2891         fadst16x16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
2892         col_txfm_16x16_rounding(out, -shift[1]);
2893         fidtx16x16_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
2894         write_buffer_16x16(out, coeff);
2895         break;
2896     case H_FLIPADST:
2897         load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
2898         fidtx16x16_avx2(in, in, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
2899         col_txfm_16x16_rounding(in, -shift[1]);
2900         transpose_16x16_avx2(in, out);
2901         fadst16x16_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
2902         transpose_16x16_avx2(in, out);
2903         write_buffer_16x16(out, coeff);
2904         break;
2905     default: assert(0);
2906     }
2907     (void)bd;
2908 }
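/*
 * Usage sketch (illustrative only; `residual` and `stride` are assumed to be
 * a 16x16 block of int16_t residuals and its row stride in samples):
 *
 *   int32_t coeff[16 * 16];
 *   svt_av1_fwd_txfm2d_16x16_avx2(residual, coeff, stride, DCT_DCT, 10);
 *
 * The bit-depth argument is ignored by this kernel ((void)bd above).
 */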
2909 
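/*
 * 32-point forward DCT. `col_num` is the total number of 32-bit columns; each
 * loop iteration handles one group of eight (columns = col_num >> 3), and
 * `stride` is the distance, in __m256i registers, between successive
 * transform rows. Non-trivial rotations go through the btf_32_type0/type1
 * helpers, which take the 1 << (cos_bit - 1) rounding constant and cos_bit.
 */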
2910 static void av1_fdct32_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit,
2911                                 const int32_t col_num, const int32_t stride) {
2912     const int32_t *cospi      = cospi_arr(cos_bit);
2913     const __m256i  __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
2914     const int32_t  columns    = col_num >> 3;
2915 
2916     __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]);
2917     __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]);
2918     __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]);
2919     __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]);
2920     __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]);
2921     __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]);
2922     __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]);
2923     __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]);
2924     __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]);
2925     __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]);
2926     __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]);
2927     __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]);
2928     __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]);
2929     __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]);
2930     __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]);
2931     __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]);
2932     __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]);
2933     __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]);
2934     __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]);
2935     __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]);
2936     __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]);
2937     __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]);
2938     __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]);
2939     __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]);
2940     __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]);
2941     __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]);
2942     __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]);
2943     __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]);
2944     __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]);
2945     __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]);
2946     __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]);
2947     __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]);
2948     __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]);
2949     __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]);
2950     __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]);
2951     __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]);
2952     __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]);
2953     __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]);
2954 
2955     __m256i buf0[32];
2956     __m256i buf1[32];
2957 
2958     for (int32_t col = 0; col < columns; col++) {
2959         const __m256i *in  = &input[col];
2960         __m256i *      out = &output[col];
2961 
2962         // stage 0
2963         // stage 1
2964         buf1[0]  = _mm256_add_epi32(in[0 * stride], in[31 * stride]);
2965         buf1[31] = _mm256_sub_epi32(in[0 * stride], in[31 * stride]);
2966         buf1[1]  = _mm256_add_epi32(in[1 * stride], in[30 * stride]);
2967         buf1[30] = _mm256_sub_epi32(in[1 * stride], in[30 * stride]);
2968         buf1[2]  = _mm256_add_epi32(in[2 * stride], in[29 * stride]);
2969         buf1[29] = _mm256_sub_epi32(in[2 * stride], in[29 * stride]);
2970         buf1[3]  = _mm256_add_epi32(in[3 * stride], in[28 * stride]);
2971         buf1[28] = _mm256_sub_epi32(in[3 * stride], in[28 * stride]);
2972         buf1[4]  = _mm256_add_epi32(in[4 * stride], in[27 * stride]);
2973         buf1[27] = _mm256_sub_epi32(in[4 * stride], in[27 * stride]);
2974         buf1[5]  = _mm256_add_epi32(in[5 * stride], in[26 * stride]);
2975         buf1[26] = _mm256_sub_epi32(in[5 * stride], in[26 * stride]);
2976         buf1[6]  = _mm256_add_epi32(in[6 * stride], in[25 * stride]);
2977         buf1[25] = _mm256_sub_epi32(in[6 * stride], in[25 * stride]);
2978         buf1[7]  = _mm256_add_epi32(in[7 * stride], in[24 * stride]);
2979         buf1[24] = _mm256_sub_epi32(in[7 * stride], in[24 * stride]);
2980         buf1[8]  = _mm256_add_epi32(in[8 * stride], in[23 * stride]);
2981         buf1[23] = _mm256_sub_epi32(in[8 * stride], in[23 * stride]);
2982         buf1[9]  = _mm256_add_epi32(in[9 * stride], in[22 * stride]);
2983         buf1[22] = _mm256_sub_epi32(in[9 * stride], in[22 * stride]);
2984         buf1[10] = _mm256_add_epi32(in[10 * stride], in[21 * stride]);
2985         buf1[21] = _mm256_sub_epi32(in[10 * stride], in[21 * stride]);
2986         buf1[11] = _mm256_add_epi32(in[11 * stride], in[20 * stride]);
2987         buf1[20] = _mm256_sub_epi32(in[11 * stride], in[20 * stride]);
2988         buf1[12] = _mm256_add_epi32(in[12 * stride], in[19 * stride]);
2989         buf1[19] = _mm256_sub_epi32(in[12 * stride], in[19 * stride]);
2990         buf1[13] = _mm256_add_epi32(in[13 * stride], in[18 * stride]);
2991         buf1[18] = _mm256_sub_epi32(in[13 * stride], in[18 * stride]);
2992         buf1[14] = _mm256_add_epi32(in[14 * stride], in[17 * stride]);
2993         buf1[17] = _mm256_sub_epi32(in[14 * stride], in[17 * stride]);
2994         buf1[15] = _mm256_add_epi32(in[15 * stride], in[16 * stride]);
2995         buf1[16] = _mm256_sub_epi32(in[15 * stride], in[16 * stride]);
2996 
2997         // stage 2
2998         buf0[0]  = _mm256_add_epi32(buf1[0], buf1[15]);
2999         buf0[15] = _mm256_sub_epi32(buf1[0], buf1[15]);
3000         buf0[1]  = _mm256_add_epi32(buf1[1], buf1[14]);
3001         buf0[14] = _mm256_sub_epi32(buf1[1], buf1[14]);
3002         buf0[2]  = _mm256_add_epi32(buf1[2], buf1[13]);
3003         buf0[13] = _mm256_sub_epi32(buf1[2], buf1[13]);
3004         buf0[3]  = _mm256_add_epi32(buf1[3], buf1[12]);
3005         buf0[12] = _mm256_sub_epi32(buf1[3], buf1[12]);
3006         buf0[4]  = _mm256_add_epi32(buf1[4], buf1[11]);
3007         buf0[11] = _mm256_sub_epi32(buf1[4], buf1[11]);
3008         buf0[5]  = _mm256_add_epi32(buf1[5], buf1[10]);
3009         buf0[10] = _mm256_sub_epi32(buf1[5], buf1[10]);
3010         buf0[6]  = _mm256_add_epi32(buf1[6], buf1[9]);
3011         buf0[9]  = _mm256_sub_epi32(buf1[6], buf1[9]);
3012         buf0[7]  = _mm256_add_epi32(buf1[7], buf1[8]);
3013         buf0[8]  = _mm256_sub_epi32(buf1[7], buf1[8]);
3014         btf_32_type0_avx2_new(
3015             cospi_m32, cospi_p32, buf1[20], buf1[27], buf0[20], buf0[27], __rounding, cos_bit);
3016         btf_32_type0_avx2_new(
3017             cospi_m32, cospi_p32, buf1[21], buf1[26], buf0[21], buf0[26], __rounding, cos_bit);
3018         btf_32_type0_avx2_new(
3019             cospi_m32, cospi_p32, buf1[22], buf1[25], buf0[22], buf0[25], __rounding, cos_bit);
3020         btf_32_type0_avx2_new(
3021             cospi_m32, cospi_p32, buf1[23], buf1[24], buf0[23], buf0[24], __rounding, cos_bit);
3022 
3023         // stage 3
3024         buf1[0] = _mm256_add_epi32(buf0[0], buf0[7]);
3025         buf1[7] = _mm256_sub_epi32(buf0[0], buf0[7]);
3026         buf1[1] = _mm256_add_epi32(buf0[1], buf0[6]);
3027         buf1[6] = _mm256_sub_epi32(buf0[1], buf0[6]);
3028         buf1[2] = _mm256_add_epi32(buf0[2], buf0[5]);
3029         buf1[5] = _mm256_sub_epi32(buf0[2], buf0[5]);
3030         buf1[3] = _mm256_add_epi32(buf0[3], buf0[4]);
3031         buf1[4] = _mm256_sub_epi32(buf0[3], buf0[4]);
3032         btf_32_type0_avx2_new(
3033             cospi_m32, cospi_p32, buf0[10], buf0[13], buf1[10], buf1[13], __rounding, cos_bit);
3034         btf_32_type0_avx2_new(
3035             cospi_m32, cospi_p32, buf0[11], buf0[12], buf1[11], buf1[12], __rounding, cos_bit);
3036         buf1[23] = _mm256_sub_epi32(buf1[16], buf0[23]);
3037         buf1[16] = _mm256_add_epi32(buf1[16], buf0[23]);
3038         buf1[22] = _mm256_sub_epi32(buf1[17], buf0[22]);
3039         buf1[17] = _mm256_add_epi32(buf1[17], buf0[22]);
3040         buf1[21] = _mm256_sub_epi32(buf1[18], buf0[21]);
3041         buf1[18] = _mm256_add_epi32(buf1[18], buf0[21]);
3042         buf1[20] = _mm256_sub_epi32(buf1[19], buf0[20]);
3043         buf1[19] = _mm256_add_epi32(buf1[19], buf0[20]);
3044         buf1[24] = _mm256_sub_epi32(buf1[31], buf0[24]);
3045         buf1[31] = _mm256_add_epi32(buf1[31], buf0[24]);
3046         buf1[25] = _mm256_sub_epi32(buf1[30], buf0[25]);
3047         buf1[30] = _mm256_add_epi32(buf1[30], buf0[25]);
3048         buf1[26] = _mm256_sub_epi32(buf1[29], buf0[26]);
3049         buf1[29] = _mm256_add_epi32(buf1[29], buf0[26]);
3050         buf1[27] = _mm256_sub_epi32(buf1[28], buf0[27]);
3051         buf1[28] = _mm256_add_epi32(buf1[28], buf0[27]);
3052 
3053         // stage 4
3054         buf0[0] = _mm256_add_epi32(buf1[0], buf1[3]);
3055         buf0[3] = _mm256_sub_epi32(buf1[0], buf1[3]);
3056         buf0[1] = _mm256_add_epi32(buf1[1], buf1[2]);
3057         buf0[2] = _mm256_sub_epi32(buf1[1], buf1[2]);
3058         btf_32_type0_avx2_new(
3059             cospi_m32, cospi_p32, buf1[5], buf1[6], buf0[5], buf0[6], __rounding, cos_bit);
3060         buf0[11] = _mm256_sub_epi32(buf0[8], buf1[11]);
3061         buf0[8]  = _mm256_add_epi32(buf0[8], buf1[11]);
3062         buf0[10] = _mm256_sub_epi32(buf0[9], buf1[10]);
3063         buf0[9]  = _mm256_add_epi32(buf0[9], buf1[10]);
3064         buf0[12] = _mm256_sub_epi32(buf0[15], buf1[12]);
3065         buf0[15] = _mm256_add_epi32(buf0[15], buf1[12]);
3066         buf0[13] = _mm256_sub_epi32(buf0[14], buf1[13]);
3067         buf0[14] = _mm256_add_epi32(buf0[14], buf1[13]);
3068         btf_32_type0_avx2_new(
3069             cospi_m16, cospi_p48, buf1[18], buf1[29], buf0[18], buf0[29], __rounding, cos_bit);
3070         btf_32_type0_avx2_new(
3071             cospi_m16, cospi_p48, buf1[19], buf1[28], buf0[19], buf0[28], __rounding, cos_bit);
3072         btf_32_type0_avx2_new(
3073             cospi_m48, cospi_m16, buf1[20], buf1[27], buf0[20], buf0[27], __rounding, cos_bit);
3074         btf_32_type0_avx2_new(
3075             cospi_m48, cospi_m16, buf1[21], buf1[26], buf0[21], buf0[26], __rounding, cos_bit);
3076 
3077         // stage 5
3078         btf_32_type0_avx2_new(cospi_p32,
3079                               cospi_p32,
3080                               buf0[0],
3081                               buf0[1],
3082                               out[0 * stride],
3083                               out[16 * stride],
3084                               __rounding,
3085                               cos_bit);
3086         btf_32_type1_avx2_new(cospi_p48,
3087                               cospi_p16,
3088                               buf0[2],
3089                               buf0[3],
3090                               out[8 * stride],
3091                               out[24 * stride],
3092                               __rounding,
3093                               cos_bit);
3094         buf1[5] = _mm256_sub_epi32(buf1[4], buf0[5]);
3095         buf1[4] = _mm256_add_epi32(buf1[4], buf0[5]);
3096         buf1[6] = _mm256_sub_epi32(buf1[7], buf0[6]);
3097         buf1[7] = _mm256_add_epi32(buf1[7], buf0[6]);
3098         btf_32_type0_avx2_new(
3099             cospi_m16, cospi_p48, buf0[9], buf0[14], buf1[9], buf1[14], __rounding, cos_bit);
3100         btf_32_type0_avx2_new(
3101             cospi_m48, cospi_m16, buf0[10], buf0[13], buf1[10], buf1[13], __rounding, cos_bit);
3102         buf1[19] = _mm256_sub_epi32(buf1[16], buf0[19]);
3103         buf1[16] = _mm256_add_epi32(buf1[16], buf0[19]);
3104         buf1[18] = _mm256_sub_epi32(buf1[17], buf0[18]);
3105         buf1[17] = _mm256_add_epi32(buf1[17], buf0[18]);
3106         buf1[20] = _mm256_sub_epi32(buf1[23], buf0[20]);
3107         buf1[23] = _mm256_add_epi32(buf1[23], buf0[20]);
3108         buf1[21] = _mm256_sub_epi32(buf1[22], buf0[21]);
3109         buf1[22] = _mm256_add_epi32(buf1[22], buf0[21]);
3110         buf1[27] = _mm256_sub_epi32(buf1[24], buf0[27]);
3111         buf1[24] = _mm256_add_epi32(buf1[24], buf0[27]);
3112         buf1[26] = _mm256_sub_epi32(buf1[25], buf0[26]);
3113         buf1[25] = _mm256_add_epi32(buf1[25], buf0[26]);
3114         buf1[28] = _mm256_sub_epi32(buf1[31], buf0[28]);
3115         buf1[31] = _mm256_add_epi32(buf1[31], buf0[28]);
3116         buf1[29] = _mm256_sub_epi32(buf1[30], buf0[29]);
3117         buf1[30] = _mm256_add_epi32(buf1[30], buf0[29]);
3118 
3119         // stage 6
3120         btf_32_type1_avx2_new(cospi_p56,
3121                               cospi_p08,
3122                               buf1[4],
3123                               buf1[7],
3124                               out[4 * stride],
3125                               out[28 * stride],
3126                               __rounding,
3127                               cos_bit);
3128         btf_32_type1_avx2_new(cospi_p24,
3129                               cospi_p40,
3130                               buf1[5],
3131                               buf1[6],
3132                               out[20 * stride],
3133                               out[12 * stride],
3134                               __rounding,
3135                               cos_bit);
3136         buf0[9]  = _mm256_sub_epi32(buf0[8], buf1[9]);
3137         buf0[8]  = _mm256_add_epi32(buf0[8], buf1[9]);
3138         buf0[10] = _mm256_sub_epi32(buf0[11], buf1[10]);
3139         buf0[11] = _mm256_add_epi32(buf0[11], buf1[10]);
3140         buf0[13] = _mm256_sub_epi32(buf0[12], buf1[13]);
3141         buf0[12] = _mm256_add_epi32(buf0[12], buf1[13]);
3142         buf0[14] = _mm256_sub_epi32(buf0[15], buf1[14]);
3143         buf0[15] = _mm256_add_epi32(buf0[15], buf1[14]);
3144         btf_32_type0_avx2_new(
3145             cospi_m08, cospi_p56, buf1[17], buf1[30], buf0[17], buf0[30], __rounding, cos_bit);
3146         btf_32_type0_avx2_new(
3147             cospi_m56, cospi_m08, buf1[18], buf1[29], buf0[18], buf0[29], __rounding, cos_bit);
3148         btf_32_type0_avx2_new(
3149             cospi_m40, cospi_p24, buf1[21], buf1[26], buf0[21], buf0[26], __rounding, cos_bit);
3150         btf_32_type0_avx2_new(
3151             cospi_m24, cospi_m40, buf1[22], buf1[25], buf0[22], buf0[25], __rounding, cos_bit);
3152 
3153         // stage 7
3154         btf_32_type1_avx2_new(cospi_p60,
3155                               cospi_p04,
3156                               buf0[8],
3157                               buf0[15],
3158                               out[2 * stride],
3159                               out[30 * stride],
3160                               __rounding,
3161                               cos_bit);
3162         btf_32_type1_avx2_new(cospi_p28,
3163                               cospi_p36,
3164                               buf0[9],
3165                               buf0[14],
3166                               out[18 * stride],
3167                               out[14 * stride],
3168                               __rounding,
3169                               cos_bit);
3170         btf_32_type1_avx2_new(cospi_p44,
3171                               cospi_p20,
3172                               buf0[10],
3173                               buf0[13],
3174                               out[10 * stride],
3175                               out[22 * stride],
3176                               __rounding,
3177                               cos_bit);
3178         btf_32_type1_avx2_new(cospi_p12,
3179                               cospi_p52,
3180                               buf0[11],
3181                               buf0[12],
3182                               out[26 * stride],
3183                               out[6 * stride],
3184                               __rounding,
3185                               cos_bit);
3186         buf1[17] = _mm256_sub_epi32(buf1[16], buf0[17]);
3187         buf1[16] = _mm256_add_epi32(buf1[16], buf0[17]);
3188         buf1[18] = _mm256_sub_epi32(buf1[19], buf0[18]);
3189         buf1[19] = _mm256_add_epi32(buf1[19], buf0[18]);
3190         buf1[21] = _mm256_sub_epi32(buf1[20], buf0[21]);
3191         buf1[20] = _mm256_add_epi32(buf1[20], buf0[21]);
3192         buf1[22] = _mm256_sub_epi32(buf1[23], buf0[22]);
3193         buf1[23] = _mm256_add_epi32(buf1[23], buf0[22]);
3194         buf1[25] = _mm256_sub_epi32(buf1[24], buf0[25]);
3195         buf1[24] = _mm256_add_epi32(buf1[24], buf0[25]);
3196         buf1[26] = _mm256_sub_epi32(buf1[27], buf0[26]);
3197         buf1[27] = _mm256_add_epi32(buf1[27], buf0[26]);
3198         buf1[29] = _mm256_sub_epi32(buf1[28], buf0[29]);
3199         buf1[28] = _mm256_add_epi32(buf1[28], buf0[29]);
3200         buf1[30] = _mm256_sub_epi32(buf1[31], buf0[30]);
3201         buf1[31] = _mm256_add_epi32(buf1[31], buf0[30]);
3202 
3203         // stage 8
3204         btf_32_type1_avx2_new(cospi_p62,
3205                               cospi_p02,
3206                               buf1[16],
3207                               buf1[31],
3208                               out[1 * stride],
3209                               out[31 * stride],
3210                               __rounding,
3211                               cos_bit);
3212         btf_32_type1_avx2_new(cospi_p30,
3213                               cospi_p34,
3214                               buf1[17],
3215                               buf1[30],
3216                               out[17 * stride],
3217                               out[15 * stride],
3218                               __rounding,
3219                               cos_bit);
3220         btf_32_type1_avx2_new(cospi_p46,
3221                               cospi_p18,
3222                               buf1[18],
3223                               buf1[29],
3224                               out[9 * stride],
3225                               out[23 * stride],
3226                               __rounding,
3227                               cos_bit);
3228         btf_32_type1_avx2_new(cospi_p14,
3229                               cospi_p50,
3230                               buf1[19],
3231                               buf1[28],
3232                               out[25 * stride],
3233                               out[7 * stride],
3234                               __rounding,
3235                               cos_bit);
3236         btf_32_type1_avx2_new(cospi_p54,
3237                               cospi_p10,
3238                               buf1[20],
3239                               buf1[27],
3240                               out[5 * stride],
3241                               out[27 * stride],
3242                               __rounding,
3243                               cos_bit);
3244         btf_32_type1_avx2_new(cospi_p22,
3245                               cospi_p42,
3246                               buf1[21],
3247                               buf1[26],
3248                               out[21 * stride],
3249                               out[11 * stride],
3250                               __rounding,
3251                               cos_bit);
3252         btf_32_type1_avx2_new(cospi_p38,
3253                               cospi_p26,
3254                               buf1[22],
3255                               buf1[25],
3256                               out[13 * stride],
3257                               out[19 * stride],
3258                               __rounding,
3259                               cos_bit);
3260         btf_32_type1_avx2_new(cospi_p06,
3261                               cospi_p58,
3262                               buf1[23],
3263                               buf1[24],
3264                               out[29 * stride],
3265                               out[3 * stride],
3266                               __rounding,
3267                               cos_bit);
3268     }
3269 }
3270 
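/* Thin wrapper that pins col_num to 8, presumably so the 32-point DCT can be
 * invoked through the common fixed-signature 1-D transform interface. */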
3271 static void av1_fdct32_new_line_wraper_avx2(const __m256i *input, __m256i *output, int8_t cos_bit,
3272                                             const int32_t stride) {
3273     av1_fdct32_new_avx2(input, output, cos_bit, 8, stride);
3274 }
3275 
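/*
 * 64-point forward DCT with the same calling convention as
 * av1_fdct32_new_avx2: `col_num` total columns processed eight at a time,
 * `stride` __m256i registers between successive transform rows.
 */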
3276 static void av1_fdct64_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit,
3277                                 const int32_t col_num, const int32_t stride) {
3278     const int32_t *cospi      = cospi_arr(cos_bit);
3279     const __m256i  __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
3280     const int32_t  columns    = col_num >> 3;
3281 
3282     __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]);
3283     __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]);
3284     __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]);
3285     __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]);
3286     __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]);
3287     __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]);
3288     __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]);
3289     __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]);
3290     __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]);
3291     __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]);
3292     __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]);
3293     __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]);
3294     __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]);
3295     __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]);
3296     __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]);
3297     __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]);
3298     __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]);
3299     __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]);
3300     __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]);
3301     __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]);
3302     __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]);
3303     __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]);
3304     __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]);
3305     __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]);
3306     __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]);
3307     __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]);
3308     __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]);
3309     __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]);
3310     __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]);
3311     __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]);
3312     __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]);
3313     __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]);
3314     __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]);
3315     __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]);
3316     __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]);
3317     __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]);
3318     __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]);
3319     __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]);
3320     __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]);
3321     __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]);
3322     __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]);
3323     __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]);
3324     __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]);
3325     __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]);
3326     __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]);
3327     __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]);
3328     __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]);
3329     __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]);
3330     __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]);
3331     __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]);
3332     __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]);
3333     __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]);
3334     __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]);
3335     __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]);
3336     __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]);
3337     __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]);
3338     __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]);
3339     __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]);
3340     __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]);
3341     __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]);
3342     __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]);
3343     __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]);
3344     __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]);
3345     __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]);
3346     __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]);
3347     __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]);
3348     __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]);
3349     __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]);
3350     __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]);
3351     __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]);
3352     __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]);
3353     __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]);
3354     __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]);
3355     __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]);
3356     __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]);
3357     __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]);
3358     __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]);
3359     __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]);
3360 
3361     for (int32_t col = 0; col < columns; col++) {
3362         const __m256i *in  = &input[col];
3363         __m256i *      out = &output[col];
3364 
3365         // stage 1
3366         __m256i x1[64];
3367         x1[0]  = _mm256_add_epi32(in[0 * stride], in[63 * stride]);
3368         x1[63] = _mm256_sub_epi32(in[0 * stride], in[63 * stride]);
3369         x1[1]  = _mm256_add_epi32(in[1 * stride], in[62 * stride]);
3370         x1[62] = _mm256_sub_epi32(in[1 * stride], in[62 * stride]);
3371         x1[2]  = _mm256_add_epi32(in[2 * stride], in[61 * stride]);
3372         x1[61] = _mm256_sub_epi32(in[2 * stride], in[61 * stride]);
3373         x1[3]  = _mm256_add_epi32(in[3 * stride], in[60 * stride]);
3374         x1[60] = _mm256_sub_epi32(in[3 * stride], in[60 * stride]);
3375         x1[4]  = _mm256_add_epi32(in[4 * stride], in[59 * stride]);
3376         x1[59] = _mm256_sub_epi32(in[4 * stride], in[59 * stride]);
3377         x1[5]  = _mm256_add_epi32(in[5 * stride], in[58 * stride]);
3378         x1[58] = _mm256_sub_epi32(in[5 * stride], in[58 * stride]);
3379         x1[6]  = _mm256_add_epi32(in[6 * stride], in[57 * stride]);
3380         x1[57] = _mm256_sub_epi32(in[6 * stride], in[57 * stride]);
3381         x1[7]  = _mm256_add_epi32(in[7 * stride], in[56 * stride]);
3382         x1[56] = _mm256_sub_epi32(in[7 * stride], in[56 * stride]);
3383         x1[8]  = _mm256_add_epi32(in[8 * stride], in[55 * stride]);
3384         x1[55] = _mm256_sub_epi32(in[8 * stride], in[55 * stride]);
3385         x1[9]  = _mm256_add_epi32(in[9 * stride], in[54 * stride]);
3386         x1[54] = _mm256_sub_epi32(in[9 * stride], in[54 * stride]);
3387         x1[10] = _mm256_add_epi32(in[10 * stride], in[53 * stride]);
3388         x1[53] = _mm256_sub_epi32(in[10 * stride], in[53 * stride]);
3389         x1[11] = _mm256_add_epi32(in[11 * stride], in[52 * stride]);
3390         x1[52] = _mm256_sub_epi32(in[11 * stride], in[52 * stride]);
3391         x1[12] = _mm256_add_epi32(in[12 * stride], in[51 * stride]);
3392         x1[51] = _mm256_sub_epi32(in[12 * stride], in[51 * stride]);
3393         x1[13] = _mm256_add_epi32(in[13 * stride], in[50 * stride]);
3394         x1[50] = _mm256_sub_epi32(in[13 * stride], in[50 * stride]);
3395         x1[14] = _mm256_add_epi32(in[14 * stride], in[49 * stride]);
3396         x1[49] = _mm256_sub_epi32(in[14 * stride], in[49 * stride]);
3397         x1[15] = _mm256_add_epi32(in[15 * stride], in[48 * stride]);
3398         x1[48] = _mm256_sub_epi32(in[15 * stride], in[48 * stride]);
3399         x1[16] = _mm256_add_epi32(in[16 * stride], in[47 * stride]);
3400         x1[47] = _mm256_sub_epi32(in[16 * stride], in[47 * stride]);
3401         x1[17] = _mm256_add_epi32(in[17 * stride], in[46 * stride]);
3402         x1[46] = _mm256_sub_epi32(in[17 * stride], in[46 * stride]);
3403         x1[18] = _mm256_add_epi32(in[18 * stride], in[45 * stride]);
3404         x1[45] = _mm256_sub_epi32(in[18 * stride], in[45 * stride]);
3405         x1[19] = _mm256_add_epi32(in[19 * stride], in[44 * stride]);
3406         x1[44] = _mm256_sub_epi32(in[19 * stride], in[44 * stride]);
3407         x1[20] = _mm256_add_epi32(in[20 * stride], in[43 * stride]);
3408         x1[43] = _mm256_sub_epi32(in[20 * stride], in[43 * stride]);
3409         x1[21] = _mm256_add_epi32(in[21 * stride], in[42 * stride]);
3410         x1[42] = _mm256_sub_epi32(in[21 * stride], in[42 * stride]);
3411         x1[22] = _mm256_add_epi32(in[22 * stride], in[41 * stride]);
3412         x1[41] = _mm256_sub_epi32(in[22 * stride], in[41 * stride]);
3413         x1[23] = _mm256_add_epi32(in[23 * stride], in[40 * stride]);
3414         x1[40] = _mm256_sub_epi32(in[23 * stride], in[40 * stride]);
3415         x1[24] = _mm256_add_epi32(in[24 * stride], in[39 * stride]);
3416         x1[39] = _mm256_sub_epi32(in[24 * stride], in[39 * stride]);
3417         x1[25] = _mm256_add_epi32(in[25 * stride], in[38 * stride]);
3418         x1[38] = _mm256_sub_epi32(in[25 * stride], in[38 * stride]);
3419         x1[26] = _mm256_add_epi32(in[26 * stride], in[37 * stride]);
3420         x1[37] = _mm256_sub_epi32(in[26 * stride], in[37 * stride]);
3421         x1[27] = _mm256_add_epi32(in[27 * stride], in[36 * stride]);
3422         x1[36] = _mm256_sub_epi32(in[27 * stride], in[36 * stride]);
3423         x1[28] = _mm256_add_epi32(in[28 * stride], in[35 * stride]);
3424         x1[35] = _mm256_sub_epi32(in[28 * stride], in[35 * stride]);
3425         x1[29] = _mm256_add_epi32(in[29 * stride], in[34 * stride]);
3426         x1[34] = _mm256_sub_epi32(in[29 * stride], in[34 * stride]);
3427         x1[30] = _mm256_add_epi32(in[30 * stride], in[33 * stride]);
3428         x1[33] = _mm256_sub_epi32(in[30 * stride], in[33 * stride]);
3429         x1[31] = _mm256_add_epi32(in[31 * stride], in[32 * stride]);
3430         x1[32] = _mm256_sub_epi32(in[31 * stride], in[32 * stride]);
3431 
3432         // stage 2
3433         __m256i x2[54];
3434         x2[0]  = _mm256_add_epi32(x1[0], x1[31]);
3435         x2[31] = _mm256_sub_epi32(x1[0], x1[31]);
3436         x2[1]  = _mm256_add_epi32(x1[1], x1[30]);
3437         x2[30] = _mm256_sub_epi32(x1[1], x1[30]);
3438         x2[2]  = _mm256_add_epi32(x1[2], x1[29]);
3439         x2[29] = _mm256_sub_epi32(x1[2], x1[29]);
3440         x2[3]  = _mm256_add_epi32(x1[3], x1[28]);
3441         x2[28] = _mm256_sub_epi32(x1[3], x1[28]);
3442         x2[4]  = _mm256_add_epi32(x1[4], x1[27]);
3443         x2[27] = _mm256_sub_epi32(x1[4], x1[27]);
3444         x2[5]  = _mm256_add_epi32(x1[5], x1[26]);
3445         x2[26] = _mm256_sub_epi32(x1[5], x1[26]);
3446         x2[6]  = _mm256_add_epi32(x1[6], x1[25]);
3447         x2[25] = _mm256_sub_epi32(x1[6], x1[25]);
3448         x2[7]  = _mm256_add_epi32(x1[7], x1[24]);
3449         x2[24] = _mm256_sub_epi32(x1[7], x1[24]);
3450         x2[8]  = _mm256_add_epi32(x1[8], x1[23]);
3451         x2[23] = _mm256_sub_epi32(x1[8], x1[23]);
3452         x2[9]  = _mm256_add_epi32(x1[9], x1[22]);
3453         x2[22] = _mm256_sub_epi32(x1[9], x1[22]);
3454         x2[10] = _mm256_add_epi32(x1[10], x1[21]);
3455         x2[21] = _mm256_sub_epi32(x1[10], x1[21]);
3456         x2[11] = _mm256_add_epi32(x1[11], x1[20]);
3457         x2[20] = _mm256_sub_epi32(x1[11], x1[20]);
3458         x2[12] = _mm256_add_epi32(x1[12], x1[19]);
3459         x2[19] = _mm256_sub_epi32(x1[12], x1[19]);
3460         x2[13] = _mm256_add_epi32(x1[13], x1[18]);
3461         x2[18] = _mm256_sub_epi32(x1[13], x1[18]);
3462         x2[14] = _mm256_add_epi32(x1[14], x1[17]);
3463         x2[17] = _mm256_sub_epi32(x1[14], x1[17]);
3464         x2[15] = _mm256_add_epi32(x1[15], x1[16]);
3465         x2[16] = _mm256_sub_epi32(x1[15], x1[16]);
3466         btf_32_type0_avx2_new(
3467             cospi_m32, cospi_p32, x1[40], x1[55], x2[32], x2[47], __rounding, cos_bit);
3468         btf_32_type0_avx2_new(
3469             cospi_m32, cospi_p32, x1[41], x1[54], x2[33], x2[46], __rounding, cos_bit);
3470         btf_32_type0_avx2_new(
3471             cospi_m32, cospi_p32, x1[42], x1[53], x2[34], x2[45], __rounding, cos_bit);
3472         btf_32_type0_avx2_new(
3473             cospi_m32, cospi_p32, x1[43], x1[52], x2[35], x2[44], __rounding, cos_bit);
3474         btf_32_type0_avx2_new(
3475             cospi_m32, cospi_p32, x1[44], x1[51], x2[36], x2[43], __rounding, cos_bit);
3476         btf_32_type0_avx2_new(
3477             cospi_m32, cospi_p32, x1[45], x1[50], x2[37], x2[42], __rounding, cos_bit);
3478         btf_32_type0_avx2_new(
3479             cospi_m32, cospi_p32, x1[46], x1[49], x2[38], x2[41], __rounding, cos_bit);
3480         btf_32_type0_avx2_new(
3481             cospi_m32, cospi_p32, x1[47], x1[48], x2[39], x2[40], __rounding, cos_bit);
3482 
3483         // stage 3
3484         __m256i x3[56];
3485         x3[0]  = _mm256_add_epi32(x2[0], x2[15]);
3486         x3[15] = _mm256_sub_epi32(x2[0], x2[15]);
3487         x3[1]  = _mm256_add_epi32(x2[1], x2[14]);
3488         x3[14] = _mm256_sub_epi32(x2[1], x2[14]);
3489         x3[2]  = _mm256_add_epi32(x2[2], x2[13]);
3490         x3[13] = _mm256_sub_epi32(x2[2], x2[13]);
3491         x3[3]  = _mm256_add_epi32(x2[3], x2[12]);
3492         x3[12] = _mm256_sub_epi32(x2[3], x2[12]);
3493         x3[4]  = _mm256_add_epi32(x2[4], x2[11]);
3494         x3[11] = _mm256_sub_epi32(x2[4], x2[11]);
3495         x3[5]  = _mm256_add_epi32(x2[5], x2[10]);
3496         x3[10] = _mm256_sub_epi32(x2[5], x2[10]);
3497         x3[6]  = _mm256_add_epi32(x2[6], x2[9]);
3498         x3[9]  = _mm256_sub_epi32(x2[6], x2[9]);
3499         x3[7]  = _mm256_add_epi32(x2[7], x2[8]);
3500         x3[8]  = _mm256_sub_epi32(x2[7], x2[8]);
3501         btf_32_type0_avx2_new(
3502             cospi_m32, cospi_p32, x2[20], x2[27], x3[16], x3[23], __rounding, cos_bit);
3503         btf_32_type0_avx2_new(
3504             cospi_m32, cospi_p32, x2[21], x2[26], x3[17], x3[22], __rounding, cos_bit);
3505         btf_32_type0_avx2_new(
3506             cospi_m32, cospi_p32, x2[22], x2[25], x3[18], x3[21], __rounding, cos_bit);
3507         btf_32_type0_avx2_new(
3508             cospi_m32, cospi_p32, x2[23], x2[24], x3[19], x3[20], __rounding, cos_bit);
3509         x3[32] = _mm256_add_epi32(x1[32], x2[39]);
3510         x3[47] = _mm256_sub_epi32(x1[32], x2[39]);
3511         x3[33] = _mm256_add_epi32(x1[33], x2[38]);
3512         x3[46] = _mm256_sub_epi32(x1[33], x2[38]);
3513         x3[34] = _mm256_add_epi32(x1[34], x2[37]);
3514         x3[45] = _mm256_sub_epi32(x1[34], x2[37]);
3515         x3[35] = _mm256_add_epi32(x1[35], x2[36]);
3516         x3[44] = _mm256_sub_epi32(x1[35], x2[36]);
3517         x3[36] = _mm256_add_epi32(x1[36], x2[35]);
3518         x3[43] = _mm256_sub_epi32(x1[36], x2[35]);
3519         x3[37] = _mm256_add_epi32(x1[37], x2[34]);
3520         x3[42] = _mm256_sub_epi32(x1[37], x2[34]);
3521         x3[38] = _mm256_add_epi32(x1[38], x2[33]);
3522         x3[41] = _mm256_sub_epi32(x1[38], x2[33]);
3523         x3[39] = _mm256_add_epi32(x1[39], x2[32]);
3524         x3[40] = _mm256_sub_epi32(x1[39], x2[32]);
3525         x3[48] = _mm256_sub_epi32(x1[63], x2[40]);
3526         x3[24] = _mm256_add_epi32(x1[63], x2[40]);
3527         x3[49] = _mm256_sub_epi32(x1[62], x2[41]);
3528         x3[25] = _mm256_add_epi32(x1[62], x2[41]);
3529         x3[50] = _mm256_sub_epi32(x1[61], x2[42]);
3530         x3[26] = _mm256_add_epi32(x1[61], x2[42]);
3531         x3[51] = _mm256_sub_epi32(x1[60], x2[43]);
3532         x3[27] = _mm256_add_epi32(x1[60], x2[43]);
3533         x3[52] = _mm256_sub_epi32(x1[59], x2[44]);
3534         x3[28] = _mm256_add_epi32(x1[59], x2[44]);
3535         x3[53] = _mm256_sub_epi32(x1[58], x2[45]);
3536         x3[29] = _mm256_add_epi32(x1[58], x2[45]);
3537         x3[54] = _mm256_sub_epi32(x1[57], x2[46]);
3538         x3[30] = _mm256_add_epi32(x1[57], x2[46]);
3539         x3[55] = _mm256_sub_epi32(x1[56], x2[47]);
3540         x3[31] = _mm256_add_epi32(x1[56], x2[47]);
3541 
3542         // stage 4
3543         // stage 4: reuse x1[] as scratch (would otherwise need __m256i x4[44])
3544         x1[0] = _mm256_add_epi32(x3[0], x3[7]);
3545         x1[7] = _mm256_sub_epi32(x3[0], x3[7]);
3546         x1[1] = _mm256_add_epi32(x3[1], x3[6]);
3547         x1[6] = _mm256_sub_epi32(x3[1], x3[6]);
3548         x1[2] = _mm256_add_epi32(x3[2], x3[5]);
3549         x1[5] = _mm256_sub_epi32(x3[2], x3[5]);
3550         x1[3] = _mm256_add_epi32(x3[3], x3[4]);
3551         x1[4] = _mm256_sub_epi32(x3[3], x3[4]);
3552         btf_32_type0_avx2_new(
3553             cospi_m32, cospi_p32, x3[10], x3[13], x1[8], x1[11], __rounding, cos_bit);
3554         btf_32_type0_avx2_new(
3555             cospi_m32, cospi_p32, x3[11], x3[12], x1[9], x1[10], __rounding, cos_bit);
3556         x1[12] = _mm256_add_epi32(x2[16], x3[19]);
3557         x1[19] = _mm256_sub_epi32(x2[16], x3[19]);
3558         x1[13] = _mm256_add_epi32(x2[17], x3[18]);
3559         x1[18] = _mm256_sub_epi32(x2[17], x3[18]);
3560         x1[14] = _mm256_add_epi32(x2[18], x3[17]);
3561         x1[17] = _mm256_sub_epi32(x2[18], x3[17]);
3562         x1[15] = _mm256_add_epi32(x2[19], x3[16]);
3563         x1[16] = _mm256_sub_epi32(x2[19], x3[16]);
3564         x1[20] = _mm256_sub_epi32(x2[31], x3[20]);
3565         x1[27] = _mm256_add_epi32(x2[31], x3[20]);
3566         x1[21] = _mm256_sub_epi32(x2[30], x3[21]);
3567         x1[26] = _mm256_add_epi32(x2[30], x3[21]);
3568         x1[22] = _mm256_sub_epi32(x2[29], x3[22]);
3569         x1[25] = _mm256_add_epi32(x2[29], x3[22]);
3570         x1[23] = _mm256_sub_epi32(x2[28], x3[23]);
3571         x1[24] = _mm256_add_epi32(x2[28], x3[23]);
3572         btf_32_type0_avx2_new(
3573             cospi_m16, cospi_p48, x3[36], x3[28], x1[28], x1[43], __rounding, cos_bit);
3574         btf_32_type0_avx2_new(
3575             cospi_m16, cospi_p48, x3[37], x3[29], x1[29], x1[42], __rounding, cos_bit);
3576         btf_32_type0_avx2_new(
3577             cospi_m16, cospi_p48, x3[38], x3[30], x1[30], x1[41], __rounding, cos_bit);
3578         btf_32_type0_avx2_new(
3579             cospi_m16, cospi_p48, x3[39], x3[31], x1[31], x1[40], __rounding, cos_bit);
3580         btf_32_type0_avx2_new(
3581             cospi_m48, cospi_m16, x3[40], x3[55], x1[32], x1[39], __rounding, cos_bit);
3582         btf_32_type0_avx2_new(
3583             cospi_m48, cospi_m16, x3[41], x3[54], x1[33], x1[38], __rounding, cos_bit);
3584         btf_32_type0_avx2_new(
3585             cospi_m48, cospi_m16, x3[42], x3[53], x1[34], x1[37], __rounding, cos_bit);
3586         btf_32_type0_avx2_new(
3587             cospi_m48, cospi_m16, x3[43], x3[52], x1[35], x1[36], __rounding, cos_bit);
3588 
3589         // stage 5
3590         // stage 5: reuse x2[] as scratch (would otherwise need __m256i x5[54])
3591         x2[0] = _mm256_add_epi32(x1[0], x1[3]);
3592         x2[3] = _mm256_sub_epi32(x1[0], x1[3]);
3593         x2[1] = _mm256_add_epi32(x1[1], x1[2]);
3594         x2[2] = _mm256_sub_epi32(x1[1], x1[2]);
3595         btf_32_type0_avx2_new(
3596             cospi_m32, cospi_p32, x1[5], x1[6], x2[4], x2[5], __rounding, cos_bit);
3597         x2[6]  = _mm256_add_epi32(x3[8], x1[9]);
3598         x2[9]  = _mm256_sub_epi32(x3[8], x1[9]);
3599         x2[7]  = _mm256_add_epi32(x3[9], x1[8]);
3600         x2[8]  = _mm256_sub_epi32(x3[9], x1[8]);
3601         x2[10] = _mm256_sub_epi32(x3[15], x1[10]);
3602         x2[13] = _mm256_add_epi32(x3[15], x1[10]);
3603         x2[11] = _mm256_sub_epi32(x3[14], x1[11]);
3604         x2[12] = _mm256_add_epi32(x3[14], x1[11]);
3605         btf_32_type0_avx2_new(
3606             cospi_m16, cospi_p48, x1[14], x1[25], x2[14], x2[21], __rounding, cos_bit);
3607         btf_32_type0_avx2_new(
3608             cospi_m16, cospi_p48, x1[15], x1[24], x2[15], x2[20], __rounding, cos_bit);
3609         btf_32_type0_avx2_new(
3610             cospi_m48, cospi_m16, x1[16], x1[23], x2[16], x2[19], __rounding, cos_bit);
3611         btf_32_type0_avx2_new(
3612             cospi_m48, cospi_m16, x1[17], x1[22], x2[17], x2[18], __rounding, cos_bit);
3613         x2[22] = _mm256_add_epi32(x3[32], x1[31]);
3614         x2[29] = _mm256_sub_epi32(x3[32], x1[31]);
3615         x2[23] = _mm256_add_epi32(x3[33], x1[30]);
3616         x2[28] = _mm256_sub_epi32(x3[33], x1[30]);
3617         x2[24] = _mm256_add_epi32(x3[34], x1[29]);
3618         x2[27] = _mm256_sub_epi32(x3[34], x1[29]);
3619         x2[25] = _mm256_add_epi32(x3[35], x1[28]);
3620         x2[26] = _mm256_sub_epi32(x3[35], x1[28]);
3621         x2[30] = _mm256_sub_epi32(x3[47], x1[32]);
3622         x2[37] = _mm256_add_epi32(x3[47], x1[32]);
3623         x2[31] = _mm256_sub_epi32(x3[46], x1[33]);
3624         x2[36] = _mm256_add_epi32(x3[46], x1[33]);
3625         x2[32] = _mm256_sub_epi32(x3[45], x1[34]);
3626         x2[35] = _mm256_add_epi32(x3[45], x1[34]);
3627         x2[33] = _mm256_sub_epi32(x3[44], x1[35]);
3628         x2[34] = _mm256_add_epi32(x3[44], x1[35]);
3629         x2[38] = _mm256_add_epi32(x3[48], x1[39]);
3630         x2[45] = _mm256_sub_epi32(x3[48], x1[39]);
3631         x2[39] = _mm256_add_epi32(x3[49], x1[38]);
3632         x2[44] = _mm256_sub_epi32(x3[49], x1[38]);
3633         x2[40] = _mm256_add_epi32(x3[50], x1[37]);
3634         x2[43] = _mm256_sub_epi32(x3[50], x1[37]);
3635         x2[41] = _mm256_add_epi32(x3[51], x1[36]);
3636         x2[42] = _mm256_sub_epi32(x3[51], x1[36]);
3637         x2[46] = _mm256_sub_epi32(x3[24], x1[40]);
3638         x2[53] = _mm256_add_epi32(x3[24], x1[40]);
3639         x2[47] = _mm256_sub_epi32(x3[25], x1[41]);
3640         x2[52] = _mm256_add_epi32(x3[25], x1[41]);
3641         x2[48] = _mm256_sub_epi32(x3[26], x1[42]);
3642         x2[51] = _mm256_add_epi32(x3[26], x1[42]);
3643         x2[49] = _mm256_sub_epi32(x3[27], x1[43]);
3644         x2[50] = _mm256_add_epi32(x3[27], x1[43]);
3645 
3646         // stage 6
3647         // stage 6: reuse x3[] as scratch (would otherwise need __m256i x6[40])
3648         btf_32_type0_avx2_new(cospi_p32,
3649                               cospi_p32,
3650                               x2[0],
3651                               x2[1],
3652                               out[0 * stride],
3653                               out[32 * stride],
3654                               __rounding,
3655                               cos_bit);
3656         btf_32_type1_avx2_new(cospi_p48,
3657                               cospi_p16,
3658                               x2[2],
3659                               x2[3],
3660                               out[16 * stride],
3661                               out[48 * stride],
3662                               __rounding,
3663                               cos_bit);
3664         x3[0] = _mm256_add_epi32(x1[4], x2[4]);
3665         x3[1] = _mm256_sub_epi32(x1[4], x2[4]);
3666         x3[2] = _mm256_sub_epi32(x1[7], x2[5]);
3667         x3[3] = _mm256_add_epi32(x1[7], x2[5]);
3668         btf_32_type0_avx2_new(
3669             cospi_m16, cospi_p48, x2[7], x2[12], x3[4], x3[7], __rounding, cos_bit);
3670         btf_32_type0_avx2_new(
3671             cospi_m48, cospi_m16, x2[8], x2[11], x3[5], x3[6], __rounding, cos_bit);
3672         x3[8]  = _mm256_add_epi32(x1[12], x2[15]);
3673         x3[11] = _mm256_sub_epi32(x1[12], x2[15]);
3674         x3[9]  = _mm256_add_epi32(x1[13], x2[14]);
3675         x3[10] = _mm256_sub_epi32(x1[13], x2[14]);
3676         x3[12] = _mm256_sub_epi32(x1[19], x2[16]);
3677         x3[15] = _mm256_add_epi32(x1[19], x2[16]);
3678         x3[13] = _mm256_sub_epi32(x1[18], x2[17]);
3679         x3[14] = _mm256_add_epi32(x1[18], x2[17]);
3680         x3[16] = _mm256_add_epi32(x1[20], x2[19]);
3681         x3[19] = _mm256_sub_epi32(x1[20], x2[19]);
3682         x3[17] = _mm256_add_epi32(x1[21], x2[18]);
3683         x3[18] = _mm256_sub_epi32(x1[21], x2[18]);
3684         x3[20] = _mm256_sub_epi32(x1[27], x2[20]);
3685         x3[23] = _mm256_add_epi32(x1[27], x2[20]);
3686         x3[21] = _mm256_sub_epi32(x1[26], x2[21]);
3687         x3[22] = _mm256_add_epi32(x1[26], x2[21]);
3688         btf_32_type0_avx2_new(
3689             cospi_m08, cospi_p56, x2[24], x2[51], x3[24], x3[39], __rounding, cos_bit);
3690         btf_32_type0_avx2_new(
3691             cospi_m08, cospi_p56, x2[25], x2[50], x3[25], x3[38], __rounding, cos_bit);
3692         btf_32_type0_avx2_new(
3693             cospi_m56, cospi_m08, x2[26], x2[49], x3[26], x3[37], __rounding, cos_bit);
3694         btf_32_type0_avx2_new(
3695             cospi_m56, cospi_m08, x2[27], x2[48], x3[27], x3[36], __rounding, cos_bit);
3696         btf_32_type0_avx2_new(
3697             cospi_m40, cospi_p24, x2[32], x2[43], x3[28], x3[35], __rounding, cos_bit);
3698         btf_32_type0_avx2_new(
3699             cospi_m40, cospi_p24, x2[33], x2[42], x3[29], x3[34], __rounding, cos_bit);
3700         btf_32_type0_avx2_new(
3701             cospi_m24, cospi_m40, x2[34], x2[41], x3[30], x3[33], __rounding, cos_bit);
3702         btf_32_type0_avx2_new(
3703             cospi_m24, cospi_m40, x2[35], x2[40], x3[31], x3[32], __rounding, cos_bit);
3704 
3705         // stage 7
3706         // stage 7: reuse x1[] as scratch (would otherwise need __m256i x7[48])
3707         btf_32_type1_avx2_new(cospi_p56,
3708                               cospi_p08,
3709                               x3[0],
3710                               x3[3],
3711                               out[8 * stride],
3712                               out[56 * stride],
3713                               __rounding,
3714                               cos_bit);
3715         btf_32_type1_avx2_new(cospi_p24,
3716                               cospi_p40,
3717                               x3[1],
3718                               x3[2],
3719                               out[40 * stride],
3720                               out[24 * stride],
3721                               __rounding,
3722                               cos_bit);
3723         x1[0] = _mm256_add_epi32(x2[6], x3[4]);
3724         x1[1] = _mm256_sub_epi32(x2[6], x3[4]);
3725         x1[2] = _mm256_sub_epi32(x2[9], x3[5]);
3726         x1[3] = _mm256_add_epi32(x2[9], x3[5]);
3727         x1[4] = _mm256_add_epi32(x2[10], x3[6]);
3728         x1[5] = _mm256_sub_epi32(x2[10], x3[6]);
3729         x1[6] = _mm256_sub_epi32(x2[13], x3[7]);
3730         x1[7] = _mm256_add_epi32(x2[13], x3[7]);
3731         btf_32_type0_avx2_new(
3732             cospi_m08, cospi_p56, x3[9], x3[22], x1[8], x1[15], __rounding, cos_bit);
3733         btf_32_type0_avx2_new(
3734             cospi_m56, cospi_m08, x3[10], x3[21], x1[9], x1[14], __rounding, cos_bit);
3735         btf_32_type0_avx2_new(
3736             cospi_m40, cospi_p24, x3[13], x3[18], x1[10], x1[13], __rounding, cos_bit);
3737         btf_32_type0_avx2_new(
3738             cospi_m24, cospi_m40, x3[14], x3[17], x1[11], x1[12], __rounding, cos_bit);
3739         x1[16] = _mm256_add_epi32(x2[22], x3[25]);
3740         x1[17] = _mm256_sub_epi32(x2[22], x3[25]);
3741         x1[19] = _mm256_add_epi32(x2[23], x3[24]);
3742         x1[20] = _mm256_sub_epi32(x2[23], x3[24]);
3743         x1[18] = _mm256_sub_epi32(x2[29], x3[26]);
3744         x1[21] = _mm256_add_epi32(x2[29], x3[26]);
3745         x1[22] = _mm256_sub_epi32(x2[28], x3[27]);
3746         x1[23] = _mm256_add_epi32(x2[28], x3[27]);
3747         x1[24] = _mm256_add_epi32(x2[30], x3[29]);
3748         x1[25] = _mm256_sub_epi32(x2[30], x3[29]);
3749         x1[26] = _mm256_add_epi32(x2[31], x3[28]);
3750         x1[27] = _mm256_sub_epi32(x2[31], x3[28]);
3751         x1[28] = _mm256_sub_epi32(x2[37], x3[30]);
3752         x1[29] = _mm256_add_epi32(x2[37], x3[30]);
3753         x1[30] = _mm256_sub_epi32(x2[36], x3[31]);
3754         x1[31] = _mm256_add_epi32(x2[36], x3[31]);
3755         x1[32] = _mm256_add_epi32(x2[38], x3[33]);
3756         x1[33] = _mm256_sub_epi32(x2[38], x3[33]);
3757         x1[34] = _mm256_add_epi32(x2[39], x3[32]);
3758         x1[35] = _mm256_sub_epi32(x2[39], x3[32]);
3759         x1[36] = _mm256_sub_epi32(x2[45], x3[34]);
3760         x1[37] = _mm256_add_epi32(x2[45], x3[34]);
3761         x1[38] = _mm256_sub_epi32(x2[44], x3[35]);
3762         x1[39] = _mm256_add_epi32(x2[44], x3[35]);
3763         x1[40] = _mm256_add_epi32(x2[46], x3[37]);
3764         x1[41] = _mm256_sub_epi32(x2[46], x3[37]);
3765         x1[42] = _mm256_add_epi32(x2[47], x3[36]);
3766         x1[43] = _mm256_sub_epi32(x2[47], x3[36]);
3767         x1[44] = _mm256_sub_epi32(x2[53], x3[38]);
3768         x1[45] = _mm256_add_epi32(x2[53], x3[38]);
3769         x1[46] = _mm256_sub_epi32(x2[52], x3[39]);
3770         x1[47] = _mm256_add_epi32(x2[52], x3[39]);
3771 
3772         // stage 8
3773         // stage 8: reuse x2[] as scratch (would otherwise need __m256i x8[32])
3774         btf_32_type1_avx2_new(cospi_p60,
3775                               cospi_p04,
3776                               x1[0],
3777                               x1[7],
3778                               out[4 * stride],
3779                               out[60 * stride],
3780                               __rounding,
3781                               cos_bit);
3782         btf_32_type1_avx2_new(cospi_p28,
3783                               cospi_p36,
3784                               x1[1],
3785                               x1[6],
3786                               out[36 * stride],
3787                               out[28 * stride],
3788                               __rounding,
3789                               cos_bit);
3790         btf_32_type1_avx2_new(cospi_p44,
3791                               cospi_p20,
3792                               x1[2],
3793                               x1[5],
3794                               out[20 * stride],
3795                               out[44 * stride],
3796                               __rounding,
3797                               cos_bit);
3798         btf_32_type1_avx2_new(cospi_p12,
3799                               cospi_p52,
3800                               x1[3],
3801                               x1[4],
3802                               out[52 * stride],
3803                               out[12 * stride],
3804                               __rounding,
3805                               cos_bit);
3806         x2[0]  = _mm256_add_epi32(x3[8], x1[8]);
3807         x2[1]  = _mm256_sub_epi32(x3[8], x1[8]);
3808         x2[2]  = _mm256_sub_epi32(x3[11], x1[9]);
3809         x2[3]  = _mm256_add_epi32(x3[11], x1[9]);
3810         x2[4]  = _mm256_add_epi32(x3[12], x1[10]);
3811         x2[5]  = _mm256_sub_epi32(x3[12], x1[10]);
3812         x2[6]  = _mm256_sub_epi32(x3[15], x1[11]);
3813         x2[7]  = _mm256_add_epi32(x3[15], x1[11]);
3814         x2[8]  = _mm256_add_epi32(x3[16], x1[12]);
3815         x2[9]  = _mm256_sub_epi32(x3[16], x1[12]);
3816         x2[10] = _mm256_sub_epi32(x3[19], x1[13]);
3817         x2[11] = _mm256_add_epi32(x3[19], x1[13]);
3818         x2[12] = _mm256_add_epi32(x3[20], x1[14]);
3819         x2[13] = _mm256_sub_epi32(x3[20], x1[14]);
3820         x2[14] = _mm256_sub_epi32(x3[23], x1[15]);
3821         x2[15] = _mm256_add_epi32(x3[23], x1[15]);
3822         btf_32_type0_avx2_new(
3823             cospi_m04, cospi_p60, x1[19], x1[47], x2[16], x2[31], __rounding, cos_bit);
3824         btf_32_type0_avx2_new(
3825             cospi_m60, cospi_m04, x1[20], x1[46], x2[17], x2[30], __rounding, cos_bit);
3826         btf_32_type0_avx2_new(
3827             cospi_m36, cospi_p28, x1[22], x1[43], x2[18], x2[29], __rounding, cos_bit);
3828         btf_32_type0_avx2_new(
3829             cospi_m28, cospi_m36, x1[23], x1[42], x2[19], x2[28], __rounding, cos_bit);
3830         btf_32_type0_avx2_new(
3831             cospi_m20, cospi_p44, x1[26], x1[39], x2[20], x2[27], __rounding, cos_bit);
3832         btf_32_type0_avx2_new(
3833             cospi_m44, cospi_m20, x1[27], x1[38], x2[21], x2[26], __rounding, cos_bit);
3834         btf_32_type0_avx2_new(
3835             cospi_m52, cospi_p12, x1[30], x1[35], x2[22], x2[25], __rounding, cos_bit);
3836         btf_32_type0_avx2_new(
3837             cospi_m12, cospi_m52, x1[31], x1[34], x2[23], x2[24], __rounding, cos_bit);
3838 
3839         // stage 9
3840         // stage 9: reuse x3[] as scratch (would otherwise need __m256i x9[32])
3841         btf_32_type1_avx2_new(cospi_p62,
3842                               cospi_p02,
3843                               x2[0],
3844                               x2[15],
3845                               out[2 * stride],
3846                               out[62 * stride],
3847                               __rounding,
3848                               cos_bit);
3849         btf_32_type1_avx2_new(cospi_p30,
3850                               cospi_p34,
3851                               x2[1],
3852                               x2[14],
3853                               out[34 * stride],
3854                               out[30 * stride],
3855                               __rounding,
3856                               cos_bit);
3857         btf_32_type1_avx2_new(cospi_p46,
3858                               cospi_p18,
3859                               x2[2],
3860                               x2[13],
3861                               out[18 * stride],
3862                               out[46 * stride],
3863                               __rounding,
3864                               cos_bit);
3865         btf_32_type1_avx2_new(cospi_p14,
3866                               cospi_p50,
3867                               x2[3],
3868                               x2[12],
3869                               out[50 * stride],
3870                               out[14 * stride],
3871                               __rounding,
3872                               cos_bit);
3873         btf_32_type1_avx2_new(cospi_p54,
3874                               cospi_p10,
3875                               x2[4],
3876                               x2[11],
3877                               out[10 * stride],
3878                               out[54 * stride],
3879                               __rounding,
3880                               cos_bit);
3881         btf_32_type1_avx2_new(cospi_p22,
3882                               cospi_p42,
3883                               x2[5],
3884                               x2[10],
3885                               out[42 * stride],
3886                               out[22 * stride],
3887                               __rounding,
3888                               cos_bit);
3889         btf_32_type1_avx2_new(cospi_p38,
3890                               cospi_p26,
3891                               x2[6],
3892                               x2[9],
3893                               out[26 * stride],
3894                               out[38 * stride],
3895                               __rounding,
3896                               cos_bit);
3897         btf_32_type1_avx2_new(cospi_p06,
3898                               cospi_p58,
3899                               x2[7],
3900                               x2[8],
3901                               out[58 * stride],
3902                               out[6 * stride],
3903                               __rounding,
3904                               cos_bit);
3905         x3[0]  = _mm256_add_epi32(x1[16], x2[16]);
3906         x3[1]  = _mm256_sub_epi32(x1[16], x2[16]);
3907         x3[2]  = _mm256_sub_epi32(x1[17], x2[17]);
3908         x3[3]  = _mm256_add_epi32(x1[17], x2[17]);
3909         x3[4]  = _mm256_add_epi32(x1[18], x2[18]);
3910         x3[5]  = _mm256_sub_epi32(x1[18], x2[18]);
3911         x3[6]  = _mm256_sub_epi32(x1[21], x2[19]);
3912         x3[7]  = _mm256_add_epi32(x1[21], x2[19]);
3913         x3[8]  = _mm256_add_epi32(x1[24], x2[20]);
3914         x3[9]  = _mm256_sub_epi32(x1[24], x2[20]);
3915         x3[10] = _mm256_sub_epi32(x1[25], x2[21]);
3916         x3[11] = _mm256_add_epi32(x1[25], x2[21]);
3917         x3[12] = _mm256_add_epi32(x1[28], x2[22]);
3918         x3[13] = _mm256_sub_epi32(x1[28], x2[22]);
3919         x3[14] = _mm256_sub_epi32(x1[29], x2[23]);
3920         x3[15] = _mm256_add_epi32(x1[29], x2[23]);
3921         x3[16] = _mm256_add_epi32(x1[32], x2[24]);
3922         x3[17] = _mm256_sub_epi32(x1[32], x2[24]);
3923         x3[18] = _mm256_sub_epi32(x1[33], x2[25]);
3924         x3[19] = _mm256_add_epi32(x1[33], x2[25]);
3925         x3[20] = _mm256_add_epi32(x1[36], x2[26]);
3926         x3[21] = _mm256_sub_epi32(x1[36], x2[26]);
3927         x3[22] = _mm256_sub_epi32(x1[37], x2[27]);
3928         x3[23] = _mm256_add_epi32(x1[37], x2[27]);
3929         x3[24] = _mm256_add_epi32(x1[40], x2[28]);
3930         x3[25] = _mm256_sub_epi32(x1[40], x2[28]);
3931         x3[26] = _mm256_sub_epi32(x1[41], x2[29]);
3932         x3[27] = _mm256_add_epi32(x1[41], x2[29]);
3933         x3[28] = _mm256_add_epi32(x1[44], x2[30]);
3934         x3[29] = _mm256_sub_epi32(x1[44], x2[30]);
3935         x3[30] = _mm256_sub_epi32(x1[45], x2[31]);
3936         x3[31] = _mm256_add_epi32(x1[45], x2[31]);
3937 
3938         // stage 10
3939         btf_32_type1_avx2_new(cospi_p63,
3940                               cospi_p01,
3941                               x3[0],
3942                               x3[31],
3943                               out[1 * stride],
3944                               out[63 * stride],
3945                               __rounding,
3946                               cos_bit);
3947         btf_32_type1_avx2_new(cospi_p31,
3948                               cospi_p33,
3949                               x3[1],
3950                               x3[30],
3951                               out[33 * stride],
3952                               out[31 * stride],
3953                               __rounding,
3954                               cos_bit);
3955         btf_32_type1_avx2_new(cospi_p47,
3956                               cospi_p17,
3957                               x3[2],
3958                               x3[29],
3959                               out[17 * stride],
3960                               out[47 * stride],
3961                               __rounding,
3962                               cos_bit);
3963         btf_32_type1_avx2_new(cospi_p15,
3964                               cospi_p49,
3965                               x3[3],
3966                               x3[28],
3967                               out[49 * stride],
3968                               out[15 * stride],
3969                               __rounding,
3970                               cos_bit);
3971         btf_32_type1_avx2_new(cospi_p55,
3972                               cospi_p09,
3973                               x3[4],
3974                               x3[27],
3975                               out[9 * stride],
3976                               out[55 * stride],
3977                               __rounding,
3978                               cos_bit);
3979         btf_32_type1_avx2_new(cospi_p23,
3980                               cospi_p41,
3981                               x3[5],
3982                               x3[26],
3983                               out[41 * stride],
3984                               out[23 * stride],
3985                               __rounding,
3986                               cos_bit);
3987         btf_32_type1_avx2_new(cospi_p39,
3988                               cospi_p25,
3989                               x3[6],
3990                               x3[25],
3991                               out[25 * stride],
3992                               out[39 * stride],
3993                               __rounding,
3994                               cos_bit);
3995         btf_32_type1_avx2_new(cospi_p07,
3996                               cospi_p57,
3997                               x3[7],
3998                               x3[24],
3999                               out[57 * stride],
4000                               out[7 * stride],
4001                               __rounding,
4002                               cos_bit);
4003         btf_32_type1_avx2_new(cospi_p59,
4004                               cospi_p05,
4005                               x3[8],
4006                               x3[23],
4007                               out[5 * stride],
4008                               out[59 * stride],
4009                               __rounding,
4010                               cos_bit);
4011         btf_32_type1_avx2_new(cospi_p27,
4012                               cospi_p37,
4013                               x3[9],
4014                               x3[22],
4015                               out[37 * stride],
4016                               out[27 * stride],
4017                               __rounding,
4018                               cos_bit);
4019         btf_32_type1_avx2_new(cospi_p43,
4020                               cospi_p21,
4021                               x3[10],
4022                               x3[21],
4023                               out[21 * stride],
4024                               out[43 * stride],
4025                               __rounding,
4026                               cos_bit);
4027         btf_32_type1_avx2_new(cospi_p11,
4028                               cospi_p53,
4029                               x3[11],
4030                               x3[20],
4031                               out[53 * stride],
4032                               out[11 * stride],
4033                               __rounding,
4034                               cos_bit);
4035         btf_32_type1_avx2_new(cospi_p51,
4036                               cospi_p13,
4037                               x3[12],
4038                               x3[19],
4039                               out[13 * stride],
4040                               out[51 * stride],
4041                               __rounding,
4042                               cos_bit);
4043         btf_32_type1_avx2_new(cospi_p19,
4044                               cospi_p45,
4045                               x3[13],
4046                               x3[18],
4047                               out[45 * stride],
4048                               out[19 * stride],
4049                               __rounding,
4050                               cos_bit);
4051         btf_32_type1_avx2_new(cospi_p35,
4052                               cospi_p29,
4053                               x3[14],
4054                               x3[17],
4055                               out[29 * stride],
4056                               out[35 * stride],
4057                               __rounding,
4058                               cos_bit);
4059         btf_32_type1_avx2_new(cospi_p03,
4060                               cospi_p61,
4061                               x3[15],
4062                               x3[16],
4063                               out[61 * stride],
4064                               out[3 * stride],
4065                               __rounding,
4066                               cos_bit);
4067     }
4068 }
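
/*
 * Note: av1_fdct64_new_avx2 above is the 10-stage butterfly network of the
 * 64-point forward DCT. Each __m256i holds 8 packed int32 lanes (8
 * independent columns), and `stride` selects how the 64 rows of one column
 * group are interleaved in `in`/`out`. The x1/x2/x3 arrays are rotated
 * between stages as scratch buffers so that only three 64-entry register
 * arrays are live at a time. Each btf_32_type0/type1_avx2_new call applies a
 * 2x2 rotation by a pair of cospi constants followed by rounding and a
 * cos_bit down-shift; the exact sign conventions are defined by those helper
 * macros.
 */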
4069 
4070 typedef void (*TxfmFuncAVX2)(const __m256i *input, __m256i *output, const int8_t cos_bit,
4071                              const int8_t *stage_range);
4072 
4073 static INLINE void fdct32x32_avx2(const __m256i *input, __m256i *output, const int8_t cos_bit,
4074                                   const int8_t *stage_range) {
4075     const int32_t txfm_size   = 32;
4076     const int32_t num_per_256 = 8;
4077     int32_t       col_num     = txfm_size / num_per_256;
4078     (void)stage_range;
4079     av1_fdct32_new_avx2(input, output, cos_bit, txfm_size, col_num);
4080 }
4081 
4082 static INLINE void fdct64x64_avx2(const __m256i *input, __m256i *output, const int8_t cos_bit) {
4083     const int32_t txfm_size   = 64;
4084     const int32_t num_per_256 = 8;
4085     int32_t       col_num     = txfm_size / num_per_256;
4086     av1_fdct64_new_avx2(input, output, cos_bit, txfm_size, col_num);
4087 }
4088 
4089 static INLINE void fidtx4x8_row_avx2(__m256i *input, __m256i *output, int32_t bit,
4090                                      int32_t col_num) {
4091     (void)bit;
4092     __m256i in[4];
4093     __m256i out[4];
4094     __m256i fact   = _mm256_set1_epi32(new_sqrt2);
4095     __m256i offset = _mm256_set1_epi32(1 << (new_sqrt2_bits - 1));
4096     __m256i a_low;
4097     __m256i v[4];
4098 
4099     in[0] = _mm256_permute2x128_si256(input[0], input[2], 0x20);
4100     in[1] = _mm256_permute2x128_si256(input[0], input[2], 0x31);
4101     in[2] = _mm256_permute2x128_si256(input[1], input[3], 0x20);
4102     in[3] = _mm256_permute2x128_si256(input[1], input[3], 0x31);
4103 
4104     for (int32_t i = 0; i < 4; i++) {
4105         a_low  = _mm256_mullo_epi32(in[i * col_num], fact);
4106         a_low  = _mm256_add_epi32(a_low, offset);
4107         out[i] = _mm256_srai_epi32(a_low, new_sqrt2_bits);
4108     }
4109 
4110     // Transpose for 4x4
4111     v[0] = _mm256_unpacklo_epi32(out[0], out[1]);
4112     v[1] = _mm256_unpackhi_epi32(out[0], out[1]);
4113     v[2] = _mm256_unpacklo_epi32(out[2], out[3]);
4114     v[3] = _mm256_unpackhi_epi32(out[2], out[3]);
4115 
4116     out[0] = _mm256_unpacklo_epi64(v[0], v[2]);
4117     out[1] = _mm256_unpackhi_epi64(v[0], v[2]);
4118     out[2] = _mm256_unpacklo_epi64(v[1], v[3]);
4119     out[3] = _mm256_unpackhi_epi64(v[1], v[3]);
4120 
4121     output[0] = _mm256_permute2x128_si256(out[0], out[1], 0x20);
4122     output[1] = _mm256_permute2x128_si256(out[2], out[3], 0x20);
4123     output[2] = _mm256_permute2x128_si256(out[0], out[1], 0x31);
4124     output[3] = _mm256_permute2x128_si256(out[2], out[3], 0x31);
4125 }
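
/*
 * Note: this identity kernel scales each sample by sqrt(2) in Q12 fixed
 * point (new_sqrt2 = 5793 ~= sqrt(2) * 4096). Scalar sketch of the multiply
 * loop above:
 *
 *   out = (in * 5793 + (1 << 11)) >> 12;  // ~= in * sqrt(2), rounded
 */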
4126 
4127 static INLINE void fidtx4x8_col_avx2(__m256i *in, __m256i *output, int32_t bit, int32_t col_num) {
4128     (void)bit;
4129     __m256i out[4];
4130     __m256i fact   = _mm256_set1_epi32(new_sqrt2);
4131     __m256i offset = _mm256_set1_epi32(1 << (new_sqrt2_bits - 1));
4132     __m256i a_low;
4133     __m256i v[4];
4134 
4135     for (int32_t i = 0; i < 4; i++) {
4136         a_low  = _mm256_mullo_epi32(in[i * col_num], fact);
4137         a_low  = _mm256_add_epi32(a_low, offset);
4138         out[i] = _mm256_srai_epi32(a_low, new_sqrt2_bits);
4139     }
4140 
4141     // Transpose for 4x4
4142     v[0] = _mm256_unpacklo_epi32(out[0], out[1]);
4143     v[1] = _mm256_unpackhi_epi32(out[0], out[1]);
4144     v[2] = _mm256_unpacklo_epi32(out[2], out[3]);
4145     v[3] = _mm256_unpackhi_epi32(out[2], out[3]);
4146 
4147     out[0] = _mm256_unpacklo_epi64(v[0], v[2]);
4148     out[1] = _mm256_unpackhi_epi64(v[0], v[2]);
4149     out[2] = _mm256_unpacklo_epi64(v[1], v[3]);
4150     out[3] = _mm256_unpackhi_epi64(v[1], v[3]);
4151 
4152     output[0] = _mm256_permute2x128_si256(out[0], out[1], 0x20);
4153     output[1] = _mm256_permute2x128_si256(out[2], out[3], 0x20);
4154     output[2] = _mm256_permute2x128_si256(out[0], out[1], 0x31);
4155     output[3] = _mm256_permute2x128_si256(out[2], out[3], 0x31);
4156 }
4157 
4158 static INLINE void fidtx8x4_avx2(__m256i *in, __m256i *out, int32_t bit) {
4159     (void)bit;
4160 
4161     out[0] = _mm256_add_epi32(in[0], in[0]);
4162     out[1] = _mm256_add_epi32(in[1], in[1]);
4163     out[2] = _mm256_add_epi32(in[2], in[2]);
4164     out[3] = _mm256_add_epi32(in[3], in[3]);
4165 }
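
/* Note: this identity variant scales by exactly 2 (x + x), so no fixed-point
 * multiply or rounding step is required. */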
4166 
4167 void av1_idtx32_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit,
4168                          const int32_t col_num) {
4169     (void)cos_bit;
4170     for (int32_t i = 0; i < 32; i++) output[i * col_num] = _mm256_slli_epi32(input[i * col_num], 2);
4171 }
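
/* Note: the 32-point identity scales each coefficient by 4, implemented as a
 * left shift by 2; one call processes 32 vectors spaced col_num apart, and
 * callers loop over the remaining column strips. */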
4172 
4173 static void fidtx32x32_avx2(const __m256i *input, __m256i *output, const int8_t cos_bit,
4174                             const int8_t *stage_range) {
4175     (void)stage_range;
4176 
4177     for (int32_t i = 0; i < 4; i++)
4178         av1_idtx32_new_avx2(&input[i * 32], &output[i * 32], cos_bit, 1);
4179 }
4180 
4181 static void fidtx32x8_avx2(const __m256i *in, __m256i *out, int8_t bit, int32_t col_num) {
4182     (void)bit;
4183     (void)col_num;
4184     out[4 * 0] = _mm256_slli_epi32(in[4 * 0], 1);
4185     out[4 * 1] = _mm256_slli_epi32(in[4 * 1], 1);
4186     out[4 * 2] = _mm256_slli_epi32(in[4 * 2], 1);
4187     out[4 * 3] = _mm256_slli_epi32(in[4 * 3], 1);
4188     out[4 * 4] = _mm256_slli_epi32(in[4 * 4], 1);
4189     out[4 * 5] = _mm256_slli_epi32(in[4 * 5], 1);
4190     out[4 * 6] = _mm256_slli_epi32(in[4 * 6], 1);
4191     out[4 * 7] = _mm256_slli_epi32(in[4 * 7], 1);
4192 }
4193 
4194 static void fidtx64x64_avx2(const __m256i *input, __m256i *output) {
4195     const int32_t bits     = 12; // new_sqrt2_bits = 12
4196     const int32_t sqrt     = 4 * 5793; // 4 * new_sqrt2
4197     const int32_t col_num  = 8;
4198     const __m256i newsqrt  = _mm256_set1_epi32(sqrt);
4199     const __m256i rounding = _mm256_set1_epi32(1 << (bits - 1));
4200 
4201     __m256i temp;
4202     int32_t num_iters = 64 * col_num;
4203     for (int32_t i = 0; i < num_iters; i++) {
4204         temp      = _mm256_mullo_epi32(input[i], newsqrt);
4205         temp      = _mm256_add_epi32(temp, rounding);
4206         output[i] = _mm256_srai_epi32(temp, bits);
4207     }
4208 }
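
/*
 * Note: the 64-point identity scale factor is 4*sqrt(2), encoded above as
 * sqrt = 4 * 5793 = 23172 in Q12. Scalar sketch of the loop:
 *
 *   out = (in * 23172 + (1 << 11)) >> 12;  // ~= in * 4 * sqrt(2), rounded
 */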
4209 
4210 static INLINE TxfmFuncAVX2 fwd_txfm_type_to_func(TxfmType txfmtype) {
4211     switch (txfmtype) {
4212     case TXFM_TYPE_DCT32: return fdct32x32_avx2;
4213     case TXFM_TYPE_IDENTITY32: return fidtx32x32_avx2;
4214     default: assert(0);
4215     }
4216     return NULL;
4217 }
4218 
4219 static INLINE void load_buffer_32x32_avx2(const int16_t *input, __m256i *output, int32_t stride) {
4220     __m128i temp[4];
4221     int32_t i;
4222 
4223     for (i = 0; i < 32; ++i) {
4224         temp[0] = _mm_loadu_si128((const __m128i *)(input + 0 * 8));
4225         temp[1] = _mm_loadu_si128((const __m128i *)(input + 1 * 8));
4226         temp[2] = _mm_loadu_si128((const __m128i *)(input + 2 * 8));
4227         temp[3] = _mm_loadu_si128((const __m128i *)(input + 3 * 8));
4228 
4229         output[0] = _mm256_cvtepi16_epi32(temp[0]);
4230         output[1] = _mm256_cvtepi16_epi32(temp[1]);
4231         output[2] = _mm256_cvtepi16_epi32(temp[2]);
4232         output[3] = _mm256_cvtepi16_epi32(temp[3]);
4233         input += stride;
4234         output += 4;
4235     }
4236 }
4237 
4238 static INLINE void load_buffer_32x16_avx2(const int16_t *input, __m256i *output, int32_t stride) {
4239     __m128i temp[4];
4240     int32_t i;
4241 
4242     for (i = 0; i < 16; ++i) {
4243         temp[0] = _mm_loadu_si128((const __m128i *)(input + 0 * 8));
4244         temp[1] = _mm_loadu_si128((const __m128i *)(input + 1 * 8));
4245         temp[2] = _mm_loadu_si128((const __m128i *)(input + 2 * 8));
4246         temp[3] = _mm_loadu_si128((const __m128i *)(input + 3 * 8));
4247 
4248         output[0] = _mm256_cvtepi16_epi32(temp[0]);
4249         output[1] = _mm256_cvtepi16_epi32(temp[1]);
4250         output[2] = _mm256_cvtepi16_epi32(temp[2]);
4251         output[3] = _mm256_cvtepi16_epi32(temp[3]);
4252         input += stride;
4253         output += 4;
4254     }
4255 }
4256 
4257 static INLINE void load_buffer_32x16_N2_avx2(const int16_t *input, __m256i *output,
4258                                              int32_t stride) {
4259     __m128i temp[4];
4260     int32_t i;
4261 
4262     for (i = 0; i < 16; ++i) {
4263         temp[0] = _mm_loadu_si128((const __m128i *)(input + 0 * 8));
4264         temp[1] = _mm_loadu_si128((const __m128i *)(input + 1 * 8));
4265 
4266         output[0] = _mm256_cvtepi16_epi32(temp[0]);
4267         output[1] = _mm256_cvtepi16_epi32(temp[1]);
4268         input += stride;
4269         output += 4;
4270     }
4271 }
4272 static INLINE void fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output,
4273                                          const int32_t stride, const Txfm2dFlipCfg *cfg,
4274                                          int32_t *txfm_buf) {
4275     assert(cfg->tx_size < TX_SIZES);
4276     const int32_t      txfm_size       = tx_size_wide[cfg->tx_size];
4277     const int8_t *     shift           = cfg->shift;
4278     const int8_t *     stage_range_col = cfg->stage_range_col;
4279     const int8_t *     stage_range_row = cfg->stage_range_row;
4280     const int8_t       cos_bit_col     = cfg->cos_bit_col;
4281     const int8_t       cos_bit_row     = cfg->cos_bit_row;
4282     const TxfmFuncAVX2 txfm_func_col   = fwd_txfm_type_to_func(cfg->txfm_type_col);
4283     const TxfmFuncAVX2 txfm_func_row   = fwd_txfm_type_to_func(cfg->txfm_type_row);
4284     ASSERT(txfm_func_col);
4285     ASSERT(txfm_func_row);
4286     __m256i *buf_256         = (__m256i *)txfm_buf;
4287     __m256i *out_256         = (__m256i *)output;
4288     int32_t  num_per_256     = 8;
4289     int32_t  txfm2d_size_256 = txfm_size * txfm_size / num_per_256;
4290 
4291     load_buffer_32x32_avx2(input, buf_256, stride);
4292     av1_round_shift_array_32_avx2(buf_256, out_256, txfm2d_size_256, -shift[0]);
4293     txfm_func_col(out_256, buf_256, cos_bit_col, stage_range_col);
4294     av1_round_shift_array_32_avx2(buf_256, out_256, txfm2d_size_256, -shift[1]);
4295     transpose_32_avx2(txfm_size, out_256, buf_256);
4296     txfm_func_row(buf_256, out_256, cos_bit_row, stage_range_row);
4297     av1_round_shift_array_32_avx2(out_256, buf_256, txfm2d_size_256, -shift[2]);
4298     transpose_32_avx2(txfm_size, buf_256, out_256);
4299 }
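
/*
 * Note: fwd_txfm2d_32x32_avx2 above follows the generic AV1 2-D forward
 * transform flow: load + sign-extend, pre-shift by shift[0], column
 * transform, shift by shift[1], transpose, row transform, shift by shift[2],
 * and a final transpose back to raster order. The shift[] values and cos_bit
 * parameters come from the Txfm2dFlipCfg filled in by av1_transform_config,
 * so the rounding behaviour of av1_round_shift_array_32_avx2 is driven
 * entirely by that table.
 */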
4300 
4301 void svt_av1_fwd_txfm2d_32x32_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type,
4302                                    uint8_t bd) {
4303     DECLARE_ALIGNED(32, int32_t, txfm_buf[1024]);
4304     Txfm2dFlipCfg cfg;
4305     av1_transform_config(tx_type, TX_32X32, &cfg);
4306     (void)bd;
4307     fwd_txfm2d_32x32_avx2(input, output, stride, &cfg, txfm_buf);
4308 }
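
/*
 * Illustrative call (the buffer names are placeholders, not part of the
 * library):
 *
 *   DECLARE_ALIGNED(32, int16_t, residual[32 * 32]);
 *   DECLARE_ALIGNED(32, int32_t, coeff[32 * 32]);
 *   // ... fill residual with prediction error samples ...
 *   svt_av1_fwd_txfm2d_32x32_avx2(residual, coeff, 32, DCT_DCT, 8);
 */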
4309 
4310 static INLINE void load_buffer_64x64_avx2(const int16_t *input, int32_t stride, __m256i *output) {
4311     __m128i x0, x1, x2, x3, x4, x5, x6, x7;
4312     __m256i v0, v1, v2, v3, v4, v5, v6, v7;
4313     int32_t i;
4314 
4315     for (i = 0; i < 64; ++i) {
4316         x0 = _mm_loadu_si128((const __m128i *)(input + 0 * 8));
4317         x1 = _mm_loadu_si128((const __m128i *)(input + 1 * 8));
4318         x2 = _mm_loadu_si128((const __m128i *)(input + 2 * 8));
4319         x3 = _mm_loadu_si128((const __m128i *)(input + 3 * 8));
4320         x4 = _mm_loadu_si128((const __m128i *)(input + 4 * 8));
4321         x5 = _mm_loadu_si128((const __m128i *)(input + 5 * 8));
4322         x6 = _mm_loadu_si128((const __m128i *)(input + 6 * 8));
4323         x7 = _mm_loadu_si128((const __m128i *)(input + 7 * 8));
4324 
4325         v0 = _mm256_cvtepi16_epi32(x0);
4326         v1 = _mm256_cvtepi16_epi32(x1);
4327         v2 = _mm256_cvtepi16_epi32(x2);
4328         v3 = _mm256_cvtepi16_epi32(x3);
4329         v4 = _mm256_cvtepi16_epi32(x4);
4330         v5 = _mm256_cvtepi16_epi32(x5);
4331         v6 = _mm256_cvtepi16_epi32(x6);
4332         v7 = _mm256_cvtepi16_epi32(x7);
4333 
4334         _mm256_storeu_si256(output + 0, v0);
4335         _mm256_storeu_si256(output + 1, v1);
4336         _mm256_storeu_si256(output + 2, v2);
4337         _mm256_storeu_si256(output + 3, v3);
4338         _mm256_storeu_si256(output + 4, v4);
4339         _mm256_storeu_si256(output + 5, v5);
4340         _mm256_storeu_si256(output + 6, v6);
4341         _mm256_storeu_si256(output + 7, v7);
4342 
4343         input += stride;
4344         output += 8;
4345     }
4346 }
4347 
4348 void svt_av1_fwd_txfm2d_64x64_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type,
4349                                    uint8_t bd) {
4350     (void)bd;
4351     __m256i       in[512];
4352     __m256i *     out     = (__m256i *)output;
4353     const int32_t txw_idx = tx_size_wide_log2[TX_64X64] - tx_size_wide_log2[0];
4354     const int32_t txh_idx = tx_size_high_log2[TX_64X64] - tx_size_high_log2[0];
4355     const int8_t *shift   = fwd_txfm_shift_ls[TX_64X64];
4356 
4357     switch (tx_type) {
4358     case IDTX:
4359         load_buffer_64x64_avx2(input, stride, out);
4360         fidtx64x64_avx2(out, in);
4361         av1_round_shift_array_32_avx2(in, out, 512, -shift[1]);
4362         transpose_8nx8n(out, in, 64, 64);
4363 
4364         /*row wise transform*/
4365         fidtx64x64_avx2(in, out);
4366         av1_round_shift_array_32_avx2(out, in, 512, -shift[2]);
4367         transpose_8nx8n(in, out, 64, 64);
4368         break;
4369     case DCT_DCT:
4370         load_buffer_64x64_avx2(input, stride, out);
4371         fdct64x64_avx2(out, in, fwd_cos_bit_col[txw_idx][txh_idx]);
4372         av1_round_shift_array_32_avx2(in, out, 512, -shift[1]);
4373         transpose_8nx8n(out, in, 64, 64);
4374 
4375         /*row wise transform*/
4376         fdct64x64_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx]);
4377         av1_round_shift_array_32_avx2(out, in, 512, -shift[2]);
4378         transpose_8nx8n(in, out, 64, 64);
4379         break;
4380     default: assert(0);
4381     }
4382 }
4383 
4384 static INLINE void load_buffer_32_avx2(const int16_t *input, __m256i *in, int32_t stride,
4385                                        int32_t flipud, int32_t fliplr, int32_t shift) {
4386     __m128i temp[4];
4387     if (!flipud) {
4388         temp[0] = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
4389         temp[1] = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
4390         temp[2] = _mm_loadu_si128((const __m128i *)(input + 2 * stride));
4391         temp[3] = _mm_loadu_si128((const __m128i *)(input + 3 * stride));
4392     } else {
4393         temp[0] = _mm_loadu_si128((const __m128i *)(input + 3 * stride));
4394         temp[1] = _mm_loadu_si128((const __m128i *)(input + 2 * stride));
4395         temp[2] = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
4396         temp[3] = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
4397     }
4398 
4399     if (fliplr) {
4400         temp[0] = mm_reverse_epi16(temp[0]);
4401         temp[1] = mm_reverse_epi16(temp[1]);
4402         temp[2] = mm_reverse_epi16(temp[2]);
4403         temp[3] = mm_reverse_epi16(temp[3]);
4404     }
4405 
4406     in[0] = _mm256_cvtepi16_epi32(temp[0]);
4407     in[1] = _mm256_cvtepi16_epi32(temp[1]);
4408     in[2] = _mm256_cvtepi16_epi32(temp[2]);
4409     in[3] = _mm256_cvtepi16_epi32(temp[3]);
4410 
4411     in[0] = _mm256_slli_epi32(in[0], shift);
4412     in[1] = _mm256_slli_epi32(in[1], shift);
4413     in[2] = _mm256_slli_epi32(in[2], shift);
4414     in[3] = _mm256_slli_epi32(in[3], shift);
4415 }
4416 
4417 static INLINE void load_buffer_16_avx2(const int16_t *input, __m256i *in, int32_t stride,
4418                                        int32_t flipud, int32_t fliplr, int32_t shift) {
4419     __m128i temp[2];
4420     if (!flipud) {
4421         temp[0] = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
4422         temp[1] = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
4423     } else {
4424         temp[0] = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
4425         temp[1] = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
4426     }
4427 
4428     if (fliplr) {
4429         temp[0] = mm_reverse_epi16(temp[0]);
4430         temp[1] = mm_reverse_epi16(temp[1]);
4431     }
4432 
4433     in[0] = _mm256_cvtepi16_epi32(temp[0]);
4434     in[1] = _mm256_cvtepi16_epi32(temp[1]);
4435 
4436     in[0] = _mm256_slli_epi32(in[0], shift);
4437     in[1] = _mm256_slli_epi32(in[1], shift);
4438 }
4439 
4440 static INLINE void load_buffer_32x8n(const int16_t *input, __m256i *out, int32_t stride,
4441                                      int32_t flipud, int32_t fliplr, int32_t shift,
4442                                      const int32_t height) {
4443     for (int32_t col = 0; col < height; col++) {
4444         const int16_t *in     = input + col * stride;
4445         __m256i *      output = out + col * 4;
4446         load_buffer_32_avx2(in, output, 8, flipud, fliplr, shift);
4447     }
4448 }
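
/* Note: the inner load_buffer_32_avx2 call uses a stride of 8 so that its
 * four 128-bit loads read one contiguous row of 32 int16 samples; the real
 * picture stride is applied here when advancing `in` from row to row. */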
4449 
4450 static AOM_FORCE_INLINE void load_buffer_16x8n(const int16_t *input, __m256i *out, int32_t stride,
4451                                                int32_t flipud, int32_t fliplr, int32_t shift,
4452                                                const int32_t height) {
4453     for (int32_t col = 0; col < height; col++) {
4454         const int16_t *in     = input + col * stride;
4455         __m256i *      output = out + col * 4;
4456         load_buffer_16_avx2(in, output, 8, flipud, fliplr, shift);
4457     }
4458 }
4459 static INLINE void load_buffer_8x16(const int16_t *input, __m256i *out, int32_t stride,
4460                                     int32_t flipud, int32_t fliplr, int32_t shift) {
4461     const int16_t *top_l = input;
4462     const int16_t *bot_l = input + 8 * stride;
4463 
4464     const int16_t *tmp;
4465 
4466     if (flipud) {
4467         tmp   = top_l;
4468         top_l = bot_l;
4469         bot_l = tmp;
4470     }
4471 
4472     load_buffer_8x8(top_l, out, stride, flipud, fliplr, shift);
4473     load_buffer_8x8(bot_l, out + 8, stride, flipud, fliplr, shift);
4474 }
4475 
4476 static INLINE void col_txfm_8x4_rounding(__m256i *in, int32_t shift) {
4477     const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
4478 
4479     in[0] = _mm256_add_epi32(in[0], rounding);
4480     in[1] = _mm256_add_epi32(in[1], rounding);
4481     in[2] = _mm256_add_epi32(in[2], rounding);
4482     in[3] = _mm256_add_epi32(in[3], rounding);
4483 
4484     in[0] = _mm256_srai_epi32(in[0], shift);
4485     in[1] = _mm256_srai_epi32(in[1], shift);
4486     in[2] = _mm256_srai_epi32(in[2], shift);
4487     in[3] = _mm256_srai_epi32(in[3], shift);
4488 }
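
/* Note: scalar equivalent of the rounding shift applied to each lane above:
 *
 *   x = (x + (1 << (shift - 1))) >> shift;
 */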
4489 
4490 static INLINE void col_txfm_8x16_rounding(__m256i *in, int32_t shift) {
4491     col_txfm_8x8_rounding(&in[0], shift);
4492     col_txfm_8x8_rounding(&in[8], shift);
4493 }
4494 
4495 static INLINE void write_buffer_16x8_avx2(const __m256i *res, int32_t *output,
4496                                           const int32_t stride) {
4497     _mm256_storeu_si256((__m256i *)(output), res[0]);
4498     _mm256_storeu_si256((__m256i *)(output + stride), res[1]);
4499     _mm256_storeu_si256((__m256i *)(output + (stride * 2)), res[2]);
4500     _mm256_storeu_si256((__m256i *)(output + (stride * 3)), res[3]);
4501     _mm256_storeu_si256((__m256i *)(output + (stride * 4)), res[4]);
4502     _mm256_storeu_si256((__m256i *)(output + (stride * 5)), res[5]);
4503     _mm256_storeu_si256((__m256i *)(output + (stride * 6)), res[6]);
4504     _mm256_storeu_si256((__m256i *)(output + (stride * 7)), res[7]);
4505 }
4506 
4507 void svt_av1_fwd_txfm2d_32x64_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type,
4508                                    uint8_t bd) {
4509     (void)tx_type;
4510     __m256i       in[256];
4511     __m256i *     outcoef256    = (__m256i *)output;
4512     const int8_t *shift         = fwd_txfm_shift_ls[TX_32X64];
4513     const int32_t txw_idx       = get_txw_idx(TX_32X64);
4514     const int32_t txh_idx       = get_txh_idx(TX_32X64);
4515     const int32_t txfm_size_col = tx_size_wide[TX_32X64];
4516     const int32_t txfm_size_row = tx_size_high[TX_32X64];
4517     int8_t        bitcol        = fwd_cos_bit_col[txw_idx][txh_idx];
4518     int8_t        bitrow        = fwd_cos_bit_row[txw_idx][txh_idx];
4519     const int32_t num_row       = txfm_size_row >> 3;
4520     const int32_t num_col       = txfm_size_col >> 3;
4521 
4522     // column transform
4523     load_buffer_32x8n(input, in, stride, 0, 0, shift[0], txfm_size_row);
4524     av1_fdct64_new_avx2(in, in, bitcol, txfm_size_col, num_col);
4525 
4526     for (int32_t i = 0; i < num_row; i++)
4527         col_txfm_16x16_rounding((in + i * txfm_size_col), -shift[1]);
4528     transpose_8nx8n(in, outcoef256, txfm_size_col, txfm_size_row);
4529 
4530     // row transform
4531     av1_fdct32_new_avx2(outcoef256, in, bitrow, txfm_size_row, num_row);
4532     transpose_8nx8n(in, outcoef256, txfm_size_row, txfm_size_col);
4533     av1_round_shift_rect_array_32_avx2(outcoef256, outcoef256, 256, -shift[2], new_sqrt2);
4534     (void)bd;
4535 }
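
/* Note: 32x64 is a 2:1 rectangular size whose area is not a power of four,
 * so the final av1_round_shift_rect_array_32_avx2 call folds the extra
 * sqrt(2) rectangular-size correction (multiply by new_sqrt2 in Q12) into
 * the -shift[2] rounding shift. */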
4536 
4537 void svt_av1_fwd_txfm2d_64x32_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type,
4538                                    uint8_t bd) {
4539     (void)tx_type;
4540     __m256i       in[256];
4541     __m256i *     outcoef256    = (__m256i *)output;
4542     const int8_t *shift         = fwd_txfm_shift_ls[TX_64X32];
4543     const int32_t txw_idx       = get_txw_idx(TX_64X32);
4544     const int32_t txh_idx       = get_txh_idx(TX_64X32);
4545     const int32_t txfm_size_col = tx_size_wide[TX_64X32];
4546     const int32_t txfm_size_row = tx_size_high[TX_64X32];
4547     int8_t        bitcol        = fwd_cos_bit_col[txw_idx][txh_idx];
4548     int8_t        bitrow        = fwd_cos_bit_row[txw_idx][txh_idx];
4549     const int32_t num_row       = txfm_size_row >> 3;
4550     const int32_t num_col       = txfm_size_col >> 3;
4551 
4552     // column transform
4553     for (int32_t i = 0; i < 32; i++) {
4554         load_buffer_32_avx2(input + 0 + i * stride, in + 0 + i * 8, 8, 0, 0, shift[0]);
4555         load_buffer_32_avx2(input + 32 + i * stride, in + 4 + i * 8, 8, 0, 0, shift[0]);
4556     }
4557 
4558     av1_fdct32_new_avx2(in, in, bitcol, txfm_size_col, num_col);
4559 
4560     for (int32_t i = 0; i < num_col; i++)
4561         col_txfm_16x16_rounding((in + i * txfm_size_row), -shift[1]);
4562     transpose_8nx8n(in, outcoef256, txfm_size_col, txfm_size_row);
4563 
4564     // row transform
4565     av1_fdct64_new_avx2(outcoef256, in, bitrow, txfm_size_row, num_row);
4566     transpose_8nx8n(in, outcoef256, txfm_size_row, txfm_size_col);
4567     av1_round_shift_rect_array_32_avx2(outcoef256, outcoef256, 256, -shift[2], new_sqrt2);
4568     (void)bd;
4569 }
4570 
4571 void svt_av1_fwd_txfm2d_16x64_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type,
4572                                    uint8_t bd) {
4573     __m256i       in[128];
4574     __m256i *     outcoeff256   = (__m256i *)output;
4575     const int8_t *shift         = fwd_txfm_shift_ls[TX_16X64];
4576     const int32_t txw_idx       = get_txw_idx(TX_16X64);
4577     const int32_t txh_idx       = get_txh_idx(TX_16X64);
4578     const int32_t txfm_size_col = tx_size_wide[TX_16X64];
4579     const int32_t txfm_size_row = tx_size_high[TX_16X64];
4580     int8_t        bitcol        = fwd_cos_bit_col[txw_idx][txh_idx];
4581     int8_t        bitrow        = fwd_cos_bit_row[txw_idx][txh_idx];
4582     int32_t       ud_flip, lr_flip;
4583     get_flip_cfg(tx_type, &ud_flip, &lr_flip);
4584     const int32_t num_row = txfm_size_row >> 3;
4585     const int32_t num_col = txfm_size_col >> 3;
4586     // column transform
4587     for (int32_t i = 0; i < txfm_size_row; i += num_col) {
4588         load_buffer_16_avx2(
4589             input + (i + 0) * stride, in + (i + 0) * num_col, 8, ud_flip, lr_flip, shift[0]);
4590         load_buffer_16_avx2(
4591             input + (i + 1) * stride, in + (i + 1) * num_col, 8, ud_flip, lr_flip, shift[0]);
4592     }
4593 
4594     av1_fdct64_new_avx2(in, outcoeff256, bitcol, txfm_size_col, num_col);
4595 
4596     col_txfm_16x16_rounding(outcoeff256, -shift[1]);
4597     col_txfm_16x16_rounding(outcoeff256 + 32, -shift[1]);
4598     col_txfm_16x16_rounding(outcoeff256 + 64, -shift[1]);
4599     col_txfm_16x16_rounding(outcoeff256 + 96, -shift[1]);
4600     transpose_8nx8n(outcoeff256, in, txfm_size_col, txfm_size_row);
4601     // row transform
4602     fdct16x16_avx2(in, in, bitrow, num_row);
4603     transpose_8nx8n(in, outcoeff256, txfm_size_row, txfm_size_col);
4604     (void)bd;
4605 }
4606 
4607 void svt_av1_fwd_txfm2d_64x16_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type,
4608                                    uint8_t bd) {
4609     __m256i       in[128];
4610     __m256i *     outcoeff256   = (__m256i *)output;
4611     const int8_t *shift         = fwd_txfm_shift_ls[TX_64X16];
4612     const int32_t txw_idx       = get_txw_idx(TX_64X16);
4613     const int32_t txh_idx       = get_txh_idx(TX_64X16);
4614     const int32_t txfm_size_col = tx_size_wide[TX_64X16];
4615     const int32_t txfm_size_row = tx_size_high[TX_64X16];
4616     int8_t        bitcol        = fwd_cos_bit_col[txw_idx][txh_idx];
4617     int8_t        bitrow        = fwd_cos_bit_row[txw_idx][txh_idx];
4618     int32_t       ud_flip, lr_flip;
4619     get_flip_cfg(tx_type, &ud_flip, &lr_flip);
4620     const int32_t num_row = txfm_size_row >> 3;
4621     const int32_t num_col = txfm_size_col >> 3;
4622     // column transform
4623     for (int32_t i = 0; i < txfm_size_row; i++) {
4624         load_buffer_16_avx2(input + 0 + i * stride, in + 0 + i * 8, 8, ud_flip, lr_flip, shift[0]);
4625         load_buffer_16_avx2(input + 16 + i * stride, in + 2 + i * 8, 8, ud_flip, lr_flip, shift[0]);
4626         load_buffer_16_avx2(input + 32 + i * stride, in + 4 + i * 8, 8, ud_flip, lr_flip, shift[0]);
4627         load_buffer_16_avx2(input + 48 + i * stride, in + 6 + i * 8, 8, ud_flip, lr_flip, shift[0]);
4628     }
4629 
4630     fdct16x16_avx2(in, outcoeff256, bitcol, num_col);
4631     col_txfm_16x16_rounding(outcoeff256, -shift[1]);
4632     col_txfm_16x16_rounding(outcoeff256 + 32, -shift[1]);
4633     col_txfm_16x16_rounding(outcoeff256 + 64, -shift[1]);
4634     col_txfm_16x16_rounding(outcoeff256 + 96, -shift[1]);
4635     transpose_8nx8n(outcoeff256, in, txfm_size_col, txfm_size_row);
4636     // row transform
4637     av1_fdct64_new_avx2(in, in, bitrow, txfm_size_row, num_row);
4638     transpose_8nx8n(in, outcoeff256, txfm_size_row, txfm_size_col);
4639     (void)bd;
4640 }
4641 
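/*
 * 1-D transform dispatch tables indexed by TxType. A NULL entry means the
 * transform type is not supported by the path that consumes the table (those
 * callers are annotated below, e.g. "call this function only for DCT_DCT,
 * IDTX"). Typical use by the callers further down, sketched:
 *
 *     const FwdTransform1dAvx2 col_txfm = col_fwdtxfm_8x32_arr[tx_type];
 *     col_txfm(in, in, bitcol, num_col);   // would dereference NULL for an
 *                                          // unsupported tx_type
 */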
4642 static const FwdTransform1dAvx2 col_fwdtxfm_8x32_arr[TX_TYPES] = {
4643     av1_fdct32_new_line_wraper_avx2, // DCT_DCT
4644     NULL, // ADST_DCT
4645     NULL, // DCT_ADST
4646     NULL, // ADST_ADST
4647     NULL, // FLIPADST_DCT
4648     NULL, // DCT_FLIPADST
4649     NULL, // FLIPADST_FLIPADST
4650     NULL, // ADST_FLIPADST
4651     NULL, // FLIPADST_ADST
4652     av1_idtx32_new_avx2, // IDTX
4653     NULL, // V_DCT
4654     NULL, // H_DCT
4655     NULL, // V_ADST
4656     NULL, // H_ADST
4657     NULL, // V_FLIPADST
4658     NULL // H_FLIPADST
4659 };
4660 
4661 static const FwdTransform1dAvx2 row_fwdtxfm_8x32_arr[TX_TYPES] = {
4662     fdct16x16_avx2, // DCT_DCT
4663     NULL, // ADST_DCT
4664     NULL, // DCT_ADST
4665     NULL, // ADST_ADST
4666     NULL, // FLIPADST_DCT
4667     NULL, // DCT_FLIPADST
4668     NULL, // FLIPADST_FLIPADST
4669     NULL, // ADST_FLIPADST
4670     NULL, // FLIPADST_ADST
4671     fidtx16x16_avx2, // IDTX
4672     NULL, // V_DCT
4673     NULL, // H_DCT
4674     NULL, // V_ADST
4675     NULL, // H_ADST
4676     NULL, // V_FLIPADST
4677     NULL // H_FLIPADST
4678 };
4679 
4680 static const FwdTransform1dAvx2 row_fwdtxfm_32x8_arr[TX_TYPES] = {
4681     fdct8x8_avx2, // DCT_DCT
4682     NULL, // ADST_DCT
4683     NULL, // DCT_ADST
4684     NULL, // ADST_ADST
4685     NULL, // FLIPADST_DCT
4686     NULL, // DCT_FLIPADST
4687     NULL, // FLIPADST_FLIPADST
4688     NULL, // ADST_FLIPADST
4689     NULL, // FLIPADST_ADST
4690     fidtx32x8_avx2, // IDTX
4691     NULL, // V_DCT
4692     NULL, // H_DCT
4693     NULL, // V_ADST
4694     NULL, // H_ADST
4695     NULL, // V_FLIPADST
4696     NULL, // H_FLIPADST
4697 };
4698 
4699 static const FwdTransform1dAvx2 col_fwdtxfm_8x16_arr[TX_TYPES] = {
4700     fdct16x16_avx2, // DCT_DCT
4701     fadst16x16_avx2, // ADST_DCT
4702     fdct16x16_avx2, // DCT_ADST
4703     fadst16x16_avx2, // ADST_ADST
4704     fadst16x16_avx2, // FLIPADST_DCT
4705     fdct16x16_avx2, // DCT_FLIPADST
4706     fadst16x16_avx2, // FLIPADST_FLIPADST
4707     fadst16x16_avx2, // ADST_FLIPADST
4708     fadst16x16_avx2, // FLIPADST_ADST
4709     fidtx16x16_avx2, // IDTX
4710     fdct16x16_avx2, // V_DCT
4711     fidtx16x16_avx2, // H_DCT
4712     fadst16x16_avx2, // V_ADST
4713     fidtx16x16_avx2, // H_ADST
4714     fadst16x16_avx2, // V_FLIPADST
4715     fidtx16x16_avx2 // H_FLIPADST
4716 };
4717 
4718 static const FwdTransform1dAvx2 row_fwdtxfm_8x8_arr[TX_TYPES] = {
4719     fdct8x8_avx2, // DCT_DCT
4720     fdct8x8_avx2, // ADST_DCT
4721     fadst8x8_avx2, // DCT_ADST
4722     fadst8x8_avx2, // ADST_ADST
4723     fdct8x8_avx2, // FLIPADST_DCT
4724     fadst8x8_avx2, // DCT_FLIPADST
4725     fadst8x8_avx2, // FLIPADST_FLIPADST
4726     fadst8x8_avx2, // ADST_FLIPADST
4727     fadst8x8_avx2, // FLIPADST_ADST
4728     fidtx8x8_avx2, // IDTX
4729     fidtx8x8_avx2, // V_DCT
4730     fdct8x8_avx2, // H_DCT
4731     fidtx8x8_avx2, // V_ADST
4732     fadst8x8_avx2, // H_ADST
4733     fidtx8x8_avx2, // V_FLIPADST
4734     fadst8x8_avx2 // H_FLIPADST
4735 };
4736 
4737 static const FwdTransform1dAvx2 col_fwdtxfm_8x8_arr[TX_TYPES] = {
4738     fdct8x8_avx2, // DCT_DCT
4739     fadst8x8_avx2, // ADST_DCT
4740     fdct8x8_avx2, // DCT_ADST
4741     fadst8x8_avx2, // ADST_ADST
4742     fadst8x8_avx2, // FLIPADST_DCT
4743     fdct8x8_avx2, // DCT_FLIPADST
4744     fadst8x8_avx2, // FLIPADST_FLIPADST
4745     fadst8x8_avx2, // ADST_FLIPADST
4746     fadst8x8_avx2, // FLIPADST_ADST
4747     fidtx8x8_avx2, // IDTX
4748     fdct8x8_avx2, // V_DCT
4749     fidtx8x8_avx2, // H_DCT
4750     fadst8x8_avx2, // V_ADST
4751     fidtx8x8_avx2, // H_ADST
4752     fadst8x8_avx2, // V_FLIPADST
4753     fidtx8x8_avx2 // H_FLIPADST
4754 };
4755 
4756 static const FwdTransform1dAvx2 row_fwdtxfm_8x16_arr[TX_TYPES] = {
4757     fdct16x16_avx2, // DCT_DCT
4758     fdct16x16_avx2, // ADST_DCT
4759     fadst16x16_avx2, // DCT_ADST
4760     fadst16x16_avx2, // ADST_ADST
4761     fdct16x16_avx2, // FLIPADST_DCT
4762     fadst16x16_avx2, // DCT_FLIPADST
4763     fadst16x16_avx2, // FLIPADST_FLIPADST
4764     fadst16x16_avx2, // ADST_FLIPADST
4765     fadst16x16_avx2, // FLIPADST_ADST
4766     fidtx16x16_avx2, // IDTX
4767     fidtx16x16_avx2, // V_DCT
4768     fdct16x16_avx2, // H_DCT
4769     fidtx16x16_avx2, // V_ADST
4770     fadst16x16_avx2, // H_ADST
4771     fidtx16x16_avx2, // V_FLIPADST
4772     fadst16x16_avx2 // H_FLIPADST
4773 };
4774 
4775 /* call this function only for DCT_DCT, IDTX */
4776 void svt_av1_fwd_txfm2d_16x32_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type,
4777                                    uint8_t bd) {
4778     __m256i                  in[64];
4779     __m256i *                outcoef256    = (__m256i *)output;
4780     const int8_t *           shift         = fwd_txfm_shift_ls[TX_16X32];
4781     const int32_t            txw_idx       = get_txw_idx(TX_16X32);
4782     const int32_t            txh_idx       = get_txh_idx(TX_16X32);
4783     const FwdTransform1dAvx2 col_txfm      = col_fwdtxfm_8x32_arr[tx_type];
4784     const FwdTransform1dAvx2 row_txfm      = row_fwdtxfm_8x32_arr[tx_type];
4785     int8_t                   bitcol        = fwd_cos_bit_col[txw_idx][txh_idx];
4786     int8_t                   bitrow        = fwd_cos_bit_row[txw_idx][txh_idx];
4787     const int32_t            txfm_size_col = tx_size_wide[TX_16X32];
4788     const int32_t            txfm_size_row = tx_size_high[TX_16X32];
4789     const int32_t            num_row       = txfm_size_row >> 3;
4790     const int32_t            num_col       = txfm_size_col >> 3;
4791 
4792     // column transform
4793     load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
4794     load_buffer_16x16(input + 16 * stride, in + 32, stride, 0, 0, shift[0]);
4795 
4796     for (int32_t i = 0; i < num_col; i++) col_txfm((in + i), (in + i), bitcol, num_col);
4797     col_txfm_16x16_rounding(&in[0], -shift[1]);
4798     col_txfm_16x16_rounding(&in[32], -shift[1]);
4799     transpose_8nx8n(in, outcoef256, txfm_size_col, txfm_size_row);
4800 
4801     // row transform
4802     row_txfm(outcoef256, in, bitrow, num_row);
4803     transpose_8nx8n(in, outcoef256, txfm_size_row, txfm_size_col);
4804     av1_round_shift_rect_array_32_avx2(outcoef256, outcoef256, 64, -shift[2], new_sqrt2);
4805     (void)bd;
4806 }
4807 
4808 /* call this function only for IDTX */
4809 void svt_av1_fwd_txfm2d_32x16_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type,
4810                                    uint8_t bd) {
4811     __m256i                  in[64];
4812     __m256i *                outcoef256    = (__m256i *)output;
4813     const int8_t *           shift         = fwd_txfm_shift_ls[TX_32X16];
4814     const int32_t            txw_idx       = get_txw_idx(TX_32X16);
4815     const int32_t            txh_idx       = get_txh_idx(TX_32X16);
4816     const FwdTransform1dAvx2 col_txfm      = row_fwdtxfm_8x32_arr[tx_type];
4817     const FwdTransform1dAvx2 row_txfm      = col_fwdtxfm_8x32_arr[tx_type];
4818     int8_t                   bitcol        = fwd_cos_bit_col[txw_idx][txh_idx];
4819     int8_t                   bitrow        = fwd_cos_bit_row[txw_idx][txh_idx];
4820     const int32_t            txfm_size_col = tx_size_wide[TX_32X16];
4821     const int32_t            txfm_size_row = tx_size_high[TX_32X16];
4822     const int32_t            num_row       = txfm_size_row >> 3;
4823     const int32_t            num_col       = txfm_size_col >> 3;
4824 
4825     // column transform
4826     load_buffer_32x8n(input, in, stride, 0, 0, shift[0], txfm_size_row);
4827     col_txfm(in, in, bitcol, num_col);
4828     col_txfm_16x16_rounding(&in[0], -shift[1]);
4829     col_txfm_16x16_rounding(&in[32], -shift[1]);
4830     transpose_8nx8n(in, outcoef256, txfm_size_col, txfm_size_row);
4831 
4832     // row transform
4833     for (int32_t i = 0; i < num_row; i++) row_txfm((outcoef256 + i), (in + i), bitrow, num_row);
4834     transpose_8nx8n(in, outcoef256, txfm_size_row, txfm_size_col);
4835     av1_round_shift_rect_array_32_avx2(outcoef256, outcoef256, 64, -shift[2], new_sqrt2);
4836     (void)bd;
4837 }
4838 
4839 /* call this function only for DCT_DCT, IDTX */
4840 void svt_av1_fwd_txfm2d_8x32_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type,
4841                                   uint8_t bd) {
4842     __m256i                  in[32];
4843     __m256i *                outcoef256 = (__m256i *)output;
4844     const int8_t *           shift      = fwd_txfm_shift_ls[TX_8X32];
4845     const int32_t            txw_idx    = get_txw_idx(TX_8X32);
4846     const int32_t            txh_idx    = get_txh_idx(TX_8X32);
4847     const FwdTransform1dAvx2 col_txfm   = col_fwdtxfm_8x32_arr[tx_type];
4848     const FwdTransform1dAvx2 row_txfm   = row_fwdtxfm_32x8_arr[tx_type];
4849     int8_t                   bitcol     = fwd_cos_bit_col[txw_idx][txh_idx];
4850     int8_t                   bitrow     = fwd_cos_bit_row[txw_idx][txh_idx];
4851 
4852     const int32_t txfm_size_col = tx_size_wide[TX_8X32];
4853     const int32_t txfm_size_row = tx_size_high[TX_8X32];
4854     const int32_t num_row       = txfm_size_row >> 3;
4855     const int32_t num_col       = txfm_size_col >> 3;
4856 
4857     // column transform
4858     load_buffer_8x16(input, in, stride, 0, 0, shift[0]);
4859     load_buffer_8x16(input + (txfm_size_row >> 1) * stride, in + 16, stride, 0, 0, shift[0]);
4860 
4861     col_txfm(in, in, bitcol, num_col);
4862     col_txfm_16x16_rounding(in, -shift[1]);
4863     transpose_8nx8n(in, outcoef256, txfm_size_col, txfm_size_row);
4864 
4865     // row transform
4866     for (int32_t i = 0; i < num_row; i++) row_txfm((outcoef256 + i), (in + i), bitrow, num_row);
4867     transpose_8nx8n(in, outcoef256, txfm_size_row, txfm_size_col);
4868     (void)bd;
4869 }
4870 
4871 /* call this function only for DCT_DCT, IDTX */
4872 void svt_av1_fwd_txfm2d_32x8_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type,
4873                                   uint8_t bd) {
4874     __m256i                  in[32];
4875     __m256i *                outcoef256 = (__m256i *)output;
4876     const int8_t *           shift      = fwd_txfm_shift_ls[TX_32X8];
4877     const int32_t            txw_idx    = get_txw_idx(TX_32X8);
4878     const int32_t            txh_idx    = get_txh_idx(TX_32X8);
4879     const FwdTransform1dAvx2 col_txfm   = row_fwdtxfm_32x8_arr[tx_type];
4880     const FwdTransform1dAvx2 row_txfm   = col_fwdtxfm_8x32_arr[tx_type];
4881     int8_t                   bitcol     = fwd_cos_bit_col[txw_idx][txh_idx];
4882     int8_t                   bitrow     = fwd_cos_bit_row[txw_idx][txh_idx];
4883 
4884     const int32_t txfm_size_col = tx_size_wide[TX_32X8];
4885     const int32_t txfm_size_row = tx_size_high[TX_32X8];
4886     const int32_t num_row       = txfm_size_row >> 3;
4887     const int32_t num_col       = txfm_size_col >> 3;
4888 
4889     // column transform
4890     load_buffer_32x8n(input, in, stride, 0, 0, shift[0], txfm_size_row);
4891     for (int32_t i = 0; i < num_col; i++) col_txfm((in + i), (in + i), bitcol, num_col);
4892     col_txfm_16x16_rounding(&in[0], -shift[1]);
4893     transpose_8nx8n(in, outcoef256, txfm_size_col, txfm_size_row);
4894 
4895     // row transform
4896     row_txfm(outcoef256, in, bitrow, num_row);
4897 
4898     transpose_8nx8n(in, outcoef256, txfm_size_row, txfm_size_col);
4899     (void)bd;
4900 }
4901 
4902 /* call this function for all 16 transform types */
4903 void svt_av1_fwd_txfm2d_8x16_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type,
4904                                   uint8_t bd) {
4905     __m256i                  in[16], out[16];
4906     const int8_t *           shift    = fwd_txfm_shift_ls[TX_8X16];
4907     const int32_t            txw_idx  = get_txw_idx(TX_8X16);
4908     const int32_t            txh_idx  = get_txh_idx(TX_8X16);
4909     const FwdTransform1dAvx2 col_txfm = col_fwdtxfm_8x16_arr[tx_type];
4910     const FwdTransform1dAvx2 row_txfm = row_fwdtxfm_8x8_arr[tx_type];
4911     int8_t                   bitcol   = fwd_cos_bit_col[txw_idx][txh_idx];
4912     int8_t                   bitrow   = fwd_cos_bit_row[txw_idx][txh_idx];
4913     int32_t                  ud_flip, lr_flip;
4914     get_flip_cfg(tx_type, &ud_flip, &lr_flip);
4915     const int32_t txfm_size_col = tx_size_wide[TX_8X16];
4916     const int32_t txfm_size_row = tx_size_high[TX_8X16];
4917     const int32_t num_row       = txfm_size_row >> 3;
4918     const int32_t num_col       = txfm_size_col >> 3;
4919 
4920     load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
4921     // column transform
4922     col_txfm(in, in, bitcol, num_col);
4923     col_txfm_8x16_rounding(in, -shift[1]);
4924     transpose_8x8_avx2(in, out);
4925     transpose_8x8_avx2(in + 8, out + 8);
4926 
4927     // row transform
4928     for (int32_t i = 0; i < num_row; i++) {
4929         row_txfm(out + i * 8, out, bitrow, 1);
4930         transpose_8x8_avx2(out, in);
4931         av1_round_shift_rect_array_32_avx2(in, in, 8, -shift[2], new_sqrt2);
4932         write_buffer_8x8(in, output + i * 64);
4933     }
4934     (void)bd;
4935 }
4936 
4937 /* call this function for all 16 transform types */
4938 void svt_av1_fwd_txfm2d_16x8_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type,
4939                                   uint8_t bd) {
4940     __m256i                  in[16], out[16];
4941     const int8_t *           shift    = fwd_txfm_shift_ls[TX_16X8];
4942     const int32_t            txw_idx  = get_txw_idx(TX_16X8);
4943     const int32_t            txh_idx  = get_txh_idx(TX_16X8);
4944     const FwdTransform1dAvx2 col_txfm = col_fwdtxfm_8x8_arr[tx_type];
4945     const FwdTransform1dAvx2 row_txfm = row_fwdtxfm_8x16_arr[tx_type];
4946     int8_t                   bitcol   = fwd_cos_bit_col[txw_idx][txh_idx];
4947     int8_t                   bitrow   = fwd_cos_bit_row[txw_idx][txh_idx];
4948     int32_t                  ud_flip, lr_flip;
4949     get_flip_cfg(tx_type, &ud_flip, &lr_flip);
4950     const int32_t txfm_size_col = tx_size_wide[TX_16X8];
4951     const int32_t txfm_size_row = tx_size_high[TX_16X8];
4952     const int32_t num_row       = txfm_size_row >> 3;
4953     const int32_t num_col       = txfm_size_col >> 3;
4954     assert(num_col > 0);
4955     // column transform
4956     for (int32_t i = 0; i < num_col; i++) {
4957         load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
4958         col_txfm(in, in, bitcol, 1);
4959         col_txfm_8x8_rounding(in, -shift[1]);
4960         transpose_8x8_avx2(in, out + i * 8);
4961     }
4962 
4963     // row transform
4964     if (lr_flip) {
4965         for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
4966         row_txfm(in, out, bitrow, num_row);
4967     } else
4968         row_txfm(out, out, bitrow, num_row);
4969 
4970     for (int32_t i = 0; i < num_col; i++) {
4971         transpose_8x8_avx2(out + i * 8, in);
4972         av1_round_shift_rect_array_32_avx2(in, in, 8, -shift[2], new_sqrt2);
4973         write_buffer_16x8_avx2(in, output + i * 8, 16);
4974     }
4975     (void)bd;
4976 }
4977 
4978 void svt_av1_fwd_txfm2d_4x8_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type,
4979                                  uint8_t bd) {
4980     __m256i in[4];
4981     __m256i outcoeff256[4];
4982 
4983     const int8_t *shift   = fwd_txfm_shift_ls[TX_4X8];
4984     const int32_t txw_idx = get_txw_idx(TX_4X8);
4985     const int32_t txh_idx = get_txh_idx(TX_4X8);
4986     int32_t       bitcol  = fwd_cos_bit_col[txw_idx][txh_idx];
4987     int32_t       bitrow  = fwd_cos_bit_row[txw_idx][txh_idx];
4988 
4989     switch (tx_type) {
4990     case DCT_DCT:
4991         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
4992         fdct4x8_avx2(in, in, bitcol);
4993         col_txfm_8x4_rounding(in, -shift[1]);
4994         transpose_4x8_avx2(in, outcoeff256);
4995         fdct4x8_col_avx2(outcoeff256, in, bitrow, 1);
4996         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], new_sqrt2);
4997         write_buffer_4x8(outcoeff256, output);
4998         break;
4999     case ADST_DCT:
5000         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
5001         fadst8x4_avx2(in, in, bitcol, 1);
5002         col_txfm_8x4_rounding(in, -shift[1]);
5003         transpose_4x8_avx2(in, outcoeff256);
5004         fdct4x8_col_avx2(outcoeff256, in, bitrow, 1);
5005         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], new_sqrt2);
5006         write_buffer_4x8(outcoeff256, output);
5007         break;
5008     case DCT_ADST:
5009         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
5010         fdct4x8_avx2(in, in, bitcol);
5011         col_txfm_8x4_rounding(in, -shift[1]);
5012         transpose_4x8_avx2(in, outcoeff256);
5013         fadst4x8_col_avx2(outcoeff256, in, bitrow, 1);
5014         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], new_sqrt2);
5015         write_buffer_4x8(outcoeff256, output);
5016         break;
5017     case ADST_ADST:
5018         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
5019         fadst8x4_avx2(in, in, bitcol, 1);
5020         col_txfm_8x4_rounding(in, -shift[1]);
5021         transpose_4x8_avx2(in, outcoeff256);
5022         fadst4x8_col_avx2(outcoeff256, in, bitrow, 1);
5023         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], new_sqrt2);
5024         write_buffer_4x8(outcoeff256, output);
5025         break;
5026     case FLIPADST_DCT:
5027         load_buffer_4x8_avx2(input, in, stride, 1, 0, shift[0]);
5028         fadst8x4_avx2(in, in, bitcol, 1);
5029         col_txfm_8x4_rounding(in, -shift[1]);
5030         transpose_4x8_avx2(in, outcoeff256);
5031         fdct4x8_col_avx2(outcoeff256, in, bitrow, 1);
5032         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], new_sqrt2);
5033         write_buffer_4x8(outcoeff256, output);
5034         break;
5035     case DCT_FLIPADST:
5036         load_buffer_4x8_avx2(input, in, stride, 0, 1, shift[0]);
5037         fdct4x8_avx2(in, in, bitcol);
5038         col_txfm_8x4_rounding(in, -shift[1]);
5039         transpose_4x8_avx2(in, outcoeff256);
5040         fadst4x8_col_avx2(outcoeff256, in, bitrow, 1);
5041         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], new_sqrt2);
5042         write_buffer_4x8(outcoeff256, output);
5043         break;
5044     case FLIPADST_FLIPADST:
5045         load_buffer_4x8_avx2(input, in, stride, 1, 1, shift[0]);
5046         fadst8x4_avx2(in, in, bitcol, 1);
5047         col_txfm_8x4_rounding(in, -shift[1]);
5048         transpose_4x8_avx2(in, outcoeff256);
5049         fadst4x8_col_avx2(outcoeff256, in, bitrow, 1);
5050         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], new_sqrt2);
5051         write_buffer_4x8(outcoeff256, output);
5052         break;
5053     case ADST_FLIPADST:
5054         load_buffer_4x8_avx2(input, in, stride, 0, 1, shift[0]);
5055         fadst8x4_avx2(in, in, bitcol, 1);
5056         col_txfm_8x4_rounding(in, -shift[1]);
5057         transpose_4x8_avx2(in, outcoeff256);
5058         fadst4x8_col_avx2(outcoeff256, in, bitrow, 1);
5059         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], new_sqrt2);
5060         write_buffer_4x8(outcoeff256, output);
5061         break;
5062     case FLIPADST_ADST:
5063         load_buffer_4x8_avx2(input, in, stride, 1, 0, shift[0]);
5064         fadst8x4_avx2(in, in, bitcol, 1);
5065         col_txfm_8x4_rounding(in, -shift[1]);
5066         transpose_4x8_avx2(in, outcoeff256);
5067         fadst4x8_col_avx2(outcoeff256, in, bitrow, 1);
5068         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], new_sqrt2);
5069         write_buffer_4x8(outcoeff256, output);
5070         break;
5071     case IDTX:
5072         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
5073         fidtx8x4_avx2(in, in, bitcol);
5074         col_txfm_8x4_rounding(in, -shift[1]);
5075         transpose_4x8_avx2(in, outcoeff256);
5076         fidtx4x8_col_avx2(outcoeff256, in, bitrow, 1);
5077         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], new_sqrt2);
5078         write_buffer_4x8(outcoeff256, output);
5079         break;
5080     case V_DCT:
5081         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
5082         fdct4x8_avx2(in, in, bitcol);
5083         col_txfm_8x4_rounding(in, -shift[1]);
5084         transpose_4x8_avx2(in, outcoeff256);
5085         fidtx4x8_col_avx2(outcoeff256, in, bitrow, 1);
5086         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], new_sqrt2);
5087         write_buffer_4x8(outcoeff256, output);
5088         break;
5089     case H_DCT:
5090         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
5091         fidtx8x4_avx2(in, in, bitcol);
5092         col_txfm_8x4_rounding(in, -shift[1]);
5093         transpose_4x8_avx2(in, outcoeff256);
5094         fdct4x8_col_avx2(outcoeff256, in, bitrow, 1);
5095         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], new_sqrt2);
5096         write_buffer_4x8(outcoeff256, output);
5097         break;
5098     case V_ADST:
5099         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
5100         fadst8x4_avx2(in, in, bitcol, 1);
5101         col_txfm_8x4_rounding(in, -shift[1]);
5102         transpose_4x8_avx2(in, outcoeff256);
5103         fidtx4x8_col_avx2(outcoeff256, in, bitrow, 1);
5104         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], new_sqrt2);
5105         write_buffer_4x8(outcoeff256, output);
5106         break;
5107     case H_ADST:
5108         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
5109         fidtx8x4_avx2(in, in, bitcol);
5110         col_txfm_8x4_rounding(in, -shift[1]);
5111         transpose_4x8_avx2(in, outcoeff256);
5112         fadst4x8_col_avx2(outcoeff256, in, bitrow, 1);
5113         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], new_sqrt2);
5114         write_buffer_4x8(outcoeff256, output);
5115         break;
5116     case V_FLIPADST:
5117         load_buffer_4x8_avx2(input, in, stride, 1, 0, shift[0]);
5118         fadst8x4_avx2(in, in, bitcol, 1);
5119         col_txfm_8x4_rounding(in, -shift[1]);
5120         transpose_4x8_avx2(in, outcoeff256);
5121         fidtx4x8_col_avx2(outcoeff256, in, bitrow, 1);
5122         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], new_sqrt2);
5123         write_buffer_4x8(outcoeff256, output);
5124         break;
5125     case H_FLIPADST:
5126         load_buffer_4x8_avx2(input, in, stride, 0, 1, shift[0]);
5127         fidtx8x4_avx2(in, in, bitcol);
5128         col_txfm_8x4_rounding(in, -shift[1]);
5129         transpose_4x8_avx2(in, outcoeff256);
5130         fadst4x8_col_avx2(outcoeff256, in, bitrow, 1);
5131         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 4, -shift[2], new_sqrt2);
5132         write_buffer_4x8(outcoeff256, output);
5133         break;
5134     default: assert(0);
5135     }
5136     (void)bd;
5137 }
5138 
5139 void svt_av1_fwd_txfm2d_8x4_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type,
5140                                  uint8_t bd) {
5141     __m256i       in[4];
5142     __m256i *     outcoeff256 = (__m256i *)output;
5143     const int8_t *shift       = fwd_txfm_shift_ls[TX_8X4];
5144     const int32_t txw_idx     = get_txw_idx(TX_8X4);
5145     const int32_t txh_idx     = get_txh_idx(TX_8X4);
5146     int32_t       bitcol      = fwd_cos_bit_col[txw_idx][txh_idx];
5147     int32_t       bitrow      = fwd_cos_bit_row[txw_idx][txh_idx];
5148 
5149     switch (tx_type) {
5150     case DCT_DCT:
5151         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
5152         fdct4x8_row_avx2(in, in, bitcol, 1);
5153         col_txfm_8x4_rounding(in, -shift[1]);
5154         fdct4x8_avx2(in, outcoeff256, bitrow);
5155         av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2], new_sqrt2);
5156         transpose_4x8_avx2(in, outcoeff256);
5157         break;
5158     case ADST_DCT:
5159         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
5160         fadst4x8_row_avx2(in, in, bitcol, 1);
5161         col_txfm_8x4_rounding(in, -shift[1]);
5162         fdct4x8_avx2(in, outcoeff256, bitrow);
5163         av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2], new_sqrt2);
5164         transpose_4x8_avx2(in, outcoeff256);
5165         break;
5166     case DCT_ADST:
5167         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
5168         fdct4x8_row_avx2(in, in, bitcol, 1);
5169         col_txfm_8x4_rounding(in, -shift[1]);
5170         fadst8x4_avx2(in, outcoeff256, bitrow, 1);
5171         av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2], new_sqrt2);
5172         transpose_4x8_avx2(in, outcoeff256);
5173         break;
5174     case ADST_ADST:
5175         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
5176         fadst4x8_row_avx2(in, in, bitcol, 1);
5177         col_txfm_8x4_rounding(in, -shift[1]);
5178         fadst8x4_avx2(in, outcoeff256, bitrow, 1);
5179         av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2], new_sqrt2);
5180         transpose_4x8_avx2(in, outcoeff256);
5181         break;
5182     case FLIPADST_DCT:
5183         load_buffer_8x4_avx2(input, in, stride, 1, 0, shift[0]);
5184         fadst4x8_row_avx2(in, in, bitcol, 1);
5185         col_txfm_8x4_rounding(in, -shift[1]);
5186         fdct4x8_avx2(in, outcoeff256, bitrow);
5187         av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2], new_sqrt2);
5188         transpose_4x8_avx2(in, outcoeff256);
5189         break;
5190     case DCT_FLIPADST:
5191         load_buffer_8x4_avx2(input, in, stride, 0, 1, shift[0]);
5192         fdct4x8_row_avx2(in, in, bitcol, 1);
5193         col_txfm_8x4_rounding(in, -shift[1]);
5194         fadst8x4_avx2(in, outcoeff256, bitrow, 1);
5195         av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2], new_sqrt2);
5196         transpose_4x8_avx2(in, outcoeff256);
5197         break;
5198     case FLIPADST_FLIPADST:
5199         load_buffer_8x4_avx2(input, in, stride, 1, 1, shift[0]);
5200         fadst4x8_row_avx2(in, in, bitcol, 1);
5201         col_txfm_8x4_rounding(in, -shift[1]);
5202         fadst8x4_avx2(in, outcoeff256, bitrow, 1);
5203         av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2], new_sqrt2);
5204         transpose_4x8_avx2(in, outcoeff256);
5205         break;
5206     case ADST_FLIPADST:
5207         load_buffer_8x4_avx2(input, in, stride, 0, 1, shift[0]);
5208         fadst4x8_row_avx2(in, in, bitcol, 1);
5209         col_txfm_8x4_rounding(in, -shift[1]);
5210         fadst8x4_avx2(in, outcoeff256, bitrow, 1);
5211         av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2], new_sqrt2);
5212         transpose_4x8_avx2(in, outcoeff256);
5213         break;
5214     case FLIPADST_ADST:
5215         load_buffer_8x4_avx2(input, in, stride, 1, 0, shift[0]);
5216         fadst4x8_row_avx2(in, in, bitcol, 1);
5217         col_txfm_8x4_rounding(in, -shift[1]);
5218         fadst8x4_avx2(in, outcoeff256, bitrow, 1);
5219         av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2], new_sqrt2);
5220         transpose_4x8_avx2(in, outcoeff256);
5221         break;
5222     case IDTX:
5223         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
5224         fidtx4x8_row_avx2(in, in, bitcol, 1);
5225         col_txfm_8x4_rounding(in, -shift[1]);
5226         fidtx8x4_avx2(in, outcoeff256, bitrow);
5227         av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2], new_sqrt2);
5228         transpose_4x8_avx2(in, outcoeff256);
5229         break;
5230     case V_DCT:
5231         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
5232         fdct4x8_row_avx2(in, in, bitcol, 1);
5233         col_txfm_8x4_rounding(in, -shift[1]);
5234         fidtx8x4_avx2(in, outcoeff256, bitrow);
5235         av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2], new_sqrt2);
5236         transpose_4x8_avx2(in, outcoeff256);
5237         break;
5238     case H_DCT:
5239         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
5240         fidtx4x8_row_avx2(in, in, bitcol, 1);
5241         col_txfm_8x4_rounding(in, -shift[1]);
5242         fdct4x8_avx2(in, outcoeff256, bitrow);
5243         av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2], new_sqrt2);
5244         transpose_4x8_avx2(in, outcoeff256);
5245         break;
5246     case V_ADST:
5247         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
5248         fadst4x8_row_avx2(in, in, bitcol, 1);
5249         col_txfm_8x4_rounding(in, -shift[1]);
5250         fidtx8x4_avx2(in, outcoeff256, bitrow);
5251         av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2], new_sqrt2);
5252         transpose_4x8_avx2(in, outcoeff256);
5253         break;
5254     case H_ADST:
5255         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
5256         fidtx4x8_row_avx2(in, in, bitcol, 1);
5257         col_txfm_8x4_rounding(in, -shift[1]);
5258         fadst8x4_avx2(in, outcoeff256, bitrow, 1);
5259         av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2], new_sqrt2);
5260         transpose_4x8_avx2(in, outcoeff256);
5261         break;
5262     case V_FLIPADST:
5263         load_buffer_8x4_avx2(input, in, stride, 1, 0, shift[0]);
5264         fadst4x8_row_avx2(in, in, bitcol, 1);
5265         col_txfm_8x4_rounding(in, -shift[1]);
5266         fidtx8x4_avx2(in, outcoeff256, bitrow);
5267         av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2], new_sqrt2);
5268         transpose_4x8_avx2(in, outcoeff256);
5269         break;
5270     case H_FLIPADST:
5271         load_buffer_8x4_avx2(input, in, stride, 0, 1, shift[0]);
5272         fidtx4x8_row_avx2(in, in, bitcol, 1);
5273         col_txfm_8x4_rounding(in, -shift[1]);
5274         fadst8x4_avx2(in, outcoeff256, bitrow, 1);
5275         av1_round_shift_rect_array_32_avx2(outcoeff256, in, 4, -shift[2], new_sqrt2);
5276         transpose_4x8_avx2(in, outcoeff256);
5277         break;
5278     default: assert(0);
5279     }
5280     (void)bd;
5281 }
5282 
5283 void svt_av1_fwd_txfm2d_4x16_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type,
5284                                   uint8_t bd) {
5285     __m256i       in[8];
5286     __m256i       outcoeff256[8];
5287     const int8_t *shift   = fwd_txfm_shift_ls[TX_4X16];
5288     const int32_t txw_idx = get_txw_idx(TX_4X16);
5289     const int32_t txh_idx = get_txh_idx(TX_4X16);
5290     int32_t       bitcol  = fwd_cos_bit_col[txw_idx][txh_idx];
5291     int32_t       bitrow  = fwd_cos_bit_row[txw_idx][txh_idx];
5292 
5293     switch (tx_type) {
5294     case DCT_DCT:
5295         load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
5296         fdct16x4_avx2(in, outcoeff256, bitcol);
5297         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5298         transpose_4x16_avx2(outcoeff256, in);
5299         for (int32_t i = 0; i < 2; i++) fdct4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
5300         write_buffer_8x8(outcoeff256, output);
5301         break;
5302     case ADST_DCT:
5303         load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
5304         fadst16x4_avx2(in, outcoeff256, bitcol);
5305         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5306         transpose_4x16_avx2(outcoeff256, in);
5307         for (int32_t i = 0; i < 2; i++) fdct4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
5308         write_buffer_8x8(outcoeff256, output);
5309         break;
5310     case DCT_ADST:
5311         load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
5312         fdct16x4_avx2(in, outcoeff256, bitcol);
5313         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5314         transpose_4x16_avx2(outcoeff256, in);
5315         for (int32_t i = 0; i < 2; i++) fadst4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
5316         write_buffer_8x8(outcoeff256, output);
5317         break;
5318     case ADST_ADST:
5319         load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
5320         fadst16x4_avx2(in, outcoeff256, bitcol);
5321         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5322         transpose_4x16_avx2(outcoeff256, in);
5323         for (int32_t i = 0; i < 2; i++) fadst4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
5324         write_buffer_8x8(outcoeff256, output);
5325         break;
5326     case FLIPADST_DCT:
5327         load_buffer_4x16_avx2(input, in, stride, 1, 0, shift[0]);
5328         fadst16x4_avx2(in, outcoeff256, bitcol);
5329         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5330         transpose_4x16_avx2(outcoeff256, in);
5331         for (int32_t i = 0; i < 2; i++) fdct4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
5332         write_buffer_8x8(outcoeff256, output);
5333         break;
5334     case DCT_FLIPADST:
5335         load_buffer_4x16_avx2(input, in, stride, 0, 1, shift[0]);
5336         fdct16x4_avx2(in, outcoeff256, bitcol);
5337         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5338         transpose_4x16_avx2(outcoeff256, in);
5339         for (int32_t i = 0; i < 2; i++) fadst4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
5340         write_buffer_8x8(outcoeff256, output);
5341         break;
5342     case FLIPADST_FLIPADST:
5343         load_buffer_4x16_avx2(input, in, stride, 1, 1, shift[0]);
5344         fadst16x4_avx2(in, outcoeff256, bitcol);
5345         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5346         transpose_4x16_avx2(outcoeff256, in);
5347         for (int32_t i = 0; i < 2; i++) fadst4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
5348         write_buffer_8x8(outcoeff256, output);
5349         break;
5350     case ADST_FLIPADST:
5351         load_buffer_4x16_avx2(input, in, stride, 0, 1, shift[0]);
5352         fadst16x4_avx2(in, outcoeff256, bitcol);
5353         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5354         transpose_4x16_avx2(outcoeff256, in);
5355         for (int32_t i = 0; i < 2; i++) fadst4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
5356         write_buffer_8x8(outcoeff256, output);
5357         break;
5358     case FLIPADST_ADST:
5359         load_buffer_4x16_avx2(input, in, stride, 1, 0, shift[0]);
5360         fadst16x4_avx2(in, outcoeff256, bitcol);
5361         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5362         transpose_4x16_avx2(outcoeff256, in);
5363         for (int32_t i = 0; i < 2; i++) fadst4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
5364         write_buffer_8x8(outcoeff256, output);
5365         break;
5366     case IDTX:
5367         load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
5368         fidtx16x8_avx2(in, outcoeff256, bitcol, 1);
5369         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5370         transpose_4x16_avx2(outcoeff256, in);
5371         for (int32_t i = 0; i < 2; i++) fidtx4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
5372         write_buffer_8x8(outcoeff256, output);
5373         break;
5374     case V_DCT:
5375         load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
5376         fdct16x4_avx2(in, outcoeff256, bitcol);
5377         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5378         transpose_4x16_avx2(outcoeff256, in);
5379         for (int32_t i = 0; i < 2; i++) fidtx4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
5380         write_buffer_8x8(outcoeff256, output);
5381         break;
5382     case H_DCT:
5383         load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
5384         fidtx16x8_avx2(in, outcoeff256, bitcol, 1);
5385         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5386         transpose_4x16_avx2(outcoeff256, in);
5387         for (int32_t i = 0; i < 2; i++) fdct4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
5388         write_buffer_8x8(outcoeff256, output);
5389         break;
5390     case V_ADST:
5391         load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
5392         fadst16x4_avx2(in, outcoeff256, bitcol);
5393         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5394         transpose_4x16_avx2(outcoeff256, in);
5395         for (int32_t i = 0; i < 2; i++) fidtx4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
5396         write_buffer_8x8(outcoeff256, output);
5397         break;
5398     case H_ADST:
5399         load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
5400         fidtx16x8_avx2(in, outcoeff256, bitcol, 1);
5401         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5402         transpose_4x16_avx2(outcoeff256, in);
5403         for (int32_t i = 0; i < 2; i++) fadst4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
5404         write_buffer_8x8(outcoeff256, output);
5405         break;
5406     case V_FLIPADST:
5407         load_buffer_4x16_avx2(input, in, stride, 1, 0, shift[0]);
5408         fadst16x4_avx2(in, outcoeff256, bitcol);
5409         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5410         transpose_4x16_avx2(outcoeff256, in);
5411         for (int32_t i = 0; i < 2; i++) fidtx4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
5412         write_buffer_8x8(outcoeff256, output);
5413         break;
5414     case H_FLIPADST:
5415         load_buffer_4x16_avx2(input, in, stride, 0, 1, shift[0]);
5416         fidtx16x8_avx2(in, outcoeff256, bitcol, 1);
5417         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5418         transpose_4x16_avx2(outcoeff256, in);
5419         for (int32_t i = 0; i < 2; i++) fadst4x8_col_avx2(in + i, outcoeff256 + i * 4, bitrow, 2);
5420         write_buffer_8x8(outcoeff256, output);
5421         break;
5422     default: assert(0);
5423     }
5424     (void)bd;
5425 }
5426 
5427 void svt_av1_fwd_txfm2d_16x4_avx2(int16_t *input, int32_t *output, uint32_t stride, TxType tx_type,
5428                                   uint8_t bd) {
5429     __m256i       in[8];
5430     __m256i *     outcoeff256 = (__m256i *)output;
5431     const int8_t *shift       = fwd_shift_16x4;
5432     const int32_t txw_idx     = get_txw_idx(TX_16X4);
5433     const int32_t txh_idx     = get_txh_idx(TX_16X4);
5434     int32_t       bitcol      = fwd_cos_bit_col[txw_idx][txh_idx];
5435     int32_t       bitrow      = fwd_cos_bit_row[txw_idx][txh_idx];
5436 
5437     switch (tx_type) {
5438     case DCT_DCT:
5439         load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
5440         for (int32_t i = 0; i < 2; i++)
5441             fdct4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
5442         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5443         fdct16x4_avx2(outcoeff256, in, bitrow);
5444         transpose_4x16_avx2(in, outcoeff256);
5445         break;
5446     case ADST_DCT:
5447         load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
5448         for (int32_t i = 0; i < 2; i++)
5449             fadst4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
5450         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5451         fdct16x4_avx2(outcoeff256, in, bitrow);
5452         transpose_4x16_avx2(in, outcoeff256);
5453         break;
5454     case DCT_ADST:
5455         load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
5456         for (int32_t i = 0; i < 2; i++)
5457             fdct4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
5458         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5459         fadst16x4_avx2(outcoeff256, in, bitrow);
5460         transpose_4x16_avx2(in, outcoeff256);
5461         break;
5462     case ADST_ADST:
5463         load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
5464         for (int32_t i = 0; i < 2; i++)
5465             fadst4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
5466         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5467         fadst16x4_avx2(outcoeff256, in, bitrow);
5468         transpose_4x16_avx2(in, outcoeff256);
5469         break;
5470     case FLIPADST_DCT:
5471         load_buffer_16x4_avx2(input, in, stride, 1, 0, shift[0]);
5472         for (int32_t i = 0; i < 2; i++)
5473             fadst4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
5474         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5475         fdct16x4_avx2(outcoeff256, in, bitrow);
5476         transpose_4x16_avx2(in, outcoeff256);
5477         break;
5478     case DCT_FLIPADST:
5479         load_buffer_16x4_avx2(input, in, stride, 0, 1, shift[0]);
5480         for (int32_t i = 0; i < 2; i++)
5481             fdct4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
5482         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5483         fadst16x4_avx2(outcoeff256, in, bitrow);
5484         transpose_4x16_avx2(in, outcoeff256);
5485         break;
5486     case FLIPADST_FLIPADST:
5487         load_buffer_16x4_avx2(input, in, stride, 1, 1, shift[0]);
5488         for (int32_t i = 0; i < 2; i++)
5489             fadst4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
5490         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5491         fadst16x4_avx2(outcoeff256, in, bitrow);
5492         transpose_4x16_avx2(in, outcoeff256);
5493         break;
5494     case ADST_FLIPADST:
5495         load_buffer_16x4_avx2(input, in, stride, 0, 1, shift[0]);
5496         for (int32_t i = 0; i < 2; i++)
5497             fadst4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
5498         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5499         fadst16x4_avx2(outcoeff256, in, bitrow);
5500         transpose_4x16_avx2(in, outcoeff256);
5501         break;
5502     case FLIPADST_ADST:
5503         load_buffer_16x4_avx2(input, in, stride, 1, 0, shift[0]);
5504         for (int32_t i = 0; i < 2; i++)
5505             fadst4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
5506         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5507         fadst16x4_avx2(outcoeff256, in, bitrow);
5508         transpose_4x16_avx2(in, outcoeff256);
5509         break;
5510     case IDTX:
5511         load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
5512         for (int32_t i = 0; i < 2; i++)
5513             fidtx4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
5514         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5515         fidtx16x8_avx2(outcoeff256, in, bitrow, 1);
5516         transpose_4x16_avx2(in, outcoeff256);
5517         break;
5518     case V_DCT:
5519         load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
5520         for (int32_t i = 0; i < 2; i++)
5521             fdct4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
5522         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5523         fidtx16x8_avx2(outcoeff256, in, bitrow, 1);
5524         transpose_4x16_avx2(in, outcoeff256);
5525         break;
5526     case H_DCT:
5527         load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
5528         for (int32_t i = 0; i < 2; i++)
5529             fidtx4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
5530         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5531         fdct16x4_avx2(outcoeff256, in, bitrow);
5532         transpose_4x16_avx2(in, outcoeff256);
5533         break;
5534     case V_ADST:
5535         load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
5536         for (int32_t i = 0; i < 2; i++)
5537             fadst4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
5538         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5539         fidtx16x8_avx2(outcoeff256, in, bitrow, 1);
5540         transpose_4x16_avx2(in, outcoeff256);
5541         break;
5542     case H_ADST:
5543         load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
5544         for (int32_t i = 0; i < 2; i++)
5545             fidtx4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
5546         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5547         fadst16x4_avx2(outcoeff256, in, bitrow);
5548         transpose_4x16_avx2(in, outcoeff256);
5549         break;
5550     case V_FLIPADST:
5551         load_buffer_16x4_avx2(input, in, stride, 1, 0, shift[0]);
5552         for (int32_t i = 0; i < 2; i++)
5553             fadst4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
5554         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5555         fidtx16x8_avx2(outcoeff256, in, bitrow, 1);
5556         transpose_4x16_avx2(in, outcoeff256);
5557         break;
5558     case H_FLIPADST:
5559         load_buffer_16x4_avx2(input, in, stride, 0, 1, shift[0]);
5560         for (int32_t i = 0; i < 2; i++)
5561             fidtx4x8_row_avx2(in + i * 4, outcoeff256 + i * 4, bitcol, 1);
5562         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
5563         fadst16x4_avx2(outcoeff256, in, bitrow);
5564         transpose_4x16_avx2(in, outcoeff256);
5565         break;
5566     default: assert(0);
5567     }
5568     (void)bd;
5569 }
5570 
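/*
*    Transpose top left block of size 16x16 in 64x64 block
*/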
5571 static INLINE void transpose_16x16_in_64x64_avx2(const __m256i *in, __m256i *out) {
5572     __m256i temp[32];
5573     TRANSPOSE_4X4_AVX2(in[0], in[8], in[16], in[24], temp[0], temp[2], temp[4], temp[6]);
5574     TRANSPOSE_4X4_AVX2(in[32], in[40], in[48], in[56], temp[17], temp[19], temp[21], temp[23]);
5575     TRANSPOSE_4X4_AVX2(in[1], in[9], in[17], in[25], temp[16], temp[18], temp[20], temp[22]);
5576     TRANSPOSE_4X4_AVX2(in[33], in[41], in[49], in[57], temp[25], temp[27], temp[29], temp[31]);
5577 
5578     TRANSPOSE_4X4_AVX2(in[64], in[72], in[80], in[88], temp[1], temp[3], temp[5], temp[7]);
5579     TRANSPOSE_4X4_AVX2(in[96], in[104], in[112], in[120], temp[9], temp[11], temp[13], temp[15]);
5580     TRANSPOSE_4X4_AVX2(in[65], in[73], in[81], in[89], temp[8], temp[10], temp[12], temp[14]);
5581     TRANSPOSE_4X4_AVX2(in[97], in[105], in[113], in[121], temp[24], temp[26], temp[28], temp[30]);
5582 
5583     out[0]   = _mm256_permute2x128_si256(temp[0], temp[17], 0x20);
5584     out[1]   = _mm256_permute2x128_si256(temp[1], temp[9], 0x20);
5585     out[8]   = _mm256_permute2x128_si256(temp[2], temp[19], 0x20);
5586     out[9]   = _mm256_permute2x128_si256(temp[3], temp[11], 0x20);
5587     out[16]  = _mm256_permute2x128_si256(temp[4], temp[21], 0x20);
5588     out[17]  = _mm256_permute2x128_si256(temp[5], temp[13], 0x20);
5589     out[24]  = _mm256_permute2x128_si256(temp[6], temp[23], 0x20);
5590     out[25]  = _mm256_permute2x128_si256(temp[7], temp[15], 0x20);
5591     out[32]  = _mm256_permute2x128_si256(temp[0], temp[17], 0x31);
5592     out[33]  = _mm256_permute2x128_si256(temp[1], temp[9], 0x31);
5593     out[40]  = _mm256_permute2x128_si256(temp[2], temp[19], 0x31);
5594     out[41]  = _mm256_permute2x128_si256(temp[3], temp[11], 0x31);
5595     out[48]  = _mm256_permute2x128_si256(temp[4], temp[21], 0x31);
5596     out[49]  = _mm256_permute2x128_si256(temp[5], temp[13], 0x31);
5597     out[56]  = _mm256_permute2x128_si256(temp[6], temp[23], 0x31);
5598     out[57]  = _mm256_permute2x128_si256(temp[7], temp[15], 0x31);
5599     out[64]  = _mm256_permute2x128_si256(temp[16], temp[25], 0x20);
5600     out[65]  = _mm256_permute2x128_si256(temp[8], temp[24], 0x20);
5601     out[72]  = _mm256_permute2x128_si256(temp[18], temp[27], 0x20);
5602     out[73]  = _mm256_permute2x128_si256(temp[10], temp[26], 0x20);
5603     out[80]  = _mm256_permute2x128_si256(temp[20], temp[29], 0x20);
5604     out[81]  = _mm256_permute2x128_si256(temp[12], temp[28], 0x20);
5605     out[88]  = _mm256_permute2x128_si256(temp[22], temp[31], 0x20);
5606     out[89]  = _mm256_permute2x128_si256(temp[14], temp[30], 0x20);
5607     out[96]  = _mm256_permute2x128_si256(temp[16], temp[25], 0x31);
5608     out[97]  = _mm256_permute2x128_si256(temp[8], temp[24], 0x31);
5609     out[104] = _mm256_permute2x128_si256(temp[18], temp[27], 0x31);
5610     out[105] = _mm256_permute2x128_si256(temp[10], temp[26], 0x31);
5611     out[112] = _mm256_permute2x128_si256(temp[20], temp[29], 0x31);
5612     out[113] = _mm256_permute2x128_si256(temp[12], temp[28], 0x31);
5613     out[120] = _mm256_permute2x128_si256(temp[22], temp[31], 0x31);
5614     out[121] = _mm256_permute2x128_si256(temp[14], temp[30], 0x31);
5615 }
5616 
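/*
*    Transpose top left block of size 32x32 in 64x64 block (built from four 16x16 transposes)
*/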
5617 static AOM_FORCE_INLINE void transpose_32x32_in_64x64_avx2(const __m256i *in, __m256i *out) {
5618     transpose_16x16_in_64x64_avx2(in, out); //top-left
5619     transpose_16x16_in_64x64_avx2(in + 2, out + 128); //top-right
5620     transpose_16x16_in_64x64_avx2(in + 128, out + 2); //bottom-left
5621     transpose_16x16_in_64x64_avx2(in + 130, out + 130); //bottom-right
5622 }
5623 
5624 /*
5625 *    Transpose top left block of size 16x16 in 32x32 block
5626 */
5627 static INLINE void transpose_16x16_in_32x32_avx2(const __m256i *in, __m256i *out) {
5628     __m256i temp[32];
5629     TRANSPOSE_4X4_AVX2(in[0], in[4], in[8], in[12], temp[0], temp[2], temp[4], temp[6]);
5630     TRANSPOSE_4X4_AVX2(in[16], in[20], in[24], in[28], temp[17], temp[19], temp[21], temp[23]);
5631     TRANSPOSE_4X4_AVX2(in[1], in[5], in[9], in[13], temp[16], temp[18], temp[20], temp[22]);
5632     TRANSPOSE_4X4_AVX2(in[17], in[21], in[25], in[29], temp[25], temp[27], temp[29], temp[31]);
5633 
5634     TRANSPOSE_4X4_AVX2(in[32], in[36], in[40], in[44], temp[1], temp[3], temp[5], temp[7]);
5635     TRANSPOSE_4X4_AVX2(in[48], in[52], in[56], in[60], temp[9], temp[11], temp[13], temp[15]);
5636     TRANSPOSE_4X4_AVX2(in[33], in[37], in[41], in[45], temp[8], temp[10], temp[12], temp[14]);
5637     TRANSPOSE_4X4_AVX2(in[49], in[53], in[57], in[61], temp[24], temp[26], temp[28], temp[30]);
5638 
5639     out[0]  = _mm256_permute2x128_si256(temp[0], temp[17], 0x20);
5640     out[1]  = _mm256_permute2x128_si256(temp[1], temp[9], 0x20);
5641     out[4]  = _mm256_permute2x128_si256(temp[2], temp[19], 0x20);
5642     out[5]  = _mm256_permute2x128_si256(temp[3], temp[11], 0x20);
5643     out[8]  = _mm256_permute2x128_si256(temp[4], temp[21], 0x20);
5644     out[9]  = _mm256_permute2x128_si256(temp[5], temp[13], 0x20);
5645     out[12] = _mm256_permute2x128_si256(temp[6], temp[23], 0x20);
5646     out[13] = _mm256_permute2x128_si256(temp[7], temp[15], 0x20);
5647     out[16] = _mm256_permute2x128_si256(temp[0], temp[17], 0x31);
5648     out[17] = _mm256_permute2x128_si256(temp[1], temp[9], 0x31);
5649     out[20] = _mm256_permute2x128_si256(temp[2], temp[19], 0x31);
5650     out[21] = _mm256_permute2x128_si256(temp[3], temp[11], 0x31);
5651     out[24] = _mm256_permute2x128_si256(temp[4], temp[21], 0x31);
5652     out[25] = _mm256_permute2x128_si256(temp[5], temp[13], 0x31);
5653     out[28] = _mm256_permute2x128_si256(temp[6], temp[23], 0x31);
5654     out[29] = _mm256_permute2x128_si256(temp[7], temp[15], 0x31);
5655     out[32] = _mm256_permute2x128_si256(temp[16], temp[25], 0x20);
5656     out[33] = _mm256_permute2x128_si256(temp[8], temp[24], 0x20);
5657     out[36] = _mm256_permute2x128_si256(temp[18], temp[27], 0x20);
5658     out[37] = _mm256_permute2x128_si256(temp[10], temp[26], 0x20);
5659     out[40] = _mm256_permute2x128_si256(temp[20], temp[29], 0x20);
5660     out[41] = _mm256_permute2x128_si256(temp[12], temp[28], 0x20);
5661     out[44] = _mm256_permute2x128_si256(temp[22], temp[31], 0x20);
5662     out[45] = _mm256_permute2x128_si256(temp[14], temp[30], 0x20);
5663     out[48] = _mm256_permute2x128_si256(temp[16], temp[25], 0x31);
5664     out[49] = _mm256_permute2x128_si256(temp[8], temp[24], 0x31);
5665     out[52] = _mm256_permute2x128_si256(temp[18], temp[27], 0x31);
5666     out[53] = _mm256_permute2x128_si256(temp[10], temp[26], 0x31);
5667     out[56] = _mm256_permute2x128_si256(temp[20], temp[29], 0x31);
5668     out[57] = _mm256_permute2x128_si256(temp[12], temp[28], 0x31);
5669     out[60] = _mm256_permute2x128_si256(temp[22], temp[31], 0x31);
5670     out[61] = _mm256_permute2x128_si256(temp[14], temp[30], 0x31);
5671 }
5672 
5673 /*
5674 *    Transpose top left block of size 8x8 in 16x16 block
5675 */
5676 static INLINE void transpose_8x8_in_16x16_avx2(const __m256i *in, __m256i *out) {
5677     __m256i out1[8];
5678     TRANSPOSE_4X4_AVX2(in[0], in[2], in[4], in[6], out1[0], out1[1], out1[4], out1[5]);
5679     TRANSPOSE_4X4_AVX2(in[8], in[10], in[12], in[14], out1[2], out1[3], out1[6], out1[7]);
5680 
5681     out[0]  = _mm256_permute2x128_si256(out1[0], out1[2], 0x20);
5682     out[2]  = _mm256_permute2x128_si256(out1[1], out1[3], 0x20);
5683     out[4]  = _mm256_permute2x128_si256(out1[4], out1[6], 0x20);
5684     out[6]  = _mm256_permute2x128_si256(out1[5], out1[7], 0x20);
5685     out[8]  = _mm256_permute2x128_si256(out1[0], out1[2], 0x31);
5686     out[10] = _mm256_permute2x128_si256(out1[1], out1[3], 0x31);
5687     out[12] = _mm256_permute2x128_si256(out1[4], out1[6], 0x31);
5688     out[14] = _mm256_permute2x128_si256(out1[5], out1[7], 0x31);
5689 }
5690 
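/*
*    Transpose the first 4 rows of an 8x8 block: each output register carries one
*    transposed column in its low 128 bits; the high lanes of out[4..7] are zeroed
*/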
5691 static INLINE void transpose_8x8_half_avx2(const __m256i *in, __m256i *out) {
5692     const __m256i zero = _mm256_setzero_si256();
5693     TRANSPOSE_4X4_AVX2(in[0], in[1], in[2], in[3], out[0], out[1], out[2], out[3]);
5694     out[4] = _mm256_permute2x128_si256(out[0], zero, 0x31);
5695     out[5] = _mm256_permute2x128_si256(out[1], zero, 0x31);
5696     out[6] = _mm256_permute2x128_si256(out[2], zero, 0x31);
5697     out[7] = _mm256_permute2x128_si256(out[3], zero, 0x31);
5698 }
5699 
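/*
*    Transpose top left block of size 4x4 in 8x8 block (N2: only this quarter is kept)
*/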
5700 static AOM_FORCE_INLINE void transpose_8x8_N2_avx2(const __m256i *in, __m256i *out) {
5701     TRANSPOSE_4X4_AVX2(in[0], in[1], in[2], in[3], out[0], out[1], out[2], out[3]);
5702 }
5703 
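/*
*    Transpose a 4x8 block into the 4x16 layout: results go to out[0], out[2],
*    out[4], out[6], with a cross-lane permute interleaving the two half-columns
*    of each register
*/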
5704 static INLINE void transpose_4x8_in_4x16_avx2(const __m256i *in, __m256i *out) {
5705     __m256i perm = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
5706 
5707     TRANSPOSE_4X4_AVX2(in[0], in[1], in[2], in[3], out[0], out[2], out[4], out[6]);
5708     out[0] = _mm256_permutevar8x32_epi32(out[0], perm);
5709     out[2] = _mm256_permutevar8x32_epi32(out[2], perm);
5710     out[4] = _mm256_permutevar8x32_epi32(out[4], perm);
5711     out[6] = _mm256_permutevar8x32_epi32(out[6], perm);
5712 }
5713 
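/*
*    Half variant of transpose_4x8_in_4x16_avx2: only out[0] and out[2] are computed
*/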
5714 static AOM_FORCE_INLINE void transpose_4x8_in_4x16_half_avx2(const __m256i *in, __m256i *out) {
5715     __m256i perm = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
5716     __m256i u0, u1;
5717 
5718     u0     = _mm256_unpacklo_epi32(in[0], in[1]);
5719     u1     = _mm256_unpacklo_epi32(in[2], in[3]);
5720     out[0] = _mm256_unpacklo_epi64(u0, u1);
5721     out[2] = _mm256_unpackhi_epi64(u0, u1);
5722     out[0] = _mm256_permutevar8x32_epi32(out[0], perm);
5723     out[2] = _mm256_permutevar8x32_epi32(out[2], perm);
5724 }
5725 
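/*
*    Load 4 rows of 8 int16 samples (optionally flipped), sign-extend to 32 bit
*    and pre-shift by 'shift'; rows are stored every 'step' registers of 'in'
*/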
5726 static INLINE void load_buffer_4x8_in_8x8(const int16_t *input, __m256i *in, int32_t stride,
5727                                           int32_t flipud, int32_t fliplr, int32_t shift,
5728                                           int32_t step) {
5729     __m128i temp[4];
5730     if (!flipud) {
5731         temp[0] = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
5732         temp[1] = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
5733         temp[2] = _mm_loadu_si128((const __m128i *)(input + 2 * stride));
5734         temp[3] = _mm_loadu_si128((const __m128i *)(input + 3 * stride));
5735     } else {
5736         temp[0] = _mm_loadu_si128((const __m128i *)(input + 7 * stride));
5737         temp[1] = _mm_loadu_si128((const __m128i *)(input + 6 * stride));
5738         temp[2] = _mm_loadu_si128((const __m128i *)(input + 5 * stride));
5739         temp[3] = _mm_loadu_si128((const __m128i *)(input + 4 * stride));
5740     }
5741 
5742     if (fliplr) {
5743         temp[0] = mm_reverse_epi16(temp[0]);
5744         temp[1] = mm_reverse_epi16(temp[1]);
5745         temp[2] = mm_reverse_epi16(temp[2]);
5746         temp[3] = mm_reverse_epi16(temp[3]);
5747     }
5748 
5749     in[0 * step] = _mm256_cvtepi16_epi32(temp[0]);
5750     in[1 * step] = _mm256_cvtepi16_epi32(temp[1]);
5751     in[2 * step] = _mm256_cvtepi16_epi32(temp[2]);
5752     in[3 * step] = _mm256_cvtepi16_epi32(temp[3]);
5753 
5754     in[0 * step] = _mm256_slli_epi32(in[0 * step], shift);
5755     in[1 * step] = _mm256_slli_epi32(in[1 * step], shift);
5756     in[2 * step] = _mm256_slli_epi32(in[2 * step], shift);
5757     in[3 * step] = _mm256_slli_epi32(in[3 * step], shift);
5758 }
5759 
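/*
*    Load the first 4 rows of a 16x16 block as two 4x8 halves, swapping the
*    source pointers first when flipud/fliplr are set
*/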
5760 static AOM_FORCE_INLINE void load_buffer_4x16_in_16x16(const int16_t *input, __m256i *out,
5761                                                        int32_t stride, int32_t flipud,
5762                                                        int32_t fliplr, int32_t shift) {
5763     // Load 2 4x8 blocks
5764     const int16_t *top_l = input;
5765     const int16_t *top_r = input + 8;
5766     const int16_t *bot_l = input + 8 * stride;
5767     const int16_t *bot_r = input + 8 * stride + 8;
5768 
5769     const int16_t *tmp;
5770 
5771     if (flipud) {
5772         // Swap left columns
5773         top_l = bot_l;
5774         // Swap right columns
5775         top_r = bot_r;
5776     }
5777 
5778     if (fliplr) {
5779         // Swap top rows
5780         tmp   = top_l;
5781         top_l = top_r;
5782         top_r = tmp;
5783     }
5784 
5785     // load first 4 columns
5786     load_buffer_4x8_in_8x8(top_l, &out[0], stride, flipud, fliplr, shift, 2);
5787 
5788     // load second 4 columns
5789     load_buffer_4x8_in_8x8(top_r, &out[1], stride, flipud, fliplr, shift, 2);
5790 }
5791 
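/*
*    Write the 4x4 top-left result and zero the rest of the 8x8 output block
*/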
5792 static AOM_FORCE_INLINE void write_buffer_8x8_N2(const __m256i *res, int32_t *output) {
5793     const __m256i zero256 = _mm256_setzero_si256();
5794     const __m128i zero128 = _mm_setzero_si128();
5795 
5796     _mm_storeu_si128((__m128i *)(output + 0 * 8), _mm256_castsi256_si128(res[0]));
5797     _mm_storeu_si128((__m128i *)(output + 1 * 8), _mm256_castsi256_si128(res[1]));
5798     _mm_storeu_si128((__m128i *)(output + 2 * 8), _mm256_castsi256_si128(res[2]));
5799     _mm_storeu_si128((__m128i *)(output + 3 * 8), _mm256_castsi256_si128(res[3]));
5800 
5801     _mm_storeu_si128((__m128i *)(output + 0 * 8 + 4), zero128);
5802     _mm_storeu_si128((__m128i *)(output + 1 * 8 + 4), zero128);
5803     _mm_storeu_si128((__m128i *)(output + 2 * 8 + 4), zero128);
5804     _mm_storeu_si128((__m128i *)(output + 3 * 8 + 4), zero128);
5805 
5806     _mm256_storeu_si256((__m256i *)(output + 4 * 8), zero256);
5807     _mm256_storeu_si256((__m256i *)(output + 5 * 8), zero256);
5808     _mm256_storeu_si256((__m256i *)(output + 6 * 8), zero256);
5809     _mm256_storeu_si256((__m256i *)(output + 7 * 8), zero256);
5810 }
5811 
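/*
*    Forward 8-point DCT, N2 variant: only the first four outputs are computed
*/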
5812 static void fdct8x8_N2_avx2(const __m256i *in, __m256i *out, int8_t bit, const int32_t col_num) {
5813     const int32_t *cospi    = cospi_arr(bit);
5814     const __m256i  cospi32  = _mm256_set1_epi32(cospi[32]);
5815     const __m256i  cospim32 = _mm256_set1_epi32(-cospi[32]);
5816     const __m256i  cospi48  = _mm256_set1_epi32(cospi[48]);
5817     const __m256i  cospi16  = _mm256_set1_epi32(cospi[16]);
5818     const __m256i  cospi56  = _mm256_set1_epi32(cospi[56]);
5819     const __m256i  cospi8   = _mm256_set1_epi32(cospi[8]);
5820     const __m256i  cospi24  = _mm256_set1_epi32(cospi[24]);
5821     const __m256i  cospi40  = _mm256_set1_epi32(cospi[40]);
5822     const __m256i  rnding   = _mm256_set1_epi32(1 << (bit - 1));
5823     __m256i        u[8], v[8];
5824 
5825     // stage 0
5826     // stage 1
5827     u[0] = _mm256_add_epi32(in[0 * col_num], in[7 * col_num]);
5828     v[7] = _mm256_sub_epi32(in[0 * col_num], in[7 * col_num]);
5829     u[1] = _mm256_add_epi32(in[1 * col_num], in[6 * col_num]);
5830     u[6] = _mm256_sub_epi32(in[1 * col_num], in[6 * col_num]);
5831     u[2] = _mm256_add_epi32(in[2 * col_num], in[5 * col_num]);
5832     u[5] = _mm256_sub_epi32(in[2 * col_num], in[5 * col_num]);
5833     u[3] = _mm256_add_epi32(in[3 * col_num], in[4 * col_num]);
5834     v[4] = _mm256_sub_epi32(in[3 * col_num], in[4 * col_num]);
5835 
5836     // stage 2
5837     v[0] = _mm256_add_epi32(u[0], u[3]);
5838     v[3] = _mm256_sub_epi32(u[0], u[3]);
5839     v[1] = _mm256_add_epi32(u[1], u[2]);
5840     v[2] = _mm256_sub_epi32(u[1], u[2]);
5841 
5842     v[5] = _mm256_mullo_epi32(u[5], cospim32);
5843     v[6] = _mm256_mullo_epi32(u[6], cospi32);
5844     v[5] = _mm256_add_epi32(v[5], v[6]);
5845     v[5] = _mm256_add_epi32(v[5], rnding);
5846     v[5] = _mm256_srai_epi32(v[5], bit);
5847 
5848     u[0] = _mm256_mullo_epi32(u[5], cospi32);
5849     v[6] = _mm256_mullo_epi32(u[6], cospim32);
5850     v[6] = _mm256_sub_epi32(u[0], v[6]);
5851     v[6] = _mm256_add_epi32(v[6], rnding);
5852     v[6] = _mm256_srai_epi32(v[6], bit);
5853 
5854     // stage 3
5855     // type 0
5856     v[0] = _mm256_mullo_epi32(v[0], cospi32);
5857     v[1] = _mm256_mullo_epi32(v[1], cospi32);
5858     u[0] = _mm256_add_epi32(v[0], v[1]);
5859     u[0] = _mm256_add_epi32(u[0], rnding);
5860     u[0] = _mm256_srai_epi32(u[0], bit);
5861 
5862     // type 1
5863     v[0] = _mm256_mullo_epi32(v[2], cospi48);
5864     v[1] = _mm256_mullo_epi32(v[3], cospi16);
5865     u[2] = _mm256_add_epi32(v[0], v[1]);
5866     u[2] = _mm256_add_epi32(u[2], rnding);
5867     u[2] = _mm256_srai_epi32(u[2], bit);
5868 
5869     u[4] = _mm256_add_epi32(v[4], v[5]);
5870     u[5] = _mm256_sub_epi32(v[4], v[5]);
5871     u[6] = _mm256_sub_epi32(v[7], v[6]);
5872     u[7] = _mm256_add_epi32(v[7], v[6]);
5873 
5874     // stage 4
5875     // stage 5
5876     v[0]             = _mm256_mullo_epi32(u[4], cospi56);
5877     v[1]             = _mm256_mullo_epi32(u[7], cospi8);
5878     v[0]             = _mm256_add_epi32(v[0], v[1]);
5879     v[0]             = _mm256_add_epi32(v[0], rnding);
5880     out[1 * col_num] = _mm256_srai_epi32(v[0], bit);
5881 
5882     v[0]             = _mm256_mullo_epi32(u[5], cospi40);
5883     v[1]             = _mm256_mullo_epi32(u[6], cospi24);
5884     v[0]             = _mm256_sub_epi32(v[1], v[0]);
5885     v[0]             = _mm256_add_epi32(v[0], rnding);
5886     out[3 * col_num] = _mm256_srai_epi32(v[0], bit);
5887 
5888     out[0 * col_num] = u[0];
5889     out[2 * col_num] = u[2];
5890 }
5891 
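/*
*    Forward 8-point ADST, N2 variant: only the first four outputs are computed
*/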
5892 static void fadst8x8_N2_avx2(const __m256i *in, __m256i *out, int8_t bit, const int32_t col_num) {
5893     const int32_t *cospi    = cospi_arr(bit);
5894     const __m256i  cospi32  = _mm256_set1_epi32(cospi[32]);
5895     const __m256i  cospi16  = _mm256_set1_epi32(cospi[16]);
5896     const __m256i  cospim16 = _mm256_set1_epi32(-cospi[16]);
5897     const __m256i  cospi48  = _mm256_set1_epi32(cospi[48]);
5898     const __m256i  cospim48 = _mm256_set1_epi32(-cospi[48]);
5899     const __m256i  cospim4  = _mm256_set1_epi32(-cospi[4]);
5900     const __m256i  cospi60  = _mm256_set1_epi32(cospi[60]);
5901     const __m256i  cospim20 = _mm256_set1_epi32(-cospi[20]);
5902     const __m256i  cospi44  = _mm256_set1_epi32(cospi[44]);
5903     const __m256i  cospi28  = _mm256_set1_epi32(cospi[28]);
5904     const __m256i  cospi36  = _mm256_set1_epi32(cospi[36]);
5905     const __m256i  cospi52  = _mm256_set1_epi32(cospi[52]);
5906     const __m256i  cospi12  = _mm256_set1_epi32(cospi[12]);
5907     const __m256i  rnding   = _mm256_set1_epi32(1 << (bit - 1));
5908     const __m256i  zero     = _mm256_setzero_si256();
5909     __m256i        u0, u1, u2, u3, u4, u5, u6, u7;
5910     __m256i        v0, v1, v2, v3, v4, v5, v6, v7;
5911     __m256i        x, y;
5912 
5913     u0 = in[0 * col_num];
5914     u1 = _mm256_sub_epi32(zero, in[7 * col_num]);
5915     u2 = _mm256_sub_epi32(zero, in[3 * col_num]);
5916     u3 = in[4 * col_num];
5917     u4 = _mm256_sub_epi32(zero, in[1 * col_num]);
5918     u5 = in[6 * col_num];
5919     u6 = in[2 * col_num];
5920     u7 = _mm256_sub_epi32(zero, in[5 * col_num]);
5921 
5922     // stage 2
5923     v0 = u0;
5924     v1 = u1;
5925 
5926     x  = _mm256_mullo_epi32(u2, cospi32);
5927     y  = _mm256_mullo_epi32(u3, cospi32);
5928     v2 = _mm256_add_epi32(x, y);
5929     v2 = _mm256_add_epi32(v2, rnding);
5930     v2 = _mm256_srai_epi32(v2, bit);
5931 
5932     v3 = _mm256_sub_epi32(x, y);
5933     v3 = _mm256_add_epi32(v3, rnding);
5934     v3 = _mm256_srai_epi32(v3, bit);
5935 
5936     v4 = u4;
5937     v5 = u5;
5938 
5939     x  = _mm256_mullo_epi32(u6, cospi32);
5940     y  = _mm256_mullo_epi32(u7, cospi32);
5941     v6 = _mm256_add_epi32(x, y);
5942     v6 = _mm256_add_epi32(v6, rnding);
5943     v6 = _mm256_srai_epi32(v6, bit);
5944 
5945     v7 = _mm256_sub_epi32(x, y);
5946     v7 = _mm256_add_epi32(v7, rnding);
5947     v7 = _mm256_srai_epi32(v7, bit);
5948 
5949     // stage 3
5950     u0 = _mm256_add_epi32(v0, v2);
5951     u1 = _mm256_add_epi32(v1, v3);
5952     u2 = _mm256_sub_epi32(v0, v2);
5953     u3 = _mm256_sub_epi32(v1, v3);
5954     u4 = _mm256_add_epi32(v4, v6);
5955     u5 = _mm256_add_epi32(v5, v7);
5956     u6 = _mm256_sub_epi32(v4, v6);
5957     u7 = _mm256_sub_epi32(v5, v7);
5958 
5959     // stage 4
5960     v0 = u0;
5961     v1 = u1;
5962     v2 = u2;
5963     v3 = u3;
5964 
5965     x  = _mm256_mullo_epi32(u4, cospi16);
5966     y  = _mm256_mullo_epi32(u5, cospi48);
5967     v4 = _mm256_add_epi32(x, y);
5968     v4 = _mm256_add_epi32(v4, rnding);
5969     v4 = _mm256_srai_epi32(v4, bit);
5970 
5971     x  = _mm256_mullo_epi32(u4, cospi48);
5972     y  = _mm256_mullo_epi32(u5, cospim16);
5973     v5 = _mm256_add_epi32(x, y);
5974     v5 = _mm256_add_epi32(v5, rnding);
5975     v5 = _mm256_srai_epi32(v5, bit);
5976 
5977     x  = _mm256_mullo_epi32(u6, cospim48);
5978     y  = _mm256_mullo_epi32(u7, cospi16);
5979     v6 = _mm256_add_epi32(x, y);
5980     v6 = _mm256_add_epi32(v6, rnding);
5981     v6 = _mm256_srai_epi32(v6, bit);
5982 
5983     x  = _mm256_mullo_epi32(u6, cospi16);
5984     y  = _mm256_mullo_epi32(u7, cospi48);
5985     v7 = _mm256_add_epi32(x, y);
5986     v7 = _mm256_add_epi32(v7, rnding);
5987     v7 = _mm256_srai_epi32(v7, bit);
5988 
5989     // stage 5
5990     u0 = _mm256_add_epi32(v0, v4);
5991     u1 = _mm256_add_epi32(v1, v5);
5992     u2 = _mm256_add_epi32(v2, v6);
5993     u3 = _mm256_add_epi32(v3, v7);
5994     u4 = _mm256_sub_epi32(v0, v4);
5995     u5 = _mm256_sub_epi32(v1, v5);
5996     u6 = _mm256_sub_epi32(v2, v6);
5997     u7 = _mm256_sub_epi32(v3, v7);
5998 
5999     // stage 6
6000     x  = _mm256_mullo_epi32(u0, cospi60);
6001     y  = _mm256_mullo_epi32(u1, cospim4);
6002     v1 = _mm256_add_epi32(x, y);
6003     v1 = _mm256_add_epi32(v1, rnding);
6004     v1 = _mm256_srai_epi32(v1, bit);
6005 
6006     x  = _mm256_mullo_epi32(u2, cospi44);
6007     y  = _mm256_mullo_epi32(u3, cospim20);
6008     v3 = _mm256_add_epi32(x, y);
6009     v3 = _mm256_add_epi32(v3, rnding);
6010     v3 = _mm256_srai_epi32(v3, bit);
6011 
6012     x  = _mm256_mullo_epi32(u4, cospi36);
6013     y  = _mm256_mullo_epi32(u5, cospi28);
6014     v4 = _mm256_add_epi32(x, y);
6015     v4 = _mm256_add_epi32(v4, rnding);
6016     v4 = _mm256_srai_epi32(v4, bit);
6017 
6018     x  = _mm256_mullo_epi32(u6, cospi52);
6019     y  = _mm256_mullo_epi32(u7, cospi12);
6020     v6 = _mm256_add_epi32(x, y);
6021     v6 = _mm256_add_epi32(v6, rnding);
6022     v6 = _mm256_srai_epi32(v6, bit);
6023 
6024     // stage 7
6025     out[0 * col_num] = v1;
6026     out[1 * col_num] = v6;
6027     out[2 * col_num] = v3;
6028     out[3 * col_num] = v4;
6029 }
6030 
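/*
*    Identity transform for 8x8, N2 variant: the first four rows are scaled by 2
*/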
6031 static void fidtx8x8_N2_avx2(const __m256i *in, __m256i *out, int8_t bit, int32_t col_num) {
6032     (void)bit;
6033     out[0] = _mm256_slli_epi32(in[0 * col_num], 1);
6034     out[1] = _mm256_slli_epi32(in[1 * col_num], 1);
6035     out[2] = _mm256_slli_epi32(in[2 * col_num], 1);
6036     out[3] = _mm256_slli_epi32(in[3 * col_num], 1);
6037 }
6038 
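/*
*    Round-shift the four registers kept by the N2 8x8 column transform
*/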
6039 static AOM_FORCE_INLINE void col_txfm_8x8_N2_rounding(__m256i *in, int32_t shift) {
6040     const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
6041 
6042     in[0] = _mm256_add_epi32(in[0], rounding);
6043     in[1] = _mm256_add_epi32(in[1], rounding);
6044     in[2] = _mm256_add_epi32(in[2], rounding);
6045     in[3] = _mm256_add_epi32(in[3], rounding);
6046 
6047     in[0] = _mm256_srai_epi32(in[0], shift);
6048     in[1] = _mm256_srai_epi32(in[1], shift);
6049     in[2] = _mm256_srai_epi32(in[2], shift);
6050     in[3] = _mm256_srai_epi32(in[3], shift);
6051 }
6052 
6053 static AOM_FORCE_INLINE void col_txfm_16x16_N2_rounding(__m256i *in, int32_t shift) {
6054     col_txfm_8x8_rounding(&in[0], shift);
6055     col_txfm_8x8_rounding(&in[8], shift);
6056 }
6057 
6058 static AOM_FORCE_INLINE void col_txfm_32x8_N2_half_rounding(__m256i *in, int32_t shift) {
6059     const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
6060     in[0]                  = _mm256_add_epi32(in[0], rounding);
6061     in[1]                  = _mm256_add_epi32(in[1], rounding);
6062     in[4]                  = _mm256_add_epi32(in[4], rounding);
6063     in[5]                  = _mm256_add_epi32(in[5], rounding);
6064     in[0]                  = _mm256_srai_epi32(in[0], shift);
6065     in[1]                  = _mm256_srai_epi32(in[1], shift);
6066     in[4]                  = _mm256_srai_epi32(in[4], shift);
6067     in[5]                  = _mm256_srai_epi32(in[5], shift);
6068     in[8]                  = _mm256_add_epi32(in[8], rounding);
6069     in[9]                  = _mm256_add_epi32(in[9], rounding);
6070     in[12]                 = _mm256_add_epi32(in[12], rounding);
6071     in[13]                 = _mm256_add_epi32(in[13], rounding);
6072     in[8]                  = _mm256_srai_epi32(in[8], shift);
6073     in[9]                  = _mm256_srai_epi32(in[9], shift);
6074     in[12]                 = _mm256_srai_epi32(in[12], shift);
6075     in[13]                 = _mm256_srai_epi32(in[13], shift);
6076 }
6077 
6078 static AOM_FORCE_INLINE void col_txfm_16x16_N2_half_rounding(__m256i *in, int32_t shift) {
6079     const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
6080     in[0]                  = _mm256_add_epi32(in[0], rounding);
6081     in[2]                  = _mm256_add_epi32(in[2], rounding);
6082     in[4]                  = _mm256_add_epi32(in[4], rounding);
6083     in[6]                  = _mm256_add_epi32(in[6], rounding);
6084     in[0]                  = _mm256_srai_epi32(in[0], shift);
6085     in[2]                  = _mm256_srai_epi32(in[2], shift);
6086     in[4]                  = _mm256_srai_epi32(in[4], shift);
6087     in[6]                  = _mm256_srai_epi32(in[6], shift);
6088     in[8]                  = _mm256_add_epi32(in[8], rounding);
6089     in[10]                 = _mm256_add_epi32(in[10], rounding);
6090     in[12]                 = _mm256_add_epi32(in[12], rounding);
6091     in[14]                 = _mm256_add_epi32(in[14], rounding);
6092     in[8]                  = _mm256_srai_epi32(in[8], shift);
6093     in[10]                 = _mm256_srai_epi32(in[10], shift);
6094     in[12]                 = _mm256_srai_epi32(in[12], shift);
6095     in[14]                 = _mm256_srai_epi32(in[14], shift);
6096 }
6097 
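/*
*    Write the top-left 8x8 quarter of a 16x16 result (taken from the even-indexed
*    registers) and zero the remaining three quarters
*/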
6098 static AOM_FORCE_INLINE void write_buffer_16x16_N2(const __m256i *res, int32_t *output) {
6099     const __m256i zero = _mm256_setzero_si256();
6100     int32_t       i;
6101     for (i = 0; i < 4; i++) {
6102         _mm256_storeu_si256((__m256i *)(output + i * 32), res[i * 4]);
6103         _mm256_storeu_si256((__m256i *)(output + i * 32 + 8), zero);
6104         _mm256_storeu_si256((__m256i *)(output + i * 32 + 16), res[i * 4 + 2]);
6105         _mm256_storeu_si256((__m256i *)(output + i * 32 + 24), zero);
6106     }
6107     for (; i < 8; i++) {
6108         _mm256_storeu_si256((__m256i *)(output + i * 32), zero);
6109         _mm256_storeu_si256((__m256i *)(output + i * 32 + 8), zero);
6110         _mm256_storeu_si256((__m256i *)(output + i * 32 + 16), zero);
6111         _mm256_storeu_si256((__m256i *)(output + i * 32 + 24), zero);
6112     }
6113 }
6114 
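/*
*    Forward 16-point DCT, N2 variant: only the first eight outputs per column are computed
*/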
6115 static void fdct16x16_N2_avx2(const __m256i *in, __m256i *out, int8_t bit, const int32_t col_num,
6116                               int32_t size) {
6117     const int32_t *cospi    = cospi_arr(bit);
6118     const __m256i  cospi32  = _mm256_set1_epi32(cospi[32]);
6119     const __m256i  cospim32 = _mm256_set1_epi32(-cospi[32]);
6120     const __m256i  cospi48  = _mm256_set1_epi32(cospi[48]);
6121     const __m256i  cospi16  = _mm256_set1_epi32(cospi[16]);
6122     const __m256i  cospim48 = _mm256_set1_epi32(-cospi[48]);
6123     const __m256i  cospim16 = _mm256_set1_epi32(-cospi[16]);
6124     const __m256i  cospi56  = _mm256_set1_epi32(cospi[56]);
6125     const __m256i  cospi8   = _mm256_set1_epi32(cospi[8]);
6126     const __m256i  cospi24  = _mm256_set1_epi32(cospi[24]);
6127     const __m256i  cospi40  = _mm256_set1_epi32(cospi[40]);
6128     const __m256i  cospi60  = _mm256_set1_epi32(cospi[60]);
6129     const __m256i  cospi4   = _mm256_set1_epi32(cospi[4]);
6130     const __m256i  cospi28  = _mm256_set1_epi32(cospi[28]);
6131     const __m256i  cospi36  = _mm256_set1_epi32(cospi[36]);
6132     const __m256i  cospi44  = _mm256_set1_epi32(cospi[44]);
6133     const __m256i  cospi20  = _mm256_set1_epi32(cospi[20]);
6134     const __m256i  cospi12  = _mm256_set1_epi32(cospi[12]);
6135     const __m256i  cospi52  = _mm256_set1_epi32(cospi[52]);
6136     const __m256i  rnding   = _mm256_set1_epi32(1 << (bit - 1));
6137     __m256i        u[16], v[16], x;
6138     int32_t        col;
6139 
6140     for (col = 0; col < size; ++col) {
6141         // stage 0
6142         // stage 1
6143         u[0]  = _mm256_add_epi32(in[0 * col_num + col], in[15 * col_num + col]);
6144         u[15] = _mm256_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]);
6145         u[1]  = _mm256_add_epi32(in[1 * col_num + col], in[14 * col_num + col]);
6146         u[14] = _mm256_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]);
6147         u[2]  = _mm256_add_epi32(in[2 * col_num + col], in[13 * col_num + col]);
6148         u[13] = _mm256_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]);
6149         u[3]  = _mm256_add_epi32(in[3 * col_num + col], in[12 * col_num + col]);
6150         u[12] = _mm256_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]);
6151         u[4]  = _mm256_add_epi32(in[4 * col_num + col], in[11 * col_num + col]);
6152         u[11] = _mm256_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]);
6153         u[5]  = _mm256_add_epi32(in[5 * col_num + col], in[10 * col_num + col]);
6154         u[10] = _mm256_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]);
6155         u[6]  = _mm256_add_epi32(in[6 * col_num + col], in[9 * col_num + col]);
6156         u[9]  = _mm256_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]);
6157         u[7]  = _mm256_add_epi32(in[7 * col_num + col], in[8 * col_num + col]);
6158         u[8]  = _mm256_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]);
6159 
6160         // stage 2
6161         v[0] = _mm256_add_epi32(u[0], u[7]);
6162         v[7] = _mm256_sub_epi32(u[0], u[7]);
6163         v[1] = _mm256_add_epi32(u[1], u[6]);
6164         v[6] = _mm256_sub_epi32(u[1], u[6]);
6165         v[2] = _mm256_add_epi32(u[2], u[5]);
6166         v[5] = _mm256_sub_epi32(u[2], u[5]);
6167         v[3] = _mm256_add_epi32(u[3], u[4]);
6168         v[4] = _mm256_sub_epi32(u[3], u[4]);
6169         v[8] = u[8];
6170         v[9] = u[9];
6171 
6172         v[10] = _mm256_mullo_epi32(u[10], cospim32);
6173         x     = _mm256_mullo_epi32(u[13], cospi32);
6174         v[10] = _mm256_add_epi32(v[10], x);
6175         v[10] = _mm256_add_epi32(v[10], rnding);
6176         v[10] = _mm256_srai_epi32(v[10], bit);
6177 
6178         v[13] = _mm256_mullo_epi32(u[10], cospi32);
6179         x     = _mm256_mullo_epi32(u[13], cospim32);
6180         v[13] = _mm256_sub_epi32(v[13], x);
6181         v[13] = _mm256_add_epi32(v[13], rnding);
6182         v[13] = _mm256_srai_epi32(v[13], bit);
6183 
6184         v[11] = _mm256_mullo_epi32(u[11], cospim32);
6185         x     = _mm256_mullo_epi32(u[12], cospi32);
6186         v[11] = _mm256_add_epi32(v[11], x);
6187         v[11] = _mm256_add_epi32(v[11], rnding);
6188         v[11] = _mm256_srai_epi32(v[11], bit);
6189 
6190         v[12] = _mm256_mullo_epi32(u[11], cospi32);
6191         x     = _mm256_mullo_epi32(u[12], cospim32);
6192         v[12] = _mm256_sub_epi32(v[12], x);
6193         v[12] = _mm256_add_epi32(v[12], rnding);
6194         v[12] = _mm256_srai_epi32(v[12], bit);
6195         v[14] = u[14];
6196         v[15] = u[15];
6197 
6198         // stage 3
6199         u[0] = _mm256_add_epi32(v[0], v[3]);
6200         u[3] = _mm256_sub_epi32(v[0], v[3]);
6201         u[1] = _mm256_add_epi32(v[1], v[2]);
6202         u[2] = _mm256_sub_epi32(v[1], v[2]);
6203         u[4] = v[4];
6204 
6205         u[5] = _mm256_mullo_epi32(v[5], cospim32);
6206         x    = _mm256_mullo_epi32(v[6], cospi32);
6207         u[5] = _mm256_add_epi32(u[5], x);
6208         u[5] = _mm256_add_epi32(u[5], rnding);
6209         u[5] = _mm256_srai_epi32(u[5], bit);
6210 
6211         u[6] = _mm256_mullo_epi32(v[5], cospi32);
6212         x    = _mm256_mullo_epi32(v[6], cospim32);
6213         u[6] = _mm256_sub_epi32(u[6], x);
6214         u[6] = _mm256_add_epi32(u[6], rnding);
6215         u[6] = _mm256_srai_epi32(u[6], bit);
6216 
6217         u[7]  = v[7];
6218         u[8]  = _mm256_add_epi32(v[8], v[11]);
6219         u[11] = _mm256_sub_epi32(v[8], v[11]);
6220         u[9]  = _mm256_add_epi32(v[9], v[10]);
6221         u[10] = _mm256_sub_epi32(v[9], v[10]);
6222         u[12] = _mm256_sub_epi32(v[15], v[12]);
6223         u[15] = _mm256_add_epi32(v[15], v[12]);
6224         u[13] = _mm256_sub_epi32(v[14], v[13]);
6225         u[14] = _mm256_add_epi32(v[14], v[13]);
6226 
6227         // stage 4
6228         u[0] = _mm256_mullo_epi32(u[0], cospi32);
6229         u[1] = _mm256_mullo_epi32(u[1], cospi32);
6230         v[0] = _mm256_add_epi32(u[0], u[1]);
6231         v[0] = _mm256_add_epi32(v[0], rnding);
6232         v[0] = _mm256_srai_epi32(v[0], bit);
6233 
6234         v[2] = _mm256_mullo_epi32(u[2], cospi48);
6235         x    = _mm256_mullo_epi32(u[3], cospi16);
6236         v[2] = _mm256_add_epi32(v[2], x);
6237         v[2] = _mm256_add_epi32(v[2], rnding);
6238         v[2] = _mm256_srai_epi32(v[2], bit);
6239 
6240         v[4] = _mm256_add_epi32(u[4], u[5]);
6241         v[5] = _mm256_sub_epi32(u[4], u[5]);
6242         v[6] = _mm256_sub_epi32(u[7], u[6]);
6243         v[7] = _mm256_add_epi32(u[7], u[6]);
6244         v[8] = u[8];
6245 
6246         v[9] = _mm256_mullo_epi32(u[9], cospim16);
6247         x    = _mm256_mullo_epi32(u[14], cospi48);
6248         v[9] = _mm256_add_epi32(v[9], x);
6249         v[9] = _mm256_add_epi32(v[9], rnding);
6250         v[9] = _mm256_srai_epi32(v[9], bit);
6251 
6252         v[14] = _mm256_mullo_epi32(u[9], cospi48);
6253         x     = _mm256_mullo_epi32(u[14], cospim16);
6254         v[14] = _mm256_sub_epi32(v[14], x);
6255         v[14] = _mm256_add_epi32(v[14], rnding);
6256         v[14] = _mm256_srai_epi32(v[14], bit);
6257 
6258         v[10] = _mm256_mullo_epi32(u[10], cospim48);
6259         x     = _mm256_mullo_epi32(u[13], cospim16);
6260         v[10] = _mm256_add_epi32(v[10], x);
6261         v[10] = _mm256_add_epi32(v[10], rnding);
6262         v[10] = _mm256_srai_epi32(v[10], bit);
6263 
6264         v[13] = _mm256_mullo_epi32(u[10], cospim16);
6265         x     = _mm256_mullo_epi32(u[13], cospim48);
6266         v[13] = _mm256_sub_epi32(v[13], x);
6267         v[13] = _mm256_add_epi32(v[13], rnding);
6268         v[13] = _mm256_srai_epi32(v[13], bit);
6269 
6270         v[11] = u[11];
6271         v[12] = u[12];
6272         v[15] = u[15];
6273 
6274         // stage 5
6275         u[0] = v[0];
6276         u[2] = v[2];
6277 
6278         u[4] = _mm256_mullo_epi32(v[4], cospi56);
6279         x    = _mm256_mullo_epi32(v[7], cospi8);
6280         u[4] = _mm256_add_epi32(u[4], x);
6281         u[4] = _mm256_add_epi32(u[4], rnding);
6282         u[4] = _mm256_srai_epi32(u[4], bit);
6283 
6284         u[6] = _mm256_mullo_epi32(v[5], cospi40);
6285         x    = _mm256_mullo_epi32(v[6], cospi24);
6286         u[6] = _mm256_sub_epi32(x, u[6]);
6287         u[6] = _mm256_add_epi32(u[6], rnding);
6288         u[6] = _mm256_srai_epi32(u[6], bit);
6289 
6290         u[8]  = _mm256_add_epi32(v[8], v[9]);
6291         u[9]  = _mm256_sub_epi32(v[8], v[9]);
6292         u[10] = _mm256_sub_epi32(v[11], v[10]);
6293         u[11] = _mm256_add_epi32(v[11], v[10]);
6294         u[12] = _mm256_add_epi32(v[12], v[13]);
6295         u[13] = _mm256_sub_epi32(v[12], v[13]);
6296         u[14] = _mm256_sub_epi32(v[15], v[14]);
6297         u[15] = _mm256_add_epi32(v[15], v[14]);
6298 
6299         // stage 6
6300         v[0] = u[0];
6301         v[2] = u[2];
6302         v[4] = u[4];
6303         v[6] = u[6];
6304 
6305         v[8] = _mm256_mullo_epi32(u[8], cospi60);
6306         x    = _mm256_mullo_epi32(u[15], cospi4);
6307         v[8] = _mm256_add_epi32(v[8], x);
6308         v[8] = _mm256_add_epi32(v[8], rnding);
6309         v[8] = _mm256_srai_epi32(v[8], bit);
6310 
6311         v[14] = _mm256_mullo_epi32(u[9], cospi36);
6312         x     = _mm256_mullo_epi32(u[14], cospi28);
6313         v[14] = _mm256_sub_epi32(x, v[14]);
6314         v[14] = _mm256_add_epi32(v[14], rnding);
6315         v[14] = _mm256_srai_epi32(v[14], bit);
6316 
6317         v[10] = _mm256_mullo_epi32(u[10], cospi44);
6318         x     = _mm256_mullo_epi32(u[13], cospi20);
6319         v[10] = _mm256_add_epi32(v[10], x);
6320         v[10] = _mm256_add_epi32(v[10], rnding);
6321         v[10] = _mm256_srai_epi32(v[10], bit);
6322 
6323         v[12] = _mm256_mullo_epi32(u[11], cospi52);
6324         x     = _mm256_mullo_epi32(u[12], cospi12);
6325         v[12] = _mm256_sub_epi32(x, v[12]);
6326         v[12] = _mm256_add_epi32(v[12], rnding);
6327         v[12] = _mm256_srai_epi32(v[12], bit);
6328 
6329         out[0 * col_num + col] = v[0];
6330         out[1 * col_num + col] = v[8];
6331         out[2 * col_num + col] = v[4];
6332         out[3 * col_num + col] = v[12];
6333         out[4 * col_num + col] = v[2];
6334         out[5 * col_num + col] = v[10];
6335         out[6 * col_num + col] = v[6];
6336         out[7 * col_num + col] = v[14];
6337     }
6338 }
6339 
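/*
*    Forward 16-point ADST, N2 variant: only the first eight outputs per column are computed
*/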
6340 static void fadst16x16_N2_avx2(const __m256i *in, __m256i *out, int8_t bit, const int32_t col_num,
6341                                int32_t size) {
6342     const int32_t *cospi    = cospi_arr(bit);
6343     const __m256i  cospi32  = _mm256_set1_epi32(cospi[32]);
6344     const __m256i  cospi48  = _mm256_set1_epi32(cospi[48]);
6345     const __m256i  cospi16  = _mm256_set1_epi32(cospi[16]);
6346     const __m256i  cospim16 = _mm256_set1_epi32(-cospi[16]);
6347     const __m256i  cospim48 = _mm256_set1_epi32(-cospi[48]);
6348     const __m256i  cospi8   = _mm256_set1_epi32(cospi[8]);
6349     const __m256i  cospi56  = _mm256_set1_epi32(cospi[56]);
6350     const __m256i  cospim56 = _mm256_set1_epi32(-cospi[56]);
6351     const __m256i  cospim8  = _mm256_set1_epi32(-cospi[8]);
6352     const __m256i  cospi24  = _mm256_set1_epi32(cospi[24]);
6353     const __m256i  cospim24 = _mm256_set1_epi32(-cospi[24]);
6354     const __m256i  cospim40 = _mm256_set1_epi32(-cospi[40]);
6355     const __m256i  cospi40  = _mm256_set1_epi32(cospi[40]);
6356     const __m256i  cospi62  = _mm256_set1_epi32(cospi[62]);
6357     const __m256i  cospim2  = _mm256_set1_epi32(-cospi[2]);
6358     const __m256i  cospi54  = _mm256_set1_epi32(cospi[54]);
6359     const __m256i  cospim10 = _mm256_set1_epi32(-cospi[10]);
6360     const __m256i  cospi46  = _mm256_set1_epi32(cospi[46]);
6361     const __m256i  cospim18 = _mm256_set1_epi32(-cospi[18]);
6362     const __m256i  cospi38  = _mm256_set1_epi32(cospi[38]);
6363     const __m256i  cospim26 = _mm256_set1_epi32(-cospi[26]);
6364     const __m256i  cospi34  = _mm256_set1_epi32(cospi[34]);
6365     const __m256i  cospi30  = _mm256_set1_epi32(cospi[30]);
6366     const __m256i  cospi42  = _mm256_set1_epi32(cospi[42]);
6367     const __m256i  cospi22  = _mm256_set1_epi32(cospi[22]);
6368     const __m256i  cospi50  = _mm256_set1_epi32(cospi[50]);
6369     const __m256i  cospi14  = _mm256_set1_epi32(cospi[14]);
6370     const __m256i  cospi58  = _mm256_set1_epi32(cospi[58]);
6371     const __m256i  cospi6   = _mm256_set1_epi32(cospi[6]);
6372     const __m256i  rnding   = _mm256_set1_epi32(1 << (bit - 1));
6373     const __m256i  zero     = _mm256_setzero_si256();
6374 
6375     __m256i u[16], v[16], x, y;
6376     int32_t col;
6377 
6378     for (col = 0; col < size; ++col) {
6379         // stage 0
6380         // stage 1
6381         u[0]  = in[0 * col_num + col];
6382         u[1]  = _mm256_sub_epi32(zero, in[15 * col_num + col]);
6383         u[2]  = _mm256_sub_epi32(zero, in[7 * col_num + col]);
6384         u[3]  = in[8 * col_num + col];
6385         u[4]  = _mm256_sub_epi32(zero, in[3 * col_num + col]);
6386         u[5]  = in[12 * col_num + col];
6387         u[6]  = in[4 * col_num + col];
6388         u[7]  = _mm256_sub_epi32(zero, in[11 * col_num + col]);
6389         u[8]  = _mm256_sub_epi32(zero, in[1 * col_num + col]);
6390         u[9]  = in[14 * col_num + col];
6391         u[10] = in[6 * col_num + col];
6392         u[11] = _mm256_sub_epi32(zero, in[9 * col_num + col]);
6393         u[12] = in[2 * col_num + col];
6394         u[13] = _mm256_sub_epi32(zero, in[13 * col_num + col]);
6395         u[14] = _mm256_sub_epi32(zero, in[5 * col_num + col]);
6396         u[15] = in[10 * col_num + col];
6397 
6398         // stage 2
6399         v[0] = u[0];
6400         v[1] = u[1];
6401 
6402         x    = _mm256_mullo_epi32(u[2], cospi32);
6403         y    = _mm256_mullo_epi32(u[3], cospi32);
6404         v[2] = _mm256_add_epi32(x, y);
6405         v[2] = _mm256_add_epi32(v[2], rnding);
6406         v[2] = _mm256_srai_epi32(v[2], bit);
6407 
6408         v[3] = _mm256_sub_epi32(x, y);
6409         v[3] = _mm256_add_epi32(v[3], rnding);
6410         v[3] = _mm256_srai_epi32(v[3], bit);
6411 
6412         v[4] = u[4];
6413         v[5] = u[5];
6414 
6415         x    = _mm256_mullo_epi32(u[6], cospi32);
6416         y    = _mm256_mullo_epi32(u[7], cospi32);
6417         v[6] = _mm256_add_epi32(x, y);
6418         v[6] = _mm256_add_epi32(v[6], rnding);
6419         v[6] = _mm256_srai_epi32(v[6], bit);
6420 
6421         v[7] = _mm256_sub_epi32(x, y);
6422         v[7] = _mm256_add_epi32(v[7], rnding);
6423         v[7] = _mm256_srai_epi32(v[7], bit);
6424 
6425         v[8] = u[8];
6426         v[9] = u[9];
6427 
6428         x     = _mm256_mullo_epi32(u[10], cospi32);
6429         y     = _mm256_mullo_epi32(u[11], cospi32);
6430         v[10] = _mm256_add_epi32(x, y);
6431         v[10] = _mm256_add_epi32(v[10], rnding);
6432         v[10] = _mm256_srai_epi32(v[10], bit);
6433 
6434         v[11] = _mm256_sub_epi32(x, y);
6435         v[11] = _mm256_add_epi32(v[11], rnding);
6436         v[11] = _mm256_srai_epi32(v[11], bit);
6437 
6438         v[12] = u[12];
6439         v[13] = u[13];
6440 
6441         x     = _mm256_mullo_epi32(u[14], cospi32);
6442         y     = _mm256_mullo_epi32(u[15], cospi32);
6443         v[14] = _mm256_add_epi32(x, y);
6444         v[14] = _mm256_add_epi32(v[14], rnding);
6445         v[14] = _mm256_srai_epi32(v[14], bit);
6446 
6447         v[15] = _mm256_sub_epi32(x, y);
6448         v[15] = _mm256_add_epi32(v[15], rnding);
6449         v[15] = _mm256_srai_epi32(v[15], bit);
6450 
6451         // stage 3
6452         u[0]  = _mm256_add_epi32(v[0], v[2]);
6453         u[1]  = _mm256_add_epi32(v[1], v[3]);
6454         u[2]  = _mm256_sub_epi32(v[0], v[2]);
6455         u[3]  = _mm256_sub_epi32(v[1], v[3]);
6456         u[4]  = _mm256_add_epi32(v[4], v[6]);
6457         u[5]  = _mm256_add_epi32(v[5], v[7]);
6458         u[6]  = _mm256_sub_epi32(v[4], v[6]);
6459         u[7]  = _mm256_sub_epi32(v[5], v[7]);
6460         u[8]  = _mm256_add_epi32(v[8], v[10]);
6461         u[9]  = _mm256_add_epi32(v[9], v[11]);
6462         u[10] = _mm256_sub_epi32(v[8], v[10]);
6463         u[11] = _mm256_sub_epi32(v[9], v[11]);
6464         u[12] = _mm256_add_epi32(v[12], v[14]);
6465         u[13] = _mm256_add_epi32(v[13], v[15]);
6466         u[14] = _mm256_sub_epi32(v[12], v[14]);
6467         u[15] = _mm256_sub_epi32(v[13], v[15]);
6468 
6469         // stage 4
6470         v[0]  = u[0];
6471         v[1]  = u[1];
6472         v[2]  = u[2];
6473         v[3]  = u[3];
6474         v[4]  = half_btf_avx2(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit);
6475         v[5]  = half_btf_avx2(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit);
6476         v[6]  = half_btf_avx2(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit);
6477         v[7]  = half_btf_avx2(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit);
6478         v[8]  = u[8];
6479         v[9]  = u[9];
6480         v[10] = u[10];
6481         v[11] = u[11];
6482         v[12] = half_btf_avx2(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit);
6483         v[13] = half_btf_avx2(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit);
6484         v[14] = half_btf_avx2(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit);
6485         v[15] = half_btf_avx2(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit);
6486 
6487         // stage 5
6488         u[0]  = _mm256_add_epi32(v[0], v[4]);
6489         u[1]  = _mm256_add_epi32(v[1], v[5]);
6490         u[2]  = _mm256_add_epi32(v[2], v[6]);
6491         u[3]  = _mm256_add_epi32(v[3], v[7]);
6492         u[4]  = _mm256_sub_epi32(v[0], v[4]);
6493         u[5]  = _mm256_sub_epi32(v[1], v[5]);
6494         u[6]  = _mm256_sub_epi32(v[2], v[6]);
6495         u[7]  = _mm256_sub_epi32(v[3], v[7]);
6496         u[8]  = _mm256_add_epi32(v[8], v[12]);
6497         u[9]  = _mm256_add_epi32(v[9], v[13]);
6498         u[10] = _mm256_add_epi32(v[10], v[14]);
6499         u[11] = _mm256_add_epi32(v[11], v[15]);
6500         u[12] = _mm256_sub_epi32(v[8], v[12]);
6501         u[13] = _mm256_sub_epi32(v[9], v[13]);
6502         u[14] = _mm256_sub_epi32(v[10], v[14]);
6503         u[15] = _mm256_sub_epi32(v[11], v[15]);
6504 
6505         // stage 6
6506         v[0]  = u[0];
6507         v[1]  = u[1];
6508         v[2]  = u[2];
6509         v[3]  = u[3];
6510         v[4]  = u[4];
6511         v[5]  = u[5];
6512         v[6]  = u[6];
6513         v[7]  = u[7];
6514         v[8]  = half_btf_avx2(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit);
6515         v[9]  = half_btf_avx2(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit);
6516         v[10] = half_btf_avx2(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit);
6517         v[11] = half_btf_avx2(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit);
6518         v[12] = half_btf_avx2(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit);
6519         v[13] = half_btf_avx2(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit);
6520         v[14] = half_btf_avx2(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit);
6521         v[15] = half_btf_avx2(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit);
6522 
6523         // stage 7
6524         u[0]  = _mm256_add_epi32(v[0], v[8]);
6525         u[1]  = _mm256_add_epi32(v[1], v[9]);
6526         u[2]  = _mm256_add_epi32(v[2], v[10]);
6527         u[3]  = _mm256_add_epi32(v[3], v[11]);
6528         u[4]  = _mm256_add_epi32(v[4], v[12]);
6529         u[5]  = _mm256_add_epi32(v[5], v[13]);
6530         u[6]  = _mm256_add_epi32(v[6], v[14]);
6531         u[7]  = _mm256_add_epi32(v[7], v[15]);
6532         u[8]  = _mm256_sub_epi32(v[0], v[8]);
6533         u[9]  = _mm256_sub_epi32(v[1], v[9]);
6534         u[10] = _mm256_sub_epi32(v[2], v[10]);
6535         u[11] = _mm256_sub_epi32(v[3], v[11]);
6536         u[12] = _mm256_sub_epi32(v[4], v[12]);
6537         u[13] = _mm256_sub_epi32(v[5], v[13]);
6538         u[14] = _mm256_sub_epi32(v[6], v[14]);
6539         u[15] = _mm256_sub_epi32(v[7], v[15]);
6540 
6541         // stage 8
6542         v[1]  = half_btf_avx2(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit);
6543         v[3]  = half_btf_avx2(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit);
6544         v[5]  = half_btf_avx2(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit);
6545         v[7]  = half_btf_avx2(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit);
6546         v[8]  = half_btf_avx2(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit);
6547         v[10] = half_btf_avx2(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit);
6548         v[12] = half_btf_avx2(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit);
6549         v[14] = half_btf_avx2(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit);
6550 
6551         // stage 9
6552         out[0 * col_num + col] = v[1];
6553         out[1 * col_num + col] = v[14];
6554         out[2 * col_num + col] = v[3];
6555         out[3 * col_num + col] = v[12];
6556         out[4 * col_num + col] = v[5];
6557         out[5 * col_num + col] = v[10];
6558         out[6 * col_num + col] = v[7];
6559         out[7 * col_num + col] = v[8];
6560     }
6561 }
6562 
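/*
*    Identity transform column pass: scale each register by 2 * new_sqrt2 and
*    round-shift by new_sqrt2_bits
*/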
6563 static void fidtx8xn_N2_col_avx2(const __m256i *in, __m256i *out, int8_t bit, int32_t col_num) {
6564     (void)bit;
6565     const int32_t bits     = 12; // new_sqrt2_bits = 12
6566     const int32_t sqrt     = 2 * 5793; // 2 * new_sqrt2
6567     const __m256i newsqrt  = _mm256_set1_epi32(sqrt);
6568     const __m256i rounding = _mm256_set1_epi32(1 << (bits - 1));
6569     __m256i       temp;
6570     for (int32_t i = 0; i < col_num; i++) {
6571         temp   = _mm256_mullo_epi32(in[i], newsqrt);
6572         temp   = _mm256_add_epi32(temp, rounding);
6573         out[i] = _mm256_srai_epi32(temp, bits);
6574     }
6575 }
6576 
6577 static void fidtx16x16_N2_row_avx2(const __m256i *in, __m256i *out, int8_t bit, int32_t col_num) {
6578     (void)bit;
6579     const int32_t bits     = 12; // new_sqrt2_bits = 12
6580     const int32_t sqrt     = 2 * 5793; // 2 * new_sqrt2
6581     const __m256i newsqrt  = _mm256_set1_epi32(sqrt);
6582     const __m256i rounding = _mm256_set1_epi32(1 << (bits - 1));
6583     __m256i       temp;
6584     int32_t       num_iters = 16 * col_num;
6585     for (int32_t i = 0; i < num_iters / 2; i += 2) {
6586         temp   = _mm256_mullo_epi32(in[i], newsqrt);
6587         temp   = _mm256_add_epi32(temp, rounding);
6588         out[i] = _mm256_srai_epi32(temp, bits);
6589     }
6590 }
6591 
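/*
*    Identity transform for 32 points, N2 variant: the retained registers are
*    left-shifted by 2 (scale by 4)
*/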
6592 void av1_idtx32_new_N2_avx2(const __m256i *input, __m256i *output, int8_t cos_bit,
6593                             const int32_t col_num, int32_t size) {
6594     (void)cos_bit;
6595 
6596     for (int32_t i = 0; i < size; i += 4) {
6597         output[i * col_num]     = _mm256_slli_epi32(input[i * col_num], 2);
6598         output[i * col_num + 1] = _mm256_slli_epi32(input[i * col_num + 1], 2);
6599     }
6600 }
6601 
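/*
*    Identity transform for a 16x32 block, N2 variant: every other register of
*    the first 32 is scaled by 4
*/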
6602 void av1_idtx16x32_N2_avx2(const __m256i *input, __m256i *output) {
6603     for (int32_t i = 0; i < 32; i += 2) { output[i] = _mm256_slli_epi32(input[i], 2); }
6604 }
6605 
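/*
*    Zero the top-right quarter and the bottom half of a num_col x num_row
*    register buffer; only the top-left N2 quarter keeps its values
*/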
6606 static AOM_FORCE_INLINE void clear_buffer_wxh_N2(__m256i *buff, int32_t num_col, int32_t num_row) {
6607     const __m256i zero    = _mm256_setzero_si256();
6608     const __m128i zero128 = _mm_setzero_si128();
6609     assert(num_col > 0);
6610     assert(num_row > 1);
6611 
6612     //clear top-right quarter
6613     if (num_col == 1) { //only one 256-bit register per row, so a 128-bit store is needed to clear the top-right quarter
6614         __m128i *ptr_128b = (__m128i *)buff;
6615         for (int i = 0; i < num_row / 2; i++) ptr_128b[i * 2 + 1] = zero128;
6616 
6617     } else {
6618         for (int i = 0; i < num_row / 2; i++)
6619             for (int j = num_col / 2; j < num_col; j++) buff[i * num_col + j] = zero;
6620     }
6621     //clear bottom half
6622     for (int i = num_row / 2; i < num_row; i++)
6623         for (int j = 0; j < num_col; j++) buff[i * num_col + j] = zero;
6624 }
6625 
6626 static void fidtx32x32_N2_col_avx2(const __m256i *input, __m256i *output, const int8_t cos_bit) {
6627     for (int32_t i = 0; i < 2; i++)
6628         av1_idtx32_new_avx2(&input[i * 32], &output[i * 32], cos_bit, 1);
6629 }
6630 
6631 static void fidtx32x32_N2_row_avx2(const __m256i *input, __m256i *output, const int8_t cos_bit) {
6632     int32_t i;
6633     for (i = 0; i < 2; i++) av1_idtx32_new_N2_avx2(&input[i * 32], &output[i * 32], cos_bit, 1, 32);
6634 }
6635 
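/*
*    Forward 32-point DCT, N2 variant: only the lower half of the outputs is computed
*/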
6636 static void av1_fdct32_new_N2_avx2(const __m256i *input, __m256i *output, int8_t cos_bit,
6637                                    const int32_t col_num, const int32_t stride) {
6638     const int32_t *cospi      = cospi_arr(cos_bit);
6639     const __m256i  __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
6640     const int32_t  columns    = col_num >> 3;
6641 
6642     __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]);
6643     __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]);
6644     __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]);
6645     __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]);
6646     __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]);
6647     __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]);
6648     __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]);
6649     __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]);
6650     __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]);
6651     __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]);
6652     __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]);
6653     __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]);
6654     __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]);
6655     __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]);
6656     __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]);
6657     __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]);
6658     __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]);
6659     __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]);
6660     __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]);
6661     __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]);
6662     __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]);
6663     __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]);
6664     __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]);
6665     __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]);
6666     __m256i cospi_m34 = _mm256_set1_epi32(-cospi[34]);
6667     __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]);
6668     __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]);
6669     __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]);
6670     __m256i cospi_m50 = _mm256_set1_epi32(-cospi[50]);
6671     __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]);
6672     __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]);
6673     __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]);
6674     __m256i cospi_m42 = _mm256_set1_epi32(-cospi[42]);
6675     __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]);
6676     __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]);
6677     __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]);
6678     __m256i cospi_m58 = _mm256_set1_epi32(-cospi[58]);
6679 
6680     __m256i buf0[32];
6681     __m256i buf1[32];
6682 
6683     for (int32_t col = 0; col < columns; col++) {
6684         const __m256i *in  = &input[col];
6685         __m256i *      out = &output[col];
6686 
6687         // stage 0
6688         // stage 1
6689         buf1[0]  = _mm256_add_epi32(in[0 * stride], in[31 * stride]);
6690         buf1[31] = _mm256_sub_epi32(in[0 * stride], in[31 * stride]);
6691         buf1[1]  = _mm256_add_epi32(in[1 * stride], in[30 * stride]);
6692         buf1[30] = _mm256_sub_epi32(in[1 * stride], in[30 * stride]);
6693         buf1[2]  = _mm256_add_epi32(in[2 * stride], in[29 * stride]);
6694         buf1[29] = _mm256_sub_epi32(in[2 * stride], in[29 * stride]);
6695         buf1[3]  = _mm256_add_epi32(in[3 * stride], in[28 * stride]);
6696         buf1[28] = _mm256_sub_epi32(in[3 * stride], in[28 * stride]);
6697         buf1[4]  = _mm256_add_epi32(in[4 * stride], in[27 * stride]);
6698         buf1[27] = _mm256_sub_epi32(in[4 * stride], in[27 * stride]);
6699         buf1[5]  = _mm256_add_epi32(in[5 * stride], in[26 * stride]);
6700         buf1[26] = _mm256_sub_epi32(in[5 * stride], in[26 * stride]);
6701         buf1[6]  = _mm256_add_epi32(in[6 * stride], in[25 * stride]);
6702         buf1[25] = _mm256_sub_epi32(in[6 * stride], in[25 * stride]);
6703         buf1[7]  = _mm256_add_epi32(in[7 * stride], in[24 * stride]);
6704         buf1[24] = _mm256_sub_epi32(in[7 * stride], in[24 * stride]);
6705         buf1[8]  = _mm256_add_epi32(in[8 * stride], in[23 * stride]);
6706         buf1[23] = _mm256_sub_epi32(in[8 * stride], in[23 * stride]);
6707         buf1[9]  = _mm256_add_epi32(in[9 * stride], in[22 * stride]);
6708         buf1[22] = _mm256_sub_epi32(in[9 * stride], in[22 * stride]);
6709         buf1[10] = _mm256_add_epi32(in[10 * stride], in[21 * stride]);
6710         buf1[21] = _mm256_sub_epi32(in[10 * stride], in[21 * stride]);
6711         buf1[11] = _mm256_add_epi32(in[11 * stride], in[20 * stride]);
6712         buf1[20] = _mm256_sub_epi32(in[11 * stride], in[20 * stride]);
6713         buf1[12] = _mm256_add_epi32(in[12 * stride], in[19 * stride]);
6714         buf1[19] = _mm256_sub_epi32(in[12 * stride], in[19 * stride]);
6715         buf1[13] = _mm256_add_epi32(in[13 * stride], in[18 * stride]);
6716         buf1[18] = _mm256_sub_epi32(in[13 * stride], in[18 * stride]);
6717         buf1[14] = _mm256_add_epi32(in[14 * stride], in[17 * stride]);
6718         buf1[17] = _mm256_sub_epi32(in[14 * stride], in[17 * stride]);
6719         buf1[15] = _mm256_add_epi32(in[15 * stride], in[16 * stride]);
6720         buf1[16] = _mm256_sub_epi32(in[15 * stride], in[16 * stride]);
6721 
6722         // stage 2
6723         buf0[0]  = _mm256_add_epi32(buf1[0], buf1[15]);
6724         buf0[15] = _mm256_sub_epi32(buf1[0], buf1[15]);
6725         buf0[1]  = _mm256_add_epi32(buf1[1], buf1[14]);
6726         buf0[14] = _mm256_sub_epi32(buf1[1], buf1[14]);
6727         buf0[2]  = _mm256_add_epi32(buf1[2], buf1[13]);
6728         buf0[13] = _mm256_sub_epi32(buf1[2], buf1[13]);
6729         buf0[3]  = _mm256_add_epi32(buf1[3], buf1[12]);
6730         buf0[12] = _mm256_sub_epi32(buf1[3], buf1[12]);
6731         buf0[4]  = _mm256_add_epi32(buf1[4], buf1[11]);
6732         buf0[11] = _mm256_sub_epi32(buf1[4], buf1[11]);
6733         buf0[5]  = _mm256_add_epi32(buf1[5], buf1[10]);
6734         buf0[10] = _mm256_sub_epi32(buf1[5], buf1[10]);
6735         buf0[6]  = _mm256_add_epi32(buf1[6], buf1[9]);
6736         buf0[9]  = _mm256_sub_epi32(buf1[6], buf1[9]);
6737         buf0[7]  = _mm256_add_epi32(buf1[7], buf1[8]);
6738         buf0[8]  = _mm256_sub_epi32(buf1[7], buf1[8]);
6739         buf0[16] = buf1[16];
6740         buf0[17] = buf1[17];
6741         buf0[18] = buf1[18];
6742         buf0[19] = buf1[19];
6743         btf_32_type0_avx2_new(
6744             cospi_m32, cospi_p32, buf1[20], buf1[27], buf0[20], buf0[27], __rounding, cos_bit);
6745         btf_32_type0_avx2_new(
6746             cospi_m32, cospi_p32, buf1[21], buf1[26], buf0[21], buf0[26], __rounding, cos_bit);
6747         btf_32_type0_avx2_new(
6748             cospi_m32, cospi_p32, buf1[22], buf1[25], buf0[22], buf0[25], __rounding, cos_bit);
6749         btf_32_type0_avx2_new(
6750             cospi_m32, cospi_p32, buf1[23], buf1[24], buf0[23], buf0[24], __rounding, cos_bit);
6751         buf0[28] = buf1[28];
6752         buf0[29] = buf1[29];
6753         buf0[30] = buf1[30];
6754         buf0[31] = buf1[31];
6755 
6756         // stage 3
6757         buf1[0] = _mm256_add_epi32(buf0[0], buf0[7]);
6758         buf1[7] = _mm256_sub_epi32(buf0[0], buf0[7]);
6759         buf1[1] = _mm256_add_epi32(buf0[1], buf0[6]);
6760         buf1[6] = _mm256_sub_epi32(buf0[1], buf0[6]);
6761         buf1[2] = _mm256_add_epi32(buf0[2], buf0[5]);
6762         buf1[5] = _mm256_sub_epi32(buf0[2], buf0[5]);
6763         buf1[3] = _mm256_add_epi32(buf0[3], buf0[4]);
6764         buf1[4] = _mm256_sub_epi32(buf0[3], buf0[4]);
6765         buf1[8] = buf0[8];
6766         buf1[9] = buf0[9];
6767         btf_32_type0_avx2_new(
6768             cospi_m32, cospi_p32, buf0[10], buf0[13], buf1[10], buf1[13], __rounding, cos_bit);
6769         btf_32_type0_avx2_new(
6770             cospi_m32, cospi_p32, buf0[11], buf0[12], buf1[11], buf1[12], __rounding, cos_bit);
6771         buf1[14] = buf0[14];
6772         buf1[15] = buf0[15];
6773         buf1[16] = _mm256_add_epi32(buf0[16], buf0[23]);
6774         buf1[23] = _mm256_sub_epi32(buf0[16], buf0[23]);
6775         buf1[17] = _mm256_add_epi32(buf0[17], buf0[22]);
6776         buf1[22] = _mm256_sub_epi32(buf0[17], buf0[22]);
6777         buf1[18] = _mm256_add_epi32(buf0[18], buf0[21]);
6778         buf1[21] = _mm256_sub_epi32(buf0[18], buf0[21]);
6779         buf1[19] = _mm256_add_epi32(buf0[19], buf0[20]);
6780         buf1[20] = _mm256_sub_epi32(buf0[19], buf0[20]);
6781         buf1[24] = _mm256_sub_epi32(buf0[31], buf0[24]);
6782         buf1[31] = _mm256_add_epi32(buf0[31], buf0[24]);
6783         buf1[25] = _mm256_sub_epi32(buf0[30], buf0[25]);
6784         buf1[30] = _mm256_add_epi32(buf0[30], buf0[25]);
6785         buf1[26] = _mm256_sub_epi32(buf0[29], buf0[26]);
6786         buf1[29] = _mm256_add_epi32(buf0[29], buf0[26]);
6787         buf1[27] = _mm256_sub_epi32(buf0[28], buf0[27]);
6788         buf1[28] = _mm256_add_epi32(buf0[28], buf0[27]);
6789 
6790         // stage 4
6791         buf0[0] = _mm256_add_epi32(buf1[0], buf1[3]);
6792         buf0[3] = _mm256_sub_epi32(buf1[0], buf1[3]);
6793         buf0[1] = _mm256_add_epi32(buf1[1], buf1[2]);
6794         buf0[2] = _mm256_sub_epi32(buf1[1], buf1[2]);
6795         buf0[4] = buf1[4];
6796         btf_32_type0_avx2_new(
6797             cospi_m32, cospi_p32, buf1[5], buf1[6], buf0[5], buf0[6], __rounding, cos_bit);
6798         buf0[7]  = buf1[7];
6799         buf0[8]  = _mm256_add_epi32(buf1[8], buf1[11]);
6800         buf0[11] = _mm256_sub_epi32(buf1[8], buf1[11]);
6801         buf0[9]  = _mm256_add_epi32(buf1[9], buf1[10]);
6802         buf0[10] = _mm256_sub_epi32(buf1[9], buf1[10]);
6803         buf0[12] = _mm256_sub_epi32(buf1[15], buf1[12]);
6804         buf0[15] = _mm256_add_epi32(buf1[15], buf1[12]);
6805         buf0[13] = _mm256_sub_epi32(buf1[14], buf1[13]);
6806         buf0[14] = _mm256_add_epi32(buf1[14], buf1[13]);
6807         buf0[16] = buf1[16];
6808         buf0[17] = buf1[17];
6809         btf_32_type0_avx2_new(
6810             cospi_m16, cospi_p48, buf1[18], buf1[29], buf0[18], buf0[29], __rounding, cos_bit);
6811         btf_32_type0_avx2_new(
6812             cospi_m16, cospi_p48, buf1[19], buf1[28], buf0[19], buf0[28], __rounding, cos_bit);
6813         btf_32_type0_avx2_new(
6814             cospi_m48, cospi_m16, buf1[20], buf1[27], buf0[20], buf0[27], __rounding, cos_bit);
6815         btf_32_type0_avx2_new(
6816             cospi_m48, cospi_m16, buf1[21], buf1[26], buf0[21], buf0[26], __rounding, cos_bit);
6817         buf0[22] = buf1[22];
6818         buf0[23] = buf1[23];
6819         buf0[24] = buf1[24];
6820         buf0[25] = buf1[25];
6821         buf0[30] = buf1[30];
6822         buf0[31] = buf1[31];
6823 
6824         // stage 5
6825         buf1[0] = half_btf_avx2(&cospi_p32, &buf0[0], &cospi_p32, &buf0[1], &__rounding, cos_bit);
6826         buf1[2] = half_btf_avx2(&cospi_p48, &buf0[2], &cospi_p16, &buf0[3], &__rounding, cos_bit);
6827         buf1[4] = _mm256_add_epi32(buf0[4], buf0[5]);
6828         buf1[5] = _mm256_sub_epi32(buf0[4], buf0[5]);
6829         buf1[6] = _mm256_sub_epi32(buf0[7], buf0[6]);
6830         buf1[7] = _mm256_add_epi32(buf0[7], buf0[6]);
6831         buf1[8] = buf0[8];
6832         btf_32_type0_avx2_new(
6833             cospi_m16, cospi_p48, buf0[9], buf0[14], buf1[9], buf1[14], __rounding, cos_bit);
6834         btf_32_type0_avx2_new(
6835             cospi_m48, cospi_m16, buf0[10], buf0[13], buf1[10], buf1[13], __rounding, cos_bit);
6836         buf1[11] = buf0[11];
6837         buf1[12] = buf0[12];
6838         buf1[15] = buf0[15];
6839         buf1[16] = _mm256_add_epi32(buf0[16], buf0[19]);
6840         buf1[19] = _mm256_sub_epi32(buf0[16], buf0[19]);
6841         buf1[17] = _mm256_add_epi32(buf0[17], buf0[18]);
6842         buf1[18] = _mm256_sub_epi32(buf0[17], buf0[18]);
6843         buf1[20] = _mm256_sub_epi32(buf0[23], buf0[20]);
6844         buf1[23] = _mm256_add_epi32(buf0[23], buf0[20]);
6845         buf1[21] = _mm256_sub_epi32(buf0[22], buf0[21]);
6846         buf1[22] = _mm256_add_epi32(buf0[22], buf0[21]);
6847         buf1[24] = _mm256_add_epi32(buf0[24], buf0[27]);
6848         buf1[27] = _mm256_sub_epi32(buf0[24], buf0[27]);
6849         buf1[25] = _mm256_add_epi32(buf0[25], buf0[26]);
6850         buf1[26] = _mm256_sub_epi32(buf0[25], buf0[26]);
6851         buf1[28] = _mm256_sub_epi32(buf0[31], buf0[28]);
6852         buf1[31] = _mm256_add_epi32(buf0[31], buf0[28]);
6853         buf1[29] = _mm256_sub_epi32(buf0[30], buf0[29]);
6854         buf1[30] = _mm256_add_epi32(buf0[30], buf0[29]);
6855 
6856         // stage 6
6857         buf0[0]  = buf1[0];
6858         buf0[2]  = buf1[2];
6859         buf0[4]  = half_btf_avx2(&cospi_p56, &buf1[4], &cospi_p08, &buf1[7], &__rounding, cos_bit);
6860         buf0[6]  = half_btf_avx2(&cospi_p24, &buf1[6], &cospi_m40, &buf1[5], &__rounding, cos_bit);
6861         buf0[8]  = _mm256_add_epi32(buf1[8], buf1[9]);
6862         buf0[9]  = _mm256_sub_epi32(buf1[8], buf1[9]);
6863         buf0[10] = _mm256_sub_epi32(buf1[11], buf1[10]);
6864         buf0[11] = _mm256_add_epi32(buf1[11], buf1[10]);
6865         buf0[12] = _mm256_add_epi32(buf1[12], buf1[13]);
6866         buf0[13] = _mm256_sub_epi32(buf1[12], buf1[13]);
6867         buf0[14] = _mm256_sub_epi32(buf1[15], buf1[14]);
6868         buf0[15] = _mm256_add_epi32(buf1[15], buf1[14]);
6869         buf0[16] = buf1[16];
6870         btf_32_type0_avx2_new(
6871             cospi_m08, cospi_p56, buf1[17], buf1[30], buf0[17], buf0[30], __rounding, cos_bit);
6872         btf_32_type0_avx2_new(
6873             cospi_m56, cospi_m08, buf1[18], buf1[29], buf0[18], buf0[29], __rounding, cos_bit);
6874         buf0[19] = buf1[19];
6875         buf0[20] = buf1[20];
6876         btf_32_type0_avx2_new(
6877             cospi_m40, cospi_p24, buf1[21], buf1[26], buf0[21], buf0[26], __rounding, cos_bit);
6878         btf_32_type0_avx2_new(
6879             cospi_m24, cospi_m40, buf1[22], buf1[25], buf0[22], buf0[25], __rounding, cos_bit);
6880         buf0[23] = buf1[23];
6881         buf0[24] = buf1[24];
6882         buf0[27] = buf1[27];
6883         buf0[28] = buf1[28];
6884         buf0[31] = buf1[31];
6885 
6886         // stage 7
6887         buf1[0]  = buf0[0];
6888         buf1[2]  = buf0[2];
6889         buf1[4]  = buf0[4];
6890         buf1[6]  = buf0[6];
6891         buf1[8]  = half_btf_avx2(&cospi_p60, &buf0[8], &cospi_p04, &buf0[15], &__rounding, cos_bit);
6892         buf1[14] = half_btf_avx2(&cospi_p28, &buf0[14], &cospi_m36, &buf0[9], &__rounding, cos_bit);
6893         buf1[10] = half_btf_avx2(
6894             &cospi_p44, &buf0[10], &cospi_p20, &buf0[13], &__rounding, cos_bit);
6895         buf1[12] = half_btf_avx2(
6896             &cospi_p12, &buf0[12], &cospi_m52, &buf0[11], &__rounding, cos_bit);
6897 
6898         buf1[16] = _mm256_add_epi32(buf0[16], buf0[17]);
6899         buf1[17] = _mm256_sub_epi32(buf0[16], buf0[17]);
6900         buf1[18] = _mm256_sub_epi32(buf0[19], buf0[18]);
6901         buf1[19] = _mm256_add_epi32(buf0[19], buf0[18]);
6902         buf1[20] = _mm256_add_epi32(buf0[20], buf0[21]);
6903         buf1[21] = _mm256_sub_epi32(buf0[20], buf0[21]);
6904         buf1[22] = _mm256_sub_epi32(buf0[23], buf0[22]);
6905         buf1[23] = _mm256_add_epi32(buf0[23], buf0[22]);
6906         buf1[24] = _mm256_add_epi32(buf0[24], buf0[25]);
6907         buf1[25] = _mm256_sub_epi32(buf0[24], buf0[25]);
6908         buf1[26] = _mm256_sub_epi32(buf0[27], buf0[26]);
6909         buf1[27] = _mm256_add_epi32(buf0[27], buf0[26]);
6910         buf1[28] = _mm256_add_epi32(buf0[28], buf0[29]);
6911         buf1[29] = _mm256_sub_epi32(buf0[28], buf0[29]);
6912         buf1[30] = _mm256_sub_epi32(buf0[31], buf0[30]);
6913         buf1[31] = _mm256_add_epi32(buf0[31], buf0[30]);
6914 
6915         // stage 8
6916         buf0[0]  = buf1[0];
6917         buf0[2]  = buf1[2];
6918         buf0[4]  = buf1[4];
6919         buf0[6]  = buf1[6];
6920         buf0[8]  = buf1[8];
6921         buf0[10] = buf1[10];
6922         buf0[12] = buf1[12];
6923         buf0[14] = buf1[14];
6924         buf0[16] = half_btf_avx2(
6925             &cospi_p62, &buf1[16], &cospi_p02, &buf1[31], &__rounding, cos_bit);
6926         buf0[30] = half_btf_avx2(
6927             &cospi_p30, &buf1[30], &cospi_m34, &buf1[17], &__rounding, cos_bit);
6928         buf0[18] = half_btf_avx2(
6929             &cospi_p46, &buf1[18], &cospi_p18, &buf1[29], &__rounding, cos_bit);
6930         buf0[28] = half_btf_avx2(
6931             &cospi_p14, &buf1[28], &cospi_m50, &buf1[19], &__rounding, cos_bit);
6932         buf0[20] = half_btf_avx2(
6933             &cospi_p54, &buf1[20], &cospi_p10, &buf1[27], &__rounding, cos_bit);
6934         buf0[26] = half_btf_avx2(
6935             &cospi_p22, &buf1[26], &cospi_m42, &buf1[21], &__rounding, cos_bit);
6936         buf0[22] = half_btf_avx2(
6937             &cospi_p38, &buf1[22], &cospi_p26, &buf1[25], &__rounding, cos_bit);
6938         buf0[24] = half_btf_avx2(
6939             &cospi_p06, &buf1[24], &cospi_m58, &buf1[23], &__rounding, cos_bit);
6940 
6941         // stage 9
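        // N2 output: only the first 16 of the 32 transform results are stored
        // (the low-frequency half kept by the N2 path), in the usual
        // bit-reversed butterfly order.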
6942         out[0 * stride]  = buf0[0];
6943         out[1 * stride]  = buf0[16];
6944         out[2 * stride]  = buf0[8];
6945         out[3 * stride]  = buf0[24];
6946         out[4 * stride]  = buf0[4];
6947         out[5 * stride]  = buf0[20];
6948         out[6 * stride]  = buf0[12];
6949         out[7 * stride]  = buf0[28];
6950         out[8 * stride]  = buf0[2];
6951         out[9 * stride]  = buf0[18];
6952         out[10 * stride] = buf0[10];
6953         out[11 * stride] = buf0[26];
6954         out[12 * stride] = buf0[6];
6955         out[13 * stride] = buf0[22];
6956         out[14 * stride] = buf0[14];
6957         out[15 * stride] = buf0[30];
6958     }
6959 }
6960 
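// N2 wrappers for the 32x32 forward DCT. The row variant passes txfm_size / 2 to
// av1_fdct32_new_N2_avx2 while the column variant passes the full txfm_size, so the
// row pass apparently only has to cover half as much data as the column pass,
// consistent with the N2 (keep the top-left quadrant) scheme.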
6961 static AOM_FORCE_INLINE void fdct32x32_N2_row_avx2(const __m256i *input, __m256i *output,
6962                                                    const int8_t cos_bit) {
6963     const int32_t txfm_size   = 32;
6964     const int32_t num_per_256 = 8;
6965     int32_t       col_num     = txfm_size / num_per_256;
6966     av1_fdct32_new_N2_avx2(input, output, cos_bit, txfm_size / 2, col_num);
6967 }
6968 
6969 static AOM_FORCE_INLINE void fdct32x32_N2_col_avx2(const __m256i *input, __m256i *output,
6970                                                    const int8_t cos_bit) {
6971     const int32_t txfm_size   = 32;
6972     const int32_t num_per_256 = 8;
6973     int32_t       col_num     = txfm_size / num_per_256;
6974     av1_fdct32_new_N2_avx2(input, output, cos_bit, txfm_size, col_num);
6975 }
6976 
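// Round/shift helper for the N2 path: within each group of four registers only the
// first two are rounded and shifted (or left-shifted when bit <= 0); the other two
// presumably hold coefficients the N2 transform never produces.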
6977 static AOM_FORCE_INLINE void av1_round_shift_array_32_N2_avx2(__m256i *input, __m256i *output,
6978                                                               const int32_t size,
6979                                                               const int32_t bit) {
6980     int32_t i;
6981 
6982     if (bit > 0) {
6983         const __m256i round = _mm256_set1_epi32(1 << (bit - 1));
6984         __m256i       r0;
6985         for (i = 0; i < size; i += 4) {
6986             r0            = _mm256_add_epi32(input[i], round);
6987             output[i]     = _mm256_srai_epi32(r0, bit);
6988             r0            = _mm256_add_epi32(input[i + 1], round);
6989             output[i + 1] = _mm256_srai_epi32(r0, bit);
6990         }
6991     } else {
6992         for (i = 0; i < size; i += 4) {
6993             output[i]     = _mm256_slli_epi32(input[i], -bit);
6994             output[i + 1] = _mm256_slli_epi32(input[i + 1], -bit);
6995         }
6996     }
6997 }
6998 
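// Rescale for rectangular N2 transforms: an optional round-and-shift by `bit`,
// then multiplication by `val` (a new_sqrt2-style scale chosen by the caller) and a
// round-shift by new_sqrt2_bits. Only the top-left (num_row / 2) x (num_col / 2)
// block of registers is processed.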
6999 static INLINE void av1_round_shift_rect_array_wxh_avx2(__m256i *input, __m256i *output,
7000                                                        const int32_t bit, const int32_t val,
7001                                                        int32_t num_col, int32_t num_row) {
7002     const __m256i sqrt2  = _mm256_set1_epi32(val);
7003     const __m256i round2 = _mm256_set1_epi32(1 << (new_sqrt2_bits - 1));
7004     int32_t       i;
7005     if (bit > 0) {
7006         const __m256i round1 = _mm256_set1_epi32(1 << (bit - 1));
7007         __m256i       r0, r1, r2, r3;
7008         for (i = 0; i < num_row / 2; i++) {
7009             for (int j = 0; j < num_col / 2; j++) {
7010                 r0                      = _mm256_add_epi32(input[i * num_col + j], round1);
7011                 r1                      = _mm256_srai_epi32(r0, bit);
7012                 r2                      = _mm256_mullo_epi32(sqrt2, r1);
7013                 r3                      = _mm256_add_epi32(r2, round2);
7014                 output[i * num_col + j] = _mm256_srai_epi32(r3, new_sqrt2_bits);
7015             }
7016         }
7017     } else {
7018         __m256i r0, r1, r2;
7019         for (i = 0; i < num_row / 2; i++) {
7020             for (int j = 0; j < num_col / 2; j++) {
7021                 r0                      = _mm256_slli_epi32(input[i * num_col + j], -bit);
7022                 r1                      = _mm256_mullo_epi32(sqrt2, r0);
7023                 r2                      = _mm256_add_epi32(r1, round2);
7024                 output[i * num_col + j] = _mm256_srai_epi32(r2, new_sqrt2_bits);
7025             }
7026         }
7027     }
7028 }
7029 
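// Load 32 rows of 32 int16_t samples (the top-left 32x32 of a 64x64 block) and
// sign-extend them into 32-bit lanes, four __m256i registers per row.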
7030 static INLINE void load_buffer_32x32_in_64x64_avx2(const int16_t *input, int32_t stride,
7031                                                    __m256i *output) {
7032     __m128i x0, x1, x2, x3;
7033     __m256i v0, v1, v2, v3;
7034     int32_t i;
7035 
7036     for (i = 0; i < 32; ++i) {
7037         x0 = _mm_loadu_si128((const __m128i *)(input + 0 * 8));
7038         x1 = _mm_loadu_si128((const __m128i *)(input + 1 * 8));
7039         x2 = _mm_loadu_si128((const __m128i *)(input + 2 * 8));
7040         x3 = _mm_loadu_si128((const __m128i *)(input + 3 * 8));
7041 
7042         v0 = _mm256_cvtepi16_epi32(x0);
7043         v1 = _mm256_cvtepi16_epi32(x1);
7044         v2 = _mm256_cvtepi16_epi32(x2);
7045         v3 = _mm256_cvtepi16_epi32(x3);
7046 
7047         _mm256_storeu_si256(output + 0, v0);
7048         _mm256_storeu_si256(output + 1, v1);
7049         _mm256_storeu_si256(output + 2, v2);
7050         _mm256_storeu_si256(output + 3, v3);
7051 
7052         input += stride;
7053         output += 8;
7054     }
7055 }
7056 
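// 64-point forward DCT, N2 variant. Later stages skip butterflies that only feed
// the discarded high-frequency half, and stages 10-11 store just the first 32 of
// the 64 outputs. `col_num` counts 32-bit lanes, so the outer loop below runs
// col_num >> 3 times, one eight-lane __m256i per iteration.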
7057 static void av1_fdct64_new_N2_avx2(const __m256i *input, __m256i *output, int8_t cos_bit,
7058                                    const int32_t col_num, const int32_t stride) {
7059     const int32_t *cospi      = cospi_arr(cos_bit);
7060     const __m256i  __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
7061     const int32_t  columns    = col_num >> 3;
7062 
7063     __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]);
7064     __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]);
7065     __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]);
7066     __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]);
7067     __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]);
7068     __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]);
7069     __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]);
7070     __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]);
7071     __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]);
7072     __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]);
7073     __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]);
7074     __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]);
7075     __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]);
7076     __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]);
7077     __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]);
7078     __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]);
7079     __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]);
7080     __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]);
7081     __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]);
7082     __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]);
7083     __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]);
7084     __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]);
7085     __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]);
7086     __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]);
7087     __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]);
7088     __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]);
7089     __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]);
7090     __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]);
7091     __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]);
7092     __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]);
7093     __m256i cospi_m34 = _mm256_set1_epi32(-cospi[34]);
7094     __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]);
7095     __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]);
7096     __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]);
7097     __m256i cospi_m50 = _mm256_set1_epi32(-cospi[50]);
7098     __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]);
7099     __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]);
7100     __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]);
7101     __m256i cospi_m42 = _mm256_set1_epi32(-cospi[42]);
7102     __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]);
7103     __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]);
7104     __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]);
7105     __m256i cospi_m58 = _mm256_set1_epi32(-cospi[58]);
7106     __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]);
7107     __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]);
7108     __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]);
7109     __m256i cospi_m33 = _mm256_set1_epi32(-cospi[33]);
7110     __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]);
7111     __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]);
7112     __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]);
7113     __m256i cospi_m49 = _mm256_set1_epi32(-cospi[49]);
7114     __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]);
7115     __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]);
7116     __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]);
7117     __m256i cospi_m41 = _mm256_set1_epi32(-cospi[41]);
7118     __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]);
7119     __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]);
7120     __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]);
7121     __m256i cospi_m57 = _mm256_set1_epi32(-cospi[57]);
7122     __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]);
7123     __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]);
7124     __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]);
7125     __m256i cospi_m37 = _mm256_set1_epi32(-cospi[37]);
7126     __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]);
7127     __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]);
7128     __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]);
7129     __m256i cospi_m53 = _mm256_set1_epi32(-cospi[53]);
7130     __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]);
7131     __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]);
7132     __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]);
7133     __m256i cospi_m45 = _mm256_set1_epi32(-cospi[45]);
7134     __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]);
7135     __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]);
7136     __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]);
7137     __m256i cospi_m61 = _mm256_set1_epi32(-cospi[61]);
7138 
7139     for (int32_t col = 0; col < columns; col++) {
7140         const __m256i *in  = &input[col];
7141         __m256i *      out = &output[col];
7142 
7143         // stage 1
7144         __m256i x1[64];
7145         x1[0]  = _mm256_add_epi32(in[0 * stride], in[63 * stride]);
7146         x1[63] = _mm256_sub_epi32(in[0 * stride], in[63 * stride]);
7147         x1[1]  = _mm256_add_epi32(in[1 * stride], in[62 * stride]);
7148         x1[62] = _mm256_sub_epi32(in[1 * stride], in[62 * stride]);
7149         x1[2]  = _mm256_add_epi32(in[2 * stride], in[61 * stride]);
7150         x1[61] = _mm256_sub_epi32(in[2 * stride], in[61 * stride]);
7151         x1[3]  = _mm256_add_epi32(in[3 * stride], in[60 * stride]);
7152         x1[60] = _mm256_sub_epi32(in[3 * stride], in[60 * stride]);
7153         x1[4]  = _mm256_add_epi32(in[4 * stride], in[59 * stride]);
7154         x1[59] = _mm256_sub_epi32(in[4 * stride], in[59 * stride]);
7155         x1[5]  = _mm256_add_epi32(in[5 * stride], in[58 * stride]);
7156         x1[58] = _mm256_sub_epi32(in[5 * stride], in[58 * stride]);
7157         x1[6]  = _mm256_add_epi32(in[6 * stride], in[57 * stride]);
7158         x1[57] = _mm256_sub_epi32(in[6 * stride], in[57 * stride]);
7159         x1[7]  = _mm256_add_epi32(in[7 * stride], in[56 * stride]);
7160         x1[56] = _mm256_sub_epi32(in[7 * stride], in[56 * stride]);
7161         x1[8]  = _mm256_add_epi32(in[8 * stride], in[55 * stride]);
7162         x1[55] = _mm256_sub_epi32(in[8 * stride], in[55 * stride]);
7163         x1[9]  = _mm256_add_epi32(in[9 * stride], in[54 * stride]);
7164         x1[54] = _mm256_sub_epi32(in[9 * stride], in[54 * stride]);
7165         x1[10] = _mm256_add_epi32(in[10 * stride], in[53 * stride]);
7166         x1[53] = _mm256_sub_epi32(in[10 * stride], in[53 * stride]);
7167         x1[11] = _mm256_add_epi32(in[11 * stride], in[52 * stride]);
7168         x1[52] = _mm256_sub_epi32(in[11 * stride], in[52 * stride]);
7169         x1[12] = _mm256_add_epi32(in[12 * stride], in[51 * stride]);
7170         x1[51] = _mm256_sub_epi32(in[12 * stride], in[51 * stride]);
7171         x1[13] = _mm256_add_epi32(in[13 * stride], in[50 * stride]);
7172         x1[50] = _mm256_sub_epi32(in[13 * stride], in[50 * stride]);
7173         x1[14] = _mm256_add_epi32(in[14 * stride], in[49 * stride]);
7174         x1[49] = _mm256_sub_epi32(in[14 * stride], in[49 * stride]);
7175         x1[15] = _mm256_add_epi32(in[15 * stride], in[48 * stride]);
7176         x1[48] = _mm256_sub_epi32(in[15 * stride], in[48 * stride]);
7177         x1[16] = _mm256_add_epi32(in[16 * stride], in[47 * stride]);
7178         x1[47] = _mm256_sub_epi32(in[16 * stride], in[47 * stride]);
7179         x1[17] = _mm256_add_epi32(in[17 * stride], in[46 * stride]);
7180         x1[46] = _mm256_sub_epi32(in[17 * stride], in[46 * stride]);
7181         x1[18] = _mm256_add_epi32(in[18 * stride], in[45 * stride]);
7182         x1[45] = _mm256_sub_epi32(in[18 * stride], in[45 * stride]);
7183         x1[19] = _mm256_add_epi32(in[19 * stride], in[44 * stride]);
7184         x1[44] = _mm256_sub_epi32(in[19 * stride], in[44 * stride]);
7185         x1[20] = _mm256_add_epi32(in[20 * stride], in[43 * stride]);
7186         x1[43] = _mm256_sub_epi32(in[20 * stride], in[43 * stride]);
7187         x1[21] = _mm256_add_epi32(in[21 * stride], in[42 * stride]);
7188         x1[42] = _mm256_sub_epi32(in[21 * stride], in[42 * stride]);
7189         x1[22] = _mm256_add_epi32(in[22 * stride], in[41 * stride]);
7190         x1[41] = _mm256_sub_epi32(in[22 * stride], in[41 * stride]);
7191         x1[23] = _mm256_add_epi32(in[23 * stride], in[40 * stride]);
7192         x1[40] = _mm256_sub_epi32(in[23 * stride], in[40 * stride]);
7193         x1[24] = _mm256_add_epi32(in[24 * stride], in[39 * stride]);
7194         x1[39] = _mm256_sub_epi32(in[24 * stride], in[39 * stride]);
7195         x1[25] = _mm256_add_epi32(in[25 * stride], in[38 * stride]);
7196         x1[38] = _mm256_sub_epi32(in[25 * stride], in[38 * stride]);
7197         x1[26] = _mm256_add_epi32(in[26 * stride], in[37 * stride]);
7198         x1[37] = _mm256_sub_epi32(in[26 * stride], in[37 * stride]);
7199         x1[27] = _mm256_add_epi32(in[27 * stride], in[36 * stride]);
7200         x1[36] = _mm256_sub_epi32(in[27 * stride], in[36 * stride]);
7201         x1[28] = _mm256_add_epi32(in[28 * stride], in[35 * stride]);
7202         x1[35] = _mm256_sub_epi32(in[28 * stride], in[35 * stride]);
7203         x1[29] = _mm256_add_epi32(in[29 * stride], in[34 * stride]);
7204         x1[34] = _mm256_sub_epi32(in[29 * stride], in[34 * stride]);
7205         x1[30] = _mm256_add_epi32(in[30 * stride], in[33 * stride]);
7206         x1[33] = _mm256_sub_epi32(in[30 * stride], in[33 * stride]);
7207         x1[31] = _mm256_add_epi32(in[31 * stride], in[32 * stride]);
7208         x1[32] = _mm256_sub_epi32(in[31 * stride], in[32 * stride]);
7209 
7210         // stage 2
7211         __m256i x2[64];
7212         x2[0]  = _mm256_add_epi32(x1[0], x1[31]);
7213         x2[31] = _mm256_sub_epi32(x1[0], x1[31]);
7214         x2[1]  = _mm256_add_epi32(x1[1], x1[30]);
7215         x2[30] = _mm256_sub_epi32(x1[1], x1[30]);
7216         x2[2]  = _mm256_add_epi32(x1[2], x1[29]);
7217         x2[29] = _mm256_sub_epi32(x1[2], x1[29]);
7218         x2[3]  = _mm256_add_epi32(x1[3], x1[28]);
7219         x2[28] = _mm256_sub_epi32(x1[3], x1[28]);
7220         x2[4]  = _mm256_add_epi32(x1[4], x1[27]);
7221         x2[27] = _mm256_sub_epi32(x1[4], x1[27]);
7222         x2[5]  = _mm256_add_epi32(x1[5], x1[26]);
7223         x2[26] = _mm256_sub_epi32(x1[5], x1[26]);
7224         x2[6]  = _mm256_add_epi32(x1[6], x1[25]);
7225         x2[25] = _mm256_sub_epi32(x1[6], x1[25]);
7226         x2[7]  = _mm256_add_epi32(x1[7], x1[24]);
7227         x2[24] = _mm256_sub_epi32(x1[7], x1[24]);
7228         x2[8]  = _mm256_add_epi32(x1[8], x1[23]);
7229         x2[23] = _mm256_sub_epi32(x1[8], x1[23]);
7230         x2[9]  = _mm256_add_epi32(x1[9], x1[22]);
7231         x2[22] = _mm256_sub_epi32(x1[9], x1[22]);
7232         x2[10] = _mm256_add_epi32(x1[10], x1[21]);
7233         x2[21] = _mm256_sub_epi32(x1[10], x1[21]);
7234         x2[11] = _mm256_add_epi32(x1[11], x1[20]);
7235         x2[20] = _mm256_sub_epi32(x1[11], x1[20]);
7236         x2[12] = _mm256_add_epi32(x1[12], x1[19]);
7237         x2[19] = _mm256_sub_epi32(x1[12], x1[19]);
7238         x2[13] = _mm256_add_epi32(x1[13], x1[18]);
7239         x2[18] = _mm256_sub_epi32(x1[13], x1[18]);
7240         x2[14] = _mm256_add_epi32(x1[14], x1[17]);
7241         x2[17] = _mm256_sub_epi32(x1[14], x1[17]);
7242         x2[15] = _mm256_add_epi32(x1[15], x1[16]);
7243         x2[16] = _mm256_sub_epi32(x1[15], x1[16]);
7244         x2[32] = x1[32];
7245         x2[33] = x1[33];
7246         x2[34] = x1[34];
7247         x2[35] = x1[35];
7248         x2[36] = x1[36];
7249         x2[37] = x1[37];
7250         x2[38] = x1[38];
7251         x2[39] = x1[39];
7252         btf_32_type0_avx2_new(
7253             cospi_m32, cospi_p32, x1[40], x1[55], x2[40], x2[55], __rounding, cos_bit);
7254         btf_32_type0_avx2_new(
7255             cospi_m32, cospi_p32, x1[41], x1[54], x2[41], x2[54], __rounding, cos_bit);
7256         btf_32_type0_avx2_new(
7257             cospi_m32, cospi_p32, x1[42], x1[53], x2[42], x2[53], __rounding, cos_bit);
7258         btf_32_type0_avx2_new(
7259             cospi_m32, cospi_p32, x1[43], x1[52], x2[43], x2[52], __rounding, cos_bit);
7260         btf_32_type0_avx2_new(
7261             cospi_m32, cospi_p32, x1[44], x1[51], x2[44], x2[51], __rounding, cos_bit);
7262         btf_32_type0_avx2_new(
7263             cospi_m32, cospi_p32, x1[45], x1[50], x2[45], x2[50], __rounding, cos_bit);
7264         btf_32_type0_avx2_new(
7265             cospi_m32, cospi_p32, x1[46], x1[49], x2[46], x2[49], __rounding, cos_bit);
7266         btf_32_type0_avx2_new(
7267             cospi_m32, cospi_p32, x1[47], x1[48], x2[47], x2[48], __rounding, cos_bit);
7268         x2[56] = x1[56];
7269         x2[57] = x1[57];
7270         x2[58] = x1[58];
7271         x2[59] = x1[59];
7272         x2[60] = x1[60];
7273         x2[61] = x1[61];
7274         x2[62] = x1[62];
7275         x2[63] = x1[63];
7276 
7277         // stage 3
7278         __m256i x3[64];
7279         x3[0]  = _mm256_add_epi32(x2[0], x2[15]);
7280         x3[15] = _mm256_sub_epi32(x2[0], x2[15]);
7281         x3[1]  = _mm256_add_epi32(x2[1], x2[14]);
7282         x3[14] = _mm256_sub_epi32(x2[1], x2[14]);
7283         x3[2]  = _mm256_add_epi32(x2[2], x2[13]);
7284         x3[13] = _mm256_sub_epi32(x2[2], x2[13]);
7285         x3[3]  = _mm256_add_epi32(x2[3], x2[12]);
7286         x3[12] = _mm256_sub_epi32(x2[3], x2[12]);
7287         x3[4]  = _mm256_add_epi32(x2[4], x2[11]);
7288         x3[11] = _mm256_sub_epi32(x2[4], x2[11]);
7289         x3[5]  = _mm256_add_epi32(x2[5], x2[10]);
7290         x3[10] = _mm256_sub_epi32(x2[5], x2[10]);
7291         x3[6]  = _mm256_add_epi32(x2[6], x2[9]);
7292         x3[9]  = _mm256_sub_epi32(x2[6], x2[9]);
7293         x3[7]  = _mm256_add_epi32(x2[7], x2[8]);
7294         x3[8]  = _mm256_sub_epi32(x2[7], x2[8]);
7295         x3[16] = x2[16];
7296         x3[17] = x2[17];
7297         x3[18] = x2[18];
7298         x3[19] = x2[19];
7299         btf_32_type0_avx2_new(
7300             cospi_m32, cospi_p32, x2[20], x2[27], x3[20], x3[27], __rounding, cos_bit);
7301         btf_32_type0_avx2_new(
7302             cospi_m32, cospi_p32, x2[21], x2[26], x3[21], x3[26], __rounding, cos_bit);
7303         btf_32_type0_avx2_new(
7304             cospi_m32, cospi_p32, x2[22], x2[25], x3[22], x3[25], __rounding, cos_bit);
7305         btf_32_type0_avx2_new(
7306             cospi_m32, cospi_p32, x2[23], x2[24], x3[23], x3[24], __rounding, cos_bit);
7307         x3[28] = x2[28];
7308         x3[29] = x2[29];
7309         x3[30] = x2[30];
7310         x3[31] = x2[31];
7311         x3[32] = _mm256_add_epi32(x2[32], x2[47]);
7312         x3[47] = _mm256_sub_epi32(x2[32], x2[47]);
7313         x3[33] = _mm256_add_epi32(x2[33], x2[46]);
7314         x3[46] = _mm256_sub_epi32(x2[33], x2[46]);
7315         x3[34] = _mm256_add_epi32(x2[34], x2[45]);
7316         x3[45] = _mm256_sub_epi32(x2[34], x2[45]);
7317         x3[35] = _mm256_add_epi32(x2[35], x2[44]);
7318         x3[44] = _mm256_sub_epi32(x2[35], x2[44]);
7319         x3[36] = _mm256_add_epi32(x2[36], x2[43]);
7320         x3[43] = _mm256_sub_epi32(x2[36], x2[43]);
7321         x3[37] = _mm256_add_epi32(x2[37], x2[42]);
7322         x3[42] = _mm256_sub_epi32(x2[37], x2[42]);
7323         x3[38] = _mm256_add_epi32(x2[38], x2[41]);
7324         x3[41] = _mm256_sub_epi32(x2[38], x2[41]);
7325         x3[39] = _mm256_add_epi32(x2[39], x2[40]);
7326         x3[40] = _mm256_sub_epi32(x2[39], x2[40]);
7327         x3[48] = _mm256_sub_epi32(x2[63], x2[48]);
7328         x3[63] = _mm256_add_epi32(x2[63], x2[48]);
7329         x3[49] = _mm256_sub_epi32(x2[62], x2[49]);
7330         x3[62] = _mm256_add_epi32(x2[62], x2[49]);
7331         x3[50] = _mm256_sub_epi32(x2[61], x2[50]);
7332         x3[61] = _mm256_add_epi32(x2[61], x2[50]);
7333         x3[51] = _mm256_sub_epi32(x2[60], x2[51]);
7334         x3[60] = _mm256_add_epi32(x2[60], x2[51]);
7335         x3[52] = _mm256_sub_epi32(x2[59], x2[52]);
7336         x3[59] = _mm256_add_epi32(x2[59], x2[52]);
7337         x3[53] = _mm256_sub_epi32(x2[58], x2[53]);
7338         x3[58] = _mm256_add_epi32(x2[58], x2[53]);
7339         x3[54] = _mm256_sub_epi32(x2[57], x2[54]);
7340         x3[57] = _mm256_add_epi32(x2[57], x2[54]);
7341         x3[55] = _mm256_sub_epi32(x2[56], x2[55]);
7342         x3[56] = _mm256_add_epi32(x2[56], x2[55]);
7343 
7344         // stage 4
7345         __m256i x4[64];
7346         x4[0] = _mm256_add_epi32(x3[0], x3[7]);
7347         x4[7] = _mm256_sub_epi32(x3[0], x3[7]);
7348         x4[1] = _mm256_add_epi32(x3[1], x3[6]);
7349         x4[6] = _mm256_sub_epi32(x3[1], x3[6]);
7350         x4[2] = _mm256_add_epi32(x3[2], x3[5]);
7351         x4[5] = _mm256_sub_epi32(x3[2], x3[5]);
7352         x4[3] = _mm256_add_epi32(x3[3], x3[4]);
7353         x4[4] = _mm256_sub_epi32(x3[3], x3[4]);
7354         x4[8] = x3[8];
7355         x4[9] = x3[9];
7356         btf_32_type0_avx2_new(
7357             cospi_m32, cospi_p32, x3[10], x3[13], x4[10], x4[13], __rounding, cos_bit);
7358         btf_32_type0_avx2_new(
7359             cospi_m32, cospi_p32, x3[11], x3[12], x4[11], x4[12], __rounding, cos_bit);
7360         x4[14] = x3[14];
7361         x4[15] = x3[15];
7362         x4[16] = _mm256_add_epi32(x3[16], x3[23]);
7363         x4[23] = _mm256_sub_epi32(x3[16], x3[23]);
7364         x4[17] = _mm256_add_epi32(x3[17], x3[22]);
7365         x4[22] = _mm256_sub_epi32(x3[17], x3[22]);
7366         x4[18] = _mm256_add_epi32(x3[18], x3[21]);
7367         x4[21] = _mm256_sub_epi32(x3[18], x3[21]);
7368         x4[19] = _mm256_add_epi32(x3[19], x3[20]);
7369         x4[20] = _mm256_sub_epi32(x3[19], x3[20]);
7370         x4[24] = _mm256_sub_epi32(x3[31], x3[24]);
7371         x4[31] = _mm256_add_epi32(x3[31], x3[24]);
7372         x4[25] = _mm256_sub_epi32(x3[30], x3[25]);
7373         x4[30] = _mm256_add_epi32(x3[30], x3[25]);
7374         x4[26] = _mm256_sub_epi32(x3[29], x3[26]);
7375         x4[29] = _mm256_add_epi32(x3[29], x3[26]);
7376         x4[27] = _mm256_sub_epi32(x3[28], x3[27]);
7377         x4[28] = _mm256_add_epi32(x3[28], x3[27]);
7378         x4[32] = x3[32];
7379         x4[33] = x3[33];
7380         x4[34] = x3[34];
7381         x4[35] = x3[35];
7382         btf_32_type0_avx2_new(
7383             cospi_m16, cospi_p48, x3[36], x3[59], x4[36], x4[59], __rounding, cos_bit);
7384         btf_32_type0_avx2_new(
7385             cospi_m16, cospi_p48, x3[37], x3[58], x4[37], x4[58], __rounding, cos_bit);
7386         btf_32_type0_avx2_new(
7387             cospi_m16, cospi_p48, x3[38], x3[57], x4[38], x4[57], __rounding, cos_bit);
7388         btf_32_type0_avx2_new(
7389             cospi_m16, cospi_p48, x3[39], x3[56], x4[39], x4[56], __rounding, cos_bit);
7390         btf_32_type0_avx2_new(
7391             cospi_m48, cospi_m16, x3[40], x3[55], x4[40], x4[55], __rounding, cos_bit);
7392         btf_32_type0_avx2_new(
7393             cospi_m48, cospi_m16, x3[41], x3[54], x4[41], x4[54], __rounding, cos_bit);
7394         btf_32_type0_avx2_new(
7395             cospi_m48, cospi_m16, x3[42], x3[53], x4[42], x4[53], __rounding, cos_bit);
7396         btf_32_type0_avx2_new(
7397             cospi_m48, cospi_m16, x3[43], x3[52], x4[43], x4[52], __rounding, cos_bit);
7398         x4[44] = x3[44];
7399         x4[45] = x3[45];
7400         x4[46] = x3[46];
7401         x4[47] = x3[47];
7402         x4[48] = x3[48];
7403         x4[49] = x3[49];
7404         x4[50] = x3[50];
7405         x4[51] = x3[51];
7406         x4[60] = x3[60];
7407         x4[61] = x3[61];
7408         x4[62] = x3[62];
7409         x4[63] = x3[63];
7410 
7411         // stage 5
7412         __m256i x5[64];
7413         x5[0] = _mm256_add_epi32(x4[0], x4[3]);
7414         x5[3] = _mm256_sub_epi32(x4[0], x4[3]);
7415         x5[1] = _mm256_add_epi32(x4[1], x4[2]);
7416         x5[2] = _mm256_sub_epi32(x4[1], x4[2]);
7417         x5[4] = x4[4];
7418         btf_32_type0_avx2_new(
7419             cospi_m32, cospi_p32, x4[5], x4[6], x5[5], x5[6], __rounding, cos_bit);
7420         x5[7]  = x4[7];
7421         x5[8]  = _mm256_add_epi32(x4[8], x4[11]);
7422         x5[11] = _mm256_sub_epi32(x4[8], x4[11]);
7423         x5[9]  = _mm256_add_epi32(x4[9], x4[10]);
7424         x5[10] = _mm256_sub_epi32(x4[9], x4[10]);
7425         x5[12] = _mm256_sub_epi32(x4[15], x4[12]);
7426         x5[15] = _mm256_add_epi32(x4[15], x4[12]);
7427         x5[13] = _mm256_sub_epi32(x4[14], x4[13]);
7428         x5[14] = _mm256_add_epi32(x4[14], x4[13]);
7429         x5[16] = x4[16];
7430         x5[17] = x4[17];
7431         btf_32_type0_avx2_new(
7432             cospi_m16, cospi_p48, x4[18], x4[29], x5[18], x5[29], __rounding, cos_bit);
7433         btf_32_type0_avx2_new(
7434             cospi_m16, cospi_p48, x4[19], x4[28], x5[19], x5[28], __rounding, cos_bit);
7435         btf_32_type0_avx2_new(
7436             cospi_m48, cospi_m16, x4[20], x4[27], x5[20], x5[27], __rounding, cos_bit);
7437         btf_32_type0_avx2_new(
7438             cospi_m48, cospi_m16, x4[21], x4[26], x5[21], x5[26], __rounding, cos_bit);
7439         x5[22] = x4[22];
7440         x5[23] = x4[23];
7441         x5[24] = x4[24];
7442         x5[25] = x4[25];
7443         x5[30] = x4[30];
7444         x5[31] = x4[31];
7445         x5[32] = _mm256_add_epi32(x4[32], x4[39]);
7446         x5[39] = _mm256_sub_epi32(x4[32], x4[39]);
7447         x5[33] = _mm256_add_epi32(x4[33], x4[38]);
7448         x5[38] = _mm256_sub_epi32(x4[33], x4[38]);
7449         x5[34] = _mm256_add_epi32(x4[34], x4[37]);
7450         x5[37] = _mm256_sub_epi32(x4[34], x4[37]);
7451         x5[35] = _mm256_add_epi32(x4[35], x4[36]);
7452         x5[36] = _mm256_sub_epi32(x4[35], x4[36]);
7453         x5[40] = _mm256_sub_epi32(x4[47], x4[40]);
7454         x5[47] = _mm256_add_epi32(x4[47], x4[40]);
7455         x5[41] = _mm256_sub_epi32(x4[46], x4[41]);
7456         x5[46] = _mm256_add_epi32(x4[46], x4[41]);
7457         x5[42] = _mm256_sub_epi32(x4[45], x4[42]);
7458         x5[45] = _mm256_add_epi32(x4[45], x4[42]);
7459         x5[43] = _mm256_sub_epi32(x4[44], x4[43]);
7460         x5[44] = _mm256_add_epi32(x4[44], x4[43]);
7461         x5[48] = _mm256_add_epi32(x4[48], x4[55]);
7462         x5[55] = _mm256_sub_epi32(x4[48], x4[55]);
7463         x5[49] = _mm256_add_epi32(x4[49], x4[54]);
7464         x5[54] = _mm256_sub_epi32(x4[49], x4[54]);
7465         x5[50] = _mm256_add_epi32(x4[50], x4[53]);
7466         x5[53] = _mm256_sub_epi32(x4[50], x4[53]);
7467         x5[51] = _mm256_add_epi32(x4[51], x4[52]);
7468         x5[52] = _mm256_sub_epi32(x4[51], x4[52]);
7469         x5[56] = _mm256_sub_epi32(x4[63], x4[56]);
7470         x5[63] = _mm256_add_epi32(x4[63], x4[56]);
7471         x5[57] = _mm256_sub_epi32(x4[62], x4[57]);
7472         x5[62] = _mm256_add_epi32(x4[62], x4[57]);
7473         x5[58] = _mm256_sub_epi32(x4[61], x4[58]);
7474         x5[61] = _mm256_add_epi32(x4[61], x4[58]);
7475         x5[59] = _mm256_sub_epi32(x4[60], x4[59]);
7476         x5[60] = _mm256_add_epi32(x4[60], x4[59]);
7477 
7478         // stage 6
7479         __m256i x6[64];
7480         x6[0] = half_btf_avx2(&cospi_p32, &x5[0], &cospi_p32, &x5[1], &__rounding, cos_bit);
7481         x6[2] = half_btf_avx2(&cospi_p48, &x5[2], &cospi_p16, &x5[3], &__rounding, cos_bit);
7482         x6[4] = _mm256_add_epi32(x5[4], x5[5]);
7483         x6[5] = _mm256_sub_epi32(x5[4], x5[5]);
7484         x6[6] = _mm256_sub_epi32(x5[7], x5[6]);
7485         x6[7] = _mm256_add_epi32(x5[7], x5[6]);
7486         x6[8] = x5[8];
7487         btf_32_type0_avx2_new(
7488             cospi_m16, cospi_p48, x5[9], x5[14], x6[9], x6[14], __rounding, cos_bit);
7489         btf_32_type0_avx2_new(
7490             cospi_m48, cospi_m16, x5[10], x5[13], x6[10], x6[13], __rounding, cos_bit);
7491         x6[11] = x5[11];
7492         x6[12] = x5[12];
7493         x6[15] = x5[15];
7494         x6[16] = _mm256_add_epi32(x5[16], x5[19]);
7495         x6[19] = _mm256_sub_epi32(x5[16], x5[19]);
7496         x6[17] = _mm256_add_epi32(x5[17], x5[18]);
7497         x6[18] = _mm256_sub_epi32(x5[17], x5[18]);
7498         x6[20] = _mm256_sub_epi32(x5[23], x5[20]);
7499         x6[23] = _mm256_add_epi32(x5[23], x5[20]);
7500         x6[21] = _mm256_sub_epi32(x5[22], x5[21]);
7501         x6[22] = _mm256_add_epi32(x5[22], x5[21]);
7502         x6[24] = _mm256_add_epi32(x5[24], x5[27]);
7503         x6[27] = _mm256_sub_epi32(x5[24], x5[27]);
7504         x6[25] = _mm256_add_epi32(x5[25], x5[26]);
7505         x6[26] = _mm256_sub_epi32(x5[25], x5[26]);
7506         x6[28] = _mm256_sub_epi32(x5[31], x5[28]);
7507         x6[31] = _mm256_add_epi32(x5[31], x5[28]);
7508         x6[29] = _mm256_sub_epi32(x5[30], x5[29]);
7509         x6[30] = _mm256_add_epi32(x5[30], x5[29]);
7510         x6[32] = x5[32];
7511         x6[33] = x5[33];
7512         btf_32_type0_avx2_new(
7513             cospi_m08, cospi_p56, x5[34], x5[61], x6[34], x6[61], __rounding, cos_bit);
7514         btf_32_type0_avx2_new(
7515             cospi_m08, cospi_p56, x5[35], x5[60], x6[35], x6[60], __rounding, cos_bit);
7516         btf_32_type0_avx2_new(
7517             cospi_m56, cospi_m08, x5[36], x5[59], x6[36], x6[59], __rounding, cos_bit);
7518         btf_32_type0_avx2_new(
7519             cospi_m56, cospi_m08, x5[37], x5[58], x6[37], x6[58], __rounding, cos_bit);
7520         x6[38] = x5[38];
7521         x6[39] = x5[39];
7522         x6[40] = x5[40];
7523         x6[41] = x5[41];
7524         btf_32_type0_avx2_new(
7525             cospi_m40, cospi_p24, x5[42], x5[53], x6[42], x6[53], __rounding, cos_bit);
7526         btf_32_type0_avx2_new(
7527             cospi_m40, cospi_p24, x5[43], x5[52], x6[43], x6[52], __rounding, cos_bit);
7528         btf_32_type0_avx2_new(
7529             cospi_m24, cospi_m40, x5[44], x5[51], x6[44], x6[51], __rounding, cos_bit);
7530         btf_32_type0_avx2_new(
7531             cospi_m24, cospi_m40, x5[45], x5[50], x6[45], x6[50], __rounding, cos_bit);
7532         x6[46] = x5[46];
7533         x6[47] = x5[47];
7534         x6[48] = x5[48];
7535         x6[49] = x5[49];
7536         x6[54] = x5[54];
7537         x6[55] = x5[55];
7538         x6[56] = x5[56];
7539         x6[57] = x5[57];
7540         x6[62] = x5[62];
7541         x6[63] = x5[63];
7542 
7543         // stage 7
7544         __m256i x7[64];
7545         x7[0] = x6[0];
7546         //x7[1] = x6[1];
7547         x7[2] = x6[2];
7548         //x7[3] = x6[3];
7549         x7[4]  = half_btf_avx2(&cospi_p56, &x6[4], &cospi_p08, &x6[7], &__rounding, cos_bit);
7550         x7[6]  = half_btf_avx2(&cospi_p24, &x6[6], &cospi_m40, &x6[5], &__rounding, cos_bit);
7551         x7[8]  = _mm256_add_epi32(x6[8], x6[9]);
7552         x7[9]  = _mm256_sub_epi32(x6[8], x6[9]);
7553         x7[10] = _mm256_sub_epi32(x6[11], x6[10]);
7554         x7[11] = _mm256_add_epi32(x6[11], x6[10]);
7555         x7[12] = _mm256_add_epi32(x6[12], x6[13]);
7556         x7[13] = _mm256_sub_epi32(x6[12], x6[13]);
7557         x7[14] = _mm256_sub_epi32(x6[15], x6[14]);
7558         x7[15] = _mm256_add_epi32(x6[15], x6[14]);
7559         x7[16] = x6[16];
7560         btf_32_type0_avx2_new(
7561             cospi_m08, cospi_p56, x6[17], x6[30], x7[17], x7[30], __rounding, cos_bit);
7562         btf_32_type0_avx2_new(
7563             cospi_m56, cospi_m08, x6[18], x6[29], x7[18], x7[29], __rounding, cos_bit);
7564         x7[19] = x6[19];
7565         x7[20] = x6[20];
7566         btf_32_type0_avx2_new(
7567             cospi_m40, cospi_p24, x6[21], x6[26], x7[21], x7[26], __rounding, cos_bit);
7568         btf_32_type0_avx2_new(
7569             cospi_m24, cospi_m40, x6[22], x6[25], x7[22], x7[25], __rounding, cos_bit);
7570         x7[23] = x6[23];
7571         x7[24] = x6[24];
7572         x7[27] = x6[27];
7573         x7[28] = x6[28];
7574         x7[31] = x6[31];
7575         x7[32] = _mm256_add_epi32(x6[32], x6[35]);
7576         x7[35] = _mm256_sub_epi32(x6[32], x6[35]);
7577         x7[33] = _mm256_add_epi32(x6[33], x6[34]);
7578         x7[34] = _mm256_sub_epi32(x6[33], x6[34]);
7579         x7[36] = _mm256_sub_epi32(x6[39], x6[36]);
7580         x7[39] = _mm256_add_epi32(x6[39], x6[36]);
7581         x7[37] = _mm256_sub_epi32(x6[38], x6[37]);
7582         x7[38] = _mm256_add_epi32(x6[38], x6[37]);
7583         x7[40] = _mm256_add_epi32(x6[40], x6[43]);
7584         x7[43] = _mm256_sub_epi32(x6[40], x6[43]);
7585         x7[41] = _mm256_add_epi32(x6[41], x6[42]);
7586         x7[42] = _mm256_sub_epi32(x6[41], x6[42]);
7587         x7[44] = _mm256_sub_epi32(x6[47], x6[44]);
7588         x7[47] = _mm256_add_epi32(x6[47], x6[44]);
7589         x7[45] = _mm256_sub_epi32(x6[46], x6[45]);
7590         x7[46] = _mm256_add_epi32(x6[46], x6[45]);
7591         x7[48] = _mm256_add_epi32(x6[48], x6[51]);
7592         x7[51] = _mm256_sub_epi32(x6[48], x6[51]);
7593         x7[49] = _mm256_add_epi32(x6[49], x6[50]);
7594         x7[50] = _mm256_sub_epi32(x6[49], x6[50]);
7595         x7[52] = _mm256_sub_epi32(x6[55], x6[52]);
7596         x7[55] = _mm256_add_epi32(x6[55], x6[52]);
7597         x7[53] = _mm256_sub_epi32(x6[54], x6[53]);
7598         x7[54] = _mm256_add_epi32(x6[54], x6[53]);
7599         x7[56] = _mm256_add_epi32(x6[56], x6[59]);
7600         x7[59] = _mm256_sub_epi32(x6[56], x6[59]);
7601         x7[57] = _mm256_add_epi32(x6[57], x6[58]);
7602         x7[58] = _mm256_sub_epi32(x6[57], x6[58]);
7603         x7[60] = _mm256_sub_epi32(x6[63], x6[60]);
7604         x7[63] = _mm256_add_epi32(x6[63], x6[60]);
7605         x7[61] = _mm256_sub_epi32(x6[62], x6[61]);
7606         x7[62] = _mm256_add_epi32(x6[62], x6[61]);
7607 
7608         // stage 8
7609         __m256i x8[64];
7610         x8[0] = x7[0];
7611         x8[2] = x7[2];
7612         x8[4] = x7[4];
7613         x8[6] = x7[6];
7614 
7615         x8[8]  = half_btf_avx2(&cospi_p60, &x7[8], &cospi_p04, &x7[15], &__rounding, cos_bit);
7616         x8[14] = half_btf_avx2(&cospi_p28, &x7[14], &cospi_m36, &x7[9], &__rounding, cos_bit);
7617         x8[10] = half_btf_avx2(&cospi_p44, &x7[10], &cospi_p20, &x7[13], &__rounding, cos_bit);
7618         x8[12] = half_btf_avx2(&cospi_p12, &x7[12], &cospi_m52, &x7[11], &__rounding, cos_bit);
7619         x8[16] = _mm256_add_epi32(x7[16], x7[17]);
7620         x8[17] = _mm256_sub_epi32(x7[16], x7[17]);
7621         x8[18] = _mm256_sub_epi32(x7[19], x7[18]);
7622         x8[19] = _mm256_add_epi32(x7[19], x7[18]);
7623         x8[20] = _mm256_add_epi32(x7[20], x7[21]);
7624         x8[21] = _mm256_sub_epi32(x7[20], x7[21]);
7625         x8[22] = _mm256_sub_epi32(x7[23], x7[22]);
7626         x8[23] = _mm256_add_epi32(x7[23], x7[22]);
7627         x8[24] = _mm256_add_epi32(x7[24], x7[25]);
7628         x8[25] = _mm256_sub_epi32(x7[24], x7[25]);
7629         x8[26] = _mm256_sub_epi32(x7[27], x7[26]);
7630         x8[27] = _mm256_add_epi32(x7[27], x7[26]);
7631         x8[28] = _mm256_add_epi32(x7[28], x7[29]);
7632         x8[29] = _mm256_sub_epi32(x7[28], x7[29]);
7633         x8[30] = _mm256_sub_epi32(x7[31], x7[30]);
7634         x8[31] = _mm256_add_epi32(x7[31], x7[30]);
7635         x8[32] = x7[32];
7636         btf_32_type0_avx2_new(
7637             cospi_m04, cospi_p60, x7[33], x7[62], x8[33], x8[62], __rounding, cos_bit);
7638         btf_32_type0_avx2_new(
7639             cospi_m60, cospi_m04, x7[34], x7[61], x8[34], x8[61], __rounding, cos_bit);
7640         x8[35] = x7[35];
7641         x8[36] = x7[36];
7642         btf_32_type0_avx2_new(
7643             cospi_m36, cospi_p28, x7[37], x7[58], x8[37], x8[58], __rounding, cos_bit);
7644         btf_32_type0_avx2_new(
7645             cospi_m28, cospi_m36, x7[38], x7[57], x8[38], x8[57], __rounding, cos_bit);
7646         x8[39] = x7[39];
7647         x8[40] = x7[40];
7648         btf_32_type0_avx2_new(
7649             cospi_m20, cospi_p44, x7[41], x7[54], x8[41], x8[54], __rounding, cos_bit);
7650         btf_32_type0_avx2_new(
7651             cospi_m44, cospi_m20, x7[42], x7[53], x8[42], x8[53], __rounding, cos_bit);
7652         x8[43] = x7[43];
7653         x8[44] = x7[44];
7654         btf_32_type0_avx2_new(
7655             cospi_m52, cospi_p12, x7[45], x7[50], x8[45], x8[50], __rounding, cos_bit);
7656         btf_32_type0_avx2_new(
7657             cospi_m12, cospi_m52, x7[46], x7[49], x8[46], x8[49], __rounding, cos_bit);
7658         x8[47] = x7[47];
7659         x8[48] = x7[48];
7660         x8[51] = x7[51];
7661         x8[52] = x7[52];
7662         x8[55] = x7[55];
7663         x8[56] = x7[56];
7664         x8[59] = x7[59];
7665         x8[60] = x7[60];
7666         x8[63] = x7[63];
7667 
7668         // stage 9
7669         __m256i x9[64];
7670         x9[0]  = x8[0];
7671         x9[2]  = x8[2];
7672         x9[4]  = x8[4];
7673         x9[6]  = x8[6];
7674         x9[8]  = x8[8];
7675         x9[10] = x8[10];
7676         x9[12] = x8[12];
7677         x9[14] = x8[14];
7678         x9[16] = half_btf_avx2(&cospi_p62, &x8[16], &cospi_p02, &x8[31], &__rounding, cos_bit);
7679         x9[30] = half_btf_avx2(&cospi_p30, &x8[30], &cospi_m34, &x8[17], &__rounding, cos_bit);
7680         x9[18] = half_btf_avx2(&cospi_p46, &x8[18], &cospi_p18, &x8[29], &__rounding, cos_bit);
7681         x9[28] = half_btf_avx2(&cospi_p14, &x8[28], &cospi_m50, &x8[19], &__rounding, cos_bit);
7682         x9[20] = half_btf_avx2(&cospi_p54, &x8[20], &cospi_p10, &x8[27], &__rounding, cos_bit);
7683         x9[26] = half_btf_avx2(&cospi_p22, &x8[26], &cospi_m42, &x8[21], &__rounding, cos_bit);
7684         x9[22] = half_btf_avx2(&cospi_p38, &x8[22], &cospi_p26, &x8[25], &__rounding, cos_bit);
7685         x9[24] = half_btf_avx2(&cospi_p06, &x8[24], &cospi_m58, &x8[23], &__rounding, cos_bit);
7686         x9[32] = _mm256_add_epi32(x8[32], x8[33]);
7687         x9[33] = _mm256_sub_epi32(x8[32], x8[33]);
7688         x9[34] = _mm256_sub_epi32(x8[35], x8[34]);
7689         x9[35] = _mm256_add_epi32(x8[35], x8[34]);
7690         x9[36] = _mm256_add_epi32(x8[36], x8[37]);
7691         x9[37] = _mm256_sub_epi32(x8[36], x8[37]);
7692         x9[38] = _mm256_sub_epi32(x8[39], x8[38]);
7693         x9[39] = _mm256_add_epi32(x8[39], x8[38]);
7694         x9[40] = _mm256_add_epi32(x8[40], x8[41]);
7695         x9[41] = _mm256_sub_epi32(x8[40], x8[41]);
7696         x9[42] = _mm256_sub_epi32(x8[43], x8[42]);
7697         x9[43] = _mm256_add_epi32(x8[43], x8[42]);
7698         x9[44] = _mm256_add_epi32(x8[44], x8[45]);
7699         x9[45] = _mm256_sub_epi32(x8[44], x8[45]);
7700         x9[46] = _mm256_sub_epi32(x8[47], x8[46]);
7701         x9[47] = _mm256_add_epi32(x8[47], x8[46]);
7702         x9[48] = _mm256_add_epi32(x8[48], x8[49]);
7703         x9[49] = _mm256_sub_epi32(x8[48], x8[49]);
7704         x9[50] = _mm256_sub_epi32(x8[51], x8[50]);
7705         x9[51] = _mm256_add_epi32(x8[51], x8[50]);
7706         x9[52] = _mm256_add_epi32(x8[52], x8[53]);
7707         x9[53] = _mm256_sub_epi32(x8[52], x8[53]);
7708         x9[54] = _mm256_sub_epi32(x8[55], x8[54]);
7709         x9[55] = _mm256_add_epi32(x8[55], x8[54]);
7710         x9[56] = _mm256_add_epi32(x8[56], x8[57]);
7711         x9[57] = _mm256_sub_epi32(x8[56], x8[57]);
7712         x9[58] = _mm256_sub_epi32(x8[59], x8[58]);
7713         x9[59] = _mm256_add_epi32(x8[59], x8[58]);
7714         x9[60] = _mm256_add_epi32(x8[60], x8[61]);
7715         x9[61] = _mm256_sub_epi32(x8[60], x8[61]);
7716         x9[62] = _mm256_sub_epi32(x8[63], x8[62]);
7717         x9[63] = _mm256_add_epi32(x8[63], x8[62]);
7718 
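        // Stages 10-11 below only produce the 32 retained outputs: the even output
        // rows come straight from x9, the odd ones from the half_btf results in x10.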
7719         // stage 10
7720         __m256i x10[64];
7721         out[0 * stride]  = x9[0];
7722         out[16 * stride] = x9[2];
7723         out[8 * stride]  = x9[4];
7724         out[24 * stride] = x9[6];
7725         out[4 * stride]  = x9[8];
7726         out[20 * stride] = x9[10];
7727         out[12 * stride] = x9[12];
7728         out[28 * stride] = x9[14];
7729         out[2 * stride]  = x9[16];
7730         out[18 * stride] = x9[18];
7731         out[10 * stride] = x9[20];
7732         out[26 * stride] = x9[22];
7733         out[6 * stride]  = x9[24];
7734         out[22 * stride] = x9[26];
7735         out[14 * stride] = x9[28];
7736         out[30 * stride] = x9[30];
7737         x10[32] = half_btf_avx2(&cospi_p63, &x9[32], &cospi_p01, &x9[63], &__rounding, cos_bit);
7738         x10[62] = half_btf_avx2(&cospi_p31, &x9[62], &cospi_m33, &x9[33], &__rounding, cos_bit);
7739         x10[34] = half_btf_avx2(&cospi_p47, &x9[34], &cospi_p17, &x9[61], &__rounding, cos_bit);
7740         x10[60] = half_btf_avx2(&cospi_p15, &x9[60], &cospi_m49, &x9[35], &__rounding, cos_bit);
7741         x10[36] = half_btf_avx2(&cospi_p55, &x9[36], &cospi_p09, &x9[59], &__rounding, cos_bit);
7742         x10[58] = half_btf_avx2(&cospi_p23, &x9[58], &cospi_m41, &x9[37], &__rounding, cos_bit);
7743         x10[38] = half_btf_avx2(&cospi_p39, &x9[38], &cospi_p25, &x9[57], &__rounding, cos_bit);
7744         x10[56] = half_btf_avx2(&cospi_p07, &x9[56], &cospi_m57, &x9[39], &__rounding, cos_bit);
7745         x10[40] = half_btf_avx2(&cospi_p59, &x9[40], &cospi_p05, &x9[55], &__rounding, cos_bit);
7746         x10[54] = half_btf_avx2(&cospi_p27, &x9[54], &cospi_m37, &x9[41], &__rounding, cos_bit);
7747         x10[42] = half_btf_avx2(&cospi_p43, &x9[42], &cospi_p21, &x9[53], &__rounding, cos_bit);
7748         x10[52] = half_btf_avx2(&cospi_p11, &x9[52], &cospi_m53, &x9[43], &__rounding, cos_bit);
7749         x10[44] = half_btf_avx2(&cospi_p51, &x9[44], &cospi_p13, &x9[51], &__rounding, cos_bit);
7750         x10[50] = half_btf_avx2(&cospi_p19, &x9[50], &cospi_m45, &x9[45], &__rounding, cos_bit);
7751         x10[46] = half_btf_avx2(&cospi_p35, &x9[46], &cospi_p29, &x9[49], &__rounding, cos_bit);
7752         x10[48] = half_btf_avx2(&cospi_p03, &x9[48], &cospi_m61, &x9[47], &__rounding, cos_bit);
7753 
7754         // stage 11
7755         out[1 * stride]  = x10[32];
7756         out[3 * stride]  = x10[48];
7757         out[5 * stride]  = x10[40];
7758         out[7 * stride]  = x10[56];
7759         out[9 * stride]  = x10[36];
7760         out[11 * stride] = x10[52];
7761         out[13 * stride] = x10[44];
7762         out[15 * stride] = x10[60];
7763         out[17 * stride] = x10[34];
7764         out[19 * stride] = x10[50];
7765         out[21 * stride] = x10[42];
7766         out[23 * stride] = x10[58];
7767         out[25 * stride] = x10[38];
7768         out[27 * stride] = x10[54];
7769         out[29 * stride] = x10[46];
7770         out[31 * stride] = x10[62];
7771     }
7772 }
7773 
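// Identity transform for 64x64, N2 variant: scale by 4 * new_sqrt2 and shift by
// new_sqrt2_bits, i.e. an overall factor of about 4 * sqrt(2). Only the first four
// registers of each group of eight are processed, over the first half of the loop
// range — apparently the top-left quadrant kept by the N2 path.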
7774 static void fidtx64x64_N2_avx2(const __m256i *input, __m256i *output) {
7775     const int32_t bits     = 12; // new_sqrt2_bits = 12
7776     const int32_t sqrt     = 4 * 5793; // 4 * new_sqrt2
7777     const __m256i newsqrt  = _mm256_set1_epi32(sqrt);
7778     const __m256i rounding = _mm256_set1_epi32(1 << (bits - 1));
7779 
7780     __m256i temp;
7781     for (int32_t i = 0; i < 256; i += 8) {
7782         temp          = _mm256_mullo_epi32(input[i], newsqrt);
7783         temp          = _mm256_add_epi32(temp, rounding);
7784         output[i]     = _mm256_srai_epi32(temp, bits);
7785         temp          = _mm256_mullo_epi32(input[i + 1], newsqrt);
7786         temp          = _mm256_add_epi32(temp, rounding);
7787         output[i + 1] = _mm256_srai_epi32(temp, bits);
7788         temp          = _mm256_mullo_epi32(input[i + 2], newsqrt);
7789         temp          = _mm256_add_epi32(temp, rounding);
7790         output[i + 2] = _mm256_srai_epi32(temp, bits);
7791         temp          = _mm256_mullo_epi32(input[i + 3], newsqrt);
7792         temp          = _mm256_add_epi32(temp, rounding);
7793         output[i + 3] = _mm256_srai_epi32(temp, bits);
7794     }
7795 }
7796 
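// Same rounding helper as the 32-point N2 version, but over groups of eight
// registers with only the first four of each group processed.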
7797 static INLINE void av1_round_shift_array_64_N2_avx2(__m256i *input, __m256i *output,
7798                                                     const int32_t size, const int32_t bit) {
7799     int32_t i;
7800     if (bit > 0) {
7801         const __m256i round = _mm256_set1_epi32(1 << (bit - 1));
7802         __m256i       r0;
7803         for (i = 0; i < size; i += 8) {
7804             r0            = _mm256_add_epi32(input[i], round);
7805             output[i]     = _mm256_srai_epi32(r0, bit);
7806             r0            = _mm256_add_epi32(input[i + 1], round);
7807             output[i + 1] = _mm256_srai_epi32(r0, bit);
7808             r0            = _mm256_add_epi32(input[i + 2], round);
7809             output[i + 2] = _mm256_srai_epi32(r0, bit);
7810             r0            = _mm256_add_epi32(input[i + 3], round);
7811             output[i + 3] = _mm256_srai_epi32(r0, bit);
7812         }
7813     } else {
7814         for (i = 0; i < size; i += 8) {
7815             output[i]     = _mm256_slli_epi32(input[i], -bit);
7816             output[i + 1] = _mm256_slli_epi32(input[i + 1], -bit);
7817             output[i + 2] = _mm256_slli_epi32(input[i + 2], -bit);
7818             output[i + 3] = _mm256_slli_epi32(input[i + 3], -bit);
7819         }
7820     }
7821 }
7822 
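// Identity transform helper: every row is scaled by 4 (left shift by 2); cos_bit is
// unused.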
7823 void av1_idtx16_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit,
7824                          const int32_t col_num) {
7825     (void)cos_bit;
7826     for (int32_t i = 0; i < 16; i++) output[i * col_num] = _mm256_slli_epi32(input[i * col_num], 2);
7827 }
7828 
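// Identity transform helper for the 32x8 N2 path: each retained row is doubled
// (left shift by 1); cos_bit is unused.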
7829 void fidtx32x8_N2_avx2(const __m256i *input, __m256i *output, int8_t cos_bit, const int32_t col_num,
7830                        int32_t row_num) {
7831     (void)cos_bit;
7832     for (int32_t i = 0; i < row_num; i++)
7833         output[i * col_num] = _mm256_slli_epi32(input[i * col_num], 1);
7834 }
7835 
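// Identity transform for the 32x16 N2 path: scale by 2 * new_sqrt2 with rounding and
// a shift by new_sqrt2_bits (an overall factor of about 2 * sqrt(2)); only the first
// two of each row's four registers are processed.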
7836 static void fidtx32x16_N2_avx2(const __m256i *in, __m256i *out, int8_t bit, int32_t rows) {
7837     (void)bit;
7838     const int32_t bits     = 12; // new_sqrt2_bits = 12
7839     const int32_t sqrt     = 2 * 5793; // 2 * new_sqrt2
7840     const __m256i newsqrt  = _mm256_set1_epi32(sqrt);
7841     const __m256i rounding = _mm256_set1_epi32(1 << (bits - 1));
7842     __m256i       temp;
7843 
7844     for (int32_t i = 0; i < rows; i++) {
7845         temp           = _mm256_mullo_epi32(in[i * 4], newsqrt);
7846         temp           = _mm256_add_epi32(temp, rounding);
7847         out[i * 4]     = _mm256_srai_epi32(temp, bits);
7848         temp           = _mm256_mullo_epi32(in[i * 4 + 1], newsqrt);
7849         temp           = _mm256_add_epi32(temp, rounding);
7850         out[i * 4 + 1] = _mm256_srai_epi32(temp, bits);
7851     }
7852 }
7853 
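// Identity variant for the 32x16 N2 path: the two retained registers of each row are
// scaled by 4 (left shift by 2).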
7854 void av1_idtx32x16_N2_avx2(const __m256i *input, __m256i *output, const int32_t rows) {
7855     for (int32_t i = 0; i < rows; i++) {
7856         output[i * 4]     = _mm256_slli_epi32(input[i * 4], 2);
7857         output[i * 4 + 1] = _mm256_slli_epi32(input[i * 4 + 1], 2);
7858     }
7859 }
7860 
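// Column-transform rounding for the 32x16 N2 path: round-shifts every fourth
// register of the 32-register column buffer by 'shift'.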
7861 static AOM_FORCE_INLINE void col_txfm_32x16_N2_rounding(__m256i *in, int32_t shift) {
7862     const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
7863     in[0]                  = _mm256_add_epi32(in[0], rounding);
7864     in[4]                  = _mm256_add_epi32(in[4], rounding);
7865     in[8]                  = _mm256_add_epi32(in[8], rounding);
7866     in[12]                 = _mm256_add_epi32(in[12], rounding);
7867     in[0]                  = _mm256_srai_epi32(in[0], shift);
7868     in[4]                  = _mm256_srai_epi32(in[4], shift);
7869     in[8]                  = _mm256_srai_epi32(in[8], shift);
7870     in[12]                 = _mm256_srai_epi32(in[12], shift);
7871     in[16]                 = _mm256_add_epi32(in[16], rounding);
7872     in[20]                 = _mm256_add_epi32(in[20], rounding);
7873     in[24]                 = _mm256_add_epi32(in[24], rounding);
7874     in[28]                 = _mm256_add_epi32(in[28], rounding);
7875     in[16]                 = _mm256_srai_epi32(in[16], shift);
7876     in[20]                 = _mm256_srai_epi32(in[20], shift);
7877     in[24]                 = _mm256_srai_epi32(in[24], shift);
7878     in[28]                 = _mm256_srai_epi32(in[28], shift);
7879 }
7880 
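// 16-point forward DCT over 4 columns (__m128i lanes), N2 variant: only the first
// 8 of the 16 outputs are computed.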
7881 static void fdct16x4_N2_avx2(__m256i *input, __m256i *output, int32_t bit) {
7882     __m128i *in  = (__m128i *)input;
7883     __m128i *out = (__m128i *)output;
7884 
7885     const int32_t *cospi    = cospi_arr(bit);
7886     const __m128i  cospi32  = _mm_set1_epi32(cospi[32]);
7887     const __m128i  cospim32 = _mm_set1_epi32(-cospi[32]);
7888     const __m128i  cospi48  = _mm_set1_epi32(cospi[48]);
7889     const __m128i  cospi16  = _mm_set1_epi32(cospi[16]);
7890     const __m128i  cospim48 = _mm_set1_epi32(-cospi[48]);
7891     const __m128i  cospim16 = _mm_set1_epi32(-cospi[16]);
7892     const __m128i  cospi56  = _mm_set1_epi32(cospi[56]);
7893     const __m128i  cospi8   = _mm_set1_epi32(cospi[8]);
7894     const __m128i  cospi24  = _mm_set1_epi32(cospi[24]);
7895     const __m128i  cospi40  = _mm_set1_epi32(cospi[40]);
7896     const __m128i  cospi60  = _mm_set1_epi32(cospi[60]);
7897     const __m128i  cospi4   = _mm_set1_epi32(cospi[4]);
7898     const __m128i  cospi28  = _mm_set1_epi32(cospi[28]);
7899     const __m128i  cospi36  = _mm_set1_epi32(cospi[36]);
7900     const __m128i  cospi44  = _mm_set1_epi32(cospi[44]);
7901     const __m128i  cospi20  = _mm_set1_epi32(cospi[20]);
7902     const __m128i  cospi12  = _mm_set1_epi32(cospi[12]);
7903     const __m128i  cospi52  = _mm_set1_epi32(cospi[52]);
7904     const __m128i  rnding   = _mm_set1_epi32(1 << (bit - 1));
7905     __m128i        u[16], v[16], x;
7906 
7907     // stage 0
7908     // stage 1
7909     u[0]  = _mm_add_epi32(in[0], in[15]);
7910     v[15] = _mm_sub_epi32(in[0], in[15]);
7911     u[1]  = _mm_add_epi32(in[1], in[14]);
7912     v[14] = _mm_sub_epi32(in[1], in[14]);
7913     u[2]  = _mm_add_epi32(in[2], in[13]);
7914     u[13] = _mm_sub_epi32(in[2], in[13]);
7915     u[3]  = _mm_add_epi32(in[3], in[12]);
7916     u[12] = _mm_sub_epi32(in[3], in[12]);
7917     u[4]  = _mm_add_epi32(in[4], in[11]);
7918     u[11] = _mm_sub_epi32(in[4], in[11]);
7919     u[5]  = _mm_add_epi32(in[5], in[10]);
7920     u[10] = _mm_sub_epi32(in[5], in[10]);
7921     u[6]  = _mm_add_epi32(in[6], in[9]);
7922     v[9]  = _mm_sub_epi32(in[6], in[9]);
7923     u[7]  = _mm_add_epi32(in[7], in[8]);
7924     v[8]  = _mm_sub_epi32(in[7], in[8]);
7925 
7926     // stage 2
7927     v[0] = _mm_add_epi32(u[0], u[7]);
7928     u[7] = _mm_sub_epi32(u[0], u[7]);
7929     v[1] = _mm_add_epi32(u[1], u[6]);
7930     v[6] = _mm_sub_epi32(u[1], u[6]);
7931     v[2] = _mm_add_epi32(u[2], u[5]);
7932     v[5] = _mm_sub_epi32(u[2], u[5]);
7933     v[3] = _mm_add_epi32(u[3], u[4]);
7934     u[4] = _mm_sub_epi32(u[3], u[4]);
7935 
7936     v[10] = _mm_mullo_epi32(u[10], cospim32);
7937     x     = _mm_mullo_epi32(u[13], cospi32);
7938     v[10] = _mm_add_epi32(v[10], x);
7939     v[10] = _mm_add_epi32(v[10], rnding);
7940     v[10] = _mm_srai_epi32(v[10], bit);
7941 
7942     v[13] = _mm_mullo_epi32(u[10], cospi32);
7943     x     = _mm_mullo_epi32(u[13], cospim32);
7944     v[13] = _mm_sub_epi32(v[13], x);
7945     v[13] = _mm_add_epi32(v[13], rnding);
7946     v[13] = _mm_srai_epi32(v[13], bit);
7947 
7948     v[11] = _mm_mullo_epi32(u[11], cospim32);
7949     x     = _mm_mullo_epi32(u[12], cospi32);
7950     v[11] = _mm_add_epi32(v[11], x);
7951     v[11] = _mm_add_epi32(v[11], rnding);
7952     v[11] = _mm_srai_epi32(v[11], bit);
7953 
7954     v[12] = _mm_mullo_epi32(u[11], cospi32);
7955     x     = _mm_mullo_epi32(u[12], cospim32);
7956     v[12] = _mm_sub_epi32(v[12], x);
7957     v[12] = _mm_add_epi32(v[12], rnding);
7958     v[12] = _mm_srai_epi32(v[12], bit);
7959 
7960     // stage 3
7961     u[0] = _mm_add_epi32(v[0], v[3]);
7962     u[3] = _mm_sub_epi32(v[0], v[3]);
7963     u[1] = _mm_add_epi32(v[1], v[2]);
7964     u[2] = _mm_sub_epi32(v[1], v[2]);
7965 
7966     u[5] = _mm_mullo_epi32(v[5], cospim32);
7967     x    = _mm_mullo_epi32(v[6], cospi32);
7968     u[5] = _mm_add_epi32(u[5], x);
7969     u[5] = _mm_add_epi32(u[5], rnding);
7970     u[5] = _mm_srai_epi32(u[5], bit);
7971 
7972     u[6] = _mm_mullo_epi32(v[5], cospi32);
7973     x    = _mm_mullo_epi32(v[6], cospim32);
7974     u[6] = _mm_sub_epi32(u[6], x);
7975     u[6] = _mm_add_epi32(u[6], rnding);
7976     u[6] = _mm_srai_epi32(u[6], bit);
7977 
7978     u[8]  = _mm_add_epi32(v[8], v[11]);
7979     v[11] = _mm_sub_epi32(v[8], v[11]);
7980     u[9]  = _mm_add_epi32(v[9], v[10]);
7981     u[10] = _mm_sub_epi32(v[9], v[10]);
7982     u[12] = _mm_sub_epi32(v[15], v[12]);
7983     v[15] = _mm_add_epi32(v[15], v[12]);
7984     u[13] = _mm_sub_epi32(v[14], v[13]);
7985     u[14] = _mm_add_epi32(v[14], v[13]);
7986 
7987     // stage 4
7988     u[0]   = _mm_mullo_epi32(u[0], cospi32);
7989     u[1]   = _mm_mullo_epi32(u[1], cospi32);
7990     v[0]   = _mm_add_epi32(u[0], u[1]);
7991     v[0]   = _mm_add_epi32(v[0], rnding);
7992     out[0] = _mm_srai_epi32(v[0], bit);
7993 
7994     v[2]   = _mm_mullo_epi32(u[2], cospi48);
7995     x      = _mm_mullo_epi32(u[3], cospi16);
7996     v[2]   = _mm_add_epi32(v[2], x);
7997     v[2]   = _mm_add_epi32(v[2], rnding);
7998     out[4] = _mm_srai_epi32(v[2], bit);
7999 
8000     v[4] = _mm_add_epi32(u[4], u[5]);
8001     v[5] = _mm_sub_epi32(u[4], u[5]);
8002     v[6] = _mm_sub_epi32(u[7], u[6]);
8003     v[7] = _mm_add_epi32(u[7], u[6]);
8004     v[8] = u[8];
8005 
8006     v[9] = _mm_mullo_epi32(u[9], cospim16);
8007     x    = _mm_mullo_epi32(u[14], cospi48);
8008     v[9] = _mm_add_epi32(v[9], x);
8009     v[9] = _mm_add_epi32(v[9], rnding);
8010     v[9] = _mm_srai_epi32(v[9], bit);
8011 
8012     v[14] = _mm_mullo_epi32(u[9], cospi48);
8013     x     = _mm_mullo_epi32(u[14], cospim16);
8014     v[14] = _mm_sub_epi32(v[14], x);
8015     v[14] = _mm_add_epi32(v[14], rnding);
8016     v[14] = _mm_srai_epi32(v[14], bit);
8017 
8018     v[10] = _mm_mullo_epi32(u[10], cospim48);
8019     x     = _mm_mullo_epi32(u[13], cospim16);
8020     v[10] = _mm_add_epi32(v[10], x);
8021     v[10] = _mm_add_epi32(v[10], rnding);
8022     v[10] = _mm_srai_epi32(v[10], bit);
8023 
8024     v[13] = _mm_mullo_epi32(u[10], cospim16);
8025     x     = _mm_mullo_epi32(u[13], cospim48);
8026     v[13] = _mm_sub_epi32(v[13], x);
8027     v[13] = _mm_add_epi32(v[13], rnding);
8028     v[13] = _mm_srai_epi32(v[13], bit);
8029 
8030     v[12] = u[12];
8031 
8032     // stage 5
8033     u[4]   = _mm_mullo_epi32(v[4], cospi56);
8034     x      = _mm_mullo_epi32(v[7], cospi8);
8035     u[4]   = _mm_add_epi32(u[4], x);
8036     u[4]   = _mm_add_epi32(u[4], rnding);
8037     out[2] = _mm_srai_epi32(u[4], bit);
8038 
8039     u[6]   = _mm_mullo_epi32(v[5], cospi40);
8040     x      = _mm_mullo_epi32(v[6], cospi24);
8041     u[6]   = _mm_sub_epi32(x, u[6]);
8042     u[6]   = _mm_add_epi32(u[6], rnding);
8043     out[6] = _mm_srai_epi32(u[6], bit);
8044 
8045     u[8]  = _mm_add_epi32(v[8], v[9]);
8046     u[9]  = _mm_sub_epi32(v[8], v[9]);
8047     u[10] = _mm_sub_epi32(v[11], v[10]);
8048     u[11] = _mm_add_epi32(v[11], v[10]);
8049     u[12] = _mm_add_epi32(v[12], v[13]);
8050     u[13] = _mm_sub_epi32(v[12], v[13]);
8051     u[14] = _mm_sub_epi32(v[15], v[14]);
8052     u[15] = _mm_add_epi32(v[15], v[14]);
8053 
8054     // stage 6
8055     v[8]   = _mm_mullo_epi32(u[8], cospi60);
8056     x      = _mm_mullo_epi32(u[15], cospi4);
8057     v[8]   = _mm_add_epi32(v[8], x);
8058     v[8]   = _mm_add_epi32(v[8], rnding);
8059     out[1] = _mm_srai_epi32(v[8], bit);
8060 
8061     v[14]  = _mm_mullo_epi32(u[9], cospi36);
8062     x      = _mm_mullo_epi32(u[14], cospi28);
8063     v[14]  = _mm_sub_epi32(x, v[14]);
8064     v[14]  = _mm_add_epi32(v[14], rnding);
8065     out[7] = _mm_srai_epi32(v[14], bit);
8066 
8067     v[10]  = _mm_mullo_epi32(u[10], cospi44);
8068     x      = _mm_mullo_epi32(u[13], cospi20);
8069     v[10]  = _mm_add_epi32(v[10], x);
8070     v[10]  = _mm_add_epi32(v[10], rnding);
8071     out[5] = _mm_srai_epi32(v[10], bit);
8072 
8073     v[12]  = _mm_mullo_epi32(u[11], cospi52);
8074     x      = _mm_mullo_epi32(u[12], cospi12);
8075     v[12]  = _mm_sub_epi32(x, v[12]);
8076     v[12]  = _mm_add_epi32(v[12], rnding);
8077     out[3] = _mm_srai_epi32(v[12], bit);
8078 }
8079 
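// 16-point forward ADST over 4 columns (__m128i lanes), N2 variant: only the first
// 8 of the 16 outputs are computed.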
8080 static void fadst16x4_N2_avx2(__m256i *input, __m256i *output, int32_t bit) {
8081     __m128i *in  = (__m128i *)input;
8082     __m128i *out = (__m128i *)output;
8083 
8084     const int32_t *cospi    = cospi_arr(bit);
8085     const __m128i  cospi32  = _mm_set1_epi32(cospi[32]);
8086     const __m128i  cospi48  = _mm_set1_epi32(cospi[48]);
8087     const __m128i  cospi16  = _mm_set1_epi32(cospi[16]);
8088     const __m128i  cospim16 = _mm_set1_epi32(-cospi[16]);
8089     const __m128i  cospim48 = _mm_set1_epi32(-cospi[48]);
8090     const __m128i  cospi8   = _mm_set1_epi32(cospi[8]);
8091     const __m128i  cospi56  = _mm_set1_epi32(cospi[56]);
8092     const __m128i  cospim56 = _mm_set1_epi32(-cospi[56]);
8093     const __m128i  cospim8  = _mm_set1_epi32(-cospi[8]);
8094     const __m128i  cospi24  = _mm_set1_epi32(cospi[24]);
8095     const __m128i  cospim24 = _mm_set1_epi32(-cospi[24]);
8096     const __m128i  cospim40 = _mm_set1_epi32(-cospi[40]);
8097     const __m128i  cospi40  = _mm_set1_epi32(cospi[40]);
8098     const __m128i  cospi62  = _mm_set1_epi32(cospi[62]);
8099     const __m128i  cospim2  = _mm_set1_epi32(-cospi[2]);
8100     const __m128i  cospi54  = _mm_set1_epi32(cospi[54]);
8101     const __m128i  cospim10 = _mm_set1_epi32(-cospi[10]);
8102     const __m128i  cospi46  = _mm_set1_epi32(cospi[46]);
8103     const __m128i  cospim18 = _mm_set1_epi32(-cospi[18]);
8104     const __m128i  cospi38  = _mm_set1_epi32(cospi[38]);
8105     const __m128i  cospim26 = _mm_set1_epi32(-cospi[26]);
8106     const __m128i  cospi34  = _mm_set1_epi32(cospi[34]);
8107     const __m128i  cospi30  = _mm_set1_epi32(cospi[30]);
8108     const __m128i  cospi42  = _mm_set1_epi32(cospi[42]);
8109     const __m128i  cospi22  = _mm_set1_epi32(cospi[22]);
8110     const __m128i  cospi50  = _mm_set1_epi32(cospi[50]);
8111     const __m128i  cospi14  = _mm_set1_epi32(cospi[14]);
8112     const __m128i  cospi58  = _mm_set1_epi32(cospi[58]);
8113     const __m128i  cospi6   = _mm_set1_epi32(cospi[6]);
8114     const __m128i  rnding   = _mm_set1_epi32(1 << (bit - 1));
8115     const __m128i  zero     = _mm_setzero_si128();
8116 
8117     __m128i u[16], v[16], x, y;
8118     __m128i tmp[13];
8119 
8120     tmp[0] = _mm_sub_epi32(zero, in[15]);
8121     u[2]   = _mm_sub_epi32(zero, in[7]);
8122     tmp[1] = _mm_sub_epi32(zero, in[3]);
8123     u[7]   = _mm_sub_epi32(zero, in[11]);
8124     tmp[2] = _mm_sub_epi32(zero, in[1]);
8125     u[11]  = _mm_sub_epi32(zero, in[9]);
8126     tmp[3] = _mm_sub_epi32(zero, in[13]);
8127     u[14]  = _mm_sub_epi32(zero, in[5]);
8128 
8129     // stage 2
8130 
8131     x    = _mm_mullo_epi32(u[2], cospi32);
8132     y    = _mm_mullo_epi32(in[8], cospi32);
8133     v[2] = _mm_add_epi32(x, y);
8134     v[2] = _mm_add_epi32(v[2], rnding);
8135     v[2] = _mm_srai_epi32(v[2], bit);
8136 
8137     v[3] = _mm_sub_epi32(x, y);
8138     v[3] = _mm_add_epi32(v[3], rnding);
8139     v[3] = _mm_srai_epi32(v[3], bit);
8140 
8141     x    = _mm_mullo_epi32(in[4], cospi32);
8142     y    = _mm_mullo_epi32(u[7], cospi32);
8143     v[6] = _mm_add_epi32(x, y);
8144     v[6] = _mm_add_epi32(v[6], rnding);
8145     v[6] = _mm_srai_epi32(v[6], bit);
8146 
8147     v[7] = _mm_sub_epi32(x, y);
8148     v[7] = _mm_add_epi32(v[7], rnding);
8149     v[7] = _mm_srai_epi32(v[7], bit);
8150 
8151     x     = _mm_mullo_epi32(in[6], cospi32);
8152     y     = _mm_mullo_epi32(u[11], cospi32);
8153     v[10] = _mm_add_epi32(x, y);
8154     v[10] = _mm_add_epi32(v[10], rnding);
8155     v[10] = _mm_srai_epi32(v[10], bit);
8156 
8157     v[11] = _mm_sub_epi32(x, y);
8158     v[11] = _mm_add_epi32(v[11], rnding);
8159     v[11] = _mm_srai_epi32(v[11], bit);
8160 
8161     x     = _mm_mullo_epi32(u[14], cospi32);
8162     y     = _mm_mullo_epi32(in[10], cospi32);
8163     v[14] = _mm_add_epi32(x, y);
8164     v[14] = _mm_add_epi32(v[14], rnding);
8165     v[14] = _mm_srai_epi32(v[14], bit);
8166 
8167     v[15] = _mm_sub_epi32(x, y);
8168     v[15] = _mm_add_epi32(v[15], rnding);
8169     v[15] = _mm_srai_epi32(v[15], bit);
8170 
8171     // stage 3
8172     tmp[4] = _mm_add_epi32(in[0], v[2]);
8173     tmp[5] = _mm_add_epi32(tmp[0], v[3]);
8174     tmp[6] = _mm_sub_epi32(in[0], v[2]);
8175     tmp[0] = _mm_sub_epi32(tmp[0], v[3]);
8176     u[4]   = _mm_add_epi32(tmp[1], v[6]);
8177     u[5]   = _mm_add_epi32(in[12], v[7]);
8178     u[6]   = _mm_sub_epi32(tmp[1], v[6]);
8179     u[7]   = _mm_sub_epi32(in[12], v[7]);
8180     tmp[1] = _mm_add_epi32(tmp[2], v[10]);
8181     tmp[7] = _mm_add_epi32(in[14], v[11]);
8182     tmp[2] = _mm_sub_epi32(tmp[2], v[10]);
8183     tmp[8] = _mm_sub_epi32(in[14], v[11]);
8184     u[12]  = _mm_add_epi32(in[2], v[14]);
8185     u[13]  = _mm_add_epi32(tmp[3], v[15]);
8186     u[14]  = _mm_sub_epi32(in[2], v[14]);
8187     u[15]  = _mm_sub_epi32(tmp[3], v[15]);
8188 
8189     // stage 4
8190     v[4]  = half_btf_small(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit);
8191     v[5]  = half_btf_small(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit);
8192     v[6]  = half_btf_small(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit);
8193     v[7]  = half_btf_small(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit);
8194     v[12] = half_btf_small(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit);
8195     v[13] = half_btf_small(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit);
8196     v[14] = half_btf_small(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit);
8197     v[15] = half_btf_small(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit);
8198 
8199     // stage 5
8200     tmp[9]  = _mm_add_epi32(tmp[4], v[4]);
8201     tmp[10] = _mm_add_epi32(tmp[5], v[5]);
8202     tmp[11] = _mm_add_epi32(tmp[6], v[6]);
8203     tmp[12] = _mm_add_epi32(tmp[0], v[7]);
8204     tmp[4]  = _mm_sub_epi32(tmp[4], v[4]);
8205     tmp[5]  = _mm_sub_epi32(tmp[5], v[5]);
8206     tmp[6]  = _mm_sub_epi32(tmp[6], v[6]);
8207     tmp[0]  = _mm_sub_epi32(tmp[0], v[7]);
8208     u[8]    = _mm_add_epi32(tmp[1], v[12]);
8209     u[9]    = _mm_add_epi32(tmp[7], v[13]);
8210     u[10]   = _mm_add_epi32(tmp[2], v[14]);
8211     u[11]   = _mm_add_epi32(tmp[8], v[15]);
8212     u[12]   = _mm_sub_epi32(tmp[1], v[12]);
8213     u[13]   = _mm_sub_epi32(tmp[7], v[13]);
8214     u[14]   = _mm_sub_epi32(tmp[2], v[14]);
8215     u[15]   = _mm_sub_epi32(tmp[8], v[15]);
8216 
8217     // stage 6
8218     v[8]  = half_btf_small(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit);
8219     v[9]  = half_btf_small(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit);
8220     v[10] = half_btf_small(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit);
8221     v[11] = half_btf_small(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit);
8222     v[12] = half_btf_small(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit);
8223     v[13] = half_btf_small(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit);
8224     v[14] = half_btf_small(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit);
8225     v[15] = half_btf_small(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit);
8226 
8227     // stage 7
8228     u[0]  = _mm_add_epi32(tmp[9], v[8]);
8229     u[1]  = _mm_add_epi32(tmp[10], v[9]);
8230     u[2]  = _mm_add_epi32(tmp[11], v[10]);
8231     u[3]  = _mm_add_epi32(tmp[12], v[11]);
8232     u[4]  = _mm_add_epi32(tmp[4], v[12]);
8233     u[5]  = _mm_add_epi32(tmp[5], v[13]);
8234     u[6]  = _mm_add_epi32(tmp[6], v[14]);
8235     u[7]  = _mm_add_epi32(tmp[0], v[15]);
8236     u[8]  = _mm_sub_epi32(tmp[9], v[8]);
8237     u[9]  = _mm_sub_epi32(tmp[10], v[9]);
8238     u[10] = _mm_sub_epi32(tmp[11], v[10]);
8239     u[11] = _mm_sub_epi32(tmp[12], v[11]);
8240     u[12] = _mm_sub_epi32(tmp[4], v[12]);
8241     u[13] = _mm_sub_epi32(tmp[5], v[13]);
8242     u[14] = _mm_sub_epi32(tmp[6], v[14]);
8243     u[15] = _mm_sub_epi32(tmp[0], v[15]);
8244 
8245     // stage 8
8246     out[0] = half_btf_small(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit);
8247     out[2] = half_btf_small(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit);
8248     out[4] = half_btf_small(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit);
8249     out[6] = half_btf_small(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit);
8250     out[7] = half_btf_small(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit);
8251     out[5] = half_btf_small(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit);
8252     out[3] = half_btf_small(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit);
8253     out[1] = half_btf_small(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit);
8254 }
8255 
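// 4-point forward DCT for the column pass of the 4x8 N2 path: only 2 of the 4
// outputs are computed, then the 4x4 block is transposed in-register with the
// unused half zeroed.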
8256 static void fdct4x8_col_N2_avx2(__m256i *in, __m256i *output, int32_t bit, const int32_t num_col) {
8257     const int32_t *cospi   = cospi_arr(bit);
8258     const __m256i  zero    = _mm256_setzero_si256();
8259     const __m256i  cospi32 = _mm256_set1_epi32(cospi[32]);
8260     const __m256i  cospi48 = _mm256_set1_epi32(cospi[48]);
8261     const __m256i  cospi16 = _mm256_set1_epi32(cospi[16]);
8262     const __m256i  rnding  = _mm256_set1_epi32(1 << (bit - 1));
8263     __m256i        s0, s1, s2, s3;
8264     __m256i        u0, u1, u2, u3;
8265     __m256i        v0, v1, v2, v3;
8266     __m256i        out[4];
8267 
8268     int32_t endidx = 3 * num_col;
8269     s0             = _mm256_add_epi32(in[0], in[endidx]);
8270     s3             = _mm256_sub_epi32(in[0], in[endidx]);
8271     endidx -= num_col;
8272     s1 = _mm256_add_epi32(in[num_col], in[endidx]);
8273     s2 = _mm256_sub_epi32(in[num_col], in[endidx]);
8274 
8275     // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit);
8276     u0 = _mm256_mullo_epi32(s0, cospi32);
8277     u1 = _mm256_mullo_epi32(s1, cospi32);
8278     u2 = _mm256_add_epi32(u0, u1);
8279 
8280     u3 = _mm256_add_epi32(u2, rnding);
8281 
8282     u0 = _mm256_srai_epi32(u3, bit);
8283 
8284     // btf_32_sse4_1_type1(cospi48, cospi16, s[23], u[13], bit);
8285     v0 = _mm256_mullo_epi32(s2, cospi48);
8286     v1 = _mm256_mullo_epi32(s3, cospi16);
8287     v2 = _mm256_add_epi32(v0, v1);
8288 
8289     v3 = _mm256_add_epi32(v2, rnding);
8290     u1 = _mm256_srai_epi32(v3, bit);
8291 
8292     // Transpose 4x4 32-bit
8293     v0 = _mm256_unpacklo_epi32(u0, u1);
8294     v1 = _mm256_unpackhi_epi32(u0, u1);
8295 
8296     out[0] = _mm256_unpacklo_epi64(v0, zero);
8297     out[1] = _mm256_unpackhi_epi64(v0, zero);
8298     out[2] = _mm256_unpacklo_epi64(v1, zero);
8299     out[3] = _mm256_unpackhi_epi64(v1, zero);
8300 
8301     output[0] = _mm256_permute2x128_si256(out[0], out[1], 0x20);
8302     output[1] = _mm256_permute2x128_si256(out[2], out[3], 0x20);
8303     output[2] = _mm256_permute2x128_si256(out[0], out[1], 0x31);
8304     output[3] = _mm256_permute2x128_si256(out[2], out[3], 0x31);
8305 }
8306 
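// 4-point forward ADST for the column pass of the 4x8 N2 path: only 2 of the 4
// outputs are computed, followed by the same in-register transpose as the DCT
// variant above.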
8307 static void fadst4x8_col_N2_avx2(__m256i *in, __m256i *output, int32_t bit, const int32_t num_col) {
8308     const int32_t *sinpi  = sinpi_arr(bit);
8309     const __m256i  zero   = _mm256_setzero_si256();
8310     const __m256i  rnding = _mm256_set1_epi32(1 << (bit - 1));
8311     const __m256i  sinpi1 = _mm256_set1_epi32((int32_t)sinpi[1]);
8312     const __m256i  sinpi2 = _mm256_set1_epi32((int32_t)sinpi[2]);
8313     const __m256i  sinpi3 = _mm256_set1_epi32((int32_t)sinpi[3]);
8314     const __m256i  sinpi4 = _mm256_set1_epi32((int32_t)sinpi[4]);
8315     __m256i        s0, s1, s2, s3, s4;
8316     __m256i        x0, x1;
8317     __m256i        u0, u1;
8318     __m256i        v0, v1;
8319     __m256i        out[4];
8320 
8321     int32_t idx = 0 * num_col;
8322     s0          = _mm256_mullo_epi32(in[idx], sinpi1);
8323     u0          = _mm256_add_epi32(in[idx], in[idx + num_col]);
8324     idx += num_col;
8325     s2 = _mm256_mullo_epi32(in[idx], sinpi2);
8326     idx += num_col;
8327     s4 = _mm256_mullo_epi32(in[idx], sinpi3);
8328     idx += num_col;
8329     s3 = _mm256_mullo_epi32(in[idx], sinpi4);
8330     s1 = _mm256_sub_epi32(u0, in[idx]);
8331 
8332     u0 = _mm256_add_epi32(s0, s2);
8333     x0 = _mm256_add_epi32(u0, s3);
8334     x1 = _mm256_mullo_epi32(s1, sinpi3);
8335 
8336     s0 = _mm256_add_epi32(x0, s4);
8337 
8338     u0 = _mm256_add_epi32(s0, rnding);
8339     u0 = _mm256_srai_epi32(u0, bit);
8340 
8341     u1 = _mm256_add_epi32(x1, rnding);
8342     u1 = _mm256_srai_epi32(u1, bit);
8343 
8344     v0 = _mm256_unpacklo_epi32(u0, u1);
8345     v1 = _mm256_unpackhi_epi32(u0, u1);
8346 
8347     out[0] = _mm256_unpacklo_epi64(v0, zero);
8348     out[1] = _mm256_unpackhi_epi64(v0, zero);
8349     out[2] = _mm256_unpacklo_epi64(v1, zero);
8350     out[3] = _mm256_unpackhi_epi64(v1, zero);
8351 
8352     output[0] = _mm256_permute2x128_si256(out[0], out[1], 0x20);
8353     output[1] = _mm256_permute2x128_si256(out[2], out[3], 0x20);
8354     output[2] = _mm256_permute2x128_si256(out[0], out[1], 0x31);
8355     output[3] = _mm256_permute2x128_si256(out[2], out[3], 0x31);
8356 }
8357 
8358 static AOM_FORCE_INLINE void clear_buffer_4x16_N2(__m256i *buff) {
8359     const __m256i zero = _mm256_setzero_si256();
8360     buff[4]            = zero;
8361     buff[5]            = zero;
8362     buff[6]            = zero;
8363     buff[7]            = zero;
8364 }
8365 
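// Identity column transform for the 4x8 N2 path: scales by new_sqrt2 with rounding
// and keeps only the low 64 bits of each 128-bit lane, zeroing the rest.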
8366 static INLINE void fidtx4x8_col_N2_avx2(__m256i *in, __m256i *output, int32_t bit,
8367                                         int32_t col_num) {
8368     (void)bit;
8369     const __m256i zero   = _mm256_setzero_si256();
8370     __m256i       fact   = _mm256_set1_epi32(new_sqrt2);
8371     __m256i       offset = _mm256_set1_epi32(1 << (new_sqrt2_bits - 1));
8372     __m256i       a_low;
8373 
8374     for (int32_t i = 0; i < col_num; i++) {
8375         a_low     = _mm256_mullo_epi32(in[i], fact);
8376         a_low     = _mm256_add_epi32(a_low, offset);
8377         a_low     = _mm256_srai_epi32(a_low, new_sqrt2_bits);
8378         output[i] = _mm256_unpacklo_epi64(a_low, zero);
8379     }
8380 }
8381 
8382 static AOM_FORCE_INLINE void fidtx4x8_N2_perm_avx2(__m256i *in, __m256i *output, int32_t bit) {
8383     (void)bit;
8384     __m256i fact   = _mm256_set1_epi32(2 * new_sqrt2);
8385     __m256i offset = _mm256_set1_epi32(1 << (new_sqrt2_bits - 1));
8386     __m256i a_low;
8387     __m256i out[2];
8388 
8389     a_low  = _mm256_mullo_epi32(in[0], fact);
8390     a_low  = _mm256_add_epi32(a_low, offset);
8391     out[0] = _mm256_srai_epi32(a_low, new_sqrt2_bits);
8392     a_low  = _mm256_mullo_epi32(in[2], fact);
8393     a_low  = _mm256_add_epi32(a_low, offset);
8394     out[1] = _mm256_srai_epi32(a_low, new_sqrt2_bits);
8395 
8396     output[0] = _mm256_permute2x128_si256(out[0], out[1], 0x20);
8397     output[2] = _mm256_permute2x128_si256(out[0], out[1], 0x31);
8398 }
8399 
8400 static INLINE void fidtx4x8_row_N2_avx2(__m256i *in, __m256i *output, int32_t bit) {
8401     (void)bit;
8402     __m256i fact   = _mm256_set1_epi32(new_sqrt2);
8403     __m256i offset = _mm256_set1_epi32(1 << (new_sqrt2_bits - 1));
8404     __m256i a_low;
8405 
8406     for (int32_t i = 0; i < 4; i += 2) {
8407         a_low     = _mm256_mullo_epi32(in[i], fact);
8408         a_low     = _mm256_add_epi32(a_low, offset);
8409         output[i] = _mm256_srai_epi32(a_low, new_sqrt2_bits);
8410     }
8411 }
8412 
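// Row-pass 4-point forward DCT for the N2 path: the input is first re-packed
// across 128-bit lanes, the two retained outputs are computed, and the stage
// rounding shift ('shift') is applied before the in-register transpose.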
8413 static void fdct4x8_row_N2_with_round_avx2(__m256i *input, __m256i *output, int32_t bit,
8414                                            const int32_t num_col, int32_t shift) {
8415     const int32_t *cospi    = cospi_arr(bit);
8416     const __m256i  zero     = _mm256_setzero_si256();
8417     const __m256i  cospi32  = _mm256_set1_epi32(cospi[32]);
8418     const __m256i  cospi48  = _mm256_set1_epi32(cospi[48]);
8419     const __m256i  cospi16  = _mm256_set1_epi32(cospi[16]);
8420     const __m256i  rnding   = _mm256_set1_epi32(1 << (bit - 1));
8421     const __m256i  rounding = _mm256_set1_epi32(1 << (shift - 1));
8422     __m256i        in[4];
8423     __m256i        out[4];
8424     __m256i        s0, s1, s2, s3;
8425     __m256i        u0, u1, u2, u3;
8426     __m256i        v0, v1, v2, v3;
8427     int32_t        endidx = 3 * num_col;
8428 
8429     in[0] = _mm256_permute2x128_si256(input[0], input[2], 0x20);
8430     in[1] = _mm256_permute2x128_si256(input[0], input[2], 0x31);
8431     in[2] = _mm256_permute2x128_si256(input[1], input[3], 0x20);
8432     in[3] = _mm256_permute2x128_si256(input[1], input[3], 0x31);
8433 
8434     s0 = _mm256_add_epi32(in[0], in[endidx]);
8435     s3 = _mm256_sub_epi32(in[0], in[endidx]);
8436     endidx -= num_col;
8437     s1 = _mm256_add_epi32(in[num_col], in[endidx]);
8438     s2 = _mm256_sub_epi32(in[num_col], in[endidx]);
8439 
8440     // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit);
8441     u0 = _mm256_mullo_epi32(s0, cospi32);
8442     u1 = _mm256_mullo_epi32(s1, cospi32);
8443     u2 = _mm256_add_epi32(u0, u1);
8444 
8445     u3 = _mm256_add_epi32(u2, rnding);
8446 
8447     u0 = _mm256_srai_epi32(u3, bit);
8448 
8449     // btf_32_sse4_1_type1(cospi48, cospi16, s[23], u[13], bit);
8450     v0 = _mm256_mullo_epi32(s2, cospi48);
8451     v1 = _mm256_mullo_epi32(s3, cospi16);
8452     v2 = _mm256_add_epi32(v0, v1);
8453 
8454     v3 = _mm256_add_epi32(v2, rnding);
8455     u1 = _mm256_srai_epi32(v3, bit);
8456 
8457     //round
8458     u0 = _mm256_add_epi32(u0, rounding);
8459     u0 = _mm256_srai_epi32(u0, shift);
8460     u1 = _mm256_add_epi32(u1, rounding);
8461     u1 = _mm256_srai_epi32(u1, shift);
8462 
8463     // Transpose 4x4 32-bit
8464     v0 = _mm256_unpacklo_epi32(u0, u1);
8465     v1 = _mm256_unpackhi_epi32(u0, u1);
8466 
8467     out[0] = _mm256_unpacklo_epi64(v0, zero);
8468     out[1] = _mm256_unpackhi_epi64(v0, zero);
8469     out[2] = _mm256_unpacklo_epi64(v1, zero);
8470     out[3] = _mm256_unpackhi_epi64(v1, zero);
8471 
8472     output[0] = _mm256_permute2x128_si256(out[0], out[1], 0x20);
8473     output[1] = _mm256_permute2x128_si256(out[2], out[3], 0x20);
8474     output[2] = _mm256_permute2x128_si256(out[0], out[1], 0x31);
8475     output[3] = _mm256_permute2x128_si256(out[2], out[3], 0x31);
8476 }
8477 
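// Row-pass 4-point forward ADST for the N2 path, with the stage rounding shift
// ('shift') folded in after the transform.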
8478 static void fadst4x8_row_N2_with_round_avx2(__m256i *input, __m256i *output, int32_t bit,
8479                                             const int32_t num_col, int32_t shift) {
8480     const int32_t *sinpi    = sinpi_arr(bit);
8481     const __m256i  zero     = _mm256_setzero_si256();
8482     const __m256i  rnding   = _mm256_set1_epi32(1 << (bit - 1));
8483     const __m256i  rounding = _mm256_set1_epi32(1 << (shift - 1));
8484     const __m256i  sinpi1   = _mm256_set1_epi32((int32_t)sinpi[1]);
8485     const __m256i  sinpi2   = _mm256_set1_epi32((int32_t)sinpi[2]);
8486     const __m256i  sinpi3   = _mm256_set1_epi32((int32_t)sinpi[3]);
8487     const __m256i  sinpi4   = _mm256_set1_epi32((int32_t)sinpi[4]);
8488     __m256i        s0, s1, s2, s3, s4;
8489     __m256i        x0, x1;
8490     __m256i        u0, u1;
8491     __m256i        v0, v1;
8492     __m256i        out[4];
8493     __m256i        in[4];
8494 
8495     int32_t idx = 0 * num_col;
8496 
8497     in[0] = _mm256_permute2x128_si256(input[0], input[2], 0x20);
8498     in[1] = _mm256_permute2x128_si256(input[0], input[2], 0x31);
8499     in[2] = _mm256_permute2x128_si256(input[1], input[3], 0x20);
8500     in[3] = _mm256_permute2x128_si256(input[1], input[3], 0x31);
8501 
8502     s0 = _mm256_mullo_epi32(in[idx], sinpi1);
8503     u0 = _mm256_add_epi32(in[idx], in[idx + num_col]);
8504     idx += num_col;
8505     s2 = _mm256_mullo_epi32(in[idx], sinpi2);
8506     idx += num_col;
8507     s4 = _mm256_mullo_epi32(in[idx], sinpi3);
8508     idx += num_col;
8509     s3 = _mm256_mullo_epi32(in[idx], sinpi4);
8510     s1 = _mm256_sub_epi32(u0, in[idx]);
8511 
8512     u0 = _mm256_add_epi32(s0, s2);
8513     x0 = _mm256_add_epi32(u0, s3);
8514     x1 = _mm256_mullo_epi32(s1, sinpi3);
8515 
8516     s0 = _mm256_add_epi32(x0, s4);
8517 
8518     u0 = _mm256_add_epi32(s0, rnding);
8519     u0 = _mm256_srai_epi32(u0, bit);
8520 
8521     u1 = _mm256_add_epi32(x1, rnding);
8522     u1 = _mm256_srai_epi32(u1, bit);
8523 
8524     //round
8525     u0 = _mm256_add_epi32(u0, rounding);
8526     u0 = _mm256_srai_epi32(u0, shift);
8527     u1 = _mm256_add_epi32(u1, rounding);
8528     u1 = _mm256_srai_epi32(u1, shift);
8529 
8530     v0 = _mm256_unpacklo_epi32(u0, u1);
8531     v1 = _mm256_unpackhi_epi32(u0, u1);
8532 
8533     out[0] = _mm256_unpacklo_epi64(v0, zero);
8534     out[1] = _mm256_unpackhi_epi64(v0, zero);
8535     out[2] = _mm256_unpacklo_epi64(v1, zero);
8536     out[3] = _mm256_unpackhi_epi64(v1, zero);
8537 
8538     output[0] = _mm256_permute2x128_si256(out[0], out[1], 0x20);
8539     output[1] = _mm256_permute2x128_si256(out[2], out[3], 0x20);
8540     output[2] = _mm256_permute2x128_si256(out[0], out[1], 0x31);
8541     output[3] = _mm256_permute2x128_si256(out[2], out[3], 0x31);
8542 }
8543 
8544 static INLINE void fidtx4x8_row_N2_with_round_avx2(__m256i *input, __m256i *output, int32_t bit,
8545                                                    int32_t shift) {
8546     (void)bit;
8547     __m256i       in[2];
8548     __m256i       out[4];
8549     __m256i       fact     = _mm256_set1_epi32(new_sqrt2);
8550     __m256i       offset   = _mm256_set1_epi32(1 << (new_sqrt2_bits - 1));
8551     const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
8552     const __m256i zero     = _mm256_setzero_si256();
8553     __m256i       a_low;
8554     __m256i       v[4];
8555 
8556     in[0] = _mm256_permute2x128_si256(input[0], input[2], 0x20);
8557     in[1] = _mm256_permute2x128_si256(input[0], input[2], 0x31);
8558 
8559     for (int32_t i = 0; i < 2; i++) {
8560         a_low  = _mm256_mullo_epi32(in[i], fact);
8561         a_low  = _mm256_add_epi32(a_low, offset);
8562         a_low  = _mm256_srai_epi32(a_low, new_sqrt2_bits);
8563         a_low  = _mm256_add_epi32(a_low, rounding);
8564         out[i] = _mm256_srai_epi32(a_low, shift);
8565     }
8566 
8567     // Transpose for 4x4
8568     v[0] = _mm256_unpacklo_epi32(out[0], out[1]);
8569     v[1] = _mm256_unpackhi_epi32(out[0], out[1]);
8570 
8571     out[0] = _mm256_unpacklo_epi64(v[0], zero);
8572     out[1] = _mm256_unpackhi_epi64(v[0], zero);
8573     out[2] = _mm256_unpacklo_epi64(v[1], zero);
8574     out[3] = _mm256_unpackhi_epi64(v[1], zero);
8575 
8576     output[0] = _mm256_permute2x128_si256(out[0], out[1], 0x20);
8577     output[1] = _mm256_permute2x128_si256(out[2], out[3], 0x20);
8578     output[2] = _mm256_permute2x128_si256(out[0], out[1], 0x31);
8579     output[3] = _mm256_permute2x128_si256(out[2], out[3], 0x31);
8580 }
8581 
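// 8-point forward DCT over 4 columns (__m128i lanes), N2 variant: only 4 of the 8
// outputs are computed.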
8582 static void fdct4x8_N2_avx2(__m256i *input, __m256i *output, int32_t bit) {
8583     __m128i *      in       = (__m128i *)input;
8584     __m128i *      out      = (__m128i *)output;
8585     const int32_t *cospi    = cospi_arr(bit);
8586     const __m128i  cospi32  = _mm_set1_epi32(cospi[32]);
8587     const __m128i  cospim32 = _mm_set1_epi32(-cospi[32]);
8588     const __m128i  cospi48  = _mm_set1_epi32(cospi[48]);
8589     const __m128i  cospi16  = _mm_set1_epi32(cospi[16]);
8590     const __m128i  cospi56  = _mm_set1_epi32(cospi[56]);
8591     const __m128i  cospi8   = _mm_set1_epi32(cospi[8]);
8592     const __m128i  cospi24  = _mm_set1_epi32(cospi[24]);
8593     const __m128i  cospi40  = _mm_set1_epi32(cospi[40]);
8594     const __m128i  rnding   = _mm_set1_epi32(1 << (bit - 1));
8595     __m128i        u[8], v[8];
8596 
8597     // Even 8 points 0, 2, ..., 14
8598     // stage 0
8599     // stage 1
8600     u[0] = _mm_add_epi32(in[0], in[7]);
8601     v[7] = _mm_sub_epi32(in[0], in[7]); // v[7]
8602     u[1] = _mm_add_epi32(in[1], in[6]);
8603     u[6] = _mm_sub_epi32(in[1], in[6]);
8604     u[2] = _mm_add_epi32(in[2], in[5]);
8605     u[5] = _mm_sub_epi32(in[2], in[5]);
8606     u[3] = _mm_add_epi32(in[3], in[4]);
8607     v[4] = _mm_sub_epi32(in[3], in[4]); // v[4]
8608 
8609     // stage 2
8610     v[0] = _mm_add_epi32(u[0], u[3]);
8611     v[3] = _mm_sub_epi32(u[0], u[3]);
8612     v[1] = _mm_add_epi32(u[1], u[2]);
8613     v[2] = _mm_sub_epi32(u[1], u[2]);
8614 
8615     v[5] = _mm_mullo_epi32(u[5], cospim32);
8616     v[6] = _mm_mullo_epi32(u[6], cospi32);
8617     v[5] = _mm_add_epi32(v[5], v[6]);
8618     v[5] = _mm_add_epi32(v[5], rnding);
8619     v[5] = _mm_srai_epi32(v[5], bit);
8620 
8621     u[0] = _mm_mullo_epi32(u[5], cospi32);
8622     v[6] = _mm_mullo_epi32(u[6], cospim32);
8623     v[6] = _mm_sub_epi32(u[0], v[6]);
8624     v[6] = _mm_add_epi32(v[6], rnding);
8625     v[6] = _mm_srai_epi32(v[6], bit);
8626 
8627     // stage 3
8628     // type 0
8629     v[0]   = _mm_mullo_epi32(v[0], cospi32);
8630     v[1]   = _mm_mullo_epi32(v[1], cospi32);
8631     u[0]   = _mm_add_epi32(v[0], v[1]);
8632     u[0]   = _mm_add_epi32(u[0], rnding);
8633     out[0] = _mm_srai_epi32(u[0], bit);
8634 
8635     // type 1
8636     v[0]   = _mm_mullo_epi32(v[2], cospi48);
8637     v[1]   = _mm_mullo_epi32(v[3], cospi16);
8638     u[2]   = _mm_add_epi32(v[0], v[1]);
8639     u[2]   = _mm_add_epi32(u[2], rnding);
8640     out[2] = _mm_srai_epi32(u[2], bit);
8641 
8642     u[4] = _mm_add_epi32(v[4], v[5]);
8643     u[5] = _mm_sub_epi32(v[4], v[5]);
8644     u[6] = _mm_sub_epi32(v[7], v[6]);
8645     u[7] = _mm_add_epi32(v[7], v[6]);
8646 
8647     // stage 4
8648     // stage 5
8649     v[0]   = _mm_mullo_epi32(u[4], cospi56);
8650     v[1]   = _mm_mullo_epi32(u[7], cospi8);
8651     v[0]   = _mm_add_epi32(v[0], v[1]);
8652     v[0]   = _mm_add_epi32(v[0], rnding);
8653     out[1] = _mm_srai_epi32(v[0], bit); // buf0[4]
8654 
8655     v[0]   = _mm_mullo_epi32(u[5], cospi40);
8656     v[1]   = _mm_mullo_epi32(u[6], cospi24);
8657     v[0]   = _mm_sub_epi32(v[1], v[0]);
8658     v[0]   = _mm_add_epi32(v[0], rnding);
8659     out[3] = _mm_srai_epi32(v[0], bit); // buf0[6]
8660 }
8661 
8662 static AOM_FORCE_INLINE void col_txfm_8x4_N2_rounding(__m256i *in, int32_t shift) {
8663     const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
8664 
8665     in[0] = _mm256_add_epi32(in[0], rounding);
8666     in[1] = _mm256_add_epi32(in[1], rounding);
8667 
8668     in[0] = _mm256_srai_epi32(in[0], shift);
8669     in[1] = _mm256_srai_epi32(in[1], shift);
8670 }
8671 
8672 static AOM_FORCE_INLINE void write_buffer_4x8_N2(const __m256i *res, int32_t *output) {
8673     const __m256i zero = _mm256_setzero_si256();
8674     _mm256_storeu_si256((__m256i *)(output + 0 * 8), res[0]);
8675     _mm256_storeu_si256((__m256i *)(output + 1 * 8), res[1]);
8676     _mm256_storeu_si256((__m256i *)(output + 2 * 8), zero);
8677     _mm256_storeu_si256((__m256i *)(output + 3 * 8), zero);
8678 }
8679 
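// 8-point forward ADST over col_num column groups (__m128i lanes), N2 variant:
// only 4 of the 8 outputs per column group are computed.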
8680 static void fadst8x4_N2_avx2(__m256i *input, __m256i *output, int32_t bit, const int32_t col_num) {
8681     __m128i *      in       = (__m128i *)input;
8682     __m128i *      out      = (__m128i *)output;
8683     const int32_t *cospi    = cospi_arr(bit);
8684     const __m128i  cospi32  = _mm_set1_epi32(cospi[32]);
8685     const __m128i  cospi16  = _mm_set1_epi32(cospi[16]);
8686     const __m128i  cospim16 = _mm_set1_epi32(-cospi[16]);
8687     const __m128i  cospi48  = _mm_set1_epi32(cospi[48]);
8688     const __m128i  cospim48 = _mm_set1_epi32(-cospi[48]);
8689     const __m128i  cospim4  = _mm_set1_epi32(-cospi[4]);
8690     const __m128i  cospi60  = _mm_set1_epi32(cospi[60]);
8691     const __m128i  cospim20 = _mm_set1_epi32(-cospi[20]);
8692     const __m128i  cospi44  = _mm_set1_epi32(cospi[44]);
8693     const __m128i  cospi28  = _mm_set1_epi32(cospi[28]);
8694     const __m128i  cospi36  = _mm_set1_epi32(cospi[36]);
8695     const __m128i  cospi52  = _mm_set1_epi32(cospi[52]);
8696     const __m128i  cospi12  = _mm_set1_epi32(cospi[12]);
8697     const __m128i  rnding   = _mm_set1_epi32(1 << (bit - 1));
8698     const __m128i  zero     = _mm_setzero_si128();
8699     __m128i        u0, u1, u2, u3, u4, u5, u6, u7;
8700     __m128i        v0, v1, v2, v3, v4, v5, v6, v7;
8701     __m128i        x, y;
8702     int32_t        col;
8703 
8704     // Note:
8705     //  Even column: 0, 2, ..., 14
8706     //  Odd column: 1, 3, ..., 15
8707     //  one even column plus one odd column constructs one row (8 coeffs)
8708         //  in total we have 8 rows (8x8).
8709     for (col = 0; col < col_num; ++col) {
8710         // stage 0
8711         // stage 1
8712         u0 = in[col_num * 0 + col];
8713         u1 = _mm_sub_epi32(zero, in[col_num * 7 + col]);
8714         u2 = _mm_sub_epi32(zero, in[col_num * 3 + col]);
8715         u3 = in[col_num * 4 + col];
8716         u4 = _mm_sub_epi32(zero, in[col_num * 1 + col]);
8717         u5 = in[col_num * 6 + col];
8718         u6 = in[col_num * 2 + col];
8719         u7 = _mm_sub_epi32(zero, in[col_num * 5 + col]);
8720 
8721         // stage 2
8722         v0 = u0;
8723         v1 = u1;
8724 
8725         x  = _mm_mullo_epi32(u2, cospi32);
8726         y  = _mm_mullo_epi32(u3, cospi32);
8727         v2 = _mm_add_epi32(x, y);
8728         v2 = _mm_add_epi32(v2, rnding);
8729         v2 = _mm_srai_epi32(v2, bit);
8730 
8731         v3 = _mm_sub_epi32(x, y);
8732         v3 = _mm_add_epi32(v3, rnding);
8733         v3 = _mm_srai_epi32(v3, bit);
8734 
8735         v4 = u4;
8736         v5 = u5;
8737 
8738         x  = _mm_mullo_epi32(u6, cospi32);
8739         y  = _mm_mullo_epi32(u7, cospi32);
8740         v6 = _mm_add_epi32(x, y);
8741         v6 = _mm_add_epi32(v6, rnding);
8742         v6 = _mm_srai_epi32(v6, bit);
8743 
8744         v7 = _mm_sub_epi32(x, y);
8745         v7 = _mm_add_epi32(v7, rnding);
8746         v7 = _mm_srai_epi32(v7, bit);
8747 
8748         // stage 3
8749         u0 = _mm_add_epi32(v0, v2);
8750         u1 = _mm_add_epi32(v1, v3);
8751         u2 = _mm_sub_epi32(v0, v2);
8752         u3 = _mm_sub_epi32(v1, v3);
8753         u4 = _mm_add_epi32(v4, v6);
8754         u5 = _mm_add_epi32(v5, v7);
8755         u6 = _mm_sub_epi32(v4, v6);
8756         u7 = _mm_sub_epi32(v5, v7);
8757 
8758         // stage 4
8759         v0 = u0;
8760         v1 = u1;
8761         v2 = u2;
8762         v3 = u3;
8763 
8764         x  = _mm_mullo_epi32(u4, cospi16);
8765         y  = _mm_mullo_epi32(u5, cospi48);
8766         v4 = _mm_add_epi32(x, y);
8767         v4 = _mm_add_epi32(v4, rnding);
8768         v4 = _mm_srai_epi32(v4, bit);
8769 
8770         x  = _mm_mullo_epi32(u4, cospi48);
8771         y  = _mm_mullo_epi32(u5, cospim16);
8772         v5 = _mm_add_epi32(x, y);
8773         v5 = _mm_add_epi32(v5, rnding);
8774         v5 = _mm_srai_epi32(v5, bit);
8775 
8776         x  = _mm_mullo_epi32(u6, cospim48);
8777         y  = _mm_mullo_epi32(u7, cospi16);
8778         v6 = _mm_add_epi32(x, y);
8779         v6 = _mm_add_epi32(v6, rnding);
8780         v6 = _mm_srai_epi32(v6, bit);
8781 
8782         x  = _mm_mullo_epi32(u6, cospi16);
8783         y  = _mm_mullo_epi32(u7, cospi48);
8784         v7 = _mm_add_epi32(x, y);
8785         v7 = _mm_add_epi32(v7, rnding);
8786         v7 = _mm_srai_epi32(v7, bit);
8787 
8788         // stage 5
8789         u0 = _mm_add_epi32(v0, v4);
8790         u1 = _mm_add_epi32(v1, v5);
8791         u2 = _mm_add_epi32(v2, v6);
8792         u3 = _mm_add_epi32(v3, v7);
8793         u4 = _mm_sub_epi32(v0, v4);
8794         u5 = _mm_sub_epi32(v1, v5);
8795         u6 = _mm_sub_epi32(v2, v6);
8796         u7 = _mm_sub_epi32(v3, v7);
8797 
8798         // stage 6
8799         x                      = _mm_mullo_epi32(u0, cospi60);
8800         y                      = _mm_mullo_epi32(u1, cospim4);
8801         v1                     = _mm_add_epi32(x, y);
8802         v1                     = _mm_add_epi32(v1, rnding);
8803         out[col_num * 0 + col] = _mm_srai_epi32(v1, bit);
8804 
8805         x                      = _mm_mullo_epi32(u2, cospi44);
8806         y                      = _mm_mullo_epi32(u3, cospim20);
8807         v3                     = _mm_add_epi32(x, y);
8808         v3                     = _mm_add_epi32(v3, rnding);
8809         out[col_num * 2 + col] = _mm_srai_epi32(v3, bit);
8810 
8811         x                      = _mm_mullo_epi32(u4, cospi36);
8812         y                      = _mm_mullo_epi32(u5, cospi28);
8813         v4                     = _mm_add_epi32(x, y);
8814         v4                     = _mm_add_epi32(v4, rnding);
8815         out[col_num * 3 + col] = _mm_srai_epi32(v4, bit);
8816 
8817         x                      = _mm_mullo_epi32(u6, cospi52);
8818         y                      = _mm_mullo_epi32(u7, cospi12);
8819         v6                     = _mm_add_epi32(x, y);
8820         v6                     = _mm_add_epi32(v6, rnding);
8821         out[col_num * 1 + col] = _mm_srai_epi32(v6, bit);
8822     }
8823 }
8824 
8825 static AOM_FORCE_INLINE void fidtx8x4_N2_avx2(__m256i *in, __m256i *out, int32_t bit) {
8826     (void)bit;
8827 
8828     out[0] = _mm256_add_epi32(in[0], in[0]);
8829     out[1] = _mm256_add_epi32(in[1], in[1]);
8830 }
8831 
8832 static AOM_FORCE_INLINE void write_buffer_16x8_N2_avx2(const __m256i *res, __m256i *out) {
8833     out[0] = res[0];
8834     out[2] = res[1];
8835     out[4] = res[2];
8836     out[6] = res[3];
8837 }
8838 
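// 2-D 8x8 forward transform, N2 variant: only the top-left 4x4 quadrant of the
// coefficients is computed; the N2 write helpers zero the remaining positions.
// Minimal usage sketch (hypothetical buffers, assuming an 8-bit residual block):
//   int16_t residual[8 * 8]; // source residual, stride 8
//   int32_t coeff[8 * 8];    // output; only the top-left 4x4 is non-zero
//   svt_av1_fwd_txfm2d_8x8_N2_avx2(residual, coeff, 8 /*stride*/, DCT_DCT, 8 /*bd*/);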
8839 void svt_av1_fwd_txfm2d_8x8_N2_avx2(int16_t *input, int32_t *coeff, uint32_t stride, TxType tx_type,
8840                                     uint8_t bd) {
8841     __m256i       in[8], out[8];
8842     const int8_t *shift   = fwd_txfm_shift_ls[TX_8X8];
8843     const int32_t txw_idx = get_txw_idx(TX_8X8);
8844     const int32_t txh_idx = get_txh_idx(TX_8X8);
8845 
8846     switch (tx_type) {
8847     case DCT_DCT:
8848         load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
8849         fdct8x8_N2_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
8850         col_txfm_8x8_N2_rounding(out, -shift[1]);
8851         transpose_8x8_half_avx2(out, in);
8852         fdct8x8_N2_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
8853         transpose_8x8_N2_avx2(out, in);
8854         write_buffer_8x8_N2(in, coeff);
8855         break;
8856     case ADST_DCT:
8857         load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
8858         fadst8x8_N2_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
8859         col_txfm_8x8_N2_rounding(out, -shift[1]);
8860         transpose_8x8_half_avx2(out, in);
8861         fdct8x8_N2_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
8862         transpose_8x8_N2_avx2(out, in);
8863         write_buffer_8x8_N2(in, coeff);
8864         break;
8865     case DCT_ADST:
8866         load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
8867         fdct8x8_N2_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
8868         col_txfm_8x8_N2_rounding(out, -shift[1]);
8869         transpose_8x8_half_avx2(out, in);
8870         fadst8x8_N2_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
8871         transpose_8x8_N2_avx2(out, in);
8872         write_buffer_8x8_N2(in, coeff);
8873         break;
8874     case ADST_ADST:
8875         load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
8876         fadst8x8_N2_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
8877         col_txfm_8x8_N2_rounding(out, -shift[1]);
8878         transpose_8x8_half_avx2(out, in);
8879         fadst8x8_N2_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
8880         transpose_8x8_N2_avx2(out, in);
8881         write_buffer_8x8_N2(in, coeff);
8882         break;
8883     case FLIPADST_DCT:
8884         load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
8885         fadst8x8_N2_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
8886         col_txfm_8x8_N2_rounding(out, -shift[1]);
8887         transpose_8x8_half_avx2(out, in);
8888         fdct8x8_N2_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
8889         transpose_8x8_N2_avx2(out, in);
8890         write_buffer_8x8_N2(in, coeff);
8891         break;
8892     case DCT_FLIPADST:
8893         load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
8894         fdct8x8_N2_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
8895         col_txfm_8x8_N2_rounding(out, -shift[1]);
8896         transpose_8x8_half_avx2(out, in);
8897         fadst8x8_N2_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
8898         transpose_8x8_N2_avx2(out, in);
8899         write_buffer_8x8_N2(in, coeff);
8900         break;
8901     case FLIPADST_FLIPADST:
8902         load_buffer_8x8(input, in, stride, 1, 1, shift[0]);
8903         fadst8x8_N2_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
8904         col_txfm_8x8_N2_rounding(out, -shift[1]);
8905         transpose_8x8_half_avx2(out, in);
8906         fadst8x8_N2_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
8907         transpose_8x8_N2_avx2(out, in);
8908         write_buffer_8x8_N2(in, coeff);
8909         break;
8910     case ADST_FLIPADST:
8911         load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
8912         fadst8x8_N2_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
8913         col_txfm_8x8_N2_rounding(out, -shift[1]);
8914         transpose_8x8_half_avx2(out, in);
8915         fadst8x8_N2_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
8916         transpose_8x8_N2_avx2(out, in);
8917         write_buffer_8x8_N2(in, coeff);
8918         break;
8919     case FLIPADST_ADST:
8920         load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
8921         fadst8x8_N2_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
8922         col_txfm_8x8_N2_rounding(out, -shift[1]);
8923         transpose_8x8_half_avx2(out, in);
8924         fadst8x8_N2_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
8925         transpose_8x8_N2_avx2(out, in);
8926         write_buffer_8x8_N2(in, coeff);
8927         break;
8928     case IDTX:
8929         load_buffer_4x8_in_8x8(input, in, stride, 0, 0, shift[0], 1);
8930         fidtx8x8_N2_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
8931         col_txfm_8x8_N2_rounding(out, -shift[1]);
8932         fidtx8x8_N2_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
8933         write_buffer_8x8_N2(out, coeff);
8934         break;
8935     case V_DCT:
8936         load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
8937         fdct8x8_N2_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
8938         col_txfm_8x8_N2_rounding(out, -shift[1]);
8939         fidtx8x8_N2_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
8940         write_buffer_8x8_N2(out, coeff);
8941         break;
8942     case H_DCT:
8943         load_buffer_4x8_in_8x8(input, in, stride, 0, 0, shift[0], 1);
8944         fidtx8x8_N2_avx2(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
8945         col_txfm_8x8_N2_rounding(in, -shift[1]);
8946         transpose_8x8_half_avx2(in, out);
8947         fdct8x8_N2_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
8948         transpose_8x8_N2_avx2(in, out);
8949         write_buffer_8x8_N2(out, coeff);
8950         break;
8951     case V_ADST:
8952         load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
8953         fadst8x8_N2_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
8954         col_txfm_8x8_N2_rounding(out, -shift[1]);
8955         fidtx8x8_N2_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
8956         write_buffer_8x8_N2(out, coeff);
8957         break;
8958     case H_ADST:
8959         load_buffer_4x8_in_8x8(input, in, stride, 0, 0, shift[0], 1);
8960         fidtx8x8_N2_avx2(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
8961         col_txfm_8x8_N2_rounding(in, -shift[1]);
8962         transpose_8x8_half_avx2(in, out);
8963         fadst8x8_N2_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
8964         transpose_8x8_N2_avx2(in, out);
8965         write_buffer_8x8_N2(out, coeff);
8966         break;
8967     case V_FLIPADST:
8968         load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
8969         fadst8x8_N2_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
8970         col_txfm_8x8_N2_rounding(out, -shift[1]);
8971         fidtx8x8_N2_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
8972         write_buffer_8x8_N2(out, coeff);
8973         break;
8974     case H_FLIPADST:
8975         load_buffer_4x8_in_8x8(input, in, stride, 0, 1, shift[0], 1);
8976         fidtx8x8_N2_avx2(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
8977         col_txfm_8x8_N2_rounding(in, -shift[1]);
8978         transpose_8x8_half_avx2(in, out);
8979         fadst8x8_N2_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
8980         transpose_8x8_N2_avx2(in, out);
8981         write_buffer_8x8_N2(out, coeff);
8982         break;
8983     default: assert(0);
8984     }
8985     (void)bd;
8986 }
8987 
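// 2-D 16x16 forward transform, N2 variant: only the top-left 8x8 quadrant of the
// coefficients is computed. Each tx_type selects its 1-D column/row transforms and
// the matching half-block load, rounding, and transpose helpers.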
8988 void svt_av1_fwd_txfm2d_16x16_N2_avx2(int16_t *input, int32_t *coeff, uint32_t stride,
8989                                       TxType tx_type, uint8_t bd) {
8990     __m256i       in[32], out[32];
8991     const int8_t *shift   = fwd_txfm_shift_ls[TX_16X16];
8992     const int32_t txw_idx = get_txw_idx(TX_16X16);
8993     const int32_t txh_idx = get_txh_idx(TX_16X16);
8994     const int32_t col_num = 2;
8995     switch (tx_type) {
8996     case IDTX:
8997         load_buffer_16x16_N2_half(input, in, stride, 0, 0, shift[0]);
8998         fidtx16x16_N2_row_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
8999         col_txfm_16x16_N2_half_rounding(out, -shift[1]);
9000         fidtx16x16_N2_row_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
9001         write_buffer_16x16_N2(out, coeff);
9002         break;
9003     case DCT_DCT:
9004         load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
9005         fdct16x16_N2_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num, col_num);
9006         col_txfm_16x16_N2_rounding(out, -shift[1]);
9007         transpose_8x8_in_16x16_avx2(out, in); //top-left -> top-left
9008         transpose_8x8_in_16x16_avx2(out + 1, in + 16); //top-right ->bottom-left
9009         fdct16x16_N2_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num, col_num / 2);
9010         transpose_8x8_in_16x16_avx2(out, in);
9011         write_buffer_16x16_N2(in, coeff);
9012         break;
9013     case ADST_DCT:
9014         load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
9015         fadst16x16_N2_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num, col_num);
9016         col_txfm_16x16_N2_rounding(out, -shift[1]);
9017         transpose_8x8_in_16x16_avx2(out, in); //top-left -> top-left
9018         transpose_8x8_in_16x16_avx2(out + 1, in + 16); //top-right ->bottom-left
9019         fdct16x16_N2_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num, col_num / 2);
9020         transpose_8x8_in_16x16_avx2(out, in);
9021         write_buffer_16x16_N2(in, coeff);
9022         break;
9023     case DCT_ADST:
9024         load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
9025         fdct16x16_N2_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num, col_num);
9026         col_txfm_16x16_N2_rounding(out, -shift[1]);
9027         transpose_8x8_in_16x16_avx2(out, in); //top-left -> top-left
9028         transpose_8x8_in_16x16_avx2(out + 1, in + 16); //top-right ->bottom-left
9029         fadst16x16_N2_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num, col_num);
9030         transpose_8x8_in_16x16_avx2(out, in);
9031         write_buffer_16x16_N2(in, coeff);
9032         break;
9033     case ADST_ADST:
9034         load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
9035         fadst16x16_N2_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num, col_num);
9036         col_txfm_16x16_N2_rounding(out, -shift[1]);
9037         transpose_8x8_in_16x16_avx2(out, in); //top-left -> top-left
9038         transpose_8x8_in_16x16_avx2(out + 1, in + 16); //top-right ->bottom-left
9039         fadst16x16_N2_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num, col_num / 2);
9040         transpose_8x8_in_16x16_avx2(out, in);
9041         write_buffer_16x16_N2(in, coeff);
9042         break;
9043     case DCT_FLIPADST:
9044         load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
9045         fdct16x16_N2_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num, col_num);
9046         col_txfm_16x16_N2_rounding(out, -shift[1]);
9047         transpose_8x8_in_16x16_avx2(out, in); //top-left -> top-left
9048         transpose_8x8_in_16x16_avx2(out + 1, in + 16); //top-right ->bottom-left
9049         fadst16x16_N2_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num, col_num / 2);
9050         transpose_8x8_in_16x16_avx2(out, in);
9051         write_buffer_16x16_N2(in, coeff);
9052         break;
9053     case FLIPADST_DCT:
9054         load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
9055         fadst16x16_N2_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num, col_num);
9056         col_txfm_16x16_N2_rounding(out, -shift[1]);
9057         transpose_8x8_in_16x16_avx2(out, in); //top-left -> top-left
9058         transpose_8x8_in_16x16_avx2(out + 1, in + 16); //top-right ->bottom-left
9059         fdct16x16_N2_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num, col_num / 2);
9060         transpose_8x8_in_16x16_avx2(out, in);
9061         write_buffer_16x16_N2(in, coeff);
9062         break;
9063     case FLIPADST_FLIPADST:
9064         load_buffer_16x16(input, in, stride, 1, 1, shift[0]);
9065         fadst16x16_N2_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num, col_num);
9066         col_txfm_16x16_N2_rounding(out, -shift[1]);
9067         transpose_8x8_in_16x16_avx2(out, in); //top-left -> top-left
9068         transpose_8x8_in_16x16_avx2(out + 1, in + 16); //top-right ->bottom-left
9069         fadst16x16_N2_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num, col_num / 2);
9070         transpose_8x8_in_16x16_avx2(out, in);
9071         write_buffer_16x16_N2(in, coeff);
9072         break;
9073     case ADST_FLIPADST:
9074         load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
9075         fadst16x16_N2_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num, col_num);
9076         col_txfm_16x16_N2_rounding(out, -shift[1]);
9077         transpose_8x8_in_16x16_avx2(out, in); //top-left -> top-left
9078         transpose_8x8_in_16x16_avx2(out + 1, in + 16); //top-right ->bottom-left
9079         fadst16x16_N2_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num, col_num / 2);
9080         transpose_8x8_in_16x16_avx2(out, in);
9081         write_buffer_16x16_N2(in, coeff);
9082         break;
9083     case FLIPADST_ADST:
9084         load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
9085         fadst16x16_N2_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num, col_num);
9086         col_txfm_16x16_N2_rounding(out, -shift[1]);
9087         transpose_8x8_in_16x16_avx2(out, in); //top-left -> top-left
9088         transpose_8x8_in_16x16_avx2(out + 1, in + 16); //top-right ->bottom-left
9089         fadst16x16_N2_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num, col_num / 2);
9090         transpose_8x8_in_16x16_avx2(out, in);
9091         write_buffer_16x16_N2(in, coeff);
9092         break;
9093     case V_DCT:
9094         load_buffer_16x16_N2(input, in, stride, 0, 0, shift[0]);
9095         fdct16x16_N2_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num, col_num / 2);
9096         col_txfm_16x16_N2_half_rounding(out, -shift[1]);
9097         fidtx16x16_N2_row_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
9098         write_buffer_16x16_N2(out, coeff);
9099         break;
9100     case H_DCT:
9101         load_buffer_16x16_N2_H(input, in, stride, 0, 0, shift[0]);
9102         fidtx8xn_N2_col_avx2(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 16);
9103         col_txfm_16x16_N2_rounding(in, -shift[1]);
9104         transpose_8x8_in_16x16_avx2(in, out); //top-left -> top-left
9105         transpose_8x8_in_16x16_avx2(in + 1, out + 16); //top-right ->bottom-left
9106         fdct16x16_N2_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], col_num, col_num / 2);
9107         transpose_8x8_in_16x16_avx2(in, out);
9108         write_buffer_16x16_N2(out, coeff);
9109         break;
9110     case V_ADST:
9111         load_buffer_16x16_N2(input, in, stride, 0, 0, shift[0]);
9112         fadst16x16_N2_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num, col_num / 2);
9113         col_txfm_16x16_N2_half_rounding(out, -shift[1]);
9114         fidtx16x16_N2_row_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
9115         write_buffer_16x16_N2(out, coeff);
9116         break;
9117     case H_ADST:
9118         load_buffer_16x16_N2_H(input, in, stride, 0, 0, shift[0]);
9119         fidtx8xn_N2_col_avx2(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 16);
9120         col_txfm_16x16_N2_rounding(in, -shift[1]);
9121         transpose_8x8_in_16x16_avx2(in, out); //top-left -> top-left
9122         transpose_8x8_in_16x16_avx2(in + 1, out + 16); //top-right ->bottom-left
9123         fadst16x16_N2_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], col_num, col_num / 2);
9124         transpose_8x8_in_16x16_avx2(in, out);
9125         write_buffer_16x16_N2(out, coeff);
9126         break;
9127     case V_FLIPADST:
9128         load_buffer_16x16_N2(input, in, stride, 1, 0, shift[0]);
9129         fadst16x16_N2_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num, col_num / 2);
9130         col_txfm_16x16_N2_half_rounding(out, -shift[1]);
9131         fidtx16x16_N2_row_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
9132         write_buffer_16x16_N2(out, coeff);
9133         break;
9134     case H_FLIPADST:
9135         load_buffer_16x16_N2_H(input, in, stride, 0, 1, shift[0]);
9136         fidtx8xn_N2_col_avx2(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 16);
9137         col_txfm_16x16_N2_rounding(in, -shift[1]);
9138         transpose_8x8_in_16x16_avx2(in, out); //top-left -> top-left
9139         transpose_8x8_in_16x16_avx2(in + 1, out + 16); //top-right ->bottom-left
9140         fadst16x16_N2_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], col_num, col_num / 2);
9141         transpose_8x8_in_16x16_avx2(in, out);
9142         write_buffer_16x16_N2(out, coeff);
9143         break;
9144     default: assert(0);
9145     }
9146     (void)bd;
9147 }
9148 
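/* N2 (half-size) variant of the 2D forward 64x64 transform: only the top-left
 * 32x32 block of low-frequency coefficients is computed; the rest of the
 * output is zeroed with clear_buffer_wxh_N2(). Only DCT_DCT and IDTX reach
 * this path (any other tx_type asserts). */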
9149 void svt_av1_fwd_txfm2d_64x64_N2_avx2(int16_t *input, int32_t *output, uint32_t stride,
9150                                       TxType tx_type, uint8_t bd) {
9151     (void)bd;
9152     __m256i       in[512];
9153     __m256i *     out     = (__m256i *)output;
9154     const int32_t txw_idx = tx_size_wide_log2[TX_64X64] - tx_size_wide_log2[0];
9155     const int32_t txh_idx = tx_size_high_log2[TX_64X64] - tx_size_high_log2[0];
9156     const int8_t *shift   = fwd_txfm_shift_ls[TX_64X64];
9157 
9158     switch (tx_type) {
9159     case IDTX:
9160         load_buffer_32x32_in_64x64_avx2(input, stride, out);
9161         fidtx64x64_N2_avx2(out, in);
9162         av1_round_shift_array_64_N2_avx2(in, in, 512 / 2, -shift[1]);
9163         /*row wise transform*/
9164         fidtx64x64_N2_avx2(in, out);
9165         av1_round_shift_array_64_N2_avx2(out, out, 512 / 2, -shift[2]);
9166         clear_buffer_wxh_N2(out, 8, 64);
9167         break;
9168     case DCT_DCT:
9169         load_buffer_64x64_avx2(input, stride, out);
9170         av1_fdct64_new_N2_avx2(out, in, fwd_cos_bit_col[txw_idx][txh_idx], 64, 8);
9171         av1_round_shift_array_32_avx2(in, out, 512 / 2, -shift[1]);
9172         transpose_32x32_in_64x64_avx2(out, in); //top-left
9173         transpose_32x32_in_64x64_avx2(out + 4, in + 256); //top-right -> bottom left
9174         /*row wise transform*/
9175         av1_fdct64_new_N2_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 32, 8);
9176         av1_round_shift_array_64_N2_avx2(out, in, 512 / 2, -shift[2]);
9177         transpose_32x32_in_64x64_avx2(in, out); //top-left
9178         clear_buffer_wxh_N2(out, 8, 64);
9179         break;
9180     default: assert(0);
9181     }
9182 }
9183 
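/* N2 variant of the 2D forward 32x32 transform: only the top-left 16x16 block
 * of coefficients is computed and kept; DCT_DCT, IDTX, V_DCT and H_DCT are the
 * tx_types handled here. */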
9184 void svt_av1_fwd_txfm2d_32x32_N2_avx2(int16_t *input, int32_t *output, uint32_t stride,
9185                                       TxType tx_type, uint8_t bd) {
9186     const int8_t *shift           = fwd_txfm_shift_ls[TX_32X32];
9187     const int32_t txw_idx         = tx_size_wide_log2[TX_32X32] - tx_size_wide_log2[0];
9188     const int32_t txh_idx         = tx_size_high_log2[TX_32X32] - tx_size_high_log2[0];
9189     const int8_t  cos_bit_col     = fwd_cos_bit_col[txw_idx][txh_idx];
9190     const int8_t  cos_bit_row     = fwd_cos_bit_row[txw_idx][txh_idx];
9191     const int32_t txfm2d_size_256 = 32 * 32 / 8;
9192     __m256i       buf_256[128];
9193     __m256i *     out_256 = (__m256i *)output;
9194     (void)bd;
9195 
9196     switch (tx_type) {
9197     case IDTX:
9198         load_buffer_32x16_N2_avx2(input, buf_256, stride);
9199         av1_round_shift_array_32_avx2(buf_256, out_256, txfm2d_size_256 / 2, -shift[0]);
9200         fidtx32x32_N2_row_avx2(out_256, buf_256, cos_bit_col);
9201         av1_round_shift_array_32_avx2(buf_256, out_256, txfm2d_size_256 / 2, -shift[1]);
9202         fidtx32x32_N2_row_avx2(out_256, buf_256, cos_bit_row);
9203         av1_round_shift_array_32_N2_avx2(buf_256, out_256, txfm2d_size_256 / 2, -shift[2]);
9204         clear_buffer_wxh_N2(out_256, 4, 32);
9205         break;
9206     case DCT_DCT:
9207         load_buffer_32x32_avx2(input, buf_256, stride);
9208         av1_round_shift_array_32_avx2(buf_256, out_256, txfm2d_size_256, -shift[0]);
9209         fdct32x32_N2_col_avx2(out_256, buf_256, cos_bit_col);
9210         av1_round_shift_array_32_avx2(buf_256, out_256, txfm2d_size_256 / 2, -shift[1]);
9211         transpose_16x16_in_32x32_avx2(out_256, buf_256); //top-left -> top-left
9212         transpose_16x16_in_32x32_avx2(out_256 + 2, buf_256 + 64); //top-right ->bottom-left
9213         fdct32x32_N2_row_avx2(buf_256, out_256, cos_bit_row);
9214         av1_round_shift_array_32_N2_avx2(out_256, out_256, txfm2d_size_256 / 2, -shift[2]);
9215         transpose_16x16_in_32x32_avx2(out_256, out_256); //transpose nonzero output
9216         clear_buffer_wxh_N2(out_256, 4, 32);
9217         break;
9218     case V_DCT:
9219         load_buffer_32x32_avx2(input, buf_256, stride);
9220         av1_round_shift_array_32_avx2(buf_256, out_256, txfm2d_size_256, -shift[0]);
9221         fdct32x32_N2_row_avx2(out_256, buf_256, cos_bit_col); //col
9222         av1_round_shift_array_32_N2_avx2(buf_256, out_256, txfm2d_size_256 / 2, -shift[1]);
9223         fidtx32x32_N2_row_avx2(out_256, buf_256, cos_bit_row);
9224         av1_round_shift_array_32_N2_avx2(buf_256, out_256, txfm2d_size_256 / 2, -shift[2]);
9225         clear_buffer_wxh_N2(out_256, 4, 32);
9226         break;
9227     case H_DCT:
9228         load_buffer_32x16_avx2(input, buf_256, stride);
9229         av1_round_shift_array_32_avx2(buf_256, out_256, txfm2d_size_256 / 2, -shift[0]);
9230         fidtx32x32_N2_col_avx2(out_256, buf_256, cos_bit_col);
9231         av1_round_shift_array_32_avx2(buf_256, out_256, txfm2d_size_256 / 2, -shift[1]);
9232         transpose_16x16_in_32x32_avx2(out_256, buf_256); //top-left -> top-left
9233         transpose_16x16_in_32x32_avx2(out_256 + 2, buf_256 + 64); //top-right ->bottom-left
9234         fdct32x32_N2_row_avx2(buf_256, out_256, cos_bit_row);
9235         av1_round_shift_array_32_N2_avx2(out_256, out_256, txfm2d_size_256 / 2, -shift[2]);
9236         transpose_16x16_in_32x32_avx2(out_256, out_256); //transpose nonzero output
9237         clear_buffer_wxh_N2(out_256, 4, 32);
9238         break;
9239     default: assert(0);
9240     }
9241 }
9242 
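/* N2 variant of the rectangular 16x32 forward transform: the top-left 8x16
 * block of coefficients is computed, scaled by new_sqrt2 for the rectangular
 * size, and the remaining output is cleared. Supports DCT_DCT and IDTX. */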
9243 void svt_av1_fwd_txfm2d_16x32_N2_avx2(int16_t *input, int32_t *output, uint32_t stride,
9244                                       TxType tx_type, uint8_t bd) {
9245     __m256i       in[64];
9246     __m256i *     outcoef256    = (__m256i *)output;
9247     const int8_t *shift         = fwd_txfm_shift_ls[TX_16X32];
9248     const int32_t txw_idx       = get_txw_idx(TX_16X32);
9249     const int32_t txh_idx       = get_txh_idx(TX_16X32);
9250     int8_t        bitcol        = fwd_cos_bit_col[txw_idx][txh_idx];
9251     int8_t        bitrow        = fwd_cos_bit_row[txw_idx][txh_idx];
9252     const int32_t txfm_size_col = tx_size_wide[TX_16X32];
9253     const int32_t txfm_size_row = tx_size_high[TX_16X32];
9254     const int32_t num_row       = txfm_size_row >> 3;
9255     const int32_t num_col       = txfm_size_col >> 3;
9256 
9257     switch (tx_type) {
9258     case IDTX:
9259         load_buffer_16x16_N2(input, in, stride, 0, 0, shift[0]);
9260         av1_idtx16x32_N2_avx2(in, in);
9261         col_txfm_16x16_N2_half_rounding(&in[0], -shift[1]);
9262         col_txfm_16x16_N2_half_rounding(&in[16], -shift[1]);
9263         fidtx16x16_N2_row_avx2(in, outcoef256, bitrow, num_row);
9264         av1_round_shift_rect_array_wxh_avx2(
9265             outcoef256, outcoef256, -shift[2], new_sqrt2, num_col, txfm_size_row);
9266         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9267         break;
9268     case DCT_DCT:
9269         load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
9270         load_buffer_16x16(input + 16 * stride, in + 32, stride, 0, 0, shift[0]);
9271         for (int32_t i = 0; i < num_col; i++)
9272             av1_fdct32_new_N2_avx2((in + i), (in + i), bitcol, 8, num_col);
9273         col_txfm_16x16_rounding(&in[0], -shift[1]);
9274         transpose_8nx8n_N2_half(in, outcoef256, txfm_size_col, txfm_size_row);
9275         fdct16x16_N2_avx2(outcoef256, in, bitrow, num_row, num_row / 2);
9276         transpose_8nx8n_N2_quad(in, outcoef256, txfm_size_row, txfm_size_col);
9277         av1_round_shift_rect_array_wxh_avx2(
9278             outcoef256, outcoef256, -shift[2], new_sqrt2, num_col, txfm_size_row);
9279         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9280         break;
9281     default: assert(0);
9282     }
9283 
9284     (void)bd;
9285 }
9286 
9287 /* call this function only for DCT_DCT, IDTX */
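/* N2 variant: only the top-left 16x8 block of coefficients is computed; the
 * rectangular new_sqrt2 scaling is applied before the unused area is cleared. */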
9288 void svt_av1_fwd_txfm2d_32x16_N2_avx2(int16_t *input, int32_t *output, uint32_t stride,
9289                                       TxType tx_type, uint8_t bd) {
9290     __m256i       in[64];
9291     __m256i *     outcoef256    = (__m256i *)output;
9292     const int8_t *shift         = fwd_txfm_shift_ls[TX_32X16];
9293     const int32_t txw_idx       = get_txw_idx(TX_32X16);
9294     const int32_t txh_idx       = get_txh_idx(TX_32X16);
9295     int8_t        bitcol        = fwd_cos_bit_col[txw_idx][txh_idx];
9296     int8_t        bitrow        = fwd_cos_bit_row[txw_idx][txh_idx];
9297     const int32_t txfm_size_col = tx_size_wide[TX_32X16];
9298     const int32_t txfm_size_row = tx_size_high[TX_32X16];
9299     const int32_t num_row       = txfm_size_row >> 3;
9300     const int32_t num_col       = txfm_size_col >> 3;
9301 
9302     switch (tx_type) {
9303     case IDTX:
9304         load_buffer_16x8n(input, in, stride, 0, 0, shift[0], txfm_size_row / 2);
9305         fidtx32x16_N2_avx2(in, in, bitcol, txfm_size_row / 2);
9306         col_txfm_32x16_N2_rounding(&in[0], -shift[1]);
9307         col_txfm_32x16_N2_rounding(&in[1], -shift[1]);
9308         av1_idtx32x16_N2_avx2(in, outcoef256, txfm_size_row / 2);
9309         av1_round_shift_rect_array_wxh_avx2(
9310             outcoef256, outcoef256, -shift[2], new_sqrt2, num_col, txfm_size_row);
9311         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9312         break;
9313     case DCT_DCT:
9314         load_buffer_32x8n(input, in, stride, 0, 0, shift[0], txfm_size_row);
9315         fdct16x16_N2_avx2(in, in, bitcol, num_col, num_col);
9316         col_txfm_16x16_rounding(&in[0], -shift[1]);
9317         transpose_8nx8n_N2_half(in, outcoef256, txfm_size_col, txfm_size_row);
9318         av1_fdct32_new_N2_avx2(outcoef256, in, bitrow, 8, num_row);
9319         transpose_8nx8n_N2_quad(in, outcoef256, txfm_size_row, txfm_size_col);
9320         av1_round_shift_rect_array_wxh_avx2(
9321             outcoef256, outcoef256, -shift[2], new_sqrt2, num_col, txfm_size_row);
9322         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9323         break;
9324     default: assert(0);
9325     }
9326 
9327     (void)bd;
9328 }
9329 
9330 /* call this function only for DCT_DCT, IDTX */
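/* N2 variant: only the top-left 4x16 block of coefficients is kept; the
 * remaining output is zeroed with clear_buffer_wxh_N2(). */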
9331 void svt_av1_fwd_txfm2d_8x32_N2_avx2(int16_t *input, int32_t *output, uint32_t stride,
9332                                      TxType tx_type, uint8_t bd) {
9333     __m256i       in[32];
9334     __m256i *     outcoef256 = (__m256i *)output;
9335     const int8_t *shift      = fwd_txfm_shift_ls[TX_8X32];
9336     const int32_t txw_idx    = get_txw_idx(TX_8X32);
9337     const int32_t txh_idx    = get_txh_idx(TX_8X32);
9338     int8_t        bitcol     = fwd_cos_bit_col[txw_idx][txh_idx];
9339     int8_t        bitrow     = fwd_cos_bit_row[txw_idx][txh_idx];
9340 
9341     const int32_t txfm_size_col = tx_size_wide[TX_8X32];
9342     const int32_t txfm_size_row = tx_size_high[TX_8X32];
9343     const int32_t num_row       = txfm_size_row >> 3;
9344     const int32_t num_col       = txfm_size_col >> 3;
9345 
9346     switch (tx_type) {
9347     case IDTX:
9348         load_buffer_8x16(input, in, stride, 0, 0, shift[0]);
9349         av1_idtx16_new_avx2(in, in, bitcol, num_col);
9350         col_txfm_16x16_N2_rounding(in, -shift[1]);
9351         // row transform
9352         fidtx32x8_N2_avx2(in, outcoef256, bitrow, num_col, 16);
9353         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9354         break;
9355     case DCT_DCT:
9356         load_buffer_8x16(input, in, stride, 0, 0, shift[0]);
9357         load_buffer_8x16(input + (txfm_size_row >> 1) * stride, in + 16, stride, 0, 0, shift[0]);
9358         av1_fdct32_new_N2_avx2(in, in, bitcol, 8, num_col);
9359         col_txfm_16x16_N2_rounding(in, -shift[1]);
9360         transpose_8nx8n_N2_half(in, outcoef256, txfm_size_col, txfm_size_row);
9361         // row transform
9362         fdct8x8_N2_avx2(outcoef256, in, bitrow, num_row);
9363         fdct8x8_N2_avx2(outcoef256 + 1, in + 1, bitrow, num_row);
9364         transpose_8nx8n_N2_quad(in, outcoef256, txfm_size_row, txfm_size_col);
9365         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9366         break;
9367     default: assert(0);
9368     }
9369 
9370     (void)bd;
9371 }
9372 
9373 /* call this function only for DCT_DCT, IDTX */
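/* N2 variant: only the top-left 16x4 block of coefficients is kept; the
 * remaining output is zeroed with clear_buffer_wxh_N2(). */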
9374 void svt_av1_fwd_txfm2d_32x8_N2_avx2(int16_t *input, int32_t *output, uint32_t stride,
9375                                      TxType tx_type, uint8_t bd) {
9376     __m256i       in[32];
9377     __m256i *     outcoef256 = (__m256i *)output;
9378     const int8_t *shift      = fwd_txfm_shift_ls[TX_32X8];
9379     const int32_t txw_idx    = get_txw_idx(TX_32X8);
9380     const int32_t txh_idx    = get_txh_idx(TX_32X8);
9381     int8_t        bitcol     = fwd_cos_bit_col[txw_idx][txh_idx];
9382     int8_t        bitrow     = fwd_cos_bit_row[txw_idx][txh_idx];
9383 
9384     const int32_t txfm_size_col = tx_size_wide[TX_32X8];
9385     const int32_t txfm_size_row = tx_size_high[TX_32X8];
9386     const int32_t num_row       = txfm_size_row >> 3;
9387     const int32_t num_col       = txfm_size_col >> 3;
9388 
9389     (void)bd;
9390 
9391     switch (tx_type) {
9392     case IDTX:
9393         load_buffer_16x8n(input, in, stride, 0, 0, shift[0], txfm_size_row / 2);
9394         fidtx32x8_N2_avx2(in, in, bitcol, num_col, 4);
9395         fidtx32x8_N2_avx2(in + 1, in + 1, bitcol, num_col, 4);
9396         col_txfm_32x8_N2_half_rounding(&in[0], -shift[1]);
9397         // row transform
9398         av1_idtx32_new_N2_avx2(in, outcoef256, bitrow, num_row, 16);
9399         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9400         break;
9401     case DCT_DCT:
9402         load_buffer_32x8n(input, in, stride, 0, 0, shift[0], txfm_size_row);
9403         for (int32_t i = 0; i < num_col; i++) fdct8x8_N2_avx2((in + i), (in + i), bitcol, num_col);
9404         col_txfm_16x16_N2_rounding(&in[0], -shift[1]);
9405         transpose_8nx8n_N2_half(in, outcoef256, txfm_size_col, txfm_size_row);
9406         // row transform
9407         av1_fdct32_new_N2_avx2(outcoef256, in, bitrow, 8, num_row);
9408         transpose_8nx8n_N2_quad(in, outcoef256, txfm_size_row, txfm_size_col);
9409         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9410         break;
9411     default: assert(0);
9412     }
9413 }
9414 
9415 /* call this function for all 16 transform types */
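/* N2 variant: only the top-left 4x8 block of coefficients is produced;
 * new_sqrt2 rectangular scaling is applied before the rest of the buffer is
 * cleared. */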
9416 void svt_av1_fwd_txfm2d_8x16_N2_avx2(int16_t *input, int32_t *output, uint32_t stride,
9417                                      TxType tx_type, uint8_t bd) {
9418     __m256i       in[16], out[16];
9419     __m256i *     outcoef256 = (__m256i *)output;
9420     const int8_t *shift      = fwd_txfm_shift_ls[TX_8X16];
9421     const int32_t txw_idx    = get_txw_idx(TX_8X16);
9422     const int32_t txh_idx    = get_txh_idx(TX_8X16);
9423     int8_t        bitcol     = fwd_cos_bit_col[txw_idx][txh_idx];
9424     int8_t        bitrow     = fwd_cos_bit_row[txw_idx][txh_idx];
9425     int32_t       ud_flip, lr_flip;
9426     get_flip_cfg(tx_type, &ud_flip, &lr_flip);
9427     const int32_t txfm_size_col = tx_size_wide[TX_8X16];
9428     const int32_t txfm_size_row = tx_size_high[TX_8X16];
9429     const int32_t num_col       = txfm_size_col >> 3;
9430 
9431     switch (tx_type) {
9432     case DCT_DCT:
9433         load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
9434         fdct16x16_N2_avx2(in, in, bitcol, num_col, num_col);
9435         col_txfm_8x8_rounding(in, -shift[1]);
9436         transpose_8x8_avx2(in, out);
9437         fdct8x8_N2_avx2(out, in, bitrow, 1);
9438         av1_round_shift_rect_array_32_avx2(in, out, 4, -shift[2], new_sqrt2);
9439         transpose_8x8_half_avx2(out, outcoef256);
9440         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9441         break;
9442     case ADST_DCT:
9443         load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
9444         fadst16x16_N2_avx2(in, in, bitcol, num_col, num_col);
9445         col_txfm_8x8_rounding(in, -shift[1]);
9446         transpose_8x8_avx2(in, out);
9447         fdct8x8_N2_avx2(out, in, bitrow, 1);
9448         av1_round_shift_rect_array_32_avx2(in, out, 4, -shift[2], new_sqrt2);
9449         transpose_8x8_half_avx2(out, outcoef256);
9450         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9451         break;
9452     case DCT_ADST:
9453         load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
9454         fdct16x16_N2_avx2(in, in, bitcol, num_col, num_col);
9455         col_txfm_8x8_rounding(in, -shift[1]);
9456         transpose_8x8_avx2(in, out);
9457         fadst8x8_N2_avx2(out, in, bitrow, 1);
9458         av1_round_shift_rect_array_32_avx2(in, out, 4, -shift[2], new_sqrt2);
9459         transpose_8x8_half_avx2(out, outcoef256);
9460         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9461         break;
9462     case ADST_ADST:
9463         load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
9464         fadst16x16_N2_avx2(in, in, bitcol, num_col, num_col);
9465         col_txfm_8x8_rounding(in, -shift[1]);
9466         transpose_8x8_avx2(in, out);
9467         fadst8x8_N2_avx2(out, in, bitrow, 1);
9468         av1_round_shift_rect_array_32_avx2(in, out, 4, -shift[2], new_sqrt2);
9469         transpose_8x8_half_avx2(out, outcoef256);
9470         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9471         break;
9472     case FLIPADST_DCT:
9473         load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
9474         fadst16x16_N2_avx2(in, in, bitcol, num_col, num_col);
9475         col_txfm_8x8_rounding(in, -shift[1]);
9476         transpose_8x8_avx2(in, out);
9477         fdct8x8_N2_avx2(out, in, bitrow, 1);
9478         av1_round_shift_rect_array_32_avx2(in, out, 4, -shift[2], new_sqrt2);
9479         transpose_8x8_half_avx2(out, outcoef256);
9480         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9481         break;
9482     case DCT_FLIPADST:
9483         load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
9484         fdct16x16_N2_avx2(in, in, bitcol, num_col, num_col);
9485         col_txfm_8x8_rounding(in, -shift[1]);
9486         transpose_8x8_avx2(in, out);
9487         fadst8x8_N2_avx2(out, in, bitrow, 1);
9488         av1_round_shift_rect_array_32_avx2(in, out, 4, -shift[2], new_sqrt2);
9489         transpose_8x8_half_avx2(out, outcoef256);
9490         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9491         break;
9492     case FLIPADST_FLIPADST:
9493         load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
9494         fadst16x16_N2_avx2(in, in, bitcol, num_col, num_col);
9495         col_txfm_8x8_rounding(in, -shift[1]);
9496         transpose_8x8_avx2(in, out);
9497         fadst8x8_N2_avx2(out, in, bitrow, 1);
9498         av1_round_shift_rect_array_32_avx2(in, out, 4, -shift[2], new_sqrt2);
9499         transpose_8x8_half_avx2(out, outcoef256);
9500         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9501         break;
9502     case ADST_FLIPADST:
9503         load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
9504         fadst16x16_N2_avx2(in, in, bitcol, num_col, num_col);
9505         col_txfm_8x8_rounding(in, -shift[1]);
9506         transpose_8x8_avx2(in, out);
9507         fadst8x8_N2_avx2(out, in, bitrow, 1);
9508         av1_round_shift_rect_array_32_avx2(in, out, 4, -shift[2], new_sqrt2);
9509         transpose_8x8_half_avx2(out, outcoef256);
9510         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9511         break;
9512     case FLIPADST_ADST:
9513         load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
9514         fadst16x16_N2_avx2(in, in, bitcol, num_col, num_col);
9515         col_txfm_8x8_rounding(in, -shift[1]);
9516         transpose_8x8_avx2(in, out);
9517         fadst8x8_N2_avx2(out, in, bitrow, 1);
9518         av1_round_shift_rect_array_32_avx2(in, out, 4, -shift[2], new_sqrt2);
9519         transpose_8x8_half_avx2(out, outcoef256);
9520         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9521         break;
9522     case IDTX:
9523         load_buffer_8x8(input, in, stride, ud_flip, lr_flip, shift[0]);
9524         fidtx8xn_N2_col_avx2(in, in, bitcol, 8);
9525         col_txfm_8x8_rounding(in, -shift[1]);
9526         transpose_8x8_avx2(in, out);
9527         fidtx8x8_N2_avx2(out, in, bitrow, 1);
9528         av1_round_shift_rect_array_32_avx2(in, out, 4, -shift[2], new_sqrt2);
9529         transpose_8x8_half_avx2(out, outcoef256);
9530         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9531         break;
9532     case V_DCT:
9533         load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
9534         fdct16x16_N2_avx2(in, in, bitcol, num_col, num_col);
9535         col_txfm_8x8_rounding(in, -shift[1]);
9536         transpose_8x8_avx2(in, out);
9537         fidtx8x8_N2_avx2(out, in, bitrow, 1);
9538         av1_round_shift_rect_array_32_avx2(in, out, 4, -shift[2], new_sqrt2);
9539         transpose_8x8_half_avx2(out, outcoef256);
9540         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9541         break;
9542     case H_DCT:
9543         load_buffer_8x8(input, in, stride, ud_flip, lr_flip, shift[0]);
9544         fidtx8xn_N2_col_avx2(in, in, bitcol, 8);
9545         col_txfm_8x8_rounding(in, -shift[1]);
9546         transpose_8x8_avx2(in, out);
9547         fdct8x8_N2_avx2(out, in, bitrow, 1);
9548         av1_round_shift_rect_array_32_avx2(in, out, 4, -shift[2], new_sqrt2);
9549         transpose_8x8_half_avx2(out, outcoef256);
9550         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9551         break;
9552     case V_ADST:
9553         load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
9554         fadst16x16_N2_avx2(in, in, bitcol, num_col, num_col);
9555         col_txfm_8x8_rounding(in, -shift[1]);
9556         transpose_8x8_avx2(in, out);
9557         fidtx8x8_N2_avx2(out, in, bitrow, 1);
9558         av1_round_shift_rect_array_32_avx2(in, out, 4, -shift[2], new_sqrt2);
9559         transpose_8x8_half_avx2(out, outcoef256);
9560         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9561         break;
9562     case H_ADST:
9563         load_buffer_8x8(input, in, stride, ud_flip, lr_flip, shift[0]);
9564         fidtx8xn_N2_col_avx2(in, in, bitcol, 8);
9565         col_txfm_8x8_rounding(in, -shift[1]);
9566         transpose_8x8_avx2(in, out);
9567         fadst8x8_N2_avx2(out, in, bitrow, 1);
9568         av1_round_shift_rect_array_32_avx2(in, out, 4, -shift[2], new_sqrt2);
9569         transpose_8x8_half_avx2(out, outcoef256);
9570         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9571         break;
9572     case V_FLIPADST:
9573         load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
9574         fadst16x16_N2_avx2(in, in, bitcol, num_col, num_col);
9575         col_txfm_8x8_rounding(in, -shift[1]);
9576         transpose_8x8_avx2(in, out);
9577         fidtx8x8_N2_avx2(out, in, bitrow, 1);
9578         av1_round_shift_rect_array_32_avx2(in, out, 4, -shift[2], new_sqrt2);
9579         transpose_8x8_half_avx2(out, outcoef256);
9580         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9581         break;
9582     case H_FLIPADST:
9583         load_buffer_8x8(input, in, stride, ud_flip, lr_flip, shift[0]);
9584         fidtx8xn_N2_col_avx2(in, in, bitcol, 8);
9585         col_txfm_8x8_rounding(in, -shift[1]);
9586         transpose_8x8_avx2(in, out);
9587         fadst8x8_N2_avx2(out, in, bitrow, 1);
9588         av1_round_shift_rect_array_32_avx2(in, out, 4, -shift[2], new_sqrt2);
9589         transpose_8x8_half_avx2(out, outcoef256);
9590         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9591         break;
9592     default: assert(0);
9593     }
9594 
9595     (void)bd;
9596 }
9597 
9598 /* call this function for all 16 transform types */
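/* N2 variant: the 8x8 column transforms run per 8-column group, the row pass
 * keeps only the low-frequency half, and lr_flip is handled by reversing the
 * intermediate registers before the row transform. */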
9599 void svt_av1_fwd_txfm2d_16x8_N2_avx2(int16_t *input, int32_t *output, uint32_t stride,
9600                                      TxType tx_type, uint8_t bd) {
9601     __m256i       in[16], out[16];
9602     __m256i *     outcoef256 = (__m256i *)output;
9603     const int8_t *shift      = fwd_txfm_shift_ls[TX_16X8];
9604     const int32_t txw_idx    = get_txw_idx(TX_16X8);
9605     const int32_t txh_idx    = get_txh_idx(TX_16X8);
9606     int8_t        bitcol     = fwd_cos_bit_col[txw_idx][txh_idx];
9607     int8_t        bitrow     = fwd_cos_bit_row[txw_idx][txh_idx];
9608     int32_t       ud_flip, lr_flip;
9609     get_flip_cfg(tx_type, &ud_flip, &lr_flip);
9610     const int32_t txfm_size_col = tx_size_wide[TX_16X8];
9611     const int32_t txfm_size_row = tx_size_high[TX_16X8];
9612     const int32_t num_row       = txfm_size_row >> 3;
9613     const int32_t num_col       = txfm_size_col >> 3;
9614     assert(num_col > 0);
9615 
9616     switch (tx_type) {
9617     case DCT_DCT:
9618         for (int32_t i = 0; i < num_col; i++) {
9619             load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
9620             fdct8x8_N2_avx2(in, in, bitcol, 1);
9621             col_txfm_8x8_N2_rounding(in, -shift[1]);
9622             transpose_8x8_half_avx2(in, out + i * 8);
9623         }
9624         if (lr_flip) {
9625             for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
9626             fdct16x16_N2_avx2(in, out, bitrow, num_row, 1);
9627         } else
9628             fdct16x16_N2_avx2(out, out, bitrow, num_row, 1);
9629         transpose_8x8_avx2(out, in);
9630         av1_round_shift_rect_array_32_avx2(in, out, 4, -shift[2], new_sqrt2);
9631         write_buffer_16x8_N2_avx2(out, outcoef256);
9632         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9633         break;
9634     case ADST_DCT:
9635         for (int32_t i = 0; i < num_col; i++) {
9636             load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
9637             fadst8x8_N2_avx2(in, in, bitcol, 1);
9638             col_txfm_8x8_N2_rounding(in, -shift[1]);
9639             transpose_8x8_half_avx2(in, out + i * 8);
9640         }
9641         if (lr_flip) {
9642             for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
9643             fdct16x16_N2_avx2(in, out, bitrow, num_row, 1);
9644         } else
9645             fdct16x16_N2_avx2(out, out, bitrow, num_row, 1);
9646         transpose_8x8_avx2(out, in);
9647         av1_round_shift_rect_array_32_avx2(in, out, 4, -shift[2], new_sqrt2);
9648         write_buffer_16x8_N2_avx2(out, outcoef256);
9649         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9650         break;
9651     case DCT_ADST:
9652         for (int32_t i = 0; i < num_col; i++) {
9653             load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
9654             fdct8x8_N2_avx2(in, in, bitcol, 1);
9655             col_txfm_8x8_N2_rounding(in, -shift[1]);
9656             transpose_8x8_half_avx2(in, out + i * 8);
9657         }
9658         if (lr_flip) {
9659             for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
9660             fadst16x16_N2_avx2(in, out, bitrow, num_row, 1);
9661         } else
9662             fadst16x16_N2_avx2(out, out, bitrow, num_row, 1);
9663         transpose_8x8_avx2(out, in);
9664         av1_round_shift_rect_array_32_avx2(in, out, 4, -shift[2], new_sqrt2);
9665         write_buffer_16x8_N2_avx2(out, outcoef256);
9666         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9667         break;
9668     case ADST_ADST:
9669         for (int32_t i = 0; i < num_col; i++) {
9670             load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
9671             fadst8x8_N2_avx2(in, in, bitcol, 1);
9672             col_txfm_8x8_N2_rounding(in, -shift[1]);
9673             transpose_8x8_half_avx2(in, out + i * 8);
9674         }
9675         if (lr_flip) {
9676             for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
9677             fadst16x16_N2_avx2(in, out, bitrow, num_row, 1);
9678         } else
9679             fadst16x16_N2_avx2(out, out, bitrow, num_row, 1);
9680         transpose_8x8_avx2(out, in);
9681         av1_round_shift_rect_array_32_avx2(in, out, 4, -shift[2], new_sqrt2);
9682         write_buffer_16x8_N2_avx2(out, outcoef256);
9683         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9684         break;
9685     case FLIPADST_DCT:
9686         for (int32_t i = 0; i < num_col; i++) {
9687             load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
9688             fadst8x8_N2_avx2(in, in, bitcol, 1);
9689             col_txfm_8x8_N2_rounding(in, -shift[1]);
9690             transpose_8x8_half_avx2(in, out + i * 8);
9691         }
9692         if (lr_flip) {
9693             for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
9694             fdct16x16_N2_avx2(in, out, bitrow, num_row, 1);
9695         } else
9696             fdct16x16_N2_avx2(out, out, bitrow, num_row, 1);
9697         transpose_8x8_avx2(out, in);
9698         av1_round_shift_rect_array_32_avx2(in, out, 4, -shift[2], new_sqrt2);
9699         write_buffer_16x8_N2_avx2(out, outcoef256);
9700         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9701         break;
9702     case DCT_FLIPADST:
9703         for (int32_t i = 0; i < num_col; i++) {
9704             load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
9705             fdct8x8_N2_avx2(in, in, bitcol, 1);
9706             col_txfm_8x8_N2_rounding(in, -shift[1]);
9707             transpose_8x8_half_avx2(in, out + i * 8);
9708         }
9709         if (lr_flip) {
9710             for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
9711             fadst16x16_N2_avx2(in, out, bitrow, num_row, 1);
9712         } else
9713             fadst16x16_N2_avx2(out, out, bitrow, num_row, 1);
9714         transpose_8x8_avx2(out, in);
9715         av1_round_shift_rect_array_32_avx2(in, out, 4, -shift[2], new_sqrt2);
9716         write_buffer_16x8_N2_avx2(out, outcoef256);
9717         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9718         break;
9719     case FLIPADST_FLIPADST:
9720         for (int32_t i = 0; i < num_col; i++) {
9721             load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
9722             fadst8x8_N2_avx2(in, in, bitcol, 1);
9723             col_txfm_8x8_N2_rounding(in, -shift[1]);
9724             transpose_8x8_half_avx2(in, out + i * 8);
9725         }
9726         if (lr_flip) {
9727             for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
9728             fadst16x16_N2_avx2(in, out, bitrow, num_row, 1);
9729         } else
9730             fadst16x16_N2_avx2(out, out, bitrow, num_row, 1);
9731         transpose_8x8_avx2(out, in);
9732         av1_round_shift_rect_array_32_avx2(in, out, 4, -shift[2], new_sqrt2);
9733         write_buffer_16x8_N2_avx2(out, outcoef256);
9734         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9735         break;
9736     case ADST_FLIPADST:
9737         for (int32_t i = 0; i < num_col; i++) {
9738             load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
9739             fadst8x8_N2_avx2(in, in, bitcol, 1);
9740             col_txfm_8x8_N2_rounding(in, -shift[1]);
9741             transpose_8x8_half_avx2(in, out + i * 8);
9742         }
9743         if (lr_flip) {
9744             for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
9745             fadst16x16_N2_avx2(in, out, bitrow, num_row, 1);
9746         } else
9747             fadst16x16_N2_avx2(out, out, bitrow, num_row, 1);
9748         transpose_8x8_avx2(out, in);
9749         av1_round_shift_rect_array_32_avx2(in, out, 4, -shift[2], new_sqrt2);
9750         write_buffer_16x8_N2_avx2(out, outcoef256);
9751         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9752         break;
9753     case FLIPADST_ADST:
9754         for (int32_t i = 0; i < num_col; i++) {
9755             load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
9756             fadst8x8_N2_avx2(in, in, bitcol, 1);
9757             col_txfm_8x8_N2_rounding(in, -shift[1]);
9758             transpose_8x8_half_avx2(in, out + i * 8);
9759         }
9760         if (lr_flip) {
9761             for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
9762             fadst16x16_N2_avx2(in, out, bitrow, num_row, 1);
9763         } else
9764             fadst16x16_N2_avx2(out, out, bitrow, num_row, 1);
9765         transpose_8x8_avx2(out, in);
9766         av1_round_shift_rect_array_32_avx2(in, out, 4, -shift[2], new_sqrt2);
9767         write_buffer_16x8_N2_avx2(out, outcoef256);
9768         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9769         break;
9770     case IDTX:
9771         load_buffer_8x8(input, in, stride, ud_flip, 0, shift[0]);
9772         fidtx8x8_N2_avx2(in, out, bitcol, 1);
9773         col_txfm_8x8_N2_rounding(out, -shift[1]);
9774         if (lr_flip) {
9775             for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
9776             fidtx8xn_N2_col_avx2(in, out, bitrow, 4);
9777         } else
9778             fidtx8xn_N2_col_avx2(out, out, bitrow, 4);
9779         av1_round_shift_rect_array_32_avx2(out, out, 4, -shift[2], new_sqrt2);
9780         write_buffer_16x8_N2_avx2(out, outcoef256);
9781         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9782         break;
9783     case V_DCT:
9784         load_buffer_8x8(input, in, stride, ud_flip, 0, shift[0]);
9785         fdct8x8_N2_avx2(in, out, bitcol, 1);
9786         col_txfm_8x8_N2_rounding(out, -shift[1]);
9787         if (lr_flip) {
9788             for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
9789             fidtx8xn_N2_col_avx2(in, out, bitrow, 4);
9790         } else
9791             fidtx8xn_N2_col_avx2(out, out, bitrow, 4);
9792         av1_round_shift_rect_array_32_avx2(out, out, 4, -shift[2], new_sqrt2);
9793         write_buffer_16x8_N2_avx2(out, outcoef256);
9794         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9795         break;
9796     case H_DCT:
9797         for (int32_t i = 0; i < num_col; i++) {
9798             load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
9799             fidtx8x8_N2_avx2(in, in, bitcol, 1);
9800             col_txfm_8x8_N2_rounding(in, -shift[1]);
9801             transpose_8x8_half_avx2(in, out + i * 8);
9802         }
9803         if (lr_flip) {
9804             for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
9805             fdct16x16_N2_avx2(in, out, bitrow, num_row, 1);
9806         } else
9807             fdct16x16_N2_avx2(out, out, bitrow, num_row, 1);
9808         transpose_8x8_avx2(out, in);
9809         av1_round_shift_rect_array_32_avx2(in, out, 4, -shift[2], new_sqrt2);
9810         write_buffer_16x8_N2_avx2(out, outcoef256);
9811         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9812         break;
9813     case V_ADST:
9814         load_buffer_8x8(input, in, stride, ud_flip, 0, shift[0]);
9815         fadst8x8_N2_avx2(in, out, bitcol, 1);
9816         col_txfm_8x8_N2_rounding(out, -shift[1]);
9817         if (lr_flip) {
9818             for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
9819             fidtx8xn_N2_col_avx2(in, out, bitrow, 4);
9820         } else
9821             fidtx8xn_N2_col_avx2(out, out, bitrow, 4);
9822         av1_round_shift_rect_array_32_avx2(out, out, 4, -shift[2], new_sqrt2);
9823         write_buffer_16x8_N2_avx2(out, outcoef256);
9824         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9825         break;
9826     case H_ADST:
9827         for (int32_t i = 0; i < num_col; i++) {
9828             load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
9829             fidtx8x8_N2_avx2(in, in, bitcol, 1);
9830             col_txfm_8x8_N2_rounding(in, -shift[1]);
9831             transpose_8x8_half_avx2(in, out + i * 8);
9832         }
9833         if (lr_flip) {
9834             for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
9835             fadst16x16_N2_avx2(in, out, bitrow, num_row, 1);
9836         } else
9837             fadst16x16_N2_avx2(out, out, bitrow, num_row, 1);
9838         transpose_8x8_avx2(out, in);
9839         av1_round_shift_rect_array_32_avx2(in, out, 4, -shift[2], new_sqrt2);
9840         write_buffer_16x8_N2_avx2(out, outcoef256);
9841         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9842         break;
9843     case V_FLIPADST:
9844         load_buffer_8x8(input, in, stride, ud_flip, 0, shift[0]);
9845         fadst8x8_N2_avx2(in, out, bitcol, 1);
9846         col_txfm_8x8_N2_rounding(out, -shift[1]);
9847         if (lr_flip) {
9848             for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
9849             fidtx8xn_N2_col_avx2(in, out, bitrow, 4);
9850         } else
9851             fidtx8xn_N2_col_avx2(out, out, bitrow, 4);
9852         av1_round_shift_rect_array_32_avx2(out, out, 4, -shift[2], new_sqrt2);
9853         write_buffer_16x8_N2_avx2(out, outcoef256);
9854         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9855         break;
9856     case H_FLIPADST:
9857         for (int32_t i = 0; i < num_col; i++) {
9858             load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
9859             fidtx8x8_N2_avx2(in, in, bitcol, 1);
9860             col_txfm_8x8_N2_rounding(in, -shift[1]);
9861             transpose_8x8_half_avx2(in, out + i * 8);
9862         }
9863         if (lr_flip) {
9864             for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
9865             fadst16x16_avx2(in, out, bitrow, num_row);
9866         } else
9867             fadst16x16_avx2(out, out, bitrow, num_row);
9868         transpose_8x8_avx2(out, in);
9869         av1_round_shift_rect_array_32_avx2(in, out, 4, -shift[2], new_sqrt2);
9870         write_buffer_16x8_N2_avx2(out, outcoef256);
9871         clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
9872         break;
9873     default: assert(0);
9874     }
9875 
9876     (void)bd;
9877 }
9878 
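/* N2 variant of the 4x8 forward transform, covering all 16 tx_types: only the
 * low-frequency half of the coefficients is written by write_buffer_4x8_N2(),
 * after new_sqrt2 scaling for the rectangular size. */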
9879 void svt_av1_fwd_txfm2d_4x8_N2_avx2(int16_t *input, int32_t *output, uint32_t stride,
9880                                     TxType tx_type, uint8_t bd) {
9881     __m256i in[4];
9882     __m256i outcoeff256[4];
9883 
9884     const int8_t *shift   = fwd_txfm_shift_ls[TX_4X8];
9885     const int32_t txw_idx = get_txw_idx(TX_4X8);
9886     const int32_t txh_idx = get_txh_idx(TX_4X8);
9887     int32_t       bitcol  = fwd_cos_bit_col[txw_idx][txh_idx];
9888     int32_t       bitrow  = fwd_cos_bit_row[txw_idx][txh_idx];
9889 
9890     switch (tx_type) {
9891     case DCT_DCT:
9892         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
9893         fdct4x8_N2_avx2(in, in, bitcol);
9894         col_txfm_8x4_N2_rounding(in, -shift[1]);
9895         transpose_4x8_avx2(in, outcoeff256);
9896         fdct4x8_col_N2_avx2(outcoeff256, in, bitrow, 1);
9897         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 2, -shift[2], new_sqrt2);
9898         write_buffer_4x8_N2(outcoeff256, output);
9899         break;
9900     case ADST_DCT:
9901         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
9902         fadst8x4_N2_avx2(in, in, bitcol, 1);
9903         col_txfm_8x4_N2_rounding(in, -shift[1]);
9904         transpose_4x8_avx2(in, outcoeff256);
9905         fdct4x8_col_N2_avx2(outcoeff256, in, bitrow, 1);
9906         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 2, -shift[2], new_sqrt2);
9907         write_buffer_4x8_N2(outcoeff256, output);
9908         break;
9909     case DCT_ADST:
9910         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
9911         fdct4x8_N2_avx2(in, in, bitcol);
9912         col_txfm_8x4_N2_rounding(in, -shift[1]);
9913         transpose_4x8_avx2(in, outcoeff256);
9914         fadst4x8_col_N2_avx2(outcoeff256, in, bitrow, 1);
9915         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 2, -shift[2], new_sqrt2);
9916         write_buffer_4x8_N2(outcoeff256, output);
9917         break;
9918     case ADST_ADST:
9919         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
9920         fadst8x4_N2_avx2(in, in, bitcol, 1);
9921         col_txfm_8x4_N2_rounding(in, -shift[1]);
9922         transpose_4x8_avx2(in, outcoeff256);
9923         fadst4x8_col_N2_avx2(outcoeff256, in, bitrow, 1);
9924         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 2, -shift[2], new_sqrt2);
9925         write_buffer_4x8_N2(outcoeff256, output);
9926         break;
9927     case FLIPADST_DCT:
9928         load_buffer_4x8_avx2(input, in, stride, 1, 0, shift[0]);
9929         fadst8x4_N2_avx2(in, in, bitcol, 1);
9930         col_txfm_8x4_N2_rounding(in, -shift[1]);
9931         transpose_4x8_avx2(in, outcoeff256);
9932         fdct4x8_col_N2_avx2(outcoeff256, in, bitrow, 1);
9933         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 2, -shift[2], new_sqrt2);
9934         write_buffer_4x8_N2(outcoeff256, output);
9935         break;
9936     case DCT_FLIPADST:
9937         load_buffer_4x8_avx2(input, in, stride, 0, 1, shift[0]);
9938         fdct4x8_N2_avx2(in, in, bitcol);
9939         col_txfm_8x4_N2_rounding(in, -shift[1]);
9940         transpose_4x8_avx2(in, outcoeff256);
9941         fadst4x8_col_N2_avx2(outcoeff256, in, bitrow, 1);
9942         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 2, -shift[2], new_sqrt2);
9943         write_buffer_4x8_N2(outcoeff256, output);
9944         break;
9945     case FLIPADST_FLIPADST:
9946         load_buffer_4x8_avx2(input, in, stride, 1, 1, shift[0]);
9947         fadst8x4_N2_avx2(in, in, bitcol, 1);
9948         col_txfm_8x4_N2_rounding(in, -shift[1]);
9949         transpose_4x8_avx2(in, outcoeff256);
9950         fadst4x8_col_N2_avx2(outcoeff256, in, bitrow, 1);
9951         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 2, -shift[2], new_sqrt2);
9952         write_buffer_4x8_N2(outcoeff256, output);
9953         break;
9954     case ADST_FLIPADST:
9955         load_buffer_4x8_avx2(input, in, stride, 0, 1, shift[0]);
9956         fadst8x4_N2_avx2(in, in, bitcol, 1);
9957         col_txfm_8x4_N2_rounding(in, -shift[1]);
9958         transpose_4x8_avx2(in, outcoeff256);
9959         fadst4x8_col_N2_avx2(outcoeff256, in, bitrow, 1);
9960         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 2, -shift[2], new_sqrt2);
9961         write_buffer_4x8_N2(outcoeff256, output);
9962         break;
9963     case FLIPADST_ADST:
9964         load_buffer_4x8_avx2(input, in, stride, 1, 0, shift[0]);
9965         fadst8x4_N2_avx2(in, in, bitcol, 1);
9966         col_txfm_8x4_N2_rounding(in, -shift[1]);
9967         transpose_4x8_avx2(in, outcoeff256);
9968         fadst4x8_col_N2_avx2(outcoeff256, in, bitrow, 1);
9969         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 2, -shift[2], new_sqrt2);
9970         write_buffer_4x8_N2(outcoeff256, output);
9971         break;
9972     case IDTX:
9973         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
9974         fidtx8x4_N2_avx2(in, in, bitcol);
9975         col_txfm_8x4_N2_rounding(in, -shift[1]);
9976         fidtx4x8_col_N2_avx2(in, in, bitrow, 2);
9977         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 2, -shift[2], new_sqrt2);
9978         write_buffer_4x8_N2(outcoeff256, output);
9979         break;
9980     case V_DCT:
9981         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
9982         fdct4x8_N2_avx2(in, in, bitcol);
9983         col_txfm_8x4_N2_rounding(in, -shift[1]);
9984         fidtx4x8_col_N2_avx2(in, in, bitrow, 2);
9985         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 2, -shift[2], new_sqrt2);
9986         write_buffer_4x8_N2(outcoeff256, output);
9987         break;
9988     case H_DCT:
9989         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
9990         fidtx8x4_N2_avx2(in, in, bitcol);
9991         col_txfm_8x4_N2_rounding(in, -shift[1]);
9992         transpose_4x8_avx2(in, outcoeff256);
9993         fdct4x8_col_N2_avx2(outcoeff256, in, bitrow, 1);
9994         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 2, -shift[2], new_sqrt2);
9995         write_buffer_4x8_N2(outcoeff256, output);
9996         break;
9997     case V_ADST:
9998         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
9999         fadst8x4_N2_avx2(in, in, bitcol, 1);
10000         col_txfm_8x4_N2_rounding(in, -shift[1]);
10001         fidtx4x8_col_N2_avx2(in, in, bitrow, 2);
10002         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 2, -shift[2], new_sqrt2);
10003         write_buffer_4x8_N2(outcoeff256, output);
10004         break;
10005     case H_ADST:
10006         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
10007         fidtx8x4_N2_avx2(in, in, bitcol);
10008         col_txfm_8x4_N2_rounding(in, -shift[1]);
10009         transpose_4x8_avx2(in, outcoeff256);
10010         fadst4x8_col_N2_avx2(outcoeff256, in, bitrow, 1);
10011         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 2, -shift[2], new_sqrt2);
10012         write_buffer_4x8_N2(outcoeff256, output);
10013         break;
10014     case V_FLIPADST:
10015         load_buffer_4x8_avx2(input, in, stride, 1, 0, shift[0]);
10016         fadst8x4_N2_avx2(in, in, bitcol, 1);
10017         col_txfm_8x4_N2_rounding(in, -shift[1]);
10018         fidtx4x8_col_N2_avx2(in, in, bitrow, 2);
10019         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 2, -shift[2], new_sqrt2);
10020         write_buffer_4x8_N2(outcoeff256, output);
10021         break;
10022     case H_FLIPADST:
10023         load_buffer_4x8_avx2(input, in, stride, 0, 1, shift[0]);
10024         fidtx8x4_N2_avx2(in, in, bitcol);
10025         col_txfm_8x4_N2_rounding(in, -shift[1]);
10026         transpose_4x8_avx2(in, outcoeff256);
10027         fadst4x8_col_N2_avx2(outcoeff256, in, bitrow, 1);
10028         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 2, -shift[2], new_sqrt2);
10029         write_buffer_4x8_N2(outcoeff256, output);
10030         break;
10031     default: assert(0);
10032     }
10033     (void)bd;
10034 }
10035 
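/* N2 variant of the 8x4 forward transform, covering all 16 tx_types: the
 * first-pass transform and the shift[1] rounding are fused in the
 * *_with_round_avx2 helpers, and the unused output area is cleared. */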
10036 void svt_av1_fwd_txfm2d_8x4_N2_avx2(int16_t *input, int32_t *output, uint32_t stride,
10037                                     TxType tx_type, uint8_t bd) {
10038     __m256i       in[4];
10039     __m256i *     outcoeff256 = (__m256i *)output;
10040     const int8_t *shift       = fwd_txfm_shift_ls[TX_8X4];
10041     const int32_t txw_idx     = get_txw_idx(TX_8X4);
10042     const int32_t txh_idx     = get_txh_idx(TX_8X4);
10043     int32_t       bitcol      = fwd_cos_bit_col[txw_idx][txh_idx];
10044     int32_t       bitrow      = fwd_cos_bit_row[txw_idx][txh_idx];
10045 
10046     switch (tx_type) {
10047     case DCT_DCT:
10048         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
10049         fdct4x8_row_N2_with_round_avx2(in, in, bitcol, 1, -shift[1]);
10050         fdct4x8_N2_avx2(in, outcoeff256, bitrow);
10051         transpose_4x8_avx2(outcoeff256, in);
10052         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 2, -shift[2], new_sqrt2);
10053         clear_buffer_wxh_N2(outcoeff256, 1, 4);
10054         break;
10055     case ADST_DCT:
10056         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
10057         fadst4x8_row_N2_with_round_avx2(in, in, bitcol, 1, -shift[1]);
10058         fdct4x8_N2_avx2(in, outcoeff256, bitrow);
10059         transpose_4x8_avx2(outcoeff256, in);
10060         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 2, -shift[2], new_sqrt2);
10061         clear_buffer_wxh_N2(outcoeff256, 1, 4);
10062         break;
10063     case DCT_ADST:
10064         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
10065         fdct4x8_row_N2_with_round_avx2(in, in, bitcol, 1, -shift[1]);
10066         fadst8x4_N2_avx2(in, outcoeff256, bitrow, 1);
10067         transpose_4x8_avx2(outcoeff256, in);
10068         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 2, -shift[2], new_sqrt2);
10069         clear_buffer_wxh_N2(outcoeff256, 1, 4);
10070         break;
10071     case ADST_ADST:
10072         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
10073         fadst4x8_row_N2_with_round_avx2(in, in, bitcol, 1, -shift[1]);
10074         fadst8x4_N2_avx2(in, outcoeff256, bitrow, 1);
10075         transpose_4x8_avx2(outcoeff256, in);
10076         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 2, -shift[2], new_sqrt2);
10077         clear_buffer_wxh_N2(outcoeff256, 1, 4);
10078         break;
10079     case FLIPADST_DCT:
10080         load_buffer_8x4_avx2(input, in, stride, 1, 0, shift[0]);
10081         fadst4x8_row_N2_with_round_avx2(in, in, bitcol, 1, -shift[1]);
10082         fdct4x8_N2_avx2(in, outcoeff256, bitrow);
10083         transpose_4x8_avx2(outcoeff256, in);
10084         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 2, -shift[2], new_sqrt2);
10085         clear_buffer_wxh_N2(outcoeff256, 1, 4);
10086         break;
10087     case DCT_FLIPADST:
10088         load_buffer_8x4_avx2(input, in, stride, 0, 1, shift[0]);
10089         fdct4x8_row_N2_with_round_avx2(in, in, bitcol, 1, -shift[1]);
10090         fadst8x4_N2_avx2(in, outcoeff256, bitrow, 1);
10091         transpose_4x8_avx2(outcoeff256, in);
10092         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 2, -shift[2], new_sqrt2);
10093         clear_buffer_wxh_N2(outcoeff256, 1, 4);
10094         break;
10095     case FLIPADST_FLIPADST:
10096         load_buffer_8x4_avx2(input, in, stride, 1, 1, shift[0]);
10097         fadst4x8_row_N2_with_round_avx2(in, in, bitcol, 1, -shift[1]);
10098         fadst8x4_N2_avx2(in, outcoeff256, bitrow, 1);
10099         transpose_4x8_avx2(outcoeff256, in);
10100         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 2, -shift[2], new_sqrt2);
10101         clear_buffer_wxh_N2(outcoeff256, 1, 4);
10102         break;
10103     case ADST_FLIPADST:
10104         load_buffer_8x4_avx2(input, in, stride, 0, 1, shift[0]);
10105         fadst4x8_row_N2_with_round_avx2(in, in, bitcol, 1, -shift[1]);
10106         fadst8x4_N2_avx2(in, outcoeff256, bitrow, 1);
10107         transpose_4x8_avx2(outcoeff256, in);
10108         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 2, -shift[2], new_sqrt2);
10109         clear_buffer_wxh_N2(outcoeff256, 1, 4);
10110         break;
10111     case FLIPADST_ADST:
10112         load_buffer_8x4_avx2(input, in, stride, 1, 0, shift[0]);
10113         fadst4x8_row_N2_with_round_avx2(in, in, bitcol, 1, -shift[1]);
10114         fadst8x4_N2_avx2(in, outcoeff256, bitrow, 1);
10115         transpose_4x8_avx2(outcoeff256, in);
10116         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 2, -shift[2], new_sqrt2);
10117         clear_buffer_wxh_N2(outcoeff256, 1, 4);
10118         break;
10119     case IDTX:
10120         load_buffer_4x4_avx2(input, in, stride, 0, 0, shift[0]);
10121         fidtx4x8_row_N2_with_round_avx2(in, in, bitcol, -shift[1]);
10122         fidtx8x4_N2_avx2(in, outcoeff256, bitrow);
10123         transpose_4x8_avx2(outcoeff256, in);
10124         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 2, -shift[2], new_sqrt2);
10125         clear_buffer_wxh_N2(outcoeff256, 1, 4);
10126         break;
10127     case V_DCT:
10128         load_buffer_4x4_avx2(input, in, stride, 0, 0, shift[0]);
10129         fdct4x8_row_N2_with_round_avx2(in, in, bitcol, 1, -shift[1]);
10130         fidtx8x4_N2_avx2(in, outcoeff256, bitrow);
10131         transpose_4x8_avx2(outcoeff256, in);
10132         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 2, -shift[2], new_sqrt2);
10133         clear_buffer_wxh_N2(outcoeff256, 1, 4);
10134         break;
10135     case H_DCT:
10136         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
10137         fidtx4x8_row_N2_with_round_avx2(in, in, bitcol, -shift[1]);
10138         fdct4x8_N2_avx2(in, outcoeff256, bitrow);
10139         transpose_4x8_avx2(outcoeff256, in);
10140         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 2, -shift[2], new_sqrt2);
10141         clear_buffer_wxh_N2(outcoeff256, 1, 4);
10142         break;
10143     case V_ADST:
10144         load_buffer_4x4_avx2(input, in, stride, 0, 0, shift[0]);
10145         fadst4x8_row_N2_with_round_avx2(in, in, bitcol, 1, -shift[1]);
10146         fidtx8x4_N2_avx2(in, outcoeff256, bitrow);
10147         transpose_4x8_avx2(outcoeff256, in);
10148         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 2, -shift[2], new_sqrt2);
10149         clear_buffer_wxh_N2(outcoeff256, 1, 4);
10150         break;
10151     case H_ADST:
10152         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
10153         fidtx4x8_row_N2_with_round_avx2(in, in, bitcol, -shift[1]);
10154         fadst8x4_N2_avx2(in, outcoeff256, bitrow, 1);
10155         transpose_4x8_avx2(outcoeff256, in);
10156         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 2, -shift[2], new_sqrt2);
10157         clear_buffer_wxh_N2(outcoeff256, 1, 4);
10158         break;
10159     case V_FLIPADST:
10160         load_buffer_4x4_avx2(input, in, stride, 1, 0, shift[0]);
10161         fadst4x8_row_N2_with_round_avx2(in, in, bitcol, 1, -shift[1]);
10162         fidtx8x4_N2_avx2(in, outcoeff256, bitrow);
10163         transpose_4x8_avx2(outcoeff256, in);
10164         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 2, -shift[2], new_sqrt2);
10165         clear_buffer_wxh_N2(outcoeff256, 1, 4);
10166         break;
10167     case H_FLIPADST:
10168         load_buffer_8x4_avx2(input, in, stride, 0, 1, shift[0]);
10169         fidtx4x8_row_N2_with_round_avx2(in, in, bitcol, -shift[1]);
10170         fadst8x4_N2_avx2(in, outcoeff256, bitrow, 1);
10171         transpose_4x8_avx2(outcoeff256, in);
10172         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 2, -shift[2], new_sqrt2);
10173         clear_buffer_wxh_N2(outcoeff256, 1, 4);
10174         break;
10175     default: assert(0);
10176     }
10177     (void)bd;
10178 }
10179 
10180 void svt_av1_fwd_txfm2d_4x16_N2_avx2(int16_t *input, int32_t *output, uint32_t stride,
10181                                      TxType tx_type, uint8_t bd) {
10182     __m256i       in[8];
10183     __m256i *     outcoeff256 = (__m256i *)output;
10184     const int8_t *shift       = fwd_txfm_shift_ls[TX_4X16];
10185     const int32_t txw_idx     = get_txw_idx(TX_4X16);
10186     const int32_t txh_idx     = get_txh_idx(TX_4X16);
10187     int32_t       bitcol      = fwd_cos_bit_col[txw_idx][txh_idx];
10188     int32_t       bitrow      = fwd_cos_bit_row[txw_idx][txh_idx];
10189 
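    // N2 path: each case below follows the same pipeline - load the (optionally
    // flipped) 4x16 residual, run the 16-point column transform keeping only the
    // low-frequency half of its outputs, round by -shift[1], transpose the
    // surviving half, run the 4-point row transform, and finally zero everything
    // outside the retained top-left quarter of the coefficient buffer.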
10190     switch (tx_type) {
10191     case DCT_DCT:
10192         load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
10193         fdct16x4_N2_avx2(in, outcoeff256, bitcol);
10194         col_txfm_8x8_N2_rounding(outcoeff256, -shift[1]);
10195         transpose_4x8_in_4x16_avx2(outcoeff256, in);
10196         fdct4x8_col_N2_avx2(
10197             in, outcoeff256, bitrow, 2); //dct + transpose + clear right half of buffer
10198         clear_buffer_4x16_N2(outcoeff256);
10199         break;
10200     case ADST_DCT:
10201         load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
10202         fadst16x4_N2_avx2(in, outcoeff256, bitcol);
10203         col_txfm_8x8_N2_rounding(outcoeff256, -shift[1]);
10204         transpose_4x8_in_4x16_avx2(outcoeff256, in);
10205         fdct4x8_col_N2_avx2(in, outcoeff256, bitrow, 2);
10206         clear_buffer_4x16_N2(outcoeff256);
10207         break;
10208     case DCT_ADST:
10209         load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
10210         fdct16x4_N2_avx2(in, outcoeff256, bitcol);
10211         col_txfm_8x8_N2_rounding(outcoeff256, -shift[1]);
10212         transpose_4x8_in_4x16_avx2(outcoeff256, in);
10213         fadst4x8_col_N2_avx2(in, outcoeff256, bitrow, 2);
10214         clear_buffer_4x16_N2(outcoeff256);
10215         break;
10216     case ADST_ADST:
10217         load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
10218         fadst16x4_N2_avx2(in, outcoeff256, bitcol);
10219         col_txfm_8x8_N2_rounding(outcoeff256, -shift[1]);
10220         transpose_4x8_in_4x16_avx2(outcoeff256, in);
10221         fadst4x8_col_N2_avx2(in, outcoeff256, bitrow, 2);
10222         clear_buffer_4x16_N2(outcoeff256);
10223         break;
10224     case FLIPADST_DCT:
10225         load_buffer_4x16_avx2(input, in, stride, 1, 0, shift[0]);
10226         fadst16x4_N2_avx2(in, outcoeff256, bitcol);
10227         col_txfm_8x8_N2_rounding(outcoeff256, -shift[1]);
10228         transpose_4x8_in_4x16_avx2(outcoeff256, in);
10229         fdct4x8_col_N2_avx2(in, outcoeff256, bitrow, 2);
10230         clear_buffer_4x16_N2(outcoeff256);
10231         break;
10232     case DCT_FLIPADST:
10233         load_buffer_4x16_avx2(input, in, stride, 0, 1, shift[0]);
10234         fdct16x4_N2_avx2(in, outcoeff256, bitcol);
10235         col_txfm_8x8_N2_rounding(outcoeff256, -shift[1]);
10236         transpose_4x8_in_4x16_avx2(outcoeff256, in);
10237         fadst4x8_col_N2_avx2(in, outcoeff256, bitrow, 2);
10238         clear_buffer_4x16_N2(outcoeff256);
10239         break;
10240     case FLIPADST_FLIPADST:
10241         load_buffer_4x16_avx2(input, in, stride, 1, 1, shift[0]);
10242         fadst16x4_N2_avx2(in, outcoeff256, bitcol);
10243         col_txfm_8x8_N2_rounding(outcoeff256, -shift[1]);
10244         transpose_4x8_in_4x16_avx2(outcoeff256, in);
10245         fadst4x8_col_N2_avx2(in, outcoeff256, bitrow, 2);
10246         clear_buffer_4x16_N2(outcoeff256);
10247         break;
10248     case ADST_FLIPADST:
10249         load_buffer_4x16_avx2(input, in, stride, 0, 1, shift[0]);
10250         fadst16x4_N2_avx2(in, outcoeff256, bitcol);
10251         col_txfm_8x8_N2_rounding(outcoeff256, -shift[1]);
10252         transpose_4x8_in_4x16_avx2(outcoeff256, in);
10253         fadst4x8_col_N2_avx2(in, outcoeff256, bitrow, 2);
10254         clear_buffer_4x16_N2(outcoeff256);
10255         break;
10256     case FLIPADST_ADST:
10257         load_buffer_4x16_avx2(input, in, stride, 1, 0, shift[0]);
10258         fadst16x4_N2_avx2(in, outcoeff256, bitcol);
10259         col_txfm_8x8_N2_rounding(outcoeff256, -shift[1]);
10260         transpose_4x8_in_4x16_avx2(outcoeff256, in);
10261         fadst4x8_col_N2_avx2(in, outcoeff256, bitrow, 2);
10262         clear_buffer_4x16_N2(outcoeff256);
10263         break;
10264     case IDTX:
10265         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
10266         fidtx8xn_N2_col_avx2(in, outcoeff256, bitcol, 4);
10267         col_txfm_8x8_N2_rounding(outcoeff256, -shift[1]);
10268         fidtx4x8_col_N2_avx2(outcoeff256, outcoeff256, bitrow, 4);
10269         clear_buffer_4x16_N2(outcoeff256);
10270         break;
10271     case V_DCT:
10272         load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
10273         fdct16x4_N2_avx2(in, outcoeff256, bitcol);
10274         col_txfm_8x8_N2_rounding(outcoeff256, -shift[1]);
10275         fidtx4x8_col_N2_avx2(outcoeff256, outcoeff256, bitrow, 4);
10276         clear_buffer_4x16_N2(outcoeff256);
10277         break;
10278     case H_DCT:
10279         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
10280         fidtx8xn_N2_col_avx2(in, outcoeff256, bitcol, 4);
10281         col_txfm_8x8_N2_rounding(outcoeff256, -shift[1]);
10282         transpose_4x8_in_4x16_avx2(outcoeff256, in);
10283         fdct4x8_col_N2_avx2(in, outcoeff256, bitrow, 2);
10284         clear_buffer_4x16_N2(outcoeff256);
10285         break;
10286     case V_ADST:
10287         load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
10288         fadst16x4_N2_avx2(in, outcoeff256, bitcol);
10289         col_txfm_8x8_N2_rounding(outcoeff256, -shift[1]);
10290         fidtx4x8_col_N2_avx2(outcoeff256, outcoeff256, bitrow, 4);
10291         clear_buffer_4x16_N2(outcoeff256);
10292         break;
10293     case H_ADST:
10294         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
10295         fidtx8xn_N2_col_avx2(in, outcoeff256, bitcol, 4);
10296         col_txfm_8x8_N2_rounding(outcoeff256, -shift[1]);
10297         transpose_4x8_in_4x16_avx2(outcoeff256, in);
10298         fadst4x8_col_N2_avx2(in, outcoeff256, bitrow, 2);
10299         clear_buffer_4x16_N2(outcoeff256);
10300         break;
10301     case V_FLIPADST:
10302         load_buffer_4x16_avx2(input, in, stride, 1, 0, shift[0]);
10303         fadst16x4_N2_avx2(in, outcoeff256, bitcol);
10304         col_txfm_8x8_N2_rounding(outcoeff256, -shift[1]);
10305         fidtx4x8_col_N2_avx2(outcoeff256, outcoeff256, bitrow, 4);
10306         clear_buffer_4x16_N2(outcoeff256);
10307         break;
10308     case H_FLIPADST:
10309         load_buffer_4x8_avx2(input, in, stride, 0, 1, shift[0]);
10310         fidtx8xn_N2_col_avx2(in, outcoeff256, bitcol, 4);
10311         col_txfm_8x8_N2_rounding(outcoeff256, -shift[1]);
10312         transpose_4x8_in_4x16_avx2(outcoeff256, in);
10313         fadst4x8_col_N2_avx2(in, outcoeff256, bitrow, 2);
10314         clear_buffer_4x16_N2(outcoeff256);
10315         break;
10316     default: assert(0);
10317     }
10318     (void)bd;
10319 }
10320 
10321 void svt_av1_fwd_txfm2d_16x4_N2_avx2(int16_t *input, int32_t *output, uint32_t stride,
10322                                      TxType tx_type, uint8_t bd) {
10323     __m256i       in[8];
10324     __m256i *     outcoeff256 = (__m256i *)output;
10325     const int8_t *shift       = fwd_shift_16x4;
10326     const int32_t txw_idx     = get_txw_idx(TX_16X4);
10327     const int32_t txh_idx     = get_txh_idx(TX_16X4);
10328     int32_t       bitcol      = fwd_cos_bit_col[txw_idx][txh_idx];
10329     int32_t       bitrow      = fwd_cos_bit_row[txw_idx][txh_idx];
10330 
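    // Most cases below run the 4-point column stage on the two 8-column halves
    // (in and in + 4), with the -shift[1] rounding folded into the *_with_round
    // kernels; the second stage then keeps only the low half of its outputs
    // before the transpose and the final clear.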
10331     switch (tx_type) {
10332     case DCT_DCT:
10333         load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
10334         fdct4x8_row_N2_with_round_avx2(in, outcoeff256, bitcol, 1, -shift[1]);
10335         fdct4x8_row_N2_with_round_avx2(in + 4, outcoeff256 + 4, bitcol, 1, -shift[1]);
10336         fdct16x4_N2_avx2(outcoeff256, in, bitrow);
10337         transpose_4x8_in_4x16_half_avx2(in, outcoeff256);
10338         clear_buffer_wxh_N2(outcoeff256, 2, 4);
10339         break;
10340     case ADST_DCT:
10341         load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
10342         fadst4x8_row_N2_with_round_avx2(in, outcoeff256, bitcol, 1, -shift[1]);
10343         fadst4x8_row_N2_with_round_avx2(in + 4, outcoeff256 + 4, bitcol, 1, -shift[1]);
10344         fdct16x4_N2_avx2(outcoeff256, in, bitrow);
10345         transpose_4x8_in_4x16_half_avx2(in, outcoeff256);
10346         clear_buffer_wxh_N2(outcoeff256, 2, 4);
10347         break;
10348     case DCT_ADST:
10349         load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
10350         fdct4x8_row_N2_with_round_avx2(in, outcoeff256, bitcol, 1, -shift[1]);
10351         fdct4x8_row_N2_with_round_avx2(in + 4, outcoeff256 + 4, bitcol, 1, -shift[1]);
10352         fadst16x4_N2_avx2(outcoeff256, in, bitrow);
10353         transpose_4x8_in_4x16_half_avx2(in, outcoeff256);
10354         clear_buffer_wxh_N2(outcoeff256, 2, 4);
10355         break;
10356     case ADST_ADST:
10357         load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
10358         fadst4x8_row_N2_with_round_avx2(in, outcoeff256, bitcol, 1, -shift[1]);
10359         fadst4x8_row_N2_with_round_avx2(in + 4, outcoeff256 + 4, bitcol, 1, -shift[1]);
10360         fadst16x4_N2_avx2(outcoeff256, in, bitrow);
10361         transpose_4x8_in_4x16_half_avx2(in, outcoeff256);
10362         clear_buffer_wxh_N2(outcoeff256, 2, 4);
10363         break;
10364     case FLIPADST_DCT:
10365         load_buffer_16x4_avx2(input, in, stride, 1, 0, shift[0]);
10366         fadst4x8_row_N2_with_round_avx2(in, outcoeff256, bitcol, 1, -shift[1]);
10367         fadst4x8_row_N2_with_round_avx2(in + 4, outcoeff256 + 4, bitcol, 1, -shift[1]);
10368         fdct16x4_N2_avx2(outcoeff256, in, bitrow);
10369         transpose_4x8_in_4x16_half_avx2(in, outcoeff256);
10370         clear_buffer_wxh_N2(outcoeff256, 2, 4);
10371         break;
10372     case DCT_FLIPADST:
10373         load_buffer_16x4_avx2(input, in, stride, 0, 1, shift[0]);
10374         fdct4x8_row_N2_with_round_avx2(in, outcoeff256, bitcol, 1, -shift[1]);
10375         fdct4x8_row_N2_with_round_avx2(in + 4, outcoeff256 + 4, bitcol, 1, -shift[1]);
10376         fadst16x4_N2_avx2(outcoeff256, in, bitrow);
10377         transpose_4x8_in_4x16_half_avx2(in, outcoeff256);
10378         clear_buffer_wxh_N2(outcoeff256, 2, 4);
10379         break;
10380     case FLIPADST_FLIPADST:
10381         load_buffer_16x4_avx2(input, in, stride, 1, 1, shift[0]);
10382         fadst4x8_row_N2_with_round_avx2(in, outcoeff256, bitcol, 1, -shift[1]);
10383         fadst4x8_row_N2_with_round_avx2(in + 4, outcoeff256 + 4, bitcol, 1, -shift[1]);
10384         fadst16x4_N2_avx2(outcoeff256, in, bitrow);
10385         transpose_4x8_in_4x16_half_avx2(in, outcoeff256);
10386         clear_buffer_wxh_N2(outcoeff256, 2, 4);
10387         break;
10388     case ADST_FLIPADST:
10389         load_buffer_16x4_avx2(input, in, stride, 0, 1, shift[0]);
10390         fadst4x8_row_N2_with_round_avx2(in, outcoeff256, bitcol, 1, -shift[1]);
10391         fadst4x8_row_N2_with_round_avx2(in + 4, outcoeff256 + 4, bitcol, 1, -shift[1]);
10392         fadst16x4_N2_avx2(outcoeff256, in, bitrow);
10393         transpose_4x8_in_4x16_half_avx2(in, outcoeff256);
10394         clear_buffer_wxh_N2(outcoeff256, 2, 4);
10395         break;
10396     case FLIPADST_ADST:
10397         load_buffer_16x4_avx2(input, in, stride, 1, 0, shift[0]);
10398         fadst4x8_row_N2_with_round_avx2(in, outcoeff256, bitcol, 1, -shift[1]);
10399         fadst4x8_row_N2_with_round_avx2(in + 4, outcoeff256 + 4, bitcol, 1, -shift[1]);
10400         fadst16x4_N2_avx2(outcoeff256, in, bitrow);
10401         transpose_4x8_in_4x16_half_avx2(in, outcoeff256);
10402         clear_buffer_wxh_N2(outcoeff256, 2, 4);
10403         break;
10404     case IDTX:
10405         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
10406         fidtx4x8_row_N2_avx2(in, outcoeff256, bitcol);
10407         col_txfm_8x8_rounding(outcoeff256, -shift[1]);
10408         fidtx4x8_N2_perm_avx2(outcoeff256, outcoeff256, bitrow);
10409         clear_buffer_wxh_N2(outcoeff256, 2, 4);
10410         break;
10411     case V_DCT:
10412         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
10413         fdct4x8_row_N2_with_round_avx2(in, outcoeff256, bitcol, 1, -shift[1]);
10414         fidtx8xn_N2_col_avx2(outcoeff256, in, bitrow, 4);
10415         transpose_4x8_in_4x16_half_avx2(in, outcoeff256);
10416         clear_buffer_wxh_N2(outcoeff256, 2, 4);
10417         break;
10418     case H_DCT:
10419         load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
10420         fidtx4x8_row_N2_with_round_avx2(in, outcoeff256, bitcol, -shift[1]);
10421         fidtx4x8_row_N2_with_round_avx2(in + 4, outcoeff256 + 4, bitcol, -shift[1]);
10422         fdct16x4_N2_avx2(outcoeff256, in, bitrow);
10423         transpose_4x8_in_4x16_half_avx2(in, outcoeff256);
10424         clear_buffer_wxh_N2(outcoeff256, 2, 4);
10425         break;
10426     case V_ADST:
10427         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
10428         fadst4x8_row_N2_with_round_avx2(in, outcoeff256, bitcol, 1, -shift[1]);
10429         fidtx8xn_N2_col_avx2(outcoeff256, in, bitrow, 4);
10430         transpose_4x8_in_4x16_half_avx2(in, outcoeff256);
10431         clear_buffer_wxh_N2(outcoeff256, 2, 4);
10432         break;
10433     case H_ADST:
10434         load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
10435         fidtx4x8_row_N2_with_round_avx2(in, outcoeff256, bitcol, -shift[1]);
10436         fidtx4x8_row_N2_with_round_avx2(in + 4, outcoeff256 + 4, bitcol, -shift[1]);
10437         fadst16x4_N2_avx2(outcoeff256, in, bitrow);
10438         transpose_4x8_in_4x16_half_avx2(in, outcoeff256);
10439         clear_buffer_wxh_N2(outcoeff256, 2, 4);
10440         break;
10441     case V_FLIPADST:
10442         load_buffer_8x4_avx2(input, in, stride, 1, 0, shift[0]);
10443         fadst4x8_row_N2_with_round_avx2(in, outcoeff256, bitcol, 1, -shift[1]);
10444         fidtx8xn_N2_col_avx2(outcoeff256, in, bitrow, 4);
10445         transpose_4x8_in_4x16_half_avx2(in, outcoeff256);
10446         clear_buffer_wxh_N2(outcoeff256, 2, 4);
10447         break;
10448     case H_FLIPADST:
10449         load_buffer_16x4_avx2(input, in, stride, 0, 1, shift[0]);
10450         fidtx4x8_row_N2_with_round_avx2(in, outcoeff256, bitcol, -shift[1]);
10451         fidtx4x8_row_N2_with_round_avx2(in + 4, outcoeff256 + 4, bitcol, -shift[1]);
10452         fadst16x4_N2_avx2(outcoeff256, in, bitrow);
10453         transpose_4x8_in_4x16_half_avx2(in, outcoeff256);
10454         clear_buffer_wxh_N2(outcoeff256, 2, 4);
10455         break;
10456     default: assert(0);
10457     }
10458     (void)bd;
10459 }
10460 
10461 void svt_av1_fwd_txfm2d_32x64_N2_avx2(int16_t *input, int32_t *output, uint32_t stride,
10462                                       TxType tx_type, uint8_t bd) {
10463     (void)tx_type;
10464     __m256i       in[256];
10465     __m256i *     outcoef256    = (__m256i *)output;
10466     const int8_t *shift         = fwd_txfm_shift_ls[TX_32X64];
10467     const int32_t txw_idx       = get_txw_idx(TX_32X64);
10468     const int32_t txh_idx       = get_txh_idx(TX_32X64);
10469     const int32_t txfm_size_col = tx_size_wide[TX_32X64];
10470     const int32_t txfm_size_row = tx_size_high[TX_32X64];
10471     int8_t        bitcol        = fwd_cos_bit_col[txw_idx][txh_idx];
10472     int8_t        bitrow        = fwd_cos_bit_row[txw_idx][txh_idx];
10473     const int32_t num_row       = txfm_size_row >> 3;
10474     const int32_t num_col       = txfm_size_col >> 3;
10475 
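    // 32x64 is DCT_DCT only (tx_type is ignored): the 64-point column DCT keeps
    // the low half of its outputs, the surviving half is transposed, the 32-point
    // row DCT again keeps only its low half, and the retained quarter is then
    // rescaled for the rectangular block before the rest of the buffer is cleared.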
10476     // column transform
10477     load_buffer_32x8n(input, in, stride, 0, 0, shift[0], txfm_size_row);
10478     av1_fdct64_new_N2_avx2(in, in, bitcol, txfm_size_col, num_col);
10479 
10480     for (int32_t i = 0; i < num_row / 2; i++)
10481         col_txfm_16x16_rounding((in + i * txfm_size_col), -shift[1]);
10482     transpose_8nx8n_N2_half(in, outcoef256, txfm_size_col, txfm_size_row);
10483 
10484     // row transform
10485     av1_fdct32_new_N2_avx2(outcoef256, in, bitrow, txfm_size_row / 2, num_row);
10486     transpose_8nx8n_N2_quad(in, outcoef256, txfm_size_row, txfm_size_col);
10487     av1_round_shift_rect_array_wxh_avx2(
10488         outcoef256, outcoef256, -shift[2], new_sqrt2, num_col, txfm_size_row);
10489     clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
10490     (void)bd;
10491 }
10492 
10493 void svt_av1_fwd_txfm2d_64x32_N2_avx2(int16_t *input, int32_t *output, uint32_t stride,
10494                                       TxType tx_type, uint8_t bd) {
10495     (void)tx_type;
10496     __m256i       in[256];
10497     __m256i *     outcoef256    = (__m256i *)output;
10498     const int8_t *shift         = fwd_txfm_shift_ls[TX_64X32];
10499     const int32_t txw_idx       = get_txw_idx(TX_64X32);
10500     const int32_t txh_idx       = get_txh_idx(TX_64X32);
10501     const int32_t txfm_size_col = tx_size_wide[TX_64X32];
10502     const int32_t txfm_size_row = tx_size_high[TX_64X32];
10503     int8_t        bitcol        = fwd_cos_bit_col[txw_idx][txh_idx];
10504     int8_t        bitrow        = fwd_cos_bit_row[txw_idx][txh_idx];
10505     const int32_t num_row       = txfm_size_row >> 3;
10506     const int32_t num_col       = txfm_size_col >> 3;
10507 
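    // Each 64-sample input row is loaded as two 32-sample halves (columns 0-31,
    // then columns 32-63).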
10508     // column transform
10509     for (int32_t i = 0; i < 32; i++) {
10510         load_buffer_32_avx2(input + 0 + i * stride, in + 0 + i * 8, 8, 0, 0, shift[0]);
10511         load_buffer_32_avx2(input + 32 + i * stride, in + 4 + i * 8, 8, 0, 0, shift[0]);
10512     }
10513 
10514     av1_fdct32_new_N2_avx2(in, in, bitcol, txfm_size_col, num_col);
10515 
10516     for (int32_t i = 0; i < num_col / 2; i++)
10517         col_txfm_16x16_rounding((in + i * txfm_size_row), -shift[1]);
10518     transpose_8nx8n_N2_half(in, outcoef256, txfm_size_col, txfm_size_row);
10519 
10520     // row transform
10521     av1_fdct64_new_N2_avx2(outcoef256, in, bitrow, txfm_size_row / 2, num_row);
10522     transpose_8nx8n_N2_quad(in, outcoef256, txfm_size_row, txfm_size_col);
10523     av1_round_shift_rect_array_wxh_avx2(
10524         outcoef256, outcoef256, -shift[2], new_sqrt2, num_col, txfm_size_row);
10525     clear_buffer_wxh_N2(outcoef256, num_col, txfm_size_row);
10526     (void)bd;
10527 }
10528 
10529 void svt_av1_fwd_txfm2d_16x64_N2_avx2(int16_t *input, int32_t *output, uint32_t stride,
10530                                       TxType tx_type, uint8_t bd) {
10531     __m256i       in[128];
10532     __m256i *     outcoeff256   = (__m256i *)output;
10533     const int8_t *shift         = fwd_txfm_shift_ls[TX_16X64];
10534     const int32_t txw_idx       = get_txw_idx(TX_16X64);
10535     const int32_t txh_idx       = get_txh_idx(TX_16X64);
10536     const int32_t txfm_size_col = tx_size_wide[TX_16X64];
10537     const int32_t txfm_size_row = tx_size_high[TX_16X64];
10538     int8_t        bitcol        = fwd_cos_bit_col[txw_idx][txh_idx];
10539     int8_t        bitrow        = fwd_cos_bit_row[txw_idx][txh_idx];
10540     int32_t       ud_flip, lr_flip;
10541     get_flip_cfg(tx_type, &ud_flip, &lr_flip);
10542     const int32_t num_row = txfm_size_row >> 3;
10543     const int32_t num_col = txfm_size_col >> 3;
10544     // col transform
10545     for (int32_t i = 0; i < txfm_size_row; i += num_col) {
10546         load_buffer_16_avx2(
10547             input + (i + 0) * stride, in + (i + 0) * num_col, 8, ud_flip, lr_flip, shift[0]);
10548         load_buffer_16_avx2(
10549             input + (i + 1) * stride, in + (i + 1) * num_col, 8, ud_flip, lr_flip, shift[0]);
10550     }
10551 
10552     av1_fdct64_new_N2_avx2(in, outcoeff256, bitcol, txfm_size_col, num_col);
10553 
10554     col_txfm_16x16_rounding(outcoeff256, -shift[1]);
10555     col_txfm_16x16_rounding(outcoeff256 + 32, -shift[1]);
10556     transpose_8nx8n_N2_half(outcoeff256, in, txfm_size_col, txfm_size_row);
10557     // row transform
10558     fdct16x16_N2_avx2(in, in, bitrow, num_row, num_row / 2);
10559     transpose_8nx8n_N2_quad(in, outcoeff256, txfm_size_row, txfm_size_col);
10560     clear_buffer_wxh_N2(outcoeff256, num_col, txfm_size_row);
10561     (void)bd;
10562 }
10563 
10564 void svt_av1_fwd_txfm2d_64x16_N2_avx2(int16_t *input, int32_t *output, uint32_t stride,
10565                                       TxType tx_type, uint8_t bd) {
10566     __m256i       in[128];
10567     __m256i *     outcoeff256   = (__m256i *)output;
10568     const int8_t *shift         = fwd_txfm_shift_ls[TX_64X16];
10569     const int32_t txw_idx       = get_txw_idx(TX_64X16);
10570     const int32_t txh_idx       = get_txh_idx(TX_64X16);
10571     const int32_t txfm_size_col = tx_size_wide[TX_64X16];
10572     const int32_t txfm_size_row = tx_size_high[TX_64X16];
10573     int8_t        bitcol        = fwd_cos_bit_col[txw_idx][txh_idx];
10574     int8_t        bitrow        = fwd_cos_bit_row[txw_idx][txh_idx];
10575     int32_t       ud_flip, lr_flip;
10576     get_flip_cfg(tx_type, &ud_flip, &lr_flip);
10577     const int32_t num_row = txfm_size_row >> 3;
10578     const int32_t num_col = txfm_size_col >> 3;
10579     // col transform
10580     for (int32_t i = 0; i < txfm_size_row; i++) {
10581         load_buffer_16_avx2(input + 0 + i * stride, in + 0 + i * 8, 8, ud_flip, lr_flip, shift[0]);
10582         load_buffer_16_avx2(input + 16 + i * stride, in + 2 + i * 8, 8, ud_flip, lr_flip, shift[0]);
10583         load_buffer_16_avx2(input + 32 + i * stride, in + 4 + i * 8, 8, ud_flip, lr_flip, shift[0]);
10584         load_buffer_16_avx2(input + 48 + i * stride, in + 6 + i * 8, 8, ud_flip, lr_flip, shift[0]);
10585     }
10586 
10587     fdct16x16_N2_avx2(in, outcoeff256, bitcol, num_col, num_col);
10588     col_txfm_16x16_rounding(outcoeff256, -shift[1]);
10589     col_txfm_16x16_rounding(outcoeff256 + 32, -shift[1]);
10590     transpose_8nx8n_N2_half(outcoeff256, in, txfm_size_col, txfm_size_row);
10591     // row transform
10592     av1_fdct64_new_N2_avx2(in, in, bitrow, txfm_size_row / 2, num_row);
10593     transpose_8nx8n_N2_quad(in, outcoeff256, txfm_size_row, txfm_size_col);
10594     clear_buffer_wxh_N2(outcoeff256, num_col, txfm_size_row);
10595     (void)bd;
10596 }
10597 
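// The transpose helpers below handle only the top-left corner block of a larger
// tile (a 4x4 of a 16x16, an 8x8 of a 32x32); registers they do not write are
// left untouched, and the 4x4 variant additionally zeroes the lanes adjacent to
// the block it writes.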
10598 static INLINE void transpose_4x4_in_16x16_avx2(const __m256i *in, __m256i *out) {
10599     __m256i out1[4];
10600     __m256i zero = _mm256_setzero_si256();
10601     TRANSPOSE_4X4_AVX2(in[0], in[2], in[4], in[6], out1[0], out1[1], out1[2], out1[3]);
10602     out[0] = _mm256_permute2x128_si256(out1[0], zero, 0x20);
10603     out[2] = _mm256_permute2x128_si256(out1[1], zero, 0x20);
10604     out[4] = _mm256_permute2x128_si256(out1[2], zero, 0x20);
10605     out[6] = _mm256_permute2x128_si256(out1[3], zero, 0x20);
10606 }
10607 
10608 static INLINE void transpose_8x8_in_32x32_avx2(const __m256i *in, __m256i *out) {
10609     __m256i temp[8];
10610     TRANSPOSE_4X4_AVX2(in[0], in[4], in[8], in[12], temp[0], temp[1], temp[2], temp[3]);
10611     TRANSPOSE_4X4_AVX2(in[16], in[20], in[24], in[28], temp[4], temp[5], temp[6], temp[7]);
10612 
10613     out[0]  = _mm256_permute2x128_si256(temp[0], temp[4], 0x20);
10614     out[4]  = _mm256_permute2x128_si256(temp[1], temp[5], 0x20);
10615     out[8]  = _mm256_permute2x128_si256(temp[2], temp[6], 0x20);
10616     out[12] = _mm256_permute2x128_si256(temp[3], temp[7], 0x20);
10617     out[16] = _mm256_permute2x128_si256(temp[0], temp[4], 0x31);
10618     out[20] = _mm256_permute2x128_si256(temp[1], temp[5], 0x31);
10619     out[24] = _mm256_permute2x128_si256(temp[2], temp[6], 0x31);
10620     out[28] = _mm256_permute2x128_si256(temp[3], temp[7], 0x31);
10621 }
10622 
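// N4 transpose helpers: the _half variant below transposes only the 8x8 tiles
// covering the lowest quarter of the source rows (all columns), while the _quad
// variant restricts both dimensions to a quarter; the untouched part of the
// destination is cleared later by the caller.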
10623 static INLINE void transpose_8nx8n_N4_half(const __m256i *input, __m256i *output,
10624                                            const int32_t width, const int32_t height) {
10625     const int32_t numcol      = height >> 3;
10626     const int32_t numrow      = width >> 3;
10627     int32_t       calc_numcol = numcol >> 2;
10628     if (!calc_numcol) {
10629         calc_numcol = 1;
10630     }
10631 
10632     __m256i out1[8];
10633     for (int32_t j = 0; j < numrow; j++) {
10634         for (int32_t i = 0; i < calc_numcol; i++) {
10635             TRANSPOSE_4X4_AVX2(input[i * width + j + (numrow * 0)],
10636                                input[i * width + j + (numrow * 1)],
10637                                input[i * width + j + (numrow * 2)],
10638                                input[i * width + j + (numrow * 3)],
10639                                out1[0],
10640                                out1[1],
10641                                out1[4],
10642                                out1[5]);
10643             TRANSPOSE_4X4_AVX2(input[i * width + j + (numrow * 4)],
10644                                input[i * width + j + (numrow * 5)],
10645                                input[i * width + j + (numrow * 6)],
10646                                input[i * width + j + (numrow * 7)],
10647                                out1[2],
10648                                out1[3],
10649                                out1[6],
10650                                out1[7]);
10651             output[j * height + i + (numcol * 0)] = _mm256_permute2x128_si256(
10652                 out1[0], out1[2], 0x20);
10653             output[j * height + i + (numcol * 1)] = _mm256_permute2x128_si256(
10654                 out1[1], out1[3], 0x20);
10655             output[j * height + i + (numcol * 2)] = _mm256_permute2x128_si256(
10656                 out1[4], out1[6], 0x20);
10657             output[j * height + i + (numcol * 3)] = _mm256_permute2x128_si256(
10658                 out1[5], out1[7], 0x20);
10659             output[j * height + i + (numcol * 4)] = _mm256_permute2x128_si256(
10660                 out1[0], out1[2], 0x31);
10661             output[j * height + i + (numcol * 5)] = _mm256_permute2x128_si256(
10662                 out1[1], out1[3], 0x31);
10663             output[j * height + i + (numcol * 6)] = _mm256_permute2x128_si256(
10664                 out1[4], out1[6], 0x31);
10665             output[j * height + i + (numcol * 7)] = _mm256_permute2x128_si256(
10666                 out1[5], out1[7], 0x31);
10667         }
10668     }
10669 }
10670 
10671 static INLINE void transpose_8nx8n_N4_quad(const __m256i *input, __m256i *output,
10672                                            const int32_t width, const int32_t height) {
10673     const int32_t numcol = height >> 3;
10674     const int32_t numrow = width >> 3;
10675 
10676     int32_t calc_numcol = numcol >> 2;
10677     int32_t calc_numrow = numrow >> 2;
10678     if (!calc_numcol) {
10679         calc_numcol = 1;
10680     }
10681     if (!calc_numrow) {
10682         calc_numrow = 1;
10683     }
10684 
10685     __m256i out1[8];
10686     for (int32_t j = 0; j < calc_numrow; j++) {
10687         for (int32_t i = 0; i < calc_numcol; i++) {
10688             TRANSPOSE_4X4_AVX2(input[i * width + j + (numrow * 0)],
10689                                input[i * width + j + (numrow * 1)],
10690                                input[i * width + j + (numrow * 2)],
10691                                input[i * width + j + (numrow * 3)],
10692                                out1[0],
10693                                out1[1],
10694                                out1[4],
10695                                out1[5]);
10696             TRANSPOSE_4X4_AVX2(input[i * width + j + (numrow * 4)],
10697                                input[i * width + j + (numrow * 5)],
10698                                input[i * width + j + (numrow * 6)],
10699                                input[i * width + j + (numrow * 7)],
10700                                out1[2],
10701                                out1[3],
10702                                out1[6],
10703                                out1[7]);
10704             output[j * height + i + (numcol * 0)] = _mm256_permute2x128_si256(
10705                 out1[0], out1[2], 0x20);
10706             output[j * height + i + (numcol * 1)] = _mm256_permute2x128_si256(
10707                 out1[1], out1[3], 0x20);
10708             output[j * height + i + (numcol * 2)] = _mm256_permute2x128_si256(
10709                 out1[4], out1[6], 0x20);
10710             output[j * height + i + (numcol * 3)] = _mm256_permute2x128_si256(
10711                 out1[5], out1[7], 0x20);
10712             output[j * height + i + (numcol * 4)] = _mm256_permute2x128_si256(
10713                 out1[0], out1[2], 0x31);
10714             output[j * height + i + (numcol * 5)] = _mm256_permute2x128_si256(
10715                 out1[1], out1[3], 0x31);
10716             output[j * height + i + (numcol * 6)] = _mm256_permute2x128_si256(
10717                 out1[4], out1[6], 0x31);
10718             output[j * height + i + (numcol * 7)] = _mm256_permute2x128_si256(
10719                 out1[5], out1[7], 0x31);
10720         }
10721     }
10722 }
10723 
10724 static INLINE void transpose_4x8_in_4x16_quad_avx2(const __m256i *in, __m256i *out) {
10725     __m256i perm = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
10726     __m256i u0, u1;
10727 
10728     u0     = _mm256_unpacklo_epi32(in[0], in[1]);
10729     u1     = _mm256_unpacklo_epi32(in[2], in[3]);
10730     out[0] = _mm256_unpacklo_epi64(u0, u1);
10731     out[0] = _mm256_permutevar8x32_epi32(out[0], perm);
10732 }
10733 
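// N4 write helpers: only the top-left quarter-by-quarter block of coefficients
// is stored (2x2 of an 8x8, 4x4 of a 16x16, 8x8 of a 32x32); the remainder of
// the output buffer is written as zeros.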
10734 static AOM_FORCE_INLINE void write_buffer_8x8_N4(const __m256i *res, int32_t *output) {
10735     const __m256i zero256 = _mm256_setzero_si256();
10736 
10737     _mm256_storeu_si256((__m256i *)(output + 0 * 8), zero256);
10738     _mm256_storeu_si256((__m256i *)(output + 1 * 8), zero256);
10739 
10740     _mm_storel_epi64((__m128i *)(output + 0 * 8), _mm256_castsi256_si128(res[0]));
10741     _mm_storel_epi64((__m128i *)(output + 1 * 8), _mm256_castsi256_si128(res[1]));
10742 
10743     _mm256_storeu_si256((__m256i *)(output + 2 * 8), zero256);
10744     _mm256_storeu_si256((__m256i *)(output + 3 * 8), zero256);
10745     _mm256_storeu_si256((__m256i *)(output + 4 * 8), zero256);
10746     _mm256_storeu_si256((__m256i *)(output + 5 * 8), zero256);
10747     _mm256_storeu_si256((__m256i *)(output + 6 * 8), zero256);
10748     _mm256_storeu_si256((__m256i *)(output + 7 * 8), zero256);
10749 }
10750 
10751 static INLINE void write_buffer_16x16_N4(const __m256i *res, int32_t *output) {
10752     int32_t       fact = -1, index = -1;
10753     const __m256i zero    = _mm256_setzero_si256();
10754     const __m128i zero128 = _mm_setzero_si128();
10755     int32_t       i;
10756     for (i = 0; i < 2; i++) {
10757         _mm_storeu_si128((__m128i *)(output + (++fact) * 16), _mm256_castsi256_si128(res[++index]));
10758         _mm_storeu_si128((__m128i *)(output + fact * 16 + 4), zero128);
10759         _mm256_storeu_si256((__m256i *)(output + fact * 16 + 8), zero);
10760         ++index;
10761         _mm_storeu_si128((__m128i *)(output + (++fact) * 16), _mm256_castsi256_si128(res[++index]));
10762         _mm_storeu_si128((__m128i *)(output + fact * 16 + 4), zero128);
10763         _mm256_storeu_si256((__m256i *)(output + fact * 16 + 8), zero);
10764         ++index;
10765     }
10766     for (; i < 8; i++) {
10767         _mm256_storeu_si256((__m256i *)(output + (++fact) * 16), zero);
10768         _mm256_storeu_si256((__m256i *)(output + (fact)*16 + 8), zero);
10769         _mm256_storeu_si256((__m256i *)(output + (++fact) * 16), zero);
10770         _mm256_storeu_si256((__m256i *)(output + (fact)*16 + 8), zero);
10771     }
10772 }
10773 
10774 static INLINE void write_buffer_32x32_N4(const __m256i *res, int32_t *output) {
10775     const __m256i zero = _mm256_setzero_si256();
10776     uint32_t      i;
10777 
10778     for (i = 0; i < 8; i++) {
10779         _mm256_storeu_si256((__m256i *)(output + i * 32 + 0), res[i * 4]);
10780         _mm256_storeu_si256((__m256i *)(output + i * 32 + 8), zero);
10781         _mm256_storeu_si256((__m256i *)(output + i * 32 + 16), zero);
10782         _mm256_storeu_si256((__m256i *)(output + i * 32 + 24), zero);
10783     }
10784 
10785     for (; i < 32; i++) {
10786         _mm256_storeu_si256((__m256i *)(output + i * 32 + 0), zero);
10787         _mm256_storeu_si256((__m256i *)(output + i * 32 + 8), zero);
10788         _mm256_storeu_si256((__m256i *)(output + i * 32 + 16), zero);
10789         _mm256_storeu_si256((__m256i *)(output + i * 32 + 24), zero);
10790     }
10791 }
10792 
10793 static AOM_FORCE_INLINE void write_buffer_16x8_N4_avx2(const __m256i *res, __m256i *out) {
10794     out[0] = res[0];
10795     out[2] = res[1];
10796 }
10797 
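// N4 rounding helpers: each register that is still needed is rounded as
//     x = (x + (1 << (shift - 1))) >> shift;
// registers holding coefficients that will be discarded are skipped.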
10798 static AOM_FORCE_INLINE void col_txfm_8x8_N4_rounding(__m256i *in, int32_t shift) {
10799     const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
10800 
10801     in[0] = _mm256_add_epi32(in[0], rounding);
10802     in[1] = _mm256_add_epi32(in[1], rounding);
10803 
10804     in[0] = _mm256_srai_epi32(in[0], shift);
10805     in[1] = _mm256_srai_epi32(in[1], shift);
10806 }
10807 
10808 static AOM_FORCE_INLINE void col_txfm_32x8_N4_rounding(__m256i *in, int32_t shift) {
10809     const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
10810 
10811     in[0] = _mm256_add_epi32(in[0], rounding);
10812     in[4] = _mm256_add_epi32(in[4], rounding);
10813 
10814     in[0] = _mm256_srai_epi32(in[0], shift);
10815     in[4] = _mm256_srai_epi32(in[4], shift);
10816 }
10817 
10818 static AOM_FORCE_INLINE void col_txfm_16x16_N4_rounding(__m256i *in, int32_t shift) {
10819     col_txfm_8x8_rounding(&in[0], shift);
10820 }
10821 
10822 static INLINE void av1_round_shift_array_32_N4_avx2(__m256i *input, __m256i *output,
10823                                                     const int32_t size, const int32_t bit) {
10824     int32_t i;
10825     if (bit > 0) {
10826         const __m256i round = _mm256_set1_epi32(1 << (bit - 1));
10827         __m256i       r0;
10828         for (i = 0; i < size; i += 4) {
10829             r0        = _mm256_add_epi32(input[i], round);
10830             output[i] = _mm256_srai_epi32(r0, bit);
10831         }
10832     } else {
10833         for (i = 0; i < size; i += 4) { output[i] = _mm256_slli_epi32(input[i], -bit); }
10834     }
10835 }
10836 
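// Rectangular rescale (N4): for bit > 0 each kept coefficient is rounded down by
// the stage shift and then scaled by val (new_sqrt2, i.e. sqrt(2) in
// Q(new_sqrt2_bits) fixed point):
//     t = (x + (1 << (bit - 1))) >> bit;
//     y = (t * val + (1 << (new_sqrt2_bits - 1))) >> new_sqrt2_bits;
// a negative bit becomes a left shift instead. Only the registers covering the
// retained quarter are processed.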
10837 static INLINE void av1_round_shift_rect_wxh_N4(__m256i *input, __m256i *output, const int32_t bit,
10838                                                const int32_t val, int32_t num_col,
10839                                                int32_t num_row) {
10840     const __m256i sqrt2       = _mm256_set1_epi32(val);
10841     const __m256i round2      = _mm256_set1_epi32(1 << (new_sqrt2_bits - 1));
10842     int32_t       calc_numcol = num_col >> 2;
10843     if (!calc_numcol)
10844         calc_numcol = 1;
10845     int32_t i;
10846     if (bit > 0) {
10847         const __m256i round1 = _mm256_set1_epi32(1 << (bit - 1));
10848         __m256i       r0, r1, r2, r3;
10849         for (i = 0; i < num_row / 4; i++) {
10850             for (int j = 0; j < calc_numcol; j++) {
10851                 r0                      = _mm256_add_epi32(input[i * num_col + j], round1);
10852                 r1                      = _mm256_srai_epi32(r0, bit);
10853                 r2                      = _mm256_mullo_epi32(sqrt2, r1);
10854                 r3                      = _mm256_add_epi32(r2, round2);
10855                 output[i * num_col + j] = _mm256_srai_epi32(r3, new_sqrt2_bits);
10856             }
10857         }
10858     } else {
10859         __m256i r0, r1, r2;
10860         for (i = 0; i < num_row / 4; i++) {
10861             for (int j = 0; j < calc_numcol; j++) {
10862                 r0                      = _mm256_slli_epi32(input[i * num_col + j], -bit);
10863                 r1                      = _mm256_mullo_epi32(sqrt2, r0);
10864                 r2                      = _mm256_add_epi32(r1, round2);
10865                 output[i * num_col + j] = _mm256_srai_epi32(r2, new_sqrt2_bits);
10866             }
10867         }
10868     }
10869 }
10870 
10871 static AOM_FORCE_INLINE void col_txfm_32x16_N4_rounding(__m256i *in, int32_t shift) {
10872     const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
10873     in[0]                  = _mm256_add_epi32(in[0], rounding);
10874     in[4]                  = _mm256_add_epi32(in[4], rounding);
10875     in[8]                  = _mm256_add_epi32(in[8], rounding);
10876     in[12]                 = _mm256_add_epi32(in[12], rounding);
10877     in[0]                  = _mm256_srai_epi32(in[0], shift);
10878     in[4]                  = _mm256_srai_epi32(in[4], shift);
10879     in[8]                  = _mm256_srai_epi32(in[8], shift);
10880     in[12]                 = _mm256_srai_epi32(in[12], shift);
10881 }
10882 
10883 static INLINE void load_buffer_8x8_in_32x32_avx2(const int16_t *input, __m256i *output,
10884                                                  int32_t stride) {
10885     __m128i temp;
10886     int32_t i;
10887 
10888     for (i = 0; i < 8; ++i) {
10889         temp      = _mm_loadu_si128((const __m128i *)(input));
10890         output[0] = _mm256_cvtepi16_epi32(temp);
10891 
10892         input += stride;
10893         output += 4;
10894     }
10895 }
10896 
10897 static INLINE void load_buffer_8x32_in_32x32_avx2(const int16_t *input, __m256i *output,
10898                                                   int32_t stride) {
10899     __m128i temp;
10900     int32_t i;
10901 
10902     for (i = 0; i < 32; ++i) {
10903         temp      = _mm_loadu_si128((const __m128i *)(input));
10904         output[0] = _mm256_cvtepi16_epi32(temp);
10905 
10906         input += stride;
10907         output += 4;
10908     }
10909 }
10910 
10911 static INLINE void load_buffer_32x8_in_32x32_avx2(const int16_t *input, __m256i *output,
10912                                                   int32_t stride) {
10913     __m128i temp[4];
10914     int32_t i;
10915 
10916     for (i = 0; i < 8; ++i) {
10917         temp[0] = _mm_loadu_si128((const __m128i *)(input + 0 * 8));
10918         temp[1] = _mm_loadu_si128((const __m128i *)(input + 1 * 8));
10919         temp[2] = _mm_loadu_si128((const __m128i *)(input + 2 * 8));
10920         temp[3] = _mm_loadu_si128((const __m128i *)(input + 3 * 8));
10921 
10922         output[0] = _mm256_cvtepi16_epi32(temp[0]);
10923         output[1] = _mm256_cvtepi16_epi32(temp[1]);
10924         output[2] = _mm256_cvtepi16_epi32(temp[2]);
10925         output[3] = _mm256_cvtepi16_epi32(temp[3]);
10926         input += stride;
10927         output += 4;
10928     }
10929 }
10930 
10931 static INLINE void load_buffer_16x16_in_64x64_avx2(const int16_t *input, int32_t stride,
10932                                                    __m256i *output) {
10933     __m128i x0, x1;
10934     __m256i v0, v1;
10935     int32_t i;
10936 
10937     for (i = 0; i < 16; ++i) {
10938         x0 = _mm_loadu_si128((const __m128i *)(input + 0 * 8));
10939         x1 = _mm_loadu_si128((const __m128i *)(input + 1 * 8));
10940 
10941         v0 = _mm256_cvtepi16_epi32(x0);
10942         v1 = _mm256_cvtepi16_epi32(x1);
10943 
10944         _mm256_storeu_si256(output + 0, v0);
10945         _mm256_storeu_si256(output + 1, v1);
10946 
10947         input += stride;
10948         output += 8;
10949     }
10950 }
10951 
10952 static AOM_FORCE_INLINE void load_buffer_8x16_N4(const int16_t *input, __m256i *out, int32_t stride,
10953                                                  int32_t flipud, int32_t fliplr, int32_t shift) {
10954     if (flipud)
10955         load_buffer_8x8(input + 8 * stride, out, stride, flipud, fliplr, shift);
10956     else
10957         load_buffer_8x8(input, out, stride, flipud, fliplr, shift);
10958 }
10959 
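// clear_buffer_wxh_N4: zero everything outside the retained top-left quarter of
// a (num_col * 8)-wide by num_row-high coefficient buffer - first the right 3/4
// of the top num_row / 4 rows, then all remaining rows.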
10960 static INLINE void clear_buffer_wxh_N4(__m256i *buff, int32_t num_col, int32_t num_row) {
10961     const __m256i zero    = _mm256_setzero_si256();
10962     const __m128i zero128 = _mm_setzero_si128();
10963     assert(num_col > 0);
10964     assert(num_row > 1);
10965 
10966     if (num_col == 1) {
10967         __m128i *ptr_128b = (__m128i *)buff;
10968         for (int i = 0; i < num_row / 4; i++) {
10969             ptr_128b[i * 2]     = _mm_unpacklo_epi64(ptr_128b[i * 2], zero128);
10970             ptr_128b[i * 2 + 1] = zero128;
10971         }
10972     } else if (num_col == 2) {
10973         __m128i *ptr_128b = (__m128i *)buff;
10974         for (int i = 0; i < num_row / 4; i++) {
10975             ptr_128b[i * 4 + 1] = zero128;
10976             buff[i * 2 + 1]     = zero;
10977         }
10978     } else {
10979         for (int i = 0; i < num_row / 4; i++)
10980             for (int j = num_col / 4; j < num_col; j++) buff[i * num_col + j] = zero;
10981     }
10982     // clear the rows below the retained quarter
10983     for (int i = num_row / 4; i < num_row; i++)
10984         for (int j = 0; j < num_col; j++) buff[i * num_col + j] = zero;
10985 }
10986 
10987 static AOM_FORCE_INLINE void clear_buffer_4x16_N4(__m256i *buff) {
10988     const __m256i zero = _mm256_setzero_si256();
10989     buff[2]            = zero;
10990     buff[3]            = zero;
10991     buff[4]            = zero;
10992     buff[5]            = zero;
10993     buff[6]            = zero;
10994     buff[7]            = zero;
10995 }
10996 
10997 static INLINE void av1_round_shift_array_64_N4_avx2(__m256i *input, __m256i *output,
10998                                                     const int32_t size, const int32_t bit) {
10999     int32_t i;
11000     if (bit > 0) {
11001         const __m256i round = _mm256_set1_epi32(1 << (bit - 1));
11002         __m256i       r0;
11003         for (i = 0; i < size; i += 8) {
11004             r0            = _mm256_add_epi32(input[i], round);
11005             output[i]     = _mm256_srai_epi32(r0, bit);
11006             r0            = _mm256_add_epi32(input[i + 1], round);
11007             output[i + 1] = _mm256_srai_epi32(r0, bit);
11008         }
11009     } else {
11010         for (i = 0; i < size; i += 8) {
11011             output[i]     = _mm256_slli_epi32(input[i], -bit);
11012             output[i + 1] = _mm256_slli_epi32(input[i + 1], -bit);
11013         }
11014     }
11015 }
11016 
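// N4 8-point forward DCT: only the two lowest-frequency outputs per column
// (out[0] and out[1]) are computed; the butterflies feeding the other six
// coefficients are skipped entirely.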
11017 static void fdct8x8_N4_avx2(const __m256i *in, __m256i *out, int8_t bit, const int32_t col_num) {
11018     const int32_t *cospi    = cospi_arr(bit);
11019     const __m256i  cospi32  = _mm256_set1_epi32(cospi[32]);
11020     const __m256i  cospim32 = _mm256_set1_epi32(-cospi[32]);
11021     const __m256i  cospi56  = _mm256_set1_epi32(cospi[56]);
11022     const __m256i  cospi8   = _mm256_set1_epi32(cospi[8]);
11023     const __m256i  rnding   = _mm256_set1_epi32(1 << (bit - 1));
11024     __m256i        u[8], v[8];
11025 
11026     // stage 0
11027     // stage 1
11028     u[0] = _mm256_add_epi32(in[0 * col_num], in[7 * col_num]);
11029     v[7] = _mm256_sub_epi32(in[0 * col_num], in[7 * col_num]);
11030     u[1] = _mm256_add_epi32(in[1 * col_num], in[6 * col_num]);
11031     u[6] = _mm256_sub_epi32(in[1 * col_num], in[6 * col_num]);
11032     u[2] = _mm256_add_epi32(in[2 * col_num], in[5 * col_num]);
11033     u[5] = _mm256_sub_epi32(in[2 * col_num], in[5 * col_num]);
11034     u[3] = _mm256_add_epi32(in[3 * col_num], in[4 * col_num]);
11035     v[4] = _mm256_sub_epi32(in[3 * col_num], in[4 * col_num]);
11036 
11037     // stage 2
11038     v[0] = _mm256_add_epi32(u[0], u[3]);
11039     v[1] = _mm256_add_epi32(u[1], u[2]);
11040 
11041     v[5] = _mm256_mullo_epi32(u[5], cospim32);
11042     v[6] = _mm256_mullo_epi32(u[6], cospi32);
11043     v[5] = _mm256_add_epi32(v[5], v[6]);
11044     v[5] = _mm256_add_epi32(v[5], rnding);
11045     v[5] = _mm256_srai_epi32(v[5], bit);
11046 
11047     u[0] = _mm256_mullo_epi32(u[5], cospi32);
11048     v[6] = _mm256_mullo_epi32(u[6], cospim32);
11049     v[6] = _mm256_sub_epi32(u[0], v[6]);
11050     v[6] = _mm256_add_epi32(v[6], rnding);
11051     v[6] = _mm256_srai_epi32(v[6], bit);
11052 
11053     // stage 3
11054     // type 0
11055     v[0] = _mm256_mullo_epi32(v[0], cospi32);
11056     v[1] = _mm256_mullo_epi32(v[1], cospi32);
11057     u[0] = _mm256_add_epi32(v[0], v[1]);
11058     u[0] = _mm256_add_epi32(u[0], rnding);
11059     u[0] = _mm256_srai_epi32(u[0], bit);
11060 
11061     u[4] = _mm256_add_epi32(v[4], v[5]);
11062     u[7] = _mm256_add_epi32(v[7], v[6]);
11063 
11064     // stage 4
11065     // stage 5
11066     v[0]             = _mm256_mullo_epi32(u[4], cospi56);
11067     v[1]             = _mm256_mullo_epi32(u[7], cospi8);
11068     v[0]             = _mm256_add_epi32(v[0], v[1]);
11069     v[0]             = _mm256_add_epi32(v[0], rnding);
11070     out[1 * col_num] = _mm256_srai_epi32(v[0], bit);
11071 
11072     out[0 * col_num] = u[0];
11073 }
11074 
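// N4 8-point forward ADST: as with the DCT above, only the first two outputs
// per column are produced; the stages feeding out[2..7] are dropped.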
11075 static void fadst8x8_N4_avx2(const __m256i *in, __m256i *out, int8_t bit, const int32_t col_num) {
11076     const int32_t *cospi    = cospi_arr(bit);
11077     const __m256i  cospi32  = _mm256_set1_epi32(cospi[32]);
11078     const __m256i  cospi16  = _mm256_set1_epi32(cospi[16]);
11079     const __m256i  cospim16 = _mm256_set1_epi32(-cospi[16]);
11080     const __m256i  cospi48  = _mm256_set1_epi32(cospi[48]);
11081     const __m256i  cospim48 = _mm256_set1_epi32(-cospi[48]);
11082     const __m256i  cospim4  = _mm256_set1_epi32(-cospi[4]);
11083     const __m256i  cospi60  = _mm256_set1_epi32(cospi[60]);
11084     const __m256i  cospi52  = _mm256_set1_epi32(cospi[52]);
11085     const __m256i  cospi12  = _mm256_set1_epi32(cospi[12]);
11086     const __m256i  rnding   = _mm256_set1_epi32(1 << (bit - 1));
11087     const __m256i  zero     = _mm256_setzero_si256();
11088     __m256i        u0, u1, u2, u3, u4, u5, u6, u7;
11089     __m256i        v0, v1, v2, v3, v4, v5, v6, v7;
11090     __m256i        x, y;
11091 
11092     u0 = in[0 * col_num];
11093     u1 = _mm256_sub_epi32(zero, in[7 * col_num]);
11094     u2 = _mm256_sub_epi32(zero, in[3 * col_num]);
11095     u3 = in[4 * col_num];
11096     u4 = _mm256_sub_epi32(zero, in[1 * col_num]);
11097     u5 = in[6 * col_num];
11098     u6 = in[2 * col_num];
11099     u7 = _mm256_sub_epi32(zero, in[5 * col_num]);
11100 
11101     // stage 2
11102     v0 = u0;
11103     v1 = u1;
11104 
11105     x  = _mm256_mullo_epi32(u2, cospi32);
11106     y  = _mm256_mullo_epi32(u3, cospi32);
11107     v2 = _mm256_add_epi32(x, y);
11108     v2 = _mm256_add_epi32(v2, rnding);
11109     v2 = _mm256_srai_epi32(v2, bit);
11110 
11111     v3 = _mm256_sub_epi32(x, y);
11112     v3 = _mm256_add_epi32(v3, rnding);
11113     v3 = _mm256_srai_epi32(v3, bit);
11114 
11115     v4 = u4;
11116     v5 = u5;
11117 
11118     x  = _mm256_mullo_epi32(u6, cospi32);
11119     y  = _mm256_mullo_epi32(u7, cospi32);
11120     v6 = _mm256_add_epi32(x, y);
11121     v6 = _mm256_add_epi32(v6, rnding);
11122     v6 = _mm256_srai_epi32(v6, bit);
11123 
11124     v7 = _mm256_sub_epi32(x, y);
11125     v7 = _mm256_add_epi32(v7, rnding);
11126     v7 = _mm256_srai_epi32(v7, bit);
11127 
11128     // stage 3
11129     u0 = _mm256_add_epi32(v0, v2);
11130     u1 = _mm256_add_epi32(v1, v3);
11131     u2 = _mm256_sub_epi32(v0, v2);
11132     u3 = _mm256_sub_epi32(v1, v3);
11133     u4 = _mm256_add_epi32(v4, v6);
11134     u5 = _mm256_add_epi32(v5, v7);
11135     u6 = _mm256_sub_epi32(v4, v6);
11136     u7 = _mm256_sub_epi32(v5, v7);
11137 
11138     // stage 4
11139     v0 = u0;
11140     v1 = u1;
11141     v2 = u2;
11142     v3 = u3;
11143 
11144     x  = _mm256_mullo_epi32(u4, cospi16);
11145     y  = _mm256_mullo_epi32(u5, cospi48);
11146     v4 = _mm256_add_epi32(x, y);
11147     v4 = _mm256_add_epi32(v4, rnding);
11148     v4 = _mm256_srai_epi32(v4, bit);
11149 
11150     x  = _mm256_mullo_epi32(u4, cospi48);
11151     y  = _mm256_mullo_epi32(u5, cospim16);
11152     v5 = _mm256_add_epi32(x, y);
11153     v5 = _mm256_add_epi32(v5, rnding);
11154     v5 = _mm256_srai_epi32(v5, bit);
11155 
11156     x  = _mm256_mullo_epi32(u6, cospim48);
11157     y  = _mm256_mullo_epi32(u7, cospi16);
11158     v6 = _mm256_add_epi32(x, y);
11159     v6 = _mm256_add_epi32(v6, rnding);
11160     v6 = _mm256_srai_epi32(v6, bit);
11161 
11162     x  = _mm256_mullo_epi32(u6, cospi16);
11163     y  = _mm256_mullo_epi32(u7, cospi48);
11164     v7 = _mm256_add_epi32(x, y);
11165     v7 = _mm256_add_epi32(v7, rnding);
11166     v7 = _mm256_srai_epi32(v7, bit);
11167 
11168     // stage 5
11169     u0 = _mm256_add_epi32(v0, v4);
11170     u1 = _mm256_add_epi32(v1, v5);
11171     u6 = _mm256_sub_epi32(v2, v6);
11172     u7 = _mm256_sub_epi32(v3, v7);
11173 
11174     // stage 6
11175     x  = _mm256_mullo_epi32(u0, cospi60);
11176     y  = _mm256_mullo_epi32(u1, cospim4);
11177     v1 = _mm256_add_epi32(x, y);
11178     v1 = _mm256_add_epi32(v1, rnding);
11179     v1 = _mm256_srai_epi32(v1, bit);
11180 
11181     x  = _mm256_mullo_epi32(u6, cospi52);
11182     y  = _mm256_mullo_epi32(u7, cospi12);
11183     v6 = _mm256_add_epi32(x, y);
11184     v6 = _mm256_add_epi32(v6, rnding);
11185     v6 = _mm256_srai_epi32(v6, bit);
11186 
11187     // stage 7
11188     out[0 * col_num] = v1;
11189     out[1 * col_num] = v6;
11190 }
11191 
11192 static void fidtx8x8_N4_avx2(const __m256i *in, __m256i *out, int8_t bit, int32_t col_num) {
11193     (void)bit;
11194     out[0] = _mm256_slli_epi32(in[0 * col_num], 1);
11195     out[1] = _mm256_slli_epi32(in[1 * col_num], 1);
11196 }
11197 
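// N4 16-point forward DCT: processes `size` 8-lane column groups with `col_num`
// as the register stride between rows; as an N4 kernel, only the low-frequency
// quarter of the 16 outputs per column is ultimately kept.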
11198 static void fdct16x16_N4_avx2(const __m256i *in, __m256i *out, int8_t bit, const int32_t col_num,
11199                               int32_t size) {
11200     const int32_t *cospi    = cospi_arr(bit);
11201     const __m256i  cospi32  = _mm256_set1_epi32(cospi[32]);
11202     const __m256i  cospim32 = _mm256_set1_epi32(-cospi[32]);
11203     const __m256i  cospi48  = _mm256_set1_epi32(cospi[48]);
11204     const __m256i  cospim48 = _mm256_set1_epi32(-cospi[48]);
11205     const __m256i  cospim16 = _mm256_set1_epi32(-cospi[16]);
11206     const __m256i  cospi56  = _mm256_set1_epi32(cospi[56]);
11207     const __m256i  cospi8   = _mm256_set1_epi32(cospi[8]);
11208     const __m256i  cospi60  = _mm256_set1_epi32(cospi[60]);
11209     const __m256i  cospi4   = _mm256_set1_epi32(cospi[4]);
11210     const __m256i  cospi12  = _mm256_set1_epi32(cospi[12]);
11211     const __m256i  cospi52  = _mm256_set1_epi32(cospi[52]);
11212     const __m256i  rnding   = _mm256_set1_epi32(1 << (bit - 1));
11213     __m256i        u[16], v[16], x;
11214     int32_t        col;
11215 
11216     for (col = 0; col < size; ++col) {
11217         // stage 0
11218         // stage 1
11219         u[0]  = _mm256_add_epi32(in[0 * col_num + col], in[15 * col_num + col]);
11220         u[15] = _mm256_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]);
11221         u[1]  = _mm256_add_epi32(in[1 * col_num + col], in[14 * col_num + col]);
11222         u[14] = _mm256_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]);
11223         u[2]  = _mm256_add_epi32(in[2 * col_num + col], in[13 * col_num + col]);
11224         u[13] = _mm256_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]);
11225         u[3]  = _mm256_add_epi32(in[3 * col_num + col], in[12 * col_num + col]);
11226         u[12] = _mm256_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]);
11227         u[4]  = _mm256_add_epi32(in[4 * col_num + col], in[11 * col_num + col]);
11228         u[11] = _mm256_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]);
11229         u[5]  = _mm256_add_epi32(in[5 * col_num + col], in[10 * col_num + col]);
11230         u[10] = _mm256_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]);
11231         u[6]  = _mm256_add_epi32(in[6 * col_num + col], in[9 * col_num + col]);
11232         u[9]  = _mm256_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]);
11233         u[7]  = _mm256_add_epi32(in[7 * col_num + col], in[8 * col_num + col]);
11234         u[8]  = _mm256_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]);
11235 
11236         // stage 2
11237         v[0] = _mm256_add_epi32(u[0], u[7]);
11238         v[7] = _mm256_sub_epi32(u[0], u[7]);
11239         v[1] = _mm256_add_epi32(u[1], u[6]);
11240         v[6] = _mm256_sub_epi32(u[1], u[6]);
11241         v[2] = _mm256_add_epi32(u[2], u[5]);
11242         v[5] = _mm256_sub_epi32(u[2], u[5]);
11243         v[3] = _mm256_add_epi32(u[3], u[4]);
11244         v[4] = _mm256_sub_epi32(u[3], u[4]);
11245         v[8] = u[8];
11246         v[9] = u[9];
11247 
11248         v[10] = _mm256_mullo_epi32(u[10], cospim32);
11249         x     = _mm256_mullo_epi32(u[13], cospi32);
11250         v[10] = _mm256_add_epi32(v[10], x);
11251         v[10] = _mm256_add_epi32(v[10], rnding);
11252         v[10] = _mm256_srai_epi32(v[10], bit);
11253 
11254         v[13] = _mm256_mullo_epi32(u[10], cospi32);
11255         x     = _mm256_mullo_epi32(u[13], cospim32);
11256         v[13] = _mm256_sub_epi32(v[13], x);
11257         v[13] = _mm256_add_epi32(v[13], rnding);
11258         v[13] = _mm256_srai_epi32(v[13], bit);
11259 
11260         v[11] = _mm256_mullo_epi32(u[11], cospim32);
11261         x     = _mm256_mullo_epi32(u[12], cospi32);
11262         v[11] = _mm256_add_epi32(v[11], x);
11263         v[11] = _mm256_add_epi32(v[11], rnding);
11264         v[11] = _mm256_srai_epi32(v[11], bit);
11265 
11266         v[12] = _mm256_mullo_epi32(u[11], cospi32);
11267         x     = _mm256_mullo_epi32(u[12], cospim32);
11268         v[12] = _mm256_sub_epi32(v[12], x);
11269         v[12] = _mm256_add_epi32(v[12], rnding);
11270         v[12] = _mm256_srai_epi32(v[12], bit);
11271         v[14] = u[14];
11272         v[15] = u[15];
11273 
11274         // stage 3
11275         u[0] = _mm256_add_epi32(v[0], v[3]);
11276         u[1] = _mm256_add_epi32(v[1], v[2]);
11277         u[4] = v[4];
11278 
11279         u[5] = _mm256_mullo_epi32(v[5], cospim32);
11280         x    = _mm256_mullo_epi32(v[6], cospi32);
11281         u[5] = _mm256_add_epi32(u[5], x);
11282         u[5] = _mm256_add_epi32(u[5], rnding);
11283         u[5] = _mm256_srai_epi32(u[5], bit);
11284 
11285         u[6] = _mm256_mullo_epi32(v[5], cospi32);
11286         x    = _mm256_mullo_epi32(v[6], cospim32);
11287         u[6] = _mm256_sub_epi32(u[6], x);
11288         u[6] = _mm256_add_epi32(u[6], rnding);
11289         u[6] = _mm256_srai_epi32(u[6], bit);
11290 
11291         u[7]  = v[7];
11292         u[8]  = _mm256_add_epi32(v[8], v[11]);
11293         u[11] = _mm256_sub_epi32(v[8], v[11]);
11294         u[9]  = _mm256_add_epi32(v[9], v[10]);
11295         u[10] = _mm256_sub_epi32(v[9], v[10]);
11296         u[12] = _mm256_sub_epi32(v[15], v[12]);
11297         u[15] = _mm256_add_epi32(v[15], v[12]);
11298         u[13] = _mm256_sub_epi32(v[14], v[13]);
11299         u[14] = _mm256_add_epi32(v[14], v[13]);
11300 
11301         // stage 4
11302         u[0] = _mm256_mullo_epi32(u[0], cospi32);
11303         u[1] = _mm256_mullo_epi32(u[1], cospi32);
11304         v[0] = _mm256_add_epi32(u[0], u[1]);
11305         v[0] = _mm256_add_epi32(v[0], rnding);
11306         v[0] = _mm256_srai_epi32(v[0], bit);
11307 
11308         v[4] = _mm256_add_epi32(u[4], u[5]);
11309         v[7] = _mm256_add_epi32(u[7], u[6]);
11310         v[8] = u[8];
11311 
11312         v[9] = _mm256_mullo_epi32(u[9], cospim16);
11313         x    = _mm256_mullo_epi32(u[14], cospi48);
11314         v[9] = _mm256_add_epi32(v[9], x);
11315         v[9] = _mm256_add_epi32(v[9], rnding);
11316         v[9] = _mm256_srai_epi32(v[9], bit);
11317 
11318         v[14] = _mm256_mullo_epi32(u[9], cospi48);
11319         x     = _mm256_mullo_epi32(u[14], cospim16);
11320         v[14] = _mm256_sub_epi32(v[14], x);
11321         v[14] = _mm256_add_epi32(v[14], rnding);
11322         v[14] = _mm256_srai_epi32(v[14], bit);
11323 
11324         v[10] = _mm256_mullo_epi32(u[10], cospim48);
11325         x     = _mm256_mullo_epi32(u[13], cospim16);
11326         v[10] = _mm256_add_epi32(v[10], x);
11327         v[10] = _mm256_add_epi32(v[10], rnding);
11328         v[10] = _mm256_srai_epi32(v[10], bit);
11329 
11330         v[13] = _mm256_mullo_epi32(u[10], cospim16);
11331         x     = _mm256_mullo_epi32(u[13], cospim48);
11332         v[13] = _mm256_sub_epi32(v[13], x);
11333         v[13] = _mm256_add_epi32(v[13], rnding);
11334         v[13] = _mm256_srai_epi32(v[13], bit);
11335 
11336         v[11] = u[11];
11337         v[12] = u[12];
11338         v[15] = u[15];
11339 
11340         // stage 5
11341         u[0] = v[0];
11342 
11343         u[4] = _mm256_mullo_epi32(v[4], cospi56);
11344         x    = _mm256_mullo_epi32(v[7], cospi8);
11345         u[4] = _mm256_add_epi32(u[4], x);
11346         u[4] = _mm256_add_epi32(u[4], rnding);
11347         u[4] = _mm256_srai_epi32(u[4], bit);
11348 
11349         u[8]  = _mm256_add_epi32(v[8], v[9]);
11350         u[11] = _mm256_add_epi32(v[11], v[10]);
11351         u[12] = _mm256_add_epi32(v[12], v[13]);
11352         u[15] = _mm256_add_epi32(v[15], v[14]);
11353 
11354         // stage 6
11355         v[0] = u[0];
11356         v[4] = u[4];
11357 
11358         v[8] = _mm256_mullo_epi32(u[8], cospi60);
11359         x    = _mm256_mullo_epi32(u[15], cospi4);
11360         v[8] = _mm256_add_epi32(v[8], x);
11361         v[8] = _mm256_add_epi32(v[8], rnding);
11362         v[8] = _mm256_srai_epi32(v[8], bit);
11363 
11364         v[12] = _mm256_mullo_epi32(u[11], cospi52);
11365         x     = _mm256_mullo_epi32(u[12], cospi12);
11366         v[12] = _mm256_sub_epi32(x, v[12]);
11367         v[12] = _mm256_add_epi32(v[12], rnding);
11368         v[12] = _mm256_srai_epi32(v[12], bit);
11369 
11370         out[0 * col_num + col] = v[0];
11371         out[1 * col_num + col] = v[8];
11372         out[2 * col_num + col] = v[4];
11373         out[3 * col_num + col] = v[12];
11374     }
11375 }
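/*
 * fdct16x16_N4_avx2 above evaluates the full 16-point DCT butterfly network
 * only along the paths that feed the first four output rows; the later stages
 * are pruned and out[0..3] receive coefficients 0..3 (butterfly nodes v[0],
 * v[8], v[4], v[12]), the rest being zeroed by the caller. Every rounded
 * butterfly in this file follows the same fixed-point pattern; a scalar
 * sketch (illustrative only, name not from this file):
 *
 *     static int32_t round_shift_btf_sketch(int32_t a, int32_t w0, int32_t b,
 *                                           int32_t w1, int8_t bit) {
 *         // the vector code relies on each product fitting in 32 bits
 *         return (a * w0 + b * w1 + (1 << (bit - 1))) >> bit;
 *     }
 */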
11376 
11377 static void fadst16x16_N4_avx2(const __m256i *in, __m256i *out, int8_t bit, const int32_t col_num,
11378                                int32_t size) {
11379     const int32_t *cospi    = cospi_arr(bit);
11380     const __m256i  cospi32  = _mm256_set1_epi32(cospi[32]);
11381     const __m256i  cospi48  = _mm256_set1_epi32(cospi[48]);
11382     const __m256i  cospi16  = _mm256_set1_epi32(cospi[16]);
11383     const __m256i  cospim16 = _mm256_set1_epi32(-cospi[16]);
11384     const __m256i  cospim48 = _mm256_set1_epi32(-cospi[48]);
11385     const __m256i  cospi8   = _mm256_set1_epi32(cospi[8]);
11386     const __m256i  cospi56  = _mm256_set1_epi32(cospi[56]);
11387     const __m256i  cospim56 = _mm256_set1_epi32(-cospi[56]);
11388     const __m256i  cospim8  = _mm256_set1_epi32(-cospi[8]);
11389     const __m256i  cospi24  = _mm256_set1_epi32(cospi[24]);
11390     const __m256i  cospim24 = _mm256_set1_epi32(-cospi[24]);
11391     const __m256i  cospim40 = _mm256_set1_epi32(-cospi[40]);
11392     const __m256i  cospi40  = _mm256_set1_epi32(cospi[40]);
11393     const __m256i  cospi62  = _mm256_set1_epi32(cospi[62]);
11394     const __m256i  cospim2  = _mm256_set1_epi32(-cospi[2]);
11395     const __m256i  cospi54  = _mm256_set1_epi32(cospi[54]);
11396     const __m256i  cospim10 = _mm256_set1_epi32(-cospi[10]);
11397     const __m256i  cospi50  = _mm256_set1_epi32(cospi[50]);
11398     const __m256i  cospi14  = _mm256_set1_epi32(cospi[14]);
11399     const __m256i  cospi58  = _mm256_set1_epi32(cospi[58]);
11400     const __m256i  cospi6   = _mm256_set1_epi32(cospi[6]);
11401     const __m256i  rnding   = _mm256_set1_epi32(1 << (bit - 1));
11402     const __m256i  zero     = _mm256_setzero_si256();
11403 
11404     __m256i u[16], v[16], x, y;
11405     int32_t col;
11406 
11407     for (col = 0; col < size; ++col) {
11408         // stage 0
11409         // stage 1
11410         u[0]  = in[0 * col_num + col];
11411         u[1]  = _mm256_sub_epi32(zero, in[15 * col_num + col]);
11412         u[2]  = _mm256_sub_epi32(zero, in[7 * col_num + col]);
11413         u[3]  = in[8 * col_num + col];
11414         u[4]  = _mm256_sub_epi32(zero, in[3 * col_num + col]);
11415         u[5]  = in[12 * col_num + col];
11416         u[6]  = in[4 * col_num + col];
11417         u[7]  = _mm256_sub_epi32(zero, in[11 * col_num + col]);
11418         u[8]  = _mm256_sub_epi32(zero, in[1 * col_num + col]);
11419         u[9]  = in[14 * col_num + col];
11420         u[10] = in[6 * col_num + col];
11421         u[11] = _mm256_sub_epi32(zero, in[9 * col_num + col]);
11422         u[12] = in[2 * col_num + col];
11423         u[13] = _mm256_sub_epi32(zero, in[13 * col_num + col]);
11424         u[14] = _mm256_sub_epi32(zero, in[5 * col_num + col]);
11425         u[15] = in[10 * col_num + col];
11426 
11427         // stage 2
11428         v[0] = u[0];
11429         v[1] = u[1];
11430 
11431         x    = _mm256_mullo_epi32(u[2], cospi32);
11432         y    = _mm256_mullo_epi32(u[3], cospi32);
11433         v[2] = _mm256_add_epi32(x, y);
11434         v[2] = _mm256_add_epi32(v[2], rnding);
11435         v[2] = _mm256_srai_epi32(v[2], bit);
11436 
11437         v[3] = _mm256_sub_epi32(x, y);
11438         v[3] = _mm256_add_epi32(v[3], rnding);
11439         v[3] = _mm256_srai_epi32(v[3], bit);
11440 
11441         v[4] = u[4];
11442         v[5] = u[5];
11443 
11444         x    = _mm256_mullo_epi32(u[6], cospi32);
11445         y    = _mm256_mullo_epi32(u[7], cospi32);
11446         v[6] = _mm256_add_epi32(x, y);
11447         v[6] = _mm256_add_epi32(v[6], rnding);
11448         v[6] = _mm256_srai_epi32(v[6], bit);
11449 
11450         v[7] = _mm256_sub_epi32(x, y);
11451         v[7] = _mm256_add_epi32(v[7], rnding);
11452         v[7] = _mm256_srai_epi32(v[7], bit);
11453 
11454         v[8] = u[8];
11455         v[9] = u[9];
11456 
11457         x     = _mm256_mullo_epi32(u[10], cospi32);
11458         y     = _mm256_mullo_epi32(u[11], cospi32);
11459         v[10] = _mm256_add_epi32(x, y);
11460         v[10] = _mm256_add_epi32(v[10], rnding);
11461         v[10] = _mm256_srai_epi32(v[10], bit);
11462 
11463         v[11] = _mm256_sub_epi32(x, y);
11464         v[11] = _mm256_add_epi32(v[11], rnding);
11465         v[11] = _mm256_srai_epi32(v[11], bit);
11466 
11467         v[12] = u[12];
11468         v[13] = u[13];
11469 
11470         x     = _mm256_mullo_epi32(u[14], cospi32);
11471         y     = _mm256_mullo_epi32(u[15], cospi32);
11472         v[14] = _mm256_add_epi32(x, y);
11473         v[14] = _mm256_add_epi32(v[14], rnding);
11474         v[14] = _mm256_srai_epi32(v[14], bit);
11475 
11476         v[15] = _mm256_sub_epi32(x, y);
11477         v[15] = _mm256_add_epi32(v[15], rnding);
11478         v[15] = _mm256_srai_epi32(v[15], bit);
11479 
11480         // stage 3
11481         u[0]  = _mm256_add_epi32(v[0], v[2]);
11482         u[1]  = _mm256_add_epi32(v[1], v[3]);
11483         u[2]  = _mm256_sub_epi32(v[0], v[2]);
11484         u[3]  = _mm256_sub_epi32(v[1], v[3]);
11485         u[4]  = _mm256_add_epi32(v[4], v[6]);
11486         u[5]  = _mm256_add_epi32(v[5], v[7]);
11487         u[6]  = _mm256_sub_epi32(v[4], v[6]);
11488         u[7]  = _mm256_sub_epi32(v[5], v[7]);
11489         u[8]  = _mm256_add_epi32(v[8], v[10]);
11490         u[9]  = _mm256_add_epi32(v[9], v[11]);
11491         u[10] = _mm256_sub_epi32(v[8], v[10]);
11492         u[11] = _mm256_sub_epi32(v[9], v[11]);
11493         u[12] = _mm256_add_epi32(v[12], v[14]);
11494         u[13] = _mm256_add_epi32(v[13], v[15]);
11495         u[14] = _mm256_sub_epi32(v[12], v[14]);
11496         u[15] = _mm256_sub_epi32(v[13], v[15]);
11497 
11498         // stage 4
11499         v[0]  = u[0];
11500         v[1]  = u[1];
11501         v[2]  = u[2];
11502         v[3]  = u[3];
11503         v[4]  = half_btf_avx2(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit);
11504         v[5]  = half_btf_avx2(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit);
11505         v[6]  = half_btf_avx2(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit);
11506         v[7]  = half_btf_avx2(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit);
11507         v[8]  = u[8];
11508         v[9]  = u[9];
11509         v[10] = u[10];
11510         v[11] = u[11];
11511         v[12] = half_btf_avx2(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit);
11512         v[13] = half_btf_avx2(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit);
11513         v[14] = half_btf_avx2(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit);
11514         v[15] = half_btf_avx2(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit);
11515 
11516         // stage 5
11517         u[0]  = _mm256_add_epi32(v[0], v[4]);
11518         u[1]  = _mm256_add_epi32(v[1], v[5]);
11519         u[2]  = _mm256_add_epi32(v[2], v[6]);
11520         u[3]  = _mm256_add_epi32(v[3], v[7]);
11521         u[4]  = _mm256_sub_epi32(v[0], v[4]);
11522         u[5]  = _mm256_sub_epi32(v[1], v[5]);
11523         u[6]  = _mm256_sub_epi32(v[2], v[6]);
11524         u[7]  = _mm256_sub_epi32(v[3], v[7]);
11525         u[8]  = _mm256_add_epi32(v[8], v[12]);
11526         u[9]  = _mm256_add_epi32(v[9], v[13]);
11527         u[10] = _mm256_add_epi32(v[10], v[14]);
11528         u[11] = _mm256_add_epi32(v[11], v[15]);
11529         u[12] = _mm256_sub_epi32(v[8], v[12]);
11530         u[13] = _mm256_sub_epi32(v[9], v[13]);
11531         u[14] = _mm256_sub_epi32(v[10], v[14]);
11532         u[15] = _mm256_sub_epi32(v[11], v[15]);
11533 
11534         // stage 6
11535         v[0]  = u[0];
11536         v[1]  = u[1];
11537         v[2]  = u[2];
11538         v[3]  = u[3];
11539         v[4]  = u[4];
11540         v[5]  = u[5];
11541         v[6]  = u[6];
11542         v[7]  = u[7];
11543         v[8]  = half_btf_avx2(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit);
11544         v[9]  = half_btf_avx2(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit);
11545         v[10] = half_btf_avx2(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit);
11546         v[11] = half_btf_avx2(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit);
11547         v[12] = half_btf_avx2(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit);
11548         v[13] = half_btf_avx2(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit);
11549         v[14] = half_btf_avx2(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit);
11550         v[15] = half_btf_avx2(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit);
11551 
11552         // stage 7
11553         u[0]  = _mm256_add_epi32(v[0], v[8]);
11554         u[1]  = _mm256_add_epi32(v[1], v[9]);
11555         u[2]  = _mm256_add_epi32(v[2], v[10]);
11556         u[3]  = _mm256_add_epi32(v[3], v[11]);
11557         u[12] = _mm256_sub_epi32(v[4], v[12]);
11558         u[13] = _mm256_sub_epi32(v[5], v[13]);
11559         u[14] = _mm256_sub_epi32(v[6], v[14]);
11560         u[15] = _mm256_sub_epi32(v[7], v[15]);
11561 
11562         // stage 8
11563         v[1]  = half_btf_avx2(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit);
11564         v[3]  = half_btf_avx2(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit);
11565         v[12] = half_btf_avx2(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit);
11566         v[14] = half_btf_avx2(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit);
11567 
11568         // stage 9
11569         out[0 * col_num + col] = v[1];
11570         out[1 * col_num + col] = v[14];
11571         out[2 * col_num + col] = v[3];
11572         out[3 * col_num + col] = v[12];
11573     }
11574 }
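/*
 * The ADST counterpart above is pruned the same way: all butterfly stages
 * still run, but stages 8/9 materialize only the four lowest-frequency output
 * rows (v[1], v[14], v[3], v[12] -> out[0..3]). half_btf_avx2(), used for the
 * paired rotations, computes one rounded butterfly output per lane, roughly
 * (w0 * in0 + w1 * in1 + rounding) >> bit in scalar terms.
 */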
11575 
11576 static void fidtx16x16_N4_avx2(const __m256i *in, __m256i *out, int8_t bit, int32_t col_num,
11577                                int32_t step) {
11578     (void)bit;
11579     const int32_t bits     = 12; // new_sqrt2_bits = 12
11580     const int32_t sqrt     = 2 * 5793; // 2 * new_sqrt2
11581     const __m256i newsqrt  = _mm256_set1_epi32(sqrt);
11582     const __m256i rounding = _mm256_set1_epi32(1 << (bits - 1));
11583     __m256i       temp;
11584     int32_t       num_iters = 16 * col_num;
11585     for (int32_t i = 0; i < num_iters / 4; i += step) {
11586         temp   = _mm256_mullo_epi32(in[i], newsqrt);
11587         temp   = _mm256_add_epi32(temp, rounding);
11588         out[i] = _mm256_srai_epi32(temp, bits);
11589     }
11590 }
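/*
 * fidtx16x16_N4_avx2 applies the 16-point identity scale of 2*sqrt(2) in Q12
 * fixed point: 5793 ~= sqrt(2) * 4096, so each kept coefficient is multiplied
 * by 2 * 5793 and round-shifted by 12. Worked example (sketch):
 * x = 100 -> (100 * 11586 + 2048) >> 12 = 283, matching round(100 * 2.8284).
 * Only num_iters / 4 entries are touched, again keeping a quarter of the block.
 */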
11591 
11592 static void av1_fdct32_new_N4_avx2(const __m256i *input, __m256i *output, int8_t cos_bit,
11593                                    const int32_t col_num, const int32_t stride) {
11594     const int32_t *cospi      = cospi_arr(cos_bit);
11595     const __m256i  __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
11596     const int32_t  columns    = col_num >> 3;
11597 
11598     __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]);
11599     __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]);
11600     __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]);
11601     __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]);
11602     __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]);
11603     __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]);
11604     __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]);
11605     __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]);
11606     __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]);
11607     __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]);
11608     __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]);
11609     __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]);
11610     __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]);
11611     __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]);
11612     __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]);
11613     __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]);
11614     __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]);
11615     __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]);
11616     __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]);
11617     __m256i cospi_m50 = _mm256_set1_epi32(-cospi[50]);
11618     __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]);
11619     __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]);
11620     __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]);
11621     __m256i cospi_m58 = _mm256_set1_epi32(-cospi[58]);
11622 
11623     __m256i buf0[32];
11624     __m256i buf1[32];
11625 
11626     for (int32_t col = 0; col < columns; col++) {
11627         const __m256i *in  = &input[col];
11628         __m256i *      out = &output[col];
11629 
11630         // stage 0
11631         // stage 1
11632         buf1[0]  = _mm256_add_epi32(in[0 * stride], in[31 * stride]);
11633         buf1[31] = _mm256_sub_epi32(in[0 * stride], in[31 * stride]);
11634         buf1[1]  = _mm256_add_epi32(in[1 * stride], in[30 * stride]);
11635         buf1[30] = _mm256_sub_epi32(in[1 * stride], in[30 * stride]);
11636         buf1[2]  = _mm256_add_epi32(in[2 * stride], in[29 * stride]);
11637         buf1[29] = _mm256_sub_epi32(in[2 * stride], in[29 * stride]);
11638         buf1[3]  = _mm256_add_epi32(in[3 * stride], in[28 * stride]);
11639         buf1[28] = _mm256_sub_epi32(in[3 * stride], in[28 * stride]);
11640         buf1[4]  = _mm256_add_epi32(in[4 * stride], in[27 * stride]);
11641         buf1[27] = _mm256_sub_epi32(in[4 * stride], in[27 * stride]);
11642         buf1[5]  = _mm256_add_epi32(in[5 * stride], in[26 * stride]);
11643         buf1[26] = _mm256_sub_epi32(in[5 * stride], in[26 * stride]);
11644         buf1[6]  = _mm256_add_epi32(in[6 * stride], in[25 * stride]);
11645         buf1[25] = _mm256_sub_epi32(in[6 * stride], in[25 * stride]);
11646         buf1[7]  = _mm256_add_epi32(in[7 * stride], in[24 * stride]);
11647         buf1[24] = _mm256_sub_epi32(in[7 * stride], in[24 * stride]);
11648         buf1[8]  = _mm256_add_epi32(in[8 * stride], in[23 * stride]);
11649         buf1[23] = _mm256_sub_epi32(in[8 * stride], in[23 * stride]);
11650         buf1[9]  = _mm256_add_epi32(in[9 * stride], in[22 * stride]);
11651         buf1[22] = _mm256_sub_epi32(in[9 * stride], in[22 * stride]);
11652         buf1[10] = _mm256_add_epi32(in[10 * stride], in[21 * stride]);
11653         buf1[21] = _mm256_sub_epi32(in[10 * stride], in[21 * stride]);
11654         buf1[11] = _mm256_add_epi32(in[11 * stride], in[20 * stride]);
11655         buf1[20] = _mm256_sub_epi32(in[11 * stride], in[20 * stride]);
11656         buf1[12] = _mm256_add_epi32(in[12 * stride], in[19 * stride]);
11657         buf1[19] = _mm256_sub_epi32(in[12 * stride], in[19 * stride]);
11658         buf1[13] = _mm256_add_epi32(in[13 * stride], in[18 * stride]);
11659         buf1[18] = _mm256_sub_epi32(in[13 * stride], in[18 * stride]);
11660         buf1[14] = _mm256_add_epi32(in[14 * stride], in[17 * stride]);
11661         buf1[17] = _mm256_sub_epi32(in[14 * stride], in[17 * stride]);
11662         buf1[15] = _mm256_add_epi32(in[15 * stride], in[16 * stride]);
11663         buf1[16] = _mm256_sub_epi32(in[15 * stride], in[16 * stride]);
11664 
11665         // stage 2
11666         buf0[0]  = _mm256_add_epi32(buf1[0], buf1[15]);
11667         buf0[15] = _mm256_sub_epi32(buf1[0], buf1[15]);
11668         buf0[1]  = _mm256_add_epi32(buf1[1], buf1[14]);
11669         buf0[14] = _mm256_sub_epi32(buf1[1], buf1[14]);
11670         buf0[2]  = _mm256_add_epi32(buf1[2], buf1[13]);
11671         buf0[13] = _mm256_sub_epi32(buf1[2], buf1[13]);
11672         buf0[3]  = _mm256_add_epi32(buf1[3], buf1[12]);
11673         buf0[12] = _mm256_sub_epi32(buf1[3], buf1[12]);
11674         buf0[4]  = _mm256_add_epi32(buf1[4], buf1[11]);
11675         buf0[11] = _mm256_sub_epi32(buf1[4], buf1[11]);
11676         buf0[5]  = _mm256_add_epi32(buf1[5], buf1[10]);
11677         buf0[10] = _mm256_sub_epi32(buf1[5], buf1[10]);
11678         buf0[6]  = _mm256_add_epi32(buf1[6], buf1[9]);
11679         buf0[9]  = _mm256_sub_epi32(buf1[6], buf1[9]);
11680         buf0[7]  = _mm256_add_epi32(buf1[7], buf1[8]);
11681         buf0[8]  = _mm256_sub_epi32(buf1[7], buf1[8]);
11682         buf0[16] = buf1[16];
11683         buf0[17] = buf1[17];
11684         buf0[18] = buf1[18];
11685         buf0[19] = buf1[19];
11686         btf_32_type0_avx2_new(
11687             cospi_m32, cospi_p32, buf1[20], buf1[27], buf0[20], buf0[27], __rounding, cos_bit);
11688         btf_32_type0_avx2_new(
11689             cospi_m32, cospi_p32, buf1[21], buf1[26], buf0[21], buf0[26], __rounding, cos_bit);
11690         btf_32_type0_avx2_new(
11691             cospi_m32, cospi_p32, buf1[22], buf1[25], buf0[22], buf0[25], __rounding, cos_bit);
11692         btf_32_type0_avx2_new(
11693             cospi_m32, cospi_p32, buf1[23], buf1[24], buf0[23], buf0[24], __rounding, cos_bit);
11694         buf0[28] = buf1[28];
11695         buf0[29] = buf1[29];
11696         buf0[30] = buf1[30];
11697         buf0[31] = buf1[31];
11698 
11699         // stage 3
11700         buf1[0] = _mm256_add_epi32(buf0[0], buf0[7]);
11701         buf1[7] = _mm256_sub_epi32(buf0[0], buf0[7]);
11702         buf1[1] = _mm256_add_epi32(buf0[1], buf0[6]);
11703         buf1[6] = _mm256_sub_epi32(buf0[1], buf0[6]);
11704         buf1[2] = _mm256_add_epi32(buf0[2], buf0[5]);
11705         buf1[5] = _mm256_sub_epi32(buf0[2], buf0[5]);
11706         buf1[3] = _mm256_add_epi32(buf0[3], buf0[4]);
11707         buf1[4] = _mm256_sub_epi32(buf0[3], buf0[4]);
11708         buf1[8] = buf0[8];
11709         buf1[9] = buf0[9];
11710         btf_32_type0_avx2_new(
11711             cospi_m32, cospi_p32, buf0[10], buf0[13], buf1[10], buf1[13], __rounding, cos_bit);
11712         btf_32_type0_avx2_new(
11713             cospi_m32, cospi_p32, buf0[11], buf0[12], buf1[11], buf1[12], __rounding, cos_bit);
11714         buf1[14] = buf0[14];
11715         buf1[15] = buf0[15];
11716         buf1[16] = _mm256_add_epi32(buf0[16], buf0[23]);
11717         buf1[23] = _mm256_sub_epi32(buf0[16], buf0[23]);
11718         buf1[17] = _mm256_add_epi32(buf0[17], buf0[22]);
11719         buf1[22] = _mm256_sub_epi32(buf0[17], buf0[22]);
11720         buf1[18] = _mm256_add_epi32(buf0[18], buf0[21]);
11721         buf1[21] = _mm256_sub_epi32(buf0[18], buf0[21]);
11722         buf1[19] = _mm256_add_epi32(buf0[19], buf0[20]);
11723         buf1[20] = _mm256_sub_epi32(buf0[19], buf0[20]);
11724         buf1[24] = _mm256_sub_epi32(buf0[31], buf0[24]);
11725         buf1[31] = _mm256_add_epi32(buf0[31], buf0[24]);
11726         buf1[25] = _mm256_sub_epi32(buf0[30], buf0[25]);
11727         buf1[30] = _mm256_add_epi32(buf0[30], buf0[25]);
11728         buf1[26] = _mm256_sub_epi32(buf0[29], buf0[26]);
11729         buf1[29] = _mm256_add_epi32(buf0[29], buf0[26]);
11730         buf1[27] = _mm256_sub_epi32(buf0[28], buf0[27]);
11731         buf1[28] = _mm256_add_epi32(buf0[28], buf0[27]);
11732 
11733         // stage 4
11734         buf0[0] = _mm256_add_epi32(buf1[0], buf1[3]);
11735         buf0[1] = _mm256_add_epi32(buf1[1], buf1[2]);
11736         buf0[4] = buf1[4];
11737         btf_32_type0_avx2_new(
11738             cospi_m32, cospi_p32, buf1[5], buf1[6], buf0[5], buf0[6], __rounding, cos_bit);
11739         buf0[7]  = buf1[7];
11740         buf0[8]  = _mm256_add_epi32(buf1[8], buf1[11]);
11741         buf0[11] = _mm256_sub_epi32(buf1[8], buf1[11]);
11742         buf0[9]  = _mm256_add_epi32(buf1[9], buf1[10]);
11743         buf0[10] = _mm256_sub_epi32(buf1[9], buf1[10]);
11744         buf0[12] = _mm256_sub_epi32(buf1[15], buf1[12]);
11745         buf0[15] = _mm256_add_epi32(buf1[15], buf1[12]);
11746         buf0[13] = _mm256_sub_epi32(buf1[14], buf1[13]);
11747         buf0[14] = _mm256_add_epi32(buf1[14], buf1[13]);
11748         buf0[16] = buf1[16];
11749         buf0[17] = buf1[17];
11750         btf_32_type0_avx2_new(
11751             cospi_m16, cospi_p48, buf1[18], buf1[29], buf0[18], buf0[29], __rounding, cos_bit);
11752         btf_32_type0_avx2_new(
11753             cospi_m16, cospi_p48, buf1[19], buf1[28], buf0[19], buf0[28], __rounding, cos_bit);
11754         btf_32_type0_avx2_new(
11755             cospi_m48, cospi_m16, buf1[20], buf1[27], buf0[20], buf0[27], __rounding, cos_bit);
11756         btf_32_type0_avx2_new(
11757             cospi_m48, cospi_m16, buf1[21], buf1[26], buf0[21], buf0[26], __rounding, cos_bit);
11758         buf0[22] = buf1[22];
11759         buf0[23] = buf1[23];
11760         buf0[24] = buf1[24];
11761         buf0[25] = buf1[25];
11762         buf0[30] = buf1[30];
11763         buf0[31] = buf1[31];
11764 
11765         // stage 5
11766         buf1[0] = half_btf_avx2(&cospi_p32, &buf0[0], &cospi_p32, &buf0[1], &__rounding, cos_bit);
11767         buf1[4] = _mm256_add_epi32(buf0[4], buf0[5]);
11768         buf1[7] = _mm256_add_epi32(buf0[7], buf0[6]);
11769         buf1[8] = buf0[8];
11770         btf_32_type0_avx2_new(
11771             cospi_m16, cospi_p48, buf0[9], buf0[14], buf1[9], buf1[14], __rounding, cos_bit);
11772         btf_32_type0_avx2_new(
11773             cospi_m48, cospi_m16, buf0[10], buf0[13], buf1[10], buf1[13], __rounding, cos_bit);
11774         buf1[11] = buf0[11];
11775         buf1[12] = buf0[12];
11776         buf1[15] = buf0[15];
11777         buf1[16] = _mm256_add_epi32(buf0[16], buf0[19]);
11778         buf1[19] = _mm256_sub_epi32(buf0[16], buf0[19]);
11779         buf1[17] = _mm256_add_epi32(buf0[17], buf0[18]);
11780         buf1[18] = _mm256_sub_epi32(buf0[17], buf0[18]);
11781         buf1[20] = _mm256_sub_epi32(buf0[23], buf0[20]);
11782         buf1[23] = _mm256_add_epi32(buf0[23], buf0[20]);
11783         buf1[21] = _mm256_sub_epi32(buf0[22], buf0[21]);
11784         buf1[22] = _mm256_add_epi32(buf0[22], buf0[21]);
11785         buf1[24] = _mm256_add_epi32(buf0[24], buf0[27]);
11786         buf1[27] = _mm256_sub_epi32(buf0[24], buf0[27]);
11787         buf1[25] = _mm256_add_epi32(buf0[25], buf0[26]);
11788         buf1[26] = _mm256_sub_epi32(buf0[25], buf0[26]);
11789         buf1[28] = _mm256_sub_epi32(buf0[31], buf0[28]);
11790         buf1[31] = _mm256_add_epi32(buf0[31], buf0[28]);
11791         buf1[29] = _mm256_sub_epi32(buf0[30], buf0[29]);
11792         buf1[30] = _mm256_add_epi32(buf0[30], buf0[29]);
11793 
11794         // stage 6
11795         buf0[0]  = buf1[0];
11796         buf0[4]  = half_btf_avx2(&cospi_p56, &buf1[4], &cospi_p08, &buf1[7], &__rounding, cos_bit);
11797         buf0[8]  = _mm256_add_epi32(buf1[8], buf1[9]);
11798         buf0[11] = _mm256_add_epi32(buf1[11], buf1[10]);
11799         buf0[12] = _mm256_add_epi32(buf1[12], buf1[13]);
11800         buf0[15] = _mm256_add_epi32(buf1[15], buf1[14]);
11801         buf0[16] = buf1[16];
11802         btf_32_type0_avx2_new(
11803             cospi_m08, cospi_p56, buf1[17], buf1[30], buf0[17], buf0[30], __rounding, cos_bit);
11804         btf_32_type0_avx2_new(
11805             cospi_m56, cospi_m08, buf1[18], buf1[29], buf0[18], buf0[29], __rounding, cos_bit);
11806         buf0[19] = buf1[19];
11807         buf0[20] = buf1[20];
11808         btf_32_type0_avx2_new(
11809             cospi_m40, cospi_p24, buf1[21], buf1[26], buf0[21], buf0[26], __rounding, cos_bit);
11810         btf_32_type0_avx2_new(
11811             cospi_m24, cospi_m40, buf1[22], buf1[25], buf0[22], buf0[25], __rounding, cos_bit);
11812         buf0[23] = buf1[23];
11813         buf0[24] = buf1[24];
11814         buf0[27] = buf1[27];
11815         buf0[28] = buf1[28];
11816         buf0[31] = buf1[31];
11817 
11818         // stage 7
11819         buf1[0]  = buf0[0];
11820         buf1[4]  = buf0[4];
11821         buf1[8]  = half_btf_avx2(&cospi_p60, &buf0[8], &cospi_p04, &buf0[15], &__rounding, cos_bit);
11822         buf1[12] = half_btf_avx2(
11823             &cospi_p12, &buf0[12], &cospi_m52, &buf0[11], &__rounding, cos_bit);
11824 
11825         buf1[16] = _mm256_add_epi32(buf0[16], buf0[17]);
11826         buf1[19] = _mm256_add_epi32(buf0[19], buf0[18]);
11827         buf1[20] = _mm256_add_epi32(buf0[20], buf0[21]);
11828         buf1[23] = _mm256_add_epi32(buf0[23], buf0[22]);
11829         buf1[24] = _mm256_add_epi32(buf0[24], buf0[25]);
11830         buf1[27] = _mm256_add_epi32(buf0[27], buf0[26]);
11831         buf1[28] = _mm256_add_epi32(buf0[28], buf0[29]);
11832         buf1[31] = _mm256_add_epi32(buf0[31], buf0[30]);
11833 
11834         // stage 8
11835         buf0[0]  = buf1[0];
11836         buf0[4]  = buf1[4];
11837         buf0[8]  = buf1[8];
11838         buf0[12] = buf1[12];
11839         buf0[16] = half_btf_avx2(
11840             &cospi_p62, &buf1[16], &cospi_p02, &buf1[31], &__rounding, cos_bit);
11841         buf0[28] = half_btf_avx2(
11842             &cospi_p14, &buf1[28], &cospi_m50, &buf1[19], &__rounding, cos_bit);
11843         buf0[20] = half_btf_avx2(
11844             &cospi_p54, &buf1[20], &cospi_p10, &buf1[27], &__rounding, cos_bit);
11845         buf0[24] = half_btf_avx2(
11846             &cospi_p06, &buf1[24], &cospi_m58, &buf1[23], &__rounding, cos_bit);
11847 
11848         // stage 9
11849         out[0 * stride] = buf0[0];
11850         out[1 * stride] = buf0[16];
11851         out[2 * stride] = buf0[8];
11852         out[3 * stride] = buf0[24];
11853         out[4 * stride] = buf0[4];
11854         out[5 * stride] = buf0[20];
11855         out[6 * stride] = buf0[12];
11856         out[7 * stride] = buf0[28];
11857     }
11858 }
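/*
 * av1_fdct32_new_N4_avx2 keeps the usual nine-stage 32-point DCT dataflow but
 * only along the paths that reach the first eight output rows (out[0..7]
 * above); the remaining 24 rows are never computed. Each __m256i carries 8
 * columns, `stride` is the step between rows of the same column group, and
 * `col_num >> 3` gives the number of 8-lane column groups to process.
 */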
11859 
11860 static AOM_FORCE_INLINE void fdct32x32_N4_col_avx2(const __m256i *input, __m256i *output,
11861                                                    const int8_t cos_bit) {
11862     const int32_t txfm_size   = 32;
11863     const int32_t num_per_256 = 8;
11864     int32_t       col_num     = txfm_size / num_per_256;
11865     av1_fdct32_new_N4_avx2(input, output, cos_bit, txfm_size, col_num);
11866 }
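/*
 * Column pass for the 32x32 N4 DCT: passing txfm_size (32) as col_num makes
 * av1_fdct32_new_N4_avx2 iterate over 32 >> 3 = 4 vector groups, i.e. all 32
 * columns, while each column still produces only its first 8 output rows.
 */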
11867 
11868 static AOM_FORCE_INLINE void fdct32x32_N4_row_avx2(const __m256i *input, __m256i *output,
11869                                                    const int8_t cos_bit) {
11870     const int32_t txfm_size   = 32;
11871     const int32_t num_per_256 = 8;
11872     int32_t       col_num     = txfm_size / num_per_256;
11873     av1_fdct32_new_N4_avx2(input, output, cos_bit, txfm_size / 4, col_num);
11874 }
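/*
 * Row pass: only txfm_size / 4 (= 8) is passed as col_num, so 8 >> 3 = 1
 * vector group is processed and just the leftmost quarter of the width is
 * transformed, matching the 8x8 block of coefficients the N4 path retains.
 */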
11875 
11876 static void fidtx_wxh_N4_avx2(const __m256i *input, __m256i *output, int32_t size, int32_t step) {
11877     for (int32_t i = 0; i < size; i += step) output[i] = _mm256_slli_epi32(input[i], 2);
11878 }
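/*
 * fidtx_wxh_N4_avx2 shifts left by 2 (a 4x scale), consistent with the 2x,
 * 2*sqrt(2)x and 4x scalings used for the 8-, 16- and 32-point identity
 * transforms elsewhere in this file; `step` lets the caller skip the rows
 * that the N4 output will zero anyway. Usage sketch (illustrative values
 * only, not an actual call site from this file):
 *
 *     // process every 4th vector row of the block, i.e. 1/4 of them
 *     // fidtx_wxh_N4_avx2(in, out, 32 * col_num, 4);
 */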
11879 
11880 static void av1_fdct64_new_N4_avx2(const __m256i *input, __m256i *output, int8_t cos_bit,
11881                                    const int32_t col_num, const int32_t stride) {
11882     const int32_t *cospi      = cospi_arr(cos_bit);
11883     const __m256i  __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
11884     const int32_t  columns    = col_num >> 3;
11885 
11886     __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]);
11887     __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]);
11888     __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]);
11889     __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]);
11890     __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]);
11891     __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]);
11892     __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]);
11893     __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]);
11894     __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]);
11895     __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]);
11896     __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]);
11897     __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]);
11898     __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]);
11899     __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]);
11900     __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]);
11901     __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]);
11902     __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]);
11903     __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]);
11904     __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]);
11905     __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]);
11906     __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]);
11907     __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]);
11908     __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]);
11909     __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]);
11910     __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]);
11911     __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]);
11912     __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]);
11913     __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]);
11914     __m256i cospi_m50 = _mm256_set1_epi32(-cospi[50]);
11915     __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]);
11916     __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]);
11917     __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]);
11918     __m256i cospi_m58 = _mm256_set1_epi32(-cospi[58]);
11919     __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]);
11920     __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]);
11921     __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]);
11922     __m256i cospi_m49 = _mm256_set1_epi32(-cospi[49]);
11923     __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]);
11924     __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]);
11925     __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]);
11926     __m256i cospi_m57 = _mm256_set1_epi32(-cospi[57]);
11927     __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]);
11928     __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]);
11929     __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]);
11930     __m256i cospi_m53 = _mm256_set1_epi32(-cospi[53]);
11931     __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]);
11932     __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]);
11933     __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]);
11934     __m256i cospi_m61 = _mm256_set1_epi32(-cospi[61]);
11935 
11936     for (int32_t col = 0; col < columns; col++) {
11937         const __m256i *in  = &input[col];
11938         __m256i *      out = &output[col];
11939 
11940         // stage 1
11941         __m256i x1[64];
11942         x1[0]  = _mm256_add_epi32(in[0 * stride], in[63 * stride]);
11943         x1[63] = _mm256_sub_epi32(in[0 * stride], in[63 * stride]);
11944         x1[1]  = _mm256_add_epi32(in[1 * stride], in[62 * stride]);
11945         x1[62] = _mm256_sub_epi32(in[1 * stride], in[62 * stride]);
11946         x1[2]  = _mm256_add_epi32(in[2 * stride], in[61 * stride]);
11947         x1[61] = _mm256_sub_epi32(in[2 * stride], in[61 * stride]);
11948         x1[3]  = _mm256_add_epi32(in[3 * stride], in[60 * stride]);
11949         x1[60] = _mm256_sub_epi32(in[3 * stride], in[60 * stride]);
11950         x1[4]  = _mm256_add_epi32(in[4 * stride], in[59 * stride]);
11951         x1[59] = _mm256_sub_epi32(in[4 * stride], in[59 * stride]);
11952         x1[5]  = _mm256_add_epi32(in[5 * stride], in[58 * stride]);
11953         x1[58] = _mm256_sub_epi32(in[5 * stride], in[58 * stride]);
11954         x1[6]  = _mm256_add_epi32(in[6 * stride], in[57 * stride]);
11955         x1[57] = _mm256_sub_epi32(in[6 * stride], in[57 * stride]);
11956         x1[7]  = _mm256_add_epi32(in[7 * stride], in[56 * stride]);
11957         x1[56] = _mm256_sub_epi32(in[7 * stride], in[56 * stride]);
11958         x1[8]  = _mm256_add_epi32(in[8 * stride], in[55 * stride]);
11959         x1[55] = _mm256_sub_epi32(in[8 * stride], in[55 * stride]);
11960         x1[9]  = _mm256_add_epi32(in[9 * stride], in[54 * stride]);
11961         x1[54] = _mm256_sub_epi32(in[9 * stride], in[54 * stride]);
11962         x1[10] = _mm256_add_epi32(in[10 * stride], in[53 * stride]);
11963         x1[53] = _mm256_sub_epi32(in[10 * stride], in[53 * stride]);
11964         x1[11] = _mm256_add_epi32(in[11 * stride], in[52 * stride]);
11965         x1[52] = _mm256_sub_epi32(in[11 * stride], in[52 * stride]);
11966         x1[12] = _mm256_add_epi32(in[12 * stride], in[51 * stride]);
11967         x1[51] = _mm256_sub_epi32(in[12 * stride], in[51 * stride]);
11968         x1[13] = _mm256_add_epi32(in[13 * stride], in[50 * stride]);
11969         x1[50] = _mm256_sub_epi32(in[13 * stride], in[50 * stride]);
11970         x1[14] = _mm256_add_epi32(in[14 * stride], in[49 * stride]);
11971         x1[49] = _mm256_sub_epi32(in[14 * stride], in[49 * stride]);
11972         x1[15] = _mm256_add_epi32(in[15 * stride], in[48 * stride]);
11973         x1[48] = _mm256_sub_epi32(in[15 * stride], in[48 * stride]);
11974         x1[16] = _mm256_add_epi32(in[16 * stride], in[47 * stride]);
11975         x1[47] = _mm256_sub_epi32(in[16 * stride], in[47 * stride]);
11976         x1[17] = _mm256_add_epi32(in[17 * stride], in[46 * stride]);
11977         x1[46] = _mm256_sub_epi32(in[17 * stride], in[46 * stride]);
11978         x1[18] = _mm256_add_epi32(in[18 * stride], in[45 * stride]);
11979         x1[45] = _mm256_sub_epi32(in[18 * stride], in[45 * stride]);
11980         x1[19] = _mm256_add_epi32(in[19 * stride], in[44 * stride]);
11981         x1[44] = _mm256_sub_epi32(in[19 * stride], in[44 * stride]);
11982         x1[20] = _mm256_add_epi32(in[20 * stride], in[43 * stride]);
11983         x1[43] = _mm256_sub_epi32(in[20 * stride], in[43 * stride]);
11984         x1[21] = _mm256_add_epi32(in[21 * stride], in[42 * stride]);
11985         x1[42] = _mm256_sub_epi32(in[21 * stride], in[42 * stride]);
11986         x1[22] = _mm256_add_epi32(in[22 * stride], in[41 * stride]);
11987         x1[41] = _mm256_sub_epi32(in[22 * stride], in[41 * stride]);
11988         x1[23] = _mm256_add_epi32(in[23 * stride], in[40 * stride]);
11989         x1[40] = _mm256_sub_epi32(in[23 * stride], in[40 * stride]);
11990         x1[24] = _mm256_add_epi32(in[24 * stride], in[39 * stride]);
11991         x1[39] = _mm256_sub_epi32(in[24 * stride], in[39 * stride]);
11992         x1[25] = _mm256_add_epi32(in[25 * stride], in[38 * stride]);
11993         x1[38] = _mm256_sub_epi32(in[25 * stride], in[38 * stride]);
11994         x1[26] = _mm256_add_epi32(in[26 * stride], in[37 * stride]);
11995         x1[37] = _mm256_sub_epi32(in[26 * stride], in[37 * stride]);
11996         x1[27] = _mm256_add_epi32(in[27 * stride], in[36 * stride]);
11997         x1[36] = _mm256_sub_epi32(in[27 * stride], in[36 * stride]);
11998         x1[28] = _mm256_add_epi32(in[28 * stride], in[35 * stride]);
11999         x1[35] = _mm256_sub_epi32(in[28 * stride], in[35 * stride]);
12000         x1[29] = _mm256_add_epi32(in[29 * stride], in[34 * stride]);
12001         x1[34] = _mm256_sub_epi32(in[29 * stride], in[34 * stride]);
12002         x1[30] = _mm256_add_epi32(in[30 * stride], in[33 * stride]);
12003         x1[33] = _mm256_sub_epi32(in[30 * stride], in[33 * stride]);
12004         x1[31] = _mm256_add_epi32(in[31 * stride], in[32 * stride]);
12005         x1[32] = _mm256_sub_epi32(in[31 * stride], in[32 * stride]);
12006 
12007         // stage 2
12008         __m256i x2[64];
12009         x2[0]  = _mm256_add_epi32(x1[0], x1[31]);
12010         x2[31] = _mm256_sub_epi32(x1[0], x1[31]);
12011         x2[1]  = _mm256_add_epi32(x1[1], x1[30]);
12012         x2[30] = _mm256_sub_epi32(x1[1], x1[30]);
12013         x2[2]  = _mm256_add_epi32(x1[2], x1[29]);
12014         x2[29] = _mm256_sub_epi32(x1[2], x1[29]);
12015         x2[3]  = _mm256_add_epi32(x1[3], x1[28]);
12016         x2[28] = _mm256_sub_epi32(x1[3], x1[28]);
12017         x2[4]  = _mm256_add_epi32(x1[4], x1[27]);
12018         x2[27] = _mm256_sub_epi32(x1[4], x1[27]);
12019         x2[5]  = _mm256_add_epi32(x1[5], x1[26]);
12020         x2[26] = _mm256_sub_epi32(x1[5], x1[26]);
12021         x2[6]  = _mm256_add_epi32(x1[6], x1[25]);
12022         x2[25] = _mm256_sub_epi32(x1[6], x1[25]);
12023         x2[7]  = _mm256_add_epi32(x1[7], x1[24]);
12024         x2[24] = _mm256_sub_epi32(x1[7], x1[24]);
12025         x2[8]  = _mm256_add_epi32(x1[8], x1[23]);
12026         x2[23] = _mm256_sub_epi32(x1[8], x1[23]);
12027         x2[9]  = _mm256_add_epi32(x1[9], x1[22]);
12028         x2[22] = _mm256_sub_epi32(x1[9], x1[22]);
12029         x2[10] = _mm256_add_epi32(x1[10], x1[21]);
12030         x2[21] = _mm256_sub_epi32(x1[10], x1[21]);
12031         x2[11] = _mm256_add_epi32(x1[11], x1[20]);
12032         x2[20] = _mm256_sub_epi32(x1[11], x1[20]);
12033         x2[12] = _mm256_add_epi32(x1[12], x1[19]);
12034         x2[19] = _mm256_sub_epi32(x1[12], x1[19]);
12035         x2[13] = _mm256_add_epi32(x1[13], x1[18]);
12036         x2[18] = _mm256_sub_epi32(x1[13], x1[18]);
12037         x2[14] = _mm256_add_epi32(x1[14], x1[17]);
12038         x2[17] = _mm256_sub_epi32(x1[14], x1[17]);
12039         x2[15] = _mm256_add_epi32(x1[15], x1[16]);
12040         x2[16] = _mm256_sub_epi32(x1[15], x1[16]);
12041         x2[32] = x1[32];
12042         x2[33] = x1[33];
12043         x2[34] = x1[34];
12044         x2[35] = x1[35];
12045         x2[36] = x1[36];
12046         x2[37] = x1[37];
12047         x2[38] = x1[38];
12048         x2[39] = x1[39];
12049         btf_32_type0_avx2_new(
12050             cospi_m32, cospi_p32, x1[40], x1[55], x2[40], x2[55], __rounding, cos_bit);
12051         btf_32_type0_avx2_new(
12052             cospi_m32, cospi_p32, x1[41], x1[54], x2[41], x2[54], __rounding, cos_bit);
12053         btf_32_type0_avx2_new(
12054             cospi_m32, cospi_p32, x1[42], x1[53], x2[42], x2[53], __rounding, cos_bit);
12055         btf_32_type0_avx2_new(
12056             cospi_m32, cospi_p32, x1[43], x1[52], x2[43], x2[52], __rounding, cos_bit);
12057         btf_32_type0_avx2_new(
12058             cospi_m32, cospi_p32, x1[44], x1[51], x2[44], x2[51], __rounding, cos_bit);
12059         btf_32_type0_avx2_new(
12060             cospi_m32, cospi_p32, x1[45], x1[50], x2[45], x2[50], __rounding, cos_bit);
12061         btf_32_type0_avx2_new(
12062             cospi_m32, cospi_p32, x1[46], x1[49], x2[46], x2[49], __rounding, cos_bit);
12063         btf_32_type0_avx2_new(
12064             cospi_m32, cospi_p32, x1[47], x1[48], x2[47], x2[48], __rounding, cos_bit);
12065         x2[56] = x1[56];
12066         x2[57] = x1[57];
12067         x2[58] = x1[58];
12068         x2[59] = x1[59];
12069         x2[60] = x1[60];
12070         x2[61] = x1[61];
12071         x2[62] = x1[62];
12072         x2[63] = x1[63];
12073 
12074         // stage 3
12075         __m256i x3[64];
12076         x3[0]  = _mm256_add_epi32(x2[0], x2[15]);
12077         x3[15] = _mm256_sub_epi32(x2[0], x2[15]);
12078         x3[1]  = _mm256_add_epi32(x2[1], x2[14]);
12079         x3[14] = _mm256_sub_epi32(x2[1], x2[14]);
12080         x3[2]  = _mm256_add_epi32(x2[2], x2[13]);
12081         x3[13] = _mm256_sub_epi32(x2[2], x2[13]);
12082         x3[3]  = _mm256_add_epi32(x2[3], x2[12]);
12083         x3[12] = _mm256_sub_epi32(x2[3], x2[12]);
12084         x3[4]  = _mm256_add_epi32(x2[4], x2[11]);
12085         x3[11] = _mm256_sub_epi32(x2[4], x2[11]);
12086         x3[5]  = _mm256_add_epi32(x2[5], x2[10]);
12087         x3[10] = _mm256_sub_epi32(x2[5], x2[10]);
12088         x3[6]  = _mm256_add_epi32(x2[6], x2[9]);
12089         x3[9]  = _mm256_sub_epi32(x2[6], x2[9]);
12090         x3[7]  = _mm256_add_epi32(x2[7], x2[8]);
12091         x3[8]  = _mm256_sub_epi32(x2[7], x2[8]);
12092         x3[16] = x2[16];
12093         x3[17] = x2[17];
12094         x3[18] = x2[18];
12095         x3[19] = x2[19];
12096         btf_32_type0_avx2_new(
12097             cospi_m32, cospi_p32, x2[20], x2[27], x3[20], x3[27], __rounding, cos_bit);
12098         btf_32_type0_avx2_new(
12099             cospi_m32, cospi_p32, x2[21], x2[26], x3[21], x3[26], __rounding, cos_bit);
12100         btf_32_type0_avx2_new(
12101             cospi_m32, cospi_p32, x2[22], x2[25], x3[22], x3[25], __rounding, cos_bit);
12102         btf_32_type0_avx2_new(
12103             cospi_m32, cospi_p32, x2[23], x2[24], x3[23], x3[24], __rounding, cos_bit);
12104         x3[28] = x2[28];
12105         x3[29] = x2[29];
12106         x3[30] = x2[30];
12107         x3[31] = x2[31];
12108         x3[32] = _mm256_add_epi32(x2[32], x2[47]);
12109         x3[47] = _mm256_sub_epi32(x2[32], x2[47]);
12110         x3[33] = _mm256_add_epi32(x2[33], x2[46]);
12111         x3[46] = _mm256_sub_epi32(x2[33], x2[46]);
12112         x3[34] = _mm256_add_epi32(x2[34], x2[45]);
12113         x3[45] = _mm256_sub_epi32(x2[34], x2[45]);
12114         x3[35] = _mm256_add_epi32(x2[35], x2[44]);
12115         x3[44] = _mm256_sub_epi32(x2[35], x2[44]);
12116         x3[36] = _mm256_add_epi32(x2[36], x2[43]);
12117         x3[43] = _mm256_sub_epi32(x2[36], x2[43]);
12118         x3[37] = _mm256_add_epi32(x2[37], x2[42]);
12119         x3[42] = _mm256_sub_epi32(x2[37], x2[42]);
12120         x3[38] = _mm256_add_epi32(x2[38], x2[41]);
12121         x3[41] = _mm256_sub_epi32(x2[38], x2[41]);
12122         x3[39] = _mm256_add_epi32(x2[39], x2[40]);
12123         x3[40] = _mm256_sub_epi32(x2[39], x2[40]);
12124         x3[48] = _mm256_sub_epi32(x2[63], x2[48]);
12125         x3[63] = _mm256_add_epi32(x2[63], x2[48]);
12126         x3[49] = _mm256_sub_epi32(x2[62], x2[49]);
12127         x3[62] = _mm256_add_epi32(x2[62], x2[49]);
12128         x3[50] = _mm256_sub_epi32(x2[61], x2[50]);
12129         x3[61] = _mm256_add_epi32(x2[61], x2[50]);
12130         x3[51] = _mm256_sub_epi32(x2[60], x2[51]);
12131         x3[60] = _mm256_add_epi32(x2[60], x2[51]);
12132         x3[52] = _mm256_sub_epi32(x2[59], x2[52]);
12133         x3[59] = _mm256_add_epi32(x2[59], x2[52]);
12134         x3[53] = _mm256_sub_epi32(x2[58], x2[53]);
12135         x3[58] = _mm256_add_epi32(x2[58], x2[53]);
12136         x3[54] = _mm256_sub_epi32(x2[57], x2[54]);
12137         x3[57] = _mm256_add_epi32(x2[57], x2[54]);
12138         x3[55] = _mm256_sub_epi32(x2[56], x2[55]);
12139         x3[56] = _mm256_add_epi32(x2[56], x2[55]);
12140 
12141         // stage 4
12142         __m256i x4[64];
12143         x4[0] = _mm256_add_epi32(x3[0], x3[7]);
12144         x4[7] = _mm256_sub_epi32(x3[0], x3[7]);
12145         x4[1] = _mm256_add_epi32(x3[1], x3[6]);
12146         x4[6] = _mm256_sub_epi32(x3[1], x3[6]);
12147         x4[2] = _mm256_add_epi32(x3[2], x3[5]);
12148         x4[5] = _mm256_sub_epi32(x3[2], x3[5]);
12149         x4[3] = _mm256_add_epi32(x3[3], x3[4]);
12150         x4[4] = _mm256_sub_epi32(x3[3], x3[4]);
12151         x4[8] = x3[8];
12152         x4[9] = x3[9];
12153         btf_32_type0_avx2_new(
12154             cospi_m32, cospi_p32, x3[10], x3[13], x4[10], x4[13], __rounding, cos_bit);
12155         btf_32_type0_avx2_new(
12156             cospi_m32, cospi_p32, x3[11], x3[12], x4[11], x4[12], __rounding, cos_bit);
12157         x4[14] = x3[14];
12158         x4[15] = x3[15];
12159         x4[16] = _mm256_add_epi32(x3[16], x3[23]);
12160         x4[23] = _mm256_sub_epi32(x3[16], x3[23]);
12161         x4[17] = _mm256_add_epi32(x3[17], x3[22]);
12162         x4[22] = _mm256_sub_epi32(x3[17], x3[22]);
12163         x4[18] = _mm256_add_epi32(x3[18], x3[21]);
12164         x4[21] = _mm256_sub_epi32(x3[18], x3[21]);
12165         x4[19] = _mm256_add_epi32(x3[19], x3[20]);
12166         x4[20] = _mm256_sub_epi32(x3[19], x3[20]);
12167         x4[24] = _mm256_sub_epi32(x3[31], x3[24]);
12168         x4[31] = _mm256_add_epi32(x3[31], x3[24]);
12169         x4[25] = _mm256_sub_epi32(x3[30], x3[25]);
12170         x4[30] = _mm256_add_epi32(x3[30], x3[25]);
12171         x4[26] = _mm256_sub_epi32(x3[29], x3[26]);
12172         x4[29] = _mm256_add_epi32(x3[29], x3[26]);
12173         x4[27] = _mm256_sub_epi32(x3[28], x3[27]);
12174         x4[28] = _mm256_add_epi32(x3[28], x3[27]);
12175         x4[32] = x3[32];
12176         x4[33] = x3[33];
12177         x4[34] = x3[34];
12178         x4[35] = x3[35];
12179         btf_32_type0_avx2_new(
12180             cospi_m16, cospi_p48, x3[36], x3[59], x4[36], x4[59], __rounding, cos_bit);
12181         btf_32_type0_avx2_new(
12182             cospi_m16, cospi_p48, x3[37], x3[58], x4[37], x4[58], __rounding, cos_bit);
12183         btf_32_type0_avx2_new(
12184             cospi_m16, cospi_p48, x3[38], x3[57], x4[38], x4[57], __rounding, cos_bit);
12185         btf_32_type0_avx2_new(
12186             cospi_m16, cospi_p48, x3[39], x3[56], x4[39], x4[56], __rounding, cos_bit);
12187         btf_32_type0_avx2_new(
12188             cospi_m48, cospi_m16, x3[40], x3[55], x4[40], x4[55], __rounding, cos_bit);
12189         btf_32_type0_avx2_new(
12190             cospi_m48, cospi_m16, x3[41], x3[54], x4[41], x4[54], __rounding, cos_bit);
12191         btf_32_type0_avx2_new(
12192             cospi_m48, cospi_m16, x3[42], x3[53], x4[42], x4[53], __rounding, cos_bit);
12193         btf_32_type0_avx2_new(
12194             cospi_m48, cospi_m16, x3[43], x3[52], x4[43], x4[52], __rounding, cos_bit);
12195         x4[44] = x3[44];
12196         x4[45] = x3[45];
12197         x4[46] = x3[46];
12198         x4[47] = x3[47];
12199         x4[48] = x3[48];
12200         x4[49] = x3[49];
12201         x4[50] = x3[50];
12202         x4[51] = x3[51];
12203         x4[60] = x3[60];
12204         x4[61] = x3[61];
12205         x4[62] = x3[62];
12206         x4[63] = x3[63];
12207 
12208         // stage 5
12209         __m256i x5[64];
12210         x5[0] = _mm256_add_epi32(x4[0], x4[3]);
12211         x5[1] = _mm256_add_epi32(x4[1], x4[2]);
12212         x5[4] = x4[4];
12213         btf_32_type0_avx2_new(
12214             cospi_m32, cospi_p32, x4[5], x4[6], x5[5], x5[6], __rounding, cos_bit);
12215         x5[7]  = x4[7];
12216         x5[8]  = _mm256_add_epi32(x4[8], x4[11]);
12217         x5[11] = _mm256_sub_epi32(x4[8], x4[11]);
12218         x5[9]  = _mm256_add_epi32(x4[9], x4[10]);
12219         x5[10] = _mm256_sub_epi32(x4[9], x4[10]);
12220         x5[12] = _mm256_sub_epi32(x4[15], x4[12]);
12221         x5[15] = _mm256_add_epi32(x4[15], x4[12]);
12222         x5[13] = _mm256_sub_epi32(x4[14], x4[13]);
12223         x5[14] = _mm256_add_epi32(x4[14], x4[13]);
12224         x5[16] = x4[16];
12225         x5[17] = x4[17];
12226         btf_32_type0_avx2_new(
12227             cospi_m16, cospi_p48, x4[18], x4[29], x5[18], x5[29], __rounding, cos_bit);
12228         btf_32_type0_avx2_new(
12229             cospi_m16, cospi_p48, x4[19], x4[28], x5[19], x5[28], __rounding, cos_bit);
12230         btf_32_type0_avx2_new(
12231             cospi_m48, cospi_m16, x4[20], x4[27], x5[20], x5[27], __rounding, cos_bit);
12232         btf_32_type0_avx2_new(
12233             cospi_m48, cospi_m16, x4[21], x4[26], x5[21], x5[26], __rounding, cos_bit);
12234         x5[22] = x4[22];
12235         x5[23] = x4[23];
12236         x5[24] = x4[24];
12237         x5[25] = x4[25];
12238         x5[30] = x4[30];
12239         x5[31] = x4[31];
12240         x5[32] = _mm256_add_epi32(x4[32], x4[39]);
12241         x5[39] = _mm256_sub_epi32(x4[32], x4[39]);
12242         x5[33] = _mm256_add_epi32(x4[33], x4[38]);
12243         x5[38] = _mm256_sub_epi32(x4[33], x4[38]);
12244         x5[34] = _mm256_add_epi32(x4[34], x4[37]);
12245         x5[37] = _mm256_sub_epi32(x4[34], x4[37]);
12246         x5[35] = _mm256_add_epi32(x4[35], x4[36]);
12247         x5[36] = _mm256_sub_epi32(x4[35], x4[36]);
12248         x5[40] = _mm256_sub_epi32(x4[47], x4[40]);
12249         x5[47] = _mm256_add_epi32(x4[47], x4[40]);
12250         x5[41] = _mm256_sub_epi32(x4[46], x4[41]);
12251         x5[46] = _mm256_add_epi32(x4[46], x4[41]);
12252         x5[42] = _mm256_sub_epi32(x4[45], x4[42]);
12253         x5[45] = _mm256_add_epi32(x4[45], x4[42]);
12254         x5[43] = _mm256_sub_epi32(x4[44], x4[43]);
12255         x5[44] = _mm256_add_epi32(x4[44], x4[43]);
12256         x5[48] = _mm256_add_epi32(x4[48], x4[55]);
12257         x5[55] = _mm256_sub_epi32(x4[48], x4[55]);
12258         x5[49] = _mm256_add_epi32(x4[49], x4[54]);
12259         x5[54] = _mm256_sub_epi32(x4[49], x4[54]);
12260         x5[50] = _mm256_add_epi32(x4[50], x4[53]);
12261         x5[53] = _mm256_sub_epi32(x4[50], x4[53]);
12262         x5[51] = _mm256_add_epi32(x4[51], x4[52]);
12263         x5[52] = _mm256_sub_epi32(x4[51], x4[52]);
12264         x5[56] = _mm256_sub_epi32(x4[63], x4[56]);
12265         x5[63] = _mm256_add_epi32(x4[63], x4[56]);
12266         x5[57] = _mm256_sub_epi32(x4[62], x4[57]);
12267         x5[62] = _mm256_add_epi32(x4[62], x4[57]);
12268         x5[58] = _mm256_sub_epi32(x4[61], x4[58]);
12269         x5[61] = _mm256_add_epi32(x4[61], x4[58]);
12270         x5[59] = _mm256_sub_epi32(x4[60], x4[59]);
12271         x5[60] = _mm256_add_epi32(x4[60], x4[59]);
12272 
12273         // stage 6
12274         __m256i x6[64];
12275         out[0 * stride] = half_btf_avx2(
12276             &cospi_p32, &x5[0], &cospi_p32, &x5[1], &__rounding, cos_bit);
12277         x6[4] = _mm256_add_epi32(x5[4], x5[5]);
12278         x6[7] = _mm256_add_epi32(x5[7], x5[6]);
12279         x6[8] = x5[8];
12280         btf_32_type0_avx2_new(
12281             cospi_m16, cospi_p48, x5[9], x5[14], x6[9], x6[14], __rounding, cos_bit);
12282         btf_32_type0_avx2_new(
12283             cospi_m48, cospi_m16, x5[10], x5[13], x6[10], x6[13], __rounding, cos_bit);
12284         x6[11] = x5[11];
12285         x6[12] = x5[12];
12286         x6[15] = x5[15];
12287         x6[16] = _mm256_add_epi32(x5[16], x5[19]);
12288         x6[19] = _mm256_sub_epi32(x5[16], x5[19]);
12289         x6[17] = _mm256_add_epi32(x5[17], x5[18]);
12290         x6[18] = _mm256_sub_epi32(x5[17], x5[18]);
12291         x6[20] = _mm256_sub_epi32(x5[23], x5[20]);
12292         x6[23] = _mm256_add_epi32(x5[23], x5[20]);
12293         x6[21] = _mm256_sub_epi32(x5[22], x5[21]);
12294         x6[22] = _mm256_add_epi32(x5[22], x5[21]);
12295         x6[24] = _mm256_add_epi32(x5[24], x5[27]);
12296         x6[27] = _mm256_sub_epi32(x5[24], x5[27]);
12297         x6[25] = _mm256_add_epi32(x5[25], x5[26]);
12298         x6[26] = _mm256_sub_epi32(x5[25], x5[26]);
12299         x6[28] = _mm256_sub_epi32(x5[31], x5[28]);
12300         x6[31] = _mm256_add_epi32(x5[31], x5[28]);
12301         x6[29] = _mm256_sub_epi32(x5[30], x5[29]);
12302         x6[30] = _mm256_add_epi32(x5[30], x5[29]);
12303         x6[32] = x5[32];
12304         x6[33] = x5[33];
12305         btf_32_type0_avx2_new(
12306             cospi_m08, cospi_p56, x5[34], x5[61], x6[34], x6[61], __rounding, cos_bit);
12307         btf_32_type0_avx2_new(
12308             cospi_m08, cospi_p56, x5[35], x5[60], x6[35], x6[60], __rounding, cos_bit);
12309         btf_32_type0_avx2_new(
12310             cospi_m56, cospi_m08, x5[36], x5[59], x6[36], x6[59], __rounding, cos_bit);
12311         btf_32_type0_avx2_new(
12312             cospi_m56, cospi_m08, x5[37], x5[58], x6[37], x6[58], __rounding, cos_bit);
12313         x6[38] = x5[38];
12314         x6[39] = x5[39];
12315         x6[40] = x5[40];
12316         x6[41] = x5[41];
12317         btf_32_type0_avx2_new(
12318             cospi_m40, cospi_p24, x5[42], x5[53], x6[42], x6[53], __rounding, cos_bit);
12319         btf_32_type0_avx2_new(
12320             cospi_m40, cospi_p24, x5[43], x5[52], x6[43], x6[52], __rounding, cos_bit);
12321         btf_32_type0_avx2_new(
12322             cospi_m24, cospi_m40, x5[44], x5[51], x6[44], x6[51], __rounding, cos_bit);
12323         btf_32_type0_avx2_new(
12324             cospi_m24, cospi_m40, x5[45], x5[50], x6[45], x6[50], __rounding, cos_bit);
12325         x6[46] = x5[46];
12326         x6[47] = x5[47];
12327         x6[48] = x5[48];
12328         x6[49] = x5[49];
12329         x6[54] = x5[54];
12330         x6[55] = x5[55];
12331         x6[56] = x5[56];
12332         x6[57] = x5[57];
12333         x6[62] = x5[62];
12334         x6[63] = x5[63];
12335 
12336         // stage 7
12337         __m256i x7[64];
12338         out[8 * stride] = half_btf_avx2(
12339             &cospi_p56, &x6[4], &cospi_p08, &x6[7], &__rounding, cos_bit);
12340         x7[8]  = _mm256_add_epi32(x6[8], x6[9]);
12341         x7[11] = _mm256_add_epi32(x6[11], x6[10]);
12342         x7[12] = _mm256_add_epi32(x6[12], x6[13]);
12343         x7[15] = _mm256_add_epi32(x6[15], x6[14]);
12344         x7[16] = x6[16];
12345         btf_32_type0_avx2_new(
12346             cospi_m08, cospi_p56, x6[17], x6[30], x7[17], x7[30], __rounding, cos_bit);
12347         btf_32_type0_avx2_new(
12348             cospi_m56, cospi_m08, x6[18], x6[29], x7[18], x7[29], __rounding, cos_bit);
12349         x7[19] = x6[19];
12350         x7[20] = x6[20];
12351         btf_32_type0_avx2_new(
12352             cospi_m40, cospi_p24, x6[21], x6[26], x7[21], x7[26], __rounding, cos_bit);
12353         btf_32_type0_avx2_new(
12354             cospi_m24, cospi_m40, x6[22], x6[25], x7[22], x7[25], __rounding, cos_bit);
12355         x7[23] = x6[23];
12356         x7[24] = x6[24];
12357         x7[27] = x6[27];
12358         x7[28] = x6[28];
12359         x7[31] = x6[31];
12360         x7[32] = _mm256_add_epi32(x6[32], x6[35]);
12361         x7[35] = _mm256_sub_epi32(x6[32], x6[35]);
12362         x7[33] = _mm256_add_epi32(x6[33], x6[34]);
12363         x7[34] = _mm256_sub_epi32(x6[33], x6[34]);
12364         x7[36] = _mm256_sub_epi32(x6[39], x6[36]);
12365         x7[39] = _mm256_add_epi32(x6[39], x6[36]);
12366         x7[37] = _mm256_sub_epi32(x6[38], x6[37]);
12367         x7[38] = _mm256_add_epi32(x6[38], x6[37]);
12368         x7[40] = _mm256_add_epi32(x6[40], x6[43]);
12369         x7[43] = _mm256_sub_epi32(x6[40], x6[43]);
12370         x7[41] = _mm256_add_epi32(x6[41], x6[42]);
12371         x7[42] = _mm256_sub_epi32(x6[41], x6[42]);
12372         x7[44] = _mm256_sub_epi32(x6[47], x6[44]);
12373         x7[47] = _mm256_add_epi32(x6[47], x6[44]);
12374         x7[45] = _mm256_sub_epi32(x6[46], x6[45]);
12375         x7[46] = _mm256_add_epi32(x6[46], x6[45]);
12376         x7[48] = _mm256_add_epi32(x6[48], x6[51]);
12377         x7[51] = _mm256_sub_epi32(x6[48], x6[51]);
12378         x7[49] = _mm256_add_epi32(x6[49], x6[50]);
12379         x7[50] = _mm256_sub_epi32(x6[49], x6[50]);
12380         x7[52] = _mm256_sub_epi32(x6[55], x6[52]);
12381         x7[55] = _mm256_add_epi32(x6[55], x6[52]);
12382         x7[53] = _mm256_sub_epi32(x6[54], x6[53]);
12383         x7[54] = _mm256_add_epi32(x6[54], x6[53]);
12384         x7[56] = _mm256_add_epi32(x6[56], x6[59]);
12385         x7[59] = _mm256_sub_epi32(x6[56], x6[59]);
12386         x7[57] = _mm256_add_epi32(x6[57], x6[58]);
12387         x7[58] = _mm256_sub_epi32(x6[57], x6[58]);
12388         x7[60] = _mm256_sub_epi32(x6[63], x6[60]);
12389         x7[63] = _mm256_add_epi32(x6[63], x6[60]);
12390         x7[61] = _mm256_sub_epi32(x6[62], x6[61]);
12391         x7[62] = _mm256_add_epi32(x6[62], x6[61]);
12392 
12393         // stage 8
12394         __m256i x8[40];
12395         out[4 * stride] = half_btf_avx2(
12396             &cospi_p60, &x7[8], &cospi_p04, &x7[15], &__rounding, cos_bit);
12397         out[12 * stride] = half_btf_avx2(
12398             &cospi_p12, &x7[12], &cospi_m52, &x7[11], &__rounding, cos_bit);
12399         x8[0] = _mm256_add_epi32(x7[16], x7[17]);
12400         x8[1] = _mm256_add_epi32(x7[19], x7[18]);
12401         x8[2] = _mm256_add_epi32(x7[20], x7[21]);
12402         x8[3] = _mm256_add_epi32(x7[23], x7[22]);
12403         x8[4] = _mm256_add_epi32(x7[24], x7[25]);
12404         x8[5] = _mm256_add_epi32(x7[27], x7[26]);
12405         x8[6] = _mm256_add_epi32(x7[28], x7[29]);
12406         x8[7] = _mm256_add_epi32(x7[31], x7[30]);
12407         x8[8] = x7[32];
12408         btf_32_type0_avx2_new(
12409             cospi_m04, cospi_p60, x7[33], x7[62], x8[9], x8[32], __rounding, cos_bit);
12410         btf_32_type0_avx2_new(
12411             cospi_m60, cospi_m04, x7[34], x7[61], x8[10], x8[33], __rounding, cos_bit);
12412         x8[11] = x7[35];
12413         x8[12] = x7[36];
12414         btf_32_type0_avx2_new(
12415             cospi_m36, cospi_p28, x7[37], x7[58], x8[13], x8[34], __rounding, cos_bit);
12416         btf_32_type0_avx2_new(
12417             cospi_m28, cospi_m36, x7[38], x7[57], x8[14], x8[35], __rounding, cos_bit);
12418         x8[15] = x7[39];
12419         x8[16] = x7[40];
12420         btf_32_type0_avx2_new(
12421             cospi_m20, cospi_p44, x7[41], x7[54], x8[17], x8[36], __rounding, cos_bit);
12422         btf_32_type0_avx2_new(
12423             cospi_m44, cospi_m20, x7[42], x7[53], x8[18], x8[37], __rounding, cos_bit);
12424         x8[19] = x7[43];
12425         x8[20] = x7[44];
12426         btf_32_type0_avx2_new(
12427             cospi_m52, cospi_p12, x7[45], x7[50], x8[21], x8[38], __rounding, cos_bit);
12428         btf_32_type0_avx2_new(
12429             cospi_m12, cospi_m52, x7[46], x7[49], x8[22], x8[39], __rounding, cos_bit);
12430         x8[23] = x7[47];
12431         x8[24] = x7[48];
12432         x8[25] = x7[51];
12433         x8[26] = x7[52];
12434         x8[27] = x7[55];
12435         x8[28] = x7[56];
12436         x8[29] = x7[59];
12437         x8[30] = x7[60];
12438         x8[31] = x7[63];
12439 
12440         // stage 9
12441         __m256i x9[16];
12442         out[2 * stride] = half_btf_avx2(
12443             &cospi_p62, &x8[0], &cospi_p02, &x8[7], &__rounding, cos_bit);
12444         out[14 * stride] = half_btf_avx2(
12445             &cospi_p14, &x8[6], &cospi_m50, &x8[1], &__rounding, cos_bit);
12446         out[10 * stride] = half_btf_avx2(
12447             &cospi_p54, &x8[2], &cospi_p10, &x8[5], &__rounding, cos_bit);
12448         out[6 * stride] = half_btf_avx2(
12449             &cospi_p06, &x8[4], &cospi_m58, &x8[3], &__rounding, cos_bit);
12450         x9[0]  = _mm256_add_epi32(x8[8], x8[9]);
12451         x9[1]  = _mm256_add_epi32(x8[11], x8[10]);
12452         x9[2]  = _mm256_add_epi32(x8[12], x8[13]);
12453         x9[3]  = _mm256_add_epi32(x8[15], x8[14]);
12454         x9[4]  = _mm256_add_epi32(x8[16], x8[17]);
12455         x9[5]  = _mm256_add_epi32(x8[19], x8[18]);
12456         x9[6]  = _mm256_add_epi32(x8[20], x8[21]);
12457         x9[7]  = _mm256_add_epi32(x8[23], x8[22]);
12458         x9[8]  = _mm256_add_epi32(x8[24], x8[39]);
12459         x9[9]  = _mm256_add_epi32(x8[25], x8[38]);
12460         x9[10] = _mm256_add_epi32(x8[26], x8[37]);
12461         x9[11] = _mm256_add_epi32(x8[27], x8[36]);
12462         x9[12] = _mm256_add_epi32(x8[28], x8[35]);
12463         x9[13] = _mm256_add_epi32(x8[29], x8[34]);
12464         x9[14] = _mm256_add_epi32(x8[30], x8[33]);
12465         x9[15] = _mm256_add_epi32(x8[31], x8[32]);
12466 
12467         // stage 10
12468         out[1 * stride] = half_btf_avx2(
12469             &cospi_p63, &x9[0], &cospi_p01, &x9[15], &__rounding, cos_bit);
12470         out[15 * stride] = half_btf_avx2(
12471             &cospi_p15, &x9[14], &cospi_m49, &x9[1], &__rounding, cos_bit);
12472         out[9 * stride] = half_btf_avx2(
12473             &cospi_p55, &x9[2], &cospi_p09, &x9[13], &__rounding, cos_bit);
12474         out[7 * stride] = half_btf_avx2(
12475             &cospi_p07, &x9[12], &cospi_m57, &x9[3], &__rounding, cos_bit);
12476         out[5 * stride] = half_btf_avx2(
12477             &cospi_p59, &x9[4], &cospi_p05, &x9[11], &__rounding, cos_bit);
12478         out[11 * stride] = half_btf_avx2(
12479             &cospi_p11, &x9[10], &cospi_m53, &x9[5], &__rounding, cos_bit);
12480         out[13 * stride] = half_btf_avx2(
12481             &cospi_p51, &x9[6], &cospi_p13, &x9[9], &__rounding, cos_bit);
12482         out[3 * stride] = half_btf_avx2(
12483             &cospi_p03, &x9[8], &cospi_m61, &x9[7], &__rounding, cos_bit);
12484     }
12485 }
12486 
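// Identity transform for the 64x64 N4 path: each retained coefficient is scaled by
// 4 * new_sqrt2 (4 * sqrt(2) in Q12) with rounding. Only the first two of every eight
// registers in the first 128 are written, which corresponds to the quarter-width,
// quarter-height region kept by the N4 scheme; the remaining outputs are left untouched.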
12487 static void fidtx64x64_N4_avx2(const __m256i *input, __m256i *output) {
12488     const int32_t bits     = 12; // new_sqrt2_bits = 12
12489     const int32_t sqrt     = 4 * 5793; // 4 * new_sqrt2
12490     const __m256i newsqrt  = _mm256_set1_epi32(sqrt);
12491     const __m256i rounding = _mm256_set1_epi32(1 << (bits - 1));
12492 
12493     __m256i temp;
12494     for (int32_t i = 0; i < 128; i += 8) {
12495         temp          = _mm256_mullo_epi32(input[i], newsqrt);
12496         temp          = _mm256_add_epi32(temp, rounding);
12497         output[i]     = _mm256_srai_epi32(temp, bits);
12498         temp          = _mm256_mullo_epi32(input[i + 1], newsqrt);
12499         temp          = _mm256_add_epi32(temp, rounding);
12500         output[i + 1] = _mm256_srai_epi32(temp, bits);
12501     }
12502 }
12503 
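// 16-point forward DCT over four 32-bit columns (handled as __m128i lanes). In this N4
// variant only the four lowest-frequency results out[0]..out[3] are computed; the rest of
// the 16-point butterfly is skipped.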
12504 static void fdct16x4_N4_avx2(__m256i *input, __m256i *output, int32_t bit) {
12505     __m128i *in  = (__m128i *)input;
12506     __m128i *out = (__m128i *)output;
12507 
12508     const int32_t *cospi    = cospi_arr(bit);
12509     const __m128i  cospi32  = _mm_set1_epi32(cospi[32]);
12510     const __m128i  cospim32 = _mm_set1_epi32(-cospi[32]);
12511     const __m128i  cospi48  = _mm_set1_epi32(cospi[48]);
12512     const __m128i  cospim48 = _mm_set1_epi32(-cospi[48]);
12513     const __m128i  cospim16 = _mm_set1_epi32(-cospi[16]);
12514     const __m128i  cospi56  = _mm_set1_epi32(cospi[56]);
12515     const __m128i  cospi8   = _mm_set1_epi32(cospi[8]);
12516     const __m128i  cospi60  = _mm_set1_epi32(cospi[60]);
12517     const __m128i  cospi4   = _mm_set1_epi32(cospi[4]);
12518     const __m128i  cospi12  = _mm_set1_epi32(cospi[12]);
12519     const __m128i  cospi52  = _mm_set1_epi32(cospi[52]);
12520     const __m128i  rnding   = _mm_set1_epi32(1 << (bit - 1));
12521     __m128i        u[16], v[16], x;
12522 
12523     // stage 0
12524     // stage 1
12525     u[0]  = _mm_add_epi32(in[0], in[15]);
12526     v[15] = _mm_sub_epi32(in[0], in[15]);
12527     u[1]  = _mm_add_epi32(in[1], in[14]);
12528     v[14] = _mm_sub_epi32(in[1], in[14]);
12529     u[2]  = _mm_add_epi32(in[2], in[13]);
12530     u[13] = _mm_sub_epi32(in[2], in[13]);
12531     u[3]  = _mm_add_epi32(in[3], in[12]);
12532     u[12] = _mm_sub_epi32(in[3], in[12]);
12533     u[4]  = _mm_add_epi32(in[4], in[11]);
12534     u[11] = _mm_sub_epi32(in[4], in[11]);
12535     u[5]  = _mm_add_epi32(in[5], in[10]);
12536     u[10] = _mm_sub_epi32(in[5], in[10]);
12537     u[6]  = _mm_add_epi32(in[6], in[9]);
12538     v[9]  = _mm_sub_epi32(in[6], in[9]);
12539     u[7]  = _mm_add_epi32(in[7], in[8]);
12540     v[8]  = _mm_sub_epi32(in[7], in[8]);
12541 
12542     // stage 2
12543     v[0] = _mm_add_epi32(u[0], u[7]);
12544     u[7] = _mm_sub_epi32(u[0], u[7]);
12545     v[1] = _mm_add_epi32(u[1], u[6]);
12546     v[6] = _mm_sub_epi32(u[1], u[6]);
12547     v[2] = _mm_add_epi32(u[2], u[5]);
12548     v[5] = _mm_sub_epi32(u[2], u[5]);
12549     v[3] = _mm_add_epi32(u[3], u[4]);
12550     u[4] = _mm_sub_epi32(u[3], u[4]);
12551 
12552     v[10] = _mm_mullo_epi32(u[10], cospim32);
12553     x     = _mm_mullo_epi32(u[13], cospi32);
12554     v[10] = _mm_add_epi32(v[10], x);
12555     v[10] = _mm_add_epi32(v[10], rnding);
12556     v[10] = _mm_srai_epi32(v[10], bit);
12557 
12558     v[13] = _mm_mullo_epi32(u[10], cospi32);
12559     x     = _mm_mullo_epi32(u[13], cospim32);
12560     v[13] = _mm_sub_epi32(v[13], x);
12561     v[13] = _mm_add_epi32(v[13], rnding);
12562     v[13] = _mm_srai_epi32(v[13], bit);
12563 
12564     v[11] = _mm_mullo_epi32(u[11], cospim32);
12565     x     = _mm_mullo_epi32(u[12], cospi32);
12566     v[11] = _mm_add_epi32(v[11], x);
12567     v[11] = _mm_add_epi32(v[11], rnding);
12568     v[11] = _mm_srai_epi32(v[11], bit);
12569 
12570     v[12] = _mm_mullo_epi32(u[11], cospi32);
12571     x     = _mm_mullo_epi32(u[12], cospim32);
12572     v[12] = _mm_sub_epi32(v[12], x);
12573     v[12] = _mm_add_epi32(v[12], rnding);
12574     v[12] = _mm_srai_epi32(v[12], bit);
12575 
12576     // stage 3
12577     u[0] = _mm_add_epi32(v[0], v[3]);
12578     u[1] = _mm_add_epi32(v[1], v[2]);
12579 
12580     u[5] = _mm_mullo_epi32(v[5], cospim32);
12581     x    = _mm_mullo_epi32(v[6], cospi32);
12582     u[5] = _mm_add_epi32(u[5], x);
12583     u[5] = _mm_add_epi32(u[5], rnding);
12584     u[5] = _mm_srai_epi32(u[5], bit);
12585 
12586     u[6] = _mm_mullo_epi32(v[5], cospi32);
12587     x    = _mm_mullo_epi32(v[6], cospim32);
12588     u[6] = _mm_sub_epi32(u[6], x);
12589     u[6] = _mm_add_epi32(u[6], rnding);
12590     u[6] = _mm_srai_epi32(u[6], bit);
12591 
12592     u[8]  = _mm_add_epi32(v[8], v[11]);
12593     v[11] = _mm_sub_epi32(v[8], v[11]);
12594     u[9]  = _mm_add_epi32(v[9], v[10]);
12595     u[10] = _mm_sub_epi32(v[9], v[10]);
12596     u[12] = _mm_sub_epi32(v[15], v[12]);
12597     v[15] = _mm_add_epi32(v[15], v[12]);
12598     u[13] = _mm_sub_epi32(v[14], v[13]);
12599     u[14] = _mm_add_epi32(v[14], v[13]);
12600 
12601     // stage 4
12602     u[0]   = _mm_mullo_epi32(u[0], cospi32);
12603     u[1]   = _mm_mullo_epi32(u[1], cospi32);
12604     v[0]   = _mm_add_epi32(u[0], u[1]);
12605     v[0]   = _mm_add_epi32(v[0], rnding);
12606     out[0] = _mm_srai_epi32(v[0], bit);
12607 
12608     v[4] = _mm_add_epi32(u[4], u[5]);
12609     v[7] = _mm_add_epi32(u[7], u[6]);
12610     v[8] = u[8];
12611 
12612     v[9] = _mm_mullo_epi32(u[9], cospim16);
12613     x    = _mm_mullo_epi32(u[14], cospi48);
12614     v[9] = _mm_add_epi32(v[9], x);
12615     v[9] = _mm_add_epi32(v[9], rnding);
12616     v[9] = _mm_srai_epi32(v[9], bit);
12617 
12618     v[14] = _mm_mullo_epi32(u[9], cospi48);
12619     x     = _mm_mullo_epi32(u[14], cospim16);
12620     v[14] = _mm_sub_epi32(v[14], x);
12621     v[14] = _mm_add_epi32(v[14], rnding);
12622     v[14] = _mm_srai_epi32(v[14], bit);
12623 
12624     v[10] = _mm_mullo_epi32(u[10], cospim48);
12625     x     = _mm_mullo_epi32(u[13], cospim16);
12626     v[10] = _mm_add_epi32(v[10], x);
12627     v[10] = _mm_add_epi32(v[10], rnding);
12628     v[10] = _mm_srai_epi32(v[10], bit);
12629 
12630     v[13] = _mm_mullo_epi32(u[10], cospim16);
12631     x     = _mm_mullo_epi32(u[13], cospim48);
12632     v[13] = _mm_sub_epi32(v[13], x);
12633     v[13] = _mm_add_epi32(v[13], rnding);
12634     v[13] = _mm_srai_epi32(v[13], bit);
12635 
12636     v[12] = u[12];
12637 
12638     // stage 5
12639     u[4]   = _mm_mullo_epi32(v[4], cospi56);
12640     x      = _mm_mullo_epi32(v[7], cospi8);
12641     u[4]   = _mm_add_epi32(u[4], x);
12642     u[4]   = _mm_add_epi32(u[4], rnding);
12643     out[2] = _mm_srai_epi32(u[4], bit);
12644 
12645     u[8]  = _mm_add_epi32(v[8], v[9]);
12646     u[11] = _mm_add_epi32(v[11], v[10]);
12647     u[12] = _mm_add_epi32(v[12], v[13]);
12648     u[15] = _mm_add_epi32(v[15], v[14]);
12649 
12650     // stage 6
12651     v[8]   = _mm_mullo_epi32(u[8], cospi60);
12652     x      = _mm_mullo_epi32(u[15], cospi4);
12653     v[8]   = _mm_add_epi32(v[8], x);
12654     v[8]   = _mm_add_epi32(v[8], rnding);
12655     out[1] = _mm_srai_epi32(v[8], bit);
12656 
12657     v[12]  = _mm_mullo_epi32(u[11], cospi52);
12658     x      = _mm_mullo_epi32(u[12], cospi12);
12659     v[12]  = _mm_sub_epi32(x, v[12]);
12660     v[12]  = _mm_add_epi32(v[12], rnding);
12661     out[3] = _mm_srai_epi32(v[12], bit);
12662 }
12663 
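// Column pass of a 4-point DCT in N4 mode: only the first output of the butterfly
// (cospi32 * (x0 + x1 + x2 + x3), rounded) is evaluated and then transposed into
// output[0..1] with zero padding.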
12664 static void fdct4x8_col_N4_avx2(__m256i *in, __m256i *output, int32_t bit, const int32_t num_col) {
12665     const int32_t *cospi   = cospi_arr(bit);
12666     const __m256i  zero    = _mm256_setzero_si256();
12667     const __m256i  cospi32 = _mm256_set1_epi32(cospi[32]);
12668     const __m256i  rnding  = _mm256_set1_epi32(1 << (bit - 1));
12669     __m256i        s0, s1;
12670     __m256i        u0, u1, u2, u3;
12671     __m256i        v0, v1;
12672     __m256i        out[4];
12673 
12674     int32_t endidx = 3 * num_col;
12675     s0             = _mm256_add_epi32(in[0], in[endidx]);
12676     endidx -= num_col;
12677     s1 = _mm256_add_epi32(in[num_col], in[endidx]);
12678 
12679     // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit);
12680     u0 = _mm256_mullo_epi32(s0, cospi32);
12681     u1 = _mm256_mullo_epi32(s1, cospi32);
12682     u2 = _mm256_add_epi32(u0, u1);
12683 
12684     u3 = _mm256_add_epi32(u2, rnding);
12685 
12686     u0 = _mm256_srai_epi32(u3, bit);
12687 
12688     // Transpose 4x4 32-bit
12689     v0 = _mm256_unpacklo_epi32(u0, zero);
12690     v1 = _mm256_unpackhi_epi32(u0, zero);
12691 
12692     out[0] = _mm256_unpacklo_epi64(v0, zero);
12693     out[1] = _mm256_unpackhi_epi64(v0, zero);
12694     out[2] = _mm256_unpacklo_epi64(v1, zero);
12695     out[3] = _mm256_unpackhi_epi64(v1, zero);
12696 
12697     output[0] = _mm256_permute2x128_si256(out[0], out[1], 0x20);
12698     output[1] = _mm256_permute2x128_si256(out[2], out[3], 0x20);
12699 }
12700 
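// Row-pass counterpart of fdct4x8_col_N4_avx2: the 128-bit halves of the input are first
// re-interleaved, the same single DCT output is computed, and an extra round-and-shift by
// `shift` is applied before the 4x4 transpose into output[0..3].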
12701 static void fdct4x8_row_N4_with_round_avx2(__m256i *input, __m256i *output, int32_t bit,
12702                                            const int32_t num_col, int32_t shift) {
12703     const int32_t *cospi    = cospi_arr(bit);
12704     const __m256i  zero     = _mm256_setzero_si256();
12705     const __m256i  cospi32  = _mm256_set1_epi32(cospi[32]);
12706     const __m256i  rnding   = _mm256_set1_epi32(1 << (bit - 1));
12707     const __m256i  rounding = _mm256_set1_epi32(1 << (shift - 1));
12708     __m256i        s0, s1;
12709     __m256i        u0, u1, u2, u3;
12710     __m256i        v0, v1;
12711     __m256i        out[4];
12712     __m256i        in[4];
12713 
12714     in[0] = _mm256_permute2x128_si256(input[0], input[2], 0x20);
12715     in[1] = _mm256_permute2x128_si256(input[0], input[2], 0x31);
12716     in[2] = _mm256_permute2x128_si256(input[1], input[3], 0x20);
12717     in[3] = _mm256_permute2x128_si256(input[1], input[3], 0x31);
12718 
12719     int32_t endidx = 3 * num_col;
12720     s0             = _mm256_add_epi32(in[0], in[endidx]);
12721     endidx -= num_col;
12722     s1 = _mm256_add_epi32(in[num_col], in[endidx]);
12723 
12724     // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit);
12725     u0 = _mm256_mullo_epi32(s0, cospi32);
12726     u1 = _mm256_mullo_epi32(s1, cospi32);
12727     u2 = _mm256_add_epi32(u0, u1);
12728 
12729     u3 = _mm256_add_epi32(u2, rnding);
12730 
12731     u0 = _mm256_srai_epi32(u3, bit);
12732 
12733     u0 = _mm256_add_epi32(u0, rounding);
12734     u0 = _mm256_srai_epi32(u0, shift);
12735 
12736     // Transpose 4x4 32-bit
12737     v0 = _mm256_unpacklo_epi32(u0, zero);
12738     v1 = _mm256_unpackhi_epi32(u0, zero);
12739 
12740     out[0] = _mm256_unpacklo_epi64(v0, zero);
12741     out[1] = _mm256_unpackhi_epi64(v0, zero);
12742     out[2] = _mm256_unpacklo_epi64(v1, zero);
12743     out[3] = _mm256_unpackhi_epi64(v1, zero);
12744 
12745     output[0] = _mm256_permute2x128_si256(out[0], out[1], 0x20);
12746     output[1] = _mm256_permute2x128_si256(out[2], out[3], 0x20);
12747     output[2] = _mm256_permute2x128_si256(out[0], out[1], 0x31);
12748     output[3] = _mm256_permute2x128_si256(out[2], out[3], 0x31);
12749 }
12750 
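// Column pass of a 4-point ADST in N4 mode: only the first ADST output (a weighted sum of
// the four inputs with the sinpi[1..4] constants, as loaded below) is computed and
// transposed with zero padding.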
12751 static INLINE void fadst4x8_col_N4_avx2(__m256i *in, __m256i *output, int32_t bit,
12752                                         const int32_t num_col) {
12753     const int32_t *sinpi  = sinpi_arr(bit);
12754     const __m256i  zero   = _mm256_setzero_si256();
12755     const __m256i  rnding = _mm256_set1_epi32(1 << (bit - 1));
12756     const __m256i  sinpi1 = _mm256_set1_epi32((int32_t)sinpi[1]);
12757     const __m256i  sinpi2 = _mm256_set1_epi32((int32_t)sinpi[2]);
12758     const __m256i  sinpi3 = _mm256_set1_epi32((int32_t)sinpi[3]);
12759     const __m256i  sinpi4 = _mm256_set1_epi32((int32_t)sinpi[4]);
12760     __m256i        s0, s2, s3, s4;
12761     __m256i        u0, u1;
12762     __m256i        v0, v1;
12763     __m256i        out[4];
12764 
12765     int32_t idx = 0 * num_col;
12766     s0          = _mm256_mullo_epi32(in[idx], sinpi1);
12767     idx += num_col;
12768     s2 = _mm256_mullo_epi32(in[idx], sinpi2);
12769     idx += num_col;
12770     s4 = _mm256_mullo_epi32(in[idx], sinpi3);
12771     idx += num_col;
12772     s3 = _mm256_mullo_epi32(in[idx], sinpi4);
12773 
12774     u0 = _mm256_add_epi32(s0, s2);
12775     u1 = _mm256_add_epi32(u0, s3);
12776 
12777     s0 = _mm256_add_epi32(u1, s4);
12778 
12779     u0 = _mm256_add_epi32(s0, rnding);
12780     u0 = _mm256_srai_epi32(u0, bit);
12781 
12782     v0 = _mm256_unpacklo_epi32(u0, zero);
12783     v1 = _mm256_unpackhi_epi32(u0, zero);
12784 
12785     out[0] = _mm256_unpacklo_epi64(v0, zero);
12786     out[1] = _mm256_unpackhi_epi64(v0, zero);
12787     out[2] = _mm256_unpacklo_epi64(v1, zero);
12788     out[3] = _mm256_unpackhi_epi64(v1, zero);
12789 
12790     output[0] = _mm256_permute2x128_si256(out[0], out[1], 0x20);
12791     output[1] = _mm256_permute2x128_si256(out[2], out[3], 0x20);
12792 }
12793 
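// Row-pass version of fadst4x8_col_N4_avx2: same single-output 4-point ADST, plus an
// extra round-and-shift by `shift` and a full transpose into output[0..3].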
12794 static void fadst4x8_row_N4_with_round_avx2(__m256i *input, __m256i *output, int32_t bit,
12795                                             const int32_t num_col, int32_t shift) {
12796     const int32_t *sinpi    = sinpi_arr(bit);
12797     const __m256i  zero     = _mm256_setzero_si256();
12798     const __m256i  rnding   = _mm256_set1_epi32(1 << (bit - 1));
12799     const __m256i  rounding = _mm256_set1_epi32(1 << (shift - 1));
12800     const __m256i  sinpi1   = _mm256_set1_epi32((int32_t)sinpi[1]);
12801     const __m256i  sinpi2   = _mm256_set1_epi32((int32_t)sinpi[2]);
12802     const __m256i  sinpi3   = _mm256_set1_epi32((int32_t)sinpi[3]);
12803     const __m256i  sinpi4   = _mm256_set1_epi32((int32_t)sinpi[4]);
12804     __m256i        s0, s2, s3, s4;
12805     __m256i        u0, u1;
12806     __m256i        v0, v1;
12807     __m256i        out[4];
12808     __m256i        in[4];
12809 
12810     in[0] = _mm256_permute2x128_si256(input[0], input[2], 0x20);
12811     in[1] = _mm256_permute2x128_si256(input[0], input[2], 0x31);
12812     in[2] = _mm256_permute2x128_si256(input[1], input[3], 0x20);
12813     in[3] = _mm256_permute2x128_si256(input[1], input[3], 0x31);
12814 
12815     int32_t idx = 0 * num_col;
12816     s0          = _mm256_mullo_epi32(in[idx], sinpi1);
12817     idx += num_col;
12818     s2 = _mm256_mullo_epi32(in[idx], sinpi2);
12819     idx += num_col;
12820     s4 = _mm256_mullo_epi32(in[idx], sinpi3);
12821     idx += num_col;
12822     s3 = _mm256_mullo_epi32(in[idx], sinpi4);
12823 
12824     u0 = _mm256_add_epi32(s0, s2);
12825     u1 = _mm256_add_epi32(u0, s3);
12826 
12827     s0 = _mm256_add_epi32(u1, s4);
12828 
12829     u0 = _mm256_add_epi32(s0, rnding);
12830     u0 = _mm256_srai_epi32(u0, bit);
12831 
12832     u0 = _mm256_add_epi32(u0, rounding);
12833     u0 = _mm256_srai_epi32(u0, shift);
12834 
12835     v0 = _mm256_unpacklo_epi32(u0, zero);
12836     v1 = _mm256_unpackhi_epi32(u0, zero);
12837 
12838     out[0] = _mm256_unpacklo_epi64(v0, zero);
12839     out[1] = _mm256_unpackhi_epi64(v0, zero);
12840     out[2] = _mm256_unpacklo_epi64(v1, zero);
12841     out[3] = _mm256_unpackhi_epi64(v1, zero);
12842 
12843     output[0] = _mm256_permute2x128_si256(out[0], out[1], 0x20);
12844     output[1] = _mm256_permute2x128_si256(out[2], out[3], 0x20);
12845     output[2] = _mm256_permute2x128_si256(out[0], out[1], 0x31);
12846     output[3] = _mm256_permute2x128_si256(out[2], out[3], 0x31);
12847 }
12848 
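// 16-point forward ADST over four 32-bit columns. Only the four retained outputs
// out[0]..out[3] are produced; butterfly stages needed solely for the other twelve
// outputs are omitted.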
12849 static void fadst16x4_N4_avx2(__m256i *input, __m256i *output, int32_t bit) {
12850     __m128i *in  = (__m128i *)input;
12851     __m128i *out = (__m128i *)output;
12852 
12853     const int32_t *cospi    = cospi_arr(bit);
12854     const __m128i  cospi32  = _mm_set1_epi32(cospi[32]);
12855     const __m128i  cospi48  = _mm_set1_epi32(cospi[48]);
12856     const __m128i  cospi16  = _mm_set1_epi32(cospi[16]);
12857     const __m128i  cospim16 = _mm_set1_epi32(-cospi[16]);
12858     const __m128i  cospim48 = _mm_set1_epi32(-cospi[48]);
12859     const __m128i  cospi8   = _mm_set1_epi32(cospi[8]);
12860     const __m128i  cospi56  = _mm_set1_epi32(cospi[56]);
12861     const __m128i  cospim56 = _mm_set1_epi32(-cospi[56]);
12862     const __m128i  cospim8  = _mm_set1_epi32(-cospi[8]);
12863     const __m128i  cospi24  = _mm_set1_epi32(cospi[24]);
12864     const __m128i  cospim24 = _mm_set1_epi32(-cospi[24]);
12865     const __m128i  cospim40 = _mm_set1_epi32(-cospi[40]);
12866     const __m128i  cospi40  = _mm_set1_epi32(cospi[40]);
12867     const __m128i  cospi62  = _mm_set1_epi32(cospi[62]);
12868     const __m128i  cospim2  = _mm_set1_epi32(-cospi[2]);
12869     const __m128i  cospi54  = _mm_set1_epi32(cospi[54]);
12870     const __m128i  cospim10 = _mm_set1_epi32(-cospi[10]);
12871     const __m128i  cospi50  = _mm_set1_epi32(cospi[50]);
12872     const __m128i  cospi14  = _mm_set1_epi32(cospi[14]);
12873     const __m128i  cospi58  = _mm_set1_epi32(cospi[58]);
12874     const __m128i  cospi6   = _mm_set1_epi32(cospi[6]);
12875     const __m128i  rnding   = _mm_set1_epi32(1 << (bit - 1));
12876     const __m128i  zero     = _mm_setzero_si128();
12877 
12878     __m128i u[16], v[16], x, y;
12879     __m128i tmp[13];
12880 
12881     tmp[0] = _mm_sub_epi32(zero, in[15]);
12882     u[2]   = _mm_sub_epi32(zero, in[7]);
12883     tmp[1] = _mm_sub_epi32(zero, in[3]);
12884     u[7]   = _mm_sub_epi32(zero, in[11]);
12885     tmp[2] = _mm_sub_epi32(zero, in[1]);
12886     u[11]  = _mm_sub_epi32(zero, in[9]);
12887     tmp[3] = _mm_sub_epi32(zero, in[13]);
12888     u[14]  = _mm_sub_epi32(zero, in[5]);
12889 
12890     // stage 2
12891 
12892     x    = _mm_mullo_epi32(u[2], cospi32);
12893     y    = _mm_mullo_epi32(in[8], cospi32);
12894     v[2] = _mm_add_epi32(x, y);
12895     v[2] = _mm_add_epi32(v[2], rnding);
12896     v[2] = _mm_srai_epi32(v[2], bit);
12897 
12898     v[3] = _mm_sub_epi32(x, y);
12899     v[3] = _mm_add_epi32(v[3], rnding);
12900     v[3] = _mm_srai_epi32(v[3], bit);
12901 
12902     x    = _mm_mullo_epi32(in[4], cospi32);
12903     y    = _mm_mullo_epi32(u[7], cospi32);
12904     v[6] = _mm_add_epi32(x, y);
12905     v[6] = _mm_add_epi32(v[6], rnding);
12906     v[6] = _mm_srai_epi32(v[6], bit);
12907 
12908     v[7] = _mm_sub_epi32(x, y);
12909     v[7] = _mm_add_epi32(v[7], rnding);
12910     v[7] = _mm_srai_epi32(v[7], bit);
12911 
12912     x     = _mm_mullo_epi32(in[6], cospi32);
12913     y     = _mm_mullo_epi32(u[11], cospi32);
12914     v[10] = _mm_add_epi32(x, y);
12915     v[10] = _mm_add_epi32(v[10], rnding);
12916     v[10] = _mm_srai_epi32(v[10], bit);
12917 
12918     v[11] = _mm_sub_epi32(x, y);
12919     v[11] = _mm_add_epi32(v[11], rnding);
12920     v[11] = _mm_srai_epi32(v[11], bit);
12921 
12922     x     = _mm_mullo_epi32(u[14], cospi32);
12923     y     = _mm_mullo_epi32(in[10], cospi32);
12924     v[14] = _mm_add_epi32(x, y);
12925     v[14] = _mm_add_epi32(v[14], rnding);
12926     v[14] = _mm_srai_epi32(v[14], bit);
12927 
12928     v[15] = _mm_sub_epi32(x, y);
12929     v[15] = _mm_add_epi32(v[15], rnding);
12930     v[15] = _mm_srai_epi32(v[15], bit);
12931 
12932     // stage 3
12933     tmp[4] = _mm_add_epi32(in[0], v[2]);
12934     tmp[5] = _mm_add_epi32(tmp[0], v[3]);
12935     tmp[6] = _mm_sub_epi32(in[0], v[2]);
12936     tmp[0] = _mm_sub_epi32(tmp[0], v[3]);
12937     u[4]   = _mm_add_epi32(tmp[1], v[6]);
12938     u[5]   = _mm_add_epi32(in[12], v[7]);
12939     u[6]   = _mm_sub_epi32(tmp[1], v[6]);
12940     u[7]   = _mm_sub_epi32(in[12], v[7]);
12941     tmp[1] = _mm_add_epi32(tmp[2], v[10]);
12942     tmp[7] = _mm_add_epi32(in[14], v[11]);
12943     tmp[2] = _mm_sub_epi32(tmp[2], v[10]);
12944     tmp[8] = _mm_sub_epi32(in[14], v[11]);
12945     u[12]  = _mm_add_epi32(in[2], v[14]);
12946     u[13]  = _mm_add_epi32(tmp[3], v[15]);
12947     u[14]  = _mm_sub_epi32(in[2], v[14]);
12948     u[15]  = _mm_sub_epi32(tmp[3], v[15]);
12949 
12950     // stage 4
12951     v[4]  = half_btf_small(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit);
12952     v[5]  = half_btf_small(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit);
12953     v[6]  = half_btf_small(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit);
12954     v[7]  = half_btf_small(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit);
12955     v[12] = half_btf_small(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit);
12956     v[13] = half_btf_small(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit);
12957     v[14] = half_btf_small(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit);
12958     v[15] = half_btf_small(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit);
12959 
12960     // stage 5
12961     tmp[9]  = _mm_add_epi32(tmp[4], v[4]);
12962     tmp[10] = _mm_add_epi32(tmp[5], v[5]);
12963     tmp[11] = _mm_add_epi32(tmp[6], v[6]);
12964     tmp[12] = _mm_add_epi32(tmp[0], v[7]);
12965     tmp[4]  = _mm_sub_epi32(tmp[4], v[4]);
12966     tmp[5]  = _mm_sub_epi32(tmp[5], v[5]);
12967     tmp[6]  = _mm_sub_epi32(tmp[6], v[6]);
12968     tmp[0]  = _mm_sub_epi32(tmp[0], v[7]);
12969     u[8]    = _mm_add_epi32(tmp[1], v[12]);
12970     u[9]    = _mm_add_epi32(tmp[7], v[13]);
12971     u[10]   = _mm_add_epi32(tmp[2], v[14]);
12972     u[11]   = _mm_add_epi32(tmp[8], v[15]);
12973     u[12]   = _mm_sub_epi32(tmp[1], v[12]);
12974     u[13]   = _mm_sub_epi32(tmp[7], v[13]);
12975     u[14]   = _mm_sub_epi32(tmp[2], v[14]);
12976     u[15]   = _mm_sub_epi32(tmp[8], v[15]);
12977 
12978     // stage 6
12979     v[8]  = half_btf_small(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit);
12980     v[9]  = half_btf_small(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit);
12981     v[10] = half_btf_small(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit);
12982     v[11] = half_btf_small(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit);
12983     v[12] = half_btf_small(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit);
12984     v[13] = half_btf_small(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit);
12985     v[14] = half_btf_small(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit);
12986     v[15] = half_btf_small(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit);
12987 
12988     // stage 7
12989     u[0]  = _mm_add_epi32(tmp[9], v[8]);
12990     u[1]  = _mm_add_epi32(tmp[10], v[9]);
12991     u[2]  = _mm_add_epi32(tmp[11], v[10]);
12992     u[3]  = _mm_add_epi32(tmp[12], v[11]);
12993     u[12] = _mm_sub_epi32(tmp[4], v[12]);
12994     u[13] = _mm_sub_epi32(tmp[5], v[13]);
12995     u[14] = _mm_sub_epi32(tmp[6], v[14]);
12996     u[15] = _mm_sub_epi32(tmp[0], v[15]);
12997 
12998     // stage 8
12999     out[0] = half_btf_small(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit);
13000     out[2] = half_btf_small(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit);
13001     out[3] = half_btf_small(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit);
13002     out[1] = half_btf_small(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit);
13003 }
13004 
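// Identity column transform in N4 mode: each register is scaled by new_sqrt2, and the
// blend mask 17 (0b00010001) keeps only the first 32-bit lane of each 128-bit half,
// zeroing the other columns.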
13005 static INLINE void fidtx4x8_col_N4_avx2(__m256i *in, __m256i *output, int32_t bit,
13006                                         int32_t col_num) {
13007     (void)bit;
13008     const __m256i zero   = _mm256_setzero_si256();
13009     __m256i       fact   = _mm256_set1_epi32(new_sqrt2);
13010     __m256i       offset = _mm256_set1_epi32(1 << (new_sqrt2_bits - 1));
13011     __m256i       a_low;
13012 
13013     for (int32_t i = 0; i < col_num; i++) {
13014         a_low     = _mm256_mullo_epi32(in[i], fact);
13015         a_low     = _mm256_add_epi32(a_low, offset);
13016         a_low     = _mm256_srai_epi32(a_low, new_sqrt2_bits);
13017         output[i] = _mm256_blend_epi32(zero, a_low, 17);
13018     }
13019 }
13020 
13021 static INLINE void fidtx4x8_row_N4_with_round_avx2(__m256i *input, __m256i *output, int32_t bit,
13022                                                    int32_t shift) {
13023     (void)bit;
13024     __m256i out[4];
13025     __m256i v[2];
13026     __m256i a_low;
13027 
13028     const __m256i fact     = _mm256_set1_epi32(new_sqrt2);
13029     const __m256i offset   = _mm256_set1_epi32(1 << (new_sqrt2_bits - 1));
13030     const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
13031     const __m256i zero     = _mm256_setzero_si256();
13032 
13033     a_low = _mm256_permute2x128_si256(input[0], input[2], 0x20);
13034     a_low = _mm256_mullo_epi32(a_low, fact);
13035     a_low = _mm256_add_epi32(a_low, offset);
13036     a_low = _mm256_srai_epi32(a_low, new_sqrt2_bits);
13037     a_low = _mm256_add_epi32(a_low, rounding);
13038     a_low = _mm256_srai_epi32(a_low, shift);
13039 
13040     // Transpose for 4x4
13041     v[0] = _mm256_unpacklo_epi32(a_low, zero);
13042     v[1] = _mm256_unpackhi_epi32(a_low, zero);
13043 
13044     out[0] = _mm256_unpacklo_epi64(v[0], zero);
13045     out[1] = _mm256_unpackhi_epi64(v[0], zero);
13046     out[2] = _mm256_unpacklo_epi64(v[1], zero);
13047     out[3] = _mm256_unpackhi_epi64(v[1], zero);
13048 
13049     output[0] = _mm256_permute2x128_si256(out[0], out[1], 0x20);
13050     output[1] = _mm256_permute2x128_si256(out[2], out[3], 0x20);
13051     output[2] = _mm256_permute2x128_si256(out[0], out[1], 0x31);
13052     output[3] = _mm256_permute2x128_si256(out[2], out[3], 0x31);
13053 }
13054 
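// Row identity in N4 mode: only the first register is scaled by new_sqrt2 and rounded.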
13055 static INLINE void fidtx4x8_row_N4_avx2(__m256i *in, __m256i *output, int32_t bit) {
13056     (void)bit;
13057     __m256i fact   = _mm256_set1_epi32(new_sqrt2);
13058     __m256i offset = _mm256_set1_epi32(1 << (new_sqrt2_bits - 1));
13059     __m256i a_low;
13060 
13061     a_low     = _mm256_mullo_epi32(in[0], fact);
13062     a_low     = _mm256_add_epi32(a_low, offset);
13063     output[0] = _mm256_srai_epi32(a_low, new_sqrt2_bits);
13064 }
13065 
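// 8-point identity in N4 mode: the identity-8 scale factor is 2, so the single retained
// register is simply doubled.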
13066 static AOM_FORCE_INLINE void fidtx8x4_N4_avx2(__m256i *in, __m256i *out, int32_t bit) {
13067     (void)bit;
13068     out[0] = _mm256_add_epi32(in[0], in[0]);
13069 }
13070 
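// 8-point forward DCT over four 32-bit columns in N4 mode: only out[0] (the DC term) and
// out[1] (the coefficient the scalar code calls buf0[4]) are computed.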
13071 static void fdct4x8_N4_avx2(__m256i *input, __m256i *output, int32_t bit) {
13072     __m128i *      in       = (__m128i *)input;
13073     __m128i *      out      = (__m128i *)output;
13074     const int32_t *cospi    = cospi_arr(bit);
13075     const __m128i  cospi32  = _mm_set1_epi32(cospi[32]);
13076     const __m128i  cospim32 = _mm_set1_epi32(-cospi[32]);
13077     const __m128i  cospi56  = _mm_set1_epi32(cospi[56]);
13078     const __m128i  cospi8   = _mm_set1_epi32(cospi[8]);
13079     const __m128i  rnding   = _mm_set1_epi32(1 << (bit - 1));
13080     __m128i        u[8], v[8];
13081 
13082     // Even 8 points 0, 2, ..., 14
13083     // stage 0
13084     // stage 1
13085     u[0] = _mm_add_epi32(in[0], in[7]);
13086     v[7] = _mm_sub_epi32(in[0], in[7]); // v[7]
13087     u[1] = _mm_add_epi32(in[1], in[6]);
13088     u[6] = _mm_sub_epi32(in[1], in[6]);
13089     u[2] = _mm_add_epi32(in[2], in[5]);
13090     u[5] = _mm_sub_epi32(in[2], in[5]);
13091     u[3] = _mm_add_epi32(in[3], in[4]);
13092     v[4] = _mm_sub_epi32(in[3], in[4]); // v[4]
13093 
13094     // stage 2
13095     v[0] = _mm_add_epi32(u[0], u[3]);
13096     v[1] = _mm_add_epi32(u[1], u[2]);
13097 
13098     v[5] = _mm_mullo_epi32(u[5], cospim32);
13099     v[6] = _mm_mullo_epi32(u[6], cospi32);
13100     v[5] = _mm_add_epi32(v[5], v[6]);
13101     v[5] = _mm_add_epi32(v[5], rnding);
13102     v[5] = _mm_srai_epi32(v[5], bit);
13103 
13104     u[0] = _mm_mullo_epi32(u[5], cospi32);
13105     v[6] = _mm_mullo_epi32(u[6], cospim32);
13106     v[6] = _mm_sub_epi32(u[0], v[6]);
13107     v[6] = _mm_add_epi32(v[6], rnding);
13108     v[6] = _mm_srai_epi32(v[6], bit);
13109 
13110     // stage 3
13111     // type 0
13112     v[0]   = _mm_mullo_epi32(v[0], cospi32);
13113     v[1]   = _mm_mullo_epi32(v[1], cospi32);
13114     u[0]   = _mm_add_epi32(v[0], v[1]);
13115     u[0]   = _mm_add_epi32(u[0], rnding);
13116     out[0] = _mm_srai_epi32(u[0], bit);
13117 
13118     u[4] = _mm_add_epi32(v[4], v[5]);
13119     u[7] = _mm_add_epi32(v[7], v[6]);
13120 
13121     // stage 4
13122     // stage 5
13123     v[0]   = _mm_mullo_epi32(u[4], cospi56);
13124     v[1]   = _mm_mullo_epi32(u[7], cospi8);
13125     v[0]   = _mm_add_epi32(v[0], v[1]);
13126     v[0]   = _mm_add_epi32(v[0], rnding);
13127     out[1] = _mm_srai_epi32(v[0], bit); // buf0[4]
13128 }
13129 
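// 8-point forward ADST over `col_num` columns in N4 mode: for each column only the first
// two output rows are written; the remaining butterfly results are discarded.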
13130 static void fadst8x4_N4_avx2(__m256i *input, __m256i *output, int32_t bit, const int32_t col_num) {
13131     __m128i *      in       = (__m128i *)input;
13132     __m128i *      out      = (__m128i *)output;
13133     const int32_t *cospi    = cospi_arr(bit);
13134     const __m128i  cospi32  = _mm_set1_epi32(cospi[32]);
13135     const __m128i  cospi16  = _mm_set1_epi32(cospi[16]);
13136     const __m128i  cospim16 = _mm_set1_epi32(-cospi[16]);
13137     const __m128i  cospi48  = _mm_set1_epi32(cospi[48]);
13138     const __m128i  cospim48 = _mm_set1_epi32(-cospi[48]);
13139     const __m128i  cospim4  = _mm_set1_epi32(-cospi[4]);
13140     const __m128i  cospi60  = _mm_set1_epi32(cospi[60]);
13141     const __m128i  cospi52  = _mm_set1_epi32(cospi[52]);
13142     const __m128i  cospi12  = _mm_set1_epi32(cospi[12]);
13143     const __m128i  rnding   = _mm_set1_epi32(1 << (bit - 1));
13144     const __m128i  zero     = _mm_setzero_si128();
13145     __m128i        u0, u1, u2, u3, u4, u5, u6, u7;
13146     __m128i        v0, v1, v2, v3, v4, v5, v6, v7;
13147     __m128i        x, y;
13148     int32_t        col;
13149 
13150     // Note:
13151     //  Even columns: 0, 2, ..., 14
13152     //  Odd columns: 1, 3, ..., 15
13153     //  One even column plus one odd column makes up one row (8 coeffs),
13154     //  so in total there are 8 rows (8x8).
13155     for (col = 0; col < col_num; ++col) {
13156         // stage 0
13157         // stage 1
13158         u0 = in[col_num * 0 + col];
13159         u1 = _mm_sub_epi32(zero, in[col_num * 7 + col]);
13160         u2 = _mm_sub_epi32(zero, in[col_num * 3 + col]);
13161         u3 = in[col_num * 4 + col];
13162         u4 = _mm_sub_epi32(zero, in[col_num * 1 + col]);
13163         u5 = in[col_num * 6 + col];
13164         u6 = in[col_num * 2 + col];
13165         u7 = _mm_sub_epi32(zero, in[col_num * 5 + col]);
13166 
13167         // stage 2
13168         v0 = u0;
13169         v1 = u1;
13170 
13171         x  = _mm_mullo_epi32(u2, cospi32);
13172         y  = _mm_mullo_epi32(u3, cospi32);
13173         v2 = _mm_add_epi32(x, y);
13174         v2 = _mm_add_epi32(v2, rnding);
13175         v2 = _mm_srai_epi32(v2, bit);
13176 
13177         v3 = _mm_sub_epi32(x, y);
13178         v3 = _mm_add_epi32(v3, rnding);
13179         v3 = _mm_srai_epi32(v3, bit);
13180 
13181         v4 = u4;
13182         v5 = u5;
13183 
13184         x  = _mm_mullo_epi32(u6, cospi32);
13185         y  = _mm_mullo_epi32(u7, cospi32);
13186         v6 = _mm_add_epi32(x, y);
13187         v6 = _mm_add_epi32(v6, rnding);
13188         v6 = _mm_srai_epi32(v6, bit);
13189 
13190         v7 = _mm_sub_epi32(x, y);
13191         v7 = _mm_add_epi32(v7, rnding);
13192         v7 = _mm_srai_epi32(v7, bit);
13193 
13194         // stage 3
13195         u0 = _mm_add_epi32(v0, v2);
13196         u1 = _mm_add_epi32(v1, v3);
13197         u2 = _mm_sub_epi32(v0, v2);
13198         u3 = _mm_sub_epi32(v1, v3);
13199         u4 = _mm_add_epi32(v4, v6);
13200         u5 = _mm_add_epi32(v5, v7);
13201         u6 = _mm_sub_epi32(v4, v6);
13202         u7 = _mm_sub_epi32(v5, v7);
13203 
13204         // stage 4
13205         v0 = u0;
13206         v1 = u1;
13207         v2 = u2;
13208         v3 = u3;
13209 
13210         x  = _mm_mullo_epi32(u4, cospi16);
13211         y  = _mm_mullo_epi32(u5, cospi48);
13212         v4 = _mm_add_epi32(x, y);
13213         v4 = _mm_add_epi32(v4, rnding);
13214         v4 = _mm_srai_epi32(v4, bit);
13215 
13216         x  = _mm_mullo_epi32(u4, cospi48);
13217         y  = _mm_mullo_epi32(u5, cospim16);
13218         v5 = _mm_add_epi32(x, y);
13219         v5 = _mm_add_epi32(v5, rnding);
13220         v5 = _mm_srai_epi32(v5, bit);
13221 
13222         x  = _mm_mullo_epi32(u6, cospim48);
13223         y  = _mm_mullo_epi32(u7, cospi16);
13224         v6 = _mm_add_epi32(x, y);
13225         v6 = _mm_add_epi32(v6, rnding);
13226         v6 = _mm_srai_epi32(v6, bit);
13227 
13228         x  = _mm_mullo_epi32(u6, cospi16);
13229         y  = _mm_mullo_epi32(u7, cospi48);
13230         v7 = _mm_add_epi32(x, y);
13231         v7 = _mm_add_epi32(v7, rnding);
13232         v7 = _mm_srai_epi32(v7, bit);
13233 
13234         // stage 5
13235         u0 = _mm_add_epi32(v0, v4);
13236         u1 = _mm_add_epi32(v1, v5);
13237         u6 = _mm_sub_epi32(v2, v6);
13238         u7 = _mm_sub_epi32(v3, v7);
13239 
13240         // stage 6
13241         x                      = _mm_mullo_epi32(u0, cospi60);
13242         y                      = _mm_mullo_epi32(u1, cospim4);
13243         v1                     = _mm_add_epi32(x, y);
13244         v1                     = _mm_add_epi32(v1, rnding);
13245         out[col_num * 0 + col] = _mm_srai_epi32(v1, bit);
13246 
13247         x                      = _mm_mullo_epi32(u6, cospi52);
13248         y                      = _mm_mullo_epi32(u7, cospi12);
13249         v6                     = _mm_add_epi32(x, y);
13250         v6                     = _mm_add_epi32(v6, rnding);
13251         out[col_num * 1 + col] = _mm_srai_epi32(v6, bit);
13252     }
13253 }
13254 
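// Store helper for the 4x8 N4 output: res[0] carries all retained coefficients and the
// other three rows of the destination are cleared to zero.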
13255 static AOM_FORCE_INLINE void write_buffer_4x8_N4(const __m256i *res, int32_t *output) {
13256     const __m256i zero = _mm256_setzero_si256();
13257     _mm256_storeu_si256((__m256i *)(output + 0 * 8), res[0]);
13258     _mm256_storeu_si256((__m256i *)(output + 1 * 8), zero);
13259     _mm256_storeu_si256((__m256i *)(output + 2 * 8), zero);
13260     _mm256_storeu_si256((__m256i *)(output + 3 * 8), zero);
13261 }
13262 
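// Rounding for the 8x4 N4 column transform: only in[0] holds live coefficients, so it
// alone is rounded and shifted.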
13263 static AOM_FORCE_INLINE void col_txfm_8x4_N4_rounding(__m256i *in, int32_t shift) {
13264     const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
13265 
13266     in[0] = _mm256_add_epi32(in[0], rounding);
13267     in[0] = _mm256_srai_epi32(in[0], shift);
13268 }
13269 
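// 2D 8x8 forward transform, N4 variant: only the top-left 4x4 quarter of the coefficient
// block receives computed values. A minimal usage sketch, assuming an 8-bit residual
// block with a row stride of 8 (buffer names are illustrative only):
//
//   int16_t residual[8 * 8]; // source residual, stride = 8
//   int32_t coeff[8 * 8];    // destination; top-left 4x4 holds the N4 coefficients
//   svt_av1_fwd_txfm2d_8x8_N4_avx2(residual, coeff, 8, DCT_DCT, 8 /* bd */);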
13270 void svt_av1_fwd_txfm2d_8x8_N4_avx2(int16_t *input, int32_t *coeff, uint32_t stride, TxType tx_type,
13271                                     uint8_t bd) {
13272     __m256i       in[8], out[8];
13273     const int8_t *shift   = fwd_txfm_shift_ls[TX_8X8];
13274     const int32_t txw_idx = get_txw_idx(TX_8X8);
13275     const int32_t txh_idx = get_txh_idx(TX_8X8);
13276 
13277     switch (tx_type) {
13278     case DCT_DCT:
13279         load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
13280         fdct8x8_N4_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
13281         col_txfm_8x8_N4_rounding(out, -shift[1]);
13282         transpose_8x8_avx2(out, in);
13283         fdct8x8_N4_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
13284         transpose_8x8_N2_avx2(out, in);
13285         write_buffer_8x8_N4(in, coeff);
13286         break;
13287     case ADST_DCT:
13288         load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
13289         fadst8x8_N4_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
13290         col_txfm_8x8_N4_rounding(out, -shift[1]);
13291         transpose_8x8_avx2(out, in);
13292         fdct8x8_N4_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
13293         transpose_8x8_N2_avx2(out, in);
13294         write_buffer_8x8_N4(in, coeff);
13295         break;
13296     case DCT_ADST:
13297         load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
13298         fdct8x8_N4_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
13299         col_txfm_8x8_N4_rounding(out, -shift[1]);
13300         transpose_8x8_avx2(out, in);
13301         fadst8x8_N4_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
13302         transpose_8x8_N2_avx2(out, in);
13303         write_buffer_8x8_N4(in, coeff);
13304         break;
13305     case ADST_ADST:
13306         load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
13307         fadst8x8_N4_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
13308         col_txfm_8x8_N4_rounding(out, -shift[1]);
13309         transpose_8x8_avx2(out, in);
13310         fadst8x8_N4_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
13311         transpose_8x8_N2_avx2(out, in);
13312         write_buffer_8x8_N4(in, coeff);
13313         break;
13314     case FLIPADST_DCT:
13315         load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
13316         fadst8x8_N4_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
13317         col_txfm_8x8_N4_rounding(out, -shift[1]);
13318         transpose_8x8_avx2(out, in);
13319         fdct8x8_N4_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
13320         transpose_8x8_N2_avx2(out, in);
13321         write_buffer_8x8_N4(in, coeff);
13322         break;
13323     case DCT_FLIPADST:
13324         load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
13325         fdct8x8_N4_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
13326         col_txfm_8x8_N4_rounding(out, -shift[1]);
13327         transpose_8x8_avx2(out, in);
13328         fadst8x8_N4_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
13329         transpose_8x8_N2_avx2(out, in);
13330         write_buffer_8x8_N4(in, coeff);
13331         break;
13332     case FLIPADST_FLIPADST:
13333         load_buffer_8x8(input, in, stride, 1, 1, shift[0]);
13334         fadst8x8_N4_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
13335         col_txfm_8x8_N4_rounding(out, -shift[1]);
13336         transpose_8x8_avx2(out, in);
13337         fadst8x8_N4_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
13338         transpose_8x8_N2_avx2(out, in);
13339         write_buffer_8x8_N4(in, coeff);
13340         break;
13341     case ADST_FLIPADST:
13342         load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
13343         fadst8x8_N4_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
13344         col_txfm_8x8_N4_rounding(out, -shift[1]);
13345         transpose_8x8_avx2(out, in);
13346         fadst8x8_N4_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
13347         transpose_8x8_N2_avx2(out, in);
13348         write_buffer_8x8_N4(in, coeff);
13349         break;
13350     case FLIPADST_ADST:
13351         load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
13352         fadst8x8_N4_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
13353         col_txfm_8x8_N4_rounding(out, -shift[1]);
13354         transpose_8x8_avx2(out, in);
13355         fadst8x8_N4_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
13356         transpose_8x8_N2_avx2(out, in);
13357         write_buffer_8x8_N4(in, coeff);
13358         break;
13359     case IDTX:
13360         load_buffer_4x8_in_8x8(input, in, stride, 0, 0, shift[0], 1);
13361         clear_buffer_4x16_N2(in);
13362         fidtx8x8_N4_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
13363         col_txfm_8x8_N4_rounding(out, -shift[1]);
13364         fidtx8x8_N4_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
13365         write_buffer_8x8_N4(out, coeff);
13366         break;
13367     case V_DCT:
13368         load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
13369         fdct8x8_N4_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
13370         col_txfm_8x8_N4_rounding(out, -shift[1]);
13371         fidtx8x8_N4_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
13372         write_buffer_8x8_N4(out, coeff);
13373         break;
13374     case H_DCT:
13375         load_buffer_4x8_in_8x8(input, in, stride, 0, 0, shift[0], 1);
13376         clear_buffer_4x16_N2(in);
13377         fidtx8x8_N4_avx2(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
13378         col_txfm_8x8_N4_rounding(in, -shift[1]);
13379         transpose_8x8_avx2(in, out);
13380         fdct8x8_N4_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
13381         transpose_8x8_N2_avx2(in, out);
13382         write_buffer_8x8_N4(out, coeff);
13383         break;
13384     case V_ADST:
13385         load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
13386         fadst8x8_N4_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
13387         col_txfm_8x8_N4_rounding(out, -shift[1]);
13388         fidtx8x8_N4_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
13389         write_buffer_8x8_N4(out, coeff);
13390         break;
13391     case H_ADST:
13392         load_buffer_4x8_in_8x8(input, in, stride, 0, 0, shift[0], 1);
13393         clear_buffer_4x16_N2(in);
13394         fidtx8x8_N4_avx2(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
13395         col_txfm_8x8_N4_rounding(in, -shift[1]);
13396         transpose_8x8_avx2(in, out);
13397         fadst8x8_N4_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
13398         transpose_8x8_N2_avx2(in, out);
13399         write_buffer_8x8_N4(out, coeff);
13400         break;
13401     case V_FLIPADST:
13402         load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
13403         fadst8x8_N4_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 1);
13404         col_txfm_8x8_N4_rounding(out, -shift[1]);
13405         fidtx8x8_N4_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], 1);
13406         write_buffer_8x8_N4(out, coeff);
13407         break;
13408     case H_FLIPADST:
13409         load_buffer_4x8_in_8x8(input, in, stride, 0, 1, shift[0], 1);
13410         clear_buffer_4x16_N2(in);
13411         fidtx8x8_N4_avx2(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
13412         col_txfm_8x8_N4_rounding(in, -shift[1]);
13413         transpose_8x8_avx2(in, out);
13414         fadst8x8_N4_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
13415         transpose_8x8_N2_avx2(in, out);
13416         write_buffer_8x8_N4(out, coeff);
13417         break;
13418     default: assert(0);
13419     }
13420     (void)bd;
13421 }
13422 
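// 2D 16x16 forward transform, N4 variant: only the top-left 4x4 region of the 16x16
// coefficient block is computed. Each case transposes the top-left and top-right 8x8
// tiles before the row pass (see the transpose_8x8_in_16x16_avx2 calls below), then
// transposes the surviving 4x4 block back before writing out.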
13423 void svt_av1_fwd_txfm2d_16x16_N4_avx2(int16_t *input, int32_t *coeff, uint32_t stride,
13424                                       TxType tx_type, uint8_t bd) {
13425     __m256i       in[32], out[32];
13426     const int8_t *shift   = fwd_txfm_shift_ls[TX_16X16];
13427     const int32_t txw_idx = get_txw_idx(TX_16X16);
13428     const int32_t txh_idx = get_txh_idx(TX_16X16);
13429     const int32_t col_num = 2;
13430     switch (tx_type) {
13431     case IDTX:
13432         load_buffer_4x16_in_16x16(input, in, stride, 0, 0, shift[0]);
13433         fidtx16x16_N4_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num, 2);
13434         col_txfm_16x16_N4_rounding(out, -shift[1]);
13435         fidtx16x16_N4_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num, 2);
13436         write_buffer_16x16_N4(out, coeff);
13437         break;
13438     case DCT_DCT:
13439         load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
13440         fdct16x16_N4_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num, col_num);
13441         col_txfm_16x16_N4_rounding(out, -shift[1]);
13442         transpose_8x8_in_16x16_avx2(out, in); //top-left -> top-left
13443         transpose_8x8_in_16x16_avx2(out + 1, in + 16); //top-right ->bottom-left
13444         fdct16x16_N4_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num, 1);
13445         transpose_4x4_in_16x16_avx2(out, in);
13446         write_buffer_16x16_N4(in, coeff);
13447         break;
13448     case ADST_DCT:
13449         load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
13450         fadst16x16_N4_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num, col_num);
13451         col_txfm_16x16_N4_rounding(out, -shift[1]);
13452         transpose_8x8_in_16x16_avx2(out, in); //top-left -> top-left
13453         transpose_8x8_in_16x16_avx2(out + 1, in + 16); //top-right ->bottom-left
13454         fdct16x16_N4_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num, 1);
13455         transpose_4x4_in_16x16_avx2(out, in);
13456         write_buffer_16x16_N4(in, coeff);
13457         break;
13458     case DCT_ADST:
13459         load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
13460         fdct16x16_N4_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num, col_num);
13461         col_txfm_16x16_N4_rounding(out, -shift[1]);
13462         transpose_8x8_in_16x16_avx2(out, in); //top-left -> top-left
13463         transpose_8x8_in_16x16_avx2(out + 1, in + 16); //top-right ->bottom-left
13464         fadst16x16_N4_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num, 1);
13465         transpose_4x4_in_16x16_avx2(out, in);
13466         write_buffer_16x16_N4(in, coeff);
13467         break;
13468     case ADST_ADST:
13469         load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
13470         fadst16x16_N4_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num, col_num);
13471         col_txfm_16x16_N4_rounding(out, -shift[1]);
13472         transpose_8x8_in_16x16_avx2(out, in); //top-left -> top-left
13473         transpose_8x8_in_16x16_avx2(out + 1, in + 16); //top-right ->bottom-left
13474         fadst16x16_N4_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num, 1);
13475         transpose_4x4_in_16x16_avx2(out, in);
13476         write_buffer_16x16_N4(in, coeff);
13477         break;
13478     case DCT_FLIPADST:
13479         load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
13480         fdct16x16_N4_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num, col_num);
13481         col_txfm_16x16_N4_rounding(out, -shift[1]);
13482         transpose_8x8_in_16x16_avx2(out, in); //top-left -> top-left
13483         transpose_8x8_in_16x16_avx2(out + 1, in + 16); //top-right ->bottom-left
13484         fadst16x16_N4_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num, 1);
13485         transpose_4x4_in_16x16_avx2(out, in);
13486         write_buffer_16x16_N4(in, coeff);
13487         break;
13488     case FLIPADST_DCT:
13489         load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
13490         fadst16x16_N4_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num, col_num);
13491         col_txfm_16x16_N4_rounding(out, -shift[1]);
13492         transpose_8x8_in_16x16_avx2(out, in); //top-left -> top-left
13493         transpose_8x8_in_16x16_avx2(out + 1, in + 16); //top-right -> bottom-left
13494         fdct16x16_N4_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num, 1);
13495         transpose_4x4_in_16x16_avx2(out, in);
13496         write_buffer_16x16_N4(in, coeff);
13497         break;
13498     case FLIPADST_FLIPADST:
13499         load_buffer_16x16(input, in, stride, 1, 1, shift[0]);
13500         fadst16x16_N4_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num, col_num);
13501         col_txfm_16x16_N4_rounding(out, -shift[1]);
13502         transpose_8x8_in_16x16_avx2(out, in); //top-left -> top-left
13503         transpose_8x8_in_16x16_avx2(out + 1, in + 16); //top-right -> bottom-left
13504         fadst16x16_N4_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num, 1);
13505         transpose_4x4_in_16x16_avx2(out, in);
13506         write_buffer_16x16_N4(in, coeff);
13507         break;
13508     case ADST_FLIPADST:
13509         load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
13510         fadst16x16_N4_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num, col_num);
13511         col_txfm_16x16_N4_rounding(out, -shift[1]);
13512         transpose_8x8_in_16x16_avx2(out, in); //top-left -> top-left
13513         transpose_8x8_in_16x16_avx2(out + 1, in + 16); //top-right -> bottom-left
13514         fadst16x16_N4_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num, 1);
13515         transpose_4x4_in_16x16_avx2(out, in);
13516         write_buffer_16x16_N4(in, coeff);
13517         break;
13518     case FLIPADST_ADST:
13519         load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
13520         fadst16x16_N4_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num, col_num);
13521         col_txfm_16x16_N4_rounding(out, -shift[1]);
13522         transpose_8x8_in_16x16_avx2(out, in); //top-left -> top-left
13523         transpose_8x8_in_16x16_avx2(out + 1, in + 16); //top-right -> bottom-left
13524         fadst16x16_N4_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num, 1);
13525         transpose_4x4_in_16x16_avx2(out, in);
13526         write_buffer_16x16_N4(in, coeff);
13527         break;
13528     case V_DCT:
13529         load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
13530         fdct16x16_N4_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num, 1);
13531         col_txfm_16x16_N4_rounding(out, -shift[1]);
13532         fidtx16x16_N4_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num, 2);
13533         write_buffer_16x16_N4(out, coeff);
13534         break;
13535     case H_DCT:
13536         load_buffer_4x16_in_16x16(input, in, stride, 0, 0, shift[0]);
13537         fidtx16x16_N4_avx2(in, in, fwd_cos_bit_col[txw_idx][txh_idx], col_num, 1);
13538         col_txfm_16x16_N4_rounding(in, -shift[1]);
13539         transpose_8x8_in_16x16_avx2(in, out); //top-left -> top-left
13540         transpose_8x8_in_16x16_avx2(in + 1, out + 16); //top-right -> bottom-left
13541         fdct16x16_N4_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], col_num, 1);
13542         transpose_4x4_in_16x16_avx2(in, out);
13543         write_buffer_16x16_N4(out, coeff);
13544         break;
13545     case V_ADST:
13546         load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
13547         fadst16x16_N4_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num, 1);
13548         col_txfm_16x16_N4_rounding(out, -shift[1]);
13549         fidtx16x16_N4_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num, 2);
13550         write_buffer_16x16_N4(out, coeff);
13551         break;
13552     case H_ADST:
13553         load_buffer_4x16_in_16x16(input, in, stride, 0, 0, shift[0]);
13554         fidtx16x16_N4_avx2(in, in, fwd_cos_bit_col[txw_idx][txh_idx], col_num, 1);
13555         col_txfm_16x16_N4_rounding(in, -shift[1]);
13556         transpose_8x8_in_16x16_avx2(in, out); //top-left -> top-left
13557         transpose_8x8_in_16x16_avx2(in + 1, out + 16); //top-right -> bottom-left
13558         fadst16x16_N4_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], col_num, 1);
13559         transpose_4x4_in_16x16_avx2(in, out);
13560         write_buffer_16x16_N4(out, coeff);
13561         break;
13562     case V_FLIPADST:
13563         load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
13564         fadst16x16_N4_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num, col_num);
13565         col_txfm_16x16_N4_rounding(out, -shift[1]);
13566         fidtx16x16_N4_avx2(out, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num, 2);
13567         write_buffer_16x16_N4(out, coeff);
13568         break;
13569     case H_FLIPADST:
13570         load_buffer_4x16_in_16x16(input, in, stride, 0, 1, shift[0]);
13571         fidtx16x16_N4_avx2(in, in, fwd_cos_bit_col[txw_idx][txh_idx], col_num, 1);
13572         col_txfm_16x16_N4_rounding(in, -shift[1]);
13573         transpose_8x8_in_16x16_avx2(in, out); //top-left -> top-left
13574         transpose_8x8_in_16x16_avx2(in + 1, out + 16); //top-right -> bottom-left
13575         fadst16x16_N4_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], col_num, 1);
13576         transpose_4x4_in_16x16_avx2(in, out);
13577         write_buffer_16x16_N4(out, coeff);
13578         break;
13579     default: assert(0);
13580     }
13581     (void)bd;
13582 }
13583 
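/* 64x64 N4 forward transform: only DCT_DCT and IDTX are reachable here. For
 * DCT_DCT the column pass covers all 64 columns but only the first 16 result
 * rows feed the row pass; for IDTX just the 16x16 corner is loaded. In both
 * cases clear_buffer_wxh_N4() zeroes everything outside the kept top-left
 * 16x16 (w/4 x h/4) coefficient block. */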
13584 void svt_av1_fwd_txfm2d_64x64_N4_avx2(int16_t *input, int32_t *output, uint32_t stride,
13585                                       TxType tx_type, uint8_t bd) {
13586     (void)bd;
13587     __m256i       in[512];
13588     __m256i *     out     = (__m256i *)output;
13589     const int32_t txw_idx = tx_size_wide_log2[TX_64X64] - tx_size_wide_log2[0];
13590     const int32_t txh_idx = tx_size_high_log2[TX_64X64] - tx_size_high_log2[0];
13591     const int8_t *shift   = fwd_txfm_shift_ls[TX_64X64];
13592 
13593     switch (tx_type) {
13594     case IDTX:
13595         load_buffer_16x16_in_64x64_avx2(input, stride, out);
13596         fidtx64x64_N4_avx2(out, in);
13597         av1_round_shift_array_64_N4_avx2(in, in, 512 / 4, -shift[1]);
13598         /*row wise transform*/
13599         fidtx64x64_N4_avx2(in, out);
13600         av1_round_shift_array_64_N4_avx2(out, out, 512 / 4, -shift[2]);
13601         clear_buffer_wxh_N4(out, 8, 64);
13602         break;
13603     case DCT_DCT:
13604         load_buffer_64x64_avx2(input, stride, out);
13605         av1_fdct64_new_N4_avx2(out, in, fwd_cos_bit_col[txw_idx][txh_idx], 64, 8);
13606         av1_round_shift_array_32_avx2(in, out, 512 / 4, -shift[1]);
13607         transpose_16x16_in_64x64_avx2(out, in);
13608         transpose_16x16_in_64x64_avx2(out + 2, in + 128);
13609         transpose_16x16_in_64x64_avx2(out + 4, in + 256);
13610         transpose_16x16_in_64x64_avx2(out + 6, in + 384);
13611         /*row wise transform*/
13612         av1_fdct64_new_N4_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 16, 8);
13613         av1_round_shift_array_64_N4_avx2(out, in, 512 / 4, -shift[2]);
13614         transpose_16x16_in_64x64_avx2(in, out); //top-left
13615         clear_buffer_wxh_N4(out, 8, 64);
13616         break;
13617     default: assert(0);
13618     }
13619 }
13620 
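/* 32x32 N4 forward transform: DCT_DCT, IDTX, V_DCT and H_DCT only. The
 * identity-based paths use partial load helpers (8x8, 8x32, 32x8) to read
 * just the samples they need, and the kept output is the top-left 8x8
 * (w/4 x h/4) block. */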
13621 void svt_av1_fwd_txfm2d_32x32_N4_avx2(int16_t *input, int32_t *output, uint32_t stride,
13622                                       TxType tx_type, uint8_t bd) {
13623     const int8_t *shift           = fwd_txfm_shift_ls[TX_32X32];
13624     const int32_t txw_idx         = tx_size_wide_log2[TX_32X32] - tx_size_wide_log2[0];
13625     const int32_t txh_idx         = tx_size_high_log2[TX_32X32] - tx_size_high_log2[0];
13626     const int8_t  cos_bit_col     = fwd_cos_bit_col[txw_idx][txh_idx];
13627     const int8_t  cos_bit_row     = fwd_cos_bit_row[txw_idx][txh_idx];
13628     const int32_t txfm2d_size_256 = 32 * 32 / 8;
13629     __m256i       buf_256[128];
13630     __m256i *     out_256 = (__m256i *)output;
13631     (void)bd;
13632 
13633     switch (tx_type) {
13634     case IDTX:
13635         load_buffer_8x8_in_32x32_avx2(input, buf_256, stride);
13636         av1_round_shift_array_32_N4_avx2(buf_256, out_256, 32, -shift[0]);
13637         fidtx_wxh_N4_avx2(out_256, buf_256, 32, 4);
13638         av1_round_shift_array_32_N4_avx2(buf_256, out_256, 32, -shift[1]);
13639         fidtx_wxh_N4_avx2(out_256, buf_256, 32, 4);
13640         av1_round_shift_array_32_N4_avx2(buf_256, buf_256, 32, -shift[2]);
13641         write_buffer_32x32_N4(buf_256, output);
13642         break;
13643     case DCT_DCT:
13644         load_buffer_32x32_avx2(input, buf_256, stride);
13645         av1_round_shift_array_32_avx2(buf_256, out_256, txfm2d_size_256, -shift[0]);
13646         fdct32x32_N4_col_avx2(out_256, buf_256, cos_bit_col);
13647         av1_round_shift_array_32_avx2(buf_256, out_256, txfm2d_size_256 / 4, -shift[1]);
13648         //transpose 32x8 to 8x32
13649         transpose_8x8_in_32x32_avx2(out_256, buf_256);
13650         transpose_8x8_in_32x32_avx2(out_256 + 1, buf_256 + 32);
13651         transpose_8x8_in_32x32_avx2(out_256 + 2, buf_256 + 64);
13652         transpose_8x8_in_32x32_avx2(out_256 + 3, buf_256 + 96);
13653         fdct32x32_N4_row_avx2(buf_256, out_256, cos_bit_row);
13654         av1_round_shift_array_32_N4_avx2(out_256, out_256, 32, -shift[2]);
13655         transpose_8x8_in_32x32_avx2(out_256, buf_256);
13656         write_buffer_32x32_N4(buf_256, output);
13657         break;
13658     case V_DCT:
13659         load_buffer_8x32_in_32x32_avx2(input, buf_256, stride);
13660         av1_round_shift_array_32_avx2(buf_256, out_256, txfm2d_size_256, -shift[0]);
13661         fdct32x32_N4_row_avx2(out_256, buf_256, cos_bit_col);
13662         av1_round_shift_array_32_N4_avx2(buf_256, out_256, 32, -shift[1]);
13663         fidtx_wxh_N4_avx2(out_256, buf_256, 32, 4);
13664         av1_round_shift_array_32_N4_avx2(buf_256, buf_256, 32, -shift[2]);
13665         write_buffer_32x32_N4(buf_256, output);
13666         break;
13667     case H_DCT:
13668         load_buffer_32x8_in_32x32_avx2(input, buf_256, stride);
13669         av1_round_shift_array_32_avx2(buf_256, out_256, txfm2d_size_256 / 4, -shift[0]);
13670         fidtx_wxh_N4_avx2(out_256, buf_256, 32, 1);
13671         av1_round_shift_array_32_avx2(buf_256, out_256, txfm2d_size_256 / 4, -shift[1]);
13672         //transpose 32x8 to 8x32
13673         transpose_8x8_in_32x32_avx2(out_256, buf_256);
13674         transpose_8x8_in_32x32_avx2(out_256 + 1, buf_256 + 32);
13675         transpose_8x8_in_32x32_avx2(out_256 + 2, buf_256 + 64);
13676         transpose_8x8_in_32x32_avx2(out_256 + 3, buf_256 + 96);
13677         fdct32x32_N4_row_avx2(buf_256, out_256, cos_bit_row);
13678         av1_round_shift_array_32_N4_avx2(out_256, out_256, 32, -shift[2]);
13679         transpose_8x8_in_32x32_avx2(out_256, buf_256);
13680         write_buffer_32x32_N4(buf_256, output);
13681         break;
13682     default: assert(0);
13683     }
13684 }
13685 
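/* 16x32 N4 forward transform: DCT_DCT and IDTX only. The rectangular size is
 * rescaled by new_sqrt2 inside av1_round_shift_rect_wxh_N4(), and
 * clear_buffer_wxh_N4() zeroes everything outside the kept top-left 4x8
 * (w/4 x h/4) block. */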
13686 void svt_av1_fwd_txfm2d_16x32_N4_avx2(int16_t *input, int32_t *output, uint32_t stride,
13687                                       TxType tx_type, uint8_t bd) {
13688     __m256i       in[64];
13689     __m256i *     outcoef256    = (__m256i *)output;
13690     const int8_t *shift         = fwd_txfm_shift_ls[TX_16X32];
13691     const int32_t txw_idx       = get_txw_idx(TX_16X32);
13692     const int32_t txh_idx       = get_txh_idx(TX_16X32);
13693     int8_t        bitcol        = fwd_cos_bit_col[txw_idx][txh_idx];
13694     int8_t        bitrow        = fwd_cos_bit_row[txw_idx][txh_idx];
13695     const int32_t txfm_size_col = tx_size_wide[TX_16X32];
13696     const int32_t txfm_size_row = tx_size_high[TX_16X32];
13697     const int32_t num_row       = txfm_size_row >> 3;
13698     const int32_t num_col       = txfm_size_col >> 3;
13699 
13700     switch (tx_type) {
13701     case IDTX:
13702         load_buffer_16x16_N2_half(input, in, stride, 0, 0, shift[0]);
13703         fidtx_wxh_N4_avx2(in, in, 16, 2);
13704         col_txfm_16x16_N2_half_rounding(&in[0], -shift[1]);
13705         fidtx16x16_N4_avx2(in, outcoef256, bitrow, num_row, 2);
13706         av1_round_shift_rect_wxh_N4(
13707             outcoef256, outcoef256, -shift[2], new_sqrt2, num_col, txfm_size_row);
13708         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
13709         break;
13710     case DCT_DCT:
13711         load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
13712         load_buffer_16x16(input + 16 * stride, in + 32, stride, 0, 0, shift[0]);
13713         for (int32_t i = 0; i < num_col; i++)
13714             av1_fdct32_new_N4_avx2((in + i), (in + i), bitcol, 8, num_col);
13715         col_txfm_16x16_N2_rounding(&in[0], -shift[1]);
13716         transpose_8nx8n_N4_half(in, outcoef256, txfm_size_col, txfm_size_row);
13717         fdct16x16_N4_avx2(outcoef256, in, bitrow, num_row, num_row / 2);
13718         transpose_8nx8n_N4_quad(in, outcoef256, txfm_size_row, txfm_size_col);
13719         av1_round_shift_rect_wxh_N4(
13720             outcoef256, outcoef256, -shift[2], new_sqrt2, num_col, txfm_size_row);
13721         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
13722         break;
13723     default: assert(0);
13724     }
13725     (void)bd;
13726 }
13727 
13728 /* call this function only for DCT_DCT, IDTX */
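/* Rectangular 32x16: the output is rescaled by new_sqrt2 and only the
 * top-left 8x4 (w/4 x h/4) coefficients are kept; the rest are zeroed. */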
13729 void svt_av1_fwd_txfm2d_32x16_N4_avx2(int16_t *input, int32_t *output, uint32_t stride,
13730                                       TxType tx_type, uint8_t bd) {
13731     __m256i       in[64];
13732     __m256i *     outcoef256    = (__m256i *)output;
13733     const int8_t *shift         = fwd_txfm_shift_ls[TX_32X16];
13734     const int32_t txw_idx       = get_txw_idx(TX_32X16);
13735     const int32_t txh_idx       = get_txh_idx(TX_32X16);
13736     int8_t        bitcol        = fwd_cos_bit_col[txw_idx][txh_idx];
13737     int8_t        bitrow        = fwd_cos_bit_row[txw_idx][txh_idx];
13738     const int32_t txfm_size_col = tx_size_wide[TX_32X16];
13739     const int32_t txfm_size_row = tx_size_high[TX_32X16];
13740     const int32_t num_row       = txfm_size_row >> 3;
13741     const int32_t num_col       = txfm_size_col >> 3;
13742 
13743     switch (tx_type) {
13744     case IDTX:
13745         load_buffer_16x8n(input, in, stride, 0, 0, shift[0], txfm_size_row / 4);
13746         fidtx16x16_N4_avx2(in, in, bitcol, 4, 4);
13747         col_txfm_32x16_N4_rounding(&in[0], -shift[1]);
13748         fidtx_wxh_N4_avx2(in, outcoef256, 16, 4);
13749         av1_round_shift_rect_wxh_N4(
13750             outcoef256, outcoef256, -shift[2], new_sqrt2, num_col, txfm_size_row);
13751         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
13752         break;
13753     case DCT_DCT:
13754         load_buffer_32x8n(input, in, stride, 0, 0, shift[0], txfm_size_row);
13755         fdct16x16_N4_avx2(in, in, bitcol, num_col, num_col);
13756         col_txfm_16x16_N4_rounding(&in[0], -shift[1]);
13757         col_txfm_16x16_N4_rounding(&in[8], -shift[1]);
13758         transpose_8nx8n_N4_half(in, outcoef256, txfm_size_col, txfm_size_row);
13759         av1_fdct32_new_N4_avx2(outcoef256, in, bitrow, 8, num_row);
13760         transpose_8nx8n_N4_quad(in, outcoef256, txfm_size_row, txfm_size_col);
13761         av1_round_shift_rect_wxh_N4(
13762             outcoef256, outcoef256, -shift[2], new_sqrt2, num_col, txfm_size_row);
13763         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
13764         break;
13765     default: assert(0);
13766     }
13767     (void)bd;
13768 }
13769 
13770 /* call this function only for DCT_DCT, IDTX */
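/* 8x32: no new_sqrt2 rescale is applied here (only 2:1 shapes need it); the
 * result is written directly and clear_buffer_wxh_N4() zeroes everything
 * outside the kept top-left 2x8 (w/4 x h/4) block. */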
13771 void svt_av1_fwd_txfm2d_8x32_N4_avx2(int16_t *input, int32_t *output, uint32_t stride,
13772                                      TxType tx_type, uint8_t bd) {
13773     __m256i       in[32];
13774     __m256i *     outcoef256 = (__m256i *)output;
13775     const int8_t *shift      = fwd_txfm_shift_ls[TX_8X32];
13776     const int32_t txw_idx    = get_txw_idx(TX_8X32);
13777     const int32_t txh_idx    = get_txh_idx(TX_8X32);
13778     int8_t        bitcol     = fwd_cos_bit_col[txw_idx][txh_idx];
13779     int8_t        bitrow     = fwd_cos_bit_row[txw_idx][txh_idx];
13780 
13781     const int32_t txfm_size_col = tx_size_wide[TX_8X32];
13782     const int32_t txfm_size_row = tx_size_high[TX_8X32];
13783     const int32_t num_row       = txfm_size_row >> 3;
13784     const int32_t num_col       = txfm_size_col >> 3;
13785 
13786     switch (tx_type) {
13787     case IDTX:
13788         load_buffer_8x16_N4(input, in, stride, 0, 0, shift[0]);
13789         fidtx_wxh_N4_avx2(in, in, 8, 1);
13790         col_txfm_16x16_N4_rounding(in, -shift[1]);
13791         // row transform
13792         fidtx32x8_N2_avx2(in, outcoef256, bitrow, num_col, 8);
13793         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
13794         break;
13795     case DCT_DCT:
13796         load_buffer_8x16(input, in, stride, 0, 0, shift[0]);
13797         load_buffer_8x16(input + (txfm_size_row >> 1) * stride, in + 16, stride, 0, 0, shift[0]);
13798         av1_fdct32_new_N4_avx2(in, in, bitcol, 8, num_col);
13799         col_txfm_16x16_N4_rounding(in, -shift[1]);
13800         transpose_8nx8n_N4_half(in, outcoef256, txfm_size_col, txfm_size_row);
13801         // row transform
13802         fdct8x8_N4_avx2(outcoef256, in, bitrow, num_row);
13803         transpose_8nx8n_N4_quad(in, outcoef256, txfm_size_row, txfm_size_col);
13804         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
13805         break;
13806     default: assert(0);
13807     }
13808     (void)bd;
13809 }
13810 
13811 /* call this function only for DCT_DCT, IDTX */
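/* 32x8: mirror of the 8x32 path with the roles of rows and columns swapped;
 * the kept output is the top-left 8x2 (w/4 x h/4) block. */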
13812 void svt_av1_fwd_txfm2d_32x8_N4_avx2(int16_t *input, int32_t *output, uint32_t stride,
13813                                      TxType tx_type, uint8_t bd) {
13814     __m256i       in[32];
13815     __m256i *     outcoef256 = (__m256i *)output;
13816     const int8_t *shift      = fwd_txfm_shift_ls[TX_32X8];
13817     const int32_t txw_idx    = get_txw_idx(TX_32X8);
13818     const int32_t txh_idx    = get_txh_idx(TX_32X8);
13819     int8_t        bitcol     = fwd_cos_bit_col[txw_idx][txh_idx];
13820     int8_t        bitrow     = fwd_cos_bit_row[txw_idx][txh_idx];
13821 
13822     const int32_t txfm_size_col = tx_size_wide[TX_32X8];
13823     const int32_t txfm_size_row = tx_size_high[TX_32X8];
13824     const int32_t num_row       = txfm_size_row >> 3;
13825     const int32_t num_col       = txfm_size_col >> 3;
13826 
13827     switch (tx_type) {
13828     case IDTX:
13829         load_buffer_16x8n(input, in, stride, 0, 0, shift[0], txfm_size_row / 4);
13830         fidtx32x8_N2_avx2(in, in, bitcol, num_col, 2);
13831         col_txfm_32x8_N4_rounding(&in[0], -shift[1]);
13832         // row transform
13833         fidtx_wxh_N4_avx2(in, outcoef256, 8, 4);
13834         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
13835         break;
13836     case DCT_DCT:
13837         load_buffer_32x8n(input, in, stride, 0, 0, shift[0], txfm_size_row);
13838         for (int32_t i = 0; i < num_col; i++) fdct8x8_N4_avx2((in + i), (in + i), bitcol, num_col);
13839         col_txfm_16x16_N4_rounding(&in[0], -shift[1]);
13840         transpose_8nx8n_N4_half(in, outcoef256, txfm_size_col, txfm_size_row);
13841         // row transform
13842         av1_fdct32_new_N4_avx2(outcoef256, in, bitrow, 8, num_row);
13843         transpose_8nx8n_N4_quad(in, outcoef256, txfm_size_row, txfm_size_col);
13844         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
13845         break;
13846     default: assert(0);
13847     }
13848     (void)bd;
13849 }
13850 
13851 /* call this function for all 16 transform types */
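/* 8x16: the column pass is a 16-point transform over the 8 columns, the row
 * pass an 8-point transform, and the rectangular new_sqrt2 rescale is applied
 * before the top-left 2x4 (w/4 x h/4) coefficients are written; the remaining
 * positions are zeroed. */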
13852 void svt_av1_fwd_txfm2d_8x16_N4_avx2(int16_t *input, int32_t *output, uint32_t stride,
13853                                      TxType tx_type, uint8_t bd) {
13854     __m256i       in[16], out[16];
13855     __m256i *     outcoef256 = (__m256i *)output;
13856     const int8_t *shift      = fwd_txfm_shift_ls[TX_8X16];
13857     const int32_t txw_idx    = get_txw_idx(TX_8X16);
13858     const int32_t txh_idx    = get_txh_idx(TX_8X16);
13859     int8_t        bitcol     = fwd_cos_bit_col[txw_idx][txh_idx];
13860     int8_t        bitrow     = fwd_cos_bit_row[txw_idx][txh_idx];
13861     int32_t       ud_flip, lr_flip;
13862     get_flip_cfg(tx_type, &ud_flip, &lr_flip);
13863     const int32_t txfm_size_col = tx_size_wide[TX_8X16];
13864     const int32_t txfm_size_row = tx_size_high[TX_8X16];
13865     const int32_t num_col       = txfm_size_col >> 3;
13866 
13867     switch (tx_type) {
13868     case DCT_DCT:
13869         load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
13870         fdct16x16_N4_avx2(in, in, bitcol, num_col, num_col);
13871         col_txfm_8x8_N2_rounding(in, -shift[1]);
13872         transpose_8x8_half_avx2(in, out);
13873         fdct8x8_N4_avx2(out, in, bitrow, 1);
13874         av1_round_shift_rect_array_32_avx2(in, out, 2, -shift[2], new_sqrt2);
13875         transpose_8x8_N2_avx2(out, outcoef256);
13876         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
13877         break;
13878     case ADST_DCT:
13879         load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
13880         fadst16x16_N4_avx2(in, in, bitcol, num_col, num_col);
13881         col_txfm_8x8_N2_rounding(in, -shift[1]);
13882         transpose_8x8_half_avx2(in, out);
13883         fdct8x8_N4_avx2(out, in, bitrow, 1);
13884         av1_round_shift_rect_array_32_avx2(in, out, 2, -shift[2], new_sqrt2);
13885         transpose_8x8_N2_avx2(out, outcoef256);
13886         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
13887         break;
13888     case DCT_ADST:
13889         load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
13890         fdct16x16_N4_avx2(in, in, bitcol, num_col, num_col);
13891         col_txfm_8x8_N2_rounding(in, -shift[1]);
13892         transpose_8x8_half_avx2(in, out);
13893         fadst8x8_N4_avx2(out, in, bitrow, 1);
13894         av1_round_shift_rect_array_32_avx2(in, out, 2, -shift[2], new_sqrt2);
13895         transpose_8x8_N2_avx2(out, outcoef256);
13896         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
13897         break;
13898     case ADST_ADST:
13899         load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
13900         fadst16x16_N4_avx2(in, in, bitcol, num_col, num_col);
13901         col_txfm_8x8_N2_rounding(in, -shift[1]);
13902         transpose_8x8_half_avx2(in, out);
13903         fadst8x8_N4_avx2(out, in, bitrow, 1);
13904         av1_round_shift_rect_array_32_avx2(in, out, 2, -shift[2], new_sqrt2);
13905         transpose_8x8_N2_avx2(out, outcoef256);
13906         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
13907         break;
13908     case FLIPADST_DCT:
13909         load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
13910         fadst16x16_N4_avx2(in, in, bitcol, num_col, num_col);
13911         col_txfm_8x8_N2_rounding(in, -shift[1]);
13912         transpose_8x8_half_avx2(in, out);
13913         fdct8x8_N4_avx2(out, in, bitrow, 1);
13914         av1_round_shift_rect_array_32_avx2(in, out, 2, -shift[2], new_sqrt2);
13915         transpose_8x8_N2_avx2(out, outcoef256);
13916         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
13917         break;
13918     case DCT_FLIPADST:
13919         load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
13920         fdct16x16_N4_avx2(in, in, bitcol, num_col, num_col);
13921         col_txfm_8x8_N2_rounding(in, -shift[1]);
13922         transpose_8x8_half_avx2(in, out);
13923         fadst8x8_N4_avx2(out, in, bitrow, 1);
13924         av1_round_shift_rect_array_32_avx2(in, out, 2, -shift[2], new_sqrt2);
13925         transpose_8x8_N2_avx2(out, outcoef256);
13926         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
13927         break;
13928     case FLIPADST_FLIPADST:
13929         load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
13930         fadst16x16_N4_avx2(in, in, bitcol, num_col, num_col);
13931         col_txfm_8x8_N2_rounding(in, -shift[1]);
13932         transpose_8x8_half_avx2(in, out);
13933         fadst8x8_N4_avx2(out, in, bitrow, 1);
13934         av1_round_shift_rect_array_32_avx2(in, out, 2, -shift[2], new_sqrt2);
13935         transpose_8x8_N2_avx2(out, outcoef256);
13936         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
13937         break;
13938     case ADST_FLIPADST:
13939         load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
13940         fadst16x16_N4_avx2(in, in, bitcol, num_col, num_col);
13941         col_txfm_8x8_N2_rounding(in, -shift[1]);
13942         transpose_8x8_half_avx2(in, out);
13943         fadst8x8_N4_avx2(out, in, bitrow, 1);
13944         av1_round_shift_rect_array_32_avx2(in, out, 2, -shift[2], new_sqrt2);
13945         transpose_8x8_N2_avx2(out, outcoef256);
13946         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
13947         break;
13948     case FLIPADST_ADST:
13949         load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
13950         fadst16x16_N4_avx2(in, in, bitcol, num_col, num_col);
13951         col_txfm_8x8_N2_rounding(in, -shift[1]);
13952         transpose_8x8_half_avx2(in, out);
13953         fadst8x8_N4_avx2(out, in, bitrow, 1);
13954         av1_round_shift_rect_array_32_avx2(in, out, 2, -shift[2], new_sqrt2);
13955         transpose_8x8_N2_avx2(out, outcoef256);
13956         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
13957         break;
13958     case IDTX:
13959         load_buffer_8x8(input, in, stride, ud_flip, lr_flip, shift[0]);
13960         fidtx8xn_N2_col_avx2(in, in, bitcol, 4);
13961         col_txfm_8x8_N2_rounding(in, -shift[1]);
13962         fidtx8x8_N2_avx2(in, in, bitrow, 1);
13963         av1_round_shift_rect_array_32_avx2(in, outcoef256, 4, -shift[2], new_sqrt2);
13964         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
13965         break;
13966     case V_DCT:
13967         load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
13968         fdct16x16_N4_avx2(in, in, bitcol, num_col, num_col);
13969         col_txfm_8x8_N2_rounding(in, -shift[1]);
13970         fidtx8x8_N2_avx2(in, in, bitrow, 1);
13971         av1_round_shift_rect_array_32_avx2(in, outcoef256, 4, -shift[2], new_sqrt2);
13972         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
13973         break;
13974     case H_DCT:
13975         load_buffer_8x8(input, in, stride, ud_flip, lr_flip, shift[0]);
13976         fidtx8xn_N2_col_avx2(in, in, bitcol, 4);
13977         col_txfm_8x8_N2_rounding(in, -shift[1]);
13978         transpose_8x8_half_avx2(in, out);
13979         fdct8x8_N4_avx2(out, in, bitrow, 1);
13980         av1_round_shift_rect_array_32_avx2(in, out, 2, -shift[2], new_sqrt2);
13981         transpose_8x8_N2_avx2(out, outcoef256);
13982         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
13983         break;
13984     case V_ADST:
13985         load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
13986         fadst16x16_N4_avx2(in, in, bitcol, num_col, num_col);
13987         col_txfm_8x8_N2_rounding(in, -shift[1]);
13988         fidtx8x8_N2_avx2(in, in, bitrow, 1);
13989         av1_round_shift_rect_array_32_avx2(in, outcoef256, 4, -shift[2], new_sqrt2);
13990         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
13991         break;
13992     case H_ADST:
13993         load_buffer_8x8(input, in, stride, ud_flip, lr_flip, shift[0]);
13994         fidtx8xn_N2_col_avx2(in, in, bitcol, 4);
13995         col_txfm_8x8_N2_rounding(in, -shift[1]);
13996         transpose_8x8_half_avx2(in, out);
13997         fadst8x8_N4_avx2(out, in, bitrow, 1);
13998         av1_round_shift_rect_array_32_avx2(in, out, 2, -shift[2], new_sqrt2);
13999         transpose_8x8_N2_avx2(out, outcoef256);
14000         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
14001         break;
14002     case V_FLIPADST:
14003         load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
14004         fadst16x16_N4_avx2(in, in, bitcol, num_col, num_col);
14005         col_txfm_8x8_N2_rounding(in, -shift[1]);
14006         fidtx8x8_N2_avx2(in, in, bitrow, 1);
14007         av1_round_shift_rect_array_32_avx2(in, outcoef256, 4, -shift[2], new_sqrt2);
14008         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
14009         break;
14010     case H_FLIPADST:
14011         load_buffer_8x8(input, in, stride, ud_flip, lr_flip, shift[0]);
14012         fidtx8xn_N2_col_avx2(in, in, bitcol, 4);
14013         col_txfm_8x8_N2_rounding(in, -shift[1]);
14014         transpose_8x8_half_avx2(in, out);
14015         fadst8x8_N4_avx2(out, in, bitrow, 1);
14016         av1_round_shift_rect_array_32_avx2(in, out, 2, -shift[2], new_sqrt2);
14017         transpose_8x8_N2_avx2(out, outcoef256);
14018         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
14019         break;
14020     default: assert(0);
14021     }
14022     (void)bd;
14023 }
14024 
14025 /* call this function for all 16 transform types */
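/* 16x8: the column pass runs per 8x8 tile, lr_flip is applied by reordering
 * the transposed tiles before the 16-point row pass, and after the new_sqrt2
 * rescale clear_buffer_wxh_N4() zeroes everything outside the top-left 4x2
 * (w/4 x h/4) block. */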
14026 void svt_av1_fwd_txfm2d_16x8_N4_avx2(int16_t *input, int32_t *output, uint32_t stride,
14027                                      TxType tx_type, uint8_t bd) {
14028     __m256i       in[16], out[16];
14029     __m256i *     outcoef256 = (__m256i *)output;
14030     const int8_t *shift      = fwd_txfm_shift_ls[TX_16X8];
14031     const int32_t txw_idx    = get_txw_idx(TX_16X8);
14032     const int32_t txh_idx    = get_txh_idx(TX_16X8);
14033     int8_t        bitcol     = fwd_cos_bit_col[txw_idx][txh_idx];
14034     int8_t        bitrow     = fwd_cos_bit_row[txw_idx][txh_idx];
14035     int32_t       ud_flip, lr_flip;
14036     get_flip_cfg(tx_type, &ud_flip, &lr_flip);
14037     const int32_t txfm_size_col = tx_size_wide[TX_16X8];
14038     const int32_t txfm_size_row = tx_size_high[TX_16X8];
14039     const int32_t num_row       = txfm_size_row >> 3;
14040     const int32_t num_col       = txfm_size_col >> 3;
14041     assert(num_col > 0);
14042     // column transform
14043     switch (tx_type) {
14044     case DCT_DCT:
14045         for (int32_t i = 0; i < num_col; i++) {
14046             load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
14047             fdct8x8_N4_avx2(in, in, bitcol, 1);
14048             col_txfm_8x8_N4_rounding(in, -shift[1]);
14049             transpose_8x8_half_avx2(in, out + i * 8);
14050         }
14051         if (lr_flip) {
14052             for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
14053             fdct16x16_N4_avx2(in, out, bitrow, num_row, 1);
14054         } else
14055             fdct16x16_N4_avx2(out, out, bitrow, num_row, 1);
14056         transpose_8x8_N2_avx2(out, in);
14057         av1_round_shift_rect_array_32_avx2(in, out, 2, -shift[2], new_sqrt2);
14058         write_buffer_16x8_N4_avx2(out, outcoef256);
14059         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
14060         break;
14061     case ADST_DCT:
14062         for (int32_t i = 0; i < num_col; i++) {
14063             load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
14064             fadst8x8_N4_avx2(in, in, bitcol, 1);
14065             col_txfm_8x8_N4_rounding(in, -shift[1]);
14066             transpose_8x8_half_avx2(in, out + i * 8);
14067         }
14068         if (lr_flip) {
14069             for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
14070             fdct16x16_N4_avx2(in, out, bitrow, num_row, 1);
14071         } else
14072             fdct16x16_N4_avx2(out, out, bitrow, num_row, 1);
14073         transpose_8x8_N2_avx2(out, in);
14074         av1_round_shift_rect_array_32_avx2(in, out, 2, -shift[2], new_sqrt2);
14075         write_buffer_16x8_N4_avx2(out, outcoef256);
14076         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
14077         break;
14078     case DCT_ADST:
14079         for (int32_t i = 0; i < num_col; i++) {
14080             load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
14081             fdct8x8_N4_avx2(in, in, bitcol, 1);
14082             col_txfm_8x8_N4_rounding(in, -shift[1]);
14083             transpose_8x8_half_avx2(in, out + i * 8);
14084         }
14085         if (lr_flip) {
14086             for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
14087             fadst16x16_N4_avx2(in, out, bitrow, num_row, 1);
14088         } else
14089             fadst16x16_N4_avx2(out, out, bitrow, num_row, 1);
14090         transpose_8x8_N2_avx2(out, in);
14091         av1_round_shift_rect_array_32_avx2(in, out, 2, -shift[2], new_sqrt2);
14092         write_buffer_16x8_N4_avx2(out, outcoef256);
14093         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
14094         break;
14095     case ADST_ADST:
14096         for (int32_t i = 0; i < num_col; i++) {
14097             load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
14098             fadst8x8_N4_avx2(in, in, bitcol, 1);
14099             col_txfm_8x8_N4_rounding(in, -shift[1]);
14100             transpose_8x8_half_avx2(in, out + i * 8);
14101         }
14102         if (lr_flip) {
14103             for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
14104             fadst16x16_N4_avx2(in, out, bitrow, num_row, 1);
14105         } else
14106             fadst16x16_N4_avx2(out, out, bitrow, num_row, 1);
14107         transpose_8x8_N2_avx2(out, in);
14108         av1_round_shift_rect_array_32_avx2(in, out, 2, -shift[2], new_sqrt2);
14109         write_buffer_16x8_N4_avx2(out, outcoef256);
14110         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
14111         break;
14112     case FLIPADST_DCT:
14113         for (int32_t i = 0; i < num_col; i++) {
14114             load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
14115             fadst8x8_N4_avx2(in, in, bitcol, 1);
14116             col_txfm_8x8_N4_rounding(in, -shift[1]);
14117             transpose_8x8_half_avx2(in, out + i * 8);
14118         }
14119         if (lr_flip) {
14120             for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
14121             fdct16x16_N4_avx2(in, out, bitrow, num_row, 1);
14122         } else
14123             fdct16x16_N4_avx2(out, out, bitrow, num_row, 1);
14124         transpose_8x8_N2_avx2(out, in);
14125         av1_round_shift_rect_array_32_avx2(in, out, 2, -shift[2], new_sqrt2);
14126         write_buffer_16x8_N4_avx2(out, outcoef256);
14127         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
14128         break;
14129     case DCT_FLIPADST:
14130         for (int32_t i = 0; i < num_col; i++) {
14131             load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
14132             fdct8x8_N4_avx2(in, in, bitcol, 1);
14133             col_txfm_8x8_N4_rounding(in, -shift[1]);
14134             transpose_8x8_half_avx2(in, out + i * 8);
14135         }
14136         if (lr_flip) {
14137             for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
14138             fadst16x16_N4_avx2(in, out, bitrow, num_row, 1);
14139         } else
14140             fadst16x16_N4_avx2(out, out, bitrow, num_row, 1);
14141         transpose_8x8_N2_avx2(out, in);
14142         av1_round_shift_rect_array_32_avx2(in, out, 2, -shift[2], new_sqrt2);
14143         write_buffer_16x8_N4_avx2(out, outcoef256);
14144         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
14145         break;
14146     case FLIPADST_FLIPADST:
14147         for (int32_t i = 0; i < num_col; i++) {
14148             load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
14149             fadst8x8_N4_avx2(in, in, bitcol, 1);
14150             col_txfm_8x8_N4_rounding(in, -shift[1]);
14151             transpose_8x8_half_avx2(in, out + i * 8);
14152         }
14153         if (lr_flip) {
14154             for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
14155             fadst16x16_N4_avx2(in, out, bitrow, num_row, 1);
14156         } else
14157             fadst16x16_N4_avx2(out, out, bitrow, num_row, 1);
14158         transpose_8x8_N2_avx2(out, in);
14159         av1_round_shift_rect_array_32_avx2(in, out, 2, -shift[2], new_sqrt2);
14160         write_buffer_16x8_N4_avx2(out, outcoef256);
14161         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
14162         break;
14163     case ADST_FLIPADST:
14164         for (int32_t i = 0; i < num_col; i++) {
14165             load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
14166             fadst8x8_N4_avx2(in, in, bitcol, 1);
14167             col_txfm_8x8_N4_rounding(in, -shift[1]);
14168             transpose_8x8_half_avx2(in, out + i * 8);
14169         }
14170         if (lr_flip) {
14171             for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
14172             fadst16x16_N4_avx2(in, out, bitrow, num_row, 1);
14173         } else
14174             fadst16x16_N4_avx2(out, out, bitrow, num_row, 1);
14175         transpose_8x8_N2_avx2(out, in);
14176         av1_round_shift_rect_array_32_avx2(in, out, 2, -shift[2], new_sqrt2);
14177         write_buffer_16x8_N4_avx2(out, outcoef256);
14178         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
14179         break;
14180     case FLIPADST_ADST:
14181         for (int32_t i = 0; i < num_col; i++) {
14182             load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
14183             fadst8x8_N4_avx2(in, in, bitcol, 1);
14184             col_txfm_8x8_N4_rounding(in, -shift[1]);
14185             transpose_8x8_half_avx2(in, out + i * 8);
14186         }
14187         if (lr_flip) {
14188             for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
14189             fadst16x16_N4_avx2(in, out, bitrow, num_row, 1);
14190         } else
14191             fadst16x16_N4_avx2(out, out, bitrow, num_row, 1);
14192         transpose_8x8_N2_avx2(out, in);
14193         av1_round_shift_rect_array_32_avx2(in, out, 2, -shift[2], new_sqrt2);
14194         write_buffer_16x8_N4_avx2(out, outcoef256);
14195         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
14196         break;
14197     case IDTX:
14198         load_buffer_8x8(input, in, stride, ud_flip, 0, shift[0]);
14199         fidtx8x8_N4_avx2(in, out, bitcol, 1);
14200         col_txfm_8x8_N4_rounding(out, -shift[1]);
14201         if (lr_flip) {
14202             for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
14203             fidtx8xn_N2_col_avx2(in, out, bitrow, 2);
14204         } else
14205             fidtx8xn_N2_col_avx2(out, out, bitrow, 2);
14206         av1_round_shift_rect_array_32_avx2(out, out, 2, -shift[2], new_sqrt2);
14207         write_buffer_16x8_N4_avx2(out, outcoef256);
14208         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
14209         break;
14210     case V_DCT:
14211         load_buffer_8x8(input, in, stride, ud_flip, 0, shift[0]);
14212         fdct8x8_N4_avx2(in, out, bitcol, 1);
14213         col_txfm_8x8_N4_rounding(out, -shift[1]);
14214         if (lr_flip) {
14215             for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
14216             fidtx8xn_N2_col_avx2(in, out, bitrow, 2);
14217         } else
14218             fidtx8xn_N2_col_avx2(out, out, bitrow, 2);
14219         av1_round_shift_rect_array_32_avx2(out, out, 2, -shift[2], new_sqrt2);
14220         write_buffer_16x8_N4_avx2(out, outcoef256);
14221         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
14222         break;
14223     case H_DCT:
14224         for (int32_t i = 0; i < num_col; i++) {
14225             load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
14226             fidtx8x8_N4_avx2(in, in, bitcol, 1);
14227             col_txfm_8x8_N4_rounding(in, -shift[1]);
14228             transpose_8x8_half_avx2(in, out + i * 8);
14229         }
14230         if (lr_flip) {
14231             for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
14232             fdct16x16_N4_avx2(in, out, bitrow, num_row, 1);
14233         } else
14234             fdct16x16_N4_avx2(out, out, bitrow, num_row, 1);
14235         transpose_8x8_N2_avx2(out, in);
14236         av1_round_shift_rect_array_32_avx2(in, out, 2, -shift[2], new_sqrt2);
14237         write_buffer_16x8_N4_avx2(out, outcoef256);
14238         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
14239         break;
14240     case V_ADST:
14241         load_buffer_8x8(input, in, stride, ud_flip, 0, shift[0]);
14242         fadst8x8_N4_avx2(in, out, bitcol, 1);
14243         col_txfm_8x8_N4_rounding(out, -shift[1]);
14244         if (lr_flip) {
14245             for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
14246             fidtx8xn_N2_col_avx2(in, out, bitrow, 2);
14247         } else
14248             fidtx8xn_N2_col_avx2(out, out, bitrow, 2);
14249         av1_round_shift_rect_array_32_avx2(out, out, 2, -shift[2], new_sqrt2);
14250         write_buffer_16x8_N4_avx2(out, outcoef256);
14251         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
14252         break;
14253     case H_ADST:
14254         for (int32_t i = 0; i < num_col; i++) {
14255             load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
14256             fidtx8x8_N4_avx2(in, in, bitcol, 1);
14257             col_txfm_8x8_N4_rounding(in, -shift[1]);
14258             transpose_8x8_half_avx2(in, out + i * 8);
14259         }
14260         if (lr_flip) {
14261             for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
14262             fadst16x16_N4_avx2(in, out, bitrow, num_row, 1);
14263         } else
14264             fadst16x16_N4_avx2(out, out, bitrow, num_row, 1);
14265         transpose_8x8_N2_avx2(out, in);
14266         av1_round_shift_rect_array_32_avx2(in, out, 2, -shift[2], new_sqrt2);
14267         write_buffer_16x8_N4_avx2(out, outcoef256);
14268         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
14269         break;
14270     case V_FLIPADST:
14271         load_buffer_8x8(input, in, stride, ud_flip, 0, shift[0]);
14272         fadst8x8_N4_avx2(in, out, bitcol, 1);
14273         col_txfm_8x8_N4_rounding(out, -shift[1]);
14274         if (lr_flip) {
14275             for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
14276             fidtx8xn_N2_col_avx2(in, out, bitrow, 2);
14277         } else
14278             fidtx8xn_N2_col_avx2(out, out, bitrow, 2);
14279         av1_round_shift_rect_array_32_avx2(out, out, 2, -shift[2], new_sqrt2);
14280         write_buffer_16x8_N4_avx2(out, outcoef256);
14281         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
14282         break;
14283     case H_FLIPADST:
14284         for (int32_t i = 0; i < num_col; i++) {
14285             load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
14286             fidtx8x8_N4_avx2(in, in, bitcol, 1);
14287             col_txfm_8x8_N4_rounding(in, -shift[1]);
14288             transpose_8x8_half_avx2(in, out + i * 8);
14289         }
14290         if (lr_flip) {
14291             for (int32_t i = 0; i < 16; i++) in[16 - i - 1] = out[i];
14292             fadst16x16_avx2(in, out, bitrow, num_row);
14293         } else
14294             fadst16x16_avx2(out, out, bitrow, num_row);
14295         transpose_8x8_N2_avx2(out, in);
14296         av1_round_shift_rect_array_32_avx2(in, out, 2, -shift[2], new_sqrt2);
14297         write_buffer_16x8_N4_avx2(out, outcoef256);
14298         clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
14299         break;
14300     default: assert(0);
14301     }
14302     (void)bd;
14303 }
14304 
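/* 4x8 N4 forward transform, all 16 tx_types: column transform, shift[1]
 * rounding, an optional transpose, row transform, new_sqrt2 rescale, then
 * write_buffer_4x8_N4() keeps only the low-frequency quarter-by-quarter
 * corner of the coefficients. */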
14305 void svt_av1_fwd_txfm2d_4x8_N4_avx2(int16_t *input, int32_t *output, uint32_t stride,
14306                                     TxType tx_type, uint8_t bd) {
14307     __m256i in[4];
14308     __m256i outcoeff256[4];
14309 
14310     const int8_t *shift   = fwd_txfm_shift_ls[TX_4X8];
14311     const int32_t txw_idx = get_txw_idx(TX_4X8);
14312     const int32_t txh_idx = get_txh_idx(TX_4X8);
14313     int32_t       bitcol  = fwd_cos_bit_col[txw_idx][txh_idx];
14314     int32_t       bitrow  = fwd_cos_bit_row[txw_idx][txh_idx];
14315 
14316     switch (tx_type) {
14317     case DCT_DCT:
14318         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
14319         fdct4x8_N4_avx2(in, in, bitcol);
14320         col_txfm_8x4_N4_rounding(in, -shift[1]);
14321         transpose_4x8_avx2(in, outcoeff256);
14322         fdct4x8_col_N4_avx2(outcoeff256, in, bitrow, 1);
14323         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 1, -shift[2], new_sqrt2);
14324         write_buffer_4x8_N4(outcoeff256, output);
14325         break;
14326     case ADST_DCT:
14327         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
14328         fadst8x4_N4_avx2(in, in, bitcol, 1);
14329         col_txfm_8x4_N4_rounding(in, -shift[1]);
14330         transpose_4x8_avx2(in, outcoeff256);
14331         fdct4x8_col_N4_avx2(outcoeff256, in, bitrow, 1);
14332         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 1, -shift[2], new_sqrt2);
14333         write_buffer_4x8_N4(outcoeff256, output);
14334         break;
14335     case DCT_ADST:
14336         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
14337         fdct4x8_N4_avx2(in, in, bitcol);
14338         col_txfm_8x4_N4_rounding(in, -shift[1]);
14339         transpose_4x8_avx2(in, outcoeff256);
14340         fadst4x8_col_N4_avx2(outcoeff256, in, bitrow, 1);
14341         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 1, -shift[2], new_sqrt2);
14342         write_buffer_4x8_N4(outcoeff256, output);
14343         break;
14344     case ADST_ADST:
14345         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
14346         fadst8x4_N4_avx2(in, in, bitcol, 1);
14347         col_txfm_8x4_N4_rounding(in, -shift[1]);
14348         transpose_4x8_avx2(in, outcoeff256);
14349         fadst4x8_col_N4_avx2(outcoeff256, in, bitrow, 1);
14350         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 1, -shift[2], new_sqrt2);
14351         write_buffer_4x8_N4(outcoeff256, output);
14352         break;
14353     case FLIPADST_DCT:
14354         load_buffer_4x8_avx2(input, in, stride, 1, 0, shift[0]);
14355         fadst8x4_N4_avx2(in, in, bitcol, 1);
14356         col_txfm_8x4_N4_rounding(in, -shift[1]);
14357         transpose_4x8_avx2(in, outcoeff256);
14358         fdct4x8_col_N4_avx2(outcoeff256, in, bitrow, 1);
14359         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 1, -shift[2], new_sqrt2);
14360         write_buffer_4x8_N4(outcoeff256, output);
14361         break;
14362     case DCT_FLIPADST:
14363         load_buffer_4x8_avx2(input, in, stride, 0, 1, shift[0]);
14364         fdct4x8_N4_avx2(in, in, bitcol);
14365         col_txfm_8x4_N4_rounding(in, -shift[1]);
14366         transpose_4x8_avx2(in, outcoeff256);
14367         fadst4x8_col_N4_avx2(outcoeff256, in, bitrow, 1);
14368         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 1, -shift[2], new_sqrt2);
14369         write_buffer_4x8_N4(outcoeff256, output);
14370         break;
14371     case FLIPADST_FLIPADST:
14372         load_buffer_4x8_avx2(input, in, stride, 1, 1, shift[0]);
14373         fadst8x4_N4_avx2(in, in, bitcol, 1);
14374         col_txfm_8x4_N4_rounding(in, -shift[1]);
14375         transpose_4x8_avx2(in, outcoeff256);
14376         fadst4x8_col_N4_avx2(outcoeff256, in, bitrow, 1);
14377         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 1, -shift[2], new_sqrt2);
14378         write_buffer_4x8_N4(outcoeff256, output);
14379         break;
14380     case ADST_FLIPADST:
14381         load_buffer_4x8_avx2(input, in, stride, 0, 1, shift[0]);
14382         fadst8x4_N4_avx2(in, in, bitcol, 1);
14383         col_txfm_8x4_N4_rounding(in, -shift[1]);
14384         transpose_4x8_avx2(in, outcoeff256);
14385         fadst4x8_col_N4_avx2(outcoeff256, in, bitrow, 1);
14386         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 1, -shift[2], new_sqrt2);
14387         write_buffer_4x8_N4(outcoeff256, output);
14388         break;
14389     case FLIPADST_ADST:
14390         load_buffer_4x8_avx2(input, in, stride, 1, 0, shift[0]);
14391         fadst8x4_N4_avx2(in, in, bitcol, 1);
14392         col_txfm_8x4_N4_rounding(in, -shift[1]);
14393         transpose_4x8_avx2(in, outcoeff256);
14394         fadst4x8_col_N4_avx2(outcoeff256, in, bitrow, 1);
14395         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 1, -shift[2], new_sqrt2);
14396         write_buffer_4x8_N4(outcoeff256, output);
14397         break;
14398     case IDTX:
14399         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
14400         fidtx8x4_N4_avx2(in, in, bitcol);
14401         col_txfm_8x4_N4_rounding(in, -shift[1]);
14402         fidtx4x8_col_N4_avx2(in, in, bitrow, 1);
14403         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 1, -shift[2], new_sqrt2);
14404         write_buffer_4x8_N4(outcoeff256, output);
14405         break;
14406     case V_DCT:
14407         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
14408         fdct4x8_N4_avx2(in, in, bitcol);
14409         col_txfm_8x4_N4_rounding(in, -shift[1]);
14410         fidtx4x8_col_N4_avx2(in, in, bitrow, 1);
14411         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 1, -shift[2], new_sqrt2);
14412         write_buffer_4x8_N4(outcoeff256, output);
14413         break;
14414     case H_DCT:
14415         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
14416         fidtx8x4_N4_avx2(in, in, bitcol);
14417         col_txfm_8x4_N4_rounding(in, -shift[1]);
14418         transpose_4x8_avx2(in, outcoeff256);
14419         fdct4x8_col_N4_avx2(outcoeff256, in, bitrow, 1);
14420         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 1, -shift[2], new_sqrt2);
14421         write_buffer_4x8_N4(outcoeff256, output);
14422         break;
14423     case V_ADST:
14424         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
14425         fadst8x4_N4_avx2(in, in, bitcol, 1);
14426         col_txfm_8x4_N4_rounding(in, -shift[1]);
14427         fidtx4x8_col_N4_avx2(in, in, bitrow, 1);
14428         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 1, -shift[2], new_sqrt2);
14429         write_buffer_4x8_N4(outcoeff256, output);
14430         break;
14431     case H_ADST:
14432         load_buffer_4x8_avx2(input, in, stride, 0, 0, shift[0]);
14433         fidtx8x4_N4_avx2(in, in, bitcol);
14434         col_txfm_8x4_N4_rounding(in, -shift[1]);
14435         transpose_4x8_avx2(in, outcoeff256);
14436         fadst4x8_col_N4_avx2(outcoeff256, in, bitrow, 1);
14437         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 1, -shift[2], new_sqrt2);
14438         write_buffer_4x8_N4(outcoeff256, output);
14439         break;
14440     case V_FLIPADST:
14441         load_buffer_4x8_avx2(input, in, stride, 1, 0, shift[0]);
14442         fadst8x4_N4_avx2(in, in, bitcol, 1);
14443         col_txfm_8x4_N4_rounding(in, -shift[1]);
14444         fidtx4x8_col_N4_avx2(in, in, bitrow, 1);
14445         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 1, -shift[2], new_sqrt2);
14446         write_buffer_4x8_N4(outcoeff256, output);
14447         break;
14448     case H_FLIPADST:
14449         load_buffer_4x8_avx2(input, in, stride, 0, 1, shift[0]);
14450         fidtx8x4_N4_avx2(in, in, bitcol);
14451         col_txfm_8x4_N4_rounding(in, -shift[1]);
14452         transpose_4x8_avx2(in, outcoeff256);
14453         fadst4x8_col_N4_avx2(outcoeff256, in, bitrow, 1);
14454         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 1, -shift[2], new_sqrt2);
14455         write_buffer_4x8_N4(outcoeff256, output);
14456         break;
14457     default: assert(0);
14458     }
14459     (void)bd;
14460 }
14461 
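/* 8x4 N4 forward transform, all 16 tx_types: the first pass folds the
 * shift[1] rounding into the *_with_round helpers, the second pass and a
 * transpose follow, and after the new_sqrt2 rescale clear_buffer_wxh_N4()
 * zeroes everything outside the kept low-frequency corner. */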
14462 void svt_av1_fwd_txfm2d_8x4_N4_avx2(int16_t *input, int32_t *output, uint32_t stride,
14463                                     TxType tx_type, uint8_t bd) {
14464     __m256i       in[4];
14465     __m256i *     outcoeff256 = (__m256i *)output;
14466     const int8_t *shift       = fwd_txfm_shift_ls[TX_8X4];
14467     const int32_t txw_idx     = get_txw_idx(TX_8X4);
14468     const int32_t txh_idx     = get_txh_idx(TX_8X4);
14469     int32_t       bitcol      = fwd_cos_bit_col[txw_idx][txh_idx];
14470     int32_t       bitrow      = fwd_cos_bit_row[txw_idx][txh_idx];
14471 
14472     switch (tx_type) {
14473     case DCT_DCT:
14474         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
14475         fdct4x8_row_N4_with_round_avx2(in, in, bitcol, 1, -shift[1]);
14476         fdct4x8_N4_avx2(in, outcoeff256, bitrow);
14477         transpose_4x8_in_4x16_quad_avx2(outcoeff256, in);
14478         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 1, -shift[2], new_sqrt2);
14479         clear_buffer_wxh_N4(outcoeff256, 1, 4);
14480         break;
14481     case ADST_DCT:
14482         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
14483         fadst4x8_row_N4_with_round_avx2(in, in, bitcol, 1, -shift[1]);
14484         fdct4x8_N4_avx2(in, outcoeff256, bitrow);
14485         transpose_4x8_in_4x16_quad_avx2(outcoeff256, in);
14486         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 1, -shift[2], new_sqrt2);
14487         clear_buffer_wxh_N4(outcoeff256, 1, 4);
14488         break;
14489     case DCT_ADST:
14490         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
14491         fdct4x8_row_N4_with_round_avx2(in, in, bitcol, 1, -shift[1]);
14492         fadst8x4_N4_avx2(in, outcoeff256, bitrow, 1);
14493         transpose_4x8_in_4x16_quad_avx2(outcoeff256, in);
14494         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 1, -shift[2], new_sqrt2);
14495         clear_buffer_wxh_N4(outcoeff256, 1, 4);
14496         break;
14497     case ADST_ADST:
14498         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
14499         fadst4x8_row_N4_with_round_avx2(in, in, bitcol, 1, -shift[1]);
14500         fadst8x4_N4_avx2(in, outcoeff256, bitrow, 1);
14501         transpose_4x8_in_4x16_quad_avx2(outcoeff256, in);
14502         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 1, -shift[2], new_sqrt2);
14503         clear_buffer_wxh_N4(outcoeff256, 1, 4);
14504         break;
14505     case FLIPADST_DCT:
14506         load_buffer_8x4_avx2(input, in, stride, 1, 0, shift[0]);
14507         fadst4x8_row_N4_with_round_avx2(in, in, bitcol, 1, -shift[1]);
14508         fdct4x8_N4_avx2(in, outcoeff256, bitrow);
14509         transpose_4x8_in_4x16_quad_avx2(outcoeff256, in);
14510         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 1, -shift[2], new_sqrt2);
14511         clear_buffer_wxh_N4(outcoeff256, 1, 4);
14512         break;
14513     case DCT_FLIPADST:
14514         load_buffer_8x4_avx2(input, in, stride, 0, 1, shift[0]);
14515         fdct4x8_row_N4_with_round_avx2(in, in, bitcol, 1, -shift[1]);
14516         fadst8x4_N4_avx2(in, outcoeff256, bitrow, 1);
14517         transpose_4x8_in_4x16_quad_avx2(outcoeff256, in);
14518         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 1, -shift[2], new_sqrt2);
14519         clear_buffer_wxh_N4(outcoeff256, 1, 4);
14520         break;
14521     case FLIPADST_FLIPADST:
14522         load_buffer_8x4_avx2(input, in, stride, 1, 1, shift[0]);
14523         fadst4x8_row_N4_with_round_avx2(in, in, bitcol, 1, -shift[1]);
14524         fadst8x4_N4_avx2(in, outcoeff256, bitrow, 1);
14525         transpose_4x8_in_4x16_quad_avx2(outcoeff256, in);
14526         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 1, -shift[2], new_sqrt2);
14527         clear_buffer_wxh_N4(outcoeff256, 1, 4);
14528         break;
14529     case ADST_FLIPADST:
14530         load_buffer_8x4_avx2(input, in, stride, 0, 1, shift[0]);
14531         fadst4x8_row_N4_with_round_avx2(in, in, bitcol, 1, -shift[1]);
14532         fadst8x4_N4_avx2(in, outcoeff256, bitrow, 1);
14533         transpose_4x8_in_4x16_quad_avx2(outcoeff256, in);
14534         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 1, -shift[2], new_sqrt2);
14535         clear_buffer_wxh_N4(outcoeff256, 1, 4);
14536         break;
14537     case FLIPADST_ADST:
14538         load_buffer_8x4_avx2(input, in, stride, 1, 0, shift[0]);
14539         fadst4x8_row_N4_with_round_avx2(in, in, bitcol, 1, -shift[1]);
14540         fadst8x4_N4_avx2(in, outcoeff256, bitrow, 1);
14541         transpose_4x8_in_4x16_quad_avx2(outcoeff256, in);
14542         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 1, -shift[2], new_sqrt2);
14543         clear_buffer_wxh_N4(outcoeff256, 1, 4);
14544         break;
14545     case IDTX:
14546         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
14547         fidtx4x8_row_N4_with_round_avx2(in, in, bitcol, -shift[1]);
14548         fidtx8x4_N2_avx2(in, outcoeff256, bitrow);
14549         transpose_4x8_in_4x16_quad_avx2(outcoeff256, in);
14550         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 1, -shift[2], new_sqrt2);
14551         clear_buffer_wxh_N4(outcoeff256, 1, 4);
14552         break;
14553     case V_DCT:
14554         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
14555         fdct4x8_row_N4_with_round_avx2(in, in, bitcol, 1, -shift[1]);
14556         fidtx8x4_N2_avx2(in, outcoeff256, bitrow);
14557         transpose_4x8_in_4x16_quad_avx2(outcoeff256, in);
14558         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 1, -shift[2], new_sqrt2);
14559         clear_buffer_wxh_N4(outcoeff256, 1, 4);
14560         break;
14561     case H_DCT:
14562         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
14563         fidtx4x8_row_N4_with_round_avx2(in, in, bitcol, -shift[1]);
14564         fdct4x8_N4_avx2(in, outcoeff256, bitrow);
14565         transpose_4x8_in_4x16_quad_avx2(outcoeff256, in);
14566         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 1, -shift[2], new_sqrt2);
14567         clear_buffer_wxh_N4(outcoeff256, 1, 4);
14568         break;
14569     case V_ADST:
14570         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
14571         fadst4x8_row_N4_with_round_avx2(in, in, bitcol, 1, -shift[1]);
14572         fidtx8x4_N2_avx2(in, outcoeff256, bitrow);
14573         transpose_4x8_in_4x16_quad_avx2(outcoeff256, in);
14574         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 1, -shift[2], new_sqrt2);
14575         clear_buffer_wxh_N4(outcoeff256, 1, 4);
14576         break;
14577     case H_ADST:
14578         load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
14579         fidtx4x8_row_N4_with_round_avx2(in, in, bitcol, -shift[1]);
14580         fadst8x4_N4_avx2(in, outcoeff256, bitrow, 1);
14581         transpose_4x8_in_4x16_quad_avx2(outcoeff256, in);
14582         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 1, -shift[2], new_sqrt2);
14583         clear_buffer_wxh_N4(outcoeff256, 1, 4);
14584         break;
14585     case V_FLIPADST:
14586         load_buffer_8x4_avx2(input, in, stride, 1, 0, shift[0]);
14587         fadst4x8_row_N4_with_round_avx2(in, in, bitcol, 1, -shift[1]);
14588         fidtx8x4_N2_avx2(in, outcoeff256, bitrow);
14589         transpose_4x8_in_4x16_quad_avx2(outcoeff256, in);
14590         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 1, -shift[2], new_sqrt2);
14591         clear_buffer_wxh_N4(outcoeff256, 1, 4);
14592         break;
14593     case H_FLIPADST:
14594         load_buffer_8x4_avx2(input, in, stride, 0, 1, shift[0]);
14595         fidtx4x8_row_N4_with_round_avx2(in, in, bitcol, -shift[1]);
14596         fadst8x4_N4_avx2(in, outcoeff256, bitrow, 1);
14597         transpose_4x8_in_4x16_quad_avx2(outcoeff256, in);
14598         av1_round_shift_rect_array_32_avx2(in, outcoeff256, 1, -shift[2], new_sqrt2);
14599         clear_buffer_wxh_N4(outcoeff256, 1, 4);
14600         break;
14601     default: assert(0);
14602     }
14603     (void)bd;
14604 }
14605 
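/*
 * 4x16 N4 forward transform (AVX2).  The 16-point first pass (bitcol) is
 * followed by the short row pass (bitrow); no rectangular new_sqrt2 scaling
 * is applied since the 4x16 area is a power of four.  The types whose column
 * transform is the identity (IDTX, H_DCT, H_ADST, H_FLIPADST) load only a
 * 4x4 portion of the input, since the identity pass keeps rows in place and
 * the N4 pruning discards the rest anyway.  The untouched output region is
 * zeroed by clear_buffer_4x16_N4().
 */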
void svt_av1_fwd_txfm2d_4x16_N4_avx2(int16_t *input, int32_t *output, uint32_t stride,
                                     TxType tx_type, uint8_t bd) {
    __m256i       in[8];
    __m256i *     outcoeff256 = (__m256i *)output;
    const int8_t *shift       = fwd_txfm_shift_ls[TX_4X16];
    const int32_t txw_idx     = get_txw_idx(TX_4X16);
    const int32_t txh_idx     = get_txh_idx(TX_4X16);
    int32_t       bitcol      = fwd_cos_bit_col[txw_idx][txh_idx];
    int32_t       bitrow      = fwd_cos_bit_row[txw_idx][txh_idx];

    switch (tx_type) {
    case DCT_DCT:
        load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
        fdct16x4_N4_avx2(in, outcoeff256, bitcol);
        col_txfm_8x8_N4_rounding(outcoeff256, -shift[1]);
        transpose_4x8_in_4x16_avx2(outcoeff256, in);
        fdct4x8_col_N4_avx2(
            in, outcoeff256, bitrow, 2); // dct + transpose + clear right half of buffer
        clear_buffer_4x16_N4(outcoeff256);
        break;
    case ADST_DCT:
        load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
        fadst16x4_N4_avx2(in, outcoeff256, bitcol);
        col_txfm_8x8_N4_rounding(outcoeff256, -shift[1]);
        transpose_4x8_in_4x16_avx2(outcoeff256, in);
        fdct4x8_col_N4_avx2(in, outcoeff256, bitrow, 2);
        clear_buffer_4x16_N4(outcoeff256);
        break;
    case DCT_ADST:
        load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
        fdct16x4_N4_avx2(in, outcoeff256, bitcol);
        col_txfm_8x8_N4_rounding(outcoeff256, -shift[1]);
        transpose_4x8_in_4x16_avx2(outcoeff256, in);
        fadst4x8_col_N4_avx2(in, outcoeff256, bitrow, 2);
        clear_buffer_4x16_N4(outcoeff256);
        break;
    case ADST_ADST:
        load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
        fadst16x4_N4_avx2(in, outcoeff256, bitcol);
        col_txfm_8x8_N4_rounding(outcoeff256, -shift[1]);
        transpose_4x8_in_4x16_avx2(outcoeff256, in);
        fadst4x8_col_N4_avx2(in, outcoeff256, bitrow, 2);
        clear_buffer_4x16_N4(outcoeff256);
        break;
    case FLIPADST_DCT:
        load_buffer_4x16_avx2(input, in, stride, 1, 0, shift[0]);
        fadst16x4_N4_avx2(in, outcoeff256, bitcol);
        col_txfm_8x8_N4_rounding(outcoeff256, -shift[1]);
        transpose_4x8_in_4x16_avx2(outcoeff256, in);
        fdct4x8_col_N4_avx2(in, outcoeff256, bitrow, 2);
        clear_buffer_4x16_N4(outcoeff256);
        break;
    case DCT_FLIPADST:
        load_buffer_4x16_avx2(input, in, stride, 0, 1, shift[0]);
        fdct16x4_N4_avx2(in, outcoeff256, bitcol);
        col_txfm_8x8_N4_rounding(outcoeff256, -shift[1]);
        transpose_4x8_in_4x16_avx2(outcoeff256, in);
        fadst4x8_col_N4_avx2(in, outcoeff256, bitrow, 2);
        clear_buffer_4x16_N4(outcoeff256);
        break;
    case FLIPADST_FLIPADST:
        load_buffer_4x16_avx2(input, in, stride, 1, 1, shift[0]);
        fadst16x4_N4_avx2(in, outcoeff256, bitcol);
        col_txfm_8x8_N4_rounding(outcoeff256, -shift[1]);
        transpose_4x8_in_4x16_avx2(outcoeff256, in);
        fadst4x8_col_N4_avx2(in, outcoeff256, bitrow, 2);
        clear_buffer_4x16_N4(outcoeff256);
        break;
    case ADST_FLIPADST:
        load_buffer_4x16_avx2(input, in, stride, 0, 1, shift[0]);
        fadst16x4_N4_avx2(in, outcoeff256, bitcol);
        col_txfm_8x8_N4_rounding(outcoeff256, -shift[1]);
        transpose_4x8_in_4x16_avx2(outcoeff256, in);
        fadst4x8_col_N4_avx2(in, outcoeff256, bitrow, 2);
        clear_buffer_4x16_N4(outcoeff256);
        break;
    case FLIPADST_ADST:
        load_buffer_4x16_avx2(input, in, stride, 1, 0, shift[0]);
        fadst16x4_N4_avx2(in, outcoeff256, bitcol);
        col_txfm_8x8_N4_rounding(outcoeff256, -shift[1]);
        transpose_4x8_in_4x16_avx2(outcoeff256, in);
        fadst4x8_col_N4_avx2(in, outcoeff256, bitrow, 2);
        clear_buffer_4x16_N4(outcoeff256);
        break;
    case IDTX:
        load_buffer_4x4_avx2(input, in, stride, 0, 0, shift[0]);
        fidtx8xn_N2_col_avx2(in, outcoeff256, bitcol, 2);
        col_txfm_8x8_N4_rounding(outcoeff256, -shift[1]);
        fidtx4x8_col_N4_avx2(outcoeff256, outcoeff256, bitrow, 2);
        clear_buffer_4x16_N4(outcoeff256);
        break;
    case V_DCT:
        load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
        fdct16x4_N4_avx2(in, outcoeff256, bitcol);
        col_txfm_8x8_N4_rounding(outcoeff256, -shift[1]);
        fidtx4x8_col_N4_avx2(outcoeff256, outcoeff256, bitrow, 2);
        clear_buffer_4x16_N4(outcoeff256);
        break;
    case H_DCT:
        load_buffer_4x4_avx2(input, in, stride, 0, 0, shift[0]);
        fidtx8xn_N2_col_avx2(in, outcoeff256, bitcol, 2);
        col_txfm_8x8_N4_rounding(outcoeff256, -shift[1]);
        transpose_4x8_in_4x16_avx2(outcoeff256, in);
        fdct4x8_col_N4_avx2(in, outcoeff256, bitrow, 2);
        clear_buffer_4x16_N4(outcoeff256);
        break;
    case V_ADST:
        load_buffer_4x16_avx2(input, in, stride, 0, 0, shift[0]);
        fadst16x4_N4_avx2(in, outcoeff256, bitcol);
        col_txfm_8x8_N4_rounding(outcoeff256, -shift[1]);
        fidtx4x8_col_N4_avx2(outcoeff256, outcoeff256, bitrow, 2);
        clear_buffer_4x16_N4(outcoeff256);
        break;
    case H_ADST:
        load_buffer_4x4_avx2(input, in, stride, 0, 0, shift[0]);
        fidtx8xn_N2_col_avx2(in, outcoeff256, bitcol, 2);
        col_txfm_8x8_N4_rounding(outcoeff256, -shift[1]);
        transpose_4x8_in_4x16_avx2(outcoeff256, in);
        fadst4x8_col_N4_avx2(in, outcoeff256, bitrow, 2);
        clear_buffer_4x16_N4(outcoeff256);
        break;
    case V_FLIPADST:
        load_buffer_4x16_avx2(input, in, stride, 1, 0, shift[0]);
        fadst16x4_N4_avx2(in, outcoeff256, bitcol);
        col_txfm_8x8_N4_rounding(outcoeff256, -shift[1]);
        fidtx4x8_col_N4_avx2(outcoeff256, outcoeff256, bitrow, 2);
        clear_buffer_4x16_N4(outcoeff256);
        break;
    case H_FLIPADST:
        load_buffer_4x4_avx2(input, in, stride, 0, 1, shift[0]);
        fidtx8xn_N2_col_avx2(in, outcoeff256, bitcol, 2);
        col_txfm_8x8_N4_rounding(outcoeff256, -shift[1]);
        transpose_4x8_in_4x16_avx2(outcoeff256, in);
        fadst4x8_col_N4_avx2(in, outcoeff256, bitrow, 2);
        clear_buffer_4x16_N4(outcoeff256);
        break;
    default: assert(0);
    }
    (void)bd;
}

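/*
 * 16x4 N4 forward transform (AVX2).  The 16-wide first pass is split into two
 * 8-wide halves (the in / in + 4 register groups), each with the inter-stage
 * rounding folded in, followed by the 16-point second pass, a transpose and
 * clear_buffer_wxh_N4().  The V_* types (identity in the second pass) get by
 * with an 8x4 load, and IDTX with a 4x4 load plus an explicit round-shift of
 * the single surviving register.
 */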
void svt_av1_fwd_txfm2d_16x4_N4_avx2(int16_t *input, int32_t *output, uint32_t stride,
                                     TxType tx_type, uint8_t bd) {
    __m256i       in[8];
    __m256i *     outcoeff256 = (__m256i *)output;
    const int8_t *shift       = fwd_shift_16x4;
    const int32_t txw_idx     = get_txw_idx(TX_16X4);
    const int32_t txh_idx     = get_txh_idx(TX_16X4);
    int32_t       bitcol      = fwd_cos_bit_col[txw_idx][txh_idx];
    int32_t       bitrow      = fwd_cos_bit_row[txw_idx][txh_idx];

    switch (tx_type) {
    case DCT_DCT:
        load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
        fdct4x8_row_N4_with_round_avx2(in, outcoeff256, bitcol, 1, -shift[1]);
        fdct4x8_row_N4_with_round_avx2(in + 4, outcoeff256 + 4, bitcol, 1, -shift[1]);
        fdct16x4_N4_avx2(outcoeff256, in, bitrow);
        transpose_4x8_in_4x16_quad_avx2(in, outcoeff256);
        clear_buffer_wxh_N4(outcoeff256, 2, 4);
        break;
    case ADST_DCT:
        load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
        fadst4x8_row_N4_with_round_avx2(in, outcoeff256, bitcol, 1, -shift[1]);
        fadst4x8_row_N4_with_round_avx2(in + 4, outcoeff256 + 4, bitcol, 1, -shift[1]);
        fdct16x4_N4_avx2(outcoeff256, in, bitrow);
        transpose_4x8_in_4x16_quad_avx2(in, outcoeff256);
        clear_buffer_wxh_N4(outcoeff256, 2, 4);
        break;
    case DCT_ADST:
        load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
        fdct4x8_row_N4_with_round_avx2(in, outcoeff256, bitcol, 1, -shift[1]);
        fdct4x8_row_N4_with_round_avx2(in + 4, outcoeff256 + 4, bitcol, 1, -shift[1]);
        fadst16x4_N4_avx2(outcoeff256, in, bitrow);
        transpose_4x8_in_4x16_quad_avx2(in, outcoeff256);
        clear_buffer_wxh_N4(outcoeff256, 2, 4);
        break;
    case ADST_ADST:
        load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
        fadst4x8_row_N4_with_round_avx2(in, outcoeff256, bitcol, 1, -shift[1]);
        fadst4x8_row_N4_with_round_avx2(in + 4, outcoeff256 + 4, bitcol, 1, -shift[1]);
        fadst16x4_N4_avx2(outcoeff256, in, bitrow);
        transpose_4x8_in_4x16_quad_avx2(in, outcoeff256);
        clear_buffer_wxh_N4(outcoeff256, 2, 4);
        break;
    case FLIPADST_DCT:
        load_buffer_16x4_avx2(input, in, stride, 1, 0, shift[0]);
        fadst4x8_row_N4_with_round_avx2(in, outcoeff256, bitcol, 1, -shift[1]);
        fadst4x8_row_N4_with_round_avx2(in + 4, outcoeff256 + 4, bitcol, 1, -shift[1]);
        fdct16x4_N4_avx2(outcoeff256, in, bitrow);
        transpose_4x8_in_4x16_quad_avx2(in, outcoeff256);
        clear_buffer_wxh_N4(outcoeff256, 2, 4);
        break;
    case DCT_FLIPADST:
        load_buffer_16x4_avx2(input, in, stride, 0, 1, shift[0]);
        fdct4x8_row_N4_with_round_avx2(in, outcoeff256, bitcol, 1, -shift[1]);
        fdct4x8_row_N4_with_round_avx2(in + 4, outcoeff256 + 4, bitcol, 1, -shift[1]);
        fadst16x4_N4_avx2(outcoeff256, in, bitrow);
        transpose_4x8_in_4x16_quad_avx2(in, outcoeff256);
        clear_buffer_wxh_N4(outcoeff256, 2, 4);
        break;
    case FLIPADST_FLIPADST:
        load_buffer_16x4_avx2(input, in, stride, 1, 1, shift[0]);
        fadst4x8_row_N4_with_round_avx2(in, outcoeff256, bitcol, 1, -shift[1]);
        fadst4x8_row_N4_with_round_avx2(in + 4, outcoeff256 + 4, bitcol, 1, -shift[1]);
        fadst16x4_N4_avx2(outcoeff256, in, bitrow);
        transpose_4x8_in_4x16_quad_avx2(in, outcoeff256);
        clear_buffer_wxh_N4(outcoeff256, 2, 4);
        break;
    case ADST_FLIPADST:
        load_buffer_16x4_avx2(input, in, stride, 0, 1, shift[0]);
        fadst4x8_row_N4_with_round_avx2(in, outcoeff256, bitcol, 1, -shift[1]);
        fadst4x8_row_N4_with_round_avx2(in + 4, outcoeff256 + 4, bitcol, 1, -shift[1]);
        fadst16x4_N4_avx2(outcoeff256, in, bitrow);
        transpose_4x8_in_4x16_quad_avx2(in, outcoeff256);
        clear_buffer_wxh_N4(outcoeff256, 2, 4);
        break;
    case FLIPADST_ADST:
        load_buffer_16x4_avx2(input, in, stride, 1, 0, shift[0]);
        fadst4x8_row_N4_with_round_avx2(in, outcoeff256, bitcol, 1, -shift[1]);
        fadst4x8_row_N4_with_round_avx2(in + 4, outcoeff256 + 4, bitcol, 1, -shift[1]);
        fadst16x4_N4_avx2(outcoeff256, in, bitrow);
        transpose_4x8_in_4x16_quad_avx2(in, outcoeff256);
        clear_buffer_wxh_N4(outcoeff256, 2, 4);
        break;
    case IDTX:
        load_buffer_4x4_avx2(input, in, stride, 0, 0, shift[0]);
        fidtx4x8_row_N4_avx2(in, outcoeff256, bitcol);
        __m256i rounding = _mm256_set1_epi32(1 << (-shift[1] - 1));
        outcoeff256[0]   = _mm256_add_epi32(outcoeff256[0], rounding);
        outcoeff256[0]   = _mm256_srai_epi32(outcoeff256[0], -shift[1]);
        fidtx4x8_N2_perm_avx2(outcoeff256, outcoeff256, bitrow);
        clear_buffer_wxh_N4(outcoeff256, 2, 4);
        break;
    case V_DCT:
        load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
        fdct4x8_row_N4_with_round_avx2(in, outcoeff256, bitcol, 1, -shift[1]);
        transpose_4x8_in_4x16_quad_avx2(outcoeff256, in);
        fidtx8xn_N2_col_avx2(in, outcoeff256, bitrow, 1);
        clear_buffer_wxh_N4(outcoeff256, 2, 4);
        break;
    case H_DCT:
        load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
        fidtx4x8_row_N4_with_round_avx2(in, outcoeff256, bitcol, -shift[1]);
        fidtx4x8_row_N4_with_round_avx2(in + 4, outcoeff256 + 4, bitcol, -shift[1]);
        fdct16x4_N4_avx2(outcoeff256, in, bitrow);
        transpose_4x8_in_4x16_quad_avx2(in, outcoeff256);
        clear_buffer_wxh_N4(outcoeff256, 2, 4);
        break;
    case V_ADST:
        load_buffer_8x4_avx2(input, in, stride, 0, 0, shift[0]);
        fadst4x8_row_N4_with_round_avx2(in, outcoeff256, bitcol, 1, -shift[1]);
        transpose_4x8_in_4x16_quad_avx2(outcoeff256, in);
        fidtx8xn_N2_col_avx2(in, outcoeff256, bitrow, 1);
        clear_buffer_wxh_N4(outcoeff256, 2, 4);
        break;
    case H_ADST:
        load_buffer_16x4_avx2(input, in, stride, 0, 0, shift[0]);
        fidtx4x8_row_N4_with_round_avx2(in, outcoeff256, bitcol, -shift[1]);
        fidtx4x8_row_N4_with_round_avx2(in + 4, outcoeff256 + 4, bitcol, -shift[1]);
        fadst16x4_N4_avx2(outcoeff256, in, bitrow);
        transpose_4x8_in_4x16_quad_avx2(in, outcoeff256);
        clear_buffer_wxh_N4(outcoeff256, 2, 4);
        break;
    case V_FLIPADST:
        load_buffer_8x4_avx2(input, in, stride, 1, 0, shift[0]);
        fadst4x8_row_N4_with_round_avx2(in, outcoeff256, bitcol, 1, -shift[1]);
        transpose_4x8_in_4x16_quad_avx2(outcoeff256, in);
        fidtx8xn_N2_col_avx2(in, outcoeff256, bitrow, 1);
        clear_buffer_wxh_N4(outcoeff256, 2, 4);
        break;
    case H_FLIPADST:
        load_buffer_16x4_avx2(input, in, stride, 0, 1, shift[0]);
        fidtx4x8_row_N4_with_round_avx2(in, outcoeff256, bitcol, -shift[1]);
        fidtx4x8_row_N4_with_round_avx2(in + 4, outcoeff256 + 4, bitcol, -shift[1]);
        fadst16x4_N4_avx2(outcoeff256, in, bitrow);
        transpose_4x8_in_4x16_quad_avx2(in, outcoeff256);
        clear_buffer_wxh_N4(outcoeff256, 2, 4);
        break;
    default: assert(0);
    }
    (void)bd;
}

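/*
 * 32x64 N4 forward transform (AVX2).  tx_type is ignored (DCT_DCT only at
 * this size): 64-point column FDCT, two 16x16 rounding passes, half-width
 * transpose, 32-point row FDCT over the retained rows, quarter transpose
 * back, rectangular new_sqrt2 round-shift, then clearing of the pruned
 * coefficients.
 */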
void svt_av1_fwd_txfm2d_32x64_N4_avx2(int16_t *input, int32_t *output, uint32_t stride,
                                      TxType tx_type, uint8_t bd) {
    (void)tx_type;
    __m256i       in[256];
    __m256i *     outcoef256    = (__m256i *)output;
    const int8_t *shift         = fwd_txfm_shift_ls[TX_32X64];
    const int32_t txw_idx       = get_txw_idx(TX_32X64);
    const int32_t txh_idx       = get_txh_idx(TX_32X64);
    const int32_t txfm_size_col = tx_size_wide[TX_32X64];
    const int32_t txfm_size_row = tx_size_high[TX_32X64];
    int8_t        bitcol        = fwd_cos_bit_col[txw_idx][txh_idx];
    int8_t        bitrow        = fwd_cos_bit_row[txw_idx][txh_idx];
    const int32_t num_row       = txfm_size_row >> 3;
    const int32_t num_col       = txfm_size_col >> 3;

    load_buffer_32x8n(input, in, stride, 0, 0, shift[0], txfm_size_row);
    av1_fdct64_new_N4_avx2(in, in, bitcol, txfm_size_col, num_col);
    col_txfm_16x16_rounding(in, -shift[1]);
    col_txfm_16x16_rounding(in + txfm_size_col, -shift[1]);
    transpose_8nx8n_N4_half(in, outcoef256, txfm_size_col, txfm_size_row);
    av1_fdct32_new_N4_avx2(outcoef256, in, bitrow, txfm_size_row / 2, num_row);
    transpose_8nx8n_N4_quad(in, outcoef256, txfm_size_row, txfm_size_col);
    av1_round_shift_rect_wxh_N4(
        outcoef256, outcoef256, -shift[2], new_sqrt2, num_col, txfm_size_row);
    clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
    (void)bd;
}

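/*
 * 64x32 N4 forward transform (AVX2).  Mirror image of the 32x64 kernel:
 * each 64-wide input row is loaded as two 32-sample chunks, then 32-point
 * column FDCT, rounding, half transpose, 64-point row FDCT, quarter
 * transpose back, rectangular new_sqrt2 round-shift and the final clear.
 */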
void svt_av1_fwd_txfm2d_64x32_N4_avx2(int16_t *input, int32_t *output, uint32_t stride,
                                      TxType tx_type, uint8_t bd) {
    (void)tx_type;
    __m256i       in[256];
    __m256i *     outcoef256    = (__m256i *)output;
    const int8_t *shift         = fwd_txfm_shift_ls[TX_64X32];
    const int32_t txw_idx       = get_txw_idx(TX_64X32);
    const int32_t txh_idx       = get_txh_idx(TX_64X32);
    const int32_t txfm_size_col = tx_size_wide[TX_64X32];
    const int32_t txfm_size_row = tx_size_high[TX_64X32];
    int8_t        bitcol        = fwd_cos_bit_col[txw_idx][txh_idx];
    int8_t        bitrow        = fwd_cos_bit_row[txw_idx][txh_idx];
    const int32_t num_row       = txfm_size_row >> 3;
    const int32_t num_col       = txfm_size_col >> 3;

    for (int32_t i = 0; i < 32; i++) {
        load_buffer_32_avx2(input + 0 + i * stride, in + 0 + i * 8, 8, 0, 0, shift[0]);
        load_buffer_32_avx2(input + 32 + i * stride, in + 4 + i * 8, 8, 0, 0, shift[0]);
    }
    av1_fdct32_new_N4_avx2(in, in, bitcol, txfm_size_col, num_col);
    col_txfm_16x16_rounding(in, -shift[1]);
    col_txfm_16x16_rounding(in + txfm_size_row, -shift[1]);
    transpose_8nx8n_N4_half(in, outcoef256, txfm_size_col, txfm_size_row);
    av1_fdct64_new_N4_avx2(outcoef256, in, bitrow, txfm_size_row / 2, num_row);
    transpose_8nx8n_N4_quad(in, outcoef256, txfm_size_row, txfm_size_col);
    av1_round_shift_rect_wxh_N4(
        outcoef256, outcoef256, -shift[2], new_sqrt2, num_col, txfm_size_row);
    clear_buffer_wxh_N4(outcoef256, num_col, txfm_size_row);
    (void)bd;
}

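/*
 * 16x64 N4 forward transform (AVX2).  The flip configuration is taken from
 * get_flip_cfg() and passed to the loads; the pipeline is a 64-point column
 * FDCT, one 16x16 rounding pass, half transpose, 16-point row FDCT on the
 * retained rows and a quarter transpose back.  The area is a power of four,
 * so no new_sqrt2 scaling is needed before clear_buffer_wxh_N4().
 */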
void svt_av1_fwd_txfm2d_16x64_N4_avx2(int16_t *input, int32_t *output, uint32_t stride,
                                      TxType tx_type, uint8_t bd) {
    __m256i       in[128];
    __m256i *     outcoeff256   = (__m256i *)output;
    const int8_t *shift         = fwd_txfm_shift_ls[TX_16X64];
    const int32_t txw_idx       = get_txw_idx(TX_16X64);
    const int32_t txh_idx       = get_txh_idx(TX_16X64);
    const int32_t txfm_size_col = tx_size_wide[TX_16X64];
    const int32_t txfm_size_row = tx_size_high[TX_16X64];
    int8_t        bitcol        = fwd_cos_bit_col[txw_idx][txh_idx];
    int8_t        bitrow        = fwd_cos_bit_row[txw_idx][txh_idx];
    int32_t       ud_flip, lr_flip;
    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
    const int32_t num_row = txfm_size_row >> 3;
    const int32_t num_col = txfm_size_col >> 3;
    // column transform
    for (int32_t i = 0; i < txfm_size_row; i += num_col) {
        load_buffer_16_avx2(
            input + (i + 0) * stride, in + (i + 0) * num_col, 8, ud_flip, lr_flip, shift[0]);
        load_buffer_16_avx2(
            input + (i + 1) * stride, in + (i + 1) * num_col, 8, ud_flip, lr_flip, shift[0]);
    }
    av1_fdct64_new_N4_avx2(in, outcoeff256, bitcol, txfm_size_col, num_col);
    col_txfm_16x16_rounding(outcoeff256, -shift[1]);
    transpose_8nx8n_N4_half(outcoeff256, in, txfm_size_col, txfm_size_row);
    // row transform
    fdct16x16_N4_avx2(in, in, bitrow, num_row, num_row / 4);
    transpose_8nx8n_N4_quad(in, outcoeff256, txfm_size_row, txfm_size_col);
    clear_buffer_wxh_N4(outcoeff256, num_col, txfm_size_row);
    (void)bd;
}

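/*
 * 64x16 N4 forward transform (AVX2).  Each 64-wide input row is loaded as
 * four 16-sample chunks; a 16-point column FDCT and rounding pass are
 * followed by a half transpose, a 64-point row FDCT on the retained rows, a
 * quarter transpose back and the final clear.  As with 16x64, the area is a
 * power of four, so no new_sqrt2 scaling is applied.
 */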
void svt_av1_fwd_txfm2d_64x16_N4_avx2(int16_t *input, int32_t *output, uint32_t stride,
                                      TxType tx_type, uint8_t bd) {
    __m256i       in[128];
    __m256i *     outcoeff256   = (__m256i *)output;
    const int8_t *shift         = fwd_txfm_shift_ls[TX_64X16];
    const int32_t txw_idx       = get_txw_idx(TX_64X16);
    const int32_t txh_idx       = get_txh_idx(TX_64X16);
    const int32_t txfm_size_col = tx_size_wide[TX_64X16];
    const int32_t txfm_size_row = tx_size_high[TX_64X16];
    int8_t        bitcol        = fwd_cos_bit_col[txw_idx][txh_idx];
    int8_t        bitrow        = fwd_cos_bit_row[txw_idx][txh_idx];
    int32_t       ud_flip, lr_flip;
    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
    const int32_t num_row = txfm_size_row >> 3;
    const int32_t num_col = txfm_size_col >> 3;
    // column transform
    for (int32_t i = 0; i < txfm_size_row; i++) {
        load_buffer_16_avx2(input + 0 + i * stride, in + 0 + i * 8, 8, ud_flip, lr_flip, shift[0]);
        load_buffer_16_avx2(input + 16 + i * stride, in + 2 + i * 8, 8, ud_flip, lr_flip, shift[0]);
        load_buffer_16_avx2(input + 32 + i * stride, in + 4 + i * 8, 8, ud_flip, lr_flip, shift[0]);
        load_buffer_16_avx2(input + 48 + i * stride, in + 6 + i * 8, 8, ud_flip, lr_flip, shift[0]);
    }

    fdct16x16_N4_avx2(in, outcoeff256, bitcol, num_col, num_col);
    col_txfm_16x16_rounding(outcoeff256, -shift[1]);
    transpose_8nx8n_N4_half(outcoeff256, in, txfm_size_col, txfm_size_row);
    // row transform
    av1_fdct64_new_N4_avx2(in, in, bitrow, txfm_size_row / 2, num_row);
    transpose_8nx8n_N4_quad(in, outcoeff256, txfm_size_row, txfm_size_col);
    clear_buffer_wxh_N4(outcoeff256, num_col, txfm_size_row);
    (void)bd;
}