1 /*
2  * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include "config/aom_config.h"
13 
14 #include "config/av1_rtcd.h"
15 
16 #include "av1/common/av1_inv_txfm1d_cfg.h"
17 #include "av1/common/x86/av1_txfm_sse2.h"
18 #include "av1/common/x86/av1_inv_txfm_avx2.h"
19 #include "av1/common/x86/av1_inv_txfm_ssse3.h"
20 
21 // TODO(venkatsanampudi@ittiam.com): move this to header file
22 
23 // Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5
24 static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
25                                           4 * 5793 };
26 
// Stage 5 of the 16-point inverse DCT butterfly network. Each __m256i holds
// 16 lanes of 16-bit coefficients; helper macros come from
// av1_inv_txfm_avx2.h / av1_txfm_sse2.h. Applies add/sub butterflies to
// x1[0..3] and x1[8..15] and a cos(pi/4) rotation to the x1[5]/x1[6] pair.
static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi,
                                      const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
  // Butterfly rotation with rounding offset _r at cos_bit precision.
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);

  btf_16_adds_subs_avx2(&x1[8], &x1[11]);
  btf_16_adds_subs_avx2(&x1[9], &x1[10]);
  btf_16_adds_subs_avx2(&x1[15], &x1[12]);
  btf_16_adds_subs_avx2(&x1[14], &x1[13]);
}
40 
// Stage 6 of the 16-point inverse DCT: add/sub butterflies on x[0..7] and
// cos(pi/4) rotations of the x[10]/x[13] and x[11]/x[12] pairs.
static INLINE void idct16_stage6_avx2(__m256i *x, const int32_t *cospi,
                                      const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_avx2(&x[0], &x[7]);
  btf_16_adds_subs_avx2(&x[1], &x[6]);
  btf_16_adds_subs_avx2(&x[2], &x[5]);
  btf_16_adds_subs_avx2(&x[3], &x[4]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
}
52 
idct16_stage7_avx2(__m256i * output,__m256i * x1)53 static INLINE void idct16_stage7_avx2(__m256i *output, __m256i *x1) {
54   btf_16_adds_subs_out_avx2(&output[0], &output[15], x1[0], x1[15]);
55   btf_16_adds_subs_out_avx2(&output[1], &output[14], x1[1], x1[14]);
56   btf_16_adds_subs_out_avx2(&output[2], &output[13], x1[2], x1[13]);
57   btf_16_adds_subs_out_avx2(&output[3], &output[12], x1[3], x1[12]);
58   btf_16_adds_subs_out_avx2(&output[4], &output[11], x1[4], x1[11]);
59   btf_16_adds_subs_out_avx2(&output[5], &output[10], x1[5], x1[10]);
60   btf_16_adds_subs_out_avx2(&output[6], &output[9], x1[6], x1[9]);
61   btf_16_adds_subs_out_avx2(&output[7], &output[8], x1[7], x1[8]);
62 }
63 
// Full 16-point inverse DCT over 16 parallel columns of 16-bit coefficients
// (one coefficient row per __m256i). Stage 1 applies the input permutation;
// stages 2-7 run the butterfly network with INV_COS_BIT-precision twiddles.
static void idct16_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) {
  (void)(cos_bit);
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // Rounding offset (0.5 in fixed point) applied inside the rotations.
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  // Packed twiddle-factor pairs for the rotation stages below.
  __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
  __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
  __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
  __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
  __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
  __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
  __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
  __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
  __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
  __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
  __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
  __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
  __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
  __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
  __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);

  // stage 1: bit-reversal-style reordering of the input rows.
  __m256i x1[16];
  x1[0] = input[0];
  x1[1] = input[8];
  x1[2] = input[4];
  x1[3] = input[12];
  x1[4] = input[2];
  x1[5] = input[10];
  x1[6] = input[6];
  x1[7] = input[14];
  x1[8] = input[1];
  x1[9] = input[9];
  x1[10] = input[5];
  x1[11] = input[13];
  x1[12] = input[3];
  x1[13] = input[11];
  x1[14] = input[7];
  x1[15] = input[15];

  // stage 2: rotations on the odd (x1[8..15]) branch.
  btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, cos_bit);
  btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, cos_bit);

  // stage 3
  btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, cos_bit);
  btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
  btf_16_adds_subs_avx2(&x1[15], &x1[14]);

  // stage 4
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
  btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);

  // stages 5-7 are shared with the low8 variant.
  idct16_stage5_avx2(x1, cospi, _r, cos_bit);
  idct16_stage6_avx2(x1, cospi, _r, cos_bit);
  idct16_stage7_avx2(output, x1);
}
134 
// 16-point inverse DCT specialized for inputs with only the first 8
// coefficient rows nonzero. btf_16_w16_0_avx2 performs the stage-2/3/4
// rotations against an implicitly-zero partner, so only the populated
// branches are computed before falling into the shared stage-5..7 helpers.
static void idct16_low8_avx2(const __m256i *input, __m256i *output,
                             int8_t cos_bit) {
  (void)(cos_bit);
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);

  // stage 1: only even slots are loaded; odd slots are implicitly zero.
  __m256i x1[16];
  x1[0] = input[0];
  x1[2] = input[4];
  x1[4] = input[2];
  x1[6] = input[6];
  x1[8] = input[1];
  x1[10] = input[5];
  x1[12] = input[3];
  x1[14] = input[7];

  // stage 2: zero-partner rotations populate the missing slots.
  btf_16_w16_0_avx2(cospi[60], cospi[4], x1[8], x1[8], x1[15]);
  btf_16_w16_0_avx2(-cospi[36], cospi[28], x1[14], x1[9], x1[14]);
  btf_16_w16_0_avx2(cospi[44], cospi[20], x1[10], x1[10], x1[13]);
  btf_16_w16_0_avx2(-cospi[52], cospi[12], x1[12], x1[11], x1[12]);

  // stage 3
  btf_16_w16_0_avx2(cospi[56], cospi[8], x1[4], x1[4], x1[7]);
  btf_16_w16_0_avx2(-cospi[40], cospi[24], x1[6], x1[5], x1[6]);
  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
  btf_16_adds_subs_avx2(&x1[15], &x1[14]);

  // stage 4
  btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);
  btf_16_w16_0_avx2(cospi[48], cospi[16], x1[2], x1[2], x1[3]);
  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);

  // stages 5-7 are shared with the full-width variant.
  idct16_stage5_avx2(x1, cospi, _r, cos_bit);
  idct16_stage6_avx2(x1, cospi, _r, cos_bit);
  idct16_stage7_avx2(output, x1);
}
182 
idct16_low1_avx2(const __m256i * input,__m256i * output,int8_t cos_bit)183 static void idct16_low1_avx2(const __m256i *input, __m256i *output,
184                              int8_t cos_bit) {
185   (void)(cos_bit);
186   const int32_t *cospi = cospi_arr(INV_COS_BIT);
187 
188   // stage 1
189   __m256i x1[2];
190   x1[0] = input[0];
191 
192   // stage 2
193   // stage 3
194   // stage 4
195   btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);
196 
197   // stage 5
198   // stage 6
199   output[0] = x1[0];
200   output[1] = x1[1];
201   output[2] = x1[1];
202   output[3] = x1[0];
203   output[4] = x1[0];
204   output[5] = x1[1];
205   output[6] = x1[1];
206   output[7] = x1[0];
207   output[8] = x1[0];
208   output[9] = x1[1];
209   output[10] = x1[1];
210   output[11] = x1[0];
211   output[12] = x1[0];
212   output[13] = x1[1];
213   output[14] = x1[1];
214   output[15] = x1[0];
215 }
216 
iadst16_stage3_avx2(__m256i * x)217 static INLINE void iadst16_stage3_avx2(__m256i *x) {
218   btf_16_adds_subs_avx2(&x[0], &x[8]);
219   btf_16_adds_subs_avx2(&x[1], &x[9]);
220   btf_16_adds_subs_avx2(&x[2], &x[10]);
221   btf_16_adds_subs_avx2(&x[3], &x[11]);
222   btf_16_adds_subs_avx2(&x[4], &x[12]);
223   btf_16_adds_subs_avx2(&x[5], &x[13]);
224   btf_16_adds_subs_avx2(&x[6], &x[14]);
225   btf_16_adds_subs_avx2(&x[7], &x[15]);
226 }
227 
// Stage 4 of the 16-point inverse ADST: twiddle rotations on the upper half
// x[8..15], using +/-cospi[8]/cospi[56] and +/-cospi[24]/cospi[40] pairs.
static INLINE void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi,
                                       const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
  const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
  const __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
  const __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
  const __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]);
  const __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]);
  btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x[8], &x[9], _r, cos_bit);
  btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x[10], &x[11], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x[12], &x[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x[14], &x[15], _r, cos_bit);
}
241 
iadst16_stage5_avx2(__m256i * x)242 static INLINE void iadst16_stage5_avx2(__m256i *x) {
243   btf_16_adds_subs_avx2(&x[0], &x[4]);
244   btf_16_adds_subs_avx2(&x[1], &x[5]);
245   btf_16_adds_subs_avx2(&x[2], &x[6]);
246   btf_16_adds_subs_avx2(&x[3], &x[7]);
247   btf_16_adds_subs_avx2(&x[8], &x[12]);
248   btf_16_adds_subs_avx2(&x[9], &x[13]);
249   btf_16_adds_subs_avx2(&x[10], &x[14]);
250   btf_16_adds_subs_avx2(&x[11], &x[15]);
251 }
252 
iadst16_stage6_avx2(__m256i * x,const int32_t * cospi,const __m256i _r,int8_t cos_bit)253 static INLINE void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi,
254                                        const __m256i _r, int8_t cos_bit) {
255   const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
256   const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
257   const __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
258   btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[4], &x[5], _r, cos_bit);
259   btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[6], &x[7], _r, cos_bit);
260   btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[12], &x[13], _r, cos_bit);
261   btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[14], &x[15], _r, cos_bit);
262 }
263 
iadst16_stage7_avx2(__m256i * x)264 static INLINE void iadst16_stage7_avx2(__m256i *x) {
265   btf_16_adds_subs_avx2(&x[0], &x[2]);
266   btf_16_adds_subs_avx2(&x[1], &x[3]);
267   btf_16_adds_subs_avx2(&x[4], &x[6]);
268   btf_16_adds_subs_avx2(&x[5], &x[7]);
269   btf_16_adds_subs_avx2(&x[8], &x[10]);
270   btf_16_adds_subs_avx2(&x[9], &x[11]);
271   btf_16_adds_subs_avx2(&x[12], &x[14]);
272   btf_16_adds_subs_avx2(&x[13], &x[15]);
273 }
274 
iadst16_stage8_avx2(__m256i * x1,const int32_t * cospi,const __m256i _r,int8_t cos_bit)275 static INLINE void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi,
276                                        const __m256i _r, int8_t cos_bit) {
277   const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
278   const __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
279   btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit);
280   btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit);
281   btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit);
282   btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit);
283 }
284 
iadst16_stage9_avx2(__m256i * output,__m256i * x1)285 static INLINE void iadst16_stage9_avx2(__m256i *output, __m256i *x1) {
286   const __m256i __zero = _mm256_setzero_si256();
287   output[0] = x1[0];
288   output[1] = _mm256_subs_epi16(__zero, x1[8]);
289   output[2] = x1[12];
290   output[3] = _mm256_subs_epi16(__zero, x1[4]);
291   output[4] = x1[6];
292   output[5] = _mm256_subs_epi16(__zero, x1[14]);
293   output[6] = x1[10];
294   output[7] = _mm256_subs_epi16(__zero, x1[2]);
295   output[8] = x1[3];
296   output[9] = _mm256_subs_epi16(__zero, x1[11]);
297   output[10] = x1[15];
298   output[11] = _mm256_subs_epi16(__zero, x1[7]);
299   output[12] = x1[5];
300   output[13] = _mm256_subs_epi16(__zero, x1[13]);
301   output[14] = x1[9];
302   output[15] = _mm256_subs_epi16(__zero, x1[1]);
303 }
304 
// Full 16-point inverse ADST over 16 parallel columns of 16-bit coefficients.
// Stage 1 reorders the input rows; stage 2 applies the odd-cosine rotations;
// stages 3-9 are shared helpers.
static void iadst16_avx2(const __m256i *input, __m256i *output,
                         int8_t cos_bit) {
  (void)(cos_bit);
  const int32_t *cospi = cospi_arr(INV_COS_BIT);

  // Rounding offset (0.5 in fixed point) applied inside the rotations.
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  // Packed twiddle-factor pairs for the stage-2 rotations.
  __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
  __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
  __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
  __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
  __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
  __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
  __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
  __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
  __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
  __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
  __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
  __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
  __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
  __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
  __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
  __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);

  // stage 1: ADST-specific input permutation.
  __m256i x1[16];
  x1[0] = input[15];
  x1[1] = input[0];
  x1[2] = input[13];
  x1[3] = input[2];
  x1[4] = input[11];
  x1[5] = input[4];
  x1[6] = input[9];
  x1[7] = input[6];
  x1[8] = input[7];
  x1[9] = input[8];
  x1[10] = input[5];
  x1[11] = input[10];
  x1[12] = input[3];
  x1[13] = input[12];
  x1[14] = input[1];
  x1[15] = input[14];

  // stage 2
  btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit);
  btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit);
  btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit);
  btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit);
  btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit);
  btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit);
  btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit);

  // stages 3-9 are shared with the low8/low1 variants.
  iadst16_stage3_avx2(x1);
  iadst16_stage4_avx2(x1, cospi, _r, cos_bit);
  iadst16_stage5_avx2(x1);
  iadst16_stage6_avx2(x1, cospi, _r, cos_bit);
  iadst16_stage7_avx2(x1);
  iadst16_stage8_avx2(x1, cospi, _r, cos_bit);
  iadst16_stage9_avx2(output, x1);
}
366 
iadst16_low8_avx2(const __m256i * input,__m256i * output,int8_t cos_bit)367 static void iadst16_low8_avx2(const __m256i *input, __m256i *output,
368                               int8_t cos_bit) {
369   (void)(cos_bit);
370   const int32_t *cospi = cospi_arr(INV_COS_BIT);
371   const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
372 
373   // stage 1
374   __m256i x1[16];
375   x1[1] = input[0];
376   x1[3] = input[2];
377   x1[5] = input[4];
378   x1[7] = input[6];
379   x1[8] = input[7];
380   x1[10] = input[5];
381   x1[12] = input[3];
382   x1[14] = input[1];
383 
384   // stage 2
385   btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);
386   btf_16_w16_0_avx2(cospi[54], -cospi[10], x1[3], x1[2], x1[3]);
387   btf_16_w16_0_avx2(cospi[46], -cospi[18], x1[5], x1[4], x1[5]);
388   btf_16_w16_0_avx2(cospi[38], -cospi[26], x1[7], x1[6], x1[7]);
389   btf_16_w16_0_avx2(cospi[34], cospi[30], x1[8], x1[8], x1[9]);
390   btf_16_w16_0_avx2(cospi[42], cospi[22], x1[10], x1[10], x1[11]);
391   btf_16_w16_0_avx2(cospi[50], cospi[14], x1[12], x1[12], x1[13]);
392   btf_16_w16_0_avx2(cospi[58], cospi[06], x1[14], x1[14], x1[15]);
393 
394   iadst16_stage3_avx2(x1);
395   iadst16_stage4_avx2(x1, cospi, _r, cos_bit);
396   iadst16_stage5_avx2(x1);
397   iadst16_stage6_avx2(x1, cospi, _r, cos_bit);
398   iadst16_stage7_avx2(x1);
399   iadst16_stage8_avx2(x1, cospi, _r, cos_bit);
400   iadst16_stage9_avx2(output, x1);
401 }
402 
// 16-point inverse ADST for a DC-only input. Only the branches reachable
// from x1[1] are computed; all later stages copy/rotate from those, and the
// add/sub stages 3, 5, and 7 degenerate into plain copies since the partner
// slots are implicitly zero.
static void iadst16_low1_avx2(const __m256i *input, __m256i *output,
                              int8_t cos_bit) {
  (void)(cos_bit);
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
  const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
  const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
  const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);

  // stage 1
  __m256i x1[16];
  x1[1] = input[0];

  // stage 2: rotate the single coefficient against a zero partner.
  btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);

  // stage 3: butterflies with zero partners reduce to copies.
  x1[8] = x1[0];
  x1[9] = x1[1];

  // stage 4
  btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit);

  // stage 5: copies again (zero partners).
  x1[4] = x1[0];
  x1[5] = x1[1];

  x1[12] = x1[8];
  x1[13] = x1[9];

  // stage 6
  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit);
  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit);

  // stage 7: copies (zero partners).
  x1[2] = x1[0];
  x1[3] = x1[1];
  x1[6] = x1[4];
  x1[7] = x1[5];
  x1[10] = x1[8];
  x1[11] = x1[9];
  x1[14] = x1[12];
  x1[15] = x1[13];

  // stages 8-9 are shared with the full-width variant.
  iadst16_stage8_avx2(x1, cospi, _r, cos_bit);
  iadst16_stage9_avx2(output, x1);
}
452 
idct32_high16_stage3_avx2(__m256i * x)453 static INLINE void idct32_high16_stage3_avx2(__m256i *x) {
454   btf_16_adds_subs_avx2(&x[16], &x[17]);
455   btf_16_adds_subs_avx2(&x[19], &x[18]);
456   btf_16_adds_subs_avx2(&x[20], &x[21]);
457   btf_16_adds_subs_avx2(&x[23], &x[22]);
458   btf_16_adds_subs_avx2(&x[24], &x[25]);
459   btf_16_adds_subs_avx2(&x[27], &x[26]);
460   btf_16_adds_subs_avx2(&x[28], &x[29]);
461   btf_16_adds_subs_avx2(&x[31], &x[30]);
462 }
463 
// Stage 4 of the 32-point inverse DCT for the high 16 coefficients: twiddle
// rotations on the x[17]/x[30], x[18]/x[29], x[21]/x[26], x[22]/x[25] pairs.
static INLINE void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
}
477 
// Stage 5 of the 32-point inverse DCT covering the high 24 coefficients:
// rotations on x[9]/x[14] and x[10]/x[13], then add/sub butterflies on the
// x[16..31] range.
static INLINE void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
  btf_16_adds_subs_avx2(&x[16], &x[19]);
  btf_16_adds_subs_avx2(&x[17], &x[18]);
  btf_16_adds_subs_avx2(&x[23], &x[20]);
  btf_16_adds_subs_avx2(&x[22], &x[21]);
  btf_16_adds_subs_avx2(&x[24], &x[27]);
  btf_16_adds_subs_avx2(&x[25], &x[26]);
  btf_16_adds_subs_avx2(&x[31], &x[28]);
  btf_16_adds_subs_avx2(&x[30], &x[29]);
}
494 
// Stage 6 of the 32-point inverse DCT covering the high 28 coefficients:
// cos(pi/4) rotation on x[5]/x[6], butterflies on x[8..15], and
// cospi[16]/cospi[48] rotations on four pairs in the x[18..29] range.
static INLINE void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
  btf_16_adds_subs_avx2(&x[8], &x[11]);
  btf_16_adds_subs_avx2(&x[9], &x[10]);
  btf_16_adds_subs_avx2(&x[15], &x[12]);
  btf_16_adds_subs_avx2(&x[14], &x[13]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit);
}
512 
// Stage 7 of the 32-point inverse DCT: butterflies on x[0..7], cos(pi/4)
// rotations on x[10]/x[13] and x[11]/x[12], and mirrored butterflies across
// the x[16..31] range.
static INLINE void idct32_stage7_avx2(__m256i *x, const int32_t *cospi,
                                      const __m256i _r, int8_t cos_bit) {
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_avx2(&x[0], &x[7]);
  btf_16_adds_subs_avx2(&x[1], &x[6]);
  btf_16_adds_subs_avx2(&x[2], &x[5]);
  btf_16_adds_subs_avx2(&x[3], &x[4]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
  btf_16_adds_subs_avx2(&x[16], &x[23]);
  btf_16_adds_subs_avx2(&x[17], &x[22]);
  btf_16_adds_subs_avx2(&x[18], &x[21]);
  btf_16_adds_subs_avx2(&x[19], &x[20]);
  btf_16_adds_subs_avx2(&x[31], &x[24]);
  btf_16_adds_subs_avx2(&x[30], &x[25]);
  btf_16_adds_subs_avx2(&x[29], &x[26]);
  btf_16_adds_subs_avx2(&x[28], &x[27]);
}
532 
idct32_stage8_avx2(__m256i * x,const int32_t * cospi,const __m256i _r,int8_t cos_bit)533 static INLINE void idct32_stage8_avx2(__m256i *x, const int32_t *cospi,
534                                       const __m256i _r, int8_t cos_bit) {
535   const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
536   const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
537   btf_16_adds_subs_avx2(&x[0], &x[15]);
538   btf_16_adds_subs_avx2(&x[1], &x[14]);
539   btf_16_adds_subs_avx2(&x[2], &x[13]);
540   btf_16_adds_subs_avx2(&x[3], &x[12]);
541   btf_16_adds_subs_avx2(&x[4], &x[11]);
542   btf_16_adds_subs_avx2(&x[5], &x[10]);
543   btf_16_adds_subs_avx2(&x[6], &x[9]);
544   btf_16_adds_subs_avx2(&x[7], &x[8]);
545   btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit);
546   btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit);
547   btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit);
548   btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit);
549 }
550 
idct32_stage9_avx2(__m256i * output,__m256i * x)551 static INLINE void idct32_stage9_avx2(__m256i *output, __m256i *x) {
552   btf_16_adds_subs_out_avx2(&output[0], &output[31], x[0], x[31]);
553   btf_16_adds_subs_out_avx2(&output[1], &output[30], x[1], x[30]);
554   btf_16_adds_subs_out_avx2(&output[2], &output[29], x[2], x[29]);
555   btf_16_adds_subs_out_avx2(&output[3], &output[28], x[3], x[28]);
556   btf_16_adds_subs_out_avx2(&output[4], &output[27], x[4], x[27]);
557   btf_16_adds_subs_out_avx2(&output[5], &output[26], x[5], x[26]);
558   btf_16_adds_subs_out_avx2(&output[6], &output[25], x[6], x[25]);
559   btf_16_adds_subs_out_avx2(&output[7], &output[24], x[7], x[24]);
560   btf_16_adds_subs_out_avx2(&output[8], &output[23], x[8], x[23]);
561   btf_16_adds_subs_out_avx2(&output[9], &output[22], x[9], x[22]);
562   btf_16_adds_subs_out_avx2(&output[10], &output[21], x[10], x[21]);
563   btf_16_adds_subs_out_avx2(&output[11], &output[20], x[11], x[20]);
564   btf_16_adds_subs_out_avx2(&output[12], &output[19], x[12], x[19]);
565   btf_16_adds_subs_out_avx2(&output[13], &output[18], x[13], x[18]);
566   btf_16_adds_subs_out_avx2(&output[14], &output[17], x[14], x[17]);
567   btf_16_adds_subs_out_avx2(&output[15], &output[16], x[15], x[16]);
568 }
569 
idct32_low1_avx2(const __m256i * input,__m256i * output,int8_t cos_bit)570 static void idct32_low1_avx2(const __m256i *input, __m256i *output,
571                              int8_t cos_bit) {
572   (void)cos_bit;
573   const int32_t *cospi = cospi_arr(INV_COS_BIT);
574 
575   // stage 1
576   __m256i x[2];
577   x[0] = input[0];
578 
579   // stage 2
580   // stage 3
581   // stage 4
582   // stage 5
583   btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
584 
585   // stage 6
586   // stage 7
587   // stage 8
588   // stage 9
589   output[0] = x[0];
590   output[31] = x[0];
591   output[1] = x[1];
592   output[30] = x[1];
593   output[2] = x[1];
594   output[29] = x[1];
595   output[3] = x[0];
596   output[28] = x[0];
597   output[4] = x[0];
598   output[27] = x[0];
599   output[5] = x[1];
600   output[26] = x[1];
601   output[6] = x[1];
602   output[25] = x[1];
603   output[7] = x[0];
604   output[24] = x[0];
605   output[8] = x[0];
606   output[23] = x[0];
607   output[9] = x[1];
608   output[22] = x[1];
609   output[10] = x[1];
610   output[21] = x[1];
611   output[11] = x[0];
612   output[20] = x[0];
613   output[12] = x[0];
614   output[19] = x[0];
615   output[13] = x[1];
616   output[18] = x[1];
617   output[14] = x[1];
618   output[17] = x[1];
619   output[15] = x[0];
620   output[16] = x[0];
621 }
622 
// 32-point inverse DCT specialized for inputs with only the first 8
// coefficient rows nonzero. Zero-partner rotations (btf_16_w16_0_avx2) and
// plain copies replace the early butterfly stages, then the shared
// high-coefficient helpers finish the network.
static void idct32_low8_avx2(const __m256i *input, __m256i *output,
                             int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  // stage 1: load only the nonzero rows into their permuted slots.
  __m256i x[32];
  x[0] = input[0];
  x[4] = input[4];
  x[8] = input[2];
  x[12] = input[6];
  x[16] = input[1];
  x[20] = input[5];
  x[24] = input[3];
  x[28] = input[7];

  // stage 2: zero-partner rotations.
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);

  // stage 3: butterflies with zero partners reduce to copies.
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  x[17] = x[16];
  x[18] = x[19];
  x[21] = x[20];
  x[22] = x[23];
  x[25] = x[24];
  x[26] = x[27];
  x[29] = x[28];
  x[30] = x[31];

  // stage 4
  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  x[9] = x[8];
  x[10] = x[11];
  x[13] = x[12];
  x[14] = x[15];
  idct32_high16_stage4_avx2(x, cospi, _r, cos_bit);

  // stage 5
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  x[5] = x[4];
  x[6] = x[7];
  idct32_high24_stage5_avx2(x, cospi, _r, cos_bit);
  // stage 6
  x[3] = x[0];
  x[2] = x[1];
  idct32_high28_stage6_avx2(x, cospi, _r, cos_bit);

  // stages 7-9 are shared with the other variants.
  idct32_stage7_avx2(x, cospi, _r, cos_bit);
  idct32_stage8_avx2(x, cospi, _r, cos_bit);
  idct32_stage9_avx2(output, x);
}
680 
// 32-point inverse DCT specialized for inputs with only the first 16
// coefficient rows nonzero: the even slots of x[] are loaded and stage 2/3/4
// rotations run against implicitly-zero partners via btf_16_w16_0_avx2.
static void idct32_low16_avx2(const __m256i *input, __m256i *output,
                              int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  // stage 1: permuted load of the 16 nonzero rows into even slots.
  __m256i x[32];
  x[0] = input[0];
  x[2] = input[8];
  x[4] = input[4];
  x[6] = input[12];
  x[8] = input[2];
  x[10] = input[10];
  x[12] = input[6];
  x[14] = input[14];
  x[16] = input[1];
  x[18] = input[9];
  x[20] = input[5];
  x[22] = input[13];
  x[24] = input[3];
  x[26] = input[11];
  x[28] = input[7];
  x[30] = input[15];

  // stage 2: zero-partner rotations on the high half.
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
  btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
  btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);

  // stage 3
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
  btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  idct32_high16_stage3_avx2(x);

  // stage 4
  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
  btf_16_adds_subs_avx2(&x[8], &x[9]);
  btf_16_adds_subs_avx2(&x[11], &x[10]);
  btf_16_adds_subs_avx2(&x[12], &x[13]);
  btf_16_adds_subs_avx2(&x[15], &x[14]);
  idct32_high16_stage4_avx2(x, cospi, _r, cos_bit);

  // stage 5
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
  btf_16_adds_subs_avx2(&x[4], &x[5]);
  btf_16_adds_subs_avx2(&x[7], &x[6]);
  idct32_high24_stage5_avx2(x, cospi, _r, cos_bit);

  // stage 6
  btf_16_adds_subs_avx2(&x[0], &x[3]);
  btf_16_adds_subs_avx2(&x[1], &x[2]);
  idct32_high28_stage6_avx2(x, cospi, _r, cos_bit);

  // stages 7-9 are shared with the other variants.
  idct32_stage7_avx2(x, cospi, _r, cos_bit);
  idct32_stage8_avx2(x, cospi, _r, cos_bit);
  idct32_stage9_avx2(output, x);
}
747 
// Full 32-point inverse DCT over 16-lane/16-bit AVX2 vectors: all 32 input
// rows are read, so every stage uses the two-input btf_16_w16_avx2 rotation
// (unlike the low1/low8/low16 shortcuts above, which exploit zero inputs).
static void idct32_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) {
  // cos_bit is forwarded to the btf macros; the (void) cast silences the
  // unused-parameter warning in builds where the macros use INV_COS_BIT
  // directly — NOTE(review): confirm against the macro definition.
  (void)(cos_bit);
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // Rounding offset: 0.5 in Q(INV_COS_BIT) fixed point.
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  // Interleaved (w0, w1) weight pairs for the packed-16-bit madd butterflies.
  __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
  __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
  __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
  __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
  __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
  __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
  __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
  __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
  __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
  __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
  __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
  __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
  __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
  __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
  __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
  __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
  __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
  __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
  __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
  __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
  __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
  __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
  __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
  __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
  __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
  __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
  __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
  __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
  __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
  __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);

  // stage 1: load all 32 coefficients in bit-reversed butterfly order.
  __m256i x1[32];
  x1[0] = input[0];
  x1[1] = input[16];
  x1[2] = input[8];
  x1[3] = input[24];
  x1[4] = input[4];
  x1[5] = input[20];
  x1[6] = input[12];
  x1[7] = input[28];
  x1[8] = input[2];
  x1[9] = input[18];
  x1[10] = input[10];
  x1[11] = input[26];
  x1[12] = input[6];
  x1[13] = input[22];
  x1[14] = input[14];
  x1[15] = input[30];
  x1[16] = input[1];
  x1[17] = input[17];
  x1[18] = input[9];
  x1[19] = input[25];
  x1[20] = input[5];
  x1[21] = input[21];
  x1[22] = input[13];
  x1[23] = input[29];
  x1[24] = input[3];
  x1[25] = input[19];
  x1[26] = input[11];
  x1[27] = input[27];
  x1[28] = input[7];
  x1[29] = input[23];
  x1[30] = input[15];
  x1[31] = input[31];

  // stage 2: rotations pairing x1[16+i] with x1[31-i].
  btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, &x1[16], &x1[31], _r, cos_bit);
  btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, &x1[17], &x1[30], _r, cos_bit);
  btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, &x1[18], &x1[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, &x1[19], &x1[28], _r, cos_bit);
  btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, &x1[20], &x1[27], _r, cos_bit);
  btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, &x1[21], &x1[26], _r, cos_bit);
  btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, &x1[22], &x1[25], _r, cos_bit);
  btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, &x1[23], &x1[24], _r, cos_bit);

  // stage 3
  btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, cos_bit);
  btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, cos_bit);
  idct32_high16_stage3_avx2(x1);

  // stage 4
  btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, cos_bit);
  btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
  idct32_high16_stage4_avx2(x1, cospi, _r, cos_bit);

  // stage 5
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
  btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
  idct32_high24_stage5_avx2(x1, cospi, _r, cos_bit);

  // stage 6
  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
  idct32_high28_stage6_avx2(x1, cospi, _r, cos_bit);

  // stages 7-9: shared helpers; stage 9 writes the final output array.
  idct32_stage7_avx2(x1, cospi, _r, cos_bit);
  idct32_stage8_avx2(x1, cospi, _r, cos_bit);
  idct32_stage9_avx2(output, x1);
}
863 
// 64-point IDCT, stage 4, high half (x[32..63]): eight paired rotations of
// x[33+..46] against x[62-..49]. Elements x[32], x[35], x[36], x[39], ... are
// untouched at this stage.
static INLINE void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  // cos_bit is also forwarded to the btf macro below; the (void) cast
  // silences the unused-parameter warning in builds where the macro ignores
  // it — NOTE(review): confirm against the macro definition.
  (void)cos_bit;
  const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
  const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
  const __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]);
  const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
  const __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
  const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
  const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
  const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
  const __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]);
  const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
  const __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
  const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit);
  btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x[34], &x[61], _r, cos_bit);
  btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x[37], &x[58], _r, cos_bit);
  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit);
  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit);
  btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x[42], &x[53], _r, cos_bit);
  btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x[45], &x[50], _r, cos_bit);
  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit);
}
888 
// 64-point IDCT, stage 5, high 48 elements: rotations on x[16..31] pairs plus
// in-place add/sub butterflies across the four 8-element groups of x[32..63].
static INLINE void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  // See idct64_stage4_high32_avx2 for the (void)cos_bit rationale.
  (void)cos_bit;
  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
  // Add/sub butterflies; note the argument order flips for the second half of
  // each group (e.g. &x[39], &x[36]) so the subtraction sign is reversed.
  btf_16_adds_subs_avx2(&x[32], &x[35]);
  btf_16_adds_subs_avx2(&x[33], &x[34]);
  btf_16_adds_subs_avx2(&x[39], &x[36]);
  btf_16_adds_subs_avx2(&x[38], &x[37]);
  btf_16_adds_subs_avx2(&x[40], &x[43]);
  btf_16_adds_subs_avx2(&x[41], &x[42]);
  btf_16_adds_subs_avx2(&x[47], &x[44]);
  btf_16_adds_subs_avx2(&x[46], &x[45]);
  btf_16_adds_subs_avx2(&x[48], &x[51]);
  btf_16_adds_subs_avx2(&x[49], &x[50]);
  btf_16_adds_subs_avx2(&x[55], &x[52]);
  btf_16_adds_subs_avx2(&x[54], &x[53]);
  btf_16_adds_subs_avx2(&x[56], &x[59]);
  btf_16_adds_subs_avx2(&x[57], &x[58]);
  btf_16_adds_subs_avx2(&x[63], &x[60]);
  btf_16_adds_subs_avx2(&x[62], &x[61]);
}
919 
// 64-point IDCT, stage 6, high half (x[32..63]) only: eight rotations, each
// weight pair applied to two adjacent element pairs.
static INLINE void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  // See idct64_stage4_high32_avx2 for the (void)cos_bit rationale.
  (void)cos_bit;
  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[34], &x[61], _r, cos_bit);
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[35], &x[60], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[36], &x[59], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[37], &x[58], _r, cos_bit);
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[42], &x[53], _r, cos_bit);
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[43], &x[52], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[44], &x[51], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[45], &x[50], _r, cos_bit);
}
938 
// 64-point IDCT, stage 6, high 48 elements: add/sub butterflies over
// x[16..31], then the x[32..63] rotations via idct64_stage6_high32_avx2.
static INLINE void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  btf_16_adds_subs_avx2(&x[16], &x[19]);
  btf_16_adds_subs_avx2(&x[17], &x[18]);
  btf_16_adds_subs_avx2(&x[23], &x[20]);
  btf_16_adds_subs_avx2(&x[22], &x[21]);
  btf_16_adds_subs_avx2(&x[24], &x[27]);
  btf_16_adds_subs_avx2(&x[25], &x[26]);
  btf_16_adds_subs_avx2(&x[31], &x[28]);
  btf_16_adds_subs_avx2(&x[30], &x[29]);
  idct64_stage6_high32_avx2(x, cospi, _r, cos_bit);
}
951 
// 64-point IDCT, stage 7, high 48 elements: +/-16/48 rotations on the
// x[18..21]/x[26..29] pairs and 16-wide add/sub butterflies over x[32..63].
static INLINE void idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  // See idct64_stage4_high32_avx2 for the (void)cos_bit rationale.
  (void)cos_bit;
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit);
  btf_16_adds_subs_avx2(&x[32], &x[39]);
  btf_16_adds_subs_avx2(&x[33], &x[38]);
  btf_16_adds_subs_avx2(&x[34], &x[37]);
  btf_16_adds_subs_avx2(&x[35], &x[36]);
  btf_16_adds_subs_avx2(&x[47], &x[40]);
  btf_16_adds_subs_avx2(&x[46], &x[41]);
  btf_16_adds_subs_avx2(&x[45], &x[42]);
  btf_16_adds_subs_avx2(&x[44], &x[43]);
  btf_16_adds_subs_avx2(&x[48], &x[55]);
  btf_16_adds_subs_avx2(&x[49], &x[54]);
  btf_16_adds_subs_avx2(&x[50], &x[53]);
  btf_16_adds_subs_avx2(&x[51], &x[52]);
  btf_16_adds_subs_avx2(&x[63], &x[56]);
  btf_16_adds_subs_avx2(&x[62], &x[57]);
  btf_16_adds_subs_avx2(&x[61], &x[58]);
  btf_16_adds_subs_avx2(&x[60], &x[59]);
}
979 
// 64-point IDCT, stage 8, high 48 elements: add/sub butterflies over
// x[16..31] followed by +/-16/48 rotations on the x[36..43]/x[52..59] pairs.
static INLINE void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  // See idct64_stage4_high32_avx2 for the (void)cos_bit rationale.
  (void)cos_bit;
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  btf_16_adds_subs_avx2(&x[16], &x[23]);
  btf_16_adds_subs_avx2(&x[17], &x[22]);
  btf_16_adds_subs_avx2(&x[18], &x[21]);
  btf_16_adds_subs_avx2(&x[19], &x[20]);
  btf_16_adds_subs_avx2(&x[31], &x[24]);
  btf_16_adds_subs_avx2(&x[30], &x[25]);
  btf_16_adds_subs_avx2(&x[29], &x[26]);
  btf_16_adds_subs_avx2(&x[28], &x[27]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[36], &x[59], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[37], &x[58], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[38], &x[57], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[39], &x[56], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[40], &x[55], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[41], &x[54], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[42], &x[53], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[43], &x[52], _r, cos_bit);
}
1003 
// 64-point IDCT, stage 9 (all 64 elements): folds the low 16 into 8 add/sub
// pairs, rotates x[20..23]/x[24..27] by +/-32, and applies 32-wide add/sub
// butterflies over x[32..63].
static INLINE void idct64_stage9_avx2(__m256i *x, const int32_t *cospi,
                                      const __m256i _r, int8_t cos_bit) {
  // See idct64_stage4_high32_avx2 for the (void)cos_bit rationale.
  (void)cos_bit;
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_avx2(&x[0], &x[15]);
  btf_16_adds_subs_avx2(&x[1], &x[14]);
  btf_16_adds_subs_avx2(&x[2], &x[13]);
  btf_16_adds_subs_avx2(&x[3], &x[12]);
  btf_16_adds_subs_avx2(&x[4], &x[11]);
  btf_16_adds_subs_avx2(&x[5], &x[10]);
  btf_16_adds_subs_avx2(&x[6], &x[9]);
  btf_16_adds_subs_avx2(&x[7], &x[8]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit);
  btf_16_adds_subs_avx2(&x[32], &x[47]);
  btf_16_adds_subs_avx2(&x[33], &x[46]);
  btf_16_adds_subs_avx2(&x[34], &x[45]);
  btf_16_adds_subs_avx2(&x[35], &x[44]);
  btf_16_adds_subs_avx2(&x[36], &x[43]);
  btf_16_adds_subs_avx2(&x[37], &x[42]);
  btf_16_adds_subs_avx2(&x[38], &x[41]);
  btf_16_adds_subs_avx2(&x[39], &x[40]);
  btf_16_adds_subs_avx2(&x[63], &x[48]);
  btf_16_adds_subs_avx2(&x[62], &x[49]);
  btf_16_adds_subs_avx2(&x[61], &x[50]);
  btf_16_adds_subs_avx2(&x[60], &x[51]);
  btf_16_adds_subs_avx2(&x[59], &x[52]);
  btf_16_adds_subs_avx2(&x[58], &x[53]);
  btf_16_adds_subs_avx2(&x[57], &x[54]);
  btf_16_adds_subs_avx2(&x[56], &x[55]);
}
1038 
// 64-point IDCT, stage 10: 16 add/sub butterflies pairing x[i] with x[31-i],
// then +/-32 rotations on the x[40..47]/x[48..55] pairs.
static INLINE void idct64_stage10_avx2(__m256i *x, const int32_t *cospi,
                                       const __m256i _r, int8_t cos_bit) {
  // See idct64_stage4_high32_avx2 for the (void)cos_bit rationale.
  (void)cos_bit;
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  btf_16_adds_subs_avx2(&x[0], &x[31]);
  btf_16_adds_subs_avx2(&x[1], &x[30]);
  btf_16_adds_subs_avx2(&x[2], &x[29]);
  btf_16_adds_subs_avx2(&x[3], &x[28]);
  btf_16_adds_subs_avx2(&x[4], &x[27]);
  btf_16_adds_subs_avx2(&x[5], &x[26]);
  btf_16_adds_subs_avx2(&x[6], &x[25]);
  btf_16_adds_subs_avx2(&x[7], &x[24]);
  btf_16_adds_subs_avx2(&x[8], &x[23]);
  btf_16_adds_subs_avx2(&x[9], &x[22]);
  btf_16_adds_subs_avx2(&x[10], &x[21]);
  btf_16_adds_subs_avx2(&x[11], &x[20]);
  btf_16_adds_subs_avx2(&x[12], &x[19]);
  btf_16_adds_subs_avx2(&x[13], &x[18]);
  btf_16_adds_subs_avx2(&x[14], &x[17]);
  btf_16_adds_subs_avx2(&x[15], &x[16]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[40], &x[55], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[41], &x[54], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[42], &x[53], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[43], &x[52], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[44], &x[51], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[45], &x[50], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[46], &x[49], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[47], &x[48], _r, cos_bit);
}
1069 
// 64-point IDCT, final stage: output[i] = x[i] + x[63-i] and
// output[63-i] = x[i] - x[63-i] for i in [0, 31], written via the
// out-of-place add/sub macro.
static INLINE void idct64_stage11_avx2(__m256i *output, __m256i *x) {
  btf_16_adds_subs_out_avx2(&output[0], &output[63], x[0], x[63]);
  btf_16_adds_subs_out_avx2(&output[1], &output[62], x[1], x[62]);
  btf_16_adds_subs_out_avx2(&output[2], &output[61], x[2], x[61]);
  btf_16_adds_subs_out_avx2(&output[3], &output[60], x[3], x[60]);
  btf_16_adds_subs_out_avx2(&output[4], &output[59], x[4], x[59]);
  btf_16_adds_subs_out_avx2(&output[5], &output[58], x[5], x[58]);
  btf_16_adds_subs_out_avx2(&output[6], &output[57], x[6], x[57]);
  btf_16_adds_subs_out_avx2(&output[7], &output[56], x[7], x[56]);
  btf_16_adds_subs_out_avx2(&output[8], &output[55], x[8], x[55]);
  btf_16_adds_subs_out_avx2(&output[9], &output[54], x[9], x[54]);
  btf_16_adds_subs_out_avx2(&output[10], &output[53], x[10], x[53]);
  btf_16_adds_subs_out_avx2(&output[11], &output[52], x[11], x[52]);
  btf_16_adds_subs_out_avx2(&output[12], &output[51], x[12], x[51]);
  btf_16_adds_subs_out_avx2(&output[13], &output[50], x[13], x[50]);
  btf_16_adds_subs_out_avx2(&output[14], &output[49], x[14], x[49]);
  btf_16_adds_subs_out_avx2(&output[15], &output[48], x[15], x[48]);
  btf_16_adds_subs_out_avx2(&output[16], &output[47], x[16], x[47]);
  btf_16_adds_subs_out_avx2(&output[17], &output[46], x[17], x[46]);
  btf_16_adds_subs_out_avx2(&output[18], &output[45], x[18], x[45]);
  btf_16_adds_subs_out_avx2(&output[19], &output[44], x[19], x[44]);
  btf_16_adds_subs_out_avx2(&output[20], &output[43], x[20], x[43]);
  btf_16_adds_subs_out_avx2(&output[21], &output[42], x[21], x[42]);
  btf_16_adds_subs_out_avx2(&output[22], &output[41], x[22], x[41]);
  btf_16_adds_subs_out_avx2(&output[23], &output[40], x[23], x[40]);
  btf_16_adds_subs_out_avx2(&output[24], &output[39], x[24], x[39]);
  btf_16_adds_subs_out_avx2(&output[25], &output[38], x[25], x[38]);
  btf_16_adds_subs_out_avx2(&output[26], &output[37], x[26], x[37]);
  btf_16_adds_subs_out_avx2(&output[27], &output[36], x[27], x[36]);
  btf_16_adds_subs_out_avx2(&output[28], &output[35], x[28], x[35]);
  btf_16_adds_subs_out_avx2(&output[29], &output[34], x[29], x[34]);
  btf_16_adds_subs_out_avx2(&output[30], &output[33], x[30], x[33]);
  btf_16_adds_subs_out_avx2(&output[31], &output[32], x[31], x[32]);
}
1104 
// 64-point inverse DCT fast path for a DC-only input: only input[0] is read.
// All intermediate stages collapse to a single cospi[32] scaling, so the
// butterfly network is skipped entirely.
static void idct64_low1_avx2(const __m256i *input, __m256i *output,
                             int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);

  // stage 1
  // Only x[0]/x[1] are used; the array size mirrors the other kernels.
  __m256i x[32];
  x[0] = input[0];

  // stage 2
  // stage 3
  // stage 4
  // stage 5
  // stage 6: scale the DC value by cospi[32]. The same weight is applied to
  // both outputs, so x[0] and x[1] presumably hold the same value afterwards
  // — NOTE(review): depends on btf_16_w16_0_avx2 semantics; the alternating
  // x[0]/x[1] fan-out below relies on this.
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);

  // stage 7
  // stage 8
  // stage 9
  // stage 10
  // stage 11: broadcast the DC value to all 64 outputs, pairing output[i]
  // with its mirror output[63 - i] as the final butterfly stage would.
  output[0] = x[0];
  output[63] = x[0];
  output[1] = x[1];
  output[62] = x[1];
  output[2] = x[1];
  output[61] = x[1];
  output[3] = x[0];
  output[60] = x[0];
  output[4] = x[0];
  output[59] = x[0];
  output[5] = x[1];
  output[58] = x[1];
  output[6] = x[1];
  output[57] = x[1];
  output[7] = x[0];
  output[56] = x[0];
  output[8] = x[0];
  output[55] = x[0];
  output[9] = x[1];
  output[54] = x[1];
  output[10] = x[1];
  output[53] = x[1];
  output[11] = x[0];
  output[52] = x[0];
  output[12] = x[0];
  output[51] = x[0];
  output[13] = x[1];
  output[50] = x[1];
  output[14] = x[1];
  output[49] = x[1];
  output[15] = x[0];
  output[48] = x[0];
  output[16] = x[0];
  output[47] = x[0];
  output[17] = x[1];
  output[46] = x[1];
  output[18] = x[1];
  output[45] = x[1];
  output[19] = x[0];
  output[44] = x[0];
  output[20] = x[0];
  output[43] = x[0];
  output[21] = x[1];
  output[42] = x[1];
  output[22] = x[1];
  output[41] = x[1];
  output[23] = x[0];
  output[40] = x[0];
  output[24] = x[0];
  output[39] = x[0];
  output[25] = x[1];
  output[38] = x[1];
  output[26] = x[1];
  output[37] = x[1];
  output[27] = x[0];
  output[36] = x[0];
  output[28] = x[0];
  output[35] = x[0];
  output[29] = x[1];
  output[34] = x[1];
  output[30] = x[1];
  output[33] = x[1];
  output[31] = x[0];
  output[32] = x[0];
}
1191 
// 64-point inverse DCT for inputs where only the 8 lowest-frequency rows are
// nonzero: only input[0..7] are read. Early stages use single-input
// rotations and direct copies (the "partner" element of each butterfly is
// zero), then the shared stage-6..11 helpers finish the transform.
static void idct64_low8_avx2(const __m256i *input, __m256i *output,
                             int8_t cos_bit) {
  // See idct64_stage4_high32_avx2 for the (void)cos_bit rationale.
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // Rounding offset: 0.5 in Q(INV_COS_BIT) fixed point.
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
  const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
  const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
  const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
  const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
  const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
  const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
  const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
  const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);

  // stage 1: load the 8 coefficients in bit-reversed butterfly order.
  __m256i x[64];
  x[0] = input[0];
  x[8] = input[4];
  x[16] = input[2];
  x[24] = input[6];
  x[32] = input[1];
  x[40] = input[5];
  x[48] = input[3];
  x[56] = input[7];

  // stage 2: single-input rotations for the odd-index group x[32..63].
  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);

  // stage 3: rotations for x[16..31]; add/sub with a zero partner reduces to
  // a plain copy for the high half.
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
  x[33] = x[32];
  x[38] = x[39];
  x[41] = x[40];
  x[46] = x[47];
  x[49] = x[48];
  x[54] = x[55];
  x[57] = x[56];
  x[62] = x[63];

  // stage 4
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  x[17] = x[16];
  x[22] = x[23];
  x[25] = x[24];
  x[30] = x[31];
  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit);
  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit);
  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit);
  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit);

  // stage 5
  x[9] = x[8];
  x[14] = x[15];
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
  x[35] = x[32];
  x[34] = x[33];
  x[36] = x[39];
  x[37] = x[38];
  x[43] = x[40];
  x[42] = x[41];
  x[44] = x[47];
  x[45] = x[46];
  x[51] = x[48];
  x[50] = x[49];
  x[52] = x[55];
  x[53] = x[54];
  x[59] = x[56];
  x[58] = x[57];
  x[60] = x[63];
  x[61] = x[62];

  // stage 6
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
  x[19] = x[16];
  x[18] = x[17];
  x[20] = x[23];
  x[21] = x[22];
  x[27] = x[24];
  x[26] = x[25];
  x[28] = x[31];
  x[29] = x[30];
  idct64_stage6_high32_avx2(x, cospi, _r, cos_bit);

  // stage 7
  x[3] = x[0];
  x[2] = x[1];
  x[11] = x[8];
  x[10] = x[9];
  x[12] = x[15];
  x[13] = x[14];
  idct64_stage7_high48_avx2(x, cospi, _r, cos_bit);

  // stage 8
  x[7] = x[0];
  x[6] = x[1];
  x[5] = x[2];
  x[4] = x[3];
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
  idct64_stage8_high48_avx2(x, cospi, _r, cos_bit);

  // stages 9-11: shared with the other idct64 variants.
  idct64_stage9_avx2(x, cospi, _r, cos_bit);
  idct64_stage10_avx2(x, cospi, _r, cos_bit);
  idct64_stage11_avx2(output, x);
}
1311 
// 64-point inverse DCT for inputs where only the 16 lowest-frequency rows are
// nonzero: only input[0..15] are read. Stages 2-4 use single-input rotations
// and zero-partner copies; the shared stage helpers finish the transform.
static void idct64_low16_avx2(const __m256i *input, __m256i *output,
                              int8_t cos_bit) {
  // See idct64_stage4_high32_avx2 for the (void)cos_bit rationale.
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // Rounding offset: 0.5 in Q(INV_COS_BIT) fixed point.
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);

  // stage 1: load the 16 coefficients in bit-reversed butterfly order.
  __m256i x[64];
  x[0] = input[0];
  x[4] = input[8];
  x[8] = input[4];
  x[12] = input[12];
  x[16] = input[2];
  x[20] = input[10];
  x[24] = input[6];
  x[28] = input[14];
  x[32] = input[1];
  x[36] = input[9];
  x[40] = input[5];
  x[44] = input[13];
  x[48] = input[3];
  x[52] = input[11];
  x[56] = input[7];
  x[60] = input[15];

  // stage 2: single-input rotations for the odd-index group x[32..63].
  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
  btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
  btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
  btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);

  // stage 3: rotations for x[16..31]; zero-partner add/sub reduces to copies
  // for the high half.
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
  x[33] = x[32];
  x[34] = x[35];
  x[37] = x[36];
  x[38] = x[39];
  x[41] = x[40];
  x[42] = x[43];
  x[45] = x[44];
  x[46] = x[47];
  x[49] = x[48];
  x[50] = x[51];
  x[53] = x[52];
  x[54] = x[55];
  x[57] = x[56];
  x[58] = x[59];
  x[61] = x[60];
  x[62] = x[63];

  // stage 4
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  x[17] = x[16];
  x[18] = x[19];
  x[21] = x[20];
  x[22] = x[23];
  x[25] = x[24];
  x[26] = x[27];
  x[29] = x[28];
  x[30] = x[31];
  idct64_stage4_high32_avx2(x, cospi, _r, cos_bit);

  // stage 5
  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  x[9] = x[8];
  x[10] = x[11];
  x[13] = x[12];
  x[14] = x[15];
  idct64_stage5_high48_avx2(x, cospi, _r, cos_bit);

  // stage 6
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  x[5] = x[4];
  x[6] = x[7];
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
  idct64_stage6_high48_avx2(x, cospi, _r, cos_bit);

  // stage 7
  x[3] = x[0];
  x[2] = x[1];
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
  btf_16_adds_subs_avx2(&x[8], &x[11]);
  btf_16_adds_subs_avx2(&x[9], &x[10]);
  btf_16_adds_subs_avx2(&x[15], &x[12]);
  btf_16_adds_subs_avx2(&x[14], &x[13]);
  idct64_stage7_high48_avx2(x, cospi, _r, cos_bit);

  // stage 8
  btf_16_adds_subs_avx2(&x[0], &x[7]);
  btf_16_adds_subs_avx2(&x[1], &x[6]);
  btf_16_adds_subs_avx2(&x[2], &x[5]);
  btf_16_adds_subs_avx2(&x[3], &x[4]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
  idct64_stage8_high48_avx2(x, cospi, _r, cos_bit);

  // stages 9-11: shared with the other idct64 variants.
  idct64_stage9_avx2(x, cospi, _r, cos_bit);
  idct64_stage10_avx2(x, cospi, _r, cos_bit);
  idct64_stage11_avx2(output, x);
}
1427 
// 64-point inverse DCT for the case where only the lowest 32 input
// coefficients (input[0..31]) can be non-zero.  Each __m256i carries 16
// 16-bit pixels, so this transforms 16 columns at once; `output` receives
// all 64 result rows.  `cos_bit` is ignored: the fixed INV_COS_BIT cosine
// table is used throughout.
static void idct64_low32_avx2(const __m256i *input, __m256i *output,
                              int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // Rounding offset applied inside the butterfly multiplies before the
  // right shift by INV_COS_BIT.
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  // Frequently reused (even, odd) cosine coefficient pairs, packed so each
  // 32-bit lane holds two 16-bit multipliers for _mm256_madd_epi16-style use.
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);

  // stage 1
  // Scatter the 32 non-zero inputs into their butterfly slots (even x[]
  // entries only; odd entries are produced by later stages).
  __m256i x[64];
  x[0] = input[0];
  x[2] = input[16];
  x[4] = input[8];
  x[6] = input[24];
  x[8] = input[4];
  x[10] = input[20];
  x[12] = input[12];
  x[14] = input[28];
  x[16] = input[2];
  x[18] = input[18];
  x[20] = input[10];
  x[22] = input[26];
  x[24] = input[6];
  x[26] = input[22];
  x[28] = input[14];
  x[30] = input[30];
  x[32] = input[1];
  x[34] = input[17];
  x[36] = input[9];
  x[38] = input[25];
  x[40] = input[5];
  x[42] = input[21];
  x[44] = input[13];
  x[46] = input[29];
  x[48] = input[3];
  x[50] = input[19];
  x[52] = input[11];
  x[54] = input[27];
  x[56] = input[7];
  x[58] = input[23];
  x[60] = input[15];
  x[62] = input[31];

  // stage 2
  // Single-input butterflies (the paired input is zero), expanding the
  // odd-frequency half x[32..63].
  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
  btf_16_w16_0_avx2(-cospi[33], cospi[31], x[62], x[33], x[62]);
  btf_16_w16_0_avx2(cospi[47], cospi[17], x[34], x[34], x[61]);
  btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
  btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
  btf_16_w16_0_avx2(-cospi[41], cospi[23], x[58], x[37], x[58]);
  btf_16_w16_0_avx2(cospi[39], cospi[25], x[38], x[38], x[57]);
  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_w16_0_avx2(-cospi[37], cospi[27], x[54], x[41], x[54]);
  btf_16_w16_0_avx2(cospi[43], cospi[21], x[42], x[42], x[53]);
  btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
  btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
  btf_16_w16_0_avx2(-cospi[45], cospi[19], x[50], x[45], x[50]);
  btf_16_w16_0_avx2(cospi[35], cospi[29], x[46], x[46], x[49]);
  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);

  // stage 3
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
  btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
  btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
  btf_16_adds_subs_avx2(&x[32], &x[33]);
  btf_16_adds_subs_avx2(&x[35], &x[34]);
  btf_16_adds_subs_avx2(&x[36], &x[37]);
  btf_16_adds_subs_avx2(&x[39], &x[38]);
  btf_16_adds_subs_avx2(&x[40], &x[41]);
  btf_16_adds_subs_avx2(&x[43], &x[42]);
  btf_16_adds_subs_avx2(&x[44], &x[45]);
  btf_16_adds_subs_avx2(&x[47], &x[46]);
  btf_16_adds_subs_avx2(&x[48], &x[49]);
  btf_16_adds_subs_avx2(&x[51], &x[50]);
  btf_16_adds_subs_avx2(&x[52], &x[53]);
  btf_16_adds_subs_avx2(&x[55], &x[54]);
  btf_16_adds_subs_avx2(&x[56], &x[57]);
  btf_16_adds_subs_avx2(&x[59], &x[58]);
  btf_16_adds_subs_avx2(&x[60], &x[61]);
  btf_16_adds_subs_avx2(&x[63], &x[62]);

  // stage 4
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
  btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  btf_16_adds_subs_avx2(&x[16], &x[17]);
  btf_16_adds_subs_avx2(&x[19], &x[18]);
  btf_16_adds_subs_avx2(&x[20], &x[21]);
  btf_16_adds_subs_avx2(&x[23], &x[22]);
  btf_16_adds_subs_avx2(&x[24], &x[25]);
  btf_16_adds_subs_avx2(&x[27], &x[26]);
  btf_16_adds_subs_avx2(&x[28], &x[29]);
  btf_16_adds_subs_avx2(&x[31], &x[30]);
  // Stages 4-8 of the upper halves are shared with the other idct64_low*
  // variants via these helpers.
  idct64_stage4_high32_avx2(x, cospi, _r, cos_bit);

  // stage 5
  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
  btf_16_adds_subs_avx2(&x[8], &x[9]);
  btf_16_adds_subs_avx2(&x[11], &x[10]);
  btf_16_adds_subs_avx2(&x[12], &x[13]);
  btf_16_adds_subs_avx2(&x[15], &x[14]);
  idct64_stage5_high48_avx2(x, cospi, _r, cos_bit);

  // stage 6
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
  btf_16_adds_subs_avx2(&x[4], &x[5]);
  btf_16_adds_subs_avx2(&x[7], &x[6]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
  idct64_stage6_high48_avx2(x, cospi, _r, cos_bit);

  // stage 7
  btf_16_adds_subs_avx2(&x[0], &x[3]);
  btf_16_adds_subs_avx2(&x[1], &x[2]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
  btf_16_adds_subs_avx2(&x[8], &x[11]);
  btf_16_adds_subs_avx2(&x[9], &x[10]);
  btf_16_adds_subs_avx2(&x[15], &x[12]);
  btf_16_adds_subs_avx2(&x[14], &x[13]);
  idct64_stage7_high48_avx2(x, cospi, _r, cos_bit);

  // stage 8
  btf_16_adds_subs_avx2(&x[0], &x[7]);
  btf_16_adds_subs_avx2(&x[1], &x[6]);
  btf_16_adds_subs_avx2(&x[2], &x[5]);
  btf_16_adds_subs_avx2(&x[3], &x[4]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
  idct64_stage8_high48_avx2(x, cospi, _r, cos_bit);

  // stage 9~11
  idct64_stage9_avx2(x, cospi, _r, cos_bit);
  idct64_stage10_avx2(x, cospi, _r, cos_bit);
  idct64_stage11_avx2(output, x);
}
1576 
// Signature shared by all 16-wide 1-D inverse-transform kernels in this file.
typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output,
                                  int8_t cos_bit);

// 1D functions process 16 pixels at one time.
// Kernel table indexed as [tx size index][1-D transform type][eob bucket],
// where the last index comes from lowbd_txfm_all_1d_zeros_idx and selects a
// reduced kernel when most coefficients are zero.  NULL entries are
// size/type combinations not handled by the AVX2 path (e.g. 4- and 8-point
// transforms, which fall back to the SSSE3 implementation).
static const transform_1d_avx2
    lowbd_txfm_all_1d_zeros_w16_arr[TX_SIZES][ITX_TYPES_1D][4] = {
      {
          { NULL, NULL, NULL, NULL },
          { NULL, NULL, NULL, NULL },
          { NULL, NULL, NULL, NULL },
      },
      { { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } },
      {
          { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL },
          { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL },
          { NULL, NULL, NULL, NULL },
      },
      { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } },
      { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2,
          idct64_low32_avx2 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } }
    };
1604 
// only process w >= 16 h >= 16
// 2-D inverse transform (non-identity in both dimensions) + reconstruction
// add.  Row transforms are applied first on transposed 16x16 tiles, results
// are re-transposed into buf1 in column-major tile order, then column
// transforms are run and the clipped result is added into `output`.
// Only the tiles that can contain non-zero coefficients (derived from eob)
// are processed on the row pass.
static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  __m256i buf1[64 * 16];
  int eobx, eoby;
  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div16 = txfm_size_col >> 4;
  // Number of 16-wide/16-tall tiles that can hold non-zero coefficients.
  const int buf_size_nonzero_w_div16 = (eobx + 16) >> 4;
  const int buf_size_nonzero_h_div16 = (eoby + 16) >> 4;
  // Coefficient rows are stored at most 32 wide (higher frequencies are
  // always zeroed for 64-wide transforms).
  const int input_stride = AOMMIN(32, txfm_size_col);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  // Pick reduced kernels when eob says most coefficients are zero.
  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
  const transform_1d_avx2 row_txfm =
      lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
  const transform_1d_avx2 col_txfm =
      lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  // mulhrs by 1 << (15 + shift[0]) implements the (negative) row shift with
  // rounding.
  const __m256i scale0 = _mm256_set1_epi16(1 << (15 + shift[0]));
  for (int i = 0; i < buf_size_nonzero_h_div16; i++) {
    __m256i buf0[64];
    const int32_t *input_row = input + (i << 4) * input_stride;
    for (int j = 0; j < buf_size_nonzero_w_div16; ++j) {
      __m256i *buf0_cur = buf0 + j * 16;
      const int32_t *input_cur = input_row + j * 16;
      // Narrow 32-bit coefficients to 16-bit and transpose so rows become
      // vectors.
      load_buffer_32bit_to_16bit_w16_avx2(input_cur, input_stride, buf0_cur,
                                          16);
      transpose_16bit_16x16_avx2(buf0_cur, buf0_cur);
    }
    if (rect_type == 1 || rect_type == -1) {
      round_shift_avx2(buf0, buf0, input_stride);  // rect special code
    }
    row_txfm(buf0, buf0, cos_bit_row);
    for (int j = 0; j < txfm_size_col; ++j) {
      buf0[j] = _mm256_mulhrs_epi16(buf0[j], scale0);
    }

    // Transpose back into buf1, which is laid out as column tiles of height
    // txfm_size_row.  lr_flip reverses the horizontal tile order and the
    // lanes within each tile.
    __m256i *buf1_cur = buf1 + (i << 4);
    if (lr_flip) {
      for (int j = 0; j < buf_size_w_div16; ++j) {
        __m256i temp[16];
        flip_buf_avx2(buf0 + 16 * j, temp, 16);
        int offset = txfm_size_row * (buf_size_w_div16 - 1 - j);
        transpose_16bit_16x16_avx2(temp, buf1_cur + offset);
      }
    } else {
      for (int j = 0; j < buf_size_w_div16; ++j) {
        transpose_16bit_16x16_avx2(buf0 + 16 * j, buf1_cur + txfm_size_row * j);
      }
    }
  }
  // Column pass: transform each 16-wide column strip in place, then apply
  // the second-stage shift.
  const __m256i scale1 = _mm256_set1_epi16(1 << (15 + shift[1]));
  for (int i = 0; i < buf_size_w_div16; i++) {
    __m256i *buf1_cur = buf1 + i * txfm_size_row;
    col_txfm(buf1_cur, buf1_cur, cos_bit_col);
    for (int j = 0; j < txfm_size_row; ++j) {
      buf1_cur[j] = _mm256_mulhrs_epi16(buf1_cur[j], scale1);
    }
  }
  // Add residual to the prediction and store (ud_flip handled inside).
  for (int i = 0; i < buf_size_w_div16; i++) {
    lowbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row, output + 16 * i,
                                 stride, ud_flip, txfm_size_row);
  }
}
1682 
// Identity "row transform" for 16 columns x `height` rows: loads 32-bit
// coefficients as 16-bit, scales each by NewSqrt2list[txw_idx] and applies
// the row shift with rounding.  The multiply-and-round is done via
// _mm256_madd_epi16 on (value, 1) pairs against (scale, round) pairs.
// For rectangular blocks (rect_type == +/-1), values are pre-scaled by
// 1/sqrt(2) first.
static INLINE void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input,
                                           int stride, int shift, int height,
                                           int txw_idx, int rect_type) {
  const int32_t *input_row = input;
  const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txw_idx]);
  // Combined rounding constant: NewSqrt2Bits rounding plus the rounding for
  // the subsequent (NewSqrt2Bits - shift) right shift.
  const __m256i _r = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) +
                                       (1 << (NewSqrt2Bits - shift - 1)));
  const __m256i one = _mm256_set1_epi16(1);
  // Interleave so madd computes src * scale + 1 * _r per 32-bit lane.
  const __m256i scale__r = _mm256_unpacklo_epi16(scale, _r);
  if (rect_type != 1 && rect_type != -1) {
    for (int i = 0; i < height; ++i) {
      const __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
      input_row += stride;
      __m256i lo = _mm256_unpacklo_epi16(src, one);
      __m256i hi = _mm256_unpackhi_epi16(src, one);
      lo = _mm256_madd_epi16(lo, scale__r);
      hi = _mm256_madd_epi16(hi, scale__r);
      lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
      hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
      out[i] = _mm256_packs_epi32(lo, hi);
    }
  } else {
    // Rectangular block: fold the 1/sqrt(2) correction in with a rounding
    // high multiply before the identity scaling.
    const __m256i rect_scale =
        _mm256_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
    for (int i = 0; i < height; ++i) {
      __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
      src = _mm256_mulhrs_epi16(src, rect_scale);
      input_row += stride;
      __m256i lo = _mm256_unpacklo_epi16(src, one);
      __m256i hi = _mm256_unpackhi_epi16(src, one);
      lo = _mm256_madd_epi16(lo, scale__r);
      hi = _mm256_madd_epi16(hi, scale__r);
      lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
      hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
      out[i] = _mm256_packs_epi32(lo, hi);
    }
  }
}
1721 
// Identity "column transform" + reconstruction for 16 columns x `height`
// rows: scales buf[h] by NewSqrt2list[txh_idx] (rounded), applies the final
// shift (shift is negative, so -shift is a right shift), and adds the
// saturated result into the output pixels.
static INLINE void iidentity_col_16xn_avx2(uint8_t *output, int stride,
                                           __m256i *buf, int shift, int height,
                                           int txh_idx) {
  const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txh_idx]);
  const __m256i scale__r = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1));
  // Rounding offset for the final -shift right shift.
  const __m256i shift__r = _mm256_set1_epi32(1 << (-shift - 1));
  const __m256i one = _mm256_set1_epi16(1);
  // (scale, round) pairs so madd yields buf * scale + round per 32-bit lane.
  const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale__r);
  for (int h = 0; h < height; ++h) {
    __m256i lo = _mm256_unpacklo_epi16(buf[h], one);
    __m256i hi = _mm256_unpackhi_epi16(buf[h], one);
    lo = _mm256_madd_epi16(lo, scale_coeff);
    hi = _mm256_madd_epi16(hi, scale_coeff);
    lo = _mm256_srai_epi32(lo, NewSqrt2Bits);
    hi = _mm256_srai_epi32(hi, NewSqrt2Bits);
    lo = _mm256_add_epi32(lo, shift__r);
    hi = _mm256_add_epi32(hi, shift__r);
    lo = _mm256_srai_epi32(lo, -shift);
    hi = _mm256_srai_epi32(hi, -shift);
    const __m256i x = _mm256_packs_epi32(lo, hi);
    // Add residual to prediction with unsigned saturation and store.
    write_recon_w16_avx2(x, output);
    output += stride;
  }
}
1746 
lowbd_inv_txfm2d_add_idtx_avx2(const int32_t * input,uint8_t * output,int stride,TX_SIZE tx_size,int32_t eob)1747 static INLINE void lowbd_inv_txfm2d_add_idtx_avx2(const int32_t *input,
1748                                                   uint8_t *output, int stride,
1749                                                   TX_SIZE tx_size,
1750                                                   int32_t eob) {
1751   (void)eob;
1752   const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
1753   const int txw_idx = get_txw_idx(tx_size);
1754   const int txh_idx = get_txh_idx(tx_size);
1755   const int txfm_size_col = tx_size_wide[tx_size];
1756   const int txfm_size_row = tx_size_high[tx_size];
1757   const int input_stride = AOMMIN(32, txfm_size_col);
1758   const int row_max = AOMMIN(32, txfm_size_row);
1759   const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
1760   __m256i buf[32];
1761   for (int i = 0; i < input_stride; i += 16) {
1762     iidentity_row_16xn_avx2(buf, input + i, input_stride, shift[0], row_max,
1763                             txw_idx, rect_type);
1764     iidentity_col_16xn_avx2(output + i, stride, buf, shift[1], row_max,
1765                             txh_idx);
1766   }
1767 }
1768 
// 2-D inverse transform with identity in the horizontal direction and a
// real 1-D transform (V_DCT / V_ADST / V_FLIPADST) in the vertical
// direction.  Each 16-wide column strip is scaled by the identity row pass,
// column-transformed, shifted, and added into `output`.
static INLINE void lowbd_inv_txfm2d_add_h_identity_avx2(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  int eobx, eoby;
  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  // Coefficient rows are at most 32 wide; beyond that they are always zero.
  const int txfm_size_col_notzero = AOMMIN(32, txfm_size_col);
  const int input_stride = txfm_size_col_notzero;
  // Number of 16-wide strips with non-zero coefficients.
  const int buf_size_w_div16 = (eobx + 16) >> 4;
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  // Pick a reduced column kernel when eoby says most rows are zero.
  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
  const transform_1d_avx2 col_txfm =
      lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

  assert(col_txfm != NULL);

  // lr_flip is not used on this path (horizontal direction is identity);
  // only ud_flip affects the vertical write-out order.
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  for (int i = 0; i < buf_size_w_div16; i++) {
    __m256i buf0[64];
    // Identity scaling of the non-zero rows of this 16-wide strip.
    iidentity_row_16xn_avx2(buf0, input + (i << 4), input_stride, shift[0],
                            eoby + 1, txw_idx, rect_type);
    col_txfm(buf0, buf0, cos_bit_col);
    // mulhrs by 1 << (15 + shift[1]) applies the (negative) column shift
    // with rounding.
    __m256i mshift = _mm256_set1_epi16(1 << (15 + shift[1]));
    int k = ud_flip ? (txfm_size_row - 1) : 0;
    const int step = ud_flip ? -1 : 1;
    for (int j = 0; j < txfm_size_row; ++j, k += step) {
      __m256i res = _mm256_mulhrs_epi16(buf0[k], mshift);
      write_recon_w16_avx2(res, output + (i << 4) + j * stride);
    }
  }
}
1807 
// 2-D inverse transform with a real 1-D transform (H_DCT / H_ADST /
// H_FLIPADST) in the horizontal direction and identity in the vertical
// direction.  Rows are transformed on transposed 16x16 tiles, transposed
// back, then the identity column pass scales and adds into `output`.
static INLINE void lowbd_inv_txfm2d_add_v_identity_avx2(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  __m256i buf1[64];
  int eobx, eoby;
  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div16 = txfm_size_col >> 4;
  // Number of 16-row bands containing non-zero coefficients.
  const int buf_size_h_div16 = (eoby + 16) >> 4;
  // Coefficient rows are at most 32 wide; beyond that they are always zero.
  const int input_stride = AOMMIN(32, txfm_size_col);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  // Pick a reduced row kernel when eobx says most columns are zero.
  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
  const transform_1d_avx2 row_txfm =
      lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];

  assert(row_txfm != NULL);

  // ud_flip is not used on this path (vertical direction is identity);
  // only lr_flip affects the horizontal tile order.
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  for (int i = 0; i < buf_size_h_div16; i++) {
    __m256i buf0[64];
    const int32_t *input_row = input + i * input_stride * 16;
    // Load at most 4 tiles (<= 64 columns of 16-bit data) and transpose so
    // rows become vectors for the 1-D row transform.
    for (int j = 0; j < AOMMIN(4, buf_size_w_div16); ++j) {
      __m256i *buf0_cur = buf0 + j * 16;
      load_buffer_32bit_to_16bit_w16_avx2(input_row + j * 16, input_stride,
                                          buf0_cur, 16);
      transpose_16bit_16x16_avx2(buf0_cur, buf0_cur);
    }
    if (rect_type == 1 || rect_type == -1) {
      round_shift_avx2(buf0, buf0, input_stride);  // rect special code
    }
    row_txfm(buf0, buf0, cos_bit_row);
    round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]);
    __m256i *_buf1 = buf1;
    // Transpose back; lr_flip reverses both the tile order and the lanes
    // within each tile.
    if (lr_flip) {
      for (int j = 0; j < buf_size_w_div16; ++j) {
        __m256i temp[16];
        flip_buf_avx2(buf0 + 16 * j, temp, 16);
        transpose_16bit_16x16_avx2(temp,
                                   _buf1 + 16 * (buf_size_w_div16 - 1 - j));
      }
    } else {
      for (int j = 0; j < buf_size_w_div16; ++j) {
        transpose_16bit_16x16_avx2(buf0 + 16 * j, _buf1 + 16 * j);
      }
    }
    // Identity column pass: scale and add each tile into the output band.
    for (int j = 0; j < buf_size_w_div16; ++j) {
      iidentity_col_16xn_avx2(output + i * 16 * stride + j * 16, stride,
                              buf1 + j * 16, shift[1], 16, txh_idx);
    }
  }
}
1866 
1867 // for 32x32,32x64,64x32,64x64,16x32,32x16,64x16,16x64
lowbd_inv_txfm2d_add_universe_avx2(const int32_t * input,uint8_t * output,int stride,TX_TYPE tx_type,TX_SIZE tx_size,int eob)1868 static INLINE void lowbd_inv_txfm2d_add_universe_avx2(
1869     const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
1870     TX_SIZE tx_size, int eob) {
1871   (void)eob;
1872   switch (tx_type) {
1873     case DCT_DCT:
1874     case ADST_DCT:   // ADST in vertical, DCT in horizontal
1875     case DCT_ADST:   // DCT  in vertical, ADST in horizontal
1876     case ADST_ADST:  // ADST in both directions
1877     case FLIPADST_DCT:
1878     case DCT_FLIPADST:
1879     case FLIPADST_FLIPADST:
1880     case ADST_FLIPADST:
1881     case FLIPADST_ADST:
1882       lowbd_inv_txfm2d_add_no_identity_avx2(input, output, stride, tx_type,
1883                                             tx_size, eob);
1884       break;
1885     case IDTX:
1886       lowbd_inv_txfm2d_add_idtx_avx2(input, output, stride, tx_size, eob);
1887       break;
1888     case V_DCT:
1889     case V_ADST:
1890     case V_FLIPADST:
1891       lowbd_inv_txfm2d_add_h_identity_avx2(input, output, stride, tx_type,
1892                                            tx_size, eob);
1893       break;
1894     case H_DCT:
1895     case H_ADST:
1896     case H_FLIPADST:
1897       lowbd_inv_txfm2d_add_v_identity_avx2(input, output, stride, tx_type,
1898                                            tx_size, eob);
1899       break;
1900     default:
1901       av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size,
1902                                      eob);
1903       break;
1904   }
1905 }
1906 
// Top-level low-bitdepth 2-D inverse transform + add.  Sizes with a
// dimension below 16 go to the 8-wide SSSE3 path; all remaining sizes
// (every dimension >= 16) use the 16-wide AVX2 kernels.
void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output,
                                   int stride, TX_TYPE tx_type, TX_SIZE tx_size,
                                   int eob) {
  switch (tx_size) {
    case TX_4X4:
    case TX_8X8:
    case TX_4X8:
    case TX_8X4:
    case TX_8X16:
    case TX_16X8:
    case TX_4X16:
    case TX_16X4:
    case TX_8X32:
    case TX_32X8:
      av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size,
                                     eob);
      break;
    default:
      // TX_16X16 and larger in both dimensions.
      lowbd_inv_txfm2d_add_universe_avx2(input, output, stride, tx_type,
                                         tx_size, eob);
      break;
  }
}
1939 
av1_inv_txfm_add_avx2(const tran_low_t * dqcoeff,uint8_t * dst,int stride,const TxfmParam * txfm_param)1940 void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
1941                            const TxfmParam *txfm_param) {
1942   const TX_TYPE tx_type = txfm_param->tx_type;
1943   if (!txfm_param->lossless) {
1944     av1_lowbd_inv_txfm2d_add_avx2(dqcoeff, dst, stride, tx_type,
1945                                   txfm_param->tx_size, txfm_param->eob);
1946   } else {
1947     av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
1948   }
1949 }
1950