1 /*
2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include "config/av1_rtcd.h"
13
14 #include "av1/common/enums.h"
15 #include "av1/common/av1_txfm.h"
16 #include "av1/encoder/x86/av1_fwd_txfm_avx2.h"
17 #include "av1/common/x86/av1_txfm_sse2.h"
18 #include "av1/encoder/av1_fwd_txfm1d_cfg.h"
19 #include "av1/encoder/x86/av1_txfm1d_sse4.h"
20 #include "av1/encoder/x86/av1_fwd_txfm_sse2.h"
21 #include "aom_dsp/x86/txfm_common_avx2.h"
22
fdct16x16_new_avx2(const __m256i * input,__m256i * output,int8_t cos_bit)23 static INLINE void fdct16x16_new_avx2(const __m256i *input, __m256i *output,
24 int8_t cos_bit) {
25 const int32_t *cospi = cospi_arr(cos_bit);
26 const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
27
28 __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
29 __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
30 __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
31 __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
32 __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
33 __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
34 __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
35 __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
36 __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
37 __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
38 __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
39 __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
40 __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
41 __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
42 __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
43 __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
44 __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
45 __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
46
47 // stage 1
48 __m256i x1[16];
49 btf_16_adds_subs_out_avx2(&x1[0], &x1[15], input[0], input[15]);
50 btf_16_adds_subs_out_avx2(&x1[1], &x1[14], input[1], input[14]);
51 btf_16_adds_subs_out_avx2(&x1[2], &x1[13], input[2], input[13]);
52 btf_16_adds_subs_out_avx2(&x1[3], &x1[12], input[3], input[12]);
53 btf_16_adds_subs_out_avx2(&x1[4], &x1[11], input[4], input[11]);
54 btf_16_adds_subs_out_avx2(&x1[5], &x1[10], input[5], input[10]);
55 btf_16_adds_subs_out_avx2(&x1[6], &x1[9], input[6], input[9]);
56 btf_16_adds_subs_out_avx2(&x1[7], &x1[8], input[7], input[8]);
57
58 // stage 2
59 btf_16_adds_subs_avx2(&x1[0], &x1[7]);
60 btf_16_adds_subs_avx2(&x1[1], &x1[6]);
61 btf_16_adds_subs_avx2(&x1[2], &x1[5]);
62 btf_16_adds_subs_avx2(&x1[3], &x1[4]);
63 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
64 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
65
66 // stage 3
67 btf_16_adds_subs_avx2(&x1[0], &x1[3]);
68 btf_16_adds_subs_avx2(&x1[1], &x1[2]);
69 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
70 btf_16_adds_subs_avx2(&x1[8], &x1[11]);
71 btf_16_adds_subs_avx2(&x1[9], &x1[10]);
72 btf_16_adds_subs_avx2(&x1[15], &x1[12]);
73 btf_16_adds_subs_avx2(&x1[14], &x1[13]);
74
75 // stage 4
76 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
77 btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
78 btf_16_adds_subs_avx2(&x1[4], &x1[5]);
79 btf_16_adds_subs_avx2(&x1[7], &x1[6]);
80 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
81 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
82
83 // stage 5
84 btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
85 btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
86 btf_16_adds_subs_avx2(&x1[8], &x1[9]);
87 btf_16_adds_subs_avx2(&x1[11], &x1[10]);
88 btf_16_adds_subs_avx2(&x1[12], &x1[13]);
89 btf_16_adds_subs_avx2(&x1[15], &x1[14]);
90
91 // stage 6
92 btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
93 btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
94 btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
95 btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
96
97 // stage 7
98 output[0] = x1[0];
99 output[1] = x1[8];
100 output[2] = x1[4];
101 output[3] = x1[12];
102 output[4] = x1[2];
103 output[5] = x1[10];
104 output[6] = x1[6];
105 output[7] = x1[14];
106 output[8] = x1[1];
107 output[9] = x1[9];
108 output[10] = x1[5];
109 output[11] = x1[13];
110 output[12] = x1[3];
111 output[13] = x1[11];
112 output[14] = x1[7];
113 output[15] = x1[15];
114 }
115
fdct16x32_new_avx2(const __m256i * input,__m256i * output,int8_t cos_bit)116 static INLINE void fdct16x32_new_avx2(const __m256i *input, __m256i *output,
117 int8_t cos_bit) {
118 const int32_t *cospi = cospi_arr(cos_bit);
119 const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
120
121 __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
122 __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
123 __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
124 __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
125 __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
126 __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
127 __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
128 __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
129 __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
130 __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
131 __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
132 __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
133 __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
134 __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
135 __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
136 __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
137 __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
138 __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
139 __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
140 __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
141 __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]);
142 __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]);
143 __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]);
144 __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]);
145 __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]);
146 __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]);
147 __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]);
148 __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]);
149 __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]);
150 __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]);
151 __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]);
152 __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]);
153 __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]);
154 __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]);
155 __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]);
156 __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]);
157
158 // stage 1
159 __m256i x1[32];
160 btf_16_adds_subs_out_avx2(&x1[0], &x1[31], input[0], input[31]);
161 btf_16_adds_subs_out_avx2(&x1[1], &x1[30], input[1], input[30]);
162 btf_16_adds_subs_out_avx2(&x1[2], &x1[29], input[2], input[29]);
163 btf_16_adds_subs_out_avx2(&x1[3], &x1[28], input[3], input[28]);
164 btf_16_adds_subs_out_avx2(&x1[4], &x1[27], input[4], input[27]);
165 btf_16_adds_subs_out_avx2(&x1[5], &x1[26], input[5], input[26]);
166 btf_16_adds_subs_out_avx2(&x1[6], &x1[25], input[6], input[25]);
167 btf_16_adds_subs_out_avx2(&x1[7], &x1[24], input[7], input[24]);
168 btf_16_adds_subs_out_avx2(&x1[8], &x1[23], input[8], input[23]);
169 btf_16_adds_subs_out_avx2(&x1[9], &x1[22], input[9], input[22]);
170 btf_16_adds_subs_out_avx2(&x1[10], &x1[21], input[10], input[21]);
171 btf_16_adds_subs_out_avx2(&x1[11], &x1[20], input[11], input[20]);
172 btf_16_adds_subs_out_avx2(&x1[12], &x1[19], input[12], input[19]);
173 btf_16_adds_subs_out_avx2(&x1[13], &x1[18], input[13], input[18]);
174 btf_16_adds_subs_out_avx2(&x1[14], &x1[17], input[14], input[17]);
175 btf_16_adds_subs_out_avx2(&x1[15], &x1[16], input[15], input[16]);
176
177 // stage 2
178 btf_16_adds_subs_avx2(&x1[0], &x1[15]);
179 btf_16_adds_subs_avx2(&x1[1], &x1[14]);
180 btf_16_adds_subs_avx2(&x1[2], &x1[13]);
181 btf_16_adds_subs_avx2(&x1[3], &x1[12]);
182 btf_16_adds_subs_avx2(&x1[4], &x1[11]);
183 btf_16_adds_subs_avx2(&x1[5], &x1[10]);
184 btf_16_adds_subs_avx2(&x1[6], &x1[9]);
185 btf_16_adds_subs_avx2(&x1[7], &x1[8]);
186 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit);
187 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit);
188 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit);
189 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit);
190
191 // stage 3
192 btf_16_adds_subs_avx2(&x1[0], &x1[7]);
193 btf_16_adds_subs_avx2(&x1[1], &x1[6]);
194 btf_16_adds_subs_avx2(&x1[2], &x1[5]);
195 btf_16_adds_subs_avx2(&x1[3], &x1[4]);
196 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
197 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
198 btf_16_adds_subs_avx2(&x1[16], &x1[23]);
199 btf_16_adds_subs_avx2(&x1[17], &x1[22]);
200 btf_16_adds_subs_avx2(&x1[18], &x1[21]);
201 btf_16_adds_subs_avx2(&x1[19], &x1[20]);
202 btf_16_adds_subs_avx2(&x1[31], &x1[24]);
203 btf_16_adds_subs_avx2(&x1[30], &x1[25]);
204 btf_16_adds_subs_avx2(&x1[29], &x1[26]);
205 btf_16_adds_subs_avx2(&x1[28], &x1[27]);
206
207 // stage 4
208 btf_16_adds_subs_avx2(&x1[0], &x1[3]);
209 btf_16_adds_subs_avx2(&x1[1], &x1[2]);
210 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
211 btf_16_adds_subs_avx2(&x1[8], &x1[11]);
212 btf_16_adds_subs_avx2(&x1[9], &x1[10]);
213 btf_16_adds_subs_avx2(&x1[15], &x1[12]);
214 btf_16_adds_subs_avx2(&x1[14], &x1[13]);
215 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit);
216 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit);
217 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit);
218 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit);
219
220 // stage 5
221 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
222 btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
223 btf_16_adds_subs_avx2(&x1[4], &x1[5]);
224 btf_16_adds_subs_avx2(&x1[7], &x1[6]);
225 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
226 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
227 btf_16_adds_subs_avx2(&x1[16], &x1[19]);
228 btf_16_adds_subs_avx2(&x1[17], &x1[18]);
229 btf_16_adds_subs_avx2(&x1[23], &x1[20]);
230 btf_16_adds_subs_avx2(&x1[22], &x1[21]);
231 btf_16_adds_subs_avx2(&x1[24], &x1[27]);
232 btf_16_adds_subs_avx2(&x1[25], &x1[26]);
233 btf_16_adds_subs_avx2(&x1[31], &x1[28]);
234 btf_16_adds_subs_avx2(&x1[30], &x1[29]);
235
236 // stage 6
237 btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
238 btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
239 btf_16_adds_subs_avx2(&x1[8], &x1[9]);
240 btf_16_adds_subs_avx2(&x1[11], &x1[10]);
241 btf_16_adds_subs_avx2(&x1[12], &x1[13]);
242 btf_16_adds_subs_avx2(&x1[15], &x1[14]);
243 btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit);
244 btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit);
245 btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit);
246 btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit);
247
248 // stage 7
249 btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
250 btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
251 btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
252 btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
253 btf_16_adds_subs_avx2(&x1[16], &x1[17]);
254 btf_16_adds_subs_avx2(&x1[19], &x1[18]);
255 btf_16_adds_subs_avx2(&x1[20], &x1[21]);
256 btf_16_adds_subs_avx2(&x1[23], &x1[22]);
257 btf_16_adds_subs_avx2(&x1[24], &x1[25]);
258 btf_16_adds_subs_avx2(&x1[27], &x1[26]);
259 btf_16_adds_subs_avx2(&x1[28], &x1[29]);
260 btf_16_adds_subs_avx2(&x1[31], &x1[30]);
261
262 // stage 8
263 btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit);
264 btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit);
265 btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit);
266 btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit);
267 btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit);
268 btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit);
269 btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit);
270 btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit);
271
272 // stage 9
273 output[0] = x1[0];
274 output[1] = x1[16];
275 output[2] = x1[8];
276 output[3] = x1[24];
277 output[4] = x1[4];
278 output[5] = x1[20];
279 output[6] = x1[12];
280 output[7] = x1[28];
281 output[8] = x1[2];
282 output[9] = x1[18];
283 output[10] = x1[10];
284 output[11] = x1[26];
285 output[12] = x1[6];
286 output[13] = x1[22];
287 output[14] = x1[14];
288 output[15] = x1[30];
289 output[16] = x1[1];
290 output[17] = x1[17];
291 output[18] = x1[9];
292 output[19] = x1[25];
293 output[20] = x1[5];
294 output[21] = x1[21];
295 output[22] = x1[13];
296 output[23] = x1[29];
297 output[24] = x1[3];
298 output[25] = x1[19];
299 output[26] = x1[11];
300 output[27] = x1[27];
301 output[28] = x1[7];
302 output[29] = x1[23];
303 output[30] = x1[15];
304 output[31] = x1[31];
305 }
306
fdct16x64_new_avx2(const __m256i * input,__m256i * output,int8_t cos_bit)307 static INLINE void fdct16x64_new_avx2(const __m256i *input, __m256i *output,
308 int8_t cos_bit) {
309 const int32_t *cospi = cospi_arr(cos_bit);
310 const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
311
312 __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
313 __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
314 __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
315 __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
316 __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
317 __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
318 __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
319 __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
320 __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
321 __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
322 __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
323 __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
324 __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
325 __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
326 __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
327 __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
328 __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
329 __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
330 __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
331 __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
332 __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]);
333 __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
334 __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]);
335 __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
336 __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]);
337 __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]);
338 __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]);
339 __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]);
340 __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]);
341 __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]);
342 __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]);
343 __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]);
344 __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]);
345 __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]);
346 __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]);
347 __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]);
348 __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]);
349 __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]);
350 __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]);
351 __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]);
352 __m256i cospi_p63_p01 = pair_set_w16_epi16(cospi[63], cospi[1]);
353 __m256i cospi_m01_p63 = pair_set_w16_epi16(-cospi[1], cospi[63]);
354 __m256i cospi_p31_p33 = pair_set_w16_epi16(cospi[31], cospi[33]);
355 __m256i cospi_m33_p31 = pair_set_w16_epi16(-cospi[33], cospi[31]);
356 __m256i cospi_p47_p17 = pair_set_w16_epi16(cospi[47], cospi[17]);
357 __m256i cospi_m17_p47 = pair_set_w16_epi16(-cospi[17], cospi[47]);
358 __m256i cospi_p15_p49 = pair_set_w16_epi16(cospi[15], cospi[49]);
359 __m256i cospi_m49_p15 = pair_set_w16_epi16(-cospi[49], cospi[15]);
360 __m256i cospi_p55_p09 = pair_set_w16_epi16(cospi[55], cospi[9]);
361 __m256i cospi_m09_p55 = pair_set_w16_epi16(-cospi[9], cospi[55]);
362 __m256i cospi_p23_p41 = pair_set_w16_epi16(cospi[23], cospi[41]);
363 __m256i cospi_m41_p23 = pair_set_w16_epi16(-cospi[41], cospi[23]);
364 __m256i cospi_p39_p25 = pair_set_w16_epi16(cospi[39], cospi[25]);
365 __m256i cospi_m25_p39 = pair_set_w16_epi16(-cospi[25], cospi[39]);
366 __m256i cospi_p07_p57 = pair_set_w16_epi16(cospi[7], cospi[57]);
367 __m256i cospi_m57_p07 = pair_set_w16_epi16(-cospi[57], cospi[7]);
368 __m256i cospi_p59_p05 = pair_set_w16_epi16(cospi[59], cospi[5]);
369 __m256i cospi_m05_p59 = pair_set_w16_epi16(-cospi[5], cospi[59]);
370 __m256i cospi_p27_p37 = pair_set_w16_epi16(cospi[27], cospi[37]);
371 __m256i cospi_m37_p27 = pair_set_w16_epi16(-cospi[37], cospi[27]);
372 __m256i cospi_p43_p21 = pair_set_w16_epi16(cospi[43], cospi[21]);
373 __m256i cospi_m21_p43 = pair_set_w16_epi16(-cospi[21], cospi[43]);
374 __m256i cospi_p11_p53 = pair_set_w16_epi16(cospi[11], cospi[53]);
375 __m256i cospi_m53_p11 = pair_set_w16_epi16(-cospi[53], cospi[11]);
376 __m256i cospi_p51_p13 = pair_set_w16_epi16(cospi[51], cospi[13]);
377 __m256i cospi_m13_p51 = pair_set_w16_epi16(-cospi[13], cospi[51]);
378 __m256i cospi_p19_p45 = pair_set_w16_epi16(cospi[19], cospi[45]);
379 __m256i cospi_m45_p19 = pair_set_w16_epi16(-cospi[45], cospi[19]);
380 __m256i cospi_p35_p29 = pair_set_w16_epi16(cospi[35], cospi[29]);
381 __m256i cospi_m29_p35 = pair_set_w16_epi16(-cospi[29], cospi[35]);
382 __m256i cospi_p03_p61 = pair_set_w16_epi16(cospi[3], cospi[61]);
383 __m256i cospi_m61_p03 = pair_set_w16_epi16(-cospi[61], cospi[3]);
384
385 // stage 1
386 __m256i x1[64];
387 btf_16_adds_subs_out_avx2(&x1[0], &x1[63], input[0], input[63]);
388 btf_16_adds_subs_out_avx2(&x1[1], &x1[62], input[1], input[62]);
389 btf_16_adds_subs_out_avx2(&x1[2], &x1[61], input[2], input[61]);
390 btf_16_adds_subs_out_avx2(&x1[3], &x1[60], input[3], input[60]);
391 btf_16_adds_subs_out_avx2(&x1[4], &x1[59], input[4], input[59]);
392 btf_16_adds_subs_out_avx2(&x1[5], &x1[58], input[5], input[58]);
393 btf_16_adds_subs_out_avx2(&x1[6], &x1[57], input[6], input[57]);
394 btf_16_adds_subs_out_avx2(&x1[7], &x1[56], input[7], input[56]);
395 btf_16_adds_subs_out_avx2(&x1[8], &x1[55], input[8], input[55]);
396 btf_16_adds_subs_out_avx2(&x1[9], &x1[54], input[9], input[54]);
397 btf_16_adds_subs_out_avx2(&x1[10], &x1[53], input[10], input[53]);
398 btf_16_adds_subs_out_avx2(&x1[11], &x1[52], input[11], input[52]);
399 btf_16_adds_subs_out_avx2(&x1[12], &x1[51], input[12], input[51]);
400 btf_16_adds_subs_out_avx2(&x1[13], &x1[50], input[13], input[50]);
401 btf_16_adds_subs_out_avx2(&x1[14], &x1[49], input[14], input[49]);
402 btf_16_adds_subs_out_avx2(&x1[15], &x1[48], input[15], input[48]);
403 btf_16_adds_subs_out_avx2(&x1[16], &x1[47], input[16], input[47]);
404 btf_16_adds_subs_out_avx2(&x1[17], &x1[46], input[17], input[46]);
405 btf_16_adds_subs_out_avx2(&x1[18], &x1[45], input[18], input[45]);
406 btf_16_adds_subs_out_avx2(&x1[19], &x1[44], input[19], input[44]);
407 btf_16_adds_subs_out_avx2(&x1[20], &x1[43], input[20], input[43]);
408 btf_16_adds_subs_out_avx2(&x1[21], &x1[42], input[21], input[42]);
409 btf_16_adds_subs_out_avx2(&x1[22], &x1[41], input[22], input[41]);
410 btf_16_adds_subs_out_avx2(&x1[23], &x1[40], input[23], input[40]);
411 btf_16_adds_subs_out_avx2(&x1[24], &x1[39], input[24], input[39]);
412 btf_16_adds_subs_out_avx2(&x1[25], &x1[38], input[25], input[38]);
413 btf_16_adds_subs_out_avx2(&x1[26], &x1[37], input[26], input[37]);
414 btf_16_adds_subs_out_avx2(&x1[27], &x1[36], input[27], input[36]);
415 btf_16_adds_subs_out_avx2(&x1[28], &x1[35], input[28], input[35]);
416 btf_16_adds_subs_out_avx2(&x1[29], &x1[34], input[29], input[34]);
417 btf_16_adds_subs_out_avx2(&x1[30], &x1[33], input[30], input[33]);
418 btf_16_adds_subs_out_avx2(&x1[31], &x1[32], input[31], input[32]);
419
420 // stage 2
421 btf_16_adds_subs_avx2(&x1[0], &x1[31]);
422 btf_16_adds_subs_avx2(&x1[1], &x1[30]);
423 btf_16_adds_subs_avx2(&x1[2], &x1[29]);
424 btf_16_adds_subs_avx2(&x1[3], &x1[28]);
425 btf_16_adds_subs_avx2(&x1[4], &x1[27]);
426 btf_16_adds_subs_avx2(&x1[5], &x1[26]);
427 btf_16_adds_subs_avx2(&x1[6], &x1[25]);
428 btf_16_adds_subs_avx2(&x1[7], &x1[24]);
429 btf_16_adds_subs_avx2(&x1[8], &x1[23]);
430 btf_16_adds_subs_avx2(&x1[9], &x1[22]);
431 btf_16_adds_subs_avx2(&x1[10], &x1[21]);
432 btf_16_adds_subs_avx2(&x1[11], &x1[20]);
433 btf_16_adds_subs_avx2(&x1[12], &x1[19]);
434 btf_16_adds_subs_avx2(&x1[13], &x1[18]);
435 btf_16_adds_subs_avx2(&x1[14], &x1[17]);
436 btf_16_adds_subs_avx2(&x1[15], &x1[16]);
437 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[40], &x1[55], _r, cos_bit);
438 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[41], &x1[54], _r, cos_bit);
439 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[42], &x1[53], _r, cos_bit);
440 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[43], &x1[52], _r, cos_bit);
441 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[44], &x1[51], _r, cos_bit);
442 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[45], &x1[50], _r, cos_bit);
443 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[46], &x1[49], _r, cos_bit);
444 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[47], &x1[48], _r, cos_bit);
445
446 // stage 3
447 btf_16_adds_subs_avx2(&x1[0], &x1[15]);
448 btf_16_adds_subs_avx2(&x1[1], &x1[14]);
449 btf_16_adds_subs_avx2(&x1[2], &x1[13]);
450 btf_16_adds_subs_avx2(&x1[3], &x1[12]);
451 btf_16_adds_subs_avx2(&x1[4], &x1[11]);
452 btf_16_adds_subs_avx2(&x1[5], &x1[10]);
453 btf_16_adds_subs_avx2(&x1[6], &x1[9]);
454 btf_16_adds_subs_avx2(&x1[7], &x1[8]);
455 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit);
456 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit);
457 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit);
458 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit);
459 btf_16_adds_subs_avx2(&x1[32], &x1[47]);
460 btf_16_adds_subs_avx2(&x1[33], &x1[46]);
461 btf_16_adds_subs_avx2(&x1[34], &x1[45]);
462 btf_16_adds_subs_avx2(&x1[35], &x1[44]);
463 btf_16_adds_subs_avx2(&x1[36], &x1[43]);
464 btf_16_adds_subs_avx2(&x1[37], &x1[42]);
465 btf_16_adds_subs_avx2(&x1[38], &x1[41]);
466 btf_16_adds_subs_avx2(&x1[39], &x1[40]);
467 btf_16_adds_subs_avx2(&x1[63], &x1[48]);
468 btf_16_adds_subs_avx2(&x1[62], &x1[49]);
469 btf_16_adds_subs_avx2(&x1[61], &x1[50]);
470 btf_16_adds_subs_avx2(&x1[60], &x1[51]);
471 btf_16_adds_subs_avx2(&x1[59], &x1[52]);
472 btf_16_adds_subs_avx2(&x1[58], &x1[53]);
473 btf_16_adds_subs_avx2(&x1[57], &x1[54]);
474 btf_16_adds_subs_avx2(&x1[56], &x1[55]);
475
476 // stage 4
477 btf_16_adds_subs_avx2(&x1[0], &x1[7]);
478 btf_16_adds_subs_avx2(&x1[1], &x1[6]);
479 btf_16_adds_subs_avx2(&x1[2], &x1[5]);
480 btf_16_adds_subs_avx2(&x1[3], &x1[4]);
481 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
482 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
483 btf_16_adds_subs_avx2(&x1[16], &x1[23]);
484 btf_16_adds_subs_avx2(&x1[17], &x1[22]);
485 btf_16_adds_subs_avx2(&x1[18], &x1[21]);
486 btf_16_adds_subs_avx2(&x1[19], &x1[20]);
487 btf_16_adds_subs_avx2(&x1[31], &x1[24]);
488 btf_16_adds_subs_avx2(&x1[30], &x1[25]);
489 btf_16_adds_subs_avx2(&x1[29], &x1[26]);
490 btf_16_adds_subs_avx2(&x1[28], &x1[27]);
491 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[36], &x1[59], _r, cos_bit);
492 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[37], &x1[58], _r, cos_bit);
493 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[38], &x1[57], _r, cos_bit);
494 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[39], &x1[56], _r, cos_bit);
495 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[40], &x1[55], _r, cos_bit);
496 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[41], &x1[54], _r, cos_bit);
497 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[42], &x1[53], _r, cos_bit);
498 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[43], &x1[52], _r, cos_bit);
499
500 // stage 5
501 btf_16_adds_subs_avx2(&x1[0], &x1[3]);
502 btf_16_adds_subs_avx2(&x1[1], &x1[2]);
503 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
504 btf_16_adds_subs_avx2(&x1[8], &x1[11]);
505 btf_16_adds_subs_avx2(&x1[9], &x1[10]);
506 btf_16_adds_subs_avx2(&x1[15], &x1[12]);
507 btf_16_adds_subs_avx2(&x1[14], &x1[13]);
508 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit);
509 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit);
510 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit);
511 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit);
512 btf_16_adds_subs_avx2(&x1[32], &x1[39]);
513 btf_16_adds_subs_avx2(&x1[33], &x1[38]);
514 btf_16_adds_subs_avx2(&x1[34], &x1[37]);
515 btf_16_adds_subs_avx2(&x1[35], &x1[36]);
516 btf_16_adds_subs_avx2(&x1[47], &x1[40]);
517 btf_16_adds_subs_avx2(&x1[46], &x1[41]);
518 btf_16_adds_subs_avx2(&x1[45], &x1[42]);
519 btf_16_adds_subs_avx2(&x1[44], &x1[43]);
520 btf_16_adds_subs_avx2(&x1[48], &x1[55]);
521 btf_16_adds_subs_avx2(&x1[49], &x1[54]);
522 btf_16_adds_subs_avx2(&x1[50], &x1[53]);
523 btf_16_adds_subs_avx2(&x1[51], &x1[52]);
524 btf_16_adds_subs_avx2(&x1[63], &x1[56]);
525 btf_16_adds_subs_avx2(&x1[62], &x1[57]);
526 btf_16_adds_subs_avx2(&x1[61], &x1[58]);
527 btf_16_adds_subs_avx2(&x1[60], &x1[59]);
528
529 // stage 6
530 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
531 btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
532 btf_16_adds_subs_avx2(&x1[4], &x1[5]);
533 btf_16_adds_subs_avx2(&x1[7], &x1[6]);
534 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
535 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
536 btf_16_adds_subs_avx2(&x1[16], &x1[19]);
537 btf_16_adds_subs_avx2(&x1[17], &x1[18]);
538 btf_16_adds_subs_avx2(&x1[23], &x1[20]);
539 btf_16_adds_subs_avx2(&x1[22], &x1[21]);
540 btf_16_adds_subs_avx2(&x1[24], &x1[27]);
541 btf_16_adds_subs_avx2(&x1[25], &x1[26]);
542 btf_16_adds_subs_avx2(&x1[31], &x1[28]);
543 btf_16_adds_subs_avx2(&x1[30], &x1[29]);
544 btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[34], &x1[61], _r, cos_bit);
545 btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[35], &x1[60], _r, cos_bit);
546 btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[36], &x1[59], _r, cos_bit);
547 btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[37], &x1[58], _r, cos_bit);
548 btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[42], &x1[53], _r, cos_bit);
549 btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[43], &x1[52], _r, cos_bit);
550 btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[44], &x1[51], _r, cos_bit);
551 btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[45], &x1[50], _r, cos_bit);
552
553 // stage 7
554 btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
555 btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
556 btf_16_adds_subs_avx2(&x1[8], &x1[9]);
557 btf_16_adds_subs_avx2(&x1[11], &x1[10]);
558 btf_16_adds_subs_avx2(&x1[12], &x1[13]);
559 btf_16_adds_subs_avx2(&x1[15], &x1[14]);
560 btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit);
561 btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit);
562 btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit);
563 btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit);
564 btf_16_adds_subs_avx2(&x1[32], &x1[35]);
565 btf_16_adds_subs_avx2(&x1[33], &x1[34]);
566 btf_16_adds_subs_avx2(&x1[39], &x1[36]);
567 btf_16_adds_subs_avx2(&x1[38], &x1[37]);
568 btf_16_adds_subs_avx2(&x1[40], &x1[43]);
569 btf_16_adds_subs_avx2(&x1[41], &x1[42]);
570 btf_16_adds_subs_avx2(&x1[47], &x1[44]);
571 btf_16_adds_subs_avx2(&x1[46], &x1[45]);
572 btf_16_adds_subs_avx2(&x1[48], &x1[51]);
573 btf_16_adds_subs_avx2(&x1[49], &x1[50]);
574 btf_16_adds_subs_avx2(&x1[55], &x1[52]);
575 btf_16_adds_subs_avx2(&x1[54], &x1[53]);
576 btf_16_adds_subs_avx2(&x1[56], &x1[59]);
577 btf_16_adds_subs_avx2(&x1[57], &x1[58]);
578 btf_16_adds_subs_avx2(&x1[63], &x1[60]);
579 btf_16_adds_subs_avx2(&x1[62], &x1[61]);
580
581 // stage 8
582 btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
583 btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
584 btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
585 btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
586 btf_16_adds_subs_avx2(&x1[16], &x1[17]);
587 btf_16_adds_subs_avx2(&x1[19], &x1[18]);
588 btf_16_adds_subs_avx2(&x1[20], &x1[21]);
589 btf_16_adds_subs_avx2(&x1[23], &x1[22]);
590 btf_16_adds_subs_avx2(&x1[24], &x1[25]);
591 btf_16_adds_subs_avx2(&x1[27], &x1[26]);
592 btf_16_adds_subs_avx2(&x1[28], &x1[29]);
593 btf_16_adds_subs_avx2(&x1[31], &x1[30]);
594 btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x1[33], &x1[62], _r, cos_bit);
595 btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x1[34], &x1[61], _r, cos_bit);
596 btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x1[37], &x1[58], _r, cos_bit);
597 btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x1[38], &x1[57], _r, cos_bit);
598 btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x1[41], &x1[54], _r, cos_bit);
599 btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x1[42], &x1[53], _r, cos_bit);
600 btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x1[45], &x1[50], _r, cos_bit);
601 btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x1[46], &x1[49], _r, cos_bit);
602
603 // stage 9
604 btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit);
605 btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit);
606 btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit);
607 btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit);
608 btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit);
609 btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit);
610 btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit);
611 btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit);
612 btf_16_adds_subs_avx2(&x1[32], &x1[33]);
613 btf_16_adds_subs_avx2(&x1[35], &x1[34]);
614 btf_16_adds_subs_avx2(&x1[36], &x1[37]);
615 btf_16_adds_subs_avx2(&x1[39], &x1[38]);
616 btf_16_adds_subs_avx2(&x1[40], &x1[41]);
617 btf_16_adds_subs_avx2(&x1[43], &x1[42]);
618 btf_16_adds_subs_avx2(&x1[44], &x1[45]);
619 btf_16_adds_subs_avx2(&x1[47], &x1[46]);
620 btf_16_adds_subs_avx2(&x1[48], &x1[49]);
621 btf_16_adds_subs_avx2(&x1[51], &x1[50]);
622 btf_16_adds_subs_avx2(&x1[52], &x1[53]);
623 btf_16_adds_subs_avx2(&x1[55], &x1[54]);
624 btf_16_adds_subs_avx2(&x1[56], &x1[57]);
625 btf_16_adds_subs_avx2(&x1[59], &x1[58]);
626 btf_16_adds_subs_avx2(&x1[60], &x1[61]);
627 btf_16_adds_subs_avx2(&x1[63], &x1[62]);
628
629 // stage 10
630 btf_16_w16_avx2(cospi_p63_p01, cospi_m01_p63, &x1[32], &x1[63], _r, cos_bit);
631 btf_16_w16_avx2(cospi_p31_p33, cospi_m33_p31, &x1[33], &x1[62], _r, cos_bit);
632 btf_16_w16_avx2(cospi_p47_p17, cospi_m17_p47, &x1[34], &x1[61], _r, cos_bit);
633 btf_16_w16_avx2(cospi_p15_p49, cospi_m49_p15, &x1[35], &x1[60], _r, cos_bit);
634 btf_16_w16_avx2(cospi_p55_p09, cospi_m09_p55, &x1[36], &x1[59], _r, cos_bit);
635 btf_16_w16_avx2(cospi_p23_p41, cospi_m41_p23, &x1[37], &x1[58], _r, cos_bit);
636 btf_16_w16_avx2(cospi_p39_p25, cospi_m25_p39, &x1[38], &x1[57], _r, cos_bit);
637 btf_16_w16_avx2(cospi_p07_p57, cospi_m57_p07, &x1[39], &x1[56], _r, cos_bit);
638 btf_16_w16_avx2(cospi_p59_p05, cospi_m05_p59, &x1[40], &x1[55], _r, cos_bit);
639 btf_16_w16_avx2(cospi_p27_p37, cospi_m37_p27, &x1[41], &x1[54], _r, cos_bit);
640 btf_16_w16_avx2(cospi_p43_p21, cospi_m21_p43, &x1[42], &x1[53], _r, cos_bit);
641 btf_16_w16_avx2(cospi_p11_p53, cospi_m53_p11, &x1[43], &x1[52], _r, cos_bit);
642 btf_16_w16_avx2(cospi_p51_p13, cospi_m13_p51, &x1[44], &x1[51], _r, cos_bit);
643 btf_16_w16_avx2(cospi_p19_p45, cospi_m45_p19, &x1[45], &x1[50], _r, cos_bit);
644 btf_16_w16_avx2(cospi_p35_p29, cospi_m29_p35, &x1[46], &x1[49], _r, cos_bit);
645 btf_16_w16_avx2(cospi_p03_p61, cospi_m61_p03, &x1[47], &x1[48], _r, cos_bit);
646
647 // stage 11
648 output[0] = x1[0];
649 output[1] = x1[32];
650 output[2] = x1[16];
651 output[3] = x1[48];
652 output[4] = x1[8];
653 output[5] = x1[40];
654 output[6] = x1[24];
655 output[7] = x1[56];
656 output[8] = x1[4];
657 output[9] = x1[36];
658 output[10] = x1[20];
659 output[11] = x1[52];
660 output[12] = x1[12];
661 output[13] = x1[44];
662 output[14] = x1[28];
663 output[15] = x1[60];
664 output[16] = x1[2];
665 output[17] = x1[34];
666 output[18] = x1[18];
667 output[19] = x1[50];
668 output[20] = x1[10];
669 output[21] = x1[42];
670 output[22] = x1[26];
671 output[23] = x1[58];
672 output[24] = x1[6];
673 output[25] = x1[38];
674 output[26] = x1[22];
675 output[27] = x1[54];
676 output[28] = x1[14];
677 output[29] = x1[46];
678 output[30] = x1[30];
679 output[31] = x1[62];
680 output[32] = x1[1];
681 output[33] = x1[33];
682 output[34] = x1[17];
683 output[35] = x1[49];
684 output[36] = x1[9];
685 output[37] = x1[41];
686 output[38] = x1[25];
687 output[39] = x1[57];
688 output[40] = x1[5];
689 output[41] = x1[37];
690 output[42] = x1[21];
691 output[43] = x1[53];
692 output[44] = x1[13];
693 output[45] = x1[45];
694 output[46] = x1[29];
695 output[47] = x1[61];
696 output[48] = x1[3];
697 output[49] = x1[35];
698 output[50] = x1[19];
699 output[51] = x1[51];
700 output[52] = x1[11];
701 output[53] = x1[43];
702 output[54] = x1[27];
703 output[55] = x1[59];
704 output[56] = x1[7];
705 output[57] = x1[39];
706 output[58] = x1[23];
707 output[59] = x1[55];
708 output[60] = x1[15];
709 output[61] = x1[47];
710 output[62] = x1[31];
711 output[63] = x1[63];
712 }
713
av1_fdct32_new_avx2(const __m256i * input,__m256i * output,int8_t cos_bit)714 static INLINE void av1_fdct32_new_avx2(const __m256i *input, __m256i *output,
715 int8_t cos_bit) {
716 __m256i x1[32];
717 const int32_t *cospi = cospi_arr(cos_bit);
718 const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
719 // stage 0
720 // stage 1
721 btf_32_add_sub_out_avx2(&x1[0], &x1[31], input[0], input[31]);
722 btf_32_add_sub_out_avx2(&x1[1], &x1[30], input[1], input[30]);
723 btf_32_add_sub_out_avx2(&x1[2], &x1[29], input[2], input[29]);
724 btf_32_add_sub_out_avx2(&x1[3], &x1[28], input[3], input[28]);
725 btf_32_add_sub_out_avx2(&x1[4], &x1[27], input[4], input[27]);
726 btf_32_add_sub_out_avx2(&x1[5], &x1[26], input[5], input[26]);
727 btf_32_add_sub_out_avx2(&x1[6], &x1[25], input[6], input[25]);
728 btf_32_add_sub_out_avx2(&x1[7], &x1[24], input[7], input[24]);
729 btf_32_add_sub_out_avx2(&x1[8], &x1[23], input[8], input[23]);
730 btf_32_add_sub_out_avx2(&x1[9], &x1[22], input[9], input[22]);
731 btf_32_add_sub_out_avx2(&x1[10], &x1[21], input[10], input[21]);
732 btf_32_add_sub_out_avx2(&x1[11], &x1[20], input[11], input[20]);
733 btf_32_add_sub_out_avx2(&x1[12], &x1[19], input[12], input[19]);
734 btf_32_add_sub_out_avx2(&x1[13], &x1[18], input[13], input[18]);
735 btf_32_add_sub_out_avx2(&x1[14], &x1[17], input[14], input[17]);
736 btf_32_add_sub_out_avx2(&x1[15], &x1[16], input[15], input[16]);
737
738 // stage 2
739 btf_32_add_sub_avx2(&x1[0], &x1[15]);
740 btf_32_add_sub_avx2(&x1[1], &x1[14]);
741 btf_32_add_sub_avx2(&x1[2], &x1[13]);
742 btf_32_add_sub_avx2(&x1[3], &x1[12]);
743 btf_32_add_sub_avx2(&x1[4], &x1[11]);
744 btf_32_add_sub_avx2(&x1[5], &x1[10]);
745 btf_32_add_sub_avx2(&x1[6], &x1[9]);
746 btf_32_add_sub_avx2(&x1[7], &x1[8]);
747 btf_32_avx2_type0(-cospi[32], cospi[32], &x1[20], &x1[27], _r, cos_bit);
748 btf_32_avx2_type0(-cospi[32], cospi[32], &x1[21], &x1[26], _r, cos_bit);
749 btf_32_avx2_type0(-cospi[32], cospi[32], &x1[22], &x1[25], _r, cos_bit);
750 btf_32_avx2_type0(-cospi[32], cospi[32], &x1[23], &x1[24], _r, cos_bit);
751
752 // stage 3
753 btf_32_add_sub_avx2(&x1[0], &x1[7]);
754 btf_32_add_sub_avx2(&x1[1], &x1[6]);
755 btf_32_add_sub_avx2(&x1[2], &x1[5]);
756 btf_32_add_sub_avx2(&x1[3], &x1[4]);
757 btf_32_avx2_type0(-cospi[32], cospi[32], &x1[10], &x1[13], _r, cos_bit);
758 btf_32_avx2_type0(-cospi[32], cospi[32], &x1[11], &x1[12], _r, cos_bit);
759 btf_32_add_sub_avx2(&x1[16], &x1[23]);
760 btf_32_add_sub_avx2(&x1[17], &x1[22]);
761 btf_32_add_sub_avx2(&x1[18], &x1[21]);
762 btf_32_add_sub_avx2(&x1[19], &x1[20]);
763 btf_32_add_sub_avx2(&x1[31], &x1[24]);
764 btf_32_add_sub_avx2(&x1[30], &x1[25]);
765 btf_32_add_sub_avx2(&x1[29], &x1[26]);
766 btf_32_add_sub_avx2(&x1[28], &x1[27]);
767
768 // stage 4
769 btf_32_add_sub_avx2(&x1[0], &x1[3]);
770 btf_32_add_sub_avx2(&x1[1], &x1[2]);
771 btf_32_avx2_type0(-cospi[32], cospi[32], &x1[5], &x1[6], _r, cos_bit);
772 btf_32_add_sub_avx2(&x1[8], &x1[11]);
773 btf_32_add_sub_avx2(&x1[9], &x1[10]);
774 btf_32_add_sub_avx2(&x1[15], &x1[12]);
775 btf_32_add_sub_avx2(&x1[14], &x1[13]);
776 btf_32_avx2_type0(-cospi[16], cospi[48], &x1[18], &x1[29], _r, cos_bit);
777 btf_32_avx2_type0(-cospi[16], cospi[48], &x1[19], &x1[28], _r, cos_bit);
778 btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[20], &x1[27], _r, cos_bit);
779 btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[21], &x1[26], _r, cos_bit);
780
781 // stage 5
782 btf_32_avx2_type0(cospi[32], cospi[32], &x1[0], &x1[1], _r, cos_bit);
783 btf_32_avx2_type1(cospi[48], cospi[16], &x1[2], &x1[3], _r, cos_bit);
784 btf_32_add_sub_avx2(&x1[4], &x1[5]);
785 btf_32_add_sub_avx2(&x1[7], &x1[6]);
786 btf_32_avx2_type0(-cospi[16], cospi[48], &x1[9], &x1[14], _r, cos_bit);
787 btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[10], &x1[13], _r, cos_bit);
788 btf_32_add_sub_avx2(&x1[16], &x1[19]);
789 btf_32_add_sub_avx2(&x1[17], &x1[18]);
790 btf_32_add_sub_avx2(&x1[23], &x1[20]);
791 btf_32_add_sub_avx2(&x1[22], &x1[21]);
792 btf_32_add_sub_avx2(&x1[24], &x1[27]);
793 btf_32_add_sub_avx2(&x1[25], &x1[26]);
794 btf_32_add_sub_avx2(&x1[31], &x1[28]);
795 btf_32_add_sub_avx2(&x1[30], &x1[29]);
796
797 // stage 6
798 btf_32_avx2_type1(cospi[56], cospi[8], &x1[4], &x1[7], _r, cos_bit);
799 btf_32_avx2_type1(cospi[24], cospi[40], &x1[5], &x1[6], _r, cos_bit);
800 btf_32_add_sub_avx2(&x1[8], &x1[9]);
801 btf_32_add_sub_avx2(&x1[11], &x1[10]);
802 btf_32_add_sub_avx2(&x1[12], &x1[13]);
803 btf_32_add_sub_avx2(&x1[15], &x1[14]);
804 btf_32_avx2_type0(-cospi[8], cospi[56], &x1[17], &x1[30], _r, cos_bit);
805 btf_32_avx2_type0(-cospi[56], -cospi[8], &x1[18], &x1[29], _r, cos_bit);
806 btf_32_avx2_type0(-cospi[40], cospi[24], &x1[21], &x1[26], _r, cos_bit);
807 btf_32_avx2_type0(-cospi[24], -cospi[40], &x1[22], &x1[25], _r, cos_bit);
808
809 // stage 7
810 btf_32_avx2_type1(cospi[60], cospi[4], &x1[8], &x1[15], _r, cos_bit);
811 btf_32_avx2_type1(cospi[28], cospi[36], &x1[9], &x1[14], _r, cos_bit);
812 btf_32_avx2_type1(cospi[44], cospi[20], &x1[10], &x1[13], _r, cos_bit);
813 btf_32_avx2_type1(cospi[12], cospi[52], &x1[11], &x1[12], _r, cos_bit);
814 btf_32_add_sub_avx2(&x1[16], &x1[17]);
815 btf_32_add_sub_avx2(&x1[19], &x1[18]);
816 btf_32_add_sub_avx2(&x1[20], &x1[21]);
817 btf_32_add_sub_avx2(&x1[23], &x1[22]);
818 btf_32_add_sub_avx2(&x1[24], &x1[25]);
819 btf_32_add_sub_avx2(&x1[27], &x1[26]);
820 btf_32_add_sub_avx2(&x1[28], &x1[29]);
821 btf_32_add_sub_avx2(&x1[31], &x1[30]);
822
823 // stage 8
824 btf_32_avx2_type1(cospi[62], cospi[2], &x1[16], &x1[31], _r, cos_bit);
825 btf_32_avx2_type1(cospi[30], cospi[34], &x1[17], &x1[30], _r, cos_bit);
826 btf_32_avx2_type1(cospi[46], cospi[18], &x1[18], &x1[29], _r, cos_bit);
827 btf_32_avx2_type1(cospi[14], cospi[50], &x1[19], &x1[28], _r, cos_bit);
828 btf_32_avx2_type1(cospi[54], cospi[10], &x1[20], &x1[27], _r, cos_bit);
829 btf_32_avx2_type1(cospi[22], cospi[42], &x1[21], &x1[26], _r, cos_bit);
830 btf_32_avx2_type1(cospi[38], cospi[26], &x1[22], &x1[25], _r, cos_bit);
831 btf_32_avx2_type1(cospi[6], cospi[58], &x1[23], &x1[24], _r, cos_bit);
832
833 // stage 9
834 output[0] = x1[0];
835 output[1] = x1[16];
836 output[2] = x1[8];
837 output[3] = x1[24];
838 output[4] = x1[4];
839 output[5] = x1[20];
840 output[6] = x1[12];
841 output[7] = x1[28];
842 output[8] = x1[2];
843 output[9] = x1[18];
844 output[10] = x1[10];
845 output[11] = x1[26];
846 output[12] = x1[6];
847 output[13] = x1[22];
848 output[14] = x1[14];
849 output[15] = x1[30];
850 output[16] = x1[1];
851 output[17] = x1[17];
852 output[18] = x1[9];
853 output[19] = x1[25];
854 output[20] = x1[5];
855 output[21] = x1[21];
856 output[22] = x1[13];
857 output[23] = x1[29];
858 output[24] = x1[3];
859 output[25] = x1[19];
860 output[26] = x1[11];
861 output[27] = x1[27];
862 output[28] = x1[7];
863 output[29] = x1[23];
864 output[30] = x1[15];
865 output[31] = x1[31];
866 }
867
av1_fdct64_new_avx2(const __m256i * input,__m256i * output,int8_t cos_bit)868 static INLINE void av1_fdct64_new_avx2(const __m256i *input, __m256i *output,
869 int8_t cos_bit) {
870 const int32_t *cospi = cospi_arr(cos_bit);
871 const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
872
873 __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]);
874 __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]);
875 __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]);
876 __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]);
877 __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]);
878 __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]);
879 __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]);
880 __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]);
881 __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]);
882 __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]);
883 __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]);
884 __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]);
885 __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]);
886 __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]);
887 __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]);
888 __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]);
889 __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]);
890 __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]);
891 __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]);
892 __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]);
893 __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]);
894 __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]);
895 __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]);
896 __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]);
897 __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]);
898 __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]);
899 __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]);
900 __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]);
901 __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]);
902 __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]);
903 __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]);
904 __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]);
905 __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]);
906 __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]);
907 __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]);
908 __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]);
909 __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]);
910 __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]);
911 __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]);
912 __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]);
913 __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]);
914 __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]);
915 __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]);
916 __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]);
917 __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]);
918 __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]);
919 __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]);
920 __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]);
921 __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]);
922 __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]);
923 __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]);
924 __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]);
925 __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]);
926 __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]);
927 __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]);
928 __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]);
929 __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]);
930 __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]);
931 __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]);
932 __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]);
933 __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]);
934 __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]);
935 __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]);
936 __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]);
937 __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]);
938 __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]);
939 __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]);
940 __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]);
941 __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]);
942 __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]);
943 __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]);
944 __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]);
945 __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]);
946 __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]);
947 __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]);
948 __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]);
949 __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]);
950 __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]);
951
952 // stage 1
953 __m256i x1[64];
954 btf_32_add_sub_out_avx2(&x1[0], &x1[63], input[0], input[63]);
955 btf_32_add_sub_out_avx2(&x1[1], &x1[62], input[1], input[62]);
956 btf_32_add_sub_out_avx2(&x1[2], &x1[61], input[2], input[61]);
957 btf_32_add_sub_out_avx2(&x1[3], &x1[60], input[3], input[60]);
958 btf_32_add_sub_out_avx2(&x1[4], &x1[59], input[4], input[59]);
959 btf_32_add_sub_out_avx2(&x1[5], &x1[58], input[5], input[58]);
960 btf_32_add_sub_out_avx2(&x1[6], &x1[57], input[6], input[57]);
961 btf_32_add_sub_out_avx2(&x1[7], &x1[56], input[7], input[56]);
962 btf_32_add_sub_out_avx2(&x1[8], &x1[55], input[8], input[55]);
963 btf_32_add_sub_out_avx2(&x1[9], &x1[54], input[9], input[54]);
964 btf_32_add_sub_out_avx2(&x1[10], &x1[53], input[10], input[53]);
965 btf_32_add_sub_out_avx2(&x1[11], &x1[52], input[11], input[52]);
966 btf_32_add_sub_out_avx2(&x1[12], &x1[51], input[12], input[51]);
967 btf_32_add_sub_out_avx2(&x1[13], &x1[50], input[13], input[50]);
968 btf_32_add_sub_out_avx2(&x1[14], &x1[49], input[14], input[49]);
969 btf_32_add_sub_out_avx2(&x1[15], &x1[48], input[15], input[48]);
970 btf_32_add_sub_out_avx2(&x1[16], &x1[47], input[16], input[47]);
971 btf_32_add_sub_out_avx2(&x1[17], &x1[46], input[17], input[46]);
972 btf_32_add_sub_out_avx2(&x1[18], &x1[45], input[18], input[45]);
973 btf_32_add_sub_out_avx2(&x1[19], &x1[44], input[19], input[44]);
974 btf_32_add_sub_out_avx2(&x1[20], &x1[43], input[20], input[43]);
975 btf_32_add_sub_out_avx2(&x1[21], &x1[42], input[21], input[42]);
976 btf_32_add_sub_out_avx2(&x1[22], &x1[41], input[22], input[41]);
977 btf_32_add_sub_out_avx2(&x1[23], &x1[40], input[23], input[40]);
978 btf_32_add_sub_out_avx2(&x1[24], &x1[39], input[24], input[39]);
979 btf_32_add_sub_out_avx2(&x1[25], &x1[38], input[25], input[38]);
980 btf_32_add_sub_out_avx2(&x1[26], &x1[37], input[26], input[37]);
981 btf_32_add_sub_out_avx2(&x1[27], &x1[36], input[27], input[36]);
982 btf_32_add_sub_out_avx2(&x1[28], &x1[35], input[28], input[35]);
983 btf_32_add_sub_out_avx2(&x1[29], &x1[34], input[29], input[34]);
984 btf_32_add_sub_out_avx2(&x1[30], &x1[33], input[30], input[33]);
985 btf_32_add_sub_out_avx2(&x1[31], &x1[32], input[31], input[32]);
986
987 // stage 2
988 btf_32_add_sub_avx2(&x1[0], &x1[31]);
989 btf_32_add_sub_avx2(&x1[1], &x1[30]);
990 btf_32_add_sub_avx2(&x1[2], &x1[29]);
991 btf_32_add_sub_avx2(&x1[3], &x1[28]);
992 btf_32_add_sub_avx2(&x1[4], &x1[27]);
993 btf_32_add_sub_avx2(&x1[5], &x1[26]);
994 btf_32_add_sub_avx2(&x1[6], &x1[25]);
995 btf_32_add_sub_avx2(&x1[7], &x1[24]);
996 btf_32_add_sub_avx2(&x1[8], &x1[23]);
997 btf_32_add_sub_avx2(&x1[9], &x1[22]);
998 btf_32_add_sub_avx2(&x1[10], &x1[21]);
999 btf_32_add_sub_avx2(&x1[11], &x1[20]);
1000 btf_32_add_sub_avx2(&x1[12], &x1[19]);
1001 btf_32_add_sub_avx2(&x1[13], &x1[18]);
1002 btf_32_add_sub_avx2(&x1[14], &x1[17]);
1003 btf_32_add_sub_avx2(&x1[15], &x1[16]);
1004 btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[40], &x1[55], _r, cos_bit);
1005 btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[41], &x1[54], _r, cos_bit);
1006 btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[42], &x1[53], _r, cos_bit);
1007 btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[43], &x1[52], _r, cos_bit);
1008 btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[44], &x1[51], _r, cos_bit);
1009 btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[45], &x1[50], _r, cos_bit);
1010 btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[46], &x1[49], _r, cos_bit);
1011 btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[47], &x1[48], _r, cos_bit);
1012
1013 // stage 3
1014 btf_32_add_sub_avx2(&x1[0], &x1[15]);
1015 btf_32_add_sub_avx2(&x1[1], &x1[14]);
1016 btf_32_add_sub_avx2(&x1[2], &x1[13]);
1017 btf_32_add_sub_avx2(&x1[3], &x1[12]);
1018 btf_32_add_sub_avx2(&x1[4], &x1[11]);
1019 btf_32_add_sub_avx2(&x1[5], &x1[10]);
1020 btf_32_add_sub_avx2(&x1[6], &x1[9]);
1021 btf_32_add_sub_avx2(&x1[7], &x1[8]);
1022 btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[20], &x1[27], _r, cos_bit);
1023 btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[21], &x1[26], _r, cos_bit);
1024 btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[22], &x1[25], _r, cos_bit);
1025 btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[23], &x1[24], _r, cos_bit);
1026 btf_32_add_sub_avx2(&x1[32], &x1[47]);
1027 btf_32_add_sub_avx2(&x1[33], &x1[46]);
1028 btf_32_add_sub_avx2(&x1[34], &x1[45]);
1029 btf_32_add_sub_avx2(&x1[35], &x1[44]);
1030 btf_32_add_sub_avx2(&x1[36], &x1[43]);
1031 btf_32_add_sub_avx2(&x1[37], &x1[42]);
1032 btf_32_add_sub_avx2(&x1[38], &x1[41]);
1033 btf_32_add_sub_avx2(&x1[39], &x1[40]);
1034 btf_32_add_sub_avx2(&x1[63], &x1[48]);
1035 btf_32_add_sub_avx2(&x1[62], &x1[49]);
1036 btf_32_add_sub_avx2(&x1[61], &x1[50]);
1037 btf_32_add_sub_avx2(&x1[60], &x1[51]);
1038 btf_32_add_sub_avx2(&x1[59], &x1[52]);
1039 btf_32_add_sub_avx2(&x1[58], &x1[53]);
1040 btf_32_add_sub_avx2(&x1[57], &x1[54]);
1041 btf_32_add_sub_avx2(&x1[56], &x1[55]);
1042
1043 // stage 4
1044 btf_32_add_sub_avx2(&x1[0], &x1[7]);
1045 btf_32_add_sub_avx2(&x1[1], &x1[6]);
1046 btf_32_add_sub_avx2(&x1[2], &x1[5]);
1047 btf_32_add_sub_avx2(&x1[3], &x1[4]);
1048 btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[10], &x1[13], _r, cos_bit);
1049 btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[11], &x1[12], _r, cos_bit);
1050 btf_32_add_sub_avx2(&x1[16], &x1[23]);
1051 btf_32_add_sub_avx2(&x1[17], &x1[22]);
1052 btf_32_add_sub_avx2(&x1[18], &x1[21]);
1053 btf_32_add_sub_avx2(&x1[19], &x1[20]);
1054 btf_32_add_sub_avx2(&x1[31], &x1[24]);
1055 btf_32_add_sub_avx2(&x1[30], &x1[25]);
1056 btf_32_add_sub_avx2(&x1[29], &x1[26]);
1057 btf_32_add_sub_avx2(&x1[28], &x1[27]);
1058 btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[36], &x1[59], _r, cos_bit);
1059 btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[37], &x1[58], _r, cos_bit);
1060 btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[38], &x1[57], _r, cos_bit);
1061 btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[39], &x1[56], _r, cos_bit);
1062 btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[40], &x1[55], _r, cos_bit);
1063 btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[41], &x1[54], _r, cos_bit);
1064 btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[42], &x1[53], _r, cos_bit);
1065 btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[43], &x1[52], _r, cos_bit);
1066
1067 // stage 5
1068 btf_32_add_sub_avx2(&x1[0], &x1[3]);
1069 btf_32_add_sub_avx2(&x1[1], &x1[2]);
1070 btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[5], &x1[6], _r, cos_bit);
1071 btf_32_add_sub_avx2(&x1[8], &x1[11]);
1072 btf_32_add_sub_avx2(&x1[9], &x1[10]);
1073 btf_32_add_sub_avx2(&x1[15], &x1[12]);
1074 btf_32_add_sub_avx2(&x1[14], &x1[13]);
1075 btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[18], &x1[29], _r, cos_bit);
1076 btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[19], &x1[28], _r, cos_bit);
1077 btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[20], &x1[27], _r, cos_bit);
1078 btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[21], &x1[26], _r, cos_bit);
1079 btf_32_add_sub_avx2(&x1[32], &x1[39]);
1080 btf_32_add_sub_avx2(&x1[33], &x1[38]);
1081 btf_32_add_sub_avx2(&x1[34], &x1[37]);
1082 btf_32_add_sub_avx2(&x1[35], &x1[36]);
1083 btf_32_add_sub_avx2(&x1[47], &x1[40]);
1084 btf_32_add_sub_avx2(&x1[46], &x1[41]);
1085 btf_32_add_sub_avx2(&x1[45], &x1[42]);
1086 btf_32_add_sub_avx2(&x1[44], &x1[43]);
1087 btf_32_add_sub_avx2(&x1[48], &x1[55]);
1088 btf_32_add_sub_avx2(&x1[49], &x1[54]);
1089 btf_32_add_sub_avx2(&x1[50], &x1[53]);
1090 btf_32_add_sub_avx2(&x1[51], &x1[52]);
1091 btf_32_add_sub_avx2(&x1[63], &x1[56]);
1092 btf_32_add_sub_avx2(&x1[62], &x1[57]);
1093 btf_32_add_sub_avx2(&x1[61], &x1[58]);
1094 btf_32_add_sub_avx2(&x1[60], &x1[59]);
1095
1096 // stage 6
1097 btf_32_avx2_type0_new(cospi_p32, cospi_p32, &x1[0], &x1[1], _r, cos_bit);
1098 btf_32_avx2_type1_new(cospi_p48, cospi_p16, &x1[2], &x1[3], _r, cos_bit);
1099 btf_32_add_sub_avx2(&x1[4], &x1[5]);
1100 btf_32_add_sub_avx2(&x1[7], &x1[6]);
1101 btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[9], &x1[14], _r, cos_bit);
1102 btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[10], &x1[13], _r, cos_bit);
1103 btf_32_add_sub_avx2(&x1[16], &x1[19]);
1104 btf_32_add_sub_avx2(&x1[17], &x1[18]);
1105 btf_32_add_sub_avx2(&x1[23], &x1[20]);
1106 btf_32_add_sub_avx2(&x1[22], &x1[21]);
1107 btf_32_add_sub_avx2(&x1[24], &x1[27]);
1108 btf_32_add_sub_avx2(&x1[25], &x1[26]);
1109 btf_32_add_sub_avx2(&x1[31], &x1[28]);
1110 btf_32_add_sub_avx2(&x1[30], &x1[29]);
1111 btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[34], &x1[61], _r, cos_bit);
1112 btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[35], &x1[60], _r, cos_bit);
1113 btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[36], &x1[59], _r, cos_bit);
1114 btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[37], &x1[58], _r, cos_bit);
1115 btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[42], &x1[53], _r, cos_bit);
1116 btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[43], &x1[52], _r, cos_bit);
1117 btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[44], &x1[51], _r, cos_bit);
1118 btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[45], &x1[50], _r, cos_bit);
1119
1120 // stage 7
1121 btf_32_avx2_type1_new(cospi_p56, cospi_p08, &x1[4], &x1[7], _r, cos_bit);
1122 btf_32_avx2_type1_new(cospi_p24, cospi_p40, &x1[5], &x1[6], _r, cos_bit);
1123 btf_32_add_sub_avx2(&x1[8], &x1[9]);
1124 btf_32_add_sub_avx2(&x1[11], &x1[10]);
1125 btf_32_add_sub_avx2(&x1[12], &x1[13]);
1126 btf_32_add_sub_avx2(&x1[15], &x1[14]);
1127 btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[17], &x1[30], _r, cos_bit);
1128 btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[18], &x1[29], _r, cos_bit);
1129 btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[21], &x1[26], _r, cos_bit);
1130 btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[22], &x1[25], _r, cos_bit);
1131 btf_32_add_sub_avx2(&x1[32], &x1[35]);
1132 btf_32_add_sub_avx2(&x1[33], &x1[34]);
1133 btf_32_add_sub_avx2(&x1[39], &x1[36]);
1134 btf_32_add_sub_avx2(&x1[38], &x1[37]);
1135 btf_32_add_sub_avx2(&x1[40], &x1[43]);
1136 btf_32_add_sub_avx2(&x1[41], &x1[42]);
1137 btf_32_add_sub_avx2(&x1[47], &x1[44]);
1138 btf_32_add_sub_avx2(&x1[46], &x1[45]);
1139 btf_32_add_sub_avx2(&x1[48], &x1[51]);
1140 btf_32_add_sub_avx2(&x1[49], &x1[50]);
1141 btf_32_add_sub_avx2(&x1[55], &x1[52]);
1142 btf_32_add_sub_avx2(&x1[54], &x1[53]);
1143 btf_32_add_sub_avx2(&x1[56], &x1[59]);
1144 btf_32_add_sub_avx2(&x1[57], &x1[58]);
1145 btf_32_add_sub_avx2(&x1[63], &x1[60]);
1146 btf_32_add_sub_avx2(&x1[62], &x1[61]);
1147
1148 // stage 8
1149 btf_32_avx2_type1_new(cospi_p60, cospi_p04, &x1[8], &x1[15], _r, cos_bit);
1150 btf_32_avx2_type1_new(cospi_p28, cospi_p36, &x1[9], &x1[14], _r, cos_bit);
1151 btf_32_avx2_type1_new(cospi_p44, cospi_p20, &x1[10], &x1[13], _r, cos_bit);
1152 btf_32_avx2_type1_new(cospi_p12, cospi_p52, &x1[11], &x1[12], _r, cos_bit);
1153 btf_32_add_sub_avx2(&x1[16], &x1[17]);
1154 btf_32_add_sub_avx2(&x1[19], &x1[18]);
1155 btf_32_add_sub_avx2(&x1[20], &x1[21]);
1156 btf_32_add_sub_avx2(&x1[23], &x1[22]);
1157 btf_32_add_sub_avx2(&x1[24], &x1[25]);
1158 btf_32_add_sub_avx2(&x1[27], &x1[26]);
1159 btf_32_add_sub_avx2(&x1[28], &x1[29]);
1160 btf_32_add_sub_avx2(&x1[31], &x1[30]);
1161 btf_32_avx2_type0_new(cospi_m04, cospi_p60, &x1[33], &x1[62], _r, cos_bit);
1162 btf_32_avx2_type0_new(cospi_m60, cospi_m04, &x1[34], &x1[61], _r, cos_bit);
1163 btf_32_avx2_type0_new(cospi_m36, cospi_p28, &x1[37], &x1[58], _r, cos_bit);
1164 btf_32_avx2_type0_new(cospi_m28, cospi_m36, &x1[38], &x1[57], _r, cos_bit);
1165 btf_32_avx2_type0_new(cospi_m20, cospi_p44, &x1[41], &x1[54], _r, cos_bit);
1166 btf_32_avx2_type0_new(cospi_m44, cospi_m20, &x1[42], &x1[53], _r, cos_bit);
1167 btf_32_avx2_type0_new(cospi_m52, cospi_p12, &x1[45], &x1[50], _r, cos_bit);
1168 btf_32_avx2_type0_new(cospi_m12, cospi_m52, &x1[46], &x1[49], _r, cos_bit);
1169
1170 // stage 9
1171 btf_32_avx2_type1_new(cospi_p62, cospi_p02, &x1[16], &x1[31], _r, cos_bit);
1172 btf_32_avx2_type1_new(cospi_p30, cospi_p34, &x1[17], &x1[30], _r, cos_bit);
1173 btf_32_avx2_type1_new(cospi_p46, cospi_p18, &x1[18], &x1[29], _r, cos_bit);
1174 btf_32_avx2_type1_new(cospi_p14, cospi_p50, &x1[19], &x1[28], _r, cos_bit);
1175 btf_32_avx2_type1_new(cospi_p54, cospi_p10, &x1[20], &x1[27], _r, cos_bit);
1176 btf_32_avx2_type1_new(cospi_p22, cospi_p42, &x1[21], &x1[26], _r, cos_bit);
1177 btf_32_avx2_type1_new(cospi_p38, cospi_p26, &x1[22], &x1[25], _r, cos_bit);
1178 btf_32_avx2_type1_new(cospi_p06, cospi_p58, &x1[23], &x1[24], _r, cos_bit);
1179 btf_32_add_sub_avx2(&x1[32], &x1[33]);
1180 btf_32_add_sub_avx2(&x1[35], &x1[34]);
1181 btf_32_add_sub_avx2(&x1[36], &x1[37]);
1182 btf_32_add_sub_avx2(&x1[39], &x1[38]);
1183 btf_32_add_sub_avx2(&x1[40], &x1[41]);
1184 btf_32_add_sub_avx2(&x1[43], &x1[42]);
1185 btf_32_add_sub_avx2(&x1[44], &x1[45]);
1186 btf_32_add_sub_avx2(&x1[47], &x1[46]);
1187 btf_32_add_sub_avx2(&x1[48], &x1[49]);
1188 btf_32_add_sub_avx2(&x1[51], &x1[50]);
1189 btf_32_add_sub_avx2(&x1[52], &x1[53]);
1190 btf_32_add_sub_avx2(&x1[55], &x1[54]);
1191 btf_32_add_sub_avx2(&x1[56], &x1[57]);
1192 btf_32_add_sub_avx2(&x1[59], &x1[58]);
1193 btf_32_add_sub_avx2(&x1[60], &x1[61]);
1194 btf_32_add_sub_avx2(&x1[63], &x1[62]);
1195
1196 // stage 10
1197 btf_32_avx2_type1_new(cospi_p63, cospi_p01, &x1[32], &x1[63], _r, cos_bit);
1198 btf_32_avx2_type1_new(cospi_p31, cospi_p33, &x1[33], &x1[62], _r, cos_bit);
1199 btf_32_avx2_type1_new(cospi_p47, cospi_p17, &x1[34], &x1[61], _r, cos_bit);
1200 btf_32_avx2_type1_new(cospi_p15, cospi_p49, &x1[35], &x1[60], _r, cos_bit);
1201 btf_32_avx2_type1_new(cospi_p55, cospi_p09, &x1[36], &x1[59], _r, cos_bit);
1202 btf_32_avx2_type1_new(cospi_p23, cospi_p41, &x1[37], &x1[58], _r, cos_bit);
1203 btf_32_avx2_type1_new(cospi_p39, cospi_p25, &x1[38], &x1[57], _r, cos_bit);
1204 btf_32_avx2_type1_new(cospi_p07, cospi_p57, &x1[39], &x1[56], _r, cos_bit);
1205 btf_32_avx2_type1_new(cospi_p59, cospi_p05, &x1[40], &x1[55], _r, cos_bit);
1206 btf_32_avx2_type1_new(cospi_p27, cospi_p37, &x1[41], &x1[54], _r, cos_bit);
1207 btf_32_avx2_type1_new(cospi_p43, cospi_p21, &x1[42], &x1[53], _r, cos_bit);
1208 btf_32_avx2_type1_new(cospi_p11, cospi_p53, &x1[43], &x1[52], _r, cos_bit);
1209 btf_32_avx2_type1_new(cospi_p51, cospi_p13, &x1[44], &x1[51], _r, cos_bit);
1210 btf_32_avx2_type1_new(cospi_p19, cospi_p45, &x1[45], &x1[50], _r, cos_bit);
1211 btf_32_avx2_type1_new(cospi_p35, cospi_p29, &x1[46], &x1[49], _r, cos_bit);
1212 btf_32_avx2_type1_new(cospi_p03, cospi_p61, &x1[47], &x1[48], _r, cos_bit);
1213
1214 // stage 11
1215 output[0] = x1[0];
1216 output[1] = x1[32];
1217 output[2] = x1[16];
1218 output[3] = x1[48];
1219 output[4] = x1[8];
1220 output[5] = x1[40];
1221 output[6] = x1[24];
1222 output[7] = x1[56];
1223 output[8] = x1[4];
1224 output[9] = x1[36];
1225 output[10] = x1[20];
1226 output[11] = x1[52];
1227 output[12] = x1[12];
1228 output[13] = x1[44];
1229 output[14] = x1[28];
1230 output[15] = x1[60];
1231 output[16] = x1[2];
1232 output[17] = x1[34];
1233 output[18] = x1[18];
1234 output[19] = x1[50];
1235 output[20] = x1[10];
1236 output[21] = x1[42];
1237 output[22] = x1[26];
1238 output[23] = x1[58];
1239 output[24] = x1[6];
1240 output[25] = x1[38];
1241 output[26] = x1[22];
1242 output[27] = x1[54];
1243 output[28] = x1[14];
1244 output[29] = x1[46];
1245 output[30] = x1[30];
1246 output[31] = x1[62];
1247 output[32] = x1[1];
1248 output[33] = x1[33];
1249 output[34] = x1[17];
1250 output[35] = x1[49];
1251 output[36] = x1[9];
1252 output[37] = x1[41];
1253 output[38] = x1[25];
1254 output[39] = x1[57];
1255 output[40] = x1[5];
1256 output[41] = x1[37];
1257 output[42] = x1[21];
1258 output[43] = x1[53];
1259 output[44] = x1[13];
1260 output[45] = x1[45];
1261 output[46] = x1[29];
1262 output[47] = x1[61];
1263 output[48] = x1[3];
1264 output[49] = x1[35];
1265 output[50] = x1[19];
1266 output[51] = x1[51];
1267 output[52] = x1[11];
1268 output[53] = x1[43];
1269 output[54] = x1[27];
1270 output[55] = x1[59];
1271 output[56] = x1[7];
1272 output[57] = x1[39];
1273 output[58] = x1[23];
1274 output[59] = x1[55];
1275 output[60] = x1[15];
1276 output[61] = x1[47];
1277 output[62] = x1[31];
1278 output[63] = x1[63];
1279 }
1280
fadst16x16_new_avx2(const __m256i * input,__m256i * output,int8_t cos_bit)1281 static INLINE void fadst16x16_new_avx2(const __m256i *input, __m256i *output,
1282 int8_t cos_bit) {
1283 const int32_t *cospi = cospi_arr(cos_bit);
1284 const __m256i __zero = _mm256_setzero_si256();
1285 const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
1286
1287 __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
1288 __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
1289 __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
1290 __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
1291 __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
1292 __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
1293 __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
1294 __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
1295 __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
1296 __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]);
1297 __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]);
1298 __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
1299 __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
1300 __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
1301 __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
1302 __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
1303 __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
1304 __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
1305 __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
1306 __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
1307 __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
1308 __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
1309 __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
1310 __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
1311 __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
1312 __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
1313 __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
1314
1315 // stage 1
1316 __m256i x1[16];
1317 x1[0] = input[0];
1318 x1[1] = _mm256_subs_epi16(__zero, input[15]);
1319 x1[2] = _mm256_subs_epi16(__zero, input[7]);
1320 x1[3] = input[8];
1321 x1[4] = _mm256_subs_epi16(__zero, input[3]);
1322 x1[5] = input[12];
1323 x1[6] = input[4];
1324 x1[7] = _mm256_subs_epi16(__zero, input[11]);
1325 x1[8] = _mm256_subs_epi16(__zero, input[1]);
1326 x1[9] = input[14];
1327 x1[10] = input[6];
1328 x1[11] = _mm256_subs_epi16(__zero, input[9]);
1329 x1[12] = input[2];
1330 x1[13] = _mm256_subs_epi16(__zero, input[13]);
1331 x1[14] = _mm256_subs_epi16(__zero, input[5]);
1332 x1[15] = input[10];
1333
1334 // stage 2
1335 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit);
1336 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit);
1337 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit);
1338 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit);
1339
1340 // stage 3
1341 btf_16_adds_subs_avx2(&x1[0], &x1[2]);
1342 btf_16_adds_subs_avx2(&x1[1], &x1[3]);
1343 btf_16_adds_subs_avx2(&x1[4], &x1[6]);
1344 btf_16_adds_subs_avx2(&x1[5], &x1[7]);
1345 btf_16_adds_subs_avx2(&x1[8], &x1[10]);
1346 btf_16_adds_subs_avx2(&x1[9], &x1[11]);
1347 btf_16_adds_subs_avx2(&x1[12], &x1[14]);
1348 btf_16_adds_subs_avx2(&x1[13], &x1[15]);
1349
1350 // stage 4
1351 btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit);
1352 btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[6], &x1[7], _r, cos_bit);
1353 btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit);
1354 btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[14], &x1[15], _r, cos_bit);
1355
1356 // stage 5
1357 btf_16_adds_subs_avx2(&x1[0], &x1[4]);
1358 btf_16_adds_subs_avx2(&x1[1], &x1[5]);
1359 btf_16_adds_subs_avx2(&x1[2], &x1[6]);
1360 btf_16_adds_subs_avx2(&x1[3], &x1[7]);
1361 btf_16_adds_subs_avx2(&x1[8], &x1[12]);
1362 btf_16_adds_subs_avx2(&x1[9], &x1[13]);
1363 btf_16_adds_subs_avx2(&x1[10], &x1[14]);
1364 btf_16_adds_subs_avx2(&x1[11], &x1[15]);
1365
1366 // stage 6
1367 btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit);
1368 btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x1[10], &x1[11], _r, cos_bit);
1369 btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x1[12], &x1[13], _r, cos_bit);
1370 btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x1[14], &x1[15], _r, cos_bit);
1371
1372 // stage 7
1373 btf_16_adds_subs_avx2(&x1[0], &x1[8]);
1374 btf_16_adds_subs_avx2(&x1[1], &x1[9]);
1375 btf_16_adds_subs_avx2(&x1[2], &x1[10]);
1376 btf_16_adds_subs_avx2(&x1[3], &x1[11]);
1377 btf_16_adds_subs_avx2(&x1[4], &x1[12]);
1378 btf_16_adds_subs_avx2(&x1[5], &x1[13]);
1379 btf_16_adds_subs_avx2(&x1[6], &x1[14]);
1380 btf_16_adds_subs_avx2(&x1[7], &x1[15]);
1381
1382 // stage 8
1383 btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit);
1384 btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit);
1385 btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit);
1386 btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit);
1387 btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit);
1388 btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit);
1389 btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit);
1390 btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit);
1391
1392 // stage 9
1393 output[0] = x1[1];
1394 output[1] = x1[14];
1395 output[2] = x1[3];
1396 output[3] = x1[12];
1397 output[4] = x1[5];
1398 output[5] = x1[10];
1399 output[6] = x1[7];
1400 output[7] = x1[8];
1401 output[8] = x1[9];
1402 output[9] = x1[6];
1403 output[10] = x1[11];
1404 output[11] = x1[4];
1405 output[12] = x1[13];
1406 output[13] = x1[2];
1407 output[14] = x1[15];
1408 output[15] = x1[0];
1409 }
1410
scale_round_avx2(const __m256i a,const int scale)1411 static INLINE __m256i scale_round_avx2(const __m256i a, const int scale) {
1412 const __m256i scale__r = pair_set_w16_epi16(scale, 1 << (NewSqrt2Bits - 1));
1413 const __m256i b = _mm256_madd_epi16(a, scale__r);
1414 return _mm256_srai_epi32(b, NewSqrt2Bits);
1415 }
1416
fidentity16x16_new_avx2(const __m256i * input,__m256i * output,int8_t cos_bit)1417 static INLINE void fidentity16x16_new_avx2(const __m256i *input,
1418 __m256i *output, int8_t cos_bit) {
1419 (void)cos_bit;
1420 const __m256i one = _mm256_set1_epi16(1);
1421
1422 for (int i = 0; i < 16; ++i) {
1423 const __m256i a_lo = _mm256_unpacklo_epi16(input[i], one);
1424 const __m256i a_hi = _mm256_unpackhi_epi16(input[i], one);
1425 const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2);
1426 const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2);
1427 output[i] = _mm256_packs_epi32(b_lo, b_hi);
1428 }
1429 }
1430
fidentity16x32_new_avx2(const __m256i * input,__m256i * output,int8_t cos_bit)1431 static INLINE void fidentity16x32_new_avx2(const __m256i *input,
1432 __m256i *output, int8_t cos_bit) {
1433 (void)cos_bit;
1434 for (int i = 0; i < 32; ++i) {
1435 output[i] = _mm256_slli_epi16(input[i], 2);
1436 }
1437 }
1438
av1_round_shift_array_32_avx2(__m256i * input,__m256i * output,const int size,const int bit)1439 static INLINE void av1_round_shift_array_32_avx2(__m256i *input,
1440 __m256i *output,
1441 const int size,
1442 const int bit) {
1443 if (bit > 0) {
1444 int i;
1445 for (i = 0; i < size; i++) {
1446 output[i] = av1_round_shift_32_avx2(input[i], bit);
1447 }
1448 } else {
1449 int i;
1450 for (i = 0; i < size; i++) {
1451 output[i] = _mm256_slli_epi32(input[i], -bit);
1452 }
1453 }
1454 }
1455
av1_round_shift_rect_array_32_avx2(__m256i * input,__m256i * output,const int size,const int bit)1456 static INLINE void av1_round_shift_rect_array_32_avx2(__m256i *input,
1457 __m256i *output,
1458 const int size,
1459 const int bit) {
1460 const __m256i sqrt2 = _mm256_set1_epi32(NewSqrt2);
1461 if (bit > 0) {
1462 int i;
1463 for (i = 0; i < size; i++) {
1464 const __m256i r0 = av1_round_shift_32_avx2(input[i], bit);
1465 const __m256i r1 = _mm256_mullo_epi32(sqrt2, r0);
1466 output[i] = av1_round_shift_32_avx2(r1, NewSqrt2Bits);
1467 }
1468 } else {
1469 int i;
1470 for (i = 0; i < size; i++) {
1471 const __m256i r0 = _mm256_slli_epi32(input[i], -bit);
1472 const __m256i r1 = _mm256_mullo_epi32(sqrt2, r0);
1473 output[i] = av1_round_shift_32_avx2(r1, NewSqrt2Bits);
1474 }
1475 }
1476 }
1477
transpose_32_8x8_avx2(int stride,const __m256i * inputA,__m256i * output)1478 static INLINE void transpose_32_8x8_avx2(int stride, const __m256i *inputA,
1479 __m256i *output) {
1480 __m256i temp0 = _mm256_unpacklo_epi32(inputA[0], inputA[2]);
1481 __m256i temp1 = _mm256_unpackhi_epi32(inputA[0], inputA[2]);
1482 __m256i temp2 = _mm256_unpacklo_epi32(inputA[1], inputA[3]);
1483 __m256i temp3 = _mm256_unpackhi_epi32(inputA[1], inputA[3]);
1484 __m256i temp4 = _mm256_unpacklo_epi32(inputA[4], inputA[6]);
1485 __m256i temp5 = _mm256_unpackhi_epi32(inputA[4], inputA[6]);
1486 __m256i temp6 = _mm256_unpacklo_epi32(inputA[5], inputA[7]);
1487 __m256i temp7 = _mm256_unpackhi_epi32(inputA[5], inputA[7]);
1488
1489 __m256i t0 = _mm256_unpacklo_epi32(temp0, temp2);
1490 __m256i t1 = _mm256_unpackhi_epi32(temp0, temp2);
1491 __m256i t2 = _mm256_unpacklo_epi32(temp1, temp3);
1492 __m256i t3 = _mm256_unpackhi_epi32(temp1, temp3);
1493 __m256i t4 = _mm256_unpacklo_epi32(temp4, temp6);
1494 __m256i t5 = _mm256_unpackhi_epi32(temp4, temp6);
1495 __m256i t6 = _mm256_unpacklo_epi32(temp5, temp7);
1496 __m256i t7 = _mm256_unpackhi_epi32(temp5, temp7);
1497
1498 output[0 * stride] = _mm256_permute2x128_si256(t0, t4, 0x20);
1499 output[1 * stride] = _mm256_permute2x128_si256(t1, t5, 0x20);
1500 output[2 * stride] = _mm256_permute2x128_si256(t2, t6, 0x20);
1501 output[3 * stride] = _mm256_permute2x128_si256(t3, t7, 0x20);
1502 output[4 * stride] = _mm256_permute2x128_si256(t0, t4, 0x31);
1503 output[5 * stride] = _mm256_permute2x128_si256(t1, t5, 0x31);
1504 output[6 * stride] = _mm256_permute2x128_si256(t2, t6, 0x31);
1505 output[7 * stride] = _mm256_permute2x128_si256(t3, t7, 0x31);
1506 }
1507
1508 // Store 8 16 bit values. Sign extend the values.
store_buffer_16bit_to_32bit_w16_avx2(const __m256i * const in,int32_t * out,const int stride,const int out_size)1509 static INLINE void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in,
1510 int32_t *out,
1511 const int stride,
1512 const int out_size) {
1513 for (int i = 0; i < out_size; ++i) {
1514 _mm256_store_si256((__m256i *)(out),
1515 _mm256_cvtepi16_epi32(_mm256_castsi256_si128(in[i])));
1516 _mm256_store_si256(
1517 (__m256i *)(out + 8),
1518 _mm256_cvtepi16_epi32(_mm256_extracti128_si256(in[i], 1)));
1519 out += stride;
1520 }
1521 }
1522
store_rect_16bit_to_32bit_avx2(const __m256i a,int32_t * const b)1523 static INLINE void store_rect_16bit_to_32bit_avx2(const __m256i a,
1524 int32_t *const b) {
1525 const __m256i one = _mm256_set1_epi16(1);
1526 const __m256i a_reoder = _mm256_permute4x64_epi64(a, 0xd8);
1527 const __m256i a_lo = _mm256_unpacklo_epi16(a_reoder, one);
1528 const __m256i a_hi = _mm256_unpackhi_epi16(a_reoder, one);
1529 const __m256i b_lo = scale_round_avx2(a_lo, NewSqrt2);
1530 const __m256i b_hi = scale_round_avx2(a_hi, NewSqrt2);
1531 _mm256_store_si256((__m256i *)b, b_lo);
1532 _mm256_store_si256((__m256i *)(b + 8), b_hi);
1533 }
1534
store_rect_buffer_16bit_to_32bit_w16_avx2(const __m256i * const in,int32_t * const out,const int stride,const int out_size)1535 static INLINE void store_rect_buffer_16bit_to_32bit_w16_avx2(
1536 const __m256i *const in, int32_t *const out, const int stride,
1537 const int out_size) {
1538 for (int i = 0; i < out_size; ++i) {
1539 store_rect_16bit_to_32bit_avx2(in[i], out + i * stride);
1540 }
1541 }
1542
// Column (vertical) 32-point kernels, indexed by TX_TYPE. Only the DCT and
// identity kernels exist at this length; NULL marks a transform type with no
// kernel in this table, and users of the table call the entry without a NULL
// check.
static const transform_1d_avx2 col_txfm16x32_arr[TX_TYPES] = {
  fdct16x32_new_avx2,       // DCT_DCT
  NULL,                     // ADST_DCT
  NULL,                     // DCT_ADST
  NULL,                     // ADST_ADST
  NULL,                     // FLIPADST_DCT
  NULL,                     // DCT_FLIPADST
  NULL,                     // FLIPADST_FLIPADST
  NULL,                     // ADST_FLIPADST
  NULL,                     // FLIPADST_ADST
  fidentity16x32_new_avx2,  // IDTX
  fdct16x32_new_avx2,       // V_DCT
  fidentity16x32_new_avx2,  // H_DCT
  NULL,                     // V_ADST
  NULL,                     // H_ADST
  NULL,                     // V_FLIPADST
  NULL                      // H_FLIPADST
};
1561
// Row (horizontal) 32-point kernels, indexed by TX_TYPE. Mirrors
// col_txfm16x32_arr with the V_DCT / H_DCT entries swapped; NULL marks a
// transform type with no kernel in this table.
static const transform_1d_avx2 row_txfm16x32_arr[TX_TYPES] = {
  fdct16x32_new_avx2,       // DCT_DCT
  NULL,                     // ADST_DCT
  NULL,                     // DCT_ADST
  NULL,                     // ADST_ADST
  NULL,                     // FLIPADST_DCT
  NULL,                     // DCT_FLIPADST
  NULL,                     // FLIPADST_FLIPADST
  NULL,                     // ADST_FLIPADST
  NULL,                     // FLIPADST_ADST
  fidentity16x32_new_avx2,  // IDTX
  fidentity16x32_new_avx2,  // V_DCT
  fdct16x32_new_avx2,       // H_DCT
  NULL,                     // V_ADST
  NULL,                     // H_ADST
  NULL,                     // V_FLIPADST
  NULL                      // H_FLIPADST
};
1580
// Column (vertical) 16-point kernels, indexed by TX_TYPE. FLIPADST entries
// reuse the ADST kernel; the 2-D drivers in this file implement the flip by
// loading the rows in reverse order (see get_flip_cfg() usage).
static const transform_1d_avx2 col_txfm16x16_arr[TX_TYPES] = {
  fdct16x16_new_avx2,       // DCT_DCT
  fadst16x16_new_avx2,      // ADST_DCT
  fdct16x16_new_avx2,       // DCT_ADST
  fadst16x16_new_avx2,      // ADST_ADST
  fadst16x16_new_avx2,      // FLIPADST_DCT
  fdct16x16_new_avx2,       // DCT_FLIPADST
  fadst16x16_new_avx2,      // FLIPADST_FLIPADST
  fadst16x16_new_avx2,      // ADST_FLIPADST
  fadst16x16_new_avx2,      // FLIPADST_ADST
  fidentity16x16_new_avx2,  // IDTX
  fdct16x16_new_avx2,       // V_DCT
  fidentity16x16_new_avx2,  // H_DCT
  fadst16x16_new_avx2,      // V_ADST
  fidentity16x16_new_avx2,  // H_ADST
  fadst16x16_new_avx2,      // V_FLIPADST
  fidentity16x16_new_avx2   // H_FLIPADST
};
1599
// Row (horizontal) 16-point kernels, indexed by TX_TYPE. FLIPADST entries
// reuse the ADST kernel; the 2-D drivers in this file implement the flip by
// reversing the buffer with flip_buf_avx2() before the row pass.
static const transform_1d_avx2 row_txfm16x16_arr[TX_TYPES] = {
  fdct16x16_new_avx2,       // DCT_DCT
  fdct16x16_new_avx2,       // ADST_DCT
  fadst16x16_new_avx2,      // DCT_ADST
  fadst16x16_new_avx2,      // ADST_ADST
  fdct16x16_new_avx2,       // FLIPADST_DCT
  fadst16x16_new_avx2,      // DCT_FLIPADST
  fadst16x16_new_avx2,      // FLIPADST_FLIPADST
  fadst16x16_new_avx2,      // ADST_FLIPADST
  fadst16x16_new_avx2,      // FLIPADST_ADST
  fidentity16x16_new_avx2,  // IDTX
  fidentity16x16_new_avx2,  // V_DCT
  fdct16x16_new_avx2,       // H_DCT
  fidentity16x16_new_avx2,  // V_ADST
  fadst16x16_new_avx2,      // H_ADST
  fidentity16x16_new_avx2,  // V_FLIPADST
  fadst16x16_new_avx2       // H_FLIPADST
};
1618
// 16x16 forward 2-D transform (low bit depth).
//   input   : 16x16 residual block, `stride` int16 elements between rows.
//   output  : 16x16 int32 coefficients, written with a row pitch of 16.
//   tx_type : selects the column/row kernels and the flip configuration.
//   bd      : unused.
static void lowbd_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  const TX_SIZE tx_size = TX_16X16;
  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type];
  const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type];
  __m256i col_buf[16], row_buf[16];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // Column pass: load (optionally upside down), pre-shift, transform,
  // post-shift, then transpose so the row pass can reuse the column kernels.
  if (ud_flip) {
    load_buffer_16bit_to_16bit_flip_avx2(input, stride, col_buf, height);
  } else {
    load_buffer_16bit_to_16bit_avx2(input, stride, col_buf, height);
  }
  round_shift_16bit_w16_avx2(col_buf, height, shift[0]);
  col_txfm(col_buf, col_buf, cos_bit_col);
  round_shift_16bit_w16_avx2(col_buf, height, shift[1]);
  transpose_16bit_16x16_avx2(col_buf, row_buf);

  // Row pass: a left/right flip is done by reversing into the scratch buffer.
  __m256i *rows = row_buf;
  if (lr_flip) {
    flip_buf_avx2(row_buf, col_buf, width);
    rows = col_buf;
  }
  row_txfm(rows, rows, cos_bit_row);
  round_shift_16bit_w16_avx2(rows, width, shift[2]);
  transpose_16bit_16x16_avx2(rows, rows);
  store_buffer_16bit_to_32bit_w16_avx2(rows, output, width, 16);
}
1659
// 32x32 forward 2-D transform (low bit depth), processed as two 16-column
// strips followed by two 16-row strips.
//   input   : 32x32 residual block, `stride` int16 elements between rows.
//   output  : 32x32 int32 coefficients, row pitch 32.
//   tx_type : selects kernels from the 16x32 tables and the flip configuration.
//   bd      : unused.
static void lowbd_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  const TX_SIZE tx_size = TX_32X32;
  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
  const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type];
  __m256i buf0[32], buf1[128];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // Column pass, one 16-wide strip at a time.
  for (int strip = 0; strip < 2; strip++) {
    const int16_t *src = input + 16 * strip;
    if (ud_flip) {
      load_buffer_16bit_to_16bit_flip_avx2(src, stride, buf0, height);
    } else {
      load_buffer_16bit_to_16bit_avx2(src, stride, buf0, height);
    }
    round_shift_16bit_w16_avx2(buf0, height, shift[0]);
    col_txfm(buf0, buf0, cos_bit_col);
    round_shift_16bit_w16_avx2(buf0, height, shift[1]);
    for (int blk = 0; blk < 2; blk++) {
      transpose_16bit_16x16_avx2(buf0 + blk * 16,
                                 buf1 + blk * width + 16 * strip);
    }
  }

  // Row pass, one group of 16 rows at a time.
  for (int group = 0; group < 2; group++) {
    __m256i *rows = buf1 + width * group;
    if (lr_flip) {
      flip_buf_avx2(rows, buf0, width);
      rows = buf0;
    }
    row_txfm(rows, rows, cos_bit_row);
    round_shift_16bit_w16_avx2(rows, width, shift[2]);

    int32_t *dst = output + 16 * width * group;
    for (int blk = 0; blk < 2; blk++) {
      __m256i *tile = rows + 16 * blk;
      transpose_16bit_16x16_avx2(tile, tile);
      store_buffer_16bit_to_32bit_w16_avx2(tile, dst + 16 * blk, width, 16);
    }
  }
}
1710
// 64x64 forward 2-D transform (low bit depth, DCT_DCT only).
// Only the first 32 rows of column output are carried into the row pass, and
// only the first 32 outputs of each 64-point row transform are stored, so
// `output` receives a 32x32 block of int32 coefficients with row pitch 32.
//   input : 64x64 residual block, `stride` int16 elements between rows.
//   bd    : unused.
static void lowbd_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;
  assert(tx_type == DCT_DCT);
  const TX_SIZE tx_size = TX_64X64;
  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
  const int width_div16 = (width >> 4);
  const int height_div16 = (height >> 4);
  const int kept_groups = AOMMIN(2, height_div16);
  __m256i buf0[64], buf1[256];

  // Column pass in 16-bit precision, 16 columns per iteration.
  for (int strip = 0; strip < width_div16; strip++) {
    load_buffer_16bit_to_16bit_avx2(input + 16 * strip, stride, buf0, height);
    round_shift_16bit_w16_avx2(buf0, height, shift[0]);
    col_txfm(buf0, buf0, cos_bit_col);
    round_shift_16bit_w16_avx2(buf0, height, shift[1]);
    for (int blk = 0; blk < kept_groups; ++blk) {
      transpose_16bit_16x16_avx2(buf0 + blk * 16,
                                 buf1 + blk * width + 16 * strip);
    }
  }

  // Row pass in 32-bit precision: every 16-lane row vector is widened into
  // two 8-lane vectors (A: lanes 0-7, B: lanes 8-15).
  for (int group = 0; group < kept_groups; group++) {
    __m256i bufA[64];
    __m256i bufB[64];
    const __m128i *halves = (const __m128i *)(buf1 + width * group);
    for (int j = 0; j < width; ++j) {
      bufA[j] = _mm256_cvtepi16_epi32(halves[j * 2]);
      bufB[j] = _mm256_cvtepi16_epi32(halves[j * 2 + 1]);
    }
    av1_fdct64_new_avx2(bufA, bufA, cos_bit_row);
    av1_fdct64_new_avx2(bufB, bufB, cos_bit_row);
    av1_round_shift_array_32_avx2(bufA, bufA, 32, -shift[2]);
    av1_round_shift_array_32_avx2(bufB, bufB, 32, -shift[2]);

    // Transpose back in 8x8 tiles; a stride of 4 vectors is 32 int32 per row.
    int32_t *group_out = output + 16 * 32 * group;
    for (int tile = 0; tile < 4; ++tile) {
      __m256i *out = (__m256i *)(group_out + 8 * tile);
      transpose_32_8x8_avx2(4, bufA + 8 * tile, out);
      transpose_32_8x8_avx2(4, bufB + 8 * tile, out + 8 * 4);
    }
  }
}
1760
// 16x32 (16 wide, 32 high) forward 2-D transform (low bit depth).
// The column pass uses the 32-point tables and the row pass the 16-point
// tables; results go through the rectangular store, which applies the
// NewSqrt2 scaling.
//   input   : residual block, `stride` int16 elements between rows.
//   output  : 16x32 int32 coefficients, row pitch 16.
//   tx_type : selects kernels and the flip configuration.
//   bd      : unused.
static void lowbd_fwd_txfm2d_16x32_avx2(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  const TX_SIZE tx_size = TX_16X32;
  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
  const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type];
  __m256i buf0[32], buf1[32];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // Column pass over the single 16-wide strip.
  if (ud_flip) {
    load_buffer_16bit_to_16bit_flip_avx2(input, stride, buf0, height);
  } else {
    load_buffer_16bit_to_16bit_avx2(input, stride, buf0, height);
  }
  round_shift_16bit_w16_avx2(buf0, height, shift[0]);
  col_txfm(buf0, buf0, cos_bit_col);
  round_shift_16bit_w16_avx2(buf0, height, shift[1]);
  for (int blk = 0; blk < 2; blk++) {
    transpose_16bit_16x16_avx2(buf0 + 16 * blk, buf1 + 16 * blk);
  }

  // Row pass, one group of 16 rows at a time.
  for (int group = 0; group < 2; group++) {
    __m256i *rows = buf1 + width * group;
    if (lr_flip) {
      flip_buf_avx2(rows, buf0, width);
      rows = buf0;
    }
    row_txfm(rows, rows, cos_bit_row);
    round_shift_16bit_w16_avx2(rows, width, shift[2]);
    transpose_16bit_16x16_avx2(rows, rows);
    store_rect_buffer_16bit_to_32bit_w16_avx2(
        rows, output + 16 * width * group, width, 16);
  }
}
1805
// 32x16 (32 wide, 16 high) forward 2-D transform (low bit depth).
// The column pass uses the 16-point tables on two 16-column strips; a single
// 32-point row pass follows, and results go through the rectangular store,
// which applies the NewSqrt2 scaling.
//   input   : residual block, `stride` int16 elements between rows.
//   output  : 32x16 int32 coefficients, row pitch 32.
//   tx_type : selects kernels and the flip configuration.
//   bd      : unused.
static void lowbd_fwd_txfm2d_32x16_avx2(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  const int8_t *shift = fwd_txfm_shift_ls[TX_32X16];
  const int txw_idx = get_txw_idx(TX_32X16);
  const int txh_idx = get_txh_idx(TX_32X16);
  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = 32;
  const int height = 16;
  const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type];
  const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type];
  __m256i buf0[32], buf1[64];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // Column pass, one 16-wide strip at a time.
  for (int strip = 0; strip < 2; strip++) {
    const int16_t *src = input + 16 * strip;
    if (ud_flip) {
      load_buffer_16bit_to_16bit_flip_avx2(src, stride, buf0, height);
    } else {
      load_buffer_16bit_to_16bit_avx2(src, stride, buf0, height);
    }
    round_shift_16bit_w16_avx2(buf0, height, shift[0]);
    col_txfm(buf0, buf0, cos_bit_col);
    round_shift_16bit_w16_avx2(buf0, height, shift[1]);
    transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * strip);
  }

  // Row pass over the single group of 16 rows.
  __m256i *rows = buf1;
  if (lr_flip) {
    flip_buf_avx2(buf1, buf0, width);
    rows = buf0;
  }
  row_txfm(rows, rows, cos_bit_row);
  round_shift_16bit_w16_avx2(rows, width, shift[2]);
  for (int blk = 0; blk < 2; blk++) {
    __m256i *tile = rows + 16 * blk;
    transpose_16bit_16x16_avx2(tile, tile);
    store_rect_buffer_16bit_to_32bit_w16_avx2(tile, output + 16 * blk, width,
                                              16);
  }
}
1851
// 64x32 (64 wide, 32 high) forward 2-D transform (low bit depth, DCT_DCT
// only). Only the first 32 outputs of each 64-point row transform are stored,
// so `output` receives a 32x32 block of int32 coefficients with row pitch 32.
// The row results go through av1_round_shift_rect_array_32_avx2(), which adds
// the NewSqrt2 rectangular scaling.
//   input : residual block, `stride` int16 elements between rows.
//   bd    : unused.
static void lowbd_fwd_txfm2d_64x32_avx2(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  // Checked on entry: the row pass is hard-wired to the 64-point DCT, and
  // several col_txfm16x32_arr entries are NULL, so an unsupported type must be
  // rejected before the column kernel is called.
  assert(tx_type == DCT_DCT);
  const TX_SIZE tx_size = TX_64X32;
  __m256i buf0[64], buf1[256];
  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
  const int width_div16 = (width >> 4);
  const int height_div16 = (height >> 4);

  // Column pass in 16-bit precision, 16 columns per iteration.
  for (int i = 0; i < width_div16; i++) {
    load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
    round_shift_16bit_w16_avx2(buf0, height, shift[0]);
    col_txfm(buf0, buf0, cos_bit_col);
    round_shift_16bit_w16_avx2(buf0, height, shift[1]);
    for (int j = 0; j < AOMMIN(4, height_div16); ++j) {
      transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
    }
  }

  // Row pass in 32-bit precision: every 16-lane row vector is widened into
  // two 8-lane vectors (A: lanes 0-7, B: lanes 8-15).
  for (int i = 0; i < AOMMIN(2, height_div16); i++) {
    __m256i bufA[64];
    __m256i bufB[64];
    __m128i *buf = (__m128i *)(buf1 + width * i);
    for (int j = 0; j < width; ++j) {
      bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
      bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
    }
    av1_fdct64_new_avx2(bufA, bufA, cos_bit_row);
    av1_fdct64_new_avx2(bufB, bufB, cos_bit_row);
    av1_round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2]);
    av1_round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2]);

    // Transpose back in 8x8 tiles; a stride of 4 vectors is 32 int32 per row.
    int32_t *output8 = output + 16 * 32 * i;
    for (int j = 0; j < 4; ++j) {
      __m256i *out = (__m256i *)(output8 + 8 * j);
      transpose_32_8x8_avx2(4, bufA + 8 * j, out);
      transpose_32_8x8_avx2(4, bufB + 8 * j, out + 8 * 4);
    }
  }
}
1899
// 32x64 (32 wide, 64 high) forward 2-D transform (low bit depth, DCT_DCT
// only). Only the first 32 rows of column output are carried into the row
// pass, so `output` receives a 32x32 block of int32 coefficients with row
// pitch 32. The row results go through av1_round_shift_rect_array_32_avx2(),
// which adds the NewSqrt2 rectangular scaling.
//   input : residual block, `stride` int16 elements between rows.
//   bd    : unused.
static void lowbd_fwd_txfm2d_32x64_avx2(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;
  assert(tx_type == DCT_DCT);
  const TX_SIZE tx_size = TX_32X64;
  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
  const int width_div16 = (width >> 4);
  const int height_div16 = (height >> 4);
  const int kept_groups = AOMMIN(2, height_div16);
  __m256i buf0[64], buf1[256];

  // Column pass in 16-bit precision, 16 columns per iteration.
  for (int strip = 0; strip < width_div16; strip++) {
    load_buffer_16bit_to_16bit_avx2(input + 16 * strip, stride, buf0, height);
    round_shift_16bit_w16_avx2(buf0, height, shift[0]);
    col_txfm(buf0, buf0, cos_bit_col);
    round_shift_16bit_w16_avx2(buf0, height, shift[1]);
    for (int blk = 0; blk < kept_groups; ++blk) {
      transpose_16bit_16x16_avx2(buf0 + blk * 16,
                                 buf1 + blk * width + 16 * strip);
    }
  }

  // Row pass in 32-bit precision: every 16-lane row vector is widened into
  // two 8-lane vectors (A: lanes 0-7, B: lanes 8-15).
  for (int group = 0; group < kept_groups; group++) {
    __m256i bufA[32];
    __m256i bufB[32];
    const __m128i *halves = (const __m128i *)(buf1 + width * group);
    for (int j = 0; j < width; ++j) {
      bufA[j] = _mm256_cvtepi16_epi32(halves[j * 2]);
      bufB[j] = _mm256_cvtepi16_epi32(halves[j * 2 + 1]);
    }
    av1_fdct32_new_avx2(bufA, bufA, cos_bit_row);
    av1_fdct32_new_avx2(bufB, bufB, cos_bit_row);
    av1_round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2]);
    av1_round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2]);

    // Transpose back in 8x8 tiles; a stride of 4 vectors is 32 int32 per row.
    int32_t *group_out = output + 16 * 32 * group;
    for (int tile = 0; tile < 4; ++tile) {
      __m256i *out = (__m256i *)(group_out + 8 * tile);
      transpose_32_8x8_avx2(4, bufA + 8 * tile, out);
      transpose_32_8x8_avx2(4, bufB + 8 * tile, out + 8 * 4);
    }
  }
}
1949
// 16x64 (16 wide, 64 high) forward 2-D transform (low bit depth, DCT_DCT
// only).
//   input  : residual block, `stride` int16 elements between rows.
//   output : 16x64 int32 coefficients, row pitch 16. Rows 0-31 hold the
//            transform result; rows 32-63 are set to zero.
//   bd     : unused.
// Only the two 16-row groups that survive are transposed and row-transformed:
// the other two would be overwritten in full by the final memset.
static void lowbd_fwd_txfm2d_16x64_avx2(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;
  assert(tx_type == DCT_DCT);
  const TX_SIZE tx_size = TX_16X64;
  __m256i buf0[64], buf1[64];
  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
  const transform_1d_avx2 row_txfm = fdct16x16_new_avx2;
  const int width_div16 = (width >> 4);
  const int height_div16 = (height >> 4);
  // Number of 16-row groups whose coefficients are kept (the top 32 rows).
  const int kept_groups = AOMMIN(2, height_div16);

  // Column pass: the full 64-point transform is still required, but only the
  // first kept_groups * 16 output rows are handed to the row pass.
  for (int i = 0; i < width_div16; i++) {
    load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
    round_shift_16bit_w16_avx2(buf0, height, shift[0]);
    col_txfm(buf0, buf0, cos_bit_col);
    round_shift_16bit_w16_avx2(buf0, height, shift[1]);
    for (int j = 0; j < kept_groups; ++j) {
      transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
    }
  }

  // Row pass over the kept groups.
  for (int i = 0; i < kept_groups; i++) {
    __m256i *buf = buf1 + width * i;
    row_txfm(buf, buf, cos_bit_row);
    round_shift_16bit_w16_avx2(buf, width, shift[2]);
    int32_t *output16 = output + 16 * width * i;
    for (int j = 0; j < width_div16; ++j) {
      __m256i *buf16 = buf + 16 * j;
      transpose_16bit_16x16_avx2(buf16, buf16);
      store_buffer_16bit_to_32bit_w16_avx2(buf16, output16 + 16 * j, width, 16);
    }
  }
  // Zero out the bottom 16x32 area.
  memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
}
1993
// 64x16 (64 wide, 16 high) forward 2-D transform (low bit depth, DCT_DCT
// only). Only the first 32 outputs of each 64-point row transform are stored,
// so `output` receives a 32-wide, 16-high block of int32 coefficients with
// row pitch 32.
//   input : residual block, `stride` int16 elements between rows.
//   bd    : unused.
static void lowbd_fwd_txfm2d_64x16_avx2(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;
  assert(tx_type == DCT_DCT);
  const TX_SIZE tx_size = TX_64X16;
  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  const transform_1d_avx2 col_txfm = fdct16x16_new_avx2;
  const transform_1d_avx2 row_txfm = fdct16x64_new_avx2;
  const int width_div16 = (width >> 4);
  const int height_div16 = (height >> 4);
  __m256i buf0[64], buf1[64];

  // Column pass, 16 columns per iteration.
  for (int strip = 0; strip < width_div16; strip++) {
    load_buffer_16bit_to_16bit_avx2(input + 16 * strip, stride, buf0, height);
    round_shift_16bit_w16_avx2(buf0, height, shift[0]);
    col_txfm(buf0, buf0, cos_bit_col);
    round_shift_16bit_w16_avx2(buf0, height, shift[1]);
    for (int blk = 0; blk < height_div16; ++blk) {
      transpose_16bit_16x16_avx2(buf0 + blk * 16,
                                 buf1 + blk * width + 16 * strip);
    }
  }

  // Row pass; only the first two 16x16 tiles (32 coefficients per row) of
  // each group are stored.
  for (int group = 0; group < height_div16; group++) {
    __m256i *rows = buf1 + width * group;
    row_txfm(rows, rows, cos_bit_row);
    round_shift_16bit_w16_avx2(rows, width, shift[2]);
    int32_t *group_out = output + 16 * 32 * group;
    for (int blk = 0; blk < 2; ++blk) {
      __m256i *tile = rows + 16 * blk;
      transpose_16bit_16x16_avx2(tile, tile);
      store_buffer_16bit_to_32bit_w16_avx2(tile, group_out + 16 * blk, 32, 16);
    }
  }
}
2035
2036 static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
2037 av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform
2038 av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform
2039 lowbd_fwd_txfm2d_16x16_avx2, // 16x16 transform
2040 lowbd_fwd_txfm2d_32x32_avx2, // 32x32 transform
2041 lowbd_fwd_txfm2d_64x64_avx2, // 64x64 transform
2042 av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform
2043 av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform
2044 av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform
2045 av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform
2046 lowbd_fwd_txfm2d_16x32_avx2, // 16x32 transform
2047 lowbd_fwd_txfm2d_32x16_avx2, // 32x16 transform
2048 lowbd_fwd_txfm2d_32x64_avx2, // 32x64 transform
2049 lowbd_fwd_txfm2d_64x32_avx2, // 64x32 transform
2050 av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform
2051 av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform
2052 av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform
2053 av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform
2054 lowbd_fwd_txfm2d_16x64_avx2, // 16x64 transform
2055 lowbd_fwd_txfm2d_64x16_avx2, // 64x16 transform
2056 };
2057
// Low-bitdepth forward transform entry point for the AVX2 build.
//
// Looks up the kernel for txfm_param->tx_size in fwd_txfm2d_func_ls and calls
// it with the transform type and bit depth. Falls back to
// av1_lowbd_fwd_txfm_c when the table has no kernel for that size, or when
// the block is a lossless TX_4X4.
// NOTE(review): the lossless 4x4 case presumably needs a transform the SIMD
// table does not provide - confirm against the C implementation.
void av1_lowbd_fwd_txfm_avx2(const int16_t *src_diff, tran_low_t *coeff,
                             int diff_stride, TxfmParam *txfm_param) {
  const FwdTxfm2dFunc simd_txfm = fwd_txfm2d_func_ls[txfm_param->tx_size];
  const int is_lossless_4x4 =
      txfm_param->lossless && txfm_param->tx_size == TX_4X4;

  if (simd_txfm == NULL || is_lossless_4x4) {
    av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
    return;
  }

  simd_txfm(src_diff, coeff, diff_stride, txfm_param->tx_type,
            txfm_param->bd);
}
2069