1 /*
2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include "config/aom_config.h"
13
14 #include "config/av1_rtcd.h"
15
16 #include "av1/common/av1_inv_txfm1d_cfg.h"
17 #include "av1/common/x86/av1_txfm_sse2.h"
18 #include "av1/common/x86/av1_inv_txfm_avx2.h"
19 #include "av1/common/x86/av1_inv_txfm_ssse3.h"
20
21 // TODO(venkatsanampudi@ittiam.com): move this to header file
22
23 // Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5
24 static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
25 4 * 5793 };
26
idct16_stage5_avx2(__m256i * x1,const int32_t * cospi,const __m256i _r,int8_t cos_bit)27 static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi,
28 const __m256i _r, int8_t cos_bit) {
29 const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
30 const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
31 btf_16_adds_subs_avx2(&x1[0], &x1[3]);
32 btf_16_adds_subs_avx2(&x1[1], &x1[2]);
33 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
34
35 btf_16_adds_subs_avx2(&x1[8], &x1[11]);
36 btf_16_adds_subs_avx2(&x1[9], &x1[10]);
37 btf_16_adds_subs_avx2(&x1[15], &x1[12]);
38 btf_16_adds_subs_avx2(&x1[14], &x1[13]);
39 }
40
idct16_stage6_avx2(__m256i * x,const int32_t * cospi,const __m256i _r,int8_t cos_bit)41 static INLINE void idct16_stage6_avx2(__m256i *x, const int32_t *cospi,
42 const __m256i _r, int8_t cos_bit) {
43 const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
44 const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
45 btf_16_adds_subs_avx2(&x[0], &x[7]);
46 btf_16_adds_subs_avx2(&x[1], &x[6]);
47 btf_16_adds_subs_avx2(&x[2], &x[5]);
48 btf_16_adds_subs_avx2(&x[3], &x[4]);
49 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
50 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
51 }
52
idct16_stage7_avx2(__m256i * output,__m256i * x1)53 static INLINE void idct16_stage7_avx2(__m256i *output, __m256i *x1) {
54 btf_16_adds_subs_out_avx2(&output[0], &output[15], x1[0], x1[15]);
55 btf_16_adds_subs_out_avx2(&output[1], &output[14], x1[1], x1[14]);
56 btf_16_adds_subs_out_avx2(&output[2], &output[13], x1[2], x1[13]);
57 btf_16_adds_subs_out_avx2(&output[3], &output[12], x1[3], x1[12]);
58 btf_16_adds_subs_out_avx2(&output[4], &output[11], x1[4], x1[11]);
59 btf_16_adds_subs_out_avx2(&output[5], &output[10], x1[5], x1[10]);
60 btf_16_adds_subs_out_avx2(&output[6], &output[9], x1[6], x1[9]);
61 btf_16_adds_subs_out_avx2(&output[7], &output[8], x1[7], x1[8]);
62 }
63
// Full 16-point inverse DCT on 16 lanes of 16-bit coefficients held in
// AVX2 registers.  input/output: arrays of 16 __m256i rows.  cos_bit is
// ignored; the fixed INV_COS_BIT precision is used throughout.
static void idct16_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) {
  (void)(cos_bit);
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // Rounding offset added inside the fixed-point butterfly rotations.
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  // Packed 16-bit (a, b) cosine pairs for the rotation butterflies below.
  __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
  __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
  __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
  __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
  __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
  __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
  __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
  __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
  __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
  __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
  __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
  __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
  __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
  __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
  __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);

  // stage 1: bit-reversal permutation of the input rows.
  __m256i x1[16];
  x1[0] = input[0];
  x1[1] = input[8];
  x1[2] = input[4];
  x1[3] = input[12];
  x1[4] = input[2];
  x1[5] = input[10];
  x1[6] = input[6];
  x1[7] = input[14];
  x1[8] = input[1];
  x1[9] = input[9];
  x1[10] = input[5];
  x1[11] = input[13];
  x1[12] = input[3];
  x1[13] = input[11];
  x1[14] = input[7];
  x1[15] = input[15];

  // stage 2: rotations on the odd-frequency terms x1[8..15].
  btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, cos_bit);
  btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, cos_bit);

  // stage 3: rotations on x1[4..7], add/sub butterflies on x1[8..15].
  btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, cos_bit);
  btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
  btf_16_adds_subs_avx2(&x1[15], &x1[14]);

  // stage 4: rotations on x1[0..3] and x1[9]/x1[14], x1[10]/x1[13];
  // add/sub butterflies on x1[4..7].
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
  btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);

  // stages 5-7 are shared with the low-coefficient variants.
  idct16_stage5_avx2(x1, cospi, _r, cos_bit);
  idct16_stage6_avx2(x1, cospi, _r, cos_bit);
  idct16_stage7_avx2(output, x1);
}
134
// idct16 specialised for inputs whose rows 8..15 are all zero: only
// input[0..7] are read, and rotations with a known-zero operand use the
// single-input (btf_16_w16_0_avx2) butterfly form.  cos_bit is ignored.
static void idct16_low8_avx2(const __m256i *input, __m256i *output,
                             int8_t cos_bit) {
  (void)(cos_bit);
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // Rounding offset for the fixed-point rotations.
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);

  // stage 1: bit-reversal placement of the 8 non-zero inputs.
  __m256i x1[16];
  x1[0] = input[0];
  x1[2] = input[4];
  x1[4] = input[2];
  x1[6] = input[6];
  x1[8] = input[1];
  x1[10] = input[5];
  x1[12] = input[3];
  x1[14] = input[7];

  // stage 2: single-input rotations (the zero partner is implicit).
  btf_16_w16_0_avx2(cospi[60], cospi[4], x1[8], x1[8], x1[15]);
  btf_16_w16_0_avx2(-cospi[36], cospi[28], x1[14], x1[9], x1[14]);
  btf_16_w16_0_avx2(cospi[44], cospi[20], x1[10], x1[10], x1[13]);
  btf_16_w16_0_avx2(-cospi[52], cospi[12], x1[12], x1[11], x1[12]);

  // stage 3
  btf_16_w16_0_avx2(cospi[56], cospi[8], x1[4], x1[4], x1[7]);
  btf_16_w16_0_avx2(-cospi[40], cospi[24], x1[6], x1[5], x1[6]);
  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
  btf_16_adds_subs_avx2(&x1[15], &x1[14]);

  // stage 4
  btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);
  btf_16_w16_0_avx2(cospi[48], cospi[16], x1[2], x1[2], x1[3]);
  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);

  // stages 5-7 are shared with the full idct16.
  idct16_stage5_avx2(x1, cospi, _r, cos_bit);
  idct16_stage6_avx2(x1, cospi, _r, cos_bit);
  idct16_stage7_avx2(output, x1);
}
182
idct16_low1_avx2(const __m256i * input,__m256i * output,int8_t cos_bit)183 static void idct16_low1_avx2(const __m256i *input, __m256i *output,
184 int8_t cos_bit) {
185 (void)(cos_bit);
186 const int32_t *cospi = cospi_arr(INV_COS_BIT);
187
188 // stage 1
189 __m256i x1[2];
190 x1[0] = input[0];
191
192 // stage 2
193 // stage 3
194 // stage 4
195 btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);
196
197 // stage 5
198 // stage 6
199 output[0] = x1[0];
200 output[1] = x1[1];
201 output[2] = x1[1];
202 output[3] = x1[0];
203 output[4] = x1[0];
204 output[5] = x1[1];
205 output[6] = x1[1];
206 output[7] = x1[0];
207 output[8] = x1[0];
208 output[9] = x1[1];
209 output[10] = x1[1];
210 output[11] = x1[0];
211 output[12] = x1[0];
212 output[13] = x1[1];
213 output[14] = x1[1];
214 output[15] = x1[0];
215 }
216
iadst16_stage3_avx2(__m256i * x)217 static INLINE void iadst16_stage3_avx2(__m256i *x) {
218 btf_16_adds_subs_avx2(&x[0], &x[8]);
219 btf_16_adds_subs_avx2(&x[1], &x[9]);
220 btf_16_adds_subs_avx2(&x[2], &x[10]);
221 btf_16_adds_subs_avx2(&x[3], &x[11]);
222 btf_16_adds_subs_avx2(&x[4], &x[12]);
223 btf_16_adds_subs_avx2(&x[5], &x[13]);
224 btf_16_adds_subs_avx2(&x[6], &x[14]);
225 btf_16_adds_subs_avx2(&x[7], &x[15]);
226 }
227
iadst16_stage4_avx2(__m256i * x,const int32_t * cospi,const __m256i _r,int8_t cos_bit)228 static INLINE void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi,
229 const __m256i _r, int8_t cos_bit) {
230 const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
231 const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
232 const __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
233 const __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
234 const __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]);
235 const __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]);
236 btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x[8], &x[9], _r, cos_bit);
237 btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x[10], &x[11], _r, cos_bit);
238 btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x[12], &x[13], _r, cos_bit);
239 btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x[14], &x[15], _r, cos_bit);
240 }
241
iadst16_stage5_avx2(__m256i * x)242 static INLINE void iadst16_stage5_avx2(__m256i *x) {
243 btf_16_adds_subs_avx2(&x[0], &x[4]);
244 btf_16_adds_subs_avx2(&x[1], &x[5]);
245 btf_16_adds_subs_avx2(&x[2], &x[6]);
246 btf_16_adds_subs_avx2(&x[3], &x[7]);
247 btf_16_adds_subs_avx2(&x[8], &x[12]);
248 btf_16_adds_subs_avx2(&x[9], &x[13]);
249 btf_16_adds_subs_avx2(&x[10], &x[14]);
250 btf_16_adds_subs_avx2(&x[11], &x[15]);
251 }
252
iadst16_stage6_avx2(__m256i * x,const int32_t * cospi,const __m256i _r,int8_t cos_bit)253 static INLINE void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi,
254 const __m256i _r, int8_t cos_bit) {
255 const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
256 const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
257 const __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
258 btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[4], &x[5], _r, cos_bit);
259 btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[6], &x[7], _r, cos_bit);
260 btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[12], &x[13], _r, cos_bit);
261 btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[14], &x[15], _r, cos_bit);
262 }
263
iadst16_stage7_avx2(__m256i * x)264 static INLINE void iadst16_stage7_avx2(__m256i *x) {
265 btf_16_adds_subs_avx2(&x[0], &x[2]);
266 btf_16_adds_subs_avx2(&x[1], &x[3]);
267 btf_16_adds_subs_avx2(&x[4], &x[6]);
268 btf_16_adds_subs_avx2(&x[5], &x[7]);
269 btf_16_adds_subs_avx2(&x[8], &x[10]);
270 btf_16_adds_subs_avx2(&x[9], &x[11]);
271 btf_16_adds_subs_avx2(&x[12], &x[14]);
272 btf_16_adds_subs_avx2(&x[13], &x[15]);
273 }
274
iadst16_stage8_avx2(__m256i * x1,const int32_t * cospi,const __m256i _r,int8_t cos_bit)275 static INLINE void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi,
276 const __m256i _r, int8_t cos_bit) {
277 const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
278 const __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
279 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit);
280 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit);
281 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit);
282 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit);
283 }
284
iadst16_stage9_avx2(__m256i * output,__m256i * x1)285 static INLINE void iadst16_stage9_avx2(__m256i *output, __m256i *x1) {
286 const __m256i __zero = _mm256_setzero_si256();
287 output[0] = x1[0];
288 output[1] = _mm256_subs_epi16(__zero, x1[8]);
289 output[2] = x1[12];
290 output[3] = _mm256_subs_epi16(__zero, x1[4]);
291 output[4] = x1[6];
292 output[5] = _mm256_subs_epi16(__zero, x1[14]);
293 output[6] = x1[10];
294 output[7] = _mm256_subs_epi16(__zero, x1[2]);
295 output[8] = x1[3];
296 output[9] = _mm256_subs_epi16(__zero, x1[11]);
297 output[10] = x1[15];
298 output[11] = _mm256_subs_epi16(__zero, x1[7]);
299 output[12] = x1[5];
300 output[13] = _mm256_subs_epi16(__zero, x1[13]);
301 output[14] = x1[9];
302 output[15] = _mm256_subs_epi16(__zero, x1[1]);
303 }
304
// Full 16-point inverse ADST on 16 lanes of 16-bit coefficients.
// input/output: arrays of 16 __m256i rows.  cos_bit is ignored; the
// fixed INV_COS_BIT precision is used throughout.
static void iadst16_avx2(const __m256i *input, __m256i *output,
                         int8_t cos_bit) {
  (void)(cos_bit);
  const int32_t *cospi = cospi_arr(INV_COS_BIT);

  // Rounding offset added inside the fixed-point butterfly rotations.
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  // Packed 16-bit (a, b) cosine pairs for the stage-2 rotations.
  __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
  __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
  __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
  __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
  __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
  __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
  __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
  __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
  __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
  __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
  __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
  __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
  __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
  __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
  __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
  __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);

  // stage 1: iadst-specific input permutation.
  __m256i x1[16];
  x1[0] = input[15];
  x1[1] = input[0];
  x1[2] = input[13];
  x1[3] = input[2];
  x1[4] = input[11];
  x1[5] = input[4];
  x1[6] = input[9];
  x1[7] = input[6];
  x1[8] = input[7];
  x1[9] = input[8];
  x1[10] = input[5];
  x1[11] = input[10];
  x1[12] = input[3];
  x1[13] = input[12];
  x1[14] = input[1];
  x1[15] = input[14];

  // stage 2: rotate each (even, odd) pair.
  btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit);
  btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit);
  btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit);
  btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit);
  btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit);
  btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit);
  btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit);

  // stages 3-9 are shared with the low-coefficient variants.
  iadst16_stage3_avx2(x1);
  iadst16_stage4_avx2(x1, cospi, _r, cos_bit);
  iadst16_stage5_avx2(x1);
  iadst16_stage6_avx2(x1, cospi, _r, cos_bit);
  iadst16_stage7_avx2(x1);
  iadst16_stage8_avx2(x1, cospi, _r, cos_bit);
  iadst16_stage9_avx2(output, x1);
}
366
iadst16_low8_avx2(const __m256i * input,__m256i * output,int8_t cos_bit)367 static void iadst16_low8_avx2(const __m256i *input, __m256i *output,
368 int8_t cos_bit) {
369 (void)(cos_bit);
370 const int32_t *cospi = cospi_arr(INV_COS_BIT);
371 const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
372
373 // stage 1
374 __m256i x1[16];
375 x1[1] = input[0];
376 x1[3] = input[2];
377 x1[5] = input[4];
378 x1[7] = input[6];
379 x1[8] = input[7];
380 x1[10] = input[5];
381 x1[12] = input[3];
382 x1[14] = input[1];
383
384 // stage 2
385 btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);
386 btf_16_w16_0_avx2(cospi[54], -cospi[10], x1[3], x1[2], x1[3]);
387 btf_16_w16_0_avx2(cospi[46], -cospi[18], x1[5], x1[4], x1[5]);
388 btf_16_w16_0_avx2(cospi[38], -cospi[26], x1[7], x1[6], x1[7]);
389 btf_16_w16_0_avx2(cospi[34], cospi[30], x1[8], x1[8], x1[9]);
390 btf_16_w16_0_avx2(cospi[42], cospi[22], x1[10], x1[10], x1[11]);
391 btf_16_w16_0_avx2(cospi[50], cospi[14], x1[12], x1[12], x1[13]);
392 btf_16_w16_0_avx2(cospi[58], cospi[06], x1[14], x1[14], x1[15]);
393
394 iadst16_stage3_avx2(x1);
395 iadst16_stage4_avx2(x1, cospi, _r, cos_bit);
396 iadst16_stage5_avx2(x1);
397 iadst16_stage6_avx2(x1, cospi, _r, cos_bit);
398 iadst16_stage7_avx2(x1);
399 iadst16_stage8_avx2(x1, cospi, _r, cos_bit);
400 iadst16_stage9_avx2(output, x1);
401 }
402
// iadst16 specialised for a single non-zero coefficient: reads only
// input[0].  Intermediate values are propagated by copying instead of
// add/sub butterflies, since the other operand is zero.  cos_bit is
// ignored.
static void iadst16_low1_avx2(const __m256i *input, __m256i *output,
                              int8_t cos_bit) {
  (void)(cos_bit);
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // Rounding offset for the fixed-point rotations.
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
  const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
  const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
  const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);

  // stage 1
  __m256i x1[16];
  x1[1] = input[0];

  // stage 2: single-input rotation.
  btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);

  // stage 3: the add/sub with zero partners reduces to a copy.
  x1[8] = x1[0];
  x1[9] = x1[1];

  // stage 4
  btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit);

  // stage 5: copies again (zero partners).
  x1[4] = x1[0];
  x1[5] = x1[1];

  x1[12] = x1[8];
  x1[13] = x1[9];

  // stage 6
  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit);
  btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit);

  // stage 7: copies (zero partners).
  x1[2] = x1[0];
  x1[3] = x1[1];
  x1[6] = x1[4];
  x1[7] = x1[5];
  x1[10] = x1[8];
  x1[11] = x1[9];
  x1[14] = x1[12];
  x1[15] = x1[13];

  // stages 8-9 are shared with the full iadst16.
  iadst16_stage8_avx2(x1, cospi, _r, cos_bit);
  iadst16_stage9_avx2(output, x1);
}
452
idct32_high16_stage3_avx2(__m256i * x)453 static INLINE void idct32_high16_stage3_avx2(__m256i *x) {
454 btf_16_adds_subs_avx2(&x[16], &x[17]);
455 btf_16_adds_subs_avx2(&x[19], &x[18]);
456 btf_16_adds_subs_avx2(&x[20], &x[21]);
457 btf_16_adds_subs_avx2(&x[23], &x[22]);
458 btf_16_adds_subs_avx2(&x[24], &x[25]);
459 btf_16_adds_subs_avx2(&x[27], &x[26]);
460 btf_16_adds_subs_avx2(&x[28], &x[29]);
461 btf_16_adds_subs_avx2(&x[31], &x[30]);
462 }
463
idct32_high16_stage4_avx2(__m256i * x,const int32_t * cospi,const __m256i _r,int8_t cos_bit)464 static INLINE void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi,
465 const __m256i _r, int8_t cos_bit) {
466 const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
467 const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
468 const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
469 const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
470 const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
471 const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
472 btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
473 btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit);
474 btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit);
475 btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
476 }
477
idct32_high24_stage5_avx2(__m256i * x,const int32_t * cospi,const __m256i _r,int8_t cos_bit)478 static INLINE void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi,
479 const __m256i _r, int8_t cos_bit) {
480 const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
481 const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
482 const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
483 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
484 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
485 btf_16_adds_subs_avx2(&x[16], &x[19]);
486 btf_16_adds_subs_avx2(&x[17], &x[18]);
487 btf_16_adds_subs_avx2(&x[23], &x[20]);
488 btf_16_adds_subs_avx2(&x[22], &x[21]);
489 btf_16_adds_subs_avx2(&x[24], &x[27]);
490 btf_16_adds_subs_avx2(&x[25], &x[26]);
491 btf_16_adds_subs_avx2(&x[31], &x[28]);
492 btf_16_adds_subs_avx2(&x[30], &x[29]);
493 }
494
idct32_high28_stage6_avx2(__m256i * x,const int32_t * cospi,const __m256i _r,int8_t cos_bit)495 static INLINE void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi,
496 const __m256i _r, int8_t cos_bit) {
497 const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
498 const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
499 const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
500 const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
501 const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
502 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
503 btf_16_adds_subs_avx2(&x[8], &x[11]);
504 btf_16_adds_subs_avx2(&x[9], &x[10]);
505 btf_16_adds_subs_avx2(&x[15], &x[12]);
506 btf_16_adds_subs_avx2(&x[14], &x[13]);
507 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit);
508 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit);
509 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit);
510 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit);
511 }
512
idct32_stage7_avx2(__m256i * x,const int32_t * cospi,const __m256i _r,int8_t cos_bit)513 static INLINE void idct32_stage7_avx2(__m256i *x, const int32_t *cospi,
514 const __m256i _r, int8_t cos_bit) {
515 const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
516 const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
517 btf_16_adds_subs_avx2(&x[0], &x[7]);
518 btf_16_adds_subs_avx2(&x[1], &x[6]);
519 btf_16_adds_subs_avx2(&x[2], &x[5]);
520 btf_16_adds_subs_avx2(&x[3], &x[4]);
521 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
522 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
523 btf_16_adds_subs_avx2(&x[16], &x[23]);
524 btf_16_adds_subs_avx2(&x[17], &x[22]);
525 btf_16_adds_subs_avx2(&x[18], &x[21]);
526 btf_16_adds_subs_avx2(&x[19], &x[20]);
527 btf_16_adds_subs_avx2(&x[31], &x[24]);
528 btf_16_adds_subs_avx2(&x[30], &x[25]);
529 btf_16_adds_subs_avx2(&x[29], &x[26]);
530 btf_16_adds_subs_avx2(&x[28], &x[27]);
531 }
532
idct32_stage8_avx2(__m256i * x,const int32_t * cospi,const __m256i _r,int8_t cos_bit)533 static INLINE void idct32_stage8_avx2(__m256i *x, const int32_t *cospi,
534 const __m256i _r, int8_t cos_bit) {
535 const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
536 const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
537 btf_16_adds_subs_avx2(&x[0], &x[15]);
538 btf_16_adds_subs_avx2(&x[1], &x[14]);
539 btf_16_adds_subs_avx2(&x[2], &x[13]);
540 btf_16_adds_subs_avx2(&x[3], &x[12]);
541 btf_16_adds_subs_avx2(&x[4], &x[11]);
542 btf_16_adds_subs_avx2(&x[5], &x[10]);
543 btf_16_adds_subs_avx2(&x[6], &x[9]);
544 btf_16_adds_subs_avx2(&x[7], &x[8]);
545 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit);
546 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit);
547 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit);
548 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit);
549 }
550
idct32_stage9_avx2(__m256i * output,__m256i * x)551 static INLINE void idct32_stage9_avx2(__m256i *output, __m256i *x) {
552 btf_16_adds_subs_out_avx2(&output[0], &output[31], x[0], x[31]);
553 btf_16_adds_subs_out_avx2(&output[1], &output[30], x[1], x[30]);
554 btf_16_adds_subs_out_avx2(&output[2], &output[29], x[2], x[29]);
555 btf_16_adds_subs_out_avx2(&output[3], &output[28], x[3], x[28]);
556 btf_16_adds_subs_out_avx2(&output[4], &output[27], x[4], x[27]);
557 btf_16_adds_subs_out_avx2(&output[5], &output[26], x[5], x[26]);
558 btf_16_adds_subs_out_avx2(&output[6], &output[25], x[6], x[25]);
559 btf_16_adds_subs_out_avx2(&output[7], &output[24], x[7], x[24]);
560 btf_16_adds_subs_out_avx2(&output[8], &output[23], x[8], x[23]);
561 btf_16_adds_subs_out_avx2(&output[9], &output[22], x[9], x[22]);
562 btf_16_adds_subs_out_avx2(&output[10], &output[21], x[10], x[21]);
563 btf_16_adds_subs_out_avx2(&output[11], &output[20], x[11], x[20]);
564 btf_16_adds_subs_out_avx2(&output[12], &output[19], x[12], x[19]);
565 btf_16_adds_subs_out_avx2(&output[13], &output[18], x[13], x[18]);
566 btf_16_adds_subs_out_avx2(&output[14], &output[17], x[14], x[17]);
567 btf_16_adds_subs_out_avx2(&output[15], &output[16], x[15], x[16]);
568 }
569
idct32_low1_avx2(const __m256i * input,__m256i * output,int8_t cos_bit)570 static void idct32_low1_avx2(const __m256i *input, __m256i *output,
571 int8_t cos_bit) {
572 (void)cos_bit;
573 const int32_t *cospi = cospi_arr(INV_COS_BIT);
574
575 // stage 1
576 __m256i x[2];
577 x[0] = input[0];
578
579 // stage 2
580 // stage 3
581 // stage 4
582 // stage 5
583 btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
584
585 // stage 6
586 // stage 7
587 // stage 8
588 // stage 9
589 output[0] = x[0];
590 output[31] = x[0];
591 output[1] = x[1];
592 output[30] = x[1];
593 output[2] = x[1];
594 output[29] = x[1];
595 output[3] = x[0];
596 output[28] = x[0];
597 output[4] = x[0];
598 output[27] = x[0];
599 output[5] = x[1];
600 output[26] = x[1];
601 output[6] = x[1];
602 output[25] = x[1];
603 output[7] = x[0];
604 output[24] = x[0];
605 output[8] = x[0];
606 output[23] = x[0];
607 output[9] = x[1];
608 output[22] = x[1];
609 output[10] = x[1];
610 output[21] = x[1];
611 output[11] = x[0];
612 output[20] = x[0];
613 output[12] = x[0];
614 output[19] = x[0];
615 output[13] = x[1];
616 output[18] = x[1];
617 output[14] = x[1];
618 output[17] = x[1];
619 output[15] = x[0];
620 output[16] = x[0];
621 }
622
// idct32 specialised for inputs whose rows 8..31 are all zero: only
// input[0..7] are read.  Rotations with a known-zero operand use the
// single-input (btf_16_w16_0_avx2) form, and add/sub butterflies with a
// zero partner collapse to copies.  cos_bit is ignored.
static void idct32_low8_avx2(const __m256i *input, __m256i *output,
                             int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // Rounding offset for the fixed-point rotations.
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  // stage 1: bit-reversal placement of the 8 non-zero inputs.
  __m256i x[32];
  x[0] = input[0];
  x[4] = input[4];
  x[8] = input[2];
  x[12] = input[6];
  x[16] = input[1];
  x[20] = input[5];
  x[24] = input[3];
  x[28] = input[7];

  // stage 2: single-input rotations on the high half.
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);

  // stage 3: rotations on x[8..15]; high-half butterflies become copies.
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  x[17] = x[16];
  x[18] = x[19];
  x[21] = x[20];
  x[22] = x[23];
  x[25] = x[24];
  x[26] = x[27];
  x[29] = x[28];
  x[30] = x[31];

  // stage 4: rotation on x[4..7]; mid-range butterflies become copies.
  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  x[9] = x[8];
  x[10] = x[11];
  x[13] = x[12];
  x[14] = x[15];
  idct32_high16_stage4_avx2(x, cospi, _r, cos_bit);

  // stage 5
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  x[5] = x[4];
  x[6] = x[7];
  idct32_high24_stage5_avx2(x, cospi, _r, cos_bit);
  // stage 6
  x[3] = x[0];
  x[2] = x[1];
  idct32_high28_stage6_avx2(x, cospi, _r, cos_bit);

  // stages 7-9 are shared with the full idct32.
  idct32_stage7_avx2(x, cospi, _r, cos_bit);
  idct32_stage8_avx2(x, cospi, _r, cos_bit);
  idct32_stage9_avx2(output, x);
}
680
// idct32 specialised for inputs whose rows 16..31 are all zero: only
// input[0..15] are read, and rotations with a known-zero operand use the
// single-input (btf_16_w16_0_avx2) form.  cos_bit is ignored.
static void idct32_low16_avx2(const __m256i *input, __m256i *output,
                              int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // Rounding offset for the fixed-point rotations.
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  // stage 1: bit-reversal placement of the 16 non-zero inputs.
  __m256i x[32];
  x[0] = input[0];
  x[2] = input[8];
  x[4] = input[4];
  x[6] = input[12];
  x[8] = input[2];
  x[10] = input[10];
  x[12] = input[6];
  x[14] = input[14];
  x[16] = input[1];
  x[18] = input[9];
  x[20] = input[5];
  x[22] = input[13];
  x[24] = input[3];
  x[26] = input[11];
  x[28] = input[7];
  x[30] = input[15];

  // stage 2: single-input rotations on the high half.
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
  btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
  btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);

  // stage 3
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
  btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  idct32_high16_stage3_avx2(x);

  // stage 4
  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
  btf_16_adds_subs_avx2(&x[8], &x[9]);
  btf_16_adds_subs_avx2(&x[11], &x[10]);
  btf_16_adds_subs_avx2(&x[12], &x[13]);
  btf_16_adds_subs_avx2(&x[15], &x[14]);
  idct32_high16_stage4_avx2(x, cospi, _r, cos_bit);

  // stage 5
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
  btf_16_adds_subs_avx2(&x[4], &x[5]);
  btf_16_adds_subs_avx2(&x[7], &x[6]);
  idct32_high24_stage5_avx2(x, cospi, _r, cos_bit);

  // stage 6
  btf_16_adds_subs_avx2(&x[0], &x[3]);
  btf_16_adds_subs_avx2(&x[1], &x[2]);
  idct32_high28_stage6_avx2(x, cospi, _r, cos_bit);

  // stages 7-9 are shared with the full idct32.
  idct32_stage7_avx2(x, cospi, _r, cos_bit);
  idct32_stage8_avx2(x, cospi, _r, cos_bit);
  idct32_stage9_avx2(output, x);
}
747
// Full 32-point inverse DCT, 16 columns at a time (16-bit lanes in __m256i).
// `input` holds 32 coefficient rows; they are loaded in bit-reversed index
// order in stage 1, then flow through the standard 9-stage butterfly network.
// `cos_bit` is ignored: the rounding shift is fixed at INV_COS_BIT.
static void idct32_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) {
  (void)(cos_bit);
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // Rounding offset added before the right shift inside the butterflies.
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  // Packed (w0, w1) cosine-weight pairs for the stage rotations below.
  __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
  __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
  __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
  __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
  __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
  __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
  __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
  __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
  __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
  __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
  __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
  __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
  __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
  __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
  __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
  __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
  __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
  __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
  __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
  __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
  __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
  __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
  __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
  __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
  __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
  __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
  __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
  __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
  __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
  __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);

  // stage 1: permute inputs into bit-reversed order (x1[k] = input[rev5(k)]).
  __m256i x1[32];
  x1[0] = input[0];
  x1[1] = input[16];
  x1[2] = input[8];
  x1[3] = input[24];
  x1[4] = input[4];
  x1[5] = input[20];
  x1[6] = input[12];
  x1[7] = input[28];
  x1[8] = input[2];
  x1[9] = input[18];
  x1[10] = input[10];
  x1[11] = input[26];
  x1[12] = input[6];
  x1[13] = input[22];
  x1[14] = input[14];
  x1[15] = input[30];
  x1[16] = input[1];
  x1[17] = input[17];
  x1[18] = input[9];
  x1[19] = input[25];
  x1[20] = input[5];
  x1[21] = input[21];
  x1[22] = input[13];
  x1[23] = input[29];
  x1[24] = input[3];
  x1[25] = input[19];
  x1[26] = input[11];
  x1[27] = input[27];
  x1[28] = input[7];
  x1[29] = input[23];
  x1[30] = input[15];
  x1[31] = input[31];

  // stage 2: rotations over the odd-frequency half (x1[16..31]).
  btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, &x1[16], &x1[31], _r, cos_bit);
  btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, &x1[17], &x1[30], _r, cos_bit);
  btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, &x1[18], &x1[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, &x1[19], &x1[28], _r, cos_bit);
  btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, &x1[20], &x1[27], _r, cos_bit);
  btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, &x1[21], &x1[26], _r, cos_bit);
  btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, &x1[22], &x1[25], _r, cos_bit);
  btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, &x1[23], &x1[24], _r, cos_bit);

  // stage 3: rotations on x1[8..15]; helper completes the high half.
  btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, cos_bit);
  btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, cos_bit);
  idct32_high16_stage3_avx2(x1);

  // stage 4: rotations on x1[4..7] and add/sub butterflies on x1[8..15].
  btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, cos_bit);
  btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
  idct32_high16_stage4_avx2(x1, cospi, _r, cos_bit);

  // stage 5: DC/Nyquist and 16/48 rotations plus low-half butterflies.
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
  btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
  idct32_high24_stage5_avx2(x1, cospi, _r, cos_bit);

  // stage 6
  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
  idct32_high28_stage6_avx2(x1, cospi, _r, cos_bit);

  // stages 7-9: remaining butterflies, final mirror into output[0..31].
  idct32_stage7_avx2(x1, cospi, _r, cos_bit);
  idct32_stage8_avx2(x1, cospi, _r, cos_bit);
  idct32_stage9_avx2(output, x1);
}
863
// Stage 4 of the 64-point inverse DCT, applied only to the upper 32 lanes
// (x[32..63]): eight paired rotations with the +/-4, 28/36, 20/44 and 12/52
// cosine weights. `cos_bit` is unused; the shift is fixed at INV_COS_BIT.
static INLINE void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  (void)cos_bit;
  const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
  const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
  const __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]);
  const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
  const __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
  const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
  const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
  const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
  const __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]);
  const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
  const __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
  const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
  // Each pair (lo, hi) is rotated in place; the second call of each pair
  // uses the negated-weight variants required by the butterfly structure.
  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit);
  btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x[34], &x[61], _r, cos_bit);
  btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x[37], &x[58], _r, cos_bit);
  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit);
  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit);
  btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x[42], &x[53], _r, cos_bit);
  btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x[45], &x[50], _r, cos_bit);
  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit);
}
888
idct64_stage5_high48_avx2(__m256i * x,const int32_t * cospi,const __m256i _r,int8_t cos_bit)889 static INLINE void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi,
890 const __m256i _r, int8_t cos_bit) {
891 (void)cos_bit;
892 const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
893 const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
894 const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
895 const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
896 const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
897 const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
898 btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
899 btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit);
900 btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit);
901 btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
902 btf_16_adds_subs_avx2(&x[32], &x[35]);
903 btf_16_adds_subs_avx2(&x[33], &x[34]);
904 btf_16_adds_subs_avx2(&x[39], &x[36]);
905 btf_16_adds_subs_avx2(&x[38], &x[37]);
906 btf_16_adds_subs_avx2(&x[40], &x[43]);
907 btf_16_adds_subs_avx2(&x[41], &x[42]);
908 btf_16_adds_subs_avx2(&x[47], &x[44]);
909 btf_16_adds_subs_avx2(&x[46], &x[45]);
910 btf_16_adds_subs_avx2(&x[48], &x[51]);
911 btf_16_adds_subs_avx2(&x[49], &x[50]);
912 btf_16_adds_subs_avx2(&x[55], &x[52]);
913 btf_16_adds_subs_avx2(&x[54], &x[53]);
914 btf_16_adds_subs_avx2(&x[56], &x[59]);
915 btf_16_adds_subs_avx2(&x[57], &x[58]);
916 btf_16_adds_subs_avx2(&x[63], &x[60]);
917 btf_16_adds_subs_avx2(&x[62], &x[61]);
918 }
919
// Stage 6 of the 64-point inverse DCT applied to the upper 32 lanes
// (x[32..63]): eight rotations, two per weight pair, with the 8/56 and
// 24/40 cosine weights. `cos_bit` is unused; the shift is fixed.
static INLINE void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
  (void)cos_bit;
  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  // Mirror pairs (34,61)/(35,60) and (36,59)/(37,58) share weights, as do
  // (42,53)/(43,52) and (44,51)/(45,50).
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[34], &x[61], _r, cos_bit);
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[35], &x[60], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[36], &x[59], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[37], &x[58], _r, cos_bit);
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[42], &x[53], _r, cos_bit);
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[43], &x[52], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[44], &x[51], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[45], &x[50], _r, cos_bit);
}
938
idct64_stage6_high48_avx2(__m256i * x,const int32_t * cospi,const __m256i _r,int8_t cos_bit)939 static INLINE void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi,
940 const __m256i _r, int8_t cos_bit) {
941 btf_16_adds_subs_avx2(&x[16], &x[19]);
942 btf_16_adds_subs_avx2(&x[17], &x[18]);
943 btf_16_adds_subs_avx2(&x[23], &x[20]);
944 btf_16_adds_subs_avx2(&x[22], &x[21]);
945 btf_16_adds_subs_avx2(&x[24], &x[27]);
946 btf_16_adds_subs_avx2(&x[25], &x[26]);
947 btf_16_adds_subs_avx2(&x[31], &x[28]);
948 btf_16_adds_subs_avx2(&x[30], &x[29]);
949 idct64_stage6_high32_avx2(x, cospi, _r, cos_bit);
950 }
951
idct64_stage7_high48_avx2(__m256i * x,const int32_t * cospi,const __m256i _r,int8_t cos_bit)952 static INLINE void idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi,
953 const __m256i _r, int8_t cos_bit) {
954 (void)cos_bit;
955 const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
956 const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
957 const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
958 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit);
959 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit);
960 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit);
961 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit);
962 btf_16_adds_subs_avx2(&x[32], &x[39]);
963 btf_16_adds_subs_avx2(&x[33], &x[38]);
964 btf_16_adds_subs_avx2(&x[34], &x[37]);
965 btf_16_adds_subs_avx2(&x[35], &x[36]);
966 btf_16_adds_subs_avx2(&x[47], &x[40]);
967 btf_16_adds_subs_avx2(&x[46], &x[41]);
968 btf_16_adds_subs_avx2(&x[45], &x[42]);
969 btf_16_adds_subs_avx2(&x[44], &x[43]);
970 btf_16_adds_subs_avx2(&x[48], &x[55]);
971 btf_16_adds_subs_avx2(&x[49], &x[54]);
972 btf_16_adds_subs_avx2(&x[50], &x[53]);
973 btf_16_adds_subs_avx2(&x[51], &x[52]);
974 btf_16_adds_subs_avx2(&x[63], &x[56]);
975 btf_16_adds_subs_avx2(&x[62], &x[57]);
976 btf_16_adds_subs_avx2(&x[61], &x[58]);
977 btf_16_adds_subs_avx2(&x[60], &x[59]);
978 }
979
idct64_stage8_high48_avx2(__m256i * x,const int32_t * cospi,const __m256i _r,int8_t cos_bit)980 static INLINE void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi,
981 const __m256i _r, int8_t cos_bit) {
982 (void)cos_bit;
983 const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
984 const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
985 const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
986 btf_16_adds_subs_avx2(&x[16], &x[23]);
987 btf_16_adds_subs_avx2(&x[17], &x[22]);
988 btf_16_adds_subs_avx2(&x[18], &x[21]);
989 btf_16_adds_subs_avx2(&x[19], &x[20]);
990 btf_16_adds_subs_avx2(&x[31], &x[24]);
991 btf_16_adds_subs_avx2(&x[30], &x[25]);
992 btf_16_adds_subs_avx2(&x[29], &x[26]);
993 btf_16_adds_subs_avx2(&x[28], &x[27]);
994 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[36], &x[59], _r, cos_bit);
995 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[37], &x[58], _r, cos_bit);
996 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[38], &x[57], _r, cos_bit);
997 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[39], &x[56], _r, cos_bit);
998 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[40], &x[55], _r, cos_bit);
999 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[41], &x[54], _r, cos_bit);
1000 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[42], &x[53], _r, cos_bit);
1001 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[43], &x[52], _r, cos_bit);
1002 }
1003
idct64_stage9_avx2(__m256i * x,const int32_t * cospi,const __m256i _r,int8_t cos_bit)1004 static INLINE void idct64_stage9_avx2(__m256i *x, const int32_t *cospi,
1005 const __m256i _r, int8_t cos_bit) {
1006 (void)cos_bit;
1007 const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
1008 const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
1009 btf_16_adds_subs_avx2(&x[0], &x[15]);
1010 btf_16_adds_subs_avx2(&x[1], &x[14]);
1011 btf_16_adds_subs_avx2(&x[2], &x[13]);
1012 btf_16_adds_subs_avx2(&x[3], &x[12]);
1013 btf_16_adds_subs_avx2(&x[4], &x[11]);
1014 btf_16_adds_subs_avx2(&x[5], &x[10]);
1015 btf_16_adds_subs_avx2(&x[6], &x[9]);
1016 btf_16_adds_subs_avx2(&x[7], &x[8]);
1017 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit);
1018 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit);
1019 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit);
1020 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit);
1021 btf_16_adds_subs_avx2(&x[32], &x[47]);
1022 btf_16_adds_subs_avx2(&x[33], &x[46]);
1023 btf_16_adds_subs_avx2(&x[34], &x[45]);
1024 btf_16_adds_subs_avx2(&x[35], &x[44]);
1025 btf_16_adds_subs_avx2(&x[36], &x[43]);
1026 btf_16_adds_subs_avx2(&x[37], &x[42]);
1027 btf_16_adds_subs_avx2(&x[38], &x[41]);
1028 btf_16_adds_subs_avx2(&x[39], &x[40]);
1029 btf_16_adds_subs_avx2(&x[63], &x[48]);
1030 btf_16_adds_subs_avx2(&x[62], &x[49]);
1031 btf_16_adds_subs_avx2(&x[61], &x[50]);
1032 btf_16_adds_subs_avx2(&x[60], &x[51]);
1033 btf_16_adds_subs_avx2(&x[59], &x[52]);
1034 btf_16_adds_subs_avx2(&x[58], &x[53]);
1035 btf_16_adds_subs_avx2(&x[57], &x[54]);
1036 btf_16_adds_subs_avx2(&x[56], &x[55]);
1037 }
1038
idct64_stage10_avx2(__m256i * x,const int32_t * cospi,const __m256i _r,int8_t cos_bit)1039 static INLINE void idct64_stage10_avx2(__m256i *x, const int32_t *cospi,
1040 const __m256i _r, int8_t cos_bit) {
1041 (void)cos_bit;
1042 const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
1043 const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
1044 btf_16_adds_subs_avx2(&x[0], &x[31]);
1045 btf_16_adds_subs_avx2(&x[1], &x[30]);
1046 btf_16_adds_subs_avx2(&x[2], &x[29]);
1047 btf_16_adds_subs_avx2(&x[3], &x[28]);
1048 btf_16_adds_subs_avx2(&x[4], &x[27]);
1049 btf_16_adds_subs_avx2(&x[5], &x[26]);
1050 btf_16_adds_subs_avx2(&x[6], &x[25]);
1051 btf_16_adds_subs_avx2(&x[7], &x[24]);
1052 btf_16_adds_subs_avx2(&x[8], &x[23]);
1053 btf_16_adds_subs_avx2(&x[9], &x[22]);
1054 btf_16_adds_subs_avx2(&x[10], &x[21]);
1055 btf_16_adds_subs_avx2(&x[11], &x[20]);
1056 btf_16_adds_subs_avx2(&x[12], &x[19]);
1057 btf_16_adds_subs_avx2(&x[13], &x[18]);
1058 btf_16_adds_subs_avx2(&x[14], &x[17]);
1059 btf_16_adds_subs_avx2(&x[15], &x[16]);
1060 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[40], &x[55], _r, cos_bit);
1061 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[41], &x[54], _r, cos_bit);
1062 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[42], &x[53], _r, cos_bit);
1063 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[43], &x[52], _r, cos_bit);
1064 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[44], &x[51], _r, cos_bit);
1065 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[45], &x[50], _r, cos_bit);
1066 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[46], &x[49], _r, cos_bit);
1067 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[47], &x[48], _r, cos_bit);
1068 }
1069
idct64_stage11_avx2(__m256i * output,__m256i * x)1070 static INLINE void idct64_stage11_avx2(__m256i *output, __m256i *x) {
1071 btf_16_adds_subs_out_avx2(&output[0], &output[63], x[0], x[63]);
1072 btf_16_adds_subs_out_avx2(&output[1], &output[62], x[1], x[62]);
1073 btf_16_adds_subs_out_avx2(&output[2], &output[61], x[2], x[61]);
1074 btf_16_adds_subs_out_avx2(&output[3], &output[60], x[3], x[60]);
1075 btf_16_adds_subs_out_avx2(&output[4], &output[59], x[4], x[59]);
1076 btf_16_adds_subs_out_avx2(&output[5], &output[58], x[5], x[58]);
1077 btf_16_adds_subs_out_avx2(&output[6], &output[57], x[6], x[57]);
1078 btf_16_adds_subs_out_avx2(&output[7], &output[56], x[7], x[56]);
1079 btf_16_adds_subs_out_avx2(&output[8], &output[55], x[8], x[55]);
1080 btf_16_adds_subs_out_avx2(&output[9], &output[54], x[9], x[54]);
1081 btf_16_adds_subs_out_avx2(&output[10], &output[53], x[10], x[53]);
1082 btf_16_adds_subs_out_avx2(&output[11], &output[52], x[11], x[52]);
1083 btf_16_adds_subs_out_avx2(&output[12], &output[51], x[12], x[51]);
1084 btf_16_adds_subs_out_avx2(&output[13], &output[50], x[13], x[50]);
1085 btf_16_adds_subs_out_avx2(&output[14], &output[49], x[14], x[49]);
1086 btf_16_adds_subs_out_avx2(&output[15], &output[48], x[15], x[48]);
1087 btf_16_adds_subs_out_avx2(&output[16], &output[47], x[16], x[47]);
1088 btf_16_adds_subs_out_avx2(&output[17], &output[46], x[17], x[46]);
1089 btf_16_adds_subs_out_avx2(&output[18], &output[45], x[18], x[45]);
1090 btf_16_adds_subs_out_avx2(&output[19], &output[44], x[19], x[44]);
1091 btf_16_adds_subs_out_avx2(&output[20], &output[43], x[20], x[43]);
1092 btf_16_adds_subs_out_avx2(&output[21], &output[42], x[21], x[42]);
1093 btf_16_adds_subs_out_avx2(&output[22], &output[41], x[22], x[41]);
1094 btf_16_adds_subs_out_avx2(&output[23], &output[40], x[23], x[40]);
1095 btf_16_adds_subs_out_avx2(&output[24], &output[39], x[24], x[39]);
1096 btf_16_adds_subs_out_avx2(&output[25], &output[38], x[25], x[38]);
1097 btf_16_adds_subs_out_avx2(&output[26], &output[37], x[26], x[37]);
1098 btf_16_adds_subs_out_avx2(&output[27], &output[36], x[27], x[36]);
1099 btf_16_adds_subs_out_avx2(&output[28], &output[35], x[28], x[35]);
1100 btf_16_adds_subs_out_avx2(&output[29], &output[34], x[29], x[34]);
1101 btf_16_adds_subs_out_avx2(&output[30], &output[33], x[30], x[33]);
1102 btf_16_adds_subs_out_avx2(&output[31], &output[32], x[31], x[32]);
1103 }
1104
// 64-point inverse DCT when only the DC coefficient (input[0]) is nonzero.
// Every intermediate stage is a no-op except the stage-6 scaling, so the
// scaled DC value is simply broadcast to all 64 outputs.
static void idct64_low1_avx2(const __m256i *input, __m256i *output,
                             int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);

  // stage 1: only the DC lane survives.
  __m256i x[2];
  x[0] = input[0];

  // stages 2-5: no-ops for a DC-only input.
  // stage 6: scale by cospi[32]; both weights are cospi[32], so x[0] and
  // x[1] hold the same value afterwards.
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);

  // stages 7-10: no-ops.
  // stage 11: mirror-symmetric broadcast. The original code alternates
  // between x[0] and x[1] in a 0,1,1,0 cycle; ((i + 1) & 2) reproduces
  // exactly that selection (the two values are identical anyway).
  for (int i = 0; i < 32; ++i) {
    const __m256i v = ((i + 1) & 2) ? x[1] : x[0];
    output[i] = v;
    output[63 - i] = v;
  }
}
1191
// 64-point inverse DCT specialized for inputs where only the first 8
// coefficient rows (input[0..7]) are nonzero. Stages whose partner inputs
// are zero degenerate: rotations become single-input scalings
// (btf_16_w16_0_avx2) and butterflies become plain copies.
static void idct64_low8_avx2(const __m256i *input, __m256i *output,
                             int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
  const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
  const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
  const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
  const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
  const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
  const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
  const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
  const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);

  // stage 1: scatter the 8 nonzero rows to their bit-reversed positions.
  __m256i x[64];
  x[0] = input[0];
  x[8] = input[4];
  x[16] = input[2];
  x[24] = input[6];
  x[32] = input[1];
  x[40] = input[5];
  x[48] = input[3];
  x[56] = input[7];

  // stage 2: single-input rotations (the partner coefficients are zero).
  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);

  // stage 3: rotations on the x[16..31] quarter; the upper-half butterflies
  // reduce to copies because one input of each pair is zero.
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
  x[33] = x[32];
  x[38] = x[39];
  x[41] = x[40];
  x[46] = x[47];
  x[49] = x[48];
  x[54] = x[55];
  x[57] = x[56];
  x[62] = x[63];

  // stage 4: degenerate copies in x[16..31], real rotations on x[32..63].
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  x[17] = x[16];
  x[22] = x[23];
  x[25] = x[24];
  x[30] = x[31];
  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit);
  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit);
  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit);
  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit);

  // stage 5: copies in x[8..15], rotations in x[16..31], and the remaining
  // upper-half butterflies collapsed to duplications.
  x[9] = x[8];
  x[14] = x[15];
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
  x[35] = x[32];
  x[34] = x[33];
  x[36] = x[39];
  x[37] = x[38];
  x[43] = x[40];
  x[42] = x[41];
  x[44] = x[47];
  x[45] = x[46];
  x[51] = x[48];
  x[50] = x[49];
  x[52] = x[55];
  x[53] = x[54];
  x[59] = x[56];
  x[58] = x[57];
  x[60] = x[63];
  x[61] = x[62];

  // stage 6: DC scaling, one rotation, copies, then the upper-32 helper.
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
  x[19] = x[16];
  x[18] = x[17];
  x[20] = x[23];
  x[21] = x[22];
  x[27] = x[24];
  x[26] = x[25];
  x[28] = x[31];
  x[29] = x[30];
  idct64_stage6_high32_avx2(x, cospi, _r, cos_bit);

  // stage 7: low-half copies, then the shared high-48 helper.
  x[3] = x[0];
  x[2] = x[1];
  x[11] = x[8];
  x[10] = x[9];
  x[12] = x[15];
  x[13] = x[14];
  idct64_stage7_high48_avx2(x, cospi, _r, cos_bit);

  // stage 8: low-half copies plus two cospi[32] rotations, then high-48.
  x[7] = x[0];
  x[6] = x[1];
  x[5] = x[2];
  x[4] = x[3];
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
  idct64_stage8_high48_avx2(x, cospi, _r, cos_bit);

  // stages 9-11: identical to the full transform from here on.
  idct64_stage9_avx2(x, cospi, _r, cos_bit);
  idct64_stage10_avx2(x, cospi, _r, cos_bit);
  idct64_stage11_avx2(output, x);
}
1311
// 64-point inverse DCT specialized for inputs where only the first 16
// coefficient rows (input[0..15]) are nonzero. As in idct64_low8_avx2,
// rotations whose partner is zero use the single-input form and zero-partner
// butterflies become copies; later stages fall through to the shared
// full-precision helpers.
static void idct64_low16_avx2(const __m256i *input, __m256i *output,
                              int8_t cos_bit) {
  (void)cos_bit;
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);

  // stage 1: scatter the 16 nonzero rows to their bit-reversed positions.
  __m256i x[64];
  x[0] = input[0];
  x[4] = input[8];
  x[8] = input[4];
  x[12] = input[12];
  x[16] = input[2];
  x[20] = input[10];
  x[24] = input[6];
  x[28] = input[14];
  x[32] = input[1];
  x[36] = input[9];
  x[40] = input[5];
  x[44] = input[13];
  x[48] = input[3];
  x[52] = input[11];
  x[56] = input[7];
  x[60] = input[15];

  // stage 2: single-input rotations for the odd-frequency half.
  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
  btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
  btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
  btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);

  // stage 3: rotations on x[16..31]; zero-partner butterflies in x[32..63]
  // reduce to copies.
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
  x[33] = x[32];
  x[34] = x[35];
  x[37] = x[36];
  x[38] = x[39];
  x[41] = x[40];
  x[42] = x[43];
  x[45] = x[44];
  x[46] = x[47];
  x[49] = x[48];
  x[50] = x[51];
  x[53] = x[52];
  x[54] = x[55];
  x[57] = x[56];
  x[58] = x[59];
  x[61] = x[60];
  x[62] = x[63];

  // stage 4: rotations on x[8..15], copies in x[16..31], helper for the rest.
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  x[17] = x[16];
  x[18] = x[19];
  x[21] = x[20];
  x[22] = x[23];
  x[25] = x[24];
  x[26] = x[27];
  x[29] = x[28];
  x[30] = x[31];
  idct64_stage4_high32_avx2(x, cospi, _r, cos_bit);

  // stage 5
  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  x[9] = x[8];
  x[10] = x[11];
  x[13] = x[12];
  x[14] = x[15];
  idct64_stage5_high48_avx2(x, cospi, _r, cos_bit);

  // stage 6
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  x[5] = x[4];
  x[6] = x[7];
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
  idct64_stage6_high48_avx2(x, cospi, _r, cos_bit);

  // stage 7
  x[3] = x[0];
  x[2] = x[1];
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
  btf_16_adds_subs_avx2(&x[8], &x[11]);
  btf_16_adds_subs_avx2(&x[9], &x[10]);
  btf_16_adds_subs_avx2(&x[15], &x[12]);
  btf_16_adds_subs_avx2(&x[14], &x[13]);
  idct64_stage7_high48_avx2(x, cospi, _r, cos_bit);

  // stage 8
  btf_16_adds_subs_avx2(&x[0], &x[7]);
  btf_16_adds_subs_avx2(&x[1], &x[6]);
  btf_16_adds_subs_avx2(&x[2], &x[5]);
  btf_16_adds_subs_avx2(&x[3], &x[4]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
  idct64_stage8_high48_avx2(x, cospi, _r, cos_bit);

  // stages 9-11: identical to the full transform from here on.
  idct64_stage9_avx2(x, cospi, _r, cos_bit);
  idct64_stage10_avx2(x, cospi, _r, cos_bit);
  idct64_stage11_avx2(output, x);
}
1427
// 64-point 1D inverse DCT specialized for the case where only the first 32
// input coefficients can be nonzero.  Operates on 16 columns at a time (one
// __m256i of 16-bit samples per row).
static void idct64_low32_avx2(const __m256i *input, __m256i *output,
                              int8_t cos_bit) {
  (void)cos_bit;  // NOTE(review): cast as unused yet still forwarded to the
                  // shared stage helpers below -- presumably kept for
                  // signature parity with the other table kernels; confirm.
  const int32_t *cospi = cospi_arr(INV_COS_BIT);
  // Rounding offset (round half up) used by the butterfly multiplies.
  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));

  // Packed cosine-pair constants reused across several stages.
  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);

  // stage 1: scatter the 32 potentially-nonzero inputs into the even slots
  // of the 64-entry working array.  Odd slots are implicitly zero; the
  // single-input butterflies in stages 2-6 account for that.
  __m256i x[64];
  x[0] = input[0];
  x[2] = input[16];
  x[4] = input[8];
  x[6] = input[24];
  x[8] = input[4];
  x[10] = input[20];
  x[12] = input[12];
  x[14] = input[28];
  x[16] = input[2];
  x[18] = input[18];
  x[20] = input[10];
  x[22] = input[26];
  x[24] = input[6];
  x[26] = input[22];
  x[28] = input[14];
  x[30] = input[30];
  x[32] = input[1];
  x[34] = input[17];
  x[36] = input[9];
  x[38] = input[25];
  x[40] = input[5];
  x[42] = input[21];
  x[44] = input[13];
  x[46] = input[29];
  x[48] = input[3];
  x[50] = input[19];
  x[52] = input[11];
  x[54] = input[27];
  x[56] = input[7];
  x[58] = input[23];
  x[60] = input[15];
  x[62] = input[31];

  // stage 2: single-input butterflies producing the odd half x[32..63].
  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
  btf_16_w16_0_avx2(-cospi[33], cospi[31], x[62], x[33], x[62]);
  btf_16_w16_0_avx2(cospi[47], cospi[17], x[34], x[34], x[61]);
  btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
  btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
  btf_16_w16_0_avx2(-cospi[41], cospi[23], x[58], x[37], x[58]);
  btf_16_w16_0_avx2(cospi[39], cospi[25], x[38], x[38], x[57]);
  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
  btf_16_w16_0_avx2(-cospi[37], cospi[27], x[54], x[41], x[54]);
  btf_16_w16_0_avx2(cospi[43], cospi[21], x[42], x[42], x[53]);
  btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
  btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
  btf_16_w16_0_avx2(-cospi[45], cospi[19], x[50], x[45], x[50]);
  btf_16_w16_0_avx2(cospi[35], cospi[29], x[46], x[46], x[49]);
  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);

  // stage 3: single-input butterflies for x[16..31], plus saturating
  // add/sub pairs in the odd half.
  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
  btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
  btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
  btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
  btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
  btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
  btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
  btf_16_adds_subs_avx2(&x[32], &x[33]);
  btf_16_adds_subs_avx2(&x[35], &x[34]);
  btf_16_adds_subs_avx2(&x[36], &x[37]);
  btf_16_adds_subs_avx2(&x[39], &x[38]);
  btf_16_adds_subs_avx2(&x[40], &x[41]);
  btf_16_adds_subs_avx2(&x[43], &x[42]);
  btf_16_adds_subs_avx2(&x[44], &x[45]);
  btf_16_adds_subs_avx2(&x[47], &x[46]);
  btf_16_adds_subs_avx2(&x[48], &x[49]);
  btf_16_adds_subs_avx2(&x[51], &x[50]);
  btf_16_adds_subs_avx2(&x[52], &x[53]);
  btf_16_adds_subs_avx2(&x[55], &x[54]);
  btf_16_adds_subs_avx2(&x[56], &x[57]);
  btf_16_adds_subs_avx2(&x[59], &x[58]);
  btf_16_adds_subs_avx2(&x[60], &x[61]);
  btf_16_adds_subs_avx2(&x[63], &x[62]);

  // stage 4: single-input butterflies for x[8..15]; the high-32 part is
  // shared with the other idct64 variants and lives in a helper.
  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
  btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
  btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
  btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
  btf_16_adds_subs_avx2(&x[16], &x[17]);
  btf_16_adds_subs_avx2(&x[19], &x[18]);
  btf_16_adds_subs_avx2(&x[20], &x[21]);
  btf_16_adds_subs_avx2(&x[23], &x[22]);
  btf_16_adds_subs_avx2(&x[24], &x[25]);
  btf_16_adds_subs_avx2(&x[27], &x[26]);
  btf_16_adds_subs_avx2(&x[28], &x[29]);
  btf_16_adds_subs_avx2(&x[31], &x[30]);
  idct64_stage4_high32_avx2(x, cospi, _r, cos_bit);

  // stage 5
  btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
  btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
  btf_16_adds_subs_avx2(&x[8], &x[9]);
  btf_16_adds_subs_avx2(&x[11], &x[10]);
  btf_16_adds_subs_avx2(&x[12], &x[13]);
  btf_16_adds_subs_avx2(&x[15], &x[14]);
  idct64_stage5_high48_avx2(x, cospi, _r, cos_bit);

  // stage 6: DC/low-frequency butterflies for x[0..3].
  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
  btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
  btf_16_adds_subs_avx2(&x[4], &x[5]);
  btf_16_adds_subs_avx2(&x[7], &x[6]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
  idct64_stage6_high48_avx2(x, cospi, _r, cos_bit);

  // stage 7
  btf_16_adds_subs_avx2(&x[0], &x[3]);
  btf_16_adds_subs_avx2(&x[1], &x[2]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
  btf_16_adds_subs_avx2(&x[8], &x[11]);
  btf_16_adds_subs_avx2(&x[9], &x[10]);
  btf_16_adds_subs_avx2(&x[15], &x[12]);
  btf_16_adds_subs_avx2(&x[14], &x[13]);
  idct64_stage7_high48_avx2(x, cospi, _r, cos_bit);

  // stage 8
  btf_16_adds_subs_avx2(&x[0], &x[7]);
  btf_16_adds_subs_avx2(&x[1], &x[6]);
  btf_16_adds_subs_avx2(&x[2], &x[5]);
  btf_16_adds_subs_avx2(&x[3], &x[4]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
  idct64_stage8_high48_avx2(x, cospi, _r, cos_bit);

  // stage 9~11: final recombinations and the write-out to `output`.
  idct64_stage9_avx2(x, cospi, _r, cos_bit);
  idct64_stage10_avx2(x, cospi, _r, cos_bit);
  idct64_stage11_avx2(output, x);
}
1576
// Common signature of the 1D inverse-transform kernels dispatched from the
// table below.
typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output,
                                  int8_t cos_bit);

// 1D functions process 16 pixels at one time.
// Indexed as [tx width/height index][1D transform type][zero-level class],
// where the last index is produced by lowbd_txfm_all_1d_zeros_idx[eob] and
// selects progressively fuller kernels (low1, low8, ... up to the complete
// transform).  NULL entries are size/type combinations not handled by this
// AVX2 path (they go through the SSSE3 code instead).
static const transform_1d_avx2
    lowbd_txfm_all_1d_zeros_w16_arr[TX_SIZES][ITX_TYPES_1D][4] = {
      {
          // TX_4X4: not handled here.
          { NULL, NULL, NULL, NULL },
          { NULL, NULL, NULL, NULL },
          { NULL, NULL, NULL, NULL },
      },
      // TX_8X8: not handled here.
      { { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } },
      {
          // TX_16X16: DCT and ADST variants.
          { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL },
          { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL },
          { NULL, NULL, NULL, NULL },
      },
      // TX_32X32: DCT only.
      { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } },
      // TX_64X64: DCT only.
      { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2,
          idct64_low32_avx2 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } }
    };
1604
// only process w >= 16 h >= 16
// Full 2D inverse transform (non-identity in both dimensions): row
// transforms on the nonzero region, transpose into buf1, column transforms,
// then reconstruction into the destination.
static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  // Intermediate buffer holding the whole (transposed) block as 16-wide
  // column strips laid out back to back: strip j begins at j * txfm_size_row.
  __m256i buf1[64 * 16];
  int eobx, eoby;
  // Bounding box of potentially-nonzero coefficients, derived from eob.
  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div16 = txfm_size_col >> 4;
  // Only 16x16 groups that may contain nonzero coefficients are transformed.
  const int buf_size_nonzero_w_div16 = (eobx + 16) >> 4;
  const int buf_size_nonzero_h_div16 = (eoby + 16) >> 4;
  // Coefficient rows are stored with a stride capped at 32 columns.
  const int input_stride = AOMMIN(32, txfm_size_col);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  // Pick reduced row/column kernels according to how much of the block is
  // known to be zero.
  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
  const transform_1d_avx2 row_txfm =
      lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
  const transform_1d_avx2 col_txfm =
      lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  // mulhrs by 1 << (15 + shift[0]) realizes a rounded shift by shift[0]
  // (shift[0] is expected to be <= 0 here, i.e. a right shift -- TODO(review)
  // confirm against av1_inv_txfm_shift_ls).
  const __m256i scale0 = _mm256_set1_epi16(1 << (15 + shift[0]));
  for (int i = 0; i < buf_size_nonzero_h_div16; i++) {
    __m256i buf0[64];
    const int32_t *input_row = input + (i << 4) * input_stride;
    for (int j = 0; j < buf_size_nonzero_w_div16; ++j) {
      __m256i *buf0_cur = buf0 + j * 16;
      const int32_t *input_cur = input_row + j * 16;
      // Narrow 32-bit coefficients to 16 bits, then transpose so that each
      // block row occupies one vector register.
      load_buffer_32bit_to_16bit_w16_avx2(input_cur, input_stride, buf0_cur,
                                          16);
      transpose_16bit_16x16_avx2(buf0_cur, buf0_cur);
    }
    if (rect_type == 1 || rect_type == -1) {
      round_shift_avx2(buf0, buf0, input_stride);  // rect special code
    }
    row_txfm(buf0, buf0, cos_bit_row);
    // First rounding shift after the row transform.
    for (int j = 0; j < txfm_size_col; ++j) {
      buf0[j] = _mm256_mulhrs_epi16(buf0[j], scale0);
    }

    // Transpose back into buf1.  With lr_flip, both the order of the 16-wide
    // strips and the lanes within each strip are reversed.
    __m256i *buf1_cur = buf1 + (i << 4);
    if (lr_flip) {
      for (int j = 0; j < buf_size_w_div16; ++j) {
        __m256i temp[16];
        flip_buf_avx2(buf0 + 16 * j, temp, 16);
        int offset = txfm_size_row * (buf_size_w_div16 - 1 - j);
        transpose_16bit_16x16_avx2(temp, buf1_cur + offset);
      }
    } else {
      for (int j = 0; j < buf_size_w_div16; ++j) {
        transpose_16bit_16x16_avx2(buf0 + 16 * j, buf1_cur + txfm_size_row * j);
      }
    }
  }
  // Column transforms on each 16-wide strip, followed by the second rounding
  // shift (same mulhrs trick as above, with shift[1]).
  const __m256i scale1 = _mm256_set1_epi16(1 << (15 + shift[1]));
  for (int i = 0; i < buf_size_w_div16; i++) {
    __m256i *buf1_cur = buf1 + i * txfm_size_row;
    col_txfm(buf1_cur, buf1_cur, cos_bit_col);
    for (int j = 0; j < txfm_size_row; ++j) {
      buf1_cur[j] = _mm256_mulhrs_epi16(buf1_cur[j], scale1);
    }
  }
  // Write each reconstructed strip to the destination; ud_flip reverses the
  // row order during the write.
  for (int i = 0; i < buf_size_w_div16; i++) {
    lowbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row, output + 16 * i,
                                 stride, ud_flip, txfm_size_row);
  }
}
1682
// Identity (IDTX) row transform for a strip of 16 columns: each of `height`
// rows is multiplied by the fixed-point scale NewSqrt2list[txw_idx] and
// round-shifted; rectangular blocks (rect_type == +/-1) are additionally
// pre-scaled by 1/sqrt(2).  Results are written as 16-bit rows into `out`.
static INLINE void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input,
                                           int stride, int shift, int height,
                                           int txw_idx, int rect_type) {
  const int32_t *input_row = input;
  const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txw_idx]);
  // Combined rounding term for the single shift by (NewSqrt2Bits - shift)
  // below.  `shift` is expected to be <= 0 here (an extra right shift) --
  // TODO(review): confirm against av1_inv_txfm_shift_ls.
  const __m256i _r = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) +
                                       (1 << (NewSqrt2Bits - shift - 1)));
  const __m256i one = _mm256_set1_epi16(1);
  // Interleave (scale, round) pairs so that one madd computes
  // src * scale + _r per 32-bit lane.
  const __m256i scale__r = _mm256_unpacklo_epi16(scale, _r);
  if (rect_type != 1 && rect_type != -1) {
    for (int i = 0; i < height; ++i) {
      const __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
      input_row += stride;
      __m256i lo = _mm256_unpacklo_epi16(src, one);
      __m256i hi = _mm256_unpackhi_epi16(src, one);
      lo = _mm256_madd_epi16(lo, scale__r);
      hi = _mm256_madd_epi16(hi, scale__r);
      lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
      hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
      out[i] = _mm256_packs_epi32(lo, hi);  // saturating narrow to 16 bits
    }
  } else {
    // Rectangular transform: pre-multiply the input by 1/sqrt(2), expressed
    // as a Q15 constant consumed by mulhrs.
    const __m256i rect_scale =
        _mm256_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
    for (int i = 0; i < height; ++i) {
      __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
      src = _mm256_mulhrs_epi16(src, rect_scale);
      input_row += stride;
      __m256i lo = _mm256_unpacklo_epi16(src, one);
      __m256i hi = _mm256_unpackhi_epi16(src, one);
      lo = _mm256_madd_epi16(lo, scale__r);
      hi = _mm256_madd_epi16(hi, scale__r);
      lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
      hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
      out[i] = _mm256_packs_epi32(lo, hi);
    }
  }
}
1721
// Identity (IDTX) column transform for a strip of 16 columns: each of
// `height` rows is scaled by NewSqrt2list[txh_idx] (rounded shift by
// NewSqrt2Bits), then round-shifted right by -shift, and handed to
// write_recon_w16_avx2 for the destination update.  `shift` must be
// negative here: the code forms 1 << (-shift - 1).
static INLINE void iidentity_col_16xn_avx2(uint8_t *output, int stride,
                                           __m256i *buf, int shift, int height,
                                           int txh_idx) {
  const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txh_idx]);
  const __m256i scale__r = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1));
  // Rounding offset for the final shift by -shift.
  const __m256i shift__r = _mm256_set1_epi32(1 << (-shift - 1));
  const __m256i one = _mm256_set1_epi16(1);
  // Interleave (scale, round) so one madd yields buf * scale + round.
  const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale__r);
  for (int h = 0; h < height; ++h) {
    __m256i lo = _mm256_unpacklo_epi16(buf[h], one);
    __m256i hi = _mm256_unpackhi_epi16(buf[h], one);
    lo = _mm256_madd_epi16(lo, scale_coeff);
    hi = _mm256_madd_epi16(hi, scale_coeff);
    lo = _mm256_srai_epi32(lo, NewSqrt2Bits);
    hi = _mm256_srai_epi32(hi, NewSqrt2Bits);
    lo = _mm256_add_epi32(lo, shift__r);
    hi = _mm256_add_epi32(hi, shift__r);
    lo = _mm256_srai_epi32(lo, -shift);
    hi = _mm256_srai_epi32(hi, -shift);
    const __m256i x = _mm256_packs_epi32(lo, hi);  // saturate to 16 bits
    write_recon_w16_avx2(x, output);
    output += stride;
  }
}
1746
lowbd_inv_txfm2d_add_idtx_avx2(const int32_t * input,uint8_t * output,int stride,TX_SIZE tx_size,int32_t eob)1747 static INLINE void lowbd_inv_txfm2d_add_idtx_avx2(const int32_t *input,
1748 uint8_t *output, int stride,
1749 TX_SIZE tx_size,
1750 int32_t eob) {
1751 (void)eob;
1752 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
1753 const int txw_idx = get_txw_idx(tx_size);
1754 const int txh_idx = get_txh_idx(tx_size);
1755 const int txfm_size_col = tx_size_wide[tx_size];
1756 const int txfm_size_row = tx_size_high[tx_size];
1757 const int input_stride = AOMMIN(32, txfm_size_col);
1758 const int row_max = AOMMIN(32, txfm_size_row);
1759 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
1760 __m256i buf[32];
1761 for (int i = 0; i < input_stride; i += 16) {
1762 iidentity_row_16xn_avx2(buf, input + i, input_stride, shift[0], row_max,
1763 txw_idx, rect_type);
1764 iidentity_col_16xn_avx2(output + i, stride, buf, shift[1], row_max,
1765 txh_idx);
1766 }
1767 }
1768
// 2D inverse transform with identity rows (H_DCT/H_ADST/H_FLIPADST): the
// horizontal direction is only rescaled, the vertical transform does the
// real work.  Processes 16-wide column strips up to the last strip that can
// contain a nonzero coefficient (eobx).
static INLINE void lowbd_inv_txfm2d_add_h_identity_avx2(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  int eobx, eoby;
  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  // Coefficient stride is capped at 32 columns.
  const int txfm_size_col_notzero = AOMMIN(32, txfm_size_col);
  const int input_stride = txfm_size_col_notzero;
  const int buf_size_w_div16 = (eobx + 16) >> 4;
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  // Reduced column kernel chosen from how many rows can be nonzero (eoby).
  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
  const transform_1d_avx2 col_txfm =
      lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

  assert(col_txfm != NULL);

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  for (int i = 0; i < buf_size_w_div16; i++) {
    __m256i buf0[64];
    // Identity "row transform": rescale the eoby+1 nonzero rows of this
    // strip into buf0.
    iidentity_row_16xn_avx2(buf0, input + (i << 4), input_stride, shift[0],
                            eoby + 1, txw_idx, rect_type);
    col_txfm(buf0, buf0, cos_bit_col);
    // mulhrs with 1 << (15 + shift[1]) is a rounded shift by shift[1]
    // (expected <= 0 -- TODO(review) confirm against av1_inv_txfm_shift_ls).
    __m256i mshift = _mm256_set1_epi16(1 << (15 + shift[1]));
    // With ud_flip the rows are written to the destination in reverse order.
    int k = ud_flip ? (txfm_size_row - 1) : 0;
    const int step = ud_flip ? -1 : 1;
    for (int j = 0; j < txfm_size_row; ++j, k += step) {
      __m256i res = _mm256_mulhrs_epi16(buf0[k], mshift);
      write_recon_w16_avx2(res, output + (i << 4) + j * stride);
    }
  }
}
1807
// 2D inverse transform with identity columns (V_DCT/V_ADST/V_FLIPADST): the
// horizontal transform does the real work, columns are only rescaled.
// Processes 16-row bands up to the last band that can contain a nonzero
// coefficient (eoby).
static INLINE void lowbd_inv_txfm2d_add_v_identity_avx2(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  // Holds one transposed 16-row band of the block (up to four 16x16 tiles).
  __m256i buf1[64];
  int eobx, eoby;
  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div16 = txfm_size_col >> 4;
  const int buf_size_h_div16 = (eoby + 16) >> 4;
  // Coefficient stride is capped at 32 columns.
  const int input_stride = AOMMIN(32, txfm_size_col);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  // Reduced row kernel chosen from how many columns can be nonzero (eobx).
  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
  const transform_1d_avx2 row_txfm =
      lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];

  assert(row_txfm != NULL);

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  for (int i = 0; i < buf_size_h_div16; i++) {
    __m256i buf0[64];
    const int32_t *input_row = input + i * input_stride * 16;
    // Load and transpose up to four 16-wide strips of this band.
    for (int j = 0; j < AOMMIN(4, buf_size_w_div16); ++j) {
      __m256i *buf0_cur = buf0 + j * 16;
      load_buffer_32bit_to_16bit_w16_avx2(input_row + j * 16, input_stride,
                                          buf0_cur, 16);
      transpose_16bit_16x16_avx2(buf0_cur, buf0_cur);
    }
    if (rect_type == 1 || rect_type == -1) {
      round_shift_avx2(buf0, buf0, input_stride);  // rect special code
    }
    row_txfm(buf0, buf0, cos_bit_row);
    round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]);
    // Transpose back into buf1; lr_flip reverses strip order and lanes.
    __m256i *_buf1 = buf1;
    if (lr_flip) {
      for (int j = 0; j < buf_size_w_div16; ++j) {
        __m256i temp[16];
        flip_buf_avx2(buf0 + 16 * j, temp, 16);
        transpose_16bit_16x16_avx2(temp,
                                   _buf1 + 16 * (buf_size_w_div16 - 1 - j));
      }
    } else {
      for (int j = 0; j < buf_size_w_div16; ++j) {
        transpose_16bit_16x16_avx2(buf0 + 16 * j, _buf1 + 16 * j);
      }
    }
    // Identity "column transform": rescale and reconstruct each 16x16 tile
    // of the band directly into the destination.
    for (int j = 0; j < buf_size_w_div16; ++j) {
      iidentity_col_16xn_avx2(output + i * 16 * stride + j * 16, stride,
                              buf1 + j * 16, shift[1], 16, txh_idx);
    }
  }
}
1866
1867 // for 32x32,32x64,64x32,64x64,16x32,32x16,64x16,16x64
lowbd_inv_txfm2d_add_universe_avx2(const int32_t * input,uint8_t * output,int stride,TX_TYPE tx_type,TX_SIZE tx_size,int eob)1868 static INLINE void lowbd_inv_txfm2d_add_universe_avx2(
1869 const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
1870 TX_SIZE tx_size, int eob) {
1871 (void)eob;
1872 switch (tx_type) {
1873 case DCT_DCT:
1874 case ADST_DCT: // ADST in vertical, DCT in horizontal
1875 case DCT_ADST: // DCT in vertical, ADST in horizontal
1876 case ADST_ADST: // ADST in both directions
1877 case FLIPADST_DCT:
1878 case DCT_FLIPADST:
1879 case FLIPADST_FLIPADST:
1880 case ADST_FLIPADST:
1881 case FLIPADST_ADST:
1882 lowbd_inv_txfm2d_add_no_identity_avx2(input, output, stride, tx_type,
1883 tx_size, eob);
1884 break;
1885 case IDTX:
1886 lowbd_inv_txfm2d_add_idtx_avx2(input, output, stride, tx_size, eob);
1887 break;
1888 case V_DCT:
1889 case V_ADST:
1890 case V_FLIPADST:
1891 lowbd_inv_txfm2d_add_h_identity_avx2(input, output, stride, tx_type,
1892 tx_size, eob);
1893 break;
1894 case H_DCT:
1895 case H_ADST:
1896 case H_FLIPADST:
1897 lowbd_inv_txfm2d_add_v_identity_avx2(input, output, stride, tx_type,
1898 tx_size, eob);
1899 break;
1900 default:
1901 av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size,
1902 eob);
1903 break;
1904 }
1905 }
1906
// Top-level low-bitdepth 2D inverse transform entry point: routes each
// transform size either to the SSSE3 implementation (all sizes listed below,
// each having a dimension smaller than 16) or to the 16-wide AVX2 path.
void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output,
                                   int stride, TX_TYPE tx_type, TX_SIZE tx_size,
                                   int eob) {
  switch (tx_size) {
    case TX_4X4:
    case TX_8X8:
    case TX_4X8:
    case TX_8X4:
    case TX_8X16:
    case TX_16X8:
    case TX_4X16:
    case TX_16X4:
    case TX_8X32:
    case TX_32X8:
      av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size,
                                     eob);
      return;
    default:
      // Everything else (16x16 and larger in both dimensions) goes through
      // the AVX2 universe dispatcher.
      lowbd_inv_txfm2d_add_universe_avx2(input, output, stride, tx_type,
                                         tx_size, eob);
      return;
  }
}
1939
av1_inv_txfm_add_avx2(const tran_low_t * dqcoeff,uint8_t * dst,int stride,const TxfmParam * txfm_param)1940 void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
1941 const TxfmParam *txfm_param) {
1942 const TX_TYPE tx_type = txfm_param->tx_type;
1943 if (!txfm_param->lossless) {
1944 av1_lowbd_inv_txfm2d_add_avx2(dqcoeff, dst, stride, tx_type,
1945 txfm_param->tx_size, txfm_param->eob);
1946 } else {
1947 av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
1948 }
1949 }
1950