1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <stdlib.h>
13 #include "aom_dsp/inv_txfm.h"
14 #include "av1/common/av1_fwd_txfm1d.h"
#if CONFIG_COEFFICIENT_RANGE_CHECKING

// Checker invoked by range_check() when coefficient range checking is
// compiled in. Only the prototype is given here.
void range_check_func(int32_t stage, const int32_t *input, const int32_t *buf,
                      int32_t size, int8_t bit);

#define range_check(stage, input, buf, size, bit) \
  range_check_func(stage, input, buf, size, bit)
#else
// No-op variant: every argument is evaluated as a discarded expression so
// that variables used only for range checking do not trigger "unused"
// warnings. Wrapped in do { } while (0) so that `range_check(...);` is exactly
// one statement and stays valid as the body of an unbraced if/else.
#define range_check(stage, input, buf, size, bit) \
  do {                                            \
    (void)(stage);                                \
    (void)(input);                                \
    (void)(buf);                                  \
    (void)(size);                                 \
    (void)(bit);                                  \
  } while (0)
#endif
32
33 // TODO(angiebird): Make 1-d txfm functions static
// 4-point forward butterfly network (a forward DCT, going by the function
// name). Three stages ping-pong between `output` and a local scratch array;
// the final values are left in `output`.
//
//   input       - 4 samples, read only.
//   output      - 4 results; also holds the intermediate values of stage 1.
//   cos_bit     - indexed by stage number; the entry is handed to cospi_arr()
//                 and half_btf(). Only stage 2 reads it.
//   stage_range - indexed by stage number; forwarded to range_check() after
//                 every stage.
//
// Stage 1 stores into `output` while `input` is still being read, so
// overlapping buffers would corrupt the result.
void av1_fdct4_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
                   const int8_t *stage_range) {
  const int32_t size = 4;
  const int32_t *c;
  int32_t tmp[4];
  int32_t stage = 0;
  int cb;
  int i;

  // stage 0: the untouched input
  range_check(stage, input, input, size, stage_range[stage]);

  // stage 1: mirrored sums in the lower half, mirrored differences above
  stage++;
  for (i = 0; i < 2; ++i) output[i] = input[i] + input[3 - i];
  for (i = 2; i < 4; ++i) output[i] = -input[i] + input[3 - i];
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 2: weighted butterflies, output -> tmp
  stage++;
  cb = cos_bit[stage];
  c = cospi_arr(cb);
  tmp[0] = half_btf(c[32], output[0], c[32], output[1], cb);
  tmp[1] = half_btf(-c[32], output[1], c[32], output[0], cb);
  tmp[2] = half_btf(c[48], output[2], c[16], output[3], cb);
  tmp[3] = half_btf(c[48], output[3], -c[16], output[2], cb);
  range_check(stage, input, tmp, size, stage_range[stage]);

  // stage 3: reorder tmp -> output (the two middle entries swap)
  stage++;
  output[0] = tmp[0];
  output[1] = tmp[2];
  output[2] = tmp[1];
  output[3] = tmp[3];
  range_check(stage, input, output, size, stage_range[stage]);
}
76
// 8-point forward butterfly network (a forward DCT, going by the function
// name). Five stages ping-pong between `output` and a local scratch array;
// the final values are left in `output`.
//
//   input       - 8 samples, read only.
//   output      - 8 results; also used for intermediate stages.
//   cos_bit     - indexed by stage number; the entry is handed to cospi_arr()
//                 and half_btf(). Read in stages 2, 3 and 4.
//   stage_range - indexed by stage number; forwarded to range_check() after
//                 every stage.
//
// Stage 1 stores into `output` while `input` is still being read, so
// overlapping buffers would corrupt the result.
void av1_fdct8_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
                   const int8_t *stage_range) {
  // Source index in the scratch array for each final output position.
  static const int final_order[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
  const int32_t size = 8;
  const int32_t *c;
  int32_t tmp[8];
  int32_t stage = 0;
  int cb;
  int i;

  // stage 0: the untouched input
  range_check(stage, input, input, size, stage_range[stage]);

  // stage 1: mirrored sums in the lower half, mirrored differences above
  stage++;
  for (i = 0; i < 4; ++i) output[i] = input[i] + input[7 - i];
  for (i = 4; i < 8; ++i) output[i] = -input[i] + input[7 - i];
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 2: output -> tmp
  stage++;
  cb = cos_bit[stage];
  c = cospi_arr(cb);
  tmp[0] = output[0] + output[3];
  tmp[1] = output[1] + output[2];
  tmp[2] = -output[2] + output[1];
  tmp[3] = -output[3] + output[0];
  tmp[4] = output[4];
  tmp[5] = half_btf(-c[32], output[5], c[32], output[6], cb);
  tmp[6] = half_btf(c[32], output[6], c[32], output[5], cb);
  tmp[7] = output[7];
  range_check(stage, input, tmp, size, stage_range[stage]);

  // stage 3: tmp -> output
  stage++;
  cb = cos_bit[stage];
  c = cospi_arr(cb);
  output[0] = half_btf(c[32], tmp[0], c[32], tmp[1], cb);
  output[1] = half_btf(-c[32], tmp[1], c[32], tmp[0], cb);
  output[2] = half_btf(c[48], tmp[2], c[16], tmp[3], cb);
  output[3] = half_btf(c[48], tmp[3], -c[16], tmp[2], cb);
  output[4] = tmp[4] + tmp[5];
  output[5] = -tmp[5] + tmp[4];
  output[6] = -tmp[6] + tmp[7];
  output[7] = tmp[7] + tmp[6];
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 4: output -> tmp; the lower half passes through unchanged
  stage++;
  cb = cos_bit[stage];
  c = cospi_arr(cb);
  for (i = 0; i < 4; ++i) tmp[i] = output[i];
  tmp[4] = half_btf(c[56], output[4], c[8], output[7], cb);
  tmp[5] = half_btf(c[24], output[5], c[40], output[6], cb);
  tmp[6] = half_btf(c[24], output[6], -c[40], output[5], cb);
  tmp[7] = half_btf(c[56], output[7], -c[8], output[4], cb);
  range_check(stage, input, tmp, size, stage_range[stage]);

  // stage 5: reorder tmp -> output
  stage++;
  for (i = 0; i < 8; ++i) output[i] = tmp[final_order[i]];
  range_check(stage, input, output, size, stage_range[stage]);
}
161
// 16-point forward butterfly network (a forward DCT, going by the function
// name). Seven stages ping-pong between `output` and a local scratch array;
// the final values are left in `output`.
//
//   input       - 16 samples, read only.
//   output      - 16 results; also used for intermediate stages.
//   cos_bit     - indexed by stage number; the entry is handed to cospi_arr()
//                 and half_btf(). Read in stages 2 through 6.
//   stage_range - indexed by stage number; forwarded to range_check() after
//                 every stage.
//
// Stage 1 stores into `output` while `input` is still being read, so
// overlapping buffers would corrupt the result.
void av1_fdct16_new(const int32_t *input, int32_t *output,
                    const int8_t *cos_bit, const int8_t *stage_range) {
  // Source index in the scratch array for each final output position.
  static const int final_order[16] = { 0, 8, 4, 12, 2, 10, 6, 14,
                                       1, 9, 5, 13, 3, 11, 7, 15 };
  const int32_t size = 16;
  const int32_t *c;
  int32_t tmp[16];
  int32_t stage = 0;
  int cb;
  int i;

  // stage 0: the untouched input
  range_check(stage, input, input, size, stage_range[stage]);

  // stage 1: mirrored sums in the lower half, mirrored differences above
  stage++;
  for (i = 0; i < 8; ++i) output[i] = input[i] + input[15 - i];
  for (i = 8; i < 16; ++i) output[i] = -input[i] + input[15 - i];
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 2: output -> tmp
  stage++;
  cb = cos_bit[stage];
  c = cospi_arr(cb);
  for (i = 0; i < 4; ++i) tmp[i] = output[i] + output[7 - i];
  for (i = 4; i < 8; ++i) tmp[i] = -output[i] + output[7 - i];
  tmp[8] = output[8];
  tmp[9] = output[9];
  tmp[10] = half_btf(-c[32], output[10], c[32], output[13], cb);
  tmp[11] = half_btf(-c[32], output[11], c[32], output[12], cb);
  tmp[12] = half_btf(c[32], output[12], c[32], output[11], cb);
  tmp[13] = half_btf(c[32], output[13], c[32], output[10], cb);
  tmp[14] = output[14];
  tmp[15] = output[15];
  range_check(stage, input, tmp, size, stage_range[stage]);

  // stage 3: tmp -> output
  stage++;
  cb = cos_bit[stage];
  c = cospi_arr(cb);
  output[0] = tmp[0] + tmp[3];
  output[1] = tmp[1] + tmp[2];
  output[2] = -tmp[2] + tmp[1];
  output[3] = -tmp[3] + tmp[0];
  output[4] = tmp[4];
  output[5] = half_btf(-c[32], tmp[5], c[32], tmp[6], cb);
  output[6] = half_btf(c[32], tmp[6], c[32], tmp[5], cb);
  output[7] = tmp[7];
  output[8] = tmp[8] + tmp[11];
  output[9] = tmp[9] + tmp[10];
  output[10] = -tmp[10] + tmp[9];
  output[11] = -tmp[11] + tmp[8];
  output[12] = -tmp[12] + tmp[15];
  output[13] = -tmp[13] + tmp[14];
  output[14] = tmp[14] + tmp[13];
  output[15] = tmp[15] + tmp[12];
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 4: output -> tmp
  stage++;
  cb = cos_bit[stage];
  c = cospi_arr(cb);
  tmp[0] = half_btf(c[32], output[0], c[32], output[1], cb);
  tmp[1] = half_btf(-c[32], output[1], c[32], output[0], cb);
  tmp[2] = half_btf(c[48], output[2], c[16], output[3], cb);
  tmp[3] = half_btf(c[48], output[3], -c[16], output[2], cb);
  tmp[4] = output[4] + output[5];
  tmp[5] = -output[5] + output[4];
  tmp[6] = -output[6] + output[7];
  tmp[7] = output[7] + output[6];
  tmp[8] = output[8];
  tmp[9] = half_btf(-c[16], output[9], c[48], output[14], cb);
  tmp[10] = half_btf(-c[48], output[10], -c[16], output[13], cb);
  tmp[11] = output[11];
  tmp[12] = output[12];
  tmp[13] = half_btf(c[48], output[13], -c[16], output[10], cb);
  tmp[14] = half_btf(c[16], output[14], c[48], output[9], cb);
  tmp[15] = output[15];
  range_check(stage, input, tmp, size, stage_range[stage]);

  // stage 5: tmp -> output; entries 0..3 pass through unchanged
  stage++;
  cb = cos_bit[stage];
  c = cospi_arr(cb);
  for (i = 0; i < 4; ++i) output[i] = tmp[i];
  output[4] = half_btf(c[56], tmp[4], c[8], tmp[7], cb);
  output[5] = half_btf(c[24], tmp[5], c[40], tmp[6], cb);
  output[6] = half_btf(c[24], tmp[6], -c[40], tmp[5], cb);
  output[7] = half_btf(c[56], tmp[7], -c[8], tmp[4], cb);
  // Entries 8..15: two groups of four, each built from neighbouring pairs.
  for (i = 8; i < 16; i += 4) {
    output[i] = tmp[i] + tmp[i + 1];
    output[i + 1] = -tmp[i + 1] + tmp[i];
    output[i + 2] = -tmp[i + 2] + tmp[i + 3];
    output[i + 3] = tmp[i + 3] + tmp[i + 2];
  }
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 6: output -> tmp; the lower half passes through unchanged
  stage++;
  cb = cos_bit[stage];
  c = cospi_arr(cb);
  for (i = 0; i < 8; ++i) tmp[i] = output[i];
  tmp[8] = half_btf(c[60], output[8], c[4], output[15], cb);
  tmp[9] = half_btf(c[28], output[9], c[36], output[14], cb);
  tmp[10] = half_btf(c[44], output[10], c[20], output[13], cb);
  tmp[11] = half_btf(c[12], output[11], c[52], output[12], cb);
  tmp[12] = half_btf(c[12], output[12], -c[52], output[11], cb);
  tmp[13] = half_btf(c[44], output[13], -c[20], output[10], cb);
  tmp[14] = half_btf(c[28], output[14], -c[36], output[9], cb);
  tmp[15] = half_btf(c[60], output[15], -c[4], output[8], cb);
  range_check(stage, input, tmp, size, stage_range[stage]);

  // stage 7: reorder tmp -> output
  stage++;
  for (i = 0; i < 16; ++i) output[i] = tmp[final_order[i]];
  range_check(stage, input, output, size, stage_range[stage]);
}
332
// 32-point forward butterfly network (a forward DCT, going by the function
// name). Nine stages ping-pong between `output` and a local scratch array;
// the final values are left in `output`.
//
//   input       - 32 samples, read only.
//   output      - 32 results; also used for intermediate stages.
//   cos_bit     - indexed by stage number; the entry is handed to cospi_arr()
//                 and half_btf(). Read in stages 2 through 8.
//   stage_range - indexed by stage number; forwarded to range_check() after
//                 every stage.
//
// Stage 1 stores into `output` while `input` is still being read, so
// overlapping buffers would corrupt the result.
void av1_fdct32_new(const int32_t *input, int32_t *output,
                    const int8_t *cos_bit, const int8_t *stage_range) {
  // Source index in the scratch array for each final output position.
  static const int final_order[32] = { 0, 16, 8,  24, 4, 20, 12, 28,
                                       2, 18, 10, 26, 6, 22, 14, 30,
                                       1, 17, 9,  25, 5, 21, 13, 29,
                                       3, 19, 11, 27, 7, 23, 15, 31 };
  const int32_t size = 32;
  const int32_t *c;
  int32_t tmp[32];
  int32_t stage = 0;
  int cb;
  int i;

  // stage 0: the untouched input
  range_check(stage, input, input, size, stage_range[stage]);

  // stage 1: mirrored sums in the lower half, mirrored differences above
  stage++;
  for (i = 0; i < 16; ++i) output[i] = input[i] + input[31 - i];
  for (i = 16; i < 32; ++i) output[i] = -input[i] + input[31 - i];
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 2: output -> tmp
  stage++;
  cb = cos_bit[stage];
  c = cospi_arr(cb);
  for (i = 0; i < 8; ++i) tmp[i] = output[i] + output[15 - i];
  for (i = 8; i < 16; ++i) tmp[i] = -output[i] + output[15 - i];
  for (i = 16; i < 20; ++i) tmp[i] = output[i];
  // Entries 20..27 pair up mirror-wise around the 23/24 boundary.
  for (i = 20; i < 24; ++i)
    tmp[i] = half_btf(-c[32], output[i], c[32], output[47 - i], cb);
  for (i = 24; i < 28; ++i)
    tmp[i] = half_btf(c[32], output[i], c[32], output[47 - i], cb);
  for (i = 28; i < 32; ++i) tmp[i] = output[i];
  range_check(stage, input, tmp, size, stage_range[stage]);

  // stage 3: tmp -> output
  stage++;
  cb = cos_bit[stage];
  c = cospi_arr(cb);
  for (i = 0; i < 4; ++i) output[i] = tmp[i] + tmp[7 - i];
  for (i = 4; i < 8; ++i) output[i] = -tmp[i] + tmp[7 - i];
  output[8] = tmp[8];
  output[9] = tmp[9];
  output[10] = half_btf(-c[32], tmp[10], c[32], tmp[13], cb);
  output[11] = half_btf(-c[32], tmp[11], c[32], tmp[12], cb);
  output[12] = half_btf(c[32], tmp[12], c[32], tmp[11], cb);
  output[13] = half_btf(c[32], tmp[13], c[32], tmp[10], cb);
  output[14] = tmp[14];
  output[15] = tmp[15];
  // 16..23 mirror around 19/20, 24..31 mirror around 27/28.
  for (i = 16; i < 20; ++i) output[i] = tmp[i] + tmp[39 - i];
  for (i = 20; i < 24; ++i) output[i] = -tmp[i] + tmp[39 - i];
  for (i = 24; i < 28; ++i) output[i] = -tmp[i] + tmp[55 - i];
  for (i = 28; i < 32; ++i) output[i] = tmp[i] + tmp[55 - i];
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 4: output -> tmp
  stage++;
  cb = cos_bit[stage];
  c = cospi_arr(cb);
  tmp[0] = output[0] + output[3];
  tmp[1] = output[1] + output[2];
  tmp[2] = -output[2] + output[1];
  tmp[3] = -output[3] + output[0];
  tmp[4] = output[4];
  tmp[5] = half_btf(-c[32], output[5], c[32], output[6], cb);
  tmp[6] = half_btf(c[32], output[6], c[32], output[5], cb);
  tmp[7] = output[7];
  tmp[8] = output[8] + output[11];
  tmp[9] = output[9] + output[10];
  tmp[10] = -output[10] + output[9];
  tmp[11] = -output[11] + output[8];
  tmp[12] = -output[12] + output[15];
  tmp[13] = -output[13] + output[14];
  tmp[14] = output[14] + output[13];
  tmp[15] = output[15] + output[12];
  tmp[16] = output[16];
  tmp[17] = output[17];
  tmp[18] = half_btf(-c[16], output[18], c[48], output[29], cb);
  tmp[19] = half_btf(-c[16], output[19], c[48], output[28], cb);
  tmp[20] = half_btf(-c[48], output[20], -c[16], output[27], cb);
  tmp[21] = half_btf(-c[48], output[21], -c[16], output[26], cb);
  for (i = 22; i < 26; ++i) tmp[i] = output[i];
  tmp[26] = half_btf(c[48], output[26], -c[16], output[21], cb);
  tmp[27] = half_btf(c[48], output[27], -c[16], output[20], cb);
  tmp[28] = half_btf(c[16], output[28], c[48], output[19], cb);
  tmp[29] = half_btf(c[16], output[29], c[48], output[18], cb);
  tmp[30] = output[30];
  tmp[31] = output[31];
  range_check(stage, input, tmp, size, stage_range[stage]);

  // stage 5: tmp -> output
  stage++;
  cb = cos_bit[stage];
  c = cospi_arr(cb);
  output[0] = half_btf(c[32], tmp[0], c[32], tmp[1], cb);
  output[1] = half_btf(-c[32], tmp[1], c[32], tmp[0], cb);
  output[2] = half_btf(c[48], tmp[2], c[16], tmp[3], cb);
  output[3] = half_btf(c[48], tmp[3], -c[16], tmp[2], cb);
  output[4] = tmp[4] + tmp[5];
  output[5] = -tmp[5] + tmp[4];
  output[6] = -tmp[6] + tmp[7];
  output[7] = tmp[7] + tmp[6];
  output[8] = tmp[8];
  output[9] = half_btf(-c[16], tmp[9], c[48], tmp[14], cb);
  output[10] = half_btf(-c[48], tmp[10], -c[16], tmp[13], cb);
  output[11] = tmp[11];
  output[12] = tmp[12];
  output[13] = half_btf(c[48], tmp[13], -c[16], tmp[10], cb);
  output[14] = half_btf(c[16], tmp[14], c[48], tmp[9], cb);
  output[15] = tmp[15];
  // Entries 16..31: two groups of eight with the same add/sub layout.
  for (i = 16; i < 32; i += 8) {
    output[i] = tmp[i] + tmp[i + 3];
    output[i + 1] = tmp[i + 1] + tmp[i + 2];
    output[i + 2] = -tmp[i + 2] + tmp[i + 1];
    output[i + 3] = -tmp[i + 3] + tmp[i];
    output[i + 4] = -tmp[i + 4] + tmp[i + 7];
    output[i + 5] = -tmp[i + 5] + tmp[i + 6];
    output[i + 6] = tmp[i + 6] + tmp[i + 5];
    output[i + 7] = tmp[i + 7] + tmp[i + 4];
  }
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 6: output -> tmp; entries 0..3 pass through unchanged
  stage++;
  cb = cos_bit[stage];
  c = cospi_arr(cb);
  for (i = 0; i < 4; ++i) tmp[i] = output[i];
  tmp[4] = half_btf(c[56], output[4], c[8], output[7], cb);
  tmp[5] = half_btf(c[24], output[5], c[40], output[6], cb);
  tmp[6] = half_btf(c[24], output[6], -c[40], output[5], cb);
  tmp[7] = half_btf(c[56], output[7], -c[8], output[4], cb);
  // Entries 8..15: two groups of four built from neighbouring pairs.
  for (i = 8; i < 16; i += 4) {
    tmp[i] = output[i] + output[i + 1];
    tmp[i + 1] = -output[i + 1] + output[i];
    tmp[i + 2] = -output[i + 2] + output[i + 3];
    tmp[i + 3] = output[i + 3] + output[i + 2];
  }
  tmp[16] = output[16];
  tmp[17] = half_btf(-c[8], output[17], c[56], output[30], cb);
  tmp[18] = half_btf(-c[56], output[18], -c[8], output[29], cb);
  tmp[19] = output[19];
  tmp[20] = output[20];
  tmp[21] = half_btf(-c[40], output[21], c[24], output[26], cb);
  tmp[22] = half_btf(-c[24], output[22], -c[40], output[25], cb);
  tmp[23] = output[23];
  tmp[24] = output[24];
  tmp[25] = half_btf(c[24], output[25], -c[40], output[22], cb);
  tmp[26] = half_btf(c[40], output[26], c[24], output[21], cb);
  tmp[27] = output[27];
  tmp[28] = output[28];
  tmp[29] = half_btf(c[56], output[29], -c[8], output[18], cb);
  tmp[30] = half_btf(c[8], output[30], c[56], output[17], cb);
  tmp[31] = output[31];
  range_check(stage, input, tmp, size, stage_range[stage]);

  // stage 7: tmp -> output; entries 0..7 pass through unchanged
  stage++;
  cb = cos_bit[stage];
  c = cospi_arr(cb);
  for (i = 0; i < 8; ++i) output[i] = tmp[i];
  output[8] = half_btf(c[60], tmp[8], c[4], tmp[15], cb);
  output[9] = half_btf(c[28], tmp[9], c[36], tmp[14], cb);
  output[10] = half_btf(c[44], tmp[10], c[20], tmp[13], cb);
  output[11] = half_btf(c[12], tmp[11], c[52], tmp[12], cb);
  output[12] = half_btf(c[12], tmp[12], -c[52], tmp[11], cb);
  output[13] = half_btf(c[44], tmp[13], -c[20], tmp[10], cb);
  output[14] = half_btf(c[28], tmp[14], -c[36], tmp[9], cb);
  output[15] = half_btf(c[60], tmp[15], -c[4], tmp[8], cb);
  // Entries 16..31: four groups of four built from neighbouring pairs.
  for (i = 16; i < 32; i += 4) {
    output[i] = tmp[i] + tmp[i + 1];
    output[i + 1] = -tmp[i + 1] + tmp[i];
    output[i + 2] = -tmp[i + 2] + tmp[i + 3];
    output[i + 3] = tmp[i + 3] + tmp[i + 2];
  }
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 8: output -> tmp; the lower half passes through unchanged
  stage++;
  cb = cos_bit[stage];
  c = cospi_arr(cb);
  for (i = 0; i < 16; ++i) tmp[i] = output[i];
  tmp[16] = half_btf(c[62], output[16], c[2], output[31], cb);
  tmp[17] = half_btf(c[30], output[17], c[34], output[30], cb);
  tmp[18] = half_btf(c[46], output[18], c[18], output[29], cb);
  tmp[19] = half_btf(c[14], output[19], c[50], output[28], cb);
  tmp[20] = half_btf(c[54], output[20], c[10], output[27], cb);
  tmp[21] = half_btf(c[22], output[21], c[42], output[26], cb);
  tmp[22] = half_btf(c[38], output[22], c[26], output[25], cb);
  tmp[23] = half_btf(c[6], output[23], c[58], output[24], cb);
  tmp[24] = half_btf(c[6], output[24], -c[58], output[23], cb);
  tmp[25] = half_btf(c[38], output[25], -c[26], output[22], cb);
  tmp[26] = half_btf(c[22], output[26], -c[42], output[21], cb);
  tmp[27] = half_btf(c[54], output[27], -c[10], output[20], cb);
  tmp[28] = half_btf(c[14], output[28], -c[50], output[19], cb);
  tmp[29] = half_btf(c[46], output[29], -c[18], output[18], cb);
  tmp[30] = half_btf(c[30], output[30], -c[34], output[17], cb);
  tmp[31] = half_btf(c[62], output[31], -c[2], output[16], cb);
  range_check(stage, input, tmp, size, stage_range[stage]);

  // stage 9: reorder tmp -> output
  stage++;
  for (i = 0; i < 32; ++i) output[i] = tmp[final_order[i]];
  range_check(stage, input, output, size, stage_range[stage]);
}
693
// 4-point forward butterfly network (a forward ADST, going by the function
// name). Five stages ping-pong between `output` and a local scratch array;
// the final values are left in `output`.
//
//   input       - 4 samples, read only.
//   output      - 4 results; also used for intermediate stages.
//   cos_bit     - indexed by stage number; the entry is handed to cospi_arr()
//                 and half_btf(). Read in stages 2 and 4.
//   stage_range - indexed by stage number; forwarded to range_check() after
//                 every stage.
//
// Stage 1 stores into `output` while `input` is still being read, so
// overlapping buffers would corrupt the result.
void av1_fadst4_new(const int32_t *input, int32_t *output,
                    const int8_t *cos_bit, const int8_t *stage_range) {
  const int32_t size = 4;
  const int32_t *c;
  int32_t tmp[4];
  int32_t stage = 0;
  int cb;

  // stage 0: the untouched input
  range_check(stage, input, input, size, stage_range[stage]);

  // stage 1: input reordering
  stage++;
  output[0] = input[3];
  output[1] = input[0];
  output[2] = input[1];
  output[3] = input[2];
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 2: weighted butterflies on adjacent pairs, output -> tmp
  stage++;
  cb = cos_bit[stage];
  c = cospi_arr(cb);
  tmp[0] = half_btf(c[8], output[0], c[56], output[1], cb);
  tmp[1] = half_btf(-c[8], output[1], c[56], output[0], cb);
  tmp[2] = half_btf(c[40], output[2], c[24], output[3], cb);
  tmp[3] = half_btf(-c[40], output[3], c[24], output[2], cb);
  range_check(stage, input, tmp, size, stage_range[stage]);

  // stage 3: sums and differences at distance two, tmp -> output
  stage++;
  output[0] = tmp[0] + tmp[2];
  output[1] = tmp[1] + tmp[3];
  output[2] = -tmp[2] + tmp[0];
  output[3] = -tmp[3] + tmp[1];
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 4: output -> tmp; only the upper pair is rotated
  stage++;
  cb = cos_bit[stage];
  c = cospi_arr(cb);
  tmp[0] = output[0];
  tmp[1] = output[1];
  tmp[2] = half_btf(c[32], output[2], c[32], output[3], cb);
  tmp[3] = half_btf(-c[32], output[3], c[32], output[2], cb);
  range_check(stage, input, tmp, size, stage_range[stage]);

  // stage 5: reorder tmp -> output, negating the odd output positions
  stage++;
  output[0] = tmp[0];
  output[1] = -tmp[2];
  output[2] = tmp[3];
  output[3] = -tmp[1];
  range_check(stage, input, output, size, stage_range[stage]);
}
757
// 8-point forward butterfly network (a forward ADST, going by the function
// name). Seven stages ping-pong between `output` and a local scratch array;
// the final values are left in `output`.
//
//   input       - 8 samples, read only.
//   output      - 8 results; also used for intermediate stages.
//   cos_bit     - indexed by stage number; the entry is handed to cospi_arr()
//                 and half_btf(). Read in stages 2, 4 and 6.
//   stage_range - indexed by stage number; forwarded to range_check() after
//                 every stage.
//
// Stage 1 stores into `output` while `input` is still being read, so
// overlapping buffers would corrupt the result.
void av1_fadst8_new(const int32_t *input, int32_t *output,
                    const int8_t *cos_bit, const int8_t *stage_range) {
  const int32_t size = 8;
  const int32_t *c;
  int32_t tmp[8];
  int32_t stage = 0;
  int cb;
  int i;

  // stage 0: the untouched input
  range_check(stage, input, input, size, stage_range[stage]);

  // stage 1: input reordering
  stage++;
  output[0] = input[7];
  output[1] = input[0];
  output[2] = input[5];
  output[3] = input[2];
  output[4] = input[3];
  output[5] = input[4];
  output[6] = input[1];
  output[7] = input[6];
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 2: weighted butterflies on adjacent pairs, output -> tmp
  stage++;
  cb = cos_bit[stage];
  c = cospi_arr(cb);
  tmp[0] = half_btf(c[4], output[0], c[60], output[1], cb);
  tmp[1] = half_btf(-c[4], output[1], c[60], output[0], cb);
  tmp[2] = half_btf(c[20], output[2], c[44], output[3], cb);
  tmp[3] = half_btf(-c[20], output[3], c[44], output[2], cb);
  tmp[4] = half_btf(c[36], output[4], c[28], output[5], cb);
  tmp[5] = half_btf(-c[36], output[5], c[28], output[4], cb);
  tmp[6] = half_btf(c[52], output[6], c[12], output[7], cb);
  tmp[7] = half_btf(-c[52], output[7], c[12], output[6], cb);
  range_check(stage, input, tmp, size, stage_range[stage]);

  // stage 3: sums and differences at distance four, tmp -> output
  stage++;
  for (i = 0; i < 4; ++i) output[i] = tmp[i] + tmp[i + 4];
  for (i = 4; i < 8; ++i) output[i] = -tmp[i] + tmp[i - 4];
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 4: output -> tmp; the lower half passes through unchanged
  stage++;
  cb = cos_bit[stage];
  c = cospi_arr(cb);
  for (i = 0; i < 4; ++i) tmp[i] = output[i];
  tmp[4] = half_btf(c[16], output[4], c[48], output[5], cb);
  tmp[5] = half_btf(-c[16], output[5], c[48], output[4], cb);
  tmp[6] = half_btf(-c[48], output[6], c[16], output[7], cb);
  tmp[7] = half_btf(c[48], output[7], c[16], output[6], cb);
  range_check(stage, input, tmp, size, stage_range[stage]);

  // stage 5: sums and differences at distance two within each half
  stage++;
  output[0] = tmp[0] + tmp[2];
  output[1] = tmp[1] + tmp[3];
  output[2] = -tmp[2] + tmp[0];
  output[3] = -tmp[3] + tmp[1];
  output[4] = tmp[4] + tmp[6];
  output[5] = tmp[5] + tmp[7];
  output[6] = -tmp[6] + tmp[4];
  output[7] = -tmp[7] + tmp[5];
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 6: output -> tmp; pairs (2,3) and (6,7) are rotated, rest copied
  stage++;
  cb = cos_bit[stage];
  c = cospi_arr(cb);
  tmp[0] = output[0];
  tmp[1] = output[1];
  tmp[2] = half_btf(c[32], output[2], c[32], output[3], cb);
  tmp[3] = half_btf(-c[32], output[3], c[32], output[2], cb);
  tmp[4] = output[4];
  tmp[5] = output[5];
  tmp[6] = half_btf(c[32], output[6], c[32], output[7], cb);
  tmp[7] = half_btf(-c[32], output[7], c[32], output[6], cb);
  range_check(stage, input, tmp, size, stage_range[stage]);

  // stage 7: reorder tmp -> output, negating the odd output positions
  stage++;
  output[0] = tmp[0];
  output[1] = -tmp[4];
  output[2] = tmp[6];
  output[3] = -tmp[2];
  output[4] = tmp[3];
  output[5] = -tmp[7];
  output[6] = tmp[5];
  output[7] = -tmp[1];
  range_check(stage, input, output, size, stage_range[stage]);
}
870
// 16-point 1-D transform (a forward ADST, going by the function name).
// The work is done in nine stages that alternate between `output` and a
// local scratch array; the finished values are left in `output`.
//   input       - 16 source values (only read).
//   output      - 16 result values; doubles as working storage for the
//                 odd-numbered stages.
//   cos_bit     - per-stage bit setting; entries 2, 4, 6 and 8 are read and
//                 handed to cospi_arr() and half_btf().
//   stage_range - per-stage value forwarded to range_check(), entries 0..9.
void av1_fadst16_new(const int32_t *input, int32_t *output,
                     const int8_t *cos_bit, const int8_t *stage_range) {
  // Scratch slot that feeds each output slot in the last stage. Odd output
  // slots additionally receive the negated value.
  static const int8_t last_src[16] = { 0, 8,  12, 4, 6, 14, 10, 2,
                                       3, 11, 15, 7, 5, 13, 9,  1 };
  const int32_t size = 16;
  const int32_t *cospi;
  int32_t stage = 0;
  int32_t step[16];
  int8_t bit;
  int g, i, k;

  // stage 0: only the range check on the raw input.
  range_check(stage, input, input, size, stage_range[stage]);

  // stage 1: interleave the input, odd slots ascending and even slots
  // descending from the far end.
  stage++;
  for (i = 0; i < size; i += 2) {
    output[i] = input[size - 1 - i];
    output[i + 1] = input[i];
  }
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 2: rotate every adjacent pair; weights cospi[2], cospi[10], ...
  // paired with cospi[62], cospi[54], ...
  stage++;
  bit = cos_bit[stage];
  cospi = cospi_arr(bit);
  for (i = 0; i < size; i += 2) {
    const int32_t wa = cospi[4 * i + 2];
    const int32_t wb = cospi[62 - 4 * i];
    step[i] = half_btf(wa, output[i], wb, output[i + 1], bit);
    step[i + 1] = half_btf(-wa, output[i + 1], wb, output[i], bit);
  }
  range_check(stage, input, step, size, stage_range[stage]);

  // stage 3: sum/difference across a distance of 8.
  stage++;
  for (i = 0; i < 8; ++i) {
    output[i] = step[i] + step[i + 8];
    output[i + 8] = -step[i + 8] + step[i];
  }
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 4: lower half passes through, upper half is rotated pairwise.
  stage++;
  bit = cos_bit[stage];
  cospi = cospi_arr(bit);
  for (i = 0; i < 8; ++i) {
    step[i] = output[i];
  }
  for (k = 0; k < 2; ++k) {
    const int32_t wa = cospi[8 + 32 * k];
    const int32_t wb = cospi[56 - 32 * k];
    const int p = 8 + 2 * k;
    const int q = 12 + 2 * k;
    step[p] = half_btf(wa, output[p], wb, output[p + 1], bit);
    step[p + 1] = half_btf(-wa, output[p + 1], wb, output[p], bit);
    step[q] = half_btf(-wb, output[q], wa, output[q + 1], bit);
    step[q + 1] = half_btf(wb, output[q + 1], wa, output[q], bit);
  }
  range_check(stage, input, step, size, stage_range[stage]);

  // stage 5: sum/difference across a distance of 4 inside each group of 8.
  stage++;
  for (g = 0; g < size; g += 8) {
    for (i = g; i < g + 4; ++i) {
      output[i] = step[i] + step[i + 4];
      output[i + 4] = -step[i + 4] + step[i];
    }
  }
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 6: in each group of 8 the first four pass through and the last
  // four are rotated with cospi[16] / cospi[48].
  stage++;
  bit = cos_bit[stage];
  cospi = cospi_arr(bit);
  {
    const int32_t c16 = cospi[16];
    const int32_t c48 = cospi[48];
    for (g = 0; g < size; g += 8) {
      for (i = g; i < g + 4; ++i) {
        step[i] = output[i];
      }
      step[g + 4] = half_btf(c16, output[g + 4], c48, output[g + 5], bit);
      step[g + 5] = half_btf(-c16, output[g + 5], c48, output[g + 4], bit);
      step[g + 6] = half_btf(-c48, output[g + 6], c16, output[g + 7], bit);
      step[g + 7] = half_btf(c48, output[g + 7], c16, output[g + 6], bit);
    }
  }
  range_check(stage, input, step, size, stage_range[stage]);

  // stage 7: sum/difference across a distance of 2 inside each group of 4.
  stage++;
  for (g = 0; g < size; g += 4) {
    for (i = g; i < g + 2; ++i) {
      output[i] = step[i] + step[i + 2];
      output[i + 2] = -step[i + 2] + step[i];
    }
  }
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 8: in each group of 4 the first two pass through and the last two
  // are rotated with cospi[32].
  stage++;
  bit = cos_bit[stage];
  cospi = cospi_arr(bit);
  {
    const int32_t c32 = cospi[32];
    for (g = 0; g < size; g += 4) {
      step[g] = output[g];
      step[g + 1] = output[g + 1];
      step[g + 2] = half_btf(c32, output[g + 2], c32, output[g + 3], bit);
      step[g + 3] = half_btf(-c32, output[g + 3], c32, output[g + 2], bit);
    }
  }
  range_check(stage, input, step, size, stage_range[stage]);

  // stage 9: final reordering; odd output slots are negated.
  stage++;
  for (i = 0; i < size; ++i) {
    const int32_t v = step[last_src[i]];
    output[i] = (i & 1) ? -v : v;
  }
  range_check(stage, input, output, size, stage_range[stage]);
}
1084
// 32-point 1-D transform (a forward ADST, going by the function name).
// The work is done in eleven stages that alternate between `output` and a
// local scratch array; the finished values are left in `output`.
//   input       - 32 source values (only read).
//   output      - 32 result values; doubles as working storage for the
//                 odd-numbered stages.
//   cos_bit     - per-stage bit setting; entries 2, 4, 6, 8 and 10 are read
//                 and handed to cospi_arr() and half_btf().
//   stage_range - per-stage value forwarded to range_check(), entries 0..11.
void av1_fadst32_new(const int32_t *input, int32_t *output,
                     const int8_t *cos_bit, const int8_t *stage_range) {
  // Scratch slot that feeds each output slot in the last stage. Odd output
  // slots additionally receive the negated value.
  static const int8_t last_src[32] = {
    0, 16, 24, 8,  12, 28, 20, 4, 6, 22, 30, 14, 10, 26, 18, 2,
    3, 19, 27, 11, 15, 31, 23, 7, 5, 21, 29, 13, 9,  25, 17, 1
  };
  const int32_t size = 32;
  const int32_t *cospi;
  int32_t stage = 0;
  int32_t step[32];
  int8_t bit;
  int g, i, k;

  // stage 0: only the range check on the raw input.
  range_check(stage, input, input, size, stage_range[stage]);

  // stage 1: interleave the input, odd slots ascending and even slots
  // descending from the far end.
  stage++;
  for (i = 0; i < size; i += 2) {
    output[i] = input[size - 1 - i];
    output[i + 1] = input[i];
  }
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 2: rotate every adjacent pair; weights cospi[1], cospi[5], ...
  // paired with cospi[63], cospi[59], ...
  stage++;
  bit = cos_bit[stage];
  cospi = cospi_arr(bit);
  for (i = 0; i < size; i += 2) {
    const int32_t wa = cospi[2 * i + 1];
    const int32_t wb = cospi[63 - 2 * i];
    step[i] = half_btf(wa, output[i], wb, output[i + 1], bit);
    step[i + 1] = half_btf(-wa, output[i + 1], wb, output[i], bit);
  }
  range_check(stage, input, step, size, stage_range[stage]);

  // stage 3: sum/difference across a distance of 16.
  stage++;
  for (i = 0; i < 16; ++i) {
    output[i] = step[i] + step[i + 16];
    output[i + 16] = -step[i + 16] + step[i];
  }
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 4: lower half passes through, upper half is rotated pairwise.
  stage++;
  bit = cos_bit[stage];
  cospi = cospi_arr(bit);
  for (i = 0; i < 16; ++i) {
    step[i] = output[i];
  }
  for (k = 0; k < 4; ++k) {
    const int32_t wa = cospi[4 + 16 * k];
    const int32_t wb = cospi[60 - 16 * k];
    const int p = 16 + 2 * k;
    const int q = 24 + 2 * k;
    step[p] = half_btf(wa, output[p], wb, output[p + 1], bit);
    step[p + 1] = half_btf(-wa, output[p + 1], wb, output[p], bit);
    step[q] = half_btf(-wb, output[q], wa, output[q + 1], bit);
    step[q + 1] = half_btf(wb, output[q + 1], wa, output[q], bit);
  }
  range_check(stage, input, step, size, stage_range[stage]);

  // stage 5: sum/difference across a distance of 8 inside each group of 16.
  stage++;
  for (g = 0; g < size; g += 16) {
    for (i = g; i < g + 8; ++i) {
      output[i] = step[i] + step[i + 8];
      output[i + 8] = -step[i + 8] + step[i];
    }
  }
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 6: in each group of 16 the first eight pass through and the last
  // eight are rotated pairwise.
  stage++;
  bit = cos_bit[stage];
  cospi = cospi_arr(bit);
  for (g = 0; g < size; g += 16) {
    for (i = g; i < g + 8; ++i) {
      step[i] = output[i];
    }
    for (k = 0; k < 2; ++k) {
      const int32_t wa = cospi[8 + 32 * k];
      const int32_t wb = cospi[56 - 32 * k];
      const int p = g + 8 + 2 * k;
      const int q = g + 12 + 2 * k;
      step[p] = half_btf(wa, output[p], wb, output[p + 1], bit);
      step[p + 1] = half_btf(-wa, output[p + 1], wb, output[p], bit);
      step[q] = half_btf(-wb, output[q], wa, output[q + 1], bit);
      step[q + 1] = half_btf(wb, output[q + 1], wa, output[q], bit);
    }
  }
  range_check(stage, input, step, size, stage_range[stage]);

  // stage 7: sum/difference across a distance of 4 inside each group of 8.
  stage++;
  for (g = 0; g < size; g += 8) {
    for (i = g; i < g + 4; ++i) {
      output[i] = step[i] + step[i + 4];
      output[i + 4] = -step[i + 4] + step[i];
    }
  }
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 8: in each group of 8 the first four pass through and the last
  // four are rotated with cospi[16] / cospi[48].
  stage++;
  bit = cos_bit[stage];
  cospi = cospi_arr(bit);
  {
    const int32_t c16 = cospi[16];
    const int32_t c48 = cospi[48];
    for (g = 0; g < size; g += 8) {
      for (i = g; i < g + 4; ++i) {
        step[i] = output[i];
      }
      step[g + 4] = half_btf(c16, output[g + 4], c48, output[g + 5], bit);
      step[g + 5] = half_btf(-c16, output[g + 5], c48, output[g + 4], bit);
      step[g + 6] = half_btf(-c48, output[g + 6], c16, output[g + 7], bit);
      step[g + 7] = half_btf(c48, output[g + 7], c16, output[g + 6], bit);
    }
  }
  range_check(stage, input, step, size, stage_range[stage]);

  // stage 9: sum/difference across a distance of 2 inside each group of 4.
  stage++;
  for (g = 0; g < size; g += 4) {
    for (i = g; i < g + 2; ++i) {
      output[i] = step[i] + step[i + 2];
      output[i + 2] = -step[i + 2] + step[i];
    }
  }
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 10: in each group of 4 the first two pass through and the last
  // two are rotated with cospi[32].
  stage++;
  bit = cos_bit[stage];
  cospi = cospi_arr(bit);
  {
    const int32_t c32 = cospi[32];
    for (g = 0; g < size; g += 4) {
      step[g] = output[g];
      step[g + 1] = output[g + 1];
      step[g + 2] = half_btf(c32, output[g + 2], c32, output[g + 3], bit);
      step[g + 3] = half_btf(-c32, output[g + 3], c32, output[g + 2], bit);
    }
  }
  range_check(stage, input, step, size, stage_range[stage]);

  // stage 11: final reordering; odd output slots are negated.
  stage++;
  for (i = 0; i < size; ++i) {
    const int32_t v = step[last_src[i]];
    output[i] = (i & 1) ? -v : v;
  }
  range_check(stage, input, output, size, stage_range[stage]);
}
1519
1520 #if CONFIG_EXT_TX
av1_fidentity4_c(const int32_t * input,int32_t * output,const int8_t * cos_bit,const int8_t * stage_range)1521 void av1_fidentity4_c(const int32_t *input, int32_t *output,
1522 const int8_t *cos_bit, const int8_t *stage_range) {
1523 (void)cos_bit;
1524 for (int i = 0; i < 4; ++i)
1525 output[i] = (int32_t)dct_const_round_shift(input[i] * Sqrt2);
1526 range_check(0, input, output, 4, stage_range[0]);
1527 }
1528
// 8-point identity transform: every output value is the matching input value
// multiplied by 2.
//   input       - 8 source values (only read).
//   output      - 8 result values.
//   cos_bit     - not used by this transform.
//   stage_range - entry 0 is forwarded to range_check().
void av1_fidentity8_c(const int32_t *input, int32_t *output,
                      const int8_t *cos_bit, const int8_t *stage_range) {
  int idx = 0;
  (void)cos_bit;
  while (idx < 8) {
    output[idx] = input[idx] * 2;
    idx++;
  }
  range_check(0, input, output, 8, stage_range[0]);
}
1535
av1_fidentity16_c(const int32_t * input,int32_t * output,const int8_t * cos_bit,const int8_t * stage_range)1536 void av1_fidentity16_c(const int32_t *input, int32_t *output,
1537 const int8_t *cos_bit, const int8_t *stage_range) {
1538 (void)cos_bit;
1539 for (int i = 0; i < 16; ++i)
1540 output[i] = (int32_t)dct_const_round_shift(input[i] * 2 * Sqrt2);
1541 range_check(0, input, output, 16, stage_range[0]);
1542 }
1543
// 32-point identity transform: every output value is the matching input value
// multiplied by 4.
//   input       - 32 source values (only read).
//   output      - 32 result values.
//   cos_bit     - not used by this transform.
//   stage_range - entry 0 is forwarded to range_check().
void av1_fidentity32_c(const int32_t *input, int32_t *output,
                       const int8_t *cos_bit, const int8_t *stage_range) {
  int idx = 0;
  (void)cos_bit;
  while (idx < 32) {
    output[idx] = input[idx] * 4;
    idx++;
  }
  range_check(0, input, output, 32, stage_range[0]);
}
1550
1551 #if CONFIG_TX64X64
av1_fidentity64_c(const int32_t * input,int32_t * output,const int8_t * cos_bit,const int8_t * stage_range)1552 void av1_fidentity64_c(const int32_t *input, int32_t *output,
1553 const int8_t *cos_bit, const int8_t *stage_range) {
1554 (void)cos_bit;
1555 for (int i = 0; i < 64; ++i)
1556 output[i] = (int32_t)dct_const_round_shift(input[i] * 4 * Sqrt2);
1557 range_check(0, input, output, 64, stage_range[0]);
1558 }
1559 #endif // CONFIG_TX64X64
1560 #endif // CONFIG_EXT_TX
1561
1562 #if CONFIG_TX64X64
av1_fdct64_new(const int32_t * input,int32_t * output,const int8_t * cos_bit,const int8_t * stage_range)1563 void av1_fdct64_new(const int32_t *input, int32_t *output,
1564 const int8_t *cos_bit, const int8_t *stage_range) {
1565 const int32_t size = 64;
1566 const int32_t *cospi;
1567
1568 int32_t stage = 0;
1569 int32_t *bf0, *bf1;
1570 int32_t step[64];
1571
1572 // stage 0;
1573 range_check(stage, input, input, size, stage_range[stage]);
1574
1575 // stage 1;
1576 stage++;
1577 cospi = cospi_arr(cos_bit[stage]);
1578 bf1 = output;
1579 bf1[0] = input[0] + input[63];
1580 bf1[1] = input[1] + input[62];
1581 bf1[2] = input[2] + input[61];
1582 bf1[3] = input[3] + input[60];
1583 bf1[4] = input[4] + input[59];
1584 bf1[5] = input[5] + input[58];
1585 bf1[6] = input[6] + input[57];
1586 bf1[7] = input[7] + input[56];
1587 bf1[8] = input[8] + input[55];
1588 bf1[9] = input[9] + input[54];
1589 bf1[10] = input[10] + input[53];
1590 bf1[11] = input[11] + input[52];
1591 bf1[12] = input[12] + input[51];
1592 bf1[13] = input[13] + input[50];
1593 bf1[14] = input[14] + input[49];
1594 bf1[15] = input[15] + input[48];
1595 bf1[16] = input[16] + input[47];
1596 bf1[17] = input[17] + input[46];
1597 bf1[18] = input[18] + input[45];
1598 bf1[19] = input[19] + input[44];
1599 bf1[20] = input[20] + input[43];
1600 bf1[21] = input[21] + input[42];
1601 bf1[22] = input[22] + input[41];
1602 bf1[23] = input[23] + input[40];
1603 bf1[24] = input[24] + input[39];
1604 bf1[25] = input[25] + input[38];
1605 bf1[26] = input[26] + input[37];
1606 bf1[27] = input[27] + input[36];
1607 bf1[28] = input[28] + input[35];
1608 bf1[29] = input[29] + input[34];
1609 bf1[30] = input[30] + input[33];
1610 bf1[31] = input[31] + input[32];
1611 bf1[32] = -input[32] + input[31];
1612 bf1[33] = -input[33] + input[30];
1613 bf1[34] = -input[34] + input[29];
1614 bf1[35] = -input[35] + input[28];
1615 bf1[36] = -input[36] + input[27];
1616 bf1[37] = -input[37] + input[26];
1617 bf1[38] = -input[38] + input[25];
1618 bf1[39] = -input[39] + input[24];
1619 bf1[40] = -input[40] + input[23];
1620 bf1[41] = -input[41] + input[22];
1621 bf1[42] = -input[42] + input[21];
1622 bf1[43] = -input[43] + input[20];
1623 bf1[44] = -input[44] + input[19];
1624 bf1[45] = -input[45] + input[18];
1625 bf1[46] = -input[46] + input[17];
1626 bf1[47] = -input[47] + input[16];
1627 bf1[48] = -input[48] + input[15];
1628 bf1[49] = -input[49] + input[14];
1629 bf1[50] = -input[50] + input[13];
1630 bf1[51] = -input[51] + input[12];
1631 bf1[52] = -input[52] + input[11];
1632 bf1[53] = -input[53] + input[10];
1633 bf1[54] = -input[54] + input[9];
1634 bf1[55] = -input[55] + input[8];
1635 bf1[56] = -input[56] + input[7];
1636 bf1[57] = -input[57] + input[6];
1637 bf1[58] = -input[58] + input[5];
1638 bf1[59] = -input[59] + input[4];
1639 bf1[60] = -input[60] + input[3];
1640 bf1[61] = -input[61] + input[2];
1641 bf1[62] = -input[62] + input[1];
1642 bf1[63] = -input[63] + input[0];
1643 range_check(stage, input, bf1, size, stage_range[stage]);
1644
1645 // stage 2
1646 stage++;
1647 cospi = cospi_arr(cos_bit[stage]);
1648 bf0 = output;
1649 bf1 = step;
1650 bf1[0] = bf0[0] + bf0[31];
1651 bf1[1] = bf0[1] + bf0[30];
1652 bf1[2] = bf0[2] + bf0[29];
1653 bf1[3] = bf0[3] + bf0[28];
1654 bf1[4] = bf0[4] + bf0[27];
1655 bf1[5] = bf0[5] + bf0[26];
1656 bf1[6] = bf0[6] + bf0[25];
1657 bf1[7] = bf0[7] + bf0[24];
1658 bf1[8] = bf0[8] + bf0[23];
1659 bf1[9] = bf0[9] + bf0[22];
1660 bf1[10] = bf0[10] + bf0[21];
1661 bf1[11] = bf0[11] + bf0[20];
1662 bf1[12] = bf0[12] + bf0[19];
1663 bf1[13] = bf0[13] + bf0[18];
1664 bf1[14] = bf0[14] + bf0[17];
1665 bf1[15] = bf0[15] + bf0[16];
1666 bf1[16] = -bf0[16] + bf0[15];
1667 bf1[17] = -bf0[17] + bf0[14];
1668 bf1[18] = -bf0[18] + bf0[13];
1669 bf1[19] = -bf0[19] + bf0[12];
1670 bf1[20] = -bf0[20] + bf0[11];
1671 bf1[21] = -bf0[21] + bf0[10];
1672 bf1[22] = -bf0[22] + bf0[9];
1673 bf1[23] = -bf0[23] + bf0[8];
1674 bf1[24] = -bf0[24] + bf0[7];
1675 bf1[25] = -bf0[25] + bf0[6];
1676 bf1[26] = -bf0[26] + bf0[5];
1677 bf1[27] = -bf0[27] + bf0[4];
1678 bf1[28] = -bf0[28] + bf0[3];
1679 bf1[29] = -bf0[29] + bf0[2];
1680 bf1[30] = -bf0[30] + bf0[1];
1681 bf1[31] = -bf0[31] + bf0[0];
1682 bf1[32] = bf0[32];
1683 bf1[33] = bf0[33];
1684 bf1[34] = bf0[34];
1685 bf1[35] = bf0[35];
1686 bf1[36] = bf0[36];
1687 bf1[37] = bf0[37];
1688 bf1[38] = bf0[38];
1689 bf1[39] = bf0[39];
1690 bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit[stage]);
1691 bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit[stage]);
1692 bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit[stage]);
1693 bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit[stage]);
1694 bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit[stage]);
1695 bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit[stage]);
1696 bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit[stage]);
1697 bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit[stage]);
1698 bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit[stage]);
1699 bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit[stage]);
1700 bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit[stage]);
1701 bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit[stage]);
1702 bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit[stage]);
1703 bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit[stage]);
1704 bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit[stage]);
1705 bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit[stage]);
1706 bf1[56] = bf0[56];
1707 bf1[57] = bf0[57];
1708 bf1[58] = bf0[58];
1709 bf1[59] = bf0[59];
1710 bf1[60] = bf0[60];
1711 bf1[61] = bf0[61];
1712 bf1[62] = bf0[62];
1713 bf1[63] = bf0[63];
1714 range_check(stage, input, bf1, size, stage_range[stage]);
1715
1716 // stage 3
1717 stage++;
1718 cospi = cospi_arr(cos_bit[stage]);
1719 bf0 = step;
1720 bf1 = output;
1721 bf1[0] = bf0[0] + bf0[15];
1722 bf1[1] = bf0[1] + bf0[14];
1723 bf1[2] = bf0[2] + bf0[13];
1724 bf1[3] = bf0[3] + bf0[12];
1725 bf1[4] = bf0[4] + bf0[11];
1726 bf1[5] = bf0[5] + bf0[10];
1727 bf1[6] = bf0[6] + bf0[9];
1728 bf1[7] = bf0[7] + bf0[8];
1729 bf1[8] = -bf0[8] + bf0[7];
1730 bf1[9] = -bf0[9] + bf0[6];
1731 bf1[10] = -bf0[10] + bf0[5];
1732 bf1[11] = -bf0[11] + bf0[4];
1733 bf1[12] = -bf0[12] + bf0[3];
1734 bf1[13] = -bf0[13] + bf0[2];
1735 bf1[14] = -bf0[14] + bf0[1];
1736 bf1[15] = -bf0[15] + bf0[0];
1737 bf1[16] = bf0[16];
1738 bf1[17] = bf0[17];
1739 bf1[18] = bf0[18];
1740 bf1[19] = bf0[19];
1741 bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
1742 bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
1743 bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
1744 bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
1745 bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit[stage]);
1746 bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit[stage]);
1747 bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit[stage]);
1748 bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit[stage]);
1749 bf1[28] = bf0[28];
1750 bf1[29] = bf0[29];
1751 bf1[30] = bf0[30];
1752 bf1[31] = bf0[31];
1753 bf1[32] = bf0[32] + bf0[47];
1754 bf1[33] = bf0[33] + bf0[46];
1755 bf1[34] = bf0[34] + bf0[45];
1756 bf1[35] = bf0[35] + bf0[44];
1757 bf1[36] = bf0[36] + bf0[43];
1758 bf1[37] = bf0[37] + bf0[42];
1759 bf1[38] = bf0[38] + bf0[41];
1760 bf1[39] = bf0[39] + bf0[40];
1761 bf1[40] = -bf0[40] + bf0[39];
1762 bf1[41] = -bf0[41] + bf0[38];
1763 bf1[42] = -bf0[42] + bf0[37];
1764 bf1[43] = -bf0[43] + bf0[36];
1765 bf1[44] = -bf0[44] + bf0[35];
1766 bf1[45] = -bf0[45] + bf0[34];
1767 bf1[46] = -bf0[46] + bf0[33];
1768 bf1[47] = -bf0[47] + bf0[32];
1769 bf1[48] = -bf0[48] + bf0[63];
1770 bf1[49] = -bf0[49] + bf0[62];
1771 bf1[50] = -bf0[50] + bf0[61];
1772 bf1[51] = -bf0[51] + bf0[60];
1773 bf1[52] = -bf0[52] + bf0[59];
1774 bf1[53] = -bf0[53] + bf0[58];
1775 bf1[54] = -bf0[54] + bf0[57];
1776 bf1[55] = -bf0[55] + bf0[56];
1777 bf1[56] = bf0[56] + bf0[55];
1778 bf1[57] = bf0[57] + bf0[54];
1779 bf1[58] = bf0[58] + bf0[53];
1780 bf1[59] = bf0[59] + bf0[52];
1781 bf1[60] = bf0[60] + bf0[51];
1782 bf1[61] = bf0[61] + bf0[50];
1783 bf1[62] = bf0[62] + bf0[49];
1784 bf1[63] = bf0[63] + bf0[48];
1785 range_check(stage, input, bf1, size, stage_range[stage]);
1786
1787 // stage 4
1788 stage++;
1789 cospi = cospi_arr(cos_bit[stage]);
1790 bf0 = output;
1791 bf1 = step;
1792 bf1[0] = bf0[0] + bf0[7];
1793 bf1[1] = bf0[1] + bf0[6];
1794 bf1[2] = bf0[2] + bf0[5];
1795 bf1[3] = bf0[3] + bf0[4];
1796 bf1[4] = -bf0[4] + bf0[3];
1797 bf1[5] = -bf0[5] + bf0[2];
1798 bf1[6] = -bf0[6] + bf0[1];
1799 bf1[7] = -bf0[7] + bf0[0];
1800 bf1[8] = bf0[8];
1801 bf1[9] = bf0[9];
1802 bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
1803 bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
1804 bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit[stage]);
1805 bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit[stage]);
1806 bf1[14] = bf0[14];
1807 bf1[15] = bf0[15];
1808 bf1[16] = bf0[16] + bf0[23];
1809 bf1[17] = bf0[17] + bf0[22];
1810 bf1[18] = bf0[18] + bf0[21];
1811 bf1[19] = bf0[19] + bf0[20];
1812 bf1[20] = -bf0[20] + bf0[19];
1813 bf1[21] = -bf0[21] + bf0[18];
1814 bf1[22] = -bf0[22] + bf0[17];
1815 bf1[23] = -bf0[23] + bf0[16];
1816 bf1[24] = -bf0[24] + bf0[31];
1817 bf1[25] = -bf0[25] + bf0[30];
1818 bf1[26] = -bf0[26] + bf0[29];
1819 bf1[27] = -bf0[27] + bf0[28];
1820 bf1[28] = bf0[28] + bf0[27];
1821 bf1[29] = bf0[29] + bf0[26];
1822 bf1[30] = bf0[30] + bf0[25];
1823 bf1[31] = bf0[31] + bf0[24];
1824 bf1[32] = bf0[32];
1825 bf1[33] = bf0[33];
1826 bf1[34] = bf0[34];
1827 bf1[35] = bf0[35];
1828 bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit[stage]);
1829 bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit[stage]);
1830 bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit[stage]);
1831 bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit[stage]);
1832 bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit[stage]);
1833 bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit[stage]);
1834 bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit[stage]);
1835 bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit[stage]);
1836 bf1[44] = bf0[44];
1837 bf1[45] = bf0[45];
1838 bf1[46] = bf0[46];
1839 bf1[47] = bf0[47];
1840 bf1[48] = bf0[48];
1841 bf1[49] = bf0[49];
1842 bf1[50] = bf0[50];
1843 bf1[51] = bf0[51];
1844 bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit[stage]);
1845 bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit[stage]);
1846 bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit[stage]);
1847 bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit[stage]);
1848 bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit[stage]);
1849 bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit[stage]);
1850 bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit[stage]);
1851 bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit[stage]);
1852 bf1[60] = bf0[60];
1853 bf1[61] = bf0[61];
1854 bf1[62] = bf0[62];
1855 bf1[63] = bf0[63];
1856 range_check(stage, input, bf1, size, stage_range[stage]);
1857
1858 // stage 5
1859 stage++;
1860 cospi = cospi_arr(cos_bit[stage]);
1861 bf0 = step;
1862 bf1 = output;
1863 bf1[0] = bf0[0] + bf0[3];
1864 bf1[1] = bf0[1] + bf0[2];
1865 bf1[2] = -bf0[2] + bf0[1];
1866 bf1[3] = -bf0[3] + bf0[0];
1867 bf1[4] = bf0[4];
1868 bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
1869 bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]);
1870 bf1[7] = bf0[7];
1871 bf1[8] = bf0[8] + bf0[11];
1872 bf1[9] = bf0[9] + bf0[10];
1873 bf1[10] = -bf0[10] + bf0[9];
1874 bf1[11] = -bf0[11] + bf0[8];
1875 bf1[12] = -bf0[12] + bf0[15];
1876 bf1[13] = -bf0[13] + bf0[14];
1877 bf1[14] = bf0[14] + bf0[13];
1878 bf1[15] = bf0[15] + bf0[12];
1879 bf1[16] = bf0[16];
1880 bf1[17] = bf0[17];
1881 bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]);
1882 bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]);
1883 bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]);
1884 bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]);
1885 bf1[22] = bf0[22];
1886 bf1[23] = bf0[23];
1887 bf1[24] = bf0[24];
1888 bf1[25] = bf0[25];
1889 bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit[stage]);
1890 bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit[stage]);
1891 bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit[stage]);
1892 bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit[stage]);
1893 bf1[30] = bf0[30];
1894 bf1[31] = bf0[31];
1895 bf1[32] = bf0[32] + bf0[39];
1896 bf1[33] = bf0[33] + bf0[38];
1897 bf1[34] = bf0[34] + bf0[37];
1898 bf1[35] = bf0[35] + bf0[36];
1899 bf1[36] = -bf0[36] + bf0[35];
1900 bf1[37] = -bf0[37] + bf0[34];
1901 bf1[38] = -bf0[38] + bf0[33];
1902 bf1[39] = -bf0[39] + bf0[32];
1903 bf1[40] = -bf0[40] + bf0[47];
1904 bf1[41] = -bf0[41] + bf0[46];
1905 bf1[42] = -bf0[42] + bf0[45];
1906 bf1[43] = -bf0[43] + bf0[44];
1907 bf1[44] = bf0[44] + bf0[43];
1908 bf1[45] = bf0[45] + bf0[42];
1909 bf1[46] = bf0[46] + bf0[41];
1910 bf1[47] = bf0[47] + bf0[40];
1911 bf1[48] = bf0[48] + bf0[55];
1912 bf1[49] = bf0[49] + bf0[54];
1913 bf1[50] = bf0[50] + bf0[53];
1914 bf1[51] = bf0[51] + bf0[52];
1915 bf1[52] = -bf0[52] + bf0[51];
1916 bf1[53] = -bf0[53] + bf0[50];
1917 bf1[54] = -bf0[54] + bf0[49];
1918 bf1[55] = -bf0[55] + bf0[48];
1919 bf1[56] = -bf0[56] + bf0[63];
1920 bf1[57] = -bf0[57] + bf0[62];
1921 bf1[58] = -bf0[58] + bf0[61];
1922 bf1[59] = -bf0[59] + bf0[60];
1923 bf1[60] = bf0[60] + bf0[59];
1924 bf1[61] = bf0[61] + bf0[58];
1925 bf1[62] = bf0[62] + bf0[57];
1926 bf1[63] = bf0[63] + bf0[56];
1927 range_check(stage, input, bf1, size, stage_range[stage]);
1928
1929 // stage 6
1930 stage++;
1931 cospi = cospi_arr(cos_bit[stage]);
1932 bf0 = output;
1933 bf1 = step;
1934 bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
1935 bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
1936 bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
1937 bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
1938 bf1[4] = bf0[4] + bf0[5];
1939 bf1[5] = -bf0[5] + bf0[4];
1940 bf1[6] = -bf0[6] + bf0[7];
1941 bf1[7] = bf0[7] + bf0[6];
1942 bf1[8] = bf0[8];
1943 bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
1944 bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
1945 bf1[11] = bf0[11];
1946 bf1[12] = bf0[12];
1947 bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit[stage]);
1948 bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit[stage]);
1949 bf1[15] = bf0[15];
1950 bf1[16] = bf0[16] + bf0[19];
1951 bf1[17] = bf0[17] + bf0[18];
1952 bf1[18] = -bf0[18] + bf0[17];
1953 bf1[19] = -bf0[19] + bf0[16];
1954 bf1[20] = -bf0[20] + bf0[23];
1955 bf1[21] = -bf0[21] + bf0[22];
1956 bf1[22] = bf0[22] + bf0[21];
1957 bf1[23] = bf0[23] + bf0[20];
1958 bf1[24] = bf0[24] + bf0[27];
1959 bf1[25] = bf0[25] + bf0[26];
1960 bf1[26] = -bf0[26] + bf0[25];
1961 bf1[27] = -bf0[27] + bf0[24];
1962 bf1[28] = -bf0[28] + bf0[31];
1963 bf1[29] = -bf0[29] + bf0[30];
1964 bf1[30] = bf0[30] + bf0[29];
1965 bf1[31] = bf0[31] + bf0[28];
1966 bf1[32] = bf0[32];
1967 bf1[33] = bf0[33];
1968 bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit[stage]);
1969 bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit[stage]);
1970 bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit[stage]);
1971 bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit[stage]);
1972 bf1[38] = bf0[38];
1973 bf1[39] = bf0[39];
1974 bf1[40] = bf0[40];
1975 bf1[41] = bf0[41];
1976 bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit[stage]);
1977 bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit[stage]);
1978 bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit[stage]);
1979 bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit[stage]);
1980 bf1[46] = bf0[46];
1981 bf1[47] = bf0[47];
1982 bf1[48] = bf0[48];
1983 bf1[49] = bf0[49];
1984 bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit[stage]);
1985 bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit[stage]);
1986 bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit[stage]);
1987 bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit[stage]);
1988 bf1[54] = bf0[54];
1989 bf1[55] = bf0[55];
1990 bf1[56] = bf0[56];
1991 bf1[57] = bf0[57];
1992 bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit[stage]);
1993 bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit[stage]);
1994 bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit[stage]);
1995 bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit[stage]);
1996 bf1[62] = bf0[62];
1997 bf1[63] = bf0[63];
1998 range_check(stage, input, bf1, size, stage_range[stage]);
1999
2000 // stage 7
2001 stage++;
2002 cospi = cospi_arr(cos_bit[stage]);
2003 bf0 = step;
2004 bf1 = output;
2005 bf1[0] = bf0[0];
2006 bf1[1] = bf0[1];
2007 bf1[2] = bf0[2];
2008 bf1[3] = bf0[3];
2009 bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]);
2010 bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]);
2011 bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]);
2012 bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]);
2013 bf1[8] = bf0[8] + bf0[9];
2014 bf1[9] = -bf0[9] + bf0[8];
2015 bf1[10] = -bf0[10] + bf0[11];
2016 bf1[11] = bf0[11] + bf0[10];
2017 bf1[12] = bf0[12] + bf0[13];
2018 bf1[13] = -bf0[13] + bf0[12];
2019 bf1[14] = -bf0[14] + bf0[15];
2020 bf1[15] = bf0[15] + bf0[14];
2021 bf1[16] = bf0[16];
2022 bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit[stage]);
2023 bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]);
2024 bf1[19] = bf0[19];
2025 bf1[20] = bf0[20];
2026 bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit[stage]);
2027 bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]);
2028 bf1[23] = bf0[23];
2029 bf1[24] = bf0[24];
2030 bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit[stage]);
2031 bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit[stage]);
2032 bf1[27] = bf0[27];
2033 bf1[28] = bf0[28];
2034 bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit[stage]);
2035 bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit[stage]);
2036 bf1[31] = bf0[31];
2037 bf1[32] = bf0[32] + bf0[35];
2038 bf1[33] = bf0[33] + bf0[34];
2039 bf1[34] = -bf0[34] + bf0[33];
2040 bf1[35] = -bf0[35] + bf0[32];
2041 bf1[36] = -bf0[36] + bf0[39];
2042 bf1[37] = -bf0[37] + bf0[38];
2043 bf1[38] = bf0[38] + bf0[37];
2044 bf1[39] = bf0[39] + bf0[36];
2045 bf1[40] = bf0[40] + bf0[43];
2046 bf1[41] = bf0[41] + bf0[42];
2047 bf1[42] = -bf0[42] + bf0[41];
2048 bf1[43] = -bf0[43] + bf0[40];
2049 bf1[44] = -bf0[44] + bf0[47];
2050 bf1[45] = -bf0[45] + bf0[46];
2051 bf1[46] = bf0[46] + bf0[45];
2052 bf1[47] = bf0[47] + bf0[44];
2053 bf1[48] = bf0[48] + bf0[51];
2054 bf1[49] = bf0[49] + bf0[50];
2055 bf1[50] = -bf0[50] + bf0[49];
2056 bf1[51] = -bf0[51] + bf0[48];
2057 bf1[52] = -bf0[52] + bf0[55];
2058 bf1[53] = -bf0[53] + bf0[54];
2059 bf1[54] = bf0[54] + bf0[53];
2060 bf1[55] = bf0[55] + bf0[52];
2061 bf1[56] = bf0[56] + bf0[59];
2062 bf1[57] = bf0[57] + bf0[58];
2063 bf1[58] = -bf0[58] + bf0[57];
2064 bf1[59] = -bf0[59] + bf0[56];
2065 bf1[60] = -bf0[60] + bf0[63];
2066 bf1[61] = -bf0[61] + bf0[62];
2067 bf1[62] = bf0[62] + bf0[61];
2068 bf1[63] = bf0[63] + bf0[60];
2069 range_check(stage, input, bf1, size, stage_range[stage]);
2070
2071 // stage 8
2072 stage++;
2073 cospi = cospi_arr(cos_bit[stage]);
2074 bf0 = output;
2075 bf1 = step;
2076 bf1[0] = bf0[0];
2077 bf1[1] = bf0[1];
2078 bf1[2] = bf0[2];
2079 bf1[3] = bf0[3];
2080 bf1[4] = bf0[4];
2081 bf1[5] = bf0[5];
2082 bf1[6] = bf0[6];
2083 bf1[7] = bf0[7];
2084 bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit[stage]);
2085 bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit[stage]);
2086 bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit[stage]);
2087 bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit[stage]);
2088 bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit[stage]);
2089 bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit[stage]);
2090 bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit[stage]);
2091 bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit[stage]);
2092 bf1[16] = bf0[16] + bf0[17];
2093 bf1[17] = -bf0[17] + bf0[16];
2094 bf1[18] = -bf0[18] + bf0[19];
2095 bf1[19] = bf0[19] + bf0[18];
2096 bf1[20] = bf0[20] + bf0[21];
2097 bf1[21] = -bf0[21] + bf0[20];
2098 bf1[22] = -bf0[22] + bf0[23];
2099 bf1[23] = bf0[23] + bf0[22];
2100 bf1[24] = bf0[24] + bf0[25];
2101 bf1[25] = -bf0[25] + bf0[24];
2102 bf1[26] = -bf0[26] + bf0[27];
2103 bf1[27] = bf0[27] + bf0[26];
2104 bf1[28] = bf0[28] + bf0[29];
2105 bf1[29] = -bf0[29] + bf0[28];
2106 bf1[30] = -bf0[30] + bf0[31];
2107 bf1[31] = bf0[31] + bf0[30];
2108 bf1[32] = bf0[32];
2109 bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit[stage]);
2110 bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit[stage]);
2111 bf1[35] = bf0[35];
2112 bf1[36] = bf0[36];
2113 bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit[stage]);
2114 bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit[stage]);
2115 bf1[39] = bf0[39];
2116 bf1[40] = bf0[40];
2117 bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit[stage]);
2118 bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit[stage]);
2119 bf1[43] = bf0[43];
2120 bf1[44] = bf0[44];
2121 bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit[stage]);
2122 bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit[stage]);
2123 bf1[47] = bf0[47];
2124 bf1[48] = bf0[48];
2125 bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit[stage]);
2126 bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit[stage]);
2127 bf1[51] = bf0[51];
2128 bf1[52] = bf0[52];
2129 bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit[stage]);
2130 bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit[stage]);
2131 bf1[55] = bf0[55];
2132 bf1[56] = bf0[56];
2133 bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit[stage]);
2134 bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit[stage]);
2135 bf1[59] = bf0[59];
2136 bf1[60] = bf0[60];
2137 bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit[stage]);
2138 bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit[stage]);
2139 bf1[63] = bf0[63];
2140 range_check(stage, input, bf1, size, stage_range[stage]);
2141
2142 // stage 9
2143 stage++;
2144 cospi = cospi_arr(cos_bit[stage]);
2145 bf0 = step;
2146 bf1 = output;
2147 bf1[0] = bf0[0];
2148 bf1[1] = bf0[1];
2149 bf1[2] = bf0[2];
2150 bf1[3] = bf0[3];
2151 bf1[4] = bf0[4];
2152 bf1[5] = bf0[5];
2153 bf1[6] = bf0[6];
2154 bf1[7] = bf0[7];
2155 bf1[8] = bf0[8];
2156 bf1[9] = bf0[9];
2157 bf1[10] = bf0[10];
2158 bf1[11] = bf0[11];
2159 bf1[12] = bf0[12];
2160 bf1[13] = bf0[13];
2161 bf1[14] = bf0[14];
2162 bf1[15] = bf0[15];
2163 bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit[stage]);
2164 bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit[stage]);
2165 bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit[stage]);
2166 bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit[stage]);
2167 bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit[stage]);
2168 bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit[stage]);
2169 bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit[stage]);
2170 bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit[stage]);
2171 bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit[stage]);
2172 bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit[stage]);
2173 bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit[stage]);
2174 bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit[stage]);
2175 bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit[stage]);
2176 bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit[stage]);
2177 bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit[stage]);
2178 bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit[stage]);
2179 bf1[32] = bf0[32] + bf0[33];
2180 bf1[33] = -bf0[33] + bf0[32];
2181 bf1[34] = -bf0[34] + bf0[35];
2182 bf1[35] = bf0[35] + bf0[34];
2183 bf1[36] = bf0[36] + bf0[37];
2184 bf1[37] = -bf0[37] + bf0[36];
2185 bf1[38] = -bf0[38] + bf0[39];
2186 bf1[39] = bf0[39] + bf0[38];
2187 bf1[40] = bf0[40] + bf0[41];
2188 bf1[41] = -bf0[41] + bf0[40];
2189 bf1[42] = -bf0[42] + bf0[43];
2190 bf1[43] = bf0[43] + bf0[42];
2191 bf1[44] = bf0[44] + bf0[45];
2192 bf1[45] = -bf0[45] + bf0[44];
2193 bf1[46] = -bf0[46] + bf0[47];
2194 bf1[47] = bf0[47] + bf0[46];
2195 bf1[48] = bf0[48] + bf0[49];
2196 bf1[49] = -bf0[49] + bf0[48];
2197 bf1[50] = -bf0[50] + bf0[51];
2198 bf1[51] = bf0[51] + bf0[50];
2199 bf1[52] = bf0[52] + bf0[53];
2200 bf1[53] = -bf0[53] + bf0[52];
2201 bf1[54] = -bf0[54] + bf0[55];
2202 bf1[55] = bf0[55] + bf0[54];
2203 bf1[56] = bf0[56] + bf0[57];
2204 bf1[57] = -bf0[57] + bf0[56];
2205 bf1[58] = -bf0[58] + bf0[59];
2206 bf1[59] = bf0[59] + bf0[58];
2207 bf1[60] = bf0[60] + bf0[61];
2208 bf1[61] = -bf0[61] + bf0[60];
2209 bf1[62] = -bf0[62] + bf0[63];
2210 bf1[63] = bf0[63] + bf0[62];
2211 range_check(stage, input, bf1, size, stage_range[stage]);
2212
2213 // stage 10
2214 stage++;
2215 cospi = cospi_arr(cos_bit[stage]);
2216 bf0 = output;
2217 bf1 = step;
2218 bf1[0] = bf0[0];
2219 bf1[1] = bf0[1];
2220 bf1[2] = bf0[2];
2221 bf1[3] = bf0[3];
2222 bf1[4] = bf0[4];
2223 bf1[5] = bf0[5];
2224 bf1[6] = bf0[6];
2225 bf1[7] = bf0[7];
2226 bf1[8] = bf0[8];
2227 bf1[9] = bf0[9];
2228 bf1[10] = bf0[10];
2229 bf1[11] = bf0[11];
2230 bf1[12] = bf0[12];
2231 bf1[13] = bf0[13];
2232 bf1[14] = bf0[14];
2233 bf1[15] = bf0[15];
2234 bf1[16] = bf0[16];
2235 bf1[17] = bf0[17];
2236 bf1[18] = bf0[18];
2237 bf1[19] = bf0[19];
2238 bf1[20] = bf0[20];
2239 bf1[21] = bf0[21];
2240 bf1[22] = bf0[22];
2241 bf1[23] = bf0[23];
2242 bf1[24] = bf0[24];
2243 bf1[25] = bf0[25];
2244 bf1[26] = bf0[26];
2245 bf1[27] = bf0[27];
2246 bf1[28] = bf0[28];
2247 bf1[29] = bf0[29];
2248 bf1[30] = bf0[30];
2249 bf1[31] = bf0[31];
2250 bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit[stage]);
2251 bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit[stage]);
2252 bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit[stage]);
2253 bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit[stage]);
2254 bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit[stage]);
2255 bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit[stage]);
2256 bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit[stage]);
2257 bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit[stage]);
2258 bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit[stage]);
2259 bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit[stage]);
2260 bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit[stage]);
2261 bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit[stage]);
2262 bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit[stage]);
2263 bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit[stage]);
2264 bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit[stage]);
2265 bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit[stage]);
2266 bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit[stage]);
2267 bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit[stage]);
2268 bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit[stage]);
2269 bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit[stage]);
2270 bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit[stage]);
2271 bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit[stage]);
2272 bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit[stage]);
2273 bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit[stage]);
2274 bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit[stage]);
2275 bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit[stage]);
2276 bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit[stage]);
2277 bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit[stage]);
2278 bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit[stage]);
2279 bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit[stage]);
2280 bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit[stage]);
2281 bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit[stage]);
2282 range_check(stage, input, bf1, size, stage_range[stage]);
2283
2284 // stage 11
2285 stage++;
2286 cospi = cospi_arr(cos_bit[stage]);
2287 bf0 = step;
2288 bf1 = output;
2289 bf1[0] = bf0[0];
2290 bf1[1] = bf0[32];
2291 bf1[2] = bf0[16];
2292 bf1[3] = bf0[48];
2293 bf1[4] = bf0[8];
2294 bf1[5] = bf0[40];
2295 bf1[6] = bf0[24];
2296 bf1[7] = bf0[56];
2297 bf1[8] = bf0[4];
2298 bf1[9] = bf0[36];
2299 bf1[10] = bf0[20];
2300 bf1[11] = bf0[52];
2301 bf1[12] = bf0[12];
2302 bf1[13] = bf0[44];
2303 bf1[14] = bf0[28];
2304 bf1[15] = bf0[60];
2305 bf1[16] = bf0[2];
2306 bf1[17] = bf0[34];
2307 bf1[18] = bf0[18];
2308 bf1[19] = bf0[50];
2309 bf1[20] = bf0[10];
2310 bf1[21] = bf0[42];
2311 bf1[22] = bf0[26];
2312 bf1[23] = bf0[58];
2313 bf1[24] = bf0[6];
2314 bf1[25] = bf0[38];
2315 bf1[26] = bf0[22];
2316 bf1[27] = bf0[54];
2317 bf1[28] = bf0[14];
2318 bf1[29] = bf0[46];
2319 bf1[30] = bf0[30];
2320 bf1[31] = bf0[62];
2321 bf1[32] = bf0[1];
2322 bf1[33] = bf0[33];
2323 bf1[34] = bf0[17];
2324 bf1[35] = bf0[49];
2325 bf1[36] = bf0[9];
2326 bf1[37] = bf0[41];
2327 bf1[38] = bf0[25];
2328 bf1[39] = bf0[57];
2329 bf1[40] = bf0[5];
2330 bf1[41] = bf0[37];
2331 bf1[42] = bf0[21];
2332 bf1[43] = bf0[53];
2333 bf1[44] = bf0[13];
2334 bf1[45] = bf0[45];
2335 bf1[46] = bf0[29];
2336 bf1[47] = bf0[61];
2337 bf1[48] = bf0[3];
2338 bf1[49] = bf0[35];
2339 bf1[50] = bf0[19];
2340 bf1[51] = bf0[51];
2341 bf1[52] = bf0[11];
2342 bf1[53] = bf0[43];
2343 bf1[54] = bf0[27];
2344 bf1[55] = bf0[59];
2345 bf1[56] = bf0[7];
2346 bf1[57] = bf0[39];
2347 bf1[58] = bf0[23];
2348 bf1[59] = bf0[55];
2349 bf1[60] = bf0[15];
2350 bf1[61] = bf0[47];
2351 bf1[62] = bf0[31];
2352 bf1[63] = bf0[63];
2353 range_check(stage, input, bf1, size, stage_range[stage]);
2354 }
2355 #endif // CONFIG_TX64X64
2356