1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <stdlib.h>
13 #include "aom_dsp/inv_txfm.h"
14 #include "av1/common/av1_fwd_txfm1d.h"
#if CONFIG_COEFFICIENT_RANGE_CHECKING

// Implemented outside this section; receives the stage index, the original
// input, the buffer produced by that stage, its length and a bit value.
void range_check_func(int32_t stage, const int32_t *input, const int32_t *buf,
                      int32_t size, int8_t bit);

#define range_check(stage, input, buf, size, bit) \
  range_check_func(stage, input, buf, size, bit)
#else
// Range checking compiled out: only mark the arguments as used so builds stay
// warning-free. The do { } while (0) wrapper makes the expansion a single
// statement, so `if (c) range_check(...); else ...` parses correctly (a bare
// `{ ... }` block followed by `;` would orphan the `else`).
#define range_check(stage, input, buf, size, bit) \
  do {                                            \
    (void)(stage);                                \
    (void)(input);                                \
    (void)(buf);                                  \
    (void)(size);                                 \
    (void)(bit);                                  \
  } while (0)
#endif
32 
33 // TODO(angiebird): Make 1-d txfm functions static
// 4-point forward DCT (going by the function name), computed as a fixed
// sequence of butterfly stages that ping-pong between `output` and a local
// scratch buffer.
//
//   input       - 4 values to transform (read only).
//   output      - receives the 4 results; also used as working storage.
//   cos_bit     - per-stage value handed to cospi_arr() and half_btf().
//   stage_range - per-stage value forwarded to range_check().
//
// Stage 1 writes `output` while still reading `input`, so passing the same
// buffer for both would make later reads see already-overwritten values.
void av1_fdct4_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
                   const int8_t *stage_range) {
  const int32_t size = 4;
  const int32_t *cospi;
  int32_t step[4];
  int32_t stage = 0;
  int bit;
  int i;

  // stage 0: the untouched input
  range_check(stage, input, input, size, stage_range[stage]);

  // stage 1: mirrored sums in the first half, differences in the second
  ++stage;
  for (i = 0; i < 2; ++i) {
    output[i] = input[i] + input[3 - i];
  }
  for (i = 2; i < 4; ++i) {
    output[i] = input[3 - i] - input[i];
  }
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 2: rotations, output -> step
  ++stage;
  bit = cos_bit[stage];
  cospi = cospi_arr(bit);
  step[0] = half_btf(cospi[32], output[0], cospi[32], output[1], bit);
  step[1] = half_btf(-cospi[32], output[1], cospi[32], output[0], bit);
  step[2] = half_btf(cospi[48], output[2], cospi[16], output[3], bit);
  step[3] = half_btf(cospi[48], output[3], -cospi[16], output[2], bit);
  range_check(stage, input, step, size, stage_range[stage]);

  // stage 3: final reordering, step -> output (swap the two middle entries)
  ++stage;
  output[0] = step[0];
  output[1] = step[2];
  output[2] = step[1];
  output[3] = step[3];
  range_check(stage, input, output, size, stage_range[stage]);
}
76 
// 8-point forward DCT (going by the function name), computed as a fixed
// sequence of butterfly stages that ping-pong between `output` and a local
// scratch buffer.
//
//   input       - 8 values to transform (read only).
//   output      - receives the 8 results; also used as working storage.
//   cos_bit     - per-stage value handed to cospi_arr() and half_btf().
//   stage_range - per-stage value forwarded to range_check().
//
// Stage 1 writes `output` while still reading `input`, so passing the same
// buffer for both would make later reads see already-overwritten values.
void av1_fdct8_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
                   const int8_t *stage_range) {
  // Source index in `step` for each final output position (last stage).
  static const int reorder[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
  const int32_t size = 8;
  const int32_t *cospi;
  int32_t step[8];
  int32_t stage = 0;
  int bit;
  int i;

  // stage 0: the untouched input
  range_check(stage, input, input, size, stage_range[stage]);

  // stage 1: mirrored sums in the first half, differences in the second
  ++stage;
  for (i = 0; i < 4; ++i) {
    output[i] = input[i] + input[7 - i];
  }
  for (i = 4; i < 8; ++i) {
    output[i] = input[7 - i] - input[i];
  }
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 2: output -> step
  ++stage;
  bit = cos_bit[stage];
  cospi = cospi_arr(bit);
  for (i = 0; i < 2; ++i) {
    step[i] = output[i] + output[3 - i];
    step[3 - i] = output[i] - output[3 - i];
  }
  step[4] = output[4];
  step[5] = half_btf(-cospi[32], output[5], cospi[32], output[6], bit);
  step[6] = half_btf(cospi[32], output[6], cospi[32], output[5], bit);
  step[7] = output[7];
  range_check(stage, input, step, size, stage_range[stage]);

  // stage 3: step -> output
  ++stage;
  bit = cos_bit[stage];
  cospi = cospi_arr(bit);
  output[0] = half_btf(cospi[32], step[0], cospi[32], step[1], bit);
  output[1] = half_btf(-cospi[32], step[1], cospi[32], step[0], bit);
  output[2] = half_btf(cospi[48], step[2], cospi[16], step[3], bit);
  output[3] = half_btf(cospi[48], step[3], -cospi[16], step[2], bit);
  output[4] = step[4] + step[5];
  output[5] = step[4] - step[5];
  output[6] = step[7] - step[6];
  output[7] = step[7] + step[6];
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 4: output -> step (lower half passes through unchanged)
  ++stage;
  bit = cos_bit[stage];
  cospi = cospi_arr(bit);
  for (i = 0; i < 4; ++i) {
    step[i] = output[i];
  }
  step[4] = half_btf(cospi[56], output[4], cospi[8], output[7], bit);
  step[5] = half_btf(cospi[24], output[5], cospi[40], output[6], bit);
  step[6] = half_btf(cospi[24], output[6], -cospi[40], output[5], bit);
  step[7] = half_btf(cospi[56], output[7], -cospi[8], output[4], bit);
  range_check(stage, input, step, size, stage_range[stage]);

  // stage 5: final reordering, step -> output
  ++stage;
  for (i = 0; i < 8; ++i) {
    output[i] = step[reorder[i]];
  }
  range_check(stage, input, output, size, stage_range[stage]);
}
161 
// 16-point forward DCT (going by the function name), computed as a fixed
// sequence of butterfly stages that ping-pong between `output` and a local
// scratch buffer.
//
//   input       - 16 values to transform (read only).
//   output      - receives the 16 results; also used as working storage.
//   cos_bit     - per-stage value handed to cospi_arr() and half_btf().
//   stage_range - per-stage value forwarded to range_check().
//
// Stage 1 writes `output` while still reading `input`, so passing the same
// buffer for both would make later reads see already-overwritten values.
void av1_fdct16_new(const int32_t *input, int32_t *output,
                    const int8_t *cos_bit, const int8_t *stage_range) {
  // Source index in `step` for each final output position (last stage).
  static const int reorder[16] = { 0, 8, 4, 12, 2, 10, 6, 14,
                                   1, 9, 5, 13, 3, 11, 7, 15 };
  const int32_t size = 16;
  const int32_t *cospi;
  int32_t step[16];
  int32_t stage = 0;
  int bit;
  int i;

  // stage 0: the untouched input
  range_check(stage, input, input, size, stage_range[stage]);

  // stage 1: mirrored sums in the first half, differences in the second
  ++stage;
  for (i = 0; i < 8; ++i) {
    output[i] = input[i] + input[15 - i];
  }
  for (i = 8; i < 16; ++i) {
    output[i] = input[15 - i] - input[i];
  }
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 2: output -> step
  ++stage;
  bit = cos_bit[stage];
  cospi = cospi_arr(bit);
  for (i = 0; i < 4; ++i) {
    step[i] = output[i] + output[7 - i];
    step[7 - i] = output[i] - output[7 - i];
  }
  step[8] = output[8];
  step[9] = output[9];
  step[10] = half_btf(-cospi[32], output[10], cospi[32], output[13], bit);
  step[11] = half_btf(-cospi[32], output[11], cospi[32], output[12], bit);
  step[12] = half_btf(cospi[32], output[12], cospi[32], output[11], bit);
  step[13] = half_btf(cospi[32], output[13], cospi[32], output[10], bit);
  step[14] = output[14];
  step[15] = output[15];
  range_check(stage, input, step, size, stage_range[stage]);

  // stage 3: step -> output
  ++stage;
  bit = cos_bit[stage];
  cospi = cospi_arr(bit);
  for (i = 0; i < 2; ++i) {
    output[i] = step[i] + step[3 - i];
    output[3 - i] = step[i] - step[3 - i];
  }
  output[4] = step[4];
  output[5] = half_btf(-cospi[32], step[5], cospi[32], step[6], bit);
  output[6] = half_btf(cospi[32], step[6], cospi[32], step[5], bit);
  output[7] = step[7];
  output[8] = step[8] + step[11];
  output[9] = step[9] + step[10];
  output[10] = step[9] - step[10];
  output[11] = step[8] - step[11];
  output[12] = step[15] - step[12];
  output[13] = step[14] - step[13];
  output[14] = step[14] + step[13];
  output[15] = step[15] + step[12];
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 4: output -> step
  ++stage;
  bit = cos_bit[stage];
  cospi = cospi_arr(bit);
  step[0] = half_btf(cospi[32], output[0], cospi[32], output[1], bit);
  step[1] = half_btf(-cospi[32], output[1], cospi[32], output[0], bit);
  step[2] = half_btf(cospi[48], output[2], cospi[16], output[3], bit);
  step[3] = half_btf(cospi[48], output[3], -cospi[16], output[2], bit);
  step[4] = output[4] + output[5];
  step[5] = output[4] - output[5];
  step[6] = output[7] - output[6];
  step[7] = output[7] + output[6];
  step[8] = output[8];
  step[9] = half_btf(-cospi[16], output[9], cospi[48], output[14], bit);
  step[10] = half_btf(-cospi[48], output[10], -cospi[16], output[13], bit);
  step[11] = output[11];
  step[12] = output[12];
  step[13] = half_btf(cospi[48], output[13], -cospi[16], output[10], bit);
  step[14] = half_btf(cospi[16], output[14], cospi[48], output[9], bit);
  step[15] = output[15];
  range_check(stage, input, step, size, stage_range[stage]);

  // stage 5: step -> output
  ++stage;
  bit = cos_bit[stage];
  cospi = cospi_arr(bit);
  for (i = 0; i < 4; ++i) {
    output[i] = step[i];
  }
  output[4] = half_btf(cospi[56], step[4], cospi[8], step[7], bit);
  output[5] = half_btf(cospi[24], step[5], cospi[40], step[6], bit);
  output[6] = half_btf(cospi[24], step[6], -cospi[40], step[5], bit);
  output[7] = half_btf(cospi[56], step[7], -cospi[8], step[4], bit);
  // Entries 8..15 are handled in groups of four: (sum, diff, diff, sum).
  for (i = 8; i < 16; i += 4) {
    output[i] = step[i] + step[i + 1];
    output[i + 1] = step[i] - step[i + 1];
    output[i + 2] = step[i + 3] - step[i + 2];
    output[i + 3] = step[i + 3] + step[i + 2];
  }
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 6: output -> step (lower half passes through unchanged)
  ++stage;
  bit = cos_bit[stage];
  cospi = cospi_arr(bit);
  for (i = 0; i < 8; ++i) {
    step[i] = output[i];
  }
  step[8] = half_btf(cospi[60], output[8], cospi[4], output[15], bit);
  step[9] = half_btf(cospi[28], output[9], cospi[36], output[14], bit);
  step[10] = half_btf(cospi[44], output[10], cospi[20], output[13], bit);
  step[11] = half_btf(cospi[12], output[11], cospi[52], output[12], bit);
  step[12] = half_btf(cospi[12], output[12], -cospi[52], output[11], bit);
  step[13] = half_btf(cospi[44], output[13], -cospi[20], output[10], bit);
  step[14] = half_btf(cospi[28], output[14], -cospi[36], output[9], bit);
  step[15] = half_btf(cospi[60], output[15], -cospi[4], output[8], bit);
  range_check(stage, input, step, size, stage_range[stage]);

  // stage 7: final reordering, step -> output
  ++stage;
  for (i = 0; i < 16; ++i) {
    output[i] = step[reorder[i]];
  }
  range_check(stage, input, output, size, stage_range[stage]);
}
332 
// 32-point forward DCT (going by the function name), computed as a fixed
// sequence of butterfly stages that ping-pong between `output` and a local
// scratch buffer.
//
//   input       - 32 values to transform (read only).
//   output      - receives the 32 results; also used as working storage.
//   cos_bit     - per-stage value handed to cospi_arr() and half_btf().
//   stage_range - per-stage value forwarded to range_check().
//
// Stage 1 writes `output` while still reading `input`, so passing the same
// buffer for both would make later reads see already-overwritten values.
void av1_fdct32_new(const int32_t *input, int32_t *output,
                    const int8_t *cos_bit, const int8_t *stage_range) {
  // Source index in `step` for each final output position (last stage).
  static const int reorder[32] = { 0, 16, 8,  24, 4, 20, 12, 28,
                                   2, 18, 10, 26, 6, 22, 14, 30,
                                   1, 17, 9,  25, 5, 21, 13, 29,
                                   3, 19, 11, 27, 7, 23, 15, 31 };
  const int32_t size = 32;
  const int32_t *cospi;
  int32_t step[32];
  int32_t stage = 0;
  int bit;
  int i;

  // stage 0: the untouched input
  range_check(stage, input, input, size, stage_range[stage]);

  // stage 1: mirrored sums in the first half, differences in the second
  ++stage;
  for (i = 0; i < 16; ++i) {
    output[i] = input[i] + input[31 - i];
  }
  for (i = 16; i < 32; ++i) {
    output[i] = input[31 - i] - input[i];
  }
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 2: output -> step
  ++stage;
  bit = cos_bit[stage];
  cospi = cospi_arr(bit);
  for (i = 0; i < 8; ++i) {
    step[i] = output[i] + output[15 - i];
    step[15 - i] = output[i] - output[15 - i];
  }
  for (i = 16; i < 20; ++i) {
    step[i] = output[i];
  }
  // Entries 20..27: each pair (20+i, 27-i) is rotated by cospi[32].
  for (i = 0; i < 4; ++i) {
    step[20 + i] =
        half_btf(-cospi[32], output[20 + i], cospi[32], output[27 - i], bit);
    step[27 - i] =
        half_btf(cospi[32], output[27 - i], cospi[32], output[20 + i], bit);
  }
  for (i = 28; i < 32; ++i) {
    step[i] = output[i];
  }
  range_check(stage, input, step, size, stage_range[stage]);

  // stage 3: step -> output
  ++stage;
  bit = cos_bit[stage];
  cospi = cospi_arr(bit);
  for (i = 0; i < 4; ++i) {
    output[i] = step[i] + step[7 - i];
    output[7 - i] = step[i] - step[7 - i];
  }
  output[8] = step[8];
  output[9] = step[9];
  output[10] = half_btf(-cospi[32], step[10], cospi[32], step[13], bit);
  output[11] = half_btf(-cospi[32], step[11], cospi[32], step[12], bit);
  output[12] = half_btf(cospi[32], step[12], cospi[32], step[11], bit);
  output[13] = half_btf(cospi[32], step[13], cospi[32], step[10], bit);
  output[14] = step[14];
  output[15] = step[15];
  for (i = 0; i < 4; ++i) {
    output[16 + i] = step[16 + i] + step[23 - i];
    output[23 - i] = step[16 + i] - step[23 - i];
    output[24 + i] = step[31 - i] - step[24 + i];
    output[31 - i] = step[31 - i] + step[24 + i];
  }
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 4: output -> step
  ++stage;
  bit = cos_bit[stage];
  cospi = cospi_arr(bit);
  for (i = 0; i < 2; ++i) {
    step[i] = output[i] + output[3 - i];
    step[3 - i] = output[i] - output[3 - i];
  }
  step[4] = output[4];
  step[5] = half_btf(-cospi[32], output[5], cospi[32], output[6], bit);
  step[6] = half_btf(cospi[32], output[6], cospi[32], output[5], bit);
  step[7] = output[7];
  step[8] = output[8] + output[11];
  step[9] = output[9] + output[10];
  step[10] = output[9] - output[10];
  step[11] = output[8] - output[11];
  step[12] = output[15] - output[12];
  step[13] = output[14] - output[13];
  step[14] = output[14] + output[13];
  step[15] = output[15] + output[12];
  step[16] = output[16];
  step[17] = output[17];
  step[18] = half_btf(-cospi[16], output[18], cospi[48], output[29], bit);
  step[19] = half_btf(-cospi[16], output[19], cospi[48], output[28], bit);
  step[20] = half_btf(-cospi[48], output[20], -cospi[16], output[27], bit);
  step[21] = half_btf(-cospi[48], output[21], -cospi[16], output[26], bit);
  step[22] = output[22];
  step[23] = output[23];
  step[24] = output[24];
  step[25] = output[25];
  step[26] = half_btf(cospi[48], output[26], -cospi[16], output[21], bit);
  step[27] = half_btf(cospi[48], output[27], -cospi[16], output[20], bit);
  step[28] = half_btf(cospi[16], output[28], cospi[48], output[19], bit);
  step[29] = half_btf(cospi[16], output[29], cospi[48], output[18], bit);
  step[30] = output[30];
  step[31] = output[31];
  range_check(stage, input, step, size, stage_range[stage]);

  // stage 5: step -> output
  ++stage;
  bit = cos_bit[stage];
  cospi = cospi_arr(bit);
  output[0] = half_btf(cospi[32], step[0], cospi[32], step[1], bit);
  output[1] = half_btf(-cospi[32], step[1], cospi[32], step[0], bit);
  output[2] = half_btf(cospi[48], step[2], cospi[16], step[3], bit);
  output[3] = half_btf(cospi[48], step[3], -cospi[16], step[2], bit);
  output[4] = step[4] + step[5];
  output[5] = step[4] - step[5];
  output[6] = step[7] - step[6];
  output[7] = step[7] + step[6];
  output[8] = step[8];
  output[9] = half_btf(-cospi[16], step[9], cospi[48], step[14], bit);
  output[10] = half_btf(-cospi[48], step[10], -cospi[16], step[13], bit);
  output[11] = step[11];
  output[12] = step[12];
  output[13] = half_btf(cospi[48], step[13], -cospi[16], step[10], bit);
  output[14] = half_btf(cospi[16], step[14], cospi[48], step[9], bit);
  output[15] = step[15];
  // Entries 16..31 are handled as two groups of eight with the same shape.
  for (i = 16; i < 32; i += 8) {
    output[i] = step[i] + step[i + 3];
    output[i + 1] = step[i + 1] + step[i + 2];
    output[i + 2] = step[i + 1] - step[i + 2];
    output[i + 3] = step[i] - step[i + 3];
    output[i + 4] = step[i + 7] - step[i + 4];
    output[i + 5] = step[i + 6] - step[i + 5];
    output[i + 6] = step[i + 6] + step[i + 5];
    output[i + 7] = step[i + 7] + step[i + 4];
  }
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 6: output -> step
  ++stage;
  bit = cos_bit[stage];
  cospi = cospi_arr(bit);
  for (i = 0; i < 4; ++i) {
    step[i] = output[i];
  }
  step[4] = half_btf(cospi[56], output[4], cospi[8], output[7], bit);
  step[5] = half_btf(cospi[24], output[5], cospi[40], output[6], bit);
  step[6] = half_btf(cospi[24], output[6], -cospi[40], output[5], bit);
  step[7] = half_btf(cospi[56], output[7], -cospi[8], output[4], bit);
  // Entries 8..15 are handled in groups of four: (sum, diff, diff, sum).
  for (i = 8; i < 16; i += 4) {
    step[i] = output[i] + output[i + 1];
    step[i + 1] = output[i] - output[i + 1];
    step[i + 2] = output[i + 3] - output[i + 2];
    step[i + 3] = output[i + 3] + output[i + 2];
  }
  step[16] = output[16];
  step[17] = half_btf(-cospi[8], output[17], cospi[56], output[30], bit);
  step[18] = half_btf(-cospi[56], output[18], -cospi[8], output[29], bit);
  step[19] = output[19];
  step[20] = output[20];
  step[21] = half_btf(-cospi[40], output[21], cospi[24], output[26], bit);
  step[22] = half_btf(-cospi[24], output[22], -cospi[40], output[25], bit);
  step[23] = output[23];
  step[24] = output[24];
  step[25] = half_btf(cospi[24], output[25], -cospi[40], output[22], bit);
  step[26] = half_btf(cospi[40], output[26], cospi[24], output[21], bit);
  step[27] = output[27];
  step[28] = output[28];
  step[29] = half_btf(cospi[56], output[29], -cospi[8], output[18], bit);
  step[30] = half_btf(cospi[8], output[30], cospi[56], output[17], bit);
  step[31] = output[31];
  range_check(stage, input, step, size, stage_range[stage]);

  // stage 7: step -> output
  ++stage;
  bit = cos_bit[stage];
  cospi = cospi_arr(bit);
  for (i = 0; i < 8; ++i) {
    output[i] = step[i];
  }
  output[8] = half_btf(cospi[60], step[8], cospi[4], step[15], bit);
  output[9] = half_btf(cospi[28], step[9], cospi[36], step[14], bit);
  output[10] = half_btf(cospi[44], step[10], cospi[20], step[13], bit);
  output[11] = half_btf(cospi[12], step[11], cospi[52], step[12], bit);
  output[12] = half_btf(cospi[12], step[12], -cospi[52], step[11], bit);
  output[13] = half_btf(cospi[44], step[13], -cospi[20], step[10], bit);
  output[14] = half_btf(cospi[28], step[14], -cospi[36], step[9], bit);
  output[15] = half_btf(cospi[60], step[15], -cospi[4], step[8], bit);
  // Entries 16..31 are handled in groups of four: (sum, diff, diff, sum).
  for (i = 16; i < 32; i += 4) {
    output[i] = step[i] + step[i + 1];
    output[i + 1] = step[i] - step[i + 1];
    output[i + 2] = step[i + 3] - step[i + 2];
    output[i + 3] = step[i + 3] + step[i + 2];
  }
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 8: output -> step (lower half passes through unchanged)
  ++stage;
  bit = cos_bit[stage];
  cospi = cospi_arr(bit);
  for (i = 0; i < 16; ++i) {
    step[i] = output[i];
  }
  step[16] = half_btf(cospi[62], output[16], cospi[2], output[31], bit);
  step[17] = half_btf(cospi[30], output[17], cospi[34], output[30], bit);
  step[18] = half_btf(cospi[46], output[18], cospi[18], output[29], bit);
  step[19] = half_btf(cospi[14], output[19], cospi[50], output[28], bit);
  step[20] = half_btf(cospi[54], output[20], cospi[10], output[27], bit);
  step[21] = half_btf(cospi[22], output[21], cospi[42], output[26], bit);
  step[22] = half_btf(cospi[38], output[22], cospi[26], output[25], bit);
  step[23] = half_btf(cospi[6], output[23], cospi[58], output[24], bit);
  step[24] = half_btf(cospi[6], output[24], -cospi[58], output[23], bit);
  step[25] = half_btf(cospi[38], output[25], -cospi[26], output[22], bit);
  step[26] = half_btf(cospi[22], output[26], -cospi[42], output[21], bit);
  step[27] = half_btf(cospi[54], output[27], -cospi[10], output[20], bit);
  step[28] = half_btf(cospi[14], output[28], -cospi[50], output[19], bit);
  step[29] = half_btf(cospi[46], output[29], -cospi[18], output[18], bit);
  step[30] = half_btf(cospi[30], output[30], -cospi[34], output[17], bit);
  step[31] = half_btf(cospi[62], output[31], -cospi[2], output[16], bit);
  range_check(stage, input, step, size, stage_range[stage]);

  // stage 9: final reordering, step -> output
  ++stage;
  for (i = 0; i < 32; ++i) {
    output[i] = step[reorder[i]];
  }
  range_check(stage, input, output, size, stage_range[stage]);
}
693 
// 4-point forward ADST (going by the function name), computed as a fixed
// sequence of stages that ping-pong between `output` and a local scratch
// buffer.
//
//   input       - 4 values to transform (read only).
//   output      - receives the 4 results; also used as working storage.
//   cos_bit     - per-stage value handed to cospi_arr() and half_btf().
//   stage_range - per-stage value forwarded to range_check().
//
// Stage 1 writes `output` while still reading `input`, so passing the same
// buffer for both would make later reads see already-overwritten values.
void av1_fadst4_new(const int32_t *input, int32_t *output,
                    const int8_t *cos_bit, const int8_t *stage_range) {
  const int32_t size = 4;
  const int32_t *cospi;
  int32_t step[4];
  int32_t stage = 0;
  int bit;

  // stage 0: the untouched input
  range_check(stage, input, input, size, stage_range[stage]);

  // stage 1: input permutation
  ++stage;
  output[0] = input[3];
  output[1] = input[0];
  output[2] = input[1];
  output[3] = input[2];
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 2: rotations on adjacent pairs, output -> step
  ++stage;
  bit = cos_bit[stage];
  cospi = cospi_arr(bit);
  step[0] = half_btf(cospi[8], output[0], cospi[56], output[1], bit);
  step[1] = half_btf(-cospi[8], output[1], cospi[56], output[0], bit);
  step[2] = half_btf(cospi[40], output[2], cospi[24], output[3], bit);
  step[3] = half_btf(-cospi[40], output[3], cospi[24], output[2], bit);
  range_check(stage, input, step, size, stage_range[stage]);

  // stage 3: sums and differences at distance two, step -> output
  ++stage;
  output[0] = step[0] + step[2];
  output[1] = step[1] + step[3];
  output[2] = step[0] - step[2];
  output[3] = step[1] - step[3];
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 4: rotate the upper pair, output -> step
  ++stage;
  bit = cos_bit[stage];
  cospi = cospi_arr(bit);
  step[0] = output[0];
  step[1] = output[1];
  step[2] = half_btf(cospi[32], output[2], cospi[32], output[3], bit);
  step[3] = half_btf(-cospi[32], output[3], cospi[32], output[2], bit);
  range_check(stage, input, step, size, stage_range[stage]);

  // stage 5: final reordering with sign flips on odd outputs, step -> output
  ++stage;
  output[0] = step[0];
  output[1] = -step[2];
  output[2] = step[3];
  output[3] = -step[1];
  range_check(stage, input, output, size, stage_range[stage]);
}
757 
// 8-point forward ADST (going by the function name), computed as a fixed
// sequence of stages that ping-pong between `output` and a local scratch
// buffer.
//
//   input       - 8 values to transform (read only).
//   output      - receives the 8 results; also used as working storage.
//   cos_bit     - per-stage value handed to cospi_arr() and half_btf().
//   stage_range - per-stage value forwarded to range_check().
//
// Stage 1 writes `output` while still reading `input`, so passing the same
// buffer for both would make later reads see already-overwritten values.
void av1_fadst8_new(const int32_t *input, int32_t *output,
                    const int8_t *cos_bit, const int8_t *stage_range) {
  const int32_t size = 8;
  const int32_t *cospi;
  int32_t step[8];
  int32_t stage = 0;
  int bit;
  int i;

  // stage 0: the untouched input
  range_check(stage, input, input, size, stage_range[stage]);

  // stage 1: input permutation
  ++stage;
  output[0] = input[7];
  output[1] = input[0];
  output[2] = input[5];
  output[3] = input[2];
  output[4] = input[3];
  output[5] = input[4];
  output[6] = input[1];
  output[7] = input[6];
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 2: rotations on adjacent pairs, output -> step
  ++stage;
  bit = cos_bit[stage];
  cospi = cospi_arr(bit);
  step[0] = half_btf(cospi[4], output[0], cospi[60], output[1], bit);
  step[1] = half_btf(-cospi[4], output[1], cospi[60], output[0], bit);
  step[2] = half_btf(cospi[20], output[2], cospi[44], output[3], bit);
  step[3] = half_btf(-cospi[20], output[3], cospi[44], output[2], bit);
  step[4] = half_btf(cospi[36], output[4], cospi[28], output[5], bit);
  step[5] = half_btf(-cospi[36], output[5], cospi[28], output[4], bit);
  step[6] = half_btf(cospi[52], output[6], cospi[12], output[7], bit);
  step[7] = half_btf(-cospi[52], output[7], cospi[12], output[6], bit);
  range_check(stage, input, step, size, stage_range[stage]);

  // stage 3: sums and differences at distance four, step -> output
  ++stage;
  for (i = 0; i < 4; ++i) {
    output[i] = step[i] + step[i + 4];
    output[i + 4] = step[i] - step[i + 4];
  }
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 4: rotate the upper half, output -> step
  ++stage;
  bit = cos_bit[stage];
  cospi = cospi_arr(bit);
  for (i = 0; i < 4; ++i) {
    step[i] = output[i];
  }
  step[4] = half_btf(cospi[16], output[4], cospi[48], output[5], bit);
  step[5] = half_btf(-cospi[16], output[5], cospi[48], output[4], bit);
  step[6] = half_btf(-cospi[48], output[6], cospi[16], output[7], bit);
  step[7] = half_btf(cospi[48], output[7], cospi[16], output[6], bit);
  range_check(stage, input, step, size, stage_range[stage]);

  // stage 5: sums and differences at distance two within each half
  ++stage;
  for (i = 0; i < 8; i += 4) {
    output[i] = step[i] + step[i + 2];
    output[i + 1] = step[i + 1] + step[i + 3];
    output[i + 2] = step[i] - step[i + 2];
    output[i + 3] = step[i + 1] - step[i + 3];
  }
  range_check(stage, input, output, size, stage_range[stage]);

  // stage 6: rotate the upper pair of each half, output -> step
  ++stage;
  bit = cos_bit[stage];
  cospi = cospi_arr(bit);
  for (i = 0; i < 8; i += 4) {
    step[i] = output[i];
    step[i + 1] = output[i + 1];
    step[i + 2] =
        half_btf(cospi[32], output[i + 2], cospi[32], output[i + 3], bit);
    step[i + 3] =
        half_btf(-cospi[32], output[i + 3], cospi[32], output[i + 2], bit);
  }
  range_check(stage, input, step, size, stage_range[stage]);

  // stage 7: final reordering with sign flips on odd outputs, step -> output
  ++stage;
  output[0] = step[0];
  output[1] = -step[4];
  output[2] = step[6];
  output[3] = -step[2];
  output[4] = step[3];
  output[5] = -step[7];
  output[6] = step[5];
  output[7] = -step[1];
  range_check(stage, input, output, size, stage_range[stage]);
}
870 
// 16-point 1-D forward transform (the forward ADST, going by the function
// name), computed as a fixed 9-stage butterfly network.
//
//   input       - 16 source values (indices 0..15 are read).
//   output      - 16 result values; also used as one of the two working
//                 buffers, so its contents are overwritten several times
//                 before the final stage lands in it.
//   cos_bit     - indexed by stage; entries 2, 4, 6 and 8 are read. Each is
//                 passed to cospi_arr() and as the last argument of
//                 half_btf() for that stage.
//   stage_range - indexed by stage; entries 0..9 are handed to range_check().
//
// The stages ping-pong between `output` and the local `step` array: odd
// stages write `output`, even stages write `step`. Stage 1 reads `input`
// while writing `output` in a different order, so calling this with
// input == output would read values that were already overwritten.
//
// range_check() forwards to range_check_func() when
// CONFIG_COEFFICIENT_RANGE_CHECKING is enabled and is a no-op otherwise.
// NOTE(review): half_btf() and cospi_arr() are defined outside this chunk;
// half_btf(w0, in0, w1, in1, bit) presumably returns the rounded, bit-shifted
// value of w0 * in0 + w1 * in1 -- confirm against its definition.
void av1_fadst16_new(const int32_t *input, int32_t *output,
                     const int8_t *cos_bit, const int8_t *stage_range) {
  const int32_t size = 16;
  const int32_t *cospi;

  int32_t stage = 0;
  int32_t *bf0, *bf1;  // bf0: source buffer of a stage, bf1: its destination
  int32_t step[16];    // scratch buffer written by the even stages

  // stage 0: only range-check the untouched input.
  range_check(stage, input, input, size, stage_range[stage]);

  // stage 1: reorder the input into output. Even slots take odd input
  // indices counting down from 15, odd slots take even input indices
  // counting up from 0.
  stage++;
  bf1 = output;
  bf1[0] = input[15];
  bf1[1] = input[0];
  bf1[2] = input[13];
  bf1[3] = input[2];
  bf1[4] = input[11];
  bf1[5] = input[4];
  bf1[6] = input[9];
  bf1[7] = input[6];
  bf1[8] = input[7];
  bf1[9] = input[8];
  bf1[10] = input[5];
  bf1[11] = input[10];
  bf1[12] = input[3];
  bf1[13] = input[12];
  bf1[14] = input[1];
  bf1[15] = input[14];
  range_check(stage, input, bf1, size, stage_range[stage]);

  // stage 2: combine each adjacent pair (2k, 2k+1) through half_btf. The
  // pair uses cospi[8k + 2] and cospi[62 - 8k]; the odd lane swaps the
  // operands and negates the first weight.
  stage++;
  cospi = cospi_arr(cos_bit[stage]);
  bf0 = output;
  bf1 = step;
  bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit[stage]);
  bf1[1] = half_btf(-cospi[2], bf0[1], cospi[62], bf0[0], cos_bit[stage]);
  bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit[stage]);
  bf1[3] = half_btf(-cospi[10], bf0[3], cospi[54], bf0[2], cos_bit[stage]);
  bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit[stage]);
  bf1[5] = half_btf(-cospi[18], bf0[5], cospi[46], bf0[4], cos_bit[stage]);
  bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit[stage]);
  bf1[7] = half_btf(-cospi[26], bf0[7], cospi[38], bf0[6], cos_bit[stage]);
  bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit[stage]);
  bf1[9] = half_btf(-cospi[34], bf0[9], cospi[30], bf0[8], cos_bit[stage]);
  bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit[stage]);
  bf1[11] = half_btf(-cospi[42], bf0[11], cospi[22], bf0[10], cos_bit[stage]);
  bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit[stage]);
  bf1[13] = half_btf(-cospi[50], bf0[13], cospi[14], bf0[12], cos_bit[stage]);
  bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit[stage]);
  bf1[15] = half_btf(-cospi[58], bf0[15], cospi[6], bf0[14], cos_bit[stage]);
  range_check(stage, input, bf1, size, stage_range[stage]);

  // stage 3: add/subtract butterflies between lanes k and k + 8
  // (sums in 0..7, differences in 8..15).
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = bf0[0] + bf0[8];
  bf1[1] = bf0[1] + bf0[9];
  bf1[2] = bf0[2] + bf0[10];
  bf1[3] = bf0[3] + bf0[11];
  bf1[4] = bf0[4] + bf0[12];
  bf1[5] = bf0[5] + bf0[13];
  bf1[6] = bf0[6] + bf0[14];
  bf1[7] = bf0[7] + bf0[15];
  bf1[8] = -bf0[8] + bf0[0];
  bf1[9] = -bf0[9] + bf0[1];
  bf1[10] = -bf0[10] + bf0[2];
  bf1[11] = -bf0[11] + bf0[3];
  bf1[12] = -bf0[12] + bf0[4];
  bf1[13] = -bf0[13] + bf0[5];
  bf1[14] = -bf0[14] + bf0[6];
  bf1[15] = -bf0[15] + bf0[7];
  range_check(stage, input, bf1, size, stage_range[stage]);

  // stage 4: lanes 0..7 pass through unchanged; lanes 8..15 are combined in
  // adjacent pairs through half_btf (weights 8/56 and 40/24, with the sign
  // pattern mirrored for lanes 12..15).
  stage++;
  cospi = cospi_arr(cos_bit[stage]);
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = bf0[4];
  bf1[5] = bf0[5];
  bf1[6] = bf0[6];
  bf1[7] = bf0[7];
  bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit[stage]);
  bf1[9] = half_btf(-cospi[8], bf0[9], cospi[56], bf0[8], cos_bit[stage]);
  bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit[stage]);
  bf1[11] = half_btf(-cospi[40], bf0[11], cospi[24], bf0[10], cos_bit[stage]);
  bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit[stage]);
  bf1[13] = half_btf(cospi[56], bf0[13], cospi[8], bf0[12], cos_bit[stage]);
  bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit[stage]);
  bf1[15] = half_btf(cospi[24], bf0[15], cospi[40], bf0[14], cos_bit[stage]);
  range_check(stage, input, bf1, size, stage_range[stage]);

  // stage 5: add/subtract butterflies between lanes k and k + 4 inside each
  // group of eight lanes.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = bf0[0] + bf0[4];
  bf1[1] = bf0[1] + bf0[5];
  bf1[2] = bf0[2] + bf0[6];
  bf1[3] = bf0[3] + bf0[7];
  bf1[4] = -bf0[4] + bf0[0];
  bf1[5] = -bf0[5] + bf0[1];
  bf1[6] = -bf0[6] + bf0[2];
  bf1[7] = -bf0[7] + bf0[3];
  bf1[8] = bf0[8] + bf0[12];
  bf1[9] = bf0[9] + bf0[13];
  bf1[10] = bf0[10] + bf0[14];
  bf1[11] = bf0[11] + bf0[15];
  bf1[12] = -bf0[12] + bf0[8];
  bf1[13] = -bf0[13] + bf0[9];
  bf1[14] = -bf0[14] + bf0[10];
  bf1[15] = -bf0[15] + bf0[11];
  range_check(stage, input, bf1, size, stage_range[stage]);

  // stage 6: in each group of eight lanes the first four pass through and
  // the last four go through half_btf with the cospi[16]/cospi[48] weights.
  stage++;
  cospi = cospi_arr(cos_bit[stage]);
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
  bf1[5] = half_btf(-cospi[16], bf0[5], cospi[48], bf0[4], cos_bit[stage]);
  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
  bf1[7] = half_btf(cospi[48], bf0[7], cospi[16], bf0[6], cos_bit[stage]);
  bf1[8] = bf0[8];
  bf1[9] = bf0[9];
  bf1[10] = bf0[10];
  bf1[11] = bf0[11];
  bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit[stage]);
  bf1[13] = half_btf(-cospi[16], bf0[13], cospi[48], bf0[12], cos_bit[stage]);
  bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit[stage]);
  bf1[15] = half_btf(cospi[48], bf0[15], cospi[16], bf0[14], cos_bit[stage]);
  range_check(stage, input, bf1, size, stage_range[stage]);

  // stage 7: add/subtract butterflies between lanes k and k + 2 inside each
  // group of four lanes.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = bf0[0] + bf0[2];
  bf1[1] = bf0[1] + bf0[3];
  bf1[2] = -bf0[2] + bf0[0];
  bf1[3] = -bf0[3] + bf0[1];
  bf1[4] = bf0[4] + bf0[6];
  bf1[5] = bf0[5] + bf0[7];
  bf1[6] = -bf0[6] + bf0[4];
  bf1[7] = -bf0[7] + bf0[5];
  bf1[8] = bf0[8] + bf0[10];
  bf1[9] = bf0[9] + bf0[11];
  bf1[10] = -bf0[10] + bf0[8];
  bf1[11] = -bf0[11] + bf0[9];
  bf1[12] = bf0[12] + bf0[14];
  bf1[13] = bf0[13] + bf0[15];
  bf1[14] = -bf0[14] + bf0[12];
  bf1[15] = -bf0[15] + bf0[13];
  range_check(stage, input, bf1, size, stage_range[stage]);

  // stage 8: in each group of four lanes the first two pass through and the
  // last two go through half_btf with cospi[32] on both operands.
  stage++;
  cospi = cospi_arr(cos_bit[stage]);
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
  bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]);
  bf1[4] = bf0[4];
  bf1[5] = bf0[5];
  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
  bf1[7] = half_btf(-cospi[32], bf0[7], cospi[32], bf0[6], cos_bit[stage]);
  bf1[8] = bf0[8];
  bf1[9] = bf0[9];
  bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit[stage]);
  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[10], cos_bit[stage]);
  bf1[12] = bf0[12];
  bf1[13] = bf0[13];
  bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit[stage]);
  bf1[15] = half_btf(-cospi[32], bf0[15], cospi[32], bf0[14], cos_bit[stage]);
  range_check(stage, input, bf1, size, stage_range[stage]);

  // stage 9: final reordering from step into output; every odd output slot
  // receives a negated value.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = bf0[0];
  bf1[1] = -bf0[8];
  bf1[2] = bf0[12];
  bf1[3] = -bf0[4];
  bf1[4] = bf0[6];
  bf1[5] = -bf0[14];
  bf1[6] = bf0[10];
  bf1[7] = -bf0[2];
  bf1[8] = bf0[3];
  bf1[9] = -bf0[11];
  bf1[10] = bf0[15];
  bf1[11] = -bf0[7];
  bf1[12] = bf0[5];
  bf1[13] = -bf0[13];
  bf1[14] = bf0[9];
  bf1[15] = -bf0[1];
  range_check(stage, input, bf1, size, stage_range[stage]);
}
1084 
// 32-point 1-D forward transform (the forward ADST, going by the function
// name), computed as a fixed 11-stage butterfly network.
//
//   input       - 32 source values (indices 0..31 are read).
//   output      - 32 result values; also used as one of the two working
//                 buffers, so its contents are overwritten several times
//                 before the final stage lands in it.
//   cos_bit     - indexed by stage; entries 2, 4, 6, 8 and 10 are read. Each
//                 is passed to cospi_arr() and as the last argument of
//                 half_btf() for that stage.
//   stage_range - indexed by stage; entries 0..11 are handed to
//                 range_check().
//
// The stages ping-pong between `output` and the local `step` array: odd
// stages write `output`, even stages write `step`. Stage 1 reads `input`
// while writing `output` in a different order, so calling this with
// input == output would read values that were already overwritten.
//
// range_check() forwards to range_check_func() when
// CONFIG_COEFFICIENT_RANGE_CHECKING is enabled and is a no-op otherwise.
// NOTE(review): half_btf() and cospi_arr() are defined outside this chunk;
// half_btf(w0, in0, w1, in1, bit) presumably returns the rounded, bit-shifted
// value of w0 * in0 + w1 * in1 -- confirm against its definition.
void av1_fadst32_new(const int32_t *input, int32_t *output,
                     const int8_t *cos_bit, const int8_t *stage_range) {
  const int32_t size = 32;
  const int32_t *cospi;

  int32_t stage = 0;
  int32_t *bf0, *bf1;  // bf0: source buffer of a stage, bf1: its destination
  int32_t step[32];    // scratch buffer written by the even stages

  // stage 0: only range-check the untouched input.
  range_check(stage, input, input, size, stage_range[stage]);

  // stage 1: reorder the input into output. Even slots take odd input
  // indices counting down from 31, odd slots take even input indices
  // counting up from 0.
  stage++;
  bf1 = output;
  bf1[0] = input[31];
  bf1[1] = input[0];
  bf1[2] = input[29];
  bf1[3] = input[2];
  bf1[4] = input[27];
  bf1[5] = input[4];
  bf1[6] = input[25];
  bf1[7] = input[6];
  bf1[8] = input[23];
  bf1[9] = input[8];
  bf1[10] = input[21];
  bf1[11] = input[10];
  bf1[12] = input[19];
  bf1[13] = input[12];
  bf1[14] = input[17];
  bf1[15] = input[14];
  bf1[16] = input[15];
  bf1[17] = input[16];
  bf1[18] = input[13];
  bf1[19] = input[18];
  bf1[20] = input[11];
  bf1[21] = input[20];
  bf1[22] = input[9];
  bf1[23] = input[22];
  bf1[24] = input[7];
  bf1[25] = input[24];
  bf1[26] = input[5];
  bf1[27] = input[26];
  bf1[28] = input[3];
  bf1[29] = input[28];
  bf1[30] = input[1];
  bf1[31] = input[30];
  range_check(stage, input, bf1, size, stage_range[stage]);

  // stage 2: combine each adjacent pair (2k, 2k+1) through half_btf. The
  // pair uses cospi[4k + 1] and cospi[63 - 4k]; the odd lane swaps the
  // operands and negates the first weight.
  stage++;
  cospi = cospi_arr(cos_bit[stage]);
  bf0 = output;
  bf1 = step;
  bf1[0] = half_btf(cospi[1], bf0[0], cospi[63], bf0[1], cos_bit[stage]);
  bf1[1] = half_btf(-cospi[1], bf0[1], cospi[63], bf0[0], cos_bit[stage]);
  bf1[2] = half_btf(cospi[5], bf0[2], cospi[59], bf0[3], cos_bit[stage]);
  bf1[3] = half_btf(-cospi[5], bf0[3], cospi[59], bf0[2], cos_bit[stage]);
  bf1[4] = half_btf(cospi[9], bf0[4], cospi[55], bf0[5], cos_bit[stage]);
  bf1[5] = half_btf(-cospi[9], bf0[5], cospi[55], bf0[4], cos_bit[stage]);
  bf1[6] = half_btf(cospi[13], bf0[6], cospi[51], bf0[7], cos_bit[stage]);
  bf1[7] = half_btf(-cospi[13], bf0[7], cospi[51], bf0[6], cos_bit[stage]);
  bf1[8] = half_btf(cospi[17], bf0[8], cospi[47], bf0[9], cos_bit[stage]);
  bf1[9] = half_btf(-cospi[17], bf0[9], cospi[47], bf0[8], cos_bit[stage]);
  bf1[10] = half_btf(cospi[21], bf0[10], cospi[43], bf0[11], cos_bit[stage]);
  bf1[11] = half_btf(-cospi[21], bf0[11], cospi[43], bf0[10], cos_bit[stage]);
  bf1[12] = half_btf(cospi[25], bf0[12], cospi[39], bf0[13], cos_bit[stage]);
  bf1[13] = half_btf(-cospi[25], bf0[13], cospi[39], bf0[12], cos_bit[stage]);
  bf1[14] = half_btf(cospi[29], bf0[14], cospi[35], bf0[15], cos_bit[stage]);
  bf1[15] = half_btf(-cospi[29], bf0[15], cospi[35], bf0[14], cos_bit[stage]);
  bf1[16] = half_btf(cospi[33], bf0[16], cospi[31], bf0[17], cos_bit[stage]);
  bf1[17] = half_btf(-cospi[33], bf0[17], cospi[31], bf0[16], cos_bit[stage]);
  bf1[18] = half_btf(cospi[37], bf0[18], cospi[27], bf0[19], cos_bit[stage]);
  bf1[19] = half_btf(-cospi[37], bf0[19], cospi[27], bf0[18], cos_bit[stage]);
  bf1[20] = half_btf(cospi[41], bf0[20], cospi[23], bf0[21], cos_bit[stage]);
  bf1[21] = half_btf(-cospi[41], bf0[21], cospi[23], bf0[20], cos_bit[stage]);
  bf1[22] = half_btf(cospi[45], bf0[22], cospi[19], bf0[23], cos_bit[stage]);
  bf1[23] = half_btf(-cospi[45], bf0[23], cospi[19], bf0[22], cos_bit[stage]);
  bf1[24] = half_btf(cospi[49], bf0[24], cospi[15], bf0[25], cos_bit[stage]);
  bf1[25] = half_btf(-cospi[49], bf0[25], cospi[15], bf0[24], cos_bit[stage]);
  bf1[26] = half_btf(cospi[53], bf0[26], cospi[11], bf0[27], cos_bit[stage]);
  bf1[27] = half_btf(-cospi[53], bf0[27], cospi[11], bf0[26], cos_bit[stage]);
  bf1[28] = half_btf(cospi[57], bf0[28], cospi[7], bf0[29], cos_bit[stage]);
  bf1[29] = half_btf(-cospi[57], bf0[29], cospi[7], bf0[28], cos_bit[stage]);
  bf1[30] = half_btf(cospi[61], bf0[30], cospi[3], bf0[31], cos_bit[stage]);
  bf1[31] = half_btf(-cospi[61], bf0[31], cospi[3], bf0[30], cos_bit[stage]);
  range_check(stage, input, bf1, size, stage_range[stage]);

  // stage 3: add/subtract butterflies between lanes k and k + 16
  // (sums in 0..15, differences in 16..31).
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = bf0[0] + bf0[16];
  bf1[1] = bf0[1] + bf0[17];
  bf1[2] = bf0[2] + bf0[18];
  bf1[3] = bf0[3] + bf0[19];
  bf1[4] = bf0[4] + bf0[20];
  bf1[5] = bf0[5] + bf0[21];
  bf1[6] = bf0[6] + bf0[22];
  bf1[7] = bf0[7] + bf0[23];
  bf1[8] = bf0[8] + bf0[24];
  bf1[9] = bf0[9] + bf0[25];
  bf1[10] = bf0[10] + bf0[26];
  bf1[11] = bf0[11] + bf0[27];
  bf1[12] = bf0[12] + bf0[28];
  bf1[13] = bf0[13] + bf0[29];
  bf1[14] = bf0[14] + bf0[30];
  bf1[15] = bf0[15] + bf0[31];
  bf1[16] = -bf0[16] + bf0[0];
  bf1[17] = -bf0[17] + bf0[1];
  bf1[18] = -bf0[18] + bf0[2];
  bf1[19] = -bf0[19] + bf0[3];
  bf1[20] = -bf0[20] + bf0[4];
  bf1[21] = -bf0[21] + bf0[5];
  bf1[22] = -bf0[22] + bf0[6];
  bf1[23] = -bf0[23] + bf0[7];
  bf1[24] = -bf0[24] + bf0[8];
  bf1[25] = -bf0[25] + bf0[9];
  bf1[26] = -bf0[26] + bf0[10];
  bf1[27] = -bf0[27] + bf0[11];
  bf1[28] = -bf0[28] + bf0[12];
  bf1[29] = -bf0[29] + bf0[13];
  bf1[30] = -bf0[30] + bf0[14];
  bf1[31] = -bf0[31] + bf0[15];
  range_check(stage, input, bf1, size, stage_range[stage]);

  // stage 4: lanes 0..15 pass through unchanged; lanes 16..31 are combined
  // in adjacent pairs through half_btf (weights 4/60, 20/44, 36/28, 52/12,
  // with the sign pattern mirrored for lanes 24..31).
  stage++;
  cospi = cospi_arr(cos_bit[stage]);
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = bf0[4];
  bf1[5] = bf0[5];
  bf1[6] = bf0[6];
  bf1[7] = bf0[7];
  bf1[8] = bf0[8];
  bf1[9] = bf0[9];
  bf1[10] = bf0[10];
  bf1[11] = bf0[11];
  bf1[12] = bf0[12];
  bf1[13] = bf0[13];
  bf1[14] = bf0[14];
  bf1[15] = bf0[15];
  bf1[16] = half_btf(cospi[4], bf0[16], cospi[60], bf0[17], cos_bit[stage]);
  bf1[17] = half_btf(-cospi[4], bf0[17], cospi[60], bf0[16], cos_bit[stage]);
  bf1[18] = half_btf(cospi[20], bf0[18], cospi[44], bf0[19], cos_bit[stage]);
  bf1[19] = half_btf(-cospi[20], bf0[19], cospi[44], bf0[18], cos_bit[stage]);
  bf1[20] = half_btf(cospi[36], bf0[20], cospi[28], bf0[21], cos_bit[stage]);
  bf1[21] = half_btf(-cospi[36], bf0[21], cospi[28], bf0[20], cos_bit[stage]);
  bf1[22] = half_btf(cospi[52], bf0[22], cospi[12], bf0[23], cos_bit[stage]);
  bf1[23] = half_btf(-cospi[52], bf0[23], cospi[12], bf0[22], cos_bit[stage]);
  bf1[24] = half_btf(-cospi[60], bf0[24], cospi[4], bf0[25], cos_bit[stage]);
  bf1[25] = half_btf(cospi[60], bf0[25], cospi[4], bf0[24], cos_bit[stage]);
  bf1[26] = half_btf(-cospi[44], bf0[26], cospi[20], bf0[27], cos_bit[stage]);
  bf1[27] = half_btf(cospi[44], bf0[27], cospi[20], bf0[26], cos_bit[stage]);
  bf1[28] = half_btf(-cospi[28], bf0[28], cospi[36], bf0[29], cos_bit[stage]);
  bf1[29] = half_btf(cospi[28], bf0[29], cospi[36], bf0[28], cos_bit[stage]);
  bf1[30] = half_btf(-cospi[12], bf0[30], cospi[52], bf0[31], cos_bit[stage]);
  bf1[31] = half_btf(cospi[12], bf0[31], cospi[52], bf0[30], cos_bit[stage]);
  range_check(stage, input, bf1, size, stage_range[stage]);

  // stage 5: add/subtract butterflies between lanes k and k + 8 inside each
  // group of sixteen lanes.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = bf0[0] + bf0[8];
  bf1[1] = bf0[1] + bf0[9];
  bf1[2] = bf0[2] + bf0[10];
  bf1[3] = bf0[3] + bf0[11];
  bf1[4] = bf0[4] + bf0[12];
  bf1[5] = bf0[5] + bf0[13];
  bf1[6] = bf0[6] + bf0[14];
  bf1[7] = bf0[7] + bf0[15];
  bf1[8] = -bf0[8] + bf0[0];
  bf1[9] = -bf0[9] + bf0[1];
  bf1[10] = -bf0[10] + bf0[2];
  bf1[11] = -bf0[11] + bf0[3];
  bf1[12] = -bf0[12] + bf0[4];
  bf1[13] = -bf0[13] + bf0[5];
  bf1[14] = -bf0[14] + bf0[6];
  bf1[15] = -bf0[15] + bf0[7];
  bf1[16] = bf0[16] + bf0[24];
  bf1[17] = bf0[17] + bf0[25];
  bf1[18] = bf0[18] + bf0[26];
  bf1[19] = bf0[19] + bf0[27];
  bf1[20] = bf0[20] + bf0[28];
  bf1[21] = bf0[21] + bf0[29];
  bf1[22] = bf0[22] + bf0[30];
  bf1[23] = bf0[23] + bf0[31];
  bf1[24] = -bf0[24] + bf0[16];
  bf1[25] = -bf0[25] + bf0[17];
  bf1[26] = -bf0[26] + bf0[18];
  bf1[27] = -bf0[27] + bf0[19];
  bf1[28] = -bf0[28] + bf0[20];
  bf1[29] = -bf0[29] + bf0[21];
  bf1[30] = -bf0[30] + bf0[22];
  bf1[31] = -bf0[31] + bf0[23];
  range_check(stage, input, bf1, size, stage_range[stage]);

  // stage 6: in each group of sixteen lanes the first eight pass through and
  // the last eight go through half_btf with the 8/56 and 40/24 weights.
  stage++;
  cospi = cospi_arr(cos_bit[stage]);
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = bf0[4];
  bf1[5] = bf0[5];
  bf1[6] = bf0[6];
  bf1[7] = bf0[7];
  bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit[stage]);
  bf1[9] = half_btf(-cospi[8], bf0[9], cospi[56], bf0[8], cos_bit[stage]);
  bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit[stage]);
  bf1[11] = half_btf(-cospi[40], bf0[11], cospi[24], bf0[10], cos_bit[stage]);
  bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit[stage]);
  bf1[13] = half_btf(cospi[56], bf0[13], cospi[8], bf0[12], cos_bit[stage]);
  bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit[stage]);
  bf1[15] = half_btf(cospi[24], bf0[15], cospi[40], bf0[14], cos_bit[stage]);
  bf1[16] = bf0[16];
  bf1[17] = bf0[17];
  bf1[18] = bf0[18];
  bf1[19] = bf0[19];
  bf1[20] = bf0[20];
  bf1[21] = bf0[21];
  bf1[22] = bf0[22];
  bf1[23] = bf0[23];
  bf1[24] = half_btf(cospi[8], bf0[24], cospi[56], bf0[25], cos_bit[stage]);
  bf1[25] = half_btf(-cospi[8], bf0[25], cospi[56], bf0[24], cos_bit[stage]);
  bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[27], cos_bit[stage]);
  bf1[27] = half_btf(-cospi[40], bf0[27], cospi[24], bf0[26], cos_bit[stage]);
  bf1[28] = half_btf(-cospi[56], bf0[28], cospi[8], bf0[29], cos_bit[stage]);
  bf1[29] = half_btf(cospi[56], bf0[29], cospi[8], bf0[28], cos_bit[stage]);
  bf1[30] = half_btf(-cospi[24], bf0[30], cospi[40], bf0[31], cos_bit[stage]);
  bf1[31] = half_btf(cospi[24], bf0[31], cospi[40], bf0[30], cos_bit[stage]);
  range_check(stage, input, bf1, size, stage_range[stage]);

  // stage 7: add/subtract butterflies between lanes k and k + 4 inside each
  // group of eight lanes.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = bf0[0] + bf0[4];
  bf1[1] = bf0[1] + bf0[5];
  bf1[2] = bf0[2] + bf0[6];
  bf1[3] = bf0[3] + bf0[7];
  bf1[4] = -bf0[4] + bf0[0];
  bf1[5] = -bf0[5] + bf0[1];
  bf1[6] = -bf0[6] + bf0[2];
  bf1[7] = -bf0[7] + bf0[3];
  bf1[8] = bf0[8] + bf0[12];
  bf1[9] = bf0[9] + bf0[13];
  bf1[10] = bf0[10] + bf0[14];
  bf1[11] = bf0[11] + bf0[15];
  bf1[12] = -bf0[12] + bf0[8];
  bf1[13] = -bf0[13] + bf0[9];
  bf1[14] = -bf0[14] + bf0[10];
  bf1[15] = -bf0[15] + bf0[11];
  bf1[16] = bf0[16] + bf0[20];
  bf1[17] = bf0[17] + bf0[21];
  bf1[18] = bf0[18] + bf0[22];
  bf1[19] = bf0[19] + bf0[23];
  bf1[20] = -bf0[20] + bf0[16];
  bf1[21] = -bf0[21] + bf0[17];
  bf1[22] = -bf0[22] + bf0[18];
  bf1[23] = -bf0[23] + bf0[19];
  bf1[24] = bf0[24] + bf0[28];
  bf1[25] = bf0[25] + bf0[29];
  bf1[26] = bf0[26] + bf0[30];
  bf1[27] = bf0[27] + bf0[31];
  bf1[28] = -bf0[28] + bf0[24];
  bf1[29] = -bf0[29] + bf0[25];
  bf1[30] = -bf0[30] + bf0[26];
  bf1[31] = -bf0[31] + bf0[27];
  range_check(stage, input, bf1, size, stage_range[stage]);

  // stage 8: in each group of eight lanes the first four pass through and
  // the last four go through half_btf with the cospi[16]/cospi[48] weights.
  stage++;
  cospi = cospi_arr(cos_bit[stage]);
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = bf0[2];
  bf1[3] = bf0[3];
  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
  bf1[5] = half_btf(-cospi[16], bf0[5], cospi[48], bf0[4], cos_bit[stage]);
  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
  bf1[7] = half_btf(cospi[48], bf0[7], cospi[16], bf0[6], cos_bit[stage]);
  bf1[8] = bf0[8];
  bf1[9] = bf0[9];
  bf1[10] = bf0[10];
  bf1[11] = bf0[11];
  bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit[stage]);
  bf1[13] = half_btf(-cospi[16], bf0[13], cospi[48], bf0[12], cos_bit[stage]);
  bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit[stage]);
  bf1[15] = half_btf(cospi[48], bf0[15], cospi[16], bf0[14], cos_bit[stage]);
  bf1[16] = bf0[16];
  bf1[17] = bf0[17];
  bf1[18] = bf0[18];
  bf1[19] = bf0[19];
  bf1[20] = half_btf(cospi[16], bf0[20], cospi[48], bf0[21], cos_bit[stage]);
  bf1[21] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[20], cos_bit[stage]);
  bf1[22] = half_btf(-cospi[48], bf0[22], cospi[16], bf0[23], cos_bit[stage]);
  bf1[23] = half_btf(cospi[48], bf0[23], cospi[16], bf0[22], cos_bit[stage]);
  bf1[24] = bf0[24];
  bf1[25] = bf0[25];
  bf1[26] = bf0[26];
  bf1[27] = bf0[27];
  bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[29], cos_bit[stage]);
  bf1[29] = half_btf(-cospi[16], bf0[29], cospi[48], bf0[28], cos_bit[stage]);
  bf1[30] = half_btf(-cospi[48], bf0[30], cospi[16], bf0[31], cos_bit[stage]);
  bf1[31] = half_btf(cospi[48], bf0[31], cospi[16], bf0[30], cos_bit[stage]);
  range_check(stage, input, bf1, size, stage_range[stage]);

  // stage 9: add/subtract butterflies between lanes k and k + 2 inside each
  // group of four lanes.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = bf0[0] + bf0[2];
  bf1[1] = bf0[1] + bf0[3];
  bf1[2] = -bf0[2] + bf0[0];
  bf1[3] = -bf0[3] + bf0[1];
  bf1[4] = bf0[4] + bf0[6];
  bf1[5] = bf0[5] + bf0[7];
  bf1[6] = -bf0[6] + bf0[4];
  bf1[7] = -bf0[7] + bf0[5];
  bf1[8] = bf0[8] + bf0[10];
  bf1[9] = bf0[9] + bf0[11];
  bf1[10] = -bf0[10] + bf0[8];
  bf1[11] = -bf0[11] + bf0[9];
  bf1[12] = bf0[12] + bf0[14];
  bf1[13] = bf0[13] + bf0[15];
  bf1[14] = -bf0[14] + bf0[12];
  bf1[15] = -bf0[15] + bf0[13];
  bf1[16] = bf0[16] + bf0[18];
  bf1[17] = bf0[17] + bf0[19];
  bf1[18] = -bf0[18] + bf0[16];
  bf1[19] = -bf0[19] + bf0[17];
  bf1[20] = bf0[20] + bf0[22];
  bf1[21] = bf0[21] + bf0[23];
  bf1[22] = -bf0[22] + bf0[20];
  bf1[23] = -bf0[23] + bf0[21];
  bf1[24] = bf0[24] + bf0[26];
  bf1[25] = bf0[25] + bf0[27];
  bf1[26] = -bf0[26] + bf0[24];
  bf1[27] = -bf0[27] + bf0[25];
  bf1[28] = bf0[28] + bf0[30];
  bf1[29] = bf0[29] + bf0[31];
  bf1[30] = -bf0[30] + bf0[28];
  bf1[31] = -bf0[31] + bf0[29];
  range_check(stage, input, bf1, size, stage_range[stage]);

  // stage 10: in each group of four lanes the first two pass through and the
  // last two go through half_btf with cospi[32] on both operands.
  stage++;
  cospi = cospi_arr(cos_bit[stage]);
  bf0 = output;
  bf1 = step;
  bf1[0] = bf0[0];
  bf1[1] = bf0[1];
  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
  bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]);
  bf1[4] = bf0[4];
  bf1[5] = bf0[5];
  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
  bf1[7] = half_btf(-cospi[32], bf0[7], cospi[32], bf0[6], cos_bit[stage]);
  bf1[8] = bf0[8];
  bf1[9] = bf0[9];
  bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit[stage]);
  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[10], cos_bit[stage]);
  bf1[12] = bf0[12];
  bf1[13] = bf0[13];
  bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit[stage]);
  bf1[15] = half_btf(-cospi[32], bf0[15], cospi[32], bf0[14], cos_bit[stage]);
  bf1[16] = bf0[16];
  bf1[17] = bf0[17];
  bf1[18] = half_btf(cospi[32], bf0[18], cospi[32], bf0[19], cos_bit[stage]);
  bf1[19] = half_btf(-cospi[32], bf0[19], cospi[32], bf0[18], cos_bit[stage]);
  bf1[20] = bf0[20];
  bf1[21] = bf0[21];
  bf1[22] = half_btf(cospi[32], bf0[22], cospi[32], bf0[23], cos_bit[stage]);
  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[22], cos_bit[stage]);
  bf1[24] = bf0[24];
  bf1[25] = bf0[25];
  bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[27], cos_bit[stage]);
  bf1[27] = half_btf(-cospi[32], bf0[27], cospi[32], bf0[26], cos_bit[stage]);
  bf1[28] = bf0[28];
  bf1[29] = bf0[29];
  bf1[30] = half_btf(cospi[32], bf0[30], cospi[32], bf0[31], cos_bit[stage]);
  bf1[31] = half_btf(-cospi[32], bf0[31], cospi[32], bf0[30], cos_bit[stage]);
  range_check(stage, input, bf1, size, stage_range[stage]);

  // stage 11: final reordering from step into output; every odd output slot
  // receives a negated value.
  stage++;
  bf0 = step;
  bf1 = output;
  bf1[0] = bf0[0];
  bf1[1] = -bf0[16];
  bf1[2] = bf0[24];
  bf1[3] = -bf0[8];
  bf1[4] = bf0[12];
  bf1[5] = -bf0[28];
  bf1[6] = bf0[20];
  bf1[7] = -bf0[4];
  bf1[8] = bf0[6];
  bf1[9] = -bf0[22];
  bf1[10] = bf0[30];
  bf1[11] = -bf0[14];
  bf1[12] = bf0[10];
  bf1[13] = -bf0[26];
  bf1[14] = bf0[18];
  bf1[15] = -bf0[2];
  bf1[16] = bf0[3];
  bf1[17] = -bf0[19];
  bf1[18] = bf0[27];
  bf1[19] = -bf0[11];
  bf1[20] = bf0[15];
  bf1[21] = -bf0[31];
  bf1[22] = bf0[23];
  bf1[23] = -bf0[7];
  bf1[24] = bf0[5];
  bf1[25] = -bf0[21];
  bf1[26] = bf0[29];
  bf1[27] = -bf0[13];
  bf1[28] = bf0[9];
  bf1[29] = -bf0[25];
  bf1[30] = bf0[17];
  bf1[31] = -bf0[1];
  range_check(stage, input, bf1, size, stage_range[stage]);
}
1519 
1520 #if CONFIG_EXT_TX
av1_fidentity4_c(const int32_t * input,int32_t * output,const int8_t * cos_bit,const int8_t * stage_range)1521 void av1_fidentity4_c(const int32_t *input, int32_t *output,
1522                       const int8_t *cos_bit, const int8_t *stage_range) {
1523   (void)cos_bit;
1524   for (int i = 0; i < 4; ++i)
1525     output[i] = (int32_t)dct_const_round_shift(input[i] * Sqrt2);
1526   range_check(0, input, output, 4, stage_range[0]);
1527 }
1528 
// 8-point forward identity transform: every input value is doubled.
// cos_bit is accepted only to match the shared 1-D transform signature and is
// not used. stage_range[0] is forwarded to range_check() for the result.
void av1_fidentity8_c(const int32_t *input, int32_t *output,
                      const int8_t *cos_bit, const int8_t *stage_range) {
  int idx;
  (void)cos_bit;
  for (idx = 0; idx < 8; ++idx) {
    output[idx] = input[idx] * 2;
  }
  range_check(0, input, output, 8, stage_range[0]);
}
1535 
av1_fidentity16_c(const int32_t * input,int32_t * output,const int8_t * cos_bit,const int8_t * stage_range)1536 void av1_fidentity16_c(const int32_t *input, int32_t *output,
1537                        const int8_t *cos_bit, const int8_t *stage_range) {
1538   (void)cos_bit;
1539   for (int i = 0; i < 16; ++i)
1540     output[i] = (int32_t)dct_const_round_shift(input[i] * 2 * Sqrt2);
1541   range_check(0, input, output, 16, stage_range[0]);
1542 }
1543 
// 32-point forward identity transform: every input value is multiplied by 4.
// cos_bit is accepted only to match the shared 1-D transform signature and is
// not used. stage_range[0] is forwarded to range_check() for the result.
void av1_fidentity32_c(const int32_t *input, int32_t *output,
                       const int8_t *cos_bit, const int8_t *stage_range) {
  int idx;
  (void)cos_bit;
  for (idx = 0; idx < 32; ++idx) {
    output[idx] = input[idx] * 4;
  }
  range_check(0, input, output, 32, stage_range[0]);
}
1550 
1551 #if CONFIG_TX64X64
av1_fidentity64_c(const int32_t * input,int32_t * output,const int8_t * cos_bit,const int8_t * stage_range)1552 void av1_fidentity64_c(const int32_t *input, int32_t *output,
1553                        const int8_t *cos_bit, const int8_t *stage_range) {
1554   (void)cos_bit;
1555   for (int i = 0; i < 64; ++i)
1556     output[i] = (int32_t)dct_const_round_shift(input[i] * 4 * Sqrt2);
1557   range_check(0, input, output, 64, stage_range[0]);
1558 }
1559 #endif  // CONFIG_TX64X64
1560 #endif  // CONFIG_EXT_TX
1561 
1562 #if CONFIG_TX64X64
av1_fdct64_new(const int32_t * input,int32_t * output,const int8_t * cos_bit,const int8_t * stage_range)1563 void av1_fdct64_new(const int32_t *input, int32_t *output,
1564                     const int8_t *cos_bit, const int8_t *stage_range) {
1565   const int32_t size = 64;
1566   const int32_t *cospi;
1567 
1568   int32_t stage = 0;
1569   int32_t *bf0, *bf1;
1570   int32_t step[64];
1571 
1572   // stage 0;
1573   range_check(stage, input, input, size, stage_range[stage]);
1574 
1575   // stage 1;
1576   stage++;
1577   cospi = cospi_arr(cos_bit[stage]);
1578   bf1 = output;
1579   bf1[0] = input[0] + input[63];
1580   bf1[1] = input[1] + input[62];
1581   bf1[2] = input[2] + input[61];
1582   bf1[3] = input[3] + input[60];
1583   bf1[4] = input[4] + input[59];
1584   bf1[5] = input[5] + input[58];
1585   bf1[6] = input[6] + input[57];
1586   bf1[7] = input[7] + input[56];
1587   bf1[8] = input[8] + input[55];
1588   bf1[9] = input[9] + input[54];
1589   bf1[10] = input[10] + input[53];
1590   bf1[11] = input[11] + input[52];
1591   bf1[12] = input[12] + input[51];
1592   bf1[13] = input[13] + input[50];
1593   bf1[14] = input[14] + input[49];
1594   bf1[15] = input[15] + input[48];
1595   bf1[16] = input[16] + input[47];
1596   bf1[17] = input[17] + input[46];
1597   bf1[18] = input[18] + input[45];
1598   bf1[19] = input[19] + input[44];
1599   bf1[20] = input[20] + input[43];
1600   bf1[21] = input[21] + input[42];
1601   bf1[22] = input[22] + input[41];
1602   bf1[23] = input[23] + input[40];
1603   bf1[24] = input[24] + input[39];
1604   bf1[25] = input[25] + input[38];
1605   bf1[26] = input[26] + input[37];
1606   bf1[27] = input[27] + input[36];
1607   bf1[28] = input[28] + input[35];
1608   bf1[29] = input[29] + input[34];
1609   bf1[30] = input[30] + input[33];
1610   bf1[31] = input[31] + input[32];
1611   bf1[32] = -input[32] + input[31];
1612   bf1[33] = -input[33] + input[30];
1613   bf1[34] = -input[34] + input[29];
1614   bf1[35] = -input[35] + input[28];
1615   bf1[36] = -input[36] + input[27];
1616   bf1[37] = -input[37] + input[26];
1617   bf1[38] = -input[38] + input[25];
1618   bf1[39] = -input[39] + input[24];
1619   bf1[40] = -input[40] + input[23];
1620   bf1[41] = -input[41] + input[22];
1621   bf1[42] = -input[42] + input[21];
1622   bf1[43] = -input[43] + input[20];
1623   bf1[44] = -input[44] + input[19];
1624   bf1[45] = -input[45] + input[18];
1625   bf1[46] = -input[46] + input[17];
1626   bf1[47] = -input[47] + input[16];
1627   bf1[48] = -input[48] + input[15];
1628   bf1[49] = -input[49] + input[14];
1629   bf1[50] = -input[50] + input[13];
1630   bf1[51] = -input[51] + input[12];
1631   bf1[52] = -input[52] + input[11];
1632   bf1[53] = -input[53] + input[10];
1633   bf1[54] = -input[54] + input[9];
1634   bf1[55] = -input[55] + input[8];
1635   bf1[56] = -input[56] + input[7];
1636   bf1[57] = -input[57] + input[6];
1637   bf1[58] = -input[58] + input[5];
1638   bf1[59] = -input[59] + input[4];
1639   bf1[60] = -input[60] + input[3];
1640   bf1[61] = -input[61] + input[2];
1641   bf1[62] = -input[62] + input[1];
1642   bf1[63] = -input[63] + input[0];
1643   range_check(stage, input, bf1, size, stage_range[stage]);
1644 
1645   // stage 2
1646   stage++;
1647   cospi = cospi_arr(cos_bit[stage]);
1648   bf0 = output;
1649   bf1 = step;
1650   bf1[0] = bf0[0] + bf0[31];
1651   bf1[1] = bf0[1] + bf0[30];
1652   bf1[2] = bf0[2] + bf0[29];
1653   bf1[3] = bf0[3] + bf0[28];
1654   bf1[4] = bf0[4] + bf0[27];
1655   bf1[5] = bf0[5] + bf0[26];
1656   bf1[6] = bf0[6] + bf0[25];
1657   bf1[7] = bf0[7] + bf0[24];
1658   bf1[8] = bf0[8] + bf0[23];
1659   bf1[9] = bf0[9] + bf0[22];
1660   bf1[10] = bf0[10] + bf0[21];
1661   bf1[11] = bf0[11] + bf0[20];
1662   bf1[12] = bf0[12] + bf0[19];
1663   bf1[13] = bf0[13] + bf0[18];
1664   bf1[14] = bf0[14] + bf0[17];
1665   bf1[15] = bf0[15] + bf0[16];
1666   bf1[16] = -bf0[16] + bf0[15];
1667   bf1[17] = -bf0[17] + bf0[14];
1668   bf1[18] = -bf0[18] + bf0[13];
1669   bf1[19] = -bf0[19] + bf0[12];
1670   bf1[20] = -bf0[20] + bf0[11];
1671   bf1[21] = -bf0[21] + bf0[10];
1672   bf1[22] = -bf0[22] + bf0[9];
1673   bf1[23] = -bf0[23] + bf0[8];
1674   bf1[24] = -bf0[24] + bf0[7];
1675   bf1[25] = -bf0[25] + bf0[6];
1676   bf1[26] = -bf0[26] + bf0[5];
1677   bf1[27] = -bf0[27] + bf0[4];
1678   bf1[28] = -bf0[28] + bf0[3];
1679   bf1[29] = -bf0[29] + bf0[2];
1680   bf1[30] = -bf0[30] + bf0[1];
1681   bf1[31] = -bf0[31] + bf0[0];
1682   bf1[32] = bf0[32];
1683   bf1[33] = bf0[33];
1684   bf1[34] = bf0[34];
1685   bf1[35] = bf0[35];
1686   bf1[36] = bf0[36];
1687   bf1[37] = bf0[37];
1688   bf1[38] = bf0[38];
1689   bf1[39] = bf0[39];
1690   bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit[stage]);
1691   bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit[stage]);
1692   bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit[stage]);
1693   bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit[stage]);
1694   bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit[stage]);
1695   bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit[stage]);
1696   bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit[stage]);
1697   bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit[stage]);
1698   bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit[stage]);
1699   bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit[stage]);
1700   bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit[stage]);
1701   bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit[stage]);
1702   bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit[stage]);
1703   bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit[stage]);
1704   bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit[stage]);
1705   bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit[stage]);
1706   bf1[56] = bf0[56];
1707   bf1[57] = bf0[57];
1708   bf1[58] = bf0[58];
1709   bf1[59] = bf0[59];
1710   bf1[60] = bf0[60];
1711   bf1[61] = bf0[61];
1712   bf1[62] = bf0[62];
1713   bf1[63] = bf0[63];
1714   range_check(stage, input, bf1, size, stage_range[stage]);
1715 
1716   // stage 3
1717   stage++;
1718   cospi = cospi_arr(cos_bit[stage]);
1719   bf0 = step;
1720   bf1 = output;
1721   bf1[0] = bf0[0] + bf0[15];
1722   bf1[1] = bf0[1] + bf0[14];
1723   bf1[2] = bf0[2] + bf0[13];
1724   bf1[3] = bf0[3] + bf0[12];
1725   bf1[4] = bf0[4] + bf0[11];
1726   bf1[5] = bf0[5] + bf0[10];
1727   bf1[6] = bf0[6] + bf0[9];
1728   bf1[7] = bf0[7] + bf0[8];
1729   bf1[8] = -bf0[8] + bf0[7];
1730   bf1[9] = -bf0[9] + bf0[6];
1731   bf1[10] = -bf0[10] + bf0[5];
1732   bf1[11] = -bf0[11] + bf0[4];
1733   bf1[12] = -bf0[12] + bf0[3];
1734   bf1[13] = -bf0[13] + bf0[2];
1735   bf1[14] = -bf0[14] + bf0[1];
1736   bf1[15] = -bf0[15] + bf0[0];
1737   bf1[16] = bf0[16];
1738   bf1[17] = bf0[17];
1739   bf1[18] = bf0[18];
1740   bf1[19] = bf0[19];
1741   bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
1742   bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
1743   bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
1744   bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
1745   bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit[stage]);
1746   bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit[stage]);
1747   bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit[stage]);
1748   bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit[stage]);
1749   bf1[28] = bf0[28];
1750   bf1[29] = bf0[29];
1751   bf1[30] = bf0[30];
1752   bf1[31] = bf0[31];
1753   bf1[32] = bf0[32] + bf0[47];
1754   bf1[33] = bf0[33] + bf0[46];
1755   bf1[34] = bf0[34] + bf0[45];
1756   bf1[35] = bf0[35] + bf0[44];
1757   bf1[36] = bf0[36] + bf0[43];
1758   bf1[37] = bf0[37] + bf0[42];
1759   bf1[38] = bf0[38] + bf0[41];
1760   bf1[39] = bf0[39] + bf0[40];
1761   bf1[40] = -bf0[40] + bf0[39];
1762   bf1[41] = -bf0[41] + bf0[38];
1763   bf1[42] = -bf0[42] + bf0[37];
1764   bf1[43] = -bf0[43] + bf0[36];
1765   bf1[44] = -bf0[44] + bf0[35];
1766   bf1[45] = -bf0[45] + bf0[34];
1767   bf1[46] = -bf0[46] + bf0[33];
1768   bf1[47] = -bf0[47] + bf0[32];
1769   bf1[48] = -bf0[48] + bf0[63];
1770   bf1[49] = -bf0[49] + bf0[62];
1771   bf1[50] = -bf0[50] + bf0[61];
1772   bf1[51] = -bf0[51] + bf0[60];
1773   bf1[52] = -bf0[52] + bf0[59];
1774   bf1[53] = -bf0[53] + bf0[58];
1775   bf1[54] = -bf0[54] + bf0[57];
1776   bf1[55] = -bf0[55] + bf0[56];
1777   bf1[56] = bf0[56] + bf0[55];
1778   bf1[57] = bf0[57] + bf0[54];
1779   bf1[58] = bf0[58] + bf0[53];
1780   bf1[59] = bf0[59] + bf0[52];
1781   bf1[60] = bf0[60] + bf0[51];
1782   bf1[61] = bf0[61] + bf0[50];
1783   bf1[62] = bf0[62] + bf0[49];
1784   bf1[63] = bf0[63] + bf0[48];
1785   range_check(stage, input, bf1, size, stage_range[stage]);
1786 
1787   // stage 4
1788   stage++;
1789   cospi = cospi_arr(cos_bit[stage]);
1790   bf0 = output;
1791   bf1 = step;
1792   bf1[0] = bf0[0] + bf0[7];
1793   bf1[1] = bf0[1] + bf0[6];
1794   bf1[2] = bf0[2] + bf0[5];
1795   bf1[3] = bf0[3] + bf0[4];
1796   bf1[4] = -bf0[4] + bf0[3];
1797   bf1[5] = -bf0[5] + bf0[2];
1798   bf1[6] = -bf0[6] + bf0[1];
1799   bf1[7] = -bf0[7] + bf0[0];
1800   bf1[8] = bf0[8];
1801   bf1[9] = bf0[9];
1802   bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
1803   bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
1804   bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit[stage]);
1805   bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit[stage]);
1806   bf1[14] = bf0[14];
1807   bf1[15] = bf0[15];
1808   bf1[16] = bf0[16] + bf0[23];
1809   bf1[17] = bf0[17] + bf0[22];
1810   bf1[18] = bf0[18] + bf0[21];
1811   bf1[19] = bf0[19] + bf0[20];
1812   bf1[20] = -bf0[20] + bf0[19];
1813   bf1[21] = -bf0[21] + bf0[18];
1814   bf1[22] = -bf0[22] + bf0[17];
1815   bf1[23] = -bf0[23] + bf0[16];
1816   bf1[24] = -bf0[24] + bf0[31];
1817   bf1[25] = -bf0[25] + bf0[30];
1818   bf1[26] = -bf0[26] + bf0[29];
1819   bf1[27] = -bf0[27] + bf0[28];
1820   bf1[28] = bf0[28] + bf0[27];
1821   bf1[29] = bf0[29] + bf0[26];
1822   bf1[30] = bf0[30] + bf0[25];
1823   bf1[31] = bf0[31] + bf0[24];
1824   bf1[32] = bf0[32];
1825   bf1[33] = bf0[33];
1826   bf1[34] = bf0[34];
1827   bf1[35] = bf0[35];
1828   bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit[stage]);
1829   bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit[stage]);
1830   bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit[stage]);
1831   bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit[stage]);
1832   bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit[stage]);
1833   bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit[stage]);
1834   bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit[stage]);
1835   bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit[stage]);
1836   bf1[44] = bf0[44];
1837   bf1[45] = bf0[45];
1838   bf1[46] = bf0[46];
1839   bf1[47] = bf0[47];
1840   bf1[48] = bf0[48];
1841   bf1[49] = bf0[49];
1842   bf1[50] = bf0[50];
1843   bf1[51] = bf0[51];
1844   bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit[stage]);
1845   bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit[stage]);
1846   bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit[stage]);
1847   bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit[stage]);
1848   bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit[stage]);
1849   bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit[stage]);
1850   bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit[stage]);
1851   bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit[stage]);
1852   bf1[60] = bf0[60];
1853   bf1[61] = bf0[61];
1854   bf1[62] = bf0[62];
1855   bf1[63] = bf0[63];
1856   range_check(stage, input, bf1, size, stage_range[stage]);
1857 
1858   // stage 5
1859   stage++;
1860   cospi = cospi_arr(cos_bit[stage]);
1861   bf0 = step;
1862   bf1 = output;
1863   bf1[0] = bf0[0] + bf0[3];
1864   bf1[1] = bf0[1] + bf0[2];
1865   bf1[2] = -bf0[2] + bf0[1];
1866   bf1[3] = -bf0[3] + bf0[0];
1867   bf1[4] = bf0[4];
1868   bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
1869   bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]);
1870   bf1[7] = bf0[7];
1871   bf1[8] = bf0[8] + bf0[11];
1872   bf1[9] = bf0[9] + bf0[10];
1873   bf1[10] = -bf0[10] + bf0[9];
1874   bf1[11] = -bf0[11] + bf0[8];
1875   bf1[12] = -bf0[12] + bf0[15];
1876   bf1[13] = -bf0[13] + bf0[14];
1877   bf1[14] = bf0[14] + bf0[13];
1878   bf1[15] = bf0[15] + bf0[12];
1879   bf1[16] = bf0[16];
1880   bf1[17] = bf0[17];
1881   bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]);
1882   bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]);
1883   bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]);
1884   bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]);
1885   bf1[22] = bf0[22];
1886   bf1[23] = bf0[23];
1887   bf1[24] = bf0[24];
1888   bf1[25] = bf0[25];
1889   bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit[stage]);
1890   bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit[stage]);
1891   bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit[stage]);
1892   bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit[stage]);
1893   bf1[30] = bf0[30];
1894   bf1[31] = bf0[31];
1895   bf1[32] = bf0[32] + bf0[39];
1896   bf1[33] = bf0[33] + bf0[38];
1897   bf1[34] = bf0[34] + bf0[37];
1898   bf1[35] = bf0[35] + bf0[36];
1899   bf1[36] = -bf0[36] + bf0[35];
1900   bf1[37] = -bf0[37] + bf0[34];
1901   bf1[38] = -bf0[38] + bf0[33];
1902   bf1[39] = -bf0[39] + bf0[32];
1903   bf1[40] = -bf0[40] + bf0[47];
1904   bf1[41] = -bf0[41] + bf0[46];
1905   bf1[42] = -bf0[42] + bf0[45];
1906   bf1[43] = -bf0[43] + bf0[44];
1907   bf1[44] = bf0[44] + bf0[43];
1908   bf1[45] = bf0[45] + bf0[42];
1909   bf1[46] = bf0[46] + bf0[41];
1910   bf1[47] = bf0[47] + bf0[40];
1911   bf1[48] = bf0[48] + bf0[55];
1912   bf1[49] = bf0[49] + bf0[54];
1913   bf1[50] = bf0[50] + bf0[53];
1914   bf1[51] = bf0[51] + bf0[52];
1915   bf1[52] = -bf0[52] + bf0[51];
1916   bf1[53] = -bf0[53] + bf0[50];
1917   bf1[54] = -bf0[54] + bf0[49];
1918   bf1[55] = -bf0[55] + bf0[48];
1919   bf1[56] = -bf0[56] + bf0[63];
1920   bf1[57] = -bf0[57] + bf0[62];
1921   bf1[58] = -bf0[58] + bf0[61];
1922   bf1[59] = -bf0[59] + bf0[60];
1923   bf1[60] = bf0[60] + bf0[59];
1924   bf1[61] = bf0[61] + bf0[58];
1925   bf1[62] = bf0[62] + bf0[57];
1926   bf1[63] = bf0[63] + bf0[56];
1927   range_check(stage, input, bf1, size, stage_range[stage]);
1928 
1929   // stage 6
1930   stage++;
1931   cospi = cospi_arr(cos_bit[stage]);
1932   bf0 = output;
1933   bf1 = step;
1934   bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
1935   bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
1936   bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
1937   bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
1938   bf1[4] = bf0[4] + bf0[5];
1939   bf1[5] = -bf0[5] + bf0[4];
1940   bf1[6] = -bf0[6] + bf0[7];
1941   bf1[7] = bf0[7] + bf0[6];
1942   bf1[8] = bf0[8];
1943   bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
1944   bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
1945   bf1[11] = bf0[11];
1946   bf1[12] = bf0[12];
1947   bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit[stage]);
1948   bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit[stage]);
1949   bf1[15] = bf0[15];
1950   bf1[16] = bf0[16] + bf0[19];
1951   bf1[17] = bf0[17] + bf0[18];
1952   bf1[18] = -bf0[18] + bf0[17];
1953   bf1[19] = -bf0[19] + bf0[16];
1954   bf1[20] = -bf0[20] + bf0[23];
1955   bf1[21] = -bf0[21] + bf0[22];
1956   bf1[22] = bf0[22] + bf0[21];
1957   bf1[23] = bf0[23] + bf0[20];
1958   bf1[24] = bf0[24] + bf0[27];
1959   bf1[25] = bf0[25] + bf0[26];
1960   bf1[26] = -bf0[26] + bf0[25];
1961   bf1[27] = -bf0[27] + bf0[24];
1962   bf1[28] = -bf0[28] + bf0[31];
1963   bf1[29] = -bf0[29] + bf0[30];
1964   bf1[30] = bf0[30] + bf0[29];
1965   bf1[31] = bf0[31] + bf0[28];
1966   bf1[32] = bf0[32];
1967   bf1[33] = bf0[33];
1968   bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit[stage]);
1969   bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit[stage]);
1970   bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit[stage]);
1971   bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit[stage]);
1972   bf1[38] = bf0[38];
1973   bf1[39] = bf0[39];
1974   bf1[40] = bf0[40];
1975   bf1[41] = bf0[41];
1976   bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit[stage]);
1977   bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit[stage]);
1978   bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit[stage]);
1979   bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit[stage]);
1980   bf1[46] = bf0[46];
1981   bf1[47] = bf0[47];
1982   bf1[48] = bf0[48];
1983   bf1[49] = bf0[49];
1984   bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit[stage]);
1985   bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit[stage]);
1986   bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit[stage]);
1987   bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit[stage]);
1988   bf1[54] = bf0[54];
1989   bf1[55] = bf0[55];
1990   bf1[56] = bf0[56];
1991   bf1[57] = bf0[57];
1992   bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit[stage]);
1993   bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit[stage]);
1994   bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit[stage]);
1995   bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit[stage]);
1996   bf1[62] = bf0[62];
1997   bf1[63] = bf0[63];
1998   range_check(stage, input, bf1, size, stage_range[stage]);
1999 
2000   // stage 7
2001   stage++;
2002   cospi = cospi_arr(cos_bit[stage]);
2003   bf0 = step;
2004   bf1 = output;
2005   bf1[0] = bf0[0];
2006   bf1[1] = bf0[1];
2007   bf1[2] = bf0[2];
2008   bf1[3] = bf0[3];
2009   bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]);
2010   bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]);
2011   bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]);
2012   bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]);
2013   bf1[8] = bf0[8] + bf0[9];
2014   bf1[9] = -bf0[9] + bf0[8];
2015   bf1[10] = -bf0[10] + bf0[11];
2016   bf1[11] = bf0[11] + bf0[10];
2017   bf1[12] = bf0[12] + bf0[13];
2018   bf1[13] = -bf0[13] + bf0[12];
2019   bf1[14] = -bf0[14] + bf0[15];
2020   bf1[15] = bf0[15] + bf0[14];
2021   bf1[16] = bf0[16];
2022   bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit[stage]);
2023   bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]);
2024   bf1[19] = bf0[19];
2025   bf1[20] = bf0[20];
2026   bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit[stage]);
2027   bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]);
2028   bf1[23] = bf0[23];
2029   bf1[24] = bf0[24];
2030   bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit[stage]);
2031   bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit[stage]);
2032   bf1[27] = bf0[27];
2033   bf1[28] = bf0[28];
2034   bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit[stage]);
2035   bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit[stage]);
2036   bf1[31] = bf0[31];
2037   bf1[32] = bf0[32] + bf0[35];
2038   bf1[33] = bf0[33] + bf0[34];
2039   bf1[34] = -bf0[34] + bf0[33];
2040   bf1[35] = -bf0[35] + bf0[32];
2041   bf1[36] = -bf0[36] + bf0[39];
2042   bf1[37] = -bf0[37] + bf0[38];
2043   bf1[38] = bf0[38] + bf0[37];
2044   bf1[39] = bf0[39] + bf0[36];
2045   bf1[40] = bf0[40] + bf0[43];
2046   bf1[41] = bf0[41] + bf0[42];
2047   bf1[42] = -bf0[42] + bf0[41];
2048   bf1[43] = -bf0[43] + bf0[40];
2049   bf1[44] = -bf0[44] + bf0[47];
2050   bf1[45] = -bf0[45] + bf0[46];
2051   bf1[46] = bf0[46] + bf0[45];
2052   bf1[47] = bf0[47] + bf0[44];
2053   bf1[48] = bf0[48] + bf0[51];
2054   bf1[49] = bf0[49] + bf0[50];
2055   bf1[50] = -bf0[50] + bf0[49];
2056   bf1[51] = -bf0[51] + bf0[48];
2057   bf1[52] = -bf0[52] + bf0[55];
2058   bf1[53] = -bf0[53] + bf0[54];
2059   bf1[54] = bf0[54] + bf0[53];
2060   bf1[55] = bf0[55] + bf0[52];
2061   bf1[56] = bf0[56] + bf0[59];
2062   bf1[57] = bf0[57] + bf0[58];
2063   bf1[58] = -bf0[58] + bf0[57];
2064   bf1[59] = -bf0[59] + bf0[56];
2065   bf1[60] = -bf0[60] + bf0[63];
2066   bf1[61] = -bf0[61] + bf0[62];
2067   bf1[62] = bf0[62] + bf0[61];
2068   bf1[63] = bf0[63] + bf0[60];
2069   range_check(stage, input, bf1, size, stage_range[stage]);
2070 
2071   // stage 8
2072   stage++;
2073   cospi = cospi_arr(cos_bit[stage]);
2074   bf0 = output;
2075   bf1 = step;
2076   bf1[0] = bf0[0];
2077   bf1[1] = bf0[1];
2078   bf1[2] = bf0[2];
2079   bf1[3] = bf0[3];
2080   bf1[4] = bf0[4];
2081   bf1[5] = bf0[5];
2082   bf1[6] = bf0[6];
2083   bf1[7] = bf0[7];
2084   bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit[stage]);
2085   bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit[stage]);
2086   bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit[stage]);
2087   bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit[stage]);
2088   bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit[stage]);
2089   bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit[stage]);
2090   bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit[stage]);
2091   bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit[stage]);
2092   bf1[16] = bf0[16] + bf0[17];
2093   bf1[17] = -bf0[17] + bf0[16];
2094   bf1[18] = -bf0[18] + bf0[19];
2095   bf1[19] = bf0[19] + bf0[18];
2096   bf1[20] = bf0[20] + bf0[21];
2097   bf1[21] = -bf0[21] + bf0[20];
2098   bf1[22] = -bf0[22] + bf0[23];
2099   bf1[23] = bf0[23] + bf0[22];
2100   bf1[24] = bf0[24] + bf0[25];
2101   bf1[25] = -bf0[25] + bf0[24];
2102   bf1[26] = -bf0[26] + bf0[27];
2103   bf1[27] = bf0[27] + bf0[26];
2104   bf1[28] = bf0[28] + bf0[29];
2105   bf1[29] = -bf0[29] + bf0[28];
2106   bf1[30] = -bf0[30] + bf0[31];
2107   bf1[31] = bf0[31] + bf0[30];
2108   bf1[32] = bf0[32];
2109   bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit[stage]);
2110   bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit[stage]);
2111   bf1[35] = bf0[35];
2112   bf1[36] = bf0[36];
2113   bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit[stage]);
2114   bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit[stage]);
2115   bf1[39] = bf0[39];
2116   bf1[40] = bf0[40];
2117   bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit[stage]);
2118   bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit[stage]);
2119   bf1[43] = bf0[43];
2120   bf1[44] = bf0[44];
2121   bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit[stage]);
2122   bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit[stage]);
2123   bf1[47] = bf0[47];
2124   bf1[48] = bf0[48];
2125   bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit[stage]);
2126   bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit[stage]);
2127   bf1[51] = bf0[51];
2128   bf1[52] = bf0[52];
2129   bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit[stage]);
2130   bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit[stage]);
2131   bf1[55] = bf0[55];
2132   bf1[56] = bf0[56];
2133   bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit[stage]);
2134   bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit[stage]);
2135   bf1[59] = bf0[59];
2136   bf1[60] = bf0[60];
2137   bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit[stage]);
2138   bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit[stage]);
2139   bf1[63] = bf0[63];
2140   range_check(stage, input, bf1, size, stage_range[stage]);
2141 
2142   // stage 9
2143   stage++;
2144   cospi = cospi_arr(cos_bit[stage]);
2145   bf0 = step;
2146   bf1 = output;
2147   bf1[0] = bf0[0];
2148   bf1[1] = bf0[1];
2149   bf1[2] = bf0[2];
2150   bf1[3] = bf0[3];
2151   bf1[4] = bf0[4];
2152   bf1[5] = bf0[5];
2153   bf1[6] = bf0[6];
2154   bf1[7] = bf0[7];
2155   bf1[8] = bf0[8];
2156   bf1[9] = bf0[9];
2157   bf1[10] = bf0[10];
2158   bf1[11] = bf0[11];
2159   bf1[12] = bf0[12];
2160   bf1[13] = bf0[13];
2161   bf1[14] = bf0[14];
2162   bf1[15] = bf0[15];
2163   bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit[stage]);
2164   bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit[stage]);
2165   bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit[stage]);
2166   bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit[stage]);
2167   bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit[stage]);
2168   bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit[stage]);
2169   bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit[stage]);
2170   bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit[stage]);
2171   bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit[stage]);
2172   bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit[stage]);
2173   bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit[stage]);
2174   bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit[stage]);
2175   bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit[stage]);
2176   bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit[stage]);
2177   bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit[stage]);
2178   bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit[stage]);
2179   bf1[32] = bf0[32] + bf0[33];
2180   bf1[33] = -bf0[33] + bf0[32];
2181   bf1[34] = -bf0[34] + bf0[35];
2182   bf1[35] = bf0[35] + bf0[34];
2183   bf1[36] = bf0[36] + bf0[37];
2184   bf1[37] = -bf0[37] + bf0[36];
2185   bf1[38] = -bf0[38] + bf0[39];
2186   bf1[39] = bf0[39] + bf0[38];
2187   bf1[40] = bf0[40] + bf0[41];
2188   bf1[41] = -bf0[41] + bf0[40];
2189   bf1[42] = -bf0[42] + bf0[43];
2190   bf1[43] = bf0[43] + bf0[42];
2191   bf1[44] = bf0[44] + bf0[45];
2192   bf1[45] = -bf0[45] + bf0[44];
2193   bf1[46] = -bf0[46] + bf0[47];
2194   bf1[47] = bf0[47] + bf0[46];
2195   bf1[48] = bf0[48] + bf0[49];
2196   bf1[49] = -bf0[49] + bf0[48];
2197   bf1[50] = -bf0[50] + bf0[51];
2198   bf1[51] = bf0[51] + bf0[50];
2199   bf1[52] = bf0[52] + bf0[53];
2200   bf1[53] = -bf0[53] + bf0[52];
2201   bf1[54] = -bf0[54] + bf0[55];
2202   bf1[55] = bf0[55] + bf0[54];
2203   bf1[56] = bf0[56] + bf0[57];
2204   bf1[57] = -bf0[57] + bf0[56];
2205   bf1[58] = -bf0[58] + bf0[59];
2206   bf1[59] = bf0[59] + bf0[58];
2207   bf1[60] = bf0[60] + bf0[61];
2208   bf1[61] = -bf0[61] + bf0[60];
2209   bf1[62] = -bf0[62] + bf0[63];
2210   bf1[63] = bf0[63] + bf0[62];
2211   range_check(stage, input, bf1, size, stage_range[stage]);
2212 
2213   // stage 10
2214   stage++;
2215   cospi = cospi_arr(cos_bit[stage]);
2216   bf0 = output;
2217   bf1 = step;
2218   bf1[0] = bf0[0];
2219   bf1[1] = bf0[1];
2220   bf1[2] = bf0[2];
2221   bf1[3] = bf0[3];
2222   bf1[4] = bf0[4];
2223   bf1[5] = bf0[5];
2224   bf1[6] = bf0[6];
2225   bf1[7] = bf0[7];
2226   bf1[8] = bf0[8];
2227   bf1[9] = bf0[9];
2228   bf1[10] = bf0[10];
2229   bf1[11] = bf0[11];
2230   bf1[12] = bf0[12];
2231   bf1[13] = bf0[13];
2232   bf1[14] = bf0[14];
2233   bf1[15] = bf0[15];
2234   bf1[16] = bf0[16];
2235   bf1[17] = bf0[17];
2236   bf1[18] = bf0[18];
2237   bf1[19] = bf0[19];
2238   bf1[20] = bf0[20];
2239   bf1[21] = bf0[21];
2240   bf1[22] = bf0[22];
2241   bf1[23] = bf0[23];
2242   bf1[24] = bf0[24];
2243   bf1[25] = bf0[25];
2244   bf1[26] = bf0[26];
2245   bf1[27] = bf0[27];
2246   bf1[28] = bf0[28];
2247   bf1[29] = bf0[29];
2248   bf1[30] = bf0[30];
2249   bf1[31] = bf0[31];
2250   bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit[stage]);
2251   bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit[stage]);
2252   bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit[stage]);
2253   bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit[stage]);
2254   bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit[stage]);
2255   bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit[stage]);
2256   bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit[stage]);
2257   bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit[stage]);
2258   bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit[stage]);
2259   bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit[stage]);
2260   bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit[stage]);
2261   bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit[stage]);
2262   bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit[stage]);
2263   bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit[stage]);
2264   bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit[stage]);
2265   bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit[stage]);
2266   bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit[stage]);
2267   bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit[stage]);
2268   bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit[stage]);
2269   bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit[stage]);
2270   bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit[stage]);
2271   bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit[stage]);
2272   bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit[stage]);
2273   bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit[stage]);
2274   bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit[stage]);
2275   bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit[stage]);
2276   bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit[stage]);
2277   bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit[stage]);
2278   bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit[stage]);
2279   bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit[stage]);
2280   bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit[stage]);
2281   bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit[stage]);
2282   range_check(stage, input, bf1, size, stage_range[stage]);
2283 
2284   // stage 11
2285   stage++;
2286   cospi = cospi_arr(cos_bit[stage]);
2287   bf0 = step;
2288   bf1 = output;
2289   bf1[0] = bf0[0];
2290   bf1[1] = bf0[32];
2291   bf1[2] = bf0[16];
2292   bf1[3] = bf0[48];
2293   bf1[4] = bf0[8];
2294   bf1[5] = bf0[40];
2295   bf1[6] = bf0[24];
2296   bf1[7] = bf0[56];
2297   bf1[8] = bf0[4];
2298   bf1[9] = bf0[36];
2299   bf1[10] = bf0[20];
2300   bf1[11] = bf0[52];
2301   bf1[12] = bf0[12];
2302   bf1[13] = bf0[44];
2303   bf1[14] = bf0[28];
2304   bf1[15] = bf0[60];
2305   bf1[16] = bf0[2];
2306   bf1[17] = bf0[34];
2307   bf1[18] = bf0[18];
2308   bf1[19] = bf0[50];
2309   bf1[20] = bf0[10];
2310   bf1[21] = bf0[42];
2311   bf1[22] = bf0[26];
2312   bf1[23] = bf0[58];
2313   bf1[24] = bf0[6];
2314   bf1[25] = bf0[38];
2315   bf1[26] = bf0[22];
2316   bf1[27] = bf0[54];
2317   bf1[28] = bf0[14];
2318   bf1[29] = bf0[46];
2319   bf1[30] = bf0[30];
2320   bf1[31] = bf0[62];
2321   bf1[32] = bf0[1];
2322   bf1[33] = bf0[33];
2323   bf1[34] = bf0[17];
2324   bf1[35] = bf0[49];
2325   bf1[36] = bf0[9];
2326   bf1[37] = bf0[41];
2327   bf1[38] = bf0[25];
2328   bf1[39] = bf0[57];
2329   bf1[40] = bf0[5];
2330   bf1[41] = bf0[37];
2331   bf1[42] = bf0[21];
2332   bf1[43] = bf0[53];
2333   bf1[44] = bf0[13];
2334   bf1[45] = bf0[45];
2335   bf1[46] = bf0[29];
2336   bf1[47] = bf0[61];
2337   bf1[48] = bf0[3];
2338   bf1[49] = bf0[35];
2339   bf1[50] = bf0[19];
2340   bf1[51] = bf0[51];
2341   bf1[52] = bf0[11];
2342   bf1[53] = bf0[43];
2343   bf1[54] = bf0[27];
2344   bf1[55] = bf0[59];
2345   bf1[56] = bf0[7];
2346   bf1[57] = bf0[39];
2347   bf1[58] = bf0[23];
2348   bf1[59] = bf0[55];
2349   bf1[60] = bf0[15];
2350   bf1[61] = bf0[47];
2351   bf1[62] = bf0[31];
2352   bf1[63] = bf0[63];
2353   range_check(stage, input, bf1, size, stage_range[stage]);
2354 }
2355 #endif  // CONFIG_TX64X64
2356