1 /*
2 * Copyright(c) 2019 Intel Corporation
3 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
4 *
5 * This source code is subject to the terms of the BSD 2 Clause License and
6 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
7 * was not distributed with this source code in the LICENSE file, you can
8 * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
9 * Media Patent License 1.0 was not distributed with this source code in the
10 * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
11 */
12 
13 #include <stdlib.h>
14 #include "EbTransforms.h"
15 #include "aom_dsp_rtcd.h"
16 
/*
 * Table of pointers to the forward-transform "range_mult2" arrays, one slot per
 * TXFM_TYPES entry. The 14 initializers are listed in this order:
 *   fdct  4 / 8 / 16 / 32 / 64,
 *   fadst 4 / 8 / 16 / 32,
 *   fidtx 4 / 8 / 16 / 32 / 64.
 * NOTE(review): presumably indexed by the 1-D transform-type enum that defines
 * TXFM_TYPES -- confirm that the enum order matches the order of this list.
 */
static const int8_t *fwd_txfm_range_mult2_list[TXFM_TYPES] = {fdct4_range_mult2,
                                                              fdct8_range_mult2,
                                                              fdct16_range_mult2,
                                                              fdct32_range_mult2,
                                                              fdct64_range_mult2,
                                                              fadst4_range_mult2,
                                                              fadst8_range_mult2,
                                                              fadst16_range_mult2,
                                                              fadst32_range_mult2,
                                                              fidtx4_range_mult2,
                                                              fidtx8_range_mult2,
                                                              fidtx16_range_mult2,
                                                              fidtx32_range_mult2,
                                                              fidtx64_range_mult2};
31 
/*
 * Table of pointers to the per-block-size forward shift arrays, one slot per
 * TX_SIZES_ALL entry (19 initializers). Listed order: the square sizes 4x4 .. 64x64
 * first, then the rectangular sizes 4x8, 8x4, 8x16, 16x8, 16x32, 32x16, 32x64, 64x32,
 * 4x16, 16x4, 8x32, 32x8, 16x64, 64x16.
 * NOTE(review): presumably indexed by the transform-size enum that defines
 * TX_SIZES_ALL -- confirm that the enum order matches the order of this list.
 */
static const int8_t *fwd_txfm_shift_ls[TX_SIZES_ALL] = {
    fwd_shift_4x4,   fwd_shift_8x8,   fwd_shift_16x16, fwd_shift_32x32, fwd_shift_64x64,
    fwd_shift_4x8,   fwd_shift_8x4,   fwd_shift_8x16,  fwd_shift_16x8,  fwd_shift_16x32,
    fwd_shift_32x16, fwd_shift_32x64, fwd_shift_64x32, fwd_shift_4x16,  fwd_shift_16x4,
    fwd_shift_8x32,  fwd_shift_32x8,  fwd_shift_16x64, fwd_shift_64x16,
};
38 
39 /*****************************
40  * Defines
41  *****************************/
42 
// NOTE(review): BETA_P and BETA_N are not referenced in the portion of this file
// reviewed here; their meaning is not evident from this context -- confirm with the
// code that uses them before changing either value.
#define BETA_P 1
#define BETA_N 3

/********************************************
 * Constants
 ********************************************/

// ALPHA_<nnnn> constants: the numeric suffix of each name equals its value.
// NOTE(review): no use is visible in the reviewed portion of this file, so the
// unit/scale of these values is unconfirmed.
#define ALPHA_0000 0
#define ALPHA_0050 50

#define ALPHA_0100 100
#define ALPHA_0200 200
#define ALPHA_0300 300
#define ALPHA_0500 500
#define ALPHA_1000 1000
58 
svt_av1_gen_fwd_stage_range(int8_t * stage_range_col,int8_t * stage_range_row,const Txfm2dFlipCfg * cfg,int32_t bd)59 void svt_av1_gen_fwd_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
60                                  const Txfm2dFlipCfg *cfg, int32_t bd) {
61     // Take the shift from the larger dimension in the rectangular case.
62     const int8_t *shift = cfg->shift;
63     // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning
64     for (int32_t i = 0; i < cfg->stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i)
65         stage_range_col[i] = (int8_t)(cfg->stage_range_col[i] + shift[0] + bd + 1);
66     // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning
67     for (int32_t i = 0; i < cfg->stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i)
68         stage_range_row[i] = (int8_t)(cfg->stage_range_row[i] + shift[0] + shift[1] + bd + 1);
69 }
70 
/*
 * Range checking is compiled out: the macro expands to an empty expression statement
 * and none of its arguments are evaluated.
 */
#define range_check(stage, input, buf, size, bit) ((void)0)
74 
/*
 * 4-point forward transform (fdct4) computed as a three-stage network.
 *
 * input       : 4 values, read only during stage 1.
 * output      : 4 results; also used as working storage between stages.
 * cos_bit     : selects the table returned by cospi_arr() and is forwarded to half_btf().
 * stage_range : not used.
 *
 * Stores to output are issued in index order while input is still being read.
 * NOTE(review): callers presumably pass non-overlapping input/output -- confirm.
 */
void svt_av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                       const int8_t *stage_range) {
    (void)stage_range;
    int32_t scratch[4];

    // Stage 1 (input -> output): mirrored sums, then mirrored differences.
    output[0] = input[0] + input[3];
    output[1] = input[1] + input[2];
    output[2] = input[1] - input[2];
    output[3] = input[0] - input[3];

    // Stage 2 (output -> scratch): weighted pair combinations via half_btf.
    const int32_t *const weights = cospi_arr(cos_bit);

    scratch[0] = half_btf(weights[32], output[0], weights[32], output[1], cos_bit);
    scratch[1] = half_btf(-weights[32], output[1], weights[32], output[0], cos_bit);
    scratch[2] = half_btf(weights[48], output[2], weights[16], output[3], cos_bit);
    scratch[3] = half_btf(weights[48], output[3], -weights[16], output[2], cos_bit);

    // Stage 3 (scratch -> output): final reordering, lanes 1 and 2 swap.
    output[0] = scratch[0];
    output[1] = scratch[2];
    output[2] = scratch[1];
    output[3] = scratch[3];
}
109 
/*
 * 8-point forward transform (fdct8) computed as a five-stage network that ping-pongs
 * between the caller's output buffer and a local scratch array.
 *
 * input       : 8 values, read only during stage 1.
 * output      : 8 results; also used as working storage between stages.
 * cos_bit     : selects the table returned by cospi_arr() and is forwarded to half_btf().
 * stage_range : not used.
 *
 * Stores to output are issued in index order while input is still being read.
 * NOTE(review): callers presumably pass non-overlapping input/output -- confirm.
 */
void svt_av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                       const int8_t *stage_range) {
    (void)stage_range;
    const int32_t *weights;
    int32_t        scratch[8];

    // Stage 1 (input -> output): mirrored sums in 0-3, mirrored differences in 4-7.
    output[0] = input[0] + input[7];
    output[1] = input[1] + input[6];
    output[2] = input[2] + input[5];
    output[3] = input[3] + input[4];
    output[4] = input[3] - input[4];
    output[5] = input[2] - input[5];
    output[6] = input[1] - input[6];
    output[7] = input[0] - input[7];

    // Stage 2 (output -> scratch): the table pointer is looked up at the start of every
    // stage that uses it.
    weights    = cospi_arr(cos_bit);
    scratch[0] = output[0] + output[3];
    scratch[1] = output[1] + output[2];
    scratch[2] = output[1] - output[2];
    scratch[3] = output[0] - output[3];
    scratch[4] = output[4];
    scratch[5] = half_btf(-weights[32], output[5], weights[32], output[6], cos_bit);
    scratch[6] = half_btf(weights[32], output[6], weights[32], output[5], cos_bit);
    scratch[7] = output[7];

    // Stage 3 (scratch -> output)
    weights   = cospi_arr(cos_bit);
    output[0] = half_btf(weights[32], scratch[0], weights[32], scratch[1], cos_bit);
    output[1] = half_btf(-weights[32], scratch[1], weights[32], scratch[0], cos_bit);
    output[2] = half_btf(weights[48], scratch[2], weights[16], scratch[3], cos_bit);
    output[3] = half_btf(weights[48], scratch[3], -weights[16], scratch[2], cos_bit);
    output[4] = scratch[4] + scratch[5];
    output[5] = scratch[4] - scratch[5];
    output[6] = scratch[7] - scratch[6];
    output[7] = scratch[7] + scratch[6];

    // Stage 4 (output -> scratch): lanes 0-3 pass through, lanes 4-7 are combined.
    weights    = cospi_arr(cos_bit);
    scratch[0] = output[0];
    scratch[1] = output[1];
    scratch[2] = output[2];
    scratch[3] = output[3];
    scratch[4] = half_btf(weights[56], output[4], weights[8], output[7], cos_bit);
    scratch[5] = half_btf(weights[24], output[5], weights[40], output[6], cos_bit);
    scratch[6] = half_btf(weights[24], output[6], -weights[40], output[5], cos_bit);
    scratch[7] = half_btf(weights[56], output[7], -weights[8], output[4], cos_bit);

    // Stage 5 (scratch -> output): final reordering of the lanes.
    output[0] = scratch[0];
    output[1] = scratch[4];
    output[2] = scratch[2];
    output[3] = scratch[6];
    output[4] = scratch[1];
    output[5] = scratch[5];
    output[6] = scratch[3];
    output[7] = scratch[7];
}
182 
/*
 * 16-point forward transform (fdct16) computed as a seven-stage network that
 * ping-pongs between the caller's output buffer and a local scratch array.
 *
 * input       : 16 values, read only during stage 1.
 * output      : 16 results; also used as working storage between stages.
 * cos_bit     : selects the table returned by cospi_arr() and is forwarded to half_btf().
 * stage_range : not used.
 *
 * Stores to output are issued in index order while input is still being read.
 * NOTE(review): callers presumably pass non-overlapping input/output -- confirm.
 */
void svt_av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                        const int8_t *stage_range) {
    (void)stage_range;
    const int32_t *weights;
    int32_t        scratch[16];

    // Stage 1 (input -> output): mirrored sums in 0-7, mirrored differences in 8-15.
    output[0]  = input[0] + input[15];
    output[1]  = input[1] + input[14];
    output[2]  = input[2] + input[13];
    output[3]  = input[3] + input[12];
    output[4]  = input[4] + input[11];
    output[5]  = input[5] + input[10];
    output[6]  = input[6] + input[9];
    output[7]  = input[7] + input[8];
    output[8]  = input[7] - input[8];
    output[9]  = input[6] - input[9];
    output[10] = input[5] - input[10];
    output[11] = input[4] - input[11];
    output[12] = input[3] - input[12];
    output[13] = input[2] - input[13];
    output[14] = input[1] - input[14];
    output[15] = input[0] - input[15];

    // Stage 2 (output -> scratch): the table pointer is looked up at the start of every
    // stage that uses it.
    weights     = cospi_arr(cos_bit);
    scratch[0]  = output[0] + output[7];
    scratch[1]  = output[1] + output[6];
    scratch[2]  = output[2] + output[5];
    scratch[3]  = output[3] + output[4];
    scratch[4]  = output[3] - output[4];
    scratch[5]  = output[2] - output[5];
    scratch[6]  = output[1] - output[6];
    scratch[7]  = output[0] - output[7];
    scratch[8]  = output[8];
    scratch[9]  = output[9];
    scratch[10] = half_btf(-weights[32], output[10], weights[32], output[13], cos_bit);
    scratch[11] = half_btf(-weights[32], output[11], weights[32], output[12], cos_bit);
    scratch[12] = half_btf(weights[32], output[12], weights[32], output[11], cos_bit);
    scratch[13] = half_btf(weights[32], output[13], weights[32], output[10], cos_bit);
    scratch[14] = output[14];
    scratch[15] = output[15];

    // Stage 3 (scratch -> output)
    weights    = cospi_arr(cos_bit);
    output[0]  = scratch[0] + scratch[3];
    output[1]  = scratch[1] + scratch[2];
    output[2]  = scratch[1] - scratch[2];
    output[3]  = scratch[0] - scratch[3];
    output[4]  = scratch[4];
    output[5]  = half_btf(-weights[32], scratch[5], weights[32], scratch[6], cos_bit);
    output[6]  = half_btf(weights[32], scratch[6], weights[32], scratch[5], cos_bit);
    output[7]  = scratch[7];
    output[8]  = scratch[8] + scratch[11];
    output[9]  = scratch[9] + scratch[10];
    output[10] = scratch[9] - scratch[10];
    output[11] = scratch[8] - scratch[11];
    output[12] = scratch[15] - scratch[12];
    output[13] = scratch[14] - scratch[13];
    output[14] = scratch[14] + scratch[13];
    output[15] = scratch[15] + scratch[12];

    // Stage 4 (output -> scratch)
    weights     = cospi_arr(cos_bit);
    scratch[0]  = half_btf(weights[32], output[0], weights[32], output[1], cos_bit);
    scratch[1]  = half_btf(-weights[32], output[1], weights[32], output[0], cos_bit);
    scratch[2]  = half_btf(weights[48], output[2], weights[16], output[3], cos_bit);
    scratch[3]  = half_btf(weights[48], output[3], -weights[16], output[2], cos_bit);
    scratch[4]  = output[4] + output[5];
    scratch[5]  = output[4] - output[5];
    scratch[6]  = output[7] - output[6];
    scratch[7]  = output[7] + output[6];
    scratch[8]  = output[8];
    scratch[9]  = half_btf(-weights[16], output[9], weights[48], output[14], cos_bit);
    scratch[10] = half_btf(-weights[48], output[10], -weights[16], output[13], cos_bit);
    scratch[11] = output[11];
    scratch[12] = output[12];
    scratch[13] = half_btf(weights[48], output[13], -weights[16], output[10], cos_bit);
    scratch[14] = half_btf(weights[16], output[14], weights[48], output[9], cos_bit);
    scratch[15] = output[15];

    // Stage 5 (scratch -> output): lanes 0-3 pass through.
    weights    = cospi_arr(cos_bit);
    output[0]  = scratch[0];
    output[1]  = scratch[1];
    output[2]  = scratch[2];
    output[3]  = scratch[3];
    output[4]  = half_btf(weights[56], scratch[4], weights[8], scratch[7], cos_bit);
    output[5]  = half_btf(weights[24], scratch[5], weights[40], scratch[6], cos_bit);
    output[6]  = half_btf(weights[24], scratch[6], -weights[40], scratch[5], cos_bit);
    output[7]  = half_btf(weights[56], scratch[7], -weights[8], scratch[4], cos_bit);
    output[8]  = scratch[8] + scratch[9];
    output[9]  = scratch[8] - scratch[9];
    output[10] = scratch[11] - scratch[10];
    output[11] = scratch[11] + scratch[10];
    output[12] = scratch[12] + scratch[13];
    output[13] = scratch[12] - scratch[13];
    output[14] = scratch[15] - scratch[14];
    output[15] = scratch[15] + scratch[14];

    // Stage 6 (output -> scratch): lanes 0-7 pass through, lanes 8-15 are combined.
    weights     = cospi_arr(cos_bit);
    scratch[0]  = output[0];
    scratch[1]  = output[1];
    scratch[2]  = output[2];
    scratch[3]  = output[3];
    scratch[4]  = output[4];
    scratch[5]  = output[5];
    scratch[6]  = output[6];
    scratch[7]  = output[7];
    scratch[8]  = half_btf(weights[60], output[8], weights[4], output[15], cos_bit);
    scratch[9]  = half_btf(weights[28], output[9], weights[36], output[14], cos_bit);
    scratch[10] = half_btf(weights[44], output[10], weights[20], output[13], cos_bit);
    scratch[11] = half_btf(weights[12], output[11], weights[52], output[12], cos_bit);
    scratch[12] = half_btf(weights[12], output[12], -weights[52], output[11], cos_bit);
    scratch[13] = half_btf(weights[44], output[13], -weights[20], output[10], cos_bit);
    scratch[14] = half_btf(weights[28], output[14], -weights[36], output[9], cos_bit);
    scratch[15] = half_btf(weights[60], output[15], -weights[4], output[8], cos_bit);

    // Stage 7 (scratch -> output): final reordering of the lanes.
    output[0]  = scratch[0];
    output[1]  = scratch[8];
    output[2]  = scratch[4];
    output[3]  = scratch[12];
    output[4]  = scratch[2];
    output[5]  = scratch[10];
    output[6]  = scratch[6];
    output[7]  = scratch[14];
    output[8]  = scratch[1];
    output[9]  = scratch[9];
    output[10] = scratch[5];
    output[11] = scratch[13];
    output[12] = scratch[3];
    output[13] = scratch[11];
    output[14] = scratch[7];
    output[15] = scratch[15];
}
337 
/*
 * 32-point forward transform (fdct32) computed as a nine-stage network.
 *
 * input       : 32 values, read only during stage 1.
 * output      : 32 results; also used as working storage between stages.
 * cos_bit     : selects the table returned by cospi_arr() and is forwarded to half_btf().
 * stage_range : not used.
 *
 * Each stage reads through bf0 and writes through bf1; the two pointers alternate
 * between the caller's output buffer and the local step[] array, and the last stage
 * writes to output. cospi is re-assigned from cospi_arr(cos_bit) at the start of
 * stages 2-8; cos_bit is never modified in this function.
 *
 * Stage 1 stores to output in index order while input is still being read.
 * NOTE(review): callers presumably pass non-overlapping input/output -- confirm.
 */
void svt_av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                        const int8_t *stage_range) {
    (void)stage_range;
    const int32_t *cospi;

    int32_t *bf0, *bf1;
    int32_t  step[32];

    // stage 0: nothing to do

    // stage 1 (input -> output): mirrored sums in 0-15, mirrored differences in 16-31
    bf1     = output;
    bf1[0]  = input[0] + input[31];
    bf1[1]  = input[1] + input[30];
    bf1[2]  = input[2] + input[29];
    bf1[3]  = input[3] + input[28];
    bf1[4]  = input[4] + input[27];
    bf1[5]  = input[5] + input[26];
    bf1[6]  = input[6] + input[25];
    bf1[7]  = input[7] + input[24];
    bf1[8]  = input[8] + input[23];
    bf1[9]  = input[9] + input[22];
    bf1[10] = input[10] + input[21];
    bf1[11] = input[11] + input[20];
    bf1[12] = input[12] + input[19];
    bf1[13] = input[13] + input[18];
    bf1[14] = input[14] + input[17];
    bf1[15] = input[15] + input[16];
    bf1[16] = -input[16] + input[15];
    bf1[17] = -input[17] + input[14];
    bf1[18] = -input[18] + input[13];
    bf1[19] = -input[19] + input[12];
    bf1[20] = -input[20] + input[11];
    bf1[21] = -input[21] + input[10];
    bf1[22] = -input[22] + input[9];
    bf1[23] = -input[23] + input[8];
    bf1[24] = -input[24] + input[7];
    bf1[25] = -input[25] + input[6];
    bf1[26] = -input[26] + input[5];
    bf1[27] = -input[27] + input[4];
    bf1[28] = -input[28] + input[3];
    bf1[29] = -input[29] + input[2];
    bf1[30] = -input[30] + input[1];
    bf1[31] = -input[31] + input[0];

    // stage 2 (output -> step): 0-15 mirrored sums/differences; 16-19 and 28-31 pass
    // through; 20-27 are half_btf combinations using cospi[32]
    cospi   = cospi_arr(cos_bit);
    bf0     = output;
    bf1     = step;
    bf1[0]  = bf0[0] + bf0[15];
    bf1[1]  = bf0[1] + bf0[14];
    bf1[2]  = bf0[2] + bf0[13];
    bf1[3]  = bf0[3] + bf0[12];
    bf1[4]  = bf0[4] + bf0[11];
    bf1[5]  = bf0[5] + bf0[10];
    bf1[6]  = bf0[6] + bf0[9];
    bf1[7]  = bf0[7] + bf0[8];
    bf1[8]  = -bf0[8] + bf0[7];
    bf1[9]  = -bf0[9] + bf0[6];
    bf1[10] = -bf0[10] + bf0[5];
    bf1[11] = -bf0[11] + bf0[4];
    bf1[12] = -bf0[12] + bf0[3];
    bf1[13] = -bf0[13] + bf0[2];
    bf1[14] = -bf0[14] + bf0[1];
    bf1[15] = -bf0[15] + bf0[0];
    bf1[16] = bf0[16];
    bf1[17] = bf0[17];
    bf1[18] = bf0[18];
    bf1[19] = bf0[19];
    bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
    bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
    bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
    bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
    bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
    bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
    bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
    bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
    bf1[28] = bf0[28];
    bf1[29] = bf0[29];
    bf1[30] = bf0[30];
    bf1[31] = bf0[31];

    // stage 3 (step -> output)
    cospi   = cospi_arr(cos_bit);
    bf0     = step;
    bf1     = output;
    bf1[0]  = bf0[0] + bf0[7];
    bf1[1]  = bf0[1] + bf0[6];
    bf1[2]  = bf0[2] + bf0[5];
    bf1[3]  = bf0[3] + bf0[4];
    bf1[4]  = -bf0[4] + bf0[3];
    bf1[5]  = -bf0[5] + bf0[2];
    bf1[6]  = -bf0[6] + bf0[1];
    bf1[7]  = -bf0[7] + bf0[0];
    bf1[8]  = bf0[8];
    bf1[9]  = bf0[9];
    bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
    bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
    bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
    bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
    bf1[14] = bf0[14];
    bf1[15] = bf0[15];
    bf1[16] = bf0[16] + bf0[23];
    bf1[17] = bf0[17] + bf0[22];
    bf1[18] = bf0[18] + bf0[21];
    bf1[19] = bf0[19] + bf0[20];
    bf1[20] = -bf0[20] + bf0[19];
    bf1[21] = -bf0[21] + bf0[18];
    bf1[22] = -bf0[22] + bf0[17];
    bf1[23] = -bf0[23] + bf0[16];
    bf1[24] = -bf0[24] + bf0[31];
    bf1[25] = -bf0[25] + bf0[30];
    bf1[26] = -bf0[26] + bf0[29];
    bf1[27] = -bf0[27] + bf0[28];
    bf1[28] = bf0[28] + bf0[27];
    bf1[29] = bf0[29] + bf0[26];
    bf1[30] = bf0[30] + bf0[25];
    bf1[31] = bf0[31] + bf0[24];

    // stage 4 (output -> step)
    cospi   = cospi_arr(cos_bit);
    bf0     = output;
    bf1     = step;
    bf1[0]  = bf0[0] + bf0[3];
    bf1[1]  = bf0[1] + bf0[2];
    bf1[2]  = -bf0[2] + bf0[1];
    bf1[3]  = -bf0[3] + bf0[0];
    bf1[4]  = bf0[4];
    bf1[5]  = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
    bf1[6]  = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
    bf1[7]  = bf0[7];
    bf1[8]  = bf0[8] + bf0[11];
    bf1[9]  = bf0[9] + bf0[10];
    bf1[10] = -bf0[10] + bf0[9];
    bf1[11] = -bf0[11] + bf0[8];
    bf1[12] = -bf0[12] + bf0[15];
    bf1[13] = -bf0[13] + bf0[14];
    bf1[14] = bf0[14] + bf0[13];
    bf1[15] = bf0[15] + bf0[12];
    bf1[16] = bf0[16];
    bf1[17] = bf0[17];
    bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
    bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
    bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
    bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
    bf1[22] = bf0[22];
    bf1[23] = bf0[23];
    bf1[24] = bf0[24];
    bf1[25] = bf0[25];
    bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
    bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
    bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
    bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
    bf1[30] = bf0[30];
    bf1[31] = bf0[31];

    // stage 5 (step -> output)
    cospi   = cospi_arr(cos_bit);
    bf0     = step;
    bf1     = output;
    bf1[0]  = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
    bf1[1]  = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
    bf1[2]  = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
    bf1[3]  = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
    bf1[4]  = bf0[4] + bf0[5];
    bf1[5]  = -bf0[5] + bf0[4];
    bf1[6]  = -bf0[6] + bf0[7];
    bf1[7]  = bf0[7] + bf0[6];
    bf1[8]  = bf0[8];
    bf1[9]  = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
    bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
    bf1[11] = bf0[11];
    bf1[12] = bf0[12];
    bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
    bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
    bf1[15] = bf0[15];
    bf1[16] = bf0[16] + bf0[19];
    bf1[17] = bf0[17] + bf0[18];
    bf1[18] = -bf0[18] + bf0[17];
    bf1[19] = -bf0[19] + bf0[16];
    bf1[20] = -bf0[20] + bf0[23];
    bf1[21] = -bf0[21] + bf0[22];
    bf1[22] = bf0[22] + bf0[21];
    bf1[23] = bf0[23] + bf0[20];
    bf1[24] = bf0[24] + bf0[27];
    bf1[25] = bf0[25] + bf0[26];
    bf1[26] = -bf0[26] + bf0[25];
    bf1[27] = -bf0[27] + bf0[24];
    bf1[28] = -bf0[28] + bf0[31];
    bf1[29] = -bf0[29] + bf0[30];
    bf1[30] = bf0[30] + bf0[29];
    bf1[31] = bf0[31] + bf0[28];

    // stage 6 (output -> step): lanes 0-3 pass through
    cospi   = cospi_arr(cos_bit);
    bf0     = output;
    bf1     = step;
    bf1[0]  = bf0[0];
    bf1[1]  = bf0[1];
    bf1[2]  = bf0[2];
    bf1[3]  = bf0[3];
    bf1[4]  = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
    bf1[5]  = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
    bf1[6]  = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
    bf1[7]  = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
    bf1[8]  = bf0[8] + bf0[9];
    bf1[9]  = -bf0[9] + bf0[8];
    bf1[10] = -bf0[10] + bf0[11];
    bf1[11] = bf0[11] + bf0[10];
    bf1[12] = bf0[12] + bf0[13];
    bf1[13] = -bf0[13] + bf0[12];
    bf1[14] = -bf0[14] + bf0[15];
    bf1[15] = bf0[15] + bf0[14];
    bf1[16] = bf0[16];
    bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
    bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
    bf1[19] = bf0[19];
    bf1[20] = bf0[20];
    bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
    bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
    bf1[23] = bf0[23];
    bf1[24] = bf0[24];
    bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
    bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
    bf1[27] = bf0[27];
    bf1[28] = bf0[28];
    bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
    bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
    bf1[31] = bf0[31];

    // stage 7 (step -> output): lanes 0-7 pass through
    cospi   = cospi_arr(cos_bit);
    bf0     = step;
    bf1     = output;
    bf1[0]  = bf0[0];
    bf1[1]  = bf0[1];
    bf1[2]  = bf0[2];
    bf1[3]  = bf0[3];
    bf1[4]  = bf0[4];
    bf1[5]  = bf0[5];
    bf1[6]  = bf0[6];
    bf1[7]  = bf0[7];
    bf1[8]  = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
    bf1[9]  = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
    bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
    bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
    bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
    bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
    bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
    bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
    bf1[16] = bf0[16] + bf0[17];
    bf1[17] = -bf0[17] + bf0[16];
    bf1[18] = -bf0[18] + bf0[19];
    bf1[19] = bf0[19] + bf0[18];
    bf1[20] = bf0[20] + bf0[21];
    bf1[21] = -bf0[21] + bf0[20];
    bf1[22] = -bf0[22] + bf0[23];
    bf1[23] = bf0[23] + bf0[22];
    bf1[24] = bf0[24] + bf0[25];
    bf1[25] = -bf0[25] + bf0[24];
    bf1[26] = -bf0[26] + bf0[27];
    bf1[27] = bf0[27] + bf0[26];
    bf1[28] = bf0[28] + bf0[29];
    bf1[29] = -bf0[29] + bf0[28];
    bf1[30] = -bf0[30] + bf0[31];
    bf1[31] = bf0[31] + bf0[30];

    // stage 8 (output -> step): lanes 0-15 pass through, lanes 16-31 are combined
    cospi   = cospi_arr(cos_bit);
    bf0     = output;
    bf1     = step;
    bf1[0]  = bf0[0];
    bf1[1]  = bf0[1];
    bf1[2]  = bf0[2];
    bf1[3]  = bf0[3];
    bf1[4]  = bf0[4];
    bf1[5]  = bf0[5];
    bf1[6]  = bf0[6];
    bf1[7]  = bf0[7];
    bf1[8]  = bf0[8];
    bf1[9]  = bf0[9];
    bf1[10] = bf0[10];
    bf1[11] = bf0[11];
    bf1[12] = bf0[12];
    bf1[13] = bf0[13];
    bf1[14] = bf0[14];
    bf1[15] = bf0[15];
    bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
    bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit);
    bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
    bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit);
    bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
    bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit);
    bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
    bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit);
    bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
    bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit);
    bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
    bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit);
    bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
    bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
    bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
    bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);

    // stage 9 (step -> output): final reordering of the lanes, no arithmetic
    bf0     = step;
    bf1     = output;
    bf1[0]  = bf0[0];
    bf1[1]  = bf0[16];
    bf1[2]  = bf0[8];
    bf1[3]  = bf0[24];
    bf1[4]  = bf0[4];
    bf1[5]  = bf0[20];
    bf1[6]  = bf0[12];
    bf1[7]  = bf0[28];
    bf1[8]  = bf0[2];
    bf1[9]  = bf0[18];
    bf1[10] = bf0[10];
    bf1[11] = bf0[26];
    bf1[12] = bf0[6];
    bf1[13] = bf0[22];
    bf1[14] = bf0[14];
    bf1[15] = bf0[30];
    bf1[16] = bf0[1];
    bf1[17] = bf0[17];
    bf1[18] = bf0[9];
    bf1[19] = bf0[25];
    bf1[20] = bf0[5];
    bf1[21] = bf0[21];
    bf1[22] = bf0[13];
    bf1[23] = bf0[29];
    bf1[24] = bf0[3];
    bf1[25] = bf0[19];
    bf1[26] = bf0[11];
    bf1[27] = bf0[27];
    bf1[28] = bf0[7];
    bf1[29] = bf0[23];
    bf1[30] = bf0[15];
    bf1[31] = bf0[31];
}
svt_av1_fdct64_new(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)678 void svt_av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
679                         const int8_t *stage_range) {
680     (void)stage_range;
681     const int32_t *cospi;
682 
683     int32_t *bf0, *bf1;
684     int32_t  step[64];
685 
686     // stage 0;
687 
688     // stage 1;
689     bf1     = output;
690     bf1[0]  = input[0] + input[63];
691     bf1[1]  = input[1] + input[62];
692     bf1[2]  = input[2] + input[61];
693     bf1[3]  = input[3] + input[60];
694     bf1[4]  = input[4] + input[59];
695     bf1[5]  = input[5] + input[58];
696     bf1[6]  = input[6] + input[57];
697     bf1[7]  = input[7] + input[56];
698     bf1[8]  = input[8] + input[55];
699     bf1[9]  = input[9] + input[54];
700     bf1[10] = input[10] + input[53];
701     bf1[11] = input[11] + input[52];
702     bf1[12] = input[12] + input[51];
703     bf1[13] = input[13] + input[50];
704     bf1[14] = input[14] + input[49];
705     bf1[15] = input[15] + input[48];
706     bf1[16] = input[16] + input[47];
707     bf1[17] = input[17] + input[46];
708     bf1[18] = input[18] + input[45];
709     bf1[19] = input[19] + input[44];
710     bf1[20] = input[20] + input[43];
711     bf1[21] = input[21] + input[42];
712     bf1[22] = input[22] + input[41];
713     bf1[23] = input[23] + input[40];
714     bf1[24] = input[24] + input[39];
715     bf1[25] = input[25] + input[38];
716     bf1[26] = input[26] + input[37];
717     bf1[27] = input[27] + input[36];
718     bf1[28] = input[28] + input[35];
719     bf1[29] = input[29] + input[34];
720     bf1[30] = input[30] + input[33];
721     bf1[31] = input[31] + input[32];
722     bf1[32] = -input[32] + input[31];
723     bf1[33] = -input[33] + input[30];
724     bf1[34] = -input[34] + input[29];
725     bf1[35] = -input[35] + input[28];
726     bf1[36] = -input[36] + input[27];
727     bf1[37] = -input[37] + input[26];
728     bf1[38] = -input[38] + input[25];
729     bf1[39] = -input[39] + input[24];
730     bf1[40] = -input[40] + input[23];
731     bf1[41] = -input[41] + input[22];
732     bf1[42] = -input[42] + input[21];
733     bf1[43] = -input[43] + input[20];
734     bf1[44] = -input[44] + input[19];
735     bf1[45] = -input[45] + input[18];
736     bf1[46] = -input[46] + input[17];
737     bf1[47] = -input[47] + input[16];
738     bf1[48] = -input[48] + input[15];
739     bf1[49] = -input[49] + input[14];
740     bf1[50] = -input[50] + input[13];
741     bf1[51] = -input[51] + input[12];
742     bf1[52] = -input[52] + input[11];
743     bf1[53] = -input[53] + input[10];
744     bf1[54] = -input[54] + input[9];
745     bf1[55] = -input[55] + input[8];
746     bf1[56] = -input[56] + input[7];
747     bf1[57] = -input[57] + input[6];
748     bf1[58] = -input[58] + input[5];
749     bf1[59] = -input[59] + input[4];
750     bf1[60] = -input[60] + input[3];
751     bf1[61] = -input[61] + input[2];
752     bf1[62] = -input[62] + input[1];
753     bf1[63] = -input[63] + input[0];
754 
755     // stage 2
756     cospi   = cospi_arr(cos_bit);
757     bf0     = output;
758     bf1     = step;
759     bf1[0]  = bf0[0] + bf0[31];
760     bf1[1]  = bf0[1] + bf0[30];
761     bf1[2]  = bf0[2] + bf0[29];
762     bf1[3]  = bf0[3] + bf0[28];
763     bf1[4]  = bf0[4] + bf0[27];
764     bf1[5]  = bf0[5] + bf0[26];
765     bf1[6]  = bf0[6] + bf0[25];
766     bf1[7]  = bf0[7] + bf0[24];
767     bf1[8]  = bf0[8] + bf0[23];
768     bf1[9]  = bf0[9] + bf0[22];
769     bf1[10] = bf0[10] + bf0[21];
770     bf1[11] = bf0[11] + bf0[20];
771     bf1[12] = bf0[12] + bf0[19];
772     bf1[13] = bf0[13] + bf0[18];
773     bf1[14] = bf0[14] + bf0[17];
774     bf1[15] = bf0[15] + bf0[16];
775     bf1[16] = -bf0[16] + bf0[15];
776     bf1[17] = -bf0[17] + bf0[14];
777     bf1[18] = -bf0[18] + bf0[13];
778     bf1[19] = -bf0[19] + bf0[12];
779     bf1[20] = -bf0[20] + bf0[11];
780     bf1[21] = -bf0[21] + bf0[10];
781     bf1[22] = -bf0[22] + bf0[9];
782     bf1[23] = -bf0[23] + bf0[8];
783     bf1[24] = -bf0[24] + bf0[7];
784     bf1[25] = -bf0[25] + bf0[6];
785     bf1[26] = -bf0[26] + bf0[5];
786     bf1[27] = -bf0[27] + bf0[4];
787     bf1[28] = -bf0[28] + bf0[3];
788     bf1[29] = -bf0[29] + bf0[2];
789     bf1[30] = -bf0[30] + bf0[1];
790     bf1[31] = -bf0[31] + bf0[0];
791     bf1[32] = bf0[32];
792     bf1[33] = bf0[33];
793     bf1[34] = bf0[34];
794     bf1[35] = bf0[35];
795     bf1[36] = bf0[36];
796     bf1[37] = bf0[37];
797     bf1[38] = bf0[38];
798     bf1[39] = bf0[39];
799     bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
800     bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
801     bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
802     bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
803     bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
804     bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
805     bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
806     bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
807     bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit);
808     bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit);
809     bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit);
810     bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit);
811     bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit);
812     bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit);
813     bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit);
814     bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit);
815     bf1[56] = bf0[56];
816     bf1[57] = bf0[57];
817     bf1[58] = bf0[58];
818     bf1[59] = bf0[59];
819     bf1[60] = bf0[60];
820     bf1[61] = bf0[61];
821     bf1[62] = bf0[62];
822     bf1[63] = bf0[63];
823 
824     // stage 3
825     cospi   = cospi_arr(cos_bit);
826     bf0     = step;
827     bf1     = output;
828     bf1[0]  = bf0[0] + bf0[15];
829     bf1[1]  = bf0[1] + bf0[14];
830     bf1[2]  = bf0[2] + bf0[13];
831     bf1[3]  = bf0[3] + bf0[12];
832     bf1[4]  = bf0[4] + bf0[11];
833     bf1[5]  = bf0[5] + bf0[10];
834     bf1[6]  = bf0[6] + bf0[9];
835     bf1[7]  = bf0[7] + bf0[8];
836     bf1[8]  = -bf0[8] + bf0[7];
837     bf1[9]  = -bf0[9] + bf0[6];
838     bf1[10] = -bf0[10] + bf0[5];
839     bf1[11] = -bf0[11] + bf0[4];
840     bf1[12] = -bf0[12] + bf0[3];
841     bf1[13] = -bf0[13] + bf0[2];
842     bf1[14] = -bf0[14] + bf0[1];
843     bf1[15] = -bf0[15] + bf0[0];
844     bf1[16] = bf0[16];
845     bf1[17] = bf0[17];
846     bf1[18] = bf0[18];
847     bf1[19] = bf0[19];
848     bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
849     bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
850     bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
851     bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
852     bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
853     bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
854     bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
855     bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
856     bf1[28] = bf0[28];
857     bf1[29] = bf0[29];
858     bf1[30] = bf0[30];
859     bf1[31] = bf0[31];
860     bf1[32] = bf0[32] + bf0[47];
861     bf1[33] = bf0[33] + bf0[46];
862     bf1[34] = bf0[34] + bf0[45];
863     bf1[35] = bf0[35] + bf0[44];
864     bf1[36] = bf0[36] + bf0[43];
865     bf1[37] = bf0[37] + bf0[42];
866     bf1[38] = bf0[38] + bf0[41];
867     bf1[39] = bf0[39] + bf0[40];
868     bf1[40] = -bf0[40] + bf0[39];
869     bf1[41] = -bf0[41] + bf0[38];
870     bf1[42] = -bf0[42] + bf0[37];
871     bf1[43] = -bf0[43] + bf0[36];
872     bf1[44] = -bf0[44] + bf0[35];
873     bf1[45] = -bf0[45] + bf0[34];
874     bf1[46] = -bf0[46] + bf0[33];
875     bf1[47] = -bf0[47] + bf0[32];
876     bf1[48] = -bf0[48] + bf0[63];
877     bf1[49] = -bf0[49] + bf0[62];
878     bf1[50] = -bf0[50] + bf0[61];
879     bf1[51] = -bf0[51] + bf0[60];
880     bf1[52] = -bf0[52] + bf0[59];
881     bf1[53] = -bf0[53] + bf0[58];
882     bf1[54] = -bf0[54] + bf0[57];
883     bf1[55] = -bf0[55] + bf0[56];
884     bf1[56] = bf0[56] + bf0[55];
885     bf1[57] = bf0[57] + bf0[54];
886     bf1[58] = bf0[58] + bf0[53];
887     bf1[59] = bf0[59] + bf0[52];
888     bf1[60] = bf0[60] + bf0[51];
889     bf1[61] = bf0[61] + bf0[50];
890     bf1[62] = bf0[62] + bf0[49];
891     bf1[63] = bf0[63] + bf0[48];
892 
893     // stage 4
894     cospi   = cospi_arr(cos_bit);
895     bf0     = output;
896     bf1     = step;
897     bf1[0]  = bf0[0] + bf0[7];
898     bf1[1]  = bf0[1] + bf0[6];
899     bf1[2]  = bf0[2] + bf0[5];
900     bf1[3]  = bf0[3] + bf0[4];
901     bf1[4]  = -bf0[4] + bf0[3];
902     bf1[5]  = -bf0[5] + bf0[2];
903     bf1[6]  = -bf0[6] + bf0[1];
904     bf1[7]  = -bf0[7] + bf0[0];
905     bf1[8]  = bf0[8];
906     bf1[9]  = bf0[9];
907     bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
908     bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
909     bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
910     bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
911     bf1[14] = bf0[14];
912     bf1[15] = bf0[15];
913     bf1[16] = bf0[16] + bf0[23];
914     bf1[17] = bf0[17] + bf0[22];
915     bf1[18] = bf0[18] + bf0[21];
916     bf1[19] = bf0[19] + bf0[20];
917     bf1[20] = -bf0[20] + bf0[19];
918     bf1[21] = -bf0[21] + bf0[18];
919     bf1[22] = -bf0[22] + bf0[17];
920     bf1[23] = -bf0[23] + bf0[16];
921     bf1[24] = -bf0[24] + bf0[31];
922     bf1[25] = -bf0[25] + bf0[30];
923     bf1[26] = -bf0[26] + bf0[29];
924     bf1[27] = -bf0[27] + bf0[28];
925     bf1[28] = bf0[28] + bf0[27];
926     bf1[29] = bf0[29] + bf0[26];
927     bf1[30] = bf0[30] + bf0[25];
928     bf1[31] = bf0[31] + bf0[24];
929     bf1[32] = bf0[32];
930     bf1[33] = bf0[33];
931     bf1[34] = bf0[34];
932     bf1[35] = bf0[35];
933     bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit);
934     bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit);
935     bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit);
936     bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit);
937     bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit);
938     bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit);
939     bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit);
940     bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit);
941     bf1[44] = bf0[44];
942     bf1[45] = bf0[45];
943     bf1[46] = bf0[46];
944     bf1[47] = bf0[47];
945     bf1[48] = bf0[48];
946     bf1[49] = bf0[49];
947     bf1[50] = bf0[50];
948     bf1[51] = bf0[51];
949     bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit);
950     bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit);
951     bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit);
952     bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit);
953     bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit);
954     bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit);
955     bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit);
956     bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit);
957     bf1[60] = bf0[60];
958     bf1[61] = bf0[61];
959     bf1[62] = bf0[62];
960     bf1[63] = bf0[63];
961 
962     // stage 5
963     cospi   = cospi_arr(cos_bit);
964     bf0     = step;
965     bf1     = output;
966     bf1[0]  = bf0[0] + bf0[3];
967     bf1[1]  = bf0[1] + bf0[2];
968     bf1[2]  = -bf0[2] + bf0[1];
969     bf1[3]  = -bf0[3] + bf0[0];
970     bf1[4]  = bf0[4];
971     bf1[5]  = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
972     bf1[6]  = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
973     bf1[7]  = bf0[7];
974     bf1[8]  = bf0[8] + bf0[11];
975     bf1[9]  = bf0[9] + bf0[10];
976     bf1[10] = -bf0[10] + bf0[9];
977     bf1[11] = -bf0[11] + bf0[8];
978     bf1[12] = -bf0[12] + bf0[15];
979     bf1[13] = -bf0[13] + bf0[14];
980     bf1[14] = bf0[14] + bf0[13];
981     bf1[15] = bf0[15] + bf0[12];
982     bf1[16] = bf0[16];
983     bf1[17] = bf0[17];
984     bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
985     bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
986     bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
987     bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
988     bf1[22] = bf0[22];
989     bf1[23] = bf0[23];
990     bf1[24] = bf0[24];
991     bf1[25] = bf0[25];
992     bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
993     bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
994     bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
995     bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
996     bf1[30] = bf0[30];
997     bf1[31] = bf0[31];
998     bf1[32] = bf0[32] + bf0[39];
999     bf1[33] = bf0[33] + bf0[38];
1000     bf1[34] = bf0[34] + bf0[37];
1001     bf1[35] = bf0[35] + bf0[36];
1002     bf1[36] = -bf0[36] + bf0[35];
1003     bf1[37] = -bf0[37] + bf0[34];
1004     bf1[38] = -bf0[38] + bf0[33];
1005     bf1[39] = -bf0[39] + bf0[32];
1006     bf1[40] = -bf0[40] + bf0[47];
1007     bf1[41] = -bf0[41] + bf0[46];
1008     bf1[42] = -bf0[42] + bf0[45];
1009     bf1[43] = -bf0[43] + bf0[44];
1010     bf1[44] = bf0[44] + bf0[43];
1011     bf1[45] = bf0[45] + bf0[42];
1012     bf1[46] = bf0[46] + bf0[41];
1013     bf1[47] = bf0[47] + bf0[40];
1014     bf1[48] = bf0[48] + bf0[55];
1015     bf1[49] = bf0[49] + bf0[54];
1016     bf1[50] = bf0[50] + bf0[53];
1017     bf1[51] = bf0[51] + bf0[52];
1018     bf1[52] = -bf0[52] + bf0[51];
1019     bf1[53] = -bf0[53] + bf0[50];
1020     bf1[54] = -bf0[54] + bf0[49];
1021     bf1[55] = -bf0[55] + bf0[48];
1022     bf1[56] = -bf0[56] + bf0[63];
1023     bf1[57] = -bf0[57] + bf0[62];
1024     bf1[58] = -bf0[58] + bf0[61];
1025     bf1[59] = -bf0[59] + bf0[60];
1026     bf1[60] = bf0[60] + bf0[59];
1027     bf1[61] = bf0[61] + bf0[58];
1028     bf1[62] = bf0[62] + bf0[57];
1029     bf1[63] = bf0[63] + bf0[56];
1030 
1031     // stage 6
1032     cospi   = cospi_arr(cos_bit);
1033     bf0     = output;
1034     bf1     = step;
1035     bf1[0]  = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
1036     bf1[1]  = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
1037     bf1[2]  = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
1038     bf1[3]  = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
1039     bf1[4]  = bf0[4] + bf0[5];
1040     bf1[5]  = -bf0[5] + bf0[4];
1041     bf1[6]  = -bf0[6] + bf0[7];
1042     bf1[7]  = bf0[7] + bf0[6];
1043     bf1[8]  = bf0[8];
1044     bf1[9]  = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
1045     bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
1046     bf1[11] = bf0[11];
1047     bf1[12] = bf0[12];
1048     bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
1049     bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
1050     bf1[15] = bf0[15];
1051     bf1[16] = bf0[16] + bf0[19];
1052     bf1[17] = bf0[17] + bf0[18];
1053     bf1[18] = -bf0[18] + bf0[17];
1054     bf1[19] = -bf0[19] + bf0[16];
1055     bf1[20] = -bf0[20] + bf0[23];
1056     bf1[21] = -bf0[21] + bf0[22];
1057     bf1[22] = bf0[22] + bf0[21];
1058     bf1[23] = bf0[23] + bf0[20];
1059     bf1[24] = bf0[24] + bf0[27];
1060     bf1[25] = bf0[25] + bf0[26];
1061     bf1[26] = -bf0[26] + bf0[25];
1062     bf1[27] = -bf0[27] + bf0[24];
1063     bf1[28] = -bf0[28] + bf0[31];
1064     bf1[29] = -bf0[29] + bf0[30];
1065     bf1[30] = bf0[30] + bf0[29];
1066     bf1[31] = bf0[31] + bf0[28];
1067     bf1[32] = bf0[32];
1068     bf1[33] = bf0[33];
1069     bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit);
1070     bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit);
1071     bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit);
1072     bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit);
1073     bf1[38] = bf0[38];
1074     bf1[39] = bf0[39];
1075     bf1[40] = bf0[40];
1076     bf1[41] = bf0[41];
1077     bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit);
1078     bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit);
1079     bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit);
1080     bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit);
1081     bf1[46] = bf0[46];
1082     bf1[47] = bf0[47];
1083     bf1[48] = bf0[48];
1084     bf1[49] = bf0[49];
1085     bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit);
1086     bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit);
1087     bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit);
1088     bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit);
1089     bf1[54] = bf0[54];
1090     bf1[55] = bf0[55];
1091     bf1[56] = bf0[56];
1092     bf1[57] = bf0[57];
1093     bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit);
1094     bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit);
1095     bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit);
1096     bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit);
1097     bf1[62] = bf0[62];
1098     bf1[63] = bf0[63];
1099 
1100     // stage 7
1101     cospi   = cospi_arr(cos_bit);
1102     bf0     = step;
1103     bf1     = output;
1104     bf1[0]  = bf0[0];
1105     bf1[1]  = bf0[1];
1106     bf1[2]  = bf0[2];
1107     bf1[3]  = bf0[3];
1108     bf1[4]  = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
1109     bf1[5]  = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
1110     bf1[6]  = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
1111     bf1[7]  = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
1112     bf1[8]  = bf0[8] + bf0[9];
1113     bf1[9]  = -bf0[9] + bf0[8];
1114     bf1[10] = -bf0[10] + bf0[11];
1115     bf1[11] = bf0[11] + bf0[10];
1116     bf1[12] = bf0[12] + bf0[13];
1117     bf1[13] = -bf0[13] + bf0[12];
1118     bf1[14] = -bf0[14] + bf0[15];
1119     bf1[15] = bf0[15] + bf0[14];
1120     bf1[16] = bf0[16];
1121     bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
1122     bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
1123     bf1[19] = bf0[19];
1124     bf1[20] = bf0[20];
1125     bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
1126     bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
1127     bf1[23] = bf0[23];
1128     bf1[24] = bf0[24];
1129     bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
1130     bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
1131     bf1[27] = bf0[27];
1132     bf1[28] = bf0[28];
1133     bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
1134     bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
1135     bf1[31] = bf0[31];
1136     bf1[32] = bf0[32] + bf0[35];
1137     bf1[33] = bf0[33] + bf0[34];
1138     bf1[34] = -bf0[34] + bf0[33];
1139     bf1[35] = -bf0[35] + bf0[32];
1140     bf1[36] = -bf0[36] + bf0[39];
1141     bf1[37] = -bf0[37] + bf0[38];
1142     bf1[38] = bf0[38] + bf0[37];
1143     bf1[39] = bf0[39] + bf0[36];
1144     bf1[40] = bf0[40] + bf0[43];
1145     bf1[41] = bf0[41] + bf0[42];
1146     bf1[42] = -bf0[42] + bf0[41];
1147     bf1[43] = -bf0[43] + bf0[40];
1148     bf1[44] = -bf0[44] + bf0[47];
1149     bf1[45] = -bf0[45] + bf0[46];
1150     bf1[46] = bf0[46] + bf0[45];
1151     bf1[47] = bf0[47] + bf0[44];
1152     bf1[48] = bf0[48] + bf0[51];
1153     bf1[49] = bf0[49] + bf0[50];
1154     bf1[50] = -bf0[50] + bf0[49];
1155     bf1[51] = -bf0[51] + bf0[48];
1156     bf1[52] = -bf0[52] + bf0[55];
1157     bf1[53] = -bf0[53] + bf0[54];
1158     bf1[54] = bf0[54] + bf0[53];
1159     bf1[55] = bf0[55] + bf0[52];
1160     bf1[56] = bf0[56] + bf0[59];
1161     bf1[57] = bf0[57] + bf0[58];
1162     bf1[58] = -bf0[58] + bf0[57];
1163     bf1[59] = -bf0[59] + bf0[56];
1164     bf1[60] = -bf0[60] + bf0[63];
1165     bf1[61] = -bf0[61] + bf0[62];
1166     bf1[62] = bf0[62] + bf0[61];
1167     bf1[63] = bf0[63] + bf0[60];
1168 
1169     // stage 8
1170     cospi   = cospi_arr(cos_bit);
1171     bf0     = output;
1172     bf1     = step;
1173     bf1[0]  = bf0[0];
1174     bf1[1]  = bf0[1];
1175     bf1[2]  = bf0[2];
1176     bf1[3]  = bf0[3];
1177     bf1[4]  = bf0[4];
1178     bf1[5]  = bf0[5];
1179     bf1[6]  = bf0[6];
1180     bf1[7]  = bf0[7];
1181     bf1[8]  = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
1182     bf1[9]  = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
1183     bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
1184     bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
1185     bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
1186     bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
1187     bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
1188     bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
1189     bf1[16] = bf0[16] + bf0[17];
1190     bf1[17] = -bf0[17] + bf0[16];
1191     bf1[18] = -bf0[18] + bf0[19];
1192     bf1[19] = bf0[19] + bf0[18];
1193     bf1[20] = bf0[20] + bf0[21];
1194     bf1[21] = -bf0[21] + bf0[20];
1195     bf1[22] = -bf0[22] + bf0[23];
1196     bf1[23] = bf0[23] + bf0[22];
1197     bf1[24] = bf0[24] + bf0[25];
1198     bf1[25] = -bf0[25] + bf0[24];
1199     bf1[26] = -bf0[26] + bf0[27];
1200     bf1[27] = bf0[27] + bf0[26];
1201     bf1[28] = bf0[28] + bf0[29];
1202     bf1[29] = -bf0[29] + bf0[28];
1203     bf1[30] = -bf0[30] + bf0[31];
1204     bf1[31] = bf0[31] + bf0[30];
1205     bf1[32] = bf0[32];
1206     bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit);
1207     bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit);
1208     bf1[35] = bf0[35];
1209     bf1[36] = bf0[36];
1210     bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit);
1211     bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit);
1212     bf1[39] = bf0[39];
1213     bf1[40] = bf0[40];
1214     bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit);
1215     bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit);
1216     bf1[43] = bf0[43];
1217     bf1[44] = bf0[44];
1218     bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit);
1219     bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit);
1220     bf1[47] = bf0[47];
1221     bf1[48] = bf0[48];
1222     bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit);
1223     bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit);
1224     bf1[51] = bf0[51];
1225     bf1[52] = bf0[52];
1226     bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit);
1227     bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit);
1228     bf1[55] = bf0[55];
1229     bf1[56] = bf0[56];
1230     bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit);
1231     bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit);
1232     bf1[59] = bf0[59];
1233     bf1[60] = bf0[60];
1234     bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit);
1235     bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit);
1236     bf1[63] = bf0[63];
1237 
1238     // stage 9
1239     cospi   = cospi_arr(cos_bit);
1240     bf0     = step;
1241     bf1     = output;
1242     bf1[0]  = bf0[0];
1243     bf1[1]  = bf0[1];
1244     bf1[2]  = bf0[2];
1245     bf1[3]  = bf0[3];
1246     bf1[4]  = bf0[4];
1247     bf1[5]  = bf0[5];
1248     bf1[6]  = bf0[6];
1249     bf1[7]  = bf0[7];
1250     bf1[8]  = bf0[8];
1251     bf1[9]  = bf0[9];
1252     bf1[10] = bf0[10];
1253     bf1[11] = bf0[11];
1254     bf1[12] = bf0[12];
1255     bf1[13] = bf0[13];
1256     bf1[14] = bf0[14];
1257     bf1[15] = bf0[15];
1258     bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
1259     bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit);
1260     bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
1261     bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit);
1262     bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
1263     bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit);
1264     bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
1265     bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit);
1266     bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
1267     bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit);
1268     bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
1269     bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit);
1270     bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
1271     bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
1272     bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
1273     bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);
1274     bf1[32] = bf0[32] + bf0[33];
1275     bf1[33] = -bf0[33] + bf0[32];
1276     bf1[34] = -bf0[34] + bf0[35];
1277     bf1[35] = bf0[35] + bf0[34];
1278     bf1[36] = bf0[36] + bf0[37];
1279     bf1[37] = -bf0[37] + bf0[36];
1280     bf1[38] = -bf0[38] + bf0[39];
1281     bf1[39] = bf0[39] + bf0[38];
1282     bf1[40] = bf0[40] + bf0[41];
1283     bf1[41] = -bf0[41] + bf0[40];
1284     bf1[42] = -bf0[42] + bf0[43];
1285     bf1[43] = bf0[43] + bf0[42];
1286     bf1[44] = bf0[44] + bf0[45];
1287     bf1[45] = -bf0[45] + bf0[44];
1288     bf1[46] = -bf0[46] + bf0[47];
1289     bf1[47] = bf0[47] + bf0[46];
1290     bf1[48] = bf0[48] + bf0[49];
1291     bf1[49] = -bf0[49] + bf0[48];
1292     bf1[50] = -bf0[50] + bf0[51];
1293     bf1[51] = bf0[51] + bf0[50];
1294     bf1[52] = bf0[52] + bf0[53];
1295     bf1[53] = -bf0[53] + bf0[52];
1296     bf1[54] = -bf0[54] + bf0[55];
1297     bf1[55] = bf0[55] + bf0[54];
1298     bf1[56] = bf0[56] + bf0[57];
1299     bf1[57] = -bf0[57] + bf0[56];
1300     bf1[58] = -bf0[58] + bf0[59];
1301     bf1[59] = bf0[59] + bf0[58];
1302     bf1[60] = bf0[60] + bf0[61];
1303     bf1[61] = -bf0[61] + bf0[60];
1304     bf1[62] = -bf0[62] + bf0[63];
1305     bf1[63] = bf0[63] + bf0[62];
1306 
1307     // stage 10
1308     cospi   = cospi_arr(cos_bit);
1309     bf0     = output;
1310     bf1     = step;
1311     bf1[0]  = bf0[0];
1312     bf1[1]  = bf0[1];
1313     bf1[2]  = bf0[2];
1314     bf1[3]  = bf0[3];
1315     bf1[4]  = bf0[4];
1316     bf1[5]  = bf0[5];
1317     bf1[6]  = bf0[6];
1318     bf1[7]  = bf0[7];
1319     bf1[8]  = bf0[8];
1320     bf1[9]  = bf0[9];
1321     bf1[10] = bf0[10];
1322     bf1[11] = bf0[11];
1323     bf1[12] = bf0[12];
1324     bf1[13] = bf0[13];
1325     bf1[14] = bf0[14];
1326     bf1[15] = bf0[15];
1327     bf1[16] = bf0[16];
1328     bf1[17] = bf0[17];
1329     bf1[18] = bf0[18];
1330     bf1[19] = bf0[19];
1331     bf1[20] = bf0[20];
1332     bf1[21] = bf0[21];
1333     bf1[22] = bf0[22];
1334     bf1[23] = bf0[23];
1335     bf1[24] = bf0[24];
1336     bf1[25] = bf0[25];
1337     bf1[26] = bf0[26];
1338     bf1[27] = bf0[27];
1339     bf1[28] = bf0[28];
1340     bf1[29] = bf0[29];
1341     bf1[30] = bf0[30];
1342     bf1[31] = bf0[31];
1343     bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit);
1344     bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit);
1345     bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit);
1346     bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit);
1347     bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit);
1348     bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit);
1349     bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit);
1350     bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit);
1351     bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit);
1352     bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit);
1353     bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit);
1354     bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit);
1355     bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit);
1356     bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit);
1357     bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit);
1358     bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit);
1359     bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit);
1360     bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit);
1361     bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit);
1362     bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit);
1363     bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit);
1364     bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit);
1365     bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit);
1366     bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit);
1367     bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit);
1368     bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit);
1369     bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit);
1370     bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit);
1371     bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit);
1372     bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit);
1373     bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit);
1374     bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit);
1375 
1376     // stage 11
1377     bf0     = step;
1378     bf1     = output;
1379     bf1[0]  = bf0[0];
1380     bf1[1]  = bf0[32];
1381     bf1[2]  = bf0[16];
1382     bf1[3]  = bf0[48];
1383     bf1[4]  = bf0[8];
1384     bf1[5]  = bf0[40];
1385     bf1[6]  = bf0[24];
1386     bf1[7]  = bf0[56];
1387     bf1[8]  = bf0[4];
1388     bf1[9]  = bf0[36];
1389     bf1[10] = bf0[20];
1390     bf1[11] = bf0[52];
1391     bf1[12] = bf0[12];
1392     bf1[13] = bf0[44];
1393     bf1[14] = bf0[28];
1394     bf1[15] = bf0[60];
1395     bf1[16] = bf0[2];
1396     bf1[17] = bf0[34];
1397     bf1[18] = bf0[18];
1398     bf1[19] = bf0[50];
1399     bf1[20] = bf0[10];
1400     bf1[21] = bf0[42];
1401     bf1[22] = bf0[26];
1402     bf1[23] = bf0[58];
1403     bf1[24] = bf0[6];
1404     bf1[25] = bf0[38];
1405     bf1[26] = bf0[22];
1406     bf1[27] = bf0[54];
1407     bf1[28] = bf0[14];
1408     bf1[29] = bf0[46];
1409     bf1[30] = bf0[30];
1410     bf1[31] = bf0[62];
1411     bf1[32] = bf0[1];
1412     bf1[33] = bf0[33];
1413     bf1[34] = bf0[17];
1414     bf1[35] = bf0[49];
1415     bf1[36] = bf0[9];
1416     bf1[37] = bf0[41];
1417     bf1[38] = bf0[25];
1418     bf1[39] = bf0[57];
1419     bf1[40] = bf0[5];
1420     bf1[41] = bf0[37];
1421     bf1[42] = bf0[21];
1422     bf1[43] = bf0[53];
1423     bf1[44] = bf0[13];
1424     bf1[45] = bf0[45];
1425     bf1[46] = bf0[29];
1426     bf1[47] = bf0[61];
1427     bf1[48] = bf0[3];
1428     bf1[49] = bf0[35];
1429     bf1[50] = bf0[19];
1430     bf1[51] = bf0[51];
1431     bf1[52] = bf0[11];
1432     bf1[53] = bf0[43];
1433     bf1[54] = bf0[27];
1434     bf1[55] = bf0[59];
1435     bf1[56] = bf0[7];
1436     bf1[57] = bf0[39];
1437     bf1[58] = bf0[23];
1438     bf1[59] = bf0[55];
1439     bf1[60] = bf0[15];
1440     bf1[61] = bf0[47];
1441     bf1[62] = bf0[31];
1442     bf1[63] = bf0[63];
1443 }
1444 
/*
 * 4-point forward ADST (going by the function name), evaluated in fixed point
 * with the sine table returned by sinpi_arr(cos_bit).
 *
 *   input       : four values to transform; all four are read before anything
 *                 is written to output.
 *   output      : receives the four results, each passed through
 *                 round_shift(value, cos_bit).
 *   cos_bit     : selects the sine table and is the final rounding shift.
 *   stage_range : unused; no per-stage range checking is performed here.
 */
void svt_av1_fadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                        const int8_t *stage_range) {
    (void)stage_range;
    const int32_t  shift  = cos_bit;
    const int32_t *sin_pi = sinpi_arr(shift);

    // Snapshot the inputs before any output is written.
    const int32_t in0 = input[0];
    const int32_t in1 = input[1];
    const int32_t in2 = input[2];
    const int32_t in3 = input[3];

    // An all-zero input yields an all-zero output; skip the arithmetic.
    if ((in0 | in1 | in2 | in3) == 0) {
        output[3] = 0;
        output[2] = 0;
        output[1] = 0;
        output[0] = 0;
        return;
    }

    // Products of each input with the sine-table entries it contributes to.
    const int32_t sin1_in0 = sin_pi[1] * in0;
    const int32_t sin4_in0 = sin_pi[4] * in0;
    const int32_t sin2_in1 = sin_pi[2] * in1;
    const int32_t sin1_in1 = sin_pi[1] * in1;
    const int32_t sin3_in2 = sin_pi[3] * in2;
    const int32_t sin4_in3 = sin_pi[4] * in3;
    const int32_t sin2_in3 = sin_pi[2] * in3;

    // in0 + in1 - in3 is combined first and only then scaled by sin_pi[3].
    const int32_t mixed = in0 + in1 - in3;

    // Partial sums shared by several outputs.
    const int32_t sum_a = sin1_in0 + sin2_in1 + sin4_in3;
    const int32_t sum_b = sin_pi[3] * mixed;
    const int32_t sum_c = sin4_in0 - sin1_in1 + sin2_in3;

    // Unrounded results, still carrying the table's fixed-point scale.
    const int32_t res0 = sum_a + sin3_in2;
    const int32_t res1 = sum_b;
    const int32_t res2 = sum_c - sin3_in2;
    const int32_t res3 = sum_c - sum_a + sin3_in2;

    // 1-D transform scaling factor is sqrt(2).
    output[0] = round_shift(res0, shift);
    output[1] = round_shift(res1, shift);
    output[2] = round_shift(res2, shift);
    output[3] = round_shift(res3, shift);
}
1534 
/*
 * 8-point 1-D kernel that fwd_txfm_type_to_func() returns for
 * TXFM_TYPE_ADST8.
 *
 * input       : 8 source values, read only.
 * output      : receives the 8 results; it doubles as a working buffer
 *               between stages, so it must not be the same pointer as input.
 * cos_bit     : selects the table returned by cospi_arr() and is forwarded
 *               to every half_btf() call.
 * stage_range : accepted for signature compatibility, unused here.
 *
 * Data ping-pongs between output and a local scratch array: even stages
 * write scratch, odd stages write output.
 */
void svt_av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                        const int8_t *stage_range) {
    (void)stage_range;
    int32_t scratch[8];

    // stage 1: signed reordering of the input into output
    assert(output != input);
    output[0] = input[0];
    output[1] = -input[7];
    output[2] = -input[3];
    output[3] = input[4];
    output[4] = -input[1];
    output[5] = input[6];
    output[6] = input[2];
    output[7] = -input[5];

    // cos_bit is fixed for the whole call, so one table lookup is enough.
    const int32_t *const cospi = cospi_arr(cos_bit);

    // stage 2: per group of four, pass two values through and combine the
    // other two with the cospi[32] weight
    for (int32_t base = 0; base < 8; base += 4) {
        const int32_t *src = output + base;
        int32_t       *dst = scratch + base;
        dst[0]             = src[0];
        dst[1]             = src[1];
        dst[2]             = half_btf(cospi[32], src[2], cospi[32], src[3], cos_bit);
        dst[3]             = half_btf(cospi[32], src[2], -cospi[32], src[3], cos_bit);
    }

    // stage 3: sums and differences at distance 2 inside each group of four
    for (int32_t base = 0; base < 8; base += 4) {
        for (int32_t i = 0; i < 2; ++i) {
            const int32_t near_val = scratch[base + i];
            const int32_t far_val  = scratch[base + i + 2];
            output[base + i]       = near_val + far_val;
            output[base + i + 2]   = near_val - far_val;
        }
    }

    // stage 4: lower half passes through, upper half uses the 16/48 weights
    for (int32_t i = 0; i < 4; ++i) scratch[i] = output[i];
    scratch[4] = half_btf(cospi[16], output[4], cospi[48], output[5], cos_bit);
    scratch[5] = half_btf(cospi[48], output[4], -cospi[16], output[5], cos_bit);
    scratch[6] = half_btf(-cospi[48], output[6], cospi[16], output[7], cos_bit);
    scratch[7] = half_btf(cospi[16], output[6], cospi[48], output[7], cos_bit);

    // stage 5: sums and differences at distance 4
    for (int32_t i = 0; i < 4; ++i) {
        const int32_t near_val = scratch[i];
        const int32_t far_val  = scratch[i + 4];
        output[i]              = near_val + far_val;
        output[i + 4]          = near_val - far_val;
    }

    // stage 6: adjacent pairs, weight indices (4,60), (20,44), (36,28), (52,12)
    for (int32_t k = 0; k < 4; ++k) {
        const int32_t w_a    = cospi[4 + 16 * k];
        const int32_t w_b    = cospi[60 - 16 * k];
        const int32_t first  = output[2 * k];
        const int32_t second = output[2 * k + 1];
        scratch[2 * k]       = half_btf(w_a, first, w_b, second, cos_bit);
        scratch[2 * k + 1]   = half_btf(w_b, first, -w_a, second, cos_bit);
    }

    // stage 7: final reordering into output
    output[0] = scratch[1];
    output[1] = scratch[6];
    output[2] = scratch[3];
    output[3] = scratch[4];
    output[4] = scratch[5];
    output[5] = scratch[2];
    output[6] = scratch[7];
    output[7] = scratch[0];
}
1632 
/*
 * 16-point 1-D kernel that fwd_txfm_type_to_func() returns for
 * TXFM_TYPE_ADST16.
 *
 * input       : 16 source values, read only.
 * output      : receives the 16 results; it doubles as a working buffer
 *               between stages, so it must not be the same pointer as input.
 * cos_bit     : selects the table returned by cospi_arr() and is forwarded
 *               to every half_btf() call.
 * stage_range : accepted for signature compatibility, unused here.
 *
 * Data ping-pongs between output and a local scratch array: even stages
 * write scratch, odd stages write output.
 */
void svt_av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                         const int8_t *stage_range) {
    (void)stage_range;
    int32_t scratch[16];

    // stage 1: signed reordering of the input into output
    assert(output != input);
    output[0]  = input[0];
    output[1]  = -input[15];
    output[2]  = -input[7];
    output[3]  = input[8];
    output[4]  = -input[3];
    output[5]  = input[12];
    output[6]  = input[4];
    output[7]  = -input[11];
    output[8]  = -input[1];
    output[9]  = input[14];
    output[10] = input[6];
    output[11] = -input[9];
    output[12] = input[2];
    output[13] = -input[13];
    output[14] = -input[5];
    output[15] = input[10];

    // cos_bit is fixed for the whole call, so one table lookup is enough.
    const int32_t *const cospi = cospi_arr(cos_bit);

    // stage 2: per group of four, pass two values through and combine the
    // other two with the cospi[32] weight
    for (int32_t base = 0; base < 16; base += 4) {
        const int32_t *src = output + base;
        int32_t       *dst = scratch + base;
        dst[0]             = src[0];
        dst[1]             = src[1];
        dst[2]             = half_btf(cospi[32], src[2], cospi[32], src[3], cos_bit);
        dst[3]             = half_btf(cospi[32], src[2], -cospi[32], src[3], cos_bit);
    }

    // stage 3: sums and differences at distance 2 inside each group of four
    for (int32_t base = 0; base < 16; base += 4) {
        for (int32_t i = 0; i < 2; ++i) {
            const int32_t near_val = scratch[base + i];
            const int32_t far_val  = scratch[base + i + 2];
            output[base + i]       = near_val + far_val;
            output[base + i + 2]   = near_val - far_val;
        }
    }

    // stage 4: per group of eight, the first four pass through and the last
    // four use the 16/48 weights
    for (int32_t base = 0; base < 16; base += 8) {
        const int32_t *src = output + base;
        int32_t       *dst = scratch + base;
        for (int32_t i = 0; i < 4; ++i) dst[i] = src[i];
        dst[4] = half_btf(cospi[16], src[4], cospi[48], src[5], cos_bit);
        dst[5] = half_btf(cospi[48], src[4], -cospi[16], src[5], cos_bit);
        dst[6] = half_btf(-cospi[48], src[6], cospi[16], src[7], cos_bit);
        dst[7] = half_btf(cospi[16], src[6], cospi[48], src[7], cos_bit);
    }

    // stage 5: sums and differences at distance 4 inside each group of eight
    for (int32_t base = 0; base < 16; base += 8) {
        for (int32_t i = 0; i < 4; ++i) {
            const int32_t near_val = scratch[base + i];
            const int32_t far_val  = scratch[base + i + 4];
            output[base + i]       = near_val + far_val;
            output[base + i + 4]   = near_val - far_val;
        }
    }

    // stage 6: lower half passes through, upper half uses the 8/56 and 40/24
    // weights
    for (int32_t i = 0; i < 8; ++i) scratch[i] = output[i];
    scratch[8]  = half_btf(cospi[8], output[8], cospi[56], output[9], cos_bit);
    scratch[9]  = half_btf(cospi[56], output[8], -cospi[8], output[9], cos_bit);
    scratch[10] = half_btf(cospi[40], output[10], cospi[24], output[11], cos_bit);
    scratch[11] = half_btf(cospi[24], output[10], -cospi[40], output[11], cos_bit);
    scratch[12] = half_btf(-cospi[56], output[12], cospi[8], output[13], cos_bit);
    scratch[13] = half_btf(cospi[8], output[12], cospi[56], output[13], cos_bit);
    scratch[14] = half_btf(-cospi[24], output[14], cospi[40], output[15], cos_bit);
    scratch[15] = half_btf(cospi[40], output[14], cospi[24], output[15], cos_bit);

    // stage 7: sums and differences at distance 8
    for (int32_t i = 0; i < 8; ++i) {
        const int32_t near_val = scratch[i];
        const int32_t far_val  = scratch[i + 8];
        output[i]              = near_val + far_val;
        output[i + 8]          = near_val - far_val;
    }

    // stage 8: adjacent pairs, weight indices (2,62), (10,54), (18,46),
    // (26,38), (34,30), (42,22), (50,14), (58,6)
    for (int32_t k = 0; k < 8; ++k) {
        const int32_t w_a    = cospi[2 + 8 * k];
        const int32_t w_b    = cospi[62 - 8 * k];
        const int32_t first  = output[2 * k];
        const int32_t second = output[2 * k + 1];
        scratch[2 * k]       = half_btf(w_a, first, w_b, second, cos_bit);
        scratch[2 * k + 1]   = half_btf(w_b, first, -w_a, second, cos_bit);
    }

    // stage 9: final reordering into output
    output[0]  = scratch[1];
    output[1]  = scratch[14];
    output[2]  = scratch[3];
    output[3]  = scratch[12];
    output[4]  = scratch[5];
    output[5]  = scratch[10];
    output[6]  = scratch[7];
    output[7]  = scratch[8];
    output[8]  = scratch[9];
    output[9]  = scratch[6];
    output[10] = scratch[11];
    output[11] = scratch[4];
    output[12] = scratch[13];
    output[13] = scratch[2];
    output[14] = scratch[15];
    output[15] = scratch[0];
}
1827 
/*
 * Sum/difference stage shared by the odd stages of av1_fadst32_new().
 * The 32 values in src are treated as consecutive groups of 2 * half entries;
 * within each group, element i and element i + half produce their sum (stored
 * at i) and "-far + near" (stored at i + half). src and dst must be distinct
 * buffers.
 */
static void fadst32_add_sub(const int32_t *src, int32_t *dst, int32_t half) {
    for (int32_t base = 0; base < 32; base += 2 * half) {
        for (int32_t i = 0; i < half; ++i) {
            const int32_t near_val = src[base + i];
            const int32_t far_val  = src[base + i + half];
            dst[base + i]          = near_val + far_val;
            dst[base + i + half]   = -far_val + near_val;
        }
    }
}

/*
 * 32-point 1-D kernel that fwd_txfm_type_to_func() returns for
 * TXFM_TYPE_ADST32.
 *
 * input       : 32 source values, read only.
 * output      : receives the 32 results; it doubles as a working buffer
 *               between stages. It must not be the same pointer as input:
 *               stage 1 writes output[0] before it reads input[0], so an
 *               in-place call would read already-overwritten data. This is
 *               now asserted, matching the 8- and 16-point kernels.
 * cos_bit     : selects the table returned by cospi_arr() and is forwarded
 *               to every half_btf() call.
 * stage_range : accepted for signature compatibility, unused here.
 *
 * Data ping-pongs between output and the local step array: even stages write
 * step, odd stages write output.
 */
void av1_fadst32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                     const int8_t *stage_range) {
    (void)stage_range;
    int32_t step[32];

    // stage 1: interleave the input read from the back (odd indices 31, 29,
    // ..., 1) with the input read from the front (even indices 0, 2, ..., 30)
    assert(output != input);
    for (int32_t k = 0; k < 16; ++k) {
        output[2 * k]     = input[31 - 2 * k];
        output[2 * k + 1] = input[2 * k];
    }

    // cos_bit is fixed for the whole call, so one table lookup is enough.
    const int32_t *const cospi = cospi_arr(cos_bit);

    // stage 2: adjacent pairs, weight indices (1,63), (5,59), ..., (61,3)
    for (int32_t k = 0; k < 16; ++k) {
        const int32_t w_a    = cospi[1 + 4 * k];
        const int32_t w_b    = cospi[63 - 4 * k];
        const int32_t first  = output[2 * k];
        const int32_t second = output[2 * k + 1];
        step[2 * k]          = half_btf(w_a, first, w_b, second, cos_bit);
        step[2 * k + 1]      = half_btf(-w_a, second, w_b, first, cos_bit);
    }

    // stage 3: sums and differences at distance 16
    fadst32_add_sub(step, output, 16);

    // stage 4: entries 0..15 pass through; entries 16..23 and 24..31 use the
    // weight index pairs (4,60), (20,44), (36,28), (52,12)
    for (int32_t i = 0; i < 16; ++i) step[i] = output[i];
    for (int32_t k = 0; k < 4; ++k) {
        const int32_t  w_a = cospi[4 + 16 * k];
        const int32_t  w_b = cospi[60 - 16 * k];
        const int32_t *lo  = output + 16 + 2 * k;
        const int32_t *hi  = output + 24 + 2 * k;
        step[16 + 2 * k]   = half_btf(w_a, lo[0], w_b, lo[1], cos_bit);
        step[17 + 2 * k]   = half_btf(-w_a, lo[1], w_b, lo[0], cos_bit);
        step[24 + 2 * k]   = half_btf(-w_b, hi[0], w_a, hi[1], cos_bit);
        step[25 + 2 * k]   = half_btf(w_b, hi[1], w_a, hi[0], cos_bit);
    }

    // stage 5: sums and differences at distance 8 inside each group of 16
    fadst32_add_sub(step, output, 8);

    // stage 6: per group of 16, the first eight pass through and the last
    // eight use the 8/56 and 40/24 weights
    for (int32_t base = 0; base < 32; base += 16) {
        const int32_t *src = output + base;
        int32_t       *dst = step + base;
        for (int32_t i = 0; i < 8; ++i) dst[i] = src[i];
        dst[8]  = half_btf(cospi[8], src[8], cospi[56], src[9], cos_bit);
        dst[9]  = half_btf(-cospi[8], src[9], cospi[56], src[8], cos_bit);
        dst[10] = half_btf(cospi[40], src[10], cospi[24], src[11], cos_bit);
        dst[11] = half_btf(-cospi[40], src[11], cospi[24], src[10], cos_bit);
        dst[12] = half_btf(-cospi[56], src[12], cospi[8], src[13], cos_bit);
        dst[13] = half_btf(cospi[56], src[13], cospi[8], src[12], cos_bit);
        dst[14] = half_btf(-cospi[24], src[14], cospi[40], src[15], cos_bit);
        dst[15] = half_btf(cospi[24], src[15], cospi[40], src[14], cos_bit);
    }

    // stage 7: sums and differences at distance 4 inside each group of 8
    fadst32_add_sub(step, output, 4);

    // stage 8: per group of eight, the first four pass through and the last
    // four use the 16/48 weights
    for (int32_t base = 0; base < 32; base += 8) {
        const int32_t *src = output + base;
        int32_t       *dst = step + base;
        for (int32_t i = 0; i < 4; ++i) dst[i] = src[i];
        dst[4] = half_btf(cospi[16], src[4], cospi[48], src[5], cos_bit);
        dst[5] = half_btf(-cospi[16], src[5], cospi[48], src[4], cos_bit);
        dst[6] = half_btf(-cospi[48], src[6], cospi[16], src[7], cos_bit);
        dst[7] = half_btf(cospi[48], src[7], cospi[16], src[6], cos_bit);
    }

    // stage 9: sums and differences at distance 2 inside each group of 4
    fadst32_add_sub(step, output, 2);

    // stage 10: per group of four, two values pass through and the other two
    // are combined with the cospi[32] weight
    for (int32_t base = 0; base < 32; base += 4) {
        const int32_t *src = output + base;
        int32_t       *dst = step + base;
        dst[0]             = src[0];
        dst[1]             = src[1];
        dst[2]             = half_btf(cospi[32], src[2], cospi[32], src[3], cos_bit);
        dst[3]             = half_btf(-cospi[32], src[3], cospi[32], src[2], cos_bit);
    }

    // stage 11: final signed reordering into output
    output[0]  = step[0];
    output[1]  = -step[16];
    output[2]  = step[24];
    output[3]  = -step[8];
    output[4]  = step[12];
    output[5]  = -step[28];
    output[6]  = step[20];
    output[7]  = -step[4];
    output[8]  = step[6];
    output[9]  = -step[22];
    output[10] = step[30];
    output[11] = -step[14];
    output[12] = step[10];
    output[13] = -step[26];
    output[14] = step[18];
    output[15] = -step[2];
    output[16] = step[3];
    output[17] = -step[19];
    output[18] = step[27];
    output[19] = -step[11];
    output[20] = step[15];
    output[21] = -step[31];
    output[22] = step[23];
    output[23] = -step[7];
    output[24] = step[5];
    output[25] = -step[21];
    output[26] = step[29];
    output[27] = -step[13];
    output[28] = step[9];
    output[29] = -step[25];
    output[30] = step[17];
    output[31] = -step[1];
}
2238 
svt_av1_fidentity4_c(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)2239 void svt_av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
2240                           const int8_t *stage_range) {
2241     (void)stage_range;
2242     (void)cos_bit;
2243     for (int32_t i = 0; i < 4; ++i)
2244         output[i] = round_shift((int64_t)input[i] * new_sqrt2, new_sqrt2_bits);
2245     assert(stage_range[0] + new_sqrt2_bits <= 32);
2246 }
2247 
/*
 * 8-point 1-D kernel that fwd_txfm_type_to_func() returns for
 * TXFM_TYPE_IDENTITY8: every one of the 8 input values is doubled.
 *
 * cos_bit and stage_range are accepted for signature compatibility and
 * are not used.
 */
void svt_av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                          const int8_t *stage_range) {
    (void)cos_bit;
    (void)stage_range;

    const int32_t       *src     = input;
    const int32_t *const src_end = input + 8;
    int32_t             *dst     = output;
    while (src != src_end) {
        *dst = *src * 2;
        ++src;
        ++dst;
    }
}
2254 
svt_av1_fidentity16_c(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)2255 void svt_av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
2256                            const int8_t *stage_range) {
2257     (void)stage_range;
2258     (void)cos_bit;
2259     for (int32_t i = 0; i < 16; ++i)
2260         output[i] = round_shift((int64_t)input[i] * 2 * new_sqrt2, new_sqrt2_bits);
2261     assert(stage_range[0] + new_sqrt2_bits <= 32);
2262 }
2263 
/*
 * 32-point 1-D kernel that fwd_txfm_type_to_func() returns for
 * TXFM_TYPE_IDENTITY32: every one of the 32 input values is multiplied by 4.
 *
 * cos_bit and stage_range are accepted for signature compatibility and
 * are not used.
 */
void svt_av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                           const int8_t *stage_range) {
    (void)cos_bit;
    (void)stage_range;

    const int32_t       *src     = input;
    const int32_t *const src_end = input + 32;
    int32_t             *dst     = output;
    while (src != src_end) {
        *dst = *src * 4;
        ++src;
        ++dst;
    }
}
2270 
av1_fidentity64_c(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)2271 void av1_fidentity64_c(const int32_t *input, int32_t *output, int8_t cos_bit,
2272                        const int8_t *stage_range) {
2273     (void)stage_range;
2274     (void)cos_bit;
2275     for (int32_t i = 0; i < 64; ++i)
2276         output[i] = round_shift((int64_t)input[i] * 4 * new_sqrt2, new_sqrt2_bits);
2277     assert(stage_range[0] + new_sqrt2_bits <= 32);
2278 }
2279 
fwd_txfm_type_to_func(TxfmType txfmtype)2280 static INLINE TxfmFunc fwd_txfm_type_to_func(TxfmType txfmtype) {
2281     switch (txfmtype) {
2282     case TXFM_TYPE_DCT4: return svt_av1_fdct4_new;
2283     case TXFM_TYPE_DCT8: return svt_av1_fdct8_new;
2284     case TXFM_TYPE_DCT16: return svt_av1_fdct16_new;
2285     case TXFM_TYPE_DCT32: return svt_av1_fdct32_new;
2286     case TXFM_TYPE_DCT64: return svt_av1_fdct64_new;
2287     case TXFM_TYPE_ADST4: return svt_av1_fadst4_new;
2288     case TXFM_TYPE_ADST8: return svt_av1_fadst8_new;
2289     case TXFM_TYPE_ADST16: return svt_av1_fadst16_new;
2290     case TXFM_TYPE_ADST32: return av1_fadst32_new;
2291     case TXFM_TYPE_IDENTITY4: return svt_av1_fidentity4_c;
2292     case TXFM_TYPE_IDENTITY8: return svt_av1_fidentity8_c;
2293     case TXFM_TYPE_IDENTITY16: return svt_av1_fidentity16_c;
2294     case TXFM_TYPE_IDENTITY32: return svt_av1_fidentity32_c;
2295     case TXFM_TYPE_IDENTITY64: return av1_fidentity64_c;
2296     default: assert(0); return NULL;
2297     }
2298 }
2299 
2300 //fwd_txfm2d_c
av1_tranform_two_d_core_c(int16_t * input,uint32_t input_stride,int32_t * output,const Txfm2dFlipCfg * cfg,int32_t * buf,uint8_t bit_depth)2301 static INLINE void av1_tranform_two_d_core_c(int16_t *input, uint32_t input_stride, int32_t *output,
2302                                              const Txfm2dFlipCfg *cfg, int32_t *buf,
2303                                              uint8_t bit_depth) {
2304     int32_t c, r;
2305     // Note when assigning txfm_size_col, we use the txfm_size from the
2306     // row configuration and vice versa. This is intentionally done to
2307     // accurately perform rectangular transforms. When the transform is
2308     // rectangular, the number of columns will be the same as the
2309     // txfm_size stored in the row cfg struct. It will make no difference
2310     // for square transforms.
2311     const int32_t txfm_size_col = tx_size_wide[cfg->tx_size];
2312     const int32_t txfm_size_row = tx_size_high[cfg->tx_size];
2313     // Take the shift from the larger dimension in the rectangular case.
2314     const int8_t *shift     = cfg->shift;
2315     const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
2316     int8_t        stage_range_col[MAX_TXFM_STAGE_NUM];
2317     int8_t        stage_range_row[MAX_TXFM_STAGE_NUM];
2318     assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM);
2319     assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM);
2320     svt_av1_gen_fwd_stage_range(stage_range_col, stage_range_row, cfg, bit_depth);
2321 
2322     const int8_t   cos_bit_col   = cfg->cos_bit_col;
2323     const int8_t   cos_bit_row   = cfg->cos_bit_row;
2324     const TxfmFunc txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
2325     const TxfmFunc txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row);
2326     ASSERT(txfm_func_col != NULL);
2327     ASSERT(txfm_func_row != NULL);
2328     // use output buffer as temp buffer
2329     int32_t *temp_in  = output;
2330     int32_t *temp_out = output + txfm_size_row;
2331 
2332     // Columns
2333     for (c = 0; c < txfm_size_col; ++c) {
2334         if (cfg->ud_flip == 0)
2335             for (r = 0; r < txfm_size_row; ++r) temp_in[r] = input[r * input_stride + c];
2336         else {
2337             for (r = 0; r < txfm_size_row; ++r)
2338                 // flip upside down
2339                 temp_in[r] = input[(txfm_size_row - r - 1) * input_stride + c];
2340         }
2341         svt_av1_round_shift_array_c(
2342             temp_in, txfm_size_row, -shift[0]); // NM svt_av1_round_shift_array_c
2343         txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
2344         svt_av1_round_shift_array_c(
2345             temp_out, txfm_size_row, -shift[1]); // NM svt_av1_round_shift_array_c
2346         if (cfg->lr_flip == 0) {
2347             for (r = 0; r < txfm_size_row; ++r) buf[r * txfm_size_col + c] = temp_out[r];
2348         } else {
2349             for (r = 0; r < txfm_size_row; ++r)
2350                 // flip from left to right
2351                 buf[r * txfm_size_col + (txfm_size_col - c - 1)] = temp_out[r];
2352         }
2353     }
2354 
2355     // Rows
2356     for (r = 0; r < txfm_size_row; ++r) {
2357         txfm_func_row(
2358             buf + r * txfm_size_col, output + r * txfm_size_col, cos_bit_row, stage_range_row);
2359         svt_av1_round_shift_array_c(output + r * txfm_size_col, txfm_size_col, -shift[2]);
2360 
2361         if (abs(rect_type) == 1) {
2362             // Multiply everything by Sqrt2 if the transform is rectangular and the
2363             // size difference is a factor of 2.
2364             for (c = 0; c < txfm_size_col; ++c) {
2365                 output[r * txfm_size_col + c] = round_shift(
2366                     (int64_t)output[r * txfm_size_col + c] * new_sqrt2, new_sqrt2_bits);
2367             }
2368         }
2369     }
2370 }
/*
 * Partial-frequency 32-point forward DCT.
 *
 * Only the 16 lowest-frequency coefficients (output[0..15]) are computed.
 * Butterflies whose results would only feed coefficients 16..31 are skipped,
 * and output[16..31] is set to zero.
 *
 *   input        32 input samples (must not overlap output)
 *   output       32 entries; also used as ping-pong scratch between stages
 *   cos_bit      precision of the cosine table used by half_btf()
 *   stage_range  unused
 */
void av1_fdct32_pf_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                       const int8_t *stage_range) {
    (void)stage_range;
    // The cosine table depends only on cos_bit, so look it up once.
    const int32_t *cospi = cospi_arr(cos_bit);

    int32_t *bf0, *bf1;
    int32_t  step[32];

    // stage 0;

    // stage 1;
    bf1     = output;
    bf1[0]  = input[0] + input[31];
    bf1[1]  = input[1] + input[30];
    bf1[2]  = input[2] + input[29];
    bf1[3]  = input[3] + input[28];
    bf1[4]  = input[4] + input[27];
    bf1[5]  = input[5] + input[26];
    bf1[6]  = input[6] + input[25];
    bf1[7]  = input[7] + input[24];
    bf1[8]  = input[8] + input[23];
    bf1[9]  = input[9] + input[22];
    bf1[10] = input[10] + input[21];
    bf1[11] = input[11] + input[20];
    bf1[12] = input[12] + input[19];
    bf1[13] = input[13] + input[18];
    bf1[14] = input[14] + input[17];
    bf1[15] = input[15] + input[16];
    bf1[16] = -input[16] + input[15];
    bf1[17] = -input[17] + input[14];
    bf1[18] = -input[18] + input[13];
    bf1[19] = -input[19] + input[12];
    bf1[20] = -input[20] + input[11];
    bf1[21] = -input[21] + input[10];
    bf1[22] = -input[22] + input[9];
    bf1[23] = -input[23] + input[8];
    bf1[24] = -input[24] + input[7];
    bf1[25] = -input[25] + input[6];
    bf1[26] = -input[26] + input[5];
    bf1[27] = -input[27] + input[4];
    bf1[28] = -input[28] + input[3];
    bf1[29] = -input[29] + input[2];
    bf1[30] = -input[30] + input[1];
    bf1[31] = -input[31] + input[0];

    // stage 2
    bf0     = output;
    bf1     = step;
    bf1[0]  = bf0[0] + bf0[15];
    bf1[1]  = bf0[1] + bf0[14];
    bf1[2]  = bf0[2] + bf0[13];
    bf1[3]  = bf0[3] + bf0[12];
    bf1[4]  = bf0[4] + bf0[11];
    bf1[5]  = bf0[5] + bf0[10];
    bf1[6]  = bf0[6] + bf0[9];
    bf1[7]  = bf0[7] + bf0[8];
    bf1[8]  = -bf0[8] + bf0[7];
    bf1[9]  = -bf0[9] + bf0[6];
    bf1[10] = -bf0[10] + bf0[5];
    bf1[11] = -bf0[11] + bf0[4];
    bf1[12] = -bf0[12] + bf0[3];
    bf1[13] = -bf0[13] + bf0[2];
    bf1[14] = -bf0[14] + bf0[1];
    bf1[15] = -bf0[15] + bf0[0];
    bf1[16] = bf0[16];
    bf1[17] = bf0[17];
    bf1[18] = bf0[18];
    bf1[19] = bf0[19];
    bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
    bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
    bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
    bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
    bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
    bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
    bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
    bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
    bf1[28] = bf0[28];
    bf1[29] = bf0[29];
    bf1[30] = bf0[30];
    bf1[31] = bf0[31];

    // stage 3
    bf0     = step;
    bf1     = output;
    bf1[0]  = bf0[0] + bf0[7];
    bf1[1]  = bf0[1] + bf0[6];
    bf1[2]  = bf0[2] + bf0[5];
    bf1[3]  = bf0[3] + bf0[4];
    bf1[4]  = -bf0[4] + bf0[3];
    bf1[5]  = -bf0[5] + bf0[2];
    bf1[6]  = -bf0[6] + bf0[1];
    bf1[7]  = -bf0[7] + bf0[0];
    bf1[8]  = bf0[8];
    bf1[9]  = bf0[9];
    bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
    bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
    bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
    bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
    bf1[14] = bf0[14];
    bf1[15] = bf0[15];
    bf1[16] = bf0[16] + bf0[23];
    bf1[17] = bf0[17] + bf0[22];
    bf1[18] = bf0[18] + bf0[21];
    bf1[19] = bf0[19] + bf0[20];
    bf1[20] = -bf0[20] + bf0[19];
    bf1[21] = -bf0[21] + bf0[18];
    bf1[22] = -bf0[22] + bf0[17];
    bf1[23] = -bf0[23] + bf0[16];
    bf1[24] = -bf0[24] + bf0[31];
    bf1[25] = -bf0[25] + bf0[30];
    bf1[26] = -bf0[26] + bf0[29];
    bf1[27] = -bf0[27] + bf0[28];
    bf1[28] = bf0[28] + bf0[27];
    bf1[29] = bf0[29] + bf0[26];
    bf1[30] = bf0[30] + bf0[25];
    bf1[31] = bf0[31] + bf0[24];

    // stage 4
    bf0     = output;
    bf1     = step;
    bf1[0]  = bf0[0] + bf0[3];
    bf1[1]  = bf0[1] + bf0[2];
    bf1[2]  = -bf0[2] + bf0[1];
    bf1[3]  = -bf0[3] + bf0[0];
    bf1[4]  = bf0[4];
    bf1[5]  = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
    bf1[6]  = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
    bf1[7]  = bf0[7];
    bf1[8]  = bf0[8] + bf0[11];
    bf1[9]  = bf0[9] + bf0[10];
    bf1[10] = -bf0[10] + bf0[9];
    bf1[11] = -bf0[11] + bf0[8];
    bf1[12] = -bf0[12] + bf0[15];
    bf1[13] = -bf0[13] + bf0[14];
    bf1[14] = bf0[14] + bf0[13];
    bf1[15] = bf0[15] + bf0[12];
    bf1[16] = bf0[16];
    bf1[17] = bf0[17];
    bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
    bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
    bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
    bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
    bf1[22] = bf0[22];
    bf1[23] = bf0[23];
    bf1[24] = bf0[24];
    bf1[25] = bf0[25];
    bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
    bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
    bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
    bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
    bf1[30] = bf0[30];
    bf1[31] = bf0[31];

    // stage 5
    // Entries 1 and 3 would only feed coefficients 16 and 24; skipped.
    bf0     = step;
    bf1     = output;
    bf1[0]  = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
    bf1[2]  = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
    bf1[4]  = bf0[4] + bf0[5];
    bf1[5]  = -bf0[5] + bf0[4];
    bf1[6]  = -bf0[6] + bf0[7];
    bf1[7]  = bf0[7] + bf0[6];
    bf1[8]  = bf0[8];
    bf1[9]  = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
    bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
    bf1[11] = bf0[11];
    bf1[12] = bf0[12];
    bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
    bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
    bf1[15] = bf0[15];
    bf1[16] = bf0[16] + bf0[19];
    bf1[17] = bf0[17] + bf0[18];
    bf1[18] = -bf0[18] + bf0[17];
    bf1[19] = -bf0[19] + bf0[16];
    bf1[20] = -bf0[20] + bf0[23];
    bf1[21] = -bf0[21] + bf0[22];
    bf1[22] = bf0[22] + bf0[21];
    bf1[23] = bf0[23] + bf0[20];
    bf1[24] = bf0[24] + bf0[27];
    bf1[25] = bf0[25] + bf0[26];
    bf1[26] = -bf0[26] + bf0[25];
    bf1[27] = -bf0[27] + bf0[24];
    bf1[28] = -bf0[28] + bf0[31];
    bf1[29] = -bf0[29] + bf0[30];
    bf1[30] = bf0[30] + bf0[29];
    bf1[31] = bf0[31] + bf0[28];

    // stage 6
    // Entries 1, 3, 5 and 7 would only feed coefficients >= 16; skipped.
    bf0     = output;
    bf1     = step;
    bf1[0]  = bf0[0];
    bf1[2]  = bf0[2];
    bf1[4]  = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
    bf1[6]  = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
    bf1[8]  = bf0[8] + bf0[9];
    bf1[9]  = -bf0[9] + bf0[8];
    bf1[10] = -bf0[10] + bf0[11];
    bf1[11] = bf0[11] + bf0[10];
    bf1[12] = bf0[12] + bf0[13];
    bf1[13] = -bf0[13] + bf0[12];
    bf1[14] = -bf0[14] + bf0[15];
    bf1[15] = bf0[15] + bf0[14];
    bf1[16] = bf0[16];
    bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
    bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
    bf1[19] = bf0[19];
    bf1[20] = bf0[20];
    bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
    bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
    bf1[23] = bf0[23];
    bf1[24] = bf0[24];
    bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
    bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
    bf1[27] = bf0[27];
    bf1[28] = bf0[28];
    bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
    bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
    bf1[31] = bf0[31];

    // stage 7
    // Odd entries 1..15 would only feed coefficients >= 16; skipped.
    bf0     = step;
    bf1     = output;
    bf1[0]  = bf0[0];
    bf1[2]  = bf0[2];
    bf1[4]  = bf0[4];
    bf1[6]  = bf0[6];
    bf1[8]  = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
    bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
    bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
    bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
    bf1[16] = bf0[16] + bf0[17];
    bf1[17] = -bf0[17] + bf0[16];
    bf1[18] = -bf0[18] + bf0[19];
    bf1[19] = bf0[19] + bf0[18];
    bf1[20] = bf0[20] + bf0[21];
    bf1[21] = -bf0[21] + bf0[20];
    bf1[22] = -bf0[22] + bf0[23];
    bf1[23] = bf0[23] + bf0[22];
    bf1[24] = bf0[24] + bf0[25];
    bf1[25] = -bf0[25] + bf0[24];
    bf1[26] = -bf0[26] + bf0[27];
    bf1[27] = bf0[27] + bf0[26];
    bf1[28] = bf0[28] + bf0[29];
    bf1[29] = -bf0[29] + bf0[28];
    bf1[30] = -bf0[30] + bf0[31];
    bf1[31] = bf0[31] + bf0[30];

    // stage 8
    // Every odd entry would only feed coefficients >= 16; skipped. After this
    // stage only the even entries of step[] hold final values.
    bf0     = output;
    bf1     = step;
    bf1[0]  = bf0[0];
    bf1[2]  = bf0[2];
    bf1[4]  = bf0[4];
    bf1[6]  = bf0[6];
    bf1[8]  = bf0[8];
    bf1[10] = bf0[10];
    bf1[12] = bf0[12];
    bf1[14] = bf0[14];
    bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
    bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
    bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
    bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
    bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
    bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
    bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
    bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);

    // stage 9
    // DCT output ordering: coefficient k comes from step[bit_reverse5(k)],
    // with no sign changes. For k < 16 the source index is always even, i.e.
    // exactly the entries produced by stage 8.
    bf0     = step;
    bf1     = output;
    bf1[0]  = bf0[0];
    bf1[1]  = bf0[16];
    bf1[2]  = bf0[8];
    bf1[3]  = bf0[24];
    bf1[4]  = bf0[4];
    bf1[5]  = bf0[20];
    bf1[6]  = bf0[12];
    bf1[7]  = bf0[28];
    bf1[8]  = bf0[2];
    bf1[9]  = bf0[18];
    bf1[10] = bf0[10];
    bf1[11] = bf0[26];
    bf1[12] = bf0[6];
    bf1[13] = bf0[22];
    bf1[14] = bf0[14];
    bf1[15] = bf0[30];

    // The high-frequency half is not computed. Zero it so callers never see
    // the stage-7 scratch values that output[] was holding.
    for (int32_t i = 16; i < 32; ++i) { bf1[i] = 0; }
}
set_fwd_txfm_non_scale_range(Txfm2dFlipCfg * cfg)2711 static INLINE void set_fwd_txfm_non_scale_range(Txfm2dFlipCfg *cfg) {
2712     av1_zero(cfg->stage_range_col);
2713     av1_zero(cfg->stage_range_row);
2714 
2715     const int8_t *range_mult2_col = fwd_txfm_range_mult2_list[cfg->txfm_type_col];
2716     if (cfg->txfm_type_col != TXFM_TYPE_INVALID) {
2717         int stage_num_col = cfg->stage_num_col;
2718         for (int i = 0; i < stage_num_col; ++i)
2719             cfg->stage_range_col[i] = (range_mult2_col[i] + 1) >> 1;
2720     }
2721 
2722     if (cfg->txfm_type_row != TXFM_TYPE_INVALID) {
2723         int           stage_num_row   = cfg->stage_num_row;
2724         const int8_t *range_mult2_row = fwd_txfm_range_mult2_list[cfg->txfm_type_row];
2725         for (int i = 0; i < stage_num_row; ++i) {
2726             cfg->stage_range_row[i] = (range_mult2_col[cfg->stage_num_col - 1] +
2727                                        range_mult2_row[i] + 1) >>
2728                 1;
2729         }
2730     }
2731 }
/*
 * Populates a forward 2-D transform configuration for the given transform
 * type and size: flip flags, shift table, cosine bit precisions, the 1-D
 * column/row transform types with their stage counts, and the stage ranges.
 *
 *   tx_type  2-D transform type; selects the vertical/horizontal 1-D types
 *   tx_size  transform block size
 *   cfg      configuration to fill in; must not be NULL
 */
void av1_transform_config(TxType tx_type, TxSize tx_size, Txfm2dFlipCfg *cfg) {
    assert(cfg != NULL);

    cfg->tx_size = tx_size;
    set_flip_cfg(tx_type, cfg);
    cfg->shift = fwd_txfm_shift_ls[tx_size];

    // Table indices relative to the smallest transform size.
    const int32_t w_idx = tx_size_wide_log2[tx_size] - tx_size_wide_log2[0];
    const int32_t h_idx = tx_size_high_log2[tx_size] - tx_size_high_log2[0];

    cfg->cos_bit_col = fwd_cos_bit_col[w_idx][h_idx];
    cfg->cos_bit_row = fwd_cos_bit_row[w_idx][h_idx];

    // The column transform runs along the height, the row transform along the
    // width, hence the swapped size indices.
    const TxType1D vert_1d = vtx_tab[tx_type];
    const TxType1D horz_1d = htx_tab[tx_type];

    cfg->txfm_type_col = av1_txfm_type_ls[h_idx][vert_1d];
    cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col];

    cfg->txfm_type_row = av1_txfm_type_ls[w_idx][horz_1d];
    cfg->stage_num_row = av1_txfm_stage_num_list[cfg->txfm_type_row];

    set_fwd_txfm_non_scale_range(cfg);
}
2749 
/*
 * Returns the sum of squared coefficients over a rectangular area.
 *
 *   coeff         first coefficient of the area (only read)
 *   coeff_stride  distance, in elements, between the starts of two rows
 *   area_width    number of coefficients summed per row
 *   area_height   number of rows
 *
 * Squares are formed in 64 bits, so a full-range int32_t coefficient cannot
 * overflow the multiplication. An empty area yields 0.
 */
static uint64_t energy_computation(const int32_t *coeff, uint32_t coeff_stride, uint32_t area_width,
                                   uint32_t area_height) {
    uint64_t energy = 0;

    for (uint32_t row = 0; row < area_height; ++row) {
        // Index from the base instead of advancing the pointer, so no pointer
        // beyond the last row is ever formed.
        const size_t row_start = (size_t)row * coeff_stride;
        for (uint32_t col = 0; col < area_width; ++col) {
            const int64_t value = coeff[row_start + col];
            energy += (uint64_t)(value * value);
        }
    }

    return energy;
}
2762 
/*
 * Post-processes a 64x64 coefficient block (64 coefficients per row).
 *
 * Measures the energy outside the top-left 32x32 quadrant, zeroes those
 * coefficients, then packs the kept quadrant into the first 32x32 entries of
 * output. Returns the energy that was zeroed out.
 */
uint64_t svt_handle_transform64x64_c(int32_t *output) {
    int32_t *const right_top   = output + 32;
    int32_t *const bottom_half = output + 32 * 64;

    // Energy of the top-right 32x32 area, then of the bottom 64x32 area.
    uint64_t discarded_energy = energy_computation(right_top, 64, 32, 32);
    discarded_energy += energy_computation(bottom_half, 64, 64, 32);

    // Clear the top-right 32x32 area one row at a time.
    for (int32_t r = 0; r < 32; ++r) { memset(right_top + r * 64, 0, 32 * sizeof(*output)); }

    // The bottom 64x32 area is contiguous and can be cleared in one call.
    memset(bottom_half, 0, 32 * 64 * sizeof(*output));

    // Pack the kept rows so they are 32 apart instead of 64; row 0 is already
    // in place.
    for (int32_t r = 1; r < 32; ++r) {
        svt_memcpy_c(output + r * 32, output + r * 64, 32 * sizeof(*output));
    }

    return discarded_energy;
}
2783 
/*
 * C forward 2-D transform for 64x64 blocks: builds the TX_64X64
 * configuration for transform_type and runs the generic 2-D core.
 */
void svt_av1_transform_two_d_64x64_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                     TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       scratch[64 * 64]; // intermediate buffer handed to the core

    av1_transform_config(transform_type, TX_64X64, &txfm_cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &txfm_cfg, scratch, bit_depth);
}
2794 
/*
 * C forward 2-D transform for 32x32 blocks: builds the TX_32X32
 * configuration for transform_type and runs the generic 2-D core.
 */
void svt_av1_transform_two_d_32x32_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                     TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       scratch[32 * 32]; // intermediate buffer handed to the core

    av1_transform_config(transform_type, TX_32X32, &txfm_cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &txfm_cfg, scratch, bit_depth);
}
/*
 * C forward 2-D transform for 16x16 blocks: builds the TX_16X16
 * configuration for transform_type and runs the generic 2-D core.
 */
void svt_av1_transform_two_d_16x16_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                     TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       scratch[16 * 16]; // intermediate buffer handed to the core

    av1_transform_config(transform_type, TX_16X16, &txfm_cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &txfm_cfg, scratch, bit_depth);
}
2815 
/*
 * C forward 2-D transform for 8x8 blocks: builds the TX_8X8 configuration
 * for transform_type and runs the generic 2-D core.
 */
void svt_av1_transform_two_d_8x8_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                   TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       scratch[8 * 8]; // intermediate buffer handed to the core

    av1_transform_config(transform_type, TX_8X8, &txfm_cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &txfm_cfg, scratch, bit_depth);
}
2826 
/*
 * C forward 2-D transform for 4x4 blocks: builds the TX_4X4 configuration
 * for transform_type and runs the generic 2-D core.
 */
void svt_av1_transform_two_d_4x4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                   TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       scratch[4 * 4]; // intermediate buffer handed to the core

    av1_transform_config(transform_type, TX_4X4, &txfm_cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &txfm_cfg, scratch, bit_depth);
}
2837 
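/*********************************************************************
* Forward 2-D transforms for rectangular block sizes, and helpers that
* post-process the output of transforms with a 64-point dimension
*********************************************************************/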
/*
 * C forward 2-D transform for TX_64X32 blocks: builds the configuration for
 * transform_type and runs the generic 2-D core.
 */
void svt_av1_fwd_txfm2d_64x32_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       scratch[64 * 32]; // intermediate buffer handed to the core

    av1_transform_config(transform_type, TX_64X32, &txfm_cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &txfm_cfg, scratch, bit_depth);
}
2850 
/*
 * Post-processes a 64x32 coefficient block (64 coefficients per row, 32 rows).
 *
 * Measures the energy of the right 32x32 half, zeroes it, then packs the
 * left half into the first 32x32 entries of output. Returns the energy that
 * was zeroed out.
 */
uint64_t svt_handle_transform64x32_c(int32_t *output) {
    int32_t *const right_half = output + 32;

    const uint64_t discarded_energy = energy_computation(right_half, 64, 32, 32);

    // Clear the right 32x32 half one row at a time.
    for (int32_t r = 0; r < 32; ++r) { memset(right_half + r * 64, 0, 32 * sizeof(*output)); }

    // Pack the kept rows so they are 32 apart instead of 64; row 0 is already
    // in place.
    for (int32_t r = 1; r < 32; ++r) {
        svt_memcpy_c(output + r * 32, output + r * 64, 32 * sizeof(*output));
    }

    return discarded_energy;
}
2864 
/*
 * C forward 2-D transform for TX_32X64 blocks: builds the configuration for
 * transform_type and runs the generic 2-D core.
 */
void svt_av1_fwd_txfm2d_32x64_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       scratch[32 * 64]; // intermediate buffer handed to the core

    av1_transform_config(transform_type, TX_32X64, &txfm_cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &txfm_cfg, scratch, bit_depth);
}
2876 
/*
 * Post-processes a 32x64 coefficient block (32 coefficients per row, 64 rows).
 *
 * Measures the energy of the bottom 32x32 half and zeroes it. The top half
 * is already contiguous, so nothing is moved. Returns the energy that was
 * zeroed out.
 */
uint64_t svt_handle_transform32x64_c(int32_t *output) {
    int32_t *const bottom_half = output + 32 * 32;

    const uint64_t discarded_energy = energy_computation(bottom_half, 32, 32, 32);

    memset(bottom_half, 0, 32 * 32 * sizeof(*output));

    return discarded_energy;
}
2886 
/*
 * C forward 2-D transform for TX_64X16 blocks: builds the configuration for
 * transform_type and runs the generic 2-D core.
 */
void svt_av1_fwd_txfm2d_64x16_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       scratch[64 * 16]; // intermediate buffer handed to the core

    av1_transform_config(transform_type, TX_64X16, &txfm_cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &txfm_cfg, scratch, bit_depth);
}
2896 
/*
 * Post-processes a 64x16 coefficient block (64 coefficients per row, 16 rows).
 *
 * Measures the energy of the right 32x16 half, zeroes it, then packs the
 * left half into the first 32x16 entries of output. Returns the energy that
 * was zeroed out.
 */
uint64_t svt_handle_transform64x16_c(int32_t *output) {
    int32_t *const right_half = output + 32;

    const uint64_t discarded_energy = energy_computation(right_half, 64, 32, 16);

    // Clear the right 32x16 half one row at a time.
    for (int32_t r = 0; r < 16; ++r) { memset(right_half + r * 64, 0, 32 * sizeof(*output)); }

    // Pack the kept rows so they are 32 apart instead of 64; row 0 is already
    // in place.
    for (int32_t r = 1; r < 16; ++r) {
        svt_memcpy_c(output + r * 32, output + r * 64, 32 * sizeof(*output));
    }

    return discarded_energy;
}
2910 
/*
 * C forward 2-D transform for TX_16X64 blocks: builds the configuration for
 * transform_type and runs the generic 2-D core.
 */
void svt_av1_fwd_txfm2d_16x64_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       scratch[16 * 64]; // intermediate buffer handed to the core

    av1_transform_config(transform_type, TX_16X64, &txfm_cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &txfm_cfg, scratch, bit_depth);
}
2922 
/*
 * Post-processes a 16x64 coefficient block (16 coefficients per row, 64 rows).
 *
 * Measures the energy of the bottom 16x32 half and zeroes it. The top half
 * is already contiguous, so nothing is moved. Returns the energy that was
 * zeroed out.
 */
uint64_t svt_handle_transform16x64_c(int32_t *output) {
    int32_t *const bottom_half = output + 16 * 32;

    const uint64_t discarded_energy = energy_computation(bottom_half, 16, 16, 32);

    memset(bottom_half, 0, 16 * 32 * sizeof(*output));

    return discarded_energy;
}
2932 
/*
 * N2/N4 handler for 16x64 blocks: leaves the coefficients untouched and
 * reports no discarded energy.
 */
uint64_t handle_transform16x64_N2_N4_c(int32_t *output) {
    const uint64_t discarded_energy = 0;

    (void)output; // intentionally unused
    return discarded_energy;
}
2937 
/*
 * N2/N4 handler for 32x64 blocks: leaves the coefficients untouched and
 * reports no discarded energy.
 */
uint64_t handle_transform32x64_N2_N4_c(int32_t *output) {
    const uint64_t discarded_energy = 0;

    (void)output; // intentionally unused
    return discarded_energy;
}
2942 
/*
 * N2/N4 handler for 64x16 blocks: packs the left 32 coefficients of each of
 * the 16 rows into the first 32x16 entries of output. Nothing is measured or
 * zeroed, so the reported energy is always 0.
 */
uint64_t handle_transform64x16_N2_N4_c(int32_t *output) {
    // Row 0 is already in place; move rows 1..15 from a 64 to a 32 row pitch.
    for (int32_t r = 1; r < 16; ++r) {
        int32_t       *packed_row = output + r * 32;
        const int32_t *wide_row   = output + r * 64;
        svt_memcpy_c(packed_row, wide_row, 32 * sizeof(*output));
    }

    return 0;
}
2950 
/*
 * N2/N4 handler for 64x32 blocks: packs the left 32 coefficients of each of
 * the 32 rows into the first 32x32 entries of output. Nothing is measured or
 * zeroed, so the reported energy is always 0.
 */
uint64_t handle_transform64x32_N2_N4_c(int32_t *output) {
    // Row 0 is already in place; move rows 1..31 from a 64 to a 32 row pitch.
    for (int32_t r = 1; r < 32; ++r) {
        int32_t       *packed_row = output + r * 32;
        const int32_t *wide_row   = output + r * 64;
        svt_memcpy_c(packed_row, wide_row, 32 * sizeof(*output));
    }

    return 0;
}
2958 
/*
 * N2/N4 handler for 64x64 blocks: packs the left 32 coefficients of the first
 * 32 rows into the first 32x32 entries of output. Nothing is measured or
 * zeroed, so the reported energy is always 0.
 */
uint64_t handle_transform64x64_N2_N4_c(int32_t *output) {
    // Row 0 is already in place; move rows 1..31 from a 64 to a 32 row pitch.
    for (int32_t r = 1; r < 32; ++r) {
        int32_t       *packed_row = output + r * 32;
        const int32_t *wide_row   = output + r * 64;
        svt_memcpy_c(packed_row, wide_row, 32 * sizeof(*output));
    }

    return 0;
}
/*
 * C forward 2-D transform for TX_32X16 blocks: builds the configuration for
 * transform_type and runs the generic 2-D core.
 */
void svt_av1_fwd_txfm2d_32x16_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       scratch[32 * 16]; // intermediate buffer handed to the core

    av1_transform_config(transform_type, TX_32X16, &txfm_cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &txfm_cfg, scratch, bit_depth);
}
2974 
/*
 * C forward 2-D transform for TX_16X32 blocks: builds the configuration for
 * transform_type and runs the generic 2-D core.
 */
void svt_av1_fwd_txfm2d_16x32_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       scratch[16 * 32]; // intermediate buffer handed to the core

    av1_transform_config(transform_type, TX_16X32, &txfm_cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &txfm_cfg, scratch, bit_depth);
}
2983 
/*
 * C forward 2-D transform for TX_16X8 blocks: builds the configuration for
 * transform_type and runs the generic 2-D core.
 */
void svt_av1_fwd_txfm2d_16x8_c(int16_t *input, int32_t *output, uint32_t input_stride,
                               TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       scratch[16 * 8]; // intermediate buffer handed to the core

    av1_transform_config(transform_type, TX_16X8, &txfm_cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &txfm_cfg, scratch, bit_depth);
}
2992 
/*
 * C forward 2-D transform for TX_8X16 blocks: builds the configuration for
 * transform_type and runs the generic 2-D core.
 */
void svt_av1_fwd_txfm2d_8x16_c(int16_t *input, int32_t *output, uint32_t input_stride,
                               TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       scratch[8 * 16]; // intermediate buffer handed to the core

    av1_transform_config(transform_type, TX_8X16, &txfm_cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &txfm_cfg, scratch, bit_depth);
}
3001 
/*
 * Forward 2D transform for a TX_32X8 block.
 * Fills a Txfm2dFlipCfg through av1_transform_config() (noted in the original
 * as av1_get_fwd_txfm_cfg) and hands it to av1_tranform_two_d_core_c() (noted
 * as fwd_txfm2d_c), together with a local 32 * 8 entry int32_t buffer that
 * is passed as the intermediate transform buffer.
 */
void svt_av1_fwd_txfm2d_32x8_c(int16_t *input, int32_t *output, uint32_t input_stride,
                               TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       txfm_buf[32 * 8];

    av1_transform_config(transform_type, TX_32X8, &txfm_cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &txfm_cfg, txfm_buf, bit_depth);
}
3010 
/*
 * Forward 2D transform for a TX_8X32 block.
 * Fills a Txfm2dFlipCfg through av1_transform_config() (noted in the original
 * as av1_get_fwd_txfm_cfg) and hands it to av1_tranform_two_d_core_c() (noted
 * as fwd_txfm2d_c), together with a local 8 * 32 entry int32_t buffer that
 * is passed as the intermediate transform buffer.
 */
void svt_av1_fwd_txfm2d_8x32_c(int16_t *input, int32_t *output, uint32_t input_stride,
                               TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       txfm_buf[8 * 32];

    av1_transform_config(transform_type, TX_8X32, &txfm_cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &txfm_cfg, txfm_buf, bit_depth);
}
3019 
/*
 * Forward 2D transform for a TX_16X4 block.
 * Fills a Txfm2dFlipCfg through av1_transform_config() (noted in the original
 * as av1_get_fwd_txfm_cfg) and hands it to av1_tranform_two_d_core_c() (noted
 * as fwd_txfm2d_c), together with a local 16 * 4 entry int32_t buffer that
 * is passed as the intermediate transform buffer.
 */
void svt_av1_fwd_txfm2d_16x4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                               TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       txfm_buf[16 * 4];

    av1_transform_config(transform_type, TX_16X4, &txfm_cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &txfm_cfg, txfm_buf, bit_depth);
}
3028 
/*
 * Forward 2D transform for a TX_4X16 block.
 * Fills a Txfm2dFlipCfg through av1_transform_config() (noted in the original
 * as av1_get_fwd_txfm_cfg) and hands it to av1_tranform_two_d_core_c() (noted
 * as fwd_txfm2d_c), together with a local 4 * 16 entry int32_t buffer that
 * is passed as the intermediate transform buffer.
 */
void svt_av1_fwd_txfm2d_4x16_c(int16_t *input, int32_t *output, uint32_t input_stride,
                               TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       txfm_buf[4 * 16];

    av1_transform_config(transform_type, TX_4X16, &txfm_cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &txfm_cfg, txfm_buf, bit_depth);
}
3037 
/*
 * Forward 2D transform for a TX_8X4 block.
 * Fills a Txfm2dFlipCfg through av1_transform_config() (noted in the original
 * as av1_get_fwd_txfm_cfg) and hands it to av1_tranform_two_d_core_c() (noted
 * as fwd_txfm2d_c), together with a local 8 * 4 entry int32_t buffer that
 * is passed as the intermediate transform buffer.
 */
void svt_av1_fwd_txfm2d_8x4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                              TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       txfm_buf[8 * 4];

    av1_transform_config(transform_type, TX_8X4, &txfm_cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &txfm_cfg, txfm_buf, bit_depth);
}
3046 
/*
 * Forward 2D transform for a TX_4X8 block.
 * Fills a Txfm2dFlipCfg through av1_transform_config() (noted in the original
 * as av1_get_fwd_txfm_cfg) and hands it to av1_tranform_two_d_core_c() (noted
 * as fwd_txfm2d_c), together with a local 4 * 8 entry int32_t buffer that
 * is passed as the intermediate transform buffer.
 */
void svt_av1_fwd_txfm2d_4x8_c(int16_t *input, int32_t *output, uint32_t input_stride,
                              TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg txfm_cfg;
    int32_t       txfm_buf[4 * 8];

    av1_transform_config(transform_type, TX_4X8, &txfm_cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &txfm_cfg, txfm_buf, bit_depth);
}
av1_estimate_transform_N2(int16_t * residual_buffer,uint32_t residual_stride,int32_t * coeff_buffer,uint32_t coeff_stride,TxSize transform_size,uint64_t * three_quad_energy,uint32_t bit_depth,TxType transform_type,PlaneType component_type)3055 static EbErrorType av1_estimate_transform_N2(int16_t *residual_buffer, uint32_t residual_stride,
3056                                              int32_t *coeff_buffer, uint32_t coeff_stride,
3057                                              TxSize transform_size, uint64_t *three_quad_energy,
3058                                              uint32_t bit_depth, TxType transform_type,
3059                                              PlaneType component_type)
3060 
3061 {
3062     EbErrorType return_error = EB_ErrorNone;
3063 
3064     (void)coeff_stride;
3065     (void)component_type;
3066 
3067     switch (transform_size) {
3068     case TX_64X32:
3069         if (transform_type == DCT_DCT)
3070             svt_av1_fwd_txfm2d_64x32_N2(
3071                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3072         else
3073             svt_av1_fwd_txfm2d_64x32_N2_c(
3074                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3075 
3076         *three_quad_energy = handle_transform64x32_N2_N4(coeff_buffer);
3077 
3078         break;
3079 
3080     case TX_32X64:
3081         if (transform_type == DCT_DCT)
3082             svt_av1_fwd_txfm2d_32x64_N2(
3083                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3084         else
3085             svt_av1_fwd_txfm2d_32x64_N2_c(
3086                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3087 
3088         *three_quad_energy = handle_transform32x64_N2_N4(coeff_buffer);
3089 
3090         break;
3091 
3092     case TX_64X16:
3093         if (transform_type == DCT_DCT)
3094             svt_av1_fwd_txfm2d_64x16_N2(
3095                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3096         else
3097             svt_av1_fwd_txfm2d_64x16_N2_c(
3098                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3099 
3100         *three_quad_energy = handle_transform64x16_N2_N4(coeff_buffer);
3101 
3102         break;
3103 
3104     case TX_16X64:
3105         if (transform_type == DCT_DCT)
3106             svt_av1_fwd_txfm2d_16x64_N2(
3107                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3108         else
3109             svt_av1_fwd_txfm2d_16x64_N2_c(
3110                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3111 
3112         *three_quad_energy = handle_transform16x64_N2_N4(coeff_buffer);
3113 
3114         break;
3115 
3116     case TX_32X16:
3117         // TTK
3118         if ((transform_type == DCT_DCT) || (transform_type == IDTX))
3119             svt_av1_fwd_txfm2d_32x16_N2(
3120                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3121         else
3122             svt_av1_fwd_txfm2d_32x16_N2_c(
3123                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3124         break;
3125 
3126     case TX_16X32:
3127         if ((transform_type == DCT_DCT) || (transform_type == IDTX))
3128             svt_av1_fwd_txfm2d_16x32_N2(
3129                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3130         else
3131             svt_av1_fwd_txfm2d_16x32_N2_c(
3132                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3133         break;
3134 
3135     case TX_16X8:
3136         svt_av1_fwd_txfm2d_16x8_N2(
3137             residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3138         break;
3139 
3140     case TX_8X16:
3141         svt_av1_fwd_txfm2d_8x16_N2(
3142             residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3143         break;
3144 
3145     case TX_32X8:
3146         if ((transform_type == DCT_DCT) || (transform_type == IDTX))
3147             svt_av1_fwd_txfm2d_32x8_N2(
3148                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3149         else
3150             svt_av1_fwd_txfm2d_32x8_N2_c(
3151                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3152         break;
3153 
3154     case TX_8X32:
3155         if ((transform_type == DCT_DCT) || (transform_type == IDTX))
3156             svt_av1_fwd_txfm2d_8x32_N2(
3157                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3158         else
3159             svt_av1_fwd_txfm2d_8x32_N2_c(
3160                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3161         break;
3162     case TX_16X4:
3163         svt_av1_fwd_txfm2d_16x4_N2(
3164             residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3165         break;
3166     case TX_4X16:
3167         svt_av1_fwd_txfm2d_4x16_N2(
3168             residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3169         break;
3170     case TX_8X4:
3171 
3172         svt_av1_fwd_txfm2d_8x4_N2(
3173             residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3174 
3175         break;
3176     case TX_4X8:
3177 
3178         svt_av1_fwd_txfm2d_4x8_N2(
3179             residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3180 
3181         break;
3182 
3183     case TX_64X64:
3184 
3185         svt_av1_fwd_txfm2d_64x64_N2(
3186             residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3187 
3188         *three_quad_energy = handle_transform64x64_N2_N4(coeff_buffer);
3189 
3190         break;
3191 
3192     case TX_32X32:
3193         if (transform_type == V_DCT || transform_type == H_DCT || transform_type == V_ADST ||
3194             transform_type == H_ADST || transform_type == V_FLIPADST ||
3195             transform_type == H_FLIPADST)
3196             // Tahani: I believe those cases are never hit
3197             av1_transform_two_d_32x32_N2_c(
3198                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3199 
3200         else {
3201             svt_av1_fwd_txfm2d_32x32_N2(
3202                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3203         }
3204 
3205         break;
3206 
3207     case TX_16X16:
3208 
3209         svt_av1_fwd_txfm2d_16x16_N2(
3210             residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3211 
3212         break;
3213     case TX_8X8:
3214 
3215         svt_av1_fwd_txfm2d_8x8_N2(
3216             residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3217 
3218         break;
3219     case TX_4X4:
3220 
3221         svt_av1_fwd_txfm2d_4x4_N2(
3222             residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3223 
3224         break;
3225     default: assert(0); break;
3226     }
3227 
3228     return return_error;
3229 }
3230 
av1_estimate_transform_N4(int16_t * residual_buffer,uint32_t residual_stride,int32_t * coeff_buffer,uint32_t coeff_stride,TxSize transform_size,uint64_t * three_quad_energy,uint32_t bit_depth,TxType transform_type,PlaneType component_type)3231 static EbErrorType av1_estimate_transform_N4(int16_t *residual_buffer, uint32_t residual_stride,
3232                                              int32_t *coeff_buffer, uint32_t coeff_stride,
3233                                              TxSize transform_size, uint64_t *three_quad_energy,
3234                                              uint32_t bit_depth, TxType transform_type,
3235                                              PlaneType component_type)
3236 
3237 {
3238     EbErrorType return_error = EB_ErrorNone;
3239 
3240     (void)coeff_stride;
3241     (void)component_type;
3242 
3243     switch (transform_size) {
3244     case TX_64X32:
3245         if (transform_type == DCT_DCT)
3246             svt_av1_fwd_txfm2d_64x32_N4(
3247                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3248         else
3249             svt_av1_fwd_txfm2d_64x32_N4_c(
3250                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3251 
3252         *three_quad_energy = handle_transform64x32_N2_N4(coeff_buffer);
3253 
3254         break;
3255 
3256     case TX_32X64:
3257         if (transform_type == DCT_DCT)
3258             svt_av1_fwd_txfm2d_32x64_N4(
3259                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3260         else
3261             svt_av1_fwd_txfm2d_32x64_N4_c(
3262                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3263 
3264         *three_quad_energy = handle_transform32x64_N2_N4(coeff_buffer);
3265 
3266         break;
3267 
3268     case TX_64X16:
3269         if (transform_type == DCT_DCT)
3270             svt_av1_fwd_txfm2d_64x16_N4(
3271                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3272         else
3273             svt_av1_fwd_txfm2d_64x16_N4_c(
3274                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3275 
3276         *three_quad_energy = handle_transform64x16_N2_N4(coeff_buffer);
3277 
3278         break;
3279 
3280     case TX_16X64:
3281         if (transform_type == DCT_DCT)
3282             svt_av1_fwd_txfm2d_16x64_N4(
3283                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3284         else
3285             svt_av1_fwd_txfm2d_16x64_N4_c(
3286                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3287 
3288         *three_quad_energy = handle_transform16x64_N2_N4(coeff_buffer);
3289 
3290         break;
3291 
3292     case TX_32X16:
3293         // TTK
3294         if ((transform_type == DCT_DCT) || (transform_type == IDTX))
3295             svt_av1_fwd_txfm2d_32x16_N4(
3296                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3297         else
3298             svt_av1_fwd_txfm2d_32x16_N4_c(
3299                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3300         break;
3301 
3302     case TX_16X32:
3303         if ((transform_type == DCT_DCT) || (transform_type == IDTX))
3304             svt_av1_fwd_txfm2d_16x32_N4(
3305                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3306         else
3307             svt_av1_fwd_txfm2d_16x32_N4_c(
3308                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3309         break;
3310 
3311     case TX_16X8:
3312         svt_av1_fwd_txfm2d_16x8_N4(
3313             residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3314         break;
3315 
3316     case TX_8X16:
3317         svt_av1_fwd_txfm2d_8x16_N4(
3318             residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3319         break;
3320 
3321     case TX_32X8:
3322         if ((transform_type == DCT_DCT) || (transform_type == IDTX))
3323             svt_av1_fwd_txfm2d_32x8_N4(
3324                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3325         else
3326             svt_av1_fwd_txfm2d_32x8_N4_c(
3327                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3328         break;
3329 
3330     case TX_8X32:
3331         if ((transform_type == DCT_DCT) || (transform_type == IDTX))
3332             svt_av1_fwd_txfm2d_8x32_N4(
3333                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3334         else
3335             svt_av1_fwd_txfm2d_8x32_N4_c(
3336                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3337         break;
3338     case TX_16X4:
3339         svt_av1_fwd_txfm2d_16x4_N4(
3340             residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3341         break;
3342     case TX_4X16:
3343         svt_av1_fwd_txfm2d_4x16_N4(
3344             residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3345         break;
3346     case TX_8X4:
3347 
3348         svt_av1_fwd_txfm2d_8x4_N4(
3349             residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3350 
3351         break;
3352     case TX_4X8:
3353 
3354         svt_av1_fwd_txfm2d_4x8_N4(
3355             residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3356 
3357         break;
3358 
3359     case TX_64X64:
3360 
3361         svt_av1_fwd_txfm2d_64x64_N4(
3362             residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3363 
3364         *three_quad_energy = handle_transform64x64_N2_N4(coeff_buffer);
3365 
3366         break;
3367 
3368     case TX_32X32:
3369         if (transform_type == V_DCT || transform_type == H_DCT || transform_type == V_ADST ||
3370             transform_type == H_ADST || transform_type == V_FLIPADST ||
3371             transform_type == H_FLIPADST)
3372             // Tahani: I believe those cases are never hit
3373             av1_transform_two_d_32x32_N4_c(
3374                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3375 
3376         else {
3377             svt_av1_fwd_txfm2d_32x32_N4(
3378                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3379         }
3380 
3381         break;
3382 
3383     case TX_16X16:
3384 
3385         svt_av1_fwd_txfm2d_16x16_N4(
3386             residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3387 
3388         break;
3389     case TX_8X8:
3390 
3391         svt_av1_fwd_txfm2d_8x8_N4(
3392             residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3393 
3394         break;
3395     case TX_4X4:
3396 
3397         svt_av1_fwd_txfm2d_4x4_N4(
3398             residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3399 
3400         break;
3401     default: assert(0); break;
3402     }
3403 
3404     return return_error;
3405 }
3406 
av1_estimate_transform_ONLY_DC(int16_t * residual_buffer,uint32_t residual_stride,int32_t * coeff_buffer,uint32_t coeff_stride,TxSize transform_size,uint64_t * three_quad_energy,uint32_t bit_depth,TxType transform_type,PlaneType component_type)3407 static EbErrorType av1_estimate_transform_ONLY_DC(int16_t *residual_buffer,
3408                                                   uint32_t residual_stride, int32_t *coeff_buffer,
3409                                                   uint32_t coeff_stride, TxSize transform_size,
3410                                                   uint64_t *three_quad_energy, uint32_t bit_depth,
3411                                                   TxType transform_type, PlaneType component_type)
3412 
3413 {
3414     EbErrorType return_error = av1_estimate_transform_N4(residual_buffer,
3415                                                          residual_stride,
3416                                                          coeff_buffer,
3417                                                          coeff_stride,
3418                                                          transform_size,
3419                                                          three_quad_energy,
3420                                                          bit_depth,
3421                                                          transform_type,
3422                                                          component_type);
3423 
3424     for (int i = 1; i < (tx_size_wide[transform_size] * tx_size_high[transform_size]); i++) {
3425         if (i % tx_size_wide[transform_size] < (tx_size_wide[transform_size] >> 2) ||
3426             i / tx_size_wide[transform_size] < (tx_size_high[transform_size] >> 2)) {
3427             coeff_buffer[i] = 0;
3428         }
3429     }
3430     return return_error;
3431 }
3432 
av1_estimate_transform_default(int16_t * residual_buffer,uint32_t residual_stride,int32_t * coeff_buffer,uint32_t coeff_stride,TxSize transform_size,uint64_t * three_quad_energy,uint32_t bit_depth,TxType transform_type,PlaneType component_type)3433 EbErrorType av1_estimate_transform_default(int16_t *residual_buffer, uint32_t residual_stride,
3434                                            int32_t *coeff_buffer, uint32_t coeff_stride,
3435                                            TxSize transform_size, uint64_t *three_quad_energy,
3436                                            uint32_t bit_depth, TxType transform_type,
3437                                            PlaneType component_type)
3438 
3439 {
3440     EbErrorType return_error = EB_ErrorNone;
3441 
3442     (void)coeff_stride;
3443     (void)component_type;
3444 
3445     switch (transform_size) {
3446     case TX_64X32:
3447         if (transform_type == DCT_DCT)
3448             svt_av1_fwd_txfm2d_64x32(
3449                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3450         else
3451             svt_av1_fwd_txfm2d_64x32_c(
3452                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3453 
3454         *three_quad_energy = svt_handle_transform64x32(coeff_buffer);
3455 
3456         break;
3457 
3458     case TX_32X64:
3459         if (transform_type == DCT_DCT)
3460             svt_av1_fwd_txfm2d_32x64(
3461                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3462         else
3463             svt_av1_fwd_txfm2d_32x64_c(
3464                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3465 
3466         *three_quad_energy = svt_handle_transform32x64(coeff_buffer);
3467 
3468         break;
3469 
3470     case TX_64X16:
3471         if (transform_type == DCT_DCT)
3472             svt_av1_fwd_txfm2d_64x16(
3473                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3474         else
3475             svt_av1_fwd_txfm2d_64x16_c(
3476                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3477 
3478         *three_quad_energy = svt_handle_transform64x16(coeff_buffer);
3479 
3480         break;
3481 
3482     case TX_16X64:
3483         if (transform_type == DCT_DCT)
3484             svt_av1_fwd_txfm2d_16x64(
3485                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3486         else
3487             svt_av1_fwd_txfm2d_16x64_c(
3488                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3489 
3490         *three_quad_energy = svt_handle_transform16x64(coeff_buffer);
3491 
3492         break;
3493 
3494     case TX_32X16:
3495         // TTK
3496         if ((transform_type == DCT_DCT) || (transform_type == IDTX))
3497             svt_av1_fwd_txfm2d_32x16(
3498                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3499         else
3500             svt_av1_fwd_txfm2d_32x16_c(
3501                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3502         break;
3503 
3504     case TX_16X32:
3505         if ((transform_type == DCT_DCT) || (transform_type == IDTX))
3506             svt_av1_fwd_txfm2d_16x32(
3507                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3508         else
3509             svt_av1_fwd_txfm2d_16x32_c(
3510                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3511         break;
3512 
3513     case TX_16X8:
3514         svt_av1_fwd_txfm2d_16x8(
3515             residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3516         break;
3517 
3518     case TX_8X16:
3519         svt_av1_fwd_txfm2d_8x16(
3520             residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3521         break;
3522 
3523     case TX_32X8:
3524         if ((transform_type == DCT_DCT) || (transform_type == IDTX))
3525             svt_av1_fwd_txfm2d_32x8(
3526                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3527         else
3528             svt_av1_fwd_txfm2d_32x8_c(
3529                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3530         break;
3531 
3532     case TX_8X32:
3533         if ((transform_type == DCT_DCT) || (transform_type == IDTX))
3534             svt_av1_fwd_txfm2d_8x32(
3535                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3536         else
3537             svt_av1_fwd_txfm2d_8x32_c(
3538                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3539         break;
3540     case TX_16X4:
3541         svt_av1_fwd_txfm2d_16x4(
3542             residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3543         break;
3544     case TX_4X16:
3545         svt_av1_fwd_txfm2d_4x16(
3546             residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3547         break;
3548     case TX_8X4:
3549 
3550         svt_av1_fwd_txfm2d_8x4(
3551             residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3552 
3553         break;
3554     case TX_4X8:
3555 
3556         svt_av1_fwd_txfm2d_4x8(
3557             residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3558 
3559         break;
3560 
3561     case TX_64X64:
3562 
3563         svt_av1_fwd_txfm2d_64x64(
3564             residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3565 
3566         *three_quad_energy = svt_handle_transform64x64(coeff_buffer);
3567 
3568         break;
3569 
3570     case TX_32X32:
3571         if (transform_type == V_DCT || transform_type == H_DCT || transform_type == V_ADST ||
3572             transform_type == H_ADST || transform_type == V_FLIPADST ||
3573             transform_type == H_FLIPADST)
3574             // Tahani: I believe those cases are never hit
3575             svt_av1_transform_two_d_32x32_c(
3576                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3577 
3578         else {
3579             svt_av1_fwd_txfm2d_32x32(
3580                 residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3581         }
3582 
3583         break;
3584 
3585     case TX_16X16:
3586 
3587         svt_av1_fwd_txfm2d_16x16(
3588             residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3589 
3590         break;
3591     case TX_8X8:
3592 
3593         svt_av1_fwd_txfm2d_8x8(
3594             residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3595 
3596         break;
3597     case TX_4X4:
3598 
3599         svt_av1_fwd_txfm2d_4x4(
3600             residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
3601 
3602         break;
3603     default: assert(0); break;
3604     }
3605 
3606     return return_error;
3607 }
3608 /*********************************************************************
3609 * Transform
3610 *   Note there is an implicit assumption that TU Size <= PU Size,
3611 *   which is different than the HEVC requirements.
3612 *********************************************************************/
av1_estimate_transform(int16_t * residual_buffer,uint32_t residual_stride,int32_t * coeff_buffer,uint32_t coeff_stride,TxSize transform_size,uint64_t * three_quad_energy,uint32_t bit_depth,TxType transform_type,PlaneType component_type,EB_TRANS_COEFF_SHAPE trans_coeff_shape)3613 EbErrorType av1_estimate_transform(int16_t *residual_buffer, uint32_t residual_stride,
3614                                    int32_t *coeff_buffer, uint32_t coeff_stride,
3615                                    TxSize transform_size, uint64_t *three_quad_energy,
3616                                    uint32_t bit_depth, TxType transform_type,
3617                                    PlaneType component_type, EB_TRANS_COEFF_SHAPE trans_coeff_shape)
3618 
3619 {
3620     (void)trans_coeff_shape;
3621     (void)coeff_stride;
3622     (void)component_type;
3623     switch (trans_coeff_shape) {
3624     case DEFAULT_SHAPE:
3625         return av1_estimate_transform_default(residual_buffer,
3626                                               residual_stride,
3627                                               coeff_buffer,
3628                                               coeff_stride,
3629                                               transform_size,
3630                                               three_quad_energy,
3631                                               bit_depth,
3632                                               transform_type,
3633                                               component_type);
3634     case N2_SHAPE:
3635         return av1_estimate_transform_N2(residual_buffer,
3636                                          residual_stride,
3637                                          coeff_buffer,
3638                                          coeff_stride,
3639                                          transform_size,
3640                                          three_quad_energy,
3641                                          bit_depth,
3642                                          transform_type,
3643                                          component_type);
3644     case N4_SHAPE:
3645         return av1_estimate_transform_N4(residual_buffer,
3646                                          residual_stride,
3647                                          coeff_buffer,
3648                                          coeff_stride,
3649                                          transform_size,
3650                                          three_quad_energy,
3651                                          bit_depth,
3652                                          transform_type,
3653                                          component_type);
3654     case ONLY_DC_SHAPE:
3655         return av1_estimate_transform_ONLY_DC(residual_buffer,
3656                                               residual_stride,
3657                                               coeff_buffer,
3658                                               coeff_stride,
3659                                               transform_size,
3660                                               three_quad_energy,
3661                                               bit_depth,
3662                                               transform_type,
3663                                               component_type);
3664     }
3665 
3666     assert(0);
3667     return EB_ErrorBadParameter;
3668 }
3669 // PF_N4
/*
 * 64x64 N4 forward transform taking its settings from a TxfmParam.
 * Asserts that txfm_param->tx_type is DCT_DCT, then calls
 * svt_av1_fwd_txfm2d_64x64_N4() with DCT_DCT and txfm_param->bd, writing
 * into coeff viewed as int32_t.
 */
static void highbd_fwd_txfm_64x64_n4(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                     TxfmParam *txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);

    const int bit_depth = txfm_param->bd;
    svt_av1_fwd_txfm2d_64x64_N4(src_diff, (int32_t *)coeff, diff_stride, DCT_DCT, bit_depth);
}
3677 
/*
 * 32x64 N4 forward transform taking its settings from a TxfmParam.
 * Asserts that txfm_param->tx_type is DCT_DCT, then calls
 * svt_av1_fwd_txfm2d_32x64_N4() with txfm_param->tx_type and txfm_param->bd,
 * writing into coeff viewed as int32_t.
 */
static void highbd_fwd_txfm_32x64_n4(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                     TxfmParam *txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);

    const int bit_depth = txfm_param->bd;
    svt_av1_fwd_txfm2d_32x64_N4(
        src_diff, (int32_t *)coeff, diff_stride, txfm_param->tx_type, bit_depth);
}
3686 
// 64x32 forward transform wrapper (N4 variant). Asserts tx_type == DCT_DCT, then
// forwards txfm_param->tx_type and txfm_param->bd to svt_av1_fwd_txfm2d_64x32_N4,
// passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_64x32_n4(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                     TxfmParam *txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_64x32_N4(
        src_diff, (int32_t *)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
3695 
// 16x64 forward transform wrapper (N4 variant). Only DCT_DCT is expected (checked by
// assert); DCT_DCT is passed to svt_av1_fwd_txfm2d_16x64_N4 regardless of
// txfm_param->tx_type. coeff is handed over as an int32_t pointer.
static void highbd_fwd_txfm_16x64_n4(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                     TxfmParam *txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_16x64_N4(
        src_diff, (int32_t *)coeff, diff_stride, DCT_DCT, txfm_param->bd);
}
3703 
// 64x16 forward transform wrapper (N4 variant). Only DCT_DCT is expected (checked by
// assert); DCT_DCT is passed to svt_av1_fwd_txfm2d_64x16_N4 regardless of
// txfm_param->tx_type. coeff is handed over as an int32_t pointer.
static void highbd_fwd_txfm_64x16_n4(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                     TxfmParam *txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_64x16_N4(
        src_diff, (int32_t *)coeff, diff_stride, DCT_DCT, txfm_param->bd);
}
3711 
// 32x32 forward transform wrapper (N4 variant): forwards txfm_param->tx_type and
// txfm_param->bd to svt_av1_fwd_txfm2d_32x32_N4, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_32x32_n4(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                     TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_32x32_N4(
        src_diff, (int32_t *)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
3719 
// 16x16 forward transform wrapper (N4 variant): forwards txfm_param->tx_type and
// txfm_param->bd to svt_av1_fwd_txfm2d_16x16_N4, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_16x16_n4(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                     TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_16x16_N4(
        src_diff, (int32_t *)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
3727 
// 8x8 forward transform wrapper (N4 variant): forwards txfm_param->tx_type and
// txfm_param->bd to svt_av1_fwd_txfm2d_8x8_N4, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_8x8_n4(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                   TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_8x8_N4(
        src_diff, (int32_t *)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
3735 
// 4x8 forward transform wrapper (N4 variant): forwards txfm_param->tx_type and
// txfm_param->bd to svt_av1_fwd_txfm2d_4x8_N4, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_4x8_n4(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                   TxfmParam *txfm_param) {
    int32_t *const out       = (int32_t *)coeff;
    const int      bit_depth = txfm_param->bd;
    svt_av1_fwd_txfm2d_4x8_N4(src_diff, out, diff_stride, txfm_param->tx_type, bit_depth);
}
3742 
// 8x4 forward transform wrapper (N4 variant): forwards txfm_param->tx_type and
// txfm_param->bd to svt_av1_fwd_txfm2d_8x4_N4, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_8x4_n4(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                   TxfmParam *txfm_param) {
    int32_t *const out       = (int32_t *)coeff;
    const int      bit_depth = txfm_param->bd;
    svt_av1_fwd_txfm2d_8x4_N4(src_diff, out, diff_stride, txfm_param->tx_type, bit_depth);
}
3749 
// 8x16 forward transform wrapper (N4 variant): forwards txfm_param->tx_type and
// txfm_param->bd to svt_av1_fwd_txfm2d_8x16_N4, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_8x16_n4(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                    TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_8x16_N4(
        src_diff, (int32_t *)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
3757 
// 16x8 forward transform wrapper (N4 variant): forwards txfm_param->tx_type and
// txfm_param->bd to svt_av1_fwd_txfm2d_16x8_N4, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_16x8_n4(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                    TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_16x8_N4(
        src_diff, (int32_t *)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
3765 
// 16x32 forward transform wrapper (N4 variant): forwards txfm_param->tx_type and
// txfm_param->bd to svt_av1_fwd_txfm2d_16x32_N4, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_16x32_n4(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                     TxfmParam *txfm_param) {
    int32_t *const out       = (int32_t *)coeff;
    const int      bit_depth = txfm_param->bd;
    svt_av1_fwd_txfm2d_16x32_N4(src_diff, out, diff_stride, txfm_param->tx_type, bit_depth);
}
3772 
// 32x16 forward transform wrapper (N4 variant): forwards txfm_param->tx_type and
// txfm_param->bd to svt_av1_fwd_txfm2d_32x16_N4, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_32x16_n4(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                     TxfmParam *txfm_param) {
    int32_t *const out       = (int32_t *)coeff;
    const int      bit_depth = txfm_param->bd;
    svt_av1_fwd_txfm2d_32x16_N4(src_diff, out, diff_stride, txfm_param->tx_type, bit_depth);
}
3779 
// 4x16 forward transform wrapper used by the N4 dispatcher: forwards
// txfm_param->tx_type and txfm_param->bd to svt_av1_fwd_txfm2d_4x16, passing coeff
// as an int32_t pointer.
// NOTE(review): unlike the sibling *_n4 wrappers, this calls the plain
// svt_av1_fwd_txfm2d_4x16 rather than an _N4 kernel -- confirm this is intentional.
static void highbd_fwd_txfm_4x16_n4(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                    TxfmParam *txfm_param) {
    int32_t *const out       = (int32_t *)coeff;
    const int      bit_depth = txfm_param->bd;
    svt_av1_fwd_txfm2d_4x16(src_diff, out, diff_stride, txfm_param->tx_type, bit_depth);
}
3786 
// 16x4 forward transform wrapper (N4 variant): forwards txfm_param->tx_type and
// txfm_param->bd to svt_av1_fwd_txfm2d_16x4_N4, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_16x4_n4(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                    TxfmParam *txfm_param) {
    int32_t *const out       = (int32_t *)coeff;
    const int      bit_depth = txfm_param->bd;
    svt_av1_fwd_txfm2d_16x4_N4(src_diff, out, diff_stride, txfm_param->tx_type, bit_depth);
}
3793 
// 8x32 forward transform wrapper (N4 variant): forwards txfm_param->tx_type and
// txfm_param->bd to svt_av1_fwd_txfm2d_8x32_N4, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_8x32_n4(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                    TxfmParam *txfm_param) {
    int32_t *const out       = (int32_t *)coeff;
    const int      bit_depth = txfm_param->bd;
    svt_av1_fwd_txfm2d_8x32_N4(src_diff, out, diff_stride, txfm_param->tx_type, bit_depth);
}
3800 
// 32x8 forward transform wrapper (N4 variant): forwards txfm_param->tx_type and
// txfm_param->bd to svt_av1_fwd_txfm2d_32x8_N4, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_32x8_n4(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                    TxfmParam *txfm_param) {
    int32_t *const out       = (int32_t *)coeff;
    const int      bit_depth = txfm_param->bd;
    svt_av1_fwd_txfm2d_32x8_N4(src_diff, out, diff_stride, txfm_param->tx_type, bit_depth);
}
3807 
3808 //PF_N2
// 64x64 forward transform wrapper (N2 variant). Only DCT_DCT is expected (checked by
// assert); DCT_DCT is passed to svt_av1_fwd_txfm2d_64x64_N2 regardless of
// txfm_param->tx_type. coeff is handed over as an int32_t pointer.
static void highbd_fwd_txfm_64x64_n2(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                     TxfmParam *txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_64x64_N2(
        src_diff, (int32_t *)coeff, diff_stride, DCT_DCT, txfm_param->bd);
}
3816 
// 32x64 forward transform wrapper (N2 variant). Asserts tx_type == DCT_DCT, then
// forwards txfm_param->tx_type and txfm_param->bd to svt_av1_fwd_txfm2d_32x64_N2,
// passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_32x64_n2(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                     TxfmParam *txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_32x64_N2(
        src_diff, (int32_t *)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
3825 
// 64x32 forward transform wrapper (N2 variant). Asserts tx_type == DCT_DCT, then
// forwards txfm_param->tx_type and txfm_param->bd to svt_av1_fwd_txfm2d_64x32_N2,
// passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_64x32_n2(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                     TxfmParam *txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_64x32_N2(
        src_diff, (int32_t *)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
3834 
// 16x64 forward transform wrapper (N2 variant). Only DCT_DCT is expected (checked by
// assert); DCT_DCT is passed to svt_av1_fwd_txfm2d_16x64_N2 regardless of
// txfm_param->tx_type. coeff is handed over as an int32_t pointer.
static void highbd_fwd_txfm_16x64_n2(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                     TxfmParam *txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_16x64_N2(
        src_diff, (int32_t *)coeff, diff_stride, DCT_DCT, txfm_param->bd);
}
3842 
// 64x16 forward transform wrapper (N2 variant). Only DCT_DCT is expected (checked by
// assert); DCT_DCT is passed to svt_av1_fwd_txfm2d_64x16_N2 regardless of
// txfm_param->tx_type. coeff is handed over as an int32_t pointer.
static void highbd_fwd_txfm_64x16_n2(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                     TxfmParam *txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_64x16_N2(
        src_diff, (int32_t *)coeff, diff_stride, DCT_DCT, txfm_param->bd);
}
3850 
// 32x32 forward transform wrapper (N2 variant): forwards txfm_param->tx_type and
// txfm_param->bd to svt_av1_fwd_txfm2d_32x32_N2, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_32x32_n2(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                     TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_32x32_N2(
        src_diff, (int32_t *)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
3858 
// 16x16 forward transform wrapper (N2 variant): forwards txfm_param->tx_type and
// txfm_param->bd to svt_av1_fwd_txfm2d_16x16_N2, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_16x16_n2(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                     TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_16x16_N2(
        src_diff, (int32_t *)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
3866 
// 8x8 forward transform wrapper (N2 variant): forwards txfm_param->tx_type and
// txfm_param->bd to svt_av1_fwd_txfm2d_8x8_N2, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_8x8_n2(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                   TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_8x8_N2(
        src_diff, (int32_t *)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
3874 
// 4x8 forward transform wrapper (N2 variant): forwards txfm_param->tx_type and
// txfm_param->bd to svt_av1_fwd_txfm2d_4x8_N2, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_4x8_n2(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                   TxfmParam *txfm_param) {
    int32_t *const out       = (int32_t *)coeff;
    const int      bit_depth = txfm_param->bd;
    svt_av1_fwd_txfm2d_4x8_N2(src_diff, out, diff_stride, txfm_param->tx_type, bit_depth);
}
3881 
// 8x4 forward transform wrapper (N2 variant): forwards txfm_param->tx_type and
// txfm_param->bd to svt_av1_fwd_txfm2d_8x4_N2, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_8x4_n2(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                   TxfmParam *txfm_param) {
    int32_t *const out       = (int32_t *)coeff;
    const int      bit_depth = txfm_param->bd;
    svt_av1_fwd_txfm2d_8x4_N2(src_diff, out, diff_stride, txfm_param->tx_type, bit_depth);
}
3888 
// 8x16 forward transform wrapper (N2 variant): forwards txfm_param->tx_type and
// txfm_param->bd to svt_av1_fwd_txfm2d_8x16_N2, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_8x16_n2(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                    TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_8x16_N2(
        src_diff, (int32_t *)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
3896 
// 16x8 forward transform wrapper (N2 variant): forwards txfm_param->tx_type and
// txfm_param->bd to svt_av1_fwd_txfm2d_16x8_N2, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_16x8_n2(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                    TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_16x8_N2(
        src_diff, (int32_t *)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
3904 
// 16x32 forward transform wrapper (N2 variant): forwards txfm_param->tx_type and
// txfm_param->bd to svt_av1_fwd_txfm2d_16x32_N2, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_16x32_n2(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                     TxfmParam *txfm_param) {
    int32_t *const out       = (int32_t *)coeff;
    const int      bit_depth = txfm_param->bd;
    svt_av1_fwd_txfm2d_16x32_N2(src_diff, out, diff_stride, txfm_param->tx_type, bit_depth);
}
3911 
// 32x16 forward transform wrapper (N2 variant): forwards txfm_param->tx_type and
// txfm_param->bd to svt_av1_fwd_txfm2d_32x16_N2, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_32x16_n2(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                     TxfmParam *txfm_param) {
    int32_t *const out       = (int32_t *)coeff;
    const int      bit_depth = txfm_param->bd;
    svt_av1_fwd_txfm2d_32x16_N2(src_diff, out, diff_stride, txfm_param->tx_type, bit_depth);
}
3918 
// 4x16 forward transform wrapper used by the N2 dispatcher: forwards
// txfm_param->tx_type and txfm_param->bd to svt_av1_fwd_txfm2d_4x16, passing coeff
// as an int32_t pointer.
// NOTE(review): unlike the sibling *_n2 wrappers, this calls the plain
// svt_av1_fwd_txfm2d_4x16 rather than an _N2 kernel -- confirm this is intentional.
static void highbd_fwd_txfm_4x16_n2(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                    TxfmParam *txfm_param) {
    int32_t *const out       = (int32_t *)coeff;
    const int      bit_depth = txfm_param->bd;
    svt_av1_fwd_txfm2d_4x16(src_diff, out, diff_stride, txfm_param->tx_type, bit_depth);
}
3925 
// 16x4 forward transform wrapper (N2 variant): forwards txfm_param->tx_type and
// txfm_param->bd to svt_av1_fwd_txfm2d_16x4_N2, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_16x4_n2(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                    TxfmParam *txfm_param) {
    int32_t *const out       = (int32_t *)coeff;
    const int      bit_depth = txfm_param->bd;
    svt_av1_fwd_txfm2d_16x4_N2(src_diff, out, diff_stride, txfm_param->tx_type, bit_depth);
}
3932 
// 8x32 forward transform wrapper (N2 variant): forwards txfm_param->tx_type and
// txfm_param->bd to svt_av1_fwd_txfm2d_8x32_N2, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_8x32_n2(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                    TxfmParam *txfm_param) {
    int32_t *const out       = (int32_t *)coeff;
    const int      bit_depth = txfm_param->bd;
    svt_av1_fwd_txfm2d_8x32_N2(src_diff, out, diff_stride, txfm_param->tx_type, bit_depth);
}
3939 
// 32x8 forward transform wrapper (N2 variant): forwards txfm_param->tx_type and
// txfm_param->bd to svt_av1_fwd_txfm2d_32x8_N2, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_32x8_n2(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                    TxfmParam *txfm_param) {
    int32_t *const out       = (int32_t *)coeff;
    const int      bit_depth = txfm_param->bd;
    svt_av1_fwd_txfm2d_32x8_N2(src_diff, out, diff_stride, txfm_param->tx_type, bit_depth);
}
// 64x64 forward transform wrapper. Only DCT_DCT is expected (checked by assert);
// DCT_DCT is passed to svt_av1_fwd_txfm2d_64x64 regardless of txfm_param->tx_type.
// coeff is handed over as an int32_t pointer.
static void highbd_fwd_txfm_64x64(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                  TxfmParam *txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_64x64(src_diff, (int32_t *)coeff, diff_stride, DCT_DCT, txfm_param->bd);
}
3953 
// 32x64 forward transform wrapper. Asserts tx_type == DCT_DCT, then forwards
// txfm_param->tx_type and txfm_param->bd to svt_av1_fwd_txfm2d_32x64, passing coeff
// as an int32_t pointer.
static void highbd_fwd_txfm_32x64(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                  TxfmParam *txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_32x64(
        src_diff, (int32_t *)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
3961 
// 64x32 forward transform wrapper. Asserts tx_type == DCT_DCT, then forwards
// txfm_param->tx_type and txfm_param->bd to svt_av1_fwd_txfm2d_64x32, passing coeff
// as an int32_t pointer.
static void highbd_fwd_txfm_64x32(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                  TxfmParam *txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_64x32(
        src_diff, (int32_t *)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
3969 
// 16x64 forward transform wrapper. Only DCT_DCT is expected (checked by assert);
// DCT_DCT is passed to svt_av1_fwd_txfm2d_16x64 regardless of txfm_param->tx_type.
// coeff is handed over as an int32_t pointer.
static void highbd_fwd_txfm_16x64(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                  TxfmParam *txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_16x64(src_diff, (int32_t *)coeff, diff_stride, DCT_DCT, txfm_param->bd);
}
3977 
// 64x16 forward transform wrapper. Only DCT_DCT is expected (checked by assert);
// DCT_DCT is passed to svt_av1_fwd_txfm2d_64x16 regardless of txfm_param->tx_type.
// coeff is handed over as an int32_t pointer.
static void highbd_fwd_txfm_64x16(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                  TxfmParam *txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_64x16(src_diff, (int32_t *)coeff, diff_stride, DCT_DCT, txfm_param->bd);
}
3985 
// 32x32 forward transform wrapper: forwards txfm_param->tx_type and txfm_param->bd
// to svt_av1_fwd_txfm2d_32x32, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_32x32(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                  TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_32x32(
        src_diff, (int32_t *)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
3993 
// 16x16 forward transform wrapper: forwards txfm_param->tx_type and txfm_param->bd
// to svt_av1_fwd_txfm2d_16x16, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_16x16(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                  TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_16x16(
        src_diff, (int32_t *)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
4001 
// 8x8 forward transform wrapper: forwards txfm_param->tx_type and txfm_param->bd
// to svt_av1_fwd_txfm2d_8x8, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_8x8(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_8x8(
        src_diff, (int32_t *)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
4009 
// 4x8 forward transform wrapper: forwards txfm_param->tx_type and txfm_param->bd
// to svt_av1_fwd_txfm2d_4x8, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_4x8(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                TxfmParam *txfm_param) {
    int32_t *const out       = (int32_t *)coeff;
    const int      bit_depth = txfm_param->bd;
    svt_av1_fwd_txfm2d_4x8(src_diff, out, diff_stride, txfm_param->tx_type, bit_depth);
}
4015 
// 8x4 forward transform wrapper: forwards txfm_param->tx_type and txfm_param->bd
// to svt_av1_fwd_txfm2d_8x4, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_8x4(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                TxfmParam *txfm_param) {
    int32_t *const out       = (int32_t *)coeff;
    const int      bit_depth = txfm_param->bd;
    svt_av1_fwd_txfm2d_8x4(src_diff, out, diff_stride, txfm_param->tx_type, bit_depth);
}
4021 
// 8x16 forward transform wrapper: forwards txfm_param->tx_type and txfm_param->bd
// to svt_av1_fwd_txfm2d_8x16, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_8x16(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                 TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_8x16(
        src_diff, (int32_t *)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
4029 
// 16x8 forward transform wrapper: forwards txfm_param->tx_type and txfm_param->bd
// to svt_av1_fwd_txfm2d_16x8, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_16x8(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                 TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_16x8(
        src_diff, (int32_t *)coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
}
4037 
// 16x32 forward transform wrapper: forwards txfm_param->tx_type and txfm_param->bd
// to svt_av1_fwd_txfm2d_16x32, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_16x32(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                  TxfmParam *txfm_param) {
    int32_t *const out       = (int32_t *)coeff;
    const int      bit_depth = txfm_param->bd;
    svt_av1_fwd_txfm2d_16x32(src_diff, out, diff_stride, txfm_param->tx_type, bit_depth);
}
4043 
// 32x16 forward transform wrapper: forwards txfm_param->tx_type and txfm_param->bd
// to svt_av1_fwd_txfm2d_32x16, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_32x16(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                  TxfmParam *txfm_param) {
    int32_t *const out       = (int32_t *)coeff;
    const int      bit_depth = txfm_param->bd;
    svt_av1_fwd_txfm2d_32x16(src_diff, out, diff_stride, txfm_param->tx_type, bit_depth);
}
4049 
// 4x16 forward transform wrapper: forwards txfm_param->tx_type and txfm_param->bd
// to svt_av1_fwd_txfm2d_4x16, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_4x16(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                 TxfmParam *txfm_param) {
    int32_t *const out       = (int32_t *)coeff;
    const int      bit_depth = txfm_param->bd;
    svt_av1_fwd_txfm2d_4x16(src_diff, out, diff_stride, txfm_param->tx_type, bit_depth);
}
4055 
// 16x4 forward transform wrapper: forwards txfm_param->tx_type and txfm_param->bd
// to svt_av1_fwd_txfm2d_16x4, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_16x4(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                 TxfmParam *txfm_param) {
    int32_t *const out       = (int32_t *)coeff;
    const int      bit_depth = txfm_param->bd;
    svt_av1_fwd_txfm2d_16x4(src_diff, out, diff_stride, txfm_param->tx_type, bit_depth);
}
4061 
// 8x32 forward transform wrapper: forwards txfm_param->tx_type and txfm_param->bd
// to svt_av1_fwd_txfm2d_8x32, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_8x32(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                 TxfmParam *txfm_param) {
    int32_t *const out       = (int32_t *)coeff;
    const int      bit_depth = txfm_param->bd;
    svt_av1_fwd_txfm2d_8x32(src_diff, out, diff_stride, txfm_param->tx_type, bit_depth);
}
4067 
// 32x8 forward transform wrapper: forwards txfm_param->tx_type and txfm_param->bd
// to svt_av1_fwd_txfm2d_32x8, passing coeff as an int32_t pointer.
static void highbd_fwd_txfm_32x8(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                 TxfmParam *txfm_param) {
    int32_t *const out       = (int32_t *)coeff;
    const int      bit_depth = txfm_param->bd;
    svt_av1_fwd_txfm2d_32x8(src_diff, out, diff_stride, txfm_param->tx_type, bit_depth);
}
// Forward-transform entry point for the N4 wrappers: picks the per-size
// highbd_fwd_txfm_*_n4 helper from txfm_param->tx_size and calls it with the
// arguments unchanged.
// In debug builds it first asserts that tx_type is enabled for tx_set_type
// (av1_ext_tx_used lookup). TX_4X4 is accepted but runs no transform (the 4x4
// call is disabled), so coeff is not written here for that size. Any size not
// listed below trips assert(0).
void svt_av1_highbd_fwd_txfm_n4(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                TxfmParam *txfm_param) {
    assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
    switch (txfm_param->tx_size) {
    // Square sizes.
    case TX_4X4:
        // hack: highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, txfm_param) is not called.
        return;
    case TX_8X8: highbd_fwd_txfm_8x8_n4(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_16X16: highbd_fwd_txfm_16x16_n4(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_32X32: highbd_fwd_txfm_32x32_n4(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_64X64: highbd_fwd_txfm_64x64_n4(src_diff, coeff, diff_stride, txfm_param); return;
    // 1:2 and 2:1 rectangles.
    case TX_4X8: highbd_fwd_txfm_4x8_n4(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_8X4: highbd_fwd_txfm_8x4_n4(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_8X16: highbd_fwd_txfm_8x16_n4(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_16X8: highbd_fwd_txfm_16x8_n4(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_16X32: highbd_fwd_txfm_16x32_n4(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_32X16: highbd_fwd_txfm_32x16_n4(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_32X64: highbd_fwd_txfm_32x64_n4(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_64X32: highbd_fwd_txfm_64x32_n4(src_diff, coeff, diff_stride, txfm_param); return;
    // 1:4 and 4:1 rectangles.
    case TX_4X16: highbd_fwd_txfm_4x16_n4(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_16X4: highbd_fwd_txfm_16x4_n4(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_8X32: highbd_fwd_txfm_8x32_n4(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_32X8: highbd_fwd_txfm_32x8_n4(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_16X64: highbd_fwd_txfm_16x64_n4(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_64X16: highbd_fwd_txfm_64x16_n4(src_diff, coeff, diff_stride, txfm_param); return;
    default: assert(0); return;
    }
}
// Forward-transform entry point for the N2 wrappers: picks the per-size
// highbd_fwd_txfm_*_n2 helper from txfm_param->tx_size and calls it with the
// arguments unchanged.
// In debug builds it first asserts that tx_type is enabled for tx_set_type
// (av1_ext_tx_used lookup). TX_4X4 is accepted but runs no transform (the 4x4
// call is disabled), so coeff is not written here for that size. Any size not
// listed below trips assert(0).
void svt_av1_highbd_fwd_txfm_n2(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                TxfmParam *txfm_param) {
    assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
    switch (txfm_param->tx_size) {
    // Square sizes.
    case TX_4X4:
        // hack: highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, txfm_param) is not called.
        return;
    case TX_8X8: highbd_fwd_txfm_8x8_n2(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_16X16: highbd_fwd_txfm_16x16_n2(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_32X32: highbd_fwd_txfm_32x32_n2(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_64X64: highbd_fwd_txfm_64x64_n2(src_diff, coeff, diff_stride, txfm_param); return;
    // 1:2 and 2:1 rectangles.
    case TX_4X8: highbd_fwd_txfm_4x8_n2(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_8X4: highbd_fwd_txfm_8x4_n2(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_8X16: highbd_fwd_txfm_8x16_n2(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_16X8: highbd_fwd_txfm_16x8_n2(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_16X32: highbd_fwd_txfm_16x32_n2(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_32X16: highbd_fwd_txfm_32x16_n2(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_32X64: highbd_fwd_txfm_32x64_n2(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_64X32: highbd_fwd_txfm_64x32_n2(src_diff, coeff, diff_stride, txfm_param); return;
    // 1:4 and 4:1 rectangles.
    case TX_4X16: highbd_fwd_txfm_4x16_n2(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_16X4: highbd_fwd_txfm_16x4_n2(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_8X32: highbd_fwd_txfm_8x32_n2(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_32X8: highbd_fwd_txfm_32x8_n2(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_16X64: highbd_fwd_txfm_16x64_n2(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_64X16: highbd_fwd_txfm_64x16_n2(src_diff, coeff, diff_stride, txfm_param); return;
    default: assert(0); return;
    }
}
// Forward-transform entry point for the full-size wrappers: picks the per-size
// highbd_fwd_txfm_* helper from txfm_param->tx_size and calls it with the
// arguments unchanged.
// In debug builds it first asserts that tx_type is enabled for tx_set_type
// (av1_ext_tx_used lookup). TX_4X4 is accepted but runs no transform (the 4x4
// call is disabled), so coeff is not written here for that size. Any size not
// listed below trips assert(0).
void svt_av1_highbd_fwd_txfm(int16_t *src_diff, TranLow *coeff, int diff_stride,
                             TxfmParam *txfm_param) {
    assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
    switch (txfm_param->tx_size) {
    // Square sizes.
    case TX_4X4:
        // hack: highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, txfm_param) is not called.
        return;
    case TX_8X8: highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_16X16: highbd_fwd_txfm_16x16(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_32X32: highbd_fwd_txfm_32x32(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_64X64: highbd_fwd_txfm_64x64(src_diff, coeff, diff_stride, txfm_param); return;
    // 1:2 and 2:1 rectangles.
    case TX_4X8: highbd_fwd_txfm_4x8(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_8X4: highbd_fwd_txfm_8x4(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_8X16: highbd_fwd_txfm_8x16(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_16X8: highbd_fwd_txfm_16x8(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_16X32: highbd_fwd_txfm_16x32(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_32X16: highbd_fwd_txfm_32x16(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_32X64: highbd_fwd_txfm_32x64(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_64X32: highbd_fwd_txfm_64x32(src_diff, coeff, diff_stride, txfm_param); return;
    // 1:4 and 4:1 rectangles.
    case TX_4X16: highbd_fwd_txfm_4x16(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_16X4: highbd_fwd_txfm_16x4(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_8X32: highbd_fwd_txfm_8x32(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_32X8: highbd_fwd_txfm_32x8(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_16X64: highbd_fwd_txfm_16x64(src_diff, coeff, diff_stride, txfm_param); return;
    case TX_64X16: highbd_fwd_txfm_64x16(src_diff, coeff, diff_stride, txfm_param); return;
    default: assert(0); return;
    }
}
/*
 * Builds a local TxfmParam (DCT_DCT, non-lossless, EXT_TX_SET_ALL16, with the
 * caller's tx_size / bit_depth / is_hbd) and forwards the residual block to
 * one of the high-bit-depth forward-transform entry points.
 *
 *   src_diff  - residual samples handed through unchanged
 *   bw        - passed as the stride argument of the selected transform
 *   coeff     - destination coefficient buffer, handed through unchanged
 *   pf_shape  - N4_SHAPE selects the _n4 entry point, N2_SHAPE the _n2 entry
 *               point, every other value the full transform
 *
 * Only the six TxfmParam fields assigned below are initialised here.
 */
void svt_av1_wht_fwd_txfm(int16_t *src_diff, int bw, int32_t *coeff, TxSize tx_size,
                          EB_TRANS_COEFF_SHAPE pf_shape, int bit_depth, int is_hbd) {
    TxfmParam param;
    param.tx_size     = tx_size;
    param.tx_type     = DCT_DCT;
    param.tx_set_type = EXT_TX_SET_ALL16;
    param.lossless    = 0;
    param.is_hbd      = is_hbd;
    param.bd          = bit_depth;

    if (pf_shape == N4_SHAPE) {
        svt_av1_highbd_fwd_txfm_n4(src_diff, coeff, bw, &param);
    } else if (pf_shape == N2_SHAPE) {
        svt_av1_highbd_fwd_txfm_n2(src_diff, coeff, bw, &param);
    } else {
        svt_av1_highbd_fwd_txfm(src_diff, coeff, bw, &param);
    }
}
svt_av1_fidentity16_N2_c(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)4253 void svt_av1_fidentity16_N2_c(const int32_t *input, int32_t *output, int8_t cos_bit,
4254                               const int8_t *stage_range) {
4255     (void)stage_range;
4256     (void)cos_bit;
4257     for (int32_t i = 0; i < 8; ++i)
4258         output[i] = round_shift((int64_t)input[i] * 2 * new_sqrt2, new_sqrt2_bits);
4259 
4260     assert(stage_range[0] + new_sqrt2_bits <= 32);
4261 }
4262 
/*
 * 16-point ADST-style butterfly network that only produces the first 8 final
 * coefficients (output[0..7]).
 *
 * The network ping-pongs between `output` and a local scratch array, so
 * `output` must provide room for 16 entries; output[8..15] are left holding
 * stage-7 intermediates. `input` (16 samples) must not alias `output`
 * (asserted). stage_range is unused.
 */
void svt_av1_fadst16_new_N2(const int32_t *input, int32_t *output, int8_t cos_bit,
                            const int8_t *stage_range) {
    (void)stage_range;
    assert(output != input);

    // cos_bit is fixed for the whole call, so one table lookup serves every stage.
    const int32_t *const cospi = cospi_arr(cos_bit);
    int32_t              step[16];

    // stage 1: signed permutation, input -> output
    output[0]  = input[0];
    output[1]  = -input[15];
    output[2]  = -input[7];
    output[3]  = input[8];
    output[4]  = -input[3];
    output[5]  = input[12];
    output[6]  = input[4];
    output[7]  = -input[11];
    output[8]  = -input[1];
    output[9]  = input[14];
    output[10] = input[6];
    output[11] = -input[9];
    output[12] = input[2];
    output[13] = -input[13];
    output[14] = -input[5];
    output[15] = input[10];

    // stage 2: output -> step
    step[0]  = output[0];
    step[1]  = output[1];
    step[2]  = half_btf(cospi[32], output[2], cospi[32], output[3], cos_bit);
    step[3]  = half_btf(cospi[32], output[2], -cospi[32], output[3], cos_bit);
    step[4]  = output[4];
    step[5]  = output[5];
    step[6]  = half_btf(cospi[32], output[6], cospi[32], output[7], cos_bit);
    step[7]  = half_btf(cospi[32], output[6], -cospi[32], output[7], cos_bit);
    step[8]  = output[8];
    step[9]  = output[9];
    step[10] = half_btf(cospi[32], output[10], cospi[32], output[11], cos_bit);
    step[11] = half_btf(cospi[32], output[10], -cospi[32], output[11], cos_bit);
    step[12] = output[12];
    step[13] = output[13];
    step[14] = half_btf(cospi[32], output[14], cospi[32], output[15], cos_bit);
    step[15] = half_btf(cospi[32], output[14], -cospi[32], output[15], cos_bit);

    // stage 3: step -> output
    output[0]  = step[0] + step[2];
    output[1]  = step[1] + step[3];
    output[2]  = step[0] - step[2];
    output[3]  = step[1] - step[3];
    output[4]  = step[4] + step[6];
    output[5]  = step[5] + step[7];
    output[6]  = step[4] - step[6];
    output[7]  = step[5] - step[7];
    output[8]  = step[8] + step[10];
    output[9]  = step[9] + step[11];
    output[10] = step[8] - step[10];
    output[11] = step[9] - step[11];
    output[12] = step[12] + step[14];
    output[13] = step[13] + step[15];
    output[14] = step[12] - step[14];
    output[15] = step[13] - step[15];

    // stage 4: output -> step
    step[0]  = output[0];
    step[1]  = output[1];
    step[2]  = output[2];
    step[3]  = output[3];
    step[4]  = half_btf(cospi[16], output[4], cospi[48], output[5], cos_bit);
    step[5]  = half_btf(cospi[48], output[4], -cospi[16], output[5], cos_bit);
    step[6]  = half_btf(-cospi[48], output[6], cospi[16], output[7], cos_bit);
    step[7]  = half_btf(cospi[16], output[6], cospi[48], output[7], cos_bit);
    step[8]  = output[8];
    step[9]  = output[9];
    step[10] = output[10];
    step[11] = output[11];
    step[12] = half_btf(cospi[16], output[12], cospi[48], output[13], cos_bit);
    step[13] = half_btf(cospi[48], output[12], -cospi[16], output[13], cos_bit);
    step[14] = half_btf(-cospi[48], output[14], cospi[16], output[15], cos_bit);
    step[15] = half_btf(cospi[16], output[14], cospi[48], output[15], cos_bit);

    // stage 5: step -> output
    output[0]  = step[0] + step[4];
    output[1]  = step[1] + step[5];
    output[2]  = step[2] + step[6];
    output[3]  = step[3] + step[7];
    output[4]  = step[0] - step[4];
    output[5]  = step[1] - step[5];
    output[6]  = step[2] - step[6];
    output[7]  = step[3] - step[7];
    output[8]  = step[8] + step[12];
    output[9]  = step[9] + step[13];
    output[10] = step[10] + step[14];
    output[11] = step[11] + step[15];
    output[12] = step[8] - step[12];
    output[13] = step[9] - step[13];
    output[14] = step[10] - step[14];
    output[15] = step[11] - step[15];

    // stage 6: output -> step
    step[0]  = output[0];
    step[1]  = output[1];
    step[2]  = output[2];
    step[3]  = output[3];
    step[4]  = output[4];
    step[5]  = output[5];
    step[6]  = output[6];
    step[7]  = output[7];
    step[8]  = half_btf(cospi[8], output[8], cospi[56], output[9], cos_bit);
    step[9]  = half_btf(cospi[56], output[8], -cospi[8], output[9], cos_bit);
    step[10] = half_btf(cospi[40], output[10], cospi[24], output[11], cos_bit);
    step[11] = half_btf(cospi[24], output[10], -cospi[40], output[11], cos_bit);
    step[12] = half_btf(-cospi[56], output[12], cospi[8], output[13], cos_bit);
    step[13] = half_btf(cospi[8], output[12], cospi[56], output[13], cos_bit);
    step[14] = half_btf(-cospi[24], output[14], cospi[40], output[15], cos_bit);
    step[15] = half_btf(cospi[40], output[14], cospi[24], output[15], cos_bit);

    // stage 7: step -> output
    output[0]  = step[0] + step[8];
    output[1]  = step[1] + step[9];
    output[2]  = step[2] + step[10];
    output[3]  = step[3] + step[11];
    output[4]  = step[4] + step[12];
    output[5]  = step[5] + step[13];
    output[6]  = step[6] + step[14];
    output[7]  = step[7] + step[15];
    output[8]  = step[0] - step[8];
    output[9]  = step[1] - step[9];
    output[10] = step[2] - step[10];
    output[11] = step[3] - step[11];
    output[12] = step[4] - step[12];
    output[13] = step[5] - step[13];
    output[14] = step[6] - step[14];
    output[15] = step[7] - step[15];

    // stage 8: output -> step; only the eight entries consumed by stage 9 are computed
    step[1]  = half_btf(cospi[62], output[0], -cospi[2], output[1], cos_bit);
    step[3]  = half_btf(cospi[54], output[2], -cospi[10], output[3], cos_bit);
    step[5]  = half_btf(cospi[46], output[4], -cospi[18], output[5], cos_bit);
    step[7]  = half_btf(cospi[38], output[6], -cospi[26], output[7], cos_bit);
    step[8]  = half_btf(cospi[34], output[8], cospi[30], output[9], cos_bit);
    step[10] = half_btf(cospi[42], output[10], cospi[22], output[11], cos_bit);
    step[12] = half_btf(cospi[50], output[12], cospi[14], output[13], cos_bit);
    step[14] = half_btf(cospi[58], output[14], cospi[6], output[15], cos_bit);

    // stage 9: final reordering into output[0..7]
    output[0] = step[1];
    output[1] = step[14];
    output[2] = step[3];
    output[3] = step[12];
    output[4] = step[5];
    output[5] = step[10];
    output[6] = step[7];
    output[7] = step[8];
}
4441 
/*
 * 16-point DCT-style butterfly network that only produces the first 8 final
 * coefficients (output[0..7]).
 *
 * The network ping-pongs between `output` and a local scratch array, so
 * `output` must provide room for 16 entries; output[8..15] are left holding
 * stage-5 intermediates. Stage 1 reads input[k] after output[k] has already
 * been written, so the two buffers must not alias. stage_range is unused.
 */
void svt_av1_fdct16_new_N2(const int32_t *input, int32_t *output, int8_t cos_bit,
                           const int8_t *stage_range) {
    (void)stage_range;

    // cos_bit is fixed for the whole call, so one table lookup serves every stage.
    const int32_t *const cospi = cospi_arr(cos_bit);
    int32_t              step[16];

    // stage 1: input -> output
    output[0]  = input[0] + input[15];
    output[1]  = input[1] + input[14];
    output[2]  = input[2] + input[13];
    output[3]  = input[3] + input[12];
    output[4]  = input[4] + input[11];
    output[5]  = input[5] + input[10];
    output[6]  = input[6] + input[9];
    output[7]  = input[7] + input[8];
    output[8]  = -input[8] + input[7];
    output[9]  = -input[9] + input[6];
    output[10] = -input[10] + input[5];
    output[11] = -input[11] + input[4];
    output[12] = -input[12] + input[3];
    output[13] = -input[13] + input[2];
    output[14] = -input[14] + input[1];
    output[15] = -input[15] + input[0];

    // stage 2: output -> step
    step[0]  = output[0] + output[7];
    step[1]  = output[1] + output[6];
    step[2]  = output[2] + output[5];
    step[3]  = output[3] + output[4];
    step[4]  = -output[4] + output[3];
    step[5]  = -output[5] + output[2];
    step[6]  = -output[6] + output[1];
    step[7]  = -output[7] + output[0];
    step[8]  = output[8];
    step[9]  = output[9];
    step[10] = half_btf(-cospi[32], output[10], cospi[32], output[13], cos_bit);
    step[11] = half_btf(-cospi[32], output[11], cospi[32], output[12], cos_bit);
    step[12] = half_btf(cospi[32], output[12], cospi[32], output[11], cos_bit);
    step[13] = half_btf(cospi[32], output[13], cospi[32], output[10], cos_bit);
    step[14] = output[14];
    step[15] = output[15];

    // stage 3: step -> output
    output[0]  = step[0] + step[3];
    output[1]  = step[1] + step[2];
    output[2]  = -step[2] + step[1];
    output[3]  = -step[3] + step[0];
    output[4]  = step[4];
    output[5]  = half_btf(-cospi[32], step[5], cospi[32], step[6], cos_bit);
    output[6]  = half_btf(cospi[32], step[6], cospi[32], step[5], cos_bit);
    output[7]  = step[7];
    output[8]  = step[8] + step[11];
    output[9]  = step[9] + step[10];
    output[10] = -step[10] + step[9];
    output[11] = -step[11] + step[8];
    output[12] = -step[12] + step[15];
    output[13] = -step[13] + step[14];
    output[14] = step[14] + step[13];
    output[15] = step[15] + step[12];

    // stage 4: output -> step (entries 1 and 3 are not needed downstream)
    step[0]  = half_btf(cospi[32], output[0], cospi[32], output[1], cos_bit);
    step[2]  = half_btf(cospi[48], output[2], cospi[16], output[3], cos_bit);
    step[4]  = output[4] + output[5];
    step[5]  = -output[5] + output[4];
    step[6]  = -output[6] + output[7];
    step[7]  = output[7] + output[6];
    step[8]  = output[8];
    step[9]  = half_btf(-cospi[16], output[9], cospi[48], output[14], cos_bit);
    step[10] = half_btf(-cospi[48], output[10], -cospi[16], output[13], cos_bit);
    step[11] = output[11];
    step[12] = output[12];
    step[13] = half_btf(cospi[48], output[13], -cospi[16], output[10], cos_bit);
    step[14] = half_btf(cospi[16], output[14], cospi[48], output[9], cos_bit);
    step[15] = output[15];

    // stage 5: step -> output
    output[0]  = step[0];
    output[2]  = step[2];
    output[4]  = half_btf(cospi[56], step[4], cospi[8], step[7], cos_bit);
    output[6]  = half_btf(cospi[24], step[6], -cospi[40], step[5], cos_bit);
    output[8]  = step[8] + step[9];
    output[9]  = -step[9] + step[8];
    output[10] = -step[10] + step[11];
    output[11] = step[11] + step[10];
    output[12] = step[12] + step[13];
    output[13] = -step[13] + step[12];
    output[14] = -step[14] + step[15];
    output[15] = step[15] + step[14];

    // stage 6: output -> step; only even-indexed entries feed stage 7
    step[0]  = output[0];
    step[2]  = output[2];
    step[4]  = output[4];
    step[6]  = output[6];
    step[8]  = half_btf(cospi[60], output[8], cospi[4], output[15], cos_bit);
    step[10] = half_btf(cospi[44], output[10], cospi[20], output[13], cos_bit);
    step[12] = half_btf(cospi[12], output[12], -cospi[52], output[11], cos_bit);
    step[14] = half_btf(cospi[28], output[14], -cospi[36], output[9], cos_bit);

    // stage 7: final reordering into output[0..7]
    output[0] = step[0];
    output[1] = step[8];
    output[2] = step[4];
    output[3] = step[12];
    output[4] = step[2];
    output[5] = step[10];
    output[6] = step[6];
    output[7] = step[14];
}
4574 
/*
 * Doubles the first 4 input samples into output[0..3]. Nothing beyond
 * index 3 is read or written; cos_bit and stage_range are unused.
 */
void svt_av1_fidentity8_N2_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                             const int8_t *stage_range) {
    (void)cos_bit;
    (void)stage_range;

    int32_t k = 0;
    while (k < 4) {
        output[k] = 2 * input[k];
        ++k;
    }
}
4581 
/*
 * 8-point ADST-style butterfly network that only produces the first 4 final
 * coefficients (output[0..3]).
 *
 * `output` doubles as a work buffer and must provide room for 8 entries;
 * output[4..7] are left holding stage-5 intermediates. `input` (8 samples)
 * must not alias `output` (asserted). stage_range is unused.
 */
void svt_av1_fadst8_new_N2(const int32_t *input, int32_t *output, int8_t cos_bit,
                           const int8_t *stage_range) {
    (void)stage_range;
    assert(output != input);

    // cos_bit is fixed for the whole call, so one table lookup serves every stage.
    const int32_t *const cospi = cospi_arr(cos_bit);
    int32_t              step[8];

    // stage 1: signed permutation, input -> output
    output[0] = input[0];
    output[1] = -input[7];
    output[2] = -input[3];
    output[3] = input[4];
    output[4] = -input[1];
    output[5] = input[6];
    output[6] = input[2];
    output[7] = -input[5];

    // stage 2: output -> step
    step[0] = output[0];
    step[1] = output[1];
    step[2] = half_btf(cospi[32], output[2], cospi[32], output[3], cos_bit);
    step[3] = half_btf(cospi[32], output[2], -cospi[32], output[3], cos_bit);
    step[4] = output[4];
    step[5] = output[5];
    step[6] = half_btf(cospi[32], output[6], cospi[32], output[7], cos_bit);
    step[7] = half_btf(cospi[32], output[6], -cospi[32], output[7], cos_bit);

    // stage 3: step -> output
    output[0] = step[0] + step[2];
    output[1] = step[1] + step[3];
    output[2] = step[0] - step[2];
    output[3] = step[1] - step[3];
    output[4] = step[4] + step[6];
    output[5] = step[5] + step[7];
    output[6] = step[4] - step[6];
    output[7] = step[5] - step[7];

    // stage 4: output -> step
    step[0] = output[0];
    step[1] = output[1];
    step[2] = output[2];
    step[3] = output[3];
    step[4] = half_btf(cospi[16], output[4], cospi[48], output[5], cos_bit);
    step[5] = half_btf(cospi[48], output[4], -cospi[16], output[5], cos_bit);
    step[6] = half_btf(-cospi[48], output[6], cospi[16], output[7], cos_bit);
    step[7] = half_btf(cospi[16], output[6], cospi[48], output[7], cos_bit);

    // stage 5: step -> output
    output[0] = step[0] + step[4];
    output[1] = step[1] + step[5];
    output[2] = step[2] + step[6];
    output[3] = step[3] + step[7];
    output[4] = step[0] - step[4];
    output[5] = step[1] - step[5];
    output[6] = step[2] - step[6];
    output[7] = step[3] - step[7];

    // stage 6: output -> step; only the four entries consumed by stage 7 are computed
    step[1] = half_btf(cospi[60], output[0], -cospi[4], output[1], cos_bit);
    step[3] = half_btf(cospi[44], output[2], -cospi[20], output[3], cos_bit);
    step[4] = half_btf(cospi[36], output[4], cospi[28], output[5], cos_bit);
    step[6] = half_btf(cospi[52], output[6], cospi[12], output[7], cos_bit);

    // stage 7: final reordering into output[0..3]
    output[0] = step[1];
    output[1] = step[6];
    output[2] = step[3];
    output[3] = step[4];
}
4671 
/*
 * 8-point DCT-style butterfly network that only produces the first 4 final
 * coefficients (output[0..3]).
 *
 * `output` doubles as a work buffer and must provide room for 8 entries;
 * output[4..7] are left holding stage-3 intermediates. Stage 1 reads input[k]
 * after output[k] has already been written, so the two buffers must not
 * alias. stage_range is unused.
 */
void svt_av1_fdct8_new_N2(const int32_t *input, int32_t *output, int8_t cos_bit,
                          const int8_t *stage_range) {
    (void)stage_range;

    // cos_bit is fixed for the whole call, so one table lookup serves every stage.
    const int32_t *const cospi = cospi_arr(cos_bit);
    int32_t              step[8];

    // stage 1: input -> output
    output[0] = input[0] + input[7];
    output[1] = input[1] + input[6];
    output[2] = input[2] + input[5];
    output[3] = input[3] + input[4];
    output[4] = -input[4] + input[3];
    output[5] = -input[5] + input[2];
    output[6] = -input[6] + input[1];
    output[7] = -input[7] + input[0];

    // stage 2: output -> step
    step[0] = output[0] + output[3];
    step[1] = output[1] + output[2];
    step[2] = -output[2] + output[1];
    step[3] = -output[3] + output[0];
    step[4] = output[4];
    step[5] = half_btf(-cospi[32], output[5], cospi[32], output[6], cos_bit);
    step[6] = half_btf(cospi[32], output[6], cospi[32], output[5], cos_bit);
    step[7] = output[7];

    // stage 3: step -> output (entries 1 and 3 are not needed downstream)
    output[0] = half_btf(cospi[32], step[0], cospi[32], step[1], cos_bit);
    output[2] = half_btf(cospi[48], step[2], cospi[16], step[3], cos_bit);
    output[4] = step[4] + step[5];
    output[5] = -step[5] + step[4];
    output[6] = -step[6] + step[7];
    output[7] = step[7] + step[6];

    // stage 4: output -> step; only even-indexed entries feed stage 5
    step[0] = output[0];
    step[2] = output[2];
    step[4] = half_btf(cospi[56], output[4], cospi[8], output[7], cos_bit);
    step[6] = half_btf(cospi[24], output[6], -cospi[40], output[5], cos_bit);

    // stage 5: final reordering into output[0..3]
    output[0] = step[0];
    output[1] = step[4];
    output[2] = step[2];
    output[3] = step[6];
}
4734 
svt_av1_fidentity4_N2_c(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)4735 void svt_av1_fidentity4_N2_c(const int32_t *input, int32_t *output, int8_t cos_bit,
4736                              const int8_t *stage_range) {
4737     (void)stage_range;
4738     (void)cos_bit;
4739     output[0] = round_shift((int64_t)input[0] * new_sqrt2, new_sqrt2_bits);
4740     output[1] = round_shift((int64_t)input[1] * new_sqrt2, new_sqrt2_bits);
4741     assert(stage_range[0] + new_sqrt2_bits <= 32);
4742 }
4743 
/*
 * 4-point ADST-style kernel that only produces the first 2 coefficients.
 *
 * All four input samples are read. When every sample is zero the function
 * clears output[0..3] and returns early; otherwise only output[0] and
 * output[1] are written. `output` therefore needs room for 4 entries.
 * stage_range is unused.
 */
void svt_av1_fadst4_new_N2(const int32_t *input, int32_t *output, int8_t cos_bit,
                           const int8_t *stage_range) {
    (void)stage_range;
    const int32_t  bit   = cos_bit;
    const int32_t *sinpi = sinpi_arr(bit);

    // stage 0: load the samples
    const int32_t in0 = input[0];
    const int32_t in1 = input[1];
    const int32_t in2 = input[2];
    const int32_t in3 = input[3];

    // All-zero shortcut: this path clears all four output slots.
    if ((in0 | in1 | in2 | in3) == 0) {
        output[3] = 0;
        output[2] = 0;
        output[1] = 0;
        output[0] = 0;
        return;
    }

    // stage 1: per-sample sine products
    const int32_t prod0 = sinpi[1] * in0;
    const int32_t prod1 = sinpi[2] * in1;
    const int32_t prod2 = sinpi[3] * in2;
    const int32_t prod3 = sinpi[4] * in3;

    // stages 1-3 (second coefficient): sinpi[3] * (in0 + in1 - in3)
    const int32_t mix    = (in0 + in1) - in3;
    const int32_t coeff1 = sinpi[3] * mix;

    // stages 3-5 (first coefficient): the products are summed in the same order
    // as the staged form: ((prod0 + prod1) + prod3) + prod2
    const int32_t coeff0 = ((prod0 + prod1) + prod3) + prod2;

    // 1-D transform scaling factor is sqrt(2).
    output[0] = round_shift(coeff0, bit);
    output[1] = round_shift(coeff1, bit);
}
4787 
/*
 * 4-point DCT-style kernel that only produces the first 2 coefficients.
 * All four input samples are read before anything is stored; only output[0]
 * and output[1] are written. stage_range is unused.
 */
void svt_av1_fdct4_new_N2(const int32_t *input, int32_t *output, int8_t cos_bit,
                          const int8_t *stage_range) {
    (void)stage_range;
    const int32_t *const cospi = cospi_arr(cos_bit);

    // stage 1: mirrored sums and differences
    const int32_t sum03  = input[0] + input[3];
    const int32_t sum12  = input[1] + input[2];
    const int32_t diff12 = -input[2] + input[1];
    const int32_t diff03 = -input[3] + input[0];

    // stage 2: one rotation per retained coefficient
    output[0] = half_btf(cospi[32], sum03, cospi[32], sum12, cos_bit);
    output[1] = half_btf(cospi[48], diff12, cospi[16], diff03, cos_bit);
}
4809 
/*
 * 32-point DCT-style butterfly network that only produces the first 16 final
 * coefficients (output[0..15]).
 *
 * The network ping-pongs between `output` and a local scratch array, so
 * `output` must provide room for 32 entries; output[16..31] are left holding
 * stage-7 intermediates. Stage 1 reads input[k] after output[k] has already
 * been written, so the two buffers must not alias. stage_range is unused.
 */
void svt_av1_fdct32_new_N2(const int32_t *input, int32_t *output, int8_t cos_bit,
                           const int8_t *stage_range) {
    (void)stage_range;

    // cos_bit is fixed for the whole call, so one table lookup serves every stage.
    const int32_t *const cospi = cospi_arr(cos_bit);
    int32_t              step[32];

    // stage 1: input -> output
    output[0]  = input[0] + input[31];
    output[1]  = input[1] + input[30];
    output[2]  = input[2] + input[29];
    output[3]  = input[3] + input[28];
    output[4]  = input[4] + input[27];
    output[5]  = input[5] + input[26];
    output[6]  = input[6] + input[25];
    output[7]  = input[7] + input[24];
    output[8]  = input[8] + input[23];
    output[9]  = input[9] + input[22];
    output[10] = input[10] + input[21];
    output[11] = input[11] + input[20];
    output[12] = input[12] + input[19];
    output[13] = input[13] + input[18];
    output[14] = input[14] + input[17];
    output[15] = input[15] + input[16];
    output[16] = -input[16] + input[15];
    output[17] = -input[17] + input[14];
    output[18] = -input[18] + input[13];
    output[19] = -input[19] + input[12];
    output[20] = -input[20] + input[11];
    output[21] = -input[21] + input[10];
    output[22] = -input[22] + input[9];
    output[23] = -input[23] + input[8];
    output[24] = -input[24] + input[7];
    output[25] = -input[25] + input[6];
    output[26] = -input[26] + input[5];
    output[27] = -input[27] + input[4];
    output[28] = -input[28] + input[3];
    output[29] = -input[29] + input[2];
    output[30] = -input[30] + input[1];
    output[31] = -input[31] + input[0];

    // stage 2: output -> step
    step[0]  = output[0] + output[15];
    step[1]  = output[1] + output[14];
    step[2]  = output[2] + output[13];
    step[3]  = output[3] + output[12];
    step[4]  = output[4] + output[11];
    step[5]  = output[5] + output[10];
    step[6]  = output[6] + output[9];
    step[7]  = output[7] + output[8];
    step[8]  = -output[8] + output[7];
    step[9]  = -output[9] + output[6];
    step[10] = -output[10] + output[5];
    step[11] = -output[11] + output[4];
    step[12] = -output[12] + output[3];
    step[13] = -output[13] + output[2];
    step[14] = -output[14] + output[1];
    step[15] = -output[15] + output[0];
    step[16] = output[16];
    step[17] = output[17];
    step[18] = output[18];
    step[19] = output[19];
    step[20] = half_btf(-cospi[32], output[20], cospi[32], output[27], cos_bit);
    step[21] = half_btf(-cospi[32], output[21], cospi[32], output[26], cos_bit);
    step[22] = half_btf(-cospi[32], output[22], cospi[32], output[25], cos_bit);
    step[23] = half_btf(-cospi[32], output[23], cospi[32], output[24], cos_bit);
    step[24] = half_btf(cospi[32], output[24], cospi[32], output[23], cos_bit);
    step[25] = half_btf(cospi[32], output[25], cospi[32], output[22], cos_bit);
    step[26] = half_btf(cospi[32], output[26], cospi[32], output[21], cos_bit);
    step[27] = half_btf(cospi[32], output[27], cospi[32], output[20], cos_bit);
    step[28] = output[28];
    step[29] = output[29];
    step[30] = output[30];
    step[31] = output[31];

    // stage 3: step -> output
    output[0]  = step[0] + step[7];
    output[1]  = step[1] + step[6];
    output[2]  = step[2] + step[5];
    output[3]  = step[3] + step[4];
    output[4]  = -step[4] + step[3];
    output[5]  = -step[5] + step[2];
    output[6]  = -step[6] + step[1];
    output[7]  = -step[7] + step[0];
    output[8]  = step[8];
    output[9]  = step[9];
    output[10] = half_btf(-cospi[32], step[10], cospi[32], step[13], cos_bit);
    output[11] = half_btf(-cospi[32], step[11], cospi[32], step[12], cos_bit);
    output[12] = half_btf(cospi[32], step[12], cospi[32], step[11], cos_bit);
    output[13] = half_btf(cospi[32], step[13], cospi[32], step[10], cos_bit);
    output[14] = step[14];
    output[15] = step[15];
    output[16] = step[16] + step[23];
    output[17] = step[17] + step[22];
    output[18] = step[18] + step[21];
    output[19] = step[19] + step[20];
    output[20] = -step[20] + step[19];
    output[21] = -step[21] + step[18];
    output[22] = -step[22] + step[17];
    output[23] = -step[23] + step[16];
    output[24] = -step[24] + step[31];
    output[25] = -step[25] + step[30];
    output[26] = -step[26] + step[29];
    output[27] = -step[27] + step[28];
    output[28] = step[28] + step[27];
    output[29] = step[29] + step[26];
    output[30] = step[30] + step[25];
    output[31] = step[31] + step[24];

    // stage 4: output -> step
    step[0]  = output[0] + output[3];
    step[1]  = output[1] + output[2];
    step[2]  = -output[2] + output[1];
    step[3]  = -output[3] + output[0];
    step[4]  = output[4];
    step[5]  = half_btf(-cospi[32], output[5], cospi[32], output[6], cos_bit);
    step[6]  = half_btf(cospi[32], output[6], cospi[32], output[5], cos_bit);
    step[7]  = output[7];
    step[8]  = output[8] + output[11];
    step[9]  = output[9] + output[10];
    step[10] = -output[10] + output[9];
    step[11] = -output[11] + output[8];
    step[12] = -output[12] + output[15];
    step[13] = -output[13] + output[14];
    step[14] = output[14] + output[13];
    step[15] = output[15] + output[12];
    step[16] = output[16];
    step[17] = output[17];
    step[18] = half_btf(-cospi[16], output[18], cospi[48], output[29], cos_bit);
    step[19] = half_btf(-cospi[16], output[19], cospi[48], output[28], cos_bit);
    step[20] = half_btf(-cospi[48], output[20], -cospi[16], output[27], cos_bit);
    step[21] = half_btf(-cospi[48], output[21], -cospi[16], output[26], cos_bit);
    step[22] = output[22];
    step[23] = output[23];
    step[24] = output[24];
    step[25] = output[25];
    step[26] = half_btf(cospi[48], output[26], -cospi[16], output[21], cos_bit);
    step[27] = half_btf(cospi[48], output[27], -cospi[16], output[20], cos_bit);
    step[28] = half_btf(cospi[16], output[28], cospi[48], output[19], cos_bit);
    step[29] = half_btf(cospi[16], output[29], cospi[48], output[18], cos_bit);
    step[30] = output[30];
    step[31] = output[31];

    // stage 5: step -> output (entries 1 and 3 are not needed downstream)
    output[0]  = half_btf(cospi[32], step[0], cospi[32], step[1], cos_bit);
    output[2]  = half_btf(cospi[48], step[2], cospi[16], step[3], cos_bit);
    output[4]  = step[4] + step[5];
    output[5]  = -step[5] + step[4];
    output[6]  = -step[6] + step[7];
    output[7]  = step[7] + step[6];
    output[8]  = step[8];
    output[9]  = half_btf(-cospi[16], step[9], cospi[48], step[14], cos_bit);
    output[10] = half_btf(-cospi[48], step[10], -cospi[16], step[13], cos_bit);
    output[11] = step[11];
    output[12] = step[12];
    output[13] = half_btf(cospi[48], step[13], -cospi[16], step[10], cos_bit);
    output[14] = half_btf(cospi[16], step[14], cospi[48], step[9], cos_bit);
    output[15] = step[15];
    output[16] = step[16] + step[19];
    output[17] = step[17] + step[18];
    output[18] = -step[18] + step[17];
    output[19] = -step[19] + step[16];
    output[20] = -step[20] + step[23];
    output[21] = -step[21] + step[22];
    output[22] = step[22] + step[21];
    output[23] = step[23] + step[20];
    output[24] = step[24] + step[27];
    output[25] = step[25] + step[26];
    output[26] = -step[26] + step[25];
    output[27] = -step[27] + step[24];
    output[28] = -step[28] + step[31];
    output[29] = -step[29] + step[30];
    output[30] = step[30] + step[29];
    output[31] = step[31] + step[28];

    // stage 6: output -> step
    step[0]  = output[0];
    step[2]  = output[2];
    step[4]  = half_btf(cospi[56], output[4], cospi[8], output[7], cos_bit);
    step[6]  = half_btf(cospi[24], output[6], -cospi[40], output[5], cos_bit);
    step[8]  = output[8] + output[9];
    step[9]  = -output[9] + output[8];
    step[10] = -output[10] + output[11];
    step[11] = output[11] + output[10];
    step[12] = output[12] + output[13];
    step[13] = -output[13] + output[12];
    step[14] = -output[14] + output[15];
    step[15] = output[15] + output[14];
    step[16] = output[16];
    step[17] = half_btf(-cospi[8], output[17], cospi[56], output[30], cos_bit);
    step[18] = half_btf(-cospi[56], output[18], -cospi[8], output[29], cos_bit);
    step[19] = output[19];
    step[20] = output[20];
    step[21] = half_btf(-cospi[40], output[21], cospi[24], output[26], cos_bit);
    step[22] = half_btf(-cospi[24], output[22], -cospi[40], output[25], cos_bit);
    step[23] = output[23];
    step[24] = output[24];
    step[25] = half_btf(cospi[24], output[25], -cospi[40], output[22], cos_bit);
    step[26] = half_btf(cospi[40], output[26], cospi[24], output[21], cos_bit);
    step[27] = output[27];
    step[28] = output[28];
    step[29] = half_btf(cospi[56], output[29], -cospi[8], output[18], cos_bit);
    step[30] = half_btf(cospi[8], output[30], cospi[56], output[17], cos_bit);
    step[31] = output[31];

    // stage 7: step -> output
    output[0]  = step[0];
    output[2]  = step[2];
    output[4]  = step[4];
    output[6]  = step[6];
    output[8]  = half_btf(cospi[60], step[8], cospi[4], step[15], cos_bit);
    output[10] = half_btf(cospi[44], step[10], cospi[20], step[13], cos_bit);
    output[12] = half_btf(cospi[12], step[12], -cospi[52], step[11], cos_bit);
    output[14] = half_btf(cospi[28], step[14], -cospi[36], step[9], cos_bit);
    output[16] = step[16] + step[17];
    output[17] = -step[17] + step[16];
    output[18] = -step[18] + step[19];
    output[19] = step[19] + step[18];
    output[20] = step[20] + step[21];
    output[21] = -step[21] + step[20];
    output[22] = -step[22] + step[23];
    output[23] = step[23] + step[22];
    output[24] = step[24] + step[25];
    output[25] = -step[25] + step[24];
    output[26] = -step[26] + step[27];
    output[27] = step[27] + step[26];
    output[28] = step[28] + step[29];
    output[29] = -step[29] + step[28];
    output[30] = -step[30] + step[31];
    output[31] = step[31] + step[30];

    // stage 8: output -> step; only even-indexed entries feed stage 9
    step[0]  = output[0];
    step[2]  = output[2];
    step[4]  = output[4];
    step[6]  = output[6];
    step[8]  = output[8];
    step[10] = output[10];
    step[12] = output[12];
    step[14] = output[14];
    step[16] = half_btf(cospi[62], output[16], cospi[2], output[31], cos_bit);
    step[18] = half_btf(cospi[46], output[18], cospi[18], output[29], cos_bit);
    step[20] = half_btf(cospi[54], output[20], cospi[10], output[27], cos_bit);
    step[22] = half_btf(cospi[38], output[22], cospi[26], output[25], cos_bit);
    step[24] = half_btf(cospi[6], output[24], -cospi[58], output[23], cos_bit);
    step[26] = half_btf(cospi[22], output[26], -cospi[42], output[21], cos_bit);
    step[28] = half_btf(cospi[14], output[28], -cospi[50], output[19], cos_bit);
    step[30] = half_btf(cospi[30], output[30], -cospi[34], output[17], cos_bit);

    // stage 9: final reordering into output[0..15]
    output[0]  = step[0];
    output[1]  = step[16];
    output[2]  = step[8];
    output[3]  = step[24];
    output[4]  = step[4];
    output[5]  = step[20];
    output[6]  = step[12];
    output[7]  = step[28];
    output[8]  = step[2];
    output[9]  = step[18];
    output[10] = step[10];
    output[11] = step[26];
    output[12] = step[6];
    output[13] = step[22];
    output[14] = step[14];
    output[15] = step[30];
}
5104 
/*
 * "N2" flavour of the 32-point forward identity transform: multiplies each of
 * the first 16 input values by 4 and stores the products in output[0..15].
 * No other element of output is written.
 *
 * input       - source values; elements 0..15 are read.
 * output      - destination; elements 0..15 are written.
 * cos_bit     - ignored.
 * stage_range - ignored.
 */
void svt_av1_fidentity32_N2_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                              const int8_t *stage_range) {
    (void)cos_bit;
    (void)stage_range;

    const int32_t *src       = input;
    int32_t       *dst       = output;
    int32_t        remaining = 16;

    // Walk both buffers front to back, one element per iteration.
    while (remaining > 0) {
        *dst = *src * 4;
        ++src;
        ++dst;
        --remaining;
    }
}
5111 
/*
 * 64-point forward DCT butterfly network (per the function name), "N2"
 * variant: the last stage (stage 11) assigns only output[0..31].
 *
 * The computation ping-pongs between `output` and the local `step[64]`:
 * odd-numbered stages write `output`, even-numbered stages write `step`.
 * From stage 6 onwards, entries that no later stage reads are simply not
 * written.
 *
 * input       - 64 values; read only in stage 1.
 * output      - must have room for 64 entries because stages 1, 3, 5, 7 and 9
 *               use it as scratch. On return output[0..31] hold the stage 11
 *               values; output[32..63] still hold stage 9 intermediates.
 * cos_bit     - passed to cospi_arr() to obtain the cosine table and to every
 *               half_btf() call (presumably the fixed-point rounding
 *               precision - confirm against half_btf()).
 * stage_range - unused.
 *
 * Stage 1 reads input[0..31] again after output[0..31] has been written, so
 * calling this with input == output would read already-overwritten values.
 */
svt_av1_fdct64_new_N2(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)5112 void svt_av1_fdct64_new_N2(const int32_t *input, int32_t *output, int8_t cos_bit,
5113                            const int8_t *stage_range) {
5114     (void)stage_range; // unused
5115     const int32_t *cospi;
5116 
5117     int32_t *bf0, *bf1; // bf0: buffer read by the current stage, bf1: buffer it writes
5118     int32_t  step[64]; // scratch buffer that alternates with output between stages
5119 
5120     // stage 0: nothing to compute; stage 1 consumes input directly.
5121 
5122     // stage 1 (input -> output): sums input[i] + input[63 - i] in [0..31], differences in [32..63].
5123     bf1     = output;
5124     bf1[0]  = input[0] + input[63];
5125     bf1[1]  = input[1] + input[62];
5126     bf1[2]  = input[2] + input[61];
5127     bf1[3]  = input[3] + input[60];
5128     bf1[4]  = input[4] + input[59];
5129     bf1[5]  = input[5] + input[58];
5130     bf1[6]  = input[6] + input[57];
5131     bf1[7]  = input[7] + input[56];
5132     bf1[8]  = input[8] + input[55];
5133     bf1[9]  = input[9] + input[54];
5134     bf1[10] = input[10] + input[53];
5135     bf1[11] = input[11] + input[52];
5136     bf1[12] = input[12] + input[51];
5137     bf1[13] = input[13] + input[50];
5138     bf1[14] = input[14] + input[49];
5139     bf1[15] = input[15] + input[48];
5140     bf1[16] = input[16] + input[47];
5141     bf1[17] = input[17] + input[46];
5142     bf1[18] = input[18] + input[45];
5143     bf1[19] = input[19] + input[44];
5144     bf1[20] = input[20] + input[43];
5145     bf1[21] = input[21] + input[42];
5146     bf1[22] = input[22] + input[41];
5147     bf1[23] = input[23] + input[40];
5148     bf1[24] = input[24] + input[39];
5149     bf1[25] = input[25] + input[38];
5150     bf1[26] = input[26] + input[37];
5151     bf1[27] = input[27] + input[36];
5152     bf1[28] = input[28] + input[35];
5153     bf1[29] = input[29] + input[34];
5154     bf1[30] = input[30] + input[33];
5155     bf1[31] = input[31] + input[32];
5156     bf1[32] = -input[32] + input[31];
5157     bf1[33] = -input[33] + input[30];
5158     bf1[34] = -input[34] + input[29];
5159     bf1[35] = -input[35] + input[28];
5160     bf1[36] = -input[36] + input[27];
5161     bf1[37] = -input[37] + input[26];
5162     bf1[38] = -input[38] + input[25];
5163     bf1[39] = -input[39] + input[24];
5164     bf1[40] = -input[40] + input[23];
5165     bf1[41] = -input[41] + input[22];
5166     bf1[42] = -input[42] + input[21];
5167     bf1[43] = -input[43] + input[20];
5168     bf1[44] = -input[44] + input[19];
5169     bf1[45] = -input[45] + input[18];
5170     bf1[46] = -input[46] + input[17];
5171     bf1[47] = -input[47] + input[16];
5172     bf1[48] = -input[48] + input[15];
5173     bf1[49] = -input[49] + input[14];
5174     bf1[50] = -input[50] + input[13];
5175     bf1[51] = -input[51] + input[12];
5176     bf1[52] = -input[52] + input[11];
5177     bf1[53] = -input[53] + input[10];
5178     bf1[54] = -input[54] + input[9];
5179     bf1[55] = -input[55] + input[8];
5180     bf1[56] = -input[56] + input[7];
5181     bf1[57] = -input[57] + input[6];
5182     bf1[58] = -input[58] + input[5];
5183     bf1[59] = -input[59] + input[4];
5184     bf1[60] = -input[60] + input[3];
5185     bf1[61] = -input[61] + input[2];
5186     bf1[62] = -input[62] + input[1];
5187     bf1[63] = -input[63] + input[0];
5188 
5189     // stage 2 (output -> step): add/sub on [0..31], half_btf with cospi[32] on [40..55], rest copied.
5190     cospi   = cospi_arr(cos_bit);
5191     bf0     = output;
5192     bf1     = step;
5193     bf1[0]  = bf0[0] + bf0[31];
5194     bf1[1]  = bf0[1] + bf0[30];
5195     bf1[2]  = bf0[2] + bf0[29];
5196     bf1[3]  = bf0[3] + bf0[28];
5197     bf1[4]  = bf0[4] + bf0[27];
5198     bf1[5]  = bf0[5] + bf0[26];
5199     bf1[6]  = bf0[6] + bf0[25];
5200     bf1[7]  = bf0[7] + bf0[24];
5201     bf1[8]  = bf0[8] + bf0[23];
5202     bf1[9]  = bf0[9] + bf0[22];
5203     bf1[10] = bf0[10] + bf0[21];
5204     bf1[11] = bf0[11] + bf0[20];
5205     bf1[12] = bf0[12] + bf0[19];
5206     bf1[13] = bf0[13] + bf0[18];
5207     bf1[14] = bf0[14] + bf0[17];
5208     bf1[15] = bf0[15] + bf0[16];
5209     bf1[16] = -bf0[16] + bf0[15];
5210     bf1[17] = -bf0[17] + bf0[14];
5211     bf1[18] = -bf0[18] + bf0[13];
5212     bf1[19] = -bf0[19] + bf0[12];
5213     bf1[20] = -bf0[20] + bf0[11];
5214     bf1[21] = -bf0[21] + bf0[10];
5215     bf1[22] = -bf0[22] + bf0[9];
5216     bf1[23] = -bf0[23] + bf0[8];
5217     bf1[24] = -bf0[24] + bf0[7];
5218     bf1[25] = -bf0[25] + bf0[6];
5219     bf1[26] = -bf0[26] + bf0[5];
5220     bf1[27] = -bf0[27] + bf0[4];
5221     bf1[28] = -bf0[28] + bf0[3];
5222     bf1[29] = -bf0[29] + bf0[2];
5223     bf1[30] = -bf0[30] + bf0[1];
5224     bf1[31] = -bf0[31] + bf0[0];
5225     bf1[32] = bf0[32];
5226     bf1[33] = bf0[33];
5227     bf1[34] = bf0[34];
5228     bf1[35] = bf0[35];
5229     bf1[36] = bf0[36];
5230     bf1[37] = bf0[37];
5231     bf1[38] = bf0[38];
5232     bf1[39] = bf0[39];
5233     bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
5234     bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
5235     bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
5236     bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
5237     bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
5238     bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
5239     bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
5240     bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
5241     bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit);
5242     bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit);
5243     bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit);
5244     bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit);
5245     bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit);
5246     bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit);
5247     bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit);
5248     bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit);
5249     bf1[56] = bf0[56];
5250     bf1[57] = bf0[57];
5251     bf1[58] = bf0[58];
5252     bf1[59] = bf0[59];
5253     bf1[60] = bf0[60];
5254     bf1[61] = bf0[61];
5255     bf1[62] = bf0[62];
5256     bf1[63] = bf0[63];
5257 
5258     // stage 3 (step -> output): add/sub on [0..15] and [32..63], half_btf with cospi[32] on [20..27].
5259     cospi   = cospi_arr(cos_bit);
5260     bf0     = step;
5261     bf1     = output;
5262     bf1[0]  = bf0[0] + bf0[15];
5263     bf1[1]  = bf0[1] + bf0[14];
5264     bf1[2]  = bf0[2] + bf0[13];
5265     bf1[3]  = bf0[3] + bf0[12];
5266     bf1[4]  = bf0[4] + bf0[11];
5267     bf1[5]  = bf0[5] + bf0[10];
5268     bf1[6]  = bf0[6] + bf0[9];
5269     bf1[7]  = bf0[7] + bf0[8];
5270     bf1[8]  = -bf0[8] + bf0[7];
5271     bf1[9]  = -bf0[9] + bf0[6];
5272     bf1[10] = -bf0[10] + bf0[5];
5273     bf1[11] = -bf0[11] + bf0[4];
5274     bf1[12] = -bf0[12] + bf0[3];
5275     bf1[13] = -bf0[13] + bf0[2];
5276     bf1[14] = -bf0[14] + bf0[1];
5277     bf1[15] = -bf0[15] + bf0[0];
5278     bf1[16] = bf0[16];
5279     bf1[17] = bf0[17];
5280     bf1[18] = bf0[18];
5281     bf1[19] = bf0[19];
5282     bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
5283     bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
5284     bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
5285     bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
5286     bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
5287     bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
5288     bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
5289     bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
5290     bf1[28] = bf0[28];
5291     bf1[29] = bf0[29];
5292     bf1[30] = bf0[30];
5293     bf1[31] = bf0[31];
5294     bf1[32] = bf0[32] + bf0[47];
5295     bf1[33] = bf0[33] + bf0[46];
5296     bf1[34] = bf0[34] + bf0[45];
5297     bf1[35] = bf0[35] + bf0[44];
5298     bf1[36] = bf0[36] + bf0[43];
5299     bf1[37] = bf0[37] + bf0[42];
5300     bf1[38] = bf0[38] + bf0[41];
5301     bf1[39] = bf0[39] + bf0[40];
5302     bf1[40] = -bf0[40] + bf0[39];
5303     bf1[41] = -bf0[41] + bf0[38];
5304     bf1[42] = -bf0[42] + bf0[37];
5305     bf1[43] = -bf0[43] + bf0[36];
5306     bf1[44] = -bf0[44] + bf0[35];
5307     bf1[45] = -bf0[45] + bf0[34];
5308     bf1[46] = -bf0[46] + bf0[33];
5309     bf1[47] = -bf0[47] + bf0[32];
5310     bf1[48] = -bf0[48] + bf0[63];
5311     bf1[49] = -bf0[49] + bf0[62];
5312     bf1[50] = -bf0[50] + bf0[61];
5313     bf1[51] = -bf0[51] + bf0[60];
5314     bf1[52] = -bf0[52] + bf0[59];
5315     bf1[53] = -bf0[53] + bf0[58];
5316     bf1[54] = -bf0[54] + bf0[57];
5317     bf1[55] = -bf0[55] + bf0[56];
5318     bf1[56] = bf0[56] + bf0[55];
5319     bf1[57] = bf0[57] + bf0[54];
5320     bf1[58] = bf0[58] + bf0[53];
5321     bf1[59] = bf0[59] + bf0[52];
5322     bf1[60] = bf0[60] + bf0[51];
5323     bf1[61] = bf0[61] + bf0[50];
5324     bf1[62] = bf0[62] + bf0[49];
5325     bf1[63] = bf0[63] + bf0[48];
5326 
5327     // stage 4 (output -> step): add/sub on [0..7] and [16..31]; half_btf on [10..13], [36..43], [52..59].
5328     cospi   = cospi_arr(cos_bit);
5329     bf0     = output;
5330     bf1     = step;
5331     bf1[0]  = bf0[0] + bf0[7];
5332     bf1[1]  = bf0[1] + bf0[6];
5333     bf1[2]  = bf0[2] + bf0[5];
5334     bf1[3]  = bf0[3] + bf0[4];
5335     bf1[4]  = -bf0[4] + bf0[3];
5336     bf1[5]  = -bf0[5] + bf0[2];
5337     bf1[6]  = -bf0[6] + bf0[1];
5338     bf1[7]  = -bf0[7] + bf0[0];
5339     bf1[8]  = bf0[8];
5340     bf1[9]  = bf0[9];
5341     bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
5342     bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
5343     bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
5344     bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
5345     bf1[14] = bf0[14];
5346     bf1[15] = bf0[15];
5347     bf1[16] = bf0[16] + bf0[23];
5348     bf1[17] = bf0[17] + bf0[22];
5349     bf1[18] = bf0[18] + bf0[21];
5350     bf1[19] = bf0[19] + bf0[20];
5351     bf1[20] = -bf0[20] + bf0[19];
5352     bf1[21] = -bf0[21] + bf0[18];
5353     bf1[22] = -bf0[22] + bf0[17];
5354     bf1[23] = -bf0[23] + bf0[16];
5355     bf1[24] = -bf0[24] + bf0[31];
5356     bf1[25] = -bf0[25] + bf0[30];
5357     bf1[26] = -bf0[26] + bf0[29];
5358     bf1[27] = -bf0[27] + bf0[28];
5359     bf1[28] = bf0[28] + bf0[27];
5360     bf1[29] = bf0[29] + bf0[26];
5361     bf1[30] = bf0[30] + bf0[25];
5362     bf1[31] = bf0[31] + bf0[24];
5363     bf1[32] = bf0[32];
5364     bf1[33] = bf0[33];
5365     bf1[34] = bf0[34];
5366     bf1[35] = bf0[35];
5367     bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit);
5368     bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit);
5369     bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit);
5370     bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit);
5371     bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit);
5372     bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit);
5373     bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit);
5374     bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit);
5375     bf1[44] = bf0[44];
5376     bf1[45] = bf0[45];
5377     bf1[46] = bf0[46];
5378     bf1[47] = bf0[47];
5379     bf1[48] = bf0[48];
5380     bf1[49] = bf0[49];
5381     bf1[50] = bf0[50];
5382     bf1[51] = bf0[51];
5383     bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit);
5384     bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit);
5385     bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit);
5386     bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit);
5387     bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit);
5388     bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit);
5389     bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit);
5390     bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit);
5391     bf1[60] = bf0[60];
5392     bf1[61] = bf0[61];
5393     bf1[62] = bf0[62];
5394     bf1[63] = bf0[63];
5395 
5396     // stage 5 (step -> output): add/sub on [0..3], [8..15], [32..63]; half_btf on [5..6], [18..21], [26..29].
5397     cospi   = cospi_arr(cos_bit);
5398     bf0     = step;
5399     bf1     = output;
5400     bf1[0]  = bf0[0] + bf0[3];
5401     bf1[1]  = bf0[1] + bf0[2];
5402     bf1[2]  = -bf0[2] + bf0[1];
5403     bf1[3]  = -bf0[3] + bf0[0];
5404     bf1[4]  = bf0[4];
5405     bf1[5]  = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
5406     bf1[6]  = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
5407     bf1[7]  = bf0[7];
5408     bf1[8]  = bf0[8] + bf0[11];
5409     bf1[9]  = bf0[9] + bf0[10];
5410     bf1[10] = -bf0[10] + bf0[9];
5411     bf1[11] = -bf0[11] + bf0[8];
5412     bf1[12] = -bf0[12] + bf0[15];
5413     bf1[13] = -bf0[13] + bf0[14];
5414     bf1[14] = bf0[14] + bf0[13];
5415     bf1[15] = bf0[15] + bf0[12];
5416     bf1[16] = bf0[16];
5417     bf1[17] = bf0[17];
5418     bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
5419     bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
5420     bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
5421     bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
5422     bf1[22] = bf0[22];
5423     bf1[23] = bf0[23];
5424     bf1[24] = bf0[24];
5425     bf1[25] = bf0[25];
5426     bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
5427     bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
5428     bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
5429     bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
5430     bf1[30] = bf0[30];
5431     bf1[31] = bf0[31];
5432     bf1[32] = bf0[32] + bf0[39];
5433     bf1[33] = bf0[33] + bf0[38];
5434     bf1[34] = bf0[34] + bf0[37];
5435     bf1[35] = bf0[35] + bf0[36];
5436     bf1[36] = -bf0[36] + bf0[35];
5437     bf1[37] = -bf0[37] + bf0[34];
5438     bf1[38] = -bf0[38] + bf0[33];
5439     bf1[39] = -bf0[39] + bf0[32];
5440     bf1[40] = -bf0[40] + bf0[47];
5441     bf1[41] = -bf0[41] + bf0[46];
5442     bf1[42] = -bf0[42] + bf0[45];
5443     bf1[43] = -bf0[43] + bf0[44];
5444     bf1[44] = bf0[44] + bf0[43];
5445     bf1[45] = bf0[45] + bf0[42];
5446     bf1[46] = bf0[46] + bf0[41];
5447     bf1[47] = bf0[47] + bf0[40];
5448     bf1[48] = bf0[48] + bf0[55];
5449     bf1[49] = bf0[49] + bf0[54];
5450     bf1[50] = bf0[50] + bf0[53];
5451     bf1[51] = bf0[51] + bf0[52];
5452     bf1[52] = -bf0[52] + bf0[51];
5453     bf1[53] = -bf0[53] + bf0[50];
5454     bf1[54] = -bf0[54] + bf0[49];
5455     bf1[55] = -bf0[55] + bf0[48];
5456     bf1[56] = -bf0[56] + bf0[63];
5457     bf1[57] = -bf0[57] + bf0[62];
5458     bf1[58] = -bf0[58] + bf0[61];
5459     bf1[59] = -bf0[59] + bf0[60];
5460     bf1[60] = bf0[60] + bf0[59];
5461     bf1[61] = bf0[61] + bf0[58];
5462     bf1[62] = bf0[62] + bf0[57];
5463     bf1[63] = bf0[63] + bf0[56];
5464 
5465     // stage 6 (output -> step): step[1] and step[3] are not written here; no later stage reads them.
5466     cospi   = cospi_arr(cos_bit);
5467     bf0     = output;
5468     bf1     = step;
5469     bf1[0]  = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
5470     bf1[2]  = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
5471     bf1[4]  = bf0[4] + bf0[5];
5472     bf1[5]  = -bf0[5] + bf0[4];
5473     bf1[6]  = -bf0[6] + bf0[7];
5474     bf1[7]  = bf0[7] + bf0[6];
5475     bf1[8]  = bf0[8];
5476     bf1[9]  = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
5477     bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
5478     bf1[11] = bf0[11];
5479     bf1[12] = bf0[12];
5480     bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
5481     bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
5482     bf1[15] = bf0[15];
5483     bf1[16] = bf0[16] + bf0[19];
5484     bf1[17] = bf0[17] + bf0[18];
5485     bf1[18] = -bf0[18] + bf0[17];
5486     bf1[19] = -bf0[19] + bf0[16];
5487     bf1[20] = -bf0[20] + bf0[23];
5488     bf1[21] = -bf0[21] + bf0[22];
5489     bf1[22] = bf0[22] + bf0[21];
5490     bf1[23] = bf0[23] + bf0[20];
5491     bf1[24] = bf0[24] + bf0[27];
5492     bf1[25] = bf0[25] + bf0[26];
5493     bf1[26] = -bf0[26] + bf0[25];
5494     bf1[27] = -bf0[27] + bf0[24];
5495     bf1[28] = -bf0[28] + bf0[31];
5496     bf1[29] = -bf0[29] + bf0[30];
5497     bf1[30] = bf0[30] + bf0[29];
5498     bf1[31] = bf0[31] + bf0[28];
5499     bf1[32] = bf0[32];
5500     bf1[33] = bf0[33];
5501     bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit);
5502     bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit);
5503     bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit);
5504     bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit);
5505     bf1[38] = bf0[38];
5506     bf1[39] = bf0[39];
5507     bf1[40] = bf0[40];
5508     bf1[41] = bf0[41];
5509     bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit);
5510     bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit);
5511     bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit);
5512     bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit);
5513     bf1[46] = bf0[46];
5514     bf1[47] = bf0[47];
5515     bf1[48] = bf0[48];
5516     bf1[49] = bf0[49];
5517     bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit);
5518     bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit);
5519     bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit);
5520     bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit);
5521     bf1[54] = bf0[54];
5522     bf1[55] = bf0[55];
5523     bf1[56] = bf0[56];
5524     bf1[57] = bf0[57];
5525     bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit);
5526     bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit);
5527     bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit);
5528     bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit);
5529     bf1[62] = bf0[62];
5530     bf1[63] = bf0[63];
5531 
5532     // stage 7 (step -> output): of [0..7] only the even indices are written.
5533     cospi   = cospi_arr(cos_bit);
5534     bf0     = step;
5535     bf1     = output;
5536     bf1[0]  = bf0[0];
5537     bf1[2]  = bf0[2];
5538     bf1[4]  = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
5539     bf1[6]  = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
5540     bf1[8]  = bf0[8] + bf0[9];
5541     bf1[9]  = -bf0[9] + bf0[8];
5542     bf1[10] = -bf0[10] + bf0[11];
5543     bf1[11] = bf0[11] + bf0[10];
5544     bf1[12] = bf0[12] + bf0[13];
5545     bf1[13] = -bf0[13] + bf0[12];
5546     bf1[14] = -bf0[14] + bf0[15];
5547     bf1[15] = bf0[15] + bf0[14];
5548     bf1[16] = bf0[16];
5549     bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
5550     bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
5551     bf1[19] = bf0[19];
5552     bf1[20] = bf0[20];
5553     bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
5554     bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
5555     bf1[23] = bf0[23];
5556     bf1[24] = bf0[24];
5557     bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
5558     bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
5559     bf1[27] = bf0[27];
5560     bf1[28] = bf0[28];
5561     bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
5562     bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
5563     bf1[31] = bf0[31];
5564     bf1[32] = bf0[32] + bf0[35];
5565     bf1[33] = bf0[33] + bf0[34];
5566     bf1[34] = -bf0[34] + bf0[33];
5567     bf1[35] = -bf0[35] + bf0[32];
5568     bf1[36] = -bf0[36] + bf0[39];
5569     bf1[37] = -bf0[37] + bf0[38];
5570     bf1[38] = bf0[38] + bf0[37];
5571     bf1[39] = bf0[39] + bf0[36];
5572     bf1[40] = bf0[40] + bf0[43];
5573     bf1[41] = bf0[41] + bf0[42];
5574     bf1[42] = -bf0[42] + bf0[41];
5575     bf1[43] = -bf0[43] + bf0[40];
5576     bf1[44] = -bf0[44] + bf0[47];
5577     bf1[45] = -bf0[45] + bf0[46];
5578     bf1[46] = bf0[46] + bf0[45];
5579     bf1[47] = bf0[47] + bf0[44];
5580     bf1[48] = bf0[48] + bf0[51];
5581     bf1[49] = bf0[49] + bf0[50];
5582     bf1[50] = -bf0[50] + bf0[49];
5583     bf1[51] = -bf0[51] + bf0[48];
5584     bf1[52] = -bf0[52] + bf0[55];
5585     bf1[53] = -bf0[53] + bf0[54];
5586     bf1[54] = bf0[54] + bf0[53];
5587     bf1[55] = bf0[55] + bf0[52];
5588     bf1[56] = bf0[56] + bf0[59];
5589     bf1[57] = bf0[57] + bf0[58];
5590     bf1[58] = -bf0[58] + bf0[57];
5591     bf1[59] = -bf0[59] + bf0[56];
5592     bf1[60] = -bf0[60] + bf0[63];
5593     bf1[61] = -bf0[61] + bf0[62];
5594     bf1[62] = bf0[62] + bf0[61];
5595     bf1[63] = bf0[63] + bf0[60];
5596 
5597     // stage 8 (output -> step): of [0..15] only the even indices are written.
5598     cospi   = cospi_arr(cos_bit);
5599     bf0     = output;
5600     bf1     = step;
5601     bf1[0]  = bf0[0];
5602     bf1[2]  = bf0[2];
5603     bf1[4]  = bf0[4];
5604     bf1[6]  = bf0[6];
5605     bf1[8]  = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
5606     bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
5607     bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
5608     bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
5609     bf1[16] = bf0[16] + bf0[17];
5610     bf1[17] = -bf0[17] + bf0[16];
5611     bf1[18] = -bf0[18] + bf0[19];
5612     bf1[19] = bf0[19] + bf0[18];
5613     bf1[20] = bf0[20] + bf0[21];
5614     bf1[21] = -bf0[21] + bf0[20];
5615     bf1[22] = -bf0[22] + bf0[23];
5616     bf1[23] = bf0[23] + bf0[22];
5617     bf1[24] = bf0[24] + bf0[25];
5618     bf1[25] = -bf0[25] + bf0[24];
5619     bf1[26] = -bf0[26] + bf0[27];
5620     bf1[27] = bf0[27] + bf0[26];
5621     bf1[28] = bf0[28] + bf0[29];
5622     bf1[29] = -bf0[29] + bf0[28];
5623     bf1[30] = -bf0[30] + bf0[31];
5624     bf1[31] = bf0[31] + bf0[30];
5625     bf1[32] = bf0[32];
5626     bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit);
5627     bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit);
5628     bf1[35] = bf0[35];
5629     bf1[36] = bf0[36];
5630     bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit);
5631     bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit);
5632     bf1[39] = bf0[39];
5633     bf1[40] = bf0[40];
5634     bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit);
5635     bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit);
5636     bf1[43] = bf0[43];
5637     bf1[44] = bf0[44];
5638     bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit);
5639     bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit);
5640     bf1[47] = bf0[47];
5641     bf1[48] = bf0[48];
5642     bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit);
5643     bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit);
5644     bf1[51] = bf0[51];
5645     bf1[52] = bf0[52];
5646     bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit);
5647     bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit);
5648     bf1[55] = bf0[55];
5649     bf1[56] = bf0[56];
5650     bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit);
5651     bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit);
5652     bf1[59] = bf0[59];
5653     bf1[60] = bf0[60];
5654     bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit);
5655     bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit);
5656     bf1[63] = bf0[63];
5657 
5658     // stage 9 (step -> output): of [0..31] only the even indices are written; [32..63] are add/sub.
5659     cospi   = cospi_arr(cos_bit);
5660     bf0     = step;
5661     bf1     = output;
5662     bf1[0]  = bf0[0];
5663     bf1[2]  = bf0[2];
5664     bf1[4]  = bf0[4];
5665     bf1[6]  = bf0[6];
5666     bf1[8]  = bf0[8];
5667     bf1[10] = bf0[10];
5668     bf1[12] = bf0[12];
5669     bf1[14] = bf0[14];
5670     bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
5671     bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
5672     bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
5673     bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
5674     bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
5675     bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
5676     bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
5677     bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
5678     bf1[32] = bf0[32] + bf0[33];
5679     bf1[33] = -bf0[33] + bf0[32];
5680     bf1[34] = -bf0[34] + bf0[35];
5681     bf1[35] = bf0[35] + bf0[34];
5682     bf1[36] = bf0[36] + bf0[37];
5683     bf1[37] = -bf0[37] + bf0[36];
5684     bf1[38] = -bf0[38] + bf0[39];
5685     bf1[39] = bf0[39] + bf0[38];
5686     bf1[40] = bf0[40] + bf0[41];
5687     bf1[41] = -bf0[41] + bf0[40];
5688     bf1[42] = -bf0[42] + bf0[43];
5689     bf1[43] = bf0[43] + bf0[42];
5690     bf1[44] = bf0[44] + bf0[45];
5691     bf1[45] = -bf0[45] + bf0[44];
5692     bf1[46] = -bf0[46] + bf0[47];
5693     bf1[47] = bf0[47] + bf0[46];
5694     bf1[48] = bf0[48] + bf0[49];
5695     bf1[49] = -bf0[49] + bf0[48];
5696     bf1[50] = -bf0[50] + bf0[51];
5697     bf1[51] = bf0[51] + bf0[50];
5698     bf1[52] = bf0[52] + bf0[53];
5699     bf1[53] = -bf0[53] + bf0[52];
5700     bf1[54] = -bf0[54] + bf0[55];
5701     bf1[55] = bf0[55] + bf0[54];
5702     bf1[56] = bf0[56] + bf0[57];
5703     bf1[57] = -bf0[57] + bf0[56];
5704     bf1[58] = -bf0[58] + bf0[59];
5705     bf1[59] = bf0[59] + bf0[58];
5706     bf1[60] = bf0[60] + bf0[61];
5707     bf1[61] = -bf0[61] + bf0[60];
5708     bf1[62] = -bf0[62] + bf0[63];
5709     bf1[63] = bf0[63] + bf0[62];
5710 
5711     // stage 10 (output -> step): only even indices are written, the only ones stage 11 reads.
5712     cospi   = cospi_arr(cos_bit);
5713     bf0     = output;
5714     bf1     = step;
5715     bf1[0]  = bf0[0];
5716     bf1[2]  = bf0[2];
5717     bf1[4]  = bf0[4];
5718     bf1[6]  = bf0[6];
5719     bf1[8]  = bf0[8];
5720     bf1[10] = bf0[10];
5721     bf1[12] = bf0[12];
5722     bf1[14] = bf0[14];
5723     bf1[16] = bf0[16];
5724     bf1[18] = bf0[18];
5725     bf1[20] = bf0[20];
5726     bf1[22] = bf0[22];
5727     bf1[24] = bf0[24];
5728     bf1[26] = bf0[26];
5729     bf1[28] = bf0[28];
5730     bf1[30] = bf0[30];
5731     bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit);
5732     bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit);
5733     bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit);
5734     bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit);
5735     bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit);
5736     bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit);
5737     bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit);
5738     bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit);
5739     bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit);
5740     bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit);
5741     bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit);
5742     bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit);
5743     bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit);
5744     bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit);
5745     bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit);
5746     bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit);
5747 
5748     // stage 11 (step -> output): reorders the even-indexed step entries into output[0..31].
5749     bf0     = step;
5750     bf1     = output;
5751     bf1[0]  = bf0[0];
5752     bf1[1]  = bf0[32];
5753     bf1[2]  = bf0[16];
5754     bf1[3]  = bf0[48];
5755     bf1[4]  = bf0[8];
5756     bf1[5]  = bf0[40];
5757     bf1[6]  = bf0[24];
5758     bf1[7]  = bf0[56];
5759     bf1[8]  = bf0[4];
5760     bf1[9]  = bf0[36];
5761     bf1[10] = bf0[20];
5762     bf1[11] = bf0[52];
5763     bf1[12] = bf0[12];
5764     bf1[13] = bf0[44];
5765     bf1[14] = bf0[28];
5766     bf1[15] = bf0[60];
5767     bf1[16] = bf0[2];
5768     bf1[17] = bf0[34];
5769     bf1[18] = bf0[18];
5770     bf1[19] = bf0[50];
5771     bf1[20] = bf0[10];
5772     bf1[21] = bf0[42];
5773     bf1[22] = bf0[26];
5774     bf1[23] = bf0[58];
5775     bf1[24] = bf0[6];
5776     bf1[25] = bf0[38];
5777     bf1[26] = bf0[22];
5778     bf1[27] = bf0[54];
5779     bf1[28] = bf0[14];
5780     bf1[29] = bf0[46];
5781     bf1[30] = bf0[30];
5782     bf1[31] = bf0[62];
5783 }
5784 
av1_fidentity64_N2_c(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)5785 void av1_fidentity64_N2_c(const int32_t *input, int32_t *output, int8_t cos_bit,
5786                           const int8_t *stage_range) {
5787     (void)stage_range;
5788     (void)cos_bit;
5789     for (int32_t i = 0; i < 32; ++i)
5790         output[i] = round_shift((int64_t)input[i] * 4 * new_sqrt2, new_sqrt2_bits);
5791     assert(stage_range[0] + new_sqrt2_bits <= 32);
5792 }
5793 
fwd_txfm_type_to_func_N2(TxfmType txfmtype)5794 static INLINE TxfmFunc fwd_txfm_type_to_func_N2(TxfmType txfmtype) {
5795     switch (txfmtype) {
5796     case TXFM_TYPE_DCT4: return svt_av1_fdct4_new_N2;
5797     case TXFM_TYPE_DCT8: return svt_av1_fdct8_new_N2;
5798     case TXFM_TYPE_DCT16: return svt_av1_fdct16_new_N2;
5799     case TXFM_TYPE_DCT32: return svt_av1_fdct32_new_N2;
5800     case TXFM_TYPE_DCT64: return svt_av1_fdct64_new_N2;
5801     case TXFM_TYPE_ADST4: return svt_av1_fadst4_new_N2;
5802     case TXFM_TYPE_ADST8: return svt_av1_fadst8_new_N2;
5803     case TXFM_TYPE_ADST16: return svt_av1_fadst16_new_N2;
5804     case TXFM_TYPE_ADST32: return av1_fadst32_new;
5805     case TXFM_TYPE_IDENTITY4: return svt_av1_fidentity4_N2_c;
5806     case TXFM_TYPE_IDENTITY8: return svt_av1_fidentity8_N2_c;
5807     case TXFM_TYPE_IDENTITY16: return svt_av1_fidentity16_N2_c;
5808     case TXFM_TYPE_IDENTITY32: return svt_av1_fidentity32_N2_c;
5809     case TXFM_TYPE_IDENTITY64: return av1_fidentity64_N2_c;
5810     default: assert(0); return NULL;
5811     }
5812 }
5813 
av1_tranform_two_d_core_N2_c(int16_t * input,uint32_t input_stride,int32_t * output,const Txfm2dFlipCfg * cfg,int32_t * buf,uint8_t bit_depth)5814 static INLINE void av1_tranform_two_d_core_N2_c(int16_t *input, uint32_t input_stride,
5815                                                 int32_t *output, const Txfm2dFlipCfg *cfg,
5816                                                 int32_t *buf, uint8_t bit_depth) {
5817     int32_t c, r;
5818     // Note when assigning txfm_size_col, we use the txfm_size from the
5819     // row configuration and vice versa. This is intentionally done to
5820     // accurately perform rectangular transforms. When the transform is
5821     // rectangular, the number of columns will be the same as the
5822     // txfm_size stored in the row cfg struct. It will make no difference
5823     // for square transforms.
5824     const int32_t txfm_size_col = tx_size_wide[cfg->tx_size];
5825     const int32_t txfm_size_row = tx_size_high[cfg->tx_size];
5826     // Take the shift from the larger dimension in the rectangular case.
5827     const int8_t *shift     = cfg->shift;
5828     const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
5829     int8_t        stage_range_col[MAX_TXFM_STAGE_NUM];
5830     int8_t        stage_range_row[MAX_TXFM_STAGE_NUM];
5831     assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM);
5832     assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM);
5833     svt_av1_gen_fwd_stage_range(stage_range_col, stage_range_row, cfg, bit_depth);
5834 
5835     const int8_t   cos_bit_col   = cfg->cos_bit_col;
5836     const int8_t   cos_bit_row   = cfg->cos_bit_row;
5837     const TxfmFunc txfm_func_col = fwd_txfm_type_to_func_N2(cfg->txfm_type_col);
5838     const TxfmFunc txfm_func_row = fwd_txfm_type_to_func_N2(cfg->txfm_type_row);
5839     ASSERT(txfm_func_col != NULL);
5840     ASSERT(txfm_func_row != NULL);
5841     // use output buffer as temp buffer
5842     int32_t *temp_in  = output;
5843     int32_t *temp_out = output + txfm_size_row;
5844 
5845     // Columns
5846     for (c = 0; c < txfm_size_col; ++c) {
5847         if (cfg->ud_flip == 0)
5848             for (r = 0; r < txfm_size_row; ++r) temp_in[r] = input[r * input_stride + c];
5849         else {
5850             for (r = 0; r < txfm_size_row; ++r)
5851                 // flip upside down
5852                 temp_in[r] = input[(txfm_size_row - r - 1) * input_stride + c];
5853         }
5854         svt_av1_round_shift_array_c(
5855             temp_in, txfm_size_row, -shift[0]); // NM svt_av1_round_shift_array_c
5856         txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
5857         svt_av1_round_shift_array_c(
5858             temp_out, txfm_size_row / 2, -shift[1]); // NM svt_av1_round_shift_array_c
5859         if (cfg->lr_flip == 0) {
5860             for (r = 0; r < txfm_size_row; ++r) buf[r * txfm_size_col + c] = temp_out[r];
5861         } else {
5862             for (r = 0; r < txfm_size_row; ++r)
5863                 // flip from left to right
5864                 buf[r * txfm_size_col + (txfm_size_col - c - 1)] = temp_out[r];
5865         }
5866     }
5867 
5868     // Rows
5869     for (r = 0; r < txfm_size_row / 2; ++r) {
5870         txfm_func_row(
5871             buf + r * txfm_size_col, output + r * txfm_size_col, cos_bit_row, stage_range_row);
5872         svt_av1_round_shift_array_c(output + r * txfm_size_col, txfm_size_col / 2, -shift[2]);
5873 
5874         if (abs(rect_type) == 1) {
5875             // Multiply everything by Sqrt2 if the transform is rectangular and the
5876             // size difference is a factor of 2.
5877             for (c = 0; c < txfm_size_col / 2; ++c) {
5878                 output[r * txfm_size_col + c] = round_shift(
5879                     (int64_t)output[r * txfm_size_col + c] * new_sqrt2, new_sqrt2_bits);
5880             }
5881         }
5882     }
5883 
5884     for (int i = 0; i < (txfm_size_col * txfm_size_row); i++) {
5885         if (i % txfm_size_col >= (txfm_size_col >> 1) ||
5886             i / txfm_size_col >= (txfm_size_row >> 1)) {
5887             output[i] = 0;
5888         }
5889     }
5890 }
5891 
/* Half-size ("N2") forward 2-D transform entry point for TX_64X64: builds the
 * flip configuration for transform_type and runs the shared N2 core with a
 * 64 * 64 entry stack scratch buffer. */
void av1_transform_two_d_64x64_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                    TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[64 * 64];

    av1_transform_config(transform_type, TX_64X64, &flip_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
5900 
/* Half-size ("N2") forward 2-D transform entry point for TX_32X32: builds the
 * flip configuration for transform_type and runs the shared N2 core with a
 * 32 * 32 entry stack scratch buffer. */
void av1_transform_two_d_32x32_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                    TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[32 * 32];

    av1_transform_config(transform_type, TX_32X32, &flip_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
5909 
/* Half-size ("N2") forward 2-D transform entry point for TX_16X16: builds the
 * flip configuration for transform_type and runs the shared N2 core with a
 * 16 * 16 entry stack scratch buffer. */
void av1_transform_two_d_16x16_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                    TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[16 * 16];

    av1_transform_config(transform_type, TX_16X16, &flip_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
5918 
/* Half-size ("N2") forward 2-D transform entry point for TX_8X8: builds the
 * flip configuration for transform_type and runs the shared N2 core with an
 * 8 * 8 entry stack scratch buffer. */
void av1_transform_two_d_8x8_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                  TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[8 * 8];

    av1_transform_config(transform_type, TX_8X8, &flip_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
5927 
/* Half-size ("N2") forward 2-D transform entry point for TX_4X4: builds the
 * flip configuration for transform_type and runs the shared N2 core with a
 * 4 * 4 entry stack scratch buffer. */
void av1_transform_two_d_4x4_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                  TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[4 * 4];

    av1_transform_config(transform_type, TX_4X4, &flip_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
5936 
/* Half-size ("N2") forward 2-D transform entry point for TX_64X32: builds the
 * flip configuration for transform_type and runs the shared N2 core with a
 * 64 * 32 entry stack scratch buffer. */
void svt_av1_fwd_txfm2d_64x32_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                   TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[64 * 32];

    av1_transform_config(transform_type, TX_64X32, &flip_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
5945 
/* Half-size ("N2") forward 2-D transform entry point for TX_32X64: builds the
 * flip configuration for transform_type and runs the shared N2 core with a
 * 32 * 64 entry stack scratch buffer. */
void svt_av1_fwd_txfm2d_32x64_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                   TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[32 * 64];

    av1_transform_config(transform_type, TX_32X64, &flip_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
5954 
/* Half-size ("N2") forward 2-D transform entry point for TX_64X16: builds the
 * flip configuration for transform_type and runs the shared N2 core with a
 * 64 * 16 entry stack scratch buffer. */
void svt_av1_fwd_txfm2d_64x16_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                   TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[64 * 16];

    av1_transform_config(transform_type, TX_64X16, &flip_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
5963 
/* Half-size ("N2") forward 2-D transform entry point for TX_16X64: builds the
 * flip configuration for transform_type and runs the shared N2 core with a
 * 16 * 64 entry stack scratch buffer. */
void svt_av1_fwd_txfm2d_16x64_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                   TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[16 * 64];

    av1_transform_config(transform_type, TX_16X64, &flip_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
5972 
/* Half-size ("N2") forward 2-D transform entry point for TX_32X16: builds the
 * flip configuration for transform_type and runs the shared N2 core with a
 * 32 * 16 entry stack scratch buffer. */
void svt_av1_fwd_txfm2d_32x16_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                   TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[32 * 16];

    av1_transform_config(transform_type, TX_32X16, &flip_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
5981 
/* Half-size ("N2") forward 2-D transform entry point for TX_16X32: builds the
 * flip configuration for transform_type and runs the shared N2 core with a
 * 16 * 32 entry stack scratch buffer. */
void svt_av1_fwd_txfm2d_16x32_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                   TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[16 * 32];

    av1_transform_config(transform_type, TX_16X32, &flip_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
5990 
/* Half-size ("N2") forward 2-D transform entry point for TX_16X8: builds the
 * flip configuration for transform_type and runs the shared N2 core with a
 * 16 * 8 entry stack scratch buffer. */
void svt_av1_fwd_txfm2d_16x8_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                  TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[16 * 8];

    av1_transform_config(transform_type, TX_16X8, &flip_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
5999 
/* Half-size ("N2") forward 2-D transform entry point for TX_8X16: builds the
 * flip configuration for transform_type and runs the shared N2 core with an
 * 8 * 16 entry stack scratch buffer. */
void svt_av1_fwd_txfm2d_8x16_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                  TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[8 * 16];

    av1_transform_config(transform_type, TX_8X16, &flip_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
6008 
/* Half-size ("N2") forward 2-D transform entry point for TX_32X8: builds the
 * flip configuration for transform_type and runs the shared N2 core with a
 * 32 * 8 entry stack scratch buffer. */
void svt_av1_fwd_txfm2d_32x8_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                  TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[32 * 8];

    av1_transform_config(transform_type, TX_32X8, &flip_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
6017 
/* Half-size ("N2") forward 2-D transform entry point for TX_8X32: builds the
 * flip configuration for transform_type and runs the shared N2 core with an
 * 8 * 32 entry stack scratch buffer. */
void svt_av1_fwd_txfm2d_8x32_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                  TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[8 * 32];

    av1_transform_config(transform_type, TX_8X32, &flip_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
6026 
/* Half-size ("N2") forward 2-D transform entry point for TX_16X4: builds the
 * flip configuration for transform_type and runs the shared N2 core with a
 * 16 * 4 entry stack scratch buffer. */
void svt_av1_fwd_txfm2d_16x4_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                  TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[16 * 4];

    av1_transform_config(transform_type, TX_16X4, &flip_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
6035 
/* Half-size ("N2") forward 2-D transform entry point for TX_4X16: builds the
 * flip configuration for transform_type and runs the shared N2 core with a
 * 4 * 16 entry stack scratch buffer. */
void svt_av1_fwd_txfm2d_4x16_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                  TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[4 * 16];

    av1_transform_config(transform_type, TX_4X16, &flip_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
6044 
/* Half-size ("N2") forward 2-D transform entry point for TX_8X4: builds the
 * flip configuration for transform_type and runs the shared N2 core with an
 * 8 * 4 entry stack scratch buffer. */
void svt_av1_fwd_txfm2d_8x4_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                 TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[8 * 4];

    av1_transform_config(transform_type, TX_8X4, &flip_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
6053 
/* Half-size ("N2") forward 2-D transform entry point for TX_4X8: builds the
 * flip configuration for transform_type and runs the shared N2 core with a
 * 4 * 8 entry stack scratch buffer. */
void svt_av1_fwd_txfm2d_4x8_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                 TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[4 * 8];

    av1_transform_config(transform_type, TX_4X8, &flip_cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
6062 
/* Reduced ("N4") 4-point forward DCT: produces output[0] only.
 *
 * The four inputs are folded into two symmetric sums which are then combined
 * by half_btf() with the cospi[32] weight on both terms. output[1..3] are not
 * written. stage_range is unused. */
void svt_av1_fdct4_new_N4(const int32_t *input, int32_t *output, int8_t cos_bit,
                          const int8_t *stage_range) {
    (void)stage_range;
    const int32_t *const cos_table = cospi_arr(cos_bit);

    // stage 1: pair the outer and the inner samples
    const int32_t outer_sum = input[0] + input[3];
    const int32_t inner_sum = input[1] + input[2];

    // final stage: first output only
    const int32_t w32 = cos_table[32];
    output[0]         = half_btf(w32, outer_sum, w32, inner_sum, cos_bit);
}
6075 
/* Reduced ("N4") 4-point forward ADST: produces output[0] only.
 *
 * When all four inputs are zero, output[0..3] are cleared and the function
 * returns early. Otherwise output[0] is the round-shifted (by cos_bit) sum of
 * the inputs weighted by sinpi[1], sinpi[2], sinpi[3] and sinpi[4], and
 * output[1..3] are left untouched. stage_range is unused. */
void svt_av1_fadst4_new_N4(const int32_t *input, int32_t *output, int8_t cos_bit,
                           const int8_t *stage_range) {
    (void)stage_range;
    const int32_t        shift     = cos_bit;
    const int32_t *const sin_table = sinpi_arr(shift);

    const int32_t in0 = input[0];
    const int32_t in1 = input[1];
    const int32_t in2 = input[2];
    const int32_t in3 = input[3];

    // All-zero input: nothing to compute, clear the four outputs.
    if ((in0 | in1 | in2 | in3) == 0) {
        output[3] = 0;
        output[2] = 0;
        output[1] = 0;
        output[0] = 0;
        return;
    }

    // Weighted sum for the first coefficient; terms are accumulated in the
    // order in0, in1, in3, in2.
    // 1-D transform scaling factor is sqrt(2).
    const int32_t acc = sin_table[1] * in0 + sin_table[2] * in1 + sin_table[4] * in3 +
        sin_table[3] * in2;
    output[0] = round_shift(acc, shift);
}
6113 
svt_av1_fidentity4_N4_c(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)6114 void svt_av1_fidentity4_N4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
6115                              const int8_t *stage_range) {
6116     (void)stage_range;
6117     (void)cos_bit;
6118     output[0] = round_shift((int64_t)input[0] * new_sqrt2, new_sqrt2_bits);
6119     assert(stage_range[0] + new_sqrt2_bits <= 32);
6120 }
6121 
/*
 * Reduced ("N4") 8-point forward DCT butterfly: only output[0] and output[1]
 * are final results.
 *
 * input       - 8 samples; all eight are read in stage 1.
 * output      - receives the two results in output[0..1]. The whole
 *               output[0..7] range is used as scratch by the stages, so
 *               output[2..7] are left holding intermediate values.
 * cos_bit     - selects the cosine table via cospi_arr() and is passed to
 *               every half_btf() call.
 * stage_range - unused.
 *
 * The stages ping-pong between `output` and the local `step` array; entries
 * that do not feed the two retained outputs are simply not computed.
 */
void svt_av1_fdct8_new_N4(const int32_t *input, int32_t *output, int8_t cos_bit,
                          const int8_t *stage_range) {
    (void)stage_range;
    const int32_t *cospi;

    int32_t *bf0, *bf1;
    int32_t  step[8];

    // stage 0;

    // stage 1; full first butterfly: mirrored sums in [0..3], differences in [4..7]
    bf1    = output;
    bf1[0] = input[0] + input[7];
    bf1[1] = input[1] + input[6];
    bf1[2] = input[2] + input[5];
    bf1[3] = input[3] + input[4];
    bf1[4] = -input[4] + input[3];
    bf1[5] = -input[5] + input[2];
    bf1[6] = -input[6] + input[1];
    bf1[7] = -input[7] + input[0];

    // stage 2 (step[2] and step[3] are not needed and are left unset)
    cospi  = cospi_arr(cos_bit);
    bf0    = output;
    bf1    = step;
    bf1[0] = bf0[0] + bf0[3];
    bf1[1] = bf0[1] + bf0[2];
    bf1[4] = bf0[4];
    bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
    bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
    bf1[7] = bf0[7];

    // stage 3 (only entries 0, 4 and 7 are produced)
    bf0    = step;
    bf1    = output;
    bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
    bf1[4] = bf0[4] + bf0[5];
    bf1[7] = bf0[7] + bf0[6];

    // stage 4 (only entries 0 and 4 are produced)
    bf0    = output;
    bf1    = step;
    bf1[0] = bf0[0];
    bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);

    // stage 5: move the two retained results into output[0..1]
    bf0    = step;
    bf1    = output;
    bf1[0] = bf0[0];
    bf1[1] = bf0[4];
}
6173 
/*
 * Reduced ("N4") 8-point forward ADST butterfly: only output[0] and output[1]
 * are final results.
 *
 * input       - 8 samples; all eight are read in stage 1. Must not alias
 *               `output` (asserted).
 * output      - receives the two results in output[0..1]. The whole
 *               output[0..7] range is used as scratch by the stages, so
 *               output[2..7] are left holding intermediate values.
 * cos_bit     - selects the cosine table via cospi_arr() and is passed to
 *               every half_btf() call.
 * stage_range - unused.
 *
 * The stages ping-pong between `output` and the local `step` array. Stages
 * 1-4 are computed in full; from stage 5 on only the entries feeding the two
 * retained outputs are computed.
 */
void svt_av1_fadst8_new_N4(const int32_t *input, int32_t *output, int8_t cos_bit,
                           const int8_t *stage_range) {
    (void)stage_range;
    const int32_t *cospi;

    int32_t *bf0, *bf1;
    int32_t  step[8];

    // stage 0;

    // stage 1; permute and negate the inputs into the scratch area
    assert(output != input);
    bf1    = output;
    bf1[0] = input[0];
    bf1[1] = -input[7];
    bf1[2] = -input[3];
    bf1[3] = input[4];
    bf1[4] = -input[1];
    bf1[5] = input[6];
    bf1[6] = input[2];
    bf1[7] = -input[5];

    // stage 2
    cospi  = cospi_arr(cos_bit);
    bf0    = output;
    bf1    = step;
    bf1[0] = bf0[0];
    bf1[1] = bf0[1];
    bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
    bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
    bf1[4] = bf0[4];
    bf1[5] = bf0[5];
    bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
    bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);

    // stage 3
    bf0    = step;
    bf1    = output;
    bf1[0] = bf0[0] + bf0[2];
    bf1[1] = bf0[1] + bf0[3];
    bf1[2] = bf0[0] - bf0[2];
    bf1[3] = bf0[1] - bf0[3];
    bf1[4] = bf0[4] + bf0[6];
    bf1[5] = bf0[5] + bf0[7];
    bf1[6] = bf0[4] - bf0[6];
    bf1[7] = bf0[5] - bf0[7];

    // stage 4
    bf0    = output;
    bf1    = step;
    bf1[0] = bf0[0];
    bf1[1] = bf0[1];
    bf1[2] = bf0[2];
    bf1[3] = bf0[3];
    bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
    bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
    bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
    bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);

    // stage 5 (only entries 0, 1, 6 and 7 are produced)
    bf0    = step;
    bf1    = output;
    bf1[0] = bf0[0] + bf0[4];
    bf1[1] = bf0[1] + bf0[5];
    bf1[6] = bf0[2] - bf0[6];
    bf1[7] = bf0[3] - bf0[7];

    // stage 6 (only entries 1 and 6 are produced)
    bf0    = output;
    bf1    = step;
    bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit);
    bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit);

    // stage 7: move the two retained results into output[0..1]
    bf0    = step;
    bf1    = output;
    bf1[0] = bf0[1];
    bf1[1] = bf0[6];
}
6253 
/* Reduced ("N4") 8-point forward identity kernel: doubles the first two
 * inputs into output[0..1] and leaves output[2..7] untouched.
 * cos_bit and stage_range are unused. */
void svt_av1_fidentity8_N4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                             const int8_t *stage_range) {
    (void)cos_bit;
    (void)stage_range;

    const int32_t *src = input;
    int32_t       *dst = output;
    for (const int32_t *const src_end = input + 2; src != src_end; ++src, ++dst) {
        *dst = *src * 2;
    }
}
6260 
/*
 * Reduced ("N4") 16-point forward DCT butterfly: only output[0..3] are final
 * results.
 *
 * input       - 16 samples; all sixteen are read in stage 1.
 * output      - receives the four results in output[0..3]. The whole
 *               output[0..15] range is used as scratch by the stages, so
 *               output[4..15] are left holding intermediate values.
 * cos_bit     - selects the cosine table via cospi_arr() and is passed to
 *               every half_btf() call.
 * stage_range - unused.
 *
 * The stages ping-pong between `output` and the local `step` array. Stages
 * 1-2 are computed in full; from stage 3 on, entries that do not feed the four
 * retained outputs are skipped.
 */
void svt_av1_fdct16_new_N4(const int32_t *input, int32_t *output, int8_t cos_bit,
                           const int8_t *stage_range) {
    (void)stage_range;
    const int32_t *cospi;

    int32_t *bf0, *bf1;
    int32_t  step[16];

    // stage 0;

    // stage 1; full first butterfly: mirrored sums in [0..7], differences in [8..15]
    bf1     = output;
    bf1[0]  = input[0] + input[15];
    bf1[1]  = input[1] + input[14];
    bf1[2]  = input[2] + input[13];
    bf1[3]  = input[3] + input[12];
    bf1[4]  = input[4] + input[11];
    bf1[5]  = input[5] + input[10];
    bf1[6]  = input[6] + input[9];
    bf1[7]  = input[7] + input[8];
    bf1[8]  = -input[8] + input[7];
    bf1[9]  = -input[9] + input[6];
    bf1[10] = -input[10] + input[5];
    bf1[11] = -input[11] + input[4];
    bf1[12] = -input[12] + input[3];
    bf1[13] = -input[13] + input[2];
    bf1[14] = -input[14] + input[1];
    bf1[15] = -input[15] + input[0];

    // stage 2
    cospi   = cospi_arr(cos_bit);
    bf0     = output;
    bf1     = step;
    bf1[0]  = bf0[0] + bf0[7];
    bf1[1]  = bf0[1] + bf0[6];
    bf1[2]  = bf0[2] + bf0[5];
    bf1[3]  = bf0[3] + bf0[4];
    bf1[4]  = -bf0[4] + bf0[3];
    bf1[5]  = -bf0[5] + bf0[2];
    bf1[6]  = -bf0[6] + bf0[1];
    bf1[7]  = -bf0[7] + bf0[0];
    bf1[8]  = bf0[8];
    bf1[9]  = bf0[9];
    bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
    bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
    bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
    bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
    bf1[14] = bf0[14];
    bf1[15] = bf0[15];

    // stage 3 (entries 2 and 3 are skipped)
    bf0     = step;
    bf1     = output;
    bf1[0]  = bf0[0] + bf0[3];
    bf1[1]  = bf0[1] + bf0[2];
    bf1[4]  = bf0[4];
    bf1[5]  = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
    bf1[6]  = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
    bf1[7]  = bf0[7];
    bf1[8]  = bf0[8] + bf0[11];
    bf1[9]  = bf0[9] + bf0[10];
    bf1[10] = -bf0[10] + bf0[9];
    bf1[11] = -bf0[11] + bf0[8];
    bf1[12] = -bf0[12] + bf0[15];
    bf1[13] = -bf0[13] + bf0[14];
    bf1[14] = bf0[14] + bf0[13];
    bf1[15] = bf0[15] + bf0[12];

    // stage 4 (entries 1, 2, 3, 5 and 6 are skipped)
    bf0     = output;
    bf1     = step;
    bf1[0]  = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
    bf1[4]  = bf0[4] + bf0[5];
    bf1[7]  = bf0[7] + bf0[6];
    bf1[8]  = bf0[8];
    bf1[9]  = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
    bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
    bf1[11] = bf0[11];
    bf1[12] = bf0[12];
    bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
    bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
    bf1[15] = bf0[15];

    // stage 5 (only entries 0, 4, 8, 11, 12 and 15 are produced)
    bf0     = step;
    bf1     = output;
    bf1[0]  = bf0[0];
    bf1[4]  = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
    bf1[8]  = bf0[8] + bf0[9];
    bf1[11] = bf0[11] + bf0[10];
    bf1[12] = bf0[12] + bf0[13];
    bf1[15] = bf0[15] + bf0[14];

    // stage 6 (only entries 0, 4, 8 and 12 are produced)
    bf0     = output;
    bf1     = step;
    bf1[0]  = bf0[0];
    bf1[4]  = bf0[4];
    bf1[8]  = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
    bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);

    // stage 7: move the four retained results into output[0..3]
    bf0    = step;
    bf1    = output;
    bf1[0] = bf0[0];
    bf1[1] = bf0[8];
    bf1[2] = bf0[4];
    bf1[3] = bf0[12];
}
6370 
/*
 * Reduced ("N4") 16-point forward ADST butterfly: only output[0..3] are final
 * results.
 *
 * input       - 16 samples; all sixteen are read in stage 1. Must not alias
 *               `output` (asserted).
 * output      - receives the four results in output[0..3]. The whole
 *               output[0..15] range is used as scratch by the stages, so
 *               output[4..15] are left holding intermediate values.
 * cos_bit     - selects the cosine table via cospi_arr() and is passed to
 *               every half_btf() call.
 * stage_range - unused.
 *
 * The stages ping-pong between `output` and the local `step` array. Stages
 * 1-6 are computed in full; from stage 7 on only the entries feeding the four
 * retained outputs are computed.
 */
void svt_av1_fadst16_new_N4(const int32_t *input, int32_t *output, int8_t cos_bit,
                            const int8_t *stage_range) {
    (void)stage_range;
    const int32_t *cospi;

    int32_t *bf0, *bf1;
    int32_t  step[16];

    // stage 0;

    // stage 1; permute and negate the inputs into the scratch area
    assert(output != input);
    bf1     = output;
    bf1[0]  = input[0];
    bf1[1]  = -input[15];
    bf1[2]  = -input[7];
    bf1[3]  = input[8];
    bf1[4]  = -input[3];
    bf1[5]  = input[12];
    bf1[6]  = input[4];
    bf1[7]  = -input[11];
    bf1[8]  = -input[1];
    bf1[9]  = input[14];
    bf1[10] = input[6];
    bf1[11] = -input[9];
    bf1[12] = input[2];
    bf1[13] = -input[13];
    bf1[14] = -input[5];
    bf1[15] = input[10];

    // stage 2
    cospi   = cospi_arr(cos_bit);
    bf0     = output;
    bf1     = step;
    bf1[0]  = bf0[0];
    bf1[1]  = bf0[1];
    bf1[2]  = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
    bf1[3]  = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
    bf1[4]  = bf0[4];
    bf1[5]  = bf0[5];
    bf1[6]  = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
    bf1[7]  = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
    bf1[8]  = bf0[8];
    bf1[9]  = bf0[9];
    bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit);
    bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit);
    bf1[12] = bf0[12];
    bf1[13] = bf0[13];
    bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit);
    bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit);

    // stage 3
    bf0     = step;
    bf1     = output;
    bf1[0]  = bf0[0] + bf0[2];
    bf1[1]  = bf0[1] + bf0[3];
    bf1[2]  = bf0[0] - bf0[2];
    bf1[3]  = bf0[1] - bf0[3];
    bf1[4]  = bf0[4] + bf0[6];
    bf1[5]  = bf0[5] + bf0[7];
    bf1[6]  = bf0[4] - bf0[6];
    bf1[7]  = bf0[5] - bf0[7];
    bf1[8]  = bf0[8] + bf0[10];
    bf1[9]  = bf0[9] + bf0[11];
    bf1[10] = bf0[8] - bf0[10];
    bf1[11] = bf0[9] - bf0[11];
    bf1[12] = bf0[12] + bf0[14];
    bf1[13] = bf0[13] + bf0[15];
    bf1[14] = bf0[12] - bf0[14];
    bf1[15] = bf0[13] - bf0[15];

    // stage 4
    bf0     = output;
    bf1     = step;
    bf1[0]  = bf0[0];
    bf1[1]  = bf0[1];
    bf1[2]  = bf0[2];
    bf1[3]  = bf0[3];
    bf1[4]  = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
    bf1[5]  = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
    bf1[6]  = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
    bf1[7]  = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
    bf1[8]  = bf0[8];
    bf1[9]  = bf0[9];
    bf1[10] = bf0[10];
    bf1[11] = bf0[11];
    bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit);
    bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit);
    bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit);
    bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit);

    // stage 5
    bf0     = step;
    bf1     = output;
    bf1[0]  = bf0[0] + bf0[4];
    bf1[1]  = bf0[1] + bf0[5];
    bf1[2]  = bf0[2] + bf0[6];
    bf1[3]  = bf0[3] + bf0[7];
    bf1[4]  = bf0[0] - bf0[4];
    bf1[5]  = bf0[1] - bf0[5];
    bf1[6]  = bf0[2] - bf0[6];
    bf1[7]  = bf0[3] - bf0[7];
    bf1[8]  = bf0[8] + bf0[12];
    bf1[9]  = bf0[9] + bf0[13];
    bf1[10] = bf0[10] + bf0[14];
    bf1[11] = bf0[11] + bf0[15];
    bf1[12] = bf0[8] - bf0[12];
    bf1[13] = bf0[9] - bf0[13];
    bf1[14] = bf0[10] - bf0[14];
    bf1[15] = bf0[11] - bf0[15];

    // stage 6
    bf0     = output;
    bf1     = step;
    bf1[0]  = bf0[0];
    bf1[1]  = bf0[1];
    bf1[2]  = bf0[2];
    bf1[3]  = bf0[3];
    bf1[4]  = bf0[4];
    bf1[5]  = bf0[5];
    bf1[6]  = bf0[6];
    bf1[7]  = bf0[7];
    bf1[8]  = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit);
    bf1[9]  = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit);
    bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit);
    bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit);
    bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit);
    bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit);
    bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit);
    bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit);

    // stage 7 (only entries 0-3 and 12-15 are produced)
    bf0     = step;
    bf1     = output;
    bf1[0]  = bf0[0] + bf0[8];
    bf1[1]  = bf0[1] + bf0[9];
    bf1[2]  = bf0[2] + bf0[10];
    bf1[3]  = bf0[3] + bf0[11];
    bf1[12] = bf0[4] - bf0[12];
    bf1[13] = bf0[5] - bf0[13];
    bf1[14] = bf0[6] - bf0[14];
    bf1[15] = bf0[7] - bf0[15];

    // stage 8 (only entries 1, 3, 12 and 14 are produced)
    bf0     = output;
    bf1     = step;
    bf1[1]  = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit);
    bf1[3]  = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit);
    bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit);
    bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit);

    // stage 9: move the four retained results into output[0..3]
    bf0    = step;
    bf1    = output;
    bf1[0] = bf0[1];
    bf1[1] = bf0[14];
    bf1[2] = bf0[3];
    bf1[3] = bf0[12];
}
6530 
svt_av1_fidentity16_N4_c(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)6531 void svt_av1_fidentity16_N4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
6532                               const int8_t *stage_range) {
6533     (void)stage_range;
6534     (void)cos_bit;
6535     for (int32_t i = 0; i < 4; ++i)
6536         output[i] = round_shift((int64_t)input[i] * 2 * new_sqrt2, new_sqrt2_bits);
6537     assert(stage_range[0] + new_sqrt2_bits <= 32);
6538 }
6539 
/*
 * Quarter-size ("N4") variant of the 32-point forward butterfly network
 * (a forward DCT, going by the function name).
 *
 * The network ping-pongs between the caller's output buffer and a local
 * 32-entry scratch array. From stage 4 onwards only the values that feed the
 * first eight final coefficients are evaluated.
 *
 * input       : 32 source samples (all are read in stage 1).
 * output      : must hold 32 elements; it is used as working storage.
 *               On return output[0..7] hold the results, while output[8..31]
 *               are left with intermediate values.
 * cos_bit     : selects the cosine table via cospi_arr() and is forwarded to
 *               every half_btf() call.
 * stage_range : unused.
 */
void svt_av1_fdct32_new_N4(const int32_t *input, int32_t *output, int8_t cos_bit,
                           const int8_t *stage_range) {
    (void)stage_range;

    int32_t step[32];
    int32_t k;

    // stage 1: input -> output, mirrored sums then mirrored differences.
    // The two loops keep the original write order (0..31).
    for (k = 0; k < 16; ++k) { output[k] = input[k] + input[31 - k]; }
    for (k = 16; k < 32; ++k) { output[k] = -input[k] + input[31 - k]; }

    // The cosine table is looked up only after stage 1, as in the unrolled form.
    const int32_t *const cospi = cospi_arr(cos_bit);

    // stage 2: output -> step
    for (k = 0; k < 8; ++k) { step[k] = output[k] + output[15 - k]; }
    for (k = 8; k < 16; ++k) { step[k] = -output[k] + output[15 - k]; }
    for (k = 16; k < 20; ++k) { step[k] = output[k]; }
    for (k = 20; k < 24; ++k) {
        step[k] = half_btf(-cospi[32], output[k], cospi[32], output[47 - k], cos_bit);
    }
    for (k = 24; k < 28; ++k) {
        step[k] = half_btf(cospi[32], output[k], cospi[32], output[47 - k], cos_bit);
    }
    for (k = 28; k < 32; ++k) { step[k] = output[k]; }

    // stage 3: step -> output
    for (k = 0; k < 4; ++k) { output[k] = step[k] + step[7 - k]; }
    for (k = 4; k < 8; ++k) { output[k] = -step[k] + step[7 - k]; }
    output[8] = step[8];
    output[9] = step[9];
    for (k = 10; k < 12; ++k) {
        output[k] = half_btf(-cospi[32], step[k], cospi[32], step[23 - k], cos_bit);
    }
    for (k = 12; k < 14; ++k) {
        output[k] = half_btf(cospi[32], step[k], cospi[32], step[23 - k], cos_bit);
    }
    output[14] = step[14];
    output[15] = step[15];
    for (k = 16; k < 20; ++k) { output[k] = step[k] + step[39 - k]; }
    for (k = 20; k < 24; ++k) { output[k] = -step[k] + step[39 - k]; }
    for (k = 24; k < 28; ++k) { output[k] = -step[k] + step[55 - k]; }
    for (k = 28; k < 32; ++k) { output[k] = step[k] + step[55 - k]; }

    // stage 4: output -> step (entries 2 and 3 are no longer needed)
    step[0]  = output[0] + output[3];
    step[1]  = output[1] + output[2];
    step[4]  = output[4];
    step[5]  = half_btf(-cospi[32], output[5], cospi[32], output[6], cos_bit);
    step[6]  = half_btf(cospi[32], output[6], cospi[32], output[5], cos_bit);
    step[7]  = output[7];
    step[8]  = output[8] + output[11];
    step[9]  = output[9] + output[10];
    step[10] = -output[10] + output[9];
    step[11] = -output[11] + output[8];
    step[12] = -output[12] + output[15];
    step[13] = -output[13] + output[14];
    step[14] = output[14] + output[13];
    step[15] = output[15] + output[12];
    step[16] = output[16];
    step[17] = output[17];
    step[18] = half_btf(-cospi[16], output[18], cospi[48], output[29], cos_bit);
    step[19] = half_btf(-cospi[16], output[19], cospi[48], output[28], cos_bit);
    step[20] = half_btf(-cospi[48], output[20], -cospi[16], output[27], cos_bit);
    step[21] = half_btf(-cospi[48], output[21], -cospi[16], output[26], cos_bit);
    step[22] = output[22];
    step[23] = output[23];
    step[24] = output[24];
    step[25] = output[25];
    step[26] = half_btf(cospi[48], output[26], -cospi[16], output[21], cos_bit);
    step[27] = half_btf(cospi[48], output[27], -cospi[16], output[20], cos_bit);
    step[28] = half_btf(cospi[16], output[28], cospi[48], output[19], cos_bit);
    step[29] = half_btf(cospi[16], output[29], cospi[48], output[18], cos_bit);
    step[30] = output[30];
    step[31] = output[31];

    // stage 5: step -> output
    output[0]  = half_btf(cospi[32], step[0], cospi[32], step[1], cos_bit);
    output[4]  = step[4] + step[5];
    output[7]  = step[7] + step[6];
    output[8]  = step[8];
    output[9]  = half_btf(-cospi[16], step[9], cospi[48], step[14], cos_bit);
    output[10] = half_btf(-cospi[48], step[10], -cospi[16], step[13], cos_bit);
    output[11] = step[11];
    output[12] = step[12];
    output[13] = half_btf(cospi[48], step[13], -cospi[16], step[10], cos_bit);
    output[14] = half_btf(cospi[16], step[14], cospi[48], step[9], cos_bit);
    output[15] = step[15];
    output[16] = step[16] + step[19];
    output[17] = step[17] + step[18];
    output[18] = -step[18] + step[17];
    output[19] = -step[19] + step[16];
    output[20] = -step[20] + step[23];
    output[21] = -step[21] + step[22];
    output[22] = step[22] + step[21];
    output[23] = step[23] + step[20];
    output[24] = step[24] + step[27];
    output[25] = step[25] + step[26];
    output[26] = -step[26] + step[25];
    output[27] = -step[27] + step[24];
    output[28] = -step[28] + step[31];
    output[29] = -step[29] + step[30];
    output[30] = step[30] + step[29];
    output[31] = step[31] + step[28];

    // stage 6: output -> step
    step[0]  = output[0];
    step[4]  = half_btf(cospi[56], output[4], cospi[8], output[7], cos_bit);
    step[8]  = output[8] + output[9];
    step[11] = output[11] + output[10];
    step[12] = output[12] + output[13];
    step[15] = output[15] + output[14];
    step[16] = output[16];
    step[17] = half_btf(-cospi[8], output[17], cospi[56], output[30], cos_bit);
    step[18] = half_btf(-cospi[56], output[18], -cospi[8], output[29], cos_bit);
    step[19] = output[19];
    step[20] = output[20];
    step[21] = half_btf(-cospi[40], output[21], cospi[24], output[26], cos_bit);
    step[22] = half_btf(-cospi[24], output[22], -cospi[40], output[25], cos_bit);
    step[23] = output[23];
    step[24] = output[24];
    step[25] = half_btf(cospi[24], output[25], -cospi[40], output[22], cos_bit);
    step[26] = half_btf(cospi[40], output[26], cospi[24], output[21], cos_bit);
    step[27] = output[27];
    step[28] = output[28];
    step[29] = half_btf(cospi[56], output[29], -cospi[8], output[18], cos_bit);
    step[30] = half_btf(cospi[8], output[30], cospi[56], output[17], cos_bit);
    step[31] = output[31];

    // stage 7: step -> output
    output[0]  = step[0];
    output[4]  = step[4];
    output[8]  = half_btf(cospi[60], step[8], cospi[4], step[15], cos_bit);
    output[12] = half_btf(cospi[12], step[12], -cospi[52], step[11], cos_bit);
    output[16] = step[16] + step[17];
    output[19] = step[19] + step[18];
    output[20] = step[20] + step[21];
    output[23] = step[23] + step[22];
    output[24] = step[24] + step[25];
    output[27] = step[27] + step[26];
    output[28] = step[28] + step[29];
    output[31] = step[31] + step[30];

    // stage 8: output -> step
    step[0]  = output[0];
    step[4]  = output[4];
    step[8]  = output[8];
    step[12] = output[12];
    step[16] = half_btf(cospi[62], output[16], cospi[2], output[31], cos_bit);
    step[20] = half_btf(cospi[54], output[20], cospi[10], output[27], cos_bit);
    step[24] = half_btf(cospi[6], output[24], -cospi[58], output[23], cos_bit);
    step[28] = half_btf(cospi[14], output[28], -cospi[50], output[19], cos_bit);

    // stage 9: step -> output, gathering the eight retained coefficients.
    static const uint8_t final_src[8] = {0, 16, 8, 24, 4, 20, 12, 28};
    for (k = 0; k < 8; ++k) { output[k] = step[final_src[k]]; }
}
6789 
/*
 * Quarter-size ("N4") variant of the 32-point identity transform.
 *
 * Writes output[0..7] = input[0..7] * 4 and leaves the rest of output
 * untouched.
 *
 * cos_bit and stage_range are unused; they exist for signature parity with
 * the other transform kernels.
 */
void svt_av1_fidentity32_N4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                              const int8_t *stage_range) {
    (void)cos_bit;
    (void)stage_range;

    enum { kept_coeffs = 8, gain = 4 };

    const int32_t *src = input;
    int32_t       *dst = output;
    for (int32_t left = kept_coeffs; left > 0; --left) { *dst++ = *src++ * gain; }
}
6796 
svt_av1_fdct64_new_N4(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)6797 void svt_av1_fdct64_new_N4(const int32_t *input, int32_t *output, int8_t cos_bit,
6798                            const int8_t *stage_range) {
6799     (void)stage_range;
6800     const int32_t *cospi;
6801 
6802     int32_t *bf0, *bf1;
6803     int32_t  step[64];
6804 
6805     // stage 0;
6806 
6807     // stage 1;
6808     bf1     = output;
6809     bf1[0]  = input[0] + input[63];
6810     bf1[1]  = input[1] + input[62];
6811     bf1[2]  = input[2] + input[61];
6812     bf1[3]  = input[3] + input[60];
6813     bf1[4]  = input[4] + input[59];
6814     bf1[5]  = input[5] + input[58];
6815     bf1[6]  = input[6] + input[57];
6816     bf1[7]  = input[7] + input[56];
6817     bf1[8]  = input[8] + input[55];
6818     bf1[9]  = input[9] + input[54];
6819     bf1[10] = input[10] + input[53];
6820     bf1[11] = input[11] + input[52];
6821     bf1[12] = input[12] + input[51];
6822     bf1[13] = input[13] + input[50];
6823     bf1[14] = input[14] + input[49];
6824     bf1[15] = input[15] + input[48];
6825     bf1[16] = input[16] + input[47];
6826     bf1[17] = input[17] + input[46];
6827     bf1[18] = input[18] + input[45];
6828     bf1[19] = input[19] + input[44];
6829     bf1[20] = input[20] + input[43];
6830     bf1[21] = input[21] + input[42];
6831     bf1[22] = input[22] + input[41];
6832     bf1[23] = input[23] + input[40];
6833     bf1[24] = input[24] + input[39];
6834     bf1[25] = input[25] + input[38];
6835     bf1[26] = input[26] + input[37];
6836     bf1[27] = input[27] + input[36];
6837     bf1[28] = input[28] + input[35];
6838     bf1[29] = input[29] + input[34];
6839     bf1[30] = input[30] + input[33];
6840     bf1[31] = input[31] + input[32];
6841     bf1[32] = -input[32] + input[31];
6842     bf1[33] = -input[33] + input[30];
6843     bf1[34] = -input[34] + input[29];
6844     bf1[35] = -input[35] + input[28];
6845     bf1[36] = -input[36] + input[27];
6846     bf1[37] = -input[37] + input[26];
6847     bf1[38] = -input[38] + input[25];
6848     bf1[39] = -input[39] + input[24];
6849     bf1[40] = -input[40] + input[23];
6850     bf1[41] = -input[41] + input[22];
6851     bf1[42] = -input[42] + input[21];
6852     bf1[43] = -input[43] + input[20];
6853     bf1[44] = -input[44] + input[19];
6854     bf1[45] = -input[45] + input[18];
6855     bf1[46] = -input[46] + input[17];
6856     bf1[47] = -input[47] + input[16];
6857     bf1[48] = -input[48] + input[15];
6858     bf1[49] = -input[49] + input[14];
6859     bf1[50] = -input[50] + input[13];
6860     bf1[51] = -input[51] + input[12];
6861     bf1[52] = -input[52] + input[11];
6862     bf1[53] = -input[53] + input[10];
6863     bf1[54] = -input[54] + input[9];
6864     bf1[55] = -input[55] + input[8];
6865     bf1[56] = -input[56] + input[7];
6866     bf1[57] = -input[57] + input[6];
6867     bf1[58] = -input[58] + input[5];
6868     bf1[59] = -input[59] + input[4];
6869     bf1[60] = -input[60] + input[3];
6870     bf1[61] = -input[61] + input[2];
6871     bf1[62] = -input[62] + input[1];
6872     bf1[63] = -input[63] + input[0];
6873 
6874     // stage 2
6875     cospi   = cospi_arr(cos_bit);
6876     bf0     = output;
6877     bf1     = step;
6878     bf1[0]  = bf0[0] + bf0[31];
6879     bf1[1]  = bf0[1] + bf0[30];
6880     bf1[2]  = bf0[2] + bf0[29];
6881     bf1[3]  = bf0[3] + bf0[28];
6882     bf1[4]  = bf0[4] + bf0[27];
6883     bf1[5]  = bf0[5] + bf0[26];
6884     bf1[6]  = bf0[6] + bf0[25];
6885     bf1[7]  = bf0[7] + bf0[24];
6886     bf1[8]  = bf0[8] + bf0[23];
6887     bf1[9]  = bf0[9] + bf0[22];
6888     bf1[10] = bf0[10] + bf0[21];
6889     bf1[11] = bf0[11] + bf0[20];
6890     bf1[12] = bf0[12] + bf0[19];
6891     bf1[13] = bf0[13] + bf0[18];
6892     bf1[14] = bf0[14] + bf0[17];
6893     bf1[15] = bf0[15] + bf0[16];
6894     bf1[16] = -bf0[16] + bf0[15];
6895     bf1[17] = -bf0[17] + bf0[14];
6896     bf1[18] = -bf0[18] + bf0[13];
6897     bf1[19] = -bf0[19] + bf0[12];
6898     bf1[20] = -bf0[20] + bf0[11];
6899     bf1[21] = -bf0[21] + bf0[10];
6900     bf1[22] = -bf0[22] + bf0[9];
6901     bf1[23] = -bf0[23] + bf0[8];
6902     bf1[24] = -bf0[24] + bf0[7];
6903     bf1[25] = -bf0[25] + bf0[6];
6904     bf1[26] = -bf0[26] + bf0[5];
6905     bf1[27] = -bf0[27] + bf0[4];
6906     bf1[28] = -bf0[28] + bf0[3];
6907     bf1[29] = -bf0[29] + bf0[2];
6908     bf1[30] = -bf0[30] + bf0[1];
6909     bf1[31] = -bf0[31] + bf0[0];
6910     bf1[32] = bf0[32];
6911     bf1[33] = bf0[33];
6912     bf1[34] = bf0[34];
6913     bf1[35] = bf0[35];
6914     bf1[36] = bf0[36];
6915     bf1[37] = bf0[37];
6916     bf1[38] = bf0[38];
6917     bf1[39] = bf0[39];
6918     bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
6919     bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
6920     bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
6921     bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
6922     bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
6923     bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
6924     bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
6925     bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
6926     bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit);
6927     bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit);
6928     bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit);
6929     bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit);
6930     bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit);
6931     bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit);
6932     bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit);
6933     bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit);
6934     bf1[56] = bf0[56];
6935     bf1[57] = bf0[57];
6936     bf1[58] = bf0[58];
6937     bf1[59] = bf0[59];
6938     bf1[60] = bf0[60];
6939     bf1[61] = bf0[61];
6940     bf1[62] = bf0[62];
6941     bf1[63] = bf0[63];
6942 
6943     // stage 3
6944     cospi   = cospi_arr(cos_bit);
6945     bf0     = step;
6946     bf1     = output;
6947     bf1[0]  = bf0[0] + bf0[15];
6948     bf1[1]  = bf0[1] + bf0[14];
6949     bf1[2]  = bf0[2] + bf0[13];
6950     bf1[3]  = bf0[3] + bf0[12];
6951     bf1[4]  = bf0[4] + bf0[11];
6952     bf1[5]  = bf0[5] + bf0[10];
6953     bf1[6]  = bf0[6] + bf0[9];
6954     bf1[7]  = bf0[7] + bf0[8];
6955     bf1[8]  = -bf0[8] + bf0[7];
6956     bf1[9]  = -bf0[9] + bf0[6];
6957     bf1[10] = -bf0[10] + bf0[5];
6958     bf1[11] = -bf0[11] + bf0[4];
6959     bf1[12] = -bf0[12] + bf0[3];
6960     bf1[13] = -bf0[13] + bf0[2];
6961     bf1[14] = -bf0[14] + bf0[1];
6962     bf1[15] = -bf0[15] + bf0[0];
6963     bf1[16] = bf0[16];
6964     bf1[17] = bf0[17];
6965     bf1[18] = bf0[18];
6966     bf1[19] = bf0[19];
6967     bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
6968     bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
6969     bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
6970     bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
6971     bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
6972     bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
6973     bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
6974     bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
6975     bf1[28] = bf0[28];
6976     bf1[29] = bf0[29];
6977     bf1[30] = bf0[30];
6978     bf1[31] = bf0[31];
6979     bf1[32] = bf0[32] + bf0[47];
6980     bf1[33] = bf0[33] + bf0[46];
6981     bf1[34] = bf0[34] + bf0[45];
6982     bf1[35] = bf0[35] + bf0[44];
6983     bf1[36] = bf0[36] + bf0[43];
6984     bf1[37] = bf0[37] + bf0[42];
6985     bf1[38] = bf0[38] + bf0[41];
6986     bf1[39] = bf0[39] + bf0[40];
6987     bf1[40] = -bf0[40] + bf0[39];
6988     bf1[41] = -bf0[41] + bf0[38];
6989     bf1[42] = -bf0[42] + bf0[37];
6990     bf1[43] = -bf0[43] + bf0[36];
6991     bf1[44] = -bf0[44] + bf0[35];
6992     bf1[45] = -bf0[45] + bf0[34];
6993     bf1[46] = -bf0[46] + bf0[33];
6994     bf1[47] = -bf0[47] + bf0[32];
6995     bf1[48] = -bf0[48] + bf0[63];
6996     bf1[49] = -bf0[49] + bf0[62];
6997     bf1[50] = -bf0[50] + bf0[61];
6998     bf1[51] = -bf0[51] + bf0[60];
6999     bf1[52] = -bf0[52] + bf0[59];
7000     bf1[53] = -bf0[53] + bf0[58];
7001     bf1[54] = -bf0[54] + bf0[57];
7002     bf1[55] = -bf0[55] + bf0[56];
7003     bf1[56] = bf0[56] + bf0[55];
7004     bf1[57] = bf0[57] + bf0[54];
7005     bf1[58] = bf0[58] + bf0[53];
7006     bf1[59] = bf0[59] + bf0[52];
7007     bf1[60] = bf0[60] + bf0[51];
7008     bf1[61] = bf0[61] + bf0[50];
7009     bf1[62] = bf0[62] + bf0[49];
7010     bf1[63] = bf0[63] + bf0[48];
7011 
7012     // stage 4
7013     cospi   = cospi_arr(cos_bit);
7014     bf0     = output;
7015     bf1     = step;
7016     bf1[0]  = bf0[0] + bf0[7];
7017     bf1[1]  = bf0[1] + bf0[6];
7018     bf1[2]  = bf0[2] + bf0[5];
7019     bf1[3]  = bf0[3] + bf0[4];
7020     bf1[4]  = -bf0[4] + bf0[3];
7021     bf1[5]  = -bf0[5] + bf0[2];
7022     bf1[6]  = -bf0[6] + bf0[1];
7023     bf1[7]  = -bf0[7] + bf0[0];
7024     bf1[8]  = bf0[8];
7025     bf1[9]  = bf0[9];
7026     bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
7027     bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
7028     bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
7029     bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
7030     bf1[14] = bf0[14];
7031     bf1[15] = bf0[15];
7032     bf1[16] = bf0[16] + bf0[23];
7033     bf1[17] = bf0[17] + bf0[22];
7034     bf1[18] = bf0[18] + bf0[21];
7035     bf1[19] = bf0[19] + bf0[20];
7036     bf1[20] = -bf0[20] + bf0[19];
7037     bf1[21] = -bf0[21] + bf0[18];
7038     bf1[22] = -bf0[22] + bf0[17];
7039     bf1[23] = -bf0[23] + bf0[16];
7040     bf1[24] = -bf0[24] + bf0[31];
7041     bf1[25] = -bf0[25] + bf0[30];
7042     bf1[26] = -bf0[26] + bf0[29];
7043     bf1[27] = -bf0[27] + bf0[28];
7044     bf1[28] = bf0[28] + bf0[27];
7045     bf1[29] = bf0[29] + bf0[26];
7046     bf1[30] = bf0[30] + bf0[25];
7047     bf1[31] = bf0[31] + bf0[24];
7048     bf1[32] = bf0[32];
7049     bf1[33] = bf0[33];
7050     bf1[34] = bf0[34];
7051     bf1[35] = bf0[35];
7052     bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit);
7053     bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit);
7054     bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit);
7055     bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit);
7056     bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit);
7057     bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit);
7058     bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit);
7059     bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit);
7060     bf1[44] = bf0[44];
7061     bf1[45] = bf0[45];
7062     bf1[46] = bf0[46];
7063     bf1[47] = bf0[47];
7064     bf1[48] = bf0[48];
7065     bf1[49] = bf0[49];
7066     bf1[50] = bf0[50];
7067     bf1[51] = bf0[51];
7068     bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit);
7069     bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit);
7070     bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit);
7071     bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit);
7072     bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit);
7073     bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit);
7074     bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit);
7075     bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit);
7076     bf1[60] = bf0[60];
7077     bf1[61] = bf0[61];
7078     bf1[62] = bf0[62];
7079     bf1[63] = bf0[63];
7080 
7081     // stage 5
7082     cospi   = cospi_arr(cos_bit);
7083     bf0     = step;
7084     bf1     = output;
7085     bf1[0]  = bf0[0] + bf0[3];
7086     bf1[1]  = bf0[1] + bf0[2];
7087     bf1[4]  = bf0[4];
7088     bf1[5]  = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
7089     bf1[6]  = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
7090     bf1[7]  = bf0[7];
7091     bf1[8]  = bf0[8] + bf0[11];
7092     bf1[9]  = bf0[9] + bf0[10];
7093     bf1[10] = -bf0[10] + bf0[9];
7094     bf1[11] = -bf0[11] + bf0[8];
7095     bf1[12] = -bf0[12] + bf0[15];
7096     bf1[13] = -bf0[13] + bf0[14];
7097     bf1[14] = bf0[14] + bf0[13];
7098     bf1[15] = bf0[15] + bf0[12];
7099     bf1[16] = bf0[16];
7100     bf1[17] = bf0[17];
7101     bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
7102     bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
7103     bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
7104     bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
7105     bf1[22] = bf0[22];
7106     bf1[23] = bf0[23];
7107     bf1[24] = bf0[24];
7108     bf1[25] = bf0[25];
7109     bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
7110     bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
7111     bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
7112     bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
7113     bf1[30] = bf0[30];
7114     bf1[31] = bf0[31];
7115     bf1[32] = bf0[32] + bf0[39];
7116     bf1[33] = bf0[33] + bf0[38];
7117     bf1[34] = bf0[34] + bf0[37];
7118     bf1[35] = bf0[35] + bf0[36];
7119     bf1[36] = -bf0[36] + bf0[35];
7120     bf1[37] = -bf0[37] + bf0[34];
7121     bf1[38] = -bf0[38] + bf0[33];
7122     bf1[39] = -bf0[39] + bf0[32];
7123     bf1[40] = -bf0[40] + bf0[47];
7124     bf1[41] = -bf0[41] + bf0[46];
7125     bf1[42] = -bf0[42] + bf0[45];
7126     bf1[43] = -bf0[43] + bf0[44];
7127     bf1[44] = bf0[44] + bf0[43];
7128     bf1[45] = bf0[45] + bf0[42];
7129     bf1[46] = bf0[46] + bf0[41];
7130     bf1[47] = bf0[47] + bf0[40];
7131     bf1[48] = bf0[48] + bf0[55];
7132     bf1[49] = bf0[49] + bf0[54];
7133     bf1[50] = bf0[50] + bf0[53];
7134     bf1[51] = bf0[51] + bf0[52];
7135     bf1[52] = -bf0[52] + bf0[51];
7136     bf1[53] = -bf0[53] + bf0[50];
7137     bf1[54] = -bf0[54] + bf0[49];
7138     bf1[55] = -bf0[55] + bf0[48];
7139     bf1[56] = -bf0[56] + bf0[63];
7140     bf1[57] = -bf0[57] + bf0[62];
7141     bf1[58] = -bf0[58] + bf0[61];
7142     bf1[59] = -bf0[59] + bf0[60];
7143     bf1[60] = bf0[60] + bf0[59];
7144     bf1[61] = bf0[61] + bf0[58];
7145     bf1[62] = bf0[62] + bf0[57];
7146     bf1[63] = bf0[63] + bf0[56];
7147 
7148     // stage 6
7149     cospi   = cospi_arr(cos_bit);
7150     bf0     = output;
7151     bf1     = step;
7152     bf1[0]  = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
7153     bf1[4]  = bf0[4] + bf0[5];
7154     bf1[7]  = bf0[7] + bf0[6];
7155     bf1[8]  = bf0[8];
7156     bf1[9]  = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
7157     bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
7158     bf1[11] = bf0[11];
7159     bf1[12] = bf0[12];
7160     bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
7161     bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
7162     bf1[15] = bf0[15];
7163     bf1[16] = bf0[16] + bf0[19];
7164     bf1[17] = bf0[17] + bf0[18];
7165     bf1[18] = -bf0[18] + bf0[17];
7166     bf1[19] = -bf0[19] + bf0[16];
7167     bf1[20] = -bf0[20] + bf0[23];
7168     bf1[21] = -bf0[21] + bf0[22];
7169     bf1[22] = bf0[22] + bf0[21];
7170     bf1[23] = bf0[23] + bf0[20];
7171     bf1[24] = bf0[24] + bf0[27];
7172     bf1[25] = bf0[25] + bf0[26];
7173     bf1[26] = -bf0[26] + bf0[25];
7174     bf1[27] = -bf0[27] + bf0[24];
7175     bf1[28] = -bf0[28] + bf0[31];
7176     bf1[29] = -bf0[29] + bf0[30];
7177     bf1[30] = bf0[30] + bf0[29];
7178     bf1[31] = bf0[31] + bf0[28];
7179     bf1[32] = bf0[32];
7180     bf1[33] = bf0[33];
7181     bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit);
7182     bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit);
7183     bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit);
7184     bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit);
7185     bf1[38] = bf0[38];
7186     bf1[39] = bf0[39];
7187     bf1[40] = bf0[40];
7188     bf1[41] = bf0[41];
7189     bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit);
7190     bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit);
7191     bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit);
7192     bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit);
7193     bf1[46] = bf0[46];
7194     bf1[47] = bf0[47];
7195     bf1[48] = bf0[48];
7196     bf1[49] = bf0[49];
7197     bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit);
7198     bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit);
7199     bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit);
7200     bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit);
7201     bf1[54] = bf0[54];
7202     bf1[55] = bf0[55];
7203     bf1[56] = bf0[56];
7204     bf1[57] = bf0[57];
7205     bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit);
7206     bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit);
7207     bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit);
7208     bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit);
7209     bf1[62] = bf0[62];
7210     bf1[63] = bf0[63];
7211 
7212     // stage 7
7213     cospi   = cospi_arr(cos_bit);
7214     bf0     = step;
7215     bf1     = output;
7216     bf1[0]  = bf0[0];
7217     bf1[4]  = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
7218     bf1[8]  = bf0[8] + bf0[9];
7219     bf1[11] = bf0[11] + bf0[10];
7220     bf1[12] = bf0[12] + bf0[13];
7221     bf1[15] = bf0[15] + bf0[14];
7222     bf1[16] = bf0[16];
7223     bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
7224     bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
7225     bf1[19] = bf0[19];
7226     bf1[20] = bf0[20];
7227     bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
7228     bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
7229     bf1[23] = bf0[23];
7230     bf1[24] = bf0[24];
7231     bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
7232     bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
7233     bf1[27] = bf0[27];
7234     bf1[28] = bf0[28];
7235     bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
7236     bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
7237     bf1[31] = bf0[31];
7238     bf1[32] = bf0[32] + bf0[35];
7239     bf1[33] = bf0[33] + bf0[34];
7240     bf1[34] = -bf0[34] + bf0[33];
7241     bf1[35] = -bf0[35] + bf0[32];
7242     bf1[36] = -bf0[36] + bf0[39];
7243     bf1[37] = -bf0[37] + bf0[38];
7244     bf1[38] = bf0[38] + bf0[37];
7245     bf1[39] = bf0[39] + bf0[36];
7246     bf1[40] = bf0[40] + bf0[43];
7247     bf1[41] = bf0[41] + bf0[42];
7248     bf1[42] = -bf0[42] + bf0[41];
7249     bf1[43] = -bf0[43] + bf0[40];
7250     bf1[44] = -bf0[44] + bf0[47];
7251     bf1[45] = -bf0[45] + bf0[46];
7252     bf1[46] = bf0[46] + bf0[45];
7253     bf1[47] = bf0[47] + bf0[44];
7254     bf1[48] = bf0[48] + bf0[51];
7255     bf1[49] = bf0[49] + bf0[50];
7256     bf1[50] = -bf0[50] + bf0[49];
7257     bf1[51] = -bf0[51] + bf0[48];
7258     bf1[52] = -bf0[52] + bf0[55];
7259     bf1[53] = -bf0[53] + bf0[54];
7260     bf1[54] = bf0[54] + bf0[53];
7261     bf1[55] = bf0[55] + bf0[52];
7262     bf1[56] = bf0[56] + bf0[59];
7263     bf1[57] = bf0[57] + bf0[58];
7264     bf1[58] = -bf0[58] + bf0[57];
7265     bf1[59] = -bf0[59] + bf0[56];
7266     bf1[60] = -bf0[60] + bf0[63];
7267     bf1[61] = -bf0[61] + bf0[62];
7268     bf1[62] = bf0[62] + bf0[61];
7269     bf1[63] = bf0[63] + bf0[60];
7270 
7271     // stage 8
7272     cospi   = cospi_arr(cos_bit);
7273     bf0     = output;
7274     bf1     = step;
7275     bf1[0]  = bf0[0];
7276     bf1[4]  = bf0[4];
7277     bf1[8]  = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
7278     bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
7279     bf1[16] = bf0[16] + bf0[17];
7280     bf1[19] = bf0[19] + bf0[18];
7281     bf1[20] = bf0[20] + bf0[21];
7282     bf1[23] = bf0[23] + bf0[22];
7283     bf1[24] = bf0[24] + bf0[25];
7284     bf1[27] = bf0[27] + bf0[26];
7285     bf1[28] = bf0[28] + bf0[29];
7286     bf1[31] = bf0[31] + bf0[30];
7287     bf1[32] = bf0[32];
7288     bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit);
7289     bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit);
7290     bf1[35] = bf0[35];
7291     bf1[36] = bf0[36];
7292     bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit);
7293     bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit);
7294     bf1[39] = bf0[39];
7295     bf1[40] = bf0[40];
7296     bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit);
7297     bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit);
7298     bf1[43] = bf0[43];
7299     bf1[44] = bf0[44];
7300     bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit);
7301     bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit);
7302     bf1[47] = bf0[47];
7303     bf1[48] = bf0[48];
7304     bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit);
7305     bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit);
7306     bf1[51] = bf0[51];
7307     bf1[52] = bf0[52];
7308     bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit);
7309     bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit);
7310     bf1[55] = bf0[55];
7311     bf1[56] = bf0[56];
7312     bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit);
7313     bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit);
7314     bf1[59] = bf0[59];
7315     bf1[60] = bf0[60];
7316     bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit);
7317     bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit);
7318     bf1[63] = bf0[63];
7319 
7320     // stage 9
7321     cospi   = cospi_arr(cos_bit);
7322     bf0     = step;
7323     bf1     = output;
7324     bf1[0]  = bf0[0];
7325     bf1[4]  = bf0[4];
7326     bf1[8]  = bf0[8];
7327     bf1[12] = bf0[12];
7328     bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
7329     bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
7330     bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
7331     bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
7332     bf1[32] = bf0[32] + bf0[33];
7333     bf1[35] = bf0[35] + bf0[34];
7334     bf1[36] = bf0[36] + bf0[37];
7335     bf1[39] = bf0[39] + bf0[38];
7336     bf1[40] = bf0[40] + bf0[41];
7337     bf1[43] = bf0[43] + bf0[42];
7338     bf1[44] = bf0[44] + bf0[45];
7339     bf1[47] = bf0[47] + bf0[46];
7340     bf1[48] = bf0[48] + bf0[49];
7341     bf1[51] = bf0[51] + bf0[50];
7342     bf1[52] = bf0[52] + bf0[53];
7343     bf1[55] = bf0[55] + bf0[54];
7344     bf1[56] = bf0[56] + bf0[57];
7345     bf1[59] = bf0[59] + bf0[58];
7346     bf1[60] = bf0[60] + bf0[61];
7347     bf1[63] = bf0[63] + bf0[62];
7348 
7349     // stage 10
7350     cospi   = cospi_arr(cos_bit);
7351     bf0     = output;
7352     bf1     = step;
7353     bf1[0]  = bf0[0];
7354     bf1[4]  = bf0[4];
7355     bf1[8]  = bf0[8];
7356     bf1[12] = bf0[12];
7357     bf1[16] = bf0[16];
7358     bf1[20] = bf0[20];
7359     bf1[24] = bf0[24];
7360     bf1[28] = bf0[28];
7361     bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit);
7362     bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit);
7363     bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit);
7364     bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit);
7365     bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit);
7366     bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit);
7367     bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit);
7368     bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit);
7369 
7370     // stage 11
7371     bf0     = step;
7372     bf1     = output;
7373     bf1[0]  = bf0[0];
7374     bf1[1]  = bf0[32];
7375     bf1[2]  = bf0[16];
7376     bf1[3]  = bf0[48];
7377     bf1[4]  = bf0[8];
7378     bf1[5]  = bf0[40];
7379     bf1[6]  = bf0[24];
7380     bf1[7]  = bf0[56];
7381     bf1[8]  = bf0[4];
7382     bf1[9]  = bf0[36];
7383     bf1[10] = bf0[20];
7384     bf1[11] = bf0[52];
7385     bf1[12] = bf0[12];
7386     bf1[13] = bf0[44];
7387     bf1[14] = bf0[28];
7388     bf1[15] = bf0[60];
7389 }
7390 
av1_fidentity64_N4_c(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)7391 void av1_fidentity64_N4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
7392                           const int8_t *stage_range) {
7393     (void)stage_range;
7394     (void)cos_bit;
7395     for (int32_t i = 0; i < 16; ++i)
7396         output[i] = round_shift((int64_t)input[i] * 4 * new_sqrt2, new_sqrt2_bits);
7397     assert(stage_range[0] + new_sqrt2_bits <= 32);
7398 }
7399 
fwd_txfm_type_to_func_N4(TxfmType txfmtype)7400 static INLINE TxfmFunc fwd_txfm_type_to_func_N4(TxfmType txfmtype) {
7401     switch (txfmtype) {
7402     case TXFM_TYPE_DCT4: return svt_av1_fdct4_new_N4;
7403     case TXFM_TYPE_DCT8: return svt_av1_fdct8_new_N4;
7404     case TXFM_TYPE_DCT16: return svt_av1_fdct16_new_N4;
7405     case TXFM_TYPE_DCT32: return svt_av1_fdct32_new_N4;
7406     case TXFM_TYPE_DCT64: return svt_av1_fdct64_new_N4;
7407     case TXFM_TYPE_ADST4: return svt_av1_fadst4_new_N4;
7408     case TXFM_TYPE_ADST8: return svt_av1_fadst8_new_N4;
7409     case TXFM_TYPE_ADST16: return svt_av1_fadst16_new_N4;
7410     case TXFM_TYPE_ADST32: return av1_fadst32_new;
7411     case TXFM_TYPE_IDENTITY4: return svt_av1_fidentity4_N4_c;
7412     case TXFM_TYPE_IDENTITY8: return svt_av1_fidentity8_N4_c;
7413     case TXFM_TYPE_IDENTITY16: return svt_av1_fidentity16_N4_c;
7414     case TXFM_TYPE_IDENTITY32: return svt_av1_fidentity32_N4_c;
7415     case TXFM_TYPE_IDENTITY64: return av1_fidentity64_N4_c;
7416     default: assert(0); return NULL;
7417     }
7418 }
7419 
av1_tranform_two_d_core_N4_c(int16_t * input,uint32_t input_stride,int32_t * output,const Txfm2dFlipCfg * cfg,int32_t * buf,uint8_t bit_depth)7420 static INLINE void av1_tranform_two_d_core_N4_c(int16_t *input, uint32_t input_stride,
7421                                                 int32_t *output, const Txfm2dFlipCfg *cfg,
7422                                                 int32_t *buf, uint8_t bit_depth) {
7423     int32_t c, r;
7424     // Note when assigning txfm_size_col, we use the txfm_size from the
7425     // row configuration and vice versa. This is intentionally done to
7426     // accurately perform rectangular transforms. When the transform is
7427     // rectangular, the number of columns will be the same as the
7428     // txfm_size stored in the row cfg struct. It will make no difference
7429     // for square transforms.
7430     const int32_t txfm_size_col = tx_size_wide[cfg->tx_size];
7431     const int32_t txfm_size_row = tx_size_high[cfg->tx_size];
7432     // Take the shift from the larger dimension in the rectangular case.
7433     const int8_t *shift     = cfg->shift;
7434     const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
7435     int8_t        stage_range_col[MAX_TXFM_STAGE_NUM];
7436     int8_t        stage_range_row[MAX_TXFM_STAGE_NUM];
7437     assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM);
7438     assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM);
7439     svt_av1_gen_fwd_stage_range(stage_range_col, stage_range_row, cfg, bit_depth);
7440 
7441     const int8_t   cos_bit_col   = cfg->cos_bit_col;
7442     const int8_t   cos_bit_row   = cfg->cos_bit_row;
7443     const TxfmFunc txfm_func_col = fwd_txfm_type_to_func_N4(cfg->txfm_type_col);
7444     const TxfmFunc txfm_func_row = fwd_txfm_type_to_func_N4(cfg->txfm_type_row);
7445     ASSERT(txfm_func_col != NULL);
7446     ASSERT(txfm_func_row != NULL);
7447     // use output buffer as temp buffer
7448     int32_t *temp_in  = output;
7449     int32_t *temp_out = output + txfm_size_row;
7450 
7451     // Columns
7452     for (c = 0; c < txfm_size_col; ++c) {
7453         if (cfg->ud_flip == 0)
7454             for (r = 0; r < txfm_size_row; ++r) temp_in[r] = input[r * input_stride + c];
7455         else {
7456             for (r = 0; r < txfm_size_row; ++r)
7457                 // flip upside down
7458                 temp_in[r] = input[(txfm_size_row - r - 1) * input_stride + c];
7459         }
7460         svt_av1_round_shift_array_c(
7461             temp_in, txfm_size_row, -shift[0]); // NM svt_av1_round_shift_array_c
7462         txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
7463         svt_av1_round_shift_array_c(
7464             temp_out, txfm_size_row / 4, -shift[1]); // NM svt_av1_round_shift_array_c
7465         if (cfg->lr_flip == 0) {
7466             for (r = 0; r < txfm_size_row; ++r) buf[r * txfm_size_col + c] = temp_out[r];
7467         } else {
7468             for (r = 0; r < txfm_size_row; ++r)
7469                 // flip from left to right
7470                 buf[r * txfm_size_col + (txfm_size_col - c - 1)] = temp_out[r];
7471         }
7472     }
7473 
7474     // Rows
7475     for (r = 0; r < txfm_size_row / 4; ++r) {
7476         txfm_func_row(
7477             buf + r * txfm_size_col, output + r * txfm_size_col, cos_bit_row, stage_range_row);
7478         svt_av1_round_shift_array_c(output + r * txfm_size_col, txfm_size_col / 4, -shift[2]);
7479 
7480         if (abs(rect_type) == 1) {
7481             // Multiply everything by Sqrt2 if the transform is rectangular and the
7482             // size difference is a factor of 2.
7483             for (c = 0; c < txfm_size_col / 4; ++c) {
7484                 output[r * txfm_size_col + c] = round_shift(
7485                     (int64_t)output[r * txfm_size_col + c] * new_sqrt2, new_sqrt2_bits);
7486             }
7487         }
7488     }
7489     for (int i = 0; i < (txfm_size_col * txfm_size_row); i++)
7490         if (i % txfm_size_col >= (txfm_size_col >> 2) || i / txfm_size_col >= (txfm_size_row >> 2))
7491             output[i] = 0;
7492 }
7493 
/*
 * N4 forward 2-D transform entry point for TX_64X64.
 * Has av1_transform_config() fill a Txfm2dFlipCfg for transform_type, then
 * hands off to the shared N4 core with a 64 * 64 int32_t scratch array that
 * lives on this function's stack.
 */
void av1_transform_two_d_64x64_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                    TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[64 * 64];

    av1_transform_config(transform_type, TX_64X64, &flip_cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
7502 
/*
 * N4 forward 2-D transform entry point for TX_32X32.
 * Has av1_transform_config() fill a Txfm2dFlipCfg for transform_type, then
 * hands off to the shared N4 core with a 32 * 32 int32_t scratch array that
 * lives on this function's stack.
 */
void av1_transform_two_d_32x32_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                    TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[32 * 32];

    av1_transform_config(transform_type, TX_32X32, &flip_cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
7511 
/*
 * N4 forward 2-D transform entry point for TX_16X16.
 * Has av1_transform_config() fill a Txfm2dFlipCfg for transform_type, then
 * hands off to the shared N4 core with a 16 * 16 int32_t scratch array that
 * lives on this function's stack.
 */
void av1_transform_two_d_16x16_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                    TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[16 * 16];

    av1_transform_config(transform_type, TX_16X16, &flip_cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
7520 
/*
 * N4 forward 2-D transform entry point for TX_8X8.
 * Has av1_transform_config() fill a Txfm2dFlipCfg for transform_type, then
 * hands off to the shared N4 core with an 8 * 8 int32_t scratch array that
 * lives on this function's stack.
 */
void av1_transform_two_d_8x8_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                  TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[8 * 8];

    av1_transform_config(transform_type, TX_8X8, &flip_cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
7529 
/*
 * N4 forward 2-D transform entry point for TX_4X4.
 * Has av1_transform_config() fill a Txfm2dFlipCfg for transform_type, then
 * hands off to the shared N4 core with a 4 * 4 int32_t scratch array that
 * lives on this function's stack.
 */
void av1_transform_two_d_4x4_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                  TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[4 * 4];

    av1_transform_config(transform_type, TX_4X4, &flip_cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
7538 
/*
 * N4 forward 2-D transform entry point for TX_64X32.
 * Has av1_transform_config() fill a Txfm2dFlipCfg for transform_type, then
 * hands off to the shared N4 core with a 64 * 32 int32_t scratch array that
 * lives on this function's stack.
 */
void svt_av1_fwd_txfm2d_64x32_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                   TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[64 * 32];

    av1_transform_config(transform_type, TX_64X32, &flip_cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
7547 
/*
 * N4 forward 2-D transform entry point for TX_32X64.
 * Has av1_transform_config() fill a Txfm2dFlipCfg for transform_type, then
 * hands off to the shared N4 core with a 32 * 64 int32_t scratch array that
 * lives on this function's stack.
 */
void svt_av1_fwd_txfm2d_32x64_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                   TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[32 * 64];

    av1_transform_config(transform_type, TX_32X64, &flip_cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
7556 
/*
 * N4 forward 2-D transform entry point for TX_64X16.
 * Has av1_transform_config() fill a Txfm2dFlipCfg for transform_type, then
 * hands off to the shared N4 core with a 64 * 16 int32_t scratch array that
 * lives on this function's stack.
 */
void svt_av1_fwd_txfm2d_64x16_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                   TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[64 * 16];

    av1_transform_config(transform_type, TX_64X16, &flip_cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
7565 
/*
 * N4 forward 2-D transform entry point for TX_16X64.
 * Has av1_transform_config() fill a Txfm2dFlipCfg for transform_type, then
 * hands off to the shared N4 core with a 16 * 64 int32_t scratch array that
 * lives on this function's stack.
 */
void svt_av1_fwd_txfm2d_16x64_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                   TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[16 * 64];

    av1_transform_config(transform_type, TX_16X64, &flip_cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
7574 
/*
 * N4 forward 2-D transform entry point for TX_32X16.
 * Has av1_transform_config() fill a Txfm2dFlipCfg for transform_type, then
 * hands off to the shared N4 core with a 32 * 16 int32_t scratch array that
 * lives on this function's stack.
 */
void svt_av1_fwd_txfm2d_32x16_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                   TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[32 * 16];

    av1_transform_config(transform_type, TX_32X16, &flip_cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
7583 
/*
 * N4 forward 2-D transform entry point for TX_16X32.
 * Has av1_transform_config() fill a Txfm2dFlipCfg for transform_type, then
 * hands off to the shared N4 core with a 16 * 32 int32_t scratch array that
 * lives on this function's stack.
 */
void svt_av1_fwd_txfm2d_16x32_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                   TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[16 * 32];

    av1_transform_config(transform_type, TX_16X32, &flip_cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
7592 
/*
 * N4 forward 2-D transform entry point for TX_16X8.
 * Has av1_transform_config() fill a Txfm2dFlipCfg for transform_type, then
 * hands off to the shared N4 core with a 16 * 8 int32_t scratch array that
 * lives on this function's stack.
 */
void svt_av1_fwd_txfm2d_16x8_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                  TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[16 * 8];

    av1_transform_config(transform_type, TX_16X8, &flip_cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
7601 
/*
 * N4 forward 2-D transform entry point for TX_8X16.
 * Has av1_transform_config() fill a Txfm2dFlipCfg for transform_type, then
 * hands off to the shared N4 core with an 8 * 16 int32_t scratch array that
 * lives on this function's stack.
 */
void svt_av1_fwd_txfm2d_8x16_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                  TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[8 * 16];

    av1_transform_config(transform_type, TX_8X16, &flip_cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
7610 
/*
 * N4 forward 2-D transform entry point for TX_32X8.
 * Has av1_transform_config() fill a Txfm2dFlipCfg for transform_type, then
 * hands off to the shared N4 core with a 32 * 8 int32_t scratch array that
 * lives on this function's stack.
 */
void svt_av1_fwd_txfm2d_32x8_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                  TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[32 * 8];

    av1_transform_config(transform_type, TX_32X8, &flip_cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
7619 
/*
 * N4 forward 2-D transform entry point for TX_8X32.
 * Has av1_transform_config() fill a Txfm2dFlipCfg for transform_type, then
 * hands off to the shared N4 core with an 8 * 32 int32_t scratch array that
 * lives on this function's stack.
 */
void svt_av1_fwd_txfm2d_8x32_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                  TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[8 * 32];

    av1_transform_config(transform_type, TX_8X32, &flip_cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
7628 
/*
 * N4 forward 2-D transform entry point for TX_16X4.
 * Has av1_transform_config() fill a Txfm2dFlipCfg for transform_type, then
 * hands off to the shared N4 core with a 16 * 4 int32_t scratch array that
 * lives on this function's stack.
 */
void svt_av1_fwd_txfm2d_16x4_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                  TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[16 * 4];

    av1_transform_config(transform_type, TX_16X4, &flip_cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
7637 
/*
 * N4 forward 2-D transform entry point for TX_4X16.
 * Has av1_transform_config() fill a Txfm2dFlipCfg for transform_type, then
 * hands off to the shared N4 core with a 4 * 16 int32_t scratch array that
 * lives on this function's stack.
 */
void svt_av1_fwd_txfm2d_4x16_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                  TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[4 * 16];

    av1_transform_config(transform_type, TX_4X16, &flip_cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
7646 
/*
 * N4 forward 2-D transform entry point for TX_8X4.
 * Has av1_transform_config() fill a Txfm2dFlipCfg for transform_type, then
 * hands off to the shared N4 core with an 8 * 4 int32_t scratch array that
 * lives on this function's stack.
 */
void svt_av1_fwd_txfm2d_8x4_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                 TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[8 * 4];

    av1_transform_config(transform_type, TX_8X4, &flip_cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
7655 
/*
 * N4 forward 2-D transform entry point for TX_4X8.
 * Has av1_transform_config() fill a Txfm2dFlipCfg for transform_type, then
 * hands off to the shared N4 core with a 4 * 8 int32_t scratch array that
 * lives on this function's stack.
 */
void svt_av1_fwd_txfm2d_4x8_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                 TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg flip_cfg;
    int32_t       scratch[4 * 8];

    av1_transform_config(transform_type, TX_4X8, &flip_cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &flip_cfg, scratch, bit_depth);
}
7664 
7665