1 /*
2 * Copyright(c) 2019 Intel Corporation
3 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
4 *
5 * This source code is subject to the terms of the BSD 2 Clause License and
6 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
7 * was not distributed with this source code in the LICENSE file, you can
8 * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
9 * Media Patent License 1.0 was not distributed with this source code in the
10 * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
11 */
12
13 #include <stdlib.h>
14 #include "EbTransforms.h"
15 #include "aom_dsp_rtcd.h"
16
// Per-transform-type table of stage range-multiplier arrays for the forward
// transforms. Indexed by transform type; the entry order (fdct 4..64,
// fadst 4..32, fidtx 4..64) presumably mirrors the TXFM_TYPES enum — confirm
// against its definition in the headers included above. The pointed-to
// arrays themselves are declared elsewhere (not visible in this file chunk).
static const int8_t *fwd_txfm_range_mult2_list[TXFM_TYPES] = {fdct4_range_mult2,
                                                              fdct8_range_mult2,
                                                              fdct16_range_mult2,
                                                              fdct32_range_mult2,
                                                              fdct64_range_mult2,
                                                              fadst4_range_mult2,
                                                              fadst8_range_mult2,
                                                              fadst16_range_mult2,
                                                              fadst32_range_mult2,
                                                              fidtx4_range_mult2,
                                                              fidtx8_range_mult2,
                                                              fidtx16_range_mult2,
                                                              fidtx32_range_mult2,
                                                              fidtx64_range_mult2};
31
// Forward-transform shift arrays, one entry per transform size
// (TX_SIZES_ALL): the five square sizes first, then the rectangular sizes.
// NOTE(review): entry order must match the TxSize enumeration — confirm
// against its definition; the shift arrays themselves are declared in the
// included headers.
static const int8_t *fwd_txfm_shift_ls[TX_SIZES_ALL] = {
    fwd_shift_4x4, fwd_shift_8x8, fwd_shift_16x16, fwd_shift_32x32, fwd_shift_64x64,
    fwd_shift_4x8, fwd_shift_8x4, fwd_shift_8x16, fwd_shift_16x8, fwd_shift_16x32,
    fwd_shift_32x16, fwd_shift_32x64, fwd_shift_64x32, fwd_shift_4x16, fwd_shift_16x4,
    fwd_shift_8x32, fwd_shift_32x8, fwd_shift_16x64, fwd_shift_64x16,
};
38
/*****************************
 * Defines
 *****************************/

// NOTE(review): BETA_* and ALPHA_* are not referenced anywhere in this
// portion of the file — they look like legacy filter-threshold constants.
// Confirm whether they are used further down before removing.
#define BETA_P 1
#define BETA_N 3

/********************************************
 * Constants
 ********************************************/

#define ALPHA_0000 0
#define ALPHA_0050 50

#define ALPHA_0100 100
#define ALPHA_0200 200
#define ALPHA_0300 300
#define ALPHA_0500 500
#define ALPHA_1000 1000
58
/*
 * Fill the per-stage bit-range arrays for a 2-D forward transform.
 *
 * stage_range_col/stage_range_row: outputs, one int8_t per stage.
 * cfg: transform configuration supplying the base per-stage ranges, the
 *      stage counts, and the shift[] pair applied before each pass.
 * bd:  bit depth of the input samples.
 *
 * Columns are transformed first, so they see only shift[0]; rows see both
 * shift[0] and shift[1]. The `< MAX_TXFM_STAGE_NUM` bound also silences
 * compiler warnings about indexing past the fixed-size stage arrays.
 */
void svt_av1_gen_fwd_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
                                 const Txfm2dFlipCfg *cfg, int32_t bd) {
    const int8_t *shift = cfg->shift;
    // Hoist the loop-invariant bias for each pass.
    const int32_t col_bias = shift[0] + bd + 1;
    const int32_t row_bias = shift[0] + shift[1] + bd + 1;
    for (int32_t s = 0; s < cfg->stage_num_col && s < MAX_TXFM_STAGE_NUM; ++s)
        stage_range_col[s] = (int8_t)(cfg->stage_range_col[s] + col_bias);
    for (int32_t s = 0; s < cfg->stage_num_row && s < MAX_TXFM_STAGE_NUM; ++s)
        stage_range_row[s] = (int8_t)(cfg->stage_range_row[s] + row_bias);
}
70
// range_check is intentionally a no-op stub (empty do/while(0)): the debug
// range-validation it once performed has been compiled out, which is why the
// fdct* functions below cast their stage_range parameter to (void).
#define range_check(stage, input, buf, size, bit) \
    do {                                          \
    } while (0)
74
/*
 * Forward 4-point DCT (AV1 integer butterfly network).
 *
 * input:  4 coefficients; output: 4 transformed coefficients.
 * cos_bit selects the cosine-table precision used by half_btf().
 * stage_range is unused because range_check() is compiled out.
 */
void svt_av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                       const int8_t *stage_range) {
    (void)stage_range;
    const int32_t *cospi = cospi_arr(cos_bit);
    int32_t buf[4];

    // stage 1: butterfly — sums into buf[0..1], differences into buf[2..3].
    // Reading all of input[] before touching output[] also makes this step
    // alias-safe, matching the original's net effect.
    buf[0] = input[0] + input[3];
    buf[1] = input[1] + input[2];
    buf[2] = input[1] - input[2];
    buf[3] = input[0] - input[3];

    // stages 2+3 fused: apply the rotations and write each result directly
    // to its final (permuted) position instead of permuting afterwards.
    output[0] = half_btf(cospi[32], buf[0], cospi[32], buf[1], cos_bit);
    output[2] = half_btf(-cospi[32], buf[1], cospi[32], buf[0], cos_bit);
    output[1] = half_btf(cospi[48], buf[2], cospi[16], buf[3], cos_bit);
    output[3] = half_btf(cospi[48], buf[3], -cospi[16], buf[2], cos_bit);
}
109
/*
 * Forward 8-point DCT (AV1 integer butterfly network).
 *
 * input:  8 coefficients; output: 8 transformed coefficients. `output` is
 * also used as scratch — stages ping-pong between it and the local `step`
 * buffer. cos_bit selects the cosine-table precision via cospi_arr(), and
 * half_btf() performs the rounded butterfly rotation at that precision.
 * stage_range is unused because range_check() is compiled out.
 *
 * NOTE(review): stage 1 writes output[] while still reading input[], so
 * input and output must not alias — confirm callers never pass the same
 * buffer.
 */
void svt_av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                       const int8_t *stage_range) {
    (void)stage_range;
    const int32_t *cospi;

    int32_t *bf0, *bf1; // per-stage source / destination buffer pointers
    int32_t step[8];    // scratch half of the ping-pong pair

    // stage 0; (no-op: input feeds stage 1 directly)

    // stage 1: butterfly — sums into [0..3], differences into [4..7]
    bf1 = output;
    bf1[0] = input[0] + input[7];
    bf1[1] = input[1] + input[6];
    bf1[2] = input[2] + input[5];
    bf1[3] = input[3] + input[4];
    bf1[4] = -input[4] + input[3];
    bf1[5] = -input[5] + input[2];
    bf1[6] = -input[6] + input[1];
    bf1[7] = -input[7] + input[0];

    // stage 2: 4-point butterfly on the even half; rotation on [5],[6]
    cospi = cospi_arr(cos_bit);
    bf0 = output;
    bf1 = step;
    bf1[0] = bf0[0] + bf0[3];
    bf1[1] = bf0[1] + bf0[2];
    bf1[2] = -bf0[2] + bf0[1];
    bf1[3] = -bf0[3] + bf0[0];
    bf1[4] = bf0[4];
    bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
    bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
    bf1[7] = bf0[7];

    // stage 3: final rotations on [0..3]; butterfly on the odd half
    cospi = cospi_arr(cos_bit);
    bf0 = step;
    bf1 = output;
    bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
    bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
    bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
    bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
    bf1[4] = bf0[4] + bf0[5];
    bf1[5] = -bf0[5] + bf0[4];
    bf1[6] = -bf0[6] + bf0[7];
    bf1[7] = bf0[7] + bf0[6];

    // stage 4: rotations producing the odd-index outputs
    cospi = cospi_arr(cos_bit);
    bf0 = output;
    bf1 = step;
    bf1[0] = bf0[0];
    bf1[1] = bf0[1];
    bf1[2] = bf0[2];
    bf1[3] = bf0[3];
    bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
    bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
    bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
    bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);

    // stage 5: bit-reversal permutation into natural DCT output order
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0];
    bf1[1] = bf0[4];
    bf1[2] = bf0[2];
    bf1[3] = bf0[6];
    bf1[4] = bf0[1];
    bf1[5] = bf0[5];
    bf1[6] = bf0[3];
    bf1[7] = bf0[7];
}
182
/*
 * Forward 16-point DCT (AV1 integer butterfly network).
 *
 * input:  16 coefficients; output: 16 transformed coefficients. `output`
 * doubles as scratch — stages ping-pong between it and the local `step`
 * buffer. cos_bit selects the cosine-table precision via cospi_arr();
 * half_btf() performs the rounded butterfly rotation at that precision.
 * stage_range is unused because range_check() is compiled out.
 *
 * NOTE(review): stage 1 writes output[] while still reading input[], so
 * input and output must not alias — confirm callers never pass the same
 * buffer.
 */
void svt_av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                        const int8_t *stage_range) {
    (void)stage_range;
    const int32_t *cospi;

    int32_t *bf0, *bf1; // per-stage source / destination buffer pointers
    int32_t step[16];   // scratch half of the ping-pong pair

    // stage 0; (no-op: input feeds stage 1 directly)

    // stage 1: butterfly — sums into [0..7], differences into [8..15]
    bf1 = output;
    bf1[0] = input[0] + input[15];
    bf1[1] = input[1] + input[14];
    bf1[2] = input[2] + input[13];
    bf1[3] = input[3] + input[12];
    bf1[4] = input[4] + input[11];
    bf1[5] = input[5] + input[10];
    bf1[6] = input[6] + input[9];
    bf1[7] = input[7] + input[8];
    bf1[8] = -input[8] + input[7];
    bf1[9] = -input[9] + input[6];
    bf1[10] = -input[10] + input[5];
    bf1[11] = -input[11] + input[4];
    bf1[12] = -input[12] + input[3];
    bf1[13] = -input[13] + input[2];
    bf1[14] = -input[14] + input[1];
    bf1[15] = -input[15] + input[0];

    // stage 2: 8-point butterfly on the even half; rotations on [10..13]
    cospi = cospi_arr(cos_bit);
    bf0 = output;
    bf1 = step;
    bf1[0] = bf0[0] + bf0[7];
    bf1[1] = bf0[1] + bf0[6];
    bf1[2] = bf0[2] + bf0[5];
    bf1[3] = bf0[3] + bf0[4];
    bf1[4] = -bf0[4] + bf0[3];
    bf1[5] = -bf0[5] + bf0[2];
    bf1[6] = -bf0[6] + bf0[1];
    bf1[7] = -bf0[7] + bf0[0];
    bf1[8] = bf0[8];
    bf1[9] = bf0[9];
    bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
    bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
    bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
    bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
    bf1[14] = bf0[14];
    bf1[15] = bf0[15];

    // stage 3
    cospi = cospi_arr(cos_bit);
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0] + bf0[3];
    bf1[1] = bf0[1] + bf0[2];
    bf1[2] = -bf0[2] + bf0[1];
    bf1[3] = -bf0[3] + bf0[0];
    bf1[4] = bf0[4];
    bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
    bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
    bf1[7] = bf0[7];
    bf1[8] = bf0[8] + bf0[11];
    bf1[9] = bf0[9] + bf0[10];
    bf1[10] = -bf0[10] + bf0[9];
    bf1[11] = -bf0[11] + bf0[8];
    bf1[12] = -bf0[12] + bf0[15];
    bf1[13] = -bf0[13] + bf0[14];
    bf1[14] = bf0[14] + bf0[13];
    bf1[15] = bf0[15] + bf0[12];

    // stage 4
    cospi = cospi_arr(cos_bit);
    bf0 = output;
    bf1 = step;
    bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
    bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
    bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
    bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
    bf1[4] = bf0[4] + bf0[5];
    bf1[5] = -bf0[5] + bf0[4];
    bf1[6] = -bf0[6] + bf0[7];
    bf1[7] = bf0[7] + bf0[6];
    bf1[8] = bf0[8];
    bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
    bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
    bf1[11] = bf0[11];
    bf1[12] = bf0[12];
    bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
    bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
    bf1[15] = bf0[15];

    // stage 5
    cospi = cospi_arr(cos_bit);
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0];
    bf1[1] = bf0[1];
    bf1[2] = bf0[2];
    bf1[3] = bf0[3];
    bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
    bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
    bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
    bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
    bf1[8] = bf0[8] + bf0[9];
    bf1[9] = -bf0[9] + bf0[8];
    bf1[10] = -bf0[10] + bf0[11];
    bf1[11] = bf0[11] + bf0[10];
    bf1[12] = bf0[12] + bf0[13];
    bf1[13] = -bf0[13] + bf0[12];
    bf1[14] = -bf0[14] + bf0[15];
    bf1[15] = bf0[15] + bf0[14];

    // stage 6: rotations producing the odd-index outputs
    cospi = cospi_arr(cos_bit);
    bf0 = output;
    bf1 = step;
    bf1[0] = bf0[0];
    bf1[1] = bf0[1];
    bf1[2] = bf0[2];
    bf1[3] = bf0[3];
    bf1[4] = bf0[4];
    bf1[5] = bf0[5];
    bf1[6] = bf0[6];
    bf1[7] = bf0[7];
    bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
    bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
    bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
    bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
    bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
    bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
    bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
    bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);

    // stage 7: bit-reversal permutation into natural DCT output order
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0];
    bf1[1] = bf0[8];
    bf1[2] = bf0[4];
    bf1[3] = bf0[12];
    bf1[4] = bf0[2];
    bf1[5] = bf0[10];
    bf1[6] = bf0[6];
    bf1[7] = bf0[14];
    bf1[8] = bf0[1];
    bf1[9] = bf0[9];
    bf1[10] = bf0[5];
    bf1[11] = bf0[13];
    bf1[12] = bf0[3];
    bf1[13] = bf0[11];
    bf1[14] = bf0[7];
    bf1[15] = bf0[15];
}
337
/*
 * Forward 32-point DCT (AV1 integer butterfly network).
 *
 * input:  32 coefficients; output: 32 transformed coefficients. `output`
 * doubles as scratch — stages ping-pong between it and the local `step`
 * buffer. cos_bit selects the cosine-table precision via cospi_arr();
 * half_btf() performs the rounded butterfly rotation at that precision.
 * stage_range is unused because range_check() is compiled out.
 *
 * NOTE(review): stage 1 writes output[] while still reading input[], so
 * input and output must not alias — confirm callers never pass the same
 * buffer.
 */
void svt_av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                        const int8_t *stage_range) {
    (void)stage_range;
    const int32_t *cospi;

    int32_t *bf0, *bf1; // per-stage source / destination buffer pointers
    int32_t step[32];   // scratch half of the ping-pong pair

    // stage 0; (no-op: input feeds stage 1 directly)

    // stage 1: butterfly — sums into [0..15], differences into [16..31]
    bf1 = output;
    bf1[0] = input[0] + input[31];
    bf1[1] = input[1] + input[30];
    bf1[2] = input[2] + input[29];
    bf1[3] = input[3] + input[28];
    bf1[4] = input[4] + input[27];
    bf1[5] = input[5] + input[26];
    bf1[6] = input[6] + input[25];
    bf1[7] = input[7] + input[24];
    bf1[8] = input[8] + input[23];
    bf1[9] = input[9] + input[22];
    bf1[10] = input[10] + input[21];
    bf1[11] = input[11] + input[20];
    bf1[12] = input[12] + input[19];
    bf1[13] = input[13] + input[18];
    bf1[14] = input[14] + input[17];
    bf1[15] = input[15] + input[16];
    bf1[16] = -input[16] + input[15];
    bf1[17] = -input[17] + input[14];
    bf1[18] = -input[18] + input[13];
    bf1[19] = -input[19] + input[12];
    bf1[20] = -input[20] + input[11];
    bf1[21] = -input[21] + input[10];
    bf1[22] = -input[22] + input[9];
    bf1[23] = -input[23] + input[8];
    bf1[24] = -input[24] + input[7];
    bf1[25] = -input[25] + input[6];
    bf1[26] = -input[26] + input[5];
    bf1[27] = -input[27] + input[4];
    bf1[28] = -input[28] + input[3];
    bf1[29] = -input[29] + input[2];
    bf1[30] = -input[30] + input[1];
    bf1[31] = -input[31] + input[0];

    // stage 2: 16-point butterfly on the even half; rotations on [20..27]
    cospi = cospi_arr(cos_bit);
    bf0 = output;
    bf1 = step;
    bf1[0] = bf0[0] + bf0[15];
    bf1[1] = bf0[1] + bf0[14];
    bf1[2] = bf0[2] + bf0[13];
    bf1[3] = bf0[3] + bf0[12];
    bf1[4] = bf0[4] + bf0[11];
    bf1[5] = bf0[5] + bf0[10];
    bf1[6] = bf0[6] + bf0[9];
    bf1[7] = bf0[7] + bf0[8];
    bf1[8] = -bf0[8] + bf0[7];
    bf1[9] = -bf0[9] + bf0[6];
    bf1[10] = -bf0[10] + bf0[5];
    bf1[11] = -bf0[11] + bf0[4];
    bf1[12] = -bf0[12] + bf0[3];
    bf1[13] = -bf0[13] + bf0[2];
    bf1[14] = -bf0[14] + bf0[1];
    bf1[15] = -bf0[15] + bf0[0];
    bf1[16] = bf0[16];
    bf1[17] = bf0[17];
    bf1[18] = bf0[18];
    bf1[19] = bf0[19];
    bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
    bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
    bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
    bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
    bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
    bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
    bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
    bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
    bf1[28] = bf0[28];
    bf1[29] = bf0[29];
    bf1[30] = bf0[30];
    bf1[31] = bf0[31];

    // stage 3
    cospi = cospi_arr(cos_bit);
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0] + bf0[7];
    bf1[1] = bf0[1] + bf0[6];
    bf1[2] = bf0[2] + bf0[5];
    bf1[3] = bf0[3] + bf0[4];
    bf1[4] = -bf0[4] + bf0[3];
    bf1[5] = -bf0[5] + bf0[2];
    bf1[6] = -bf0[6] + bf0[1];
    bf1[7] = -bf0[7] + bf0[0];
    bf1[8] = bf0[8];
    bf1[9] = bf0[9];
    bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
    bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
    bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
    bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
    bf1[14] = bf0[14];
    bf1[15] = bf0[15];
    bf1[16] = bf0[16] + bf0[23];
    bf1[17] = bf0[17] + bf0[22];
    bf1[18] = bf0[18] + bf0[21];
    bf1[19] = bf0[19] + bf0[20];
    bf1[20] = -bf0[20] + bf0[19];
    bf1[21] = -bf0[21] + bf0[18];
    bf1[22] = -bf0[22] + bf0[17];
    bf1[23] = -bf0[23] + bf0[16];
    bf1[24] = -bf0[24] + bf0[31];
    bf1[25] = -bf0[25] + bf0[30];
    bf1[26] = -bf0[26] + bf0[29];
    bf1[27] = -bf0[27] + bf0[28];
    bf1[28] = bf0[28] + bf0[27];
    bf1[29] = bf0[29] + bf0[26];
    bf1[30] = bf0[30] + bf0[25];
    bf1[31] = bf0[31] + bf0[24];

    // stage 4
    cospi = cospi_arr(cos_bit);
    bf0 = output;
    bf1 = step;
    bf1[0] = bf0[0] + bf0[3];
    bf1[1] = bf0[1] + bf0[2];
    bf1[2] = -bf0[2] + bf0[1];
    bf1[3] = -bf0[3] + bf0[0];
    bf1[4] = bf0[4];
    bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
    bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
    bf1[7] = bf0[7];
    bf1[8] = bf0[8] + bf0[11];
    bf1[9] = bf0[9] + bf0[10];
    bf1[10] = -bf0[10] + bf0[9];
    bf1[11] = -bf0[11] + bf0[8];
    bf1[12] = -bf0[12] + bf0[15];
    bf1[13] = -bf0[13] + bf0[14];
    bf1[14] = bf0[14] + bf0[13];
    bf1[15] = bf0[15] + bf0[12];
    bf1[16] = bf0[16];
    bf1[17] = bf0[17];
    bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
    bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
    bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
    bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
    bf1[22] = bf0[22];
    bf1[23] = bf0[23];
    bf1[24] = bf0[24];
    bf1[25] = bf0[25];
    bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
    bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
    bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
    bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
    bf1[30] = bf0[30];
    bf1[31] = bf0[31];

    // stage 5
    cospi = cospi_arr(cos_bit);
    bf0 = step;
    bf1 = output;
    bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
    bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
    bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
    bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
    bf1[4] = bf0[4] + bf0[5];
    bf1[5] = -bf0[5] + bf0[4];
    bf1[6] = -bf0[6] + bf0[7];
    bf1[7] = bf0[7] + bf0[6];
    bf1[8] = bf0[8];
    bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
    bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
    bf1[11] = bf0[11];
    bf1[12] = bf0[12];
    bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
    bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
    bf1[15] = bf0[15];
    bf1[16] = bf0[16] + bf0[19];
    bf1[17] = bf0[17] + bf0[18];
    bf1[18] = -bf0[18] + bf0[17];
    bf1[19] = -bf0[19] + bf0[16];
    bf1[20] = -bf0[20] + bf0[23];
    bf1[21] = -bf0[21] + bf0[22];
    bf1[22] = bf0[22] + bf0[21];
    bf1[23] = bf0[23] + bf0[20];
    bf1[24] = bf0[24] + bf0[27];
    bf1[25] = bf0[25] + bf0[26];
    bf1[26] = -bf0[26] + bf0[25];
    bf1[27] = -bf0[27] + bf0[24];
    bf1[28] = -bf0[28] + bf0[31];
    bf1[29] = -bf0[29] + bf0[30];
    bf1[30] = bf0[30] + bf0[29];
    bf1[31] = bf0[31] + bf0[28];

    // stage 6
    cospi = cospi_arr(cos_bit);
    bf0 = output;
    bf1 = step;
    bf1[0] = bf0[0];
    bf1[1] = bf0[1];
    bf1[2] = bf0[2];
    bf1[3] = bf0[3];
    bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
    bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
    bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
    bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
    bf1[8] = bf0[8] + bf0[9];
    bf1[9] = -bf0[9] + bf0[8];
    bf1[10] = -bf0[10] + bf0[11];
    bf1[11] = bf0[11] + bf0[10];
    bf1[12] = bf0[12] + bf0[13];
    bf1[13] = -bf0[13] + bf0[12];
    bf1[14] = -bf0[14] + bf0[15];
    bf1[15] = bf0[15] + bf0[14];
    bf1[16] = bf0[16];
    bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
    bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
    bf1[19] = bf0[19];
    bf1[20] = bf0[20];
    bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
    bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
    bf1[23] = bf0[23];
    bf1[24] = bf0[24];
    bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
    bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
    bf1[27] = bf0[27];
    bf1[28] = bf0[28];
    bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
    bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
    bf1[31] = bf0[31];

    // stage 7
    cospi = cospi_arr(cos_bit);
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0];
    bf1[1] = bf0[1];
    bf1[2] = bf0[2];
    bf1[3] = bf0[3];
    bf1[4] = bf0[4];
    bf1[5] = bf0[5];
    bf1[6] = bf0[6];
    bf1[7] = bf0[7];
    bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
    bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
    bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
    bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
    bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
    bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
    bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
    bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
    bf1[16] = bf0[16] + bf0[17];
    bf1[17] = -bf0[17] + bf0[16];
    bf1[18] = -bf0[18] + bf0[19];
    bf1[19] = bf0[19] + bf0[18];
    bf1[20] = bf0[20] + bf0[21];
    bf1[21] = -bf0[21] + bf0[20];
    bf1[22] = -bf0[22] + bf0[23];
    bf1[23] = bf0[23] + bf0[22];
    bf1[24] = bf0[24] + bf0[25];
    bf1[25] = -bf0[25] + bf0[24];
    bf1[26] = -bf0[26] + bf0[27];
    bf1[27] = bf0[27] + bf0[26];
    bf1[28] = bf0[28] + bf0[29];
    bf1[29] = -bf0[29] + bf0[28];
    bf1[30] = -bf0[30] + bf0[31];
    bf1[31] = bf0[31] + bf0[30];

    // stage 8: rotations producing the odd-index outputs
    cospi = cospi_arr(cos_bit);
    bf0 = output;
    bf1 = step;
    bf1[0] = bf0[0];
    bf1[1] = bf0[1];
    bf1[2] = bf0[2];
    bf1[3] = bf0[3];
    bf1[4] = bf0[4];
    bf1[5] = bf0[5];
    bf1[6] = bf0[6];
    bf1[7] = bf0[7];
    bf1[8] = bf0[8];
    bf1[9] = bf0[9];
    bf1[10] = bf0[10];
    bf1[11] = bf0[11];
    bf1[12] = bf0[12];
    bf1[13] = bf0[13];
    bf1[14] = bf0[14];
    bf1[15] = bf0[15];
    bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
    bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit);
    bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
    bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit);
    bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
    bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit);
    bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
    bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit);
    bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
    bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit);
    bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
    bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit);
    bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
    bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
    bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
    bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);

    // stage 9: bit-reversal permutation into natural DCT output order
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0];
    bf1[1] = bf0[16];
    bf1[2] = bf0[8];
    bf1[3] = bf0[24];
    bf1[4] = bf0[4];
    bf1[5] = bf0[20];
    bf1[6] = bf0[12];
    bf1[7] = bf0[28];
    bf1[8] = bf0[2];
    bf1[9] = bf0[18];
    bf1[10] = bf0[10];
    bf1[11] = bf0[26];
    bf1[12] = bf0[6];
    bf1[13] = bf0[22];
    bf1[14] = bf0[14];
    bf1[15] = bf0[30];
    bf1[16] = bf0[1];
    bf1[17] = bf0[17];
    bf1[18] = bf0[9];
    bf1[19] = bf0[25];
    bf1[20] = bf0[5];
    bf1[21] = bf0[21];
    bf1[22] = bf0[13];
    bf1[23] = bf0[29];
    bf1[24] = bf0[3];
    bf1[25] = bf0[19];
    bf1[26] = bf0[11];
    bf1[27] = bf0[27];
    bf1[28] = bf0[7];
    bf1[29] = bf0[23];
    bf1[30] = bf0[15];
    bf1[31] = bf0[31];
}
svt_av1_fdct64_new(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)678 void svt_av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
679 const int8_t *stage_range) {
680 (void)stage_range;
681 const int32_t *cospi;
682
683 int32_t *bf0, *bf1;
684 int32_t step[64];
685
686 // stage 0;
687
688 // stage 1;
689 bf1 = output;
690 bf1[0] = input[0] + input[63];
691 bf1[1] = input[1] + input[62];
692 bf1[2] = input[2] + input[61];
693 bf1[3] = input[3] + input[60];
694 bf1[4] = input[4] + input[59];
695 bf1[5] = input[5] + input[58];
696 bf1[6] = input[6] + input[57];
697 bf1[7] = input[7] + input[56];
698 bf1[8] = input[8] + input[55];
699 bf1[9] = input[9] + input[54];
700 bf1[10] = input[10] + input[53];
701 bf1[11] = input[11] + input[52];
702 bf1[12] = input[12] + input[51];
703 bf1[13] = input[13] + input[50];
704 bf1[14] = input[14] + input[49];
705 bf1[15] = input[15] + input[48];
706 bf1[16] = input[16] + input[47];
707 bf1[17] = input[17] + input[46];
708 bf1[18] = input[18] + input[45];
709 bf1[19] = input[19] + input[44];
710 bf1[20] = input[20] + input[43];
711 bf1[21] = input[21] + input[42];
712 bf1[22] = input[22] + input[41];
713 bf1[23] = input[23] + input[40];
714 bf1[24] = input[24] + input[39];
715 bf1[25] = input[25] + input[38];
716 bf1[26] = input[26] + input[37];
717 bf1[27] = input[27] + input[36];
718 bf1[28] = input[28] + input[35];
719 bf1[29] = input[29] + input[34];
720 bf1[30] = input[30] + input[33];
721 bf1[31] = input[31] + input[32];
722 bf1[32] = -input[32] + input[31];
723 bf1[33] = -input[33] + input[30];
724 bf1[34] = -input[34] + input[29];
725 bf1[35] = -input[35] + input[28];
726 bf1[36] = -input[36] + input[27];
727 bf1[37] = -input[37] + input[26];
728 bf1[38] = -input[38] + input[25];
729 bf1[39] = -input[39] + input[24];
730 bf1[40] = -input[40] + input[23];
731 bf1[41] = -input[41] + input[22];
732 bf1[42] = -input[42] + input[21];
733 bf1[43] = -input[43] + input[20];
734 bf1[44] = -input[44] + input[19];
735 bf1[45] = -input[45] + input[18];
736 bf1[46] = -input[46] + input[17];
737 bf1[47] = -input[47] + input[16];
738 bf1[48] = -input[48] + input[15];
739 bf1[49] = -input[49] + input[14];
740 bf1[50] = -input[50] + input[13];
741 bf1[51] = -input[51] + input[12];
742 bf1[52] = -input[52] + input[11];
743 bf1[53] = -input[53] + input[10];
744 bf1[54] = -input[54] + input[9];
745 bf1[55] = -input[55] + input[8];
746 bf1[56] = -input[56] + input[7];
747 bf1[57] = -input[57] + input[6];
748 bf1[58] = -input[58] + input[5];
749 bf1[59] = -input[59] + input[4];
750 bf1[60] = -input[60] + input[3];
751 bf1[61] = -input[61] + input[2];
752 bf1[62] = -input[62] + input[1];
753 bf1[63] = -input[63] + input[0];
754
755 // stage 2
756 cospi = cospi_arr(cos_bit);
757 bf0 = output;
758 bf1 = step;
759 bf1[0] = bf0[0] + bf0[31];
760 bf1[1] = bf0[1] + bf0[30];
761 bf1[2] = bf0[2] + bf0[29];
762 bf1[3] = bf0[3] + bf0[28];
763 bf1[4] = bf0[4] + bf0[27];
764 bf1[5] = bf0[5] + bf0[26];
765 bf1[6] = bf0[6] + bf0[25];
766 bf1[7] = bf0[7] + bf0[24];
767 bf1[8] = bf0[8] + bf0[23];
768 bf1[9] = bf0[9] + bf0[22];
769 bf1[10] = bf0[10] + bf0[21];
770 bf1[11] = bf0[11] + bf0[20];
771 bf1[12] = bf0[12] + bf0[19];
772 bf1[13] = bf0[13] + bf0[18];
773 bf1[14] = bf0[14] + bf0[17];
774 bf1[15] = bf0[15] + bf0[16];
775 bf1[16] = -bf0[16] + bf0[15];
776 bf1[17] = -bf0[17] + bf0[14];
777 bf1[18] = -bf0[18] + bf0[13];
778 bf1[19] = -bf0[19] + bf0[12];
779 bf1[20] = -bf0[20] + bf0[11];
780 bf1[21] = -bf0[21] + bf0[10];
781 bf1[22] = -bf0[22] + bf0[9];
782 bf1[23] = -bf0[23] + bf0[8];
783 bf1[24] = -bf0[24] + bf0[7];
784 bf1[25] = -bf0[25] + bf0[6];
785 bf1[26] = -bf0[26] + bf0[5];
786 bf1[27] = -bf0[27] + bf0[4];
787 bf1[28] = -bf0[28] + bf0[3];
788 bf1[29] = -bf0[29] + bf0[2];
789 bf1[30] = -bf0[30] + bf0[1];
790 bf1[31] = -bf0[31] + bf0[0];
791 bf1[32] = bf0[32];
792 bf1[33] = bf0[33];
793 bf1[34] = bf0[34];
794 bf1[35] = bf0[35];
795 bf1[36] = bf0[36];
796 bf1[37] = bf0[37];
797 bf1[38] = bf0[38];
798 bf1[39] = bf0[39];
799 bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
800 bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
801 bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
802 bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
803 bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
804 bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
805 bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
806 bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
807 bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit);
808 bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit);
809 bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit);
810 bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit);
811 bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit);
812 bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit);
813 bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit);
814 bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit);
815 bf1[56] = bf0[56];
816 bf1[57] = bf0[57];
817 bf1[58] = bf0[58];
818 bf1[59] = bf0[59];
819 bf1[60] = bf0[60];
820 bf1[61] = bf0[61];
821 bf1[62] = bf0[62];
822 bf1[63] = bf0[63];
823
824 // stage 3
825 cospi = cospi_arr(cos_bit);
826 bf0 = step;
827 bf1 = output;
828 bf1[0] = bf0[0] + bf0[15];
829 bf1[1] = bf0[1] + bf0[14];
830 bf1[2] = bf0[2] + bf0[13];
831 bf1[3] = bf0[3] + bf0[12];
832 bf1[4] = bf0[4] + bf0[11];
833 bf1[5] = bf0[5] + bf0[10];
834 bf1[6] = bf0[6] + bf0[9];
835 bf1[7] = bf0[7] + bf0[8];
836 bf1[8] = -bf0[8] + bf0[7];
837 bf1[9] = -bf0[9] + bf0[6];
838 bf1[10] = -bf0[10] + bf0[5];
839 bf1[11] = -bf0[11] + bf0[4];
840 bf1[12] = -bf0[12] + bf0[3];
841 bf1[13] = -bf0[13] + bf0[2];
842 bf1[14] = -bf0[14] + bf0[1];
843 bf1[15] = -bf0[15] + bf0[0];
844 bf1[16] = bf0[16];
845 bf1[17] = bf0[17];
846 bf1[18] = bf0[18];
847 bf1[19] = bf0[19];
848 bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
849 bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
850 bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
851 bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
852 bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
853 bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
854 bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
855 bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
856 bf1[28] = bf0[28];
857 bf1[29] = bf0[29];
858 bf1[30] = bf0[30];
859 bf1[31] = bf0[31];
860 bf1[32] = bf0[32] + bf0[47];
861 bf1[33] = bf0[33] + bf0[46];
862 bf1[34] = bf0[34] + bf0[45];
863 bf1[35] = bf0[35] + bf0[44];
864 bf1[36] = bf0[36] + bf0[43];
865 bf1[37] = bf0[37] + bf0[42];
866 bf1[38] = bf0[38] + bf0[41];
867 bf1[39] = bf0[39] + bf0[40];
868 bf1[40] = -bf0[40] + bf0[39];
869 bf1[41] = -bf0[41] + bf0[38];
870 bf1[42] = -bf0[42] + bf0[37];
871 bf1[43] = -bf0[43] + bf0[36];
872 bf1[44] = -bf0[44] + bf0[35];
873 bf1[45] = -bf0[45] + bf0[34];
874 bf1[46] = -bf0[46] + bf0[33];
875 bf1[47] = -bf0[47] + bf0[32];
876 bf1[48] = -bf0[48] + bf0[63];
877 bf1[49] = -bf0[49] + bf0[62];
878 bf1[50] = -bf0[50] + bf0[61];
879 bf1[51] = -bf0[51] + bf0[60];
880 bf1[52] = -bf0[52] + bf0[59];
881 bf1[53] = -bf0[53] + bf0[58];
882 bf1[54] = -bf0[54] + bf0[57];
883 bf1[55] = -bf0[55] + bf0[56];
884 bf1[56] = bf0[56] + bf0[55];
885 bf1[57] = bf0[57] + bf0[54];
886 bf1[58] = bf0[58] + bf0[53];
887 bf1[59] = bf0[59] + bf0[52];
888 bf1[60] = bf0[60] + bf0[51];
889 bf1[61] = bf0[61] + bf0[50];
890 bf1[62] = bf0[62] + bf0[49];
891 bf1[63] = bf0[63] + bf0[48];
892
893 // stage 4
894 cospi = cospi_arr(cos_bit);
895 bf0 = output;
896 bf1 = step;
897 bf1[0] = bf0[0] + bf0[7];
898 bf1[1] = bf0[1] + bf0[6];
899 bf1[2] = bf0[2] + bf0[5];
900 bf1[3] = bf0[3] + bf0[4];
901 bf1[4] = -bf0[4] + bf0[3];
902 bf1[5] = -bf0[5] + bf0[2];
903 bf1[6] = -bf0[6] + bf0[1];
904 bf1[7] = -bf0[7] + bf0[0];
905 bf1[8] = bf0[8];
906 bf1[9] = bf0[9];
907 bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
908 bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
909 bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
910 bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
911 bf1[14] = bf0[14];
912 bf1[15] = bf0[15];
913 bf1[16] = bf0[16] + bf0[23];
914 bf1[17] = bf0[17] + bf0[22];
915 bf1[18] = bf0[18] + bf0[21];
916 bf1[19] = bf0[19] + bf0[20];
917 bf1[20] = -bf0[20] + bf0[19];
918 bf1[21] = -bf0[21] + bf0[18];
919 bf1[22] = -bf0[22] + bf0[17];
920 bf1[23] = -bf0[23] + bf0[16];
921 bf1[24] = -bf0[24] + bf0[31];
922 bf1[25] = -bf0[25] + bf0[30];
923 bf1[26] = -bf0[26] + bf0[29];
924 bf1[27] = -bf0[27] + bf0[28];
925 bf1[28] = bf0[28] + bf0[27];
926 bf1[29] = bf0[29] + bf0[26];
927 bf1[30] = bf0[30] + bf0[25];
928 bf1[31] = bf0[31] + bf0[24];
929 bf1[32] = bf0[32];
930 bf1[33] = bf0[33];
931 bf1[34] = bf0[34];
932 bf1[35] = bf0[35];
933 bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit);
934 bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit);
935 bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit);
936 bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit);
937 bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit);
938 bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit);
939 bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit);
940 bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit);
941 bf1[44] = bf0[44];
942 bf1[45] = bf0[45];
943 bf1[46] = bf0[46];
944 bf1[47] = bf0[47];
945 bf1[48] = bf0[48];
946 bf1[49] = bf0[49];
947 bf1[50] = bf0[50];
948 bf1[51] = bf0[51];
949 bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit);
950 bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit);
951 bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit);
952 bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit);
953 bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit);
954 bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit);
955 bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit);
956 bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit);
957 bf1[60] = bf0[60];
958 bf1[61] = bf0[61];
959 bf1[62] = bf0[62];
960 bf1[63] = bf0[63];
961
962 // stage 5
963 cospi = cospi_arr(cos_bit);
964 bf0 = step;
965 bf1 = output;
966 bf1[0] = bf0[0] + bf0[3];
967 bf1[1] = bf0[1] + bf0[2];
968 bf1[2] = -bf0[2] + bf0[1];
969 bf1[3] = -bf0[3] + bf0[0];
970 bf1[4] = bf0[4];
971 bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
972 bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
973 bf1[7] = bf0[7];
974 bf1[8] = bf0[8] + bf0[11];
975 bf1[9] = bf0[9] + bf0[10];
976 bf1[10] = -bf0[10] + bf0[9];
977 bf1[11] = -bf0[11] + bf0[8];
978 bf1[12] = -bf0[12] + bf0[15];
979 bf1[13] = -bf0[13] + bf0[14];
980 bf1[14] = bf0[14] + bf0[13];
981 bf1[15] = bf0[15] + bf0[12];
982 bf1[16] = bf0[16];
983 bf1[17] = bf0[17];
984 bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
985 bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
986 bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
987 bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
988 bf1[22] = bf0[22];
989 bf1[23] = bf0[23];
990 bf1[24] = bf0[24];
991 bf1[25] = bf0[25];
992 bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
993 bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
994 bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
995 bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
996 bf1[30] = bf0[30];
997 bf1[31] = bf0[31];
998 bf1[32] = bf0[32] + bf0[39];
999 bf1[33] = bf0[33] + bf0[38];
1000 bf1[34] = bf0[34] + bf0[37];
1001 bf1[35] = bf0[35] + bf0[36];
1002 bf1[36] = -bf0[36] + bf0[35];
1003 bf1[37] = -bf0[37] + bf0[34];
1004 bf1[38] = -bf0[38] + bf0[33];
1005 bf1[39] = -bf0[39] + bf0[32];
1006 bf1[40] = -bf0[40] + bf0[47];
1007 bf1[41] = -bf0[41] + bf0[46];
1008 bf1[42] = -bf0[42] + bf0[45];
1009 bf1[43] = -bf0[43] + bf0[44];
1010 bf1[44] = bf0[44] + bf0[43];
1011 bf1[45] = bf0[45] + bf0[42];
1012 bf1[46] = bf0[46] + bf0[41];
1013 bf1[47] = bf0[47] + bf0[40];
1014 bf1[48] = bf0[48] + bf0[55];
1015 bf1[49] = bf0[49] + bf0[54];
1016 bf1[50] = bf0[50] + bf0[53];
1017 bf1[51] = bf0[51] + bf0[52];
1018 bf1[52] = -bf0[52] + bf0[51];
1019 bf1[53] = -bf0[53] + bf0[50];
1020 bf1[54] = -bf0[54] + bf0[49];
1021 bf1[55] = -bf0[55] + bf0[48];
1022 bf1[56] = -bf0[56] + bf0[63];
1023 bf1[57] = -bf0[57] + bf0[62];
1024 bf1[58] = -bf0[58] + bf0[61];
1025 bf1[59] = -bf0[59] + bf0[60];
1026 bf1[60] = bf0[60] + bf0[59];
1027 bf1[61] = bf0[61] + bf0[58];
1028 bf1[62] = bf0[62] + bf0[57];
1029 bf1[63] = bf0[63] + bf0[56];
1030
1031 // stage 6
1032 cospi = cospi_arr(cos_bit);
1033 bf0 = output;
1034 bf1 = step;
1035 bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
1036 bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
1037 bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
1038 bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
1039 bf1[4] = bf0[4] + bf0[5];
1040 bf1[5] = -bf0[5] + bf0[4];
1041 bf1[6] = -bf0[6] + bf0[7];
1042 bf1[7] = bf0[7] + bf0[6];
1043 bf1[8] = bf0[8];
1044 bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
1045 bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
1046 bf1[11] = bf0[11];
1047 bf1[12] = bf0[12];
1048 bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
1049 bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
1050 bf1[15] = bf0[15];
1051 bf1[16] = bf0[16] + bf0[19];
1052 bf1[17] = bf0[17] + bf0[18];
1053 bf1[18] = -bf0[18] + bf0[17];
1054 bf1[19] = -bf0[19] + bf0[16];
1055 bf1[20] = -bf0[20] + bf0[23];
1056 bf1[21] = -bf0[21] + bf0[22];
1057 bf1[22] = bf0[22] + bf0[21];
1058 bf1[23] = bf0[23] + bf0[20];
1059 bf1[24] = bf0[24] + bf0[27];
1060 bf1[25] = bf0[25] + bf0[26];
1061 bf1[26] = -bf0[26] + bf0[25];
1062 bf1[27] = -bf0[27] + bf0[24];
1063 bf1[28] = -bf0[28] + bf0[31];
1064 bf1[29] = -bf0[29] + bf0[30];
1065 bf1[30] = bf0[30] + bf0[29];
1066 bf1[31] = bf0[31] + bf0[28];
1067 bf1[32] = bf0[32];
1068 bf1[33] = bf0[33];
1069 bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit);
1070 bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit);
1071 bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit);
1072 bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit);
1073 bf1[38] = bf0[38];
1074 bf1[39] = bf0[39];
1075 bf1[40] = bf0[40];
1076 bf1[41] = bf0[41];
1077 bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit);
1078 bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit);
1079 bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit);
1080 bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit);
1081 bf1[46] = bf0[46];
1082 bf1[47] = bf0[47];
1083 bf1[48] = bf0[48];
1084 bf1[49] = bf0[49];
1085 bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit);
1086 bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit);
1087 bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit);
1088 bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit);
1089 bf1[54] = bf0[54];
1090 bf1[55] = bf0[55];
1091 bf1[56] = bf0[56];
1092 bf1[57] = bf0[57];
1093 bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit);
1094 bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit);
1095 bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit);
1096 bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit);
1097 bf1[62] = bf0[62];
1098 bf1[63] = bf0[63];
1099
1100 // stage 7
1101 cospi = cospi_arr(cos_bit);
1102 bf0 = step;
1103 bf1 = output;
1104 bf1[0] = bf0[0];
1105 bf1[1] = bf0[1];
1106 bf1[2] = bf0[2];
1107 bf1[3] = bf0[3];
1108 bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
1109 bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
1110 bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
1111 bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
1112 bf1[8] = bf0[8] + bf0[9];
1113 bf1[9] = -bf0[9] + bf0[8];
1114 bf1[10] = -bf0[10] + bf0[11];
1115 bf1[11] = bf0[11] + bf0[10];
1116 bf1[12] = bf0[12] + bf0[13];
1117 bf1[13] = -bf0[13] + bf0[12];
1118 bf1[14] = -bf0[14] + bf0[15];
1119 bf1[15] = bf0[15] + bf0[14];
1120 bf1[16] = bf0[16];
1121 bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
1122 bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
1123 bf1[19] = bf0[19];
1124 bf1[20] = bf0[20];
1125 bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
1126 bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
1127 bf1[23] = bf0[23];
1128 bf1[24] = bf0[24];
1129 bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
1130 bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
1131 bf1[27] = bf0[27];
1132 bf1[28] = bf0[28];
1133 bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
1134 bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
1135 bf1[31] = bf0[31];
1136 bf1[32] = bf0[32] + bf0[35];
1137 bf1[33] = bf0[33] + bf0[34];
1138 bf1[34] = -bf0[34] + bf0[33];
1139 bf1[35] = -bf0[35] + bf0[32];
1140 bf1[36] = -bf0[36] + bf0[39];
1141 bf1[37] = -bf0[37] + bf0[38];
1142 bf1[38] = bf0[38] + bf0[37];
1143 bf1[39] = bf0[39] + bf0[36];
1144 bf1[40] = bf0[40] + bf0[43];
1145 bf1[41] = bf0[41] + bf0[42];
1146 bf1[42] = -bf0[42] + bf0[41];
1147 bf1[43] = -bf0[43] + bf0[40];
1148 bf1[44] = -bf0[44] + bf0[47];
1149 bf1[45] = -bf0[45] + bf0[46];
1150 bf1[46] = bf0[46] + bf0[45];
1151 bf1[47] = bf0[47] + bf0[44];
1152 bf1[48] = bf0[48] + bf0[51];
1153 bf1[49] = bf0[49] + bf0[50];
1154 bf1[50] = -bf0[50] + bf0[49];
1155 bf1[51] = -bf0[51] + bf0[48];
1156 bf1[52] = -bf0[52] + bf0[55];
1157 bf1[53] = -bf0[53] + bf0[54];
1158 bf1[54] = bf0[54] + bf0[53];
1159 bf1[55] = bf0[55] + bf0[52];
1160 bf1[56] = bf0[56] + bf0[59];
1161 bf1[57] = bf0[57] + bf0[58];
1162 bf1[58] = -bf0[58] + bf0[57];
1163 bf1[59] = -bf0[59] + bf0[56];
1164 bf1[60] = -bf0[60] + bf0[63];
1165 bf1[61] = -bf0[61] + bf0[62];
1166 bf1[62] = bf0[62] + bf0[61];
1167 bf1[63] = bf0[63] + bf0[60];
1168
1169 // stage 8
1170 cospi = cospi_arr(cos_bit);
1171 bf0 = output;
1172 bf1 = step;
1173 bf1[0] = bf0[0];
1174 bf1[1] = bf0[1];
1175 bf1[2] = bf0[2];
1176 bf1[3] = bf0[3];
1177 bf1[4] = bf0[4];
1178 bf1[5] = bf0[5];
1179 bf1[6] = bf0[6];
1180 bf1[7] = bf0[7];
1181 bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
1182 bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
1183 bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
1184 bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
1185 bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
1186 bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
1187 bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
1188 bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
1189 bf1[16] = bf0[16] + bf0[17];
1190 bf1[17] = -bf0[17] + bf0[16];
1191 bf1[18] = -bf0[18] + bf0[19];
1192 bf1[19] = bf0[19] + bf0[18];
1193 bf1[20] = bf0[20] + bf0[21];
1194 bf1[21] = -bf0[21] + bf0[20];
1195 bf1[22] = -bf0[22] + bf0[23];
1196 bf1[23] = bf0[23] + bf0[22];
1197 bf1[24] = bf0[24] + bf0[25];
1198 bf1[25] = -bf0[25] + bf0[24];
1199 bf1[26] = -bf0[26] + bf0[27];
1200 bf1[27] = bf0[27] + bf0[26];
1201 bf1[28] = bf0[28] + bf0[29];
1202 bf1[29] = -bf0[29] + bf0[28];
1203 bf1[30] = -bf0[30] + bf0[31];
1204 bf1[31] = bf0[31] + bf0[30];
1205 bf1[32] = bf0[32];
1206 bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit);
1207 bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit);
1208 bf1[35] = bf0[35];
1209 bf1[36] = bf0[36];
1210 bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit);
1211 bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit);
1212 bf1[39] = bf0[39];
1213 bf1[40] = bf0[40];
1214 bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit);
1215 bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit);
1216 bf1[43] = bf0[43];
1217 bf1[44] = bf0[44];
1218 bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit);
1219 bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit);
1220 bf1[47] = bf0[47];
1221 bf1[48] = bf0[48];
1222 bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit);
1223 bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit);
1224 bf1[51] = bf0[51];
1225 bf1[52] = bf0[52];
1226 bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit);
1227 bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit);
1228 bf1[55] = bf0[55];
1229 bf1[56] = bf0[56];
1230 bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit);
1231 bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit);
1232 bf1[59] = bf0[59];
1233 bf1[60] = bf0[60];
1234 bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit);
1235 bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit);
1236 bf1[63] = bf0[63];
1237
1238 // stage 9
1239 cospi = cospi_arr(cos_bit);
1240 bf0 = step;
1241 bf1 = output;
1242 bf1[0] = bf0[0];
1243 bf1[1] = bf0[1];
1244 bf1[2] = bf0[2];
1245 bf1[3] = bf0[3];
1246 bf1[4] = bf0[4];
1247 bf1[5] = bf0[5];
1248 bf1[6] = bf0[6];
1249 bf1[7] = bf0[7];
1250 bf1[8] = bf0[8];
1251 bf1[9] = bf0[9];
1252 bf1[10] = bf0[10];
1253 bf1[11] = bf0[11];
1254 bf1[12] = bf0[12];
1255 bf1[13] = bf0[13];
1256 bf1[14] = bf0[14];
1257 bf1[15] = bf0[15];
1258 bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
1259 bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit);
1260 bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
1261 bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit);
1262 bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
1263 bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit);
1264 bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
1265 bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit);
1266 bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
1267 bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit);
1268 bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
1269 bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit);
1270 bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
1271 bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
1272 bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
1273 bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);
1274 bf1[32] = bf0[32] + bf0[33];
1275 bf1[33] = -bf0[33] + bf0[32];
1276 bf1[34] = -bf0[34] + bf0[35];
1277 bf1[35] = bf0[35] + bf0[34];
1278 bf1[36] = bf0[36] + bf0[37];
1279 bf1[37] = -bf0[37] + bf0[36];
1280 bf1[38] = -bf0[38] + bf0[39];
1281 bf1[39] = bf0[39] + bf0[38];
1282 bf1[40] = bf0[40] + bf0[41];
1283 bf1[41] = -bf0[41] + bf0[40];
1284 bf1[42] = -bf0[42] + bf0[43];
1285 bf1[43] = bf0[43] + bf0[42];
1286 bf1[44] = bf0[44] + bf0[45];
1287 bf1[45] = -bf0[45] + bf0[44];
1288 bf1[46] = -bf0[46] + bf0[47];
1289 bf1[47] = bf0[47] + bf0[46];
1290 bf1[48] = bf0[48] + bf0[49];
1291 bf1[49] = -bf0[49] + bf0[48];
1292 bf1[50] = -bf0[50] + bf0[51];
1293 bf1[51] = bf0[51] + bf0[50];
1294 bf1[52] = bf0[52] + bf0[53];
1295 bf1[53] = -bf0[53] + bf0[52];
1296 bf1[54] = -bf0[54] + bf0[55];
1297 bf1[55] = bf0[55] + bf0[54];
1298 bf1[56] = bf0[56] + bf0[57];
1299 bf1[57] = -bf0[57] + bf0[56];
1300 bf1[58] = -bf0[58] + bf0[59];
1301 bf1[59] = bf0[59] + bf0[58];
1302 bf1[60] = bf0[60] + bf0[61];
1303 bf1[61] = -bf0[61] + bf0[60];
1304 bf1[62] = -bf0[62] + bf0[63];
1305 bf1[63] = bf0[63] + bf0[62];
1306
1307 // stage 10
1308 cospi = cospi_arr(cos_bit);
1309 bf0 = output;
1310 bf1 = step;
1311 bf1[0] = bf0[0];
1312 bf1[1] = bf0[1];
1313 bf1[2] = bf0[2];
1314 bf1[3] = bf0[3];
1315 bf1[4] = bf0[4];
1316 bf1[5] = bf0[5];
1317 bf1[6] = bf0[6];
1318 bf1[7] = bf0[7];
1319 bf1[8] = bf0[8];
1320 bf1[9] = bf0[9];
1321 bf1[10] = bf0[10];
1322 bf1[11] = bf0[11];
1323 bf1[12] = bf0[12];
1324 bf1[13] = bf0[13];
1325 bf1[14] = bf0[14];
1326 bf1[15] = bf0[15];
1327 bf1[16] = bf0[16];
1328 bf1[17] = bf0[17];
1329 bf1[18] = bf0[18];
1330 bf1[19] = bf0[19];
1331 bf1[20] = bf0[20];
1332 bf1[21] = bf0[21];
1333 bf1[22] = bf0[22];
1334 bf1[23] = bf0[23];
1335 bf1[24] = bf0[24];
1336 bf1[25] = bf0[25];
1337 bf1[26] = bf0[26];
1338 bf1[27] = bf0[27];
1339 bf1[28] = bf0[28];
1340 bf1[29] = bf0[29];
1341 bf1[30] = bf0[30];
1342 bf1[31] = bf0[31];
1343 bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit);
1344 bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit);
1345 bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit);
1346 bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit);
1347 bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit);
1348 bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit);
1349 bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit);
1350 bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit);
1351 bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit);
1352 bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit);
1353 bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit);
1354 bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit);
1355 bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit);
1356 bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit);
1357 bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit);
1358 bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit);
1359 bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit);
1360 bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit);
1361 bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit);
1362 bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit);
1363 bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit);
1364 bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit);
1365 bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit);
1366 bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit);
1367 bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit);
1368 bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit);
1369 bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit);
1370 bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit);
1371 bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit);
1372 bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit);
1373 bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit);
1374 bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit);
1375
1376 // stage 11
1377 bf0 = step;
1378 bf1 = output;
1379 bf1[0] = bf0[0];
1380 bf1[1] = bf0[32];
1381 bf1[2] = bf0[16];
1382 bf1[3] = bf0[48];
1383 bf1[4] = bf0[8];
1384 bf1[5] = bf0[40];
1385 bf1[6] = bf0[24];
1386 bf1[7] = bf0[56];
1387 bf1[8] = bf0[4];
1388 bf1[9] = bf0[36];
1389 bf1[10] = bf0[20];
1390 bf1[11] = bf0[52];
1391 bf1[12] = bf0[12];
1392 bf1[13] = bf0[44];
1393 bf1[14] = bf0[28];
1394 bf1[15] = bf0[60];
1395 bf1[16] = bf0[2];
1396 bf1[17] = bf0[34];
1397 bf1[18] = bf0[18];
1398 bf1[19] = bf0[50];
1399 bf1[20] = bf0[10];
1400 bf1[21] = bf0[42];
1401 bf1[22] = bf0[26];
1402 bf1[23] = bf0[58];
1403 bf1[24] = bf0[6];
1404 bf1[25] = bf0[38];
1405 bf1[26] = bf0[22];
1406 bf1[27] = bf0[54];
1407 bf1[28] = bf0[14];
1408 bf1[29] = bf0[46];
1409 bf1[30] = bf0[30];
1410 bf1[31] = bf0[62];
1411 bf1[32] = bf0[1];
1412 bf1[33] = bf0[33];
1413 bf1[34] = bf0[17];
1414 bf1[35] = bf0[49];
1415 bf1[36] = bf0[9];
1416 bf1[37] = bf0[41];
1417 bf1[38] = bf0[25];
1418 bf1[39] = bf0[57];
1419 bf1[40] = bf0[5];
1420 bf1[41] = bf0[37];
1421 bf1[42] = bf0[21];
1422 bf1[43] = bf0[53];
1423 bf1[44] = bf0[13];
1424 bf1[45] = bf0[45];
1425 bf1[46] = bf0[29];
1426 bf1[47] = bf0[61];
1427 bf1[48] = bf0[3];
1428 bf1[49] = bf0[35];
1429 bf1[50] = bf0[19];
1430 bf1[51] = bf0[51];
1431 bf1[52] = bf0[11];
1432 bf1[53] = bf0[43];
1433 bf1[54] = bf0[27];
1434 bf1[55] = bf0[59];
1435 bf1[56] = bf0[7];
1436 bf1[57] = bf0[39];
1437 bf1[58] = bf0[23];
1438 bf1[59] = bf0[55];
1439 bf1[60] = bf0[15];
1440 bf1[61] = bf0[47];
1441 bf1[62] = bf0[31];
1442 bf1[63] = bf0[63];
1443 }
1444
/*
 * Forward 4-point ADST (asymmetric discrete sine transform), reference
 * fixed-point implementation.
 *
 * input       : 4 residual samples (one 1-D row or column).
 * output      : 4 transform coefficients. Input is fully read before output
 *               is written, so in-place operation is tolerated.
 * cos_bit     : fixed-point precision; selects the sine table via sinpi_arr()
 *               and is the final rounding shift.
 * stage_range : unused here (no per-stage range checking in this build).
 *
 * Improvement over previous revision: removed the large block of commented-out
 * range_check_value() code that duplicated stages 1-6; the live computation
 * below is unchanged.
 */
void svt_av1_fadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                        const int8_t *stage_range) {
    (void)stage_range;
    int32_t bit = cos_bit;
    const int32_t *sinpi = sinpi_arr(bit);
    int32_t x0, x1, x2, x3;
    int32_t s0, s1, s2, s3, s4, s5, s6, s7;

    // stage 0: load the four input samples
    x0 = input[0];
    x1 = input[1];
    x2 = input[2];
    x3 = input[3];

    // Fast path: an all-zero input transforms to an all-zero output.
    if (!(x0 | x1 | x2 | x3)) {
        output[0] = output[1] = output[2] = output[3] = 0;
        return;
    }

    // stage 1: per-sample multiplies by the fixed-point sine constants
    s0 = sinpi[1] * x0;
    s1 = sinpi[4] * x0;
    s2 = sinpi[2] * x1;
    s3 = sinpi[1] * x1;
    s4 = sinpi[3] * x2;
    s5 = sinpi[4] * x3;
    s6 = sinpi[2] * x3;
    s7 = x0 + x1;

    // stage 2
    s7 = s7 - x3;

    // stage 3
    x0 = s0 + s2;
    x1 = sinpi[3] * s7;
    x2 = s1 - s3;
    x3 = s4;

    // stage 4
    x0 = x0 + s5;
    x2 = x2 + s6;

    // stage 5
    s0 = x0 + x3;
    s1 = x1;
    s2 = x2 - x3;
    s3 = x2 - x0;

    // stage 6
    s3 = s3 + x3;

    // 1-D transform scaling factor is sqrt(2).
    output[0] = round_shift(s0, bit);
    output[1] = round_shift(s1, bit);
    output[2] = round_shift(s2, bit);
    output[3] = round_shift(s3, bit);
}
1534
/*
 * Forward 8-point ADST, reference fixed-point butterfly implementation.
 *
 * The transform runs as seven stages, ping-ponging between the caller's
 * output buffer and a local scratch buffer: each stage reads `src` and
 * writes `dst`, which never alias within a stage.
 *
 * input       : 8 residual samples (one 1-D row or column).
 * output      : 8 transform coefficients; must not alias input (asserted).
 * cos_bit     : fixed-point precision of the cosine table from cospi_arr().
 * stage_range : unused here (no per-stage range checking in this build).
 */
void svt_av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                        const int8_t *stage_range) {
    (void)stage_range;
    const int32_t *cospi;

    int32_t *src, *dst;
    int32_t scratch[8];

    // stage 0: nothing to do

    // stage 1: sign-flip / permute the inputs into the output buffer
    assert(output != input);
    dst = output;
    dst[0] = input[0];
    dst[1] = -input[7];
    dst[2] = -input[3];
    dst[3] = input[4];
    dst[4] = -input[1];
    dst[5] = input[6];
    dst[6] = input[2];
    dst[7] = -input[5];

    // stage 2: cospi[32] half-butterflies on the (2,3) and (6,7) pairs
    cospi = cospi_arr(cos_bit);
    src = output;
    dst = scratch;
    dst[0] = src[0];
    dst[1] = src[1];
    dst[2] = half_btf(cospi[32], src[2], cospi[32], src[3], cos_bit);
    dst[3] = half_btf(cospi[32], src[2], -cospi[32], src[3], cos_bit);
    dst[4] = src[4];
    dst[5] = src[5];
    dst[6] = half_btf(cospi[32], src[6], cospi[32], src[7], cos_bit);
    dst[7] = half_btf(cospi[32], src[6], -cospi[32], src[7], cos_bit);

    // stage 3: add/sub butterflies with stride 2
    src = scratch;
    dst = output;
    dst[0] = src[0] + src[2];
    dst[1] = src[1] + src[3];
    dst[2] = src[0] - src[2];
    dst[3] = src[1] - src[3];
    dst[4] = src[4] + src[6];
    dst[5] = src[5] + src[7];
    dst[6] = src[4] - src[6];
    dst[7] = src[5] - src[7];

    // stage 4: cospi[16]/cospi[48] rotations on the upper half
    cospi = cospi_arr(cos_bit);
    src = output;
    dst = scratch;
    dst[0] = src[0];
    dst[1] = src[1];
    dst[2] = src[2];
    dst[3] = src[3];
    dst[4] = half_btf(cospi[16], src[4], cospi[48], src[5], cos_bit);
    dst[5] = half_btf(cospi[48], src[4], -cospi[16], src[5], cos_bit);
    dst[6] = half_btf(-cospi[48], src[6], cospi[16], src[7], cos_bit);
    dst[7] = half_btf(cospi[16], src[6], cospi[48], src[7], cos_bit);

    // stage 5: add/sub butterflies with stride 4
    src = scratch;
    dst = output;
    dst[0] = src[0] + src[4];
    dst[1] = src[1] + src[5];
    dst[2] = src[2] + src[6];
    dst[3] = src[3] + src[7];
    dst[4] = src[0] - src[4];
    dst[5] = src[1] - src[5];
    dst[6] = src[2] - src[6];
    dst[7] = src[3] - src[7];

    // stage 6: final odd-cosine rotations on each adjacent pair
    cospi = cospi_arr(cos_bit);
    src = output;
    dst = scratch;
    dst[0] = half_btf(cospi[4], src[0], cospi[60], src[1], cos_bit);
    dst[1] = half_btf(cospi[60], src[0], -cospi[4], src[1], cos_bit);
    dst[2] = half_btf(cospi[20], src[2], cospi[44], src[3], cos_bit);
    dst[3] = half_btf(cospi[44], src[2], -cospi[20], src[3], cos_bit);
    dst[4] = half_btf(cospi[36], src[4], cospi[28], src[5], cos_bit);
    dst[5] = half_btf(cospi[28], src[4], -cospi[36], src[5], cos_bit);
    dst[6] = half_btf(cospi[52], src[6], cospi[12], src[7], cos_bit);
    dst[7] = half_btf(cospi[12], src[6], -cospi[52], src[7], cos_bit);

    // stage 7: write the coefficients out in ADST order
    src = scratch;
    dst = output;
    dst[0] = src[1];
    dst[1] = src[6];
    dst[2] = src[3];
    dst[3] = src[4];
    dst[4] = src[5];
    dst[5] = src[2];
    dst[6] = src[7];
    dst[7] = src[0];
}
1632
/*
 * Forward 16-point ADST, reference fixed-point butterfly implementation.
 *
 * Nine stages, ping-ponging between the caller's output buffer and a local
 * scratch buffer: each stage reads `src` and writes `dst`, which never
 * alias within a stage.
 *
 * input       : 16 residual samples (one 1-D row or column).
 * output      : 16 transform coefficients; must not alias input (asserted).
 * cos_bit     : fixed-point precision of the cosine table from cospi_arr().
 * stage_range : unused here (no per-stage range checking in this build).
 */
void svt_av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                         const int8_t *stage_range) {
    (void)stage_range;
    const int32_t *cospi;

    int32_t *src, *dst;
    int32_t scratch[16];

    // stage 0: nothing to do

    // stage 1: sign-flip / permute the inputs into the output buffer
    assert(output != input);
    dst = output;
    dst[0] = input[0];
    dst[1] = -input[15];
    dst[2] = -input[7];
    dst[3] = input[8];
    dst[4] = -input[3];
    dst[5] = input[12];
    dst[6] = input[4];
    dst[7] = -input[11];
    dst[8] = -input[1];
    dst[9] = input[14];
    dst[10] = input[6];
    dst[11] = -input[9];
    dst[12] = input[2];
    dst[13] = -input[13];
    dst[14] = -input[5];
    dst[15] = input[10];

    // stage 2: cospi[32] half-butterflies on every odd pair (2,3),(6,7),...
    cospi = cospi_arr(cos_bit);
    src = output;
    dst = scratch;
    dst[0] = src[0];
    dst[1] = src[1];
    dst[2] = half_btf(cospi[32], src[2], cospi[32], src[3], cos_bit);
    dst[3] = half_btf(cospi[32], src[2], -cospi[32], src[3], cos_bit);
    dst[4] = src[4];
    dst[5] = src[5];
    dst[6] = half_btf(cospi[32], src[6], cospi[32], src[7], cos_bit);
    dst[7] = half_btf(cospi[32], src[6], -cospi[32], src[7], cos_bit);
    dst[8] = src[8];
    dst[9] = src[9];
    dst[10] = half_btf(cospi[32], src[10], cospi[32], src[11], cos_bit);
    dst[11] = half_btf(cospi[32], src[10], -cospi[32], src[11], cos_bit);
    dst[12] = src[12];
    dst[13] = src[13];
    dst[14] = half_btf(cospi[32], src[14], cospi[32], src[15], cos_bit);
    dst[15] = half_btf(cospi[32], src[14], -cospi[32], src[15], cos_bit);

    // stage 3: add/sub butterflies with stride 2
    src = scratch;
    dst = output;
    dst[0] = src[0] + src[2];
    dst[1] = src[1] + src[3];
    dst[2] = src[0] - src[2];
    dst[3] = src[1] - src[3];
    dst[4] = src[4] + src[6];
    dst[5] = src[5] + src[7];
    dst[6] = src[4] - src[6];
    dst[7] = src[5] - src[7];
    dst[8] = src[8] + src[10];
    dst[9] = src[9] + src[11];
    dst[10] = src[8] - src[10];
    dst[11] = src[9] - src[11];
    dst[12] = src[12] + src[14];
    dst[13] = src[13] + src[15];
    dst[14] = src[12] - src[14];
    dst[15] = src[13] - src[15];

    // stage 4: cospi[16]/cospi[48] rotations on lanes 4-7 and 12-15
    cospi = cospi_arr(cos_bit);
    src = output;
    dst = scratch;
    dst[0] = src[0];
    dst[1] = src[1];
    dst[2] = src[2];
    dst[3] = src[3];
    dst[4] = half_btf(cospi[16], src[4], cospi[48], src[5], cos_bit);
    dst[5] = half_btf(cospi[48], src[4], -cospi[16], src[5], cos_bit);
    dst[6] = half_btf(-cospi[48], src[6], cospi[16], src[7], cos_bit);
    dst[7] = half_btf(cospi[16], src[6], cospi[48], src[7], cos_bit);
    dst[8] = src[8];
    dst[9] = src[9];
    dst[10] = src[10];
    dst[11] = src[11];
    dst[12] = half_btf(cospi[16], src[12], cospi[48], src[13], cos_bit);
    dst[13] = half_btf(cospi[48], src[12], -cospi[16], src[13], cos_bit);
    dst[14] = half_btf(-cospi[48], src[14], cospi[16], src[15], cos_bit);
    dst[15] = half_btf(cospi[16], src[14], cospi[48], src[15], cos_bit);

    // stage 5: add/sub butterflies with stride 4
    src = scratch;
    dst = output;
    dst[0] = src[0] + src[4];
    dst[1] = src[1] + src[5];
    dst[2] = src[2] + src[6];
    dst[3] = src[3] + src[7];
    dst[4] = src[0] - src[4];
    dst[5] = src[1] - src[5];
    dst[6] = src[2] - src[6];
    dst[7] = src[3] - src[7];
    dst[8] = src[8] + src[12];
    dst[9] = src[9] + src[13];
    dst[10] = src[10] + src[14];
    dst[11] = src[11] + src[15];
    dst[12] = src[8] - src[12];
    dst[13] = src[9] - src[13];
    dst[14] = src[10] - src[14];
    dst[15] = src[11] - src[15];

    // stage 6: cospi[8]/[56] and cospi[40]/[24] rotations on the upper half
    cospi = cospi_arr(cos_bit);
    src = output;
    dst = scratch;
    dst[0] = src[0];
    dst[1] = src[1];
    dst[2] = src[2];
    dst[3] = src[3];
    dst[4] = src[4];
    dst[5] = src[5];
    dst[6] = src[6];
    dst[7] = src[7];
    dst[8] = half_btf(cospi[8], src[8], cospi[56], src[9], cos_bit);
    dst[9] = half_btf(cospi[56], src[8], -cospi[8], src[9], cos_bit);
    dst[10] = half_btf(cospi[40], src[10], cospi[24], src[11], cos_bit);
    dst[11] = half_btf(cospi[24], src[10], -cospi[40], src[11], cos_bit);
    dst[12] = half_btf(-cospi[56], src[12], cospi[8], src[13], cos_bit);
    dst[13] = half_btf(cospi[8], src[12], cospi[56], src[13], cos_bit);
    dst[14] = half_btf(-cospi[24], src[14], cospi[40], src[15], cos_bit);
    dst[15] = half_btf(cospi[40], src[14], cospi[24], src[15], cos_bit);

    // stage 7: add/sub butterflies with stride 8
    src = scratch;
    dst = output;
    dst[0] = src[0] + src[8];
    dst[1] = src[1] + src[9];
    dst[2] = src[2] + src[10];
    dst[3] = src[3] + src[11];
    dst[4] = src[4] + src[12];
    dst[5] = src[5] + src[13];
    dst[6] = src[6] + src[14];
    dst[7] = src[7] + src[15];
    dst[8] = src[0] - src[8];
    dst[9] = src[1] - src[9];
    dst[10] = src[2] - src[10];
    dst[11] = src[3] - src[11];
    dst[12] = src[4] - src[12];
    dst[13] = src[5] - src[13];
    dst[14] = src[6] - src[14];
    dst[15] = src[7] - src[15];

    // stage 8: final odd-cosine rotations on each adjacent pair
    cospi = cospi_arr(cos_bit);
    src = output;
    dst = scratch;
    dst[0] = half_btf(cospi[2], src[0], cospi[62], src[1], cos_bit);
    dst[1] = half_btf(cospi[62], src[0], -cospi[2], src[1], cos_bit);
    dst[2] = half_btf(cospi[10], src[2], cospi[54], src[3], cos_bit);
    dst[3] = half_btf(cospi[54], src[2], -cospi[10], src[3], cos_bit);
    dst[4] = half_btf(cospi[18], src[4], cospi[46], src[5], cos_bit);
    dst[5] = half_btf(cospi[46], src[4], -cospi[18], src[5], cos_bit);
    dst[6] = half_btf(cospi[26], src[6], cospi[38], src[7], cos_bit);
    dst[7] = half_btf(cospi[38], src[6], -cospi[26], src[7], cos_bit);
    dst[8] = half_btf(cospi[34], src[8], cospi[30], src[9], cos_bit);
    dst[9] = half_btf(cospi[30], src[8], -cospi[34], src[9], cos_bit);
    dst[10] = half_btf(cospi[42], src[10], cospi[22], src[11], cos_bit);
    dst[11] = half_btf(cospi[22], src[10], -cospi[42], src[11], cos_bit);
    dst[12] = half_btf(cospi[50], src[12], cospi[14], src[13], cos_bit);
    dst[13] = half_btf(cospi[14], src[12], -cospi[50], src[13], cos_bit);
    dst[14] = half_btf(cospi[58], src[14], cospi[6], src[15], cos_bit);
    dst[15] = half_btf(cospi[6], src[14], -cospi[58], src[15], cos_bit);

    // stage 9: write the coefficients out in ADST order
    src = scratch;
    dst = output;
    dst[0] = src[1];
    dst[1] = src[14];
    dst[2] = src[3];
    dst[3] = src[12];
    dst[4] = src[5];
    dst[5] = src[10];
    dst[6] = src[7];
    dst[7] = src[8];
    dst[8] = src[9];
    dst[9] = src[6];
    dst[10] = src[11];
    dst[11] = src[4];
    dst[12] = src[13];
    dst[13] = src[2];
    dst[14] = src[15];
    dst[15] = src[0];
}
1827
/*
 * 1-D forward 32-point ADST (asymmetric discrete sine transform), C reference.
 *
 * input      : 32 residual samples (already column/row extracted by the caller)
 * output     : 32 transform coefficients; also used as ping-pong scratch between stages
 * cos_bit    : precision of the cosine table selected via cospi_arr()
 * stage_range: unused in this C path (range checking is handled by the caller)
 *
 * The transform is computed as 11 stages that alternate between `output` and the
 * local `step` buffer: odd stages are sign/add butterflies or permutations, even
 * stages are cospi rotations performed with half_btf().
 */
void av1_fadst32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
                     const int8_t *stage_range) {
    (void)stage_range;
    const int32_t *cospi;

    int32_t *bf0, *bf1;
    int32_t step[32]; // ping-pong scratch buffer paired with `output`

    // stage 0; (no-op)

    // stage 1: permute the inputs — even slots take the reversed tail, odd slots the head
    bf1 = output;
    bf1[0] = input[31];
    bf1[1] = input[0];
    bf1[2] = input[29];
    bf1[3] = input[2];
    bf1[4] = input[27];
    bf1[5] = input[4];
    bf1[6] = input[25];
    bf1[7] = input[6];
    bf1[8] = input[23];
    bf1[9] = input[8];
    bf1[10] = input[21];
    bf1[11] = input[10];
    bf1[12] = input[19];
    bf1[13] = input[12];
    bf1[14] = input[17];
    bf1[15] = input[14];
    bf1[16] = input[15];
    bf1[17] = input[16];
    bf1[18] = input[13];
    bf1[19] = input[18];
    bf1[20] = input[11];
    bf1[21] = input[20];
    bf1[22] = input[9];
    bf1[23] = input[22];
    bf1[24] = input[7];
    bf1[25] = input[24];
    bf1[26] = input[5];
    bf1[27] = input[26];
    bf1[28] = input[3];
    bf1[29] = input[28];
    bf1[30] = input[1];
    bf1[31] = input[30];

    // stage 2: rotate each pair (2k, 2k+1) by odd angles cospi[4k+1]/cospi[63-4k]
    cospi = cospi_arr(cos_bit);
    bf0 = output;
    bf1 = step;
    bf1[0] = half_btf(cospi[1], bf0[0], cospi[63], bf0[1], cos_bit);
    bf1[1] = half_btf(-cospi[1], bf0[1], cospi[63], bf0[0], cos_bit);
    bf1[2] = half_btf(cospi[5], bf0[2], cospi[59], bf0[3], cos_bit);
    bf1[3] = half_btf(-cospi[5], bf0[3], cospi[59], bf0[2], cos_bit);
    bf1[4] = half_btf(cospi[9], bf0[4], cospi[55], bf0[5], cos_bit);
    bf1[5] = half_btf(-cospi[9], bf0[5], cospi[55], bf0[4], cos_bit);
    bf1[6] = half_btf(cospi[13], bf0[6], cospi[51], bf0[7], cos_bit);
    bf1[7] = half_btf(-cospi[13], bf0[7], cospi[51], bf0[6], cos_bit);
    bf1[8] = half_btf(cospi[17], bf0[8], cospi[47], bf0[9], cos_bit);
    bf1[9] = half_btf(-cospi[17], bf0[9], cospi[47], bf0[8], cos_bit);
    bf1[10] = half_btf(cospi[21], bf0[10], cospi[43], bf0[11], cos_bit);
    bf1[11] = half_btf(-cospi[21], bf0[11], cospi[43], bf0[10], cos_bit);
    bf1[12] = half_btf(cospi[25], bf0[12], cospi[39], bf0[13], cos_bit);
    bf1[13] = half_btf(-cospi[25], bf0[13], cospi[39], bf0[12], cos_bit);
    bf1[14] = half_btf(cospi[29], bf0[14], cospi[35], bf0[15], cos_bit);
    bf1[15] = half_btf(-cospi[29], bf0[15], cospi[35], bf0[14], cos_bit);
    bf1[16] = half_btf(cospi[33], bf0[16], cospi[31], bf0[17], cos_bit);
    bf1[17] = half_btf(-cospi[33], bf0[17], cospi[31], bf0[16], cos_bit);
    bf1[18] = half_btf(cospi[37], bf0[18], cospi[27], bf0[19], cos_bit);
    bf1[19] = half_btf(-cospi[37], bf0[19], cospi[27], bf0[18], cos_bit);
    bf1[20] = half_btf(cospi[41], bf0[20], cospi[23], bf0[21], cos_bit);
    bf1[21] = half_btf(-cospi[41], bf0[21], cospi[23], bf0[20], cos_bit);
    bf1[22] = half_btf(cospi[45], bf0[22], cospi[19], bf0[23], cos_bit);
    bf1[23] = half_btf(-cospi[45], bf0[23], cospi[19], bf0[22], cos_bit);
    bf1[24] = half_btf(cospi[49], bf0[24], cospi[15], bf0[25], cos_bit);
    bf1[25] = half_btf(-cospi[49], bf0[25], cospi[15], bf0[24], cos_bit);
    bf1[26] = half_btf(cospi[53], bf0[26], cospi[11], bf0[27], cos_bit);
    bf1[27] = half_btf(-cospi[53], bf0[27], cospi[11], bf0[26], cos_bit);
    bf1[28] = half_btf(cospi[57], bf0[28], cospi[7], bf0[29], cos_bit);
    bf1[29] = half_btf(-cospi[57], bf0[29], cospi[7], bf0[28], cos_bit);
    bf1[30] = half_btf(cospi[61], bf0[30], cospi[3], bf0[31], cos_bit);
    bf1[31] = half_btf(-cospi[61], bf0[31], cospi[3], bf0[30], cos_bit);

    // stage 3: 16-wide butterfly — sums in [0..15], differences in [16..31]
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0] + bf0[16];
    bf1[1] = bf0[1] + bf0[17];
    bf1[2] = bf0[2] + bf0[18];
    bf1[3] = bf0[3] + bf0[19];
    bf1[4] = bf0[4] + bf0[20];
    bf1[5] = bf0[5] + bf0[21];
    bf1[6] = bf0[6] + bf0[22];
    bf1[7] = bf0[7] + bf0[23];
    bf1[8] = bf0[8] + bf0[24];
    bf1[9] = bf0[9] + bf0[25];
    bf1[10] = bf0[10] + bf0[26];
    bf1[11] = bf0[11] + bf0[27];
    bf1[12] = bf0[12] + bf0[28];
    bf1[13] = bf0[13] + bf0[29];
    bf1[14] = bf0[14] + bf0[30];
    bf1[15] = bf0[15] + bf0[31];
    bf1[16] = -bf0[16] + bf0[0];
    bf1[17] = -bf0[17] + bf0[1];
    bf1[18] = -bf0[18] + bf0[2];
    bf1[19] = -bf0[19] + bf0[3];
    bf1[20] = -bf0[20] + bf0[4];
    bf1[21] = -bf0[21] + bf0[5];
    bf1[22] = -bf0[22] + bf0[6];
    bf1[23] = -bf0[23] + bf0[7];
    bf1[24] = -bf0[24] + bf0[8];
    bf1[25] = -bf0[25] + bf0[9];
    bf1[26] = -bf0[26] + bf0[10];
    bf1[27] = -bf0[27] + bf0[11];
    bf1[28] = -bf0[28] + bf0[12];
    bf1[29] = -bf0[29] + bf0[13];
    bf1[30] = -bf0[30] + bf0[14];
    bf1[31] = -bf0[31] + bf0[15];

    // stage 4: pass [0..15] through; rotate the difference half [16..31]
    cospi = cospi_arr(cos_bit);
    bf0 = output;
    bf1 = step;
    bf1[0] = bf0[0];
    bf1[1] = bf0[1];
    bf1[2] = bf0[2];
    bf1[3] = bf0[3];
    bf1[4] = bf0[4];
    bf1[5] = bf0[5];
    bf1[6] = bf0[6];
    bf1[7] = bf0[7];
    bf1[8] = bf0[8];
    bf1[9] = bf0[9];
    bf1[10] = bf0[10];
    bf1[11] = bf0[11];
    bf1[12] = bf0[12];
    bf1[13] = bf0[13];
    bf1[14] = bf0[14];
    bf1[15] = bf0[15];
    bf1[16] = half_btf(cospi[4], bf0[16], cospi[60], bf0[17], cos_bit);
    bf1[17] = half_btf(-cospi[4], bf0[17], cospi[60], bf0[16], cos_bit);
    bf1[18] = half_btf(cospi[20], bf0[18], cospi[44], bf0[19], cos_bit);
    bf1[19] = half_btf(-cospi[20], bf0[19], cospi[44], bf0[18], cos_bit);
    bf1[20] = half_btf(cospi[36], bf0[20], cospi[28], bf0[21], cos_bit);
    bf1[21] = half_btf(-cospi[36], bf0[21], cospi[28], bf0[20], cos_bit);
    bf1[22] = half_btf(cospi[52], bf0[22], cospi[12], bf0[23], cos_bit);
    bf1[23] = half_btf(-cospi[52], bf0[23], cospi[12], bf0[22], cos_bit);
    bf1[24] = half_btf(-cospi[60], bf0[24], cospi[4], bf0[25], cos_bit);
    bf1[25] = half_btf(cospi[60], bf0[25], cospi[4], bf0[24], cos_bit);
    bf1[26] = half_btf(-cospi[44], bf0[26], cospi[20], bf0[27], cos_bit);
    bf1[27] = half_btf(cospi[44], bf0[27], cospi[20], bf0[26], cos_bit);
    bf1[28] = half_btf(-cospi[28], bf0[28], cospi[36], bf0[29], cos_bit);
    bf1[29] = half_btf(cospi[28], bf0[29], cospi[36], bf0[28], cos_bit);
    bf1[30] = half_btf(-cospi[12], bf0[30], cospi[52], bf0[31], cos_bit);
    bf1[31] = half_btf(cospi[12], bf0[31], cospi[52], bf0[30], cos_bit);

    // stage 5: 8-wide butterflies within each 16-element half
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0] + bf0[8];
    bf1[1] = bf0[1] + bf0[9];
    bf1[2] = bf0[2] + bf0[10];
    bf1[3] = bf0[3] + bf0[11];
    bf1[4] = bf0[4] + bf0[12];
    bf1[5] = bf0[5] + bf0[13];
    bf1[6] = bf0[6] + bf0[14];
    bf1[7] = bf0[7] + bf0[15];
    bf1[8] = -bf0[8] + bf0[0];
    bf1[9] = -bf0[9] + bf0[1];
    bf1[10] = -bf0[10] + bf0[2];
    bf1[11] = -bf0[11] + bf0[3];
    bf1[12] = -bf0[12] + bf0[4];
    bf1[13] = -bf0[13] + bf0[5];
    bf1[14] = -bf0[14] + bf0[6];
    bf1[15] = -bf0[15] + bf0[7];
    bf1[16] = bf0[16] + bf0[24];
    bf1[17] = bf0[17] + bf0[25];
    bf1[18] = bf0[18] + bf0[26];
    bf1[19] = bf0[19] + bf0[27];
    bf1[20] = bf0[20] + bf0[28];
    bf1[21] = bf0[21] + bf0[29];
    bf1[22] = bf0[22] + bf0[30];
    bf1[23] = bf0[23] + bf0[31];
    bf1[24] = -bf0[24] + bf0[16];
    bf1[25] = -bf0[25] + bf0[17];
    bf1[26] = -bf0[26] + bf0[18];
    bf1[27] = -bf0[27] + bf0[19];
    bf1[28] = -bf0[28] + bf0[20];
    bf1[29] = -bf0[29] + bf0[21];
    bf1[30] = -bf0[30] + bf0[22];
    bf1[31] = -bf0[31] + bf0[23];

    // stage 6: rotate [8..15] and [24..31] by cospi[8]/cospi[56] and cospi[40]/cospi[24]
    cospi = cospi_arr(cos_bit);
    bf0 = output;
    bf1 = step;
    bf1[0] = bf0[0];
    bf1[1] = bf0[1];
    bf1[2] = bf0[2];
    bf1[3] = bf0[3];
    bf1[4] = bf0[4];
    bf1[5] = bf0[5];
    bf1[6] = bf0[6];
    bf1[7] = bf0[7];
    bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit);
    bf1[9] = half_btf(-cospi[8], bf0[9], cospi[56], bf0[8], cos_bit);
    bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit);
    bf1[11] = half_btf(-cospi[40], bf0[11], cospi[24], bf0[10], cos_bit);
    bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit);
    bf1[13] = half_btf(cospi[56], bf0[13], cospi[8], bf0[12], cos_bit);
    bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit);
    bf1[15] = half_btf(cospi[24], bf0[15], cospi[40], bf0[14], cos_bit);
    bf1[16] = bf0[16];
    bf1[17] = bf0[17];
    bf1[18] = bf0[18];
    bf1[19] = bf0[19];
    bf1[20] = bf0[20];
    bf1[21] = bf0[21];
    bf1[22] = bf0[22];
    bf1[23] = bf0[23];
    bf1[24] = half_btf(cospi[8], bf0[24], cospi[56], bf0[25], cos_bit);
    bf1[25] = half_btf(-cospi[8], bf0[25], cospi[56], bf0[24], cos_bit);
    bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[27], cos_bit);
    bf1[27] = half_btf(-cospi[40], bf0[27], cospi[24], bf0[26], cos_bit);
    bf1[28] = half_btf(-cospi[56], bf0[28], cospi[8], bf0[29], cos_bit);
    bf1[29] = half_btf(cospi[56], bf0[29], cospi[8], bf0[28], cos_bit);
    bf1[30] = half_btf(-cospi[24], bf0[30], cospi[40], bf0[31], cos_bit);
    bf1[31] = half_btf(cospi[24], bf0[31], cospi[40], bf0[30], cos_bit);

    // stage 7: 4-wide butterflies within each group of 8
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0] + bf0[4];
    bf1[1] = bf0[1] + bf0[5];
    bf1[2] = bf0[2] + bf0[6];
    bf1[3] = bf0[3] + bf0[7];
    bf1[4] = -bf0[4] + bf0[0];
    bf1[5] = -bf0[5] + bf0[1];
    bf1[6] = -bf0[6] + bf0[2];
    bf1[7] = -bf0[7] + bf0[3];
    bf1[8] = bf0[8] + bf0[12];
    bf1[9] = bf0[9] + bf0[13];
    bf1[10] = bf0[10] + bf0[14];
    bf1[11] = bf0[11] + bf0[15];
    bf1[12] = -bf0[12] + bf0[8];
    bf1[13] = -bf0[13] + bf0[9];
    bf1[14] = -bf0[14] + bf0[10];
    bf1[15] = -bf0[15] + bf0[11];
    bf1[16] = bf0[16] + bf0[20];
    bf1[17] = bf0[17] + bf0[21];
    bf1[18] = bf0[18] + bf0[22];
    bf1[19] = bf0[19] + bf0[23];
    bf1[20] = -bf0[20] + bf0[16];
    bf1[21] = -bf0[21] + bf0[17];
    bf1[22] = -bf0[22] + bf0[18];
    bf1[23] = -bf0[23] + bf0[19];
    bf1[24] = bf0[24] + bf0[28];
    bf1[25] = bf0[25] + bf0[29];
    bf1[26] = bf0[26] + bf0[30];
    bf1[27] = bf0[27] + bf0[31];
    bf1[28] = -bf0[28] + bf0[24];
    bf1[29] = -bf0[29] + bf0[25];
    bf1[30] = -bf0[30] + bf0[26];
    bf1[31] = -bf0[31] + bf0[27];

    // stage 8: rotate every second pair by cospi[16]/cospi[48]
    cospi = cospi_arr(cos_bit);
    bf0 = output;
    bf1 = step;
    bf1[0] = bf0[0];
    bf1[1] = bf0[1];
    bf1[2] = bf0[2];
    bf1[3] = bf0[3];
    bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
    bf1[5] = half_btf(-cospi[16], bf0[5], cospi[48], bf0[4], cos_bit);
    bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
    bf1[7] = half_btf(cospi[48], bf0[7], cospi[16], bf0[6], cos_bit);
    bf1[8] = bf0[8];
    bf1[9] = bf0[9];
    bf1[10] = bf0[10];
    bf1[11] = bf0[11];
    bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit);
    bf1[13] = half_btf(-cospi[16], bf0[13], cospi[48], bf0[12], cos_bit);
    bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit);
    bf1[15] = half_btf(cospi[48], bf0[15], cospi[16], bf0[14], cos_bit);
    bf1[16] = bf0[16];
    bf1[17] = bf0[17];
    bf1[18] = bf0[18];
    bf1[19] = bf0[19];
    bf1[20] = half_btf(cospi[16], bf0[20], cospi[48], bf0[21], cos_bit);
    bf1[21] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[20], cos_bit);
    bf1[22] = half_btf(-cospi[48], bf0[22], cospi[16], bf0[23], cos_bit);
    bf1[23] = half_btf(cospi[48], bf0[23], cospi[16], bf0[22], cos_bit);
    bf1[24] = bf0[24];
    bf1[25] = bf0[25];
    bf1[26] = bf0[26];
    bf1[27] = bf0[27];
    bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[29], cos_bit);
    bf1[29] = half_btf(-cospi[16], bf0[29], cospi[48], bf0[28], cos_bit);
    bf1[30] = half_btf(-cospi[48], bf0[30], cospi[16], bf0[31], cos_bit);
    bf1[31] = half_btf(cospi[48], bf0[31], cospi[16], bf0[30], cos_bit);

    // stage 9: 2-wide butterflies within each group of 4
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0] + bf0[2];
    bf1[1] = bf0[1] + bf0[3];
    bf1[2] = -bf0[2] + bf0[0];
    bf1[3] = -bf0[3] + bf0[1];
    bf1[4] = bf0[4] + bf0[6];
    bf1[5] = bf0[5] + bf0[7];
    bf1[6] = -bf0[6] + bf0[4];
    bf1[7] = -bf0[7] + bf0[5];
    bf1[8] = bf0[8] + bf0[10];
    bf1[9] = bf0[9] + bf0[11];
    bf1[10] = -bf0[10] + bf0[8];
    bf1[11] = -bf0[11] + bf0[9];
    bf1[12] = bf0[12] + bf0[14];
    bf1[13] = bf0[13] + bf0[15];
    bf1[14] = -bf0[14] + bf0[12];
    bf1[15] = -bf0[15] + bf0[13];
    bf1[16] = bf0[16] + bf0[18];
    bf1[17] = bf0[17] + bf0[19];
    bf1[18] = -bf0[18] + bf0[16];
    bf1[19] = -bf0[19] + bf0[17];
    bf1[20] = bf0[20] + bf0[22];
    bf1[21] = bf0[21] + bf0[23];
    bf1[22] = -bf0[22] + bf0[20];
    bf1[23] = -bf0[23] + bf0[21];
    bf1[24] = bf0[24] + bf0[26];
    bf1[25] = bf0[25] + bf0[27];
    bf1[26] = -bf0[26] + bf0[24];
    bf1[27] = -bf0[27] + bf0[25];
    bf1[28] = bf0[28] + bf0[30];
    bf1[29] = bf0[29] + bf0[31];
    bf1[30] = -bf0[30] + bf0[28];
    bf1[31] = -bf0[31] + bf0[29];

    // stage 10: final 45-degree rotations (cospi[32] on every odd pair)
    cospi = cospi_arr(cos_bit);
    bf0 = output;
    bf1 = step;
    bf1[0] = bf0[0];
    bf1[1] = bf0[1];
    bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
    bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit);
    bf1[4] = bf0[4];
    bf1[5] = bf0[5];
    bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
    bf1[7] = half_btf(-cospi[32], bf0[7], cospi[32], bf0[6], cos_bit);
    bf1[8] = bf0[8];
    bf1[9] = bf0[9];
    bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit);
    bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[10], cos_bit);
    bf1[12] = bf0[12];
    bf1[13] = bf0[13];
    bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit);
    bf1[15] = half_btf(-cospi[32], bf0[15], cospi[32], bf0[14], cos_bit);
    bf1[16] = bf0[16];
    bf1[17] = bf0[17];
    bf1[18] = half_btf(cospi[32], bf0[18], cospi[32], bf0[19], cos_bit);
    bf1[19] = half_btf(-cospi[32], bf0[19], cospi[32], bf0[18], cos_bit);
    bf1[20] = bf0[20];
    bf1[21] = bf0[21];
    bf1[22] = half_btf(cospi[32], bf0[22], cospi[32], bf0[23], cos_bit);
    bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[22], cos_bit);
    bf1[24] = bf0[24];
    bf1[25] = bf0[25];
    bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[27], cos_bit);
    bf1[27] = half_btf(-cospi[32], bf0[27], cospi[32], bf0[26], cos_bit);
    bf1[28] = bf0[28];
    bf1[29] = bf0[29];
    bf1[30] = half_btf(cospi[32], bf0[30], cospi[32], bf0[31], cos_bit);
    bf1[31] = half_btf(-cospi[32], bf0[31], cospi[32], bf0[30], cos_bit);

    // stage 11: output permutation with alternating signs (ADST ordering)
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0];
    bf1[1] = -bf0[16];
    bf1[2] = bf0[24];
    bf1[3] = -bf0[8];
    bf1[4] = bf0[12];
    bf1[5] = -bf0[28];
    bf1[6] = bf0[20];
    bf1[7] = -bf0[4];
    bf1[8] = bf0[6];
    bf1[9] = -bf0[22];
    bf1[10] = bf0[30];
    bf1[11] = -bf0[14];
    bf1[12] = bf0[10];
    bf1[13] = -bf0[26];
    bf1[14] = bf0[18];
    bf1[15] = -bf0[2];
    bf1[16] = bf0[3];
    bf1[17] = -bf0[19];
    bf1[18] = bf0[27];
    bf1[19] = -bf0[11];
    bf1[20] = bf0[15];
    bf1[21] = -bf0[31];
    bf1[22] = bf0[23];
    bf1[23] = -bf0[7];
    bf1[24] = bf0[5];
    bf1[25] = -bf0[21];
    bf1[26] = bf0[29];
    bf1[27] = -bf0[13];
    bf1[28] = bf0[9];
    bf1[29] = -bf0[25];
    bf1[30] = bf0[17];
    bf1[31] = -bf0[1];
}
2238
svt_av1_fidentity4_c(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)2239 void svt_av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
2240 const int8_t *stage_range) {
2241 (void)stage_range;
2242 (void)cos_bit;
2243 for (int32_t i = 0; i < 4; ++i)
2244 output[i] = round_shift((int64_t)input[i] * new_sqrt2, new_sqrt2_bits);
2245 assert(stage_range[0] + new_sqrt2_bits <= 32);
2246 }
2247
/*
 * 1-D forward identity transform, 8 points: each sample is doubled
 * (the exact scale factor for this size; no rounding needed).
 */
void svt_av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                          const int8_t *stage_range) {
    (void)cos_bit;
    (void)stage_range;
    for (int32_t k = 8; k-- > 0;)
        output[k] = 2 * input[k];
}
2254
svt_av1_fidentity16_c(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)2255 void svt_av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
2256 const int8_t *stage_range) {
2257 (void)stage_range;
2258 (void)cos_bit;
2259 for (int32_t i = 0; i < 16; ++i)
2260 output[i] = round_shift((int64_t)input[i] * 2 * new_sqrt2, new_sqrt2_bits);
2261 assert(stage_range[0] + new_sqrt2_bits <= 32);
2262 }
2263
/*
 * 1-D forward identity transform, 32 points: each sample is quadrupled
 * (the exact scale factor for this size; no rounding needed).
 */
void svt_av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                           const int8_t *stage_range) {
    (void)cos_bit;
    (void)stage_range;
    for (int32_t k = 0; k < 32; ++k)
        output[k] = 4 * input[k];
}
2270
av1_fidentity64_c(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)2271 void av1_fidentity64_c(const int32_t *input, int32_t *output, int8_t cos_bit,
2272 const int8_t *stage_range) {
2273 (void)stage_range;
2274 (void)cos_bit;
2275 for (int32_t i = 0; i < 64; ++i)
2276 output[i] = round_shift((int64_t)input[i] * 4 * new_sqrt2, new_sqrt2_bits);
2277 assert(stage_range[0] + new_sqrt2_bits <= 32);
2278 }
2279
fwd_txfm_type_to_func(TxfmType txfmtype)2280 static INLINE TxfmFunc fwd_txfm_type_to_func(TxfmType txfmtype) {
2281 switch (txfmtype) {
2282 case TXFM_TYPE_DCT4: return svt_av1_fdct4_new;
2283 case TXFM_TYPE_DCT8: return svt_av1_fdct8_new;
2284 case TXFM_TYPE_DCT16: return svt_av1_fdct16_new;
2285 case TXFM_TYPE_DCT32: return svt_av1_fdct32_new;
2286 case TXFM_TYPE_DCT64: return svt_av1_fdct64_new;
2287 case TXFM_TYPE_ADST4: return svt_av1_fadst4_new;
2288 case TXFM_TYPE_ADST8: return svt_av1_fadst8_new;
2289 case TXFM_TYPE_ADST16: return svt_av1_fadst16_new;
2290 case TXFM_TYPE_ADST32: return av1_fadst32_new;
2291 case TXFM_TYPE_IDENTITY4: return svt_av1_fidentity4_c;
2292 case TXFM_TYPE_IDENTITY8: return svt_av1_fidentity8_c;
2293 case TXFM_TYPE_IDENTITY16: return svt_av1_fidentity16_c;
2294 case TXFM_TYPE_IDENTITY32: return svt_av1_fidentity32_c;
2295 case TXFM_TYPE_IDENTITY64: return av1_fidentity64_c;
2296 default: assert(0); return NULL;
2297 }
2298 }
2299
2300 //fwd_txfm2d_c
//fwd_txfm2d_c
/*
 * Generic 2-D forward transform core (C reference): applies the 1-D column
 * transform, then the 1-D row transform, with the per-stage round/shift
 * schedule from cfg->shift and optional up/down and left/right flipping.
 *
 * input       : residual samples, row stride = input_stride
 * output      : final coefficients; ALSO used as column-pass scratch
 *               (temp_in/temp_out alias into it), so it must hold at least
 *               2 * txfm_size_row values before the row pass overwrites it
 * cfg         : transform configuration (size, 1-D types, shifts, flips)
 * buf         : intermediate buffer holding the column-pass results
 * bit_depth   : source bit depth, used to derive the stage ranges
 */
static INLINE void av1_tranform_two_d_core_c(int16_t *input, uint32_t input_stride, int32_t *output,
                                             const Txfm2dFlipCfg *cfg, int32_t *buf,
                                             uint8_t bit_depth) {
    int32_t c, r;
    // Note when assigning txfm_size_col, we use the txfm_size from the
    // row configuration and vice versa. This is intentionally done to
    // accurately perform rectangular transforms. When the transform is
    // rectangular, the number of columns will be the same as the
    // txfm_size stored in the row cfg struct. It will make no difference
    // for square transforms.
    const int32_t txfm_size_col = tx_size_wide[cfg->tx_size];
    const int32_t txfm_size_row = tx_size_high[cfg->tx_size];
    // Take the shift from the larger dimension in the rectangular case.
    const int8_t *shift = cfg->shift;
    const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
    int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
    int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
    assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM);
    assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM);
    svt_av1_gen_fwd_stage_range(stage_range_col, stage_range_row, cfg, bit_depth);

    const int8_t cos_bit_col = cfg->cos_bit_col;
    const int8_t cos_bit_row = cfg->cos_bit_row;
    const TxfmFunc txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
    const TxfmFunc txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row);
    ASSERT(txfm_func_col != NULL);
    ASSERT(txfm_func_row != NULL);
    // use output buffer as temp buffer
    int32_t *temp_in = output;
    int32_t *temp_out = output + txfm_size_row;

    // Columns: gather one column (optionally flipped vertically), pre-shift,
    // run the 1-D column transform, post-shift, then scatter into buf
    // (optionally flipped horizontally).
    for (c = 0; c < txfm_size_col; ++c) {
        if (cfg->ud_flip == 0)
            for (r = 0; r < txfm_size_row; ++r) temp_in[r] = input[r * input_stride + c];
        else {
            for (r = 0; r < txfm_size_row; ++r)
                // flip upside down
                temp_in[r] = input[(txfm_size_row - r - 1) * input_stride + c];
        }
        svt_av1_round_shift_array_c(
            temp_in, txfm_size_row, -shift[0]); // NM svt_av1_round_shift_array_c
        txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
        svt_av1_round_shift_array_c(
            temp_out, txfm_size_row, -shift[1]); // NM svt_av1_round_shift_array_c
        if (cfg->lr_flip == 0) {
            for (r = 0; r < txfm_size_row; ++r) buf[r * txfm_size_col + c] = temp_out[r];
        } else {
            for (r = 0; r < txfm_size_row; ++r)
                // flip from left to right
                buf[r * txfm_size_col + (txfm_size_col - c - 1)] = temp_out[r];
        }
    }

    // Rows: 1-D row transform per row of buf, final shift, then the sqrt(2)
    // correction for 2:1 rectangular sizes.
    for (r = 0; r < txfm_size_row; ++r) {
        txfm_func_row(
            buf + r * txfm_size_col, output + r * txfm_size_col, cos_bit_row, stage_range_row);
        svt_av1_round_shift_array_c(output + r * txfm_size_col, txfm_size_col, -shift[2]);

        if (abs(rect_type) == 1) {
            // Multiply everything by Sqrt2 if the transform is rectangular and the
            // size difference is a factor of 2.
            for (c = 0; c < txfm_size_col; ++c) {
                output[r * txfm_size_col + c] = round_shift(
                    (int64_t)output[r * txfm_size_col + c] * new_sqrt2, new_sqrt2_bits);
            }
        }
    }
}
av1_fdct32_pf_new(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)2371 void av1_fdct32_pf_new(const int32_t *input, int32_t *output, int8_t cos_bit,
2372 const int8_t *stage_range) {
2373 (void)stage_range;
2374 const int32_t *cospi;
2375
2376 int32_t *bf0, *bf1;
2377 int32_t step[32];
2378
2379 // stage 0;
2380
2381 // stage 1;
2382 bf1 = output;
2383 bf1[0] = input[0] + input[31];
2384 bf1[1] = input[1] + input[30];
2385 bf1[2] = input[2] + input[29];
2386 bf1[3] = input[3] + input[28];
2387 bf1[4] = input[4] + input[27];
2388 bf1[5] = input[5] + input[26];
2389 bf1[6] = input[6] + input[25];
2390 bf1[7] = input[7] + input[24];
2391 bf1[8] = input[8] + input[23];
2392 bf1[9] = input[9] + input[22];
2393 bf1[10] = input[10] + input[21];
2394 bf1[11] = input[11] + input[20];
2395 bf1[12] = input[12] + input[19];
2396 bf1[13] = input[13] + input[18];
2397 bf1[14] = input[14] + input[17];
2398 bf1[15] = input[15] + input[16];
2399 bf1[16] = -input[16] + input[15];
2400 bf1[17] = -input[17] + input[14];
2401 bf1[18] = -input[18] + input[13];
2402 bf1[19] = -input[19] + input[12];
2403 bf1[20] = -input[20] + input[11];
2404 bf1[21] = -input[21] + input[10];
2405 bf1[22] = -input[22] + input[9];
2406 bf1[23] = -input[23] + input[8];
2407 bf1[24] = -input[24] + input[7];
2408 bf1[25] = -input[25] + input[6];
2409 bf1[26] = -input[26] + input[5];
2410 bf1[27] = -input[27] + input[4];
2411 bf1[28] = -input[28] + input[3];
2412 bf1[29] = -input[29] + input[2];
2413 bf1[30] = -input[30] + input[1];
2414 bf1[31] = -input[31] + input[0];
2415
2416 // stage 2
2417 cospi = cospi_arr(cos_bit);
2418 bf0 = output;
2419 bf1 = step;
2420 bf1[0] = bf0[0] + bf0[15];
2421 bf1[1] = bf0[1] + bf0[14];
2422 bf1[2] = bf0[2] + bf0[13];
2423 bf1[3] = bf0[3] + bf0[12];
2424 bf1[4] = bf0[4] + bf0[11];
2425 bf1[5] = bf0[5] + bf0[10];
2426 bf1[6] = bf0[6] + bf0[9];
2427 bf1[7] = bf0[7] + bf0[8];
2428 bf1[8] = -bf0[8] + bf0[7];
2429 bf1[9] = -bf0[9] + bf0[6];
2430 bf1[10] = -bf0[10] + bf0[5];
2431 bf1[11] = -bf0[11] + bf0[4];
2432 bf1[12] = -bf0[12] + bf0[3];
2433 bf1[13] = -bf0[13] + bf0[2];
2434 bf1[14] = -bf0[14] + bf0[1];
2435 bf1[15] = -bf0[15] + bf0[0];
2436 bf1[16] = bf0[16];
2437 bf1[17] = bf0[17];
2438 bf1[18] = bf0[18];
2439 bf1[19] = bf0[19];
2440 bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
2441 bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
2442 bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
2443 bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
2444 bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
2445 bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
2446 bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
2447 bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
2448 bf1[28] = bf0[28];
2449 bf1[29] = bf0[29];
2450 bf1[30] = bf0[30];
2451 bf1[31] = bf0[31];
2452
2453 // stage 3
2454 cospi = cospi_arr(cos_bit);
2455 bf0 = step;
2456 bf1 = output;
2457 bf1[0] = bf0[0] + bf0[7];
2458 bf1[1] = bf0[1] + bf0[6];
2459 bf1[2] = bf0[2] + bf0[5];
2460 bf1[3] = bf0[3] + bf0[4];
2461 bf1[4] = -bf0[4] + bf0[3];
2462 bf1[5] = -bf0[5] + bf0[2];
2463 bf1[6] = -bf0[6] + bf0[1];
2464 bf1[7] = -bf0[7] + bf0[0];
2465 bf1[8] = bf0[8];
2466 bf1[9] = bf0[9];
2467 bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
2468 bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
2469 bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
2470 bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
2471 bf1[14] = bf0[14];
2472 bf1[15] = bf0[15];
2473 bf1[16] = bf0[16] + bf0[23];
2474 bf1[17] = bf0[17] + bf0[22];
2475 bf1[18] = bf0[18] + bf0[21];
2476 bf1[19] = bf0[19] + bf0[20];
2477 bf1[20] = -bf0[20] + bf0[19];
2478 bf1[21] = -bf0[21] + bf0[18];
2479 bf1[22] = -bf0[22] + bf0[17];
2480 bf1[23] = -bf0[23] + bf0[16];
2481 bf1[24] = -bf0[24] + bf0[31];
2482 bf1[25] = -bf0[25] + bf0[30];
2483 bf1[26] = -bf0[26] + bf0[29];
2484 bf1[27] = -bf0[27] + bf0[28];
2485 bf1[28] = bf0[28] + bf0[27];
2486 bf1[29] = bf0[29] + bf0[26];
2487 bf1[30] = bf0[30] + bf0[25];
2488 bf1[31] = bf0[31] + bf0[24];
2489
2490 // stage 4
2491 cospi = cospi_arr(cos_bit);
2492 bf0 = output;
2493 bf1 = step;
2494 bf1[0] = bf0[0] + bf0[3];
2495 bf1[1] = bf0[1] + bf0[2];
2496 bf1[2] = -bf0[2] + bf0[1];
2497 bf1[3] = -bf0[3] + bf0[0];
2498 bf1[4] = bf0[4];
2499 bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
2500 bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
2501 bf1[7] = bf0[7];
2502 bf1[8] = bf0[8] + bf0[11];
2503 bf1[9] = bf0[9] + bf0[10];
2504 bf1[10] = -bf0[10] + bf0[9];
2505 bf1[11] = -bf0[11] + bf0[8];
2506 bf1[12] = -bf0[12] + bf0[15];
2507 bf1[13] = -bf0[13] + bf0[14];
2508 bf1[14] = bf0[14] + bf0[13];
2509 bf1[15] = bf0[15] + bf0[12];
2510 bf1[16] = bf0[16];
2511 bf1[17] = bf0[17];
2512 bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
2513 bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
2514 bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
2515 bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
2516 bf1[22] = bf0[22];
2517 bf1[23] = bf0[23];
2518 bf1[24] = bf0[24];
2519 bf1[25] = bf0[25];
2520 bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
2521 bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
2522 bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
2523 bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
2524 bf1[30] = bf0[30];
2525 bf1[31] = bf0[31];
2526
2527 // stage 5
2528 cospi = cospi_arr(cos_bit);
2529 bf0 = step;
2530 bf1 = output;
2531 bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
2532 //bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
2533 bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
2534 //bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
2535 bf1[4] = bf0[4] + bf0[5];
2536 bf1[5] = -bf0[5] + bf0[4];
2537 bf1[6] = -bf0[6] + bf0[7];
2538 bf1[7] = bf0[7] + bf0[6];
2539 bf1[8] = bf0[8];
2540 bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
2541 bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
2542 bf1[11] = bf0[11];
2543 bf1[12] = bf0[12];
2544 bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
2545 bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
2546 bf1[15] = bf0[15];
2547 bf1[16] = bf0[16] + bf0[19];
2548 bf1[17] = bf0[17] + bf0[18];
2549 bf1[18] = -bf0[18] + bf0[17];
2550 bf1[19] = -bf0[19] + bf0[16];
2551 bf1[20] = -bf0[20] + bf0[23];
2552 bf1[21] = -bf0[21] + bf0[22];
2553 bf1[22] = bf0[22] + bf0[21];
2554 bf1[23] = bf0[23] + bf0[20];
2555 bf1[24] = bf0[24] + bf0[27];
2556 bf1[25] = bf0[25] + bf0[26];
2557 bf1[26] = -bf0[26] + bf0[25];
2558 bf1[27] = -bf0[27] + bf0[24];
2559 bf1[28] = -bf0[28] + bf0[31];
2560 bf1[29] = -bf0[29] + bf0[30];
2561 bf1[30] = bf0[30] + bf0[29];
2562 bf1[31] = bf0[31] + bf0[28];
2563
2564 // stage 6
2565 cospi = cospi_arr(cos_bit);
2566 bf0 = output;
2567 bf1 = step;
2568 bf1[0] = bf0[0];
2569 //bf1[1] = bf0[1];
2570 bf1[2] = bf0[2];
2571 //bf1[3] = bf0[3];
2572 bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
2573 //bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
2574 bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
2575 //bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
2576 bf1[8] = bf0[8] + bf0[9];
2577 bf1[9] = -bf0[9] + bf0[8];
2578 bf1[10] = -bf0[10] + bf0[11];
2579 bf1[11] = bf0[11] + bf0[10];
2580 bf1[12] = bf0[12] + bf0[13];
2581 bf1[13] = -bf0[13] + bf0[12];
2582 bf1[14] = -bf0[14] + bf0[15];
2583 bf1[15] = bf0[15] + bf0[14];
2584 bf1[16] = bf0[16];
2585 bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
2586 bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
2587 bf1[19] = bf0[19];
2588 bf1[20] = bf0[20];
2589 bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
2590 bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
2591 bf1[23] = bf0[23];
2592 bf1[24] = bf0[24];
2593 bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
2594 bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
2595 bf1[27] = bf0[27];
2596 bf1[28] = bf0[28];
2597 bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
2598 bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
2599 bf1[31] = bf0[31];
2600
2601 // stage 7
2602 cospi = cospi_arr(cos_bit);
2603 bf0 = step;
2604 bf1 = output;
2605 bf1[0] = bf0[0];
2606 //bf1[1] = bf0[1];
2607 bf1[2] = bf0[2];
2608 //bf1[3] = bf0[3];
2609 bf1[4] = bf0[4];
2610 //bf1[5] = bf0[5];
2611 bf1[6] = bf0[6];
2612 //bf1[7] = bf0[7];
2613 bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
2614 //bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
2615 bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
2616 //bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
2617 bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
2618 //bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
2619 bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
2620 //bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
2621 bf1[16] = bf0[16] + bf0[17];
2622 bf1[17] = -bf0[17] + bf0[16];
2623 bf1[18] = -bf0[18] + bf0[19];
2624 bf1[19] = bf0[19] + bf0[18];
2625 bf1[20] = bf0[20] + bf0[21];
2626 bf1[21] = -bf0[21] + bf0[20];
2627 bf1[22] = -bf0[22] + bf0[23];
2628 bf1[23] = bf0[23] + bf0[22];
2629 bf1[24] = bf0[24] + bf0[25];
2630 bf1[25] = -bf0[25] + bf0[24];
2631 bf1[26] = -bf0[26] + bf0[27];
2632 bf1[27] = bf0[27] + bf0[26];
2633 bf1[28] = bf0[28] + bf0[29];
2634 bf1[29] = -bf0[29] + bf0[28];
2635 bf1[30] = -bf0[30] + bf0[31];
2636 bf1[31] = bf0[31] + bf0[30];
2637
2638 // stage 8
2639 cospi = cospi_arr(cos_bit);
2640 bf0 = output;
2641 bf1 = step;
2642 bf1[0] = bf0[0];
2643 //bf1[1] = bf0[1];
2644 bf1[2] = bf0[2];
2645 //bf1[3] = bf0[3];
2646 bf1[4] = bf0[4];
2647 //bf1[5] = bf0[5];
2648 bf1[6] = bf0[6];
2649 //bf1[7] = bf0[7];
2650 bf1[8] = bf0[8];
2651 //bf1[9] = bf0[9];
2652 bf1[10] = bf0[10];
2653 //bf1[11] = bf0[11];
2654 bf1[12] = bf0[12];
2655 //bf1[13] = bf0[13];
2656 bf1[14] = bf0[14];
2657 //bf1[15] = bf0[15];
2658 bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
2659 //bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit);
2660 bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
2661 //bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit);
2662 bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
2663 //bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit);
2664 bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
2665 //bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit);
2666 bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
2667 //bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit);
2668 bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
2669 //bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit);
2670 bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
2671 //bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
2672 bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
2673 //bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);
2674
2675 // stage 11
2676 bf0 = step;
2677 bf1 = output;
2678 bf1[0] = bf0[0];
2679 bf1[1] = -bf0[16];
2680 bf1[2] = bf0[24];
2681 bf1[3] = -bf0[8];
2682 bf1[4] = bf0[12];
2683 bf1[5] = -bf0[28];
2684 bf1[6] = bf0[20];
2685 bf1[7] = -bf0[4];
2686 bf1[8] = bf0[6];
2687 bf1[9] = -bf0[22];
2688 bf1[10] = bf0[30];
2689 bf1[11] = -bf0[14];
2690 bf1[12] = bf0[10];
2691 bf1[13] = -bf0[26];
2692 bf1[14] = bf0[18];
2693 bf1[15] = -bf0[2];
2694 bf1[16] = bf0[3];
2695 bf1[17] = -bf0[19];
2696 bf1[18] = bf0[27];
2697 bf1[19] = -bf0[11];
2698 bf1[20] = bf0[15];
2699 bf1[21] = -bf0[31];
2700 bf1[22] = bf0[23];
2701 bf1[23] = -bf0[7];
2702 bf1[24] = bf0[5];
2703 bf1[25] = -bf0[21];
2704 bf1[26] = bf0[29];
2705 bf1[27] = -bf0[13];
2706 bf1[28] = bf0[9];
2707 bf1[29] = -bf0[25];
2708 bf1[30] = bf0[17];
2709 bf1[31] = -bf0[1];
2710 }
set_fwd_txfm_non_scale_range(Txfm2dFlipCfg * cfg)2711 static INLINE void set_fwd_txfm_non_scale_range(Txfm2dFlipCfg *cfg) {
2712 av1_zero(cfg->stage_range_col);
2713 av1_zero(cfg->stage_range_row);
2714
2715 const int8_t *range_mult2_col = fwd_txfm_range_mult2_list[cfg->txfm_type_col];
2716 if (cfg->txfm_type_col != TXFM_TYPE_INVALID) {
2717 int stage_num_col = cfg->stage_num_col;
2718 for (int i = 0; i < stage_num_col; ++i)
2719 cfg->stage_range_col[i] = (range_mult2_col[i] + 1) >> 1;
2720 }
2721
2722 if (cfg->txfm_type_row != TXFM_TYPE_INVALID) {
2723 int stage_num_row = cfg->stage_num_row;
2724 const int8_t *range_mult2_row = fwd_txfm_range_mult2_list[cfg->txfm_type_row];
2725 for (int i = 0; i < stage_num_row; ++i) {
2726 cfg->stage_range_row[i] = (range_mult2_col[cfg->stage_num_col - 1] +
2727 range_mult2_row[i] + 1) >>
2728 1;
2729 }
2730 }
2731 }
/* Build the forward-transform 2D configuration for (tx_type, tx_size):
 * flip flags, per-pass shifts, cosine bit widths, the 1D kernel chosen for
 * each dimension, stage counts, and (via set_fwd_txfm_non_scale_range) the
 * per-stage coefficient ranges. */
void av1_transform_config(TxType tx_type, TxSize tx_size, Txfm2dFlipCfg *cfg) {
    assert(cfg != NULL);
    cfg->tx_size = tx_size;
    set_flip_cfg(tx_type, cfg);
    // Table indices are log2(size) relative to the 4-point base size.
    const int32_t  txw_idx    = tx_size_wide_log2[tx_size] - tx_size_wide_log2[0];
    const int32_t  txh_idx    = tx_size_high_log2[tx_size] - tx_size_high_log2[0];
    const TxType1D col_kernel = vtx_tab[tx_type];
    const TxType1D row_kernel = htx_tab[tx_type];
    cfg->shift         = fwd_txfm_shift_ls[tx_size];
    cfg->cos_bit_col   = fwd_cos_bit_col[txw_idx][txh_idx];
    cfg->cos_bit_row   = fwd_cos_bit_row[txw_idx][txh_idx];
    cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][col_kernel];
    cfg->txfm_type_row = av1_txfm_type_ls[txw_idx][row_kernel];
    cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col];
    cfg->stage_num_row = av1_txfm_stage_num_list[cfg->txfm_type_row];
    set_fwd_txfm_non_scale_range(cfg);
}
2749
/* Sum of squared coefficients over an area_width x area_height region.
 * coeff_stride is the pitch, in elements, between successive rows.
 * The square of an int32_t always fits in int64_t; the running total is
 * accumulated in a uint64_t as before. */
static uint64_t energy_computation(int32_t *coeff, uint32_t coeff_stride, uint32_t area_width,
                                   uint32_t area_height) {
    uint64_t prediction_distortion = 0;

    for (uint32_t row_index = 0; row_index < area_height; ++row_index) {
        for (uint32_t column_index = 0; column_index < area_width; ++column_index) {
            // Widen once into a local, then square: avoids the double
            // evaluation inherent in the SQR() macro and the redundant
            // cast chain of the previous form.
            const int64_t c = coeff[column_index];
            prediction_distortion += (uint64_t)(c * c);
        }
        coeff += coeff_stride;
    }

    return prediction_distortion;
}
2762
/* After a 64x64 forward transform only the top-left 32x32 coefficients are
 * kept. Returns the summed energy of the three discarded quadrants, zeroes
 * them, and packs the surviving block into the first 32*32 entries. */
uint64_t svt_handle_transform64x64_c(int32_t *output) {
    // Energy of the top-right 32x32 quadrant plus the bottom 64x32 half.
    uint64_t three_quad_energy = energy_computation(output + 32, 64, 32, 32);
    three_quad_energy += energy_computation(output + 32 * 64, 64, 64, 32);

    // Clear the top-right 32x32 quadrant, one row at a time.
    for (int32_t row = 0; row < 32; ++row)
        memset(output + row * 64 + 32, 0, 32 * sizeof(*output));

    // Clear the entire bottom half in a single call.
    memset(output + 32 * 64, 0, 32 * 64 * sizeof(*output));

    // Compact rows 1..31 of the kept quadrant to stride 32 (row 0 is already
    // in place; dst < src for every row, so the forward copy is safe).
    for (int32_t row = 1; row < 32; ++row)
        svt_memcpy_c(output + row * 32, output + row * 64, 32 * sizeof(*output));

    return three_quad_energy;
}
2783
/* 2D forward 64x64 transform, C reference path. */
void svt_av1_transform_two_d_64x64_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                     TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[64 * 64]; // intermediate buffer between the two 1D passes
    av1_transform_config(transform_type, TX_64X64, &cfg); // av1_get_fwd_txfm_cfg
    av1_tranform_two_d_core_c(input, input_stride, output, &cfg, scratch, bit_depth); // fwd_txfm2d_c
}
2794
/* 2D forward 32x32 transform, C reference path. */
void svt_av1_transform_two_d_32x32_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                     TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[32 * 32]; // intermediate buffer between the two 1D passes
    av1_transform_config(transform_type, TX_32X32, &cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
/* 2D forward 16x16 transform, C reference path. */
void svt_av1_transform_two_d_16x16_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                     TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[16 * 16]; // intermediate buffer between the two 1D passes
    av1_transform_config(transform_type, TX_16X16, &cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
2815
/* 2D forward 8x8 transform, C reference path. */
void svt_av1_transform_two_d_8x8_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                   TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[8 * 8]; // intermediate buffer between the two 1D passes
    av1_transform_config(transform_type, TX_8X8, &cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
2826
/* 2D forward 4x4 transform, C reference path. */
void svt_av1_transform_two_d_4x4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                   TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[4 * 4]; // intermediate buffer between the two 1D passes
    av1_transform_config(transform_type, TX_4X4, &cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
2837
2838 /*********************************************************************
2839 * Calculate CBF
2840 *********************************************************************/
/* 2D forward 64x32 transform, C reference path. */
void svt_av1_fwd_txfm2d_64x32_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[64 * 32]; // intermediate buffer between the two 1D passes
    av1_transform_config(transform_type, TX_64X32, &cfg); /*av1_get_fwd_txfm_cfg*/
    av1_tranform_two_d_core_c(input, input_stride, output, &cfg, scratch, bit_depth); /*fwd_txfm2d_c*/
}
2850
/* 64x32 case: only the left 32x32 coefficients are kept. Returns the energy
 * of the discarded right half, zeroes it, and compacts the kept block. */
uint64_t svt_handle_transform64x32_c(int32_t *output) {
    // Energy of the discarded top-right 32x32 region.
    uint64_t three_quad_energy = energy_computation(output + 32, 64, 32, 32);

    // Drop the right half of each row.
    for (int32_t row = 0; row < 32; ++row)
        memset(output + row * 64 + 32, 0, 32 * sizeof(*output));

    // Compact the surviving coefficients to stride 32 (row 0 is in place).
    for (int32_t row = 1; row < 32; ++row)
        svt_memcpy_c(output + row * 32, output + row * 64, 32 * sizeof(*output));

    return three_quad_energy;
}
2864
/* 2D forward 32x64 transform, C reference path. */
void svt_av1_fwd_txfm2d_32x64_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[32 * 64]; // intermediate buffer between the two 1D passes
    av1_transform_config(transform_type, TX_32X64, &cfg); /*av1_get_fwd_txfm_cfg*/
    av1_tranform_two_d_core_c(input, input_stride, output, &cfg, scratch, bit_depth); /*fwd_txfm2d_c*/
}
2876
/* 32x64 case: keep the top 32x32 block; measure then clear the bottom half.
 * No repack is needed since the kept rows are already contiguous. */
uint64_t svt_handle_transform32x64_c(int32_t *output) {
    uint64_t three_quad_energy = energy_computation(output + 32 * 32, 32, 32, 32);
    memset(output + 32 * 32, 0, 32 * 32 * sizeof(*output));
    return three_quad_energy;
}
2886
/* 2D forward 64x16 transform, C reference path. */
void svt_av1_fwd_txfm2d_64x16_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[64 * 16]; // intermediate buffer between the two 1D passes
    av1_transform_config(transform_type, TX_64X16, &cfg); /*av1_get_fwd_txfm_cfg*/
    av1_tranform_two_d_core_c(input, input_stride, output, &cfg, scratch, bit_depth); /*fwd_txfm2d_c*/
}
2896
/* 64x16 case: only the left 32x16 coefficients are kept. Returns the energy
 * of the discarded right half, zeroes it, and compacts the kept block. */
uint64_t svt_handle_transform64x16_c(int32_t *output) {
    // Energy of the discarded top-right 32x16 region.
    uint64_t three_quad_energy = energy_computation(output + 32, 64, 32, 16);

    // Drop the right half of each row.
    for (int32_t row = 0; row < 16; ++row)
        memset(output + row * 64 + 32, 0, 32 * sizeof(*output));

    // Compact the surviving coefficients to stride 32 (row 0 is in place).
    for (int32_t row = 1; row < 16; ++row)
        svt_memcpy_c(output + row * 32, output + row * 64, 32 * sizeof(*output));

    return three_quad_energy;
}
2910
/* 2D forward 16x64 transform, C reference path. */
void svt_av1_fwd_txfm2d_16x64_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[16 * 64]; // intermediate buffer between the two 1D passes
    av1_transform_config(transform_type, TX_16X64, &cfg); /*av1_get_fwd_txfm_cfg*/
    av1_tranform_two_d_core_c(input, input_stride, output, &cfg, scratch, bit_depth); /*fwd_txfm2d_c*/
}
2922
/* 16x64 case: keep the top 16x32 block; measure then clear the bottom half.
 * The kept rows are already contiguous, so no repack is needed. */
uint64_t svt_handle_transform16x64_c(int32_t *output) {
    uint64_t three_quad_energy = energy_computation(output + 16 * 32, 16, 16, 32);
    memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
    return three_quad_energy;
}
2932
/* N2/N4 16x64: the retained quarter already sits at the packed location, so
 * there is nothing to move and no discarded energy to report. */
uint64_t handle_transform16x64_N2_N4_c(int32_t *output) {
    (void)output; // untouched by design
    return 0;
}
2937
/* N2/N4 32x64: the retained quarter already sits at the packed location, so
 * there is nothing to move and no discarded energy to report. */
uint64_t handle_transform32x64_N2_N4_c(int32_t *output) {
    (void)output; // untouched by design
    return 0;
}
2942
/* N2/N4 64x16: pack the live 32-wide coefficients of rows 1..15 down to a
 * contiguous stride-32 layout (row 0 is already in place). */
uint64_t handle_transform64x16_N2_N4_c(int32_t *output) {
    int32_t *dst = output + 32;
    int32_t *src = output + 64;
    for (int32_t row = 1; row < 16; ++row, dst += 32, src += 64)
        svt_memcpy_c(dst, src, 32 * sizeof(*output));
    return 0;
}
2950
/* N2/N4 64x32: pack the live 32-wide coefficients of rows 1..31 down to a
 * contiguous stride-32 layout (row 0 is already in place). */
uint64_t handle_transform64x32_N2_N4_c(int32_t *output) {
    int32_t *dst = output + 32;
    int32_t *src = output + 64;
    for (int32_t row = 1; row < 32; ++row, dst += 32, src += 64)
        svt_memcpy_c(dst, src, 32 * sizeof(*output));
    return 0;
}
2958
/* N2/N4 64x64: pack the live 32-wide coefficients of rows 1..31 down to a
 * contiguous stride-32 layout (row 0 is already in place). */
uint64_t handle_transform64x64_N2_N4_c(int32_t *output) {
    int32_t *dst = output + 32;
    int32_t *src = output + 64;
    for (int32_t row = 1; row < 32; ++row, dst += 32, src += 64)
        svt_memcpy_c(dst, src, 32 * sizeof(*output));
    return 0;
}
/* 2D forward 32x16 transform, C reference path. */
void svt_av1_fwd_txfm2d_32x16_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[32 * 16]; // intermediate buffer between the two 1D passes
    av1_transform_config(transform_type, TX_32X16, &cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
2974
/* 2D forward 16x32 transform, C reference path. */
void svt_av1_fwd_txfm2d_16x32_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[16 * 32]; // intermediate buffer between the two 1D passes
    av1_transform_config(transform_type, TX_16X32, &cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
2983
/* 2D forward 16x8 transform, C reference path. */
void svt_av1_fwd_txfm2d_16x8_c(int16_t *input, int32_t *output, uint32_t input_stride,
                               TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[16 * 8]; // intermediate buffer between the two 1D passes
    av1_transform_config(transform_type, TX_16X8, &cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
2992
/* 2D forward 8x16 transform, C reference path. */
void svt_av1_fwd_txfm2d_8x16_c(int16_t *input, int32_t *output, uint32_t input_stride,
                               TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[8 * 16]; // intermediate buffer between the two 1D passes
    av1_transform_config(transform_type, TX_8X16, &cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
3001
/* 2D forward 32x8 transform, C reference path. */
void svt_av1_fwd_txfm2d_32x8_c(int16_t *input, int32_t *output, uint32_t input_stride,
                               TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[32 * 8]; // intermediate buffer between the two 1D passes
    av1_transform_config(transform_type, TX_32X8, &cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
3010
/* 2D forward 8x32 transform, C reference path. */
void svt_av1_fwd_txfm2d_8x32_c(int16_t *input, int32_t *output, uint32_t input_stride,
                               TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[8 * 32]; // intermediate buffer between the two 1D passes
    av1_transform_config(transform_type, TX_8X32, &cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
3019
/* 2D forward 16x4 transform, C reference path. */
void svt_av1_fwd_txfm2d_16x4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                               TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[16 * 4]; // intermediate buffer between the two 1D passes
    av1_transform_config(transform_type, TX_16X4, &cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
3028
/* 2D forward 4x16 transform, C reference path. */
void svt_av1_fwd_txfm2d_4x16_c(int16_t *input, int32_t *output, uint32_t input_stride,
                               TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[4 * 16]; // intermediate buffer between the two 1D passes
    av1_transform_config(transform_type, TX_4X16, &cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
3037
/* 2D forward 8x4 transform, C reference path. */
void svt_av1_fwd_txfm2d_8x4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                              TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[8 * 4]; // intermediate buffer between the two 1D passes
    av1_transform_config(transform_type, TX_8X4, &cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
3046
/* 2D forward 4x8 transform, C reference path. */
void svt_av1_fwd_txfm2d_4x8_c(int16_t *input, int32_t *output, uint32_t input_stride,
                              TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[4 * 8]; // intermediate buffer between the two 1D passes
    av1_transform_config(transform_type, TX_4X8, &cfg);
    av1_tranform_two_d_core_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
/* Forward-transform estimate keeping only the low-frequency half (N2 path):
 * dispatches to the N2-pruned 2D forward transform for the given size/type.
 * For sizes with a 64-point dimension the handle_*_N2_N4 helper repacks the
 * surviving coefficients and reports the discarded-region energy through
 * *three_quad_energy (zero for the N2/N4 helpers, since those regions were
 * never computed). coeff_stride and component_type are unused here. */
static EbErrorType av1_estimate_transform_N2(int16_t *residual_buffer, uint32_t residual_stride,
                                             int32_t *coeff_buffer, uint32_t coeff_stride,
                                             TxSize transform_size, uint64_t *three_quad_energy,
                                             uint32_t bit_depth, TxType transform_type,
                                             PlaneType component_type)

{
    EbErrorType return_error = EB_ErrorNone;

    (void)coeff_stride;
    (void)component_type;

    switch (transform_size) {
    // 64-point sizes: the optimized kernel is only selected for DCT_DCT;
    // every other type takes the C kernel, then the helper repacks/zeros.
    case TX_64X32:
        if (transform_type == DCT_DCT)
            svt_av1_fwd_txfm2d_64x32_N2(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        else
            svt_av1_fwd_txfm2d_64x32_N2_c(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        *three_quad_energy = handle_transform64x32_N2_N4(coeff_buffer);

        break;

    case TX_32X64:
        if (transform_type == DCT_DCT)
            svt_av1_fwd_txfm2d_32x64_N2(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        else
            svt_av1_fwd_txfm2d_32x64_N2_c(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        *three_quad_energy = handle_transform32x64_N2_N4(coeff_buffer);

        break;

    case TX_64X16:
        if (transform_type == DCT_DCT)
            svt_av1_fwd_txfm2d_64x16_N2(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        else
            svt_av1_fwd_txfm2d_64x16_N2_c(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        *three_quad_energy = handle_transform64x16_N2_N4(coeff_buffer);

        break;

    case TX_16X64:
        if (transform_type == DCT_DCT)
            svt_av1_fwd_txfm2d_16x64_N2(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        else
            svt_av1_fwd_txfm2d_16x64_N2_c(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        *three_quad_energy = handle_transform16x64_N2_N4(coeff_buffer);

        break;

    // 32/16-point rectangles: optimized kernel covers DCT_DCT and IDTX only.
    case TX_32X16:
        if ((transform_type == DCT_DCT) || (transform_type == IDTX))
            svt_av1_fwd_txfm2d_32x16_N2(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        else
            svt_av1_fwd_txfm2d_32x16_N2_c(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        break;

    case TX_16X32:
        if ((transform_type == DCT_DCT) || (transform_type == IDTX))
            svt_av1_fwd_txfm2d_16x32_N2(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        else
            svt_av1_fwd_txfm2d_16x32_N2_c(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        break;

    // Sizes whose N2 kernel handles every transform type directly.
    case TX_16X8:
        svt_av1_fwd_txfm2d_16x8_N2(
            residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        break;

    case TX_8X16:
        svt_av1_fwd_txfm2d_8x16_N2(
            residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        break;

    case TX_32X8:
        if ((transform_type == DCT_DCT) || (transform_type == IDTX))
            svt_av1_fwd_txfm2d_32x8_N2(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        else
            svt_av1_fwd_txfm2d_32x8_N2_c(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        break;

    case TX_8X32:
        if ((transform_type == DCT_DCT) || (transform_type == IDTX))
            svt_av1_fwd_txfm2d_8x32_N2(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        else
            svt_av1_fwd_txfm2d_8x32_N2_c(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        break;
    case TX_16X4:
        svt_av1_fwd_txfm2d_16x4_N2(
            residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        break;
    case TX_4X16:
        svt_av1_fwd_txfm2d_4x16_N2(
            residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        break;
    case TX_8X4:

        svt_av1_fwd_txfm2d_8x4_N2(
            residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        break;
    case TX_4X8:

        svt_av1_fwd_txfm2d_4x8_N2(
            residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        break;

    // Square sizes.
    case TX_64X64:

        svt_av1_fwd_txfm2d_64x64_N2(
            residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        *three_quad_energy = handle_transform64x64_N2_N4(coeff_buffer);

        break;

    case TX_32X32:
        if (transform_type == V_DCT || transform_type == H_DCT || transform_type == V_ADST ||
            transform_type == H_ADST || transform_type == V_FLIPADST ||
            transform_type == H_FLIPADST)
            // NOTE(review): these 1D-only types are believed unreachable for
            // 32x32 — confirm before removing this fallback.
            av1_transform_two_d_32x32_N2_c(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        else {
            svt_av1_fwd_txfm2d_32x32_N2(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        }

        break;

    case TX_16X16:

        svt_av1_fwd_txfm2d_16x16_N2(
            residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        break;
    case TX_8X8:

        svt_av1_fwd_txfm2d_8x8_N2(
            residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        break;
    case TX_4X4:

        svt_av1_fwd_txfm2d_4x4_N2(
            residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        break;
    default: assert(0); break;
    }

    return return_error;
}
3230
/* Forward-transform estimate keeping only the low-frequency quarter (N4
 * path): dispatches to the N4-pruned 2D forward transform for the given
 * size/type. For sizes with a 64-point dimension the handle_*_N2_N4 helper
 * repacks the surviving coefficients and reports the discarded-region energy
 * through *three_quad_energy (zero here, since those regions were never
 * computed). coeff_stride and component_type are unused in this path. */
static EbErrorType av1_estimate_transform_N4(int16_t *residual_buffer, uint32_t residual_stride,
                                             int32_t *coeff_buffer, uint32_t coeff_stride,
                                             TxSize transform_size, uint64_t *three_quad_energy,
                                             uint32_t bit_depth, TxType transform_type,
                                             PlaneType component_type)

{
    EbErrorType return_error = EB_ErrorNone;

    (void)coeff_stride;
    (void)component_type;

    switch (transform_size) {
    // 64-point sizes: the optimized kernel is only selected for DCT_DCT;
    // every other type takes the C kernel, then the helper repacks/zeros.
    case TX_64X32:
        if (transform_type == DCT_DCT)
            svt_av1_fwd_txfm2d_64x32_N4(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        else
            svt_av1_fwd_txfm2d_64x32_N4_c(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        *three_quad_energy = handle_transform64x32_N2_N4(coeff_buffer);

        break;

    case TX_32X64:
        if (transform_type == DCT_DCT)
            svt_av1_fwd_txfm2d_32x64_N4(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        else
            svt_av1_fwd_txfm2d_32x64_N4_c(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        *three_quad_energy = handle_transform32x64_N2_N4(coeff_buffer);

        break;

    case TX_64X16:
        if (transform_type == DCT_DCT)
            svt_av1_fwd_txfm2d_64x16_N4(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        else
            svt_av1_fwd_txfm2d_64x16_N4_c(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        *three_quad_energy = handle_transform64x16_N2_N4(coeff_buffer);

        break;

    case TX_16X64:
        if (transform_type == DCT_DCT)
            svt_av1_fwd_txfm2d_16x64_N4(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        else
            svt_av1_fwd_txfm2d_16x64_N4_c(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        *three_quad_energy = handle_transform16x64_N2_N4(coeff_buffer);

        break;

    // 32/16-point rectangles: optimized kernel covers DCT_DCT and IDTX only.
    case TX_32X16:
        if ((transform_type == DCT_DCT) || (transform_type == IDTX))
            svt_av1_fwd_txfm2d_32x16_N4(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        else
            svt_av1_fwd_txfm2d_32x16_N4_c(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        break;

    case TX_16X32:
        if ((transform_type == DCT_DCT) || (transform_type == IDTX))
            svt_av1_fwd_txfm2d_16x32_N4(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        else
            svt_av1_fwd_txfm2d_16x32_N4_c(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        break;

    // Sizes whose N4 kernel handles every transform type directly.
    case TX_16X8:
        svt_av1_fwd_txfm2d_16x8_N4(
            residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        break;

    case TX_8X16:
        svt_av1_fwd_txfm2d_8x16_N4(
            residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        break;

    case TX_32X8:
        if ((transform_type == DCT_DCT) || (transform_type == IDTX))
            svt_av1_fwd_txfm2d_32x8_N4(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        else
            svt_av1_fwd_txfm2d_32x8_N4_c(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        break;

    case TX_8X32:
        if ((transform_type == DCT_DCT) || (transform_type == IDTX))
            svt_av1_fwd_txfm2d_8x32_N4(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        else
            svt_av1_fwd_txfm2d_8x32_N4_c(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        break;
    case TX_16X4:
        svt_av1_fwd_txfm2d_16x4_N4(
            residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        break;
    case TX_4X16:
        svt_av1_fwd_txfm2d_4x16_N4(
            residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        break;
    case TX_8X4:

        svt_av1_fwd_txfm2d_8x4_N4(
            residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        break;
    case TX_4X8:

        svt_av1_fwd_txfm2d_4x8_N4(
            residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        break;

    // Square sizes.
    case TX_64X64:

        svt_av1_fwd_txfm2d_64x64_N4(
            residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        *three_quad_energy = handle_transform64x64_N2_N4(coeff_buffer);

        break;

    case TX_32X32:
        if (transform_type == V_DCT || transform_type == H_DCT || transform_type == V_ADST ||
            transform_type == H_ADST || transform_type == V_FLIPADST ||
            transform_type == H_FLIPADST)
            // NOTE(review): these 1D-only types are believed unreachable for
            // 32x32 — confirm before removing this fallback.
            av1_transform_two_d_32x32_N4_c(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        else {
            svt_av1_fwd_txfm2d_32x32_N4(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        }

        break;

    case TX_16X16:

        svt_av1_fwd_txfm2d_16x16_N4(
            residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        break;
    case TX_8X8:

        svt_av1_fwd_txfm2d_8x8_N4(
            residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        break;
    case TX_4X4:

        svt_av1_fwd_txfm2d_4x4_N4(
            residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        break;
    default: assert(0); break;
    }

    return return_error;
}
3406
av1_estimate_transform_ONLY_DC(int16_t * residual_buffer,uint32_t residual_stride,int32_t * coeff_buffer,uint32_t coeff_stride,TxSize transform_size,uint64_t * three_quad_energy,uint32_t bit_depth,TxType transform_type,PlaneType component_type)3407 static EbErrorType av1_estimate_transform_ONLY_DC(int16_t *residual_buffer,
3408 uint32_t residual_stride, int32_t *coeff_buffer,
3409 uint32_t coeff_stride, TxSize transform_size,
3410 uint64_t *three_quad_energy, uint32_t bit_depth,
3411 TxType transform_type, PlaneType component_type)
3412
3413 {
3414 EbErrorType return_error = av1_estimate_transform_N4(residual_buffer,
3415 residual_stride,
3416 coeff_buffer,
3417 coeff_stride,
3418 transform_size,
3419 three_quad_energy,
3420 bit_depth,
3421 transform_type,
3422 component_type);
3423
3424 for (int i = 1; i < (tx_size_wide[transform_size] * tx_size_high[transform_size]); i++) {
3425 if (i % tx_size_wide[transform_size] < (tx_size_wide[transform_size] >> 2) ||
3426 i / tx_size_wide[transform_size] < (tx_size_high[transform_size] >> 2)) {
3427 coeff_buffer[i] = 0;
3428 }
3429 }
3430 return return_error;
3431 }
3432
/*
 * DEFAULT_SHAPE forward-transform estimate: compute the full coefficient set
 * for the given block size by dispatching to the per-size
 * svt_av1_fwd_txfm2d_* kernel.
 *
 * For block sizes with a 64-sample dimension, the svt_handle_transform*
 * helper folds the energy of the coefficient region discarded by the kernel
 * into *three_quad_energy after the transform.
 *
 * coeff_stride and component_type are unused here; they are kept so every
 * av1_estimate_transform_* variant shares one signature.
 */
EbErrorType av1_estimate_transform_default(int16_t *residual_buffer, uint32_t residual_stride,
                                           int32_t *coeff_buffer, uint32_t coeff_stride,
                                           TxSize transform_size, uint64_t *three_quad_energy,
                                           uint32_t bit_depth, TxType transform_type,
                                           PlaneType component_type)

{
    EbErrorType return_error = EB_ErrorNone;

    (void)coeff_stride;
    (void)component_type;

    switch (transform_size) {
    case TX_64X32:
        // Dispatched (possibly SIMD) kernel only for DCT_DCT; all other
        // transform types take the C reference path.
        if (transform_type == DCT_DCT)
            svt_av1_fwd_txfm2d_64x32(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        else
            svt_av1_fwd_txfm2d_64x32_c(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        *three_quad_energy = svt_handle_transform64x32(coeff_buffer);

        break;

    case TX_32X64:
        if (transform_type == DCT_DCT)
            svt_av1_fwd_txfm2d_32x64(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        else
            svt_av1_fwd_txfm2d_32x64_c(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        *three_quad_energy = svt_handle_transform32x64(coeff_buffer);

        break;

    case TX_64X16:
        if (transform_type == DCT_DCT)
            svt_av1_fwd_txfm2d_64x16(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        else
            svt_av1_fwd_txfm2d_64x16_c(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        *three_quad_energy = svt_handle_transform64x16(coeff_buffer);

        break;

    case TX_16X64:
        if (transform_type == DCT_DCT)
            svt_av1_fwd_txfm2d_16x64(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        else
            svt_av1_fwd_txfm2d_16x64_c(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        *three_quad_energy = svt_handle_transform16x64(coeff_buffer);

        break;

    case TX_32X16:
        // TTK
        // Dispatched kernel covers only DCT_DCT/IDTX; others use the C path.
        if ((transform_type == DCT_DCT) || (transform_type == IDTX))
            svt_av1_fwd_txfm2d_32x16(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        else
            svt_av1_fwd_txfm2d_32x16_c(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        break;

    case TX_16X32:
        if ((transform_type == DCT_DCT) || (transform_type == IDTX))
            svt_av1_fwd_txfm2d_16x32(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        else
            svt_av1_fwd_txfm2d_16x32_c(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        break;

    case TX_16X8:
        svt_av1_fwd_txfm2d_16x8(
            residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        break;

    case TX_8X16:
        svt_av1_fwd_txfm2d_8x16(
            residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        break;

    case TX_32X8:
        if ((transform_type == DCT_DCT) || (transform_type == IDTX))
            svt_av1_fwd_txfm2d_32x8(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        else
            svt_av1_fwd_txfm2d_32x8_c(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        break;

    case TX_8X32:
        if ((transform_type == DCT_DCT) || (transform_type == IDTX))
            svt_av1_fwd_txfm2d_8x32(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        else
            svt_av1_fwd_txfm2d_8x32_c(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        break;
    case TX_16X4:
        svt_av1_fwd_txfm2d_16x4(
            residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        break;
    case TX_4X16:
        svt_av1_fwd_txfm2d_4x16(
            residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        break;
    case TX_8X4:

        svt_av1_fwd_txfm2d_8x4(
            residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        break;
    case TX_4X8:

        svt_av1_fwd_txfm2d_4x8(
            residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        break;

    case TX_64X64:

        svt_av1_fwd_txfm2d_64x64(
            residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        *three_quad_energy = svt_handle_transform64x64(coeff_buffer);

        break;

    case TX_32X32:
        if (transform_type == V_DCT || transform_type == H_DCT || transform_type == V_ADST ||
            transform_type == H_ADST || transform_type == V_FLIPADST ||
            transform_type == H_FLIPADST)
            // Tahani: I believe those cases are never hit
            svt_av1_transform_two_d_32x32_c(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        else {
            svt_av1_fwd_txfm2d_32x32(
                residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);
        }

        break;

    case TX_16X16:

        svt_av1_fwd_txfm2d_16x16(
            residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        break;
    case TX_8X8:

        svt_av1_fwd_txfm2d_8x8(
            residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        break;
    case TX_4X4:

        svt_av1_fwd_txfm2d_4x4(
            residual_buffer, coeff_buffer, residual_stride, transform_type, bit_depth);

        break;
    default: assert(0); break;
    }

    return return_error;
}
3608 /*********************************************************************
3609 * Transform
3610 * Note there is an implicit assumption that TU Size <= PU Size,
3611 * which is different than the HEVC requirements.
3612 *********************************************************************/
av1_estimate_transform(int16_t * residual_buffer,uint32_t residual_stride,int32_t * coeff_buffer,uint32_t coeff_stride,TxSize transform_size,uint64_t * three_quad_energy,uint32_t bit_depth,TxType transform_type,PlaneType component_type,EB_TRANS_COEFF_SHAPE trans_coeff_shape)3613 EbErrorType av1_estimate_transform(int16_t *residual_buffer, uint32_t residual_stride,
3614 int32_t *coeff_buffer, uint32_t coeff_stride,
3615 TxSize transform_size, uint64_t *three_quad_energy,
3616 uint32_t bit_depth, TxType transform_type,
3617 PlaneType component_type, EB_TRANS_COEFF_SHAPE trans_coeff_shape)
3618
3619 {
3620 (void)trans_coeff_shape;
3621 (void)coeff_stride;
3622 (void)component_type;
3623 switch (trans_coeff_shape) {
3624 case DEFAULT_SHAPE:
3625 return av1_estimate_transform_default(residual_buffer,
3626 residual_stride,
3627 coeff_buffer,
3628 coeff_stride,
3629 transform_size,
3630 three_quad_energy,
3631 bit_depth,
3632 transform_type,
3633 component_type);
3634 case N2_SHAPE:
3635 return av1_estimate_transform_N2(residual_buffer,
3636 residual_stride,
3637 coeff_buffer,
3638 coeff_stride,
3639 transform_size,
3640 three_quad_energy,
3641 bit_depth,
3642 transform_type,
3643 component_type);
3644 case N4_SHAPE:
3645 return av1_estimate_transform_N4(residual_buffer,
3646 residual_stride,
3647 coeff_buffer,
3648 coeff_stride,
3649 transform_size,
3650 three_quad_energy,
3651 bit_depth,
3652 transform_type,
3653 component_type);
3654 case ONLY_DC_SHAPE:
3655 return av1_estimate_transform_ONLY_DC(residual_buffer,
3656 residual_stride,
3657 coeff_buffer,
3658 coeff_stride,
3659 transform_size,
3660 three_quad_energy,
3661 bit_depth,
3662 transform_type,
3663 component_type);
3664 }
3665
3666 assert(0);
3667 return EB_ErrorBadParameter;
3668 }
3669 // PF_N4
/* N4 (reduced-area) 64x64 forward transform wrapper; only DCT_DCT is legal. */
static void highbd_fwd_txfm_64x64_n4(int16_t *src_diff, TranLow *coeff,
                                     int diff_stride, TxfmParam *txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_64x64_N4(src_diff, (int32_t *)coeff, diff_stride, DCT_DCT,
                                txfm_param->bd);
}
3677
/* N4 (reduced-area) 32x64 forward transform wrapper; only DCT_DCT is legal. */
static void highbd_fwd_txfm_32x64_n4(int16_t *src_diff, TranLow *coeff,
                                     int diff_stride, TxfmParam *txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_32x64_N4(src_diff, (int32_t *)coeff, diff_stride,
                                txfm_param->tx_type, txfm_param->bd);
}
3686
/* N4 (reduced-area) 64x32 forward transform wrapper; only DCT_DCT is legal. */
static void highbd_fwd_txfm_64x32_n4(int16_t *src_diff, TranLow *coeff,
                                     int diff_stride, TxfmParam *txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_64x32_N4(src_diff, (int32_t *)coeff, diff_stride,
                                txfm_param->tx_type, txfm_param->bd);
}
3695
/* N4 (reduced-area) 16x64 forward transform wrapper; only DCT_DCT is legal. */
static void highbd_fwd_txfm_16x64_n4(int16_t *src_diff, TranLow *coeff,
                                     int diff_stride, TxfmParam *txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_16x64_N4(src_diff, (int32_t *)coeff, diff_stride, DCT_DCT,
                                txfm_param->bd);
}
3703
/* N4 (reduced-area) 64x16 forward transform wrapper; only DCT_DCT is legal. */
static void highbd_fwd_txfm_64x16_n4(int16_t *src_diff, TranLow *coeff,
                                     int diff_stride, TxfmParam *txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_64x16_N4(src_diff, (int32_t *)coeff, diff_stride, DCT_DCT,
                                txfm_param->bd);
}
3711
/* N4 (reduced-area) 32x32 forward transform wrapper. */
static void highbd_fwd_txfm_32x32_n4(int16_t *src_diff, TranLow *coeff,
                                     int diff_stride, TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_32x32_N4(src_diff, (int32_t *)coeff, diff_stride,
                                txfm_param->tx_type, txfm_param->bd);
}
3719
/* N4 (reduced-area) 16x16 forward transform wrapper. */
static void highbd_fwd_txfm_16x16_n4(int16_t *src_diff, TranLow *coeff,
                                     int diff_stride, TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_16x16_N4(src_diff, (int32_t *)coeff, diff_stride,
                                txfm_param->tx_type, txfm_param->bd);
}
3727
/* N4 (reduced-area) 8x8 forward transform wrapper. */
static void highbd_fwd_txfm_8x8_n4(int16_t *src_diff, TranLow *coeff,
                                   int diff_stride, TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_8x8_N4(src_diff, (int32_t *)coeff, diff_stride,
                              txfm_param->tx_type, txfm_param->bd);
}
3735
/* N4 (reduced-area) 4x8 forward transform wrapper. */
static void highbd_fwd_txfm_4x8_n4(int16_t *src_diff, TranLow *coeff,
                                   int diff_stride, TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_4x8_N4(src_diff, (int32_t *)coeff, diff_stride,
                              txfm_param->tx_type, txfm_param->bd);
}
3742
/* N4 (reduced-area) 8x4 forward transform wrapper. */
static void highbd_fwd_txfm_8x4_n4(int16_t *src_diff, TranLow *coeff,
                                   int diff_stride, TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_8x4_N4(src_diff, (int32_t *)coeff, diff_stride,
                              txfm_param->tx_type, txfm_param->bd);
}
3749
/* N4 (reduced-area) 8x16 forward transform wrapper. */
static void highbd_fwd_txfm_8x16_n4(int16_t *src_diff, TranLow *coeff,
                                    int diff_stride, TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_8x16_N4(src_diff, (int32_t *)coeff, diff_stride,
                               txfm_param->tx_type, txfm_param->bd);
}
3757
/* N4 (reduced-area) 16x8 forward transform wrapper. */
static void highbd_fwd_txfm_16x8_n4(int16_t *src_diff, TranLow *coeff,
                                    int diff_stride, TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_16x8_N4(src_diff, (int32_t *)coeff, diff_stride,
                               txfm_param->tx_type, txfm_param->bd);
}
3765
/* N4 (reduced-area) 16x32 forward transform wrapper. */
static void highbd_fwd_txfm_16x32_n4(int16_t *src_diff, TranLow *coeff,
                                     int diff_stride, TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_16x32_N4(src_diff, (int32_t *)coeff, diff_stride,
                                txfm_param->tx_type, txfm_param->bd);
}
3772
/* N4 (reduced-area) 32x16 forward transform wrapper. */
static void highbd_fwd_txfm_32x16_n4(int16_t *src_diff, TranLow *coeff,
                                     int diff_stride, TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_32x16_N4(src_diff, (int32_t *)coeff, diff_stride,
                                txfm_param->tx_type, txfm_param->bd);
}
3779
/*
 * N4 (reduced-area) 4x16 forward transform wrapper.
 *
 * Fix: call the _N4 kernel, matching every sibling *_n4 wrapper and the
 * TX_4X16 case of av1_estimate_transform_N4() (which already invokes
 * svt_av1_fwd_txfm2d_4x16_N4).  The previous code called the full-size
 * svt_av1_fwd_txfm2d_4x16, computing the complete coefficient set where only
 * the reduced N4 area is expected by callers on this path.
 */
static void highbd_fwd_txfm_4x16_n4(int16_t *src_diff, TranLow *coeff,
                                    int diff_stride, TxfmParam *txfm_param) {
    int32_t *dst_coeff = (int32_t *)coeff;
    svt_av1_fwd_txfm2d_4x16_N4(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
                               txfm_param->bd);
}
3786
/* N4 (reduced-area) 16x4 forward transform wrapper. */
static void highbd_fwd_txfm_16x4_n4(int16_t *src_diff, TranLow *coeff,
                                    int diff_stride, TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_16x4_N4(src_diff, (int32_t *)coeff, diff_stride,
                               txfm_param->tx_type, txfm_param->bd);
}
3793
/* N4 (reduced-area) 8x32 forward transform wrapper. */
static void highbd_fwd_txfm_8x32_n4(int16_t *src_diff, TranLow *coeff,
                                    int diff_stride, TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_8x32_N4(src_diff, (int32_t *)coeff, diff_stride,
                               txfm_param->tx_type, txfm_param->bd);
}
3800
/* N4 (reduced-area) 32x8 forward transform wrapper. */
static void highbd_fwd_txfm_32x8_n4(int16_t *src_diff, TranLow *coeff,
                                    int diff_stride, TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_32x8_N4(src_diff, (int32_t *)coeff, diff_stride,
                               txfm_param->tx_type, txfm_param->bd);
}
3807
3808 //PF_N2
/* N2 (reduced-area) 64x64 forward transform wrapper; only DCT_DCT is legal. */
static void highbd_fwd_txfm_64x64_n2(int16_t *src_diff, TranLow *coeff,
                                     int diff_stride, TxfmParam *txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_64x64_N2(src_diff, (int32_t *)coeff, diff_stride, DCT_DCT,
                                txfm_param->bd);
}
3816
/* N2 (reduced-area) 32x64 forward transform wrapper; only DCT_DCT is legal. */
static void highbd_fwd_txfm_32x64_n2(int16_t *src_diff, TranLow *coeff,
                                     int diff_stride, TxfmParam *txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_32x64_N2(src_diff, (int32_t *)coeff, diff_stride,
                                txfm_param->tx_type, txfm_param->bd);
}
3825
/* N2 (reduced-area) 64x32 forward transform wrapper; only DCT_DCT is legal. */
static void highbd_fwd_txfm_64x32_n2(int16_t *src_diff, TranLow *coeff,
                                     int diff_stride, TxfmParam *txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_64x32_N2(src_diff, (int32_t *)coeff, diff_stride,
                                txfm_param->tx_type, txfm_param->bd);
}
3834
/* N2 (reduced-area) 16x64 forward transform wrapper; only DCT_DCT is legal. */
static void highbd_fwd_txfm_16x64_n2(int16_t *src_diff, TranLow *coeff,
                                     int diff_stride, TxfmParam *txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_16x64_N2(src_diff, (int32_t *)coeff, diff_stride, DCT_DCT,
                                txfm_param->bd);
}
3842
/* N2 (reduced-area) 64x16 forward transform wrapper; only DCT_DCT is legal. */
static void highbd_fwd_txfm_64x16_n2(int16_t *src_diff, TranLow *coeff,
                                     int diff_stride, TxfmParam *txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_64x16_N2(src_diff, (int32_t *)coeff, diff_stride, DCT_DCT,
                                txfm_param->bd);
}
3850
/* N2 (reduced-area) 32x32 forward transform wrapper. */
static void highbd_fwd_txfm_32x32_n2(int16_t *src_diff, TranLow *coeff,
                                     int diff_stride, TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_32x32_N2(src_diff, (int32_t *)coeff, diff_stride,
                                txfm_param->tx_type, txfm_param->bd);
}
3858
/* N2 (reduced-area) 16x16 forward transform wrapper. */
static void highbd_fwd_txfm_16x16_n2(int16_t *src_diff, TranLow *coeff,
                                     int diff_stride, TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_16x16_N2(src_diff, (int32_t *)coeff, diff_stride,
                                txfm_param->tx_type, txfm_param->bd);
}
3866
/* N2 (reduced-area) 8x8 forward transform wrapper. */
static void highbd_fwd_txfm_8x8_n2(int16_t *src_diff, TranLow *coeff,
                                   int diff_stride, TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_8x8_N2(src_diff, (int32_t *)coeff, diff_stride,
                              txfm_param->tx_type, txfm_param->bd);
}
3874
/* N2 (reduced-area) 4x8 forward transform wrapper. */
static void highbd_fwd_txfm_4x8_n2(int16_t *src_diff, TranLow *coeff,
                                   int diff_stride, TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_4x8_N2(src_diff, (int32_t *)coeff, diff_stride,
                              txfm_param->tx_type, txfm_param->bd);
}
3881
/* N2 (reduced-area) 8x4 forward transform wrapper. */
static void highbd_fwd_txfm_8x4_n2(int16_t *src_diff, TranLow *coeff,
                                   int diff_stride, TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_8x4_N2(src_diff, (int32_t *)coeff, diff_stride,
                              txfm_param->tx_type, txfm_param->bd);
}
3888
/* N2 (reduced-area) 8x16 forward transform wrapper. */
static void highbd_fwd_txfm_8x16_n2(int16_t *src_diff, TranLow *coeff,
                                    int diff_stride, TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_8x16_N2(src_diff, (int32_t *)coeff, diff_stride,
                               txfm_param->tx_type, txfm_param->bd);
}
3896
/* N2 (reduced-area) 16x8 forward transform wrapper. */
static void highbd_fwd_txfm_16x8_n2(int16_t *src_diff, TranLow *coeff,
                                    int diff_stride, TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_16x8_N2(src_diff, (int32_t *)coeff, diff_stride,
                               txfm_param->tx_type, txfm_param->bd);
}
3904
/* N2 (reduced-area) 16x32 forward transform wrapper. */
static void highbd_fwd_txfm_16x32_n2(int16_t *src_diff, TranLow *coeff,
                                     int diff_stride, TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_16x32_N2(src_diff, (int32_t *)coeff, diff_stride,
                                txfm_param->tx_type, txfm_param->bd);
}
3911
/* N2 (reduced-area) 32x16 forward transform wrapper. */
static void highbd_fwd_txfm_32x16_n2(int16_t *src_diff, TranLow *coeff,
                                     int diff_stride, TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_32x16_N2(src_diff, (int32_t *)coeff, diff_stride,
                                txfm_param->tx_type, txfm_param->bd);
}
3918
/* N2 (reduced-area) 4x16 forward transform wrapper.
 * NOTE(review): unlike every sibling *_n2 wrapper this calls the full-size
 * svt_av1_fwd_txfm2d_4x16 rather than an _N2 kernel -- presumably no 4x16
 * N2 kernel exists; confirm, and switch to it if one does. */
static void highbd_fwd_txfm_4x16_n2(int16_t *src_diff, TranLow *coeff,
                                    int diff_stride, TxfmParam *txfm_param) {
    int32_t *dst_coeff = (int32_t *)coeff;
    svt_av1_fwd_txfm2d_4x16(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
                            txfm_param->bd);
}
3925
/* N2 (reduced-area) 16x4 forward transform wrapper. */
static void highbd_fwd_txfm_16x4_n2(int16_t *src_diff, TranLow *coeff,
                                    int diff_stride, TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_16x4_N2(src_diff, (int32_t *)coeff, diff_stride,
                               txfm_param->tx_type, txfm_param->bd);
}
3932
/* N2 (reduced-area) 8x32 forward transform wrapper. */
static void highbd_fwd_txfm_8x32_n2(int16_t *src_diff, TranLow *coeff,
                                    int diff_stride, TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_8x32_N2(src_diff, (int32_t *)coeff, diff_stride,
                               txfm_param->tx_type, txfm_param->bd);
}
3939
/* N2 (reduced-area) 32x8 forward transform wrapper. */
static void highbd_fwd_txfm_32x8_n2(int16_t *src_diff, TranLow *coeff,
                                    int diff_stride, TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_32x8_N2(src_diff, (int32_t *)coeff, diff_stride,
                               txfm_param->tx_type, txfm_param->bd);
}
/* Full 64x64 forward transform wrapper; only DCT_DCT is legal. */
static void highbd_fwd_txfm_64x64(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                  TxfmParam *txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_64x64(src_diff, (int32_t *)coeff, diff_stride, DCT_DCT,
                             txfm_param->bd);
}
3953
/* Full 32x64 forward transform wrapper; only DCT_DCT is legal. */
static void highbd_fwd_txfm_32x64(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                  TxfmParam *txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_32x64(src_diff, (int32_t *)coeff, diff_stride,
                             txfm_param->tx_type, txfm_param->bd);
}
3961
/* Full 64x32 forward transform wrapper; only DCT_DCT is legal. */
static void highbd_fwd_txfm_64x32(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                  TxfmParam *txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_64x32(src_diff, (int32_t *)coeff, diff_stride,
                             txfm_param->tx_type, txfm_param->bd);
}
3969
/* Full 16x64 forward transform wrapper; only DCT_DCT is legal. */
static void highbd_fwd_txfm_16x64(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                  TxfmParam *txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_16x64(src_diff, (int32_t *)coeff, diff_stride, DCT_DCT,
                             txfm_param->bd);
}
3977
/* Full 64x16 forward transform wrapper; only DCT_DCT is legal. */
static void highbd_fwd_txfm_64x16(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                  TxfmParam *txfm_param) {
    assert(txfm_param->tx_type == DCT_DCT);
    svt_av1_fwd_txfm2d_64x16(src_diff, (int32_t *)coeff, diff_stride, DCT_DCT,
                             txfm_param->bd);
}
3985
/* Full 32x32 forward transform wrapper. */
static void highbd_fwd_txfm_32x32(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                  TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_32x32(src_diff, (int32_t *)coeff, diff_stride,
                             txfm_param->tx_type, txfm_param->bd);
}
3993
/* Full 16x16 forward transform wrapper. */
static void highbd_fwd_txfm_16x16(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                  TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_16x16(src_diff, (int32_t *)coeff, diff_stride,
                             txfm_param->tx_type, txfm_param->bd);
}
4001
/* Full 8x8 forward transform wrapper. */
static void highbd_fwd_txfm_8x8(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_8x8(src_diff, (int32_t *)coeff, diff_stride,
                           txfm_param->tx_type, txfm_param->bd);
}
4009
/* Full 4x8 forward transform wrapper. */
static void highbd_fwd_txfm_4x8(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_4x8(src_diff, (int32_t *)coeff, diff_stride,
                           txfm_param->tx_type, txfm_param->bd);
}
4015
/* Full 8x4 forward transform wrapper. */
static void highbd_fwd_txfm_8x4(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_8x4(src_diff, (int32_t *)coeff, diff_stride,
                           txfm_param->tx_type, txfm_param->bd);
}
4021
/* Full 8x16 forward transform wrapper. */
static void highbd_fwd_txfm_8x16(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                 TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_8x16(src_diff, (int32_t *)coeff, diff_stride,
                            txfm_param->tx_type, txfm_param->bd);
}
4029
/* Full 16x8 forward transform wrapper. */
static void highbd_fwd_txfm_16x8(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                 TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_16x8(src_diff, (int32_t *)coeff, diff_stride,
                            txfm_param->tx_type, txfm_param->bd);
}
4037
/* Full 16x32 forward transform wrapper. */
static void highbd_fwd_txfm_16x32(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                  TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_16x32(src_diff, (int32_t *)coeff, diff_stride,
                             txfm_param->tx_type, txfm_param->bd);
}
4043
/* Full 32x16 forward transform wrapper. */
static void highbd_fwd_txfm_32x16(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                  TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_32x16(src_diff, (int32_t *)coeff, diff_stride,
                             txfm_param->tx_type, txfm_param->bd);
}
4049
/* Full 4x16 forward transform wrapper. */
static void highbd_fwd_txfm_4x16(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                 TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_4x16(src_diff, (int32_t *)coeff, diff_stride,
                            txfm_param->tx_type, txfm_param->bd);
}
4055
/* Full 16x4 forward transform wrapper. */
static void highbd_fwd_txfm_16x4(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                 TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_16x4(src_diff, (int32_t *)coeff, diff_stride,
                            txfm_param->tx_type, txfm_param->bd);
}
4061
/* Full 8x32 forward transform wrapper. */
static void highbd_fwd_txfm_8x32(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                 TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_8x32(src_diff, (int32_t *)coeff, diff_stride,
                            txfm_param->tx_type, txfm_param->bd);
}
4067
/* Full 32x8 forward transform wrapper. */
static void highbd_fwd_txfm_32x8(int16_t *src_diff, TranLow *coeff, int diff_stride,
                                 TxfmParam *txfm_param) {
    svt_av1_fwd_txfm2d_32x8(src_diff, (int32_t *)coeff, diff_stride,
                            txfm_param->tx_type, txfm_param->bd);
}
/*
 * Forward-transform dispatch for the N4 (quarter) coefficient shape.
 * Routes to the matching *_n4 kernel based on txfm_param->tx_size.
 * TX_4X4 is deliberately a no-op here (the 4x4 call is commented out in
 * the original code with a "hack" note); unknown sizes trip the assert.
 */
void svt_av1_highbd_fwd_txfm_n4(int16_t *src_diff, TranLow *coeff,
                                int diff_stride, TxfmParam *txfm_param) {
    assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
    switch (txfm_param->tx_size) {
    case TX_64X64: highbd_fwd_txfm_64x64_n4(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_32X64: highbd_fwd_txfm_32x64_n4(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_64X32: highbd_fwd_txfm_64x32_n4(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_16X64: highbd_fwd_txfm_16x64_n4(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_64X16: highbd_fwd_txfm_64x16_n4(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_32X32: highbd_fwd_txfm_32x32_n4(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_16X16: highbd_fwd_txfm_16x16_n4(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_8X8: highbd_fwd_txfm_8x8_n4(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_4X8: highbd_fwd_txfm_4x8_n4(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_8X4: highbd_fwd_txfm_8x4_n4(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_8X16: highbd_fwd_txfm_8x16_n4(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_16X8: highbd_fwd_txfm_16x8_n4(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_16X32: highbd_fwd_txfm_16x32_n4(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_32X16: highbd_fwd_txfm_32x16_n4(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_4X4:
        /* hack: highbd_fwd_txfm_4x4 intentionally not invoked here */
        break;
    case TX_4X16: highbd_fwd_txfm_4x16_n4(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_16X4: highbd_fwd_txfm_16x4_n4(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_8X32: highbd_fwd_txfm_8x32_n4(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_32X8: highbd_fwd_txfm_32x8_n4(src_diff, coeff, diff_stride, txfm_param); break;
    default: assert(0); break;
    }
}
/*
 * Forward-transform dispatch for the N2 (half) coefficient shape.
 * Routes to the matching *_n2 kernel based on txfm_param->tx_size.
 * TX_4X4 is deliberately a no-op here (the 4x4 call is commented out in
 * the original code with a "hack" note); unknown sizes trip the assert.
 */
void svt_av1_highbd_fwd_txfm_n2(int16_t *src_diff, TranLow *coeff,
                                int diff_stride, TxfmParam *txfm_param) {
    assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
    switch (txfm_param->tx_size) {
    case TX_64X64: highbd_fwd_txfm_64x64_n2(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_32X64: highbd_fwd_txfm_32x64_n2(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_64X32: highbd_fwd_txfm_64x32_n2(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_16X64: highbd_fwd_txfm_16x64_n2(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_64X16: highbd_fwd_txfm_64x16_n2(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_32X32: highbd_fwd_txfm_32x32_n2(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_16X16: highbd_fwd_txfm_16x16_n2(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_8X8: highbd_fwd_txfm_8x8_n2(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_4X8: highbd_fwd_txfm_4x8_n2(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_8X4: highbd_fwd_txfm_8x4_n2(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_8X16: highbd_fwd_txfm_8x16_n2(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_16X8: highbd_fwd_txfm_16x8_n2(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_16X32: highbd_fwd_txfm_16x32_n2(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_32X16: highbd_fwd_txfm_32x16_n2(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_4X4:
        /* hack: highbd_fwd_txfm_4x4 intentionally not invoked here */
        break;
    case TX_4X16: highbd_fwd_txfm_4x16_n2(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_16X4: highbd_fwd_txfm_16x4_n2(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_8X32: highbd_fwd_txfm_8x32_n2(src_diff, coeff, diff_stride, txfm_param); break;
    case TX_32X8: highbd_fwd_txfm_32x8_n2(src_diff, coeff, diff_stride, txfm_param); break;
    default: assert(0); break;
    }
}
/*
 * Full-resolution forward-transform dispatch: selects the high bit-depth
 * kernel that matches txfm_param->tx_size. TX_4X4 is deliberately a no-op
 * here (the 4x4 call is commented out in the original code with a "hack"
 * note); unknown sizes trip the assert.
 */
void svt_av1_highbd_fwd_txfm(int16_t *src_diff, TranLow *coeff, int diff_stride,
                             TxfmParam *txfm_param) {
    assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
    const TxSize size = txfm_param->tx_size;
    switch (size) {
    case TX_64X64:
        highbd_fwd_txfm_64x64(src_diff, coeff, diff_stride, txfm_param);
        break;
    case TX_32X64:
        highbd_fwd_txfm_32x64(src_diff, coeff, diff_stride, txfm_param);
        break;
    case TX_64X32:
        highbd_fwd_txfm_64x32(src_diff, coeff, diff_stride, txfm_param);
        break;
    case TX_16X64:
        highbd_fwd_txfm_16x64(src_diff, coeff, diff_stride, txfm_param);
        break;
    case TX_64X16:
        highbd_fwd_txfm_64x16(src_diff, coeff, diff_stride, txfm_param);
        break;
    case TX_32X32:
        highbd_fwd_txfm_32x32(src_diff, coeff, diff_stride, txfm_param);
        break;
    case TX_16X16:
        highbd_fwd_txfm_16x16(src_diff, coeff, diff_stride, txfm_param);
        break;
    case TX_8X8:
        highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, txfm_param);
        break;
    case TX_4X8:
        highbd_fwd_txfm_4x8(src_diff, coeff, diff_stride, txfm_param);
        break;
    case TX_8X4:
        highbd_fwd_txfm_8x4(src_diff, coeff, diff_stride, txfm_param);
        break;
    case TX_8X16:
        highbd_fwd_txfm_8x16(src_diff, coeff, diff_stride, txfm_param);
        break;
    case TX_16X8:
        highbd_fwd_txfm_16x8(src_diff, coeff, diff_stride, txfm_param);
        break;
    case TX_16X32:
        highbd_fwd_txfm_16x32(src_diff, coeff, diff_stride, txfm_param);
        break;
    case TX_32X16:
        highbd_fwd_txfm_32x16(src_diff, coeff, diff_stride, txfm_param);
        break;
    case TX_4X4:
        /* hack: highbd_fwd_txfm_4x4 intentionally not invoked here */
        break;
    case TX_4X16:
        highbd_fwd_txfm_4x16(src_diff, coeff, diff_stride, txfm_param);
        break;
    case TX_16X4:
        highbd_fwd_txfm_16x4(src_diff, coeff, diff_stride, txfm_param);
        break;
    case TX_8X32:
        highbd_fwd_txfm_8x32(src_diff, coeff, diff_stride, txfm_param);
        break;
    case TX_32X8:
        highbd_fwd_txfm_32x8(src_diff, coeff, diff_stride, txfm_param);
        break;
    default:
        assert(0);
        break;
    }
}
/*
 * Run a DCT_DCT forward transform over a residual block, dispatching on the
 * requested coefficient shape (N4 = quarter, N2 = half, otherwise full).
 * bw is passed through as the residual stride; bit_depth/is_hbd are copied
 * into the TxfmParam handed to the dispatcher.
 */
void svt_av1_wht_fwd_txfm(int16_t *src_diff, int bw, int32_t *coeff, TxSize tx_size, EB_TRANS_COEFF_SHAPE pf_shape, int bit_depth,
                          int is_hbd) {
    TxfmParam txfm_param;
    txfm_param.tx_type     = DCT_DCT;
    txfm_param.tx_size     = tx_size;
    txfm_param.lossless    = 0;
    txfm_param.tx_set_type = EXT_TX_SET_ALL16;
    txfm_param.bd          = bit_depth;
    txfm_param.is_hbd      = is_hbd;

    if (pf_shape == N4_SHAPE)
        svt_av1_highbd_fwd_txfm_n4(src_diff, coeff, bw, &txfm_param);
    else if (pf_shape == N2_SHAPE)
        svt_av1_highbd_fwd_txfm_n2(src_diff, coeff, bw, &txfm_param);
    else
        svt_av1_highbd_fwd_txfm(src_diff, coeff, bw, &txfm_param);
}
svt_av1_fidentity16_N2_c(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)4253 void svt_av1_fidentity16_N2_c(const int32_t *input, int32_t *output, int8_t cos_bit,
4254 const int8_t *stage_range) {
4255 (void)stage_range;
4256 (void)cos_bit;
4257 for (int32_t i = 0; i < 8; ++i)
4258 output[i] = round_shift((int64_t)input[i] * 2 * new_sqrt2, new_sqrt2_bits);
4259
4260 assert(stage_range[0] + new_sqrt2_bits <= 32);
4261 }
4262
/*
 * svt_av1_fadst16_new_N2: forward 16-point ADST, N2 (half) variant.
 *
 * Runs the full sign-flip + butterfly lattice of the 16-point ADST, but the
 * last two stages compute only the terms feeding the first 8 output
 * coefficients; output[8..15] are not written by the final stage.
 *
 * input/output : 16-element buffers; must not alias (asserted below, since
 *                output is used as scratch between stages).
 * cos_bit      : precision of the cosine table consumed by half_btf().
 * stage_range  : unused in this variant.
 */
void svt_av1_fadst16_new_N2(const int32_t *input, int32_t *output, int8_t cos_bit,
                            const int8_t *stage_range) {
    (void)stage_range;
    const int32_t *cospi;

    /* Stages ping-pong between output and step. */
    int32_t *bf0, *bf1;
    int32_t step[16];

    // stage 0;

    // stage 1;  input permutation with alternating sign flips
    assert(output != input);
    bf1 = output;
    bf1[0] = input[0];
    bf1[1] = -input[15];
    bf1[2] = -input[7];
    bf1[3] = input[8];
    bf1[4] = -input[3];
    bf1[5] = input[12];
    bf1[6] = input[4];
    bf1[7] = -input[11];
    bf1[8] = -input[1];
    bf1[9] = input[14];
    bf1[10] = input[6];
    bf1[11] = -input[9];
    bf1[12] = input[2];
    bf1[13] = -input[13];
    bf1[14] = -input[5];
    bf1[15] = input[10];

    // stage 2:  half-butterfly rotations on the (2,3), (6,7), (10,11), (14,15) pairs
    cospi = cospi_arr(cos_bit);
    bf0 = output;
    bf1 = step;
    bf1[0] = bf0[0];
    bf1[1] = bf0[1];
    bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
    bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
    bf1[4] = bf0[4];
    bf1[5] = bf0[5];
    bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
    bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
    bf1[8] = bf0[8];
    bf1[9] = bf0[9];
    bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit);
    bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit);
    bf1[12] = bf0[12];
    bf1[13] = bf0[13];
    bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit);
    bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit);

    // stage 3:  add/subtract butterflies at stride 2
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0] + bf0[2];
    bf1[1] = bf0[1] + bf0[3];
    bf1[2] = bf0[0] - bf0[2];
    bf1[3] = bf0[1] - bf0[3];
    bf1[4] = bf0[4] + bf0[6];
    bf1[5] = bf0[5] + bf0[7];
    bf1[6] = bf0[4] - bf0[6];
    bf1[7] = bf0[5] - bf0[7];
    bf1[8] = bf0[8] + bf0[10];
    bf1[9] = bf0[9] + bf0[11];
    bf1[10] = bf0[8] - bf0[10];
    bf1[11] = bf0[9] - bf0[11];
    bf1[12] = bf0[12] + bf0[14];
    bf1[13] = bf0[13] + bf0[15];
    bf1[14] = bf0[12] - bf0[14];
    bf1[15] = bf0[13] - bf0[15];

    // stage 4:  rotations with cospi[16]/cospi[48] on the (4..7) and (12..15) groups
    cospi = cospi_arr(cos_bit);
    bf0 = output;
    bf1 = step;
    bf1[0] = bf0[0];
    bf1[1] = bf0[1];
    bf1[2] = bf0[2];
    bf1[3] = bf0[3];
    bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
    bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
    bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
    bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
    bf1[8] = bf0[8];
    bf1[9] = bf0[9];
    bf1[10] = bf0[10];
    bf1[11] = bf0[11];
    bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit);
    bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit);
    bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit);
    bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit);

    // stage 5:  add/subtract butterflies at stride 4
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0] + bf0[4];
    bf1[1] = bf0[1] + bf0[5];
    bf1[2] = bf0[2] + bf0[6];
    bf1[3] = bf0[3] + bf0[7];
    bf1[4] = bf0[0] - bf0[4];
    bf1[5] = bf0[1] - bf0[5];
    bf1[6] = bf0[2] - bf0[6];
    bf1[7] = bf0[3] - bf0[7];
    bf1[8] = bf0[8] + bf0[12];
    bf1[9] = bf0[9] + bf0[13];
    bf1[10] = bf0[10] + bf0[14];
    bf1[11] = bf0[11] + bf0[15];
    bf1[12] = bf0[8] - bf0[12];
    bf1[13] = bf0[9] - bf0[13];
    bf1[14] = bf0[10] - bf0[14];
    bf1[15] = bf0[11] - bf0[15];

    // stage 6:  rotations on the upper half (indices 8..15)
    cospi = cospi_arr(cos_bit);
    bf0 = output;
    bf1 = step;
    bf1[0] = bf0[0];
    bf1[1] = bf0[1];
    bf1[2] = bf0[2];
    bf1[3] = bf0[3];
    bf1[4] = bf0[4];
    bf1[5] = bf0[5];
    bf1[6] = bf0[6];
    bf1[7] = bf0[7];
    bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit);
    bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit);
    bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit);
    bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit);
    bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit);
    bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit);
    bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit);
    bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit);

    // stage 7:  add/subtract butterflies at stride 8
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0] + bf0[8];
    bf1[1] = bf0[1] + bf0[9];
    bf1[2] = bf0[2] + bf0[10];
    bf1[3] = bf0[3] + bf0[11];
    bf1[4] = bf0[4] + bf0[12];
    bf1[5] = bf0[5] + bf0[13];
    bf1[6] = bf0[6] + bf0[14];
    bf1[7] = bf0[7] + bf0[15];
    bf1[8] = bf0[0] - bf0[8];
    bf1[9] = bf0[1] - bf0[9];
    bf1[10] = bf0[2] - bf0[10];
    bf1[11] = bf0[3] - bf0[11];
    bf1[12] = bf0[4] - bf0[12];
    bf1[13] = bf0[5] - bf0[13];
    bf1[14] = bf0[6] - bf0[14];
    bf1[15] = bf0[7] - bf0[15];

    // stage 8:  N2 pruning — only the entries consumed by stage 9 are computed
    cospi = cospi_arr(cos_bit);
    bf0 = output;
    bf1 = step;
    bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit);
    bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit);
    bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit);
    bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit);
    bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit);
    bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit);
    bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit);
    bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit);

    // stage 9:  final permutation; only output[0..7] are written (N2)
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[1];
    bf1[1] = bf0[14];
    bf1[2] = bf0[3];
    bf1[3] = bf0[12];
    bf1[4] = bf0[5];
    bf1[5] = bf0[10];
    bf1[6] = bf0[7];
    bf1[7] = bf0[8];
}
4441
/*
 * svt_av1_fdct16_new_N2: forward 16-point DCT, N2 (half) variant.
 *
 * Identical butterfly network to the full 16-point forward DCT through the
 * early stages, but the later stages compute only the terms feeding the
 * first 8 output coefficients; output[8..15] are not written by the final
 * stage.
 *
 * input/output : 16-element buffers (output is also used as scratch).
 * cos_bit      : precision of the cosine table consumed by half_btf().
 * stage_range  : unused in this variant.
 */
void svt_av1_fdct16_new_N2(const int32_t *input, int32_t *output, int8_t cos_bit,
                           const int8_t *stage_range) {
    (void)stage_range;
    const int32_t *cospi;

    /* Stages ping-pong between output and step. */
    int32_t *bf0, *bf1;
    int32_t step[16];

    // stage 0;

    // stage 1;  butterflies of mirrored input pairs (i, 15 - i)
    bf1 = output;
    bf1[0] = input[0] + input[15];
    bf1[1] = input[1] + input[14];
    bf1[2] = input[2] + input[13];
    bf1[3] = input[3] + input[12];
    bf1[4] = input[4] + input[11];
    bf1[5] = input[5] + input[10];
    bf1[6] = input[6] + input[9];
    bf1[7] = input[7] + input[8];
    bf1[8] = -input[8] + input[7];
    bf1[9] = -input[9] + input[6];
    bf1[10] = -input[10] + input[5];
    bf1[11] = -input[11] + input[4];
    bf1[12] = -input[12] + input[3];
    bf1[13] = -input[13] + input[2];
    bf1[14] = -input[14] + input[1];
    bf1[15] = -input[15] + input[0];

    // stage 2:  split the even half; rotate the middle of the odd half
    cospi = cospi_arr(cos_bit);
    bf0 = output;
    bf1 = step;
    bf1[0] = bf0[0] + bf0[7];
    bf1[1] = bf0[1] + bf0[6];
    bf1[2] = bf0[2] + bf0[5];
    bf1[3] = bf0[3] + bf0[4];
    bf1[4] = -bf0[4] + bf0[3];
    bf1[5] = -bf0[5] + bf0[2];
    bf1[6] = -bf0[6] + bf0[1];
    bf1[7] = -bf0[7] + bf0[0];
    bf1[8] = bf0[8];
    bf1[9] = bf0[9];
    bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
    bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
    bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
    bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
    bf1[14] = bf0[14];
    bf1[15] = bf0[15];

    // stage 3
    cospi = cospi_arr(cos_bit);
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0] + bf0[3];
    bf1[1] = bf0[1] + bf0[2];
    bf1[2] = -bf0[2] + bf0[1];
    bf1[3] = -bf0[3] + bf0[0];
    bf1[4] = bf0[4];
    bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
    bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
    bf1[7] = bf0[7];
    bf1[8] = bf0[8] + bf0[11];
    bf1[9] = bf0[9] + bf0[10];
    bf1[10] = -bf0[10] + bf0[9];
    bf1[11] = -bf0[11] + bf0[8];
    bf1[12] = -bf0[12] + bf0[15];
    bf1[13] = -bf0[13] + bf0[14];
    bf1[14] = bf0[14] + bf0[13];
    bf1[15] = bf0[15] + bf0[12];

    // stage 4:  first pruned stage — bf1[1]/bf1[3] are never needed downstream
    cospi = cospi_arr(cos_bit);
    bf0 = output;
    bf1 = step;
    bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
    bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
    bf1[4] = bf0[4] + bf0[5];
    bf1[5] = -bf0[5] + bf0[4];
    bf1[6] = -bf0[6] + bf0[7];
    bf1[7] = bf0[7] + bf0[6];
    bf1[8] = bf0[8];
    bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
    bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
    bf1[11] = bf0[11];
    bf1[12] = bf0[12];
    bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
    bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
    bf1[15] = bf0[15];

    // stage 5:  only even low indices survive; upper half still full butterflies
    cospi = cospi_arr(cos_bit);
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0];
    bf1[2] = bf0[2];
    bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
    bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
    bf1[8] = bf0[8] + bf0[9];
    bf1[9] = -bf0[9] + bf0[8];
    bf1[10] = -bf0[10] + bf0[11];
    bf1[11] = bf0[11] + bf0[10];
    bf1[12] = bf0[12] + bf0[13];
    bf1[13] = -bf0[13] + bf0[12];
    bf1[14] = -bf0[14] + bf0[15];
    bf1[15] = bf0[15] + bf0[14];

    // stage 6:  compute only the even entries consumed by the final reorder
    cospi = cospi_arr(cos_bit);
    bf0 = output;
    bf1 = step;
    bf1[0] = bf0[0];
    bf1[2] = bf0[2];
    bf1[4] = bf0[4];
    bf1[6] = bf0[6];
    bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
    bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
    bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
    bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);

    // stage 7:  final reorder; only output[0..7] are written (N2)
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0];
    bf1[1] = bf0[8];
    bf1[2] = bf0[4];
    bf1[3] = bf0[12];
    bf1[4] = bf0[2];
    bf1[5] = bf0[10];
    bf1[6] = bf0[6];
    bf1[7] = bf0[14];
}
4574
/*
 * 8-point identity transform, N2 (half) variant: only the first 4 outputs
 * are produced, each doubled. cos_bit and stage_range are unused.
 */
void svt_av1_fidentity8_N2_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                             const int8_t *stage_range) {
    (void)stage_range;
    (void)cos_bit;
    int32_t k = 0;
    while (k < 4) {
        output[k] = input[k] * 2;
        ++k;
    }
}
4581
/*
 * svt_av1_fadst8_new_N2: forward 8-point ADST, N2 (half) variant.
 *
 * Runs the full sign-flip + butterfly lattice of the 8-point ADST, but the
 * last two stages compute only the terms feeding the first 4 output
 * coefficients; output[4..7] are not written by the final stage.
 *
 * input/output : 8-element buffers; must not alias (asserted below, since
 *                output is used as scratch between stages).
 * cos_bit      : precision of the cosine table consumed by half_btf().
 * stage_range  : unused in this variant.
 */
void svt_av1_fadst8_new_N2(const int32_t *input, int32_t *output, int8_t cos_bit,
                           const int8_t *stage_range) {
    (void)stage_range;
    const int32_t *cospi;

    /* Stages ping-pong between output and step. */
    int32_t *bf0, *bf1;
    int32_t step[8];

    // stage 0;

    // stage 1;  input permutation with alternating sign flips
    assert(output != input);
    bf1 = output;
    bf1[0] = input[0];
    bf1[1] = -input[7];
    bf1[2] = -input[3];
    bf1[3] = input[4];
    bf1[4] = -input[1];
    bf1[5] = input[6];
    bf1[6] = input[2];
    bf1[7] = -input[5];

    // stage 2:  half-butterfly rotations on the (2,3) and (6,7) pairs
    cospi = cospi_arr(cos_bit);
    bf0 = output;
    bf1 = step;
    bf1[0] = bf0[0];
    bf1[1] = bf0[1];
    bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
    bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
    bf1[4] = bf0[4];
    bf1[5] = bf0[5];
    bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
    bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);

    // stage 3:  add/subtract butterflies at stride 2
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0] + bf0[2];
    bf1[1] = bf0[1] + bf0[3];
    bf1[2] = bf0[0] - bf0[2];
    bf1[3] = bf0[1] - bf0[3];
    bf1[4] = bf0[4] + bf0[6];
    bf1[5] = bf0[5] + bf0[7];
    bf1[6] = bf0[4] - bf0[6];
    bf1[7] = bf0[5] - bf0[7];

    // stage 4:  rotations with cospi[16]/cospi[48] on the upper half
    cospi = cospi_arr(cos_bit);
    bf0 = output;
    bf1 = step;
    bf1[0] = bf0[0];
    bf1[1] = bf0[1];
    bf1[2] = bf0[2];
    bf1[3] = bf0[3];
    bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
    bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
    bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
    bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);

    // stage 5:  add/subtract butterflies at stride 4
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0] + bf0[4];
    bf1[1] = bf0[1] + bf0[5];
    bf1[2] = bf0[2] + bf0[6];
    bf1[3] = bf0[3] + bf0[7];
    bf1[4] = bf0[0] - bf0[4];
    bf1[5] = bf0[1] - bf0[5];
    bf1[6] = bf0[2] - bf0[6];
    bf1[7] = bf0[3] - bf0[7];

    // stage 6:  N2 pruning — only the entries consumed by stage 7 are computed
    cospi = cospi_arr(cos_bit);
    bf0 = output;
    bf1 = step;
    bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit);
    bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit);
    bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit);
    bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit);

    // stage 7:  final permutation; only output[0..3] are written (N2)
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[1];
    bf1[1] = bf0[6];
    bf1[2] = bf0[3];
    bf1[3] = bf0[4];
}
4671
/*
 * svt_av1_fdct8_new_N2: forward 8-point DCT, N2 (half) variant.
 * Same butterfly network as the full 8-point forward DCT, but the last two
 * stages compute only the terms feeding the first 4 output coefficients;
 * output[4..7] are not written by the final stage.
 */
void svt_av1_fdct8_new_N2(const int32_t *input, int32_t *output, int8_t cos_bit,
                          const int8_t *stage_range) {
    (void)stage_range;
    const int32_t *cospi = cospi_arr(cos_bit);
    int32_t tmp[8];
    int32_t *src;
    int32_t *dst;

    /* stage 1: butterflies of mirrored input pairs (i, 7 - i) */
    dst = output;
    dst[0] = input[0] + input[7];
    dst[1] = input[1] + input[6];
    dst[2] = input[2] + input[5];
    dst[3] = input[3] + input[4];
    dst[4] = -input[4] + input[3];
    dst[5] = -input[5] + input[2];
    dst[6] = -input[6] + input[1];
    dst[7] = -input[7] + input[0];

    /* stage 2: split the even half; rotate the middle odd pair */
    src = output;
    dst = tmp;
    dst[0] = src[0] + src[3];
    dst[1] = src[1] + src[2];
    dst[2] = -src[2] + src[1];
    dst[3] = -src[3] + src[0];
    dst[4] = src[4];
    dst[5] = half_btf(-cospi[32], src[5], cospi[32], src[6], cos_bit);
    dst[6] = half_btf(cospi[32], src[6], cospi[32], src[5], cos_bit);
    dst[7] = src[7];

    /* stage 3 */
    src = tmp;
    dst = output;
    dst[0] = half_btf(cospi[32], src[0], cospi[32], src[1], cos_bit);
    dst[2] = half_btf(cospi[48], src[2], cospi[16], src[3], cos_bit);
    dst[4] = src[4] + src[5];
    dst[5] = -src[5] + src[4];
    dst[6] = -src[6] + src[7];
    dst[7] = src[7] + src[6];

    /* stage 4: only the terms needed for the first four coefficients */
    src = output;
    dst = tmp;
    dst[0] = src[0];
    dst[2] = src[2];
    dst[4] = half_btf(cospi[56], src[4], cospi[8], src[7], cos_bit);
    dst[6] = half_btf(cospi[24], src[6], -cospi[40], src[5], cos_bit);

    /* stage 5: final reorder; only output[0..3] are written (N2) */
    src = tmp;
    dst = output;
    dst[0] = src[0];
    dst[1] = src[4];
    dst[2] = src[2];
    dst[3] = src[6];
}
4734
svt_av1_fidentity4_N2_c(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)4735 void svt_av1_fidentity4_N2_c(const int32_t *input, int32_t *output, int8_t cos_bit,
4736 const int8_t *stage_range) {
4737 (void)stage_range;
4738 (void)cos_bit;
4739 output[0] = round_shift((int64_t)input[0] * new_sqrt2, new_sqrt2_bits);
4740 output[1] = round_shift((int64_t)input[1] * new_sqrt2, new_sqrt2_bits);
4741 assert(stage_range[0] + new_sqrt2_bits <= 32);
4742 }
4743
/*
 * svt_av1_fadst4_new_N2: forward 4-point ADST, N2 (half) variant — only the
 * first 2 output coefficients are produced (all four are zeroed for an
 * all-zero input). Uses the sinpi table selected by cos_bit.
 */
void svt_av1_fadst4_new_N2(const int32_t *input, int32_t *output, int8_t cos_bit,
                           const int8_t *stage_range) {
    (void)stage_range;
    const int32_t *sinpi = sinpi_arr(cos_bit);

    const int32_t in0 = input[0];
    const int32_t in1 = input[1];
    const int32_t in2 = input[2];
    const int32_t in3 = input[3];

    /* An all-zero input short-circuits to an all-zero output. */
    if (!(in0 | in1 | in2 | in3)) {
        output[0] = 0;
        output[1] = 0;
        output[2] = 0;
        output[3] = 0;
        return;
    }

    /* Accumulate the two surviving coefficients; the addition order matches
     * the staged reference flow (s0+s2, +s5, +s4). */
    const int32_t acc0 = sinpi[1] * in0 + sinpi[2] * in1 + sinpi[4] * in3 + sinpi[3] * in2;
    const int32_t acc1 = sinpi[3] * (in0 + in1 - in3);

    // 1-D transform scaling factor is sqrt(2).
    output[0] = round_shift(acc0, cos_bit);
    output[1] = round_shift(acc1, cos_bit);
}
4787
/*
 * svt_av1_fdct4_new_N2: forward 4-point DCT, N2 (half) variant — only the
 * first 2 output coefficients are produced.
 */
void svt_av1_fdct4_new_N2(const int32_t *input, int32_t *output, int8_t cos_bit,
                          const int8_t *stage_range) {
    (void)stage_range;
    const int32_t *cospi = cospi_arr(cos_bit);

    /* stage 1: butterflies of mirrored input pairs */
    const int32_t sum03  = input[0] + input[3];
    const int32_t sum12  = input[1] + input[2];
    const int32_t diff12 = input[1] - input[2];
    const int32_t diff03 = input[0] - input[3];

    /* stage 2: rotate; only the two surviving coefficients are emitted */
    output[0] = half_btf(cospi[32], sum03, cospi[32], sum12, cos_bit);
    output[1] = half_btf(cospi[48], diff12, cospi[16], diff03, cos_bit);
}
4809
svt_av1_fdct32_new_N2(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)4810 void svt_av1_fdct32_new_N2(const int32_t *input, int32_t *output, int8_t cos_bit,
4811 const int8_t *stage_range) {
4812 (void)stage_range;
4813 const int32_t *cospi;
4814
4815 int32_t *bf0, *bf1;
4816 int32_t step[32];
4817
4818 // stage 0;
4819
4820 // stage 1;
4821 bf1 = output;
4822 bf1[0] = input[0] + input[31];
4823 bf1[1] = input[1] + input[30];
4824 bf1[2] = input[2] + input[29];
4825 bf1[3] = input[3] + input[28];
4826 bf1[4] = input[4] + input[27];
4827 bf1[5] = input[5] + input[26];
4828 bf1[6] = input[6] + input[25];
4829 bf1[7] = input[7] + input[24];
4830 bf1[8] = input[8] + input[23];
4831 bf1[9] = input[9] + input[22];
4832 bf1[10] = input[10] + input[21];
4833 bf1[11] = input[11] + input[20];
4834 bf1[12] = input[12] + input[19];
4835 bf1[13] = input[13] + input[18];
4836 bf1[14] = input[14] + input[17];
4837 bf1[15] = input[15] + input[16];
4838 bf1[16] = -input[16] + input[15];
4839 bf1[17] = -input[17] + input[14];
4840 bf1[18] = -input[18] + input[13];
4841 bf1[19] = -input[19] + input[12];
4842 bf1[20] = -input[20] + input[11];
4843 bf1[21] = -input[21] + input[10];
4844 bf1[22] = -input[22] + input[9];
4845 bf1[23] = -input[23] + input[8];
4846 bf1[24] = -input[24] + input[7];
4847 bf1[25] = -input[25] + input[6];
4848 bf1[26] = -input[26] + input[5];
4849 bf1[27] = -input[27] + input[4];
4850 bf1[28] = -input[28] + input[3];
4851 bf1[29] = -input[29] + input[2];
4852 bf1[30] = -input[30] + input[1];
4853 bf1[31] = -input[31] + input[0];
4854
4855 // stage 2
4856 cospi = cospi_arr(cos_bit);
4857 bf0 = output;
4858 bf1 = step;
4859 bf1[0] = bf0[0] + bf0[15];
4860 bf1[1] = bf0[1] + bf0[14];
4861 bf1[2] = bf0[2] + bf0[13];
4862 bf1[3] = bf0[3] + bf0[12];
4863 bf1[4] = bf0[4] + bf0[11];
4864 bf1[5] = bf0[5] + bf0[10];
4865 bf1[6] = bf0[6] + bf0[9];
4866 bf1[7] = bf0[7] + bf0[8];
4867 bf1[8] = -bf0[8] + bf0[7];
4868 bf1[9] = -bf0[9] + bf0[6];
4869 bf1[10] = -bf0[10] + bf0[5];
4870 bf1[11] = -bf0[11] + bf0[4];
4871 bf1[12] = -bf0[12] + bf0[3];
4872 bf1[13] = -bf0[13] + bf0[2];
4873 bf1[14] = -bf0[14] + bf0[1];
4874 bf1[15] = -bf0[15] + bf0[0];
4875 bf1[16] = bf0[16];
4876 bf1[17] = bf0[17];
4877 bf1[18] = bf0[18];
4878 bf1[19] = bf0[19];
4879 bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
4880 bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
4881 bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
4882 bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
4883 bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
4884 bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
4885 bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
4886 bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
4887 bf1[28] = bf0[28];
4888 bf1[29] = bf0[29];
4889 bf1[30] = bf0[30];
4890 bf1[31] = bf0[31];
4891
4892 // stage 3
4893 cospi = cospi_arr(cos_bit);
4894 bf0 = step;
4895 bf1 = output;
4896 bf1[0] = bf0[0] + bf0[7];
4897 bf1[1] = bf0[1] + bf0[6];
4898 bf1[2] = bf0[2] + bf0[5];
4899 bf1[3] = bf0[3] + bf0[4];
4900 bf1[4] = -bf0[4] + bf0[3];
4901 bf1[5] = -bf0[5] + bf0[2];
4902 bf1[6] = -bf0[6] + bf0[1];
4903 bf1[7] = -bf0[7] + bf0[0];
4904 bf1[8] = bf0[8];
4905 bf1[9] = bf0[9];
4906 bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
4907 bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
4908 bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
4909 bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
4910 bf1[14] = bf0[14];
4911 bf1[15] = bf0[15];
4912 bf1[16] = bf0[16] + bf0[23];
4913 bf1[17] = bf0[17] + bf0[22];
4914 bf1[18] = bf0[18] + bf0[21];
4915 bf1[19] = bf0[19] + bf0[20];
4916 bf1[20] = -bf0[20] + bf0[19];
4917 bf1[21] = -bf0[21] + bf0[18];
4918 bf1[22] = -bf0[22] + bf0[17];
4919 bf1[23] = -bf0[23] + bf0[16];
4920 bf1[24] = -bf0[24] + bf0[31];
4921 bf1[25] = -bf0[25] + bf0[30];
4922 bf1[26] = -bf0[26] + bf0[29];
4923 bf1[27] = -bf0[27] + bf0[28];
4924 bf1[28] = bf0[28] + bf0[27];
4925 bf1[29] = bf0[29] + bf0[26];
4926 bf1[30] = bf0[30] + bf0[25];
4927 bf1[31] = bf0[31] + bf0[24];
4928
4929 // stage 4
4930 cospi = cospi_arr(cos_bit);
4931 bf0 = output;
4932 bf1 = step;
4933 bf1[0] = bf0[0] + bf0[3];
4934 bf1[1] = bf0[1] + bf0[2];
4935 bf1[2] = -bf0[2] + bf0[1];
4936 bf1[3] = -bf0[3] + bf0[0];
4937 bf1[4] = bf0[4];
4938 bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
4939 bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
4940 bf1[7] = bf0[7];
4941 bf1[8] = bf0[8] + bf0[11];
4942 bf1[9] = bf0[9] + bf0[10];
4943 bf1[10] = -bf0[10] + bf0[9];
4944 bf1[11] = -bf0[11] + bf0[8];
4945 bf1[12] = -bf0[12] + bf0[15];
4946 bf1[13] = -bf0[13] + bf0[14];
4947 bf1[14] = bf0[14] + bf0[13];
4948 bf1[15] = bf0[15] + bf0[12];
4949 bf1[16] = bf0[16];
4950 bf1[17] = bf0[17];
4951 bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
4952 bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
4953 bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
4954 bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
4955 bf1[22] = bf0[22];
4956 bf1[23] = bf0[23];
4957 bf1[24] = bf0[24];
4958 bf1[25] = bf0[25];
4959 bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
4960 bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
4961 bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
4962 bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
4963 bf1[30] = bf0[30];
4964 bf1[31] = bf0[31];
4965
4966 // stage 5
4967 cospi = cospi_arr(cos_bit);
4968 bf0 = step;
4969 bf1 = output;
4970 bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
4971 bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
4972 bf1[4] = bf0[4] + bf0[5];
4973 bf1[5] = -bf0[5] + bf0[4];
4974 bf1[6] = -bf0[6] + bf0[7];
4975 bf1[7] = bf0[7] + bf0[6];
4976 bf1[8] = bf0[8];
4977 bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
4978 bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
4979 bf1[11] = bf0[11];
4980 bf1[12] = bf0[12];
4981 bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
4982 bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
4983 bf1[15] = bf0[15];
4984 bf1[16] = bf0[16] + bf0[19];
4985 bf1[17] = bf0[17] + bf0[18];
4986 bf1[18] = -bf0[18] + bf0[17];
4987 bf1[19] = -bf0[19] + bf0[16];
4988 bf1[20] = -bf0[20] + bf0[23];
4989 bf1[21] = -bf0[21] + bf0[22];
4990 bf1[22] = bf0[22] + bf0[21];
4991 bf1[23] = bf0[23] + bf0[20];
4992 bf1[24] = bf0[24] + bf0[27];
4993 bf1[25] = bf0[25] + bf0[26];
4994 bf1[26] = -bf0[26] + bf0[25];
4995 bf1[27] = -bf0[27] + bf0[24];
4996 bf1[28] = -bf0[28] + bf0[31];
4997 bf1[29] = -bf0[29] + bf0[30];
4998 bf1[30] = bf0[30] + bf0[29];
4999 bf1[31] = bf0[31] + bf0[28];
5000
5001 // stage 6
5002 cospi = cospi_arr(cos_bit);
5003 bf0 = output;
5004 bf1 = step;
5005 bf1[0] = bf0[0];
5006 bf1[2] = bf0[2];
5007 bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
5008 bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
5009 bf1[8] = bf0[8] + bf0[9];
5010 bf1[9] = -bf0[9] + bf0[8];
5011 bf1[10] = -bf0[10] + bf0[11];
5012 bf1[11] = bf0[11] + bf0[10];
5013 bf1[12] = bf0[12] + bf0[13];
5014 bf1[13] = -bf0[13] + bf0[12];
5015 bf1[14] = -bf0[14] + bf0[15];
5016 bf1[15] = bf0[15] + bf0[14];
5017 bf1[16] = bf0[16];
5018 bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
5019 bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
5020 bf1[19] = bf0[19];
5021 bf1[20] = bf0[20];
5022 bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
5023 bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
5024 bf1[23] = bf0[23];
5025 bf1[24] = bf0[24];
5026 bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
5027 bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
5028 bf1[27] = bf0[27];
5029 bf1[28] = bf0[28];
5030 bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
5031 bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
5032 bf1[31] = bf0[31];
5033
5034 // stage 7
5035 cospi = cospi_arr(cos_bit);
5036 bf0 = step;
5037 bf1 = output;
5038 bf1[0] = bf0[0];
5039 bf1[2] = bf0[2];
5040 bf1[4] = bf0[4];
5041 bf1[6] = bf0[6];
5042 bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
5043 bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
5044 bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
5045 bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
5046 bf1[16] = bf0[16] + bf0[17];
5047 bf1[17] = -bf0[17] + bf0[16];
5048 bf1[18] = -bf0[18] + bf0[19];
5049 bf1[19] = bf0[19] + bf0[18];
5050 bf1[20] = bf0[20] + bf0[21];
5051 bf1[21] = -bf0[21] + bf0[20];
5052 bf1[22] = -bf0[22] + bf0[23];
5053 bf1[23] = bf0[23] + bf0[22];
5054 bf1[24] = bf0[24] + bf0[25];
5055 bf1[25] = -bf0[25] + bf0[24];
5056 bf1[26] = -bf0[26] + bf0[27];
5057 bf1[27] = bf0[27] + bf0[26];
5058 bf1[28] = bf0[28] + bf0[29];
5059 bf1[29] = -bf0[29] + bf0[28];
5060 bf1[30] = -bf0[30] + bf0[31];
5061 bf1[31] = bf0[31] + bf0[30];
5062
5063 // stage 8
5064 cospi = cospi_arr(cos_bit);
5065 bf0 = output;
5066 bf1 = step;
5067 bf1[0] = bf0[0];
5068 bf1[2] = bf0[2];
5069 bf1[4] = bf0[4];
5070 bf1[6] = bf0[6];
5071 bf1[8] = bf0[8];
5072 bf1[10] = bf0[10];
5073 bf1[12] = bf0[12];
5074 bf1[14] = bf0[14];
5075 bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
5076 bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
5077 bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
5078 bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
5079 bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
5080 bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
5081 bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
5082 bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
5083
5084 // stage 9
5085 bf0 = step;
5086 bf1 = output;
5087 bf1[0] = bf0[0];
5088 bf1[1] = bf0[16];
5089 bf1[2] = bf0[8];
5090 bf1[3] = bf0[24];
5091 bf1[4] = bf0[4];
5092 bf1[5] = bf0[20];
5093 bf1[6] = bf0[12];
5094 bf1[7] = bf0[28];
5095 bf1[8] = bf0[2];
5096 bf1[9] = bf0[18];
5097 bf1[10] = bf0[10];
5098 bf1[11] = bf0[26];
5099 bf1[12] = bf0[6];
5100 bf1[13] = bf0[22];
5101 bf1[14] = bf0[14];
5102 bf1[15] = bf0[30];
5103 }
5104
svt_av1_fidentity32_N2_c(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)5105 void svt_av1_fidentity32_N2_c(const int32_t *input, int32_t *output, int8_t cos_bit,
5106 const int8_t *stage_range) {
5107 (void)stage_range;
5108 (void)cos_bit;
5109 for (int32_t i = 0; i < 16; ++i) output[i] = input[i] * 4;
5110 }
5111
svt_av1_fdct64_new_N2(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)5112 void svt_av1_fdct64_new_N2(const int32_t *input, int32_t *output, int8_t cos_bit,
5113 const int8_t *stage_range) {
5114 (void)stage_range;
5115 const int32_t *cospi;
5116
5117 int32_t *bf0, *bf1;
5118 int32_t step[64];
5119
5120 // stage 0;
5121
5122 // stage 1;
5123 bf1 = output;
5124 bf1[0] = input[0] + input[63];
5125 bf1[1] = input[1] + input[62];
5126 bf1[2] = input[2] + input[61];
5127 bf1[3] = input[3] + input[60];
5128 bf1[4] = input[4] + input[59];
5129 bf1[5] = input[5] + input[58];
5130 bf1[6] = input[6] + input[57];
5131 bf1[7] = input[7] + input[56];
5132 bf1[8] = input[8] + input[55];
5133 bf1[9] = input[9] + input[54];
5134 bf1[10] = input[10] + input[53];
5135 bf1[11] = input[11] + input[52];
5136 bf1[12] = input[12] + input[51];
5137 bf1[13] = input[13] + input[50];
5138 bf1[14] = input[14] + input[49];
5139 bf1[15] = input[15] + input[48];
5140 bf1[16] = input[16] + input[47];
5141 bf1[17] = input[17] + input[46];
5142 bf1[18] = input[18] + input[45];
5143 bf1[19] = input[19] + input[44];
5144 bf1[20] = input[20] + input[43];
5145 bf1[21] = input[21] + input[42];
5146 bf1[22] = input[22] + input[41];
5147 bf1[23] = input[23] + input[40];
5148 bf1[24] = input[24] + input[39];
5149 bf1[25] = input[25] + input[38];
5150 bf1[26] = input[26] + input[37];
5151 bf1[27] = input[27] + input[36];
5152 bf1[28] = input[28] + input[35];
5153 bf1[29] = input[29] + input[34];
5154 bf1[30] = input[30] + input[33];
5155 bf1[31] = input[31] + input[32];
5156 bf1[32] = -input[32] + input[31];
5157 bf1[33] = -input[33] + input[30];
5158 bf1[34] = -input[34] + input[29];
5159 bf1[35] = -input[35] + input[28];
5160 bf1[36] = -input[36] + input[27];
5161 bf1[37] = -input[37] + input[26];
5162 bf1[38] = -input[38] + input[25];
5163 bf1[39] = -input[39] + input[24];
5164 bf1[40] = -input[40] + input[23];
5165 bf1[41] = -input[41] + input[22];
5166 bf1[42] = -input[42] + input[21];
5167 bf1[43] = -input[43] + input[20];
5168 bf1[44] = -input[44] + input[19];
5169 bf1[45] = -input[45] + input[18];
5170 bf1[46] = -input[46] + input[17];
5171 bf1[47] = -input[47] + input[16];
5172 bf1[48] = -input[48] + input[15];
5173 bf1[49] = -input[49] + input[14];
5174 bf1[50] = -input[50] + input[13];
5175 bf1[51] = -input[51] + input[12];
5176 bf1[52] = -input[52] + input[11];
5177 bf1[53] = -input[53] + input[10];
5178 bf1[54] = -input[54] + input[9];
5179 bf1[55] = -input[55] + input[8];
5180 bf1[56] = -input[56] + input[7];
5181 bf1[57] = -input[57] + input[6];
5182 bf1[58] = -input[58] + input[5];
5183 bf1[59] = -input[59] + input[4];
5184 bf1[60] = -input[60] + input[3];
5185 bf1[61] = -input[61] + input[2];
5186 bf1[62] = -input[62] + input[1];
5187 bf1[63] = -input[63] + input[0];
5188
5189 // stage 2
5190 cospi = cospi_arr(cos_bit);
5191 bf0 = output;
5192 bf1 = step;
5193 bf1[0] = bf0[0] + bf0[31];
5194 bf1[1] = bf0[1] + bf0[30];
5195 bf1[2] = bf0[2] + bf0[29];
5196 bf1[3] = bf0[3] + bf0[28];
5197 bf1[4] = bf0[4] + bf0[27];
5198 bf1[5] = bf0[5] + bf0[26];
5199 bf1[6] = bf0[6] + bf0[25];
5200 bf1[7] = bf0[7] + bf0[24];
5201 bf1[8] = bf0[8] + bf0[23];
5202 bf1[9] = bf0[9] + bf0[22];
5203 bf1[10] = bf0[10] + bf0[21];
5204 bf1[11] = bf0[11] + bf0[20];
5205 bf1[12] = bf0[12] + bf0[19];
5206 bf1[13] = bf0[13] + bf0[18];
5207 bf1[14] = bf0[14] + bf0[17];
5208 bf1[15] = bf0[15] + bf0[16];
5209 bf1[16] = -bf0[16] + bf0[15];
5210 bf1[17] = -bf0[17] + bf0[14];
5211 bf1[18] = -bf0[18] + bf0[13];
5212 bf1[19] = -bf0[19] + bf0[12];
5213 bf1[20] = -bf0[20] + bf0[11];
5214 bf1[21] = -bf0[21] + bf0[10];
5215 bf1[22] = -bf0[22] + bf0[9];
5216 bf1[23] = -bf0[23] + bf0[8];
5217 bf1[24] = -bf0[24] + bf0[7];
5218 bf1[25] = -bf0[25] + bf0[6];
5219 bf1[26] = -bf0[26] + bf0[5];
5220 bf1[27] = -bf0[27] + bf0[4];
5221 bf1[28] = -bf0[28] + bf0[3];
5222 bf1[29] = -bf0[29] + bf0[2];
5223 bf1[30] = -bf0[30] + bf0[1];
5224 bf1[31] = -bf0[31] + bf0[0];
5225 bf1[32] = bf0[32];
5226 bf1[33] = bf0[33];
5227 bf1[34] = bf0[34];
5228 bf1[35] = bf0[35];
5229 bf1[36] = bf0[36];
5230 bf1[37] = bf0[37];
5231 bf1[38] = bf0[38];
5232 bf1[39] = bf0[39];
5233 bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
5234 bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
5235 bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
5236 bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
5237 bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
5238 bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
5239 bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
5240 bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
5241 bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit);
5242 bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit);
5243 bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit);
5244 bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit);
5245 bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit);
5246 bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit);
5247 bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit);
5248 bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit);
5249 bf1[56] = bf0[56];
5250 bf1[57] = bf0[57];
5251 bf1[58] = bf0[58];
5252 bf1[59] = bf0[59];
5253 bf1[60] = bf0[60];
5254 bf1[61] = bf0[61];
5255 bf1[62] = bf0[62];
5256 bf1[63] = bf0[63];
5257
5258 // stage 3
5259 cospi = cospi_arr(cos_bit);
5260 bf0 = step;
5261 bf1 = output;
5262 bf1[0] = bf0[0] + bf0[15];
5263 bf1[1] = bf0[1] + bf0[14];
5264 bf1[2] = bf0[2] + bf0[13];
5265 bf1[3] = bf0[3] + bf0[12];
5266 bf1[4] = bf0[4] + bf0[11];
5267 bf1[5] = bf0[5] + bf0[10];
5268 bf1[6] = bf0[6] + bf0[9];
5269 bf1[7] = bf0[7] + bf0[8];
5270 bf1[8] = -bf0[8] + bf0[7];
5271 bf1[9] = -bf0[9] + bf0[6];
5272 bf1[10] = -bf0[10] + bf0[5];
5273 bf1[11] = -bf0[11] + bf0[4];
5274 bf1[12] = -bf0[12] + bf0[3];
5275 bf1[13] = -bf0[13] + bf0[2];
5276 bf1[14] = -bf0[14] + bf0[1];
5277 bf1[15] = -bf0[15] + bf0[0];
5278 bf1[16] = bf0[16];
5279 bf1[17] = bf0[17];
5280 bf1[18] = bf0[18];
5281 bf1[19] = bf0[19];
5282 bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
5283 bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
5284 bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
5285 bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
5286 bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
5287 bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
5288 bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
5289 bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
5290 bf1[28] = bf0[28];
5291 bf1[29] = bf0[29];
5292 bf1[30] = bf0[30];
5293 bf1[31] = bf0[31];
5294 bf1[32] = bf0[32] + bf0[47];
5295 bf1[33] = bf0[33] + bf0[46];
5296 bf1[34] = bf0[34] + bf0[45];
5297 bf1[35] = bf0[35] + bf0[44];
5298 bf1[36] = bf0[36] + bf0[43];
5299 bf1[37] = bf0[37] + bf0[42];
5300 bf1[38] = bf0[38] + bf0[41];
5301 bf1[39] = bf0[39] + bf0[40];
5302 bf1[40] = -bf0[40] + bf0[39];
5303 bf1[41] = -bf0[41] + bf0[38];
5304 bf1[42] = -bf0[42] + bf0[37];
5305 bf1[43] = -bf0[43] + bf0[36];
5306 bf1[44] = -bf0[44] + bf0[35];
5307 bf1[45] = -bf0[45] + bf0[34];
5308 bf1[46] = -bf0[46] + bf0[33];
5309 bf1[47] = -bf0[47] + bf0[32];
5310 bf1[48] = -bf0[48] + bf0[63];
5311 bf1[49] = -bf0[49] + bf0[62];
5312 bf1[50] = -bf0[50] + bf0[61];
5313 bf1[51] = -bf0[51] + bf0[60];
5314 bf1[52] = -bf0[52] + bf0[59];
5315 bf1[53] = -bf0[53] + bf0[58];
5316 bf1[54] = -bf0[54] + bf0[57];
5317 bf1[55] = -bf0[55] + bf0[56];
5318 bf1[56] = bf0[56] + bf0[55];
5319 bf1[57] = bf0[57] + bf0[54];
5320 bf1[58] = bf0[58] + bf0[53];
5321 bf1[59] = bf0[59] + bf0[52];
5322 bf1[60] = bf0[60] + bf0[51];
5323 bf1[61] = bf0[61] + bf0[50];
5324 bf1[62] = bf0[62] + bf0[49];
5325 bf1[63] = bf0[63] + bf0[48];
5326
5327 // stage 4
5328 cospi = cospi_arr(cos_bit);
5329 bf0 = output;
5330 bf1 = step;
5331 bf1[0] = bf0[0] + bf0[7];
5332 bf1[1] = bf0[1] + bf0[6];
5333 bf1[2] = bf0[2] + bf0[5];
5334 bf1[3] = bf0[3] + bf0[4];
5335 bf1[4] = -bf0[4] + bf0[3];
5336 bf1[5] = -bf0[5] + bf0[2];
5337 bf1[6] = -bf0[6] + bf0[1];
5338 bf1[7] = -bf0[7] + bf0[0];
5339 bf1[8] = bf0[8];
5340 bf1[9] = bf0[9];
5341 bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
5342 bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
5343 bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
5344 bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
5345 bf1[14] = bf0[14];
5346 bf1[15] = bf0[15];
5347 bf1[16] = bf0[16] + bf0[23];
5348 bf1[17] = bf0[17] + bf0[22];
5349 bf1[18] = bf0[18] + bf0[21];
5350 bf1[19] = bf0[19] + bf0[20];
5351 bf1[20] = -bf0[20] + bf0[19];
5352 bf1[21] = -bf0[21] + bf0[18];
5353 bf1[22] = -bf0[22] + bf0[17];
5354 bf1[23] = -bf0[23] + bf0[16];
5355 bf1[24] = -bf0[24] + bf0[31];
5356 bf1[25] = -bf0[25] + bf0[30];
5357 bf1[26] = -bf0[26] + bf0[29];
5358 bf1[27] = -bf0[27] + bf0[28];
5359 bf1[28] = bf0[28] + bf0[27];
5360 bf1[29] = bf0[29] + bf0[26];
5361 bf1[30] = bf0[30] + bf0[25];
5362 bf1[31] = bf0[31] + bf0[24];
5363 bf1[32] = bf0[32];
5364 bf1[33] = bf0[33];
5365 bf1[34] = bf0[34];
5366 bf1[35] = bf0[35];
5367 bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit);
5368 bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit);
5369 bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit);
5370 bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit);
5371 bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit);
5372 bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit);
5373 bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit);
5374 bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit);
5375 bf1[44] = bf0[44];
5376 bf1[45] = bf0[45];
5377 bf1[46] = bf0[46];
5378 bf1[47] = bf0[47];
5379 bf1[48] = bf0[48];
5380 bf1[49] = bf0[49];
5381 bf1[50] = bf0[50];
5382 bf1[51] = bf0[51];
5383 bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit);
5384 bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit);
5385 bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit);
5386 bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit);
5387 bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit);
5388 bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit);
5389 bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit);
5390 bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit);
5391 bf1[60] = bf0[60];
5392 bf1[61] = bf0[61];
5393 bf1[62] = bf0[62];
5394 bf1[63] = bf0[63];
5395
5396 // stage 5
5397 cospi = cospi_arr(cos_bit);
5398 bf0 = step;
5399 bf1 = output;
5400 bf1[0] = bf0[0] + bf0[3];
5401 bf1[1] = bf0[1] + bf0[2];
5402 bf1[2] = -bf0[2] + bf0[1];
5403 bf1[3] = -bf0[3] + bf0[0];
5404 bf1[4] = bf0[4];
5405 bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
5406 bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
5407 bf1[7] = bf0[7];
5408 bf1[8] = bf0[8] + bf0[11];
5409 bf1[9] = bf0[9] + bf0[10];
5410 bf1[10] = -bf0[10] + bf0[9];
5411 bf1[11] = -bf0[11] + bf0[8];
5412 bf1[12] = -bf0[12] + bf0[15];
5413 bf1[13] = -bf0[13] + bf0[14];
5414 bf1[14] = bf0[14] + bf0[13];
5415 bf1[15] = bf0[15] + bf0[12];
5416 bf1[16] = bf0[16];
5417 bf1[17] = bf0[17];
5418 bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
5419 bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
5420 bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
5421 bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
5422 bf1[22] = bf0[22];
5423 bf1[23] = bf0[23];
5424 bf1[24] = bf0[24];
5425 bf1[25] = bf0[25];
5426 bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
5427 bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
5428 bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
5429 bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
5430 bf1[30] = bf0[30];
5431 bf1[31] = bf0[31];
5432 bf1[32] = bf0[32] + bf0[39];
5433 bf1[33] = bf0[33] + bf0[38];
5434 bf1[34] = bf0[34] + bf0[37];
5435 bf1[35] = bf0[35] + bf0[36];
5436 bf1[36] = -bf0[36] + bf0[35];
5437 bf1[37] = -bf0[37] + bf0[34];
5438 bf1[38] = -bf0[38] + bf0[33];
5439 bf1[39] = -bf0[39] + bf0[32];
5440 bf1[40] = -bf0[40] + bf0[47];
5441 bf1[41] = -bf0[41] + bf0[46];
5442 bf1[42] = -bf0[42] + bf0[45];
5443 bf1[43] = -bf0[43] + bf0[44];
5444 bf1[44] = bf0[44] + bf0[43];
5445 bf1[45] = bf0[45] + bf0[42];
5446 bf1[46] = bf0[46] + bf0[41];
5447 bf1[47] = bf0[47] + bf0[40];
5448 bf1[48] = bf0[48] + bf0[55];
5449 bf1[49] = bf0[49] + bf0[54];
5450 bf1[50] = bf0[50] + bf0[53];
5451 bf1[51] = bf0[51] + bf0[52];
5452 bf1[52] = -bf0[52] + bf0[51];
5453 bf1[53] = -bf0[53] + bf0[50];
5454 bf1[54] = -bf0[54] + bf0[49];
5455 bf1[55] = -bf0[55] + bf0[48];
5456 bf1[56] = -bf0[56] + bf0[63];
5457 bf1[57] = -bf0[57] + bf0[62];
5458 bf1[58] = -bf0[58] + bf0[61];
5459 bf1[59] = -bf0[59] + bf0[60];
5460 bf1[60] = bf0[60] + bf0[59];
5461 bf1[61] = bf0[61] + bf0[58];
5462 bf1[62] = bf0[62] + bf0[57];
5463 bf1[63] = bf0[63] + bf0[56];
5464
5465 // stage 6
5466 cospi = cospi_arr(cos_bit);
5467 bf0 = output;
5468 bf1 = step;
5469 bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
5470 bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
5471 bf1[4] = bf0[4] + bf0[5];
5472 bf1[5] = -bf0[5] + bf0[4];
5473 bf1[6] = -bf0[6] + bf0[7];
5474 bf1[7] = bf0[7] + bf0[6];
5475 bf1[8] = bf0[8];
5476 bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
5477 bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
5478 bf1[11] = bf0[11];
5479 bf1[12] = bf0[12];
5480 bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
5481 bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
5482 bf1[15] = bf0[15];
5483 bf1[16] = bf0[16] + bf0[19];
5484 bf1[17] = bf0[17] + bf0[18];
5485 bf1[18] = -bf0[18] + bf0[17];
5486 bf1[19] = -bf0[19] + bf0[16];
5487 bf1[20] = -bf0[20] + bf0[23];
5488 bf1[21] = -bf0[21] + bf0[22];
5489 bf1[22] = bf0[22] + bf0[21];
5490 bf1[23] = bf0[23] + bf0[20];
5491 bf1[24] = bf0[24] + bf0[27];
5492 bf1[25] = bf0[25] + bf0[26];
5493 bf1[26] = -bf0[26] + bf0[25];
5494 bf1[27] = -bf0[27] + bf0[24];
5495 bf1[28] = -bf0[28] + bf0[31];
5496 bf1[29] = -bf0[29] + bf0[30];
5497 bf1[30] = bf0[30] + bf0[29];
5498 bf1[31] = bf0[31] + bf0[28];
5499 bf1[32] = bf0[32];
5500 bf1[33] = bf0[33];
5501 bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit);
5502 bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit);
5503 bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit);
5504 bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit);
5505 bf1[38] = bf0[38];
5506 bf1[39] = bf0[39];
5507 bf1[40] = bf0[40];
5508 bf1[41] = bf0[41];
5509 bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit);
5510 bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit);
5511 bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit);
5512 bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit);
5513 bf1[46] = bf0[46];
5514 bf1[47] = bf0[47];
5515 bf1[48] = bf0[48];
5516 bf1[49] = bf0[49];
5517 bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit);
5518 bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit);
5519 bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit);
5520 bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit);
5521 bf1[54] = bf0[54];
5522 bf1[55] = bf0[55];
5523 bf1[56] = bf0[56];
5524 bf1[57] = bf0[57];
5525 bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit);
5526 bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit);
5527 bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit);
5528 bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit);
5529 bf1[62] = bf0[62];
5530 bf1[63] = bf0[63];
5531
5532 // stage 7
5533 cospi = cospi_arr(cos_bit);
5534 bf0 = step;
5535 bf1 = output;
5536 bf1[0] = bf0[0];
5537 bf1[2] = bf0[2];
5538 bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
5539 bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
5540 bf1[8] = bf0[8] + bf0[9];
5541 bf1[9] = -bf0[9] + bf0[8];
5542 bf1[10] = -bf0[10] + bf0[11];
5543 bf1[11] = bf0[11] + bf0[10];
5544 bf1[12] = bf0[12] + bf0[13];
5545 bf1[13] = -bf0[13] + bf0[12];
5546 bf1[14] = -bf0[14] + bf0[15];
5547 bf1[15] = bf0[15] + bf0[14];
5548 bf1[16] = bf0[16];
5549 bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
5550 bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
5551 bf1[19] = bf0[19];
5552 bf1[20] = bf0[20];
5553 bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
5554 bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
5555 bf1[23] = bf0[23];
5556 bf1[24] = bf0[24];
5557 bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
5558 bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
5559 bf1[27] = bf0[27];
5560 bf1[28] = bf0[28];
5561 bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
5562 bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
5563 bf1[31] = bf0[31];
5564 bf1[32] = bf0[32] + bf0[35];
5565 bf1[33] = bf0[33] + bf0[34];
5566 bf1[34] = -bf0[34] + bf0[33];
5567 bf1[35] = -bf0[35] + bf0[32];
5568 bf1[36] = -bf0[36] + bf0[39];
5569 bf1[37] = -bf0[37] + bf0[38];
5570 bf1[38] = bf0[38] + bf0[37];
5571 bf1[39] = bf0[39] + bf0[36];
5572 bf1[40] = bf0[40] + bf0[43];
5573 bf1[41] = bf0[41] + bf0[42];
5574 bf1[42] = -bf0[42] + bf0[41];
5575 bf1[43] = -bf0[43] + bf0[40];
5576 bf1[44] = -bf0[44] + bf0[47];
5577 bf1[45] = -bf0[45] + bf0[46];
5578 bf1[46] = bf0[46] + bf0[45];
5579 bf1[47] = bf0[47] + bf0[44];
5580 bf1[48] = bf0[48] + bf0[51];
5581 bf1[49] = bf0[49] + bf0[50];
5582 bf1[50] = -bf0[50] + bf0[49];
5583 bf1[51] = -bf0[51] + bf0[48];
5584 bf1[52] = -bf0[52] + bf0[55];
5585 bf1[53] = -bf0[53] + bf0[54];
5586 bf1[54] = bf0[54] + bf0[53];
5587 bf1[55] = bf0[55] + bf0[52];
5588 bf1[56] = bf0[56] + bf0[59];
5589 bf1[57] = bf0[57] + bf0[58];
5590 bf1[58] = -bf0[58] + bf0[57];
5591 bf1[59] = -bf0[59] + bf0[56];
5592 bf1[60] = -bf0[60] + bf0[63];
5593 bf1[61] = -bf0[61] + bf0[62];
5594 bf1[62] = bf0[62] + bf0[61];
5595 bf1[63] = bf0[63] + bf0[60];
5596
5597 // stage 8
5598 cospi = cospi_arr(cos_bit);
5599 bf0 = output;
5600 bf1 = step;
5601 bf1[0] = bf0[0];
5602 bf1[2] = bf0[2];
5603 bf1[4] = bf0[4];
5604 bf1[6] = bf0[6];
5605 bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
5606 bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
5607 bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
5608 bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
5609 bf1[16] = bf0[16] + bf0[17];
5610 bf1[17] = -bf0[17] + bf0[16];
5611 bf1[18] = -bf0[18] + bf0[19];
5612 bf1[19] = bf0[19] + bf0[18];
5613 bf1[20] = bf0[20] + bf0[21];
5614 bf1[21] = -bf0[21] + bf0[20];
5615 bf1[22] = -bf0[22] + bf0[23];
5616 bf1[23] = bf0[23] + bf0[22];
5617 bf1[24] = bf0[24] + bf0[25];
5618 bf1[25] = -bf0[25] + bf0[24];
5619 bf1[26] = -bf0[26] + bf0[27];
5620 bf1[27] = bf0[27] + bf0[26];
5621 bf1[28] = bf0[28] + bf0[29];
5622 bf1[29] = -bf0[29] + bf0[28];
5623 bf1[30] = -bf0[30] + bf0[31];
5624 bf1[31] = bf0[31] + bf0[30];
5625 bf1[32] = bf0[32];
5626 bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit);
5627 bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit);
5628 bf1[35] = bf0[35];
5629 bf1[36] = bf0[36];
5630 bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit);
5631 bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit);
5632 bf1[39] = bf0[39];
5633 bf1[40] = bf0[40];
5634 bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit);
5635 bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit);
5636 bf1[43] = bf0[43];
5637 bf1[44] = bf0[44];
5638 bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit);
5639 bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit);
5640 bf1[47] = bf0[47];
5641 bf1[48] = bf0[48];
5642 bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit);
5643 bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit);
5644 bf1[51] = bf0[51];
5645 bf1[52] = bf0[52];
5646 bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit);
5647 bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit);
5648 bf1[55] = bf0[55];
5649 bf1[56] = bf0[56];
5650 bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit);
5651 bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit);
5652 bf1[59] = bf0[59];
5653 bf1[60] = bf0[60];
5654 bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit);
5655 bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit);
5656 bf1[63] = bf0[63];
5657
5658 // stage 9
5659 cospi = cospi_arr(cos_bit);
5660 bf0 = step;
5661 bf1 = output;
5662 bf1[0] = bf0[0];
5663 bf1[2] = bf0[2];
5664 bf1[4] = bf0[4];
5665 bf1[6] = bf0[6];
5666 bf1[8] = bf0[8];
5667 bf1[10] = bf0[10];
5668 bf1[12] = bf0[12];
5669 bf1[14] = bf0[14];
5670 bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
5671 bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
5672 bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
5673 bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
5674 bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
5675 bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
5676 bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
5677 bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
5678 bf1[32] = bf0[32] + bf0[33];
5679 bf1[33] = -bf0[33] + bf0[32];
5680 bf1[34] = -bf0[34] + bf0[35];
5681 bf1[35] = bf0[35] + bf0[34];
5682 bf1[36] = bf0[36] + bf0[37];
5683 bf1[37] = -bf0[37] + bf0[36];
5684 bf1[38] = -bf0[38] + bf0[39];
5685 bf1[39] = bf0[39] + bf0[38];
5686 bf1[40] = bf0[40] + bf0[41];
5687 bf1[41] = -bf0[41] + bf0[40];
5688 bf1[42] = -bf0[42] + bf0[43];
5689 bf1[43] = bf0[43] + bf0[42];
5690 bf1[44] = bf0[44] + bf0[45];
5691 bf1[45] = -bf0[45] + bf0[44];
5692 bf1[46] = -bf0[46] + bf0[47];
5693 bf1[47] = bf0[47] + bf0[46];
5694 bf1[48] = bf0[48] + bf0[49];
5695 bf1[49] = -bf0[49] + bf0[48];
5696 bf1[50] = -bf0[50] + bf0[51];
5697 bf1[51] = bf0[51] + bf0[50];
5698 bf1[52] = bf0[52] + bf0[53];
5699 bf1[53] = -bf0[53] + bf0[52];
5700 bf1[54] = -bf0[54] + bf0[55];
5701 bf1[55] = bf0[55] + bf0[54];
5702 bf1[56] = bf0[56] + bf0[57];
5703 bf1[57] = -bf0[57] + bf0[56];
5704 bf1[58] = -bf0[58] + bf0[59];
5705 bf1[59] = bf0[59] + bf0[58];
5706 bf1[60] = bf0[60] + bf0[61];
5707 bf1[61] = -bf0[61] + bf0[60];
5708 bf1[62] = -bf0[62] + bf0[63];
5709 bf1[63] = bf0[63] + bf0[62];
5710
5711 // stage 10
5712 cospi = cospi_arr(cos_bit);
5713 bf0 = output;
5714 bf1 = step;
5715 bf1[0] = bf0[0];
5716 bf1[2] = bf0[2];
5717 bf1[4] = bf0[4];
5718 bf1[6] = bf0[6];
5719 bf1[8] = bf0[8];
5720 bf1[10] = bf0[10];
5721 bf1[12] = bf0[12];
5722 bf1[14] = bf0[14];
5723 bf1[16] = bf0[16];
5724 bf1[18] = bf0[18];
5725 bf1[20] = bf0[20];
5726 bf1[22] = bf0[22];
5727 bf1[24] = bf0[24];
5728 bf1[26] = bf0[26];
5729 bf1[28] = bf0[28];
5730 bf1[30] = bf0[30];
5731 bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit);
5732 bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit);
5733 bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit);
5734 bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit);
5735 bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit);
5736 bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit);
5737 bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit);
5738 bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit);
5739 bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit);
5740 bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit);
5741 bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit);
5742 bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit);
5743 bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit);
5744 bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit);
5745 bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit);
5746 bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit);
5747
5748 // stage 11
5749 bf0 = step;
5750 bf1 = output;
5751 bf1[0] = bf0[0];
5752 bf1[1] = bf0[32];
5753 bf1[2] = bf0[16];
5754 bf1[3] = bf0[48];
5755 bf1[4] = bf0[8];
5756 bf1[5] = bf0[40];
5757 bf1[6] = bf0[24];
5758 bf1[7] = bf0[56];
5759 bf1[8] = bf0[4];
5760 bf1[9] = bf0[36];
5761 bf1[10] = bf0[20];
5762 bf1[11] = bf0[52];
5763 bf1[12] = bf0[12];
5764 bf1[13] = bf0[44];
5765 bf1[14] = bf0[28];
5766 bf1[15] = bf0[60];
5767 bf1[16] = bf0[2];
5768 bf1[17] = bf0[34];
5769 bf1[18] = bf0[18];
5770 bf1[19] = bf0[50];
5771 bf1[20] = bf0[10];
5772 bf1[21] = bf0[42];
5773 bf1[22] = bf0[26];
5774 bf1[23] = bf0[58];
5775 bf1[24] = bf0[6];
5776 bf1[25] = bf0[38];
5777 bf1[26] = bf0[22];
5778 bf1[27] = bf0[54];
5779 bf1[28] = bf0[14];
5780 bf1[29] = bf0[46];
5781 bf1[30] = bf0[30];
5782 bf1[31] = bf0[62];
5783 }
5784
av1_fidentity64_N2_c(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)5785 void av1_fidentity64_N2_c(const int32_t *input, int32_t *output, int8_t cos_bit,
5786 const int8_t *stage_range) {
5787 (void)stage_range;
5788 (void)cos_bit;
5789 for (int32_t i = 0; i < 32; ++i)
5790 output[i] = round_shift((int64_t)input[i] * 4 * new_sqrt2, new_sqrt2_bits);
5791 assert(stage_range[0] + new_sqrt2_bits <= 32);
5792 }
5793
fwd_txfm_type_to_func_N2(TxfmType txfmtype)5794 static INLINE TxfmFunc fwd_txfm_type_to_func_N2(TxfmType txfmtype) {
5795 switch (txfmtype) {
5796 case TXFM_TYPE_DCT4: return svt_av1_fdct4_new_N2;
5797 case TXFM_TYPE_DCT8: return svt_av1_fdct8_new_N2;
5798 case TXFM_TYPE_DCT16: return svt_av1_fdct16_new_N2;
5799 case TXFM_TYPE_DCT32: return svt_av1_fdct32_new_N2;
5800 case TXFM_TYPE_DCT64: return svt_av1_fdct64_new_N2;
5801 case TXFM_TYPE_ADST4: return svt_av1_fadst4_new_N2;
5802 case TXFM_TYPE_ADST8: return svt_av1_fadst8_new_N2;
5803 case TXFM_TYPE_ADST16: return svt_av1_fadst16_new_N2;
5804 case TXFM_TYPE_ADST32: return av1_fadst32_new;
5805 case TXFM_TYPE_IDENTITY4: return svt_av1_fidentity4_N2_c;
5806 case TXFM_TYPE_IDENTITY8: return svt_av1_fidentity8_N2_c;
5807 case TXFM_TYPE_IDENTITY16: return svt_av1_fidentity16_N2_c;
5808 case TXFM_TYPE_IDENTITY32: return svt_av1_fidentity32_N2_c;
5809 case TXFM_TYPE_IDENTITY64: return av1_fidentity64_N2_c;
5810 default: assert(0); return NULL;
5811 }
5812 }
5813
/*
 * Shared 2D forward-transform core, N2 variant: runs the column pass on
 * every column, the row pass on only the top half of the rows, then
 * zeroes everything outside the top-left (w/2)x(h/2) quadrant of
 * `output`. `buf` is caller-provided scratch for the column-pass
 * results; `output` doubles as a small temporary during the column pass.
 */
static INLINE void av1_tranform_two_d_core_N2_c(int16_t *input, uint32_t input_stride,
                                                int32_t *output, const Txfm2dFlipCfg *cfg,
                                                int32_t *buf, uint8_t bit_depth) {
    int32_t c, r;
    // Note when assigning txfm_size_col, we use the txfm_size from the
    // row configuration and vice versa. This is intentionally done to
    // accurately perform rectangular transforms. When the transform is
    // rectangular, the number of columns will be the same as the
    // txfm_size stored in the row cfg struct. It will make no difference
    // for square transforms.
    const int32_t txfm_size_col = tx_size_wide[cfg->tx_size];
    const int32_t txfm_size_row = tx_size_high[cfg->tx_size];
    // Take the shift from the larger dimension in the rectangular case.
    const int8_t *shift     = cfg->shift;
    const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
    int8_t        stage_range_col[MAX_TXFM_STAGE_NUM];
    int8_t        stage_range_row[MAX_TXFM_STAGE_NUM];
    assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM);
    assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM);
    svt_av1_gen_fwd_stage_range(stage_range_col, stage_range_row, cfg, bit_depth);

    const int8_t   cos_bit_col   = cfg->cos_bit_col;
    const int8_t   cos_bit_row   = cfg->cos_bit_row;
    const TxfmFunc txfm_func_col = fwd_txfm_type_to_func_N2(cfg->txfm_type_col);
    const TxfmFunc txfm_func_row = fwd_txfm_type_to_func_N2(cfg->txfm_type_row);
    ASSERT(txfm_func_col != NULL);
    ASSERT(txfm_func_row != NULL);
    // use output buffer as temp buffer: temp_in holds one column of
    // input samples, temp_out the column-transform result just past it.
    int32_t *temp_in  = output;
    int32_t *temp_out = output + txfm_size_row;

    // Columns
    for (c = 0; c < txfm_size_col; ++c) {
        if (cfg->ud_flip == 0)
            for (r = 0; r < txfm_size_row; ++r) temp_in[r] = input[r * input_stride + c];
        else {
            for (r = 0; r < txfm_size_row; ++r)
                // flip upside down
                temp_in[r] = input[(txfm_size_row - r - 1) * input_stride + c];
        }
        svt_av1_round_shift_array_c(
            temp_in, txfm_size_row, -shift[0]); // NM svt_av1_round_shift_array_c
        txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
        // The N2 column kernels only produce the first half of the
        // coefficients, so only txfm_size_row / 2 entries need rounding.
        svt_av1_round_shift_array_c(
            temp_out, txfm_size_row / 2, -shift[1]); // NM svt_av1_round_shift_array_c
        if (cfg->lr_flip == 0) {
            for (r = 0; r < txfm_size_row; ++r) buf[r * txfm_size_col + c] = temp_out[r];
        } else {
            for (r = 0; r < txfm_size_row; ++r)
                // flip from left to right
                buf[r * txfm_size_col + (txfm_size_col - c - 1)] = temp_out[r];
        }
    }

    // Rows: only the top half of the rows carry non-zero column results.
    for (r = 0; r < txfm_size_row / 2; ++r) {
        txfm_func_row(
            buf + r * txfm_size_col, output + r * txfm_size_col, cos_bit_row, stage_range_row);
        svt_av1_round_shift_array_c(output + r * txfm_size_col, txfm_size_col / 2, -shift[2]);

        if (abs(rect_type) == 1) {
            // Multiply everything by Sqrt2 if the transform is rectangular and the
            // size difference is a factor of 2.
            for (c = 0; c < txfm_size_col / 2; ++c) {
                output[r * txfm_size_col + c] = round_shift(
                    (int64_t)output[r * txfm_size_col + c] * new_sqrt2, new_sqrt2_bits);
            }
        }
    }

    // Clear every coefficient outside the top-left quadrant so the N2
    // output is well defined over the full block.
    for (int i = 0; i < (txfm_size_col * txfm_size_row); i++) {
        if (i % txfm_size_col >= (txfm_size_col >> 1) ||
            i / txfm_size_col >= (txfm_size_row >> 1)) {
            output[i] = 0;
        }
    }
}
5891
/* 2D forward transform, 64x64 block, N2 variant: the core keeps only the
 * top-left 32x32 quadrant of coefficients and zeroes the rest. */
void av1_transform_two_d_64x64_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                    TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       txfm_buf[64 * 64]; /* column-pass scratch */
    av1_transform_config(transform_type, TX_64X64, &cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, txfm_buf, bit_depth);
}
5900
/* 2D forward transform, 32x32 block, N2 variant: the core keeps only the
 * top-left 16x16 quadrant of coefficients and zeroes the rest. */
void av1_transform_two_d_32x32_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                    TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       txfm_buf[32 * 32]; /* column-pass scratch */
    av1_transform_config(transform_type, TX_32X32, &cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, txfm_buf, bit_depth);
}
5909
/* 2D forward transform, 16x16 block, N2 variant: the core keeps only the
 * top-left 8x8 quadrant of coefficients and zeroes the rest. */
void av1_transform_two_d_16x16_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                    TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       txfm_buf[16 * 16]; /* column-pass scratch */
    av1_transform_config(transform_type, TX_16X16, &cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, txfm_buf, bit_depth);
}
5918
/* 2D forward transform, 8x8 block, N2 variant: the core keeps only the
 * top-left 4x4 quadrant of coefficients and zeroes the rest. */
void av1_transform_two_d_8x8_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                  TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       txfm_buf[8 * 8]; /* column-pass scratch */
    av1_transform_config(transform_type, TX_8X8, &cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, txfm_buf, bit_depth);
}
5927
/* 2D forward transform, 4x4 block, N2 variant: the core keeps only the
 * top-left 2x2 quadrant of coefficients and zeroes the rest. */
void av1_transform_two_d_4x4_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                  TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       txfm_buf[4 * 4]; /* column-pass scratch */
    av1_transform_config(transform_type, TX_4X4, &cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, txfm_buf, bit_depth);
}
5936
/* 2D forward transform, 64x32 block, N2 variant: the core keeps only the
 * top-left 32x16 quadrant of coefficients and zeroes the rest. */
void svt_av1_fwd_txfm2d_64x32_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                   TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       txfm_buf[64 * 32]; /* column-pass scratch */
    av1_transform_config(transform_type, TX_64X32, &cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, txfm_buf, bit_depth);
}
5945
/* 2D forward transform, 32x64 block, N2 variant: the core keeps only the
 * top-left 16x32 quadrant of coefficients and zeroes the rest. */
void svt_av1_fwd_txfm2d_32x64_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                   TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       txfm_buf[32 * 64]; /* column-pass scratch */
    av1_transform_config(transform_type, TX_32X64, &cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, txfm_buf, bit_depth);
}
5954
/* 2D forward transform, 64x16 block, N2 variant: the core keeps only the
 * top-left 32x8 quadrant of coefficients and zeroes the rest. */
void svt_av1_fwd_txfm2d_64x16_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                   TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       txfm_buf[64 * 16]; /* column-pass scratch */
    av1_transform_config(transform_type, TX_64X16, &cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, txfm_buf, bit_depth);
}
5963
/* 2D forward transform, 16x64 block, N2 variant: the core keeps only the
 * top-left 8x32 quadrant of coefficients and zeroes the rest. */
void svt_av1_fwd_txfm2d_16x64_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                   TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       txfm_buf[16 * 64]; /* column-pass scratch */
    av1_transform_config(transform_type, TX_16X64, &cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, txfm_buf, bit_depth);
}
5972
/* 2D forward transform, 32x16 block, N2 variant: the core keeps only the
 * top-left 16x8 quadrant of coefficients and zeroes the rest. */
void svt_av1_fwd_txfm2d_32x16_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                   TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       txfm_buf[32 * 16]; /* column-pass scratch */
    av1_transform_config(transform_type, TX_32X16, &cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, txfm_buf, bit_depth);
}
5981
/* 2D forward transform, 16x32 block, N2 variant: the core keeps only the
 * top-left 8x16 quadrant of coefficients and zeroes the rest. */
void svt_av1_fwd_txfm2d_16x32_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                   TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       txfm_buf[16 * 32]; /* column-pass scratch */
    av1_transform_config(transform_type, TX_16X32, &cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, txfm_buf, bit_depth);
}
5990
/* 2D forward transform, 16x8 block, N2 variant: the core keeps only the
 * top-left 8x4 quadrant of coefficients and zeroes the rest. */
void svt_av1_fwd_txfm2d_16x8_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                  TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       txfm_buf[16 * 8]; /* column-pass scratch */
    av1_transform_config(transform_type, TX_16X8, &cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, txfm_buf, bit_depth);
}
5999
/* 2D forward transform, 8x16 block, N2 variant: the core keeps only the
 * top-left 4x8 quadrant of coefficients and zeroes the rest. */
void svt_av1_fwd_txfm2d_8x16_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                  TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       txfm_buf[8 * 16]; /* column-pass scratch */
    av1_transform_config(transform_type, TX_8X16, &cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, txfm_buf, bit_depth);
}
6008
/* 2D forward transform, 32x8 block, N2 variant: the core keeps only the
 * top-left 16x4 quadrant of coefficients and zeroes the rest. */
void svt_av1_fwd_txfm2d_32x8_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                  TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       txfm_buf[32 * 8]; /* column-pass scratch */
    av1_transform_config(transform_type, TX_32X8, &cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, txfm_buf, bit_depth);
}
6017
/* 2D forward transform, 8x32 block, N2 variant: the core keeps only the
 * top-left 4x16 quadrant of coefficients and zeroes the rest. */
void svt_av1_fwd_txfm2d_8x32_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                  TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       txfm_buf[8 * 32]; /* column-pass scratch */
    av1_transform_config(transform_type, TX_8X32, &cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, txfm_buf, bit_depth);
}
6026
/* 2D forward transform, 16x4 block, N2 variant: the core keeps only the
 * top-left 8x2 quadrant of coefficients and zeroes the rest. */
void svt_av1_fwd_txfm2d_16x4_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                  TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       txfm_buf[16 * 4]; /* column-pass scratch */
    av1_transform_config(transform_type, TX_16X4, &cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, txfm_buf, bit_depth);
}
6035
/* 2D forward transform, 4x16 block, N2 variant: the core keeps only the
 * top-left 2x8 quadrant of coefficients and zeroes the rest. */
void svt_av1_fwd_txfm2d_4x16_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                  TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       txfm_buf[4 * 16]; /* column-pass scratch */
    av1_transform_config(transform_type, TX_4X16, &cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, txfm_buf, bit_depth);
}
6044
/* 2D forward transform, 8x4 block, N2 variant: the core keeps only the
 * top-left 4x2 quadrant of coefficients and zeroes the rest. */
void svt_av1_fwd_txfm2d_8x4_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                 TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       txfm_buf[8 * 4]; /* column-pass scratch */
    av1_transform_config(transform_type, TX_8X4, &cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, txfm_buf, bit_depth);
}
6053
/* 2D forward transform, 4x8 block, N2 variant: the core keeps only the
 * top-left 2x4 quadrant of coefficients and zeroes the rest. */
void svt_av1_fwd_txfm2d_4x8_N2_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                 TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       txfm_buf[4 * 8]; /* column-pass scratch */
    av1_transform_config(transform_type, TX_4X8, &cfg);
    av1_tranform_two_d_core_N2_c(input, input_stride, output, &cfg, txfm_buf, bit_depth);
}
6062
/* Forward 4-point DCT, N4 variant: only the DC coefficient (output[0],
 * i.e. the first 4/4 = 1 output) is produced. */
void svt_av1_fdct4_new_N4(const int32_t *input, int32_t *output, int8_t cos_bit,
                          const int8_t *stage_range) {
    (void)stage_range;
    const int32_t *cospi = cospi_arr(cos_bit);

    // Butterfly-add the mirrored sample pairs, then rotate once by
    // cospi[32] to form the DC term.
    const int32_t sum03 = input[0] + input[3];
    const int32_t sum12 = input[1] + input[2];

    output[0] = half_btf(cospi[32], sum03, cospi[32], sum12, cos_bit);
}
6075
/* Forward 4-point ADST, N4 variant: only output[0] is produced, except
 * that an all-zero input clears the first four outputs (callers rely on
 * that short-circuit). */
void svt_av1_fadst4_new_N4(const int32_t *input, int32_t *output, int8_t cos_bit,
                           const int8_t *stage_range) {
    (void)stage_range;
    const int32_t *sinpi = sinpi_arr(cos_bit);

    const int32_t in0 = input[0];
    const int32_t in1 = input[1];
    const int32_t in2 = input[2];
    const int32_t in3 = input[3];

    if (!(in0 | in1 | in2 | in3)) {
        output[0] = output[1] = output[2] = output[3] = 0;
        return;
    }

    // Accumulate the four sinpi products in the same order as the full
    // transform's stages 1-5: (s0 + s2) + s5, then + s4.
    const int32_t acc = sinpi[1] * in0 + sinpi[2] * in1 + sinpi[4] * in3 + sinpi[3] * in2;

    // 1-D transform scaling factor is sqrt(2).
    output[0] = round_shift(acc, cos_bit);
}
6113
svt_av1_fidentity4_N4_c(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)6114 void svt_av1_fidentity4_N4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
6115 const int8_t *stage_range) {
6116 (void)stage_range;
6117 (void)cos_bit;
6118 output[0] = round_shift((int64_t)input[0] * new_sqrt2, new_sqrt2_bits);
6119 assert(stage_range[0] + new_sqrt2_bits <= 32);
6120 }
6121
/*
 * Forward 8-point DCT, N4 variant: only the first 8/4 = 2 coefficients
 * (output[0], output[1]) are produced; butterfly lanes that feed only
 * discarded outputs are skipped. bf0/bf1 ping-pong between `output` and
 * the local `step` buffer, so the statement order within each stage is
 * significant.
 */
void svt_av1_fdct8_new_N4(const int32_t *input, int32_t *output, int8_t cos_bit,
                          const int8_t *stage_range) {
    (void)stage_range;
    const int32_t *cospi;

    int32_t *bf0, *bf1;
    int32_t step[8];

    // stage 0;

    // stage 1;
    bf1 = output;
    bf1[0] = input[0] + input[7];
    bf1[1] = input[1] + input[6];
    bf1[2] = input[2] + input[5];
    bf1[3] = input[3] + input[4];
    bf1[4] = -input[4] + input[3];
    bf1[5] = -input[5] + input[2];
    bf1[6] = -input[6] + input[1];
    bf1[7] = -input[7] + input[0];

    // stage 2
    cospi = cospi_arr(cos_bit);
    bf0 = output;
    bf1 = step;
    bf1[0] = bf0[0] + bf0[3];
    bf1[1] = bf0[1] + bf0[2];
    bf1[4] = bf0[4];
    bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
    bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
    bf1[7] = bf0[7];

    // stage 3
    bf0 = step;
    bf1 = output;
    bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
    bf1[4] = bf0[4] + bf0[5];
    bf1[7] = bf0[7] + bf0[6];

    // stage 4
    bf0 = output;
    bf1 = step;
    bf1[0] = bf0[0];
    bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);

    // stage 5: bit-reversal reorder of the two surviving coefficients.
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0];
    bf1[1] = bf0[4];
}
6173
/*
 * Forward 8-point ADST, N4 variant: only the first 8/4 = 2 coefficients
 * (output[0], output[1]) are produced. bf0/bf1 ping-pong between
 * `output` and the local `step` buffer; lanes feeding only discarded
 * outputs are dropped from stages 5-7.
 */
void svt_av1_fadst8_new_N4(const int32_t *input, int32_t *output, int8_t cos_bit,
                           const int8_t *stage_range) {
    (void)stage_range;
    const int32_t *cospi;

    int32_t *bf0, *bf1;
    int32_t step[8];

    // stage 0;

    // stage 1: sign-flipped input permutation; requires distinct buffers.
    assert(output != input);
    bf1 = output;
    bf1[0] = input[0];
    bf1[1] = -input[7];
    bf1[2] = -input[3];
    bf1[3] = input[4];
    bf1[4] = -input[1];
    bf1[5] = input[6];
    bf1[6] = input[2];
    bf1[7] = -input[5];

    // stage 2
    cospi = cospi_arr(cos_bit);
    bf0 = output;
    bf1 = step;
    bf1[0] = bf0[0];
    bf1[1] = bf0[1];
    bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
    bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
    bf1[4] = bf0[4];
    bf1[5] = bf0[5];
    bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
    bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);

    // stage 3
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0] + bf0[2];
    bf1[1] = bf0[1] + bf0[3];
    bf1[2] = bf0[0] - bf0[2];
    bf1[3] = bf0[1] - bf0[3];
    bf1[4] = bf0[4] + bf0[6];
    bf1[5] = bf0[5] + bf0[7];
    bf1[6] = bf0[4] - bf0[6];
    bf1[7] = bf0[5] - bf0[7];

    // stage 4
    bf0 = output;
    bf1 = step;
    bf1[0] = bf0[0];
    bf1[1] = bf0[1];
    bf1[2] = bf0[2];
    bf1[3] = bf0[3];
    bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
    bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
    bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
    bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);

    // stage 5: only the lanes feeding the two kept outputs survive.
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0] + bf0[4];
    bf1[1] = bf0[1] + bf0[5];
    bf1[6] = bf0[2] - bf0[6];
    bf1[7] = bf0[3] - bf0[7];

    // stage 6
    bf0 = output;
    bf1 = step;
    bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit);
    bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit);

    // stage 7: final reorder of the two surviving coefficients.
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[1];
    bf1[1] = bf0[6];
}
6253
/* Identity (IDTX) 8-point 1-D forward transform, N4 variant: only the
 * first 8/4 = 2 coefficients are produced; identity8 scales by 2. */
void svt_av1_fidentity8_N4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                             const int8_t *stage_range) {
    (void)stage_range;
    (void)cos_bit;
    output[0] = input[0] * 2;
    output[1] = input[1] * 2;
}
6260
/*
 * Forward 16-point DCT, N4 variant: only the first 16/4 = 4
 * coefficients (output[0..3]) are produced. bf0/bf1 ping-pong between
 * `output` and the local `step` buffer; butterfly lanes feeding only
 * discarded outputs are skipped from stage 3 onward.
 */
void svt_av1_fdct16_new_N4(const int32_t *input, int32_t *output, int8_t cos_bit,
                           const int8_t *stage_range) {
    (void)stage_range;
    const int32_t *cospi;

    int32_t *bf0, *bf1;
    int32_t step[16];

    // stage 0;

    // stage 1;
    bf1 = output;
    bf1[0] = input[0] + input[15];
    bf1[1] = input[1] + input[14];
    bf1[2] = input[2] + input[13];
    bf1[3] = input[3] + input[12];
    bf1[4] = input[4] + input[11];
    bf1[5] = input[5] + input[10];
    bf1[6] = input[6] + input[9];
    bf1[7] = input[7] + input[8];
    bf1[8] = -input[8] + input[7];
    bf1[9] = -input[9] + input[6];
    bf1[10] = -input[10] + input[5];
    bf1[11] = -input[11] + input[4];
    bf1[12] = -input[12] + input[3];
    bf1[13] = -input[13] + input[2];
    bf1[14] = -input[14] + input[1];
    bf1[15] = -input[15] + input[0];

    // stage 2
    cospi = cospi_arr(cos_bit);
    bf0 = output;
    bf1 = step;
    bf1[0] = bf0[0] + bf0[7];
    bf1[1] = bf0[1] + bf0[6];
    bf1[2] = bf0[2] + bf0[5];
    bf1[3] = bf0[3] + bf0[4];
    bf1[4] = -bf0[4] + bf0[3];
    bf1[5] = -bf0[5] + bf0[2];
    bf1[6] = -bf0[6] + bf0[1];
    bf1[7] = -bf0[7] + bf0[0];
    bf1[8] = bf0[8];
    bf1[9] = bf0[9];
    bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
    bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
    bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
    bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
    bf1[14] = bf0[14];
    bf1[15] = bf0[15];

    // stage 3
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0] + bf0[3];
    bf1[1] = bf0[1] + bf0[2];
    bf1[4] = bf0[4];
    bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
    bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
    bf1[7] = bf0[7];
    bf1[8] = bf0[8] + bf0[11];
    bf1[9] = bf0[9] + bf0[10];
    bf1[10] = -bf0[10] + bf0[9];
    bf1[11] = -bf0[11] + bf0[8];
    bf1[12] = -bf0[12] + bf0[15];
    bf1[13] = -bf0[13] + bf0[14];
    bf1[14] = bf0[14] + bf0[13];
    bf1[15] = bf0[15] + bf0[12];

    // stage 4
    bf0 = output;
    bf1 = step;
    bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
    bf1[4] = bf0[4] + bf0[5];
    bf1[7] = bf0[7] + bf0[6];
    bf1[8] = bf0[8];
    bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
    bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
    bf1[11] = bf0[11];
    bf1[12] = bf0[12];
    bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
    bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
    bf1[15] = bf0[15];

    // stage 5
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0];
    bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
    bf1[8] = bf0[8] + bf0[9];
    bf1[11] = bf0[11] + bf0[10];
    bf1[12] = bf0[12] + bf0[13];
    bf1[15] = bf0[15] + bf0[14];

    // stage 6
    bf0 = output;
    bf1 = step;
    bf1[0] = bf0[0];
    bf1[4] = bf0[4];
    bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
    bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);

    // stage 7: bit-reversal reorder of the four surviving coefficients.
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0];
    bf1[1] = bf0[8];
    bf1[2] = bf0[4];
    bf1[3] = bf0[12];
}
6370
/*
 * Forward 16-point ADST, N4 variant: only the first 16/4 = 4
 * coefficients (output[0..3]) are produced. bf0/bf1 ping-pong between
 * `output` and the local `step` buffer; lanes feeding only discarded
 * outputs are dropped from stages 7-9.
 */
void svt_av1_fadst16_new_N4(const int32_t *input, int32_t *output, int8_t cos_bit,
                            const int8_t *stage_range) {
    (void)stage_range;
    const int32_t *cospi;

    int32_t *bf0, *bf1;
    int32_t step[16];

    // stage 0;

    // stage 1: sign-flipped input permutation; requires distinct buffers.
    assert(output != input);
    bf1 = output;
    bf1[0] = input[0];
    bf1[1] = -input[15];
    bf1[2] = -input[7];
    bf1[3] = input[8];
    bf1[4] = -input[3];
    bf1[5] = input[12];
    bf1[6] = input[4];
    bf1[7] = -input[11];
    bf1[8] = -input[1];
    bf1[9] = input[14];
    bf1[10] = input[6];
    bf1[11] = -input[9];
    bf1[12] = input[2];
    bf1[13] = -input[13];
    bf1[14] = -input[5];
    bf1[15] = input[10];

    // stage 2
    cospi = cospi_arr(cos_bit);
    bf0 = output;
    bf1 = step;
    bf1[0] = bf0[0];
    bf1[1] = bf0[1];
    bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
    bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
    bf1[4] = bf0[4];
    bf1[5] = bf0[5];
    bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
    bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
    bf1[8] = bf0[8];
    bf1[9] = bf0[9];
    bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit);
    bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit);
    bf1[12] = bf0[12];
    bf1[13] = bf0[13];
    bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit);
    bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit);

    // stage 3
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0] + bf0[2];
    bf1[1] = bf0[1] + bf0[3];
    bf1[2] = bf0[0] - bf0[2];
    bf1[3] = bf0[1] - bf0[3];
    bf1[4] = bf0[4] + bf0[6];
    bf1[5] = bf0[5] + bf0[7];
    bf1[6] = bf0[4] - bf0[6];
    bf1[7] = bf0[5] - bf0[7];
    bf1[8] = bf0[8] + bf0[10];
    bf1[9] = bf0[9] + bf0[11];
    bf1[10] = bf0[8] - bf0[10];
    bf1[11] = bf0[9] - bf0[11];
    bf1[12] = bf0[12] + bf0[14];
    bf1[13] = bf0[13] + bf0[15];
    bf1[14] = bf0[12] - bf0[14];
    bf1[15] = bf0[13] - bf0[15];

    // stage 4
    bf0 = output;
    bf1 = step;
    bf1[0] = bf0[0];
    bf1[1] = bf0[1];
    bf1[2] = bf0[2];
    bf1[3] = bf0[3];
    bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
    bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
    bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
    bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
    bf1[8] = bf0[8];
    bf1[9] = bf0[9];
    bf1[10] = bf0[10];
    bf1[11] = bf0[11];
    bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit);
    bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit);
    bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit);
    bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit);

    // stage 5
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0] + bf0[4];
    bf1[1] = bf0[1] + bf0[5];
    bf1[2] = bf0[2] + bf0[6];
    bf1[3] = bf0[3] + bf0[7];
    bf1[4] = bf0[0] - bf0[4];
    bf1[5] = bf0[1] - bf0[5];
    bf1[6] = bf0[2] - bf0[6];
    bf1[7] = bf0[3] - bf0[7];
    bf1[8] = bf0[8] + bf0[12];
    bf1[9] = bf0[9] + bf0[13];
    bf1[10] = bf0[10] + bf0[14];
    bf1[11] = bf0[11] + bf0[15];
    bf1[12] = bf0[8] - bf0[12];
    bf1[13] = bf0[9] - bf0[13];
    bf1[14] = bf0[10] - bf0[14];
    bf1[15] = bf0[11] - bf0[15];

    // stage 6
    bf0 = output;
    bf1 = step;
    bf1[0] = bf0[0];
    bf1[1] = bf0[1];
    bf1[2] = bf0[2];
    bf1[3] = bf0[3];
    bf1[4] = bf0[4];
    bf1[5] = bf0[5];
    bf1[6] = bf0[6];
    bf1[7] = bf0[7];
    bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit);
    bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit);
    bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit);
    bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit);
    bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit);
    bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit);
    bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit);
    bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit);

    // stage 7: only the lanes feeding the four kept outputs survive.
    bf0 = output;
    bf1 = step;
    bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit);
    bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit);
    bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit);
    bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit);

    // stage 9: final reorder of the four surviving coefficients.
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[1];
    bf1[1] = bf0[14];
    bf1[2] = bf0[3];
    bf1[3] = bf0[12];
}
6530
svt_av1_fidentity16_N4_c(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)6531 void svt_av1_fidentity16_N4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
6532 const int8_t *stage_range) {
6533 (void)stage_range;
6534 (void)cos_bit;
6535 for (int32_t i = 0; i < 4; ++i)
6536 output[i] = round_shift((int64_t)input[i] * 2 * new_sqrt2, new_sqrt2_bits);
6537 assert(stage_range[0] + new_sqrt2_bits <= 32);
6538 }
6539
svt_av1_fdct32_new_N4(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)6540 void svt_av1_fdct32_new_N4(const int32_t *input, int32_t *output, int8_t cos_bit,
6541 const int8_t *stage_range) {
6542 (void)stage_range;
6543 const int32_t *cospi;
6544
6545 int32_t *bf0, *bf1;
6546 int32_t step[32];
6547
6548 // stage 0;
6549
6550 // stage 1;
6551 bf1 = output;
6552 bf1[0] = input[0] + input[31];
6553 bf1[1] = input[1] + input[30];
6554 bf1[2] = input[2] + input[29];
6555 bf1[3] = input[3] + input[28];
6556 bf1[4] = input[4] + input[27];
6557 bf1[5] = input[5] + input[26];
6558 bf1[6] = input[6] + input[25];
6559 bf1[7] = input[7] + input[24];
6560 bf1[8] = input[8] + input[23];
6561 bf1[9] = input[9] + input[22];
6562 bf1[10] = input[10] + input[21];
6563 bf1[11] = input[11] + input[20];
6564 bf1[12] = input[12] + input[19];
6565 bf1[13] = input[13] + input[18];
6566 bf1[14] = input[14] + input[17];
6567 bf1[15] = input[15] + input[16];
6568 bf1[16] = -input[16] + input[15];
6569 bf1[17] = -input[17] + input[14];
6570 bf1[18] = -input[18] + input[13];
6571 bf1[19] = -input[19] + input[12];
6572 bf1[20] = -input[20] + input[11];
6573 bf1[21] = -input[21] + input[10];
6574 bf1[22] = -input[22] + input[9];
6575 bf1[23] = -input[23] + input[8];
6576 bf1[24] = -input[24] + input[7];
6577 bf1[25] = -input[25] + input[6];
6578 bf1[26] = -input[26] + input[5];
6579 bf1[27] = -input[27] + input[4];
6580 bf1[28] = -input[28] + input[3];
6581 bf1[29] = -input[29] + input[2];
6582 bf1[30] = -input[30] + input[1];
6583 bf1[31] = -input[31] + input[0];
6584
6585 // stage 2
6586 cospi = cospi_arr(cos_bit);
6587 bf0 = output;
6588 bf1 = step;
6589 bf1[0] = bf0[0] + bf0[15];
6590 bf1[1] = bf0[1] + bf0[14];
6591 bf1[2] = bf0[2] + bf0[13];
6592 bf1[3] = bf0[3] + bf0[12];
6593 bf1[4] = bf0[4] + bf0[11];
6594 bf1[5] = bf0[5] + bf0[10];
6595 bf1[6] = bf0[6] + bf0[9];
6596 bf1[7] = bf0[7] + bf0[8];
6597 bf1[8] = -bf0[8] + bf0[7];
6598 bf1[9] = -bf0[9] + bf0[6];
6599 bf1[10] = -bf0[10] + bf0[5];
6600 bf1[11] = -bf0[11] + bf0[4];
6601 bf1[12] = -bf0[12] + bf0[3];
6602 bf1[13] = -bf0[13] + bf0[2];
6603 bf1[14] = -bf0[14] + bf0[1];
6604 bf1[15] = -bf0[15] + bf0[0];
6605 bf1[16] = bf0[16];
6606 bf1[17] = bf0[17];
6607 bf1[18] = bf0[18];
6608 bf1[19] = bf0[19];
6609 bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
6610 bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
6611 bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
6612 bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
6613 bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
6614 bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
6615 bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
6616 bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
6617 bf1[28] = bf0[28];
6618 bf1[29] = bf0[29];
6619 bf1[30] = bf0[30];
6620 bf1[31] = bf0[31];
6621
6622 // stage 3
6623 bf0 = step;
6624 bf1 = output;
6625 bf1[0] = bf0[0] + bf0[7];
6626 bf1[1] = bf0[1] + bf0[6];
6627 bf1[2] = bf0[2] + bf0[5];
6628 bf1[3] = bf0[3] + bf0[4];
6629 bf1[4] = -bf0[4] + bf0[3];
6630 bf1[5] = -bf0[5] + bf0[2];
6631 bf1[6] = -bf0[6] + bf0[1];
6632 bf1[7] = -bf0[7] + bf0[0];
6633 bf1[8] = bf0[8];
6634 bf1[9] = bf0[9];
6635 bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
6636 bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
6637 bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
6638 bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
6639 bf1[14] = bf0[14];
6640 bf1[15] = bf0[15];
6641 bf1[16] = bf0[16] + bf0[23];
6642 bf1[17] = bf0[17] + bf0[22];
6643 bf1[18] = bf0[18] + bf0[21];
6644 bf1[19] = bf0[19] + bf0[20];
6645 bf1[20] = -bf0[20] + bf0[19];
6646 bf1[21] = -bf0[21] + bf0[18];
6647 bf1[22] = -bf0[22] + bf0[17];
6648 bf1[23] = -bf0[23] + bf0[16];
6649 bf1[24] = -bf0[24] + bf0[31];
6650 bf1[25] = -bf0[25] + bf0[30];
6651 bf1[26] = -bf0[26] + bf0[29];
6652 bf1[27] = -bf0[27] + bf0[28];
6653 bf1[28] = bf0[28] + bf0[27];
6654 bf1[29] = bf0[29] + bf0[26];
6655 bf1[30] = bf0[30] + bf0[25];
6656 bf1[31] = bf0[31] + bf0[24];
6657
6658 // stage 4
6659 bf0 = output;
6660 bf1 = step;
6661 bf1[0] = bf0[0] + bf0[3];
6662 bf1[1] = bf0[1] + bf0[2];
6663 bf1[4] = bf0[4];
6664 bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
6665 bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
6666 bf1[7] = bf0[7];
6667 bf1[8] = bf0[8] + bf0[11];
6668 bf1[9] = bf0[9] + bf0[10];
6669 bf1[10] = -bf0[10] + bf0[9];
6670 bf1[11] = -bf0[11] + bf0[8];
6671 bf1[12] = -bf0[12] + bf0[15];
6672 bf1[13] = -bf0[13] + bf0[14];
6673 bf1[14] = bf0[14] + bf0[13];
6674 bf1[15] = bf0[15] + bf0[12];
6675 bf1[16] = bf0[16];
6676 bf1[17] = bf0[17];
6677 bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
6678 bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
6679 bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
6680 bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
6681 bf1[22] = bf0[22];
6682 bf1[23] = bf0[23];
6683 bf1[24] = bf0[24];
6684 bf1[25] = bf0[25];
6685 bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
6686 bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
6687 bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
6688 bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
6689 bf1[30] = bf0[30];
6690 bf1[31] = bf0[31];
6691
6692 // stage 5
6693 bf0 = step;
6694 bf1 = output;
6695 bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
6696 bf1[4] = bf0[4] + bf0[5];
6697 bf1[7] = bf0[7] + bf0[6];
6698 bf1[8] = bf0[8];
6699 bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
6700 bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
6701 bf1[11] = bf0[11];
6702 bf1[12] = bf0[12];
6703 bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
6704 bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
6705 bf1[15] = bf0[15];
6706 bf1[16] = bf0[16] + bf0[19];
6707 bf1[17] = bf0[17] + bf0[18];
6708 bf1[18] = -bf0[18] + bf0[17];
6709 bf1[19] = -bf0[19] + bf0[16];
6710 bf1[20] = -bf0[20] + bf0[23];
6711 bf1[21] = -bf0[21] + bf0[22];
6712 bf1[22] = bf0[22] + bf0[21];
6713 bf1[23] = bf0[23] + bf0[20];
6714 bf1[24] = bf0[24] + bf0[27];
6715 bf1[25] = bf0[25] + bf0[26];
6716 bf1[26] = -bf0[26] + bf0[25];
6717 bf1[27] = -bf0[27] + bf0[24];
6718 bf1[28] = -bf0[28] + bf0[31];
6719 bf1[29] = -bf0[29] + bf0[30];
6720 bf1[30] = bf0[30] + bf0[29];
6721 bf1[31] = bf0[31] + bf0[28];
6722
6723 // stage 6
6724 bf0 = output;
6725 bf1 = step;
6726 bf1[0] = bf0[0];
6727 bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
6728 bf1[8] = bf0[8] + bf0[9];
6729 bf1[11] = bf0[11] + bf0[10];
6730 bf1[12] = bf0[12] + bf0[13];
6731 bf1[15] = bf0[15] + bf0[14];
6732 bf1[16] = bf0[16];
6733 bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
6734 bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
6735 bf1[19] = bf0[19];
6736 bf1[20] = bf0[20];
6737 bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
6738 bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
6739 bf1[23] = bf0[23];
6740 bf1[24] = bf0[24];
6741 bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
6742 bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
6743 bf1[27] = bf0[27];
6744 bf1[28] = bf0[28];
6745 bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
6746 bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
6747 bf1[31] = bf0[31];
6748
6749 // stage 7
6750 bf0 = step;
6751 bf1 = output;
6752 bf1[0] = bf0[0];
6753 bf1[4] = bf0[4];
6754 bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
6755 bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
6756 bf1[16] = bf0[16] + bf0[17];
6757 bf1[19] = bf0[19] + bf0[18];
6758 bf1[20] = bf0[20] + bf0[21];
6759 bf1[23] = bf0[23] + bf0[22];
6760 bf1[24] = bf0[24] + bf0[25];
6761 bf1[27] = bf0[27] + bf0[26];
6762 bf1[28] = bf0[28] + bf0[29];
6763 bf1[31] = bf0[31] + bf0[30];
6764
6765 // stage 8
6766 bf0 = output;
6767 bf1 = step;
6768 bf1[0] = bf0[0];
6769 bf1[4] = bf0[4];
6770 bf1[8] = bf0[8];
6771 bf1[12] = bf0[12];
6772 bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
6773 bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
6774 bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
6775 bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
6776
6777 // stage 9
6778 bf0 = step;
6779 bf1 = output;
6780 bf1[0] = bf0[0];
6781 bf1[1] = bf0[16];
6782 bf1[2] = bf0[8];
6783 bf1[3] = bf0[24];
6784 bf1[4] = bf0[4];
6785 bf1[5] = bf0[20];
6786 bf1[6] = bf0[12];
6787 bf1[7] = bf0[28];
6788 }
6789
/*
 * Identity transform for a 32-point column, N4 (quarter-output) variant:
 * only the first 8 of the 32 coefficients are produced, each scaled by 4.
 * cos_bit and stage_range are part of the common TxfmFunc signature and
 * are unused here.
 */
void svt_av1_fidentity32_N4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
                              const int8_t *stage_range) {
    (void)stage_range;
    (void)cos_bit;
    const int32_t *src       = input;
    const int32_t *const end = input + 8;
    while (src < end) { *output++ = *src++ * 4; }
}
6796
/*
 * Forward 64-point DCT, N4 (quarter-output) variant.
 *
 * Computes only the first 16 of the 64 transform coefficients (the entries
 * written at indices 0..15 by the final bit-reversal stage); that is all the
 * N4 2D path consumes. From stage 6 onward, butterfly lanes that feed only
 * discarded coefficients are skipped, so untouched slots of output[]/step[]
 * hold stale values and must not be read beyond index 15 by callers.
 *
 * input/output are 64-element buffers; output is also used as scratch,
 * alternating with the local step[] buffer between stages. cos_bit selects
 * the cosine-table precision; stage_range is unused in this variant.
 */
void svt_av1_fdct64_new_N4(const int32_t *input, int32_t *output, int8_t cos_bit,
                           const int8_t *stage_range) {
    (void)stage_range;
    // The table depends only on cos_bit (pure lookup), so fetch it once
    // instead of re-fetching it at every butterfly stage.
    const int32_t *cospi = cospi_arr(cos_bit);

    int32_t *bf0, *bf1;
    int32_t step[64];

    // stage 0;

    // stage 1;
    bf1 = output;
    bf1[0] = input[0] + input[63];
    bf1[1] = input[1] + input[62];
    bf1[2] = input[2] + input[61];
    bf1[3] = input[3] + input[60];
    bf1[4] = input[4] + input[59];
    bf1[5] = input[5] + input[58];
    bf1[6] = input[6] + input[57];
    bf1[7] = input[7] + input[56];
    bf1[8] = input[8] + input[55];
    bf1[9] = input[9] + input[54];
    bf1[10] = input[10] + input[53];
    bf1[11] = input[11] + input[52];
    bf1[12] = input[12] + input[51];
    bf1[13] = input[13] + input[50];
    bf1[14] = input[14] + input[49];
    bf1[15] = input[15] + input[48];
    bf1[16] = input[16] + input[47];
    bf1[17] = input[17] + input[46];
    bf1[18] = input[18] + input[45];
    bf1[19] = input[19] + input[44];
    bf1[20] = input[20] + input[43];
    bf1[21] = input[21] + input[42];
    bf1[22] = input[22] + input[41];
    bf1[23] = input[23] + input[40];
    bf1[24] = input[24] + input[39];
    bf1[25] = input[25] + input[38];
    bf1[26] = input[26] + input[37];
    bf1[27] = input[27] + input[36];
    bf1[28] = input[28] + input[35];
    bf1[29] = input[29] + input[34];
    bf1[30] = input[30] + input[33];
    bf1[31] = input[31] + input[32];
    bf1[32] = -input[32] + input[31];
    bf1[33] = -input[33] + input[30];
    bf1[34] = -input[34] + input[29];
    bf1[35] = -input[35] + input[28];
    bf1[36] = -input[36] + input[27];
    bf1[37] = -input[37] + input[26];
    bf1[38] = -input[38] + input[25];
    bf1[39] = -input[39] + input[24];
    bf1[40] = -input[40] + input[23];
    bf1[41] = -input[41] + input[22];
    bf1[42] = -input[42] + input[21];
    bf1[43] = -input[43] + input[20];
    bf1[44] = -input[44] + input[19];
    bf1[45] = -input[45] + input[18];
    bf1[46] = -input[46] + input[17];
    bf1[47] = -input[47] + input[16];
    bf1[48] = -input[48] + input[15];
    bf1[49] = -input[49] + input[14];
    bf1[50] = -input[50] + input[13];
    bf1[51] = -input[51] + input[12];
    bf1[52] = -input[52] + input[11];
    bf1[53] = -input[53] + input[10];
    bf1[54] = -input[54] + input[9];
    bf1[55] = -input[55] + input[8];
    bf1[56] = -input[56] + input[7];
    bf1[57] = -input[57] + input[6];
    bf1[58] = -input[58] + input[5];
    bf1[59] = -input[59] + input[4];
    bf1[60] = -input[60] + input[3];
    bf1[61] = -input[61] + input[2];
    bf1[62] = -input[62] + input[1];
    bf1[63] = -input[63] + input[0];

    // stage 2
    bf0 = output;
    bf1 = step;
    bf1[0] = bf0[0] + bf0[31];
    bf1[1] = bf0[1] + bf0[30];
    bf1[2] = bf0[2] + bf0[29];
    bf1[3] = bf0[3] + bf0[28];
    bf1[4] = bf0[4] + bf0[27];
    bf1[5] = bf0[5] + bf0[26];
    bf1[6] = bf0[6] + bf0[25];
    bf1[7] = bf0[7] + bf0[24];
    bf1[8] = bf0[8] + bf0[23];
    bf1[9] = bf0[9] + bf0[22];
    bf1[10] = bf0[10] + bf0[21];
    bf1[11] = bf0[11] + bf0[20];
    bf1[12] = bf0[12] + bf0[19];
    bf1[13] = bf0[13] + bf0[18];
    bf1[14] = bf0[14] + bf0[17];
    bf1[15] = bf0[15] + bf0[16];
    bf1[16] = -bf0[16] + bf0[15];
    bf1[17] = -bf0[17] + bf0[14];
    bf1[18] = -bf0[18] + bf0[13];
    bf1[19] = -bf0[19] + bf0[12];
    bf1[20] = -bf0[20] + bf0[11];
    bf1[21] = -bf0[21] + bf0[10];
    bf1[22] = -bf0[22] + bf0[9];
    bf1[23] = -bf0[23] + bf0[8];
    bf1[24] = -bf0[24] + bf0[7];
    bf1[25] = -bf0[25] + bf0[6];
    bf1[26] = -bf0[26] + bf0[5];
    bf1[27] = -bf0[27] + bf0[4];
    bf1[28] = -bf0[28] + bf0[3];
    bf1[29] = -bf0[29] + bf0[2];
    bf1[30] = -bf0[30] + bf0[1];
    bf1[31] = -bf0[31] + bf0[0];
    bf1[32] = bf0[32];
    bf1[33] = bf0[33];
    bf1[34] = bf0[34];
    bf1[35] = bf0[35];
    bf1[36] = bf0[36];
    bf1[37] = bf0[37];
    bf1[38] = bf0[38];
    bf1[39] = bf0[39];
    bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
    bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
    bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
    bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
    bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
    bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
    bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
    bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
    bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit);
    bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit);
    bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit);
    bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit);
    bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit);
    bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit);
    bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit);
    bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit);
    bf1[56] = bf0[56];
    bf1[57] = bf0[57];
    bf1[58] = bf0[58];
    bf1[59] = bf0[59];
    bf1[60] = bf0[60];
    bf1[61] = bf0[61];
    bf1[62] = bf0[62];
    bf1[63] = bf0[63];

    // stage 3
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0] + bf0[15];
    bf1[1] = bf0[1] + bf0[14];
    bf1[2] = bf0[2] + bf0[13];
    bf1[3] = bf0[3] + bf0[12];
    bf1[4] = bf0[4] + bf0[11];
    bf1[5] = bf0[5] + bf0[10];
    bf1[6] = bf0[6] + bf0[9];
    bf1[7] = bf0[7] + bf0[8];
    bf1[8] = -bf0[8] + bf0[7];
    bf1[9] = -bf0[9] + bf0[6];
    bf1[10] = -bf0[10] + bf0[5];
    bf1[11] = -bf0[11] + bf0[4];
    bf1[12] = -bf0[12] + bf0[3];
    bf1[13] = -bf0[13] + bf0[2];
    bf1[14] = -bf0[14] + bf0[1];
    bf1[15] = -bf0[15] + bf0[0];
    bf1[16] = bf0[16];
    bf1[17] = bf0[17];
    bf1[18] = bf0[18];
    bf1[19] = bf0[19];
    bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
    bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
    bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
    bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
    bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
    bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
    bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
    bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
    bf1[28] = bf0[28];
    bf1[29] = bf0[29];
    bf1[30] = bf0[30];
    bf1[31] = bf0[31];
    bf1[32] = bf0[32] + bf0[47];
    bf1[33] = bf0[33] + bf0[46];
    bf1[34] = bf0[34] + bf0[45];
    bf1[35] = bf0[35] + bf0[44];
    bf1[36] = bf0[36] + bf0[43];
    bf1[37] = bf0[37] + bf0[42];
    bf1[38] = bf0[38] + bf0[41];
    bf1[39] = bf0[39] + bf0[40];
    bf1[40] = -bf0[40] + bf0[39];
    bf1[41] = -bf0[41] + bf0[38];
    bf1[42] = -bf0[42] + bf0[37];
    bf1[43] = -bf0[43] + bf0[36];
    bf1[44] = -bf0[44] + bf0[35];
    bf1[45] = -bf0[45] + bf0[34];
    bf1[46] = -bf0[46] + bf0[33];
    bf1[47] = -bf0[47] + bf0[32];
    bf1[48] = -bf0[48] + bf0[63];
    bf1[49] = -bf0[49] + bf0[62];
    bf1[50] = -bf0[50] + bf0[61];
    bf1[51] = -bf0[51] + bf0[60];
    bf1[52] = -bf0[52] + bf0[59];
    bf1[53] = -bf0[53] + bf0[58];
    bf1[54] = -bf0[54] + bf0[57];
    bf1[55] = -bf0[55] + bf0[56];
    bf1[56] = bf0[56] + bf0[55];
    bf1[57] = bf0[57] + bf0[54];
    bf1[58] = bf0[58] + bf0[53];
    bf1[59] = bf0[59] + bf0[52];
    bf1[60] = bf0[60] + bf0[51];
    bf1[61] = bf0[61] + bf0[50];
    bf1[62] = bf0[62] + bf0[49];
    bf1[63] = bf0[63] + bf0[48];

    // stage 4
    bf0 = output;
    bf1 = step;
    bf1[0] = bf0[0] + bf0[7];
    bf1[1] = bf0[1] + bf0[6];
    bf1[2] = bf0[2] + bf0[5];
    bf1[3] = bf0[3] + bf0[4];
    bf1[4] = -bf0[4] + bf0[3];
    bf1[5] = -bf0[5] + bf0[2];
    bf1[6] = -bf0[6] + bf0[1];
    bf1[7] = -bf0[7] + bf0[0];
    bf1[8] = bf0[8];
    bf1[9] = bf0[9];
    bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
    bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
    bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
    bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
    bf1[14] = bf0[14];
    bf1[15] = bf0[15];
    bf1[16] = bf0[16] + bf0[23];
    bf1[17] = bf0[17] + bf0[22];
    bf1[18] = bf0[18] + bf0[21];
    bf1[19] = bf0[19] + bf0[20];
    bf1[20] = -bf0[20] + bf0[19];
    bf1[21] = -bf0[21] + bf0[18];
    bf1[22] = -bf0[22] + bf0[17];
    bf1[23] = -bf0[23] + bf0[16];
    bf1[24] = -bf0[24] + bf0[31];
    bf1[25] = -bf0[25] + bf0[30];
    bf1[26] = -bf0[26] + bf0[29];
    bf1[27] = -bf0[27] + bf0[28];
    bf1[28] = bf0[28] + bf0[27];
    bf1[29] = bf0[29] + bf0[26];
    bf1[30] = bf0[30] + bf0[25];
    bf1[31] = bf0[31] + bf0[24];
    bf1[32] = bf0[32];
    bf1[33] = bf0[33];
    bf1[34] = bf0[34];
    bf1[35] = bf0[35];
    bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit);
    bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit);
    bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit);
    bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit);
    bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit);
    bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit);
    bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit);
    bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit);
    bf1[44] = bf0[44];
    bf1[45] = bf0[45];
    bf1[46] = bf0[46];
    bf1[47] = bf0[47];
    bf1[48] = bf0[48];
    bf1[49] = bf0[49];
    bf1[50] = bf0[50];
    bf1[51] = bf0[51];
    bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit);
    bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit);
    bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit);
    bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit);
    bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit);
    bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit);
    bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit);
    bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit);
    bf1[60] = bf0[60];
    bf1[61] = bf0[61];
    bf1[62] = bf0[62];
    bf1[63] = bf0[63];

    // stage 5
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0] + bf0[3];
    bf1[1] = bf0[1] + bf0[2];
    bf1[4] = bf0[4];
    bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
    bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
    bf1[7] = bf0[7];
    bf1[8] = bf0[8] + bf0[11];
    bf1[9] = bf0[9] + bf0[10];
    bf1[10] = -bf0[10] + bf0[9];
    bf1[11] = -bf0[11] + bf0[8];
    bf1[12] = -bf0[12] + bf0[15];
    bf1[13] = -bf0[13] + bf0[14];
    bf1[14] = bf0[14] + bf0[13];
    bf1[15] = bf0[15] + bf0[12];
    bf1[16] = bf0[16];
    bf1[17] = bf0[17];
    bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
    bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
    bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
    bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
    bf1[22] = bf0[22];
    bf1[23] = bf0[23];
    bf1[24] = bf0[24];
    bf1[25] = bf0[25];
    bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
    bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
    bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
    bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
    bf1[30] = bf0[30];
    bf1[31] = bf0[31];
    bf1[32] = bf0[32] + bf0[39];
    bf1[33] = bf0[33] + bf0[38];
    bf1[34] = bf0[34] + bf0[37];
    bf1[35] = bf0[35] + bf0[36];
    bf1[36] = -bf0[36] + bf0[35];
    bf1[37] = -bf0[37] + bf0[34];
    bf1[38] = -bf0[38] + bf0[33];
    bf1[39] = -bf0[39] + bf0[32];
    bf1[40] = -bf0[40] + bf0[47];
    bf1[41] = -bf0[41] + bf0[46];
    bf1[42] = -bf0[42] + bf0[45];
    bf1[43] = -bf0[43] + bf0[44];
    bf1[44] = bf0[44] + bf0[43];
    bf1[45] = bf0[45] + bf0[42];
    bf1[46] = bf0[46] + bf0[41];
    bf1[47] = bf0[47] + bf0[40];
    bf1[48] = bf0[48] + bf0[55];
    bf1[49] = bf0[49] + bf0[54];
    bf1[50] = bf0[50] + bf0[53];
    bf1[51] = bf0[51] + bf0[52];
    bf1[52] = -bf0[52] + bf0[51];
    bf1[53] = -bf0[53] + bf0[50];
    bf1[54] = -bf0[54] + bf0[49];
    bf1[55] = -bf0[55] + bf0[48];
    bf1[56] = -bf0[56] + bf0[63];
    bf1[57] = -bf0[57] + bf0[62];
    bf1[58] = -bf0[58] + bf0[61];
    bf1[59] = -bf0[59] + bf0[60];
    bf1[60] = bf0[60] + bf0[59];
    bf1[61] = bf0[61] + bf0[58];
    bf1[62] = bf0[62] + bf0[57];
    bf1[63] = bf0[63] + bf0[56];

    // stage 6 -- from here on, lanes feeding only discarded coefficients
    // (e.g. indices 1-3) are no longer computed.
    bf0 = output;
    bf1 = step;
    bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
    bf1[4] = bf0[4] + bf0[5];
    bf1[7] = bf0[7] + bf0[6];
    bf1[8] = bf0[8];
    bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
    bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
    bf1[11] = bf0[11];
    bf1[12] = bf0[12];
    bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
    bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
    bf1[15] = bf0[15];
    bf1[16] = bf0[16] + bf0[19];
    bf1[17] = bf0[17] + bf0[18];
    bf1[18] = -bf0[18] + bf0[17];
    bf1[19] = -bf0[19] + bf0[16];
    bf1[20] = -bf0[20] + bf0[23];
    bf1[21] = -bf0[21] + bf0[22];
    bf1[22] = bf0[22] + bf0[21];
    bf1[23] = bf0[23] + bf0[20];
    bf1[24] = bf0[24] + bf0[27];
    bf1[25] = bf0[25] + bf0[26];
    bf1[26] = -bf0[26] + bf0[25];
    bf1[27] = -bf0[27] + bf0[24];
    bf1[28] = -bf0[28] + bf0[31];
    bf1[29] = -bf0[29] + bf0[30];
    bf1[30] = bf0[30] + bf0[29];
    bf1[31] = bf0[31] + bf0[28];
    bf1[32] = bf0[32];
    bf1[33] = bf0[33];
    bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit);
    bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit);
    bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit);
    bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit);
    bf1[38] = bf0[38];
    bf1[39] = bf0[39];
    bf1[40] = bf0[40];
    bf1[41] = bf0[41];
    bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit);
    bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit);
    bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit);
    bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit);
    bf1[46] = bf0[46];
    bf1[47] = bf0[47];
    bf1[48] = bf0[48];
    bf1[49] = bf0[49];
    bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit);
    bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit);
    bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit);
    bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit);
    bf1[54] = bf0[54];
    bf1[55] = bf0[55];
    bf1[56] = bf0[56];
    bf1[57] = bf0[57];
    bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit);
    bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit);
    bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit);
    bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit);
    bf1[62] = bf0[62];
    bf1[63] = bf0[63];

    // stage 7
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0];
    bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
    bf1[8] = bf0[8] + bf0[9];
    bf1[11] = bf0[11] + bf0[10];
    bf1[12] = bf0[12] + bf0[13];
    bf1[15] = bf0[15] + bf0[14];
    bf1[16] = bf0[16];
    bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
    bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
    bf1[19] = bf0[19];
    bf1[20] = bf0[20];
    bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
    bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
    bf1[23] = bf0[23];
    bf1[24] = bf0[24];
    bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
    bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
    bf1[27] = bf0[27];
    bf1[28] = bf0[28];
    bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
    bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
    bf1[31] = bf0[31];
    bf1[32] = bf0[32] + bf0[35];
    bf1[33] = bf0[33] + bf0[34];
    bf1[34] = -bf0[34] + bf0[33];
    bf1[35] = -bf0[35] + bf0[32];
    bf1[36] = -bf0[36] + bf0[39];
    bf1[37] = -bf0[37] + bf0[38];
    bf1[38] = bf0[38] + bf0[37];
    bf1[39] = bf0[39] + bf0[36];
    bf1[40] = bf0[40] + bf0[43];
    bf1[41] = bf0[41] + bf0[42];
    bf1[42] = -bf0[42] + bf0[41];
    bf1[43] = -bf0[43] + bf0[40];
    bf1[44] = -bf0[44] + bf0[47];
    bf1[45] = -bf0[45] + bf0[46];
    bf1[46] = bf0[46] + bf0[45];
    bf1[47] = bf0[47] + bf0[44];
    bf1[48] = bf0[48] + bf0[51];
    bf1[49] = bf0[49] + bf0[50];
    bf1[50] = -bf0[50] + bf0[49];
    bf1[51] = -bf0[51] + bf0[48];
    bf1[52] = -bf0[52] + bf0[55];
    bf1[53] = -bf0[53] + bf0[54];
    bf1[54] = bf0[54] + bf0[53];
    bf1[55] = bf0[55] + bf0[52];
    bf1[56] = bf0[56] + bf0[59];
    bf1[57] = bf0[57] + bf0[58];
    bf1[58] = -bf0[58] + bf0[57];
    bf1[59] = -bf0[59] + bf0[56];
    bf1[60] = -bf0[60] + bf0[63];
    bf1[61] = -bf0[61] + bf0[62];
    bf1[62] = bf0[62] + bf0[61];
    bf1[63] = bf0[63] + bf0[60];

    // stage 8
    bf0 = output;
    bf1 = step;
    bf1[0] = bf0[0];
    bf1[4] = bf0[4];
    bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
    bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
    bf1[16] = bf0[16] + bf0[17];
    bf1[19] = bf0[19] + bf0[18];
    bf1[20] = bf0[20] + bf0[21];
    bf1[23] = bf0[23] + bf0[22];
    bf1[24] = bf0[24] + bf0[25];
    bf1[27] = bf0[27] + bf0[26];
    bf1[28] = bf0[28] + bf0[29];
    bf1[31] = bf0[31] + bf0[30];
    bf1[32] = bf0[32];
    bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit);
    bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit);
    bf1[35] = bf0[35];
    bf1[36] = bf0[36];
    bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit);
    bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit);
    bf1[39] = bf0[39];
    bf1[40] = bf0[40];
    bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit);
    bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit);
    bf1[43] = bf0[43];
    bf1[44] = bf0[44];
    bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit);
    bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit);
    bf1[47] = bf0[47];
    bf1[48] = bf0[48];
    bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit);
    bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit);
    bf1[51] = bf0[51];
    bf1[52] = bf0[52];
    bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit);
    bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit);
    bf1[55] = bf0[55];
    bf1[56] = bf0[56];
    bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit);
    bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit);
    bf1[59] = bf0[59];
    bf1[60] = bf0[60];
    bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit);
    bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit);
    bf1[63] = bf0[63];

    // stage 9
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0];
    bf1[4] = bf0[4];
    bf1[8] = bf0[8];
    bf1[12] = bf0[12];
    bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
    bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
    bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
    bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
    bf1[32] = bf0[32] + bf0[33];
    bf1[35] = bf0[35] + bf0[34];
    bf1[36] = bf0[36] + bf0[37];
    bf1[39] = bf0[39] + bf0[38];
    bf1[40] = bf0[40] + bf0[41];
    bf1[43] = bf0[43] + bf0[42];
    bf1[44] = bf0[44] + bf0[45];
    bf1[47] = bf0[47] + bf0[46];
    bf1[48] = bf0[48] + bf0[49];
    bf1[51] = bf0[51] + bf0[50];
    bf1[52] = bf0[52] + bf0[53];
    bf1[55] = bf0[55] + bf0[54];
    bf1[56] = bf0[56] + bf0[57];
    bf1[59] = bf0[59] + bf0[58];
    bf1[60] = bf0[60] + bf0[61];
    bf1[63] = bf0[63] + bf0[62];

    // stage 10
    bf0 = output;
    bf1 = step;
    bf1[0] = bf0[0];
    bf1[4] = bf0[4];
    bf1[8] = bf0[8];
    bf1[12] = bf0[12];
    bf1[16] = bf0[16];
    bf1[20] = bf0[20];
    bf1[24] = bf0[24];
    bf1[28] = bf0[28];
    bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit);
    bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit);
    bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit);
    bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit);
    bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit);
    bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit);
    bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit);
    bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit);

    // stage 11 -- bit-reversal reordering of the 16 retained coefficients.
    bf0 = step;
    bf1 = output;
    bf1[0] = bf0[0];
    bf1[1] = bf0[32];
    bf1[2] = bf0[16];
    bf1[3] = bf0[48];
    bf1[4] = bf0[8];
    bf1[5] = bf0[40];
    bf1[6] = bf0[24];
    bf1[7] = bf0[56];
    bf1[8] = bf0[4];
    bf1[9] = bf0[36];
    bf1[10] = bf0[20];
    bf1[11] = bf0[52];
    bf1[12] = bf0[12];
    bf1[13] = bf0[44];
    bf1[14] = bf0[28];
    bf1[15] = bf0[60];
}
7390
av1_fidentity64_N4_c(const int32_t * input,int32_t * output,int8_t cos_bit,const int8_t * stage_range)7391 void av1_fidentity64_N4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
7392 const int8_t *stage_range) {
7393 (void)stage_range;
7394 (void)cos_bit;
7395 for (int32_t i = 0; i < 16; ++i)
7396 output[i] = round_shift((int64_t)input[i] * 4 * new_sqrt2, new_sqrt2_bits);
7397 assert(stage_range[0] + new_sqrt2_bits <= 32);
7398 }
7399
fwd_txfm_type_to_func_N4(TxfmType txfmtype)7400 static INLINE TxfmFunc fwd_txfm_type_to_func_N4(TxfmType txfmtype) {
7401 switch (txfmtype) {
7402 case TXFM_TYPE_DCT4: return svt_av1_fdct4_new_N4;
7403 case TXFM_TYPE_DCT8: return svt_av1_fdct8_new_N4;
7404 case TXFM_TYPE_DCT16: return svt_av1_fdct16_new_N4;
7405 case TXFM_TYPE_DCT32: return svt_av1_fdct32_new_N4;
7406 case TXFM_TYPE_DCT64: return svt_av1_fdct64_new_N4;
7407 case TXFM_TYPE_ADST4: return svt_av1_fadst4_new_N4;
7408 case TXFM_TYPE_ADST8: return svt_av1_fadst8_new_N4;
7409 case TXFM_TYPE_ADST16: return svt_av1_fadst16_new_N4;
7410 case TXFM_TYPE_ADST32: return av1_fadst32_new;
7411 case TXFM_TYPE_IDENTITY4: return svt_av1_fidentity4_N4_c;
7412 case TXFM_TYPE_IDENTITY8: return svt_av1_fidentity8_N4_c;
7413 case TXFM_TYPE_IDENTITY16: return svt_av1_fidentity16_N4_c;
7414 case TXFM_TYPE_IDENTITY32: return svt_av1_fidentity32_N4_c;
7415 case TXFM_TYPE_IDENTITY64: return av1_fidentity64_N4_c;
7416 default: assert(0); return NULL;
7417 }
7418 }
7419
/*
 * Core 2-D forward transform, N4 ("quarter") variant.
 *
 * Runs the column pass over all columns, the row pass over only the first
 * quarter of the rows, and finally zeroes every coefficient outside the
 * top-left (width/4) x (height/4) corner of the output block.
 *
 * input        - source residual samples, read with input_stride
 * input_stride - stride of the input, in samples
 * output       - destination coefficients; also reused as scratch space
 *                during the column pass (temp_in/temp_out below)
 * cfg          - transform configuration: sizes, per-stage shifts, flip
 *                flags and 1-D transform types
 * buf          - caller-provided intermediate buffer holding the column
 *                pass results (txfm_size_col * txfm_size_row int32_t)
 * bit_depth    - bit depth used to derive the per-stage range arrays
 */
static INLINE void av1_tranform_two_d_core_N4_c(int16_t *input, uint32_t input_stride,
                                                int32_t *output, const Txfm2dFlipCfg *cfg,
                                                int32_t *buf, uint8_t bit_depth) {
    int32_t c, r;
    // Note when assigning txfm_size_col, we use the txfm_size from the
    // row configuration and vice versa. This is intentionally done to
    // accurately perform rectangular transforms. When the transform is
    // rectangular, the number of columns will be the same as the
    // txfm_size stored in the row cfg struct. It will make no difference
    // for square transforms.
    const int32_t txfm_size_col = tx_size_wide[cfg->tx_size];
    const int32_t txfm_size_row = tx_size_high[cfg->tx_size];
    // Take the shift from the larger dimension in the rectangular case.
    const int8_t *shift     = cfg->shift;
    const int32_t rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
    int8_t        stage_range_col[MAX_TXFM_STAGE_NUM];
    int8_t        stage_range_row[MAX_TXFM_STAGE_NUM];
    assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM);
    assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM);
    svt_av1_gen_fwd_stage_range(stage_range_col, stage_range_row, cfg, bit_depth);

    const int8_t cos_bit_col = cfg->cos_bit_col;
    const int8_t cos_bit_row = cfg->cos_bit_row;
    // Resolve the 1-D kernels; the N4 variants only produce 1/4 of their outputs.
    const TxfmFunc txfm_func_col = fwd_txfm_type_to_func_N4(cfg->txfm_type_col);
    const TxfmFunc txfm_func_row = fwd_txfm_type_to_func_N4(cfg->txfm_type_row);
    ASSERT(txfm_func_col != NULL);
    ASSERT(txfm_func_row != NULL);
    // use output buffer as temp buffer
    int32_t *temp_in  = output;
    int32_t *temp_out = output + txfm_size_row;

    // Columns: every column is transformed; per-column results land in buf.
    for (c = 0; c < txfm_size_col; ++c) {
        if (cfg->ud_flip == 0)
            for (r = 0; r < txfm_size_row; ++r) temp_in[r] = input[r * input_stride + c];
        else {
            for (r = 0; r < txfm_size_row; ++r)
                // flip upside down
                temp_in[r] = input[(txfm_size_row - r - 1) * input_stride + c];
        }
        svt_av1_round_shift_array_c(
            temp_in, txfm_size_row, -shift[0]); // NM svt_av1_round_shift_array_c
        txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
        // Only the first quarter of the column outputs is meaningful (N4 kernel).
        svt_av1_round_shift_array_c(
            temp_out, txfm_size_row / 4, -shift[1]); // NM svt_av1_round_shift_array_c
        if (cfg->lr_flip == 0) {
            for (r = 0; r < txfm_size_row; ++r) buf[r * txfm_size_col + c] = temp_out[r];
        } else {
            for (r = 0; r < txfm_size_row; ++r)
                // flip from left to right
                buf[r * txfm_size_col + (txfm_size_col - c - 1)] = temp_out[r];
        }
    }

    // Rows: only the first quarter of the rows is transformed.
    for (r = 0; r < txfm_size_row / 4; ++r) {
        txfm_func_row(
            buf + r * txfm_size_col, output + r * txfm_size_col, cos_bit_row, stage_range_row);
        svt_av1_round_shift_array_c(output + r * txfm_size_col, txfm_size_col / 4, -shift[2]);

        if (abs(rect_type) == 1) {
            // Multiply everything by Sqrt2 if the transform is rectangular and the
            // size difference is a factor of 2.
            for (c = 0; c < txfm_size_col / 4; ++c) {
                output[r * txfm_size_col + c] = round_shift(
                    (int64_t)output[r * txfm_size_col + c] * new_sqrt2, new_sqrt2_bits);
            }
        }
    }
    // Clear everything outside the top-left (col/4) x (row/4) region.
    for (int i = 0; i < (txfm_size_col * txfm_size_row); i++)
        if (i % txfm_size_col >= (txfm_size_col >> 2) || i / txfm_size_col >= (txfm_size_row >> 2))
            output[i] = 0;
}
7493
/* N4 forward 2-D transform, 64x64: only the top-left 16x16 coefficients of
 * output are computed; the rest is zeroed by the core routine. */
void av1_transform_two_d_64x64_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                    TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[64 * 64];
    av1_transform_config(transform_type, TX_64X64, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7502
/* N4 forward 2-D transform, 32x32: only the top-left 8x8 coefficients of
 * output are computed; the rest is zeroed by the core routine. */
void av1_transform_two_d_32x32_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                    TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[32 * 32];
    av1_transform_config(transform_type, TX_32X32, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7511
/* N4 forward 2-D transform, 16x16: only the top-left 4x4 coefficients of
 * output are computed; the rest is zeroed by the core routine. */
void av1_transform_two_d_16x16_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                    TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[16 * 16];
    av1_transform_config(transform_type, TX_16X16, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7520
/* N4 forward 2-D transform, 8x8: only the top-left 2x2 coefficients of
 * output are computed; the rest is zeroed by the core routine. */
void av1_transform_two_d_8x8_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                  TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[8 * 8];
    av1_transform_config(transform_type, TX_8X8, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7529
/* N4 forward 2-D transform, 4x4: only the top-left 1x1 coefficient of
 * output is computed; the rest is zeroed by the core routine. */
void av1_transform_two_d_4x4_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                  TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[4 * 4];
    av1_transform_config(transform_type, TX_4X4, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7538
/* N4 forward 2-D transform, 64x32: only the top-left 16x8 coefficients of
 * output are computed; the rest is zeroed by the core routine. */
void svt_av1_fwd_txfm2d_64x32_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                   TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[64 * 32];
    av1_transform_config(transform_type, TX_64X32, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7547
/* N4 forward 2-D transform, 32x64: only the top-left 8x16 coefficients of
 * output are computed; the rest is zeroed by the core routine. */
void svt_av1_fwd_txfm2d_32x64_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                   TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[32 * 64];
    av1_transform_config(transform_type, TX_32X64, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7556
/* N4 forward 2-D transform, 64x16: only the top-left 16x4 coefficients of
 * output are computed; the rest is zeroed by the core routine. */
void svt_av1_fwd_txfm2d_64x16_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                   TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[64 * 16];
    av1_transform_config(transform_type, TX_64X16, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7565
/* N4 forward 2-D transform, 16x64: only the top-left 4x16 coefficients of
 * output are computed; the rest is zeroed by the core routine. */
void svt_av1_fwd_txfm2d_16x64_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                   TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[16 * 64];
    av1_transform_config(transform_type, TX_16X64, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7574
/* N4 forward 2-D transform, 32x16: only the top-left 8x4 coefficients of
 * output are computed; the rest is zeroed by the core routine. */
void svt_av1_fwd_txfm2d_32x16_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                   TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[32 * 16];
    av1_transform_config(transform_type, TX_32X16, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7583
/* N4 forward 2-D transform, 16x32: only the top-left 4x8 coefficients of
 * output are computed; the rest is zeroed by the core routine. */
void svt_av1_fwd_txfm2d_16x32_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                   TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[16 * 32];
    av1_transform_config(transform_type, TX_16X32, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7592
/* N4 forward 2-D transform, 16x8: only the top-left 4x2 coefficients of
 * output are computed; the rest is zeroed by the core routine. */
void svt_av1_fwd_txfm2d_16x8_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                  TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[16 * 8];
    av1_transform_config(transform_type, TX_16X8, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7601
/* N4 forward 2-D transform, 8x16: only the top-left 2x4 coefficients of
 * output are computed; the rest is zeroed by the core routine. */
void svt_av1_fwd_txfm2d_8x16_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                  TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[8 * 16];
    av1_transform_config(transform_type, TX_8X16, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7610
/* N4 forward 2-D transform, 32x8: only the top-left 8x2 coefficients of
 * output are computed; the rest is zeroed by the core routine. */
void svt_av1_fwd_txfm2d_32x8_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                  TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[32 * 8];
    av1_transform_config(transform_type, TX_32X8, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7619
/* N4 forward 2-D transform, 8x32: only the top-left 2x8 coefficients of
 * output are computed; the rest is zeroed by the core routine. */
void svt_av1_fwd_txfm2d_8x32_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                  TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[8 * 32];
    av1_transform_config(transform_type, TX_8X32, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7628
/* N4 forward 2-D transform, 16x4: only the top-left 4x1 coefficients of
 * output are computed; the rest is zeroed by the core routine. */
void svt_av1_fwd_txfm2d_16x4_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                  TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[16 * 4];
    av1_transform_config(transform_type, TX_16X4, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7637
/* N4 forward 2-D transform, 4x16: only the top-left 1x4 coefficients of
 * output are computed; the rest is zeroed by the core routine. */
void svt_av1_fwd_txfm2d_4x16_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                  TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[4 * 16];
    av1_transform_config(transform_type, TX_4X16, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7646
/* N4 forward 2-D transform, 8x4: only the top-left 2x1 coefficients of
 * output are computed; the rest is zeroed by the core routine. */
void svt_av1_fwd_txfm2d_8x4_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                 TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[8 * 4];
    av1_transform_config(transform_type, TX_8X4, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7655
/* N4 forward 2-D transform, 4x8: only the top-left 1x2 coefficients of
 * output are computed; the rest is zeroed by the core routine. */
void svt_av1_fwd_txfm2d_4x8_N4_c(int16_t *input, int32_t *output, uint32_t input_stride,
                                 TxType transform_type, uint8_t bit_depth) {
    Txfm2dFlipCfg cfg;
    int32_t       scratch[4 * 8];
    av1_transform_config(transform_type, TX_4X8, &cfg);
    av1_tranform_two_d_core_N4_c(input, input_stride, output, &cfg, scratch, bit_depth);
}
7664
7665