1 // Copyright (c) 2018-2020, The rav1e contributors. All rights reserved
2 //
3 // This source code is subject to the terms of the BSD 2 Clause License and
4 // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
5 // was not distributed with this source code in the LICENSE file, you can
6 // obtain it at www.aomedia.org/license/software. If the Alliance for Open
7 // Media Patent License 1.0 was not distributed with this source code in the
8 // PATENTS file, you can obtain it at www.aomedia.org/license/patent.
9 
10 use super::TxSize;
11 use super::TxType;
12 
13 use super::HTX_TAB;
14 use super::VTX_TAB;
15 
/// Per-stage shift amounts applied during one forward transform pass
/// (positive = right shift, negative = left shift).
pub type TxfmShift = [i8; 3];
/// One `TxfmShift` per bit depth; selected with `(bd - 8) / 2` in
/// `Txfm2DFlipCfg::fwd`, so index 0 = 8-bit, 1 = 10-bit, 2 = 12-bit.
pub type TxfmShifts = [TxfmShift; 3];

// Shift so that the first shift is 4 - (bd - 8) to align with the initial
// design of daala_tx
// 8 bit 4x4 is an exception and only shifts by 3 in the first stage
const FWD_SHIFT_4X4: TxfmShifts = [[3, 0, 0], [2, 0, 1], [0, 0, 3]];
const FWD_SHIFT_8X8: TxfmShifts = [[4, -1, 0], [2, 0, 1], [0, 0, 3]];
const FWD_SHIFT_16X16: TxfmShifts = [[4, -1, 0], [2, 0, 1], [0, 0, 3]];
const FWD_SHIFT_32X32: TxfmShifts = [[4, -2, 0], [2, 0, 0], [0, 0, 2]];
const FWD_SHIFT_64X64: TxfmShifts = [[4, -1, -2], [2, 0, -1], [0, 0, 1]];
const FWD_SHIFT_4X8: TxfmShifts = [[4, -1, 0], [2, 0, 1], [0, 0, 3]];
const FWD_SHIFT_8X4: TxfmShifts = [[4, -1, 0], [2, 0, 1], [0, 0, 3]];
const FWD_SHIFT_8X16: TxfmShifts = [[4, -1, 0], [2, 0, 1], [0, 0, 3]];
const FWD_SHIFT_16X8: TxfmShifts = [[4, -1, 0], [2, 0, 1], [0, 0, 3]];
const FWD_SHIFT_16X32: TxfmShifts = [[4, -2, 0], [2, 0, 0], [0, 0, 2]];
const FWD_SHIFT_32X16: TxfmShifts = [[4, -2, 0], [2, 0, 0], [0, 0, 2]];
const FWD_SHIFT_32X64: TxfmShifts = [[4, -1, -2], [2, 0, -1], [0, 0, 1]];
const FWD_SHIFT_64X32: TxfmShifts = [[4, -1, -2], [2, 0, -1], [0, 0, 1]];
const FWD_SHIFT_4X16: TxfmShifts = [[4, -1, 0], [2, 0, 1], [0, 0, 3]];
const FWD_SHIFT_16X4: TxfmShifts = [[4, -1, 0], [2, 0, 1], [0, 0, 3]];
const FWD_SHIFT_8X32: TxfmShifts = [[4, -1, 0], [2, 0, 1], [0, 0, 3]];
const FWD_SHIFT_32X8: TxfmShifts = [[4, -1, 0], [2, 0, 1], [0, 0, 3]];
const FWD_SHIFT_16X64: TxfmShifts = [[4, -2, 0], [2, 0, 0], [0, 0, 2]];
const FWD_SHIFT_64X16: TxfmShifts = [[4, -2, 0], [2, 0, 0], [0, 0, 2]];
41 
/// Forward-transform shift sets, indexed by `TxSize as usize` (see the
/// lookup in `Txfm2DFlipCfg::fwd`). The entry order must therefore match
/// the declaration order of the `TxSize` enum.
pub const FWD_TXFM_SHIFT_LS: [TxfmShifts; TxSize::TX_SIZES_ALL] = [
  FWD_SHIFT_4X4,
  FWD_SHIFT_8X8,
  FWD_SHIFT_16X16,
  FWD_SHIFT_32X32,
  FWD_SHIFT_64X64,
  FWD_SHIFT_4X8,
  FWD_SHIFT_8X4,
  FWD_SHIFT_8X16,
  FWD_SHIFT_16X8,
  FWD_SHIFT_16X32,
  FWD_SHIFT_32X16,
  FWD_SHIFT_32X64,
  FWD_SHIFT_64X32,
  FWD_SHIFT_4X16,
  FWD_SHIFT_16X4,
  FWD_SHIFT_8X32,
  FWD_SHIFT_32X8,
  FWD_SHIFT_16X64,
  FWD_SHIFT_64X16,
];
63 
/// 1-D transform kernel selector used by the forward transform.
///
/// `Invalid` marks size/type combinations that have no kernel (see
/// `AV1_TXFM_TYPE_LS`); `Txfm2DFlipCfg::fwd` asserts it is never chosen.
///
/// `Eq` is derived alongside `PartialEq`: the enum is fieldless, so
/// equality is trivially total (clippy: `derive_partial_eq_without_eq`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TxfmType {
  DCT4,
  DCT8,
  DCT16,
  DCT32,
  DCT64,
  ADST4,
  ADST8,
  ADST16,
  Identity4,
  Identity8,
  Identity16,
  Identity32,
  Invalid,
}
80 
impl TxfmType {
  // Number of 1-D transform classes a row/column can use.
  const TX_TYPES_1D: usize = 4;
  // Kernel lookup table: rows are the five 1-D sizes (4, 8, 16, 32, 64,
  // matching `width_index()`/`height_index()`); columns are the 1-D
  // transform class taken from VTX_TAB/HTX_TAB. Columns 1 and 2 share the
  // same ADST kernel — presumably ADST and FLIPADST, with the flip applied
  // separately via `get_flip_cfg` (confirm against the VTX_TAB/HTX_TAB
  // definitions). `Invalid` entries are combinations with no kernel.
  const AV1_TXFM_TYPE_LS: [[TxfmType; Self::TX_TYPES_1D]; 5] = [
    [TxfmType::DCT4, TxfmType::ADST4, TxfmType::ADST4, TxfmType::Identity4],
    [TxfmType::DCT8, TxfmType::ADST8, TxfmType::ADST8, TxfmType::Identity8],
    [
      TxfmType::DCT16,
      TxfmType::ADST16,
      TxfmType::ADST16,
      TxfmType::Identity16,
    ],
    [
      TxfmType::DCT32,
      TxfmType::Invalid,
      TxfmType::Invalid,
      TxfmType::Identity32,
    ],
    [TxfmType::DCT64, TxfmType::Invalid, TxfmType::Invalid, TxfmType::Invalid],
  ];
}
101 
/// Fully resolved configuration for one 2-D forward transform.
#[derive(Debug, Clone, Copy)]
pub struct Txfm2DFlipCfg {
  pub tx_size: TxSize,
  /// Flip upside down
  pub ud_flip: bool,
  /// Flip left to right
  pub lr_flip: bool,
  /// Per-stage shifts for the selected bit depth.
  pub shift: TxfmShift,
  /// 1-D kernel applied to columns.
  pub txfm_type_col: TxfmType,
  /// 1-D kernel applied to rows.
  pub txfm_type_row: TxfmType,
}
113 
114 impl Txfm2DFlipCfg {
fwd(tx_type: TxType, tx_size: TxSize, bd: usize) -> Self115   pub fn fwd(tx_type: TxType, tx_size: TxSize, bd: usize) -> Self {
116     let tx_type_1d_col = VTX_TAB[tx_type as usize];
117     let tx_type_1d_row = HTX_TAB[tx_type as usize];
118     let txw_idx = tx_size.width_index();
119     let txh_idx = tx_size.height_index();
120     let txfm_type_col =
121       TxfmType::AV1_TXFM_TYPE_LS[txh_idx][tx_type_1d_col as usize];
122     let txfm_type_row =
123       TxfmType::AV1_TXFM_TYPE_LS[txw_idx][tx_type_1d_row as usize];
124     assert_ne!(txfm_type_col, TxfmType::Invalid);
125     assert_ne!(txfm_type_row, TxfmType::Invalid);
126     let (ud_flip, lr_flip) = Self::get_flip_cfg(tx_type);
127 
128     Txfm2DFlipCfg {
129       tx_size,
130       ud_flip,
131       lr_flip,
132       shift: FWD_TXFM_SHIFT_LS[tx_size as usize][(bd - 8) / 2],
133       txfm_type_col,
134       txfm_type_row,
135     }
136   }
137 
138   /// Determine the flip config, returning (ud_flip, lr_flip)
get_flip_cfg(tx_type: TxType) -> (bool, bool)139   fn get_flip_cfg(tx_type: TxType) -> (bool, bool) {
140     use self::TxType::*;
141     match tx_type {
142       DCT_DCT | ADST_DCT | DCT_ADST | ADST_ADST | IDTX | V_DCT | H_DCT
143       | V_ADST | H_ADST => (false, false),
144       FLIPADST_DCT | FLIPADST_ADST | V_FLIPADST => (true, false),
145       DCT_FLIPADST | ADST_FLIPADST | H_FLIPADST => (false, true),
146       FLIPADST_FLIPADST => (true, true),
147     }
148   }
149 }
150 
// Stores each listed expression into the next slot of `$arr`, in order.
// The counter starts at -1 and is incremented before each store, so every
// write to the counter is followed by a read.
macro_rules! store_coeffs {
  ( $arr:expr, $( $x:expr ),* ) => {{
    let mut idx: i32 = -1;
    $(
      idx += 1;
      $arr[idx as usize] = $x;
    )*
  }};
}
162 
163 macro_rules! impl_1d_tx {
164 () => {
165   impl_1d_tx! {allow(), }
166 };
167 
168 ($m:meta, $($s:ident),*) => {
// 2-point rotation by Pi/4 over the generic coefficient operations.
// `ADD`/`SUB` are chosen by each implementor so one kernel body covers the
// add/sub and exact/averaging variants.
trait RotateKernelPi4<T: TxOperations> {
  const ADD: $($s)* fn(T, T) -> T;
  const SUB: $($s)* fn(T, T) -> T;

  // `m` carries two fixed-point (multiplier, shift) constant pairs.
  #[$m]
  $($s)* fn kernel(p0: T, p1: T, m: ((i32, i32), (i32, i32))) -> (T, T) {
    let t = Self::ADD(p1, p0);
    let (a, out0) = (p0.tx_mul(m.0), t.tx_mul(m.1));
    let out1 = Self::SUB(a, out0);
    (out0, out1)
  }
}

// Marker types selecting the ADD/SUB combination used by the Pi/4 kernel.
struct RotatePi4Add;
struct RotatePi4AddAvg;
struct RotatePi4Sub;
struct RotatePi4SubAvg;

impl<T: TxOperations> RotateKernelPi4<T> for RotatePi4Add {
  const ADD: $($s)* fn(T, T) -> T = T::add;
  const SUB: $($s)* fn(T, T) -> T = T::sub;
}

impl<T: TxOperations> RotateKernelPi4<T> for RotatePi4AddAvg {
  const ADD: $($s)* fn(T, T) -> T = T::add_avg;
  const SUB: $($s)* fn(T, T) -> T = T::sub;
}

impl<T: TxOperations> RotateKernelPi4<T> for RotatePi4Sub {
  const ADD: $($s)* fn(T, T) -> T = T::sub;
  const SUB: $($s)* fn(T, T) -> T = T::add;
}

impl<T: TxOperations> RotateKernelPi4<T> for RotatePi4SubAvg {
  const ADD: $($s)* fn(T, T) -> T = T::sub_avg;
  const SUB: $($s)* fn(T, T) -> T = T::add;
}
206 
// General 2-point rotation with three fixed-point constants.
// `half_kernel` receives `p0` as a pair so asymmetric stages can pass a
// halved value in `p0.0` alongside the full value in `p0.1`.
trait RotateKernel<T: TxOperations> {
  const ADD: $($s)* fn(T, T) -> T;
  const SUB: $($s)* fn(T, T) -> T;
  const SHIFT: $($s)* fn(T) -> T;

  #[$m]
  $($s)* fn half_kernel(
    p0: (T, T), p1: T, m: ((i32, i32), (i32, i32), (i32, i32)),
  ) -> (T, T) {
    let t = Self::ADD(p1, p0.0);
    let (a, b, c) = (p0.1.tx_mul(m.0), p1.tx_mul(m.1), t.tx_mul(m.2));
    let out0 = b.add(c);
    let shifted = Self::SHIFT(c);
    let out1 = Self::SUB(a, shifted);
    (out0, out1)
  }

  // Symmetric entry point: both halves of `p0` are the same value.
  #[$m]
  $($s)* fn kernel(p0: T, p1: T, m: ((i32, i32), (i32, i32), (i32, i32))) -> (T, T) {
    Self::half_kernel((p0, p0), p1, m)
  }
}

// Rotation variant whose second output is negated relative to
// `RotateKernel` (`c - a` instead of `a - c`).
trait RotateKernelNeg<T: TxOperations> {
  const ADD: $($s)* fn(T, T) -> T;

  #[$m]
  $($s)* fn kernel(p0: T, p1: T, m: ((i32, i32), (i32, i32), (i32, i32))) -> (T, T) {
    let t = Self::ADD(p0, p1);
    let (a, b, c) = (p0.tx_mul(m.0), p1.tx_mul(m.1), t.tx_mul(m.2));
    let out0 = b.sub(c);
    let out1 = c.sub(a);
    (out0, out1)
  }
}
242 
// Marker types selecting the ADD/SUB/SHIFT combination used by the
// general rotation kernels above.
struct RotateAdd;
struct RotateAddAvg;
struct RotateAddShift;
struct RotateSub;
struct RotateSubAvg;
struct RotateSubShift;
struct RotateNeg;
struct RotateNegAvg;

impl<T: TxOperations> RotateKernel<T> for RotateAdd {
  const ADD: $($s)* fn(T, T) -> T = T::add;
  const SUB: $($s)* fn(T, T) -> T = T::sub;
  const SHIFT: $($s)* fn(T) -> T = T::copy_fn;
}

impl<T: TxOperations> RotateKernel<T> for RotateAddAvg {
  const ADD: $($s)* fn(T, T) -> T = T::add_avg;
  const SUB: $($s)* fn(T, T) -> T = T::sub;
  const SHIFT: $($s)* fn(T) -> T = T::copy_fn;
}

impl<T: TxOperations> RotateKernel<T> for RotateAddShift {
  const ADD: $($s)* fn(T, T) -> T = T::add;
  const SUB: $($s)* fn(T, T) -> T = T::sub;
  const SHIFT: $($s)* fn(T) -> T = T::rshift1;
}

impl<T: TxOperations> RotateKernel<T> for RotateSub {
  const ADD: $($s)* fn(T, T) -> T = T::sub;
  const SUB: $($s)* fn(T, T) -> T = T::add;
  const SHIFT: $($s)* fn(T) -> T = T::copy_fn;
}

impl<T: TxOperations> RotateKernel<T> for RotateSubAvg {
  const ADD: $($s)* fn(T, T) -> T = T::sub_avg;
  const SUB: $($s)* fn(T, T) -> T = T::add;
  const SHIFT: $($s)* fn(T) -> T = T::copy_fn;
}

impl<T: TxOperations> RotateKernel<T> for RotateSubShift {
  const ADD: $($s)* fn(T, T) -> T = T::sub;
  const SUB: $($s)* fn(T, T) -> T = T::add;
  const SHIFT: $($s)* fn(T) -> T = T::rshift1;
}

impl<T: TxOperations> RotateKernelNeg<T> for RotateNeg {
  const ADD: $($s)* fn(T, T) -> T = T::sub;
}

impl<T: TxOperations> RotateKernelNeg<T> for RotateNegAvg {
  const ADD: $($s)* fn(T, T) -> T = T::sub_avg;
}
295 
// Butterfly producing a sum together with its halved value:
// returns ((p0h, p0 + p1), p1 - p0h) where p0h = (p0 + p1) >> 1.
// The (half, full) pair feeds the *_asym stages below.
#[inline]
#[$m]
$($s)* fn butterfly_add<T: TxOperations>(p0: T, p1: T) -> ((T, T), T) {
  let p0 = p0.add(p1);
  let p0h = p0.rshift1();
  let p1h = p1.sub(p0h);
  ((p0h, p0), p1h)
}

// As butterfly_add but built on a difference:
// returns ((p0h, p0 - p1), p1 + p0h) where p0h = (p0 - p1) >> 1.
#[inline]
#[$m]
$($s)* fn butterfly_sub<T: TxOperations>(p0: T, p1: T) -> ((T, T), T) {
  let p0 = p0.sub(p1);
  let p0h = p0.rshift1();
  let p1h = p1.add(p0h);
  ((p0h, p0), p1h)
}

// Negated butterfly: the (half, full) pair is produced for the second
// output instead. Returns (p0 - p1h, (p1h, p0 - p1)) with p1h = (p0 - p1) >> 1.
#[inline]
#[$m]
$($s)* fn butterfly_neg<T: TxOperations>(p0: T, p1: T) -> (T, (T, T)) {
  let p1 = p0.sub(p1);
  let p1h = p1.rshift1();
  let p0h = p0.sub(p1h);
  (p0h, (p1h, p1))
}
322 
// Asymmetric-input butterflies: `p0` is a (half, full) pair produced by
// the butterflies above, the other operand is already halved.

// Returns (p0.full - p1, p1h + p0.half).
#[inline]
#[$m]
$($s)* fn butterfly_add_asym<T: TxOperations>(p0: (T, T), p1h: T) -> (T, T) {
  let p1 = p1h.add(p0.0);
  let p0 = p0.1.sub(p1);
  (p0, p1)
}

// Returns (p0.full + p1, p1h - p0.half).
#[inline]
#[$m]
$($s)* fn butterfly_sub_asym<T: TxOperations>(p0: (T, T), p1h: T) -> (T, T) {
  let p1 = p1h.sub(p0.0);
  let p0 = p0.1.add(p1);
  (p0, p1)
}

// Returns (p0h + p1.half, that sum - p1.full).
#[inline]
#[$m]
$($s)* fn butterfly_neg_asym<T: TxOperations>(p0h: T, p1: (T, T)) -> (T, T) {
  let p0 = p0h.add(p1.0);
  let p1 = p0.sub(p1.1);
  (p0, p1)
}
346 
347 #[$m]
348 $($s)* fn daala_fdct_ii_2_asym<T: TxOperations>(p0h: T, p1: (T, T)) -> (T, T) {
349   butterfly_neg_asym(p0h, p1)
350 }
351 
352 #[$m]
353 $($s)* fn daala_fdst_iv_2_asym<T: TxOperations>(p0: (T, T), p1h: T) -> (T, T) {
354   //   473/512 = (Sin[3*Pi/8] + Cos[3*Pi/8])/Sqrt[2] = 0.9238795325112867
355   // 3135/4096 = (Sin[3*Pi/8] - Cos[3*Pi/8])*Sqrt[2] = 0.7653668647301795
356   // 4433/8192 = Cos[3*Pi/8]*Sqrt[2]                 = 0.5411961001461971
357   RotateAdd::half_kernel(p0, p1h, ((473, 9), (3135, 12), (4433, 13)))
358 }
359 
// 4-point Type-II DCT built from 2-point sub-transforms; output is in the
// natural (un-permuted) order.
#[$m]
$($s)* fn daala_fdct_ii_4<T: TxOperations>(
  q0: T, q1: T, q2: T, q3: T, output: &mut [T],
) {
  // +/- Butterflies with asymmetric output.
  let (q0h, q3) = butterfly_neg(q0, q3);
  let (q1, q2h) = butterfly_add(q1, q2);

  // Embedded 2-point transforms with asymmetric input.
  let (q0, q1) = daala_fdct_ii_2_asym(q0h, q1);
  let (q3, q2) = daala_fdst_iv_2_asym(q3, q2h);

  store_coeffs!(output, q0, q1, q2, q3);
}

// In-place 4-point forward DCT; permutes the intermediate result into the
// final coefficient order (0, 2, 1, 3).
#[$m]
$($s)* fn daala_fdct4<T: TxOperations>(coeffs: &mut [T]) {
  assert!(coeffs.len() >= 4);
  let mut temp_out: [T; 4] = [T::zero(); 4];
  daala_fdct_ii_4(coeffs[0], coeffs[1], coeffs[2], coeffs[3], &mut temp_out);

  coeffs[0] = temp_out[0];
  coeffs[1] = temp_out[2];
  coeffs[2] = temp_out[1];
  coeffs[3] = temp_out[3];
}
386 
// In-place 4-point Type-VII DST (the ADST4 kernel).
#[$m]
$($s)* fn daala_fdst_vii_4<T: TxOperations>(coeffs: &mut [T]) {
  assert!(coeffs.len() >= 4);

  let q0 = coeffs[0];
  let q1 = coeffs[1];
  let q2 = coeffs[2];
  let q3 = coeffs[3];
  let t0 = q1.add(q3);
  // t1 = (q0 + q1 - q3)/2
  let t1 = q1.add(q0.sub_avg(t0));
  let t2 = q0.sub(q1);
  let t3 = q2;
  let t4 = q0.add(q3);
  // 7021/16384 ~= 2*Sin[2*Pi/9]/3 ~= 0.428525073124360
  let t0 = t0.tx_mul((7021, 14));
  // 37837/32768 ~= 4*Sin[3*Pi/9]/3 ~= 1.154700538379252
  let t1 = t1.tx_mul((37837, 15));
  // 21513/32768 ~= 2*Sin[4*Pi/9]/3 ~= 0.656538502008139
  let t2 = t2.tx_mul((21513, 15));
  // 37837/32768 ~= 4*Sin[3*Pi/9]/3 ~= 1.154700538379252
  let t3 = t3.tx_mul((37837, 15));
  // 467/2048 ~= 2*Sin[1*Pi/9]/3 ~= 0.228013428883779
  let t4 = t4.tx_mul((467, 11));
  let t3h = t3.rshift1();
  let u4 = t4.add(t3h);
  coeffs[0] = t0.add(u4);
  coeffs[1] = t1;
  coeffs[2] = t0.add(t2.sub(t3h));
  coeffs[3] = t2.add(t3.sub(u4));
}
418 
// 2-point Type-II DCT (orthonormal, symmetric input).
#[$m]
$($s)* fn daala_fdct_ii_2<T: TxOperations>(p0: T, p1: T) -> (T, T) {
  // 11585/8192 = Sin[Pi/4] + Cos[Pi/4]  = 1.4142135623730951
  // 11585/8192 = 2*Cos[Pi/4]            = 1.4142135623730951
  let (p1, p0) = RotatePi4SubAvg::kernel(p1, p0, ((11585, 13), (11585, 13)));
  (p0, p1)
}

// 2-point Type-IV DST (orthonormal, symmetric input).
#[$m]
$($s)* fn daala_fdst_iv_2<T: TxOperations>(p0: T, p1: T) -> (T, T) {
  // 10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8]  = 1.3065629648763766
  // 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8]  = 0.5411961001461971
  //  3135/4096 = 2*Cos[3*Pi/8]              = 0.7653668647301796
  RotateAddAvg::kernel(p0, p1, ((10703, 13), (8867, 14), (3135, 12)))
}
434 
// 4-point Type-II DCT with asymmetric input: even inputs pre-halved,
// odd inputs given as (half, full) pairs.
#[$m]
$($s)* fn daala_fdct_ii_4_asym<T: TxOperations>(
  q0h: T, q1: (T, T), q2h: T, q3: (T, T), output: &mut [T],
) {
  // +/- Butterflies with asymmetric input.
  let (q0, q3) = butterfly_neg_asym(q0h, q3);
  let (q1, q2) = butterfly_sub_asym(q1, q2h);

  // Embedded 2-point orthonormal transforms.
  let (q0, q1) = daala_fdct_ii_2(q0, q1);
  let (q3, q2) = daala_fdst_iv_2(q3, q2);

  store_coeffs!(output, q0, q1, q2, q3);
}
449 
// 4-point Type-IV DST with asymmetric input: even inputs are (half, full)
// pairs, odd inputs pre-halved.
#[$m]
$($s)* fn daala_fdst_iv_4_asym<T: TxOperations>(
  q0: (T, T), q1h: T, q2: (T, T), q3h: T, output: &mut [T],
) {
  // Stage 0
  //  9633/16384 = (Sin[7*Pi/16] + Cos[7*Pi/16])/2 = 0.5879378012096793
  //  12873/8192 = (Sin[7*Pi/16] - Cos[7*Pi/16])*2 = 1.5713899167742045
  // 12785/32768 = Cos[7*Pi/16]*2                  = 0.3901806440322565
  let (q0, q3) = RotateAddShift::half_kernel(
    q0,
    q3h,
    ((9633, 14), (12873, 13), (12785, 15)),
  );
  // 11363/16384 = (Sin[5*Pi/16] + Cos[5*Pi/16])/2 = 0.6935199226610738
  // 18081/32768 = (Sin[5*Pi/16] - Cos[5*Pi/16])*2 = 0.5517987585658861
  //  4551/4096 = Cos[5*Pi/16]*2                  = 1.1111404660392044
  let (q2, q1) = RotateSubShift::half_kernel(
    q2,
    q1h,
    ((11363, 14), (18081, 15), (4551, 12)),
  );

  // Stage 1
  let (q2, q3) = butterfly_sub_asym((q2.rshift1(), q2), q3);
  let (q0, q1) = butterfly_sub_asym((q0.rshift1(), q0), q1);

  // Stage 2
  // 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951
  // 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951
  let (q2, q1) = RotatePi4AddAvg::kernel(q2, q1, ((11585, 13), (11585, 13)));

  store_coeffs!(output, q0, q1, q2, q3);
}
483 
// 8-point Type-II DCT built from asymmetric 4-point sub-transforms.
// The DST half is written to output[4..8] and then reversed in place.
#[$m]
$($s)* fn daala_fdct_ii_8<T: TxOperations>(
  r0: T, r1: T, r2: T, r3: T, r4: T, r5: T, r6: T, r7: T, output: &mut [T],
) {
  // +/- Butterflies with asymmetric output.
  let (r0h, r7) = butterfly_neg(r0, r7);
  let (r1, r6h) = butterfly_add(r1, r6);
  let (r2h, r5) = butterfly_neg(r2, r5);
  let (r3, r4h) = butterfly_add(r3, r4);

  // Embedded 4-point transforms with asymmetric input.
  daala_fdct_ii_4_asym(r0h, r1, r2h, r3, &mut output[0..4]);
  daala_fdst_iv_4_asym(r7, r6h, r5, r4h, &mut output[4..8]);
  output[4..8].reverse();
}

// In-place 8-point forward DCT; permutes the intermediate result into the
// final coefficient order (bit-reversed indices).
#[$m]
$($s)* fn daala_fdct8<T: TxOperations>(coeffs: &mut [T]) {
  assert!(coeffs.len() >= 8);
  let mut temp_out: [T; 8] = [T::zero(); 8];
  daala_fdct_ii_8(
    coeffs[0],
    coeffs[1],
    coeffs[2],
    coeffs[3],
    coeffs[4],
    coeffs[5],
    coeffs[6],
    coeffs[7],
    &mut temp_out,
  );

  coeffs[0] = temp_out[0];
  coeffs[1] = temp_out[4];
  coeffs[2] = temp_out[2];
  coeffs[3] = temp_out[6];
  coeffs[4] = temp_out[1];
  coeffs[5] = temp_out[5];
  coeffs[6] = temp_out[3];
  coeffs[7] = temp_out[7];
}
525 
// 8-point Type-IV DST (orthonormal, symmetric input), four-stage network.
#[$m]
$($s)* fn daala_fdst_iv_8<T: TxOperations>(
  r0: T, r1: T, r2: T, r3: T, r4: T, r5: T, r6: T, r7: T, output: &mut [T],
) {
  // Stage 0
  // 17911/16384 = Sin[15*Pi/32] + Cos[15*Pi/32] = 1.0932018670017576
  // 14699/16384 = Sin[15*Pi/32] - Cos[15*Pi/32] = 0.8971675863426363
  //    803/8192 = Cos[15*Pi/32]                 = 0.0980171403295606
  let (r0, r7) =
    RotateAdd::kernel(r0, r7, ((17911, 14), (14699, 14), (803, 13)));
  // 20435/16384 = Sin[13*Pi/32] + Cos[13*Pi/32] = 1.24722501298667123
  // 21845/32768 = Sin[13*Pi/32] - Cos[13*Pi/32] = 0.66665565847774650
  //   1189/4096 = Cos[13*Pi/32]                 = 0.29028467725446233
  let (r6, r1) =
    RotateSub::kernel(r6, r1, ((20435, 14), (21845, 15), (1189, 12)));
  // 22173/16384 = Sin[11*Pi/32] + Cos[11*Pi/32] = 1.3533180011743526
  //   3363/8192 = Sin[11*Pi/32] - Cos[11*Pi/32] = 0.4105245275223574
  // 15447/32768 = Cos[11*Pi/32]                 = 0.47139673682599764
  let (r2, r5) =
    RotateAdd::kernel(r2, r5, ((22173, 14), (3363, 13), (15447, 15)));
  // 23059/16384 = Sin[9*Pi/32] + Cos[9*Pi/32] = 1.4074037375263826
  //  2271/16384 = Sin[9*Pi/32] - Cos[9*Pi/32] = 0.1386171691990915
  //   5197/8192 = Cos[9*Pi/32]                = 0.6343932841636455
  let (r4, r3) =
    RotateSub::kernel(r4, r3, ((23059, 14), (2271, 14), (5197, 13)));

  // Stage 1
  let (r0, r3h) = butterfly_add(r0, r3);
  let (r2, r1h) = butterfly_sub(r2, r1);
  let (r5, r6h) = butterfly_add(r5, r6);
  let (r7, r4h) = butterfly_sub(r7, r4);

  // Stage 2
  let (r7, r6) = butterfly_add_asym(r7, r6h);
  let (r5, r3) = butterfly_add_asym(r5, r3h);
  let (r2, r4) = butterfly_add_asym(r2, r4h);
  let (r0, r1) = butterfly_sub_asym(r0, r1h);

  // Stage 3
  // 10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766
  // 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969
  //  3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796
  let (r3, r4) =
    RotateSubAvg::kernel(r3, r4, ((10703, 13), (8867, 14), (3135, 12)));
  // 10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766
  // 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969
  //  3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796
  let (r2, r5) =
    RotateNegAvg::kernel(r2, r5, ((10703, 13), (8867, 14), (3135, 12)));
  // 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951
  // 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951
  let (r1, r6) = RotatePi4SubAvg::kernel(r1, r6, ((11585, 13), (11585, 13)));

  store_coeffs!(output, r0, r1, r2, r3, r4, r5, r6, r7);
}
581 
// In-place 8-point forward DST; permutes the intermediate result into the
// final coefficient order (same permutation as daala_fdct8).
#[$m]
$($s)* fn daala_fdst8<T: TxOperations>(coeffs: &mut [T]) {
  assert!(coeffs.len() >= 8);
  let mut temp_out: [T; 8] = [T::zero(); 8];
  daala_fdst_iv_8(
    coeffs[0],
    coeffs[1],
    coeffs[2],
    coeffs[3],
    coeffs[4],
    coeffs[5],
    coeffs[6],
    coeffs[7],
    &mut temp_out,
  );

  coeffs[0] = temp_out[0];
  coeffs[1] = temp_out[4];
  coeffs[2] = temp_out[2];
  coeffs[3] = temp_out[6];
  coeffs[4] = temp_out[1];
  coeffs[5] = temp_out[5];
  coeffs[6] = temp_out[3];
  coeffs[7] = temp_out[7];
}
607 
// 4-point Type-IV DST (orthonormal, symmetric input).
// NOTE(review): the third constant comment previously read 9041/32768, but
// the code uses (565, 11) = 565/2048; comment corrected to match the code.
#[$m]
$($s)* fn daala_fdst_iv_4<T: TxOperations>(
  q0: T, q1: T, q2: T, q3: T, output: &mut [T],
) {
  // Stage 0
  // 13623/16384 = (Sin[7*Pi/16] + Cos[7*Pi/16])/Sqrt[2] = 0.831469612302545
  //   4551/4096 = (Sin[7*Pi/16] - Cos[7*Pi/16])*Sqrt[2] = 1.111140466039204
  //    565/2048 = Cos[7*Pi/16]*Sqrt[2]                  = 0.275899379282943
  let (q0, q3) =
    RotateAddShift::kernel(q0, q3, ((13623, 14), (4551, 12), (565, 11)));
  // 16069/16384 = (Sin[5*Pi/16] + Cos[5*Pi/16])/Sqrt[2] = 0.9807852804032304
  // 12785/32768 = (Sin[5*Pi/16] - Cos[5*Pi/16])*Sqrt[2] = 0.3901806440322566
  //   1609/2048 = Cos[5*Pi/16]*Sqrt[2]                  = 0.7856949583871021
  let (q2, q1) =
    RotateSubShift::kernel(q2, q1, ((16069, 14), (12785, 15), (1609, 11)));

  // Stage 1
  let (q2, q3) = butterfly_sub_asym((q2.rshift1(), q2), q3);
  let (q0, q1) = butterfly_sub_asym((q0.rshift1(), q0), q1);

  // Stage 2
  // 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951
  // 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951
  let (q2, q1) = RotatePi4AddAvg::kernel(q2, q1, ((11585, 13), (11585, 13)));

  store_coeffs!(output, q0, q1, q2, q3);
}
635 
636 
// 8-point Type-II DCT with asymmetric input: even inputs pre-halved, odd
// inputs as (half, full) pairs. DST half written to output[4..8], reversed.
#[$m]
$($s)* fn daala_fdct_ii_8_asym<T: TxOperations>(
  r0h: T, r1: (T, T), r2h: T, r3: (T, T), r4h: T, r5: (T, T), r6h: T,
  r7: (T, T), output: &mut [T],
) {
  // +/- Butterflies with asymmetric input.
  let (r0, r7) = butterfly_neg_asym(r0h, r7);
  let (r1, r6) = butterfly_sub_asym(r1, r6h);
  let (r2, r5) = butterfly_neg_asym(r2h, r5);
  let (r3, r4) = butterfly_sub_asym(r3, r4h);

  // Embedded 4-point orthonormal transforms.
  daala_fdct_ii_4(r0, r1, r2, r3, &mut output[0..4]);
  daala_fdst_iv_4(r7, r6, r5, r4, &mut output[4..8]);
  output[4..8].reverse();
}
653 
// 8-point Type-IV DST with asymmetric input: even inputs as (half, full)
// pairs, odd inputs pre-halved.
// NOTE(review): stage-3 comments previously quoted 10703/8192 and
// 11585/8192 while the code uses (669, 9) = 669/512 and (5793, 12) =
// 5793/4096 (the same constants at a different precision); comments
// corrected to match the code.
#[$m]
$($s)* fn daala_fdst_iv_8_asym<T: TxOperations>(
  r0: (T, T), r1h: T, r2: (T, T), r3h: T, r4: (T, T), r5h: T, r6: (T, T),
  r7h: T, output: &mut [T],
) {
  // Stage 0
  // 12665/16384 = (Sin[15*Pi/32] + Cos[15*Pi/32])/Sqrt[2] = 0.77301045336274
  //   5197/4096 = (Sin[15*Pi/32] - Cos[15*Pi/32])*Sqrt[2] = 1.26878656832729
  //  2271/16384 = Cos[15*Pi/32]*Sqrt[2]                   = 0.13861716919909
  let (r0, r7) =
    RotateAdd::half_kernel(r0, r7h, ((12665, 14), (5197, 12), (2271, 14)));
  // 14449/16384 = Sin[13*Pi/32] + Cos[13*Pi/32])/Sqrt[2] = 0.881921264348355
  // 30893/32768 = Sin[13*Pi/32] - Cos[13*Pi/32])*Sqrt[2] = 0.942793473651995
  //   3363/8192 = Cos[13*Pi/32]*Sqrt[2]                  = 0.410524527522357
  let (r6, r1) =
    RotateSub::half_kernel(r6, r1h, ((14449, 14), (30893, 15), (3363, 13)));
  // 15679/16384 = Sin[11*Pi/32] + Cos[11*Pi/32])/Sqrt[2] = 0.956940335732209
  //   1189/2048 = Sin[11*Pi/32] - Cos[11*Pi/32])*Sqrt[2] = 0.580569354508925
  //   5461/8192 = Cos[11*Pi/32]*Sqrt[2]                  = 0.666655658477747
  let (r2, r5) =
    RotateAdd::half_kernel(r2, r5h, ((15679, 14), (1189, 11), (5461, 13)));
  // 16305/16384 = (Sin[9*Pi/32] + Cos[9*Pi/32])/Sqrt[2] = 0.9951847266721969
  //    803/4096 = (Sin[9*Pi/32] - Cos[9*Pi/32])*Sqrt[2] = 0.1960342806591213
  // 14699/16384 = Cos[9*Pi/32]*Sqrt[2]                  = 0.8971675863426364
  let (r4, r3) =
    RotateSub::half_kernel(r4, r3h, ((16305, 14), (803, 12), (14699, 14)));

  // Stage 1
  let (r0, r3h) = butterfly_add(r0, r3);
  let (r2, r1h) = butterfly_sub(r2, r1);
  let (r5, r6h) = butterfly_add(r5, r6);
  let (r7, r4h) = butterfly_sub(r7, r4);

  // Stage 2
  let (r7, r6) = butterfly_add_asym(r7, r6h);
  let (r5, r3) = butterfly_add_asym(r5, r3h);
  let (r2, r4) = butterfly_add_asym(r2, r4h);
  let (r0, r1) = butterfly_sub_asym(r0, r1h);

  // Stage 3
  //    669/512 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766
  // 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969
  //  3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796
  let (r3, r4) =
    RotateSubAvg::kernel(r3, r4, ((669, 9), (8867, 14), (3135, 12)));
  //    669/512 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766
  // 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969
  //  3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796
  let (r2, r5) =
    RotateNegAvg::kernel(r2, r5, ((669, 9), (8867, 14), (3135, 12)));
  //  5793/4096 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951
  // 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951
  let (r1, r6) = RotatePi4SubAvg::kernel(r1, r6, ((5793, 12), (11585, 13)));

  store_coeffs!(output, r0, r1, r2, r3, r4, r5, r6, r7);
}
710 
// 16-point Type-II DCT built from asymmetric 8-point sub-transforms.
// The DST half is written to output[8..16] and then reversed in place.
#[$m]
$($s)* fn daala_fdct_ii_16<T: TxOperations>(
  s0: T, s1: T, s2: T, s3: T, s4: T, s5: T, s6: T, s7: T, s8: T, s9: T, sa: T,
  sb: T, sc: T, sd: T, se: T, sf: T, output: &mut [T],
) {
  // +/- Butterflies with asymmetric output.
  let (s0h, sf) = butterfly_neg(s0, sf);
  let (s1, seh) = butterfly_add(s1, se);
  let (s2h, sd) = butterfly_neg(s2, sd);
  let (s3, sch) = butterfly_add(s3, sc);
  let (s4h, sb) = butterfly_neg(s4, sb);
  let (s5, sah) = butterfly_add(s5, sa);
  let (s6h, s9) = butterfly_neg(s6, s9);
  let (s7, s8h) = butterfly_add(s7, s8);

  // Embedded 8-point transforms with asymmetric input.
  daala_fdct_ii_8_asym(s0h, s1, s2h, s3, s4h, s5, s6h, s7, &mut output[0..8]);
  daala_fdst_iv_8_asym(sf, seh, sd, sch, sb, sah, s9, s8h, &mut output[8..16]);
  output[8..16].reverse();
}
731 
// In-place 16-point forward DCT; permutes the intermediate result into
// the final coefficient order (bit-reversed indices).
#[$m]
$($s)* fn daala_fdct16<T: TxOperations>(coeffs: &mut [T]) {
  assert!(coeffs.len() >= 16);
  let mut temp_out: [T; 16] = [T::zero(); 16];
  daala_fdct_ii_16(
    coeffs[0],
    coeffs[1],
    coeffs[2],
    coeffs[3],
    coeffs[4],
    coeffs[5],
    coeffs[6],
    coeffs[7],
    coeffs[8],
    coeffs[9],
    coeffs[10],
    coeffs[11],
    coeffs[12],
    coeffs[13],
    coeffs[14],
    coeffs[15],
    &mut temp_out,
  );

  coeffs[0] = temp_out[0];
  coeffs[1] = temp_out[8];
  coeffs[2] = temp_out[4];
  coeffs[3] = temp_out[12];
  coeffs[4] = temp_out[2];
  coeffs[5] = temp_out[10];
  coeffs[6] = temp_out[6];
  coeffs[7] = temp_out[14];
  coeffs[8] = temp_out[1];
  coeffs[9] = temp_out[9];
  coeffs[10] = temp_out[5];
  coeffs[11] = temp_out[13];
  coeffs[12] = temp_out[3];
  coeffs[13] = temp_out[11];
  coeffs[14] = temp_out[7];
  coeffs[15] = temp_out[15];
}
773 
774 #[$m]
775 $($s)* fn daala_fdst_iv_16<T: TxOperations>(
776   s0: T, s1: T, s2: T, s3: T, s4: T, s5: T, s6: T, s7: T, s8: T, s9: T, sa: T,
777   sb: T, sc: T, sd: T, se: T, sf: T, output: &mut [T],
778 ) {
779   // Stage 0
780   // 24279/32768 = (Sin[31*Pi/64] + Cos[31*Pi/64])/Sqrt[2] = 0.74095112535496
781   //  11003/8192 = (Sin[31*Pi/64] - Cos[31*Pi/64])*Sqrt[2] = 1.34311790969404
782   //  1137/16384 = Cos[31*Pi/64]*Sqrt[2]                   = 0.06939217050794
783   let (s0, sf) =
784     RotateAddShift::kernel(s0, sf, ((24279, 15), (11003, 13), (1137, 14)));
785   // 1645/2048 = (Sin[29*Pi/64] + Cos[29*Pi/64])/Sqrt[2] = 0.8032075314806449
786   //   305/256 = (Sin[29*Pi/64] - Cos[29*Pi/64])*Sqrt[2] = 1.1913986089848667
787   //  425/2048 = Cos[29*Pi/64]*Sqrt[2]                   = 0.2075082269882116
788   let (se, s1) =
789     RotateSubShift::kernel(se, s1, ((1645, 11), (305, 8), (425, 11)));
790   // 14053/32768 = (Sin[27*Pi/64] + Cos[27*Pi/64])/Sqrt[2] = 0.85772861000027
791   //   8423/8192 = (Sin[27*Pi/64] - Cos[27*Pi/64])*Sqrt[2] = 1.02820548838644
792   //   2815/8192 = Cos[27*Pi/64]*Sqrt[2]                   = 0.34362586580705
793   let (s2, sd) =
794     RotateAddShift::kernel(s2, sd, ((14053, 14), (8423, 13), (2815, 13)));
795   // 14811/16384 = (Sin[25*Pi/64] + Cos[25*Pi/64])/Sqrt[2] = 0.90398929312344
796   //   7005/8192 = (Sin[25*Pi/64] - Cos[25*Pi/64])*Sqrt[2] = 0.85511018686056
797   //   3903/8192 = Cos[25*Pi/64]*Sqrt[2]                   = 0.47643419969316
798   let (sc, s3) =
799     RotateSubShift::kernel(sc, s3, ((14811, 14), (7005, 13), (3903, 13)));
800   // 30853/32768 = (Sin[23*Pi/64] + Cos[23*Pi/64])/Sqrt[2] = 0.94154406518302
801   // 11039/16384 = (Sin[23*Pi/64] - Cos[23*Pi/64])*Sqrt[2] = 0.67377970678444
802   //  9907/16384 = Cos[23*Pi/64]*Sqrt[2]                   = 0.60465421179080
803   let (s4, sb) =
804     RotateAddShift::kernel(s4, sb, ((30853, 15), (11039, 14), (9907, 14)));
805   // 15893/16384 = (Sin[21*Pi/64] + Cos[21*Pi/64])/Sqrt[2] = 0.97003125319454
806   //   3981/8192 = (Sin[21*Pi/64] - Cos[21*Pi/64])*Sqrt[2] = 0.89716758634264
807   //   1489/2048 = Cos[21*Pi/64]*Sqrt[2]                   = 0.72705107329128
808   let (sa, s5) =
809     RotateSubShift::kernel(sa, s5, ((15893, 14), (3981, 13), (1489, 11)));
810   // 32413/32768 = (Sin[19*Pi/64] + Cos[19*Pi/64])/Sqrt[2] = 0.98917650996478
811   //    601/2048 = (Sin[19*Pi/64] - Cos[19*Pi/64])*Sqrt[2] = 0.29346094891072
812   // 13803/16384 = Cos[19*Pi/64]*Sqrt[2]                   = 0.84244603550942
813   let (s6, s9) =
814     RotateAddShift::kernel(s6, s9, ((32413, 15), (601, 11), (13803, 14)));
815   // 32729/32768 = (Sin[17*Pi/64] + Cos[17*Pi/64])/Sqrt[2] = 0.99879545620517
816   //    201/2048 = (Sin[17*Pi/64] - Cos[17*Pi/64])*Sqrt[2] = 0.09813534865484
817   //   1945/2048 = Cos[17*Pi/64]*Sqrt[2]                   = 0.94972778187775
818   let (s8, s7) =
819     RotateSubShift::kernel(s8, s7, ((32729, 15), (201, 11), (1945, 11)));
820 
821   // Stage 1
822   let (s0, s7) = butterfly_sub_asym((s0.rshift1(), s0), s7);
823   let (s8, sf) = butterfly_sub_asym((s8.rshift1(), s8), sf);
824   let (s4, s3) = butterfly_add_asym((s4.rshift1(), s4), s3);
825   let (sc, sb) = butterfly_add_asym((sc.rshift1(), sc), sb);
826   let (s2, s5) = butterfly_sub_asym((s2.rshift1(), s2), s5);
827   let (sa, sd) = butterfly_sub_asym((sa.rshift1(), sa), sd);
828   let (s6, s1) = butterfly_add_asym((s6.rshift1(), s6), s1);
829   let (se, s9) = butterfly_add_asym((se.rshift1(), se), s9);
830 
831   // Stage 2
832   let ((_s8h, s8), s4h) = butterfly_add(s8, s4);
833   let ((_s7h, s7), sbh) = butterfly_add(s7, sb);
834   let ((_sah, sa), s6h) = butterfly_sub(sa, s6);
835   let ((_s5h, s5), s9h) = butterfly_sub(s5, s9);
836   let (s0, s3h) = butterfly_add(s0, s3);
837   let (sd, seh) = butterfly_add(sd, se);
838   let (s2, s1h) = butterfly_sub(s2, s1);
839   let (sf, sch) = butterfly_sub(sf, sc);
840 
841   // Stage 3
842   //     301/256 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586
843   //   1609/2048 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022
844   // 12785/32768 = 2*Cos[7*Pi/16]              = 0.3901806440322565
845   let (s8, s7) =
846     RotateAddAvg::kernel(s8, s7, ((301, 8), (1609, 11), (12785, 15)));
847   // 11363/8192 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475
848   // 9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431
849   //  4551/8192 = Cos[5*Pi/16]                = 0.5555702330196022
850   let (s9, s6) =
851     RotateAdd::kernel(s9h, s6h, ((11363, 13), (9041, 15), (4551, 13)));
852   //  5681/4096 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475
853   // 9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431
854   //  4551/4096 = 2*Cos[5*Pi/16]              = 1.1111404660392044
855   let (s5, sa) =
856     RotateNegAvg::kernel(s5, sa, ((5681, 12), (9041, 15), (4551, 12)));
857   //   9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586
858   // 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022
859   //  6393/32768 = Cos[7*Pi/16]                = 0.1950903220161283
860   let (s4, sb) =
861     RotateNeg::kernel(s4h, sbh, ((9633, 13), (12873, 14), (6393, 15)));
862 
863   // Stage 4
864   let (s2, sc) = butterfly_add_asym(s2, sch);
865   let (s0, s1) = butterfly_sub_asym(s0, s1h);
866   let (sf, se) = butterfly_add_asym(sf, seh);
867   let (sd, s3) = butterfly_add_asym(sd, s3h);
868   let (s7, s6) = butterfly_add_asym((s7.rshift1(), s7), s6);
869   let (s8, s9) = butterfly_sub_asym((s8.rshift1(), s8), s9);
870   let (sa, sb) = butterfly_sub_asym((sa.rshift1(), sa), sb);
871   let (s5, s4) = butterfly_add_asym((s5.rshift1(), s5), s4);
872 
873   // Stage 5
874   //    669/512 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766
875   // 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969
  //  3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796
877   let (sc, s3) =
878     RotateAddAvg::kernel(sc, s3, ((669, 9), (8867, 14), (3135, 12)));
  //    669/512 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766
880   // 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969
881   //  3135/4096 = 2*Cos[3*Pi/8]             = 0.7653668647301796
882   let (s2, sd) =
883     RotateNegAvg::kernel(s2, sd, ((669, 9), (8867, 14), (3135, 12)));
884   //  5793/4096 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951
885   // 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951
886   let (sa, s5) = RotatePi4AddAvg::kernel(sa, s5, ((5793, 12), (11585, 13)));
887   //  5793/4096 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951
888   // 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951
889   let (s6, s9) = RotatePi4AddAvg::kernel(s6, s9, ((5793, 12), (11585, 13)));
890   //  5793/4096 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951
891   // 11585/8192 = 2*Cos[Pi/4]           = 1.4142135623730951
892   let (se, s1) = RotatePi4AddAvg::kernel(se, s1, ((5793, 12), (11585, 13)));
893 
894   store_coeffs!(
895     output, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sa, sb, sc, sd, se, sf
896   );
897 }
898 
899 #[$m]
900 $($s)* fn daala_fdst16<T: TxOperations>(coeffs: &mut [T]) {
901   assert!(coeffs.len() >= 16);
902   let mut temp_out: [T; 16] = [T::zero(); 16];
903   daala_fdst_iv_16(
904     coeffs[0],
905     coeffs[1],
906     coeffs[2],
907     coeffs[3],
908     coeffs[4],
909     coeffs[5],
910     coeffs[6],
911     coeffs[7],
912     coeffs[8],
913     coeffs[9],
914     coeffs[10],
915     coeffs[11],
916     coeffs[12],
917     coeffs[13],
918     coeffs[14],
919     coeffs[15],
920     &mut temp_out,
921   );
922 
923   coeffs[0] = temp_out[0];
924   coeffs[1] = temp_out[8];
925   coeffs[2] = temp_out[4];
926   coeffs[3] = temp_out[12];
927   coeffs[4] = temp_out[2];
928   coeffs[5] = temp_out[10];
929   coeffs[6] = temp_out[6];
930   coeffs[7] = temp_out[14];
931   coeffs[8] = temp_out[1];
932   coeffs[9] = temp_out[9];
933   coeffs[10] = temp_out[5];
934   coeffs[11] = temp_out[13];
935   coeffs[12] = temp_out[3];
936   coeffs[13] = temp_out[11];
937   coeffs[14] = temp_out[7];
938   coeffs[15] = temp_out[15];
939 }
940 
#[$m]
$($s)* fn daala_fdct_ii_16_asym<T: TxOperations>(
  s0h: T, s1: (T, T), s2h: T, s3: (T, T), s4h: T, s5: (T, T), s6h: T,
  s7: (T, T), s8h: T, s9: (T, T), sah: T, sb: (T, T), sch: T, sd: (T, T),
  seh: T, sf: (T, T), output: &mut [T],
) {
  // 16-point Type-II forward DCT with asymmetrically scaled inputs, writing
  // the 16 results to `output[0..16]`.
  //
  // NOTE(review): the `*h` arguments appear to carry pre-halved values and
  // the tuple arguments (halved, full) pairs — consistent with the
  // `(x.rshift1(), x)` call sites elsewhere in this file; confirm against
  // the daala_tx design notes.

  // +/- Butterflies with asymmetric input.
  let (s0, sf) = butterfly_neg_asym(s0h, sf);
  let (s1, se) = butterfly_sub_asym(s1, seh);
  let (s2, sd) = butterfly_neg_asym(s2h, sd);
  let (s3, sc) = butterfly_sub_asym(s3, sch);
  let (s4, sb) = butterfly_neg_asym(s4h, sb);
  let (s5, sa) = butterfly_sub_asym(s5, sah);
  let (s6, s9) = butterfly_neg_asym(s6h, s9);
  let (s7, s8) = butterfly_sub_asym(s7, s8h);

  // Embedded 8-point orthonormal transforms: the even half of the output is
  // a DCT-II, the odd half a DST-IV produced in reverse order, hence the
  // final flip of output[8..16].
  daala_fdct_ii_8(s0, s1, s2, s3, s4, s5, s6, s7, &mut output[0..8]);
  daala_fdst_iv_8(sf, se, sd, sc, sb, sa, s9, s8, &mut output[8..16]);
  output[8..16].reverse();
}
962 
#[$m]
$($s)* fn daala_fdst_iv_16_asym<T: TxOperations>(
  s0: (T, T), s1h: T, s2: (T, T), s3h: T, s4: (T, T), s5h: T, s6: (T, T),
  s7h: T, s8: (T, T), s9h: T, sa: (T, T), sbh: T, sc: (T, T), sdh: T,
  se: (T, T), sfh: T, output: &mut [T],
) {
  // 16-point Type-IV forward DST with asymmetrically scaled inputs, writing
  // the 16 results to `output[0..16]`. Each rotation comment lists the
  // fixed-point constants as numerator/2^shift alongside the exact
  // trigonometric value they approximate.
  //
  // NOTE(review): the tuple arguments appear to carry (halved, full) pairs
  // and the `*h` arguments pre-halved values — consistent with the
  // `(x.rshift1(), x)` call sites elsewhere in this file; confirm against
  // the daala_tx design notes.

  // Stage 0
  //   1073/2048 = (Sin[31*Pi/64] + Cos[31*Pi/64])/2 = 0.5239315652662953
  // 62241/32768 = (Sin[31*Pi/64] - Cos[31*Pi/64])*2 = 1.8994555637555088
  //    201/2048 = Cos[31*Pi/64]*2                   = 0.0981353486548360
  let (s0, sf) =
    RotateAddShift::half_kernel(s0, sfh, ((1073, 11), (62241, 15), (201, 11)));
  // 18611/32768 = (Sin[29*Pi/64] + Cos[29*Pi/64])/2 = 0.5679534922100714
  // 55211/32768 = (Sin[29*Pi/64] - Cos[29*Pi/64])*2 = 1.6848920710188384
  //    601/2048 = Cos[29*Pi/64]*2                   = 0.2934609489107235
  let (se, s1) = RotateSubShift::half_kernel(
    se,
    s1h,
    ((18611, 15), (55211, 15), (601, 11)),
  );
  //  9937/16384 = (Sin[27*Pi/64] + Cos[27*Pi/64])/2 = 0.6065057165489039
  //   1489/1024 = (Sin[27*Pi/64] - Cos[27*Pi/64])*2 = 1.4541021465825602
  //   3981/8192 = Cos[27*Pi/64]*2                   = 0.4859603598065277
  let (s2, sd) =
    RotateAddShift::half_kernel(s2, sdh, ((9937, 14), (1489, 10), (3981, 13)));
  // 10473/16384 = (Sin[25*Pi/64] + Cos[25*Pi/64])/2 = 0.6392169592876205
  // 39627/32768 = (Sin[25*Pi/64] - Cos[25*Pi/64])*2 = 1.2093084235816014
  // 11039/16384 = Cos[25*Pi/64]*2                   = 0.6737797067844401
  let (sc, s3) = RotateSubShift::half_kernel(
    sc,
    s3h,
    ((10473, 14), (39627, 15), (11039, 14)),
  );
  // 2727/4096 = (Sin[23*Pi/64] + Cos[23*Pi/64])/2 = 0.6657721932768628
  // 3903/4096 = (Sin[23*Pi/64] - Cos[23*Pi/64])*2 = 0.9528683993863225
  // 7005/8192 = Cos[23*Pi/64]*2                   = 0.8551101868605642
  let (s4, sb) =
    RotateAddShift::half_kernel(s4, sbh, ((2727, 12), (3903, 12), (7005, 13)));
  // 5619/8192 = (Sin[21*Pi/64] + Cos[21*Pi/64])/2 = 0.6859156770967569
  // 2815/4096 = (Sin[21*Pi/64] - Cos[21*Pi/64])*2 = 0.6872517316141069
  // 8423/8192 = Cos[21*Pi/64]*2                   = 1.0282054883864433
  let (sa, s5) =
    RotateSubShift::half_kernel(sa, s5h, ((5619, 13), (2815, 12), (8423, 13)));
  //   2865/4096 = (Sin[19*Pi/64] + Cos[19*Pi/64])/2 = 0.6994534179865391
  // 13599/32768 = (Sin[19*Pi/64] - Cos[19*Pi/64])*2 = 0.4150164539764232
  //     305/256 = Cos[19*Pi/64]*2                   = 1.1913986089848667
  let (s6, s9) =
    RotateAddShift::half_kernel(s6, s9h, ((2865, 12), (13599, 15), (305, 8)));
  // 23143/32768 = (Sin[17*Pi/64] + Cos[17*Pi/64])/2 = 0.7062550401009887
  //   1137/8192 = (Sin[17*Pi/64] - Cos[17*Pi/64])*2 = 0.1387843410158816
  //  11003/8192 = Cos[17*Pi/64]*2                   = 1.3431179096940367
  let (s8, s7) = RotateSubShift::half_kernel(
    s8,
    s7h,
    ((23143, 15), (1137, 13), (11003, 13)),
  );

  // Stage 1
  let (s0, s7) = butterfly_sub_asym((s0.rshift1(), s0), s7);
  let (s8, sf) = butterfly_sub_asym((s8.rshift1(), s8), sf);
  let (s4, s3) = butterfly_add_asym((s4.rshift1(), s4), s3);
  let (sc, sb) = butterfly_add_asym((sc.rshift1(), sc), sb);
  let (s2, s5) = butterfly_sub_asym((s2.rshift1(), s2), s5);
  let (sa, sd) = butterfly_sub_asym((sa.rshift1(), sa), sd);
  let (s6, s1) = butterfly_add_asym((s6.rshift1(), s6), s1);
  let (se, s9) = butterfly_add_asym((se.rshift1(), se), s9);

  // Stage 2
  let ((_s8h, s8), s4h) = butterfly_add(s8, s4);
  let ((_s7h, s7), sbh) = butterfly_add(s7, sb);
  let ((_sah, sa), s6h) = butterfly_sub(sa, s6);
  let ((_s5h, s5), s9h) = butterfly_sub(s5, s9);
  let (s0, s3h) = butterfly_add(s0, s3);
  let (sd, seh) = butterfly_add(sd, se);
  let (s2, s1h) = butterfly_sub(s2, s1);
  let (sf, sch) = butterfly_sub(sf, sc);

  // Stage 3
  //   9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586
  // 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022
  //  6393/32768 = Cos[7*Pi/16]                = 0.1950903220161283
  let (s8, s7) =
    RotateAdd::kernel(s8, s7, ((9633, 13), (12873, 14), (6393, 15)));
  // 22725/16384 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475
  //  9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431
  //   4551/8192 = Cos[5*Pi/16]                = 0.5555702330196022
  let (s9, s6) =
    RotateAdd::kernel(s9h, s6h, ((22725, 14), (9041, 15), (4551, 13)));
  //  11363/8192 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475
  //  9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431
  //   4551/8192 = Cos[5*Pi/16]                = 0.5555702330196022
  let (s5, sa) =
    RotateNeg::kernel(s5, sa, ((11363, 13), (9041, 15), (4551, 13)));
  //   9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586
  // 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022
  //  6393/32768 = Cos[7*Pi/16]                = 0.1950903220161283
  let (s4, sb) =
    RotateNeg::kernel(s4h, sbh, ((9633, 13), (12873, 14), (6393, 15)));

  // Stage 4
  let (s2, sc) = butterfly_add_asym(s2, sch);
  let (s0, s1) = butterfly_sub_asym(s0, s1h);
  let (sf, se) = butterfly_add_asym(sf, seh);
  let (sd, s3) = butterfly_add_asym(sd, s3h);
  let (s7, s6) = butterfly_add_asym((s7.rshift1(), s7), s6);
  let (s8, s9) = butterfly_sub_asym((s8.rshift1(), s8), s9);
  let (sa, sb) = butterfly_sub_asym((sa.rshift1(), sa), sb);
  let (s5, s4) = butterfly_add_asym((s5.rshift1(), s5), s4);

  // Stage 5
  // 10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766
  // 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969
  //  3135/8192 = Cos[3*Pi/8]               = 0.3826834323650898
  let (sc, s3) =
    RotateAdd::kernel(sc, s3, ((10703, 13), (8867, 14), (3135, 13)));
  // 10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766
  // 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969
  //  3135/8192 = Cos[3*Pi/8]               = 0.3826834323650898
  let (s2, sd) =
    RotateNeg::kernel(s2, sd, ((10703, 13), (8867, 14), (3135, 13)));
  // 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951
  //  5793/8192 = Cos[Pi/4]             = 0.7071067811865475
  let (sa, s5) = RotatePi4Add::kernel(sa, s5, ((11585, 13), (5793, 13)));
  // 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951
  //  5793/8192 = Cos[Pi/4]             = 0.7071067811865475
  let (s6, s9) = RotatePi4Add::kernel(s6, s9, ((11585, 13), (5793, 13)));
  // 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951
  //  5793/8192 = Cos[Pi/4]             = 0.7071067811865475
  let (se, s1) = RotatePi4Add::kernel(se, s1, ((11585, 13), (5793, 13)));

  store_coeffs!(
    output, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sa, sb, sc, sd, se, sf
  );
}
1097 
#[$m]
$($s)* fn daala_fdct_ii_32<T: TxOperations>(
  t0: T, t1: T, t2: T, t3: T, t4: T, t5: T, t6: T, t7: T, t8: T, t9: T, ta: T,
  tb: T, tc: T, td: T, te: T, tf: T, tg: T, th: T, ti: T, tj: T, tk: T, tl: T,
  tm: T, tn: T, to: T, tp: T, tq: T, tr: T, ts: T, tt: T, tu: T, tv: T,
  output: &mut [T],
) {
  // 32-point Type-II forward DCT on orthonormally scaled inputs, writing
  // the 32 results to `output[0..32]`. The initial butterfly stage produces
  // asymmetric outputs (the `*h` values) which are consumed by the embedded
  // 16-point asymmetric transforms below.

  // +/- Butterflies with asymmetric output.
  let (t0h, tv) = butterfly_neg(t0, tv);
  let (t1, tuh) = butterfly_add(t1, tu);
  let (t2h, tt) = butterfly_neg(t2, tt);
  let (t3, tsh) = butterfly_add(t3, ts);
  let (t4h, tr) = butterfly_neg(t4, tr);
  let (t5, tqh) = butterfly_add(t5, tq);
  let (t6h, tp) = butterfly_neg(t6, tp);
  let (t7, toh) = butterfly_add(t7, to);
  let (t8h, tn) = butterfly_neg(t8, tn);
  let (t9, tmh) = butterfly_add(t9, tm);
  let (tah, tl) = butterfly_neg(ta, tl);
  let (tb, tkh) = butterfly_add(tb, tk);
  let (tch, tj) = butterfly_neg(tc, tj);
  let (td, tih) = butterfly_add(td, ti);
  let (teh, th) = butterfly_neg(te, th);
  let (tf, tgh) = butterfly_add(tf, tg);

  // Embedded 16-point transforms with asymmetric input.
  // Even half: DCT-II; odd half: DST-IV, produced in reverse order and
  // flipped into place at the end.
  daala_fdct_ii_16_asym(
    t0h,
    t1,
    t2h,
    t3,
    t4h,
    t5,
    t6h,
    t7,
    t8h,
    t9,
    tah,
    tb,
    tch,
    td,
    teh,
    tf,
    &mut output[0..16],
  );
  daala_fdst_iv_16_asym(
    tv,
    tuh,
    tt,
    tsh,
    tr,
    tqh,
    tp,
    toh,
    tn,
    tmh,
    tl,
    tkh,
    tj,
    tih,
    th,
    tgh,
    &mut output[16..32],
  );
  output[16..32].reverse();
}
1164 
1165 #[$m]
1166 $($s)* fn daala_fdct32<T: TxOperations>(coeffs: &mut [T]) {
1167   assert!(coeffs.len() >= 32);
1168   let mut temp_out: [T; 32] = [T::zero(); 32];
1169   daala_fdct_ii_32(
1170     coeffs[0],
1171     coeffs[1],
1172     coeffs[2],
1173     coeffs[3],
1174     coeffs[4],
1175     coeffs[5],
1176     coeffs[6],
1177     coeffs[7],
1178     coeffs[8],
1179     coeffs[9],
1180     coeffs[10],
1181     coeffs[11],
1182     coeffs[12],
1183     coeffs[13],
1184     coeffs[14],
1185     coeffs[15],
1186     coeffs[16],
1187     coeffs[17],
1188     coeffs[18],
1189     coeffs[19],
1190     coeffs[20],
1191     coeffs[21],
1192     coeffs[22],
1193     coeffs[23],
1194     coeffs[24],
1195     coeffs[25],
1196     coeffs[26],
1197     coeffs[27],
1198     coeffs[28],
1199     coeffs[29],
1200     coeffs[30],
1201     coeffs[31],
1202     &mut temp_out,
1203   );
1204 
1205   coeffs[0] = temp_out[0];
1206   coeffs[1] = temp_out[16];
1207   coeffs[2] = temp_out[8];
1208   coeffs[3] = temp_out[24];
1209   coeffs[4] = temp_out[4];
1210   coeffs[5] = temp_out[20];
1211   coeffs[6] = temp_out[12];
1212   coeffs[7] = temp_out[28];
1213   coeffs[8] = temp_out[2];
1214   coeffs[9] = temp_out[18];
1215   coeffs[10] = temp_out[10];
1216   coeffs[11] = temp_out[26];
1217   coeffs[12] = temp_out[6];
1218   coeffs[13] = temp_out[22];
1219   coeffs[14] = temp_out[14];
1220   coeffs[15] = temp_out[30];
1221   coeffs[16] = temp_out[1];
1222   coeffs[17] = temp_out[17];
1223   coeffs[18] = temp_out[9];
1224   coeffs[19] = temp_out[25];
1225   coeffs[20] = temp_out[5];
1226   coeffs[21] = temp_out[21];
1227   coeffs[22] = temp_out[13];
1228   coeffs[23] = temp_out[29];
1229   coeffs[24] = temp_out[3];
1230   coeffs[25] = temp_out[19];
1231   coeffs[26] = temp_out[11];
1232   coeffs[27] = temp_out[27];
1233   coeffs[28] = temp_out[7];
1234   coeffs[29] = temp_out[23];
1235   coeffs[30] = temp_out[15];
1236   coeffs[31] = temp_out[31];
1237 }
1238 
#[$m]
$($s)* fn daala_fdct_ii_32_asym<T: TxOperations>(
  t0h: T, t1: (T, T), t2h: T, t3: (T, T), t4h: T, t5: (T, T), t6h: T,
  t7: (T, T), t8h: T, t9: (T, T), tah: T, tb: (T, T), tch: T, td: (T, T),
  teh: T, tf: (T, T), tgh: T, th: (T, T), tih: T, tj: (T, T), tkh: T,
  tl: (T, T), tmh: T, tn: (T, T), toh: T, tp: (T, T), tqh: T, tr: (T, T),
  tsh: T, tt: (T, T), tuh: T, tv: (T, T), output: &mut [T],
) {
  // 32-point Type-II forward DCT with asymmetrically scaled inputs, writing
  // the 32 results to `output[0..32]`.
  //
  // NOTE(review): the `*h` arguments appear to carry pre-halved values and
  // the tuple arguments (halved, full) pairs — consistent with the
  // `(x.rshift1(), x)` call sites elsewhere in this file; confirm against
  // the daala_tx design notes.

  // +/- Butterflies with asymmetric input.
  let (t0, tv) = butterfly_neg_asym(t0h, tv);
  let (t1, tu) = butterfly_sub_asym(t1, tuh);
  let (t2, tt) = butterfly_neg_asym(t2h, tt);
  let (t3, ts) = butterfly_sub_asym(t3, tsh);
  let (t4, tr) = butterfly_neg_asym(t4h, tr);
  let (t5, tq) = butterfly_sub_asym(t5, tqh);
  let (t6, tp) = butterfly_neg_asym(t6h, tp);
  let (t7, to) = butterfly_sub_asym(t7, toh);
  let (t8, tn) = butterfly_neg_asym(t8h, tn);
  let (t9, tm) = butterfly_sub_asym(t9, tmh);
  let (ta, tl) = butterfly_neg_asym(tah, tl);
  let (tb, tk) = butterfly_sub_asym(tb, tkh);
  let (tc, tj) = butterfly_neg_asym(tch, tj);
  let (td, ti) = butterfly_sub_asym(td, tih);
  let (te, th) = butterfly_neg_asym(teh, th);
  let (tf, tg) = butterfly_sub_asym(tf, tgh);

  // Embedded 16-point orthonormal transforms.
  // Even half: DCT-II; odd half: DST-IV, produced in reverse order and
  // flipped into place at the end.
  daala_fdct_ii_16(
    t0,
    t1,
    t2,
    t3,
    t4,
    t5,
    t6,
    t7,
    t8,
    t9,
    ta,
    tb,
    tc,
    td,
    te,
    tf,
    &mut output[0..16],
  );
  daala_fdst_iv_16(
    tv,
    tu,
    tt,
    ts,
    tr,
    tq,
    tp,
    to,
    tn,
    tm,
    tl,
    tk,
    tj,
    ti,
    th,
    tg,
    &mut output[16..32],
  );
  output[16..32].reverse();
}
1306 
1307 #[$m]
1308 $($s)* fn daala_fdst_iv_32_asym<T: TxOperations>(
1309   t0: (T, T), t1h: T, t2: (T, T), t3h: T, t4: (T, T), t5h: T, t6: (T, T),
1310   t7h: T, t8: (T, T), t9h: T, ta: (T, T), tbh: T, tc: (T, T), tdh: T,
1311   te: (T, T), tfh: T, tg: (T, T), thh: T, ti: (T, T), tjh: T, tk: (T, T),
1312   tlh: T, tm: (T, T), tnh: T, to: (T, T), tph: T, tq: (T, T), trh: T,
1313   ts: (T, T), tth: T, tu: (T, T), tvh: T, output: &mut [T],
1314 ) {
1315   // Stage 0
1316   //   5933/8192 = (Sin[63*Pi/128] + Cos[63*Pi/128])/Sqrt[2] = 0.72424708295147
1317   // 22595/16384 = (Sin[63*Pi/128] - Cos[63*Pi/128])*Sqrt[2] = 1.37908108947413
1318   //  1137/32768 = Cos[63*Pi/128]*Sqrt[2]                    = 0.03470653821440
1319   let (t0, tv) =
1320     RotateAdd::half_kernel(t0, tvh, ((5933, 13), (22595, 14), (1137, 15)));
1321   //   6203/8192 = (Sin[61*Pi/128] + Cos[61*Pi/128])/Sqrt[2] = 0.75720884650648
1322   // 21403/16384 = (Sin[61*Pi/128] - Cos[61*Pi/128])*Sqrt[2] = 1.30634568590755
1323   //  3409/32768 = Cos[61*Pi/128]*Sqrt[2]                    = 0.10403600355271
1324   let (tu, t1) =
1325     RotateSub::half_kernel(tu, t1h, ((6203, 13), (21403, 14), (3409, 15)));
1326   // 25833/32768 = (Sin[59*Pi/128] + Cos[59*Pi/128])/Sqrt[2] = 0.78834642762661
1327   //     315/256 = (Sin[59*Pi/128] - Cos[59*Pi/128])*Sqrt[2] = 1.23046318116125
1328   //  5673/32768 = Cos[59*Pi/128]*Sqrt[2]                    = 0.17311483704598
1329   let (t2, tt) =
1330     RotateAdd::half_kernel(t2, tth, ((25833, 15), (315, 8), (5673, 15)));
1331   // 26791/32768 = (Sin[57*Pi/128] + Cos[57*Pi/128])/Sqrt[2] = 0.81758481315158
1332   //   4717/4096 = (Sin[57*Pi/128] - Cos[57*Pi/128])*Sqrt[2] = 1.15161638283569
1333   //  7923/32768 = Cos[57*Pi/128]*Sqrt[2]                    = 0.24177662173374
1334   let (ts, t3) =
1335     RotateSub::half_kernel(ts, t3h, ((26791, 15), (4717, 12), (7923, 15)));
1336   //   6921/8192 = (Sin[55*Pi/128] + Cos[55*Pi/128])/Sqrt[2] = 0.84485356524971
1337   // 17531/16384 = (Sin[55*Pi/128] - Cos[55*Pi/128])*Sqrt[2] = 1.06999523977419
1338   // 10153/32768 = Cos[55*Pi/128]*Sqrt[2]                    = 0.30985594536261
1339   let (t4, tr) =
1340     RotateAdd::half_kernel(t4, trh, ((6921, 13), (17531, 14), (10153, 15)));
1341   // 28511/32768 = (Sin[53*Pi/128] + Cos[53*Pi/128])/Sqrt[2] = 0.87008699110871
1342   // 32303/32768 = (Sin[53*Pi/128] - Cos[53*Pi/128])*Sqrt[2] = 0.98579638445957
1343   //   1545/4096 = Cos[53*Pi/128]*Sqrt[2]                    = 0.37718879887893
1344   let (tq, t5) =
1345     RotateSub::half_kernel(tq, t5h, ((28511, 15), (32303, 15), (1545, 12)));
1346   // 29269/32768 = (Sin[51*Pi/128] + Cos[51*Pi/128])/Sqrt[2] = 0.89322430119552
1347   // 14733/16384 = (Sin[51*Pi/128] - Cos[51*Pi/128])*Sqrt[2] = 0.89922265930921
1348   //   1817/4096 = Cos[51*Pi/128]*Sqrt[2]                    = 0.44361297154091
1349   let (t6, tp) =
1350     RotateAdd::half_kernel(t6, tph, ((29269, 15), (14733, 14), (1817, 12)));
1351   // 29957/32768 = (Sin[49*Pi/128] + Cos[49*Pi/128])/Sqrt[2] = 0.91420975570353
1352   // 13279/16384 = (Sin[49*Pi/128] - Cos[49*Pi/128])*Sqrt[2] = 0.81048262800998
1353   //  8339/16384 = Cos[49*Pi/128]*Sqrt[2]                    = 0.50896844169854
1354   let (to, t7) =
1355     RotateSub::half_kernel(to, t7h, ((29957, 15), (13279, 14), (8339, 14)));
1356   //   7643/8192 = (Sin[47*Pi/128] + Cos[47*Pi/128])/Sqrt[2] = 0.93299279883474
1357   // 11793/16384 = (Sin[47*Pi/128] - Cos[47*Pi/128])*Sqrt[2] = 0.71979007306998
1358   // 18779/32768 = Cos[47*Pi/128]*Sqrt[2]                    = 0.57309776229975
1359   let (t8, tn) =
1360     RotateAdd::half_kernel(t8, tnh, ((7643, 13), (11793, 14), (18779, 15)));
1361   // 15557/16384 = (Sin[45*Pi/128] + Cos[45*Pi/128])/Sqrt[2] = 0.94952818059304
1362   // 20557/32768 = (Sin[45*Pi/128] - Cos[45*Pi/128])*Sqrt[2] = 0.62736348079778
1363   // 20835/32768 = Cos[45*Pi/128]*Sqrt[2]                    = 0.63584644019415
1364   let (tm, t9) =
1365     RotateSub::half_kernel(tm, t9h, ((15557, 14), (20557, 15), (20835, 15)));
1366   // 31581/32768 = (Sin[43*Pi/128] + Cos[43*Pi/128])/Sqrt[2] = 0.96377606579544
1367   // 17479/32768 = (Sin[43*Pi/128] - Cos[43*Pi/128])*Sqrt[2] = 0.53342551494980
1368   // 22841/32768 = Cos[43*Pi/128]*Sqrt[2]                    = 0.69706330832054
1369   let (ta, tl) =
1370     RotateAdd::half_kernel(ta, tlh, ((31581, 15), (17479, 15), (22841, 15)));
1371   //   7993/8192 = (Sin[41*Pi/128] + Cos[41*Pi/128])/Sqrt[2] = 0.97570213003853
1372   // 14359/32768 = (Sin[41*Pi/128] - Cos[41*Pi/128])*Sqrt[2] = 0.43820248031374
1373   //   3099/4096 = Cos[41*Pi/128]*Sqrt[2]                    = 0.75660088988166
1374   let (tk, tb) =
1375     RotateSub::half_kernel(tk, tbh, ((7993, 13), (14359, 15), (3099, 12)));
1376   // 16143/16384 = (Sin[39*Pi/128] + Cos[39*Pi/128])/Sqrt[2] = 0.98527764238894
1377   //   2801/8192 = (Sin[39*Pi/128] - Cos[39*Pi/128])*Sqrt[2] = 0.34192377752060
1378   // 26683/32768 = Cos[39*Pi/128]*Sqrt[2]                    = 0.81431575362864
1379   let (tc, tj) =
1380     RotateAdd::half_kernel(tc, tjh, ((16143, 14), (2801, 13), (26683, 15)));
1381   // 16261/16384 = (Sin[37*Pi/128] + Cos[37*Pi/128])/Sqrt[2] = 0.99247953459871
1382   //  4011/16384 = (Sin[37*Pi/128] - Cos[37*Pi/128])*Sqrt[2] = 0.24482135039843
1383   // 14255/16384 = Cos[37*Pi/128]*Sqrt[2]                    = 0.87006885939949
1384   let (ti, td) =
1385     RotateSub::half_kernel(ti, tdh, ((16261, 14), (4011, 14), (14255, 14)));
1386   // 32679/32768 = (Sin[35*Pi/128] + Cos[35*Pi/128])/Sqrt[2] = 0.99729045667869
1387   //  4821/32768 = (Sin[35*Pi/128] - Cos[35*Pi/128])*Sqrt[2] = 0.14712912719933
1388   // 30269/32768 = Cos[35*Pi/128]*Sqrt[2]                    = 0.92372589307902
1389   let (te, th) =
1390     RotateAdd::half_kernel(te, thh, ((32679, 15), (4821, 15), (30269, 15)));
1391   // 16379/16384 = (Sin[33*Pi/128] + Cos[33*Pi/128])/Sqrt[2] = 0.99969881869620
1392   //    201/4096 = (Sin[33*Pi/128] - Cos[33*Pi/128])*Sqrt[2] = 0.04908245704582
1393   // 15977/16384 = Cos[33*Pi/128]*Sqrt[2]                    = 0.97515759017329
1394   let (tg, tf) =
1395     RotateSub::half_kernel(tg, tfh, ((16379, 14), (201, 12), (15977, 14)));
1396 
1397   // Stage 1
1398   let (t0, tfh) = butterfly_add(t0, tf);
1399   let (tv, tgh) = butterfly_sub(tv, tg);
1400   let (th, tuh) = butterfly_add(th, tu);
1401   let (te, t1h) = butterfly_sub(te, t1);
1402   let (t2, tdh) = butterfly_add(t2, td);
1403   let (tt, tih) = butterfly_sub(tt, ti);
1404   let (tj, tsh) = butterfly_add(tj, ts);
1405   let (tc, t3h) = butterfly_sub(tc, t3);
1406   let (t4, tbh) = butterfly_add(t4, tb);
1407   let (tr, tkh) = butterfly_sub(tr, tk);
1408   let (tl, tqh) = butterfly_add(tl, tq);
1409   let (ta, t5h) = butterfly_sub(ta, t5);
1410   let (t6, t9h) = butterfly_add(t6, t9);
1411   let (tp, tmh) = butterfly_sub(tp, tm);
1412   let (tn, toh) = butterfly_add(tn, to);
1413   let (t8, t7h) = butterfly_sub(t8, t7);
1414 
1415   // Stage 2
1416   let (t0, t7) = butterfly_sub_asym(t0, t7h);
1417   let (tv, to) = butterfly_add_asym(tv, toh);
1418   let (tp, tu) = butterfly_sub_asym(tp, tuh);
1419   let (t6, t1) = butterfly_add_asym(t6, t1h);
1420   let (t2, t5) = butterfly_sub_asym(t2, t5h);
1421   let (tt, tq) = butterfly_add_asym(tt, tqh);
1422   let (tr, ts) = butterfly_sub_asym(tr, tsh);
1423   let (t4, t3) = butterfly_add_asym(t4, t3h);
1424   let (t8, tg) = butterfly_add_asym(t8, tgh);
1425   let (te, tm) = butterfly_sub_asym(te, tmh);
1426   let (tn, tf) = butterfly_add_asym(tn, tfh);
1427   let (th, t9) = butterfly_sub_asym(th, t9h);
1428   let (ta, ti) = butterfly_add_asym(ta, tih);
1429   let (tc, tk) = butterfly_sub_asym(tc, tkh);
1430   let (tl, td) = butterfly_add_asym(tl, tdh);
1431   let (tj, tb) = butterfly_sub_asym(tj, tbh);
1432 
1433   // Stage 3
1434   // 17911/16384 = Sin[15*Pi/32] + Cos[15*Pi/32] = 1.0932018670017576
1435   // 14699/16384 = Sin[15*Pi/32] - Cos[15*Pi/32] = 0.8971675863426363
1436   //    803/8192 = Cos[15*Pi/32]                 = 0.0980171403295606
1437   let (tf, tg) =
1438     RotateSub::kernel(tf, tg, ((17911, 14), (14699, 14), (803, 13)));
1439   //  10217/8192 = Sin[13*Pi/32] + Cos[13*Pi/32] = 1.2472250129866712
1440   //   5461/8192 = Sin[13*Pi/32] - Cos[13*Pi/32] = 0.6666556584777465
1441   //   1189/4096 = Cos[13*Pi/32]                 = 0.2902846772544623
1442   let (th, te) =
1443     RotateAdd::kernel(th, te, ((10217, 13), (5461, 13), (1189, 12)));
1444   //   5543/4096 = Sin[11*Pi/32] + Cos[11*Pi/32] = 1.3533180011743526
1445   //   3363/8192 = Sin[11*Pi/32] - Cos[11*Pi/32] = 0.4105245275223574
1446   //  7723/16384 = Cos[11*Pi/32]                 = 0.4713967368259976
1447   let (ti, td) =
1448     RotateAdd::kernel(ti, td, ((5543, 12), (3363, 13), (7723, 14)));
1449   //  11529/8192 = Sin[9*Pi/32] + Cos[9*Pi/32] = 1.4074037375263826
1450   //  2271/16384 = Sin[9*Pi/32] - Cos[9*Pi/32] = 0.1386171691990915
1451   //   5197/8192 = Cos[9*Pi/32]                = 0.6343932841636455
1452   let (tc, tj) =
1453     RotateSub::kernel(tc, tj, ((11529, 13), (2271, 14), (5197, 13)));
1454   //  11529/8192 = Sin[9*Pi/32] + Cos[9*Pi/32] = 1.4074037375263826
1455   //  2271/16384 = Sin[9*Pi/32] - Cos[9*Pi/32] = 0.1386171691990915
1456   //   5197/8192 = Cos[9*Pi/32]                = 0.6343932841636455
1457   let (tb, tk) =
1458     RotateNeg::kernel(tb, tk, ((11529, 13), (2271, 14), (5197, 13)));
1459   //   5543/4096 = Sin[11*Pi/32] + Cos[11*Pi/32] = 1.3533180011743526
1460   //   3363/8192 = Sin[11*Pi/32] - Cos[11*Pi/32] = 0.4105245275223574
1461   //  7723/16384 = Cos[11*Pi/32]                 = 0.4713967368259976
1462   let (ta, tl) =
1463     RotateNeg::kernel(ta, tl, ((5543, 12), (3363, 13), (7723, 14)));
1464   //  10217/8192 = Sin[13*Pi/32] + Cos[13*Pi/32] = 1.2472250129866712
1465   //   5461/8192 = Sin[13*Pi/32] - Cos[13*Pi/32] = 0.6666556584777465
1466   //   1189/4096 = Cos[13*Pi/32]                 = 0.2902846772544623
1467   let (t9, tm) =
1468     RotateNeg::kernel(t9, tm, ((10217, 13), (5461, 13), (1189, 12)));
1469   // 17911/16384 = Sin[15*Pi/32] + Cos[15*Pi/32] = 1.0932018670017576
1470   // 14699/16384 = Sin[15*Pi/32] - Cos[15*Pi/32] = 0.8971675863426363
1471   //    803/8192 = Cos[15*Pi/32]                 = 0.0980171403295606
1472   let (t8, tn) =
1473     RotateNeg::kernel(t8, tn, ((17911, 14), (14699, 14), (803, 13)));
1474 
1475   // Stage 4
1476   let (t3, t0h) = butterfly_sub(t3, t0);
1477   let (ts, tvh) = butterfly_add(ts, tv);
1478   let (tu, tth) = butterfly_sub(tu, tt);
1479   let (t1, t2h) = butterfly_add(t1, t2);
1480   let ((_toh, to), t4h) = butterfly_add(to, t4);
1481   let ((_tqh, tq), t6h) = butterfly_sub(tq, t6);
1482   let ((_t7h, t7), trh) = butterfly_add(t7, tr);
1483   let ((_t5h, t5), tph) = butterfly_sub(t5, tp);
1484   let (tb, t8h) = butterfly_sub(tb, t8);
1485   let (tk, tnh) = butterfly_add(tk, tn);
1486   let (tm, tlh) = butterfly_sub(tm, tl);
1487   let (t9, tah) = butterfly_add(t9, ta);
1488   let (tf, tch) = butterfly_sub(tf, tc);
1489   let (tg, tjh) = butterfly_add(tg, tj);
1490   let (ti, thh) = butterfly_sub(ti, th);
1491   let (td, teh) = butterfly_add(td, te);
1492 
1493   // Stage 5
1494   //     301/256 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586
1495   //   1609/2048 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022
1496   //  6393/32768 = Cos[7*Pi/16]                = 0.1950903220161283
1497   let (to, t7) = RotateAdd::kernel(to, t7, ((301, 8), (1609, 11), (6393, 15)));
1498   //  11363/8192 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475
1499   //  9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431
1500   //   4551/8192 = Cos[5*Pi/16]                = 0.5555702330196022
1501   let (tph, t6h) =
1502     RotateAdd::kernel(tph, t6h, ((11363, 13), (9041, 15), (4551, 13)));
1503   //   5681/4096 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475
1504   //  9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431
1505   //   4551/8192 = Cos[5*Pi/16]                = 0.5555702330196022
1506   let (t5, tq) =
1507     RotateNeg::kernel(t5, tq, ((5681, 12), (9041, 15), (4551, 13)));
1508   //   9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586
1509   // 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022
1510   //  6393/32768 = Cos[7*Pi/16]                = 0.1950903220161283
1511   let (t4h, trh) =
1512     RotateNeg::kernel(t4h, trh, ((9633, 13), (12873, 14), (6393, 15)));
1513 
1514   // Stage 6
1515   let (t1, t0) = butterfly_add_asym(t1, t0h);
1516   let (tu, tv) = butterfly_sub_asym(tu, tvh);
1517   let (ts, t2) = butterfly_sub_asym(ts, t2h);
1518   let (t3, tt) = butterfly_sub_asym(t3, tth);
1519   let (t5, t4) = butterfly_add_asym((t5.rshift1(), t5), t4h);
1520   let (tq, tr) = butterfly_sub_asym((tq.rshift1(), tq), trh);
1521   let (t7, t6) = butterfly_add_asym((t7.rshift1(), t7), t6h);
1522   let (to, tp) = butterfly_sub_asym((to.rshift1(), to), tph);
1523   let (t9, t8) = butterfly_add_asym(t9, t8h);
1524   let (tm, tn) = butterfly_sub_asym(tm, tnh);
1525   let (tk, ta) = butterfly_sub_asym(tk, tah);
1526   let (tb, tl) = butterfly_sub_asym(tb, tlh);
1527   let (ti, tc) = butterfly_add_asym(ti, tch);
1528   let (td, tj) = butterfly_add_asym(td, tjh);
1529   let (tf, te) = butterfly_add_asym(tf, teh);
1530   let (tg, th) = butterfly_sub_asym(tg, thh);
1531 
1532   // Stage 7
1533   //     669/512 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766
1534   //  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969
1535   //   3135/8192 = Cos[3*Pi/8]               = 0.3826834323650898
1536   let (t2, tt) = RotateNeg::kernel(t2, tt, ((669, 9), (8867, 14), (3135, 13)));
1537   //     669/512 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766
1538   //  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969
1539   //   3135/8192 = Cos[3*Pi/8]               = 0.3826834323650898
1540   let (ts, t3) = RotateAdd::kernel(ts, t3, ((669, 9), (8867, 14), (3135, 13)));
1541   //     669/512 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766
1542   //  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969
1543   //   3135/8192 = Cos[3*Pi/8]               = 0.3826834323650898
1544   let (ta, tl) = RotateNeg::kernel(ta, tl, ((669, 9), (8867, 14), (3135, 13)));
1545   //     669/512 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766
1546   //  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969
1547   //   3135/8192 = Cos[3*Pi/8]               = 0.3826834323650898
1548   let (tk, tb) = RotateAdd::kernel(tk, tb, ((669, 9), (8867, 14), (3135, 13)));
1549   //     669/512 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766
1550   //  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969
1551   //   3135/8192 = Cos[3*Pi/8]               = 0.3826834323650898
1552   let (tc, tj) = RotateAdd::kernel(tc, tj, ((669, 9), (8867, 14), (3135, 13)));
1553   //     669/512 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766
1554   //  8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969
1555   //   3135/8192 = Cos[3*Pi/8]               = 0.3826834323650898
1556   let (ti, td) = RotateNeg::kernel(ti, td, ((669, 9), (8867, 14), (3135, 13)));
1557   //   5793/4096 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951
1558   //   5793/8192 = Cos[Pi/4]             = 0.7071067811865475
1559   let (tu, t1) = RotatePi4Add::kernel(tu, t1, ((5793, 12), (5793, 13)));
1560   //   5793/4096 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951
1561   //   5793/8192 = Cos[Pi/4]             = 0.7071067811865475
1562   let (tq, t5) = RotatePi4Add::kernel(tq, t5, ((5793, 12), (5793, 13)));
1563   //   5793/4096 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951
1564   //   5793/8192 = Cos[Pi/4]             = 0.7071067811865475
1565   let (tp, t6) = RotatePi4Sub::kernel(tp, t6, ((5793, 12), (5793, 13)));
1566   //   5793/4096 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951
1567   //   5793/8192 = Cos[Pi/4]             = 0.7071067811865475
1568   let (tm, t9) = RotatePi4Add::kernel(tm, t9, ((5793, 12), (5793, 13)));
1569   //   5793/4096 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951
1570   //   5793/8192 = Cos[Pi/4]             = 0.7071067811865475
1571   let (te, th) = RotatePi4Add::kernel(te, th, ((5793, 12), (5793, 13)));
1572 
1573   store_coeffs!(
1574     output, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, te, tf,
1575     tg, th, ti, tj, tk, tl, tm, tn, to, tp, tq, tr, ts, tt, tu, tv
1576   );
1577 }
1578 
1579 #[allow(clippy::identity_op)]
1580 #[$m]
1581 $($s)* fn daala_fdct64<T: TxOperations>(coeffs: &mut [T]) {
1582   assert!(coeffs.len() >= 64);
1583   // Use arrays to avoid ridiculous variable names
1584   let mut asym: [(T, T); 32] = [(T::zero(), T::zero()); 32];
1585   let mut half: [T; 32] = [T::zero(); 32];
1586   // +/- Butterflies with asymmetric output.
1587   {
1588     #[$m]
1589     #[inline]
1590     $($s)* fn butterfly_pair<T: TxOperations>(
1591       half: &mut [T; 32], asym: &mut [(T, T); 32], input: &[T], i: usize
1592     ) {
1593       let j = i * 2;
1594       let (ah, c) = butterfly_neg(input[j], input[63 - j]);
1595       let (b, dh) = butterfly_add(input[j + 1], input[63 - j - 1]);
1596       half[i] = ah;
1597       half[31 - i] = dh;
1598       asym[i] = b;
1599       asym[31 - i] = c;
1600     }
1601     butterfly_pair(&mut half, &mut asym, coeffs, 0);
1602     butterfly_pair(&mut half, &mut asym, coeffs, 1);
1603     butterfly_pair(&mut half, &mut asym, coeffs, 2);
1604     butterfly_pair(&mut half, &mut asym, coeffs, 3);
1605     butterfly_pair(&mut half, &mut asym, coeffs, 4);
1606     butterfly_pair(&mut half, &mut asym, coeffs, 5);
1607     butterfly_pair(&mut half, &mut asym, coeffs, 6);
1608     butterfly_pair(&mut half, &mut asym, coeffs, 7);
1609     butterfly_pair(&mut half, &mut asym, coeffs, 8);
1610     butterfly_pair(&mut half, &mut asym, coeffs, 9);
1611     butterfly_pair(&mut half, &mut asym, coeffs, 10);
1612     butterfly_pair(&mut half, &mut asym, coeffs, 11);
1613     butterfly_pair(&mut half, &mut asym, coeffs, 12);
1614     butterfly_pair(&mut half, &mut asym, coeffs, 13);
1615     butterfly_pair(&mut half, &mut asym, coeffs, 14);
1616     butterfly_pair(&mut half, &mut asym, coeffs, 15);
1617   }
1618 
1619   let mut temp_out: [T; 64] = [T::zero(); 64];
1620   // Embedded 2-point transforms with asymmetric input.
1621   daala_fdct_ii_32_asym(
1622     half[0],
1623     asym[0],
1624     half[1],
1625     asym[1],
1626     half[2],
1627     asym[2],
1628     half[3],
1629     asym[3],
1630     half[4],
1631     asym[4],
1632     half[5],
1633     asym[5],
1634     half[6],
1635     asym[6],
1636     half[7],
1637     asym[7],
1638     half[8],
1639     asym[8],
1640     half[9],
1641     asym[9],
1642     half[10],
1643     asym[10],
1644     half[11],
1645     asym[11],
1646     half[12],
1647     asym[12],
1648     half[13],
1649     asym[13],
1650     half[14],
1651     asym[14],
1652     half[15],
1653     asym[15],
1654     &mut temp_out[0..32],
1655   );
1656   daala_fdst_iv_32_asym(
1657     asym[31],
1658     half[31],
1659     asym[30],
1660     half[30],
1661     asym[29],
1662     half[29],
1663     asym[28],
1664     half[28],
1665     asym[27],
1666     half[27],
1667     asym[26],
1668     half[26],
1669     asym[25],
1670     half[25],
1671     asym[24],
1672     half[24],
1673     asym[23],
1674     half[23],
1675     asym[22],
1676     half[22],
1677     asym[21],
1678     half[21],
1679     asym[20],
1680     half[20],
1681     asym[19],
1682     half[19],
1683     asym[18],
1684     half[18],
1685     asym[17],
1686     half[17],
1687     asym[16],
1688     half[16],
1689     &mut temp_out[32..64],
1690   );
1691   temp_out[32..64].reverse();
1692 
1693   // Store a reordered version of output in temp_out
1694   #[$m]
1695   #[inline]
1696   $($s)* fn reorder_4<T: TxOperations>(
1697     output: &mut [T], i: usize, tmp: [T; 64], j: usize
1698   ) {
1699     output[0 + i * 4] = tmp[0 + j];
1700     output[1 + i * 4] = tmp[32 + j];
1701     output[2 + i * 4] = tmp[16 + j];
1702     output[3 + i * 4] = tmp[48 + j];
1703   }
1704   reorder_4(coeffs, 0, temp_out, 0);
1705   reorder_4(coeffs, 1, temp_out, 8);
1706   reorder_4(coeffs, 2, temp_out, 4);
1707   reorder_4(coeffs, 3, temp_out, 12);
1708   reorder_4(coeffs, 4, temp_out, 2);
1709   reorder_4(coeffs, 5, temp_out, 10);
1710   reorder_4(coeffs, 6, temp_out, 6);
1711   reorder_4(coeffs, 7, temp_out, 14);
1712 
1713   reorder_4(coeffs, 8, temp_out, 1);
1714   reorder_4(coeffs, 9, temp_out, 9);
1715   reorder_4(coeffs, 10, temp_out, 5);
1716   reorder_4(coeffs, 11, temp_out, 13);
1717   reorder_4(coeffs, 12, temp_out, 3);
1718   reorder_4(coeffs, 13, temp_out, 11);
1719   reorder_4(coeffs, 14, temp_out, 7);
1720   reorder_4(coeffs, 15, temp_out, 15);
1721 }
1722 
// Forward identity transform: deliberately leaves the coefficients untouched.
// NOTE(review): any scaling for the identity transform is presumably applied
// by the surrounding shift stages rather than here — confirm at the call site.
#[$m]
$($s)* fn fidentity<T: TxOperations>(_coeffs: &mut [T]) {}
1725 
1726 }
1727 
1728 }
1729