1 // Vorbis decoder written in Rust
2 //
3 // Copyright (c) 2016 est31 <MTest31@outlook.com>
4 // and contributors. All rights reserved.
5 // Licensed under MIT license, or Apache 2 license,
6 // at your option. Please see the LICENSE file
7 // attached to this source distribution for details.
8
9 // This file is a very close translation of the
10 // implementation of the algorithm from stb_vorbis.
11
12 use ::header_cached::CachedBlocksizeDerived;
13
// Iteration 0 of "step 3" of the optimized IMDCT kernel (see inverse_mdct).
// Walks two cursors downward through `e` — one starting at `i_off`, the
// other at `i_off + k_off` (k_off is negative at the call sites, hence the
// isize arithmetic) — performing butterflies: the sum stays at the first
// cursor, the twiddle-rotated difference is stored at the second cursor.
// The twiddle-factor cursor into `a` advances by 8 per butterfly pair.
fn imdct_step3_iter0_loop(n :usize, e :&mut[f32], i_off :usize, k_off :isize, a :&[f32]) {
	let mut a_offs = 0;
	let mut i_offs = i_off;
	let mut k_offs = i_off as isize + k_off;

	// Accessor macros emulating stb_vorbis' C pointer arithmetic:
	// ee0 indexes relative to the first cursor, ee2 relative to the
	// second, aa relative to the current twiddle-factor position.
	macro_rules! ee0 {
		(-$x:expr) => {e[i_offs - ($x as usize)]};
		($x:expr) => {e[i_offs + ($x as usize)]}
	}

	macro_rules! ee2 {
		(-$x:expr) => {e[(k_offs - $x) as usize]};
		($x:expr) => {e[(k_offs + $x) as usize]}
	}

	macro_rules! aa {
		($x:expr) => {a[a_offs + ($x as usize)]}
	}

	// Each loop iteration below handles four element pairs, so the
	// iteration count n must be divisible by 4.
	assert_eq!((n & 3), 0);

	for _ in 0 .. n >> 2 {
		let mut k00_20 = ee0![ 0] - ee2![ 0];
		let mut k01_21 = ee0![-1] - ee2![-1];
		ee0![ 0] += ee2![ 0];
		ee0![-1] += ee2![-1];
		ee2![ 0] = k00_20 * aa![0] - k01_21 * aa![1];
		ee2![-1] = k01_21 * aa![0] + k00_20 * aa![1];
		a_offs += 8;

		k00_20 = ee0![-2] - ee2![-2];
		k01_21 = ee0![-3] - ee2![-3];
		ee0![-2] += ee2![-2];
		ee0![-3] += ee2![-3];
		ee2![-2] = k00_20 * aa![0] - k01_21 * aa![1];
		ee2![-3] = k01_21 * aa![0] + k00_20 * aa![1];
		a_offs += 8;

		k00_20 = ee0![-4] - ee2![-4];
		k01_21 = ee0![-5] - ee2![-5];
		ee0![-4] += ee2![-4];
		ee0![-5] += ee2![-5];
		ee2![-4] = k00_20 * aa![0] - k01_21 * aa![1];
		ee2![-5] = k01_21 * aa![0] + k00_20 * aa![1];
		a_offs += 8;

		k00_20 = ee0![-6] - ee2![-6];
		k01_21 = ee0![-7] - ee2![-7];
		ee0![-6] += ee2![-6];
		ee0![-7] += ee2![-7];
		ee2![-6] = k00_20 * aa![0] - k01_21 * aa![1];
		ee2![-7] = k01_21 * aa![0] + k00_20 * aa![1];

		// Both data cursors move down by 8 elements per iteration.
		a_offs += 8;
		i_offs -= 8;
		k_offs -= 8;
	}
}
72
// The inner "r" loop of step 3 of the optimized IMDCT kernel (see
// inverse_mdct). Structurally the same butterfly pass as
// imdct_step3_iter0_loop, but the stride through the twiddle-factor
// table is the caller-supplied `k1` instead of a fixed 8.
// Runs lim/4 iterations, each handling 8 elements per cursor.
fn imdct_step3_inner_r_loop(lim :usize, e :&mut [f32],
		d0 :usize, k_off :isize, a :&[f32], k1 :usize) {
	let mut a_offs = 0;
	let mut d0_offs = d0;
	// k_off is negative at the call sites, hence the isize arithmetic.
	let mut k_offs = d0 as isize + k_off;

	// Accessor macros emulating stb_vorbis' C pointer arithmetic.
	macro_rules! e0 {
		(-$x:expr) => {e[d0_offs - ($x as usize)]};
		($x:expr) => {e[d0_offs + ($x as usize)]}
	}

	macro_rules! e2 {
		(-$x:expr) => {e[(k_offs - $x) as usize]};
		($x:expr) => {e[(k_offs + $x) as usize]}
	}

	macro_rules! aa {
		($x:expr) => {a[a_offs + ($x as usize)]}
	}

	for _ in 0 .. lim >> 2 {
		let mut k00_20 = e0![-0] - e2![-0];
		let mut k01_21 = e0![-1] - e2![-1];
		e0![-0] += e2![-0];
		e0![-1] += e2![-1];
		e2![-0] = (k00_20) * aa![0] - (k01_21) * aa![1];
		e2![-1] = (k01_21) * aa![0] + (k00_20) * aa![1];

		a_offs += k1;

		k00_20 = e0![-2] - e2![-2];
		k01_21 = e0![-3] - e2![-3];
		e0![-2] += e2![-2];
		e0![-3] += e2![-3];
		e2![-2] = (k00_20) * aa![0] - (k01_21) * aa![1];
		e2![-3] = (k01_21) * aa![0] + (k00_20) * aa![1];

		a_offs += k1;

		k00_20 = e0![-4] - e2![-4];
		k01_21 = e0![-5] - e2![-5];
		e0![-4] += e2![-4];
		e0![-5] += e2![-5];
		e2![-4] = (k00_20) * aa![0] - (k01_21) * aa![1];
		e2![-5] = (k01_21) * aa![0] + (k00_20) * aa![1];

		a_offs += k1;

		k00_20 = e0![-6] - e2![-6];
		k01_21 = e0![-7] - e2![-7];
		e0![-6] += e2![-6];
		e0![-7] += e2![-7];
		e2![-6] = (k00_20) * aa![0] - (k01_21) * aa![1];
		e2![-7] = (k01_21) * aa![0] + (k00_20) * aa![1];

		// Both data cursors move down by 8 elements per iteration.
		d0_offs -= 8;
		k_offs -= 8;

		a_offs += k1;
	}
}
134
// The inner "s" loop of step 3 of the optimized IMDCT kernel (see
// inverse_mdct). Unlike the "r" loop, the eight twiddle factors used
// here (a0..a7) are loaded once up front, since the same factors are
// reused across all n iterations; the two data cursors move downward
// by `k0` per iteration instead.
fn imdct_step3_inner_s_loop(n :usize, e :&mut [f32], i_off :usize, k_off :isize,
		a :&[f32], a_off :usize, k0 :usize) {
	let a0 = a[0];
	let a1 = a[0+1];
	let a2 = a[0+a_off];
	let a3 = a[0+a_off+1];
	let a4 = a[0+a_off*2+0];
	let a5 = a[0+a_off*2+1];
	let a6 = a[0+a_off*3+0];
	let a7 = a[0+a_off*3+1];

	let mut i_offs = i_off;
	// k_off is negative; the resulting index is still in bounds.
	let mut k_offs = (i_off as isize + k_off) as usize;

	// Accessor macros emulating stb_vorbis' C pointer arithmetic.
	macro_rules! ee0 {
		(-$x:expr) => {e[i_offs - ($x as usize)]};
		($x:expr) => {e[i_offs + ($x as usize)]}
	}

	macro_rules! ee2 {
		(-$x:expr) => {e[k_offs - ($x as usize)]};
		($x:expr) => {e[k_offs + ($x as usize)]}
	}

	let mut i = 0;
	loop {
		let mut k00 = ee0![ 0] - ee2![ 0];
		let mut k11 = ee0![-1] - ee2![-1];
		ee0![ 0] = ee0![ 0] + ee2![ 0];
		ee0![-1] = ee0![-1] + ee2![-1];
		ee2![ 0] = (k00) * a0 - (k11) * a1;
		ee2![-1] = (k11) * a0 + (k00) * a1;

		k00 = ee0![-2] - ee2![-2];
		k11 = ee0![-3] - ee2![-3];
		ee0![-2] = ee0![-2] + ee2![-2];
		ee0![-3] = ee0![-3] + ee2![-3];
		ee2![-2] = (k00) * a2 - (k11) * a3;
		ee2![-3] = (k11) * a2 + (k00) * a3;

		k00 = ee0![-4] - ee2![-4];
		k11 = ee0![-5] - ee2![-5];
		ee0![-4] = ee0![-4] + ee2![-4];
		ee0![-5] = ee0![-5] + ee2![-5];
		ee2![-4] = (k00) * a4 - (k11) * a5;
		ee2![-5] = (k11) * a4 + (k00) * a5;

		k00 = ee0![-6] - ee2![-6];
		k11 = ee0![-7] - ee2![-7];
		ee0![-6] = ee0![-6] + ee2![-6];
		ee0![-7] = ee0![-7] + ee2![-7];
		ee2![-6] = (k00) * a6 - (k11) * a7;
		ee2![-7] = (k11) * a6 + (k00) * a7;

		i += 1;
		// we have this check instead of a for loop
		// over an iterator because otherwise we
		// overflow.
		if i >= n {
			break;
		}
		i_offs -= k0;
		k_offs -= k0;
	}
}
200
#[inline]
fn iter_54(zm7 :&mut [f32]) {
	// Difference from the stb_vorbis implementation: `zm7` points
	// at z minus 7, because Rust disallows negative indexing.
	//
	// Load every input first. The original interleaves loads and
	// stores, but each slot is only ever read before it is written,
	// so hoisting all the reads performs the exact same arithmetic.
	let y0 = zm7[7] + zm7[3]; // z0 + z4
	let k00 = zm7[7] - zm7[3]; // z0 - z4
	let y2 = zm7[5] + zm7[1]; // z2 + z6
	let k22 = zm7[5] - zm7[1]; // z2 - z6
	let y1 = zm7[6] + zm7[2]; // z1 + z5
	let k11 = zm7[6] - zm7[2]; // z1 - z5
	let y3 = zm7[4] + zm7[0]; // z3 + z7
	let k33 = zm7[4] - zm7[0]; // z3 - z7

	// Now emit all eight outputs of the radix butterfly.
	zm7[7] = y0 + y2;   // z0 + z4 + z2 + z6
	zm7[5] = y0 - y2;   // z0 + z4 - z2 - z6
	zm7[3] = k00 + k33; // z0 - z4 + z3 - z7
	zm7[1] = k00 - k33; // z0 - z4 - z3 + z7
	zm7[6] = y1 + y3;   // z1 + z5 + z3 + z7
	zm7[4] = y1 - y3;   // z1 + z5 - z3 - z7
	zm7[2] = k11 - k22; // z1 - z5 + z2 - z6
	zm7[0] = k11 + k22; // z1 - z5 - z2 + z6
}
233
// The interleaved iterations ld-6, ld-5 and ld-4 of step 3 of the
// optimized IMDCT kernel (see inverse_mdct). The twiddle constants of
// the last two passes are all 1 and 0, which lets them be folded into
// this single pass without the multiplications; the remaining radix
// work per 16-element group is done by two calls to iter_54.
fn imdct_step3_inner_s_loop_ld654(n :usize, e :&mut [f32], i_off :usize,
		a :&[f32], base_n :usize)
{
	let a_off = base_n >> 3;
	// The only twiddle factor actually needed in this combined pass.
	let a2 = a[a_off];

	let mut z_offs = i_off;

	// Lowest group start position. NOTE(review): `1 as usize` binds to
	// the literal only, so this is i_off - 16*(n-1); the parentheses
	// merely silence the unused-parens ambiguity, not change grouping.
	let basep16 = i_off - 16 * (n - 1 as usize);

	// Accessor macro emulating stb_vorbis' C pointer arithmetic;
	// only negative (downward) offsets are ever used here.
	macro_rules! z {
		(-$x:expr) => {e[z_offs - ($x as usize)]}
	}

	loop {
		let mut k00 = z![-0] - z![-8];
		let mut k11 = z![-1] - z![-9];
		z![-0] = z![-0] + z![-8];
		z![-1] = z![-1] + z![-9];
		z![-8] = k00;
		z![-9] = k11;

		k00 = z![ -2] - z![-10];
		k11 = z![ -3] - z![-11];
		z![ -2] = z![ -2] + z![-10];
		z![ -3] = z![ -3] + z![-11];
		z![-10] = (k00+k11) * a2;
		z![-11] = (k11-k00) * a2;

		k00 = z![-12] - z![ -4]; // reverse to avoid a unary negation
		k11 = z![ -5] - z![-13];
		z![ -4] = z![ -4] + z![-12];
		z![ -5] = z![ -5] + z![-13];
		z![-12] = k11;
		z![-13] = k00;

		k00 = z![-14] - z![ -6]; // reverse to avoid a unary negation
		k11 = z![ -7] - z![-15];
		z![ -6] = z![ -6] + z![-14];
		z![ -7] = z![ -7] + z![-15];
		z![-14] = (k00+k11) * a2;
		z![-15] = (k00-k11) * a2;

		// Finish the two halves of this 16-element group; the slices
		// passed start at "z minus 7" as iter_54 expects.
		iter_54(e.split_at_mut(z_offs - 7).1);
		iter_54(e.split_at_mut(z_offs - 7 - 8).1);
		// We need to compare with basep16 here
		// in order to prevent a possible overflow
		// in calculation of base, and in calculation
		// of z_offs.
		if z_offs <= basep16 {
			break;
		}
		z_offs -= 16;
	}
}
289
/// Computes the inverse MDCT of `buffer`, in place.
///
/// `buffer.len()` must equal `1 << bs` (asserted below). The twiddle
/// factor tables `a`, `b`, `c` and the bit-reverse table are taken from
/// `cached_bd`, which is assumed to have been derived from the same
/// blocksize `bs` — TODO confirm callers uphold this.
#[allow(dead_code)]
pub fn inverse_mdct(cached_bd :&CachedBlocksizeDerived, buffer :&mut [f32], bs :u8) {
	let n = buffer.len();
	// Pre-condition.
	assert_eq!(n, 1 << bs);

	let n2 = n >> 1;
	let n4 = n >> 2;
	let n8 = n >> 3;

	// TODO later on we might want to do Vec::with_capacity here,
	// and use buf2.push everywhere...
	let mut buf2 :Vec<f32> = vec![0.0; n2];

	// Retrieve the cached twiddle factors.
	let ctf = &cached_bd.twiddle_factors;
	let a :&[f32] = &ctf.a;
	let b :&[f32] = &ctf.b;
	let c :&[f32] = &ctf.c;

	// Emulates C's decrementing unsigned loop counters: breaks out of
	// the enclosing loop once the subtraction would underflow.
	macro_rules! break_if_sub_overflows {
		($i:ident, $x:expr) => {
			$i = match $i.checked_sub($x) {
				Some(v) => v,
				None => break,
			};
		}
	}

	// IMDCT algorithm from "The use of multirate filter banks for coding of high quality digital audio"
	// See notes about bugs in that paper in less-optimal implementation 'inverse_mdct_old' in stb_vorbis original.

	// kernel from paper

	// merged:
	// copy and reflect spectral data
	// step 0

	// note that it turns out that the items added together during
	// this step are, in fact, being added to themselves (as reflected
	// by step 0). inexplicable inefficiency! this became obvious
	// once I combined the passes.

	// so there's a missing 'times 2' here (for adding X to itself).
	// this propagates through linearly to the end, where the numbers
	// are 1/2 too small, and need to be compensated for.

	{
		let mut a_offs = 0;
		// d walks downward through buf2 while e walks up through buffer.
		let mut d_offs = n2 - 2;
		let mut e_offs = 0;
		let e_stop = n2;

		macro_rules! d {
			($x:expr) => {buf2[d_offs + ($x as usize)]}
		}
		macro_rules! aa {
			($x:expr) => {a[a_offs + ($x as usize)]}
		}
		macro_rules! e {
			($x:expr) => {buffer[e_offs + ($x as usize)]}
		}

		// TODO replace the while with a for once step_by on iterators
		// is stabilized
		while e_offs != e_stop {
			d![1] = e![0] * aa![0] - e![2]*aa![1];
			d![0] = e![0] * aa![1] + e![2]*aa![0];
			d_offs -= 2;
			a_offs += 2;
			e_offs += 4;
		}

		// Second half: the reflected (negated) spectral data.
		e_offs = n2 - 3;
		loop {
			d![1] = -e![2] * aa![0] - -e![0]*aa![1];
			d![0] = -e![2] * aa![1] + -e![0]*aa![0];
			break_if_sub_overflows!(d_offs, 2);
			a_offs += 2;
			e_offs -= 4;
		}
	}


	{
		// now we use symbolic names for these, so that we can
		// possibly swap their meaning as we change which operations
		// are in place

		let u = &mut *buffer;
		let v = &mut *buf2;

		// step 2 (paper output is w, now u)
		// this could be in place, but the data ends up in the wrong
		// place... _somebody_'s got to swap it, so this is nominated
		{
			let mut a_offs = n2 - 8;
			let mut d0_offs = n4;
			let mut d1_offs = 0;
			let mut e0_offs = n4;
			let mut e1_offs = 0;

			macro_rules! aa {
				($x:expr) => {a[a_offs + ($x as usize)]}
			}
			macro_rules! d0 {
				($x:expr) => {u[d0_offs + ($x as usize)]}
			}
			macro_rules! d1 {
				($x:expr) => {u[d1_offs + ($x as usize)]}
			}
			macro_rules! e0 {
				($x:expr) => {v[e0_offs + ($x as usize)]}
			}
			macro_rules! e1 {
				($x:expr) => {v[e1_offs + ($x as usize)]}
			}

			loop {
				let mut v41_21 = e0![1] - e1![1];
				let mut v40_20 = e0![0] - e1![0];
				d0![1] = e0![1] + e1![1];
				d0![0] = e0![0] + e1![0];
				d1![1] = v41_21*aa![4] - v40_20*aa![5];
				d1![0] = v40_20*aa![4] + v41_21*aa![5];

				v41_21 = e0![3] - e1![3];
				v40_20 = e0![2] - e1![2];
				d0![3] = e0![3] + e1![3];
				d0![2] = e0![2] + e1![2];
				d1![3] = v41_21*aa![0] - v40_20*aa![1];
				d1![2] = v40_20*aa![0] + v41_21*aa![1];

				break_if_sub_overflows!(a_offs, 8);

				d0_offs += 4;
				d1_offs += 4;
				e0_offs += 4;
				e1_offs += 4;
			}
		}


		// step 3

		let ld = bs as usize;

		// optimized step 3:

		// the original step3 loop can be nested r inside s or s inside r;
		// it's written originally as s inside r, but this is dumb when r
		// iterates many times, and s few. So I have two copies of it and
		// switch between them halfway.

		// this is iteration 0 of step 3
		imdct_step3_iter0_loop(n >> 4, u, n2-1-n4*0, -(n as isize >> 3), a);
		imdct_step3_iter0_loop(n >> 4, u, n2-1-n4*1, -(n as isize >> 3), a);

		// this is iteration 1 of step 3
		imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*0, -(n as isize >> 4), a, 16);
		imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*1, -(n as isize >> 4), a, 16);
		imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*2, -(n as isize >> 4), a, 16);
		imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*3, -(n as isize >> 4), a, 16);

		// Remaining iterations: r loop inside s while r is large...
		for l in 2 .. (ld - 3) >> 1 {
			let k0 = n >> (l + 2);
			let k0_2 = k0 as isize >> 1;
			let lim = 1 << (l+1);
			for i in 0 .. lim {
				imdct_step3_inner_r_loop(n >> (l + 4),
					u, n2-1 - k0*i, -k0_2, a, 1 << (l+3));
			}
		}
		// ...then s loop inside r once s becomes the longer dimension.
		for l in (ld - 3) >> 1 .. ld - 6 {
			let k0 = n >> (l + 2);
			let k1 = 1 << (l + 3);
			let k0_2 = k0 as isize >> 1;
			let rlim = n >> (l + 6);
			let lim = 1 << (l + 1);
			let mut i_off = n2 - 1;
			let mut a_off = 0;
			for _ in 0 .. rlim {
				let a0 = a.split_at(a_off).1;
				imdct_step3_inner_s_loop(lim, u, i_off, -k0_2, a0, k1, k0);
				a_off += k1 * 4;
				i_off -= 8;
			}
		}

		// iterations with count:
		// ld-6,-5,-4 all interleaved together
		// the big win comes from getting rid of needless flops
		// due to the constants on pass 5 & 4 being all 1 and 0;
		// combining them to be simultaneous to improve cache made little difference
		imdct_step3_inner_s_loop_ld654(n >> 5, u, n2 - 1, a, n);

		// output is u

		// step 4, 5, and 6
		// cannot be in-place because of step 5
		{
			let bitrev_vec = &cached_bd.bitrev;
			// weirdly, I'd have thought reading sequentially and writing
			// erratically would have been better than vice-versa, but in
			// fact that's not what my testing showed. (That is, with
			// j = bitreverse(i), do you read i and write j, or read j and write i.)

			let mut d0_offs = n4 - 4;
			let mut d1_offs = n2 - 4;
			let mut bitrev_offs = 0;

			macro_rules! d0 {
				($x:expr) => {v[d0_offs + ($x as usize)]}
			}
			macro_rules! d1 {
				($x:expr) => {v[d1_offs + ($x as usize)]}
			}
			macro_rules! bitrev {
				($x:expr) => {bitrev_vec[bitrev_offs + ($x as usize)]}
			}

			loop {
				let mut k4 = bitrev![0] as usize;
				d1![3] = u[k4 + 0];
				d1![2] = u[k4 + 1];
				d0![3] = u[k4 + 2];
				d0![2] = u[k4 + 3];

				k4 = bitrev![1] as usize;
				d1![1] = u[k4 + 0];
				d1![0] = u[k4 + 1];
				d0![1] = u[k4 + 2];
				d0![0] = u[k4 + 3];

				break_if_sub_overflows!(d0_offs, 4);
				d1_offs -= 4;
				bitrev_offs += 2;
			}
		}
		// (paper output is u, now v)

		// step 7 (paper output is v, now v)
		// this is now in place
		{
			// d walks up from the start while e walks down from the
			// middle; they meet at n4.
			let mut c_offs = 0;
			let mut d_offs = 0;
			let mut e_offs = n2 - 4;

			macro_rules! cc {
				($x:expr) => {c[c_offs + ($x as usize)]}
			}
			macro_rules! d {
				($x:expr) => {v[d_offs + ($x as usize)]}
			}
			macro_rules! e {
				($x:expr) => {v[e_offs + ($x as usize)]}
			}
			while d_offs < e_offs {
				let mut a02 = d![0] - e![2];
				let mut a11 = d![1] + e![3];

				let mut b0 = cc![1]*a02 + cc![0]*a11;
				let mut b1 = cc![1]*a11 - cc![0]*a02;

				let mut b2 = d![0] + e![ 2];
				let mut b3 = d![1] - e![ 3];

				d![0] = b2 + b0;
				d![1] = b3 + b1;
				e![2] = b2 - b0;
				e![3] = b1 - b3;

				a02 = d![2] - e![0];
				a11 = d![3] + e![1];

				b0 = cc![3]*a02 + cc![2]*a11;
				b1 = cc![3]*a11 - cc![2]*a02;

				b2 = d![2] + e![ 0];
				b3 = d![3] - e![ 1];

				d![2] = b2 + b0;
				d![3] = b3 + b1;
				e![0] = b2 - b0;
				e![1] = b1 - b3;

				c_offs += 4;
				d_offs += 4;
				e_offs -= 4;
			}
		}
	}

	// step 8+decode (paper output is X, now buffer)
	// this generates pairs of data a la 8 and pushes them directly through
	// the decode kernel (pushing rather than pulling) to avoid having
	// to make another pass later

	// this cannot POSSIBLY be in place, so we refer to the buffers directly
	{
		// Four output cursors cover the four quarters of the output
		// buffer; b and e walk downward through the twiddle factors
		// and the intermediate data respectively.
		let mut d0_offs = 0;
		let mut d1_offs = n2 - 4;
		let mut d2_offs = n2;
		let mut d3_offs = n - 4;

		let mut b_offs = n2 - 8;
		let mut e_offs = n2 - 8;

		macro_rules! d0 {
			($x:expr) => {buffer[d0_offs + ($x as usize)]}
		}
		macro_rules! d1 {
			($x:expr) => {buffer[d1_offs + ($x as usize)]}
		}
		macro_rules! d2 {
			($x:expr) => {buffer[d2_offs + ($x as usize)]}
		}
		macro_rules! d3 {
			($x:expr) => {buffer[d3_offs + ($x as usize)]}
		}

		macro_rules! b {
			($x:expr) => {b[b_offs + ($x as usize)]}
		}
		macro_rules! e {
			($x:expr) => {buf2[e_offs + ($x as usize)]}
		}

		loop {
			let mut p3 = e![6]*b![7] - e![7]*b![6];
			let mut p2 = -e![6]*b![6] - e![7]*b![7];

			d0![0] = p3;
			d1![3] = - p3;
			d2![0] = p2;
			d3![3] = p2;

			let mut p1 = e![4]*b![5] - e![5]*b![4];
			let mut p0 = -e![4]*b![4] - e![5]*b![5];

			d0![1] = p1;
			d1![2] = - p1;
			d2![1] = p0;
			d3![2] = p0;

			p3 = e![2]*b![3] - e![3]*b![2];
			p2 = -e![2]*b![2] - e![3]*b![3];

			d0![2] = p3;
			d1![1] = - p3;
			d2![2] = p2;
			d3![1] = p2;

			p1 = e![0]*b![1] - e![1]*b![0];
			p0 = -e![0]*b![0] - e![1]*b![1];

			d0![3] = p1;
			d1![0] = - p1;
			d2![3] = p0;
			d3![0] = p0;

			break_if_sub_overflows!(e_offs, 8);
			b_offs -= 8;
			d0_offs += 4;
			d2_offs += 4;
			d1_offs -= 4;
			d3_offs -= 4;
		}
	}
}
660
/// Naive, direct implementation of the inverse MDCT, in place.
///
/// Kept as a readable reference for the optimized `inverse_mdct`;
/// the fixed-size scratch arrays below limit it to block sizes of
/// at most `1 << 13`.
#[allow(dead_code)]
pub fn inverse_mdct_naive(cached_bd :&CachedBlocksizeDerived, buffer :&mut[f32]) {
	let n = buffer.len();
	let n2 = n >> 1;
	let n4 = n >> 2;
	let n8 = n >> 3;
	let n3_4 = n - n4;

	// Scratch buffers, sized for the maximum supported blocksize.
	let mut u = [0.0; 1 << 13];
	let mut xa = [0.0; 1 << 13];
	let mut v = [0.0; 1 << 13];
	let mut w = [0.0; 1 << 13];

	// retrieve the cached twiddle factors
	let ctf = &cached_bd.twiddle_factors;
	let a :&[f32] = &ctf.a;
	let b :&[f32] = &ctf.b;
	let c :&[f32] = &ctf.c;

	// IMDCT algorithm from "The use of multirate filter banks for coding of high quality digital audio"
	// Note there are bugs in that pseudocode, presumably due to them attempting
	// to rename the arrays nicely rather than representing the way their actual
	// implementation bounces buffers back and forth. As a result, even in the
	// "some formulas corrected" version, a direct implementation fails. These
	// are noted below as "paper bug".

	// copy and reflect spectral data
	for k in 0 .. n2 {
		u[k] = buffer[k];
	}
	for k in n2 .. n {
		u[k] = -buffer[n - k - 1];
	}

	let mut k2 = 0;
	let mut k4 = 0;

	// kernel from paper
	// step 1
	while k2 < n2 { // n4 iterations
		v[n-k4-1] = (u[k4] - u[n-k4-1]) * a[k2] - (u[k4+2] - u[n-k4-3])*a[k2+1];
		v[n-k4-3] = (u[k4] - u[n-k4-1]) * a[k2+1] + (u[k4+2] - u[n-k4-3])*a[k2];
		k2 += 2;
		k4 += 4;
	}
	// step 2
	k4 = 0;
	while k4 < n2 { // n8 iterations
		w[n2+3+k4] = v[n2+3+k4] + v[k4+3];
		w[n2+1+k4] = v[n2+1+k4] + v[k4+1];
		w[k4+3] = (v[n2+3+k4] - v[k4+3])*a[n2-4-k4] - (v[n2+1+k4]-v[k4+1])*a[n2-3-k4];
		w[k4+1] = (v[n2+1+k4] - v[k4+1])*a[n2-4-k4] + (v[n2+3+k4]-v[k4+3])*a[n2-3-k4];
		k4 += 4;
	}

	// step 3
	let ld :usize = (::ilog(n as u64) - 1) as usize;
	for l in 0 .. ld - 3 {
		let k0 = n >> (l+2);
		let k1 = 1 << (l+3);
		let rlim = n >> (l+4);
		let slim = 1 << (l+1);
		let mut r4 = 0;
		for r in 0 .. rlim {
			let mut s2 = 0;
			for _ in 0 .. slim {
				u[n-1-k0*s2-r4] = w[n-1-k0*s2-r4] + w[n-1-k0*(s2+1)-r4];
				u[n-3-k0*s2-r4] = w[n-3-k0*s2-r4] + w[n-3-k0*(s2+1)-r4];
				u[n-1-k0*(s2+1)-r4] = (w[n-1-k0*s2-r4] - w[n-1-k0*(s2+1)-r4]) * a[r*k1]
					- (w[n-3-k0*s2-r4] - w[n-3-k0*(s2+1)-r4]) * a[r*k1+1];
				u[n-3-k0*(s2+1)-r4] = (w[n-3-k0*s2-r4] - w[n-3-k0*(s2+1)-r4]) * a[r*k1]
					+ (w[n-1-k0*s2-r4] - w[n-1-k0*(s2+1)-r4]) * a[r*k1+1];
				s2 += 2;
			}
			r4 += 4;
		}
		if l+1 < ld-3 {
			// paper bug: ping-ponging of u&w here is omitted
			w.copy_from_slice(&u);
		}
	}

	// step 4
	for i in 0 .. n8 {
		let j = (::bit_reverse(i as u32) >> (32-ld+3)) as usize;
		assert!(j < n8);
		if i == j {
			// paper bug: original code probably swapped in place; if copying,
			// need to directly copy in this case
			let ii = i << 3;
			v[ii+1] = u[ii+1];
			v[ii+3] = u[ii+3];
			v[ii+5] = u[ii+5];
			v[ii+7] = u[ii+7];
		} else if i < j {
			let ii = i << 3;
			let j8 = j << 3;
			v[j8+1] = u[ii+1];
			v[ii+1] = u[j8 + 1];
			v[j8+3] = u[ii+3];
			v[ii+3] = u[j8 + 3];
			v[j8+5] = u[ii+5];
			v[ii+5] = u[j8 + 5];
			v[j8+7] = u[ii+7];
			v[ii+7] = u[j8 + 7];
		}
	}

	// step 5
	for k in 0 .. n2 {
		w[k] = v[k*2+1];
	}
	// step 6
	let mut k2 = 0;
	let mut k4 = 0;
	while k2 < n4 { // n8 iterations
		u[n-1-k2] = w[k4];
		u[n-2-k2] = w[k4+1];
		u[n3_4 - 1 - k2] = w[k4+2];
		u[n3_4 - 2 - k2] = w[k4+3];
		k2 += 2;
		k4 += 4;
	}
	// step 7
	k2 = 0;
	while k2 < n4 { // n8 iterations
		v[n2 + k2 ] = ( u[n2 + k2] + u[n-2-k2] + c[k2+1]*(u[n2+k2]-u[n-2-k2]) + c[k2]*(u[n2+k2+1]+u[n-2-k2+1]))/2.0;
		v[n-2 - k2] = ( u[n2 + k2] + u[n-2-k2] - c[k2+1]*(u[n2+k2]-u[n-2-k2]) - c[k2]*(u[n2+k2+1]+u[n-2-k2+1]))/2.0;
		v[n2+1+ k2] = ( u[n2+1+k2] - u[n-1-k2] + c[k2+1]*(u[n2+1+k2]+u[n-1-k2]) - c[k2]*(u[n2+k2]-u[n-2-k2]))/2.0;
		v[n-1 - k2] = (-u[n2+1+k2] + u[n-1-k2] + c[k2+1]*(u[n2+1+k2]+u[n-1-k2]) - c[k2]*(u[n2+k2]-u[n-2-k2]))/2.0;
		k2 += 2;
	}
	// step 8
	k2 = 0;
	for k in 0 .. n4 {
		xa[k] = v[k2+n2]*b[k2 ] + v[k2+1+n2]*b[k2+1];
		xa[n2-1-k] = v[k2+n2]*b[k2+1] - v[k2+1+n2]*b[k2 ];
		k2 += 2;
	}

	// decode kernel to output

	// The middle half of xa goes out unchanged, the outer quarters
	// reflected and negated.
	for i in 0 .. n4 {
		buffer[i] = xa[i + n4];
	}
	for i in n4 .. n3_4 {
		buffer[i] = -xa[n3_4 - i - 1];
	}
	for i in n3_4 .. n {
		buffer[i] = -xa[i - n3_4];
	}
}
813
#[cfg(test)]
#[test]
fn test_imdct_naive() {
	use imdct_test::*;
	// Run the naive (reference) IMDCT over the shared test input and
	// fuzzy-compare against the known-good output.
	let mut arr_1 = imdct_prepare(&IMDCT_INPUT_TEST_ARR_1);
	let cbd = CachedBlocksizeDerived::from_blocksize(8);
	inverse_mdct_naive(&cbd, &mut arr_1);
	let mismatches = fuzzy_compare_array(
		&arr_1, &IMDCT_OUTPUT_TEST_ARR_1,
		0.00005, true);
	// No mismatches are tolerated at all.
	let mismatches_limit = 0;
	if mismatches > mismatches_limit {
		// Fixed typo in the panic message ("Numer" -> "Number").
		panic!("Number of mismatches {} was larger than limit of {}",
			mismatches, mismatches_limit);
	}
}
830
#[cfg(test)]
#[test]
fn test_imdct() {
	use imdct_test::*;
	// Run the optimized IMDCT over the shared test input and
	// fuzzy-compare against the known-good output.
	let mut arr_1 = imdct_prepare(&IMDCT_INPUT_TEST_ARR_1);
	let blocksize = 8;
	let cbd = CachedBlocksizeDerived::from_blocksize(blocksize);
	inverse_mdct(&cbd, &mut arr_1, blocksize);
	let mismatches = fuzzy_compare_array(
		&arr_1, &IMDCT_OUTPUT_TEST_ARR_1,
		0.00005, true);
	// No mismatches are tolerated at all.
	let mismatches_limit = 0;
	if mismatches > mismatches_limit {
		// Fixed typo in the panic message ("Numer" -> "Number").
		panic!("Number of mismatches {} was larger than limit of {}",
			mismatches, mismatches_limit);
	}
}
848