1 // Vorbis decoder written in Rust
2 //
3 // Copyright (c) 2016 est31 <MTest31@outlook.com>
4 // and contributors. All rights reserved.
5 // Licensed under MIT license, or Apache 2 license,
6 // at your option. Please see the LICENSE file
7 // attached to this source distribution for details.
8 
9 // This file is a very close translation of the
10 // implementation of the algorithm from stb_vorbis.
11 
12 use ::header_cached::CachedBlocksizeDerived;
13 
imdct_step3_iter0_loop(n :usize, e :&mut[f32], i_off :usize, k_off :isize, a :&[f32])14 fn imdct_step3_iter0_loop(n :usize, e :&mut[f32], i_off :usize, k_off :isize, a :&[f32]) {
15 	let mut a_offs = 0;
16 	let mut i_offs = i_off;
17 	let mut k_offs = i_off as isize + k_off;
18 
19 	macro_rules! ee0 {
20 		(-$x:expr) => {e[i_offs - ($x as usize)]};
21 		($x:expr) => {e[i_offs + ($x as usize)]}
22 	}
23 
24 	macro_rules! ee2 {
25 		(-$x:expr) => {e[(k_offs - $x) as usize]};
26 		($x:expr) => {e[(k_offs + $x) as usize]}
27 	}
28 
29 	macro_rules! aa {
30 		($x:expr) => {a[a_offs + ($x as usize)]}
31 	}
32 
33 	assert_eq!((n & 3), 0);
34 
35 	for _ in 0 .. n >> 2 {
36 		let mut k00_20 = ee0![ 0] - ee2![ 0];
37 		let mut k01_21 = ee0![-1] - ee2![-1];
38 		ee0![ 0] += ee2![ 0];
39 		ee0![-1] += ee2![-1];
40 		ee2![ 0] = k00_20 * aa![0] - k01_21 * aa![1];
41 		ee2![-1] = k01_21 * aa![0] + k00_20 * aa![1];
42 		a_offs += 8;
43 
44 		k00_20  = ee0![-2] - ee2![-2];
45 		k01_21  = ee0![-3] - ee2![-3];
46 		ee0![-2] += ee2![-2];
47 		ee0![-3] += ee2![-3];
48 		ee2![-2] = k00_20 * aa![0] - k01_21 * aa![1];
49 		ee2![-3] = k01_21 * aa![0] + k00_20 * aa![1];
50 		a_offs += 8;
51 
52 		k00_20  = ee0![-4] - ee2![-4];
53 		k01_21  = ee0![-5] - ee2![-5];
54 		ee0![-4] += ee2![-4];
55 		ee0![-5] += ee2![-5];
56 		ee2![-4] = k00_20 * aa![0] - k01_21 * aa![1];
57 		ee2![-5] = k01_21 * aa![0] + k00_20 * aa![1];
58 		a_offs += 8;
59 
60 		k00_20  = ee0![-6] - ee2![-6];
61 		k01_21  = ee0![-7] - ee2![-7];
62 		ee0![-6] += ee2![-6];
63 		ee0![-7] += ee2![-7];
64 		ee2![-6] = k00_20 * aa![0] - k01_21 * aa![1];
65 		ee2![-7] = k01_21 * aa![0] + k00_20 * aa![1];
66 
67 		a_offs += 8;
68 		i_offs -= 8;
69 		k_offs -= 8;
70 	}
71 }
72 
imdct_step3_inner_r_loop(lim :usize, e :&mut [f32], d0 :usize, k_off :isize, a :&[f32], k1 :usize)73 fn imdct_step3_inner_r_loop(lim :usize, e :&mut [f32],
74 		d0 :usize, k_off :isize, a :&[f32], k1 :usize) {
75 	let mut a_offs = 0;
76 	let mut d0_offs = d0;
77 	let mut k_offs = d0 as isize + k_off;
78 
79 	macro_rules! e0 {
80 		(-$x:expr) => {e[d0_offs - ($x as usize)]};
81 		($x:expr) => {e[d0_offs + ($x as usize)]}
82 	}
83 
84 	macro_rules! e2 {
85 		(-$x:expr) => {e[(k_offs - $x) as usize]};
86 		($x:expr) => {e[(k_offs + $x) as usize]}
87 	}
88 
89 	macro_rules! aa {
90 		($x:expr) => {a[a_offs + ($x as usize)]}
91 	}
92 
93 	for _ in 0 .. lim >> 2 {
94 		let mut k00_20 = e0![-0] - e2![-0];
95 		let mut k01_21 = e0![-1] - e2![-1];
96 		e0![-0] += e2![-0];
97 		e0![-1] += e2![-1];
98 		e2![-0] = (k00_20) * aa![0] - (k01_21) * aa![1];
99 		e2![-1] = (k01_21) * aa![0] + (k00_20) * aa![1];
100 
101 		a_offs += k1;
102 
103 		k00_20 = e0![-2] - e2![-2];
104 		k01_21 = e0![-3] - e2![-3];
105 		e0![-2] += e2![-2];
106 		e0![-3] += e2![-3];
107 		e2![-2] = (k00_20) * aa![0] - (k01_21) * aa![1];
108 		e2![-3] = (k01_21) * aa![0] + (k00_20) * aa![1];
109 
110 		a_offs += k1;
111 
112 		k00_20 = e0![-4] - e2![-4];
113 		k01_21 = e0![-5] - e2![-5];
114 		e0![-4] += e2![-4];
115 		e0![-5] += e2![-5];
116 		e2![-4] = (k00_20) * aa![0] - (k01_21) * aa![1];
117 		e2![-5] = (k01_21) * aa![0] + (k00_20) * aa![1];
118 
119 		a_offs += k1;
120 
121 		k00_20 = e0![-6] - e2![-6];
122 		k01_21 = e0![-7] - e2![-7];
123 		e0![-6] += e2![-6];
124 		e0![-7] += e2![-7];
125 		e2![-6] = (k00_20) * aa![0] - (k01_21) * aa![1];
126 		e2![-7] = (k01_21) * aa![0] + (k00_20) * aa![1];
127 
128 		d0_offs -= 8;
129 		k_offs -= 8;
130 
131 		a_offs += k1;
132 	}
133 }
134 
imdct_step3_inner_s_loop(n :usize, e :&mut [f32], i_off :usize, k_off :isize, a :&[f32], a_off :usize, k0 :usize)135 fn imdct_step3_inner_s_loop(n :usize, e :&mut [f32], i_off :usize, k_off :isize,
136 		a :&[f32], a_off :usize, k0 :usize) {
137 	let a0 = a[0];
138 	let a1 = a[0+1];
139 	let a2 = a[0+a_off];
140 	let a3 = a[0+a_off+1];
141 	let a4 = a[0+a_off*2+0];
142 	let a5 = a[0+a_off*2+1];
143 	let a6 = a[0+a_off*3+0];
144 	let a7 = a[0+a_off*3+1];
145 
146 	let mut i_offs = i_off;
147 	let mut k_offs = (i_off as isize + k_off) as usize;
148 
149 	macro_rules! ee0 {
150 		(-$x:expr) => {e[i_offs - ($x as usize)]};
151 		($x:expr) => {e[i_offs + ($x as usize)]}
152 	}
153 
154 	macro_rules! ee2 {
155 		(-$x:expr) => {e[k_offs - ($x as usize)]};
156 		($x:expr) => {e[k_offs + ($x as usize)]}
157 	}
158 
159 	let mut i = 0;
160 	loop {
161 		let mut k00 = ee0![ 0] - ee2![ 0];
162 		let mut k11 = ee0![-1] - ee2![-1];
163 		ee0![ 0] =  ee0![ 0] + ee2![ 0];
164 		ee0![-1] =  ee0![-1] + ee2![-1];
165 		ee2![ 0] = (k00) * a0 - (k11) * a1;
166 		ee2![-1] = (k11) * a0 + (k00) * a1;
167 
168 		k00      = ee0![-2] - ee2![-2];
169 		k11      = ee0![-3] - ee2![-3];
170 		ee0![-2] =  ee0![-2] + ee2![-2];
171 		ee0![-3] =  ee0![-3] + ee2![-3];
172 		ee2![-2] = (k00) * a2 - (k11) * a3;
173 		ee2![-3] = (k11) * a2 + (k00) * a3;
174 
175 		k00      = ee0![-4] - ee2![-4];
176 		k11      = ee0![-5] - ee2![-5];
177 		ee0![-4] =  ee0![-4] + ee2![-4];
178 		ee0![-5] =  ee0![-5] + ee2![-5];
179 		ee2![-4] = (k00) * a4 - (k11) * a5;
180 		ee2![-5] = (k11) * a4 + (k00) * a5;
181 
182 		k00      = ee0![-6] - ee2![-6];
183 		k11      = ee0![-7] - ee2![-7];
184 		ee0![-6] =  ee0![-6] + ee2![-6];
185 		ee0![-7] =  ee0![-7] + ee2![-7];
186 		ee2![-6] = (k00) * a6 - (k11) * a7;
187 		ee2![-7] = (k11) * a6 + (k00) * a7;
188 
189 		i += 1;
190 		// we have this check instead of a for loop
191 		// over an iterator because otherwise we
192 		// overflow.
193 		if i >= n {
194 			break;
195 		}
196 		i_offs -= k0;
197 		k_offs -= k0;
198 	}
199 }
200 
201 #[inline]
iter_54(zm7 :&mut [f32])202 fn iter_54(zm7 :&mut [f32]) {
203 	// difference from stb_vorbis implementation:
204 	// zm7 points to z minus 7
205 	// (Rust disallows negative indices)
206 
207 	let k00  = zm7[7] - zm7[3];
208 	let y0   = zm7[7] + zm7[3];
209 	let y2   = zm7[5] + zm7[1];
210 	let k22  = zm7[5] - zm7[1];
211 
212 	zm7[7] = y0 + y2;      // z0 + z4 + z2 + z6
213 	zm7[5] = y0 - y2;      // z0 + z4 - z2 - z6
214 
215 	// done with y0,y2
216 
217 	let k33  = zm7[4] - zm7[0];
218 
219 	zm7[3] = k00 + k33;    // z0 - z4 + z3 - z7
220 	zm7[1] = k00 - k33;    // z0 - z4 - z3 + z7
221 
222 	// done with k33
223 
224 	let k11  = zm7[6] - zm7[2];
225 	let y1   = zm7[6] + zm7[2];
226 	let y3   = zm7[4] + zm7[0];
227 
228 	zm7[6] = y1 + y3;      // z1 + z5 + z3 + z7
229 	zm7[4] = y1 - y3;      // z1 + z5 - z3 - z7
230 	zm7[2] = k11 - k22;    // z1 - z5 + z2 - z6
231 	zm7[0] = k11 + k22;    // z1 - z5 - z2 + z6
232 }
233 
imdct_step3_inner_s_loop_ld654(n :usize, e :&mut [f32], i_off :usize, a :&[f32], base_n :usize)234 fn imdct_step3_inner_s_loop_ld654(n :usize, e :&mut [f32], i_off :usize,
235 	a :&[f32], base_n :usize)
236 {
237 	let a_off = base_n >> 3;
238 	let a2 = a[a_off];
239 
240 	let mut z_offs = i_off;
241 
242 	let basep16 = i_off - 16 * (n - 1 as usize);
243 
244 	macro_rules! z {
245 		(-$x:expr) => {e[z_offs - ($x as usize)]}
246 	}
247 
248 	loop {
249 		let mut k00 = z![-0] - z![-8];
250 		let mut k11 = z![-1] - z![-9];
251 		z![-0] = z![-0] + z![-8];
252 		z![-1] = z![-1] + z![-9];
253 		z![-8] =  k00;
254 		z![-9] =  k11;
255 
256 		k00     = z![ -2] - z![-10];
257 		k11     = z![ -3] - z![-11];
258 		z![ -2] = z![ -2] + z![-10];
259 		z![ -3] = z![ -3] + z![-11];
260 		z![-10] = (k00+k11) * a2;
261 		z![-11] = (k11-k00) * a2;
262 
263 		k00     = z![-12] - z![ -4];  // reverse to avoid a unary negation
264 		k11     = z![ -5] - z![-13];
265 		z![ -4] = z![ -4] + z![-12];
266 		z![ -5] = z![ -5] + z![-13];
267 		z![-12] = k11;
268 		z![-13] = k00;
269 
270 		k00     = z![-14] - z![ -6];  // reverse to avoid a unary negation
271 		k11     = z![ -7] - z![-15];
272 		z![ -6] = z![ -6] + z![-14];
273 		z![ -7] = z![ -7] + z![-15];
274 		z![-14] = (k00+k11) * a2;
275 		z![-15] = (k00-k11) * a2;
276 
277 		iter_54(e.split_at_mut(z_offs - 7).1);
278 		iter_54(e.split_at_mut(z_offs - 7 - 8).1);
279 		// We need to compare with basep16 here
280 		// in order to prevent a possible overflow
281 		// in calculation of base, and in calculation
282 		// of z_offs.
283 		if z_offs <= basep16 {
284 			break;
285 		}
286 		z_offs -= 16;
287 	}
288 }
289 
290 #[allow(dead_code)]
inverse_mdct(cached_bd :&CachedBlocksizeDerived, buffer :&mut [f32], bs :u8)291 pub fn inverse_mdct(cached_bd :&CachedBlocksizeDerived, buffer :&mut [f32], bs :u8) {
292 	let n = buffer.len();
293 	// Pre-condition.
294 	assert_eq!(n, 1 << bs);
295 
296 	let n2 = n >> 1;
297 	let n4 = n >> 2;
298 	let n8 = n >> 3;
299 
300 	// TODO later on we might want to do Vec::with_capacity here,
301 	// and use buf2.push everywhere...
302 	let mut buf2 :Vec<f32> = vec![0.0; n2];
303 
304 	let ctf = &cached_bd.twiddle_factors;
305 	let a :&[f32] = &ctf.a;
306 	let b :&[f32] = &ctf.b;
307 	let c :&[f32] = &ctf.c;
308 
309 	macro_rules! break_if_sub_overflows {
310 		($i:ident, $x:expr) => {
311 			$i = match $i.checked_sub($x) {
312 				Some(v) => v,
313 				None => break,
314 			};
315 		}
316 	}
317 
318 	// IMDCT algorithm from "The use of multirate filter banks for coding of high quality digital audio"
319 	// See notes about bugs in that paper in less-optimal implementation 'inverse_mdct_old' in stb_vorbis original.
320 
321 	// kernel from paper
322 
323 
324 	// merged:
325 	//   copy and reflect spectral data
326 	//   step 0
327 
328 	// note that it turns out that the items added together during
329 	// this step are, in fact, being added to themselves (as reflected
330 	// by step 0). inexplicable inefficiency! this became obvious
331 	// once I combined the passes.
332 
333 	// so there's a missing 'times 2' here (for adding X to itself).
334 	// this propogates through linearly to the end, where the numbers
335 	// are 1/2 too small, and need to be compensated for.
336 
337 	{
338 		let mut a_offs = 0;
339 		let mut d_offs = n2 - 2;
340 		let mut e_offs = 0;
341 		let e_stop = n2;
342 
343 		macro_rules! d {
344 			($x:expr) => {buf2[d_offs + ($x as usize)]}
345 		}
346 		macro_rules! aa {
347 			($x:expr) => {a[a_offs + ($x as usize)]}
348 		}
349 		macro_rules! e {
350 			($x:expr) => {buffer[e_offs + ($x as usize)]}
351 		}
352 
353 		// TODO replace the while with a for once step_by on iterators
354 		// is stabilized
355 		while e_offs != e_stop {
356 			d![1] = e![0] * aa![0] - e![2]*aa![1];
357 			d![0] = e![0] * aa![1] + e![2]*aa![0];
358 			d_offs -= 2;
359 			a_offs += 2;
360 			e_offs += 4;
361 		}
362 
363 		e_offs = n2 - 3;
364 		loop {
365 			d![1] = -e![2] * aa![0] - -e![0]*aa![1];
366 			d![0] = -e![2] * aa![1] + -e![0]*aa![0];
367 			break_if_sub_overflows!(d_offs, 2);
368 			a_offs += 2;
369 			e_offs -= 4;
370 		}
371 	}
372 
373 
374 	{
375 		// now we use symbolic names for these, so that we can
376 		// possibly swap their meaning as we change which operations
377 		// are in place
378 
379 		let u = &mut *buffer;
380 		let v = &mut *buf2;
381 
382 		// step 2    (paper output is w, now u)
383 		// this could be in place, but the data ends up in the wrong
384 		// place... _somebody_'s got to swap it, so this is nominated
385 		{
386 			let mut a_offs = n2 - 8;
387 			let mut d0_offs = n4;
388 			let mut d1_offs = 0;
389 			let mut e0_offs = n4;
390 			let mut e1_offs = 0;
391 
392 			macro_rules! aa {
393 				($x:expr) => {a[a_offs + ($x as usize)]}
394 			}
395 			macro_rules! d0 {
396 				($x:expr) => {u[d0_offs + ($x as usize)]}
397 			}
398 			macro_rules! d1 {
399 				($x:expr) => {u[d1_offs + ($x as usize)]}
400 			}
401 			macro_rules! e0 {
402 				($x:expr) => {v[e0_offs + ($x as usize)]}
403 			}
404 			macro_rules! e1 {
405 				($x:expr) => {v[e1_offs + ($x as usize)]}
406 			}
407 
408 			loop {
409 				let mut v41_21 = e0![1] - e1![1];
410 				let mut v40_20 = e0![0] - e1![0];
411 				d0![1]  = e0![1] + e1![1];
412 				d0![0]  = e0![0] + e1![0];
413 				d1![1]  = v41_21*aa![4] - v40_20*aa![5];
414 				d1![0]  = v40_20*aa![4] + v41_21*aa![5];
415 
416 				v41_21 = e0![3] - e1![3];
417 				v40_20 = e0![2] - e1![2];
418 				d0![3]  = e0![3] + e1![3];
419 				d0![2]  = e0![2] + e1![2];
420 				d1![3]  = v41_21*aa![0] - v40_20*aa![1];
421 				d1![2]  = v40_20*aa![0] + v41_21*aa![1];
422 
423 				break_if_sub_overflows!(a_offs, 8);
424 
425 				d0_offs += 4;
426 				d1_offs += 4;
427 				e0_offs += 4;
428 				e1_offs += 4;
429 			}
430 		}
431 
432 
433 		// step 3
434 
435 		let ld = bs as usize;
436 
437 		// optimized step 3:
438 
439 		// the original step3 loop can be nested r inside s or s inside r;
440 		// it's written originally as s inside r, but this is dumb when r
441 		// iterates many times, and s few. So I have two copies of it and
442 		// switch between them halfway.
443 
444 		// this is iteration 0 of step 3
445 		imdct_step3_iter0_loop(n >> 4, u, n2-1-n4*0, -(n as isize >> 3), a);
446 		imdct_step3_iter0_loop(n >> 4, u, n2-1-n4*1, -(n as isize >> 3), a);
447 
448 		// this is iteration 1 of step 3
449 		imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*0, -(n as isize >> 4), a, 16);
450 		imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*1, -(n as isize >> 4), a, 16);
451 		imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*2, -(n as isize >> 4), a, 16);
452 		imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*3, -(n as isize >> 4), a, 16);
453 
454 		for l in 2 .. (ld - 3) >> 1 {
455 			let k0 = n >> (l + 2);
456 			let k0_2 = k0 as isize >> 1;
457 			let lim = 1 << (l+1);
458 			for i in 0 .. lim {
459 				imdct_step3_inner_r_loop(n >> (l + 4),
460 					u, n2-1 - k0*i, -k0_2, a, 1 << (l+3));
461 			}
462 		}
463 		for l in (ld - 3) >> 1 .. ld - 6 {
464 			let k0 = n >> (l + 2);
465 			let k1 = 1 << (l + 3);
466 			let k0_2 = k0 as isize >> 1;
467 			let rlim = n >> (l + 6);
468 			let lim = 1 << (l + 1);
469 			let mut i_off = n2 - 1;
470 			let mut a_off = 0;
471 			for _ in 0 .. rlim {
472 				let a0 = a.split_at(a_off).1;
473 				imdct_step3_inner_s_loop(lim, u, i_off, -k0_2, a0, k1, k0);
474 				a_off += k1 * 4;
475 				i_off -= 8;
476 			}
477 		}
478 
479 		// iterations with count:
480 		//   ld-6,-5,-4 all interleaved together
481 		//       the big win comes from getting rid of needless flops
482 		//         due to the constants on pass 5 & 4 being all 1 and 0;
483 		//       combining them to be simultaneous to improve cache made little difference
484 		imdct_step3_inner_s_loop_ld654(n >> 5, u, n2 - 1, a, n);
485 
486 		// output is u
487 
488 		// step 4, 5, and 6
489 		// cannot be in-place because of step 5
490 		{
491 			let bitrev_vec = &cached_bd.bitrev;
492 			// weirdly, I'd have thought reading sequentially and writing
493 			// erratically would have been better than vice-versa, but in
494 			// fact that's not what my testing showed. (That is, with
495 			// j = bitreverse(i), do you read i and write j, or read j and write i.)
496 
497 			let mut d0_offs = n4 - 4;
498 			let mut d1_offs = n2 - 4;
499 			let mut bitrev_offs = 0;
500 
501 			macro_rules! d0 {
502 				($x:expr) => {v[d0_offs + ($x as usize)]}
503 			}
504 			macro_rules! d1 {
505 				($x:expr) => {v[d1_offs + ($x as usize)]}
506 			}
507 			macro_rules! bitrev {
508 				($x:expr) => {bitrev_vec[bitrev_offs + ($x as usize)]}
509 			}
510 
511 			loop {
512 				let mut k4 = bitrev![0] as usize;
513 				d1![3] = u[k4 + 0];
514 				d1![2] = u[k4 + 1];
515 				d0![3] = u[k4 + 2];
516 				d0![2] = u[k4 + 3];
517 
518 				k4 = bitrev![1] as usize;
519 				d1![1] = u[k4 + 0];
520 				d1![0] = u[k4 + 1];
521 				d0![1] = u[k4 + 2];
522 				d0![0] = u[k4 + 3];
523 
524 				break_if_sub_overflows!(d0_offs, 4);
525 				d1_offs -= 4;
526 				bitrev_offs += 2;
527 			}
528 		}
529 		// (paper output is u, now v)
530 
531 		// step 7   (paper output is v, now v)
532 		// this is now in place
533 		{
534 			let mut c_offs = 0;
535 			let mut d_offs = 0;
536 			let mut e_offs = n2 - 4;
537 
538 			macro_rules! cc {
539 				($x:expr) => {c[c_offs + ($x as usize)]}
540 			}
541 			macro_rules! d {
542 				($x:expr) => {v[d_offs + ($x as usize)]}
543 			}
544 			macro_rules! e {
545 				($x:expr) => {v[e_offs + ($x as usize)]}
546 			}
547 			while d_offs < e_offs {
548 				let mut a02 = d![0] - e![2];
549 				let mut a11 = d![1] + e![3];
550 
551 				let mut b0 = cc![1]*a02 + cc![0]*a11;
552 				let mut b1 = cc![1]*a11 - cc![0]*a02;
553 
554 				let mut b2 = d![0] + e![ 2];
555 				let mut b3 = d![1] - e![ 3];
556 
557 				d![0] = b2 + b0;
558 				d![1] = b3 + b1;
559 				e![2] = b2 - b0;
560 				e![3] = b1 - b3;
561 
562 				a02 = d![2] - e![0];
563 				a11 = d![3] + e![1];
564 
565 				b0 = cc![3]*a02 + cc![2]*a11;
566 				b1 = cc![3]*a11 - cc![2]*a02;
567 
568 				b2 = d![2] + e![ 0];
569 				b3 = d![3] - e![ 1];
570 
571 				d![2] = b2 + b0;
572 				d![3] = b3 + b1;
573 				e![0] = b2 - b0;
574 				e![1] = b1 - b3;
575 
576 				c_offs += 4;
577 				d_offs += 4;
578 				e_offs -= 4;
579 			}
580 		}
581 	}
582 
583 	// step 8+decode   (paper output is X, now buffer)
584 	// this generates pairs of data a la 8 and pushes them directly through
585 	// the decode kernel (pushing rather than pulling) to avoid having
586 	// to make another pass later
587 
588 	// this cannot POSSIBLY be in place, so we refer to the buffers directly
589 	{
590 		let mut d0_offs = 0;
591 		let mut d1_offs = n2 - 4;
592 		let mut d2_offs = n2;
593 		let mut d3_offs = n - 4;
594 
595 		let mut b_offs = n2 - 8;
596 		let mut e_offs = n2 - 8;
597 
598 		macro_rules! d0 {
599 			($x:expr) => {buffer[d0_offs + ($x as usize)]}
600 		}
601 		macro_rules! d1 {
602 			($x:expr) => {buffer[d1_offs + ($x as usize)]}
603 		}
604 		macro_rules! d2 {
605 			($x:expr) => {buffer[d2_offs + ($x as usize)]}
606 		}
607 		macro_rules! d3 {
608 			($x:expr) => {buffer[d3_offs + ($x as usize)]}
609 		}
610 
611 		macro_rules! b {
612 			($x:expr) => {b[b_offs + ($x as usize)]}
613 		}
614 		macro_rules! e {
615 			($x:expr) => {buf2[e_offs + ($x as usize)]}
616 		}
617 
618 		loop {
619 			let mut p3 =  e![6]*b![7] - e![7]*b![6];
620 			let mut p2 = -e![6]*b![6] - e![7]*b![7];
621 
622 			d0![0] =   p3;
623 			d1![3] = - p3;
624 			d2![0] =   p2;
625 			d3![3] =   p2;
626 
627 			let mut p1 =  e![4]*b![5] - e![5]*b![4];
628 			let mut p0 = -e![4]*b![4] - e![5]*b![5];
629 
630 			d0![1] =   p1;
631 			d1![2] = - p1;
632 			d2![1] =   p0;
633 			d3![2] =   p0;
634 
635 			p3 =  e![2]*b![3] - e![3]*b![2];
636 			p2 = -e![2]*b![2] - e![3]*b![3];
637 
638 			d0![2] =   p3;
639 			d1![1] = - p3;
640 			d2![2] =   p2;
641 			d3![1] =   p2;
642 
643 			p1 =  e![0]*b![1] - e![1]*b![0];
644 			p0 = -e![0]*b![0] - e![1]*b![1];
645 
646 			d0![3] =   p1;
647 			d1![0] = - p1;
648 			d2![3] =   p0;
649 			d3![0] =   p0;
650 
651 			break_if_sub_overflows!(e_offs, 8);
652 			b_offs -= 8;
653 			d0_offs += 4;
654 			d2_offs += 4;
655 			d1_offs -= 4;
656 			d3_offs -= 4;
657 		}
658 	}
659 }
660 
661 #[allow(dead_code)]
inverse_mdct_naive(cached_bd :&CachedBlocksizeDerived, buffer :&mut[f32])662 pub fn inverse_mdct_naive(cached_bd :&CachedBlocksizeDerived, buffer :&mut[f32]) {
663 	let n = buffer.len();
664 	let n2 = n >> 1;
665 	let n4 = n >> 2;
666 	let n8 = n >> 3;
667 	let n3_4 = n - n4;
668 
669 	let mut u = [0.0; 1 << 13];
670 	let mut xa = [0.0; 1 << 13];
671 	let mut v = [0.0; 1 << 13];
672 	let mut w = [0.0; 1 << 13];
673 
674 	// retrieve the cached twiddle factors
675 	let ctf = &cached_bd.twiddle_factors;
676 	let a :&[f32] = &ctf.a;
677 	let b :&[f32] = &ctf.b;
678 	let c :&[f32] = &ctf.c;
679 
680 	// IMDCT algorithm from "The use of multirate filter banks for coding of high quality digital audio"
681 	// Note there are bugs in that pseudocode, presumably due to them attempting
682 	// to rename the arrays nicely rather than representing the way their actual
683 	// implementation bounces buffers back and forth. As a result, even in the
684 	// "some formulars corrected" version, a direct implementation fails. These
685 	// are noted below as "paper bug".
686 
687 	// copy and reflect spectral data
688 	for k in 0 .. n2 {
689 		u[k] = buffer[k];
690 	}
691 	for k in n2 .. n {
692 		u[k] = -buffer[n - k - 1];
693 	}
694 
695 	let mut k2 = 0;
696 	let mut k4 = 0;
697 
698 	// kernel from paper
699 	// step 1
700 	while k2 < n2 { // n4 iterations
701 		v[n-k4-1] = (u[k4] - u[n-k4-1]) * a[k2]   - (u[k4+2] - u[n-k4-3])*a[k2+1];
702 		v[n-k4-3] = (u[k4] - u[n-k4-1]) * a[k2+1] + (u[k4+2] - u[n-k4-3])*a[k2];
703 		k2 += 2;
704 		k4 += 4;
705 	}
706 	// step 2
707 	k4 = 0;
708 	while k4 < n2 { // n8 iterations
709 		w[n2+3+k4] = v[n2+3+k4] + v[k4+3];
710 		w[n2+1+k4] = v[n2+1+k4] + v[k4+1];
711 		w[k4+3]    = (v[n2+3+k4] - v[k4+3])*a[n2-4-k4] - (v[n2+1+k4]-v[k4+1])*a[n2-3-k4];
712 		w[k4+1]    = (v[n2+1+k4] - v[k4+1])*a[n2-4-k4] + (v[n2+3+k4]-v[k4+3])*a[n2-3-k4];
713 		k4 += 4;
714 	}
715 
716 	// step 3
717 	let ld :usize = (::ilog(n as u64) - 1) as usize;
718 	for l in 0 .. ld - 3 {
719 		let k0 = n >> (l+2);
720 		let k1 = 1 << (l+3);
721 		let rlim = n >> (l+4);
722 		let slim = 1 << (l+1);
723 		let mut r4 = 0;
724 		for r in 0 .. rlim {
725 			let mut s2 = 0;
726 			for _ in 0 .. slim {
727 				u[n-1-k0*s2-r4] = w[n-1-k0*s2-r4] + w[n-1-k0*(s2+1)-r4];
728 				u[n-3-k0*s2-r4] = w[n-3-k0*s2-r4] + w[n-3-k0*(s2+1)-r4];
729 				u[n-1-k0*(s2+1)-r4] = (w[n-1-k0*s2-r4] - w[n-1-k0*(s2+1)-r4]) * a[r*k1]
730 					- (w[n-3-k0*s2-r4] - w[n-3-k0*(s2+1)-r4]) * a[r*k1+1];
731 				u[n-3-k0*(s2+1)-r4] = (w[n-3-k0*s2-r4] - w[n-3-k0*(s2+1)-r4]) * a[r*k1]
732 					+ (w[n-1-k0*s2-r4] - w[n-1-k0*(s2+1)-r4]) * a[r*k1+1];
733 				s2 += 2;
734 			}
735 			r4 += 4;
736 		}
737 		if l+1 < ld-3 {
738 			// paper bug: ping-ponging of u&w here is omitted
739 			w.copy_from_slice(&u);
740 		}
741 	}
742 
743 	// step 4
744 	for i in 0 .. n8 {
745 		let j = (::bit_reverse(i as u32) >> (32-ld+3)) as usize;
746 		assert!(j < n8);
747 		if i == j {
748 			// paper bug: original code probably swapped in place; if copying,
749 			//            need to directly copy in this case
750 			let ii = i << 3;
751 			v[ii+1] = u[ii+1];
752 			v[ii+3] = u[ii+3];
753 			v[ii+5] = u[ii+5];
754 			v[ii+7] = u[ii+7];
755 		} else if i < j {
756 			let ii = i << 3;
757 			let j8 = j << 3;
758 			v[j8+1] = u[ii+1];
759 			v[ii+1] = u[j8 + 1];
760 			v[j8+3] = u[ii+3];
761 			v[ii+3] = u[j8 + 3];
762 			v[j8+5] = u[ii+5];
763 			v[ii+5] = u[j8 + 5];
764 			v[j8+7] = u[ii+7];
765 			v[ii+7] = u[j8 + 7];
766 		}
767 	}
768 
769 	// step 5
770 	for k in 0 .. n2 {
771 		w[k] = v[k*2+1];
772 	}
773 	// step 6
774 	let mut k2 = 0;
775 	let mut k4 = 0;
776 	while k2 < n4 { // n8 iterations
777 		u[n-1-k2] = w[k4];
778 		u[n-2-k2] = w[k4+1];
779 		u[n3_4 - 1 - k2] = w[k4+2];
780 		u[n3_4 - 2 - k2] = w[k4+3];
781 		k2 += 2;
782 		k4 += 4;
783 	}
784 	// step 7
785 	k2 = 0;
786 	while k2 < n4 { // n8 iterations
787 		v[n2 + k2 ] = ( u[n2 + k2] + u[n-2-k2] + c[k2+1]*(u[n2+k2]-u[n-2-k2]) + c[k2]*(u[n2+k2+1]+u[n-2-k2+1]))/2.0;
788 		v[n-2 - k2] = ( u[n2 + k2] + u[n-2-k2] - c[k2+1]*(u[n2+k2]-u[n-2-k2]) - c[k2]*(u[n2+k2+1]+u[n-2-k2+1]))/2.0;
789 		v[n2+1+ k2] = ( u[n2+1+k2] - u[n-1-k2] + c[k2+1]*(u[n2+1+k2]+u[n-1-k2]) - c[k2]*(u[n2+k2]-u[n-2-k2]))/2.0;
790 		v[n-1 - k2] = (-u[n2+1+k2] + u[n-1-k2] + c[k2+1]*(u[n2+1+k2]+u[n-1-k2]) - c[k2]*(u[n2+k2]-u[n-2-k2]))/2.0;
791 		k2 += 2;
792 	}
793 	// step 8
794 	k2 = 0;
795 	for k in 0 .. n4 {
796 		xa[k]      = v[k2+n2]*b[k2  ] + v[k2+1+n2]*b[k2+1];
797 		xa[n2-1-k] = v[k2+n2]*b[k2+1] - v[k2+1+n2]*b[k2  ];
798 		k2 += 2;
799 	}
800 
801 	// decode kernel to output
802 
803 	for i in 0 .. n4 {
804 		buffer[i] = xa[i + n4];
805 	}
806 	for i in n4 .. n3_4 {
807 		buffer[i] = -xa[n3_4 - i - 1];
808 	}
809 	for i in n3_4 .. n {
810 		buffer[i] = -xa[i - n3_4];
811 	}
812 }
813 
814 #[cfg(test)]
815 #[test]
test_imdct_naive()816 fn test_imdct_naive() {
817 	use imdct_test::*;
818 	let mut arr_1 = imdct_prepare(&IMDCT_INPUT_TEST_ARR_1);
819 	let cbd = CachedBlocksizeDerived::from_blocksize(8);
820 	inverse_mdct_naive(&cbd, &mut arr_1);
821 	let mismatches = fuzzy_compare_array(
822 		&arr_1, &IMDCT_OUTPUT_TEST_ARR_1,
823 		0.00005, true);
824 	let mismatches_limit = 0;
825 	if mismatches > mismatches_limit {
826 		panic!("Numer of mismatches {} was larger than limit of {}",
827 			mismatches, mismatches_limit);
828 	}
829 }
830 
831 #[cfg(test)]
832 #[test]
test_imdct()833 fn test_imdct() {
834 	use imdct_test::*;
835 	let mut arr_1 = imdct_prepare(&IMDCT_INPUT_TEST_ARR_1);
836 	let blocksize = 8;
837 	let cbd = CachedBlocksizeDerived::from_blocksize(blocksize);
838 	inverse_mdct(&cbd, &mut arr_1, blocksize);
839 	let mismatches = fuzzy_compare_array(
840 		&arr_1, &IMDCT_OUTPUT_TEST_ARR_1,
841 		0.00005, true);
842 	let mismatches_limit = 0;
843 	if mismatches > mismatches_limit {
844 		panic!("Numer of mismatches {} was larger than limit of {}",
845 			mismatches, mismatches_limit);
846 	}
847 }
848