1 /*
	resample: low-latency, usable, and quick resampler
3 
4 	copyright 2018-2020 by the mpg123 project
5 	licensed under the terms of the LGPL 2.1
6 	see COPYING and AUTHORS files in distribution or http://mpg123.org
7 
8 	initially written by Thomas Orgis
9 
10 	The resampler consists of these steps:
11 
12 	1. If output rate < 1/4 input rate:
13 	1.1 Repeat until output >= 1/4 input rate:
14 	1.1.1 Low pass with transition band from 1/4 to 1/2 input bandwidth.
15 	1.1.2 Drop samples to halve the input sample rate.
16 	1.2 Pre-emphasis to compensate for interpolator response.
17 
18 	2. If output rate > 1/2 input rate:
19 	2.1 Pre-emphasis to compensate for interpolator response.
20 	2.2 2x oversampling (zero-stuffing).
21 
	3. Low pass filter with >84% bandwidth relative to the cutoff, which is
	   set by the smaller of target rate and input rate and lies between 1/4
	   and 1/2 of the intermediate sample rate.
25 
26 	4. Interpolate to target rate.
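
	A worked example (illustrative numbers, not taken from the code):
	resampling 48000 Hz input to 11025 Hz output. Since 11025 < 48000/4,
	step 1 runs one decimation stage (low pass, drop every second sample),
	leaving an intermediate rate of 24000 Hz; 11025 >= 24000/4, so no
	further stage. 11025 is not above 24000/2, so step 2 (oversampling) is
	skipped. Pre-emphasis and the final low pass then run at 24000 Hz with
	the cutoff around 11025/24000 (about 0.46) of the intermediate Nyquist,
	and the interpolator maps 24000 Hz to 11025 Hz.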
27 
28 	The filters are IIR-type. The interpolators are based on Olli Niemitalo's
29 	2001 paper titled 'Polynomial Interpolators for High-Quality Resampling of
30 	Oversampled Audio', available as download at
31 	http://yehar.com/blog/wp-content/uploads/2009/08/deip.pdf . It is these
32 	interpolators that drastically improve the quality compared to cubic spline
33 	interpolation.
34 
35 	There are different pre-emphasis filters for the case with and without
36 	oversampling. At all times, the signal band before interpolation only
37 	extends up to 1/2 of Nyquist, enabling the use of interpolators designed
38 	for oversampled input. Without this spectral headroom, things would look
39 	much more distorted.
40 
	The higher quality mode achieves distortion below 108 dB with a bandwidth
	above 84%, while the lower quality mode weakens the final low pass to
	72 dB, with distortion around that level and slightly better bandwidth.
44 
	The downside of all this is serious phase distortion, but that does
46 	not matter much to the human ear for resampling before playback of the
47 	stream, as opposed to mixing resampled streams with some phase relations
48 	in mind. If you want to resample your instrument samples before combining
49 	them into music, you have the full signal at hand and can just use the
50 	best method regardless of latency. This here is not your resampler, then.
51 
52 	The upside is that there is virtually no latency compared to FIR-based
53 	resamplers and it should be rather easy to use. You have a direct
54 	relation between input buffer and output buffer and do not have to
55 	think about draining internal windowing buffers. Data with one sample
56 	rate in, data with another sample rate out. Simple as that.
57 
58 	The performance is very good compared to windowed-sinc interpolators,
	but is behind the FFT-based approach of libsoxr, where you trade massive
	latency for a performance gain. Vectorization of the code does not seem
	to bring much: a test with SSE intrinsics yielded about 10% speedup for
	single-channel operation. This is a basic problem of recursive
	computations. There is not that much dumb parallelism --- and the SIMD
	parallelism offered by current CPUs is rather dumb and useless without
	the right data flow to feed it efficiently. Or maybe I am just doing it
	wrong. Perhaps many channels would benefit from SIMD.
67 
68 	If you want to optimize: Most time is spent in the low pass filtering. The
69 	interpolation itself is rather cheap. The decimation filter is also worth
70 	considering. Currently, it is not even optimized for efficient coefficient
71 	access (that's a TODO), let alone using the more efficient Direct Form II,
72 	or anything more fancy.
73 
74 TODO: efficient ringbuffer access for the decimation filter
75 TODO: initialize with first sample or zero? is there an actual benefit? impulse
76       response?
77 */
78 
79 #define NO_GROW_BUF
80 #define NO_SMAX
81 #define NO_SMIN
82 // SSIZE_MAX is not standard C:-/
83 #define _POSIX_C_SOURCE 200809L
84 #include "syn123_int.h"
85 #include "debug.h"
86 
87 // coefficient tables generated from coeff-ellip-6-0.01-36-16.txd
88 // (coeff-{filter_type}-{order}-{passband_ripple}-{rejection_dB}-{points})
89 
90 // static const unsigned int lpf_db = 36;
91 
92 // Order of the lowpass filter.
93 // You'd wish to have that as a multiple of 4 to help vectorization, but
94 // even 12 is too small to have real benefit from SSE (let alone wider stuff),
// and 6 seems to be the limit for stability. The filter needs a certain minimal
// order to be steep enough, but becomes unstable at higher orders, especially
// with interpolated coefficients. This is a value I settled on after a long
// and painful process. It works between 1/4 and 1/2 of Nyquist. I used to
99 // believe that this can be extended down to 1/16 or even less, but this is
100 // noisy at best.
101 #define LPF_ORDER 6
102 
103 // points in filter coefficient interpolation tables
104 #define LPF_POINTS 16
105 
106 // filter critical frequency in fractions of Nyquist
107 static const float lpf_w_c[LPF_POINTS] =
108 {
109 	+1.9500000000e-01
110 ,	+1.9867850841e-01
111 ,	+2.0682630822e-01
112 ,	+2.1914822332e-01
113 ,	+2.3516664873e-01
114 ,	+2.5424242424e-01
115 ,	+2.7560276877e-01
116 ,	+2.9837505460e-01
117 ,	+3.2162494540e-01
118 ,	+3.4439723123e-01
119 ,	+3.6575757576e-01
120 ,	+3.8483335127e-01
121 ,	+4.0085177668e-01
122 ,	+4.1317369178e-01
123 ,	+4.2132149159e-01
124 ,	+4.2500000000e-01
125 };
126 
127 // actual cutoff frequency in fractions of Nyquist
128 static const float lpf_cutoff[LPF_POINTS] =
129 {
130 	+2.4618202393e-01
131 ,	+2.5064642208e-01
132 ,	+2.6050157602e-01
133 ,	+2.7531601376e-01
134 ,	+2.9440817636e-01
135 ,	+3.1688954073e-01
136 ,	+3.4172256063e-01
137 ,	+3.6778811727e-01
138 ,	+3.9395465344e-01
139 ,	+4.1914062138e-01
140 ,	+4.4236385073e-01
141 ,	+4.6277512916e-01
142 ,	+4.7967708029e-01
143 ,	+4.9253194430e-01
144 ,	+5.0096263453e-01
145 ,	+5.0475083567e-01
146 };
147 
148 // filter bandwidth (passband fraction in percent)
149 static const unsigned char lpf_bw[LPF_POINTS] =
150 {
151 	85
152 ,	85
153 ,	85
154 ,	85
155 ,	85
156 ,	86
157 ,	86
158 ,	86
159 ,	87
160 ,	87
161 ,	88
162 ,	88
163 ,	88
164 ,	89
165 ,	89
166 ,	89
167 };
168 
169 // coefficient b_0
170 static const float lpf_b0[LPF_POINTS] =
171 {
172 	+2.5060910358e-02
173 ,	+2.5545546150e-02
174 ,	+2.6658960397e-02
175 ,	+2.8449469363e-02
176 ,	+3.0975593417e-02
177 ,	+3.4290689410e-02
178 ,	+3.8423542124e-02
179 ,	+4.3355834055e-02
180 ,	+4.8998511458e-02
181 ,	+5.5171358029e-02
182 ,	+6.1592382077e-02
183 ,	+6.7884279531e-02
184 ,	+7.3603031799e-02
185 ,	+7.8288471817e-02
186 ,	+8.1529628810e-02
187 ,	+8.3031304399e-02
188 };
189 
190 // derivative d b_0/d w_c
191 static const float lpf_mb0[LPF_POINTS] =
192 {
193 	+1.3023622859e-01
194 ,	+1.3326373417e-01
195 ,	+1.4006275119e-01
196 ,	+1.5061692736e-01
197 ,	+1.6489815146e-01
198 ,	+1.8285716898e-01
199 ,	+2.0437030679e-01
200 ,	+2.2914833420e-01
201 ,	+2.5663256095e-01
202 ,	+2.8591120057e-01
203 ,	+3.1568709259e-01
204 ,	+3.4431969261e-01
205 ,	+3.6995119359e-01
206 ,	+3.9070858956e-01
207 ,	+4.0495210591e-01
208 ,	+4.1152140765e-01
209 };
210 
211 // coefficients b_1 to b_n
212 static const float lpf_b[LPF_POINTS][LPF_ORDER] =
213 {
214 	{ -5.1762847048e-02, +8.4153207389e-02, -8.3511150073e-02
215 	, +8.4153207389e-02, -5.1762847048e-02, +2.5060910358e-02 }
216 ,	{ -5.0602124846e-02, +8.2431045673e-02, -8.0086433599e-02
217 	, +8.2431045673e-02, -5.0602124846e-02, +2.5545546150e-02 }
218 ,	{ -4.7865340043e-02, +7.8920478243e-02, -7.2561589822e-02
219 	, +7.8920478243e-02, -4.7865340043e-02, +2.6658960397e-02 }
220 ,	{ -4.3255354022e-02, +7.4480386867e-02, -6.1251009130e-02
221 	, +7.4480386867e-02, -4.3255354022e-02, +2.8449469363e-02 }
222 ,	{ -3.6312512281e-02, +7.0487561023e-02, -4.6392636636e-02
223 	, +7.0487561023e-02, -3.6312512281e-02, +3.0975593417e-02 }
224 ,	{ -2.6445440921e-02, +6.8811704409e-02, -2.7845395780e-02
225 	, +6.8811704409e-02, -2.6445440921e-02, +3.4290689410e-02 }
226 ,	{ -1.3021846935e-02, +7.1687243737e-02, -4.8530219060e-03
227 	, +7.1687243737e-02, -1.3021846935e-02, +3.8423542124e-02 }
228 ,	{ +4.4706427843e-03, +8.1414913409e-02, +2.3941615853e-02
229 	, +8.1414913409e-02, +4.4706427843e-03, +4.3355834055e-02 }
230 ,	{ +2.6209675772e-02, +9.9864368285e-02, +6.0088123327e-02
231 	, +9.9864368285e-02, +2.6209675772e-02, +4.8998511458e-02 }
232 ,	{ +5.1814692315e-02, +1.2783927823e-01, +1.0454069691e-01
233 	, +1.2783927823e-01, +5.1814692315e-02, +5.5171358029e-02 }
234 ,	{ +8.0182308668e-02, +1.6447692384e-01, +1.5672311528e-01
235 	, +1.6447692384e-01, +8.0182308668e-02, +6.1592382077e-02 }
236 ,	{ +1.0945240695e-01, +2.0692267707e-01, +2.1383540251e-01
237 	, +2.0692267707e-01, +1.0945240695e-01, +6.7884279531e-02 }
238 ,	{ +1.3715527898e-01, +2.5049018459e-01, +2.7077975696e-01
239 	, +2.5049018459e-01, +1.3715527898e-01, +7.3603031799e-02 }
240 ,	{ +1.6054360691e-01, +2.8938069305e-01, +3.2088440740e-01
241 	, +2.8938069305e-01, +1.6054360691e-01, +7.8288471817e-02 }
242 ,	{ +1.7705368035e-01, +3.1783679836e-01, +3.5729711842e-01
243 	, +3.1783679836e-01, +1.7705368035e-01, +8.1529628810e-02 }
244 ,	{ +1.8478918080e-01, +3.3142891364e-01, +3.7464012836e-01
245 	, +3.3142891364e-01, +1.8478918080e-01, +8.3031304399e-02 }
246 };
247 
248 // derivatives (d b_1/d w_c) to (d b_n/d w_c)
249 static const float lpf_mb[LPF_POINTS][LPF_ORDER] =
250 {
251 	{ +3.0947524511e-01, -4.7927490698e-01, +9.3391532013e-01
252 	, -4.7927490702e-01, +3.0947524507e-01, +1.3023622860e-01 }
253 ,	{ +3.2168974799e-01, -4.5690983225e-01, +9.2827504448e-01
254 	, -4.5690983214e-01, +3.2168974795e-01, +1.3326373418e-01 }
255 ,	{ +3.5049895985e-01, -4.0402713817e-01, +9.1976126033e-01
256 	, -4.0402713816e-01, +3.5049895984e-01, +1.4006275120e-01 }
257 ,	{ +3.9869096973e-01, -3.1471026348e-01, +9.1858562318e-01
258 	, -3.1471026347e-01, +3.9869096972e-01, +1.5061692736e-01 }
259 ,	{ +4.6976534041e-01, -1.8018044797e-01, +9.4154172198e-01
260 	, -1.8018044798e-01, +4.6976534042e-01, +1.6489815146e-01 }
261 ,	{ +5.6706828511e-01, +1.0247608319e-02, +1.0113425453e+00
262 	, +1.0247608337e-02, +5.6706828510e-01, +1.8285716898e-01 }
263 ,	{ +6.9280120305e-01, +2.6709404538e-01, +1.1535785914e+00
264 	, +2.6709404540e-01, +6.9280120305e-01, +2.0437030678e-01 }
265 ,	{ +8.4703144845e-01, +5.9748112137e-01, +1.3910603904e+00
266 	, +5.9748112138e-01, +8.4703144845e-01, +2.2914833420e-01 }
267 ,	{ +1.0268432839e+00, +1.0012919713e+00, +1.7366400230e+00
268 	, +1.0012919713e+00, +1.0268432839e+00, +2.5663256095e-01 }
269 ,	{ +1.2257925890e+00, +1.4678100718e+00, +2.1866645243e+00
270 	, +1.4678100718e+00, +1.2257925891e+00, +2.8591120056e-01 }
271 ,	{ +1.4338418956e+00, +1.9740692730e+00, +2.7173819083e+00
272 	, +1.9740692730e+00, +1.4338418956e+00, +3.1568709258e-01 }
273 ,	{ +1.6379144032e+00, +2.4857766605e+00, +3.2858109638e+00
274 	, +2.4857766605e+00, +1.6379144032e+00, +3.4431969261e-01 }
275 ,	{ +1.8231025322e+00, +2.9609624523e+00, +3.8352350558e+00
276 	, +2.9609624523e+00, +1.8231025322e+00, +3.6995119359e-01 }
277 ,	{ +1.9744172816e+00, +3.3557576586e+00, +4.3041807299e+00
278 	, +3.3557576587e+00, +1.9744172816e+00, +3.9070858955e-01 }
279 ,	{ +2.0788103417e+00, +3.6311428529e+00, +4.6368982737e+00
280 	, +3.6311428526e+00, +2.0788103416e+00, +4.0495210591e-01 }
281 ,	{ +2.1270907033e+00, +3.7592676993e+00, +4.7930965582e+00
282 	, +3.7592676992e+00, +2.1270907033e+00, +4.1152140769e-01 }
283 };
284 
285 // coefficients a_1 to a_n
286 static const float lpf_a[LPF_POINTS][LPF_ORDER] =
287 {
288 	{ -3.9678160947e+00, +7.1582705272e+00, -7.3103432700e+00
289 	, +4.4271625605e+00, -1.4968822486e+00, +2.2103607837e-01 }
290 ,	{ -3.9217586689e+00, +7.0245007818e+00, -7.1375017960e+00
291 	, +4.3093638600e+00, -1.4547982662e+00, +2.1489651918e-01 }
292 ,	{ -3.8190188341e+00, +6.7322820225e+00, -6.7643407898e+00
293 	, +4.0573843887e+00, -1.3653052102e+00, +2.0191441068e-01 }
294 ,	{ -3.6618359076e+00, +6.3015570638e+00, -6.2251488099e+00
295 	, +3.6990273202e+00, -1.2392343109e+00, +1.8379956602e-01 }
296 ,	{ -3.4544704643e+00, +5.7631160859e+00, -5.5687366968e+00
297 	, +3.2721548186e+00, -1.0907981304e+00, +1.6273969374e-01 }
298 ,	{ -3.2034885530e+00, +5.1559829152e+00, -4.8508981080e+00
299 	, +2.8177192974e+00, -9.3461709876e-01, +1.4091459119e-01 }
300 ,	{ -2.9178775764e+00, +4.5233648501e+00, -4.1259259822e+00
301 	, +2.3730384158e+00, -7.8317396639e-01, +1.2011720880e-01 }
302 ,	{ -2.6088711387e+00, +3.9075669790e+00, -3.4391290636e+00
303 	, +1.9669306242e+00, -6.4531233315e-01, +1.0156466880e-01 }
304 ,	{ -2.2894468741e+00, +3.3447965777e+00, -2.8221523134e+00
305 	, +1.6175400427e+00, -5.2592289748e-01, +8.5891269383e-02 }
306 ,	{ -1.9735577122e+00, +2.8610227169e+00, -2.2920156678e+00
307 	, +1.3327010249e+00, -4.2655597907e-01, +7.3258414200e-02 }
308 ,	{ -1.6752255568e+00, +2.4698591405e+00, -1.8535096177e+00
309 	, +1.1119902037e+00, -3.4650830575e-01, +6.3506595124e-02 }
310 ,	{ -1.4076456238e+00, +2.1728886190e+00, -1.5035402243e+00
311 	, +9.4944185607e-01, -2.8395564864e-01, +5.6296779542e-02 }
312 ,	{ -1.1824218415e+00, +1.9621904134e+00, -1.2356636112e+00
313 	, +8.3617447471e-01, -2.3684730680e-01, +5.1219220854e-02 }
314 ,	{ -1.0089985054e+00, +1.8243200388e+00, -1.0434432700e+00
315 	, +7.6259361700e-01, -2.0344554024e-01, +4.7870210629e-02 }
316 ,	{ -8.9429544087e-01, +1.7447560305e+00, -9.2208898364e-01
317 	, +7.2012730057e-01, -1.8252811740e-01, +4.5906155400e-02 }
318 ,	{ -8.4251214238e-01, +1.7118654093e+00, -8.6865163772e-01
319 	, +7.0252569491e-01, -1.7335864376e-01, +4.5082431755e-02 }
320 };
321 
322 // derivatives (d a_1/d w_c) to (d a_n/d w_c)
323 static const float lpf_ma[LPF_POINTS][LPF_ORDER] =
324 {
325 	{ +1.2492433161e+01, -3.6517018434e+01, +4.7357395713e+01
326 	, -3.2369784989e+01, +1.1586055408e+01, -1.6933077254e+00 }
327 ,	{ +1.2548734957e+01, -3.6212194682e+01, +4.6616582888e+01
328 	, -3.1678742548e+01, +1.1296037667e+01, -1.6449911143e+00 }
329 ,	{ +1.2669371015e+01, -3.5511519947e+01, +4.4983356850e+01
330 	, -3.0180967586e+01, +1.0676583147e+01, -1.5427341805e+00 }
331 ,	{ +1.2841272347e+01, -3.4388021134e+01, +4.2540425267e+01
332 	, -2.8002157377e+01, +9.7976496751e+00, -1.3997892255e+00 }
333 ,	{ +1.3046121049e+01, -3.2820302039e+01, +3.9430990541e+01
334 	, -2.5324586959e+01, +8.7534918962e+00, -1.2330749768e+00 }
335 ,	{ +1.3263302462e+01, -3.0810041034e+01, +3.5857000371e+01
336 	, -2.2361001245e+01, +7.6447176195e+00, -1.0593731098e+00 }
337 ,	{ +1.3473094693e+01, -2.8396265557e+01, +3.2063474406e+01
338 	, -1.9323493868e+01, +6.5616850615e+00, -8.9237379829e-01 }
339 ,	{ +1.3659446627e+01, -2.5661488138e+01, +2.8308712196e+01
340 	, -1.6394785270e+01, +5.5731925958e+00, -7.4123740640e-01 }
341 ,	{ +1.3811818531e+01, -2.2727774254e+01, +2.4827395931e+01
342 	, -1.3709404604e+01, +4.7220289931e+00, -6.1062450746e-01 }
343 ,	{ +1.3925825330e+01, -1.9744316114e+01, +2.1797535816e+01
344 	, -1.1348774538e+01, +4.0264909835e+00, -5.0168575497e-01 }
345 ,	{ +1.4002735155e+01, -1.6870564740e+01, +1.9320924822e+01
346 	, -9.3489487064e+00, +3.4855436350e+00, -4.1340259191e-01 }
347 ,	{ +1.4048120627e+01, -1.4259541011e+01, +1.7421558040e+01
348 	, -7.7156336592e+00, +3.0852238544e+00, -3.4381636627e-01 }
349 ,	{ +1.4070068384e+01, -1.2044779182e+01, +1.6060419989e+01
350 	, -6.4399486362e+00, +2.8047048140e+00, -2.9090554179e-01 }
351 ,	{ +1.4077328201e+01, -1.0332356944e+01, +1.5160803823e+01
352 	, -5.5100288590e+00, +2.6213968366e+00, -2.5305663714e-01 }
353 ,	{ +1.4077683041e+01, -9.1976058009e+00, +1.4636862941e+01
354 	, -4.9166414668e+00, +2.5150332199e+00, -2.2919336158e-01 }
355 ,	{ +1.4076709161e+01, -8.6849570278e+00, +1.4418874781e+01
356 	, -4.6538327513e+00, +2.4707840042e+00, -2.1869079956e-01 }
357 };
358 
359 // Low pass with 1/4 pass band, 1/4 transition band on top, for
360 // instant decimation by 2 and further processing to a rate below
361 // 1/4 of original sampling rate.
// This tuned instance needs only order 12 applied once and gives more
// attenuation than the regular resampling lowpass filter. Also, the
// pass band is maximally preserved, for no additional bandwidth loss.
// It is a Chebyshev type 2 filter with a clean pass band but of course
// wild phase shifts. Designed with octave's cheby2(12,112,0.5).
367 
368 #define LPF_4_ORDER 12
369 
370 static const float lpf_4_coeff[2][LPF_4_ORDER+1] =
371 {
372 	{ +4.942274456389e-04, +2.456588519653e-03, +7.333608109998e-03
373 	, +1.564190404912e-02, +2.595749312226e-02, +3.475411633752e-02
374 	, +3.823783863736e-02, +3.475411633752e-02, +2.595749312226e-02
375 	, +1.564190404912e-02, +7.333608109998e-03, +2.456588519652e-03
376 	, +4.942274456389e-04 }
377 ,	{ +1.000000000000e+00, -3.414853772012e+00, +6.812049477837e+00
378 	, -8.988639858221e+00, +8.737226725937e+00, -6.382745001626e+00
379 	, +3.590542256812e+00, -1.543464216385e+00, +5.036659057430e-01
380 	, -1.203614775500e-01, +2.006360736124e-02, -2.070940463771e-03
381 	, +1.010063725617e-04 }
382 };
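
// Layout note: lpf_4_coeff[0][] holds the numerator b_0..b_12 and
// lpf_4_coeff[1][] the denominator a_0..a_12 (with a_0 = 1), i.e. the [b,a]
// pair as produced by the octave cheby2() call quoted above.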
383 
// Pre-emphasis filters (order PREEMP_ORDER) for the differing interpolators
385 
386 #define PREEMP_ORDER 5
387 
388 static const float preemp_1xopt4p4o_coeff[2][6] =
389 {
390 	{ +1.4537908355e+00, -1.5878441906e-01, +7.6588945557e-01
391 	, -1.1187528142e-01, +4.9169512092e-02, -6.1692905372e-03 }
392 ,	{ +1.0000000000e+00, +1.9208553603e-01, +6.4223479412e-01
393 	, +8.4340248522e-02, +6.9072561856e-02, +4.2876716239e-03 }
394 };
395 static const float preemp_1xopt6p5o_coeff[2][6] =
396 {
397 	{ +1.7736168976e+00, -2.9739831101e-01, +8.3183026784e-01
398 	, -1.5434578823e-01, +3.0236127398e-02, -2.1054630699e-03 }
399 ,	{ +1.0000000000e+00, +2.9206686404e-01, +6.5605633523e-01
400 	, +1.4922672324e-01, +7.4792388477e-02, +9.6914195316e-03 }
401 };
402 static const float preemp_2xopt4p4o_coeff[2][6] =
403 {
404 	{ +1.1866573548e+00, +1.1973031478e+00, -9.4241923873e-02
405 	, -4.0863707262e-01, -8.0536389300e-02, -3.8911856607e-04 }
406 ,	{ +1.0000000000e+00, +1.2122990222e+00, +1.0444255630e-01
407 	, -3.7674911658e-01, -1.3106237366e-01, -8.7740900285e-03 }
408 };
409 static const float preemp_2xopt6p5o_coeff[2][6] =
410 {
411 	{ +1.2995411641e+00, +1.3148944902e+00, -4.3762040619e-02
412 	, -3.7595940266e-01, -6.2452466903e-02, +8.5987740998e-04 }
413 ,	{ +1.0000000000e+00, +1.3227482221e+00, +2.6592565819e-01
414 	, -3.1166281821e-01, -1.3266510256e-01, -1.1224337963e-02 }
415 };
416 
417 // TODO: switch the decimation filter to Direct Form II
418 struct lpf4_hist
419 {
420 	// Filter history (to be switched to Direct Form 2).
421 	float x[LPF_4_ORDER];
422 	float y[LPF_4_ORDER];
423 };
424 
425 #ifndef NO_DE_DENORM
426 // Battling denormals. They naturally occur in IIR filters and
427 // need to be avoided since they hurt performance on many CPUs.
428 // Usually, gcc -ffast-math would also avoid them, but we cannot
429 // count on that and debugging performance issues becomes confusing.
430 // The idea is to add an alternatingly positive/negative small number
431 // that is orders of magnitude above denormals to freshly computed
432 // samples.
// Downside: You never see a true zero if there is a true zero on input.
// The upside is that you'd convert to 24 or 32 bit integer anyway,
// where you'd get your zero back (1 in 32 bit is around 5e-10).
436 static const float denorm_base = 1e-15;
437 #define DE_DENORM(val, base) (val) += (base);
438 #define DE_DENORM_FLIP(base) (base) = -(base);
439 #define DE_DENORM_INIT(base, sign) (base) = (sign)*denorm_base;
440 #else
441 #define DE_DENORM(val, base)
442 #define DE_DENORM_FLIP(base)
443 #define DE_DENORM_INIT(base, sign)
444 #endif
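
// Sketch of the usage pattern found below (not an additional API, just how
// the three macros above are meant to interact):
//   DE_DENORM_INIT(rd->dede, +1)  // once, when a filter history is set up
//   DE_DENORM(ny, rd->dede)       // on each freshly computed filter value
//   DE_DENORM_FLIP(rd->dede)      // once per output frame
// The sign flip presumably keeps the tiny offset from accumulating as DC
// in the recursive filter history.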
445 
446 struct decimator_state
447 {
448 	unsigned int sflags; // also stores decimation position (0 or 1)
449 	unsigned int n1;
450 	struct lpf4_hist *ch; // one for each channel, pointer into common array
451 	float *out_hist; // channels*STAGE_HISTORY, pointer to common block
452 #ifndef NO_DE_DENORM
453 	float dede; // current de-denormalization offset
454 #endif
455 };
456 
457 #ifdef SYN123_HIGH_PREC
458 typedef double lpf_sum_type;
459 #else
460 typedef float lpf_sum_type;
461 #endif
462 
// The number of lowpass runs determines much of the quality.
464 // Each run gives lpf_db reduction. The lowpass also includes the
465 // pre-emphasis and oversampling.
466 #define LPF_TIMES 3
467 #define DIRTY_LPF_TIMES 2
468 #define LOWPASS                 lowpass3_df2
469 #define LOWPASS_PREEMP          lowpass3_df2_preemp
470 #define LOWPASS_PREEMP_2X       lowpass3_df2_preemp_2x
471 #define DIRTY_LOWPASS           lowpass2_df2
472 #define DIRTY_LOWPASS_PREEMP    lowpass2_df2_preemp
473 #define DIRTY_LOWPASS_PREEMP_2X lowpass2_df2_preemp_2x
474 // The number of points used in the interpolators.
475 #define FINE_POINTS 6
476 #define DIRTY_POINTS 4
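
// With lpf_db = 36 (see the commented constant above), LPF_TIMES = 3 adds up
// to the roughly 108 dB rejection of the high quality mode, and
// DIRTY_LPF_TIMES = 2 to the 72 dB of the dirty mode, matching the numbers
// quoted in the header comment.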
477 
// The data structure accommodates a fixed number of low pass applications.
479 #define LPF_MAX_TIMES LPF_TIMES
480 
// Let's define what we see as a reasonable history size for an IIR lowpass.
// On arrival of a new sample, <order> previous samples are needed, both
// input and output (recursive). Let's simply double that to cater for
// the recursive nature, ensuring that each recursive input already
// contains something from the new samples.
486 #define LPF_HISTORY(order) (2*(order))
487 
488 // Number of samples to keep for smooth rate changes on
489 // addition/removal of stages.
490 #define STAGE_HISTORY 48
491 
492 enum state_flags
493 {
494 	inter_flow   = 1<<0
495 ,	preemp_configured = 1<<1
496 ,	preemp_flow = 1<<2
497 ,	decimate_store = 1<<3 // one skipped sample from last buffer
498 ,	oversample_2x = 1<<4 // x2 oversampling or not
499 ,	lowpass_configured = 1<<5
500 ,	lowpass_flow = 1<<6
501 ,	dirty_method = 1<<7
502 ,	smooth_change = 1<<8
503 };
504 
505 // TODO: tune that to the smallest sensible value.
506 #ifndef SYN123_BATCH
507 #define BATCH 128
508 #else
509 #define BATCH SYN123_BATCH
510 #endif
511 
512 #if (BATCH < STAGE_HISTORY)
513 #error "Batch size cannot be smaller than the stage history."
514 #endif
515 
516 struct channel_history
517 {
518 	// Direct Form II histories for preemp and lowpass
519 	float pre_w[PREEMP_ORDER]; // history for Direct Form II
520 	float lpf_w[LPF_MAX_TIMES][LPF_ORDER];
521 	float x[5]; // past input samples for interpolator
522 	float c[6]; // interpolator coefficients
523 };
524 
525 struct resample_data
526 {
527 	unsigned int sflags;
528 	// The configured resampling function to call.
529 	// Decides: oversampling/nothing/decimation, fine/dirty
530 	size_t (*resample_func)(struct resample_data *, float *, size_t, float *);
531 	// The lowpass functions used before interpolation, including preemp.
532 	void (*lowpass_2x_func)(struct resample_data *, float *, size_t, float *);
533 	void (*lowpass_func)   (struct resample_data *, float *, size_t);
534 	// Maximum number of input samples that can be fed in one go to have the
535 	// number of output samples safely within the limit of size_t.
536 	size_t input_limit;
	// Decimator state. Each stage has an instance of lpf_4 and the decimator.
538 	unsigned int decim_stages;
539 	struct decimator_state *decim;
540 	// storage for the above to avoid nested malloc
541 	struct lpf4_hist *decim_hist;
542 	struct channel_history *ch;
543 	// Blocks of STAGE_HISTORY*channels samples.
544 	// 0: plain input for first decimation, oversampling or direct interpolation
545 	// 1: output of first decimation
546 	// 2: output of second decimation, etc.
547 	float *stage_history; // storage for past input of final (2x) lowpass
548 	float *frame;  // One PCM frame to work on (one sample for each channel).
549 	float *prebuf; // [channels*BATCH]
550 	float *upbuf;  // [channels*2*BATCH]
551 #ifndef NO_DE_DENORM
552 	float dede; // de-denormalization offset, alternatingly used both in preemp and lpf
553 #endif
554 	// Final lowpass filter setup.
555 	float lpf_cutoff;
556 	float lpf_w_c;
557 	unsigned char lpf_bw; // Bandwidth in percent, rounded to integer.
558 	// Pre-emphasis filter coefficients and history.
559 	float pre_b0;
560 	unsigned char pre_n1;
561 	float pre_b[PREEMP_ORDER][PREEMP_ORDER];
562 	float pre_a[PREEMP_ORDER][PREEMP_ORDER];
563 	// Same for the low pass.
564 	float lpf_b0;
565 	// Low pass filter history in ringbuffer. For each ringbuffer state, there is
566 	// one mirroring set of coefficients to avoid re-sorting during application.
567 	unsigned char lpf_n1; // Current index of the most recent past.
	// It may help to have these aligned, but since LPF_ORDER is neither 4 nor 8,
	// things just don't add up for vector instructions.
570 	float lpf_b[LPF_ORDER][LPF_ORDER];
571 	float lpf_a[LPF_ORDER][LPF_ORDER];
572 	// Interpolator state.
573 	// In future, I could pull the same trick with ringbuffer and copies
574 	// of coefficients (matrices, even) as for lpf.
	// TODO: maybe store more of these for better filter adaptation on rate changes,
	// or at least always the full set of 5.
577 	long offset; // interpolator offset
578 	unsigned int channels;
579 	long inrate;
580 	long vinrate;
581 	long outrate;
582 	long voutrate;
583 };
584 
585 // Interpolation of low pass coefficients for specified cutoff, using
586 // the precomputed table. Instead of properly computing the filter
587 // at runtime, a cheap spline interpolation in the stored tables
588 // does the job, too. This works for filters that are not too unstable.
589 // The employed 6th-order filter still works with the associated rounding
590 // errors, in the limited range of cutoff between 1/4 and 1/2 of Nyquist.
591 
// Construct the low pass filter for a bit less than the claimed cutoff,
// since numerics shift things a bit.
594 static const float lpf_cut_scale = 0.995;
595 
596 // Reference implementation of the cubic spline evaluation.
597 // x: time coordinate to evaluate at
598 // xa: lower interval boundary
599 // xb: upper interval boundary
600 // a: value at xa
601 // b: value at xb
602 // da: derivative at xa
// db: derivative at xb
static float spline_val( float x, float xa, float xb
605 ,	float a, float b, float da, float db )
606 {
607 	float d = xb-xa;
608 	float t = (x-xa)/d;
609 	float t2 = t*t;
610 	float t3 = t2*t;
611 	float h00 = 2*t3-3*t2+1;
612 	float h10 = -2*t3+3*t2;
613 	float h01d = (t3-2*t2+t)*d;
614 	float h11d = (t3-t2)*d;
615 	return h00*a + h10*b + h01d*da + h11d*db;
616 }
617 
// Find coordinate x in interpolation table val.
// Returns index for the lower boundary. Monotonic increase
// is assumed.
static unsigned int lpf_entry(const float *val, float x)
622 {
623 	unsigned int i = 0;
624 	while(i<(LPF_POINTS-2) && val[i+1] <= x)
625 		++i;
626 	return i;
627 }
628 
629 // Derivative of value with respect to w_c at given
630 // index. Intervals are not of equal width ... so central
// difference is not straightforward. I'll simply use an
// inverse-weighted mean of both sides.
static float lpf_deriv( const float x[LPF_POINTS]
634 ,	const float val[LPF_POINTS], unsigned int i )
635 {
636 	float w = 0;
637 	float m = 0;
638 	// lpf_w_c assumed to be monotonically increasing
639 	if(i<LPF_POINTS-1)
640 	{
641 		float w1 = x[i+1]-x[i];
642 		m += (val[i+1]-val[i])/(w1*w1);
643 		w += 1./w1;
644 	}
645 	if(i>0)
646 	{
647 		float w0 = x[i]-x[i-1];
648 		m += (val[i]-val[i-1])/(w0*w0);
649 		w += 1./w0;
650 	}
651 	return m/w;
652 }
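
// In formula form, with one-sided slopes s0 = (val[i]-val[i-1])/w0 and
// s1 = (val[i+1]-val[i])/w1 over the interval widths w0 and w1, the function
// above returns
//   (s0/w0 + s1/w1) / (1/w0 + 1/w1)
// i.e. each slope weighted by the inverse width of its interval, so the
// narrower interval dominates.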
653 
654 // Static ring buffer index used for filter history and coefficients.
655 // i: offset
656 // n1: current position of the first entry
657 // s: size of the ring buffer
#define RING_INDEX(i, n1, s) ( ((n1)+(i)) % (s) )
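// Example: with s = 6 and n1 = 4, offsets i = 0..5 map to indices
// 4, 5, 0, 1, 2, 3 -- the ring wraps around without any branching.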
659 
660 // Compute actual low pass coefficients for the cutoff in the handle.
// This interpolates in the tables without subsequent scaling for exact
// 0 dB DC gain, which has been shown not to be useful.
static void lpf_init(struct resample_data *rd)
664 {
665 	rd->sflags &= ~lowpass_configured;
666 	float real_cutoff = rd->lpf_cutoff * lpf_cut_scale;
667 	// It does not make sense to return an error here. It is an internal
668 	// implementation error if the table is too small. Just run with the
669 	// best value we have.
670 	if(real_cutoff < lpf_cutoff[0])
671 		real_cutoff = lpf_cutoff[0];
672 	if(real_cutoff > lpf_cutoff[LPF_POINTS-1])
673 		real_cutoff = lpf_cutoff[LPF_POINTS-1];
674 	// The coefficient interpolation needs w_c != cutoff.
675 	unsigned int lpfi = lpf_entry(lpf_cutoff, real_cutoff);
676 	rd->lpf_w_c = spline_val
677 	(
678 		real_cutoff, lpf_cutoff[lpfi], lpf_cutoff[lpfi+1]
679 	,	lpf_w_c[lpfi], lpf_w_c[lpfi+1]
680 	,	lpf_deriv(lpf_cutoff, lpf_w_c, lpfi)
681 	,	lpf_deriv(lpf_cutoff, lpf_w_c, lpfi+1)
682 	);
683 	mdebug("lowpass cutoff %g; w_c %g", rd->lpf_cutoff, rd->lpf_w_c);
684 	lpfi = lpf_entry(lpf_w_c, rd->lpf_w_c);
685 	rd->lpf_bw = (unsigned char)( 0.5 + lpf_cut_scale*(lpf_bw[lpfi]
686 	+	(rd->lpf_w_c-lpf_w_c[lpfi])/(lpf_w_c[lpfi+1]-lpf_w_c[lpfi])
687 	*	(lpf_bw[lpfi+1]-lpf_bw[lpfi])) );
688 	mdebug("lowpass bandwidth: %u%%", rd->lpf_bw);
689 	rd->lpf_b0 = spline_val( rd->lpf_w_c, lpf_w_c[lpfi], lpf_w_c[lpfi+1]
690 	,	lpf_b0[lpfi], lpf_b0[lpfi+1]
691 	,	lpf_mb0[lpfi], lpf_mb0[lpfi+1] );
692 	float b[LPF_ORDER];
693 	float a[LPF_ORDER];
694 	for(int i=0; i<LPF_ORDER; ++i)
695 	{
696 		b[i] = spline_val( rd->lpf_w_c
697 		,	lpf_w_c[lpfi], lpf_w_c[lpfi+1]
698 		,	lpf_b[lpfi][i], lpf_b[lpfi+1][i]
699 		,	lpf_mb[lpfi][i], lpf_mb[lpfi+1][i] );
700 		a[i] = spline_val( rd->lpf_w_c
701 		,	lpf_w_c[lpfi], lpf_w_c[lpfi+1]
702 		,	lpf_a[lpfi][i], lpf_a[lpfi+1][i]
703 		,	lpf_ma[lpfi][i], lpf_ma[lpfi+1][i] );
704 	}
705 	memset(rd->lpf_a, 0, sizeof(float)*LPF_ORDER*LPF_ORDER);
706 	memset(rd->lpf_b, 0, sizeof(float)*LPF_ORDER*LPF_ORDER);
707 	for(int i=0; i<LPF_ORDER; ++i)
708 	{
709 		for(int j=0; j<LPF_ORDER; ++j)
710 		{
711 			rd->lpf_b[j][RING_INDEX(i,j,LPF_ORDER)] = b[i];
712 			rd->lpf_a[j][RING_INDEX(i,j,LPF_ORDER)] = a[i];
713 		}
714 	}
715 	debug("lpf coefficients [b,a]:");
716 	mdebug("%.15e", rd->lpf_b0);
717 	for(int i=0; i<LPF_ORDER; ++i)
718 		mdebug("%.15e", rd->lpf_b[0][i]);
719 	mdebug("%.15e", 1.);
720 	for(int i=0; i<LPF_ORDER; ++i)
721 		mdebug("%.15e", rd->lpf_a[0][i]);
722 	rd->sflags |= lowpass_configured;
723 }
724 
725 // Compute an estimate of the magnitude of lowpass history values.
726 // This is used to re-scale those values on sample ratio changes to
727 // try to keep a smooth ride. Relate this to df2_initval().
728 // First approach: b_0 / (1+sum(a))
static float lpf_history_scale(struct resample_data *rd)
730 {
731 	float asum = 1;
732 	for(unsigned int i=0; i<LPF_ORDER; ++i)
733 		asum += rd->lpf_a[0][i];
734 	return rd->lpf_b0/asum;
735 }
736 
static void preemp_init(struct resample_data *rd)
738 {
739 	const float* pre_b;
740 	const float* pre_a;
741 	rd->sflags &= ~preemp_configured;
742 
743 	#define PREEMP_COEFF(name) \
744 		pre_b     = preemp_ ## name ## _coeff[0]; \
745 		pre_a     = preemp_ ## name ## _coeff[1];
746 	if(rd->sflags & oversample_2x)
747 	{
748 		if(rd->sflags & dirty_method)
749 		{
750 			PREEMP_COEFF(2xopt4p4o);
751 		} else
752 		{
753 			PREEMP_COEFF(2xopt6p5o);
754 		}
755 	}
756 	else
757 	{
758 		if(rd->sflags & dirty_method)
759 		{
760 			PREEMP_COEFF(1xopt4p4o);
761 		} else
762 		{
763 			PREEMP_COEFF(1xopt6p5o);
764 		}
765 	}
766 
767 	rd->pre_b0 = pre_b[0];
768 	for(int i=0; i<PREEMP_ORDER; ++i)
769 	{
770 		for(int j=0; j<PREEMP_ORDER; ++j)
771 		{
772 			rd->pre_b[j][RING_INDEX(i,j,PREEMP_ORDER)] = pre_b[i+1];
773 			rd->pre_a[j][RING_INDEX(i,j,PREEMP_ORDER)] = pre_a[i+1];
774 		}
775 	}
776 
777 	rd->sflags |= preemp_configured;
778 }
779 
780 
781 // Store the last samples of given input data in the respective
782 // stage history buffer.
static void stage_history( struct resample_data *rd
784 ,	unsigned int stage, float *in, size_t ins )
785 {
786 	if(!rd->stage_history)
787 		return;
788 	// Case 1: We got enough and can fill it all.
789 	// Case 2: Just a bit, have to shift existing
790 	//         history to append to.
791 	float *hist = rd->stage_history + stage*STAGE_HISTORY*rd->channels;
792 	if(ins >= STAGE_HISTORY)
793 		memcpy( hist, in+(ins-STAGE_HISTORY)*rd->channels
794 		,	STAGE_HISTORY*rd->channels*sizeof(float) );
795 	else
796 	{
797 		memmove( hist, hist+ins*rd->channels
798 		,	(STAGE_HISTORY-ins)*rd->channels*sizeof(float) );
799 		memcpy( hist+(STAGE_HISTORY-ins)*rd->channels, in
800 		,	ins*rd->channels*sizeof(float) );
801 	}
802 }
803 
804 
static void stage_history_init( struct resample_data *rd
806 ,	unsigned int stage, float *in )
807 {
808 	if(!rd->stage_history)
809 		return;
810 	float *hist = rd->stage_history + stage*STAGE_HISTORY*rd->channels;
811 	for(size_t i=0; i<STAGE_HISTORY; ++i)
812 		for(unsigned int c=0; c<rd->channels; ++c)
813 			hist[i*rd->channels+c] = in[c];
814 }
815 
816 // Actual resampling workers.
817 
818 // To make things easier to compilers, things like the number of times
819 // a low pass filter is repeatedly applied are hardcoded in functions
820 // generated with a set of macros to avoid code duplication.
821 
822 // One decimation step: low pass with transition band from 1/2 to 1/4,
823 // drop every second sample. This works within the input buffer.
static size_t decimate( struct resample_data *rrd, unsigned int dstage
825 ,	float *in, size_t ins )
826 {
827 	struct decimator_state *rd = rrd->decim+dstage;
828 	if(!ins)
829 		return 0;
830 	mdebug("decimating %zu samples", ins);
831 	if(!(rd->sflags & lowpass_flow))
832 	{
833 		for(unsigned int c=0; c<rrd->channels; ++c)
834 			for(unsigned int j=0; j<LPF_4_ORDER; ++j)
835 				rd->ch[c].x[j] = rd->ch[c].y[j] = in[c];
836 		rd->n1 = 0;
837 		rd->sflags |= lowpass_flow|decimate_store;
838 		DE_DENORM_INIT(rd->dede, dstage % 2 ? -1 : +1)
839 		stage_history_init(rrd, dstage+1, in);
840 	}
841 	float *out  = in; // output worker
842 	float *oout = in; // for history storage
843 	size_t outs = 0;
844 #ifndef SYN123_NO_CASES
845 	switch(rrd->channels)
846 	{
847 		case 1: for(size_t i=0; i<ins; ++i)
848 		{
849 			int ni[LPF_4_ORDER];
850 			for(int j=0; j<LPF_4_ORDER; ++j)
851 				ni[j] = RING_INDEX(j, rd->n1, LPF_4_ORDER);
852 			rd->n1 = ni[LPF_4_ORDER-1];
			// y0 = b0*x0 + b1*x1 + ... + bn*xn - a1*y1 - ... - an*yn
854 			// Switching y to double pushes roundoff hf noise down (?).
855 			lpf_sum_type ny = lpf_4_coeff[0][0] * in[i]; // a0 == 1 implicit here
856 			for(int j=0; j<LPF_4_ORDER; ++j)
857 			{
858 				// This direct summing without intermediate asum and bsum
				// pushes roundoff hf noise down.
860 				ny += rd->ch[0].x[ni[j]]*lpf_4_coeff[0][j+1];
861 				ny -= rd->ch[0].y[ni[j]]*lpf_4_coeff[1][j+1];
862 			}
863 			DE_DENORM(ny, rd->dede)
864 			rd->ch[0].x[rd->n1] = in[i];
865 			rd->ch[0].y[rd->n1] = ny;
866 			// Drop every second sample.
			// Maybe it's faster doing this in a separate step after all?
868 			if(rd->sflags & decimate_store)
869 			{
870 				DE_DENORM_FLIP(rd->dede)
871 				*(out++) = ny;
872 				rd->sflags &= ~decimate_store;
873 				++outs;
874 			} else
875 				rd->sflags |= decimate_store;
876 		}
877 		break;
878 		case 2: for(size_t i=0; i<ins; ++i)
879 		{
880 			int ni[LPF_4_ORDER];
881 			for(int j=0; j<LPF_4_ORDER; ++j)
882 				ni[j] = RING_INDEX(j, rd->n1, LPF_4_ORDER);
883 			rd->n1 = ni[LPF_4_ORDER-1];
884 			float frame[2];
885 			for(unsigned int c=0; c<2; ++c)
886 			{
				// y0 = b0*x0 + b1*x1 + ... + bn*xn - a1*y1 - ... - an*yn
888 				// Switching y to double pushes roundoff hf noise down (?).
889 				lpf_sum_type ny = lpf_4_coeff[0][0] * in[c]; // a0 == 1 implicit here
890 				for(int j=0; j<LPF_4_ORDER; ++j)
891 				{
892 					// This direct summing without intermediate asum and bsum
					// pushes roundoff hf noise down.
894 					ny += rd->ch[c].x[ni[j]]*lpf_4_coeff[0][j+1];
895 					ny -= rd->ch[c].y[ni[j]]*lpf_4_coeff[1][j+1];
896 				}
897 				DE_DENORM(ny, rd->dede)
898 				rd->ch[c].x[rd->n1] = in[c];
899 				rd->ch[c].y[rd->n1] = ny;
900 				frame[c] = ny;
901 			}
902 			in += 2;
903 			// Drop every second sample.
			// Maybe it's faster doing this in a separate step after all?
905 			if(rd->sflags & decimate_store)
906 			{
907 				DE_DENORM_FLIP(rd->dede)
908 				*(out++) = frame[0];
909 				*(out++) = frame[1];
910 				rd->sflags &= ~decimate_store;
911 				++outs;
912 			} else
913 				rd->sflags |= decimate_store;
914 		}
915 		break;
916 		default:
917 #endif
918 		for(size_t i=0; i<ins; ++i)
919 		{
920 			int ni[LPF_4_ORDER];
921 			for(int j=0; j<LPF_4_ORDER; ++j)
922 				ni[j] = RING_INDEX(j, rd->n1, LPF_4_ORDER);
923 			rd->n1 = ni[LPF_4_ORDER-1];
924 			for(unsigned int c=0; c<rrd->channels; ++c)
925 			{
				// y0 = b0*x0 + b1*x1 + ... + bn*xn - a1*y1 - ... - an*yn
927 				// Switching y to double pushes roundoff hf noise down (?).
928 				lpf_sum_type ny = lpf_4_coeff[0][0] * in[c]; // a0 == 1 implicit here
929 				for(int j=0; j<LPF_4_ORDER; ++j)
930 				{
931 					// This direct summing without intermediate asum and bsum
					// pushes roundoff hf noise down.
933 					ny += rd->ch[c].x[ni[j]]*lpf_4_coeff[0][j+1];
934 					ny -= rd->ch[c].y[ni[j]]*lpf_4_coeff[1][j+1];
935 				}
936 				DE_DENORM(ny, rd->dede)
937 				rd->ch[c].x[rd->n1] = in[c];
938 				rd->ch[c].y[rd->n1] = ny;
939 				rrd->frame[c] = ny;
940 			}
941 			in += rrd->channels;
942 			// Drop every second sample.
			// Maybe it's faster doing this in a separate step after all?
944 			if(rd->sflags & decimate_store)
945 			{
946 				DE_DENORM_FLIP(rd->dede)
947 				for(unsigned int c=0; c<rrd->channels; ++c)
948 					*(out++) = rrd->frame[c];
949 				rd->sflags &= ~decimate_store;
950 				++outs;
951 			} else
952 				rd->sflags |= decimate_store;
953 		}
954 #ifndef SYN123_NO_CASES
955 	}
956 #endif
957 	stage_history(rrd, dstage+1, oout, outs);
958 	return outs;
959 }
960 
961 // Initialization of Direct Form 2 history.
962 // Return a constant value for w[n] that represents an endless history of given insample.
963 // Given the recursive definition: w[n] = x[n] - sum_i(a[i] * w[n-i])
964 // Given my ansatz of endless stream of x[n-i] = x[n] and hence w[n-i] = w[n], it follows
965 // that
966 //       w[n] = x[n] - sum_i(a[i] * w[n]) = x[n] - sum_i(a[i]) * w[n]
967 //  <=>  (1 + sum_i(a[i])) w[n] = x[n]
968 //  <=>  w[n] = 1 / (1 + sum_i(a[i])) * x[n]
969 // Looks simple. Just a scale value.
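// Sanity check with a made-up first-order example, a_1 = -0.5: the formula
// gives w = x/(1 - 0.5) = 2x, and indeed w = x - a_1*w = x + 0.5*2x = 2x.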
static float df2_initval(unsigned int order, float *filter_a, float insample)
971 {
972 	float asum = 1.;
973 	for(unsigned int i=0; i<order; ++i)
974 		asum += filter_a[i];
975 	float scale = 1.f/asum;
976 	return scale * insample;
977 }
978 
979 #define PREEMP_DF2_BEGIN(initval, channels, ret) \
980 	if(!(rd->sflags & preemp_flow)) \
981 	{ \
982 		if(!(rd->sflags & preemp_configured)) \
983 		{ \
984 			fprintf(stderr, "unconfigured pre-emphasis\n"); \
985 			return ret; \
986 		} \
987 		for(int c=0; c<channels; ++c) \
988 		{ \
989 			float iv = df2_initval(PREEMP_ORDER, rd->pre_a[0], initval[c]); \
990 			for(int i=0; i<PREEMP_ORDER; ++i) \
991 				rd->ch[c].pre_w[i] = iv; \
992 		} \
993 		rd->pre_n1 = 0; \
994 		rd->sflags |= preemp_flow; \
995 	}
996 
// Might not be faster than Direct Form I, but for sure needs less storage.
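// The *_DF2_SAMPLE macros below implement the usual Direct Form II recurrence
//   w[n] = x[n] - sum_i a[i]*w[n-i]
//   y[n] = b[0]*w[n] + sum_i b[i]*w[n-i]
// so only the single w history per channel is kept, instead of separate x and
// y histories as in the Direct Form I decimator above.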
998 #define PREEMP_DF2_SAMPLE(in, out, channels) \
999 	{ \
1000 		unsigned char n1 = rd->pre_n1; \
1001 		rd->pre_n1 = RING_INDEX(PREEMP_ORDER-1, rd->pre_n1, PREEMP_ORDER); \
1002 		for(unsigned int c=0; c<channels; ++c) \
1003 		{ \
1004 			lpf_sum_type ny = 0; \
1005 			lpf_sum_type nw = 0; \
1006 			for(unsigned char i=0; i<PREEMP_ORDER; ++i) \
1007 			{ \
1008 				ny += rd->ch[c].pre_w[i]*rd->pre_b[n1][i]; \
1009 				nw -= rd->ch[c].pre_w[i]*rd->pre_a[n1][i]; \
1010 			} \
1011 			nw += in[c]; \
1012 			DE_DENORM(nw, rd->dede) \
1013 			ny += rd->pre_b0 * nw; \
1014 			rd->ch[c].pre_w[rd->pre_n1] = nw; \
1015 			out[c] = ny; \
1016 		} \
1017 	}
1018 
1019 // Beginning of a low pass function with on-the-fly initialization
1020 // of filter history.
1021 // For oversampling lowpass with zero-stuffing, initialize to half of first
1022 // input sample.
1023 #define LPF_DF2_BEGIN(times,initval,channels,ret) \
1024 	if(!(rd->sflags & lowpass_flow)) \
1025 	{ \
1026 		if(!(rd->sflags & lowpass_configured)) \
1027 		{ \
1028 			fprintf(stderr, "unconfigured lowpass\n"); \
1029 			return ret; \
1030 		} \
1031 		rd->lpf_n1 = 0; \
1032 		for(unsigned int c=0; c<channels; ++c) \
1033 		{ \
1034 			float iv = df2_initval(LPF_ORDER, rd->lpf_a[0], initval[c]); \
1035 			for(int j=0; j<times; ++j) \
1036 				for(int i=0; i<LPF_ORDER; ++i) \
1037 					rd->ch[c].lpf_w[j][i] = iv; \
1038 		} \
1039 		rd->sflags |= lowpass_flow; \
1040 	} \
1041 	int n1 = rd->lpf_n1; \
1042 	int n1n;
1043 
1044 #define LPF_DF2_SAMPLE(times, in, out, channels) \
1045 	n1n = RING_INDEX(LPF_ORDER-1, n1, LPF_ORDER); \
1046 	for(unsigned int c=0; c<channels; ++c) \
1047 	{ \
1048 		float old_y = in[c]; \
1049 		for(int j=0; j<times; ++j) \
1050 		{ \
1051 			lpf_sum_type w = old_y; \
1052 			lpf_sum_type y = 0; \
1053 			for(int k=0; k<LPF_ORDER; ++k) \
1054 			{ \
1055 				y += rd->ch[c].lpf_w[j][k]*rd->lpf_b[n1][k]; \
1056 				w -= rd->ch[c].lpf_w[j][k]*rd->lpf_a[n1][k]; \
1057 			} \
1058 			DE_DENORM(w, rd->dede) \
1059 			rd->ch[c].lpf_w[j][n1n] = w; \
1060 			y += rd->lpf_b0*w; \
1061 			old_y = y; \
1062 		} \
1063 		out[c] = old_y; \
1064 	} \
1065 	n1 = n1n;
1066 
1067 #define LPF_DF2_END \
1068 	rd->lpf_n1 = n1;
1069 
1070 // Define normal and oversampling low pass with pre-emphasis in direct form 2.
1071 // The parameter determines the number of repeated applications of the same
1072 // low pass.
1073 
1074 #ifndef SYN123_NO_CASES
1075 #define LOWPASS_DF2_FUNCSX(times) \
1076 \
1077 static void lowpass##times##_df2_preemp_2x(struct resample_data *rd, float *in, size_t ins, float *out) \
1078 { \
1079 	if(!ins) \
1080 		return; \
1081 	LPF_DF2_BEGIN(times,in,rd->channels,) \
1082 	PREEMP_DF2_BEGIN(in,rd->channels,); \
1083 	switch(rd->channels) \
1084 	{ \
1085 		case 1: for(size_t i=0; i<ins; ++i) \
1086 		{ \
1087 			float sample; \
1088 			PREEMP_DF2_SAMPLE(in, (&sample), 1) \
1089 			/* Zero-stuffing! Insert zero after making up for energy loss. */ \
1090 			sample *= 2; \
1091 			LPF_DF2_SAMPLE(times, (&sample), out, 1) \
1092 			out++; \
1093 			sample = 0; \
1094 			LPF_DF2_SAMPLE(times, (&sample), out, 1) \
1095 			DE_DENORM_FLIP(rd->dede) \
1096 			out++; \
1097 			in++; \
1098 		} \
1099 		break; \
1100 		case 2: for(size_t i=0; i<ins; ++i) \
1101 		{ \
1102 			float frame[2]; \
1103 			PREEMP_DF2_SAMPLE(in, frame, 2) \
1104 			/* Zero-stuffing! Insert zero after making up for energy loss. */ \
1105 			frame[0] *= 2; \
1106 			frame[1] *= 2; \
1107 			LPF_DF2_SAMPLE(times, frame, out, 2) \
1108 			out += 2; \
1109 			frame[1] = frame[0] = 0; \
1110 			LPF_DF2_SAMPLE(times, frame, out, 2) \
1111 			DE_DENORM_FLIP(rd->dede) \
1112 			out += 2; \
1113 			in  += 2; \
1114 		} \
1115 		break; \
1116 		default: for(size_t i=0; i<ins; ++i) \
1117 		{ \
1118 			PREEMP_DF2_SAMPLE(in, rd->frame, rd->channels) \
1119 			/* Zero-stuffing! Insert zero after making up for energy loss. */ \
1120 			for(unsigned int c=0; c<rd->channels; ++c) \
1121 				rd->frame[c] *= 2; \
1122 			LPF_DF2_SAMPLE(times, rd->frame, out, rd->channels) \
1123 			out += rd->channels; \
1124 			for(unsigned int c=0; c<rd->channels; ++c) \
1125 				rd->frame[c] = 0; \
1126 			LPF_DF2_SAMPLE(times, rd->frame, out, rd->channels) \
1127 			DE_DENORM_FLIP(rd->dede) \
1128 			out += rd->channels; \
1129 			in  += rd->channels; \
1130 		} \
1131 	} \
1132 	LPF_DF2_END \
1133 } \
1134 \
1135 static void lowpass##times##_df2_preemp(struct resample_data *rd, float *in, size_t ins) \
1136 { \
1137 	if(!ins) \
1138 		return; \
1139 	float *out = in; \
1140 	LPF_DF2_BEGIN(times, in, rd->channels,) \
1141 	PREEMP_DF2_BEGIN(in, rd->channels,); \
1142 	switch(rd->channels) \
1143 	{ \
1144 		case 1:  for(size_t i=0; i<ins; ++i) \
1145 		{ \
1146 			float sample; \
1147 			PREEMP_DF2_SAMPLE(in, (&sample), 1) \
1148 			LPF_DF2_SAMPLE(times, (&sample), out, 1) \
1149 			DE_DENORM_FLIP(rd->dede) \
1150 			in++; \
1151 			out++; \
1152 		} \
1153 		break; \
1154 		case 2:  for(size_t i=0; i<ins; ++i) \
1155 		{ \
1156 			float frame[2]; \
1157 			PREEMP_DF2_SAMPLE(in, frame, 2) \
1158 			LPF_DF2_SAMPLE(times, frame, out, 2) \
1159 			DE_DENORM_FLIP(rd->dede) \
1160 			in  += 2; \
1161 			out += 2; \
1162 		} \
1163 		break; \
1164 		default: for(size_t i=0; i<ins; ++i) \
1165 		{ \
1166 			PREEMP_DF2_SAMPLE(in, rd->frame, rd->channels) \
1167 			LPF_DF2_SAMPLE(times, rd->frame, out, rd->channels) \
1168 			DE_DENORM_FLIP(rd->dede) \
1169 			in  += rd->channels; \
1170 			out += rd->channels; \
1171 		} \
1172 	} \
1173 	LPF_DF2_END \
1174 }
1175 #else
1176 #define LOWPASS_DF2_FUNCSX(times) \
1177 \
1178 static void lowpass##times##_df2_preemp_2x(struct resample_data *rd, float *in, size_t ins, float *out) \
1179 { \
1180 	if(!ins) \
1181 		return; \
1182 	LPF_DF2_BEGIN(times,in,rd->channels,) \
1183 	PREEMP_DF2_BEGIN(in,rd->channels,); \
1184 	for(size_t i=0; i<ins; ++i) \
1185 	{ \
1186 		PREEMP_DF2_SAMPLE(in, rd->frame, rd->channels) \
1187 		/* Zero-stuffing! Insert zero after making up for energy loss. */ \
1188 		for(unsigned int c=0; c<rd->channels; ++c) \
1189 			rd->frame[c] *= 2; \
1190 		LPF_DF2_SAMPLE(times, rd->frame, out, rd->channels) \
1191 		out += rd->channels; \
1192 		for(unsigned int c=0; c<rd->channels; ++c) \
1193 			rd->frame[c] = 0; \
1194 		LPF_DF2_SAMPLE(times, rd->frame, out, rd->channels) \
1195 		DE_DENORM_FLIP(rd->dede) \
1196 		out += rd->channels; \
1197 		in  += rd->channels; \
1198 	} \
1199 	LPF_DF2_END \
1200 } \
1201 \
1202 static void lowpass##times##_df2_preemp(struct resample_data *rd, float *in, size_t ins) \
1203 { \
1204 	if(!ins) \
1205 		return; \
1206 	float *out = in; \
1207 	LPF_DF2_BEGIN(times, in, rd->channels,) \
1208 	PREEMP_DF2_BEGIN(in, rd->channels,); \
1209 	for(size_t i=0; i<ins; ++i) \
1210 	{ \
1211 		PREEMP_DF2_SAMPLE(in, rd->frame, rd->channels) \
1212 		LPF_DF2_SAMPLE(times, rd->frame, out, rd->channels) \
1213 		DE_DENORM_FLIP(rd->dede) \
1214 		in  += rd->channels; \
1215 		out += rd->channels; \
1216 	} \
1217 	LPF_DF2_END \
1218 }
1219 #endif
1220 
1221 
1222 // Need that indirection so that times is expanded properly for concatenation.
1223 #define LOWPASS_DF2_FUNCS(times) \
1224 	LOWPASS_DF2_FUNCSX(times)
1225 
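// For example, LOWPASS_DF2_FUNCS(LPF_TIMES) first expands LPF_TIMES to 3 and
// then generates lowpass3_df2_preemp and lowpass3_df2_preemp_2x, the names
// that the LOWPASS_PREEMP and LOWPASS_PREEMP_2X macros above refer to.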
1226 LOWPASS_DF2_FUNCS(LPF_TIMES)
LOWPASS_DF2_FUNCS(DIRTY_LPF_TIMES)
1228 
1229 // Some interpolators based on the deip paper:
1230 // Polynomial Interpolators for High-Quality Resampling of Oversampled Audio
1231 // REVISED VERSION
1232 // by Olli Niemitalo in October 2001
1233 // http://www.student.oulu.fi/~oniemita/DSP/INDEX.HTM
1234 // http://yehar.com/blog/wp-content/uploads/2009/08/deip.pdf
1235 
1236 #define OPT4P4O_BEGIN(in) \
1237 	if(!(rd->sflags & inter_flow)) \
1238 	{ \
1239 		for(unsigned int c=0; c<rd->channels; ++c) \
1240 			rd->ch[c].x[0] = rd->ch[c].x[1] = rd->ch[c].x[2] = in[c]; \
1241 		rd->offset = -rd->vinrate; \
1242 		rd->sflags |= inter_flow; \
1243 	}
1244 
1245 /* See OPT6P5O_INTERPOL for logic hints. */
1246 #define OPT4P4O_INTERPOL(in, out, outs, channels) \
1247 	{ \
1248 		long sampleoff = rd->offset; \
1249 		if(sampleoff+rd->vinrate < rd->voutrate) \
1250 		{ \
1251 			for(unsigned int c=0; c<channels; ++c) \
1252 			{ \
1253 				float even1 = rd->ch[c].x[2]+rd->ch[c].x[1]; \
1254 				float  odd1 = rd->ch[c].x[2]-rd->ch[c].x[1]; \
1255 				float even2 = in[c]+rd->ch[c].x[0]; \
1256 				float  odd2 = in[c]-rd->ch[c].x[0]; \
1257 				rd->ch[c].c[0] = even1*0.45645918406487612f \
1258 				               + even2*0.04354173901996461f; \
1259 				rd->ch[c].c[1] = odd1*0.47236675362442071f \
1260 				               + odd2*0.17686613581136501f; \
1261 				rd->ch[c].c[2] = even1*-0.253674794204558521f \
1262 				               + even2*0.25371918651882464f; \
1263 				rd->ch[c].c[3] = odd1*-0.37917091811631082f \
1264 				               + odd2*0.11952965967158000f; \
1265 				rd->ch[c].c[4] = even1*0.04252164479749607f \
1266 				               + even2*-0.04289144034653719f; \
1267 			} \
1268 			do	{ \
1269 				sampleoff += rd->vinrate; \
1270 				float z = ((float)sampleoff/rd->voutrate) - (float)0.5; \
1271 				for(unsigned int c=0; c<channels; ++c) \
1272 					out[c] = \
1273 					(	(	( \
1274 								rd->ch[c].c[4] \
1275 								*z + rd->ch[c].c[3] \
1276 							)*z + rd->ch[c].c[2] \
1277 						)*z + rd->ch[c].c[1] \
1278 					)*z + rd->ch[c].c[0]; \
1279 				outs++; \
1280 				out += channels; \
1281 			} while(sampleoff+rd->vinrate < rd->voutrate); \
1282 		} \
1283 		for(unsigned int c=0; c<channels; ++c) \
1284 		{ \
1285 			rd->ch[c].x[0] = rd->ch[c].x[1]; \
1286 			rd->ch[c].x[1] = rd->ch[c].x[2]; \
1287 			rd->ch[c].x[2] = in[c]; \
1288 		} \
1289 		rd->offset = sampleoff - rd->voutrate; \
1290 	}
1291 
1292 #define OPT4P4O_END
1293 
1294 // Optimal 2x (4-point, 4th-order) (z-form)
1295 static size_t resample_opt4p4o(struct resample_data *rd, float*in
1296 ,	size_t ins, float *out)
1297 {
1298 	size_t outs = 0;
1299 	if(!ins)
1300 		return outs;
1301 	OPT4P4O_BEGIN(in)
1302 #ifndef SYN123_NO_CASES
1303 	switch(rd->channels)
1304 	{
1305 		case  1: for(size_t i=0; i<ins; ++i)
1306 		{
1307 			OPT4P4O_INTERPOL(in, out, outs, 1)
1308 			in++;
1309 		}
1310 		break;
1311 		case  2: for(size_t i=0; i<ins; ++i)
1312 		{
1313 			OPT4P4O_INTERPOL(in, out, outs, 2)
1314 			in += 2;
1315 		}
1316 		break;
1317 		default:
1318 #endif
1319 		for(size_t i=0; i<ins; ++i)
1320 		{
1321 			OPT4P4O_INTERPOL(in, out, outs, rd->channels)
1322 			in += rd->channels;
1323 		}
1324 #ifndef SYN123_NO_CASES
1325 	}
1326 #endif
1327 	OPT4P4O_END
1328 	return outs;
1329 }
1330 
static size_t resample_opt4p4o_2batch(struct resample_data *rd, float *in, float *out)
1332 {
1333 	size_t outs = 0;
1334 	OPT4P4O_BEGIN(in)
1335 #ifndef SYN123_NO_CASES
1336 	switch(rd->channels)
1337 	{
1338 		case  1: for(size_t i=0; i<2*BATCH; ++i)
1339 		{
1340 			OPT4P4O_INTERPOL(in, out, outs, 1)
1341 			in++;
1342 		}
1343 		break;
1344 		case  2: for(size_t i=0; i<2*BATCH; ++i)
1345 		{
1346 			OPT4P4O_INTERPOL(in, out, outs, 2)
1347 			in += 2;
1348 		}
1349 		break;
1350 		default:
1351 #endif
1352 		for(size_t i=0; i<2*BATCH; ++i)
1353 		{
1354 			OPT4P4O_INTERPOL(in, out, outs, rd->channels)
1355 			in += rd->channels;
1356 		}
1357 #ifndef SYN123_NO_CASES
1358 	}
1359 #endif
1360 	OPT4P4O_END
1361 	return outs;
1362 }
1363 
// This one has low distortion but a somewhat more problematic response:
1365 // Optimal 2x (6-point, 5th-order) (z-form)
1366 
1367 #define OPT6P5O_BEGIN(in) \
1368 	if(!(rd->sflags & inter_flow)) \
1369 	{ \
1370 		for(unsigned int c=0; c<rd->channels; ++c) \
1371 		{ \
1372 			rd->ch[c].x[0] = rd->ch[c].x[1] = rd->ch[c].x[2] \
1373 			=	rd->ch[c].x[3] = rd->ch[c].x[4] = in[c]; \
1374 		} \
1375 		rd->offset = -rd->vinrate; \
1376 		rd->sflags |= inter_flow; \
1377 	}
1378 
1379 #define OPT6P5O_INTERPOL(in, out, outs, channels) \
1380 	{ \
		/* Offset is how long ago the last sample occurred. */ \
1382 		long sampleoff = rd->offset; \
1383 		/* First check if this interval is interesting at all. */ \
1384 		if(sampleoff < rd->voutrate-rd->vinrate) \
1385 		{ \
1386 			/* Not sure if one big loop over the channels would be better. SIMD?! */ \
1387 			for(unsigned int c=0; c<channels; ++c) \
1388 			{ \
1389 				float even1 = rd->ch[c].x[3]+rd->ch[c].x[2]; \
1390 				float  odd1 = rd->ch[c].x[3]-rd->ch[c].x[2]; \
1391 				float even2 = rd->ch[c].x[4]+rd->ch[c].x[1]; \
1392 				float  odd2 = rd->ch[c].x[4]-rd->ch[c].x[1]; \
1393 				float even3 = in[c]+rd->ch[c].x[0]; \
1394 				float  odd3 = in[c]-rd->ch[c].x[0]; \
1395 				rd->ch[c].c[0] = even1*0.40513396007145713f + even2*0.09251794438424393f \
1396 				               + even3*0.00234806603570670f; \
1397 				rd->ch[c].c[1] = odd1*0.28342806338906690f + odd2*0.21703277024054901f \
1398 				               + odd3*0.01309294748731515f; \
1399 				rd->ch[c].c[2] = even1*-0.191337682540351941f + even2*0.16187844487943592f \
1400 				               + even3*0.02946017143111912f; \
1401 				rd->ch[c].c[3] = odd1*-0.16471626190554542f + odd2*-0.00154547203542499f \
1402 				               + odd3*0.03399271444851909f; \
1403 				rd->ch[c].c[4] = even1*0.03845798729588149f + even2*-0.05712936104242644f \
1404 				               + even3*0.01866750929921070f; \
1405 				rd->ch[c].c[5] = odd1*0.04317950185225609f + odd2*-0.01802814255926417f \
1406 				               + odd3*0.00152170021558204f; \
1407 			} \
1408 			/* Only reaching this code if at least one evaluation is due. */ \
1409 			do { \
1410 				sampleoff += rd->vinrate; \
1411 				/* Evaluating as vector sum may be worthwhile for strong upsampling, */ \
1412 				/* possibly many channels. Otherwise, it's just more muls. */ \
1413 				float z = ((float)sampleoff/rd->voutrate) - (float)0.5; \
1414 				for(unsigned int c=0; c<channels; ++c) \
1415 					out[c] = \
1416 					(	(	(	( \
1417 									rd->ch[c].c[5] \
1418 									*z + rd->ch[c].c[4] \
1419 								)*z + rd->ch[c].c[3] \
1420 							)*z +	rd->ch[c].c[2] \
1421 						)*z +	rd->ch[c].c[1] \
1422 					)*z + rd->ch[c].c[0]; \
1423 				outs++; \
1424 				out += channels; \
1425 			} while(sampleoff < rd->voutrate-rd->vinrate); \
1426 		} \
1427 		for(unsigned int c=0; c<channels; ++c) \
1428 		{ \
			/* Store shifted history. Think about using RING_INDEX here, too. */ \
1430 			rd->ch[c].x[0] = rd->ch[c].x[1]; \
1431 			rd->ch[c].x[1] = rd->ch[c].x[2]; \
1432 			rd->ch[c].x[2] = rd->ch[c].x[3]; \
1433 			rd->ch[c].x[3] = rd->ch[c].x[4]; \
1434 			rd->ch[c].x[4] = in[c]; \
1435 		} \
1436 		rd->offset = sampleoff - rd->voutrate; \
1437 	}
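
// Worked example for the offset bookkeeping above (illustrative numbers):
// plain 3x upsampling with vinrate = 1 and voutrate = 3 starts at
// offset = -vinrate = -1. For the first input sample, sampleoff steps through
// 0, 1, 2, emitting three outputs at z = 0/3-0.5, 1/3-0.5 and 2/3-0.5, all
// within [-0.5, 0.5); afterwards offset = 2 - 3 = -1 again, so the pattern
// repeats with voutrate/vinrate outputs per input sample.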
1438 
1439 #define OPT6P5O_END
1440 
static size_t resample_opt6p5o(struct resample_data *rd, float *in
1442 ,	size_t ins, float *out)
1443 {
1444 	size_t outs = 0;
1445 	if(!ins)
1446 		return outs;
1447 	OPT6P5O_BEGIN(in)
1448 #ifndef SYN123_NO_CASES
1449 	switch(rd->channels)
1450 	{
1451 		case  1: for(size_t i=0; i<ins; ++i)
1452 		{
1453 			OPT6P5O_INTERPOL(in, out, outs, 1)
1454 			in++;
1455 		}
1456 		break;
1457 		case  2: for(size_t i=0; i<ins; ++i)
1458 		{
1459 			OPT6P5O_INTERPOL(in, out, outs, 2)
1460 			in += 2;
1461 		}
1462 		break;
1463 		default:
1464 #endif
1465 		for(size_t i=0; i<ins; ++i)
1466 		{
1467 			OPT6P5O_INTERPOL(in, out, outs, rd->channels)
1468 			in += rd->channels;
1469 		}
1470 #ifndef SYN123_NO_CASES
1471 	}
1472 #endif
1473 	OPT6P5O_END
1474 	return outs;
1475 }
1476 
1477 // TODO: Remove the 2batch version after verifying that it doesn't improve things.
static size_t resample_opt6p5o_2batch(struct resample_data *rd, float *in
1479 ,	float *out)
1480 {
1481 	size_t outs = 0;
1482 	OPT6P5O_BEGIN(in)
1483 #ifndef SYN123_NO_CASES
1484 	switch(rd->channels)
1485 	{
1486 		case 1: for(size_t i=0; i<2*BATCH; ++i)
1487 		{
1488 			OPT6P5O_INTERPOL(in, out, outs, 1)
1489 			in++;
1490 		}
1491 		break;
1492 		case 2: for(size_t i=0; i<2*BATCH; ++i)
1493 		{
1494 			OPT6P5O_INTERPOL(in, out, outs, 2)
1495 			in += 2;
1496 		}
1497 		break;
1498 		default:
1499 #endif
1500 		for(size_t i=0; i<2*BATCH; ++i)
1501 		{
1502 			OPT6P5O_INTERPOL(in, out, outs, rd->channels)
1503 			in += rd->channels;
1504 		}
1505 #ifndef SYN123_NO_CASES
1506 	}
1507 #endif
1508 	OPT6P5O_END
1509 	return outs;
1510 }
1511 
1512 // Full resample function with oversampling.
1513 #define RESAMPLE_FULL_2X(name, lpf_2x, lpf, interpolate_2batch, interpolate) \
1514 static size_t name( struct resample_data *rd \
1515 ,	float *in, size_t ins, float *out ) \
1516 { \
1517 	mxdebug(#name " in %zu", ins); \
1518 	float *iin = in; \
1519 	size_t iins = ins; \
1520 	if(!(rd->sflags & inter_flow) && ins) \
1521 		stage_history_init(rd, 0, iin); \
1522 	size_t outs = 0; \
1523 	size_t nouts = 0; \
1524 	size_t blocks = ins/BATCH; \
1525 	for(size_t bi = 0; bi<blocks; ++bi) \
1526 	{ \
1527 		/* One batch in, two batches out. */ \
1528 		mxdebug(#name " batch %zu lowpass " #lpf_2x, bi); \
1529 		lpf_2x(rd, in, BATCH, rd->upbuf); \
1530 		mxdebug(#name " batch %zu interpol " #interpolate_2batch, bi); \
1531 		nouts = interpolate_2batch(rd, rd->upbuf, out); \
1532 		outs += nouts; \
1533 		out  += nouts*rd->channels; \
1534 		in   += BATCH*rd->channels; \
1535 	} \
1536 	ins -= blocks*BATCH; \
1537 	xdebug(#name " end lowpass " #lpf_2x); \
1538 	lpf_2x(rd, in, ins, rd->upbuf); \
1539 	xdebug(#name " end interpol " #interpolate); \
1540 	nouts = interpolate(rd, rd->upbuf, 2*ins, out); \
1541 	outs += nouts; \
1542 	mxdebug(#name " out %zu", outs); \
1543 	stage_history(rd, 0, iin, iins); \
1544 	return outs; \
1545 }
1546 
1547 // Full resample method without oversampling.
1548 #define RESAMPLE_FULL_1X(name, lpf, interpolate) \
1549 static size_t name( struct resample_data *rd \
1550 ,	float *in, size_t ins, float *out ) \
1551 { \
1552 	mxdebug(#name " in %zu", ins); \
1553 	float *iin = in; \
1554 	size_t iins = ins; \
1555 	if(!(rd->sflags & inter_flow) && ins) \
1556 		stage_history_init(rd, 0, iin); \
1557 	size_t outs = 0; \
1558 	size_t nouts = 0; \
1559 	size_t blocks = ins/BATCH; \
1560 	for(size_t bi = 0; bi<blocks; ++bi) \
1561 	{ \
1562 		mxdebug(#name " batch %zu lowpass " #lpf, bi); \
1563 		memcpy(rd->prebuf, in, BATCH*sizeof(*in)*rd->channels); \
1564 		lpf(rd, rd->prebuf, BATCH); \
1565 		mxdebug(#name " batch %zu interpol " #interpolate, bi); \
1566 		nouts = interpolate(rd, rd->prebuf, BATCH, out); \
1567 		outs += nouts; \
1568 		out  += nouts*rd->channels; \
1569 		in   += BATCH*rd->channels; \
1570 	} \
1571 	ins -= blocks*BATCH; \
1572 	xdebug(#name " end lowpass " #lpf); \
1573 	memcpy(rd->prebuf, in, ins*sizeof(*in)*rd->channels); \
1574 	lpf(rd, rd->prebuf, ins); \
1575 	xdebug(#name " end interpol " #interpolate); \
1576 	nouts = interpolate(rd, rd->prebuf, ins, out); \
1577 	outs += nouts; \
1578 	mxdebug(#name " out %zu", outs); \
1579 	stage_history(rd, 0, iin, iins); \
1580 	return outs; \
1581 }
1582 
1583 // Full resample method with decimation.
1584 #define RESAMPLE_FULL_0X(name, lpf, interpolate) \
1585 static size_t name( struct resample_data *rd \
1586 ,	float *in, size_t ins, float *out ) \
1587 { \
1588 	mxdebug(#name " in %zu", ins); \
1589 	float *iin = in; \
1590 	size_t iins = ins; \
1591 	if(!(rd->sflags & inter_flow) && ins) \
1592 		stage_history_init(rd, 0, iin); \
1593 	size_t outs = 0; \
1594 	size_t nouts = 0; \
1595 	size_t blocks = ins/BATCH; \
1596 	for(size_t bi = 0; bi<blocks; ++bi) \
1597 	{ \
1598 		mxdebug( #name " batch %zu decim %d and lowpass " #lpf \
1599 		,	bi, rd->decim_stages ); \
1600 		int fill = BATCH; \
1601 		memcpy(rd->prebuf, in, fill*sizeof(*in)*rd->channels); \
1602 		for(int ds = 0; ds<rd->decim_stages; ++ds) \
1603 			fill = decimate(rd, ds, rd->prebuf, fill); \
1604 		lpf(rd, rd->prebuf, fill); \
1605 		mxdebug(#name " batch %zu interpol " #interpolate, bi); \
1606 		nouts = interpolate(rd, rd->prebuf, fill, out); \
1607 		outs += nouts; \
1608 		out  += nouts*rd->channels; \
1609 		in   += BATCH*rd->channels; \
1610 	} \
1611 	ins -= blocks*BATCH; \
1612 	int fill = ins; \
1613 	mxdebug( #name " end decim %d and lowpass " #lpf \
1614 	, rd->decim_stages ); \
1615 	memcpy(rd->prebuf, in, fill*sizeof(*in)*rd->channels); \
1616 	for(int ds = 0; ds<rd->decim_stages; ++ds) \
1617 		fill = decimate(rd, ds, rd->prebuf, fill); \
1618 	lpf(rd, rd->prebuf, fill); \
1619 	xdebug(#name " end interpol " #interpolate); \
1620 	nouts = interpolate(rd, rd->prebuf, fill, out); \
1621 	outs += nouts; \
1622 	mxdebug(#name " out %zu", outs); \
1623 	stage_history(rd, 0, iin, iins); \
1624 	return outs; \
1625 }
1626 
1627 // Define the set of resamplers for a given set of lowpasses and interpolators.
1628 #define RESAMPLE_FULL( name_2x, name_1x, name_0x \
1629 , lpf_2x, lpf, interpolate_2batch, interpolate ) \
1630 RESAMPLE_FULL_2X(name_2x, lpf_2x, lpf, interpolate_2batch, interpolate) \
1631 RESAMPLE_FULL_1X(name_1x, lpf, interpolate) \
1632 RESAMPLE_FULL_0X(name_0x, lpf, interpolate)
1633 
1634 // Finally define our dirty and fine resamplers.
1635 RESAMPLE_FULL( resample_2x_dirty, resample_1x_dirty, resample_0x_dirty
1636 ,	DIRTY_LOWPASS_PREEMP_2X, DIRTY_LOWPASS_PREEMP
1637 ,	resample_opt4p4o_2batch, resample_opt4p4o )
1638 RESAMPLE_FULL( resample_2x_fine,  resample_1x_fine,  resample_0x_fine
1639 ,	LOWPASS_PREEMP_2X, LOWPASS_PREEMP
1640 ,	resample_opt6p5o_2batch, resample_opt6p5o )
1641 
1642 // Here I go and implement a bit of big number functionality.
1643 // The task is just to truly be able to compute a*b/c with unsigned
1644 // integers for cases where the result does not overflow but the intermediate
1645 // product may not fit into the type.
1646 	// I do not want to rely on a 128 bit type being available, so I take
1647 // uint64_t and run with it.
1648 
1649 // Multiply two 64 bit unsigned integers and return the result as two
1650 // halves of 128 bit.
1651 static void mul64(uint64_t a, uint64_t b, uint64_t * m1, uint64_t * m0)
1652 {
1653 	uint64_t lowmask = ((uint64_t)1 << 32) - 1;
1654 	uint64_t a1 = a >> 32;
1655 	uint64_t a0 = a & lowmask;
1656 	uint64_t b1 = b >> 32;
1657 	uint64_t b0 = b & lowmask;
1658 	uint64_t prod1 = a1 * b1;
1659 	uint64_t prod0 = a0 * b0;
1660 	uint64_t a1b0 = a1 * b0;
1661 	uint64_t a0b1 = a0 * b1;
1662 	prod1 += a0b1 >> 32;
1663 	prod1 += a1b0 >> 32;
1664 	uint64_t prod0t = prod0 + ((a0b1 & lowmask) << 32);
1665 	if(prod0t < prod0)
1666 		prod1 += 1;
1667 	prod0 = prod0t + ((a1b0 & lowmask) << 32);
1668 	if(prod0 < prod0t)
1669 		prod1 += 1;
1670 	*m1 = prod1;
1671 	*m0 = prod0;
1672 }
1673 
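// A disabled sketch (following the simulate_interpolate block further down):
// where the compiler happens to offer unsigned __int128 -- an assumption, the
// code above deliberately does not require it -- mul64() can be cross-checked
// against a native 128 bit multiplication.
#if 0
static int mul64_selftest(void)
{
	const uint64_t a = UINT64_C(0x123456789abcdef0);
	const uint64_t b = UINT64_C(0xfedcba9876543210);
	uint64_t hi, lo;
	mul64(a, b, &hi, &lo);
	unsigned __int128 ref = (unsigned __int128)a * b;
	return hi == (uint64_t)(ref >> 64) && lo == (uint64_t)ref;
}
#endif
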
1674 // Multiply two unsigned integers and divide by a third one,
1675 // returning the result in the same integer width, if no overflow
1676 // occurs. Also catches division by zero. On error, zero is returned
1677 // and the error code set to a non-zero value (1==overflow, 2==divbyzero).
1678 // The division result is computed on two halves of 128 bit internally,
1679 // and discarded if the higher half contains set bits.
1680 // Also, if m != NULL, the remainder is stored there.
1681 // The actually interesting function for the interpolation:
1682 // e = (a*b+off)/d
1683 // This is bounded by zero: If (a*b+off) < 0, this returns just
1684 // zero, also zero remainder.
1685 static uint64_t muloffdiv64( uint64_t a, uint64_t b, int64_t off, uint64_t c
1686 ,	int *err, uint64_t *m )
1687 {
1688 	uint64_t prod1, prod0;
1689 	uint64_t div1, div0, div0old;
1690 	if(c < 1)
1691 	{
1692 		if (err)
1693 			*err = 2;
1694 		return 0;
1695 	}
1696 	mul64(a, b, &prod1, &prod0);
1697 	if(off)
1698 	{
1699 		uint64_t prod0old = prod0;
1700 		prod0 += off;
1701 		// Offset can be positive or negative, small or large.
1702 		// When adding to prod0, these cases can happen:
1703 		// offset > 0, result > prod0: All fine.
1704 		// offset > 0, result < prod0: Overflow.
1705 		// offset < 0, result < prod0: All fine.
1706 		// offset < 0, result > prod0: Underflow.
1707 		if(off > 0 && prod0 < prod0old)
1708 		{
1709 			if(prod1 == UINT64_MAX)
1710 			{
1711 				if(err)
1712 					*err = 1;
1713 				return 0;
1714 			}
1715 			++prod1;
1716 		}
1717 		if(off < 0 && prod0 > prod0old)
1718 		{
1719 			// Pin to zero on total underflow.
1720 			if(prod1 == 0)
1721 				prod0 = 0;
1722 			else
1723 				--prod1;
1724 		}
1725 	}
1726 	if(c == 1)
1727 	{
1728 		div1 = prod1;
1729 		div0 = prod0;
1730 		if(m)
1731 			*m = 0;
1732 	} else
1733 	{
1734 		div1 = 0;
1735 		div0 = 0;
1736 		uint64_t ctimes = ((uint64_t) - 1) / c;
1737 		uint64_t cblock = ctimes * c;
1738 		while(prod1)
1739 		{
1740 			uint64_t cmult1, cmult0;
1741 			mul64(ctimes, prod1, &cmult1, &cmult0);
1742 			div1 += cmult1;
1743 			div0old = div0;
1744 			div0 += cmult0;
1745 			if(div0 < div0old)
1746 				div1++;
1747 			mul64(cblock, prod1, &cmult1, &cmult0);
1748 			prod1 -= cmult1;
1749 			if(prod0 < cmult0)
1750 				prod1--;
1751 			prod0 -= cmult0;
1752 		}
1753 		div0old = div0;
1754 		div0 += prod0 / c;
1755 		if(m)
1756 			*m = prod0 % c;
1757 		if(div0 < div0old)
1758 			div1++;
1759 	}
1760 	if(div1)
1761 	{
1762 		if(err)
1763 			*err = 1;
1764 		return 0;
1765 	}
1766 	if(err)
1767 		*err = 0;
1768 	return div0;
1769 }
1770 
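// A small disabled usage sketch for muloffdiv64() with numbers small enough
// to verify by hand; the values are made up purely for illustration.
#if 0
static void muloffdiv64_example(void)
{
	int err;
	uint64_t rem;
	// (10*7 + 5)/4 = 75/4 = 18, remainder 3, no error.
	uint64_t e = muloffdiv64(10, 7, 5, 4, &err, &rem);
	// A negative offset larger than the product pins the result to zero:
	// (10*7 - 100)/4 -> 0, remainder 0, no error.
	e = muloffdiv64(10, 7, -100, 4, &err, &rem);
	(void)e;
}
#endif
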
1771 // Ensure that the sample rates are not above half of the
1772 // possible range of the used data type, including a safeguard
1773 // that 64 bits will always be enough to work with the values.
1774 // This limits you to 63 bits precision in your sampling rate
1775 // ratio for all times, or a maximum speedup/slowdown by 4.6e18.
1776 // This is still a rather big number.
1777 #if LONG_MAX > INT64_MAX
1778 #define RATE_LIMIT INT64_MAX/2
1779 #else
1780 #define RATE_LIMIT LONG_MAX/2
1781 #endif
1782 
1783 long attribute_align_arg
1784 syn123_resample_maxrate(void)
1785 {
1786 	return RATE_LIMIT;
1787 }
1788 
1789 // Settle oversampling and decimation stages for the given
1790 // input/output rate ratio.
1791 static int rate_setup( long inrate, long outrate
1792 ,	int *oversample, unsigned int *decim_stages )
1793 {
1794 	// Want enough headroom so that the sum of two rates can be represented.
1795 	// And of course we don't do reverse or frozen time.
1796 	if( inrate < 1 || inrate  > RATE_LIMIT ||
1797 	   outrate < 1 || outrate > RATE_LIMIT  )
1798 		return -1;
1799 	*oversample = outrate*2 > inrate ? 1 : 0;
1800 	*decim_stages = 0;
1801 	if(outrate <= LONG_MAX/4) while(outrate*4 < inrate)
1802 	{
1803 		(*decim_stages)++;
1804 		outrate *= 2;
1805 	}
1806 	return 0;
1807 }
1808 
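// Disabled sketch: what rate_setup() yields for a few example rate pairs.
// The concrete rates are arbitrary illustrations, not special cases.
#if 0
static void rate_setup_examples(void)
{
	int over;
	unsigned int decim;
	rate_setup(44100, 48000, &over, &decim); // upsampling: over=1, decim=0
	rate_setup(48000, 16000, &over, &decim); // 1/3 rate:   over=0, decim=0
	rate_setup(96000,  8000, &over, &decim); // 1/12 rate:  over=0, decim=2
}
#endif
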
1809 // Revisit the cases:
1810 // 2x: to keep the implementation free, one must assume to be able to hold
1811 //     2*ins in the data type, then 2*ins*outrate/(2*inrate)+1 = ins*outrate/inrate+1
1812 // 1x: no doubling, just ins/inrate*outrate+1
1813 //     but since the rate to interpolate to is <= 1/2 input, outsamples
1814 //     always will be less, so SIZE_MAX
1815 // 0x: output < 1/4 input. Rounding error of 2 does not matter. Things get smaller,
1816 //     so SIZE_MAX.
1817 
1818 size_t attribute_align_arg
1819 syn123_resample_maxincount(long inrate, long outrate)
1820 {
1821 	int oversample;
1822 	unsigned int decim_stages;
1823 	if(rate_setup(inrate, outrate, &oversample, &decim_stages))
1824 		return 0;
1825 	if(oversample)
1826 	{
1827 		uint64_t outlimit;
1828 		// I'm crazy. I am planning for size_t being more than 64 bits.
1829 		if(SIZE_MAX-1 > UINT64_MAX)
1830 			outlimit = UINT64_MAX;
1831 		else
1832 			outlimit = SIZE_MAX-1;
1833 		// SIZE_MAX = ins*outrate/inrate+1
1834 		// SIZE_MAX-1 = ins*outrate/inrate
1835 		// (SIZE_MAX-1)*inrate = ins*outrate
1836 		// ins = (SIZE_MAX-1)*inrate/outrate
1837 		// Integer math always rounding down, this must be below the limit.
1838 		int err;
1839 		uint64_t maxin = muloffdiv64(outlimit, inrate, 0, outrate, &err, NULL);
1840 		// The only possible error is genuine overflow. Meaning: We can
1841 		// give as much input as can be represented in the variable.
1842 		if(err)
1843 			maxin = UINT64_MAX;
1844 		return maxin > SIZE_MAX ? SIZE_MAX : (size_t)maxin;
1845 	}
1846 	else
1847 		return SIZE_MAX;
1848 }
1849 
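// Disabled sanity sketch for the limit above: for the maximum input count,
// the prediction ins*outrate/inrate+1 must still fit the return type. The
// check just reuses muloffdiv64(); the rates are arbitrary. It is only
// meaningful for the oversampled case, where the limit is finite.
#if 0
static void maxincount_check(long inrate, long outrate)
{
	size_t maxin = syn123_resample_maxincount(inrate, outrate);
	int err;
	uint64_t predicted = muloffdiv64(maxin, outrate, 0, inrate, &err, NULL);
	// expect: err == 0 and predicted+1 representable in size_t
	(void)predicted;
}
#endif
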
1850 // The history is returned as size_t, which usually provides plenty of range.
1851 // The idea is that the history needs to fit into memory, although that is not
1852 // really the case. It is just impractical to consider the full history of
1853 // extreme resampling ratios.
1854 size_t attribute_align_arg
1855 syn123_resample_history(long inrate, long outrate, int dirty)
1856 {
1857 	int oversample;
1858 	unsigned int decim_stages;
1859 	if(rate_setup(inrate, outrate, &oversample, &decim_stages))
1860 		return 0;
1861 	if(oversample && decim_stages)
1862 		return 0;
1863 	// Either 4p4o or 6p5o interpolation at the end. We only need the points
1864 	// before the current sample in the history, so one less.
1865 	size_t history = (dirty ? DIRTY_POINTS : FINE_POINTS) - 1;
1866 	// For the oldest of these, we need a history of input before it to
1867 	// fill the low pass histories. The number of previous low pass points
1868 	// simply comes on top. The number of low pass applications does not matter.
1869 	history += LPF_HISTORY(LPF_ORDER);
1870 
1871 	if(oversample)
1872 	{
1873 		// Need only half the input, but no less.
1874 		history = (history+1)/2;
1875 	}
1876 	// Here, the numbers can potentially become very large.
1877 	// Each decimation step doubles the needed input samples and
1878 	// adds the state of the decimation filter.
1879 	while(decim_stages--)
1880 	{
1881 		// Reverse decimation ...
1882 		if(history > SIZE_MAX/2+1)
1883 			return SIZE_MAX;
1884 		history = (history-1) + history;
1885 		if(history > SIZE_MAX-LPF_HISTORY(LPF_4_ORDER))
1886 			return SIZE_MAX;
1887 		history += LPF_HISTORY(LPF_4_ORDER);
1888 	}
1889 	return history;
1890 }
1891 
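// Disabled illustration of how the history requirement behaves: it stays
// small for upsampling and grows with every decimation stage. The rates are
// arbitrary examples; exact values depend on the filter orders defined above.
#if 0
static void history_examples(void)
{
	size_t h_up   = syn123_resample_history(44100, 192000, 0); // oversampled path
	size_t h_down = syn123_resample_history(192000, 8000, 0);  // three decimation stages
	// expect: h_down considerably larger than h_up
	(void)h_up;
	(void)h_down;
}
#endif
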
1892 #if 0
1893 // This should be the essence of what the interpolation does regarding
1894 // input and output sample count.
1895 int64_t simulate_interpolate(long vinrate, long voutrate, int64_t ins)
1896 {
1897 	int64_t outs = 0;
1898 	long offset = -vinrate;
1899 	for(int64_t n = 0; n<ins; ++n)
1900 	{
1901 		while(offset+vinrate < voutrate)
1902 		{
1903 			++outs;
1904 			offset += vinrate;
1905 		}
1906 		offset -= voutrate;
1907 	}
1908 	return outs;
1909 }
1910 #endif
1911 
1912 // The exact output sample count given total input size.
1913 // It assumes zero history.
1914 // This returns a negative code on error.
1915 // The combination of interpolation and stages of decimation calls
1916 // for more than simple integer division. The overall rounding error
1917 // is limited to 2 samples, btw.
1918 int64_t attribute_align_arg
1919 syn123_resample_total_64(long inrate, long outrate, int64_t ins)
1920 {
1921 	if(ins < 0)
1922 		return -1;
1923 	int oversample;
1924 	unsigned int decim_stages;
1925 	if(rate_setup(inrate, outrate, &oversample, &decim_stages))
1926 		return SYN123_BAD_FMT;
1927 	if(oversample && decim_stages)
1928 		return SYN123_WEIRD;
1929 
1930 	uint64_t vins = ins;
1931 	uint64_t vinrate = inrate;
1932 	uint64_t voutrate = outrate;
1933 	if(decim_stages)
1934 	{
1935 		// The interpolation uses the virtual output rate to get the same
1936 		// ratio as with decimated input rate.
1937 		voutrate <<= decim_stages;
1938 		// The first sample of a block gets used, the following are dropped.
1939 		// So I get one sample out of the first input sample, regardless of
1940 		// decimation stage count. But the next sample comes only after
1941 		// 2^decim_stages further input samples.
1942 		// The formula for one stage: y = (x+1)/2
1943 		// For n stages ... my guess is wrong. Heck, n is small, less than
1944 		// 64 (see RATE_LIMIT). Let's just mimic the decimations in a loop.
1945 		// Since vins < UINT64_MAX/2, adding 1 is no issue.
1946 		for(unsigned int i=0; i<decim_stages; ++i)
1947 			vins = (vins+1)/2;
1948 	}
1949 	if(oversample)
1950 	{
1951 		vins    *= 2;
1952 		vinrate *= 2;
1953 	}
1954 	int err;
1955 	// Interpolation model:
1956 	//   outs = (vins*voutrate - offset - 1)/vinrate
1957 	// with offset = -vinrate at the beginning. This is the rounding, no
1958 	// need to deal with the remainder.
1959 	uint64_t tot = muloffdiv64(vins, voutrate, vinrate-1, vinrate, &err, NULL);
1960 	// Any error is treated as overflow (div by zero -> infinity).
1961 	if(err)
1962 		return SYN123_OVERFLOW;
1963 	return (tot <= INT64_MAX) ? (int64_t)tot : SYN123_OVERFLOW;
1964 }
1965 
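// Disabled cross-check against the simulate_interpolate() model above. Only
// valid for the plain 1x case (no oversampling, no decimation), where the
// virtual rates equal the real ones.
#if 0
static int total_matches_simulation(long inrate, long outrate, int64_t ins)
{
	return syn123_resample_total_64(inrate, outrate, ins)
	==	simulate_interpolate(inrate, outrate, ins);
}
#endif
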
1966 int32_t attribute_align_arg
1967 syn123_resample_total_32(int32_t inrate, int32_t outrate, int32_t ins)
1968 {
1969 	int64_t tot = syn123_resample_total_64(inrate, outrate, ins);
1970 	return (tot <= INT32_MAX) ? (int32_t)tot : SYN123_OVERFLOW;
1971 }
1972 
1973 lfs_alias_t syn123_resample_total(long inrate, long outrate, lfs_alias_t ins)
1974 {
1975 #if   LFS_ALIAS_BITS+0 == 64
1976 	return syn123_resample_total_64(inrate, outrate, ins);
1977 #elif LFS_ALIAS_BITS+0 == 32
1978 	return syn123_resample_total_32(inrate, outrate, ins);
1979 #else
1980 	#error "Unexpected LFS_ALIAS_BITS value."
1981 #endif
1982 }
1983 
1984 
1985 // The inverse function: How many input samples are needed to get at least
1986 // the desired amount of output?
1987 int64_t attribute_align_arg
1988 syn123_resample_intotal_64(long inrate, long outrate, int64_t outs)
1989 {
1990 	if(outs < 1)
1991 		return outs == 0 ? 0 : -1;
1992 	int oversample;
1993 	unsigned int decim_stages;
1994 	if(rate_setup(inrate, outrate, &oversample, &decim_stages))
1995 		return SYN123_BAD_FMT;
1996 	if(oversample && decim_stages)
1997 		return SYN123_WEIRD;
1998 	uint64_t voutrate = outrate;
1999 	voutrate <<= decim_stages;
2000 	uint64_t vinrate  = inrate;
2001 	if(oversample)
2002 		vinrate *= 2;
2003 	// This works for outs > 0:
2004 	// ins = (outs*inrate+offset)/outrate+1
2005 	// First offset is -inrate.
2006 	// You may want to work it out for yourself. Or trust me;-)
2007 	int err;
2008 	uint64_t vtot = muloffdiv64(outs, vinrate, -(int64_t)vinrate, voutrate, &err, NULL);
2009 	if(err)
2010 		return SYN123_OVERFLOW;
2011 	if(vtot == UINT64_MAX)
2012 		return SYN123_OVERFLOW;
2013 	++vtot; // Must be at least 1 now!
2014 	uint64_t tot = vtot;
2015 	// Un-do oversampling, taking care not to hit overflow.
2016 	if(oversample)
2017 	{
2018 		tot /= 2;
2019 		if(tot*2 < vtot)
2020 			++tot;
2021 	}
2022 	// Now tot is the minimum needed input sample count before interpolation.
2023 	// Decimation still needs to be accounted for.
2024 	// With decimation, I need the smallest input that fulfills out=(2*in+1)/2.
2025 	for(unsigned int i=0; i<decim_stages; ++i)
2026 	{
2027 		if(!tot)
2028 			return 0;
2029 		if(tot > UINT64_MAX/2+1)
2030 			return SYN123_OVERFLOW;
2031 		tot = (tot - 1) + tot;
2032 	}
2033 	return (tot <= INT64_MAX) ? (int64_t)tot : SYN123_OVERFLOW;
2034 }
2035 
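// Disabled round-trip sketch: the minimal input count predicted here should
// produce at least the requested output when handed back to
// syn123_resample_total_64(). Error codes (negative) are simply passed over.
#if 0
static int intotal_roundtrip(long inrate, long outrate, int64_t outs)
{
	int64_t need = syn123_resample_intotal_64(inrate, outrate, outs);
	return need < 0
	||	syn123_resample_total_64(inrate, outrate, need) >= outs;
}
#endif
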
2036 int32_t attribute_align_arg
2037 syn123_resample_intotal_32(int32_t inrate, int32_t outrate, int32_t outs)
2038 {
2039 	int64_t tot = syn123_resample_intotal_64(inrate, outrate, outs);
2040 	return (tot <= INT32_MAX) ? (int32_t)tot : SYN123_OVERFLOW;
2041 }
2042 
2043 lfs_alias_t syn123_resample_intotal(long inrate, long outrate, lfs_alias_t outs)
2044 {
2045 #if   LFS_ALIAS_BITS+0 == 64
2046 	return syn123_resample_intotal_64(inrate, outrate, outs);
2047 #elif LFS_ALIAS_BITS+0 == 32
2048 	return syn123_resample_intotal_32(inrate, outrate, outs);
2049 #else
2050 	#error "Unexpected LFS_ALIAS_BITS value."
2051 #endif
2052 }
2053 
2054 // As any sensible return value is at least 1, this uses the unsigned
2055 // type and 0 for error/pathological input.
2056 // This function could be simplified to
2057 //     uintmax_t count = prod/inrate + (prod%inrate ? 1 : 0) + 1;
2058 // to accommodate possible rounding during decimation and interpolation,
2059 // but it would be a teensy bit less accurate. The computation time
2060 // should not matter much. The simple formula can be used as a test
2061 // to check if the elaborate function comes to a value that only
2062 // sometimes is less by 1, but not more.
2063 size_t attribute_align_arg
2064 syn123_resample_count(long inrate, long outrate, size_t ins)
2065 {
2066 	// The largest output you get from the beginning. So this can just treat ins
2067 	// as offset from there.
2068 	if( ins > syn123_resample_maxincount(inrate, outrate)
2069 #if SIZE_MAX > INT64_MAX
2070 		|| ins > INT64_MAX
2071 #endif
2072 	)
2073 		return 0;
2074 	int64_t tot = syn123_resample_total_64(inrate, outrate, (int64_t)ins);
2075 	return (tot >= 0 && tot <= SIZE_MAX) ? (size_t)tot : 0;
2076 }
2077 
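// The test hinted at in the comment above, as a disabled sketch: the simple
// bound prod/inrate + (prod%inrate ? 1 : 0) + 1 with prod = ins*outrate should
// never be below the exact count and at most 1 above it (for valid input).
#if 0
static int count_vs_simple_bound(long inrate, long outrate, size_t ins)
{
	int err;
	uint64_t rem;
	uint64_t q = muloffdiv64(ins, outrate, 0, inrate, &err, &rem);
	if(err)
		return 1; // bound not representable, nothing to compare
	uint64_t simple = q + (rem ? 1 : 0) + 1;
	size_t exact = syn123_resample_count(inrate, outrate, ins);
	if(!exact)
		return 1; // pathological input, nothing to compare
	return exact <= simple && simple <= exact+1;
}
#endif
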
2078 size_t attribute_align_arg
2079 syn123_resample_incount(long inrate, long outrate, size_t outs)
2080 {
2081 	// I had logic here about abusing syn123_resample_intotal(), but that is wrong.
2082 	// Do not assume you know what interval to pick to get the maximum value.
2083 	// Work the algorithm for the theoretical maximum at each point, even if that
2084 	// may not be hit with your particular rate ratio and playback time.
2085 #if SIZE_MAX > INT64_MAX
2086 	if(outs > INT64_MAX-1)
2087 		return 0;
2088 #endif
2089 	int oversample;
2090 	unsigned int decim_stages;
2091 	if(rate_setup(inrate, outrate, &oversample, &decim_stages))
2092 		return 0;
2093 	if(oversample && decim_stages)
2094 		return 0;
2095 	uint64_t voutrate = outrate;
2096 	voutrate <<= decim_stages;
2097 	uint64_t vinrate  = inrate;
2098 	if(oversample)
2099 		vinrate *= 2;
2100 	// This works for outs > 0:
2101 	// ins = (outs*inrate+offset)/outrate+1
2102 	// First offset is -inrate, maximum offset is -1. I think.
2103 	// You may want to work it out for yourself. Or trust me;-)
2104 	int err;
2105 	uint64_t vtot = muloffdiv64(outs, vinrate, -1, voutrate, &err, NULL);
2106 	if(err)
2107 		return SYN123_OVERFLOW;
2108 	if(vtot == UINT64_MAX)
2109 		return SYN123_OVERFLOW;
2110 	++vtot; // Must be at least 1 now!
2111 	uint64_t tot = vtot;
2112 	// Un-do oversampling, taking care not to hit overflow.
2113 	if(oversample)
2114 	{
2115 		tot /= 2;
2116 		if(tot*2 < vtot)
2117 			++tot;
2118 	}
2119 	// Now tot is the minimum needed input sample count before interpolation.
2120 	// Decimation still needs to be accounted for. Worst case: We got no
2121 	// sample buffered (dropped) in any stage: So just multiply.
2122 	if(tot > UINT64_MAX>>decim_stages)
2123 		return SYN123_OVERFLOW;
2124 	tot<<=decim_stages;
2125 	return (tot <= SIZE_MAX) ? (size_t)tot : 0;
2126 }
2127 
2128 // A bit of convenience. Just rely on the other functions.
2129 size_t attribute_align_arg
2130 syn123_resample_fillcount(long input_rate, long output_rate, size_t outs)
2131 {
2132 	size_t block = syn123_resample_incount(input_rate, output_rate, outs);
2133 	// Reduce that to ensure that we never get more than a buffer's fill.
2134 	while( block &&
2135 		syn123_resample_count(input_rate, output_rate, block) > outs )
2136 		--block;
2137 	return block;
2138 }
2139 
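// Disabled usage sketch: pick the input block size that just fits a fixed
// output buffer. The buffer size is an arbitrary example.
#if 0
static void fillcount_usage(long inrate, long outrate)
{
	const size_t outblock = 1024; // output frames available per round
	size_t inblock = syn123_resample_fillcount(inrate, outrate, outblock);
	// Resampling inblock input frames now yields at most outblock output
	// frames, so outblock*channels floats of output storage always suffice.
	(void)inblock;
}
#endif
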
2140 // The exact predictor: How many output samples will I get _now_
2141 // after feeding the indicated amount? This is concerned with
2142 // buffer sizes, so let's drop the 32/64 bit distinction.
2143 // Since overflow can occur, we need a sign bit to signal errors.
2144 ssize_t attribute_align_arg
2145 syn123_resample_expect(syn123_handle *sh, size_t ins)
2146 {
2147 	if(!sh || !sh->rd)
2148 		return SYN123_BAD_HANDLE;
2149 	if(ins < 1)
2150 		return 0;
2151 	struct resample_data *rd = sh->rd;
2152 	// Need to account for the current resampler state. That is:
2153 	// 1. decimation stages having swallowed a sample or not,
2154 	// 2. current interpolator offset.
2155 	// The first sample of a block gets used, the following are dropped.
2156 	// So I get one sample out of the first input sample, regardless of
2157 	// decimation stage count. But the next sample comes only after
2158 	// 2^decim_stages further input samples.
2159 	// The formula for one stage: y = (x+1)/2
2160 	// For n stages ... my guess is wrong. Heck, n is small, less than
2161 	// 64 (see RATE_LIMIT). Let's just mimic the decimations in a loop.
2162 	// Since vins < UINT64_MAX/2, adding 1 is no issue.
2163 	for(unsigned int i=0; i<rd->decim_stages; ++i)
2164 	{
2165 		// If the stage is fresh or explicitly has a past sample in memory,
2166 		// we can add one virtual input sample. Divide first to avoid overflow.
2167 		size_t nins = ins / 2;
2168 		// When there is one sample left over, a second one from storage
2169 		// adds to the decimated count.
2170 		if( nins*2 < ins && (!(rd->decim[i].sflags & lowpass_flow)
2171 		||	(rd->decim[i].sflags & decimate_store)) )
2172 			++nins;
2173 		ins = nins;
2174 	}
2175 #if SIZE_MAX > UINT64_MAX
2176 	if(ins > UINT64_MAX) // Really?
2177 		return SYN123_OVERFLOW;
2178 #endif
2179 	uint64_t vins = ins;
2180 	if(rd->sflags & oversample_2x)
2181 	{
2182 		if(vins > UINT64_MAX/2)
2183 			return SYN123_OVERFLOW;
2184 		vins *= 2;
2185 	}
2186 	int err;
2187 	int64_t offset = rd->sflags & inter_flow ? rd->offset : -rd->vinrate;
2188 	// Interpolation model:
2189 	//   outs = (vins*voutrate - offset - 1)/vinrate
2190 	uint64_t tot = muloffdiv64( vins, (uint64_t)rd->voutrate
2191 	,	(int64_t)(-offset-1), (uint64_t)rd->vinrate, &err, NULL );
2192 	// Any error is treated as overflow (div by zero -> infinity).
2193 	if(err)
2194 		return SYN123_OVERFLOW;
2195 	return (tot <= SSIZE_MAX) ? (ssize_t)tot : SYN123_OVERFLOW;
2196 }
2197 
2198 // How many input samples are minimally needed to get at least the
2199 // minimally desired output right now?
2200 ssize_t attribute_align_arg
2201 syn123_resample_inexpect(syn123_handle *sh, size_t outs)
2202 {
2203 	if(!sh || !sh->rd)
2204 		return SYN123_BAD_HANDLE;
2205 	if(outs < 1)
2206 		return 0;
2207 	struct resample_data *rd = sh->rd;
2208 	// This works for outs > 0:
2209 	// ins = (outs*inrate+offset)/outrate+1
2210 	int err;
2211 	int64_t offset = rd->sflags & inter_flow ? rd->offset : -rd->vinrate;
2212 	uint64_t vtot = muloffdiv64(outs, sh->rd->vinrate, offset, rd->voutrate
2213 	,	&err, NULL);
2214 	if(err)
2215 		return SYN123_OVERFLOW;
2216 	if(vtot == UINT64_MAX)
2217 		return SYN123_OVERFLOW;
2218 	++vtot;
2219 	uint64_t tot = vtot;
2220 	// Un-do oversampling, taking care not to hit overflow.
2221 	if(rd->sflags & oversample_2x)
2222 	{
2223 		tot /= 2;
2224 		if(tot*2 < vtot)
2225 			++tot;
2226 	}
2227 	// Still: tot > 0
2228 	// Got the needed output of the decimation; for each stage
2229 	// a) need twice that amount if there is no sample buffered
2230 	// b) need one less than twice the amount if there is a sample buffered
2231 	for(unsigned int j=0; j<rd->decim_stages; ++j)
2232 	{
2233 		unsigned int i = rd->decim_stages - 1 - j;
2234 		if(tot > UINT64_MAX/2+1)
2235 			return SYN123_OVERFLOW;
2236 		tot = (tot - 1) + tot;
2237 		if( (rd->decim[i].sflags & lowpass_flow)
2238 		&&	!(rd->decim[i].sflags & decimate_store) )
2239 		{
2240 			if(tot == UINT64_MAX)
2241 				return SYN123_OVERFLOW;
2242 			++tot;
2243 		}
2244 	}
2245 	return (tot <= SSIZE_MAX) ? (ssize_t)tot : SYN123_OVERFLOW;
2246 }
2247 
2248 void resample_free(struct resample_data *rd)
2249 {
2250 	if(!rd)
2251 		return;
2252 	if(rd->stage_history)
2253 		free(rd->stage_history);
2254 	if(rd->decim_hist)
2255 		free(rd->decim_hist);
2256 	if(rd->decim)
2257 		free(rd->decim);
2258 	if(rd->upbuf)
2259 		free(rd->upbuf);
2260 	if(rd->prebuf)
2261 		free(rd->prebuf);
2262 	if(rd->ch)
2263 		free(rd->ch);
2264 	if(rd->frame)
2265 		free(rd->frame);
2266 	free(rd);
2267 }
2268 
2269 // Want to support smooth rate changes.
2270 // If there is a handle present, try to keep as much as possible.
2271 // If you change the dirty flag, things get refreshed totally.
2272 int attribute_align_arg
2273 syn123_setup_resample( syn123_handle *sh, long inrate, long outrate
2274 ,	int channels, int dirty, int smooth )
2275 {
2276 	int err = 0;
2277 	struct resample_data *rd = NULL;
2278 	int fresh = 0;
2279 
2280 	if(inrate == 0 && outrate == 0 && channels == 0)
2281 	{
2282 		err = SYN123_BAD_FMT;
2283 		goto setup_resample_cleanup;
2284 	}
2285 
2286 	if(channels < 1)
2287 	{
2288 		err = SYN123_BAD_FMT;
2289 		goto setup_resample_cleanup;
2290 	}
2291 
2292 	// If the dirty setup changed, start anew.
2293 	// TODO: implement proper smooth rate change.
2294 	if( sh->rd && ( (sh->rd->channels != channels)
2295 	||	(!(sh->rd->sflags & dirty_method) != !dirty)
2296 	||	(!(sh->rd->sflags & smooth_change) != !smooth) ) )
2297 	{
2298 		resample_free(sh->rd);
2299 		sh->rd = NULL;
2300 	}
2301 
2302 	int oversample;
2303 	unsigned int decim_stages;
2304 	if(rate_setup(inrate, outrate, &oversample, &decim_stages))
2305 		return SYN123_BAD_FMT;
2306 	if(oversample && decim_stages)
2307 		return SYN123_WEIRD;
2308 	long voutrate = outrate<<decim_stages;
2309 	long vinrate  = oversample ? inrate*2 : inrate;
2310 	if(!sh->rd)
2311 	{
2312 		fresh = 1;
2313 		sh->rd = malloc(sizeof(*(sh->rd)));
2314 		if(!sh->rd)
2315 			return SYN123_DOOM;
2316 		memset(sh->rd, 0, sizeof(*(sh->rd)));
2317 		rd = sh->rd;
2318 		rd->inrate = rd->vinrate = rd->voutrate = rd->outrate = -1;
2319 		rd->resample_func = NULL;
2320 		rd->lowpass_func    = dirty ? DIRTY_LOWPASS_PREEMP    : LOWPASS_PREEMP;
2321 		rd->lowpass_2x_func = dirty ? DIRTY_LOWPASS_PREEMP_2X : LOWPASS_PREEMP_2X;
2322 		rd->decim = NULL;
2323 		rd->decim_hist = NULL;
2324 		rd->channels = channels;
2325 		rd->stage_history = NULL;
2326 		DE_DENORM_INIT(rd->dede, +1)
2327 		rd->frame = NULL;
2328 #ifndef SYN123_NO_CASES
2329 		if(channels > 2)
2330 #endif
2331 		{
2332 			rd->frame = malloc(sizeof(float)*channels);
2333 			if(!rd->frame)
2334 			{
2335 				free(rd);
2336 				return SYN123_DOOM;
2337 			}
2338 		}
2339 		rd->ch = malloc(sizeof(struct channel_history)*channels);
2340 		rd->prebuf = malloc(sizeof(float)*channels*BATCH);
2341 		rd->upbuf  = malloc(sizeof(float)*channels*2*BATCH);
2342 		if(!rd->ch || !rd->prebuf || !rd->upbuf)
2343 		{
2344 			if(rd->upbuf)
2345 				free(rd->upbuf);
2346 			if(rd->prebuf)
2347 				free(rd->prebuf);
2348 			if(rd->ch)
2349 				free(rd->ch);
2350 			if(rd->stage_history)
2351 				free(rd->stage_history);
2352 			if(rd->frame)
2353 				free(rd->frame);
2354 			free(rd);
2355 			return SYN123_DOOM;
2356 		}
2357 		memset(rd->ch, 0, sizeof(struct channel_history)*channels);
2358 	}
2359 	else
2360 		rd = sh->rd;
2361 
2362 	mdebug("init in %ld out %ld", inrate, outrate);
2363 	// To support on-the-fly rate changes, superfluous decimator stages are
2364 	// forgotten, new ones added as needed.
2365 	// If smoothing is desired and the stream already flowing, new stages will
2366 	// get started on the previously utmost stage's history.
2367 	if(fresh || decim_stages != rd->decim_stages)
2368 	{
2369 		if(smooth)
2370 		{
2371 			float *sth = safe_realloc( rd->stage_history
2372 			,	sizeof(float)*STAGE_HISTORY*channels*(decim_stages+1) );
2373 			if(!sth)
2374 			{
2375 				err = SYN123_DOOM;
2376 				goto setup_resample_cleanup;
2377 			}
2378 			rd->stage_history = sth;
2379 		}
2380 		struct decimator_state *nd = safe_realloc( rd->decim
2381 		,	sizeof(*rd->decim)*decim_stages );
2382 		struct lpf4_hist *ndh = safe_realloc( rd->decim_hist
2383 		,	sizeof(*rd->decim_hist)*decim_stages*channels );
2384 		if(nd)
2385 			rd->decim = nd;
2386 		if(ndh)
2387 			rd->decim_hist = ndh;
2388 		if(!nd || !ndh)
2389 		{
2390 			perror("cannot allocate decimator state");
2391 			err = SYN123_DOOM;
2392 			goto setup_resample_cleanup;
2393 		}
2394 		// Link up the common memory blocks after each realloc.
2395 		for(unsigned int dc=0; dc<decim_stages; ++dc)
2396 		{
2397 			rd->decim[dc].ch = ndh+dc*channels;
2398 			rd->decim[dc].out_hist = rd->stage_history
2399 			?	rd->stage_history+(dc+1)*STAGE_HISTORY*channels
2400 			:	NULL;
2401 		}
2402 		// Smoothness does matter if the interpolator produced something
2403 		// previously, so inter_flow is a good flag to test.
2404 		int init_stages = rd->stage_history && rd->sflags & inter_flow;
2405 		// The situation: Got stage rd->decim_stages (possibly zero).
2406 		// with proper output history. Need to start up the additional
2407 	// stages with the output of the preceding stages. I'll pretend
2408 		// each stage history being complete, even if I know that I am
2409 		// halving the count in each step.
2410 		for(unsigned int dc=rd->decim_stages; dc<decim_stages; ++dc)
2411 		{
2412 			rd->decim[dc].sflags = 0;
2413 			if(init_stages)
2414 			{
2415 				// Take previous stage output dc, process through
2416 				// the new stage and have it make up its output history.
2417 				// As decimate() overwrites its input, work on a copy.
2418 				memcpy( rd->prebuf
2419 				,	rd->stage_history+dc*STAGE_HISTORY*rd->channels
2420 				,	STAGE_HISTORY*rd->channels*sizeof(float) );
2421 				decimate(rd, dc, rd->prebuf, STAGE_HISTORY);
2422 			}
2423 		}
2424 		rd->decim_stages = decim_stages;
2425 	}
2426 
2427 	if(rd->sflags & inter_flow)
2428 	{
2429 		// If the virtual input rate changes (esp. increases), the interpolation
2430 		// offset needs adjustment to stay at/below predicted sample counts.
2431 		// Could always re-initialize the interpolation, but want to keep things
2432 		// smooth.
2433 		long sign = rd->offset < 0 ? -1 : +1;
2434 		// Zero on overflow, suits me just fine.
2435 		uint64_t noff = muloffdiv64( (uint64_t)(sign*rd->offset)
2436 		,	(uint64_t)vinrate, 0, (uint64_t)rd->vinrate, NULL, NULL );
2437 		// Magnitude must not be too large. Too small is OK.
2438 		if(noff > LONG_MAX)
2439 			noff = LONG_MAX;
2440 		rd->offset = sign*noff;
2441 		// Total paranoia. This is the crucial condition.
2442 		if(rd->offset < -vinrate)
2443 			rd->offset = -vinrate;
2444 	}
2445 	rd->inrate = inrate;
2446 	rd->outrate = outrate;
2447 	rd->vinrate = vinrate;
2448 	rd->voutrate = voutrate;
2449 	if(oversample)
2450 		rd->sflags |= oversample_2x;
2451 	else
2452 		rd->sflags &= ~oversample_2x;
2453 
2454 	// TODO: channels!
2455 	if(rd->sflags & oversample_2x)
2456 		rd->resample_func = dirty ? resample_2x_dirty : resample_2x_fine;
2457 	else if(rd->decim_stages)
2458 		rd->resample_func = dirty ? resample_0x_dirty : resample_0x_fine;
2459 	else
2460 		rd->resample_func = dirty ? resample_1x_dirty : resample_1x_fine;
2461 
2462 	if(dirty)
2463 		rd->sflags |= dirty_method;
2464 	if(smooth)
2465 		rd->sflags |= smooth_change;
2466 
2467 	mdebug( "%u times decimation by 2"
2468 		", virtual output rate %ld, %ldx oversampling (%i)"
2469 	,	rd->decim_stages, rd->voutrate, rd->vinrate / rd->inrate
2470 	,	rd->sflags & oversample_2x ? 1 : 0 );
2471 
2472 	// Round result to single precision, but compute with double to be able
2473 	// to correctly represent 32 bit longs first.
2474 	rd->lpf_cutoff = (double)(rd->inrate < rd->outrate ? rd->inrate : rd->voutrate)/rd->vinrate;
2475 	mdebug( "lowpass cutoff: %.8f of virtual %.0f Hz = %.0f Hz"
2476 	,	rd->lpf_cutoff, 0.5*rd->vinrate, rd->lpf_cutoff*0.5*rd->vinrate );
2477 	mdebug("rates final: %ld|%ld -> %ld|%ld"
2478 	,	rd->inrate, rd->vinrate, rd->voutrate, rd->outrate );
2479 	preemp_init(rd);
2480 	if(rd->sflags & lowpass_flow)
2481 	{
2482 		// First cheap stage of rate change smoothing:
2483 		// Attempt to dampen spikes from the filter history magnitudes
2484 		// not matching changed filter setup by simple rescaling.
2485 		// This catches the massive spikes on adding decimation stages.
2486 		float scale = lpf_history_scale(rd);
2487 		lpf_init(rd);
2488 		mdebug( "smooth old scale: %g, new scale: %g"
2489 		,	scale, lpf_history_scale(rd) );
2490 		scale = scale > 1e-10 ? lpf_history_scale(rd) / scale : 1.;
2491 		for(unsigned int c=0; c<rd->channels; ++c)
2492 			for(unsigned int j=0; j<LPF_MAX_TIMES; ++j)
2493 				for(unsigned int i=0; i<LPF_ORDER; ++i)
2494 					rd->ch[c].lpf_w[j][i] *= scale;
2495 		// Full smoothness with present stage history: Let the lowpass roll.
2496 		// Preemp needs to be re-configured beforehand!
2497 		// This is the icing on the cake with much more work,
2498 		// relaxing the filters to get rid of the trouble.
2499 		if(rd->stage_history)
2500 		{
2501 			// Feed the final stage output through the filters,
2502 			// manipulate the interpolator history to match.
2503 			float *in = rd->stage_history
2504 			+	rd->decim_stages*STAGE_HISTORY*rd->channels;
2505 			// Pointer to the last 5 filtered samples
2506 			float *last5 = NULL;
2507 			if(rd->sflags & oversample_2x)
2508 			{
2509 				rd->lowpass_2x_func(rd,in, STAGE_HISTORY, rd->upbuf);
2510 				last5 = rd->upbuf+(2*STAGE_HISTORY-5)*rd->channels;
2511 			} else
2512 			{
2513 				memcpy(rd->prebuf, in, STAGE_HISTORY*sizeof(*in)*rd->channels);
2514 				rd->lowpass_func(rd, rd->prebuf, STAGE_HISTORY);
2515 				last5 = rd->prebuf+(STAGE_HISTORY-5)*rd->channels;
2516 			}
2517 			// The above wrote preemp and lowpass channel histories.
2518 			// Now store the last samples for the interpolator history.
2519 			for(unsigned int c=0; c<rd->channels; ++c)
2520 				for(unsigned int i=0; i<5; ++i)
2521 					rd->ch[c].x[i] = last5[c+i*rd->channels];
2522 		}
2523 	}
2524 	else
2525 		lpf_init(rd);
2526 
2527 	rd->input_limit = syn123_resample_maxincount(inrate, outrate);
2528 	return 0;
2529 setup_resample_cleanup:
2530 	resample_free(sh->rd);
2531 	sh->rd = NULL;
2532 	return err;
2533 }
2534 
2535 size_t attribute_align_arg
2536 syn123_resample( syn123_handle *sh,
2537 	float * MPG123_RESTRICT dst, float * MPG123_RESTRICT src, size_t samples )
2538 {
2539 	if(!sh)
2540 		return 0;
2541 	struct resample_data *rd = sh->rd;
2542 	if(!rd)
2543 		return 0;
2544 	// Input limit is zero if no resampler configured.
2545 	if(!samples || samples > sh->rd->input_limit)
2546 		return 0;
2547 	mdebug( "calling actual resample function from %p to %p with %"SIZE_P" samples"
2548 	,	(void*)src, (void*)dst, (size_p)samples );
2549 	return rd->resample_func(rd, src, samples, dst);
2550 }
2551
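// A closing, disabled usage sketch: resample one block of interleaved float
// audio through an already configured handle (syn123_setup_resample() was
// called before). The per-call output is bounded with syn123_resample_expect()
// so the caller's buffer cannot be overrun. Purely illustrative.
#if 0
static size_t resample_one_block( syn123_handle *sh
,	float *in, size_t ins, float *out, size_t outspace )
{
	ssize_t need = syn123_resample_expect(sh, ins);
	if(need < 0 || (size_t)need > outspace)
		return 0; // error or output buffer too small for this block
	return syn123_resample(sh, out, in, ins);
}
#endif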