1 /*
2 * This file is part of libsidplayfp, a SID player engine.
3 *
4 * Copyright 2011-2020 Leandro Nini <drfiemost@users.sourceforge.net>
5 * Copyright 2007-2010 Antti Lankila
6 * Copyright 2004 Dag Lem <resid@nimrod.no>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 */
22
23 #include "SincResampler.h"
24
25 #include <cassert>
26 #include <cstring>
27 #include <cmath>
28 #include <iostream>
29 #include <sstream>
30
31 #include "siddefs-fp.h"
32
33 #ifdef HAVE_CONFIG_H
34 # include "config.h"
35 #endif
36
37 #ifdef HAVE_EMMINTRIN_H
38 # include <emmintrin.h>
39 #elif defined HAVE_MMINTRIN_H
40 # include <mmintrin.h>
41 #elif defined(HAVE_ARM_NEON_H)
42 # include <arm_neon.h>
43 #endif
44
45 namespace reSIDfp
46 {
47
48 typedef std::map<std::string, matrix_t> fir_cache_t;
49
50 /// Cache for the expensive FIR table computation results.
51 fir_cache_t FIR_CACHE;
52
53 /// Maximum error acceptable in I0 is 1e-6, or ~96 dB.
54 const double I0E = 1e-6;
55
56 const int BITS = 16;
57
58 /**
59 * Compute the 0th order modified Bessel function of the first kind.
60 * This function is originally from resample-1.5/filterkit.c by J. O. Smith.
61 * It is used to build the Kaiser window for resampling.
62 *
63 * @param x evaluate I0 at x
64 * @return value of I0 at x.
65 */
I0(double x)66 double I0(double x)
67 {
68 double sum = 1.;
69 double u = 1.;
70 double n = 1.;
71 const double halfx = x / 2.;
72
73 do
74 {
75 const double temp = halfx / n;
76 u *= temp * temp;
77 sum += u;
78 n += 1.;
79 }
80 while (u >= I0E * sum);
81
82 return sum;
83 }
84
85 /**
86 * Calculate convolution with sample and sinc.
87 *
88 * @param a sample buffer input
89 * @param b sinc buffer
90 * @param bLength length of the sinc buffer
91 * @return convolved result
92 */
convolve(const short * a,const short * b,int bLength)93 int convolve(const short* a, const short* b, int bLength)
94 {
95 #ifdef HAVE_EMMINTRIN_H
96 int out = 0;
97
98 const uintptr_t offset = (uintptr_t)(a) & 0x0f;
99
100 if (offset == ((uintptr_t)(b) & 0x0f))
101 {
102 if (offset)
103 {
104 const int l = (0x10 - offset)/2;
105
106 for (int i = 0; i < l; i++)
107 {
108 out += *a++ * *b++;
109 }
110
111 bLength -= offset;
112 }
113
114 __m128i acc = _mm_setzero_si128();
115
116 const int n = bLength / 8;
117
118 for (int i = 0; i < n; i++)
119 {
120 const __m128i tmp = _mm_madd_epi16(*(__m128i*)a, *(__m128i*)b);
121 acc = _mm_add_epi16(acc, tmp);
122 a += 8;
123 b += 8;
124 }
125
126 __m128i vsum = _mm_add_epi32(acc, _mm_srli_si128(acc, 8));
127 vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
128 out += _mm_cvtsi128_si32(vsum);
129
130 bLength &= 7;
131 }
132 #elif defined HAVE_MMINTRIN_H
133 __m64 acc = _mm_setzero_si64();
134
135 const int n = bLength / 4;
136
137 for (int i = 0; i < n; i++)
138 {
139 const __m64 tmp = _mm_madd_pi16(*(__m64*)a, *(__m64*)b);
140 acc = _mm_add_pi16(acc, tmp);
141 a += 4;
142 b += 4;
143 }
144
145 int out = _mm_cvtsi64_si32(acc) + _mm_cvtsi64_si32(_mm_srli_si64(acc, 32));
146 _mm_empty();
147
148 bLength &= 3;
149 #elif defined(HAVE_ARM_NEON_H)
150 #if (defined(__arm64__) && defined(__APPLE__)) || defined(__aarch64__)
151 int32x4_t acc1Low = vdupq_n_s32(0);
152 int32x4_t acc1High = vdupq_n_s32(0);
153 int32x4_t acc2Low = vdupq_n_s32(0);
154 int32x4_t acc2High = vdupq_n_s32(0);
155
156 const int n = bLength / 16;
157
158 for (int i = 0; i < n; i++)
159 {
160 int16x8_t v11 = vld1q_s16(a);
161 int16x8_t v12 = vld1q_s16(a + 8);
162 int16x8_t v21 = vld1q_s16(b);
163 int16x8_t v22 = vld1q_s16(b + 8);
164
165 acc1Low = vmlal_s16(acc1Low, vget_low_s16(v11), vget_low_s16(v21));
166 acc1High = vmlal_high_s16(acc1High, v11, v21);
167 acc2Low = vmlal_s16(acc2Low, vget_low_s16(v12), vget_low_s16(v22));
168 acc2High = vmlal_high_s16(acc2High, v12, v22);
169
170 a += 16;
171 b += 16;
172 }
173
174 bLength &= 15;
175
176 if (bLength >= 8)
177 {
178 int16x8_t v1 = vld1q_s16(a);
179 int16x8_t v2 = vld1q_s16(b);
180
181 acc1Low = vmlal_s16(acc1Low, vget_low_s16(v1), vget_low_s16(v2));
182 acc1High = vmlal_high_s16(acc1High, v1, v2);
183
184 a += 8;
185 b += 8;
186 }
187
188 bLength &= 7;
189
190 if (bLength >= 4)
191 {
192 int16x4_t v1 = vld1_s16(a);
193 int16x4_t v2 = vld1_s16(b);
194
195 acc1Low = vmlal_s16(acc1Low, v1, v2);
196
197 a += 4;
198 b += 4;
199 }
200
201 int32x4_t accSumsNeon = vaddq_s32(acc1Low, acc1High);
202 accSumsNeon = vaddq_s32(accSumsNeon, acc2Low);
203 accSumsNeon = vaddq_s32(accSumsNeon, acc2High);
204
205 int out = vaddvq_s32(accSumsNeon);
206
207 bLength &= 3;
208 #else
209 int32x4_t acc = vdupq_n_s32(0);
210
211 const int n = bLength / 4;
212
213 for (int i = 0; i < n; i++)
214 {
215 const int16x4_t h_vec = vld1_s16(a);
216 const int16x4_t x_vec = vld1_s16(b);
217 acc = vmlal_s16(acc, h_vec, x_vec);
218 a += 4;
219 b += 4;
220 }
221
222 int out = vgetq_lane_s32(acc, 0) +
223 vgetq_lane_s32(acc, 1) +
224 vgetq_lane_s32(acc, 2) +
225 vgetq_lane_s32(acc, 3);
226
227 bLength &= 3;
228 #endif
229 #else
230 int out = 0;
231 #endif
232
233 for (int i = 0; i < bLength; i++)
234 {
235 out += *a++ * *b++;
236 }
237
238 return (out + (1 << 14)) >> 15;
239 }
240
fir(int subcycle)241 int SincResampler::fir(int subcycle)
242 {
243 // Find the first of the nearest fir tables close to the phase
244 int firTableFirst = (subcycle * firRES >> 10);
245 const int firTableOffset = (subcycle * firRES) & 0x3ff;
246
247 // Find firN most recent samples, plus one extra in case the FIR wraps.
248 int sampleStart = sampleIndex - firN + RINGSIZE - 1;
249
250 const int v1 = convolve(sample + sampleStart, (*firTable)[firTableFirst], firN);
251
252 // Use next FIR table, wrap around to first FIR table using
253 // previous sample.
254 if (unlikely(++firTableFirst == firRES))
255 {
256 firTableFirst = 0;
257 ++sampleStart;
258 }
259
260 const int v2 = convolve(sample + sampleStart, (*firTable)[firTableFirst], firN);
261
262 // Linear interpolation between the sinc tables yields good
263 // approximation for the exact value.
264 return v1 + (firTableOffset * (v2 - v1) >> 10);
265 }
266
SincResampler(double clockFrequency,double samplingFrequency,double highestAccurateFrequency)267 SincResampler::SincResampler(double clockFrequency, double samplingFrequency, double highestAccurateFrequency) :
268 sampleIndex(0),
269 cyclesPerSample(static_cast<int>(clockFrequency / samplingFrequency * 1024.)),
270 sampleOffset(0),
271 outputValue(0)
272 {
273 // 16 bits -> -96dB stopband attenuation.
274 const double A = -20. * log10(1.0 / (1 << BITS));
275 // A fraction of the bandwidth is allocated to the transition band, which we double
276 // because we design the filter to transition halfway at nyquist.
277 const double dw = (1. - 2.*highestAccurateFrequency / samplingFrequency) * M_PI * 2.;
278
279 // For calculation of beta and N see the reference for the kaiserord
280 // function in the MATLAB Signal Processing Toolbox:
281 // http://www.mathworks.com/help/signal/ref/kaiserord.html
282 const double beta = 0.1102 * (A - 8.7);
283 const double I0beta = I0(beta);
284 const double cyclesPerSampleD = clockFrequency / samplingFrequency;
285
286 {
287 // The filter order will maximally be 124 with the current constraints.
288 // N >= (96.33 - 7.95)/(2 * pi * 2.285 * (maxfreq - passbandfreq) >= 123
289 // The filter order is equal to the number of zero crossings, i.e.
290 // it should be an even number (sinc is symmetric with respect to x = 0).
291 int N = static_cast<int>((A - 7.95) / (2.285 * dw) + 0.5);
292 N += N & 1;
293
294 // The filter length is equal to the filter order + 1.
295 // The filter length must be an odd number (sinc is symmetric with respect to
296 // x = 0).
297 firN = static_cast<int>(N * cyclesPerSampleD) + 1;
298 firN |= 1;
299
300 // Check whether the sample ring buffer would overflow.
301 assert(firN < RINGSIZE);
302
303 // Error is bounded by err < 1.234 / L^2, so L = sqrt(1.234 / (2^-16)) = sqrt(1.234 * 2^16).
304 firRES = static_cast<int>(ceil(sqrt(1.234 * (1 << BITS)) / cyclesPerSampleD));
305
306 // firN*firRES represent the total resolution of the sinc sampling. JOS
307 // recommends a length of 2^BITS, but we don't quite use that good a filter.
308 // The filter test program indicates that the filter performs well, though.
309 }
310
311 // Create the map key
312 std::ostringstream o;
313 o << firN << "," << firRES << "," << cyclesPerSampleD;
314 const std::string firKey = o.str();
315 fir_cache_t::iterator lb = FIR_CACHE.lower_bound(firKey);
316
317 // The FIR computation is expensive and we set sampling parameters often, but
318 // from a very small set of choices. Thus, caching is used to speed initialization.
319 if (lb != FIR_CACHE.end() && !(FIR_CACHE.key_comp()(firKey, lb->first)))
320 {
321 firTable = &(lb->second);
322 }
323 else
324 {
325 // Allocate memory for FIR tables.
326 matrix_t tempTable(firRES, firN);
327 #ifdef HAVE_CXX11
328 firTable = &(FIR_CACHE.emplace_hint(lb, fir_cache_t::value_type(firKey, tempTable))->second);
329 #else
330 firTable = &(FIR_CACHE.insert(lb, fir_cache_t::value_type(firKey, tempTable))->second);
331 #endif
332
333 // The cutoff frequency is midway through the transition band, in effect the same as nyquist.
334 const double wc = M_PI;
335
336 // Calculate the sinc tables.
337 const double scale = 32768.0 * wc / cyclesPerSampleD / M_PI;
338
339 // we're not interested in the fractional part
340 // so use int division before converting to double
341 const int tmp = firN / 2;
342 const double firN_2 = static_cast<double>(tmp);
343
344 for (int i = 0; i < firRES; i++)
345 {
346 const double jPhase = (double) i / firRES + firN_2;
347
348 for (int j = 0; j < firN; j++)
349 {
350 const double x = j - jPhase;
351
352 const double xt = x / firN_2;
353 const double kaiserXt = fabs(xt) < 1. ? I0(beta * sqrt(1. - xt * xt)) / I0beta : 0.;
354
355 const double wt = wc * x / cyclesPerSampleD;
356 const double sincWt = fabs(wt) >= 1e-8 ? sin(wt) / wt : 1.;
357
358 (*firTable)[i][j] = static_cast<short>(scale * sincWt * kaiserXt);
359 }
360 }
361 }
362 }
363
input(int input)364 bool SincResampler::input(int input)
365 {
366 bool ready = false;
367
368 /*
369 * Clip the input as it may overflow the 16 bit range.
370 *
371 * Approximate measured input ranges:
372 * 6581: [-24262,+25080] (Kawasaki_Synthesizer_Demo)
373 * 8580: [-21514,+35232] (64_Forever, Drum_Fool)
374 */
375 sample[sampleIndex] = sample[sampleIndex + RINGSIZE] = softClip(input);
376 sampleIndex = (sampleIndex + 1) & (RINGSIZE - 1);
377
378 if (sampleOffset < 1024)
379 {
380 outputValue = fir(sampleOffset);
381 ready = true;
382 sampleOffset += cyclesPerSample;
383 }
384
385 sampleOffset -= 1024;
386
387 return ready;
388 }
389
reset()390 void SincResampler::reset()
391 {
392 memset(sample, 0, sizeof(sample));
393 sampleOffset = 0;
394 }
395
396 } // namespace reSIDfp
397