1 /*
2  * This file is part of libsidplayfp, a SID player engine.
3  *
4  * Copyright 2011-2020 Leandro Nini <drfiemost@users.sourceforge.net>
5  * Copyright 2007-2010 Antti Lankila
6  * Copyright 2004 Dag Lem <resid@nimrod.no>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
21  */
22 
23 #include "SincResampler.h"
24 
25 #include <cassert>
26 #include <cstring>
27 #include <cmath>
28 #include <iostream>
29 #include <sstream>
30 
31 #include "siddefs-fp.h"
32 
33 #ifdef HAVE_CONFIG_H
34 #  include "config.h"
35 #endif
36 
37 #ifdef HAVE_EMMINTRIN_H
38 #  include <emmintrin.h>
39 #elif defined HAVE_MMINTRIN_H
40 #  include <mmintrin.h>
41 #elif defined(HAVE_ARM_NEON_H)
42 #  include <arm_neon.h>
43 #endif
44 
45 namespace reSIDfp
46 {
47 
48 typedef std::map<std::string, matrix_t> fir_cache_t;
49 
50 /// Cache for the expensive FIR table computation results.
51 fir_cache_t FIR_CACHE;
52 
53 /// Maximum error acceptable in I0 is 1e-6, or ~96 dB.
54 const double I0E = 1e-6;
55 
56 const int BITS = 16;
57 
58 /**
59  * Compute the 0th order modified Bessel function of the first kind.
60  * This function is originally from resample-1.5/filterkit.c by J. O. Smith.
61  * It is used to build the Kaiser window for resampling.
62  *
63  * @param x evaluate I0 at x
64  * @return value of I0 at x.
65  */
I0(double x)66 double I0(double x)
67 {
68     double sum = 1.;
69     double u = 1.;
70     double n = 1.;
71     const double halfx = x / 2.;
72 
73     do
74     {
75         const double temp = halfx / n;
76         u *= temp * temp;
77         sum += u;
78         n += 1.;
79     }
80     while (u >= I0E * sum);
81 
82     return sum;
83 }
84 
85 /**
86  * Calculate convolution with sample and sinc.
87  *
88  * @param a sample buffer input
89  * @param b sinc buffer
90  * @param bLength length of the sinc buffer
91  * @return convolved result
92  */
convolve(const short * a,const short * b,int bLength)93 int convolve(const short* a, const short* b, int bLength)
94 {
95 #ifdef HAVE_EMMINTRIN_H
96     int out = 0;
97 
98     const uintptr_t offset = (uintptr_t)(a) & 0x0f;
99 
100     if (offset == ((uintptr_t)(b) & 0x0f))
101     {
102         if (offset)
103         {
104             const int l = (0x10 - offset)/2;
105 
106             for (int i = 0; i < l; i++)
107             {
108                 out += *a++ * *b++;
109             }
110 
111             bLength -= offset;
112         }
113 
114         __m128i acc = _mm_setzero_si128();
115 
116         const int n = bLength / 8;
117 
118         for (int i = 0; i < n; i++)
119         {
120             const __m128i tmp = _mm_madd_epi16(*(__m128i*)a, *(__m128i*)b);
121             acc = _mm_add_epi16(acc, tmp);
122             a += 8;
123             b += 8;
124         }
125 
126         __m128i vsum = _mm_add_epi32(acc, _mm_srli_si128(acc, 8));
127         vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
128         out += _mm_cvtsi128_si32(vsum);
129 
130         bLength &= 7;
131     }
132 #elif defined HAVE_MMINTRIN_H
133     __m64 acc = _mm_setzero_si64();
134 
135     const int n = bLength / 4;
136 
137     for (int i = 0; i < n; i++)
138     {
139         const __m64 tmp = _mm_madd_pi16(*(__m64*)a, *(__m64*)b);
140         acc = _mm_add_pi16(acc, tmp);
141         a += 4;
142         b += 4;
143     }
144 
145     int out = _mm_cvtsi64_si32(acc) + _mm_cvtsi64_si32(_mm_srli_si64(acc, 32));
146     _mm_empty();
147 
148     bLength &= 3;
149 #elif defined(HAVE_ARM_NEON_H)
150 #if (defined(__arm64__) && defined(__APPLE__)) || defined(__aarch64__)
151     int32x4_t acc1Low = vdupq_n_s32(0);
152     int32x4_t acc1High = vdupq_n_s32(0);
153     int32x4_t acc2Low = vdupq_n_s32(0);
154     int32x4_t acc2High = vdupq_n_s32(0);
155 
156     const int n = bLength / 16;
157 
158     for (int i = 0; i < n; i++)
159     {
160         int16x8_t v11 = vld1q_s16(a);
161         int16x8_t v12 = vld1q_s16(a + 8);
162         int16x8_t v21 = vld1q_s16(b);
163         int16x8_t v22 = vld1q_s16(b + 8);
164 
165         acc1Low  = vmlal_s16(acc1Low, vget_low_s16(v11), vget_low_s16(v21));
166         acc1High = vmlal_high_s16(acc1High, v11, v21);
167         acc2Low  = vmlal_s16(acc2Low, vget_low_s16(v12), vget_low_s16(v22));
168         acc2High = vmlal_high_s16(acc2High, v12, v22);
169 
170         a += 16;
171         b += 16;
172     }
173 
174     bLength &= 15;
175 
176     if (bLength >= 8)
177     {
178         int16x8_t v1 = vld1q_s16(a);
179         int16x8_t v2 = vld1q_s16(b);
180 
181         acc1Low  = vmlal_s16(acc1Low, vget_low_s16(v1), vget_low_s16(v2));
182         acc1High = vmlal_high_s16(acc1High, v1, v2);
183 
184         a += 8;
185         b += 8;
186     }
187 
188     bLength &= 7;
189 
190     if (bLength >= 4)
191     {
192         int16x4_t v1 = vld1_s16(a);
193         int16x4_t v2 = vld1_s16(b);
194 
195         acc1Low  = vmlal_s16(acc1Low, v1, v2);
196 
197         a += 4;
198         b += 4;
199     }
200 
201     int32x4_t accSumsNeon = vaddq_s32(acc1Low, acc1High);
202     accSumsNeon = vaddq_s32(accSumsNeon, acc2Low);
203     accSumsNeon = vaddq_s32(accSumsNeon, acc2High);
204 
205     int out = vaddvq_s32(accSumsNeon);
206 
207     bLength &= 3;
208 #else
209     int32x4_t acc = vdupq_n_s32(0);
210 
211     const int n = bLength / 4;
212 
213     for (int i = 0; i < n; i++)
214     {
215         const int16x4_t h_vec = vld1_s16(a);
216         const int16x4_t x_vec = vld1_s16(b);
217         acc = vmlal_s16(acc, h_vec, x_vec);
218         a += 4;
219         b += 4;
220     }
221 
222     int out = vgetq_lane_s32(acc, 0) +
223               vgetq_lane_s32(acc, 1) +
224               vgetq_lane_s32(acc, 2) +
225               vgetq_lane_s32(acc, 3);
226 
227     bLength &= 3;
228 #endif
229 #else
230     int out = 0;
231 #endif
232 
233     for (int i = 0; i < bLength; i++)
234     {
235         out += *a++ * *b++;
236     }
237 
238     return (out + (1 << 14)) >> 15;
239 }
240 
fir(int subcycle)241 int SincResampler::fir(int subcycle)
242 {
243     // Find the first of the nearest fir tables close to the phase
244     int firTableFirst = (subcycle * firRES >> 10);
245     const int firTableOffset = (subcycle * firRES) & 0x3ff;
246 
247     // Find firN most recent samples, plus one extra in case the FIR wraps.
248     int sampleStart = sampleIndex - firN + RINGSIZE - 1;
249 
250     const int v1 = convolve(sample + sampleStart, (*firTable)[firTableFirst], firN);
251 
252     // Use next FIR table, wrap around to first FIR table using
253     // previous sample.
254     if (unlikely(++firTableFirst == firRES))
255     {
256         firTableFirst = 0;
257         ++sampleStart;
258     }
259 
260     const int v2 = convolve(sample + sampleStart, (*firTable)[firTableFirst], firN);
261 
262     // Linear interpolation between the sinc tables yields good
263     // approximation for the exact value.
264     return v1 + (firTableOffset * (v2 - v1) >> 10);
265 }
266 
SincResampler(double clockFrequency,double samplingFrequency,double highestAccurateFrequency)267 SincResampler::SincResampler(double clockFrequency, double samplingFrequency, double highestAccurateFrequency) :
268     sampleIndex(0),
269     cyclesPerSample(static_cast<int>(clockFrequency / samplingFrequency * 1024.)),
270     sampleOffset(0),
271     outputValue(0)
272 {
273     // 16 bits -> -96dB stopband attenuation.
274     const double A = -20. * log10(1.0 / (1 << BITS));
275     // A fraction of the bandwidth is allocated to the transition band, which we double
276     // because we design the filter to transition halfway at nyquist.
277     const double dw = (1. - 2.*highestAccurateFrequency / samplingFrequency) * M_PI * 2.;
278 
279     // For calculation of beta and N see the reference for the kaiserord
280     // function in the MATLAB Signal Processing Toolbox:
281     // http://www.mathworks.com/help/signal/ref/kaiserord.html
282     const double beta = 0.1102 * (A - 8.7);
283     const double I0beta = I0(beta);
284     const double cyclesPerSampleD = clockFrequency / samplingFrequency;
285 
286     {
287         // The filter order will maximally be 124 with the current constraints.
288         // N >= (96.33 - 7.95)/(2 * pi * 2.285 * (maxfreq - passbandfreq) >= 123
289         // The filter order is equal to the number of zero crossings, i.e.
290         // it should be an even number (sinc is symmetric with respect to x = 0).
291         int N = static_cast<int>((A - 7.95) / (2.285 * dw) + 0.5);
292         N += N & 1;
293 
294         // The filter length is equal to the filter order + 1.
295         // The filter length must be an odd number (sinc is symmetric with respect to
296         // x = 0).
297         firN = static_cast<int>(N * cyclesPerSampleD) + 1;
298         firN |= 1;
299 
300         // Check whether the sample ring buffer would overflow.
301         assert(firN < RINGSIZE);
302 
303         // Error is bounded by err < 1.234 / L^2, so L = sqrt(1.234 / (2^-16)) = sqrt(1.234 * 2^16).
304         firRES = static_cast<int>(ceil(sqrt(1.234 * (1 << BITS)) / cyclesPerSampleD));
305 
306         // firN*firRES represent the total resolution of the sinc sampling. JOS
307         // recommends a length of 2^BITS, but we don't quite use that good a filter.
308         // The filter test program indicates that the filter performs well, though.
309     }
310 
311     // Create the map key
312     std::ostringstream o;
313     o << firN << "," << firRES << "," << cyclesPerSampleD;
314     const std::string firKey = o.str();
315     fir_cache_t::iterator lb = FIR_CACHE.lower_bound(firKey);
316 
317     // The FIR computation is expensive and we set sampling parameters often, but
318     // from a very small set of choices. Thus, caching is used to speed initialization.
319     if (lb != FIR_CACHE.end() && !(FIR_CACHE.key_comp()(firKey, lb->first)))
320     {
321         firTable = &(lb->second);
322     }
323     else
324     {
325         // Allocate memory for FIR tables.
326         matrix_t tempTable(firRES, firN);
327 #ifdef HAVE_CXX11
328         firTable = &(FIR_CACHE.emplace_hint(lb, fir_cache_t::value_type(firKey, tempTable))->second);
329 #else
330         firTable = &(FIR_CACHE.insert(lb, fir_cache_t::value_type(firKey, tempTable))->second);
331 #endif
332 
333         // The cutoff frequency is midway through the transition band, in effect the same as nyquist.
334         const double wc = M_PI;
335 
336         // Calculate the sinc tables.
337         const double scale = 32768.0 * wc / cyclesPerSampleD / M_PI;
338 
339         // we're not interested in the fractional part
340         // so use int division before converting to double
341         const int tmp = firN / 2;
342         const double firN_2 = static_cast<double>(tmp);
343 
344         for (int i = 0; i < firRES; i++)
345         {
346             const double jPhase = (double) i / firRES + firN_2;
347 
348             for (int j = 0; j < firN; j++)
349             {
350                 const double x = j - jPhase;
351 
352                 const double xt = x / firN_2;
353                 const double kaiserXt = fabs(xt) < 1. ? I0(beta * sqrt(1. - xt * xt)) / I0beta : 0.;
354 
355                 const double wt = wc * x / cyclesPerSampleD;
356                 const double sincWt = fabs(wt) >= 1e-8 ? sin(wt) / wt : 1.;
357 
358                 (*firTable)[i][j] = static_cast<short>(scale * sincWt * kaiserXt);
359             }
360         }
361     }
362 }
363 
input(int input)364 bool SincResampler::input(int input)
365 {
366     bool ready = false;
367 
368     /*
369      * Clip the input as it may overflow the 16 bit range.
370      *
371      * Approximate measured input ranges:
372      * 6581: [-24262,+25080]  (Kawasaki_Synthesizer_Demo)
373      * 8580: [-21514,+35232]  (64_Forever, Drum_Fool)
374      */
375     sample[sampleIndex] = sample[sampleIndex + RINGSIZE] = softClip(input);
376     sampleIndex = (sampleIndex + 1) & (RINGSIZE - 1);
377 
378     if (sampleOffset < 1024)
379     {
380         outputValue = fir(sampleOffset);
381         ready = true;
382         sampleOffset += cyclesPerSample;
383     }
384 
385     sampleOffset -= 1024;
386 
387     return ready;
388 }
389 
reset()390 void SincResampler::reset()
391 {
392     memset(sample, 0, sizeof(sample));
393     sampleOffset = 0;
394 }
395 
396 } // namespace reSIDfp
397