residfp/resample/SincResampler.cpp

/*
 * This file is part of libsidplayfp, a SID player engine.
 *
 * Copyright 2011-2020 Leandro Nini <drfiemost@users.sourceforge.net>
 * Copyright 2007-2010 Antti Lankila
 * Copyright 2004 Dag Lem <resid@nimrod.no>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 */

#include "SincResampler.h"

#include <cassert>
#include <cstring>
#include <cmath>
#include <iostream>
#include <sstream>

#include "siddefs-fp.h"

#ifdef HAVE_CONFIG_H
#  include "config.h"
#endif

#ifdef HAVE_EMMINTRIN_H
#  include <emmintrin.h>
#elif defined HAVE_MMINTRIN_H
#  include <mmintrin.h>
#elif defined(HAVE_ARM_NEON_H)
#  include <arm_neon.h>
#endif

namespace reSIDfp
{

typedef std::map<std::string, matrix_t> fir_cache_t;

/// Cache for the expensive FIR table computation results.
fir_cache_t FIR_CACHE;

/// Maximum error acceptable in I0 is 1e-6, or ~96 dB.
const double I0E = 1e-6;

const int BITS = 16;

/**
 * Compute the 0th order modified Bessel function of the first kind.
 * This function is originally from resample-1.5/filterkit.c by J. O. Smith.
 * It is used to build the Kaiser window for resampling.
 *
 * @param x evaluate I0 at x
 * @return value of I0 at x.
 */
double I0(double x)
{
    double sum = 1.;
    double u = 1.;
    double n = 1.;
    const double halfx = x / 2.;

    do
    {
        const double temp = halfx / n;
        u *= temp * temp;
        sum += u;
        n += 1.;
    }
    while (u >= I0E * sum);

    return sum;
}

/**
 * Calculate convolution with sample and sinc.
 *
 * @param a sample buffer input
 * @param b sinc buffer
 * @param bLength length of the sinc buffer
 * @return convolved result
 */
int convolve(const short* a, const short* b, int bLength)
{
#ifdef HAVE_EMMINTRIN_H
    int out = 0;

    const uintptr_t offset = (uintptr_t)(a) & 0x0f;

    if (offset == ((uintptr_t)(b) & 0x0f))
    {
        if (offset)
        {
            const int l = (0x10 - offset)/2;

            for (int i = 0; i < l; i++)
            {
                out += *a++ * *b++;
            }

            bLength -= offset;
        }

        __m128i acc = _mm_setzero_si128();

        const int n = bLength / 8;

        for (int i = 0; i < n; i++)
        {
            const __m128i tmp = _mm_madd_epi16(*(__m128i*)a, *(__m128i*)b);
            acc = _mm_add_epi16(acc, tmp);
            a += 8;
            b += 8;
        }

        __m128i vsum = _mm_add_epi32(acc, _mm_srli_si128(acc, 8));
        vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
        out += _mm_cvtsi128_si32(vsum);

        bLength &= 7;
    }
#elif defined HAVE_MMINTRIN_H
    __m64 acc = _mm_setzero_si64();

    const int n = bLength / 4;

    for (int i = 0; i < n; i++)
    {
        const __m64 tmp = _mm_madd_pi16(*(__m64*)a, *(__m64*)b);
        acc = _mm_add_pi16(acc, tmp);
        a += 4;
        b += 4;
    }

    int out = _mm_cvtsi64_si32(acc) + _mm_cvtsi64_si32(_mm_srli_si64(acc, 32));
    _mm_empty();

    bLength &= 3;
#elif defined(HAVE_ARM_NEON_H)
#if (defined(__arm64__) && defined(__APPLE__)) || defined(__aarch64__)
    int32x4_t acc1Low = vdupq_n_s32(0);
    int32x4_t acc1High = vdupq_n_s32(0);
    int32x4_t acc2Low = vdupq_n_s32(0);
    int32x4_t acc2High = vdupq_n_s32(0);

    const int n = bLength / 16;

    for (int i = 0; i < n; i++)
    {
        int16x8_t v11 = vld1q_s16(a);
        int16x8_t v12 = vld1q_s16(a + 8);
        int16x8_t v21 = vld1q_s16(b);
        int16x8_t v22 = vld1q_s16(b + 8);

        acc1Low  = vmlal_s16(acc1Low, vget_low_s16(v11), vget_low_s16(v21));
        acc1High = vmlal_high_s16(acc1High, v11, v21);
        acc2Low  = vmlal_s16(acc2Low, vget_low_s16(v12), vget_low_s16(v22));
        acc2High = vmlal_high_s16(acc2High, v12, v22);

        a += 16;
        b += 16;
    }

    bLength &= 15;

    if (bLength >= 8)
    {
        int16x8_t v1 = vld1q_s16(a);
        int16x8_t v2 = vld1q_s16(b);

        acc1Low  = vmlal_s16(acc1Low, vget_low_s16(v1), vget_low_s16(v2));
        acc1High = vmlal_high_s16(acc1High, v1, v2);

        a += 8;
        b += 8;
    }

    bLength &= 7;

    if (bLength >= 4)
    {
        int16x4_t v1 = vld1_s16(a);
        int16x4_t v2 = vld1_s16(b);

        acc1Low  = vmlal_s16(acc1Low, v1, v2);

        a += 4;
        b += 4;
    }

    int32x4_t accSumsNeon = vaddq_s32(acc1Low, acc1High);
    accSumsNeon = vaddq_s32(accSumsNeon, acc2Low);
    accSumsNeon = vaddq_s32(accSumsNeon, acc2High);

    int out = vaddvq_s32(accSumsNeon);

    bLength &= 3;
#else
    int32x4_t acc = vdupq_n_s32(0);

    const int n = bLength / 4;

    for (int i = 0; i < n; i++)
    {
        const int16x4_t h_vec = vld1_s16(a);
        const int16x4_t x_vec = vld1_s16(b);
        acc = vmlal_s16(acc, h_vec, x_vec);
        a += 4;
        b += 4;
    }

    int out = vgetq_lane_s32(acc, 0) +
              vgetq_lane_s32(acc, 1) +
              vgetq_lane_s32(acc, 2) +
              vgetq_lane_s32(acc, 3);

    bLength &= 3;
#endif
#else
    int out = 0;
#endif

    for (int i = 0; i < bLength; i++)
    {
        out += *a++ * *b++;
    }

    return (out + (1 << 14)) >> 15;
}

int SincResampler::fir(int subcycle)
{
    // Find the first of the nearest fir tables close to the phase
    int firTableFirst = (subcycle * firRES >> 10);
    const int firTableOffset = (subcycle * firRES) & 0x3ff;

    // Find firN most recent samples, plus one extra in case the FIR wraps.
    int sampleStart = sampleIndex - firN + RINGSIZE - 1;

    const int v1 = convolve(sample + sampleStart, (*firTable)[firTableFirst], firN);

    // Use next FIR table, wrap around to first FIR table using
    // previous sample.
    if (unlikely(++firTableFirst == firRES))
    {
        firTableFirst = 0;
        ++sampleStart;
    }

    const int v2 = convolve(sample + sampleStart, (*firTable)[firTableFirst], firN);

    // Linear interpolation between the sinc tables yields good
    // approximation for the exact value.
    return v1 + (firTableOffset * (v2 - v1) >> 10);
}

SincResampler::SincResampler(double clockFrequency, double samplingFrequency, double highestAccurateFrequency) :
    sampleIndex(0),
    cyclesPerSample(static_cast<int>(clockFrequency / samplingFrequency * 1024.)),
    sampleOffset(0),
    outputValue(0)
{
    // 16 bits -> -96dB stopband attenuation.
    const double A = -20. * log10(1.0 / (1 << BITS));
    // A fraction of the bandwidth is allocated to the transition band, which we double
    // because we design the filter to transition halfway at nyquist.
    const double dw = (1. - 2.*highestAccurateFrequency / samplingFrequency) * M_PI * 2.;

    // For calculation of beta and N see the reference for the kaiserord
    // function in the MATLAB Signal Processing Toolbox:
    // http://www.mathworks.com/help/signal/ref/kaiserord.html
    const double beta = 0.1102 * (A - 8.7);
    const double I0beta = I0(beta);
    const double cyclesPerSampleD = clockFrequency / samplingFrequency;

    {
        // The filter order will maximally be 124 with the current constraints.
        // N >= (96.33 - 7.95)/(2 * pi * 2.285 * (maxfreq - passbandfreq) >= 123
        // The filter order is equal to the number of zero crossings, i.e.
        // it should be an even number (sinc is symmetric with respect to x = 0).
        int N = static_cast<int>((A - 7.95) / (2.285 * dw) + 0.5);
        N += N & 1;

        // The filter length is equal to the filter order + 1.
        // The filter length must be an odd number (sinc is symmetric with respect to
        // x = 0).
        firN = static_cast<int>(N * cyclesPerSampleD) + 1;
        firN |= 1;

        // Check whether the sample ring buffer would overflow.
        assert(firN < RINGSIZE);

        // Error is bounded by err < 1.234 / L^2, so L = sqrt(1.234 / (2^-16)) = sqrt(1.234 * 2^16).
        firRES = static_cast<int>(ceil(sqrt(1.234 * (1 << BITS)) / cyclesPerSampleD));

        // firN*firRES represent the total resolution of the sinc sampling. JOS
        // recommends a length of 2^BITS, but we don't quite use that good a filter.
        // The filter test program indicates that the filter performs well, though.
    }

    // Create the map key
    std::ostringstream o;
    o << firN << "," << firRES << "," << cyclesPerSampleD;
    const std::string firKey = o.str();
    fir_cache_t::iterator lb = FIR_CACHE.lower_bound(firKey);

    // The FIR computation is expensive and we set sampling parameters often, but
    // from a very small set of choices. Thus, caching is used to speed initialization.
    if (lb != FIR_CACHE.end() && !(FIR_CACHE.key_comp()(firKey, lb->first)))
    {
        firTable = &(lb->second);
    }
    else
    {
        // Allocate memory for FIR tables.
        matrix_t tempTable(firRES, firN);
#ifdef HAVE_CXX11
        firTable = &(FIR_CACHE.emplace_hint(lb, fir_cache_t::value_type(firKey, tempTable))->second);
#else
        firTable = &(FIR_CACHE.insert(lb, fir_cache_t::value_type(firKey, tempTable))->second);
#endif

        // The cutoff frequency is midway through the transition band, in effect the same as nyquist.
        const double wc = M_PI;

        // Calculate the sinc tables.
        const double scale = 32768.0 * wc / cyclesPerSampleD / M_PI;

        // we're not interested in the fractional part
        // so use int division before converting to double
        const int tmp = firN / 2;
        const double firN_2 = static_cast<double>(tmp);

        for (int i = 0; i < firRES; i++)
        {
            const double jPhase = (double) i / firRES + firN_2;

            for (int j = 0; j < firN; j++)
            {
                const double x = j - jPhase;

                const double xt = x / firN_2;
                const double kaiserXt = fabs(xt) < 1. ? I0(beta * sqrt(1. - xt * xt)) / I0beta : 0.;

                const double wt = wc * x / cyclesPerSampleD;
                const double sincWt = fabs(wt) >= 1e-8 ? sin(wt) / wt : 1.;

                (*firTable)[i][j] = static_cast<short>(scale * sincWt * kaiserXt);
            }
        }
    }
}

bool SincResampler::input(int input)
{
    bool ready = false;

    /*
     * Clip the input as it may overflow the 16 bit range.
     *
     * Approximate measured input ranges:
     * 6581: [-24262,+25080]  (Kawasaki_Synthesizer_Demo)
     * 8580: [-21514,+35232]  (64_Forever, Drum_Fool)
     */
    sample[sampleIndex] = sample[sampleIndex + RINGSIZE] = softClip(input);
    sampleIndex = (sampleIndex + 1) & (RINGSIZE - 1);

    if (sampleOffset < 1024)
    {
        outputValue = fir(sampleOffset);
        ready = true;
        sampleOffset += cyclesPerSample;
    }

    sampleOffset -= 1024;

    return ready;
}

void SincResampler::reset()
{
    memset(sample, 0, sizeof(sample));
    sampleOffset = 0;
}

} // namespace reSIDfp