1 /*
2  * Copyright (C) 2011 Google Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1.  Redistributions of source code must retain the above copyright
9  *     notice, this list of conditions and the following disclaimer.
10  * 2.  Redistributions in binary form must reproduce the above copyright
11  *     notice, this list of conditions and the following disclaimer in the
12  *     documentation and/or other materials provided with the distribution.
13  * 3.  Neither the name of Apple Computer, Inc. ("Apple") nor the names of
14  *     its contributors may be used to endorse or promote products derived
15  *     from this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
18  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20  * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include "third_party/blink/renderer/platform/audio/sinc_resampler.h"
30 
31 #include "build/build_config.h"
32 #include "third_party/blink/renderer/platform/audio/audio_bus.h"
33 #include "third_party/blink/renderer/platform/wtf/math_extras.h"
34 
35 #if defined(ARCH_CPU_X86_FAMILY)
36 #include <emmintrin.h>
37 #endif
38 
39 // Input buffer layout, dividing the total buffer into regions (r0 - r5):
40 //
41 // |----------------|-----------------------------------------|----------------|
42 //
43 //                                     blockSize + kernelSize / 2
44 //                   <--------------------------------------------------------->
45 //                                                r0
46 //
47 //   kernelSize / 2   kernelSize / 2          kernelSize / 2     kernelSize / 2
48 // <---------------> <--------------->       <---------------> <--------------->
49 //         r1                r2                      r3               r4
50 //
51 //                                                     blockSize
52 //                                    <---------------------------------------->
53 //                                                         r5
54 
55 // The Algorithm:
56 //
57 // 1) Consume input frames into r0 (r1 is zero-initialized).
58 // 2) Position kernel centered at start of r0 (r2) and generate output frames
59 //    until kernel is centered at start of r4, or we've finished generating
60 //    all the output frames.
61 // 3) Copy r3 to r1 and r4 to r2.
62 // 4) Consume input frames into r5 (zero-pad if we run out of input).
63 // 5) Goto (2) until all of input is consumed.
64 //
65 // note: we're glossing over how the sub-sample handling works with
66 // m_virtualSourceIndex, etc.
67 
68 namespace blink {
69 
SincResampler(double scale_factor,unsigned kernel_size,unsigned number_of_kernel_offsets)70 SincResampler::SincResampler(double scale_factor,
71                              unsigned kernel_size,
72                              unsigned number_of_kernel_offsets)
73     : scale_factor_(scale_factor),
74       kernel_size_(kernel_size),
75       number_of_kernel_offsets_(number_of_kernel_offsets),
76       kernel_storage_(kernel_size_ * (number_of_kernel_offsets_ + 1)),
77       virtual_source_index_(0),
78       block_size_(512),
79       // See input buffer layout above.
80       input_buffer_(block_size_ + kernel_size_),
81       source_(nullptr),
82       source_frames_available_(0),
83       source_provider_(nullptr),
84       is_buffer_primed_(false) {
85   InitializeKernel();
86 }
87 
InitializeKernel()88 void SincResampler::InitializeKernel() {
89   // Blackman window parameters.
90   double alpha = 0.16;
91   double a0 = 0.5 * (1.0 - alpha);
92   double a1 = 0.5;
93   double a2 = 0.5 * alpha;
94 
95   // sincScaleFactor is basically the normalized cutoff frequency of the
96   // low-pass filter.
97   double sinc_scale_factor = scale_factor_ > 1.0 ? 1.0 / scale_factor_ : 1.0;
98 
99   // The sinc function is an idealized brick-wall filter, but since we're
100   // windowing it the transition from pass to stop does not happen right away.
101   // So we should adjust the lowpass filter cutoff slightly downward to avoid
102   // some aliasing at the very high-end.
103   // FIXME: this value is empirical and to be more exact should vary depending
104   // on m_kernelSize.
105   sinc_scale_factor *= 0.9;
106 
107   int n = kernel_size_;
108   int half_size = n / 2;
109 
110   // Generates a set of windowed sinc() kernels.
111   // We generate a range of sub-sample offsets from 0.0 to 1.0.
112   for (unsigned offset_index = 0; offset_index <= number_of_kernel_offsets_;
113        ++offset_index) {
114     double subsample_offset =
115         static_cast<double>(offset_index) / number_of_kernel_offsets_;
116 
117     for (int i = 0; i < n; ++i) {
118       // Compute the sinc() with offset.
119       double s =
120           sinc_scale_factor * kPiDouble * (i - half_size - subsample_offset);
121       double sinc = !s ? 1.0 : std::sin(s) / s;
122       sinc *= sinc_scale_factor;
123 
124       // Compute Blackman window, matching the offset of the sinc().
125       double x = (i - subsample_offset) / n;
126       double window = a0 - a1 * std::cos(kTwoPiDouble * x) +
127                       a2 * std::cos(kTwoPiDouble * 2.0 * x);
128 
129       // Window the sinc() function and store at the correct offset.
130       kernel_storage_[i + offset_index * kernel_size_] = sinc * window;
131     }
132   }
133 }
134 
ConsumeSource(float * buffer,unsigned number_of_source_frames)135 void SincResampler::ConsumeSource(float* buffer,
136                                   unsigned number_of_source_frames) {
137   DCHECK(source_provider_);
138 
139   // Wrap the provided buffer by an AudioBus for use by the source provider.
140   scoped_refptr<AudioBus> bus =
141       AudioBus::Create(1, number_of_source_frames, false);
142 
143   // FIXME: Find a way to make the following const-correct:
144   bus->SetChannelMemory(0, buffer, number_of_source_frames);
145 
146   source_provider_->ProvideInput(bus.get(), number_of_source_frames);
147 }
148 
149 namespace {
150 
151 // BufferSourceProvider is an AudioSourceProvider wrapping an in-memory buffer.
152 
153 class BufferSourceProvider final : public AudioSourceProvider {
154  public:
BufferSourceProvider(const float * source,uint32_t number_of_source_frames)155   BufferSourceProvider(const float* source, uint32_t number_of_source_frames)
156       : source_(source), source_frames_available_(number_of_source_frames) {}
157 
158   // Consumes samples from the in-memory buffer.
ProvideInput(AudioBus * bus,uint32_t frames_to_process)159   void ProvideInput(AudioBus* bus, uint32_t frames_to_process) override {
160     DCHECK(source_);
161     DCHECK(bus);
162     if (!source_ || !bus)
163       return;
164 
165     float* buffer = bus->Channel(0)->MutableData();
166 
167     // Clamp to number of frames available and zero-pad.
168     uint32_t frames_to_copy =
169         std::min(source_frames_available_, frames_to_process);
170     memcpy(buffer, source_, sizeof(float) * frames_to_copy);
171 
172     // Zero-pad if necessary.
173     if (frames_to_copy < frames_to_process)
174       memset(buffer + frames_to_copy, 0,
175              sizeof(float) * (frames_to_process - frames_to_copy));
176 
177     source_frames_available_ -= frames_to_copy;
178     source_ += frames_to_copy;
179   }
180 
181  private:
182   const float* source_;
183   uint32_t source_frames_available_;
184 };
185 
186 }  // namespace
187 
Process(const float * source,float * destination,unsigned number_of_source_frames)188 void SincResampler::Process(const float* source,
189                             float* destination,
190                             unsigned number_of_source_frames) {
191   // Resample an in-memory buffer using an AudioSourceProvider.
192   BufferSourceProvider source_provider(source, number_of_source_frames);
193 
194   unsigned number_of_destination_frames =
195       static_cast<unsigned>(number_of_source_frames / scale_factor_);
196   unsigned remaining = number_of_destination_frames;
197 
198   while (remaining) {
199     unsigned frames_this_time = std::min(remaining, block_size_);
200     Process(&source_provider, destination, frames_this_time);
201 
202     destination += frames_this_time;
203     remaining -= frames_this_time;
204   }
205 }
206 
Process(AudioSourceProvider * source_provider,float * destination,uint32_t frames_to_process)207 void SincResampler::Process(AudioSourceProvider* source_provider,
208                             float* destination,
209                             uint32_t frames_to_process) {
210   DCHECK(source_provider);
211   DCHECK_GT(block_size_, kernel_size_);
212   DCHECK_GE(input_buffer_.size(), block_size_ + kernel_size_);
213   DCHECK_EQ(kernel_size_ % 2, 0u);
214 
215   source_provider_ = source_provider;
216 
217   unsigned number_of_destination_frames = frames_to_process;
218 
219   // Setup various region pointers in the buffer (see diagram above).
220   float* r0 = input_buffer_.Data() + kernel_size_ / 2;
221   float* r1 = input_buffer_.Data();
222   float* r2 = r0;
223   float* r3 = r0 + block_size_ - kernel_size_ / 2;
224   float* r4 = r0 + block_size_;
225   float* r5 = r0 + kernel_size_ / 2;
226 
227   // Step (1)
228   // Prime the input buffer at the start of the input stream.
229   if (!is_buffer_primed_) {
230     ConsumeSource(r0, block_size_ + kernel_size_ / 2);
231     is_buffer_primed_ = true;
232   }
233 
234   // Step (2)
235 
236   while (number_of_destination_frames) {
237     while (virtual_source_index_ < block_size_) {
238       // m_virtualSourceIndex lies in between two kernel offsets so figure out
239       // what they are.
240       int source_index_i = static_cast<int>(virtual_source_index_);
241       double subsample_remainder = virtual_source_index_ - source_index_i;
242 
243       double virtual_offset_index =
244           subsample_remainder * number_of_kernel_offsets_;
245       int offset_index = static_cast<int>(virtual_offset_index);
246 
247       float* k1 = kernel_storage_.Data() + offset_index * kernel_size_;
248       float* k2 = k1 + kernel_size_;
249 
250       // Initialize input pointer based on quantized m_virtualSourceIndex.
251       float* input_p = r1 + source_index_i;
252 
253       // We'll compute "convolutions" for the two kernels which straddle
254       // m_virtualSourceIndex
255       float sum1 = 0;
256       float sum2 = 0;
257 
258       // Figure out how much to weight each kernel's "convolution".
259       double kernel_interpolation_factor = virtual_offset_index - offset_index;
260 
261       // Generate a single output sample.
262       int n = kernel_size_;
263 
264 #define CONVOLVE_ONE_SAMPLE() \
265   do {                        \
266     input = *input_p++;       \
267     sum1 += input * *k1;      \
268     sum2 += input * *k2;      \
269     ++k1;                     \
270     ++k2;                     \
271   } while (0)
272 
273       {
274         float input;
275 
276 #if defined(ARCH_CPU_X86_FAMILY)
277         // If the sourceP address is not 16-byte aligned, the first several
278         // frames (at most three) should be processed seperately.
279         while ((reinterpret_cast<uintptr_t>(input_p) & 0x0F) && n) {
280           CONVOLVE_ONE_SAMPLE();
281           n--;
282         }
283 
284         // Now the inputP is aligned and start to apply SSE.
285         float* end_p = input_p + n - n % 4;
286         __m128 m_input;
287         __m128 m_k1;
288         __m128 m_k2;
289         __m128 mul1;
290         __m128 mul2;
291 
292         __m128 sums1 = _mm_setzero_ps();
293         __m128 sums2 = _mm_setzero_ps();
294         bool k1_aligned = !(reinterpret_cast<uintptr_t>(k1) & 0x0F);
295         bool k2_aligned = !(reinterpret_cast<uintptr_t>(k2) & 0x0F);
296 
297 #define LOAD_DATA(l1, l2)           \
298   do {                              \
299     m_input = _mm_load_ps(input_p); \
300     m_k1 = _mm_##l1##_ps(k1);       \
301     m_k2 = _mm_##l2##_ps(k2);       \
302   } while (0)
303 
304 #define CONVOLVE_4_SAMPLES()          \
305   do {                                \
306     mul1 = _mm_mul_ps(m_input, m_k1); \
307     mul2 = _mm_mul_ps(m_input, m_k2); \
308     sums1 = _mm_add_ps(sums1, mul1);  \
309     sums2 = _mm_add_ps(sums2, mul2);  \
310     input_p += 4;                     \
311     k1 += 4;                          \
312     k2 += 4;                          \
313   } while (0)
314 
315         if (k1_aligned && k2_aligned) {  // both aligned
316           while (input_p < end_p) {
317             LOAD_DATA(load, load);
318             CONVOLVE_4_SAMPLES();
319           }
320         } else if (!k1_aligned && k2_aligned) {  // only k2 aligned
321           while (input_p < end_p) {
322             LOAD_DATA(loadu, load);
323             CONVOLVE_4_SAMPLES();
324           }
325         } else if (k1_aligned && !k2_aligned) {  // only k1 aligned
326           while (input_p < end_p) {
327             LOAD_DATA(load, loadu);
328             CONVOLVE_4_SAMPLES();
329           }
330         } else {  // both non-aligned
331           while (input_p < end_p) {
332             LOAD_DATA(loadu, loadu);
333             CONVOLVE_4_SAMPLES();
334           }
335         }
336 
337         // Summarize the SSE results to sum1 and sum2.
338         float* group_sum_p = reinterpret_cast<float*>(&sums1);
339         sum1 +=
340             group_sum_p[0] + group_sum_p[1] + group_sum_p[2] + group_sum_p[3];
341         group_sum_p = reinterpret_cast<float*>(&sums2);
342         sum2 +=
343             group_sum_p[0] + group_sum_p[1] + group_sum_p[2] + group_sum_p[3];
344 
345         n %= 4;
346         while (n) {
347           CONVOLVE_ONE_SAMPLE();
348           n--;
349         }
350 #else
351         // FIXME: add ARM NEON optimizations for the following. The scalar
352         // code-path can probably also be optimized better.
353 
354         // Optimize size 32 and size 64 kernels by unrolling the while loop.
355         // A 20 - 30% speed improvement was measured in some cases by using this
356         // approach.
357 
358         if (n == 32) {
359           CONVOLVE_ONE_SAMPLE();  // 1
360           CONVOLVE_ONE_SAMPLE();  // 2
361           CONVOLVE_ONE_SAMPLE();  // 3
362           CONVOLVE_ONE_SAMPLE();  // 4
363           CONVOLVE_ONE_SAMPLE();  // 5
364           CONVOLVE_ONE_SAMPLE();  // 6
365           CONVOLVE_ONE_SAMPLE();  // 7
366           CONVOLVE_ONE_SAMPLE();  // 8
367           CONVOLVE_ONE_SAMPLE();  // 9
368           CONVOLVE_ONE_SAMPLE();  // 10
369           CONVOLVE_ONE_SAMPLE();  // 11
370           CONVOLVE_ONE_SAMPLE();  // 12
371           CONVOLVE_ONE_SAMPLE();  // 13
372           CONVOLVE_ONE_SAMPLE();  // 14
373           CONVOLVE_ONE_SAMPLE();  // 15
374           CONVOLVE_ONE_SAMPLE();  // 16
375           CONVOLVE_ONE_SAMPLE();  // 17
376           CONVOLVE_ONE_SAMPLE();  // 18
377           CONVOLVE_ONE_SAMPLE();  // 19
378           CONVOLVE_ONE_SAMPLE();  // 20
379           CONVOLVE_ONE_SAMPLE();  // 21
380           CONVOLVE_ONE_SAMPLE();  // 22
381           CONVOLVE_ONE_SAMPLE();  // 23
382           CONVOLVE_ONE_SAMPLE();  // 24
383           CONVOLVE_ONE_SAMPLE();  // 25
384           CONVOLVE_ONE_SAMPLE();  // 26
385           CONVOLVE_ONE_SAMPLE();  // 27
386           CONVOLVE_ONE_SAMPLE();  // 28
387           CONVOLVE_ONE_SAMPLE();  // 29
388           CONVOLVE_ONE_SAMPLE();  // 30
389           CONVOLVE_ONE_SAMPLE();  // 31
390           CONVOLVE_ONE_SAMPLE();  // 32
391         } else if (n == 64) {
392           CONVOLVE_ONE_SAMPLE();  // 1
393           CONVOLVE_ONE_SAMPLE();  // 2
394           CONVOLVE_ONE_SAMPLE();  // 3
395           CONVOLVE_ONE_SAMPLE();  // 4
396           CONVOLVE_ONE_SAMPLE();  // 5
397           CONVOLVE_ONE_SAMPLE();  // 6
398           CONVOLVE_ONE_SAMPLE();  // 7
399           CONVOLVE_ONE_SAMPLE();  // 8
400           CONVOLVE_ONE_SAMPLE();  // 9
401           CONVOLVE_ONE_SAMPLE();  // 10
402           CONVOLVE_ONE_SAMPLE();  // 11
403           CONVOLVE_ONE_SAMPLE();  // 12
404           CONVOLVE_ONE_SAMPLE();  // 13
405           CONVOLVE_ONE_SAMPLE();  // 14
406           CONVOLVE_ONE_SAMPLE();  // 15
407           CONVOLVE_ONE_SAMPLE();  // 16
408           CONVOLVE_ONE_SAMPLE();  // 17
409           CONVOLVE_ONE_SAMPLE();  // 18
410           CONVOLVE_ONE_SAMPLE();  // 19
411           CONVOLVE_ONE_SAMPLE();  // 20
412           CONVOLVE_ONE_SAMPLE();  // 21
413           CONVOLVE_ONE_SAMPLE();  // 22
414           CONVOLVE_ONE_SAMPLE();  // 23
415           CONVOLVE_ONE_SAMPLE();  // 24
416           CONVOLVE_ONE_SAMPLE();  // 25
417           CONVOLVE_ONE_SAMPLE();  // 26
418           CONVOLVE_ONE_SAMPLE();  // 27
419           CONVOLVE_ONE_SAMPLE();  // 28
420           CONVOLVE_ONE_SAMPLE();  // 29
421           CONVOLVE_ONE_SAMPLE();  // 30
422           CONVOLVE_ONE_SAMPLE();  // 31
423           CONVOLVE_ONE_SAMPLE();  // 32
424           CONVOLVE_ONE_SAMPLE();  // 33
425           CONVOLVE_ONE_SAMPLE();  // 34
426           CONVOLVE_ONE_SAMPLE();  // 35
427           CONVOLVE_ONE_SAMPLE();  // 36
428           CONVOLVE_ONE_SAMPLE();  // 37
429           CONVOLVE_ONE_SAMPLE();  // 38
430           CONVOLVE_ONE_SAMPLE();  // 39
431           CONVOLVE_ONE_SAMPLE();  // 40
432           CONVOLVE_ONE_SAMPLE();  // 41
433           CONVOLVE_ONE_SAMPLE();  // 42
434           CONVOLVE_ONE_SAMPLE();  // 43
435           CONVOLVE_ONE_SAMPLE();  // 44
436           CONVOLVE_ONE_SAMPLE();  // 45
437           CONVOLVE_ONE_SAMPLE();  // 46
438           CONVOLVE_ONE_SAMPLE();  // 47
439           CONVOLVE_ONE_SAMPLE();  // 48
440           CONVOLVE_ONE_SAMPLE();  // 49
441           CONVOLVE_ONE_SAMPLE();  // 50
442           CONVOLVE_ONE_SAMPLE();  // 51
443           CONVOLVE_ONE_SAMPLE();  // 52
444           CONVOLVE_ONE_SAMPLE();  // 53
445           CONVOLVE_ONE_SAMPLE();  // 54
446           CONVOLVE_ONE_SAMPLE();  // 55
447           CONVOLVE_ONE_SAMPLE();  // 56
448           CONVOLVE_ONE_SAMPLE();  // 57
449           CONVOLVE_ONE_SAMPLE();  // 58
450           CONVOLVE_ONE_SAMPLE();  // 59
451           CONVOLVE_ONE_SAMPLE();  // 60
452           CONVOLVE_ONE_SAMPLE();  // 61
453           CONVOLVE_ONE_SAMPLE();  // 62
454           CONVOLVE_ONE_SAMPLE();  // 63
455           CONVOLVE_ONE_SAMPLE();  // 64
456         } else {
457           while (n--) {
458             // Non-optimized using actual while loop.
459             CONVOLVE_ONE_SAMPLE();
460           }
461         }
462 #endif
463       }
464 #undef CONVOLVE_ONE_SAMPLE
465 
466       // Linearly interpolate the two "convolutions".
467       double result = (1.0 - kernel_interpolation_factor) * sum1 +
468                       kernel_interpolation_factor * sum2;
469 
470       *destination++ = result;
471 
472       // Advance the virtual index.
473       virtual_source_index_ += scale_factor_;
474 
475       --number_of_destination_frames;
476       if (!number_of_destination_frames)
477         return;
478     }
479 
480     // Wrap back around to the start.
481     virtual_source_index_ -= block_size_;
482 
483     // Step (3) Copy r3 to r1 and r4 to r2.
484     // This wraps the last input frames back to the start of the buffer.
485     memcpy(r1, r3, sizeof(float) * (kernel_size_ / 2));
486     memcpy(r2, r4, sizeof(float) * (kernel_size_ / 2));
487 
488     // Step (4)
489     // Refresh the buffer with more input.
490     ConsumeSource(r5, block_size_);
491   }
492 }
493 
494 }  // namespace blink
495