1 /* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
4  * You can obtain one at http://mozilla.org/MPL/2.0/. */
5 
6 #include "AudioNodeEngineNEON.h"
7 #if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__)
8 #  include <arm64_neon.h>
9 #else
10 #  include <arm_neon.h>
11 #endif
12 
13 //#ifdef DEBUG
14 #if 0  // see bug 921099
15 #  define ASSERT_ALIGNED(ptr)                                     \
16     MOZ_ASSERT((((uintptr_t)ptr + 15) & ~0x0F) == (uintptr_t)ptr, \
17                #ptr " has to be aligned 16-bytes aligned.");
18 #else
19 #  define ASSERT_ALIGNED(ptr)
20 #endif
21 
22 #define ADDRESS_OF(array, index) ((float32_t*)&array[index])
23 
24 namespace mozilla {
AudioBufferAddWithScale_NEON(const float * aInput,float aScale,float * aOutput,uint32_t aSize)25 void AudioBufferAddWithScale_NEON(const float* aInput, float aScale,
26                                   float* aOutput, uint32_t aSize) {
27   ASSERT_ALIGNED(aInput);
28   ASSERT_ALIGNED(aOutput);
29 
30   float32x4_t vin0, vin1, vin2, vin3;
31   float32x4_t vout0, vout1, vout2, vout3;
32   float32x4_t vscale = vmovq_n_f32(aScale);
33 
34   uint32_t dif = aSize % 16;
35   aSize -= dif;
36   unsigned i = 0;
37   for (; i < aSize; i += 16) {
38     vin0 = vld1q_f32(ADDRESS_OF(aInput, i));
39     vin1 = vld1q_f32(ADDRESS_OF(aInput, i + 4));
40     vin2 = vld1q_f32(ADDRESS_OF(aInput, i + 8));
41     vin3 = vld1q_f32(ADDRESS_OF(aInput, i + 12));
42 
43     vout0 = vld1q_f32(ADDRESS_OF(aOutput, i));
44     vout1 = vld1q_f32(ADDRESS_OF(aOutput, i + 4));
45     vout2 = vld1q_f32(ADDRESS_OF(aOutput, i + 8));
46     vout3 = vld1q_f32(ADDRESS_OF(aOutput, i + 12));
47 
48     vout0 = vmlaq_f32(vout0, vin0, vscale);
49     vout1 = vmlaq_f32(vout1, vin1, vscale);
50     vout2 = vmlaq_f32(vout2, vin2, vscale);
51     vout3 = vmlaq_f32(vout3, vin3, vscale);
52 
53     vst1q_f32(ADDRESS_OF(aOutput, i), vout0);
54     vst1q_f32(ADDRESS_OF(aOutput, i + 4), vout1);
55     vst1q_f32(ADDRESS_OF(aOutput, i + 8), vout2);
56     vst1q_f32(ADDRESS_OF(aOutput, i + 12), vout3);
57   }
58 
59   for (unsigned j = 0; j < dif; ++i, ++j) {
60     aOutput[i] += aInput[i] * aScale;
61   }
62 }
AudioBlockCopyChannelWithScale_NEON(const float * aInput,float aScale,float * aOutput)63 void AudioBlockCopyChannelWithScale_NEON(const float* aInput, float aScale,
64                                          float* aOutput) {
65   ASSERT_ALIGNED(aInput);
66   ASSERT_ALIGNED(aOutput);
67 
68   float32x4_t vin0, vin1, vin2, vin3;
69   float32x4_t vout0, vout1, vout2, vout3;
70   float32x4_t vscale = vmovq_n_f32(aScale);
71 
72   for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 16) {
73     vin0 = vld1q_f32(ADDRESS_OF(aInput, i));
74     vin1 = vld1q_f32(ADDRESS_OF(aInput, i + 4));
75     vin2 = vld1q_f32(ADDRESS_OF(aInput, i + 8));
76     vin3 = vld1q_f32(ADDRESS_OF(aInput, i + 12));
77 
78     vout0 = vmulq_f32(vin0, vscale);
79     vout1 = vmulq_f32(vin1, vscale);
80     vout2 = vmulq_f32(vin2, vscale);
81     vout3 = vmulq_f32(vin3, vscale);
82 
83     vst1q_f32(ADDRESS_OF(aOutput, i), vout0);
84     vst1q_f32(ADDRESS_OF(aOutput, i + 4), vout1);
85     vst1q_f32(ADDRESS_OF(aOutput, i + 8), vout2);
86     vst1q_f32(ADDRESS_OF(aOutput, i + 12), vout3);
87   }
88 }
89 
/**
 * Writes one audio block scaled by a per-sample gain:
 *   aOutput[k] = aInput[k] * aScale[k]   for k in [0, WEBAUDIO_BLOCK_SIZE).
 *
 * The loop advances 16 floats at a time with no scalar remainder, so it
 * relies on WEBAUDIO_BLOCK_SIZE being a multiple of 16.
 */
void AudioBlockCopyChannelWithScale_NEON(
    const float aInput[WEBAUDIO_BLOCK_SIZE],
    const float aScale[WEBAUDIO_BLOCK_SIZE],
    float aOutput[WEBAUDIO_BLOCK_SIZE]) {
  ASSERT_ALIGNED(aInput);
  ASSERT_ALIGNED(aScale);
  ASSERT_ALIGNED(aOutput);

  uint32_t pos = 0;
  while (pos < WEBAUDIO_BLOCK_SIZE) {
    // Samples.
    const float32x4_t src0 = vld1q_f32(ADDRESS_OF(aInput, pos));
    const float32x4_t src1 = vld1q_f32(ADDRESS_OF(aInput, pos + 4));
    const float32x4_t src2 = vld1q_f32(ADDRESS_OF(aInput, pos + 8));
    const float32x4_t src3 = vld1q_f32(ADDRESS_OF(aInput, pos + 12));

    // Matching per-sample gains.
    const float32x4_t gain0 = vld1q_f32(ADDRESS_OF(aScale, pos));
    const float32x4_t gain1 = vld1q_f32(ADDRESS_OF(aScale, pos + 4));
    const float32x4_t gain2 = vld1q_f32(ADDRESS_OF(aScale, pos + 8));
    const float32x4_t gain3 = vld1q_f32(ADDRESS_OF(aScale, pos + 12));

    const float32x4_t scaled0 = vmulq_f32(src0, gain0);
    const float32x4_t scaled1 = vmulq_f32(src1, gain1);
    const float32x4_t scaled2 = vmulq_f32(src2, gain2);
    const float32x4_t scaled3 = vmulq_f32(src3, gain3);

    vst1q_f32(ADDRESS_OF(aOutput, pos), scaled0);
    vst1q_f32(ADDRESS_OF(aOutput, pos + 4), scaled1);
    vst1q_f32(ADDRESS_OF(aOutput, pos + 8), scaled2);
    vst1q_f32(ADDRESS_OF(aOutput, pos + 12), scaled3);

    pos += 16;
  }
}
124 
AudioBufferInPlaceScale_NEON(float * aBlock,float aScale,uint32_t aSize)125 void AudioBufferInPlaceScale_NEON(float* aBlock, float aScale, uint32_t aSize) {
126   ASSERT_ALIGNED(aBlock);
127 
128   float32x4_t vin0, vin1, vin2, vin3;
129   float32x4_t vout0, vout1, vout2, vout3;
130   float32x4_t vscale = vmovq_n_f32(aScale);
131 
132   uint32_t dif = aSize % 16;
133   uint32_t vectorSize = aSize - dif;
134   uint32_t i = 0;
135   for (; i < vectorSize; i += 16) {
136     vin0 = vld1q_f32(ADDRESS_OF(aBlock, i));
137     vin1 = vld1q_f32(ADDRESS_OF(aBlock, i + 4));
138     vin2 = vld1q_f32(ADDRESS_OF(aBlock, i + 8));
139     vin3 = vld1q_f32(ADDRESS_OF(aBlock, i + 12));
140 
141     vout0 = vmulq_f32(vin0, vscale);
142     vout1 = vmulq_f32(vin1, vscale);
143     vout2 = vmulq_f32(vin2, vscale);
144     vout3 = vmulq_f32(vin3, vscale);
145 
146     vst1q_f32(ADDRESS_OF(aBlock, i), vout0);
147     vst1q_f32(ADDRESS_OF(aBlock, i + 4), vout1);
148     vst1q_f32(ADDRESS_OF(aBlock, i + 8), vout2);
149     vst1q_f32(ADDRESS_OF(aBlock, i + 12), vout3);
150   }
151 
152   for (unsigned j = 0; j < dif; ++i, ++j) {
153     aBlock[i] *= aScale;
154   }
155 }
156 
AudioBufferInPlaceScale_NEON(float * aBlock,float * aScale,uint32_t aSize)157 void AudioBufferInPlaceScale_NEON(float* aBlock, float* aScale,
158                                   uint32_t aSize) {
159   ASSERT_ALIGNED(aBlock);
160 
161   float32x4_t vin0, vin1, vin2, vin3;
162   float32x4_t vout0, vout1, vout2, vout3;
163   float32x4_t vscale0, vscale1, vscale2, vscale3;
164 
165   uint32_t dif = aSize % 16;
166   uint32_t vectorSize = aSize - dif;
167   uint32_t i = 0;
168   for (; i < vectorSize; i += 16) {
169     vin0 = vld1q_f32(ADDRESS_OF(aBlock, i));
170     vin1 = vld1q_f32(ADDRESS_OF(aBlock, i + 4));
171     vin2 = vld1q_f32(ADDRESS_OF(aBlock, i + 8));
172     vin3 = vld1q_f32(ADDRESS_OF(aBlock, i + 12));
173 
174     vscale0 = vld1q_f32(ADDRESS_OF(aScale, i));
175     vscale1 = vld1q_f32(ADDRESS_OF(aScale, i + 4));
176     vscale2 = vld1q_f32(ADDRESS_OF(aScale, i + 8));
177     vscale3 = vld1q_f32(ADDRESS_OF(aScale, i + 12));
178 
179     vout0 = vmulq_f32(vin0, vscale0);
180     vout1 = vmulq_f32(vin1, vscale1);
181     vout2 = vmulq_f32(vin2, vscale2);
182     vout3 = vmulq_f32(vin3, vscale3);
183 
184     vst1q_f32(ADDRESS_OF(aBlock, i), vout0);
185     vst1q_f32(ADDRESS_OF(aBlock, i + 4), vout1);
186     vst1q_f32(ADDRESS_OF(aBlock, i + 8), vout2);
187     vst1q_f32(ADDRESS_OF(aBlock, i + 12), vout3);
188   }
189 
190   for (unsigned j = 0; j < dif; ++i, ++j) {
191     aBlock[i] *= aScale[i];
192   }
193 }
194 
AudioBlockPanStereoToStereo_NEON(const float aInputL[WEBAUDIO_BLOCK_SIZE],const float aInputR[WEBAUDIO_BLOCK_SIZE],float aGainL,float aGainR,bool aIsOnTheLeft,float aOutputL[WEBAUDIO_BLOCK_SIZE],float aOutputR[WEBAUDIO_BLOCK_SIZE])195 void AudioBlockPanStereoToStereo_NEON(const float aInputL[WEBAUDIO_BLOCK_SIZE],
196                                       const float aInputR[WEBAUDIO_BLOCK_SIZE],
197                                       float aGainL, float aGainR,
198                                       bool aIsOnTheLeft,
199                                       float aOutputL[WEBAUDIO_BLOCK_SIZE],
200                                       float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
201   ASSERT_ALIGNED(aInputL);
202   ASSERT_ALIGNED(aInputR);
203   ASSERT_ALIGNED(aOutputL);
204   ASSERT_ALIGNED(aOutputR);
205 
206   float32x4_t vinL0, vinL1;
207   float32x4_t vinR0, vinR1;
208   float32x4_t voutL0, voutL1;
209   float32x4_t voutR0, voutR1;
210   float32x4_t vscaleL = vmovq_n_f32(aGainL);
211   float32x4_t vscaleR = vmovq_n_f32(aGainR);
212 
213   if (aIsOnTheLeft) {
214     for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
215       vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i));
216       vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i + 4));
217 
218       vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i));
219       vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i + 4));
220 
221       voutL0 = vmlaq_f32(vinL0, vinR0, vscaleL);
222       voutL1 = vmlaq_f32(vinL1, vinR1, vscaleL);
223 
224       vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0);
225       vst1q_f32(ADDRESS_OF(aOutputL, i + 4), voutL1);
226 
227       voutR0 = vmulq_f32(vinR0, vscaleR);
228       voutR1 = vmulq_f32(vinR1, vscaleR);
229 
230       vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0);
231       vst1q_f32(ADDRESS_OF(aOutputR, i + 4), voutR1);
232     }
233   } else {
234     for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
235       vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i));
236       vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i + 4));
237 
238       vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i));
239       vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i + 4));
240 
241       voutL0 = vmulq_f32(vinL0, vscaleL);
242       voutL1 = vmulq_f32(vinL1, vscaleL);
243 
244       vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0);
245       vst1q_f32(ADDRESS_OF(aOutputL, i + 4), voutL1);
246 
247       voutR0 = vmlaq_f32(vinR0, vinL0, vscaleR);
248       voutR1 = vmlaq_f32(vinR1, vinL1, vscaleR);
249 
250       vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0);
251       vst1q_f32(ADDRESS_OF(aOutputR, i + 4), voutR1);
252     }
253   }
254 }
255 
AudioBlockPanStereoToStereo_NEON(const float aInputL[WEBAUDIO_BLOCK_SIZE],const float aInputR[WEBAUDIO_BLOCK_SIZE],const float aGainL[WEBAUDIO_BLOCK_SIZE],const float aGainR[WEBAUDIO_BLOCK_SIZE],const bool aIsOnTheLeft[WEBAUDIO_BLOCK_SIZE],float aOutputL[WEBAUDIO_BLOCK_SIZE],float aOutputR[WEBAUDIO_BLOCK_SIZE])256 void AudioBlockPanStereoToStereo_NEON(
257     const float aInputL[WEBAUDIO_BLOCK_SIZE],
258     const float aInputR[WEBAUDIO_BLOCK_SIZE],
259     const float aGainL[WEBAUDIO_BLOCK_SIZE],
260     const float aGainR[WEBAUDIO_BLOCK_SIZE],
261     const bool aIsOnTheLeft[WEBAUDIO_BLOCK_SIZE],
262     float aOutputL[WEBAUDIO_BLOCK_SIZE], float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
263   ASSERT_ALIGNED(aInputL);
264   ASSERT_ALIGNED(aInputR);
265   ASSERT_ALIGNED(aGainL);
266   ASSERT_ALIGNED(aGainR);
267   ASSERT_ALIGNED(aIsOnTheLeft);
268   ASSERT_ALIGNED(aOutputL);
269   ASSERT_ALIGNED(aOutputR);
270 
271   float32x4_t vinL0, vinL1;
272   float32x4_t vinR0, vinR1;
273   float32x4_t voutL0, voutL1;
274   float32x4_t voutR0, voutR1;
275   float32x4_t vscaleL0, vscaleL1;
276   float32x4_t vscaleR0, vscaleR1;
277   float32x4_t onleft0, onleft1, notonleft0, notonleft1;
278 
279   float32x4_t zero = vmovq_n_f32(0);
280   uint8x8_t isOnTheLeft;
281 
282   // Although MSVC throws uninitialized value warning for voutL0 and voutL1,
283   // since we fill all lanes by vsetq_lane_f32, we can ignore it. But to avoid
284   // compiler warning, set zero.
285   voutL0 = zero;
286   voutL1 = zero;
287 
288   for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
289     vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i));
290     vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i + 4));
291 
292     vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i));
293     vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i + 4));
294 
295     vscaleL0 = vld1q_f32(ADDRESS_OF(aGainL, i));
296     vscaleL1 = vld1q_f32(ADDRESS_OF(aGainL, i + 4));
297 
298     vscaleR0 = vld1q_f32(ADDRESS_OF(aGainR, i));
299     vscaleR1 = vld1q_f32(ADDRESS_OF(aGainR, i + 4));
300 
301     // Load output with boolean "on the left" values. This assumes that
302     // bools are stored as a single byte.
303     isOnTheLeft = vld1_u8((uint8_t*)&aIsOnTheLeft[i]);
304     voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 0), voutL0, 0);
305     voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 1), voutL0, 1);
306     voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 2), voutL0, 2);
307     voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 3), voutL0, 3);
308     voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 4), voutL1, 0);
309     voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 5), voutL1, 1);
310     voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 6), voutL1, 2);
311     voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 7), voutL1, 3);
312 
313     // Convert the boolean values into masks by setting all bits to 1
314     // if true.
315     voutL0 = (float32x4_t)vcgtq_f32(voutL0, zero);
316     voutL1 = (float32x4_t)vcgtq_f32(voutL1, zero);
317 
318     // The right output masks are the same as the left masks
319     voutR0 = voutL0;
320     voutR1 = voutL1;
321 
322     // Calculate left channel assuming isOnTheLeft
323     onleft0 = vmlaq_f32(vinL0, vinR0, vscaleL0);
324     onleft1 = vmlaq_f32(vinL1, vinR1, vscaleL0);
325 
326     // Calculate left channel assuming not isOnTheLeft
327     notonleft0 = vmulq_f32(vinL0, vscaleL0);
328     notonleft1 = vmulq_f32(vinL1, vscaleL1);
329 
330     // Write results using previously stored masks
331     voutL0 = vbslq_f32((uint32x4_t)voutL0, onleft0, notonleft0);
332     voutL1 = vbslq_f32((uint32x4_t)voutL1, onleft1, notonleft1);
333 
334     // Calculate right channel assuming isOnTheLeft
335     onleft0 = vmulq_f32(vinR0, vscaleR0);
336     onleft1 = vmulq_f32(vinR1, vscaleR1);
337 
338     // Calculate right channel assuming not isOnTheLeft
339     notonleft0 = vmlaq_f32(vinR0, vinL0, vscaleR0);
340     notonleft1 = vmlaq_f32(vinR1, vinL1, vscaleR1);
341 
342     // Write results using previously stored masks
343     voutR0 = vbslq_f32((uint32x4_t)voutR0, onleft0, notonleft0);
344     voutR1 = vbslq_f32((uint32x4_t)voutR1, onleft1, notonleft1);
345 
346     vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0);
347     vst1q_f32(ADDRESS_OF(aOutputL, i + 4), voutL1);
348     vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0);
349     vst1q_f32(ADDRESS_OF(aOutputR, i + 4), voutR1);
350   }
351 }
352 }  // namespace mozilla
353