1 /* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
4 * You can obtain one at http://mozilla.org/MPL/2.0/. */
5
6 #include "AudioNodeEngineNEON.h"
7 #if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__)
8 # include <arm64_neon.h>
9 #else
10 # include <arm_neon.h>
11 #endif
12
// ASSERT_ALIGNED(ptr): debug check that `ptr` lies on a 16-byte boundary.
// The check is currently compiled out (see bug 921099), so the macro expands
// to nothing; the checking variant is kept for when it can be re-enabled.
// The macro argument is parenthesised so that expression arguments are cast
// as a whole, and the definition carries no trailing semicolon because every
// call site supplies its own.
// #ifdef DEBUG
#if 0  // see bug 921099
#  define ASSERT_ALIGNED(ptr)                      \
    MOZ_ASSERT(((uintptr_t)(ptr) & 0x0F) == 0,     \
               #ptr " has to be 16-byte aligned.")
#else
#  define ASSERT_ALIGNED(ptr)
#endif
21
// ADDRESS_OF(array, index): address of element `index` of `array`, cast to
// float32_t* as passed to the NEON load/store intrinsics in this file.
// Note that the cast also removes any const qualifier from `array`.
// `array` is parenthesised so pointer expressions (e.g. `p + 4`) expand
// correctly.
#define ADDRESS_OF(array, index) ((float32_t*)&(array)[index])
23
24 namespace mozilla {
AudioBufferAddWithScale_NEON(const float * aInput,float aScale,float * aOutput,uint32_t aSize)25 void AudioBufferAddWithScale_NEON(const float* aInput, float aScale,
26 float* aOutput, uint32_t aSize) {
27 ASSERT_ALIGNED(aInput);
28 ASSERT_ALIGNED(aOutput);
29
30 float32x4_t vin0, vin1, vin2, vin3;
31 float32x4_t vout0, vout1, vout2, vout3;
32 float32x4_t vscale = vmovq_n_f32(aScale);
33
34 uint32_t dif = aSize % 16;
35 aSize -= dif;
36 unsigned i = 0;
37 for (; i < aSize; i += 16) {
38 vin0 = vld1q_f32(ADDRESS_OF(aInput, i));
39 vin1 = vld1q_f32(ADDRESS_OF(aInput, i + 4));
40 vin2 = vld1q_f32(ADDRESS_OF(aInput, i + 8));
41 vin3 = vld1q_f32(ADDRESS_OF(aInput, i + 12));
42
43 vout0 = vld1q_f32(ADDRESS_OF(aOutput, i));
44 vout1 = vld1q_f32(ADDRESS_OF(aOutput, i + 4));
45 vout2 = vld1q_f32(ADDRESS_OF(aOutput, i + 8));
46 vout3 = vld1q_f32(ADDRESS_OF(aOutput, i + 12));
47
48 vout0 = vmlaq_f32(vout0, vin0, vscale);
49 vout1 = vmlaq_f32(vout1, vin1, vscale);
50 vout2 = vmlaq_f32(vout2, vin2, vscale);
51 vout3 = vmlaq_f32(vout3, vin3, vscale);
52
53 vst1q_f32(ADDRESS_OF(aOutput, i), vout0);
54 vst1q_f32(ADDRESS_OF(aOutput, i + 4), vout1);
55 vst1q_f32(ADDRESS_OF(aOutput, i + 8), vout2);
56 vst1q_f32(ADDRESS_OF(aOutput, i + 12), vout3);
57 }
58
59 for (unsigned j = 0; j < dif; ++i, ++j) {
60 aOutput[i] += aInput[i] * aScale;
61 }
62 }
AudioBlockCopyChannelWithScale_NEON(const float * aInput,float aScale,float * aOutput)63 void AudioBlockCopyChannelWithScale_NEON(const float* aInput, float aScale,
64 float* aOutput) {
65 ASSERT_ALIGNED(aInput);
66 ASSERT_ALIGNED(aOutput);
67
68 float32x4_t vin0, vin1, vin2, vin3;
69 float32x4_t vout0, vout1, vout2, vout3;
70 float32x4_t vscale = vmovq_n_f32(aScale);
71
72 for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 16) {
73 vin0 = vld1q_f32(ADDRESS_OF(aInput, i));
74 vin1 = vld1q_f32(ADDRESS_OF(aInput, i + 4));
75 vin2 = vld1q_f32(ADDRESS_OF(aInput, i + 8));
76 vin3 = vld1q_f32(ADDRESS_OF(aInput, i + 12));
77
78 vout0 = vmulq_f32(vin0, vscale);
79 vout1 = vmulq_f32(vin1, vscale);
80 vout2 = vmulq_f32(vin2, vscale);
81 vout3 = vmulq_f32(vin3, vscale);
82
83 vst1q_f32(ADDRESS_OF(aOutput, i), vout0);
84 vst1q_f32(ADDRESS_OF(aOutput, i + 4), vout1);
85 vst1q_f32(ADDRESS_OF(aOutput, i + 8), vout2);
86 vst1q_f32(ADDRESS_OF(aOutput, i + 12), vout3);
87 }
88 }
89
// Element-wise product of one block of samples with a per-sample gain array:
//   aOutput[k] = aInput[k] * aScale[k]   for k in [0, WEBAUDIO_BLOCK_SIZE).
// Sixteen samples are processed per iteration and there is no scalar tail.
void AudioBlockCopyChannelWithScale_NEON(
    const float aInput[WEBAUDIO_BLOCK_SIZE],
    const float aScale[WEBAUDIO_BLOCK_SIZE],
    float aOutput[WEBAUDIO_BLOCK_SIZE]) {
  ASSERT_ALIGNED(aInput);
  ASSERT_ALIGNED(aScale);
  ASSERT_ALIGNED(aOutput);

  for (uint32_t base = 0; base < WEBAUDIO_BLOCK_SIZE; base += 16) {
    // Samples first, then the matching gains; nothing is stored until all
    // eight loads are done.
    const float32x4_t src0 = vld1q_f32(ADDRESS_OF(aInput, base));
    const float32x4_t src1 = vld1q_f32(ADDRESS_OF(aInput, base + 4));
    const float32x4_t src2 = vld1q_f32(ADDRESS_OF(aInput, base + 8));
    const float32x4_t src3 = vld1q_f32(ADDRESS_OF(aInput, base + 12));

    const float32x4_t gain0 = vld1q_f32(ADDRESS_OF(aScale, base));
    const float32x4_t gain1 = vld1q_f32(ADDRESS_OF(aScale, base + 4));
    const float32x4_t gain2 = vld1q_f32(ADDRESS_OF(aScale, base + 8));
    const float32x4_t gain3 = vld1q_f32(ADDRESS_OF(aScale, base + 12));

    const float32x4_t product0 = vmulq_f32(src0, gain0);
    const float32x4_t product1 = vmulq_f32(src1, gain1);
    const float32x4_t product2 = vmulq_f32(src2, gain2);
    const float32x4_t product3 = vmulq_f32(src3, gain3);

    vst1q_f32(ADDRESS_OF(aOutput, base), product0);
    vst1q_f32(ADDRESS_OF(aOutput, base + 4), product1);
    vst1q_f32(ADDRESS_OF(aOutput, base + 8), product2);
    vst1q_f32(ADDRESS_OF(aOutput, base + 12), product3);
  }
}
124
AudioBufferInPlaceScale_NEON(float * aBlock,float aScale,uint32_t aSize)125 void AudioBufferInPlaceScale_NEON(float* aBlock, float aScale, uint32_t aSize) {
126 ASSERT_ALIGNED(aBlock);
127
128 float32x4_t vin0, vin1, vin2, vin3;
129 float32x4_t vout0, vout1, vout2, vout3;
130 float32x4_t vscale = vmovq_n_f32(aScale);
131
132 uint32_t dif = aSize % 16;
133 uint32_t vectorSize = aSize - dif;
134 uint32_t i = 0;
135 for (; i < vectorSize; i += 16) {
136 vin0 = vld1q_f32(ADDRESS_OF(aBlock, i));
137 vin1 = vld1q_f32(ADDRESS_OF(aBlock, i + 4));
138 vin2 = vld1q_f32(ADDRESS_OF(aBlock, i + 8));
139 vin3 = vld1q_f32(ADDRESS_OF(aBlock, i + 12));
140
141 vout0 = vmulq_f32(vin0, vscale);
142 vout1 = vmulq_f32(vin1, vscale);
143 vout2 = vmulq_f32(vin2, vscale);
144 vout3 = vmulq_f32(vin3, vscale);
145
146 vst1q_f32(ADDRESS_OF(aBlock, i), vout0);
147 vst1q_f32(ADDRESS_OF(aBlock, i + 4), vout1);
148 vst1q_f32(ADDRESS_OF(aBlock, i + 8), vout2);
149 vst1q_f32(ADDRESS_OF(aBlock, i + 12), vout3);
150 }
151
152 for (unsigned j = 0; j < dif; ++i, ++j) {
153 aBlock[i] *= aScale;
154 }
155 }
156
AudioBufferInPlaceScale_NEON(float * aBlock,float * aScale,uint32_t aSize)157 void AudioBufferInPlaceScale_NEON(float* aBlock, float* aScale,
158 uint32_t aSize) {
159 ASSERT_ALIGNED(aBlock);
160
161 float32x4_t vin0, vin1, vin2, vin3;
162 float32x4_t vout0, vout1, vout2, vout3;
163 float32x4_t vscale0, vscale1, vscale2, vscale3;
164
165 uint32_t dif = aSize % 16;
166 uint32_t vectorSize = aSize - dif;
167 uint32_t i = 0;
168 for (; i < vectorSize; i += 16) {
169 vin0 = vld1q_f32(ADDRESS_OF(aBlock, i));
170 vin1 = vld1q_f32(ADDRESS_OF(aBlock, i + 4));
171 vin2 = vld1q_f32(ADDRESS_OF(aBlock, i + 8));
172 vin3 = vld1q_f32(ADDRESS_OF(aBlock, i + 12));
173
174 vscale0 = vld1q_f32(ADDRESS_OF(aScale, i));
175 vscale1 = vld1q_f32(ADDRESS_OF(aScale, i + 4));
176 vscale2 = vld1q_f32(ADDRESS_OF(aScale, i + 8));
177 vscale3 = vld1q_f32(ADDRESS_OF(aScale, i + 12));
178
179 vout0 = vmulq_f32(vin0, vscale0);
180 vout1 = vmulq_f32(vin1, vscale1);
181 vout2 = vmulq_f32(vin2, vscale2);
182 vout3 = vmulq_f32(vin3, vscale3);
183
184 vst1q_f32(ADDRESS_OF(aBlock, i), vout0);
185 vst1q_f32(ADDRESS_OF(aBlock, i + 4), vout1);
186 vst1q_f32(ADDRESS_OF(aBlock, i + 8), vout2);
187 vst1q_f32(ADDRESS_OF(aBlock, i + 12), vout3);
188 }
189
190 for (unsigned j = 0; j < dif; ++i, ++j) {
191 aBlock[i] *= aScale[i];
192 }
193 }
194
// Stereo-to-stereo pan of one block using a single gain pair.
//   aIsOnTheLeft:  outL = inL + inR * aGainL;  outR = inR * aGainR
//   otherwise:     outL = inL * aGainL;        outR = inR + inL * aGainR
// Eight samples per channel are processed per iteration over
// [0, WEBAUDIO_BLOCK_SIZE).
void AudioBlockPanStereoToStereo_NEON(const float aInputL[WEBAUDIO_BLOCK_SIZE],
                                      const float aInputR[WEBAUDIO_BLOCK_SIZE],
                                      float aGainL, float aGainR,
                                      bool aIsOnTheLeft,
                                      float aOutputL[WEBAUDIO_BLOCK_SIZE],
                                      float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
  ASSERT_ALIGNED(aInputL);
  ASSERT_ALIGNED(aInputR);
  ASSERT_ALIGNED(aOutputL);
  ASSERT_ALIGNED(aOutputR);

  const float32x4_t gainL = vmovq_n_f32(aGainL);
  const float32x4_t gainR = vmovq_n_f32(aGainR);

  if (!aIsOnTheLeft) {
    // Panned right: left is attenuated, right receives part of the left.
    for (uint32_t pos = 0; pos < WEBAUDIO_BLOCK_SIZE; pos += 8) {
      const float32x4_t left0 = vld1q_f32(ADDRESS_OF(aInputL, pos));
      const float32x4_t left1 = vld1q_f32(ADDRESS_OF(aInputL, pos + 4));

      const float32x4_t right0 = vld1q_f32(ADDRESS_OF(aInputR, pos));
      const float32x4_t right1 = vld1q_f32(ADDRESS_OF(aInputR, pos + 4));

      const float32x4_t outLeft0 = vmulq_f32(left0, gainL);
      const float32x4_t outLeft1 = vmulq_f32(left1, gainL);

      vst1q_f32(ADDRESS_OF(aOutputL, pos), outLeft0);
      vst1q_f32(ADDRESS_OF(aOutputL, pos + 4), outLeft1);

      const float32x4_t outRight0 = vmlaq_f32(right0, left0, gainR);
      const float32x4_t outRight1 = vmlaq_f32(right1, left1, gainR);

      vst1q_f32(ADDRESS_OF(aOutputR, pos), outRight0);
      vst1q_f32(ADDRESS_OF(aOutputR, pos + 4), outRight1);
    }
    return;
  }

  // Panned left: left receives part of the right, right is attenuated.
  for (uint32_t pos = 0; pos < WEBAUDIO_BLOCK_SIZE; pos += 8) {
    const float32x4_t left0 = vld1q_f32(ADDRESS_OF(aInputL, pos));
    const float32x4_t left1 = vld1q_f32(ADDRESS_OF(aInputL, pos + 4));

    const float32x4_t right0 = vld1q_f32(ADDRESS_OF(aInputR, pos));
    const float32x4_t right1 = vld1q_f32(ADDRESS_OF(aInputR, pos + 4));

    const float32x4_t outLeft0 = vmlaq_f32(left0, right0, gainL);
    const float32x4_t outLeft1 = vmlaq_f32(left1, right1, gainL);

    vst1q_f32(ADDRESS_OF(aOutputL, pos), outLeft0);
    vst1q_f32(ADDRESS_OF(aOutputL, pos + 4), outLeft1);

    const float32x4_t outRight0 = vmulq_f32(right0, gainR);
    const float32x4_t outRight1 = vmulq_f32(right1, gainR);

    vst1q_f32(ADDRESS_OF(aOutputR, pos), outRight0);
    vst1q_f32(ADDRESS_OF(aOutputR, pos + 4), outRight1);
  }
}
255
// Stereo-to-stereo pan of one block with per-sample gains and a per-sample
// "on the left" flag. For every k in [0, WEBAUDIO_BLOCK_SIZE):
//   aIsOnTheLeft[k]:  outL[k] = inL[k] + inR[k] * aGainL[k]
//                     outR[k] = inR[k] * aGainR[k]
//   otherwise:        outL[k] = inL[k] * aGainL[k]
//                     outR[k] = inR[k] + inL[k] * aGainR[k]
// Eight samples are processed per iteration as two 4-lane halves. Both
// candidate results are computed for every lane and the flag-derived mask
// selects between them, so the loop body is branch-free.
void AudioBlockPanStereoToStereo_NEON(
    const float aInputL[WEBAUDIO_BLOCK_SIZE],
    const float aInputR[WEBAUDIO_BLOCK_SIZE],
    const float aGainL[WEBAUDIO_BLOCK_SIZE],
    const float aGainR[WEBAUDIO_BLOCK_SIZE],
    const bool aIsOnTheLeft[WEBAUDIO_BLOCK_SIZE],
    float aOutputL[WEBAUDIO_BLOCK_SIZE], float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
  ASSERT_ALIGNED(aInputL);
  ASSERT_ALIGNED(aInputR);
  ASSERT_ALIGNED(aGainL);
  ASSERT_ALIGNED(aGainR);
  ASSERT_ALIGNED(aIsOnTheLeft);
  ASSERT_ALIGNED(aOutputL);
  ASSERT_ALIGNED(aOutputR);

  // The flags are loaded eight at a time as raw bytes.
  static_assert(sizeof(bool) == 1,
                "aIsOnTheLeft is read as one byte per element");

  const uint32x4_t zero = vdupq_n_u32(0);

  for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
    const float32x4_t vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i));
    const float32x4_t vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i + 4));

    const float32x4_t vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i));
    const float32x4_t vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i + 4));

    const float32x4_t vscaleL0 = vld1q_f32(ADDRESS_OF(aGainL, i));
    const float32x4_t vscaleL1 = vld1q_f32(ADDRESS_OF(aGainL, i + 4));

    const float32x4_t vscaleR0 = vld1q_f32(ADDRESS_OF(aGainR, i));
    const float32x4_t vscaleR1 = vld1q_f32(ADDRESS_OF(aGainR, i + 4));

    // Widen the eight flag bytes to 32-bit lanes, then turn every non-zero
    // lane into an all-ones select mask (zero lanes stay all-zero).
    const uint8x8_t flags8 =
        vld1_u8(reinterpret_cast<const uint8_t*>(&aIsOnTheLeft[i]));
    const uint16x8_t flags16 = vmovl_u8(flags8);
    const uint32x4_t mask0 = vcgtq_u32(vmovl_u16(vget_low_u16(flags16)), zero);
    const uint32x4_t mask1 =
        vcgtq_u32(vmovl_u16(vget_high_u16(flags16)), zero);

    // Left channel, assuming isOnTheLeft. Each half must use its own gain
    // vector: samples i+4..i+7 take vscaleL1, not vscaleL0.
    const float32x4_t leftOn0 = vmlaq_f32(vinL0, vinR0, vscaleL0);
    const float32x4_t leftOn1 = vmlaq_f32(vinL1, vinR1, vscaleL1);

    // Left channel, assuming not isOnTheLeft.
    const float32x4_t leftOff0 = vmulq_f32(vinL0, vscaleL0);
    const float32x4_t leftOff1 = vmulq_f32(vinL1, vscaleL1);

    // Right channel, assuming isOnTheLeft.
    const float32x4_t rightOn0 = vmulq_f32(vinR0, vscaleR0);
    const float32x4_t rightOn1 = vmulq_f32(vinR1, vscaleR1);

    // Right channel, assuming not isOnTheLeft.
    const float32x4_t rightOff0 = vmlaq_f32(vinR0, vinL0, vscaleR0);
    const float32x4_t rightOff1 = vmlaq_f32(vinR1, vinL1, vscaleR1);

    // Per lane: take the "on" result where the mask is set, else the "off"
    // result. The same masks apply to both output channels.
    const float32x4_t voutL0 = vbslq_f32(mask0, leftOn0, leftOff0);
    const float32x4_t voutL1 = vbslq_f32(mask1, leftOn1, leftOff1);
    const float32x4_t voutR0 = vbslq_f32(mask0, rightOn0, rightOff0);
    const float32x4_t voutR1 = vbslq_f32(mask1, rightOn1, rightOff1);

    vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0);
    vst1q_f32(ADDRESS_OF(aOutputL, i + 4), voutL1);
    vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0);
    vst1q_f32(ADDRESS_OF(aOutputR, i + 4), voutR1);
  }
}
352 } // namespace mozilla
353