1 /*
2     Copyright (C) 2000 Paul Davis
3 
4     This program is free software; you can redistribute it and/or modify
5     it under the terms of the GNU General Public License as published by
6     the Free Software Foundation; either version 2 of the License, or
7     (at your option) any later version.
8 
9     This program is distributed in the hope that it will be useful,
10     but WITHOUT ANY WARRANTY; without even the implied warranty of
11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12     GNU General Public License for more details.
13 
14     You should have received a copy of the GNU General Public License
15     along with this program; if not, write to the Free Software
16     Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17 
18 */
19 
20 #define _ISOC9X_SOURCE  1
21 #define _ISOC99_SOURCE  1
22 
23 #define __USE_ISOC9X    1
24 #define __USE_ISOC99    1
25 
26 #include <stdio.h>
27 #include <string.h>
28 #include <math.h>
29 #include <memory.h>
30 #include <stdlib.h>
31 #include <stdint.h>
32 #include <limits.h>
33 #ifdef __linux__
34 #include <endian.h>
35 #endif
36 #include "memops.h"
37 
38 #if defined (__SSE2__) && !defined (__sun__)
39 #include <emmintrin.h>
40 #ifdef __SSE4_1__
41 #include <smmintrin.h>
42 #endif
43 #endif
44 
45 #if defined (__ARM_NEON__) || defined (__ARM_NEON)
46 #include <arm_neon.h>
47 #endif
48 
49 /* Notes about these *_SCALING values.
50 
51    the MAX_<N>BIT values are floating point. when multiplied by
52    a full-scale normalized floating point sample value (-1.0..+1.0)
53    they should give the maximum value representable with an integer
54    sample type of N bits. Note that this is asymmetric. Sample ranges
   for signed integer, 2's complement values are -(2^(N-1)) to +(2^(N-1)-1)
56 
57    Complications
58    -------------
59    If we use +2^(N-1) for the scaling factors, we run into a problem:
60 
61    if we start with a normalized float value of -1.0, scaling
62    to 24 bits would give -8388608 (-2^23), which is ideal.
63    But with +1.0, we get +8388608, which is technically out of range.
64 
65    We never multiply a full range normalized value by this constant,
66    but we could multiply it by a positive value that is close enough to +1.0
   to produce a value > +(2^(N-1)-1).
68 
69    There is no way around this paradox without wasting CPU cycles to determine
   which scaling factor to use (i.e. determine if it's negative or not,
71    use the right factor).
72 
73    So, for now (October 2008) we use 2^(N-1)-1 as the scaling factor.
74 */
75 
/* float multipliers applied to normalized samples: 2^23-1 and 2^15-1 */
#define SAMPLE_24BIT_SCALING  8388607.0f
#define SAMPLE_16BIT_SCALING  32767.0f
78 
79 /* these are just values to use if the floating point value was out of range
80 
81    advice from Fons Adriaensen: make the limits symmetrical
82  */
83 
/* 24 bit clip limits in integer and float form; MIN is exactly -MAX */
#define SAMPLE_24BIT_MAX  8388607
#define SAMPLE_24BIT_MIN  -8388607
#define SAMPLE_24BIT_MAX_F  8388607.0f
#define SAMPLE_24BIT_MIN_F  -8388607.0f

/* 16 bit clip limits in integer and float form; MIN is exactly -MAX */
#define SAMPLE_16BIT_MAX  32767
#define SAMPLE_16BIT_MIN  -32767
#define SAMPLE_16BIT_MAX_F  32767.0f
#define SAMPLE_16BIT_MIN_F  -32767.0f
93 
94 /* these mark the outer edges of the range considered "within" range
95    for a floating point sample value. values outside (and on the boundaries)
96    of this range will be clipped before conversion; values within this
97    range will be scaled to appropriate values for the target sample
98    type.
99 */
100 
/* inputs at or beyond these bounds take the clip branch of the float_* macros */
#define NORMALIZED_FLOAT_MIN -1.0f
#define NORMALIZED_FLOAT_MAX  1.0f
103 
104 /* define this in case we end up on a platform that is missing
105    the real lrintf functions
106 */
107 
/* float -> long rounding used by the float_* macros; forwards to lrintf() */
#define f_round(f) lrintf(f)
109 
/* Convert the normalized float sample "s" to a signed 16 bit value in "d".

   Inputs at or beyond NORMALIZED_FLOAT_MIN / NORMALIZED_FLOAT_MAX are
   clipped to SAMPLE_16BIT_MIN / SAMPLE_16BIT_MAX; anything in between is
   multiplied by SAMPLE_16BIT_SCALING and rounded with f_round().

   "s" is evaluated more than once, so it must not have side effects.
   The do { } while (0) wrapper makes the expansion a single statement,
   so the macro can be used safely in front of an "else".
 */
#define float_16(s, d)\
	do {\
		if ((s) <= NORMALIZED_FLOAT_MIN) {\
			(d) = SAMPLE_16BIT_MIN;\
		} else if ((s) >= NORMALIZED_FLOAT_MAX) {\
			(d) = SAMPLE_16BIT_MAX;\
		} else {\
			(d) = f_round ((s) * SAMPLE_16BIT_SCALING);\
		}\
	} while (0)
118 
/* call this when "s" has already been scaled (e.g. when dithering)

   Inputs at or beyond SAMPLE_16BIT_MIN_F / SAMPLE_16BIT_MAX_F are clipped
   to the integer limits SAMPLE_16BIT_MIN / SAMPLE_16BIT_MAX; anything in
   between is rounded with f_round().

   "s" is evaluated more than once, so it must not have side effects.
   The do { } while (0) wrapper makes the expansion a single statement.
 */

#define float_16_scaled(s, d)\
	do {\
		if ((s) <= SAMPLE_16BIT_MIN_F) {\
			(d) = SAMPLE_16BIT_MIN;\
		} else if ((s) >= SAMPLE_16BIT_MAX_F) {\
			(d) = SAMPLE_16BIT_MAX;\
		} else {\
			(d) = f_round ((s));\
		}\
	} while (0)
130 
/* Convert the normalized float sample "s" to a 32 bit integer in "d" whose
   upper 24 bits carry the sample (the low byte is zero).

   Inputs at or beyond NORMALIZED_FLOAT_MIN / NORMALIZED_FLOAT_MAX are
   clipped to SAMPLE_24BIT_MIN / SAMPLE_24BIT_MAX before being moved up.

   The move into the upper bytes is written as "* 256" rather than "<< 8":
   left-shifting a negative value is undefined behaviour in C, while the
   multiplication is well defined for every 24 bit input.

   "s" is evaluated more than once, so it must not have side effects.
   The do { } while (0) wrapper makes the expansion a single statement.
 */
#define float_24u32(s, d) \
	do {\
		if ((s) <= NORMALIZED_FLOAT_MIN) {\
			(d) = SAMPLE_24BIT_MIN * 256;\
		} else if ((s) >= NORMALIZED_FLOAT_MAX) {\
			(d) = SAMPLE_24BIT_MAX * 256;\
		} else {\
			(d) = f_round ((s) * SAMPLE_24BIT_SCALING) * 256;\
		}\
	} while (0)
139 
/* call this when "s" has already been scaled (e.g. when dithering)

   Produces a 32 bit integer in "d" whose upper 24 bits carry the sample.
   Inputs at or beyond SAMPLE_24BIT_MIN_F / SAMPLE_24BIT_MAX_F are clipped.

   The move into the upper bytes is written as "* 256" rather than "<< 8":
   left-shifting a negative value is undefined behaviour in C.

   "s" is evaluated more than once, so it must not have side effects.
   The do { } while (0) wrapper makes the expansion a single statement.
 */

#define float_24u32_scaled(s, d)\
	do {\
		if ((s) <= SAMPLE_24BIT_MIN_F) {\
			(d) = SAMPLE_24BIT_MIN * 256;\
		} else if ((s) >= SAMPLE_24BIT_MAX_F) {\
			(d) = SAMPLE_24BIT_MAX * 256;\
		} else {\
			(d) = f_round ((s)) * 256;\
		}\
	} while (0)
151 
/* Convert the normalized float sample "s" to a signed 24 bit value in "d".

   Inputs at or beyond NORMALIZED_FLOAT_MIN / NORMALIZED_FLOAT_MAX are
   clipped to SAMPLE_24BIT_MIN / SAMPLE_24BIT_MAX; anything in between is
   multiplied by SAMPLE_24BIT_SCALING and rounded with f_round().

   "s" is evaluated more than once, so it must not have side effects.
   The do { } while (0) wrapper makes the expansion a single statement.
 */
#define float_24(s, d) \
	do {\
		if ((s) <= NORMALIZED_FLOAT_MIN) {\
			(d) = SAMPLE_24BIT_MIN;\
		} else if ((s) >= NORMALIZED_FLOAT_MAX) {\
			(d) = SAMPLE_24BIT_MAX;\
		} else {\
			(d) = f_round ((s) * SAMPLE_24BIT_SCALING);\
		}\
	} while (0)
160 
/* call this when "s" has already been scaled (e.g. when dithering)

   Inputs at or beyond SAMPLE_24BIT_MIN_F / SAMPLE_24BIT_MAX_F are clipped
   to SAMPLE_24BIT_MIN / SAMPLE_24BIT_MAX; anything in between is rounded
   with f_round().

   "s" is evaluated more than once, so it must not have side effects.
   The do { } while (0) wrapper makes the expansion a single statement.
 */

#define float_24_scaled(s, d)\
	do {\
		if ((s) <= SAMPLE_24BIT_MIN_F) {\
			(d) = SAMPLE_24BIT_MIN;\
		} else if ((s) >= SAMPLE_24BIT_MAX_F) {\
			(d) = SAMPLE_24BIT_MAX;\
		} else {\
			(d) = f_round ((s));\
		}\
	} while (0)
172 
173 
174 #if defined (__SSE2__) && !defined (__sun__)
175 
/* Generates the same vector as _mm_set_ps(1.f, 1.f, 1.f, 1.f) without
   loading a constant from memory.

   Comparing a register with itself sets every bit; shifting each 32 bit
   lane right by 25 leaves 0x7f, and shifting that left by 23 places it in
   the exponent field, i.e. the bit pattern 0x3f800000 == 1.0f.
 */
static inline __m128 gen_one(void)
{
    const __m128i zero = _mm_setzero_si128();
    const __m128i all_set = _mm_cmpeq_epi32(zero, zero);
    const __m128i one_bits = _mm_slli_epi32(_mm_srli_epi32(all_set, 25), 23);

    /* reinterpret the bits as floats; the intrinsic replaces the
       GNU-specific (__m128) vector cast */
    return _mm_castsi128_ps(one_bits);
}
183 
/* Clamp each of the four lanes of "s" into [min, max]: first raise the
   lanes to at least "min", then cap the result at "max". */
static inline __m128 clip(__m128 s, __m128 min, __m128 max)
{
    const __m128 raised = _mm_max_ps(s, min);

    return _mm_min_ps(max, raised);
}
188 
float_24_sse(__m128 s)189 static inline __m128i float_24_sse(__m128 s)
190 {
191     const __m128 upper_bound = gen_one(); /* NORMALIZED_FLOAT_MAX */
192     const __m128 lower_bound = _mm_sub_ps(_mm_setzero_ps(), upper_bound);
193 
194     __m128 clipped = clip(s, lower_bound, upper_bound);
195     __m128 scaled = _mm_mul_ps(clipped, _mm_set1_ps(SAMPLE_24BIT_SCALING));
196     return _mm_cvtps_epi32(scaled);
197 }
198 #endif
199 
200 
201 #if defined (__ARM_NEON__) || defined (__ARM_NEON)
202 
/* Clamp each of the four lanes of "s" into [min, max]: first raise the
   lanes to at least "min", then cap the result at "max". */
static inline float32x4_t clip(float32x4_t s, float32x4_t min, float32x4_t max)
{
	const float32x4_t raised = vmaxq_f32(s, min);

	return vminq_f32(max, raised);
}
207 
/* Convert four normalized float samples to 24 bit integers: clip to
   [NORMALIZED_FLOAT_MIN, NORMALIZED_FLOAT_MAX], multiply by
   SAMPLE_24BIT_SCALING and convert with vcvtq_s32_f32. */
static inline int32x4_t float_24_neon(float32x4_t s)
{
	const float32x4_t lo = vdupq_n_f32(NORMALIZED_FLOAT_MIN);
	const float32x4_t hi = vdupq_n_f32(NORMALIZED_FLOAT_MAX);
	const float32x4_t gain = vdupq_n_f32(SAMPLE_24BIT_SCALING);

	return vcvtq_s32_f32(vmulq_f32(clip(s, lo, hi), gain));
}
217 
/* Convert four normalized float samples to 16 bit integers: clip to
   [NORMALIZED_FLOAT_MIN, NORMALIZED_FLOAT_MAX], multiply by
   SAMPLE_16BIT_SCALING, convert with vcvtq_s32_f32 and narrow each lane
   to 16 bits. */
static inline int16x4_t float_16_neon(float32x4_t s)
{
	const float32x4_t lo = vdupq_n_f32(NORMALIZED_FLOAT_MIN);
	const float32x4_t hi = vdupq_n_f32(NORMALIZED_FLOAT_MAX);
	const float32x4_t gain = vdupq_n_f32(SAMPLE_16BIT_SCALING);
	const int32x4_t wide = vcvtq_s32_f32(vmulq_f32(clip(s, lo, hi), gain));

	return vmovn_s32(wide);
}
227 #endif
228 
/* Linear Congruential noise generator. From the music-dsp list
 * less random than rand(), but good enough and 10x faster
 *
 * The generator state lives in the file-scope variable "seed"; every call
 * advances it by one step and returns the new state. Unsigned arithmetic
 * is used throughout, so the multiplication wraps instead of overflowing.
 */
static unsigned int seed = 22222;

static inline unsigned int fast_rand(void) {
	seed = (seed * 196314165u) + 907633515u;
	return seed;
}
238 
239 /* functions for native float sample data */
240 
/* Copy nsamples floats from the strided byte buffer "src" (one sample every
   src_skip bytes) into the contiguous sample buffer "dst". No scaling or
   byte reordering is applied.

   Each sample is fetched with memcpy() so that the read is valid even when
   "src" is not suitably aligned for a float. */
void sample_move_floatLE_sSs (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip) {
	while (nsamples--) {
		float value;

		memcpy (&value, src, sizeof (value));
		*dst = value;
		dst++;
		src += src_skip;
	}
}
248 
/* Copy nsamples samples from the contiguous buffer "src" into the strided
   byte buffer "dst" (one float every dst_skip bytes). No scaling or byte
   reordering is applied; "state" is not used.

   Each sample is written with memcpy() so that the store is valid even when
   "dst" is not suitably aligned for a float. */
void sample_move_dS_floatLE (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state) {
	while (nsamples--) {
		const float value = *src;

		memcpy (dst, &value, sizeof (value));
		dst += dst_skip;
		src++;
	}
}
256 
257 /* NOTES on function naming:
258 
259    foo_bar_d<TYPE>_s<TYPE>
260 
261    the "d<TYPE>" component defines the destination type for the operation
262    the "s<TYPE>" component defines the source type for the operation
263 
264    TYPE can be one of:
265 
266    S      - sample is a jack_default_audio_sample_t, currently (October 2008) a 32 bit floating point value
267    Ss     - like S but reverse endian from the host CPU
   32u24  - sample is a signed 32 bit integer value, but data is in upper 24 bits only
   32u24s - like 32u24 but reverse endian from the host CPU
   24     - sample is a signed 24 bit integer value
   24s    - like 24 but reverse endian from the host CPU
   16     - sample is a signed 16 bit integer value
273    16s    - like 16 but reverse endian from the host CPU
274 
275    For obvious reasons, the reverse endian versions only show as source types.
276 
277    This covers all known sample formats at 16 bits or larger.
278 */
279 
280 /* functions for native integer sample data */
281 
/* Convert nsamples float samples from "src" to 32 bit integers carrying the
   sample in the upper 24 bits, stored in the byte order opposite to the
   host's, one every dst_skip bytes of "dst". "state" is not used. */
void sample_move_d32u24_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
	/* four samples per iteration; the remainder is left to the scalar loop */
	unsigned long quads = nsamples / 4;
	nsamples &= 3;

	for (; quads != 0; --quads) {
		const int32x4_t ints = float_24_neon(vld1q_f32(src));
		/* move the sample into the upper bytes, then reverse each lane's bytes */
		const uint8x16_t raw = vreinterpretq_u8_s32(vshlq_n_s32(ints, 8));
		const int32x4_t swapped = vreinterpretq_s32_u8(vrev32q_u8(raw));

		if (dst_skip == 4) {
			/* tightly packed destination: one vector store */
			vst1q_s32((int32_t*)dst, swapped);
		} else {
			vst1q_lane_s32((int32_t*)(dst),            swapped, 0);
			vst1q_lane_s32((int32_t*)(dst+dst_skip),   swapped, 1);
			vst1q_lane_s32((int32_t*)(dst+2*dst_skip), swapped, 2);
			vst1q_lane_s32((int32_t*)(dst+3*dst_skip), swapped, 3);
		}
		dst += 4*dst_skip;
		src += 4;
	}
#endif

	int32_t z;

	for (; nsamples != 0; --nsamples) {

		float_24u32 (*src, z);

		/* emit the four bytes in the reverse of host order */
#if __BYTE_ORDER == __LITTLE_ENDIAN
		dst[0]=(char)(z>>24);
		dst[1]=(char)(z>>16);
		dst[2]=(char)(z>>8);
		dst[3]=(char)(z);
#elif __BYTE_ORDER == __BIG_ENDIAN
		dst[0]=(char)(z);
		dst[1]=(char)(z>>8);
		dst[2]=(char)(z>>16);
		dst[3]=(char)(z>>24);
#endif
		src++;
		dst += dst_skip;
	}
}
331 
/* Convert nsamples float samples from "src" to host-endian 32 bit integers
   carrying the sample in the upper 24 bits, one every dst_skip bytes of
   "dst". "state" is not used.

   Three implementations are selected at compile time:
     - SSE2 (not on __sun__): scale, clip, convert with truncation
       (_mm_cvttps_epi32 / _mm_cvttss_si32), then move up by 8 bits;
     - NEON: four samples at a time via float_24_neon(), remainder scalar;
     - scalar: float_24u32() per sample.
   The scalar loop is compiled whenever the SSE2 block is not, so that some
   path always converts the samples. */
void sample_move_d32u24_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__SSE2__) && !defined (__sun__)
	__m128 int_max = _mm_set1_ps(SAMPLE_24BIT_MAX_F);
	__m128 int_min = _mm_sub_ps(_mm_setzero_ps(), int_max);
	__m128 factor = int_max;

	unsigned long unrolled = nsamples / 4;
	nsamples = nsamples & 3;

	while (unrolled--) {
		/* unaligned load: nothing here guarantees a 16 byte aligned src */
		__m128 in = _mm_loadu_ps(src);
		__m128 scaled = _mm_mul_ps(in, factor);
		__m128 clipped = clip(scaled, int_min, int_max);

		__m128i y = _mm_cvttps_epi32(clipped);
		__m128i shifted = _mm_slli_epi32(y, 8);

#ifdef __SSE4_1__
		*(int32_t*)dst              = _mm_extract_epi32(shifted, 0);
		*(int32_t*)(dst+dst_skip)   = _mm_extract_epi32(shifted, 1);
		*(int32_t*)(dst+2*dst_skip) = _mm_extract_epi32(shifted, 2);
		*(int32_t*)(dst+3*dst_skip) = _mm_extract_epi32(shifted, 3);
#else
		/* rotate lanes 1..3 into lane 0 so each can be stored with store_ss */
		__m128i shuffled1 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(0, 3, 2, 1));
		__m128i shuffled2 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(1, 0, 3, 2));
		__m128i shuffled3 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(2, 1, 0, 3));

		_mm_store_ss((float*)dst, (__m128)shifted);

		_mm_store_ss((float*)(dst+dst_skip), (__m128)shuffled1);
		_mm_store_ss((float*)(dst+2*dst_skip), (__m128)shuffled2);
		_mm_store_ss((float*)(dst+3*dst_skip), (__m128)shuffled3);
#endif
		dst += 4*dst_skip;

		src+= 4;
	}

	while (nsamples--) {
		__m128 in = _mm_load_ss(src);
		__m128 scaled = _mm_mul_ss(in, factor);
		__m128 clipped = _mm_min_ss(int_max, _mm_max_ss(scaled, int_min));

		int y = _mm_cvttss_si32(clipped);
		/* "* 256" instead of "<< 8": left-shifting a negative int is undefined */
		*((int *) dst) = y * 256;

		dst += dst_skip;
		src++;
	}

#elif defined (__ARM_NEON__) || defined (__ARM_NEON)
	unsigned long unrolled = nsamples / 4;
	nsamples = nsamples & 3;

	while (unrolled--) {
		float32x4_t samples = vld1q_f32(src);
		int32x4_t converted = float_24_neon(samples);
		int32x4_t shifted = vshlq_n_s32(converted, 8);

		switch(dst_skip) {
			case 4:
				vst1q_s32((int32_t*)dst, shifted);
				break;
			default:
				vst1q_lane_s32((int32_t*)(dst),            shifted, 0);
				vst1q_lane_s32((int32_t*)(dst+dst_skip),   shifted, 1);
				vst1q_lane_s32((int32_t*)(dst+2*dst_skip), shifted, 2);
				vst1q_lane_s32((int32_t*)(dst+3*dst_skip), shifted, 3);
				break;
		}
		dst += 4*dst_skip;

		src+= 4;
	}
#endif

	/* must be the exact complement of the SSE2 guard above, otherwise a
	   build with __SSE2__ and __sun__ would convert nothing at all */
#if !defined (__SSE2__) || defined (__sun__)
	while (nsamples--) {
		float_24u32 (*src, *((int32_t*) dst));
		dst += dst_skip;
		src++;
	}
#endif
}
417 
/* Convert nsamples 32 bit integers (sample in the upper 24 bits, stored in
   the byte order opposite to the host's) read every src_skip bytes from
   "src" into float samples in "dst", scaling by 1/SAMPLE_24BIT_SCALING. */
void sample_move_dS_s32u24s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
	float32x4_t factor = vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING);
	unsigned long unrolled = nsamples / 4;
	while (unrolled--) {
		/* start from a defined value: the lane-wise loads below take the
		   previous vector contents as an input */
		int32x4_t src128 = vdupq_n_s32(0);
		switch(src_skip)
		{
			case 4:
				src128 = vld1q_s32((int32_t*)src);
				break;
			case 8:
				/* de-interleaving load; keep every other 32 bit word */
				src128 = vld2q_s32((int32_t*)src).val[0];
				break;
			default:
				src128 = vld1q_lane_s32((int32_t*)src,              src128, 0);
				src128 = vld1q_lane_s32((int32_t*)(src+src_skip),   src128, 1);
				src128 = vld1q_lane_s32((int32_t*)(src+2*src_skip), src128, 2);
				src128 = vld1q_lane_s32((int32_t*)(src+3*src_skip), src128, 3);
				break;
		}
		/* reverse the bytes of every lane to get host order */
		src128 = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(src128)));
		int32x4_t shifted = vshrq_n_s32(src128, 8);
		float32x4_t as_float = vcvtq_f32_s32(shifted);
		float32x4_t divided = vmulq_f32(as_float, factor);
		vst1q_f32(dst, divided);

		src += 4*src_skip;
		dst += 4;
	}
	nsamples = nsamples & 3;
#endif

	/* ALERT: signed sign-extension portability !!! */

	const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;

	while (nsamples--) {
		/* assemble the word in an unsigned type: shifting a byte >= 0x80
		   into the top of a signed int would overflow */
		uint32_t bits;
#if __BYTE_ORDER == __LITTLE_ENDIAN
		bits  = (uint32_t)(unsigned char)(src[0]) << 24;
		bits |= (uint32_t)(unsigned char)(src[1]) << 16;
		bits |= (uint32_t)(unsigned char)(src[2]) << 8;
		bits |= (uint32_t)(unsigned char)(src[3]);
#elif __BYTE_ORDER == __BIG_ENDIAN
		bits  = (uint32_t)(unsigned char)(src[3]) << 24;
		bits |= (uint32_t)(unsigned char)(src[2]) << 16;
		bits |= (uint32_t)(unsigned char)(src[1]) << 8;
		bits |= (uint32_t)(unsigned char)(src[0]);
#endif
		/* drop the unused low byte, keeping the sign */
		*dst = ((int32_t)bits >> 8) * scaling;
		dst++;
		src += src_skip;
	}
}
480 
/* Convert nsamples host-endian 32 bit integers (sample in the upper 24
   bits) read every src_skip bytes from "src" into float samples in "dst",
   scaling by 1/SAMPLE_24BIT_SCALING.

   SSE2 and NEON builds handle groups of four samples first; whatever
   remains is converted by the scalar loop at the end. */
void sample_move_dS_s32u24 (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
#if defined (__SSE2__) && !defined (__sun__)
	unsigned long unrolled = nsamples / 4;
	const float inv_sample_max_24bit = 1.0 / SAMPLE_24BIT_SCALING;
	__m128 factor = _mm_set1_ps(inv_sample_max_24bit);
	while (unrolled--)
	{
		int i1 = *((int *) src);
		src+= src_skip;
		int i2 = *((int *) src);
		src+= src_skip;
		int i3 = *((int *) src);
		src+= src_skip;
		int i4 = *((int *) src);
		src+= src_skip;

		/* named "block" so that it does not shadow the src parameter */
		__m128i block = _mm_set_epi32(i4, i3, i2, i1);
		/* arithmetic shift drops the unused low byte and keeps the sign */
		__m128i shifted = _mm_srai_epi32(block, 8);

		__m128 as_float = _mm_cvtepi32_ps(shifted);
		__m128 divided = _mm_mul_ps(as_float, factor);

		_mm_storeu_ps(dst, divided);

		dst += 4;
	}
	nsamples = nsamples & 3;
#elif defined (__ARM_NEON__) || defined (__ARM_NEON)
	unsigned long unrolled = nsamples / 4;
	float32x4_t factor = vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING);
	while (unrolled--) {
		/* start from a defined value: the lane-wise loads below take the
		   previous vector contents as an input */
		int32x4_t src128 = vdupq_n_s32(0);
		switch(src_skip) {
			case 4:
				src128 = vld1q_s32((int32_t*)src);
				break;
			case 8:
				/* de-interleaving load; keep every other 32 bit word */
				src128 = vld2q_s32((int32_t*)src).val[0];
				break;
			default:
				src128 = vld1q_lane_s32((int32_t*)src,              src128, 0);
				src128 = vld1q_lane_s32((int32_t*)(src+src_skip),   src128, 1);
				src128 = vld1q_lane_s32((int32_t*)(src+2*src_skip), src128, 2);
				src128 = vld1q_lane_s32((int32_t*)(src+3*src_skip), src128, 3);
				break;
		}
		int32x4_t shifted = vshrq_n_s32(src128, 8);
		float32x4_t as_float = vcvtq_f32_s32(shifted);
		float32x4_t divided = vmulq_f32(as_float, factor);
		vst1q_f32(dst, divided);

		src += 4*src_skip;
		dst += 4;
	}
	nsamples = nsamples & 3;
#endif

	/* ALERT: signed sign-extension portability !!! */

	const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;
	while (nsamples--) {
		*dst = (*((int *) src) >> 8) * scaling;
		dst++;
		src += src_skip;
	}
}
548 
/* Convert nsamples float samples from "src" to packed 3 byte integers in
   the byte order opposite to the host's, one every dst_skip bytes of
   "dst". "state" is not used. */
void sample_move_d24_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
	unsigned long quads = nsamples / 4;

	for (; quads != 0; --quads) {
		int32_t lanes[4];
		int k;
		int32x4_t ints = float_24_neon(vld1q_f32(src));

		/* reverse the bytes of each lane, then spill to memory */
		ints = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(ints)));
		vst1q_s32(lanes, ints);

		/* copy bytes 1..3 of every reversed word */
		for (k = 0; k < 4; k++) {
			memcpy (dst, (char*)&lanes[k] + 1, 3);
			dst += dst_skip;
		}
		src += 4;
	}
	nsamples &= 3;
#endif

	int32_t z;

	for (; nsamples != 0; --nsamples) {
		float_24 (*src, z);
		/* emit the three low bytes in the reverse of host order */
#if __BYTE_ORDER == __LITTLE_ENDIAN
		dst[0]=(char)(z>>16);
		dst[1]=(char)(z>>8);
		dst[2]=(char)(z);
#elif __BYTE_ORDER == __BIG_ENDIAN
		dst[0]=(char)(z);
		dst[1]=(char)(z>>8);
		dst[2]=(char)(z>>16);
#endif
		src++;
		dst += dst_skip;
	}
}
587 
/* Convert nsamples float samples from "src" to packed 3 byte host-endian
   integers, one every dst_skip bytes of "dst". "state" is not used.

   SSE2 and NEON builds convert groups of four samples first; the scalar
   loop at the end converts the remainder with float_24(). */
void sample_move_d24_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__SSE2__) && !defined (__sun__)
	/* float_24_sse() converts using the current rounding mode */
	_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);

	for (; nsamples >= 4; nsamples -= 4, src += 4) {
		int32_t lanes[4];
		int k;
		const __m128i ints = float_24_sse(_mm_loadu_ps(src));

		/* spill all four lanes at once; lanes[k] receives lane k */
		_mm_storeu_si128((__m128i *)lanes, ints);

		for (k = 0; k < 4; k++) {
			memcpy (dst, &lanes[k], 3);
			dst += dst_skip;
		}
	}
#elif defined (__ARM_NEON__) || defined (__ARM_NEON)
	unsigned long quads = nsamples / 4;

	for (; quads != 0; --quads) {
		int32_t lanes[4];
		int k;

		vst1q_s32(lanes, float_24_neon(vld1q_f32(src)));

		for (k = 0; k < 4; k++) {
			memcpy (dst, &lanes[k], 3);
			dst += dst_skip;
		}
		src += 4;
	}
	nsamples &= 3;
#endif

	int32_t z;

	for (; nsamples != 0; --nsamples) {
		float_24 (*src, z);
		/* copy the three least significant bytes of z */
#if __BYTE_ORDER == __LITTLE_ENDIAN
		memcpy (dst, &z, 3);
#elif __BYTE_ORDER == __BIG_ENDIAN
		memcpy (dst, (char *)&z + 1, 3);
#endif
		src++;
		dst += dst_skip;
	}
}
653 
/* Convert nsamples packed 3 byte integers, stored in the byte order
   opposite to the host's and read every src_skip bytes from "src", into
   float samples in "dst", scaling by 1/SAMPLE_24BIT_SCALING. */
void sample_move_dS_s24s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
	const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;

#if defined (__ARM_NEON__) || defined (__ARM_NEON)
	// we shift 8 to the right by dividing by 256.0 -> no sign extra handling
	const float32x4_t vscaling = vdupq_n_f32(scaling/256.0);
	int32_t x[4];
	/* zeroed once: the byte of each word that memcpy never touches stays 0 */
	memset(x, 0, sizeof(x));
	unsigned long unrolled = nsamples / 4;
	while (unrolled--) {
#if __BYTE_ORDER == __BIG_ENDIAN	 /* ARM big endian?? */
		// right aligned / inverse sequence below -> *256
		memcpy(((char*)&x[0])+1, src, 3);
		memcpy(((char*)&x[1])+1, src+src_skip, 3);
		memcpy(((char*)&x[2])+1, src+2*src_skip, 3);
		memcpy(((char*)&x[3])+1, src+3*src_skip, 3);
#else
		memcpy(&x[0], src, 3);
		memcpy(&x[1], src+src_skip, 3);
		memcpy(&x[2], src+2*src_skip, 3);
		memcpy(&x[3], src+3*src_skip, 3);
#endif
		src += 4 * src_skip;

		int32x4_t source = vld1q_s32(x);
		/* reverse the bytes of every lane to get host order */
		source = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(source)));
		float32x4_t converted = vcvtq_f32_s32(source);
		float32x4_t scaled = vmulq_f32(converted, vscaling);
		vst1q_f32(dst, scaled);
		dst += 4;
	}
	nsamples = nsamples & 3;
#endif

	/* ALERT: signed sign-extension portability !!! */

	while (nsamples--) {
		/* assembled in an unsigned type: 0xff << 24 does not fit in a
		   signed int */
		uint32_t bits;
#if __BYTE_ORDER == __LITTLE_ENDIAN
		bits  = (uint32_t)(unsigned char)(src[0]) << 16;
		bits |= (uint32_t)(unsigned char)(src[1]) << 8;
		bits |= (uint32_t)(unsigned char)(src[2]);
		/* correct sign bit and the rest of the top byte */
		if (src[0] & 0x80) {
			bits |= 0xff000000u;
		}
#elif __BYTE_ORDER == __BIG_ENDIAN
		bits  = (uint32_t)(unsigned char)(src[2]) << 16;
		bits |= (uint32_t)(unsigned char)(src[1]) << 8;
		bits |= (uint32_t)(unsigned char)(src[0]);
		/* correct sign bit and the rest of the top byte */
		if (src[2] & 0x80) {
			bits |= 0xff000000u;
		}
#endif
		*dst = (int32_t)bits * scaling;
		dst++;
		src += src_skip;
	}
}
719 
/* Convert nsamples packed 3 byte host-endian integers, read every src_skip
   bytes from "src", into float samples in "dst", scaling by
   1/SAMPLE_24BIT_SCALING.

   Each sample is copied into the three most significant bytes of a 32 bit
   word so that the sign bit lands in the right place; the extra factor of
   256 is then removed by a shift (SSE2, scalar) or by the scale factor
   (NEON). */
void sample_move_dS_s24 (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
	const jack_default_audio_sample_t scaling = 1.f/SAMPLE_24BIT_SCALING;

#if defined (__SSE2__) && !defined (__sun__)
	const __m128 scaling_block = _mm_set_ps1(scaling);
	while (nsamples >= 4) {
		/* zero-initialised so the byte memcpy leaves alone is defined */
		int x0 = 0, x1 = 0, x2 = 0, x3 = 0;

		memcpy((char*)&x0 + 1, src, 3);
		memcpy((char*)&x1 + 1, src+src_skip, 3);
		memcpy((char*)&x2 + 1, src+2*src_skip, 3);
		memcpy((char*)&x3 + 1, src+3*src_skip, 3);
		src += 4 * src_skip;

		const __m128i block_i = _mm_set_epi32(x3, x2, x1, x0);
		/* arithmetic shift: drops the filler byte and keeps the sign */
		const __m128i shifted = _mm_srai_epi32(block_i, 8);
		const __m128 converted = _mm_cvtepi32_ps (shifted);
		const __m128 scaled = _mm_mul_ps(converted, scaling_block);
		_mm_storeu_ps(dst, scaled);
		dst += 4;
		nsamples -= 4;
	}
#elif defined (__ARM_NEON__) || defined (__ARM_NEON)
	// we shift 8 to the right by dividing by 256.0 -> no sign extra handling
	const float32x4_t vscaling = vdupq_n_f32(scaling/256.0);
	int32_t x[4];
	memset(x, 0, sizeof(x));
	unsigned long unrolled = nsamples / 4;
	while (unrolled--) {
#if __BYTE_ORDER == __BIG_ENDIAN	/* ARM big endian?? */
		// left aligned -> *256
		memcpy(&x[0], src, 3);
		memcpy(&x[1], src+src_skip, 3);
		memcpy(&x[2], src+2*src_skip, 3);
		memcpy(&x[3], src+3*src_skip, 3);
#else
		memcpy(((char*)&x[0])+1, src, 3);
		memcpy(((char*)&x[1])+1, src+src_skip, 3);
		memcpy(((char*)&x[2])+1, src+2*src_skip, 3);
		memcpy(((char*)&x[3])+1, src+3*src_skip, 3);
#endif
		src += 4 * src_skip;

		int32x4_t source = vld1q_s32(x);
		float32x4_t converted = vcvtq_f32_s32(source);
		float32x4_t scaled = vmulq_f32(converted, vscaling);
		vst1q_f32(dst, scaled);
		dst += 4;
	}
	nsamples = nsamples & 3;
#endif

	while (nsamples--) {
		/* zero-initialised so the byte memcpy leaves alone is defined */
		int x = 0;
#if __BYTE_ORDER == __LITTLE_ENDIAN
		memcpy((char*)&x + 1, src, 3);
#elif __BYTE_ORDER == __BIG_ENDIAN
		memcpy(&x, src, 3);
#endif
		x >>= 8;
		*dst = x * scaling;
		dst++;
		src += src_skip;
	}
}
786 
787 
/* Convert nsamples float samples from "src" to 16 bit integers stored in
   the byte order opposite to the host's, one every dst_skip bytes of
   "dst". "state" is not used. */
void sample_move_d16_sSs (char *dst,  jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
	/* four samples per iteration; the remainder is left to the scalar loop */
	unsigned long quads = nsamples / 4;
	nsamples &= 3;

	for (; quads != 0; --quads) {
		int16x4_t shorts = float_16_neon(vld1q_f32(src));

		/* swap the two bytes of every 16 bit lane */
		shorts = vreinterpret_s16_u8(vrev16_u8(vreinterpret_u8_s16(shorts)));

		if (dst_skip == 2) {
			/* tightly packed destination: one vector store */
			vst1_s16((int16_t*)dst, shorts);
		} else {
			vst1_lane_s16((int16_t*)(dst),            shorts, 0);
			vst1_lane_s16((int16_t*)(dst+dst_skip),   shorts, 1);
			vst1_lane_s16((int16_t*)(dst+2*dst_skip), shorts, 2);
			vst1_lane_s16((int16_t*)(dst+3*dst_skip), shorts, 3);
		}
		dst += 4*dst_skip;
		src += 4;
	}
#endif
	int16_t tmp;

	for (; nsamples != 0; --nsamples) {
		/* same steps as float_16(), written out with an explicit cast */
		if (*src <= NORMALIZED_FLOAT_MIN) {
			tmp = SAMPLE_16BIT_MIN;
		} else if (*src >= NORMALIZED_FLOAT_MAX) {
			tmp = SAMPLE_16BIT_MAX;
		} else {
			tmp = (int16_t) f_round (*src * SAMPLE_16BIT_SCALING);
		}

		/* emit the two bytes in the reverse of host order */
#if __BYTE_ORDER == __LITTLE_ENDIAN
		dst[0]=(char)(tmp>>8);
		dst[1]=(char)(tmp);
#elif __BYTE_ORDER == __BIG_ENDIAN
		dst[0]=(char)(tmp);
		dst[1]=(char)(tmp>>8);
#endif
		src++;
		dst += dst_skip;
	}
}
838 
/* Convert nsamples float samples from "src" to host-endian 16 bit
   integers, one every dst_skip bytes of "dst". "state" is not used. */
void sample_move_d16_sS (char *dst,  jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
	/* four samples per iteration; the remainder is left to the scalar loop */
	unsigned long quads = nsamples / 4;
	nsamples &= 3;

	for (; quads != 0; --quads) {
		const int16x4_t shorts = float_16_neon(vld1q_f32(src));

		if (dst_skip == 2) {
			/* tightly packed destination: one vector store */
			vst1_s16((int16_t*)dst, shorts);
		} else {
			vst1_lane_s16((int16_t*)(dst),            shorts, 0);
			vst1_lane_s16((int16_t*)(dst+dst_skip),   shorts, 1);
			vst1_lane_s16((int16_t*)(dst+2*dst_skip), shorts, 2);
			vst1_lane_s16((int16_t*)(dst+3*dst_skip), shorts, 3);
		}
		dst += 4*dst_skip;
		src += 4;
	}
#endif
	for (; nsamples != 0; --nsamples) {
		float_16 (*src, *((int16_t*) dst));
		src++;
		dst += dst_skip;
	}
}
870 
/* Convert nsamples float samples to 16 bit integers in the byte order
   opposite to the host's, adding rectangular dither: one fast_rand()
   value mapped to [-0.5, +0.5] is added to each scaled sample before it is
   clipped and rounded. "state" is not used. */
void sample_move_dither_rect_d16_sSs (char *dst,  jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
	jack_default_audio_sample_t dithered;
	int16_t      out;

	for (; nsamples != 0; --nsamples) {
		dithered = (*src * SAMPLE_16BIT_SCALING) + fast_rand() / (float) UINT_MAX - 0.5f;
		float_16_scaled (dithered, out);
		/* emit the two bytes in the reverse of host order */
#if __BYTE_ORDER == __LITTLE_ENDIAN
		dst[0]=(char)(out>>8);
		dst[1]=(char)(out);
#elif __BYTE_ORDER == __BIG_ENDIAN
		dst[0]=(char)(out);
		dst[1]=(char)(out>>8);
#endif
		src++;
		dst += dst_skip;
	}
}
890 
/* Convert nsamples float samples to host-endian 16 bit integers, adding
   rectangular dither: one fast_rand() value mapped to [-0.5, +0.5] is
   added to each scaled sample before it is clipped and rounded.
   "state" is not used. */
void sample_move_dither_rect_d16_sS (char *dst,  jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
	jack_default_audio_sample_t dithered;

	for (; nsamples != 0; --nsamples) {
		dithered = (*src * SAMPLE_16BIT_SCALING) + fast_rand() / (float)UINT_MAX - 0.5f;
		float_16_scaled (dithered, *((int16_t*) dst));
		src++;
		dst += dst_skip;
	}
}
902 
/* Convert nsamples float samples to 16 bit integers in the byte order
   opposite to the host's, adding triangular dither: the sum of two
   fast_rand() values mapped to [-1.0, +1.0] is added to each scaled sample
   before it is clipped and rounded. "state" is not used. */
void sample_move_dither_tri_d16_sSs (char *dst,  jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
	jack_default_audio_sample_t dithered;
	int16_t      out;

	for (; nsamples != 0; --nsamples) {
		dithered = (*src * SAMPLE_16BIT_SCALING) + ((float)fast_rand() + (float)fast_rand()) / (float)UINT_MAX - 1.0f;
		float_16_scaled (dithered, out);

		/* emit the two bytes in the reverse of host order */
#if __BYTE_ORDER == __LITTLE_ENDIAN
		dst[0]=(char)(out>>8);
		dst[1]=(char)(out);
#elif __BYTE_ORDER == __BIG_ENDIAN
		dst[0]=(char)(out);
		dst[1]=(char)(out>>8);
#endif
		src++;
		dst += dst_skip;
	}
}
923 
/* Convert nsamples float samples to host-endian 16 bit integers, adding
   triangular dither: the sum of two fast_rand() values mapped to
   [-1.0, +1.0] is added to each scaled sample before it is clipped and
   rounded. "state" is not used. */
void sample_move_dither_tri_d16_sS (char *dst,  jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
	jack_default_audio_sample_t dithered;

	for (; nsamples != 0; --nsamples) {
		dithered = (*src * SAMPLE_16BIT_SCALING) + ((float)fast_rand() + (float)fast_rand()) / (float)UINT_MAX - 1.0f;
		float_16_scaled (dithered, *((int16_t*) dst));
		src++;
		dst += dst_skip;
	}
}
935 
/* Convert float samples to 16-bit integers with noise-shaped triangular
   dither and store them with the two bytes in the opposite order to the
   host's.

   dst      - first output sample; two bytes are stored every dst_skip bytes
   src      - contiguous input samples
   nsamples - number of samples to convert
   dst_skip - byte distance between consecutive output samples
   state    - dither state carried across calls: the previous random value
              (rm1), the error ring buffer (e) and its write index (idx)

   The error fed back into the filter is the difference between the
   quantised output and the filtered input, matching the non-swapping
   variant of this function. */
void sample_move_dither_shaped_d16_sSs (char *dst,  jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
	jack_default_audio_sample_t     x;
	jack_default_audio_sample_t     xe; /* the input sample - filtered error */
	jack_default_audio_sample_t     xp; /* x' */
	float        r;
	float        rm1 = state->rm1;
	unsigned int idx = state->idx;
	int16_t      tmp;

	while (nsamples--) {
		x = *src * SAMPLE_16BIT_SCALING;
		r = ((float)fast_rand() + (float)fast_rand())  / (float)UINT_MAX - 1.0f;
		/* Filter the error with Lipshitz's minimally audible FIR:
		   [2.033 -2.165 1.959 -1.590 0.6149] */
		xe = x
		     - state->e[idx] * 2.033f
		     + state->e[(idx - 1) & DITHER_BUF_MASK] * 2.165f
		     - state->e[(idx - 2) & DITHER_BUF_MASK] * 1.959f
		     + state->e[(idx - 3) & DITHER_BUF_MASK] * 1.590f
		     - state->e[(idx - 4) & DITHER_BUF_MASK] * 0.6149f;
		xp = xe + r - rm1;
		rm1 = r;

		float_16_scaled (xp, tmp);

		/* Intrinsic z^-1 delay */
		idx = (idx + 1) & DITHER_BUF_MASK;
		/* Record the error of the value actually emitted (tmp), not of
		   the pre-quantisation value xp: xp - xe would only be the
		   dither difference r - rm1 and would carry no quantisation
		   error for the filter to shape. */
		state->e[idx] = tmp - xe;

		/* Emit the bytes in the order the host would not use natively. */
#if __BYTE_ORDER == __LITTLE_ENDIAN
		dst[0]=(char)(tmp>>8);
		dst[1]=(char)(tmp);
#elif __BYTE_ORDER == __BIG_ENDIAN
		dst[0]=(char)(tmp);
		dst[1]=(char)(tmp>>8);
#endif
		dst += dst_skip;
		src++;
	}
	state->rm1 = rm1;
	state->idx = idx;
}
979 
/* Convert float samples to 16-bit integers in host byte order with
   noise-shaped triangular dither.

   dst      - first output sample; one int16_t is stored every dst_skip bytes
   src      - contiguous input samples
   nsamples - number of samples to convert
   dst_skip - byte distance between consecutive output samples
   state    - dither state carried across calls: the previous random value
              (rm1), the error ring buffer (e) and its write index (idx) */
void sample_move_dither_shaped_d16_sS (char *dst,  jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{
	float         prev_noise = state->rm1;
	unsigned int  pos = state->idx;
	unsigned long remaining;

	for (remaining = nsamples; remaining != 0; remaining--) {
		int16_t *out = (int16_t *) dst;
		jack_default_audio_sample_t scaled;   /* input scaled to 16-bit range */
		jack_default_audio_sample_t filtered; /* scaled input minus filtered error */
		jack_default_audio_sample_t dithered; /* value handed to the quantiser */
		float noise;

		scaled = *src * SAMPLE_16BIT_SCALING;
		noise = ((float)fast_rand() + (float)fast_rand()) / (float)UINT_MAX - 1.0f;

		/* Error filter taps are Lipshitz's minimally audible FIR:
		   [2.033 -2.165 1.959 -1.590 0.6149] */
		filtered = scaled
		     - state->e[pos] * 2.033f
		     + state->e[(pos - 1) & DITHER_BUF_MASK] * 2.165f
		     - state->e[(pos - 2) & DITHER_BUF_MASK] * 1.959f
		     + state->e[(pos - 3) & DITHER_BUF_MASK] * 1.590f
		     - state->e[(pos - 4) & DITHER_BUF_MASK] * 0.6149f;
		dithered = filtered + noise - prev_noise;
		prev_noise = noise;

		float_16_scaled (dithered, *out);

		/* Advance the ring buffer (the intrinsic z^-1 delay) and record
		   the error of the value just written. */
		pos = (pos + 1) & DITHER_BUF_MASK;
		state->e[pos] = *out - filtered;

		src++;
		dst += dst_skip;
	}

	state->rm1 = prev_noise;
	state->idx = pos;
}
1015 
/* Convert 16-bit integer samples stored in the opposite byte order to the
   host's into float samples.

   dst      - contiguous output samples
   src      - first input sample; consecutive samples are src_skip bytes apart
   nsamples - number of samples to convert
   src_skip - byte distance between consecutive input samples

   Every sample is multiplied by 1.0/SAMPLE_16BIT_SCALING. */
void sample_move_dS_s16s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
	short z;
	const jack_default_audio_sample_t scaling = 1.0/SAMPLE_16BIT_SCALING;
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
	/* Vector path: four samples per iteration. The remaining 0..3
	   samples are handled by the scalar loop below. */
	const float32x4_t vscaling = vdupq_n_f32(scaling);
	unsigned long unrolled = nsamples / 4;
	while (unrolled--) {
		int16x4_t source16x4;
		switch(src_skip) {
			case 2:
				/* samples are contiguous */
				source16x4 = vld1_s16((int16_t*)src);
				break;
			case 4:
				/* de-interleaving load; keep every other 16-bit value */
				source16x4 = vld2_s16((int16_t*)src).val[0];
				break;
			default:
				/* Arbitrary stride: gather lane by lane. Lane 0 uses a
				   duplicating load so the vector is fully defined before
				   the per-lane loads read it as their input operand. */
				source16x4 = vld1_dup_s16((int16_t*)src);
				source16x4 = vld1_lane_s16((int16_t*)(src+src_skip),   source16x4, 1);
				source16x4 = vld1_lane_s16((int16_t*)(src+2*src_skip), source16x4, 2);
				source16x4 = vld1_lane_s16((int16_t*)(src+3*src_skip), source16x4, 3);
				break;
		}
		/* swap the two bytes inside every 16-bit lane */
		source16x4 = vreinterpret_s16_u8(vrev16_u8(vreinterpret_u8_s16(source16x4)));
		int32x4_t source32x4 = vmovl_s16(source16x4);
		src += 4 * src_skip;

		float32x4_t converted = vcvtq_f32_s32(source32x4);
		float32x4_t scaled = vmulq_f32(converted, vscaling);
		vst1q_f32(dst, scaled);
		dst += 4;
	}
	nsamples = nsamples & 3;
#endif

	/* ALERT: signed sign-extension portability !!! */
	while (nsamples--) {
		/* Assemble the sample from its two bytes in the order opposite
		   to the host's native layout. */
#if __BYTE_ORDER == __LITTLE_ENDIAN
		z = (unsigned char)(src[0]);
		z <<= 8;
		z |= (unsigned char)(src[1]);
#elif __BYTE_ORDER == __BIG_ENDIAN
		z = (unsigned char)(src[1]);
		z <<= 8;
		z |= (unsigned char)(src[0]);
#endif
		*dst = z * scaling;
		dst++;
		src += src_skip;
	}
}
1067 
/* Convert 16-bit integer samples in host byte order into float samples.

   dst      - contiguous output samples
   src      - first input sample; consecutive samples are src_skip bytes apart
   nsamples - number of samples to convert
   src_skip - byte distance between consecutive input samples

   Every sample is multiplied by 1.0/SAMPLE_16BIT_SCALING. */
void sample_move_dS_s16 (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{
	/* ALERT: signed sign-extension portability !!! */
	const jack_default_audio_sample_t scaling = 1.0/SAMPLE_16BIT_SCALING;
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
	/* Vector path: four samples per iteration. The remaining 0..3
	   samples are handled by the scalar loop below. */
	const float32x4_t vscaling = vdupq_n_f32(scaling);
	unsigned long unrolled = nsamples / 4;
	while (unrolled--) {
		int16x4_t source16x4;
		switch(src_skip) {
			case 2:
				/* samples are contiguous */
				source16x4 = vld1_s16((int16_t*)src);
				break;
			case 4:
				/* de-interleaving load; keep every other 16-bit value */
				source16x4 = vld2_s16((int16_t*)src).val[0];
				break;
			default:
				/* Arbitrary stride: gather lane by lane. Lane 0 uses a
				   duplicating load so the vector is fully defined before
				   the per-lane loads read it as their input operand. */
				source16x4 = vld1_dup_s16((int16_t*)src);
				source16x4 = vld1_lane_s16((int16_t*)(src+src_skip),   source16x4, 1);
				source16x4 = vld1_lane_s16((int16_t*)(src+2*src_skip), source16x4, 2);
				source16x4 = vld1_lane_s16((int16_t*)(src+3*src_skip), source16x4, 3);
				break;
		}
		int32x4_t source32x4 = vmovl_s16(source16x4);
		src += 4 * src_skip;

		float32x4_t converted = vcvtq_f32_s32(source32x4);
		float32x4_t scaled = vmulq_f32(converted, vscaling);
		vst1q_f32(dst, scaled);
		dst += 4;
	}
	nsamples = nsamples & 3;
#endif

	while (nsamples--) {
		*dst = (*((short *) src)) * scaling;
		dst++;
		src += src_skip;
	}
}
1108 
/* Fill evenly spaced units of a buffer with a byte value.

   dst        - start of the first unit to fill
   val        - byte value written to every byte of each unit
   bytes      - total number of bytes to fill (sum over all units)
   unit_bytes - size of one unit in bytes
   skip_bytes - distance in bytes from the start of one unit to the next

   Every unit size fills all of its bytes with val, exactly as memset()
   does. A trailing remainder smaller than unit_bytes is left untouched,
   and unit_bytes == 0 is a no-op, so the function always terminates and
   never writes more than `bytes` bytes. */
void memset_interleave (char *dst, char val, unsigned long bytes,
			unsigned long unit_bytes,
			unsigned long skip_bytes)
{
	switch (unit_bytes) {
	case 0:
		/* No progress is possible with an empty unit. */
		break;
	case 1:
		while (bytes--) {
			*dst = val;
			dst += skip_bytes;
		}
		break;
	case 2:
		/* Constant-size memset avoids misaligned typed stores. */
		while (bytes >= 2) {
			memset (dst, val, 2);
			dst += skip_bytes;
			bytes -= 2;
		}
		break;
	case 4:
		while (bytes >= 4) {
			memset (dst, val, 4);
			dst += skip_bytes;
			bytes -= 4;
		}
		break;
	default:
		while (bytes >= unit_bytes) {
			memset (dst, val, unit_bytes);
			dst += skip_bytes;
			bytes -= unit_bytes;
		}
		break;
	}
}
1143 
1144 /* COPY FUNCTIONS: used to move data from an input channel to an
1145    output channel. Note that we assume that the skip distance
1146    is the same for both channels. This is completely fine
1147    unless the input and output were on different audio interfaces that
1148    were interleaved differently. We don't try to handle that.
1149 */
1150 
/* Plain contiguous copy exposed with the same five-argument signature as
   the interleaving copy functions. The last two arguments are accepted
   and ignored. */
void
memcpy_fake (char *dst, char *src, unsigned long src_bytes, unsigned long foo, unsigned long bar)
{
	(void) foo;
	(void) bar;

	memcpy (dst, src, src_bytes);
}
1156 
/* Copy 2-byte samples between two strided buffers.

   dst            - first destination sample
   src            - first source sample
   src_bytes      - total number of sample bytes to copy (2 per sample)
   dst_skip_bytes - distance in bytes between consecutive destination samples
   src_skip_bytes - distance in bytes between consecutive source samples

   A trailing odd byte is not copied: the loop stops once fewer than two
   bytes remain, so the unsigned counter can never wrap around. */
void
memcpy_interleave_d16_s16 (char *dst, char *src, unsigned long src_bytes,
			   unsigned long dst_skip_bytes, unsigned long src_skip_bytes)
{
	while (src_bytes >= 2) {
		/* Byte-wise move: no alignment requirement on either pointer,
		   and safe if dst and src refer to the same sample. */
		memmove (dst, src, 2);
		dst += dst_skip_bytes;
		src += src_skip_bytes;
		src_bytes -= 2;
	}
}
1168 
/* Copy 3-byte samples between two strided buffers.

   dst            - first destination sample
   src            - first source sample
   src_bytes      - total number of sample bytes to copy (3 per sample)
   dst_skip_bytes - distance in bytes between consecutive destination samples
   src_skip_bytes - distance in bytes between consecutive source samples

   A trailing remainder of one or two bytes is not copied: the loop stops
   once fewer than three bytes remain, so the unsigned counter can never
   wrap around. */
void
memcpy_interleave_d24_s24 (char *dst, char *src, unsigned long src_bytes,
			   unsigned long dst_skip_bytes, unsigned long src_skip_bytes)
{
	while (src_bytes >= 3) {
		memcpy(dst, src, 3);
		dst += dst_skip_bytes;
		src += src_skip_bytes;
		src_bytes -= 3;
	}
}
1180 
/* Copy 4-byte samples between two strided buffers.

   dst            - first destination sample
   src            - first source sample
   src_bytes      - total number of sample bytes to copy (4 per sample)
   dst_skip_bytes - distance in bytes between consecutive destination samples
   src_skip_bytes - distance in bytes between consecutive source samples

   A trailing remainder of fewer than four bytes is not copied: the loop
   stops once less than one whole sample remains, so the unsigned counter
   can never wrap around. */
void
memcpy_interleave_d32_s32 (char *dst, char *src, unsigned long src_bytes,
			   unsigned long dst_skip_bytes, unsigned long src_skip_bytes)
{
	while (src_bytes >= 4) {
		/* Byte-wise move: no alignment requirement on either pointer,
		   and safe if dst and src refer to the same sample. */
		memmove (dst, src, 4);
		dst += dst_skip_bytes;
		src += src_skip_bytes;
		src_bytes -= 4;
	}
}
1192 
1193