1 /*
2 * SpanDSP - a series of DSP components for telephony
3 *
4 * vector_float.c - Floating vector arithmetic routines.
5 *
6 * Written by Steve Underwood <steveu@coppice.org>
7 *
8 * Copyright (C) 2006 Steve Underwood
9 *
10 * All rights reserved.
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU Lesser General Public License version 2.1,
14 * as published by the Free Software Foundation.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU Lesser General Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 */
25
26 /*! \file */
27
28 #if defined(HAVE_CONFIG_H)
29 #include "config.h"
30 #endif
31
32 #include <inttypes.h>
33 #include <stdlib.h>
34 #include <stdio.h>
35 #include <string.h>
36 #if defined(HAVE_TGMATH_H)
37 #include <tgmath.h>
38 #endif
39 #if defined(HAVE_MATH_H)
40 #include <math.h>
41 #endif
42 #include <assert.h>
43
44 #include "floating_fudge.h"
45 #include "mmx_sse_decs.h"
46
47 #include "spandsp/telephony.h"
48 #include "spandsp/vector_float.h"
49
#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2)
/*! \brief Copy the first n floats of x into z - SSE2 version.
    \param z The destination vector.
    \param x The source vector.
    \param n The number of elements to copy. */
SPAN_DECLARE(void) vec_copyf(float z[], const float x[], int n)
{
    int i;
    __m128 n1;

    /* Copy 4 floats at a time, stepping downwards from the last whole group
       of 4. Unaligned load/store is used, as the caller's buffers carry no
       alignment guarantee. */
    if ((i = n & ~3))
    {
        for (i -= 4; i >= 0; i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            _mm_storeu_ps(z + i, n1);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = x[n - 3];
        /* Fall through */
    case 2:
        z[n - 2] = x[n - 2];
        /* Fall through */
    case 1:
        z[n - 1] = x[n - 1];
    }
}
#else
/*! \brief Copy the first n floats of x into z - portable version. */
SPAN_DECLARE(void) vec_copyf(float z[], const float x[], int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i];
}
#endif
/*- End of function --------------------------------------------------------*/
85
/*! \brief Copy the first n doubles of x into z. */
SPAN_DECLARE(void) vec_copy(double z[], const double x[], int n)
{
    double *dst;
    const double *src;

    for (dst = z, src = x; n > 0; n--)
        *dst++ = *src++;
}
/*- End of function --------------------------------------------------------*/
94
#if defined(HAVE_LONG_DOUBLE)
/*! \brief Copy the first n long doubles of x into z. */
SPAN_DECLARE(void) vec_copyl(long double z[], const long double x[], int n)
{
    long double *dst;
    const long double *src;

    for (dst = z, src = x; n > 0; n--)
        *dst++ = *src++;
}
/*- End of function --------------------------------------------------------*/
#endif
105
#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2)
/*! \brief Negate the first n floats of x into z - SSE2 version.
    \param z The destination vector.
    \param x The source vector.
    \param n The number of elements to negate. */
SPAN_DECLARE(void) vec_negatef(float z[], const float x[], int n)
{
    int i;
    __m128 n1;
    __m128 n2;

    if ((i = n & ~3))
    {
        /* -0.0f carries only the sign bit (0x80000000), so XORing with it
           flips the sign of every element. This replaces the old code which
           read a uint32_t mask through a float pointer - a strict aliasing
           violation (undefined behaviour in C). */
        n2 = _mm_set1_ps(-0.0f);
        for (i -= 4; i >= 0; i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n1 = _mm_xor_ps(n1, n2);
            _mm_storeu_ps(z + i, n1);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = -x[n - 3];
        /* Fall through */
    case 2:
        z[n - 2] = -x[n - 2];
        /* Fall through */
    case 1:
        z[n - 1] = -x[n - 1];
    }
}
#else
/*! \brief Negate the first n floats of x into z - portable version. */
SPAN_DECLARE(void) vec_negatef(float z[], const float x[], int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = -x[i];
}
#endif
/*- End of function --------------------------------------------------------*/
146
/*! \brief Negate the first n doubles of x into z. */
SPAN_DECLARE(void) vec_negate(double z[], const double x[], int n)
{
    double *dst;
    const double *src;

    for (dst = z, src = x; n > 0; n--)
        *dst++ = -(*src++);
}
/*- End of function --------------------------------------------------------*/
155
#if defined(HAVE_LONG_DOUBLE)
/*! \brief Negate the first n long doubles of x into z. */
SPAN_DECLARE(void) vec_negatel(long double z[], const long double x[], int n)
{
    long double *dst;
    const long double *src;

    for (dst = z, src = x; n > 0; n--)
        *dst++ = -(*src++);
}
/*- End of function --------------------------------------------------------*/
#endif
166
#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2)
/*! \brief Zero the first n floats of z - SSE2 version.
    \param z The vector to be zeroed.
    \param n The number of elements to zero. */
SPAN_DECLARE(void) vec_zerof(float z[], int n)
{
    int i;
    __m128 n1;

    /* Store a zeroed register 4 floats at a time, stepping downwards. */
    if ((i = n & ~3))
    {
        n1 = _mm_setzero_ps();
        for (i -= 4; i >= 0; i -= 4)
            _mm_storeu_ps(z + i, n1);
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = 0;
        /* Fall through */
    case 2:
        z[n - 2] = 0;
        /* Fall through */
    case 1:
        z[n - 1] = 0;
    }
}
#else
/*! \brief Zero the first n floats of z - portable version. */
SPAN_DECLARE(void) vec_zerof(float z[], int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = 0.0f;
}
#endif
/*- End of function --------------------------------------------------------*/
200
/*! \brief Zero the first n doubles of z. */
SPAN_DECLARE(void) vec_zero(double z[], int n)
{
    double *p;

    for (p = z; n > 0; n--)
        *p++ = 0.0;
}
/*- End of function --------------------------------------------------------*/
209
#if defined(HAVE_LONG_DOUBLE)
/*! \brief Zero the first n long doubles of z. */
SPAN_DECLARE(void) vec_zerol(long double z[], int n)
{
    long double *p;

    for (p = z; n > 0; n--)
        *p++ = 0.0L;
}
/*- End of function --------------------------------------------------------*/
#endif
220
#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2)
/*! \brief Set the first n floats of z to the value x - SSE2 version.
    \param z The vector to be set.
    \param x The value to store in every element.
    \param n The number of elements to set. */
SPAN_DECLARE(void) vec_setf(float z[], float x, int n)
{
    int i;
    __m128 n1;

    /* Broadcast x to all 4 lanes and store 4 floats at a time. */
    if ((i = n & ~3))
    {
        n1 = _mm_set1_ps(x);
        for (i -= 4; i >= 0; i -= 4)
            _mm_storeu_ps(z + i, n1);
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = x;
        /* Fall through */
    case 2:
        z[n - 2] = x;
        /* Fall through */
    case 1:
        z[n - 1] = x;
    }
}
#else
/*! \brief Set the first n floats of z to the value x - portable version. */
SPAN_DECLARE(void) vec_setf(float z[], float x, int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x;
}
#endif
/*- End of function --------------------------------------------------------*/
254
/*! \brief Set the first n doubles of z to the value x. */
SPAN_DECLARE(void) vec_set(double z[], double x, int n)
{
    double *p;

    for (p = z; n > 0; n--)
        *p++ = x;
}
/*- End of function --------------------------------------------------------*/
263
#if defined(HAVE_LONG_DOUBLE)
/*! \brief Set the first n long doubles of z to the value x. */
SPAN_DECLARE(void) vec_setl(long double z[], long double x, int n)
{
    long double *p;

    for (p = z; n > 0; n--)
        *p++ = x;
}
/*- End of function --------------------------------------------------------*/
#endif
274
#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2)
/*! \brief Elementwise vector add, z[i] = x[i] + y[i] - SSE2 version.
    \param z The destination vector.
    \param x The first source vector.
    \param y The second source vector.
    \param n The number of elements to process. */
SPAN_DECLARE(void) vec_addf(float z[], const float x[], const float y[], int n)
{
    int i;
    __m128 n1;
    __m128 n2;

    /* Add 4 floats at a time, stepping downwards from the last whole group of 4. */
    if ((i = n & ~3))
    {
        for (i -= 4; i >= 0; i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n2 = _mm_loadu_ps(y + i);
            n2 = _mm_add_ps(n1, n2);
            _mm_storeu_ps(z + i, n2);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = x[n - 3] + y[n - 3];
        /* Fall through */
    case 2:
        z[n - 2] = x[n - 2] + y[n - 2];
        /* Fall through */
    case 1:
        z[n - 1] = x[n - 1] + y[n - 1];
    }
}
#else
/*! \brief Elementwise vector add, z[i] = x[i] + y[i] - portable version. */
SPAN_DECLARE(void) vec_addf(float z[], const float x[], const float y[], int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i] + y[i];
}
#endif
/*- End of function --------------------------------------------------------*/
313
/*! \brief Elementwise vector add, z[i] = x[i] + y[i], for n doubles. */
SPAN_DECLARE(void) vec_add(double z[], const double x[], const double y[], int n)
{
    int j;

    for (j = n - 1; j >= 0; j--)
        z[j] = x[j] + y[j];
}
/*- End of function --------------------------------------------------------*/
322
#if defined(HAVE_LONG_DOUBLE)
/*! \brief Elementwise vector add, z[i] = x[i] + y[i], for n long doubles. */
SPAN_DECLARE(void) vec_addl(long double z[], const long double x[], const long double y[], int n)
{
    int j;

    for (j = n - 1; j >= 0; j--)
        z[j] = x[j] + y[j];
}
/*- End of function --------------------------------------------------------*/
#endif
333
#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2)
/*! \brief Scaled vector add, z[i] = x[i]*x_scale + y[i]*y_scale - SSE2 version.
    \param z The destination vector.
    \param x The first source vector.
    \param x_scale Scale factor applied to every element of x.
    \param y The second source vector.
    \param y_scale Scale factor applied to every element of y.
    \param n The number of elements to process. */
SPAN_DECLARE(void) vec_scaledxy_addf(float z[], const float x[], float x_scale, const float y[], float y_scale, int n)
{
    int i;
    __m128 n1;
    __m128 n2;
    __m128 n3;
    __m128 n4;

    if ((i = n & ~3))
    {
        /* Broadcast the two scale factors once, outside the loop. */
        n3 = _mm_set1_ps(x_scale);
        n4 = _mm_set1_ps(y_scale);
        for (i -= 4; i >= 0; i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n2 = _mm_loadu_ps(y + i);
            n1 = _mm_mul_ps(n1, n3);
            n2 = _mm_mul_ps(n2, n4);
            n2 = _mm_add_ps(n1, n2);
            _mm_storeu_ps(z + i, n2);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = x[n - 3]*x_scale + y[n - 3]*y_scale;
        /* Fall through */
    case 2:
        z[n - 2] = x[n - 2]*x_scale + y[n - 2]*y_scale;
        /* Fall through */
    case 1:
        z[n - 1] = x[n - 1]*x_scale + y[n - 1]*y_scale;
    }
}
#else
/*! \brief Scaled vector add, z[i] = x[i]*x_scale + y[i]*y_scale - portable version. */
SPAN_DECLARE(void) vec_scaledxy_addf(float z[], const float x[], float x_scale, const float y[], float y_scale, int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i]*x_scale + y[i]*y_scale;
}
#endif
/*- End of function --------------------------------------------------------*/
378
/*! \brief Scaled vector add, z[i] = x[i]*x_scale + y[i]*y_scale, for n doubles. */
SPAN_DECLARE(void) vec_scaledxy_add(double z[], const double x[], double x_scale, const double y[], double y_scale, int n)
{
    int j;

    for (j = n - 1; j >= 0; j--)
        z[j] = x[j]*x_scale + y[j]*y_scale;
}
/*- End of function --------------------------------------------------------*/
387
#if defined(HAVE_LONG_DOUBLE)
/*! \brief Scaled vector add, z[i] = x[i]*x_scale + y[i]*y_scale, for n long doubles. */
SPAN_DECLARE(void) vec_scaledxy_addl(long double z[], const long double x[], long double x_scale, const long double y[], long double y_scale, int n)
{
    int j;

    for (j = n - 1; j >= 0; j--)
        z[j] = x[j]*x_scale + y[j]*y_scale;
}
/*- End of function --------------------------------------------------------*/
#endif
398
#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2)
/*! \brief Scaled-y vector add, z[i] = x[i] + y[i]*y_scale - SSE2 version.
    \param z The destination vector.
    \param x The first source vector.
    \param y The second source vector.
    \param y_scale Scale factor applied to every element of y.
    \param n The number of elements to process. */
SPAN_DECLARE(void) vec_scaledy_addf(float z[], const float x[], const float y[], float y_scale, int n)
{
    int i;
    __m128 n1;
    __m128 n2;
    __m128 n3;

    if ((i = n & ~3))
    {
        /* Broadcast the scale factor once, outside the loop. */
        n3 = _mm_set1_ps(y_scale);
        for (i -= 4; i >= 0; i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n2 = _mm_loadu_ps(y + i);
            n2 = _mm_mul_ps(n2, n3);
            n2 = _mm_add_ps(n1, n2);
            _mm_storeu_ps(z + i, n2);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = x[n - 3] + y[n - 3]*y_scale;
        /* Fall through */
    case 2:
        z[n - 2] = x[n - 2] + y[n - 2]*y_scale;
        /* Fall through */
    case 1:
        z[n - 1] = x[n - 1] + y[n - 1]*y_scale;
    }
}
#else
/*! \brief Scaled-y vector add, z[i] = x[i] + y[i]*y_scale - portable version. */
SPAN_DECLARE(void) vec_scaledy_addf(float z[], const float x[], const float y[], float y_scale, int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i] + y[i]*y_scale;
}
#endif
/*- End of function --------------------------------------------------------*/
440
/*! \brief Scaled-y vector add, z[i] = x[i] + y[i]*y_scale, for n doubles. */
SPAN_DECLARE(void) vec_scaledy_add(double z[], const double x[], const double y[], double y_scale, int n)
{
    int j;

    for (j = n - 1; j >= 0; j--)
        z[j] = x[j] + y[j]*y_scale;
}
/*- End of function --------------------------------------------------------*/
449
#if defined(HAVE_LONG_DOUBLE)
/*! \brief Scaled-y vector add, z[i] = x[i] + y[i]*y_scale, for n long doubles. */
SPAN_DECLARE(void) vec_scaledy_addl(long double z[], const long double x[], const long double y[], long double y_scale, int n)
{
    int j;

    for (j = n - 1; j >= 0; j--)
        z[j] = x[j] + y[j]*y_scale;
}
/*- End of function --------------------------------------------------------*/
#endif
460
#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2)
/*! \brief Elementwise vector subtract, z[i] = x[i] - y[i] - SSE2 version.
    \param z The destination vector.
    \param x The minuend vector.
    \param y The subtrahend vector.
    \param n The number of elements to process. */
SPAN_DECLARE(void) vec_subf(float z[], const float x[], const float y[], int n)
{
    int i;
    __m128 n1;
    __m128 n2;

    /* Subtract 4 floats at a time, stepping downwards from the last whole group of 4. */
    if ((i = n & ~3))
    {
        for (i -= 4; i >= 0; i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n2 = _mm_loadu_ps(y + i);
            n2 = _mm_sub_ps(n1, n2);
            _mm_storeu_ps(z + i, n2);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = x[n - 3] - y[n - 3];
        /* Fall through */
    case 2:
        z[n - 2] = x[n - 2] - y[n - 2];
        /* Fall through */
    case 1:
        z[n - 1] = x[n - 1] - y[n - 1];
    }
}
#else
/*! \brief Elementwise vector subtract, z[i] = x[i] - y[i] - portable version. */
SPAN_DECLARE(void) vec_subf(float z[], const float x[], const float y[], int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i] - y[i];
}
#endif
/*- End of function --------------------------------------------------------*/
499
/*! \brief Elementwise vector subtract, z[i] = x[i] - y[i], for n doubles. */
SPAN_DECLARE(void) vec_sub(double z[], const double x[], const double y[], int n)
{
    int j;

    for (j = n - 1; j >= 0; j--)
        z[j] = x[j] - y[j];
}
/*- End of function --------------------------------------------------------*/
508
#if defined(HAVE_LONG_DOUBLE)
/*! \brief Elementwise vector subtract, z[i] = x[i] - y[i], for n long doubles. */
SPAN_DECLARE(void) vec_subl(long double z[], const long double x[], const long double y[], int n)
{
    int j;

    for (j = n - 1; j >= 0; j--)
        z[j] = x[j] - y[j];
}
/*- End of function --------------------------------------------------------*/
#endif
519
/*! \brief Scaled vector subtract, z[i] = x[i]*x_scale - y[i]*y_scale, for n floats. */
SPAN_DECLARE(void) vec_scaledxy_subf(float z[], const float x[], float x_scale, const float y[], float y_scale, int n)
{
    int j;

    for (j = n - 1; j >= 0; j--)
        z[j] = x[j]*x_scale - y[j]*y_scale;
}
/*- End of function --------------------------------------------------------*/
528
/*! \brief Scaled vector subtract, z[i] = x[i]*x_scale - y[i]*y_scale, for n doubles. */
SPAN_DECLARE(void) vec_scaledxy_sub(double z[], const double x[], double x_scale, const double y[], double y_scale, int n)
{
    int j;

    for (j = n - 1; j >= 0; j--)
        z[j] = x[j]*x_scale - y[j]*y_scale;
}
/*- End of function --------------------------------------------------------*/
537
#if defined(HAVE_LONG_DOUBLE)
/*! \brief Scaled vector subtract, z[i] = x[i]*x_scale - y[i]*y_scale, for n long doubles. */
SPAN_DECLARE(void) vec_scaledxy_subl(long double z[], const long double x[], long double x_scale, const long double y[], long double y_scale, int n)
{
    int j;

    for (j = n - 1; j >= 0; j--)
        z[j] = x[j]*x_scale - y[j]*y_scale;
}
/*- End of function --------------------------------------------------------*/
#endif
548
#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2)
/*! \brief Multiply every element of a float vector by a scalar, z[i] = x[i]*y - SSE2 version.
    \param z The destination vector.
    \param x The source vector.
    \param y The scalar multiplier.
    \param n The number of elements to process. */
SPAN_DECLARE(void) vec_scalar_mulf(float z[], const float x[], float y, int n)
{
    int i;
    __m128 n1;
    __m128 n2;

    if ((i = n & ~3))
    {
        /* Broadcast the scalar once, outside the loop. */
        n2 = _mm_set1_ps(y);
        for (i -= 4; i >= 0; i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n1 = _mm_mul_ps(n1, n2);
            _mm_storeu_ps(z + i, n1);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = x[n - 3]*y;
        /* Fall through */
    case 2:
        z[n - 2] = x[n - 2]*y;
        /* Fall through */
    case 1:
        z[n - 1] = x[n - 1]*y;
    }
}
#else
/*! \brief Multiply every element of a float vector by a scalar - portable version. */
SPAN_DECLARE(void) vec_scalar_mulf(float z[], const float x[], float y, int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i]*y;
}
#endif
/*- End of function --------------------------------------------------------*/
587
/*! \brief Multiply every element of a double vector by a scalar, z[i] = x[i]*y. */
SPAN_DECLARE(void) vec_scalar_mul(double z[], const double x[], double y, int n)
{
    int j;

    for (j = n - 1; j >= 0; j--)
        z[j] = x[j]*y;
}
/*- End of function --------------------------------------------------------*/
596
#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2)
/*! \brief Add a scalar to every element of a float vector, z[i] = x[i] + y - SSE2 version.
    \param z The destination vector.
    \param x The source vector.
    \param y The scalar addend.
    \param n The number of elements to process. */
SPAN_DECLARE(void) vec_scalar_addf(float z[], const float x[], float y, int n)
{
    int i;
    __m128 n1;
    __m128 n2;

    if ((i = n & ~3))
    {
        /* Broadcast the scalar once, outside the loop. */
        n2 = _mm_set1_ps(y);
        for (i -= 4; i >= 0; i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n1 = _mm_add_ps(n1, n2);
            _mm_storeu_ps(z + i, n1);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = x[n - 3] + y;
        /* Fall through */
    case 2:
        z[n - 2] = x[n - 2] + y;
        /* Fall through */
    case 1:
        z[n - 1] = x[n - 1] + y;
    }
}
#else
/*! \brief Add a scalar to every element of a float vector - portable version. */
SPAN_DECLARE(void) vec_scalar_addf(float z[], const float x[], float y, int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i] + y;
}
#endif
/*- End of function --------------------------------------------------------*/
635
/*! \brief Add a scalar to every element of a double vector, z[i] = x[i] + y. */
SPAN_DECLARE(void) vec_scalar_add(double z[], const double x[], double y, int n)
{
    int j;

    for (j = n - 1; j >= 0; j--)
        z[j] = x[j] + y;
}
/*- End of function --------------------------------------------------------*/
644
#if defined(HAVE_LONG_DOUBLE)
/*! \brief Add a scalar to every element of a long double vector, z[i] = x[i] + y. */
SPAN_DECLARE(void) vec_scalar_addl(long double z[], const long double x[], long double y, int n)
{
    int j;

    for (j = n - 1; j >= 0; j--)
        z[j] = x[j] + y;
}
/*- End of function --------------------------------------------------------*/
#endif
655
#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2)
/*! \brief Subtract a scalar from every element of a float vector, z[i] = x[i] - y - SSE2 version.
    \param z The destination vector.
    \param x The source vector.
    \param y The scalar subtrahend.
    \param n The number of elements to process. */
SPAN_DECLARE(void) vec_scalar_subf(float z[], const float x[], float y, int n)
{
    int i;
    __m128 n1;
    __m128 n2;

    if ((i = n & ~3))
    {
        /* Broadcast the scalar once, outside the loop. */
        n2 = _mm_set1_ps(y);
        for (i -= 4; i >= 0; i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n1 = _mm_sub_ps(n1, n2);
            _mm_storeu_ps(z + i, n1);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = x[n - 3] - y;
        /* Fall through */
    case 2:
        z[n - 2] = x[n - 2] - y;
        /* Fall through */
    case 1:
        z[n - 1] = x[n - 1] - y;
    }
}
#else
/*! \brief Subtract a scalar from every element of a float vector - portable version. */
SPAN_DECLARE(void) vec_scalar_subf(float z[], const float x[], float y, int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i] - y;
}
#endif
/*- End of function --------------------------------------------------------*/
694
/*! \brief Subtract a scalar from every element of a double vector, z[i] = x[i] - y. */
SPAN_DECLARE(void) vec_scalar_sub(double z[], const double x[], double y, int n)
{
    int j;

    for (j = n - 1; j >= 0; j--)
        z[j] = x[j] - y;
}
/*- End of function --------------------------------------------------------*/
703
#if defined(HAVE_LONG_DOUBLE)
/*! \brief Subtract a scalar from every element of a long double vector, z[i] = x[i] - y. */
SPAN_DECLARE(void) vec_scalar_subl(long double z[], const long double x[], long double y, int n)
{
    int j;

    for (j = n - 1; j >= 0; j--)
        z[j] = x[j] - y;
}
/*- End of function --------------------------------------------------------*/
#endif
714
#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2)
/*! \brief Elementwise vector multiply, z[i] = x[i]*y[i] - SSE2 version.
    \param z The destination vector.
    \param x The first source vector.
    \param y The second source vector.
    \param n The number of elements to process. */
SPAN_DECLARE(void) vec_mulf(float z[], const float x[], const float y[], int n)
{
    int i;
    __m128 n1;
    __m128 n2;
    __m128 n3;

    /* Multiply 4 floats at a time, stepping downwards from the last whole group of 4. */
    if ((i = n & ~3))
    {
        for (i -= 4; i >= 0; i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n2 = _mm_loadu_ps(y + i);
            n3 = _mm_mul_ps(n1, n2);
            _mm_storeu_ps(z + i, n3);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = x[n - 3]*y[n - 3];
        /* Fall through */
    case 2:
        z[n - 2] = x[n - 2]*y[n - 2];
        /* Fall through */
    case 1:
        z[n - 1] = x[n - 1]*y[n - 1];
    }
}
#else
/*! \brief Elementwise vector multiply, z[i] = x[i]*y[i] - portable version. */
SPAN_DECLARE(void) vec_mulf(float z[], const float x[], const float y[], int n)
{
    int i;

    for (i = 0; i < n; i++)
        z[i] = x[i]*y[i];
}
/*- End of function --------------------------------------------------------*/
#endif
754
/*! \brief Elementwise vector multiply, z[i] = x[i]*y[i], for n doubles. */
SPAN_DECLARE(void) vec_mul(double z[], const double x[], const double y[], int n)
{
    int j;

    for (j = n - 1; j >= 0; j--)
        z[j] = x[j]*y[j];
}
/*- End of function --------------------------------------------------------*/
763
#if defined(HAVE_LONG_DOUBLE)
/*! \brief Elementwise vector multiply, z[i] = x[i]*y[i], for n long doubles. */
SPAN_DECLARE(void) vec_mull(long double z[], const long double x[], const long double y[], int n)
{
    int j;

    for (j = n - 1; j >= 0; j--)
        z[j] = x[j]*y[j];
}
/*- End of function --------------------------------------------------------*/
#endif
774
#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2)
/*! \brief Dot product of two float vectors - SSE2 version.
    \param x The first vector.
    \param y The second vector.
    \param n The number of elements to process.
    \return The sum of x[i]*y[i] over the first n elements.
    Note that the summation order differs from the portable version
    (4 partial sums, then a horizontal add), so rounding may differ slightly. */
SPAN_DECLARE(float) vec_dot_prodf(const float x[], const float y[], int n)
{
    int i;
    float z;
    __m128 n1;
    __m128 n2;
    __m128 n3;
    __m128 n4;

    z = 0.0f;
    if ((i = n & ~3))
    {
        n4 = _mm_setzero_ps(); //sets sum to zero
        for (i -= 4; i >= 0; i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n2 = _mm_loadu_ps(y + i);
            n3 = _mm_mul_ps(n1, n2);
            n4 = _mm_add_ps(n4, n3);
        }
        /* Horizontal add: fold the 4 partial sums down to a single scalar.
           movehl adds the upper pair onto the lower pair; the shuffle then
           adds lane 1 onto lane 0. */
        n4 = _mm_add_ps(_mm_movehl_ps(n4, n4), n4);
        n4 = _mm_add_ss(_mm_shuffle_ps(n4, n4, 1), n4);
        _mm_store_ss(&z, n4);
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z += x[n - 3]*y[n - 3];
        /* Fall through */
    case 2:
        z += x[n - 2]*y[n - 2];
        /* Fall through */
    case 1:
        z += x[n - 1]*y[n - 1];
    }
    return z;
}
#else
/*! \brief Dot product of two float vectors - portable version. */
SPAN_DECLARE(float) vec_dot_prodf(const float x[], const float y[], int n)
{
    int i;
    float z;

    z = 0.0f;
    for (i = 0; i < n; i++)
        z += x[i]*y[i];
    return z;
}
/*- End of function --------------------------------------------------------*/
#endif
825
/*! \brief Dot product of two double vectors.
    \return The sum of x[i]*y[i] over the first n elements, accumulated in
            ascending index order. */
SPAN_DECLARE(double) vec_dot_prod(const double x[], const double y[], int n)
{
    double sum;
    int i;

    sum = 0.0;
    i = 0;
    while (i < n)
    {
        sum += x[i]*y[i];
        i++;
    }
    return sum;
}
/*- End of function --------------------------------------------------------*/
837
#if defined(HAVE_LONG_DOUBLE)
/*! \brief Dot product of two long double vectors.
    \return The sum of x[i]*y[i] over the first n elements, accumulated in
            ascending index order. */
SPAN_DECLARE(long double) vec_dot_prodl(const long double x[], const long double y[], int n)
{
    long double sum;
    int i;

    sum = 0.0L;
    i = 0;
    while (i < n)
    {
        sum += x[i]*y[i];
        i++;
    }
    return sum;
}
/*- End of function --------------------------------------------------------*/
#endif
851
/*! \brief Dot product of a linear vector y against a circular buffer x,
    starting at offset pos. The tail of x (from pos) pairs with the head of
    y, and the head of x pairs with the tail of y. */
SPAN_DECLARE(float) vec_circular_dot_prodf(const float x[], const float y[], int n, int pos)
{
    float sum;

    sum = vec_dot_prodf(x + pos, y, n - pos);
    sum += vec_dot_prodf(x, y + (n - pos), pos);
    return sum;
}
/*- End of function --------------------------------------------------------*/
861
862 #define LMS_LEAK_RATE 0.9999f
863
#if defined(__GNUC__) && defined(SPANDSP_USE_SSE2)
/*! \brief LMS adaptation step: y[i] = y[i]*LMS_LEAK_RATE + x[i]*error - SSE2 version.
    \param x The input (reference) vector.
    \param y The coefficient vector, updated in place.
    \param n The number of elements to process.
    \param error The scaled error term to apply. */
SPAN_DECLARE(void) vec_lmsf(const float x[], float y[], int n, float error)
{
    int i;
    __m128 n1;
    __m128 n2;
    __m128 n3;
    __m128 n4;

    if ((i = n & ~3))
    {
        /* Broadcast the error term and the leak factor once, outside the loop. */
        n3 = _mm_set1_ps(error);
        n4 = _mm_set1_ps(LMS_LEAK_RATE);
        for (i -= 4; i >= 0; i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n2 = _mm_loadu_ps(y + i);
            n1 = _mm_mul_ps(n1, n3);
            n2 = _mm_mul_ps(n2, n4);
            n1 = _mm_add_ps(n1, n2);
            _mm_storeu_ps(y + i, n1);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        y[n - 3] = y[n - 3]*LMS_LEAK_RATE + x[n - 3]*error;
        /* Fall through */
    case 2:
        y[n - 2] = y[n - 2]*LMS_LEAK_RATE + x[n - 2]*error;
        /* Fall through */
    case 1:
        y[n - 1] = y[n - 1]*LMS_LEAK_RATE + x[n - 1]*error;
    }
}
#else
/*! \brief LMS adaptation step: y[i] = y[i]*LMS_LEAK_RATE + x[i]*error - portable version. */
SPAN_DECLARE(void) vec_lmsf(const float x[], float y[], int n, float error)
{
    int i;

    for (i = 0; i < n; i++)
    {
        /* Leak a little to tame uncontrolled wandering */
        y[i] = y[i]*LMS_LEAK_RATE + x[i]*error;
    }
}
#endif
/*- End of function --------------------------------------------------------*/
911
/*! \brief LMS adaptation of coefficients y against a circular buffer x,
    starting at offset pos. The two linear segments of the circular buffer
    are updated separately. */
SPAN_DECLARE(void) vec_circular_lmsf(const float x[], float y[], int n, int pos, float error)
{
    vec_lmsf(x + pos, y, n - pos, error);
    vec_lmsf(x, y + (n - pos), pos, error);
}
/*- End of function --------------------------------------------------------*/
918 /*- End of file ------------------------------------------------------------*/
919