/*
 * SpanDSP - a series of DSP components for telephony
 *
 * vector_float.c - Floating point vector arithmetic routines.
 *
 * Written by Steve Underwood <steveu@coppice.org>
 *
 * Copyright (C) 2006 Steve Underwood
 *
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License version 2.1,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*! \file */

#if defined(HAVE_CONFIG_H)
#include "config.h"
#endif

#include <inttypes.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#if defined(HAVE_TGMATH_H)
#include <tgmath.h>
#endif
#if defined(HAVE_MATH_H)
#include <math.h>
#endif
#include <assert.h>

#include "floating_fudge.h"
#include "mmx_sse_decs.h"

#include "spandsp/telephony.h"
#include "spandsp/vector_float.h"

#if defined(__GNUC__)  &&  defined(SPANDSP_USE_SSE2)
SPAN_DECLARE(void) vec_copyf(float z[], const float x[], int n)
{
    int i;
    __m128 n1;

    if ((i = n & ~3))
    {
        for (i -= 4;  i >= 0;  i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            _mm_storeu_ps(z + i, n1);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = x[n - 3];
        /* Fall through */
    case 2:
        z[n - 2] = x[n - 2];
        /* Fall through */
    case 1:
        z[n - 1] = x[n - 1];
    }
}
#else
SPAN_DECLARE(void) vec_copyf(float z[], const float x[], int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = x[i];
}
#endif
/*- End of function --------------------------------------------------------*/

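/* All the SSE2 code paths in this file share one decomposition: a quad loop
 * covers the largest multiple of four elements (n & ~3) with unaligned loads
 * and stores, and a switch with fall-through cases mops up the remaining
 * n & 3 elements. A minimal sketch of how that split behaves, guarded by a
 * hypothetical VECTOR_FLOAT_EXAMPLES macro which is not part of the spandsp
 * build:
 */
#if defined(VECTOR_FLOAT_EXAMPLES)
static void example_quad_tail_split(void)
{
    float x[7] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
    float z[7];

    /* For n = 7 the quad loop handles elements 0 to 3, and the tail switch
       handles elements 4 to 6 (cases 3, 2 and 1, falling through) */
    vec_copyf(z, x, 7);
    printf("%f ... %f\n", z[0], z[6]);
}
#endif
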
SPAN_DECLARE(void) vec_copy(double z[], const double x[], int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = x[i];
}
/*- End of function --------------------------------------------------------*/

#if defined(HAVE_LONG_DOUBLE)
SPAN_DECLARE(void) vec_copyl(long double z[], const long double x[], int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = x[i];
}
/*- End of function --------------------------------------------------------*/
#endif

#if defined(__GNUC__)  &&  defined(SPANDSP_USE_SSE2)
SPAN_DECLARE(void) vec_negatef(float z[], const float x[], int n)
{
    int i;
    __m128 n1;
    __m128 n2;

    if ((i = n & ~3))
    {
        /* -0.0f has only the sign bit set, and XORing in the sign bit
           negates an IEEE 754 float */
        n2 = _mm_set1_ps(-0.0f);
        for (i -= 4;  i >= 0;  i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n1 = _mm_xor_ps(n1, n2);
            _mm_storeu_ps(z + i, n1);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = -x[n - 3];
        /* Fall through */
    case 2:
        z[n - 2] = -x[n - 2];
        /* Fall through */
    case 1:
        z[n - 1] = -x[n - 1];
    }
}
#else
SPAN_DECLARE(void) vec_negatef(float z[], const float x[], int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = -x[i];
}
#endif
/*- End of function --------------------------------------------------------*/

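/* Negating an IEEE 754 single precision value only flips its sign bit,
 * which is why the SSE2 path above can negate four floats at once by XORing
 * with a register full of -0.0f. A scalar check of the same identity, using
 * memcpy for well defined type punning; the VECTOR_FLOAT_EXAMPLES guard is
 * hypothetical, as above:
 */
#if defined(VECTOR_FLOAT_EXAMPLES)
static void example_sign_bit_negate(void)
{
    float x = 1.5f;
    uint32_t bits;

    memcpy(&bits, &x, sizeof(bits));
    bits ^= 0x80000000;                 /* Flip only the sign bit */
    memcpy(&x, &bits, sizeof(x));
    printf("%f\n", x);                  /* Prints -1.500000 */
}
#endif
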
SPAN_DECLARE(void) vec_negate(double z[], const double x[], int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = -x[i];
}
/*- End of function --------------------------------------------------------*/

#if defined(HAVE_LONG_DOUBLE)
SPAN_DECLARE(void) vec_negatel(long double z[], const long double x[], int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = -x[i];
}
/*- End of function --------------------------------------------------------*/
#endif

#if defined(__GNUC__)  &&  defined(SPANDSP_USE_SSE2)
SPAN_DECLARE(void) vec_zerof(float z[], int n)
{
    int i;
    __m128 n1;

    if ((i = n & ~3))
    {
        n1 = _mm_setzero_ps();
        for (i -= 4;  i >= 0;  i -= 4)
            _mm_storeu_ps(z + i, n1);
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = 0.0f;
        /* Fall through */
    case 2:
        z[n - 2] = 0.0f;
        /* Fall through */
    case 1:
        z[n - 1] = 0.0f;
    }
}
#else
SPAN_DECLARE(void) vec_zerof(float z[], int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = 0.0f;
}
#endif
/*- End of function --------------------------------------------------------*/

SPAN_DECLARE(void) vec_zero(double z[], int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = 0.0;
}
/*- End of function --------------------------------------------------------*/

#if defined(HAVE_LONG_DOUBLE)
SPAN_DECLARE(void) vec_zerol(long double z[], int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = 0.0L;
}
/*- End of function --------------------------------------------------------*/
#endif

#if defined(__GNUC__)  &&  defined(SPANDSP_USE_SSE2)
SPAN_DECLARE(void) vec_setf(float z[], float x, int n)
{
    int i;
    __m128 n1;

    if ((i = n & ~3))
    {
        n1 = _mm_set1_ps(x);
        for (i -= 4;  i >= 0;  i -= 4)
            _mm_storeu_ps(z + i, n1);
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = x;
        /* Fall through */
    case 2:
        z[n - 2] = x;
        /* Fall through */
    case 1:
        z[n - 1] = x;
    }
}
#else
SPAN_DECLARE(void) vec_setf(float z[], float x, int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = x;
}
#endif
/*- End of function --------------------------------------------------------*/

SPAN_DECLARE(void) vec_set(double z[], double x, int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = x;
}
/*- End of function --------------------------------------------------------*/

#if defined(HAVE_LONG_DOUBLE)
SPAN_DECLARE(void) vec_setl(long double z[], long double x, int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = x;
}
/*- End of function --------------------------------------------------------*/
#endif

#if defined(__GNUC__)  &&  defined(SPANDSP_USE_SSE2)
SPAN_DECLARE(void) vec_addf(float z[], const float x[], const float y[], int n)
{
    int i;
    __m128 n1;
    __m128 n2;

    if ((i = n & ~3))
    {
        for (i -= 4;  i >= 0;  i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n2 = _mm_loadu_ps(y + i);
            n2 = _mm_add_ps(n1, n2);
            _mm_storeu_ps(z + i, n2);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = x[n - 3] + y[n - 3];
        /* Fall through */
    case 2:
        z[n - 2] = x[n - 2] + y[n - 2];
        /* Fall through */
    case 1:
        z[n - 1] = x[n - 1] + y[n - 1];
    }
}
#else
SPAN_DECLARE(void) vec_addf(float z[], const float x[], const float y[], int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = x[i] + y[i];
}
#endif
/*- End of function --------------------------------------------------------*/

SPAN_DECLARE(void) vec_add(double z[], const double x[], const double y[], int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = x[i] + y[i];
}
/*- End of function --------------------------------------------------------*/

#if defined(HAVE_LONG_DOUBLE)
SPAN_DECLARE(void) vec_addl(long double z[], const long double x[], const long double y[], int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = x[i] + y[i];
}
/*- End of function --------------------------------------------------------*/
#endif

#if defined(__GNUC__)  &&  defined(SPANDSP_USE_SSE2)
SPAN_DECLARE(void) vec_scaledxy_addf(float z[], const float x[], float x_scale, const float y[], float y_scale, int n)
{
    int i;
    __m128 n1;
    __m128 n2;
    __m128 n3;
    __m128 n4;

    if ((i = n & ~3))
    {
        n3 = _mm_set1_ps(x_scale);
        n4 = _mm_set1_ps(y_scale);
        for (i -= 4;  i >= 0;  i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n2 = _mm_loadu_ps(y + i);
            n1 = _mm_mul_ps(n1, n3);
            n2 = _mm_mul_ps(n2, n4);
            n2 = _mm_add_ps(n1, n2);
            _mm_storeu_ps(z + i, n2);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = x[n - 3]*x_scale + y[n - 3]*y_scale;
        /* Fall through */
    case 2:
        z[n - 2] = x[n - 2]*x_scale + y[n - 2]*y_scale;
        /* Fall through */
    case 1:
        z[n - 1] = x[n - 1]*x_scale + y[n - 1]*y_scale;
    }
}
#else
SPAN_DECLARE(void) vec_scaledxy_addf(float z[], const float x[], float x_scale, const float y[], float y_scale, int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = x[i]*x_scale + y[i]*y_scale;
}
#endif
/*- End of function --------------------------------------------------------*/

SPAN_DECLARE(void) vec_scaledxy_add(double z[], const double x[], double x_scale, const double y[], double y_scale, int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = x[i]*x_scale + y[i]*y_scale;
}
/*- End of function --------------------------------------------------------*/

#if defined(HAVE_LONG_DOUBLE)
SPAN_DECLARE(void) vec_scaledxy_addl(long double z[], const long double x[], long double x_scale, const long double y[], long double y_scale, int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = x[i]*x_scale + y[i]*y_scale;
}
/*- End of function --------------------------------------------------------*/
#endif

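/* The scaled add forms compute an elementwise weighted sum, which makes
 * them a natural fit for jobs like crossfading between two signals. A small
 * usage sketch (example_crossfadef is illustrative, not a spandsp API; the
 * VECTOR_FLOAT_EXAMPLES guard is hypothetical, as above):
 */
#if defined(VECTOR_FLOAT_EXAMPLES)
static void example_crossfadef(float z[], const float a[], const float b[], float mix, int n)
{
    /* mix = 0.0f gives pure a, mix = 1.0f gives pure b */
    vec_scaledxy_addf(z, a, 1.0f - mix, b, mix, n);
}
#endif
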
#if defined(__GNUC__)  &&  defined(SPANDSP_USE_SSE2)
SPAN_DECLARE(void) vec_scaledy_addf(float z[], const float x[], const float y[], float y_scale, int n)
{
    int i;
    __m128 n1;
    __m128 n2;
    __m128 n3;

    if ((i = n & ~3))
    {
        n3 = _mm_set1_ps(y_scale);
        for (i -= 4;  i >= 0;  i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n2 = _mm_loadu_ps(y + i);
            n2 = _mm_mul_ps(n2, n3);
            n2 = _mm_add_ps(n1, n2);
            _mm_storeu_ps(z + i, n2);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = x[n - 3] + y[n - 3]*y_scale;
        /* Fall through */
    case 2:
        z[n - 2] = x[n - 2] + y[n - 2]*y_scale;
        /* Fall through */
    case 1:
        z[n - 1] = x[n - 1] + y[n - 1]*y_scale;
    }
}
#else
SPAN_DECLARE(void) vec_scaledy_addf(float z[], const float x[], const float y[], float y_scale, int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = x[i] + y[i]*y_scale;
}
#endif
/*- End of function --------------------------------------------------------*/

SPAN_DECLARE(void) vec_scaledy_add(double z[], const double x[], const double y[], double y_scale, int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = x[i] + y[i]*y_scale;
}
/*- End of function --------------------------------------------------------*/

#if defined(HAVE_LONG_DOUBLE)
SPAN_DECLARE(void) vec_scaledy_addl(long double z[], const long double x[], const long double y[], long double y_scale, int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = x[i] + y[i]*y_scale;
}
/*- End of function --------------------------------------------------------*/
#endif

#if defined(__GNUC__)  &&  defined(SPANDSP_USE_SSE2)
SPAN_DECLARE(void) vec_subf(float z[], const float x[], const float y[], int n)
{
    int i;
    __m128 n1;
    __m128 n2;

    if ((i = n & ~3))
    {
        for (i -= 4;  i >= 0;  i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n2 = _mm_loadu_ps(y + i);
            n2 = _mm_sub_ps(n1, n2);
            _mm_storeu_ps(z + i, n2);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = x[n - 3] - y[n - 3];
        /* Fall through */
    case 2:
        z[n - 2] = x[n - 2] - y[n - 2];
        /* Fall through */
    case 1:
        z[n - 1] = x[n - 1] - y[n - 1];
    }
}
#else
SPAN_DECLARE(void) vec_subf(float z[], const float x[], const float y[], int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = x[i] - y[i];
}
#endif
/*- End of function --------------------------------------------------------*/

SPAN_DECLARE(void) vec_sub(double z[], const double x[], const double y[], int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = x[i] - y[i];
}
/*- End of function --------------------------------------------------------*/

#if defined(HAVE_LONG_DOUBLE)
SPAN_DECLARE(void) vec_subl(long double z[], const long double x[], const long double y[], int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = x[i] - y[i];
}
/*- End of function --------------------------------------------------------*/
#endif

SPAN_DECLARE(void) vec_scaledxy_subf(float z[], const float x[], float x_scale, const float y[], float y_scale, int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = x[i]*x_scale - y[i]*y_scale;
}
/*- End of function --------------------------------------------------------*/

SPAN_DECLARE(void) vec_scaledxy_sub(double z[], const double x[], double x_scale, const double y[], double y_scale, int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = x[i]*x_scale - y[i]*y_scale;
}
/*- End of function --------------------------------------------------------*/

#if defined(HAVE_LONG_DOUBLE)
SPAN_DECLARE(void) vec_scaledxy_subl(long double z[], const long double x[], long double x_scale, const long double y[], long double y_scale, int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = x[i]*x_scale - y[i]*y_scale;
}
/*- End of function --------------------------------------------------------*/
#endif

#if defined(__GNUC__)  &&  defined(SPANDSP_USE_SSE2)
SPAN_DECLARE(void) vec_scalar_mulf(float z[], const float x[], float y, int n)
{
    int i;
    __m128 n1;
    __m128 n2;

    if ((i = n & ~3))
    {
        n2 = _mm_set1_ps(y);
        for (i -= 4;  i >= 0;  i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n1 = _mm_mul_ps(n1, n2);
            _mm_storeu_ps(z + i, n1);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = x[n - 3]*y;
        /* Fall through */
    case 2:
        z[n - 2] = x[n - 2]*y;
        /* Fall through */
    case 1:
        z[n - 1] = x[n - 1]*y;
    }
}
#else
SPAN_DECLARE(void) vec_scalar_mulf(float z[], const float x[], float y, int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = x[i]*y;
}
#endif
/*- End of function --------------------------------------------------------*/

SPAN_DECLARE(void) vec_scalar_mul(double z[], const double x[], double y, int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = x[i]*y;
}
/*- End of function --------------------------------------------------------*/

#if defined(__GNUC__)  &&  defined(SPANDSP_USE_SSE2)
SPAN_DECLARE(void) vec_scalar_addf(float z[], const float x[], float y, int n)
{
    int i;
    __m128 n1;
    __m128 n2;

    if ((i = n & ~3))
    {
        n2 = _mm_set1_ps(y);
        for (i -= 4;  i >= 0;  i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n1 = _mm_add_ps(n1, n2);
            _mm_storeu_ps(z + i, n1);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = x[n - 3] + y;
        /* Fall through */
    case 2:
        z[n - 2] = x[n - 2] + y;
        /* Fall through */
    case 1:
        z[n - 1] = x[n - 1] + y;
    }
}
#else
SPAN_DECLARE(void) vec_scalar_addf(float z[], const float x[], float y, int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = x[i] + y;
}
#endif
/*- End of function --------------------------------------------------------*/

SPAN_DECLARE(void) vec_scalar_add(double z[], const double x[], double y, int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = x[i] + y;
}
/*- End of function --------------------------------------------------------*/

#if defined(HAVE_LONG_DOUBLE)
SPAN_DECLARE(void) vec_scalar_addl(long double z[], const long double x[], long double y, int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = x[i] + y;
}
/*- End of function --------------------------------------------------------*/
#endif

#if defined(__GNUC__)  &&  defined(SPANDSP_USE_SSE2)
SPAN_DECLARE(void) vec_scalar_subf(float z[], const float x[], float y, int n)
{
    int i;
    __m128 n1;
    __m128 n2;

    if ((i = n & ~3))
    {
        n2 = _mm_set1_ps(y);
        for (i -= 4;  i >= 0;  i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n1 = _mm_sub_ps(n1, n2);
            _mm_storeu_ps(z + i, n1);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = x[n - 3] - y;
        /* Fall through */
    case 2:
        z[n - 2] = x[n - 2] - y;
        /* Fall through */
    case 1:
        z[n - 1] = x[n - 1] - y;
    }
}
#else
SPAN_DECLARE(void) vec_scalar_subf(float z[], const float x[], float y, int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = x[i] - y;
}
#endif
/*- End of function --------------------------------------------------------*/

SPAN_DECLARE(void) vec_scalar_sub(double z[], const double x[], double y, int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = x[i] - y;
}
/*- End of function --------------------------------------------------------*/

#if defined(HAVE_LONG_DOUBLE)
SPAN_DECLARE(void) vec_scalar_subl(long double z[], const long double x[], long double y, int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = x[i] - y;
}
/*- End of function --------------------------------------------------------*/
#endif

#if defined(__GNUC__)  &&  defined(SPANDSP_USE_SSE2)
SPAN_DECLARE(void) vec_mulf(float z[], const float x[], const float y[], int n)
{
    int i;
    __m128 n1;
    __m128 n2;
    __m128 n3;

    if ((i = n & ~3))
    {
        for (i -= 4;  i >= 0;  i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n2 = _mm_loadu_ps(y + i);
            n3 = _mm_mul_ps(n1, n2);
            _mm_storeu_ps(z + i, n3);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z[n - 3] = x[n - 3]*y[n - 3];
        /* Fall through */
    case 2:
        z[n - 2] = x[n - 2]*y[n - 2];
        /* Fall through */
    case 1:
        z[n - 1] = x[n - 1]*y[n - 1];
    }
}
#else
SPAN_DECLARE(void) vec_mulf(float z[], const float x[], const float y[], int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = x[i]*y[i];
}
#endif
/*- End of function --------------------------------------------------------*/

SPAN_DECLARE(void) vec_mul(double z[], const double x[], const double y[], int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = x[i]*y[i];
}
/*- End of function --------------------------------------------------------*/

#if defined(HAVE_LONG_DOUBLE)
SPAN_DECLARE(void) vec_mull(long double z[], const long double x[], const long double y[], int n)
{
    int i;

    for (i = 0;  i < n;  i++)
        z[i] = x[i]*y[i];
}
/*- End of function --------------------------------------------------------*/
#endif

#if defined(__GNUC__)  &&  defined(SPANDSP_USE_SSE2)
SPAN_DECLARE(float) vec_dot_prodf(const float x[], const float y[], int n)
{
    int i;
    float z;
    __m128 n1;
    __m128 n2;
    __m128 n3;
    __m128 n4;

    z = 0.0f;
    if ((i = n & ~3))
    {
        n4 = _mm_setzero_ps();  /* Zero the four partial sums */
        for (i -= 4;  i >= 0;  i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n2 = _mm_loadu_ps(y + i);
            n3 = _mm_mul_ps(n1, n2);
            n4 = _mm_add_ps(n4, n3);
        }
        /* Horizontally add the four partial sums: fold the high pair onto
           the low pair, then add the two remaining lanes */
        n4 = _mm_add_ps(_mm_movehl_ps(n4, n4), n4);
        n4 = _mm_add_ss(_mm_shuffle_ps(n4, n4, 1), n4);
        _mm_store_ss(&z, n4);
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        z += x[n - 3]*y[n - 3];
        /* Fall through */
    case 2:
        z += x[n - 2]*y[n - 2];
        /* Fall through */
    case 1:
        z += x[n - 1]*y[n - 1];
    }
    return z;
}
#else
SPAN_DECLARE(float) vec_dot_prodf(const float x[], const float y[], int n)
{
    int i;
    float z;

    z = 0.0f;
    for (i = 0;  i < n;  i++)
        z += x[i]*y[i];
    return z;
}
#endif
/*- End of function --------------------------------------------------------*/

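/* Usage sketch for the dot product, checking it against a plain loop. The
 * SIMD path accumulates four partial sums and folds them at the end, so in
 * general its result can differ from the sequential sum by rounding, but
 * for small, well conditioned inputs like these the two agree. The
 * VECTOR_FLOAT_EXAMPLES guard is hypothetical, as above:
 */
#if defined(VECTOR_FLOAT_EXAMPLES)
static void example_dot_prodf(void)
{
    float x[5] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f};
    float y[5] = {5.0f, 4.0f, 3.0f, 2.0f, 1.0f};
    float ref;
    int i;

    ref = 0.0f;
    for (i = 0;  i < 5;  i++)
        ref += x[i]*y[i];
    /* 1*5 + 2*4 + 3*3 + 4*2 + 5*1 = 35 */
    printf("%f %f\n", vec_dot_prodf(x, y, 5), ref);
}
#endif
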
SPAN_DECLARE(double) vec_dot_prod(const double x[], const double y[], int n)
{
    int i;
    double z;

    z = 0.0;
    for (i = 0;  i < n;  i++)
        z += x[i]*y[i];
    return z;
}
/*- End of function --------------------------------------------------------*/

#if defined(HAVE_LONG_DOUBLE)
SPAN_DECLARE(long double) vec_dot_prodl(const long double x[], const long double y[], int n)
{
    int i;
    long double z;

    z = 0.0L;
    for (i = 0;  i < n;  i++)
        z += x[i]*y[i];
    return z;
}
/*- End of function --------------------------------------------------------*/
#endif

SPAN_DECLARE(float) vec_circular_dot_prodf(const float x[], const float y[], int n, int pos)
{
    float z;

    /* Sum over x, viewed as a circular buffer which logically starts at
       element pos, against y taken in linear order */
    z = vec_dot_prodf(&x[pos], &y[0], n - pos);
    z += vec_dot_prodf(&x[0], &y[n - pos], pos);
    return z;
}
/*- End of function --------------------------------------------------------*/

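/* The circular form above treats x as a ring buffer whose logical start is
 * element pos, pairing x[pos], x[pos + 1], ..., wrapping to x[0], against y
 * in linear order. A sketch of the classic use, an FIR filter over a
 * circular history buffer (example_circular_firf is illustrative, not a
 * spandsp API; the VECTOR_FLOAT_EXAMPLES guard is hypothetical, as above):
 */
#if defined(VECTOR_FLOAT_EXAMPLES)
static float example_circular_firf(float history[], const float coeffs[], int taps, int *pos, float sample)
{
    /* Overwrite the oldest sample, advance the ring pointer, then form the
       dot product of the realigned history with the coefficients */
    history[*pos] = sample;
    *pos = (*pos + 1) % taps;
    return vec_circular_dot_prodf(history, coeffs, taps, *pos);
}
#endif
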
#define LMS_LEAK_RATE   0.9999f

#if defined(__GNUC__)  &&  defined(SPANDSP_USE_SSE2)
SPAN_DECLARE(void) vec_lmsf(const float x[], float y[], int n, float error)
{
    int i;
    __m128 n1;
    __m128 n2;
    __m128 n3;
    __m128 n4;

    if ((i = n & ~3))
    {
        n3 = _mm_set1_ps(error);
        /* Leak a little to tame uncontrolled wandering */
        n4 = _mm_set1_ps(LMS_LEAK_RATE);
        for (i -= 4;  i >= 0;  i -= 4)
        {
            n1 = _mm_loadu_ps(x + i);
            n2 = _mm_loadu_ps(y + i);
            n1 = _mm_mul_ps(n1, n3);
            n2 = _mm_mul_ps(n2, n4);
            n1 = _mm_add_ps(n1, n2);
            _mm_storeu_ps(y + i, n1);
        }
    }
    /* Now deal with the last 1 to 3 elements, which don't fill an SSE2 register */
    switch (n & 3)
    {
    case 3:
        y[n - 3] = y[n - 3]*LMS_LEAK_RATE + x[n - 3]*error;
        /* Fall through */
    case 2:
        y[n - 2] = y[n - 2]*LMS_LEAK_RATE + x[n - 2]*error;
        /* Fall through */
    case 1:
        y[n - 1] = y[n - 1]*LMS_LEAK_RATE + x[n - 1]*error;
    }
}
#else
SPAN_DECLARE(void) vec_lmsf(const float x[], float y[], int n, float error)
{
    int i;

    for (i = 0;  i < n;  i++)
    {
        /* Leak a little to tame uncontrolled wandering */
        y[i] = y[i]*LMS_LEAK_RATE + x[i]*error;
    }
}
#endif
/*- End of function --------------------------------------------------------*/

SPAN_DECLARE(void) vec_circular_lmsf(const float x[], float y[], int n, int pos, float error)
{
    vec_lmsf(&x[pos], &y[0], n - pos, error);
    vec_lmsf(&x[0], &y[n - pos], pos, error);
}
/*- End of function --------------------------------------------------------*/
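
/* One adaptation step of a leaky LMS filter: filter with the current
 * coefficients, form the error against the desired output, and let
 * vec_lmsf() nudge every coefficient along its input while leaking towards
 * zero. The step size mu is a caller supplied assumption, and
 * example_lms_stepf and the VECTOR_FLOAT_EXAMPLES guard are illustrative,
 * not part of spandsp:
 */
#if defined(VECTOR_FLOAT_EXAMPLES)
static float example_lms_stepf(float coeffs[], const float x[], int taps, float desired, float mu)
{
    float error;

    error = desired - vec_dot_prodf(x, coeffs, taps);
    /* Updates coeffs[i] = coeffs[i]*LMS_LEAK_RATE + x[i]*(error*mu) */
    vec_lmsf(x, coeffs, taps, error*mu);
    return error;
}
#endif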
/*- End of file ------------------------------------------------------------*/