1 /*
2 
3     x86 specific optimized assembler dsp routines
4     Copyright (C) 2001-2005 Jussi Laako
5 
6     This program is free software; you can redistribute it and/or modify
7     it under the terms of the GNU General Public License as published by
8     the Free Software Foundation; either version 2 of the License, or
9     (at your option) any later version.
10 
11     This program is distributed in the hope that it will be useful,
12     but WITHOUT ANY WARRANTY; without even the implied warranty of
13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14     GNU General Public License for more details.
15 
16     You should have received a copy of the GNU General Public License
17     along with this program; if not, write to the Free Software
18     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19 
20 */
21 
22 
23 #ifdef DSP_X86
24 
25 
26 #include <stdio.h>
27 #include <string.h>
28 #include <limits.h>
29 #include <math.h>
30 #include <float.h>
31 
32 #include "dsp/X86.h"
33 
34 
35 #ifndef DSP_X86_64
36 static char cpCPUid[13];
37 #endif
38 
39 
40 #ifdef __cplusplus
41 extern "C"
42 {
43 #endif
44 
45 
46 #ifndef DSP_X86_64
dsp_x86_cpuid()47 const char *dsp_x86_cpuid ()
48 {
49     unsigned int *ipCPUid = (unsigned int *) cpCPUid;
50 
51     X86_ASM (
52         "pushl %%ebx\n\t" \
53         "xorl %%eax, %%eax\n\t" \
54         "cpuid\n\t" \
55         "movl %%ebx, %0\n\t" \
56         "movl %%ecx, %2\n\t" \
57         "movl %%edx, %1\n\t" \
58         "popl %%ebx\n\t" \
59         : "=m" (ipCPUid[0]),
60           "=m" (ipCPUid[1]),
61           "=m" (ipCPUid[2])
62         :
63         : "eax", "ecx", "edx", "memory");
64     cpCPUid[12] = '\0';
65 
66     return cpCPUid;
67 }
68 
69 
dsp_x86_features()70 unsigned int dsp_x86_features ()
71 {
72     unsigned int uiFeatures = 0;
73 
74     X86_ASM (
75         "pushl %%ebx\n\t" \
76         "movl $1, %%eax\n\t" \
77         "cpuid\n\t" \
78         "movl %%edx, %0\n\t" \
79         "popl %%ebx\n\t" \
80         : "=m" (uiFeatures)
81         :
82         : "eax", "ecx", "edx", "memory");
83 
84     return uiFeatures;
85 }
86 
87 
dsp_x86_amd_features()88 unsigned int dsp_x86_amd_features ()
89 {
90     unsigned int uiFunction = 0x80000001;
91     unsigned int uiFeatures = 0;
92 
93     X86_ASM (
94         "pushl %%ebx\n\t" \
95         "movl %1, %%eax\n\t" \
96         "cpuid\n\t" \
97         "movl %%edx, %0\n\t" \
98         "popl %%ebx\n\t" \
99         : "=m" (uiFeatures)
100         : "m" (uiFunction)
101         : "eax", "ecx", "edx", "memory");
102 
103     return uiFeatures;
104 }
105 #endif
106 
107 
dsp_x86_have_e3dnow()108 extern int dsp_x86_have_e3dnow ()
109 {
110     #ifndef DSP_X86_64
111     unsigned int uiExtSup = 0;
112     unsigned int uiFeatures;
113 
114     X86_ASM (
115         "pushl %%ebx\n\t" \
116         "movl $0x80000000, %%eax\n\t" \
117         "cpuid\n\t" \
118         "cmpl $0x80000001, %%eax\n\t" \
119         "jl have3dnowxit\n\t" \
120         "movl $1, %0\n\t" \
121         "have3dnowxit:\n\t" \
122         "popl %%ebx\n\t"
123         : "=m" (uiExtSup)
124         :
125         : "eax", "ecx", "edx", "memory");
126     if (uiExtSup)
127     {
128         uiFeatures = dsp_x86_amd_features();
129         if ((uiFeatures & (1 << 31)) && (uiFeatures & (1 << 30)))
130             return 1;
131     }
132     return 0;
133     #else
134     return 1;
135     #endif
136 }
137 
138 
/*
 * Report whether bits 25 and 26 of the word returned by
 * dsp_x86_features() are both set (the SSE and SSE2 flags of CPUID
 * leaf 1).  Returns 1 if so, 0 otherwise; always 1 when DSP_X86_64 is
 * defined.
 */
extern int dsp_x86_have_sse2 ()
{
    #ifndef DSP_X86_64
    const unsigned int uiWanted = (1 << 25) | (1 << 26);

    return ((dsp_x86_features() & uiWanted) == uiWanted) ? 1 : 0;
    #else
    return 1;
    #endif
}
152 
153 
154 // --- inline code snippets
155 
156 
dsp_x86_prefetchntf_init(const float * fpSrc)157 inline void dsp_x86_prefetchntf_init (const float *fpSrc)
158 {
159     stpm64 m64pSrc = (stpm64) fpSrc;
160 
161     X86_ASM (
162         "prefetchnta %0\n\t" \
163         "prefetchnta %1\n\t" \
164         "prefetchnta %2\n\t" \
165         "prefetchnta %3\n\t"
166         :
167         : "m" (m64pSrc[0]),
168           "m" (m64pSrc[8]),
169           "m" (m64pSrc[16]),
170           "m" (m64pSrc[24]));
171 }
172 
173 
dsp_x86_prefetchnt_init(const double * dpSrc)174 inline void dsp_x86_prefetchnt_init (const double *dpSrc)
175 {
176     stpm64 m64pSrc = (stpm64) dpSrc;
177 
178     X86_ASM (
179         "prefetchnta %0\n\t" \
180         "prefetchnta %1\n\t" \
181         "prefetchnta %2\n\t" \
182         "prefetchnta %3\n\t"
183         :
184         : "m" (m64pSrc[0]),
185           "m" (m64pSrc[8]),
186           "m" (m64pSrc[16]),
187           "m" (m64pSrc[24]));
188 }
189 
190 
dsp_x86_prefetchtf_init(const float * fpSrc)191 inline void dsp_x86_prefetchtf_init (const float *fpSrc)
192 {
193     stpm64 m64pSrc = (stpm64) fpSrc;
194 
195     X86_ASM (
196         "prefetcht0 %0\n\t" \
197         "prefetcht0 %1\n\t" \
198         "prefetcht0 %2\n\t" \
199         "prefetcht0 %3\n\t"
200         :
201         : "m" (m64pSrc[0]),
202           "m" (m64pSrc[8]),
203           "m" (m64pSrc[16]),
204           "m" (m64pSrc[24]));
205 }
206 
207 
dsp_x86_prefetcht_init(const double * dpSrc)208 inline void dsp_x86_prefetcht_init (const double *dpSrc)
209 {
210     stpm64 m64pSrc = (stpm64) dpSrc;
211 
212     X86_ASM (
213         "prefetcht0 %0\n\t" \
214         "prefetcht0 %1\n\t" \
215         "prefetcht0 %2\n\t" \
216         "prefetcht0 %3\n\t"
217         :
218         : "m" (m64pSrc[0]),
219           "m" (m64pSrc[8]),
220           "m" (m64pSrc[16]),
221           "m" (m64pSrc[24]));
222 }
223 
224 
dsp_x86_prefetchntf_next(const float * fpSrc)225 inline void dsp_x86_prefetchntf_next (const float *fpSrc)
226 {
227     stpm64 m64pSrc = (stpm64) fpSrc;
228 
229     X86_ASM (
230         "prefetchnta %0\n\t"
231         :
232         : "m" (m64pSrc[32]));
233 }
234 
235 
dsp_x86_prefetchnt_next(const double * dpSrc)236 inline void dsp_x86_prefetchnt_next (const double *dpSrc)
237 {
238     stpm64 m64pSrc = (stpm64) dpSrc;
239 
240     X86_ASM (
241         "prefetchnta %0\n\t"
242         :
243         : "m" (m64pSrc[32]));
244 }
245 
246 
dsp_x86_prefetchtf_next(const float * fpSrc)247 inline void dsp_x86_prefetchtf_next (const float *fpSrc)
248 {
249     stpm64 m64pSrc = (stpm64) fpSrc;
250 
251     X86_ASM (
252         "prefetcht0 %0\n\t"
253         :
254         : "m" (m64pSrc[32]));
255 }
256 
257 
dsp_x86_prefetcht_next(const double * dpSrc)258 inline void dsp_x86_prefetcht_next (const double *dpSrc)
259 {
260     stpm64 m64pSrc = (stpm64) dpSrc;
261 
262     X86_ASM (
263         "prefetcht0 %0\n\t"
264         :
265         : "m" (m64pSrc[32]));
266 }
267 
268 
269 // ---
270 
271 
dsp_x86_3dnow_copyf(float * fpDest,const float * fpSrc,int iDataLength)272 void dsp_x86_3dnow_copyf (float *fpDest, const float *fpSrc, int iDataLength)
273 {
274     int iStartIdx;
275     int iDataCntr;
276     int iDataCount;
277     pv2sf m64pDest = (pv2sf) fpDest;
278     pv2sf m64pSrc = (pv2sf) fpSrc;
279 
280     iStartIdx = 0;
281     X86_ASM (
282         "prefetchnta %0\n\t" \
283         "prefetchnta %1\n\t" \
284         "prefetchnta %2\n\t" \
285         "prefetchnta %3\n\t"
286         :
287         : "m" (m64pSrc[0]),
288           "m" (m64pSrc[8]),
289           "m" (m64pSrc[16]),
290           "m" (m64pSrc[24]));
291     iDataCount = ((iDataLength & 0xfffffff0) >> 1);
292     for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr += 8)
293     {
294         X86_ASM (
295             "prefetchnta %16\n\t" \
296             "movq %8, %%mm0\n\t" \
297             "movq %9, %%mm1\n\t" \
298             "movq %10, %%mm2\n\t" \
299             "movq %11, %%mm3\n\t" \
300             "movq %12, %%mm4\n\t" \
301             "movq %13, %%mm5\n\t" \
302             "movq %14, %%mm6\n\t" \
303             "movq %15, %%mm7\n\t" \
304             "movntq %%mm0, %0\n\t" \
305             "movntq %%mm1, %1\n\t" \
306             "movntq %%mm2, %2\n\t" \
307             "movntq %%mm3, %3\n\t" \
308             "movntq %%mm4, %4\n\t" \
309             "movntq %%mm5, %5\n\t" \
310             "movntq %%mm6, %6\n\t" \
311             "movntq %%mm7, %7\n\t"
312             : "=m" (m64pDest[iDataCntr]),
313               "=m" (m64pDest[iDataCntr + 1]),
314               "=m" (m64pDest[iDataCntr + 2]),
315               "=m" (m64pDest[iDataCntr + 3]),
316               "=m" (m64pDest[iDataCntr + 4]),
317               "=m" (m64pDest[iDataCntr + 5]),
318               "=m" (m64pDest[iDataCntr + 6]),
319               "=m" (m64pDest[iDataCntr + 7])
320             : "m" (m64pSrc[iDataCntr]),
321               "m" (m64pSrc[iDataCntr + 1]),
322               "m" (m64pSrc[iDataCntr + 2]),
323               "m" (m64pSrc[iDataCntr + 3]),
324               "m" (m64pSrc[iDataCntr + 4]),
325               "m" (m64pSrc[iDataCntr + 5]),
326               "m" (m64pSrc[iDataCntr + 6]),
327               "m" (m64pSrc[iDataCntr + 7]),
328               "m" (m64pSrc[iDataCntr + 32])
329             : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
330     }
331     iStartIdx = iDataCount;
332     iDataCount = ((iDataLength & 0xfffffffe) >> 1);
333     for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr++)
334     {
335         X86_ASM (
336             "prefetchnta %2\n\t" \
337             "movq %1, %%mm0\n\t" \
338             "movntq %%mm0, %0\n\t"
339             : "=m" (m64pDest[iDataCntr])
340             : "m" (m64pSrc[iDataCntr]),
341               "m" (m64pSrc[iDataCntr + 32])
342             : "mm0", "memory");
343     }
344     if (iDataLength & 0x1)
345     {
346         X86_ASM (
347             "movd %1, %%mm0\n\t" \
348             "movd %%mm0, %0\n\t"
349             : "=m" (fpDest[iDataLength - 1])
350             : "m" (fpSrc[iDataLength - 1])
351             : "mm0", "memory");
352     }
353     X86_ASM (
354         "femms\n\t" \
355         "sfence\n\t");
356 }
357 
358 
dsp_x86_3dnow_copyd(double * dpDest,const double * dpSrc,int iDataLength)359 void dsp_x86_3dnow_copyd (double *dpDest, const double *dpSrc, int iDataLength)
360 {
361     int iStartIdx;
362     int iDataCntr;
363     int iDataCount;
364 
365     iStartIdx = 0;
366     X86_ASM (
367         "prefetchnta %0\n\t" \
368         "prefetchnta %1\n\t" \
369         "prefetchnta %2\n\t" \
370         "prefetchnta %3\n\t"
371         :
372         : "m" (dpSrc[0]),
373           "m" (dpSrc[8]),
374           "m" (dpSrc[16]),
375           "m" (dpSrc[24]));
376     iDataCount = (iDataLength & 0xfffffff8);
377     for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr += 8)
378     {
379         X86_ASM (
380             "prefetchnta %16\n\t" \
381             "movq %8, %%mm0\n\t" \
382             "movq %9, %%mm1\n\t" \
383             "movq %10, %%mm2\n\t" \
384             "movq %11, %%mm3\n\t" \
385             "movq %12, %%mm4\n\t" \
386             "movq %13, %%mm5\n\t" \
387             "movq %14, %%mm6\n\t" \
388             "movq %15, %%mm7\n\t" \
389             "movntq %%mm0, %0\n\t" \
390             "movntq %%mm1, %1\n\t" \
391             "movntq %%mm2, %2\n\t" \
392             "movntq %%mm3, %3\n\t" \
393             "movntq %%mm4, %4\n\t" \
394             "movntq %%mm5, %5\n\t" \
395             "movntq %%mm6, %6\n\t" \
396             "movntq %%mm7, %7\n\t"
397             : "=m" (dpDest[iDataCntr]),
398               "=m" (dpDest[iDataCntr + 1]),
399               "=m" (dpDest[iDataCntr + 2]),
400               "=m" (dpDest[iDataCntr + 3]),
401               "=m" (dpDest[iDataCntr + 4]),
402               "=m" (dpDest[iDataCntr + 5]),
403               "=m" (dpDest[iDataCntr + 6]),
404               "=m" (dpDest[iDataCntr + 7])
405             : "m" (dpSrc[iDataCntr]),
406               "m" (dpSrc[iDataCntr + 1]),
407               "m" (dpSrc[iDataCntr + 2]),
408               "m" (dpSrc[iDataCntr + 3]),
409               "m" (dpSrc[iDataCntr + 4]),
410               "m" (dpSrc[iDataCntr + 5]),
411               "m" (dpSrc[iDataCntr + 6]),
412               "m" (dpSrc[iDataCntr + 7]),
413               "m" (dpSrc[iDataCntr + 32])
414             : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
415     }
416     iStartIdx = iDataCount;
417     iDataCount = iDataLength;
418     for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr++)
419     {
420         X86_ASM (
421             "prefetchnta %2\n\t" \
422             "movq %1, %%mm0\n\t" \
423             "movntq %%mm0, %0\n\t"
424             : "=m" (dpDest[iDataCntr])
425             : "m" (dpSrc[iDataCntr]),
426               "m" (dpSrc[iDataCntr + 32])
427             : "mm0", "memory");
428     }
429     X86_ASM (
430         "femms\n\t" \
431         "sfence\n\t");
432 }
433 
434 
dsp_x86_3dnow_addf(float * fpVect,float fSrc,int iDataLength)435 void dsp_x86_3dnow_addf (float *fpVect, float fSrc, int iDataLength)
436 {
437     int iDataCntr;
438     int iDataCount;
439     pv2sf m64pVect = (pv2sf) fpVect;
440     stm64 m64Src;
441 
442     m64Src.f[0] = m64Src.f[1] = fSrc;
443     iDataCount = (iDataLength >> 1);
444     X86_ASM (
445         "movq %0, %%mm1\n\t"
446         :
447         : "m" (m64Src)
448         : "mm1", "memory");
449     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
450     {
451         X86_ASM (
452             "movq %1, %%mm0\n\t" \
453             "pfadd %%mm1, %%mm0\n\t" \
454             "movntq %%mm0, %0\n\t"
455             : "=m" (m64pVect[iDataCntr])
456             : "m0" (m64pVect[iDataCntr])
457             : "mm0", "mm1", "memory");
458     }
459     if (iDataLength & 0x1)
460     {
461         X86_ASM (
462             "movd %1, %%mm0\n\t" \
463             "pfadd %%mm1, %%mm0\n\t" \
464             "movd %%mm0, %0\n\t"
465             : "=m" (fpVect[iDataLength - 1])
466             : "m0" (fpVect[iDataLength - 1])
467             : "mm0", "mm1", "memory");
468     }
469     X86_ASM (
470         "femms\n\t" \
471         "sfence\n\t");
472 }
473 
474 
dsp_x86_sse_addf(float * fpVect,float fSrc,int iDataLength)475 void dsp_x86_sse_addf (float *fpVect, float fSrc, int iDataLength)
476 {
477     int iDataCntr;
478 
479     X86_ASM (
480         "movss %0, %%xmm1\n\t"
481         :
482         : "m" (fSrc)
483         : "xmm1", "memory");
484     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
485     {
486         X86_ASM (
487             "movss %1, %%xmm0\n\t" \
488             "addss %%xmm1, %%xmm0\n\t" \
489             "movss %%xmm0, %0\n\t"
490             : "=m" (fpVect[iDataCntr])
491             : "m0" (fpVect[iDataCntr])
492             : "xmm0", "xmm1", "memory");
493     }
494 }
495 
496 
dsp_x86_sse_add(double * dpVect,double dSrc,int iDataLength)497 void dsp_x86_sse_add (double *dpVect, double dSrc, int iDataLength)
498 {
499     int iDataCntr;
500 
501     X86_ASM (
502         "movsd %0, %%xmm1\n\t"
503         :
504         : "m" (dSrc)
505         : "xmm1", "memory");
506     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
507     {
508         X86_ASM (
509             "movsd %1, %%xmm0\n\t" \
510             "addsd %%xmm1, %%xmm0\n\t" \
511             "movsd %%xmm0, %0\n\t"
512             : "=m" (dpVect[iDataCntr])
513             : "m0" (dpVect[iDataCntr])
514             : "xmm0", "xmm1", "memory");
515     }
516 }
517 
518 
dsp_x86_3dnow_mulf(float * fpVect,float fSrc,int iDataLength)519 void dsp_x86_3dnow_mulf (float *fpVect, float fSrc, int iDataLength)
520 {
521     int iDataCntr;
522     int iDataCount;
523     pv2sf m64pVect = (pv2sf) fpVect;
524     stm64 m64Src;
525 
526     m64Src.f[0] = m64Src.f[1] = fSrc;
527     iDataCount = (iDataLength >> 1);
528     X86_ASM (
529         "movq %0, %%mm1\n\t"
530         :
531         : "m" (m64Src)
532         : "mm1", "memory");
533     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
534     {
535         X86_ASM (
536             "movq %1, %%mm0\n\t" \
537             "pfmul %%mm1, %%mm0\n\t" \
538             "movntq %%mm0, %0\n\t"
539             : "=m" (m64pVect[iDataCntr])
540             : "m0" (m64pVect[iDataCntr])
541             : "mm0", "mm1", "memory");
542     }
543     if (iDataLength & 0x1)
544     {
545         X86_ASM (
546             "movd %1, %%mm0\n\t" \
547             "pfmul %%mm1, %%mm0\n\t" \
548             "movd %%mm0, %0\n\t"
549             : "=m" (fpVect[iDataLength - 1])
550             : "m0" (fpVect[iDataLength - 1])
551             : "mm0", "mm1", "memory");
552     }
553     X86_ASM (
554         "femms\n\t" \
555         "sfence\n\t");
556 }
557 
558 
dsp_x86_sse_mulf(float * fpVect,float fSrc,int iDataLength)559 void dsp_x86_sse_mulf (float *fpVect, float fSrc, int iDataLength)
560 {
561     int iDataCntr;
562 
563     X86_ASM (
564         "movss %0, %%xmm1\n\t"
565         :
566         : "m" (fSrc)
567         : "xmm1", "memory");
568     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
569     {
570         X86_ASM (
571             "movss %1, %%xmm0\n\t" \
572             "mulss %%xmm1, %%xmm0\n\t" \
573             "movss %%xmm0, %0\n\t"
574             : "=m" (fpVect[iDataCntr])
575             : "m0" (fpVect[iDataCntr])
576             : "xmm0", "xmm1", "memory");
577     }
578 }
579 
580 
dsp_x86_sse_mul(double * dpVect,double dSrc,int iDataLength)581 void dsp_x86_sse_mul (double *dpVect, double dSrc, int iDataLength)
582 {
583     int iDataCntr;
584 
585     X86_ASM (
586         "movsd %0, %%xmm1\n\t"
587         :
588         : "m" (dSrc)
589         : "xmm1", "memory");
590     for (iDataCntr = 0; iDataCntr <iDataLength; iDataCntr++)
591     {
592         X86_ASM (
593             "movsd %1, %%xmm0\n\t" \
594             "mulsd %%xmm1, %%xmm0\n\t" \
595             "movsd %%xmm0, %0\n\t"
596             : "=m" (dpVect[iDataCntr])
597             : "m0" (dpVect[iDataCntr])
598             : "xmm0", "xmm1", "memory");
599     }
600 }
601 
602 
dsp_x86_3dnow_mulf_nip(float * fpDest,const float * fpSrc1,float fSrc2,int iDataLength)603 void dsp_x86_3dnow_mulf_nip (float *fpDest, const float *fpSrc1, float fSrc2,
604     int iDataLength)
605 {
606     int iDataCntr;
607     int iDataCount;
608     pv2sf m64pDest = (pv2sf) fpDest;
609     pv2sf m64pSrc1 = (pv2sf) fpSrc1;
610     stm64 m64Src2;
611 
612     m64Src2.f[0] = m64Src2.f[1] = fSrc2;
613     iDataCount = (iDataLength >> 1);
614     X86_ASM (
615         "movq %0, %%mm1\n\t"
616         :
617         : "m" (m64Src2)
618         : "mm1", "memory");
619     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
620     {
621         X86_ASM (
622             "movq %1, %%mm0\n\t" \
623             "pfmul %%mm1, %%mm0\n\t" \
624             "movntq %%mm0, %0\n\t"
625             : "=m" (m64pDest[iDataCntr])
626             : "m" (m64pSrc1[iDataCntr])
627             : "mm0", "mm1", "memory");
628     }
629     if (iDataLength & 0x1)
630     {
631         X86_ASM (
632             "movd %1, %%mm0\n\t" \
633             "pfmul %%mm1, %%mm0\n\t" \
634             "movd %%mm0, %0\n\t"
635             : "=m" (fpDest[iDataLength - 1])
636             : "m" (fpSrc1[iDataLength - 1])
637             : "mm0", "mm1", "memory");
638     }
639     X86_ASM (
640         "femms\n\t" \
641         "sfence\n\t");
642 }
643 
644 
dsp_x86_sse_mulf_nip(float * fpDest,const float * fpSrc1,float fSrc2,int iDataLength)645 void dsp_x86_sse_mulf_nip (float *fpDest, const float *fpSrc1, float fSrc2,
646     int iDataLength)
647 {
648     int iDataCntr;
649 
650     X86_ASM (
651         "movss %0, %%xmm1\n\t"
652         :
653         : "m" (fSrc2)
654         : "xmm1", "memory");
655     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
656     {
657         X86_ASM (
658             "movss %1, %%xmm0\n\t" \
659             "mulss %%xmm1, %%xmm0\n\t" \
660             "movss %%xmm0, %0\n\t"
661             : "=m" (fpDest[iDataCntr])
662             : "m" (fpSrc1[iDataCntr])
663             : "xmm0", "xmm1", "memory");
664     }
665 }
666 
667 
dsp_x86_sse_mul_nip(double * dpDest,const double * dpSrc1,double dSrc2,int iDataLength)668 void dsp_x86_sse_mul_nip (double *dpDest, const double *dpSrc1, double dSrc2,
669     int iDataLength)
670 {
671     int iDataCntr;
672 
673     X86_ASM (
674         "movsd %0, %%xmm1\n\t"
675         :
676         : "m" (dSrc2)
677         : "xmm1", "memory");
678     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
679     {
680         X86_ASM (
681             "movsd %1, %%xmm0\n\t" \
682             "mulsd %%xmm1, %%xmm0\n\t" \
683             "movsd %%xmm0, %0\n\t"
684             : "=m" (dpDest[iDataCntr])
685             : "m" (dpSrc1[iDataCntr])
686             : "xmm0", "xmm1", "memory");
687     }
688 }
689 
690 
dsp_x86_3dnow_add2f(float * fpDest,const float * fpSrc,int iDataLength)691 void dsp_x86_3dnow_add2f (float *fpDest, const float *fpSrc, int iDataLength)
692 {
693     int iDataCntr;
694     int iDataCount;
695     pv2sf m64pDest = (pv2sf) fpDest;
696     pv2sf m64pSrc = (pv2sf) fpSrc;
697 
698     iDataCount = (iDataLength >> 1);
699     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
700     {
701         X86_ASM (
702             "movq %1, %%mm0\n\t" \
703             "movq %2, %%mm1\n\t" \
704             "pfadd %%mm1, %%mm0\n\t" \
705             "movntq %%mm0, %0\n\t"
706             : "=m" (m64pDest[iDataCntr])
707             : "m0" (m64pDest[iDataCntr]),
708               "m" (m64pSrc[iDataCntr])
709             : "mm0", "mm1", "memory");
710     }
711     if (iDataLength & 0x1)
712     {
713         X86_ASM (
714             "movd %1, %%mm0\n\t" \
715             "movd %2, %%mm1\n\t" \
716             "pfadd %%mm1, %%mm0\n\t" \
717             "movd %%mm0, %0\n\t"
718             : "=m" (fpDest[iDataLength - 1])
719             : "m0" (fpDest[iDataLength - 1]),
720               "m" (fpSrc[iDataLength - 1])
721             : "mm0", "mm1", "memory");
722     }
723     X86_ASM (
724         "femms\n\t" \
725         "sfence\n\t");
726 }
727 
728 
dsp_x86_sse_add2f(float * fpDest,const float * fpSrc,int iDataLength)729 void dsp_x86_sse_add2f (float *fpDest, const float *fpSrc, int iDataLength)
730 {
731     int iDataCntr;
732 
733     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
734     {
735         X86_ASM (
736             "movss %1, %%xmm0\n\t" \
737             "addss %2, %%xmm0\n\t" \
738             "movss %%xmm0, %0\n\t"
739             : "=m" (fpDest[iDataCntr])
740             : "m0" (fpDest[iDataCntr]),
741               "m" (fpSrc[iDataCntr])
742             : "xmm0", "memory");
743     }
744 }
745 
746 
dsp_x86_sse_add2(double * dpDest,const double * dpSrc,int iDataLength)747 void dsp_x86_sse_add2 (double *dpDest, const double *dpSrc, int iDataLength)
748 {
749     int iDataCntr;
750 
751     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
752     {
753         X86_ASM (
754             "movsd %1, %%xmm0\n\t" \
755             "addsd %2, %%xmm0\n\t" \
756             "movsd %%xmm0, %0\n\t"
757             : "=m" (dpDest[iDataCntr])
758             : "m0" (dpDest[iDataCntr]),
759               "m" (dpSrc[iDataCntr])
760             : "xmm0", "memory");
761     }
762 }
763 
764 
dsp_x86_3dnow_mul2f(float * fpDest,const float * fpSrc,int iDataLength)765 void dsp_x86_3dnow_mul2f (float *fpDest, const float *fpSrc, int iDataLength)
766 {
767     int iDataCntr;
768     int iDataCount;
769     pv2sf m64pDest = (pv2sf) fpDest;
770     pv2sf m64pSrc = (pv2sf) fpSrc;
771 
772     iDataCount = (iDataLength >> 1);
773     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
774     {
775         X86_ASM (
776             "movq %1, %%mm0\n\t" \
777             "movq %2, %%mm1\n\t" \
778             "pfmul %%mm1, %%mm0\n\t" \
779             "movntq %%mm0, %0\n\t"
780             : "=m" (m64pDest[iDataCntr])
781             : "m0" (m64pDest[iDataCntr]),
782               "m" (m64pSrc[iDataCntr])
783             : "mm0", "mm1", "memory");
784     }
785     if (iDataLength & 0x1)
786     {
787         X86_ASM (
788             "movd %1, %%mm0\n\t" \
789             "movd %2, %%mm1\n\t" \
790             "pfmul %%mm1, %%mm0\n\t" \
791             "movd %%mm0, %0\n\t"
792             : "=m" (fpDest[iDataLength - 1])
793             : "m0" (fpDest[iDataLength - 1]),
794               "m" (fpSrc[iDataLength - 1])
795             : "mm0", "mm1", "memory");
796     }
797     X86_ASM (
798         "femms\n\t" \
799         "sfence\n\t");
800 }
801 
802 
dsp_x86_sse_mul2f(float * fpDest,const float * fpSrc,int iDataLength)803 void dsp_x86_sse_mul2f (float *fpDest, const float *fpSrc, int iDataLength)
804 {
805     int iDataCntr;
806 
807     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
808     {
809         X86_ASM (
810             "movss %1, %%xmm0\n\t" \
811             "mulss %2, %%xmm0\n\t" \
812             "movss %%xmm0, %0\n\t"
813             : "=m" (fpDest[iDataCntr])
814             : "m0" (fpDest[iDataCntr]),
815               "m" (fpSrc[iDataCntr])
816             : "xmm0", "memory");
817     }
818 }
819 
820 
dsp_x86_sse_mul2(double * dpDest,const double * dpSrc,int iDataLength)821 void dsp_x86_sse_mul2 (double *dpDest, const double *dpSrc, int iDataLength)
822 {
823     int iDataCntr;
824 
825     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
826     {
827         X86_ASM (
828             "movsd %1, %%xmm0\n\t" \
829             "mulsd %2, %%xmm0\n\t" \
830             "movsd %%xmm0, %0\n\t"
831             : "=m" (dpDest[iDataCntr])
832             : "m0" (dpDest[iDataCntr]),
833               "m" (dpSrc[iDataCntr])
834             : "xmm0", "memory");
835     }
836 }
837 
838 
dsp_x86_3dnow_add3f(float * fpDest,const float * fpSrc1,const float * fpSrc2,int iDataLength)839 void dsp_x86_3dnow_add3f (float *fpDest, const float *fpSrc1,
840     const float *fpSrc2, int iDataLength)
841 {
842     int iDataCntr;
843     int iDataCount;
844     pv2sf m64pDest = (pv2sf) fpDest;
845     pv2sf m64pSrc1 = (pv2sf) fpSrc1;
846     pv2sf m64pSrc2 = (pv2sf) fpSrc2;
847 
848     iDataCount = (iDataLength >> 1);
849     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
850     {
851         X86_ASM (
852             "movq %1, %%mm0\n\t" \
853             "movq %2, %%mm1\n\t" \
854             "pfadd %%mm1, %%mm0\n\t" \
855             "movntq %%mm0, %0\n\t"
856             : "=m" (m64pDest[iDataCntr])
857             : "m" (m64pSrc1[iDataCntr]),
858               "m" (m64pSrc2[iDataCntr])
859             : "mm0", "mm1", "memory");
860     }
861     if (iDataLength & 0x1)
862     {
863         X86_ASM (
864             "movd %1, %%mm0\n\t" \
865             "movd %2, %%mm1\n\t" \
866             "pfadd %%mm1, %%mm0\n\t" \
867             "movd %%mm0, %0\n\t"
868             : "=m" (fpDest[iDataLength - 1])
869             : "m" (fpSrc1[iDataLength - 1]),
870               "m" (fpSrc2[iDataLength - 1])
871             : "mm0", "mm1", "memory");
872     }
873     X86_ASM (
874         "femms\n\t" \
875         "sfence\n\t");
876 }
877 
878 
dsp_x86_sse_add3f(float * fpDest,const float * fpSrc1,const float * fpSrc2,int iDataLength)879 void dsp_x86_sse_add3f (float *fpDest, const float *fpSrc1,
880     const float *fpSrc2, int iDataLength)
881 {
882     int iDataCntr;
883 
884     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
885     {
886         X86_ASM (
887             "movss %1, %%xmm0\n\t" \
888             "addss %2, %%xmm0\n\t" \
889             "movss %%xmm0, %0\n\t"
890             : "=m" (fpDest[iDataCntr])
891             : "m" (fpSrc1[iDataCntr]),
892               "m" (fpSrc2[iDataCntr])
893             : "xmm0", "memory");
894     }
895 }
896 
897 
dsp_x86_sse_add3(double * dpDest,const double * dpSrc1,const double * dpSrc2,int iDataLength)898 void dsp_x86_sse_add3 (double *dpDest, const double *dpSrc1,
899     const double *dpSrc2, int iDataLength)
900 {
901     int iDataCntr;
902 
903     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
904     {
905         X86_ASM (
906             "movsd %1, %%xmm0\n\t" \
907             "addsd %2, %%xmm0\n\t" \
908             "movsd %%xmm0, %0\n\t"
909             : "=m" (dpDest[iDataCntr])
910             : "m" (dpSrc1[iDataCntr]),
911               "m" (dpSrc2[iDataCntr])
912             : "xmm0", "memory");
913     }
914 }
915 
916 
dsp_x86_3dnow_mul3f(float * fpDest,const float * fpSrc1,const float * fpSrc2,int iDataLength)917 void dsp_x86_3dnow_mul3f (float *fpDest, const float *fpSrc1,
918     const float *fpSrc2, int iDataLength)
919 {
920     int iDataCntr;
921     int iDataCount;
922     pv2sf m64pDest = (pv2sf) fpDest;
923     pv2sf m64pSrc1 = (pv2sf) fpSrc1;
924     pv2sf m64pSrc2 = (pv2sf) fpSrc2;
925 
926     iDataCount = (iDataLength >> 1);
927     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
928     {
929         X86_ASM (
930             "movq %1, %%mm0\n\t" \
931             "movq %2, %%mm1\n\t" \
932             "pfmul %%mm1, %%mm0\n\t" \
933             "movntq %%mm0, %0\n\t"
934             : "=m" (m64pDest[iDataCntr])
935             : "m" (m64pSrc1[iDataCntr]),
936               "m" (m64pSrc2[iDataCntr])
937             : "mm0", "mm1", "memory");
938     }
939     if (iDataLength & 0x1)
940     {
941         X86_ASM (
942             "movd %1, %%mm0\n\t" \
943             "movd %2, %%mm1\n\t" \
944             "pfmul %%mm1, %%mm0\n\t" \
945             "movd %%mm0, %0\n\t"
946             : "=m" (fpDest[iDataLength - 1])
947             : "m" (fpSrc1[iDataLength - 1]),
948               "m" (fpSrc2[iDataLength - 1])
949             : "mm0", "mm1", "memory");
950     }
951     X86_ASM (
952         "femms\n\t" \
953         "sfence\n\t");
954 }
955 
956 
dsp_x86_sse_mul3f(float * fpDest,const float * fpSrc1,const float * fpSrc2,int iDataLength)957 void dsp_x86_sse_mul3f (float *fpDest, const float *fpSrc1,
958     const float *fpSrc2, int iDataLength)
959 {
960     int iDataCntr;
961 
962     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
963     {
964         X86_ASM (
965             "movss %1, %%xmm0\n\t" \
966             "mulss %2, %%xmm0\n\t" \
967             "movss %%xmm0, %0\n\t"
968             : "=m" (fpDest[iDataCntr])
969             : "m" (fpSrc1[iDataCntr]),
970               "m" (fpSrc2[iDataCntr])
971             : "xmm0", "memory");
972     }
973 }
974 
975 
dsp_x86_sse_mul3(double * dpDest,const double * dpSrc1,const double * dpSrc2,int iDataLength)976 void dsp_x86_sse_mul3 (double *dpDest, const double *dpSrc1,
977     const double *dpSrc2, int iDataLength)
978 {
979     int iDataCntr;
980 
981     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
982     {
983         X86_ASM (
984             "movsd %1, %%xmm0\n\t" \
985             "mulsd %2, %%xmm0\n\t" \
986             "movsd %%xmm0, %0\n\t"
987             : "=m" (dpDest[iDataCntr])
988             : "m" (dpSrc1[iDataCntr]),
989               "m" (dpSrc2[iDataCntr])
990             : "xmm0", "memory");
991     }
992 }
993 
994 
/*
    In-place complex multiplication of a vector by one complex constant,
    using 3DNow! instructions.

    fpDest       interleaved (re, im) float pairs; each pair is replaced by
                 its complex product with the multiplier
    fpSrc        the multiplier pair; 64 bits starting at fpSrc[0] are read
    iDataLength  number of complex values (pairs) in fpDest

    NOTE(review): m64pDest is indexed once per pair, so pv2sf is presumably
    a pointer to an 8-byte element - confirm against dsp/X86.h.
*/
void dsp_x86_3dnow_cmulf (float *fpDest, const float *fpSrc, int iDataLength)
{
    int iDataCntr;
    pv2sf m64pDest = (pv2sf) fpDest;

    /* Load the multiplier pair into mm3 once.  The loop's asm statements
       read mm3, so they depend on it surviving between separate asm
       statements. */
    X86_ASM (
        "movq %0, %%mm3\n\t"
        :
        : "m" (fpSrc[0])
        : "mm3", "memory");
    for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
    {
        /* mm0 = dest pair (a, b), mm1 = multiplier (c, d), mm2 = swapped
           multiplier (d, c).  After both pfmul, pfpnacc leaves
           (a*c - b*d, a*d + b*c) in mm1, which movntq stores
           non-temporally. */
        X86_ASM (
            "movq %1, %%mm0\n\t" \
            "movq %%mm3, %%mm1\n\t" \
            "pswapd %%mm1, %%mm2\n\t" \
            "pfmul %%mm0, %%mm1\n\t" \
            "pfmul %%mm0, %%mm2\n\t" \
            "pfpnacc %%mm2, %%mm1\n\t" \
            "movntq %%mm1, %0\n\t"
            : "=m" (m64pDest[iDataCntr])
            : "m0" (m64pDest[iDataCntr])
            : "mm0", "mm1", "mm2", "mm3", "memory");
    }
    /* Leave MMX/3DNow! state (femms) and fence the movntq stores (sfence). */
    X86_ASM (
        "femms\n\t" \
        "sfence\n\t");
}
1023 
1024 
/*
    In-place complex multiplication of a float vector by one complex
    constant, using scalar SSE instructions.

    fpDest       interleaved (re, im) pairs, overwritten with the products
    fpSrc        multiplier; fpSrc[0] is the real part, fpSrc[1] the
                 imaginary part
    iDataLength  number of complex values; 2 * iDataLength floats are touched
*/
void dsp_x86_sse_cmulf (float *fpDest, const float *fpSrc, int iDataLength)
{
    int iDataCntr;
    int iDataCount;

    /* xmm2 = multiplier real part, xmm3 = multiplier imaginary part.  The
       loop's asm statements read both, so they depend on these registers
       surviving between separate asm statements. */
    X86_ASM (
        "movss %0, %%xmm2\n\t" \
        "movss %1, %%xmm3\n\t"
        :
        : "m" (fpSrc[0]),
          "m" (fpSrc[1])
        : "xmm2", "xmm3", "memory");
    /* Loop bound counted in floats: two per complex value. */
    iDataCount = (iDataLength << 1);
    for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
    {
        /* With a = dest re, b = dest im, c = xmm2, d = xmm3:
           xmm0 = a*c - b*d (new re), xmm1 = a*d + b*c (new im). */
        X86_ASM (
            "movss %2, %%xmm0\n\t" \
            "movss %%xmm0, %%xmm1\n\t" \
            "movss %3, %%xmm4\n\t" \
            \
            "mulss %%xmm2, %%xmm0\n\t" \
            "movss %%xmm4, %%xmm5\n\t" \
            "mulss %%xmm3, %%xmm5\n\t" \
            "subss %%xmm5, %%xmm0\n\t" \
            \
            "mulss %%xmm3, %%xmm1\n\t" \
            "movss %%xmm4, %%xmm5\n\t" \
            "mulss %%xmm2, %%xmm5\n\t" \
            "addss %%xmm5, %%xmm1\n\t" \
            \
            "movss %%xmm0, %0\n\t" \
            "movss %%xmm1, %1\n\t"
            : "=m" (fpDest[iDataCntr]),
              "=m" (fpDest[iDataCntr + 1])
            : "m0" (fpDest[iDataCntr]),
              "m1" (fpDest[iDataCntr + 1])
            : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
    }
}
1064 
1065 
/*
    In-place complex multiplication of a double vector by one complex
    constant, using scalar movsd/mulsd/addsd/subsd instructions.

    dpDest       interleaved (re, im) pairs, overwritten with the products
    dpSrc        multiplier; dpSrc[0] is the real part, dpSrc[1] the
                 imaginary part
    iDataLength  number of complex values; 2 * iDataLength doubles are touched
*/
void dsp_x86_sse_cmul (double *dpDest, const double *dpSrc, int iDataLength)
{
    int iDataCntr;
    int iDataCount;

    /* xmm2 = multiplier real part, xmm3 = multiplier imaginary part.  The
       loop's asm statements read both, so they depend on these registers
       surviving between separate asm statements. */
    X86_ASM (
        "movsd %0, %%xmm2\n\t" \
        "movsd %1, %%xmm3\n\t"
        :
        : "m" (dpSrc[0]),
          "m" (dpSrc[1])
        : "xmm2", "xmm3", "memory");
    /* Loop bound counted in doubles: two per complex value. */
    iDataCount = (iDataLength << 1);
    for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
    {
        /* With a = dest re, b = dest im, c = xmm2, d = xmm3:
           xmm0 = a*c - b*d (new re), xmm1 = a*d + b*c (new im). */
        X86_ASM (
            "movsd %2, %%xmm0\n\t" \
            "movsd %%xmm0, %%xmm1\n\t" \
            "movsd %3, %%xmm4\n\t" \
            \
            "mulsd %%xmm2, %%xmm0\n\t" \
            "movsd %%xmm4, %%xmm5\n\t" \
            "mulsd %%xmm3, %%xmm5\n\t" \
            "subsd %%xmm5, %%xmm0\n\t" \
            \
            "mulsd %%xmm3, %%xmm1\n\t" \
            "movsd %%xmm4, %%xmm5\n\t" \
            "mulsd %%xmm2, %%xmm5\n\t" \
            "addsd %%xmm5, %%xmm1\n\t" \
            \
            "movsd %%xmm0, %0\n\t" \
            "movsd %%xmm1, %1\n\t"
            : "=m" (dpDest[iDataCntr]),
              "=m" (dpDest[iDataCntr + 1])
            : "m0" (dpDest[iDataCntr]),
              "m1" (dpDest[iDataCntr + 1])
            : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
    }
}
1105 
1106 
dsp_x86_3dnow_cmul2f(float * fpDest,const float * fpSrc,int iDataLength)1107 void dsp_x86_3dnow_cmul2f (float *fpDest, const float *fpSrc, int iDataLength)
1108 {
1109     int iDataCntr;
1110     pv2sf m64pDest = (pv2sf) fpDest;
1111     pv2sf m64pSrc = (pv2sf) fpSrc;
1112 
1113     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
1114     {
1115         X86_ASM (
1116             "movq %1, %%mm0\n\t" \
1117             "movq %2, %%mm1\n\t" \
1118             "pswapd %%mm1, %%mm2\n\t" \
1119             "pfmul %%mm0, %%mm1\n\t" \
1120             "pfmul %%mm0, %%mm2\n\t" \
1121             "pfpnacc %%mm2, %%mm1\n\t" \
1122             "movntq %%mm1, %0\n\t"
1123             : "=m" (m64pDest[iDataCntr])
1124             : "m0" (m64pDest[iDataCntr]),
1125               "m" (m64pSrc[iDataCntr])
1126             : "mm0", "mm1", "mm2", "memory");
1127     }
1128     X86_ASM (
1129         "femms\n\t" \
1130         "sfence\n\t");
1131 }
1132 
1133 
dsp_x86_sse_cmul2f(float * fpDest,const float * fpSrc,int iDataLength)1134 void dsp_x86_sse_cmul2f (float *fpDest, const float *fpSrc, int iDataLength)
1135 {
1136     int iDataCntr;
1137     int iDataCount;
1138 
1139     iDataCount = (iDataLength << 1);
1140     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
1141     {
1142         X86_ASM (
1143             "movss %4, %%xmm2\n\t" \
1144             "movss %5, %%xmm3\n\t" \
1145             \
1146             "movss %2, %%xmm0\n\t" \
1147             "movss %%xmm0, %%xmm1\n\t" \
1148             "movss %3, %%xmm4\n\t" \
1149             \
1150             "mulss %%xmm2, %%xmm0\n\t" \
1151             "movss %%xmm4, %%xmm5\n\t" \
1152             "mulss %%xmm3, %%xmm5\n\t" \
1153             "subss %%xmm5, %%xmm0\n\t" \
1154             \
1155             "mulss %%xmm3, %%xmm1\n\t" \
1156             "movss %%xmm4, %%xmm5\n\t" \
1157             "mulss %%xmm2, %%xmm5\n\t" \
1158             "addss %%xmm5, %%xmm1\n\t" \
1159             \
1160             "movss %%xmm0, %0\n\t" \
1161             "movss %%xmm1, %1\n\t"
1162             : "=m" (fpDest[iDataCntr]),
1163               "=m" (fpDest[iDataCntr + 1])
1164             : "m0" (fpDest[iDataCntr]),
1165               "m1" (fpDest[iDataCntr + 1]),
1166               "m" (fpSrc[iDataCntr]),
1167               "m" (fpSrc[iDataCntr + 1])
1168             : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
1169     }
1170 }
1171 
1172 
dsp_x86_sse_cmul2(double * dpDest,const double * dpSrc,int iDataLength)1173 void dsp_x86_sse_cmul2 (double *dpDest, const double *dpSrc, int iDataLength)
1174 {
1175     int iDataCntr;
1176     int iDataCount;
1177 
1178     iDataCount = (iDataLength << 1);
1179     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
1180     {
1181         X86_ASM (
1182             "movsd %4, %%xmm2\n\t" \
1183             "movsd %5, %%xmm3\n\t" \
1184             \
1185             "movsd %2, %%xmm0\n\t" \
1186             "movsd %%xmm0, %%xmm1\n\t" \
1187             "movsd %3, %%xmm4\n\t" \
1188             \
1189             "mulsd %%xmm2, %%xmm0\n\t" \
1190             "movsd %%xmm4, %%xmm5\n\t" \
1191             "mulsd %%xmm3, %%xmm5\n\t" \
1192             "subsd %%xmm5, %%xmm0\n\t" \
1193             \
1194             "mulsd %%xmm3, %%xmm1\n\t" \
1195             "movsd %%xmm4, %%xmm5\n\t" \
1196             "mulsd %%xmm2, %%xmm5\n\t" \
1197             "addsd %%xmm5, %%xmm1\n\t" \
1198             \
1199             "movsd %%xmm0, %0\n\t" \
1200             "movsd %%xmm1, %1\n\t"
1201             : "=m" (dpDest[iDataCntr]),
1202               "=m" (dpDest[iDataCntr + 1])
1203             : "m0" (dpDest[iDataCntr]),
1204               "m1" (dpDest[iDataCntr + 1]),
1205               "m" (dpSrc[iDataCntr]),
1206               "m" (dpSrc[iDataCntr + 1])
1207             : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
1208     }
1209 }
1210 
1211 
dsp_x86_3dnow_cmul3f(float * fpDest,const float * fpSrc1,const float * fpSrc2,int iDataLength)1212 void dsp_x86_3dnow_cmul3f (float *fpDest, const float *fpSrc1,
1213     const float *fpSrc2, int iDataLength)
1214 {
1215     int iDataCntr;
1216     pv2sf m64pDest = (pv2sf) fpDest;
1217     pv2sf m64pSrc1 = (pv2sf) fpSrc1;
1218     pv2sf m64pSrc2 = (pv2sf) fpSrc2;
1219 
1220     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
1221     {
1222         X86_ASM (
1223             "movq %1, %%mm0\n\t" \
1224             "movq %2, %%mm1\n\t" \
1225             "pswapd %%mm1, %%mm2\n\t" \
1226             "pfmul %%mm0, %%mm1\n\t" \
1227             "pfmul %%mm0, %%mm2\n\t" \
1228             "pfpnacc %%mm2, %%mm1\n\t" \
1229             "movntq %%mm1, %0\n\t"
1230             : "=m" (m64pDest[iDataCntr])
1231             : "m" (m64pSrc1[iDataCntr]),
1232               "m" (m64pSrc2[iDataCntr])
1233             : "mm0", "mm1", "mm2", "memory");
1234     }
1235     X86_ASM (
1236         "femms\n\t" \
1237         "sfence\n\t");
1238 }
1239 
1240 
dsp_x86_sse_cmul3f(float * fpDest,const float * fpSrc1,const float * fpSrc2,int iDataLength)1241 void dsp_x86_sse_cmul3f (float *fpDest, const float *fpSrc1,
1242     const float *fpSrc2, int iDataLength)
1243 {
1244     int iDataCntr;
1245     int iDataCount;
1246 
1247     iDataCount = (iDataLength << 1);
1248     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
1249     {
1250         X86_ASM (
1251             "movss %4, %%xmm2\n\t" \
1252             "movss %5, %%xmm3\n\t" \
1253             \
1254             "movss %2, %%xmm0\n\t" \
1255             "movss %%xmm0, %%xmm1\n\t" \
1256             "movss %3, %%xmm4\n\t" \
1257             \
1258             "mulss %%xmm2, %%xmm0\n\t" \
1259             "movss %%xmm4, %%xmm5\n\t" \
1260             "mulss %%xmm3, %%xmm5\n\t" \
1261             "subss %%xmm5, %%xmm0\n\t" \
1262             \
1263             "mulss %%xmm3, %%xmm1\n\t" \
1264             "movss %%xmm4, %%xmm5\n\t" \
1265             "mulss %%xmm2, %%xmm5\n\t" \
1266             "addss %%xmm5, %%xmm1\n\t" \
1267             \
1268             "movss %%xmm0, %0\n\t" \
1269             "movss %%xmm1, %1\n\t"
1270             : "=m" (fpDest[iDataCntr]),
1271               "=m" (fpDest[iDataCntr + 1])
1272             : "m" (fpSrc1[iDataCntr]),
1273               "m" (fpSrc1[iDataCntr + 1]),
1274               "m" (fpSrc2[iDataCntr]),
1275               "m" (fpSrc2[iDataCntr + 1])
1276             : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
1277     }
1278 }
1279 
1280 
dsp_x86_sse_cmul3(double * dpDest,const double * dpSrc1,const double * dpSrc2,int iDataLength)1281 void dsp_x86_sse_cmul3 (double *dpDest, const double *dpSrc1,
1282     const double *dpSrc2, int iDataLength)
1283 {
1284     int iDataCntr;
1285     int iDataCount;
1286 
1287     iDataCount = (iDataLength << 1);
1288     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
1289     {
1290         X86_ASM (
1291             "movsd %4, %%xmm2\n\t" \
1292             "movsd %5, %%xmm3\n\t" \
1293             \
1294             "movsd %2, %%xmm0\n\t" \
1295             "movsd %%xmm0, %%xmm1\n\t" \
1296             "movsd %3, %%xmm4\n\t" \
1297             \
1298             "mulsd %%xmm2, %%xmm0\n\t" \
1299             "movsd %%xmm4, %%xmm5\n\t" \
1300             "mulsd %%xmm3, %%xmm5\n\t" \
1301             "subsd %%xmm5, %%xmm0\n\t" \
1302             \
1303             "mulsd %%xmm3, %%xmm1\n\t" \
1304             "movsd %%xmm4, %%xmm5\n\t" \
1305             "mulsd %%xmm2, %%xmm5\n\t" \
1306             "addsd %%xmm5, %%xmm1\n\t" \
1307             \
1308             "movsd %%xmm0, %0\n\t" \
1309             "movsd %%xmm1, %1\n\t"
1310             : "=m" (dpDest[iDataCntr]),
1311               "=m" (dpDest[iDataCntr + 1])
1312             : "m" (dpSrc1[iDataCntr]),
1313               "m" (dpSrc1[iDataCntr + 1]),
1314               "m" (dpSrc2[iDataCntr]),
1315               "m" (dpSrc2[iDataCntr + 1])
1316             : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
1317     }
1318 }
1319 
1320 
/*
    In-place multiply-then-add on a float vector using 3DNow! instructions:
    fpVect[i] = fpVect[i] * fMul + fAdd for every i in [0, iDataLength).

    Elements are processed two at a time; an odd trailing element is
    handled separately with 32-bit movd loads and stores.

    NOTE(review): m64pVect is indexed once per two floats, so pv2sf is
    presumably a pointer to an 8-byte element - confirm against dsp/X86.h.
*/
void dsp_x86_3dnow_maf (float *fpVect, float fMul, float fAdd, int iDataLength)
{
    int iDataCntr;
    int iDataCount;
    pv2sf m64pVect = (pv2sf) fpVect;
    stm64 m64Mul;
    stm64 m64Add;

    /* Replicate both scalars into both halves of a 64-bit operand. */
    m64Mul.f[0] = m64Mul.f[1] = fMul;
    m64Add.f[0] = m64Add.f[1] = fAdd;
    /* Number of whole two-float pairs. */
    iDataCount = (iDataLength >> 1);
    /* mm1 = multiplier, mm2 = addend.  The asm statements below read both,
       so they depend on these registers surviving between separate asm
       statements. */
    X86_ASM (
        "movq %0, %%mm1\n\t" \
        "movq %1, %%mm2\n\t"
        :
        : "m" (m64Mul),
          "m" (m64Add)
        : "mm1", "mm2", "memory");
    for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
    {
        /* Two elements per iteration, written back with a non-temporal
           store. */
        X86_ASM (
            "movq %1, %%mm0\n\t" \
            "pfmul %%mm1, %%mm0\n\t" \
            "pfadd %%mm2, %%mm0\n\t" \
            "movntq %%mm0, %0\n\t"
            : "=m" (m64pVect[iDataCntr])
            : "m0" (m64pVect[iDataCntr])
            : "mm0", "mm1", "mm2", "memory");
    }
    if (iDataLength & 0x1)
    {
        /* Odd length: process the last element alone; only the low 32 bits
           of mm0 are stored. */
        X86_ASM (
            "movd %1, %%mm0\n\t" \
            "pfmul %%mm1, %%mm0\n\t" \
            "pfadd %%mm2, %%mm0\n\t" \
            "movd %%mm0, %0\n\t"
            : "=m" (fpVect[iDataLength - 1])
            : "m0" (fpVect[iDataLength - 1])
            : "mm0", "mm1", "mm2", "memory");
    }
    /* Leave MMX/3DNow! state (femms) and fence the movntq stores (sfence). */
    X86_ASM (
        "femms\n\t" \
        "sfence\n\t");
}
1365 
1366 
/*
    In-place multiply-then-add on a float vector using scalar SSE:
    fpVect[i] = fpVect[i] * fMul + fAdd for every i in [0, iDataLength).
*/
void dsp_x86_sse_maf (float *fpVect, float fMul, float fAdd, int iDataLength)
{
    int iDataCntr;

    /* xmm1 = fMul, xmm2 = fAdd.  The loop's asm statements read both, so
       they depend on these registers surviving between separate asm
       statements. */
    X86_ASM (
        "movss %0, %%xmm1\n\t" \
        "movss %1, %%xmm2\n\t"
        :
        : "m" (fMul),
          "m" (fAdd)
        : "xmm1", "xmm2", "memory");
    for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
    {
        /* xmm0 = vect[i] * xmm1 + xmm2, stored back to vect[i]. */
        X86_ASM (
            "movss %1, %%xmm0\n\t" \
            "mulss %%xmm1, %%xmm0\n\t" \
            "addss %%xmm2, %%xmm0\n\t" \
            "movss %%xmm0, %0\n\t"
            : "=m" (fpVect[iDataCntr])
            : "m0" (fpVect[iDataCntr])
            : "xmm0", "xmm1", "xmm2", "memory");
    }
}
1390 
1391 
/*
    In-place multiply-then-add on a double vector using scalar
    movsd/mulsd/addsd:
    dpVect[i] = dpVect[i] * dMul + dAdd for every i in [0, iDataLength).
*/
void dsp_x86_sse_ma (double *dpVect, double dMul, double dAdd, int iDataLength)
{
    int iDataCntr;

    /* xmm1 = dMul, xmm2 = dAdd.  The loop's asm statements read both, so
       they depend on these registers surviving between separate asm
       statements. */
    X86_ASM (
        "movsd %0, %%xmm1\n\t" \
        "movsd %1, %%xmm2\n\t"
        :
        : "m" (dMul),
          "m" (dAdd)
        : "xmm1", "xmm2", "memory");
    for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
    {
        /* xmm0 = vect[i] * xmm1 + xmm2, stored back to vect[i]. */
        X86_ASM (
            "movsd %1, %%xmm0\n\t" \
            "mulsd %%xmm1, %%xmm0\n\t" \
            "addsd %%xmm2, %%xmm0\n\t" \
            "movsd %%xmm0, %0\n\t"
            : "=m" (dpVect[iDataCntr])
            : "m0" (dpVect[iDataCntr])
            : "xmm0", "xmm1", "xmm2", "memory");
    }
}
1415 
1416 
/*
    Multiply-then-add from a source vector into a destination vector using
    3DNow! instructions:
    fpDest[i] = fpSrc[i] * fMul + fAdd for every i in [0, iDataLength).

    Elements are processed two at a time; an odd trailing element is
    handled separately with 32-bit movd loads and stores.

    NOTE(review): m64pDest/m64pSrc are indexed once per two floats, so pv2sf
    is presumably a pointer to an 8-byte element - confirm against
    dsp/X86.h.
*/
void dsp_x86_3dnow_ma2f (float *fpDest, const float *fpSrc,
    float fMul, float fAdd, int iDataLength)
{
    int iDataCntr;
    int iDataCount;
    pv2sf m64pDest = (pv2sf) fpDest;
    pv2sf m64pSrc = (pv2sf) fpSrc;
    stm64 m64Mul;
    stm64 m64Add;

    /* Replicate both scalars into both halves of a 64-bit operand. */
    m64Mul.f[0] = m64Mul.f[1] = fMul;
    m64Add.f[0] = m64Add.f[1] = fAdd;
    /* Number of whole two-float pairs. */
    iDataCount = (iDataLength >> 1);
    /* mm1 = multiplier, mm2 = addend.  The asm statements below read both,
       so they depend on these registers surviving between separate asm
       statements. */
    X86_ASM (
        "movq %0, %%mm1\n\t" \
        "movq %1, %%mm2\n\t"
        :
        : "m" (m64Mul),
          "m" (m64Add)
        : "mm1", "mm2", "memory");
    for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
    {
        /* Two elements per iteration, written with a non-temporal store. */
        X86_ASM (
            "movq %1, %%mm0\n\t" \
            "pfmul %%mm1, %%mm0\n\t" \
            "pfadd %%mm2, %%mm0\n\t" \
            "movntq %%mm0, %0\n\t"
            : "=m" (m64pDest[iDataCntr])
            : "m" (m64pSrc[iDataCntr])
            : "mm0", "mm1", "mm2", "memory");
    }
    if (iDataLength & 0x1)
    {
        /* Odd length: process the last element alone; only the low 32 bits
           of mm0 are stored. */
        X86_ASM (
            "movd %1, %%mm0\n\t" \
            "pfmul %%mm1, %%mm0\n\t" \
            "pfadd %%mm2, %%mm0\n\t" \
            "movd %%mm0, %0\n\t"
            : "=m" (fpDest[iDataLength - 1])
            : "m" (fpSrc[iDataLength - 1])
            : "mm0", "mm1", "mm2", "memory");
    }
    /* Leave MMX/3DNow! state (femms) and fence the movntq stores (sfence). */
    X86_ASM (
        "femms\n\t" \
        "sfence\n\t");
}
1463 
1464 
/*
    Multiply-then-add from a source float vector into a destination vector
    using scalar SSE:
    fpDest[i] = fpSrc[i] * fMul + fAdd for every i in [0, iDataLength).
*/
void dsp_x86_sse_ma2f (float *fpDest, const float *fpSrc,
    float fMul, float fAdd, int iDataLength)
{
    int iDataCntr;

    /* xmm1 = fMul, xmm2 = fAdd.  The loop's asm statements read both, so
       they depend on these registers surviving between separate asm
       statements. */
    X86_ASM (
        "movss %0, %%xmm1\n\t" \
        "movss %1, %%xmm2\n\t"
        :
        : "m" (fMul),
          "m" (fAdd)
        : "xmm1", "xmm2", "memory");
    for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
    {
        /* xmm0 = src[i] * xmm1 + xmm2, stored to dest[i]. */
        X86_ASM (
            "movss %1, %%xmm0\n\t" \
            "mulss %%xmm1, %%xmm0\n\t" \
            "addss %%xmm2, %%xmm0\n\t" \
            "movss %%xmm0, %0\n\t"
            : "=m" (fpDest[iDataCntr])
            : "m" (fpSrc[iDataCntr])
            : "xmm0", "xmm1", "xmm2", "memory");
    }
}
1489 
1490 
/*
    Multiply-then-add from a source double vector into a destination vector
    using scalar movsd/mulsd/addsd:
    dpDest[i] = dpSrc[i] * dMul + dAdd for every i in [0, iDataLength).
*/
void dsp_x86_sse_ma2 (double *dpDest, const double *dpSrc,
    double dMul, double dAdd, int iDataLength)
{
    int iDataCntr;

    /* xmm1 = dMul, xmm2 = dAdd.  The loop's asm statements read both, so
       they depend on these registers surviving between separate asm
       statements. */
    X86_ASM (
        "movsd %0, %%xmm1\n\t" \
        "movsd %1, %%xmm2\n\t"
        :
        : "m" (dMul),
          "m" (dAdd)
        : "xmm1", "xmm2", "memory");
    for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
    {
        /* xmm0 = src[i] * xmm1 + xmm2, stored to dest[i]. */
        X86_ASM (
            "movsd %1, %%xmm0\n\t" \
            "mulsd %%xmm1, %%xmm0\n\t" \
            "addsd %%xmm2, %%xmm0\n\t" \
            "movsd %%xmm0, %0\n\t"
            : "=m" (dpDest[iDataCntr])
            : "m" (dpSrc[iDataCntr])
            : "xmm0", "xmm1", "xmm2", "memory");
    }
}
1515 
1516 
dsp_x86_3dnow_cmaf(float * fpDest,const float * fpSrc1,const float * fpSrc2,int iDataLength)1517 void dsp_x86_3dnow_cmaf (float *fpDest, const float *fpSrc1,
1518     const float *fpSrc2, int iDataLength)
1519 {
1520     int iDataCntr;
1521     pv2sf m64pDest = (pv2sf) fpDest;
1522     pv2sf m64pSrc1 = (pv2sf) fpSrc1;
1523     pv2sf m64pSrc2 = (pv2sf) fpSrc2;
1524 
1525     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
1526     {
1527         X86_ASM (
1528             "movq %2, %%mm0\n\t" \
1529             "movq %3, %%mm1\n\t" \
1530             "movq %1, %%mm3\n\t" \
1531             "pswapd %%mm1, %%mm2\n\t" \
1532             "pfmul %%mm0, %%mm1\n\t" \
1533             "pfmul %%mm0, %%mm2\n\t" \
1534             "pfpnacc %%mm2, %%mm1\n\t" \
1535             "pfadd %%mm1, %%mm3\n\t" \
1536             "movntq %%mm3, %0\n\t"
1537             : "=m" (m64pDest[iDataCntr])
1538             : "m0" (m64pDest[iDataCntr]),
1539               "m" (m64pSrc1[iDataCntr]),
1540               "m" (m64pSrc2[iDataCntr])
1541             : "mm0", "mm1", "mm2", "mm3", "memory");
1542     }
1543     X86_ASM (
1544         "femms\n\t" \
1545         "sfence\n\t");
1546 }
1547 
1548 
dsp_x86_sse_cmaf(float * fpDest,const float * fpSrc1,const float * fpSrc2,int iDataLength)1549 void dsp_x86_sse_cmaf (float *fpDest, const float *fpSrc1,
1550     const float *fpSrc2, int iDataLength)
1551 {
1552     int iDataCntr;
1553     int iDataCount;
1554 
1555     iDataCount = (iDataLength << 1);
1556     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
1557     {
1558         X86_ASM (
1559             "movss %6, %%xmm2\n\t" \
1560             "movss %7, %%xmm3\n\t" \
1561             \
1562             "movss %4, %%xmm0\n\t" \
1563             "movss %%xmm0, %%xmm1\n\t" \
1564             "movss %5, %%xmm4\n\t" \
1565             \
1566             "movss %2, %%xmm6\n\t" \
1567             "movss %3, %%xmm7\n\t" \
1568             \
1569             "mulss %%xmm2, %%xmm0\n\t" \
1570             "movss %%xmm4, %%xmm5\n\t" \
1571             "mulss %%xmm3, %%xmm5\n\t" \
1572             "subss %%xmm5, %%xmm0\n\t" \
1573             \
1574             "mulss %%xmm3, %%xmm1\n\t" \
1575             "movss %%xmm4, %%xmm5\n\t" \
1576             "mulss %%xmm2, %%xmm5\n\t" \
1577             "addss %%xmm5, %%xmm1\n\t" \
1578             \
1579             "addss %%xmm0, %%xmm6\n\t" \
1580             "addss %%xmm1, %%xmm7\n\t" \
1581             \
1582             "movss %%xmm6, %0\n\t" \
1583             "movss %%xmm7, %1\n\t"
1584             : "=m" (fpDest[iDataCntr]),
1585               "=m" (fpDest[iDataCntr + 1])
1586             : "m0" (fpDest[iDataCntr]),
1587               "m1" (fpDest[iDataCntr + 1]),
1588               "m" (fpSrc1[iDataCntr]),
1589               "m" (fpSrc1[iDataCntr + 1]),
1590               "m" (fpSrc2[iDataCntr]),
1591               "m" (fpSrc2[iDataCntr + 1])
1592             : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
1593               "memory");
1594     }
1595 }
1596 
1597 
dsp_x86_sse_cma(double * dpDest,const double * dpSrc1,const double * dpSrc2,int iDataLength)1598 void dsp_x86_sse_cma (double *dpDest, const double *dpSrc1,
1599     const double *dpSrc2, int iDataLength)
1600 {
1601     int iDataCntr;
1602     int iDataCount;
1603 
1604     iDataCount = (iDataLength << 1);
1605     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
1606     {
1607         X86_ASM (
1608             "movsd %6, %%xmm2\n\t" \
1609             "movsd %7, %%xmm3\n\t" \
1610             \
1611             "movsd %4, %%xmm0\n\t" \
1612             "movsd %%xmm0, %%xmm1\n\t" \
1613             "movsd %5, %%xmm4\n\t" \
1614             \
1615             "movsd %2, %%xmm6\n\t" \
1616             "movsd %3, %%xmm7\n\t" \
1617             \
1618             "mulsd %%xmm2, %%xmm0\n\t" \
1619             "movsd %%xmm4, %%xmm5\n\t" \
1620             "mulsd %%xmm3, %%xmm5\n\t" \
1621             "subsd %%xmm5, %%xmm0\n\t" \
1622             \
1623             "mulsd %%xmm3, %%xmm1\n\t" \
1624             "movsd %%xmm4, %%xmm5\n\t" \
1625             "mulsd %%xmm2, %%xmm5\n\t" \
1626             "addsd %%xmm5, %%xmm1\n\t" \
1627             \
1628             "addsd %%xmm0, %%xmm6\n\t" \
1629             "addsd %%xmm1, %%xmm7\n\t" \
1630             \
1631             "movsd %%xmm6, %0\n\t" \
1632             "movsd %%xmm7, %1\n\t"
1633             : "=m" (dpDest[iDataCntr]),
1634               "=m" (dpDest[iDataCntr + 1])
1635             : "m0" (dpDest[iDataCntr]),
1636               "m1" (dpDest[iDataCntr + 1]),
1637               "m" (dpSrc1[iDataCntr]),
1638               "m" (dpSrc1[iDataCntr + 1]),
1639               "m" (dpSrc2[iDataCntr]),
1640               "m" (dpSrc2[iDataCntr + 1])
1641             : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
1642               "memory");
1643     }
1644 }
1645 
1646 
/*
    In-place add-then-multiply on a float vector using 3DNow! instructions:
    fpVect[i] = (fpVect[i] + fAdd) * fMul for every i in [0, iDataLength).

    Elements are processed two at a time; an odd trailing element is
    handled separately with 32-bit movd loads and stores.

    NOTE(review): m64pVect is indexed once per two floats, so pv2sf is
    presumably a pointer to an 8-byte element - confirm against dsp/X86.h.
*/
void dsp_x86_3dnow_amf (float *fpVect, float fAdd, float fMul, int iDataLength)
{
    int iDataCntr;
    int iDataCount;
    pv2sf m64pVect = (pv2sf) fpVect;
    stm64 m64Add;
    stm64 m64Mul;

    /* Replicate both scalars into both halves of a 64-bit operand. */
    m64Add.f[0] = m64Add.f[1] = fAdd;
    m64Mul.f[0] = m64Mul.f[1] = fMul;
    /* Number of whole two-float pairs. */
    iDataCount = (iDataLength >> 1);
    /* mm1 = addend, mm2 = multiplier.  The asm statements below read both,
       so they depend on these registers surviving between separate asm
       statements. */
    X86_ASM (
        "movq %0, %%mm1\n\t" \
        "movq %1, %%mm2\n\t"
        :
        : "m" (m64Add),
          "m" (m64Mul)
        : "mm1", "mm2", "memory");
    for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
    {
        /* Two elements per iteration, written back with a non-temporal
           store. */
        X86_ASM (
            "movq %1, %%mm0\n\t" \
            "pfadd %%mm1, %%mm0\n\t" \
            "pfmul %%mm2, %%mm0\n\t" \
            "movntq %%mm0, %0\n\t"
            : "=m" (m64pVect[iDataCntr])
            : "m0" (m64pVect[iDataCntr])
            : "mm0", "mm1", "mm2", "memory");
    }
    if (iDataLength & 0x1)
    {
        /* Odd length: process the last element alone; only the low 32 bits
           of mm0 are stored. */
        X86_ASM (
            "movd %1, %%mm0\n\t" \
            "pfadd %%mm1, %%mm0\n\t" \
            "pfmul %%mm2, %%mm0\n\t" \
            "movd %%mm0, %0\n\t"
            : "=m" (fpVect[iDataLength - 1])
            : "m0" (fpVect[iDataLength - 1])
            : "mm0", "mm1", "mm2", "memory");
    }
    /* Leave MMX/3DNow! state (femms) and fence the movntq stores (sfence). */
    X86_ASM (
        "femms\n\t" \
        "sfence\n\t");
}
1691 
1692 
/*
    In-place add-then-multiply on a float vector using scalar SSE:
    fpVect[i] = (fpVect[i] + fAdd) * fMul for every i in [0, iDataLength).
*/
void dsp_x86_sse_amf (float *fpVect, float fAdd, float fMul, int iDataLength)
{
    int iDataCntr;

    /* xmm1 = fAdd, xmm2 = fMul.  The loop's asm statements read both, so
       they depend on these registers surviving between separate asm
       statements. */
    X86_ASM (
        "movss %0, %%xmm1\n\t" \
        "movss %1, %%xmm2\n\t"
        :
        : "m" (fAdd),
          "m" (fMul)
        : "xmm1", "xmm2", "memory");
    for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
    {
        /* xmm0 = (vect[i] + xmm1) * xmm2, stored back to vect[i]. */
        X86_ASM (
            "movss %1, %%xmm0\n\t" \
            "addss %%xmm1, %%xmm0\n\t" \
            "mulss %%xmm2, %%xmm0\n\t" \
            "movss %%xmm0, %0\n\t"
            : "=m" (fpVect[iDataCntr])
            : "m0" (fpVect[iDataCntr])
            : "xmm0", "xmm1", "xmm2", "memory");
    }
}
1716 
1717 
/*
    Multiply-accumulate (sum of element-wise products) of two float vectors
    using 3DNow! instructions.  Returns the sum over i in [0, iDataLength)
    of fpSrc1[i] * fpSrc2[i]; the result is 0 when no element is processed.

    Elements are processed two at a time; an odd trailing element is loaded
    with movd, which leaves the upper half of the register zero.

    NOTE(review): m64pSrc1/m64pSrc2 are indexed once per two floats, so
    pv2sf is presumably a pointer to an 8-byte element - confirm against
    dsp/X86.h.
*/
float dsp_x86_3dnow_macf (const float *fpSrc1, const float *fpSrc2,
    int iDataLength)
{
    int iDataCntr;
    int iDataCount;
    float fRes;
    pv2sf m64pSrc1 = (pv2sf) fpSrc1;
    pv2sf m64pSrc2 = (pv2sf) fpSrc2;

    /* Number of whole two-float pairs. */
    iDataCount = (iDataLength >> 1);
    /* Clear the accumulator mm0.  It carries the running sum through all
       following asm statements, which therefore depend on mm0 surviving
       between separate asm statements. */
    X86_ASM (
        "pxor %%mm0, %%mm0\n\t"
        :
        :
        : "mm0");
    for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
    {
        /* mm1 = two products; pfacc folds the previous two halves of mm0
           into its low half and puts the sum of the new products into its
           high half. */
        X86_ASM (
            "movq %0, %%mm1\n\t" \
            "movq %1, %%mm2\n\t" \
            "pfmul %%mm2, %%mm1\n\t" \
            "pfacc %%mm1, %%mm0\n\t"
            :
            : "m" (m64pSrc1[iDataCntr]),
              "m" (m64pSrc2[iDataCntr])
            : "mm0", "mm1", "mm2", "memory");
    }
    if (iDataLength & 0x1)
    {
        /* Odd length: accumulate the product of the last elements. */
        X86_ASM (
            "movd %0, %%mm1\n\t" \
            "movd %1, %%mm2\n\t" \
            "pfmul %%mm2, %%mm1\n\t" \
            "pfacc %%mm1, %%mm0\n\t"
            :
            : "m" (fpSrc1[iDataLength - 1]),
              "m" (fpSrc2[iDataLength - 1])
            : "mm0", "mm1", "mm2", "memory");
    }
    /* Final horizontal add of the two halves, then store the low 32 bits. */
    X86_ASM (
        "pfacc %%mm0, %%mm0\n\t" \
        "movd %%mm0, %0\n\t"
        : "=m" (fRes)
        :
        : "mm0", "memory");
    /* Leave MMX/3DNow! state. */
    X86_ASM ("femms\n\t");

    return fRes;
}
1767 
1768 
/*
    Multiply-accumulate (sum of element-wise products) of two float vectors
    using scalar SSE.  Returns the sum over i in [0, iDataLength) of
    fpSrc1[i] * fpSrc2[i]; the result is 0 when the loop does not run.
*/
float dsp_x86_sse_macf (const float *fpSrc1, const float *fpSrc2,
    int iDataLength)
{
    int iDataCntr;
    float fRes;

    /* Clear the accumulator xmm0.  It carries the running sum through the
       loop, which therefore depends on xmm0 surviving between separate asm
       statements. */
    X86_ASM (
        "xorps %%xmm0, %%xmm0\n\t"
        :
        :
        : "xmm0");
    for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
    {
        /* xmm0 += src1[i] * src2[i].  xmm2 is listed as clobbered although
           the template does not use it. */
        X86_ASM (
            "movss %0, %%xmm1\n\t" \
            "mulss %1, %%xmm1\n\t" \
            "addss %%xmm1, %%xmm0\n\t"
            :
            : "m" (fpSrc1[iDataCntr]),
              "m" (fpSrc2[iDataCntr])
            : "xmm0", "xmm1", "xmm2", "memory");
    }
    /* Store the accumulated sum into the return variable. */
    X86_ASM (
        "movss %%xmm0, %0\n\t"
        : "=m" (fRes)
        :
        : "xmm0");

    return fRes;
}
1799 
1800 
/*
    Multiply-accumulate (sum of element-wise products) of two double vectors
    using scalar movsd/mulsd/addsd.  Returns the sum over i in
    [0, iDataLength) of dpSrc1[i] * dpSrc2[i]; the result is 0 when the loop
    does not run.
*/
double dsp_x86_sse_mac (const double *dpSrc1, const double *dpSrc2,
    int iDataLength)
{
    int iDataCntr;
    double dRes;

    /* Clear the accumulator xmm0.  It carries the running sum through the
       loop, which therefore depends on xmm0 surviving between separate asm
       statements. */
    X86_ASM (
        "xorpd %%xmm0, %%xmm0\n\t"
        :
        :
        : "xmm0");
    for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
    {
        /* xmm0 += src1[i] * src2[i].  xmm2 is listed as clobbered although
           the template does not use it. */
        X86_ASM (
            "movsd %0, %%xmm1\n\t" \
            "mulsd %1, %%xmm1\n\t" \
            "addsd %%xmm1, %%xmm0\n\t"
            :
            : "m" (dpSrc1[iDataCntr]),
              "m" (dpSrc2[iDataCntr])
            : "xmm0", "xmm1", "xmm2", "memory");
    }
    /* Store the accumulated sum into the return variable. */
    X86_ASM (
        "movsd %%xmm0, %0\n\t"
        : "=m" (dRes)
        :
        : "xmm0");

    return dRes;
}
1831 
1832 
dsp_x86_3dnow_minmaxf(float * fpMin,float * fpMax,const float * fpSrc,int iDataLength)1833 void dsp_x86_3dnow_minmaxf (float *fpMin, float *fpMax, const float *fpSrc,
1834     int iDataLength)
1835 {
1836     int iDataCntr;
1837     int iDataCount;
1838     stm64 m64Min;
1839     stm64 m64Max;
1840     pv2sf m64pSrc = (pv2sf) fpSrc;
1841 
1842     m64Min.f[0] = m64Min.f[1] = FLT_MAX;
1843     m64Max.f[0] = m64Max.f[1] = -FLT_MAX;
1844     iDataCount = (iDataLength >> 1);
1845     X86_ASM (
1846         "movq %0, %%mm1\n\t" \
1847         "movq %1, %%mm2\n\t"
1848         :
1849         : "m" (m64Min),
1850           "m" (m64Max)
1851         : "mm1", "mm2", "memory");
1852     for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
1853     {
1854         X86_ASM (
1855             "movq %0, %%mm0\n\t" \
1856             "pfmin %%mm0, %%mm1\n\t" \
1857             "pfmax %%mm0, %%mm2\n\t"
1858             :
1859             : "m" (m64pSrc[iDataCntr])
1860             : "mm0", "mm1", "mm2", "memory");
1861     }
1862     if (iDataLength & 0x1)
1863     {
1864         X86_ASM (
1865             "movd %0, %%mm0\n\t" \
1866             "pfmin %%mm0, %%mm1\n\t" \
1867             "pfmax %%mm0, %%mm2\n\t"
1868             :
1869             : "m" (fpSrc[iDataLength - 1])
1870             : "mm0", "mm1", "mm2", "memory");
1871     }
1872     X86_ASM (
1873         "pswapd %%mm1, %%mm3\n\t" \
1874         "pfmin %%mm3, %%mm1\n\t" \
1875         "pswapd %%mm2, %%mm3\n\t" \
1876         "pfmax %%mm3, %%mm2\n\t" \
1877         "movd %%mm1, %0\n\t" \
1878         "movd %%mm2, %1\n\t"
1879         : "=m" (*fpMin),
1880           "=m" (*fpMax)
1881         :
1882         : "mm1", "mm2", "mm3", "memory");
1883     X86_ASM ("femms\n\t");
1884 }
1885 
1886 
/*
    Find the smallest and largest value of a float vector using scalar SSE
    minss/maxss.

    fpMin        receives the minimum of fpSrc[0 .. iDataLength - 1]
    fpMax        receives the maximum of fpSrc[0 .. iDataLength - 1]
    fpSrc        input vector
    iDataLength  number of floats in fpSrc

    When the loop does not run, *fpMin is FLT_MAX and *fpMax is -FLT_MAX.
*/
void dsp_x86_sse_minmaxf (float *fpMin, float *fpMax, const float *fpSrc,
    int iDataLength)
{
    int iDataCntr;

    /* Seed the outputs; they double as the memory operands for the initial
       register loads below. */
    *fpMin = FLT_MAX;
    *fpMax = -FLT_MAX;
    /* xmm0 = running minimum, xmm1 = running maximum.  Both carry state
       through the following asm statements, which therefore depend on them
       surviving between separate asm statements. */
    X86_ASM (
        "movss %0, %%xmm0\n\t" \
        "movss %1, %%xmm1\n\t"
        :
        : "m" (*fpMin),
          "m" (*fpMax)
        : "xmm0", "xmm1", "memory");
    for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
    {
        /* Fold src[i] into the running minimum and maximum. */
        X86_ASM (
            "movss %0, %%xmm2\n\t" \
            "minss %%xmm2, %%xmm0\n\t" \
            "maxss %%xmm2, %%xmm1\n\t"
            :
            : "m" (fpSrc[iDataCntr])
            : "xmm0", "xmm1", "xmm2", "memory");
    }
    /* Write the results back through the output pointers. */
    X86_ASM (
        "movss %%xmm0, %0\n\t" \
        "movss %%xmm1, %1\n\t"
        : "=m" (*fpMin),
          "=m" (*fpMax)
        :
        : "xmm0", "xmm1", "memory");
}
1919 
1920 
dsp_x86_sse_minmax(double * dpMin,double * dpMax,const double * dpSrc,int iDataLength)1921 void dsp_x86_sse_minmax (double *dpMin, double *dpMax, const double *dpSrc,
1922     int iDataLength)
1923 {
1924     int iDataCntr;
1925 
1926     *dpMin = FLT_MAX;
1927     *dpMax = -FLT_MAX;
1928     X86_ASM (
1929         "movsd %0, %%xmm0\n\t" \
1930         "movsd %1, %%xmm1\n\t"
1931         :
1932         : "m" (*dpMin),
1933           "m" (*dpMax)
1934         : "xmm0", "xmm1", "memory");
1935     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
1936     {
1937         X86_ASM (
1938             "movsd %0, %%xmm2\n\t" \
1939             "minsd %%xmm2, %%xmm0\n\t" \
1940             "maxsd %%xmm2, %%xmm1\n\t"
1941             :
1942             : "m" (dpSrc[iDataCntr])
1943             : "xmm0", "xmm1", "xmm2", "memory");
1944     }
1945     X86_ASM (
1946         "movss %%xmm0, %0\n\t" \
1947         "movss %%xmm1, %1\n\t"
1948         : "=m" (*dpMin),
1949           "=m" (*dpMax)
1950         :
1951         : "xmm0", "xmm1", "memory");
1952 }
1953 
1954 
/*
    Normalized cross-correlation of two single precision vectors using
    3DNow! instructions.

    fpSrc1       first source vector
    fpSrc2       second source vector
    iDataLength  number of elements in each vector

    Returns the value left in the low half of mm5 by the final asm statement:
    the product sum scaled by 1/iDataLength, multiplied by the reciprocal of
    (sqrt(square sum 1 * square sum 2) scaled by 1/iDataLength).

    Register roles across the asm statements:
        mm5  accumulates fpSrc1[i] * fpSrc2[i]
        mm3  accumulates fpSrc1[i] * fpSrc1[i]
        mm4  accumulates fpSrc2[i] * fpSrc2[i]
    NOTE(review): pv2sf is declared elsewhere; presumably a pointer to a
    packed pair of floats - confirm against dsp/X86.h.
*/
float dsp_x86_3dnow_crosscorrf (const float *fpSrc1, const float *fpSrc2,
    int iDataLength)
{
    int iDataCntr;
    int iDataCount;
    float fRes;
    pv2sf m64pSrc1 = (pv2sf) fpSrc1;
    pv2sf m64pSrc2 = (pv2sf) fpSrc2;

    /* Number of complete element pairs */
    iDataCount = (iDataLength >> 1);
    /* Clear the three accumulators */
    X86_ASM (
        "pxor %%mm3, %%mm3\n\t" \
        "pxor %%mm4, %%mm4\n\t" \
        "pxor %%mm5, %%mm5\n\t"
        :
        :
        : "mm3", "mm4", "mm5");
    for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
    {
        /* Two elements per iteration: products and squares are folded into
           the accumulators with pfacc */
        X86_ASM (
            "movq %0, %%mm0\n\t" \
            "movq %1, %%mm1\n\t" \
            "movq %%mm1, %%mm2\n\t" \
            "pfmul %%mm0, %%mm2\n\t" \
            "pfacc %%mm2, %%mm5\n\t" \
            "pfmul %%mm0, %%mm0\n\t" \
            "pfacc %%mm0, %%mm3\n\t" \
            "pfmul %%mm1, %%mm1\n\t" \
            "pfacc %%mm1, %%mm4\n\t"
            :
            : "m" (m64pSrc1[iDataCntr]),
              "m" (m64pSrc2[iDataCntr])
            : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "memory");
    }
    if (iDataLength & 0x1)
    {
        /* Odd length: process the last element alone; movd loads only the
           low 32 bits so the upper half contributes zero */
        X86_ASM (
            "movd %0, %%mm0\n\t" \
            "movd %1, %%mm1\n\t" \
            "movq %%mm1, %%mm2\n\t" \
            "pfmul %%mm0, %%mm2\n\t" \
            "pfacc %%mm2, %%mm5\n\t" \
            "pfmul %%mm0, %%mm0\n\t" \
            "pfacc %%mm0, %%mm3\n\t" \
            "pfmul %%mm1, %%mm1\n\t" \
            "pfacc %%mm1, %%mm4\n\t"
            :
            : "m" (fpSrc1[iDataLength - 1]),
              "m" (fpSrc2[iDataLength - 1])
            : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "memory");
    }
    /* Final stage, in order: collapse each accumulator to a single sum,
       build 1/iDataLength, take sqrt(mm3 * mm4) via the reciprocal square
       root sequence, scale it, invert it and multiply with the scaled
       product sum.
       NOTE(review): the refined reciprocal of iDataLength ends up in mm7,
       but the later multiplies use mm6 (the initial pfrcp estimate). The
       same factor scales numerator and denominator so it should cancel -
       confirm this is intended. */
    X86_ASM (
        "pfacc %%mm3, %%mm3\n\t" \
        "pfacc %%mm4, %%mm4\n\t" \
        "pfacc %%mm5, %%mm5\n\t" \
        \
        "movd %1, %%mm6\n\t" \
        "pswapd %%mm6, %%mm7\n\t" \
        "paddd %%mm7, %%mm6\n\t" \
        "pi2fd %%mm6, %%mm7\n\t" \
        \
        "pfrcp %%mm7, %%mm6\n\t" \
        "pfrcpit1 %%mm6, %%mm7\n\t" \
        "pfrcpit2 %%mm6, %%mm7\n\t" \
        \
        "pfmul %%mm3, %%mm4\n\t" \
        \
        "movq %%mm4, %%mm0\n\t" \
        "pfrsqrt %%mm4, %%mm1\n\t" \
        "movq %%mm1, %%mm2\n\t" \
        "pfmul %%mm1, %%mm1\n\t" \
        "pfrsqit1 %%mm4, %%mm1\n\t" \
        "pfrcpit2 %%mm2, %%mm1\n\t" \
        "pfmul %%mm1, %%mm4\n\t" \
        \
        "pfmul %%mm6, %%mm4\n\t" \
        \
        "pfrcp %%mm4, %%mm0\n\t" \
        "pfrcpit1 %%mm0, %%mm4\n\t" \
        "pfrcpit2 %%mm0, %%mm4\n\t" \
        \
        "pfmul %%mm6, %%mm5\n\t" \
        "pfmul %%mm4, %%mm5\n\t" \
        "movd %%mm5, %0\n\t"
        : "=m" (fRes)
        : "m" (iDataLength)
        : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
    /* Leave MMX state so that x87 code can run again */
    X86_ASM ("femms\n\t");

    return fRes;
}
2046 
2047 
dsp_x86_sse_crosscorrf(const float * fpSrc1,const float * fpSrc2,int iDataLength)2048 float dsp_x86_sse_crosscorrf (const float *fpSrc1, const float *fpSrc2,
2049     int iDataLength)
2050 {
2051     int iDataCntr;
2052     float fScale;
2053     float fNormFact;
2054     float fProdSum;
2055     float fSqSum1;
2056     float fSqSum2;
2057     float fRes;
2058 
2059     X86_ASM (
2060         "xorps %%xmm0, %%xmm0\n\t" \
2061         "xorps %%xmm1, %%xmm1\n\t" \
2062         "xorps %%xmm2, %%xmm2\n\t"
2063         :
2064         :
2065         : "xmm0", "xmm1", "xmm2");
2066     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
2067     {
2068         X86_ASM (
2069             "movss %3, %%xmm3\n\t" \
2070             "movss %4, %%xmm4\n\t" \
2071             \
2072             "movss %%xmm4, %%xmm5\n\t" \
2073             "mulss %%xmm3, %%xmm5\n\t" \
2074             "addss %%xmm5, %%xmm0\n\t" \
2075             \
2076             "movss %%xmm3, %%xmm5\n\t" \
2077             "mulss %%xmm3, %%xmm5\n\t" \
2078             "addss %%xmm5, %%xmm1\n\t" \
2079             \
2080             "movss %%xmm4, %%xmm5\n\t" \
2081             "mulss %%xmm4, %%xmm5\n\t" \
2082             "addss %%xmm5, %%xmm2\n\t" \
2083             \
2084             "movss %%xmm0, %0\n\t" \
2085             "movss %%xmm1, %1\n\t" \
2086             "movss %%xmm2, %2\n\t"
2087             : "=m" (fProdSum),
2088               "=m" (fSqSum1),
2089               "=m" (fSqSum2)
2090             : "m" (fpSrc1[iDataCntr]),
2091               "m" (fpSrc2[iDataCntr])
2092             : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
2093     }
2094     fScale = 1.0F / iDataLength;
2095     fNormFact = sqrtf(fSqSum1 * fSqSum2) * fScale;
2096     fRes = (fProdSum * fScale) / fNormFact;
2097 
2098     return fRes;
2099 }
2100 
2101 
dsp_x86_sse_crosscorr(const double * dpSrc1,const double * dpSrc2,int iDataLength)2102 double dsp_x86_sse_crosscorr (const double *dpSrc1, const double *dpSrc2,
2103     int iDataLength)
2104 {
2105     int iDataCntr;
2106     double dScale;
2107     double dNormFact;
2108     double dProdSum;
2109     double dSqSum1;
2110     double dSqSum2;
2111     double dRes;
2112 
2113     X86_ASM (
2114         "xorpd %%xmm0, %%xmm0\n\t" \
2115         "xorpd %%xmm1, %%xmm1\n\t" \
2116         "xorpd %%xmm2, %%xmm2\n\t"
2117         :
2118         :
2119         : "xmm0", "xmm1", "xmm2");
2120     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
2121     {
2122         X86_ASM (
2123             "movsd %3, %%xmm3\n\t" \
2124             "movsd %4, %%xmm4\n\t" \
2125             \
2126             "movsd %%xmm4, %%xmm5\n\t" \
2127             "mulsd %%xmm3, %%xmm5\n\t" \
2128             "addsd %%xmm5, %%xmm0\n\t" \
2129             \
2130             "movsd %%xmm3, %%xmm5\n\t" \
2131             "mulsd %%xmm3, %%xmm5\n\t" \
2132             "addsd %%xmm5, %%xmm1\n\t" \
2133             \
2134             "movsd %%xmm4, %%xmm5\n\t" \
2135             "mulsd %%xmm4, %%xmm5\n\t" \
2136             "addsd %%xmm5, %%xmm2\n\t" \
2137             \
2138             "movsd %%xmm0, %0\n\t" \
2139             "movsd %%xmm1, %1\n\t" \
2140             "movsd %%xmm2, %2\n\t"
2141             : "=m" (dProdSum),
2142               "=m" (dSqSum1),
2143               "=m" (dSqSum2)
2144             : "m" (dpSrc1[iDataCntr]),
2145               "m" (dpSrc2[iDataCntr])
2146             : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
2147     }
2148     dScale = 1.0 / iDataLength;
2149     dNormFact = sqrt(dSqSum1 * dSqSum2) * dScale;
2150     dRes = (dProdSum * dScale) / dNormFact;
2151 
2152     return dRes;
2153 }
2154 
2155 
dsp_x86_3dnow_i16tof(float * fpDest,const short * ipSrc,int iDataLength,int iIntMax)2156 void dsp_x86_3dnow_i16tof (float *fpDest, const short *ipSrc, int iDataLength,
2157     int iIntMax)
2158 {
2159     int iDataCntr;
2160     float fScale;
2161 
2162     X86_ASM (
2163         "movd %1, %%mm1\n\t" \
2164         "pswapd %%mm1, %%mm2\n\t" \
2165         "paddd %%mm2, %%mm1\n\t" \
2166         "pi2fd %%mm1, %%mm1\n\t" \
2167         "pfrcp %%mm1, %%mm2\n\t" \
2168         "pfrcpit1 %%mm2, %%mm1\n\t" \
2169         "pfrcpit2 %%mm2, %%mm1\n\t" \
2170         "movd %%mm1, %0\n\t"
2171         : "=m" (fScale)
2172         : "m" (iIntMax)
2173         : "mm1", "mm2", "memory");
2174     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr += 2)
2175     {
2176         X86_ASM (
2177             "movd %1, %%mm0\n\t" \
2178             "punpcklwd %%mm0, %%mm0\n\t" \
2179             "pi2fw %%mm0, %%mm0\n\t" \
2180             "pfmul %%mm1, %%mm0\n\t" \
2181             "movntq %%mm0, %0\n\t"
2182             : "=m" (fpDest[iDataCntr])
2183             : "m" (ipSrc[iDataCntr])
2184             : "mm0", "mm1", "memory");
2185     }
2186     X86_ASM (
2187         "femms\n\t" \
2188         "sfence\n\t");
2189     if ((iDataLength % 2) != 0)
2190     {
2191         fpDest[iDataLength - 1] = ((float) ipSrc[iDataLength - 1]) * fScale;
2192     }
2193 }
2194 
2195 
dsp_x86_3dnow_i32tof(float * fpDest,const int * ipSrc,int iDataLength,int iIntMax)2196 void dsp_x86_3dnow_i32tof (float *fpDest, const int *ipSrc, int iDataLength,
2197     int iIntMax)
2198 {
2199     int iDataCntr;
2200     float fScale;
2201 
2202     X86_ASM (
2203         "movd %1, %%mm1\n\t" \
2204         "pswapd %%mm1, %%mm2\n\t" \
2205         "paddd %%mm2, %%mm1\n\t" \
2206         "pi2fd %%mm1, %%mm1\n\t" \
2207         "pfrcp %%mm1, %%mm2\n\t" \
2208         "pfrcpit1 %%mm2, %%mm1\n\t" \
2209         "pfrcpit2 %%mm2, %%mm1\n\t" \
2210         "movd %%mm1, %0\n\t"
2211         : "=m" (fScale)
2212         : "m" (iIntMax)
2213         : "mm1", "mm2", "memory");
2214     for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr += 2)
2215     {
2216         X86_ASM (
2217             "movq %1, %%mm0\n\t" \
2218             "pi2fd %%mm0, %%mm0\n\t" \
2219             "pfmul %%mm1, %%mm0\n\t" \
2220             "movntq %%mm0, %0\n\t"
2221             : "=m" (fpDest[iDataCntr])
2222             : "m" (ipSrc[iDataCntr])
2223             : "mm0", "mm1", "memory");
2224     }
2225     X86_ASM (
2226         "femms\n\t" \
2227         "sfence\n\t");
2228     if ((iDataLength % 2) != 0)
2229     {
2230         fpDest[iDataLength - 1] = ((float) ipSrc[iDataLength - 1]) * fScale;
2231     }
2232 }
2233 
2234 
dsp_x86_3dnow_firf(float * fpDest,const float * fpSrc,int iDataLength,const float * fpCoeff,int iCoeffLength)2235 void dsp_x86_3dnow_firf (float *fpDest, const float *fpSrc, int iDataLength,
2236     const float *fpCoeff, int iCoeffLength)
2237 {
2238     int iSrcCntr;
2239     int iDestCntr;
2240     int iCoeffCntr;
2241     int iSrcCount;
2242     pv2sf m64pDest = (pv2sf) fpDest;
2243 
2244     iDestCntr = 0;
2245     iSrcCount = iDataLength + iCoeffLength;
2246     for (iSrcCntr = iCoeffLength;
2247         iSrcCntr < iSrcCount;
2248         iSrcCntr += 2)
2249     {
2250         X86_ASM (
2251             "pxor %%mm0, %%mm0\n\t"
2252             :
2253             :
2254             : "mm0");
2255         for (iCoeffCntr = 0;
2256             iCoeffCntr < iCoeffLength;
2257             iCoeffCntr++)
2258         {
2259             X86_ASM (
2260                 "movq %0, %%mm1\n\t" \
2261                 "movd %1, %%mm2\n\t" \
2262                 "pswapd %%mm2, %%mm3\n\t" \
2263                 "pfadd %%mm3, %%mm2\n\t" \
2264                 "pfmul %%mm2, %%mm1\n\t" \
2265                 "pfadd %%mm1, %%mm0\n\t"
2266                 :
2267                 : "m" (fpSrc[iSrcCntr - iCoeffCntr]),
2268                   "m" (fpCoeff[iCoeffCntr])
2269                 : "mm0", "mm1", "mm2", "mm3", "memory");
2270         }
2271         X86_ASM (
2272             "movntq %%mm0, %0\n\t"
2273             : "=m" (m64pDest[iDestCntr++])
2274             :
2275             : "mm0", "memory");
2276     }
2277     if (iDataLength & 0x1)
2278     {
2279         X86_ASM (
2280             "pxor %%mm0, %%mm0\n\t"
2281             :
2282             :
2283             : "mm0");
2284         for (iCoeffCntr = 0;
2285             iCoeffCntr < iCoeffLength;
2286             iCoeffCntr++)
2287         {
2288             X86_ASM (
2289                 "movd %0, %%mm1\n\t" \
2290                 "movd %1, %%mm2\n\t" \
2291                 "pfmul %%mm2, %%mm1\n\t" \
2292                 "pfadd %%mm1, %%mm0\n\t"
2293                 :
2294                 : "m" (fpSrc[iDataLength - 1 - iCoeffCntr]),
2295                   "m" (fpCoeff[iCoeffCntr])
2296                 : "mm0", "mm1", "mm2", "memory");
2297         }
2298         X86_ASM (
2299             "movd %%mm0, %0\n\t"
2300             : "=m" (fpDest[iDataLength - 1])
2301             :
2302             : "mm0", "memory");
2303     }
2304     X86_ASM (
2305         "femms\n\t" \
2306         "sfence\n\t");
2307 }
2308 
2309 
/*
    Single precision FIR filter using scalar SSE instructions.

    fpDest        destination vector, iDataLength floats are written
    fpSrc         source vector; element iSrcCntr - k is read for coefficient
                  k, with iSrcCntr running from iCoeffLength to
                  iDataLength + iCoeffLength - 1
    iDataLength   number of output samples
    fpCoeff       filter coefficients
    iCoeffLength  number of coefficients

    Output n is the sum over k of fpCoeff[k] * fpSrc[iCoeffLength + n - k].
    NOTE(review): fpSrc apparently has to hold iDataLength + iCoeffLength
    elements, with the history ahead of the new data - confirm with callers.
*/
void dsp_x86_sse_firf (float *fpDest, const float *fpSrc, int iDataLength,
    const float *fpCoeff, int iCoeffLength)
{
    int iDestCntr;
    int iSrcCntr;
    int iCoeffCntr;
    int iSrcCount;

    iDestCntr = 0;
    iSrcCount = iDataLength + iCoeffLength;
    for (iSrcCntr = iCoeffLength;
        iSrcCntr < iSrcCount;
        iSrcCntr++)
    {
        /* xmm0 = accumulator for the current output sample */
        X86_ASM (
            "xorps %%xmm0, %%xmm0\n\t"
            :
            :
            : "xmm0");
        for (iCoeffCntr = 0;
            iCoeffCntr < iCoeffLength;
            iCoeffCntr++)
        {
            /* xmm0 += source sample * coefficient */
            X86_ASM (
                "movss %0, %%xmm1\n\t"
                "mulss %1, %%xmm1\n\t"
                "addss %%xmm1, %%xmm0\n\t"
                :
                : "m" (fpSrc[iSrcCntr - iCoeffCntr]),
                  "m" (fpCoeff[iCoeffCntr])
                : "xmm0", "xmm1", "memory");
        }
        /* Store the finished output and advance the destination index */
        X86_ASM (
            "movss %%xmm0, %0\n\t"
            : "=m" (fpDest[iDestCntr++])
            :
            : "xmm0", "memory");
    }
}
2349 
2350 
/*
    Double precision FIR filter using scalar SSE2 instructions.

    dpDest        destination vector, iDataLength doubles are written
    dpSrc         source vector; element iSrcCntr - k is read for coefficient
                  k, with iSrcCntr running from iCoeffLength to
                  iDataLength + iCoeffLength - 1
    iDataLength   number of output samples
    dpCoeff       filter coefficients
    iCoeffLength  number of coefficients

    Output n is the sum over k of dpCoeff[k] * dpSrc[iCoeffLength + n - k].
    NOTE(review): dpSrc apparently has to hold iDataLength + iCoeffLength
    elements, with the history ahead of the new data - confirm with callers.
*/
void dsp_x86_sse_fir (double *dpDest, const double *dpSrc, int iDataLength,
    const double *dpCoeff, int iCoeffLength)
{
    int iDestCntr;
    int iSrcCntr;
    int iCoeffCntr;
    int iSrcCount;

    iDestCntr = 0;
    iSrcCount = iDataLength + iCoeffLength;
    for (iSrcCntr = iCoeffLength;
        iSrcCntr < iSrcCount;
        iSrcCntr++)
    {
        /* xmm0 = accumulator for the current output sample */
        X86_ASM (
            "xorpd %%xmm0, %%xmm0\n\t"
            :
            :
            : "xmm0");
        for (iCoeffCntr = 0;
            iCoeffCntr < iCoeffLength;
            iCoeffCntr++)
        {
            /* xmm0 += source sample * coefficient */
            X86_ASM (
                "movsd %0, %%xmm1\n\t"
                "mulsd %1, %%xmm1\n\t"
                "addsd %%xmm1, %%xmm0\n\t"
                :
                : "m" (dpSrc[iSrcCntr - iCoeffCntr]),
                  "m" (dpCoeff[iCoeffCntr])
                : "xmm0", "xmm1", "memory");
        }
        /* Store the finished output and advance the destination index */
        X86_ASM (
            "movsd %%xmm0, %0\n\t"
            : "=m" (dpDest[iDestCntr++])
            :
            : "xmm0", "memory");
    }
}
2390 
2391 
/*
    In-place single precision IIR filter step (five coefficients) using
    3DNow! instructions.

    fpVect       samples, filtered in place
    iDataLength  number of samples in fpVect
    fpCoeff      five coefficients, fpCoeff[0] .. fpCoeff[4]
    fpX          input history; fpX[0], fpX[1] are read, fpX[0] .. fpX[2]
                 are written on exit
    fpY          output history; fpY[0], fpY[1] are read and written

    Per sample the result is
        fpCoeff[0] * x + fpCoeff[1] * X1 + fpCoeff[2] * X0 +
        fpCoeff[3] * Y1 + fpCoeff[4] * Y0
    where (X0, X1) is the pair held in mm5 and (Y0, Y1) the pair held in mm7.
    After each sample the pairs become (X1, x) and (Y1, result).

    NOTE(review): this routine reads its input history from fpX[0]/fpX[1],
    while dsp_x86_sse_iirf reads it from fpX[1]/fpX[2] - confirm the two are
    not expected to share state.
    NOTE(review): pv2sf is declared elsewhere; presumably a pointer to a
    packed pair of floats - confirm against dsp/X86.h.
*/
void dsp_x86_3dnow_iirf (float *fpVect, int iDataLength, const float *fpCoeff,
    float *fpX, float *fpY)
{
    int iDataCntr;
    pv2sf m64pCoeff = (pv2sf) &fpCoeff[1];
    pv2sf m64pCoeff2 = (pv2sf) &fpCoeff[3];
    pv2sf m64pX = (pv2sf) fpX;
    pv2sf m64pY = (pv2sf) fpY;

    /* Load the constants and state that stay in registers for the loop:
       mm2 = swapped (fpCoeff[1], fpCoeff[2]), mm3 = fpCoeff[0],
       mm4 = swapped (fpCoeff[3], fpCoeff[4]), mm5 = X pair, mm7 = Y pair */
    X86_ASM (
        "movq %0, %%mm0\n\t" \
        "pswapd %%mm0, %%mm2\n\t" \
        "movd %1, %%mm3\n\t" \
        "movq %2, %%mm0\n\t" \
        "pswapd %%mm0, %%mm4\n\t" \
        "movq %3, %%mm5\n\t" \
        "movq %4, %%mm7\n\t" \
        :
        : "m" (*m64pCoeff),
          "m" (fpCoeff[0]),
          "m" (*m64pCoeff2),
          "m" (*m64pX),
          "m" (*m64pY)
        : "mm0", "mm2", "mm3", "mm4", "mm5", "mm7", "memory");
    for (iDataCntr = 0;
        iDataCntr < iDataLength;
        iDataCntr++)
    {
        /* mm6 = new input sample; mm0 collects the five products via pfacc,
           then both history pairs are shifted and the result is stored.
           The input operand is tied to output operand 0 ("m0"), so the
           sample is read from and written to the same location. */
        X86_ASM (
            "pxor %%mm0, %%mm0\n\t" \
            "movd %1, %%mm6\n\t" \
            "movq %%mm5, %%mm1\n\t" \
            "pfmul %%mm2, %%mm1\n\t" \
            "pfacc %%mm1, %%mm0\n\t" \
            "movq %%mm6, %%mm1\n\t" \
            "pfmul %%mm3, %%mm1\n\t" \
            "pfacc %%mm1, %%mm0\n\t" \
            "movq %%mm7, %%mm1\n\t" \
            "pfmul %%mm4, %%mm1\n\t" \
            "pfacc %%mm1, %%mm0\n\t" \
            "pfacc %%mm0, %%mm0\n\t" \
            \
            "pswapd %%mm7, %%mm1\n\t" \
            "movq %%mm1, %%mm7\n\t" \
            "punpckldq %%mm0, %%mm7\n\t" \
            \
            "pswapd %%mm5, %%mm1\n\t" \
            "movq %%mm1, %%mm5\n\t" \
            "movq %%mm6, %%mm1\n\t" \
            "punpckldq %%mm1, %%mm5\n\t" \
            \
            "movd %%mm0, %0\n\t"
            : "=m" (fpVect[iDataCntr])
            : "m0" (fpVect[iDataCntr])
            : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
    }
    /* Save the filter state for the next call.
       NOTE(review): mm6 is only loaded inside the loop, so fpX[2] receives
       whatever mm6 held when iDataLength <= 0 - confirm callers never pass
       an empty block. */
    X86_ASM (
        "movq %%mm5, %0\n\t" \
        "movd %%mm6, %1\n\t" \
        "movq %%mm7, %2\n\t"
        : "=m" (*m64pX),
          "=m" (fpX[2]),
          "=m" (*m64pY)
        :
        : "mm5", "mm6", "mm7", "memory");
    /* Leave MMX state so that x87 code can run again */
    X86_ASM ("femms\n\t");
}
2459 
2460 
/*
    In-place single precision IIR filter step (five coefficients) using
    scalar SSE instructions.

    fpVect       samples, filtered in place
    iDataLength  number of samples in fpVect
    fpCoeff      five coefficients, fpCoeff[0] .. fpCoeff[4]
    fpX          input history; fpX[1], fpX[2] are read, fpX[0] .. fpX[2]
                 are written on exit
    fpY          output history; fpY[0], fpY[1] are read and written

    Per sample, with x0/x1/x2 the oldest/previous/current input held in
    xmm0/xmm1/xmm2 and y0/y1 the older/newer output held in xmm3/xmm4:
        y = fpCoeff[0] * x2 + fpCoeff[1] * x1 + fpCoeff[2] * x0 +
            fpCoeff[3] * y1 + fpCoeff[4] * y0

    NOTE(review): the state registers are expected to survive between the
    separate asm statements - confirm for the targeted compilers/options.
*/
void dsp_x86_sse_iirf (float *fpVect, int iDataLength, const float *fpCoeff,
    float *fpX, float *fpY)
{
    int iDataCntr;

    /* Load the history into xmm1 .. xmm4 and prefetch the coefficients */
    X86_ASM (
        "movss %0, %%xmm1\n\t" \
        "movss %1, %%xmm2\n\t" \
        "movss %2, %%xmm3\n\t" \
        "movss %3, %%xmm4\n\t" \
        "prefetchnta %4\n\t"
        :
        : "m" (fpX[1]),
          "m" (fpX[2]),
          "m" (fpY[0]),
          "m" (fpY[1]),
          "m" (fpCoeff[0])
        : "xmm1", "xmm2", "xmm3", "xmm4", "memory");
    for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
    {
        /* Shift the input history, load the new sample, sum the three
           feed-forward and two feedback products, shift the output history
           and store the result. The input operand is tied to output operand
           0 ("m0"), so the sample is read from and written to the same
           location. */
        X86_ASM (
            "movss %%xmm1, %%xmm0\n\t" \
            "movss %%xmm2, %%xmm1\n\t" \
            "movss %1, %%xmm2\n\t" \
            \
            "movss %2, %%xmm5\n\t" \
            "mulss %%xmm2, %%xmm5\n\t" \
            "movss %3, %%xmm6\n\t" \
            "mulss %%xmm1, %%xmm6\n\t" \
            "addss %%xmm6, %%xmm5\n\t" \
            "movss %4, %%xmm6\n\t" \
            "mulss %%xmm0, %%xmm6\n\t" \
            "addss %%xmm6, %%xmm5\n\t" \
            \
            "movss %5, %%xmm6\n\t" \
            "mulss %%xmm4, %%xmm6\n\t" \
            "movss %6, %%xmm7\n\t" \
            "mulss %%xmm3, %%xmm7\n\t" \
            "addss %%xmm7, %%xmm6\n\t" \
            \
            "addss %%xmm5, %%xmm6\n\t" \
            "movss %%xmm4, %%xmm3\n\t" \
            "movss %%xmm6, %%xmm4\n\t" \
            \
            "movss %%xmm6, %0\n\t"
            : "=m" (fpVect[iDataCntr])
            : "m0" (fpVect[iDataCntr]),
              "m" (fpCoeff[0]),
              "m" (fpCoeff[1]),
              "m" (fpCoeff[2]),
              "m" (fpCoeff[3]),
              "m" (fpCoeff[4])
            : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
              "memory");
    }
    /* Save the filter state for the next call.
       NOTE(review): xmm0 is only written inside the loop, so fpX[0] receives
       whatever xmm0 held when iDataLength <= 0 - confirm callers never pass
       an empty block. */
    X86_ASM (
        "movss %%xmm0, %0\n\t" \
        "movss %%xmm1, %1\n\t" \
        "movss %%xmm2, %2\n\t" \
        "movss %%xmm3, %3\n\t" \
        "movss %%xmm4, %4\n\t"
        : "=m" (fpX[0]),
          "=m" (fpX[1]),
          "=m" (fpX[2]),
          "=m" (fpY[0]),
          "=m" (fpY[1])
        :
        : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "memory");
}
2530 
2531 
/*
    In-place double precision IIR filter step (five coefficients) using
    scalar SSE2 instructions.

    dpVect       samples, filtered in place
    iDataLength  number of samples in dpVect
    dpCoeff      five coefficients, dpCoeff[0] .. dpCoeff[4]
    dpX          input history; dpX[1], dpX[2] are read, dpX[0] .. dpX[2]
                 are written on exit
    dpY          output history; dpY[0], dpY[1] are read and written

    Per sample, with x0/x1/x2 the oldest/previous/current input held in
    xmm0/xmm1/xmm2 and y0/y1 the older/newer output held in xmm3/xmm4:
        y = dpCoeff[0] * x2 + dpCoeff[1] * x1 + dpCoeff[2] * x0 +
            dpCoeff[3] * y1 + dpCoeff[4] * y0

    NOTE(review): the state registers are expected to survive between the
    separate asm statements - confirm for the targeted compilers/options.
*/
void dsp_x86_sse_iir (double *dpVect, int iDataLength, const double *dpCoeff,
    double *dpX, double *dpY)
{
    int iDataCntr;

    /* Load the history into xmm1 .. xmm4 and prefetch the coefficients
       (two prefetches, at dpCoeff[0] and dpCoeff[3]) */
    X86_ASM (
        "movsd %0, %%xmm1\n\t" \
        "movsd %1, %%xmm2\n\t" \
        "movsd %2, %%xmm3\n\t" \
        "movsd %3, %%xmm4\n\t" \
        "prefetchnta %4\n\t" \
        "prefetchnta %5\n\t"
        :
        : "m" (dpX[1]),
          "m" (dpX[2]),
          "m" (dpY[0]),
          "m" (dpY[1]),
          "m" (dpCoeff[0]),
          "m" (dpCoeff[3])
        : "xmm1", "xmm2", "xmm3", "xmm4", "memory");
    for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
    {
        /* Shift the input history, load the new sample, sum the three
           feed-forward and two feedback products, shift the output history
           and store the result. The input operand is tied to output operand
           0 ("m0"), so the sample is read from and written to the same
           location. */
        X86_ASM (
            "movsd %%xmm1, %%xmm0\n\t" \
            "movsd %%xmm2, %%xmm1\n\t" \
            "movsd %1, %%xmm2\n\t" \
            \
            "movsd %2, %%xmm5\n\t" \
            "mulsd %%xmm2, %%xmm5\n\t" \
            "movsd %3, %%xmm6\n\t" \
            "mulsd %%xmm1, %%xmm6\n\t" \
            "addsd %%xmm6, %%xmm5\n\t" \
            "movsd %4, %%xmm6\n\t" \
            "mulsd %%xmm0, %%xmm6\n\t" \
            "addsd %%xmm6, %%xmm5\n\t" \
            \
            "movsd %5, %%xmm6\n\t" \
            "mulsd %%xmm4, %%xmm6\n\t" \
            "movsd %6, %%xmm7\n\t" \
            "mulsd %%xmm3, %%xmm7\n\t" \
            "addsd %%xmm7, %%xmm6\n\t" \
            \
            "addsd %%xmm5, %%xmm6\n\t" \
            "movsd %%xmm4, %%xmm3\n\t" \
            "movsd %%xmm6, %%xmm4\n\t" \
            \
            "movsd %%xmm6, %0\n\t"
            : "=m" (dpVect[iDataCntr])
            : "m0" (dpVect[iDataCntr]),
              "m" (dpCoeff[0]),
              "m" (dpCoeff[1]),
              "m" (dpCoeff[2]),
              "m" (dpCoeff[3]),
              "m" (dpCoeff[4])
            : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
              "memory");
    }
    /* Save the filter state for the next call.
       NOTE(review): xmm0 is only written inside the loop, so dpX[0] receives
       whatever xmm0 held when iDataLength <= 0 - confirm callers never pass
       an empty block. */
    X86_ASM (
        "movsd %%xmm0, %0\n\t" \
        "movsd %%xmm1, %1\n\t" \
        "movsd %%xmm2, %2\n\t" \
        "movsd %%xmm3, %3\n\t" \
        "movsd %%xmm4, %4\n\t"
        : "=m" (dpX[0]),
          "=m" (dpX[1]),
          "=m" (dpX[2]),
          "=m" (dpY[0]),
          "=m" (dpY[1])
        :
        : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "memory");
}
2603 
2604 
/*
    Single precision IIR filter step (five coefficients) with separate
    source and destination vectors, using 3DNow! instructions.

    fpDest       destination vector, iDataLength floats are written
    fpSrc        source vector, iDataLength floats are read
    iDataLength  number of samples
    fpCoeff      five coefficients, fpCoeff[0] .. fpCoeff[4]
    fpX          input history; fpX[0], fpX[1] are read, fpX[0] .. fpX[2]
                 are written on exit
    fpY          output history; fpY[0], fpY[1] are read and written

    Per sample the result is
        fpCoeff[0] * x + fpCoeff[1] * X1 + fpCoeff[2] * X0 +
        fpCoeff[3] * Y1 + fpCoeff[4] * Y0
    where (X0, X1) is the pair held in mm5 and (Y0, Y1) the pair held in mm7.
    After each sample the pairs become (X1, x) and (Y1, result).

    NOTE(review): this routine reads its input history from fpX[0]/fpX[1],
    while dsp_x86_sse_iirf_nip reads it from fpX[1]/fpX[2] - confirm the two
    are not expected to share state.
    NOTE(review): pv2sf is declared elsewhere; presumably a pointer to a
    packed pair of floats - confirm against dsp/X86.h.
*/
void dsp_x86_3dnow_iirf_nip (float *fpDest, const float *fpSrc, int iDataLength,
    const float *fpCoeff, float *fpX, float *fpY)
{
    int iDataCntr;
    pv2sf m64pCoeff = (pv2sf) &fpCoeff[1];
    pv2sf m64pCoeff2 = (pv2sf) &fpCoeff[3];
    pv2sf m64pX = (pv2sf) fpX;
    pv2sf m64pY = (pv2sf) fpY;

    /* Load the constants and state that stay in registers for the loop:
       mm2 = swapped (fpCoeff[1], fpCoeff[2]), mm3 = fpCoeff[0],
       mm4 = swapped (fpCoeff[3], fpCoeff[4]), mm5 = X pair, mm7 = Y pair */
    X86_ASM (
        "movq %0, %%mm0\n\t" \
        "pswapd %%mm0, %%mm2\n\t" \
        "movd %1, %%mm3\n\t" \
        "movq %2, %%mm0\n\t" \
        "pswapd %%mm0, %%mm4\n\t" \
        "movq %3, %%mm5\n\t" \
        "movq %4, %%mm7\n\t" \
        :
        : "m" (*m64pCoeff),
          "m" (fpCoeff[0]),
          "m" (*m64pCoeff2),
          "m" (*m64pX),
          "m" (*m64pY)
        : "mm0", "mm2", "mm3", "mm4", "mm5", "mm7", "memory");
    for (iDataCntr = 0;
        iDataCntr < iDataLength;
        iDataCntr++)
    {
        /* mm6 = new input sample; mm0 collects the five products via pfacc,
           then both history pairs are shifted and the result is stored */
        X86_ASM (
            "pxor %%mm0, %%mm0\n\t" \
            "movd %1, %%mm6\n\t" \
            "movq %%mm5, %%mm1\n\t" \
            "pfmul %%mm2, %%mm1\n\t" \
            "pfacc %%mm1, %%mm0\n\t" \
            "movq %%mm6, %%mm1\n\t" \
            "pfmul %%mm3, %%mm1\n\t" \
            "pfacc %%mm1, %%mm0\n\t" \
            "movq %%mm7, %%mm1\n\t" \
            "pfmul %%mm4, %%mm1\n\t" \
            "pfacc %%mm1, %%mm0\n\t" \
            "pfacc %%mm0, %%mm0\n\t" \
            \
            "pswapd %%mm7, %%mm1\n\t" \
            "movq %%mm1, %%mm7\n\t" \
            "punpckldq %%mm0, %%mm7\n\t" \
            \
            "pswapd %%mm5, %%mm1\n\t" \
            "movq %%mm1, %%mm5\n\t" \
            "movq %%mm6, %%mm1\n\t" \
            "punpckldq %%mm1, %%mm5\n\t" \
            \
            "movd %%mm0, %0\n\t"
            : "=m" (fpDest[iDataCntr])
            : "m" (fpSrc[iDataCntr])
            : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
    }
    /* Save the filter state for the next call.
       NOTE(review): mm6 is only loaded inside the loop, so fpX[2] receives
       whatever mm6 held when iDataLength <= 0 - confirm callers never pass
       an empty block. */
    X86_ASM (
        "movq %%mm5, %0\n\t" \
        "movd %%mm6, %1\n\t" \
        "movq %%mm7, %2\n\t"
        : "=m" (*m64pX),
          "=m" (fpX[2]),
          "=m" (*m64pY)
        :
        : "mm5", "mm6", "mm7", "memory");
    /* Leave MMX state so that x87 code can run again */
    X86_ASM ("femms\n\t");
}
2672 
2673 
/*
    Single precision IIR filter step (five coefficients) with separate
    source and destination vectors, using scalar SSE instructions.

    fpDest       destination vector, iDataLength floats are written
    fpSrc        source vector, iDataLength floats are read
    iDataLength  number of samples
    fpCoeff      five coefficients, fpCoeff[0] .. fpCoeff[4]
    fpX          input history; fpX[1], fpX[2] are read, fpX[0] .. fpX[2]
                 are written on exit
    fpY          output history; fpY[0], fpY[1] are read and written

    Per sample, with x0/x1/x2 the oldest/previous/current input held in
    xmm0/xmm1/xmm2 and y0/y1 the older/newer output held in xmm3/xmm4:
        y = fpCoeff[0] * x2 + fpCoeff[1] * x1 + fpCoeff[2] * x0 +
            fpCoeff[3] * y1 + fpCoeff[4] * y0

    NOTE(review): the state registers are expected to survive between the
    separate asm statements - confirm for the targeted compilers/options.
*/
void dsp_x86_sse_iirf_nip (float *fpDest, const float *fpSrc, int iDataLength,
    const float *fpCoeff, float *fpX, float *fpY)
{
    int iDataCntr;

    /* Load the history into xmm1 .. xmm4 and prefetch the coefficients */
    X86_ASM (
        "movss %0, %%xmm1\n\t" \
        "movss %1, %%xmm2\n\t" \
        "movss %2, %%xmm3\n\t" \
        "movss %3, %%xmm4\n\t" \
        "prefetchnta %4\n\t"
        :
        : "m" (fpX[1]),
          "m" (fpX[2]),
          "m" (fpY[0]),
          "m" (fpY[1]),
          "m" (fpCoeff[0])
        : "xmm1", "xmm2", "xmm3", "xmm4", "memory");
    for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
    {
        /* Shift the input history, load the new sample, sum the three
           feed-forward and two feedback products, shift the output history
           and store the result */
        X86_ASM (
            "movss %%xmm1, %%xmm0\n\t" \
            "movss %%xmm2, %%xmm1\n\t" \
            "movss %1, %%xmm2\n\t" \
            \
            "movss %2, %%xmm5\n\t" \
            "mulss %%xmm2, %%xmm5\n\t" \
            "movss %3, %%xmm6\n\t" \
            "mulss %%xmm1, %%xmm6\n\t" \
            "addss %%xmm6, %%xmm5\n\t" \
            "movss %4, %%xmm6\n\t" \
            "mulss %%xmm0, %%xmm6\n\t" \
            "addss %%xmm6, %%xmm5\n\t" \
            \
            "movss %5, %%xmm6\n\t" \
            "mulss %%xmm4, %%xmm6\n\t" \
            "movss %6, %%xmm7\n\t" \
            "mulss %%xmm3, %%xmm7\n\t" \
            "addss %%xmm7, %%xmm6\n\t" \
            \
            "addss %%xmm5, %%xmm6\n\t" \
            "movss %%xmm4, %%xmm3\n\t" \
            "movss %%xmm6, %%xmm4\n\t" \
            \
            "movss %%xmm6, %0\n\t"
            : "=m" (fpDest[iDataCntr])
            : "m" (fpSrc[iDataCntr]),
              "m" (fpCoeff[0]),
              "m" (fpCoeff[1]),
              "m" (fpCoeff[2]),
              "m" (fpCoeff[3]),
              "m" (fpCoeff[4])
            : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
              "memory");
    }
    /* Save the filter state for the next call.
       NOTE(review): xmm0 is only written inside the loop, so fpX[0] receives
       whatever xmm0 held when iDataLength <= 0 - confirm callers never pass
       an empty block. */
    X86_ASM (
        "movss %%xmm0, %0\n\t" \
        "movss %%xmm1, %1\n\t" \
        "movss %%xmm2, %2\n\t" \
        "movss %%xmm3, %3\n\t" \
        "movss %%xmm4, %4\n\t"
        : "=m" (fpX[0]),
          "=m" (fpX[1]),
          "=m" (fpX[2]),
          "=m" (fpY[0]),
          "=m" (fpY[1])
        :
        : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "memory");
}
2743 
2744 
/*
    Double precision IIR filter step (five coefficients) with separate
    source and destination vectors, using scalar SSE2 instructions.

    dpDest       destination vector, iDataLength doubles are written
    dpSrc        source vector, iDataLength doubles are read
    iDataLength  number of samples
    dpCoeff      five coefficients, dpCoeff[0] .. dpCoeff[4]
    dpX          input history; dpX[1], dpX[2] are read, dpX[0] .. dpX[2]
                 are written on exit
    dpY          output history; dpY[0], dpY[1] are read and written

    Per sample, with x0/x1/x2 the oldest/previous/current input held in
    xmm0/xmm1/xmm2 and y0/y1 the older/newer output held in xmm3/xmm4:
        y = dpCoeff[0] * x2 + dpCoeff[1] * x1 + dpCoeff[2] * x0 +
            dpCoeff[3] * y1 + dpCoeff[4] * y0

    NOTE(review): the state registers are expected to survive between the
    separate asm statements - confirm for the targeted compilers/options.
*/
void dsp_x86_sse_iir_nip (double *dpDest, const double *dpSrc, int iDataLength,
    const double *dpCoeff, double *dpX, double *dpY)
{
    int iDataCntr;

    /* Load the history into xmm1 .. xmm4 and prefetch the coefficients
       (two prefetches, at dpCoeff[0] and dpCoeff[3]) */
    X86_ASM (
        "movsd %0, %%xmm1\n\t" \
        "movsd %1, %%xmm2\n\t" \
        "movsd %2, %%xmm3\n\t" \
        "movsd %3, %%xmm4\n\t" \
        "prefetchnta %4\n\t" \
        "prefetchnta %5\n\t"
        :
        : "m" (dpX[1]),
          "m" (dpX[2]),
          "m" (dpY[0]),
          "m" (dpY[1]),
          "m" (dpCoeff[0]),
          "m" (dpCoeff[3])
        : "xmm1", "xmm2", "xmm3", "xmm4", "memory");
    for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
    {
        /* Shift the input history, load the new sample, sum the three
           feed-forward and two feedback products, shift the output history
           and store the result */
        X86_ASM (
            "movsd %%xmm1, %%xmm0\n\t" \
            "movsd %%xmm2, %%xmm1\n\t" \
            "movsd %1, %%xmm2\n\t" \
            \
            "movsd %2, %%xmm5\n\t" \
            "mulsd %%xmm2, %%xmm5\n\t" \
            "movsd %3, %%xmm6\n\t" \
            "mulsd %%xmm1, %%xmm6\n\t" \
            "addsd %%xmm6, %%xmm5\n\t" \
            "movsd %4, %%xmm6\n\t" \
            "mulsd %%xmm0, %%xmm6\n\t" \
            "addsd %%xmm6, %%xmm5\n\t" \
            \
            "movsd %5, %%xmm6\n\t" \
            "mulsd %%xmm4, %%xmm6\n\t" \
            "movsd %6, %%xmm7\n\t" \
            "mulsd %%xmm3, %%xmm7\n\t" \
            "addsd %%xmm7, %%xmm6\n\t" \
            \
            "addsd %%xmm5, %%xmm6\n\t" \
            "movsd %%xmm4, %%xmm3\n\t" \
            "movsd %%xmm6, %%xmm4\n\t" \
            \
            "movsd %%xmm6, %0\n\t"
            : "=m" (dpDest[iDataCntr])
            : "m" (dpSrc[iDataCntr]),
              "m" (dpCoeff[0]),
              "m" (dpCoeff[1]),
              "m" (dpCoeff[2]),
              "m" (dpCoeff[3]),
              "m" (dpCoeff[4])
            : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
              "memory");
    }
    /* Save the filter state for the next call.
       NOTE(review): xmm0 is only written inside the loop, so dpX[0] receives
       whatever xmm0 held when iDataLength <= 0 - confirm callers never pass
       an empty block. */
    X86_ASM (
        "movsd %%xmm0, %0\n\t" \
        "movsd %%xmm1, %1\n\t" \
        "movsd %%xmm2, %2\n\t" \
        "movsd %%xmm3, %3\n\t" \
        "movsd %%xmm4, %4\n\t"
        : "=m" (dpX[0]),
          "=m" (dpX[1]),
          "=m" (dpX[2]),
          "=m" (dpY[0]),
          "=m" (dpY[1])
        :
        : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "memory");
}
2816 
2817 
2818 #ifdef __cplusplus
2819 }
2820 #endif
2821 
2822 #endif
2823