1 /*
2
3 x86 specific optimized assembler dsp routines
4 Copyright (C) 2001-2005 Jussi Laako
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
20 */
21
22
23 #ifdef DSP_X86
24
25
26 #include <stdio.h>
27 #include <string.h>
28 #include <limits.h>
29 #include <math.h>
30 #include <float.h>
31
32 #include "dsp/X86.h"
33
34
35 #ifndef DSP_X86_64
36 static char cpCPUid[13];
37 #endif
38
39
40 #ifdef __cplusplus
41 extern "C"
42 {
43 #endif
44
45
46 #ifndef DSP_X86_64
dsp_x86_cpuid()47 const char *dsp_x86_cpuid ()
48 {
49 unsigned int *ipCPUid = (unsigned int *) cpCPUid;
50
51 X86_ASM (
52 "pushl %%ebx\n\t" \
53 "xorl %%eax, %%eax\n\t" \
54 "cpuid\n\t" \
55 "movl %%ebx, %0\n\t" \
56 "movl %%ecx, %2\n\t" \
57 "movl %%edx, %1\n\t" \
58 "popl %%ebx\n\t" \
59 : "=m" (ipCPUid[0]),
60 "=m" (ipCPUid[1]),
61 "=m" (ipCPUid[2])
62 :
63 : "eax", "ecx", "edx", "memory");
64 cpCPUid[12] = '\0';
65
66 return cpCPUid;
67 }
68
69
dsp_x86_features()70 unsigned int dsp_x86_features ()
71 {
72 unsigned int uiFeatures = 0;
73
74 X86_ASM (
75 "pushl %%ebx\n\t" \
76 "movl $1, %%eax\n\t" \
77 "cpuid\n\t" \
78 "movl %%edx, %0\n\t" \
79 "popl %%ebx\n\t" \
80 : "=m" (uiFeatures)
81 :
82 : "eax", "ecx", "edx", "memory");
83
84 return uiFeatures;
85 }
86
87
dsp_x86_amd_features()88 unsigned int dsp_x86_amd_features ()
89 {
90 unsigned int uiFunction = 0x80000001;
91 unsigned int uiFeatures = 0;
92
93 X86_ASM (
94 "pushl %%ebx\n\t" \
95 "movl %1, %%eax\n\t" \
96 "cpuid\n\t" \
97 "movl %%edx, %0\n\t" \
98 "popl %%ebx\n\t" \
99 : "=m" (uiFeatures)
100 : "m" (uiFunction)
101 : "eax", "ecx", "edx", "memory");
102
103 return uiFeatures;
104 }
105 #endif
106
107
dsp_x86_have_e3dnow()108 extern int dsp_x86_have_e3dnow ()
109 {
110 #ifndef DSP_X86_64
111 unsigned int uiExtSup = 0;
112 unsigned int uiFeatures;
113
114 X86_ASM (
115 "pushl %%ebx\n\t" \
116 "movl $0x80000000, %%eax\n\t" \
117 "cpuid\n\t" \
118 "cmpl $0x80000001, %%eax\n\t" \
119 "jl have3dnowxit\n\t" \
120 "movl $1, %0\n\t" \
121 "have3dnowxit:\n\t" \
122 "popl %%ebx\n\t"
123 : "=m" (uiExtSup)
124 :
125 : "eax", "ecx", "edx", "memory");
126 if (uiExtSup)
127 {
128 uiFeatures = dsp_x86_amd_features();
129 if ((uiFeatures & (1 << 31)) && (uiFeatures & (1 << 30)))
130 return 1;
131 }
132 return 0;
133 #else
134 return 1;
135 #endif
136 }
137
138
/*
 * Report whether both bit 25 and bit 26 of the CPUID leaf-1 EDX flags are
 * set (the SSE and SSE2 feature bits).  Returns 1 if so, 0 otherwise;
 * always 1 when built with DSP_X86_64.
 */
extern int dsp_x86_have_sse2 ()
{
#ifndef DSP_X86_64
    const unsigned int uiEdx = dsp_x86_features();
    const int iBit25 = (uiEdx & (1 << 25)) != 0;
    const int iBit26 = (uiEdx & (1 << 26)) != 0;

    return (iBit25 && iBit26) ? 1 : 0;
#else
    return 1;
#endif
}
152
153
154 // --- inline code snippets
155
156
dsp_x86_prefetchntf_init(const float * fpSrc)157 inline void dsp_x86_prefetchntf_init (const float *fpSrc)
158 {
159 stpm64 m64pSrc = (stpm64) fpSrc;
160
161 X86_ASM (
162 "prefetchnta %0\n\t" \
163 "prefetchnta %1\n\t" \
164 "prefetchnta %2\n\t" \
165 "prefetchnta %3\n\t"
166 :
167 : "m" (m64pSrc[0]),
168 "m" (m64pSrc[8]),
169 "m" (m64pSrc[16]),
170 "m" (m64pSrc[24]));
171 }
172
173
dsp_x86_prefetchnt_init(const double * dpSrc)174 inline void dsp_x86_prefetchnt_init (const double *dpSrc)
175 {
176 stpm64 m64pSrc = (stpm64) dpSrc;
177
178 X86_ASM (
179 "prefetchnta %0\n\t" \
180 "prefetchnta %1\n\t" \
181 "prefetchnta %2\n\t" \
182 "prefetchnta %3\n\t"
183 :
184 : "m" (m64pSrc[0]),
185 "m" (m64pSrc[8]),
186 "m" (m64pSrc[16]),
187 "m" (m64pSrc[24]));
188 }
189
190
dsp_x86_prefetchtf_init(const float * fpSrc)191 inline void dsp_x86_prefetchtf_init (const float *fpSrc)
192 {
193 stpm64 m64pSrc = (stpm64) fpSrc;
194
195 X86_ASM (
196 "prefetcht0 %0\n\t" \
197 "prefetcht0 %1\n\t" \
198 "prefetcht0 %2\n\t" \
199 "prefetcht0 %3\n\t"
200 :
201 : "m" (m64pSrc[0]),
202 "m" (m64pSrc[8]),
203 "m" (m64pSrc[16]),
204 "m" (m64pSrc[24]));
205 }
206
207
dsp_x86_prefetcht_init(const double * dpSrc)208 inline void dsp_x86_prefetcht_init (const double *dpSrc)
209 {
210 stpm64 m64pSrc = (stpm64) dpSrc;
211
212 X86_ASM (
213 "prefetcht0 %0\n\t" \
214 "prefetcht0 %1\n\t" \
215 "prefetcht0 %2\n\t" \
216 "prefetcht0 %3\n\t"
217 :
218 : "m" (m64pSrc[0]),
219 "m" (m64pSrc[8]),
220 "m" (m64pSrc[16]),
221 "m" (m64pSrc[24]));
222 }
223
224
dsp_x86_prefetchntf_next(const float * fpSrc)225 inline void dsp_x86_prefetchntf_next (const float *fpSrc)
226 {
227 stpm64 m64pSrc = (stpm64) fpSrc;
228
229 X86_ASM (
230 "prefetchnta %0\n\t"
231 :
232 : "m" (m64pSrc[32]));
233 }
234
235
dsp_x86_prefetchnt_next(const double * dpSrc)236 inline void dsp_x86_prefetchnt_next (const double *dpSrc)
237 {
238 stpm64 m64pSrc = (stpm64) dpSrc;
239
240 X86_ASM (
241 "prefetchnta %0\n\t"
242 :
243 : "m" (m64pSrc[32]));
244 }
245
246
dsp_x86_prefetchtf_next(const float * fpSrc)247 inline void dsp_x86_prefetchtf_next (const float *fpSrc)
248 {
249 stpm64 m64pSrc = (stpm64) fpSrc;
250
251 X86_ASM (
252 "prefetcht0 %0\n\t"
253 :
254 : "m" (m64pSrc[32]));
255 }
256
257
dsp_x86_prefetcht_next(const double * dpSrc)258 inline void dsp_x86_prefetcht_next (const double *dpSrc)
259 {
260 stpm64 m64pSrc = (stpm64) dpSrc;
261
262 X86_ASM (
263 "prefetcht0 %0\n\t"
264 :
265 : "m" (m64pSrc[32]));
266 }
267
268
269 // ---
270
271
dsp_x86_3dnow_copyf(float * fpDest,const float * fpSrc,int iDataLength)272 void dsp_x86_3dnow_copyf (float *fpDest, const float *fpSrc, int iDataLength)
273 {
274 int iStartIdx;
275 int iDataCntr;
276 int iDataCount;
277 pv2sf m64pDest = (pv2sf) fpDest;
278 pv2sf m64pSrc = (pv2sf) fpSrc;
279
280 iStartIdx = 0;
281 X86_ASM (
282 "prefetchnta %0\n\t" \
283 "prefetchnta %1\n\t" \
284 "prefetchnta %2\n\t" \
285 "prefetchnta %3\n\t"
286 :
287 : "m" (m64pSrc[0]),
288 "m" (m64pSrc[8]),
289 "m" (m64pSrc[16]),
290 "m" (m64pSrc[24]));
291 iDataCount = ((iDataLength & 0xfffffff0) >> 1);
292 for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr += 8)
293 {
294 X86_ASM (
295 "prefetchnta %16\n\t" \
296 "movq %8, %%mm0\n\t" \
297 "movq %9, %%mm1\n\t" \
298 "movq %10, %%mm2\n\t" \
299 "movq %11, %%mm3\n\t" \
300 "movq %12, %%mm4\n\t" \
301 "movq %13, %%mm5\n\t" \
302 "movq %14, %%mm6\n\t" \
303 "movq %15, %%mm7\n\t" \
304 "movntq %%mm0, %0\n\t" \
305 "movntq %%mm1, %1\n\t" \
306 "movntq %%mm2, %2\n\t" \
307 "movntq %%mm3, %3\n\t" \
308 "movntq %%mm4, %4\n\t" \
309 "movntq %%mm5, %5\n\t" \
310 "movntq %%mm6, %6\n\t" \
311 "movntq %%mm7, %7\n\t"
312 : "=m" (m64pDest[iDataCntr]),
313 "=m" (m64pDest[iDataCntr + 1]),
314 "=m" (m64pDest[iDataCntr + 2]),
315 "=m" (m64pDest[iDataCntr + 3]),
316 "=m" (m64pDest[iDataCntr + 4]),
317 "=m" (m64pDest[iDataCntr + 5]),
318 "=m" (m64pDest[iDataCntr + 6]),
319 "=m" (m64pDest[iDataCntr + 7])
320 : "m" (m64pSrc[iDataCntr]),
321 "m" (m64pSrc[iDataCntr + 1]),
322 "m" (m64pSrc[iDataCntr + 2]),
323 "m" (m64pSrc[iDataCntr + 3]),
324 "m" (m64pSrc[iDataCntr + 4]),
325 "m" (m64pSrc[iDataCntr + 5]),
326 "m" (m64pSrc[iDataCntr + 6]),
327 "m" (m64pSrc[iDataCntr + 7]),
328 "m" (m64pSrc[iDataCntr + 32])
329 : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
330 }
331 iStartIdx = iDataCount;
332 iDataCount = ((iDataLength & 0xfffffffe) >> 1);
333 for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr++)
334 {
335 X86_ASM (
336 "prefetchnta %2\n\t" \
337 "movq %1, %%mm0\n\t" \
338 "movntq %%mm0, %0\n\t"
339 : "=m" (m64pDest[iDataCntr])
340 : "m" (m64pSrc[iDataCntr]),
341 "m" (m64pSrc[iDataCntr + 32])
342 : "mm0", "memory");
343 }
344 if (iDataLength & 0x1)
345 {
346 X86_ASM (
347 "movd %1, %%mm0\n\t" \
348 "movd %%mm0, %0\n\t"
349 : "=m" (fpDest[iDataLength - 1])
350 : "m" (fpSrc[iDataLength - 1])
351 : "mm0", "memory");
352 }
353 X86_ASM (
354 "femms\n\t" \
355 "sfence\n\t");
356 }
357
358
dsp_x86_3dnow_copyd(double * dpDest,const double * dpSrc,int iDataLength)359 void dsp_x86_3dnow_copyd (double *dpDest, const double *dpSrc, int iDataLength)
360 {
361 int iStartIdx;
362 int iDataCntr;
363 int iDataCount;
364
365 iStartIdx = 0;
366 X86_ASM (
367 "prefetchnta %0\n\t" \
368 "prefetchnta %1\n\t" \
369 "prefetchnta %2\n\t" \
370 "prefetchnta %3\n\t"
371 :
372 : "m" (dpSrc[0]),
373 "m" (dpSrc[8]),
374 "m" (dpSrc[16]),
375 "m" (dpSrc[24]));
376 iDataCount = (iDataLength & 0xfffffff8);
377 for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr += 8)
378 {
379 X86_ASM (
380 "prefetchnta %16\n\t" \
381 "movq %8, %%mm0\n\t" \
382 "movq %9, %%mm1\n\t" \
383 "movq %10, %%mm2\n\t" \
384 "movq %11, %%mm3\n\t" \
385 "movq %12, %%mm4\n\t" \
386 "movq %13, %%mm5\n\t" \
387 "movq %14, %%mm6\n\t" \
388 "movq %15, %%mm7\n\t" \
389 "movntq %%mm0, %0\n\t" \
390 "movntq %%mm1, %1\n\t" \
391 "movntq %%mm2, %2\n\t" \
392 "movntq %%mm3, %3\n\t" \
393 "movntq %%mm4, %4\n\t" \
394 "movntq %%mm5, %5\n\t" \
395 "movntq %%mm6, %6\n\t" \
396 "movntq %%mm7, %7\n\t"
397 : "=m" (dpDest[iDataCntr]),
398 "=m" (dpDest[iDataCntr + 1]),
399 "=m" (dpDest[iDataCntr + 2]),
400 "=m" (dpDest[iDataCntr + 3]),
401 "=m" (dpDest[iDataCntr + 4]),
402 "=m" (dpDest[iDataCntr + 5]),
403 "=m" (dpDest[iDataCntr + 6]),
404 "=m" (dpDest[iDataCntr + 7])
405 : "m" (dpSrc[iDataCntr]),
406 "m" (dpSrc[iDataCntr + 1]),
407 "m" (dpSrc[iDataCntr + 2]),
408 "m" (dpSrc[iDataCntr + 3]),
409 "m" (dpSrc[iDataCntr + 4]),
410 "m" (dpSrc[iDataCntr + 5]),
411 "m" (dpSrc[iDataCntr + 6]),
412 "m" (dpSrc[iDataCntr + 7]),
413 "m" (dpSrc[iDataCntr + 32])
414 : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
415 }
416 iStartIdx = iDataCount;
417 iDataCount = iDataLength;
418 for (iDataCntr = iStartIdx; iDataCntr < iDataCount; iDataCntr++)
419 {
420 X86_ASM (
421 "prefetchnta %2\n\t" \
422 "movq %1, %%mm0\n\t" \
423 "movntq %%mm0, %0\n\t"
424 : "=m" (dpDest[iDataCntr])
425 : "m" (dpSrc[iDataCntr]),
426 "m" (dpSrc[iDataCntr + 32])
427 : "mm0", "memory");
428 }
429 X86_ASM (
430 "femms\n\t" \
431 "sfence\n\t");
432 }
433
434
dsp_x86_3dnow_addf(float * fpVect,float fSrc,int iDataLength)435 void dsp_x86_3dnow_addf (float *fpVect, float fSrc, int iDataLength)
436 {
437 int iDataCntr;
438 int iDataCount;
439 pv2sf m64pVect = (pv2sf) fpVect;
440 stm64 m64Src;
441
442 m64Src.f[0] = m64Src.f[1] = fSrc;
443 iDataCount = (iDataLength >> 1);
444 X86_ASM (
445 "movq %0, %%mm1\n\t"
446 :
447 : "m" (m64Src)
448 : "mm1", "memory");
449 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
450 {
451 X86_ASM (
452 "movq %1, %%mm0\n\t" \
453 "pfadd %%mm1, %%mm0\n\t" \
454 "movntq %%mm0, %0\n\t"
455 : "=m" (m64pVect[iDataCntr])
456 : "m0" (m64pVect[iDataCntr])
457 : "mm0", "mm1", "memory");
458 }
459 if (iDataLength & 0x1)
460 {
461 X86_ASM (
462 "movd %1, %%mm0\n\t" \
463 "pfadd %%mm1, %%mm0\n\t" \
464 "movd %%mm0, %0\n\t"
465 : "=m" (fpVect[iDataLength - 1])
466 : "m0" (fpVect[iDataLength - 1])
467 : "mm0", "mm1", "memory");
468 }
469 X86_ASM (
470 "femms\n\t" \
471 "sfence\n\t");
472 }
473
474
dsp_x86_sse_addf(float * fpVect,float fSrc,int iDataLength)475 void dsp_x86_sse_addf (float *fpVect, float fSrc, int iDataLength)
476 {
477 int iDataCntr;
478
479 X86_ASM (
480 "movss %0, %%xmm1\n\t"
481 :
482 : "m" (fSrc)
483 : "xmm1", "memory");
484 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
485 {
486 X86_ASM (
487 "movss %1, %%xmm0\n\t" \
488 "addss %%xmm1, %%xmm0\n\t" \
489 "movss %%xmm0, %0\n\t"
490 : "=m" (fpVect[iDataCntr])
491 : "m0" (fpVect[iDataCntr])
492 : "xmm0", "xmm1", "memory");
493 }
494 }
495
496
dsp_x86_sse_add(double * dpVect,double dSrc,int iDataLength)497 void dsp_x86_sse_add (double *dpVect, double dSrc, int iDataLength)
498 {
499 int iDataCntr;
500
501 X86_ASM (
502 "movsd %0, %%xmm1\n\t"
503 :
504 : "m" (dSrc)
505 : "xmm1", "memory");
506 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
507 {
508 X86_ASM (
509 "movsd %1, %%xmm0\n\t" \
510 "addsd %%xmm1, %%xmm0\n\t" \
511 "movsd %%xmm0, %0\n\t"
512 : "=m" (dpVect[iDataCntr])
513 : "m0" (dpVect[iDataCntr])
514 : "xmm0", "xmm1", "memory");
515 }
516 }
517
518
dsp_x86_3dnow_mulf(float * fpVect,float fSrc,int iDataLength)519 void dsp_x86_3dnow_mulf (float *fpVect, float fSrc, int iDataLength)
520 {
521 int iDataCntr;
522 int iDataCount;
523 pv2sf m64pVect = (pv2sf) fpVect;
524 stm64 m64Src;
525
526 m64Src.f[0] = m64Src.f[1] = fSrc;
527 iDataCount = (iDataLength >> 1);
528 X86_ASM (
529 "movq %0, %%mm1\n\t"
530 :
531 : "m" (m64Src)
532 : "mm1", "memory");
533 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
534 {
535 X86_ASM (
536 "movq %1, %%mm0\n\t" \
537 "pfmul %%mm1, %%mm0\n\t" \
538 "movntq %%mm0, %0\n\t"
539 : "=m" (m64pVect[iDataCntr])
540 : "m0" (m64pVect[iDataCntr])
541 : "mm0", "mm1", "memory");
542 }
543 if (iDataLength & 0x1)
544 {
545 X86_ASM (
546 "movd %1, %%mm0\n\t" \
547 "pfmul %%mm1, %%mm0\n\t" \
548 "movd %%mm0, %0\n\t"
549 : "=m" (fpVect[iDataLength - 1])
550 : "m0" (fpVect[iDataLength - 1])
551 : "mm0", "mm1", "memory");
552 }
553 X86_ASM (
554 "femms\n\t" \
555 "sfence\n\t");
556 }
557
558
dsp_x86_sse_mulf(float * fpVect,float fSrc,int iDataLength)559 void dsp_x86_sse_mulf (float *fpVect, float fSrc, int iDataLength)
560 {
561 int iDataCntr;
562
563 X86_ASM (
564 "movss %0, %%xmm1\n\t"
565 :
566 : "m" (fSrc)
567 : "xmm1", "memory");
568 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
569 {
570 X86_ASM (
571 "movss %1, %%xmm0\n\t" \
572 "mulss %%xmm1, %%xmm0\n\t" \
573 "movss %%xmm0, %0\n\t"
574 : "=m" (fpVect[iDataCntr])
575 : "m0" (fpVect[iDataCntr])
576 : "xmm0", "xmm1", "memory");
577 }
578 }
579
580
dsp_x86_sse_mul(double * dpVect,double dSrc,int iDataLength)581 void dsp_x86_sse_mul (double *dpVect, double dSrc, int iDataLength)
582 {
583 int iDataCntr;
584
585 X86_ASM (
586 "movsd %0, %%xmm1\n\t"
587 :
588 : "m" (dSrc)
589 : "xmm1", "memory");
590 for (iDataCntr = 0; iDataCntr <iDataLength; iDataCntr++)
591 {
592 X86_ASM (
593 "movsd %1, %%xmm0\n\t" \
594 "mulsd %%xmm1, %%xmm0\n\t" \
595 "movsd %%xmm0, %0\n\t"
596 : "=m" (dpVect[iDataCntr])
597 : "m0" (dpVect[iDataCntr])
598 : "xmm0", "xmm1", "memory");
599 }
600 }
601
602
dsp_x86_3dnow_mulf_nip(float * fpDest,const float * fpSrc1,float fSrc2,int iDataLength)603 void dsp_x86_3dnow_mulf_nip (float *fpDest, const float *fpSrc1, float fSrc2,
604 int iDataLength)
605 {
606 int iDataCntr;
607 int iDataCount;
608 pv2sf m64pDest = (pv2sf) fpDest;
609 pv2sf m64pSrc1 = (pv2sf) fpSrc1;
610 stm64 m64Src2;
611
612 m64Src2.f[0] = m64Src2.f[1] = fSrc2;
613 iDataCount = (iDataLength >> 1);
614 X86_ASM (
615 "movq %0, %%mm1\n\t"
616 :
617 : "m" (m64Src2)
618 : "mm1", "memory");
619 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
620 {
621 X86_ASM (
622 "movq %1, %%mm0\n\t" \
623 "pfmul %%mm1, %%mm0\n\t" \
624 "movntq %%mm0, %0\n\t"
625 : "=m" (m64pDest[iDataCntr])
626 : "m" (m64pSrc1[iDataCntr])
627 : "mm0", "mm1", "memory");
628 }
629 if (iDataLength & 0x1)
630 {
631 X86_ASM (
632 "movd %1, %%mm0\n\t" \
633 "pfmul %%mm1, %%mm0\n\t" \
634 "movd %%mm0, %0\n\t"
635 : "=m" (fpDest[iDataLength - 1])
636 : "m" (fpSrc1[iDataLength - 1])
637 : "mm0", "mm1", "memory");
638 }
639 X86_ASM (
640 "femms\n\t" \
641 "sfence\n\t");
642 }
643
644
dsp_x86_sse_mulf_nip(float * fpDest,const float * fpSrc1,float fSrc2,int iDataLength)645 void dsp_x86_sse_mulf_nip (float *fpDest, const float *fpSrc1, float fSrc2,
646 int iDataLength)
647 {
648 int iDataCntr;
649
650 X86_ASM (
651 "movss %0, %%xmm1\n\t"
652 :
653 : "m" (fSrc2)
654 : "xmm1", "memory");
655 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
656 {
657 X86_ASM (
658 "movss %1, %%xmm0\n\t" \
659 "mulss %%xmm1, %%xmm0\n\t" \
660 "movss %%xmm0, %0\n\t"
661 : "=m" (fpDest[iDataCntr])
662 : "m" (fpSrc1[iDataCntr])
663 : "xmm0", "xmm1", "memory");
664 }
665 }
666
667
dsp_x86_sse_mul_nip(double * dpDest,const double * dpSrc1,double dSrc2,int iDataLength)668 void dsp_x86_sse_mul_nip (double *dpDest, const double *dpSrc1, double dSrc2,
669 int iDataLength)
670 {
671 int iDataCntr;
672
673 X86_ASM (
674 "movsd %0, %%xmm1\n\t"
675 :
676 : "m" (dSrc2)
677 : "xmm1", "memory");
678 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
679 {
680 X86_ASM (
681 "movsd %1, %%xmm0\n\t" \
682 "mulsd %%xmm1, %%xmm0\n\t" \
683 "movsd %%xmm0, %0\n\t"
684 : "=m" (dpDest[iDataCntr])
685 : "m" (dpSrc1[iDataCntr])
686 : "xmm0", "xmm1", "memory");
687 }
688 }
689
690
dsp_x86_3dnow_add2f(float * fpDest,const float * fpSrc,int iDataLength)691 void dsp_x86_3dnow_add2f (float *fpDest, const float *fpSrc, int iDataLength)
692 {
693 int iDataCntr;
694 int iDataCount;
695 pv2sf m64pDest = (pv2sf) fpDest;
696 pv2sf m64pSrc = (pv2sf) fpSrc;
697
698 iDataCount = (iDataLength >> 1);
699 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
700 {
701 X86_ASM (
702 "movq %1, %%mm0\n\t" \
703 "movq %2, %%mm1\n\t" \
704 "pfadd %%mm1, %%mm0\n\t" \
705 "movntq %%mm0, %0\n\t"
706 : "=m" (m64pDest[iDataCntr])
707 : "m0" (m64pDest[iDataCntr]),
708 "m" (m64pSrc[iDataCntr])
709 : "mm0", "mm1", "memory");
710 }
711 if (iDataLength & 0x1)
712 {
713 X86_ASM (
714 "movd %1, %%mm0\n\t" \
715 "movd %2, %%mm1\n\t" \
716 "pfadd %%mm1, %%mm0\n\t" \
717 "movd %%mm0, %0\n\t"
718 : "=m" (fpDest[iDataLength - 1])
719 : "m0" (fpDest[iDataLength - 1]),
720 "m" (fpSrc[iDataLength - 1])
721 : "mm0", "mm1", "memory");
722 }
723 X86_ASM (
724 "femms\n\t" \
725 "sfence\n\t");
726 }
727
728
dsp_x86_sse_add2f(float * fpDest,const float * fpSrc,int iDataLength)729 void dsp_x86_sse_add2f (float *fpDest, const float *fpSrc, int iDataLength)
730 {
731 int iDataCntr;
732
733 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
734 {
735 X86_ASM (
736 "movss %1, %%xmm0\n\t" \
737 "addss %2, %%xmm0\n\t" \
738 "movss %%xmm0, %0\n\t"
739 : "=m" (fpDest[iDataCntr])
740 : "m0" (fpDest[iDataCntr]),
741 "m" (fpSrc[iDataCntr])
742 : "xmm0", "memory");
743 }
744 }
745
746
dsp_x86_sse_add2(double * dpDest,const double * dpSrc,int iDataLength)747 void dsp_x86_sse_add2 (double *dpDest, const double *dpSrc, int iDataLength)
748 {
749 int iDataCntr;
750
751 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
752 {
753 X86_ASM (
754 "movsd %1, %%xmm0\n\t" \
755 "addsd %2, %%xmm0\n\t" \
756 "movsd %%xmm0, %0\n\t"
757 : "=m" (dpDest[iDataCntr])
758 : "m0" (dpDest[iDataCntr]),
759 "m" (dpSrc[iDataCntr])
760 : "xmm0", "memory");
761 }
762 }
763
764
dsp_x86_3dnow_mul2f(float * fpDest,const float * fpSrc,int iDataLength)765 void dsp_x86_3dnow_mul2f (float *fpDest, const float *fpSrc, int iDataLength)
766 {
767 int iDataCntr;
768 int iDataCount;
769 pv2sf m64pDest = (pv2sf) fpDest;
770 pv2sf m64pSrc = (pv2sf) fpSrc;
771
772 iDataCount = (iDataLength >> 1);
773 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
774 {
775 X86_ASM (
776 "movq %1, %%mm0\n\t" \
777 "movq %2, %%mm1\n\t" \
778 "pfmul %%mm1, %%mm0\n\t" \
779 "movntq %%mm0, %0\n\t"
780 : "=m" (m64pDest[iDataCntr])
781 : "m0" (m64pDest[iDataCntr]),
782 "m" (m64pSrc[iDataCntr])
783 : "mm0", "mm1", "memory");
784 }
785 if (iDataLength & 0x1)
786 {
787 X86_ASM (
788 "movd %1, %%mm0\n\t" \
789 "movd %2, %%mm1\n\t" \
790 "pfmul %%mm1, %%mm0\n\t" \
791 "movd %%mm0, %0\n\t"
792 : "=m" (fpDest[iDataLength - 1])
793 : "m0" (fpDest[iDataLength - 1]),
794 "m" (fpSrc[iDataLength - 1])
795 : "mm0", "mm1", "memory");
796 }
797 X86_ASM (
798 "femms\n\t" \
799 "sfence\n\t");
800 }
801
802
dsp_x86_sse_mul2f(float * fpDest,const float * fpSrc,int iDataLength)803 void dsp_x86_sse_mul2f (float *fpDest, const float *fpSrc, int iDataLength)
804 {
805 int iDataCntr;
806
807 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
808 {
809 X86_ASM (
810 "movss %1, %%xmm0\n\t" \
811 "mulss %2, %%xmm0\n\t" \
812 "movss %%xmm0, %0\n\t"
813 : "=m" (fpDest[iDataCntr])
814 : "m0" (fpDest[iDataCntr]),
815 "m" (fpSrc[iDataCntr])
816 : "xmm0", "memory");
817 }
818 }
819
820
dsp_x86_sse_mul2(double * dpDest,const double * dpSrc,int iDataLength)821 void dsp_x86_sse_mul2 (double *dpDest, const double *dpSrc, int iDataLength)
822 {
823 int iDataCntr;
824
825 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
826 {
827 X86_ASM (
828 "movsd %1, %%xmm0\n\t" \
829 "mulsd %2, %%xmm0\n\t" \
830 "movsd %%xmm0, %0\n\t"
831 : "=m" (dpDest[iDataCntr])
832 : "m0" (dpDest[iDataCntr]),
833 "m" (dpSrc[iDataCntr])
834 : "xmm0", "memory");
835 }
836 }
837
838
dsp_x86_3dnow_add3f(float * fpDest,const float * fpSrc1,const float * fpSrc2,int iDataLength)839 void dsp_x86_3dnow_add3f (float *fpDest, const float *fpSrc1,
840 const float *fpSrc2, int iDataLength)
841 {
842 int iDataCntr;
843 int iDataCount;
844 pv2sf m64pDest = (pv2sf) fpDest;
845 pv2sf m64pSrc1 = (pv2sf) fpSrc1;
846 pv2sf m64pSrc2 = (pv2sf) fpSrc2;
847
848 iDataCount = (iDataLength >> 1);
849 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
850 {
851 X86_ASM (
852 "movq %1, %%mm0\n\t" \
853 "movq %2, %%mm1\n\t" \
854 "pfadd %%mm1, %%mm0\n\t" \
855 "movntq %%mm0, %0\n\t"
856 : "=m" (m64pDest[iDataCntr])
857 : "m" (m64pSrc1[iDataCntr]),
858 "m" (m64pSrc2[iDataCntr])
859 : "mm0", "mm1", "memory");
860 }
861 if (iDataLength & 0x1)
862 {
863 X86_ASM (
864 "movd %1, %%mm0\n\t" \
865 "movd %2, %%mm1\n\t" \
866 "pfadd %%mm1, %%mm0\n\t" \
867 "movd %%mm0, %0\n\t"
868 : "=m" (fpDest[iDataLength - 1])
869 : "m" (fpSrc1[iDataLength - 1]),
870 "m" (fpSrc2[iDataLength - 1])
871 : "mm0", "mm1", "memory");
872 }
873 X86_ASM (
874 "femms\n\t" \
875 "sfence\n\t");
876 }
877
878
dsp_x86_sse_add3f(float * fpDest,const float * fpSrc1,const float * fpSrc2,int iDataLength)879 void dsp_x86_sse_add3f (float *fpDest, const float *fpSrc1,
880 const float *fpSrc2, int iDataLength)
881 {
882 int iDataCntr;
883
884 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
885 {
886 X86_ASM (
887 "movss %1, %%xmm0\n\t" \
888 "addss %2, %%xmm0\n\t" \
889 "movss %%xmm0, %0\n\t"
890 : "=m" (fpDest[iDataCntr])
891 : "m" (fpSrc1[iDataCntr]),
892 "m" (fpSrc2[iDataCntr])
893 : "xmm0", "memory");
894 }
895 }
896
897
dsp_x86_sse_add3(double * dpDest,const double * dpSrc1,const double * dpSrc2,int iDataLength)898 void dsp_x86_sse_add3 (double *dpDest, const double *dpSrc1,
899 const double *dpSrc2, int iDataLength)
900 {
901 int iDataCntr;
902
903 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
904 {
905 X86_ASM (
906 "movsd %1, %%xmm0\n\t" \
907 "addsd %2, %%xmm0\n\t" \
908 "movsd %%xmm0, %0\n\t"
909 : "=m" (dpDest[iDataCntr])
910 : "m" (dpSrc1[iDataCntr]),
911 "m" (dpSrc2[iDataCntr])
912 : "xmm0", "memory");
913 }
914 }
915
916
dsp_x86_3dnow_mul3f(float * fpDest,const float * fpSrc1,const float * fpSrc2,int iDataLength)917 void dsp_x86_3dnow_mul3f (float *fpDest, const float *fpSrc1,
918 const float *fpSrc2, int iDataLength)
919 {
920 int iDataCntr;
921 int iDataCount;
922 pv2sf m64pDest = (pv2sf) fpDest;
923 pv2sf m64pSrc1 = (pv2sf) fpSrc1;
924 pv2sf m64pSrc2 = (pv2sf) fpSrc2;
925
926 iDataCount = (iDataLength >> 1);
927 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
928 {
929 X86_ASM (
930 "movq %1, %%mm0\n\t" \
931 "movq %2, %%mm1\n\t" \
932 "pfmul %%mm1, %%mm0\n\t" \
933 "movntq %%mm0, %0\n\t"
934 : "=m" (m64pDest[iDataCntr])
935 : "m" (m64pSrc1[iDataCntr]),
936 "m" (m64pSrc2[iDataCntr])
937 : "mm0", "mm1", "memory");
938 }
939 if (iDataLength & 0x1)
940 {
941 X86_ASM (
942 "movd %1, %%mm0\n\t" \
943 "movd %2, %%mm1\n\t" \
944 "pfmul %%mm1, %%mm0\n\t" \
945 "movd %%mm0, %0\n\t"
946 : "=m" (fpDest[iDataLength - 1])
947 : "m" (fpSrc1[iDataLength - 1]),
948 "m" (fpSrc2[iDataLength - 1])
949 : "mm0", "mm1", "memory");
950 }
951 X86_ASM (
952 "femms\n\t" \
953 "sfence\n\t");
954 }
955
956
dsp_x86_sse_mul3f(float * fpDest,const float * fpSrc1,const float * fpSrc2,int iDataLength)957 void dsp_x86_sse_mul3f (float *fpDest, const float *fpSrc1,
958 const float *fpSrc2, int iDataLength)
959 {
960 int iDataCntr;
961
962 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
963 {
964 X86_ASM (
965 "movss %1, %%xmm0\n\t" \
966 "mulss %2, %%xmm0\n\t" \
967 "movss %%xmm0, %0\n\t"
968 : "=m" (fpDest[iDataCntr])
969 : "m" (fpSrc1[iDataCntr]),
970 "m" (fpSrc2[iDataCntr])
971 : "xmm0", "memory");
972 }
973 }
974
975
dsp_x86_sse_mul3(double * dpDest,const double * dpSrc1,const double * dpSrc2,int iDataLength)976 void dsp_x86_sse_mul3 (double *dpDest, const double *dpSrc1,
977 const double *dpSrc2, int iDataLength)
978 {
979 int iDataCntr;
980
981 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
982 {
983 X86_ASM (
984 "movsd %1, %%xmm0\n\t" \
985 "mulsd %2, %%xmm0\n\t" \
986 "movsd %%xmm0, %0\n\t"
987 : "=m" (dpDest[iDataCntr])
988 : "m" (dpSrc1[iDataCntr]),
989 "m" (dpSrc2[iDataCntr])
990 : "xmm0", "memory");
991 }
992 }
993
994
dsp_x86_3dnow_cmulf(float * fpDest,const float * fpSrc,int iDataLength)995 void dsp_x86_3dnow_cmulf (float *fpDest, const float *fpSrc, int iDataLength)
996 {
997 int iDataCntr;
998 pv2sf m64pDest = (pv2sf) fpDest;
999
1000 X86_ASM (
1001 "movq %0, %%mm3\n\t"
1002 :
1003 : "m" (fpSrc[0])
1004 : "mm3", "memory");
1005 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
1006 {
1007 X86_ASM (
1008 "movq %1, %%mm0\n\t" \
1009 "movq %%mm3, %%mm1\n\t" \
1010 "pswapd %%mm1, %%mm2\n\t" \
1011 "pfmul %%mm0, %%mm1\n\t" \
1012 "pfmul %%mm0, %%mm2\n\t" \
1013 "pfpnacc %%mm2, %%mm1\n\t" \
1014 "movntq %%mm1, %0\n\t"
1015 : "=m" (m64pDest[iDataCntr])
1016 : "m0" (m64pDest[iDataCntr])
1017 : "mm0", "mm1", "mm2", "mm3", "memory");
1018 }
1019 X86_ASM (
1020 "femms\n\t" \
1021 "sfence\n\t");
1022 }
1023
1024
dsp_x86_sse_cmulf(float * fpDest,const float * fpSrc,int iDataLength)1025 void dsp_x86_sse_cmulf (float *fpDest, const float *fpSrc, int iDataLength)
1026 {
1027 int iDataCntr;
1028 int iDataCount;
1029
1030 X86_ASM (
1031 "movss %0, %%xmm2\n\t" \
1032 "movss %1, %%xmm3\n\t"
1033 :
1034 : "m" (fpSrc[0]),
1035 "m" (fpSrc[1])
1036 : "xmm2", "xmm3", "memory");
1037 iDataCount = (iDataLength << 1);
1038 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
1039 {
1040 X86_ASM (
1041 "movss %2, %%xmm0\n\t" \
1042 "movss %%xmm0, %%xmm1\n\t" \
1043 "movss %3, %%xmm4\n\t" \
1044 \
1045 "mulss %%xmm2, %%xmm0\n\t" \
1046 "movss %%xmm4, %%xmm5\n\t" \
1047 "mulss %%xmm3, %%xmm5\n\t" \
1048 "subss %%xmm5, %%xmm0\n\t" \
1049 \
1050 "mulss %%xmm3, %%xmm1\n\t" \
1051 "movss %%xmm4, %%xmm5\n\t" \
1052 "mulss %%xmm2, %%xmm5\n\t" \
1053 "addss %%xmm5, %%xmm1\n\t" \
1054 \
1055 "movss %%xmm0, %0\n\t" \
1056 "movss %%xmm1, %1\n\t"
1057 : "=m" (fpDest[iDataCntr]),
1058 "=m" (fpDest[iDataCntr + 1])
1059 : "m0" (fpDest[iDataCntr]),
1060 "m1" (fpDest[iDataCntr + 1])
1061 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
1062 }
1063 }
1064
1065
dsp_x86_sse_cmul(double * dpDest,const double * dpSrc,int iDataLength)1066 void dsp_x86_sse_cmul (double *dpDest, const double *dpSrc, int iDataLength)
1067 {
1068 int iDataCntr;
1069 int iDataCount;
1070
1071 X86_ASM (
1072 "movsd %0, %%xmm2\n\t" \
1073 "movsd %1, %%xmm3\n\t"
1074 :
1075 : "m" (dpSrc[0]),
1076 "m" (dpSrc[1])
1077 : "xmm2", "xmm3", "memory");
1078 iDataCount = (iDataLength << 1);
1079 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
1080 {
1081 X86_ASM (
1082 "movsd %2, %%xmm0\n\t" \
1083 "movsd %%xmm0, %%xmm1\n\t" \
1084 "movsd %3, %%xmm4\n\t" \
1085 \
1086 "mulsd %%xmm2, %%xmm0\n\t" \
1087 "movsd %%xmm4, %%xmm5\n\t" \
1088 "mulsd %%xmm3, %%xmm5\n\t" \
1089 "subsd %%xmm5, %%xmm0\n\t" \
1090 \
1091 "mulsd %%xmm3, %%xmm1\n\t" \
1092 "movsd %%xmm4, %%xmm5\n\t" \
1093 "mulsd %%xmm2, %%xmm5\n\t" \
1094 "addsd %%xmm5, %%xmm1\n\t" \
1095 \
1096 "movsd %%xmm0, %0\n\t" \
1097 "movsd %%xmm1, %1\n\t"
1098 : "=m" (dpDest[iDataCntr]),
1099 "=m" (dpDest[iDataCntr + 1])
1100 : "m0" (dpDest[iDataCntr]),
1101 "m1" (dpDest[iDataCntr + 1])
1102 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
1103 }
1104 }
1105
1106
dsp_x86_3dnow_cmul2f(float * fpDest,const float * fpSrc,int iDataLength)1107 void dsp_x86_3dnow_cmul2f (float *fpDest, const float *fpSrc, int iDataLength)
1108 {
1109 int iDataCntr;
1110 pv2sf m64pDest = (pv2sf) fpDest;
1111 pv2sf m64pSrc = (pv2sf) fpSrc;
1112
1113 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
1114 {
1115 X86_ASM (
1116 "movq %1, %%mm0\n\t" \
1117 "movq %2, %%mm1\n\t" \
1118 "pswapd %%mm1, %%mm2\n\t" \
1119 "pfmul %%mm0, %%mm1\n\t" \
1120 "pfmul %%mm0, %%mm2\n\t" \
1121 "pfpnacc %%mm2, %%mm1\n\t" \
1122 "movntq %%mm1, %0\n\t"
1123 : "=m" (m64pDest[iDataCntr])
1124 : "m0" (m64pDest[iDataCntr]),
1125 "m" (m64pSrc[iDataCntr])
1126 : "mm0", "mm1", "mm2", "memory");
1127 }
1128 X86_ASM (
1129 "femms\n\t" \
1130 "sfence\n\t");
1131 }
1132
1133
dsp_x86_sse_cmul2f(float * fpDest,const float * fpSrc,int iDataLength)1134 void dsp_x86_sse_cmul2f (float *fpDest, const float *fpSrc, int iDataLength)
1135 {
1136 int iDataCntr;
1137 int iDataCount;
1138
1139 iDataCount = (iDataLength << 1);
1140 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
1141 {
1142 X86_ASM (
1143 "movss %4, %%xmm2\n\t" \
1144 "movss %5, %%xmm3\n\t" \
1145 \
1146 "movss %2, %%xmm0\n\t" \
1147 "movss %%xmm0, %%xmm1\n\t" \
1148 "movss %3, %%xmm4\n\t" \
1149 \
1150 "mulss %%xmm2, %%xmm0\n\t" \
1151 "movss %%xmm4, %%xmm5\n\t" \
1152 "mulss %%xmm3, %%xmm5\n\t" \
1153 "subss %%xmm5, %%xmm0\n\t" \
1154 \
1155 "mulss %%xmm3, %%xmm1\n\t" \
1156 "movss %%xmm4, %%xmm5\n\t" \
1157 "mulss %%xmm2, %%xmm5\n\t" \
1158 "addss %%xmm5, %%xmm1\n\t" \
1159 \
1160 "movss %%xmm0, %0\n\t" \
1161 "movss %%xmm1, %1\n\t"
1162 : "=m" (fpDest[iDataCntr]),
1163 "=m" (fpDest[iDataCntr + 1])
1164 : "m0" (fpDest[iDataCntr]),
1165 "m1" (fpDest[iDataCntr + 1]),
1166 "m" (fpSrc[iDataCntr]),
1167 "m" (fpSrc[iDataCntr + 1])
1168 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
1169 }
1170 }
1171
1172
dsp_x86_sse_cmul2(double * dpDest,const double * dpSrc,int iDataLength)1173 void dsp_x86_sse_cmul2 (double *dpDest, const double *dpSrc, int iDataLength)
1174 {
1175 int iDataCntr;
1176 int iDataCount;
1177
1178 iDataCount = (iDataLength << 1);
1179 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
1180 {
1181 X86_ASM (
1182 "movsd %4, %%xmm2\n\t" \
1183 "movsd %5, %%xmm3\n\t" \
1184 \
1185 "movsd %2, %%xmm0\n\t" \
1186 "movsd %%xmm0, %%xmm1\n\t" \
1187 "movsd %3, %%xmm4\n\t" \
1188 \
1189 "mulsd %%xmm2, %%xmm0\n\t" \
1190 "movsd %%xmm4, %%xmm5\n\t" \
1191 "mulsd %%xmm3, %%xmm5\n\t" \
1192 "subsd %%xmm5, %%xmm0\n\t" \
1193 \
1194 "mulsd %%xmm3, %%xmm1\n\t" \
1195 "movsd %%xmm4, %%xmm5\n\t" \
1196 "mulsd %%xmm2, %%xmm5\n\t" \
1197 "addsd %%xmm5, %%xmm1\n\t" \
1198 \
1199 "movsd %%xmm0, %0\n\t" \
1200 "movsd %%xmm1, %1\n\t"
1201 : "=m" (dpDest[iDataCntr]),
1202 "=m" (dpDest[iDataCntr + 1])
1203 : "m0" (dpDest[iDataCntr]),
1204 "m1" (dpDest[iDataCntr + 1]),
1205 "m" (dpSrc[iDataCntr]),
1206 "m" (dpSrc[iDataCntr + 1])
1207 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
1208 }
1209 }
1210
1211
dsp_x86_3dnow_cmul3f(float * fpDest,const float * fpSrc1,const float * fpSrc2,int iDataLength)1212 void dsp_x86_3dnow_cmul3f (float *fpDest, const float *fpSrc1,
1213 const float *fpSrc2, int iDataLength)
1214 {
1215 int iDataCntr;
1216 pv2sf m64pDest = (pv2sf) fpDest;
1217 pv2sf m64pSrc1 = (pv2sf) fpSrc1;
1218 pv2sf m64pSrc2 = (pv2sf) fpSrc2;
1219
1220 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
1221 {
1222 X86_ASM (
1223 "movq %1, %%mm0\n\t" \
1224 "movq %2, %%mm1\n\t" \
1225 "pswapd %%mm1, %%mm2\n\t" \
1226 "pfmul %%mm0, %%mm1\n\t" \
1227 "pfmul %%mm0, %%mm2\n\t" \
1228 "pfpnacc %%mm2, %%mm1\n\t" \
1229 "movntq %%mm1, %0\n\t"
1230 : "=m" (m64pDest[iDataCntr])
1231 : "m" (m64pSrc1[iDataCntr]),
1232 "m" (m64pSrc2[iDataCntr])
1233 : "mm0", "mm1", "mm2", "memory");
1234 }
1235 X86_ASM (
1236 "femms\n\t" \
1237 "sfence\n\t");
1238 }
1239
1240
dsp_x86_sse_cmul3f(float * fpDest,const float * fpSrc1,const float * fpSrc2,int iDataLength)1241 void dsp_x86_sse_cmul3f (float *fpDest, const float *fpSrc1,
1242 const float *fpSrc2, int iDataLength)
1243 {
1244 int iDataCntr;
1245 int iDataCount;
1246
1247 iDataCount = (iDataLength << 1);
1248 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
1249 {
1250 X86_ASM (
1251 "movss %4, %%xmm2\n\t" \
1252 "movss %5, %%xmm3\n\t" \
1253 \
1254 "movss %2, %%xmm0\n\t" \
1255 "movss %%xmm0, %%xmm1\n\t" \
1256 "movss %3, %%xmm4\n\t" \
1257 \
1258 "mulss %%xmm2, %%xmm0\n\t" \
1259 "movss %%xmm4, %%xmm5\n\t" \
1260 "mulss %%xmm3, %%xmm5\n\t" \
1261 "subss %%xmm5, %%xmm0\n\t" \
1262 \
1263 "mulss %%xmm3, %%xmm1\n\t" \
1264 "movss %%xmm4, %%xmm5\n\t" \
1265 "mulss %%xmm2, %%xmm5\n\t" \
1266 "addss %%xmm5, %%xmm1\n\t" \
1267 \
1268 "movss %%xmm0, %0\n\t" \
1269 "movss %%xmm1, %1\n\t"
1270 : "=m" (fpDest[iDataCntr]),
1271 "=m" (fpDest[iDataCntr + 1])
1272 : "m" (fpSrc1[iDataCntr]),
1273 "m" (fpSrc1[iDataCntr + 1]),
1274 "m" (fpSrc2[iDataCntr]),
1275 "m" (fpSrc2[iDataCntr + 1])
1276 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
1277 }
1278 }
1279
1280
dsp_x86_sse_cmul3(double * dpDest,const double * dpSrc1,const double * dpSrc2,int iDataLength)1281 void dsp_x86_sse_cmul3 (double *dpDest, const double *dpSrc1,
1282 const double *dpSrc2, int iDataLength)
1283 {
1284 int iDataCntr;
1285 int iDataCount;
1286
1287 iDataCount = (iDataLength << 1);
1288 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
1289 {
1290 X86_ASM (
1291 "movsd %4, %%xmm2\n\t" \
1292 "movsd %5, %%xmm3\n\t" \
1293 \
1294 "movsd %2, %%xmm0\n\t" \
1295 "movsd %%xmm0, %%xmm1\n\t" \
1296 "movsd %3, %%xmm4\n\t" \
1297 \
1298 "mulsd %%xmm2, %%xmm0\n\t" \
1299 "movsd %%xmm4, %%xmm5\n\t" \
1300 "mulsd %%xmm3, %%xmm5\n\t" \
1301 "subsd %%xmm5, %%xmm0\n\t" \
1302 \
1303 "mulsd %%xmm3, %%xmm1\n\t" \
1304 "movsd %%xmm4, %%xmm5\n\t" \
1305 "mulsd %%xmm2, %%xmm5\n\t" \
1306 "addsd %%xmm5, %%xmm1\n\t" \
1307 \
1308 "movsd %%xmm0, %0\n\t" \
1309 "movsd %%xmm1, %1\n\t"
1310 : "=m" (dpDest[iDataCntr]),
1311 "=m" (dpDest[iDataCntr + 1])
1312 : "m" (dpSrc1[iDataCntr]),
1313 "m" (dpSrc1[iDataCntr + 1]),
1314 "m" (dpSrc2[iDataCntr]),
1315 "m" (dpSrc2[iDataCntr + 1])
1316 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
1317 }
1318 }
1319
1320
dsp_x86_3dnow_maf(float * fpVect,float fMul,float fAdd,int iDataLength)1321 void dsp_x86_3dnow_maf (float *fpVect, float fMul, float fAdd, int iDataLength)
1322 {
1323 int iDataCntr;
1324 int iDataCount;
1325 pv2sf m64pVect = (pv2sf) fpVect;
1326 stm64 m64Mul;
1327 stm64 m64Add;
1328
1329 m64Mul.f[0] = m64Mul.f[1] = fMul;
1330 m64Add.f[0] = m64Add.f[1] = fAdd;
1331 iDataCount = (iDataLength >> 1);
1332 X86_ASM (
1333 "movq %0, %%mm1\n\t" \
1334 "movq %1, %%mm2\n\t"
1335 :
1336 : "m" (m64Mul),
1337 "m" (m64Add)
1338 : "mm1", "mm2", "memory");
1339 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
1340 {
1341 X86_ASM (
1342 "movq %1, %%mm0\n\t" \
1343 "pfmul %%mm1, %%mm0\n\t" \
1344 "pfadd %%mm2, %%mm0\n\t" \
1345 "movntq %%mm0, %0\n\t"
1346 : "=m" (m64pVect[iDataCntr])
1347 : "m0" (m64pVect[iDataCntr])
1348 : "mm0", "mm1", "mm2", "memory");
1349 }
1350 if (iDataLength & 0x1)
1351 {
1352 X86_ASM (
1353 "movd %1, %%mm0\n\t" \
1354 "pfmul %%mm1, %%mm0\n\t" \
1355 "pfadd %%mm2, %%mm0\n\t" \
1356 "movd %%mm0, %0\n\t"
1357 : "=m" (fpVect[iDataLength - 1])
1358 : "m0" (fpVect[iDataLength - 1])
1359 : "mm0", "mm1", "mm2", "memory");
1360 }
1361 X86_ASM (
1362 "femms\n\t" \
1363 "sfence\n\t");
1364 }
1365
1366
dsp_x86_sse_maf(float * fpVect,float fMul,float fAdd,int iDataLength)1367 void dsp_x86_sse_maf (float *fpVect, float fMul, float fAdd, int iDataLength)
1368 {
1369 int iDataCntr;
1370
1371 X86_ASM (
1372 "movss %0, %%xmm1\n\t" \
1373 "movss %1, %%xmm2\n\t"
1374 :
1375 : "m" (fMul),
1376 "m" (fAdd)
1377 : "xmm1", "xmm2", "memory");
1378 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
1379 {
1380 X86_ASM (
1381 "movss %1, %%xmm0\n\t" \
1382 "mulss %%xmm1, %%xmm0\n\t" \
1383 "addss %%xmm2, %%xmm0\n\t" \
1384 "movss %%xmm0, %0\n\t"
1385 : "=m" (fpVect[iDataCntr])
1386 : "m0" (fpVect[iDataCntr])
1387 : "xmm0", "xmm1", "xmm2", "memory");
1388 }
1389 }
1390
1391
dsp_x86_sse_ma(double * dpVect,double dMul,double dAdd,int iDataLength)1392 void dsp_x86_sse_ma (double *dpVect, double dMul, double dAdd, int iDataLength)
1393 {
1394 int iDataCntr;
1395
1396 X86_ASM (
1397 "movsd %0, %%xmm1\n\t" \
1398 "movsd %1, %%xmm2\n\t"
1399 :
1400 : "m" (dMul),
1401 "m" (dAdd)
1402 : "xmm1", "xmm2", "memory");
1403 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
1404 {
1405 X86_ASM (
1406 "movsd %1, %%xmm0\n\t" \
1407 "mulsd %%xmm1, %%xmm0\n\t" \
1408 "addsd %%xmm2, %%xmm0\n\t" \
1409 "movsd %%xmm0, %0\n\t"
1410 : "=m" (dpVect[iDataCntr])
1411 : "m0" (dpVect[iDataCntr])
1412 : "xmm0", "xmm1", "xmm2", "memory");
1413 }
1414 }
1415
1416
dsp_x86_3dnow_ma2f(float * fpDest,const float * fpSrc,float fMul,float fAdd,int iDataLength)1417 void dsp_x86_3dnow_ma2f (float *fpDest, const float *fpSrc,
1418 float fMul, float fAdd, int iDataLength)
1419 {
1420 int iDataCntr;
1421 int iDataCount;
1422 pv2sf m64pDest = (pv2sf) fpDest;
1423 pv2sf m64pSrc = (pv2sf) fpSrc;
1424 stm64 m64Mul;
1425 stm64 m64Add;
1426
1427 m64Mul.f[0] = m64Mul.f[1] = fMul;
1428 m64Add.f[0] = m64Add.f[1] = fAdd;
1429 iDataCount = (iDataLength >> 1);
1430 X86_ASM (
1431 "movq %0, %%mm1\n\t" \
1432 "movq %1, %%mm2\n\t"
1433 :
1434 : "m" (m64Mul),
1435 "m" (m64Add)
1436 : "mm1", "mm2", "memory");
1437 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
1438 {
1439 X86_ASM (
1440 "movq %1, %%mm0\n\t" \
1441 "pfmul %%mm1, %%mm0\n\t" \
1442 "pfadd %%mm2, %%mm0\n\t" \
1443 "movntq %%mm0, %0\n\t"
1444 : "=m" (m64pDest[iDataCntr])
1445 : "m" (m64pSrc[iDataCntr])
1446 : "mm0", "mm1", "mm2", "memory");
1447 }
1448 if (iDataLength & 0x1)
1449 {
1450 X86_ASM (
1451 "movd %1, %%mm0\n\t" \
1452 "pfmul %%mm1, %%mm0\n\t" \
1453 "pfadd %%mm2, %%mm0\n\t" \
1454 "movd %%mm0, %0\n\t"
1455 : "=m" (fpDest[iDataLength - 1])
1456 : "m" (fpSrc[iDataLength - 1])
1457 : "mm0", "mm1", "mm2", "memory");
1458 }
1459 X86_ASM (
1460 "femms\n\t" \
1461 "sfence\n\t");
1462 }
1463
1464
dsp_x86_sse_ma2f(float * fpDest,const float * fpSrc,float fMul,float fAdd,int iDataLength)1465 void dsp_x86_sse_ma2f (float *fpDest, const float *fpSrc,
1466 float fMul, float fAdd, int iDataLength)
1467 {
1468 int iDataCntr;
1469
1470 X86_ASM (
1471 "movss %0, %%xmm1\n\t" \
1472 "movss %1, %%xmm2\n\t"
1473 :
1474 : "m" (fMul),
1475 "m" (fAdd)
1476 : "xmm1", "xmm2", "memory");
1477 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
1478 {
1479 X86_ASM (
1480 "movss %1, %%xmm0\n\t" \
1481 "mulss %%xmm1, %%xmm0\n\t" \
1482 "addss %%xmm2, %%xmm0\n\t" \
1483 "movss %%xmm0, %0\n\t"
1484 : "=m" (fpDest[iDataCntr])
1485 : "m" (fpSrc[iDataCntr])
1486 : "xmm0", "xmm1", "xmm2", "memory");
1487 }
1488 }
1489
1490
dsp_x86_sse_ma2(double * dpDest,const double * dpSrc,double dMul,double dAdd,int iDataLength)1491 void dsp_x86_sse_ma2 (double *dpDest, const double *dpSrc,
1492 double dMul, double dAdd, int iDataLength)
1493 {
1494 int iDataCntr;
1495
1496 X86_ASM (
1497 "movsd %0, %%xmm1\n\t" \
1498 "movsd %1, %%xmm2\n\t"
1499 :
1500 : "m" (dMul),
1501 "m" (dAdd)
1502 : "xmm1", "xmm2", "memory");
1503 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
1504 {
1505 X86_ASM (
1506 "movsd %1, %%xmm0\n\t" \
1507 "mulsd %%xmm1, %%xmm0\n\t" \
1508 "addsd %%xmm2, %%xmm0\n\t" \
1509 "movsd %%xmm0, %0\n\t"
1510 : "=m" (dpDest[iDataCntr])
1511 : "m" (dpSrc[iDataCntr])
1512 : "xmm0", "xmm1", "xmm2", "memory");
1513 }
1514 }
1515
1516
dsp_x86_3dnow_cmaf(float * fpDest,const float * fpSrc1,const float * fpSrc2,int iDataLength)1517 void dsp_x86_3dnow_cmaf (float *fpDest, const float *fpSrc1,
1518 const float *fpSrc2, int iDataLength)
1519 {
1520 int iDataCntr;
1521 pv2sf m64pDest = (pv2sf) fpDest;
1522 pv2sf m64pSrc1 = (pv2sf) fpSrc1;
1523 pv2sf m64pSrc2 = (pv2sf) fpSrc2;
1524
1525 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
1526 {
1527 X86_ASM (
1528 "movq %2, %%mm0\n\t" \
1529 "movq %3, %%mm1\n\t" \
1530 "movq %1, %%mm3\n\t" \
1531 "pswapd %%mm1, %%mm2\n\t" \
1532 "pfmul %%mm0, %%mm1\n\t" \
1533 "pfmul %%mm0, %%mm2\n\t" \
1534 "pfpnacc %%mm2, %%mm1\n\t" \
1535 "pfadd %%mm1, %%mm3\n\t" \
1536 "movntq %%mm3, %0\n\t"
1537 : "=m" (m64pDest[iDataCntr])
1538 : "m0" (m64pDest[iDataCntr]),
1539 "m" (m64pSrc1[iDataCntr]),
1540 "m" (m64pSrc2[iDataCntr])
1541 : "mm0", "mm1", "mm2", "mm3", "memory");
1542 }
1543 X86_ASM (
1544 "femms\n\t" \
1545 "sfence\n\t");
1546 }
1547
1548
dsp_x86_sse_cmaf(float * fpDest,const float * fpSrc1,const float * fpSrc2,int iDataLength)1549 void dsp_x86_sse_cmaf (float *fpDest, const float *fpSrc1,
1550 const float *fpSrc2, int iDataLength)
1551 {
1552 int iDataCntr;
1553 int iDataCount;
1554
1555 iDataCount = (iDataLength << 1);
1556 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
1557 {
1558 X86_ASM (
1559 "movss %6, %%xmm2\n\t" \
1560 "movss %7, %%xmm3\n\t" \
1561 \
1562 "movss %4, %%xmm0\n\t" \
1563 "movss %%xmm0, %%xmm1\n\t" \
1564 "movss %5, %%xmm4\n\t" \
1565 \
1566 "movss %2, %%xmm6\n\t" \
1567 "movss %3, %%xmm7\n\t" \
1568 \
1569 "mulss %%xmm2, %%xmm0\n\t" \
1570 "movss %%xmm4, %%xmm5\n\t" \
1571 "mulss %%xmm3, %%xmm5\n\t" \
1572 "subss %%xmm5, %%xmm0\n\t" \
1573 \
1574 "mulss %%xmm3, %%xmm1\n\t" \
1575 "movss %%xmm4, %%xmm5\n\t" \
1576 "mulss %%xmm2, %%xmm5\n\t" \
1577 "addss %%xmm5, %%xmm1\n\t" \
1578 \
1579 "addss %%xmm0, %%xmm6\n\t" \
1580 "addss %%xmm1, %%xmm7\n\t" \
1581 \
1582 "movss %%xmm6, %0\n\t" \
1583 "movss %%xmm7, %1\n\t"
1584 : "=m" (fpDest[iDataCntr]),
1585 "=m" (fpDest[iDataCntr + 1])
1586 : "m0" (fpDest[iDataCntr]),
1587 "m1" (fpDest[iDataCntr + 1]),
1588 "m" (fpSrc1[iDataCntr]),
1589 "m" (fpSrc1[iDataCntr + 1]),
1590 "m" (fpSrc2[iDataCntr]),
1591 "m" (fpSrc2[iDataCntr + 1])
1592 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
1593 "memory");
1594 }
1595 }
1596
1597
dsp_x86_sse_cma(double * dpDest,const double * dpSrc1,const double * dpSrc2,int iDataLength)1598 void dsp_x86_sse_cma (double *dpDest, const double *dpSrc1,
1599 const double *dpSrc2, int iDataLength)
1600 {
1601 int iDataCntr;
1602 int iDataCount;
1603
1604 iDataCount = (iDataLength << 1);
1605 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr += 2)
1606 {
1607 X86_ASM (
1608 "movsd %6, %%xmm2\n\t" \
1609 "movsd %7, %%xmm3\n\t" \
1610 \
1611 "movsd %4, %%xmm0\n\t" \
1612 "movsd %%xmm0, %%xmm1\n\t" \
1613 "movsd %5, %%xmm4\n\t" \
1614 \
1615 "movsd %2, %%xmm6\n\t" \
1616 "movsd %3, %%xmm7\n\t" \
1617 \
1618 "mulsd %%xmm2, %%xmm0\n\t" \
1619 "movsd %%xmm4, %%xmm5\n\t" \
1620 "mulsd %%xmm3, %%xmm5\n\t" \
1621 "subsd %%xmm5, %%xmm0\n\t" \
1622 \
1623 "mulsd %%xmm3, %%xmm1\n\t" \
1624 "movsd %%xmm4, %%xmm5\n\t" \
1625 "mulsd %%xmm2, %%xmm5\n\t" \
1626 "addsd %%xmm5, %%xmm1\n\t" \
1627 \
1628 "addsd %%xmm0, %%xmm6\n\t" \
1629 "addsd %%xmm1, %%xmm7\n\t" \
1630 \
1631 "movsd %%xmm6, %0\n\t" \
1632 "movsd %%xmm7, %1\n\t"
1633 : "=m" (dpDest[iDataCntr]),
1634 "=m" (dpDest[iDataCntr + 1])
1635 : "m0" (dpDest[iDataCntr]),
1636 "m1" (dpDest[iDataCntr + 1]),
1637 "m" (dpSrc1[iDataCntr]),
1638 "m" (dpSrc1[iDataCntr + 1]),
1639 "m" (dpSrc2[iDataCntr]),
1640 "m" (dpSrc2[iDataCntr + 1])
1641 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
1642 "memory");
1643 }
1644 }
1645
1646
dsp_x86_3dnow_amf(float * fpVect,float fAdd,float fMul,int iDataLength)1647 void dsp_x86_3dnow_amf (float *fpVect, float fAdd, float fMul, int iDataLength)
1648 {
1649 int iDataCntr;
1650 int iDataCount;
1651 pv2sf m64pVect = (pv2sf) fpVect;
1652 stm64 m64Add;
1653 stm64 m64Mul;
1654
1655 m64Add.f[0] = m64Add.f[1] = fAdd;
1656 m64Mul.f[0] = m64Mul.f[1] = fMul;
1657 iDataCount = (iDataLength >> 1);
1658 X86_ASM (
1659 "movq %0, %%mm1\n\t" \
1660 "movq %1, %%mm2\n\t"
1661 :
1662 : "m" (m64Add),
1663 "m" (m64Mul)
1664 : "mm1", "mm2", "memory");
1665 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
1666 {
1667 X86_ASM (
1668 "movq %1, %%mm0\n\t" \
1669 "pfadd %%mm1, %%mm0\n\t" \
1670 "pfmul %%mm2, %%mm0\n\t" \
1671 "movntq %%mm0, %0\n\t"
1672 : "=m" (m64pVect[iDataCntr])
1673 : "m0" (m64pVect[iDataCntr])
1674 : "mm0", "mm1", "mm2", "memory");
1675 }
1676 if (iDataLength & 0x1)
1677 {
1678 X86_ASM (
1679 "movd %1, %%mm0\n\t" \
1680 "pfadd %%mm1, %%mm0\n\t" \
1681 "pfmul %%mm2, %%mm0\n\t" \
1682 "movd %%mm0, %0\n\t"
1683 : "=m" (fpVect[iDataLength - 1])
1684 : "m0" (fpVect[iDataLength - 1])
1685 : "mm0", "mm1", "mm2", "memory");
1686 }
1687 X86_ASM (
1688 "femms\n\t" \
1689 "sfence\n\t");
1690 }
1691
1692
dsp_x86_sse_amf(float * fpVect,float fAdd,float fMul,int iDataLength)1693 void dsp_x86_sse_amf (float *fpVect, float fAdd, float fMul, int iDataLength)
1694 {
1695 int iDataCntr;
1696
1697 X86_ASM (
1698 "movss %0, %%xmm1\n\t" \
1699 "movss %1, %%xmm2\n\t"
1700 :
1701 : "m" (fAdd),
1702 "m" (fMul)
1703 : "xmm1", "xmm2", "memory");
1704 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
1705 {
1706 X86_ASM (
1707 "movss %1, %%xmm0\n\t" \
1708 "addss %%xmm1, %%xmm0\n\t" \
1709 "mulss %%xmm2, %%xmm0\n\t" \
1710 "movss %%xmm0, %0\n\t"
1711 : "=m" (fpVect[iDataCntr])
1712 : "m0" (fpVect[iDataCntr])
1713 : "xmm0", "xmm1", "xmm2", "memory");
1714 }
1715 }
1716
1717
dsp_x86_3dnow_macf(const float * fpSrc1,const float * fpSrc2,int iDataLength)1718 float dsp_x86_3dnow_macf (const float *fpSrc1, const float *fpSrc2,
1719 int iDataLength)
1720 {
1721 int iDataCntr;
1722 int iDataCount;
1723 float fRes;
1724 pv2sf m64pSrc1 = (pv2sf) fpSrc1;
1725 pv2sf m64pSrc2 = (pv2sf) fpSrc2;
1726
1727 iDataCount = (iDataLength >> 1);
1728 X86_ASM (
1729 "pxor %%mm0, %%mm0\n\t"
1730 :
1731 :
1732 : "mm0");
1733 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
1734 {
1735 X86_ASM (
1736 "movq %0, %%mm1\n\t" \
1737 "movq %1, %%mm2\n\t" \
1738 "pfmul %%mm2, %%mm1\n\t" \
1739 "pfacc %%mm1, %%mm0\n\t"
1740 :
1741 : "m" (m64pSrc1[iDataCntr]),
1742 "m" (m64pSrc2[iDataCntr])
1743 : "mm0", "mm1", "mm2", "memory");
1744 }
1745 if (iDataLength & 0x1)
1746 {
1747 X86_ASM (
1748 "movd %0, %%mm1\n\t" \
1749 "movd %1, %%mm2\n\t" \
1750 "pfmul %%mm2, %%mm1\n\t" \
1751 "pfacc %%mm1, %%mm0\n\t"
1752 :
1753 : "m" (fpSrc1[iDataLength - 1]),
1754 "m" (fpSrc2[iDataLength - 1])
1755 : "mm0", "mm1", "mm2", "memory");
1756 }
1757 X86_ASM (
1758 "pfacc %%mm0, %%mm0\n\t" \
1759 "movd %%mm0, %0\n\t"
1760 : "=m" (fRes)
1761 :
1762 : "mm0", "memory");
1763 X86_ASM ("femms\n\t");
1764
1765 return fRes;
1766 }
1767
1768
dsp_x86_sse_macf(const float * fpSrc1,const float * fpSrc2,int iDataLength)1769 float dsp_x86_sse_macf (const float *fpSrc1, const float *fpSrc2,
1770 int iDataLength)
1771 {
1772 int iDataCntr;
1773 float fRes;
1774
1775 X86_ASM (
1776 "xorps %%xmm0, %%xmm0\n\t"
1777 :
1778 :
1779 : "xmm0");
1780 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
1781 {
1782 X86_ASM (
1783 "movss %0, %%xmm1\n\t" \
1784 "mulss %1, %%xmm1\n\t" \
1785 "addss %%xmm1, %%xmm0\n\t"
1786 :
1787 : "m" (fpSrc1[iDataCntr]),
1788 "m" (fpSrc2[iDataCntr])
1789 : "xmm0", "xmm1", "xmm2", "memory");
1790 }
1791 X86_ASM (
1792 "movss %%xmm0, %0\n\t"
1793 : "=m" (fRes)
1794 :
1795 : "xmm0");
1796
1797 return fRes;
1798 }
1799
1800
dsp_x86_sse_mac(const double * dpSrc1,const double * dpSrc2,int iDataLength)1801 double dsp_x86_sse_mac (const double *dpSrc1, const double *dpSrc2,
1802 int iDataLength)
1803 {
1804 int iDataCntr;
1805 double dRes;
1806
1807 X86_ASM (
1808 "xorpd %%xmm0, %%xmm0\n\t"
1809 :
1810 :
1811 : "xmm0");
1812 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
1813 {
1814 X86_ASM (
1815 "movsd %0, %%xmm1\n\t" \
1816 "mulsd %1, %%xmm1\n\t" \
1817 "addsd %%xmm1, %%xmm0\n\t"
1818 :
1819 : "m" (dpSrc1[iDataCntr]),
1820 "m" (dpSrc2[iDataCntr])
1821 : "xmm0", "xmm1", "xmm2", "memory");
1822 }
1823 X86_ASM (
1824 "movsd %%xmm0, %0\n\t"
1825 : "=m" (dRes)
1826 :
1827 : "xmm0");
1828
1829 return dRes;
1830 }
1831
1832
dsp_x86_3dnow_minmaxf(float * fpMin,float * fpMax,const float * fpSrc,int iDataLength)1833 void dsp_x86_3dnow_minmaxf (float *fpMin, float *fpMax, const float *fpSrc,
1834 int iDataLength)
1835 {
1836 int iDataCntr;
1837 int iDataCount;
1838 stm64 m64Min;
1839 stm64 m64Max;
1840 pv2sf m64pSrc = (pv2sf) fpSrc;
1841
1842 m64Min.f[0] = m64Min.f[1] = FLT_MAX;
1843 m64Max.f[0] = m64Max.f[1] = -FLT_MAX;
1844 iDataCount = (iDataLength >> 1);
1845 X86_ASM (
1846 "movq %0, %%mm1\n\t" \
1847 "movq %1, %%mm2\n\t"
1848 :
1849 : "m" (m64Min),
1850 "m" (m64Max)
1851 : "mm1", "mm2", "memory");
1852 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
1853 {
1854 X86_ASM (
1855 "movq %0, %%mm0\n\t" \
1856 "pfmin %%mm0, %%mm1\n\t" \
1857 "pfmax %%mm0, %%mm2\n\t"
1858 :
1859 : "m" (m64pSrc[iDataCntr])
1860 : "mm0", "mm1", "mm2", "memory");
1861 }
1862 if (iDataLength & 0x1)
1863 {
1864 X86_ASM (
1865 "movd %0, %%mm0\n\t" \
1866 "pfmin %%mm0, %%mm1\n\t" \
1867 "pfmax %%mm0, %%mm2\n\t"
1868 :
1869 : "m" (fpSrc[iDataLength - 1])
1870 : "mm0", "mm1", "mm2", "memory");
1871 }
1872 X86_ASM (
1873 "pswapd %%mm1, %%mm3\n\t" \
1874 "pfmin %%mm3, %%mm1\n\t" \
1875 "pswapd %%mm2, %%mm3\n\t" \
1876 "pfmax %%mm3, %%mm2\n\t" \
1877 "movd %%mm1, %0\n\t" \
1878 "movd %%mm2, %1\n\t"
1879 : "=m" (*fpMin),
1880 "=m" (*fpMax)
1881 :
1882 : "mm1", "mm2", "mm3", "memory");
1883 X86_ASM ("femms\n\t");
1884 }
1885
1886
dsp_x86_sse_minmaxf(float * fpMin,float * fpMax,const float * fpSrc,int iDataLength)1887 void dsp_x86_sse_minmaxf (float *fpMin, float *fpMax, const float *fpSrc,
1888 int iDataLength)
1889 {
1890 int iDataCntr;
1891
1892 *fpMin = FLT_MAX;
1893 *fpMax = -FLT_MAX;
1894 X86_ASM (
1895 "movss %0, %%xmm0\n\t" \
1896 "movss %1, %%xmm1\n\t"
1897 :
1898 : "m" (*fpMin),
1899 "m" (*fpMax)
1900 : "xmm0", "xmm1", "memory");
1901 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
1902 {
1903 X86_ASM (
1904 "movss %0, %%xmm2\n\t" \
1905 "minss %%xmm2, %%xmm0\n\t" \
1906 "maxss %%xmm2, %%xmm1\n\t"
1907 :
1908 : "m" (fpSrc[iDataCntr])
1909 : "xmm0", "xmm1", "xmm2", "memory");
1910 }
1911 X86_ASM (
1912 "movss %%xmm0, %0\n\t" \
1913 "movss %%xmm1, %1\n\t"
1914 : "=m" (*fpMin),
1915 "=m" (*fpMax)
1916 :
1917 : "xmm0", "xmm1", "memory");
1918 }
1919
1920
dsp_x86_sse_minmax(double * dpMin,double * dpMax,const double * dpSrc,int iDataLength)1921 void dsp_x86_sse_minmax (double *dpMin, double *dpMax, const double *dpSrc,
1922 int iDataLength)
1923 {
1924 int iDataCntr;
1925
1926 *dpMin = FLT_MAX;
1927 *dpMax = -FLT_MAX;
1928 X86_ASM (
1929 "movsd %0, %%xmm0\n\t" \
1930 "movsd %1, %%xmm1\n\t"
1931 :
1932 : "m" (*dpMin),
1933 "m" (*dpMax)
1934 : "xmm0", "xmm1", "memory");
1935 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
1936 {
1937 X86_ASM (
1938 "movsd %0, %%xmm2\n\t" \
1939 "minsd %%xmm2, %%xmm0\n\t" \
1940 "maxsd %%xmm2, %%xmm1\n\t"
1941 :
1942 : "m" (dpSrc[iDataCntr])
1943 : "xmm0", "xmm1", "xmm2", "memory");
1944 }
1945 X86_ASM (
1946 "movss %%xmm0, %0\n\t" \
1947 "movss %%xmm1, %1\n\t"
1948 : "=m" (*dpMin),
1949 "=m" (*dpMax)
1950 :
1951 : "xmm0", "xmm1", "memory");
1952 }
1953
1954
dsp_x86_3dnow_crosscorrf(const float * fpSrc1,const float * fpSrc2,int iDataLength)1955 float dsp_x86_3dnow_crosscorrf (const float *fpSrc1, const float *fpSrc2,
1956 int iDataLength)
1957 {
1958 int iDataCntr;
1959 int iDataCount;
1960 float fRes;
1961 pv2sf m64pSrc1 = (pv2sf) fpSrc1;
1962 pv2sf m64pSrc2 = (pv2sf) fpSrc2;
1963
1964 iDataCount = (iDataLength >> 1);
1965 X86_ASM (
1966 "pxor %%mm3, %%mm3\n\t" \
1967 "pxor %%mm4, %%mm4\n\t" \
1968 "pxor %%mm5, %%mm5\n\t"
1969 :
1970 :
1971 : "mm3", "mm4", "mm5");
1972 for (iDataCntr = 0; iDataCntr < iDataCount; iDataCntr++)
1973 {
1974 X86_ASM (
1975 "movq %0, %%mm0\n\t" \
1976 "movq %1, %%mm1\n\t" \
1977 "movq %%mm1, %%mm2\n\t" \
1978 "pfmul %%mm0, %%mm2\n\t" \
1979 "pfacc %%mm2, %%mm5\n\t" \
1980 "pfmul %%mm0, %%mm0\n\t" \
1981 "pfacc %%mm0, %%mm3\n\t" \
1982 "pfmul %%mm1, %%mm1\n\t" \
1983 "pfacc %%mm1, %%mm4\n\t"
1984 :
1985 : "m" (m64pSrc1[iDataCntr]),
1986 "m" (m64pSrc2[iDataCntr])
1987 : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "memory");
1988 }
1989 if (iDataLength & 0x1)
1990 {
1991 X86_ASM (
1992 "movd %0, %%mm0\n\t" \
1993 "movd %1, %%mm1\n\t" \
1994 "movq %%mm1, %%mm2\n\t" \
1995 "pfmul %%mm0, %%mm2\n\t" \
1996 "pfacc %%mm2, %%mm5\n\t" \
1997 "pfmul %%mm0, %%mm0\n\t" \
1998 "pfacc %%mm0, %%mm3\n\t" \
1999 "pfmul %%mm1, %%mm1\n\t" \
2000 "pfacc %%mm1, %%mm4\n\t"
2001 :
2002 : "m" (fpSrc1[iDataLength - 1]),
2003 "m" (fpSrc2[iDataLength - 1])
2004 : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "memory");
2005 }
2006 X86_ASM (
2007 "pfacc %%mm3, %%mm3\n\t" \
2008 "pfacc %%mm4, %%mm4\n\t" \
2009 "pfacc %%mm5, %%mm5\n\t" \
2010 \
2011 "movd %1, %%mm6\n\t" \
2012 "pswapd %%mm6, %%mm7\n\t" \
2013 "paddd %%mm7, %%mm6\n\t" \
2014 "pi2fd %%mm6, %%mm7\n\t" \
2015 \
2016 "pfrcp %%mm7, %%mm6\n\t" \
2017 "pfrcpit1 %%mm6, %%mm7\n\t" \
2018 "pfrcpit2 %%mm6, %%mm7\n\t" \
2019 \
2020 "pfmul %%mm3, %%mm4\n\t" \
2021 \
2022 "movq %%mm4, %%mm0\n\t" \
2023 "pfrsqrt %%mm4, %%mm1\n\t" \
2024 "movq %%mm1, %%mm2\n\t" \
2025 "pfmul %%mm1, %%mm1\n\t" \
2026 "pfrsqit1 %%mm4, %%mm1\n\t" \
2027 "pfrcpit2 %%mm2, %%mm1\n\t" \
2028 "pfmul %%mm1, %%mm4\n\t" \
2029 \
2030 "pfmul %%mm6, %%mm4\n\t" \
2031 \
2032 "pfrcp %%mm4, %%mm0\n\t" \
2033 "pfrcpit1 %%mm0, %%mm4\n\t" \
2034 "pfrcpit2 %%mm0, %%mm4\n\t" \
2035 \
2036 "pfmul %%mm6, %%mm5\n\t" \
2037 "pfmul %%mm4, %%mm5\n\t" \
2038 "movd %%mm5, %0\n\t"
2039 : "=m" (fRes)
2040 : "m" (iDataLength)
2041 : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
2042 X86_ASM ("femms\n\t");
2043
2044 return fRes;
2045 }
2046
2047
dsp_x86_sse_crosscorrf(const float * fpSrc1,const float * fpSrc2,int iDataLength)2048 float dsp_x86_sse_crosscorrf (const float *fpSrc1, const float *fpSrc2,
2049 int iDataLength)
2050 {
2051 int iDataCntr;
2052 float fScale;
2053 float fNormFact;
2054 float fProdSum;
2055 float fSqSum1;
2056 float fSqSum2;
2057 float fRes;
2058
2059 X86_ASM (
2060 "xorps %%xmm0, %%xmm0\n\t" \
2061 "xorps %%xmm1, %%xmm1\n\t" \
2062 "xorps %%xmm2, %%xmm2\n\t"
2063 :
2064 :
2065 : "xmm0", "xmm1", "xmm2");
2066 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
2067 {
2068 X86_ASM (
2069 "movss %3, %%xmm3\n\t" \
2070 "movss %4, %%xmm4\n\t" \
2071 \
2072 "movss %%xmm4, %%xmm5\n\t" \
2073 "mulss %%xmm3, %%xmm5\n\t" \
2074 "addss %%xmm5, %%xmm0\n\t" \
2075 \
2076 "movss %%xmm3, %%xmm5\n\t" \
2077 "mulss %%xmm3, %%xmm5\n\t" \
2078 "addss %%xmm5, %%xmm1\n\t" \
2079 \
2080 "movss %%xmm4, %%xmm5\n\t" \
2081 "mulss %%xmm4, %%xmm5\n\t" \
2082 "addss %%xmm5, %%xmm2\n\t" \
2083 \
2084 "movss %%xmm0, %0\n\t" \
2085 "movss %%xmm1, %1\n\t" \
2086 "movss %%xmm2, %2\n\t"
2087 : "=m" (fProdSum),
2088 "=m" (fSqSum1),
2089 "=m" (fSqSum2)
2090 : "m" (fpSrc1[iDataCntr]),
2091 "m" (fpSrc2[iDataCntr])
2092 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
2093 }
2094 fScale = 1.0F / iDataLength;
2095 fNormFact = sqrtf(fSqSum1 * fSqSum2) * fScale;
2096 fRes = (fProdSum * fScale) / fNormFact;
2097
2098 return fRes;
2099 }
2100
2101
dsp_x86_sse_crosscorr(const double * dpSrc1,const double * dpSrc2,int iDataLength)2102 double dsp_x86_sse_crosscorr (const double *dpSrc1, const double *dpSrc2,
2103 int iDataLength)
2104 {
2105 int iDataCntr;
2106 double dScale;
2107 double dNormFact;
2108 double dProdSum;
2109 double dSqSum1;
2110 double dSqSum2;
2111 double dRes;
2112
2113 X86_ASM (
2114 "xorpd %%xmm0, %%xmm0\n\t" \
2115 "xorpd %%xmm1, %%xmm1\n\t" \
2116 "xorpd %%xmm2, %%xmm2\n\t"
2117 :
2118 :
2119 : "xmm0", "xmm1", "xmm2");
2120 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
2121 {
2122 X86_ASM (
2123 "movsd %3, %%xmm3\n\t" \
2124 "movsd %4, %%xmm4\n\t" \
2125 \
2126 "movsd %%xmm4, %%xmm5\n\t" \
2127 "mulsd %%xmm3, %%xmm5\n\t" \
2128 "addsd %%xmm5, %%xmm0\n\t" \
2129 \
2130 "movsd %%xmm3, %%xmm5\n\t" \
2131 "mulsd %%xmm3, %%xmm5\n\t" \
2132 "addsd %%xmm5, %%xmm1\n\t" \
2133 \
2134 "movsd %%xmm4, %%xmm5\n\t" \
2135 "mulsd %%xmm4, %%xmm5\n\t" \
2136 "addsd %%xmm5, %%xmm2\n\t" \
2137 \
2138 "movsd %%xmm0, %0\n\t" \
2139 "movsd %%xmm1, %1\n\t" \
2140 "movsd %%xmm2, %2\n\t"
2141 : "=m" (dProdSum),
2142 "=m" (dSqSum1),
2143 "=m" (dSqSum2)
2144 : "m" (dpSrc1[iDataCntr]),
2145 "m" (dpSrc2[iDataCntr])
2146 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
2147 }
2148 dScale = 1.0 / iDataLength;
2149 dNormFact = sqrt(dSqSum1 * dSqSum2) * dScale;
2150 dRes = (dProdSum * dScale) / dNormFact;
2151
2152 return dRes;
2153 }
2154
2155
dsp_x86_3dnow_i16tof(float * fpDest,const short * ipSrc,int iDataLength,int iIntMax)2156 void dsp_x86_3dnow_i16tof (float *fpDest, const short *ipSrc, int iDataLength,
2157 int iIntMax)
2158 {
2159 int iDataCntr;
2160 float fScale;
2161
2162 X86_ASM (
2163 "movd %1, %%mm1\n\t" \
2164 "pswapd %%mm1, %%mm2\n\t" \
2165 "paddd %%mm2, %%mm1\n\t" \
2166 "pi2fd %%mm1, %%mm1\n\t" \
2167 "pfrcp %%mm1, %%mm2\n\t" \
2168 "pfrcpit1 %%mm2, %%mm1\n\t" \
2169 "pfrcpit2 %%mm2, %%mm1\n\t" \
2170 "movd %%mm1, %0\n\t"
2171 : "=m" (fScale)
2172 : "m" (iIntMax)
2173 : "mm1", "mm2", "memory");
2174 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr += 2)
2175 {
2176 X86_ASM (
2177 "movd %1, %%mm0\n\t" \
2178 "punpcklwd %%mm0, %%mm0\n\t" \
2179 "pi2fw %%mm0, %%mm0\n\t" \
2180 "pfmul %%mm1, %%mm0\n\t" \
2181 "movntq %%mm0, %0\n\t"
2182 : "=m" (fpDest[iDataCntr])
2183 : "m" (ipSrc[iDataCntr])
2184 : "mm0", "mm1", "memory");
2185 }
2186 X86_ASM (
2187 "femms\n\t" \
2188 "sfence\n\t");
2189 if ((iDataLength % 2) != 0)
2190 {
2191 fpDest[iDataLength - 1] = ((float) ipSrc[iDataLength - 1]) * fScale;
2192 }
2193 }
2194
2195
dsp_x86_3dnow_i32tof(float * fpDest,const int * ipSrc,int iDataLength,int iIntMax)2196 void dsp_x86_3dnow_i32tof (float *fpDest, const int *ipSrc, int iDataLength,
2197 int iIntMax)
2198 {
2199 int iDataCntr;
2200 float fScale;
2201
2202 X86_ASM (
2203 "movd %1, %%mm1\n\t" \
2204 "pswapd %%mm1, %%mm2\n\t" \
2205 "paddd %%mm2, %%mm1\n\t" \
2206 "pi2fd %%mm1, %%mm1\n\t" \
2207 "pfrcp %%mm1, %%mm2\n\t" \
2208 "pfrcpit1 %%mm2, %%mm1\n\t" \
2209 "pfrcpit2 %%mm2, %%mm1\n\t" \
2210 "movd %%mm1, %0\n\t"
2211 : "=m" (fScale)
2212 : "m" (iIntMax)
2213 : "mm1", "mm2", "memory");
2214 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr += 2)
2215 {
2216 X86_ASM (
2217 "movq %1, %%mm0\n\t" \
2218 "pi2fd %%mm0, %%mm0\n\t" \
2219 "pfmul %%mm1, %%mm0\n\t" \
2220 "movntq %%mm0, %0\n\t"
2221 : "=m" (fpDest[iDataCntr])
2222 : "m" (ipSrc[iDataCntr])
2223 : "mm0", "mm1", "memory");
2224 }
2225 X86_ASM (
2226 "femms\n\t" \
2227 "sfence\n\t");
2228 if ((iDataLength % 2) != 0)
2229 {
2230 fpDest[iDataLength - 1] = ((float) ipSrc[iDataLength - 1]) * fScale;
2231 }
2232 }
2233
2234
dsp_x86_3dnow_firf(float * fpDest,const float * fpSrc,int iDataLength,const float * fpCoeff,int iCoeffLength)2235 void dsp_x86_3dnow_firf (float *fpDest, const float *fpSrc, int iDataLength,
2236 const float *fpCoeff, int iCoeffLength)
2237 {
2238 int iSrcCntr;
2239 int iDestCntr;
2240 int iCoeffCntr;
2241 int iSrcCount;
2242 pv2sf m64pDest = (pv2sf) fpDest;
2243
2244 iDestCntr = 0;
2245 iSrcCount = iDataLength + iCoeffLength;
2246 for (iSrcCntr = iCoeffLength;
2247 iSrcCntr < iSrcCount;
2248 iSrcCntr += 2)
2249 {
2250 X86_ASM (
2251 "pxor %%mm0, %%mm0\n\t"
2252 :
2253 :
2254 : "mm0");
2255 for (iCoeffCntr = 0;
2256 iCoeffCntr < iCoeffLength;
2257 iCoeffCntr++)
2258 {
2259 X86_ASM (
2260 "movq %0, %%mm1\n\t" \
2261 "movd %1, %%mm2\n\t" \
2262 "pswapd %%mm2, %%mm3\n\t" \
2263 "pfadd %%mm3, %%mm2\n\t" \
2264 "pfmul %%mm2, %%mm1\n\t" \
2265 "pfadd %%mm1, %%mm0\n\t"
2266 :
2267 : "m" (fpSrc[iSrcCntr - iCoeffCntr]),
2268 "m" (fpCoeff[iCoeffCntr])
2269 : "mm0", "mm1", "mm2", "mm3", "memory");
2270 }
2271 X86_ASM (
2272 "movntq %%mm0, %0\n\t"
2273 : "=m" (m64pDest[iDestCntr++])
2274 :
2275 : "mm0", "memory");
2276 }
2277 if (iDataLength & 0x1)
2278 {
2279 X86_ASM (
2280 "pxor %%mm0, %%mm0\n\t"
2281 :
2282 :
2283 : "mm0");
2284 for (iCoeffCntr = 0;
2285 iCoeffCntr < iCoeffLength;
2286 iCoeffCntr++)
2287 {
2288 X86_ASM (
2289 "movd %0, %%mm1\n\t" \
2290 "movd %1, %%mm2\n\t" \
2291 "pfmul %%mm2, %%mm1\n\t" \
2292 "pfadd %%mm1, %%mm0\n\t"
2293 :
2294 : "m" (fpSrc[iDataLength - 1 - iCoeffCntr]),
2295 "m" (fpCoeff[iCoeffCntr])
2296 : "mm0", "mm1", "mm2", "memory");
2297 }
2298 X86_ASM (
2299 "movd %%mm0, %0\n\t"
2300 : "=m" (fpDest[iDataLength - 1])
2301 :
2302 : "mm0", "memory");
2303 }
2304 X86_ASM (
2305 "femms\n\t" \
2306 "sfence\n\t");
2307 }
2308
2309
dsp_x86_sse_firf(float * fpDest,const float * fpSrc,int iDataLength,const float * fpCoeff,int iCoeffLength)2310 void dsp_x86_sse_firf (float *fpDest, const float *fpSrc, int iDataLength,
2311 const float *fpCoeff, int iCoeffLength)
2312 {
2313 int iDestCntr;
2314 int iSrcCntr;
2315 int iCoeffCntr;
2316 int iSrcCount;
2317
2318 iDestCntr = 0;
2319 iSrcCount = iDataLength + iCoeffLength;
2320 for (iSrcCntr = iCoeffLength;
2321 iSrcCntr < iSrcCount;
2322 iSrcCntr++)
2323 {
2324 X86_ASM (
2325 "xorps %%xmm0, %%xmm0\n\t"
2326 :
2327 :
2328 : "xmm0");
2329 for (iCoeffCntr = 0;
2330 iCoeffCntr < iCoeffLength;
2331 iCoeffCntr++)
2332 {
2333 X86_ASM (
2334 "movss %0, %%xmm1\n\t"
2335 "mulss %1, %%xmm1\n\t"
2336 "addss %%xmm1, %%xmm0\n\t"
2337 :
2338 : "m" (fpSrc[iSrcCntr - iCoeffCntr]),
2339 "m" (fpCoeff[iCoeffCntr])
2340 : "xmm0", "xmm1", "memory");
2341 }
2342 X86_ASM (
2343 "movss %%xmm0, %0\n\t"
2344 : "=m" (fpDest[iDestCntr++])
2345 :
2346 : "xmm0", "memory");
2347 }
2348 }
2349
2350
dsp_x86_sse_fir(double * dpDest,const double * dpSrc,int iDataLength,const double * dpCoeff,int iCoeffLength)2351 void dsp_x86_sse_fir (double *dpDest, const double *dpSrc, int iDataLength,
2352 const double *dpCoeff, int iCoeffLength)
2353 {
2354 int iDestCntr;
2355 int iSrcCntr;
2356 int iCoeffCntr;
2357 int iSrcCount;
2358
2359 iDestCntr = 0;
2360 iSrcCount = iDataLength + iCoeffLength;
2361 for (iSrcCntr = iCoeffLength;
2362 iSrcCntr < iSrcCount;
2363 iSrcCntr++)
2364 {
2365 X86_ASM (
2366 "xorpd %%xmm0, %%xmm0\n\t"
2367 :
2368 :
2369 : "xmm0");
2370 for (iCoeffCntr = 0;
2371 iCoeffCntr < iCoeffLength;
2372 iCoeffCntr++)
2373 {
2374 X86_ASM (
2375 "movsd %0, %%xmm1\n\t"
2376 "mulsd %1, %%xmm1\n\t"
2377 "addsd %%xmm1, %%xmm0\n\t"
2378 :
2379 : "m" (dpSrc[iSrcCntr - iCoeffCntr]),
2380 "m" (dpCoeff[iCoeffCntr])
2381 : "xmm0", "xmm1", "memory");
2382 }
2383 X86_ASM (
2384 "movsd %%xmm0, %0\n\t"
2385 : "=m" (dpDest[iDestCntr++])
2386 :
2387 : "xmm0", "memory");
2388 }
2389 }
2390
2391
dsp_x86_3dnow_iirf(float * fpVect,int iDataLength,const float * fpCoeff,float * fpX,float * fpY)2392 void dsp_x86_3dnow_iirf (float *fpVect, int iDataLength, const float *fpCoeff,
2393 float *fpX, float *fpY)
2394 {
2395 int iDataCntr;
2396 pv2sf m64pCoeff = (pv2sf) &fpCoeff[1];
2397 pv2sf m64pCoeff2 = (pv2sf) &fpCoeff[3];
2398 pv2sf m64pX = (pv2sf) fpX;
2399 pv2sf m64pY = (pv2sf) fpY;
2400
2401 X86_ASM (
2402 "movq %0, %%mm0\n\t" \
2403 "pswapd %%mm0, %%mm2\n\t" \
2404 "movd %1, %%mm3\n\t" \
2405 "movq %2, %%mm0\n\t" \
2406 "pswapd %%mm0, %%mm4\n\t" \
2407 "movq %3, %%mm5\n\t" \
2408 "movq %4, %%mm7\n\t" \
2409 :
2410 : "m" (*m64pCoeff),
2411 "m" (fpCoeff[0]),
2412 "m" (*m64pCoeff2),
2413 "m" (*m64pX),
2414 "m" (*m64pY)
2415 : "mm0", "mm2", "mm3", "mm4", "mm5", "mm7", "memory");
2416 for (iDataCntr = 0;
2417 iDataCntr < iDataLength;
2418 iDataCntr++)
2419 {
2420 X86_ASM (
2421 "pxor %%mm0, %%mm0\n\t" \
2422 "movd %1, %%mm6\n\t" \
2423 "movq %%mm5, %%mm1\n\t" \
2424 "pfmul %%mm2, %%mm1\n\t" \
2425 "pfacc %%mm1, %%mm0\n\t" \
2426 "movq %%mm6, %%mm1\n\t" \
2427 "pfmul %%mm3, %%mm1\n\t" \
2428 "pfacc %%mm1, %%mm0\n\t" \
2429 "movq %%mm7, %%mm1\n\t" \
2430 "pfmul %%mm4, %%mm1\n\t" \
2431 "pfacc %%mm1, %%mm0\n\t" \
2432 "pfacc %%mm0, %%mm0\n\t" \
2433 \
2434 "pswapd %%mm7, %%mm1\n\t" \
2435 "movq %%mm1, %%mm7\n\t" \
2436 "punpckldq %%mm0, %%mm7\n\t" \
2437 \
2438 "pswapd %%mm5, %%mm1\n\t" \
2439 "movq %%mm1, %%mm5\n\t" \
2440 "movq %%mm6, %%mm1\n\t" \
2441 "punpckldq %%mm1, %%mm5\n\t" \
2442 \
2443 "movd %%mm0, %0\n\t"
2444 : "=m" (fpVect[iDataCntr])
2445 : "m0" (fpVect[iDataCntr])
2446 : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
2447 }
2448 X86_ASM (
2449 "movq %%mm5, %0\n\t" \
2450 "movd %%mm6, %1\n\t" \
2451 "movq %%mm7, %2\n\t"
2452 : "=m" (*m64pX),
2453 "=m" (fpX[2]),
2454 "=m" (*m64pY)
2455 :
2456 : "mm5", "mm6", "mm7", "memory");
2457 X86_ASM ("femms\n\t");
2458 }
2459
2460
dsp_x86_sse_iirf(float * fpVect,int iDataLength,const float * fpCoeff,float * fpX,float * fpY)2461 void dsp_x86_sse_iirf (float *fpVect, int iDataLength, const float *fpCoeff,
2462 float *fpX, float *fpY)
2463 {
2464 int iDataCntr;
2465
2466 X86_ASM (
2467 "movss %0, %%xmm1\n\t" \
2468 "movss %1, %%xmm2\n\t" \
2469 "movss %2, %%xmm3\n\t" \
2470 "movss %3, %%xmm4\n\t" \
2471 "prefetchnta %4\n\t"
2472 :
2473 : "m" (fpX[1]),
2474 "m" (fpX[2]),
2475 "m" (fpY[0]),
2476 "m" (fpY[1]),
2477 "m" (fpCoeff[0])
2478 : "xmm1", "xmm2", "xmm3", "xmm4", "memory");
2479 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
2480 {
2481 X86_ASM (
2482 "movss %%xmm1, %%xmm0\n\t" \
2483 "movss %%xmm2, %%xmm1\n\t" \
2484 "movss %1, %%xmm2\n\t" \
2485 \
2486 "movss %2, %%xmm5\n\t" \
2487 "mulss %%xmm2, %%xmm5\n\t" \
2488 "movss %3, %%xmm6\n\t" \
2489 "mulss %%xmm1, %%xmm6\n\t" \
2490 "addss %%xmm6, %%xmm5\n\t" \
2491 "movss %4, %%xmm6\n\t" \
2492 "mulss %%xmm0, %%xmm6\n\t" \
2493 "addss %%xmm6, %%xmm5\n\t" \
2494 \
2495 "movss %5, %%xmm6\n\t" \
2496 "mulss %%xmm4, %%xmm6\n\t" \
2497 "movss %6, %%xmm7\n\t" \
2498 "mulss %%xmm3, %%xmm7\n\t" \
2499 "addss %%xmm7, %%xmm6\n\t" \
2500 \
2501 "addss %%xmm5, %%xmm6\n\t" \
2502 "movss %%xmm4, %%xmm3\n\t" \
2503 "movss %%xmm6, %%xmm4\n\t" \
2504 \
2505 "movss %%xmm6, %0\n\t"
2506 : "=m" (fpVect[iDataCntr])
2507 : "m0" (fpVect[iDataCntr]),
2508 "m" (fpCoeff[0]),
2509 "m" (fpCoeff[1]),
2510 "m" (fpCoeff[2]),
2511 "m" (fpCoeff[3]),
2512 "m" (fpCoeff[4])
2513 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
2514 "memory");
2515 }
2516 X86_ASM (
2517 "movss %%xmm0, %0\n\t" \
2518 "movss %%xmm1, %1\n\t" \
2519 "movss %%xmm2, %2\n\t" \
2520 "movss %%xmm3, %3\n\t" \
2521 "movss %%xmm4, %4\n\t"
2522 : "=m" (fpX[0]),
2523 "=m" (fpX[1]),
2524 "=m" (fpX[2]),
2525 "=m" (fpY[0]),
2526 "=m" (fpY[1])
2527 :
2528 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "memory");
2529 }
2530
2531
dsp_x86_sse_iir(double * dpVect,int iDataLength,const double * dpCoeff,double * dpX,double * dpY)2532 void dsp_x86_sse_iir (double *dpVect, int iDataLength, const double *dpCoeff,
2533 double *dpX, double *dpY)
2534 {
2535 int iDataCntr;
2536
2537 X86_ASM (
2538 "movsd %0, %%xmm1\n\t" \
2539 "movsd %1, %%xmm2\n\t" \
2540 "movsd %2, %%xmm3\n\t" \
2541 "movsd %3, %%xmm4\n\t" \
2542 "prefetchnta %4\n\t" \
2543 "prefetchnta %5\n\t"
2544 :
2545 : "m" (dpX[1]),
2546 "m" (dpX[2]),
2547 "m" (dpY[0]),
2548 "m" (dpY[1]),
2549 "m" (dpCoeff[0]),
2550 "m" (dpCoeff[3])
2551 : "xmm1", "xmm2", "xmm3", "xmm4", "memory");
2552 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
2553 {
2554 X86_ASM (
2555 "movsd %%xmm1, %%xmm0\n\t" \
2556 "movsd %%xmm2, %%xmm1\n\t" \
2557 "movsd %1, %%xmm2\n\t" \
2558 \
2559 "movsd %2, %%xmm5\n\t" \
2560 "mulsd %%xmm2, %%xmm5\n\t" \
2561 "movsd %3, %%xmm6\n\t" \
2562 "mulsd %%xmm1, %%xmm6\n\t" \
2563 "addsd %%xmm6, %%xmm5\n\t" \
2564 "movsd %4, %%xmm6\n\t" \
2565 "mulsd %%xmm0, %%xmm6\n\t" \
2566 "addsd %%xmm6, %%xmm5\n\t" \
2567 \
2568 "movsd %5, %%xmm6\n\t" \
2569 "mulsd %%xmm4, %%xmm6\n\t" \
2570 "movsd %6, %%xmm7\n\t" \
2571 "mulsd %%xmm3, %%xmm7\n\t" \
2572 "addsd %%xmm7, %%xmm6\n\t" \
2573 \
2574 "addsd %%xmm5, %%xmm6\n\t" \
2575 "movsd %%xmm4, %%xmm3\n\t" \
2576 "movsd %%xmm6, %%xmm4\n\t" \
2577 \
2578 "movsd %%xmm6, %0\n\t"
2579 : "=m" (dpVect[iDataCntr])
2580 : "m0" (dpVect[iDataCntr]),
2581 "m" (dpCoeff[0]),
2582 "m" (dpCoeff[1]),
2583 "m" (dpCoeff[2]),
2584 "m" (dpCoeff[3]),
2585 "m" (dpCoeff[4])
2586 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
2587 "memory");
2588 }
2589 X86_ASM (
2590 "movsd %%xmm0, %0\n\t" \
2591 "movsd %%xmm1, %1\n\t" \
2592 "movsd %%xmm2, %2\n\t" \
2593 "movsd %%xmm3, %3\n\t" \
2594 "movsd %%xmm4, %4\n\t"
2595 : "=m" (dpX[0]),
2596 "=m" (dpX[1]),
2597 "=m" (dpX[2]),
2598 "=m" (dpY[0]),
2599 "=m" (dpY[1])
2600 :
2601 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "memory");
2602 }
2603
2604
dsp_x86_3dnow_iirf_nip(float * fpDest,const float * fpSrc,int iDataLength,const float * fpCoeff,float * fpX,float * fpY)2605 void dsp_x86_3dnow_iirf_nip (float *fpDest, const float *fpSrc, int iDataLength,
2606 const float *fpCoeff, float *fpX, float *fpY)
2607 {
2608 int iDataCntr;
2609 pv2sf m64pCoeff = (pv2sf) &fpCoeff[1];
2610 pv2sf m64pCoeff2 = (pv2sf) &fpCoeff[3];
2611 pv2sf m64pX = (pv2sf) fpX;
2612 pv2sf m64pY = (pv2sf) fpY;
2613
2614 X86_ASM (
2615 "movq %0, %%mm0\n\t" \
2616 "pswapd %%mm0, %%mm2\n\t" \
2617 "movd %1, %%mm3\n\t" \
2618 "movq %2, %%mm0\n\t" \
2619 "pswapd %%mm0, %%mm4\n\t" \
2620 "movq %3, %%mm5\n\t" \
2621 "movq %4, %%mm7\n\t" \
2622 :
2623 : "m" (*m64pCoeff),
2624 "m" (fpCoeff[0]),
2625 "m" (*m64pCoeff2),
2626 "m" (*m64pX),
2627 "m" (*m64pY)
2628 : "mm0", "mm2", "mm3", "mm4", "mm5", "mm7", "memory");
2629 for (iDataCntr = 0;
2630 iDataCntr < iDataLength;
2631 iDataCntr++)
2632 {
2633 X86_ASM (
2634 "pxor %%mm0, %%mm0\n\t" \
2635 "movd %1, %%mm6\n\t" \
2636 "movq %%mm5, %%mm1\n\t" \
2637 "pfmul %%mm2, %%mm1\n\t" \
2638 "pfacc %%mm1, %%mm0\n\t" \
2639 "movq %%mm6, %%mm1\n\t" \
2640 "pfmul %%mm3, %%mm1\n\t" \
2641 "pfacc %%mm1, %%mm0\n\t" \
2642 "movq %%mm7, %%mm1\n\t" \
2643 "pfmul %%mm4, %%mm1\n\t" \
2644 "pfacc %%mm1, %%mm0\n\t" \
2645 "pfacc %%mm0, %%mm0\n\t" \
2646 \
2647 "pswapd %%mm7, %%mm1\n\t" \
2648 "movq %%mm1, %%mm7\n\t" \
2649 "punpckldq %%mm0, %%mm7\n\t" \
2650 \
2651 "pswapd %%mm5, %%mm1\n\t" \
2652 "movq %%mm1, %%mm5\n\t" \
2653 "movq %%mm6, %%mm1\n\t" \
2654 "punpckldq %%mm1, %%mm5\n\t" \
2655 \
2656 "movd %%mm0, %0\n\t"
2657 : "=m" (fpDest[iDataCntr])
2658 : "m" (fpSrc[iDataCntr])
2659 : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "memory");
2660 }
2661 X86_ASM (
2662 "movq %%mm5, %0\n\t" \
2663 "movd %%mm6, %1\n\t" \
2664 "movq %%mm7, %2\n\t"
2665 : "=m" (*m64pX),
2666 "=m" (fpX[2]),
2667 "=m" (*m64pY)
2668 :
2669 : "mm5", "mm6", "mm7", "memory");
2670 X86_ASM ("femms\n\t");
2671 }
2672
2673
dsp_x86_sse_iirf_nip(float * fpDest,const float * fpSrc,int iDataLength,const float * fpCoeff,float * fpX,float * fpY)2674 void dsp_x86_sse_iirf_nip (float *fpDest, const float *fpSrc, int iDataLength,
2675 const float *fpCoeff, float *fpX, float *fpY)
2676 {
2677 int iDataCntr;
2678
2679 X86_ASM (
2680 "movss %0, %%xmm1\n\t" \
2681 "movss %1, %%xmm2\n\t" \
2682 "movss %2, %%xmm3\n\t" \
2683 "movss %3, %%xmm4\n\t" \
2684 "prefetchnta %4\n\t"
2685 :
2686 : "m" (fpX[1]),
2687 "m" (fpX[2]),
2688 "m" (fpY[0]),
2689 "m" (fpY[1]),
2690 "m" (fpCoeff[0])
2691 : "xmm1", "xmm2", "xmm3", "xmm4", "memory");
2692 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
2693 {
2694 X86_ASM (
2695 "movss %%xmm1, %%xmm0\n\t" \
2696 "movss %%xmm2, %%xmm1\n\t" \
2697 "movss %1, %%xmm2\n\t" \
2698 \
2699 "movss %2, %%xmm5\n\t" \
2700 "mulss %%xmm2, %%xmm5\n\t" \
2701 "movss %3, %%xmm6\n\t" \
2702 "mulss %%xmm1, %%xmm6\n\t" \
2703 "addss %%xmm6, %%xmm5\n\t" \
2704 "movss %4, %%xmm6\n\t" \
2705 "mulss %%xmm0, %%xmm6\n\t" \
2706 "addss %%xmm6, %%xmm5\n\t" \
2707 \
2708 "movss %5, %%xmm6\n\t" \
2709 "mulss %%xmm4, %%xmm6\n\t" \
2710 "movss %6, %%xmm7\n\t" \
2711 "mulss %%xmm3, %%xmm7\n\t" \
2712 "addss %%xmm7, %%xmm6\n\t" \
2713 \
2714 "addss %%xmm5, %%xmm6\n\t" \
2715 "movss %%xmm4, %%xmm3\n\t" \
2716 "movss %%xmm6, %%xmm4\n\t" \
2717 \
2718 "movss %%xmm6, %0\n\t"
2719 : "=m" (fpDest[iDataCntr])
2720 : "m" (fpSrc[iDataCntr]),
2721 "m" (fpCoeff[0]),
2722 "m" (fpCoeff[1]),
2723 "m" (fpCoeff[2]),
2724 "m" (fpCoeff[3]),
2725 "m" (fpCoeff[4])
2726 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
2727 "memory");
2728 }
2729 X86_ASM (
2730 "movss %%xmm0, %0\n\t" \
2731 "movss %%xmm1, %1\n\t" \
2732 "movss %%xmm2, %2\n\t" \
2733 "movss %%xmm3, %3\n\t" \
2734 "movss %%xmm4, %4\n\t"
2735 : "=m" (fpX[0]),
2736 "=m" (fpX[1]),
2737 "=m" (fpX[2]),
2738 "=m" (fpY[0]),
2739 "=m" (fpY[1])
2740 :
2741 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "memory");
2742 }
2743
2744
dsp_x86_sse_iir_nip(double * dpDest,const double * dpSrc,int iDataLength,const double * dpCoeff,double * dpX,double * dpY)2745 void dsp_x86_sse_iir_nip (double *dpDest, const double *dpSrc, int iDataLength,
2746 const double *dpCoeff, double *dpX, double *dpY)
2747 {
2748 int iDataCntr;
2749
2750 X86_ASM (
2751 "movsd %0, %%xmm1\n\t" \
2752 "movsd %1, %%xmm2\n\t" \
2753 "movsd %2, %%xmm3\n\t" \
2754 "movsd %3, %%xmm4\n\t" \
2755 "prefetchnta %4\n\t" \
2756 "prefetchnta %5\n\t"
2757 :
2758 : "m" (dpX[1]),
2759 "m" (dpX[2]),
2760 "m" (dpY[0]),
2761 "m" (dpY[1]),
2762 "m" (dpCoeff[0]),
2763 "m" (dpCoeff[3])
2764 : "xmm1", "xmm2", "xmm3", "xmm4", "memory");
2765 for (iDataCntr = 0; iDataCntr < iDataLength; iDataCntr++)
2766 {
2767 X86_ASM (
2768 "movsd %%xmm1, %%xmm0\n\t" \
2769 "movsd %%xmm2, %%xmm1\n\t" \
2770 "movsd %1, %%xmm2\n\t" \
2771 \
2772 "movsd %2, %%xmm5\n\t" \
2773 "mulsd %%xmm2, %%xmm5\n\t" \
2774 "movsd %3, %%xmm6\n\t" \
2775 "mulsd %%xmm1, %%xmm6\n\t" \
2776 "addsd %%xmm6, %%xmm5\n\t" \
2777 "movsd %4, %%xmm6\n\t" \
2778 "mulsd %%xmm0, %%xmm6\n\t" \
2779 "addsd %%xmm6, %%xmm5\n\t" \
2780 \
2781 "movsd %5, %%xmm6\n\t" \
2782 "mulsd %%xmm4, %%xmm6\n\t" \
2783 "movsd %6, %%xmm7\n\t" \
2784 "mulsd %%xmm3, %%xmm7\n\t" \
2785 "addsd %%xmm7, %%xmm6\n\t" \
2786 \
2787 "addsd %%xmm5, %%xmm6\n\t" \
2788 "movsd %%xmm4, %%xmm3\n\t" \
2789 "movsd %%xmm6, %%xmm4\n\t" \
2790 \
2791 "movsd %%xmm6, %0\n\t"
2792 : "=m" (dpDest[iDataCntr])
2793 : "m" (dpSrc[iDataCntr]),
2794 "m" (dpCoeff[0]),
2795 "m" (dpCoeff[1]),
2796 "m" (dpCoeff[2]),
2797 "m" (dpCoeff[3]),
2798 "m" (dpCoeff[4])
2799 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
2800 "memory");
2801 }
2802 X86_ASM (
2803 "movsd %%xmm0, %0\n\t" \
2804 "movsd %%xmm1, %1\n\t" \
2805 "movsd %%xmm2, %2\n\t" \
2806 "movsd %%xmm3, %3\n\t" \
2807 "movsd %%xmm4, %4\n\t"
2808 : "=m" (dpX[0]),
2809 "=m" (dpX[1]),
2810 "=m" (dpX[2]),
2811 "=m" (dpY[0]),
2812 "=m" (dpY[1])
2813 :
2814 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "memory");
2815 }
2816
2817
2818 #ifdef __cplusplus
2819 }
2820 #endif
2821
2822 #endif
2823