;*****************************************************************************
;* x86-optimized Float DSP functions
;*
;* Copyright 2006 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86util.asm"

SECTION_RODATA 32
pd_reverse: dd 7, 6, 5, 4, 3, 2, 1, 0

SECTION .text

;-----------------------------------------------------------------------------
; void vector_fmul(float *dst, const float *src0, const float *src1, int len)
;-----------------------------------------------------------------------------
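; Roughly equivalent scalar C, as an illustrative sketch only (not the exact
; C fallback):
;     for (int i = 0; i < len; i++)
;         dst[i] = src0[i] * src1[i];
; The loop below processes 64 bytes (16 floats) per iteration with aligned
; loads/stores, so len is assumed to be a multiple of 16 and the pointers
; suitably aligned.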
%macro VECTOR_FMUL 0
cglobal vector_fmul, 4,4,2, dst, src0, src1, len
    lea       lenq, [lend*4 - 64]
ALIGN 16
.loop:
%assign a 0
%rep 32/mmsize
    mova      m0,   [src0q + lenq + (a+0)*mmsize]
    mova      m1,   [src0q + lenq + (a+1)*mmsize]
    mulps     m0, m0, [src1q + lenq + (a+0)*mmsize]
    mulps     m1, m1, [src1q + lenq + (a+1)*mmsize]
    mova      [dstq + lenq + (a+0)*mmsize], m0
    mova      [dstq + lenq + (a+1)*mmsize], m1
%assign a a+2
%endrep

    sub       lenq, 64
    jge       .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL
%endif

;-----------------------------------------------------------------------------
; void vector_dmul(double *dst, const double *src0, const double *src1, int len)
;-----------------------------------------------------------------------------
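; Double-precision counterpart of vector_fmul above; roughly, in scalar C
; (illustrative sketch only):
;     for (int i = 0; i < len; i++)
;         dst[i] = src0[i] * src1[i];   /* doubles */
; Each iteration handles mmsize*4 bytes, i.e. 8 doubles with SSE2 and 16
; with AVX, so len is assumed to be a multiple of the per-iteration count.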
%macro VECTOR_DMUL 0
cglobal vector_dmul, 4,4,4, dst, src0, src1, len
    lea       lenq, [lend*8 - mmsize*4]
ALIGN 16
.loop:
    movaps    m0,     [src0q + lenq + 0*mmsize]
    movaps    m1,     [src0q + lenq + 1*mmsize]
    movaps    m2,     [src0q + lenq + 2*mmsize]
    movaps    m3,     [src0q + lenq + 3*mmsize]
    mulpd     m0, m0, [src1q + lenq + 0*mmsize]
    mulpd     m1, m1, [src1q + lenq + 1*mmsize]
    mulpd     m2, m2, [src1q + lenq + 2*mmsize]
    mulpd     m3, m3, [src1q + lenq + 3*mmsize]
    movaps    [dstq + lenq + 0*mmsize], m0
    movaps    [dstq + lenq + 1*mmsize], m1
    movaps    [dstq + lenq + 2*mmsize], m2
    movaps    [dstq + lenq + 3*mmsize], m3

    sub       lenq, mmsize*4
    jge       .loop
    RET
%endmacro

INIT_XMM sse2
VECTOR_DMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
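; Roughly equivalent scalar C (illustrative sketch only):
;     for (int i = 0; i < len; i++)
;         dst[i] += src[i] * mul;
; The scalar mul is broadcast into m0 once up front; every iteration then
; covers 64 bytes (16 floats) regardless of vector width, using four
; registers with SSE and two with AVX/FMA3.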

%macro VECTOR_FMAC_SCALAR 0
%if UNIX64
cglobal vector_fmac_scalar, 3,3,5, dst, src, len
%else
cglobal vector_fmac_scalar, 4,4,5, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSS m0, mulm
%else
%if WIN64
    SWAP 0, 2
%endif
    shufps      xm0, xm0, 0
%if cpuflag(avx)
    vinsertf128  m0, m0, xm0, 1
%endif
%endif
    lea    lenq, [lend*4-64]
.loop:
%if cpuflag(fma3)
    mova     m1,     [dstq+lenq]
    mova     m2,     [dstq+lenq+1*mmsize]
    fmaddps  m1, m0, [srcq+lenq], m1
    fmaddps  m2, m0, [srcq+lenq+1*mmsize], m2
%else ; cpuflag
    mulps    m1, m0, [srcq+lenq]
    mulps    m2, m0, [srcq+lenq+1*mmsize]
%if mmsize < 32
    mulps    m3, m0, [srcq+lenq+2*mmsize]
    mulps    m4, m0, [srcq+lenq+3*mmsize]
%endif ; mmsize
    addps    m1, m1, [dstq+lenq]
    addps    m2, m2, [dstq+lenq+1*mmsize]
%if mmsize < 32
    addps    m3, m3, [dstq+lenq+2*mmsize]
    addps    m4, m4, [dstq+lenq+3*mmsize]
%endif ; mmsize
%endif ; cpuflag
    mova  [dstq+lenq], m1
    mova  [dstq+lenq+1*mmsize], m2
%if mmsize < 32
    mova  [dstq+lenq+2*mmsize], m3
    mova  [dstq+lenq+3*mmsize], m4
%endif ; mmsize
    sub    lenq, 64
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMAC_SCALAR
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMAC_SCALAR
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
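; Roughly equivalent scalar C (illustrative sketch only):
;     for (int i = 0; i < len; i++)
;         dst[i] = src[i] * mul;
; Only the SSE version is instantiated, processing 4 floats per iteration,
; so len is assumed to be a multiple of 4.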

%macro VECTOR_FMUL_SCALAR 0
%if UNIX64
cglobal vector_fmul_scalar, 3,3,2, dst, src, len
%else
cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    movss    m0, mulm
%elif WIN64
    SWAP 0, 2
%endif
    shufps   m0, m0, 0
    lea    lenq, [lend*4-mmsize]
.loop:
    mova     m1, [srcq+lenq]
    mulps    m1, m0
    mova  [dstq+lenq], m1
    sub    lenq, mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_SCALAR

;------------------------------------------------------------------------------
; void ff_vector_dmac_scalar(double *dst, const double *src, double mul,
;                            int len)
;------------------------------------------------------------------------------
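; Double-precision counterpart of vector_fmac_scalar; roughly, in scalar C
; (illustrative sketch only):
;     for (int i = 0; i < len; i++)
;         dst[i] += src[i] * mul;   /* doubles */
; Each iteration covers mmsize*4 bytes: 8 doubles with SSE2, 16 with
; AVX/FMA3.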

%macro VECTOR_DMAC_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmac_scalar, 2,4,5, dst, src, mul, len, lenaddr
    mov          lenq, lenaddrm
    VBROADCASTSD m0, mulm
%else
%if UNIX64
cglobal vector_dmac_scalar, 3,3,5, dst, src, len
%else
cglobal vector_dmac_scalar, 4,4,5, dst, src, mul, len
    SWAP 0, 2
%endif
    movlhps     xm0, xm0
%if cpuflag(avx)
    vinsertf128  m0, m0, xm0, 1
%endif
%endif
    lea    lenq, [lend*8-mmsize*4]
.loop:
%if cpuflag(fma3)
    movaps   m1,     [dstq+lenq]
    movaps   m2,     [dstq+lenq+1*mmsize]
    movaps   m3,     [dstq+lenq+2*mmsize]
    movaps   m4,     [dstq+lenq+3*mmsize]
    fmaddpd  m1, m0, [srcq+lenq], m1
    fmaddpd  m2, m0, [srcq+lenq+1*mmsize], m2
    fmaddpd  m3, m0, [srcq+lenq+2*mmsize], m3
    fmaddpd  m4, m0, [srcq+lenq+3*mmsize], m4
%else ; cpuflag
    mulpd    m1, m0, [srcq+lenq]
    mulpd    m2, m0, [srcq+lenq+1*mmsize]
    mulpd    m3, m0, [srcq+lenq+2*mmsize]
    mulpd    m4, m0, [srcq+lenq+3*mmsize]
    addpd    m1, m1, [dstq+lenq]
    addpd    m2, m2, [dstq+lenq+1*mmsize]
    addpd    m3, m3, [dstq+lenq+2*mmsize]
    addpd    m4, m4, [dstq+lenq+3*mmsize]
%endif ; cpuflag
    movaps [dstq+lenq], m1
    movaps [dstq+lenq+1*mmsize], m2
    movaps [dstq+lenq+2*mmsize], m3
    movaps [dstq+lenq+3*mmsize], m4
    sub    lenq, mmsize*4
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_DMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMAC_SCALAR
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_DMAC_SCALAR
%endif

;------------------------------------------------------------------------------
; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
;                            int len)
;------------------------------------------------------------------------------
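; Double-precision counterpart of vector_fmul_scalar; roughly, in scalar C
; (illustrative sketch only):
;     for (int i = 0; i < len; i++)
;         dst[i] = src[i] * mul;   /* doubles */
; Two registers are written per iteration: 4 doubles with SSE2, 8 with AVX.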

%macro VECTOR_DMUL_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr
    mov          lenq, lenaddrm
%elif UNIX64
cglobal vector_dmul_scalar, 3,3,3, dst, src, len
%else
cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSD   m0, mulm
%else
%if WIN64
    SWAP 0, 2
%endif
    movlhps       xm0, xm0
%if cpuflag(avx)
    vinsertf128   ym0, ym0, xm0, 1
%endif
%endif
    lea          lenq, [lend*8-2*mmsize]
.loop:
    mulpd          m1, m0, [srcq+lenq       ]
    mulpd          m2, m0, [srcq+lenq+mmsize]
    movaps [dstq+lenq       ], m1
    movaps [dstq+lenq+mmsize], m2
    sub          lenq, 2*mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_DMUL_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL_SCALAR
%endif

;-----------------------------------------------------------------------------
; vector_fmul_window(float *dst, const float *src0,
;                    const float *src1, const float *win, int len);
;-----------------------------------------------------------------------------
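; Roughly equivalent scalar C for this overlap-window operation
; (illustrative sketch only of what the loop below computes):
;     dst += len; win += len; src0 += len;
;     for (int i = -len, j = len - 1; i < 0; i++, j--) {
;         float s0 = src0[i], s1 = src1[j], wi = win[i], wj = win[j];
;         dst[i] = s0 * wj - s1 * wi;
;         dst[j] = s0 * wi + s1 * wj;
;     }
; The asm walks lenq upward and len1q downward at the same time, reversing
; the second operand of each pair with shufps (SSE) or pswapd (3DNowExt).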
%macro VECTOR_FMUL_WINDOW 0
cglobal vector_fmul_window, 5, 6, 6, dst, src0, src1, win, len, len1
    shl     lend, 2
    lea    len1q, [lenq - mmsize]
    add    src0q, lenq
    add     dstq, lenq
    add     winq, lenq
    neg     lenq
.loop:
    mova      m0, [winq  + lenq]
    mova      m4, [src0q + lenq]
%if cpuflag(sse)
    mova      m1, [winq  + len1q]
    mova      m5, [src1q + len1q]
    shufps    m1, m1, 0x1b
    shufps    m5, m5, 0x1b
    mova      m2, m0
    mova      m3, m1
    mulps     m2, m4
    mulps     m3, m5
    mulps     m1, m4
    mulps     m0, m5
    addps     m2, m3
    subps     m1, m0
    shufps    m2, m2, 0x1b
%else
    pswapd    m1, [winq  + len1q]
    pswapd    m5, [src1q + len1q]
    mova      m2, m0
    mova      m3, m1
    pfmul     m2, m4
    pfmul     m3, m5
    pfmul     m1, m4
    pfmul     m0, m5
    pfadd     m2, m3
    pfsub     m1, m0
    pswapd    m2, m2
%endif
    mova      [dstq + lenq], m1
    mova      [dstq + len1q], m2
    sub       len1q, mmsize
    add       lenq,  mmsize
    jl .loop
%if mmsize == 8
    femms
%endif
    REP_RET
%endmacro

INIT_MMX 3dnowext
VECTOR_FMUL_WINDOW
INIT_XMM sse
VECTOR_FMUL_WINDOW

;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
;                 const float *src2, int len)
;-----------------------------------------------------------------------------
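; Roughly equivalent scalar C (illustrative sketch only):
;     for (int i = 0; i < len; i++)
;         dst[i] = src0[i] * src1[i] + src2[i];
; The FMA3 variant fuses the multiply and add into a single fmaddps; the
; SSE/AVX variants use a separate mulps/addps pair.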
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
    mova    m0,   [src0q + lenq]
    mova    m1,   [src0q + lenq + mmsize]
%if cpuflag(fma3)
    mova    m2,     [src2q + lenq]
    mova    m3,     [src2q + lenq + mmsize]
    fmaddps m0, m0, [src1q + lenq], m2
    fmaddps m1, m1, [src1q + lenq + mmsize], m3
%else
    mulps   m0, m0, [src1q + lenq]
    mulps   m1, m1, [src1q + lenq + mmsize]
    addps   m0, m0, [src2q + lenq]
    addps   m1, m1, [src2q + lenq + mmsize]
%endif
    mova    [dstq + lenq], m0
    mova    [dstq + lenq + mmsize], m1

    sub     lenq,   2*mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_ADD
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_ADD
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMUL_ADD
%endif

;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
;                          int len)
;-----------------------------------------------------------------------------
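; Roughly equivalent scalar C (illustrative sketch only):
;     for (int i = 0; i < len; i++)
;         dst[i] = src0[i] * src1[len - 1 - i];
; src1 is read forward while dst/src0 are walked backward through lenq, so
; each block of src1 is reversed in-register first: vpermps with the
; pd_reverse table on AVX2, vinsertf128+vshufps on AVX, shufps on SSE.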
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
%if cpuflag(avx2)
    movaps  m2, [pd_reverse]
%endif
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
%if cpuflag(avx2)
    vpermps m0, m2, [src1q]
    vpermps m1, m2, [src1q+mmsize]
%elif cpuflag(avx)
    vmovaps     xmm0, [src1q + 16]
    vinsertf128 m0, m0, [src1q], 1
    vshufps     m0, m0, m0, q0123
    vmovaps     xmm1, [src1q + mmsize + 16]
    vinsertf128 m1, m1, [src1q + mmsize], 1
    vshufps     m1, m1, m1, q0123
%else
    mova    m0, [src1q]
    mova    m1, [src1q + mmsize]
    shufps  m0, m0, q0123
    shufps  m1, m1, q0123
%endif
    mulps   m0, m0, [src0q + lenq + mmsize]
    mulps   m1, m1, [src0q + lenq]
    movaps  [dstq + lenq + mmsize], m0
    movaps  [dstq + lenq], m1
    add     src1q, 2*mmsize
    sub     lenq,  2*mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_REVERSE
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_REVERSE
%endif
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
VECTOR_FMUL_REVERSE
%endif

; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
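; Roughly, in scalar C (illustrative sketch only):
;     float sum = 0.f;
;     for (int i = 0; i < len; i++)
;         sum += v1[i] * v2[i];
;     return sum;
; Four partial sums are kept in xmm0 and reduced with movhlps/shufps at the
; end; on x86-32 the result is moved to the x87 stack with fld, as the
; calling convention there returns floats in st0.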
INIT_XMM sse
cglobal scalarproduct_float, 3,3,2, v1, v2, offset
    shl   offsetd, 2
    add       v1q, offsetq
    add       v2q, offsetq
    neg   offsetq
    xorps    xmm0, xmm0
.loop:
    movaps   xmm1, [v1q+offsetq]
    mulps    xmm1, [v2q+offsetq]
    addps    xmm0, xmm1
    add   offsetq, 16
    js .loop
    movhlps  xmm1, xmm0
    addps    xmm0, xmm1
    movss    xmm1, xmm0
    shufps   xmm0, xmm0, 1
    addss    xmm0, xmm1
%if ARCH_X86_64 == 0
    movss     r0m,  xmm0
    fld dword r0m
%endif
    RET

;-----------------------------------------------------------------------------
; void ff_butterflies_float(float *src0, float *src1, int len);
;-----------------------------------------------------------------------------
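; Roughly equivalent scalar C (illustrative sketch only):
;     for (int i = 0; i < len; i++) {
;         float t  = src0[i] - src1[i];
;         src0[i] += src1[i];
;         src1[i]  = t;
;     }
; Both arrays are updated in place, one xmm register (4 floats) per
; iteration.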
INIT_XMM sse
cglobal butterflies_float, 3,3,3, src0, src1, len
    shl       lend, 2
    add      src0q, lenq
    add      src1q, lenq
    neg       lenq
.loop:
    mova        m0, [src0q + lenq]
    mova        m1, [src1q + lenq]
    subps       m2, m0, m1
    addps       m0, m0, m1
    mova        [src1q + lenq], m2
    mova        [src0q + lenq], m0
    add       lenq, mmsize
    jl .loop
    REP_RET