;******************************************************************************
;* SIMD optimized MPEG-4 Parametric Stereo decoding functions
;*
;* Copyright (C) 2015 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000

SECTION .text

;*************************************************************************
;void ff_ps_add_squares_<opt>(float *dst, const float (*src)[2], int n);
;*************************************************************************
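; The routine accumulates the squared magnitude of each complex input into
; dst. A rough C equivalent (an illustrative sketch only, not the canonical
; scalar implementation):
;
;   static void ps_add_squares(float *dst, const float (*src)[2], int n)
;   {
;       for (int i = 0; i < n; i++)
;           dst[i] += src[i][0] * src[i][0] + src[i][1] * src[i][1];
;   }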
%macro PS_ADD_SQUARES 1
cglobal ps_add_squares, 3, 3, %1, dst, src, n
    shl    nd, 3
    add  srcq, nq
    neg    nq

align 16
.loop:
    movaps m0, [srcq+nq]
    movaps m1, [srcq+nq+mmsize]
    mulps  m0, m0
    mulps  m1, m1
    HADDPS m0, m1, m2
    addps  m0, [dstq]
    movaps [dstq], m0
    add  dstq, mmsize
    add    nq, mmsize*2
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse
PS_ADD_SQUARES 2
INIT_XMM sse3
PS_ADD_SQUARES 3

;*******************************************************************
;void ff_ps_mul_pair_single_sse(float (*dst)[2], float (*src0)[2],
;                               float *src1, int n);
;*******************************************************************
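; Each complex value in src0 is scaled by the matching real value in src1.
; A rough C equivalent (illustrative sketch; parameter names follow the
; prototype above, not the register names used below):
;
;   static void ps_mul_pair_single(float (*dst)[2], float (*src0)[2],
;                                  float *src1, int n)
;   {
;       for (int i = 0; i < n; i++) {
;           dst[i][0] = src0[i][0] * src1[i];
;           dst[i][1] = src0[i][1] * src1[i];
;       }
;   }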
INIT_XMM sse
cglobal ps_mul_pair_single, 4, 4, 4, dst, src1, src2, n
    shl      nd, 3
    add   src1q, nq
    add    dstq, nq
    neg      nq

align 16
.loop:
    movu     m0, [src1q+nq]
    movu     m1, [src1q+nq+mmsize]
    mova     m2, [src2q]
    mova     m3, m2
    unpcklps m2, m2
    unpckhps m3, m3
    mulps    m0, m2
    mulps    m1, m3
    mova [dstq+nq], m0
    mova [dstq+nq+mmsize], m1
    add   src2q, mmsize
    add      nq, mmsize*2
    jl .loop
    REP_RET

;***********************************************************************
;void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
;                                   float h[2][4], float h_step[2][4],
;                                   int len);
;***********************************************************************
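; The four mixing gains h[0][0..3] are advanced by their step values once per
; sample and then applied as a 2x2 matrix to the (l, r) complex pair. A rough
; C equivalent (illustrative sketch only):
;
;   static void ps_stereo_interpolate(float (*l)[2], float (*r)[2],
;                                     float h[2][4], float h_step[2][4],
;                                     int len)
;   {
;       float h0 = h[0][0], h1 = h[0][1], h2 = h[0][2], h3 = h[0][3];
;       for (int n = 0; n < len; n++) {
;           float l_re = l[n][0], l_im = l[n][1];
;           float r_re = r[n][0], r_im = r[n][1];
;           h0 += h_step[0][0]; h1 += h_step[0][1];
;           h2 += h_step[0][2]; h3 += h_step[0][3];
;           l[n][0] = h0 * l_re + h2 * r_re;
;           l[n][1] = h0 * l_im + h2 * r_im;
;           r[n][0] = h1 * l_re + h3 * r_re;
;           r[n][1] = h1 * l_im + h3 * r_im;
;       }
;   }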
INIT_XMM sse3
cglobal ps_stereo_interpolate, 5, 5, 6, l, r, h, h_step, n
    movaps   m0, [hq]
    movaps   m1, [h_stepq]
    unpcklps m4, m0, m0
    unpckhps m0, m0
    unpcklps m5, m1, m1
    unpckhps m1, m1
    shl      nd, 3
    add      lq, nq
    add      rq, nq
    neg      nq

align 16
.loop:
    addps    m4, m5
    addps    m0, m1
    movddup  m2, [lq+nq]
    movddup  m3, [rq+nq]
    mulps    m2, m4
    mulps    m3, m0
    addps    m2, m3
    movsd  [lq+nq], m2
    movhps [rq+nq], m2
    add      nq, 8
    jl .loop
    REP_RET

;***************************************************************************
;void ff_ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
;                                          float h[2][4], float h_step[2][4],
;                                          int len);
;***************************************************************************
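; Same interpolation as above, but with a second set of gains h[1][0..3] that
; are applied to the re/im-swapped inputs with alternating signs (the addsubps
; pairs below). A rough C equivalent (illustrative sketch only):
;
;   static void ps_stereo_interpolate_ipdopd(float (*l)[2], float (*r)[2],
;                                            float h[2][4], float h_step[2][4],
;                                            int len)
;   {
;       float h00 = h[0][0], h01 = h[0][1], h02 = h[0][2], h03 = h[0][3];
;       float h10 = h[1][0], h11 = h[1][1], h12 = h[1][2], h13 = h[1][3];
;       for (int n = 0; n < len; n++) {
;           float l_re = l[n][0], l_im = l[n][1];
;           float r_re = r[n][0], r_im = r[n][1];
;           h00 += h_step[0][0]; h01 += h_step[0][1];
;           h02 += h_step[0][2]; h03 += h_step[0][3];
;           h10 += h_step[1][0]; h11 += h_step[1][1];
;           h12 += h_step[1][2]; h13 += h_step[1][3];
;           l[n][0] = h00 * l_re + h02 * r_re - h10 * l_im - h12 * r_im;
;           l[n][1] = h00 * l_im + h02 * r_im + h10 * l_re + h12 * r_re;
;           r[n][0] = h01 * l_re + h03 * r_re - h11 * l_im - h13 * r_im;
;           r[n][1] = h01 * l_im + h03 * r_im + h11 * l_re + h13 * r_re;
;       }
;   }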
INIT_XMM sse3
cglobal ps_stereo_interpolate_ipdopd, 5, 5, 10, l, r, h, h_step, n
    movaps   m0, [hq]
    movaps   m1, [hq+mmsize]
%if ARCH_X86_64
    movaps   m8, [h_stepq]
    movaps   m9, [h_stepq+mmsize]
    %define  H_STEP0 m8
    %define  H_STEP1 m9
%else
    %define  H_STEP0 [h_stepq]
    %define  H_STEP1 [h_stepq+mmsize]
%endif
    shl      nd, 3
    add      lq, nq
    add      rq, nq
    neg      nq

align 16
.loop:
    addps    m0, H_STEP0
    addps    m1, H_STEP1
    movddup  m2, [lq+nq]
    movddup  m3, [rq+nq]
    shufps   m4, m2, m2, q2301
    shufps   m5, m3, m3, q2301
    unpcklps m6, m0, m0
    unpckhps m7, m0, m0
    mulps    m2, m6
    mulps    m3, m7
    unpcklps m6, m1, m1
    unpckhps m7, m1, m1
    mulps    m4, m6
    mulps    m5, m7
    addps    m2, m3
    addsubps m2, m4
    addsubps m2, m5
    movsd  [lq+nq], m2
    movhps [rq+nq], m2
    add      nq, 8
    jl .loop
    REP_RET

;***************************************************************
;void ff_ps_hybrid_analysis_ileave_sse(float (*out)[32][2],
;                                      float in[2][38][64],
;                                      int i, int len);
;***************************************************************
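; Gathers columns i..63 of the two planes of in into interleaved
; (in[0][j][i], in[1][j][i]) pairs in out. A rough C equivalent
; (illustrative sketch; note that the code below overwrites the len
; argument and effectively assumes len == 32):
;
;   static void ps_hybrid_analysis_ileave(float (*out)[32][2],
;                                         float in[2][38][64],
;                                         int i, int len)
;   {
;       for (; i < 64; i++)
;           for (int j = 0; j < len; j++) {
;               out[i][j][0] = in[0][j][i];
;               out[i][j][1] = in[1][j][i];
;           }
;   }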
INIT_XMM sse
cglobal ps_hybrid_analysis_ileave, 3, 7, 5, out, in, i, len, in0, in1, tmp
    movsxdifnidn        iq, id
    mov               lend, 32 << 3
    lea                inq, [inq+iq*4]
    mov               tmpd, id
    shl               tmpd, 8
    add               outq, tmpq
    mov               tmpd, 64
    sub               tmpd, id
    mov                 id, tmpd

    test                id, 1
    jne .loop4
    test                id, 2
    jne .loop8

align 16
.loop16:
    mov               in0q, inq
    mov               in1q, 38*64*4
    add               in1q, in0q
    mov               tmpd, lend

.inner_loop16:
    movaps              m0, [in0q]
    movaps              m1, [in1q]
    movaps              m2, [in0q+lenq]
    movaps              m3, [in1q+lenq]
    TRANSPOSE4x4PS 0, 1, 2, 3, 4
    movaps          [outq], m0
    movaps     [outq+lenq], m1
    movaps   [outq+lenq*2], m2
    movaps [outq+3*32*2*4], m3
    lea               in0q, [in0q+lenq*2]
    lea               in1q, [in1q+lenq*2]
    add               outq, mmsize
    sub               tmpd, mmsize
    jg .inner_loop16
    add                inq, 16
    add               outq, 3*32*2*4
    sub                 id, 4
    jg .loop16
    RET

align 16
.loop8:
    mov               in0q, inq
    mov               in1q, 38*64*4
    add               in1q, in0q
    mov               tmpd, lend

.inner_loop8:
    movlps              m0, [in0q]
    movlps              m1, [in1q]
    movhps              m0, [in0q+lenq]
    movhps              m1, [in1q+lenq]
    SBUTTERFLYPS 0, 1, 2
    SBUTTERFLYPD 0, 1, 2
    movaps          [outq], m0
    movaps     [outq+lenq], m1
    lea               in0q, [in0q+lenq*2]
    lea               in1q, [in1q+lenq*2]
    add               outq, mmsize
    sub               tmpd, mmsize
    jg .inner_loop8
    add                inq, 8
    add               outq, lenq
    sub                 id, 2
    jg .loop16
    RET

align 16
.loop4:
    mov               in0q, inq
    mov               in1q, 38*64*4
    add               in1q, in0q
    mov               tmpd, lend

.inner_loop4:
    movss               m0, [in0q]
    movss               m1, [in1q]
    movss               m2, [in0q+lenq]
    movss               m3, [in1q+lenq]
    movlhps             m0, m1
    movlhps             m2, m3
    shufps              m0, m2, q2020
    movaps          [outq], m0
    lea               in0q, [in0q+lenq*2]
    lea               in1q, [in1q+lenq*2]
    add               outq, mmsize
    sub               tmpd, mmsize
    jg .inner_loop4
    add                inq, 4
    sub                 id, 1
    test                id, 2
    jne .loop8
    cmp                 id, 4
    jge .loop16
    RET

;*****************************************************************
;void ff_ps_hybrid_synthesis_deint_<opt>(float out[2][38][64],
;                                        float (*in)[32][2],
;                                        int i, int len);
;*****************************************************************
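; Inverse of the interleave above: scatters the (re, im) pairs back into the
; two planes of out, starting at column i. A rough C equivalent (illustrative
; sketch; the code below likewise overwrites len and assumes len == 32):
;
;   static void ps_hybrid_synthesis_deint(float out[2][38][64],
;                                         float (*in)[32][2],
;                                         int i, int len)
;   {
;       for (; i < 64; i++)
;           for (int j = 0; j < len; j++) {
;               out[0][j][i] = in[i][j][0];
;               out[1][j][i] = in[i][j][1];
;           }
;   }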
%macro HYBRID_SYNTHESIS_DEINT 0
cglobal ps_hybrid_synthesis_deint, 3, 7, 5, out, in, i, len, out0, out1, tmp
%if cpuflag(sse4)
%define MOVH movsd
%else
%define MOVH movlps
%endif
    movsxdifnidn        iq, id
    mov               lend, 32 << 3
    lea               outq, [outq+iq*4]
    mov               tmpd, id
    shl               tmpd, 8
    add                inq, tmpq
    mov               tmpd, 64
    sub               tmpd, id
    mov                 id, tmpd

    test                id, 1
    jne .loop4
    test                id, 2
    jne .loop8

align 16
.loop16:
    mov              out0q, outq
    mov              out1q, 38*64*4
    add              out1q, out0q
    mov               tmpd, lend

.inner_loop16:
    movaps              m0, [inq]
    movaps              m1, [inq+lenq]
    movaps              m2, [inq+lenq*2]
    movaps              m3, [inq+3*32*2*4]
    TRANSPOSE4x4PS 0, 1, 2, 3, 4
    movaps         [out0q], m0
    movaps         [out1q], m1
    movaps    [out0q+lenq], m2
    movaps    [out1q+lenq], m3
    lea              out0q, [out0q+lenq*2]
    lea              out1q, [out1q+lenq*2]
    add                inq, mmsize
    sub               tmpd, mmsize
    jg .inner_loop16
    add               outq, 16
    add                inq, 3*32*2*4
    sub                 id, 4
    jg .loop16
    RET

align 16
.loop8:
    mov              out0q, outq
    mov              out1q, 38*64*4
    add              out1q, out0q
    mov               tmpd, lend

.inner_loop8:
    movaps              m0, [inq]
    movaps              m1, [inq+lenq]
    SBUTTERFLYPS 0, 1, 2
    SBUTTERFLYPD 0, 1, 2
    MOVH           [out0q], m0
    MOVH           [out1q], m1
    movhps    [out0q+lenq], m0
    movhps    [out1q+lenq], m1
    lea              out0q, [out0q+lenq*2]
    lea              out1q, [out1q+lenq*2]
    add                inq, mmsize
    sub               tmpd, mmsize
    jg .inner_loop8
    add               outq, 8
    add                inq, lenq
    sub                 id, 2
    jg .loop16
    RET

align 16
.loop4:
    mov              out0q, outq
    mov              out1q, 38*64*4
    add              out1q, out0q
    mov               tmpd, lend

.inner_loop4:
    movaps              m0, [inq]
    movss          [out0q], m0
%if cpuflag(sse4)
    extractps      [out1q], m0, 1
    extractps [out0q+lenq], m0, 2
    extractps [out1q+lenq], m0, 3
%else
    movhlps             m1, m0
    movss     [out0q+lenq], m1
    shufps              m0, m0, 0xb1
    movss          [out1q], m0
    movhlps             m1, m0
    movss     [out1q+lenq], m1
%endif
    lea              out0q, [out0q+lenq*2]
    lea              out1q, [out1q+lenq*2]
    add                inq, mmsize
    sub               tmpd, mmsize
    jg .inner_loop4
    add               outq, 4
    sub                 id, 1
    test                id, 2
    jne .loop8
    cmp                 id, 4
    jge .loop16
    RET
%endmacro

INIT_XMM sse
HYBRID_SYNTHESIS_DEINT
INIT_XMM sse4
HYBRID_SYNTHESIS_DEINT

;*******************************************************************
;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2],
;                                 const float (*filter)[8][2],
;                                 ptrdiff_t stride, int n);
;*******************************************************************
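; For each of the n output bins this evaluates a 13-tap complex FIR whose
; coefficients are conjugate-symmetric around tap 6, so only taps 0..6 are
; stored and taps j and 12-j are folded together. A rough C equivalent
; (illustrative sketch only; stride is counted in complex elements):
;
;   static void ps_hybrid_analysis(float (*out)[2], float (*in)[2],
;                                  const float (*filter)[8][2],
;                                  ptrdiff_t stride, int n)
;   {
;       for (int i = 0; i < n; i++) {
;           float re = filter[i][6][0] * in[6][0];
;           float im = filter[i][6][0] * in[6][1];
;           for (int j = 0; j < 6; j++) {
;               float a_re = in[j][0],      a_im = in[j][1];
;               float b_re = in[12 - j][0], b_im = in[12 - j][1];
;               re += filter[i][j][0] * (a_re + b_re) -
;                     filter[i][j][1] * (a_im - b_im);
;               im += filter[i][j][0] * (a_im + b_im) +
;                     filter[i][j][1] * (a_re - b_re);
;           }
;           out[i * stride][0] = re;
;           out[i * stride][1] = im;
;       }
;   }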
%macro PS_HYBRID_ANALYSIS_LOOP 3
    movu     %1, [inq+mmsize*%3]
    movu     m1, [inq+mmsize*(5-%3)+8]
%if cpuflag(sse3)
    pshufd   %2, %1, q2301
    pshufd   m4, m1, q0123
    pshufd   m1, m1, q1032
    pshufd   m2, [filterq+nq+mmsize*%3], q2301
    addsubps %2, m4
    addsubps %1, m1
%else
    mova     m2, [filterq+nq+mmsize*%3]
    mova     %2, %1
    mova     m4, m1
    shufps   %2, %2, q2301
    shufps   m4, m4, q0123
    shufps   m1, m1, q1032
    shufps   m2, m2, q2301
    xorps    m4, m7
    xorps    m1, m7
    subps    %2, m4
    subps    %1, m1
%endif
    mulps    %2, m2
    mulps    %1, m2
%if %3
    addps    m3, %2
    addps    m0, %1
%endif
%endmacro

%macro PS_HYBRID_ANALYSIS 0
cglobal ps_hybrid_analysis, 5, 5, 8, out, in, filter, stride, n
%if cpuflag(sse3)
%define MOVH movsd
%else
%define MOVH movlps
%endif
    shl strideq, 3
    shl nd, 6
    add filterq, nq
    neg nq
    mova m7, [ps_p1m1p1m1]

align 16
.loop:
    PS_HYBRID_ANALYSIS_LOOP m0, m3, 0
    PS_HYBRID_ANALYSIS_LOOP m5, m6, 1
    PS_HYBRID_ANALYSIS_LOOP m5, m6, 2

%if cpuflag(sse3)
    pshufd   m3, m3, q2301
    xorps    m0, m7
    hsubps   m3, m0
    pshufd   m1, m3, q0020
    pshufd   m3, m3, q0031
    addps    m1, m3
    movsd    m2, [inq+6*8]
%else
    mova     m1, m3
    mova     m2, m0
    shufps   m1, m1, q2301
    shufps   m2, m2, q2301
    subps    m1, m3
    addps    m2, m0
    unpcklps m3, m1, m2
    unpckhps m1, m2
    addps    m1, m3
    movu     m2, [inq+6*8] ; faster than movlps and no risk of overread
%endif
    movss    m3, [filterq+nq+8*6]
    SPLATD   m3
    mulps    m2, m3
    addps    m1, m2
    MOVH [outq], m1
    add    outq, strideq
    add      nq, 64
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse
PS_HYBRID_ANALYSIS
INIT_XMM sse3
PS_HYBRID_ANALYSIS