;******************************************************************************
;* AAC Spectral Band Replication decoding functions
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; Sign-bit masks: XORing a float vector with these flips the sign of the
; selected lanes, i.e. a multiply by -1.0/1.0 without using a multiplier.
ps_mask         times 2 dd 1<<31, 0     ; negate lanes 0 and 2
ps_mask2        times 2 dd 0, 1<<31     ; negate lanes 1 and 3
ps_mask3        dd  0, 0, 0, 1<<31      ; negate lane 3 only
; Phase-sign constants used by the sbr_hf_apply_noise_* entry points.
ps_noise0       times 2 dd  1.0,  0.0
ps_noise2       times 2 dd -1.0,  0.0
ps_noise13      dd  0.0,  1.0, 0.0, -1.0
                dd  0.0, -1.0, 0.0,  1.0
                dd  0.0,  1.0, 0.0, -1.0
cextern         sbr_noise_table
cextern         ps_neg

SECTION .text

;------------------------------------------------------------------------------
; float ff_sbr_sum_square_sse(float (*x)[2], int n)
; Returns the sum of squares of 2*n floats (n complex values).
; In:  r0 = x, r1 = n.  Out: result in xmm0 (x86-64) or on the x87 stack
; (x86-32 float return convention).
; NOTE(review): trailing elements are handled two complex values (16 bytes)
; at a time, so n is presumably always even — confirm against callers.
;------------------------------------------------------------------------------
INIT_XMM sse
cglobal sbr_sum_square, 2, 3, 6
    mov        r2d, r1d
    xorps       m0, m0              ; accumulator 0
    xorps       m1, m1              ; accumulator 1 (splits the dep chain)
    sar         r2, 3               ; r2 = number of 8-complex iterations
    jz          .prepare
.loop:
    ; process 8 complex values (64 bytes) per iteration
    movu        m2, [r0 +  0]
    movu        m3, [r0 + 16]
    movu        m4, [r0 + 32]
    movu        m5, [r0 + 48]
    mulps       m2, m2
    mulps       m3, m3
    mulps       m4, m4
    mulps       m5, m5
    addps       m0, m2
    addps       m1, m3
    addps       m0, m4
    addps       m1, m5
    add         r0, 64
    dec         r2
    jnz         .loop
.prepare:
    and         r1, 7               ; remaining complex values (< 8)
    sar         r1, 1               ; each tail iteration covers 2 of them
    jz          .end
; len is a multiple of 2, thus there are at least 4 elements to process
.endloop:
    movu        m2, [r0]
    add         r0, 16
    mulps       m2, m2
    dec         r1
    addps       m0, m2
    jnz         .endloop
.end:
    ; horizontal sum of the four lanes of m0+m1 into the low scalar
    addps       m0, m1
    movhlps     m2, m0
    addps       m0, m2
    movss       m1, m0
    shufps      m0, m0, 1
    addss       m0, m1
%if ARCH_X86_64 == 0
    movss       r0m,  m0            ; x86-32: return the float via st(0)
    fld         dword r0m
%endif
    RET

; Size in bytes of one row of X_high: 40 complex floats.
%define STEP  40*4*2
;------------------------------------------------------------------------------
; void ff_sbr_hf_g_filt_sse(float (*Y)[2], const float (*X_high)[40][2],
;                           const float *g_filt, size_t m_max, size_t ixh)
; Y[m] = X_high[m][ixh] * g_filt[m] for m in [0, m_max).
; In: r0 = Y, r1 = X_high, r2 = g_filt, r3 = m_max, r4 = ixh.
;------------------------------------------------------------------------------
cglobal sbr_hf_g_filt, 5, 6, 5
    lea         r1, [r1 + 8*r4] ; offset by ixh elements into X_high
    mov         r5, r3
    and         r3, 0xFC        ; r3 = m_max rounded down to a multiple of 4
    lea         r2, [r2 + r3*4]
    lea         r0, [r0 + r3*8]
    neg         r3              ; count up towards zero
    jz          .loop1
.loop4:                         ; 4 elements per iteration
    movlps      m0, [r2 + 4*r3 + 0]   ; g_filt[m..m+1]
    movlps      m1, [r2 + 4*r3 + 8]   ; g_filt[m+2..m+3]
    movlps      m2, [r1 + 0*STEP]     ; X_high rows m and m+1, column ixh
    movlps      m3, [r1 + 2*STEP]
    movhps      m2, [r1 + 1*STEP]
    movhps      m3, [r1 + 3*STEP]
    unpcklps    m0, m0          ; duplicate each gain for re/im lanes
    unpcklps    m1, m1
    mulps       m0, m2
    mulps       m1, m3
    movu        [r0 + 8*r3 +  0], m0
    movu        [r0 + 8*r3 + 16], m1
    add         r1, 4*STEP
    add         r3, 4
    jnz         .loop4
    and         r5, 3 ; number of single element loops
    jz          .end
.loop1: ; element 0 and 1 can be computed at the same time
    movss       m0, [r2]
    movlps      m2, [r1]
    unpcklps    m0, m0
    mulps       m2, m0
    movlps    [r0], m2
    add         r0, 8
    add         r2, 4
    add         r1, STEP
    dec         r5
    jnz         .loop1
.end:
    RET

; void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
;                        const float alpha0[2], const float alpha1[2],
;                        float bw, int start, int end)
;
; Generates the high-band signal by linear prediction on X_low:
; X_high[i] = X_low[i] + a0*X_low[i-1] + a1*X_low[i-2] (complex arithmetic),
; where a0 = alpha0*bw and a1 = alpha1*bw*bw. Processes 2 complex values
; (16 bytes) per iteration; start/end are presumably even — TODO confirm.
cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
    ; load alpha factors
%define bw m0
%if ARCH_X86_64 == 0 || WIN64
    movss      bw, BWm              ; bw arrives on the stack on these ABIs
%endif
    movlps     m2, [alpha1q]
    movlps     m1, [alpha0q]
    shufps     bw, bw, 0
    mulps      m2, bw             ; (a1[0] a1[1])*bw
    mulps      m1, bw             ; (a0[0] a0[1])*bw    = (a2 a3)
    mulps      m2, bw             ; (a1[0] a1[1])*bw*bw = (a0 a1)
    mova       m3, m1
    mova       m4, m2

    ; Set pointers
%if ARCH_X86_64 == 0 || WIN64
    ; start and end 6th and 7th args on stack
    mov        r2d, Sm
    mov        r3d, Em
    DEFINE_ARGS X_high, X_low, start, end
%else
; BW does not actually occupy a register, so shift by 1
    DEFINE_ARGS X_high, X_low, alpha0, alpha1, start, end
    movsxd  startq, startd
    movsxd    endq, endd
%endif
    sub     startq, endq         ; neg num of loops
    lea    X_highq, [X_highq + endq*2*4]
    lea     X_lowq, [X_lowq  + endq*2*4 - 2*2*4]
    shl     startq, 3            ; offset from num loops

    mova        m0, [X_lowq + startq]
    ; m1/m3 = real parts of a0/a1 broadcast, m2/m4 = sign-adjusted imaginary
    ; parts, so each mulps performs half of a complex multiply.
    shufps      m3, m3, q1111
    shufps      m4, m4, q1111
    xorps       m3, [ps_mask]
    shufps      m1, m1, q0000
    shufps      m2, m2, q0000
    xorps       m4, [ps_mask]
.loop2:
    movu        m7, [X_lowq + startq + 8]       ; BbCc
    mova        m6, m0
    mova        m5, m7
    shufps      m0, m0, q2301                   ; aAbB
    shufps      m7, m7, q2301                   ; bBcC
    mulps       m0, m4
    mulps       m7, m3
    mulps       m6, m2
    mulps       m5, m1
    addps       m7, m0
    mova        m0, [X_lowq + startq + 16]      ; CcDd
    addps       m7, m0
    addps       m6, m5
    addps       m7, m6
    mova  [X_highq + startq], m7
    add     startq, 16
    jnz         .loop2
    RET

;------------------------------------------------------------------------------
; void ff_sbr_sum64x5_sse(float *z)
; In-place: z[k] += z[k+64] + z[k+128] + z[k+192] + z[k+256] for k in [0,64)
; (byte offsets 256/512/768/1024 = 64-float strides). 8 floats per iteration.
; Requires z to be 16-byte aligned (mova).
;------------------------------------------------------------------------------
cglobal sbr_sum64x5, 1,2,4,z
    lea    r1q, [zq+ 256]       ; end pointer: 64 floats processed
.loop:
    mova    m0, [zq+   0]
    mova    m2, [zq+  16]
    mova    m1, [zq+ 256]
    mova    m3, [zq+ 272]
    addps   m0, [zq+ 512]
    addps   m2, [zq+ 528]
    addps   m1, [zq+ 768]
    addps   m3, [zq+ 784]
    addps   m0, [zq+1024]
    addps   m2, [zq+1040]
    addps   m0, m1
    addps   m2, m3
    mova  [zq], m0
    mova  [zq+16], m2
    add     zq, 32
    cmp     zq, r1q
    jne  .loop
    REP_RET

;------------------------------------------------------------------------------
; void ff_sbr_qmf_post_shuffle_sse(float W[32][2], const float *z)
; Packs W[k] = { -z[63-k], z[k] } for k in [0,32): the second half of z is
; negated and reversed, then interleaved with the first half.
;------------------------------------------------------------------------------
INIT_XMM sse
cglobal sbr_qmf_post_shuffle, 2,3,4,W,z
    lea              r2q, [zq + (64-4)*4]   ; r2 walks z backwards from z[60]
    mova              m3, [ps_neg]
.loop:
    mova              m1, [zq]
    xorps             m0, m3, [r2q]         ; negate 4 floats from the tail
    shufps            m0, m0, m0, q0123     ; reverse their order
    unpcklps          m2, m0, m1            ; interleave (-z[63-k], z[k]) pairs
    unpckhps          m0, m0, m1
    mova       [Wq +  0], m2
    mova       [Wq + 16], m0
    add               Wq, 32
    sub              r2q, 16
    add               zq, 16
    cmp               zq, r2q               ; pointers meet in the middle
    jl             .loop
    REP_RET

;------------------------------------------------------------------------------
; void ff_sbr_neg_odd_64_sse(float *z)
; Negates every odd-indexed float of z[0..63] in place (ps_mask2 flips the
; sign bit of lanes 1 and 3). 16 floats per iteration; z must be aligned.
;------------------------------------------------------------------------------
INIT_XMM sse
cglobal sbr_neg_odd_64, 1,2,4,z
    lea        r1q, [zq+256]    ; end pointer: 64 floats
.loop:
    mova        m0, [zq+ 0]
    mova        m1, [zq+16]
    mova        m2, [zq+32]
    mova        m3, [zq+48]
    xorps       m0, [ps_mask2]
    xorps       m1, [ps_mask2]
    xorps       m2, [ps_mask2]
    xorps       m3, [ps_mask2]
    mova   [zq+ 0], m0
    mova   [zq+16], m1
    mova   [zq+32], m2
    mova   [zq+48], m3
    add         zq, 64
    cmp         zq, r1q
    jne      .loop
    REP_RET

; void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1)
; Butterfly/deinterleave: walks src0 backwards and src1 forwards, writing
; src0-reversed(src1) into v[0..63] and src1+reversed(src0) into v[64..127].
; The sse variant uses shufps; sse2 uses pshufd for the 4-lane reversal.
%macro SBR_QMF_DEINT_BFLY  0
cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
    mov               cq, 64*4-2*mmsize   ; byte offset, counts down
    lea            vrevq, [vq + 64*4]     ; second half of v
.loop:
    mova              m0, [src0q+cq]
    mova              m1, [src1q]
    mova              m4, [src0q+cq+mmsize]
    mova              m5, [src1q+mmsize]
%if cpuflag(sse2)
    pshufd            m2, m0, q0123       ; reverse the 4 floats
    pshufd            m3, m1, q0123
    pshufd            m6, m4, q0123
    pshufd            m7, m5, q0123
%else
    shufps            m2, m0, m0, q0123
    shufps            m3, m1, m1, q0123
    shufps            m6, m4, m4, q0123
    shufps            m7, m5, m5, q0123
%endif
    addps             m5, m2
    subps             m0, m7
    addps             m1, m6
    subps             m4, m3
    mova         [vrevq], m1
    mova  [vrevq+mmsize], m5
    mova         [vq+cq], m0
    mova  [vq+cq+mmsize], m4
    add            src1q, 2*mmsize
    add            vrevq, 2*mmsize
    sub               cq, 2*mmsize
    jge            .loop
    REP_RET
%endmacro

INIT_XMM sse
SBR_QMF_DEINT_BFLY

INIT_XMM sse2
SBR_QMF_DEINT_BFLY

;------------------------------------------------------------------------------
; void ff_sbr_qmf_pre_shuffle_sse2(float *z)
; Builds the 128-entry buffer at z[64..191] by interleaving the negated,
; reversed run z[33..64] with z[1..32]; finally copies z[0..1] to z[64..65].
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal sbr_qmf_pre_shuffle, 1,4,6,z
%define OFFSET  (32*4-2*mmsize)
    mov       r3q, OFFSET           ; byte offset into the forward run
    lea       r1q, [zq + (32+1)*4]  ; r1 walks z[33..] forwards
    lea       r2q, [zq + 64*4]      ; output base: z[64]
    mova       m5, [ps_neg]
.loop:
    movu       m0, [r1q]
    movu       m2, [r1q + mmsize]
    movu       m1, [zq + r3q + 4 + mmsize]
    movu       m3, [zq + r3q + 4]

    pxor       m2, m5               ; negate the high-half samples
    pxor       m0, m5
    pshufd     m2, m2, q0123        ; and reverse them
    pshufd     m0, m0, q0123
    SBUTTERFLY dq, 2, 3, 4          ; interleave with the forward samples
    SBUTTERFLY dq, 0, 1, 4
    mova  [r2q + 2*r3q + 0*mmsize], m2
    mova  [r2q + 2*r3q + 1*mmsize], m3
    mova  [r2q + 2*r3q + 2*mmsize], m0
    mova  [r2q + 2*r3q + 3*mmsize], m1
    add       r1q, 2*mmsize
    sub       r3q, 2*mmsize
    jge      .loop
    movq       m2, [zq]             ; z[64..65] = z[0..1]
    movq    [r2q], m2
    REP_RET

; PIC handling for the sbr_hf_apply_noise_* functions: when position-
; independent code is required, sbr_noise_table cannot be addressed
; absolutely, so one extra GPR (NREGS) holds its address.
%ifdef PIC
%define NREGS 1
%if UNIX64
%define NOISE_TABLE r6q ; r5q is m_max
%else
%define NOISE_TABLE r5q
%endif
%else
%define NREGS 0
%define NOISE_TABLE sbr_noise_table
%endif

; LOAD_NST table_label
; Loads m0 = 16 bytes at table_label + kxq, via a register in PIC builds.
%macro LOAD_NST  1
%ifdef PIC
    lea  NOISE_TABLE, [%1]
    mova          m0, [kxq + NOISE_TABLE]
%else
    mova          m0, [kxq + %1]
%endif
%endmacro

INIT_XMM sse2
; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
; The four entry points differ only in the phase-sign constant loaded into
; m0 before falling into the shared apply_noise_main loop.
cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    mova       m0, [ps_noise0]
    jmp apply_noise_main

; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    and       kxq, 1                ; select ps_noise13 row by kx parity
    shl       kxq, 4                ; 16 bytes per row
    LOAD_NST  ps_noise13
    jmp apply_noise_main

; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    mova       m0, [ps_noise2]
    jmp apply_noise_main

; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
;                      const float *q_filt, int noise,
;                      int kx, int m_max)
cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
    and       kxq, 1
    shl       kxq, 4
    LOAD_NST  ps_noise13+16

; Shared tail: for each m, Y[m] += s_m[m] * phi_sign, and where s_m[m] == 0
; additionally Y[m] += q_filt[m] * sbr_noise_table[noise], with the noise
; index wrapping modulo 512 entries.
apply_noise_main:
%if ARCH_X86_64 == 0 || WIN64
    mov       kxd, m_maxm           ; m_max arrives on the stack; reuse kx reg
    DEFINE_ARGS Y, s_m, q_filt, noise, count
%else
    DEFINE_ARGS Y, s_m, q_filt, noise, kx, count
%endif
    movsxdifnidn    noiseq, noised
    dec    noiseq
    shl    countd, 2                ; count = m_max in bytes of floats
%ifdef PIC
    lea NOISE_TABLE, [sbr_noise_table]
%endif
    lea        Yq, [Yq + 2*countq]
    add      s_mq, countq
    add   q_filtq, countq
    shl    noiseq, 3                ; 8 bytes per complex table entry
    pxor       m5, m5
    neg    countq                   ; count up towards zero
.loop:
    mova       m1, [q_filtq + countq]
    movu       m3, [noiseq + NOISE_TABLE + 1*mmsize]
    movu       m4, [noiseq + NOISE_TABLE + 2*mmsize]
    add    noiseq, 2*mmsize
    and    noiseq, 0x1ff<<3         ; wrap the noise index (512 entries)
    punpckhdq  m2, m1, m1
    punpckldq  m1, m1
    mulps      m1, m3 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
    mulps      m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
    mova       m3, [s_mq + countq]
    ; TODO: replace by a vpermd in AVX2
    punpckhdq  m4, m3, m3
    punpckldq  m3, m3
    pcmpeqd    m6, m3, m5 ; m6 == 0
    pcmpeqd    m7, m4, m5 ; m7 == 0
    mulps      m3, m0 ; s_m[m] * phi_sign
    mulps      m4, m0 ; s_m[m] * phi_sign
    pand       m1, m6               ; keep noise term only where s_m[m] == 0
    pand       m2, m7
    movu       m6, [Yq + 2*countq]
    movu       m7, [Yq + 2*countq + mmsize]
    addps      m3, m1
    addps      m4, m2
    addps      m6, m3
    addps      m7, m4
    movu    [Yq + 2*countq], m6
    movu    [Yq + 2*countq + mmsize], m7
    add    countq, mmsize
    jl      .loop
    RET

;------------------------------------------------------------------------------
; void ff_sbr_qmf_deint_neg_sse(float *v, const float *src)
; Deinterleaves 64 complex values of src: the odd-position floats go
; (reversed) into v[32..63], the negated even-position floats into v[0..31].
;------------------------------------------------------------------------------
INIT_XMM sse
cglobal sbr_qmf_deint_neg, 2,4,4,v,src,vrev,c
%define COUNT  32*4
%define OFFSET 32*4
    mov        cq, -COUNT           ; count up towards zero
    lea     vrevq, [vq + OFFSET + COUNT]
    add        vq, OFFSET-mmsize    ; v walks backwards
    add      srcq, 2*COUNT
    mova       m3, [ps_neg]
.loop:
    mova       m0, [srcq + 2*cq + 0*mmsize]
    mova       m1, [srcq + 2*cq + 1*mmsize]
    shufps     m2, m0, m1, q2020    ; even-position floats
    shufps     m1, m0, q1313        ; odd-position floats, reversed
    xorps      m2, m3               ; negate the even-position floats
    mova     [vq], m1
    mova  [vrevq + cq], m2
    sub        vq, mmsize
    add        cq, mmsize
    jl      .loop
    REP_RET

; void ff_sbr_autocorrelate_{sse,sse3}(const float x[40][2], float phi[3][2][2])
; Accumulates, over the 38-sample window, the lag-0/1/2 complex
; autocorrelation products of x (see the per-line comments for the exact
; lane layout), then writes the combined sums into phi.
; Uses 32 bytes of stack scratch to hold the first iteration's partial
; sums, which are added back in after the main loop.
%macro SBR_AUTOCORRELATE 0
cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt
    mov   cntq, 37*8                ; iterate from x[1] up to x[38]
    add     xq, cntq
    neg   cntq

%if cpuflag(sse3)
%define   MOVH  movsd
    movddup m5, [xq+cntq]           ; broadcast x[0] into both halves
%else
%define   MOVH  movlps
    movlps  m5, [xq+cntq]
    movlhps m5, m5
%endif
    MOVH    m7, [xq+cntq+8 ]
    MOVH    m1, [xq+cntq+16]
    shufps  m7, m7, q0110           ; (re, im, im, re) layout for cross terms
    shufps  m1, m1, q0110
    mulps   m3, m5, m7   ;              x[0][0] * x[1][0], x[0][1] * x[1][1], x[0][0] * x[1][1], x[0][1] * x[1][0]
    mulps   m4, m5, m5   ;              x[0][0] * x[0][0], x[0][1] * x[0][1];
    mulps   m5, m1       ; real_sum2  = x[0][0] * x[2][0], x[0][1] * x[2][1]; imag_sum2 = x[0][0] * x[2][1], x[0][1] * x[2][0]
    movaps  [rsp   ], m3            ; stash x[0] terms; re-added after .loop
    movaps  [rsp+16], m4
    add   cntq, 8

    MOVH    m2, [xq+cntq+16]
    movlhps m7, m7
    shufps  m2, m2, q0110
    mulps   m6, m7, m1   ; real_sum1  = x[1][0] * x[2][0], x[1][1] * x[2][1]; imag_sum1 += x[1][0] * x[2][1], x[1][1] * x[2][0]
    mulps   m4, m7, m2
    mulps   m7, m7       ; real_sum0  = x[1][0] * x[1][0], x[1][1] * x[1][1];
    addps   m5, m4       ; real_sum2 += x[1][0] * x[3][0], x[1][1] * x[3][1]; imag_sum2 += x[1][0] * x[3][1], x[1][1] * x[3][0]

; main loop, unrolled 3x so that m0/m1/m2 rotate through the roles of
; x[i], x[i+1], x[i+2] without extra register moves
align 16
.loop:
    add   cntq, 8
    MOVH    m0, [xq+cntq+16]
    movlhps m1, m1
    shufps  m0, m0, q0110
    mulps   m3, m1, m2
    mulps   m4, m1, m0
    mulps   m1, m1
    addps   m6, m3       ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
    addps   m5, m4       ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
    addps   m7, m1       ; real_sum0 += x[i][0] * x[i][0],     x[i][1] * x[i][1];
    add   cntq, 8
    MOVH    m1, [xq+cntq+16]
    movlhps m2, m2
    shufps  m1, m1, q0110
    mulps   m3, m2, m0
    mulps   m4, m2, m1
    mulps   m2, m2
    addps   m6, m3       ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
    addps   m5, m4       ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
    addps   m7, m2       ; real_sum0 += x[i][0] * x[i][0],     x[i][1] * x[i][1];
    add   cntq, 8
    MOVH    m2, [xq+cntq+16]
    movlhps m0, m0
    shufps  m2, m2, q0110
    mulps   m3, m0, m1
    mulps   m4, m0, m2
    mulps   m0, m0
    addps   m6, m3       ; real_sum1 += x[i][0] * x[i + 1][0], x[i][1] * x[i + 1][1]; imag_sum1 += x[i][0] * x[i + 1][1], x[i][1] * x[i + 1][0];
    addps   m5, m4       ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
    addps   m7, m0       ; real_sum0 += x[i][0] * x[i][0],     x[i][1] * x[i][1];
    jl .loop

    ; final element and recombination with the stashed x[0] terms
    movlhps m1, m1
    mulps   m2, m1
    mulps   m1, m1
    addps   m2, m6       ; real_sum1 + x[38][0] * x[39][0], x[38][1] * x[39][1]; imag_sum1 + x[38][0] * x[39][1], x[38][1] * x[39][0];
    addps   m1, m7       ; real_sum0 + x[38][0] * x[38][0], x[38][1] * x[38][1];
    addps   m6, [rsp   ] ; real_sum1 + x[ 0][0] * x[ 1][0], x[ 0][1] * x[ 1][1]; imag_sum1 + x[ 0][0] * x[ 1][1], x[ 0][1] * x[ 1][0];
    addps   m7, [rsp+16] ; real_sum0 + x[ 0][0] * x[ 0][0], x[ 0][1] * x[ 0][1];

    xorps   m2, [ps_mask3]          ; conjugate: negate the last lane
    xorps   m5, [ps_mask3]
    xorps   m6, [ps_mask3]
    HADDPS  m2, m5, m3              ; horizontal pairwise sums
    HADDPS  m7, m6, m4
%if cpuflag(sse3)
    movshdup m0, m1
%else
    movss   m0, m1
    shufps  m1, m1, q0001
%endif
    addss   m1, m0
    movaps  [phiq     ], m2
    movhps  [phiq+0x18], m7
    movss   [phiq+0x28], m7
    movss   [phiq+0x10], m1
    RET
%endmacro

INIT_XMM sse
SBR_AUTOCORRELATE
INIT_XMM sse3
SBR_AUTOCORRELATE
