1;******************************************************************************
2;* FLAC DSP SIMD optimizations
3;*
4;* Copyright (C) 2014 Loren Merritt
5;* Copyright (C) 2014 James Almer
6;*
7;* This file is part of FFmpeg.
8;*
9;* FFmpeg is free software; you can redistribute it and/or
10;* modify it under the terms of the GNU Lesser General Public
11;* License as published by the Free Software Foundation; either
12;* version 2.1 of the License, or (at your option) any later version.
13;*
14;* FFmpeg is distributed in the hope that it will be useful,
15;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17;* Lesser General Public License for more details.
18;*
19;* You should have received a copy of the GNU Lesser General Public
20;* License along with FFmpeg; if not, write to the Free Software
21;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22;******************************************************************************
23
24%include "libavutil/x86/x86util.asm"
25
26SECTION .text
27
; PMACSDQL acc, a, b, x, y
; Multiply-accumulate helper: acc += a * b, using the dword->qword multiply
; instructions below (pmacsdql with XOP, pmuldq + paddq otherwise).
;   %1 = accumulator register (read and written)
;   %2 = first multiplicand; overwritten with the product on the non-XOP path
;   %3 = second multiplicand (only read)
;   %4, %5 = accepted for the 5-argument signature but never referenced in
;            the body; callers in this file repeat %1 and %2 there.
28%macro PMACSDQL 5
29%if cpuflag(xop)
    ; XOP path: one fused instruction, the multiplicands are left untouched.
30    pmacsdql %1, %2, %3, %1
31%else
    ; Fallback path: the product lands in the first multiplicand, which is
    ; therefore destroyed and must be reloaded by the caller before reuse.
32    pmuldq   %2, %3
33    paddq    %1, %2
34%endif
35%endmacro
36
; LPC_32 cpu
; Emits flac_lpc_32 for the instruction set named by the macro argument.
; Named arguments (see cglobal): decoded, coeffs, pred_order, qlevel, len;
; j is a scratch index register.
;
; For each sample index i from pred_order up to len-1 the code computes
;   sum = coeffs[0]*decoded[i-pred_order] + ... + coeffs[pred_order-1]*decoded[i-1]
; in a 64-bit lane, shifts it right by qlevel (psrlq) and adds the low dword
; of the result to decoded[i] in place.
; Two consecutive samples are handled per outer iteration: m2 accumulates the
; first one, m3 the second one, sharing the coefficient and history loads.
37%macro LPC_32 1
38INIT_XMM %1
39cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j
    ; len -= pred_order; leave immediately when nothing remains to predict.
40    sub    lend, pred_orderd
41    jle .ret
    ; decoded -> 8 bytes before sample number pred_order (the loop adds 8 on
    ; entry); coeffs -> one past the last coefficient. pred_order is negated
    ; so that pointer + pred_order*4 addresses the oldest history sample and
    ; the first coefficient, and indices count upwards towards zero.
42    lea    decodedq, [decodedq+pred_orderq*4-8]
43    lea    coeffsq, [coeffsq+pred_orderq*4]
44    neg    pred_orderq
    ; m4 = shift count for the psrlq instructions below.
45    movd   m4, qlevelm
46ALIGN 16
47.loop_sample:
    ; m0 = oldest history sample for the first output of this pair
    ; (loaded before decodedq is advanced), m1 = first coefficient.
48    movd   m0, [decodedq+pred_orderq*4+8]
49    add    decodedq, 8
50    movd   m1, [coeffsq+pred_orderq*4]
    ; Clear both accumulators.
51    pxor   m2, m2
52    pxor   m3, m3
    ; j = 1 - pred_order; with pred_order == 1 the inner loop is skipped.
53    lea    jq, [pred_orderq+1]
54    test   jq, jq
55    jz .end_order
56.loop_order:
    ; First sample: m2 += history * coefficient. On the non-XOP path this
    ; destroys m0, which is reloaded right after with the next history sample.
57    PMACSDQL m2, m0, m1, m2, m0
58    movd   m0, [decodedq+jq*4]
    ; Second sample: same coefficient (m1) times the history shifted by one.
    ; m1 may be destroyed here and is reloaded with the next coefficient.
59    PMACSDQL m3, m1, m0, m3, m1
60    movd   m1, [coeffsq+jq*4]
    ; Loop while j is still negative.
61    inc    jq
62    jl .loop_order
63.end_order:
    ; Last term of the first sample, then shift and add into decoded[i].
64    PMACSDQL m2, m0, m1, m2, m0
65    psrlq  m2, m4
66    movd   m0, [decodedq]
67    paddd  m0, m2
68    movd   [decodedq], m0
    ; len -= 2. A negative result means only one sample was left, so return.
    ; The flags set here are also what the jg at the bottom of the loop
    ; tests: none of the SIMD instructions in between modify EFLAGS.
69    sub  lend, 2
70    jl .ret
    ; Last term of the second sample: last coefficient (m1) times the sample
    ; just produced (m0), then shift and add into decoded[i+1].
71    PMACSDQL m3, m1, m0, m3, m1
72    psrlq  m3, m4
73    movd   m1, [decodedq+4]
74    paddd  m1, m3
75    movd   [decodedq+4], m1
    ; Continue while samples remain (flags from the sub above).
76    jg .loop_sample
77.ret:
78    REP_RET
79%endmacro
80
; Instantiate flac_lpc_32: an XOP version only when HAVE_XOP_EXTERNAL is
; non-zero (presumably set by the build configuration when the assembler
; supports XOP -- confirm), and an SSE4 version unconditionally.
81%if HAVE_XOP_EXTERNAL
82LPC_32 xop
83%endif
84LPC_32 sse4
85
86;----------------------------------------------------------------------------------
87;void ff_flac_decorrelate_[lrm]s_16_sse2(uint8_t **out, int32_t **in, int channels,
88;                                        int len, int shift);
89;----------------------------------------------------------------------------------
; FLAC_DECORRELATE_16 name, first, second [, op]
;   %1 = suffix placed in the function name; this file uses ls, rs, ms and
;        also indep2 (plain two-channel interleave, no arithmetic)
;   %2 = number of the xmm register stored first in each output pair
;   %3 = number of the xmm register stored second in each output pair
;   %4 = add or sub, pasted into p<op>d to build m2; omitted for indep2
; Reads 32-bit samples from in[0] and in[1], packs them to 16 bits with signed
; saturation, interleaves the two streams, shifts each word left by the shift
; argument and writes the result to out[0]. Each iteration handles 4 samples
; per channel with aligned (mova) loads and stores.
90%macro FLAC_DECORRELATE_16 3-4
91cglobal flac_decorrelate_%1_16, 2, 4, 4, out, in0, in1, len
    ; On x86_32 len is fetched from its stack slot explicitly.
92%if ARCH_X86_32
93    mov      lend, lenm
94%endif
    ; m3 = shift count (fifth argument); len is converted to a byte count
    ; (4 bytes per 32-bit input sample).
95    movd       m3, r4m
96    shl      lend, 2
    ; Dereference the pointer arrays: in1 = in[1], in0 = in[0], out = out[0].
97    mov      in1q, [in0q + gprsize]
98    mov      in0q, [in0q]
99    mov      outq, [outq]
    ; Move all three pointers to the end of their data and negate len, so
    ; that len becomes a negative byte offset counting up towards zero.
    ; The same offset serves the output: two 16-bit words per sample pair
    ; occupy the same 4 bytes as one 32-bit input sample.
100    add      in1q, lenq
101    add      in0q, lenq
102    add      outq, lenq
103    neg      lenq
104
105align 16
106.loop:
    ; Four samples from each input channel.
107    mova       m0, [in0q + lenq]
108    mova       m1, [in1q + lenq]
    ; ms only: m0 -= m1 >> 1 (arithmetic shift).
109%ifidn %1, ms
110    psrad      m2, m1, 1
111    psubd      m0, m2
112%endif
    ; All variants except indep2: m2 = m0 <op> m1, written in three-operand
    ; form (presumably expanded by the x86inc wrappers on non-AVX targets).
113%ifnidn %1, indep2
114    p%4d       m2, m0, m1
115%endif
    ; Narrow both selected registers to 16 bits with signed saturation,
    ; interleave their low four words, then apply the left shift.
116    packssdw  m%2, m%2
117    packssdw  m%3, m%3
118    punpcklwd m%2, m%3
119    psllw     m%2, m3
120    mova [outq + lenq], m%2
    ; Advance 16 bytes and loop while the offset is still negative.
121    add      lenq, 16
122    jl .loop
123    REP_RET
124%endmacro
125
; SSE2 16-bit stereo variants. Arguments: name, register stored first,
; register stored second, operation used to build m2 from m0 and m1.
;   ls: outputs m0 and m2 = m0 - m1
;   rs: outputs m2 = m0 + m1 and m1
;   ms: m0 is first reduced by m1 >> 1; outputs m2 = m0 + m1 and m0
126INIT_XMM sse2
127FLAC_DECORRELATE_16 ls, 0, 2, sub
128FLAC_DECORRELATE_16 rs, 2, 1, add
129FLAC_DECORRELATE_16 ms, 2, 0, add
130
131;----------------------------------------------------------------------------------
132;void ff_flac_decorrelate_[lrm]s_32_sse2(uint8_t **out, int32_t **in, int channels,
133;                                        int len, int shift);
134;----------------------------------------------------------------------------------
; FLAC_DECORRELATE_32 name, first, second, tmp, op
;   %1 = suffix placed in the function name (ls, rs or ms in this file)
;   %2 = number of the xmm register stored at the lower output address
;   %3 = number of the xmm register stored at the higher output address
;   %4 = number of the scratch register handed to SBUTTERFLY
;   %5 = add or sub, pasted into p<op>d to build m2
; Same arithmetic as the 16-bit variant, but samples stay 32 bits wide: both
; selected registers are shifted left by the shift argument, combined with
; SBUTTERFLY dq and written as two registers (8 dwords) per iteration.
135%macro FLAC_DECORRELATE_32 5
136cglobal flac_decorrelate_%1_32, 2, 4, 4, out, in0, in1, len
    ; On x86_32 len is fetched from its stack slot explicitly.
137%if ARCH_X86_32
138    mov      lend, lenm
139%endif
    ; m3 = shift count (fifth argument).
140    movd       m3, r4m
    ; Dereference the pointer arrays, then turn in1 into the byte distance
    ; from in0 so that advancing in0 alone moves both read positions.
141    mov      in1q, [in0q + gprsize]
142    mov      in0q, [in0q]
143    mov      outq, [outq]
144    sub      in1q, in0q
145
146align 16
147.loop:
    ; Four samples from each input channel.
148    mova       m0, [in0q]
149    mova       m1, [in0q + in1q]
    ; ms only: m0 -= m1 >> 1 (arithmetic shift).
150%ifidn %1, ms
151    psrad      m2, m1, 1
152    psubd      m0, m2
153%endif
    ; m2 = m0 <op> m1, then shift both selected registers left.
154    p%5d       m2, m0, m1
155    pslld     m%2, m3
156    pslld     m%3, m3
157
    ; SBUTTERFLY is defined outside this file (x86util.asm); presumably it
    ; interleaves the dwords of the two registers, leaving the low half in
    ; the first and the high half in the second -- confirm against x86util.
158    SBUTTERFLY dq, %2, %3, %4
159
160    mova  [outq         ], m%2
161    mova  [outq + mmsize], m%3
162
    ; One register of input per channel consumed, two registers of output
    ; produced, mmsize/4 samples done; loop while len stays positive.
163    add      in0q, mmsize
164    add      outq, mmsize*2
165    sub      lend, mmsize/4
166    jg .loop
167    REP_RET
168%endmacro
169
; SSE2 32-bit stereo variants. Arguments: name, first output register,
; second output register, SBUTTERFLY scratch register, operation for m2.
; The scratch register is in each case the one of m0..m2 that is not an output.
170INIT_XMM sse2
171FLAC_DECORRELATE_32 ls, 0, 2, 1, sub
172FLAC_DECORRELATE_32 rs, 2, 1, 0, add
173FLAC_DECORRELATE_32 ms, 2, 0, 1, add
174
175;-----------------------------------------------------------------------------------------
176;void ff_flac_decorrelate_indep<ch>_<bps>_<opt>(uint8_t **out, int32_t **in, int channels,
177;                                            int len, int shift);
178;-----------------------------------------------------------------------------------------
179;%1 = bps
180;%2 = channels
181;%3 = last xmm reg used
182;%4 = word/dword (shift instruction)
; Generic multi-channel variant without inter-channel arithmetic: each
; iteration loads one register (mmsize/4 samples) from every input channel,
; rearranges the data with the transposes/shuffles below, shifts it left by
; the shift argument and stores REPCOUNT consecutive registers to out[0].
; m%3 (the last xmm register) holds the shift count; the registers between
; the data registers and m%3 serve as scratch for the shuffles.
183%macro FLAC_DECORRELATE_INDEP 4
    ; REPCOUNT = number of xmm registers loaded, shifted and stored per pass.
184%define REPCOUNT %2/(32/%1) ; 16bits = channels / 2; 32bits = channels
185cglobal flac_decorrelate_indep%2_%1, 2, %2+2, %3+1, out, in0, in1, len, in2, in3, in4, in5, in6, in7
186%if ARCH_X86_32
187%if %2 == 6
    ; Six channels on x86_32: the argument names are redefined without len
    ; (out plus six input pointers take seven names) and len is used straight
    ; from its stack slot instead of a register.
188    DEFINE_ARGS out, in0, in1, in2, in3, in4, in5
189    %define  lend  dword r3m
190%else
191    mov      lend, lenm
192%endif
193%endif
    ; Shift count (fifth argument) goes into the last xmm register.
194    movd      m%3, r4m
195
    ; Load in[1] .. in[channels-1] while in0 still points at the pointer array.
196%assign %%i 1
197%rep %2-1
198    mov      in %+ %%i %+ q, [in0q+%%i*gprsize]
199%assign %%i %%i+1
200%endrep
201
202    mov      in0q, [in0q]
203    mov      outq, [outq]
204
    ; Convert the other channel pointers into byte offsets relative to in0,
    ; so only in0 has to be advanced inside the loop.
205%assign %%i 1
206%rep %2-1
207    sub      in %+ %%i %+ q, in0q
208%assign %%i %%i+1
209%endrep
210
211align 16
212.loop:
213    mova       m0, [in0q]
214
    ; Load channels 1 .. REPCOUNT-1 into m1 .. m(REPCOUNT-1). For 32 bits
    ; that is every channel; for 16 bits only the first half, the second half
    ; is merged in by the packssdw memory operands further down.
215%assign %%i 1
216%rep REPCOUNT-1
217    mova     m %+ %%i, [in0q + in %+ %%i %+ q]
218%assign %%i %%i+1
219%endrep
220
221%if %1 == 32
222
    ; 32-bit output: reorder the per-channel registers so that they can be
    ; stored back to back. The TRANSPOSE/SBUTTERFLY helpers are defined
    ; outside this file; the last argument of each is a scratch register.
223%if %2 == 8
224    TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8
225%elif %2 == 6
    ; Hand-written shuffle sequence for six channels, m6 used as scratch;
    ; the final SWAP renumbers the registers into store order.
226    SBUTTERFLY dq, 0, 1, 6
227    SBUTTERFLY dq, 2, 3, 6
228    SBUTTERFLY dq, 4, 5, 6
229
230    punpcklqdq m6, m0, m2
231    punpckhqdq m2, m4
232    shufps     m4, m0, 0xe4
233    punpcklqdq m0, m1, m3
234    punpckhqdq m3, m5
235    shufps     m5, m1, 0xe4
236    SWAP 0,6,1,4,5,3
237%elif %2 == 4
238    TRANSPOSE4x4D 0, 1, 2, 3, 4
239%else ; %2 == 2
240    SBUTTERFLY dq, 0, 1, 2
241%endif
242
243%else ; %1 == 16
244
    ; 16-bit output: packssdw narrows to words with signed saturation and at
    ; the same time pulls in channel i+REPCOUNT from memory, so register i
    ; ends up holding channel i in its low half and channel i+REPCOUNT in
    ; its high half before the reordering step.
245%if %2 == 8
246    packssdw   m0, [in0q + in4q]
247    packssdw   m1, [in0q + in5q]
248    packssdw   m2, [in0q + in6q]
249    packssdw   m3, [in0q + in7q]
250    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
251%elif %2 == 6
252    packssdw   m0, [in0q + in3q]
253    packssdw   m1, [in0q + in4q]
254    packssdw   m2, [in0q + in5q]
    ; Hand-written shuffle sequence for six channels, m3 used as scratch;
    ; the final SWAP renumbers the registers into store order.
255    pshufd     m3, m0,     q1032
256    punpcklwd  m0, m1
257    punpckhwd  m1, m2
258    punpcklwd  m2, m3
259
260    shufps     m3, m0, m2, q2020
261    shufps     m0, m1,     q2031
262    shufps     m2, m1,     q3131
263    shufps     m1, m2, m3, q3120
264    shufps     m3, m0,     q0220
265    shufps     m0, m2,     q3113
266    SWAP 2, 0, 3
267%else ; %2 == 4
268    packssdw   m0, [in0q + in2q]
269    packssdw   m1, [in0q + in3q]
270    SBUTTERFLY wd, 0, 1, 2
271    SBUTTERFLY dq, 0, 1, 2
272%endif
273
274%endif
275
    ; Apply the left shift (psllw or pslld, chosen by the fourth macro
    ; argument) to every output register.
276%assign %%i 0
277%rep REPCOUNT
278    psll%4   m %+ %%i, m%3
279%assign %%i %%i+1
280%endrep
281
    ; Store the REPCOUNT registers to consecutive output addresses.
282%assign %%i 0
283%rep REPCOUNT
284    mova [outq + %%i*mmsize], m %+ %%i
285%assign %%i %%i+1
286%endrep
287
    ; Advance the shared input position and the output pointer; mmsize/4
    ; samples per channel are done. Loop while len stays positive.
288    add      in0q, mmsize
289    add      outq, mmsize*REPCOUNT
290    sub      lend, mmsize/4
291    jg .loop
292    REP_RET
293%endmacro
294
; SSE2 instantiations. Arguments of FLAC_DECORRELATE_INDEP: bits per sample,
; channel count, number of the last xmm register (it holds the shift count),
; shift-instruction suffix (w for 16 bits, d for 32 bits).
; The eight-channel versions are emitted on x86_64 only.
295INIT_XMM sse2
296FLAC_DECORRELATE_16 indep2, 0, 1 ; Reuse stereo 16bits macro
297FLAC_DECORRELATE_INDEP 32, 2, 3, d
298FLAC_DECORRELATE_INDEP 16, 4, 3, w
299FLAC_DECORRELATE_INDEP 32, 4, 5, d
300FLAC_DECORRELATE_INDEP 16, 6, 4, w
301FLAC_DECORRELATE_INDEP 32, 6, 7, d
302%if ARCH_X86_64
303FLAC_DECORRELATE_INDEP 16, 8, 5, w
304FLAC_DECORRELATE_INDEP 32, 8, 9, d
305%endif
306
; AVX instantiations: only this subset of the variants above is emitted.
307INIT_XMM avx
308FLAC_DECORRELATE_INDEP 32, 4, 5, d
309FLAC_DECORRELATE_INDEP 32, 6, 7, d
310%if ARCH_X86_64
311FLAC_DECORRELATE_INDEP 16, 8, 5, w
312FLAC_DECORRELATE_INDEP 32, 8, 9, d
313%endif
314