1;******************************************************************************
2;* ALAC DSP SIMD optimizations
3;*
4;* Copyright (C) 2015 James Almer
5;*
6;* This file is part of FFmpeg.
7;*
8;* FFmpeg is free software; you can redistribute it and/or
9;* modify it under the terms of the GNU Lesser General Public
10;* License as published by the Free Software Foundation; either
11;* version 2.1 of the License, or (at your option) any later version.
12;*
13;* FFmpeg is distributed in the hope that it will be useful,
14;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16;* Lesser General Public License for more details.
17;*
18;* You should have received a copy of the GNU Lesser General Public
19;* License along with FFmpeg; if not, write to the Free Software
20;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21;******************************************************************************
22
23%include "libavutil/x86/x86util.asm"
24
25SECTION .text
26
;------------------------------------------------------------------------------
; alac_decorrelate_stereo(buf0, len, shift, weight)
;
; The first argument points at two consecutive pointers, one per channel
; buffer. len is an element count; it is scaled by 4 below, so the elements
; are dwords. For every dword pair a (channel 0) / b (channel 1):
;
;     a -= (low 32 bits of b * weight) >> shift     ; arithmetic shift
;     b += a
;
; after which channel 0 is overwritten with b and channel 1 with a.
;
; cglobal numbers: 2 arguments preloaded into registers, 5 (x86-64) or
; 3 (x86-32) GPRs, 8 vector registers.
;
; NOTE(review): the loop uses mova, runs at least once and consumes 2*mmsize
; bytes per pass, so callers presumably supply aligned buffers padded to a
; multiple of 8 samples - confirm against the C caller.
;------------------------------------------------------------------------------
INIT_XMM sse4
%if ARCH_X86_64
cglobal alac_decorrelate_stereo, 2, 5, 8, buf0, len, shift, weight, buf1
%else
cglobal alac_decorrelate_stereo, 2, 3, 8, buf0, len, shift, weight
; Only three GPRs are requested here, so r2 doubles as the channel-1 pointer;
; shift and weight are only ever read through their shiftm/weightm forms.
%define  buf1q  r2q
%endif
    ; m7 = weight replicated into every dword lane, m6 = shift count
    movd    m7, weightm
    SPLATD  m7
    movd    m6, shiftm

    ; Fetch both channel pointers; the second slot must be read before buf0q
    ; (which still holds the pointer-array address) is replaced.
    mov  buf1q, [buf0q + gprsize]
    mov  buf0q, [buf0q]

    ; Turn the sample count into a byte count, move both pointers to the end
    ; of their buffers and walk a negative offset up towards zero.
    shl   lend, 2
    add  buf0q, lenq
    add  buf1q, lenq
    neg   lenq

align 16
.loop:
    ; load two vectors from each channel (m0/m1 = channel 0, m2/m3 = channel 1)
    mova    m2, [buf1q + lenq]
    mova    m3, [buf1q + lenq + mmsize]
    mova    m0, [buf0q + lenq]
    mova    m1, [buf0q + lenq + mmsize]

    ; first vector: a -= (b * weight) >> shift ; b += a
    pmulld  m4, m2, m7
    psrad   m4, m6
    psubd   m0, m4
    paddd   m2, m0

    ; second vector, same computation
    pmulld  m5, m3, m7
    psrad   m5, m6
    psubd   m1, m5
    paddd   m3, m1

    ; the channels are written back crosswise: a -> channel 1, b -> channel 0
    mova [buf1q + lenq], m0
    mova [buf1q + lenq + mmsize], m1
    mova [buf0q + lenq], m2
    mova [buf0q + lenq + mmsize], m3

    ; advance by the two vectors just processed; loop while the offset is < 0
    add   lenq, 2*mmsize
    jl .loop
    RET
66
;------------------------------------------------------------------------------
; alac_append_extra_bits_stereo
;
; The first two arguments each point at a pair of consecutive pointers
; (sample buffers and extra-bits buffers, one per channel). The third
; argument (r2m) is the shift count and the fifth (lenm) is the element
; count; it is scaled by 4 below, so the elements are dwords.
; For both channels and every dword:
;
;     buf[i] = (buf[i] << exbits) | exbuf[i]
;
; After the prologue r2/r3 are reused as the channel-1 pointers
; (buf1/exbuf1), which is why the cglobal names differ from the argument list.
;
; NOTE(review): the loop uses mova and aligned memory operands, runs at
; least once and consumes 2*mmsize bytes per pass - callers presumably
; provide aligned, padded buffers; confirm.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal alac_append_extra_bits_stereo, 2, 5, 5, buf0, exbuf0, buf1, exbuf1, len
    ; exbits has to be read before buf1q is loaded, as r2m may alias r2
    movd      m4, r2m ; exbits
    movifnidn lend, lenm

    ; Fetch the channel-1 pointers first, while buf0q/exbuf0q still hold the
    ; pointer-array addresses, then replace those with the channel-0 pointers.
    mov  exbuf1q, [exbuf0q + gprsize]
    mov    buf1q, [buf0q + gprsize]
    mov  exbuf0q, [exbuf0q]
    mov    buf0q, [buf0q]

    ; sample count -> byte count; point everything at the buffer ends and
    ; iterate with a negative offset that counts up to zero
    shl     lend, 2
    add  exbuf0q, lenq
    add  exbuf1q, lenq
    add    buf0q, lenq
    add    buf1q, lenq
    neg     lenq

align 16
.loop:
    ; m0/m1 = two vectors of channel 0, m2/m3 = two vectors of channel 1
    mova      m0, [buf0q + lenq]
    mova      m1, [buf0q + lenq + mmsize]
    mova      m2, [buf1q + lenq]
    mova      m3, [buf1q + lenq + mmsize]

    ; make room for the extra bits ...
    pslld     m0, m4
    pslld     m1, m4
    pslld     m2, m4
    pslld     m3, m4

    ; ... and merge them in from the matching extra-bits buffer
    por       m0, [exbuf0q + lenq]
    por       m1, [exbuf0q + lenq + mmsize]
    por       m2, [exbuf1q + lenq]
    por       m3, [exbuf1q + lenq + mmsize]

    mova [buf0q + lenq], m0
    mova [buf0q + lenq + mmsize], m1
    mova [buf1q + lenq], m2
    mova [buf1q + lenq + mmsize], m3

    ; advance by the two vectors just processed; loop while the offset is < 0
    add     lenq, 2*mmsize
    jl .loop
    REP_RET
104
;------------------------------------------------------------------------------
; alac_append_extra_bits_mono
;
; Single-channel variant: only the first pointer of each pointer array is
; used. The third argument is the shift count and the fifth (r4m) the
; element count; it is scaled by 4 below, so the elements are dwords.
; For every dword:
;
;     buf[i] = (buf[i] << exbits) | exbuf[i]
;
; NOTE(review): the loop uses mova and aligned memory operands, runs at
; least once and consumes 2*mmsize bytes per pass - callers presumably
; provide aligned, padded buffers; confirm.
;------------------------------------------------------------------------------
%if ARCH_X86_64
cglobal alac_append_extra_bits_mono, 2, 5, 3, buf, exbuf, exbits, ch, len
%else
; Only three GPRs on x86-32: r2 holds len, and exbits is read via r2m instead.
cglobal alac_append_extra_bits_mono, 2, 3, 3, buf, exbuf, len
%define exbitsm r2m
%endif
    movd     m2, exbitsm            ; shift count for pslld
    movifnidn lend, r4m             ; fifth argument = element count

    ; dereference the pointer arrays (first slot only)
    mov  exbufq, [exbufq]
    mov    bufq, [bufq]

    ; sample count -> byte count; point both buffers at their ends and
    ; iterate with a negative offset that counts up to zero
    shl    lend, 2
    add  exbufq, lenq
    add    bufq, lenq
    neg    lenq

align 16
.loop:
    ; first vector: shift the samples up and merge in the extra bits
    mova      m0, [bufq + lenq]
    pslld     m0, m2
    por       m0, [exbufq + lenq]

    ; second vector, same computation
    mova      m1, [bufq + lenq + mmsize]
    pslld     m1, m2
    por       m1, [exbufq + lenq + mmsize]

    mova [bufq + lenq], m0
    mova [bufq + lenq + mmsize], m1

    ; advance by the two vectors just processed; loop while the offset is < 0
    add     lenq, 2*mmsize
    jl .loop
    REP_RET
134