1;******************************************************************************
2;* FLAC DSP functions
3;*
4;* Copyright (c) 2014 James Darnley <james.darnley@gmail.com>
5;*
6;* This file is part of FFmpeg.
7;*
8;* FFmpeg is free software; you can redistribute it and/or modify
9;* it under the terms of the GNU General Public License as published by
10;* the Free Software Foundation; either version 2 of the License, or
11;* (at your option) any later version.
12;*
13;* FFmpeg is distributed in the hope that it will be useful,
14;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16;* GNU General Public License for more details.
17;*
18;* You should have received a copy of the GNU General Public License along
19;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
20;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21;******************************************************************************
22
23%include "libavutil/x86/x86util.asm"
24
25SECTION .text
26
;------------------------------------------------------------------------------
; flac_enc_lpc_16  (built for SSE4 via INIT_XMM sse4; pmulld needs SSE4.1)
;
; Computes linear-prediction residuals over 32-bit elements:
;
;     res[i] = smp[i] - ((sum_{j=0}^{order-1} coefs[j] * smp[i-j-1]) >> shift)
;
; for i starting at `order`, after first copying the leading samples from smp
; to res unchanged.
;
; Arguments (names as passed to cglobal):
;   res, smp, len, order, coefs - the five named arguments
;   sixth argument (r5m)        - shift count, loaded once into m3
; NOTE(review): presumably corresponds to a C prototype of the form
;   void f(int32_t *res, const int32_t *smp, int len, int order,
;          const int32_t *coefs, int shift)
; -- confirm against the C-side declaration.
;
; Arithmetic: products (pmulld) and the running sum (paddd) are kept in 32-bit
; lanes, so anything that does not fit in 32 bits wraps.  The shift is
; arithmetic (psrad).
;
; Things this code relies on but does not check:
;   * order >= 1.  The inner loop is a do-while that terminates when posj
;     reaches 0 from below; with order == 0 it would start at 0 and not stop
;     where intended.
;   * The prologue always copies 32 dwords from smp to res, whatever order and
;     len are, so both buffers are assumed to hold at least 32 elements.
;   * Each outer pass produces (3*mmsize)/4 residuals and the loop runs while
;     the remaining count is > 0, so the last pass may read smp and write res
;     past index len-1.  Buffers are assumed to be padded for this
;     (TODO confirm with the caller).
;   * All loads and stores are unaligned (movu); no alignment is required.
;------------------------------------------------------------------------------
27INIT_XMM sse4
28%if ARCH_X86_64
    ; x86_64: 5 args in registers, 7 GPRs, 8 XMM registers, no stack space
    ; (per the usual x86inc cglobal argument order).
29    cglobal flac_enc_lpc_16, 5, 7, 8, 0, res, smp, len, order, coefs
    ; Scratch registers for the inner-loop indices: t0 = r5, t1 = r6.
30    DECLARE_REG_TMP 5, 6
    ; The remaining-sample counter lives in the register holding `len`.
31    %define length r2d
32
33    movsxd orderq, orderd ; sign-extend order: it is used in 64-bit addressing
34%else
    ; x86_32: only 6 GPRs are requested, so r2 (which holds `len`) doubles as
    ; t0.  The counter is therefore kept in r2mp (the memory form of
    ; argument 2) rather than in a register.
35    cglobal flac_enc_lpc_16, 5, 6, 8, 0, res, smp, len, order, coefs
36    DECLARE_REG_TMP 2, 5
37    %define length r2mp
38%endif
39
40; Here we assume that the maximum order value is 32.  This means that we only
41; need to copy a maximum of 32 samples.  Therefore we let the preprocessor
42; unroll this loop and copy all 32.
; The repeat count is 32 / (dwords per vector); `iter` is a byte offset.  The
; copy is unconditional: it does not look at order or len.  Any copied element
; at index >= order is overwritten by the main loop below.
43%assign iter 0
44%rep 32/(mmsize/4)
45    movu  m0,         [smpq+iter]
46    movu [resq+iter],  m0
47    %assign iter iter+mmsize
48%endrep
49
; Bias the pointers so the loops can use simple signed indices:
;   resq/smpq -> element `order` (first position that gets a residual)
;   coefsq    -> one past the last coefficient; indexed with a negative posj
50lea  resq,   [resq+orderq*4]
51lea  smpq,   [smpq+orderq*4]
52lea  coefsq, [coefsq+orderq*4]
53sub  length,  orderd ; length = number of residuals still to produce
54movd m3,      r5m ; m3 = shift count (6th argument), used by psrad below
55neg  orderq ; orderq = -order, the starting value of posj on every pass
56
; posj runs from -order up to 0, so [coefsq+posj*4] walks coefs[0..order-1].
; negj runs 0, -1, -2, ..., so [smpq+negj*4-4] walks smp[i-1], smp[i-2], ...
57%define posj t0q
58%define negj t1q
59
; Outer loop: one pass produces three vectors of residuals starting at the
; current smpq/resq.  m0, m4 and m6 are three independent accumulators.
60.looplen:
61    pxor m0,   m0
62    pxor m4,   m4
63    pxor m6,   m6
64    mov  posj, orderq
65    xor  negj, negj
66
    ; Inner loop: for each coefficient j, broadcast it to all lanes and
    ; multiply-accumulate against three consecutive vectors of history samples.
67    .looporder:
68        movd   m2, [coefsq+posj*4] ; c = coefs[j]
69        SPLATD m2
70        movu   m1, [smpq+negj*4-4] ; s = smp[i-j-1]
71        movu   m5, [smpq+negj*4-4+mmsize]
72        movu   m7, [smpq+negj*4-4+mmsize*2]
73        pmulld m1,  m2
74        pmulld m5,  m2
75        pmulld m7,  m2
76        paddd  m0,  m1             ; p += c * s
77        paddd  m4,  m5
78        paddd  m6,  m7
79
80        dec    negj
81        inc    posj
        ; The branch tests ZF from `inc posj` above, so that inc must stay the
        ; last flag-writing instruction; the loop exits when posj reaches 0.
82    jnz .looporder
83
84    psrad  m0,     m3              ; p >>= shift
85    psrad  m4,     m3
86    psrad  m6,     m3
87    movu   m1,    [smpq]
88    movu   m5,    [smpq+mmsize]
89    movu   m7,    [smpq+mmsize*2]
90    psubd  m1,     m0              ; smp[i] - p
91    psubd  m5,     m4
92    psubd  m7,     m6
93    movu  [resq],  m1              ; res[i] = smp[i] - (p >> shift)
94    movu  [resq+mmsize], m5
95    movu  [resq+mmsize*2], m7
96
    ; Advance by the three vectors just written; pointers move in bytes, the
    ; counter in elements (4 bytes each).
97    add resq,    3*mmsize
98    add smpq,    3*mmsize
99    sub length, (3*mmsize)/4
; Signed test on the flags from `sub length` above: keep going while samples
; remain.  The count is not required to be a multiple of the per-pass amount,
; so the final pass may process more elements than were left.
100jg .looplen
101RET
102