1;******************************************************************************
2;* SIMD optimized SBC encoder DSP functions
3;*
4;* Copyright (C) 2017  Aurelien Jacobs <aurel@gnuage.org>
5;* Copyright (C) 2008-2010  Nokia Corporation
6;* Copyright (C) 2004-2010  Marcel Holtmann <marcel@holtmann.org>
7;* Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
8;* Copyright (C) 2005-2006  Brad Midgley <bmidgley@xmission.com>
9;*
10;* This file is part of FFmpeg.
11;*
12;* FFmpeg is free software; you can redistribute it and/or
13;* modify it under the terms of the GNU Lesser General Public
14;* License as published by the Free Software Foundation; either
15;* version 2.1 of the License, or (at your option) any later version.
16;*
17;* FFmpeg is distributed in the hope that it will be useful,
18;* but WITHOUT ANY WARRANTY; without even the implied warranty of
19;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20;* Lesser General Public License for more details.
21;*
22;* You should have received a copy of the GNU Lesser General Public
23;* License along with FFmpeg; if not, write to the Free Software
24;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25;******************************************************************************
26
27%include "libavutil/x86/x86util.asm"
28
SECTION_RODATA

; One MMX quadword holding 0x8000 in each of its two dwords.
; It serves two purposes in this file:
;   * rounding bias for the analysis filters:
;         1 << (SBC_PROTO_FIXED_SCALE - 1), with SBC_PROTO_FIXED_SCALE = 16
;   * initial OR seed for the scale factor search:
;         1 << SCALE_OUT_BITS, with SCALE_OUT_BITS = 15
scale_mask: dd 0x8000, 0x8000

SECTION .text
34
; NIDN op, dst, src
;
; Emits "op dst, src" unless dst and src are textually identical, in which
; case nothing is emitted.  The macros below rely on this both to drop
; register self-moves and to let a caller suppress an accumulate step by
; passing the same operand as destination and addend.
%macro NIDN 3
%ifidn %2, %3
    ; identical operands: emit nothing
%else
    %1            %2, %3
%endif
%endmacro
40
; ANALYZE_MAC out1, out2, in1, in2, tmp1, tmp2, add1, add2, offset
;
; One multiply-accumulate step on two MMX quadwords:
;     tmp1 = pmaddwd(in1, consts[offset    ])
;     tmp2 = pmaddwd(in2, consts[offset + 8])
;     out1 += add1        (skipped when out1 and add1 are the same operand)
;     out2 += add2        (skipped when out2 and add2 are the same operand)
; "consts" is the constsq pointer of the enclosing function; offsets are in
; bytes.
;
; The two copies must stay ahead of the first pmaddwd: callers may pass a
; tmp1 that is the same register as in2, and pmaddwd overwrites tmp1.
%macro ANALYZE_MAC 9 ; out1, out2, in1, in2, tmp1, tmp2, add1, add2, offset
    ; fetch the inputs into the temporaries (no-op when already there)
    NIDN movq,    %5, %3
    NIDN movq,    %6, %4

    ; multiply the 16-bit lanes by the constants, summing adjacent pairs
    pmaddwd       %5, [constsq+%9]
    pmaddwd       %6, [constsq+8+%9]

    ; fold the addends into the outputs
    NIDN paddd,   %1, %7
    NIDN paddd,   %2, %8
%endmacro
49
; ANALYZE_MAC_IN out1, out2, tmp1, tmp2, add1, add2, offset
;
; ANALYZE_MAC step whose two inputs are the quadwords at byte offsets
; "offset" and "offset + 8" of the input buffer (inq).  The same offset
; selects the constants.
%macro ANALYZE_MAC_IN 7 ; out1, out2, tmp1, tmp2, add1, add2, offset
    ANALYZE_MAC   %1, %2,               \
                  [inq+%7], [inq+8+%7], \
                  %3, %4, %5, %6, %7
%endmacro
53
; ANALYZE_MAC_REG out1, out2, in, tmp1, tmp2, offset, pack
;
; ANALYZE_MAC step whose two inputs are the same register "in"; the two
; products left in tmp1/tmp2 are also the addends for out1/out2.
;
; When the last argument is the literal token "pack", "in" is first reduced
; in place from two 32-bit sums to 16-bit values: arithmetic shift right by
; SBC_PROTO_FIXED_SCALE (16), then a signed saturating pack that leaves the
; two words duplicated in both halves of the register.  Any other token
; (the callers use "no") uses "in" as it is.
%macro ANALYZE_MAC_REG 7 ; out1, out2, in, tmp1, tmp2, offset, pack
%ifidn pack, %7
    psrad         %3, 16    ; SBC_PROTO_FIXED_SCALE
    packssdw      %3, %3
%endif

    ANALYZE_MAC   %1, %2, %3, %3, %4, %5, %4, %5, %6
%endmacro
61
;*******************************************************************
;void ff_sbc_analyze_4(const int16_t *in, int32_t *out, const int16_t *consts);
;
; Stage 1: m0/m1 accumulate five multiply-add groups over in[0..39]
;          against consts bytes 0..79, starting from the rounding bias in
;          scale_mask.
; Stage 2: m0 and m1 are scaled down to 16 bit and multiplied with consts
;          bytes 80..111; the four 32-bit sums are stored to out[0..3].
;
; No emms is issued here.
; NOTE(review): presumably the caller is responsible for emms - confirm.
;*******************************************************************
INIT_MMX mmx
cglobal sbc_analyze_4, 3, 3, 4, in, out, consts
    ; first group: the products land in m0/m1 and receive the bias
    ANALYZE_MAC_IN   m0, m1, m0, m1, [scale_mask], [scale_mask], 0

    ; groups at byte offsets 16, 32, 48, 64: products go through m2/m3
%assign sbc_tap_off 16
%rep 4
    ANALYZE_MAC_IN   m0, m1, m2, m3, m2, m3, sbc_tap_off
%assign sbc_tap_off sbc_tap_off+16
%endrep
%undef sbc_tap_off

    ; second stage: results build up in m0 (out[0..1]) and m2 (out[2..3])
    ANALYZE_MAC_REG  m0, m2, m0, m0, m2, 80, pack
    ANALYZE_MAC_REG  m0, m2, m1, m1, m3, 96, pack

    movq          [outq  ], m0
    movq          [outq+8], m2

    RET
80
81
;*******************************************************************
;void ff_sbc_analyze_8(const int16_t *in, int32_t *out, const int16_t *consts);
;
; Stage 1: m0..m3 accumulate ten multiply-add groups over in[0..79]
;          against consts bytes 0..159, starting from the rounding bias in
;          scale_mask.  Even groups feed m0/m1, odd groups feed m2/m3.
; Stage 2: m0..m3 are scaled down to 16 bit and multiplied with consts
;          bytes 160..287; the eight 32-bit sums are stored to out[0..7].
;
; No emms is issued here.
; NOTE(review): presumably the caller is responsible for emms - confirm.
; NOTE(review): cglobal declares 4 vector registers although m0-m7 are
; used; presumably that count is irrelevant for MMX registers - confirm.
;*******************************************************************
INIT_MMX mmx
cglobal sbc_analyze_8, 3, 3, 4, in, out, consts
    ; first two groups: the products land in the accumulators themselves
    ; and receive the bias
    ANALYZE_MAC_IN   m0, m1, m0, m1, [scale_mask], [scale_mask],  0
    ANALYZE_MAC_IN   m2, m3, m2, m3, [scale_mask], [scale_mask], 16

    ; group pairs at byte offsets (32,48), (64,80), (96,112), (128,144)
%assign sbc_tap_off 32
%rep 4
    ANALYZE_MAC_IN   m0, m1, m4, m5, m4, m5, sbc_tap_off
    ANALYZE_MAC_IN   m2, m3, m6, m7, m6, m7, sbc_tap_off+16
%assign sbc_tap_off sbc_tap_off+32
%endrep
%undef sbc_tap_off

    ; second stage, first half: m0..m3 are packed to 16 bit in place here,
    ; results build up in m4 (out[0..1]) and m5 (out[2..3])
    ANALYZE_MAC_REG  m4, m5, m0, m4, m5, 160, pack
    ANALYZE_MAC_REG  m4, m5, m1, m6, m7, 192, pack
    ANALYZE_MAC_REG  m4, m5, m2, m6, m7, 224, pack
    ANALYZE_MAC_REG  m4, m5, m3, m6, m7, 256, pack

    movq          [outq  ], m4
    movq          [outq+8], m5

    ; second stage, second half: the already packed m0..m3 are consumed
    ; (multiplied in place), results build up in m0 (out[4..5]) and
    ; m5 (out[6..7])
    ANALYZE_MAC_REG  m0, m5, m0, m0, m5, 176, no
    ANALYZE_MAC_REG  m0, m5, m1, m1, m7, 208, no
    ANALYZE_MAC_REG  m0, m5, m2, m2, m7, 240, no
    ANALYZE_MAC_REG  m0, m5, m3, m3, m7, 272, no

    movq          [outq+16], m0
    movq          [outq+24], m5

    RET
115
116
;*******************************************************************
;void ff_sbc_calc_scalefactors(int32_t sb_sample_f[16][2][8],
;                              uint32_t scale_factor[2][8],
;                              int blocks, int channels, int subbands)
;
; For every channel ch and subband sb in use:
;     x = (1 << SCALE_OUT_BITS) | OR over blk < blocks of (v ? |v| - 1 : 0)
;         with v = sb_sample_f[blk][ch][sb]
;     scale_factor[ch][sb] = bsr(x) - SCALE_OUT_BITS
; Two adjacent subbands are handled per iteration (one MMX quadword).
;
; Both arrays reserve 8 entries (32 bytes) per channel whatever the number
; of subbands in use, so channel 1 always starts at byte offset 32.  The
; rows are therefore walked per channel; treating the two rows as a single
; run of 4 * subbands * channels bytes is only correct for 8 subbands or a
; single channel and would skip channel 1 entirely for 4 subbands stereo.
;
; Requirements: blocks >= 1; subbands even and in 2..8; channels < 2 is
; handled as one channel, anything else as two.
;*******************************************************************
INIT_MMX mmx
cglobal sbc_calc_scalefactors, 5, 7, 4, sb_sample_f, scale_factor, blocks, channels, subbands, ptr, blk
    movq          m3, [scale_mask]      ; 1 << SCALE_OUT_BITS in both dwords

    ; From here on the argument registers are reused as follows:
    ;   channels = 4 * subbands   (bytes in use at the start of each row)
    ;   subbands = byte cursor into the [2][8] rows; it points just past
    ;              the next subband pair to process
    shl           subbandsd, 2
    cmp           channelsd, 2
    mov           channelsd, subbandsd  ; mov leaves the flags of cmp intact
    jl            .loop_1
    add           subbandsd, 32         ; two channels: start in channel 1's row

.loop_1:
    sub           subbandsq, 8
    lea           ptrq, [sb_sample_fq + subbandsq]

    ; blk = (blocks - 1) * 64;   (64 = bytes per block, 2 * 8 * 4)
    lea           blkq, [blocksq - 1]
    shl           blkd, 6               ; 32-bit shift also clears the upper half

    movq          m0, m3
.loop_2:
    ; m1 = v ? |v| - 1 : 0 for both dwords, without branches
    movq          m1, [ptrq+blkq]
    pxor          m2, m2
    pcmpgtd       m1, m2                ; -1 where v > 0, else 0
    paddd         m1, [ptrq+blkq]       ; v - 1 where v > 0, else v
    pcmpgtd       m2, m1                ; -1 where that is negative
    pxor          m1, m2                ; one's complement of negative lanes

    por           m0, m1

    sub           blkq, 64
    jns           .loop_2

    ; m0 is never zero thanks to the seed, so bsr is well defined
    movd          blkd, m0
    psrlq         m0,   32
    bsr           blkd, blkd
    sub           blkd, 15    ; SCALE_OUT_BITS
    mov           [scale_factorq + subbandsq], blkd

    movd          blkd, m0
    bsr           blkd, blkd
    sub           blkd, 15    ; SCALE_OUT_BITS
    mov           [scale_factorq + subbandsq + 4], blkd

    test          subbandsd, 31         ; 31 = row size in bytes - 1
    jnz           .loop_1               ; more pairs left in this channel's row
    test          subbandsd, subbandsd
    jz            .done                 ; cursor 0: channel 0 is finished
    mov           subbandsd, channelsd  ; cursor 32: channel 1 is finished,
    jmp           .loop_1               ; continue with channel 0's row

.done:
    emms
    RET
169