1;******************************************************************************
2;* Vorbis x86 optimizations
3;* Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
4;*
5;* This file is part of FFmpeg.
6;*
7;* FFmpeg is free software; you can redistribute it and/or
8;* modify it under the terms of the GNU Lesser General Public
9;* License as published by the Free Software Foundation; either
10;* version 2.1 of the License, or (at your option) any later version.
11;*
12;* FFmpeg is distributed in the hope that it will be useful,
13;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15;* Lesser General Public License for more details.
16;*
17;* You should have received a copy of the GNU Lesser General Public
18;* License along with FFmpeg; if not, write to the Free Software
19;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20;******************************************************************************
21
22%include "libavutil/x86/x86util.asm"
23
24SECTION_RODATA
25
; Four packed dwords, each with only bit 31 set (the IEEE-754 single-precision
; sign bit).  Used as an andps mask by the SSE vorbis_inverse_coupling routine.
pdw_80000000: dd 0x80000000, 0x80000000, 0x80000000, 0x80000000
27
28SECTION .text
29
%if ARCH_X86_32
;------------------------------------------------------------------------------
; vorbis_inverse_coupling (3DNow! version, only built for 32-bit x86)
;
; In-place Vorbis inverse channel coupling on two arrays of 4-byte floats,
; processing two elements (one 64-bit MMX register) per iteration.
;
; Args:  mag        - first array, read and rewritten in place
;        ang        - second array, read and rewritten in place
;        block_size - number of elements in each array
;
; For every element pair (m, a) the loop computes
;        s     = (m >= 0.0) ? 0x80000000 : 0
;        t     = a ^ s                    ; a with its sign flipped if m >= 0
;        new a = m + ((a >= 0.0) ? t : 0)
;        new m = m - ((a >= 0.0) ? 0 : t)
;
; The loop body always runs at least once and advances by two elements, so
; block_size is presumably a positive multiple of 2 -- confirm with the caller.
;
; Vector registers: m0-m4 are temporaries, m5 holds 0.0 (6 in total, matching
; the count given to cglobal).
;------------------------------------------------------------------------------
INIT_MMX 3dnow
cglobal vorbis_inverse_coupling, 3, 3, 6, mag, ang, block_size
    pxor                     m5, m5                     ; m5 = 0.0 in both lanes
    lea                    magq, [magq+block_sizeq*4]   ; magq -> one past the last element
    lea                    angq, [angq+block_sizeq*4]   ; angq -> one past the last element
    neg             block_sizeq                         ; index runs from -block_size up to 0
.loop:
    mova                     m0, [magq+block_sizeq*4]   ; m0 = m
    mova                     m1, [angq+block_sizeq*4]   ; m1 = a
    mova                     m2, m0
    mova                     m3, m1
    pfcmpge                  m2, m5     ; m2 = (m >= 0.0) ? all-ones : 0
    pfcmpge                  m3, m5     ; m3 = (a >= 0.0) ? all-ones : 0
    pslld                    m2, 31     ; keep only the sign bit of the m mask
    pxor                     m1, m2     ; t = a, sign flipped where m >= 0.0
    mova                     m4, m3
    pand                     m3, m1     ; m3 = (a >= 0.0) ? t : 0
    pandn                    m4, m1     ; m4 = (a >= 0.0) ? 0 : t
    pfadd                    m3, m0     ; new a = m + ( (a >= 0.0) & t)
    pfsub                    m0, m4     ; new m = m - (!(a >= 0.0) & t)
    mova   [angq+block_sizeq*4], m3
    mova   [magq+block_sizeq*4], m0
    add             block_sizeq, 2      ; two floats per iteration
    jl .loop                            ; continue while the index is still negative
    femms                               ; leave MMX state before returning
    RET
%endif
58
;------------------------------------------------------------------------------
; vorbis_inverse_coupling (SSE version)
;
; In-place Vorbis inverse channel coupling on two arrays of 4-byte floats,
; processing four elements (one XMM register) per iteration.
;
; Args:  mag        - first array, read and rewritten in place
;        ang        - second array, read and rewritten in place
;        block_size - number of elements in each array
;
; For every element pair (m, a) the loop computes
;        s     = (0.0 <= m) ? 0x80000000 : 0
;        t     = a ^ s                    ; a with its sign flipped if m >= 0
;        new a = m + ((0.0 <= a) ? t : 0)
;        new m = m - ((0.0 <= a) ? 0 : t)
;
; The loop body always runs at least once and advances by four elements, so
; block_size is presumably a positive multiple of 4 -- confirm with the caller.
; All loads/stores use mova (aligned moves), so mag and ang are expected to be
; 16-byte aligned.
;
; Both pointers are advanced to the end of their arrays and block_size is
; negated, so one register serves as both index and loop counter; no fourth
; general-purpose register and no per-iteration cmp are needed.
;
; Vector registers: m0-m4 are temporaries, m5 holds the sign-bit mask.
;------------------------------------------------------------------------------
INIT_XMM sse
cglobal vorbis_inverse_coupling, 3, 3, 6, mag, ang, block_size
    mova                     m5, [pdw_80000000]         ; m5 = sign-bit mask in every lane
    lea                    magq, [magq+block_sizeq*4]   ; magq -> one past the last element
    lea                    angq, [angq+block_sizeq*4]   ; angq -> one past the last element
    neg             block_sizeq                         ; index runs from -block_size up to 0
align 16
.loop:
    mova                     m0, [magq+block_sizeq*4]   ; m0 = m
    mova                     m1, [angq+block_sizeq*4]   ; m1 = a
    xorps                    m2, m2
    xorps                    m3, m3
    cmpleps                  m2, m0     ; m2 = (0.0 <= m) ? all-ones : 0
    cmpleps                  m3, m1     ; m3 = (0.0 <= a) ? all-ones : 0
    andps                    m2, m5     ; keep only the sign bit of the m mask
    xorps                    m1, m2     ; t = a, sign flipped where 0.0 <= m
    mova                     m4, m3
    andps                    m3, m1     ; m3 = (0.0 <= a) ? t : 0
    andnps                   m4, m1     ; m4 = (0.0 <= a) ? 0 : t
    addps                    m3, m0     ; new a = m + ( (0.0 <= a) & t)
    subps                    m0, m4     ; new m = m - (!(0.0 <= a) & t)
    mova   [angq+block_sizeq*4], m3
    mova   [magq+block_sizeq*4], m0
    add             block_sizeq, 4      ; four floats per iteration
    jl .loop                            ; continue while the index is still negative
    RET
84