1;******************************************************************************
2;* Vorbis x86 optimizations
3;* Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
4;*
5;* This file is part of FFmpeg.
6;*
7;* FFmpeg is free software; you can redistribute it and/or
8;* modify it under the terms of the GNU Lesser General Public
9;* License as published by the Free Software Foundation; either
10;* version 2.1 of the License, or (at your option) any later version.
11;*
12;* FFmpeg is distributed in the hope that it will be useful,
13;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15;* Lesser General Public License for more details.
16;*
17;* You should have received a copy of the GNU Lesser General Public
18;* License along with FFmpeg; if not, write to the Free Software
19;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20;******************************************************************************
21
22%include "libavutil/x86/x86util.asm"
23
24SECTION_RODATA
25
; Four dwords of 0x80000000: only the top bit of each 32-bit lane is set.
; The SSE vorbis_inverse_coupling loads this once as a packed mask and ANDs a
; compare result with it to keep just the sign-bit position of every lane.
pdw_80000000: times 4 dd 0x80000000
27
28SECTION .text
29
30%if ARCH_X86_32
;------------------------------------------------------------------------------
; vorbis_inverse_coupling (3DNow!, x86-32 only)
;
; Arguments (as named on the cglobal line):
;   mag        - array of 32-bit floats, updated in place
;   ang        - array of 32-bit floats, updated in place
;   block_size - number of floats in each array
;
; Two floats are processed per iteration and the loop body runs before the
; counter is tested, so block_size must be a positive multiple of 2.
;
; Per element, with m = mag[i], a = ang[i]:
;   t      = (m >  0.0) ? -a : a
;   ang[i] = m + ((a >= 0.0) ? t : 0)
;   mag[i] = m - ((a >= 0.0) ? 0 : t)
;
; The magnitude test is a strict "greater than zero" so that m == 0.0 takes
; the non-positive branch, matching the Vorbis inverse-coupling definition.
;------------------------------------------------------------------------------
INIT_MMX 3dnow
cglobal vorbis_inverse_coupling, 3, 3, 6, mag, ang, block_size
    pxor                     m5, m5     ; m5 = 0.0 in both lanes
    lea                    magq, [magq+block_sizeq*4] ; one past the end of mag
    lea                    angq, [angq+block_sizeq*4] ; one past the end of ang
    neg             block_sizeq         ; index runs from -block_size up to 0
.loop:
    mova                     m0, [magq+block_sizeq*4] ; m0 = m
    mova                     m1, [angq+block_sizeq*4] ; m1 = a
    mova                     m2, m0
    mova                     m3, m1
    pfcmpgt                  m2, m5     ; m2 = (m >  0.0) ? ~0 : 0
    pfcmpge                  m3, m5     ; m3 = (a >= 0.0) ? ~0 : 0
    pslld                    m2, 31     ; keep only the sign bit
    pxor                     m1, m2     ; t  = (m > 0.0) ? -a : a
    mova                     m4, m3
    pand                     m3, m1     ; m3 = (a >= 0.0) ? t : 0
    pandn                    m4, m1     ; m4 = (a >= 0.0) ? 0 : t
    pfadd                    m3, m0     ; a' = m + ((a >= 0.0) ? t : 0)
    pfsub                    m0, m4     ; m' = m - ((a >= 0.0) ? 0 : t)
    mova   [angq+block_sizeq*4], m3
    mova   [magq+block_sizeq*4], m0
    add             block_sizeq, 2      ; two floats per iteration
    jl .loop
    femms                               ; leave MMX state before returning
    RET
57%endif
58
;------------------------------------------------------------------------------
; vorbis_inverse_coupling (SSE)
;
; Arguments (as named on the cglobal line):
;   mag        - array of 32-bit floats, updated in place
;   ang        - array of 32-bit floats, updated in place
;   block_size - number of floats in each array
;
; One full vector (mmsize bytes) is processed per iteration using aligned
; loads/stores (mova), and the loop body runs before the counter is tested.
; mag and ang are therefore expected to be mmsize-aligned and block_size a
; positive multiple of mmsize/4.
;
; Per element, with m = mag[i], a = ang[i]:
;   t      = (0.0 <  m) ? -a : a
;   ang[i] = m + ((0.0 <= a) ? t : 0)
;   mag[i] = m - ((0.0 <= a) ? 0 : t)
;
; The magnitude test is a strict "greater than zero" so that m == 0.0 takes
; the non-positive branch, matching the Vorbis inverse-coupling definition.
;------------------------------------------------------------------------------
INIT_XMM sse
cglobal vorbis_inverse_coupling, 3, 3, 6, mag, ang, block_size
    mova                     m5, [pdw_80000000] ; sign-bit mask in every lane
    shl             block_sized, 2      ; element count -> byte count
    add                    magq, block_sizeq ; one past the end of mag
    add                    angq, block_sizeq ; one past the end of ang
    neg             block_sizeq         ; byte offset runs from -size up to 0

align 16
.loop:
    mova                     m0, [magq+block_sizeq] ; m0 = m
    mova                     m1, [angq+block_sizeq] ; m1 = a
    xorps                    m2, m2
    xorps                    m3, m3
    cmpltps                  m2, m0     ; m2 = (0.0 <  m) ? ~0 : 0
    cmpleps                  m3, m1     ; m3 = (0.0 <= a) ? ~0 : 0
    andps                    m2, m5     ; keep only the sign bit
    xorps                    m1, m2     ; t  = (0.0 < m) ? -a : a
    mova                     m4, m3
    andps                    m3, m1     ; m3 = (0.0 <= a) ? t : 0
    andnps                   m4, m1     ; m4 = (0.0 <= a) ? 0 : t
    addps                    m3, m0     ; a' = m + ((0.0 <= a) ? t : 0)
    subps                    m0, m4     ; m' = m - ((0.0 <= a) ? 0 : t)
    mova     [angq+block_sizeq], m3
    mova     [magq+block_sizeq], m0
    add             block_sizeq, mmsize ; advance by one vector of bytes
    jl .loop
    RET
87