1;******************************************************************************
2;* SIMD-optimized JPEG2000 DSP functions
3;* Copyright (c) 2014 Nicolas Bertrand
4;* Copyright (c) 2015 James Almer
5;*
6;* This file is part of FFmpeg.
7;*
8;* FFmpeg is free software; you can redistribute it and/or
9;* modify it under the terms of the GNU Lesser General Public
10;* License as published by the Free Software Foundation; either
11;* version 2.1 of the License, or (at your option) any later version.
12;*
13;* FFmpeg is distributed in the hope that it will be useful,
14;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16;* Lesser General Public License for more details.
17;*
18;* You should have received a copy of the GNU Lesser General Public
19;* License along with FFmpeg; if not, write to the Free Software
20;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21;******************************************************************************
22
23%include "libavutil/x86/x86util.asm"
24
SECTION_RODATA 32

; Multipliers for ict_float. Every value is replicated ICT_BCAST times, i.e.
; 8 dwords = 32 bytes, so one load of mmsize bytes fills a whole vector
; register whether the function is built for XMM (first 16 bytes) or YMM.
%define ICT_BCAST 8

pf_ict0: times ICT_BCAST dd 1.402      ; scales src2, added to src0      -> new src0
pf_ict1: times ICT_BCAST dd 0.34413    ; scales src1, subtracted from src0 -> new src1
pf_ict2: times ICT_BCAST dd 0.71414    ; scales src2, subtracted as well   -> new src1
pf_ict3: times ICT_BCAST dd 1.772      ; scales src1, added to src0      -> new src2
31
32SECTION .text
33
;***********************************************************************
; ff_ict_float_<opt>(float *src0, float *src1, float *src2, int csize)
;
; In-place transform of three float arrays. With s0/s1/s2 being the values
; loaded from src0/src1/src2, every element is rewritten as
;     src0 = s0 + pf_ict0 * s2
;     src1 = s0 - pf_ict1 * s1 - pf_ict2 * s2
;     src2 = s0 + pf_ict3 * s1
;
; csize is a count of floats. All three arrays are accessed with aligned
; full-vector moves, mmsize bytes per iteration, and the loop condition is
; only tested after the body has run once.
;***********************************************************************
%macro ICT_FLOAT 1 ; %1 = number of vector registers declared to cglobal
cglobal ict_float, 4, 4, %1, src0, src1, src2, csize
    ; pf_ict0 / pf_ict1 are kept in registers for every build.
    movaps   m6, [pf_ict0]
    movaps   m7, [pf_ict1]
    %define ICT0 m6
    %define ICT1 m7

    ; pf_ict2: cached in m8 on x86-64, used straight from memory on x86-32.
%if ARCH_X86_64
    movaps   m8, [pf_ict2]
    %define ICT2 m8
%else
    %define ICT2 [pf_ict2]
%endif

    ; pf_ict3: the AVX and FMA loop bodies never write m3, so it can hold the
    ; constant there. The SSE body uses m3 as a temporary, so SSE takes m9 on
    ; x86-64 and a memory operand on x86-32.
%if cpuflag(avx)
    movaps   m3, [pf_ict3]
    %define ICT3 m3
%elif ARCH_X86_64
    movaps   m9, [pf_ict3]
    %define ICT3 m9
%else
    %define ICT3 [pf_ict3]
%endif

    ; Convert the element count to bytes, move each pointer to the end of
    ; its array and let a negative byte offset count up towards zero.
    shl  csized, 2
    add   src2q, csizeq
    add   src1q, csizeq
    add   src0q, csizeq
    neg  csizeq

align 16
.loop:
    movaps   m0, [src0q+csizeq]         ; m0 = s0
    movaps   m1, [src1q+csizeq]         ; m1 = s1
    movaps   m2, [src2q+csizeq]         ; m2 = s2

%if cpuflag(fma3) || cpuflag(fma4)
%if cpuflag(fma4)
    ; fma4: non-destructive, the destination may be a fresh register.
    fmaddps   m4, m2, ICT0, m0          ; m4 = s2*ICT0 + s0
    fnmaddps  m5, m1, ICT1, m0          ; m5 = s0 - s1*ICT1
%else ; fma3
    ; fma3: the destination must also be a source, so seed it with a copy
    ; of the multiplicand first.
    movaps    m4, m2
    movaps    m5, m1
    fmaddps   m4, m4, ICT0, m0          ; m4 = s2*ICT0 + s0
    fnmaddps  m5, m5, ICT1, m0          ; m5 = s0 - s1*ICT1
%endif
    fmaddps   m0, m1, ICT3, m0          ; m0 = s1*ICT3 + s0
    fnmaddps  m5, m2, ICT2, m5          ; m5 = m5 - s2*ICT2
%else ; non FMA
%if cpuflag(avx)
    mulps    m4, m2, ICT0               ; m4 = s2*ICT0
    mulps    m5, m1, ICT1               ; m5 = s1*ICT1
    mulps    m2, m2, ICT2               ; m2 = s2*ICT2
    mulps    m1, m1, ICT3               ; m1 = s1*ICT3
    subps    m5, m0, m5                 ; m5 = s0 - s1*ICT1
%else ; sse
    movaps   m5, m0
    movaps   m4, m2
    movaps   m3, m1
    mulps    m4, ICT0                   ; m4 = s2*ICT0
    mulps    m3, ICT1                   ; m3 = s1*ICT1
    mulps    m2, ICT2                   ; m2 = s2*ICT2
    mulps    m1, ICT3                   ; m1 = s1*ICT3
    subps    m5, m3                     ; m5 = s0 - s1*ICT1
%endif
    ; Shared tail of the mul/add variants. m4 must consume s0 before m0 is
    ; overwritten with its own result.
    addps    m4, m4, m0                 ; m4 = s2*ICT0 + s0
    subps    m5, m5, m2                 ; m5 = s0 - s1*ICT1 - s2*ICT2
    addps    m0, m0, m1                 ; m0 = s0 + s1*ICT3
%endif

    movaps   [src0q+csizeq], m4         ; new src0
    movaps   [src2q+csizeq], m0         ; new src2
    movaps   [src1q+csizeq], m5         ; new src1
    add  csizeq, mmsize                 ; next vector; offset is negative while data remains
    jl .loop
    REP_RET
%endmacro

; The register count matches the highest register each build touches on
; x86-64: m9 for SSE, m8 for the AVX/FMA builds.
INIT_XMM sse
ICT_FLOAT 10
INIT_YMM avx
ICT_FLOAT 9
%if HAVE_FMA4_EXTERNAL
INIT_XMM fma4
ICT_FLOAT 9
%endif
INIT_YMM fma3
ICT_FLOAT 9
129
;***************************************************************************
; ff_rct_int_<opt>(int32_t *src0, int32_t *src1, int32_t *src2, int csize)
;
; In-place integer transform of three int32 arrays. With s0/s1/s2 being the
; values loaded from src0/src1/src2, every element is rewritten as
;     src1 = s0 - ((s1 + s2) >> 2)      (arithmetic shift)
;     src0 = s2 + src1
;     src2 = s1 + src1
;
; csize is a count of int32 elements. The arrays are accessed with full
; vector moves, mmsize bytes per iteration, and the loop condition is only
; tested after the body has run once.
;***************************************************************************
%macro RCT_INT 0
cglobal rct_int, 4, 4, 4, src0, src1, src2, csize
    ; Convert the element count to bytes, move each pointer to the end of
    ; its array and let a negative byte offset count up towards zero.
    shl  csized, 2
    add   src2q, csizeq
    add   src1q, csizeq
    add   src0q, csizeq
    neg  csizeq

align 16
.loop:
    mova   m2, [src0q+csizeq]           ; m2 = s0
    mova   m0, [src1q+csizeq]           ; m0 = s1
    mova   m1, [src2q+csizeq]           ; m1 = s2
    paddd  m3, m0, m1                   ; m3 = s1 + s2
    psrad  m3, 2                        ; m3 = (s1 + s2) >> 2
    psubd  m2, m3                       ; m2 = s0 - ((s1 + s2) >> 2)
    paddd  m1, m2                       ; m1 = s2 + m2
    paddd  m0, m2                       ; m0 = s1 + m2
    mova   [src1q+csizeq], m2           ; new src1
    mova   [src2q+csizeq], m0           ; new src2
    mova   [src0q+csizeq], m1           ; new src0
    add  csizeq, mmsize                 ; next vector; offset is negative while data remains
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse2
RCT_INT
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
RCT_INT
%endif
165