;******************************************************************************
;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pw_row_coeffs:  times 4 dw 13
                times 4 dw 17
                times 4 dw  7
pd_512: times 2 dd 0x200
pw_col_coeffs:  dw 13,  13,  13, -13
                dw 17,   7,   7, -17
                dw 13, -13,  13,  13
                dw -7,  17, -17,  -7
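; pw_row_coeffs keeps each of the 13/17/7 basis coefficients splatted across
; four words for the pmullw-based row pass; pw_col_coeffs interleaves them as
; (even, odd) word pairs so a single pmaddwd in the column pass directly
; produces 13*c0+13*c2 | 13*c0-13*c2 and 17*c1+7*c3 | 7*c1-17*c3.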

SECTION .text

%macro IDCT_DC_NOROUND 1
    imul   %1, 13*13*3
    sar    %1, 11
%endmacro

%macro IDCT_DC_ROUND 1
    imul   %1, 13*13
    add    %1, 0x200
    sar    %1, 10
%endmacro
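
; Scalar equivalents of the two DC scaling macros:
;   IDCT_DC_ROUND:   x = (x * 13*13 + 0x200) >> 10
;   IDCT_DC_NOROUND: x = (x * 13*13*3) >> 11
; rv34_idct below applies IDCT_DC to block[0] and broadcasts the result to
; all 16 int16_t coefficients of the 4x4 block.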

%macro rv34_idct 1
cglobal rv34_idct_%1, 1, 2, 0
    movsx   r1, word [r0]
    IDCT_DC r1
    movd    m0, r1d
    pshufw  m0, m0, 0
    movq    [r0+ 0], m0
    movq    [r0+ 8], m0
    movq    [r0+16], m0
    movq    [r0+24], m0
    REP_RET
%endmacro

INIT_MMX mmxext
%define IDCT_DC IDCT_DC_ROUND
rv34_idct dc
%define IDCT_DC IDCT_DC_NOROUND
rv34_idct dc_noround

; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
%if ARCH_X86_32
INIT_MMX mmx
cglobal rv34_idct_dc_add, 3, 3
    ; calculate DC
    IDCT_DC_ROUND r2
    pxor       m1, m1
    movd       m0, r2d
    psubw      m1, m0
    packuswb   m0, m0
    packuswb   m1, m1
    punpcklbw  m0, m0
    punpcklbw  m1, m1
    punpcklwd  m0, m0
    punpcklwd  m1, m1
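    ; m0 = max(dc, 0) and m1 = max(-dc, 0), each saturated to a byte by
    ; packuswb and replicated across the low four bytes; at most one of the
    ; two is non-zero, so the paddusb/psubusb pairs below add the signed DC
    ; to each row of four pixels with unsigned saturation.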

    ; add DC
    lea        r2, [r0+r1*2]
    movh       m2, [r0]
    movh       m3, [r0+r1]
    movh       m4, [r2]
    movh       m5, [r2+r1]
    paddusb    m2, m0
    paddusb    m3, m0
    paddusb    m4, m0
    paddusb    m5, m0
    psubusb    m2, m1
    psubusb    m3, m1
    psubusb    m4, m1
    psubusb    m5, m1
    movh       [r0], m2
    movh       [r0+r1], m3
    movh       [r2], m4
    movh       [r2+r1], m5
    RET
%endif

; Load coeffs and perform row transform
; Output: coeffs in mm0/mm4/mm6/mm7, rounder in mm5
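; Operates on all four columns at once; with b0..b3 denoting the four input
; rows (one mmx register each), each column is transformed as
;   z0 = 13*(b0+b2), z1 = 13*(b0-b2), z3 = 17*b1 + 7*b3, z2 = 7*b1 - 17*b3
;   mm0 = z0+z3, mm4 = z1+z2, mm6 = z1-z2, mm7 = z0-z3
; The source block is cleared to zero as it is read.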
%macro ROW_TRANSFORM  1
    pxor        mm7, mm7
    mova        mm0, [%1+ 0*8]
    mova        mm1, [%1+ 1*8]
    mova        mm2, [%1+ 2*8]
    mova        mm3, [%1+ 3*8]
    mova  [%1+ 0*8], mm7
    mova  [%1+ 1*8], mm7
    mova  [%1+ 2*8], mm7
    mova  [%1+ 3*8], mm7
    mova        mm4, mm0
    mova        mm6, [pw_row_coeffs+ 0]
    paddsw      mm0, mm2                ; b0 + b2
    psubsw      mm4, mm2                ; b0 - b2
    pmullw      mm0, mm6                ; *13 = z0
    pmullw      mm4, mm6                ; *13 = z1
    mova        mm5, mm1
    pmullw      mm1, [pw_row_coeffs+ 8] ; b1*17
    pmullw      mm5, [pw_row_coeffs+16] ; b1* 7
    mova        mm7, mm3
    pmullw      mm3, [pw_row_coeffs+ 8] ; b3*17
    pmullw      mm7, [pw_row_coeffs+16] ; b3* 7
    paddsw      mm1, mm7                ; z3 = b1*17 + b3* 7
    psubsw      mm5, mm3                ; z2 = b1* 7 - b3*17
    mova        mm7, mm0
    mova        mm6, mm4
    paddsw      mm0, mm1                ; z0 + z3
    psubsw      mm7, mm1                ; z0 - z3
    paddsw      mm4, mm5                ; z1 + z2
    psubsw      mm6, mm5                ; z1 - z2
    mova        mm5, [pd_512]           ; 0x200
%endmacro

; ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block);
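; COL_TRANSFORM produces one 4-pixel output row: %2 holds four intermediate
; coefficients c0..c3 from ROW_TRANSFORM, %3/%4 the packed column coefficient
; pairs.  pmaddwd yields z0|z1 and z3|z2, the rounder in mm5 is added, the
; butterfly results are shifted right by 10 and added to the four destination
; bytes at %1 with clamping to [0,255].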
%macro COL_TRANSFORM  4
    pshufw      mm3, %2, 0xDD        ; col. 1,3,1,3
    pshufw       %2, %2, 0x88        ; col. 0,2,0,2
    pmaddwd      %2, %3              ; 13*c0+13*c2 | 13*c0-13*c2 = z0 | z1
    pmaddwd     mm3, %4              ; 17*c1+ 7*c3 |  7*c1-17*c3 = z3 | z2
    paddd        %2, mm5
    pshufw      mm1,  %2, 01001110b  ;    z1 | z0
    pshufw      mm2, mm3, 01001110b  ;    z2 | z3
    paddd        %2, mm3             ; z0+z3 | z1+z2
    psubd       mm1, mm2             ; z1-z2 | z0-z3
    movd        mm3, %1
    psrad        %2, 10
    pxor        mm2, mm2
    psrad       mm1, 10
    punpcklbw   mm3, mm2
    packssdw     %2, mm1
    paddw        %2, mm3
    packuswb     %2, %2
    movd         %1, %2
%endmacro
INIT_MMX mmxext
cglobal rv34_idct_add, 3,3,0, d, s, b
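    ; named arguments: d = dst, s = stride, b = block; dq/sq/bq are the
    ; pointer-sized register aliases x86inc derives from these names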
    ROW_TRANSFORM       bq
    COL_TRANSFORM     [dq], mm0, [pw_col_coeffs+ 0], [pw_col_coeffs+ 8]
    mova               mm0, [pw_col_coeffs+ 0]
    COL_TRANSFORM  [dq+sq], mm4, mm0, [pw_col_coeffs+ 8]
    mova               mm4, [pw_col_coeffs+ 8]
    lea                 dq, [dq + 2*sq]
    COL_TRANSFORM     [dq], mm6, mm0, mm4
    COL_TRANSFORM  [dq+sq], mm7, mm0, mm4
    ret

; ff_rv34_idct_dc_add_sse2/sse4(uint8_t *dst, int stride, int dc);
%macro RV34_IDCT_DC_ADD 0
cglobal rv34_idct_dc_add, 3, 3, 6
    ; calculate DC
    IDCT_DC_ROUND r2
    pxor       m1, m1

    ; load the pixel rows, broadcast the DC and add it to them
    movd       m0, r2d
    lea        r2, [r0+r1*2]
    movd       m2, [r0]
    movd       m3, [r0+r1]
    pshuflw    m0, m0, 0
    movd       m4, [r2]
    movd       m5, [r2+r1]
    punpcklqdq m0, m0
    punpckldq  m2, m3
    punpckldq  m4, m5
    punpcklbw  m2, m1
    punpcklbw  m4, m1
    paddw      m2, m0
    paddw      m4, m0
    packuswb   m2, m4
    movd      [r0], m2
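    ; row 0 was stored above; rows 1-3 are extracted with pextrd on SSE4,
    ; or by repeatedly shifting the register down one dword on SSE2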
%if cpuflag(sse4)
    pextrd [r0+r1], m2, 1
    pextrd    [r2], m2, 2
    pextrd [r2+r1], m2, 3
%else
    psrldq     m2, 4
    movd   [r0+r1], m2
    psrldq     m2, 4
    movd      [r2], m2
    psrldq     m2, 4
    movd   [r2+r1], m2
%endif
    RET
%endmacro

INIT_XMM sse2
RV34_IDCT_DC_ADD
INIT_XMM sse4
RV34_IDCT_DC_ADD