1 /*
2 * Copyright (C) 2004 the ffmpeg project
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 /**
22 * @file vp3dsp_mmx.c
23 * MMX-optimized functions cribbed from the original VP3 source code.
24 */
25
26 #include "dsputil.h"
27 #include "mmx.h"
28
29 #define IdctAdjustBeforeShift 8
30
31 /* (12 * 4) 2-byte memory locations ( = 96 bytes total)
32 * idct_constants[0..15] = Mask table (M(I))
33 * idct_constants[16..43] = Cosine table (C(I))
34 * idct_constants[44..47] = 8
35 */
36 static uint16_t idct_constants[(4 + 7 + 1) * 4];
37 static const uint16_t idct_cosine_table[7] = {
38 64277, 60547, 54491, 46341, 36410, 25080, 12785
39 };
40
41 #define r0 mm0
42 #define r1 mm1
43 #define r2 mm2
44 #define r3 mm3
45 #define r4 mm4
46 #define r5 mm5
47 #define r6 mm6
48 #define r7 mm7
49
50 /* from original comments: The Macro does IDct on 4 1-D Dcts */
51 #define BeginIDCT() { \
52 movq_m2r(*I(3), r2); \
53 movq_m2r(*C(3), r6); \
54 movq_r2r(r2, r4); \
55 movq_m2r(*J(5), r7); \
56 pmulhw_r2r(r6, r4); /* r4 = c3*i3 - i3 */ \
57 movq_m2r(*C(5), r1); \
58 pmulhw_r2r(r7, r6); /* r6 = c3*i5 - i5 */ \
59 movq_r2r(r1, r5); \
60 pmulhw_r2r(r2, r1); /* r1 = c5*i3 - i3 */ \
61 movq_m2r(*I(1), r3); \
62 pmulhw_r2r(r7, r5); /* r5 = c5*i5 - i5 */ \
63 movq_m2r(*C(1), r0); /* (all registers are in use) */ \
64 paddw_r2r(r2, r4); /* r4 = c3*i3 */ \
65 paddw_r2r(r7, r6); /* r6 = c3*i5 */ \
66 paddw_r2r(r1, r2); /* r2 = c5*i3 */ \
67 movq_m2r(*J(7), r1); \
68 paddw_r2r(r5, r7); /* r7 = c5*i5 */ \
69 movq_r2r(r0, r5); /* r5 = c1 */ \
70 pmulhw_r2r(r3, r0); /* r0 = c1*i1 - i1 */ \
71 paddsw_r2r(r7, r4); /* r4 = C = c3*i3 + c5*i5 */ \
72 pmulhw_r2r(r1, r5); /* r5 = c1*i7 - i7 */ \
73 movq_m2r(*C(7), r7); \
74 psubsw_r2r(r2, r6); /* r6 = D = c3*i5 - c5*i3 */ \
75 paddw_r2r(r3, r0); /* r0 = c1*i1 */ \
76 pmulhw_r2r(r7, r3); /* r3 = c7*i1 */ \
77 movq_m2r(*I(2), r2); \
78 pmulhw_r2r(r1, r7); /* r7 = c7*i7 */ \
79 paddw_r2r(r1, r5); /* r5 = c1*i7 */ \
80 movq_r2r(r2, r1); /* r1 = i2 */ \
81 pmulhw_m2r(*C(2), r2); /* r2 = c2*i2 - i2 */ \
82 psubsw_r2r(r5, r3); /* r3 = B = c7*i1 - c1*i7 */ \
83 movq_m2r(*J(6), r5); \
84 paddsw_r2r(r7, r0); /* r0 = A = c1*i1 + c7*i7 */ \
85 movq_r2r(r5, r7); /* r7 = i6 */ \
86 psubsw_r2r(r4, r0); /* r0 = A - C */ \
87 pmulhw_m2r(*C(2), r5); /* r5 = c2*i6 - i6 */ \
88 paddw_r2r(r1, r2); /* r2 = c2*i2 */ \
89 pmulhw_m2r(*C(6), r1); /* r1 = c6*i2 */ \
90 paddsw_r2r(r4, r4); /* r4 = C + C */ \
91 paddsw_r2r(r0, r4); /* r4 = C. = A + C */ \
92 psubsw_r2r(r6, r3); /* r3 = B - D */ \
93 paddw_r2r(r7, r5); /* r5 = c2*i6 */ \
94 paddsw_r2r(r6, r6); /* r6 = D + D */ \
95 pmulhw_m2r(*C(6), r7); /* r7 = c6*i6 */ \
96 paddsw_r2r(r3, r6); /* r6 = D. = B + D */ \
97 movq_r2m(r4, *I(1)); /* save C. at I(1) */ \
98 psubsw_r2r(r5, r1); /* r1 = H = c6*i2 - c2*i6 */ \
99 movq_m2r(*C(4), r4); \
100 movq_r2r(r3, r5); /* r5 = B - D */ \
101 pmulhw_r2r(r4, r3); /* r3 = (c4 - 1) * (B - D) */ \
102 paddsw_r2r(r2, r7); /* r7 = G = c6*i6 + c2*i2 */ \
103 movq_r2m(r6, *I(2)); /* save D. at I(2) */ \
104 movq_r2r(r0, r2); /* r2 = A - C */ \
105 movq_m2r(*I(0), r6); \
106 pmulhw_r2r(r4, r0); /* r0 = (c4 - 1) * (A - C) */ \
107 paddw_r2r(r3, r5); /* r5 = B. = c4 * (B - D) */ \
108 movq_m2r(*J(4), r3); \
109 psubsw_r2r(r1, r5); /* r5 = B.. = B. - H */ \
110 paddw_r2r(r0, r2); /* r0 = A. = c4 * (A - C) */ \
111 psubsw_r2r(r3, r6); /* r6 = i0 - i4 */ \
112 movq_r2r(r6, r0); \
113 pmulhw_r2r(r4, r6); /* r6 = (c4 - 1) * (i0 - i4) */ \
114 paddsw_r2r(r3, r3); /* r3 = i4 + i4 */ \
115 paddsw_r2r(r1, r1); /* r1 = H + H */ \
116 paddsw_r2r(r0, r3); /* r3 = i0 + i4 */ \
117 paddsw_r2r(r5, r1); /* r1 = H. = B + H */ \
118 pmulhw_r2r(r3, r4); /* r4 = (c4 - 1) * (i0 + i4) */ \
119 paddsw_r2r(r0, r6); /* r6 = F = c4 * (i0 - i4) */ \
120 psubsw_r2r(r2, r6); /* r6 = F. = F - A. */ \
121 paddsw_r2r(r2, r2); /* r2 = A. + A. */ \
122 movq_m2r(*I(1), r0); /* r0 = C. */ \
123 paddsw_r2r(r6, r2); /* r2 = A.. = F + A. */ \
124 paddw_r2r(r3, r4); /* r4 = E = c4 * (i0 + i4) */ \
125 psubsw_r2r(r1, r2); /* r2 = R2 = A.. - H. */ \
126 }
127
128 /* RowIDCT gets ready to transpose */
129 #define RowIDCT() { \
130 \
131 BeginIDCT(); \
132 \
133 movq_m2r(*I(2), r3); /* r3 = D. */ \
134 psubsw_r2r(r7, r4); /* r4 = E. = E - G */ \
135 paddsw_r2r(r1, r1); /* r1 = H. + H. */ \
136 paddsw_r2r(r7, r7); /* r7 = G + G */ \
137 paddsw_r2r(r2, r1); /* r1 = R1 = A.. + H. */ \
138 paddsw_r2r(r4, r7); /* r7 = G. = E + G */ \
139 psubsw_r2r(r3, r4); /* r4 = R4 = E. - D. */ \
140 paddsw_r2r(r3, r3); \
141 psubsw_r2r(r5, r6); /* r6 = R6 = F. - B.. */ \
142 paddsw_r2r(r5, r5); \
143 paddsw_r2r(r4, r3); /* r3 = R3 = E. + D. */ \
144 paddsw_r2r(r6, r5); /* r5 = R5 = F. + B.. */ \
145 psubsw_r2r(r0, r7); /* r7 = R7 = G. - C. */ \
146 paddsw_r2r(r0, r0); \
147 movq_r2m(r1, *I(1)); /* save R1 */ \
148 paddsw_r2r(r7, r0); /* r0 = R0 = G. + C. */ \
149 }
150
151 /* Column IDCT normalizes and stores final results */
152 #define ColumnIDCT() { \
153 \
154 BeginIDCT(); \
155 \
156 paddsw_m2r(*Eight, r2); /* adjust R2 (and R1) for shift */ \
157 paddsw_r2r(r1, r1); /* r1 = H. + H. */ \
158 paddsw_r2r(r2, r1); /* r1 = R1 = A.. + H. */ \
159 psraw_i2r(4, r2); /* r2 = NR2 */ \
160 psubsw_r2r(r7, r4); /* r4 = E. = E - G */ \
161 psraw_i2r(4, r1); /* r1 = NR1 */ \
162 movq_m2r(*I(2), r3); /* r3 = D. */ \
163 paddsw_r2r(r7, r7); /* r7 = G + G */ \
164 movq_r2m(r2, *I(2)); /* store NR2 at I2 */ \
165 paddsw_r2r(r4, r7); /* r7 = G. = E + G */ \
166 movq_r2m(r1, *I(1)); /* store NR1 at I1 */ \
167 psubsw_r2r(r3, r4); /* r4 = R4 = E. - D. */ \
168 paddsw_m2r(*Eight, r4); /* adjust R4 (and R3) for shift */ \
169 paddsw_r2r(r3, r3); /* r3 = D. + D. */ \
170 paddsw_r2r(r4, r3); /* r3 = R3 = E. + D. */ \
171 psraw_i2r(4, r4); /* r4 = NR4 */ \
172 psubsw_r2r(r5, r6); /* r6 = R6 = F. - B.. */ \
173 psraw_i2r(4, r3); /* r3 = NR3 */ \
174 paddsw_m2r(*Eight, r6); /* adjust R6 (and R5) for shift */ \
175 paddsw_r2r(r5, r5); /* r5 = B.. + B.. */ \
176 paddsw_r2r(r6, r5); /* r5 = R5 = F. + B.. */ \
177 psraw_i2r(4, r6); /* r6 = NR6 */ \
178 movq_r2m(r4, *J(4)); /* store NR4 at J4 */ \
179 psraw_i2r(4, r5); /* r5 = NR5 */ \
180 movq_r2m(r3, *I(3)); /* store NR3 at I3 */ \
181 psubsw_r2r(r0, r7); /* r7 = R7 = G. - C. */ \
182 paddsw_m2r(*Eight, r7); /* adjust R7 (and R0) for shift */ \
183 paddsw_r2r(r0, r0); /* r0 = C. + C. */ \
184 paddsw_r2r(r7, r0); /* r0 = R0 = G. + C. */ \
185 psraw_i2r(4, r7); /* r7 = NR7 */ \
186 movq_r2m(r6, *J(6)); /* store NR6 at J6 */ \
187 psraw_i2r(4, r0); /* r0 = NR0 */ \
188 movq_r2m(r5, *J(5)); /* store NR5 at J5 */ \
189 movq_r2m(r7, *J(7)); /* store NR7 at J7 */ \
190 movq_r2m(r0, *I(0)); /* store NR0 at I0 */ \
191 }
192
193 /* Following macro does two 4x4 transposes in place.
194
195 At entry (we assume):
196
197 r0 = a3 a2 a1 a0
198 I(1) = b3 b2 b1 b0
199 r2 = c3 c2 c1 c0
200 r3 = d3 d2 d1 d0
201
202 r4 = e3 e2 e1 e0
203 r5 = f3 f2 f1 f0
204 r6 = g3 g2 g1 g0
205 r7 = h3 h2 h1 h0
206
207 At exit, we have:
208
209 I(0) = d0 c0 b0 a0
210 I(1) = d1 c1 b1 a1
211 I(2) = d2 c2 b2 a2
212 I(3) = d3 c3 b3 a3
213
214 J(4) = h0 g0 f0 e0
215 J(5) = h1 g1 f1 e1
216 J(6) = h2 g2 f2 e2
217 J(7) = h3 g3 f3 e3
218
219 I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
220 J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
221
222 Since r1 is free at entry, we calculate the Js first. */
223
224 #define Transpose() { \
225 movq_r2r(r4, r1); /* r1 = e3 e2 e1 e0 */ \
226 punpcklwd_r2r(r5, r4); /* r4 = f1 e1 f0 e0 */ \
227 movq_r2m(r0, *I(0)); /* save a3 a2 a1 a0 */ \
228 punpckhwd_r2r(r5, r1); /* r1 = f3 e3 f2 e2 */ \
229 movq_r2r(r6, r0); /* r0 = g3 g2 g1 g0 */ \
230 punpcklwd_r2r(r7, r6); /* r6 = h1 g1 h0 g0 */ \
231 movq_r2r(r4, r5); /* r5 = f1 e1 f0 e0 */ \
232 punpckldq_r2r(r6, r4); /* r4 = h0 g0 f0 e0 = R4 */ \
233 punpckhdq_r2r(r6, r5); /* r5 = h1 g1 f1 e1 = R5 */ \
234 movq_r2r(r1, r6); /* r6 = f3 e3 f2 e2 */ \
235 movq_r2m(r4, *J(4)); \
236 punpckhwd_r2r(r7, r0); /* r0 = h3 g3 h2 g2 */ \
237 movq_r2m(r5, *J(5)); \
238 punpckhdq_r2r(r0, r6); /* r6 = h3 g3 f3 e3 = R7 */ \
239 movq_m2r(*I(0), r4); /* r4 = a3 a2 a1 a0 */ \
240 punpckldq_r2r(r0, r1); /* r1 = h2 g2 f2 e2 = R6 */ \
241 movq_m2r(*I(1), r5); /* r5 = b3 b2 b1 b0 */ \
242 movq_r2r(r4, r0); /* r0 = a3 a2 a1 a0 */ \
243 movq_r2m(r6, *J(7)); \
244 punpcklwd_r2r(r5, r0); /* r0 = b1 a1 b0 a0 */ \
245 movq_r2m(r1, *J(6)); \
246 punpckhwd_r2r(r5, r4); /* r4 = b3 a3 b2 a2 */ \
247 movq_r2r(r2, r5); /* r5 = c3 c2 c1 c0 */ \
248 punpcklwd_r2r(r3, r2); /* r2 = d1 c1 d0 c0 */ \
249 movq_r2r(r0, r1); /* r1 = b1 a1 b0 a0 */ \
250 punpckldq_r2r(r2, r0); /* r0 = d0 c0 b0 a0 = R0 */ \
251 punpckhdq_r2r(r2, r1); /* r1 = d1 c1 b1 a1 = R1 */ \
252 movq_r2r(r4, r2); /* r2 = b3 a3 b2 a2 */ \
253 movq_r2m(r0, *I(0)); \
254 punpckhwd_r2r(r3, r5); /* r5 = d3 c3 d2 c2 */ \
255 movq_r2m(r1, *I(1)); \
256 punpckhdq_r2r(r5, r4); /* r4 = d3 c3 b3 a3 = R3 */ \
257 punpckldq_r2r(r5, r2); /* r2 = d2 c2 b2 a2 = R2 */ \
258 movq_r2m(r4, *I(3)); \
259 movq_r2m(r2, *I(2)); \
260 }
261
ff_vp3_dsp_init_mmx(void)262 void ff_vp3_dsp_init_mmx(void)
263 {
264 int j = 16;
265 uint16_t *p;
266
267 j = 1;
268 do {
269 p = idct_constants + ((j + 3) << 2);
270 p[0] = p[1] = p[2] = p[3] = idct_cosine_table[j - 1];
271 } while (++j <= 7);
272
273 idct_constants[44] = idct_constants[45] =
274 idct_constants[46] = idct_constants[47] = IdctAdjustBeforeShift;
275 }
276
ff_vp3_idct_mmx(int16_t * output_data)277 void ff_vp3_idct_mmx(int16_t *output_data)
278 {
279 /* eax = quantized input
280 * ebx = dequantizer matrix
281 * ecx = IDCT constants
282 * M(I) = ecx + MaskOffset(0) + I * 8
283 * C(I) = ecx + CosineOffset(32) + (I-1) * 8
284 * edx = output
285 * r0..r7 = mm0..mm7
286 */
287
288 #define C(x) (idct_constants + 16 + (x - 1) * 4)
289 #define Eight (idct_constants + 44)
290
291 /* at this point, function has completed dequantization + dezigzag +
292 * partial transposition; now do the idct itself */
293 #define I(K) (output_data + K * 8)
294 #define J(K) (output_data + ((K - 4) * 8) + 4)
295
296 RowIDCT();
297 Transpose();
298
299 #undef I
300 #undef J
301 #define I(K) (output_data + (K * 8) + 32)
302 #define J(K) (output_data + ((K - 4) * 8) + 36)
303
304 RowIDCT();
305 Transpose();
306
307 #undef I
308 #undef J
309 #define I(K) (output_data + K * 8)
310 #define J(K) (output_data + K * 8)
311
312 ColumnIDCT();
313
314 #undef I
315 #undef J
316 #define I(K) (output_data + (K * 8) + 4)
317 #define J(K) (output_data + (K * 8) + 4)
318
319 ColumnIDCT();
320
321 #undef I
322 #undef J
323
324 }
325
ff_vp3_idct_put_mmx(uint8_t * dest,int line_size,DCTELEM * block)326 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
327 {
328 ff_vp3_idct_mmx(block);
329 put_signed_pixels_clamped_mmx(block, dest, line_size);
330 }
331
ff_vp3_idct_add_mmx(uint8_t * dest,int line_size,DCTELEM * block)332 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
333 {
334 ff_vp3_idct_mmx(block);
335 add_pixels_clamped_mmx(block, dest, line_size);
336 }
337