1 /*
2  * Copyright (C) 2004 the ffmpeg project
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 /**
22  * @file vp3dsp_mmx.c
23  * MMX-optimized functions cribbed from the original VP3 source code.
24  */
25 
26 #include "dsputil.h"
27 #include "mmx.h"
28 
29 #define IdctAdjustBeforeShift 8
30 
31 /* (12 * 4) 2-byte memory locations ( = 96 bytes total)
32  * idct_constants[0..15] = Mask table (M(I))
33  * idct_constants[16..43] = Cosine table (C(I))
34  * idct_constants[44..47] = 8
35  */
36 static uint16_t idct_constants[(4 + 7 + 1) * 4];
37 static const uint16_t idct_cosine_table[7] = {
38     64277, 60547, 54491, 46341, 36410, 25080, 12785
39 };
40 
41 #define r0 mm0
42 #define r1 mm1
43 #define r2 mm2
44 #define r3 mm3
45 #define r4 mm4
46 #define r5 mm5
47 #define r6 mm6
48 #define r7 mm7
49 
50 /* from original comments: The Macro does IDct on 4 1-D Dcts */
51 #define BeginIDCT() { \
52     movq_m2r(*I(3), r2); \
53     movq_m2r(*C(3), r6); \
54     movq_r2r(r2, r4); \
55     movq_m2r(*J(5), r7); \
56     pmulhw_r2r(r6, r4);       /* r4 = c3*i3 - i3 */ \
57     movq_m2r(*C(5), r1); \
58     pmulhw_r2r(r7, r6);       /* r6 = c3*i5 - i5 */ \
59     movq_r2r(r1, r5); \
60     pmulhw_r2r(r2, r1);       /* r1 = c5*i3 - i3 */ \
61     movq_m2r(*I(1), r3); \
62     pmulhw_r2r(r7, r5);       /* r5 = c5*i5 - i5 */ \
63     movq_m2r(*C(1), r0);      /* (all registers are in use) */ \
64     paddw_r2r(r2, r4);        /* r4 = c3*i3 */ \
65     paddw_r2r(r7, r6);        /* r6 = c3*i5 */ \
66     paddw_r2r(r1, r2);        /* r2 = c5*i3 */ \
67     movq_m2r(*J(7), r1); \
68     paddw_r2r(r5, r7);        /* r7 = c5*i5 */ \
69     movq_r2r(r0, r5);         /* r5 = c1 */ \
70     pmulhw_r2r(r3, r0);       /* r0 = c1*i1 - i1 */ \
71     paddsw_r2r(r7, r4);       /* r4 = C = c3*i3 + c5*i5 */ \
72     pmulhw_r2r(r1, r5);       /* r5 = c1*i7 - i7 */ \
73     movq_m2r(*C(7), r7); \
74     psubsw_r2r(r2, r6);       /* r6 = D = c3*i5 - c5*i3 */ \
75     paddw_r2r(r3, r0);        /* r0 = c1*i1 */ \
76     pmulhw_r2r(r7, r3);       /* r3 = c7*i1 */ \
77     movq_m2r(*I(2), r2); \
78     pmulhw_r2r(r1, r7);       /* r7 = c7*i7 */ \
79     paddw_r2r(r1, r5);        /* r5 = c1*i7 */ \
80     movq_r2r(r2, r1);         /* r1 = i2 */ \
81     pmulhw_m2r(*C(2), r2);    /* r2 = c2*i2 - i2 */ \
82     psubsw_r2r(r5, r3);       /* r3 = B = c7*i1 - c1*i7 */ \
83     movq_m2r(*J(6), r5); \
84     paddsw_r2r(r7, r0);       /* r0 = A = c1*i1 + c7*i7 */ \
85     movq_r2r(r5, r7);         /* r7 = i6 */ \
86     psubsw_r2r(r4, r0);       /* r0 = A - C */ \
87     pmulhw_m2r(*C(2), r5);    /* r5 = c2*i6 - i6 */ \
88     paddw_r2r(r1, r2);        /* r2 = c2*i2 */ \
89     pmulhw_m2r(*C(6), r1);    /* r1 = c6*i2 */ \
90     paddsw_r2r(r4, r4);       /* r4 = C + C */ \
91     paddsw_r2r(r0, r4);       /* r4 = C. = A + C */ \
92     psubsw_r2r(r6, r3);       /* r3 = B - D */ \
93     paddw_r2r(r7, r5);        /* r5 = c2*i6 */ \
94     paddsw_r2r(r6, r6);       /* r6 = D + D */ \
95     pmulhw_m2r(*C(6), r7);    /* r7 = c6*i6 */ \
96     paddsw_r2r(r3, r6);       /* r6 = D. = B + D */ \
97     movq_r2m(r4, *I(1));      /* save C. at I(1) */ \
98     psubsw_r2r(r5, r1);       /* r1 = H = c6*i2 - c2*i6 */ \
99     movq_m2r(*C(4), r4); \
100     movq_r2r(r3, r5);         /* r5 = B - D */ \
101     pmulhw_r2r(r4, r3);       /* r3 = (c4 - 1) * (B - D) */ \
102     paddsw_r2r(r2, r7);       /* r7 = G = c6*i6 + c2*i2 */ \
103     movq_r2m(r6, *I(2));      /* save D. at I(2) */ \
104     movq_r2r(r0, r2);         /* r2 = A - C */ \
105     movq_m2r(*I(0), r6); \
106     pmulhw_r2r(r4, r0);       /* r0 = (c4 - 1) * (A - C) */ \
107     paddw_r2r(r3, r5);        /* r5 = B. = c4 * (B - D) */ \
108     movq_m2r(*J(4), r3); \
109     psubsw_r2r(r1, r5);       /* r5 = B.. = B. - H */ \
110     paddw_r2r(r0, r2);        /* r0 = A. = c4 * (A - C) */ \
111     psubsw_r2r(r3, r6);       /* r6 = i0 - i4 */ \
112     movq_r2r(r6, r0); \
113     pmulhw_r2r(r4, r6);       /* r6 = (c4 - 1) * (i0 - i4) */ \
114     paddsw_r2r(r3, r3);       /* r3 = i4 + i4 */ \
115     paddsw_r2r(r1, r1);       /* r1 = H + H */ \
116     paddsw_r2r(r0, r3);       /* r3 = i0 + i4 */ \
117     paddsw_r2r(r5, r1);       /* r1 = H. = B + H */ \
118     pmulhw_r2r(r3, r4);       /* r4 = (c4 - 1) * (i0 + i4) */ \
119     paddsw_r2r(r0, r6);       /* r6 = F = c4 * (i0 - i4) */ \
120     psubsw_r2r(r2, r6);       /* r6 = F. = F - A. */ \
121     paddsw_r2r(r2, r2);       /* r2 = A. + A. */ \
122     movq_m2r(*I(1), r0);      /* r0 = C. */ \
123     paddsw_r2r(r6, r2);       /* r2 = A.. = F + A. */ \
124     paddw_r2r(r3, r4);        /* r4 = E = c4 * (i0 + i4) */ \
125     psubsw_r2r(r1, r2);       /* r2 = R2 = A.. - H. */ \
126 }
127 
128 /* RowIDCT gets ready to transpose */
129 #define RowIDCT() { \
130     \
131     BeginIDCT(); \
132     \
133     movq_m2r(*I(2), r3);   /* r3 = D. */ \
134     psubsw_r2r(r7, r4);    /* r4 = E. = E - G */ \
135     paddsw_r2r(r1, r1);    /* r1 = H. + H. */ \
136     paddsw_r2r(r7, r7);    /* r7 = G + G */ \
137     paddsw_r2r(r2, r1);    /* r1 = R1 = A.. + H. */ \
138     paddsw_r2r(r4, r7);    /* r7 = G. = E + G */ \
139     psubsw_r2r(r3, r4);    /* r4 = R4 = E. - D. */ \
140     paddsw_r2r(r3, r3); \
141     psubsw_r2r(r5, r6);    /* r6 = R6 = F. - B.. */ \
142     paddsw_r2r(r5, r5); \
143     paddsw_r2r(r4, r3);    /* r3 = R3 = E. + D. */ \
144     paddsw_r2r(r6, r5);    /* r5 = R5 = F. + B.. */ \
145     psubsw_r2r(r0, r7);    /* r7 = R7 = G. - C. */ \
146     paddsw_r2r(r0, r0); \
147     movq_r2m(r1, *I(1));   /* save R1 */ \
148     paddsw_r2r(r7, r0);    /* r0 = R0 = G. + C. */ \
149 }
150 
151 /* Column IDCT normalizes and stores final results */
152 #define ColumnIDCT() { \
153     \
154     BeginIDCT(); \
155     \
156     paddsw_m2r(*Eight, r2);    /* adjust R2 (and R1) for shift */ \
157     paddsw_r2r(r1, r1);        /* r1 = H. + H. */ \
158     paddsw_r2r(r2, r1);        /* r1 = R1 = A.. + H. */ \
159     psraw_i2r(4, r2);          /* r2 = NR2 */ \
160     psubsw_r2r(r7, r4);        /* r4 = E. = E - G */ \
161     psraw_i2r(4, r1);          /* r1 = NR1 */ \
162     movq_m2r(*I(2), r3);       /* r3 = D. */ \
163     paddsw_r2r(r7, r7);        /* r7 = G + G */ \
164     movq_r2m(r2, *I(2));       /* store NR2 at I2 */ \
165     paddsw_r2r(r4, r7);        /* r7 = G. = E + G */ \
166     movq_r2m(r1, *I(1));       /* store NR1 at I1 */ \
167     psubsw_r2r(r3, r4);        /* r4 = R4 = E. - D. */ \
168     paddsw_m2r(*Eight, r4);    /* adjust R4 (and R3) for shift */ \
169     paddsw_r2r(r3, r3);        /* r3 = D. + D. */ \
170     paddsw_r2r(r4, r3);        /* r3 = R3 = E. + D. */ \
171     psraw_i2r(4, r4);          /* r4 = NR4 */ \
172     psubsw_r2r(r5, r6);        /* r6 = R6 = F. - B.. */ \
173     psraw_i2r(4, r3);          /* r3 = NR3 */ \
174     paddsw_m2r(*Eight, r6);    /* adjust R6 (and R5) for shift */ \
175     paddsw_r2r(r5, r5);        /* r5 = B.. + B.. */ \
176     paddsw_r2r(r6, r5);        /* r5 = R5 = F. + B.. */ \
177     psraw_i2r(4, r6);          /* r6 = NR6 */ \
178     movq_r2m(r4, *J(4));       /* store NR4 at J4 */ \
179     psraw_i2r(4, r5);          /* r5 = NR5 */ \
180     movq_r2m(r3, *I(3));       /* store NR3 at I3 */ \
181     psubsw_r2r(r0, r7);        /* r7 = R7 = G. - C. */ \
182     paddsw_m2r(*Eight, r7);    /* adjust R7 (and R0) for shift */ \
183     paddsw_r2r(r0, r0);        /* r0 = C. + C. */ \
184     paddsw_r2r(r7, r0);        /* r0 = R0 = G. + C. */ \
185     psraw_i2r(4, r7);          /* r7 = NR7 */ \
186     movq_r2m(r6, *J(6));       /* store NR6 at J6 */ \
187     psraw_i2r(4, r0);          /* r0 = NR0 */ \
188     movq_r2m(r5, *J(5));       /* store NR5 at J5 */ \
189     movq_r2m(r7, *J(7));       /* store NR7 at J7 */ \
190     movq_r2m(r0, *I(0));       /* store NR0 at I0 */ \
191 }
192 
193 /* Following macro does two 4x4 transposes in place.
194 
195   At entry (we assume):
196 
197     r0 = a3 a2 a1 a0
198     I(1) = b3 b2 b1 b0
199     r2 = c3 c2 c1 c0
200     r3 = d3 d2 d1 d0
201 
202     r4 = e3 e2 e1 e0
203     r5 = f3 f2 f1 f0
204     r6 = g3 g2 g1 g0
205     r7 = h3 h2 h1 h0
206 
207   At exit, we have:
208 
209     I(0) = d0 c0 b0 a0
210     I(1) = d1 c1 b1 a1
211     I(2) = d2 c2 b2 a2
212     I(3) = d3 c3 b3 a3
213 
214     J(4) = h0 g0 f0 e0
215     J(5) = h1 g1 f1 e1
216     J(6) = h2 g2 f2 e2
217     J(7) = h3 g3 f3 e3
218 
219    I(0) I(1) I(2) I(3)  is the transpose of r0 I(1) r2 r3.
220    J(4) J(5) J(6) J(7)  is the transpose of r4 r5 r6 r7.
221 
222    Since r1 is free at entry, we calculate the Js first. */
223 
224 #define Transpose() { \
225     movq_r2r(r4, r1);         /* r1 = e3 e2 e1 e0 */ \
226     punpcklwd_r2r(r5, r4);    /* r4 = f1 e1 f0 e0 */ \
227     movq_r2m(r0, *I(0));      /* save a3 a2 a1 a0 */ \
228     punpckhwd_r2r(r5, r1);    /* r1 = f3 e3 f2 e2 */ \
229     movq_r2r(r6, r0);         /* r0 = g3 g2 g1 g0 */ \
230     punpcklwd_r2r(r7, r6);    /* r6 = h1 g1 h0 g0 */ \
231     movq_r2r(r4, r5);         /* r5 = f1 e1 f0 e0 */ \
232     punpckldq_r2r(r6, r4);    /* r4 = h0 g0 f0 e0 = R4 */ \
233     punpckhdq_r2r(r6, r5);    /* r5 = h1 g1 f1 e1 = R5 */ \
234     movq_r2r(r1, r6);         /* r6 = f3 e3 f2 e2 */ \
235     movq_r2m(r4, *J(4)); \
236     punpckhwd_r2r(r7, r0);    /* r0 = h3 g3 h2 g2 */ \
237     movq_r2m(r5, *J(5)); \
238     punpckhdq_r2r(r0, r6);    /* r6 = h3 g3 f3 e3 = R7 */ \
239     movq_m2r(*I(0), r4);      /* r4 = a3 a2 a1 a0 */ \
240     punpckldq_r2r(r0, r1);    /* r1 = h2 g2 f2 e2 = R6 */ \
241     movq_m2r(*I(1), r5);      /* r5 = b3 b2 b1 b0 */ \
242     movq_r2r(r4, r0);         /* r0 = a3 a2 a1 a0 */ \
243     movq_r2m(r6, *J(7)); \
244     punpcklwd_r2r(r5, r0);    /* r0 = b1 a1 b0 a0 */ \
245     movq_r2m(r1, *J(6)); \
246     punpckhwd_r2r(r5, r4);    /* r4 = b3 a3 b2 a2 */ \
247     movq_r2r(r2, r5);         /* r5 = c3 c2 c1 c0 */ \
248     punpcklwd_r2r(r3, r2);    /* r2 = d1 c1 d0 c0 */ \
249     movq_r2r(r0, r1);         /* r1 = b1 a1 b0 a0 */ \
250     punpckldq_r2r(r2, r0);    /* r0 = d0 c0 b0 a0 = R0 */ \
251     punpckhdq_r2r(r2, r1);    /* r1 = d1 c1 b1 a1 = R1 */ \
252     movq_r2r(r4, r2);         /* r2 = b3 a3 b2 a2 */ \
253     movq_r2m(r0, *I(0)); \
254     punpckhwd_r2r(r3, r5);    /* r5 = d3 c3 d2 c2 */ \
255     movq_r2m(r1, *I(1)); \
256     punpckhdq_r2r(r5, r4);    /* r4 = d3 c3 b3 a3 = R3 */ \
257     punpckldq_r2r(r5, r2);    /* r2 = d2 c2 b2 a2 = R2 */ \
258     movq_r2m(r4, *I(3)); \
259     movq_r2m(r2, *I(2)); \
260 }
261 
ff_vp3_dsp_init_mmx(void)262 void ff_vp3_dsp_init_mmx(void)
263 {
264     int j = 16;
265     uint16_t *p;
266 
267     j = 1;
268     do {
269         p = idct_constants + ((j + 3) << 2);
270         p[0] = p[1] = p[2] = p[3] = idct_cosine_table[j - 1];
271     } while (++j <= 7);
272 
273     idct_constants[44] = idct_constants[45] =
274     idct_constants[46] = idct_constants[47] = IdctAdjustBeforeShift;
275 }
276 
ff_vp3_idct_mmx(int16_t * output_data)277 void ff_vp3_idct_mmx(int16_t *output_data)
278 {
279     /* eax = quantized input
280      * ebx = dequantizer matrix
281      * ecx = IDCT constants
282      *  M(I) = ecx + MaskOffset(0) + I * 8
283      *  C(I) = ecx + CosineOffset(32) + (I-1) * 8
284      * edx = output
285      * r0..r7 = mm0..mm7
286      */
287 
288 #define C(x) (idct_constants + 16 + (x - 1) * 4)
289 #define Eight (idct_constants + 44)
290 
291     /* at this point, function has completed dequantization + dezigzag +
292      * partial transposition; now do the idct itself */
293 #define I(K) (output_data + K * 8)
294 #define J(K) (output_data + ((K - 4) * 8) + 4)
295 
296     RowIDCT();
297     Transpose();
298 
299 #undef I
300 #undef J
301 #define I(K) (output_data + (K * 8) + 32)
302 #define J(K) (output_data + ((K - 4) * 8) + 36)
303 
304     RowIDCT();
305     Transpose();
306 
307 #undef I
308 #undef J
309 #define I(K) (output_data + K * 8)
310 #define J(K) (output_data + K * 8)
311 
312     ColumnIDCT();
313 
314 #undef I
315 #undef J
316 #define I(K) (output_data + (K * 8) + 4)
317 #define J(K) (output_data + (K * 8) + 4)
318 
319     ColumnIDCT();
320 
321 #undef I
322 #undef J
323 
324 }
325 
ff_vp3_idct_put_mmx(uint8_t * dest,int line_size,DCTELEM * block)326 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
327 {
328     ff_vp3_idct_mmx(block);
329     put_signed_pixels_clamped_mmx(block, dest, line_size);
330 }
331 
ff_vp3_idct_add_mmx(uint8_t * dest,int line_size,DCTELEM * block)332 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
333 {
334     ff_vp3_idct_mmx(block);
335     add_pixels_clamped_mmx(block, dest, line_size);
336 }
337