1 /*
2  * idct_mmx.c
3  * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
4  * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
5  *
6  * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
7  * See http://libmpeg2.sourceforge.net/ for updates.
8  *
9  * mpeg2dec is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * mpeg2dec is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22  */
23 
24 #include "config.h"
25 
26 #if defined(ARCH_X86) || defined(ARCH_X86_64)
27 
28 #include <inttypes.h>
29 
30 #include "mpeg2.h"
31 #include "attributes.h"
32 #include "mpeg2_internal.h"
33 #include "mmx.h"
34 
/* Fixed-point precision: rows are computed with 15 fractional bits,
 * columns with 6 (the column pass works on already-scaled row output). */
#define ROW_SHIFT 15
#define COL_SHIFT 6

/* Convert a floating-point bias into a ROW_SHIFT fixed-point integer.
 * NOTE(review): this macro shadows C99 round() from <math.h>; harmless
 * while <math.h> is not included here, but a rename would be safer. */
#define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT)))
/* Replicate the bias across 2 (MMX) or 4 (SSE2) 32-bit lanes. */
#define rounder(bias) {round (bias), round (bias)}
#define rounder_sse2(bias) {round (bias), round (bias), round (bias), round (bias)}
41 
42 
#if 0
/* C row IDCT - it is just here to document the MMXEXT and MMX versions */
static inline void idct_row (int16_t * row, int offset,
			     int16_t * table, int32_t * rounder)
{
    int C1, C2, C3, C4, C5, C6, C7;
    int a0, a1, a2, a3, b0, b1, b2, b3;

    row += offset;

    /* only table[1..7] are read; table[0] is unused by this reference */
    C1 = table[1];
    C2 = table[2];
    C3 = table[3];
    C4 = table[4];
    C5 = table[5];
    C6 = table[6];
    C7 = table[7];

    /* even part: built from the even-indexed inputs, rounder folded in */
    a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder;
    a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder;
    a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + *rounder;
    a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder;

    /* odd part: built from the odd-indexed inputs */
    b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
    b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
    b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
    b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];

    /* butterfly: outputs 0..3 are a+b, outputs 7..4 the mirrored a-b */
    row[0] = (a0 + b0) >> ROW_SHIFT;
    row[1] = (a1 + b1) >> ROW_SHIFT;
    row[2] = (a2 + b2) >> ROW_SHIFT;
    row[3] = (a3 + b3) >> ROW_SHIFT;
    row[4] = (a3 - b3) >> ROW_SHIFT;
    row[5] = (a2 - b2) >> ROW_SHIFT;
    row[6] = (a1 - b1) >> ROW_SHIFT;
    row[7] = (a0 - b0) >> ROW_SHIFT;
}
#endif
81 
82 
/* SSE2 row IDCT */
/* Coefficient layout for SSE2_IDCT_2ROW: the first four quads are the
 * even-part (a0..a3) multipliers, the last four the odd-part (b0..b3),
 * ordered to match the pshufd-broadcast input pairs used below. */
#define sse2_table(c1,c2,c3,c4,c5,c6,c7) {  c4,  c2,  c4,  c6,   \
					    c4, -c6,  c4, -c2,   \
					    c4,  c6, -c4, -c2,   \
					   -c4,  c2,  c4, -c6,   \
					    c1,  c3,  c3, -c7,   \
					    c5, -c1,  c7, -c5,   \
					    c5,  c7, -c1, -c5,   \
					    c7,  c3,  c3, -c1 }
92 
/* Transform two 8-sample rows (held in xmm registers row1/row2) in one
 * pass; clobbers xmm1-xmm3 and xmm5-xmm7.  Statements for the two rows
 * are interleaved ("1:" / "2:" in the comments) for throughput. */
#define SSE2_IDCT_2ROW(table, row1, row2, round1, round2) do {               \
    /* no scheduling: trust in out of order execution */                     \
    /* based on Intel AP-945 */                                              \
    /* (http://cache-www.intel.com/cd/00/00/01/76/17680_w_idct.pdf) */       \
                                                                             \
    /* input */                      /* 1: row1= x7 x5 x3 x1  x6 x4 x2 x0 */ \
    pshufd_r2r   (row1, xmm1, 0);    /* 1: xmm1= x2 x0 x2 x0  x2 x0 x2 x0 */ \
    pmaddwd_m2r  (table[0], xmm1);   /* 1: xmm1= x2*C + x0*C ...          */ \
    pshufd_r2r   (row1, xmm3, 0xaa); /* 1: xmm3= x3 x1 x3 x1  x3 x1 x3 x1 */ \
    pmaddwd_m2r  (table[2*8], xmm3); /* 1: xmm3= x3*C + x1*C ...          */ \
    pshufd_r2r   (row1, xmm2, 0x55); /* 1: xmm2= x6 x4 x6 x4  x6 x4 x6 x4 */ \
    pshufd_r2r   (row1, row1, 0xff); /* 1: row1= x7 x5 x7 x5  x7 x5 x7 x5 */ \
    pmaddwd_m2r  (table[1*8], xmm2); /* 1: xmm2= x6*C + x4*C ...          */ \
    paddd_m2r    (round1, xmm1);     /* 1: xmm1= x2*C + x0*C + round ...  */ \
    pmaddwd_m2r  (table[3*8], row1); /* 1: row1= x7*C + x5*C ...          */ \
    pshufd_r2r   (row2, xmm5, 0);    /*    2:                             */ \
    pshufd_r2r   (row2, xmm6, 0x55); /*    2:                             */ \
    pmaddwd_m2r  (table[0], xmm5);   /*    2:                             */ \
    paddd_r2r    (xmm2, xmm1);       /* 1: xmm1= a[]                      */ \
    movdqa_r2r   (xmm1, xmm2);       /* 1: xmm2= a[]                      */ \
    pshufd_r2r   (row2, xmm7, 0xaa); /*    2:                             */ \
    pmaddwd_m2r  (table[1*8], xmm6); /*    2:                             */ \
    paddd_r2r    (xmm3, row1);       /* 1: row1= b[]= 7*C+5*C+3*C+1*C ... */ \
    pshufd_r2r   (row2, row2, 0xff); /*    2:                             */ \
    psubd_r2r    (row1, xmm2);       /* 1: xmm2= a[] - b[]                */ \
    pmaddwd_m2r  (table[2*8], xmm7); /*    2:                             */ \
    paddd_r2r    (xmm1, row1);       /* 1: row1= a[] + b[]                */ \
    psrad_i2r    (ROW_SHIFT, xmm2);  /* 1: xmm2= result 4...7             */ \
    paddd_m2r    (round2, xmm5);     /*    2:                             */ \
    pmaddwd_m2r  (table[3*8], row2); /*    2:                             */ \
    paddd_r2r    (xmm6, xmm5);       /*    2:                             */ \
    movdqa_r2r   (xmm5, xmm6);       /*    2:                             */ \
    psrad_i2r    (ROW_SHIFT, row1);  /* 1: row1= result 0...3             */ \
    pshufd_r2r   (xmm2, xmm2, 0x1b); /* 1: [0 1 2 3] -> [3 2 1 0]         */ \
    packssdw_r2r (xmm2, row1);       /* 1: row1= result[]                 */ \
    paddd_r2r    (xmm7, row2);       /*    2:                             */ \
    psubd_r2r    (row2, xmm6);       /*    2:                             */ \
    paddd_r2r    (xmm5, row2);       /*    2:                             */ \
    psrad_i2r    (ROW_SHIFT, xmm6);  /*    2:                             */ \
    psrad_i2r    (ROW_SHIFT, row2);  /*    2:                             */ \
    pshufd_r2r   (xmm6, xmm6, 0x1b); /*    2:                             */ \
    packssdw_r2r (xmm6, row2);       /*    2:                             */ \
} while (0)
136 
137 
/* MMXEXT row IDCT */

/* Coefficient layout for the MMXEXT row pass: quads 0-3 feed the a0/a1
 * and b0/b1 computations, quads 4-7 the a2/a3 and b2/b3 ones, matched
 * to the pshufw-swapped input order used in mmxext_row(). */
#define mmxext_table(c1,c2,c3,c4,c5,c6,c7)	{  c4,  c2, -c4, -c2,	\
						   c4,  c6,  c4,  c6,	\
						   c1,  c3, -c1, -c5,	\
						   c5,  c7,  c3, -c7,	\
						   c4, -c6,  c4, -c6,	\
						  -c4,  c2,  c4, -c2,	\
						   c5, -c1,  c3, -c1,	\
						   c7,  c3,  c7, -c5 }
148 
/* Load one 8-sample row at row+offset and start the even-part multiply;
 * leaves mm0/mm2 = even inputs, mm5/mm6 = odd inputs, mm3 = first
 * pmaddwd partial result, mm4 = next coefficient quad, for mmxext_row(). */
static inline void mmxext_row_head (int16_t * const row, const int offset,
				    const int16_t * const table)
{
    movq_m2r (*(row+offset), mm2);	/* mm2 = x6 x4 x2 x0 */

    movq_m2r (*(row+offset+4), mm5);	/* mm5 = x7 x5 x3 x1 */
    movq_r2r (mm2, mm0);		/* mm0 = x6 x4 x2 x0 */

    movq_m2r (*table, mm3);		/* mm3 = -C2 -C4 C2 C4 */
    movq_r2r (mm5, mm6);		/* mm6 = x7 x5 x3 x1 */

    movq_m2r (*(table+4), mm4);		/* mm4 = C6 C4 C6 C4 */
    pmaddwd_r2r (mm0, mm3);		/* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */

    pshufw_r2r (mm2, mm2, 0x4e);	/* mm2 = x2 x0 x6 x4 */
}
165 
/* Finish the row started by mmxext_row_head/_mid: computes the even
 * (a0..a3) and odd (b0..b3) parts, adds the rounder and applies the
 * butterfly.  Leaves mm1 = y1 y0 and mm3 = y6 y7 already shifted, and
 * mm0/mm4 = a +/- b pairs still to be shifted by the tail. */
static inline void mmxext_row (const int16_t * const table,
			       const int32_t * const rounder)
{
    movq_m2r (*(table+8), mm1);		/* mm1 = -C5 -C1 C3 C1 */
    pmaddwd_r2r (mm2, mm4);		/* mm4 = C4*x0+C6*x2 C4*x4+C6*x6 */

    pmaddwd_m2r (*(table+16), mm0);	/* mm0 = C4*x4-C6*x6 C4*x0-C6*x2 */
    pshufw_r2r (mm6, mm6, 0x4e);	/* mm6 = x3 x1 x7 x5 */

    movq_m2r (*(table+12), mm7);	/* mm7 = -C7 C3 C7 C5 */
    pmaddwd_r2r (mm5, mm1);		/* mm1 = -C1*x5-C5*x7 C1*x1+C3*x3 */

    paddd_m2r (*rounder, mm3);		/* mm3 += rounder */
    pmaddwd_r2r (mm6, mm7);		/* mm7 = C3*x1-C7*x3 C5*x5+C7*x7 */

    pmaddwd_m2r (*(table+20), mm2);	/* mm2 = C4*x0-C2*x2 -C4*x4+C2*x6 */
    paddd_r2r (mm4, mm3);		/* mm3 = a1 a0 + rounder */

    pmaddwd_m2r (*(table+24), mm5);	/* mm5 = C3*x5-C1*x7 C5*x1-C1*x3 */
    movq_r2r (mm3, mm4);		/* mm4 = a1 a0 + rounder */

    pmaddwd_m2r (*(table+28), mm6);	/* mm6 = C7*x1-C5*x3 C7*x5+C3*x7 */
    paddd_r2r (mm7, mm1);		/* mm1 = b1 b0 */

    paddd_m2r (*rounder, mm0);		/* mm0 += rounder */
    psubd_r2r (mm1, mm3);		/* mm3 = a1-b1 a0-b0 + rounder */

    psrad_i2r (ROW_SHIFT, mm3);		/* mm3 = y6 y7 */
    paddd_r2r (mm4, mm1);		/* mm1 = a1+b1 a0+b0 + rounder */

    paddd_r2r (mm2, mm0);		/* mm0 = a3 a2 + rounder */
    psrad_i2r (ROW_SHIFT, mm1);		/* mm1 = y1 y0 */

    paddd_r2r (mm6, mm5);		/* mm5 = b3 b2 */
    movq_r2r (mm0, mm4);		/* mm4 = a3 a2 + rounder */

    paddd_r2r (mm5, mm0);		/* mm0 = a3+b3 a2+b2 + rounder */
    psubd_r2r (mm5, mm4);		/* mm4 = a3-b3 a2-b2 + rounder */
}
205 
/* Shift and pack the results left by mmxext_row() and store the eight
 * outputs at row+store; pshufw fixes the y4..y7 word order in one op. */
static inline void mmxext_row_tail (int16_t * const row, const int store)
{
    psrad_i2r (ROW_SHIFT, mm0);		/* mm0 = y3 y2 */

    psrad_i2r (ROW_SHIFT, mm4);		/* mm4 = y4 y5 */

    packssdw_r2r (mm0, mm1);		/* mm1 = y3 y2 y1 y0 */

    packssdw_r2r (mm3, mm4);		/* mm4 = y6 y7 y4 y5 */

    movq_r2m (mm1, *(row+store));	/* save y3 y2 y1 y0 */
    pshufw_r2r (mm4, mm4, 0xb1);	/* mm4 = y7 y6 y5 y4 */

    /* slot */

    movq_r2m (mm4, *(row+store+4));	/* save y7 y6 y5 y4 */
}
223 
/* Software-pipelined fusion of mmxext_row_tail and mmxext_row_head:
 * stores the previous row's results at row+store while loading the next
 * row from row+offset and starting its even-part multiply. */
static inline void mmxext_row_mid (int16_t * const row, const int store,
				   const int offset,
				   const int16_t * const table)
{
    movq_m2r (*(row+offset), mm2);	/* mm2 = x6 x4 x2 x0 */
    psrad_i2r (ROW_SHIFT, mm0);		/* mm0 = y3 y2 */

    movq_m2r (*(row+offset+4), mm5);	/* mm5 = x7 x5 x3 x1 */
    psrad_i2r (ROW_SHIFT, mm4);		/* mm4 = y4 y5 */

    packssdw_r2r (mm0, mm1);		/* mm1 = y3 y2 y1 y0 */
    movq_r2r (mm5, mm6);		/* mm6 = x7 x5 x3 x1 */

    packssdw_r2r (mm3, mm4);		/* mm4 = y6 y7 y4 y5 */
    movq_r2r (mm2, mm0);		/* mm0 = x6 x4 x2 x0 */

    movq_r2m (mm1, *(row+store));	/* save y3 y2 y1 y0 */
    pshufw_r2r (mm4, mm4, 0xb1);	/* mm4 = y7 y6 y5 y4 */

    movq_m2r (*table, mm3);		/* mm3 = -C2 -C4 C2 C4 */
    movq_r2m (mm4, *(row+store+4));	/* save y7 y6 y5 y4 */

    pmaddwd_r2r (mm0, mm3);		/* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */

    movq_m2r (*(table+4), mm4);		/* mm4 = C6 C4 C6 C4 */
    pshufw_r2r (mm2, mm2, 0x4e);	/* mm2 = x2 x0 x6 x4 */
}
251 
252 
/* MMX row IDCT */

/* Coefficient layout for the plain-MMX row pass; same math as
 * mmxext_table but ordered for punpckldq/punpckhdq input shuffles
 * instead of pshufw (which plain MMX lacks). */
#define mmx_table(c1,c2,c3,c4,c5,c6,c7)	{  c4,  c2,  c4,  c6,	\
					   c4,  c6, -c4, -c2,	\
					   c1,  c3,  c3, -c7,	\
					   c5,  c7, -c1, -c5,	\
					   c4, -c6,  c4, -c2,	\
					  -c4,  c2,  c4, -c6,	\
					   c5, -c1,  c7, -c5,	\
					   c7,  c3,  c3, -c1 }
263 
/* Load one 8-sample row at row+offset and start the even-part multiply
 * (plain-MMX variant: uses punpck{l,h}dq instead of pshufw to duplicate
 * the input pairs).  Leaves partial state in mm0-mm6 for mmx_row(). */
static inline void mmx_row_head (int16_t * const row, const int offset,
				 const int16_t * const table)
{
    movq_m2r (*(row+offset), mm2);	/* mm2 = x6 x4 x2 x0 */

    movq_m2r (*(row+offset+4), mm5);	/* mm5 = x7 x5 x3 x1 */
    movq_r2r (mm2, mm0);		/* mm0 = x6 x4 x2 x0 */

    movq_m2r (*table, mm3);		/* mm3 = C6 C4 C2 C4 */
    movq_r2r (mm5, mm6);		/* mm6 = x7 x5 x3 x1 */

    punpckldq_r2r (mm0, mm0);		/* mm0 = x2 x0 x2 x0 */

    movq_m2r (*(table+4), mm4);		/* mm4 = -C2 -C4 C6 C4 */
    pmaddwd_r2r (mm0, mm3);		/* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */

    movq_m2r (*(table+8), mm1);		/* mm1 = -C7 C3 C3 C1 */
    punpckhdq_r2r (mm2, mm2);		/* mm2 = x6 x4 x6 x4 */
}
283 
/* Finish the row started by mmx_row_head/_mid: even (a0..a3) and odd
 * (b0..b3) parts plus rounder and butterfly.  Leaves mm1 = y1 y0 and
 * mm3 = y6 y7 shifted, and mm0/mm7 = a +/- b pairs for the tail
 * (note: mm7, not mm4 as in the MMXEXT variant). */
static inline void mmx_row (const int16_t * const table,
			    const int32_t * const rounder)
{
    pmaddwd_r2r (mm2, mm4);		/* mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 */
    punpckldq_r2r (mm5, mm5);		/* mm5 = x3 x1 x3 x1 */

    pmaddwd_m2r (*(table+16), mm0);	/* mm0 = C4*x0-C2*x2 C4*x0-C6*x2 */
    punpckhdq_r2r (mm6, mm6);		/* mm6 = x7 x5 x7 x5 */

    movq_m2r (*(table+12), mm7);	/* mm7 = -C5 -C1 C7 C5 */
    pmaddwd_r2r (mm5, mm1);		/* mm1 = C3*x1-C7*x3 C1*x1+C3*x3 */

    paddd_m2r (*rounder, mm3);		/* mm3 += rounder */
    pmaddwd_r2r (mm6, mm7);		/* mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 */

    pmaddwd_m2r (*(table+20), mm2);	/* mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 */
    paddd_r2r (mm4, mm3);		/* mm3 = a1 a0 + rounder */

    pmaddwd_m2r (*(table+24), mm5);	/* mm5 = C7*x1-C5*x3 C5*x1-C1*x3 */
    movq_r2r (mm3, mm4);		/* mm4 = a1 a0 + rounder */

    pmaddwd_m2r (*(table+28), mm6);	/* mm6 = C3*x5-C1*x7 C7*x5+C3*x7 */
    paddd_r2r (mm7, mm1);		/* mm1 = b1 b0 */

    paddd_m2r (*rounder, mm0);		/* mm0 += rounder */
    psubd_r2r (mm1, mm3);		/* mm3 = a1-b1 a0-b0 + rounder */

    psrad_i2r (ROW_SHIFT, mm3);		/* mm3 = y6 y7 */
    paddd_r2r (mm4, mm1);		/* mm1 = a1+b1 a0+b0 + rounder */

    paddd_r2r (mm2, mm0);		/* mm0 = a3 a2 + rounder */
    psrad_i2r (ROW_SHIFT, mm1);		/* mm1 = y1 y0 */

    paddd_r2r (mm6, mm5);		/* mm5 = b3 b2 */
    movq_r2r (mm0, mm7);		/* mm7 = a3 a2 + rounder */

    paddd_r2r (mm5, mm0);		/* mm0 = a3+b3 a2+b2 + rounder */
    psubd_r2r (mm5, mm7);		/* mm7 = a3-b3 a2-b2 + rounder */
}
323 
/* Shift, pack and store the eight row results at row+store.  Plain MMX
 * has no pshufw, so the y4..y7 word swap is done with shifts and por. */
static inline void mmx_row_tail (int16_t * const row, const int store)
{
    psrad_i2r (ROW_SHIFT, mm0);		/* mm0 = y3 y2 */

    psrad_i2r (ROW_SHIFT, mm7);		/* mm7 = y4 y5 */

    packssdw_r2r (mm0, mm1);		/* mm1 = y3 y2 y1 y0 */

    packssdw_r2r (mm3, mm7);		/* mm7 = y6 y7 y4 y5 */

    movq_r2m (mm1, *(row+store));	/* save y3 y2 y1 y0 */
    movq_r2r (mm7, mm4);		/* mm4 = y6 y7 y4 y5 */

    pslld_i2r (16, mm7);		/* mm7 = y7 0 y5 0 */

    psrld_i2r (16, mm4);		/* mm4 = 0 y6 0 y4 */

    por_r2r (mm4, mm7);			/* mm7 = y7 y6 y5 y4 */

    /* slot */

    movq_r2m (mm7, *(row+store+4));	/* save y7 y6 y5 y4 */
}
347 
/* Software-pipelined fusion of mmx_row_tail and mmx_row_head: stores
 * the previous row's results at row+store while loading the next row
 * from row+offset and starting its even-part multiply. */
static inline void mmx_row_mid (int16_t * const row, const int store,
				const int offset, const int16_t * const table)
{
    movq_m2r (*(row+offset), mm2);	/* mm2 = x6 x4 x2 x0 */
    psrad_i2r (ROW_SHIFT, mm0);		/* mm0 = y3 y2 */

    movq_m2r (*(row+offset+4), mm5);	/* mm5 = x7 x5 x3 x1 */
    psrad_i2r (ROW_SHIFT, mm7);		/* mm7 = y4 y5 */

    packssdw_r2r (mm0, mm1);		/* mm1 = y3 y2 y1 y0 */
    movq_r2r (mm5, mm6);		/* mm6 = x7 x5 x3 x1 */

    packssdw_r2r (mm3, mm7);		/* mm7 = y6 y7 y4 y5 */
    movq_r2r (mm2, mm0);		/* mm0 = x6 x4 x2 x0 */

    movq_r2m (mm1, *(row+store));	/* save y3 y2 y1 y0 */
    movq_r2r (mm7, mm1);		/* mm1 = y6 y7 y4 y5 */

    punpckldq_r2r (mm0, mm0);		/* mm0 = x2 x0 x2 x0 */
    psrld_i2r (16, mm7);		/* mm7 = 0 y6 0 y4 */

    movq_m2r (*table, mm3);		/* mm3 = C6 C4 C2 C4 */
    pslld_i2r (16, mm1);		/* mm1 = y7 0 y5 0 */

    movq_m2r (*(table+4), mm4);		/* mm4 = -C2 -C4 C6 C4 */
    por_r2r (mm1, mm7);			/* mm7 = y7 y6 y5 y4 */

    movq_m2r (*(table+8), mm1);		/* mm1 = -C7 C3 C3 C1 */
    punpckhdq_r2r (mm2, mm2);		/* mm2 = x6 x4 x6 x4 */

    movq_r2m (mm7, *(row+store+4));	/* save y7 y6 y5 y4 */
    pmaddwd_r2r (mm0, mm3);		/* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */
}
381 
382 
#if 0
/* C column IDCT - it is just here to document the MMXEXT and MMX versions */
/* NOTE: relies on the T1/T2/T3/C4 #defines that appear below this block;
 * fine while the block is compiled out. */
static inline void idct_col (int16_t * col, int offset)
{
/* multiplication - as implemented on mmx */
#define F(c,x) (((c) * (x)) >> 16)

/* saturation - it helps us handle torture test cases */
#define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x))

    int16_t x0, x1, x2, x3, x4, x5, x6, x7;
    int16_t y0, y1, y2, y3, y4, y5, y6, y7;
    int16_t a0, a1, a2, a3, b0, b1, b2, b3;
    int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12;

    col += offset;

    x0 = col[0*8];
    x1 = col[1*8];
    x2 = col[2*8];
    x3 = col[3*8];
    x4 = col[4*8];
    x5 = col[5*8];
    x6 = col[6*8];
    x7 = col[7*8];

    /* even part */
    u04 = S (x0 + x4);
    v04 = S (x0 - x4);
    u26 = S (F (T2, x6) + x2);
    v26 = S (F (T2, x2) - x6);

    a0 = S (u04 + u26);
    a1 = S (v04 + v26);
    a2 = S (v04 - v26);
    a3 = S (u04 - u26);

    /* odd part */
    u17 = S (F (T1, x7) + x1);
    v17 = S (F (T1, x1) - x7);
    u35 = S (F (T3, x5) + x3);
    v35 = S (F (T3, x3) - x5);

    b0 = S (u17 + u35);
    b3 = S (v17 - v35);
    u12 = S (u17 - u35);
    v12 = S (v17 + v35);
    u12 = S (2 * F (C4, u12));
    v12 = S (2 * F (C4, v12));
    b1 = S (u12 + v12);
    b2 = S (u12 - v12);

    /* butterfly and final shift */
    y0 = S (a0 + b0) >> COL_SHIFT;
    y1 = S (a1 + b1) >> COL_SHIFT;
    y2 = S (a2 + b2) >> COL_SHIFT;
    y3 = S (a3 + b3) >> COL_SHIFT;

    y4 = S (a3 - b3) >> COL_SHIFT;
    y5 = S (a2 - b2) >> COL_SHIFT;
    y6 = S (a1 - b1) >> COL_SHIFT;
    y7 = S (a0 - b0) >> COL_SHIFT;

    col[0*8] = y0;
    col[1*8] = y1;
    col[2*8] = y2;
    col[3*8] = y3;
    col[4*8] = y4;
    col[5*8] = y5;
    col[6*8] = y6;
    col[7*8] = y7;
}
#endif
453 
454 
/* 0.16 fixed-point trig constants for the column pass (see the C
 * reference above): Tn = tan(n*pi/16) * 65536, C4 = cos(pi/4) * 32768. */
#define T1 13036
#define T2 27146
#define T3 43790
#define C4 23170
459 
460 
461 /* SSE2 column IDCT */
/* SSE2 column IDCT: processes all eight columns at once (8 int16 lanes
 * per xmm register).  On x86-64 the whole block is kept in xmm8..xmm15;
 * on 32-bit x86 it works in memory at col[], temporarily reusing the
 * x3 and x5 rows (col+3*8, col+5*8) as scratch for b3 and b0. */
static inline void sse2_idct_col (int16_t * const col)
{
    /* Almost identical to mmxext version:  */
    /* just do both 4x8 columns in parallel */

    static const short t1_vector[] ATTR_ALIGN(16) = {T1,T1,T1,T1,T1,T1,T1,T1};
    static const short t2_vector[] ATTR_ALIGN(16) = {T2,T2,T2,T2,T2,T2,T2,T2};
    static const short t3_vector[] ATTR_ALIGN(16) = {T3,T3,T3,T3,T3,T3,T3,T3};
    static const short c4_vector[] ATTR_ALIGN(16) = {C4,C4,C4,C4,C4,C4,C4,C4};

#if defined(__x86_64__)

    /* INPUT: block in xmm8 ... xmm15 */

    movdqa_m2r (*t1_vector, xmm0);	/* xmm0  = T1 */
    movdqa_r2r (xmm9, xmm1);		/* xmm1  = x1 */

    movdqa_r2r (xmm0, xmm2);		/* xmm2  = T1 */
    pmulhw_r2r (xmm1, xmm0);		/* xmm0  = T1*x1 */

    movdqa_m2r (*t3_vector, xmm5);	/* xmm5  = T3 */
    pmulhw_r2r (xmm15, xmm2);		/* xmm2  = T1*x7 */

    movdqa_r2r (xmm5, xmm7);		/* xmm7  = T3-1 */
    psubsw_r2r (xmm15, xmm0);		/* xmm0  = v17 */

    movdqa_m2r (*t2_vector, xmm9);	/* xmm9  = T2 */
    pmulhw_r2r (xmm11, xmm5);		/* xmm5  = (T3-1)*x3 */

    paddsw_r2r (xmm2, xmm1);		/* xmm1  = u17 */
    pmulhw_r2r (xmm13, xmm7);		/* xmm7  = (T3-1)*x5 */

    movdqa_r2r (xmm9, xmm2);		/* xmm2  = T2 */
    paddsw_r2r (xmm11, xmm5);		/* xmm5  = T3*x3 */

    pmulhw_r2r (xmm10, xmm9);   	/* xmm9  = T2*x2 */
    paddsw_r2r (xmm13, xmm7);		/* xmm7  = T3*x5 */

    psubsw_r2r (xmm13, xmm5);		/* xmm5  = v35 */
    paddsw_r2r (xmm11, xmm7);		/* xmm7  = u35 */

    movdqa_r2r (xmm0, xmm6);		/* xmm6  = v17 */
    pmulhw_r2r (xmm14, xmm2);		/* xmm2  = T2*x6 */

    psubsw_r2r (xmm5, xmm0);		/* xmm0  = b3 */
    psubsw_r2r (xmm14, xmm9);		/* xmm9  = v26 */

    paddsw_r2r (xmm6, xmm5);		/* xmm5  = v12 */
    movdqa_r2r (xmm0, xmm11);		/* xmm11 = b3 */

    movdqa_r2r (xmm1, xmm6);		/* xmm6  = u17 */
    paddsw_r2r (xmm10, xmm2);		/* xmm2  = u26 */

    paddsw_r2r (xmm7, xmm6);		/* xmm6  = b0 */
    psubsw_r2r (xmm7, xmm1);		/* xmm1  = u12 */

    movdqa_r2r (xmm1, xmm7);		/* xmm7  = u12 */
    paddsw_r2r (xmm5, xmm1);		/* xmm1  = u12+v12 */

    movdqa_m2r (*c4_vector, xmm0);	/* xmm0  = C4/2 */
    psubsw_r2r (xmm5, xmm7);		/* xmm7  = u12-v12 */

    movdqa_r2r (xmm6, xmm4);		/* xmm4  = b0 */
    pmulhw_r2r (xmm0, xmm1);		/* xmm1  = b1/2 */

    movdqa_r2r (xmm9, xmm6);		/* xmm6  = v26 */
    pmulhw_r2r (xmm0, xmm7);		/* xmm7  = b2/2 */

    movdqa_r2r (xmm8, xmm10);		/* xmm10 = x0 */
    movdqa_r2r (xmm8, xmm0);		/* xmm0  = x0 */

    psubsw_r2r (xmm12, xmm10);		/* xmm10 = v04 */
    paddsw_r2r (xmm12, xmm0);		/* xmm0  = u04 */

    paddsw_r2r (xmm10, xmm9);		/* xmm9  = a1 */
    movdqa_r2r (xmm0, xmm8);		/* xmm8  = u04 */

    psubsw_r2r (xmm6, xmm10);		/* xmm10 = a2 */
    paddsw_r2r (xmm2, xmm8);		/* xmm8  = a0 */

    paddsw_r2r (xmm1, xmm1);		/* xmm1  = b1 */
    psubsw_r2r (xmm2, xmm0);		/* xmm0  = a3 */

    paddsw_r2r (xmm7, xmm7);		/* xmm7  = b2 */
    movdqa_r2r (xmm10, xmm13);		/* xmm13 = a2 */

    movdqa_r2r (xmm9, xmm14);		/* xmm14 = a1 */
    paddsw_r2r (xmm7, xmm10);		/* xmm10 = a2+b2 */

    psraw_i2r (COL_SHIFT,xmm10);	/* xmm10 = y2 */
    paddsw_r2r (xmm1, xmm9);		/* xmm9  = a1+b1 */

    psraw_i2r (COL_SHIFT, xmm9);	/* xmm9  = y1 */
    psubsw_r2r (xmm1, xmm14);		/* xmm14 = a1-b1 */

    psubsw_r2r (xmm7, xmm13);		/* xmm13 = a2-b2 */
    psraw_i2r (COL_SHIFT,xmm14);	/* xmm14 = y6 */

    movdqa_r2r (xmm8, xmm15);		/* xmm15 = a0 */
    psraw_i2r (COL_SHIFT,xmm13);	/* xmm13 = y5 */

    paddsw_r2r (xmm4, xmm8);		/* xmm8  = a0+b0 */
    psubsw_r2r (xmm4, xmm15);		/* xmm15 = a0-b0 */

    psraw_i2r (COL_SHIFT, xmm8);	/* xmm8  = y0 */
    movdqa_r2r (xmm0, xmm12);		/* xmm12 = a3 */

    psubsw_r2r (xmm11, xmm12);		/* xmm12 = a3-b3 */
    psraw_i2r (COL_SHIFT,xmm15);	/* xmm15 = y7 */

    paddsw_r2r (xmm0, xmm11);		/* xmm11 = a3+b3 */
    psraw_i2r (COL_SHIFT,xmm12);	/* xmm12 = y4 */

    psraw_i2r (COL_SHIFT,xmm11);	/* xmm11 = y3 */

    /* OUTPUT: block in xmm8 ... xmm15 */

#else
    movdqa_m2r (*t1_vector, xmm0);	/* xmm0 = T1 */

    movdqa_m2r (*(col+1*8), xmm1);	/* xmm1 = x1 */
    movdqa_r2r (xmm0, xmm2);		/* xmm2 = T1 */

    movdqa_m2r (*(col+7*8), xmm4);	/* xmm4 = x7 */
    pmulhw_r2r (xmm1, xmm0);		/* xmm0 = T1*x1 */

    movdqa_m2r (*t3_vector, xmm5);	/* xmm5 = T3 */
    pmulhw_r2r (xmm4, xmm2);		/* xmm2 = T1*x7 */

    movdqa_m2r (*(col+5*8), xmm6);	/* xmm6 = x5 */
    movdqa_r2r (xmm5, xmm7);		/* xmm7 = T3-1 */

    movdqa_m2r (*(col+3*8), xmm3);	/* xmm3 = x3 */
    psubsw_r2r (xmm4, xmm0);		/* xmm0 = v17 */

    movdqa_m2r (*t2_vector, xmm4);	/* xmm4 = T2 */
    pmulhw_r2r (xmm3, xmm5);		/* xmm5 = (T3-1)*x3 */

    paddsw_r2r (xmm2, xmm1);		/* xmm1 = u17 */
    pmulhw_r2r (xmm6, xmm7);		/* xmm7 = (T3-1)*x5 */

    /* slot */

    movdqa_r2r (xmm4, xmm2);		/* xmm2 = T2 */
    paddsw_r2r (xmm3, xmm5);		/* xmm5 = T3*x3 */

    pmulhw_m2r (*(col+2*8), xmm4);	/* xmm4 = T2*x2 */
    paddsw_r2r (xmm6, xmm7);		/* xmm7 = T3*x5 */

    psubsw_r2r (xmm6, xmm5);		/* xmm5 = v35 */
    paddsw_r2r (xmm3, xmm7);		/* xmm7 = u35 */

    movdqa_m2r (*(col+6*8), xmm3);	/* xmm3 = x6 */
    movdqa_r2r (xmm0, xmm6);		/* xmm6 = v17 */

    pmulhw_r2r (xmm3, xmm2);		/* xmm2 = T2*x6 */
    psubsw_r2r (xmm5, xmm0);		/* xmm0 = b3 */

    psubsw_r2r (xmm3, xmm4);		/* xmm4 = v26 */
    paddsw_r2r (xmm6, xmm5);		/* xmm5 = v12 */

    movdqa_r2m (xmm0, *(col+3*8));	/* save b3 in scratch0 */
    movdqa_r2r (xmm1, xmm6);		/* xmm6 = u17 */

    paddsw_m2r (*(col+2*8), xmm2);	/* xmm2 = u26 */
    paddsw_r2r (xmm7, xmm6);		/* xmm6 = b0 */

    psubsw_r2r (xmm7, xmm1);		/* xmm1 = u12 */
    movdqa_r2r (xmm1, xmm7);		/* xmm7 = u12 */

    movdqa_m2r (*(col+0*8), xmm3);	/* xmm3 = x0 */
    paddsw_r2r (xmm5, xmm1);		/* xmm1 = u12+v12 */

    movdqa_m2r (*c4_vector, xmm0);	/* xmm0 = C4/2 */
    psubsw_r2r (xmm5, xmm7);		/* xmm7 = u12-v12 */

    movdqa_r2m (xmm6, *(col+5*8));	/* save b0 in scratch1 */
    pmulhw_r2r (xmm0, xmm1);		/* xmm1 = b1/2 */

    movdqa_r2r (xmm4, xmm6);		/* xmm6 = v26 */
    pmulhw_r2r (xmm0, xmm7);		/* xmm7 = b2/2 */

    movdqa_m2r (*(col+4*8), xmm5);	/* xmm5 = x4 */
    movdqa_r2r (xmm3, xmm0);		/* xmm0 = x0 */

    psubsw_r2r (xmm5, xmm3);		/* xmm3 = v04 */
    paddsw_r2r (xmm5, xmm0);		/* xmm0 = u04 */

    paddsw_r2r (xmm3, xmm4);		/* xmm4 = a1 */
    movdqa_r2r (xmm0, xmm5);		/* xmm5 = u04 */

    psubsw_r2r (xmm6, xmm3);		/* xmm3 = a2 */
    paddsw_r2r (xmm2, xmm5);		/* xmm5 = a0 */

    paddsw_r2r (xmm1, xmm1);		/* xmm1 = b1 */
    psubsw_r2r (xmm2, xmm0);		/* xmm0 = a3 */

    paddsw_r2r (xmm7, xmm7);		/* xmm7 = b2 */
    movdqa_r2r (xmm3, xmm2);		/* xmm2 = a2 */

    movdqa_r2r (xmm4, xmm6);		/* xmm6 = a1 */
    paddsw_r2r (xmm7, xmm3);		/* xmm3 = a2+b2 */

    psraw_i2r (COL_SHIFT, xmm3);	/* xmm3 = y2 */
    paddsw_r2r (xmm1, xmm4);		/* xmm4 = a1+b1 */

    psraw_i2r (COL_SHIFT, xmm4);	/* xmm4 = y1 */
    psubsw_r2r (xmm1, xmm6);		/* xmm6 = a1-b1 */

    movdqa_m2r (*(col+5*8), xmm1);	/* xmm1 = b0 */
    psubsw_r2r (xmm7, xmm2);		/* xmm2 = a2-b2 */

    psraw_i2r (COL_SHIFT, xmm6);	/* xmm6 = y6 */
    movdqa_r2r (xmm5, xmm7);		/* xmm7 = a0 */

    movdqa_r2m (xmm4, *(col+1*8));	/* save y1 */
    psraw_i2r (COL_SHIFT, xmm2);	/* xmm2 = y5 */

    movdqa_r2m (xmm3, *(col+2*8));	/* save y2 */
    paddsw_r2r (xmm1, xmm5);		/* xmm5 = a0+b0 */

    movdqa_m2r (*(col+3*8), xmm4);	/* xmm4 = b3 */
    psubsw_r2r (xmm1, xmm7);		/* xmm7 = a0-b0 */

    psraw_i2r (COL_SHIFT, xmm5);	/* xmm5 = y0 */
    movdqa_r2r (xmm0, xmm3);		/* xmm3 = a3 */

    movdqa_r2m (xmm2, *(col+5*8));	/* save y5 */
    psubsw_r2r (xmm4, xmm3);		/* xmm3 = a3-b3 */

    psraw_i2r (COL_SHIFT, xmm7);	/* xmm7 = y7 */
    paddsw_r2r (xmm0, xmm4);		/* xmm4 = a3+b3 */

    movdqa_r2m (xmm5, *(col+0*8));	/* save y0 */
    psraw_i2r (COL_SHIFT, xmm3);	/* xmm3 = y4 */

    movdqa_r2m (xmm6, *(col+6*8));	/* save y6 */
    psraw_i2r (COL_SHIFT, xmm4);	/* xmm4 = y3 */

    movdqa_r2m (xmm7, *(col+7*8));	/* save y7 */

    movdqa_r2m (xmm3, *(col+4*8));	/* save y4 */

    movdqa_r2m (xmm4, *(col+3*8));	/* save y3 */
#endif
}
708 
709 
710 /* MMX column IDCT */
/* MMX column IDCT: processes four columns starting at col+offset
 * (4 int16 lanes per mm register, so it is called twice per block).
 * Temporarily reuses the x3 and x5 rows (col+offset+3*8 / +5*8) as
 * scratch for b3 and b0 before overwriting them with y3 and y5. */
static inline void idct_col (int16_t * const col, const int offset)
{
    static const short t1_vector[] ATTR_ALIGN(8) = {T1,T1,T1,T1};
    static const short t2_vector[] ATTR_ALIGN(8) = {T2,T2,T2,T2};
    static const short t3_vector[] ATTR_ALIGN(8) = {T3,T3,T3,T3};
    static const short c4_vector[] ATTR_ALIGN(8) = {C4,C4,C4,C4};

    /* column code adapted from peter gubanov */
    /* http://www.elecard.com/peter/idct.shtml */

    movq_m2r (*t1_vector, mm0);		/* mm0 = T1 */

    movq_m2r (*(col+offset+1*8), mm1);	/* mm1 = x1 */
    movq_r2r (mm0, mm2);		/* mm2 = T1 */

    movq_m2r (*(col+offset+7*8), mm4);	/* mm4 = x7 */
    pmulhw_r2r (mm1, mm0);		/* mm0 = T1*x1 */

    movq_m2r (*t3_vector, mm5);		/* mm5 = T3 */
    pmulhw_r2r (mm4, mm2);		/* mm2 = T1*x7 */

    movq_m2r (*(col+offset+5*8), mm6);	/* mm6 = x5 */
    movq_r2r (mm5, mm7);		/* mm7 = T3-1 */

    movq_m2r (*(col+offset+3*8), mm3);	/* mm3 = x3 */
    psubsw_r2r (mm4, mm0);		/* mm0 = v17 */

    movq_m2r (*t2_vector, mm4);		/* mm4 = T2 */
    pmulhw_r2r (mm3, mm5);		/* mm5 = (T3-1)*x3 */

    paddsw_r2r (mm2, mm1);		/* mm1 = u17 */
    pmulhw_r2r (mm6, mm7);		/* mm7 = (T3-1)*x5 */

    /* slot */

    movq_r2r (mm4, mm2);		/* mm2 = T2 */
    paddsw_r2r (mm3, mm5);		/* mm5 = T3*x3 */

    pmulhw_m2r (*(col+offset+2*8), mm4);/* mm4 = T2*x2 */
    paddsw_r2r (mm6, mm7);		/* mm7 = T3*x5 */

    psubsw_r2r (mm6, mm5);		/* mm5 = v35 */
    paddsw_r2r (mm3, mm7);		/* mm7 = u35 */

    movq_m2r (*(col+offset+6*8), mm3);	/* mm3 = x6 */
    movq_r2r (mm0, mm6);		/* mm6 = v17 */

    pmulhw_r2r (mm3, mm2);		/* mm2 = T2*x6 */
    psubsw_r2r (mm5, mm0);		/* mm0 = b3 */

    psubsw_r2r (mm3, mm4);		/* mm4 = v26 */
    paddsw_r2r (mm6, mm5);		/* mm5 = v12 */

    movq_r2m (mm0, *(col+offset+3*8));	/* save b3 in scratch0 */
    movq_r2r (mm1, mm6);		/* mm6 = u17 */

    paddsw_m2r (*(col+offset+2*8), mm2);/* mm2 = u26 */
    paddsw_r2r (mm7, mm6);		/* mm6 = b0 */

    psubsw_r2r (mm7, mm1);		/* mm1 = u12 */
    movq_r2r (mm1, mm7);		/* mm7 = u12 */

    movq_m2r (*(col+offset+0*8), mm3);	/* mm3 = x0 */
    paddsw_r2r (mm5, mm1);		/* mm1 = u12+v12 */

    movq_m2r (*c4_vector, mm0);		/* mm0 = C4/2 */
    psubsw_r2r (mm5, mm7);		/* mm7 = u12-v12 */

    movq_r2m (mm6, *(col+offset+5*8));	/* save b0 in scratch1 */
    pmulhw_r2r (mm0, mm1);		/* mm1 = b1/2 */

    movq_r2r (mm4, mm6);		/* mm6 = v26 */
    pmulhw_r2r (mm0, mm7);		/* mm7 = b2/2 */

    movq_m2r (*(col+offset+4*8), mm5);	/* mm5 = x4 */
    movq_r2r (mm3, mm0);		/* mm0 = x0 */

    psubsw_r2r (mm5, mm3);		/* mm3 = v04 */
    paddsw_r2r (mm5, mm0);		/* mm0 = u04 */

    paddsw_r2r (mm3, mm4);		/* mm4 = a1 */
    movq_r2r (mm0, mm5);		/* mm5 = u04 */

    psubsw_r2r (mm6, mm3);		/* mm3 = a2 */
    paddsw_r2r (mm2, mm5);		/* mm5 = a0 */

    paddsw_r2r (mm1, mm1);		/* mm1 = b1 */
    psubsw_r2r (mm2, mm0);		/* mm0 = a3 */

    paddsw_r2r (mm7, mm7);		/* mm7 = b2 */
    movq_r2r (mm3, mm2);		/* mm2 = a2 */

    movq_r2r (mm4, mm6);		/* mm6 = a1 */
    paddsw_r2r (mm7, mm3);		/* mm3 = a2+b2 */

    psraw_i2r (COL_SHIFT, mm3);		/* mm3 = y2 */
    paddsw_r2r (mm1, mm4);		/* mm4 = a1+b1 */

    psraw_i2r (COL_SHIFT, mm4);		/* mm4 = y1 */
    psubsw_r2r (mm1, mm6);		/* mm6 = a1-b1 */

    movq_m2r (*(col+offset+5*8), mm1);	/* mm1 = b0 */
    psubsw_r2r (mm7, mm2);		/* mm2 = a2-b2 */

    psraw_i2r (COL_SHIFT, mm6);		/* mm6 = y6 */
    movq_r2r (mm5, mm7);		/* mm7 = a0 */

    movq_r2m (mm4, *(col+offset+1*8));	/* save y1 */
    psraw_i2r (COL_SHIFT, mm2);		/* mm2 = y5 */

    movq_r2m (mm3, *(col+offset+2*8));	/* save y2 */
    paddsw_r2r (mm1, mm5);		/* mm5 = a0+b0 */

    movq_m2r (*(col+offset+3*8), mm4);	/* mm4 = b3 */
    psubsw_r2r (mm1, mm7);		/* mm7 = a0-b0 */

    psraw_i2r (COL_SHIFT, mm5);		/* mm5 = y0 */
    movq_r2r (mm0, mm3);		/* mm3 = a3 */

    movq_r2m (mm2, *(col+offset+5*8));	/* save y5 */
    psubsw_r2r (mm4, mm3);		/* mm3 = a3-b3 */

    psraw_i2r (COL_SHIFT, mm7);		/* mm7 = y7 */
    paddsw_r2r (mm0, mm4);		/* mm4 = a3+b3 */

    movq_r2m (mm5, *(col+offset+0*8));	/* save y0 */
    psraw_i2r (COL_SHIFT, mm3);		/* mm3 = y4 */

    movq_r2m (mm6, *(col+offset+6*8));	/* save y6 */
    psraw_i2r (COL_SHIFT, mm4);		/* mm4 = y3 */

    movq_r2m (mm7, *(col+offset+7*8));	/* save y7 */

    movq_r2m (mm3, *(col+offset+4*8));	/* save y4 */

    movq_r2m (mm4, *(col+offset+3*8));	/* save y3 */
}
848 
849 
850 static const int32_t rounder0[] ATTR_ALIGN(8) =
851     rounder ((1 << (COL_SHIFT - 1)) - 0.5);
852 static const int32_t rounder4[] ATTR_ALIGN(8) = rounder (0);
853 static const int32_t rounder1[] ATTR_ALIGN(8) =
854     rounder (1.25683487303);	/* C1*(C1/C4+C1+C7)/2 */
855 static const int32_t rounder7[] ATTR_ALIGN(8) =
856     rounder (-0.25);		/* C1*(C7/C4+C7-C1)/2 */
857 static const int32_t rounder2[] ATTR_ALIGN(8) =
858     rounder (0.60355339059);	/* C2 * (C6+C2)/2 */
859 static const int32_t rounder6[] ATTR_ALIGN(8) =
860     rounder (-0.25);		/* C2 * (C6-C2)/2 */
861 static const int32_t rounder3[] ATTR_ALIGN(8) =
862     rounder (0.087788325588);	/* C3*(-C3/C4+C3+C5)/2 */
863 static const int32_t rounder5[] ATTR_ALIGN(8) =
864     rounder (-0.441341716183);	/* C3*(-C5/C4+C5-C3)/2 */
865 
866 
/* Generate a complete 8x8 IDCT function from a set of row-pass building
 * blocks (head/body/tail/mid macros) and a coefficient-table generator.
 * Rows are processed in the order 0,4,1,7,2,6,3,5 so that each pair of
 * rows sharing a cosine table (0&4, 1&7, 2&6, 3&5) is handled back to
 * back, and the *_mid macros overlap the store of one row with the load
 * of the next.  The column pass (idct_col) then runs on the two 4-column
 * halves of the block. */
#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid)	\
static inline void idct (int16_t * const block)				\
{									\
    static const int16_t table04[] ATTR_ALIGN(16) =			\
	table (22725, 21407, 19266, 16384, 12873,  8867, 4520);		\
    static const int16_t table17[] ATTR_ALIGN(16) =			\
	table (31521, 29692, 26722, 22725, 17855, 12299, 6270);		\
    static const int16_t table26[] ATTR_ALIGN(16) =			\
	table (29692, 27969, 25172, 21407, 16819, 11585, 5906);		\
    static const int16_t table35[] ATTR_ALIGN(16) =			\
	table (26722, 25172, 22654, 19266, 15137, 10426, 5315);		\
									\
    idct_row_head (block, 0*8, table04);				\
    idct_row (table04, rounder0);					\
    idct_row_mid (block, 0*8, 4*8, table04);				\
    idct_row (table04, rounder4);					\
    idct_row_mid (block, 4*8, 1*8, table17);				\
    idct_row (table17, rounder1);					\
    idct_row_mid (block, 1*8, 7*8, table17);				\
    idct_row (table17, rounder7);					\
    idct_row_mid (block, 7*8, 2*8, table26);				\
    idct_row (table26, rounder2);					\
    idct_row_mid (block, 2*8, 6*8, table26);				\
    idct_row (table26, rounder6);					\
    idct_row_mid (block, 6*8, 3*8, table35);				\
    idct_row (table35, rounder3);					\
    idct_row_mid (block, 3*8, 5*8, table35);				\
    idct_row (table35, rounder5);					\
    idct_row_tail (block, 5*8);						\
									\
    idct_col (block, 0);						\
    idct_col (block, 4);						\
}
900 
sse2_idct(int16_t * const block)901 static inline void sse2_idct (int16_t * const block)
902 {
903     static const int16_t table04[] ATTR_ALIGN(16) =
904 	sse2_table (22725, 21407, 19266, 16384, 12873,  8867, 4520);
905     static const int16_t table17[] ATTR_ALIGN(16) =
906 	sse2_table (31521, 29692, 26722, 22725, 17855, 12299, 6270);
907     static const int16_t table26[] ATTR_ALIGN(16) =
908 	sse2_table (29692, 27969, 25172, 21407, 16819, 11585, 5906);
909     static const int16_t table35[] ATTR_ALIGN(16) =
910 	sse2_table (26722, 25172, 22654, 19266, 15137, 10426, 5315);
911 
912     static const int32_t rounder0_128[] ATTR_ALIGN(16) =
913 	rounder_sse2 ((1 << (COL_SHIFT - 1)) - 0.5);
914     static const int32_t rounder4_128[] ATTR_ALIGN(16) = rounder_sse2 (0);
915     static const int32_t rounder1_128[] ATTR_ALIGN(16) =
916 	rounder_sse2 (1.25683487303);	/* C1*(C1/C4+C1+C7)/2 */
917     static const int32_t rounder7_128[] ATTR_ALIGN(16) =
918 	rounder_sse2 (-0.25);		/* C1*(C7/C4+C7-C1)/2 */
919     static const int32_t rounder2_128[] ATTR_ALIGN(16) =
920 	rounder_sse2 (0.60355339059);	/* C2 * (C6+C2)/2 */
921     static const int32_t rounder6_128[] ATTR_ALIGN(16) =
922 	rounder_sse2 (-0.25);		/* C2 * (C6-C2)/2 */
923     static const int32_t rounder3_128[] ATTR_ALIGN(16) =
924 	rounder_sse2 (0.087788325588);	/* C3*(-C3/C4+C3+C5)/2 */
925     static const int32_t rounder5_128[] ATTR_ALIGN(16) =
926 	rounder_sse2 (-0.441341716183);	/* C3*(-C5/C4+C5-C3)/2 */
927 
928 #if defined(__x86_64__)
929     movdqa_m2r (block[0*8], xmm8);
930     movdqa_m2r (block[4*8], xmm12);
931     SSE2_IDCT_2ROW (table04,  xmm8, xmm12, *rounder0_128, *rounder4_128);
932 
933     movdqa_m2r (block[1*8], xmm9);
934     movdqa_m2r (block[7*8], xmm15);
935     SSE2_IDCT_2ROW (table17,  xmm9, xmm15, *rounder1_128, *rounder7_128);
936 
937     movdqa_m2r (block[2*8], xmm10);
938     movdqa_m2r (block[6*8], xmm14);
939     SSE2_IDCT_2ROW (table26, xmm10, xmm14, *rounder2_128, *rounder6_128);
940 
941     movdqa_m2r (block[3*8], xmm11);
942     movdqa_m2r (block[5*8], xmm13);
943     SSE2_IDCT_2ROW (table35, xmm11, xmm13, *rounder3_128, *rounder5_128);
944 
945     /* OUTPUT: block in xmm8 ... xmm15 */
946 
947 #else
948     movdqa_m2r (block[0*8], xmm0);
949     movdqa_m2r (block[4*8], xmm4);
950     SSE2_IDCT_2ROW (table04, xmm0, xmm4, *rounder0_128, *rounder4_128);
951     movdqa_r2m (xmm0, block[0*8]);
952     movdqa_r2m (xmm4, block[4*8]);
953 
954     movdqa_m2r (block[1*8], xmm0);
955     movdqa_m2r (block[7*8], xmm4);
956     SSE2_IDCT_2ROW (table17, xmm0, xmm4, *rounder1_128, *rounder7_128);
957     movdqa_r2m (xmm0, block[1*8]);
958     movdqa_r2m (xmm4, block[7*8]);
959 
960     movdqa_m2r (block[2*8], xmm0);
961     movdqa_m2r (block[6*8], xmm4);
962     SSE2_IDCT_2ROW (table26, xmm0, xmm4, *rounder2_128, *rounder6_128);
963     movdqa_r2m (xmm0, block[2*8]);
964     movdqa_r2m (xmm4, block[6*8]);
965 
966     movdqa_m2r (block[3*8], xmm0);
967     movdqa_m2r (block[5*8], xmm4);
968     SSE2_IDCT_2ROW (table35, xmm0, xmm4, *rounder3_128, *rounder5_128);
969     movdqa_r2m (xmm0, block[3*8]);
970     movdqa_r2m (xmm4, block[5*8]);
971 #endif
972 
973     sse2_idct_col (block);
974 }
975 
sse2_block_copy(int16_t * const block,uint8_t * dest,const int stride)976 static void sse2_block_copy (int16_t * const block, uint8_t * dest,
977 			     const int stride)
978 {
979 #if defined(__x86_64__)
980     /* INPUT: block in xmm8 ... xmm15 */
981     packuswb_r2r (xmm8, xmm8);
982     packuswb_r2r (xmm9, xmm9);
983     movq_r2m (xmm8,  *(dest+0*stride));
984     packuswb_r2r (xmm10, xmm10);
985     movq_r2m (xmm9,  *(dest+1*stride));
986     packuswb_r2r (xmm11, xmm11);
987     movq_r2m (xmm10, *(dest+2*stride));
988     packuswb_r2r (xmm12, xmm12);
989     movq_r2m (xmm11, *(dest+3*stride));
990     packuswb_r2r (xmm13, xmm13);
991     movq_r2m (xmm12, *(dest+4*stride));
992     packuswb_r2r (xmm14, xmm14);
993     movq_r2m (xmm13, *(dest+5*stride));
994     packuswb_r2r (xmm15, xmm15);
995     movq_r2m (xmm14, *(dest+6*stride));
996     movq_r2m (xmm15, *(dest+7*stride));
997 #else
998     movdqa_m2r (*(block+0*8), xmm0);
999     movdqa_m2r (*(block+1*8), xmm1);
1000     movdqa_m2r (*(block+2*8), xmm2);
1001     packuswb_r2r (xmm0, xmm0);
1002     movdqa_m2r (*(block+3*8), xmm3);
1003     packuswb_r2r (xmm1, xmm1);
1004     movdqa_m2r (*(block+4*8), xmm4);
1005     packuswb_r2r (xmm2, xmm2);
1006     movdqa_m2r (*(block+5*8), xmm5);
1007     packuswb_r2r (xmm3, xmm3);
1008     movdqa_m2r (*(block+6*8), xmm6);
1009     packuswb_r2r (xmm4, xmm4);
1010     movdqa_m2r (*(block+7*8), xmm7);
1011     movq_r2m (xmm0, *(dest+0*stride));
1012     packuswb_r2r (xmm5, xmm5);
1013     movq_r2m (xmm1, *(dest+1*stride));
1014     packuswb_r2r (xmm6, xmm6);
1015     movq_r2m (xmm2, *(dest+2*stride));
1016     packuswb_r2r (xmm7, xmm7);
1017     movq_r2m (xmm3, *(dest+3*stride));
1018     movq_r2m (xmm4, *(dest+4*stride));
1019     movq_r2m (xmm5, *(dest+5*stride));
1020     movq_r2m (xmm6, *(dest+6*stride));
1021     movq_r2m (xmm7, *(dest+7*stride));
1022 #endif
1023 }
1024 
/* One pipelined step of the MMX block copy: load the next row's two
 * 4-coefficient halves into r0/r1, store the previously packed row r2,
 * and saturate-pack r0/r1 into r0 for the next iteration.  Advances
 * dest by one stride. */
#define COPY_MMX(offset,r0,r1,r2)	\
do {					\
    movq_m2r (*(block+offset), r0);	\
    dest += stride;			\
    movq_m2r (*(block+offset+4), r1);	\
    movq_r2m (r2, *dest);		\
    packuswb_r2r (r1, r0);		\
} while (0)
1033 
block_copy(int16_t * const block,uint8_t * dest,const int stride)1034 static inline void block_copy (int16_t * const block, uint8_t * dest,
1035 			       const int stride)
1036 {
1037     movq_m2r (*(block+0*8), mm0);
1038     movq_m2r (*(block+0*8+4), mm1);
1039     movq_m2r (*(block+1*8), mm2);
1040     packuswb_r2r (mm1, mm0);
1041     movq_m2r (*(block+1*8+4), mm3);
1042     movq_r2m (mm0, *dest);
1043     packuswb_r2r (mm3, mm2);
1044     COPY_MMX (2*8, mm0, mm1, mm2);
1045     COPY_MMX (3*8, mm2, mm3, mm0);
1046     COPY_MMX (4*8, mm0, mm1, mm2);
1047     COPY_MMX (5*8, mm2, mm3, mm0);
1048     COPY_MMX (6*8, mm0, mm1, mm2);
1049     COPY_MMX (7*8, mm2, mm3, mm0);
1050     movq_r2m (mm2, *(dest+stride));
1051 }
1052 
/* Add two rows of IDCT coefficients to two rows of prediction pixels:
 * widen 8 dest bytes to 16-bit (xmm0 must hold zero), add the
 * coefficients with signed saturation, pack back to bytes with unsigned
 * saturation, and store.  op selects r2r (x86-64: coefficients already
 * in registers) or m2r (32-bit: coefficients in memory).  Advances dest
 * by two strides. */
#define ADD_SSE2_2ROW(op, block0, block1)\
do {					\
    movq_m2r (*(dest), xmm1);		\
    movq_m2r (*(dest+stride), xmm2);	\
    punpcklbw_r2r (xmm0, xmm1);		\
    punpcklbw_r2r (xmm0, xmm2);		\
    paddsw_##op (block0, xmm1);		\
    paddsw_##op (block1, xmm2);		\
    packuswb_r2r (xmm1, xmm1);		\
    packuswb_r2r (xmm2, xmm2);		\
    movq_r2m (xmm1, *(dest));		\
    movq_r2m (xmm2, *(dest+stride));	\
    dest += 2*stride;			\
} while (0)
1067 
sse2_block_add(int16_t * const block,uint8_t * dest,const int stride)1068 static void sse2_block_add (int16_t * const block, uint8_t * dest,
1069 			    const int stride)
1070 {
1071     pxor_r2r(xmm0, xmm0);
1072 #if defined(__x86_64__)
1073     /* INPUT: block in xmm8 ... xmm15 */
1074     ADD_SSE2_2ROW(r2r, xmm8, xmm9);
1075     ADD_SSE2_2ROW(r2r, xmm10, xmm11);
1076     ADD_SSE2_2ROW(r2r, xmm12, xmm13);
1077     ADD_SSE2_2ROW(r2r, xmm14, xmm15);
1078 #else
1079     ADD_SSE2_2ROW(m2r, *(block+0*8), *(block+1*8));
1080     ADD_SSE2_2ROW(m2r, *(block+2*8), *(block+3*8));
1081     ADD_SSE2_2ROW(m2r, *(block+4*8), *(block+5*8));
1082     ADD_SSE2_2ROW(m2r, *(block+6*8), *(block+7*8));
1083 #endif
1084 }
1085 
/* One pipelined step of the MMX block-add: load the dest row two
 * strides ahead into r1/r2, pack and store the previous row's result
 * r3:r4, then widen the loaded bytes (mm0 must hold zero) and add the
 * coefficient row at block+offset with signed saturation.  Advances
 * dest by one stride. */
#define ADD_MMX(offset,r1,r2,r3,r4)	\
do {					\
    movq_m2r (*(dest+2*stride), r1);	\
    packuswb_r2r (r4, r3);		\
    movq_r2r (r1, r2);			\
    dest += stride;			\
    movq_r2m (r3, *dest);		\
    punpcklbw_r2r (mm0, r1);		\
    paddsw_m2r (*(block+offset), r1);	\
    punpckhbw_r2r (mm0, r2);		\
    paddsw_m2r (*(block+offset+4), r2);	\
} while (0)
1098 
block_add(int16_t * const block,uint8_t * dest,const int stride)1099 static inline void block_add (int16_t * const block, uint8_t * dest,
1100 			      const int stride)
1101 {
1102     movq_m2r (*dest, mm1);
1103     pxor_r2r (mm0, mm0);
1104     movq_m2r (*(dest+stride), mm3);
1105     movq_r2r (mm1, mm2);
1106     punpcklbw_r2r (mm0, mm1);
1107     movq_r2r (mm3, mm4);
1108     paddsw_m2r (*(block+0*8), mm1);
1109     punpckhbw_r2r (mm0, mm2);
1110     paddsw_m2r (*(block+0*8+4), mm2);
1111     punpcklbw_r2r (mm0, mm3);
1112     paddsw_m2r (*(block+1*8), mm3);
1113     packuswb_r2r (mm2, mm1);
1114     punpckhbw_r2r (mm0, mm4);
1115     movq_r2m (mm1, *dest);
1116     paddsw_m2r (*(block+1*8+4), mm4);
1117     ADD_MMX (2*8, mm1, mm2, mm3, mm4);
1118     ADD_MMX (3*8, mm3, mm4, mm1, mm2);
1119     ADD_MMX (4*8, mm1, mm2, mm3, mm4);
1120     ADD_MMX (5*8, mm3, mm4, mm1, mm2);
1121     ADD_MMX (6*8, mm1, mm2, mm3, mm4);
1122     ADD_MMX (7*8, mm3, mm4, mm1, mm2);
1123     packuswb_r2r (mm4, mm3);
1124     movq_r2m (mm3, *(dest+stride));
1125 }
1126 
1127 
sse2_block_zero(int16_t * const block)1128 static inline void sse2_block_zero (int16_t * const block)
1129 {
1130     pxor_r2r (xmm0, xmm0);
1131     movdqa_r2m (xmm0, *(block+0*8));
1132     movdqa_r2m (xmm0, *(block+1*8));
1133     movdqa_r2m (xmm0, *(block+2*8));
1134     movdqa_r2m (xmm0, *(block+3*8));
1135     movdqa_r2m (xmm0, *(block+4*8));
1136     movdqa_r2m (xmm0, *(block+5*8));
1137     movdqa_r2m (xmm0, *(block+6*8));
1138     movdqa_r2m (xmm0, *(block+7*8));
1139 }
1140 
block_zero(int16_t * const block)1141 static inline void block_zero (int16_t * const block)
1142 {
1143     pxor_r2r (mm0, mm0);
1144     movq_r2m (mm0, *(block+0*4));
1145     movq_r2m (mm0, *(block+1*4));
1146     movq_r2m (mm0, *(block+2*4));
1147     movq_r2m (mm0, *(block+3*4));
1148     movq_r2m (mm0, *(block+4*4));
1149     movq_r2m (mm0, *(block+5*4));
1150     movq_r2m (mm0, *(block+6*4));
1151     movq_r2m (mm0, *(block+7*4));
1152     movq_r2m (mm0, *(block+8*4));
1153     movq_r2m (mm0, *(block+9*4));
1154     movq_r2m (mm0, *(block+10*4));
1155     movq_r2m (mm0, *(block+11*4));
1156     movq_r2m (mm0, *(block+12*4));
1157     movq_r2m (mm0, *(block+13*4));
1158     movq_r2m (mm0, *(block+14*4));
1159     movq_r2m (mm0, *(block+15*4));
1160 }
1161 
1162 
/* CPU selector for block_add_DC: MMXEXT provides pshufw, plain MMX
 * must emulate the broadcast with two unpacks.  cpu is a compile-time
 * constant at every call site, so the branch folds away. */
#define CPU_MMXEXT 0
#define CPU_MMX 1

/* Broadcast the low 16-bit word of reg into all four words. */
#define dup4(reg)			\
do {					\
    if (cpu != CPU_MMXEXT) {		\
	punpcklwd_r2r (reg, reg);	\
	punpckldq_r2r (reg, reg);	\
    } else				\
	pshufw_r2r (reg, reg, 0x00);	\
} while (0)
1174 
block_add_DC(int16_t * const block,uint8_t * dest,const int stride,const int cpu)1175 static inline void block_add_DC (int16_t * const block, uint8_t * dest,
1176 				 const int stride, const int cpu)
1177 {
1178     movd_v2r ((block[0] + 64) >> 7, mm0);
1179     pxor_r2r (mm1, mm1);
1180     movq_m2r (*dest, mm2);
1181     dup4 (mm0);
1182     psubsw_r2r (mm0, mm1);
1183     packuswb_r2r (mm0, mm0);
1184     paddusb_r2r (mm0, mm2);
1185     packuswb_r2r (mm1, mm1);
1186     movq_m2r (*(dest + stride), mm3);
1187     psubusb_r2r (mm1, mm2);
1188     block[0] = 0;
1189     paddusb_r2r (mm0, mm3);
1190     movq_r2m (mm2, *dest);
1191     psubusb_r2r (mm1, mm3);
1192     movq_m2r (*(dest + 2*stride), mm2);
1193     dest += stride;
1194     movq_r2m (mm3, *dest);
1195     paddusb_r2r (mm0, mm2);
1196     movq_m2r (*(dest + 2*stride), mm3);
1197     psubusb_r2r (mm1, mm2);
1198     dest += stride;
1199     paddusb_r2r (mm0, mm3);
1200     movq_r2m (mm2, *dest);
1201     psubusb_r2r (mm1, mm3);
1202     movq_m2r (*(dest + 2*stride), mm2);
1203     dest += stride;
1204     movq_r2m (mm3, *dest);
1205     paddusb_r2r (mm0, mm2);
1206     movq_m2r (*(dest + 2*stride), mm3);
1207     psubusb_r2r (mm1, mm2);
1208     dest += stride;
1209     paddusb_r2r (mm0, mm3);
1210     movq_r2m (mm2, *dest);
1211     psubusb_r2r (mm1, mm3);
1212     movq_m2r (*(dest + 2*stride), mm2);
1213     dest += stride;
1214     movq_r2m (mm3, *dest);
1215     paddusb_r2r (mm0, mm2);
1216     movq_m2r (*(dest + 2*stride), mm3);
1217     psubusb_r2r (mm1, mm2);
1218     block[63] = 0;
1219     paddusb_r2r (mm0, mm3);
1220     movq_r2m (mm2, *(dest + stride));
1221     psubusb_r2r (mm1, mm3);
1222     movq_r2m (mm3, *(dest + 2*stride));
1223 }
1224 
/* Intra-block path (SSE2): inverse-transform the coefficients, write
 * the pixels directly to dest, then clear the block for reuse. */
void mpeg2_idct_copy_sse2 (int16_t * const block, uint8_t * const dest,
			   const int stride)
{
    sse2_idct (block);
    sse2_block_copy (block, dest, stride);
    sse2_block_zero (block);
}
1232 
mpeg2_idct_add_sse2(const int last,int16_t * const block,uint8_t * const dest,const int stride)1233 void mpeg2_idct_add_sse2 (const int last, int16_t * const block,
1234 			  uint8_t * const dest, const int stride)
1235 {
1236     if (last != 129 || (block[0] & (7 << 4)) == (4 << 4)) {
1237 	sse2_idct (block);
1238 	sse2_block_add (block, dest, stride);
1239 	sse2_block_zero (block);
1240     } else
1241 	block_add_DC (block, dest, stride, CPU_MMXEXT);
1242 }
1243 
1244 
declare_idct(mmxext_idct,mmxext_table,mmxext_row_head,mmxext_row,mmxext_row_tail,mmxext_row_mid)1245 declare_idct (mmxext_idct, mmxext_table,
1246 	      mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid)
1247 
/* Intra-block path (MMXEXT): inverse-transform, store pixels, clear. */
void mpeg2_idct_copy_mmxext (int16_t * const block, uint8_t * const dest,
			     const int stride)
{
    mmxext_idct (block);
    block_copy (block, dest, stride);
    block_zero (block);
}
1255 
mpeg2_idct_add_mmxext(const int last,int16_t * const block,uint8_t * const dest,const int stride)1256 void mpeg2_idct_add_mmxext (const int last, int16_t * const block,
1257 			    uint8_t * const dest, const int stride)
1258 {
1259     if (last != 129 || (block[0] & (7 << 4)) == (4 << 4)) {
1260 	mmxext_idct (block);
1261 	block_add (block, dest, stride);
1262 	block_zero (block);
1263     } else
1264 	block_add_DC (block, dest, stride, CPU_MMXEXT);
1265 }
1266 
1267 
declare_idct(mmx_idct,mmx_table,mmx_row_head,mmx_row,mmx_row_tail,mmx_row_mid)1268 declare_idct (mmx_idct, mmx_table,
1269 	      mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid)
1270 
/* Intra-block path (MMX): inverse-transform, store pixels, clear. */
void mpeg2_idct_copy_mmx (int16_t * const block, uint8_t * const dest,
			  const int stride)
{
    mmx_idct (block);
    block_copy (block, dest, stride);
    block_zero (block);
}
1278 
mpeg2_idct_add_mmx(const int last,int16_t * const block,uint8_t * const dest,const int stride)1279 void mpeg2_idct_add_mmx (const int last, int16_t * const block,
1280 			 uint8_t * const dest, const int stride)
1281 {
1282     if (last != 129 || (block[0] & (7 << 4)) == (4 << 4)) {
1283 	mmx_idct (block);
1284 	block_add (block, dest, stride);
1285 	block_zero (block);
1286     } else
1287 	block_add_DC (block, dest, stride, CPU_MMX);
1288 }
1289 
1290 
mpeg2_idct_mmx_init(void)1291 void mpeg2_idct_mmx_init (void)
1292 {
1293     int i, j;
1294 
1295     /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
1296 
1297     for (i = 0; i < 64; i++) {
1298 	j = mpeg2_scan_norm[i];
1299 	mpeg2_scan_norm[i] = (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2);
1300 	j = mpeg2_scan_alt[i];
1301 	mpeg2_scan_alt[i] = (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2);
1302     }
1303 }
1304 
1305 #endif
1306