1 //////////////////////////////////////////////////////////////////////////////
2 //
3 // fdctam32.c - AP922 MMX(3D-Now) forward-DCT
4 // ----------
5 // Intel Application Note AP-922 - fast, precise implementation of DCT
6 // http://developer.intel.com/vtune/cbts/appnotes.htm
7 // ----------
8 //
9 // This routine can use a 3D-Now/MMX enhancement to increase the
// accuracy of the fdct_col_4 macro. The dct_col function uses 3D-Now's
// PMULHRW instead of MMX's PMULHW (and POR). The substitution improves
// accuracy very slightly, at a small performance penalty. If the target CPU
13 // does not support 3D-Now, then this function cannot be executed.
14 //
15 // For a fast, precise MMX implementation of inverse-DCT
16 // visit http://www.elecard.com/peter
17 //
18 // v1.0 07/22/2000 (initial release)
19 //
20 // liaor@iname.com http://members.tripod.com/~liaor
21 //////////////////////////////////////////////////////////////////////////////
22
23 /*
 * A.Stevens Jul 2000: ported to nasm syntax and disentangled
 * from Win**** compiler specific stuff.
26 * All the real work was done above though.
27 * See above for how to optimise quality on 3DNow! CPU's
28 * Nov 2003 changed to PIC for use in shared libraries
29 *
30 * G.Vervoort Jan 2005: ported to inline asm.
31 */
32
33 #include <config.h>
34 #include "mjpeg_types.h"
35 #include "mmx.h"
36
37
38 //////////////////////////////////////////////////////////////////////
39 //
40 // constants for the forward DCT
41 // -----------------------------
42 //
43 // Be sure to check that your compiler is aligning all constants to QWORD
44 // (8-byte) memory boundaries! Otherwise the unaligned memory access will
45 // severely stall MMX execution.
46 //
47 //////////////////////////////////////////////////////////////////////
48
/* Fixed-point accuracy parameter for both passes (2 or 3, per AP-922).
 * The column pass pre-shifts samples left by SHIFT_FRW_COL; the row pass
 * shifts its 32-bit accumulators right by SHIFT_FRW_ROW at the end. */
#define BITS_FRW_ACC 3 //; 2 or 3 for accuracy
#define SHIFT_FRW_COL BITS_FRW_ACC
#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17)
//#define RND_FRW_ROW (262144 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_ROW-1)
/* Rounding bias added to each 32-bit sum before the final right shift. */
#define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1))
//#define RND_FRW_COL (2 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_COL-1)
#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1))
56
/*
 * Tangent constants for the column pass, replicated into four 16-bit words
 * so a single pmulhw scales four columns at once:
 *   tg_1_16 = tan(1*PI/16) * 2^16 =  13036
 *   tg_2_16 = tan(2*PI/16) * 2^16 =  27146
 *   tg_3_16 = (tan(3*PI/16) - 1) * 2^16 = -21746
 * tg_3_16 is stored minus one because tan(3*PI/16)*2^16 does not fit a
 * signed 16-bit word; the column code adds the multiplicand back after
 * each pmulhw with it (see the "tm465*tg_3_16" paddsw lines) to compensate.
 */
static const int16_t fdct_tg_all_16[] ATTR_ALIGN(8) = {
	13036, 13036, 13036, 13036,	// tg_1_16: tan(PI/16) * 2^16
	27146, 27146, 27146, 27146,	// tg_2_16: tan(2*PI/16) * 2^16
	-21746, -21746, -21746, -21746,	// tg_3_16: (tan(3*PI/16) - 1) * 2^16
};
63
/* cos(PI/4) * 2^15 = 23170 (cos_4_16), replicated into four words for
 * the (t6 +/- t5) rotation in the column pass. */
static const int16_t ocos_4_16[4] ATTR_ALIGN(8) = {
	23170, 23170, 23170, 23170,	// cos(PI/4) * 2^15
};
67
/* Word mask with 1 in each LSB: OR-ed into selected results to apply the
 * AP-922 "+0.5" rounding correction without an extra add. */
static const int64_t fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL;
/* Row-pass rounder, duplicated so one paddd biases both 32-bit lanes. */
static const int32_t fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW };
70
/*
 * Row-pass coefficient tables: 8 rows x 32 int16_t, one 32-word slice per
 * output row, laid out for pmaddwd.  Within each slice the first 16 words
 * (w00..w15) produce the even outputs (y0,y2,y4,y6) and the last 16
 * (w16..w31) the odd outputs (y1,y3,y5,y7).  Rows use different scaling
 * factors so the combined column+row passes realise the AP-922 fixed-point
 * FDCT; note that rows 0/4, 1/7, 2/6 and 3/5 share identical values.
 */
static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = {  // forward_dct coeff table
	//row0
	16384, 16384, 21407, -8867,	// w09 w01 w08 w00
	16384, 16384, 8867, -21407,	// w13 w05 w12 w04
	16384, -16384, 8867, 21407,	// w11 w03 w10 w02
	-16384, 16384, -21407, -8867,	// w15 w07 w14 w06
	22725, 12873, 19266, -22725,	// w22 w20 w18 w16
	19266, 4520, -4520, -12873,	// w23 w21 w19 w17
	12873, 4520, 4520, 19266,	// w30 w28 w26 w24
	-22725, 19266, -12873, -22725,	// w31 w29 w27 w25

	//row1
	22725, 22725, 29692, -12299,	// w09 w01 w08 w00
	22725, 22725, 12299, -29692,	// w13 w05 w12 w04
	22725, -22725, 12299, 29692,	// w11 w03 w10 w02
	-22725, 22725, -29692, -12299,	// w15 w07 w14 w06
	31521, 17855, 26722, -31521,	// w22 w20 w18 w16
	26722, 6270, -6270, -17855,	// w23 w21 w19 w17
	17855, 6270, 6270, 26722,	// w30 w28 w26 w24
	-31521, 26722, -17855, -31521,	// w31 w29 w27 w25

	//row2
	21407, 21407, 27969, -11585,	// w09 w01 w08 w00
	21407, 21407, 11585, -27969,	// w13 w05 w12 w04
	21407, -21407, 11585, 27969,	// w11 w03 w10 w02
	-21407, 21407, -27969, -11585,	// w15 w07 w14 w06
	29692, 16819, 25172, -29692,	// w22 w20 w18 w16
	25172, 5906, -5906, -16819,	// w23 w21 w19 w17
	16819, 5906, 5906, 25172,	// w30 w28 w26 w24
	-29692, 25172, -16819, -29692,	// w31 w29 w27 w25

	//row3
	19266, 19266, 25172, -10426,	// w09 w01 w08 w00
	19266, 19266, 10426, -25172,	// w13 w05 w12 w04
	19266, -19266, 10426, 25172,	// w11 w03 w10 w02
	-19266, 19266, -25172, -10426,	// w15 w07 w14 w06,
	26722, 15137, 22654, -26722,	// w22 w20 w18 w16
	22654, 5315, -5315, -15137,	// w23 w21 w19 w17
	15137, 5315, 5315, 22654,	// w30 w28 w26 w24
	-26722, 22654, -15137, -26722,	// w31 w29 w27 w25,

	//row4 (same scaling as row0)
	16384, 16384, 21407, -8867,	// w09 w01 w08 w00
	16384, 16384, 8867, -21407,	// w13 w05 w12 w04
	16384, -16384, 8867, 21407,	// w11 w03 w10 w02
	-16384, 16384, -21407, -8867,	// w15 w07 w14 w06
	22725, 12873, 19266, -22725,	// w22 w20 w18 w16
	19266, 4520, -4520, -12873,	// w23 w21 w19 w17
	12873, 4520, 4520, 19266,	// w30 w28 w26 w24
	-22725, 19266, -12873, -22725,	// w31 w29 w27 w25

	//row5 (same scaling as row3)
	19266, 19266, 25172, -10426,	// w09 w01 w08 w00
	19266, 19266, 10426, -25172,	// w13 w05 w12 w04
	19266, -19266, 10426, 25172,	// w11 w03 w10 w02
	-19266, 19266, -25172, -10426,	// w15 w07 w14 w06
	26722, 15137, 22654, -26722,	// w22 w20 w18 w16
	22654, 5315, -5315, -15137,	// w23 w21 w19 w17
	15137, 5315, 5315, 22654,	// w30 w28 w26 w24
	-26722, 22654, -15137, -26722,	// w31 w29 w27 w25

	//row6 (same scaling as row2)
	21407, 21407, 27969, -11585,	// w09 w01 w08 w00
	21407, 21407, 11585, -27969,	// w13 w05 w12 w04
	21407, -21407, 11585, 27969,	// w11 w03 w10 w02
	-21407, 21407, -27969, -11585,	// w15 w07 w14 w06,
	29692, 16819, 25172, -29692,	// w22 w20 w18 w16
	25172, 5906, -5906, -16819,	// w23 w21 w19 w17
	16819, 5906, 5906, 25172,	// w30 w28 w26 w24
	-29692, 25172, -16819, -29692,	// w31 w29 w27 w25,

	//row7 (same scaling as row1)
	22725, 22725, 29692, -12299,	// w09 w01 w08 w00
	22725, 22725, 12299, -29692,	// w13 w05 w12 w04
	22725, -22725, 12299, 29692,	// w11 w03 w10 w02
	-22725, 22725, -29692, -12299,	// w15 w07 w14 w06,
	31521, 17855, 26722, -31521,	// w22 w20 w18 w16
	26722, 6270, -6270, -17855,	// w23 w21 w19 w17
	17855, 6270, 6270, 26722,	// w30 w28 w26 w24
	-31521, 26722, -17855, -31521	// w31 w29 w27 w25
};
152
153
/*
 * Row-address shorthands used by the column pass below: x<n>/y<n> point at
 * row n of the current input/output half-block (inp/out are int16_t*, rows
 * are 8 elements apart).  They expand against the local inp/out variables
 * of fdct_mmx(), so they are only meaningful inside that function.
 */
#define x0 (inp + 0*8)
#define x1 (inp + 1*8)
#define x2 (inp + 2*8)
#define x3 (inp + 3*8)
#define x4 (inp + 4*8)
#define x5 (inp + 5*8)
#define x6 (inp + 6*8)
#define x7 (inp + 7*8)
#define y0 (out + 0*8)
#define y1 (out + 1*8)
#define y2 (out + 2*8)
#define y3 (out + 3*8)
#define y4 (out + 4*8)
#define y5 (out + 5*8)
#define y6 (out + 6*8)
#define y7 (out + 7*8)

/* Alias retained from the original assembly source. */
#define round_frw_row fdct_r_row
172
173 ////////////////////////////////////////////////////////////////////////
174 //
// The high-level pseudocode for the fdct_mmx() routine :
//
// fdct_mmx()
178 // {
179 // forward_dct_col03(); // dct_column transform on cols 0-3
180 // forward_dct_col47(); // dct_column transform on cols 4-7
181 // for ( j = 0; j < 8; j=j+1 )
182 // forward_dct_row1(j); // dct_row transform on row #j
183 // }
184 //
185
186
/*
 * fdct_mmx -- AP-922 forward DCT of one 8x8 block of int16_t samples,
 * computed in place with the MMX inline-asm macros from mmx.h.
 *
 * Pass 1 (columns): two unrolled 4-column butterfly transforms, first on
 * columns 0-3, then on columns 4-7 (inp is advanced by 4 in between).
 * Pass 2 (rows): eight pmaddwd-based row transforms, one loop iteration
 * per row, each using its own 32-coefficient slice of tab_frw_01234567.
 *
 * NOTE(review): per the comment block below, the row pass does NOT
 * re-transpose, so the final output is transposed with respect to the
 * input -- confirm the consumer of blk[] expects this AP-922 convention.
 *
 * blk: pointer to 64 int16_t samples; overwritten with the coefficients.
 * Clobbers the MMX register file; ends with emms() so FPU code may follow.
 */
void fdct_mmx(int16_t *blk)
{
	int16_t *inp, *out;
	int16_t *table;
	int i;

	/* transform the left half of the matrix (4 columns) */

	out = inp = blk;

	/*
	 * for ( i = 0; i < 2; i = i + 1)
	 * the for-loop is executed twice.  We are better off unrolling the
	 * loop to avoid branch misprediction.
	 * .mmx32_fdct_col03:
	 */
	movq_m2r(*x1, mm0);			/* 0 ; x1 */

	movq_m2r(*x6, mm1);			/* 1 ; x6 */
	movq_r2r(mm0, mm2);			/* 2 ; x1 */

	movq_m2r(*x2, mm3);			/* 3 ; x2 */
	paddsw_r2r(mm1, mm0);			/* t1 = x[1] + x[6] */

	movq_m2r(*x5, mm4);			/* 4 ; x5 */
	psllw_i2r(SHIFT_FRW_COL, mm0);		/* t1 */

	movq_m2r(*x0, mm5);			/* 5 ; x0 */
	paddsw_r2r(mm3, mm4);			/* t2 = x[2] + x[5] */

	paddsw_m2r(*x7, mm5);			/* t0 = x[0] + x[7] */
	psllw_i2r(SHIFT_FRW_COL, mm4);		/* t2 */

	movq_r2r(mm0, mm6);			/* 6 ; t1 */
	psubsw_r2r(mm1, mm2);			/* 1 ; t6 = x[1] - x[6] */

	movq_m2r(*(fdct_tg_all_16 + 4), mm1);	/* 1 ; tg_2_16 */
	psubsw_r2r(mm4, mm0);			/* tm12 = t1 - t2 */

	movq_m2r(*x3, mm7);			/* x3 */
	pmulhw_r2r(mm0, mm1);			/* tm12*tg_2_16 */

	paddsw_m2r(*x4, mm7);			/* t3 = x[3] + x[4] */
	psllw_i2r(SHIFT_FRW_COL, mm5);		/* t0 */

	paddsw_r2r(mm4, mm6);			/* 4 ; tp12 = t1 + t2 */
	psllw_i2r(SHIFT_FRW_COL, mm7);		/* t3 */

	movq_r2r(mm5, mm4);			/* 4 ; t0 */
	psubsw_r2r(mm7, mm5);			/* tm03 = t0 - t3 */

	paddsw_r2r(mm5, mm1);			/* y2 = tm03 + tm12*tg_2_16 */
	paddsw_r2r(mm7, mm4);			/* 7 ; tp03 = t0 + t3 */

	por_m2r(fdct_one_corr, mm1);		/* correction y2 +0.5 */
	psllw_i2r(SHIFT_FRW_COL+1, mm2);	/* t6 */

	pmulhw_m2r(*(fdct_tg_all_16 + 4), mm5);	/* tm03*tg_2_16 */
	movq_r2r(mm4, mm7);			/* 7 ; tp03 */

	psubsw_m2r(*x5, mm3);			/* t5 = x[2] - x[5] */
	psubsw_r2r(mm6, mm4);			/* y4 = tp03 - tp12 */

	movq_r2m(mm1, *y2);			/* 1 ; save y2 */
	paddsw_r2r(mm6, mm7);			/* 6 ; y0 = tp03 + tp12 */

	movq_m2r(*x3, mm1);			/* 1 ; x3 */
	psllw_i2r(SHIFT_FRW_COL+1, mm3);	/* t5 */

	psubsw_m2r(*x4, mm1);			/* t4 = x[3] - x[4] */
	movq_r2r(mm2, mm6);			/* 6 ; t6 */

	movq_r2m(mm4, *y4);			/* 4 ; save y4 */
	paddsw_r2r(mm3, mm2);			/* t6 + t5 */

	pmulhw_m2r(*ocos_4_16, mm2);		/* tp65 = (t6 + t5)*cos_4_16 */
	psubsw_r2r(mm3, mm6);			/* 3 ; t6 - t5 */

	pmulhw_m2r(*ocos_4_16, mm6);		/* tm65 = (t6 - t5)*cos_4_16 */
	psubsw_r2r(mm0, mm5);			/* 0 ; y6 = tm03*tg_2_16 - tm12 */

	por_m2r(fdct_one_corr, mm5);		/* correction y6 +0.5 */
	psllw_i2r(SHIFT_FRW_COL, mm1);		/* t4 */

	por_m2r(fdct_one_corr, mm2);		/* correction tp65 +0.5 */
	movq_r2r(mm1, mm4);			/* 4 ; t4 */

	movq_m2r(*x0, mm3);			/* 3 ; x0 */
	paddsw_r2r(mm6, mm1);			/* tp465 = t4 + tm65 */

	psubsw_m2r(*x7, mm3);			/* t7 = x[0] - x[7] */
	psubsw_r2r(mm6, mm4);			/* 6 ; tm465 = t4 - tm65 */

	movq_m2r(*(fdct_tg_all_16 + 0), mm0);	/* 0 ; tg_1_16 */
	psllw_i2r(SHIFT_FRW_COL, mm3);		/* t7 */

	movq_m2r(*(fdct_tg_all_16 + 8), mm6);	/* 6 ; tg_3_16 */
	pmulhw_r2r(mm1, mm0);			/* tp465*tg_1_16 */

	movq_r2m(mm7, *y0);			/* 7 ; save y0 */
	pmulhw_r2r(mm4, mm6);			/* tm465*tg_3_16 */

	movq_r2m(mm5, *y6);			/* save y6 */
	movq_r2r(mm3, mm7);			/* t7 */

	movq_m2r(*(fdct_tg_all_16 + 8), mm5);	/* 5 ; tg_3_16 */
	psubsw_r2r(mm2, mm7);			/* tm765 = t7 - tp65 */

	paddsw_r2r(mm2, mm3);			/* 2 ; tp765 = t7 + tp65 */
	pmulhw_r2r(mm7, mm5);			/* tm765*tg_3_16 */

	paddsw_r2r(mm3, mm0);			/* y1 = tp765 + tp465*tg_1_16 */
	paddsw_r2r(mm4, mm6);			/* tm465*tg_3_16 (add tm465 back: tg_3_16 is stored minus one) */

	pmulhw_m2r(*(fdct_tg_all_16 + 0), mm3);	/* tp765*tg_1_16 */

	por_m2r(fdct_one_corr, mm0);		/* correction y1 +0.5 */
	paddsw_r2r(mm7, mm5);			/* tm765*tg_3_16 (add tm765 back: tg_3_16 is stored minus one) */

	psubsw_r2r(mm6, mm7);			/* 6 ; y3 = tm765 - tm465*tg_3_16 */
	inp += 4;				/* advance to columns 4-7 for the second half */

	movq_r2m(mm0, *y1);			/* 0 ; save y1 */
	paddsw_r2r(mm4, mm5);			/* 4 ; y5 = tm765*tg_3_16 + tm465 */

	movq_r2m(mm7, *y3);			/* 7 ; save y3 */
	psubsw_r2r(mm1, mm3);			/* 1 ; y7 = tp765*tg_1_16 - tp465 */

	movq_r2m(mm5, *y5);			/* save y5 */

	/* .mmx32_fdct_col47: ; begin processing last four columns */
	movq_m2r(*x1, mm0);			/* 0 ; x1 */

	movq_r2m(mm3, *y7);			/* 3 ; save y7 (columns 0-3) */

	movq_m2r(*x6, mm1);			/* 1 ; x6 */
	movq_r2r(mm0, mm2);			/* 2 ; x1 */

	movq_m2r(*x2, mm3);			/* 3 ; x2 */
	paddsw_r2r(mm1, mm0);			/* t1 = x[1] + x[6] */

	movq_m2r(*x5, mm4);			/* 4 ; x5 */
	psllw_i2r(SHIFT_FRW_COL, mm0);		/* t1 */

	movq_m2r(*x0, mm5);			/* 5 ; x0 */
	paddsw_r2r(mm3, mm4);			/* t2 = x[2] + x[5] */

	paddsw_m2r(*x7, mm5);			/* t0 = x[0] + x[7] */
	psllw_i2r(SHIFT_FRW_COL, mm4);		/* t2 */

	movq_r2r(mm0, mm6);			/* 6 ; t1 */
	psubsw_r2r(mm1, mm2);			/* 1 ; t6 = x[1] - x[6] */

	movq_m2r(*(fdct_tg_all_16 + 4), mm1);	/* 1 ; tg_2_16 */
	psubsw_r2r(mm4, mm0);			/* tm12 = t1 - t2 */

	movq_m2r(*x3, mm7);			/* 7 ; x3 */
	pmulhw_r2r(mm0, mm1);			/* tm12*tg_2_16 */

	paddsw_m2r(*x4, mm7);			/* t3 = x[3] + x[4] */
	psllw_i2r(SHIFT_FRW_COL, mm5);		/* t0 */

	paddsw_r2r(mm4, mm6);			/* 4 ; tp12 = t1 + t2 */
	psllw_i2r(SHIFT_FRW_COL, mm7);		/* t3 */

	movq_r2r(mm5, mm4);			/* 4 ; t0 */
	psubsw_r2r(mm7, mm5);			/* tm03 = t0 - t3 */

	paddsw_r2r(mm5, mm1);			/* y2 = tm03 + tm12*tg_2_16 */
	paddsw_r2r(mm7, mm4);			/* 7 ; tp03 = t0 + t3 */

	por_m2r(fdct_one_corr, mm1);		/* correction y2 +0.5 */
	psllw_i2r(SHIFT_FRW_COL+1, mm2);	/* t6 */

	pmulhw_m2r(*(fdct_tg_all_16 + 4), mm5);	/* tm03*tg_2_16 */
	movq_r2r(mm4, mm7);			/* 7 ; tp03 */

	psubsw_m2r(*x5, mm3);			/* t5 = x[2] - x[5] */
	psubsw_r2r(mm6, mm4);			/* y4 = tp03 - tp12 */

	movq_r2m(mm1, *(y2+4));			/* save y2 (columns 4-7) */
	paddsw_r2r(mm6, mm7);			/* y0 = tp03 + tp12 */

	movq_m2r(*x3, mm1);			/* 1 ; x3 */
	psllw_i2r(SHIFT_FRW_COL+1, mm3);	/* t5 */

	psubsw_m2r(*x4, mm1);			/* t4 = x[3] - x[4] */
	movq_r2r(mm2, mm6);			/* 6 ; t6 */

	movq_r2m(mm4, *(y4+4));			/* save y4 (columns 4-7) */
	paddsw_r2r(mm3, mm2);			/* t6 + t5 */

	pmulhw_m2r(*ocos_4_16, mm2);		/* tp65 = (t6 + t5)*cos_4_16 */
	psubsw_r2r(mm3, mm6);			/* 3 ; t6 - t5 */

	pmulhw_m2r(*ocos_4_16, mm6);		/* tm65 = (t6 - t5)*cos_4_16 */
	psubsw_r2r(mm0, mm5);			/* 0 ; y6 = tm03*tg_2_16 - tm12 */

	por_m2r(fdct_one_corr, mm5);		/* correction y6 +0.5 */
	psllw_i2r(SHIFT_FRW_COL, mm1);		/* t4 */

	por_m2r(fdct_one_corr, mm2);		/* correction tp65 +0.5 */
	movq_r2r(mm1, mm4);			/* 4 ; t4 */

	movq_m2r(*x0, mm3);			/* 3 ; x0 */
	paddsw_r2r(mm6, mm1);			/* tp465 = t4 + tm65 */

	psubsw_m2r(*x7, mm3);			/* t7 = x[0] - x[7] */
	psubsw_r2r(mm6, mm4);			/* 6 ; tm465 = t4 - tm65 */

	movq_m2r(*(fdct_tg_all_16 + 0), mm0);	/* 0 ; tg_1_16 */
	psllw_i2r(SHIFT_FRW_COL, mm3);		/* t7 */

	movq_m2r(*(fdct_tg_all_16 + 8), mm6);	/* 6 ; tg_3_16 */
	pmulhw_r2r(mm1, mm0);			/* tp465*tg_1_16 */

	movq_r2m(mm7, *(y0+4));			/* 7 ; save y0 (columns 4-7) */
	pmulhw_r2r(mm4, mm6);			/* tm465*tg_3_16 */

	movq_r2m(mm5, *(y6+4));			/* 5 ; save y6 (columns 4-7) */
	movq_r2r(mm3, mm7);			/* 7 ; t7 */

	movq_m2r(*(fdct_tg_all_16 + 8), mm5);	/* 5 ; tg_3_16 */
	psubsw_r2r(mm2, mm7);			/* tm765 = t7 - tp65 */

	paddsw_r2r(mm2, mm3);			/* 2 ; tp765 = t7 + tp65 */
	pmulhw_r2r(mm7, mm5);			/* tm765*tg_3_16 */

	paddsw_r2r(mm3, mm0);			/* y1 = tp765 + tp465*tg_1_16 */
	paddsw_r2r(mm4, mm6);			/* tm465*tg_3_16 (add tm465 back: tg_3_16 is stored minus one) */

	pmulhw_m2r(*(fdct_tg_all_16 + 0), mm3);	/* tp765*tg_1_16 */

	por_m2r(fdct_one_corr, mm0);		/* correction y1 +0.5 */
	paddsw_r2r(mm7, mm5);			/* tm765*tg_3_16 (add tm765 back: tg_3_16 is stored minus one) */

	psubsw_r2r(mm6, mm7);			/* 6 ; y3 = tm765 - tm465*tg_3_16 */

	movq_r2m(mm0, *(y1+4));			/* 0 ; save y1 (columns 4-7) */
	paddsw_r2r(mm4, mm5);			/* 4 ; y5 = tm765*tg_3_16 + tm465 */

	movq_r2m(mm7, *(y3+4));			/* 7 ; save y3 (columns 4-7) */
	psubsw_r2r(mm1, mm3);			/* 1 ; y7 = tp765*tg_1_16 - tp465 */

	movq_r2m(mm5, *(y5+4));			/* 5 ; save y5 (columns 4-7) */

	movq_r2m(mm3, *(y7+4));			/* 3 ; save y7 (columns 4-7) */

	/* } ; end of forward_dct_col47() */
	/* done with the column transforms; the row pass follows */

	/*
	 * fdct_mmx32_cols() --
	 * the following code performs the row-transform operation,
	 * except with different shift&round constants.  This version
	 * does NOT transpose the output again.  Thus the final output
	 * is transposed with respect to the source.
	 *
	 * The output is stored into blk[], which destroys the original
	 * input data.
	 */

	out = inp = blk;

	table = (int16_t *)tab_frw_01234567;

	/*
	 * for ( x = 8; x > 0; --x ) ; transform one row per iteration
	 * ---------- loop begin
	 */
	for(i=0; i<8; i++)
	{
		/* Gather the row into "butterfly" order and form xt0..xt7 =
		 * x[k]+x[7-k] / x[k]-x[7-k] sums and differences. */
		movd_m2r(*(inp+6), mm5);	/* mm5 = 7 6 */

		punpcklwd_m2r(*(inp+4), mm5);	/* mm5 = 5 7 4 6 */

		movq_r2r(mm5, mm2);		/* mm2 = 5 7 4 6 */
		psrlq_i2r(32, mm5);		/* mm5 = _ _ 5 7 */

		movq_m2r(*inp, mm0);		/* mm0 = 3 2 1 0 */
		punpcklwd_r2r(mm2, mm5);	/* mm5 = 4 5 6 7 */

		movq_r2r(mm0, mm1);		/* mm1 = 3 2 1 0 */
		paddsw_r2r(mm5, mm0);		/* mm0 = [3+4, 2+5, 1+6, 0+7] (xt3, xt2, xt1, xt0) */

		psubsw_r2r(mm5, mm1);		/* mm1 = [3-4, 2-5, 1-6, 0-7] (xt7, xt6, xt5, xt4) */
		movq_r2r(mm0, mm2);		/* mm2 = [ xt3 xt2 xt1 xt0 ] */

		/* movq [ xt3xt2xt1xt0 ], mm0; */
		/* movq [ xt7xt6xt5xt4 ], mm1; */

		punpcklwd_r2r(mm1, mm0);	/* mm0 = [ xt5 xt1 xt4 xt0 ] */

		punpckhwd_r2r(mm1, mm2);	/* mm2 = [ xt7 xt3 xt6 xt2 ] */
		movq_r2r(mm2, mm1);		/* mm1 */

		/* shuffle bytes around */

		/* movq mm0, [INP] ; 0 ; x3 x2 x1 x0 */
		/* movq mm1, [INP+8] ; 1 ; x7 x6 x5 x4 */
		movq_r2r(mm0, mm2);		/* 2 ; x3 x2 x1 x0 */

		movq_m2r(*table, mm3);		/* 3 ; w06 w04 w02 w00 */
		punpcklwd_r2r(mm1, mm0);	/* x5 x1 x4 x0 */

		movq_r2r(mm0, mm5);		/* 5 ; x5 x1 x4 x0 */
		punpckldq_r2r(mm0, mm0);	/* x4 x0 x4 x0 [ xt2 xt0 xt2 xt0 ] */

		movq_m2r(*(table+4), mm4);	/* 4 ; w07 w05 w03 w01 */
		punpckhwd_r2r(mm1, mm2);	/* 1 ; x7 x3 x6 x2 */

		pmaddwd_r2r(mm0, mm3);		/* x4*w06+x0*w04 x4*w02+x0*w00 */
		movq_r2r(mm2, mm6);		/* 6 ; x7 x3 x6 x2 */

		movq_m2r(*(table+16), mm1);	/* 1 ; w22 w20 w18 w16 */
		punpckldq_r2r(mm2, mm2);	/* x6 x2 x6 x2 [ xt3 xt1 xt3 xt1 ] */

		pmaddwd_r2r(mm2, mm4);		/* x6*w07+x2*w05 x6*w03+x2*w01 */
		punpckhdq_r2r(mm5, mm5);	/* x5 x1 x5 x1 [ xt6 xt4 xt6 xt4 ] */

		pmaddwd_m2r(*(table+8), mm0);	/* x4*w14+x0*w12 x4*w10+x0*w08 */
		punpckhdq_r2r(mm6, mm6);	/* x7 x3 x7 x3 [ xt7 xt5 xt7 xt5 ] */

		movq_m2r(*(table+20), mm7);	/* 7 ; w23 w21 w19 w17 */
		pmaddwd_r2r(mm5, mm1);		/* x5*w22+x1*w20 x5*w18+x1*w16 */

		/*
		 * Register map for the accumulation below:
		 * mm3 = a1, a0 (y2,y0)
		 * mm1 = b1, b0 (y3,y1)
		 * mm0 = a3, a2 (y6,y4)
		 * mm5 = b3, b2 (y7,y5)
		 */

		paddd_m2r(*round_frw_row, mm3);	/* +rounder (y2,y0) */
		pmaddwd_r2r(mm6, mm7);		/* x7*w23+x3*w21 x7*w19+x3*w17 */

		pmaddwd_m2r(*(table+12), mm2);	/* x6*w15+x2*w13 x6*w11+x2*w09 */
		paddd_r2r(mm4, mm3);		/* 4 ; a1=sum(even1) a0=sum(even0) ; now ( y2, y0) */

		pmaddwd_m2r(*(table+24), mm5);	/* x5*w30+x1*w28 x5*w26+x1*w24 */

		pmaddwd_m2r(*(table+28), mm6);	/* x7*w31+x3*w29 x7*w27+x3*w25 */
		paddd_r2r(mm7, mm1);		/* 7 ; b1=sum(odd1) b0=sum(odd0) ; now ( y3, y1) */

		paddd_m2r(*round_frw_row, mm0);	/* +rounder (y6,y4) */
		psrad_i2r(SHIFT_FRW_ROW, mm3);	/* (y2, y0) */

		paddd_m2r(*round_frw_row, mm1);	/* +rounder (y3,y1) */
		paddd_r2r(mm2, mm0);		/* 2 ; a3=sum(even3) a2=sum(even2) ; now (y6, y4) */

		paddd_m2r(*round_frw_row, mm5);	/* +rounder (y7,y5) */
		psrad_i2r(SHIFT_FRW_ROW, mm1);	/* (y3, y1) */

		paddd_r2r(mm6, mm5);		/* 6 ; b3=sum(odd3) b2=sum(odd2) ; now ( y7, y5) */
		psrad_i2r(SHIFT_FRW_ROW, mm0);	/* (y6, y4) */

		out += 8;			/* increment row-output address by 1 row */
		psrad_i2r(SHIFT_FRW_ROW, mm5);	/* (y7, y5) */

		inp += 8;			/* increment row-address by 1 row */
		packssdw_r2r(mm0, mm3);		/* 0 ; y6 y4 y2 y0 */

		packssdw_r2r(mm5, mm1);		/* 3 ; y7 y5 y3 y1 */
		movq_r2r(mm3, mm6);		/* mm6 = y6 y4 y2 y0 */

		punpcklwd_r2r(mm1, mm3);	/* y3 y2 y1 y0 */

		punpckhwd_r2r(mm1, mm6);	/* y7 y6 y5 y4 */
		table += 32;			/* next row uses the next 32-coefficient slice */

		movq_r2m(mm3, *(out-8));	/* 1 ; save y3 y2 y1 y0 (out was pre-incremented above) */

		movq_r2m(mm6, *(out-4));	/* 7 ; save y7 y6 y5 y4 */
	}

	emms();
}
564