1 /*
2 * YUY2.c
3 *
4 * Copyright (C) Charles 'Buck' Krasic - April 2000
5 * Copyright (C) Erik Walthinsen - April 2000
6 *
7 * This file is part of libdv, a free DV (IEC 61834/SMPTE 314M)
8 * codec.
9 *
10 * libdv is free software; you can redistribute it and/or modify it
11 * under the terms of the GNU Lesser Public License as published by
12 * the Free Software Foundation; either version 2.1, or (at your
13 * option) any later version.
14 *
15 * libdv is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser Public License
21 * along with libdv; see the file COPYING. If not, write to
22 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * The libdv homepage is http://libdv.sourceforge.net/.
25 */
26
27 /* Most of this file is derived from patch 101018 submitted by Stefan
28 * Lucke */
29
30 #if HAVE_CONFIG_H
31 # include <config.h>
32 #endif
33
34 #include <stdlib.h>
35
36 #include "YUY2.h"
37
38 #if ARCH_X86 || ARCH_X86_64
39 #include "mmx.h"
#endif // ARCH_X86 || ARCH_X86_64
41
/* Lookup tables for mapping signed to unsigned, and clamping.
 * The "real_" arrays provide the storage; the bare pointers are set by
 * dv_YUY2_init() to point into the middle of them so they can be
 * indexed directly with signed coefficient values:
 *   uvlut       : index -128..127  (chroma)
 *   ylut        : index -256..511  (luma)
 *   ylut_setup  : index -256..511  (luma with +16 NTSC setup added) */
static unsigned char real_uvlut[256], *uvlut;
static unsigned char real_ylut[768], *ylut;
static unsigned char real_ylut_setup[768], *ylut_setup;

#if ARCH_X86 || ARCH_X86_64
/* Define some constants used in MMX range mapping and clamping logic.
 *
 * Naming: mmx_c{c,l,b}{h,l,b} = clamp {chroma, luma, both} constants for
 * the {high-clamp, low-clamp, black-offset} step.  The MMX paths clamp
 * packed YUY2 bytes with unsigned saturating arithmetic:
 *     x' = ((x +sat HIGH) -sat LOW) +sat BLACK
 * In a YUY2 16-bit word the luma is the low byte and the chroma the
 * high byte, so e.g. the luma constants (+20, -36, +16) clamp luma to
 * 16..235 and the chroma constants (+15, -31, +16) clamp chroma to
 * 16..240 — matching the table-based ranges built in dv_YUY2_init().
 *
 * mmx_0x8080s rebiases signed bytes (from packsswb) to unsigned by
 * adding 128 to every byte; mmx_0x0010s is folded into the high-clamp
 * constant to add the NTSC 7.5 IRE setup (+16 on luma). */
static mmx_t mmx_0x0010s = (mmx_t) 0x0010001000100010LL,
             mmx_0x8080s = (mmx_t) 0x8080808080808080LL,
             mmx_zero = (mmx_t) 0x0000000000000000LL,
             mmx_cch = (mmx_t) 0x0f000f000f000f00LL,  /* chroma high   */
             mmx_ccl = (mmx_t) 0x1f001f001f001f00LL,  /* chroma low    */
             mmx_ccb = (mmx_t) 0x1000100010001000LL,  /* chroma black  */
             mmx_clh = (mmx_t) 0x0014001400140014LL,  /* luma high     */
             mmx_cll = (mmx_t) 0x0024002400240024LL,  /* luma low      */
             mmx_clb = (mmx_t) 0x0010001000100010LL,  /* luma black    */
             mmx_cbh = (mmx_t) 0x0f140f140f140f14LL,  /* both high     */
             mmx_cbl = (mmx_t) 0x1f241f241f241f24LL,  /* both low      */
             mmx_cbb = (mmx_t) 0x1010101010101010LL;  /* both black    */

#endif // ARCH_X86 || ARCH_X86_64
63
64 /* ----------------------------------------------------------------------------
65 */
66 void
dv_YUY2_init(int clamp_luma,int clamp_chroma)67 dv_YUY2_init(int clamp_luma, int clamp_chroma) {
68 int i;
69 int value;
70
71 uvlut = real_uvlut + 128; // index from -128 .. 127
72 for(i=-128;
73 i<128;
74 ++i) {
75 value = i + 128;
76 if (clamp_chroma == TRUE) value = CLAMP(value, 16, 240);
77 uvlut[i] = value;
78 } /* for */
79
80 ylut = real_ylut + 256; // index from -256 .. 511
81 ylut_setup = real_ylut_setup + 256;
82 for(i=-256;
83 i<512;
84 ++i) {
85 value = i + 128;
86 if (clamp_luma == TRUE) value = CLAMP(value, 16, 235);
87 else value = CLAMP(value, 0, 255);
88 ylut[i] = value;
89 value += 16;
90 ylut_setup[i] = CLAMP(value, 0, 255);
91 } /* for */
92 } /* dv_YUY2_init */
93
94 /* ----------------------------------------------------------------------------
95 */
96 void
dv_mb411_YUY2(dv_macroblock_t * mb,uint8_t ** pixels,int * pitches,int add_ntsc_setup)97 dv_mb411_YUY2(dv_macroblock_t *mb, uint8_t **pixels, int *pitches, int add_ntsc_setup) {
98 dv_coeff_t *Y[4], *cr_frame, *cb_frame;
99 unsigned char *pyuv, *pwyuv, cb, cr, *my_ylut;
100 int i, j, row;
101
102 Y [0] = mb->b[0].coeffs;
103 Y [1] = mb->b[1].coeffs;
104 Y [2] = mb->b[2].coeffs;
105 Y [3] = mb->b[3].coeffs;
106 cr_frame = mb->b[4].coeffs;
107 cb_frame = mb->b[5].coeffs;
108
109 pyuv = pixels[0] + (mb->x * 2) + (mb->y * pitches[0]);
110 my_ylut = (add_ntsc_setup == TRUE ? ylut_setup : ylut);
111
112 for (row = 0; row < 8; ++row) { // Eight rows
113 pwyuv = pyuv;
114
115 for (i = 0; i < 4; ++i) { // Four Y blocks
116 dv_coeff_t *Ytmp = Y[i]; // less indexing in inner loop speedup?
117
118
119 for (j = 0; j < 2; ++j) { // two 4-pixel spans per Y block
120
121 cb = uvlut[CLAMP(*cb_frame, -128, 127)];
122 cr = uvlut[CLAMP(*cr_frame, -128, 127)];
123 cb_frame++;
124 cr_frame++;
125
126 #if 0 /* DV_IS_BIG_ENDIAN */
127 *pwyuv++ = cb;
128 *pwyuv++ = my_ylut[CLAMP(*Ytmp, -256, 511)];
129 Ytmp++;
130 *pwyuv++ = cr;
131 *pwyuv++ = my_ylut[CLAMP(*Ytmp, -256, 511)];
132 Ytmp++;
133
134 *pwyuv++ = cb;
135 *pwyuv++ = my_ylut[CLAMP(*Ytmp, -256, 511)];
136 Ytmp++;
137 *pwyuv++ = cr;
138 *pwyuv++ = my_ylut[CLAMP(*Ytmp, -256, 511)];
139 Ytmp++;
140 #else
141 *pwyuv++ = my_ylut[CLAMP(*Ytmp, -256, 511)];
142 *pwyuv++ = cb;
143 Ytmp++;
144 *pwyuv++ = my_ylut[CLAMP(*Ytmp, -256, 511)];
145 *pwyuv++ = cr;
146 Ytmp++;
147
148 *pwyuv++ = my_ylut[CLAMP(*Ytmp, -256, 511)];
149 *pwyuv++ = cb;
150 Ytmp++;
151 *pwyuv++ = my_ylut[CLAMP(*Ytmp, -256, 511)];
152 *pwyuv++ = cr;
153 Ytmp++;
154 #endif
155 } /* for j */
156
157 Y[i] = Ytmp;
158 } /* for i */
159
160 pyuv += pitches[0];
161 } /* for row */
162 } /* dv_mb411_YUY2 */
163
164 /* ----------------------------------------------------------------------------
165 */
166 void
dv_mb411_right_YUY2(dv_macroblock_t * mb,uint8_t ** pixels,int * pitches,int add_ntsc_setup)167 dv_mb411_right_YUY2(dv_macroblock_t *mb, uint8_t **pixels, int *pitches, int add_ntsc_setup) {
168
169 dv_coeff_t *Y[4], *Ytmp, *cr_frame, *cb_frame;
170 unsigned char *pyuv, *pwyuv, cb, cr, *my_ylut;
171 int i, j, col, row;
172
173
174 Y[0] = mb->b[0].coeffs;
175 Y[1] = mb->b[1].coeffs;
176 Y[2] = mb->b[2].coeffs;
177 Y[3] = mb->b[3].coeffs;
178
179 pyuv = pixels[0] + (mb->x * 2) + (mb->y * pitches[0]);
180 my_ylut = (add_ntsc_setup == TRUE ? ylut_setup : ylut);
181
182 for (j = 0; j < 4; j += 2) { // Two rows of blocks
183 cr_frame = mb->b[4].coeffs + (j * 2);
184 cb_frame = mb->b[5].coeffs + (j * 2);
185
186 for (row = 0; row < 8; row++) {
187 pwyuv = pyuv;
188
189 for (i = 0; i < 2; ++i) { // Two columns of blocks
190 Ytmp = Y[j + i];
191
192 for (col = 0; col < 8; col+=4) { // two 4-pixel spans per Y block
193
194 cb = uvlut[*cb_frame];
195 cr = uvlut[*cr_frame];
196 cb_frame++;
197 cr_frame++;
198
199 #if 0 /* DV_IS_BIG_ENDIAN */
200 *pwyuv++ = cb;
201 *pwyuv++ = my_ylut[CLAMP(*Ytmp, -256, 511)];
202 Ytmp++;
203 *pwyuv++ = cr;
204 *pwyuv++ = my_ylut[CLAMP(*Ytmp, -256, 511)];
205 Ytmp++;
206
207 *pwyuv++ = cb;
208 *pwyuv++ = my_ylut[CLAMP(*Ytmp, -256, 511)];
209 Ytmp++;
210 *pwyuv++ = cr;
211 *pwyuv++ = my_ylut[CLAMP(*Ytmp, -256, 511)];
212 Ytmp++;
213 #else
214 *pwyuv++ = my_ylut[CLAMP(*Ytmp, -256, 511)];
215 Ytmp++;
216 *pwyuv++ = cb;
217 *pwyuv++ = my_ylut[CLAMP(*Ytmp, -256, 511)];
218 Ytmp++;
219 *pwyuv++ = cr;
220
221 *pwyuv++ = my_ylut[CLAMP(*Ytmp, -256, 511)];
222 Ytmp++;
223 *pwyuv++ = cb;
224 *pwyuv++ = my_ylut[CLAMP(*Ytmp, -256, 511)];
225 Ytmp++;
226 *pwyuv++ = cr;
227 #endif
228 } /* for col */
229 Y[j + i] = Ytmp;
230
231 } /* for i */
232
233 cb_frame += 4;
234 cr_frame += 4;
235 pyuv += pitches[0];
236 } /* for row */
237
238 } /* for j */
239 } /* dv_mb411_right_YUY2 */
240
241 /* ----------------------------------------------------------------------------
242 */
/* Render one 4:2:0 (PAL DV) macroblock as packed YUY2.
 *
 * mb:             decoded macroblock (b[0..3] luma, b[4]/b[5] Cr/Cb).
 * pixels/pitches: destination plane 0 base pointer and row stride.
 *
 * The four Y blocks are arranged 2x2.  Each chroma sample covers a
 * 2x2 luma area, but the two luma lines sharing it are written two
 * image lines apart (pwyuv0 vs pwyuv1 = pyuv + 2*pitch), i.e. chroma
 * is treated as field-based — NOTE(review): inferred from the +16/+17
 * coefficient offsets (two 8-coefficient rows down) and the row&1
 * stepping; confirm against the DV 4:2:0 field layout. */
void
dv_mb420_YUY2 (dv_macroblock_t *mb, uint8_t **pixels, int *pitches) {
  dv_coeff_t *Y [4], *Ytmp0, *cr_frame, *cb_frame;
  unsigned char *pyuv,*pwyuv0, *pwyuv1,
                cb, cr;
  int i, j, col, row, inc_l2, inc_l4;

  pyuv = pixels[0] + (mb->x * 2) + (mb->y * pitches[0]);

  Y [0] = mb->b[0].coeffs;
  Y [1] = mb->b[1].coeffs;
  Y [2] = mb->b[2].coeffs;
  Y [3] = mb->b[3].coeffs;
  cr_frame = mb->b[4].coeffs;
  cb_frame = mb->b[5].coeffs;
  inc_l2 = pitches[0];      /* one image line  */
  inc_l4 = pitches[0]*2;    /* two image lines */

  for (j = 0; j < 4; j += 2) { // Two rows of blocks j, j+1
    for (row = 0; row < 4; row++) { // 4 pairs of two rows
      pwyuv0 = pyuv;            /* upper line of the pair */
      pwyuv1 = pyuv + inc_l4;   /* line two below it      */
      for (i = 0; i < 2; ++i) { // Two columns of blocks
        Ytmp0 = Y[j + i];
        for (col = 0; col < 4; ++col) { // 4 spans of 2x2 pixels
          cb = uvlut[CLAMP(*cb_frame, -128, 127)];
          cr = uvlut[CLAMP(*cr_frame, -128, 127)];
          cb_frame++;
          cr_frame++;

#if DV_IS_LITTLE_ENDIAN
          /* +0/+1: two adjacent lumas on this line;
           * +16/+17: same columns two coefficient rows down. */
          *pwyuv0++ = ylut[CLAMP(*(Ytmp0 + 0), -256, 511)];
          *pwyuv0++ = cb;
          *pwyuv0++ = ylut[CLAMP(*(Ytmp0 + 1), -256, 511)];
          *pwyuv0++ = cr;

          *pwyuv1++ = ylut[CLAMP(*(Ytmp0 + 16), -256, 511)];
          *pwyuv1++ = cb;
          *pwyuv1++ = ylut[CLAMP(*(Ytmp0 + 17), -256, 511)];
          *pwyuv1++ = cr;
#else
          /* Big-endian: same samples, byte order reversed per 32 bits. */
          *pwyuv0++ = cr;
          *pwyuv0++ = ylut[CLAMP(*(Ytmp0 + 1), -256, 511)];
          *pwyuv0++ = cb;
          *pwyuv0++ = ylut[CLAMP(*(Ytmp0 + 0), -256, 511)];

          *pwyuv1++ = cr;
          *pwyuv1++ = ylut[CLAMP(*(Ytmp0 + 17), -256, 511)];
          *pwyuv1++ = cb;
          *pwyuv1++ = ylut[CLAMP(*(Ytmp0 + 16), -256, 511)];
#endif
          Ytmp0 += 2;
        }
        /* After an odd row pair, skip the coefficient row that was
         * already consumed via the +16/+17 accesses. */
        if (row & 1) {
          Ytmp0 += 16;
        }
        Y[j + i] = Ytmp0;
      }
      pyuv += inc_l2;
      if (row & 1) {
        pyuv += inc_l4;
      }
    }
  }
}
308
309 #if ARCH_X86 || ARCH_X86_64
310
311 /* TODO (by Buck):
312 *
313 * When testing YV12.c, I discovered that my video card (RAGE 128)
314 * doesn't care that pixel components are strictly clamped to Y
315 * (16..235), UV (16..240). So I was able to use MMX pack with
316 * unsigned saturation to go from 16 to 8 bit representation. This
317 * clamps to (0..255). Applying this to the code here might allow
318 * us to reduce instruction count below.
319 *
320 * Question: do other video cards behave the same way?
321 * DRD> No, your video card will not care about clamping to these
322 * ranges. This issue only applies to analog NTSC output.
323 * */
/* MMX version of dv_mb411_YUY2: render one 4:1:1 macroblock as packed
 * YUY2.
 *
 * mb:             decoded macroblock (b[0..3] luma, b[4]/b[5] Cr/Cb).
 * pixels/pitches: destination plane 0 base pointer and row stride.
 * add_ntsc_setup: nonzero folds the +16 NTSC setup into the high-clamp
 *                 constant.
 * clamp_luma/clamp_chroma: select which clamp constants are loaded
 *                 into mm5 (high), mm6 (low), mm7 (black offset); the
 *                 zero constants make the clamp a no-op.
 *
 * Per 8-sample group: interleave Y with doubled Cb/Cr words, pack
 * signed 16-bit to signed 8-bit with saturation, rebias by +128 to
 * unsigned, then clamp via saturating add/sub/add. */
void
dv_mb411_YUY2_mmx(dv_macroblock_t *mb, uint8_t **pixels, int *pitches,
                  int add_ntsc_setup, int clamp_luma, int clamp_chroma) {
  dv_coeff_t *Y[4], *cr_frame, *cb_frame;
  unsigned char *pyuv, *pwyuv;
  int i, row;

  Y[0] = mb->b[0].coeffs;
  Y[1] = mb->b[1].coeffs;
  Y[2] = mb->b[2].coeffs;
  Y[3] = mb->b[3].coeffs;
  cr_frame = mb->b[4].coeffs;
  cb_frame = mb->b[5].coeffs;

  pyuv = pixels[0] + (mb->x * 2) + (mb->y * pitches[0]);

  if (clamp_luma && clamp_chroma) {
    movq_m2r (mmx_cbh, mm5);
    movq_m2r (mmx_cbl, mm6);
    movq_m2r (mmx_cbb, mm7);
  } else if (clamp_luma) {
    movq_m2r (mmx_clh, mm5);
    movq_m2r (mmx_cll, mm6);
    movq_m2r (mmx_clb, mm7);
  } else if (clamp_chroma) {
    movq_m2r (mmx_cch, mm5);
    movq_m2r (mmx_ccl, mm6);
    movq_m2r (mmx_ccb, mm7);
  } else {
    movq_m2r (mmx_zero, mm5);
    movq_m2r (mmx_zero, mm6);
    movq_m2r (mmx_zero, mm7);
  }

  if (add_ntsc_setup)
    paddusb_m2r (mmx_0x0010s, mm5); /* add setup to hi clamp */

  for (row = 0; row < 8; ++row) { // Eight rows
    pwyuv = pyuv;
    for (i = 0; i < 4; ++i) { // Four Y blocks
      dv_coeff_t *Ytmp = Y [i]; // less indexing in inner loop speedup?
      /* ---------------------------------------------------------------------
       * first four pixels: pair chroma words, double the low Cb/Cr pair
       */
      movq_m2r (*cb_frame, mm2); // cb0 cb1 cb2 cb3
      movq_m2r (*cr_frame, mm3); // cr0 cr1 cr2 cr3
      punpcklwd_r2r (mm3, mm2); // cb0cr0 cb1cr1
      movq_r2r (mm2, mm3);
      punpckldq_r2r (mm2, mm2); // cb0cr0 cb0cr0
      movq_m2r (Ytmp [0], mm0);
      movq_r2r (mm0, mm1);
      punpcklwd_r2r (mm2, mm0);
      punpckhwd_r2r (mm2, mm1);

      packsswb_r2r (mm1, mm0);       /* 16-bit -> signed 8-bit, saturated */
      paddb_m2r (mmx_0x8080s, mm0);  /* rebias signed -> unsigned (+128)  */
      paddusb_r2r (mm5, mm0); /* clamp high */
      psubusb_r2r (mm6, mm0); /* clamp low */
      paddusb_r2r (mm7, mm0); /* to black level */
      movq_r2m (mm0, pwyuv [0]);

      /* ---------------------------------------------------------------------
       * next four pixels with the second (doubled) Cb/Cr pair from mm3
       */
      movq_m2r (Ytmp [4], mm0);
      punpckhdq_r2r (mm3, mm3);
      movq_r2r (mm0, mm1);
      punpcklwd_r2r (mm3, mm0);
      punpckhwd_r2r (mm3, mm1);

      packsswb_r2r (mm1, mm0);
      paddb_m2r (mmx_0x8080s, mm0);
      paddusb_r2r (mm5, mm0); /* clamp high */
      psubusb_r2r (mm6, mm0); /* clamp low */
      paddusb_r2r (mm7, mm0); /* to black level */
      movq_r2m (mm0, pwyuv [8]);

      pwyuv += 16;
      cr_frame += 2;
      cb_frame += 2;
      Y [i] = Ytmp + 8;
    } /* for i */
    pyuv += pitches[0];
  } /* for row */
  emms ();
} /* dv_mb411_YUY2_mmx */
408
409 /* ----------------------------------------------------------------------------
410 */
/* MMX version of dv_mb411_right_YUY2: render a right-edge 4:1:1
 * macroblock (Y blocks arranged 2x2) as packed YUY2.
 *
 * mm5/mm6/mm7 hold the high/low/black clamp constants; see the table
 * of mmx_* constants at the top of this file.
 *
 * NOTE(review): unlike the scalar dv_mb411_right_YUY2 path, this path
 * relies on packsswb saturation instead of table clamping, so no
 * explicit coefficient range check is needed here. */
void
dv_mb411_right_YUY2_mmx(dv_macroblock_t *mb, uint8_t **pixels, int *pitches,
                        int add_ntsc_setup, int clamp_luma, int clamp_chroma) {

  dv_coeff_t *Y[4], *Ytmp, *cr_frame, *cb_frame;
  unsigned char *pyuv;
  int j, row;

  Y[0] = mb->b[0].coeffs;
  Y[1] = mb->b[1].coeffs;
  Y[2] = mb->b[2].coeffs;
  Y[3] = mb->b[3].coeffs;

  pyuv = pixels[0] + (mb->x * 2) + (mb->y * pitches[0]);

  if (clamp_luma && clamp_chroma) {
    movq_m2r (mmx_cbh, mm5);
    movq_m2r (mmx_cbl, mm6);
    movq_m2r (mmx_cbb, mm7);
  } else if (clamp_luma) {
    movq_m2r (mmx_clh, mm5);
    movq_m2r (mmx_cll, mm6);
    movq_m2r (mmx_clb, mm7);
  } else if (clamp_chroma) {
    movq_m2r (mmx_cch, mm5);
    movq_m2r (mmx_ccl, mm6);
    movq_m2r (mmx_ccb, mm7);
  } else {
    movq_m2r (mmx_zero, mm5);
    movq_m2r (mmx_zero, mm6);
    movq_m2r (mmx_zero, mm7);
  }

  if (add_ntsc_setup)
    paddusb_m2r (mmx_0x0010s, mm5); /* add setup to hi clamp */

  for (j = 0; j < 4; j += 2) { // Two rows of blocks
    cr_frame = mb->b[4].coeffs + (j * 2);
    cb_frame = mb->b[5].coeffs + (j * 2);

    for (row = 0; row < 8; row++) {

      /* Build interleaved Cb/Cr bytes for this line: mm0 holds the
       * doubled low pairs, mm1 the doubled high pairs. */
      movq_m2r(*cb_frame, mm0);
      packsswb_r2r(mm0,mm0);
      movq_m2r(*cr_frame, mm1);
      packsswb_r2r(mm1,mm1);
      punpcklbw_r2r(mm1,mm0);
      movq_r2r(mm0,mm1);

      punpcklwd_r2r(mm0,mm0); // pack doubled low cb and crs
      punpckhwd_r2r(mm1,mm1); // pack doubled high cb and crs

      /* Left Y block: 8 lumas -> 16 output bytes */
      Ytmp = Y[j];

      movq_m2r(Ytmp [0],mm2);
      movq_m2r(Ytmp [4],mm3);

      packsswb_r2r(mm3,mm2); // pack Ys from signed 16-bit to signed 8-bit
      movq_r2r(mm2,mm3);

      punpcklbw_r2r (mm0, mm3); // interleave low Ys with crcbs
      paddb_m2r (mmx_0x8080s, mm3);  /* rebias to unsigned */
      paddusb_r2r (mm5, mm3); /* clamp high */
      psubusb_r2r (mm6, mm3); /* clamp low */
      paddusb_r2r (mm7, mm3); /* to black level */
      movq_r2m (mm3, pyuv [0]);

      punpckhbw_r2r (mm0, mm2); // interleave high Ys with crcbs
      paddb_m2r (mmx_0x8080s, mm2);
      paddusb_r2r (mm5, mm2); /* clamp high */
      psubusb_r2r (mm6, mm2); /* clamp low */
      paddusb_r2r (mm7, mm2); /* to black level */
      movq_r2m (mm2, pyuv [8]);

      Y[j] += 8;

      /* Right Y block: same dance with the high chroma pairs (mm1) */
      Ytmp = Y[j+1];

      movq_m2r(Ytmp [0],mm2);
      movq_m2r(Ytmp [4],mm3);

      packsswb_r2r(mm3,mm2); // pack Ys from signed 16-bit to signed 8-bit
      movq_r2r(mm2,mm3);

      punpcklbw_r2r(mm1,mm3);   // interleave low Ys with crcbs
      paddb_m2r (mmx_0x8080s, mm3);
      paddusb_r2r (mm5, mm3); /* clamp high */
      psubusb_r2r (mm6, mm3); /* clamp low */
      paddusb_r2r (mm7, mm3); /* to black level */
      movq_r2m(mm3, pyuv [16]);

      punpckhbw_r2r(mm1,mm2); // interleave high Ys with crcbs
      paddb_m2r (mmx_0x8080s, mm2);
      paddusb_r2r (mm5, mm2); /* clamp high */
      psubusb_r2r (mm6, mm2); /* clamp low */
      paddusb_r2r (mm7, mm2); /* to black level */
      movq_r2m (mm2, pyuv [24]);

      Y[j+1] += 8;
      cr_frame += 8;
      cb_frame += 8;

      pyuv += pitches[0];
    } /* for row */

  } /* for j */
  emms();
} /* dv_mb411_right_YUY2_mmx */
519
520 /* ----------------------------------------------------------------------------
521 */
/* MMX version of dv_mb420_YUY2: render one 4:2:0 (PAL DV) macroblock
 * as packed YUY2.  Like the scalar version, each pass writes a pair of
 * image lines two lines apart (pwyuv0 / pwyuv1 = pyuv + 2*pitch);
 * chroma appears to be handled field-based — see the note on
 * dv_mb420_YUY2.  mm5/mm6/mm7 hold the high/low/black clamp constants. */
void
dv_mb420_YUY2_mmx (dv_macroblock_t *mb, uint8_t **pixels, int *pitches,
                   int clamp_luma, int clamp_chroma) {
  dv_coeff_t *Y [4], *Ytmp0, *cr_frame, *cb_frame;
  unsigned char *pyuv,
                *pwyuv0, *pwyuv1;
  int i, j, row, inc_l2, inc_l4;

  pyuv = pixels[0] + (mb->x * 2) + (mb->y * pitches[0]);

  Y [0] = mb->b[0].coeffs;
  Y [1] = mb->b[1].coeffs;
  Y [2] = mb->b[2].coeffs;
  Y [3] = mb->b[3].coeffs;
  cr_frame = mb->b[4].coeffs;
  cb_frame = mb->b[5].coeffs;
  inc_l2 = pitches[0];      /* one image line  */
  inc_l4 = pitches[0]*2;    /* two image lines */

  if (clamp_luma && clamp_chroma) {
    movq_m2r (mmx_cbh, mm5);
    movq_m2r (mmx_cbl, mm6);
    movq_m2r (mmx_cbb, mm7);
  } else if (clamp_luma) {
    movq_m2r (mmx_clh, mm5);
    movq_m2r (mmx_cll, mm6);
    movq_m2r (mmx_clb, mm7);
  } else if (clamp_chroma) {
    movq_m2r (mmx_cch, mm5);
    movq_m2r (mmx_ccl, mm6);
    movq_m2r (mmx_ccb, mm7);
  } else {
    movq_m2r (mmx_zero, mm5);
    movq_m2r (mmx_zero, mm6);
    movq_m2r (mmx_zero, mm7);
  }

  for (j = 0; j < 4; j += 2) { // Two rows of blocks j, j+1
    for (row = 0; row < 4; row++) { // 4 pairs of two rows
      pwyuv0 = pyuv;
      pwyuv1 = pyuv + inc_l4;
      for (i = 0; i < 2; ++i) { // Two columns of blocks
        Ytmp0 = Y[j + i];

        /* -------------------------------------------------------------------
         * low chroma pairs + first four lumas of each of the two lines
         */
        movq_m2r (*cb_frame, mm2); /* mm2 = b1 b2 b3 b4 */
        movq_m2r (*cr_frame, mm3); /* mm3 = r1 r2 r3 r4 */
        movq_r2r (mm2, mm4);       /* mm4 = b1 b2 b3 b4 */
        punpcklwd_r2r (mm3, mm4);  /* mm4 = b3 r3 b4 r4 */

        movq_m2r (Ytmp0[0], mm0);  /* mm0 = y1 y2 y3 y4 */
        movq_r2r (mm0, mm1);

        punpcklwd_r2r (mm4, mm0);  /* mm0 = b4 y3 r4 y4 */
        punpckhwd_r2r (mm4, mm1);  /* mm1 = b3 y1 r3 y2 */

        packsswb_r2r (mm1, mm0);   /* mm0 = b3 y1 r3 y2 b4 y3 r4 y4 */
        paddb_m2r (mmx_0x8080s, mm0);  /* rebias to unsigned */
        paddusb_r2r (mm5, mm0); /* clamp high */
        psubusb_r2r (mm6, mm0); /* clamp low */
        paddusb_r2r (mm7, mm0); /* to black level */
        movq_r2m (mm0, pwyuv0[0]);

        /* same chroma with lumas two coefficient rows down (line +2) */
        movq_m2r (Ytmp0[8+8], mm0);
        movq_r2r (mm0, mm1);

        punpcklwd_r2r (mm4, mm0);
        punpckhwd_r2r (mm4, mm1);
        packsswb_r2r (mm1, mm0);
        paddb_m2r (mmx_0x8080s, mm0);
        paddusb_r2r (mm5, mm0); /* clamp high */
        psubusb_r2r (mm6, mm0); /* clamp low */
        paddusb_r2r (mm7, mm0); /* to black level */
        movq_r2m (mm0, pwyuv1[0]);

        /* high chroma pairs + remaining four lumas of each line */
        movq_r2r (mm2, mm4);
        punpckhwd_r2r (mm3, mm4);
        movq_m2r (Ytmp0[4], mm0);
        movq_r2r (mm0, mm1);
        punpcklwd_r2r (mm4, mm0);
        punpckhwd_r2r (mm4, mm1);
        packsswb_r2r (mm1, mm0);
        paddb_m2r (mmx_0x8080s, mm0);
        paddusb_r2r (mm5, mm0); /* clamp high */
        psubusb_r2r (mm6, mm0); /* clamp low */
        paddusb_r2r (mm7, mm0); /* to black level */
        movq_r2m (mm0, pwyuv0[8]);

        movq_m2r (Ytmp0[8+12], mm0);
        movq_r2r (mm0, mm1);
        punpcklwd_r2r (mm4, mm0);
        punpckhwd_r2r (mm4, mm1);
        packsswb_r2r (mm1, mm0);
        paddb_m2r (mmx_0x8080s, mm0);
        paddusb_r2r (mm5, mm0); /* clamp high */
        psubusb_r2r (mm6, mm0); /* clamp low */
        paddusb_r2r (mm7, mm0); /* to black level */
        movq_r2m (mm0, pwyuv1[8]);

        pwyuv0 += 16;
        pwyuv1 += 16;
        cb_frame += 4;
        cr_frame += 4;
        /* odd row pairs consumed an extra coefficient row via the
         * Ytmp0[8+8]/[8+12] accesses, so step further */
        if (row & 1) {
          Ytmp0 += 24;
        } else {
          Ytmp0 += 8;
        }
        Y[j + i] = Ytmp0;
      }
      pyuv += inc_l2;
      if (row & 1)
        pyuv += inc_l4;
    }
  }
  emms ();
}
640
641 /* ----------------------------------------------------------------------------
642 * start of half high render functions
643 */
644
645 /* ---------------------------------------------------------------------------
646 */
/* Half-height MMX variant of dv_mb411_YUY2: renders every other luma
 * line of a 4:1:1 macroblock (4 output lines instead of 8), halving
 * the vertical resolution.  Note the halved destination offset
 * (mb->y * pitch / 2), the per-block luma skip (Ytmp + 16 = skip one
 * coefficient row) and the extra chroma skip (+8) after each line. */
void
dv_mb411_YUY2_hh_mmx(dv_macroblock_t *mb, uint8_t **pixels, int *pitches,
                     int add_ntsc_setup, int clamp_luma, int clamp_chroma) {
  dv_coeff_t *Y[4], *cr_frame, *cb_frame;
  unsigned char *pyuv, *pwyuv;
  int i, row;

  Y[0] = mb->b[0].coeffs;
  Y[1] = mb->b[1].coeffs;
  Y[2] = mb->b[2].coeffs;
  Y[3] = mb->b[3].coeffs;
  cr_frame = mb->b[4].coeffs;
  cb_frame = mb->b[5].coeffs;

  pyuv = pixels[0] + (mb->x * 2) + ((mb->y * pitches[0]) / 2);

  if (clamp_luma && clamp_chroma) {
    movq_m2r (mmx_cbh, mm5);
    movq_m2r (mmx_cbl, mm6);
    movq_m2r (mmx_cbb, mm7);
  } else if (clamp_luma) {
    movq_m2r (mmx_clh, mm5);
    movq_m2r (mmx_cll, mm6);
    movq_m2r (mmx_clb, mm7);
  } else if (clamp_chroma) {
    movq_m2r (mmx_cch, mm5);
    movq_m2r (mmx_ccl, mm6);
    movq_m2r (mmx_ccb, mm7);
  } else {
    movq_m2r (mmx_zero, mm5);
    movq_m2r (mmx_zero, mm6);
    movq_m2r (mmx_zero, mm7);
  }

  if (add_ntsc_setup)
    paddusb_m2r (mmx_0x0010s, mm5); /* add setup to hi clamp */

  for (row = 0; row < 4; ++row) { // Four (half-height) rows
    pwyuv = pyuv;
    for (i = 0; i < 4; ++i) { // Four Y blocks
      dv_coeff_t *Ytmp = Y [i]; // less indexing in inner loop speedup?
      /* ---------------------------------------------------------------------
       * first four pixels with the doubled low Cb/Cr pair
       */
      movq_m2r (*cb_frame, mm2); // cb0 cb1 cb2 cb3
      movq_m2r (*cr_frame, mm3); // cr0 cr1 cr2 cr3
      punpcklwd_r2r (mm3, mm2); // cb0cr0 cb1cr1
      movq_r2r (mm2, mm3);
      punpckldq_r2r (mm2, mm2); // cb0cr0 cb0cr0
      movq_m2r (Ytmp [0], mm0);
      movq_r2r (mm0, mm1);
      punpcklwd_r2r (mm2, mm0); /* mm0 = b4 y3 r4 y4 */
      punpckhwd_r2r (mm2, mm1); /* mm1 = b3 y1 r3 y2 */

      packsswb_r2r (mm1, mm0); /* mm0 = b3 y1 r3 y2 b4 y3 r4 y4 */
      paddb_m2r (mmx_0x8080s, mm0);  /* rebias to unsigned */
      paddusb_r2r (mm5, mm0); /* clamp high */
      psubusb_r2r (mm6, mm0); /* clamp low */
      paddusb_r2r (mm7, mm0); /* to black level */
      movq_r2m (mm0, pwyuv [0]);

      /* ---------------------------------------------------------------------
       * next four pixels with the doubled high Cb/Cr pair
       */
      movq_m2r (Ytmp [4], mm0);
      punpckhdq_r2r (mm3, mm3);
      movq_r2r (mm0, mm1);
      punpcklwd_r2r (mm3, mm0); /* mm0 = b4 y3 r4 y4 */
      punpckhwd_r2r (mm3, mm1); /* mm1 = b3 y1 r3 y2 */

      packsswb_r2r (mm1, mm0); /* mm0 = b3 y1 r3 y2 b4 y3 r4 y4 */
      paddb_m2r (mmx_0x8080s, mm0);
      paddusb_r2r (mm5, mm0); /* clamp high */
      psubusb_r2r (mm6, mm0); /* clamp low */
      paddusb_r2r (mm7, mm0); /* to black level */
      movq_r2m (mm0, pwyuv [8]);

      pwyuv += 16;

      cr_frame += 2;
      cb_frame += 2;
      Y [i] = Ytmp + 16; /* skip one luma coefficient row (half height) */
    } /* for i */
    cr_frame += 8;       /* skip the chroma row of the dropped line */
    cb_frame += 8;
    pyuv += pitches[0];
  } /* for row */
  emms ();
} /* dv_mb411_YUY2_hh_mmx */
734
735 /* ----------------------------------------------------------------------------
736 */
/* Half-height MMX variant of dv_mb411_right_YUY2: renders every other
 * line of a right-edge 4:1:1 macroblock (4 lines per block row instead
 * of 8).  Destination offset is halved (mb->y * pitch / 2) and both
 * luma (+= 16) and chroma (+= 16) advance by two coefficient rows per
 * output line. */
void
dv_mb411_right_YUY2_hh_mmx(dv_macroblock_t *mb, uint8_t **pixels, int *pitches,
                           int add_ntsc_setup, int clamp_luma, int clamp_chroma) {

  dv_coeff_t *Y[4], *Ytmp, *cr_frame, *cb_frame;
  unsigned char *pyuv;
  int j, row;

  Y[0] = mb->b[0].coeffs;
  Y[1] = mb->b[1].coeffs;
  Y[2] = mb->b[2].coeffs;
  Y[3] = mb->b[3].coeffs;

  pyuv = pixels[0] + (mb->x * 2) + (mb->y * pitches[0]) / 2;

  if (clamp_luma && clamp_chroma) {
    movq_m2r (mmx_cbh, mm5);
    movq_m2r (mmx_cbl, mm6);
    movq_m2r (mmx_cbb, mm7);
  } else if (clamp_luma) {
    movq_m2r (mmx_clh, mm5);
    movq_m2r (mmx_cll, mm6);
    movq_m2r (mmx_clb, mm7);
  } else if (clamp_chroma) {
    movq_m2r (mmx_cch, mm5);
    movq_m2r (mmx_ccl, mm6);
    movq_m2r (mmx_ccb, mm7);
  } else {
    movq_m2r (mmx_zero, mm5);
    movq_m2r (mmx_zero, mm6);
    movq_m2r (mmx_zero, mm7);
  }

  if (add_ntsc_setup)
    paddusb_m2r (mmx_0x0010s, mm5); /* add setup to hi clamp */

  for (j = 0; j < 4; j += 2) { // Two rows of blocks
    cr_frame = mb->b[4].coeffs + (j * 2);
    cb_frame = mb->b[5].coeffs + (j * 2);

    for (row = 0; row < 4; row++) {

      /* Interleaved Cb/Cr bytes: mm0 = doubled low pairs, mm1 = high */
      movq_m2r(*cb_frame, mm0);
      packsswb_r2r(mm0,mm0);
      movq_m2r(*cr_frame, mm1);
      packsswb_r2r(mm1,mm1);
      punpcklbw_r2r(mm1,mm0);
      movq_r2r(mm0,mm1);

      punpcklwd_r2r(mm0,mm0); // pack doubled low cb and crs
      punpckhwd_r2r(mm1,mm1); // pack doubled high cb and crs

      /* Left Y block */
      Ytmp = Y[j];

      movq_m2r(Ytmp [0],mm2);
      movq_m2r(Ytmp [4],mm3);

      packsswb_r2r(mm3,mm2); // pack Ys from signed 16-bit to signed 8-bit
      movq_r2r(mm2,mm3);

      punpcklbw_r2r (mm0, mm3); // interleave low Ys with crcbs
      paddb_m2r (mmx_0x8080s, mm3);  /* rebias to unsigned */
      paddusb_r2r (mm5, mm3); /* clamp high */
      psubusb_r2r (mm6, mm3); /* clamp low */
      paddusb_r2r (mm7, mm3); /* to black level */
      movq_r2m (mm3, pyuv [0]);

      punpckhbw_r2r (mm0, mm2); // interleave high Ys with crcbs
      paddb_m2r (mmx_0x8080s, mm2);
      paddusb_r2r (mm5, mm2); /* clamp high */
      psubusb_r2r (mm6, mm2); /* clamp low */
      paddusb_r2r (mm7, mm2); /* to black level */
      movq_r2m (mm2, pyuv [8]);

      Y[j] += 16; /* skip one luma coefficient row (half height) */

      /* Right Y block, using the high chroma pairs (mm1) */
      Ytmp = Y[j+1];

      movq_m2r(Ytmp [0],mm2);
      movq_m2r(Ytmp [4],mm3);

      packsswb_r2r(mm3,mm2); // pack Ys from signed 16-bit to signed 8-bit
      movq_r2r(mm2,mm3);

      punpcklbw_r2r(mm1,mm3);   // interleave low Ys with crcbs
      paddb_m2r (mmx_0x8080s, mm3);
      paddusb_r2r (mm5, mm3); /* clamp high */
      psubusb_r2r (mm6, mm3); /* clamp low */
      paddusb_r2r (mm7, mm3); /* to black level */
      movq_r2m(mm3, pyuv [16]);

      punpckhbw_r2r(mm1,mm2); // interleave high Ys with crcbs
      paddb_m2r (mmx_0x8080s, mm2);
      paddusb_r2r (mm5, mm2); /* clamp high */
      psubusb_r2r (mm6, mm2); /* clamp low */
      paddusb_r2r (mm7, mm2); /* to black level */
      movq_r2m (mm2, pyuv [24]);

      Y[j+1] += 16;
      cr_frame += 16; /* two chroma coefficient rows per output line */
      cb_frame += 16;

      pyuv += pitches[0];
    } /* for row */

  } /* for j */
  emms();
} /* dv_mb411_right_YUY2_hh_mmx */
845
846 /* ---------------------------------------------------------------------------
847 */
/* Half-height MMX variant of dv_mb420_YUY2: renders every other line
 * of a 4:2:0 macroblock.  Destination offset is halved and each
 * output line consumes one luma coefficient row pair (Ytmp0 + 16).
 * The row&1 chroma rewind below reuses each chroma row twice,
 * consistent with field-based chroma (see the original comment). */
void
dv_mb420_YUY2_hh_mmx (dv_macroblock_t *mb, uint8_t **pixels, int *pitches,
                      int clamp_luma, int clamp_chroma) {
  dv_coeff_t *Y [4], *Ytmp0, *cr_frame, *cb_frame;
  unsigned char *pyuv,
                *pwyuv0;
  int i, j, row, inc_l2;

  pyuv = pixels[0] + (mb->x * 2) + (mb->y * pitches[0]) / 2;

  Y [0] = mb->b[0].coeffs;
  Y [1] = mb->b[1].coeffs;
  Y [2] = mb->b[2].coeffs;
  Y [3] = mb->b[3].coeffs;
  cr_frame = mb->b[4].coeffs;
  cb_frame = mb->b[5].coeffs;
  inc_l2 = pitches[0];

  if (clamp_luma && clamp_chroma) {
    movq_m2r (mmx_cbh, mm5);
    movq_m2r (mmx_cbl, mm6);
    movq_m2r (mmx_cbb, mm7);
  } else if (clamp_luma) {
    movq_m2r (mmx_clh, mm5);
    movq_m2r (mmx_cll, mm6);
    movq_m2r (mmx_clb, mm7);
  } else if (clamp_chroma) {
    movq_m2r (mmx_cch, mm5);
    movq_m2r (mmx_ccl, mm6);
    movq_m2r (mmx_ccb, mm7);
  } else {
    movq_m2r (mmx_zero, mm5);
    movq_m2r (mmx_zero, mm6);
    movq_m2r (mmx_zero, mm7);
  }

  for (j = 0; j < 4; j += 2) { // Two rows of blocks j, j+1
    for (row = 0; row < 4; row++) { // 4 pairs of two rows
      pwyuv0 = pyuv;
      for (i = 0; i < 2; ++i) { // Two columns of blocks
        Ytmp0 = Y[j + i];

        /* -------------------------------------------------------------------
         * low chroma pairs + first four lumas
         */
        movq_m2r (*cb_frame, mm2); /* mm2 = b1 b2 b3 b4 */
        movq_m2r (*cr_frame, mm3); /* mm3 = r1 r2 r3 r4 */
        movq_r2r (mm2, mm4);       /* mm4 = b1 b2 b3 b4 */
        punpcklwd_r2r (mm3, mm4);  /* mm4 = b3 r3 b4 r4 */

        movq_m2r (Ytmp0[0], mm0);  /* mm0 = y1 y2 y3 y4 */
        movq_r2r (mm0, mm1);

        punpcklwd_r2r (mm4, mm0);  /* mm0 = b4 y3 r4 y4 */
        punpckhwd_r2r (mm4, mm1);  /* mm1 = b3 y1 r3 y2 */

        packsswb_r2r (mm1, mm0);   /* mm0 = b3 y1 r3 y2 b4 y3 r4 y4 */
        paddb_m2r (mmx_0x8080s, mm0);  /* rebias to unsigned */
        paddusb_r2r (mm5, mm0); /* clamp high */
        psubusb_r2r (mm6, mm0); /* clamp low */
        paddusb_r2r (mm7, mm0); /* to black level */
        movq_r2m (mm0, pwyuv0[0]);

        /* high chroma pairs + remaining four lumas */
        movq_r2r (mm2, mm4);
        punpckhwd_r2r (mm3, mm4);
        movq_m2r (Ytmp0[4], mm0);
        movq_r2r (mm0, mm1);
        punpcklwd_r2r (mm4, mm0);  /* mm0 = b4 y3 r4 y4 */
        punpckhwd_r2r (mm4, mm1);  /* mm1 = b3 y1 r3 y2 */
        packsswb_r2r (mm1, mm0);   /* mm0 = b3 y1 r3 y2 b4 y3 r4 y4 */
        paddb_m2r (mmx_0x8080s, mm0);
        paddusb_r2r (mm5, mm0); /* clamp high */
        psubusb_r2r (mm6, mm0); /* clamp low */
        paddusb_r2r (mm7, mm0); /* to black level */
        movq_r2m (mm0, pwyuv0[8]);

        pwyuv0 += 16;
        cb_frame += 4;
        cr_frame += 4;
        Y[j + i] = Ytmp0 + 16; /* skip one luma coefficient row */
      }
      /* ---------------------------------------------------------------------
       * for odd value of row counter (this is NOT an odd line number)
       * we have to go one additional step forward, and for even value
       * we have to go one step back to use the color information again.
       * Assuming that chroma information is fields based.
       */
      if (row & 1) {
        cb_frame += 8;
        cr_frame += 8;
      } else {
        cb_frame -= 8;
        cr_frame -= 8;
      }
      pyuv += inc_l2;
    }
  }
  emms ();
}
946 #endif // ARCH_X86 || ARCH_X86_64
947