1 /*
2  *  YUY2.c
3  *
4  *     Copyright (C) Charles 'Buck' Krasic - April 2000
5  *     Copyright (C) Erik Walthinsen - April 2000
6  *
7  *  This file is part of libdv, a free DV (IEC 61834/SMPTE 314M)
8  *  codec.
9  *
10  *  libdv is free software; you can redistribute it and/or modify it
11  *  under the terms of the GNU Lesser Public License as published by
12  *  the Free Software Foundation; either version 2.1, or (at your
13  *  option) any later version.
14  *
15  *  libdv is distributed in the hope that it will be useful, but
16  *  WITHOUT ANY WARRANTY; without even the implied warranty of
17  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  *  Lesser Public License for more details.
19  *
20  *  You should have received a copy of the GNU Lesser Public License
21  *  along with libdv; see the file COPYING.  If not, write to
22  *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23  *
24  *  The libdv homepage is http://libdv.sourceforge.net/.
25  */
26 
27 /* Most of this file is derived from patch 101018 submitted by Stefan
28  * Lucke */
29 
30 #if HAVE_CONFIG_H
31 # include <config.h>
32 #endif
33 
34 #include <stdlib.h>
35 
36 #include "YUY2.h"
37 
38 #if ARCH_X86 || ARCH_X86_64
39 #include "mmx.h"
#endif // ARCH_X86 || ARCH_X86_64
41 
/* Lookup tables for mapping signed to unsigned, and clamping.  The
 * real_* arrays supply the storage; the published pointers are biased
 * into them so they can be indexed directly with (possibly negative)
 * signed DCT coefficient values:
 *   uvlut       chroma, valid index range -128..127
 *   ylut        luma, valid index range -256..511
 *   ylut_setup  as ylut, but with NTSC setup (+16) added and re-clamped
 * All three are filled in by dv_YUY2_init(). */
static unsigned char	real_uvlut[256], *uvlut;
static unsigned char	real_ylut[768],  *ylut;
static unsigned char	real_ylut_setup[768],  *ylut_setup;
46 
#if ARCH_X86 || ARCH_X86_64
/* Constants used in MMX range mapping and clamping logic.
 *
 * Packed samples are first re-biased from signed to unsigned with
 * paddb 0x8080s, then clamped per byte lane using unsigned-saturating
 * arithmetic:
 *     paddusb <high>   saturates anything above the legal maximum to 255
 *     psubusb <low>    saturates anything below the legal minimum to 0
 *     paddusb <black>  shifts the result back up to the black level
 *
 * In the packed YUY2 words the low byte of each 16-bit lane is luma and
 * the high byte is chroma, so the constants come in three flavours:
 *     cc*  clamp chroma only (16..240):  high = 255-240 = 0x0f
 *     cl*  clamp luma only   (16..235):  low  = 255-235 = 0x14
 *     cb*  clamp both planes
 * with <low> = <high> + 16 and <black> = 16 in the clamped lanes.
 * mmx_0x0010s is added to the high-clamp register when NTSC setup is
 * requested, offsetting the luma lanes by a further 16. */
static mmx_t		mmx_0x0010s = (mmx_t) 0x0010001000100010LL,
			mmx_0x8080s = (mmx_t) 0x8080808080808080LL,
			mmx_zero = (mmx_t) 0x0000000000000000LL,
			mmx_cch  = (mmx_t) 0x0f000f000f000f00LL,
			mmx_ccl  = (mmx_t) 0x1f001f001f001f00LL,
			mmx_ccb  = (mmx_t) 0x1000100010001000LL,
			mmx_clh  = (mmx_t) 0x0014001400140014LL,
			mmx_cll  = (mmx_t) 0x0024002400240024LL,
			mmx_clb  = (mmx_t) 0x0010001000100010LL,
			mmx_cbh  = (mmx_t) 0x0f140f140f140f14LL,
			mmx_cbl  = (mmx_t) 0x1f241f241f241f24LL,
			mmx_cbb  = (mmx_t) 0x1010101010101010LL;

#endif // ARCH_X86 || ARCH_X86_64
63 
64 /* ----------------------------------------------------------------------------
65  */
66 void
dv_YUY2_init(int clamp_luma,int clamp_chroma)67 dv_YUY2_init(int clamp_luma, int clamp_chroma) {
68   int i;
69   int value;
70 
71   uvlut = real_uvlut + 128; // index from -128 .. 127
72   for(i=-128;
73       i<128;
74       ++i) {
75     value = i + 128;
76     if (clamp_chroma == TRUE) value = CLAMP(value, 16, 240);
77     uvlut[i] = value;
78   } /* for */
79 
80   ylut = real_ylut + 256; // index from -256 .. 511
81   ylut_setup = real_ylut_setup + 256;
82   for(i=-256;
83       i<512;
84       ++i) {
85 	value = i + 128;
86 	if (clamp_luma == TRUE) value = CLAMP(value, 16, 235);
87 	else value = CLAMP(value, 0, 255);
88 	ylut[i] = value;
89 	value += 16;
90 	ylut_setup[i] = CLAMP(value, 0, 255);
91   } /* for */
92 } /* dv_YUY2_init */
93 
94 /* ----------------------------------------------------------------------------
95  */
96 void
dv_mb411_YUY2(dv_macroblock_t * mb,uint8_t ** pixels,int * pitches,int add_ntsc_setup)97 dv_mb411_YUY2(dv_macroblock_t *mb, uint8_t **pixels, int *pitches, int add_ntsc_setup) {
98   dv_coeff_t		*Y[4], *cr_frame, *cb_frame;
99   unsigned char	        *pyuv, *pwyuv, cb, cr, *my_ylut;
100   int			i, j, row;
101 
102   Y [0] = mb->b[0].coeffs;
103   Y [1] = mb->b[1].coeffs;
104   Y [2] = mb->b[2].coeffs;
105   Y [3] = mb->b[3].coeffs;
106   cr_frame = mb->b[4].coeffs;
107   cb_frame = mb->b[5].coeffs;
108 
109   pyuv = pixels[0] + (mb->x * 2) + (mb->y * pitches[0]);
110   my_ylut = (add_ntsc_setup == TRUE ? ylut_setup : ylut);
111 
112   for (row = 0; row < 8; ++row) { // Eight rows
113     pwyuv = pyuv;
114 
115     for (i = 0; i < 4; ++i) {     // Four Y blocks
116       dv_coeff_t *Ytmp = Y[i];   // less indexing in inner loop speedup?
117 
118 
119       for (j = 0; j < 2; ++j) {   // two 4-pixel spans per Y block
120 
121         cb = uvlut[CLAMP(*cb_frame, -128, 127)];
122         cr = uvlut[CLAMP(*cr_frame, -128, 127)];
123         cb_frame++;
124         cr_frame++;
125 
126 #if 0 /* DV_IS_BIG_ENDIAN */
127        *pwyuv++ = cb;
128        *pwyuv++ = my_ylut[CLAMP(*Ytmp, -256, 511)];
129        Ytmp++;
130        *pwyuv++ = cr;
131 	*pwyuv++ = my_ylut[CLAMP(*Ytmp, -256, 511)];
132 	Ytmp++;
133 
134 	*pwyuv++ = cb;
135 	*pwyuv++ = my_ylut[CLAMP(*Ytmp, -256, 511)];
136 	Ytmp++;
137 	*pwyuv++ = cr;
138 	*pwyuv++ = my_ylut[CLAMP(*Ytmp, -256, 511)];
139 	Ytmp++;
140 #else
141 	*pwyuv++ = my_ylut[CLAMP(*Ytmp, -256, 511)];
142        *pwyuv++ = cb;
143 	Ytmp++;
144        *pwyuv++ = my_ylut[CLAMP(*Ytmp, -256, 511)];
145 	*pwyuv++ = cr;
146        Ytmp++;
147 
148        *pwyuv++ = my_ylut[CLAMP(*Ytmp, -256, 511)];
149        *pwyuv++ = cb;
150        Ytmp++;
151        *pwyuv++ = my_ylut[CLAMP(*Ytmp, -256, 511)];
152        *pwyuv++ = cr;
153        Ytmp++;
154 #endif
155       } /* for j */
156 
157       Y[i] = Ytmp;
158     } /* for i */
159 
160     pyuv += pitches[0];
161   } /* for row */
162 } /* dv_mb411_YUY2 */
163 
164 /* ----------------------------------------------------------------------------
165  */
166 void
dv_mb411_right_YUY2(dv_macroblock_t * mb,uint8_t ** pixels,int * pitches,int add_ntsc_setup)167 dv_mb411_right_YUY2(dv_macroblock_t *mb, uint8_t **pixels, int *pitches, int add_ntsc_setup) {
168 
169   dv_coeff_t		*Y[4], *Ytmp, *cr_frame, *cb_frame;
170   unsigned char	        *pyuv, *pwyuv, cb, cr, *my_ylut;
171   int			i, j, col, row;
172 
173 
174   Y[0] = mb->b[0].coeffs;
175   Y[1] = mb->b[1].coeffs;
176   Y[2] = mb->b[2].coeffs;
177   Y[3] = mb->b[3].coeffs;
178 
179   pyuv = pixels[0] + (mb->x * 2) + (mb->y * pitches[0]);
180   my_ylut = (add_ntsc_setup == TRUE ? ylut_setup : ylut);
181 
182   for (j = 0; j < 4; j += 2) { // Two rows of blocks
183     cr_frame = mb->b[4].coeffs + (j * 2);
184     cb_frame = mb->b[5].coeffs + (j * 2);
185 
186     for (row = 0; row < 8; row++) {
187       pwyuv = pyuv;
188 
189       for (i = 0; i < 2; ++i) { // Two columns of blocks
190         Ytmp = Y[j + i];
191 
192         for (col = 0; col < 8; col+=4) {  // two 4-pixel spans per Y block
193 
194           cb = uvlut[*cb_frame];
195           cr = uvlut[*cr_frame];
196 	  cb_frame++;
197 	  cr_frame++;
198 
199 #if 0 /* DV_IS_BIG_ENDIAN */
200          *pwyuv++ = cb;
201          *pwyuv++ = my_ylut[CLAMP(*Ytmp, -256, 511)];
202          Ytmp++;
203          *pwyuv++ = cr;
204          *pwyuv++ = my_ylut[CLAMP(*Ytmp, -256, 511)];
205          Ytmp++;
206 
207          *pwyuv++ = cb;
208          *pwyuv++ = my_ylut[CLAMP(*Ytmp, -256, 511)];
209          Ytmp++;
210          *pwyuv++ = cr;
211          *pwyuv++ = my_ylut[CLAMP(*Ytmp, -256, 511)];
212          Ytmp++;
213 #else
214 	  *pwyuv++ = my_ylut[CLAMP(*Ytmp, -256, 511)];
215 	  Ytmp++;
216 	  *pwyuv++ = cb;
217 	  *pwyuv++ = my_ylut[CLAMP(*Ytmp, -256, 511)];
218 	  Ytmp++;
219 	  *pwyuv++ = cr;
220 
221 	  *pwyuv++ = my_ylut[CLAMP(*Ytmp, -256, 511)];
222 	  Ytmp++;
223 	  *pwyuv++ = cb;
224 	  *pwyuv++ = my_ylut[CLAMP(*Ytmp, -256, 511)];
225 	  Ytmp++;
226 	  *pwyuv++ = cr;
227 #endif
228         } /* for col */
229         Y[j + i] = Ytmp;
230 
231       } /* for i */
232 
233       cb_frame += 4;
234       cr_frame += 4;
235       pyuv += pitches[0];
236     } /* for row */
237 
238   } /* for j */
239 } /* dv_mb411_right_YUY2 */
240 
241 /* ----------------------------------------------------------------------------
242  */
/* Render one 4:2:0 macroblock into a packed little-endian YUY2 buffer.
 * Each Cb/Cr pair covers a 2x2 pixel patch; the two luma rows of a
 * patch live 16 coefficients apart in the block, and the two output
 * lines written per pass are 2 scanlines apart (inc_l4) — the row
 * stepping below suggests field-interleaved chroma (NOTE(review):
 * layout inferred from the pointer arithmetic; confirm against the
 * encoder's 4:2:0 block ordering). */
void
dv_mb420_YUY2 (dv_macroblock_t *mb, uint8_t **pixels, int *pitches) {
    dv_coeff_t    *Y [4], *Ytmp0, *cr_frame, *cb_frame;
    unsigned char *pyuv,*pwyuv0, *pwyuv1,
                  cb, cr;
    int           i, j, col, row, inc_l2, inc_l4;

  /* two output bytes per pixel */
  pyuv = pixels[0] + (mb->x * 2) + (mb->y * pitches[0]);

  Y [0] = mb->b[0].coeffs;
  Y [1] = mb->b[1].coeffs;
  Y [2] = mb->b[2].coeffs;
  Y [3] = mb->b[3].coeffs;
  cr_frame = mb->b[4].coeffs;
  cb_frame = mb->b[5].coeffs;
  inc_l2 = pitches[0];      /* advance one scanline */
  inc_l4 = pitches[0]*2;    /* advance two scanlines */

  for (j = 0; j < 4; j += 2) { // Two rows of blocks j, j+1
    for (row = 0; row < 4; row++) { // 4 pairs of two rows
      pwyuv0 = pyuv;
      pwyuv1 = pyuv + inc_l4;   /* partner line, two scanlines below */
      for (i = 0; i < 2; ++i) { // Two columns of blocks
        Ytmp0 = Y[j + i];
        for (col = 0; col < 4; ++col) {  // 4 spans of 2x2 pixels
          /* one chroma pair per 2x2 patch */
          cb = uvlut[CLAMP(*cb_frame, -128, 127)];
          cr = uvlut[CLAMP(*cr_frame, -128, 127)];
          cb_frame++;
          cr_frame++;

#if DV_IS_LITTLE_ENDIAN
          /* YUY2 byte order: Y0 Cb Y1 Cr */
          *pwyuv0++ = ylut[CLAMP(*(Ytmp0 + 0), -256, 511)];
          *pwyuv0++ = cb;
          *pwyuv0++ = ylut[CLAMP(*(Ytmp0 + 1), -256, 511)];
          *pwyuv0++ = cr;

          /* the patch's second luma row sits 16 coefficients ahead */
          *pwyuv1++ = ylut[CLAMP(*(Ytmp0 + 16), -256, 511)];
          *pwyuv1++ = cb;
          *pwyuv1++ = ylut[CLAMP(*(Ytmp0 + 17), -256, 511)];
          *pwyuv1++ = cr;
#else
          *pwyuv0++ = cr;
          *pwyuv0++ = ylut[CLAMP(*(Ytmp0 + 1), -256, 511)];
          *pwyuv0++ = cb;
          *pwyuv0++ = ylut[CLAMP(*(Ytmp0 + 0), -256, 511)];

          *pwyuv1++ = cr;
          *pwyuv1++ = ylut[CLAMP(*(Ytmp0 + 17), -256, 511)];
          *pwyuv1++ = cb;
          *pwyuv1++ = ylut[CLAMP(*(Ytmp0 + 16), -256, 511)];
#endif
          Ytmp0 += 2;
        }
        /* every second pass, skip over the row consumed via Ytmp0+16 */
        if (row & 1) {
          Ytmp0 += 16;
        }
        Y[j + i] = Ytmp0;
      }
      pyuv += inc_l2;
      if (row & 1) {
        pyuv += inc_l4;
      }
    }
  }
}
308 
309 #if ARCH_X86 || ARCH_X86_64
310 
311 /* TODO (by Buck):
312  *
313  *   When testing YV12.c, I discovered that my video card (RAGE 128)
314  *   doesn't care that pixel components are strictly clamped to Y
315  *   (16..235), UV (16..240).  So I was able to use MMX pack with
316  *   unsigned saturation to go from 16 to 8 bit representation.  This
317  *   clamps to (0..255).  Applying this to the code here might allow
318  *   us to reduce instruction count below.
319  *
320  *   Question:  do other video cards behave the same way?
321  *   DRD> No, your video card will not care about clamping to these
322  *   ranges. This issue only applies to analog NTSC output.
323  * */
/* MMX version of dv_mb411_YUY2: converts one 4:1:1 macroblock to packed
 * YUY2, eight pixels per iteration.  mm5/mm6/mm7 are loaded once with
 * the clamp-high / clamp-low / black-level constants (see the mmx_c*
 * tables above) selected by the clamp_* flags. */
void
dv_mb411_YUY2_mmx(dv_macroblock_t *mb, uint8_t **pixels, int *pitches,
                  int add_ntsc_setup, int clamp_luma, int clamp_chroma) {
    dv_coeff_t		*Y[4], *cr_frame, *cb_frame;
    unsigned char	*pyuv, *pwyuv;
    int			i, row;

    Y[0] = mb->b[0].coeffs;
    Y[1] = mb->b[1].coeffs;
    Y[2] = mb->b[2].coeffs;
    Y[3] = mb->b[3].coeffs;
    cr_frame = mb->b[4].coeffs;
    cb_frame = mb->b[5].coeffs;

    /* two output bytes per pixel */
    pyuv = pixels[0] + (mb->x * 2) + (mb->y * pitches[0]);

    /* Select clamp constants: both planes, luma only, chroma only, or
     * none (zero constants make the clamp sequence a no-op). */
    if (clamp_luma && clamp_chroma) {
      movq_m2r (mmx_cbh, mm5);
      movq_m2r (mmx_cbl, mm6);
      movq_m2r (mmx_cbb, mm7);
    } else if (clamp_luma) {
      movq_m2r (mmx_clh, mm5);
      movq_m2r (mmx_cll, mm6);
      movq_m2r (mmx_clb, mm7);
    } else if (clamp_chroma) {
      movq_m2r (mmx_cch, mm5);
      movq_m2r (mmx_ccl, mm6);
      movq_m2r (mmx_ccb, mm7);
    } else {
      movq_m2r (mmx_zero, mm5);
      movq_m2r (mmx_zero, mm6);
      movq_m2r (mmx_zero, mm7);
    }

    if (add_ntsc_setup)
      paddusb_m2r (mmx_0x0010s, mm5);	/* add setup to hi clamp	*/

    for (row = 0; row < 8; ++row) { // Eight rows
      pwyuv = pyuv;
      for (i = 0; i < 4; ++i) {     // Four Y blocks
	dv_coeff_t *Ytmp = Y [i];   // less indexing in inner loop speedup?
	/* ---------------------------------------------------------------------
	 * interleave 4 luma samples with the first doubled Cb/Cr pair
	 */
	movq_m2r (*cb_frame, mm2);    // cb0 cb1 cb2 cb3
	movq_m2r (*cr_frame, mm3);    // cr0 cr1 cr2 cr3
	punpcklwd_r2r (mm3, mm2); // cb0cr0 cb1cr1
	movq_r2r (mm2, mm3);
	punpckldq_r2r (mm2, mm2); // cb0cr0 cb0cr0
	movq_m2r (Ytmp [0], mm0);
	movq_r2r (mm0, mm1);
	punpcklwd_r2r (mm2, mm0);
	punpckhwd_r2r (mm2, mm1);

	packsswb_r2r (mm1, mm0);
	paddb_m2r (mmx_0x8080s, mm0);	/* signed -> unsigned bias	*/
	paddusb_r2r (mm5, mm0);		/* clamp high		*/
	psubusb_r2r (mm6, mm0);		/* clamp low		*/
	paddusb_r2r (mm7, mm0);		/* to black level	*/
	movq_r2m (mm0, pwyuv [0]);

	/* ---------------------------------------------------------------------
	 * same for the next 4 luma samples with the second Cb/Cr pair
	 */
	movq_m2r (Ytmp [4], mm0);
	punpckhdq_r2r (mm3, mm3);
	movq_r2r (mm0, mm1);
	punpcklwd_r2r (mm3, mm0);
	punpckhwd_r2r (mm3, mm1);

	packsswb_r2r (mm1, mm0);
	paddb_m2r (mmx_0x8080s, mm0);
	paddusb_r2r (mm5, mm0);		/* clamp high		*/
	psubusb_r2r (mm6, mm0);		/* clamp low		*/
	paddusb_r2r (mm7, mm0);		/* to black level	*/
	movq_r2m (mm0, pwyuv [8]);

	pwyuv += 16;
	cr_frame += 2;
	cb_frame += 2;
	Y [i] = Ytmp + 8;
      } /* for i */
      pyuv += pitches[0];
    } /* for row */
    emms ();
} /* dv_mb411_YUY2_mmx */
408 
409 /* ----------------------------------------------------------------------------
410  */
/* MMX version of dv_mb411_right_YUY2: right-edge 4:1:1 macroblock
 * (2x2 Y-block arrangement) to packed YUY2.  mm5/mm6/mm7 hold the
 * clamp-high / clamp-low / black-level constants. */
void
dv_mb411_right_YUY2_mmx(dv_macroblock_t *mb, uint8_t **pixels, int *pitches,
                        int add_ntsc_setup, int clamp_luma, int clamp_chroma) {

  dv_coeff_t		*Y[4], *Ytmp, *cr_frame, *cb_frame;
  unsigned char	        *pyuv;
  int			j, row;

  Y[0] = mb->b[0].coeffs;
  Y[1] = mb->b[1].coeffs;
  Y[2] = mb->b[2].coeffs;
  Y[3] = mb->b[3].coeffs;

  /* two output bytes per pixel */
  pyuv = pixels[0] + (mb->x * 2) + (mb->y * pitches[0]);

  /* Select clamp constants: both planes, luma only, chroma only, none. */
  if (clamp_luma && clamp_chroma) {
    movq_m2r (mmx_cbh, mm5);
    movq_m2r (mmx_cbl, mm6);
    movq_m2r (mmx_cbb, mm7);
  } else if (clamp_luma) {
    movq_m2r (mmx_clh, mm5);
    movq_m2r (mmx_cll, mm6);
    movq_m2r (mmx_clb, mm7);
  } else if (clamp_chroma) {
    movq_m2r (mmx_cch, mm5);
    movq_m2r (mmx_ccl, mm6);
    movq_m2r (mmx_ccb, mm7);
  } else {
    movq_m2r (mmx_zero, mm5);
    movq_m2r (mmx_zero, mm6);
    movq_m2r (mmx_zero, mm7);
  }

  if (add_ntsc_setup)
    paddusb_m2r (mmx_0x0010s, mm5);	/* add setup to hi clamp	*/

  for (j = 0; j < 4; j += 2) { // Two rows of blocks
    cr_frame = mb->b[4].coeffs + (j * 2);
    cb_frame = mb->b[5].coeffs + (j * 2);

    for (row = 0; row < 8; row++) {

      /* build doubled Cb/Cr byte pairs for both block columns */
      movq_m2r(*cb_frame, mm0);
      packsswb_r2r(mm0,mm0);
      movq_m2r(*cr_frame, mm1);
      packsswb_r2r(mm1,mm1);
      punpcklbw_r2r(mm1,mm0);
      movq_r2r(mm0,mm1);

      punpcklwd_r2r(mm0,mm0); // pack doubled low cb and crs
      punpckhwd_r2r(mm1,mm1); // pack doubled high cb and crs

      Ytmp = Y[j];

      movq_m2r(Ytmp [0],mm2);
      movq_m2r(Ytmp [4],mm3);

      packsswb_r2r(mm3,mm2);  // pack Ys from signed 16-bit to signed 8-bit
      movq_r2r(mm2,mm3);

      punpcklbw_r2r (mm0, mm3); // interleave low Ys with crcbs
      paddb_m2r (mmx_0x8080s, mm3);	/* signed -> unsigned bias	*/
      paddusb_r2r (mm5, mm3);		/* clamp high		*/
      psubusb_r2r (mm6, mm3);		/* clamp low		*/
      paddusb_r2r (mm7, mm3);		/* to black level	*/
      movq_r2m (mm3, pyuv [0]);

      punpckhbw_r2r (mm0, mm2); // interleave high Ys with crcbs
      paddb_m2r (mmx_0x8080s, mm2);
      paddusb_r2r (mm5, mm2);		/* clamp high		*/
      psubusb_r2r (mm6, mm2);		/* clamp low		*/
      paddusb_r2r (mm7, mm2);		/* to black level	*/
      movq_r2m (mm2, pyuv [8]);

      Y[j] += 8;

      /* second block column, same treatment with the high chroma pairs */
      Ytmp = Y[j+1];

      movq_m2r(Ytmp [0],mm2);
      movq_m2r(Ytmp [4],mm3);

      packsswb_r2r(mm3,mm2); // pack Ys from signed 16-bit to signed 8-bit
      movq_r2r(mm2,mm3);

      punpcklbw_r2r(mm1,mm3); // interleave low Ys with crcbs
      paddb_m2r (mmx_0x8080s, mm3);
      paddusb_r2r (mm5, mm3);		/* clamp high		*/
      psubusb_r2r (mm6, mm3);		/* clamp low		*/
      paddusb_r2r (mm7, mm3);		/* to black level	*/
      movq_r2m(mm3, pyuv [16]);

      punpckhbw_r2r(mm1,mm2);  // interleave high Ys with crcbs
      paddb_m2r (mmx_0x8080s, mm2);
      paddusb_r2r (mm5, mm2);		/* clamp high		*/
      psubusb_r2r (mm6, mm2);		/* clamp low		*/
      paddusb_r2r (mm7, mm2);		/* to black level	*/
      movq_r2m (mm2, pyuv [24]);

      Y[j+1] += 8;
      cr_frame += 8;
      cb_frame += 8;

      pyuv += pitches[0];
    } /* for row */

  } /* for j */
  emms();
} /* dv_mb411_right_YUY2_mmx */
519 
520 /* ----------------------------------------------------------------------------
521  */
/* MMX version of dv_mb420_YUY2: one 4:2:0 macroblock to packed YUY2.
 * Two output lines (two scanlines apart) are produced per pass; the
 * partner line's luma sits 16 coefficients ahead in the block, mirroring
 * the scalar dv_mb420_YUY2 above.  mm5/mm6/mm7 hold the clamp-high /
 * clamp-low / black-level constants. */
void
dv_mb420_YUY2_mmx (dv_macroblock_t *mb, uint8_t **pixels, int *pitches,
                   int clamp_luma, int clamp_chroma) {
    dv_coeff_t		*Y [4], *Ytmp0, *cr_frame, *cb_frame;
    unsigned char	*pyuv,
			*pwyuv0, *pwyuv1;
    int			i, j, row, inc_l2, inc_l4;

  /* two output bytes per pixel */
  pyuv = pixels[0] + (mb->x * 2) + (mb->y * pitches[0]);

  Y [0] = mb->b[0].coeffs;
  Y [1] = mb->b[1].coeffs;
  Y [2] = mb->b[2].coeffs;
  Y [3] = mb->b[3].coeffs;
  cr_frame = mb->b[4].coeffs;
  cb_frame = mb->b[5].coeffs;
  inc_l2 = pitches[0];      /* one scanline */
  inc_l4 = pitches[0]*2;    /* two scanlines */

  /* Select clamp constants: both planes, luma only, chroma only, none. */
  if (clamp_luma && clamp_chroma) {
    movq_m2r (mmx_cbh, mm5);
    movq_m2r (mmx_cbl, mm6);
    movq_m2r (mmx_cbb, mm7);
  } else if (clamp_luma) {
    movq_m2r (mmx_clh, mm5);
    movq_m2r (mmx_cll, mm6);
    movq_m2r (mmx_clb, mm7);
  } else if (clamp_chroma) {
    movq_m2r (mmx_cch, mm5);
    movq_m2r (mmx_ccl, mm6);
    movq_m2r (mmx_ccb, mm7);
  } else {
    movq_m2r (mmx_zero, mm5);
    movq_m2r (mmx_zero, mm6);
    movq_m2r (mmx_zero, mm7);
  }

  for (j = 0; j < 4; j += 2) { // Two rows of blocks j, j+1
    for (row = 0; row < 4; row++) { // 4 pairs of two rows
      pwyuv0 = pyuv;
      pwyuv1 = pyuv + inc_l4;   /* partner line, two scanlines below */
      for (i = 0; i < 2; ++i) { // Two columns of blocks
        Ytmp0 = Y[j + i];

	/* -------------------------------------------------------------------
	 * first 4 pixels of both lines, low chroma pairs
	 */
	movq_m2r (*cb_frame, mm2);	/* mm2 = b1 b2 b3 b4	*/
	movq_m2r (*cr_frame, mm3);	/* mm3 = r1 r2 r3 r4	*/
	movq_r2r (mm2, mm4);		/* mm4 = b1 b2 b3 b4	*/
	punpcklwd_r2r (mm3, mm4);	/* mm4 = b3 r3 b4 r4	*/

	movq_m2r (Ytmp0[0], mm0);	/* mm0 = y1 y2 y3 y4	*/
	movq_r2r (mm0, mm1);

	punpcklwd_r2r (mm4, mm0);	/* mm0 = b4 y3 r4 y4	*/
	punpckhwd_r2r (mm4, mm1);	/* mm1 = b3 y1 r3 y2	*/

	packsswb_r2r (mm1, mm0);	/* mm0 = b3 y1 r3 y2 b4 y3 r4 y4	*/
	paddb_m2r (mmx_0x8080s, mm0);	/* signed -> unsigned bias	*/
	paddusb_r2r (mm5, mm0);		/* clamp high		*/
	psubusb_r2r (mm6, mm0);		/* clamp low		*/
	paddusb_r2r (mm7, mm0);		/* to black level	*/
	movq_r2m (mm0, pwyuv0[0]);

	/* partner line: luma 16 coefficients ahead, same chroma */
	movq_m2r (Ytmp0[8+8], mm0);
	movq_r2r (mm0, mm1);

	punpcklwd_r2r (mm4, mm0);
	punpckhwd_r2r (mm4, mm1);
	packsswb_r2r (mm1, mm0);
	paddb_m2r (mmx_0x8080s, mm0);
	paddusb_r2r (mm5, mm0);		/* clamp high		*/
	psubusb_r2r (mm6, mm0);		/* clamp low		*/
	paddusb_r2r (mm7, mm0);		/* to black level	*/
	movq_r2m (mm0, pwyuv1[0]);

	/* second 4 pixels of both lines, high chroma pairs */
	movq_r2r (mm2, mm4);
	punpckhwd_r2r (mm3, mm4);
	movq_m2r (Ytmp0[4], mm0);
	movq_r2r (mm0, mm1);
	punpcklwd_r2r (mm4, mm0);
	punpckhwd_r2r (mm4, mm1);
	packsswb_r2r (mm1, mm0);
	paddb_m2r (mmx_0x8080s, mm0);
	paddusb_r2r (mm5, mm0);		/* clamp high		*/
	psubusb_r2r (mm6, mm0);		/* clamp low		*/
	paddusb_r2r (mm7, mm0);		/* to black level	*/
	movq_r2m (mm0, pwyuv0[8]);

	movq_m2r (Ytmp0[8+12], mm0);
	movq_r2r (mm0, mm1);
	punpcklwd_r2r (mm4, mm0);
	punpckhwd_r2r (mm4, mm1);
	packsswb_r2r (mm1, mm0);
	paddb_m2r (mmx_0x8080s, mm0);
	paddusb_r2r (mm5, mm0);		/* clamp high		*/
	psubusb_r2r (mm6, mm0);		/* clamp low		*/
	paddusb_r2r (mm7, mm0);		/* to black level	*/
	movq_r2m (mm0, pwyuv1[8]);

        pwyuv0 += 16;
        pwyuv1 += 16;
        cb_frame += 4;
        cr_frame += 4;
        /* every second pass, also skip the row consumed via Ytmp0[16..] */
        if (row & 1) {
          Ytmp0 += 24;
        } else {
          Ytmp0 += 8;
        }
        Y[j + i] = Ytmp0;
      }
      pyuv += inc_l2;
      if (row & 1)
        pyuv += inc_l4;
    }
  }
  emms ();
}
640 
641 /* ----------------------------------------------------------------------------
642  * start of half high render functions
643  */
644 
645 /* ---------------------------------------------------------------------------
646  */
/* Half-height MMX version of dv_mb411_YUY2: renders only every other
 * luma row (Ytmp advances 16 per block row instead of 8) into a buffer
 * of half the vertical resolution. */
void
dv_mb411_YUY2_hh_mmx(dv_macroblock_t *mb, uint8_t **pixels, int *pitches,
                     int add_ntsc_setup, int clamp_luma, int clamp_chroma) {
    dv_coeff_t		*Y[4], *cr_frame, *cb_frame;
    unsigned char	*pyuv, *pwyuv;
    int			i, row;

    Y[0] = mb->b[0].coeffs;
    Y[1] = mb->b[1].coeffs;
    Y[2] = mb->b[2].coeffs;
    Y[3] = mb->b[3].coeffs;
    cr_frame = mb->b[4].coeffs;
    cb_frame = mb->b[5].coeffs;

    /* half-height target: vertical offset halved */
    pyuv = pixels[0] + (mb->x * 2) + ((mb->y * pitches[0]) / 2);

    /* Select clamp constants: both planes, luma only, chroma only, none. */
    if (clamp_luma && clamp_chroma) {
      movq_m2r (mmx_cbh, mm5);
      movq_m2r (mmx_cbl, mm6);
      movq_m2r (mmx_cbb, mm7);
    } else if (clamp_luma) {
      movq_m2r (mmx_clh, mm5);
      movq_m2r (mmx_cll, mm6);
      movq_m2r (mmx_clb, mm7);
    } else if (clamp_chroma) {
      movq_m2r (mmx_cch, mm5);
      movq_m2r (mmx_ccl, mm6);
      movq_m2r (mmx_ccb, mm7);
    } else {
      movq_m2r (mmx_zero, mm5);
      movq_m2r (mmx_zero, mm6);
      movq_m2r (mmx_zero, mm7);
    }

    if (add_ntsc_setup)
      paddusb_m2r (mmx_0x0010s, mm5);	/* add setup to hi clamp	*/

    for (row = 0; row < 4; ++row) { // Four rows (half height)
      pwyuv = pyuv;
      for (i = 0; i < 4; ++i) {     // Four Y blocks
	dv_coeff_t *Ytmp = Y [i];   // less indexing in inner loop speedup?
	/* ---------------------------------------------------------------------
	 * interleave 4 luma samples with the first doubled Cb/Cr pair
	 */
	movq_m2r (*cb_frame, mm2);	// cb0 cb1 cb2 cb3
	movq_m2r (*cr_frame, mm3);	// cr0 cr1 cr2 cr3
	punpcklwd_r2r (mm3, mm2);	// cb0cr0 cb1cr1
	movq_r2r (mm2, mm3);
	punpckldq_r2r (mm2, mm2);	// cb0cr0 cb0cr0
	movq_m2r (Ytmp [0], mm0);
	movq_r2r (mm0, mm1);
	punpcklwd_r2r (mm2, mm0);	/* mm0 = b4 y3 r4 y4	*/
	punpckhwd_r2r (mm2, mm1);	/* mm1 = b3 y1 r3 y2	*/

	packsswb_r2r (mm1, mm0);	/* mm0 = b3 y1 r3 y2 b4 y3 r4 y4	*/
	paddb_m2r (mmx_0x8080s, mm0);	/* signed -> unsigned bias	*/
	paddusb_r2r (mm5, mm0);		/* clamp high		*/
	psubusb_r2r (mm6, mm0);		/* clamp low		*/
	paddusb_r2r (mm7, mm0);		/* to black level	*/
	movq_r2m (mm0, pwyuv [0]);

	/* ---------------------------------------------------------------------
	 * same for the next 4 luma samples with the second Cb/Cr pair
	 */
	movq_m2r (Ytmp [4], mm0);
	punpckhdq_r2r (mm3, mm3);
	movq_r2r (mm0, mm1);
	punpcklwd_r2r (mm3, mm0);	/* mm0 = b4 y3 r4 y4	*/
	punpckhwd_r2r (mm3, mm1);	/* mm1 = b3 y1 r3 y2	*/

	packsswb_r2r (mm1, mm0);	/* mm0 = b3 y1 r3 y2 b4 y3 r4 y4	*/
	paddb_m2r (mmx_0x8080s, mm0);
	paddusb_r2r (mm5, mm0);		/* clamp high		*/
	psubusb_r2r (mm6, mm0);		/* clamp low		*/
	paddusb_r2r (mm7, mm0);		/* to black level	*/
	movq_r2m (mm0, pwyuv [8]);

	pwyuv += 16;

	cr_frame += 2;
	cb_frame += 2;
	Y [i] = Ytmp + 16;	/* skip a full luma row: half height */
      } /* for i */
      /* skip the chroma of the dropped luma row */
      cr_frame += 8;
      cb_frame += 8;
      pyuv += pitches[0];
    } /* for row */
    emms ();
} /* dv_mb411_YUY2_hh_mmx */
734 
735 /* ----------------------------------------------------------------------------
736  */
/* Half-height MMX version of dv_mb411_right_YUY2: right-edge 4:1:1
 * macroblock rendered at half vertical resolution (Y and chroma
 * pointers advance twice as fast; only 4 rows are emitted). */
void
dv_mb411_right_YUY2_hh_mmx(dv_macroblock_t *mb, uint8_t **pixels, int *pitches,
                           int add_ntsc_setup, int clamp_luma, int clamp_chroma) {

  dv_coeff_t		*Y[4], *Ytmp, *cr_frame, *cb_frame;
  unsigned char	        *pyuv;
  int			j, row;

  Y[0] = mb->b[0].coeffs;
  Y[1] = mb->b[1].coeffs;
  Y[2] = mb->b[2].coeffs;
  Y[3] = mb->b[3].coeffs;

  /* half-height target: vertical offset halved */
  pyuv = pixels[0] + (mb->x * 2) + (mb->y * pitches[0]) / 2;

  /* Select clamp constants: both planes, luma only, chroma only, none. */
  if (clamp_luma && clamp_chroma) {
    movq_m2r (mmx_cbh, mm5);
    movq_m2r (mmx_cbl, mm6);
    movq_m2r (mmx_cbb, mm7);
  } else if (clamp_luma) {
    movq_m2r (mmx_clh, mm5);
    movq_m2r (mmx_cll, mm6);
    movq_m2r (mmx_clb, mm7);
  } else if (clamp_chroma) {
    movq_m2r (mmx_cch, mm5);
    movq_m2r (mmx_ccl, mm6);
    movq_m2r (mmx_ccb, mm7);
  } else {
    movq_m2r (mmx_zero, mm5);
    movq_m2r (mmx_zero, mm6);
    movq_m2r (mmx_zero, mm7);
  }

  if (add_ntsc_setup)
    paddusb_m2r (mmx_0x0010s, mm5);	/* add setup to hi clamp	*/

  for (j = 0; j < 4; j += 2) { // Two rows of blocks
    cr_frame = mb->b[4].coeffs + (j * 2);
    cb_frame = mb->b[5].coeffs + (j * 2);

    for (row = 0; row < 4; row++) {

      /* build doubled Cb/Cr byte pairs for both block columns */
      movq_m2r(*cb_frame, mm0);
      packsswb_r2r(mm0,mm0);
      movq_m2r(*cr_frame, mm1);
      packsswb_r2r(mm1,mm1);
      punpcklbw_r2r(mm1,mm0);
      movq_r2r(mm0,mm1);

      punpcklwd_r2r(mm0,mm0); // pack doubled low cb and crs
      punpckhwd_r2r(mm1,mm1); // pack doubled high cb and crs

      Ytmp = Y[j];

      movq_m2r(Ytmp [0],mm2);
      movq_m2r(Ytmp [4],mm3);

      packsswb_r2r(mm3,mm2);  // pack Ys from signed 16-bit to signed 8-bit
      movq_r2r(mm2,mm3);

      punpcklbw_r2r (mm0, mm3); // interleave low Ys with crcbs
      paddb_m2r (mmx_0x8080s, mm3);	/* signed -> unsigned bias	*/
      paddusb_r2r (mm5, mm3);		/* clamp high		*/
      psubusb_r2r (mm6, mm3);		/* clamp low		*/
      paddusb_r2r (mm7, mm3);		/* to black level	*/
      movq_r2m (mm3, pyuv [0]);

      punpckhbw_r2r (mm0, mm2); // interleave high Ys with crcbs
      paddb_m2r (mmx_0x8080s, mm2);
      paddusb_r2r (mm5, mm2);		/* clamp high		*/
      psubusb_r2r (mm6, mm2);		/* clamp low		*/
      paddusb_r2r (mm7, mm2);		/* to black level	*/
      movq_r2m (mm2, pyuv [8]);

      Y[j] += 16;	/* skip a full luma row: half height */

      /* second block column, same treatment with the high chroma pairs */
      Ytmp = Y[j+1];

      movq_m2r(Ytmp [0],mm2);
      movq_m2r(Ytmp [4],mm3);

      packsswb_r2r(mm3,mm2); // pack Ys from signed 16-bit to signed 8-bit
      movq_r2r(mm2,mm3);

      punpcklbw_r2r(mm1,mm3); // interleave low Ys with crcbs
      paddb_m2r (mmx_0x8080s, mm3);
      paddusb_r2r (mm5, mm3);		/* clamp high		*/
      psubusb_r2r (mm6, mm3);		/* clamp low		*/
      paddusb_r2r (mm7, mm3);		/* to black level	*/
      movq_r2m(mm3, pyuv [16]);

      punpckhbw_r2r(mm1,mm2);  // interleave high Ys with crcbs
      paddb_m2r (mmx_0x8080s, mm2);
      paddusb_r2r (mm5, mm2);		/* clamp high		*/
      psubusb_r2r (mm6, mm2);		/* clamp low		*/
      paddusb_r2r (mm7, mm2);		/* to black level	*/
      movq_r2m (mm2, pyuv [24]);

      Y[j+1] += 16;
      cr_frame += 16;	/* skip the dropped row's chroma as well */
      cb_frame += 16;

      pyuv += pitches[0];
    } /* for row */

  } /* for j */
  emms();
} /* dv_mb411_right_YUY2_hh_mmx */
845 
846 /* ---------------------------------------------------------------------------
847  */
/* Half-height MMX version of dv_mb420_YUY2: one 4:2:0 macroblock at
 * half vertical resolution — only one output line per pass, luma rows
 * skipped via Ytmp0 + 16 advance.  mm5/mm6/mm7 hold the clamp-high /
 * clamp-low / black-level constants. */
void
dv_mb420_YUY2_hh_mmx (dv_macroblock_t *mb, uint8_t **pixels, int *pitches,
                   int clamp_luma, int clamp_chroma) {
    dv_coeff_t		*Y [4], *Ytmp0, *cr_frame, *cb_frame;
    unsigned char	*pyuv,
			*pwyuv0;
    int			i, j, row, inc_l2;

  /* half-height target: vertical offset halved */
  pyuv = pixels[0] + (mb->x * 2) + (mb->y * pitches[0]) / 2;

  Y [0] = mb->b[0].coeffs;
  Y [1] = mb->b[1].coeffs;
  Y [2] = mb->b[2].coeffs;
  Y [3] = mb->b[3].coeffs;
  cr_frame = mb->b[4].coeffs;
  cb_frame = mb->b[5].coeffs;
  inc_l2 = pitches[0];

  /* Select clamp constants: both planes, luma only, chroma only, none. */
  if (clamp_luma && clamp_chroma) {
    movq_m2r (mmx_cbh, mm5);
    movq_m2r (mmx_cbl, mm6);
    movq_m2r (mmx_cbb, mm7);
  } else if (clamp_luma) {
    movq_m2r (mmx_clh, mm5);
    movq_m2r (mmx_cll, mm6);
    movq_m2r (mmx_clb, mm7);
  } else if (clamp_chroma) {
    movq_m2r (mmx_cch, mm5);
    movq_m2r (mmx_ccl, mm6);
    movq_m2r (mmx_ccb, mm7);
  } else {
    movq_m2r (mmx_zero, mm5);
    movq_m2r (mmx_zero, mm6);
    movq_m2r (mmx_zero, mm7);
  }

  for (j = 0; j < 4; j += 2) { // Two rows of blocks j, j+1
    for (row = 0; row < 4; row++) { // 4 pairs of two rows
      pwyuv0 = pyuv;
      for (i = 0; i < 2; ++i) { // Two columns of blocks
        Ytmp0 = Y[j + i];

	/* -------------------------------------------------------------------
	 * first 4 pixels, low chroma pairs
	 */
	movq_m2r (*cb_frame, mm2);	/* mm2 = b1 b2 b3 b4	*/
	movq_m2r (*cr_frame, mm3);	/* mm3 = r1 r2 r3 r4	*/
	movq_r2r (mm2, mm4);		/* mm4 = b1 b2 b3 b4	*/
	punpcklwd_r2r (mm3, mm4);	/* mm4 = b3 r3 b4 r4	*/

	movq_m2r (Ytmp0[0], mm0);	/* mm0 = y1 y2 y3 y4	*/
	movq_r2r (mm0, mm1);

	punpcklwd_r2r (mm4, mm0);	/* mm0 = b4 y3 r4 y4	*/
	punpckhwd_r2r (mm4, mm1);	/* mm1 = b3 y1 r3 y2	*/

	packsswb_r2r (mm1, mm0);	/* mm0 = b3 y1 r3 y2 b4 y3 r4 y4	*/
	paddb_m2r (mmx_0x8080s, mm0);	/* signed -> unsigned bias	*/
	paddusb_r2r (mm5, mm0);		/* clamp high		*/
	psubusb_r2r (mm6, mm0);		/* clamp low		*/
	paddusb_r2r (mm7, mm0);		/* to black level	*/
	movq_r2m (mm0, pwyuv0[0]);

	/* second 4 pixels, high chroma pairs */
	movq_r2r (mm2, mm4);
	punpckhwd_r2r (mm3, mm4);
	movq_m2r (Ytmp0[4], mm0);
	movq_r2r (mm0, mm1);
	punpcklwd_r2r (mm4, mm0);	/* mm0 = b4 y3 r4 y4	*/
	punpckhwd_r2r (mm4, mm1);	/* mm1 = b3 y1 r3 y2	*/
	packsswb_r2r (mm1, mm0);	/* mm0 = b3 y1 r3 y2 b4 y3 r4 y4	*/
	paddb_m2r (mmx_0x8080s, mm0);
	paddusb_r2r (mm5, mm0);		/* clamp high		*/
	psubusb_r2r (mm6, mm0);		/* clamp low		*/
	paddusb_r2r (mm7, mm0);		/* to black level	*/
	movq_r2m (mm0, pwyuv0[8]);

        pwyuv0 += 16;
        cb_frame += 4;
        cr_frame += 4;
        Y[j + i] = Ytmp0 + 16;	/* skip a full luma row: half height */
      }
      /* ---------------------------------------------------------------------
       * for odd value of row counter (this is NOT an odd line number)
       * we have to go one additional step forward, and for even value
       * we have to go one step back to use the color information again.
       * Assuming that chroma information is fields based.
       */
      if (row & 1) {
        cb_frame += 8;
        cr_frame += 8;
      } else {
        cb_frame -= 8;
        cr_frame -= 8;
      }
      pyuv += inc_l2;
    }
  }
  emms ();
}
946 #endif // ARCH_X86 || ARCH_X86_64
947