1 /*****************************************************************************
2 * i420_yuy2.c : YUV to YUV conversion module for vlc
3 *****************************************************************************
4 * Copyright (C) 2000, 2001 VLC authors and VideoLAN
5 * $Id: b1a0359fc9b237cd1ce7d6ba87e155396348afdd $
6 *
7 * Authors: Samuel Hocevar <sam@zoy.org>
8 * Damien Fouilleul <damien@videolan.org>
9 *
10 * This program is free software; you can redistribute it and/or modify it
11 * under the terms of the GNU Lesser General Public License as published by
12 * the Free Software Foundation; either version 2.1 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public License
21 * along with this program; if not, write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23 *****************************************************************************/
24
25 /*****************************************************************************
26 * Preamble
27 *****************************************************************************/
28
29 #ifdef HAVE_CONFIG_H
30 # include "config.h"
31 #endif
32
33 #include <vlc_common.h>
34 #include <vlc_plugin.h>
35 #include <vlc_filter.h>
36 #include <vlc_picture.h>
37 #include <vlc_cpu.h>
38
39 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
40 # undef bool
41 # include <altivec.h>
42 # define bool _Bool
43 #endif
44
45 #include "i420_yuy2.h"
46
/* Input chromas named in the module description strings. */
#define SRC_FOURCC  "I420,IYUV,YV12"

/*
 * Per-flavour settings, keyed on the MODULE_NAME_IS_* symbol of the build:
 *  - DEST_FOURCC: output chromas named in the module description strings;
 *  - VLC_TARGET:  token placed in front of the conversion functions
 *                 (empty for the plain C and AltiVec flavours).
 */
#if defined (MODULE_NAME_IS_i420_yuy2)
#    define DEST_FOURCC  "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,Y211"
#    define VLC_TARGET
#elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
#    define DEST_FOURCC  "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV"
#    define VLC_TARGET   VLC_MMX
#elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
#    define DEST_FOURCC  "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV"
#    define VLC_TARGET   VLC_SSE
#elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
#    define DEST_FOURCC  "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
#    define VLC_TARGET
#endif
62
63 /*****************************************************************************
64 * Local and extern prototypes.
65 *****************************************************************************/
66 static int Activate ( vlc_object_t * );
67
68 static void I420_YUY2 ( filter_t *, picture_t *, picture_t * );
69 static void I420_YVYU ( filter_t *, picture_t *, picture_t * );
70 static void I420_UYVY ( filter_t *, picture_t *, picture_t * );
71 static picture_t *I420_YUY2_Filter ( filter_t *, picture_t * );
72 static picture_t *I420_YVYU_Filter ( filter_t *, picture_t * );
73 static picture_t *I420_UYVY_Filter ( filter_t *, picture_t * );
74 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
75 static void I420_IUYV ( filter_t *, picture_t *, picture_t * );
76 static picture_t *I420_IUYV_Filter ( filter_t *, picture_t * );
77 #endif
78 #if defined (MODULE_NAME_IS_i420_yuy2)
79 static void I420_Y211 ( filter_t *, picture_t *, picture_t * );
80 static picture_t *I420_Y211_Filter ( filter_t *, picture_t * );
81 #endif
82
83 /*****************************************************************************
84 * Module descriptor.
85 *****************************************************************************/
86 vlc_module_begin ()
87 #if defined (MODULE_NAME_IS_i420_yuy2)
88 set_description( N_("Conversions from " SRC_FOURCC " to " DEST_FOURCC) )
89 set_capability( "video converter", 80 )
90 # define vlc_CPU_capable() (true)
91 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
92 set_description( N_("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) )
93 set_capability( "video converter", 160 )
94 # define vlc_CPU_capable() vlc_CPU_MMX()
95 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
96 set_description( N_("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) )
97 set_capability( "video converter", 250 )
98 # define vlc_CPU_capable() vlc_CPU_SSE2()
99 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
100 set_description(
101 _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
102 set_capability( "video converter", 250 )
103 # define vlc_CPU_capable() vlc_CPU_ALTIVEC()
104 #endif
set_callbacks(Activate,NULL)105 set_callbacks( Activate, NULL )
106 vlc_module_end ()
107
108 /*****************************************************************************
109 * Activate: allocate a chroma function
110 *****************************************************************************
111 * This function allocates and initializes a chroma function
112 *****************************************************************************/
113 static int Activate( vlc_object_t *p_this )
114 {
115 filter_t *p_filter = (filter_t *)p_this;
116
117 if( !vlc_CPU_capable() )
118 return VLC_EGENERIC;
119 if( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) & 1
120 || (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) & 1 )
121 {
122 return -1;
123 }
124
125 if( p_filter->fmt_in.video.i_width != p_filter->fmt_out.video.i_width
126 || p_filter->fmt_in.video.i_height != p_filter->fmt_out.video.i_height
127 || p_filter->fmt_in.video.orientation != p_filter->fmt_out.video.orientation )
128 return -1;
129
130 switch( p_filter->fmt_in.video.i_chroma )
131 {
132 // case VLC_CODEC_YV12: FIXME invert U and V in the filters :)
133 case VLC_CODEC_I420:
134 switch( p_filter->fmt_out.video.i_chroma )
135 {
136 case VLC_CODEC_YUYV:
137 p_filter->pf_video_filter = I420_YUY2_Filter;
138 break;
139
140 case VLC_CODEC_YVYU:
141 p_filter->pf_video_filter = I420_YVYU_Filter;
142 break;
143
144 case VLC_CODEC_UYVY:
145 p_filter->pf_video_filter = I420_UYVY_Filter;
146 break;
147 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
148 case VLC_FOURCC('I','U','Y','V'):
149 p_filter->pf_video_filter = I420_IUYV_Filter;
150 break;
151 #endif
152
153 #if defined (MODULE_NAME_IS_i420_yuy2)
154 case VLC_CODEC_Y211:
155 p_filter->pf_video_filter = I420_Y211_Filter;
156 break;
157 #endif
158
159 default:
160 return -1;
161 }
162 break;
163
164 default:
165 return -1;
166 }
167
168 return 0;
169 }
170
#if 0
/* Debug helper, compiled out: returns the x86 time-stamp counter.
 * RDTSC leaves the low half in EAX and the high half in EDX; the two
 * halves are read through separate "=a"/"=d" outputs because the "A"
 * constraint does not describe the EDX:EAX pair on x86-64. */
static inline unsigned long long read_cycles(void)
{
    unsigned int lo, hi;

    __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi));

    return ((unsigned long long)hi << 32) | lo;
}
#endif
180
181 /* Following functions are local */
182
183 VIDEO_FILTER_WRAPPER( I420_YUY2 )
VIDEO_FILTER_WRAPPER(I420_YVYU)184 VIDEO_FILTER_WRAPPER( I420_YVYU )
185 VIDEO_FILTER_WRAPPER( I420_UYVY )
186 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
187 VIDEO_FILTER_WRAPPER( I420_IUYV )
188 #endif
189 #if defined (MODULE_NAME_IS_i420_yuy2)
190 VIDEO_FILTER_WRAPPER( I420_Y211 )
191 #endif
192
193 /*****************************************************************************
194 * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
195 *****************************************************************************/
196 VLC_TARGET
197 static void I420_YUY2( filter_t *p_filter, picture_t *p_source,
198 picture_t *p_dest )
199 {
200 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
201 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
202 uint8_t *p_u = p_source->U_PIXELS;
203 uint8_t *p_v = p_source->V_PIXELS;
204
205 int i_x, i_y;
206
207 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
208 #define VEC_NEXT_LINES( ) \
209 p_line1 = p_line2; \
210 p_line2 += p_dest->p->i_pitch; \
211 p_y1 = p_y2; \
212 p_y2 += p_source->p[Y_PLANE].i_pitch;
213
214 #define VEC_LOAD_UV( ) \
215 u_vec = vec_ld( 0, p_u ); p_u += 16; \
216 v_vec = vec_ld( 0, p_v ); p_v += 16;
217
218 #define VEC_MERGE( a ) \
219 uv_vec = a( u_vec, v_vec ); \
220 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
221 vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
222 vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
223 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
224 vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
225 vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
226
227 vector unsigned char u_vec;
228 vector unsigned char v_vec;
229 vector unsigned char uv_vec;
230 vector unsigned char y_vec;
231
232 if( !( ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 32 ) |
233 ( (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) % 2 ) ) )
234 {
235 /* Width is a multiple of 32, we take 2 lines at a time */
236 for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
237 {
238 VEC_NEXT_LINES( );
239 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
240 {
241 VEC_LOAD_UV( );
242 VEC_MERGE( vec_mergeh );
243 VEC_MERGE( vec_mergel );
244 }
245 }
246 }
247 #warning FIXME: converting widths % 16 but !widths % 32 is broken on altivec
248 #if 0
249 else if( !( ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 16 ) |
250 ( (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) % 4 ) ) )
251 {
252 /* Width is only a multiple of 16, we take 4 lines at a time */
253 for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 4 ; i_y-- ; )
254 {
255 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
256 VEC_NEXT_LINES( );
257 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
258 {
259 VEC_LOAD_UV( );
260 VEC_MERGE( vec_mergeh );
261 VEC_MERGE( vec_mergel );
262 }
263
264 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
265 VEC_LOAD_UV( );
266 VEC_MERGE( vec_mergeh );
267
268 /* Line 3 and 4, pixels 0 to 16 */
269 VEC_NEXT_LINES( );
270 VEC_MERGE( vec_mergel );
271
272 /* Line 3 and 4, pixels 16 to ( width ) */
273 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
274 {
275 VEC_LOAD_UV( );
276 VEC_MERGE( vec_mergeh );
277 VEC_MERGE( vec_mergel );
278 }
279 }
280 }
281 #endif
282 else
283 {
284 /* Crap, use the C version */
285 #undef VEC_NEXT_LINES
286 #undef VEC_LOAD_UV
287 #undef VEC_MERGE
288 #endif
289
290 const int i_source_margin = p_source->p[0].i_pitch
291 - p_source->p[0].i_visible_pitch
292 - p_filter->fmt_in.video.i_x_offset;
293 const int i_source_margin_c = p_source->p[1].i_pitch
294 - p_source->p[1].i_visible_pitch
295 - ( p_filter->fmt_in.video.i_x_offset / 2 );
296 const int i_dest_margin = p_dest->p->i_pitch
297 - p_dest->p->i_visible_pitch
298 - ( p_filter->fmt_out.video.i_x_offset * 2 );
299
300 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
301 for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
302 {
303 p_line1 = p_line2;
304 p_line2 += p_dest->p->i_pitch;
305
306 p_y1 = p_y2;
307 p_y2 += p_source->p[Y_PLANE].i_pitch;
308
309 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
310 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8; i_x-- ; )
311 {
312 C_YUV420_YUYV( );
313 C_YUV420_YUYV( );
314 C_YUV420_YUYV( );
315 C_YUV420_YUYV( );
316 }
317 #else
318 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8 ; i_x-- ; )
319 {
320 MMX_CALL( MMX_YUV420_YUYV );
321 }
322 #endif
323 for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 8 ) / 2; i_x-- ; )
324 {
325 C_YUV420_YUYV( );
326 }
327
328 p_y2 += i_source_margin;
329 p_u += i_source_margin_c;
330 p_v += i_source_margin_c;
331 p_line2 += i_dest_margin;
332 }
333
334 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
335 /* re-enable FPU registers */
336 MMX_END;
337 #endif
338
339 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
340 }
341 #endif
342
343 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
344 /*
345 ** SSE2 128 bits fetch/store instructions are faster
346 ** if memory access is 16 bytes aligned
347 */
348
349 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
350 ((intptr_t)p_line2|(intptr_t)p_y2))) )
351 {
352 /* use faster SSE2 aligned fetch and store */
353 for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
354 {
355 p_line1 = p_line2;
356 p_line2 += p_dest->p->i_pitch;
357
358 p_y1 = p_y2;
359 p_y2 += p_source->p[Y_PLANE].i_pitch;
360
361 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16 ; i_x-- ; )
362 {
363 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
364 }
365 for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 16 ) / 2; i_x-- ; )
366 {
367 C_YUV420_YUYV( );
368 }
369
370 p_y2 += i_source_margin;
371 p_u += i_source_margin_c;
372 p_v += i_source_margin_c;
373 p_line2 += i_dest_margin;
374 }
375 }
376 else
377 {
378 /* use slower SSE2 unaligned fetch and store */
379 for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
380 {
381 p_line1 = p_line2;
382 p_line2 += p_dest->p->i_pitch;
383
384 p_y1 = p_y2;
385 p_y2 += p_source->p[Y_PLANE].i_pitch;
386
387 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16 ; i_x-- ; )
388 {
389 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
390 }
391 for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 16 ) / 2; i_x-- ; )
392 {
393 C_YUV420_YUYV( );
394 }
395
396 p_y2 += i_source_margin;
397 p_u += i_source_margin_c;
398 p_v += i_source_margin_c;
399 p_line2 += i_dest_margin;
400 }
401 }
402 /* make sure all SSE2 stores are visible thereafter */
403 SSE2_END;
404
405 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
406 }
407
408 /*****************************************************************************
409 * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
410 *****************************************************************************/
411 VLC_TARGET
I420_YVYU(filter_t * p_filter,picture_t * p_source,picture_t * p_dest)412 static void I420_YVYU( filter_t *p_filter, picture_t *p_source,
413 picture_t *p_dest )
414 {
415 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
416 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
417 uint8_t *p_u = p_source->U_PIXELS;
418 uint8_t *p_v = p_source->V_PIXELS;
419
420 int i_x, i_y;
421
422 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
423 #define VEC_NEXT_LINES( ) \
424 p_line1 = p_line2; \
425 p_line2 += p_dest->p->i_pitch; \
426 p_y1 = p_y2; \
427 p_y2 += p_source->p[Y_PLANE].i_pitch;
428
429 #define VEC_LOAD_UV( ) \
430 u_vec = vec_ld( 0, p_u ); p_u += 16; \
431 v_vec = vec_ld( 0, p_v ); p_v += 16;
432
433 #define VEC_MERGE( a ) \
434 vu_vec = a( v_vec, u_vec ); \
435 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
436 vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
437 vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
438 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
439 vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
440 vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
441
442 vector unsigned char u_vec;
443 vector unsigned char v_vec;
444 vector unsigned char vu_vec;
445 vector unsigned char y_vec;
446
447 if( !( ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 32 ) |
448 ( (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) % 2 ) ) )
449 {
450 /* Width is a multiple of 32, we take 2 lines at a time */
451 for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
452 {
453 VEC_NEXT_LINES( );
454 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
455 {
456 VEC_LOAD_UV( );
457 VEC_MERGE( vec_mergeh );
458 VEC_MERGE( vec_mergel );
459 }
460 }
461 }
462 else if( !( ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 16 ) |
463 ( (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) % 4 ) ) )
464 {
465 /* Width is only a multiple of 16, we take 4 lines at a time */
466 for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 4 ; i_y-- ; )
467 {
468 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
469 VEC_NEXT_LINES( );
470 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
471 {
472 VEC_LOAD_UV( );
473 VEC_MERGE( vec_mergeh );
474 VEC_MERGE( vec_mergel );
475 }
476
477 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
478 VEC_LOAD_UV( );
479 VEC_MERGE( vec_mergeh );
480
481 /* Line 3 and 4, pixels 0 to 16 */
482 VEC_NEXT_LINES( );
483 VEC_MERGE( vec_mergel );
484
485 /* Line 3 and 4, pixels 16 to ( width ) */
486 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
487 {
488 VEC_LOAD_UV( );
489 VEC_MERGE( vec_mergeh );
490 VEC_MERGE( vec_mergel );
491 }
492 }
493 }
494 else
495 {
496 /* Crap, use the C version */
497 #undef VEC_NEXT_LINES
498 #undef VEC_LOAD_UV
499 #undef VEC_MERGE
500 #endif
501
502 const int i_source_margin = p_source->p[0].i_pitch
503 - p_source->p[0].i_visible_pitch
504 - p_filter->fmt_in.video.i_x_offset;
505 const int i_source_margin_c = p_source->p[1].i_pitch
506 - p_source->p[1].i_visible_pitch
507 - ( p_filter->fmt_in.video.i_x_offset / 2 );
508 const int i_dest_margin = p_dest->p->i_pitch
509 - p_dest->p->i_visible_pitch
510 - ( p_filter->fmt_out.video.i_x_offset * 2 );
511
512 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
513 for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
514 {
515 p_line1 = p_line2;
516 p_line2 += p_dest->p->i_pitch;
517
518 p_y1 = p_y2;
519 p_y2 += p_source->p[Y_PLANE].i_pitch;
520
521 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8 ; i_x-- ; )
522 {
523 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
524 C_YUV420_YVYU( );
525 C_YUV420_YVYU( );
526 C_YUV420_YVYU( );
527 C_YUV420_YVYU( );
528 #else
529 MMX_CALL( MMX_YUV420_YVYU );
530 #endif
531 }
532 for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 8 ) / 2; i_x-- ; )
533 {
534 C_YUV420_YVYU( );
535 }
536
537 p_y1 += i_source_margin;
538 p_y2 += i_source_margin;
539 p_u += i_source_margin_c;
540 p_v += i_source_margin_c;
541 p_line1 += i_dest_margin;
542 p_line2 += i_dest_margin;
543 }
544
545 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
546 /* re-enable FPU registers */
547 MMX_END;
548 #endif
549
550 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
551 }
552 #endif
553
554 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
555 /*
556 ** SSE2 128 bits fetch/store instructions are faster
557 ** if memory access is 16 bytes aligned
558 */
559 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
560 ((intptr_t)p_line2|(intptr_t)p_y2))) )
561 {
562 /* use faster SSE2 aligned fetch and store */
563 for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
564 {
565 p_line1 = p_line2;
566 p_line2 += p_dest->p->i_pitch;
567
568 p_y1 = p_y2;
569 p_y2 += p_source->p[Y_PLANE].i_pitch;
570
571 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16 ; i_x-- ; )
572 {
573 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
574 }
575 for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 16 ) / 2; i_x-- ; )
576 {
577 C_YUV420_YVYU( );
578 }
579
580 p_y1 += i_source_margin;
581 p_y2 += i_source_margin;
582 p_u += i_source_margin_c;
583 p_v += i_source_margin_c;
584 p_line1 += i_dest_margin;
585 p_line2 += i_dest_margin;
586 }
587 }
588 else
589 {
590 /* use slower SSE2 unaligned fetch and store */
591 for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
592 {
593 p_line1 = p_line2;
594 p_line2 += p_dest->p->i_pitch;
595
596 p_y1 = p_y2;
597 p_y2 += p_source->p[Y_PLANE].i_pitch;
598
599 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16 ; i_x-- ; )
600 {
601 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
602 }
603 for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 16 ) / 2; i_x-- ; )
604 {
605 C_YUV420_YVYU( );
606 }
607
608 p_y1 += i_source_margin;
609 p_y2 += i_source_margin;
610 p_u += i_source_margin_c;
611 p_v += i_source_margin_c;
612 p_line1 += i_dest_margin;
613 p_line2 += i_dest_margin;
614 }
615 }
616 /* make sure all SSE2 stores are visible thereafter */
617 SSE2_END;
618 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
619 }
620
621 /*****************************************************************************
622 * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
623 *****************************************************************************/
624 VLC_TARGET
I420_UYVY(filter_t * p_filter,picture_t * p_source,picture_t * p_dest)625 static void I420_UYVY( filter_t *p_filter, picture_t *p_source,
626 picture_t *p_dest )
627 {
628 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
629 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
630 uint8_t *p_u = p_source->U_PIXELS;
631 uint8_t *p_v = p_source->V_PIXELS;
632
633 int i_x, i_y;
634
635 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
636 #define VEC_NEXT_LINES( ) \
637 p_line1 = p_line2; \
638 p_line2 += p_dest->p->i_pitch; \
639 p_y1 = p_y2; \
640 p_y2 += p_source->p[Y_PLANE].i_pitch;
641
642 #define VEC_LOAD_UV( ) \
643 u_vec = vec_ld( 0, p_u ); p_u += 16; \
644 v_vec = vec_ld( 0, p_v ); p_v += 16;
645
646 #define VEC_MERGE( a ) \
647 uv_vec = a( u_vec, v_vec ); \
648 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
649 vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
650 vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
651 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
652 vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
653 vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
654
655 vector unsigned char u_vec;
656 vector unsigned char v_vec;
657 vector unsigned char uv_vec;
658 vector unsigned char y_vec;
659
660 if( !( ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 32 ) |
661 ( (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) % 2 ) ) )
662 {
663 /* Width is a multiple of 32, we take 2 lines at a time */
664 for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
665 {
666 VEC_NEXT_LINES( );
667 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
668 {
669 VEC_LOAD_UV( );
670 VEC_MERGE( vec_mergeh );
671 VEC_MERGE( vec_mergel );
672 }
673 }
674 }
675 else if( !( ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 16 ) |
676 ( (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) % 4 ) ) )
677 {
678 /* Width is only a multiple of 16, we take 4 lines at a time */
679 for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 4 ; i_y-- ; )
680 {
681 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
682 VEC_NEXT_LINES( );
683 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
684 {
685 VEC_LOAD_UV( );
686 VEC_MERGE( vec_mergeh );
687 VEC_MERGE( vec_mergel );
688 }
689
690 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
691 VEC_LOAD_UV( );
692 VEC_MERGE( vec_mergeh );
693
694 /* Line 3 and 4, pixels 0 to 16 */
695 VEC_NEXT_LINES( );
696 VEC_MERGE( vec_mergel );
697
698 /* Line 3 and 4, pixels 16 to ( width ) */
699 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
700 {
701 VEC_LOAD_UV( );
702 VEC_MERGE( vec_mergeh );
703 VEC_MERGE( vec_mergel );
704 }
705 }
706 }
707 else
708 {
709 /* Crap, use the C version */
710 #undef VEC_NEXT_LINES
711 #undef VEC_LOAD_UV
712 #undef VEC_MERGE
713 #endif
714
715 const int i_source_margin = p_source->p[0].i_pitch
716 - p_source->p[0].i_visible_pitch
717 - p_filter->fmt_in.video.i_x_offset;
718 const int i_source_margin_c = p_source->p[1].i_pitch
719 - p_source->p[1].i_visible_pitch
720 - ( p_filter->fmt_in.video.i_x_offset / 2 );
721 const int i_dest_margin = p_dest->p->i_pitch
722 - p_dest->p->i_visible_pitch
723 - ( p_filter->fmt_out.video.i_x_offset * 2 );
724
725 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
726 for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
727 {
728 p_line1 = p_line2;
729 p_line2 += p_dest->p->i_pitch;
730
731 p_y1 = p_y2;
732 p_y2 += p_source->p[Y_PLANE].i_pitch;
733
734 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8 ; i_x-- ; )
735 {
736 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
737 C_YUV420_UYVY( );
738 C_YUV420_UYVY( );
739 C_YUV420_UYVY( );
740 C_YUV420_UYVY( );
741 #else
742 MMX_CALL( MMX_YUV420_UYVY );
743 #endif
744 }
745 for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 8 ) / 2; i_x--; )
746 {
747 C_YUV420_UYVY( );
748 }
749
750 p_y1 += i_source_margin;
751 p_y2 += i_source_margin;
752 p_u += i_source_margin_c;
753 p_v += i_source_margin_c;
754 p_line1 += i_dest_margin;
755 p_line2 += i_dest_margin;
756 }
757
758 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
759 /* re-enable FPU registers */
760 MMX_END;
761 #endif
762
763 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
764 }
765 #endif
766
767 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
768 /*
769 ** SSE2 128 bits fetch/store instructions are faster
770 ** if memory access is 16 bytes aligned
771 */
772 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
773 ((intptr_t)p_line2|(intptr_t)p_y2))) )
774 {
775 /* use faster SSE2 aligned fetch and store */
776 for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
777 {
778 p_line1 = p_line2;
779 p_line2 += p_dest->p->i_pitch;
780
781 p_y1 = p_y2;
782 p_y2 += p_source->p[Y_PLANE].i_pitch;
783
784 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16 ; i_x-- ; )
785 {
786 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
787 }
788 for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 16 ) / 2; i_x-- ; )
789 {
790 C_YUV420_UYVY( );
791 }
792
793 p_y1 += i_source_margin;
794 p_y2 += i_source_margin;
795 p_u += i_source_margin_c;
796 p_v += i_source_margin_c;
797 p_line1 += i_dest_margin;
798 p_line2 += i_dest_margin;
799 }
800 }
801 else
802 {
803 /* use slower SSE2 unaligned fetch and store */
804 for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
805 {
806 p_line1 = p_line2;
807 p_line2 += p_dest->p->i_pitch;
808
809 p_y1 = p_y2;
810 p_y2 += p_source->p[Y_PLANE].i_pitch;
811
812 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16 ; i_x-- ; )
813 {
814 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
815 }
816 for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 16 ) / 2; i_x-- ; )
817 {
818 C_YUV420_UYVY( );
819 }
820
821 p_y1 += i_source_margin;
822 p_y2 += i_source_margin;
823 p_u += i_source_margin_c;
824 p_v += i_source_margin_c;
825 p_line1 += i_dest_margin;
826 p_line2 += i_dest_margin;
827 }
828 }
829 /* make sure all SSE2 stores are visible thereafter */
830 SSE2_END;
831 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
832 }
833
834 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
835 /*****************************************************************************
836 * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
837 *****************************************************************************/
I420_IUYV(filter_t * p_filter,picture_t * p_source,picture_t * p_dest)838 static void I420_IUYV( filter_t *p_filter, picture_t *p_source,
839 picture_t *p_dest )
840 {
841 VLC_UNUSED(p_source); VLC_UNUSED(p_dest);
842 /* FIXME: TODO ! */
843 msg_Err( p_filter, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
844 }
845 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
846
/*****************************************************************************
 * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
 *****************************************************************************
 * Plain C flavour only.  Converts p_source into p_dest two rows at a time,
 * eight columns per inner pass (two C_YUV420_Y211 calls), over
 * (i_x_offset + i_visible_width) columns by
 * (i_y_offset + i_visible_height) rows of the input format.
 *
 * NOTE(review): unlike the other converters there is no loop for the
 * columns left when the width is not a multiple of 8 -- confirm whether
 * such widths can reach this function.
 *****************************************************************************/
#if defined (MODULE_NAME_IS_i420_yuy2)
static void I420_Y211( filter_t *p_filter, picture_t *p_source,
                                           picture_t *p_dest )
{
    /* C_YUV420_Y211 takes no arguments, so presumably it works on these
     * pointers by name.  Do not rename them. */
    uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
    uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

    int i_x, i_y;

    /* Columns and rows walked. */
    const unsigned i_width  = p_filter->fmt_in.video.i_x_offset
                            + p_filter->fmt_in.video.i_visible_width;
    const unsigned i_height = p_filter->fmt_in.video.i_y_offset
                            + p_filter->fmt_in.video.i_visible_height;

    /* Per-row corrections applied after each pair of rows, on top of what
     * the conversion macro already advanced. */
    const int i_source_margin = p_source->p[0].i_pitch
                                 - p_source->p[0].i_visible_pitch
                                 - p_filter->fmt_in.video.i_x_offset;
    const int i_source_margin_c = p_source->p[1].i_pitch
                                 - p_source->p[1].i_visible_pitch
                                 - ( p_filter->fmt_in.video.i_x_offset / 2 );
    const int i_dest_margin = p_dest->p->i_pitch
                               - p_dest->p->i_visible_pitch
                               - ( p_filter->fmt_out.video.i_x_offset * 2 );

    for( i_y = i_height / 2 ; i_y-- ; )
    {
        /* Rows of this pass: line1/y1 start where line2/y2 stand,
         * line2/y2 move one pitch further. */
        p_line1 = p_line2;
        p_line2 += p_dest->p->i_pitch;

        p_y1 = p_y2;
        p_y2 += p_source->p[Y_PLANE].i_pitch;

        for( i_x = i_width / 8 ; i_x-- ; )
        {
            C_YUV420_Y211( );
            C_YUV420_Y211( );
        }

        p_y1 += i_source_margin;
        p_y2 += i_source_margin;
        p_u += i_source_margin_c;
        p_v += i_source_margin_c;
        p_line1 += i_dest_margin;
        p_line2 += i_dest_margin;
    }
}
#endif
894