1 /*****************************************************************************
2  * i420_yuy2.c : YUV to YUV conversion module for vlc
3  *****************************************************************************
4  * Copyright (C) 2000, 2001 VLC authors and VideoLAN
5  * $Id: b1a0359fc9b237cd1ce7d6ba87e155396348afdd $
6  *
7  * Authors: Samuel Hocevar <sam@zoy.org>
8  *          Damien Fouilleul <damien@videolan.org>
9  *
10  * This program is free software; you can redistribute it and/or modify it
11  * under the terms of the GNU Lesser General Public License as published by
12  * the Free Software Foundation; either version 2.1 of the License, or
13  * (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18  * GNU Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public License
21  * along with this program; if not, write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23  *****************************************************************************/
24 
25 /*****************************************************************************
26  * Preamble
27  *****************************************************************************/
28 
29 #ifdef HAVE_CONFIG_H
30 # include "config.h"
31 #endif
32 
33 #include <vlc_common.h>
34 #include <vlc_plugin.h>
35 #include <vlc_filter.h>
36 #include <vlc_picture.h>
37 #include <vlc_cpu.h>
38 
39 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
40 #   undef bool
41 #   include <altivec.h>
42 #   define bool _Bool
43 #endif
44 
45 #include "i420_yuy2.h"
46 
47 #define SRC_FOURCC  "I420,IYUV,YV12"
48 
49 #if defined (MODULE_NAME_IS_i420_yuy2)
50 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,Y211"
51 #    define VLC_TARGET
52 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
53 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV"
54 #    define VLC_TARGET VLC_MMX
55 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
56 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV"
57 #    define VLC_TARGET VLC_SSE
58 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
59 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
60 #    define VLC_TARGET
61 #endif
62 
63 /*****************************************************************************
64  * Local and extern prototypes.
65  *****************************************************************************/
66 static int  Activate ( vlc_object_t * );
67 
68 static void I420_YUY2           ( filter_t *, picture_t *, picture_t * );
69 static void I420_YVYU           ( filter_t *, picture_t *, picture_t * );
70 static void I420_UYVY           ( filter_t *, picture_t *, picture_t * );
71 static picture_t *I420_YUY2_Filter    ( filter_t *, picture_t * );
72 static picture_t *I420_YVYU_Filter    ( filter_t *, picture_t * );
73 static picture_t *I420_UYVY_Filter    ( filter_t *, picture_t * );
74 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
75 static void I420_IUYV           ( filter_t *, picture_t *, picture_t * );
76 static picture_t *I420_IUYV_Filter    ( filter_t *, picture_t * );
77 #endif
78 #if defined (MODULE_NAME_IS_i420_yuy2)
79 static void I420_Y211           ( filter_t *, picture_t *, picture_t * );
80 static picture_t *I420_Y211_Filter    ( filter_t *, picture_t * );
81 #endif
82 
83 /*****************************************************************************
84  * Module descriptor.
85  *****************************************************************************/
86 vlc_module_begin ()
87 #if defined (MODULE_NAME_IS_i420_yuy2)
88     set_description( N_("Conversions from " SRC_FOURCC " to " DEST_FOURCC) )
89     set_capability( "video converter", 80 )
90 # define vlc_CPU_capable() (true)
91 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
92     set_description( N_("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) )
93     set_capability( "video converter", 160 )
94 # define vlc_CPU_capable() vlc_CPU_MMX()
95 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
96     set_description( N_("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) )
97     set_capability( "video converter", 250 )
98 # define vlc_CPU_capable() vlc_CPU_SSE2()
99 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
100     set_description(
101             _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
102     set_capability( "video converter", 250 )
103 # define vlc_CPU_capable() vlc_CPU_ALTIVEC()
104 #endif
set_callbacks(Activate,NULL)105     set_callbacks( Activate, NULL )
106 vlc_module_end ()
107 
/*****************************************************************************
 * Activate: select the conversion routine
 *****************************************************************************
 * This function checks that the requested conversion is supported and
 * installs the matching routine as the filter's video callback.
 *****************************************************************************/
113 static int Activate( vlc_object_t *p_this )
114 {
115     filter_t *p_filter = (filter_t *)p_this;
116 
117     if( !vlc_CPU_capable() )
118         return VLC_EGENERIC;
119     if( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) & 1
120      || (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) & 1 )
121     {
122         return -1;
123     }
124 
125     if( p_filter->fmt_in.video.i_width != p_filter->fmt_out.video.i_width
126        || p_filter->fmt_in.video.i_height != p_filter->fmt_out.video.i_height
127        || p_filter->fmt_in.video.orientation != p_filter->fmt_out.video.orientation )
128         return -1;
129 
130     switch( p_filter->fmt_in.video.i_chroma )
131     {
132 //        case VLC_CODEC_YV12: FIXME invert U and V in the filters :)
133         case VLC_CODEC_I420:
134             switch( p_filter->fmt_out.video.i_chroma )
135             {
136                 case VLC_CODEC_YUYV:
137                     p_filter->pf_video_filter = I420_YUY2_Filter;
138                     break;
139 
140                 case VLC_CODEC_YVYU:
141                     p_filter->pf_video_filter = I420_YVYU_Filter;
142                     break;
143 
144                 case VLC_CODEC_UYVY:
145                     p_filter->pf_video_filter = I420_UYVY_Filter;
146                     break;
147 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
148                 case VLC_FOURCC('I','U','Y','V'):
149                     p_filter->pf_video_filter = I420_IUYV_Filter;
150                     break;
151 #endif
152 
153 #if defined (MODULE_NAME_IS_i420_yuy2)
154                 case VLC_CODEC_Y211:
155                     p_filter->pf_video_filter = I420_Y211_Filter;
156                     break;
157 #endif
158 
159                 default:
160                     return -1;
161             }
162             break;
163 
164         default:
165             return -1;
166     }
167 
168     return 0;
169 }
170 
#if 0
/*
 * Disabled helper: executes the x86 "rdtsc" instruction and returns the value
 * it produced. The whole function is compiled out by the enclosing #if 0.
 *
 * NOTE(review): the "=A" output constraint is documented by GCC as the
 * edx:eax register pair for 32-bit x86 only; on x86-64 it does not bind a
 * 64-bit variable to that pair. Confirm the target before re-enabling.
 */
static inline unsigned long long read_cycles(void)
{
    unsigned long long v;
    __asm__ __volatile__("rdtsc" : "=A" (v): );

    return v;
}
#endif
180 
181 /* Following functions are local */
182 
/*
 * One wrapper instantiation per converter available in this flavour.
 * Presumably each expands to the I420_xxx_Filter function declared in the
 * prototype section and forwards to I420_xxx; the macro is defined outside
 * this file, so verify against its definition.
 */
VIDEO_FILTER_WRAPPER( I420_YUY2 )
VIDEO_FILTER_WRAPPER( I420_YVYU )
VIDEO_FILTER_WRAPPER( I420_UYVY )
/* No interleaved UYVY converter in the AltiVec flavour */
#if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
VIDEO_FILTER_WRAPPER( I420_IUYV )
#endif
/* The Y211 converter exists only in the plain C flavour */
#if defined (MODULE_NAME_IS_i420_yuy2)
VIDEO_FILTER_WRAPPER( I420_Y211 )
#endif
192 
193 /*****************************************************************************
194  * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
195  *****************************************************************************/
/*
 * Converts the planar picture p_source into the packed picture p_dest.
 *
 * p_filter supplies the geometry (fmt_in.video offsets and visible size) that
 * sizes every loop. p_source provides the Y, U and V planes. p_dest receives
 * the packed output in its first plane. Nothing is returned.
 *
 * Each outer iteration produces two output lines (p_line1, p_line2) from two
 * luma lines (p_y1, p_y2). The inner kernel is chosen at compile time by the
 * MODULE_NAME_IS_* macro: AltiVec vectors, MMX, SSE2 or plain C. The
 * C_/MMX_/SSE2_ kernel macros are defined outside this file.
 */
VLC_TARGET
static void I420_YUY2( filter_t *p_filter, picture_t *p_source,
                                           picture_t *p_dest )
{
    /* p_line1/p_y1 are set from p_line2/p_y2 at the top of each iteration */
    uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
    uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

    int i_x, i_y;

#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Step the output and luma pointer pairs down to the next two lines */
#define VEC_NEXT_LINES( ) \
    p_line1  = p_line2; \
    p_line2 += p_dest->p->i_pitch; \
    p_y1     = p_y2; \
    p_y2    += p_source->p[Y_PLANE].i_pitch;

/* Load 16 bytes of U and 16 bytes of V, advancing both chroma pointers */
#define VEC_LOAD_UV( ) \
    u_vec = vec_ld( 0, p_u ); p_u += 16; \
    v_vec = vec_ld( 0, p_v ); p_v += 16;

/* Interleave U with V using merge function a, then interleave 16 luma bytes
 * of each line with that result (luma first) and store 32 bytes per line.
 * The same chroma vector is used for both lines. */
#define VEC_MERGE( a ) \
    uv_vec = a( u_vec, v_vec ); \
    y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
    vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
    vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
    y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
    vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
    vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;

    vector unsigned char u_vec;
    vector unsigned char v_vec;
    vector unsigned char uv_vec;
    vector unsigned char y_vec;

    /* Vector path: taken when x_offset + visible_width is a multiple of 32
     * and y_offset + visible_height is even.
     * NOTE(review): no per-line margin is applied in this path, so it appears
     * to rely on the pitches matching the processed width - confirm. */
    if( !( ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 32 ) |
           ( (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) % 2 ) ) )
    {
        /* Width is a multiple of 32, we take 2 lines at a time */
        for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
        {
            VEC_NEXT_LINES( );
            for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
            {
                /* 16 chroma pairs cover 32 pixels: two merges of 16 pixels */
                VEC_LOAD_UV( );
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );
            }
        }
    }
#warning FIXME: converting widths % 16 but !widths % 32 is broken on altivec
/* The multiple-of-16 variant below is compiled out because of the FIXME */
#if 0
    else if( !( ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 16 ) |
                ( (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) % 4 ) ) )
    {
        /* Width is only a multiple of 16, we take 4 lines at a time */
        for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 4 ; i_y-- ; )
        {
            /* Line 1 and 2, pixels 0 to ( width - 16 ) */
            VEC_NEXT_LINES( );
            for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
            {
                VEC_LOAD_UV( );
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );
            }

            /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
            VEC_LOAD_UV( );
            VEC_MERGE( vec_mergeh );

            /* Line 3 and 4, pixels 0 to 16 */
            VEC_NEXT_LINES( );
            VEC_MERGE( vec_mergel );

            /* Line 3 and 4, pixels 16 to ( width ) */
            for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
            {
                VEC_LOAD_UV( );
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );
            }
        }
    }
#endif
    else
    {
        /* No vector path applies: fall back to the C version below */
#undef VEC_NEXT_LINES
#undef VEC_LOAD_UV
#undef VEC_MERGE
#endif

    /* Per-line leftovers: pitch minus visible pitch minus the x offset, for
     * the luma plane, the chroma planes (half the x offset) and the output
     * (twice the x offset). Presumably the kernels advance each pointer by
     * the processed width, so adding the margin lands on the next line. */
    const int i_source_margin = p_source->p[0].i_pitch
                                 - p_source->p[0].i_visible_pitch
                                 - p_filter->fmt_in.video.i_x_offset;
    const int i_source_margin_c = p_source->p[1].i_pitch
                                 - p_source->p[1].i_visible_pitch
                                 - ( p_filter->fmt_in.video.i_x_offset / 2 );
    const int i_dest_margin = p_dest->p->i_pitch
                               - p_dest->p->i_visible_pitch
                               - ( p_filter->fmt_out.video.i_x_offset * 2 );

#if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /* Plain C, MMX and AltiVec-fallback path: two lines per iteration */
    for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
    {
        p_line1 = p_line2;
        p_line2 += p_dest->p->i_pitch;

        p_y1 = p_y2;
        p_y2 += p_source->p[Y_PLANE].i_pitch;

#if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
        /* 8 pixels per iteration: four C kernel invocations */
        for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8; i_x-- ; )
        {
            C_YUV420_YUYV( );
            C_YUV420_YUYV( );
            C_YUV420_YUYV( );
            C_YUV420_YUYV( );
        }
#else
        /* 8 pixels per iteration: one MMX kernel invocation */
        for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8 ; i_x-- ; )
        {
            MMX_CALL( MMX_YUV420_YUYV );
        }
#endif
        /* Remaining pixels (fewer than 8), two at a time with the C kernel */
        for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 8 ) / 2; i_x-- ; )
        {
            C_YUV420_YUYV( );
        }

        /* Only the *2 pointers need the margin: p_y1 and p_line1 are
         * reloaded from them at the top of the loop */
        p_y2 += i_source_margin;
        p_u += i_source_margin_c;
        p_v += i_source_margin_c;
        p_line2 += i_dest_margin;
    }

#if defined (MODULE_NAME_IS_i420_yuy2_mmx)
    /* re-enable FPU registers */
    MMX_END;
#endif

#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* Closes the AltiVec "else" fallback block opened above */
    }
#endif

#else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /*
    ** SSE2 128 bits fetch/store instructions are faster
    ** if memory access is 16 bytes aligned
    */

    /* Aligned path requires the low 4 bits of both pitches and of both start
     * addresses to be zero */
    if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
        ((intptr_t)p_line2|(intptr_t)p_y2))) )
    {
        /* use faster SSE2 aligned fetch and store */
        for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
        {
            p_line1 = p_line2;
            p_line2 += p_dest->p->i_pitch;

            p_y1 = p_y2;
            p_y2 += p_source->p[Y_PLANE].i_pitch;

            /* 16 pixels per SSE2 kernel invocation */
            for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16 ; i_x-- ; )
            {
                SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
            }
            /* Remaining pixels (fewer than 16), two at a time in C */
            for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 16 ) / 2; i_x-- ; )
            {
                C_YUV420_YUYV( );
            }

            p_y2 += i_source_margin;
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
            p_line2 += i_dest_margin;
        }
    }
    else
    {
        /* use slower SSE2 unaligned fetch and store */
        for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
        {
            p_line1 = p_line2;
            p_line2 += p_dest->p->i_pitch;

            p_y1 = p_y2;
            p_y2 += p_source->p[Y_PLANE].i_pitch;

            for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16 ; i_x-- ; )
            {
                SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
            }
            for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 16 ) / 2; i_x-- ; )
            {
                C_YUV420_YUYV( );
            }

            p_y2 += i_source_margin;
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
            p_line2 += i_dest_margin;
        }
    }
    /* make sure all SSE2 stores are visible thereafter */
    SSE2_END;

#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
}
407 
408 /*****************************************************************************
409  * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
410  *****************************************************************************/
/*
 * Converts the planar picture p_source into the packed picture p_dest, with
 * V placed before U in the output (the AltiVec kernel merges v_vec first).
 *
 * p_filter supplies the geometry (fmt_in.video offsets and visible size) that
 * sizes every loop. p_source provides the Y, U and V planes. p_dest receives
 * the packed output in its first plane. Nothing is returned.
 *
 * Each outer iteration produces two output lines (p_line1, p_line2) from two
 * luma lines (p_y1, p_y2). The inner kernel is chosen at compile time by the
 * MODULE_NAME_IS_* macro: AltiVec vectors, MMX, SSE2 or plain C. The
 * C_/MMX_/SSE2_ kernel macros are defined outside this file.
 */
VLC_TARGET
static void I420_YVYU( filter_t *p_filter, picture_t *p_source,
                                           picture_t *p_dest )
{
    /* p_line1/p_y1 are set from p_line2/p_y2 at the top of each iteration */
    uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
    uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

    int i_x, i_y;

#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Step the output and luma pointer pairs down to the next two lines */
#define VEC_NEXT_LINES( ) \
    p_line1  = p_line2; \
    p_line2 += p_dest->p->i_pitch; \
    p_y1     = p_y2; \
    p_y2    += p_source->p[Y_PLANE].i_pitch;

/* Load 16 bytes of U and 16 bytes of V, advancing both chroma pointers */
#define VEC_LOAD_UV( ) \
    u_vec = vec_ld( 0, p_u ); p_u += 16; \
    v_vec = vec_ld( 0, p_v ); p_v += 16;

/* Interleave V with U (V first) using merge function a, then interleave 16
 * luma bytes of each line with that result (luma first) and store 32 bytes
 * per line. The same chroma vector is used for both lines. */
#define VEC_MERGE( a ) \
    vu_vec = a( v_vec, u_vec ); \
    y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
    vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
    vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
    y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
    vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
    vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;

    vector unsigned char u_vec;
    vector unsigned char v_vec;
    vector unsigned char vu_vec;
    vector unsigned char y_vec;

    /* Vector path: taken when x_offset + visible_width is a multiple of 32
     * and y_offset + visible_height is even.
     * NOTE(review): no per-line margin is applied in the vector paths, so
     * they appear to rely on the pitches matching the processed width -
     * confirm. */
    if( !( ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 32 ) |
           ( (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) % 2 ) ) )
    {
        /* Width is a multiple of 32, we take 2 lines at a time */
        for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
        {
            VEC_NEXT_LINES( );
            for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
            {
                /* 16 chroma pairs cover 32 pixels: two merges of 16 pixels */
                VEC_LOAD_UV( );
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );
            }
        }
    }
    /* NOTE(review): I420_YUY2 compiles out its equivalent of this branch with
     * a FIXME stating that widths % 16 but not % 32 are broken on AltiVec;
     * confirm whether this branch is affected too. */
    else if( !( ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 16 ) |
                ( (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) % 4 ) ) )
    {
        /* Width is only a multiple of 16, we take 4 lines at a time */
        for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 4 ; i_y-- ; )
        {
            /* Line 1 and 2, pixels 0 to ( width - 16 ) */
            VEC_NEXT_LINES( );
            for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
            {
                VEC_LOAD_UV( );
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );
            }

            /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
            VEC_LOAD_UV( );
            VEC_MERGE( vec_mergeh );

            /* Line 3 and 4, pixels 0 to 16 */
            VEC_NEXT_LINES( );
            VEC_MERGE( vec_mergel );

            /* Line 3 and 4, pixels 16 to ( width ) */
            for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
            {
                VEC_LOAD_UV( );
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );
            }
        }
    }
    else
    {
        /* No vector path applies: fall back to the C version below */
#undef VEC_NEXT_LINES
#undef VEC_LOAD_UV
#undef VEC_MERGE
#endif

    /* Per-line leftovers: pitch minus visible pitch minus the x offset, for
     * the luma plane, the chroma planes (half the x offset) and the output
     * (twice the x offset). Presumably the kernels advance each pointer by
     * the processed width, so adding the margin lands on the next line. */
    const int i_source_margin = p_source->p[0].i_pitch
                                 - p_source->p[0].i_visible_pitch
                                 - p_filter->fmt_in.video.i_x_offset;
    const int i_source_margin_c = p_source->p[1].i_pitch
                                 - p_source->p[1].i_visible_pitch
                                 - ( p_filter->fmt_in.video.i_x_offset / 2 );
    const int i_dest_margin = p_dest->p->i_pitch
                               - p_dest->p->i_visible_pitch
                               - ( p_filter->fmt_out.video.i_x_offset * 2 );

#if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /* Plain C, MMX and AltiVec-fallback path: two lines per iteration */
    for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
    {
        p_line1 = p_line2;
        p_line2 += p_dest->p->i_pitch;

        p_y1 = p_y2;
        p_y2 += p_source->p[Y_PLANE].i_pitch;

        /* 8 pixels per iteration: four C kernels or one MMX kernel */
        for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8 ; i_x-- ; )
        {
#if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
            C_YUV420_YVYU( );
            C_YUV420_YVYU( );
            C_YUV420_YVYU( );
            C_YUV420_YVYU( );
#else
            MMX_CALL( MMX_YUV420_YVYU );
#endif
        }
        /* Remaining pixels (fewer than 8), two at a time with the C kernel */
        for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 8 ) / 2; i_x-- ; )
        {
            C_YUV420_YVYU( );
        }

        /* p_y1 and p_line1 are overwritten at the top of the next iteration,
         * so only the p_y2, p_u, p_v and p_line2 updates carry over */
        p_y1 += i_source_margin;
        p_y2 += i_source_margin;
        p_u += i_source_margin_c;
        p_v += i_source_margin_c;
        p_line1 += i_dest_margin;
        p_line2 += i_dest_margin;
    }

#if defined (MODULE_NAME_IS_i420_yuy2_mmx)
    /* re-enable FPU registers */
    MMX_END;
#endif

#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* Closes the AltiVec "else" fallback block opened above */
    }
#endif

#else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /*
    ** SSE2 128 bits fetch/store instructions are faster
    ** if memory access is 16 bytes aligned
    */
    /* Aligned path requires the low 4 bits of both pitches and of both start
     * addresses to be zero */
    if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
        ((intptr_t)p_line2|(intptr_t)p_y2))) )
    {
        /* use faster SSE2 aligned fetch and store */
        for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
        {
            p_line1 = p_line2;
            p_line2 += p_dest->p->i_pitch;

            p_y1 = p_y2;
            p_y2 += p_source->p[Y_PLANE].i_pitch;

            /* 16 pixels per SSE2 kernel invocation */
            for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16 ; i_x-- ; )
            {
                SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
            }
            /* Remaining pixels (fewer than 16), two at a time in C */
            for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 16 ) / 2; i_x-- ; )
            {
                C_YUV420_YVYU( );
            }

            p_y1 += i_source_margin;
            p_y2 += i_source_margin;
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
            p_line1 += i_dest_margin;
            p_line2 += i_dest_margin;
        }
    }
    else
    {
        /* use slower SSE2 unaligned fetch and store */
        for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
        {
            p_line1 = p_line2;
            p_line2 += p_dest->p->i_pitch;

            p_y1 = p_y2;
            p_y2 += p_source->p[Y_PLANE].i_pitch;

            for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16 ; i_x-- ; )
            {
                SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
            }
            for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 16 ) / 2; i_x-- ; )
            {
                C_YUV420_YVYU( );
            }

            p_y1 += i_source_margin;
            p_y2 += i_source_margin;
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
            p_line1 += i_dest_margin;
            p_line2 += i_dest_margin;
        }
    }
    /* make sure all SSE2 stores are visible thereafter */
    SSE2_END;
#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
}
620 
621 /*****************************************************************************
622  * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
623  *****************************************************************************/
/*
 * Converts the planar picture p_source into the packed picture p_dest, with
 * each chroma byte placed before its luma byte (the AltiVec kernel merges
 * uv_vec ahead of y_vec).
 *
 * p_filter supplies the geometry (fmt_in.video offsets and visible size) that
 * sizes every loop. p_source provides the Y, U and V planes. p_dest receives
 * the packed output in its first plane. Nothing is returned.
 *
 * Each outer iteration produces two output lines (p_line1, p_line2) from two
 * luma lines (p_y1, p_y2). The inner kernel is chosen at compile time by the
 * MODULE_NAME_IS_* macro: AltiVec vectors, MMX, SSE2 or plain C. The
 * C_/MMX_/SSE2_ kernel macros are defined outside this file.
 */
VLC_TARGET
static void I420_UYVY( filter_t *p_filter, picture_t *p_source,
                                           picture_t *p_dest )
{
    /* p_line1/p_y1 are set from p_line2/p_y2 at the top of each iteration */
    uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
    uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

    int i_x, i_y;

#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Step the output and luma pointer pairs down to the next two lines */
#define VEC_NEXT_LINES( ) \
    p_line1  = p_line2; \
    p_line2 += p_dest->p->i_pitch; \
    p_y1     = p_y2; \
    p_y2    += p_source->p[Y_PLANE].i_pitch;

/* Load 16 bytes of U and 16 bytes of V, advancing both chroma pointers */
#define VEC_LOAD_UV( ) \
    u_vec = vec_ld( 0, p_u ); p_u += 16; \
    v_vec = vec_ld( 0, p_v ); p_v += 16;

/* Interleave U with V using merge function a, then interleave that result
 * with 16 luma bytes of each line (chroma first) and store 32 bytes per
 * line. The same chroma vector is used for both lines. */
#define VEC_MERGE( a ) \
    uv_vec = a( u_vec, v_vec ); \
    y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
    vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
    vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
    y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
    vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
    vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;

    vector unsigned char u_vec;
    vector unsigned char v_vec;
    vector unsigned char uv_vec;
    vector unsigned char y_vec;

    /* Vector path: taken when x_offset + visible_width is a multiple of 32
     * and y_offset + visible_height is even.
     * NOTE(review): no per-line margin is applied in the vector paths, so
     * they appear to rely on the pitches matching the processed width -
     * confirm. */
    if( !( ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 32 ) |
           ( (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) % 2 ) ) )
    {
        /* Width is a multiple of 32, we take 2 lines at a time */
        for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
        {
            VEC_NEXT_LINES( );
            for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
            {
                /* 16 chroma pairs cover 32 pixels: two merges of 16 pixels */
                VEC_LOAD_UV( );
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );
            }
        }
    }
    /* NOTE(review): I420_YUY2 compiles out its equivalent of this branch with
     * a FIXME stating that widths % 16 but not % 32 are broken on AltiVec;
     * confirm whether this branch is affected too. */
    else if( !( ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 16 ) |
                ( (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) % 4 ) ) )
    {
        /* Width is only a multiple of 16, we take 4 lines at a time */
        for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 4 ; i_y-- ; )
        {
            /* Line 1 and 2, pixels 0 to ( width - 16 ) */
            VEC_NEXT_LINES( );
            for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
            {
                VEC_LOAD_UV( );
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );
            }

            /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
            VEC_LOAD_UV( );
            VEC_MERGE( vec_mergeh );

            /* Line 3 and 4, pixels 0 to 16 */
            VEC_NEXT_LINES( );
            VEC_MERGE( vec_mergel );

            /* Line 3 and 4, pixels 16 to ( width ) */
            for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
            {
                VEC_LOAD_UV( );
                VEC_MERGE( vec_mergeh );
                VEC_MERGE( vec_mergel );
            }
        }
    }
    else
    {
        /* No vector path applies: fall back to the C version below */
#undef VEC_NEXT_LINES
#undef VEC_LOAD_UV
#undef VEC_MERGE
#endif

    /* Per-line leftovers: pitch minus visible pitch minus the x offset, for
     * the luma plane, the chroma planes (half the x offset) and the output
     * (twice the x offset). Presumably the kernels advance each pointer by
     * the processed width, so adding the margin lands on the next line. */
    const int i_source_margin = p_source->p[0].i_pitch
                                 - p_source->p[0].i_visible_pitch
                                 - p_filter->fmt_in.video.i_x_offset;
    const int i_source_margin_c = p_source->p[1].i_pitch
                                 - p_source->p[1].i_visible_pitch
                                 - ( p_filter->fmt_in.video.i_x_offset / 2 );
    const int i_dest_margin = p_dest->p->i_pitch
                               - p_dest->p->i_visible_pitch
                               - ( p_filter->fmt_out.video.i_x_offset * 2 );

#if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /* Plain C, MMX and AltiVec-fallback path: two lines per iteration */
    for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
    {
        p_line1 = p_line2;
        p_line2 += p_dest->p->i_pitch;

        p_y1 = p_y2;
        p_y2 += p_source->p[Y_PLANE].i_pitch;

        /* 8 pixels per iteration: four C kernels or one MMX kernel */
        for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8 ; i_x-- ; )
        {
#if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
            C_YUV420_UYVY( );
            C_YUV420_UYVY( );
            C_YUV420_UYVY( );
            C_YUV420_UYVY( );
#else
            MMX_CALL( MMX_YUV420_UYVY );
#endif
        }
        /* Remaining pixels (fewer than 8), two at a time with the C kernel */
        for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 8 ) / 2; i_x--; )
        {
            C_YUV420_UYVY( );
        }

        /* p_y1 and p_line1 are overwritten at the top of the next iteration,
         * so only the p_y2, p_u, p_v and p_line2 updates carry over */
        p_y1 += i_source_margin;
        p_y2 += i_source_margin;
        p_u += i_source_margin_c;
        p_v += i_source_margin_c;
        p_line1 += i_dest_margin;
        p_line2 += i_dest_margin;
    }

#if defined (MODULE_NAME_IS_i420_yuy2_mmx)
    /* re-enable FPU registers */
    MMX_END;
#endif

#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* Closes the AltiVec "else" fallback block opened above */
    }
#endif

#else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /*
    ** SSE2 128 bits fetch/store instructions are faster
    ** if memory access is 16 bytes aligned
    */
    /* Aligned path requires the low 4 bits of both pitches and of both start
     * addresses to be zero */
    if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
        ((intptr_t)p_line2|(intptr_t)p_y2))) )
    {
        /* use faster SSE2 aligned fetch and store */
        for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
        {
            p_line1 = p_line2;
            p_line2 += p_dest->p->i_pitch;

            p_y1 = p_y2;
            p_y2 += p_source->p[Y_PLANE].i_pitch;

            /* 16 pixels per SSE2 kernel invocation */
            for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16 ; i_x-- ; )
            {
                SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
            }
            /* Remaining pixels (fewer than 16), two at a time in C */
            for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 16 ) / 2; i_x-- ; )
            {
                C_YUV420_UYVY( );
            }

            p_y1 += i_source_margin;
            p_y2 += i_source_margin;
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
            p_line1 += i_dest_margin;
            p_line2 += i_dest_margin;
        }
    }
    else
    {
        /* use slower SSE2 unaligned fetch and store */
        for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
        {
            p_line1 = p_line2;
            p_line2 += p_dest->p->i_pitch;

            p_y1 = p_y2;
            p_y2 += p_source->p[Y_PLANE].i_pitch;

            for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16 ; i_x-- ; )
            {
                SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
            }
            for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 16 ) / 2; i_x-- ; )
            {
                C_YUV420_UYVY( );
            }

            p_y1 += i_source_margin;
            p_y2 += i_source_margin;
            p_u += i_source_margin_c;
            p_v += i_source_margin_c;
            p_line1 += i_dest_margin;
            p_line2 += i_dest_margin;
        }
    }
    /* make sure all SSE2 stores are visible thereafter */
    SSE2_END;
#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
}
833 
834 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
835 /*****************************************************************************
836  * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
837  *****************************************************************************/
static void I420_IUYV( filter_t *p_filter, picture_t *p_source,
                                           picture_t *p_dest )
{
    /* Placeholder: this conversion has no implementation. Neither picture
     * is read or written; the only effect is the error logged below. */
    msg_Err( p_filter, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );

    /* FIXME: TODO ! -- until then, mark the picture arguments as unused */
    VLC_UNUSED(p_dest);
    VLC_UNUSED(p_source);
}
845 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
846 
847 /*****************************************************************************
848  * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
849  *****************************************************************************/
850 #if defined (MODULE_NAME_IS_i420_yuy2)
static void I420_Y211( filter_t *p_filter, picture_t *p_source,
                                           picture_t *p_dest )
{
    /* Read/write cursors. C_YUV420_Y211() takes no arguments, so it
     * presumably refers to these locals by name -- do not rename them
     * without checking the macro definition. */
    uint8_t *p_line2 = p_dest->p->p_pixels;
    uint8_t *p_line1;
    uint8_t *p_y2 = p_source->Y_PIXELS;
    uint8_t *p_y1;
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

    int i_x;
    int i_y;

    /* Per-row adjustments applied to the cursors after each pass over a
     * pair of rows: pitch minus visible pitch minus the horizontal offset
     * (halved for the chroma planes, doubled for the packed output). */
    const int i_source_margin = p_source->p[0].i_pitch
                                 - p_source->p[0].i_visible_pitch
                                 - p_filter->fmt_in.video.i_x_offset;
    const int i_source_margin_c = p_source->p[1].i_pitch
                                 - p_source->p[1].i_visible_pitch
                                 - ( p_filter->fmt_in.video.i_x_offset / 2 );
    const int i_dest_margin = p_dest->p->i_pitch
                               - p_dest->p->i_visible_pitch
                               - ( p_filter->fmt_out.video.i_x_offset * 2 );

    /* Rows are handled two at a time; each inner iteration accounts for
     * 8 units of (x offset + visible width) and expands the macro twice. */
    const int i_row_pairs = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2;
    const int i_blocks = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8;

    i_y = i_row_pairs;
    while( i_y-- )
    {
        /* Upper row starts where the previous lower row ended up; the
         * lower row sits one full pitch further. */
        p_y1 = p_y2;
        p_y2 += p_source->p[Y_PLANE].i_pitch;

        p_line1 = p_line2;
        p_line2 += p_dest->p->i_pitch;

        /* NOTE(review): no loop handles a remainder when the width is not
         * a multiple of 8 -- confirm whether that is intended. */
        i_x = i_blocks;
        while( i_x-- )
        {
            C_YUV420_Y211( );
            C_YUV420_Y211( );
        }

        /* Step every cursor over its row padding. */
        p_line1 += i_dest_margin;
        p_line2 += i_dest_margin;
        p_y1 += i_source_margin;
        p_y2 += i_source_margin;
        p_u += i_source_margin_c;
        p_v += i_source_margin_c;
    }
}
893 #endif
894