]> git.sesse.net Git - vlc/blob - modules/video_chroma/i420_yuy2.c
qt4_vlm: fix #3938 (options must be removed from the input)
[vlc] / modules / video_chroma / i420_yuy2.c
1 /*****************************************************************************
2  * i420_yuy2.c : YUV to YUV conversion module for vlc
3  *****************************************************************************
4  * Copyright (C) 2000, 2001 the VideoLAN team
5  * $Id$
6  *
7  * Authors: Samuel Hocevar <sam@zoy.org>
8  *          Damien Fouilleul <damien@videolan.org>
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation; either version 2 of the License, or
13  * (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with this program; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23  *****************************************************************************/
24
25 /*****************************************************************************
26  * Preamble
27  *****************************************************************************/
28
29 #ifdef HAVE_CONFIG_H
30 # include "config.h"
31 #endif
32
33 #include <vlc_common.h>
34 #include <vlc_plugin.h>
35 #include <vlc_filter.h>
36
37 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
38 #   include <altivec.h>
39 #endif
40
41 #include "i420_yuy2.h"
42
43 #define SRC_FOURCC  "I420,IYUV,YV12"
44
45 #if defined (MODULE_NAME_IS_i420_yuy2)
46 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
47 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
48 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
49 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
50 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
51 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
52 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
53 #endif
54
55 /*****************************************************************************
56  * Local and extern prototypes.
57  *****************************************************************************/
/* Module entry point; registered with set_callbacks() in the module
 * descriptor below. */
58 static int  Activate ( vlc_object_t * );
59
/* Converters take ( filter, source picture, destination picture ).
 * Each converter NAME has a matching NAME_Filter entry point taking
 * ( filter, picture ) and returning a picture; Activate() installs one of
 * those as pf_video_filter. */
60 static void I420_YUY2           ( filter_t *, picture_t *, picture_t * );
61 static void I420_YVYU           ( filter_t *, picture_t *, picture_t * );
62 static void I420_UYVY           ( filter_t *, picture_t *, picture_t * );
63 static picture_t *I420_YUY2_Filter    ( filter_t *, picture_t * );
64 static picture_t *I420_YVYU_Filter    ( filter_t *, picture_t * );
65 static picture_t *I420_UYVY_Filter    ( filter_t *, picture_t * );
/* IUYV and cyuv output is declared for every variant except AltiVec. */
66 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
67 static void I420_IUYV           ( filter_t *, picture_t *, picture_t * );
68 static void I420_cyuv           ( filter_t *, picture_t *, picture_t * );
69 static picture_t *I420_IUYV_Filter    ( filter_t *, picture_t * );
70 static picture_t *I420_cyuv_Filter    ( filter_t *, picture_t * );
71 #endif
/* Y211 output is declared only for the plain C variant. */
72 #if defined (MODULE_NAME_IS_i420_yuy2)
73 static void I420_Y211           ( filter_t *, picture_t *, picture_t * );
74 static picture_t *I420_Y211_Filter    ( filter_t *, picture_t * );
75 #endif
76
77 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
78 /* Initialize MMX-specific constants */
/* i_00ffw holds 0x00ff in each of its four 16-bit words; i_80w holds 0x80 in
 * each of its four low bytes and zero in the upper four.
 * NOTE(review): neither constant is referenced by name in the visible part of
 * this file; presumably the MMX macros pulled in from i420_yuy2.h use them -
 * confirm before renaming or removing. */
79 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
80 static const uint64_t i_80w   = 0x0000000080808080ULL;
81 #endif
82
83 /*****************************************************************************
84  * Module descriptor.
85  *****************************************************************************/
86 vlc_module_begin ()
87 #if defined (MODULE_NAME_IS_i420_yuy2)
88     set_description( N_("Conversions from " SRC_FOURCC " to " DEST_FOURCC) )
89     set_capability( "video filter2", 80 )
90 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
91     set_description( N_("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) )
92     set_capability( "video filter2", 160 )
93 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
94     set_description( N_("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) )
95     set_capability( "video filter2", 250 )
96 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
97     set_description(
98             _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
99     set_capability( "video filter2", 250 )
100 #endif
101     set_callbacks( Activate, NULL )
102 vlc_module_end ()
103
104 /*****************************************************************************
105  * Activate: allocate a chroma function
106  *****************************************************************************
107  * This function allocates and initializes a chroma function
108  *****************************************************************************/
109 static int Activate( vlc_object_t *p_this )
110 {
111     filter_t *p_filter = (filter_t *)p_this;
112
113     if( p_filter->fmt_in.video.i_width & 1
114      || p_filter->fmt_in.video.i_height & 1 )
115     {
116         return -1;
117     }
118
119     if( p_filter->fmt_in.video.i_width != p_filter->fmt_out.video.i_width
120      || p_filter->fmt_in.video.i_height != p_filter->fmt_out.video.i_height )
121         return -1;
122
123     switch( p_filter->fmt_in.video.i_chroma )
124     {
125         case VLC_CODEC_YV12:
126         case VLC_CODEC_I420:
127             switch( p_filter->fmt_out.video.i_chroma )
128             {
129                 case VLC_CODEC_YUYV:
130                     p_filter->pf_video_filter = I420_YUY2_Filter;
131                     break;
132
133                 case VLC_CODEC_YVYU:
134                     p_filter->pf_video_filter = I420_YVYU_Filter;
135                     break;
136
137                 case VLC_CODEC_UYVY:
138                     p_filter->pf_video_filter = I420_UYVY_Filter;
139                     break;
140 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
141                 case VLC_FOURCC('I','U','Y','V'):
142                     p_filter->pf_video_filter = I420_IUYV_Filter;
143                     break;
144
145                 case VLC_CODEC_CYUV:
146                     p_filter->pf_video_filter = I420_cyuv_Filter;
147                     break;
148 #endif
149
150 #if defined (MODULE_NAME_IS_i420_yuy2)
151                 case VLC_CODEC_Y211:
152                     p_filter->pf_video_filter = I420_Y211_Filter;
153                     break;
154 #endif
155
156                 default:
157                     return -1;
158             }
159             break;
160
161         default:
162             return -1;
163     }
164
165     return 0;
166 }
167
168 #if 0
/* Debugging aid (x86 only): return the CPU time-stamp counter read with the
 * rdtsc instruction.
 *
 * rdtsc leaves the low 32 bits in eax and the high 32 bits in edx. The two
 * halves are read through explicit "=a" / "=d" outputs and combined by hand,
 * because the "A" constraint only names the edx:eax pair on 32-bit x86; on
 * x86-64 it would drop the high half and leave edx clobbered behind the
 * compiler's back. */
static inline unsigned long long read_cycles(void)
{
    unsigned int i_low, i_high;
    __asm__ __volatile__("rdtsc" : "=a" (i_low), "=d" (i_high));

    return ( (unsigned long long)i_high << 32 ) | i_low;
}
176 #endif
177
178 /* Following functions are local */
179
/*
 * One VIDEO_FILTER_WRAPPER instantiation per converter built in this variant;
 * the preprocessor guards mirror those on the prototypes at the top of the
 * file.
 * NOTE(review): the macro is defined outside this file; presumably it defines
 * the NAME_Filter functions that are prototyped above and installed by
 * Activate() - confirm against its definition.
 */
180 VIDEO_FILTER_WRAPPER( I420_YUY2 )
181 VIDEO_FILTER_WRAPPER( I420_YVYU )
182 VIDEO_FILTER_WRAPPER( I420_UYVY )
183 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
184 VIDEO_FILTER_WRAPPER( I420_IUYV )
185 VIDEO_FILTER_WRAPPER( I420_cyuv )
186 #endif
187 #if defined (MODULE_NAME_IS_i420_yuy2)
188 VIDEO_FILTER_WRAPPER( I420_Y211 )
189 #endif
190
191 /*****************************************************************************
192  * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
193  *****************************************************************************/
/*
 * Converts p_source (Y, U and V planes) into the single packed plane of
 * p_dest, using the width and height found in p_filter->fmt_in.video.
 *
 * Every pass of the outer loops handles a pair of lines: p_y1/p_y2 walk two
 * consecutive luma lines and p_line1/p_line2 the two matching destination
 * lines, while p_u/p_v walk the chroma planes.
 *
 * The code path is chosen by the MODULE_NAME_IS_* build macro:
 *  - i420_yuy2_altivec: vector loops when the width is a multiple of 32 or
 *    of 16 (see the tests below), otherwise the plain C loop;
 *  - i420_yuy2_sse2: aligned or unaligned SSE2 loop, picked at run time;
 *  - i420_yuy2_mmx: MMX loop;
 *  - anything else: plain C loop.
 *
 * C_YUV420_YUYV, MMX_CALL/MMX_YUV420_YUYV/MMX_END and
 * SSE2_CALL/SSE2_YUV420_YUYV_*/SSE2_END are not defined in this file
 * (presumably they come from i420_yuy2.h - confirm).
 */
194 static void I420_YUY2( filter_t *p_filter, picture_t *p_source,
195                                            picture_t *p_dest )
196 {
197     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
198     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
199     uint8_t *p_u = p_source->U_PIXELS;
200     uint8_t *p_v = p_source->V_PIXELS;
201
202     int i_x, i_y;
203
204 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Step to the next pair of lines: line1/y1 take the previous position of
 * line2/y2, which then move one pitch further. */
205 #define VEC_NEXT_LINES( ) \
206     p_line1  = p_line2; \
207     p_line2 += p_dest->p->i_pitch; \
208     p_y1     = p_y2; \
209     p_y2    += p_source->p[Y_PLANE].i_pitch;
210
/* Load one vector of U and one of V, advancing both pointers by 16 bytes. */
211 #define VEC_LOAD_UV( ) \
212     u_vec = vec_ld( 0, p_u ); p_u += 16; \
213     v_vec = vec_ld( 0, p_v ); p_v += 16;
214
/* a is the merge used to interleave U with V (vec_mergeh or vec_mergel at
 * the call sites). The result is merged after the luma bytes (Y first) for
 * each of the two lines: 16 luma bytes are consumed and 32 bytes stored per
 * line. */
215 #define VEC_MERGE( a ) \
216     uv_vec = a( u_vec, v_vec ); \
217     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
218     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
219     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
220     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
221     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
222     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
223
224     vector unsigned char u_vec;
225     vector unsigned char v_vec;
226     vector unsigned char uv_vec;
227     vector unsigned char y_vec;
228
    /* NOTE(review): unlike the C loop further down, the two vector loops
     * never add the ( i_pitch - i_visible_pitch ) margins to any pointer;
     * it looks like they assume every pitch equals its visible pitch -
     * confirm. */
229     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
230            ( p_filter->fmt_in.video.i_height % 2 ) ) )
231     {
232         /* Width is a multiple of 32, we take 2 lines at a time */
233         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
234         {
235             VEC_NEXT_LINES( );
236             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
237             {
238                 VEC_LOAD_UV( );
239                 VEC_MERGE( vec_mergeh );
240                 VEC_MERGE( vec_mergel );
241             }
242         }
243     }
244     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
245                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
246     {
247         /* Width is only a multiple of 16, we take 4 lines at a time */
        /* The chroma vector loaded for the last 16 pixels of lines 1-2 is
         * reused, through its other half, for the first 16 pixels of
         * lines 3-4. */
248         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
249         {
250             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
251             VEC_NEXT_LINES( );
252             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
253             {
254                 VEC_LOAD_UV( );
255                 VEC_MERGE( vec_mergeh );
256                 VEC_MERGE( vec_mergel );
257             }
258
259             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
260             VEC_LOAD_UV( );
261             VEC_MERGE( vec_mergeh );
262
263             /* Line 3 and 4, pixels 0 to 16 */
264             VEC_NEXT_LINES( );
265             VEC_MERGE( vec_mergel );
266
267             /* Line 3 and 4, pixels 16 to ( width ) */
268             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
269             {
270                 VEC_LOAD_UV( );
271                 VEC_MERGE( vec_mergeh );
272                 VEC_MERGE( vec_mergel );
273             }
274         }
275     }
276     else
277     {
278         /* Crap, use the C version */
279 #undef VEC_NEXT_LINES
280 #undef VEC_LOAD_UV
281 #undef VEC_MERGE
282 #endif
283
    /* Bytes between the end of the visible part of a line and the start of
     * the next line, for luma, chroma and destination respectively. */
284     const int i_source_margin = p_source->p[0].i_pitch
285                                  - p_source->p[0].i_visible_pitch;
286     const int i_source_margin_c = p_source->p[1].i_pitch
287                                  - p_source->p[1].i_visible_pitch;
288     const int i_dest_margin = p_dest->p->i_pitch
289                                - p_dest->p->i_visible_pitch;
290
291 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /* Plain C / MMX loop (also the AltiVec fallback), two lines per pass */
292     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
293     {
294         p_line1 = p_line2;
295         p_line2 += p_dest->p->i_pitch;
296
297         p_y1 = p_y2;
298         p_y2 += p_source->p[Y_PLANE].i_pitch;
299
        /* Main part of the line, 8 pixels of width per pass: four C macro
         * calls or one MMX call */
300 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
301         for( i_x = p_filter->fmt_in.video.i_width / 8; i_x-- ; )
302         {
303             C_YUV420_YUYV( );
304             C_YUV420_YUYV( );
305             C_YUV420_YUYV( );
306             C_YUV420_YUYV( );
307         }
308 #else
309         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
310         {
311             MMX_CALL( MMX_YUV420_YUYV );
312         }
313 #endif
        /* Leftover columns: ( width % 8 ) / 2 more C macro calls */
314         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
315         {
316             C_YUV420_YUYV( );
317         }
318
        /* Skip the margins at the end of the lines */
319         p_y1 += i_source_margin;
320         p_y2 += i_source_margin;
321         p_u += i_source_margin_c;
322         p_v += i_source_margin_c;
323         p_line1 += i_dest_margin;
324         p_line2 += i_dest_margin;
325     }
326
327 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
328     /* re-enable FPU registers */
329     MMX_END;
330 #endif
331
    /* Closes the AltiVec "else" branch opened before the C fallback */
332 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
333     }
334 #endif
335
336 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
337     /*
338     ** SSE2 128 bits fetch/store instructions are faster
339     ** if memory access is 16 bytes aligned
340     */
341
    /* Aligned path only when the luma pitch, the destination pitch and both
     * start pointers are multiples of 16.
     * NOTE(review): p_u, p_v and the chroma pitch are not part of this test;
     * whether the ALIGNED macro needs them aligned depends on its definition
     * - confirm. */
342     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
343         ((intptr_t)p_line2|(intptr_t)p_y2))) )
344     {
345         /* use faster SSE2 aligned fetch and store */
346         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
347         {
348             p_line1 = p_line2;
349             p_line2 += p_dest->p->i_pitch;
350
351             p_y1 = p_y2;
352             p_y2 += p_source->p[Y_PLANE].i_pitch;
353
            /* 16 pixels of width per SSE2 call, then ( width % 16 ) / 2
             * C macro calls for the leftover columns */
354             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
355             {
356                 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
357             }
358             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
359             {
360                 C_YUV420_YUYV( );
361             }
362
363             p_y1 += i_source_margin;
364             p_y2 += i_source_margin;
365             p_u += i_source_margin_c;
366             p_v += i_source_margin_c;
367             p_line1 += i_dest_margin;
368             p_line2 += i_dest_margin;
369         }
370     }
371     else
372     {
373         /* use slower SSE2 unaligned fetch and store */
374         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
375         {
376             p_line1 = p_line2;
377             p_line2 += p_dest->p->i_pitch;
378
379             p_y1 = p_y2;
380             p_y2 += p_source->p[Y_PLANE].i_pitch;
381
382             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
383             {
384                 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
385             }
386             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
387             {
388                 C_YUV420_YUYV( );
389             }
390
391             p_y1 += i_source_margin;
392             p_y2 += i_source_margin;
393             p_u += i_source_margin_c;
394             p_v += i_source_margin_c;
395             p_line1 += i_dest_margin;
396             p_line2 += i_dest_margin;
397         }
398     }
399     /* make sure all SSE2 stores are visible thereafter */
400     SSE2_END;
401
402 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
403 }
404
405 /*****************************************************************************
406  * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
407  *****************************************************************************/
/*
 * Converts p_source (Y, U and V planes) into the single packed plane of
 * p_dest, using the width and height found in p_filter->fmt_in.video.
 *
 * Every pass of the outer loops handles a pair of lines: p_y1/p_y2 walk two
 * consecutive luma lines and p_line1/p_line2 the two matching destination
 * lines, while p_u/p_v walk the chroma planes.
 *
 * The code path is chosen by the MODULE_NAME_IS_* build macro:
 *  - i420_yuy2_altivec: vector loops when the width is a multiple of 32 or
 *    of 16 (see the tests below), otherwise the plain C loop;
 *  - i420_yuy2_sse2: aligned or unaligned SSE2 loop, picked at run time;
 *  - i420_yuy2_mmx: MMX loop;
 *  - anything else: plain C loop.
 *
 * C_YUV420_YVYU, MMX_CALL/MMX_YUV420_YVYU/MMX_END and
 * SSE2_CALL/SSE2_YUV420_YVYU_*/SSE2_END are not defined in this file
 * (presumably they come from i420_yuy2.h - confirm).
 */
408 static void I420_YVYU( filter_t *p_filter, picture_t *p_source,
409                                            picture_t *p_dest )
410 {
411     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
412     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
413     uint8_t *p_u = p_source->U_PIXELS;
414     uint8_t *p_v = p_source->V_PIXELS;
415
416     int i_x, i_y;
417
418 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Step to the next pair of lines: line1/y1 take the previous position of
 * line2/y2, which then move one pitch further. */
419 #define VEC_NEXT_LINES( ) \
420     p_line1  = p_line2; \
421     p_line2 += p_dest->p->i_pitch; \
422     p_y1     = p_y2; \
423     p_y2    += p_source->p[Y_PLANE].i_pitch;
424
/* Load one vector of U and one of V, advancing both pointers by 16 bytes. */
425 #define VEC_LOAD_UV( ) \
426     u_vec = vec_ld( 0, p_u ); p_u += 16; \
427     v_vec = vec_ld( 0, p_v ); p_v += 16;
428
/* a is the merge used to interleave V with U - V comes first here - (vec_mergeh
 * or vec_mergel at the call sites). The result is merged after the luma
 * bytes (Y first) for each of the two lines: 16 luma bytes are consumed and
 * 32 bytes stored per line. */
429 #define VEC_MERGE( a ) \
430     vu_vec = a( v_vec, u_vec ); \
431     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
432     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
433     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
434     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
435     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
436     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
437
438     vector unsigned char u_vec;
439     vector unsigned char v_vec;
440     vector unsigned char vu_vec;
441     vector unsigned char y_vec;
442
    /* NOTE(review): unlike the C loop further down, the two vector loops
     * never add the ( i_pitch - i_visible_pitch ) margins to any pointer;
     * it looks like they assume every pitch equals its visible pitch -
     * confirm. */
443     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
444            ( p_filter->fmt_in.video.i_height % 2 ) ) )
445     {
446         /* Width is a multiple of 32, we take 2 lines at a time */
447         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
448         {
449             VEC_NEXT_LINES( );
450             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
451             {
452                 VEC_LOAD_UV( );
453                 VEC_MERGE( vec_mergeh );
454                 VEC_MERGE( vec_mergel );
455             }
456         }
457     }
458     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
459                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
460     {
461         /* Width is only a multiple of 16, we take 4 lines at a time */
        /* The chroma vector loaded for the last 16 pixels of lines 1-2 is
         * reused, through its other half, for the first 16 pixels of
         * lines 3-4. */
462         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
463         {
464             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
465             VEC_NEXT_LINES( );
466             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
467             {
468                 VEC_LOAD_UV( );
469                 VEC_MERGE( vec_mergeh );
470                 VEC_MERGE( vec_mergel );
471             }
472
473             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
474             VEC_LOAD_UV( );
475             VEC_MERGE( vec_mergeh );
476
477             /* Line 3 and 4, pixels 0 to 16 */
478             VEC_NEXT_LINES( );
479             VEC_MERGE( vec_mergel );
480
481             /* Line 3 and 4, pixels 16 to ( width ) */
482             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
483             {
484                 VEC_LOAD_UV( );
485                 VEC_MERGE( vec_mergeh );
486                 VEC_MERGE( vec_mergel );
487             }
488         }
489     }
490     else
491     {
492         /* Crap, use the C version */
493 #undef VEC_NEXT_LINES
494 #undef VEC_LOAD_UV
495 #undef VEC_MERGE
496 #endif
497
    /* Bytes between the end of the visible part of a line and the start of
     * the next line, for luma, chroma and destination respectively. */
498     const int i_source_margin = p_source->p[0].i_pitch
499                                  - p_source->p[0].i_visible_pitch;
500     const int i_source_margin_c = p_source->p[1].i_pitch
501                                  - p_source->p[1].i_visible_pitch;
502     const int i_dest_margin = p_dest->p->i_pitch
503                                - p_dest->p->i_visible_pitch;
504
505 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /* Plain C / MMX loop (also the AltiVec fallback), two lines per pass */
506     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
507     {
508         p_line1 = p_line2;
509         p_line2 += p_dest->p->i_pitch;
510
511         p_y1 = p_y2;
512         p_y2 += p_source->p[Y_PLANE].i_pitch;
513
        /* Main part of the line, 8 pixels of width per pass: four C macro
         * calls or one MMX call */
514         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
515         {
516 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
517             C_YUV420_YVYU( );
518             C_YUV420_YVYU( );
519             C_YUV420_YVYU( );
520             C_YUV420_YVYU( );
521 #else
522             MMX_CALL( MMX_YUV420_YVYU );
523 #endif
524         }
        /* Leftover columns: ( width % 8 ) / 2 more C macro calls */
525         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
526         {
527             C_YUV420_YVYU( );
528         }
529
        /* Skip the margins at the end of the lines */
530         p_y1 += i_source_margin;
531         p_y2 += i_source_margin;
532         p_u += i_source_margin_c;
533         p_v += i_source_margin_c;
534         p_line1 += i_dest_margin;
535         p_line2 += i_dest_margin;
536     }
537
538 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
539     /* re-enable FPU registers */
540     MMX_END;
541 #endif
542
    /* Closes the AltiVec "else" branch opened before the C fallback */
543 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
544     }
545 #endif
546
547 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
548     /*
549     ** SSE2 128 bits fetch/store instructions are faster
550     ** if memory access is 16 bytes aligned
551     */
    /* Aligned path only when the luma pitch, the destination pitch and both
     * start pointers are multiples of 16.
     * NOTE(review): p_u, p_v and the chroma pitch are not part of this test;
     * whether the ALIGNED macro needs them aligned depends on its definition
     * - confirm. */
552     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
553         ((intptr_t)p_line2|(intptr_t)p_y2))) )
554     {
555         /* use faster SSE2 aligned fetch and store */
556         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
557         {
558             p_line1 = p_line2;
559             p_line2 += p_dest->p->i_pitch;
560
561             p_y1 = p_y2;
562             p_y2 += p_source->p[Y_PLANE].i_pitch;
563
            /* 16 pixels of width per SSE2 call, then ( width % 16 ) / 2
             * C macro calls for the leftover columns */
564             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
565             {
566                 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
567             }
568             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
569             {
570                 C_YUV420_YVYU( );
571             }
572
573             p_y1 += i_source_margin;
574             p_y2 += i_source_margin;
575             p_u += i_source_margin_c;
576             p_v += i_source_margin_c;
577             p_line1 += i_dest_margin;
578             p_line2 += i_dest_margin;
579         }
580     }
581     else
582     {
583         /* use slower SSE2 unaligned fetch and store */
584         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
585         {
586             p_line1 = p_line2;
587             p_line2 += p_dest->p->i_pitch;
588
589             p_y1 = p_y2;
590             p_y2 += p_source->p[Y_PLANE].i_pitch;
591
592             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
593             {
594                 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
595             }
596             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
597             {
598                 C_YUV420_YVYU( );
599             }
600
601             p_y1 += i_source_margin;
602             p_y2 += i_source_margin;
603             p_u += i_source_margin_c;
604             p_v += i_source_margin_c;
605             p_line1 += i_dest_margin;
606             p_line2 += i_dest_margin;
607         }
608     }
609     /* make sure all SSE2 stores are visible thereafter */
610     SSE2_END;
611 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
612 }
613
614 /*****************************************************************************
615  * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
616  *****************************************************************************/
/*
 * Converts p_source (Y, U and V planes) into the single packed plane of
 * p_dest, using the width and height found in p_filter->fmt_in.video.
 *
 * Every pass of the outer loops handles a pair of lines: p_y1/p_y2 walk two
 * consecutive luma lines and p_line1/p_line2 the two matching destination
 * lines, while p_u/p_v walk the chroma planes.
 *
 * The code path is chosen by the MODULE_NAME_IS_* build macro:
 *  - i420_yuy2_altivec: vector loops when the width is a multiple of 32 or
 *    of 16 (see the tests below), otherwise the plain C loop;
 *  - i420_yuy2_sse2: aligned or unaligned SSE2 loop, picked at run time;
 *  - i420_yuy2_mmx: MMX loop;
 *  - anything else: plain C loop.
 *
 * C_YUV420_UYVY, MMX_CALL/MMX_YUV420_UYVY/MMX_END and
 * SSE2_CALL/SSE2_YUV420_UYVY_*/SSE2_END are not defined in this file
 * (presumably they come from i420_yuy2.h - confirm).
 */
617 static void I420_UYVY( filter_t *p_filter, picture_t *p_source,
618                                            picture_t *p_dest )
619 {
620     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
621     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
622     uint8_t *p_u = p_source->U_PIXELS;
623     uint8_t *p_v = p_source->V_PIXELS;
624
625     int i_x, i_y;
626
627 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Step to the next pair of lines: line1/y1 take the previous position of
 * line2/y2, which then move one pitch further. */
628 #define VEC_NEXT_LINES( ) \
629     p_line1  = p_line2; \
630     p_line2 += p_dest->p->i_pitch; \
631     p_y1     = p_y2; \
632     p_y2    += p_source->p[Y_PLANE].i_pitch;
633
/* Load one vector of U and one of V, advancing both pointers by 16 bytes. */
634 #define VEC_LOAD_UV( ) \
635     u_vec = vec_ld( 0, p_u ); p_u += 16; \
636     v_vec = vec_ld( 0, p_v ); p_v += 16;
637
/* a is the merge used to interleave U with V (vec_mergeh or vec_mergel at
 * the call sites). The result is merged before the luma bytes (chroma
 * first) for each of the two lines: 16 luma bytes are consumed and 32 bytes
 * stored per line. */
638 #define VEC_MERGE( a ) \
639     uv_vec = a( u_vec, v_vec ); \
640     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
641     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
642     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
643     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
644     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
645     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
646
647     vector unsigned char u_vec;
648     vector unsigned char v_vec;
649     vector unsigned char uv_vec;
650     vector unsigned char y_vec;
651
    /* NOTE(review): unlike the C loop further down, the two vector loops
     * never add the ( i_pitch - i_visible_pitch ) margins to any pointer;
     * it looks like they assume every pitch equals its visible pitch -
     * confirm. */
652     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
653            ( p_filter->fmt_in.video.i_height % 2 ) ) )
654     {
655         /* Width is a multiple of 32, we take 2 lines at a time */
656         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
657         {
658             VEC_NEXT_LINES( );
659             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
660             {
661                 VEC_LOAD_UV( );
662                 VEC_MERGE( vec_mergeh );
663                 VEC_MERGE( vec_mergel );
664             }
665         }
666     }
667     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
668                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
669     {
670         /* Width is only a multiple of 16, we take 4 lines at a time */
        /* The chroma vector loaded for the last 16 pixels of lines 1-2 is
         * reused, through its other half, for the first 16 pixels of
         * lines 3-4. */
671         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
672         {
673             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
674             VEC_NEXT_LINES( );
675             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
676             {
677                 VEC_LOAD_UV( );
678                 VEC_MERGE( vec_mergeh );
679                 VEC_MERGE( vec_mergel );
680             }
681
682             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
683             VEC_LOAD_UV( );
684             VEC_MERGE( vec_mergeh );
685
686             /* Line 3 and 4, pixels 0 to 16 */
687             VEC_NEXT_LINES( );
688             VEC_MERGE( vec_mergel );
689
690             /* Line 3 and 4, pixels 16 to ( width ) */
691             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
692             {
693                 VEC_LOAD_UV( );
694                 VEC_MERGE( vec_mergeh );
695                 VEC_MERGE( vec_mergel );
696             }
697         }
698     }
699     else
700     {
701         /* Crap, use the C version */
702 #undef VEC_NEXT_LINES
703 #undef VEC_LOAD_UV
704 #undef VEC_MERGE
705 #endif
706
    /* Bytes between the end of the visible part of a line and the start of
     * the next line, for luma, chroma and destination respectively. */
707     const int i_source_margin = p_source->p[0].i_pitch
708                                  - p_source->p[0].i_visible_pitch;
709     const int i_source_margin_c = p_source->p[1].i_pitch
710                                  - p_source->p[1].i_visible_pitch;
711     const int i_dest_margin = p_dest->p->i_pitch
712                                - p_dest->p->i_visible_pitch;
713
714 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /* Plain C / MMX loop (also the AltiVec fallback), two lines per pass */
715     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
716     {
717         p_line1 = p_line2;
718         p_line2 += p_dest->p->i_pitch;
719
720         p_y1 = p_y2;
721         p_y2 += p_source->p[Y_PLANE].i_pitch;
722
        /* Main part of the line, 8 pixels of width per pass: four C macro
         * calls or one MMX call */
723         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
724         {
725 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
726             C_YUV420_UYVY( );
727             C_YUV420_UYVY( );
728             C_YUV420_UYVY( );
729             C_YUV420_UYVY( );
730 #else
731             MMX_CALL( MMX_YUV420_UYVY );
732 #endif
733         }
        /* Leftover columns: ( width % 8 ) / 2 more C macro calls */
734         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x--; )
735         {
736             C_YUV420_UYVY( );
737         }
738
        /* Skip the margins at the end of the lines */
739         p_y1 += i_source_margin;
740         p_y2 += i_source_margin;
741         p_u += i_source_margin_c;
742         p_v += i_source_margin_c;
743         p_line1 += i_dest_margin;
744         p_line2 += i_dest_margin;
745     }
746
747 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
748     /* re-enable FPU registers */
749     MMX_END;
750 #endif
751
    /* Closes the AltiVec "else" branch opened before the C fallback */
752 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
753     }
754 #endif
755
756 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
757     /*
758     ** SSE2 128 bits fetch/store instructions are faster
759     ** if memory access is 16 bytes aligned
760     */
    /* Aligned path only when the luma pitch, the destination pitch and both
     * start pointers are multiples of 16.
     * NOTE(review): p_u, p_v and the chroma pitch are not part of this test;
     * whether the ALIGNED macro needs them aligned depends on its definition
     * - confirm. */
761     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
762         ((intptr_t)p_line2|(intptr_t)p_y2))) )
763     {
764         /* use faster SSE2 aligned fetch and store */
765         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
766         {
767             p_line1 = p_line2;
768             p_line2 += p_dest->p->i_pitch;
769
770             p_y1 = p_y2;
771             p_y2 += p_source->p[Y_PLANE].i_pitch;
772
            /* 16 pixels of width per SSE2 call, then ( width % 16 ) / 2
             * C macro calls for the leftover columns */
773             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
774             {
775                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
776             }
777             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
778             {
779                 C_YUV420_UYVY( );
780             }
781
782             p_y1 += i_source_margin;
783             p_y2 += i_source_margin;
784             p_u += i_source_margin_c;
785             p_v += i_source_margin_c;
786             p_line1 += i_dest_margin;
787             p_line2 += i_dest_margin;
788         }
789     }
790     else
791     {
792         /* use slower SSE2 unaligned fetch and store */
793         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
794         {
795             p_line1 = p_line2;
796             p_line2 += p_dest->p->i_pitch;
797
798             p_y1 = p_y2;
799             p_y2 += p_source->p[Y_PLANE].i_pitch;
800
801             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
802             {
803                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
804             }
805             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
806             {
807                 C_YUV420_UYVY( );
808             }
809
810             p_y1 += i_source_margin;
811             p_y2 += i_source_margin;
812             p_u += i_source_margin_c;
813             p_v += i_source_margin_c;
814             p_line1 += i_dest_margin;
815             p_line2 += i_dest_margin;
816         }
817     }
818     /* make sure all SSE2 stores are visible thereafter */
819     SSE2_END;
820 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
821 }
822
823 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
824 /*****************************************************************************
825  * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
826  *****************************************************************************/
827 static void I420_IUYV( filter_t *p_filter, picture_t *p_source,
828                                            picture_t *p_dest )
829 {
830     VLC_UNUSED(p_source); VLC_UNUSED(p_dest);
831     /* FIXME: TODO ! */
832     msg_Err( p_filter, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
833 }
834
835 /*****************************************************************************
836  * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
837  *****************************************************************************/
838 static void I420_cyuv( filter_t *p_filter, picture_t *p_source,
839                                            picture_t *p_dest )
840 {
841     uint8_t *p_line1 = p_dest->p->p_pixels +
842                        p_dest->p->i_visible_lines * p_dest->p->i_pitch
843                        + p_dest->p->i_pitch;
844     uint8_t *p_line2 = p_dest->p->p_pixels +
845                        p_dest->p->i_visible_lines * p_dest->p->i_pitch;
846     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
847     uint8_t *p_u = p_source->U_PIXELS;
848     uint8_t *p_v = p_source->V_PIXELS;
849
850     int i_x, i_y;
851
852     const int i_source_margin = p_source->p[0].i_pitch
853                                  - p_source->p[0].i_visible_pitch;
854     const int i_source_margin_c = p_source->p[1].i_pitch
855                                  - p_source->p[1].i_visible_pitch;
856     const int i_dest_margin = p_dest->p->i_pitch
857                                - p_dest->p->i_visible_pitch;
858
859 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
860     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
861     {
862         p_line1 -= 3 * p_dest->p->i_pitch;
863         p_line2 -= 3 * p_dest->p->i_pitch;
864
865         p_y1 = p_y2;
866         p_y2 += p_source->p[Y_PLANE].i_pitch;
867
868         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
869         {
870 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
871             C_YUV420_UYVY( );
872             C_YUV420_UYVY( );
873             C_YUV420_UYVY( );
874             C_YUV420_UYVY( );
875 #else
876             MMX_CALL( MMX_YUV420_UYVY );
877 #endif
878         }
879         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
880         {
881             C_YUV420_UYVY( );
882         }
883
884         p_y1 += i_source_margin;
885         p_y2 += i_source_margin;
886         p_u += i_source_margin_c;
887         p_v += i_source_margin_c;
888         p_line1 += i_dest_margin;
889         p_line2 += i_dest_margin;
890     }
891
892 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
893     /* re-enable FPU registers */
894     MMX_END;
895 #endif
896
897 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
898     /*
899     ** SSE2 128 bits fetch/store instructions are faster
900     ** if memory access is 16 bytes aligned
901     */
902     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
903         ((intptr_t)p_line2|(intptr_t)p_y2))) )
904     {
905         /* use faster SSE2 aligned fetch and store */
906         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
907         {
908             p_line1 = p_line2;
909             p_line2 += p_dest->p->i_pitch;
910
911             p_y1 = p_y2;
912             p_y2 += p_source->p[Y_PLANE].i_pitch;
913
914             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
915             {
916                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
917             }
918             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
919             {
920                 C_YUV420_UYVY( );
921             }
922
923             p_y1 += i_source_margin;
924             p_y2 += i_source_margin;
925             p_u += i_source_margin_c;
926             p_v += i_source_margin_c;
927             p_line1 += i_dest_margin;
928             p_line2 += i_dest_margin;
929         }
930     }
931     else
932     {
933         /* use slower SSE2 unaligned fetch and store */
934         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
935         {
936             p_line1 = p_line2;
937             p_line2 += p_dest->p->i_pitch;
938
939             p_y1 = p_y2;
940             p_y2 += p_source->p[Y_PLANE].i_pitch;
941
942             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
943             {
944                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
945             }
946             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
947             {
948                 C_YUV420_UYVY( );
949             }
950
951             p_y1 += i_source_margin;
952             p_y2 += i_source_margin;
953             p_u += i_source_margin_c;
954             p_v += i_source_margin_c;
955             p_line1 += i_dest_margin;
956             p_line2 += i_dest_margin;
957         }
958     }
959     /* make sure all SSE2 stores are visible thereafter */
960     SSE2_END;
961 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
962 }
963 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
964
/*****************************************************************************
 * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
 *****************************************************************************
 * Walks the picture top-down, two source luma lines and two destination
 * lines per pass. Each line pair is handled in blocks of 8 source columns,
 * with two C_YUV420_Y211 steps per block; columns beyond the last full
 * block (i_width % 8) are not converted.
 *
 * p_filter: fmt_in.video.i_width and i_height give the area to convert
 * p_source: planar input picture (Y, U and V planes)
 * p_dest:   packed output picture (first plane only)
 *****************************************************************************/
#if defined (MODULE_NAME_IS_i420_yuy2)
static void I420_Y211( filter_t *p_filter, picture_t *p_source,
                                           picture_t *p_dest )
{
    /* C_YUV420_Y211 (from i420_yuy2.h) works on these six pointers by name,
     * so their names must not change. */
    uint8_t *p_line1, *p_line2;
    uint8_t *p_y1, *p_y2;
    uint8_t *p_u, *p_v;

    /* Distance between the start of two consecutive lines */
    const int i_luma_pitch = p_source->p[Y_PLANE].i_pitch;
    const int i_out_pitch = p_dest->p->i_pitch;

    /* Padding bytes after the visible part of a line */
    const int i_luma_pad = p_source->p[0].i_pitch
                            - p_source->p[0].i_visible_pitch;
    const int i_chroma_pad = p_source->p[1].i_pitch
                              - p_source->p[1].i_visible_pitch;
    const int i_out_pad = i_out_pitch - p_dest->p->i_visible_pitch;

    const int i_blocks_per_line = p_filter->fmt_in.video.i_width / 8;
    int i_pairs_left = p_filter->fmt_in.video.i_height / 2;
    int i_block;

    p_line2 = p_dest->p->p_pixels;
    p_y2 = p_source->Y_PIXELS;
    p_u = p_source->U_PIXELS;
    p_v = p_source->V_PIXELS;

    while( i_pairs_left-- )
    {
        /* The previous "second" line position becomes the first line of
         * this pair; the second one sits one pitch further. */
        p_line1 = p_line2;
        p_line2 = p_line1 + i_out_pitch;

        p_y1 = p_y2;
        p_y2 = p_y1 + i_luma_pitch;

        for( i_block = i_blocks_per_line ; i_block-- ; )
        {
            C_YUV420_Y211( );
            C_YUV420_Y211( );
        }

        /* Step over the line padding. p_y1 and p_line1 are re-derived from
         * p_y2 and p_line2 at the top of the loop, so only the latter need
         * adjusting here. */
        p_y2 += i_luma_pad;
        p_u += i_chroma_pad;
        p_v += i_chroma_pad;
        p_line2 += i_out_pad;
    }
}
#endif