]> git.sesse.net Git - vlc/blob - modules/video_chroma/i420_yuy2.c
ff9f62d30e0257a6165ebd27aa3409421fcc0fd7
[vlc] / modules / video_chroma / i420_yuy2.c
1 /*****************************************************************************
2  * i420_yuy2.c : YUV to YUV conversion module for vlc
3  *****************************************************************************
4  * Copyright (C) 2000, 2001 the VideoLAN team
5  * $Id$
6  *
7  * Authors: Samuel Hocevar <sam@zoy.org>
8  *          Damien Fouilleul <damien@videolan.org>
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation; either version 2 of the License, or
13  * (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with this program; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23  *****************************************************************************/
24
25 /*****************************************************************************
26  * Preamble
27  *****************************************************************************/
28
29 #ifdef HAVE_CONFIG_H
30 # include "config.h"
31 #endif
32
33 #include <vlc_common.h>
34 #include <vlc_plugin.h>
35 #include <vlc_filter.h>
36 #include <vlc_vout.h>
37
38 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
39 #   include <altivec.h>
40 #endif
41
42 #include "i420_yuy2.h"
43
/* Human-readable FOURCC lists. In the part of the file reviewed here they are
 * only pasted into the set_description() strings of the module descriptor.
 * DEST_FOURCC depends on the build variant: only the plain C build lists Y211,
 * and the AltiVec build lists neither IUYV nor cyuv, mirroring the #if guards
 * around the corresponding cases in Activate(). */
44 #define SRC_FOURCC  "I420,IYUV,YV12"
45
46 #if defined (MODULE_NAME_IS_i420_yuy2)
47 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
48 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
49 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
50 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
51 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
52 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
53 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
54 #endif
55
56 /*****************************************************************************
57  * Local and extern prototypes.
 *
 * Every converter I420_xxx( filter, source, dest ) has a companion
 * I420_xxx_Filter( filter, picture ) entry point, and Activate() stores the
 * _Filter variant in pf_video_filter. The _Filter bodies are not written out
 * in the part of the file reviewed here; presumably the VIDEO_FILTER_WRAPPER
 * invocations further down generate them -- TODO confirm against vlc_filter.h.
 * IUYV and cyuv are left out of the AltiVec build, and Y211 exists only in
 * the plain C build.
58  *****************************************************************************/
59 static int  Activate ( vlc_object_t * );
60
61 static void I420_YUY2           ( filter_t *, picture_t *, picture_t * );
62 static void I420_YVYU           ( filter_t *, picture_t *, picture_t * );
63 static void I420_UYVY           ( filter_t *, picture_t *, picture_t * );
64 static picture_t *I420_YUY2_Filter    ( filter_t *, picture_t * );
65 static picture_t *I420_YVYU_Filter    ( filter_t *, picture_t * );
66 static picture_t *I420_UYVY_Filter    ( filter_t *, picture_t * );
67 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
68 static void I420_IUYV           ( filter_t *, picture_t *, picture_t * );
69 static void I420_cyuv           ( filter_t *, picture_t *, picture_t * );
70 static picture_t *I420_IUYV_Filter    ( filter_t *, picture_t * );
71 static picture_t *I420_cyuv_Filter    ( filter_t *, picture_t * );
72 #endif
73 #if defined (MODULE_NAME_IS_i420_yuy2)
74 static void I420_Y211           ( filter_t *, picture_t *, picture_t * );
75 static picture_t *I420_Y211_Filter    ( filter_t *, picture_t * );
76 #endif
77
78 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
79 /* Initialize MMX-specific constants */
/* i_00ffw holds 0x00ff in each of its four 16-bit words; i_80w holds 0x80 in
 * each of its four low bytes and zero in the upper 32 bits.
 * Neither name is referenced directly in the part of the file reviewed here;
 * presumably the MMX macros from i420_yuy2.h use them -- TODO confirm. */
80 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
81 static const uint64_t i_80w   = 0x0000000080808080ULL;
82 #endif
83
84 /*****************************************************************************
85  * Module descriptor.
 *
 * Exactly one branch is compiled, chosen by the MODULE_NAME_IS_* macro of the
 * build. Each branch registers a "video filter2" capability with its own
 * score (80 plain C, 160 MMX, 250 SSE2, 250 AltiVec); the accelerated
 * variants also declare the CPU feature they require.
 * All descriptions are wrapped in N_(), which only marks the string for
 * translation, and none is followed by a semicolon, so the four branches
 * read identically.
 * Activate() is the open callback; there is no close callback.
86  *****************************************************************************/
87 vlc_module_begin ()
88 #if defined (MODULE_NAME_IS_i420_yuy2)
89     set_description( N_("Conversions from " SRC_FOURCC " to " DEST_FOURCC) )
90     set_capability( "video filter2", 80 )
91 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
92     set_description( N_("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) )
93     set_capability( "video filter2", 160 )
94     add_requirement( MMX )
95 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
96     set_description( N_("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) )
97     set_capability( "video filter2", 250 )
98     add_requirement( SSE2 )
99 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
100     set_description(
101             N_("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) )
102     set_capability( "video filter2", 250 )
103     add_requirement( ALTIVEC )
104 #endif
105     set_callbacks( Activate, NULL )
106 vlc_module_end ()
107
108 /*****************************************************************************
109  * Activate: select a chroma conversion function
110  *****************************************************************************
111  * This function checks the filter's input and output video formats and, when
 * they are supported, stores the matching I420_xxx_Filter callback in
 * p_filter->pf_video_filter. It does not allocate anything.
 *
 * p_this is cast to filter_t *.
 * Returns 0 on success, and -1 when the input width or height is odd, when
 * the input and output sizes differ, or when the input/output chroma pair is
 * not handled by this build.
112  *****************************************************************************/
113 static int Activate( vlc_object_t *p_this )
114 {
115     filter_t *p_filter = (filter_t *)p_this;
116
    /* Odd dimensions are refused: the converters below walk rows in pairs
     * (i_height / 2 passes) and pixels in pairs. */
117     if( p_filter->fmt_in.video.i_width & 1
118      || p_filter->fmt_in.video.i_height & 1 )
119     {
120         return -1;
121     }
122
    /* No scaling: the output must have exactly the input dimensions. */
123     if( p_filter->fmt_in.video.i_width != p_filter->fmt_out.video.i_width
124      || p_filter->fmt_in.video.i_height != p_filter->fmt_out.video.i_height )
125         return -1;
126
127     switch( p_filter->fmt_in.video.i_chroma )
128     {
        /* NOTE(review): YV12 and I420 are routed to the same converters.
         * Whether the swapped U/V plane order of YV12 is handled elsewhere
         * (e.g. through U_PIXELS / V_PIXELS) is not visible here -- confirm. */
129         case VLC_CODEC_YV12:
130         case VLC_CODEC_I420:
131             switch( p_filter->fmt_out.video.i_chroma )
132             {
133                 case VLC_CODEC_YUYV:
134                     p_filter->pf_video_filter = I420_YUY2_Filter;
135                     break;
136
137                 case VLC_CODEC_YVYU:
138                     p_filter->pf_video_filter = I420_YVYU_Filter;
139                     break;
140
141                 case VLC_CODEC_UYVY:
142                     p_filter->pf_video_filter = I420_UYVY_Filter;
143                     break;
144 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
                /* NOTE(review): I420_IUYV further down only logs an error,
                 * yet this output chroma is accepted here. */
145                 case VLC_FOURCC('I','U','Y','V'):
146                     p_filter->pf_video_filter = I420_IUYV_Filter;
147                     break;
148
149                 case VLC_CODEC_CYUV:
150                     p_filter->pf_video_filter = I420_cyuv_Filter;
151                     break;
152 #endif
153
154 #if defined (MODULE_NAME_IS_i420_yuy2)
155                 case VLC_CODEC_Y211:
156                     p_filter->pf_video_filter = I420_Y211_Filter;
157                     break;
158 #endif
159
160                 default:
161                     return -1;
162             }
163             break;
164
165         default:
166             return -1;
167     }
168
169     return 0;
170 }
171
/* Disabled helper (compiled out by #if 0): executes the x86 "rdtsc"
 * instruction through inline assembly and returns the value it produced. */
172 #if 0
173 static inline unsigned long long read_cycles(void)
174 {
175     unsigned long long v;
176     __asm__ __volatile__("rdtsc" : "=A" (v): );
177
178     return v;
179 }
180 #endif
181
182 /* Following functions are local */
183
/* One VIDEO_FILTER_WRAPPER per converter, under the same #if guards as the
 * prototypes at the top of the file. The macro is defined outside this file;
 * presumably each invocation emits the matching I420_xxx_Filter function that
 * Activate() installs -- TODO confirm against vlc_filter.h. */
184 VIDEO_FILTER_WRAPPER( I420_YUY2 )
185 VIDEO_FILTER_WRAPPER( I420_YVYU )
186 VIDEO_FILTER_WRAPPER( I420_UYVY )
187 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
188 VIDEO_FILTER_WRAPPER( I420_IUYV )
189 VIDEO_FILTER_WRAPPER( I420_cyuv )
190 #endif
191 #if defined (MODULE_NAME_IS_i420_yuy2)
192 VIDEO_FILTER_WRAPPER( I420_Y211 )
193 #endif
194
195 /*****************************************************************************
196  * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
 *
 * p_filter supplies the frame size (fmt_in.video.i_width / i_height),
 * p_source the Y, U and V planes, and p_dest the single packed output plane.
 * Rows are always processed in pairs: two luma rows and two output rows per
 * pass. The MODULE_NAME_IS_* macro of the build selects the AltiVec, scalar C,
 * MMX or SSE2 loop.
 * The per-pixel work is done by C_YUV420_YUYV, MMX_CALL and SSE2_CALL, which
 * are defined outside this file (presumably in i420_yuy2.h). The loops here
 * never step the p_* pointers inside a row, so those macros are expected to
 * advance them -- TODO confirm against the header.
197  *****************************************************************************/
198 static void I420_YUY2( filter_t *p_filter, picture_t *p_source,
199                                            picture_t *p_dest )
200 {
    /* Only p_line2 / p_y2 are initialised here; p_line1 / p_y1 are assigned
     * from them at the start of every row pair. */
201     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
202     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
203     uint8_t *p_u = p_source->U_PIXELS;
204     uint8_t *p_v = p_source->V_PIXELS;
205
206     int i_x, i_y;
207
/* AltiVec build: two vector branches, then a fall-back to the scalar loop.
 * VEC_NEXT_LINES moves to the next row pair, VEC_LOAD_UV fetches 16 bytes of
 * U and of V, and VEC_MERGE interleaves half of those chroma bytes with
 * 16 luma bytes from each of the two rows and stores 32 output bytes per row.
 * NOTE(review): the vector branches never apply the pitch margins computed
 * for the scalar loop, and nothing here checks 16-byte alignment for
 * vec_ld / vec_st; this looks like it relies on unpadded, aligned planes --
 * confirm. */
208 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
209 #define VEC_NEXT_LINES( ) \
210     p_line1  = p_line2; \
211     p_line2 += p_dest->p->i_pitch; \
212     p_y1     = p_y2; \
213     p_y2    += p_source->p[Y_PLANE].i_pitch;
214
215 #define VEC_LOAD_UV( ) \
216     u_vec = vec_ld( 0, p_u ); p_u += 16; \
217     v_vec = vec_ld( 0, p_v ); p_v += 16;
218
219 #define VEC_MERGE( a ) \
220     uv_vec = a( u_vec, v_vec ); \
221     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
222     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
223     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
224     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
225     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
226     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
227
228     vector unsigned char u_vec;
229     vector unsigned char v_vec;
230     vector unsigned char uv_vec;
231     vector unsigned char y_vec;
232
233     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
234            ( p_filter->fmt_in.video.i_height % 2 ) ) )
235     {
236         /* Width is a multiple of 32, we take 2 lines at a time */
237         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
238         {
239             VEC_NEXT_LINES( );
240             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
241             {
                /* One chroma load feeds two merges, i.e. 32 pixels per row. */
242                 VEC_LOAD_UV( );
243                 VEC_MERGE( vec_mergeh );
244                 VEC_MERGE( vec_mergel );
245             }
246         }
247     }
248     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
249                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
250     {
251         /* Width is only a multiple of 16, we take 4 lines at a time */
252         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
253         {
254             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
255             VEC_NEXT_LINES( );
256             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
257             {
258                 VEC_LOAD_UV( );
259                 VEC_MERGE( vec_mergeh );
260                 VEC_MERGE( vec_mergel );
261             }
262
263             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
264             VEC_LOAD_UV( );
265             VEC_MERGE( vec_mergeh );
266
267             /* Line 3 and 4, pixels 0 to 16 */
            /* No VEC_LOAD_UV here: this merge reuses the u_vec / v_vec loaded
             * just above and takes the half that vec_mergeh did not use. */
268             VEC_NEXT_LINES( );
269             VEC_MERGE( vec_mergel );
270
271             /* Line 3 and 4, pixels 16 to ( width ) */
272             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
273             {
274                 VEC_LOAD_UV( );
275                 VEC_MERGE( vec_mergeh );
276                 VEC_MERGE( vec_mergel );
277             }
278         }
279     }
280     else
281     {
282         /* Crap, use the C version */
283 #undef VEC_NEXT_LINES
284 #undef VEC_LOAD_UV
285 #undef VEC_MERGE
286 #endif
287
    /* Per-row difference between i_pitch and i_visible_pitch for the luma
     * plane, the chroma planes (taken from plane 1) and the output plane. */
288     const int i_source_margin = p_source->p[0].i_pitch
289                                  - p_source->p[0].i_visible_pitch;
290     const int i_source_margin_c = p_source->p[1].i_pitch
291                                  - p_source->p[1].i_visible_pitch;
292     const int i_dest_margin = p_dest->p->i_pitch
293                                - p_dest->p->i_visible_pitch;
294
295 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /* Scalar C / MMX loop (also the AltiVec fall-back): one pass per row pair. */
296     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
297     {
298         p_line1 = p_line2;
299         p_line2 += p_dest->p->i_pitch;
300
301         p_y1 = p_y2;
302         p_y2 += p_source->p[Y_PLANE].i_pitch;
303
304 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
        /* Four macro calls per 8 pixels, i.e. two pixels per call. */
305         for( i_x = p_filter->fmt_in.video.i_width / 8; i_x-- ; )
306         {
307             C_YUV420_YUYV( );
308             C_YUV420_YUYV( );
309             C_YUV420_YUYV( );
310             C_YUV420_YUYV( );
311         }
312 #else
313         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
314         {
315             MMX_CALL( MMX_YUV420_YUYV );
316         }
317 #endif
        /* Leftover pixels when the width is not a multiple of 8. */
318         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
319         {
320             C_YUV420_YUYV( );
321         }
322
        /* Skip the row margins. p_y1 and p_line1 are overwritten at the top
         * of the next pass, so only the other four updates carry over. */
323         p_y1 += i_source_margin;
324         p_y2 += i_source_margin;
325         p_u += i_source_margin_c;
326         p_v += i_source_margin_c;
327         p_line1 += i_dest_margin;
328         p_line2 += i_dest_margin;
329     }
330
331 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
332     /* re-enable FPU registers */
333     MMX_END;
334 #endif
335
336 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* Closes the AltiVec "else" opened before the margin declarations. */
337     }
338 #endif
339
340 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
341     /*
342     ** SSE2 128 bits fetch/store instructions are faster
343     ** if memory access is 16 bytes aligned
344     */
345
    /* Aligned variant only when the luma pitch, the output pitch and both
     * base pointers all have their low four bits clear. */
346     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
347         ((intptr_t)p_line2|(intptr_t)p_y2))) )
348     {
349         /* use faster SSE2 aligned fetch and store */
350         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
351         {
352             p_line1 = p_line2;
353             p_line2 += p_dest->p->i_pitch;
354
355             p_y1 = p_y2;
356             p_y2 += p_source->p[Y_PLANE].i_pitch;
357
            /* 16 pixels per SSE2_CALL; the remainder goes through the C macro. */
358             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
359             {
360                 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
361             }
362             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
363             {
364                 C_YUV420_YUYV( );
365             }
366
367             p_y1 += i_source_margin;
368             p_y2 += i_source_margin;
369             p_u += i_source_margin_c;
370             p_v += i_source_margin_c;
371             p_line1 += i_dest_margin;
372             p_line2 += i_dest_margin;
373         }
374     }
375     else
376     {
377         /* use slower SSE2 unaligned fetch and store */
378         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
379         {
380             p_line1 = p_line2;
381             p_line2 += p_dest->p->i_pitch;
382
383             p_y1 = p_y2;
384             p_y2 += p_source->p[Y_PLANE].i_pitch;
385
386             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
387             {
388                 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
389             }
390             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
391             {
392                 C_YUV420_YUYV( );
393             }
394
395             p_y1 += i_source_margin;
396             p_y2 += i_source_margin;
397             p_u += i_source_margin_c;
398             p_v += i_source_margin_c;
399             p_line1 += i_dest_margin;
400             p_line2 += i_dest_margin;
401         }
402     }
403     /* make sure all SSE2 stores are visible thereafter */
404     SSE2_END;
405
406 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
407 }
408
409 /*****************************************************************************
410  * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
 *
 * Same structure as I420_YUY2: p_filter gives the frame size, p_source the
 * Y, U and V planes, p_dest the packed output plane, and rows are handled in
 * pairs. The differences are the C_YUV420_YVYU / MMX_YUV420_YVYU /
 * SSE2_YUV420_YVYU_* macros and, in the AltiVec branch, the chroma merge
 * taking v_vec before u_vec.
 * Those macros are defined outside this file (presumably in i420_yuy2.h) and
 * are expected to advance the p_* pointers themselves -- TODO confirm.
411  *****************************************************************************/
412 static void I420_YVYU( filter_t *p_filter, picture_t *p_source,
413                                            picture_t *p_dest )
414 {
    /* p_line1 / p_y1 are assigned from p_line2 / p_y2 at each row pair. */
415     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
416     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
417     uint8_t *p_u = p_source->U_PIXELS;
418     uint8_t *p_v = p_source->V_PIXELS;
419
420     int i_x, i_y;
421
/* AltiVec build: two vector branches, then a fall-back to the scalar loop.
 * NOTE(review): as in I420_YUY2, the vector branches never apply the pitch
 * margins and nothing here checks 16-byte alignment -- confirm that planes
 * are unpadded and aligned. */
422 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
423 #define VEC_NEXT_LINES( ) \
424     p_line1  = p_line2; \
425     p_line2 += p_dest->p->i_pitch; \
426     p_y1     = p_y2; \
427     p_y2    += p_source->p[Y_PLANE].i_pitch;
428
429 #define VEC_LOAD_UV( ) \
430     u_vec = vec_ld( 0, p_u ); p_u += 16; \
431     v_vec = vec_ld( 0, p_v ); p_v += 16;
432
433 #define VEC_MERGE( a ) \
434     vu_vec = a( v_vec, u_vec ); \
435     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
436     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
437     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
438     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
439     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
440     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
441
442     vector unsigned char u_vec;
443     vector unsigned char v_vec;
444     vector unsigned char vu_vec;
445     vector unsigned char y_vec;
446
447     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
448            ( p_filter->fmt_in.video.i_height % 2 ) ) )
449     {
450         /* Width is a multiple of 32, we take 2 lines at a time */
451         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
452         {
453             VEC_NEXT_LINES( );
454             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
455             {
456                 VEC_LOAD_UV( );
457                 VEC_MERGE( vec_mergeh );
458                 VEC_MERGE( vec_mergel );
459             }
460         }
461     }
462     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
463                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
464     {
465         /* Width is only a multiple of 16, we take 4 lines at a time */
466         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
467         {
468             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
469             VEC_NEXT_LINES( );
470             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
471             {
472                 VEC_LOAD_UV( );
473                 VEC_MERGE( vec_mergeh );
474                 VEC_MERGE( vec_mergel );
475             }
476
477             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
478             VEC_LOAD_UV( );
479             VEC_MERGE( vec_mergeh );
480
481             /* Line 3 and 4, pixels 0 to 16 */
            /* No VEC_LOAD_UV here: this merge reuses the chroma vectors
             * loaded just above and takes the half vec_mergeh did not use. */
482             VEC_NEXT_LINES( );
483             VEC_MERGE( vec_mergel );
484
485             /* Line 3 and 4, pixels 16 to ( width ) */
486             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
487             {
488                 VEC_LOAD_UV( );
489                 VEC_MERGE( vec_mergeh );
490                 VEC_MERGE( vec_mergel );
491             }
492         }
493     }
494     else
495     {
496         /* Crap, use the C version */
497 #undef VEC_NEXT_LINES
498 #undef VEC_LOAD_UV
499 #undef VEC_MERGE
500 #endif
501
    /* Per-row difference between i_pitch and i_visible_pitch for the luma
     * plane, the chroma planes (taken from plane 1) and the output plane. */
502     const int i_source_margin = p_source->p[0].i_pitch
503                                  - p_source->p[0].i_visible_pitch;
504     const int i_source_margin_c = p_source->p[1].i_pitch
505                                  - p_source->p[1].i_visible_pitch;
506     const int i_dest_margin = p_dest->p->i_pitch
507                                - p_dest->p->i_visible_pitch;
508
509 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /* Scalar C / MMX loop (also the AltiVec fall-back): one pass per row pair. */
510     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
511     {
512         p_line1 = p_line2;
513         p_line2 += p_dest->p->i_pitch;
514
515         p_y1 = p_y2;
516         p_y2 += p_source->p[Y_PLANE].i_pitch;
517
        /* 8 pixels per iteration: four C macro calls or one MMX_CALL. */
518         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
519         {
520 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
521             C_YUV420_YVYU( );
522             C_YUV420_YVYU( );
523             C_YUV420_YVYU( );
524             C_YUV420_YVYU( );
525 #else
526             MMX_CALL( MMX_YUV420_YVYU );
527 #endif
528         }
        /* Leftover pixels when the width is not a multiple of 8. */
529         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
530         {
531             C_YUV420_YVYU( );
532         }
533
        /* Skip the row margins; p_y1 and p_line1 are reassigned next pass. */
534         p_y1 += i_source_margin;
535         p_y2 += i_source_margin;
536         p_u += i_source_margin_c;
537         p_v += i_source_margin_c;
538         p_line1 += i_dest_margin;
539         p_line2 += i_dest_margin;
540     }
541
542 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
543     /* re-enable FPU registers */
544     MMX_END;
545 #endif
546
547 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* Closes the AltiVec "else" opened before the margin declarations. */
548     }
549 #endif
550
551 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
552     /*
553     ** SSE2 128 bits fetch/store instructions are faster
554     ** if memory access is 16 bytes aligned
555     */
    /* Aligned variant only when the luma pitch, the output pitch and both
     * base pointers all have their low four bits clear. */
556     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
557         ((intptr_t)p_line2|(intptr_t)p_y2))) )
558     {
559         /* use faster SSE2 aligned fetch and store */
560         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
561         {
562             p_line1 = p_line2;
563             p_line2 += p_dest->p->i_pitch;
564
565             p_y1 = p_y2;
566             p_y2 += p_source->p[Y_PLANE].i_pitch;
567
            /* 16 pixels per SSE2_CALL; the remainder goes through the C macro. */
568             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
569             {
570                 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
571             }
572             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
573             {
574                 C_YUV420_YVYU( );
575             }
576
577             p_y1 += i_source_margin;
578             p_y2 += i_source_margin;
579             p_u += i_source_margin_c;
580             p_v += i_source_margin_c;
581             p_line1 += i_dest_margin;
582             p_line2 += i_dest_margin;
583         }
584     }
585     else
586     {
587         /* use slower SSE2 unaligned fetch and store */
588         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
589         {
590             p_line1 = p_line2;
591             p_line2 += p_dest->p->i_pitch;
592
593             p_y1 = p_y2;
594             p_y2 += p_source->p[Y_PLANE].i_pitch;
595
596             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
597             {
598                 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
599             }
600             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
601             {
602                 C_YUV420_YVYU( );
603             }
604
605             p_y1 += i_source_margin;
606             p_y2 += i_source_margin;
607             p_u += i_source_margin_c;
608             p_v += i_source_margin_c;
609             p_line1 += i_dest_margin;
610             p_line2 += i_dest_margin;
611         }
612     }
613     /* make sure all SSE2 stores are visible thereafter */
614     SSE2_END;
615 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
616 }
617
618 /*****************************************************************************
619  * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
 *
 * Same structure as I420_YUY2: p_filter gives the frame size, p_source the
 * Y, U and V planes, p_dest the packed output plane, and rows are handled in
 * pairs. The differences are the C_YUV420_UYVY / MMX_YUV420_UYVY /
 * SSE2_YUV420_UYVY_* macros and, in the AltiVec branch, the final merges
 * taking the chroma vector before the luma vector.
 * Those macros are defined outside this file (presumably in i420_yuy2.h) and
 * are expected to advance the p_* pointers themselves -- TODO confirm.
620  *****************************************************************************/
621 static void I420_UYVY( filter_t *p_filter, picture_t *p_source,
622                                            picture_t *p_dest )
623 {
    /* p_line1 / p_y1 are assigned from p_line2 / p_y2 at each row pair. */
624     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
625     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
626     uint8_t *p_u = p_source->U_PIXELS;
627     uint8_t *p_v = p_source->V_PIXELS;
628
629     int i_x, i_y;
630
/* AltiVec build: two vector branches, then a fall-back to the scalar loop.
 * NOTE(review): as in I420_YUY2, the vector branches never apply the pitch
 * margins and nothing here checks 16-byte alignment -- confirm that planes
 * are unpadded and aligned. */
631 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
632 #define VEC_NEXT_LINES( ) \
633     p_line1  = p_line2; \
634     p_line2 += p_dest->p->i_pitch; \
635     p_y1     = p_y2; \
636     p_y2    += p_source->p[Y_PLANE].i_pitch;
637
638 #define VEC_LOAD_UV( ) \
639     u_vec = vec_ld( 0, p_u ); p_u += 16; \
640     v_vec = vec_ld( 0, p_v ); p_v += 16;
641
642 #define VEC_MERGE( a ) \
643     uv_vec = a( u_vec, v_vec ); \
644     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
645     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
646     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
647     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
648     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
649     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
650
651     vector unsigned char u_vec;
652     vector unsigned char v_vec;
653     vector unsigned char uv_vec;
654     vector unsigned char y_vec;
655
656     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
657            ( p_filter->fmt_in.video.i_height % 2 ) ) )
658     {
659         /* Width is a multiple of 32, we take 2 lines at a time */
660         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
661         {
662             VEC_NEXT_LINES( );
663             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
664             {
665                 VEC_LOAD_UV( );
666                 VEC_MERGE( vec_mergeh );
667                 VEC_MERGE( vec_mergel );
668             }
669         }
670     }
671     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
672                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
673     {
674         /* Width is only a multiple of 16, we take 4 lines at a time */
675         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
676         {
677             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
678             VEC_NEXT_LINES( );
679             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
680             {
681                 VEC_LOAD_UV( );
682                 VEC_MERGE( vec_mergeh );
683                 VEC_MERGE( vec_mergel );
684             }
685
686             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
687             VEC_LOAD_UV( );
688             VEC_MERGE( vec_mergeh );
689
690             /* Line 3 and 4, pixels 0 to 16 */
            /* No VEC_LOAD_UV here: this merge reuses the chroma vectors
             * loaded just above and takes the half vec_mergeh did not use. */
691             VEC_NEXT_LINES( );
692             VEC_MERGE( vec_mergel );
693
694             /* Line 3 and 4, pixels 16 to ( width ) */
695             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
696             {
697                 VEC_LOAD_UV( );
698                 VEC_MERGE( vec_mergeh );
699                 VEC_MERGE( vec_mergel );
700             }
701         }
702     }
703     else
704     {
705         /* Crap, use the C version */
706 #undef VEC_NEXT_LINES
707 #undef VEC_LOAD_UV
708 #undef VEC_MERGE
709 #endif
710
    /* Per-row difference between i_pitch and i_visible_pitch for the luma
     * plane, the chroma planes (taken from plane 1) and the output plane. */
711     const int i_source_margin = p_source->p[0].i_pitch
712                                  - p_source->p[0].i_visible_pitch;
713     const int i_source_margin_c = p_source->p[1].i_pitch
714                                  - p_source->p[1].i_visible_pitch;
715     const int i_dest_margin = p_dest->p->i_pitch
716                                - p_dest->p->i_visible_pitch;
717
718 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
    /* Scalar C / MMX loop (also the AltiVec fall-back): one pass per row pair. */
719     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
720     {
721         p_line1 = p_line2;
722         p_line2 += p_dest->p->i_pitch;
723
724         p_y1 = p_y2;
725         p_y2 += p_source->p[Y_PLANE].i_pitch;
726
        /* 8 pixels per iteration: four C macro calls or one MMX_CALL. */
727         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
728         {
729 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
730             C_YUV420_UYVY( );
731             C_YUV420_UYVY( );
732             C_YUV420_UYVY( );
733             C_YUV420_UYVY( );
734 #else
735             MMX_CALL( MMX_YUV420_UYVY );
736 #endif
737         }
        /* Leftover pixels when the width is not a multiple of 8. */
738         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x--; )
739         {
740             C_YUV420_UYVY( );
741         }
742
        /* Skip the row margins; p_y1 and p_line1 are reassigned next pass. */
743         p_y1 += i_source_margin;
744         p_y2 += i_source_margin;
745         p_u += i_source_margin_c;
746         p_v += i_source_margin_c;
747         p_line1 += i_dest_margin;
748         p_line2 += i_dest_margin;
749     }
750
751 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
752     /* re-enable FPU registers */
753     MMX_END;
754 #endif
755
756 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
    /* Closes the AltiVec "else" opened before the margin declarations. */
757     }
758 #endif
759
760 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
761     /*
762     ** SSE2 128 bits fetch/store instructions are faster
763     ** if memory access is 16 bytes aligned
764     */
    /* Aligned variant only when the luma pitch, the output pitch and both
     * base pointers all have their low four bits clear. */
765     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
766         ((intptr_t)p_line2|(intptr_t)p_y2))) )
767     {
768         /* use faster SSE2 aligned fetch and store */
769         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
770         {
771             p_line1 = p_line2;
772             p_line2 += p_dest->p->i_pitch;
773
774             p_y1 = p_y2;
775             p_y2 += p_source->p[Y_PLANE].i_pitch;
776
            /* 16 pixels per SSE2_CALL; the remainder goes through the C macro. */
777             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
778             {
779                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
780             }
781             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
782             {
783                 C_YUV420_UYVY( );
784             }
785
786             p_y1 += i_source_margin;
787             p_y2 += i_source_margin;
788             p_u += i_source_margin_c;
789             p_v += i_source_margin_c;
790             p_line1 += i_dest_margin;
791             p_line2 += i_dest_margin;
792         }
793     }
794     else
795     {
796         /* use slower SSE2 unaligned fetch and store */
797         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
798         {
799             p_line1 = p_line2;
800             p_line2 += p_dest->p->i_pitch;
801
802             p_y1 = p_y2;
803             p_y2 += p_source->p[Y_PLANE].i_pitch;
804
805             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
806             {
807                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
808             }
809             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
810             {
811                 C_YUV420_UYVY( );
812             }
813
814             p_y1 += i_source_margin;
815             p_y2 += i_source_margin;
816             p_u += i_source_margin_c;
817             p_v += i_source_margin_c;
818             p_line1 += i_dest_margin;
819             p_line2 += i_dest_margin;
820         }
821     }
822     /* make sure all SSE2 stores are visible thereafter */
823     SSE2_END;
824 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
825 }
826
827 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
828 /*****************************************************************************
829  * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
 *
 * Placeholder: no conversion is performed. p_source and p_dest are marked
 * unused and the only effect is an error message logged on p_filter.
 * NOTE(review): Activate() still accepts the IUYV output chroma on
 * non-AltiVec builds, so a filter opened for it never fills p_dest here.
830  *****************************************************************************/
831 static void I420_IUYV( filter_t *p_filter, picture_t *p_source,
832                                            picture_t *p_dest )
833 {
834     VLC_UNUSED(p_source); VLC_UNUSED(p_dest);
835     /* FIXME: TODO ! */
836     msg_Err( p_filter, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
837 }
838
839 /*****************************************************************************
840  * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
841  *****************************************************************************/
842 static void I420_cyuv( filter_t *p_filter, picture_t *p_source,
843                                            picture_t *p_dest )
844 {
845     uint8_t *p_line1 = p_dest->p->p_pixels +
846                        p_dest->p->i_visible_lines * p_dest->p->i_pitch
847                        + p_dest->p->i_pitch;
848     uint8_t *p_line2 = p_dest->p->p_pixels +
849                        p_dest->p->i_visible_lines * p_dest->p->i_pitch;
850     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
851     uint8_t *p_u = p_source->U_PIXELS;
852     uint8_t *p_v = p_source->V_PIXELS;
853
854     int i_x, i_y;
855
856     const int i_source_margin = p_source->p[0].i_pitch
857                                  - p_source->p[0].i_visible_pitch;
858     const int i_source_margin_c = p_source->p[1].i_pitch
859                                  - p_source->p[1].i_visible_pitch;
860     const int i_dest_margin = p_dest->p->i_pitch
861                                - p_dest->p->i_visible_pitch;
862
863 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
864     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
865     {
866         p_line1 -= 3 * p_dest->p->i_pitch;
867         p_line2 -= 3 * p_dest->p->i_pitch;
868
869         p_y1 = p_y2;
870         p_y2 += p_source->p[Y_PLANE].i_pitch;
871
872         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
873         {
874 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
875             C_YUV420_UYVY( );
876             C_YUV420_UYVY( );
877             C_YUV420_UYVY( );
878             C_YUV420_UYVY( );
879 #else
880             MMX_CALL( MMX_YUV420_UYVY );
881 #endif
882         }
883         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
884         {
885             C_YUV420_UYVY( );
886         }
887
888         p_y1 += i_source_margin;
889         p_y2 += i_source_margin;
890         p_u += i_source_margin_c;
891         p_v += i_source_margin_c;
892         p_line1 += i_dest_margin;
893         p_line2 += i_dest_margin;
894     }
895
896 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
897     /* re-enable FPU registers */
898     MMX_END;
899 #endif
900
901 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
902     /*
903     ** SSE2 128 bits fetch/store instructions are faster
904     ** if memory access is 16 bytes aligned
905     */
906     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
907         ((intptr_t)p_line2|(intptr_t)p_y2))) )
908     {
909         /* use faster SSE2 aligned fetch and store */
910         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
911         {
912             p_line1 = p_line2;
913             p_line2 += p_dest->p->i_pitch;
914
915             p_y1 = p_y2;
916             p_y2 += p_source->p[Y_PLANE].i_pitch;
917
918             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
919             {
920                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
921             }
922             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
923             {
924                 C_YUV420_UYVY( );
925             }
926
927             p_y1 += i_source_margin;
928             p_y2 += i_source_margin;
929             p_u += i_source_margin_c;
930             p_v += i_source_margin_c;
931             p_line1 += i_dest_margin;
932             p_line2 += i_dest_margin;
933         }
934     }
935     else
936     {
937         /* use slower SSE2 unaligned fetch and store */
938         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
939         {
940             p_line1 = p_line2;
941             p_line2 += p_dest->p->i_pitch;
942
943             p_y1 = p_y2;
944             p_y2 += p_source->p[Y_PLANE].i_pitch;
945
946             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
947             {
948                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
949             }
950             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
951             {
952                 C_YUV420_UYVY( );
953             }
954
955             p_y1 += i_source_margin;
956             p_y2 += i_source_margin;
957             p_u += i_source_margin_c;
958             p_v += i_source_margin_c;
959             p_line1 += i_dest_margin;
960             p_line2 += i_dest_margin;
961         }
962     }
963     /* make sure all SSE2 stores are visible thereafter */
964     SSE2_END;
965 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
966 }
967 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
968
/*****************************************************************************
 * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
 *****************************************************************************
 * Only built for the plain C module. Two source luma lines and one line of
 * each chroma plane are consumed per pass of the outer loop and produce two
 * destination lines; the per-pixel work is done by C_YUV420_Y211, which is
 * invoked twice for every 8 pixels of width.
 *****************************************************************************/
#if defined (MODULE_NAME_IS_i420_yuy2)
static void I420_Y211( filter_t *p_filter, picture_t *p_source,
                                           picture_t *p_dest )
{
    /* padding found at the end of each line, per plane */
    const int i_dest_margin = p_dest->p->i_pitch
                               - p_dest->p->i_visible_pitch;
    const int i_source_margin = p_source->p[0].i_pitch
                                 - p_source->p[0].i_visible_pitch;
    const int i_source_margin_c = p_source->p[1].i_pitch
                                 - p_source->p[1].i_visible_pitch;

    /* distance between the starts of two consecutive lines */
    const int i_dest_pitch = p_dest->p->i_pitch;
    const int i_luma_pitch = p_source->p[Y_PLANE].i_pitch;

    /* read cursors */
    uint8_t *p_y1;
    uint8_t *p_y2 = p_source->Y_PIXELS;
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

    /* write cursors */
    uint8_t *p_line1;
    uint8_t *p_line2 = p_dest->p->p_pixels;

    int i_x, i_y;

    i_y = p_filter->fmt_in.video.i_height / 2;
    while( i_y-- )
    {
        /* the second line of the previous pair ended on the start of the
         * first line of this pair */
        p_line1 = p_line2;
        p_line2 = p_line1 + i_dest_pitch;

        p_y1 = p_y2;
        p_y2 = p_y1 + i_luma_pitch;

        i_x = p_filter->fmt_in.video.i_width / 8;
        while( i_x-- )
        {
            C_YUV420_Y211( );
            C_YUV420_Y211( );
        }

        /* jump over the padding of every plane */
        p_line1 += i_dest_margin;
        p_line2 += i_dest_margin;
        p_y1 += i_source_margin;
        p_y2 += i_source_margin;
        p_u += i_source_margin_c;
        p_v += i_source_margin_c;
    }
}
#endif