]> git.sesse.net Git - vlc/blob - modules/video_chroma/i420_yuy2.c
5457f9572590d442088f498caf3f5fcd15798a25
[vlc] / modules / video_chroma / i420_yuy2.c
1 /*****************************************************************************
2  * i420_yuy2.c : YUV to YUV conversion module for vlc
3  *****************************************************************************
4  * Copyright (C) 2000, 2001 VLC authors and VideoLAN
5  * $Id$
6  *
7  * Authors: Samuel Hocevar <sam@zoy.org>
8  *          Damien Fouilleul <damien@videolan.org>
9  *
10  * This program is free software; you can redistribute it and/or modify it
11  * under the terms of the GNU Lesser General Public License as published by
12  * the Free Software Foundation; either version 2.1 of the License, or
13  * (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18  * GNU Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public License
21  * along with this program; if not, write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23  *****************************************************************************/
24
25 /*****************************************************************************
26  * Preamble
27  *****************************************************************************/
28
29 #ifdef HAVE_CONFIG_H
30 # include "config.h"
31 #endif
32
33 #include <vlc_common.h>
34 #include <vlc_plugin.h>
35 #include <vlc_filter.h>
36 #include <vlc_cpu.h>
37
38 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
39 #   include <altivec.h>
40 #endif
41
42 #include "i420_yuy2.h"
43
/* Source formats, as a human-readable list. In the visible part of this file
 * the string is only pasted into the module description. */
44 #define SRC_FOURCC  "I420,IYUV,YV12"
45
/* Per-variant settings, selected by the MODULE_NAME_IS_* macro of the build:
 *  - DEST_FOURCC: destination formats named in the module description.
 *    Y211 is only listed for the plain build, and the AltiVec list has
 *    neither IUYV nor cyuv, matching the #if guards around those converters.
 *  - VLC_TARGET: token placed in front of the conversion functions; it is
 *    empty for the plain and AltiVec builds. */
46 #if defined (MODULE_NAME_IS_i420_yuy2)
47 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
48 #    define VLC_TARGET
49 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
50 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
51 #    define VLC_TARGET VLC_MMX
52 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
53 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
54 #    define VLC_TARGET VLC_SSE
55 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
56 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
57 #    define VLC_TARGET
58 #endif
59
60 /*****************************************************************************
61  * Local and extern prototypes.
62  *****************************************************************************/
/* Module probe callback, registered with set_callbacks() in the descriptor. */
63 static int  Activate ( vlc_object_t * );
64
/* For every destination format there is a converter taking
 * ( filter, source picture, destination picture ) and a matching *_Filter
 * entry point, which is what Activate() stores in pf_video_filter. */
65 static void I420_YUY2           ( filter_t *, picture_t *, picture_t * );
66 static void I420_YVYU           ( filter_t *, picture_t *, picture_t * );
67 static void I420_UYVY           ( filter_t *, picture_t *, picture_t * );
68 static picture_t *I420_YUY2_Filter    ( filter_t *, picture_t * );
69 static picture_t *I420_YVYU_Filter    ( filter_t *, picture_t * );
70 static picture_t *I420_UYVY_Filter    ( filter_t *, picture_t * );
/* IUYV and cyuv are not built into the AltiVec variant. */
71 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
72 static void I420_IUYV           ( filter_t *, picture_t *, picture_t * );
73 static void I420_cyuv           ( filter_t *, picture_t *, picture_t * );
74 static picture_t *I420_IUYV_Filter    ( filter_t *, picture_t * );
75 static picture_t *I420_cyuv_Filter    ( filter_t *, picture_t * );
76 #endif
/* Y211 is only built into the plain variant. */
77 #if defined (MODULE_NAME_IS_i420_yuy2)
78 static void I420_Y211           ( filter_t *, picture_t *, picture_t * );
79 static picture_t *I420_Y211_Filter    ( filter_t *, picture_t * );
80 #endif
81
/* Only compiled into the MMX variant. Neither constant is referenced in the
 * visible part of this file; presumably they are operands of the MMX
 * inline-assembly macros from i420_yuy2.h -- TODO confirm.
 * i_00ffw holds 0x00ff in each of its four 16-bit words; i_80w holds 0x80 in
 * each of its four low bytes and zero in its upper 32 bits. */
82 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
83 /* Initialize MMX-specific constants */
84 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
85 static const uint64_t i_80w   = 0x0000000080808080ULL;
86 #endif
87
88 /*****************************************************************************
89  * Module descriptor.
90  *****************************************************************************/
91 vlc_module_begin ()
92 #if defined (MODULE_NAME_IS_i420_yuy2)
93     set_description( N_("Conversions from " SRC_FOURCC " to " DEST_FOURCC) )
94     set_capability( "video filter2", 80 )
95 # define vlc_CPU_capable() (true)
96 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
97     set_description( N_("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) )
98     set_capability( "video filter2", 160 )
99 # define vlc_CPU_capable() vlc_CPU_MMX()
100 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
101     set_description( N_("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) )
102     set_capability( "video filter2", 250 )
103 # define vlc_CPU_capable() vlc_CPU_SSE2()
104 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
105     set_description(
106             _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
107     set_capability( "video filter2", 250 )
108 # define vlc_CPU_capable() vlc_CPU_ALTIVEC()
109 #endif
110     set_callbacks( Activate, NULL )
111 vlc_module_end ()
112
113 /*****************************************************************************
114  * Activate: allocate a chroma function
115  *****************************************************************************
116  * This function allocates and initializes a chroma function
117  *****************************************************************************/
118 static int Activate( vlc_object_t *p_this )
119 {
120     filter_t *p_filter = (filter_t *)p_this;
121
122     if( !vlc_CPU_capable() )
123         return VLC_EGENERIC;
124     if( p_filter->fmt_in.video.i_width & 1
125      || p_filter->fmt_in.video.i_height & 1 )
126     {
127         return -1;
128     }
129
130     if( p_filter->fmt_in.video.i_width != p_filter->fmt_out.video.i_width
131        || p_filter->fmt_in.video.i_height != p_filter->fmt_out.video.i_height
132        || p_filter->fmt_in.video.orientation != p_filter->fmt_out.video.orientation )
133         return -1;
134
135     switch( p_filter->fmt_in.video.i_chroma )
136     {
137         case VLC_CODEC_YV12:
138         case VLC_CODEC_I420:
139             switch( p_filter->fmt_out.video.i_chroma )
140             {
141                 case VLC_CODEC_YUYV:
142                     p_filter->pf_video_filter = I420_YUY2_Filter;
143                     break;
144
145                 case VLC_CODEC_YVYU:
146                     p_filter->pf_video_filter = I420_YVYU_Filter;
147                     break;
148
149                 case VLC_CODEC_UYVY:
150                     p_filter->pf_video_filter = I420_UYVY_Filter;
151                     break;
152 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
153                 case VLC_FOURCC('I','U','Y','V'):
154                     p_filter->pf_video_filter = I420_IUYV_Filter;
155                     break;
156
157                 case VLC_CODEC_CYUV:
158                     p_filter->pf_video_filter = I420_cyuv_Filter;
159                     break;
160 #endif
161
162 #if defined (MODULE_NAME_IS_i420_yuy2)
163                 case VLC_CODEC_Y211:
164                     p_filter->pf_video_filter = I420_Y211_Filter;
165                     break;
166 #endif
167
168                 default:
169                     return -1;
170             }
171             break;
172
173         default:
174             return -1;
175     }
176
177     return 0;
178 }
179
/* Disabled helper (compiled out by "#if 0"); nothing in the visible part of
 * this file calls it. It executes the x86 "rdtsc" instruction and returns the
 * value delivered through the "=A" output constraint. */
180 #if 0
181 static inline unsigned long long read_cycles(void)
182 {
183     unsigned long long v;
184     __asm__ __volatile__("rdtsc" : "=A" (v): );
185
186     return v;
187 }
188 #endif
189
190 /* Following functions are local */
191
/* One wrapper per converter, guarded like the converters themselves.
 * VIDEO_FILTER_WRAPPER is not defined in this file; presumably each use
 * expands to the <name>_Filter function declared in the prototypes at the
 * top of the file -- TODO confirm against its definition. */
192 VIDEO_FILTER_WRAPPER( I420_YUY2 )
193 VIDEO_FILTER_WRAPPER( I420_YVYU )
194 VIDEO_FILTER_WRAPPER( I420_UYVY )
195 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
196 VIDEO_FILTER_WRAPPER( I420_IUYV )
197 VIDEO_FILTER_WRAPPER( I420_cyuv )
198 #endif
199 #if defined (MODULE_NAME_IS_i420_yuy2)
200 VIDEO_FILTER_WRAPPER( I420_Y211 )
201 #endif
202
203 /*****************************************************************************
204  * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
205  *****************************************************************************/
/*
 * Reads the Y, U and V planes of p_source and writes the first plane of
 * p_dest, two output lines per pass of the outer loop.
 *
 * The body is chosen at build time:
 *  - AltiVec build: vector path when the width is a multiple of 32 and the
 *    height is even, otherwise the C loop;
 *  - SSE2 build: one SSE2_CALL per 16 pixels of width, in an aligned or an
 *    unaligned flavour;
 *  - MMX build: one MMX_CALL per 8 pixels of width;
 *  - plain build: C_YUV420_YUYV only.
 * Pixels left over at the end of a line always go through C_YUV420_YUYV.
 *
 * The per-pixel work is done by macros from i420_yuy2.h which use the local
 * pointers by name; they are expected to advance the pointers they consume
 * -- confirm against that header.
 */
206 VLC_TARGET
207 static void I420_YUY2( filter_t *p_filter, picture_t *p_source,
208                                            picture_t *p_dest )
209 {
    /* p_line2 / p_y2 start on the first line. Every pass copies them into
     * p_line1 / p_y1 and moves them one pitch further, so the two pairs
     * cover two consecutive lines. */
210     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
211     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
212     uint8_t *p_u = p_source->U_PIXELS;
213     uint8_t *p_v = p_source->V_PIXELS;
214
215     int i_x, i_y;
216
217 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* Step both line pairs to the next two lines. */
218 #define VEC_NEXT_LINES( ) \
219     p_line1  = p_line2; \
220     p_line2 += p_dest->p->i_pitch; \
221     p_y1     = p_y2; \
222     p_y2    += p_source->p[Y_PLANE].i_pitch;
223
/* Load 16 bytes from each chroma plane. */
224 #define VEC_LOAD_UV( ) \
225     u_vec = vec_ld( 0, p_u ); p_u += 16; \
226     v_vec = vec_ld( 0, p_v ); p_v += 16;
227
/* Interleave U and V with the merge operation `a`, then interleave 16 luma
 * bytes of each of the two lines with that result and store 32 bytes per
 * line. */
228 #define VEC_MERGE( a ) \
229     uv_vec = a( u_vec, v_vec ); \
230     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
231     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
232     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
233     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
234     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
235     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
236
237     vector unsigned char u_vec;
238     vector unsigned char v_vec;
239     vector unsigned char uv_vec;
240     vector unsigned char y_vec;
241
    /* NOTE(review): this path steps by i_pitch only and never adds the line
     * margins used by the C loop below, so it looks like it relies on
     * pitch == visible pitch for every plane -- confirm. */
242     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
243            ( p_filter->fmt_in.video.i_height % 2 ) ) )
244     {
245         /* Width is a multiple of 32, we take 2 lines at a time */
246         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
247         {
248             VEC_NEXT_LINES( );
249             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
250             {
251                 VEC_LOAD_UV( );
252                 VEC_MERGE( vec_mergeh );
253                 VEC_MERGE( vec_mergel );
254             }
255         }
256     }
257 #warning FIXME: converting widths % 16 but !widths % 32 is broken on altivec
/* The width % 16 variant is compiled out because of the FIXME above; such
 * pictures fall through to the C loop. */
258 #if 0
259     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
260                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
261     {
262         /* Width is only a multiple of 16, we take 4 lines at a time */
263         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
264         {
265             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
266             VEC_NEXT_LINES( );
267             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
268             {
269                 VEC_LOAD_UV( );
270                 VEC_MERGE( vec_mergeh );
271                 VEC_MERGE( vec_mergel );
272             }
273
274             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
275             VEC_LOAD_UV( );
276             VEC_MERGE( vec_mergeh );
277
278             /* Line 3 and 4, pixels 0 to 16 */
279             VEC_NEXT_LINES( );
280             VEC_MERGE( vec_mergel );
281
282             /* Line 3 and 4, pixels 16 to ( width ) */
283             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
284             {
285                 VEC_LOAD_UV( );
286                 VEC_MERGE( vec_mergeh );
287                 VEC_MERGE( vec_mergel );
288             }
289         }
290     }
291 #endif
292     else
293     {
294         /* Crap, use the C version */
295 #undef VEC_NEXT_LINES
296 #undef VEC_LOAD_UV
297 #undef VEC_MERGE
298 #endif
299
    /* Bytes between the end of the visible data and the start of the next
     * line, for the luma plane, the chroma planes and the destination. */
300     const int i_source_margin = p_source->p[0].i_pitch
301                                  - p_source->p[0].i_visible_pitch;
302     const int i_source_margin_c = p_source->p[1].i_pitch
303                                  - p_source->p[1].i_visible_pitch;
304     const int i_dest_margin = p_dest->p->i_pitch
305                                - p_dest->p->i_visible_pitch;
306
307 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
308     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
309     {
310         p_line1 = p_line2;
311         p_line2 += p_dest->p->i_pitch;
312
313         p_y1 = p_y2;
314         p_y2 += p_source->p[Y_PLANE].i_pitch;
315
        /* Bulk of the line: 8 pixels per iteration, either as four C macro
         * invocations or as one MMX call. */
316 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
317         for( i_x = p_filter->fmt_in.video.i_width / 8; i_x-- ; )
318         {
319             C_YUV420_YUYV( );
320             C_YUV420_YUYV( );
321             C_YUV420_YUYV( );
322             C_YUV420_YUYV( );
323         }
324 #else
325         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
326         {
327             MMX_CALL( MMX_YUV420_YUYV );
328         }
329 #endif
        /* Remaining ( width % 8 ) pixels, one macro invocation per pair. */
330         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
331         {
332             C_YUV420_YUYV( );
333         }
334
        /* Skip the line padding. p_y1 and p_line1 are reloaded from p_y2 and
         * p_line2 at the top of the loop, so only the other four updates
         * carry over to the next pass. */
335         p_y1 += i_source_margin;
336         p_y2 += i_source_margin;
337         p_u += i_source_margin_c;
338         p_v += i_source_margin_c;
339         p_line1 += i_dest_margin;
340         p_line2 += i_dest_margin;
341     }
342
343 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
344     /* re-enable FPU registers */
345     MMX_END;
346 #endif
347
/* Closes the AltiVec "else" block opened in front of the C version. */
348 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
349     }
350 #endif
351
352 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
353     /*
354     ** SSE2 128 bits fetch/store instructions are faster
355     ** if memory access is 16 bytes aligned
356     */
357
    /* The aligned flavour is used only when the luma pitch, the destination
     * pitch and both start addresses are all multiples of 16. */
358     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
359         ((intptr_t)p_line2|(intptr_t)p_y2))) )
360     {
361         /* use faster SSE2 aligned fetch and store */
362         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
363         {
364             p_line1 = p_line2;
365             p_line2 += p_dest->p->i_pitch;
366
367             p_y1 = p_y2;
368             p_y2 += p_source->p[Y_PLANE].i_pitch;
369
370             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
371             {
372                 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
373             }
374             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
375             {
376                 C_YUV420_YUYV( );
377             }
378
379             p_y1 += i_source_margin;
380             p_y2 += i_source_margin;
381             p_u += i_source_margin_c;
382             p_v += i_source_margin_c;
383             p_line1 += i_dest_margin;
384             p_line2 += i_dest_margin;
385         }
386     }
387     else
388     {
389         /* use slower SSE2 unaligned fetch and store */
390         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
391         {
392             p_line1 = p_line2;
393             p_line2 += p_dest->p->i_pitch;
394
395             p_y1 = p_y2;
396             p_y2 += p_source->p[Y_PLANE].i_pitch;
397
398             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
399             {
400                 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
401             }
402             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
403             {
404                 C_YUV420_YUYV( );
405             }
406
407             p_y1 += i_source_margin;
408             p_y2 += i_source_margin;
409             p_u += i_source_margin_c;
410             p_v += i_source_margin_c;
411             p_line1 += i_dest_margin;
412             p_line2 += i_dest_margin;
413         }
414     }
415     /* make sure all SSE2 stores are visible thereafter */
416     SSE2_END;
417
418 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
419 }
420
421 /*****************************************************************************
422  * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
423  *****************************************************************************/
424 VLC_TARGET
425 static void I420_YVYU( filter_t *p_filter, picture_t *p_source,
426                                            picture_t *p_dest )
427 {
428     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
429     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
430     uint8_t *p_u = p_source->U_PIXELS;
431     uint8_t *p_v = p_source->V_PIXELS;
432
433     int i_x, i_y;
434
435 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
436 #define VEC_NEXT_LINES( ) \
437     p_line1  = p_line2; \
438     p_line2 += p_dest->p->i_pitch; \
439     p_y1     = p_y2; \
440     p_y2    += p_source->p[Y_PLANE].i_pitch;
441
442 #define VEC_LOAD_UV( ) \
443     u_vec = vec_ld( 0, p_u ); p_u += 16; \
444     v_vec = vec_ld( 0, p_v ); p_v += 16;
445
446 #define VEC_MERGE( a ) \
447     vu_vec = a( v_vec, u_vec ); \
448     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
449     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
450     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
451     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
452     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
453     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
454
455     vector unsigned char u_vec;
456     vector unsigned char v_vec;
457     vector unsigned char vu_vec;
458     vector unsigned char y_vec;
459
460     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
461            ( p_filter->fmt_in.video.i_height % 2 ) ) )
462     {
463         /* Width is a multiple of 32, we take 2 lines at a time */
464         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
465         {
466             VEC_NEXT_LINES( );
467             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
468             {
469                 VEC_LOAD_UV( );
470                 VEC_MERGE( vec_mergeh );
471                 VEC_MERGE( vec_mergel );
472             }
473         }
474     }
475     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
476                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
477     {
478         /* Width is only a multiple of 16, we take 4 lines at a time */
479         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
480         {
481             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
482             VEC_NEXT_LINES( );
483             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
484             {
485                 VEC_LOAD_UV( );
486                 VEC_MERGE( vec_mergeh );
487                 VEC_MERGE( vec_mergel );
488             }
489
490             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
491             VEC_LOAD_UV( );
492             VEC_MERGE( vec_mergeh );
493
494             /* Line 3 and 4, pixels 0 to 16 */
495             VEC_NEXT_LINES( );
496             VEC_MERGE( vec_mergel );
497
498             /* Line 3 and 4, pixels 16 to ( width ) */
499             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
500             {
501                 VEC_LOAD_UV( );
502                 VEC_MERGE( vec_mergeh );
503                 VEC_MERGE( vec_mergel );
504             }
505         }
506     }
507     else
508     {
509         /* Crap, use the C version */
510 #undef VEC_NEXT_LINES
511 #undef VEC_LOAD_UV
512 #undef VEC_MERGE
513 #endif
514
515     const int i_source_margin = p_source->p[0].i_pitch
516                                  - p_source->p[0].i_visible_pitch;
517     const int i_source_margin_c = p_source->p[1].i_pitch
518                                  - p_source->p[1].i_visible_pitch;
519     const int i_dest_margin = p_dest->p->i_pitch
520                                - p_dest->p->i_visible_pitch;
521
522 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
523     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
524     {
525         p_line1 = p_line2;
526         p_line2 += p_dest->p->i_pitch;
527
528         p_y1 = p_y2;
529         p_y2 += p_source->p[Y_PLANE].i_pitch;
530
531         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
532         {
533 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
534             C_YUV420_YVYU( );
535             C_YUV420_YVYU( );
536             C_YUV420_YVYU( );
537             C_YUV420_YVYU( );
538 #else
539             MMX_CALL( MMX_YUV420_YVYU );
540 #endif
541         }
542         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
543         {
544             C_YUV420_YVYU( );
545         }
546
547         p_y1 += i_source_margin;
548         p_y2 += i_source_margin;
549         p_u += i_source_margin_c;
550         p_v += i_source_margin_c;
551         p_line1 += i_dest_margin;
552         p_line2 += i_dest_margin;
553     }
554
555 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
556     /* re-enable FPU registers */
557     MMX_END;
558 #endif
559
560 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
561     }
562 #endif
563
564 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
565     /*
566     ** SSE2 128 bits fetch/store instructions are faster
567     ** if memory access is 16 bytes aligned
568     */
569     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
570         ((intptr_t)p_line2|(intptr_t)p_y2))) )
571     {
572         /* use faster SSE2 aligned fetch and store */
573         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
574         {
575             p_line1 = p_line2;
576             p_line2 += p_dest->p->i_pitch;
577
578             p_y1 = p_y2;
579             p_y2 += p_source->p[Y_PLANE].i_pitch;
580
581             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
582             {
583                 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
584             }
585             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
586             {
587                 C_YUV420_YVYU( );
588             }
589
590             p_y1 += i_source_margin;
591             p_y2 += i_source_margin;
592             p_u += i_source_margin_c;
593             p_v += i_source_margin_c;
594             p_line1 += i_dest_margin;
595             p_line2 += i_dest_margin;
596         }
597     }
598     else
599     {
600         /* use slower SSE2 unaligned fetch and store */
601         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
602         {
603             p_line1 = p_line2;
604             p_line2 += p_dest->p->i_pitch;
605
606             p_y1 = p_y2;
607             p_y2 += p_source->p[Y_PLANE].i_pitch;
608
609             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
610             {
611                 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
612             }
613             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
614             {
615                 C_YUV420_YVYU( );
616             }
617
618             p_y1 += i_source_margin;
619             p_y2 += i_source_margin;
620             p_u += i_source_margin_c;
621             p_v += i_source_margin_c;
622             p_line1 += i_dest_margin;
623             p_line2 += i_dest_margin;
624         }
625     }
626     /* make sure all SSE2 stores are visible thereafter */
627     SSE2_END;
628 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
629 }
630
631 /*****************************************************************************
632  * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
633  *****************************************************************************/
634 VLC_TARGET
635 static void I420_UYVY( filter_t *p_filter, picture_t *p_source,
636                                            picture_t *p_dest )
637 {
638     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
639     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
640     uint8_t *p_u = p_source->U_PIXELS;
641     uint8_t *p_v = p_source->V_PIXELS;
642
643     int i_x, i_y;
644
645 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
646 #define VEC_NEXT_LINES( ) \
647     p_line1  = p_line2; \
648     p_line2 += p_dest->p->i_pitch; \
649     p_y1     = p_y2; \
650     p_y2    += p_source->p[Y_PLANE].i_pitch;
651
652 #define VEC_LOAD_UV( ) \
653     u_vec = vec_ld( 0, p_u ); p_u += 16; \
654     v_vec = vec_ld( 0, p_v ); p_v += 16;
655
656 #define VEC_MERGE( a ) \
657     uv_vec = a( u_vec, v_vec ); \
658     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
659     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
660     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
661     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
662     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
663     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
664
665     vector unsigned char u_vec;
666     vector unsigned char v_vec;
667     vector unsigned char uv_vec;
668     vector unsigned char y_vec;
669
670     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
671            ( p_filter->fmt_in.video.i_height % 2 ) ) )
672     {
673         /* Width is a multiple of 32, we take 2 lines at a time */
674         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
675         {
676             VEC_NEXT_LINES( );
677             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
678             {
679                 VEC_LOAD_UV( );
680                 VEC_MERGE( vec_mergeh );
681                 VEC_MERGE( vec_mergel );
682             }
683         }
684     }
685     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
686                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
687     {
688         /* Width is only a multiple of 16, we take 4 lines at a time */
689         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
690         {
691             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
692             VEC_NEXT_LINES( );
693             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
694             {
695                 VEC_LOAD_UV( );
696                 VEC_MERGE( vec_mergeh );
697                 VEC_MERGE( vec_mergel );
698             }
699
700             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
701             VEC_LOAD_UV( );
702             VEC_MERGE( vec_mergeh );
703
704             /* Line 3 and 4, pixels 0 to 16 */
705             VEC_NEXT_LINES( );
706             VEC_MERGE( vec_mergel );
707
708             /* Line 3 and 4, pixels 16 to ( width ) */
709             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
710             {
711                 VEC_LOAD_UV( );
712                 VEC_MERGE( vec_mergeh );
713                 VEC_MERGE( vec_mergel );
714             }
715         }
716     }
717     else
718     {
719         /* Crap, use the C version */
720 #undef VEC_NEXT_LINES
721 #undef VEC_LOAD_UV
722 #undef VEC_MERGE
723 #endif
724
725     const int i_source_margin = p_source->p[0].i_pitch
726                                  - p_source->p[0].i_visible_pitch;
727     const int i_source_margin_c = p_source->p[1].i_pitch
728                                  - p_source->p[1].i_visible_pitch;
729     const int i_dest_margin = p_dest->p->i_pitch
730                                - p_dest->p->i_visible_pitch;
731
732 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
733     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
734     {
735         p_line1 = p_line2;
736         p_line2 += p_dest->p->i_pitch;
737
738         p_y1 = p_y2;
739         p_y2 += p_source->p[Y_PLANE].i_pitch;
740
741         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
742         {
743 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
744             C_YUV420_UYVY( );
745             C_YUV420_UYVY( );
746             C_YUV420_UYVY( );
747             C_YUV420_UYVY( );
748 #else
749             MMX_CALL( MMX_YUV420_UYVY );
750 #endif
751         }
752         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x--; )
753         {
754             C_YUV420_UYVY( );
755         }
756
757         p_y1 += i_source_margin;
758         p_y2 += i_source_margin;
759         p_u += i_source_margin_c;
760         p_v += i_source_margin_c;
761         p_line1 += i_dest_margin;
762         p_line2 += i_dest_margin;
763     }
764
765 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
766     /* re-enable FPU registers */
767     MMX_END;
768 #endif
769
770 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
771     }
772 #endif
773
774 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
775     /*
776     ** SSE2 128 bits fetch/store instructions are faster
777     ** if memory access is 16 bytes aligned
778     */
779     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
780         ((intptr_t)p_line2|(intptr_t)p_y2))) )
781     {
782         /* use faster SSE2 aligned fetch and store */
783         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
784         {
785             p_line1 = p_line2;
786             p_line2 += p_dest->p->i_pitch;
787
788             p_y1 = p_y2;
789             p_y2 += p_source->p[Y_PLANE].i_pitch;
790
791             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
792             {
793                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
794             }
795             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
796             {
797                 C_YUV420_UYVY( );
798             }
799
800             p_y1 += i_source_margin;
801             p_y2 += i_source_margin;
802             p_u += i_source_margin_c;
803             p_v += i_source_margin_c;
804             p_line1 += i_dest_margin;
805             p_line2 += i_dest_margin;
806         }
807     }
808     else
809     {
810         /* use slower SSE2 unaligned fetch and store */
811         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
812         {
813             p_line1 = p_line2;
814             p_line2 += p_dest->p->i_pitch;
815
816             p_y1 = p_y2;
817             p_y2 += p_source->p[Y_PLANE].i_pitch;
818
819             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
820             {
821                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
822             }
823             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
824             {
825                 C_YUV420_UYVY( );
826             }
827
828             p_y1 += i_source_margin;
829             p_y2 += i_source_margin;
830             p_u += i_source_margin_c;
831             p_v += i_source_margin_c;
832             p_line1 += i_dest_margin;
833             p_line2 += i_dest_margin;
834         }
835     }
836     /* make sure all SSE2 stores are visible thereafter */
837     SSE2_END;
838 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
839 }
840
841 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
842 /*****************************************************************************
843  * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
844  *****************************************************************************/
845 static void I420_IUYV( filter_t *p_filter, picture_t *p_source,
846                                            picture_t *p_dest )
847 {
848     VLC_UNUSED(p_source); VLC_UNUSED(p_dest);
849     /* FIXME: TODO ! */
850     msg_Err( p_filter, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
851 }
852
853 /*****************************************************************************
854  * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
855  *****************************************************************************/
856 VLC_TARGET
857 static void I420_cyuv( filter_t *p_filter, picture_t *p_source,
858                                            picture_t *p_dest )
859 {
860     uint8_t *p_line1 = p_dest->p->p_pixels +
861                        p_dest->p->i_visible_lines * p_dest->p->i_pitch
862                        + p_dest->p->i_pitch;
863     uint8_t *p_line2 = p_dest->p->p_pixels +
864                        p_dest->p->i_visible_lines * p_dest->p->i_pitch;
865     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
866     uint8_t *p_u = p_source->U_PIXELS;
867     uint8_t *p_v = p_source->V_PIXELS;
868
869     int i_x, i_y;
870
871     const int i_source_margin = p_source->p[0].i_pitch
872                                  - p_source->p[0].i_visible_pitch;
873     const int i_source_margin_c = p_source->p[1].i_pitch
874                                  - p_source->p[1].i_visible_pitch;
875     const int i_dest_margin = p_dest->p->i_pitch
876                                - p_dest->p->i_visible_pitch;
877
878 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
879     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
880     {
881         p_line1 -= 3 * p_dest->p->i_pitch;
882         p_line2 -= 3 * p_dest->p->i_pitch;
883
884         p_y1 = p_y2;
885         p_y2 += p_source->p[Y_PLANE].i_pitch;
886
887         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
888         {
889 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
890             C_YUV420_UYVY( );
891             C_YUV420_UYVY( );
892             C_YUV420_UYVY( );
893             C_YUV420_UYVY( );
894 #else
895             MMX_CALL( MMX_YUV420_UYVY );
896 #endif
897         }
898         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
899         {
900             C_YUV420_UYVY( );
901         }
902
903         p_y1 += i_source_margin;
904         p_y2 += i_source_margin;
905         p_u += i_source_margin_c;
906         p_v += i_source_margin_c;
907         p_line1 += i_dest_margin;
908         p_line2 += i_dest_margin;
909     }
910
911 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
912     /* re-enable FPU registers */
913     MMX_END;
914 #endif
915
916 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
917     /*
918     ** SSE2 128 bits fetch/store instructions are faster
919     ** if memory access is 16 bytes aligned
920     */
921     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
922         ((intptr_t)p_line2|(intptr_t)p_y2))) )
923     {
924         /* use faster SSE2 aligned fetch and store */
925         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
926         {
927             p_line1 = p_line2;
928             p_line2 += p_dest->p->i_pitch;
929
930             p_y1 = p_y2;
931             p_y2 += p_source->p[Y_PLANE].i_pitch;
932
933             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
934             {
935                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
936             }
937             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
938             {
939                 C_YUV420_UYVY( );
940             }
941
942             p_y1 += i_source_margin;
943             p_y2 += i_source_margin;
944             p_u += i_source_margin_c;
945             p_v += i_source_margin_c;
946             p_line1 += i_dest_margin;
947             p_line2 += i_dest_margin;
948         }
949     }
950     else
951     {
952         /* use slower SSE2 unaligned fetch and store */
953         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
954         {
955             p_line1 = p_line2;
956             p_line2 += p_dest->p->i_pitch;
957
958             p_y1 = p_y2;
959             p_y2 += p_source->p[Y_PLANE].i_pitch;
960
961             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
962             {
963                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
964             }
965             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
966             {
967                 C_YUV420_UYVY( );
968             }
969
970             p_y1 += i_source_margin;
971             p_y2 += i_source_margin;
972             p_u += i_source_margin_c;
973             p_v += i_source_margin_c;
974             p_line1 += i_dest_margin;
975             p_line2 += i_dest_margin;
976         }
977     }
978     /* make sure all SSE2 stores are visible thereafter */
979     SSE2_END;
980 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
981 }
982 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
983
/*****************************************************************************
 * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
 *****************************************************************************
 * Compiled for the plain C module only. The picture is walked top-down, two
 * source lines (sharing one chroma line) per pass of the outer loop.
 *
 * p_filter: supplies the frame size (fmt_in.video.i_width / i_height)
 * p_source: planar picture, read through Y_PIXELS, U_PIXELS and V_PIXELS
 * p_dest:   packed picture; only its first plane (p_dest->p) is written
 *
 * NOTE(review): unlike the UYVY converters in this file there is no tail
 * loop, so only i_width / 8 groups per line are converted - confirm that
 * widths which are not a multiple of 8 cannot reach this function.
 *****************************************************************************/
#if defined (MODULE_NAME_IS_i420_yuy2)
static void I420_Y211( filter_t *p_filter, picture_t *p_source,
                       picture_t *p_dest )
{
    /* Padding bytes between the end of a visible line and the next line */
    const int i_luma_skip   = p_source->p[0].i_pitch
                            - p_source->p[0].i_visible_pitch;
    const int i_chroma_skip = p_source->p[1].i_pitch
                            - p_source->p[1].i_visible_pitch;
    const int i_packed_skip = p_dest->p->i_pitch
                            - p_dest->p->i_visible_pitch;

    /* Loop bounds, taken once from the input format */
    const int i_line_pairs = p_filter->fmt_in.video.i_height / 2;
    const int i_groups     = p_filter->fmt_in.video.i_width / 8;

    /* C_YUV420_Y211 takes no argument and works on these pointers by name,
     * so their names must stay as they are. */
    uint8_t *p_line1;
    uint8_t *p_line2 = p_dest->p->p_pixels;
    uint8_t *p_y1;
    uint8_t *p_y2 = p_source->Y_PIXELS;
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

    for( int i_y = 0; i_y < i_line_pairs; i_y++ )
    {
        /* First output line continues where the second one stopped */
        p_line1 = p_line2;
        p_line2 += p_dest->p->i_pitch;

        /* Same scheme for the two luma input lines */
        p_y1 = p_y2;
        p_y2 += p_source->p[Y_PLANE].i_pitch;

        /* Two macro invocations per group of 8 source pixels */
        for( int i_x = 0; i_x < i_groups; i_x++ )
        {
            C_YUV420_Y211( );
            C_YUV420_Y211( );
        }

        /* Step over the padding of every plane */
        p_line1 += i_packed_skip;
        p_line2 += i_packed_skip;
        p_y1 += i_luma_skip;
        p_y2 += i_luma_skip;
        p_u += i_chroma_skip;
        p_v += i_chroma_skip;
    }
}
#endif