]> git.sesse.net Git - vlc/blob - modules/video_chroma/i420_yuy2.c
mediacodec: skip prerolled frames
[vlc] / modules / video_chroma / i420_yuy2.c
1 /*****************************************************************************
2  * i420_yuy2.c : YUV to YUV conversion module for vlc
3  *****************************************************************************
4  * Copyright (C) 2000, 2001 VLC authors and VideoLAN
5  * $Id$
6  *
7  * Authors: Samuel Hocevar <sam@zoy.org>
8  *          Damien Fouilleul <damien@videolan.org>
9  *
10  * This program is free software; you can redistribute it and/or modify it
11  * under the terms of the GNU Lesser General Public License as published by
12  * the Free Software Foundation; either version 2.1 of the License, or
13  * (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18  * GNU Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public License
21  * along with this program; if not, write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23  *****************************************************************************/
24
25 /*****************************************************************************
26  * Preamble
27  *****************************************************************************/
28
29 #ifdef HAVE_CONFIG_H
30 # include "config.h"
31 #endif
32
33 #include <vlc_common.h>
34 #include <vlc_plugin.h>
35 #include <vlc_filter.h>
36 #include <vlc_cpu.h>
37
38 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
39 #   include <altivec.h>
40 #endif
41
42 #include "i420_yuy2.h"
43
/* Source chroma list, used to build the module description strings. */
#define SRC_FOURCC  "I420,IYUV,YV12"

/*
 * Per-variant settings, selected by the MODULE_NAME_IS_* macro of the build:
 *  - DEST_FOURCC: destination chroma list used in the module description
 *    (Y211 only in the plain C build, IUYV absent from the AltiVec build);
 *  - VLC_TARGET: prefix placed in front of the conversion functions; empty
 *    for the plain C and AltiVec builds.
 * If none of the MODULE_NAME_IS_* macros is defined, neither DEST_FOURCC nor
 * VLC_TARGET gets defined.
 */
#if defined (MODULE_NAME_IS_i420_yuy2)
#    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,Y211"
#    define VLC_TARGET
#elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
#    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV"
#    define VLC_TARGET VLC_MMX
#elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
#    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV"
#    define VLC_TARGET VLC_SSE
#elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
#    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
#    define VLC_TARGET
#endif
59
60 /*****************************************************************************
61  * Local and extern prototypes.
62  *****************************************************************************/
63 static int  Activate ( vlc_object_t * );
64
65 static void I420_YUY2           ( filter_t *, picture_t *, picture_t * );
66 static void I420_YVYU           ( filter_t *, picture_t *, picture_t * );
67 static void I420_UYVY           ( filter_t *, picture_t *, picture_t * );
68 static picture_t *I420_YUY2_Filter    ( filter_t *, picture_t * );
69 static picture_t *I420_YVYU_Filter    ( filter_t *, picture_t * );
70 static picture_t *I420_UYVY_Filter    ( filter_t *, picture_t * );
71 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
72 static void I420_IUYV           ( filter_t *, picture_t *, picture_t * );
73 static picture_t *I420_IUYV_Filter    ( filter_t *, picture_t * );
74 #endif
75 #if defined (MODULE_NAME_IS_i420_yuy2)
76 static void I420_Y211           ( filter_t *, picture_t *, picture_t * );
77 static picture_t *I420_Y211_Filter    ( filter_t *, picture_t * );
78 #endif
79
80 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
81 /* Initialize MMX-specific constants */
82 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
83 static const uint64_t i_80w   = 0x0000000080808080ULL;
84 #endif
85
86 /*****************************************************************************
87  * Module descriptor.
88  *****************************************************************************/
89 vlc_module_begin ()
90 #if defined (MODULE_NAME_IS_i420_yuy2)
91     set_description( N_("Conversions from " SRC_FOURCC " to " DEST_FOURCC) )
92     set_capability( "video filter2", 80 )
93 # define vlc_CPU_capable() (true)
94 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
95     set_description( N_("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) )
96     set_capability( "video filter2", 160 )
97 # define vlc_CPU_capable() vlc_CPU_MMX()
98 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
99     set_description( N_("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) )
100     set_capability( "video filter2", 250 )
101 # define vlc_CPU_capable() vlc_CPU_SSE2()
102 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
103     set_description(
104             _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
105     set_capability( "video filter2", 250 )
106 # define vlc_CPU_capable() vlc_CPU_ALTIVEC()
107 #endif
108     set_callbacks( Activate, NULL )
109 vlc_module_end ()
110
111 /*****************************************************************************
112  * Activate: allocate a chroma function
113  *****************************************************************************
114  * This function allocates and initializes a chroma function
115  *****************************************************************************/
116 static int Activate( vlc_object_t *p_this )
117 {
118     filter_t *p_filter = (filter_t *)p_this;
119
120     if( !vlc_CPU_capable() )
121         return VLC_EGENERIC;
122     if( p_filter->fmt_in.video.i_width & 1
123      || p_filter->fmt_in.video.i_height & 1 )
124     {
125         return -1;
126     }
127
128     if( p_filter->fmt_in.video.i_width != p_filter->fmt_out.video.i_width
129        || p_filter->fmt_in.video.i_height != p_filter->fmt_out.video.i_height
130        || p_filter->fmt_in.video.orientation != p_filter->fmt_out.video.orientation )
131         return -1;
132
133     switch( p_filter->fmt_in.video.i_chroma )
134     {
135 //        case VLC_CODEC_YV12: FIXME invert U and V in the filters :)
136         case VLC_CODEC_I420:
137             switch( p_filter->fmt_out.video.i_chroma )
138             {
139                 case VLC_CODEC_YUYV:
140                     p_filter->pf_video_filter = I420_YUY2_Filter;
141                     break;
142
143                 case VLC_CODEC_YVYU:
144                     p_filter->pf_video_filter = I420_YVYU_Filter;
145                     break;
146
147                 case VLC_CODEC_UYVY:
148                     p_filter->pf_video_filter = I420_UYVY_Filter;
149                     break;
150 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
151                 case VLC_FOURCC('I','U','Y','V'):
152                     p_filter->pf_video_filter = I420_IUYV_Filter;
153                     break;
154 #endif
155
156 #if defined (MODULE_NAME_IS_i420_yuy2)
157                 case VLC_CODEC_Y211:
158                     p_filter->pf_video_filter = I420_Y211_Filter;
159                     break;
160 #endif
161
162                 default:
163                     return -1;
164             }
165             break;
166
167         default:
168             return -1;
169     }
170
171     return 0;
172 }
173
#if 0
/*
 * Debug helper, compiled out: returns the x86 time-stamp counter.
 *
 * RDTSC leaves the low half in eax and the high half in edx. The two halves
 * are read through separate "=a"/"=d" outputs because the "A" constraint only
 * denotes the edx:eax pair on 32-bit x86; on x86-64 it would pick a single
 * 64-bit register and lose half of the counter.
 */
static inline unsigned long long read_cycles(void)
{
    unsigned int lo, hi;
    __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi));

    return ((unsigned long long)hi << 32) | lo;
}
#endif
183
184 /* Following functions are local */
185
186 VIDEO_FILTER_WRAPPER( I420_YUY2 )
187 VIDEO_FILTER_WRAPPER( I420_YVYU )
188 VIDEO_FILTER_WRAPPER( I420_UYVY )
189 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
190 VIDEO_FILTER_WRAPPER( I420_IUYV )
191 #endif
192 #if defined (MODULE_NAME_IS_i420_yuy2)
193 VIDEO_FILTER_WRAPPER( I420_Y211 )
194 #endif
195
196 /*****************************************************************************
197  * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
198  *****************************************************************************/
199 VLC_TARGET
200 static void I420_YUY2( filter_t *p_filter, picture_t *p_source,
201                                            picture_t *p_dest )
202 {
203     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
204     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
205     uint8_t *p_u = p_source->U_PIXELS;
206     uint8_t *p_v = p_source->V_PIXELS;
207
208     int i_x, i_y;
209
210 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
211 #define VEC_NEXT_LINES( ) \
212     p_line1  = p_line2; \
213     p_line2 += p_dest->p->i_pitch; \
214     p_y1     = p_y2; \
215     p_y2    += p_source->p[Y_PLANE].i_pitch;
216
217 #define VEC_LOAD_UV( ) \
218     u_vec = vec_ld( 0, p_u ); p_u += 16; \
219     v_vec = vec_ld( 0, p_v ); p_v += 16;
220
221 #define VEC_MERGE( a ) \
222     uv_vec = a( u_vec, v_vec ); \
223     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
224     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
225     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
226     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
227     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
228     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
229
230     vector unsigned char u_vec;
231     vector unsigned char v_vec;
232     vector unsigned char uv_vec;
233     vector unsigned char y_vec;
234
235     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
236            ( p_filter->fmt_in.video.i_height % 2 ) ) )
237     {
238         /* Width is a multiple of 32, we take 2 lines at a time */
239         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
240         {
241             VEC_NEXT_LINES( );
242             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
243             {
244                 VEC_LOAD_UV( );
245                 VEC_MERGE( vec_mergeh );
246                 VEC_MERGE( vec_mergel );
247             }
248         }
249     }
250 #warning FIXME: converting widths % 16 but !widths % 32 is broken on altivec
251 #if 0
252     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
253                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
254     {
255         /* Width is only a multiple of 16, we take 4 lines at a time */
256         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
257         {
258             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
259             VEC_NEXT_LINES( );
260             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
261             {
262                 VEC_LOAD_UV( );
263                 VEC_MERGE( vec_mergeh );
264                 VEC_MERGE( vec_mergel );
265             }
266
267             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
268             VEC_LOAD_UV( );
269             VEC_MERGE( vec_mergeh );
270
271             /* Line 3 and 4, pixels 0 to 16 */
272             VEC_NEXT_LINES( );
273             VEC_MERGE( vec_mergel );
274
275             /* Line 3 and 4, pixels 16 to ( width ) */
276             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
277             {
278                 VEC_LOAD_UV( );
279                 VEC_MERGE( vec_mergeh );
280                 VEC_MERGE( vec_mergel );
281             }
282         }
283     }
284 #endif
285     else
286     {
287         /* Crap, use the C version */
288 #undef VEC_NEXT_LINES
289 #undef VEC_LOAD_UV
290 #undef VEC_MERGE
291 #endif
292
293     const int i_source_margin = p_source->p[0].i_pitch
294                                  - p_source->p[0].i_visible_pitch;
295     const int i_source_margin_c = p_source->p[1].i_pitch
296                                  - p_source->p[1].i_visible_pitch;
297     const int i_dest_margin = p_dest->p->i_pitch
298                                - p_dest->p->i_visible_pitch;
299
300 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
301     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
302     {
303         p_line1 = p_line2;
304         p_line2 += p_dest->p->i_pitch;
305
306         p_y1 = p_y2;
307         p_y2 += p_source->p[Y_PLANE].i_pitch;
308
309 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
310         for( i_x = p_filter->fmt_in.video.i_width / 8; i_x-- ; )
311         {
312             C_YUV420_YUYV( );
313             C_YUV420_YUYV( );
314             C_YUV420_YUYV( );
315             C_YUV420_YUYV( );
316         }
317 #else
318         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
319         {
320             MMX_CALL( MMX_YUV420_YUYV );
321         }
322 #endif
323         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
324         {
325             C_YUV420_YUYV( );
326         }
327
328         p_y1 += i_source_margin;
329         p_y2 += i_source_margin;
330         p_u += i_source_margin_c;
331         p_v += i_source_margin_c;
332         p_line1 += i_dest_margin;
333         p_line2 += i_dest_margin;
334     }
335
336 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
337     /* re-enable FPU registers */
338     MMX_END;
339 #endif
340
341 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
342     }
343 #endif
344
345 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
346     /*
347     ** SSE2 128 bits fetch/store instructions are faster
348     ** if memory access is 16 bytes aligned
349     */
350
351     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
352         ((intptr_t)p_line2|(intptr_t)p_y2))) )
353     {
354         /* use faster SSE2 aligned fetch and store */
355         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
356         {
357             p_line1 = p_line2;
358             p_line2 += p_dest->p->i_pitch;
359
360             p_y1 = p_y2;
361             p_y2 += p_source->p[Y_PLANE].i_pitch;
362
363             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
364             {
365                 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
366             }
367             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
368             {
369                 C_YUV420_YUYV( );
370             }
371
372             p_y1 += i_source_margin;
373             p_y2 += i_source_margin;
374             p_u += i_source_margin_c;
375             p_v += i_source_margin_c;
376             p_line1 += i_dest_margin;
377             p_line2 += i_dest_margin;
378         }
379     }
380     else
381     {
382         /* use slower SSE2 unaligned fetch and store */
383         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
384         {
385             p_line1 = p_line2;
386             p_line2 += p_dest->p->i_pitch;
387
388             p_y1 = p_y2;
389             p_y2 += p_source->p[Y_PLANE].i_pitch;
390
391             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
392             {
393                 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
394             }
395             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
396             {
397                 C_YUV420_YUYV( );
398             }
399
400             p_y1 += i_source_margin;
401             p_y2 += i_source_margin;
402             p_u += i_source_margin_c;
403             p_v += i_source_margin_c;
404             p_line1 += i_dest_margin;
405             p_line2 += i_dest_margin;
406         }
407     }
408     /* make sure all SSE2 stores are visible thereafter */
409     SSE2_END;
410
411 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
412 }
413
414 /*****************************************************************************
415  * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
416  *****************************************************************************/
417 VLC_TARGET
418 static void I420_YVYU( filter_t *p_filter, picture_t *p_source,
419                                            picture_t *p_dest )
420 {
421     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
422     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
423     uint8_t *p_u = p_source->U_PIXELS;
424     uint8_t *p_v = p_source->V_PIXELS;
425
426     int i_x, i_y;
427
428 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
429 #define VEC_NEXT_LINES( ) \
430     p_line1  = p_line2; \
431     p_line2 += p_dest->p->i_pitch; \
432     p_y1     = p_y2; \
433     p_y2    += p_source->p[Y_PLANE].i_pitch;
434
435 #define VEC_LOAD_UV( ) \
436     u_vec = vec_ld( 0, p_u ); p_u += 16; \
437     v_vec = vec_ld( 0, p_v ); p_v += 16;
438
439 #define VEC_MERGE( a ) \
440     vu_vec = a( v_vec, u_vec ); \
441     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
442     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
443     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
444     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
445     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
446     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
447
448     vector unsigned char u_vec;
449     vector unsigned char v_vec;
450     vector unsigned char vu_vec;
451     vector unsigned char y_vec;
452
453     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
454            ( p_filter->fmt_in.video.i_height % 2 ) ) )
455     {
456         /* Width is a multiple of 32, we take 2 lines at a time */
457         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
458         {
459             VEC_NEXT_LINES( );
460             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
461             {
462                 VEC_LOAD_UV( );
463                 VEC_MERGE( vec_mergeh );
464                 VEC_MERGE( vec_mergel );
465             }
466         }
467     }
468     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
469                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
470     {
471         /* Width is only a multiple of 16, we take 4 lines at a time */
472         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
473         {
474             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
475             VEC_NEXT_LINES( );
476             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
477             {
478                 VEC_LOAD_UV( );
479                 VEC_MERGE( vec_mergeh );
480                 VEC_MERGE( vec_mergel );
481             }
482
483             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
484             VEC_LOAD_UV( );
485             VEC_MERGE( vec_mergeh );
486
487             /* Line 3 and 4, pixels 0 to 16 */
488             VEC_NEXT_LINES( );
489             VEC_MERGE( vec_mergel );
490
491             /* Line 3 and 4, pixels 16 to ( width ) */
492             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
493             {
494                 VEC_LOAD_UV( );
495                 VEC_MERGE( vec_mergeh );
496                 VEC_MERGE( vec_mergel );
497             }
498         }
499     }
500     else
501     {
502         /* Crap, use the C version */
503 #undef VEC_NEXT_LINES
504 #undef VEC_LOAD_UV
505 #undef VEC_MERGE
506 #endif
507
508     const int i_source_margin = p_source->p[0].i_pitch
509                                  - p_source->p[0].i_visible_pitch;
510     const int i_source_margin_c = p_source->p[1].i_pitch
511                                  - p_source->p[1].i_visible_pitch;
512     const int i_dest_margin = p_dest->p->i_pitch
513                                - p_dest->p->i_visible_pitch;
514
515 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
516     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
517     {
518         p_line1 = p_line2;
519         p_line2 += p_dest->p->i_pitch;
520
521         p_y1 = p_y2;
522         p_y2 += p_source->p[Y_PLANE].i_pitch;
523
524         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
525         {
526 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
527             C_YUV420_YVYU( );
528             C_YUV420_YVYU( );
529             C_YUV420_YVYU( );
530             C_YUV420_YVYU( );
531 #else
532             MMX_CALL( MMX_YUV420_YVYU );
533 #endif
534         }
535         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
536         {
537             C_YUV420_YVYU( );
538         }
539
540         p_y1 += i_source_margin;
541         p_y2 += i_source_margin;
542         p_u += i_source_margin_c;
543         p_v += i_source_margin_c;
544         p_line1 += i_dest_margin;
545         p_line2 += i_dest_margin;
546     }
547
548 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
549     /* re-enable FPU registers */
550     MMX_END;
551 #endif
552
553 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
554     }
555 #endif
556
557 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
558     /*
559     ** SSE2 128 bits fetch/store instructions are faster
560     ** if memory access is 16 bytes aligned
561     */
562     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
563         ((intptr_t)p_line2|(intptr_t)p_y2))) )
564     {
565         /* use faster SSE2 aligned fetch and store */
566         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
567         {
568             p_line1 = p_line2;
569             p_line2 += p_dest->p->i_pitch;
570
571             p_y1 = p_y2;
572             p_y2 += p_source->p[Y_PLANE].i_pitch;
573
574             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
575             {
576                 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
577             }
578             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
579             {
580                 C_YUV420_YVYU( );
581             }
582
583             p_y1 += i_source_margin;
584             p_y2 += i_source_margin;
585             p_u += i_source_margin_c;
586             p_v += i_source_margin_c;
587             p_line1 += i_dest_margin;
588             p_line2 += i_dest_margin;
589         }
590     }
591     else
592     {
593         /* use slower SSE2 unaligned fetch and store */
594         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
595         {
596             p_line1 = p_line2;
597             p_line2 += p_dest->p->i_pitch;
598
599             p_y1 = p_y2;
600             p_y2 += p_source->p[Y_PLANE].i_pitch;
601
602             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
603             {
604                 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
605             }
606             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
607             {
608                 C_YUV420_YVYU( );
609             }
610
611             p_y1 += i_source_margin;
612             p_y2 += i_source_margin;
613             p_u += i_source_margin_c;
614             p_v += i_source_margin_c;
615             p_line1 += i_dest_margin;
616             p_line2 += i_dest_margin;
617         }
618     }
619     /* make sure all SSE2 stores are visible thereafter */
620     SSE2_END;
621 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
622 }
623
624 /*****************************************************************************
625  * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
626  *****************************************************************************/
627 VLC_TARGET
628 static void I420_UYVY( filter_t *p_filter, picture_t *p_source,
629                                            picture_t *p_dest )
630 {
631     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
632     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
633     uint8_t *p_u = p_source->U_PIXELS;
634     uint8_t *p_v = p_source->V_PIXELS;
635
636     int i_x, i_y;
637
638 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
639 #define VEC_NEXT_LINES( ) \
640     p_line1  = p_line2; \
641     p_line2 += p_dest->p->i_pitch; \
642     p_y1     = p_y2; \
643     p_y2    += p_source->p[Y_PLANE].i_pitch;
644
645 #define VEC_LOAD_UV( ) \
646     u_vec = vec_ld( 0, p_u ); p_u += 16; \
647     v_vec = vec_ld( 0, p_v ); p_v += 16;
648
649 #define VEC_MERGE( a ) \
650     uv_vec = a( u_vec, v_vec ); \
651     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
652     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
653     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
654     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
655     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
656     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
657
658     vector unsigned char u_vec;
659     vector unsigned char v_vec;
660     vector unsigned char uv_vec;
661     vector unsigned char y_vec;
662
663     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
664            ( p_filter->fmt_in.video.i_height % 2 ) ) )
665     {
666         /* Width is a multiple of 32, we take 2 lines at a time */
667         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
668         {
669             VEC_NEXT_LINES( );
670             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
671             {
672                 VEC_LOAD_UV( );
673                 VEC_MERGE( vec_mergeh );
674                 VEC_MERGE( vec_mergel );
675             }
676         }
677     }
678     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
679                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
680     {
681         /* Width is only a multiple of 16, we take 4 lines at a time */
682         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
683         {
684             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
685             VEC_NEXT_LINES( );
686             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
687             {
688                 VEC_LOAD_UV( );
689                 VEC_MERGE( vec_mergeh );
690                 VEC_MERGE( vec_mergel );
691             }
692
693             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
694             VEC_LOAD_UV( );
695             VEC_MERGE( vec_mergeh );
696
697             /* Line 3 and 4, pixels 0 to 16 */
698             VEC_NEXT_LINES( );
699             VEC_MERGE( vec_mergel );
700
701             /* Line 3 and 4, pixels 16 to ( width ) */
702             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
703             {
704                 VEC_LOAD_UV( );
705                 VEC_MERGE( vec_mergeh );
706                 VEC_MERGE( vec_mergel );
707             }
708         }
709     }
710     else
711     {
712         /* Crap, use the C version */
713 #undef VEC_NEXT_LINES
714 #undef VEC_LOAD_UV
715 #undef VEC_MERGE
716 #endif
717
718     const int i_source_margin = p_source->p[0].i_pitch
719                                  - p_source->p[0].i_visible_pitch;
720     const int i_source_margin_c = p_source->p[1].i_pitch
721                                  - p_source->p[1].i_visible_pitch;
722     const int i_dest_margin = p_dest->p->i_pitch
723                                - p_dest->p->i_visible_pitch;
724
725 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
726     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
727     {
728         p_line1 = p_line2;
729         p_line2 += p_dest->p->i_pitch;
730
731         p_y1 = p_y2;
732         p_y2 += p_source->p[Y_PLANE].i_pitch;
733
734         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
735         {
736 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
737             C_YUV420_UYVY( );
738             C_YUV420_UYVY( );
739             C_YUV420_UYVY( );
740             C_YUV420_UYVY( );
741 #else
742             MMX_CALL( MMX_YUV420_UYVY );
743 #endif
744         }
745         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x--; )
746         {
747             C_YUV420_UYVY( );
748         }
749
750         p_y1 += i_source_margin;
751         p_y2 += i_source_margin;
752         p_u += i_source_margin_c;
753         p_v += i_source_margin_c;
754         p_line1 += i_dest_margin;
755         p_line2 += i_dest_margin;
756     }
757
758 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
759     /* re-enable FPU registers */
760     MMX_END;
761 #endif
762
763 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
764     }
765 #endif
766
767 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
768     /*
769     ** SSE2 128 bits fetch/store instructions are faster
770     ** if memory access is 16 bytes aligned
771     */
772     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
773         ((intptr_t)p_line2|(intptr_t)p_y2))) )
774     {
775         /* use faster SSE2 aligned fetch and store */
776         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
777         {
778             p_line1 = p_line2;
779             p_line2 += p_dest->p->i_pitch;
780
781             p_y1 = p_y2;
782             p_y2 += p_source->p[Y_PLANE].i_pitch;
783
784             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
785             {
786                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
787             }
788             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
789             {
790                 C_YUV420_UYVY( );
791             }
792
793             p_y1 += i_source_margin;
794             p_y2 += i_source_margin;
795             p_u += i_source_margin_c;
796             p_v += i_source_margin_c;
797             p_line1 += i_dest_margin;
798             p_line2 += i_dest_margin;
799         }
800     }
801     else
802     {
803         /* use slower SSE2 unaligned fetch and store */
804         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
805         {
806             p_line1 = p_line2;
807             p_line2 += p_dest->p->i_pitch;
808
809             p_y1 = p_y2;
810             p_y2 += p_source->p[Y_PLANE].i_pitch;
811
812             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
813             {
814                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
815             }
816             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
817             {
818                 C_YUV420_UYVY( );
819             }
820
821             p_y1 += i_source_margin;
822             p_y2 += i_source_margin;
823             p_u += i_source_margin_c;
824             p_v += i_source_margin_c;
825             p_line1 += i_dest_margin;
826             p_line2 += i_dest_margin;
827         }
828     }
829     /* make sure all SSE2 stores are visible thereafter */
830     SSE2_END;
831 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
832 }
833
834 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
835 /*****************************************************************************
836  * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
837  *****************************************************************************/
838 static void I420_IUYV( filter_t *p_filter, picture_t *p_source,
839                                            picture_t *p_dest )
840 {
841     VLC_UNUSED(p_source); VLC_UNUSED(p_dest);
842     /* FIXME: TODO ! */
843     msg_Err( p_filter, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
844 }
845 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
846
/*****************************************************************************
 * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
 *****************************************************************************
 * Plain C build only. p_filter supplies the picture dimensions
 * (fmt_in.video), p_source the planar input and p_dest the packed output.
 *
 * Two luma rows (p_y1/p_y2) and two output rows (p_line1/p_line2) are
 * handled per iteration, fed by one p_u/p_v pair. C_YUV420_Y211 comes from
 * i420_yuy2.h and is expected to advance the pointers itself (confirm in
 * that header); this function adds the pitch margins.
 *****************************************************************************/
#if defined (MODULE_NAME_IS_i420_yuy2)
static void I420_Y211( filter_t *p_filter, picture_t *p_source,
                                           picture_t *p_dest )
{
    uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
    uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

    int i_x, i_y;

    /* Bytes to skip at the end of each row: pitch minus visible pitch */
    const int i_source_margin = p_source->p[0].i_pitch
                                 - p_source->p[0].i_visible_pitch;
    const int i_source_margin_c = p_source->p[1].i_pitch
                                 - p_source->p[1].i_visible_pitch;
    const int i_dest_margin = p_dest->p->i_pitch
                               - p_dest->p->i_visible_pitch;

    for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
    {
        p_line1 = p_line2;
        p_line2 += p_dest->p->i_pitch;

        p_y1 = p_y2;
        p_y2 += p_source->p[Y_PLANE].i_pitch;

        /* NOTE(review): unlike the 4:2:2 converters there is no tail loop,
         * so the last width % 8 pixels of each row are not processed here;
         * Activate() only guarantees an even width - confirm that callers
         * always supply a multiple of 8. */
        for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
        {
            C_YUV420_Y211( );
            C_YUV420_Y211( );
        }

        p_y1 += i_source_margin;
        p_y2 += i_source_margin;
        p_u += i_source_margin_c;
        p_v += i_source_margin_c;
        p_line1 += i_dest_margin;
        p_line2 += i_dest_margin;
    }
}
#endif