git.sesse.net Git - vlc/blobdiff - modules/video_chroma/i420_yuy2.c
Chroma modules now exactly implement the "video filter2" capability.
[vlc] / modules / video_chroma / i420_yuy2.c
index 8a76fcb46f61cd64021fbf2a737117007f01b3a4..4aedfff4b30293313254c2e1a17db1daa3dad5fd 100644 (file)
 /*****************************************************************************
  * Preamble
  *****************************************************************************/
-#include <string.h>                                            /* strerror() */
-#include <stdlib.h>                                      /* malloc(), free() */
 
-#include <vlc/vlc.h>
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <vlc_common.h>
+#include <vlc_plugin.h>
+#include <vlc_filter.h>
 #include <vlc_vout.h>
 
 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
  *****************************************************************************/
 static int  Activate ( vlc_object_t * );
 
-static void I420_YUY2           ( vout_thread_t *, picture_t *, picture_t * );
-static void I420_YVYU           ( vout_thread_t *, picture_t *, picture_t * );
-static void I420_UYVY           ( vout_thread_t *, picture_t *, picture_t * );
+static void I420_YUY2           ( filter_t *, picture_t *, picture_t * );
+static void I420_YVYU           ( filter_t *, picture_t *, picture_t * );
+static void I420_UYVY           ( filter_t *, picture_t *, picture_t * );
+static picture_t *I420_YUY2_Filter    ( filter_t *, picture_t * );
+static picture_t *I420_YVYU_Filter    ( filter_t *, picture_t * );
+static picture_t *I420_UYVY_Filter    ( filter_t *, picture_t * );
 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
-static void I420_IUYV           ( vout_thread_t *, picture_t *, picture_t * );
-static void I420_cyuv           ( vout_thread_t *, picture_t *, picture_t * );
+static void I420_IUYV           ( filter_t *, picture_t *, picture_t * );
+static void I420_cyuv           ( filter_t *, picture_t *, picture_t * );
+static picture_t *I420_IUYV_Filter    ( filter_t *, picture_t * );
+static picture_t *I420_cyuv_Filter    ( filter_t *, picture_t * );
 #endif
 #if defined (MODULE_NAME_IS_i420_yuy2)
-static void I420_Y211           ( vout_thread_t *, picture_t *, picture_t * );
+static void I420_Y211           ( filter_t *, picture_t *, picture_t * );
+static picture_t *I420_Y211_Filter    ( filter_t *, picture_t * );
 #endif
 
 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
@@ -76,20 +86,20 @@ static const uint64_t i_80w   = 0x0000000080808080ULL;
  *****************************************************************************/
 vlc_module_begin();
 #if defined (MODULE_NAME_IS_i420_yuy2)
-    set_description( _("Conversions from " SRC_FOURCC " to " DEST_FOURCC) );
-    set_capability( "chroma", 80 );
+    set_description( N_("Conversions from " SRC_FOURCC " to " DEST_FOURCC) );
+    set_capability( "video filter2", 80 );
 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
-    set_description( _("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) );
-    set_capability( "chroma", 100 );
+    set_description( N_("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) );
+    set_capability( "video filter2", 100 );
     add_requirement( MMX );
 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
-    set_description( _("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) );
-    set_capability( "chroma", 120 );
+    set_description( N_("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) );
+    set_capability( "video filter2", 120 );
     add_requirement( SSE2 );
 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
     set_description(
-            _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
-    set_capability( "chroma", 100 );
+            N_("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
+    set_capability( "video filter2", 100 );
     add_requirement( ALTIVEC );
 #endif
     set_callbacks( Activate, NULL );
@@ -102,47 +112,48 @@ vlc_module_end();
  *****************************************************************************/
 static int Activate( vlc_object_t *p_this )
 {
-    vout_thread_t *p_vout = (vout_thread_t *)p_this;
+    filter_t *p_filter = (filter_t *)p_this;
 
-    if( p_vout->render.i_width & 1 || p_vout->render.i_height & 1 )
+    if( p_filter->fmt_in.video.i_width & 1
+     || p_filter->fmt_in.video.i_height & 1 )
     {
         return -1;
     }
 
-    switch( p_vout->render.i_chroma )
+    switch( p_filter->fmt_in.video.i_chroma )
     {
         case VLC_FOURCC('Y','V','1','2'):
         case VLC_FOURCC('I','4','2','0'):
         case VLC_FOURCC('I','Y','U','V'):
-            switch( p_vout->output.i_chroma )
+            switch( p_filter->fmt_out.video.i_chroma )
             {
                 case VLC_FOURCC('Y','U','Y','2'):
                 case VLC_FOURCC('Y','U','N','V'):
-                    p_vout->chroma.pf_convert = I420_YUY2;
+                    p_filter->pf_video_filter = I420_YUY2_Filter;
                     break;
 
                 case VLC_FOURCC('Y','V','Y','U'):
-                    p_vout->chroma.pf_convert = I420_YVYU;
+                    p_filter->pf_video_filter = I420_YVYU_Filter;
                     break;
 
                 case VLC_FOURCC('U','Y','V','Y'):
                 case VLC_FOURCC('U','Y','N','V'):
                 case VLC_FOURCC('Y','4','2','2'):
-                    p_vout->chroma.pf_convert = I420_UYVY;
+                    p_filter->pf_video_filter = I420_UYVY_Filter;
                     break;
 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
                 case VLC_FOURCC('I','U','Y','V'):
-                    p_vout->chroma.pf_convert = I420_IUYV;
+                    p_filter->pf_video_filter = I420_IUYV_Filter;
                     break;
 
                 case VLC_FOURCC('c','y','u','v'):
-                    p_vout->chroma.pf_convert = I420_cyuv;
+                    p_filter->pf_video_filter = I420_cyuv_Filter;
                     break;
 #endif
 
 #if defined (MODULE_NAME_IS_i420_yuy2)
                 case VLC_FOURCC('Y','2','1','1'):
-                    p_vout->chroma.pf_convert = I420_Y211;
+                    p_filter->pf_video_filter = I420_Y211_Filter;
                     break;
 #endif
 
@@ -169,11 +180,23 @@ static inline unsigned long long read_cycles(void)
 #endif
 
 /* Following functions are local */
+
+VIDEO_FILTER_WRAPPER( I420_YUY2 )
+VIDEO_FILTER_WRAPPER( I420_YVYU )
+VIDEO_FILTER_WRAPPER( I420_UYVY )
+#if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
+VIDEO_FILTER_WRAPPER( I420_IUYV )
+VIDEO_FILTER_WRAPPER( I420_cyuv )
+#endif
+#if defined (MODULE_NAME_IS_i420_yuy2)
+VIDEO_FILTER_WRAPPER( I420_Y211 )
+#endif
+
 /*****************************************************************************
  * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
  *****************************************************************************/
-static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
-                                              picture_t *p_dest )
+static void I420_YUY2( filter_t *p_filter, picture_t *p_source,
+                                           picture_t *p_dest )
 {
     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
@@ -207,14 +229,14 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
     vector unsigned char uv_vec;
     vector unsigned char y_vec;
 
-    if( !( ( p_vout->render.i_width % 32 ) |
-           ( p_vout->render.i_height % 2 ) ) )
+    if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
+           ( p_filter->fmt_in.video.i_height % 2 ) ) )
     {
         /* Width is a multiple of 32, we take 2 lines at a time */
-        for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+        for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
         {
             VEC_NEXT_LINES( );
-            for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
+            for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
             {
                 VEC_LOAD_UV( );
                 VEC_MERGE( vec_mergeh );
@@ -222,15 +244,15 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
             }
         }
     }
-    else if( !( ( p_vout->render.i_width % 16 ) |
-                ( p_vout->render.i_height % 4 ) ) )
+    else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
+                ( p_filter->fmt_in.video.i_height % 4 ) ) )
     {
         /* Width is only a multiple of 16, we take 4 lines at a time */
-        for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
+        for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
         {
             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
             VEC_NEXT_LINES( );
-            for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
+            for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
             {
                 VEC_LOAD_UV( );
                 VEC_MERGE( vec_mergeh );
@@ -246,7 +268,7 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
             VEC_MERGE( vec_mergel );
 
             /* Line 3 and 4, pixels 16 to ( width ) */
-            for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
+            for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
             {
                 VEC_LOAD_UV( );
                 VEC_MERGE( vec_mergeh );
@@ -270,7 +292,7 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
                                - p_dest->p->i_visible_pitch;
 
 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
-    for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+    for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
     {
         p_line1 = p_line2;
         p_line2 += p_dest->p->i_pitch;
@@ -279,7 +301,7 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
         p_y2 += p_source->p[Y_PLANE].i_pitch;
 
 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
-        for( i_x = p_vout->render.i_width / 8; i_x-- ; )
+        for( i_x = p_filter->fmt_in.video.i_width / 8; i_x-- ; )
         {
             C_YUV420_YUYV( );
             C_YUV420_YUYV( );
@@ -287,12 +309,12 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
             C_YUV420_YUYV( );
         }
 #else
-        for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
+        for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
         {
             MMX_CALL( MMX_YUV420_YUYV );
         }
 #endif
-        for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
+        for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
         {
             C_YUV420_YUYV( );
         }
@@ -307,7 +329,7 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
 
 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
     /* re-enable FPU registers */
-    __asm__ __volatile__ ( "emms" );
+    MMX_END;
 #endif
 
 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
@@ -316,15 +338,15 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
 
 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
     /*
-    ** SSE2 128 bits fetch/store instructions are faster 
+    ** SSE2 128 bits fetch/store instructions are faster
     ** if memory access is 16 bytes aligned
     */
 
     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
-        ((int)p_line2|(int)p_y2))) )
+        ((intptr_t)p_line2|(intptr_t)p_y2))) )
     {
         /* use faster SSE2 aligned fetch and store */
-        for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+        for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
         {
             p_line1 = p_line2;
             p_line2 += p_dest->p->i_pitch;
@@ -332,11 +354,11 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
             p_y1 = p_y2;
             p_y2 += p_source->p[Y_PLANE].i_pitch;
 
-            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+            for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
             {
                 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
             }
-            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+            for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
             {
                 C_YUV420_YUYV( );
             }
@@ -348,13 +370,11 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
             p_line1 += i_dest_margin;
             p_line2 += i_dest_margin;
         }
-        /* make sure all SSE2 stores are visible thereafter */
-        __asm__ __volatile__ ( "sfence" );
     }
     else
     {
         /* use slower SSE2 unaligned fetch and store */
-        for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+        for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
         {
             p_line1 = p_line2;
             p_line2 += p_dest->p->i_pitch;
@@ -362,11 +382,11 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
             p_y1 = p_y2;
             p_y2 += p_source->p[Y_PLANE].i_pitch;
 
-            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+            for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
             {
                 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
             }
-            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+            for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
             {
                 C_YUV420_YUYV( );
             }
@@ -379,6 +399,8 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
             p_line2 += i_dest_margin;
         }
     }
+    /* make sure all SSE2 stores are visible thereafter */
+    SSE2_END;
 
 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 }
@@ -386,8 +408,8 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source,
 /*****************************************************************************
  * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
  *****************************************************************************/
-static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
-                                              picture_t *p_dest )
+static void I420_YVYU( filter_t *p_filter, picture_t *p_source,
+                                           picture_t *p_dest )
 {
     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
@@ -421,14 +443,14 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
     vector unsigned char vu_vec;
     vector unsigned char y_vec;
 
-    if( !( ( p_vout->render.i_width % 32 ) |
-           ( p_vout->render.i_height % 2 ) ) )
+    if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
+           ( p_filter->fmt_in.video.i_height % 2 ) ) )
     {
         /* Width is a multiple of 32, we take 2 lines at a time */
-        for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+        for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
         {
             VEC_NEXT_LINES( );
-            for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
+            for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
             {
                 VEC_LOAD_UV( );
                 VEC_MERGE( vec_mergeh );
@@ -436,15 +458,15 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
             }
         }
     }
-    else if( !( ( p_vout->render.i_width % 16 ) |
-                ( p_vout->render.i_height % 4 ) ) )
+    else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
+                ( p_filter->fmt_in.video.i_height % 4 ) ) )
     {
         /* Width is only a multiple of 16, we take 4 lines at a time */
-        for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
+        for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
         {
             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
             VEC_NEXT_LINES( );
-            for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
+            for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
             {
                 VEC_LOAD_UV( );
                 VEC_MERGE( vec_mergeh );
@@ -460,7 +482,7 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
             VEC_MERGE( vec_mergel );
 
             /* Line 3 and 4, pixels 16 to ( width ) */
-            for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
+            for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
             {
                 VEC_LOAD_UV( );
                 VEC_MERGE( vec_mergeh );
@@ -484,7 +506,7 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
                                - p_dest->p->i_visible_pitch;
 
 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
-    for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+    for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
     {
         p_line1 = p_line2;
         p_line2 += p_dest->p->i_pitch;
@@ -492,7 +514,7 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
         p_y1 = p_y2;
         p_y2 += p_source->p[Y_PLANE].i_pitch;
 
-        for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
+        for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
         {
 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
             C_YUV420_YVYU( );
@@ -503,7 +525,7 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
             MMX_CALL( MMX_YUV420_YVYU );
 #endif
         }
-        for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
+        for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
         {
             C_YUV420_YVYU( );
         }
@@ -518,7 +540,7 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
 
 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
     /* re-enable FPU registers */
-    __asm__ __volatile__ ( "emms" );
+    MMX_END;
 #endif
 
 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
@@ -527,14 +549,14 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
 
 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
     /*
-    ** SSE2 128 bits fetch/store instructions are faster 
+    ** SSE2 128 bits fetch/store instructions are faster
     ** if memory access is 16 bytes aligned
     */
     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
-        ((int)p_line2|(int)p_y2))) )
+        ((intptr_t)p_line2|(intptr_t)p_y2))) )
     {
         /* use faster SSE2 aligned fetch and store */
-        for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+        for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
         {
             p_line1 = p_line2;
             p_line2 += p_dest->p->i_pitch;
@@ -542,11 +564,11 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
             p_y1 = p_y2;
             p_y2 += p_source->p[Y_PLANE].i_pitch;
 
-            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+            for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
             {
                 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
             }
-            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+            for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
             {
                 C_YUV420_YVYU( );
             }
@@ -558,13 +580,11 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
             p_line1 += i_dest_margin;
             p_line2 += i_dest_margin;
         }
-        /* make sure all SSE2 stores are visible thereafter */
-        __asm__ __volatile__ ( "sfence" );
     }
     else
     {
         /* use slower SSE2 unaligned fetch and store */
-        for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+        for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
         {
             p_line1 = p_line2;
             p_line2 += p_dest->p->i_pitch;
@@ -572,11 +592,11 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
             p_y1 = p_y2;
             p_y2 += p_source->p[Y_PLANE].i_pitch;
 
-            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+            for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
             {
                 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
             }
-            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+            for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
             {
                 C_YUV420_YVYU( );
             }
@@ -589,14 +609,16 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
             p_line2 += i_dest_margin;
         }
     }
+    /* make sure all SSE2 stores are visible thereafter */
+    SSE2_END;
 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 }
 
 /*****************************************************************************
  * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
  *****************************************************************************/
-static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
-                                              picture_t *p_dest )
+static void I420_UYVY( filter_t *p_filter, picture_t *p_source,
+                                           picture_t *p_dest )
 {
     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
@@ -630,14 +652,14 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
     vector unsigned char uv_vec;
     vector unsigned char y_vec;
 
-    if( !( ( p_vout->render.i_width % 32 ) |
-           ( p_vout->render.i_height % 2 ) ) )
+    if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
+           ( p_filter->fmt_in.video.i_height % 2 ) ) )
     {
         /* Width is a multiple of 32, we take 2 lines at a time */
-        for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+        for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
         {
             VEC_NEXT_LINES( );
-            for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
+            for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
             {
                 VEC_LOAD_UV( );
                 VEC_MERGE( vec_mergeh );
@@ -645,15 +667,15 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
             }
         }
     }
-    else if( !( ( p_vout->render.i_width % 16 ) |
-                ( p_vout->render.i_height % 4 ) ) )
+    else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
+                ( p_filter->fmt_in.video.i_height % 4 ) ) )
     {
         /* Width is only a multiple of 16, we take 4 lines at a time */
-        for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
+        for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
         {
             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
             VEC_NEXT_LINES( );
-            for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
+            for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
             {
                 VEC_LOAD_UV( );
                 VEC_MERGE( vec_mergeh );
@@ -669,7 +691,7 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
             VEC_MERGE( vec_mergel );
 
             /* Line 3 and 4, pixels 16 to ( width ) */
-            for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
+            for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
             {
                 VEC_LOAD_UV( );
                 VEC_MERGE( vec_mergeh );
@@ -693,7 +715,7 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
                                - p_dest->p->i_visible_pitch;
 
 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
-    for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+    for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
     {
         p_line1 = p_line2;
         p_line2 += p_dest->p->i_pitch;
@@ -701,7 +723,7 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
         p_y1 = p_y2;
         p_y2 += p_source->p[Y_PLANE].i_pitch;
 
-        for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
+        for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
         {
 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
             C_YUV420_UYVY( );
@@ -712,7 +734,7 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
             MMX_CALL( MMX_YUV420_UYVY );
 #endif
         }
-        for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x--; )
+        for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x--; )
         {
             C_YUV420_UYVY( );
         }
@@ -727,7 +749,7 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
 
 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
     /* re-enable FPU registers */
-    __asm__ __volatile__ ( "emms" );
+    MMX_END;
 #endif
 
 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
@@ -736,14 +758,14 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
 
 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
     /*
-    ** SSE2 128 bits fetch/store instructions are faster 
+    ** SSE2 128 bits fetch/store instructions are faster
     ** if memory access is 16 bytes aligned
     */
     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
-        ((int)p_line2|(int)p_y2))) )
+        ((intptr_t)p_line2|(intptr_t)p_y2))) )
     {
         /* use faster SSE2 aligned fetch and store */
-        for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+        for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
         {
             p_line1 = p_line2;
             p_line2 += p_dest->p->i_pitch;
@@ -751,11 +773,11 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
             p_y1 = p_y2;
             p_y2 += p_source->p[Y_PLANE].i_pitch;
 
-            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+            for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
             {
                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
             }
-            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+            for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
             {
                 C_YUV420_UYVY( );
             }
@@ -767,13 +789,11 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
             p_line1 += i_dest_margin;
             p_line2 += i_dest_margin;
         }
-        /* make sure all SSE2 stores are visible thereafter */
-        __asm__ __volatile__ ( "sfence" );
     }
     else
     {
         /* use slower SSE2 unaligned fetch and store */
-        for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+        for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
         {
             p_line1 = p_line2;
             p_line2 += p_dest->p->i_pitch;
@@ -781,11 +801,11 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
             p_y1 = p_y2;
             p_y2 += p_source->p[Y_PLANE].i_pitch;
 
-            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+            for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
             {
                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
             }
-            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+            for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
             {
                 C_YUV420_UYVY( );
             }
@@ -798,6 +818,8 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
             p_line2 += i_dest_margin;
         }
     }
+    /* make sure all SSE2 stores are visible thereafter */
+    SSE2_END;
 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 }
 
@@ -805,18 +827,19 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source,
 /*****************************************************************************
  * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
  *****************************************************************************/
-static void I420_IUYV( vout_thread_t *p_vout, picture_t *p_source,
-                                              picture_t *p_dest )
+static void I420_IUYV( filter_t *p_filter, picture_t *p_source,
+                                           picture_t *p_dest )
 {
+    VLC_UNUSED(p_source); VLC_UNUSED(p_dest);
     /* FIXME: TODO ! */
-    msg_Err( p_vout, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
+    msg_Err( p_filter, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
 }
 
 /*****************************************************************************
  * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
  *****************************************************************************/
-static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
-                                              picture_t *p_dest )
+static void I420_cyuv( filter_t *p_filter, picture_t *p_source,
+                                           picture_t *p_dest )
 {
     uint8_t *p_line1 = p_dest->p->p_pixels +
                        p_dest->p->i_visible_lines * p_dest->p->i_pitch
@@ -837,7 +860,7 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
                                - p_dest->p->i_visible_pitch;
 
 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
-    for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+    for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
     {
         p_line1 -= 3 * p_dest->p->i_pitch;
         p_line2 -= 3 * p_dest->p->i_pitch;
@@ -845,7 +868,7 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
         p_y1 = p_y2;
         p_y2 += p_source->p[Y_PLANE].i_pitch;
 
-        for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
+        for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
         {
 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
             C_YUV420_UYVY( );
@@ -856,7 +879,7 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
             MMX_CALL( MMX_YUV420_UYVY );
 #endif
         }
-        for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
+        for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
         {
             C_YUV420_UYVY( );
         }
@@ -871,19 +894,19 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
 
 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
     /* re-enable FPU registers */
-    __asm__ __volatile__ ( "emms" );
+    MMX_END;
 #endif
 
 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
     /*
-    ** SSE2 128 bits fetch/store instructions are faster 
+    ** SSE2 128 bits fetch/store instructions are faster
     ** if memory access is 16 bytes aligned
     */
     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
-        ((int)p_line2|(int)p_y2))) )
+        ((intptr_t)p_line2|(intptr_t)p_y2))) )
     {
         /* use faster SSE2 aligned fetch and store */
-        for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+        for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
         {
             p_line1 = p_line2;
             p_line2 += p_dest->p->i_pitch;
@@ -891,11 +914,11 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
             p_y1 = p_y2;
             p_y2 += p_source->p[Y_PLANE].i_pitch;
 
-            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+            for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
             {
                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
             }
-            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+            for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
             {
                 C_YUV420_UYVY( );
             }
@@ -907,13 +930,11 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
             p_line1 += i_dest_margin;
             p_line2 += i_dest_margin;
         }
-        /* make sure all SSE2 stores are visible thereafter */
-        __asm__ __volatile__ ( "sfence" );
     }
     else
     {
         /* use slower SSE2 unaligned fetch and store */
-        for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+        for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
         {
             p_line1 = p_line2;
             p_line2 += p_dest->p->i_pitch;
@@ -921,11 +942,11 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
             p_y1 = p_y2;
             p_y2 += p_source->p[Y_PLANE].i_pitch;
 
-            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+            for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
             {
                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
             }
-            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+            for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
             {
                 C_YUV420_UYVY( );
             }
@@ -938,6 +959,8 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
             p_line2 += i_dest_margin;
         }
     }
+    /* make sure all SSE2 stores are visible thereafter */
+    SSE2_END;
 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 }
 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
@@ -946,8 +969,8 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source,
  * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
  *****************************************************************************/
 #if defined (MODULE_NAME_IS_i420_yuy2)
-static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source,
-                                              picture_t *p_dest )
+static void I420_Y211( filter_t *p_filter, picture_t *p_source,
+                                           picture_t *p_dest )
 {
     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
@@ -963,7 +986,7 @@ static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source,
     const int i_dest_margin = p_dest->p->i_pitch
                                - p_dest->p->i_visible_pitch;
 
-    for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+    for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
     {
         p_line1 = p_line2;
         p_line2 += p_dest->p->i_pitch;
@@ -971,7 +994,7 @@ static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source,
         p_y1 = p_y2;
         p_y2 += p_source->p[Y_PLANE].i_pitch;
 
-        for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
+        for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
         {
             C_YUV420_Y211( );
             C_YUV420_Y211( );