/*****************************************************************************
* i420_yuy2.c : YUV to YUV conversion module for vlc
*****************************************************************************
- * Copyright (C) 2000, 2001 VideoLAN
+ * Copyright (C) 2000, 2001 the VideoLAN team
* $Id$
*
* Authors: Samuel Hocevar <sam@zoy.org>
+ * Damien Fouilleul <damien@videolan.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
*****************************************************************************/
/*****************************************************************************
* Preamble
*****************************************************************************/
-#include <string.h> /* strerror() */
-#include <stdlib.h> /* malloc(), free() */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
#include <vlc/vlc.h>
-#include <vlc/vout.h>
+#include <vlc_plugin.h>
+#include <vlc_vout.h>
-#ifdef HAVE_ALTIVEC_H
+#if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
# include <altivec.h>
#endif
# define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
#elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
# define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
+#elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
+# define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
#elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
-# define DEST_FOURCC "YUY2,YUNV"
+# define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
#endif
/*****************************************************************************
static int Activate ( vlc_object_t * );
static void I420_YUY2 ( vout_thread_t *, picture_t *, picture_t * );
-#if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
static void I420_YVYU ( vout_thread_t *, picture_t *, picture_t * );
static void I420_UYVY ( vout_thread_t *, picture_t *, picture_t * );
+#if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
static void I420_IUYV ( vout_thread_t *, picture_t *, picture_t * );
static void I420_cyuv ( vout_thread_t *, picture_t *, picture_t * );
#endif
#endif
#ifdef MODULE_NAME_IS_i420_yuy2_mmx
-static uint64_t i_00ffw;
-static uint64_t i_80w;
+/* MMX-specific constants, now compile-time const (they were previously
+ * plain statics assigned at runtime in the module activation code —
+ * see the assignments removed there). */
+static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
+static const uint64_t i_80w = 0x0000000080808080ULL;
#endif
/*****************************************************************************
set_description( _("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) );
set_capability( "chroma", 100 );
add_requirement( MMX );
- /* Initialize MMX-specific constants */
- i_00ffw = 0x00ff00ff00ff00ffULL;
- i_80w = 0x0000000080808080ULL;
+#elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
+ set_description( _("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) );
+ set_capability( "chroma", 120 );
+ add_requirement( SSE2 );
#elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
set_description(
_("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
p_vout->chroma.pf_convert = I420_YUY2;
break;
-#if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
case VLC_FOURCC('Y','V','Y','U'):
p_vout->chroma.pf_convert = I420_YVYU;
break;
case VLC_FOURCC('Y','4','2','2'):
p_vout->chroma.pf_convert = I420_UYVY;
break;
-
+#if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
case VLC_FOURCC('I','U','Y','V'):
p_vout->chroma.pf_convert = I420_IUYV;
break;
return 0;
}
-/* Following functions are local */
+#if 0
+/* Read the x86 time-stamp counter via the `rdtsc` instruction; the
+ * "=A" constraint returns the 64-bit result in edx:eax.  Profiling
+ * helper only — deliberately compiled out with #if 0. */
+static inline unsigned long long read_cycles(void)
+{
+    unsigned long long v;
+    __asm__ __volatile__("rdtsc" : "=A" (v): );
+    return v;
+}
+#endif
+
+/* Following functions are local */
/*****************************************************************************
* I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
*****************************************************************************/
#undef VEC_MERGE
#endif
- const int i_source_margin = p_source->p->i_pitch
- - p_source->p->i_visible_pitch;
+ const int i_source_margin = p_source->p[0].i_pitch
+ - p_source->p[0].i_visible_pitch;
+ const int i_source_margin_c = p_source->p[1].i_pitch
+ - p_source->p[1].i_visible_pitch;
const int i_dest_margin = p_dest->p->i_pitch
- p_dest->p->i_visible_pitch;
+#if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
{
p_line1 = p_line2;
p_y2 += p_source->p[Y_PLANE].i_pitch;
#if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
- for( i_x = p_vout->render.i_width / 2 ; i_x-- ; )
+ for( i_x = p_vout->render.i_width / 8; i_x-- ; )
{
C_YUV420_YUYV( );
+ C_YUV420_YUYV( );
+ C_YUV420_YUYV( );
+ C_YUV420_YUYV( );
}
#else
for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
{
MMX_CALL( MMX_YUV420_YUYV );
}
+#endif
for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
{
C_YUV420_YUYV( );
}
-#endif
p_y1 += i_source_margin;
p_y2 += i_source_margin;
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
p_line1 += i_dest_margin;
p_line2 += i_dest_margin;
}
+#if defined (MODULE_NAME_IS_i420_yuy2_mmx)
+ /* re-enable FPU registers */
+ MMX_END;
+#endif
+
#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
}
#endif
+
+#else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
+ /*
+ ** SSE2 128 bits fetch/store instructions are faster
+ ** if memory access is 16 bytes aligned
+ */
+
+ if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
+ ((intptr_t)p_line2|(intptr_t)p_y2))) )
+ {
+ /* use faster SSE2 aligned fetch and store */
+ for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+ {
+ p_line1 = p_line2;
+ p_line2 += p_dest->p->i_pitch;
+
+ p_y1 = p_y2;
+ p_y2 += p_source->p[Y_PLANE].i_pitch;
+
+ for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+ {
+ SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
+ }
+ for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+ {
+ C_YUV420_YUYV( );
+ }
+
+ p_y1 += i_source_margin;
+ p_y2 += i_source_margin;
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
+ p_line1 += i_dest_margin;
+ p_line2 += i_dest_margin;
+ }
+ }
+ else
+ {
+ /* use slower SSE2 unaligned fetch and store */
+ for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+ {
+ p_line1 = p_line2;
+ p_line2 += p_dest->p->i_pitch;
+
+ p_y1 = p_y2;
+ p_y2 += p_source->p[Y_PLANE].i_pitch;
+
+ for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+ {
+ SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
+ }
+ for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+ {
+ C_YUV420_YUYV( );
+ }
+
+ p_y1 += i_source_margin;
+ p_y2 += i_source_margin;
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
+ p_line1 += i_dest_margin;
+ p_line2 += i_dest_margin;
+ }
+ }
+ /* make sure all SSE2 stores are visible thereafter */
+ SSE2_END;
+
+#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
}
/*****************************************************************************
* I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
*****************************************************************************/
-#if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
picture_t *p_dest )
{
int i_x, i_y;
- const int i_source_margin = p_source->p->i_pitch
- - p_source->p->i_visible_pitch;
+#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
+#define VEC_NEXT_LINES( ) \
+ p_line1 = p_line2; \
+ p_line2 += p_dest->p->i_pitch; \
+ p_y1 = p_y2; \
+ p_y2 += p_source->p[Y_PLANE].i_pitch;
+
+#define VEC_LOAD_UV( ) \
+ u_vec = vec_ld( 0, p_u ); p_u += 16; \
+ v_vec = vec_ld( 0, p_v ); p_v += 16;
+
+#define VEC_MERGE( a ) \
+ vu_vec = a( v_vec, u_vec ); \
+ y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
+ vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
+ vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
+ y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
+ vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
+ vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
+
+ vector unsigned char u_vec;
+ vector unsigned char v_vec;
+ vector unsigned char vu_vec;
+ vector unsigned char y_vec;
+
+ if( !( ( p_vout->render.i_width % 32 ) |
+ ( p_vout->render.i_height % 2 ) ) )
+ {
+ /* Width is a multiple of 32, we take 2 lines at a time */
+ for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+ {
+ VEC_NEXT_LINES( );
+ for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
+ {
+ VEC_LOAD_UV( );
+ VEC_MERGE( vec_mergeh );
+ VEC_MERGE( vec_mergel );
+ }
+ }
+ }
+ else if( !( ( p_vout->render.i_width % 16 ) |
+ ( p_vout->render.i_height % 4 ) ) )
+ {
+ /* Width is only a multiple of 16, we take 4 lines at a time */
+ for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
+ {
+ /* Line 1 and 2, pixels 0 to ( width - 16 ) */
+ VEC_NEXT_LINES( );
+ for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
+ {
+ VEC_LOAD_UV( );
+ VEC_MERGE( vec_mergeh );
+ VEC_MERGE( vec_mergel );
+ }
+
+ /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
+ VEC_LOAD_UV( );
+ VEC_MERGE( vec_mergeh );
+
+ /* Line 3 and 4, pixels 0 to 16 */
+ VEC_NEXT_LINES( );
+ VEC_MERGE( vec_mergel );
+
+ /* Line 3 and 4, pixels 16 to ( width ) */
+ for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
+ {
+ VEC_LOAD_UV( );
+ VEC_MERGE( vec_mergeh );
+ VEC_MERGE( vec_mergel );
+ }
+ }
+ }
+ else
+ {
+ /* Crap, use the C version */
+#undef VEC_NEXT_LINES
+#undef VEC_LOAD_UV
+#undef VEC_MERGE
+#endif
+
+ const int i_source_margin = p_source->p[0].i_pitch
+ - p_source->p[0].i_visible_pitch;
+ const int i_source_margin_c = p_source->p[1].i_pitch
+ - p_source->p[1].i_visible_pitch;
const int i_dest_margin = p_dest->p->i_pitch
- p_dest->p->i_visible_pitch;
+#if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
{
p_line1 = p_line2;
for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
{
-#if defined (MODULE_NAME_IS_i420_yuy2)
+#if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
C_YUV420_YVYU( );
C_YUV420_YVYU( );
C_YUV420_YVYU( );
MMX_CALL( MMX_YUV420_YVYU );
#endif
}
+ for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
+ {
+ C_YUV420_YVYU( );
+ }
p_y1 += i_source_margin;
p_y2 += i_source_margin;
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
p_line1 += i_dest_margin;
p_line2 += i_dest_margin;
}
+
+#if defined (MODULE_NAME_IS_i420_yuy2_mmx)
+ /* re-enable FPU registers */
+ MMX_END;
+#endif
+
+#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
+ }
+#endif
+
+#else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
+ /*
+ ** SSE2 128 bits fetch/store instructions are faster
+ ** if memory access is 16 bytes aligned
+ */
+ if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
+ ((intptr_t)p_line2|(intptr_t)p_y2))) )
+ {
+ /* use faster SSE2 aligned fetch and store */
+ for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+ {
+ p_line1 = p_line2;
+ p_line2 += p_dest->p->i_pitch;
+
+ p_y1 = p_y2;
+ p_y2 += p_source->p[Y_PLANE].i_pitch;
+
+ for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+ {
+ SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
+ }
+ for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+ {
+ C_YUV420_YVYU( );
+ }
+
+ p_y1 += i_source_margin;
+ p_y2 += i_source_margin;
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
+ p_line1 += i_dest_margin;
+ p_line2 += i_dest_margin;
+ }
+ }
+ else
+ {
+ /* use slower SSE2 unaligned fetch and store */
+ for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+ {
+ p_line1 = p_line2;
+ p_line2 += p_dest->p->i_pitch;
+
+ p_y1 = p_y2;
+ p_y2 += p_source->p[Y_PLANE].i_pitch;
+
+ for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+ {
+ SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
+ }
+ for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+ {
+ C_YUV420_YVYU( );
+ }
+
+ p_y1 += i_source_margin;
+ p_y2 += i_source_margin;
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
+ p_line1 += i_dest_margin;
+ p_line2 += i_dest_margin;
+ }
+ }
+ /* make sure all SSE2 stores are visible thereafter */
+ SSE2_END;
+#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
}
/*****************************************************************************
int i_x, i_y;
- const int i_source_margin = p_source->p->i_pitch
- - p_source->p->i_visible_pitch;
+#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
+#define VEC_NEXT_LINES( ) \
+ p_line1 = p_line2; \
+ p_line2 += p_dest->p->i_pitch; \
+ p_y1 = p_y2; \
+ p_y2 += p_source->p[Y_PLANE].i_pitch;
+
+#define VEC_LOAD_UV( ) \
+ u_vec = vec_ld( 0, p_u ); p_u += 16; \
+ v_vec = vec_ld( 0, p_v ); p_v += 16;
+
+#define VEC_MERGE( a ) \
+ uv_vec = a( u_vec, v_vec ); \
+ y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
+ vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
+ vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
+ y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
+ vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
+ vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
+
+ vector unsigned char u_vec;
+ vector unsigned char v_vec;
+ vector unsigned char uv_vec;
+ vector unsigned char y_vec;
+
+ if( !( ( p_vout->render.i_width % 32 ) |
+ ( p_vout->render.i_height % 2 ) ) )
+ {
+ /* Width is a multiple of 32, we take 2 lines at a time */
+ for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+ {
+ VEC_NEXT_LINES( );
+ for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
+ {
+ VEC_LOAD_UV( );
+ VEC_MERGE( vec_mergeh );
+ VEC_MERGE( vec_mergel );
+ }
+ }
+ }
+ else if( !( ( p_vout->render.i_width % 16 ) |
+ ( p_vout->render.i_height % 4 ) ) )
+ {
+ /* Width is only a multiple of 16, we take 4 lines at a time */
+ for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
+ {
+ /* Line 1 and 2, pixels 0 to ( width - 16 ) */
+ VEC_NEXT_LINES( );
+ for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
+ {
+ VEC_LOAD_UV( );
+ VEC_MERGE( vec_mergeh );
+ VEC_MERGE( vec_mergel );
+ }
+
+ /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
+ VEC_LOAD_UV( );
+ VEC_MERGE( vec_mergeh );
+
+ /* Line 3 and 4, pixels 0 to 16 */
+ VEC_NEXT_LINES( );
+ VEC_MERGE( vec_mergel );
+
+ /* Line 3 and 4, pixels 16 to ( width ) */
+ for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
+ {
+ VEC_LOAD_UV( );
+ VEC_MERGE( vec_mergeh );
+ VEC_MERGE( vec_mergel );
+ }
+ }
+ }
+ else
+ {
+ /* Crap, use the C version */
+#undef VEC_NEXT_LINES
+#undef VEC_LOAD_UV
+#undef VEC_MERGE
+#endif
+
+ const int i_source_margin = p_source->p[0].i_pitch
+ - p_source->p[0].i_visible_pitch;
+ const int i_source_margin_c = p_source->p[1].i_pitch
+ - p_source->p[1].i_visible_pitch;
const int i_dest_margin = p_dest->p->i_pitch
- p_dest->p->i_visible_pitch;
+#if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
{
p_line1 = p_line2;
for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
{
-#if defined (MODULE_NAME_IS_i420_yuy2)
+#if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
C_YUV420_UYVY( );
C_YUV420_UYVY( );
C_YUV420_UYVY( );
MMX_CALL( MMX_YUV420_UYVY );
#endif
}
+ for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x--; )
+ {
+ C_YUV420_UYVY( );
+ }
p_y1 += i_source_margin;
p_y2 += i_source_margin;
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
p_line1 += i_dest_margin;
p_line2 += i_dest_margin;
}
+
+#if defined (MODULE_NAME_IS_i420_yuy2_mmx)
+ /* re-enable FPU registers */
+ MMX_END;
+#endif
+
+#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
+ }
+#endif
+
+#else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
+ /*
+ ** SSE2 128 bits fetch/store instructions are faster
+ ** if memory access is 16 bytes aligned
+ */
+ if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
+ ((intptr_t)p_line2|(intptr_t)p_y2))) )
+ {
+ /* use faster SSE2 aligned fetch and store */
+ for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+ {
+ p_line1 = p_line2;
+ p_line2 += p_dest->p->i_pitch;
+
+ p_y1 = p_y2;
+ p_y2 += p_source->p[Y_PLANE].i_pitch;
+
+ for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+ {
+ SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
+ }
+ for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+ {
+ C_YUV420_UYVY( );
+ }
+
+ p_y1 += i_source_margin;
+ p_y2 += i_source_margin;
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
+ p_line1 += i_dest_margin;
+ p_line2 += i_dest_margin;
+ }
+ }
+ else
+ {
+ /* use slower SSE2 unaligned fetch and store */
+ for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+ {
+ p_line1 = p_line2;
+ p_line2 += p_dest->p->i_pitch;
+
+ p_y1 = p_y2;
+ p_y2 += p_source->p[Y_PLANE].i_pitch;
+
+ for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+ {
+ SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
+ }
+ for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+ {
+ C_YUV420_UYVY( );
+ }
+
+ p_y1 += i_source_margin;
+ p_y2 += i_source_margin;
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
+ p_line1 += i_dest_margin;
+ p_line2 += i_dest_margin;
+ }
+ }
+ /* make sure all SSE2 stores are visible thereafter */
+ SSE2_END;
+#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
}
+#if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
/*****************************************************************************
* I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
*****************************************************************************/
static void I420_IUYV( vout_thread_t *p_vout, picture_t *p_source,
                       picture_t *p_dest )
{
+    /* Stub: silence unused-parameter warnings until this conversion
+     * is actually implemented. */
+    VLC_UNUSED(p_source); VLC_UNUSED(p_dest);
    /* FIXME: TODO ! */
    msg_Err( p_vout, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
}
int i_x, i_y;
- const int i_source_margin = p_source->p->i_pitch
- - p_source->p->i_visible_pitch;
+ const int i_source_margin = p_source->p[0].i_pitch
+ - p_source->p[0].i_visible_pitch;
+ const int i_source_margin_c = p_source->p[1].i_pitch
+ - p_source->p[1].i_visible_pitch;
const int i_dest_margin = p_dest->p->i_pitch
- p_dest->p->i_visible_pitch;
+#if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
{
p_line1 -= 3 * p_dest->p->i_pitch;
for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
{
-#if defined (MODULE_NAME_IS_i420_yuy2)
+#if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
C_YUV420_UYVY( );
C_YUV420_UYVY( );
C_YUV420_UYVY( );
MMX_CALL( MMX_YUV420_UYVY );
#endif
}
+ for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
+ {
+ C_YUV420_UYVY( );
+ }
p_y1 += i_source_margin;
p_y2 += i_source_margin;
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
p_line1 += i_dest_margin;
p_line2 += i_dest_margin;
}
+
+#if defined (MODULE_NAME_IS_i420_yuy2_mmx)
+ /* re-enable FPU registers */
+ MMX_END;
+#endif
+
+#else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
+ /*
+ ** SSE2 128 bits fetch/store instructions are faster
+ ** if memory access is 16 bytes aligned
+ */
+ if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
+ ((intptr_t)p_line2|(intptr_t)p_y2))) )
+ {
+ /* use faster SSE2 aligned fetch and store */
+ for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+ {
+ p_line1 = p_line2;
+ p_line2 += p_dest->p->i_pitch;
+
+ p_y1 = p_y2;
+ p_y2 += p_source->p[Y_PLANE].i_pitch;
+
+ for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+ {
+ SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
+ }
+ for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+ {
+ C_YUV420_UYVY( );
+ }
+
+ p_y1 += i_source_margin;
+ p_y2 += i_source_margin;
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
+ p_line1 += i_dest_margin;
+ p_line2 += i_dest_margin;
+ }
+ }
+ else
+ {
+ /* use slower SSE2 unaligned fetch and store */
+ for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+ {
+ p_line1 = p_line2;
+ p_line2 += p_dest->p->i_pitch;
+
+ p_y1 = p_y2;
+ p_y2 += p_source->p[Y_PLANE].i_pitch;
+
+ for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+ {
+ SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
+ }
+ for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+ {
+ C_YUV420_UYVY( );
+ }
+
+ p_y1 += i_source_margin;
+ p_y2 += i_source_margin;
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
+ p_line1 += i_dest_margin;
+ p_line2 += i_dest_margin;
+ }
+ }
+ /* make sure all SSE2 stores are visible thereafter */
+ SSE2_END;
+#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
}
#endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
int i_x, i_y;
- const int i_source_margin = p_source->p->i_pitch
- - p_source->p->i_visible_pitch;
+ const int i_source_margin = p_source->p[0].i_pitch
+ - p_source->p[0].i_visible_pitch;
+ const int i_source_margin_c = p_source->p[1].i_pitch
+ - p_source->p[1].i_visible_pitch;
const int i_dest_margin = p_dest->p->i_pitch
- p_dest->p->i_visible_pitch;
p_y1 += i_source_margin;
p_y2 += i_source_margin;
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
p_line1 += i_dest_margin;
p_line2 += i_dest_margin;
}
}
#endif
-