git.sesse.net Git - vlc/commitdiff
i420_rgb: split files and clean up
author Rémi Denis-Courmont <remi@remlab.net>
Sat, 14 Sep 2013 16:55:47 +0000 (19:55 +0300)
committer Rémi Denis-Courmont <remi@remlab.net>
Sat, 14 Sep 2013 16:56:18 +0000 (19:56 +0300)
modules/video_chroma/Makefile.am
modules/video_chroma/i420_rgb.c
modules/video_chroma/i420_rgb.h
modules/video_chroma/i420_rgb16.c
modules/video_chroma/i420_rgb16_x86.c [new file with mode: 0644]

index 9cf630bdd76152bbcd53a3d86cde888c6fdf0254..4af12863d8a9d1f4962e7b6b3aa20014dad8ee14 100644 (file)
@@ -63,8 +63,8 @@ endif
 
 # MMX
 libi420_rgb_mmx_plugin_la_SOURCES = i420_rgb.c i420_rgb.h \
-       i420_rgb16.c i420_rgb_mmx.h
-libi420_rgb_mmx_plugin_la_CPPFLAGS = $(AM_CPPFLAGS)
+       i420_rgb16_x86.c i420_rgb_mmx.h
+libi420_rgb_mmx_plugin_la_CPPFLAGS = $(AM_CPPFLAGS) -DMMX
 
 libi420_yuy2_mmx_plugin_la_SOURCES = i420_yuy2.c i420_yuy2.h
 libi420_yuy2_mmx_plugin_la_CPPFLAGS = $(AM_CPPFLAGS)
@@ -81,8 +81,8 @@ endif
 
 # SSE2
 libi420_rgb_sse2_plugin_la_SOURCES = i420_rgb.c i420_rgb.h \
-       i420_rgb16.c i420_rgb_sse2.h
-libi420_rgb_sse2_plugin_la_CPPFLAGS = $(AM_CPPFLAGS)
+       i420_rgb16_x86.c i420_rgb_sse2.h
+libi420_rgb_sse2_plugin_la_CPPFLAGS = $(AM_CPPFLAGS) -DSSE2
 
 libi420_yuy2_sse2_plugin_la_SOURCES = i420_yuy2.c i420_yuy2.h
 libi420_yuy2_sse2_plugin_la_CPPFLAGS = $(AM_CPPFLAGS)
index 398ef57d1c29055a7a607a8213a258fb1a8704ce..c9dd5ed5610a5a0f002b5dfb87328ef3a06c79d5 100644 (file)
 #include <vlc_cpu.h>
 
 #include "i420_rgb.h"
-#if defined (MODULE_NAME_IS_i420_rgb)
-#   include "i420_rgb_c.h"
-    static picture_t *I420_RGB8_Filter         ( filter_t *, picture_t * );
-//    static picture_t *I420_RGB16_dither_Filter ( filter_t *, picture_t * );
-    static picture_t *I420_RGB16_Filter        ( filter_t *, picture_t * );
-    static picture_t *I420_RGB32_Filter        ( filter_t *, picture_t * );
+#ifdef PLAIN
+# include "i420_rgb_c.h"
+static picture_t *I420_RGB8_Filter( filter_t *, picture_t * );
+static picture_t *I420_RGB16_Filter( filter_t *, picture_t * );
+static picture_t *I420_RGB32_Filter( filter_t *, picture_t * );
+
+static void SetGammaTable( int *pi_table, double f_gamma );
+static void SetYUV( filter_t * );
+static void Set8bppPalette( filter_t *, uint8_t * );
 #else
-    static picture_t *I420_R5G5B5_Filter       ( filter_t *, picture_t * );
-    static picture_t *I420_R5G6B5_Filter       ( filter_t *, picture_t * );
-    static picture_t *I420_A8R8G8B8_Filter     ( filter_t *, picture_t * );
-    static picture_t *I420_R8G8B8A8_Filter     ( filter_t *, picture_t * );
-    static picture_t *I420_B8G8R8A8_Filter     ( filter_t *, picture_t * );
-    static picture_t *I420_A8B8G8R8_Filter     ( filter_t *, picture_t * );
+static picture_t *I420_R5G5B5_Filter( filter_t *, picture_t * );
+static picture_t *I420_R5G6B5_Filter( filter_t *, picture_t * );
+static picture_t *I420_A8R8G8B8_Filter( filter_t *, picture_t * );
+static picture_t *I420_R8G8B8A8_Filter( filter_t *, picture_t * );
+static picture_t *I420_B8G8R8A8_Filter( filter_t *, picture_t * );
+static picture_t *I420_A8B8G8R8_Filter( filter_t *, picture_t * );
 #endif
 
 /*****************************************************************************
                        << p_filter->fmt_out.video.i_lbshift))
 
 /*****************************************************************************
- * Local and extern prototypes.
+ * Module descriptor.
  *****************************************************************************/
 static int  Activate   ( vlc_object_t * );
 static void Deactivate ( vlc_object_t * );
 
-#if defined (MODULE_NAME_IS_i420_rgb)
-static void SetGammaTable       ( int *pi_table, double f_gamma );
-static void SetYUV              ( filter_t * );
-static void Set8bppPalette      ( filter_t *, uint8_t * );
-#endif
-
-/*****************************************************************************
- * Module descriptor.
- *****************************************************************************/
 vlc_module_begin ()
-#if defined (MODULE_NAME_IS_i420_rgb)
-    set_description( N_("I420,IYUV,YV12 to "
-                       "RGB2,RV15,RV16,RV24,RV32 conversions") )
-    set_capability( "video filter2", 80 )
-# define vlc_CPU_capable() (true)
-#elif defined (MODULE_NAME_IS_i420_rgb_mmx)
-    set_description( N_( "MMX I420,IYUV,YV12 to "
-                        "RV15,RV16,RV24,RV32 conversions") )
-    set_capability( "video filter2", 100 )
-# define vlc_CPU_capable() vlc_CPU_MMX()
-#elif defined (MODULE_NAME_IS_i420_rgb_sse2)
+#if defined (SSE2)
     set_description( N_( "SSE2 I420,IYUV,YV12 to "
                         "RV15,RV16,RV24,RV32 conversions") )
     set_capability( "video filter2", 120 )
 # define vlc_CPU_capable() vlc_CPU_SSE2()
+#elif defined (MMX)
+    set_description( N_( "MMX I420,IYUV,YV12 to "
+                        "RV15,RV16,RV24,RV32 conversions") )
+    set_capability( "video filter2", 100 )
+# define vlc_CPU_capable() vlc_CPU_MMX()
+#else
+    set_description( N_("I420,IYUV,YV12 to "
+                       "RGB2,RV15,RV16,RV24,RV32 conversions") )
+    set_capability( "video filter2", 80 )
+# define vlc_CPU_capable() (true)
 #endif
     set_callbacks( Activate, Deactivate )
 vlc_module_end ()
@@ -107,7 +101,7 @@ vlc_module_end ()
 static int Activate( vlc_object_t *p_this )
 {
     filter_t *p_filter = (filter_t *)p_this;
-#if defined (MODULE_NAME_IS_i420_rgb)
+#ifdef PLAIN
     size_t i_tables_size;
 #endif
 
@@ -125,14 +119,9 @@ static int Activate( vlc_object_t *p_this )
         case VLC_CODEC_I420:
             switch( p_filter->fmt_out.video.i_chroma )
             {
-#if defined (MODULE_NAME_IS_i420_rgb)
-                case VLC_CODEC_RGB8:
-                    p_filter->pf_video_filter = I420_RGB8_Filter;
-                    break;
-#endif
+#ifndef PLAIN
                 case VLC_CODEC_RGB15:
                 case VLC_CODEC_RGB16:
-#if ! defined (MODULE_NAME_IS_i420_rgb)
                     /* If we don't have support for the bitmasks, bail out */
                     if( ( p_filter->fmt_out.video.i_rmask == 0x7c00
                        && p_filter->fmt_out.video.i_gmask == 0x03e0
@@ -152,19 +141,8 @@ static int Activate( vlc_object_t *p_this )
                     }
                     else
                         return VLC_EGENERIC;
-#else
-                    // generic C chroma converter */
-                    p_filter->pf_video_filter = I420_RGB16_Filter;
-#endif
                     break;
-
-#if 0
-                /* Hmmm, is there only X11 using 32bits per pixel for RV24 ? */
-                case VLC_CODEC_RGB24:
-#endif
-
                 case VLC_CODEC_RGB32:
-#if ! defined (MODULE_NAME_IS_i420_rgb)
                     /* If we don't have support for the bitmasks, bail out */
                     if( p_filter->fmt_out.video.i_rmask == 0x00ff0000
                      && p_filter->fmt_out.video.i_gmask == 0x0000ff00
@@ -200,12 +178,19 @@ static int Activate( vlc_object_t *p_this )
                     }
                     else
                         return VLC_EGENERIC;
+                    break;
 #else
-                    /* generic C chroma converter */
+                case VLC_CODEC_RGB8:
+                    p_filter->pf_video_filter = I420_RGB8_Filter;
+                    break;
+                case VLC_CODEC_RGB15:
+                case VLC_CODEC_RGB16:
+                    p_filter->pf_video_filter = I420_RGB16_Filter;
+                    break;
+                case VLC_CODEC_RGB32:
                     p_filter->pf_video_filter = I420_RGB32_Filter;
-#endif
                     break;
-
+#endif
                 default:
                     return VLC_EGENERIC;
             }
@@ -223,22 +208,19 @@ static int Activate( vlc_object_t *p_this )
 
     switch( p_filter->fmt_out.video.i_chroma )
     {
-#if defined (MODULE_NAME_IS_i420_rgb)
+#ifdef PLAIN
         case VLC_CODEC_RGB8:
             p_filter->p_sys->p_buffer = malloc( VOUT_MAX_WIDTH );
             break;
 #endif
-
         case VLC_CODEC_RGB15:
         case VLC_CODEC_RGB16:
             p_filter->p_sys->p_buffer = malloc( VOUT_MAX_WIDTH * 2 );
             break;
-
         case VLC_CODEC_RGB24:
         case VLC_CODEC_RGB32:
             p_filter->p_sys->p_buffer = malloc( VOUT_MAX_WIDTH * 4 );
             break;
-
         default:
             p_filter->p_sys->p_buffer = NULL;
             break;
@@ -261,7 +243,7 @@ static int Activate( vlc_object_t *p_this )
         return VLC_EGENERIC;
     }
 
-#if defined (MODULE_NAME_IS_i420_rgb)
+#ifdef PLAIN
     switch( p_filter->fmt_out.video.i_chroma )
     {
     case VLC_CODEC_RGB8:
@@ -300,7 +282,7 @@ static void Deactivate( vlc_object_t *p_this )
 {
     filter_t *p_filter = (filter_t *)p_this;
 
-#if defined (MODULE_NAME_IS_i420_rgb)
+#ifdef PLAIN
     free( p_filter->p_sys->p_base );
 #endif
     free( p_filter->p_sys->p_offset );
@@ -308,21 +290,18 @@ static void Deactivate( vlc_object_t *p_this )
     free( p_filter->p_sys );
 }
 
-#if defined (MODULE_NAME_IS_i420_rgb)
-VIDEO_FILTER_WRAPPER( I420_RGB8 )
-VIDEO_FILTER_WRAPPER( I420_RGB16 )
-//VIDEO_FILTER_WRAPPER( I420_RGB16_dither )
-VIDEO_FILTER_WRAPPER( I420_RGB32 )
-#else
+#ifndef PLAIN
 VIDEO_FILTER_WRAPPER( I420_R5G5B5 )
 VIDEO_FILTER_WRAPPER( I420_R5G6B5 )
 VIDEO_FILTER_WRAPPER( I420_A8R8G8B8 )
 VIDEO_FILTER_WRAPPER( I420_R8G8B8A8 )
 VIDEO_FILTER_WRAPPER( I420_B8G8R8A8 )
 VIDEO_FILTER_WRAPPER( I420_A8B8G8R8 )
-#endif
+#else
+VIDEO_FILTER_WRAPPER( I420_RGB8 )
+VIDEO_FILTER_WRAPPER( I420_RGB16 )
+VIDEO_FILTER_WRAPPER( I420_RGB32 )
 
-#if defined (MODULE_NAME_IS_i420_rgb)
 /*****************************************************************************
  * SetGammaTable: return intensity table transformed by gamma curve.
  *****************************************************************************
@@ -538,6 +517,4 @@ static void Set8bppPalette( filter_t *p_filter, uint8_t *p_rgb8 )
         }
     }
 }
-
 #endif
-
index 5af725b278c411df3dd4878a5cea387291efb9ee..3bef970c644d9970db1e55ed44a14a630a683590 100644 (file)
  * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  *****************************************************************************/
 
+#if !defined (SSE2) && !defined (MMX)
+# define PLAIN
+#endif
+
 /** Number of entries in RGB palette/colormap */
 #define CMAP_RGB2_SIZE 256
 
@@ -35,7 +39,7 @@ struct filter_sys_t
     uint8_t  *p_buffer;
     int *p_offset;
 
-#ifdef MODULE_NAME_IS_i420_rgb
+#ifdef PLAIN
     /**< Pre-calculated conversion tables */
     void *p_base;                      /**< base for all conversion tables */
     uint8_t   *p_rgb8;                 /**< RGB 8 bits table */
@@ -55,12 +59,11 @@ struct filter_sys_t
 /*****************************************************************************
  * Prototypes
  *****************************************************************************/
-#ifdef MODULE_NAME_IS_i420_rgb
+#ifdef PLAIN
 void I420_RGB8         ( filter_t *, picture_t *, picture_t * );
-void I420_RGB16_dither ( filter_t *, picture_t *, picture_t * );
 void I420_RGB16        ( filter_t *, picture_t *, picture_t * );
 void I420_RGB32        ( filter_t *, picture_t *, picture_t * );
-#else // if defined(MODULE_NAME_IS_i420_rgb_mmx)
+#else
 void I420_R5G5B5       ( filter_t *, picture_t *, picture_t * );
 void I420_R5G6B5       ( filter_t *, picture_t *, picture_t * );
 void I420_A8R8G8B8     ( filter_t *, picture_t *, picture_t * );
index 37b19a2604483ed477d1c43e887c331c9e50a5c3..32df817f6caed7cdf17efbbb8e82bdbbe06248bc 100644 (file)
  * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  *****************************************************************************/
 
-/*****************************************************************************
- * Preamble
- *****************************************************************************/
-
 #ifdef HAVE_CONFIG_H
 # include "config.h"
 #endif
 #include <vlc_cpu.h>
 
 #include "i420_rgb.h"
-#if defined (MODULE_NAME_IS_i420_rgb)
-#   include "i420_rgb_c.h"
-#   define VLC_TARGET
-#elif defined (MODULE_NAME_IS_i420_rgb_mmx)
-#   include "i420_rgb_mmx.h"
-#   define VLC_TARGET VLC_MMX
-#elif defined (MODULE_NAME_IS_i420_rgb_sse2)
-#   include "i420_rgb_sse2.h"
-#   define VLC_TARGET VLC_SSE
-#endif
+#include "i420_rgb_c.h"
+
+/*****************************************************************************
+ * SetOffset: build offset array for conversion functions
+ *****************************************************************************
+ * Fills p_offset with horizontal scaling steps and reports, through
+ * pb_hscale and pi_vscale, whether horizontal/vertical scaling is needed.
+ *  - i_width, i_height: dimensions being converted from (callers visible in
+ *    this file pass the filter input format)
+ *  - i_pic_width, i_pic_height: dimensions of the picture being written
+ *  - pb_hscale: set to 0 when both widths are equal, to 1 otherwise
+ *  - pi_vscale: set to 0 (same height), 1 (picture is taller) or -1
+ *    (picture is shorter; the value is stored in an unsigned int)
+ *  - p_offset: written only when the widths differ. When enlarging, each of
+ *    the i_width iterations emits a run of 0 entries followed by a single 1;
+ *    when reducing, each of the i_pic_width entries is a count >= 1.
+ * NOTE(review): the required capacity of p_offset and the way the entries
+ * are consumed (presumably by SCALE_WIDTH) are not visible here - confirm
+ * against the caller and i420_rgb_c.h.
+ *****************************************************************************/
+static void SetOffset( int i_width, int i_height, int i_pic_width,
+                       int i_pic_height, bool *pb_hscale,
+                       unsigned int *pi_vscale, int *p_offset )
+{
+    /*
+     * Prepare horizontal offset array: three cases depending on the sign of
+     * (i_pic_width - i_width)
+     */
+    if( i_pic_width - i_width == 0 )
+    {   /* No horizontal scaling: YUV conversion is done directly to picture */
+        *pb_hscale = 0; /* p_offset is left untouched in this case */
+    }
+    else if( i_pic_width - i_width > 0 )
+    {   /* Prepare scaling array for horizontal extension */
+        int i_scale_count = i_pic_width; /* running remainder counter */
+
+        *pb_hscale = 1;
+        for( int i_x = i_width; i_x--; ) /* runs i_width times */
+        {
+            while( (i_scale_count -= i_width) > 0 )
+            {
+                *p_offset++ = 0; /* one 0 entry per extra step */
+            }
+            *p_offset++ = 1; /* run of zeros is always closed by a 1 */
+            i_scale_count += i_pic_width;
+        }
+    }
+    else /* if( i_pic_width - i_width < 0 ) */
+    {   /* Prepare scaling array for horizontal reduction */
+        int i_scale_count = i_pic_width; /* running remainder counter */
+
+        *pb_hscale = 1;
+        for( int i_x = i_pic_width; i_x--; ) /* runs i_pic_width times */
+        {
+            *p_offset = 1; /* every entry starts at 1 ... */
+            while( (i_scale_count -= i_pic_width) > 0 )
+            {
+                *p_offset += 1; /* ... and grows by one per extra step */
+            }
+            p_offset++;
+            i_scale_count += i_width;
+        }
+    }
 
-static void SetOffset( int, int, int, int, bool *,
-                       unsigned int *, int * );
+    /*
+     * Set vertical scaling indicator: 0 = none, 1 = picture taller,
+     * -1 = picture shorter
+     */
+    if( i_pic_height - i_height == 0 )
+        *pi_vscale = 0;
+    else if( i_pic_height - i_height > 0 )
+        *pi_vscale = 1;
+    else /* if( i_pic_height - i_height < 0 ) */
+        *pi_vscale = -1; /* converts to UINT_MAX: *pi_vscale is unsigned */
+}
 
 /*****************************************************************************
  * I420_RGB16: color YUV 4:2:0 to RGB 16 bpp
@@ -60,8 +104,6 @@ static void SetOffset( int, int, int, int, bool *,
  *  - output: 1 line
  *****************************************************************************/
 
-#if defined (MODULE_NAME_IS_i420_rgb)
-
 void I420_RGB16( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
 {
     /* We got this one from the old arguments */
@@ -154,13 +196,21 @@ void I420_RGB16( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
     }
 }
 
-#else // ! defined (MODULE_NAME_IS_i420_rgb)
+/*****************************************************************************
+ * I420_RGB32: color YUV 4:2:0 to RGB 32 bpp
+ *****************************************************************************
+ * Horizontal alignment needed:
+ *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
+ *  - output: 1 pixel (2 bytes), margins allowed
+ * Vertical alignment needed:
+ *  - input: 2 lines (2 Y lines, 1 U/V line)
+ *  - output: 1 line
+ *****************************************************************************/
 
-VLC_TARGET
-void I420_R5G5B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
+void I420_RGB32( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
 {
     /* We got this one from the old arguments */
-    uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
+    uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
     uint8_t  *p_y   = p_src->Y_PIXELS;
     uint8_t  *p_u   = p_src->U_PIXELS;
     uint8_t  *p_v   = p_src->V_PIXELS;
@@ -173,11 +223,15 @@ void I420_R5G5B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
     int         i_rewind;
     int         i_scale_count;                       /* scale modulo counter */
     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
-    uint16_t *  p_pic_start;       /* beginning of the current line for copy */
+    uint32_t *  p_pic_start;       /* beginning of the current line for copy */
+    int         i_uval, i_vval;                           /* U and V samples */
+    int         i_red, i_green, i_blue;          /* U and V modified samples */
+    uint32_t *  p_yuv = p_filter->p_sys->p_rgb32;
+    uint32_t *  p_ybase;                     /* Y dependant conversion table */
 
     /* Conversion buffer pointer */
-    uint16_t *  p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
-    uint16_t *  p_buffer;
+    uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
+    uint32_t *  p_buffer;
 
     /* Offset array pointer */
     int *       p_offset_start = p_filter->p_sys->p_offset;
@@ -189,237 +243,8 @@ void I420_R5G5B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
                                  - p_src->p[1].i_visible_pitch;
 
     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
-
-    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
-     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
-     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
-    SetOffset( p_filter->fmt_in.video.i_width,
-               p_filter->fmt_in.video.i_height,
-               p_filter->fmt_out.video.i_width,
-               p_filter->fmt_out.video.i_height,
-               &b_hscale, &i_vscale, p_offset_start );
-
-
-    /*
-     * Perform conversion
-     */
-    i_scale_count = ( i_vscale == 1 ) ?
-                    p_filter->fmt_out.video.i_height :
-                    p_filter->fmt_in.video.i_height;
-
-#if defined (MODULE_NAME_IS_i420_rgb_sse2)
-
-    i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
-
-    /*
-    ** SSE2 128 bits fetch/store instructions are faster
-    ** if memory access is 16 bytes aligned
-    */
-
-    p_buffer = b_hscale ? p_buffer_start : p_pic;
-    if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
-                    p_dest->p->i_pitch|
-                    ((intptr_t)p_y)|
-                    ((intptr_t)p_buffer))) )
-    {
-        /* use faster SSE2 aligned fetch and store */
-        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
-        {
-            p_pic_start = p_pic;
-
-            for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
-            {
-                SSE2_CALL (
-                    SSE2_INIT_16_ALIGNED
-                    SSE2_YUV_MUL
-                    SSE2_YUV_ADD
-                    SSE2_UNPACK_15_ALIGNED
-                );
-                p_y += 16;
-                p_u += 8;
-                p_v += 8;
-                p_buffer += 16;
-            }
-            /* Here we do some unaligned reads and duplicate conversions, but
-             * at least we have all the pixels */
-            if( i_rewind )
-            {
-                p_y -= i_rewind;
-                p_u -= i_rewind >> 1;
-                p_v -= i_rewind >> 1;
-                p_buffer -= i_rewind;
-
-                SSE2_CALL (
-                    SSE2_INIT_16_UNALIGNED
-                    SSE2_YUV_MUL
-                    SSE2_YUV_ADD
-                    SSE2_UNPACK_15_UNALIGNED
-                );
-                p_y += 16;
-                p_u += 8;
-                p_v += 8;
-            }
-            SCALE_WIDTH;
-            SCALE_HEIGHT( 420, 2 );
-
-            p_y += i_source_margin;
-            if( i_y % 2 )
-            {
-                p_u += i_source_margin_c;
-                p_v += i_source_margin_c;
-            }
-            p_buffer = b_hscale ? p_buffer_start : p_pic;
-        }
-    }
-    else
-    {
-        /* use slower SSE2 unaligned fetch and store */
-        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
-        {
-            p_pic_start = p_pic;
-            p_buffer = b_hscale ? p_buffer_start : p_pic;
-
-            for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
-            {
-                SSE2_CALL (
-                    SSE2_INIT_16_UNALIGNED
-                    SSE2_YUV_MUL
-                    SSE2_YUV_ADD
-                    SSE2_UNPACK_15_UNALIGNED
-                );
-                p_y += 16;
-                p_u += 8;
-                p_v += 8;
-                p_buffer += 16;
-            }
-            /* Here we do some unaligned reads and duplicate conversions, but
-             * at least we have all the pixels */
-            if( i_rewind )
-            {
-                p_y -= i_rewind;
-                p_u -= i_rewind >> 1;
-                p_v -= i_rewind >> 1;
-                p_buffer -= i_rewind;
-
-                SSE2_CALL (
-                    SSE2_INIT_16_UNALIGNED
-                    SSE2_YUV_MUL
-                    SSE2_YUV_ADD
-                    SSE2_UNPACK_15_UNALIGNED
-                );
-                p_y += 16;
-                p_u += 8;
-                p_v += 8;
-            }
-            SCALE_WIDTH;
-            SCALE_HEIGHT( 420, 2 );
-
-            p_y += i_source_margin;
-            if( i_y % 2 )
-            {
-                p_u += i_source_margin_c;
-                p_v += i_source_margin_c;
-            }
-            p_buffer = b_hscale ? p_buffer_start : p_pic;
-        }
-    }
-
-    /* make sure all SSE2 stores are visible thereafter */
-    SSE2_END;
-
-#else // defined (MODULE_NAME_IS_i420_rgb_mmx)
-
     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
 
-    for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
-    {
-        p_pic_start = p_pic;
-        p_buffer = b_hscale ? p_buffer_start : p_pic;
-
-        for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
-        {
-            MMX_CALL (
-                MMX_INIT_16
-                MMX_YUV_MUL
-                MMX_YUV_ADD
-                MMX_UNPACK_15
-            );
-            p_y += 8;
-            p_u += 4;
-            p_v += 4;
-            p_buffer += 8;
-        }
-
-        /* Here we do some unaligned reads and duplicate conversions, but
-         * at least we have all the pixels */
-        if( i_rewind )
-        {
-            p_y -= i_rewind;
-            p_u -= i_rewind >> 1;
-            p_v -= i_rewind >> 1;
-            p_buffer -= i_rewind;
-
-            MMX_CALL (
-                MMX_INIT_16
-                MMX_YUV_MUL
-                MMX_YUV_ADD
-                MMX_UNPACK_15
-            );
-            p_y += 8;
-            p_u += 4;
-            p_v += 4;
-            p_buffer += 8;
-        }
-        SCALE_WIDTH;
-        SCALE_HEIGHT( 420, 2 );
-
-        p_y += i_source_margin;
-        if( i_y % 2 )
-        {
-            p_u += i_source_margin_c;
-            p_v += i_source_margin_c;
-        }
-    }
-    /* re-enable FPU registers */
-    MMX_END;
-
-#endif
-}
-
-VLC_TARGET
-void I420_R5G6B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
-{
-    /* We got this one from the old arguments */
-    uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
-    uint8_t  *p_y   = p_src->Y_PIXELS;
-    uint8_t  *p_u   = p_src->U_PIXELS;
-    uint8_t  *p_v   = p_src->V_PIXELS;
-
-    bool  b_hscale;                         /* horizontal scaling type */
-    unsigned int i_vscale;                          /* vertical scaling type */
-    unsigned int i_x, i_y;                /* horizontal and vertical indexes */
-
-    int         i_right_margin;
-    int         i_rewind;
-    int         i_scale_count;                       /* scale modulo counter */
-    int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
-    uint16_t *  p_pic_start;       /* beginning of the current line for copy */
-
-    /* Conversion buffer pointer */
-    uint16_t *  p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
-    uint16_t *  p_buffer;
-
-    /* Offset array pointer */
-    int *       p_offset_start = p_filter->p_sys->p_offset;
-    int *       p_offset;
-
-    const int i_source_margin = p_src->p[0].i_pitch
-                                 - p_src->p[0].i_visible_pitch;
-    const int i_source_margin_c = p_src->p[1].i_pitch
-                                 - p_src->p[1].i_visible_pitch;
-
-    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
-
     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
@@ -429,138 +254,12 @@ void I420_R5G6B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
                p_filter->fmt_out.video.i_height,
                &b_hscale, &i_vscale, p_offset_start );
 
-
     /*
      * Perform conversion
      */
     i_scale_count = ( i_vscale == 1 ) ?
                     p_filter->fmt_out.video.i_height :
                     p_filter->fmt_in.video.i_height;
-
-#if defined (MODULE_NAME_IS_i420_rgb_sse2)
-
-    i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
-
-    /*
-    ** SSE2 128 bits fetch/store instructions are faster
-    ** if memory access is 16 bytes aligned
-    */
-
-    p_buffer = b_hscale ? p_buffer_start : p_pic;
-    if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
-                    p_dest->p->i_pitch|
-                    ((intptr_t)p_y)|
-                    ((intptr_t)p_buffer))) )
-    {
-        /* use faster SSE2 aligned fetch and store */
-        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
-        {
-            p_pic_start = p_pic;
-
-            for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
-            {
-                SSE2_CALL (
-                    SSE2_INIT_16_ALIGNED
-                    SSE2_YUV_MUL
-                    SSE2_YUV_ADD
-                    SSE2_UNPACK_16_ALIGNED
-                );
-                p_y += 16;
-                p_u += 8;
-                p_v += 8;
-                p_buffer += 16;
-            }
-            /* Here we do some unaligned reads and duplicate conversions, but
-             * at least we have all the pixels */
-            if( i_rewind )
-            {
-                p_y -= i_rewind;
-                p_u -= i_rewind >> 1;
-                p_v -= i_rewind >> 1;
-                p_buffer -= i_rewind;
-
-                SSE2_CALL (
-                    SSE2_INIT_16_UNALIGNED
-                    SSE2_YUV_MUL
-                    SSE2_YUV_ADD
-                    SSE2_UNPACK_16_UNALIGNED
-                );
-                p_y += 16;
-                p_u += 8;
-                p_v += 8;
-            }
-            SCALE_WIDTH;
-            SCALE_HEIGHT( 420, 2 );
-
-            p_y += i_source_margin;
-            if( i_y % 2 )
-            {
-                p_u += i_source_margin_c;
-                p_v += i_source_margin_c;
-            }
-            p_buffer = b_hscale ? p_buffer_start : p_pic;
-        }
-    }
-    else
-    {
-        /* use slower SSE2 unaligned fetch and store */
-        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
-        {
-            p_pic_start = p_pic;
-            p_buffer = b_hscale ? p_buffer_start : p_pic;
-
-            for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
-            {
-                SSE2_CALL(
-                    SSE2_INIT_16_UNALIGNED
-                    SSE2_YUV_MUL
-                    SSE2_YUV_ADD
-                    SSE2_UNPACK_16_UNALIGNED
-                );
-                p_y += 16;
-                p_u += 8;
-                p_v += 8;
-                p_buffer += 16;
-            }
-            /* Here we do some unaligned reads and duplicate conversions, but
-             * at least we have all the pixels */
-            if( i_rewind )
-            {
-                p_y -= i_rewind;
-                p_u -= i_rewind >> 1;
-                p_v -= i_rewind >> 1;
-                p_buffer -= i_rewind;
-
-                SSE2_CALL(
-                    SSE2_INIT_16_UNALIGNED
-                    SSE2_YUV_MUL
-                    SSE2_YUV_ADD
-                    SSE2_UNPACK_16_UNALIGNED
-                );
-                p_y += 16;
-                p_u += 8;
-                p_v += 8;
-            }
-            SCALE_WIDTH;
-            SCALE_HEIGHT( 420, 2 );
-
-            p_y += i_source_margin;
-            if( i_y % 2 )
-            {
-                p_u += i_source_margin_c;
-                p_v += i_source_margin_c;
-            }
-            p_buffer = b_hscale ? p_buffer_start : p_pic;
-        }
-    }
-
-    /* make sure all SSE2 stores are visible thereafter */
-    SSE2_END;
-
-#else // defined (MODULE_NAME_IS_i420_rgb_mmx)
-
-    i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
-
     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
     {
         p_pic_start = p_pic;
@@ -568,16 +267,10 @@ void I420_R5G6B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
 
         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
         {
-            MMX_CALL (
-                MMX_INIT_16
-                MMX_YUV_MUL
-                MMX_YUV_ADD
-                MMX_UNPACK_16
-            );
-            p_y += 8;
-            p_u += 4;
-            p_v += 4;
-            p_buffer += 8;
+            CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
+            CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
+            CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
+            CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
         }
 
         /* Here we do some unaligned reads and duplicate conversions, but
@@ -588,20 +281,13 @@ void I420_R5G6B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
             p_u -= i_rewind >> 1;
             p_v -= i_rewind >> 1;
             p_buffer -= i_rewind;
-
-            MMX_CALL (
-                MMX_INIT_16
-                MMX_YUV_MUL
-                MMX_YUV_ADD
-                MMX_UNPACK_16
-            );
-            p_y += 8;
-            p_u += 4;
-            p_v += 4;
-            p_buffer += 8;
+            CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
+            CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
+            CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
+            CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
         }
         SCALE_WIDTH;
-        SCALE_HEIGHT( 420, 2 );
+        SCALE_HEIGHT( 420, 4 );
 
         p_y += i_source_margin;
         if( i_y % 2 )
@@ -610,1098 +296,4 @@ void I420_R5G6B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
             p_v += i_source_margin_c;
         }
     }
-    /* re-enable FPU registers */
-    MMX_END;
-
-#endif
 }
-
-#endif
-
-/*****************************************************************************
- * I420_RGB32: color YUV 4:2:0 to RGB 32 bpp
- *****************************************************************************
- * Horizontal alignment needed:
- *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
- *  - output: 1 pixel (2 bytes), margins allowed
- * Vertical alignment needed:
- *  - input: 2 lines (2 Y lines, 1 U/V line)
- *  - output: 1 line
- *****************************************************************************/
-
-#if defined (MODULE_NAME_IS_i420_rgb)
-
-void I420_RGB32( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
-{
-    /* We got this one from the old arguments */
-    uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
-    uint8_t  *p_y   = p_src->Y_PIXELS;
-    uint8_t  *p_u   = p_src->U_PIXELS;
-    uint8_t  *p_v   = p_src->V_PIXELS;
-
-    bool  b_hscale;                         /* horizontal scaling type */
-    unsigned int i_vscale;                          /* vertical scaling type */
-    unsigned int i_x, i_y;                /* horizontal and vertical indexes */
-
-    int         i_right_margin;
-    int         i_rewind;
-    int         i_scale_count;                       /* scale modulo counter */
-    int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
-    uint32_t *  p_pic_start;       /* beginning of the current line for copy */
-    int         i_uval, i_vval;                           /* U and V samples */
-    int         i_red, i_green, i_blue;          /* U and V modified samples */
-    uint32_t *  p_yuv = p_filter->p_sys->p_rgb32;
-    uint32_t *  p_ybase;                     /* Y dependant conversion table */
-
-    /* Conversion buffer pointer */
-    uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
-    uint32_t *  p_buffer;
-
-    /* Offset array pointer */
-    int *       p_offset_start = p_filter->p_sys->p_offset;
-    int *       p_offset;
-
-    const int i_source_margin = p_src->p[0].i_pitch
-                                 - p_src->p[0].i_visible_pitch;
-    const int i_source_margin_c = p_src->p[1].i_pitch
-                                 - p_src->p[1].i_visible_pitch;
-
-    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
-    i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
-
-    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
-     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
-     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
-    SetOffset( p_filter->fmt_in.video.i_width,
-               p_filter->fmt_in.video.i_height,
-               p_filter->fmt_out.video.i_width,
-               p_filter->fmt_out.video.i_height,
-               &b_hscale, &i_vscale, p_offset_start );
-
-    /*
-     * Perform conversion
-     */
-    i_scale_count = ( i_vscale == 1 ) ?
-                    p_filter->fmt_out.video.i_height :
-                    p_filter->fmt_in.video.i_height;
-    for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
-    {
-        p_pic_start = p_pic;
-        p_buffer = b_hscale ? p_buffer_start : p_pic;
-
-        for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
-        {
-            CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
-            CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
-            CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
-            CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
-        }
-
-        /* Here we do some unaligned reads and duplicate conversions, but
-         * at least we have all the pixels */
-        if( i_rewind )
-        {
-            p_y -= i_rewind;
-            p_u -= i_rewind >> 1;
-            p_v -= i_rewind >> 1;
-            p_buffer -= i_rewind;
-            CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
-            CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
-            CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
-            CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
-        }
-        SCALE_WIDTH;
-        SCALE_HEIGHT( 420, 4 );
-
-        p_y += i_source_margin;
-        if( i_y % 2 )
-        {
-            p_u += i_source_margin_c;
-            p_v += i_source_margin_c;
-        }
-    }
-}
-
-#else // defined (MODULE_NAME_IS_i420_rgb_mmx) || defined (MODULE_NAME_IS_i420_rgb_sse2)
-
-VLC_TARGET
-void I420_A8R8G8B8( filter_t *p_filter, picture_t *p_src,
-                                            picture_t *p_dest )
-{
-    /* We got this one from the old arguments */
-    uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
-    uint8_t  *p_y   = p_src->Y_PIXELS;
-    uint8_t  *p_u   = p_src->U_PIXELS;
-    uint8_t  *p_v   = p_src->V_PIXELS;
-
-    bool  b_hscale;                         /* horizontal scaling type */
-    unsigned int i_vscale;                          /* vertical scaling type */
-    unsigned int i_x, i_y;                /* horizontal and vertical indexes */
-
-    int         i_right_margin;
-    int         i_rewind;
-    int         i_scale_count;                       /* scale modulo counter */
-    int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
-    uint32_t *  p_pic_start;       /* beginning of the current line for copy */
-    /* Conversion buffer pointer */
-    uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
-    uint32_t *  p_buffer;
-
-    /* Offset array pointer */
-    int *       p_offset_start = p_filter->p_sys->p_offset;
-    int *       p_offset;
-
-    const int i_source_margin = p_src->p[0].i_pitch
-                                 - p_src->p[0].i_visible_pitch;
-    const int i_source_margin_c = p_src->p[1].i_pitch
-                                 - p_src->p[1].i_visible_pitch;
-
-    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
-
-    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
-     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
-     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
-    SetOffset( p_filter->fmt_in.video.i_width,
-               p_filter->fmt_in.video.i_height,
-               p_filter->fmt_out.video.i_width,
-               p_filter->fmt_out.video.i_height,
-               &b_hscale, &i_vscale, p_offset_start );
-
-    /*
-     * Perform conversion
-     */
-    i_scale_count = ( i_vscale == 1 ) ?
-                    p_filter->fmt_out.video.i_height :
-                    p_filter->fmt_in.video.i_height;
-
-#if defined (MODULE_NAME_IS_i420_rgb_sse2)
-
-    i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
-
-    /*
-    ** SSE2 128 bits fetch/store instructions are faster
-    ** if memory access is 16 bytes aligned
-    */
-
-    p_buffer = b_hscale ? p_buffer_start : p_pic;
-    if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
-                    p_dest->p->i_pitch|
-                    ((intptr_t)p_y)|
-                    ((intptr_t)p_buffer))) )
-    {
-        /* use faster SSE2 aligned fetch and store */
-        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
-        {
-            p_pic_start = p_pic;
-
-            for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
-            {
-                SSE2_CALL (
-                    SSE2_INIT_32_ALIGNED
-                    SSE2_YUV_MUL
-                    SSE2_YUV_ADD
-                    SSE2_UNPACK_32_ARGB_ALIGNED
-                );
-                p_y += 16;
-                p_u += 8;
-                p_v += 8;
-                p_buffer += 16;
-            }
-
-            /* Here we do some unaligned reads and duplicate conversions, but
-             * at least we have all the pixels */
-            if( i_rewind )
-            {
-                p_y -= i_rewind;
-                p_u -= i_rewind >> 1;
-                p_v -= i_rewind >> 1;
-                p_buffer -= i_rewind;
-                SSE2_CALL (
-                    SSE2_INIT_32_UNALIGNED
-                    SSE2_YUV_MUL
-                    SSE2_YUV_ADD
-                    SSE2_UNPACK_32_ARGB_UNALIGNED
-                );
-                p_y += 16;
-                p_u += 4;
-                p_v += 4;
-            }
-            SCALE_WIDTH;
-            SCALE_HEIGHT( 420, 4 );
-
-            p_y += i_source_margin;
-            if( i_y % 2 )
-            {
-                p_u += i_source_margin_c;
-                p_v += i_source_margin_c;
-            }
-            p_buffer = b_hscale ? p_buffer_start : p_pic;
-        }
-    }
-    else
-    {
-        /* use slower SSE2 unaligned fetch and store */
-        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
-        {
-            p_pic_start = p_pic;
-            p_buffer = b_hscale ? p_buffer_start : p_pic;
-
-            for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
-            {
-                SSE2_CALL (
-                    SSE2_INIT_32_UNALIGNED
-                    SSE2_YUV_MUL
-                    SSE2_YUV_ADD
-                    SSE2_UNPACK_32_ARGB_UNALIGNED
-                );
-                p_y += 16;
-                p_u += 8;
-                p_v += 8;
-                p_buffer += 16;
-            }
-
-            /* Here we do some unaligned reads and duplicate conversions, but
-             * at least we have all the pixels */
-            if( i_rewind )
-            {
-                p_y -= i_rewind;
-                p_u -= i_rewind >> 1;
-                p_v -= i_rewind >> 1;
-                p_buffer -= i_rewind;
-                SSE2_CALL (
-                    SSE2_INIT_32_UNALIGNED
-                    SSE2_YUV_MUL
-                    SSE2_YUV_ADD
-                    SSE2_UNPACK_32_ARGB_UNALIGNED
-                );
-                p_y += 16;
-                p_u += 8;
-                p_v += 8;
-            }
-            SCALE_WIDTH;
-            SCALE_HEIGHT( 420, 4 );
-
-            p_y += i_source_margin;
-            if( i_y % 2 )
-            {
-                p_u += i_source_margin_c;
-                p_v += i_source_margin_c;
-            }
-            p_buffer = b_hscale ? p_buffer_start : p_pic;
-        }
-    }
-
-    /* make sure all SSE2 stores are visible thereafter */
-    SSE2_END;
-
-#else // defined (MODULE_NAME_IS_i420_rgb_mmx)
-
-    i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
-
-    for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
-    {
-        p_pic_start = p_pic;
-        p_buffer = b_hscale ? p_buffer_start : p_pic;
-
-        for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
-        {
-            MMX_CALL (
-                MMX_INIT_32
-                MMX_YUV_MUL
-                MMX_YUV_ADD
-                MMX_UNPACK_32_ARGB
-            );
-            p_y += 8;
-            p_u += 4;
-            p_v += 4;
-            p_buffer += 8;
-        }
-
-        /* Here we do some unaligned reads and duplicate conversions, but
-         * at least we have all the pixels */
-        if( i_rewind )
-        {
-            p_y -= i_rewind;
-            p_u -= i_rewind >> 1;
-            p_v -= i_rewind >> 1;
-            p_buffer -= i_rewind;
-            MMX_CALL (
-                MMX_INIT_32
-                MMX_YUV_MUL
-                MMX_YUV_ADD
-                MMX_UNPACK_32_ARGB
-            );
-            p_y += 8;
-            p_u += 4;
-            p_v += 4;
-            p_buffer += 8;
-        }
-        SCALE_WIDTH;
-        SCALE_HEIGHT( 420, 4 );
-
-        p_y += i_source_margin;
-        if( i_y % 2 )
-        {
-            p_u += i_source_margin_c;
-            p_v += i_source_margin_c;
-        }
-    }
-
-    /* re-enable FPU registers */
-    MMX_END;
-
-#endif
-}
-
-VLC_TARGET
-void I420_R8G8B8A8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
-{
-    /* We got this one from the old arguments */
-    uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
-    uint8_t  *p_y   = p_src->Y_PIXELS;
-    uint8_t  *p_u   = p_src->U_PIXELS;
-    uint8_t  *p_v   = p_src->V_PIXELS;
-
-    bool  b_hscale;                         /* horizontal scaling type */
-    unsigned int i_vscale;                          /* vertical scaling type */
-    unsigned int i_x, i_y;                /* horizontal and vertical indexes */
-
-    int         i_right_margin;
-    int         i_rewind;
-    int         i_scale_count;                       /* scale modulo counter */
-    int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
-    uint32_t *  p_pic_start;       /* beginning of the current line for copy */
-    /* Conversion buffer pointer */
-    uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
-    uint32_t *  p_buffer;
-
-    /* Offset array pointer */
-    int *       p_offset_start = p_filter->p_sys->p_offset;
-    int *       p_offset;
-
-    const int i_source_margin = p_src->p[0].i_pitch
-                                 - p_src->p[0].i_visible_pitch;
-    const int i_source_margin_c = p_src->p[1].i_pitch
-                                 - p_src->p[1].i_visible_pitch;
-
-    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
-
-    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
-     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
-     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
-    SetOffset( p_filter->fmt_in.video.i_width,
-               p_filter->fmt_in.video.i_height,
-               p_filter->fmt_out.video.i_width,
-               p_filter->fmt_out.video.i_height,
-               &b_hscale, &i_vscale, p_offset_start );
-
-    /*
-     * Perform conversion
-     */
-    i_scale_count = ( i_vscale == 1 ) ?
-                    p_filter->fmt_out.video.i_height :
-                    p_filter->fmt_in.video.i_height;
-
-#if defined (MODULE_NAME_IS_i420_rgb_sse2)
-
-    i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
-
-    /*
-    ** SSE2 128 bits fetch/store instructions are faster
-    ** if memory access is 16 bytes aligned
-    */
-
-    p_buffer = b_hscale ? p_buffer_start : p_pic;
-    if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
-                    p_dest->p->i_pitch|
-                    ((intptr_t)p_y)|
-                    ((intptr_t)p_buffer))) )
-    {
-        /* use faster SSE2 aligned fetch and store */
-        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
-        {
-            p_pic_start = p_pic;
-
-            for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
-            {
-                SSE2_CALL (
-                    SSE2_INIT_32_ALIGNED
-                    SSE2_YUV_MUL
-                    SSE2_YUV_ADD
-                    SSE2_UNPACK_32_RGBA_ALIGNED
-                );
-                p_y += 16;
-                p_u += 8;
-                p_v += 8;
-                p_buffer += 16;
-            }
-
-            /* Here we do some unaligned reads and duplicate conversions, but
-             * at least we have all the pixels */
-            if( i_rewind )
-            {
-                p_y -= i_rewind;
-                p_u -= i_rewind >> 1;
-                p_v -= i_rewind >> 1;
-                p_buffer -= i_rewind;
-                SSE2_CALL (
-                    SSE2_INIT_32_UNALIGNED
-                    SSE2_YUV_MUL
-                    SSE2_YUV_ADD
-                    SSE2_UNPACK_32_RGBA_UNALIGNED
-                );
-                p_y += 16;
-                p_u += 4;
-                p_v += 4;
-            }
-            SCALE_WIDTH;
-            SCALE_HEIGHT( 420, 4 );
-
-            p_y += i_source_margin;
-            if( i_y % 2 )
-            {
-                p_u += i_source_margin_c;
-                p_v += i_source_margin_c;
-            }
-            p_buffer = b_hscale ? p_buffer_start : p_pic;
-        }
-    }
-    else
-    {
-        /* use slower SSE2 unaligned fetch and store */
-        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
-        {
-            p_pic_start = p_pic;
-            p_buffer = b_hscale ? p_buffer_start : p_pic;
-
-            for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
-            {
-                SSE2_CALL (
-                    SSE2_INIT_32_UNALIGNED
-                    SSE2_YUV_MUL
-                    SSE2_YUV_ADD
-                    SSE2_UNPACK_32_RGBA_UNALIGNED
-                );
-                p_y += 16;
-                p_u += 8;
-                p_v += 8;
-                p_buffer += 16;
-            }
-
-            /* Here we do some unaligned reads and duplicate conversions, but
-             * at least we have all the pixels */
-            if( i_rewind )
-            {
-                p_y -= i_rewind;
-                p_u -= i_rewind >> 1;
-                p_v -= i_rewind >> 1;
-                p_buffer -= i_rewind;
-                SSE2_CALL (
-                    SSE2_INIT_32_UNALIGNED
-                    SSE2_YUV_MUL
-                    SSE2_YUV_ADD
-                    SSE2_UNPACK_32_RGBA_UNALIGNED
-                );
-                p_y += 16;
-                p_u += 8;
-                p_v += 8;
-            }
-            SCALE_WIDTH;
-            SCALE_HEIGHT( 420, 4 );
-
-            p_y += i_source_margin;
-            if( i_y % 2 )
-            {
-                p_u += i_source_margin_c;
-                p_v += i_source_margin_c;
-            }
-            p_buffer = b_hscale ? p_buffer_start : p_pic;
-        }
-    }
-
-    /* make sure all SSE2 stores are visible thereafter */
-    SSE2_END;
-
-#else // defined (MODULE_NAME_IS_i420_rgb_mmx)
-
-    i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
-
-    for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
-    {
-        p_pic_start = p_pic;
-        p_buffer = b_hscale ? p_buffer_start : p_pic;
-
-        for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
-        {
-            MMX_CALL (
-                MMX_INIT_32
-                MMX_YUV_MUL
-                MMX_YUV_ADD
-                MMX_UNPACK_32_RGBA
-            );
-            p_y += 8;
-            p_u += 4;
-            p_v += 4;
-            p_buffer += 8;
-        }
-
-        /* Here we do some unaligned reads and duplicate conversions, but
-         * at least we have all the pixels */
-        if( i_rewind )
-        {
-            p_y -= i_rewind;
-            p_u -= i_rewind >> 1;
-            p_v -= i_rewind >> 1;
-            p_buffer -= i_rewind;
-            MMX_CALL (
-                MMX_INIT_32
-                MMX_YUV_MUL
-                MMX_YUV_ADD
-                MMX_UNPACK_32_RGBA
-            );
-            p_y += 8;
-            p_u += 4;
-            p_v += 4;
-            p_buffer += 8;
-        }
-        SCALE_WIDTH;
-        SCALE_HEIGHT( 420, 4 );
-
-        p_y += i_source_margin;
-        if( i_y % 2 )
-        {
-            p_u += i_source_margin_c;
-            p_v += i_source_margin_c;
-        }
-    }
-
-    /* re-enable FPU registers */
-    MMX_END;
-
-#endif
-}
-
-VLC_TARGET
-void I420_B8G8R8A8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
-{
-    /* We got this one from the old arguments */
-    uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
-    uint8_t  *p_y   = p_src->Y_PIXELS;
-    uint8_t  *p_u   = p_src->U_PIXELS;
-    uint8_t  *p_v   = p_src->V_PIXELS;
-
-    bool  b_hscale;                         /* horizontal scaling type */
-    unsigned int i_vscale;                          /* vertical scaling type */
-    unsigned int i_x, i_y;                /* horizontal and vertical indexes */
-
-    int         i_right_margin;
-    int         i_rewind;
-    int         i_scale_count;                       /* scale modulo counter */
-    int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
-    uint32_t *  p_pic_start;       /* beginning of the current line for copy */
-    /* Conversion buffer pointer */
-    uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
-    uint32_t *  p_buffer;
-
-    /* Offset array pointer */
-    int *       p_offset_start = p_filter->p_sys->p_offset;
-    int *       p_offset;
-
-    const int i_source_margin = p_src->p[0].i_pitch
-                                 - p_src->p[0].i_visible_pitch;
-    const int i_source_margin_c = p_src->p[1].i_pitch
-                                 - p_src->p[1].i_visible_pitch;
-
-    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
-
-    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
-     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
-     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
-    SetOffset( p_filter->fmt_in.video.i_width,
-               p_filter->fmt_in.video.i_height,
-               p_filter->fmt_out.video.i_width,
-               p_filter->fmt_out.video.i_height,
-               &b_hscale, &i_vscale, p_offset_start );
-
-    /*
-     * Perform conversion
-     */
-    i_scale_count = ( i_vscale == 1 ) ?
-                    p_filter->fmt_out.video.i_height :
-                    p_filter->fmt_in.video.i_height;
-
-#if defined (MODULE_NAME_IS_i420_rgb_sse2)
-
-    i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
-
-    /*
-    ** SSE2 128 bits fetch/store instructions are faster
-    ** if memory access is 16 bytes aligned
-    */
-
-    p_buffer = b_hscale ? p_buffer_start : p_pic;
-    if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
-                    p_dest->p->i_pitch|
-                    ((intptr_t)p_y)|
-                    ((intptr_t)p_buffer))) )
-    {
-        /* use faster SSE2 aligned fetch and store */
-        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
-        {
-            p_pic_start = p_pic;
-
-            for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
-            {
-                SSE2_CALL (
-                    SSE2_INIT_32_ALIGNED
-                    SSE2_YUV_MUL
-                    SSE2_YUV_ADD
-                    SSE2_UNPACK_32_BGRA_ALIGNED
-                );
-                p_y += 16;
-                p_u += 8;
-                p_v += 8;
-                p_buffer += 16;
-            }
-
-            /* Here we do some unaligned reads and duplicate conversions, but
-             * at least we have all the pixels */
-            if( i_rewind )
-            {
-                p_y -= i_rewind;
-                p_u -= i_rewind >> 1;
-                p_v -= i_rewind >> 1;
-                p_buffer -= i_rewind;
-                SSE2_CALL (
-                    SSE2_INIT_32_UNALIGNED
-                    SSE2_YUV_MUL
-                    SSE2_YUV_ADD
-                    SSE2_UNPACK_32_BGRA_UNALIGNED
-                );
-                p_y += 16;
-                p_u += 4;
-                p_v += 4;
-            }
-            SCALE_WIDTH;
-            SCALE_HEIGHT( 420, 4 );
-
-            p_y += i_source_margin;
-            if( i_y % 2 )
-            {
-                p_u += i_source_margin_c;
-                p_v += i_source_margin_c;
-            }
-            p_buffer = b_hscale ? p_buffer_start : p_pic;
-        }
-    }
-    else
-    {
-        /* use slower SSE2 unaligned fetch and store */
-        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
-        {
-            p_pic_start = p_pic;
-            p_buffer = b_hscale ? p_buffer_start : p_pic;
-
-            for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
-            {
-                SSE2_CALL (
-                    SSE2_INIT_32_UNALIGNED
-                    SSE2_YUV_MUL
-                    SSE2_YUV_ADD
-                    SSE2_UNPACK_32_BGRA_UNALIGNED
-                );
-                p_y += 16;
-                p_u += 8;
-                p_v += 8;
-                p_buffer += 16;
-            }
-
-            /* Here we do some unaligned reads and duplicate conversions, but
-             * at least we have all the pixels */
-            if( i_rewind )
-            {
-                p_y -= i_rewind;
-                p_u -= i_rewind >> 1;
-                p_v -= i_rewind >> 1;
-                p_buffer -= i_rewind;
-                SSE2_CALL (
-                    SSE2_INIT_32_UNALIGNED
-                    SSE2_YUV_MUL
-                    SSE2_YUV_ADD
-                    SSE2_UNPACK_32_BGRA_UNALIGNED
-                );
-                p_y += 16;
-                p_u += 8;
-                p_v += 8;
-            }
-            SCALE_WIDTH;
-            SCALE_HEIGHT( 420, 4 );
-
-            p_y += i_source_margin;
-            if( i_y % 2 )
-            {
-                p_u += i_source_margin_c;
-                p_v += i_source_margin_c;
-            }
-            p_buffer = b_hscale ? p_buffer_start : p_pic;
-        }
-    }
-
-#else
-
-    i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
-
-    for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
-    {
-        p_pic_start = p_pic;
-        p_buffer = b_hscale ? p_buffer_start : p_pic;
-
-        for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
-        {
-            MMX_CALL (
-                MMX_INIT_32
-                MMX_YUV_MUL
-                MMX_YUV_ADD
-                MMX_UNPACK_32_BGRA
-            );
-            p_y += 8;
-            p_u += 4;
-            p_v += 4;
-            p_buffer += 8;
-        }
-
-        /* Here we do some unaligned reads and duplicate conversions, but
-         * at least we have all the pixels */
-        if( i_rewind )
-        {
-            p_y -= i_rewind;
-            p_u -= i_rewind >> 1;
-            p_v -= i_rewind >> 1;
-            p_buffer -= i_rewind;
-            MMX_CALL (
-                MMX_INIT_32
-                MMX_YUV_MUL
-                MMX_YUV_ADD
-                MMX_UNPACK_32_BGRA
-            );
-            p_y += 8;
-            p_u += 4;
-            p_v += 4;
-            p_buffer += 8;
-        }
-        SCALE_WIDTH;
-        SCALE_HEIGHT( 420, 4 );
-
-        p_y += i_source_margin;
-        if( i_y % 2 )
-        {
-            p_u += i_source_margin_c;
-            p_v += i_source_margin_c;
-        }
-    }
-
-    /* re-enable FPU registers */
-    MMX_END;
-
-#endif
-}
-
-VLC_TARGET
-void I420_A8B8G8R8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
-{
-    /* We got this one from the old arguments */
-    uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
-    uint8_t  *p_y   = p_src->Y_PIXELS;
-    uint8_t  *p_u   = p_src->U_PIXELS;
-    uint8_t  *p_v   = p_src->V_PIXELS;
-
-    bool  b_hscale;                         /* horizontal scaling type */
-    unsigned int i_vscale;                          /* vertical scaling type */
-    unsigned int i_x, i_y;                /* horizontal and vertical indexes */
-
-    int         i_right_margin;
-    int         i_rewind;
-    int         i_scale_count;                       /* scale modulo counter */
-    int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
-    uint32_t *  p_pic_start;       /* beginning of the current line for copy */
-    /* Conversion buffer pointer */
-    uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
-    uint32_t *  p_buffer;
-
-    /* Offset array pointer */
-    int *       p_offset_start = p_filter->p_sys->p_offset;
-    int *       p_offset;
-
-    const int i_source_margin = p_src->p[0].i_pitch
-                                 - p_src->p[0].i_visible_pitch;
-    const int i_source_margin_c = p_src->p[1].i_pitch
-                                 - p_src->p[1].i_visible_pitch;
-
-    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
-
-    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
-     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
-     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
-    SetOffset( p_filter->fmt_in.video.i_width,
-               p_filter->fmt_in.video.i_height,
-               p_filter->fmt_out.video.i_width,
-               p_filter->fmt_out.video.i_height,
-               &b_hscale, &i_vscale, p_offset_start );
-
-    /*
-     * Perform conversion
-     */
-    i_scale_count = ( i_vscale == 1 ) ?
-                    p_filter->fmt_out.video.i_height :
-                    p_filter->fmt_in.video.i_height;
-
-#if defined (MODULE_NAME_IS_i420_rgb_sse2)
-
-    i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
-
-    /*
-    ** SSE2 128 bits fetch/store instructions are faster
-    ** if memory access is 16 bytes aligned
-    */
-
-    p_buffer = b_hscale ? p_buffer_start : p_pic;
-    if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
-                    p_dest->p->i_pitch|
-                    ((intptr_t)p_y)|
-                    ((intptr_t)p_buffer))) )
-    {
-        /* use faster SSE2 aligned fetch and store */
-        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
-        {
-            p_pic_start = p_pic;
-
-            for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
-            {
-                SSE2_CALL (
-                    SSE2_INIT_32_ALIGNED
-                    SSE2_YUV_MUL
-                    SSE2_YUV_ADD
-                    SSE2_UNPACK_32_ABGR_ALIGNED
-                );
-                p_y += 16;
-                p_u += 8;
-                p_v += 8;
-                p_buffer += 16;
-            }
-
-            /* Here we do some unaligned reads and duplicate conversions, but
-             * at least we have all the pixels */
-            if( i_rewind )
-            {
-                p_y -= i_rewind;
-                p_u -= i_rewind >> 1;
-                p_v -= i_rewind >> 1;
-                p_buffer -= i_rewind;
-                SSE2_CALL (
-                    SSE2_INIT_32_UNALIGNED
-                    SSE2_YUV_MUL
-                    SSE2_YUV_ADD
-                    SSE2_UNPACK_32_ABGR_UNALIGNED
-                );
-                p_y += 16;
-                p_u += 4;
-                p_v += 4;
-            }
-            SCALE_WIDTH;
-            SCALE_HEIGHT( 420, 4 );
-
-            p_y += i_source_margin;
-            if( i_y % 2 )
-            {
-                p_u += i_source_margin_c;
-                p_v += i_source_margin_c;
-            }
-            p_buffer = b_hscale ? p_buffer_start : p_pic;
-        }
-    }
-    else
-    {
-        /* use slower SSE2 unaligned fetch and store */
-        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
-        {
-            p_pic_start = p_pic;
-            p_buffer = b_hscale ? p_buffer_start : p_pic;
-
-            for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
-            {
-                SSE2_CALL (
-                    SSE2_INIT_32_UNALIGNED
-                    SSE2_YUV_MUL
-                    SSE2_YUV_ADD
-                    SSE2_UNPACK_32_ABGR_UNALIGNED
-                );
-                p_y += 16;
-                p_u += 8;
-                p_v += 8;
-                p_buffer += 16;
-            }
-
-            /* Here we do some unaligned reads and duplicate conversions, but
-             * at least we have all the pixels */
-            if( i_rewind )
-            {
-                p_y -= i_rewind;
-                p_u -= i_rewind >> 1;
-                p_v -= i_rewind >> 1;
-                p_buffer -= i_rewind;
-                SSE2_CALL (
-                    SSE2_INIT_32_UNALIGNED
-                    SSE2_YUV_MUL
-                    SSE2_YUV_ADD
-                    SSE2_UNPACK_32_ABGR_UNALIGNED
-                );
-                p_y += 16;
-                p_u += 8;
-                p_v += 8;
-            }
-            SCALE_WIDTH;
-            SCALE_HEIGHT( 420, 4 );
-
-            p_y += i_source_margin;
-            if( i_y % 2 )
-            {
-                p_u += i_source_margin_c;
-                p_v += i_source_margin_c;
-            }
-            p_buffer = b_hscale ? p_buffer_start : p_pic;
-        }
-    }
-
-#else
-
-    i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
-
-    for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
-    {
-        p_pic_start = p_pic;
-        p_buffer = b_hscale ? p_buffer_start : p_pic;
-
-        for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
-        {
-            MMX_CALL (
-                MMX_INIT_32
-                MMX_YUV_MUL
-                MMX_YUV_ADD
-                MMX_UNPACK_32_ABGR
-            );
-            p_y += 8;
-            p_u += 4;
-            p_v += 4;
-            p_buffer += 8;
-        }
-
-        /* Here we do some unaligned reads and duplicate conversions, but
-         * at least we have all the pixels */
-        if( i_rewind )
-        {
-            p_y -= i_rewind;
-            p_u -= i_rewind >> 1;
-            p_v -= i_rewind >> 1;
-            p_buffer -= i_rewind;
-            MMX_CALL (
-                MMX_INIT_32
-                MMX_YUV_MUL
-                MMX_YUV_ADD
-                MMX_UNPACK_32_ABGR
-            );
-            p_y += 8;
-            p_u += 4;
-            p_v += 4;
-            p_buffer += 8;
-        }
-        SCALE_WIDTH;
-        SCALE_HEIGHT( 420, 4 );
-
-        p_y += i_source_margin;
-        if( i_y % 2 )
-        {
-            p_u += i_source_margin_c;
-            p_v += i_source_margin_c;
-        }
-    }
-
-    /* re-enable FPU registers */
-    MMX_END;
-
-#endif
-}
-
-#endif
-
-/* Following functions are local */
-
-/*****************************************************************************
- * SetOffset: build offset array for conversion functions
- *****************************************************************************
- * This function will build an offset array used in later conversion functions.
- * It will also set horizontal and vertical scaling indicators.
- *****************************************************************************/
-static void SetOffset( int i_width, int i_height, int i_pic_width,
-                       int i_pic_height, bool *pb_hscale,
-                       unsigned int *pi_vscale, int *p_offset )
-{
-    int i_x;                                    /* x position in destination */
-    int i_scale_count;                                     /* modulo counter */
-
-    /*
-     * Prepare horizontal offset array
-     */
-    if( i_pic_width - i_width == 0 )
-    {
-        /* No horizontal scaling: YUV conversion is done directly to picture */
-        *pb_hscale = 0;
-    }
-    else if( i_pic_width - i_width > 0 )
-    {
-        /* Prepare scaling array for horizontal extension */
-        *pb_hscale = 1;
-        i_scale_count = i_pic_width;
-        for( i_x = i_width; i_x--; )
-        {
-            while( (i_scale_count -= i_width) > 0 )
-            {
-                *p_offset++ = 0;
-            }
-            *p_offset++ = 1;
-            i_scale_count += i_pic_width;
-        }
-    }
-    else /* if( i_pic_width - i_width < 0 ) */
-    {
-        /* Prepare scaling array for horizontal reduction */
-        *pb_hscale = 1;
-        i_scale_count = i_width;
-        for( i_x = i_pic_width; i_x--; )
-        {
-            *p_offset = 1;
-            while( (i_scale_count -= i_pic_width) > 0 )
-            {
-                *p_offset += 1;
-            }
-            p_offset++;
-            i_scale_count += i_width;
-        }
-    }
-
-    /*
-     * Set vertical scaling indicator
-     */
-    if( i_pic_height - i_height == 0 )
-    {
-        *pi_vscale = 0;
-    }
-    else if( i_pic_height - i_height > 0 )
-    {
-        *pi_vscale = 1;
-    }
-    else /* if( i_pic_height - i_height < 0 ) */
-    {
-        *pi_vscale = -1;
-    }
-}
-
diff --git a/modules/video_chroma/i420_rgb16_x86.c b/modules/video_chroma/i420_rgb16_x86.c
new file mode 100644 (file)
index 0000000..087f90f
--- /dev/null
@@ -0,0 +1,1467 @@
+/*****************************************************************************
+ * i420_rgb16_x86.c : YUV to bitmap RGB conversion module for vlc
+ *****************************************************************************
+ * Copyright (C) 2000 VLC authors and VideoLAN
+ * $Id$
+ *
+ * Authors: Samuel Hocevar <sam@zoy.org>
+ *          Damien Fouilleul <damienf@videolan.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ *****************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <vlc_common.h>
+#include <vlc_filter.h>
+#include <vlc_cpu.h>
+
+#include "i420_rgb.h"
+#ifdef SSE2
+# include "i420_rgb_sse2.h"
+# define VLC_TARGET VLC_SSE
+#else
+# include "i420_rgb_mmx.h"
+# define VLC_TARGET VLC_MMX
+#endif
+
+/*****************************************************************************
+ * SetOffset: build offset array for conversion functions
+ *****************************************************************************
+ * This function will build an offset array used in later conversion functions.
+ * It will also set horizontal and vertical scaling indicators.
+ *
+ * i_width, i_height:         dimensions of the source picture
+ * i_pic_width, i_pic_height: dimensions of the destination picture
+ * pb_hscale: receives 0 if both widths are equal, 1 otherwise
+ * pi_vscale: receives 0 (same height), 1 (destination is taller) or
+ *            (unsigned)-1 (destination is shorter)
+ * p_offset:  offset array to fill; it is left untouched when no horizontal
+ *            scaling is needed.
+ *            - enlargement: every source pixel contributes a run of zeros
+ *              terminated by a 1;
+ *            - reduction: one entry per destination pixel, holding the
+ *              number of source pixels to step over (always >= 1).
+ *            NOTE(review): the consumer is not in this file; presumably the
+ *            SCALE_WIDTH macro uses these values as source increments.
+ *****************************************************************************/
+static void SetOffset( int i_width, int i_height, int i_pic_width,
+                       int i_pic_height, bool *pb_hscale,
+                       unsigned int *pi_vscale, int *p_offset )
+{
+    /*
+     * Prepare horizontal offset array
+     */
+    if( i_pic_width - i_width == 0 )
+    {   /* No horizontal scaling: YUV conversion is done directly to picture */
+        *pb_hscale = 0;
+    }
+    else if( i_pic_width - i_width > 0 )
+    {   /* Prepare scaling array for horizontal extension */
+        /* Error accumulator: starts at the destination width, loses one
+         * source width per emitted destination pixel */
+        int i_scale_count = i_pic_width;
+
+        *pb_hscale = 1;
+        for( int i_x = i_width; i_x--; )
+        {
+            while( (i_scale_count -= i_width) > 0 )
+            {
+                *p_offset++ = 0;
+            }
+            *p_offset++ = 1;
+            i_scale_count += i_pic_width;
+        }
+    }
+    else /* if( i_pic_width - i_width < 0 ) */
+    {   /* Prepare scaling array for horizontal reduction */
+        /* Mirror image of the extension case: the accumulator must start at
+         * the source width. Starting at the destination width forces the
+         * first entry to 1 and skews the sampling (4 -> 2 pixels would give
+         * {1,2} instead of {2,2}). */
+        int i_scale_count = i_width;
+
+        *pb_hscale = 1;
+        for( int i_x = i_pic_width; i_x--; )
+        {
+            *p_offset = 1;
+            while( (i_scale_count -= i_pic_width) > 0 )
+            {
+                *p_offset += 1;
+            }
+            p_offset++;
+            i_scale_count += i_width;
+        }
+    }
+
+    /*
+     * Set vertical scaling indicator
+     */
+    if( i_pic_height - i_height == 0 )
+        *pi_vscale = 0;
+    else if( i_pic_height - i_height > 0 )
+        *pi_vscale = 1;
+    else /* if( i_pic_height - i_height < 0 ) */
+        *pi_vscale = -1;     /* wraps to UINT_MAX: pi_vscale is unsigned */
+}
+
+/*****************************************************************************
+ * I420_R5G5B5: convert an I420 picture to 16-bit pixels packed by the
+ *              *_UNPACK_15 macros
+ *****************************************************************************
+ * p_filter: supplies the source/destination dimensions (fmt_in/fmt_out) and
+ *           the scratch line buffer and offset array kept in p_sys
+ * p_src:    source picture; its Y, U and V planes are read
+ * p_dest:   destination picture; its first plane is written
+ *
+ * Each source line is converted 16 pixels (SSE2 build) or 8 pixels (MMX
+ * build) at a time, either directly into the destination or, when horizontal
+ * scaling is required, into the scratch buffer. SCALE_WIDTH and SCALE_HEIGHT
+ * then finish the line.
+ * NOTE(review): the conversion and scaling macros are defined outside this
+ * file and appear to use the locals below by name (several of them are never
+ * referenced explicitly here) - do not rename or remove them without
+ * checking the macro definitions.
+ *****************************************************************************/
+VLC_TARGET
+void I420_R5G5B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
+{
+    /* We got this one from the old arguments */
+    uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
+    uint8_t  *p_y   = p_src->Y_PIXELS;
+    uint8_t  *p_u   = p_src->U_PIXELS;
+    uint8_t  *p_v   = p_src->V_PIXELS;
+
+    bool  b_hscale;                         /* horizontal scaling type */
+    unsigned int i_vscale;                          /* vertical scaling type */
+    unsigned int i_x, i_y;                /* horizontal and vertical indexes */
+
+    int         i_right_margin;
+    int         i_rewind;
+    int         i_scale_count;                       /* scale modulo counter */
+    int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
+    uint16_t *  p_pic_start;       /* beginning of the current line for copy */
+
+    /* Conversion buffer pointer */
+    uint16_t *  p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
+    uint16_t *  p_buffer;
+
+    /* Offset array pointer */
+    int *       p_offset_start = p_filter->p_sys->p_offset;
+    int *       p_offset;
+
+    /* Padding bytes at the end of each luma / chroma source line */
+    const int i_source_margin = p_src->p[0].i_pitch
+                                 - p_src->p[0].i_visible_pitch;
+    const int i_source_margin_c = p_src->p[1].i_pitch
+                                 - p_src->p[1].i_visible_pitch;
+
+    /* Padding bytes at the end of each destination line */
+    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
+
+    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
+     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
+     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
+    SetOffset( p_filter->fmt_in.video.i_width,
+               p_filter->fmt_in.video.i_height,
+               p_filter->fmt_out.video.i_width,
+               p_filter->fmt_out.video.i_height,
+               &b_hscale, &i_vscale, p_offset_start );
+
+
+    /*
+     * Perform conversion
+     */
+    i_scale_count = ( i_vscale == 1 ) ?
+                    p_filter->fmt_out.video.i_height :
+                    p_filter->fmt_in.video.i_height;
+
+#ifdef SSE2
+
+    /* Number of pixels by which the width falls short of a multiple of 16;
+     * the last group of each line is stepped back by that amount */
+    i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
+
+    /*
+    ** SSE2 128 bits fetch/store instructions are faster
+    ** if memory access is 16 bytes aligned
+    */
+
+    p_buffer = b_hscale ? p_buffer_start : p_pic;
+    /* Aligned path requires both pitches and both start addresses to be
+     * multiples of 16 */
+    if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
+                    p_dest->p->i_pitch|
+                    ((intptr_t)p_y)|
+                    ((intptr_t)p_buffer))) )
+    {
+        /* use faster SSE2 aligned fetch and store */
+        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+        {
+            p_pic_start = p_pic;
+
+            for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
+            {
+                SSE2_CALL (
+                    SSE2_INIT_16_ALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_15_ALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+                p_buffer += 16;
+            }
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+
+                SSE2_CALL (
+                    SSE2_INIT_16_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_15_UNALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 2 );
+
+            p_y += i_source_margin;
+            /* Chroma line padding is skipped after odd lines only.
+             * NOTE(review): presumably SCALE_HEIGHT rewinds p_u/p_v after
+             * even lines so two luma lines share one chroma line - confirm
+             * against the macro definition. */
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            /* Output position for the next line */
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+    else
+    {
+        /* use slower SSE2 unaligned fetch and store */
+        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+        {
+            p_pic_start = p_pic;
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+
+            for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
+            {
+                SSE2_CALL (
+                    SSE2_INIT_16_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_15_UNALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+                p_buffer += 16;
+            }
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+
+                SSE2_CALL (
+                    SSE2_INIT_16_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_15_UNALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 2 );
+
+            p_y += i_source_margin;
+            /* Chroma line padding is skipped after odd lines only */
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+
+    /* make sure all SSE2 stores are visible thereafter */
+    SSE2_END;
+
+#else /* SSE2 */
+
+    /* Number of pixels by which the width falls short of a multiple of 8;
+     * the last group of each line is stepped back by that amount */
+    i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
+
+    for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+    {
+        p_pic_start = p_pic;
+        p_buffer = b_hscale ? p_buffer_start : p_pic;
+
+        for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
+        {
+            MMX_CALL (
+                MMX_INIT_16
+                MMX_YUV_MUL
+                MMX_YUV_ADD
+                MMX_UNPACK_15
+            );
+            p_y += 8;
+            p_u += 4;
+            p_v += 4;
+            p_buffer += 8;
+        }
+
+        /* Here we do some unaligned reads and duplicate conversions, but
+         * at least we have all the pixels */
+        if( i_rewind )
+        {
+            p_y -= i_rewind;
+            p_u -= i_rewind >> 1;
+            p_v -= i_rewind >> 1;
+            p_buffer -= i_rewind;
+
+            MMX_CALL (
+                MMX_INIT_16
+                MMX_YUV_MUL
+                MMX_YUV_ADD
+                MMX_UNPACK_15
+            );
+            p_y += 8;
+            p_u += 4;
+            p_v += 4;
+            p_buffer += 8;
+        }
+        SCALE_WIDTH;
+        SCALE_HEIGHT( 420, 2 );
+
+        p_y += i_source_margin;
+        /* Chroma line padding is skipped after odd lines only */
+        if( i_y % 2 )
+        {
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+        }
+    }
+    /* re-enable FPU registers */
+    MMX_END;
+
+#endif /* SSE2 */
+}
+
+/*****************************************************************************
+ * I420_R5G6B5: convert an I420 picture to 16-bit pixels packed by the
+ *              *_UNPACK_16 macros
+ *****************************************************************************
+ * p_filter: supplies the source/destination dimensions (fmt_in/fmt_out) and
+ *           the scratch line buffer and offset array kept in p_sys
+ * p_src:    source picture; its Y, U and V planes are read
+ * p_dest:   destination picture; its first plane is written
+ *
+ * Each source line is converted 16 pixels (SSE2 build) or 8 pixels (MMX
+ * build) at a time, either directly into the destination or, when horizontal
+ * scaling is required, into the scratch buffer. SCALE_WIDTH and SCALE_HEIGHT
+ * then finish the line.
+ * NOTE(review): the conversion and scaling macros are defined outside this
+ * file and appear to use the locals below by name (several of them are never
+ * referenced explicitly here) - do not rename or remove them without
+ * checking the macro definitions.
+ *****************************************************************************/
+VLC_TARGET
+void I420_R5G6B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
+{
+    /* We got this one from the old arguments */
+    uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
+    uint8_t  *p_y   = p_src->Y_PIXELS;
+    uint8_t  *p_u   = p_src->U_PIXELS;
+    uint8_t  *p_v   = p_src->V_PIXELS;
+
+    bool  b_hscale;                         /* horizontal scaling type */
+    unsigned int i_vscale;                          /* vertical scaling type */
+    unsigned int i_x, i_y;                /* horizontal and vertical indexes */
+
+    int         i_right_margin;
+    int         i_rewind;
+    int         i_scale_count;                       /* scale modulo counter */
+    int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
+    uint16_t *  p_pic_start;       /* beginning of the current line for copy */
+
+    /* Conversion buffer pointer */
+    uint16_t *  p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
+    uint16_t *  p_buffer;
+
+    /* Offset array pointer */
+    int *       p_offset_start = p_filter->p_sys->p_offset;
+    int *       p_offset;
+
+    /* Padding bytes at the end of each luma / chroma source line */
+    const int i_source_margin = p_src->p[0].i_pitch
+                                 - p_src->p[0].i_visible_pitch;
+    const int i_source_margin_c = p_src->p[1].i_pitch
+                                 - p_src->p[1].i_visible_pitch;
+
+    /* Padding bytes at the end of each destination line */
+    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
+
+    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
+     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
+     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
+    SetOffset( p_filter->fmt_in.video.i_width,
+               p_filter->fmt_in.video.i_height,
+               p_filter->fmt_out.video.i_width,
+               p_filter->fmt_out.video.i_height,
+               &b_hscale, &i_vscale, p_offset_start );
+
+
+    /*
+     * Perform conversion
+     */
+    i_scale_count = ( i_vscale == 1 ) ?
+                    p_filter->fmt_out.video.i_height :
+                    p_filter->fmt_in.video.i_height;
+
+#ifdef SSE2
+
+    /* Number of pixels by which the width falls short of a multiple of 16;
+     * the last group of each line is stepped back by that amount */
+    i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
+
+    /*
+    ** SSE2 128 bits fetch/store instructions are faster
+    ** if memory access is 16 bytes aligned
+    */
+
+    p_buffer = b_hscale ? p_buffer_start : p_pic;
+    /* Aligned path requires both pitches and both start addresses to be
+     * multiples of 16 */
+    if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
+                    p_dest->p->i_pitch|
+                    ((intptr_t)p_y)|
+                    ((intptr_t)p_buffer))) )
+    {
+        /* use faster SSE2 aligned fetch and store */
+        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+        {
+            p_pic_start = p_pic;
+
+            for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
+            {
+                SSE2_CALL (
+                    SSE2_INIT_16_ALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_16_ALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+                p_buffer += 16;
+            }
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+
+                SSE2_CALL (
+                    SSE2_INIT_16_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_16_UNALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 2 );
+
+            p_y += i_source_margin;
+            /* Chroma line padding is skipped after odd lines only.
+             * NOTE(review): presumably SCALE_HEIGHT rewinds p_u/p_v after
+             * even lines so two luma lines share one chroma line - confirm
+             * against the macro definition. */
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            /* Output position for the next line */
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+    else
+    {
+        /* use slower SSE2 unaligned fetch and store */
+        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+        {
+            p_pic_start = p_pic;
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+
+            for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
+            {
+                SSE2_CALL(
+                    SSE2_INIT_16_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_16_UNALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+                p_buffer += 16;
+            }
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+
+                SSE2_CALL(
+                    SSE2_INIT_16_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_16_UNALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 2 );
+
+            p_y += i_source_margin;
+            /* Chroma line padding is skipped after odd lines only */
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+
+    /* make sure all SSE2 stores are visible thereafter */
+    SSE2_END;
+
+#else /* SSE2 */
+
+    /* Number of pixels by which the width falls short of a multiple of 8;
+     * the last group of each line is stepped back by that amount */
+    i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
+
+    for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+    {
+        p_pic_start = p_pic;
+        p_buffer = b_hscale ? p_buffer_start : p_pic;
+
+        for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
+        {
+            MMX_CALL (
+                MMX_INIT_16
+                MMX_YUV_MUL
+                MMX_YUV_ADD
+                MMX_UNPACK_16
+            );
+            p_y += 8;
+            p_u += 4;
+            p_v += 4;
+            p_buffer += 8;
+        }
+
+        /* Here we do some unaligned reads and duplicate conversions, but
+         * at least we have all the pixels */
+        if( i_rewind )
+        {
+            p_y -= i_rewind;
+            p_u -= i_rewind >> 1;
+            p_v -= i_rewind >> 1;
+            p_buffer -= i_rewind;
+
+            MMX_CALL (
+                MMX_INIT_16
+                MMX_YUV_MUL
+                MMX_YUV_ADD
+                MMX_UNPACK_16
+            );
+            p_y += 8;
+            p_u += 4;
+            p_v += 4;
+            p_buffer += 8;
+        }
+        SCALE_WIDTH;
+        SCALE_HEIGHT( 420, 2 );
+
+        p_y += i_source_margin;
+        /* Chroma line padding is skipped after odd lines only */
+        if( i_y % 2 )
+        {
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+        }
+    }
+    /* re-enable FPU registers */
+    MMX_END;
+
+#endif /* SSE2 */
+}
+
+/*****************************************************************************
+ * I420_A8R8G8B8: convert an I420 picture to 32-bit pixels packed by the
+ *                *_UNPACK_32_ARGB macros
+ *****************************************************************************
+ * p_filter: supplies the source/destination dimensions (fmt_in/fmt_out) and
+ *           the scratch line buffer and offset array kept in p_sys
+ * p_src:    source picture; its Y, U and V planes are read
+ * p_dest:   destination picture; its first plane is written
+ *
+ * Each source line is converted 16 pixels (SSE2 build) or 8 pixels (MMX
+ * build) at a time, either directly into the destination or, when horizontal
+ * scaling is required, into the scratch buffer. SCALE_WIDTH and SCALE_HEIGHT
+ * then finish the line.
+ * NOTE(review): the conversion and scaling macros are defined outside this
+ * file and appear to use the locals below by name (several of them are never
+ * referenced explicitly here) - do not rename or remove them without
+ * checking the macro definitions.
+ *****************************************************************************/
+VLC_TARGET
+void I420_A8R8G8B8( filter_t *p_filter, picture_t *p_src,
+                                            picture_t *p_dest )
+{
+    /* We got this one from the old arguments */
+    uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
+    uint8_t  *p_y   = p_src->Y_PIXELS;
+    uint8_t  *p_u   = p_src->U_PIXELS;
+    uint8_t  *p_v   = p_src->V_PIXELS;
+
+    bool  b_hscale;                         /* horizontal scaling type */
+    unsigned int i_vscale;                          /* vertical scaling type */
+    unsigned int i_x, i_y;                /* horizontal and vertical indexes */
+
+    int         i_right_margin;
+    int         i_rewind;
+    int         i_scale_count;                       /* scale modulo counter */
+    int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
+    uint32_t *  p_pic_start;       /* beginning of the current line for copy */
+    /* Conversion buffer pointer */
+    uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
+    uint32_t *  p_buffer;
+
+    /* Offset array pointer */
+    int *       p_offset_start = p_filter->p_sys->p_offset;
+    int *       p_offset;
+
+    /* Padding bytes at the end of each luma / chroma source line */
+    const int i_source_margin = p_src->p[0].i_pitch
+                                 - p_src->p[0].i_visible_pitch;
+    const int i_source_margin_c = p_src->p[1].i_pitch
+                                 - p_src->p[1].i_visible_pitch;
+
+    /* Padding bytes at the end of each destination line */
+    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
+
+    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
+     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
+     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
+    SetOffset( p_filter->fmt_in.video.i_width,
+               p_filter->fmt_in.video.i_height,
+               p_filter->fmt_out.video.i_width,
+               p_filter->fmt_out.video.i_height,
+               &b_hscale, &i_vscale, p_offset_start );
+
+    /*
+     * Perform conversion
+     */
+    i_scale_count = ( i_vscale == 1 ) ?
+                    p_filter->fmt_out.video.i_height :
+                    p_filter->fmt_in.video.i_height;
+
+#ifdef SSE2
+
+    /* Number of pixels by which the width falls short of a multiple of 16;
+     * the last group of each line is stepped back by that amount */
+    i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
+
+    /*
+    ** SSE2 128 bits fetch/store instructions are faster
+    ** if memory access is 16 bytes aligned
+    */
+
+    p_buffer = b_hscale ? p_buffer_start : p_pic;
+    /* Aligned path requires both pitches and both start addresses to be
+     * multiples of 16 */
+    if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
+                    p_dest->p->i_pitch|
+                    ((intptr_t)p_y)|
+                    ((intptr_t)p_buffer))) )
+    {
+        /* use faster SSE2 aligned fetch and store */
+        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+        {
+            p_pic_start = p_pic;
+
+            for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
+            {
+                SSE2_CALL (
+                    SSE2_INIT_32_ALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_ARGB_ALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+                p_buffer += 16;
+            }
+
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+                SSE2_CALL (
+                    SSE2_INIT_32_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_ARGB_UNALIGNED
+                );
+                /* 16 luma samples consume 8 chroma samples, exactly as in
+                 * the main loop; advancing by less would leave p_u/p_v
+                 * short of the end of the chroma line */
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 4 );
+
+            p_y += i_source_margin;
+            /* Chroma line padding is skipped after odd lines only.
+             * NOTE(review): presumably SCALE_HEIGHT rewinds p_u/p_v after
+             * even lines so two luma lines share one chroma line - confirm
+             * against the macro definition. */
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            /* Output position for the next line */
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+    else
+    {
+        /* use slower SSE2 unaligned fetch and store */
+        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+        {
+            p_pic_start = p_pic;
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+
+            for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
+            {
+                SSE2_CALL (
+                    SSE2_INIT_32_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_ARGB_UNALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+                p_buffer += 16;
+            }
+
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+                SSE2_CALL (
+                    SSE2_INIT_32_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_ARGB_UNALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 4 );
+
+            p_y += i_source_margin;
+            /* Chroma line padding is skipped after odd lines only */
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+
+    /* make sure all SSE2 stores are visible thereafter */
+    SSE2_END;
+
+#else
+
+    /* Number of pixels by which the width falls short of a multiple of 8;
+     * the last group of each line is stepped back by that amount */
+    i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
+
+    for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+    {
+        p_pic_start = p_pic;
+        p_buffer = b_hscale ? p_buffer_start : p_pic;
+
+        for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
+        {
+            MMX_CALL (
+                MMX_INIT_32
+                MMX_YUV_MUL
+                MMX_YUV_ADD
+                MMX_UNPACK_32_ARGB
+            );
+            p_y += 8;
+            p_u += 4;
+            p_v += 4;
+            p_buffer += 8;
+        }
+
+        /* Here we do some unaligned reads and duplicate conversions, but
+         * at least we have all the pixels */
+        if( i_rewind )
+        {
+            p_y -= i_rewind;
+            p_u -= i_rewind >> 1;
+            p_v -= i_rewind >> 1;
+            p_buffer -= i_rewind;
+            MMX_CALL (
+                MMX_INIT_32
+                MMX_YUV_MUL
+                MMX_YUV_ADD
+                MMX_UNPACK_32_ARGB
+            );
+            p_y += 8;
+            p_u += 4;
+            p_v += 4;
+            p_buffer += 8;
+        }
+        SCALE_WIDTH;
+        SCALE_HEIGHT( 420, 4 );
+
+        p_y += i_source_margin;
+        /* Chroma line padding is skipped after odd lines only */
+        if( i_y % 2 )
+        {
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+        }
+    }
+
+    /* re-enable FPU registers */
+    MMX_END;
+
+#endif
+}
+
+/*****************************************************************************
+ * I420_R8G8B8A8: convert an I420 picture to 32-bit pixels packed by the
+ *                *_UNPACK_32_RGBA macros
+ *****************************************************************************
+ * p_filter: supplies the source/destination dimensions (fmt_in/fmt_out) and
+ *           the scratch line buffer and offset array kept in p_sys
+ * p_src:    source picture; its Y, U and V planes are read
+ * p_dest:   destination picture; its first plane is written
+ *
+ * Each source line is converted 16 pixels (SSE2 build) or 8 pixels (MMX
+ * build) at a time, either directly into the destination or, when horizontal
+ * scaling is required, into the scratch buffer. SCALE_WIDTH and SCALE_HEIGHT
+ * then finish the line.
+ * NOTE(review): the conversion and scaling macros are defined outside this
+ * file and appear to use the locals below by name (several of them are never
+ * referenced explicitly here) - do not rename or remove them without
+ * checking the macro definitions.
+ *****************************************************************************/
+VLC_TARGET
+void I420_R8G8B8A8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
+{
+    /* We got this one from the old arguments */
+    uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
+    uint8_t  *p_y   = p_src->Y_PIXELS;
+    uint8_t  *p_u   = p_src->U_PIXELS;
+    uint8_t  *p_v   = p_src->V_PIXELS;
+
+    bool  b_hscale;                         /* horizontal scaling type */
+    unsigned int i_vscale;                          /* vertical scaling type */
+    unsigned int i_x, i_y;                /* horizontal and vertical indexes */
+
+    int         i_right_margin;
+    int         i_rewind;
+    int         i_scale_count;                       /* scale modulo counter */
+    int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
+    uint32_t *  p_pic_start;       /* beginning of the current line for copy */
+    /* Conversion buffer pointer */
+    uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
+    uint32_t *  p_buffer;
+
+    /* Offset array pointer */
+    int *       p_offset_start = p_filter->p_sys->p_offset;
+    int *       p_offset;
+
+    /* Padding bytes at the end of each luma / chroma source line */
+    const int i_source_margin = p_src->p[0].i_pitch
+                                 - p_src->p[0].i_visible_pitch;
+    const int i_source_margin_c = p_src->p[1].i_pitch
+                                 - p_src->p[1].i_visible_pitch;
+
+    /* Padding bytes at the end of each destination line */
+    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
+
+    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
+     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
+     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
+    SetOffset( p_filter->fmt_in.video.i_width,
+               p_filter->fmt_in.video.i_height,
+               p_filter->fmt_out.video.i_width,
+               p_filter->fmt_out.video.i_height,
+               &b_hscale, &i_vscale, p_offset_start );
+
+    /*
+     * Perform conversion
+     */
+    i_scale_count = ( i_vscale == 1 ) ?
+                    p_filter->fmt_out.video.i_height :
+                    p_filter->fmt_in.video.i_height;
+
+#ifdef SSE2
+
+    /* Number of pixels by which the width falls short of a multiple of 16;
+     * the last group of each line is stepped back by that amount */
+    i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
+
+    /*
+    ** SSE2 128 bits fetch/store instructions are faster
+    ** if memory access is 16 bytes aligned
+    */
+
+    p_buffer = b_hscale ? p_buffer_start : p_pic;
+    /* Aligned path requires both pitches and both start addresses to be
+     * multiples of 16 */
+    if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
+                    p_dest->p->i_pitch|
+                    ((intptr_t)p_y)|
+                    ((intptr_t)p_buffer))) )
+    {
+        /* use faster SSE2 aligned fetch and store */
+        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+        {
+            p_pic_start = p_pic;
+
+            for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
+            {
+                SSE2_CALL (
+                    SSE2_INIT_32_ALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_RGBA_ALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+                p_buffer += 16;
+            }
+
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+                SSE2_CALL (
+                    SSE2_INIT_32_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_RGBA_UNALIGNED
+                );
+                /* 16 luma samples consume 8 chroma samples, exactly as in
+                 * the main loop; advancing by less would leave p_u/p_v
+                 * short of the end of the chroma line */
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 4 );
+
+            p_y += i_source_margin;
+            /* Chroma line padding is skipped after odd lines only.
+             * NOTE(review): presumably SCALE_HEIGHT rewinds p_u/p_v after
+             * even lines so two luma lines share one chroma line - confirm
+             * against the macro definition. */
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            /* Output position for the next line */
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+    else
+    {
+        /* use slower SSE2 unaligned fetch and store */
+        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+        {
+            p_pic_start = p_pic;
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+
+            for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
+            {
+                SSE2_CALL (
+                    SSE2_INIT_32_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_RGBA_UNALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+                p_buffer += 16;
+            }
+
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+                SSE2_CALL (
+                    SSE2_INIT_32_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_RGBA_UNALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 4 );
+
+            p_y += i_source_margin;
+            /* Chroma line padding is skipped after odd lines only */
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+
+    /* make sure all SSE2 stores are visible thereafter */
+    SSE2_END;
+
+#else
+
+    /* Number of pixels by which the width falls short of a multiple of 8;
+     * the last group of each line is stepped back by that amount */
+    i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
+
+    for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+    {
+        p_pic_start = p_pic;
+        p_buffer = b_hscale ? p_buffer_start : p_pic;
+
+        for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
+        {
+            MMX_CALL (
+                MMX_INIT_32
+                MMX_YUV_MUL
+                MMX_YUV_ADD
+                MMX_UNPACK_32_RGBA
+            );
+            p_y += 8;
+            p_u += 4;
+            p_v += 4;
+            p_buffer += 8;
+        }
+
+        /* Here we do some unaligned reads and duplicate conversions, but
+         * at least we have all the pixels */
+        if( i_rewind )
+        {
+            p_y -= i_rewind;
+            p_u -= i_rewind >> 1;
+            p_v -= i_rewind >> 1;
+            p_buffer -= i_rewind;
+            MMX_CALL (
+                MMX_INIT_32
+                MMX_YUV_MUL
+                MMX_YUV_ADD
+                MMX_UNPACK_32_RGBA
+            );
+            p_y += 8;
+            p_u += 4;
+            p_v += 4;
+            p_buffer += 8;
+        }
+        SCALE_WIDTH;
+        SCALE_HEIGHT( 420, 4 );
+
+        p_y += i_source_margin;
+        /* Chroma line padding is skipped after odd lines only */
+        if( i_y % 2 )
+        {
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+        }
+    }
+
+    /* re-enable FPU registers */
+    MMX_END;
+
+#endif
+}
+
+VLC_TARGET
+void I420_B8G8R8A8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
+{
+    /* We got this one from the old arguments */
+    uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
+    uint8_t  *p_y   = p_src->Y_PIXELS;
+    uint8_t  *p_u   = p_src->U_PIXELS;
+    uint8_t  *p_v   = p_src->V_PIXELS;
+
+    bool  b_hscale;                         /* horizontal scaling type */
+    unsigned int i_vscale;                          /* vertical scaling type */
+    unsigned int i_x, i_y;                /* horizontal and vertical indexes */
+
+    int         i_right_margin;
+    int         i_rewind;
+    int         i_scale_count;                       /* scale modulo counter */
+    int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
+    uint32_t *  p_pic_start;       /* beginning of the current line for copy */
+    /* Conversion buffer pointer */
+    uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
+    uint32_t *  p_buffer;
+
+    /* Offset array pointer */
+    int *       p_offset_start = p_filter->p_sys->p_offset;
+    int *       p_offset;
+
+    const int i_source_margin = p_src->p[0].i_pitch
+                                 - p_src->p[0].i_visible_pitch;
+    const int i_source_margin_c = p_src->p[1].i_pitch
+                                 - p_src->p[1].i_visible_pitch;
+
+    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
+
+    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
+     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
+     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
+    SetOffset( p_filter->fmt_in.video.i_width,
+               p_filter->fmt_in.video.i_height,
+               p_filter->fmt_out.video.i_width,
+               p_filter->fmt_out.video.i_height,
+               &b_hscale, &i_vscale, p_offset_start );
+
+    /*
+     * Perform conversion
+     */
+    i_scale_count = ( i_vscale == 1 ) ?
+                    p_filter->fmt_out.video.i_height :
+                    p_filter->fmt_in.video.i_height;
+
+#ifdef SSE2
+
+    i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
+
+    /*
+    ** SSE2 128 bits fetch/store instructions are faster
+    ** if memory access is 16 bytes aligned
+    */
+
+    p_buffer = b_hscale ? p_buffer_start : p_pic;
+    if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
+                    p_dest->p->i_pitch|
+                    ((intptr_t)p_y)|
+                    ((intptr_t)p_buffer))) )
+    {
+        /* use faster SSE2 aligned fetch and store */
+        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+        {
+            p_pic_start = p_pic;
+
+            for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
+            {
+                SSE2_CALL (
+                    SSE2_INIT_32_ALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_BGRA_ALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+                p_buffer += 16;
+            }
+
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+                SSE2_CALL (
+                    SSE2_INIT_32_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_BGRA_UNALIGNED
+                );
+                p_y += 16;
+                p_u += 4;
+                p_v += 4;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 4 );
+
+            p_y += i_source_margin;
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+    else
+    {
+        /* use slower SSE2 unaligned fetch and store */
+        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+        {
+            p_pic_start = p_pic;
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+
+            for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
+            {
+                SSE2_CALL (
+                    SSE2_INIT_32_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_BGRA_UNALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+                p_buffer += 16;
+            }
+
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+                SSE2_CALL (
+                    SSE2_INIT_32_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_BGRA_UNALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 4 );
+
+            p_y += i_source_margin;
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+
+#else
+
+    i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
+
+    for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+    {
+        p_pic_start = p_pic;
+        p_buffer = b_hscale ? p_buffer_start : p_pic;
+
+        for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
+        {
+            MMX_CALL (
+                MMX_INIT_32
+                MMX_YUV_MUL
+                MMX_YUV_ADD
+                MMX_UNPACK_32_BGRA
+            );
+            p_y += 8;
+            p_u += 4;
+            p_v += 4;
+            p_buffer += 8;
+        }
+
+        /* Here we do some unaligned reads and duplicate conversions, but
+         * at least we have all the pixels */
+        if( i_rewind )
+        {
+            p_y -= i_rewind;
+            p_u -= i_rewind >> 1;
+            p_v -= i_rewind >> 1;
+            p_buffer -= i_rewind;
+            MMX_CALL (
+                MMX_INIT_32
+                MMX_YUV_MUL
+                MMX_YUV_ADD
+                MMX_UNPACK_32_BGRA
+            );
+            p_y += 8;
+            p_u += 4;
+            p_v += 4;
+            p_buffer += 8;
+        }
+        SCALE_WIDTH;
+        SCALE_HEIGHT( 420, 4 );
+
+        p_y += i_source_margin;
+        if( i_y % 2 )
+        {
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+        }
+    }
+
+    /* re-enable FPU registers */
+    MMX_END;
+
+#endif
+}
+
+VLC_TARGET
+void I420_A8B8G8R8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
+{
+    /* We got this one from the old arguments */
+    uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
+    uint8_t  *p_y   = p_src->Y_PIXELS;
+    uint8_t  *p_u   = p_src->U_PIXELS;
+    uint8_t  *p_v   = p_src->V_PIXELS;
+
+    bool  b_hscale;                         /* horizontal scaling type */
+    unsigned int i_vscale;                          /* vertical scaling type */
+    unsigned int i_x, i_y;                /* horizontal and vertical indexes */
+
+    int         i_right_margin;
+    int         i_rewind;
+    int         i_scale_count;                       /* scale modulo counter */
+    int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
+    uint32_t *  p_pic_start;       /* beginning of the current line for copy */
+    /* Conversion buffer pointer */
+    uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
+    uint32_t *  p_buffer;
+
+    /* Offset array pointer */
+    int *       p_offset_start = p_filter->p_sys->p_offset;
+    int *       p_offset;
+
+    const int i_source_margin = p_src->p[0].i_pitch
+                                 - p_src->p[0].i_visible_pitch;
+    const int i_source_margin_c = p_src->p[1].i_pitch
+                                 - p_src->p[1].i_visible_pitch;
+
+    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
+
+    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
+     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
+     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
+    SetOffset( p_filter->fmt_in.video.i_width,
+               p_filter->fmt_in.video.i_height,
+               p_filter->fmt_out.video.i_width,
+               p_filter->fmt_out.video.i_height,
+               &b_hscale, &i_vscale, p_offset_start );
+
+    /*
+     * Perform conversion
+     */
+    i_scale_count = ( i_vscale == 1 ) ?
+                    p_filter->fmt_out.video.i_height :
+                    p_filter->fmt_in.video.i_height;
+
+#ifdef SSE2
+
+    i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
+
+    /*
+    ** SSE2 128 bits fetch/store instructions are faster
+    ** if memory access is 16 bytes aligned
+    */
+
+    p_buffer = b_hscale ? p_buffer_start : p_pic;
+    if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
+                    p_dest->p->i_pitch|
+                    ((intptr_t)p_y)|
+                    ((intptr_t)p_buffer))) )
+    {
+        /* use faster SSE2 aligned fetch and store */
+        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+        {
+            p_pic_start = p_pic;
+
+            for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
+            {
+                SSE2_CALL (
+                    SSE2_INIT_32_ALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_ABGR_ALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+                p_buffer += 16;
+            }
+
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+                SSE2_CALL (
+                    SSE2_INIT_32_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_ABGR_UNALIGNED
+                );
+                p_y += 16;
+                p_u += 4;
+                p_v += 4;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 4 );
+
+            p_y += i_source_margin;
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+    else
+    {
+        /* use slower SSE2 unaligned fetch and store */
+        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+        {
+            p_pic_start = p_pic;
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+
+            for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
+            {
+                SSE2_CALL (
+                    SSE2_INIT_32_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_ABGR_UNALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+                p_buffer += 16;
+            }
+
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+                SSE2_CALL (
+                    SSE2_INIT_32_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_ABGR_UNALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 4 );
+
+            p_y += i_source_margin;
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+
+#else
+
+    i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
+
+    for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+    {
+        p_pic_start = p_pic;
+        p_buffer = b_hscale ? p_buffer_start : p_pic;
+
+        for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
+        {
+            MMX_CALL (
+                MMX_INIT_32
+                MMX_YUV_MUL
+                MMX_YUV_ADD
+                MMX_UNPACK_32_ABGR
+            );
+            p_y += 8;
+            p_u += 4;
+            p_v += 4;
+            p_buffer += 8;
+        }
+
+        /* Here we do some unaligned reads and duplicate conversions, but
+         * at least we have all the pixels */
+        if( i_rewind )
+        {
+            p_y -= i_rewind;
+            p_u -= i_rewind >> 1;
+            p_v -= i_rewind >> 1;
+            p_buffer -= i_rewind;
+            MMX_CALL (
+                MMX_INIT_32
+                MMX_YUV_MUL
+                MMX_YUV_ADD
+                MMX_UNPACK_32_ABGR
+            );
+            p_y += 8;
+            p_u += 4;
+            p_v += 4;
+            p_buffer += 8;
+        }
+        SCALE_WIDTH;
+        SCALE_HEIGHT( 420, 4 );
+
+        p_y += i_source_margin;
+        if( i_y % 2 )
+        {
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+        }
+    }
+
+    /* re-enable FPU registers */
+    MMX_END;
+
+#endif
+}