# MMX
libi420_rgb_mmx_plugin_la_SOURCES = i420_rgb.c i420_rgb.h \
- i420_rgb16.c i420_rgb_mmx.h
-libi420_rgb_mmx_plugin_la_CPPFLAGS = $(AM_CPPFLAGS)
+ i420_rgb16_x86.c i420_rgb_mmx.h
+libi420_rgb_mmx_plugin_la_CPPFLAGS = $(AM_CPPFLAGS) -DMMX
libi420_yuy2_mmx_plugin_la_SOURCES = i420_yuy2.c i420_yuy2.h
libi420_yuy2_mmx_plugin_la_CPPFLAGS = $(AM_CPPFLAGS)
# SSE2
libi420_rgb_sse2_plugin_la_SOURCES = i420_rgb.c i420_rgb.h \
- i420_rgb16.c i420_rgb_sse2.h
-libi420_rgb_sse2_plugin_la_CPPFLAGS = $(AM_CPPFLAGS)
+ i420_rgb16_x86.c i420_rgb_sse2.h
+libi420_rgb_sse2_plugin_la_CPPFLAGS = $(AM_CPPFLAGS) -DSSE2
libi420_yuy2_sse2_plugin_la_SOURCES = i420_yuy2.c i420_yuy2.h
libi420_yuy2_sse2_plugin_la_CPPFLAGS = $(AM_CPPFLAGS)
#include <vlc_cpu.h>
#include "i420_rgb.h"
-#if defined (MODULE_NAME_IS_i420_rgb)
-# include "i420_rgb_c.h"
- static picture_t *I420_RGB8_Filter ( filter_t *, picture_t * );
-// static picture_t *I420_RGB16_dither_Filter ( filter_t *, picture_t * );
- static picture_t *I420_RGB16_Filter ( filter_t *, picture_t * );
- static picture_t *I420_RGB32_Filter ( filter_t *, picture_t * );
+#ifdef PLAIN
+# include "i420_rgb_c.h"
+static picture_t *I420_RGB8_Filter( filter_t *, picture_t * );
+static picture_t *I420_RGB16_Filter( filter_t *, picture_t * );
+static picture_t *I420_RGB32_Filter( filter_t *, picture_t * );
+
+static void SetGammaTable( int *pi_table, double f_gamma );
+static void SetYUV( filter_t * );
+static void Set8bppPalette( filter_t *, uint8_t * );
#else
- static picture_t *I420_R5G5B5_Filter ( filter_t *, picture_t * );
- static picture_t *I420_R5G6B5_Filter ( filter_t *, picture_t * );
- static picture_t *I420_A8R8G8B8_Filter ( filter_t *, picture_t * );
- static picture_t *I420_R8G8B8A8_Filter ( filter_t *, picture_t * );
- static picture_t *I420_B8G8R8A8_Filter ( filter_t *, picture_t * );
- static picture_t *I420_A8B8G8R8_Filter ( filter_t *, picture_t * );
+static picture_t *I420_R5G5B5_Filter( filter_t *, picture_t * );
+static picture_t *I420_R5G6B5_Filter( filter_t *, picture_t * );
+static picture_t *I420_A8R8G8B8_Filter( filter_t *, picture_t * );
+static picture_t *I420_R8G8B8A8_Filter( filter_t *, picture_t * );
+static picture_t *I420_B8G8R8A8_Filter( filter_t *, picture_t * );
+static picture_t *I420_A8B8G8R8_Filter( filter_t *, picture_t * );
#endif
/*****************************************************************************
<< p_filter->fmt_out.video.i_lbshift))
/*****************************************************************************
- * Local and extern prototypes.
+ * Module descriptor.
*****************************************************************************/
static int Activate ( vlc_object_t * );
static void Deactivate ( vlc_object_t * );
-#if defined (MODULE_NAME_IS_i420_rgb)
-static void SetGammaTable ( int *pi_table, double f_gamma );
-static void SetYUV ( filter_t * );
-static void Set8bppPalette ( filter_t *, uint8_t * );
-#endif
-
-/*****************************************************************************
- * Module descriptor.
- *****************************************************************************/
vlc_module_begin ()
-#if defined (MODULE_NAME_IS_i420_rgb)
- set_description( N_("I420,IYUV,YV12 to "
- "RGB2,RV15,RV16,RV24,RV32 conversions") )
- set_capability( "video filter2", 80 )
-# define vlc_CPU_capable() (true)
-#elif defined (MODULE_NAME_IS_i420_rgb_mmx)
- set_description( N_( "MMX I420,IYUV,YV12 to "
- "RV15,RV16,RV24,RV32 conversions") )
- set_capability( "video filter2", 100 )
-# define vlc_CPU_capable() vlc_CPU_MMX()
-#elif defined (MODULE_NAME_IS_i420_rgb_sse2)
+#if defined (SSE2)
set_description( N_( "SSE2 I420,IYUV,YV12 to "
"RV15,RV16,RV24,RV32 conversions") )
set_capability( "video filter2", 120 )
# define vlc_CPU_capable() vlc_CPU_SSE2()
+#elif defined (MMX)
+ set_description( N_( "MMX I420,IYUV,YV12 to "
+ "RV15,RV16,RV24,RV32 conversions") )
+ set_capability( "video filter2", 100 )
+# define vlc_CPU_capable() vlc_CPU_MMX()
+#else
+ set_description( N_("I420,IYUV,YV12 to "
+ "RGB2,RV15,RV16,RV24,RV32 conversions") )
+ set_capability( "video filter2", 80 )
+# define vlc_CPU_capable() (true)
#endif
set_callbacks( Activate, Deactivate )
vlc_module_end ()
static int Activate( vlc_object_t *p_this )
{
filter_t *p_filter = (filter_t *)p_this;
-#if defined (MODULE_NAME_IS_i420_rgb)
+#ifdef PLAIN
size_t i_tables_size;
#endif
case VLC_CODEC_I420:
switch( p_filter->fmt_out.video.i_chroma )
{
-#if defined (MODULE_NAME_IS_i420_rgb)
- case VLC_CODEC_RGB8:
- p_filter->pf_video_filter = I420_RGB8_Filter;
- break;
-#endif
+#ifndef PLAIN
case VLC_CODEC_RGB15:
case VLC_CODEC_RGB16:
-#if ! defined (MODULE_NAME_IS_i420_rgb)
/* If we don't have support for the bitmasks, bail out */
if( ( p_filter->fmt_out.video.i_rmask == 0x7c00
&& p_filter->fmt_out.video.i_gmask == 0x03e0
}
else
return VLC_EGENERIC;
-#else
- // generic C chroma converter */
- p_filter->pf_video_filter = I420_RGB16_Filter;
-#endif
break;
-
-#if 0
- /* Hmmm, is there only X11 using 32bits per pixel for RV24 ? */
- case VLC_CODEC_RGB24:
-#endif
-
case VLC_CODEC_RGB32:
-#if ! defined (MODULE_NAME_IS_i420_rgb)
/* If we don't have support for the bitmasks, bail out */
if( p_filter->fmt_out.video.i_rmask == 0x00ff0000
&& p_filter->fmt_out.video.i_gmask == 0x0000ff00
}
else
return VLC_EGENERIC;
+ break;
#else
- /* generic C chroma converter */
+ case VLC_CODEC_RGB8:
+ p_filter->pf_video_filter = I420_RGB8_Filter;
+ break;
+ case VLC_CODEC_RGB15:
+ case VLC_CODEC_RGB16:
+ p_filter->pf_video_filter = I420_RGB16_Filter;
+ break;
+ case VLC_CODEC_RGB32:
p_filter->pf_video_filter = I420_RGB32_Filter;
-#endif
break;
-
+#endif
default:
return VLC_EGENERIC;
}
switch( p_filter->fmt_out.video.i_chroma )
{
-#if defined (MODULE_NAME_IS_i420_rgb)
+#ifdef PLAIN
case VLC_CODEC_RGB8:
p_filter->p_sys->p_buffer = malloc( VOUT_MAX_WIDTH );
break;
#endif
-
case VLC_CODEC_RGB15:
case VLC_CODEC_RGB16:
p_filter->p_sys->p_buffer = malloc( VOUT_MAX_WIDTH * 2 );
break;
-
case VLC_CODEC_RGB24:
case VLC_CODEC_RGB32:
p_filter->p_sys->p_buffer = malloc( VOUT_MAX_WIDTH * 4 );
break;
-
default:
p_filter->p_sys->p_buffer = NULL;
break;
return VLC_EGENERIC;
}
-#if defined (MODULE_NAME_IS_i420_rgb)
+#ifdef PLAIN
switch( p_filter->fmt_out.video.i_chroma )
{
case VLC_CODEC_RGB8:
{
filter_t *p_filter = (filter_t *)p_this;
-#if defined (MODULE_NAME_IS_i420_rgb)
+#ifdef PLAIN
free( p_filter->p_sys->p_base );
#endif
free( p_filter->p_sys->p_offset );
free( p_filter->p_sys );
}
-#if defined (MODULE_NAME_IS_i420_rgb)
-VIDEO_FILTER_WRAPPER( I420_RGB8 )
-VIDEO_FILTER_WRAPPER( I420_RGB16 )
-//VIDEO_FILTER_WRAPPER( I420_RGB16_dither )
-VIDEO_FILTER_WRAPPER( I420_RGB32 )
-#else
+#ifndef PLAIN
VIDEO_FILTER_WRAPPER( I420_R5G5B5 )
VIDEO_FILTER_WRAPPER( I420_R5G6B5 )
VIDEO_FILTER_WRAPPER( I420_A8R8G8B8 )
VIDEO_FILTER_WRAPPER( I420_R8G8B8A8 )
VIDEO_FILTER_WRAPPER( I420_B8G8R8A8 )
VIDEO_FILTER_WRAPPER( I420_A8B8G8R8 )
-#endif
+#else
+VIDEO_FILTER_WRAPPER( I420_RGB8 )
+VIDEO_FILTER_WRAPPER( I420_RGB16 )
+VIDEO_FILTER_WRAPPER( I420_RGB32 )
-#if defined (MODULE_NAME_IS_i420_rgb)
/*****************************************************************************
* SetGammaTable: return intensity table transformed by gamma curve.
*****************************************************************************
}
}
}
-
#endif
-
* Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
*****************************************************************************/
+#if !defined (SSE2) && !defined (MMX)
+# define PLAIN
+#endif
+
/** Number of entries in RGB palette/colormap */
#define CMAP_RGB2_SIZE 256
uint8_t *p_buffer;
int *p_offset;
-#ifdef MODULE_NAME_IS_i420_rgb
+#ifdef PLAIN
/**< Pre-calculated conversion tables */
void *p_base; /**< base for all conversion tables */
uint8_t *p_rgb8; /**< RGB 8 bits table */
/*****************************************************************************
* Prototypes
*****************************************************************************/
-#ifdef MODULE_NAME_IS_i420_rgb
+#ifdef PLAIN
void I420_RGB8 ( filter_t *, picture_t *, picture_t * );
-void I420_RGB16_dither ( filter_t *, picture_t *, picture_t * );
void I420_RGB16 ( filter_t *, picture_t *, picture_t * );
void I420_RGB32 ( filter_t *, picture_t *, picture_t * );
-#else // if defined(MODULE_NAME_IS_i420_rgb_mmx)
+#else
void I420_R5G5B5 ( filter_t *, picture_t *, picture_t * );
void I420_R5G6B5 ( filter_t *, picture_t *, picture_t * );
void I420_A8R8G8B8 ( filter_t *, picture_t *, picture_t * );
* Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
*****************************************************************************/
-/*****************************************************************************
- * Preamble
- *****************************************************************************/
-
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#include <vlc_cpu.h>
#include "i420_rgb.h"
-#if defined (MODULE_NAME_IS_i420_rgb)
-# include "i420_rgb_c.h"
-# define VLC_TARGET
-#elif defined (MODULE_NAME_IS_i420_rgb_mmx)
-# include "i420_rgb_mmx.h"
-# define VLC_TARGET VLC_MMX
-#elif defined (MODULE_NAME_IS_i420_rgb_sse2)
-# include "i420_rgb_sse2.h"
-# define VLC_TARGET VLC_SSE
-#endif
+#include "i420_rgb_c.h"
+
+/*****************************************************************************
+ * SetOffset: build offset array for conversion functions
+ *****************************************************************************
+ * This function will build an offset array used in later conversion functions.
+ * It will also set horizontal and vertical scaling indicators.
+ *
+ * Parameters:
+ *  - i_width, i_height: source dimensions (the call sites visible alongside
+ *    this function pass the input format's width and height).
+ *  - i_pic_width, i_pic_height: destination dimensions (those call sites pass
+ *    the output format's width and height).
+ *  - pb_hscale: set to 0 when both widths are equal, to 1 otherwise.
+ *  - pi_vscale: set to 0 when both heights are equal, to 1 when the
+ *    destination is taller, and to -1 when it is shorter. The pointee is an
+ *    unsigned int, so the -1 is stored as UINT_MAX.
+ *  - p_offset: output array, written only when the widths differ. No bounds
+ *    are checked here, so the caller must supply an array that is large
+ *    enough.
+ *****************************************************************************/
+static void SetOffset( int i_width, int i_height, int i_pic_width,
+ int i_pic_height, bool *pb_hscale,
+ unsigned int *pi_vscale, int *p_offset )
+{
+ /*
+ * Prepare horizontal offset array
+ *
+ * Three cases, selected by the sign of i_pic_width - i_width: equal widths,
+ * horizontal extension and horizontal reduction.
+ */
+ if( i_pic_width - i_width == 0 )
+ { /* No horizontal scaling: YUV conversion is done directly to picture.
+ * p_offset is left untouched in this case. */
+ *pb_hscale = 0;
+ }
+ else if( i_pic_width - i_width > 0 )
+ { /* Prepare scaling array for horizontal extension.
+ * One pass per source pixel: entries are 0 while the running counter
+ * stays positive after subtracting i_width, then a single 1 is written
+ * and the counter is topped up by i_pic_width. */
+ int i_scale_count = i_pic_width;
+
+ *pb_hscale = 1;
+ for( int i_x = i_width; i_x--; )
+ {
+ while( (i_scale_count -= i_width) > 0 )
+ {
+ *p_offset++ = 0;
+ }
+ *p_offset++ = 1;
+ i_scale_count += i_pic_width;
+ }
+ }
+ else /* if( i_pic_width - i_width < 0 ) */
+ { /* Prepare scaling array for horizontal reduction.
+ * Exactly i_pic_width entries are written, one per pass. Each entry
+ * starts at 1 and is incremented while the running counter stays
+ * positive after subtracting i_pic_width; the counter is then topped up
+ * by i_width. */
+ int i_scale_count = i_pic_width;
+
+ *pb_hscale = 1;
+ for( int i_x = i_pic_width; i_x--; )
+ {
+ *p_offset = 1;
+ while( (i_scale_count -= i_pic_width) > 0 )
+ {
+ *p_offset += 1;
+ }
+ p_offset++;
+ i_scale_count += i_width;
+ }
+ }
-static void SetOffset( int, int, int, int, bool *,
- unsigned int *, int * );
+ /*
+ * Set vertical scaling indicator
+ *
+ * 0: same height, 1: destination taller, -1: destination shorter.
+ */
+ if( i_pic_height - i_height == 0 )
+ *pi_vscale = 0;
+ else if( i_pic_height - i_height > 0 )
+ *pi_vscale = 1;
+ else /* if( i_pic_height - i_height < 0 ) */
+ *pi_vscale = -1; /* unsigned destination: the stored value is UINT_MAX */
+}
/*****************************************************************************
* I420_RGB16: color YUV 4:2:0 to RGB 16 bpp
* - output: 1 line
*****************************************************************************/
-#if defined (MODULE_NAME_IS_i420_rgb)
-
void I420_RGB16( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
{
/* We got this one from the old arguments */
}
}
-#else // ! defined (MODULE_NAME_IS_i420_rgb)
+/*****************************************************************************
+ * I420_RGB32: color YUV 4:2:0 to RGB 32 bpp
+ *****************************************************************************
+ * Horizontal alignment needed:
+ * - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
+ * - output: 1 pixel (4 bytes), margins allowed
+ * Vertical alignment needed:
+ * - input: 2 lines (2 Y lines, 1 U/V line)
+ * - output: 1 line
+ *****************************************************************************/
-VLC_TARGET
-void I420_R5G5B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
+void I420_RGB32( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
{
/* We got this one from the old arguments */
- uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
+ uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
uint8_t *p_y = p_src->Y_PIXELS;
uint8_t *p_u = p_src->U_PIXELS;
uint8_t *p_v = p_src->V_PIXELS;
int i_rewind;
int i_scale_count; /* scale modulo counter */
int i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
- uint16_t * p_pic_start; /* beginning of the current line for copy */
+ uint32_t * p_pic_start; /* beginning of the current line for copy */
+ int i_uval, i_vval; /* U and V samples */
+ int i_red, i_green, i_blue; /* U and V modified samples */
+ uint32_t * p_yuv = p_filter->p_sys->p_rgb32;
+ uint32_t * p_ybase; /* Y dependant conversion table */
/* Conversion buffer pointer */
- uint16_t * p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
- uint16_t * p_buffer;
+ uint32_t * p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
+ uint32_t * p_buffer;
/* Offset array pointer */
int * p_offset_start = p_filter->p_sys->p_offset;
- p_src->p[1].i_visible_pitch;
i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
-
- /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
- * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
- * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
- SetOffset( p_filter->fmt_in.video.i_width,
- p_filter->fmt_in.video.i_height,
- p_filter->fmt_out.video.i_width,
- p_filter->fmt_out.video.i_height,
- &b_hscale, &i_vscale, p_offset_start );
-
-
- /*
- * Perform conversion
- */
- i_scale_count = ( i_vscale == 1 ) ?
- p_filter->fmt_out.video.i_height :
- p_filter->fmt_in.video.i_height;
-
-#if defined (MODULE_NAME_IS_i420_rgb_sse2)
-
- i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
-
- /*
- ** SSE2 128 bits fetch/store instructions are faster
- ** if memory access is 16 bytes aligned
- */
-
- p_buffer = b_hscale ? p_buffer_start : p_pic;
- if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
- p_dest->p->i_pitch|
- ((intptr_t)p_y)|
- ((intptr_t)p_buffer))) )
- {
- /* use faster SSE2 aligned fetch and store */
- for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
- {
- p_pic_start = p_pic;
-
- for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
- {
- SSE2_CALL (
- SSE2_INIT_16_ALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_15_ALIGNED
- );
- p_y += 16;
- p_u += 8;
- p_v += 8;
- p_buffer += 16;
- }
- /* Here we do some unaligned reads and duplicate conversions, but
- * at least we have all the pixels */
- if( i_rewind )
- {
- p_y -= i_rewind;
- p_u -= i_rewind >> 1;
- p_v -= i_rewind >> 1;
- p_buffer -= i_rewind;
-
- SSE2_CALL (
- SSE2_INIT_16_UNALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_15_UNALIGNED
- );
- p_y += 16;
- p_u += 8;
- p_v += 8;
- }
- SCALE_WIDTH;
- SCALE_HEIGHT( 420, 2 );
-
- p_y += i_source_margin;
- if( i_y % 2 )
- {
- p_u += i_source_margin_c;
- p_v += i_source_margin_c;
- }
- p_buffer = b_hscale ? p_buffer_start : p_pic;
- }
- }
- else
- {
- /* use slower SSE2 unaligned fetch and store */
- for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
- {
- p_pic_start = p_pic;
- p_buffer = b_hscale ? p_buffer_start : p_pic;
-
- for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
- {
- SSE2_CALL (
- SSE2_INIT_16_UNALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_15_UNALIGNED
- );
- p_y += 16;
- p_u += 8;
- p_v += 8;
- p_buffer += 16;
- }
- /* Here we do some unaligned reads and duplicate conversions, but
- * at least we have all the pixels */
- if( i_rewind )
- {
- p_y -= i_rewind;
- p_u -= i_rewind >> 1;
- p_v -= i_rewind >> 1;
- p_buffer -= i_rewind;
-
- SSE2_CALL (
- SSE2_INIT_16_UNALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_15_UNALIGNED
- );
- p_y += 16;
- p_u += 8;
- p_v += 8;
- }
- SCALE_WIDTH;
- SCALE_HEIGHT( 420, 2 );
-
- p_y += i_source_margin;
- if( i_y % 2 )
- {
- p_u += i_source_margin_c;
- p_v += i_source_margin_c;
- }
- p_buffer = b_hscale ? p_buffer_start : p_pic;
- }
- }
-
- /* make sure all SSE2 stores are visible thereafter */
- SSE2_END;
-
-#else // defined (MODULE_NAME_IS_i420_rgb_mmx)
-
i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
- for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
- {
- p_pic_start = p_pic;
- p_buffer = b_hscale ? p_buffer_start : p_pic;
-
- for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
- {
- MMX_CALL (
- MMX_INIT_16
- MMX_YUV_MUL
- MMX_YUV_ADD
- MMX_UNPACK_15
- );
- p_y += 8;
- p_u += 4;
- p_v += 4;
- p_buffer += 8;
- }
-
- /* Here we do some unaligned reads and duplicate conversions, but
- * at least we have all the pixels */
- if( i_rewind )
- {
- p_y -= i_rewind;
- p_u -= i_rewind >> 1;
- p_v -= i_rewind >> 1;
- p_buffer -= i_rewind;
-
- MMX_CALL (
- MMX_INIT_16
- MMX_YUV_MUL
- MMX_YUV_ADD
- MMX_UNPACK_15
- );
- p_y += 8;
- p_u += 4;
- p_v += 4;
- p_buffer += 8;
- }
- SCALE_WIDTH;
- SCALE_HEIGHT( 420, 2 );
-
- p_y += i_source_margin;
- if( i_y % 2 )
- {
- p_u += i_source_margin_c;
- p_v += i_source_margin_c;
- }
- }
- /* re-enable FPU registers */
- MMX_END;
-
-#endif
-}
-
-VLC_TARGET
-void I420_R5G6B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
-{
- /* We got this one from the old arguments */
- uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
- uint8_t *p_y = p_src->Y_PIXELS;
- uint8_t *p_u = p_src->U_PIXELS;
- uint8_t *p_v = p_src->V_PIXELS;
-
- bool b_hscale; /* horizontal scaling type */
- unsigned int i_vscale; /* vertical scaling type */
- unsigned int i_x, i_y; /* horizontal and vertical indexes */
-
- int i_right_margin;
- int i_rewind;
- int i_scale_count; /* scale modulo counter */
- int i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
- uint16_t * p_pic_start; /* beginning of the current line for copy */
-
- /* Conversion buffer pointer */
- uint16_t * p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
- uint16_t * p_buffer;
-
- /* Offset array pointer */
- int * p_offset_start = p_filter->p_sys->p_offset;
- int * p_offset;
-
- const int i_source_margin = p_src->p[0].i_pitch
- - p_src->p[0].i_visible_pitch;
- const int i_source_margin_c = p_src->p[1].i_pitch
- - p_src->p[1].i_visible_pitch;
-
- i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
-
/* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
* on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
* then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
p_filter->fmt_out.video.i_height,
&b_hscale, &i_vscale, p_offset_start );
-
/*
* Perform conversion
*/
i_scale_count = ( i_vscale == 1 ) ?
p_filter->fmt_out.video.i_height :
p_filter->fmt_in.video.i_height;
-
-#if defined (MODULE_NAME_IS_i420_rgb_sse2)
-
- i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
-
- /*
- ** SSE2 128 bits fetch/store instructions are faster
- ** if memory access is 16 bytes aligned
- */
-
- p_buffer = b_hscale ? p_buffer_start : p_pic;
- if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
- p_dest->p->i_pitch|
- ((intptr_t)p_y)|
- ((intptr_t)p_buffer))) )
- {
- /* use faster SSE2 aligned fetch and store */
- for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
- {
- p_pic_start = p_pic;
-
- for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
- {
- SSE2_CALL (
- SSE2_INIT_16_ALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_16_ALIGNED
- );
- p_y += 16;
- p_u += 8;
- p_v += 8;
- p_buffer += 16;
- }
- /* Here we do some unaligned reads and duplicate conversions, but
- * at least we have all the pixels */
- if( i_rewind )
- {
- p_y -= i_rewind;
- p_u -= i_rewind >> 1;
- p_v -= i_rewind >> 1;
- p_buffer -= i_rewind;
-
- SSE2_CALL (
- SSE2_INIT_16_UNALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_16_UNALIGNED
- );
- p_y += 16;
- p_u += 8;
- p_v += 8;
- }
- SCALE_WIDTH;
- SCALE_HEIGHT( 420, 2 );
-
- p_y += i_source_margin;
- if( i_y % 2 )
- {
- p_u += i_source_margin_c;
- p_v += i_source_margin_c;
- }
- p_buffer = b_hscale ? p_buffer_start : p_pic;
- }
- }
- else
- {
- /* use slower SSE2 unaligned fetch and store */
- for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
- {
- p_pic_start = p_pic;
- p_buffer = b_hscale ? p_buffer_start : p_pic;
-
- for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
- {
- SSE2_CALL(
- SSE2_INIT_16_UNALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_16_UNALIGNED
- );
- p_y += 16;
- p_u += 8;
- p_v += 8;
- p_buffer += 16;
- }
- /* Here we do some unaligned reads and duplicate conversions, but
- * at least we have all the pixels */
- if( i_rewind )
- {
- p_y -= i_rewind;
- p_u -= i_rewind >> 1;
- p_v -= i_rewind >> 1;
- p_buffer -= i_rewind;
-
- SSE2_CALL(
- SSE2_INIT_16_UNALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_16_UNALIGNED
- );
- p_y += 16;
- p_u += 8;
- p_v += 8;
- }
- SCALE_WIDTH;
- SCALE_HEIGHT( 420, 2 );
-
- p_y += i_source_margin;
- if( i_y % 2 )
- {
- p_u += i_source_margin_c;
- p_v += i_source_margin_c;
- }
- p_buffer = b_hscale ? p_buffer_start : p_pic;
- }
- }
-
- /* make sure all SSE2 stores are visible thereafter */
- SSE2_END;
-
-#else // defined (MODULE_NAME_IS_i420_rgb_mmx)
-
- i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
-
for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
{
p_pic_start = p_pic;
for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
{
- MMX_CALL (
- MMX_INIT_16
- MMX_YUV_MUL
- MMX_YUV_ADD
- MMX_UNPACK_16
- );
- p_y += 8;
- p_u += 4;
- p_v += 4;
- p_buffer += 8;
+ CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
+ CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
+ CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
+ CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
}
/* Here we do some unaligned reads and duplicate conversions, but
p_u -= i_rewind >> 1;
p_v -= i_rewind >> 1;
p_buffer -= i_rewind;
-
- MMX_CALL (
- MMX_INIT_16
- MMX_YUV_MUL
- MMX_YUV_ADD
- MMX_UNPACK_16
- );
- p_y += 8;
- p_u += 4;
- p_v += 4;
- p_buffer += 8;
+ CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
+ CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
+ CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
+ CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
}
SCALE_WIDTH;
- SCALE_HEIGHT( 420, 2 );
+ SCALE_HEIGHT( 420, 4 );
p_y += i_source_margin;
if( i_y % 2 )
p_v += i_source_margin_c;
}
}
- /* re-enable FPU registers */
- MMX_END;
-
-#endif
}
-
-#endif
-
-/*****************************************************************************
- * I420_RGB32: color YUV 4:2:0 to RGB 32 bpp
- *****************************************************************************
- * Horizontal alignment needed:
- * - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
- * - output: 1 pixel (2 bytes), margins allowed
- * Vertical alignment needed:
- * - input: 2 lines (2 Y lines, 1 U/V line)
- * - output: 1 line
- *****************************************************************************/
-
-#if defined (MODULE_NAME_IS_i420_rgb)
-
-void I420_RGB32( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
-{
- /* We got this one from the old arguments */
- uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
- uint8_t *p_y = p_src->Y_PIXELS;
- uint8_t *p_u = p_src->U_PIXELS;
- uint8_t *p_v = p_src->V_PIXELS;
-
- bool b_hscale; /* horizontal scaling type */
- unsigned int i_vscale; /* vertical scaling type */
- unsigned int i_x, i_y; /* horizontal and vertical indexes */
-
- int i_right_margin;
- int i_rewind;
- int i_scale_count; /* scale modulo counter */
- int i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
- uint32_t * p_pic_start; /* beginning of the current line for copy */
- int i_uval, i_vval; /* U and V samples */
- int i_red, i_green, i_blue; /* U and V modified samples */
- uint32_t * p_yuv = p_filter->p_sys->p_rgb32;
- uint32_t * p_ybase; /* Y dependant conversion table */
-
- /* Conversion buffer pointer */
- uint32_t * p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
- uint32_t * p_buffer;
-
- /* Offset array pointer */
- int * p_offset_start = p_filter->p_sys->p_offset;
- int * p_offset;
-
- const int i_source_margin = p_src->p[0].i_pitch
- - p_src->p[0].i_visible_pitch;
- const int i_source_margin_c = p_src->p[1].i_pitch
- - p_src->p[1].i_visible_pitch;
-
- i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
- i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
-
- /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
- * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
- * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
- SetOffset( p_filter->fmt_in.video.i_width,
- p_filter->fmt_in.video.i_height,
- p_filter->fmt_out.video.i_width,
- p_filter->fmt_out.video.i_height,
- &b_hscale, &i_vscale, p_offset_start );
-
- /*
- * Perform conversion
- */
- i_scale_count = ( i_vscale == 1 ) ?
- p_filter->fmt_out.video.i_height :
- p_filter->fmt_in.video.i_height;
- for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
- {
- p_pic_start = p_pic;
- p_buffer = b_hscale ? p_buffer_start : p_pic;
-
- for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
- {
- CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
- CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
- CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
- CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
- }
-
- /* Here we do some unaligned reads and duplicate conversions, but
- * at least we have all the pixels */
- if( i_rewind )
- {
- p_y -= i_rewind;
- p_u -= i_rewind >> 1;
- p_v -= i_rewind >> 1;
- p_buffer -= i_rewind;
- CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
- CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
- CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
- CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4);
- }
- SCALE_WIDTH;
- SCALE_HEIGHT( 420, 4 );
-
- p_y += i_source_margin;
- if( i_y % 2 )
- {
- p_u += i_source_margin_c;
- p_v += i_source_margin_c;
- }
- }
-}
-
-#else // defined (MODULE_NAME_IS_i420_rgb_mmx) || defined (MODULE_NAME_IS_i420_rgb_sse2)
-
-VLC_TARGET
-void I420_A8R8G8B8( filter_t *p_filter, picture_t *p_src,
- picture_t *p_dest )
-{
- /* We got this one from the old arguments */
- uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
- uint8_t *p_y = p_src->Y_PIXELS;
- uint8_t *p_u = p_src->U_PIXELS;
- uint8_t *p_v = p_src->V_PIXELS;
-
- bool b_hscale; /* horizontal scaling type */
- unsigned int i_vscale; /* vertical scaling type */
- unsigned int i_x, i_y; /* horizontal and vertical indexes */
-
- int i_right_margin;
- int i_rewind;
- int i_scale_count; /* scale modulo counter */
- int i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
- uint32_t * p_pic_start; /* beginning of the current line for copy */
- /* Conversion buffer pointer */
- uint32_t * p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
- uint32_t * p_buffer;
-
- /* Offset array pointer */
- int * p_offset_start = p_filter->p_sys->p_offset;
- int * p_offset;
-
- const int i_source_margin = p_src->p[0].i_pitch
- - p_src->p[0].i_visible_pitch;
- const int i_source_margin_c = p_src->p[1].i_pitch
- - p_src->p[1].i_visible_pitch;
-
- i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
-
- /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
- * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
- * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
- SetOffset( p_filter->fmt_in.video.i_width,
- p_filter->fmt_in.video.i_height,
- p_filter->fmt_out.video.i_width,
- p_filter->fmt_out.video.i_height,
- &b_hscale, &i_vscale, p_offset_start );
-
- /*
- * Perform conversion
- */
- i_scale_count = ( i_vscale == 1 ) ?
- p_filter->fmt_out.video.i_height :
- p_filter->fmt_in.video.i_height;
-
-#if defined (MODULE_NAME_IS_i420_rgb_sse2)
-
- i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
-
- /*
- ** SSE2 128 bits fetch/store instructions are faster
- ** if memory access is 16 bytes aligned
- */
-
- p_buffer = b_hscale ? p_buffer_start : p_pic;
- if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
- p_dest->p->i_pitch|
- ((intptr_t)p_y)|
- ((intptr_t)p_buffer))) )
- {
- /* use faster SSE2 aligned fetch and store */
- for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
- {
- p_pic_start = p_pic;
-
- for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
- {
- SSE2_CALL (
- SSE2_INIT_32_ALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_32_ARGB_ALIGNED
- );
- p_y += 16;
- p_u += 8;
- p_v += 8;
- p_buffer += 16;
- }
-
- /* Here we do some unaligned reads and duplicate conversions, but
- * at least we have all the pixels */
- if( i_rewind )
- {
- p_y -= i_rewind;
- p_u -= i_rewind >> 1;
- p_v -= i_rewind >> 1;
- p_buffer -= i_rewind;
- SSE2_CALL (
- SSE2_INIT_32_UNALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_32_ARGB_UNALIGNED
- );
- p_y += 16;
- p_u += 4;
- p_v += 4;
- }
- SCALE_WIDTH;
- SCALE_HEIGHT( 420, 4 );
-
- p_y += i_source_margin;
- if( i_y % 2 )
- {
- p_u += i_source_margin_c;
- p_v += i_source_margin_c;
- }
- p_buffer = b_hscale ? p_buffer_start : p_pic;
- }
- }
- else
- {
- /* use slower SSE2 unaligned fetch and store */
- for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
- {
- p_pic_start = p_pic;
- p_buffer = b_hscale ? p_buffer_start : p_pic;
-
- for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
- {
- SSE2_CALL (
- SSE2_INIT_32_UNALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_32_ARGB_UNALIGNED
- );
- p_y += 16;
- p_u += 8;
- p_v += 8;
- p_buffer += 16;
- }
-
- /* Here we do some unaligned reads and duplicate conversions, but
- * at least we have all the pixels */
- if( i_rewind )
- {
- p_y -= i_rewind;
- p_u -= i_rewind >> 1;
- p_v -= i_rewind >> 1;
- p_buffer -= i_rewind;
- SSE2_CALL (
- SSE2_INIT_32_UNALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_32_ARGB_UNALIGNED
- );
- p_y += 16;
- p_u += 8;
- p_v += 8;
- }
- SCALE_WIDTH;
- SCALE_HEIGHT( 420, 4 );
-
- p_y += i_source_margin;
- if( i_y % 2 )
- {
- p_u += i_source_margin_c;
- p_v += i_source_margin_c;
- }
- p_buffer = b_hscale ? p_buffer_start : p_pic;
- }
- }
-
- /* make sure all SSE2 stores are visible thereafter */
- SSE2_END;
-
-#else // defined (MODULE_NAME_IS_i420_rgb_mmx)
-
- i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
-
- for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
- {
- p_pic_start = p_pic;
- p_buffer = b_hscale ? p_buffer_start : p_pic;
-
- for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
- {
- MMX_CALL (
- MMX_INIT_32
- MMX_YUV_MUL
- MMX_YUV_ADD
- MMX_UNPACK_32_ARGB
- );
- p_y += 8;
- p_u += 4;
- p_v += 4;
- p_buffer += 8;
- }
-
- /* Here we do some unaligned reads and duplicate conversions, but
- * at least we have all the pixels */
- if( i_rewind )
- {
- p_y -= i_rewind;
- p_u -= i_rewind >> 1;
- p_v -= i_rewind >> 1;
- p_buffer -= i_rewind;
- MMX_CALL (
- MMX_INIT_32
- MMX_YUV_MUL
- MMX_YUV_ADD
- MMX_UNPACK_32_ARGB
- );
- p_y += 8;
- p_u += 4;
- p_v += 4;
- p_buffer += 8;
- }
- SCALE_WIDTH;
- SCALE_HEIGHT( 420, 4 );
-
- p_y += i_source_margin;
- if( i_y % 2 )
- {
- p_u += i_source_margin_c;
- p_v += i_source_margin_c;
- }
- }
-
- /* re-enable FPU registers */
- MMX_END;
-
-#endif
-}
-
-VLC_TARGET
-void I420_R8G8B8A8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
-{
- /* We got this one from the old arguments */
- uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
- uint8_t *p_y = p_src->Y_PIXELS;
- uint8_t *p_u = p_src->U_PIXELS;
- uint8_t *p_v = p_src->V_PIXELS;
-
- bool b_hscale; /* horizontal scaling type */
- unsigned int i_vscale; /* vertical scaling type */
- unsigned int i_x, i_y; /* horizontal and vertical indexes */
-
- int i_right_margin;
- int i_rewind;
- int i_scale_count; /* scale modulo counter */
- int i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
- uint32_t * p_pic_start; /* beginning of the current line for copy */
- /* Conversion buffer pointer */
- uint32_t * p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
- uint32_t * p_buffer;
-
- /* Offset array pointer */
- int * p_offset_start = p_filter->p_sys->p_offset;
- int * p_offset;
-
- const int i_source_margin = p_src->p[0].i_pitch
- - p_src->p[0].i_visible_pitch;
- const int i_source_margin_c = p_src->p[1].i_pitch
- - p_src->p[1].i_visible_pitch;
-
- i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
-
- /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
- * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
- * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
- SetOffset( p_filter->fmt_in.video.i_width,
- p_filter->fmt_in.video.i_height,
- p_filter->fmt_out.video.i_width,
- p_filter->fmt_out.video.i_height,
- &b_hscale, &i_vscale, p_offset_start );
-
- /*
- * Perform conversion
- */
- i_scale_count = ( i_vscale == 1 ) ?
- p_filter->fmt_out.video.i_height :
- p_filter->fmt_in.video.i_height;
-
-#if defined (MODULE_NAME_IS_i420_rgb_sse2)
-
- i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
-
- /*
- ** SSE2 128 bits fetch/store instructions are faster
- ** if memory access is 16 bytes aligned
- */
-
- p_buffer = b_hscale ? p_buffer_start : p_pic;
- if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
- p_dest->p->i_pitch|
- ((intptr_t)p_y)|
- ((intptr_t)p_buffer))) )
- {
- /* use faster SSE2 aligned fetch and store */
- for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
- {
- p_pic_start = p_pic;
-
- for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
- {
- SSE2_CALL (
- SSE2_INIT_32_ALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_32_RGBA_ALIGNED
- );
- p_y += 16;
- p_u += 8;
- p_v += 8;
- p_buffer += 16;
- }
-
- /* Here we do some unaligned reads and duplicate conversions, but
- * at least we have all the pixels */
- if( i_rewind )
- {
- p_y -= i_rewind;
- p_u -= i_rewind >> 1;
- p_v -= i_rewind >> 1;
- p_buffer -= i_rewind;
- SSE2_CALL (
- SSE2_INIT_32_UNALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_32_RGBA_UNALIGNED
- );
- p_y += 16;
- p_u += 4;
- p_v += 4;
- }
- SCALE_WIDTH;
- SCALE_HEIGHT( 420, 4 );
-
- p_y += i_source_margin;
- if( i_y % 2 )
- {
- p_u += i_source_margin_c;
- p_v += i_source_margin_c;
- }
- p_buffer = b_hscale ? p_buffer_start : p_pic;
- }
- }
- else
- {
- /* use slower SSE2 unaligned fetch and store */
- for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
- {
- p_pic_start = p_pic;
- p_buffer = b_hscale ? p_buffer_start : p_pic;
-
- for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
- {
- SSE2_CALL (
- SSE2_INIT_32_UNALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_32_RGBA_UNALIGNED
- );
- p_y += 16;
- p_u += 8;
- p_v += 8;
- p_buffer += 16;
- }
-
- /* Here we do some unaligned reads and duplicate conversions, but
- * at least we have all the pixels */
- if( i_rewind )
- {
- p_y -= i_rewind;
- p_u -= i_rewind >> 1;
- p_v -= i_rewind >> 1;
- p_buffer -= i_rewind;
- SSE2_CALL (
- SSE2_INIT_32_UNALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_32_RGBA_UNALIGNED
- );
- p_y += 16;
- p_u += 8;
- p_v += 8;
- }
- SCALE_WIDTH;
- SCALE_HEIGHT( 420, 4 );
-
- p_y += i_source_margin;
- if( i_y % 2 )
- {
- p_u += i_source_margin_c;
- p_v += i_source_margin_c;
- }
- p_buffer = b_hscale ? p_buffer_start : p_pic;
- }
- }
-
- /* make sure all SSE2 stores are visible thereafter */
- SSE2_END;
-
-#else // defined (MODULE_NAME_IS_i420_rgb_mmx)
-
- i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
-
- for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
- {
- p_pic_start = p_pic;
- p_buffer = b_hscale ? p_buffer_start : p_pic;
-
- for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
- {
- MMX_CALL (
- MMX_INIT_32
- MMX_YUV_MUL
- MMX_YUV_ADD
- MMX_UNPACK_32_RGBA
- );
- p_y += 8;
- p_u += 4;
- p_v += 4;
- p_buffer += 8;
- }
-
- /* Here we do some unaligned reads and duplicate conversions, but
- * at least we have all the pixels */
- if( i_rewind )
- {
- p_y -= i_rewind;
- p_u -= i_rewind >> 1;
- p_v -= i_rewind >> 1;
- p_buffer -= i_rewind;
- MMX_CALL (
- MMX_INIT_32
- MMX_YUV_MUL
- MMX_YUV_ADD
- MMX_UNPACK_32_RGBA
- );
- p_y += 8;
- p_u += 4;
- p_v += 4;
- p_buffer += 8;
- }
- SCALE_WIDTH;
- SCALE_HEIGHT( 420, 4 );
-
- p_y += i_source_margin;
- if( i_y % 2 )
- {
- p_u += i_source_margin_c;
- p_v += i_source_margin_c;
- }
- }
-
- /* re-enable FPU registers */
- MMX_END;
-
-#endif
-}
-
-VLC_TARGET
-void I420_B8G8R8A8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
-{
- /* We got this one from the old arguments */
- uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
- uint8_t *p_y = p_src->Y_PIXELS;
- uint8_t *p_u = p_src->U_PIXELS;
- uint8_t *p_v = p_src->V_PIXELS;
-
- bool b_hscale; /* horizontal scaling type */
- unsigned int i_vscale; /* vertical scaling type */
- unsigned int i_x, i_y; /* horizontal and vertical indexes */
-
- int i_right_margin;
- int i_rewind;
- int i_scale_count; /* scale modulo counter */
- int i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
- uint32_t * p_pic_start; /* beginning of the current line for copy */
- /* Conversion buffer pointer */
- uint32_t * p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
- uint32_t * p_buffer;
-
- /* Offset array pointer */
- int * p_offset_start = p_filter->p_sys->p_offset;
- int * p_offset;
-
- const int i_source_margin = p_src->p[0].i_pitch
- - p_src->p[0].i_visible_pitch;
- const int i_source_margin_c = p_src->p[1].i_pitch
- - p_src->p[1].i_visible_pitch;
-
- i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
-
- /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
- * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
- * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
- SetOffset( p_filter->fmt_in.video.i_width,
- p_filter->fmt_in.video.i_height,
- p_filter->fmt_out.video.i_width,
- p_filter->fmt_out.video.i_height,
- &b_hscale, &i_vscale, p_offset_start );
-
- /*
- * Perform conversion
- */
- i_scale_count = ( i_vscale == 1 ) ?
- p_filter->fmt_out.video.i_height :
- p_filter->fmt_in.video.i_height;
-
-#if defined (MODULE_NAME_IS_i420_rgb_sse2)
-
- i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
-
- /*
- ** SSE2 128 bits fetch/store instructions are faster
- ** if memory access is 16 bytes aligned
- */
-
- p_buffer = b_hscale ? p_buffer_start : p_pic;
- if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
- p_dest->p->i_pitch|
- ((intptr_t)p_y)|
- ((intptr_t)p_buffer))) )
- {
- /* use faster SSE2 aligned fetch and store */
- for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
- {
- p_pic_start = p_pic;
-
- for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
- {
- SSE2_CALL (
- SSE2_INIT_32_ALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_32_BGRA_ALIGNED
- );
- p_y += 16;
- p_u += 8;
- p_v += 8;
- p_buffer += 16;
- }
-
- /* Here we do some unaligned reads and duplicate conversions, but
- * at least we have all the pixels */
- if( i_rewind )
- {
- p_y -= i_rewind;
- p_u -= i_rewind >> 1;
- p_v -= i_rewind >> 1;
- p_buffer -= i_rewind;
- SSE2_CALL (
- SSE2_INIT_32_UNALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_32_BGRA_UNALIGNED
- );
- p_y += 16;
- p_u += 4;
- p_v += 4;
- }
- SCALE_WIDTH;
- SCALE_HEIGHT( 420, 4 );
-
- p_y += i_source_margin;
- if( i_y % 2 )
- {
- p_u += i_source_margin_c;
- p_v += i_source_margin_c;
- }
- p_buffer = b_hscale ? p_buffer_start : p_pic;
- }
- }
- else
- {
- /* use slower SSE2 unaligned fetch and store */
- for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
- {
- p_pic_start = p_pic;
- p_buffer = b_hscale ? p_buffer_start : p_pic;
-
- for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
- {
- SSE2_CALL (
- SSE2_INIT_32_UNALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_32_BGRA_UNALIGNED
- );
- p_y += 16;
- p_u += 8;
- p_v += 8;
- p_buffer += 16;
- }
-
- /* Here we do some unaligned reads and duplicate conversions, but
- * at least we have all the pixels */
- if( i_rewind )
- {
- p_y -= i_rewind;
- p_u -= i_rewind >> 1;
- p_v -= i_rewind >> 1;
- p_buffer -= i_rewind;
- SSE2_CALL (
- SSE2_INIT_32_UNALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_32_BGRA_UNALIGNED
- );
- p_y += 16;
- p_u += 8;
- p_v += 8;
- }
- SCALE_WIDTH;
- SCALE_HEIGHT( 420, 4 );
-
- p_y += i_source_margin;
- if( i_y % 2 )
- {
- p_u += i_source_margin_c;
- p_v += i_source_margin_c;
- }
- p_buffer = b_hscale ? p_buffer_start : p_pic;
- }
- }
-
-#else
-
- i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
-
- for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
- {
- p_pic_start = p_pic;
- p_buffer = b_hscale ? p_buffer_start : p_pic;
-
- for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
- {
- MMX_CALL (
- MMX_INIT_32
- MMX_YUV_MUL
- MMX_YUV_ADD
- MMX_UNPACK_32_BGRA
- );
- p_y += 8;
- p_u += 4;
- p_v += 4;
- p_buffer += 8;
- }
-
- /* Here we do some unaligned reads and duplicate conversions, but
- * at least we have all the pixels */
- if( i_rewind )
- {
- p_y -= i_rewind;
- p_u -= i_rewind >> 1;
- p_v -= i_rewind >> 1;
- p_buffer -= i_rewind;
- MMX_CALL (
- MMX_INIT_32
- MMX_YUV_MUL
- MMX_YUV_ADD
- MMX_UNPACK_32_BGRA
- );
- p_y += 8;
- p_u += 4;
- p_v += 4;
- p_buffer += 8;
- }
- SCALE_WIDTH;
- SCALE_HEIGHT( 420, 4 );
-
- p_y += i_source_margin;
- if( i_y % 2 )
- {
- p_u += i_source_margin_c;
- p_v += i_source_margin_c;
- }
- }
-
- /* re-enable FPU registers */
- MMX_END;
-
-#endif
-}
-
-VLC_TARGET
-void I420_A8B8G8R8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
-{
- /* We got this one from the old arguments */
- uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
- uint8_t *p_y = p_src->Y_PIXELS;
- uint8_t *p_u = p_src->U_PIXELS;
- uint8_t *p_v = p_src->V_PIXELS;
-
- bool b_hscale; /* horizontal scaling type */
- unsigned int i_vscale; /* vertical scaling type */
- unsigned int i_x, i_y; /* horizontal and vertical indexes */
-
- int i_right_margin;
- int i_rewind;
- int i_scale_count; /* scale modulo counter */
- int i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
- uint32_t * p_pic_start; /* beginning of the current line for copy */
- /* Conversion buffer pointer */
- uint32_t * p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
- uint32_t * p_buffer;
-
- /* Offset array pointer */
- int * p_offset_start = p_filter->p_sys->p_offset;
- int * p_offset;
-
- const int i_source_margin = p_src->p[0].i_pitch
- - p_src->p[0].i_visible_pitch;
- const int i_source_margin_c = p_src->p[1].i_pitch
- - p_src->p[1].i_visible_pitch;
-
- i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
-
- /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
- * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
- * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
- SetOffset( p_filter->fmt_in.video.i_width,
- p_filter->fmt_in.video.i_height,
- p_filter->fmt_out.video.i_width,
- p_filter->fmt_out.video.i_height,
- &b_hscale, &i_vscale, p_offset_start );
-
- /*
- * Perform conversion
- */
- i_scale_count = ( i_vscale == 1 ) ?
- p_filter->fmt_out.video.i_height :
- p_filter->fmt_in.video.i_height;
-
-#if defined (MODULE_NAME_IS_i420_rgb_sse2)
-
- i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
-
- /*
- ** SSE2 128 bits fetch/store instructions are faster
- ** if memory access is 16 bytes aligned
- */
-
- p_buffer = b_hscale ? p_buffer_start : p_pic;
- if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
- p_dest->p->i_pitch|
- ((intptr_t)p_y)|
- ((intptr_t)p_buffer))) )
- {
- /* use faster SSE2 aligned fetch and store */
- for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
- {
- p_pic_start = p_pic;
-
- for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
- {
- SSE2_CALL (
- SSE2_INIT_32_ALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_32_ABGR_ALIGNED
- );
- p_y += 16;
- p_u += 8;
- p_v += 8;
- p_buffer += 16;
- }
-
- /* Here we do some unaligned reads and duplicate conversions, but
- * at least we have all the pixels */
- if( i_rewind )
- {
- p_y -= i_rewind;
- p_u -= i_rewind >> 1;
- p_v -= i_rewind >> 1;
- p_buffer -= i_rewind;
- SSE2_CALL (
- SSE2_INIT_32_UNALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_32_ABGR_UNALIGNED
- );
- p_y += 16;
- p_u += 4;
- p_v += 4;
- }
- SCALE_WIDTH;
- SCALE_HEIGHT( 420, 4 );
-
- p_y += i_source_margin;
- if( i_y % 2 )
- {
- p_u += i_source_margin_c;
- p_v += i_source_margin_c;
- }
- p_buffer = b_hscale ? p_buffer_start : p_pic;
- }
- }
- else
- {
- /* use slower SSE2 unaligned fetch and store */
- for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
- {
- p_pic_start = p_pic;
- p_buffer = b_hscale ? p_buffer_start : p_pic;
-
- for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
- {
- SSE2_CALL (
- SSE2_INIT_32_UNALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_32_ABGR_UNALIGNED
- );
- p_y += 16;
- p_u += 8;
- p_v += 8;
- p_buffer += 16;
- }
-
- /* Here we do some unaligned reads and duplicate conversions, but
- * at least we have all the pixels */
- if( i_rewind )
- {
- p_y -= i_rewind;
- p_u -= i_rewind >> 1;
- p_v -= i_rewind >> 1;
- p_buffer -= i_rewind;
- SSE2_CALL (
- SSE2_INIT_32_UNALIGNED
- SSE2_YUV_MUL
- SSE2_YUV_ADD
- SSE2_UNPACK_32_ABGR_UNALIGNED
- );
- p_y += 16;
- p_u += 8;
- p_v += 8;
- }
- SCALE_WIDTH;
- SCALE_HEIGHT( 420, 4 );
-
- p_y += i_source_margin;
- if( i_y % 2 )
- {
- p_u += i_source_margin_c;
- p_v += i_source_margin_c;
- }
- p_buffer = b_hscale ? p_buffer_start : p_pic;
- }
- }
-
-#else
-
- i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
-
- for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
- {
- p_pic_start = p_pic;
- p_buffer = b_hscale ? p_buffer_start : p_pic;
-
- for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
- {
- MMX_CALL (
- MMX_INIT_32
- MMX_YUV_MUL
- MMX_YUV_ADD
- MMX_UNPACK_32_ABGR
- );
- p_y += 8;
- p_u += 4;
- p_v += 4;
- p_buffer += 8;
- }
-
- /* Here we do some unaligned reads and duplicate conversions, but
- * at least we have all the pixels */
- if( i_rewind )
- {
- p_y -= i_rewind;
- p_u -= i_rewind >> 1;
- p_v -= i_rewind >> 1;
- p_buffer -= i_rewind;
- MMX_CALL (
- MMX_INIT_32
- MMX_YUV_MUL
- MMX_YUV_ADD
- MMX_UNPACK_32_ABGR
- );
- p_y += 8;
- p_u += 4;
- p_v += 4;
- p_buffer += 8;
- }
- SCALE_WIDTH;
- SCALE_HEIGHT( 420, 4 );
-
- p_y += i_source_margin;
- if( i_y % 2 )
- {
- p_u += i_source_margin_c;
- p_v += i_source_margin_c;
- }
- }
-
- /* re-enable FPU registers */
- MMX_END;
-
-#endif
-}
-
-#endif
-
-/* Following functions are local */
-
-/*****************************************************************************
- * SetOffset: build offset array for conversion functions
- *****************************************************************************
- * This function will build an offset array used in later conversion functions.
- * It will also set horizontal and vertical scaling indicators.
- *****************************************************************************/
-static void SetOffset( int i_width, int i_height, int i_pic_width,
- int i_pic_height, bool *pb_hscale,
- unsigned int *pi_vscale, int *p_offset )
-{
- int i_x; /* x position in destination */
- int i_scale_count; /* modulo counter */
-
- /*
- * Prepare horizontal offset array
- */
- if( i_pic_width - i_width == 0 )
- {
- /* No horizontal scaling: YUV conversion is done directly to picture */
- *pb_hscale = 0;
- }
- else if( i_pic_width - i_width > 0 )
- {
- /* Prepare scaling array for horizontal extension */
- *pb_hscale = 1;
- i_scale_count = i_pic_width;
- for( i_x = i_width; i_x--; )
- {
- while( (i_scale_count -= i_width) > 0 )
- {
- *p_offset++ = 0;
- }
- *p_offset++ = 1;
- i_scale_count += i_pic_width;
- }
- }
- else /* if( i_pic_width - i_width < 0 ) */
- {
- /* Prepare scaling array for horizontal reduction */
- *pb_hscale = 1;
- i_scale_count = i_width;
- for( i_x = i_pic_width; i_x--; )
- {
- *p_offset = 1;
- while( (i_scale_count -= i_pic_width) > 0 )
- {
- *p_offset += 1;
- }
- p_offset++;
- i_scale_count += i_width;
- }
- }
-
- /*
- * Set vertical scaling indicator
- */
- if( i_pic_height - i_height == 0 )
- {
- *pi_vscale = 0;
- }
- else if( i_pic_height - i_height > 0 )
- {
- *pi_vscale = 1;
- }
- else /* if( i_pic_height - i_height < 0 ) */
- {
- *pi_vscale = -1;
- }
-}
-
--- /dev/null
+/*****************************************************************************
+ * i420_rgb16_x86.c : YUV to bitmap RGB conversion module for vlc
+ *****************************************************************************
+ * Copyright (C) 2000 VLC authors and VideoLAN
+ * $Id$
+ *
+ * Authors: Samuel Hocevar <sam@zoy.org>
+ * Damien Fouilleul <damienf@videolan.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ *****************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <vlc_common.h>
+#include <vlc_filter.h>
+#include <vlc_cpu.h>
+
+#include "i420_rgb.h"
+#ifdef SSE2
+# include "i420_rgb_sse2.h"
+# define VLC_TARGET VLC_SSE
+#else
+# include "i420_rgb_mmx.h"
+# define VLC_TARGET VLC_MMX
+#endif
+
+/*****************************************************************************
+ * SetOffset: build offset array for conversion functions
+ *****************************************************************************
+ * Fills p_offset with the table used for horizontal scaling and reports,
+ * through pb_hscale and pi_vscale, which kind of scaling is required.
+ *
+ * i_width, i_height:         source picture dimensions
+ * i_pic_width, i_pic_height: destination picture dimensions
+ * pb_hscale: set to false when both widths are equal (p_offset is then left
+ *            untouched), to true otherwise
+ * pi_vscale: set to 0 when both heights are equal, to 1 when the destination
+ *            is taller, to (unsigned)-1 when it is shorter
+ * p_offset:  horizontal extension: for every source pixel, a run of 0
+ *            entries followed by a single 1;
+ *            horizontal reduction: one entry per destination pixel, holding
+ *            a count of at least 1. The entries add up to the source width
+ *            when it is a multiple of the destination width.
+ *            The caller must provide a table large enough for these entries.
+ *****************************************************************************/
+static void SetOffset( int i_width, int i_height, int i_pic_width,
+                       int i_pic_height, bool *pb_hscale,
+                       unsigned int *pi_vscale, int *p_offset )
+{
+    /*
+     * Prepare horizontal offset array
+     */
+    if( i_pic_width == i_width )
+    {   /* No horizontal scaling: YUV conversion is done directly to picture */
+        *pb_hscale = false;
+    }
+    else if( i_pic_width > i_width )
+    {   /* Prepare scaling array for horizontal extension */
+        int i_scale_count = i_pic_width;
+
+        *pb_hscale = true;
+        for( int i_x = i_width; i_x--; )
+        {
+            while( (i_scale_count -= i_width) > 0 )
+            {
+                *p_offset++ = 0;
+            }
+            *p_offset++ = 1;
+            i_scale_count += i_pic_width;
+        }
+    }
+    else /* if( i_pic_width < i_width ) */
+    {   /* Prepare scaling array for horizontal reduction */
+        /* The accumulator starts at the SOURCE width (the extension case
+         * above starts at the destination width). Starting at i_pic_width
+         * here would force the first entry to 1 and make the table fall
+         * short of the source width (4 -> 2 would yield {1,2}, not {2,2}). */
+        int i_scale_count = i_width;
+
+        *pb_hscale = true;
+        for( int i_x = i_pic_width; i_x--; )
+        {
+            *p_offset = 1;
+            while( (i_scale_count -= i_pic_width) > 0 )
+            {
+                *p_offset += 1;
+            }
+            p_offset++;
+            i_scale_count += i_width;
+        }
+    }
+
+    /*
+     * Set vertical scaling indicator
+     */
+    if( i_pic_height == i_height )
+        *pi_vscale = 0;
+    else if( i_pic_height > i_height )
+        *pi_vscale = 1;
+    else /* if( i_pic_height < i_height ) */
+        *pi_vscale = -1; /* wraps to UINT_MAX: the indicator is unsigned */
+}
+
+/*****************************************************************************
+ * I420_R5G5B5: convert a planar YUV picture to 16-bit packed pixels
+ *****************************************************************************
+ * Reads the Y, U and V planes of p_src and writes 16-bit pixels into the
+ * first plane of p_dest, using the fmt_in / fmt_out dimensions of p_filter.
+ * When built with SSE2 defined, each row is converted 16 pixels at a time;
+ * otherwise the MMX path converts 8 pixels at a time.
+ * The pixel packing itself is done by the *_UNPACK_15 macros (presumably
+ * 5 bits per component, as the function name suggests -- they are defined
+ * in the i420_rgb_*.h headers, confirm there).
+ * NOTE(review): several locals (p_offset, p_pic_start, i_right_margin,
+ * i_chroma_width, ...) are not read by any statement written here, so they
+ * are presumably captured by name inside the SCALE_ and SSE2_/MMX_ macros;
+ * verify before renaming or removing any of them.
+ *****************************************************************************/
+VLC_TARGET
+void I420_R5G5B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
+{
+ /* We got this one from the old arguments */
+ uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
+ uint8_t *p_y = p_src->Y_PIXELS;
+ uint8_t *p_u = p_src->U_PIXELS;
+ uint8_t *p_v = p_src->V_PIXELS;
+
+ bool b_hscale; /* horizontal scaling type */
+ unsigned int i_vscale; /* vertical scaling type */
+ unsigned int i_x, i_y; /* horizontal and vertical indexes */
+
+ int i_right_margin;
+ int i_rewind;
+ int i_scale_count; /* scale modulo counter */
+ int i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
+ uint16_t * p_pic_start; /* beginning of the current line for copy */
+
+ /* Conversion buffer pointer */
+ uint16_t * p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
+ uint16_t * p_buffer;
+
+ /* Offset array pointer */
+ int * p_offset_start = p_filter->p_sys->p_offset;
+ int * p_offset;
+
+ /* Bytes between the end of the visible part of a row and the start of
+ * the next row, for the first (luma) and second (chroma) source planes */
+ const int i_source_margin = p_src->p[0].i_pitch
+ - p_src->p[0].i_visible_pitch;
+ const int i_source_margin_c = p_src->p[1].i_pitch
+ - p_src->p[1].i_visible_pitch;
+
+ /* Same quantity for the destination plane */
+ i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
+
+ /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
+ * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
+ * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
+ SetOffset( p_filter->fmt_in.video.i_width,
+ p_filter->fmt_in.video.i_height,
+ p_filter->fmt_out.video.i_width,
+ p_filter->fmt_out.video.i_height,
+ &b_hscale, &i_vscale, p_offset_start );
+
+
+ /*
+ * Perform conversion
+ */
+ /* Counter start value: output height when SetOffset reported vertical
+ * extension (i_vscale == 1), input height otherwise */
+ i_scale_count = ( i_vscale == 1 ) ?
+ p_filter->fmt_out.video.i_height :
+ p_filter->fmt_in.video.i_height;
+
+#ifdef SSE2
+
+ /* Number of pixels missing to reach the next multiple of 16
+ * (0 when the input width already is a multiple of 16) */
+ i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
+
+ /*
+ ** SSE2 128 bits fetch/store instructions are faster
+ ** if memory access is 16 bytes aligned
+ */
+
+ /* Convert into the intermediate buffer when horizontal scaling is needed,
+ * straight into the destination picture otherwise */
+ p_buffer = b_hscale ? p_buffer_start : p_pic;
+ /* Aligned path only if the low 4 bits of both pitches and of both start
+ * addresses are all zero */
+ if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
+ p_dest->p->i_pitch|
+ ((intptr_t)p_y)|
+ ((intptr_t)p_buffer))) )
+ {
+ /* use faster SSE2 aligned fetch and store */
+ for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+ {
+ p_pic_start = p_pic;
+
+ /* Full groups of 16 pixels */
+ for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
+ {
+ SSE2_CALL (
+ SSE2_INIT_16_ALIGNED
+ SSE2_YUV_MUL
+ SSE2_YUV_ADD
+ SSE2_UNPACK_15_ALIGNED
+ );
+ p_y += 16;
+ p_u += 8;
+ p_v += 8;
+ p_buffer += 16;
+ }
+ /* Here we do some unaligned reads and duplicate conversions, but
+ * at least we have all the pixels */
+ /* NOTE(review): when the width is below 16 the loop above does not
+ * run, so this steps the pointers back before the start of the
+ * row -- confirm callers never pass such widths */
+ if( i_rewind )
+ {
+ p_y -= i_rewind;
+ p_u -= i_rewind >> 1;
+ p_v -= i_rewind >> 1;
+ p_buffer -= i_rewind;
+
+ SSE2_CALL (
+ SSE2_INIT_16_UNALIGNED
+ SSE2_YUV_MUL
+ SSE2_YUV_ADD
+ SSE2_UNPACK_15_UNALIGNED
+ );
+ p_y += 16;
+ p_u += 8;
+ p_v += 8;
+ }
+ SCALE_WIDTH;
+ SCALE_HEIGHT( 420, 2 );
+
+ /* Skip the source padding. The chroma padding is skipped only after
+ * odd rows; presumably SCALE_HEIGHT( 420, ... ) rewinds p_u / p_v
+ * after even rows so two luma rows share one chroma row -- confirm
+ * in the macro definition */
+ p_y += i_source_margin;
+ if( i_y % 2 )
+ {
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
+ }
+ p_buffer = b_hscale ? p_buffer_start : p_pic;
+ }
+ }
+ else
+ {
+ /* use slower SSE2 unaligned fetch and store */
+ for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+ {
+ p_pic_start = p_pic;
+ p_buffer = b_hscale ? p_buffer_start : p_pic;
+
+ /* Full groups of 16 pixels */
+ for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
+ {
+ SSE2_CALL (
+ SSE2_INIT_16_UNALIGNED
+ SSE2_YUV_MUL
+ SSE2_YUV_ADD
+ SSE2_UNPACK_15_UNALIGNED
+ );
+ p_y += 16;
+ p_u += 8;
+ p_v += 8;
+ p_buffer += 16;
+ }
+ /* Here we do some unaligned reads and duplicate conversions, but
+ * at least we have all the pixels */
+ if( i_rewind )
+ {
+ p_y -= i_rewind;
+ p_u -= i_rewind >> 1;
+ p_v -= i_rewind >> 1;
+ p_buffer -= i_rewind;
+
+ SSE2_CALL (
+ SSE2_INIT_16_UNALIGNED
+ SSE2_YUV_MUL
+ SSE2_YUV_ADD
+ SSE2_UNPACK_15_UNALIGNED
+ );
+ p_y += 16;
+ p_u += 8;
+ p_v += 8;
+ }
+ SCALE_WIDTH;
+ SCALE_HEIGHT( 420, 2 );
+
+ /* Skip the source padding; chroma only after odd rows (see the
+ * aligned branch above) */
+ p_y += i_source_margin;
+ if( i_y % 2 )
+ {
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
+ }
+ p_buffer = b_hscale ? p_buffer_start : p_pic;
+ }
+ }
+
+ /* make sure all SSE2 stores are visible thereafter */
+ SSE2_END;
+
+#else /* SSE2 */
+
+ /* Number of pixels missing to reach the next multiple of 8 */
+ i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
+
+ for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+ {
+ p_pic_start = p_pic;
+ /* Intermediate buffer when scaling horizontally, destination otherwise */
+ p_buffer = b_hscale ? p_buffer_start : p_pic;
+
+ /* Full groups of 8 pixels */
+ for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
+ {
+ MMX_CALL (
+ MMX_INIT_16
+ MMX_YUV_MUL
+ MMX_YUV_ADD
+ MMX_UNPACK_15
+ );
+ p_y += 8;
+ p_u += 4;
+ p_v += 4;
+ p_buffer += 8;
+ }
+
+ /* Here we do some unaligned reads and duplicate conversions, but
+ * at least we have all the pixels */
+ if( i_rewind )
+ {
+ p_y -= i_rewind;
+ p_u -= i_rewind >> 1;
+ p_v -= i_rewind >> 1;
+ p_buffer -= i_rewind;
+
+ MMX_CALL (
+ MMX_INIT_16
+ MMX_YUV_MUL
+ MMX_YUV_ADD
+ MMX_UNPACK_15
+ );
+ p_y += 8;
+ p_u += 4;
+ p_v += 4;
+ p_buffer += 8;
+ }
+ SCALE_WIDTH;
+ SCALE_HEIGHT( 420, 2 );
+
+ /* Skip the source padding; chroma only after odd rows */
+ p_y += i_source_margin;
+ if( i_y % 2 )
+ {
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
+ }
+ }
+ /* re-enable FPU registers */
+ MMX_END;
+
+#endif /* SSE2 */
+}
+
+/*****************************************************************************
+ * I420_R5G6B5: convert a planar YUV picture to 16-bit packed pixels
+ *****************************************************************************
+ * Reads the Y, U and V planes of p_src and writes 16-bit pixels into the
+ * first plane of p_dest, using the fmt_in / fmt_out dimensions of p_filter.
+ * When built with SSE2 defined, each row is converted 16 pixels at a time;
+ * otherwise the MMX path converts 8 pixels at a time.
+ * The pixel packing itself is done by the *_UNPACK_16 macros (presumably
+ * 5/6/5 bits per component, as the function name suggests -- they are
+ * defined in the i420_rgb_*.h headers, confirm there).
+ * NOTE(review): several locals (p_offset, p_pic_start, i_right_margin,
+ * i_chroma_width, ...) are not read by any statement written here, so they
+ * are presumably captured by name inside the SCALE_ and SSE2_/MMX_ macros;
+ * verify before renaming or removing any of them.
+ *****************************************************************************/
+VLC_TARGET
+void I420_R5G6B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
+{
+ /* We got this one from the old arguments */
+ uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
+ uint8_t *p_y = p_src->Y_PIXELS;
+ uint8_t *p_u = p_src->U_PIXELS;
+ uint8_t *p_v = p_src->V_PIXELS;
+
+ bool b_hscale; /* horizontal scaling type */
+ unsigned int i_vscale; /* vertical scaling type */
+ unsigned int i_x, i_y; /* horizontal and vertical indexes */
+
+ int i_right_margin;
+ int i_rewind;
+ int i_scale_count; /* scale modulo counter */
+ int i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
+ uint16_t * p_pic_start; /* beginning of the current line for copy */
+
+ /* Conversion buffer pointer */
+ uint16_t * p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
+ uint16_t * p_buffer;
+
+ /* Offset array pointer */
+ int * p_offset_start = p_filter->p_sys->p_offset;
+ int * p_offset;
+
+ /* Bytes between the end of the visible part of a row and the start of
+ * the next row, for the first (luma) and second (chroma) source planes */
+ const int i_source_margin = p_src->p[0].i_pitch
+ - p_src->p[0].i_visible_pitch;
+ const int i_source_margin_c = p_src->p[1].i_pitch
+ - p_src->p[1].i_visible_pitch;
+
+ /* Same quantity for the destination plane */
+ i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
+
+ /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
+ * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
+ * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
+ SetOffset( p_filter->fmt_in.video.i_width,
+ p_filter->fmt_in.video.i_height,
+ p_filter->fmt_out.video.i_width,
+ p_filter->fmt_out.video.i_height,
+ &b_hscale, &i_vscale, p_offset_start );
+
+
+ /*
+ * Perform conversion
+ */
+ /* Counter start value: output height when SetOffset reported vertical
+ * extension (i_vscale == 1), input height otherwise */
+ i_scale_count = ( i_vscale == 1 ) ?
+ p_filter->fmt_out.video.i_height :
+ p_filter->fmt_in.video.i_height;
+
+#ifdef SSE2
+
+ /* Number of pixels missing to reach the next multiple of 16
+ * (0 when the input width already is a multiple of 16) */
+ i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
+
+ /*
+ ** SSE2 128 bits fetch/store instructions are faster
+ ** if memory access is 16 bytes aligned
+ */
+
+ /* Convert into the intermediate buffer when horizontal scaling is needed,
+ * straight into the destination picture otherwise */
+ p_buffer = b_hscale ? p_buffer_start : p_pic;
+ /* Aligned path only if the low 4 bits of both pitches and of both start
+ * addresses are all zero */
+ if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
+ p_dest->p->i_pitch|
+ ((intptr_t)p_y)|
+ ((intptr_t)p_buffer))) )
+ {
+ /* use faster SSE2 aligned fetch and store */
+ for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+ {
+ p_pic_start = p_pic;
+
+ /* Full groups of 16 pixels */
+ for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
+ {
+ SSE2_CALL (
+ SSE2_INIT_16_ALIGNED
+ SSE2_YUV_MUL
+ SSE2_YUV_ADD
+ SSE2_UNPACK_16_ALIGNED
+ );
+ p_y += 16;
+ p_u += 8;
+ p_v += 8;
+ p_buffer += 16;
+ }
+ /* Here we do some unaligned reads and duplicate conversions, but
+ * at least we have all the pixels */
+ /* NOTE(review): when the width is below 16 the loop above does not
+ * run, so this steps the pointers back before the start of the
+ * row -- confirm callers never pass such widths */
+ if( i_rewind )
+ {
+ p_y -= i_rewind;
+ p_u -= i_rewind >> 1;
+ p_v -= i_rewind >> 1;
+ p_buffer -= i_rewind;
+
+ SSE2_CALL (
+ SSE2_INIT_16_UNALIGNED
+ SSE2_YUV_MUL
+ SSE2_YUV_ADD
+ SSE2_UNPACK_16_UNALIGNED
+ );
+ p_y += 16;
+ p_u += 8;
+ p_v += 8;
+ }
+ SCALE_WIDTH;
+ SCALE_HEIGHT( 420, 2 );
+
+ /* Skip the source padding. The chroma padding is skipped only after
+ * odd rows; presumably SCALE_HEIGHT( 420, ... ) rewinds p_u / p_v
+ * after even rows so two luma rows share one chroma row -- confirm
+ * in the macro definition */
+ p_y += i_source_margin;
+ if( i_y % 2 )
+ {
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
+ }
+ p_buffer = b_hscale ? p_buffer_start : p_pic;
+ }
+ }
+ else
+ {
+ /* use slower SSE2 unaligned fetch and store */
+ for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+ {
+ p_pic_start = p_pic;
+ p_buffer = b_hscale ? p_buffer_start : p_pic;
+
+ /* Full groups of 16 pixels */
+ for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
+ {
+ SSE2_CALL(
+ SSE2_INIT_16_UNALIGNED
+ SSE2_YUV_MUL
+ SSE2_YUV_ADD
+ SSE2_UNPACK_16_UNALIGNED
+ );
+ p_y += 16;
+ p_u += 8;
+ p_v += 8;
+ p_buffer += 16;
+ }
+ /* Here we do some unaligned reads and duplicate conversions, but
+ * at least we have all the pixels */
+ if( i_rewind )
+ {
+ p_y -= i_rewind;
+ p_u -= i_rewind >> 1;
+ p_v -= i_rewind >> 1;
+ p_buffer -= i_rewind;
+
+ SSE2_CALL(
+ SSE2_INIT_16_UNALIGNED
+ SSE2_YUV_MUL
+ SSE2_YUV_ADD
+ SSE2_UNPACK_16_UNALIGNED
+ );
+ p_y += 16;
+ p_u += 8;
+ p_v += 8;
+ }
+ SCALE_WIDTH;
+ SCALE_HEIGHT( 420, 2 );
+
+ /* Skip the source padding; chroma only after odd rows (see the
+ * aligned branch above) */
+ p_y += i_source_margin;
+ if( i_y % 2 )
+ {
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
+ }
+ p_buffer = b_hscale ? p_buffer_start : p_pic;
+ }
+ }
+
+ /* make sure all SSE2 stores are visible thereafter */
+ SSE2_END;
+
+#else /* SSE2 */
+
+ /* Number of pixels missing to reach the next multiple of 8 */
+ i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
+
+ for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+ {
+ p_pic_start = p_pic;
+ /* Intermediate buffer when scaling horizontally, destination otherwise */
+ p_buffer = b_hscale ? p_buffer_start : p_pic;
+
+ /* Full groups of 8 pixels */
+ for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
+ {
+ MMX_CALL (
+ MMX_INIT_16
+ MMX_YUV_MUL
+ MMX_YUV_ADD
+ MMX_UNPACK_16
+ );
+ p_y += 8;
+ p_u += 4;
+ p_v += 4;
+ p_buffer += 8;
+ }
+
+ /* Here we do some unaligned reads and duplicate conversions, but
+ * at least we have all the pixels */
+ if( i_rewind )
+ {
+ p_y -= i_rewind;
+ p_u -= i_rewind >> 1;
+ p_v -= i_rewind >> 1;
+ p_buffer -= i_rewind;
+
+ MMX_CALL (
+ MMX_INIT_16
+ MMX_YUV_MUL
+ MMX_YUV_ADD
+ MMX_UNPACK_16
+ );
+ p_y += 8;
+ p_u += 4;
+ p_v += 4;
+ p_buffer += 8;
+ }
+ SCALE_WIDTH;
+ SCALE_HEIGHT( 420, 2 );
+
+ /* Skip the source padding; chroma only after odd rows */
+ p_y += i_source_margin;
+ if( i_y % 2 )
+ {
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
+ }
+ }
+ /* re-enable FPU registers */
+ MMX_END;
+
+#endif /* SSE2 */
+}
+
+VLC_TARGET
+void I420_A8R8G8B8( filter_t *p_filter, picture_t *p_src,
+ picture_t *p_dest )
+{
+ /* We got this one from the old arguments */
+ uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
+ uint8_t *p_y = p_src->Y_PIXELS;
+ uint8_t *p_u = p_src->U_PIXELS;
+ uint8_t *p_v = p_src->V_PIXELS;
+
+ bool b_hscale; /* horizontal scaling type */
+ unsigned int i_vscale; /* vertical scaling type */
+ unsigned int i_x, i_y; /* horizontal and vertical indexes */
+
+ int i_right_margin;
+ int i_rewind;
+ int i_scale_count; /* scale modulo counter */
+ int i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
+ uint32_t * p_pic_start; /* beginning of the current line for copy */
+ /* Conversion buffer pointer */
+ uint32_t * p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
+ uint32_t * p_buffer;
+
+ /* Offset array pointer */
+ int * p_offset_start = p_filter->p_sys->p_offset;
+ int * p_offset;
+
+ const int i_source_margin = p_src->p[0].i_pitch
+ - p_src->p[0].i_visible_pitch;
+ const int i_source_margin_c = p_src->p[1].i_pitch
+ - p_src->p[1].i_visible_pitch;
+
+ i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
+
+ /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
+ * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
+ * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
+ SetOffset( p_filter->fmt_in.video.i_width,
+ p_filter->fmt_in.video.i_height,
+ p_filter->fmt_out.video.i_width,
+ p_filter->fmt_out.video.i_height,
+ &b_hscale, &i_vscale, p_offset_start );
+
+ /*
+ * Perform conversion
+ */
+ i_scale_count = ( i_vscale == 1 ) ?
+ p_filter->fmt_out.video.i_height :
+ p_filter->fmt_in.video.i_height;
+
+#ifdef SSE2
+
+ i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
+
+ /*
+ ** SSE2 128 bits fetch/store instructions are faster
+ ** if memory access is 16 bytes aligned
+ */
+
+ p_buffer = b_hscale ? p_buffer_start : p_pic;
+ if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
+ p_dest->p->i_pitch|
+ ((intptr_t)p_y)|
+ ((intptr_t)p_buffer))) )
+ {
+ /* use faster SSE2 aligned fetch and store */
+ for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+ {
+ p_pic_start = p_pic;
+
+ for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
+ {
+ SSE2_CALL (
+ SSE2_INIT_32_ALIGNED
+ SSE2_YUV_MUL
+ SSE2_YUV_ADD
+ SSE2_UNPACK_32_ARGB_ALIGNED
+ );
+ p_y += 16;
+ p_u += 8;
+ p_v += 8;
+ p_buffer += 16;
+ }
+
+ /* Here we do some unaligned reads and duplicate conversions, but
+ * at least we have all the pixels */
+ if( i_rewind )
+ {
+ p_y -= i_rewind;
+ p_u -= i_rewind >> 1;
+ p_v -= i_rewind >> 1;
+ p_buffer -= i_rewind;
+ SSE2_CALL (
+ SSE2_INIT_32_UNALIGNED
+ SSE2_YUV_MUL
+ SSE2_YUV_ADD
+ SSE2_UNPACK_32_ARGB_UNALIGNED
+ );
+ p_y += 16;
+ p_u += 4;
+ p_v += 4;
+ }
+ SCALE_WIDTH;
+ SCALE_HEIGHT( 420, 4 );
+
+ p_y += i_source_margin;
+ if( i_y % 2 )
+ {
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
+ }
+ p_buffer = b_hscale ? p_buffer_start : p_pic;
+ }
+ }
+ else
+ {
+ /* use slower SSE2 unaligned fetch and store */
+ for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+ {
+ p_pic_start = p_pic;
+ p_buffer = b_hscale ? p_buffer_start : p_pic;
+
+ for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
+ {
+ SSE2_CALL (
+ SSE2_INIT_32_UNALIGNED
+ SSE2_YUV_MUL
+ SSE2_YUV_ADD
+ SSE2_UNPACK_32_ARGB_UNALIGNED
+ );
+ p_y += 16;
+ p_u += 8;
+ p_v += 8;
+ p_buffer += 16;
+ }
+
+ /* Here we do some unaligned reads and duplicate conversions, but
+ * at least we have all the pixels */
+ if( i_rewind )
+ {
+ p_y -= i_rewind;
+ p_u -= i_rewind >> 1;
+ p_v -= i_rewind >> 1;
+ p_buffer -= i_rewind;
+ SSE2_CALL (
+ SSE2_INIT_32_UNALIGNED
+ SSE2_YUV_MUL
+ SSE2_YUV_ADD
+ SSE2_UNPACK_32_ARGB_UNALIGNED
+ );
+ p_y += 16;
+ p_u += 8;
+ p_v += 8;
+ }
+ SCALE_WIDTH;
+ SCALE_HEIGHT( 420, 4 );
+
+ p_y += i_source_margin;
+ if( i_y % 2 )
+ {
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
+ }
+ p_buffer = b_hscale ? p_buffer_start : p_pic;
+ }
+ }
+
+ /* make sure all SSE2 stores are visible thereafter */
+ SSE2_END;
+
+#else
+
+ i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
+
+ for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+ {
+ p_pic_start = p_pic;
+ p_buffer = b_hscale ? p_buffer_start : p_pic;
+
+ for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
+ {
+ MMX_CALL (
+ MMX_INIT_32
+ MMX_YUV_MUL
+ MMX_YUV_ADD
+ MMX_UNPACK_32_ARGB
+ );
+ p_y += 8;
+ p_u += 4;
+ p_v += 4;
+ p_buffer += 8;
+ }
+
+ /* Here we do some unaligned reads and duplicate conversions, but
+ * at least we have all the pixels */
+ if( i_rewind )
+ {
+ p_y -= i_rewind;
+ p_u -= i_rewind >> 1;
+ p_v -= i_rewind >> 1;
+ p_buffer -= i_rewind;
+ MMX_CALL (
+ MMX_INIT_32
+ MMX_YUV_MUL
+ MMX_YUV_ADD
+ MMX_UNPACK_32_ARGB
+ );
+ p_y += 8;
+ p_u += 4;
+ p_v += 4;
+ p_buffer += 8;
+ }
+ SCALE_WIDTH;
+ SCALE_HEIGHT( 420, 4 );
+
+ p_y += i_source_margin;
+ if( i_y % 2 )
+ {
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
+ }
+ }
+
+ /* re-enable FPU registers */
+ MMX_END;
+
+#endif
+}
+
+/*****************************************************************************
+ * I420_R8G8B8A8: convert a planar I420 picture to 32-bit packed pixels
+ *****************************************************************************
+ * p_filter: supplies the input/output dimensions, the scratch line buffer
+ *           (p_sys->p_buffer) and the horizontal offset table
+ *           (p_sys->p_offset).
+ * p_src:    source picture; its Y, U and V planes are read.
+ * p_dest:   destination picture; its first plane is written.
+ *
+ * The component layout is the one produced by the *_UNPACK_32_RGBA macros
+ * (defined outside this section of the file).
+ *
+ * With SSE2 defined, each pass consumes 16 luma and 8+8 chroma samples;
+ * otherwise the MMX macros consume 8 luma and 4+4 chroma samples per pass.
+ * When the width is not a multiple of the pass size, a final pass is run on
+ * the last full group of pixels of the row, re-converting a few of them.
+ *
+ * NOTE(review): the locals declared below that are never referenced in the
+ * visible statements (p_offset, i_chroma_width, i_scale_count, p_pic_start,
+ * i_right_margin) are presumably consumed by name inside the SCALE_WIDTH /
+ * SCALE_HEIGHT and conversion macros, so none of them may be renamed.
+ *****************************************************************************/
+VLC_TARGET
+void I420_R8G8B8A8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
+{
+    /* Destination pixel cursor and source plane cursors */
+    uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
+    uint8_t *p_y = p_src->Y_PIXELS;
+    uint8_t *p_u = p_src->U_PIXELS;
+    uint8_t *p_v = p_src->V_PIXELS;
+
+    bool b_hscale; /* horizontal scaling type */
+    unsigned int i_vscale; /* vertical scaling type */
+    unsigned int i_x, i_y; /* horizontal and vertical indexes */
+
+    int i_right_margin;
+    int i_rewind;
+    int i_scale_count; /* scale modulo counter */
+    int i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
+    uint32_t * p_pic_start; /* beginning of the current line for copy */
+    /* Conversion buffer pointer */
+    uint32_t * p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
+    uint32_t * p_buffer;
+
+    /* Offset array pointer */
+    int * p_offset_start = p_filter->p_sys->p_offset;
+    int * p_offset;
+
+    /* Padding between the visible part of a row and the next row */
+    const int i_source_margin = p_src->p[0].i_pitch
+                              - p_src->p[0].i_visible_pitch;
+    const int i_source_margin_c = p_src->p[1].i_pitch
+                                - p_src->p[1].i_visible_pitch;
+
+    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
+
+    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
+     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
+     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
+    SetOffset( p_filter->fmt_in.video.i_width,
+               p_filter->fmt_in.video.i_height,
+               p_filter->fmt_out.video.i_width,
+               p_filter->fmt_out.video.i_height,
+               &b_hscale, &i_vscale, p_offset_start );
+
+    /*
+     * Perform conversion
+     */
+    i_scale_count = ( i_vscale == 1 ) ?
+                    p_filter->fmt_out.video.i_height :
+                    p_filter->fmt_in.video.i_height;
+
+#ifdef SSE2
+
+    /* Number of pixels missing to reach the next multiple of 16 */
+    i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
+
+    /*
+    ** SSE2 128 bits fetch/store instructions are faster
+    ** if memory access is 16 bytes aligned
+    */
+
+    /* Convert into the scratch buffer when scaling horizontally, else
+     * straight into the destination line */
+    p_buffer = b_hscale ? p_buffer_start : p_pic;
+    if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
+                    p_dest->p->i_pitch|
+                    ((intptr_t)p_y)|
+                    ((intptr_t)p_buffer))) )
+    {
+        /* use faster SSE2 aligned fetch and store */
+        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+        {
+            p_pic_start = p_pic;
+
+            for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
+            {
+                SSE2_CALL (
+                    SSE2_INIT_32_ALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_RGBA_ALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+                p_buffer += 16;
+            }
+
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+                SSE2_CALL (
+                    SSE2_INIT_32_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_RGBA_UNALIGNED
+                );
+                /* This pass handled 16 luma samples, i.e. 8 samples of each
+                 * chroma plane: advance U/V by 8 exactly like the main loop,
+                 * otherwise chroma lags 4 samples further behind luma on
+                 * every row whose width is not a multiple of 16. */
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 4 );
+
+            /* Skip the source padding; chroma rows are only left behind
+             * after odd luma rows */
+            p_y += i_source_margin;
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+    else
+    {
+        /* use slower SSE2 unaligned fetch and store */
+        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+        {
+            p_pic_start = p_pic;
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+
+            for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
+            {
+                SSE2_CALL (
+                    SSE2_INIT_32_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_RGBA_UNALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+                p_buffer += 16;
+            }
+
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+                SSE2_CALL (
+                    SSE2_INIT_32_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_RGBA_UNALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 4 );
+
+            /* Skip the source padding; chroma rows are only left behind
+             * after odd luma rows */
+            p_y += i_source_margin;
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+
+    /* make sure all SSE2 stores are visible thereafter */
+    SSE2_END;
+
+#else
+
+    /* Number of pixels missing to reach the next multiple of 8 */
+    i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
+
+    for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+    {
+        p_pic_start = p_pic;
+        p_buffer = b_hscale ? p_buffer_start : p_pic;
+
+        for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
+        {
+            MMX_CALL (
+                MMX_INIT_32
+                MMX_YUV_MUL
+                MMX_YUV_ADD
+                MMX_UNPACK_32_RGBA
+            );
+            p_y += 8;
+            p_u += 4;
+            p_v += 4;
+            p_buffer += 8;
+        }
+
+        /* Here we do some unaligned reads and duplicate conversions, but
+         * at least we have all the pixels */
+        if( i_rewind )
+        {
+            p_y -= i_rewind;
+            p_u -= i_rewind >> 1;
+            p_v -= i_rewind >> 1;
+            p_buffer -= i_rewind;
+            MMX_CALL (
+                MMX_INIT_32
+                MMX_YUV_MUL
+                MMX_YUV_ADD
+                MMX_UNPACK_32_RGBA
+            );
+            p_y += 8;
+            p_u += 4;
+            p_v += 4;
+            p_buffer += 8;
+        }
+        SCALE_WIDTH;
+        SCALE_HEIGHT( 420, 4 );
+
+        /* Skip the source padding; chroma rows are only left behind
+         * after odd luma rows */
+        p_y += i_source_margin;
+        if( i_y % 2 )
+        {
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+        }
+    }
+
+    /* re-enable FPU registers */
+    MMX_END;
+
+#endif
+}
+
+/*****************************************************************************
+ * I420_B8G8R8A8: convert a planar I420 picture to 32-bit packed pixels
+ *****************************************************************************
+ * p_filter: supplies the input/output dimensions, the scratch line buffer
+ *           (p_sys->p_buffer) and the horizontal offset table
+ *           (p_sys->p_offset).
+ * p_src:    source picture; its Y, U and V planes are read.
+ * p_dest:   destination picture; its first plane is written.
+ *
+ * The component layout is the one produced by the *_UNPACK_32_BGRA macros
+ * (defined outside this section of the file).
+ *
+ * With SSE2 defined, each pass consumes 16 luma and 8+8 chroma samples;
+ * otherwise the MMX macros consume 8 luma and 4+4 chroma samples per pass.
+ * When the width is not a multiple of the pass size, a final pass is run on
+ * the last full group of pixels of the row, re-converting a few of them.
+ *
+ * NOTE(review): the locals declared below that are never referenced in the
+ * visible statements (p_offset, i_chroma_width, i_scale_count, p_pic_start,
+ * i_right_margin) are presumably consumed by name inside the SCALE_WIDTH /
+ * SCALE_HEIGHT and conversion macros, so none of them may be renamed.
+ *****************************************************************************/
+VLC_TARGET
+void I420_B8G8R8A8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
+{
+    /* Destination pixel cursor and source plane cursors */
+    uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
+    uint8_t *p_y = p_src->Y_PIXELS;
+    uint8_t *p_u = p_src->U_PIXELS;
+    uint8_t *p_v = p_src->V_PIXELS;
+
+    bool b_hscale; /* horizontal scaling type */
+    unsigned int i_vscale; /* vertical scaling type */
+    unsigned int i_x, i_y; /* horizontal and vertical indexes */
+
+    int i_right_margin;
+    int i_rewind;
+    int i_scale_count; /* scale modulo counter */
+    int i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
+    uint32_t * p_pic_start; /* beginning of the current line for copy */
+    /* Conversion buffer pointer */
+    uint32_t * p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
+    uint32_t * p_buffer;
+
+    /* Offset array pointer */
+    int * p_offset_start = p_filter->p_sys->p_offset;
+    int * p_offset;
+
+    /* Padding between the visible part of a row and the next row */
+    const int i_source_margin = p_src->p[0].i_pitch
+                              - p_src->p[0].i_visible_pitch;
+    const int i_source_margin_c = p_src->p[1].i_pitch
+                                - p_src->p[1].i_visible_pitch;
+
+    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
+
+    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
+     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
+     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
+    SetOffset( p_filter->fmt_in.video.i_width,
+               p_filter->fmt_in.video.i_height,
+               p_filter->fmt_out.video.i_width,
+               p_filter->fmt_out.video.i_height,
+               &b_hscale, &i_vscale, p_offset_start );
+
+    /*
+     * Perform conversion
+     */
+    i_scale_count = ( i_vscale == 1 ) ?
+                    p_filter->fmt_out.video.i_height :
+                    p_filter->fmt_in.video.i_height;
+
+#ifdef SSE2
+
+    /* Number of pixels missing to reach the next multiple of 16 */
+    i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
+
+    /*
+    ** SSE2 128 bits fetch/store instructions are faster
+    ** if memory access is 16 bytes aligned
+    */
+
+    /* Convert into the scratch buffer when scaling horizontally, else
+     * straight into the destination line */
+    p_buffer = b_hscale ? p_buffer_start : p_pic;
+    if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
+                    p_dest->p->i_pitch|
+                    ((intptr_t)p_y)|
+                    ((intptr_t)p_buffer))) )
+    {
+        /* use faster SSE2 aligned fetch and store */
+        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+        {
+            p_pic_start = p_pic;
+
+            for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
+            {
+                SSE2_CALL (
+                    SSE2_INIT_32_ALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_BGRA_ALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+                p_buffer += 16;
+            }
+
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+                SSE2_CALL (
+                    SSE2_INIT_32_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_BGRA_UNALIGNED
+                );
+                /* This pass handled 16 luma samples, i.e. 8 samples of each
+                 * chroma plane: advance U/V by 8 exactly like the main loop,
+                 * otherwise chroma lags 4 samples further behind luma on
+                 * every row whose width is not a multiple of 16. */
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 4 );
+
+            /* Skip the source padding; chroma rows are only left behind
+             * after odd luma rows */
+            p_y += i_source_margin;
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+    else
+    {
+        /* use slower SSE2 unaligned fetch and store */
+        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+        {
+            p_pic_start = p_pic;
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+
+            for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
+            {
+                SSE2_CALL (
+                    SSE2_INIT_32_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_BGRA_UNALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+                p_buffer += 16;
+            }
+
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+                SSE2_CALL (
+                    SSE2_INIT_32_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_BGRA_UNALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 4 );
+
+            /* Skip the source padding; chroma rows are only left behind
+             * after odd luma rows */
+            p_y += i_source_margin;
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+
+    /* make sure all SSE2 stores are visible thereafter; the SSE2 branch of
+     * this converter previously returned without issuing SSE2_END */
+    SSE2_END;
+
+#else
+
+    /* Number of pixels missing to reach the next multiple of 8 */
+    i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
+
+    for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+    {
+        p_pic_start = p_pic;
+        p_buffer = b_hscale ? p_buffer_start : p_pic;
+
+        for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
+        {
+            MMX_CALL (
+                MMX_INIT_32
+                MMX_YUV_MUL
+                MMX_YUV_ADD
+                MMX_UNPACK_32_BGRA
+            );
+            p_y += 8;
+            p_u += 4;
+            p_v += 4;
+            p_buffer += 8;
+        }
+
+        /* Here we do some unaligned reads and duplicate conversions, but
+         * at least we have all the pixels */
+        if( i_rewind )
+        {
+            p_y -= i_rewind;
+            p_u -= i_rewind >> 1;
+            p_v -= i_rewind >> 1;
+            p_buffer -= i_rewind;
+            MMX_CALL (
+                MMX_INIT_32
+                MMX_YUV_MUL
+                MMX_YUV_ADD
+                MMX_UNPACK_32_BGRA
+            );
+            p_y += 8;
+            p_u += 4;
+            p_v += 4;
+            p_buffer += 8;
+        }
+        SCALE_WIDTH;
+        SCALE_HEIGHT( 420, 4 );
+
+        /* Skip the source padding; chroma rows are only left behind
+         * after odd luma rows */
+        p_y += i_source_margin;
+        if( i_y % 2 )
+        {
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+        }
+    }
+
+    /* re-enable FPU registers */
+    MMX_END;
+
+#endif
+}
+
+/*****************************************************************************
+ * I420_A8B8G8R8: convert a planar I420 picture to 32-bit packed pixels
+ *****************************************************************************
+ * p_filter: supplies the input/output dimensions, the scratch line buffer
+ *           (p_sys->p_buffer) and the horizontal offset table
+ *           (p_sys->p_offset).
+ * p_src:    source picture; its Y, U and V planes are read.
+ * p_dest:   destination picture; its first plane is written.
+ *
+ * The component layout is the one produced by the *_UNPACK_32_ABGR macros
+ * (defined outside this section of the file).
+ *
+ * With SSE2 defined, each pass consumes 16 luma and 8+8 chroma samples;
+ * otherwise the MMX macros consume 8 luma and 4+4 chroma samples per pass.
+ * When the width is not a multiple of the pass size, a final pass is run on
+ * the last full group of pixels of the row, re-converting a few of them.
+ *
+ * NOTE(review): the locals declared below that are never referenced in the
+ * visible statements (p_offset, i_chroma_width, i_scale_count, p_pic_start,
+ * i_right_margin) are presumably consumed by name inside the SCALE_WIDTH /
+ * SCALE_HEIGHT and conversion macros, so none of them may be renamed.
+ *****************************************************************************/
+VLC_TARGET
+void I420_A8B8G8R8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
+{
+    /* Destination pixel cursor and source plane cursors */
+    uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
+    uint8_t *p_y = p_src->Y_PIXELS;
+    uint8_t *p_u = p_src->U_PIXELS;
+    uint8_t *p_v = p_src->V_PIXELS;
+
+    bool b_hscale; /* horizontal scaling type */
+    unsigned int i_vscale; /* vertical scaling type */
+    unsigned int i_x, i_y; /* horizontal and vertical indexes */
+
+    int i_right_margin;
+    int i_rewind;
+    int i_scale_count; /* scale modulo counter */
+    int i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
+    uint32_t * p_pic_start; /* beginning of the current line for copy */
+    /* Conversion buffer pointer */
+    uint32_t * p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
+    uint32_t * p_buffer;
+
+    /* Offset array pointer */
+    int * p_offset_start = p_filter->p_sys->p_offset;
+    int * p_offset;
+
+    /* Padding between the visible part of a row and the next row */
+    const int i_source_margin = p_src->p[0].i_pitch
+                              - p_src->p[0].i_visible_pitch;
+    const int i_source_margin_c = p_src->p[1].i_pitch
+                                - p_src->p[1].i_visible_pitch;
+
+    i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
+
+    /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
+     * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
+     * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
+    SetOffset( p_filter->fmt_in.video.i_width,
+               p_filter->fmt_in.video.i_height,
+               p_filter->fmt_out.video.i_width,
+               p_filter->fmt_out.video.i_height,
+               &b_hscale, &i_vscale, p_offset_start );
+
+    /*
+     * Perform conversion
+     */
+    i_scale_count = ( i_vscale == 1 ) ?
+                    p_filter->fmt_out.video.i_height :
+                    p_filter->fmt_in.video.i_height;
+
+#ifdef SSE2
+
+    /* Number of pixels missing to reach the next multiple of 16 */
+    i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
+
+    /*
+    ** SSE2 128 bits fetch/store instructions are faster
+    ** if memory access is 16 bytes aligned
+    */
+
+    /* Convert into the scratch buffer when scaling horizontally, else
+     * straight into the destination line */
+    p_buffer = b_hscale ? p_buffer_start : p_pic;
+    if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
+                    p_dest->p->i_pitch|
+                    ((intptr_t)p_y)|
+                    ((intptr_t)p_buffer))) )
+    {
+        /* use faster SSE2 aligned fetch and store */
+        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+        {
+            p_pic_start = p_pic;
+
+            for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
+            {
+                SSE2_CALL (
+                    SSE2_INIT_32_ALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_ABGR_ALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+                p_buffer += 16;
+            }
+
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+                SSE2_CALL (
+                    SSE2_INIT_32_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_ABGR_UNALIGNED
+                );
+                /* This pass handled 16 luma samples, i.e. 8 samples of each
+                 * chroma plane: advance U/V by 8 exactly like the main loop,
+                 * otherwise chroma lags 4 samples further behind luma on
+                 * every row whose width is not a multiple of 16. */
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 4 );
+
+            /* Skip the source padding; chroma rows are only left behind
+             * after odd luma rows */
+            p_y += i_source_margin;
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+    else
+    {
+        /* use slower SSE2 unaligned fetch and store */
+        for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+        {
+            p_pic_start = p_pic;
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+
+            for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
+            {
+                SSE2_CALL (
+                    SSE2_INIT_32_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_ABGR_UNALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+                p_buffer += 16;
+            }
+
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+                SSE2_CALL (
+                    SSE2_INIT_32_UNALIGNED
+                    SSE2_YUV_MUL
+                    SSE2_YUV_ADD
+                    SSE2_UNPACK_32_ABGR_UNALIGNED
+                );
+                p_y += 16;
+                p_u += 8;
+                p_v += 8;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 4 );
+
+            /* Skip the source padding; chroma rows are only left behind
+             * after odd luma rows */
+            p_y += i_source_margin;
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+
+    /* make sure all SSE2 stores are visible thereafter; the SSE2 branch of
+     * this converter previously returned without issuing SSE2_END */
+    SSE2_END;
+
+#else
+
+    /* Number of pixels missing to reach the next multiple of 8 */
+    i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
+
+    for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
+    {
+        p_pic_start = p_pic;
+        p_buffer = b_hscale ? p_buffer_start : p_pic;
+
+        for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
+        {
+            MMX_CALL (
+                MMX_INIT_32
+                MMX_YUV_MUL
+                MMX_YUV_ADD
+                MMX_UNPACK_32_ABGR
+            );
+            p_y += 8;
+            p_u += 4;
+            p_v += 4;
+            p_buffer += 8;
+        }
+
+        /* Here we do some unaligned reads and duplicate conversions, but
+         * at least we have all the pixels */
+        if( i_rewind )
+        {
+            p_y -= i_rewind;
+            p_u -= i_rewind >> 1;
+            p_v -= i_rewind >> 1;
+            p_buffer -= i_rewind;
+            MMX_CALL (
+                MMX_INIT_32
+                MMX_YUV_MUL
+                MMX_YUV_ADD
+                MMX_UNPACK_32_ABGR
+            );
+            p_y += 8;
+            p_u += 4;
+            p_v += 4;
+            p_buffer += 8;
+        }
+        SCALE_WIDTH;
+        SCALE_HEIGHT( 420, 4 );
+
+        /* Skip the source padding; chroma rows are only left behind
+         * after odd luma rows */
+        p_y += i_source_margin;
+        if( i_y % 2 )
+        {
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+        }
+    }
+
+    /* re-enable FPU registers */
+    MMX_END;
+
+#endif
+}