/*****************************************************************************
* i420_yuy2.c : YUV to YUV conversion module for vlc
*****************************************************************************
- * Copyright (C) 2000, 2001 VideoLAN
+ * Copyright (C) 2000, 2001 the VideoLAN team
* $Id$
*
* Authors: Samuel Hocevar <sam@zoy.org>
+ * Damien Fouilleul <damien@videolan.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
*****************************************************************************/
/*****************************************************************************
* Preamble
*****************************************************************************/
-#include <string.h> /* strerror() */
-#include <stdlib.h> /* malloc(), free() */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
#include <vlc/vlc.h>
-#include <vlc/vout.h>
+#include <vlc_plugin.h>
+#include <vlc_vout.h>
-#ifdef HAVE_ALTIVEC_H
+#if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
# include <altivec.h>
#endif
# define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
#elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
# define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
+#elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
+# define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
#elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
-# define DEST_FOURCC "YUY2,YUNV"
+# define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
#endif
/*****************************************************************************
static int Activate ( vlc_object_t * );
static void I420_YUY2 ( vout_thread_t *, picture_t *, picture_t * );
-#if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
static void I420_YVYU ( vout_thread_t *, picture_t *, picture_t * );
static void I420_UYVY ( vout_thread_t *, picture_t *, picture_t * );
+#if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
static void I420_IUYV ( vout_thread_t *, picture_t *, picture_t * );
static void I420_cyuv ( vout_thread_t *, picture_t *, picture_t * );
#endif
#endif
#ifdef MODULE_NAME_IS_i420_yuy2_mmx
-static uint64_t i_00ffw;
-static uint64_t i_80w;
+/* MMX-specific constants, now compile-time const (they were previously
+ * plain statics assigned at runtime in the module activation code —
+ * see the assignments removed there). */
+static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
+static const uint64_t i_80w = 0x0000000080808080ULL;
#endif
/*****************************************************************************
set_description( _("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) );
set_capability( "chroma", 100 );
add_requirement( MMX );
- /* Initialize MMX-specific constants */
- i_00ffw = 0x00ff00ff00ff00ffULL;
- i_80w = 0x0000000080808080ULL;
+#elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
+ set_description( _("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) );
+ set_capability( "chroma", 120 );
+ add_requirement( SSE2 );
#elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
set_description(
_("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
p_vout->chroma.pf_convert = I420_YUY2;
break;
-#if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
case VLC_FOURCC('Y','V','Y','U'):
p_vout->chroma.pf_convert = I420_YVYU;
break;
case VLC_FOURCC('Y','4','2','2'):
p_vout->chroma.pf_convert = I420_UYVY;
break;
-
+#if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
case VLC_FOURCC('I','U','Y','V'):
p_vout->chroma.pf_convert = I420_IUYV;
break;
return 0;
}
-/* Following functions are local */
+#if 0
+/* Read the x86 time-stamp counter via the `rdtsc` instruction; the
+ * "=A" constraint returns the 64-bit result in edx:eax.  Profiling
+ * helper only — deliberately compiled out with #if 0. */
+static inline unsigned long long read_cycles(void)
+{
+    unsigned long long v;
+    __asm__ __volatile__("rdtsc" : "=A" (v): );
+    return v;
+}
+#endif
+
+/* Following functions are local */
/*****************************************************************************
* I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
*****************************************************************************/
#undef VEC_MERGE
#endif
- const int i_source_margin = p_source->p->i_pitch
- - p_source->p->i_visible_pitch;
+ const int i_source_margin = p_source->p[0].i_pitch
+ - p_source->p[0].i_visible_pitch;
+ const int i_source_margin_c = p_source->p[1].i_pitch
+ - p_source->p[1].i_visible_pitch;
const int i_dest_margin = p_dest->p->i_pitch
- p_dest->p->i_visible_pitch;
+#if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
{
p_line1 = p_line2;
p_y2 += p_source->p[Y_PLANE].i_pitch;
#if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
- for( i_x = p_vout->render.i_width / 2 ; i_x-- ; )
+ for( i_x = p_vout->render.i_width / 8; i_x-- ; )
{
C_YUV420_YUYV( );
+ C_YUV420_YUYV( );
+ C_YUV420_YUYV( );
+ C_YUV420_YUYV( );
}
#else
for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
{
MMX_CALL( MMX_YUV420_YUYV );
}
+#endif
for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
{
C_YUV420_YUYV( );
}
-#endif
p_y1 += i_source_margin;
p_y2 += i_source_margin;
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
p_line1 += i_dest_margin;
p_line2 += i_dest_margin;
}
+#if defined (MODULE_NAME_IS_i420_yuy2_mmx)
+ /* re-enable FPU registers */
+ MMX_END;
+#endif
+
#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
}
#endif
+
+#else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
+ /*
+ ** SSE2 128 bits fetch/store instructions are faster
+ ** if memory access is 16 bytes aligned
+ */
+
+ if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
+ ((intptr_t)p_line2|(intptr_t)p_y2))) )
+ {
+ /* use faster SSE2 aligned fetch and store */
+ for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+ {
+ p_line1 = p_line2;
+ p_line2 += p_dest->p->i_pitch;
+
+ p_y1 = p_y2;
+ p_y2 += p_source->p[Y_PLANE].i_pitch;
+
+ for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+ {
+ SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
+ }
+ for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+ {
+ C_YUV420_YUYV( );
+ }
+
+ p_y1 += i_source_margin;
+ p_y2 += i_source_margin;
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
+ p_line1 += i_dest_margin;
+ p_line2 += i_dest_margin;
+ }
+ }
+ else
+ {
+ /* use slower SSE2 unaligned fetch and store */
+ for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+ {
+ p_line1 = p_line2;
+ p_line2 += p_dest->p->i_pitch;
+
+ p_y1 = p_y2;
+ p_y2 += p_source->p[Y_PLANE].i_pitch;
+
+ for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+ {
+ SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
+ }
+ for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+ {
+ C_YUV420_YUYV( );
+ }
+
+ p_y1 += i_source_margin;
+ p_y2 += i_source_margin;
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
+ p_line1 += i_dest_margin;
+ p_line2 += i_dest_margin;
+ }
+ }
+ /* make sure all SSE2 stores are visible thereafter */
+ SSE2_END;
+
+#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
}
/*****************************************************************************
* I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
*****************************************************************************/
-#if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source,
picture_t *p_dest )
{
int i_x, i_y;
- const int i_source_margin = p_source->p->i_pitch
- - p_source->p->i_visible_pitch;
+#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
+#define VEC_NEXT_LINES( ) \
+ p_line1 = p_line2; \
+ p_line2 += p_dest->p->i_pitch; \
+ p_y1 = p_y2; \
+ p_y2 += p_source->p[Y_PLANE].i_pitch;
+
+#define VEC_LOAD_UV( ) \
+ u_vec = vec_ld( 0, p_u ); p_u += 16; \
+ v_vec = vec_ld( 0, p_v ); p_v += 16;
+
+#define VEC_MERGE( a ) \
+ vu_vec = a( v_vec, u_vec ); \
+ y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
+ vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
+ vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
+ y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
+ vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
+ vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
+
+ vector unsigned char u_vec;
+ vector unsigned char v_vec;
+ vector unsigned char vu_vec;
+ vector unsigned char y_vec;
+
+ if( !( ( p_vout->render.i_width % 32 ) |
+ ( p_vout->render.i_height % 2 ) ) )
+ {
+ /* Width is a multiple of 32, we take 2 lines at a time */
+ for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+ {
+ VEC_NEXT_LINES( );
+ for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
+ {
+ VEC_LOAD_UV( );
+ VEC_MERGE( vec_mergeh );
+ VEC_MERGE( vec_mergel );
+ }
+ }
+ }
+ else if( !( ( p_vout->render.i_width % 16 ) |
+ ( p_vout->render.i_height % 4 ) ) )
+ {
+ /* Width is only a multiple of 16, we take 4 lines at a time */
+ for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
+ {
+ /* Line 1 and 2, pixels 0 to ( width - 16 ) */
+ VEC_NEXT_LINES( );
+ for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
+ {
+ VEC_LOAD_UV( );
+ VEC_MERGE( vec_mergeh );
+ VEC_MERGE( vec_mergel );
+ }
+
+ /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
+ VEC_LOAD_UV( );
+ VEC_MERGE( vec_mergeh );
+
+ /* Line 3 and 4, pixels 0 to 16 */
+ VEC_NEXT_LINES( );
+ VEC_MERGE( vec_mergel );
+
+ /* Line 3 and 4, pixels 16 to ( width ) */
+ for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
+ {
+ VEC_LOAD_UV( );
+ VEC_MERGE( vec_mergeh );
+ VEC_MERGE( vec_mergel );
+ }
+ }
+ }
+ else
+ {
+ /* Crap, use the C version */
+#undef VEC_NEXT_LINES
+#undef VEC_LOAD_UV
+#undef VEC_MERGE
+#endif
+
+ const int i_source_margin = p_source->p[0].i_pitch
+ - p_source->p[0].i_visible_pitch;
+ const int i_source_margin_c = p_source->p[1].i_pitch
+ - p_source->p[1].i_visible_pitch;
const int i_dest_margin = p_dest->p->i_pitch
- p_dest->p->i_visible_pitch;
+#if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
{
p_line1 = p_line2;
for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
{
-#if defined (MODULE_NAME_IS_i420_yuy2)
+#if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
C_YUV420_YVYU( );
C_YUV420_YVYU( );
C_YUV420_YVYU( );
MMX_CALL( MMX_YUV420_YVYU );
#endif
}
+ for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
+ {
+ C_YUV420_YVYU( );
+ }
p_y1 += i_source_margin;
p_y2 += i_source_margin;
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
p_line1 += i_dest_margin;
p_line2 += i_dest_margin;
}
+
+#if defined (MODULE_NAME_IS_i420_yuy2_mmx)
+ /* re-enable FPU registers */
+ MMX_END;
+#endif
+
+#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
+ }
+#endif
+
+#else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
+ /*
+ ** SSE2 128 bits fetch/store instructions are faster
+ ** if memory access is 16 bytes aligned
+ */
+ if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
+ ((intptr_t)p_line2|(intptr_t)p_y2))) )
+ {
+ /* use faster SSE2 aligned fetch and store */
+ for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+ {
+ p_line1 = p_line2;
+ p_line2 += p_dest->p->i_pitch;
+
+ p_y1 = p_y2;
+ p_y2 += p_source->p[Y_PLANE].i_pitch;
+
+ for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+ {
+ SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
+ }
+ for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+ {
+ C_YUV420_YVYU( );
+ }
+
+ p_y1 += i_source_margin;
+ p_y2 += i_source_margin;
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
+ p_line1 += i_dest_margin;
+ p_line2 += i_dest_margin;
+ }
+ }
+ else
+ {
+ /* use slower SSE2 unaligned fetch and store */
+ for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+ {
+ p_line1 = p_line2;
+ p_line2 += p_dest->p->i_pitch;
+
+ p_y1 = p_y2;
+ p_y2 += p_source->p[Y_PLANE].i_pitch;
+
+ for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+ {
+ SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
+ }
+ for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+ {
+ C_YUV420_YVYU( );
+ }
+
+ p_y1 += i_source_margin;
+ p_y2 += i_source_margin;
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
+ p_line1 += i_dest_margin;
+ p_line2 += i_dest_margin;
+ }
+ }
+ /* make sure all SSE2 stores are visible thereafter */
+ SSE2_END;
+#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
}
/*****************************************************************************
int i_x, i_y;
- const int i_source_margin = p_source->p->i_pitch
- - p_source->p->i_visible_pitch;
+#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
+#define VEC_NEXT_LINES( ) \
+ p_line1 = p_line2; \
+ p_line2 += p_dest->p->i_pitch; \
+ p_y1 = p_y2; \
+ p_y2 += p_source->p[Y_PLANE].i_pitch;
+
+#define VEC_LOAD_UV( ) \
+ u_vec = vec_ld( 0, p_u ); p_u += 16; \
+ v_vec = vec_ld( 0, p_v ); p_v += 16;
+
+#define VEC_MERGE( a ) \
+ uv_vec = a( u_vec, v_vec ); \
+ y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
+ vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
+ vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
+ y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
+ vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
+ vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
+
+ vector unsigned char u_vec;
+ vector unsigned char v_vec;
+ vector unsigned char uv_vec;
+ vector unsigned char y_vec;
+
+ if( !( ( p_vout->render.i_width % 32 ) |
+ ( p_vout->render.i_height % 2 ) ) )
+ {
+ /* Width is a multiple of 32, we take 2 lines at a time */
+ for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+ {
+ VEC_NEXT_LINES( );
+ for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
+ {
+ VEC_LOAD_UV( );
+ VEC_MERGE( vec_mergeh );
+ VEC_MERGE( vec_mergel );
+ }
+ }
+ }
+ else if( !( ( p_vout->render.i_width % 16 ) |
+ ( p_vout->render.i_height % 4 ) ) )
+ {
+ /* Width is only a multiple of 16, we take 4 lines at a time */
+ for( i_y = p_vout->render.i_height / 4 ; i_y-- ; )
+ {
+ /* Line 1 and 2, pixels 0 to ( width - 16 ) */
+ VEC_NEXT_LINES( );
+ for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
+ {
+ VEC_LOAD_UV( );
+ VEC_MERGE( vec_mergeh );
+ VEC_MERGE( vec_mergel );
+ }
+
+ /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
+ VEC_LOAD_UV( );
+ VEC_MERGE( vec_mergeh );
+
+ /* Line 3 and 4, pixels 0 to 16 */
+ VEC_NEXT_LINES( );
+ VEC_MERGE( vec_mergel );
+
+ /* Line 3 and 4, pixels 16 to ( width ) */
+ for( i_x = p_vout->render.i_width / 32 ; i_x-- ; )
+ {
+ VEC_LOAD_UV( );
+ VEC_MERGE( vec_mergeh );
+ VEC_MERGE( vec_mergel );
+ }
+ }
+ }
+ else
+ {
+ /* Crap, use the C version */
+#undef VEC_NEXT_LINES
+#undef VEC_LOAD_UV
+#undef VEC_MERGE
+#endif
+
+ const int i_source_margin = p_source->p[0].i_pitch
+ - p_source->p[0].i_visible_pitch;
+ const int i_source_margin_c = p_source->p[1].i_pitch
+ - p_source->p[1].i_visible_pitch;
const int i_dest_margin = p_dest->p->i_pitch
- p_dest->p->i_visible_pitch;
+#if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
{
p_line1 = p_line2;
for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
{
-#if defined (MODULE_NAME_IS_i420_yuy2)
+#if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
C_YUV420_UYVY( );
C_YUV420_UYVY( );
C_YUV420_UYVY( );
MMX_CALL( MMX_YUV420_UYVY );
#endif
}
+ for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x--; )
+ {
+ C_YUV420_UYVY( );
+ }
p_y1 += i_source_margin;
p_y2 += i_source_margin;
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
p_line1 += i_dest_margin;
p_line2 += i_dest_margin;
}
+
+#if defined (MODULE_NAME_IS_i420_yuy2_mmx)
+ /* re-enable FPU registers */
+ MMX_END;
+#endif
+
+#if defined (MODULE_NAME_IS_i420_yuy2_altivec)
+ }
+#endif
+
+#else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
+ /*
+ ** SSE2 128 bits fetch/store instructions are faster
+ ** if memory access is 16 bytes aligned
+ */
+ if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
+ ((intptr_t)p_line2|(intptr_t)p_y2))) )
+ {
+ /* use faster SSE2 aligned fetch and store */
+ for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+ {
+ p_line1 = p_line2;
+ p_line2 += p_dest->p->i_pitch;
+
+ p_y1 = p_y2;
+ p_y2 += p_source->p[Y_PLANE].i_pitch;
+
+ for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+ {
+ SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
+ }
+ for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+ {
+ C_YUV420_UYVY( );
+ }
+
+ p_y1 += i_source_margin;
+ p_y2 += i_source_margin;
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
+ p_line1 += i_dest_margin;
+ p_line2 += i_dest_margin;
+ }
+ }
+ else
+ {
+ /* use slower SSE2 unaligned fetch and store */
+ for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+ {
+ p_line1 = p_line2;
+ p_line2 += p_dest->p->i_pitch;
+
+ p_y1 = p_y2;
+ p_y2 += p_source->p[Y_PLANE].i_pitch;
+
+ for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+ {
+ SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
+ }
+ for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+ {
+ C_YUV420_UYVY( );
+ }
+
+ p_y1 += i_source_margin;
+ p_y2 += i_source_margin;
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
+ p_line1 += i_dest_margin;
+ p_line2 += i_dest_margin;
+ }
+ }
+ /* make sure all SSE2 stores are visible thereafter */
+ SSE2_END;
+#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
}
+#if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
/*****************************************************************************
* I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
*****************************************************************************/
static void I420_IUYV( vout_thread_t *p_vout, picture_t *p_source,
                       picture_t *p_dest )
{
+    /* Stub: silence unused-parameter warnings until this conversion
+     * is actually implemented. */
+    VLC_UNUSED(p_source); VLC_UNUSED(p_dest);
    /* FIXME: TODO ! */
    msg_Err( p_vout, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
}
int i_x, i_y;
- const int i_source_margin = p_source->p->i_pitch
- - p_source->p->i_visible_pitch;
+ const int i_source_margin = p_source->p[0].i_pitch
+ - p_source->p[0].i_visible_pitch;
+ const int i_source_margin_c = p_source->p[1].i_pitch
+ - p_source->p[1].i_visible_pitch;
const int i_dest_margin = p_dest->p->i_pitch
- p_dest->p->i_visible_pitch;
+#if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
{
p_line1 -= 3 * p_dest->p->i_pitch;
for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
{
-#if defined (MODULE_NAME_IS_i420_yuy2)
+#if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
C_YUV420_UYVY( );
C_YUV420_UYVY( );
C_YUV420_UYVY( );
MMX_CALL( MMX_YUV420_UYVY );
#endif
}
+ for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
+ {
+ C_YUV420_UYVY( );
+ }
p_y1 += i_source_margin;
p_y2 += i_source_margin;
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
p_line1 += i_dest_margin;
p_line2 += i_dest_margin;
}
+
+#if defined (MODULE_NAME_IS_i420_yuy2_mmx)
+ /* re-enable FPU registers */
+ MMX_END;
+#endif
+
+#else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
+ /*
+ ** SSE2 128 bits fetch/store instructions are faster
+ ** if memory access is 16 bytes aligned
+ */
+ if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
+ ((intptr_t)p_line2|(intptr_t)p_y2))) )
+ {
+ /* use faster SSE2 aligned fetch and store */
+ for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+ {
+ p_line1 = p_line2;
+ p_line2 += p_dest->p->i_pitch;
+
+ p_y1 = p_y2;
+ p_y2 += p_source->p[Y_PLANE].i_pitch;
+
+ for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+ {
+ SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
+ }
+ for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+ {
+ C_YUV420_UYVY( );
+ }
+
+ p_y1 += i_source_margin;
+ p_y2 += i_source_margin;
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
+ p_line1 += i_dest_margin;
+ p_line2 += i_dest_margin;
+ }
+ }
+ else
+ {
+ /* use slower SSE2 unaligned fetch and store */
+ for( i_y = p_vout->render.i_height / 2 ; i_y-- ; )
+ {
+ p_line1 = p_line2;
+ p_line2 += p_dest->p->i_pitch;
+
+ p_y1 = p_y2;
+ p_y2 += p_source->p[Y_PLANE].i_pitch;
+
+ for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+ {
+ SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
+ }
+ for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+ {
+ C_YUV420_UYVY( );
+ }
+
+ p_y1 += i_source_margin;
+ p_y2 += i_source_margin;
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
+ p_line1 += i_dest_margin;
+ p_line2 += i_dest_margin;
+ }
+ }
+ /* make sure all SSE2 stores are visible thereafter */
+ SSE2_END;
+#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
}
#endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
int i_x, i_y;
- const int i_source_margin = p_source->p->i_pitch
- - p_source->p->i_visible_pitch;
+ const int i_source_margin = p_source->p[0].i_pitch
+ - p_source->p[0].i_visible_pitch;
+ const int i_source_margin_c = p_source->p[1].i_pitch
+ - p_source->p[1].i_visible_pitch;
const int i_dest_margin = p_dest->p->i_pitch
- p_dest->p->i_visible_pitch;
p_y1 += i_source_margin;
p_y2 += i_source_margin;
+ p_u += i_source_margin_c;
+ p_v += i_source_margin_c;
p_line1 += i_dest_margin;
p_line2 += i_dest_margin;
}
}
#endif
-