X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=modules%2Fvideo_chroma%2Fi420_yuy2.c;h=57a7af6de059360a458a136885b2d805da2ebf01;hb=ebb2d8d15c5c1d2aa6ab1627ea30c17142bceca0;hp=ecfc330684d83c05b2c0c38b639f4f29af7f047c;hpb=ba0aca83c8fac83f81ca495f2160f98929b3f99f;p=vlc diff --git a/modules/video_chroma/i420_yuy2.c b/modules/video_chroma/i420_yuy2.c index ecfc330684..57a7af6de0 100644 --- a/modules/video_chroma/i420_yuy2.c +++ b/modules/video_chroma/i420_yuy2.c @@ -5,6 +5,7 @@ * $Id$ * * Authors: Samuel Hocevar + * Damien Fouilleul * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -24,11 +25,14 @@ /***************************************************************************** * Preamble *****************************************************************************/ -#include /* strerror() */ -#include /* malloc(), free() */ -#include -#include +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include +#include +#include #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H) # include @@ -42,6 +46,8 @@ # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211" #elif defined (MODULE_NAME_IS_i420_yuy2_mmx) # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv" +#elif defined (MODULE_NAME_IS_i420_yuy2_sse2) +# define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv" #elif defined (MODULE_NAME_IS_i420_yuy2_altivec) # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422" #endif @@ -51,44 +57,49 @@ *****************************************************************************/ static int Activate ( vlc_object_t * ); -static void I420_YUY2 ( vout_thread_t *, picture_t *, picture_t * ); -static void I420_YVYU ( vout_thread_t *, picture_t *, picture_t * ); -static void I420_UYVY ( vout_thread_t *, picture_t *, picture_t * ); +static void I420_YUY2 ( filter_t *, picture_t *, picture_t * ); +static void I420_YVYU ( filter_t *, picture_t *, picture_t * ); +static void I420_UYVY ( filter_t *, picture_t *, picture_t * ); +static picture_t *I420_YUY2_Filter ( filter_t *, picture_t * ); +static picture_t *I420_YVYU_Filter ( filter_t *, picture_t * ); +static picture_t *I420_UYVY_Filter ( filter_t *, picture_t * ); #if !defined (MODULE_NAME_IS_i420_yuy2_altivec) -static void I420_IUYV ( vout_thread_t *, picture_t *, picture_t * ); -static void I420_cyuv ( vout_thread_t *, picture_t *, picture_t * ); +static void I420_IUYV ( filter_t *, picture_t *, picture_t * ); +static void I420_cyuv ( filter_t *, picture_t *, picture_t * ); +static picture_t *I420_IUYV_Filter ( filter_t *, picture_t * ); +static picture_t *I420_cyuv_Filter ( filter_t *, picture_t * ); #endif #if defined (MODULE_NAME_IS_i420_yuy2) -static void I420_Y211 ( vout_thread_t *, picture_t *, picture_t * ); +static void I420_Y211 ( filter_t *, picture_t *, picture_t * ); +static picture_t *I420_Y211_Filter ( filter_t *, picture_t * ); #endif #ifdef MODULE_NAME_IS_i420_yuy2_mmx -static uint64_t i_00ffw; -static uint64_t i_80w; +/* Initialize MMX-specific constants */ +static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL; +static const uint64_t i_80w = 0x0000000080808080ULL; #endif /***************************************************************************** * Module descriptor. *****************************************************************************/ -vlc_module_begin(); +vlc_module_begin () #if defined (MODULE_NAME_IS_i420_yuy2) - set_description( _("Conversions from " SRC_FOURCC " to " DEST_FOURCC) ); - set_capability( "chroma", 80 ); + set_description( N_("Conversions from " SRC_FOURCC " to " DEST_FOURCC) ) + set_capability( "video filter2", 80 ) #elif defined (MODULE_NAME_IS_i420_yuy2_mmx) - set_description( _("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) ); - set_capability( "chroma", 100 ); - add_requirement( MMX ); - /* Initialize MMX-specific constants */ - i_00ffw = 0x00ff00ff00ff00ffULL; - i_80w = 0x0000000080808080ULL; + set_description( N_("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) ) + set_capability( "video filter2", 160 ) +#elif defined (MODULE_NAME_IS_i420_yuy2_sse2) + set_description( N_("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) ) + set_capability( "video filter2", 250 ) #elif defined (MODULE_NAME_IS_i420_yuy2_altivec) set_description( _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) ); - set_capability( "chroma", 100 ); - add_requirement( ALTIVEC ); + set_capability( "video filter2", 250 ) #endif - set_callbacks( Activate, NULL ); -vlc_module_end(); + set_callbacks( Activate, NULL ) +vlc_module_end () /***************************************************************************** * Activate: allocate a chroma function @@ -97,48 +108,48 @@ vlc_module_end(); *****************************************************************************/ static int Activate( vlc_object_t *p_this ) { - vout_thread_t *p_vout = (vout_thread_t *)p_this; + filter_t *p_filter = (filter_t *)p_this; - if( p_vout->render.i_width & 1 || p_vout->render.i_height & 1 ) + if( p_filter->fmt_in.video.i_width & 1 + || p_filter->fmt_in.video.i_height & 1 ) { return -1; } - switch( p_vout->render.i_chroma ) + if( p_filter->fmt_in.video.i_width != p_filter->fmt_out.video.i_width + || p_filter->fmt_in.video.i_height != p_filter->fmt_out.video.i_height ) + return -1; + + switch( p_filter->fmt_in.video.i_chroma ) { - case VLC_FOURCC('Y','V','1','2'): - case VLC_FOURCC('I','4','2','0'): - case VLC_FOURCC('I','Y','U','V'): - switch( p_vout->output.i_chroma ) + case VLC_CODEC_YV12: + case VLC_CODEC_I420: + switch( p_filter->fmt_out.video.i_chroma ) { - case VLC_FOURCC('Y','U','Y','2'): - case VLC_FOURCC('Y','U','N','V'): - p_vout->chroma.pf_convert = I420_YUY2; + case VLC_CODEC_YUYV: + p_filter->pf_video_filter = I420_YUY2_Filter; break; - case VLC_FOURCC('Y','V','Y','U'): - p_vout->chroma.pf_convert = I420_YVYU; + case VLC_CODEC_YVYU: + p_filter->pf_video_filter = I420_YVYU_Filter; break; - case VLC_FOURCC('U','Y','V','Y'): - case VLC_FOURCC('U','Y','N','V'): - case VLC_FOURCC('Y','4','2','2'): - p_vout->chroma.pf_convert = I420_UYVY; + case VLC_CODEC_UYVY: + p_filter->pf_video_filter = I420_UYVY_Filter; break; - #if !defined (MODULE_NAME_IS_i420_yuy2_altivec) case VLC_FOURCC('I','U','Y','V'): - p_vout->chroma.pf_convert = I420_IUYV; + p_filter->pf_video_filter = I420_IUYV_Filter; break; - case VLC_FOURCC('c','y','u','v'): - p_vout->chroma.pf_convert = I420_cyuv; + case VLC_CODEC_CYUV: + p_filter->pf_video_filter = I420_cyuv_Filter; break; #endif #if defined (MODULE_NAME_IS_i420_yuy2) - case VLC_FOURCC('Y','2','1','1'): - p_vout->chroma.pf_convert = I420_Y211; + case VLC_CODEC_Y211: + p_filter->pf_video_filter = I420_Y211_Filter; break; #endif @@ -154,13 +165,34 @@ static int Activate( vlc_object_t *p_this ) return 0; } +#if 0 +static inline unsigned long long read_cycles(void) +{ + unsigned long long v; + __asm__ __volatile__("rdtsc" : "=A" (v): ); + + return v; +} +#endif + /* Following functions are local */ +VIDEO_FILTER_WRAPPER( I420_YUY2 ) +VIDEO_FILTER_WRAPPER( I420_YVYU ) +VIDEO_FILTER_WRAPPER( I420_UYVY ) +#if !defined (MODULE_NAME_IS_i420_yuy2_altivec) +VIDEO_FILTER_WRAPPER( I420_IUYV ) +VIDEO_FILTER_WRAPPER( I420_cyuv ) +#endif +#if defined (MODULE_NAME_IS_i420_yuy2) +VIDEO_FILTER_WRAPPER( I420_Y211 ) +#endif + /***************************************************************************** * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2 *****************************************************************************/ -static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source, - picture_t *p_dest ) +static void I420_YUY2( filter_t *p_filter, picture_t *p_source, + picture_t *p_dest ) { uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels; uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS; @@ -194,14 +226,14 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source, vector unsigned char uv_vec; vector unsigned char y_vec; - if( !( ( p_vout->render.i_width % 32 ) | - ( p_vout->render.i_height % 2 ) ) ) + if( !( ( p_filter->fmt_in.video.i_width % 32 ) | + ( p_filter->fmt_in.video.i_height % 2 ) ) ) { /* Width is a multiple of 32, we take 2 lines at a time */ - for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) { VEC_NEXT_LINES( ); - for( i_x = p_vout->render.i_width / 32 ; i_x-- ; ) + for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; ) { VEC_LOAD_UV( ); VEC_MERGE( vec_mergeh ); @@ -209,15 +241,15 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source, } } } - else if( !( ( p_vout->render.i_width % 16 ) | - ( p_vout->render.i_height % 4 ) ) ) + else if( !( ( p_filter->fmt_in.video.i_width % 16 ) | + ( p_filter->fmt_in.video.i_height % 4 ) ) ) { /* Width is only a multiple of 16, we take 4 lines at a time */ - for( i_y = p_vout->render.i_height / 4 ; i_y-- ; ) + for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; ) { /* Line 1 and 2, pixels 0 to ( width - 16 ) */ VEC_NEXT_LINES( ); - for( i_x = p_vout->render.i_width / 32 ; i_x-- ; ) + for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; ) { VEC_LOAD_UV( ); VEC_MERGE( vec_mergeh ); @@ -233,7 +265,7 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source, VEC_MERGE( vec_mergel ); /* Line 3 and 4, pixels 16 to ( width ) */ - for( i_x = p_vout->render.i_width / 32 ; i_x-- ; ) + for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; ) { VEC_LOAD_UV( ); VEC_MERGE( vec_mergeh ); @@ -256,7 +288,8 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source, const int i_dest_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; - for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) +#if !defined(MODULE_NAME_IS_i420_yuy2_sse2) + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) { p_line1 = p_line2; p_line2 += p_dest->p->i_pitch; @@ -265,20 +298,23 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source, p_y2 += p_source->p[Y_PLANE].i_pitch; #if !defined (MODULE_NAME_IS_i420_yuy2_mmx) - for( i_x = p_vout->render.i_width / 2 ; i_x-- ; ) + for( i_x = p_filter->fmt_in.video.i_width / 8; i_x-- ; ) { C_YUV420_YUYV( ); + C_YUV420_YUYV( ); + C_YUV420_YUYV( ); + C_YUV420_YUYV( ); } #else - for( i_x = p_vout->render.i_width / 8 ; i_x-- ; ) + for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; ) { MMX_CALL( MMX_YUV420_YUYV ); } - for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; ) +#endif + for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; ) { C_YUV420_YUYV( ); } -#endif p_y1 += i_source_margin; p_y2 += i_source_margin; @@ -288,16 +324,89 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source, p_line2 += i_dest_margin; } +#if defined (MODULE_NAME_IS_i420_yuy2_mmx) + /* re-enable FPU registers */ + MMX_END; +#endif + #if defined (MODULE_NAME_IS_i420_yuy2_altivec) } #endif + +#else // defined(MODULE_NAME_IS_i420_yuy2_sse2) + /* + ** SSE2 128 bits fetch/store instructions are faster + ** if memory access is 16 bytes aligned + */ + + if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch| + ((intptr_t)p_line2|(intptr_t)p_y2))) ) + { + /* use faster SSE2 aligned fetch and store */ + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) + { + p_line1 = p_line2; + p_line2 += p_dest->p->i_pitch; + + p_y1 = p_y2; + p_y2 += p_source->p[Y_PLANE].i_pitch; + + for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; ) + { + SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED ); + } + for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; ) + { + C_YUV420_YUYV( ); + } + + p_y1 += i_source_margin; + p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + p_line1 += i_dest_margin; + p_line2 += i_dest_margin; + } + } + else + { + /* use slower SSE2 unaligned fetch and store */ + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) + { + p_line1 = p_line2; + p_line2 += p_dest->p->i_pitch; + + p_y1 = p_y2; + p_y2 += p_source->p[Y_PLANE].i_pitch; + + for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; ) + { + SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED ); + } + for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; ) + { + C_YUV420_YUYV( ); + } + + p_y1 += i_source_margin; + p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + p_line1 += i_dest_margin; + p_line2 += i_dest_margin; + } + } + /* make sure all SSE2 stores are visible thereafter */ + SSE2_END; + +#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2) } /***************************************************************************** * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2 *****************************************************************************/ -static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, - picture_t *p_dest ) +static void I420_YVYU( filter_t *p_filter, picture_t *p_source, + picture_t *p_dest ) { uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels; uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS; @@ -331,14 +440,14 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, vector unsigned char vu_vec; vector unsigned char y_vec; - if( !( ( p_vout->render.i_width % 32 ) | - ( p_vout->render.i_height % 2 ) ) ) + if( !( ( p_filter->fmt_in.video.i_width % 32 ) | + ( p_filter->fmt_in.video.i_height % 2 ) ) ) { /* Width is a multiple of 32, we take 2 lines at a time */ - for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) { VEC_NEXT_LINES( ); - for( i_x = p_vout->render.i_width / 32 ; i_x-- ; ) + for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; ) { VEC_LOAD_UV( ); VEC_MERGE( vec_mergeh ); @@ -346,15 +455,15 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, } } } - else if( !( ( p_vout->render.i_width % 16 ) | - ( p_vout->render.i_height % 4 ) ) ) + else if( !( ( p_filter->fmt_in.video.i_width % 16 ) | + ( p_filter->fmt_in.video.i_height % 4 ) ) ) { /* Width is only a multiple of 16, we take 4 lines at a time */ - for( i_y = p_vout->render.i_height / 4 ; i_y-- ; ) + for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; ) { /* Line 1 and 2, pixels 0 to ( width - 16 ) */ VEC_NEXT_LINES( ); - for( i_x = p_vout->render.i_width / 32 ; i_x-- ; ) + for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; ) { VEC_LOAD_UV( ); VEC_MERGE( vec_mergeh ); @@ -370,7 +479,7 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, VEC_MERGE( vec_mergel ); /* Line 3 and 4, pixels 16 to ( width ) */ - for( i_x = p_vout->render.i_width / 32 ; i_x-- ; ) + for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; ) { VEC_LOAD_UV( ); VEC_MERGE( vec_mergeh ); @@ -393,7 +502,8 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, const int i_dest_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; - for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) +#if !defined(MODULE_NAME_IS_i420_yuy2_sse2) + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) { p_line1 = p_line2; p_line2 += p_dest->p->i_pitch; @@ -401,7 +511,7 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, p_y1 = p_y2; p_y2 += p_source->p[Y_PLANE].i_pitch; - for( i_x = p_vout->render.i_width / 8 ; i_x-- ; ) + for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; ) { #if !defined (MODULE_NAME_IS_i420_yuy2_mmx) C_YUV420_YVYU( ); @@ -412,6 +522,10 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, MMX_CALL( MMX_YUV420_YVYU ); #endif } + for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; ) + { + C_YUV420_YVYU( ); + } p_y1 += i_source_margin; p_y2 += i_source_margin; @@ -420,16 +534,88 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, p_line1 += i_dest_margin; p_line2 += i_dest_margin; } + +#if defined (MODULE_NAME_IS_i420_yuy2_mmx) + /* re-enable FPU registers */ + MMX_END; +#endif + #if defined (MODULE_NAME_IS_i420_yuy2_altivec) } #endif + +#else // defined(MODULE_NAME_IS_i420_yuy2_sse2) + /* + ** SSE2 128 bits fetch/store instructions are faster + ** if memory access is 16 bytes aligned + */ + if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch| + ((intptr_t)p_line2|(intptr_t)p_y2))) ) + { + /* use faster SSE2 aligned fetch and store */ + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) + { + p_line1 = p_line2; + p_line2 += p_dest->p->i_pitch; + + p_y1 = p_y2; + p_y2 += p_source->p[Y_PLANE].i_pitch; + + for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; ) + { + SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED ); + } + for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; ) + { + C_YUV420_YVYU( ); + } + + p_y1 += i_source_margin; + p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + p_line1 += i_dest_margin; + p_line2 += i_dest_margin; + } + } + else + { + /* use slower SSE2 unaligned fetch and store */ + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) + { + p_line1 = p_line2; + p_line2 += p_dest->p->i_pitch; + + p_y1 = p_y2; + p_y2 += p_source->p[Y_PLANE].i_pitch; + + for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; ) + { + SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED ); + } + for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; ) + { + C_YUV420_YVYU( ); + } + + p_y1 += i_source_margin; + p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + p_line1 += i_dest_margin; + p_line2 += i_dest_margin; + } + } + /* make sure all SSE2 stores are visible thereafter */ + SSE2_END; +#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2) } /***************************************************************************** * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2 *****************************************************************************/ -static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source, - picture_t *p_dest ) +static void I420_UYVY( filter_t *p_filter, picture_t *p_source, + picture_t *p_dest ) { uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels; uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS; @@ -463,14 +649,14 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source, vector unsigned char uv_vec; vector unsigned char y_vec; - if( !( ( p_vout->render.i_width % 32 ) | - ( p_vout->render.i_height % 2 ) ) ) + if( !( ( p_filter->fmt_in.video.i_width % 32 ) | + ( p_filter->fmt_in.video.i_height % 2 ) ) ) { /* Width is a multiple of 32, we take 2 lines at a time */ - for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) { VEC_NEXT_LINES( ); - for( i_x = p_vout->render.i_width / 32 ; i_x-- ; ) + for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; ) { VEC_LOAD_UV( ); VEC_MERGE( vec_mergeh ); @@ -478,15 +664,15 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source, } } } - else if( !( ( p_vout->render.i_width % 16 ) | - ( p_vout->render.i_height % 4 ) ) ) + else if( !( ( p_filter->fmt_in.video.i_width % 16 ) | + ( p_filter->fmt_in.video.i_height % 4 ) ) ) { /* Width is only a multiple of 16, we take 4 lines at a time */ - for( i_y = p_vout->render.i_height / 4 ; i_y-- ; ) + for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; ) { /* Line 1 and 2, pixels 0 to ( width - 16 ) */ VEC_NEXT_LINES( ); - for( i_x = p_vout->render.i_width / 32 ; i_x-- ; ) + for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; ) { VEC_LOAD_UV( ); VEC_MERGE( vec_mergeh ); @@ -502,7 +688,7 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source, VEC_MERGE( vec_mergel ); /* Line 3 and 4, pixels 16 to ( width ) */ - for( i_x = p_vout->render.i_width / 32 ; i_x-- ; ) + for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; ) { VEC_LOAD_UV( ); VEC_MERGE( vec_mergeh ); @@ -525,7 +711,8 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source, const int i_dest_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; - for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) +#if !defined(MODULE_NAME_IS_i420_yuy2_sse2) + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) { p_line1 = p_line2; p_line2 += p_dest->p->i_pitch; @@ -533,7 +720,7 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source, p_y1 = p_y2; p_y2 += p_source->p[Y_PLANE].i_pitch; - for( i_x = p_vout->render.i_width / 8 ; i_x-- ; ) + for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; ) { #if !defined (MODULE_NAME_IS_i420_yuy2_mmx) C_YUV420_UYVY( ); @@ -544,7 +731,7 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source, MMX_CALL( MMX_YUV420_UYVY ); #endif } - for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x--; ) + for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x--; ) { C_YUV420_UYVY( ); } @@ -558,30 +745,98 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source, } #if defined (MODULE_NAME_IS_i420_yuy2_mmx) - __asm__ __volatile__("emms" :: ); + /* re-enable FPU registers */ + MMX_END; #endif #if defined (MODULE_NAME_IS_i420_yuy2_altivec) } #endif + +#else // defined(MODULE_NAME_IS_i420_yuy2_sse2) + /* + ** SSE2 128 bits fetch/store instructions are faster + ** if memory access is 16 bytes aligned + */ + if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch| + ((intptr_t)p_line2|(intptr_t)p_y2))) ) + { + /* use faster SSE2 aligned fetch and store */ + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) + { + p_line1 = p_line2; + p_line2 += p_dest->p->i_pitch; + + p_y1 = p_y2; + p_y2 += p_source->p[Y_PLANE].i_pitch; + + for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; ) + { + SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED ); + } + for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; ) + { + C_YUV420_UYVY( ); + } + + p_y1 += i_source_margin; + p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + p_line1 += i_dest_margin; + p_line2 += i_dest_margin; + } + } + else + { + /* use slower SSE2 unaligned fetch and store */ + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) + { + p_line1 = p_line2; + p_line2 += p_dest->p->i_pitch; + + p_y1 = p_y2; + p_y2 += p_source->p[Y_PLANE].i_pitch; + + for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; ) + { + SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED ); + } + for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; ) + { + C_YUV420_UYVY( ); + } + + p_y1 += i_source_margin; + p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + p_line1 += i_dest_margin; + p_line2 += i_dest_margin; + } + } + /* make sure all SSE2 stores are visible thereafter */ + SSE2_END; +#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2) } #if !defined (MODULE_NAME_IS_i420_yuy2_altivec) /***************************************************************************** * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2 *****************************************************************************/ -static void I420_IUYV( vout_thread_t *p_vout, picture_t *p_source, - picture_t *p_dest ) +static void I420_IUYV( filter_t *p_filter, picture_t *p_source, + picture_t *p_dest ) { + VLC_UNUSED(p_source); VLC_UNUSED(p_dest); /* FIXME: TODO ! */ - msg_Err( p_vout, "I420_IUYV unimplemented, please harass " ); + msg_Err( p_filter, "I420_IUYV unimplemented, please harass " ); } /***************************************************************************** * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2 *****************************************************************************/ -static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source, - picture_t *p_dest ) +static void I420_cyuv( filter_t *p_filter, picture_t *p_source, + picture_t *p_dest ) { uint8_t *p_line1 = p_dest->p->p_pixels + p_dest->p->i_visible_lines * p_dest->p->i_pitch @@ -601,7 +856,8 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source, const int i_dest_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; - for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) +#if !defined(MODULE_NAME_IS_i420_yuy2_sse2) + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) { p_line1 -= 3 * p_dest->p->i_pitch; p_line2 -= 3 * p_dest->p->i_pitch; @@ -609,9 +865,9 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source, p_y1 = p_y2; p_y2 += p_source->p[Y_PLANE].i_pitch; - for( i_x = p_vout->render.i_width / 8 ; i_x-- ; ) + for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; ) { -#if defined (MODULE_NAME_IS_i420_yuy2) +#if !defined (MODULE_NAME_IS_i420_yuy2_mmx) C_YUV420_UYVY( ); C_YUV420_UYVY( ); C_YUV420_UYVY( ); @@ -620,6 +876,10 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source, MMX_CALL( MMX_YUV420_UYVY ); #endif } + for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; ) + { + C_YUV420_UYVY( ); + } p_y1 += i_source_margin; p_y2 += i_source_margin; @@ -628,6 +888,77 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source, p_line1 += i_dest_margin; p_line2 += i_dest_margin; } + +#if defined (MODULE_NAME_IS_i420_yuy2_mmx) + /* re-enable FPU registers */ + MMX_END; +#endif + +#else // defined(MODULE_NAME_IS_i420_yuy2_sse2) + /* + ** SSE2 128 bits fetch/store instructions are faster + ** if memory access is 16 bytes aligned + */ + if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch| + ((intptr_t)p_line2|(intptr_t)p_y2))) ) + { + /* use faster SSE2 aligned fetch and store */ + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) + { + p_line1 = p_line2; + p_line2 += p_dest->p->i_pitch; + + p_y1 = p_y2; + p_y2 += p_source->p[Y_PLANE].i_pitch; + + for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; ) + { + SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED ); + } + for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; ) + { + C_YUV420_UYVY( ); + } + + p_y1 += i_source_margin; + p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + p_line1 += i_dest_margin; + p_line2 += i_dest_margin; + } + } + else + { + /* use slower SSE2 unaligned fetch and store */ + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) + { + p_line1 = p_line2; + p_line2 += p_dest->p->i_pitch; + + p_y1 = p_y2; + p_y2 += p_source->p[Y_PLANE].i_pitch; + + for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; ) + { + SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED ); + } + for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; ) + { + C_YUV420_UYVY( ); + } + + p_y1 += i_source_margin; + p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + p_line1 += i_dest_margin; + p_line2 += i_dest_margin; + } + } + /* make sure all SSE2 stores are visible thereafter */ + SSE2_END; +#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2) } #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec) @@ -635,8 +966,8 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source, * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1 *****************************************************************************/ #if defined (MODULE_NAME_IS_i420_yuy2) -static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source, - picture_t *p_dest ) +static void I420_Y211( filter_t *p_filter, picture_t *p_source, + picture_t *p_dest ) { uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels; uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS; @@ -652,7 +983,7 @@ static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source, const int i_dest_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; - for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) { p_line1 = p_line2; p_line2 += p_dest->p->i_pitch; @@ -660,7 +991,7 @@ static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source, p_y1 = p_y2; p_y2 += p_source->p[Y_PLANE].i_pitch; - for( i_x = p_vout->render.i_width / 8 ; i_x-- ; ) + for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; ) { C_YUV420_Y211( ); C_YUV420_Y211( ); @@ -675,4 +1006,3 @@ static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source, } } #endif -