X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=modules%2Fvideo_chroma%2Fi420_yuy2.c;h=d9ffebee1fba48e41218e2b9afe9501766ed2892;hb=4e9597b800d1140dfab1cf33c3df8c608d58878f;hp=e42e0cace24b617879608b90d46eb84e8870d572;hpb=734d0a85b5ae5af4aef5e6ed96c84342cf553444;p=vlc diff --git a/modules/video_chroma/i420_yuy2.c b/modules/video_chroma/i420_yuy2.c index e42e0cace2..d9ffebee1f 100644 --- a/modules/video_chroma/i420_yuy2.c +++ b/modules/video_chroma/i420_yuy2.c @@ -1,10 +1,11 @@ /***************************************************************************** * i420_yuy2.c : YUV to YUV conversion module for vlc ***************************************************************************** - * Copyright (C) 2000, 2001 VideoLAN + * Copyright (C) 2000, 2001 the VideoLAN team * $Id$ * * Authors: Samuel Hocevar + * Damien Fouilleul * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -18,19 +19,23 @@ * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA. *****************************************************************************/ /***************************************************************************** * Preamble *****************************************************************************/ -#include /* strerror() */ -#include /* malloc(), free() */ -#include -#include +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include +#include +#include +#include -#ifdef HAVE_ALTIVEC_H +#if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H) # include #endif @@ -42,8 +47,10 @@ # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211" #elif defined (MODULE_NAME_IS_i420_yuy2_mmx) # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv" +#elif defined (MODULE_NAME_IS_i420_yuy2_sse2) +# define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv" #elif defined (MODULE_NAME_IS_i420_yuy2_altivec) -# define DEST_FOURCC "YUY2,YUNV" +# define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422" #endif /***************************************************************************** @@ -51,20 +58,27 @@ *****************************************************************************/ static int Activate ( vlc_object_t * ); -static void I420_YUY2 ( vout_thread_t *, picture_t *, picture_t * ); +static void I420_YUY2 ( filter_t *, picture_t *, picture_t * ); +static void I420_YVYU ( filter_t *, picture_t *, picture_t * ); +static void I420_UYVY ( filter_t *, picture_t *, picture_t * ); +static picture_t *I420_YUY2_Filter ( filter_t *, picture_t * ); +static picture_t *I420_YVYU_Filter ( filter_t *, picture_t * ); +static picture_t *I420_UYVY_Filter ( filter_t *, picture_t * ); #if !defined (MODULE_NAME_IS_i420_yuy2_altivec) -static void I420_YVYU ( vout_thread_t *, picture_t *, picture_t * ); -static void I420_UYVY ( vout_thread_t *, picture_t *, picture_t * ); -static void I420_IUYV ( vout_thread_t *, picture_t *, picture_t * ); -static void I420_cyuv ( vout_thread_t *, picture_t *, picture_t * ); +static void I420_IUYV ( filter_t *, picture_t *, picture_t * ); +static void I420_cyuv ( filter_t *, picture_t *, picture_t * ); +static picture_t *I420_IUYV_Filter ( filter_t *, picture_t * ); +static picture_t *I420_cyuv_Filter ( filter_t *, picture_t * ); #endif #if defined (MODULE_NAME_IS_i420_yuy2) -static void I420_Y211 ( vout_thread_t *, picture_t *, picture_t * ); +static void I420_Y211 ( filter_t *, picture_t *, picture_t * ); +static picture_t *I420_Y211_Filter ( filter_t *, picture_t * ); #endif #ifdef MODULE_NAME_IS_i420_yuy2_mmx -static uint64_t i_00ffw; -static uint64_t i_80w; +/* Initialize MMX-specific constants */ +static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL; +static const uint64_t i_80w = 0x0000000080808080ULL; #endif /***************************************************************************** @@ -72,19 +86,20 @@ static uint64_t i_80w; *****************************************************************************/ vlc_module_begin(); #if defined (MODULE_NAME_IS_i420_yuy2) - set_description( _("Conversions from " SRC_FOURCC " to " DEST_FOURCC) ); - set_capability( "chroma", 80 ); + set_description( N_("Conversions from " SRC_FOURCC " to " DEST_FOURCC) ); + set_capability( "video filter2", 80 ); #elif defined (MODULE_NAME_IS_i420_yuy2_mmx) - set_description( _("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) ); - set_capability( "chroma", 100 ); + set_description( N_("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) ); + set_capability( "video filter2", 100 ); add_requirement( MMX ); - /* Initialize MMX-specific constants */ - i_00ffw = 0x00ff00ff00ff00ffULL; - i_80w = 0x0000000080808080ULL; +#elif defined (MODULE_NAME_IS_i420_yuy2_sse2) + set_description( N_("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) ); + set_capability( "video filter2", 120 ); + add_requirement( SSE2 ); #elif defined (MODULE_NAME_IS_i420_yuy2_altivec) set_description( - _("Altivec conversions from " SRC_FOURCC " to " DEST_FOURCC) ); - set_capability( "chroma", 100 ); + _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) ); + set_capability( "video filter2", 100 ); add_requirement( ALTIVEC ); #endif set_callbacks( Activate, NULL ); @@ -97,48 +112,48 @@ vlc_module_end(); *****************************************************************************/ static int Activate( vlc_object_t *p_this ) { - vout_thread_t *p_vout = (vout_thread_t *)p_this; + filter_t *p_filter = (filter_t *)p_this; - if( p_vout->render.i_width & 1 || p_vout->render.i_height & 1 ) + if( p_filter->fmt_in.video.i_width & 1 + || p_filter->fmt_in.video.i_height & 1 ) { return -1; } - switch( p_vout->render.i_chroma ) + switch( p_filter->fmt_in.video.i_chroma ) { case VLC_FOURCC('Y','V','1','2'): case VLC_FOURCC('I','4','2','0'): case VLC_FOURCC('I','Y','U','V'): - switch( p_vout->output.i_chroma ) + switch( p_filter->fmt_out.video.i_chroma ) { case VLC_FOURCC('Y','U','Y','2'): case VLC_FOURCC('Y','U','N','V'): - p_vout->chroma.pf_convert = I420_YUY2; + p_filter->pf_video_filter = I420_YUY2_Filter; break; -#if !defined (MODULE_NAME_IS_i420_yuy2_altivec) case VLC_FOURCC('Y','V','Y','U'): - p_vout->chroma.pf_convert = I420_YVYU; + p_filter->pf_video_filter = I420_YVYU_Filter; break; case VLC_FOURCC('U','Y','V','Y'): case VLC_FOURCC('U','Y','N','V'): case VLC_FOURCC('Y','4','2','2'): - p_vout->chroma.pf_convert = I420_UYVY; + p_filter->pf_video_filter = I420_UYVY_Filter; break; - +#if !defined (MODULE_NAME_IS_i420_yuy2_altivec) case VLC_FOURCC('I','U','Y','V'): - p_vout->chroma.pf_convert = I420_IUYV; + p_filter->pf_video_filter = I420_IUYV_Filter; break; case VLC_FOURCC('c','y','u','v'): - p_vout->chroma.pf_convert = I420_cyuv; + p_filter->pf_video_filter = I420_cyuv_Filter; break; #endif #if defined (MODULE_NAME_IS_i420_yuy2) case VLC_FOURCC('Y','2','1','1'): - p_vout->chroma.pf_convert = I420_Y211; + p_filter->pf_video_filter = I420_Y211_Filter; break; #endif @@ -154,13 +169,34 @@ static int Activate( vlc_object_t *p_this ) return 0; } +#if 0 +static inline unsigned long long read_cycles(void) +{ + unsigned long long v; + __asm__ __volatile__("rdtsc" : "=A" (v): ); + + return v; +} +#endif + /* Following functions are local */ +VIDEO_FILTER_WRAPPER( I420_YUY2 ) +VIDEO_FILTER_WRAPPER( I420_YVYU ) +VIDEO_FILTER_WRAPPER( I420_UYVY ) +#if !defined (MODULE_NAME_IS_i420_yuy2_altivec) +VIDEO_FILTER_WRAPPER( I420_IUYV ) +VIDEO_FILTER_WRAPPER( I420_cyuv ) +#endif +#if defined (MODULE_NAME_IS_i420_yuy2) +VIDEO_FILTER_WRAPPER( I420_Y211 ) +#endif + /***************************************************************************** * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2 *****************************************************************************/ -static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source, - picture_t *p_dest ) +static void I420_YUY2( filter_t *p_filter, picture_t *p_source, + picture_t *p_dest ) { uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels; uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS; @@ -194,14 +230,14 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source, vector unsigned char uv_vec; vector unsigned char y_vec; - if( !( ( p_vout->render.i_width % 32 ) | - ( p_vout->render.i_height % 2 ) ) ) + if( !( ( p_filter->fmt_in.video.i_width % 32 ) | + ( p_filter->fmt_in.video.i_height % 2 ) ) ) { /* Width is a multiple of 32, we take 2 lines at a time */ - for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) { VEC_NEXT_LINES( ); - for( i_x = p_vout->render.i_width / 32 ; i_x-- ; ) + for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; ) { VEC_LOAD_UV( ); VEC_MERGE( vec_mergeh ); @@ -209,15 +245,15 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source, } } } - else if( !( ( p_vout->render.i_width % 16 ) | - ( p_vout->render.i_height % 4 ) ) ) + else if( !( ( p_filter->fmt_in.video.i_width % 16 ) | + ( p_filter->fmt_in.video.i_height % 4 ) ) ) { /* Width is only a multiple of 16, we take 4 lines at a time */ - for( i_y = p_vout->render.i_height / 4 ; i_y-- ; ) + for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; ) { /* Line 1 and 2, pixels 0 to ( width - 16 ) */ VEC_NEXT_LINES( ); - for( i_x = p_vout->render.i_width / 32 ; i_x-- ; ) + for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; ) { VEC_LOAD_UV( ); VEC_MERGE( vec_mergeh ); @@ -233,7 +269,7 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source, VEC_MERGE( vec_mergel ); /* Line 3 and 4, pixels 16 to ( width ) */ - for( i_x = p_vout->render.i_width / 32 ; i_x-- ; ) + for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; ) { VEC_LOAD_UV( ); VEC_MERGE( vec_mergeh ); @@ -249,12 +285,15 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source, #undef VEC_MERGE #endif - const int i_source_margin = p_source->p->i_pitch - - p_source->p->i_visible_pitch; + const int i_source_margin = p_source->p[0].i_pitch + - p_source->p[0].i_visible_pitch; + const int i_source_margin_c = p_source->p[1].i_pitch + - p_source->p[1].i_visible_pitch; const int i_dest_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; - for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) +#if !defined(MODULE_NAME_IS_i420_yuy2_sse2) + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) { p_line1 = p_line2; p_line2 += p_dest->p->i_pitch; @@ -262,35 +301,116 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source, p_y1 = p_y2; p_y2 += p_source->p[Y_PLANE].i_pitch; - for( i_x = p_vout->render.i_width / 8 ; i_x-- ; ) - { #if !defined (MODULE_NAME_IS_i420_yuy2_mmx) + for( i_x = p_filter->fmt_in.video.i_width / 8; i_x-- ; ) + { C_YUV420_YUYV( ); C_YUV420_YUYV( ); C_YUV420_YUYV( ); C_YUV420_YUYV( ); + } #else + for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; ) + { MMX_CALL( MMX_YUV420_YUYV ); + } #endif + for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; ) + { + C_YUV420_YUYV( ); } p_y1 += i_source_margin; p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; p_line1 += i_dest_margin; p_line2 += i_dest_margin; } +#if defined (MODULE_NAME_IS_i420_yuy2_mmx) + /* re-enable FPU registers */ + MMX_END; +#endif + #if defined (MODULE_NAME_IS_i420_yuy2_altivec) } #endif + +#else // defined(MODULE_NAME_IS_i420_yuy2_sse2) + /* + ** SSE2 128 bits fetch/store instructions are faster + ** if memory access is 16 bytes aligned + */ + + if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch| + ((intptr_t)p_line2|(intptr_t)p_y2))) ) + { + /* use faster SSE2 aligned fetch and store */ + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) + { + p_line1 = p_line2; + p_line2 += p_dest->p->i_pitch; + + p_y1 = p_y2; + p_y2 += p_source->p[Y_PLANE].i_pitch; + + for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; ) + { + SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED ); + } + for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; ) + { + C_YUV420_YUYV( ); + } + + p_y1 += i_source_margin; + p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + p_line1 += i_dest_margin; + p_line2 += i_dest_margin; + } + } + else + { + /* use slower SSE2 unaligned fetch and store */ + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) + { + p_line1 = p_line2; + p_line2 += p_dest->p->i_pitch; + + p_y1 = p_y2; + p_y2 += p_source->p[Y_PLANE].i_pitch; + + for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; ) + { + SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED ); + } + for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; ) + { + C_YUV420_YUYV( ); + } + + p_y1 += i_source_margin; + p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + p_line1 += i_dest_margin; + p_line2 += i_dest_margin; + } + } + /* make sure all SSE2 stores are visible thereafter */ + SSE2_END; + +#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2) } /***************************************************************************** * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2 *****************************************************************************/ -#if !defined (MODULE_NAME_IS_i420_yuy2_altivec) -static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, - picture_t *p_dest ) +static void I420_YVYU( filter_t *p_filter, picture_t *p_source, + picture_t *p_dest ) { uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels; uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS; @@ -299,12 +419,95 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, int i_x, i_y; - const int i_source_margin = p_source->p->i_pitch - - p_source->p->i_visible_pitch; +#if defined (MODULE_NAME_IS_i420_yuy2_altivec) +#define VEC_NEXT_LINES( ) \ + p_line1 = p_line2; \ + p_line2 += p_dest->p->i_pitch; \ + p_y1 = p_y2; \ + p_y2 += p_source->p[Y_PLANE].i_pitch; + +#define VEC_LOAD_UV( ) \ + u_vec = vec_ld( 0, p_u ); p_u += 16; \ + v_vec = vec_ld( 0, p_v ); p_v += 16; + +#define VEC_MERGE( a ) \ + vu_vec = a( v_vec, u_vec ); \ + y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \ + vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \ + vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \ + y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \ + vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \ + vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; + + vector unsigned char u_vec; + vector unsigned char v_vec; + vector unsigned char vu_vec; + vector unsigned char y_vec; + + if( !( ( p_filter->fmt_in.video.i_width % 32 ) | + ( p_filter->fmt_in.video.i_height % 2 ) ) ) + { + /* Width is a multiple of 32, we take 2 lines at a time */ + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) + { + VEC_NEXT_LINES( ); + for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; ) + { + VEC_LOAD_UV( ); + VEC_MERGE( vec_mergeh ); + VEC_MERGE( vec_mergel ); + } + } + } + else if( !( ( p_filter->fmt_in.video.i_width % 16 ) | + ( p_filter->fmt_in.video.i_height % 4 ) ) ) + { + /* Width is only a multiple of 16, we take 4 lines at a time */ + for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; ) + { + /* Line 1 and 2, pixels 0 to ( width - 16 ) */ + VEC_NEXT_LINES( ); + for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; ) + { + VEC_LOAD_UV( ); + VEC_MERGE( vec_mergeh ); + VEC_MERGE( vec_mergel ); + } + + /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */ + VEC_LOAD_UV( ); + VEC_MERGE( vec_mergeh ); + + /* Line 3 and 4, pixels 0 to 16 */ + VEC_NEXT_LINES( ); + VEC_MERGE( vec_mergel ); + + /* Line 3 and 4, pixels 16 to ( width ) */ + for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; ) + { + VEC_LOAD_UV( ); + VEC_MERGE( vec_mergeh ); + VEC_MERGE( vec_mergel ); + } + } + } + else + { + /* Crap, use the C version */ +#undef VEC_NEXT_LINES +#undef VEC_LOAD_UV +#undef VEC_MERGE +#endif + + const int i_source_margin = p_source->p[0].i_pitch + - p_source->p[0].i_visible_pitch; + const int i_source_margin_c = p_source->p[1].i_pitch + - p_source->p[1].i_visible_pitch; const int i_dest_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; - for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) +#if !defined(MODULE_NAME_IS_i420_yuy2_sse2) + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) { p_line1 = p_line2; p_line2 += p_dest->p->i_pitch; @@ -312,9 +515,9 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, p_y1 = p_y2; p_y2 += p_source->p[Y_PLANE].i_pitch; - for( i_x = p_vout->render.i_width / 8 ; i_x-- ; ) + for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; ) { -#if defined (MODULE_NAME_IS_i420_yuy2) +#if !defined (MODULE_NAME_IS_i420_yuy2_mmx) C_YUV420_YVYU( ); C_YUV420_YVYU( ); C_YUV420_YVYU( ); @@ -323,19 +526,100 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, MMX_CALL( MMX_YUV420_YVYU ); #endif } + for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; ) + { + C_YUV420_YVYU( ); + } p_y1 += i_source_margin; p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; p_line1 += i_dest_margin; p_line2 += i_dest_margin; } + +#if defined (MODULE_NAME_IS_i420_yuy2_mmx) + /* re-enable FPU registers */ + MMX_END; +#endif + +#if defined (MODULE_NAME_IS_i420_yuy2_altivec) + } +#endif + +#else // defined(MODULE_NAME_IS_i420_yuy2_sse2) + /* + ** SSE2 128 bits fetch/store instructions are faster + ** if memory access is 16 bytes aligned + */ + if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch| + ((intptr_t)p_line2|(intptr_t)p_y2))) ) + { + /* use faster SSE2 aligned fetch and store */ + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) + { + p_line1 = p_line2; + p_line2 += p_dest->p->i_pitch; + + p_y1 = p_y2; + p_y2 += p_source->p[Y_PLANE].i_pitch; + + for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; ) + { + SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED ); + } + for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; ) + { + C_YUV420_YVYU( ); + } + + p_y1 += i_source_margin; + p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + p_line1 += i_dest_margin; + p_line2 += i_dest_margin; + } + } + else + { + /* use slower SSE2 unaligned fetch and store */ + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) + { + p_line1 = p_line2; + p_line2 += p_dest->p->i_pitch; + + p_y1 = p_y2; + p_y2 += p_source->p[Y_PLANE].i_pitch; + + for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; ) + { + SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED ); + } + for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; ) + { + C_YUV420_YVYU( ); + } + + p_y1 += i_source_margin; + p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + p_line1 += i_dest_margin; + p_line2 += i_dest_margin; + } + } + /* make sure all SSE2 stores are visible thereafter */ + SSE2_END; +#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2) } /***************************************************************************** * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2 *****************************************************************************/ -static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source, - picture_t *p_dest ) +static void I420_UYVY( filter_t *p_filter, picture_t *p_source, + picture_t *p_dest ) { uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels; uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS; @@ -344,12 +628,95 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source, int i_x, i_y; - const int i_source_margin = p_source->p->i_pitch - - p_source->p->i_visible_pitch; +#if defined (MODULE_NAME_IS_i420_yuy2_altivec) +#define VEC_NEXT_LINES( ) \ + p_line1 = p_line2; \ + p_line2 += p_dest->p->i_pitch; \ + p_y1 = p_y2; \ + p_y2 += p_source->p[Y_PLANE].i_pitch; + +#define VEC_LOAD_UV( ) \ + u_vec = vec_ld( 0, p_u ); p_u += 16; \ + v_vec = vec_ld( 0, p_v ); p_v += 16; + +#define VEC_MERGE( a ) \ + uv_vec = a( u_vec, v_vec ); \ + y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \ + vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \ + vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \ + y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \ + vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \ + vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; + + vector unsigned char u_vec; + vector unsigned char v_vec; + vector unsigned char uv_vec; + vector unsigned char y_vec; + + if( !( ( p_filter->fmt_in.video.i_width % 32 ) | + ( p_filter->fmt_in.video.i_height % 2 ) ) ) + { + /* Width is a multiple of 32, we take 2 lines at a time */ + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) + { + VEC_NEXT_LINES( ); + for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; ) + { + VEC_LOAD_UV( ); + VEC_MERGE( vec_mergeh ); + VEC_MERGE( vec_mergel ); + } + } + } + else if( !( ( p_filter->fmt_in.video.i_width % 16 ) | + ( p_filter->fmt_in.video.i_height % 4 ) ) ) + { + /* Width is only a multiple of 16, we take 4 lines at a time */ + for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; ) + { + /* Line 1 and 2, pixels 0 to ( width - 16 ) */ + VEC_NEXT_LINES( ); + for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; ) + { + VEC_LOAD_UV( ); + VEC_MERGE( vec_mergeh ); + VEC_MERGE( vec_mergel ); + } + + /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */ + VEC_LOAD_UV( ); + VEC_MERGE( vec_mergeh ); + + /* Line 3 and 4, pixels 0 to 16 */ + VEC_NEXT_LINES( ); + VEC_MERGE( vec_mergel ); + + /* Line 3 and 4, pixels 16 to ( width ) */ + for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; ) + { + VEC_LOAD_UV( ); + VEC_MERGE( vec_mergeh ); + VEC_MERGE( vec_mergel ); + } + } + } + else + { + /* Crap, use the C version */ +#undef VEC_NEXT_LINES +#undef VEC_LOAD_UV +#undef VEC_MERGE +#endif + + const int i_source_margin = p_source->p[0].i_pitch + - p_source->p[0].i_visible_pitch; + const int i_source_margin_c = p_source->p[1].i_pitch + - p_source->p[1].i_visible_pitch; const int i_dest_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; - for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) +#if !defined(MODULE_NAME_IS_i420_yuy2_sse2) + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) { p_line1 = p_line2; p_line2 += p_dest->p->i_pitch; @@ -357,9 +724,9 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source, p_y1 = p_y2; p_y2 += p_source->p[Y_PLANE].i_pitch; - for( i_x = p_vout->render.i_width / 8 ; i_x-- ; ) + for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; ) { -#if defined (MODULE_NAME_IS_i420_yuy2) +#if !defined (MODULE_NAME_IS_i420_yuy2_mmx) C_YUV420_UYVY( ); C_YUV420_UYVY( ); C_YUV420_UYVY( ); @@ -368,47 +735,133 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source, MMX_CALL( MMX_YUV420_UYVY ); #endif } + for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x--; ) + { + C_YUV420_UYVY( ); + } p_y1 += i_source_margin; p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; p_line1 += i_dest_margin; p_line2 += i_dest_margin; } + +#if defined (MODULE_NAME_IS_i420_yuy2_mmx) + /* re-enable FPU registers */ + MMX_END; +#endif + +#if defined (MODULE_NAME_IS_i420_yuy2_altivec) + } +#endif + +#else // defined(MODULE_NAME_IS_i420_yuy2_sse2) + /* + ** SSE2 128 bits fetch/store instructions are faster + ** if memory access is 16 bytes aligned + */ + if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch| + ((intptr_t)p_line2|(intptr_t)p_y2))) ) + { + /* use faster SSE2 aligned fetch and store */ + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) + { + p_line1 = p_line2; + p_line2 += p_dest->p->i_pitch; + + p_y1 = p_y2; + p_y2 += p_source->p[Y_PLANE].i_pitch; + + for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; ) + { + SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED ); + } + for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; ) + { + C_YUV420_UYVY( ); + } + + p_y1 += i_source_margin; + p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + p_line1 += i_dest_margin; + p_line2 += i_dest_margin; + } + } + else + { + /* use slower SSE2 unaligned fetch and store */ + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) + { + p_line1 = p_line2; + p_line2 += p_dest->p->i_pitch; + + p_y1 = p_y2; + p_y2 += p_source->p[Y_PLANE].i_pitch; + + for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; ) + { + SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED ); + } + for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; ) + { + C_YUV420_UYVY( ); + } + + p_y1 += i_source_margin; + p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + p_line1 += i_dest_margin; + p_line2 += i_dest_margin; + } + } + /* make sure all SSE2 stores are visible thereafter */ + SSE2_END; +#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2) } +#if !defined (MODULE_NAME_IS_i420_yuy2_altivec) /***************************************************************************** * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2 *****************************************************************************/ -static void I420_IUYV( vout_thread_t *p_vout, picture_t *p_source, - picture_t *p_dest ) +static void I420_IUYV( filter_t *p_filter, picture_t *p_source, + picture_t *p_dest ) { + VLC_UNUSED(p_source); VLC_UNUSED(p_dest); /* FIXME: TODO ! */ - msg_Err( p_vout, "I420_IUYV unimplemented, please harass " ); + msg_Err( p_filter, "I420_IUYV unimplemented, please harass " ); } /***************************************************************************** * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2 *****************************************************************************/ -static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source, - picture_t *p_dest ) +static void I420_cyuv( filter_t *p_filter, picture_t *p_source, + picture_t *p_dest ) { uint8_t *p_line1 = p_dest->p->p_pixels + - p_dest->p->i_lines * p_dest->p->i_pitch + p_dest->p->i_visible_lines * p_dest->p->i_pitch + p_dest->p->i_pitch; uint8_t *p_line2 = p_dest->p->p_pixels + - p_dest->p->i_lines * p_dest->p->i_pitch; + p_dest->p->i_visible_lines * p_dest->p->i_pitch; uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS; uint8_t *p_u = p_source->U_PIXELS; uint8_t *p_v = p_source->V_PIXELS; int i_x, i_y; - const int i_source_margin = p_source->p->i_pitch - - p_source->p->i_visible_pitch; + const int i_source_margin = p_source->p[0].i_pitch + - p_source->p[0].i_visible_pitch; + const int i_source_margin_c = p_source->p[1].i_pitch + - p_source->p[1].i_visible_pitch; const int i_dest_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; - for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) +#if !defined(MODULE_NAME_IS_i420_yuy2_sse2) + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) { p_line1 -= 3 * p_dest->p->i_pitch; p_line2 -= 3 * p_dest->p->i_pitch; @@ -416,9 +869,9 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source, p_y1 = p_y2; p_y2 += p_source->p[Y_PLANE].i_pitch; - for( i_x = p_vout->render.i_width / 8 ; i_x-- ; ) + for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; ) { -#if defined (MODULE_NAME_IS_i420_yuy2) +#if !defined (MODULE_NAME_IS_i420_yuy2_mmx) C_YUV420_UYVY( ); C_YUV420_UYVY( ); C_YUV420_UYVY( ); @@ -427,12 +880,89 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source, MMX_CALL( MMX_YUV420_UYVY ); #endif } + for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; ) + { + C_YUV420_UYVY( ); + } p_y1 += i_source_margin; p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; p_line1 += i_dest_margin; p_line2 += i_dest_margin; } + +#if defined (MODULE_NAME_IS_i420_yuy2_mmx) + /* re-enable FPU registers */ + MMX_END; +#endif + +#else // defined(MODULE_NAME_IS_i420_yuy2_sse2) + /* + ** SSE2 128 bits fetch/store instructions are faster + ** if memory access is 16 bytes aligned + */ + if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch| + ((intptr_t)p_line2|(intptr_t)p_y2))) ) + { + /* use faster SSE2 aligned fetch and store */ + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) + { + p_line1 = p_line2; + p_line2 += p_dest->p->i_pitch; + + p_y1 = p_y2; + p_y2 += p_source->p[Y_PLANE].i_pitch; + + for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; ) + { + SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED ); + } + for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; ) + { + C_YUV420_UYVY( ); + } + + p_y1 += i_source_margin; + p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + p_line1 += i_dest_margin; + p_line2 += i_dest_margin; + } + } + else + { + /* use slower SSE2 unaligned fetch and store */ + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) + { + p_line1 = p_line2; + p_line2 += p_dest->p->i_pitch; + + p_y1 = p_y2; + p_y2 += p_source->p[Y_PLANE].i_pitch; + + for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; ) + { + SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED ); + } + for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; ) + { + C_YUV420_UYVY( ); + } + + p_y1 += i_source_margin; + p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + p_line1 += i_dest_margin; + p_line2 += i_dest_margin; + } + } + /* make sure all SSE2 stores are visible thereafter */ + SSE2_END; +#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2) } #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec) @@ -440,8 +970,8 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source, * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1 *****************************************************************************/ #if defined (MODULE_NAME_IS_i420_yuy2) -static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source, - picture_t *p_dest ) +static void I420_Y211( filter_t *p_filter, picture_t *p_source, + picture_t *p_dest ) { uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels; uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS; @@ -450,12 +980,14 @@ static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source, int i_x, i_y; - const int i_source_margin = p_source->p->i_pitch - - p_source->p->i_visible_pitch; + const int i_source_margin = p_source->p[0].i_pitch + - p_source->p[0].i_visible_pitch; + const int i_source_margin_c = p_source->p[1].i_pitch + - p_source->p[1].i_visible_pitch; const int i_dest_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; - for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) { p_line1 = p_line2; p_line2 += p_dest->p->i_pitch; @@ -463,7 +995,7 @@ static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source, p_y1 = p_y2; p_y2 += p_source->p[Y_PLANE].i_pitch; - for( i_x = p_vout->render.i_width / 8 ; i_x-- ; ) + for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; ) { C_YUV420_Y211( ); C_YUV420_Y211( ); @@ -471,9 +1003,10 @@ static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source, p_y1 += i_source_margin; p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; p_line1 += i_dest_margin; p_line2 += i_dest_margin; } } #endif -