X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=modules%2Fvideo_chroma%2Fi420_yuy2.c;h=09a904f951593daaf124516cc71212867c77e361;hb=1ce4f166a9653d8ee369862ee3111cce91af815d;hp=37c2fd879090593adab0db03e9873f2d42c6ac7c;hpb=2cb472dba008f7d877ffe6bae9c5575253365282;p=vlc diff --git a/modules/video_chroma/i420_yuy2.c b/modules/video_chroma/i420_yuy2.c index 37c2fd8790..09a904f951 100644 --- a/modules/video_chroma/i420_yuy2.c +++ b/modules/video_chroma/i420_yuy2.c @@ -5,6 +5,7 @@ * $Id$ * * Authors: Samuel Hocevar + * Damien Fouilleul * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -24,11 +25,14 @@ /***************************************************************************** * Preamble *****************************************************************************/ -#include /* strerror() */ -#include /* malloc(), free() */ -#include -#include +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include +#include +#include #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H) # include @@ -42,8 +46,10 @@ # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211" #elif defined (MODULE_NAME_IS_i420_yuy2_mmx) # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv" +#elif defined (MODULE_NAME_IS_i420_yuy2_sse2) +# define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv" #elif defined (MODULE_NAME_IS_i420_yuy2_altivec) -# define DEST_FOURCC "YUY2,YUNV" +# define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422" #endif /***************************************************************************** @@ -51,44 +57,49 @@ *****************************************************************************/ static int Activate ( vlc_object_t * ); -static void I420_YUY2 ( vout_thread_t *, picture_t *, picture_t * ); +static void I420_YUY2 ( filter_t *, picture_t *, picture_t * ); +static void I420_YVYU ( filter_t *, picture_t *, picture_t * ); +static void I420_UYVY ( filter_t *, picture_t *, picture_t * ); +static picture_t *I420_YUY2_Filter ( filter_t *, picture_t * ); +static picture_t *I420_YVYU_Filter ( filter_t *, picture_t * ); +static picture_t *I420_UYVY_Filter ( filter_t *, picture_t * ); #if !defined (MODULE_NAME_IS_i420_yuy2_altivec) -static void I420_YVYU ( vout_thread_t *, picture_t *, picture_t * ); -static void I420_UYVY ( vout_thread_t *, picture_t *, picture_t * ); -static void I420_IUYV ( vout_thread_t *, picture_t *, picture_t * ); -static void I420_cyuv ( vout_thread_t *, picture_t *, picture_t * ); +static void I420_IUYV ( filter_t *, picture_t *, picture_t * ); +static void I420_cyuv ( filter_t *, picture_t *, picture_t * ); +static picture_t *I420_IUYV_Filter ( filter_t *, picture_t * ); +static picture_t *I420_cyuv_Filter ( filter_t *, picture_t * ); #endif #if defined (MODULE_NAME_IS_i420_yuy2) -static void I420_Y211 ( vout_thread_t *, picture_t *, picture_t * ); +static void I420_Y211 ( filter_t *, picture_t *, picture_t * ); +static picture_t *I420_Y211_Filter ( filter_t *, picture_t * ); #endif #ifdef MODULE_NAME_IS_i420_yuy2_mmx -static uint64_t i_00ffw; -static uint64_t i_80w; +/* Initialize MMX-specific constants */ +static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL; +static const uint64_t i_80w = 0x0000000080808080ULL; #endif /***************************************************************************** * Module descriptor. *****************************************************************************/ -vlc_module_begin(); +vlc_module_begin () #if defined (MODULE_NAME_IS_i420_yuy2) - set_description( _("Conversions from " SRC_FOURCC " to " DEST_FOURCC) ); - set_capability( "chroma", 80 ); + set_description( N_("Conversions from " SRC_FOURCC " to " DEST_FOURCC) ) + set_capability( "video filter2", 80 ) #elif defined (MODULE_NAME_IS_i420_yuy2_mmx) - set_description( _("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) ); - set_capability( "chroma", 100 ); - add_requirement( MMX ); - /* Initialize MMX-specific constants */ - i_00ffw = 0x00ff00ff00ff00ffULL; - i_80w = 0x0000000080808080ULL; + set_description( N_("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) ) + set_capability( "video filter2", 160 ) +#elif defined (MODULE_NAME_IS_i420_yuy2_sse2) + set_description( N_("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) ) + set_capability( "video filter2", 250 ) #elif defined (MODULE_NAME_IS_i420_yuy2_altivec) set_description( _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) ); - set_capability( "chroma", 100 ); - add_requirement( ALTIVEC ); + set_capability( "video filter2", 250 ) #endif - set_callbacks( Activate, NULL ); -vlc_module_end(); + set_callbacks( Activate, NULL ) +vlc_module_end () /***************************************************************************** * Activate: allocate a chroma function @@ -97,48 +108,48 @@ vlc_module_end(); *****************************************************************************/ static int Activate( vlc_object_t *p_this ) { - vout_thread_t *p_vout = (vout_thread_t *)p_this; + filter_t *p_filter = (filter_t *)p_this; - if( p_vout->render.i_width & 1 || p_vout->render.i_height & 1 ) + if( p_filter->fmt_in.video.i_width & 1 + || p_filter->fmt_in.video.i_height & 1 ) { return -1; } - switch( p_vout->render.i_chroma ) + if( p_filter->fmt_in.video.i_width != p_filter->fmt_out.video.i_width + || p_filter->fmt_in.video.i_height != p_filter->fmt_out.video.i_height ) + return -1; + + switch( p_filter->fmt_in.video.i_chroma ) { - case VLC_FOURCC('Y','V','1','2'): - case VLC_FOURCC('I','4','2','0'): - case VLC_FOURCC('I','Y','U','V'): - switch( p_vout->output.i_chroma ) + case VLC_CODEC_YV12: + case VLC_CODEC_I420: + switch( p_filter->fmt_out.video.i_chroma ) { - case VLC_FOURCC('Y','U','Y','2'): - case VLC_FOURCC('Y','U','N','V'): - p_vout->chroma.pf_convert = I420_YUY2; + case VLC_CODEC_YUYV: + p_filter->pf_video_filter = I420_YUY2_Filter; break; -#if !defined (MODULE_NAME_IS_i420_yuy2_altivec) - case VLC_FOURCC('Y','V','Y','U'): - p_vout->chroma.pf_convert = I420_YVYU; + case VLC_CODEC_YVYU: + p_filter->pf_video_filter = I420_YVYU_Filter; break; - case VLC_FOURCC('U','Y','V','Y'): - case VLC_FOURCC('U','Y','N','V'): - case VLC_FOURCC('Y','4','2','2'): - p_vout->chroma.pf_convert = I420_UYVY; + case VLC_CODEC_UYVY: + p_filter->pf_video_filter = I420_UYVY_Filter; break; - +#if !defined (MODULE_NAME_IS_i420_yuy2_altivec) case VLC_FOURCC('I','U','Y','V'): - p_vout->chroma.pf_convert = I420_IUYV; + p_filter->pf_video_filter = I420_IUYV_Filter; break; - case VLC_FOURCC('c','y','u','v'): - p_vout->chroma.pf_convert = I420_cyuv; + case VLC_CODEC_CYUV: + p_filter->pf_video_filter = I420_cyuv_Filter; break; #endif #if defined (MODULE_NAME_IS_i420_yuy2) - case VLC_FOURCC('Y','2','1','1'): - p_vout->chroma.pf_convert = I420_Y211; + case VLC_CODEC_Y211: + p_filter->pf_video_filter = I420_Y211_Filter; break; #endif @@ -154,13 +165,34 @@ static int Activate( vlc_object_t *p_this ) return 0; } +#if 0 +static inline unsigned long long read_cycles(void) +{ + unsigned long long v; + __asm__ __volatile__("rdtsc" : "=A" (v): ); + + return v; +} +#endif + /* Following functions are local */ +VIDEO_FILTER_WRAPPER( I420_YUY2 ) +VIDEO_FILTER_WRAPPER( I420_YVYU ) +VIDEO_FILTER_WRAPPER( I420_UYVY ) +#if !defined (MODULE_NAME_IS_i420_yuy2_altivec) +VIDEO_FILTER_WRAPPER( I420_IUYV ) +VIDEO_FILTER_WRAPPER( I420_cyuv ) +#endif +#if defined (MODULE_NAME_IS_i420_yuy2) +VIDEO_FILTER_WRAPPER( I420_Y211 ) +#endif + /***************************************************************************** * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2 *****************************************************************************/ -static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source, - picture_t *p_dest ) +static void I420_YUY2( filter_t *p_filter, picture_t *p_source, + picture_t *p_dest ) { uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels; uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS; @@ -194,14 +226,14 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source, vector unsigned char uv_vec; vector unsigned char y_vec; - if( !( ( p_vout->render.i_width % 32 ) | - ( p_vout->render.i_height % 2 ) ) ) + if( !( ( p_filter->fmt_in.video.i_width % 32 ) | + ( p_filter->fmt_in.video.i_height % 2 ) ) ) { /* Width is a multiple of 32, we take 2 lines at a time */ - for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) { VEC_NEXT_LINES( ); - for( i_x = p_vout->render.i_width / 32 ; i_x-- ; ) + for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; ) { VEC_LOAD_UV( ); VEC_MERGE( vec_mergeh ); @@ -209,15 +241,17 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source, } } } - else if( !( ( p_vout->render.i_width % 16 ) | - ( p_vout->render.i_height % 4 ) ) ) +#warning FIXME: converting widths % 16 but !widths % 32 is broken on altivec +#if 0 + else if( !( ( p_filter->fmt_in.video.i_width % 16 ) | + ( p_filter->fmt_in.video.i_height % 4 ) ) ) { /* Width is only a multiple of 16, we take 4 lines at a time */ - for( i_y = p_vout->render.i_height / 4 ; i_y-- ; ) + for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; ) { /* Line 1 and 2, pixels 0 to ( width - 16 ) */ VEC_NEXT_LINES( ); - for( i_x = p_vout->render.i_width / 32 ; i_x-- ; ) + for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; ) { VEC_LOAD_UV( ); VEC_MERGE( vec_mergeh ); @@ -233,7 +267,7 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source, VEC_MERGE( vec_mergel ); /* Line 3 and 4, pixels 16 to ( width ) */ - for( i_x = p_vout->render.i_width / 32 ; i_x-- ; ) + for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; ) { VEC_LOAD_UV( ); VEC_MERGE( vec_mergeh ); @@ -241,6 +275,7 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source, } } } +#endif else { /* Crap, use the C version */ @@ -256,7 +291,8 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source, const int i_dest_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; - for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) +#if !defined(MODULE_NAME_IS_i420_yuy2_sse2) + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) { p_line1 = p_line2; p_line2 += p_dest->p->i_pitch; @@ -265,20 +301,23 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source, p_y2 += p_source->p[Y_PLANE].i_pitch; #if !defined (MODULE_NAME_IS_i420_yuy2_mmx) - for( i_x = p_vout->render.i_width / 2 ; i_x-- ; ) + for( i_x = p_filter->fmt_in.video.i_width / 8; i_x-- ; ) { C_YUV420_YUYV( ); + C_YUV420_YUYV( ); + C_YUV420_YUYV( ); + C_YUV420_YUYV( ); } #else - for( i_x = p_vout->render.i_width / 8 ; i_x-- ; ) + for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; ) { MMX_CALL( MMX_YUV420_YUYV ); } - for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; ) +#endif + for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; ) { C_YUV420_YUYV( ); } -#endif p_y1 += i_source_margin; p_y2 += i_source_margin; @@ -288,17 +327,89 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source, p_line2 += i_dest_margin; } +#if defined (MODULE_NAME_IS_i420_yuy2_mmx) + /* re-enable FPU registers */ + MMX_END; +#endif + #if defined (MODULE_NAME_IS_i420_yuy2_altivec) } #endif + +#else // defined(MODULE_NAME_IS_i420_yuy2_sse2) + /* + ** SSE2 128 bits fetch/store instructions are faster + ** if memory access is 16 bytes aligned + */ + + if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch| + ((intptr_t)p_line2|(intptr_t)p_y2))) ) + { + /* use faster SSE2 aligned fetch and store */ + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) + { + p_line1 = p_line2; + p_line2 += p_dest->p->i_pitch; + + p_y1 = p_y2; + p_y2 += p_source->p[Y_PLANE].i_pitch; + + for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; ) + { + SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED ); + } + for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; ) + { + C_YUV420_YUYV( ); + } + + p_y1 += i_source_margin; + p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + p_line1 += i_dest_margin; + p_line2 += i_dest_margin; + } + } + else + { + /* use slower SSE2 unaligned fetch and store */ + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) + { + p_line1 = p_line2; + p_line2 += p_dest->p->i_pitch; + + p_y1 = p_y2; + p_y2 += p_source->p[Y_PLANE].i_pitch; + + for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; ) + { + SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED ); + } + for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; ) + { + C_YUV420_YUYV( ); + } + + p_y1 += i_source_margin; + p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + p_line1 += i_dest_margin; + p_line2 += i_dest_margin; + } + } + /* make sure all SSE2 stores are visible thereafter */ + SSE2_END; + +#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2) } /***************************************************************************** * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2 *****************************************************************************/ -#if !defined (MODULE_NAME_IS_i420_yuy2_altivec) -static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, - picture_t *p_dest ) +static void I420_YVYU( filter_t *p_filter, picture_t *p_source, + picture_t *p_dest ) { uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels; uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS; @@ -307,6 +418,86 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, int i_x, i_y; +#if defined (MODULE_NAME_IS_i420_yuy2_altivec) +#define VEC_NEXT_LINES( ) \ + p_line1 = p_line2; \ + p_line2 += p_dest->p->i_pitch; \ + p_y1 = p_y2; \ + p_y2 += p_source->p[Y_PLANE].i_pitch; + +#define VEC_LOAD_UV( ) \ + u_vec = vec_ld( 0, p_u ); p_u += 16; \ + v_vec = vec_ld( 0, p_v ); p_v += 16; + +#define VEC_MERGE( a ) \ + vu_vec = a( v_vec, u_vec ); \ + y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \ + vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \ + vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \ + y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \ + vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \ + vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; + + vector unsigned char u_vec; + vector unsigned char v_vec; + vector unsigned char vu_vec; + vector unsigned char y_vec; + + if( !( ( p_filter->fmt_in.video.i_width % 32 ) | + ( p_filter->fmt_in.video.i_height % 2 ) ) ) + { + /* Width is a multiple of 32, we take 2 lines at a time */ + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) + { + VEC_NEXT_LINES( ); + for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; ) + { + VEC_LOAD_UV( ); + VEC_MERGE( vec_mergeh ); + VEC_MERGE( vec_mergel ); + } + } + } + else if( !( ( p_filter->fmt_in.video.i_width % 16 ) | + ( p_filter->fmt_in.video.i_height % 4 ) ) ) + { + /* Width is only a multiple of 16, we take 4 lines at a time */ + for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; ) + { + /* Line 1 and 2, pixels 0 to ( width - 16 ) */ + VEC_NEXT_LINES( ); + for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; ) + { + VEC_LOAD_UV( ); + VEC_MERGE( vec_mergeh ); + VEC_MERGE( vec_mergel ); + } + + /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */ + VEC_LOAD_UV( ); + VEC_MERGE( vec_mergeh ); + + /* Line 3 and 4, pixels 0 to 16 */ + VEC_NEXT_LINES( ); + VEC_MERGE( vec_mergel ); + + /* Line 3 and 4, pixels 16 to ( width ) */ + for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; ) + { + VEC_LOAD_UV( ); + VEC_MERGE( vec_mergeh ); + VEC_MERGE( vec_mergel ); + } + } + } + else + { + /* Crap, use the C version */ +#undef VEC_NEXT_LINES +#undef VEC_LOAD_UV +#undef VEC_MERGE +#endif + const int i_source_margin = p_source->p[0].i_pitch - p_source->p[0].i_visible_pitch; const int i_source_margin_c = p_source->p[1].i_pitch @@ -314,7 +505,8 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, const int i_dest_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; - for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) +#if !defined(MODULE_NAME_IS_i420_yuy2_sse2) + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) { p_line1 = p_line2; p_line2 += p_dest->p->i_pitch; @@ -322,9 +514,9 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, p_y1 = p_y2; p_y2 += p_source->p[Y_PLANE].i_pitch; - for( i_x = p_vout->render.i_width / 8 ; i_x-- ; ) + for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; ) { -#if defined (MODULE_NAME_IS_i420_yuy2) +#if !defined (MODULE_NAME_IS_i420_yuy2_mmx) C_YUV420_YVYU( ); C_YUV420_YVYU( ); C_YUV420_YVYU( ); @@ -333,6 +525,10 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, MMX_CALL( MMX_YUV420_YVYU ); #endif } + for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; ) + { + C_YUV420_YVYU( ); + } p_y1 += i_source_margin; p_y2 += i_source_margin; @@ -341,13 +537,88 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, p_line1 += i_dest_margin; p_line2 += i_dest_margin; } + +#if defined (MODULE_NAME_IS_i420_yuy2_mmx) + /* re-enable FPU registers */ + MMX_END; +#endif + +#if defined (MODULE_NAME_IS_i420_yuy2_altivec) + } +#endif + +#else // defined(MODULE_NAME_IS_i420_yuy2_sse2) + /* + ** SSE2 128 bits fetch/store instructions are faster + ** if memory access is 16 bytes aligned + */ + if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch| + ((intptr_t)p_line2|(intptr_t)p_y2))) ) + { + /* use faster SSE2 aligned fetch and store */ + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) + { + p_line1 = p_line2; + p_line2 += p_dest->p->i_pitch; + + p_y1 = p_y2; + p_y2 += p_source->p[Y_PLANE].i_pitch; + + for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; ) + { + SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED ); + } + for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; ) + { + C_YUV420_YVYU( ); + } + + p_y1 += i_source_margin; + p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + p_line1 += i_dest_margin; + p_line2 += i_dest_margin; + } + } + else + { + /* use slower SSE2 unaligned fetch and store */ + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) + { + p_line1 = p_line2; + p_line2 += p_dest->p->i_pitch; + + p_y1 = p_y2; + p_y2 += p_source->p[Y_PLANE].i_pitch; + + for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; ) + { + SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED ); + } + for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; ) + { + C_YUV420_YVYU( ); + } + + p_y1 += i_source_margin; + p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + p_line1 += i_dest_margin; + p_line2 += i_dest_margin; + } + } + /* make sure all SSE2 stores are visible thereafter */ + SSE2_END; +#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2) } /***************************************************************************** * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2 *****************************************************************************/ -static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source, - picture_t *p_dest ) +static void I420_UYVY( filter_t *p_filter, picture_t *p_source, + picture_t *p_dest ) { uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels; uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS; @@ -356,6 +627,86 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source, int i_x, i_y; +#if defined (MODULE_NAME_IS_i420_yuy2_altivec) +#define VEC_NEXT_LINES( ) \ + p_line1 = p_line2; \ + p_line2 += p_dest->p->i_pitch; \ + p_y1 = p_y2; \ + p_y2 += p_source->p[Y_PLANE].i_pitch; + +#define VEC_LOAD_UV( ) \ + u_vec = vec_ld( 0, p_u ); p_u += 16; \ + v_vec = vec_ld( 0, p_v ); p_v += 16; + +#define VEC_MERGE( a ) \ + uv_vec = a( u_vec, v_vec ); \ + y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \ + vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \ + vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \ + y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \ + vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \ + vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; + + vector unsigned char u_vec; + vector unsigned char v_vec; + vector unsigned char uv_vec; + vector unsigned char y_vec; + + if( !( ( p_filter->fmt_in.video.i_width % 32 ) | + ( p_filter->fmt_in.video.i_height % 2 ) ) ) + { + /* Width is a multiple of 32, we take 2 lines at a time */ + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) + { + VEC_NEXT_LINES( ); + for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; ) + { + VEC_LOAD_UV( ); + VEC_MERGE( vec_mergeh ); + VEC_MERGE( vec_mergel ); + } + } + } + else if( !( ( p_filter->fmt_in.video.i_width % 16 ) | + ( p_filter->fmt_in.video.i_height % 4 ) ) ) + { + /* Width is only a multiple of 16, we take 4 lines at a time */ + for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; ) + { + /* Line 1 and 2, pixels 0 to ( width - 16 ) */ + VEC_NEXT_LINES( ); + for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; ) + { + VEC_LOAD_UV( ); + VEC_MERGE( vec_mergeh ); + VEC_MERGE( vec_mergel ); + } + + /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */ + VEC_LOAD_UV( ); + VEC_MERGE( vec_mergeh ); + + /* Line 3 and 4, pixels 0 to 16 */ + VEC_NEXT_LINES( ); + VEC_MERGE( vec_mergel ); + + /* Line 3 and 4, pixels 16 to ( width ) */ + for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; ) + { + VEC_LOAD_UV( ); + VEC_MERGE( vec_mergeh ); + VEC_MERGE( vec_mergel ); + } + } + } + else + { + /* Crap, use the C version */ +#undef VEC_NEXT_LINES +#undef VEC_LOAD_UV +#undef VEC_MERGE +#endif + const int i_source_margin = p_source->p[0].i_pitch - p_source->p[0].i_visible_pitch; const int i_source_margin_c = p_source->p[1].i_pitch @@ -363,7 +714,8 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source, const int i_dest_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; - for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) +#if !defined(MODULE_NAME_IS_i420_yuy2_sse2) + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) { p_line1 = p_line2; p_line2 += p_dest->p->i_pitch; @@ -371,9 +723,9 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source, p_y1 = p_y2; p_y2 += p_source->p[Y_PLANE].i_pitch; - for( i_x = p_vout->render.i_width / 8 ; i_x-- ; ) + for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; ) { -#if defined (MODULE_NAME_IS_i420_yuy2) +#if !defined (MODULE_NAME_IS_i420_yuy2_mmx) C_YUV420_UYVY( ); C_YUV420_UYVY( ); C_YUV420_UYVY( ); @@ -382,6 +734,10 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source, MMX_CALL( MMX_YUV420_UYVY ); #endif } + for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x--; ) + { + C_YUV420_UYVY( ); + } p_y1 += i_source_margin; p_y2 += i_source_margin; @@ -390,23 +746,100 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source, p_line1 += i_dest_margin; p_line2 += i_dest_margin; } + +#if defined (MODULE_NAME_IS_i420_yuy2_mmx) + /* re-enable FPU registers */ + MMX_END; +#endif + +#if defined (MODULE_NAME_IS_i420_yuy2_altivec) + } +#endif + +#else // defined(MODULE_NAME_IS_i420_yuy2_sse2) + /* + ** SSE2 128 bits fetch/store instructions are faster + ** if memory access is 16 bytes aligned + */ + if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch| + ((intptr_t)p_line2|(intptr_t)p_y2))) ) + { + /* use faster SSE2 aligned fetch and store */ + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) + { + p_line1 = p_line2; + p_line2 += p_dest->p->i_pitch; + + p_y1 = p_y2; + p_y2 += p_source->p[Y_PLANE].i_pitch; + + for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; ) + { + SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED ); + } + for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; ) + { + C_YUV420_UYVY( ); + } + + p_y1 += i_source_margin; + p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + p_line1 += i_dest_margin; + p_line2 += i_dest_margin; + } + } + else + { + /* use slower SSE2 unaligned fetch and store */ + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) + { + p_line1 = p_line2; + p_line2 += p_dest->p->i_pitch; + + p_y1 = p_y2; + p_y2 += p_source->p[Y_PLANE].i_pitch; + + for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; ) + { + SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED ); + } + for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; ) + { + C_YUV420_UYVY( ); + } + + p_y1 += i_source_margin; + p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + p_line1 += i_dest_margin; + p_line2 += i_dest_margin; + } + } + /* make sure all SSE2 stores are visible thereafter */ + SSE2_END; +#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2) } +#if !defined (MODULE_NAME_IS_i420_yuy2_altivec) /***************************************************************************** * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2 *****************************************************************************/ -static void I420_IUYV( vout_thread_t *p_vout, picture_t *p_source, - picture_t *p_dest ) +static void I420_IUYV( filter_t *p_filter, picture_t *p_source, + picture_t *p_dest ) { + VLC_UNUSED(p_source); VLC_UNUSED(p_dest); /* FIXME: TODO ! */ - msg_Err( p_vout, "I420_IUYV unimplemented, please harass " ); + msg_Err( p_filter, "I420_IUYV unimplemented, please harass " ); } /***************************************************************************** * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2 *****************************************************************************/ -static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source, - picture_t *p_dest ) +static void I420_cyuv( filter_t *p_filter, picture_t *p_source, + picture_t *p_dest ) { uint8_t *p_line1 = p_dest->p->p_pixels + p_dest->p->i_visible_lines * p_dest->p->i_pitch @@ -426,7 +859,8 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source, const int i_dest_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; - for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) +#if !defined(MODULE_NAME_IS_i420_yuy2_sse2) + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) { p_line1 -= 3 * p_dest->p->i_pitch; p_line2 -= 3 * p_dest->p->i_pitch; @@ -434,9 +868,9 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source, p_y1 = p_y2; p_y2 += p_source->p[Y_PLANE].i_pitch; - for( i_x = p_vout->render.i_width / 8 ; i_x-- ; ) + for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; ) { -#if defined (MODULE_NAME_IS_i420_yuy2) +#if !defined (MODULE_NAME_IS_i420_yuy2_mmx) C_YUV420_UYVY( ); C_YUV420_UYVY( ); C_YUV420_UYVY( ); @@ -445,6 +879,10 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source, MMX_CALL( MMX_YUV420_UYVY ); #endif } + for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; ) + { + C_YUV420_UYVY( ); + } p_y1 += i_source_margin; p_y2 += i_source_margin; @@ -453,6 +891,77 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source, p_line1 += i_dest_margin; p_line2 += i_dest_margin; } + +#if defined (MODULE_NAME_IS_i420_yuy2_mmx) + /* re-enable FPU registers */ + MMX_END; +#endif + +#else // defined(MODULE_NAME_IS_i420_yuy2_sse2) + /* + ** SSE2 128 bits fetch/store instructions are faster + ** if memory access is 16 bytes aligned + */ + if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch| + ((intptr_t)p_line2|(intptr_t)p_y2))) ) + { + /* use faster SSE2 aligned fetch and store */ + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) + { + p_line1 = p_line2; + p_line2 += p_dest->p->i_pitch; + + p_y1 = p_y2; + p_y2 += p_source->p[Y_PLANE].i_pitch; + + for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; ) + { + SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED ); + } + for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; ) + { + C_YUV420_UYVY( ); + } + + p_y1 += i_source_margin; + p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + p_line1 += i_dest_margin; + p_line2 += i_dest_margin; + } + } + else + { + /* use slower SSE2 unaligned fetch and store */ + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) + { + p_line1 = p_line2; + p_line2 += p_dest->p->i_pitch; + + p_y1 = p_y2; + p_y2 += p_source->p[Y_PLANE].i_pitch; + + for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; ) + { + SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED ); + } + for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; ) + { + C_YUV420_UYVY( ); + } + + p_y1 += i_source_margin; + p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + p_line1 += i_dest_margin; + p_line2 += i_dest_margin; + } + } + /* make sure all SSE2 stores are visible thereafter */ + SSE2_END; +#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2) } #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec) @@ -460,8 +969,8 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source, * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1 *****************************************************************************/ #if defined (MODULE_NAME_IS_i420_yuy2) -static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source, - picture_t *p_dest ) +static void I420_Y211( filter_t *p_filter, picture_t *p_source, + picture_t *p_dest ) { uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels; uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS; @@ -477,7 +986,7 @@ static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source, const int i_dest_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; - for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) + for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; ) { p_line1 = p_line2; p_line2 += p_dest->p->i_pitch; @@ -485,7 +994,7 @@ static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source, p_y1 = p_y2; p_y2 += p_source->p[Y_PLANE].i_pitch; - for( i_x = p_vout->render.i_width / 8 ; i_x-- ; ) + for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; ) { C_YUV420_Y211( ); C_YUV420_Y211( ); @@ -500,4 +1009,3 @@ static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source, } } #endif -