X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=modules%2Fvideo_chroma%2Fi420_yuy2.c;h=8a76fcb46f61cd64021fbf2a737117007f01b3a4;hb=f4f90e674b23ba5a949d0bffd942451685d31907;hp=e42e0cace24b617879608b90d46eb84e8870d572;hpb=734d0a85b5ae5af4aef5e6ed96c84342cf553444;p=vlc diff --git a/modules/video_chroma/i420_yuy2.c b/modules/video_chroma/i420_yuy2.c index e42e0cace2..8a76fcb46f 100644 --- a/modules/video_chroma/i420_yuy2.c +++ b/modules/video_chroma/i420_yuy2.c @@ -1,10 +1,11 @@ /***************************************************************************** * i420_yuy2.c : YUV to YUV conversion module for vlc ***************************************************************************** - * Copyright (C) 2000, 2001 VideoLAN + * Copyright (C) 2000, 2001 the VideoLAN team * $Id$ * * Authors: Samuel Hocevar + * Damien Fouilleul * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -18,7 +19,7 @@ * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA. *****************************************************************************/ /***************************************************************************** @@ -28,9 +29,9 @@ #include /* malloc(), free() */ #include -#include +#include -#ifdef HAVE_ALTIVEC_H +#if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H) # include #endif @@ -42,8 +43,10 @@ # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211" #elif defined (MODULE_NAME_IS_i420_yuy2_mmx) # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv" +#elif defined (MODULE_NAME_IS_i420_yuy2_sse2) +# define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv" #elif defined (MODULE_NAME_IS_i420_yuy2_altivec) -# define DEST_FOURCC "YUY2,YUNV" +# define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422" #endif /***************************************************************************** @@ -52,9 +55,9 @@ static int Activate ( vlc_object_t * ); static void I420_YUY2 ( vout_thread_t *, picture_t *, picture_t * ); -#if !defined (MODULE_NAME_IS_i420_yuy2_altivec) static void I420_YVYU ( vout_thread_t *, picture_t *, picture_t * ); static void I420_UYVY ( vout_thread_t *, picture_t *, picture_t * ); +#if !defined (MODULE_NAME_IS_i420_yuy2_altivec) static void I420_IUYV ( vout_thread_t *, picture_t *, picture_t * ); static void I420_cyuv ( vout_thread_t *, picture_t *, picture_t * ); #endif @@ -63,8 +66,9 @@ static void I420_Y211 ( vout_thread_t *, picture_t *, picture_t * ); #endif #ifdef MODULE_NAME_IS_i420_yuy2_mmx -static uint64_t i_00ffw; -static uint64_t i_80w; +/* Initialize MMX-specific constants */ +static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL; +static const uint64_t i_80w = 0x0000000080808080ULL; #endif /***************************************************************************** @@ -78,12 +82,13 @@ vlc_module_begin(); set_description( _("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) ); set_capability( "chroma", 100 ); add_requirement( MMX ); - /* Initialize MMX-specific constants */ - i_00ffw = 0x00ff00ff00ff00ffULL; - i_80w = 0x0000000080808080ULL; +#elif defined (MODULE_NAME_IS_i420_yuy2_sse2) + set_description( _("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) ); + set_capability( "chroma", 120 ); + add_requirement( SSE2 ); #elif defined (MODULE_NAME_IS_i420_yuy2_altivec) set_description( - _("Altivec conversions from " SRC_FOURCC " to " DEST_FOURCC) ); + _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) ); set_capability( "chroma", 100 ); add_requirement( ALTIVEC ); #endif @@ -116,7 +121,6 @@ static int Activate( vlc_object_t *p_this ) p_vout->chroma.pf_convert = I420_YUY2; break; -#if !defined (MODULE_NAME_IS_i420_yuy2_altivec) case VLC_FOURCC('Y','V','Y','U'): p_vout->chroma.pf_convert = I420_YVYU; break; @@ -126,7 +130,7 @@ static int Activate( vlc_object_t *p_this ) case VLC_FOURCC('Y','4','2','2'): p_vout->chroma.pf_convert = I420_UYVY; break; - +#if !defined (MODULE_NAME_IS_i420_yuy2_altivec) case VLC_FOURCC('I','U','Y','V'): p_vout->chroma.pf_convert = I420_IUYV; break; @@ -154,8 +158,17 @@ static int Activate( vlc_object_t *p_this ) return 0; } -/* Following functions are local */ +#if 0 +static inline unsigned long long read_cycles(void) +{ + unsigned long long v; + __asm__ __volatile__("rdtsc" : "=A" (v): ); + + return v; +} +#endif +/* Following functions are local */ /***************************************************************************** * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2 *****************************************************************************/ @@ -249,11 +262,14 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source, #undef VEC_MERGE #endif - const int i_source_margin = p_source->p->i_pitch - - p_source->p->i_visible_pitch; + const int i_source_margin = p_source->p[0].i_pitch + - p_source->p[0].i_visible_pitch; + const int i_source_margin_c = p_source->p[1].i_pitch + - p_source->p[1].i_visible_pitch; const int i_dest_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; +#if !defined(MODULE_NAME_IS_i420_yuy2_sse2) for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) { p_line1 = p_line2; @@ -262,33 +278,114 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source, p_y1 = p_y2; p_y2 += p_source->p[Y_PLANE].i_pitch; - for( i_x = p_vout->render.i_width / 8 ; i_x-- ; ) - { #if !defined (MODULE_NAME_IS_i420_yuy2_mmx) + for( i_x = p_vout->render.i_width / 8; i_x-- ; ) + { C_YUV420_YUYV( ); C_YUV420_YUYV( ); C_YUV420_YUYV( ); C_YUV420_YUYV( ); + } #else + for( i_x = p_vout->render.i_width / 8 ; i_x-- ; ) + { MMX_CALL( MMX_YUV420_YUYV ); + } #endif + for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; ) + { + C_YUV420_YUYV( ); } p_y1 += i_source_margin; p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; p_line1 += i_dest_margin; p_line2 += i_dest_margin; } +#if defined (MODULE_NAME_IS_i420_yuy2_mmx) + /* re-enable FPU registers */ + __asm__ __volatile__ ( "emms" ); +#endif + #if defined (MODULE_NAME_IS_i420_yuy2_altivec) } #endif + +#else // defined(MODULE_NAME_IS_i420_yuy2_sse2) + /* + ** SSE2 128 bits fetch/store instructions are faster + ** if memory access is 16 bytes aligned + */ + + if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch| + ((int)p_line2|(int)p_y2))) ) + { + /* use faster SSE2 aligned fetch and store */ + for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) + { + p_line1 = p_line2; + p_line2 += p_dest->p->i_pitch; + + p_y1 = p_y2; + p_y2 += p_source->p[Y_PLANE].i_pitch; + + for( i_x = p_vout->render.i_width / 16 ; i_x-- ; ) + { + SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED ); + } + for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; ) + { + C_YUV420_YUYV( ); + } + + p_y1 += i_source_margin; + p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + p_line1 += i_dest_margin; + p_line2 += i_dest_margin; + } + /* make sure all SSE2 stores are visible thereafter */ + __asm__ __volatile__ ( "sfence" ); + } + else + { + /* use slower SSE2 unaligned fetch and store */ + for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) + { + p_line1 = p_line2; + p_line2 += p_dest->p->i_pitch; + + p_y1 = p_y2; + p_y2 += p_source->p[Y_PLANE].i_pitch; + + for( i_x = p_vout->render.i_width / 16 ; i_x-- ; ) + { + SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED ); + } + for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; ) + { + C_YUV420_YUYV( ); + } + + p_y1 += i_source_margin; + p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + p_line1 += i_dest_margin; + p_line2 += i_dest_margin; + } + } + +#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2) } /***************************************************************************** * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2 *****************************************************************************/ -#if !defined (MODULE_NAME_IS_i420_yuy2_altivec) static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, picture_t *p_dest ) { @@ -299,11 +396,94 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, int i_x, i_y; - const int i_source_margin = p_source->p->i_pitch - - p_source->p->i_visible_pitch; +#if defined (MODULE_NAME_IS_i420_yuy2_altivec) +#define VEC_NEXT_LINES( ) \ + p_line1 = p_line2; \ + p_line2 += p_dest->p->i_pitch; \ + p_y1 = p_y2; \ + p_y2 += p_source->p[Y_PLANE].i_pitch; + +#define VEC_LOAD_UV( ) \ + u_vec = vec_ld( 0, p_u ); p_u += 16; \ + v_vec = vec_ld( 0, p_v ); p_v += 16; + +#define VEC_MERGE( a ) \ + vu_vec = a( v_vec, u_vec ); \ + y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \ + vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \ + vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \ + y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \ + vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \ + vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; + + vector unsigned char u_vec; + vector unsigned char v_vec; + vector unsigned char vu_vec; + vector unsigned char y_vec; + + if( !( ( p_vout->render.i_width % 32 ) | + ( p_vout->render.i_height % 2 ) ) ) + { + /* Width is a multiple of 32, we take 2 lines at a time */ + for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) + { + VEC_NEXT_LINES( ); + for( i_x = p_vout->render.i_width / 32 ; i_x-- ; ) + { + VEC_LOAD_UV( ); + VEC_MERGE( vec_mergeh ); + VEC_MERGE( vec_mergel ); + } + } + } + else if( !( ( p_vout->render.i_width % 16 ) | + ( p_vout->render.i_height % 4 ) ) ) + { + /* Width is only a multiple of 16, we take 4 lines at a time */ + for( i_y = p_vout->render.i_height / 4 ; i_y-- ; ) + { + /* Line 1 and 2, pixels 0 to ( width - 16 ) */ + VEC_NEXT_LINES( ); + for( i_x = p_vout->render.i_width / 32 ; i_x-- ; ) + { + VEC_LOAD_UV( ); + VEC_MERGE( vec_mergeh ); + VEC_MERGE( vec_mergel ); + } + + /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */ + VEC_LOAD_UV( ); + VEC_MERGE( vec_mergeh ); + + /* Line 3 and 4, pixels 0 to 16 */ + VEC_NEXT_LINES( ); + VEC_MERGE( vec_mergel ); + + /* Line 3 and 4, pixels 16 to ( width ) */ + for( i_x = p_vout->render.i_width / 32 ; i_x-- ; ) + { + VEC_LOAD_UV( ); + VEC_MERGE( vec_mergeh ); + VEC_MERGE( vec_mergel ); + } + } + } + else + { + /* Crap, use the C version */ +#undef VEC_NEXT_LINES +#undef VEC_LOAD_UV +#undef VEC_MERGE +#endif + + const int i_source_margin = p_source->p[0].i_pitch + - p_source->p[0].i_visible_pitch; + const int i_source_margin_c = p_source->p[1].i_pitch + - p_source->p[1].i_visible_pitch; const int i_dest_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; +#if !defined(MODULE_NAME_IS_i420_yuy2_sse2) for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) { p_line1 = p_line2; @@ -314,7 +494,7 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, for( i_x = p_vout->render.i_width / 8 ; i_x-- ; ) { -#if defined (MODULE_NAME_IS_i420_yuy2) +#if !defined (MODULE_NAME_IS_i420_yuy2_mmx) C_YUV420_YVYU( ); C_YUV420_YVYU( ); C_YUV420_YVYU( ); @@ -323,12 +503,93 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, MMX_CALL( MMX_YUV420_YVYU ); #endif } + for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; ) + { + C_YUV420_YVYU( ); + } p_y1 += i_source_margin; p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; p_line1 += i_dest_margin; p_line2 += i_dest_margin; } + +#if defined (MODULE_NAME_IS_i420_yuy2_mmx) + /* re-enable FPU registers */ + __asm__ __volatile__ ( "emms" ); +#endif + +#if defined (MODULE_NAME_IS_i420_yuy2_altivec) + } +#endif + +#else // defined(MODULE_NAME_IS_i420_yuy2_sse2) + /* + ** SSE2 128 bits fetch/store instructions are faster + ** if memory access is 16 bytes aligned + */ + if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch| + ((int)p_line2|(int)p_y2))) ) + { + /* use faster SSE2 aligned fetch and store */ + for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) + { + p_line1 = p_line2; + p_line2 += p_dest->p->i_pitch; + + p_y1 = p_y2; + p_y2 += p_source->p[Y_PLANE].i_pitch; + + for( i_x = p_vout->render.i_width / 16 ; i_x-- ; ) + { + SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED ); + } + for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; ) + { + C_YUV420_YVYU( ); + } + + p_y1 += i_source_margin; + p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + p_line1 += i_dest_margin; + p_line2 += i_dest_margin; + } + /* make sure all SSE2 stores are visible thereafter */ + __asm__ __volatile__ ( "sfence" ); + } + else + { + /* use slower SSE2 unaligned fetch and store */ + for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) + { + p_line1 = p_line2; + p_line2 += p_dest->p->i_pitch; + + p_y1 = p_y2; + p_y2 += p_source->p[Y_PLANE].i_pitch; + + for( i_x = p_vout->render.i_width / 16 ; i_x-- ; ) + { + SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED ); + } + for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; ) + { + C_YUV420_YVYU( ); + } + + p_y1 += i_source_margin; + p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + p_line1 += i_dest_margin; + p_line2 += i_dest_margin; + } + } +#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2) } /***************************************************************************** @@ -344,11 +605,94 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source, int i_x, i_y; - const int i_source_margin = p_source->p->i_pitch - - p_source->p->i_visible_pitch; +#if defined (MODULE_NAME_IS_i420_yuy2_altivec) +#define VEC_NEXT_LINES( ) \ + p_line1 = p_line2; \ + p_line2 += p_dest->p->i_pitch; \ + p_y1 = p_y2; \ + p_y2 += p_source->p[Y_PLANE].i_pitch; + +#define VEC_LOAD_UV( ) \ + u_vec = vec_ld( 0, p_u ); p_u += 16; \ + v_vec = vec_ld( 0, p_v ); p_v += 16; + +#define VEC_MERGE( a ) \ + uv_vec = a( u_vec, v_vec ); \ + y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \ + vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \ + vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \ + y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \ + vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \ + vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; + + vector unsigned char u_vec; + vector unsigned char v_vec; + vector unsigned char uv_vec; + vector unsigned char y_vec; + + if( !( ( p_vout->render.i_width % 32 ) | + ( p_vout->render.i_height % 2 ) ) ) + { + /* Width is a multiple of 32, we take 2 lines at a time */ + for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) + { + VEC_NEXT_LINES( ); + for( i_x = p_vout->render.i_width / 32 ; i_x-- ; ) + { + VEC_LOAD_UV( ); + VEC_MERGE( vec_mergeh ); + VEC_MERGE( vec_mergel ); + } + } + } + else if( !( ( p_vout->render.i_width % 16 ) | + ( p_vout->render.i_height % 4 ) ) ) + { + /* Width is only a multiple of 16, we take 4 lines at a time */ + for( i_y = p_vout->render.i_height / 4 ; i_y-- ; ) + { + /* Line 1 and 2, pixels 0 to ( width - 16 ) */ + VEC_NEXT_LINES( ); + for( i_x = p_vout->render.i_width / 32 ; i_x-- ; ) + { + VEC_LOAD_UV( ); + VEC_MERGE( vec_mergeh ); + VEC_MERGE( vec_mergel ); + } + + /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */ + VEC_LOAD_UV( ); + VEC_MERGE( vec_mergeh ); + + /* Line 3 and 4, pixels 0 to 16 */ + VEC_NEXT_LINES( ); + VEC_MERGE( vec_mergel ); + + /* Line 3 and 4, pixels 16 to ( width ) */ + for( i_x = p_vout->render.i_width / 32 ; i_x-- ; ) + { + VEC_LOAD_UV( ); + VEC_MERGE( vec_mergeh ); + VEC_MERGE( vec_mergel ); + } + } + } + else + { + /* Crap, use the C version */ +#undef VEC_NEXT_LINES +#undef VEC_LOAD_UV +#undef VEC_MERGE +#endif + + const int i_source_margin = p_source->p[0].i_pitch + - p_source->p[0].i_visible_pitch; + const int i_source_margin_c = p_source->p[1].i_pitch + - p_source->p[1].i_visible_pitch; const int i_dest_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; +#if !defined(MODULE_NAME_IS_i420_yuy2_sse2) for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) { p_line1 = p_line2; @@ -359,7 +703,7 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source, for( i_x = p_vout->render.i_width / 8 ; i_x-- ; ) { -#if defined (MODULE_NAME_IS_i420_yuy2) +#if !defined (MODULE_NAME_IS_i420_yuy2_mmx) C_YUV420_UYVY( ); C_YUV420_UYVY( ); C_YUV420_UYVY( ); @@ -368,14 +712,96 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source, MMX_CALL( MMX_YUV420_UYVY ); #endif } + for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x--; ) + { + C_YUV420_UYVY( ); + } p_y1 += i_source_margin; p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; p_line1 += i_dest_margin; p_line2 += i_dest_margin; } + +#if defined (MODULE_NAME_IS_i420_yuy2_mmx) + /* re-enable FPU registers */ + __asm__ __volatile__ ( "emms" ); +#endif + +#if defined (MODULE_NAME_IS_i420_yuy2_altivec) + } +#endif + +#else // defined(MODULE_NAME_IS_i420_yuy2_sse2) + /* + ** SSE2 128 bits fetch/store instructions are faster + ** if memory access is 16 bytes aligned + */ + if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch| + ((int)p_line2|(int)p_y2))) ) + { + /* use faster SSE2 aligned fetch and store */ + for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) + { + p_line1 = p_line2; + p_line2 += p_dest->p->i_pitch; + + p_y1 = p_y2; + p_y2 += p_source->p[Y_PLANE].i_pitch; + + for( i_x = p_vout->render.i_width / 16 ; i_x-- ; ) + { + SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED ); + } + for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; ) + { + C_YUV420_UYVY( ); + } + + p_y1 += i_source_margin; + p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + p_line1 += i_dest_margin; + p_line2 += i_dest_margin; + } + /* make sure all SSE2 stores are visible thereafter */ + __asm__ __volatile__ ( "sfence" ); + } + else + { + /* use slower SSE2 unaligned fetch and store */ + for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) + { + p_line1 = p_line2; + p_line2 += p_dest->p->i_pitch; + + p_y1 = p_y2; + p_y2 += p_source->p[Y_PLANE].i_pitch; + + for( i_x = p_vout->render.i_width / 16 ; i_x-- ; ) + { + SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED ); + } + for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; ) + { + C_YUV420_UYVY( ); + } + + p_y1 += i_source_margin; + p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + p_line1 += i_dest_margin; + p_line2 += i_dest_margin; + } + } +#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2) } +#if !defined (MODULE_NAME_IS_i420_yuy2_altivec) /***************************************************************************** * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2 *****************************************************************************/ @@ -393,21 +819,24 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source, picture_t *p_dest ) { uint8_t *p_line1 = p_dest->p->p_pixels + - p_dest->p->i_lines * p_dest->p->i_pitch + p_dest->p->i_visible_lines * p_dest->p->i_pitch + p_dest->p->i_pitch; uint8_t *p_line2 = p_dest->p->p_pixels + - p_dest->p->i_lines * p_dest->p->i_pitch; + p_dest->p->i_visible_lines * p_dest->p->i_pitch; uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS; uint8_t *p_u = p_source->U_PIXELS; uint8_t *p_v = p_source->V_PIXELS; int i_x, i_y; - const int i_source_margin = p_source->p->i_pitch - - p_source->p->i_visible_pitch; + const int i_source_margin = p_source->p[0].i_pitch + - p_source->p[0].i_visible_pitch; + const int i_source_margin_c = p_source->p[1].i_pitch + - p_source->p[1].i_visible_pitch; const int i_dest_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; +#if !defined(MODULE_NAME_IS_i420_yuy2_sse2) for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) { p_line1 -= 3 * p_dest->p->i_pitch; @@ -418,7 +847,7 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source, for( i_x = p_vout->render.i_width / 8 ; i_x-- ; ) { -#if defined (MODULE_NAME_IS_i420_yuy2) +#if !defined (MODULE_NAME_IS_i420_yuy2_mmx) C_YUV420_UYVY( ); C_YUV420_UYVY( ); C_YUV420_UYVY( ); @@ -427,12 +856,89 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source, MMX_CALL( MMX_YUV420_UYVY ); #endif } + for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; ) + { + C_YUV420_UYVY( ); + } p_y1 += i_source_margin; p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; p_line1 += i_dest_margin; p_line2 += i_dest_margin; } + +#if defined (MODULE_NAME_IS_i420_yuy2_mmx) + /* re-enable FPU registers */ + __asm__ __volatile__ ( "emms" ); +#endif + +#else // defined(MODULE_NAME_IS_i420_yuy2_sse2) + /* + ** SSE2 128 bits fetch/store instructions are faster + ** if memory access is 16 bytes aligned + */ + if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch| + ((int)p_line2|(int)p_y2))) ) + { + /* use faster SSE2 aligned fetch and store */ + for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) + { + p_line1 = p_line2; + p_line2 += p_dest->p->i_pitch; + + p_y1 = p_y2; + p_y2 += p_source->p[Y_PLANE].i_pitch; + + for( i_x = p_vout->render.i_width / 16 ; i_x-- ; ) + { + SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED ); + } + for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; ) + { + C_YUV420_UYVY( ); + } + + p_y1 += i_source_margin; + p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + p_line1 += i_dest_margin; + p_line2 += i_dest_margin; + } + /* make sure all SSE2 stores are visible thereafter */ + __asm__ __volatile__ ( "sfence" ); + } + else + { + /* use slower SSE2 unaligned fetch and store */ + for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) + { + p_line1 = p_line2; + p_line2 += p_dest->p->i_pitch; + + p_y1 = p_y2; + p_y2 += p_source->p[Y_PLANE].i_pitch; + + for( i_x = p_vout->render.i_width / 16 ; i_x-- ; ) + { + SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED ); + } + for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; ) + { + C_YUV420_UYVY( ); + } + + p_y1 += i_source_margin; + p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + p_line1 += i_dest_margin; + p_line2 += i_dest_margin; + } + } +#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2) } #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec) @@ -450,8 +956,10 @@ static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source, int i_x, i_y; - const int i_source_margin = p_source->p->i_pitch - - p_source->p->i_visible_pitch; + const int i_source_margin = p_source->p[0].i_pitch + - p_source->p[0].i_visible_pitch; + const int i_source_margin_c = p_source->p[1].i_pitch + - p_source->p[1].i_visible_pitch; const int i_dest_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; @@ -471,9 +979,10 @@ static void I420_Y211( vout_thread_t *p_vout, picture_t *p_source, p_y1 += i_source_margin; p_y2 += i_source_margin; + p_u += i_source_margin_c; + p_v += i_source_margin_c; p_line1 += i_dest_margin; p_line2 += i_dest_margin; } } #endif -