From ba0aca83c8fac83f81ca495f2160f98929b3f99f Mon Sep 17 00:00:00 2001 From: Damien Fouilleul Date: Tue, 10 Oct 2006 20:53:05 +0000 Subject: [PATCH] - added altivec acceleration for i420 -> YVYU, i420 -> UYVY (probably of no use to anyone) --- modules/video_chroma/i420_yuy2.c | 179 +++++++++++++++++++++++++++++-- 1 file changed, 173 insertions(+), 6 deletions(-) diff --git a/modules/video_chroma/i420_yuy2.c b/modules/video_chroma/i420_yuy2.c index 9079438c0b..ecfc330684 100644 --- a/modules/video_chroma/i420_yuy2.c +++ b/modules/video_chroma/i420_yuy2.c @@ -43,7 +43,7 @@ #elif defined (MODULE_NAME_IS_i420_yuy2_mmx) # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv" #elif defined (MODULE_NAME_IS_i420_yuy2_altivec) -# define DEST_FOURCC "YUY2,YUNV" +# define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422" #endif /***************************************************************************** @@ -52,9 +52,9 @@ static int Activate ( vlc_object_t * ); static void I420_YUY2 ( vout_thread_t *, picture_t *, picture_t * ); -#if !defined (MODULE_NAME_IS_i420_yuy2_altivec) static void I420_YVYU ( vout_thread_t *, picture_t *, picture_t * ); static void I420_UYVY ( vout_thread_t *, picture_t *, picture_t * ); +#if !defined (MODULE_NAME_IS_i420_yuy2_altivec) static void I420_IUYV ( vout_thread_t *, picture_t *, picture_t * ); static void I420_cyuv ( vout_thread_t *, picture_t *, picture_t * ); #endif @@ -116,7 +116,6 @@ static int Activate( vlc_object_t *p_this ) p_vout->chroma.pf_convert = I420_YUY2; break; -#if !defined (MODULE_NAME_IS_i420_yuy2_altivec) case VLC_FOURCC('Y','V','Y','U'): p_vout->chroma.pf_convert = I420_YVYU; break; @@ -127,6 +126,7 @@ static int Activate( vlc_object_t *p_this ) p_vout->chroma.pf_convert = I420_UYVY; break; +#if !defined (MODULE_NAME_IS_i420_yuy2_altivec) case VLC_FOURCC('I','U','Y','V'): p_vout->chroma.pf_convert = I420_IUYV; break; @@ -296,7 +296,6 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source, /***************************************************************************** * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2 *****************************************************************************/ -#if !defined (MODULE_NAME_IS_i420_yuy2_altivec) static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, picture_t *p_dest ) { @@ -307,6 +306,86 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, int i_x, i_y; +#if defined (MODULE_NAME_IS_i420_yuy2_altivec) +#define VEC_NEXT_LINES( ) \ + p_line1 = p_line2; \ + p_line2 += p_dest->p->i_pitch; \ + p_y1 = p_y2; \ + p_y2 += p_source->p[Y_PLANE].i_pitch; + +#define VEC_LOAD_UV( ) \ + u_vec = vec_ld( 0, p_u ); p_u += 16; \ + v_vec = vec_ld( 0, p_v ); p_v += 16; + +#define VEC_MERGE( a ) \ + vu_vec = a( v_vec, u_vec ); \ + y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \ + vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \ + vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \ + y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \ + vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \ + vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; + + vector unsigned char u_vec; + vector unsigned char v_vec; + vector unsigned char vu_vec; + vector unsigned char y_vec; + + if( !( ( p_vout->render.i_width % 32 ) | + ( p_vout->render.i_height % 2 ) ) ) + { + /* Width is a multiple of 32, we take 2 lines at a time */ + for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) + { + VEC_NEXT_LINES( ); + for( i_x = p_vout->render.i_width / 32 ; i_x-- ; ) + { + VEC_LOAD_UV( ); + VEC_MERGE( vec_mergeh ); + VEC_MERGE( vec_mergel ); + } + } + } + else if( !( ( p_vout->render.i_width % 16 ) | + ( p_vout->render.i_height % 4 ) ) ) + { + /* Width is only a multiple of 16, we take 4 lines at a time */ + for( i_y = p_vout->render.i_height / 4 ; i_y-- ; ) + { + /* Line 1 and 2, pixels 0 to ( width - 16 ) */ + VEC_NEXT_LINES( ); + for( i_x = p_vout->render.i_width / 32 ; i_x-- ; ) + { + VEC_LOAD_UV( ); + VEC_MERGE( vec_mergeh ); + VEC_MERGE( vec_mergel ); + } + + /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */ + VEC_LOAD_UV( ); + VEC_MERGE( vec_mergeh ); + + /* Line 3 and 4, pixels 0 to 16 */ + VEC_NEXT_LINES( ); + VEC_MERGE( vec_mergel ); + + /* Line 3 and 4, pixels 16 to ( width ) */ + for( i_x = p_vout->render.i_width / 32 ; i_x-- ; ) + { + VEC_LOAD_UV( ); + VEC_MERGE( vec_mergeh ); + VEC_MERGE( vec_mergel ); + } + } + } + else + { + /* Crap, use the C version */ +#undef VEC_NEXT_LINES +#undef VEC_LOAD_UV +#undef VEC_MERGE +#endif + const int i_source_margin = p_source->p[0].i_pitch - p_source->p[0].i_visible_pitch; const int i_source_margin_c = p_source->p[1].i_pitch @@ -324,7 +403,7 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, for( i_x = p_vout->render.i_width / 8 ; i_x-- ; ) { -#if defined (MODULE_NAME_IS_i420_yuy2) +#if !defined (MODULE_NAME_IS_i420_yuy2_mmx) C_YUV420_YVYU( ); C_YUV420_YVYU( ); C_YUV420_YVYU( ); @@ -341,6 +420,9 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, p_line1 += i_dest_margin; p_line2 += i_dest_margin; } +#if defined (MODULE_NAME_IS_i420_yuy2_altivec) + } +#endif } /***************************************************************************** @@ -356,6 +438,86 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source, int i_x, i_y; +#if defined (MODULE_NAME_IS_i420_yuy2_altivec) +#define VEC_NEXT_LINES( ) \ + p_line1 = p_line2; \ + p_line2 += p_dest->p->i_pitch; \ + p_y1 = p_y2; \ + p_y2 += p_source->p[Y_PLANE].i_pitch; + +#define VEC_LOAD_UV( ) \ + u_vec = vec_ld( 0, p_u ); p_u += 16; \ + v_vec = vec_ld( 0, p_v ); p_v += 16; + +#define VEC_MERGE( a ) \ + uv_vec = a( u_vec, v_vec ); \ + y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \ + vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \ + vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \ + y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \ + vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \ + vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; + + vector unsigned char u_vec; + vector unsigned char v_vec; + vector unsigned char uv_vec; + vector unsigned char y_vec; + + if( !( ( p_vout->render.i_width % 32 ) | + ( p_vout->render.i_height % 2 ) ) ) + { + /* Width is a multiple of 32, we take 2 lines at a time */ + for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) + { + VEC_NEXT_LINES( ); + for( i_x = p_vout->render.i_width / 32 ; i_x-- ; ) + { + VEC_LOAD_UV( ); + VEC_MERGE( vec_mergeh ); + VEC_MERGE( vec_mergel ); + } + } + } + else if( !( ( p_vout->render.i_width % 16 ) | + ( p_vout->render.i_height % 4 ) ) ) + { + /* Width is only a multiple of 16, we take 4 lines at a time */ + for( i_y = p_vout->render.i_height / 4 ; i_y-- ; ) + { + /* Line 1 and 2, pixels 0 to ( width - 16 ) */ + VEC_NEXT_LINES( ); + for( i_x = p_vout->render.i_width / 32 ; i_x-- ; ) + { + VEC_LOAD_UV( ); + VEC_MERGE( vec_mergeh ); + VEC_MERGE( vec_mergel ); + } + + /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */ + VEC_LOAD_UV( ); + VEC_MERGE( vec_mergeh ); + + /* Line 3 and 4, pixels 0 to 16 */ + VEC_NEXT_LINES( ); + VEC_MERGE( vec_mergel ); + + /* Line 3 and 4, pixels 16 to ( width ) */ + for( i_x = p_vout->render.i_width / 32 ; i_x-- ; ) + { + VEC_LOAD_UV( ); + VEC_MERGE( vec_mergeh ); + VEC_MERGE( vec_mergel ); + } + } + } + else + { + /* Crap, use the C version */ +#undef VEC_NEXT_LINES +#undef VEC_LOAD_UV +#undef VEC_MERGE +#endif + const int i_source_margin = p_source->p[0].i_pitch - p_source->p[0].i_visible_pitch; const int i_source_margin_c = p_source->p[1].i_pitch @@ -373,7 +535,7 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source, for( i_x = p_vout->render.i_width / 8 ; i_x-- ; ) { -#if defined (MODULE_NAME_IS_i420_yuy2) +#if !defined (MODULE_NAME_IS_i420_yuy2_mmx) C_YUV420_UYVY( ); C_YUV420_UYVY( ); C_YUV420_UYVY( ); @@ -398,8 +560,13 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source, #if defined (MODULE_NAME_IS_i420_yuy2_mmx) __asm__ __volatile__("emms" :: ); #endif + +#if defined (MODULE_NAME_IS_i420_yuy2_altivec) + } +#endif } +#if !defined (MODULE_NAME_IS_i420_yuy2_altivec) /***************************************************************************** * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2 *****************************************************************************/ -- 2.39.2