From 262b177b0890b6b1943cb9c4838bcffc5152f290 Mon Sep 17 00:00:00 2001
From: Damien Fouilleul
Date: Fri, 10 Aug 2007 18:28:49 +0000
Subject: [PATCH] i422_yuy2: SSE2 improvements

---
 configure.ac                     |   6 +-
 modules/video_chroma/Modules.am  |   5 +
 modules/video_chroma/i422_yuy2.c | 208 ++++++++++++++++++++++++++++---
 modules/video_chroma/i422_yuy2.h | 131 ++++++++++++++++++-
 4 files changed, 327 insertions(+), 23 deletions(-)

diff --git a/configure.ac b/configure.ac
index 356f146869..070948ca64 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1257,7 +1257,7 @@ MMXEXT_MODULES="memcpymmxext"
 #MMXEXT_MODULES="${MMXEXT_MODULES} idctmmxext motionmmxext"
 THREEDNOW_MODULES="memcpy3dn"
 SSE_MODULES=""
-SSE2_MODULES="i420_rgb_sse2 i420_yuy2_sse2"
+SSE2_MODULES="i420_rgb_sse2 i420_yuy2_sse2 i422_yuy2_sse2"
 ALTIVEC_MODULES="memcpyaltivec i420_yuy2_altivec"
 #ALTIVEC_MODULES="${ALTIVEC_MODULES} idctaltivec motionaltivec"

@@ -1283,7 +1283,7 @@ AC_CACHE_CHECK([if \$CC groks MMX intrinsics],
   [ac_cv_c_mmx_intrinsics=no])])
 if test "${ac_cv_c_mmx_intrinsics}" != "no"; then
   AC_DEFINE(HAVE_MMX_INTRINSICS, 1, Define if MMX intrinsics are available.)
-  VLC_ADD_CFLAGS([i420_rgb_mmx],[-mmmx])
+  VLC_ADD_CFLAGS([${MMX_MODULES}],[-mmmx])
 fi

 dnl Check for fully working SSE2 intrinsics
@@ -1308,7 +1308,7 @@ AC_CACHE_CHECK([if \$CC groks SSE2 intrinsics],
   [ac_cv_c_sse2_intrinsics=no])])
 if test "${ac_cv_c_sse2_intrinsics}" != "no"; then
   AC_DEFINE(HAVE_SSE2_INTRINSICS, 1, Define if SSE2 intrinsics are available.)
-  VLC_ADD_CFLAGS([i420_rgb_sse2],[-msse2])
+  VLC_ADD_CFLAGS([${SSE2_MODULES}],[-msse2])
 fi

 AC_CACHE_CHECK([if \$CC groks MMX inline assembly],
diff --git a/modules/video_chroma/Modules.am b/modules/video_chroma/Modules.am
index ad882d2700..8ea7e1aca2 100644
--- a/modules/video_chroma/Modules.am
+++ b/modules/video_chroma/Modules.am
@@ -50,6 +50,11 @@ SOURCES_i422_yuy2_mmx = \
     i422_yuy2.h \
     $(NULL)

+SOURCES_i422_yuy2_sse2 = \
+    i422_yuy2.c \
+    i422_yuy2.h \
+    $(NULL)
+
 SOURCES_i420_ymga = \
     i420_ymga.c \
     $(NULL)
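
Note: the configure changes above only control compile-time flags (every module
listed in ${SSE2_MODULES} now gets -msse2); whether the SSE2 plugin is actually
loaded is a run-time decision made against the CPU's capabilities, declared via
add_requirement( SSE2 ) in the module descriptor below. As a minimal standalone
sketch of an equivalent run-time test, using GCC/Clang's __builtin_cpu_supports
rather than VLC's own CPU-capability API (illustration only, not part of the
patch):

    /* sse2_check.c -- illustration only */
    #include <stdio.h>

    int main(void)
    {
        /* GCC/Clang builtin: queries CPUID feature bits at run time */
        if (__builtin_cpu_supports("sse2"))
            printf("SSE2 available: i422_yuy2_sse2 may be loaded\n");
        else
            printf("no SSE2: fall back to the MMX or C converter\n");
        return 0;
    }
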
diff --git a/modules/video_chroma/i422_yuy2.c b/modules/video_chroma/i422_yuy2.c
index 4bc4528912..2119755a28 100644
--- a/modules/video_chroma/i422_yuy2.c
+++ b/modules/video_chroma/i422_yuy2.c
@@ -67,6 +67,10 @@ vlc_module_begin();
     set_description( _("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) );
     set_capability( "chroma", 100 );
     add_requirement( MMX );
+#elif defined (MODULE_NAME_IS_i422_yuy2_sse2)
+    set_description( _("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) );
+    set_capability( "chroma", 120 );
+    add_requirement( SSE2 );
 #endif
     set_callbacks( Activate, NULL );
 vlc_module_end();
@@ -143,17 +147,66 @@ static int Activate( vlc_object_t *p_this )
 static void I422_YUY2( vout_thread_t *p_vout, picture_t *p_source,
                                               picture_t *p_dest )
 {
-    uint8_t *p_pixels = p_dest->p->p_pixels;
-    int i_pitch = p_dest->p->i_pitch;
+    uint8_t *p_line = p_dest->p->p_pixels;
     uint8_t *p_y = p_source->Y_PIXELS;
     uint8_t *p_u = p_source->U_PIXELS;
     uint8_t *p_v = p_source->V_PIXELS;

     int i_x, i_y;

+    const int i_source_margin = p_source->p[0].i_pitch
+                                 - p_source->p[0].i_visible_pitch;
+    const int i_source_margin_c = p_source->p[1].i_pitch
+                                 - p_source->p[1].i_visible_pitch;
+    const int i_dest_margin = p_dest->p->i_pitch
+                               - p_dest->p->i_visible_pitch;
+
+#if defined (MODULE_NAME_IS_i422_yuy2_sse2)
+
+    if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
+        ((int)p_line|(int)p_y))) )
+    {
+        /* use faster SSE2 aligned fetch and store */
+        for( i_y = p_vout->render.i_height ; i_y-- ; )
+        {
+            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+            {
+                SSE2_CALL( SSE2_YUV422_YUYV_ALIGNED );
+            }
+            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+            {
+                C_YUV422_YUYV( p_line, p_y, p_u, p_v );
+            }
+            p_y += i_source_margin;
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+            p_line += i_dest_margin;
+        }
+    }
+    else {
+        /* use slower SSE2 unaligned fetch and store */
+        for( i_y = p_vout->render.i_height ; i_y-- ; )
+        {
+            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+            {
+                SSE2_CALL( SSE2_YUV422_YUYV_UNALIGNED );
+            }
+            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+            {
+                C_YUV422_YUYV( p_line, p_y, p_u, p_v );
+            }
+            p_y += i_source_margin;
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+            p_line += i_dest_margin;
+        }
+    }
+    SSE2_END;
+
+#else
+
     for( i_y = p_vout->render.i_height ; i_y-- ; )
     {
-        uint8_t *p_line = p_pixels;
         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
         {
 #if defined (MODULE_NAME_IS_i422_yuy2)
@@ -165,12 +218,19 @@ static void I422_YUY2( vout_thread_t *p_vout, picture_t *p_source,
             MMX_CALL( MMX_YUV422_YUYV );
 #endif
         }
-        p_pixels += i_pitch;
+        for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
+        {
+            C_YUV422_YUYV( p_line, p_y, p_u, p_v );
+        }
+        p_y += i_source_margin;
+        p_u += i_source_margin_c;
+        p_v += i_source_margin_c;
+        p_line += i_dest_margin;
     }
 #if defined (MODULE_NAME_IS_i422_yuy2_mmx)
     MMX_END;
-#elif defined (MODULE_NAME_IS_i422_yuy2_sse2)
-    SSE2_END;
+#endif
+
 #endif
 }
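
Note: the converters in this file all share the row structure introduced above:
a wide vectorized loop (16 pixels per SSE2 iteration, 8 per MMX iteration), a
plain-C loop for the leftover pixel pairs of odd widths, then a hop over the
pitch/visible-pitch margin of each plane. In plain C, one output row amounts to
the following sketch (names are illustrative, not VLC's):

    #include <stdint.h>

    /* One 4:2:2 output row: every 2 pixels share one Cb and one Cr sample. */
    static void i422_row_to_yuyv( uint8_t *p_line, const uint8_t *p_y,
                                  const uint8_t *p_u, const uint8_t *p_v,
                                  int i_width )
    {
        int i;
        for( i = 0; i < i_width / 2; i++ )
        {
            *p_line++ = *p_y++;   /* Y0 */
            *p_line++ = *p_u++;   /* U  */
            *p_line++ = *p_y++;   /* Y1 */
            *p_line++ = *p_v++;   /* V  */
        }
    }
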
@@ -180,17 +240,66 @@
 static void I422_YVYU( vout_thread_t *p_vout, picture_t *p_source,
                                               picture_t *p_dest )
 {
-    uint8_t *p_pixels = p_dest->p->p_pixels;
-    int i_pitch = p_dest->p->i_pitch;
+    uint8_t *p_line = p_dest->p->p_pixels;
     uint8_t *p_y = p_source->Y_PIXELS;
     uint8_t *p_u = p_source->U_PIXELS;
     uint8_t *p_v = p_source->V_PIXELS;

     int i_x, i_y;

+    const int i_source_margin = p_source->p[0].i_pitch
+                                 - p_source->p[0].i_visible_pitch;
+    const int i_source_margin_c = p_source->p[1].i_pitch
+                                 - p_source->p[1].i_visible_pitch;
+    const int i_dest_margin = p_dest->p->i_pitch
+                               - p_dest->p->i_visible_pitch;
+
+#if defined (MODULE_NAME_IS_i422_yuy2_sse2)
+
+    if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
+        ((int)p_line|(int)p_y))) )
+    {
+        /* use faster SSE2 aligned fetch and store */
+        for( i_y = p_vout->render.i_height ; i_y-- ; )
+        {
+            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+            {
+                SSE2_CALL( SSE2_YUV422_YVYU_ALIGNED );
+            }
+            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+            {
+                C_YUV422_YVYU( p_line, p_y, p_u, p_v );
+            }
+            p_y += i_source_margin;
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+            p_line += i_dest_margin;
+        }
+    }
+    else {
+        /* use slower SSE2 unaligned fetch and store */
+        for( i_y = p_vout->render.i_height ; i_y-- ; )
+        {
+            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+            {
+                SSE2_CALL( SSE2_YUV422_YVYU_UNALIGNED );
+            }
+            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+            {
+                C_YUV422_YVYU( p_line, p_y, p_u, p_v );
+            }
+            p_y += i_source_margin;
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+            p_line += i_dest_margin;
+        }
+    }
+    SSE2_END;
+
+#else
+
     for( i_y = p_vout->render.i_height ; i_y-- ; )
     {
-        uint8_t *p_line = p_pixels;
         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
         {
 #if defined (MODULE_NAME_IS_i422_yuy2)
@@ -202,12 +311,19 @@ static void I422_YVYU( vout_thread_t *p_vout, picture_t *p_source,
             MMX_CALL( MMX_YUV422_YVYU );
 #endif
         }
-        p_pixels += i_pitch;
+        for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
+        {
+            C_YUV422_YVYU( p_line, p_y, p_u, p_v );
+        }
+        p_y += i_source_margin;
+        p_u += i_source_margin_c;
+        p_v += i_source_margin_c;
+        p_line += i_dest_margin;
     }
 #if defined (MODULE_NAME_IS_i422_yuy2_mmx)
     MMX_END;
-#elif defined (MODULE_NAME_IS_i422_yuy2_sse2)
-    SSE2_END;
+#endif
+
 #endif
 }
@@ -217,17 +333,66 @@
 static void I422_UYVY( vout_thread_t *p_vout, picture_t *p_source,
                                               picture_t *p_dest )
 {
-    uint8_t *p_pixels = p_dest->p->p_pixels;
-    int i_pitch = p_dest->p->i_pitch;
+    uint8_t *p_line = p_dest->p->p_pixels;
     uint8_t *p_y = p_source->Y_PIXELS;
     uint8_t *p_u = p_source->U_PIXELS;
     uint8_t *p_v = p_source->V_PIXELS;

     int i_x, i_y;

+    const int i_source_margin = p_source->p[0].i_pitch
+                                 - p_source->p[0].i_visible_pitch;
+    const int i_source_margin_c = p_source->p[1].i_pitch
+                                 - p_source->p[1].i_visible_pitch;
+    const int i_dest_margin = p_dest->p->i_pitch
+                               - p_dest->p->i_visible_pitch;
+
+#if defined (MODULE_NAME_IS_i422_yuy2_sse2)
+
+    if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
+        ((int)p_line|(int)p_y))) )
+    {
+        /* use faster SSE2 aligned fetch and store */
+        for( i_y = p_vout->render.i_height ; i_y-- ; )
+        {
+            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+            {
+                SSE2_CALL( SSE2_YUV422_UYVY_ALIGNED );
+            }
+            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+            {
+                C_YUV422_UYVY( p_line, p_y, p_u, p_v );
+            }
+            p_y += i_source_margin;
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+            p_line += i_dest_margin;
+        }
+    }
+    else {
+        /* use slower SSE2 unaligned fetch and store */
+        for( i_y = p_vout->render.i_height ; i_y-- ; )
+        {
+            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+            {
+                SSE2_CALL( SSE2_YUV422_UYVY_UNALIGNED );
+            }
+            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+            {
+                C_YUV422_UYVY( p_line, p_y, p_u, p_v );
+            }
+            p_y += i_source_margin;
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+            p_line += i_dest_margin;
+        }
+    }
+    SSE2_END;
+
+#else
+
     for( i_y = p_vout->render.i_height ; i_y-- ; )
     {
-        uint8_t *p_line = p_pixels;
         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
         {
 #if defined (MODULE_NAME_IS_i422_yuy2)
@@ -239,12 +404,19 @@ static void I422_UYVY( vout_thread_t *p_vout, picture_t *p_source,
             MMX_CALL( MMX_YUV422_UYVY );
 #endif
         }
-        p_pixels += i_pitch;
+        for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; )
+        {
+            C_YUV422_UYVY( p_line, p_y, p_u, p_v );
+        }
+        p_y += i_source_margin;
+        p_u += i_source_margin_c;
+        p_v += i_source_margin_c;
+        p_line += i_dest_margin;
     }
 #if defined (MODULE_NAME_IS_i422_yuy2_mmx)
     MMX_END;
-#elif defined (MODULE_NAME_IS_i422_yuy2_sse2)
-    SSE2_END;
+#endif
+
 #endif
 }
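
Note: all three converters pack one 2-pixel pair (two luma samples sharing one
Cb and one Cr) into four output bytes; only the byte order differs. A hedged
little-endian illustration of the three layouts (helper names are mine, not
VLC's):

    #include <stdint.h>

    /* x86 is little-endian: the least-significant byte lands first in memory. */
    static inline uint32_t pack_yuyv( uint8_t y0, uint8_t y1,
                                      uint8_t u, uint8_t v )
    {   /* memory order: Y0 U Y1 V */
        return (uint32_t)y0 | (uint32_t)u << 8
             | (uint32_t)y1 << 16 | (uint32_t)v << 24;
    }

    static inline uint32_t pack_yvyu( uint8_t y0, uint8_t y1,
                                      uint8_t u, uint8_t v )
    {   /* memory order: Y0 V Y1 U */
        return (uint32_t)y0 | (uint32_t)v << 8
             | (uint32_t)y1 << 16 | (uint32_t)u << 24;
    }

    static inline uint32_t pack_uyvy( uint8_t y0, uint8_t y1,
                                      uint8_t u, uint8_t v )
    {   /* memory order: U Y0 V Y1 */
        return (uint32_t)u | (uint32_t)y0 << 8
             | (uint32_t)v << 16 | (uint32_t)y1 << 24;
    }
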
diff --git a/modules/video_chroma/i422_yuy2.h b/modules/video_chroma/i422_yuy2.h
index 85794be1b9..1b0405bb27 100644
--- a/modules/video_chroma/i422_yuy2.h
+++ b/modules/video_chroma/i422_yuy2.h
@@ -87,8 +87,49 @@ movq %%mm1, 8(%0) # Store high UYVY \n\

 #include <mmintrin.h>

+#define MMX_CALL(MMX_INSTRUCTIONS)          \
+    do {                                    \
+        __m64 mm0, mm1, mm2;                \
+        MMX_INSTRUCTIONS                    \
+        p_line += 16; p_y += 8;             \
+        p_u += 4; p_v += 4;                 \
+    } while(0)
+
 #define MMX_END _mm_empty()

+#define MMX_YUV422_YUYV                         \
+    mm0 = (__m64)*(uint64_t*)p_y;               \
+    mm1 = _mm_cvtsi32_si64(*(int*)p_u);         \
+    mm2 = _mm_cvtsi32_si64(*(int*)p_v);         \
+    mm1 = _mm_unpacklo_pi8(mm1, mm2);           \
+    mm2 = mm0;                                  \
+    mm2 = _mm_unpacklo_pi8(mm2, mm1);           \
+    *(uint64_t*)p_line = (uint64_t)mm2;         \
+    mm0 = _mm_unpackhi_pi8(mm0, mm1);           \
+    *(uint64_t*)(p_line+8) = (uint64_t)mm0;
+
+#define MMX_YUV422_YVYU                         \
+    mm0 = (__m64)*(uint64_t*)p_y;               \
+    mm2 = _mm_cvtsi32_si64(*(int*)p_u);         \
+    mm1 = _mm_cvtsi32_si64(*(int*)p_v);         \
+    mm1 = _mm_unpacklo_pi8(mm1, mm2);           \
+    mm2 = mm0;                                  \
+    mm2 = _mm_unpacklo_pi8(mm2, mm1);           \
+    *(uint64_t*)p_line = (uint64_t)mm2;         \
+    mm0 = _mm_unpackhi_pi8(mm0, mm1);           \
+    *(uint64_t*)(p_line+8) = (uint64_t)mm0;
+
+#define MMX_YUV422_UYVY                         \
+    mm0 = (__m64)*(uint64_t*)p_y;               \
+    mm1 = _mm_cvtsi32_si64(*(int*)p_u);         \
+    mm2 = _mm_cvtsi32_si64(*(int*)p_v);         \
+    mm1 = _mm_unpacklo_pi8(mm1, mm2);           \
+    mm2 = mm1;                                  \
+    mm2 = _mm_unpacklo_pi8(mm2, mm0);           \
+    *(uint64_t*)p_line = (uint64_t)mm2;         \
+    mm1 = _mm_unpackhi_pi8(mm1, mm0);           \
+    *(uint64_t*)(p_line+8) = (uint64_t)mm1;
+
 #endif

 #elif defined( MODULE_NAME_IS_i422_yuy2_sse2 )
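
Note: the MMX intrinsic macros above are pure byte interleaving: punpcklbw
first zips Cb with Cr, then zips luma with the chroma pairs. A standalone,
compilable restatement of MMX_YUV422_YUYV for one 8-pixel group (the function
name and const casts are mine, not part of the patch):

    #include <stdint.h>
    #include <mmintrin.h>

    static void yuyv_8px_mmx( uint8_t *p_line, const uint8_t *p_y,
                              const uint8_t *p_u, const uint8_t *p_v )
    {
        __m64 y  = *(const __m64 *)p_y;                    /* y7 .. y0       */
        __m64 cb = _mm_cvtsi32_si64( *(const int *)p_u );  /* 00 00 00 00 u3..u0 */
        __m64 cr = _mm_cvtsi32_si64( *(const int *)p_v );  /* 00 00 00 00 v3..v0 */
        __m64 uv = _mm_unpacklo_pi8( cb, cr );             /* v3 u3 .. v0 u0 */
        *(__m64 *)p_line       = _mm_unpacklo_pi8( y, uv );
                                         /* v1 y3 u1 y2 v0 y1 u0 y0 -> YUYV */
        *(__m64 *)(p_line + 8) = _mm_unpackhi_pi8( y, uv );
                                         /* v3 y7 u3 y6 v2 y5 u2 y4         */
        _mm_empty();                     /* MMX_END: release the FPU state  */
    }
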
@@ -97,8 +138,95 @@ movq %%mm1, 8(%0) # Store high UYVY \n\

 /* SSE2 assembly */

+#define SSE2_CALL(SSE2_INSTRUCTIONS)    \
+    do {                                \
+    __asm__ __volatile__(               \
+        ".p2align 3 \n\t"               \
+        SSE2_INSTRUCTIONS               \
+        :                               \
+        : "r" (p_line), "r" (p_y),      \
+          "r" (p_u), "r" (p_v) );       \
+        p_line += 32; p_y += 16;        \
+        p_u += 8; p_v += 8;             \
+    } while(0)
+
 #define SSE2_END __asm__ __volatile__ ( "sfence" ::: "memory" )

+#define SSE2_YUV422_YUYV_ALIGNED "                                        \n\
+movdqa      (%1), %%xmm0    # Load 16 Y       yF yE yD ... y2 y1 y0       \n\
+movq        (%2), %%xmm1    # Load 8 Cb       00 ... 00 u7 u6 ... u1 u0   \n\
+movq        (%3), %%xmm2    # Load 8 Cr       00 ... 00 v7 v6 ... v1 v0   \n\
+punpcklbw %%xmm2, %%xmm1    #                 v7 u7 v6 u6 ... v0 u0       \n\
+movdqa    %%xmm0, %%xmm2    #                 yF yE yD ... y2 y1 y0       \n\
+punpcklbw %%xmm1, %%xmm2    #                 v3 y7 u3 y6 ... u0 y0       \n\
+movntdq   %%xmm2, (%0)      # Store low YUYV                              \n\
+punpckhbw %%xmm1, %%xmm0    #                 v7 yF u7 yE ... u4 y8       \n\
+movntdq   %%xmm0, 16(%0)    # Store high YUYV                             \n\
+"
+
+#define SSE2_YUV422_YUYV_UNALIGNED "                                      \n\
+movdqu      (%1), %%xmm0    # Load 16 Y       yF yE yD ... y2 y1 y0       \n\
+movq        (%2), %%xmm1    # Load 8 Cb       00 ... 00 u7 u6 ... u1 u0   \n\
+movq        (%3), %%xmm2    # Load 8 Cr       00 ... 00 v7 v6 ... v1 v0   \n\
+prefetchnta (%0)            # Non-temporal prefetch of the output line    \n\
+punpcklbw %%xmm2, %%xmm1    #                 v7 u7 v6 u6 ... v0 u0       \n\
+movdqa    %%xmm0, %%xmm2    #                 yF yE yD ... y2 y1 y0       \n\
+punpcklbw %%xmm1, %%xmm2    #                 v3 y7 u3 y6 ... u0 y0       \n\
+movdqu    %%xmm2, (%0)      # Store low YUYV                              \n\
+punpckhbw %%xmm1, %%xmm0    #                 v7 yF u7 yE ... u4 y8       \n\
+movdqu    %%xmm0, 16(%0)    # Store high YUYV                             \n\
+"
+
+#define SSE2_YUV422_YVYU_ALIGNED "                                        \n\
+movdqa      (%1), %%xmm0    # Load 16 Y       yF yE yD ... y2 y1 y0       \n\
+movq        (%2), %%xmm2    # Load 8 Cb       00 ... 00 u7 u6 ... u1 u0   \n\
+movq        (%3), %%xmm1    # Load 8 Cr       00 ... 00 v7 v6 ... v1 v0   \n\
+punpcklbw %%xmm2, %%xmm1    #                 u7 v7 u6 v6 ... u0 v0       \n\
+movdqa    %%xmm0, %%xmm2    #                 yF yE yD ... y2 y1 y0       \n\
+punpcklbw %%xmm1, %%xmm2    #                 u3 y7 v3 y6 ... v0 y0       \n\
+movntdq   %%xmm2, (%0)      # Store low YVYU                              \n\
+punpckhbw %%xmm1, %%xmm0    #                 u7 yF v7 yE ... v4 y8       \n\
+movntdq   %%xmm0, 16(%0)    # Store high YVYU                             \n\
+"
+
+#define SSE2_YUV422_YVYU_UNALIGNED "                                      \n\
+movdqu      (%1), %%xmm0    # Load 16 Y       yF yE yD ... y2 y1 y0       \n\
+movq        (%2), %%xmm2    # Load 8 Cb       00 ... 00 u7 u6 ... u1 u0   \n\
+movq        (%3), %%xmm1    # Load 8 Cr       00 ... 00 v7 v6 ... v1 v0   \n\
+prefetchnta (%0)            # Non-temporal prefetch of the output line    \n\
+punpcklbw %%xmm2, %%xmm1    #                 u7 v7 u6 v6 ... u0 v0       \n\
+movdqa    %%xmm0, %%xmm2    #                 yF yE yD ... y2 y1 y0       \n\
+punpcklbw %%xmm1, %%xmm2    #                 u3 y7 v3 y6 ... v0 y0       \n\
+movdqu    %%xmm2, (%0)      # Store low YVYU                              \n\
+punpckhbw %%xmm1, %%xmm0    #                 u7 yF v7 yE ... v4 y8       \n\
+movdqu    %%xmm0, 16(%0)    # Store high YVYU                             \n\
+"
+
+#define SSE2_YUV422_UYVY_ALIGNED "                                        \n\
+movdqa      (%1), %%xmm0    # Load 16 Y       yF yE yD ... y2 y1 y0       \n\
+movq        (%2), %%xmm1    # Load 8 Cb       00 ... 00 u7 u6 ... u1 u0   \n\
+movq        (%3), %%xmm2    # Load 8 Cr       00 ... 00 v7 v6 ... v1 v0   \n\
+punpcklbw %%xmm2, %%xmm1    #                 v7 u7 v6 u6 ... v0 u0       \n\
+movdqa    %%xmm1, %%xmm2    #                 v7 u7 v6 u6 ... v0 u0       \n\
+punpcklbw %%xmm0, %%xmm2    #                 y7 v3 y6 u3 ... y0 u0       \n\
+movntdq   %%xmm2, (%0)      # Store low UYVY                              \n\
+punpckhbw %%xmm0, %%xmm1    #                 yF v7 yE u7 ... y8 u4       \n\
+movntdq   %%xmm1, 16(%0)    # Store high UYVY                             \n\
+"
+
+#define SSE2_YUV422_UYVY_UNALIGNED "                                      \n\
+movdqu      (%1), %%xmm0    # Load 16 Y       yF yE yD ... y2 y1 y0       \n\
+movq        (%2), %%xmm1    # Load 8 Cb       00 ... 00 u7 u6 ... u1 u0   \n\
+movq        (%3), %%xmm2    # Load 8 Cr       00 ... 00 v7 v6 ... v1 v0   \n\
+prefetchnta (%0)            # Non-temporal prefetch of the output line    \n\
+punpcklbw %%xmm2, %%xmm1    #                 v7 u7 v6 u6 ... v0 u0       \n\
+movdqa    %%xmm1, %%xmm2    #                 v7 u7 v6 u6 ... v0 u0       \n\
+punpcklbw %%xmm0, %%xmm2    #                 y7 v3 y6 u3 ... y0 u0       \n\
+movdqu    %%xmm2, (%0)      # Store low UYVY                              \n\
+punpckhbw %%xmm0, %%xmm1    #                 yF v7 yE u7 ... y8 u4       \n\
+movdqu    %%xmm1, 16(%0)    # Store high UYVY                             \n\
+"
+
 #elif defined(HAVE_SSE2_INTRINSICS)

 /* SSE2 intrinsics */

 #include <emmintrin.h>

@@ -110,7 +238,7 @@ movq %%mm1, 8(%0) # Store high UYVY \n\

 #endif

-#elif defined (MODULE_NAME_IS_i422_yuy2)
+#endif

 #define C_YUV422_YUYV( p_line, p_y, p_u, p_v ) \
     *(p_line)++ = *(p_y)++; \
@@ -136,5 +264,4 @@ movq %%mm1, 8(%0) # Store high UYVY \n\
     *(p_line)++ = *(p_y); p_y += 2; \
     *(p_line)++ = *(p_v) - 0x80; p_v += 2; \

-#endif
-- 
2.39.2
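
Note: the aligned paths end each frame with SSE2_END (sfence) because movntdq
is a weakly-ordered store that bypasses the cache on the way out; without the
fence, a consumer on another core could observe stale data. A standalone
sketch of the same store pattern using SSE2 intrinsics (assumes dst/src are
16-byte aligned and n is a multiple of 16; illustration only, not VLC code):

    #include <stddef.h>
    #include <stdint.h>
    #include <emmintrin.h>

    static void stream_copy_aligned( uint8_t *dst, const uint8_t *src,
                                     size_t n )
    {
        size_t i;
        for( i = 0; i < n; i += 16 )
        {
            __m128i v = _mm_load_si128( (const __m128i *)(src + i) );
            _mm_stream_si128( (__m128i *)(dst + i), v );  /* movntdq */
        }
        _mm_sfence();  /* make the non-temporal stores globally visible */
    }
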