From: Damien Fouilleul
Date: Wed, 15 Aug 2007 16:15:45 +0000 (+0000)
Subject: video chromas: finalize SSE2 improvements
X-Git-Tag: 0.9.0-test0~6521
X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=df3b5eec76004d2dcfbabc36ac65f64755bdbc33;p=vlc

video chromas: finalize SSE2 improvements
---

diff --git a/AUTHORS b/AUTHORS
index cf8e02eef7..d9595ecdb4 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -190,6 +190,9 @@ E: Damien.Fouilleul@laposte.net
 C: Quovodis
 D: ActiveX control
 D: Safari/Firefox plugin for MacOS X
+D: Direct3D Video output
+D: SSE2 chroma converters
+D: improved MMX chroma converters
 S: Ireland
 
 N: Alexis Guillard
diff --git a/NEWS b/NEWS
index d0cf308cd4..a12068786d 100644
--- a/NEWS
+++ b/NEWS
@@ -81,6 +81,8 @@ Video output and filters:
    was previously part of the mosaic module.
  * Fix random characters problem in RSS filter.
  * Add rotate-deciangle for more precision on rotate filter
+ * Support for Intel SSE2 instruction set in chroma converters
+ * Improved use of Intel MMX instruction set in chroma converters
 
 Audio output
  * Replay gain support.
diff --git a/modules/video_chroma/i420_yuy2.h b/modules/video_chroma/i420_yuy2.h
index 1f35a3061f..a630157dc2 100644
--- a/modules/video_chroma/i420_yuy2.h
+++ b/modules/video_chroma/i420_yuy2.h
@@ -366,8 +366,8 @@ movdqu %%xmm1, 16(%1) # Store high UYVY \n\
 #define SSE2_YUV420_YUYV_UNALIGNED \
     xmm1 = _mm_loadl_epi64((__m128i *)p_u); \
     xmm2 = _mm_loadl_epi64((__m128i *)p_v); \
-    xmm0 = _mm_load_si128((__m128i *)p_y1); \
-    xmm3 = _mm_load_si128((__m128i *)p_y2); \
+    xmm0 = _mm_loadu_si128((__m128i *)p_y1); \
+    xmm3 = _mm_loadu_si128((__m128i *)p_y2); \
     _mm_prefetch(p_line1, _MM_HINT_NTA); \
     _mm_prefetch(p_line2, _MM_HINT_NTA); \
     xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
@@ -402,8 +402,8 @@ movdqu %%xmm1, 16(%1) # Store high UYVY \n\
 #define SSE2_YUV420_YVYU_UNALIGNED \
     xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
     xmm2 = _mm_loadl_epi64((__m128i *)p_u); \
-    xmm0 = _mm_load_si128((__m128i *)p_y1); \
-    xmm3 = _mm_load_si128((__m128i *)p_y2); \
+    xmm0 = _mm_loadu_si128((__m128i *)p_y1); \
+    xmm3 = _mm_loadu_si128((__m128i *)p_y2); \
     _mm_prefetch(p_line1, _MM_HINT_NTA); \
     _mm_prefetch(p_line2, _MM_HINT_NTA); \
     xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
@@ -439,8 +439,8 @@ movdqu %%xmm1, 16(%1) # Store high UYVY \n\
 #define SSE2_YUV420_UYVY_UNALIGNED \
     xmm1 = _mm_loadl_epi64((__m128i *)p_u); \
     xmm2 = _mm_loadl_epi64((__m128i *)p_v); \
-    xmm0 = _mm_load_si128((__m128i *)p_y1); \
-    xmm3 = _mm_load_si128((__m128i *)p_y2); \
+    xmm0 = _mm_loadu_si128((__m128i *)p_y1); \
+    xmm3 = _mm_loadu_si128((__m128i *)p_y2); \
     _mm_prefetch(p_line1, _MM_HINT_NTA); \
     _mm_prefetch(p_line2, _MM_HINT_NTA); \
     xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
diff --git a/modules/video_chroma/i422_yuy2.c b/modules/video_chroma/i422_yuy2.c
index c255079f31..84eaf90aac 100644
--- a/modules/video_chroma/i422_yuy2.c
+++ b/modules/video_chroma/i422_yuy2.c
@@ -442,6 +442,61 @@ static void I422_cyuv( vout_thread_t *p_vout, picture_t *p_source,
 
     int i_x, i_y;
 
+    const int i_source_margin = p_source->p[0].i_pitch
+                     - p_source->p[0].i_visible_pitch;
+    const int i_source_margin_c = p_source->p[1].i_pitch
+                     - p_source->p[1].i_visible_pitch;
+    const int i_dest_margin = p_dest->p->i_pitch
+                     - p_dest->p->i_visible_pitch;
+
+#if defined (MODULE_NAME_IS_i422_yuy2_sse2)
+
+    if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
+                    ((int)p_line|(int)p_y))) )
+    {
+        /* use faster SSE2 aligned fetch and store */
+        for( i_y = p_vout->render.i_height ; i_y-- ; )
+        {
+            p_line -= 2 * p_dest->p->i_pitch;
+
+            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+            {
+                SSE2_CALL( SSE2_YUV422_UYVY_ALIGNED );
+            }
+            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+            {
+                C_YUV422_UYVY( p_line, p_y, p_u, p_v );
+            }
+            p_y += i_source_margin;
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+            p_line += i_dest_margin;
+        }
+    }
+    else {
+        /* use slower SSE2 unaligned fetch and store */
+        for( i_y = p_vout->render.i_height ; i_y-- ; )
+        {
+            p_line -= 2 * p_dest->p->i_pitch;
+
+            for( i_x = p_vout->render.i_width / 16 ; i_x-- ; )
+            {
+                SSE2_CALL( SSE2_YUV422_UYVY_UNALIGNED );
+            }
+            for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; )
+            {
+                C_YUV422_UYVY( p_line, p_y, p_u, p_v );
+            }
+            p_y += i_source_margin;
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+            p_line += i_dest_margin;
+        }
+    }
+    SSE2_END;
+
+#else
+
     for( i_y = p_vout->render.i_height ; i_y-- ; )
     {
         for( i_x = p_vout->render.i_width / 8 ; i_x-- ; )
@@ -457,12 +512,18 @@ static void I422_cyuv( vout_thread_t *p_vout, picture_t *p_source,
             MMX_CALL( MMX_YUV422_UYVY );
 #endif
         }
+        p_y += i_source_margin;
+        p_u += i_source_margin_c;
+        p_v += i_source_margin_c;
+        p_line += i_dest_margin;
     }
 #if defined (MODULE_NAME_IS_i422_yuy2_mmx)
     MMX_END;
 #elif defined (MODULE_NAME_IS_i422_yuy2_sse2)
     SSE2_END;
 #endif
+
+#endif
 }
 
 /*****************************************************************************
diff --git a/modules/video_chroma/i422_yuy2.h b/modules/video_chroma/i422_yuy2.h
index 1b0405bb27..68057c7c04 100644
--- a/modules/video_chroma/i422_yuy2.h
+++ b/modules/video_chroma/i422_yuy2.h
@@ -233,9 +233,82 @@ movdqu %%xmm1, 16(%0) # Store high UYVY \n\
 
 #include <emmintrin.h>
 
+#define SSE2_CALL(SSE2_INSTRUCTIONS) \
+    do { \
+        __m128i xmm0, xmm1, xmm2; \
+        SSE2_INSTRUCTIONS \
+        p_line += 32; p_y += 16; \
+        p_u += 8; p_v += 8; \
+    } while(0)
 
 #define SSE2_END _mm_sfence()
 
+#define SSE2_YUV422_YUYV_ALIGNED \
+    xmm0 = _mm_load_si128((__m128i *)p_y); \
+    xmm1 = _mm_loadl_epi64((__m128i *)p_u); \
+    xmm2 = _mm_loadl_epi64((__m128i *)p_v); \
+    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
+    xmm2 = xmm0; \
+    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1); \
+    _mm_stream_si128((__m128i*)(p_line), xmm2); \
+    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1); \
+    _mm_stream_si128((__m128i*)(p_line+16), xmm0);
+
+#define SSE2_YUV422_YUYV_UNALIGNED \
+    xmm0 = _mm_loadu_si128((__m128i *)p_y); \
+    xmm1 = _mm_loadl_epi64((__m128i *)p_u); \
+    xmm2 = _mm_loadl_epi64((__m128i *)p_v); \
+    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
+    xmm2 = xmm0; \
+    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1); \
+    _mm_storeu_si128((__m128i*)(p_line), xmm2); \
+    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1); \
+    _mm_storeu_si128((__m128i*)(p_line+16), xmm0);
+
+#define SSE2_YUV422_YVYU_ALIGNED \
+    xmm0 = _mm_load_si128((__m128i *)p_y); \
+    xmm2 = _mm_loadl_epi64((__m128i *)p_u); \
+    xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
+    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
+    xmm2 = xmm0; \
+    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1); \
+    _mm_stream_si128((__m128i*)(p_line), xmm2); \
+    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1); \
+    _mm_stream_si128((__m128i*)(p_line+16), xmm0);
+
+#define SSE2_YUV422_YVYU_UNALIGNED \
+    xmm0 = _mm_loadu_si128((__m128i *)p_y); \
+    xmm2 = _mm_loadl_epi64((__m128i *)p_u); \
+    xmm1 = _mm_loadl_epi64((__m128i *)p_v); \
+    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
+    xmm2 = xmm0; \
+    xmm2 = _mm_unpacklo_epi8(xmm2, xmm1); \
+    _mm_storeu_si128((__m128i*)(p_line), xmm2); \
+    xmm0 = _mm_unpackhi_epi8(xmm0, xmm1); \
+    _mm_storeu_si128((__m128i*)(p_line+16), xmm0);
+
+#define SSE2_YUV422_UYVY_ALIGNED \
+    xmm0 = _mm_load_si128((__m128i *)p_y); \
+    xmm1 = _mm_loadl_epi64((__m128i *)p_u); \
+    xmm2 = _mm_loadl_epi64((__m128i *)p_v); \
+    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
+    xmm2 = xmm1; \
+    xmm2 = _mm_unpacklo_epi8(xmm2, xmm0); \
+    _mm_stream_si128((__m128i*)(p_line), xmm2); \
+    xmm1 = _mm_unpackhi_epi8(xmm1, xmm0); \
+    _mm_stream_si128((__m128i*)(p_line+16), xmm1);
+
+#define SSE2_YUV422_UYVY_UNALIGNED \
+    xmm0 = _mm_loadu_si128((__m128i *)p_y); \
+    xmm1 = _mm_loadl_epi64((__m128i *)p_u); \
+    xmm2 = _mm_loadl_epi64((__m128i *)p_v); \
+    xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \
+    xmm2 = xmm1; \
+    xmm2 = _mm_unpacklo_epi8(xmm2, xmm0); \
+    _mm_storeu_si128((__m128i*)(p_line), xmm2); \
+    xmm1 = _mm_unpackhi_epi8(xmm1, xmm0); \
+    _mm_storeu_si128((__m128i*)(p_line+16), xmm1);
+
 #endif
 
 #endif
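
For readers who want to see the intrinsics above in context, the following self-contained sketch packs one row of planar 4:2:2 YUV into YUYV the same way the SSE2_YUV422_YUYV_* macros and the I422 converter loops do: eight U and eight V bytes are fetched with MOVQ, interleaved against sixteen Y bytes with the unpack instructions, and written out with streaming stores on the aligned path or unaligned stores otherwise, with an sfence at the end (the patch's SSE2_END). This is not code from the commit; the function name pack_row_yuyv_sse2, the i_width parameter and the uintptr_t-based alignment test are illustrative stand-ins for the patch's per-module macros and its (int)-cast pitch/pointer check.

/* Illustrative sketch only -- not part of the commit. */
#include <emmintrin.h>
#include <stdint.h>

static void pack_row_yuyv_sse2( uint8_t *p_line, const uint8_t *p_y,
                                const uint8_t *p_u, const uint8_t *p_v,
                                int i_width )
{
    /* Same idea as the patch's 16-byte alignment test on pitches and
     * pointers (the patch casts to int; uintptr_t is used here). */
    const int b_aligned = 0 == ( 15 & ( (uintptr_t)p_line | (uintptr_t)p_y ) );
    int i_x;

    for( i_x = i_width / 16 ; i_x-- ; )   /* 16 luma samples per iteration */
    {
        __m128i xmm1 = _mm_loadl_epi64( (const __m128i *)p_u );  /* 8 U bytes */
        __m128i xmm2 = _mm_loadl_epi64( (const __m128i *)p_v );  /* 8 V bytes */
        __m128i xmm0 = b_aligned ? _mm_load_si128( (const __m128i *)p_y )
                                 : _mm_loadu_si128( (const __m128i *)p_y );

        xmm1 = _mm_unpacklo_epi8( xmm1, xmm2 );   /* U0 V0 U1 V1 ...           */
        xmm2 = _mm_unpacklo_epi8( xmm0, xmm1 );   /* Y0 U0 Y1 V0 ... (low 8 Y)  */
        xmm0 = _mm_unpackhi_epi8( xmm0, xmm1 );   /* Y8 U4 Y9 V4 ... (high 8 Y) */

        if( b_aligned )
        {
            /* non-temporal stores, as in the *_ALIGNED macros */
            _mm_stream_si128( (__m128i *)p_line, xmm2 );
            _mm_stream_si128( (__m128i *)(p_line + 16), xmm0 );
        }
        else
        {
            _mm_storeu_si128( (__m128i *)p_line, xmm2 );
            _mm_storeu_si128( (__m128i *)(p_line + 16), xmm0 );
        }
        p_line += 32; p_y += 16; p_u += 8; p_v += 8;  /* same strides as SSE2_CALL */
    }

    /* Plain C tail for the leftover pixel pairs, in the spirit of the
     * patch's C_YUV422_* fallback macros. */
    for( i_x = ( i_width % 16 ) / 2 ; i_x-- ; )
    {
        *p_line++ = *p_y++; *p_line++ = *p_u++;
        *p_line++ = *p_y++; *p_line++ = *p_v++;
    }

    _mm_sfence();   /* SSE2_END: make the streaming stores globally visible */
}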