From: Damien Fouilleul Date: Wed, 1 Aug 2007 18:29:17 +0000 (+0000) Subject: video_chroma: added I420_ABGR32 support (mostly for opengl), some clean up as well X-Git-Tag: 0.9.0-test0~6694 X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=a3883709fd734bf346753e0b1866188477854381;p=vlc video_chroma: added I420_ABGR32 support (mostly for opengl), some clean up as well --- diff --git a/modules/video_chroma/i420_rgb.c b/modules/video_chroma/i420_rgb.c index 068c84504d..ca772275e2 100644 --- a/modules/video_chroma/i420_rgb.c +++ b/modules/video_chroma/i420_rgb.c @@ -155,6 +155,15 @@ static int Activate( vlc_object_t *p_this ) msg_Dbg(p_this, "RGB pixel format is A8R8G8B8"); p_vout->chroma.pf_convert = E_(I420_A8R8G8B8); } + else if( p_vout->output.i_rmask == 0xff000000 + && p_vout->output.i_gmask == 0x00ff0000 + && p_vout->output.i_bmask == 0x0000ff00 ) + { + /* R8G8B8A8 pixel format */ + msg_Dbg(p_this, "RGB pixel format is R8G8B8A8"); + //p_vout->chroma.pf_convert = E_(I420_B8G8R8A8); + return -1; + } else if( p_vout->output.i_rmask == 0x0000ff00 && p_vout->output.i_gmask == 0x00ff0000 && p_vout->output.i_bmask == 0xff000000 ) @@ -163,10 +172,18 @@ static int Activate( vlc_object_t *p_this ) msg_Dbg(p_this, "RGB pixel format is B8G8R8A8"); p_vout->chroma.pf_convert = E_(I420_B8G8R8A8); } + else if( p_vout->output.i_rmask == 0x000000ff + && p_vout->output.i_gmask == 0x0000ff00 + && p_vout->output.i_bmask == 0x00ff0000 ) + { + /* A8B8G8R8 pixel format */ + msg_Dbg(p_this, "RGB pixel format is A8B8G8R8"); + p_vout->chroma.pf_convert = E_(I420_A8B8G8R8); + } else return -1; #else - // generic C chroma converter */ + /* generic C chroma converter */ p_vout->chroma.pf_convert = E_(I420_RGB32); #endif break; diff --git a/modules/video_chroma/i420_rgb.h b/modules/video_chroma/i420_rgb.h index 15fadf4ced..1ba1c0de6f 100644 --- a/modules/video_chroma/i420_rgb.h +++ b/modules/video_chroma/i420_rgb.h @@ -65,6 +65,7 @@ void E_(I420_R5G5B5) ( vout_thread_t *, picture_t *, picture_t * ); void E_(I420_R5G6B5) ( vout_thread_t *, picture_t *, picture_t * ); void E_(I420_A8R8G8B8) ( vout_thread_t *, picture_t *, picture_t * ); void E_(I420_B8G8R8A8) ( vout_thread_t *, picture_t *, picture_t * ); +void E_(I420_A8B8G8R8) ( vout_thread_t *, picture_t *, picture_t * ); #endif /***************************************************************************** diff --git a/modules/video_chroma/i420_rgb16.c b/modules/video_chroma/i420_rgb16.c index f9fc4fb02c..fc622de24f 100644 --- a/modules/video_chroma/i420_rgb16.c +++ b/modules/video_chroma/i420_rgb16.c @@ -35,14 +35,8 @@ #if defined (MODULE_NAME_IS_i420_rgb) # include "i420_rgb_c.h" #elif defined (MODULE_NAME_IS_i420_rgb_mmx) -# if defined(HAVE_MMX_INTRINSICS) -# include -# endif # include "i420_rgb_mmx.h" #elif defined (MODULE_NAME_IS_i420_rgb_sse2) -# if defined(HAVE_SSE2_INTRINSICS) -# include -# endif # include "i420_rgb_mmx.h" #endif @@ -309,7 +303,7 @@ void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src, } } -#else // defined (MODULE_NAME_IS_i420_rgb_mmx) +#else // ! 
defined (MODULE_NAME_IS_i420_rgb) void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src, picture_t *p_dest ) @@ -388,20 +382,12 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src, for ( i_x = p_vout->render.i_width/16; i_x--; ) { -#if defined (CAN_COMPILE_SSE2) - __asm__( ".p2align 3" - SSE2_INIT_16_ALIGNED - SSE2_YUV_MUL - SSE2_YUV_ADD - SSE2_UNPACK_15_ALIGNED - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); -#else - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - SSE2_INTRINSICS_INIT_16_ALIGNED - SSE2_INTRINSICS_YUV_MUL - SSE2_INTRINSICS_YUV_ADD - SSE2_INTRINSICS_UNPACK_15_ALIGNED -#endif + SSE2_CALL ( + SSE2_INIT_16_ALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_15_ALIGNED + ); p_y += 16; p_u += 8; p_v += 8; @@ -416,23 +402,12 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src, p_v -= i_rewind >> 1; p_buffer -= i_rewind; -#if defined (CAN_COMPILE_SSE2) - __asm__( ".p2align 3" - SSE2_INIT_16_UNALIGNED - SSE2_YUV_MUL - SSE2_YUV_ADD - SSE2_UNPACK_15_UNALIGNED - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); -#else - { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - SSE2_INTRINSICS_INIT_16_UNALIGNED - SSE2_INTRINSICS_YUV_MUL - SSE2_INTRINSICS_YUV_ADD - SSE2_INTRINSICS_UNPACK_15_UNALIGNED - } -#endif + SSE2_CALL ( + SSE2_INIT_16_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_15_UNALIGNED + ); p_y += 16; p_u += 8; p_v += 8; @@ -459,20 +434,12 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src, for ( i_x = p_vout->render.i_width/16; i_x--; ) { -#if defined (CAN_COMPILE_SSE2) - __asm__( ".p2align 3" - SSE2_INIT_16_UNALIGNED - SSE2_YUV_MUL - SSE2_YUV_ADD - SSE2_UNPACK_15_UNALIGNED - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); -#else - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - SSE2_INTRINSICS_INIT_16_UNALIGNED - SSE2_INTRINSICS_YUV_MUL - SSE2_INTRINSICS_YUV_ADD - SSE2_INTRINSICS_UNPACK_15_UNALIGNED -#endif + SSE2_CALL ( + SSE2_INIT_16_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_15_UNALIGNED + ); p_y += 16; p_u += 8; p_v += 8; @@ -487,23 +454,12 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src, p_v -= i_rewind >> 1; p_buffer -= i_rewind; -#if defined (CAN_COMPILE_SSE2) - __asm__( ".p2align 3" - SSE2_INIT_16_UNALIGNED - SSE2_YUV_MUL - SSE2_YUV_ADD - SSE2_UNPACK_15_UNALIGNED - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); -#else - { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - SSE2_INTRINSICS_INIT_16_UNALIGNED - SSE2_INTRINSICS_YUV_MUL - SSE2_INTRINSICS_YUV_ADD - SSE2_INTRINSICS_UNPACK_15_UNALIGNED - } -#endif + SSE2_CALL ( + SSE2_INIT_16_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_15_UNALIGNED + ); p_y += 16; p_u += 8; p_v += 8; @@ -522,11 +478,7 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src, } /* make sure all SSE2 stores are visible thereafter */ -#if defined (CAN_COMPILE_SSE2) - __asm__ __volatile__ ( "sfence" ::: "memory" ); -#else - _mm_sfence(); -#endif + SSE2_END; #else // defined (MODULE_NAME_IS_i420_rgb_mmx) @@ -546,22 +498,12 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src, for ( i_x = p_vout->render.i_width / 8; i_x--; ) { -#if defined (CAN_COMPILE_MMX) - __asm__( ".p2align 3" - MMX_INIT_16 - MMX_YUV_MUL - MMX_YUV_ADD - MMX_UNPACK_15 - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); -#else - __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; - uint64_t tmp64; - MMX_INTRINSICS_INIT_16 - MMX_INTRINSICS_YUV_MUL - 
MMX_INTRINSICS_YUV_ADD - MMX_INTRINSICS_UNPACK_15 -#endif - + MMX_CALL ( + MMX_INIT_16 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_15 + ); p_y += 8; p_u += 4; p_v += 4; @@ -577,24 +519,12 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src, p_v -= i_rewind >> 1; p_buffer -= i_rewind; -#if defined (CAN_COMPILE_MMX) - __asm__( ".p2align 3" - MMX_INIT_16 - MMX_YUV_MUL - MMX_YUV_ADD - MMX_UNPACK_15 - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); -#else - { - __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; - uint64_t tmp64; - - MMX_INTRINSICS_INIT_16 - MMX_INTRINSICS_YUV_MUL - MMX_INTRINSICS_YUV_ADD - MMX_INTRINSICS_UNPACK_15 - } -#endif + MMX_CALL ( + MMX_INIT_16 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_15 + ); p_y += 8; p_u += 4; p_v += 4; @@ -611,11 +541,7 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src, } } /* re-enable FPU registers */ -#if defined (CAN_COMPILE_MMX) - __asm__ __volatile__ ( "emms" ); -#else - _mm_empty(); -#endif + MMX_END; #endif } @@ -697,20 +623,12 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src, for ( i_x = p_vout->render.i_width/16; i_x--; ) { -#if defined (CAN_COMPILE_SSE2) - __asm__( ".p2align 3" - SSE2_INIT_16_ALIGNED - SSE2_YUV_MUL - SSE2_YUV_ADD - SSE2_UNPACK_16_ALIGNED - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); -#else - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - SSE2_INTRINSICS_INIT_16_ALIGNED - SSE2_INTRINSICS_YUV_MUL - SSE2_INTRINSICS_YUV_ADD - SSE2_INTRINSICS_UNPACK_16_ALIGNED -#endif + SSE2_CALL ( + SSE2_INIT_16_ALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_16_ALIGNED + ); p_y += 16; p_u += 8; p_v += 8; @@ -725,23 +643,12 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src, p_v -= i_rewind >> 1; p_buffer -= i_rewind; -#if defined (CAN_COMPILE_SSE2) - __asm__( ".p2align 3" - SSE2_INIT_16_UNALIGNED - SSE2_YUV_MUL - SSE2_YUV_ADD - SSE2_UNPACK_16_UNALIGNED - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); -#else - { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - SSE2_INTRINSICS_INIT_16_UNALIGNED - SSE2_INTRINSICS_YUV_MUL - SSE2_INTRINSICS_YUV_ADD - SSE2_INTRINSICS_UNPACK_16_UNALIGNED - } -#endif + SSE2_CALL ( + SSE2_INIT_16_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_16_UNALIGNED + ); p_y += 16; p_u += 8; p_v += 8; @@ -768,20 +675,12 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src, for ( i_x = p_vout->render.i_width/16; i_x--; ) { -#if defined (CAN_COMPILE_SSE2) - __asm__( ".p2align 3" - SSE2_INIT_16_UNALIGNED - SSE2_YUV_MUL - SSE2_YUV_ADD - SSE2_UNPACK_16_UNALIGNED - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); -#else - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - SSE2_INTRINSICS_INIT_16_UNALIGNED - SSE2_INTRINSICS_YUV_MUL - SSE2_INTRINSICS_YUV_ADD - SSE2_INTRINSICS_UNPACK_16_UNALIGNED -#endif + SSE2_CALL( + SSE2_INIT_16_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_16_UNALIGNED + ); p_y += 16; p_u += 8; p_v += 8; @@ -796,23 +695,12 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src, p_v -= i_rewind >> 1; p_buffer -= i_rewind; -#if defined (CAN_COMPILE_SSE2) - __asm__( ".p2align 3" - SSE2_INIT_16_UNALIGNED - SSE2_YUV_MUL - SSE2_YUV_ADD - SSE2_UNPACK_16_UNALIGNED - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); -#else - { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - SSE2_INTRINSICS_INIT_16_UNALIGNED - SSE2_INTRINSICS_YUV_MUL - SSE2_INTRINSICS_YUV_ADD - SSE2_INTRINSICS_UNPACK_16_UNALIGNED - } -#endif + SSE2_CALL( + 
SSE2_INIT_16_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_16_UNALIGNED + ); p_y += 16; p_u += 8; p_v += 8; @@ -831,11 +719,7 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src, } /* make sure all SSE2 stores are visible thereafter */ -#if defined (CAN_COMPILE_SSE2) - __asm__ __volatile__ ( "sfence" ::: "memory" ); -#else - _mm_sfence(); -#endif + SSE2_END; #else // defined (MODULE_NAME_IS_i420_rgb_mmx) @@ -855,22 +739,12 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src, for ( i_x = p_vout->render.i_width / 8; i_x--; ) { -#if defined (CAN_COMPILE_MMX) - __asm__( ".p2align 3" - MMX_INIT_16 - MMX_YUV_MUL - MMX_YUV_ADD - MMX_UNPACK_16 - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); -#else - __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; - uint64_t tmp64; - MMX_INTRINSICS_INIT_16 - MMX_INTRINSICS_YUV_MUL - MMX_INTRINSICS_YUV_ADD - MMX_INTRINSICS_UNPACK_16 -#endif - + MMX_CALL ( + MMX_INIT_16 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_16 + ); p_y += 8; p_u += 4; p_v += 4; @@ -886,24 +760,12 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src, p_v -= i_rewind >> 1; p_buffer -= i_rewind; -#if defined (CAN_COMPILE_MMX) - __asm__( ".p2align 3" - MMX_INIT_16 - MMX_YUV_MUL - MMX_YUV_ADD - MMX_UNPACK_16 - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); -#else - { - __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; - uint64_t tmp64; - - MMX_INTRINSICS_INIT_16 - MMX_INTRINSICS_YUV_MUL - MMX_INTRINSICS_YUV_ADD - MMX_INTRINSICS_UNPACK_16 - } -#endif + MMX_CALL ( + MMX_INIT_16 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_16 + ); p_y += 8; p_u += 4; p_v += 4; @@ -920,11 +782,7 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src, } } /* re-enable FPU registers */ -#if defined (CAN_COMPILE_MMX) - __asm__ __volatile__ ( "emms" ); -#else - _mm_empty(); -#endif + MMX_END; #endif } @@ -1118,23 +976,12 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src, for ( i_x = p_vout->render.i_width / 16; i_x--; ) { -#if defined (CAN_COMPILE_SSE2) - /* use inline SSE2 assembly */ - __asm__( ".p2align 3" - SSE2_INIT_32_ALIGNED - SSE2_YUV_MUL - SSE2_YUV_ADD - SSE2_UNPACK_32_ARGB_ALIGNED - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); -#else - /* otherwise use SSE2 C intrinsics wrappers */ - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - SSE2_INTRINSICS_INIT_32_ALIGNED - SSE2_INTRINSICS_YUV_MUL - SSE2_INTRINSICS_YUV_ADD - SSE2_INTRINSICS_UNPACK_32_ARGB_ALIGNED -#endif + SSE2_CALL ( + SSE2_INIT_32_ALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_ARGB_ALIGNED + ); p_y += 16; p_u += 8; p_v += 8; @@ -1149,25 +996,12 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src, p_u -= i_rewind >> 1; p_v -= i_rewind >> 1; p_buffer -= i_rewind; -#if defined (CAN_COMPILE_SSE2) - /* use inline SSE2 assembly */ - __asm__( ".p2align 3" - SSE2_INIT_32_UNALIGNED - SSE2_YUV_MUL - SSE2_YUV_ADD - SSE2_UNPACK_32_ARGB_UNALIGNED - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); -#else - /* otherwise use SSE2 intrinsics wrappers */ - { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - SSE2_INTRINSICS_INIT_32_UNALIGNED - SSE2_INTRINSICS_YUV_MUL - SSE2_INTRINSICS_YUV_ADD - SSE2_INTRINSICS_UNPACK_32_ARGB_UNALIGNED - } -#endif + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_ARGB_UNALIGNED + ); p_y += 16; p_u += 4; p_v += 4; @@ -1194,23 +1028,12 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src, for ( i_x = p_vout->render.i_width / 16; i_x--; ) { 
-#if defined (CAN_COMPILE_SSE2) - /* use inline SSE2 assembly */ - __asm__( ".p2align 3" - SSE2_INIT_32_UNALIGNED - SSE2_YUV_MUL - SSE2_YUV_ADD - SSE2_UNPACK_32_ARGB_UNALIGNED - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); -#else - /* otherwise use SSE2 C intrinsics wrappers */ - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - SSE2_INTRINSICS_INIT_32_UNALIGNED - SSE2_INTRINSICS_YUV_MUL - SSE2_INTRINSICS_YUV_ADD - SSE2_INTRINSICS_UNPACK_32_ARGB_UNALIGNED -#endif + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_ARGB_UNALIGNED + ); p_y += 16; p_u += 8; p_v += 8; @@ -1225,25 +1048,12 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src, p_u -= i_rewind >> 1; p_v -= i_rewind >> 1; p_buffer -= i_rewind; -#if defined (CAN_COMPILE_SSE2) - /* use inline SSE2 assembly */ - __asm__( ".p2align 3" - SSE2_INIT_32_UNALIGNED - SSE2_YUV_MUL - SSE2_YUV_ADD - SSE2_UNPACK_32_ARGB_UNALIGNED - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); -#else - /* otherwise use SSE2 intrinsics wrappers */ - { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - SSE2_INTRINSICS_INIT_32_UNALIGNED - SSE2_INTRINSICS_YUV_MUL - SSE2_INTRINSICS_YUV_ADD - SSE2_INTRINSICS_UNPACK_32_ARGB_UNALIGNED - } -#endif + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_ARGB_UNALIGNED + ); p_y += 16; p_u += 8; p_v += 8; @@ -1262,11 +1072,7 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src, } /* make sure all SSE2 stores are visible thereafter */ -#if defined (CAN_COMPILE_SSE2) - __asm__ __volatile__ ( "sfence" ::: "memory" ); -#else - _mm_sfence(); -#endif + SSE2_END; #else // defined (MODULE_NAME_IS_i420_rgb_mmx) @@ -1286,26 +1092,12 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src, for ( i_x = p_vout->render.i_width / 8; i_x--; ) { -#if defined (CAN_COMPILE_MMX) - /* use inline MMX assembly */ - __asm__( MMX_INIT_32 - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); - - __asm__( ".p2align 3" - MMX_YUV_MUL - MMX_YUV_ADD - MMX_UNPACK_32_ARGB - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); -#else - /* otherwise use MMX C intrinsics wrappers */ - __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; - uint64_t tmp64; - - MMX_INTRINSICS_INIT_32 - MMX_INTRINSICS_YUV_MUL - MMX_INTRINSICS_YUV_ADD - MMX_INTRINSICS_UNPACK_32_ARGB -#endif + MMX_CALL ( + MMX_INIT_32 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_32_ARGB + ); p_y += 8; p_u += 4; p_v += 4; @@ -1320,26 +1112,12 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src, p_u -= i_rewind >> 1; p_v -= i_rewind >> 1; p_buffer -= i_rewind; -#if defined (CAN_COMPILE_MMX) - /* use inline MMX assembly */ - __asm__( ".p2align 3" - MMX_INIT_32 - MMX_YUV_MUL - MMX_YUV_ADD - MMX_UNPACK_32_ARGB - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); -#else - /* otherwise use MMX intrinsics wrappers */ - { - __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; - uint64_t tmp64; - - MMX_INTRINSICS_INIT_32 - MMX_INTRINSICS_YUV_MUL - MMX_INTRINSICS_YUV_ADD - MMX_INTRINSICS_UNPACK_32_ARGB - } -#endif + MMX_CALL ( + MMX_INIT_32 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_32_ARGB + ); p_y += 8; p_u += 4; p_v += 4; @@ -1355,12 +1133,9 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src, p_v += i_source_margin_c; } } + /* re-enable FPU registers */ -#if defined (CAN_COMPILE_MMX) - __asm__ __volatile__ ( "emms" ); -#else - _mm_empty(); -#endif + MMX_END; #endif } @@ -1440,23 +1215,12 @@ void E_(I420_B8G8R8A8)( vout_thread_t 
*p_vout, picture_t *p_src, for ( i_x = p_vout->render.i_width / 16; i_x--; ) { -#if defined (CAN_COMPILE_SSE2) - /* use inline SSE2 assembly */ - __asm__( ".p2align 3" - SSE2_INIT_32_ALIGNED - SSE2_YUV_MUL - SSE2_YUV_ADD - SSE2_UNPACK_32_BGRA_ALIGNED - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); -#else - /* otherwise use SSE2 C intrinsics wrappers */ - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - SSE2_INTRINSICS_INIT_32_ALIGNED - SSE2_INTRINSICS_YUV_MUL - SSE2_INTRINSICS_YUV_ADD - SSE2_INTRINSICS_UNPACK_32_BGRA_ALIGNED -#endif + SSE2_CALL ( + SSE2_INIT_32_ALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_BGRA_ALIGNED + ); p_y += 16; p_u += 8; p_v += 8; @@ -1471,25 +1235,12 @@ void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src, p_u -= i_rewind >> 1; p_v -= i_rewind >> 1; p_buffer -= i_rewind; -#if defined (CAN_COMPILE_SSE2) - /* use inline SSE2 assembly */ - __asm__( ".p2align 3" - SSE2_INIT_32_UNALIGNED - SSE2_YUV_MUL - SSE2_YUV_ADD - SSE2_UNPACK_32_BGRA_UNALIGNED - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); -#else - /* otherwise use SSE2 intrinsics wrappers */ - { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - SSE2_INTRINSICS_INIT_32_UNALIGNED - SSE2_INTRINSICS_YUV_MUL - SSE2_INTRINSICS_YUV_ADD - SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED - } -#endif + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_BGRA_UNALIGNED + ); p_y += 16; p_u += 4; p_v += 4; @@ -1516,23 +1267,12 @@ void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src, for ( i_x = p_vout->render.i_width / 16; i_x--; ) { -#if defined (CAN_COMPILE_SSE2) - /* use inline SSE2 assembly */ - __asm__( ".p2align 3" - SSE2_INIT_32_UNALIGNED - SSE2_YUV_MUL - SSE2_YUV_ADD - SSE2_UNPACK_32_BGRA_UNALIGNED - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); -#else - /* otherwise use SSE2 C intrinsics wrappers */ - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - SSE2_INTRINSICS_INIT_32_UNALIGNED - SSE2_INTRINSICS_YUV_MUL - SSE2_INTRINSICS_YUV_ADD - SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED -#endif + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_BGRA_UNALIGNED + ); p_y += 16; p_u += 8; p_v += 8; @@ -1547,25 +1287,12 @@ void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src, p_u -= i_rewind >> 1; p_v -= i_rewind >> 1; p_buffer -= i_rewind; -#if defined (CAN_COMPILE_SSE2) - /* use inline SSE2 assembly */ - __asm__( ".p2align 3" - SSE2_INIT_32_UNALIGNED - SSE2_YUV_MUL - SSE2_YUV_ADD - SSE2_UNPACK_32_BGRA_UNALIGNED - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); -#else - /* otherwise use SSE2 intrinsics wrappers */ - { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - SSE2_INTRINSICS_INIT_32_UNALIGNED - SSE2_INTRINSICS_YUV_MUL - SSE2_INTRINSICS_YUV_ADD - SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED - } -#endif + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_BGRA_UNALIGNED + ); p_y += 16; p_u += 8; p_v += 8; @@ -1601,26 +1328,12 @@ void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src, for ( i_x = p_vout->render.i_width / 8; i_x--; ) { -#if defined (CAN_COMPILE_MMX) - /* use inline MMX assembly */ - __asm__( MMX_INIT_32 - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); - - __asm__( ".p2align 3" - MMX_YUV_MUL - MMX_YUV_ADD - MMX_UNPACK_32_ARGB - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); -#else - /* otherwise use MMX C intrinsics wrappers */ - __m64 mm0, mm1, mm2, 
mm3, mm4, mm5, mm6, mm7; - uint64_t tmp64; - - MMX_INTRINSICS_INIT_32 - MMX_INTRINSICS_YUV_MUL - MMX_INTRINSICS_YUV_ADD - MMX_INTRINSICS_UNPACK_32_BGRA -#endif + MMX_CALL ( + MMX_INIT_32 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_32_BGRA + ); p_y += 8; p_u += 4; p_v += 4; @@ -1635,26 +1348,248 @@ void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src, p_u -= i_rewind >> 1; p_v -= i_rewind >> 1; p_buffer -= i_rewind; -#if defined (CAN_COMPILE_MMX) - /* use inline MMX assembly */ - __asm__( ".p2align 3" - MMX_INIT_32 - MMX_YUV_MUL - MMX_YUV_ADD - MMX_UNPACK_32_BGRA - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); -#else - /* otherwise use MMX intrinsics wrappers */ + MMX_CALL ( + MMX_INIT_32 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_32_BGRA + ); + p_y += 8; + p_u += 4; + p_v += 4; + p_buffer += 8; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 4 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + } + + /* re-enable FPU registers */ + MMX_END; + +#endif +} + +void E_(I420_A8B8G8R8)( vout_thread_t *p_vout, picture_t *p_src, + picture_t *p_dest ) +{ + /* We got this one from the old arguments */ + uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels; + uint8_t *p_y = p_src->Y_PIXELS; + uint8_t *p_u = p_src->U_PIXELS; + uint8_t *p_v = p_src->V_PIXELS; + + vlc_bool_t b_hscale; /* horizontal scaling type */ + unsigned int i_vscale; /* vertical scaling type */ + unsigned int i_x, i_y; /* horizontal and vertical indexes */ + + int i_right_margin; + int i_rewind; + int i_scale_count; /* scale modulo counter */ + int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */ + uint32_t * p_pic_start; /* beginning of the current line for copy */ + /* Conversion buffer pointer */ + uint32_t * p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer; + uint32_t * p_buffer; + + /* Offset array pointer */ + int * p_offset_start = p_vout->chroma.p_sys->p_offset; + int * p_offset; + + const int i_source_margin = p_src->p[0].i_pitch + - p_src->p[0].i_visible_pitch; + const int i_source_margin_c = p_src->p[1].i_pitch + - p_src->p[1].i_visible_pitch; + + i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; + + /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered + * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1' + * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */ + SetOffset( p_vout->render.i_width, p_vout->render.i_height, + p_vout->output.i_width, p_vout->output.i_height, + &b_hscale, &i_vscale, p_offset_start ); + + /* + * Perform conversion + */ + i_scale_count = ( i_vscale == 1 ) ? + p_vout->output.i_height : p_vout->render.i_height; + +#if defined (MODULE_NAME_IS_i420_rgb_sse2) + + if( p_vout->render.i_width & 15 ) + { + i_rewind = 16 - ( p_vout->render.i_width & 15 ); + } + else + { + i_rewind = 0; + } + + /* + ** SSE2 128 bits fetch/store instructions are faster + ** if memory access is 16 bytes aligned + */ + + p_buffer = b_hscale ? 
p_buffer_start : p_pic; + if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch| + p_dest->p->i_pitch| + ((int)p_y)| + ((int)p_buffer))) ) + { + /* use faster SSE2 aligned fetch and store */ + for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + { + p_pic_start = p_pic; + + for ( i_x = p_vout->render.i_width / 16; i_x--; ) { - __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; - uint64_t tmp64; + SSE2_CALL ( + SSE2_INIT_32_ALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_ABGR_ALIGNED + ); + p_y += 16; + p_u += 8; + p_v += 8; + p_buffer += 16; + } - MMX_INTRINSICS_INIT_32 - MMX_INTRINSICS_YUV_MUL - MMX_INTRINSICS_YUV_ADD - MMX_INTRINSICS_UNPACK_32_BGRA + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_ABGR_UNALIGNED + ); + p_y += 16; + p_u += 4; + p_v += 4; } -#endif + SCALE_WIDTH; + SCALE_HEIGHT( 420, 4 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + p_buffer = b_hscale ? p_buffer_start : p_pic; + } + } + else + { + /* use slower SSE2 unaligned fetch and store */ + for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + { + p_pic_start = p_pic; + p_buffer = b_hscale ? p_buffer_start : p_pic; + + for ( i_x = p_vout->render.i_width / 16; i_x--; ) + { + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_ABGR_UNALIGNED + ); + p_y += 16; + p_u += 8; + p_v += 8; + p_buffer += 16; + } + + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_ABGR_UNALIGNED + ); + p_y += 16; + p_u += 8; + p_v += 8; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 4 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + p_buffer = b_hscale ? p_buffer_start : p_pic; + } + } + +#else + + if( p_vout->render.i_width & 7 ) + { + i_rewind = 8 - ( p_vout->render.i_width & 7 ); + } + else + { + i_rewind = 0; + } + + for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + { + p_pic_start = p_pic; + p_buffer = b_hscale ? 
p_buffer_start : p_pic; + + for ( i_x = p_vout->render.i_width / 8; i_x--; ) + { + MMX_CALL ( + MMX_INIT_32 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_32_ABGR + ); + p_y += 8; + p_u += 4; + p_v += 4; + p_buffer += 8; + } + + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + MMX_CALL ( + MMX_INIT_32 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_32_ABGR + ); p_y += 8; p_u += 4; p_v += 4; @@ -1670,12 +1605,9 @@ void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src, p_v += i_source_margin_c; } } + /* re-enable FPU registers */ -#if defined (CAN_COMPILE_MMX) - __asm__ __volatile__ ( "emms" ); -#else - _mm_empty(); -#endif + MMX_END; #endif } diff --git a/modules/video_chroma/i420_rgb_mmx.h b/modules/video_chroma/i420_rgb_mmx.h index 85aa9094ab..3200a1f334 100644 --- a/modules/video_chroma/i420_rgb_mmx.h +++ b/modules/video_chroma/i420_rgb_mmx.h @@ -24,6 +24,8 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA. *****************************************************************************/ +#ifdef MODULE_NAME_IS_i420_rgb_mmx + /* hope these constant values are cache line aligned */ #if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3) #define USED_U64(foo) \ @@ -46,6 +48,22 @@ USED_U64(mmx_mask_f8) = 0xf8f8f8f8f8f8f8f8ULL; USED_U64(mmx_mask_fc) = 0xfcfcfcfcfcfcfcfcULL; #undef USED_U64 +#if defined(CAN_COMPILE_MMX) + +/* MMX assembly */ + +#define MMX_CALL(MMX_INSTRUCTIONS) \ + do { \ + __asm__ __volatile__( \ + ".p2align 3 \n\t" \ + MMX_INSTRUCTIONS \ + : \ + : "r" (p_y), "r" (p_u), \ + "r" (p_v), "r" (p_buffer) ); \ + } while(0) + +#define MMX_END __asm__ __volatile__ ( "emms" ) + /* Use RIP-relative code in PIC mode on amd64 */ #if defined(__x86_64__) && defined(__PIC__) # define G "(%%rip)" @@ -60,42 +78,6 @@ pxor %%mm4, %%mm4 # zero mm4 \n\ movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ " -#define SSE2_INIT_16_ALIGNED " \n\ -movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\ -movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\ -pxor %%xmm4, %%xmm4 # zero mm4 \n\ -movdqa (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ -" - -#define SSE2_INIT_16_UNALIGNED " \n\ -movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\ -movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\ -pxor %%xmm4, %%xmm4 # zero mm4 \n\ -movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ -prefetchnta (%3) # Tell CPU not to cache output RGB data \n\ -" - -#define MMX_INTRINSICS_INIT_16 \ - tmp64 = *(uint32_t *)p_u; \ - mm0 = (__m64)tmp64; \ - tmp64 = *(uint32_t *)p_v; \ - mm1 = (__m64)tmp64; \ - mm4 = _mm_setzero_si64(); \ - mm6 = (__m64)*(uint64_t *)p_y; \ - -#define SSE2_INTRINSICS_INIT_16_ALIGNED \ - xmm0 = _mm_loadl_epi64((__m128i *)p_u); \ - xmm1 = _mm_loadl_epi64((__m128i *)p_v); \ - xmm4 = _mm_setzero_si128(); \ - xmm6 = _mm_load_si128((__m128i *)p_y); \ - -#define SSE2_INTRINSICS_INIT_16_UNALIGNED \ - xmm0 = _mm_loadl_epi64((__m128i *)p_u); \ - xmm1 = _mm_loadl_epi64((__m128i *)p_v); \ - xmm4 = _mm_setzero_si128(); \ - xmm6 = _mm_loadu_si128((__m128i *)p_y); \ - _mm_prefetch(p_buffer, _MM_HINT_NTA); \ - #define MMX_INIT_16_GRAY " \n\ movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ #movl $0, (%3) # cache preload for image \n\ @@ -109,43 +91,6 @@ pxor %%mm4, %%mm4 # zero mm4 \n\ movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ " -#define SSE2_INIT_32_ALIGNED " 
\n\ -movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\ -movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\ -pxor %%xmm4, %%xmm4 # zero mm4 \n\ -movdqa (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ -" - -#define SSE2_INIT_32_UNALIGNED " \n\ -movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\ -movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\ -pxor %%xmm4, %%xmm4 # zero mm4 \n\ -movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ -prefetchnta (%3) # Tell CPU not to cache output RGB data \n\ -" - -#define MMX_INTRINSICS_INIT_32 \ - tmp64 = *(uint32_t *)p_u; \ - mm0 = (__m64)tmp64; \ - *(uint16_t *)p_buffer = 0; \ - tmp64 = *(uint32_t *)p_v; \ - mm1 = (__m64)tmp64; \ - mm4 = _mm_setzero_si64(); \ - mm6 = (__m64)*(uint64_t *)p_y; - -#define SSE2_INTRINSICS_INIT_32_ALIGNED \ - xmm0 = _mm_loadl_epi64((__m128i *)p_u); \ - xmm1 = _mm_loadl_epi64((__m128i *)p_v); \ - xmm4 = _mm_setzero_si128(); \ - xmm6 = _mm_load_si128((__m128i *)p_y); \ - -#define SSE2_INTRINSICS_INIT_32_UNALIGNED \ - xmm0 = _mm_loadl_epi64((__m128i *)p_u); \ - xmm1 = _mm_loadl_epi64((__m128i *)p_v); \ - xmm4 = _mm_setzero_si128(); \ - xmm6 = _mm_loadu_si128((__m128i *)p_y); \ - _mm_prefetch(p_buffer, _MM_HINT_NTA); \ - /* * Do the multiply part of the conversion for even and odd pixels, * register usage: @@ -181,113 +126,6 @@ pmulhw mmx_Y_coeff"G", %%mm6 # Mul 4 Y even 00 y6 00 y4 00 y2 00 y0 \n\ pmulhw mmx_Y_coeff"G", %%mm7 # Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1 \n\ " -#define SSE2_YUV_MUL " \n\ -# convert the chroma part \n\ -punpcklbw %%xmm4, %%xmm0 # scatter 8 Cb 00 u3 00 u2 00 u1 00 u0 \n\ -punpcklbw %%xmm4, %%xmm1 # scatter 8 Cr 00 v3 00 v2 00 v1 00 v0 \n\ -movl $0x00800080, %%eax # \n\ -movd %%eax, %%xmm5 # \n\ -pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to 0080 0080 ... 0080 0080 \n\ -psubsw %%xmm5, %%xmm0 # Cb -= 128 \n\ -psubsw %%xmm5, %%xmm1 # Cr -= 128 \n\ -psllw $3, %%xmm0 # Promote precision \n\ -psllw $3, %%xmm1 # Promote precision \n\ -movdqa %%xmm0, %%xmm2 # Copy 8 Cb 00 u3 00 u2 00 u1 00 u0 \n\ -movdqa %%xmm1, %%xmm3 # Copy 8 Cr 00 v3 00 v2 00 v1 00 v0 \n\ -movl $0xf37df37d, %%eax # \n\ -movd %%eax, %%xmm5 # \n\ -pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to f37d f37d ... f37d f37d \n\ -pmulhw %%xmm5, %%xmm2 # Mul Cb with green coeff -> Cb green \n\ -movl $0xe5fce5fc, %%eax # \n\ -movd %%eax, %%xmm5 # \n\ -pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to e5fc e5fc ... e5fc e5fc \n\ -pmulhw %%xmm5, %%xmm3 # Mul Cr with green coeff -> Cr green \n\ -movl $0x40934093, %%eax # \n\ -movd %%eax, %%xmm5 # \n\ -pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to 4093 4093 ... 4093 4093 \n\ -pmulhw %%xmm5, %%xmm0 # Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 \n\ -movl $0x33123312, %%eax # \n\ -movd %%eax, %%xmm5 # \n\ -pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to 3312 3312 ... 3312 3312 \n\ -pmulhw %%xmm5, %%xmm1 # Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 \n\ -paddsw %%xmm3, %%xmm2 # Cb green + Cr green -> Cgreen \n\ - \n\ -# convert the luma part \n\ -movl $0x10101010, %%eax # \n\ -movd %%eax, %%xmm5 # \n\ -pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to 1010 1010 ... 1010 1010 \n\ -psubusb %%xmm5, %%xmm6 # Y -= 16 \n\ -movdqa %%xmm6, %%xmm7 # Copy 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ -movl $0x00ff00ff, %%eax # \n\ -movd %%eax, %%xmm5 # \n\ -pshufd $0, %%xmm5, %%xmm5 # set xmm5 to 00ff 00ff ... 
00ff 00ff \n\ -pand %%xmm5, %%xmm6 # get Y even 00 Y6 00 Y4 00 Y2 00 Y0 \n\ -psrlw $8, %%xmm7 # get Y odd 00 Y7 00 Y5 00 Y3 00 Y1 \n\ -psllw $3, %%xmm6 # Promote precision \n\ -psllw $3, %%xmm7 # Promote precision \n\ -movl $0x253f253f, %%eax # \n\ -movd %%eax, %%xmm5 # \n\ -pshufd $0, %%xmm5, %%xmm5 # set xmm5 to 253f 253f ... 253f 253f \n\ -pmulhw %%xmm5, %%xmm6 # Mul 8 Y even 00 y6 00 y4 00 y2 00 y0 \n\ -pmulhw %%xmm5, %%xmm7 # Mul 8 Y odd 00 y7 00 y5 00 y3 00 y1 \n\ -" - -#define MMX_INTRINSICS_YUV_MUL \ - mm0 = _mm_unpacklo_pi8(mm0, mm4); \ - mm1 = _mm_unpacklo_pi8(mm1, mm4); \ - mm0 = _mm_subs_pi16(mm0, (__m64)mmx_80w); \ - mm1 = _mm_subs_pi16(mm1, (__m64)mmx_80w); \ - mm0 = _mm_slli_pi16(mm0, 3); \ - mm1 = _mm_slli_pi16(mm1, 3); \ - mm2 = mm0; \ - mm3 = mm1; \ - mm2 = _mm_mulhi_pi16(mm2, (__m64)mmx_U_green); \ - mm3 = _mm_mulhi_pi16(mm3, (__m64)mmx_V_green); \ - mm0 = _mm_mulhi_pi16(mm0, (__m64)mmx_U_blue); \ - mm1 = _mm_mulhi_pi16(mm1, (__m64)mmx_V_red); \ - mm2 = _mm_adds_pi16(mm2, mm3); \ - \ - mm6 = _mm_subs_pu8(mm6, (__m64)mmx_10w); \ - mm7 = mm6; \ - mm6 = _mm_and_si64(mm6, (__m64)mmx_00ffw); \ - mm7 = _mm_srli_pi16(mm7, 8); \ - mm6 = _mm_slli_pi16(mm6, 3); \ - mm7 = _mm_slli_pi16(mm7, 3); \ - mm6 = _mm_mulhi_pi16(mm6, (__m64)mmx_Y_coeff); \ - mm7 = _mm_mulhi_pi16(mm7, (__m64)mmx_Y_coeff); - -#define SSE2_INTRINSICS_YUV_MUL \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm4); \ - xmm1 = _mm_unpacklo_epi8(xmm1, xmm4); \ - xmm5 = _mm_set1_epi32(0x00800080UL); \ - xmm0 = _mm_subs_epi16(xmm0, xmm5); \ - xmm1 = _mm_subs_epi16(xmm1, xmm5); \ - xmm0 = _mm_slli_epi16(xmm0, 3); \ - xmm1 = _mm_slli_epi16(xmm1, 3); \ - xmm2 = xmm0; \ - xmm3 = xmm1; \ - xmm5 = _mm_set1_epi32(0xf37df37dUL); \ - xmm2 = _mm_mulhi_epi16(xmm2, xmm5); \ - xmm5 = _mm_set1_epi32(0xe5fce5fcUL); \ - xmm3 = _mm_mulhi_epi16(xmm3, xmm5); \ - xmm5 = _mm_set1_epi32(0x40934093UL); \ - xmm0 = _mm_mulhi_epi16(xmm0, xmm5); \ - xmm5 = _mm_set1_epi32(0x33123312UL); \ - xmm1 = _mm_mulhi_epi16(xmm1, xmm5); \ - xmm2 = _mm_adds_epi16(xmm2, xmm3); \ - \ - xmm5 = _mm_set1_epi32(0x10101010UL); \ - xmm6 = _mm_subs_epu8(xmm6, xmm5); \ - xmm7 = xmm6; \ - xmm5 = _mm_set1_epi32(0x00ff00ffUL); \ - xmm6 = _mm_and_si128(xmm6, xmm5); \ - xmm7 = _mm_srli_epi16(xmm7, 8); \ - xmm6 = _mm_slli_epi16(xmm6, 3); \ - xmm7 = _mm_slli_epi16(xmm7, 3); \ - xmm5 = _mm_set1_epi32(0x253f253fUL); \ - xmm6 = _mm_mulhi_epi16(xmm6, xmm5); \ - xmm7 = _mm_mulhi_epi16(xmm7, xmm5); - /* * Do the addition part of the conversion for even and odd pixels, * register usage: @@ -324,80 +162,6 @@ punpcklbw %%mm4, %%mm1 # R7 R6 R5 R4 R3 R2 R1 R0 \n\ punpcklbw %%mm5, %%mm2 # G7 G6 G5 G4 G3 G2 G1 G0 \n\ " -#define SSE2_YUV_ADD " \n\ -# Do horizontal and vertical scaling \n\ -movdqa %%xmm0, %%xmm3 # Copy Cblue \n\ -movdqa %%xmm1, %%xmm4 # Copy Cred \n\ -movdqa %%xmm2, %%xmm5 # Copy Cgreen \n\ -paddsw %%xmm6, %%xmm0 # Y even + Cblue 00 B6 00 B4 00 B2 00 B0 \n\ -paddsw %%xmm7, %%xmm3 # Y odd + Cblue 00 B7 00 B5 00 B3 00 B1 \n\ -paddsw %%xmm6, %%xmm1 # Y even + Cred 00 R6 00 R4 00 R2 00 R0 \n\ -paddsw %%xmm7, %%xmm4 # Y odd + Cred 00 R7 00 R5 00 R3 00 R1 \n\ -paddsw %%xmm6, %%xmm2 # Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 \n\ -paddsw %%xmm7, %%xmm5 # Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1 \n\ - \n\ -# Limit RGB even to 0..255 \n\ -packuswb %%xmm0, %%xmm0 # B6 B4 B2 B0 / B6 B4 B2 B0 \n\ -packuswb %%xmm1, %%xmm1 # R6 R4 R2 R0 / R6 R4 R2 R0 \n\ -packuswb %%xmm2, %%xmm2 # G6 G4 G2 G0 / G6 G4 G2 G0 \n\ - \n\ -# Limit RGB odd to 0..255 \n\ -packuswb %%xmm3, %%xmm3 # B7 B5 B3 B1 / B7 B5 B3 B1 \n\ 
-packuswb %%xmm4, %%xmm4 # R7 R5 R3 R1 / R7 R5 R3 R1 \n\ -packuswb %%xmm5, %%xmm5 # G7 G5 G3 G1 / G7 G5 G3 G1 \n\ - \n\ -# Interleave RGB even and odd \n\ -punpcklbw %%xmm3, %%xmm0 # B7 B6 B5 B4 B3 B2 B1 B0 \n\ -punpcklbw %%xmm4, %%xmm1 # R7 R6 R5 R4 R3 R2 R1 R0 \n\ -punpcklbw %%xmm5, %%xmm2 # G7 G6 G5 G4 G3 G2 G1 G0 \n\ -" - -#define MMX_INTRINSICS_YUV_ADD \ - mm3 = mm0; \ - mm4 = mm1; \ - mm5 = mm2; \ - mm0 = _mm_adds_pi16(mm0, mm6); \ - mm3 = _mm_adds_pi16(mm3, mm7); \ - mm1 = _mm_adds_pi16(mm1, mm6); \ - mm4 = _mm_adds_pi16(mm4, mm7); \ - mm2 = _mm_adds_pi16(mm2, mm6); \ - mm5 = _mm_adds_pi16(mm5, mm7); \ - \ - mm0 = _mm_packs_pu16(mm0, mm0); \ - mm1 = _mm_packs_pu16(mm1, mm1); \ - mm2 = _mm_packs_pu16(mm2, mm2); \ - \ - mm3 = _mm_packs_pu16(mm3, mm3); \ - mm4 = _mm_packs_pu16(mm4, mm4); \ - mm5 = _mm_packs_pu16(mm5, mm5); \ - \ - mm0 = _mm_unpacklo_pi8(mm0, mm3); \ - mm1 = _mm_unpacklo_pi8(mm1, mm4); \ - mm2 = _mm_unpacklo_pi8(mm2, mm5); - -#define SSE2_INTRINSICS_YUV_ADD \ - xmm3 = xmm0; \ - xmm4 = xmm1; \ - xmm5 = xmm2; \ - xmm0 = _mm_adds_epi16(xmm0, xmm6); \ - xmm3 = _mm_adds_epi16(xmm3, xmm7); \ - xmm1 = _mm_adds_epi16(xmm1, xmm6); \ - xmm4 = _mm_adds_epi16(xmm4, xmm7); \ - xmm2 = _mm_adds_epi16(xmm2, xmm6); \ - xmm5 = _mm_adds_epi16(xmm5, xmm7); \ - \ - xmm0 = _mm_packus_epi16(xmm0, xmm0); \ - xmm1 = _mm_packus_epi16(xmm1, xmm1); \ - xmm2 = _mm_packus_epi16(xmm2, xmm2); \ - \ - xmm3 = _mm_packus_epi16(xmm3, xmm3); \ - xmm4 = _mm_packus_epi16(xmm4, xmm4); \ - xmm5 = _mm_packus_epi16(xmm5, xmm5); \ - \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm3); \ - xmm1 = _mm_unpacklo_epi8(xmm1, xmm4); \ - xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); - /* * Grayscale case, only use Y */ @@ -476,138 +240,6 @@ movd 4(%2), %%mm1 # Load 4 Cr __ __ __ __ v3 v2 v1 v0 \n\ movq %%mm5, 8(%3) # store pixel 4-7 \n\ " -#define SSE2_UNPACK_15_ALIGNED " \n\ -# mask unneeded bits off \n\ -movl $0xf8f8f8f8, %%eax # \n\ -movd %%eax, %%xmm5 # \n\ -pshufd $0, %%xmm5, %%xmm5 # set xmm5 to f8f8 f8f8 ... f8f8 f8f8 \n\ -pand %%xmm5, %%xmm0 # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\ -psrlw $3,%%xmm0 # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\ -pand %%xmm5, %%xmm2 # g7g6g5g4 g3______ g7g6g5g4 g3______ \n\ -pand %%xmm5, %%xmm1 # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\ -psrlw $1,%%xmm1 # __r7r6r5 r4r3____ __r7r6r5 r4r3____ \n\ -pxor %%xmm4, %%xmm4 # zero mm4 \n\ -movdqa %%xmm0, %%xmm5 # Copy B15-B0 \n\ -movdqa %%xmm2, %%xmm7 # Copy G15-G0 \n\ - \n\ -# convert rgb24 plane to rgb15 pack for pixel 0-7 \n\ -punpcklbw %%xmm4, %%xmm2 # ________ ________ g7g6g5g4 g3______ \n\ -punpcklbw %%xmm1, %%xmm0 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\ -psllw $2,%%xmm2 # ________ ____g7g6 g5g4g3__ ________ \n\ -por %%xmm2, %%xmm0 # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\ -movntdq %%xmm0, (%3) # store pixel 0-7 \n\ - \n\ -# convert rgb24 plane to rgb15 pack for pixel 8-15 \n\ -punpckhbw %%xmm4, %%xmm7 # ________ ________ g7g6g5g4 g3______ \n\ -punpckhbw %%xmm1, %%xmm5 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\ -psllw $2,%%xmm7 # ________ ____g7g6 g5g4g3__ ________ \n\ -por %%xmm7, %%xmm5 # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\ -movntdq %%xmm5, 16(%3) # store pixel 4-7 \n\ -" - -#define SSE2_UNPACK_15_UNALIGNED " \n\ -# mask unneeded bits off \n\ -movl $0xf8f8f8f8, %%eax # \n\ -movd %%eax, %%xmm5 # \n\ -pshufd $0, %%xmm5, %%xmm5 # set xmm5 to f8f8 f8f8 ... 
f8f8 f8f8 \n\ -pand %%xmm5, %%xmm0 # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\ -psrlw $3,%%xmm0 # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\ -pand %%xmm5, %%xmm2 # g7g6g5g4 g3______ g7g6g5g4 g3______ \n\ -pand %%xmm5, %%xmm1 # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\ -psrlw $1,%%xmm1 # __r7r6r5 r4r3____ __r7r6r5 r4r3____ \n\ -pxor %%xmm4, %%xmm4 # zero mm4 \n\ -movdqa %%xmm0, %%xmm5 # Copy B15-B0 \n\ -movdqa %%xmm2, %%xmm7 # Copy G15-G0 \n\ - \n\ -# convert rgb24 plane to rgb15 pack for pixel 0-7 \n\ -punpcklbw %%xmm4, %%xmm2 # ________ ________ g7g6g5g4 g3______ \n\ -punpcklbw %%xmm1, %%xmm0 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\ -psllw $2,%%xmm2 # ________ ____g7g6 g5g4g3__ ________ \n\ -por %%xmm2, %%xmm0 # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\ -movdqu %%xmm0, (%3) # store pixel 0-7 \n\ - \n\ -# convert rgb24 plane to rgb15 pack for pixel 8-15 \n\ -punpckhbw %%xmm4, %%xmm7 # ________ ________ g7g6g5g4 g3______ \n\ -punpckhbw %%xmm1, %%xmm5 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\ -psllw $2,%%xmm7 # ________ ____g7g6 g5g4g3__ ________ \n\ -por %%xmm7, %%xmm5 # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\ -movdqu %%xmm5, 16(%3) # store pixel 4-7 \n\ -" - -#define MMX_INTRINSICS_UNPACK_15 \ - mm0 = _mm_and_si64(mm0, (__m64)mmx_mask_f8); \ - mm0 = _mm_srli_pi16(mm0, 3); \ - mm2 = _mm_and_si64(mm2, (__m64)mmx_mask_f8); \ - mm1 = _mm_and_si64(mm1, (__m64)mmx_mask_f8); \ - mm1 = _mm_srli_pi16(mm1, 1); \ - mm4 = _mm_setzero_si64(); \ - mm5 = mm0; \ - mm7 = mm2; \ - \ - mm2 = _mm_unpacklo_pi8(mm2, mm4); \ - mm0 = _mm_unpacklo_pi8(mm0, mm1); \ - mm2 = _mm_slli_pi16(mm2, 2); \ - mm0 = _mm_or_si64(mm0, mm2); \ - tmp64 = *(uint64_t *)(p_y + 8); \ - mm6 = (__m64)tmp64; \ - *(uint64_t *)p_buffer = (uint64_t)mm0; \ - \ - mm7 = _mm_unpackhi_pi8(mm7, mm4); \ - mm5 = _mm_unpackhi_pi8(mm5, mm1); \ - mm7 = _mm_slli_pi16(mm7, 2); \ - tmp64 = (uint64_t)*(uint32_t *)(p_u + 4); \ - mm0 = (__m64)tmp64; \ - mm5 = _mm_or_si64(mm5, mm7); \ - tmp64 = (uint64_t)*(uint32_t *)(p_v + 4); \ - mm1 = (__m64)tmp64; \ - *(uint64_t *)(p_buffer + 4) = (uint64_t)mm5; - -#define SSE2_INTRINSICS_UNPACK_15_ALIGNED \ - xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \ - xmm0 = _mm_and_si128(xmm0, xmm5); \ - xmm0 = _mm_srli_epi16(xmm0, 3); \ - xmm2 = _mm_and_si128(xmm2, xmm5); \ - xmm1 = _mm_and_si128(xmm1, xmm5); \ - xmm1 = _mm_srli_epi16(xmm1, 1); \ - xmm4 = _mm_setzero_si128(); \ - xmm5 = xmm0; \ - xmm7 = xmm2; \ - \ - xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm2 = _mm_slli_epi16(xmm2, 2); \ - xmm0 = _mm_or_si128(xmm0, xmm2); \ - _mm_stream_si128((__m128i*)p_buffer, xmm0); \ - \ - xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \ - xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \ - xmm7 = _mm_slli_epi16(xmm7, 2); \ - xmm5 = _mm_or_si128(xmm5, xmm7); \ - _mm_stream_si128((__m128i*)(p_buffer+8), xmm5); - -#define SSE2_INTRINSICS_UNPACK_15_UNALIGNED \ - xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \ - xmm0 = _mm_and_si128(xmm0, xmm5); \ - xmm0 = _mm_srli_epi16(xmm0, 3); \ - xmm2 = _mm_and_si128(xmm2, xmm5); \ - xmm1 = _mm_and_si128(xmm1, xmm5); \ - xmm1 = _mm_srli_epi16(xmm1, 1); \ - xmm4 = _mm_setzero_si128(); \ - xmm5 = xmm0; \ - xmm7 = xmm2; \ - \ - xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm2 = _mm_slli_epi16(xmm2, 2); \ - xmm0 = _mm_or_si128(xmm0, xmm2); \ - _mm_storeu_si128((__m128i*)p_buffer, xmm0); \ - \ - xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \ - xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \ - xmm7 = _mm_slli_epi16(xmm7, 2); \ - xmm5 = _mm_or_si128(xmm5, xmm7); \ - 
_mm_storeu_si128((__m128i*)(p_buffer+16), xmm5); - /* * convert RGB plane to RGB 16 bits, * mm0 -> B, mm1 -> R, mm2 -> G, @@ -643,36 +275,454 @@ movd 4(%2), %%mm1 # Load 4 Cr __ __ __ __ v3 v2 v1 v0 \n\ movq %%mm5, 8(%3) # store pixel 4-7 \n\ " -#define SSE2_UNPACK_16_ALIGNED " \n\ -# mask unneeded bits off \n\ -movl $0xf8f8f8f8, %%eax # \n\ -movd %%eax, %%xmm5 # \n\ -pshufd $0, %%xmm5, %%xmm5 # set xmm5 to f8f8 f8f8 ... f8f8 f8f8 \n\ -pand %%xmm5, %%xmm0 # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\ -pand %%xmm5, %%xmm1 # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\ -movl $0xfcfcfcfc, %%eax # \n\ -movd %%eax, %%xmm5 # \n\ -pshufd $0, %%xmm5, %%xmm5 # set xmm5 to f8f8 f8f8 ... f8f8 f8f8 \n\ -pand %%xmm5, %%xmm2 # g7g6g5g4 g3g2____ g7g6g5g4 g3g2____ \n\ -psrlw $3,%%xmm0 # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\ -pxor %%xmm4, %%xmm4 # zero mm4 \n\ -movdqa %%xmm0, %%xmm5 # Copy B15-B0 \n\ -movdqa %%xmm2, %%xmm7 # Copy G15-G0 \n\ - \n\ -# convert rgb24 plane to rgb16 pack for pixel 0-7 \n\ -punpcklbw %%xmm4, %%xmm2 # ________ ________ g7g6g5g4 g3g2____ \n\ -punpcklbw %%xmm1, %%xmm0 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\ -psllw $3,%%xmm2 # ________ __g7g6g5 g4g3g2__ ________ \n\ -por %%xmm2, %%xmm0 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\ -movntdq %%xmm0, (%3) # store pixel 0-7 \n\ - \n\ -# convert rgb24 plane to rgb16 pack for pixel 8-15 \n\ -punpckhbw %%xmm4, %%xmm7 # ________ ________ g7g6g5g4 g3g2____ \n\ -punpckhbw %%xmm1, %%xmm5 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\ -psllw $3,%%xmm7 # ________ __g7g6g5 g4g3g2__ ________ \n\ -por %%xmm7, %%xmm5 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\ -movntdq %%xmm5, 16(%3) # store pixel 4-7 \n\ -" +/* + * convert RGB plane to RGB packed format, + * mm0 -> B, mm1 -> R, mm2 -> G + */ + +#define MMX_UNPACK_32_ARGB " \n\ +pxor %%mm3, %%mm3 # zero mm3 \n\ +movq %%mm0, %%mm4 # B7 B6 B5 B4 B3 B2 B1 B0 \n\ +punpcklbw %%mm2, %%mm4 # G3 B3 G2 B2 G1 B1 G0 B0 \n\ +movq %%mm1, %%mm5 # R7 R6 R5 R4 R3 R2 R1 R0 \n\ +punpcklbw %%mm3, %%mm5 # 00 R3 00 R2 00 R1 00 R0 \n\ +movq %%mm4, %%mm6 # G3 B3 G2 B2 G1 B1 G0 B0 \n\ +punpcklwd %%mm5, %%mm4 # 00 R1 B1 G1 00 R0 B0 G0 \n\ +movq %%mm4, (%3) # Store ARGB1 ARGB0 \n\ +punpckhwd %%mm5, %%mm6 # 00 R3 B3 G3 00 R2 B2 G2 \n\ +movq %%mm6, 8(%3) # Store ARGB3 ARGB2 \n\ +punpckhbw %%mm2, %%mm0 # G7 B7 G6 B6 G5 B5 G4 B4 \n\ +punpckhbw %%mm3, %%mm1 # 00 R7 00 R6 00 R5 00 R4 \n\ +movq %%mm0, %%mm5 # G7 B7 G6 B6 G5 B5 G4 B4 \n\ +punpcklwd %%mm1, %%mm5 # 00 R5 B5 G5 00 R4 B4 G4 \n\ +movq %%mm5, 16(%3) # Store ARGB5 ARGB4 \n\ +punpckhwd %%mm1, %%mm0 # 00 R7 B7 G7 00 R6 B6 G6 \n\ +movq %%mm0, 24(%3) # Store ARGB7 ARGB6 \n\ +" + +#define MMX_UNPACK_32_BGRA " \n\ +pxor %%mm3, %%mm3 # zero mm3 \n\ +movq %%mm2, %%mm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\ +punpcklbw %%mm0, %%mm4 # B3 G3 B2 G2 B1 G1 B0 G0 \n\ +punpcklbw %%mm1, %%mm3 # R3 00 R2 00 R1 00 R0 00 \n\ +movq %%mm3, %%mm5 # R3 00 R2 00 R1 00 R0 00 \n\ +punpcklwd %%mm4, %%mm3 # B1 G1 R1 00 B0 G0 R0 00 \n\ +movq %%mm3, (%3) # Store BGRA1 BGRA0 \n\ +punpckhwd %%mm4, %%mm5 # B3 G3 R3 00 B2 G2 R2 00 \n\ +movq %%mm5, 8(%3) # Store BGRA3 BGRA2 \n\ +pxor %%mm6, %%mm6 # zero mm6 \n\ +punpckhbw %%mm0, %%mm2 # B7 G7 B6 G6 B5 G5 B4 G4 \n\ +punpckhbw %%mm1, %%mm6 # R7 00 R6 00 R5 00 R4 00 \n\ +movq %%mm6, %%mm0 # R7 00 R6 00 R5 00 R4 00 \n\ +punpcklwd %%mm2, %%mm6 # B5 G5 R5 00 B4 G4 R4 00 \n\ +movq %%mm6, 16(%3) # Store BGRA5 BGRA4 \n\ +punpckhwd %%mm2, %%mm0 # B7 G7 R7 00 B6 G6 R6 00 \n\ +movq %%mm0, 24(%3) # Store BGRA7 BGRA6 \n\ +" + +#define MMX_UNPACK_32_ABGR " \n\ +pxor %%mm3, %%mm3 # zero mm3 \n\ 
+movq %%mm1, %%mm4 # R7 R6 R5 R4 R3 R2 R1 R0 \n\ +punpcklbw %%mm2, %%mm4 # G3 R3 G2 R2 G1 R1 G0 R0 \n\ +movq %%mm0, %%mm5 # B7 B6 B5 B4 B3 B2 B1 B0 \n\ +punpcklbw %%mm3, %%mm5 # 00 B3 00 B2 00 B1 00 B0 \n\ +movq %%mm4, %%mm6 # G3 R3 G2 R2 G1 R1 G0 R0 \n\ +punpcklwd %%mm5, %%mm4 # 00 B1 G1 R1 00 B0 G0 R0 \n\ +movq %%mm4, (%3) # Store ABGR1 ABGR0 \n\ +punpckhwd %%mm5, %%mm6 # 00 B3 G3 R3 00 B2 G2 R2 \n\ +movq %%mm6, 8(%3) # Store ABGR3 ABGR2 \n\ +punpckhbw %%mm2, %%mm1 # G7 R7 G6 R6 G5 R5 G4 R4 \n\ +punpckhbw %%mm3, %%mm0 # 00 B7 00 B6 00 B5 00 B4 \n\ +movq %%mm1, %%mm2 # G7 R7 G6 R6 G5 R5 G4 R4 \n\ +punpcklwd %%mm0, %%mm1 # 00 B5 G5 R5 00 B4 G4 R4 \n\ +movq %%mm1, 16(%3) # Store ABGR5 ABGR4 \n\ +punpckhwd %%mm0, %%mm2 # B7 G7 R7 00 B6 G6 R6 00 \n\ +movq %%mm2, 24(%3) # Store ABGR7 ABGR6 \n\ +" + +#elif defined(HAVE_MMX_INTRINSICS) + +/* MMX intrinsics */ + +#include + +#define MMX_CALL(MMX_INSTRUCTIONS) \ + do { \ + __m64 mm0, mm1, mm2, mm3, \ + mm4, mm5, mm6, mm7; \ + MMX_INSTRUCTIONS \ + } while(0) + +#define MMX_END _mm_empty() + +#define MMX_INIT_16 \ + mm0 = _mm_cvtsi32_si64((int)*p_u); \ + mm1 = _mm_cvtsi32_si64((int)*p_v); \ + mm4 = _mm_setzero_si64(); \ + mm6 = (__m64)*(uint64_t *)p_y + +#define MMX_INIT_32 \ + mm0 = _mm_cvtsi32_si64((int)*p_u); \ + *(uint16_t *)p_buffer = 0; \ + mm1 = _mm_cvtsi32_si64((int)*p_v); \ + mm4 = _mm_setzero_si64(); \ + mm6 = (__m64)*(uint64_t *)p_y; + +#define MMX_YUV_MUL \ + mm0 = _mm_unpacklo_pi8(mm0, mm4); \ + mm1 = _mm_unpacklo_pi8(mm1, mm4); \ + mm0 = _mm_subs_pi16(mm0, (__m64)mmx_80w); \ + mm1 = _mm_subs_pi16(mm1, (__m64)mmx_80w); \ + mm0 = _mm_slli_pi16(mm0, 3); \ + mm1 = _mm_slli_pi16(mm1, 3); \ + mm2 = mm0; \ + mm3 = mm1; \ + mm2 = _mm_mulhi_pi16(mm2, (__m64)mmx_U_green); \ + mm3 = _mm_mulhi_pi16(mm3, (__m64)mmx_V_green); \ + mm0 = _mm_mulhi_pi16(mm0, (__m64)mmx_U_blue); \ + mm1 = _mm_mulhi_pi16(mm1, (__m64)mmx_V_red); \ + mm2 = _mm_adds_pi16(mm2, mm3); \ + \ + mm6 = _mm_subs_pu8(mm6, (__m64)mmx_10w); \ + mm7 = mm6; \ + mm6 = _mm_and_si64(mm6, (__m64)mmx_00ffw); \ + mm7 = _mm_srli_pi16(mm7, 8); \ + mm6 = _mm_slli_pi16(mm6, 3); \ + mm7 = _mm_slli_pi16(mm7, 3); \ + mm6 = _mm_mulhi_pi16(mm6, (__m64)mmx_Y_coeff); \ + mm7 = _mm_mulhi_pi16(mm7, (__m64)mmx_Y_coeff); + +#define MMX_YUV_ADD \ + mm3 = mm0; \ + mm4 = mm1; \ + mm5 = mm2; \ + mm0 = _mm_adds_pi16(mm0, mm6); \ + mm3 = _mm_adds_pi16(mm3, mm7); \ + mm1 = _mm_adds_pi16(mm1, mm6); \ + mm4 = _mm_adds_pi16(mm4, mm7); \ + mm2 = _mm_adds_pi16(mm2, mm6); \ + mm5 = _mm_adds_pi16(mm5, mm7); \ + \ + mm0 = _mm_packs_pu16(mm0, mm0); \ + mm1 = _mm_packs_pu16(mm1, mm1); \ + mm2 = _mm_packs_pu16(mm2, mm2); \ + \ + mm3 = _mm_packs_pu16(mm3, mm3); \ + mm4 = _mm_packs_pu16(mm4, mm4); \ + mm5 = _mm_packs_pu16(mm5, mm5); \ + \ + mm0 = _mm_unpacklo_pi8(mm0, mm3); \ + mm1 = _mm_unpacklo_pi8(mm1, mm4); \ + mm2 = _mm_unpacklo_pi8(mm2, mm5); + +#define MMX_UNPACK_15 \ + mm0 = _mm_and_si64(mm0, (__m64)mmx_mask_f8); \ + mm0 = _mm_srli_pi16(mm0, 3); \ + mm2 = _mm_and_si64(mm2, (__m64)mmx_mask_f8); \ + mm1 = _mm_and_si64(mm1, (__m64)mmx_mask_f8); \ + mm1 = _mm_srli_pi16(mm1, 1); \ + mm4 = _mm_setzero_si64(); \ + mm5 = mm0; \ + mm7 = mm2; \ + \ + mm2 = _mm_unpacklo_pi8(mm2, mm4); \ + mm0 = _mm_unpacklo_pi8(mm0, mm1); \ + mm2 = _mm_slli_pi16(mm2, 2); \ + mm0 = _mm_or_si64(mm0, mm2); \ + mm6 = (__m64)*(uint64_t *)(p_y + 8); \ + *(uint64_t *)p_buffer = (uint64_t)mm0; \ + \ + mm7 = _mm_unpackhi_pi8(mm7, mm4); \ + mm5 = _mm_unpackhi_pi8(mm5, mm1); \ + mm7 = _mm_slli_pi16(mm7, 2); \ + mm0 = _mm_cvtsi32_si64((int)*(uint32_t *)(p_u 
+ 4)); \ + mm5 = _mm_or_si64(mm5, mm7); \ + mm1 = _mm_cvtsi32_si64((int)*(uint32_t *)(p_v + 4)); \ + *(uint64_t *)(p_buffer + 4) = (uint64_t)mm5; + +#define MMX_UNPACK_16 \ + mm0 = _mm_and_si64(mm0, (__m64)mmx_mask_f8); \ + mm2 = _mm_and_si64(mm2, (__m64)mmx_mask_fc); \ + mm1 = _mm_and_si64(mm1, (__m64)mmx_mask_f8); \ + mm0 = _mm_srli_pi16(mm0, 3); \ + mm4 = _mm_setzero_si64(); \ + mm5 = mm0; \ + mm7 = mm2; \ + \ + mm2 = _mm_unpacklo_pi8(mm2, mm4); \ + mm0 = _mm_unpacklo_pi8(mm0, mm1); \ + mm2 = _mm_slli_pi16(mm2, 3); \ + mm0 = _mm_or_si64(mm0, mm2); \ + mm6 = (__m64)*(uint64_t *)(p_y + 8); \ + *(uint64_t *)p_buffer = (uint64_t)mm0; \ + \ + mm7 = _mm_unpackhi_pi8(mm7, mm4); \ + mm5 = _mm_unpackhi_pi8(mm5, mm1); \ + mm7 = _mm_slli_pi16(mm7, 3); \ + mm0 = _mm_cvtsi32_si64((int)*(uint32_t *)(p_u + 4)); \ + mm5 = _mm_or_si64(mm5, mm7); \ + mm1 = _mm_cvtsi32_si64((int)*(uint32_t *)(p_v + 4)); \ + *(uint64_t *)(p_buffer + 4) = (uint64_t)mm5; + +#define MMX_UNPACK_32_ARGB \ + mm3 = _mm_setzero_si64(); \ + mm4 = mm0; \ + mm4 = _mm_unpacklo_pi8(mm4, mm2); \ + mm5 = mm1; \ + mm5 = _mm_unpacklo_pi8(mm5, mm3); \ + mm6 = mm4; \ + mm4 = _mm_unpacklo_pi16(mm4, mm5); \ + *(uint64_t *)p_buffer = (uint64_t)mm4; \ + mm6 = _mm_unpackhi_pi16(mm6, mm5); \ + *(uint64_t *)(p_buffer + 2) = (uint64_t)mm6;\ + mm0 = _mm_unpackhi_pi8(mm0, mm2); \ + mm1 = _mm_unpackhi_pi8(mm1, mm3); \ + mm5 = mm0; \ + mm5 = _mm_unpacklo_pi16(mm5, mm1); \ + *(uint64_t *)(p_buffer + 4) = (uint64_t)mm5;\ + mm0 = _mm_unpackhi_pi16(mm0, mm1); \ + *(uint64_t *)(p_buffer + 6) = (uint64_t)mm0; + +#define MMX_UNPACK_32_BGRA \ + mm3 = _mm_setzero_si64(); \ + mm4 = mm2; \ + mm4 = _mm_unpacklo_pi8(mm4, mm0); \ + mm3 = _mm_unpacklo_pi8(mm3, mm1); \ + mm5 = mm3; \ + mm3 = _mm_unpacklo_pi16(mm3, mm4); \ + *(uint64_t *)p_buffer = (uint64_t)mm3; \ + mm5 = _mm_unpackhi_pi16(mm5, mm4); \ + *(uint64_t *)(p_buffer + 2) = (uint64_t)mm5;\ + mm6 = _mm_setzero_si64(); \ + mm2 = _mm_unpackhi_pi8(mm2, mm0); \ + mm6 = _mm_unpackhi_pi8(mm6, mm1); \ + mm0 = mm6; \ + mm6 = _mm_unpacklo_pi16(mm6, mm2); \ + *(uint64_t *)(p_buffer + 4) = (uint64_t)mm6;\ + mm0 = _mm_unpackhi_pi16(mm0, mm2); \ + *(uint64_t *)(p_buffer + 6) = (uint64_t)mm0; + +#define MMX_UNPACK_32_ABGR \ + ; + +#endif + +#elif defined( MODULE_NAME_IS_i420_rgb_sse2 ) + +#if defined(CAN_COMPILE_SSE2) + +/* SSE2 assembly */ + +#define SSE2_CALL(SSE2_INSTRUCTIONS) \ + do { \ + __asm__ __volatile__( \ + ".p2align 3 \n\t" \ + SSE2_INSTRUCTIONS \ + : \ + : "r" (p_y), "r" (p_u), \ + "r" (p_v), "r" (p_buffer) \ + : "eax" ); \ + } while(0) + +#define SSE2_END __asm__ __volatile__ ( "sfence" ::: "memory" ) + +#define SSE2_INIT_16_ALIGNED " \n\ +movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\ +movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\ +pxor %%xmm4, %%xmm4 # zero mm4 \n\ +movdqa (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ +" + +#define SSE2_INIT_16_UNALIGNED " \n\ +movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\ +movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\ +pxor %%xmm4, %%xmm4 # zero mm4 \n\ +movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ +prefetchnta (%3) # Tell CPU not to cache output RGB data \n\ +" + +#define SSE2_INIT_32_ALIGNED " \n\ +movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\ +movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\ +pxor %%xmm4, %%xmm4 # zero mm4 \n\ +movdqa (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ +" + +#define SSE2_INIT_32_UNALIGNED " \n\ +movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 
\n\ +movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\ +pxor %%xmm4, %%xmm4 # zero mm4 \n\ +movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ +prefetchnta (%3) # Tell CPU not to cache output RGB data \n\ +" + +#define SSE2_YUV_MUL " \n\ +# convert the chroma part \n\ +punpcklbw %%xmm4, %%xmm0 # scatter 8 Cb 00 u3 00 u2 00 u1 00 u0 \n\ +punpcklbw %%xmm4, %%xmm1 # scatter 8 Cr 00 v3 00 v2 00 v1 00 v0 \n\ +movl $0x00800080, %%eax # \n\ +movd %%eax, %%xmm5 # \n\ +pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to 0080 0080 ... 0080 0080 \n\ +psubsw %%xmm5, %%xmm0 # Cb -= 128 \n\ +psubsw %%xmm5, %%xmm1 # Cr -= 128 \n\ +psllw $3, %%xmm0 # Promote precision \n\ +psllw $3, %%xmm1 # Promote precision \n\ +movdqa %%xmm0, %%xmm2 # Copy 8 Cb 00 u3 00 u2 00 u1 00 u0 \n\ +movdqa %%xmm1, %%xmm3 # Copy 8 Cr 00 v3 00 v2 00 v1 00 v0 \n\ +movl $0xf37df37d, %%eax # \n\ +movd %%eax, %%xmm5 # \n\ +pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to f37d f37d ... f37d f37d \n\ +pmulhw %%xmm5, %%xmm2 # Mul Cb with green coeff -> Cb green \n\ +movl $0xe5fce5fc, %%eax # \n\ +movd %%eax, %%xmm5 # \n\ +pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to e5fc e5fc ... e5fc e5fc \n\ +pmulhw %%xmm5, %%xmm3 # Mul Cr with green coeff -> Cr green \n\ +movl $0x40934093, %%eax # \n\ +movd %%eax, %%xmm5 # \n\ +pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to 4093 4093 ... 4093 4093 \n\ +pmulhw %%xmm5, %%xmm0 # Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 \n\ +movl $0x33123312, %%eax # \n\ +movd %%eax, %%xmm5 # \n\ +pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to 3312 3312 ... 3312 3312 \n\ +pmulhw %%xmm5, %%xmm1 # Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 \n\ +paddsw %%xmm3, %%xmm2 # Cb green + Cr green -> Cgreen \n\ + \n\ +# convert the luma part \n\ +movl $0x10101010, %%eax # \n\ +movd %%eax, %%xmm5 # \n\ +pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to 1010 1010 ... 1010 1010 \n\ +psubusb %%xmm5, %%xmm6 # Y -= 16 \n\ +movdqa %%xmm6, %%xmm7 # Copy 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ +movl $0x00ff00ff, %%eax # \n\ +movd %%eax, %%xmm5 # \n\ +pshufd $0, %%xmm5, %%xmm5 # set xmm5 to 00ff 00ff ... 00ff 00ff \n\ +pand %%xmm5, %%xmm6 # get Y even 00 Y6 00 Y4 00 Y2 00 Y0 \n\ +psrlw $8, %%xmm7 # get Y odd 00 Y7 00 Y5 00 Y3 00 Y1 \n\ +psllw $3, %%xmm6 # Promote precision \n\ +psllw $3, %%xmm7 # Promote precision \n\ +movl $0x253f253f, %%eax # \n\ +movd %%eax, %%xmm5 # \n\ +pshufd $0, %%xmm5, %%xmm5 # set xmm5 to 253f 253f ... 
253f 253f \n\ +pmulhw %%xmm5, %%xmm6 # Mul 8 Y even 00 y6 00 y4 00 y2 00 y0 \n\ +pmulhw %%xmm5, %%xmm7 # Mul 8 Y odd 00 y7 00 y5 00 y3 00 y1 \n\ +" + +#define SSE2_YUV_ADD " \n\ +# Do horizontal and vertical scaling \n\ +movdqa %%xmm0, %%xmm3 # Copy Cblue \n\ +movdqa %%xmm1, %%xmm4 # Copy Cred \n\ +movdqa %%xmm2, %%xmm5 # Copy Cgreen \n\ +paddsw %%xmm6, %%xmm0 # Y even + Cblue 00 B6 00 B4 00 B2 00 B0 \n\ +paddsw %%xmm7, %%xmm3 # Y odd + Cblue 00 B7 00 B5 00 B3 00 B1 \n\ +paddsw %%xmm6, %%xmm1 # Y even + Cred 00 R6 00 R4 00 R2 00 R0 \n\ +paddsw %%xmm7, %%xmm4 # Y odd + Cred 00 R7 00 R5 00 R3 00 R1 \n\ +paddsw %%xmm6, %%xmm2 # Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 \n\ +paddsw %%xmm7, %%xmm5 # Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1 \n\ + \n\ +# Limit RGB even to 0..255 \n\ +packuswb %%xmm0, %%xmm0 # B6 B4 B2 B0 / B6 B4 B2 B0 \n\ +packuswb %%xmm1, %%xmm1 # R6 R4 R2 R0 / R6 R4 R2 R0 \n\ +packuswb %%xmm2, %%xmm2 # G6 G4 G2 G0 / G6 G4 G2 G0 \n\ + \n\ +# Limit RGB odd to 0..255 \n\ +packuswb %%xmm3, %%xmm3 # B7 B5 B3 B1 / B7 B5 B3 B1 \n\ +packuswb %%xmm4, %%xmm4 # R7 R5 R3 R1 / R7 R5 R3 R1 \n\ +packuswb %%xmm5, %%xmm5 # G7 G5 G3 G1 / G7 G5 G3 G1 \n\ + \n\ +# Interleave RGB even and odd \n\ +punpcklbw %%xmm3, %%xmm0 # B7 B6 B5 B4 B3 B2 B1 B0 \n\ +punpcklbw %%xmm4, %%xmm1 # R7 R6 R5 R4 R3 R2 R1 R0 \n\ +punpcklbw %%xmm5, %%xmm2 # G7 G6 G5 G4 G3 G2 G1 G0 \n\ +" + +#define SSE2_UNPACK_15_ALIGNED " \n\ +# mask unneeded bits off \n\ +movl $0xf8f8f8f8, %%eax # \n\ +movd %%eax, %%xmm5 # \n\ +pshufd $0, %%xmm5, %%xmm5 # set xmm5 to f8f8 f8f8 ... f8f8 f8f8 \n\ +pand %%xmm5, %%xmm0 # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\ +psrlw $3,%%xmm0 # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\ +pand %%xmm5, %%xmm2 # g7g6g5g4 g3______ g7g6g5g4 g3______ \n\ +pand %%xmm5, %%xmm1 # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\ +psrlw $1,%%xmm1 # __r7r6r5 r4r3____ __r7r6r5 r4r3____ \n\ +pxor %%xmm4, %%xmm4 # zero mm4 \n\ +movdqa %%xmm0, %%xmm5 # Copy B15-B0 \n\ +movdqa %%xmm2, %%xmm7 # Copy G15-G0 \n\ + \n\ +# convert rgb24 plane to rgb15 pack for pixel 0-7 \n\ +punpcklbw %%xmm4, %%xmm2 # ________ ________ g7g6g5g4 g3______ \n\ +punpcklbw %%xmm1, %%xmm0 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\ +psllw $2,%%xmm2 # ________ ____g7g6 g5g4g3__ ________ \n\ +por %%xmm2, %%xmm0 # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\ +movntdq %%xmm0, (%3) # store pixel 0-7 \n\ + \n\ +# convert rgb24 plane to rgb15 pack for pixel 8-15 \n\ +punpckhbw %%xmm4, %%xmm7 # ________ ________ g7g6g5g4 g3______ \n\ +punpckhbw %%xmm1, %%xmm5 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\ +psllw $2,%%xmm7 # ________ ____g7g6 g5g4g3__ ________ \n\ +por %%xmm7, %%xmm5 # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\ +movntdq %%xmm5, 16(%3) # store pixel 4-7 \n\ +" + +#define SSE2_UNPACK_15_UNALIGNED " \n\ +# mask unneeded bits off \n\ +movl $0xf8f8f8f8, %%eax # \n\ +movd %%eax, %%xmm5 # \n\ +pshufd $0, %%xmm5, %%xmm5 # set xmm5 to f8f8 f8f8 ... 
f8f8 f8f8 \n\ +pand %%xmm5, %%xmm0 # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\ +psrlw $3,%%xmm0 # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\ +pand %%xmm5, %%xmm2 # g7g6g5g4 g3______ g7g6g5g4 g3______ \n\ +pand %%xmm5, %%xmm1 # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\ +psrlw $1,%%xmm1 # __r7r6r5 r4r3____ __r7r6r5 r4r3____ \n\ +pxor %%xmm4, %%xmm4 # zero mm4 \n\ +movdqa %%xmm0, %%xmm5 # Copy B15-B0 \n\ +movdqa %%xmm2, %%xmm7 # Copy G15-G0 \n\ + \n\ +# convert rgb24 plane to rgb15 pack for pixel 0-7 \n\ +punpcklbw %%xmm4, %%xmm2 # ________ ________ g7g6g5g4 g3______ \n\ +punpcklbw %%xmm1, %%xmm0 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\ +psllw $2,%%xmm2 # ________ ____g7g6 g5g4g3__ ________ \n\ +por %%xmm2, %%xmm0 # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\ +movdqu %%xmm0, (%3) # store pixel 0-7 \n\ + \n\ +# convert rgb24 plane to rgb15 pack for pixel 8-15 \n\ +punpckhbw %%xmm4, %%xmm7 # ________ ________ g7g6g5g4 g3______ \n\ +punpckhbw %%xmm1, %%xmm5 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\ +psllw $2,%%xmm7 # ________ ____g7g6 g5g4g3__ ________ \n\ +por %%xmm7, %%xmm5 # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\ +movdqu %%xmm5, 16(%3) # store pixel 4-7 \n\ +" + +#define SSE2_UNPACK_16_ALIGNED " \n\ +# mask unneeded bits off \n\ +movl $0xf8f8f8f8, %%eax # \n\ +movd %%eax, %%xmm5 # \n\ +pshufd $0, %%xmm5, %%xmm5 # set xmm5 to f8f8 f8f8 ... f8f8 f8f8 \n\ +pand %%xmm5, %%xmm0 # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\ +pand %%xmm5, %%xmm1 # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\ +movl $0xfcfcfcfc, %%eax # \n\ +movd %%eax, %%xmm5 # \n\ +pshufd $0, %%xmm5, %%xmm5 # set xmm5 to f8f8 f8f8 ... f8f8 f8f8 \n\ +pand %%xmm5, %%xmm2 # g7g6g5g4 g3g2____ g7g6g5g4 g3g2____ \n\ +psrlw $3,%%xmm0 # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\ +pxor %%xmm4, %%xmm4 # zero mm4 \n\ +movdqa %%xmm0, %%xmm5 # Copy B15-B0 \n\ +movdqa %%xmm2, %%xmm7 # Copy G15-G0 \n\ + \n\ +# convert rgb24 plane to rgb16 pack for pixel 0-7 \n\ +punpcklbw %%xmm4, %%xmm2 # ________ ________ g7g6g5g4 g3g2____ \n\ +punpcklbw %%xmm1, %%xmm0 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\ +psllw $3,%%xmm2 # ________ __g7g6g5 g4g3g2__ ________ \n\ +por %%xmm2, %%xmm0 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\ +movntdq %%xmm0, (%3) # store pixel 0-7 \n\ + \n\ +# convert rgb24 plane to rgb16 pack for pixel 8-15 \n\ +punpckhbw %%xmm4, %%xmm7 # ________ ________ g7g6g5g4 g3g2____ \n\ +punpckhbw %%xmm1, %%xmm5 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\ +psllw $3,%%xmm7 # ________ __g7g6g5 g4g3g2__ ________ \n\ +por %%xmm7, %%xmm5 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\ +movntdq %%xmm5, 16(%3) # store pixel 4-7 \n\ +" #define SSE2_UNPACK_16_UNALIGNED " \n\ # mask unneeded bits off \n\ @@ -705,104 +755,6 @@ por %%xmm7, %%xmm5 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\ movdqu %%xmm5, 16(%3) # store pixel 4-7 \n\ " -#define MMX_INTRINSICS_UNPACK_16 \ - mm0 = _mm_and_si64(mm0, (__m64)mmx_mask_f8); \ - mm2 = _mm_and_si64(mm2, (__m64)mmx_mask_fc); \ - mm1 = _mm_and_si64(mm1, (__m64)mmx_mask_f8); \ - mm0 = _mm_srli_pi16(mm0, 3); \ - mm4 = _mm_setzero_si64(); \ - mm5 = mm0; \ - mm7 = mm2; \ - \ - mm2 = _mm_unpacklo_pi8(mm2, mm4); \ - mm0 = _mm_unpacklo_pi8(mm0, mm1); \ - mm2 = _mm_slli_pi16(mm2, 3); \ - mm0 = _mm_or_si64(mm0, mm2); \ - tmp64 = *(uint64_t *)(p_y + 8); \ - mm6 = (__m64)tmp64; \ - *(uint64_t *)p_buffer = (uint64_t)mm0; \ - \ - mm7 = _mm_unpackhi_pi8(mm7, mm4); \ - mm5 = _mm_unpackhi_pi8(mm5, mm1); \ - mm7 = _mm_slli_pi16(mm7, 3); \ - tmp64 = (uint64_t)*(uint32_t *)(p_u + 4); \ - mm0 = (__m64)tmp64; \ - mm5 = _mm_or_si64(mm5, mm7); \ - tmp64 = (uint64_t)*(uint32_t 
*)(p_v + 4); \ - mm1 = (__m64)tmp64; \ - *(uint64_t *)(p_buffer + 4) = (uint64_t)mm5; - -#define SSE2_INTRINSICS_UNPACK_16_ALIGNED \ - xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \ - xmm0 = _mm_and_si128(xmm0, xmm5); \ - xmm1 = _mm_and_si128(xmm1, xmm5); \ - xmm5 = _mm_set1_epi32(0xfcfcfcfcUL); \ - xmm2 = _mm_and_si128(xmm2, xmm5); \ - xmm0 = _mm_srli_epi16(xmm0, 3); \ - xmm4 = _mm_setzero_si128(); \ - xmm5 = xmm0; \ - xmm7 = xmm2; \ - \ - xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm2 = _mm_slli_epi16(xmm2, 3); \ - xmm0 = _mm_or_si128(xmm0, xmm2); \ - _mm_stream_si128((__m128i*)p_buffer, xmm0); \ - \ - xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \ - xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \ - xmm7 = _mm_slli_epi16(xmm7, 3); \ - xmm5 = _mm_or_si128(xmm5, xmm7); \ - _mm_stream_si128((__m128i*)(p_buffer+8), xmm5); - -#define SSE2_INTRINSICS_UNPACK_16_UNALIGNED \ - xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \ - xmm0 = _mm_and_si128(xmm0, xmm5); \ - xmm1 = _mm_and_si128(xmm1, xmm5); \ - xmm5 = _mm_set1_epi32(0xfcfcfcfcUL); \ - xmm2 = _mm_and_si128(xmm2, xmm5); \ - xmm0 = _mm_srli_epi16(xmm0, 3); \ - xmm4 = _mm_setzero_si128(); \ - xmm5 = xmm0; \ - xmm7 = xmm2; \ - \ - xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm2 = _mm_slli_epi16(xmm2, 3); \ - xmm0 = _mm_or_si128(xmm0, xmm2); \ - _mm_storeu_si128((__m128i*)p_buffer, xmm0); \ - \ - xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \ - xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \ - xmm7 = _mm_slli_epi16(xmm7, 3); \ - xmm5 = _mm_or_si128(xmm5, xmm7); \ - _mm_storeu_si128((__m128i*)(p_buffer+8), xmm5); - -/* - * convert RGB plane to RGB packed format, - * mm0 -> B, mm1 -> R, mm2 -> G - */ - -#define MMX_UNPACK_32_ARGB " \n\ -pxor %%mm3, %%mm3 # zero mm3 \n\ -movq %%mm0, %%mm4 # B7 B6 B5 B4 B3 B2 B1 B0 \n\ -punpcklbw %%mm2, %%mm4 # G3 B3 G2 B2 G1 B1 G0 B0 \n\ -movq %%mm1, %%mm5 # R7 R6 R5 R4 R3 R2 R1 R0 \n\ -punpcklbw %%mm3, %%mm5 # 00 R3 00 R2 00 R1 00 R0 \n\ -movq %%mm4, %%mm6 # G3 B3 G2 B2 G1 B1 G0 B0 \n\ -punpcklwd %%mm5, %%mm4 # 00 R1 B1 G1 00 R0 B0 G0 \n\ -movq %%mm4, (%3) # Store ARGB1 ARGB0 \n\ -punpckhwd %%mm5, %%mm6 # 00 R3 B3 G3 00 R2 B2 G2 \n\ -movq %%mm6, 8(%3) # Store ARGB3 ARGB2 \n\ -punpckhbw %%mm2, %%mm0 # G7 B7 G6 B6 G5 B5 G4 B4 \n\ -punpckhbw %%mm3, %%mm1 # 00 R7 00 R6 00 R5 00 R4 \n\ -movq %%mm0, %%mm5 # G7 B7 G6 B6 G5 B5 G4 B4 \n\ -punpcklwd %%mm1, %%mm5 # 00 R5 B5 G5 00 R4 B4 G4 \n\ -movq %%mm5, 16(%3) # Store ARGB5 ARGB4 \n\ -punpckhwd %%mm1, %%mm0 # 00 R7 B7 G7 00 R6 B6 G6 \n\ -movq %%mm0, 24(%3) # Store ARGB7 ARGB6 \n\ -" - #define SSE2_UNPACK_32_ARGB_ALIGNED " \n\ pxor %%xmm3, %%xmm3 # zero xmm3 \n\ movdqa %%xmm0, %%xmm4 # B7 B6 B5 B4 B3 B2 B1 B0 \n\ @@ -843,84 +795,6 @@ punpckhwd %%xmm1, %%xmm0 # 00 R7 B7 G7 00 R6 B6 G6 \n\ movdqu %%xmm0, 48(%3) # Store ARGB15 ARGB14 ARGB13 ARGB12 \n\ " -#define MMX_INTRINSICS_UNPACK_32_ARGB \ - mm3 = _mm_setzero_si64(); \ - mm4 = mm0; \ - mm4 = _mm_unpacklo_pi8(mm4, mm2); \ - mm5 = mm1; \ - mm5 = _mm_unpacklo_pi8(mm5, mm3); \ - mm6 = mm4; \ - mm4 = _mm_unpacklo_pi16(mm4, mm5); \ - *(uint64_t *)p_buffer = (uint64_t)mm4; \ - mm6 = _mm_unpackhi_pi16(mm6, mm5); \ - *(uint64_t *)(p_buffer + 2) = (uint64_t)mm6; \ - mm0 = _mm_unpackhi_pi8(mm0, mm2); \ - mm1 = _mm_unpackhi_pi8(mm1, mm3); \ - mm5 = mm0; \ - mm5 = _mm_unpacklo_pi16(mm5, mm1); \ - *(uint64_t *)(p_buffer + 4) = (uint64_t)mm5; \ - mm0 = _mm_unpackhi_pi16(mm0, mm1); \ - *(uint64_t *)(p_buffer + 6) = (uint64_t)mm0; - -#define SSE2_INTRINSICS_UNPACK_32_ARGB_ALIGNED \ - xmm3 = 
_mm_setzero_si128(); \ - xmm4 = xmm0; \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \ - xmm5 = xmm1; \ - xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \ - xmm6 = xmm4; \ - xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \ - _mm_stream_si128((__m128i*)(p_buffer), xmm4); \ - xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \ - _mm_stream_si128((__m128i*)(p_buffer+4), xmm6); \ - xmm0 = _mm_unpackhi_epi8(xmm0, xmm2); \ - xmm1 = _mm_unpackhi_epi8(xmm1, xmm3); \ - xmm5 = xmm0; \ - xmm5 = _mm_unpacklo_epi16(xmm5, xmm1); \ - _mm_stream_si128((__m128i*)(p_buffer+8), xmm5); \ - xmm0 = _mm_unpackhi_epi16(xmm0, xmm1); \ - _mm_stream_si128((__m128i*)(p_buffer+12), xmm0); - -#define SSE2_INTRINSICS_UNPACK_32_ARGB_UNALIGNED \ - xmm3 = _mm_setzero_si128(); \ - xmm4 = xmm0; \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \ - xmm5 = xmm1; \ - xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \ - xmm6 = xmm4; \ - xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \ - _mm_storeu_si128((__m128i*)(p_buffer), xmm4); \ - xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \ - _mm_storeu_si128((__m128i*)(p_buffer+4), xmm6); \ - xmm0 = _mm_unpackhi_epi8(xmm0, xmm2); \ - xmm1 = _mm_unpackhi_epi8(xmm1, xmm3); \ - xmm5 = xmm0; \ - xmm5 = _mm_unpacklo_epi16(xmm5, xmm1); \ - _mm_storeu_si128((__m128i*)(p_buffer+8), xmm5); \ - xmm0 = _mm_unpackhi_epi16(xmm0, xmm1); \ - _mm_storeu_si128((__m128i*)(p_buffer+12), xmm0); - -#define MMX_UNPACK_32_BGRA " \n\ -pxor %%mm3, %%mm3 # zero mm3 \n\ -movq %%mm2, %%mm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\ -punpcklbw %%mm0, %%mm4 # B3 G3 B2 G2 B1 G1 B0 G0 \n\ -punpcklbw %%mm1, %%mm3 # R3 00 R2 00 R1 00 R0 00 \n\ -movq %%mm3, %%mm5 # R3 00 R2 00 R1 00 R0 00 \n\ -punpcklwd %%mm4, %%mm3 # B1 G1 R1 00 B0 G0 R0 00 \n\ -movq %%mm3, (%3) # Store BGRA1 BGRA0 \n\ -punpckhwd %%mm4, %%mm5 # B3 G3 R3 00 B2 G2 R2 00 \n\ -movq %%mm5, 8(%3) # Store BGRA3 BGRA2 \n\ -pxor %%mm3, %%mm3 # zero mm3 \n\ -movq %%mm2, %%mm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\ -punpckhbw %%mm0, %%mm4 # B7 G7 B6 G6 B5 G5 B4 G4 \n\ -punpckhbw %%mm1, %%mm3 # R7 00 R6 00 R5 00 R4 00 \n\ -movq %%mm3, %%mm5 # R7 00 R6 00 R5 00 R4 00 \n\ -punpcklwd %%mm1, %%mm3 # B5 G5 R5 00 B4 G4 R4 00 \n\ -movq %%mm3, 16(%3) # Store BGRA5 BGRA4 \n\ -punpckhwd %%mm4, %%mm5 # B7 G7 R7 00 B6 G6 R6 00 \n\ -movq %%mm5, 24(%3) # Store BGRA7 BGRA6 \n\ -" - #define SSE2_UNPACK_32_BGRA_ALIGNED " \n\ pxor %%xmm3, %%xmm3 # zero mm3 \n\ movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\ @@ -930,16 +804,15 @@ movdqa %%xmm3, %%xmm5 # R3 00 R2 00 R1 00 R0 00 \n\ punpcklwd %%xmm4, %%xmm3 # B1 G1 R1 00 B0 G0 R0 00 \n\ movntdq %%xmm3, (%3) # Store BGRA3 BGRA2 BGRA1 BGRA0 \n\ punpckhwd %%xmm4, %%xmm5 # B3 G3 R3 00 B2 G2 R2 00 \n\ -movntdq %%xmm5, 8(%3) # Store BGRA7 BGRA6 BGRA5 BGRA4 \n\ -pxor %%xmm3, %%xmm3 # zero mm3 \n\ -movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\ -punpckhbw %%xmm0, %%xmm4 # B7 G7 B6 G6 B5 G5 B4 G4 \n\ -punpckhbw %%xmm1, %%xmm3 # R7 00 R6 00 R5 00 R4 00 \n\ -movdqa %%xmm3, %%xmm5 # R7 00 R6 00 R5 00 R4 00 \n\ -punpcklwd %%xmm1, %%xmm3 # B5 G5 R5 00 B4 G4 R4 00 \n\ -movntdq %%xmm3, 16(%3) # Store BGRA11 BGRA10 BGRA9 BGRA8 \n\ -punpckhwd %%xmm4, %%xmm5 # B7 G7 R7 00 B6 G6 R6 00 \n\ -movntdq %%xmm5, 24(%3) # Store BGRA15 BGRA14 BGRA13 BGRA12 \n\ +movntdq %%xmm5, 16(%3) # Store BGRA7 BGRA6 BGRA5 BGRA4 \n\ +pxor %%xmm6, %%xmm6 # zero mm6 \n\ +punpckhbw %%xmm0, %%xmm2 # B7 G7 B6 G6 B5 G5 B4 G4 \n\ +punpckhbw %%xmm1, %%xmm6 # R7 00 R6 00 R5 00 R4 00 \n\ +movdqa %%xmm6, %%xmm0 # R7 00 R6 00 R5 00 R4 00 \n\ +punpcklwd %%xmm2, %%xmm6 # B5 G5 R5 00 B4 G4 R4 00 \n\ +movntdq %%xmm6, 32(%3) # Store BGRA11 BGRA10 BGRA9 BGRA8 \n\ 
+punpckhwd %%xmm2, %%xmm0 # B7 G7 R7 00 B6 G6 R6 00 \n\ +movntdq %%xmm0, 48(%3) # Store BGRA15 BGRA14 BGRA13 BGRA12 \n\ " #define SSE2_UNPACK_32_BGRA_UNALIGNED " \n\ @@ -951,75 +824,327 @@ movdqa %%xmm3, %%xmm5 # R3 00 R2 00 R1 00 R0 00 \n\ punpcklwd %%xmm4, %%xmm3 # B1 G1 R1 00 B0 G0 R0 00 \n\ movdqu %%xmm3, (%3) # Store BGRA3 BGRA2 BGRA1 BGRA0 \n\ punpckhwd %%xmm4, %%xmm5 # B3 G3 R3 00 B2 G2 R2 00 \n\ -movdqu %%xmm5, 8(%3) # Store BGRA7 BGRA6 BGRA5 BGRA4 \n\ +movdqu %%xmm5, 16(%3) # Store BGRA7 BGRA6 BGRA5 BGRA4 \n\ +pxor %%xmm6, %%xmm6 # zero mm6 \n\ +punpckhbw %%xmm0, %%xmm2 # B7 G7 B6 G6 B5 G5 B4 G4 \n\ +punpckhbw %%xmm1, %%xmm6 # R7 00 R6 00 R5 00 R4 00 \n\ +movdqa %%xmm6, %%xmm0 # R7 00 R6 00 R5 00 R4 00 \n\ +punpcklwd %%xmm2, %%xmm6 # B5 G5 R5 00 B4 G4 R4 00 \n\ +movdqu %%xmm6, 32(%3) # Store BGRA11 BGRA10 BGRA9 BGRA8 \n\ +punpckhwd %%xmm2, %%xmm0 # B7 G7 R7 00 B6 G6 R6 00 \n\ +movdqu %%xmm0, 48(%3) # Store BGRA15 BGRA14 BGRA13 BGRA12 \n\ +" + +#define SSE2_UNPACK_32_ABGR_ALIGNED " \n\ pxor %%xmm3, %%xmm3 # zero mm3 \n\ -movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\ -punpckhbw %%xmm0, %%xmm4 # B7 G7 B6 G6 B5 G5 B4 G4 \n\ -punpckhbw %%xmm1, %%xmm3 # R7 00 R6 00 R5 00 R4 00 \n\ -movdqa %%xmm3, %%xmm5 # R7 00 R6 00 R5 00 R4 00 \n\ -punpcklwd %%xmm1, %%xmm3 # B5 G5 R5 00 B4 G4 R4 00 \n\ -movdqu %%xmm3, 16(%3) # Store BGRA11 BGRA10 BGRA9 BGRA8 \n\ -punpckhwd %%xmm4, %%xmm5 # B7 G7 R7 00 B6 G6 R6 00 \n\ -movdqu %%xmm5, 24(%3) # Store BGRA15 BGRA14 BGRA13 BGRA12 \n\ +movdqa %%xmm1, %%xmm4 # R7 R6 R5 R4 R3 R2 R1 R0 \n\ +punpcklbw %%xmm2, %%xmm4 # G3 R3 G2 R2 G1 R1 G0 R0 \n\ +movdqa %%xmm0, %%xmm5 # B7 B6 B5 B4 B3 B2 B1 B0 \n\ +punpcklbw %%xmm3, %%xmm5 # 00 B3 00 B2 00 B1 00 B0 \n\ +movdqa %%xmm4, %%xmm6 # G3 R3 G2 R2 G1 R1 G0 R0 \n\ +punpcklwd %%xmm5, %%xmm4 # 00 B1 G1 R1 00 B0 G0 R0 \n\ +movntdq %%xmm4, (%3) # Store ABGR3 ABGR2 ABGR1 ABGR0 \n\ +punpckhwd %%xmm5, %%xmm6 # 00 B3 G3 R3 00 B2 G2 R2 \n\ +movntdq %%xmm6, 16(%3) # Store ABGR7 ABGR6 ABGR5 ABGR4 \n\ +punpckhbw %%xmm2, %%xmm1 # G7 R7 G6 R6 G5 R5 G4 R4 \n\ +punpckhbw %%xmm3, %%xmm0 # 00 B7 00 B6 00 B5 00 B4 \n\ +movdqa %%xmm1, %%xmm2 # G7 R7 G6 R6 G5 R5 G4 R4 \n\ +punpcklwd %%xmm0, %%xmm1 # 00 B5 G5 R5 00 B4 G4 R4 \n\ +movntdq %%xmm1, 32(%3) # Store ABGR11 ABGR10 ABGR9 ABGR8 \n\ +punpckhwd %%xmm0, %%xmm2 # B7 G7 R7 00 B6 G6 R6 00 \n\ +movntdq %%xmm2, 48(%3) # Store ABGR15 ABGR14 ABGR13 ABGR12 \n\ +" + +#define SSE2_UNPACK_32_ABGR_UNALIGNED " \n\ +pxor %%xmm3, %%xmm3 # zero mm3 \n\ +movdqa %%xmm1, %%xmm4 # R7 R6 R5 R4 R3 R2 R1 R0 \n\ +punpcklbw %%xmm2, %%xmm4 # G3 R3 G2 R2 G1 R1 G0 R0 \n\ +movdqa %%xmm0, %%xmm5 # B7 B6 B5 B4 B3 B2 B1 B0 \n\ +punpcklbw %%xmm3, %%xmm5 # 00 B3 00 B2 00 B1 00 B0 \n\ +movdqa %%xmm4, %%xmm6 # G3 R3 G2 R2 G1 R1 G0 R0 \n\ +punpcklwd %%xmm5, %%xmm4 # 00 B1 G1 R1 00 B0 G0 R0 \n\ +movdqu %%xmm4, (%3) # Store ABGR3 ABGR2 ABGR1 ABGR0 \n\ +punpckhwd %%xmm5, %%xmm6 # 00 B3 G3 R3 00 B2 G2 R2 \n\ +movdqu %%xmm6, 16(%3) # Store ABGR7 ABGR6 ABGR5 ABGR4 \n\ +punpckhbw %%xmm2, %%xmm1 # G7 R7 G6 R6 G5 R5 G4 R4 \n\ +punpckhbw %%xmm3, %%xmm0 # 00 B7 00 B6 00 B5 00 B4 \n\ +movdqa %%xmm1, %%xmm2 # R7 00 R6 00 R5 00 R4 00 \n\ +punpcklwd %%xmm0, %%xmm1 # 00 B5 G5 R5 00 B4 G4 R4 \n\ +movdqu %%xmm1, 32(%3) # Store ABGR11 ABGR10 ABGR9 ABGR8 \n\ +punpckhwd %%xmm0, %%xmm2 # B7 G7 R7 00 B6 G6 R6 00 \n\ +movdqu %%xmm2, 48(%3) # Store ABGR15 ABGR14 ABGR13 ABGR12 \n\ " -#define MMX_INTRINSICS_UNPACK_32_BGRA \ - mm3 = _mm_setzero_si64(); \ - mm4 = mm2; \ - mm4 = _mm_unpacklo_pi8(mm4, mm0); \ - mm1 = _mm_unpacklo_pi8(mm1, mm3); \ - 
mm5 = mm3; \ - mm3 = _mm_unpacklo_pi16(mm3, mm4); \ - *(uint64_t *)p_buffer = (uint64_t)mm3; \ - mm5 = _mm_unpackhi_pi16(mm5, mm4); \ - *(uint64_t *)(p_buffer + 2) = (uint64_t)mm5; \ - mm3 = _mm_setzero_si64(); \ - mm4 = mm2; \ - mm0 = _mm_unpackhi_pi8(mm0, mm4); \ - mm1 = _mm_unpackhi_pi8(mm1, mm3); \ - mm5 = mm3; \ - mm3 = _mm_unpacklo_pi16(mm3, mm1); \ - *(uint64_t *)(p_buffer + 4) = (uint64_t)mm3; \ - mm5 = _mm_unpackhi_pi16(mm5, mm4); \ - *(uint64_t *)(p_buffer + 6) = (uint64_t)mm5; \ - -#define SSE2_INTRINSICS_UNPACK_32_BGRA_ALIGNED \ - xmm3 = _mm_setzero_si128(); \ - xmm4 = xmm2; \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm0); \ - xmm1 = _mm_unpacklo_epi8(xmm1, xmm3); \ - xmm5 = xmm3; \ - xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \ +#elif defined(HAVE_SSE2_INTRINSICS) + +/* SSE2 intrinsics */ + +#include + +#define SSE2_CALL(SSE2_INSTRUCTIONS) \ + do { \ + __m128i xmm0, xmm1, xmm2, xmm3, \ + xmm4, xmm5, xmm6, xmm7; \ + SSE2_INSTRUCTIONS \ + } while(0) + +#define SSE2_END _mm_sfence() + +#define SSE2_INIT_16_ALIGNED \ + xmm0 = _mm_loadl_epi64((__m128i *)p_u); \ + xmm1 = _mm_loadl_epi64((__m128i *)p_v); \ + xmm4 = _mm_setzero_si128(); \ + xmm6 = _mm_load_si128((__m128i *)p_y); + +#define SSE2_INIT_16_UNALIGNED \ + xmm0 = _mm_loadl_epi64((__m128i *)p_u); \ + xmm1 = _mm_loadl_epi64((__m128i *)p_v); \ + xmm4 = _mm_setzero_si128(); \ + xmm6 = _mm_loadu_si128((__m128i *)p_y); \ + _mm_prefetch(p_buffer, _MM_HINT_NTA); + +#define SSE2_INIT_32_ALIGNED \ + xmm0 = _mm_loadl_epi64((__m128i *)p_u); \ + xmm1 = _mm_loadl_epi64((__m128i *)p_v); \ + xmm4 = _mm_setzero_si128(); \ + xmm6 = _mm_load_si128((__m128i *)p_y); + +#define SSE2_INIT_32_UNALIGNED \ + xmm0 = _mm_loadl_epi64((__m128i *)p_u); \ + xmm1 = _mm_loadl_epi64((__m128i *)p_v); \ + xmm4 = _mm_setzero_si128(); \ + xmm6 = _mm_loadu_si128((__m128i *)p_y); \ + _mm_prefetch(p_buffer, _MM_HINT_NTA); + +#define SSE2_YUV_MUL \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm4); \ + xmm1 = _mm_unpacklo_epi8(xmm1, xmm4); \ + xmm5 = _mm_set1_epi32(0x00800080UL); \ + xmm0 = _mm_subs_epi16(xmm0, xmm5); \ + xmm1 = _mm_subs_epi16(xmm1, xmm5); \ + xmm0 = _mm_slli_epi16(xmm0, 3); \ + xmm1 = _mm_slli_epi16(xmm1, 3); \ + xmm2 = xmm0; \ + xmm3 = xmm1; \ + xmm5 = _mm_set1_epi32(0xf37df37dUL); \ + xmm2 = _mm_mulhi_epi16(xmm2, xmm5); \ + xmm5 = _mm_set1_epi32(0xe5fce5fcUL); \ + xmm3 = _mm_mulhi_epi16(xmm3, xmm5); \ + xmm5 = _mm_set1_epi32(0x40934093UL); \ + xmm0 = _mm_mulhi_epi16(xmm0, xmm5); \ + xmm5 = _mm_set1_epi32(0x33123312UL); \ + xmm1 = _mm_mulhi_epi16(xmm1, xmm5); \ + xmm2 = _mm_adds_epi16(xmm2, xmm3); \ + \ + xmm5 = _mm_set1_epi32(0x10101010UL); \ + xmm6 = _mm_subs_epu8(xmm6, xmm5); \ + xmm7 = xmm6; \ + xmm5 = _mm_set1_epi32(0x00ff00ffUL); \ + xmm6 = _mm_and_si128(xmm6, xmm5); \ + xmm7 = _mm_srli_epi16(xmm7, 8); \ + xmm6 = _mm_slli_epi16(xmm6, 3); \ + xmm7 = _mm_slli_epi16(xmm7, 3); \ + xmm5 = _mm_set1_epi32(0x253f253fUL); \ + xmm6 = _mm_mulhi_epi16(xmm6, xmm5); \ + xmm7 = _mm_mulhi_epi16(xmm7, xmm5); + +#define SSE2_YUV_ADD \ + xmm3 = xmm0; \ + xmm4 = xmm1; \ + xmm5 = xmm2; \ + xmm0 = _mm_adds_epi16(xmm0, xmm6); \ + xmm3 = _mm_adds_epi16(xmm3, xmm7); \ + xmm1 = _mm_adds_epi16(xmm1, xmm6); \ + xmm4 = _mm_adds_epi16(xmm4, xmm7); \ + xmm2 = _mm_adds_epi16(xmm2, xmm6); \ + xmm5 = _mm_adds_epi16(xmm5, xmm7); \ + \ + xmm0 = _mm_packus_epi16(xmm0, xmm0); \ + xmm1 = _mm_packus_epi16(xmm1, xmm1); \ + xmm2 = _mm_packus_epi16(xmm2, xmm2); \ + \ + xmm3 = _mm_packus_epi16(xmm3, xmm3); \ + xmm4 = _mm_packus_epi16(xmm4, xmm4); \ + xmm5 = _mm_packus_epi16(xmm5, xmm5); \ + \ + xmm0 = 
_mm_unpacklo_epi8(xmm0, xmm3); \ + xmm1 = _mm_unpacklo_epi8(xmm1, xmm4); \ + xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); + +#define SSE2_UNPACK_15_ALIGNED \ + xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \ + xmm0 = _mm_and_si128(xmm0, xmm5); \ + xmm0 = _mm_srli_epi16(xmm0, 3); \ + xmm2 = _mm_and_si128(xmm2, xmm5); \ + xmm1 = _mm_and_si128(xmm1, xmm5); \ + xmm1 = _mm_srli_epi16(xmm1, 1); \ + xmm4 = _mm_setzero_si128(); \ + xmm5 = xmm0; \ + xmm7 = xmm2; \ + \ + xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm2 = _mm_slli_epi16(xmm2, 2); \ + xmm0 = _mm_or_si128(xmm0, xmm2); \ + _mm_stream_si128((__m128i*)p_buffer, xmm0); \ + \ + xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \ + xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \ + xmm7 = _mm_slli_epi16(xmm7, 2); \ + xmm5 = _mm_or_si128(xmm5, xmm7); \ + _mm_stream_si128((__m128i*)(p_buffer+8), xmm5); + +#define SSE2_UNPACK_15_UNALIGNED \ + xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \ + xmm0 = _mm_and_si128(xmm0, xmm5); \ + xmm0 = _mm_srli_epi16(xmm0, 3); \ + xmm2 = _mm_and_si128(xmm2, xmm5); \ + xmm1 = _mm_and_si128(xmm1, xmm5); \ + xmm1 = _mm_srli_epi16(xmm1, 1); \ + xmm4 = _mm_setzero_si128(); \ + xmm5 = xmm0; \ + xmm7 = xmm2; \ + \ + xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm2 = _mm_slli_epi16(xmm2, 2); \ + xmm0 = _mm_or_si128(xmm0, xmm2); \ + _mm_storeu_si128((__m128i*)p_buffer, xmm0); \ + \ + xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \ + xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \ + xmm7 = _mm_slli_epi16(xmm7, 2); \ + xmm5 = _mm_or_si128(xmm5, xmm7); \ + _mm_storeu_si128((__m128i*)(p_buffer+16), xmm5); + +#define SSE2_UNPACK_16_ALIGNED \ + xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \ + xmm0 = _mm_and_si128(xmm0, xmm5); \ + xmm1 = _mm_and_si128(xmm1, xmm5); \ + xmm5 = _mm_set1_epi32(0xfcfcfcfcUL); \ + xmm2 = _mm_and_si128(xmm2, xmm5); \ + xmm0 = _mm_srli_epi16(xmm0, 3); \ + xmm4 = _mm_setzero_si128(); \ + xmm5 = xmm0; \ + xmm7 = xmm2; \ + \ + xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm2 = _mm_slli_epi16(xmm2, 3); \ + xmm0 = _mm_or_si128(xmm0, xmm2); \ + _mm_stream_si128((__m128i*)p_buffer, xmm0); \ + \ + xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \ + xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \ + xmm7 = _mm_slli_epi16(xmm7, 3); \ + xmm5 = _mm_or_si128(xmm5, xmm7); \ + _mm_stream_si128((__m128i*)(p_buffer+8), xmm5); + +#define SSE2_UNPACK_16_UNALIGNED \ + xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \ + xmm0 = _mm_and_si128(xmm0, xmm5); \ + xmm1 = _mm_and_si128(xmm1, xmm5); \ + xmm5 = _mm_set1_epi32(0xfcfcfcfcUL); \ + xmm2 = _mm_and_si128(xmm2, xmm5); \ + xmm0 = _mm_srli_epi16(xmm0, 3); \ + xmm4 = _mm_setzero_si128(); \ + xmm5 = xmm0; \ + xmm7 = xmm2; \ + \ + xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm2 = _mm_slli_epi16(xmm2, 3); \ + xmm0 = _mm_or_si128(xmm0, xmm2); \ + _mm_storeu_si128((__m128i*)p_buffer, xmm0); \ + \ + xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \ + xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \ + xmm7 = _mm_slli_epi16(xmm7, 3); \ + xmm5 = _mm_or_si128(xmm5, xmm7); \ + _mm_storeu_si128((__m128i*)(p_buffer+8), xmm5); + +#define SSE2_UNPACK_32_ARGB_ALIGNED \ + xmm3 = _mm_setzero_si128(); \ + xmm4 = xmm0; \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \ + xmm5 = xmm1; \ + xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \ + xmm6 = xmm4; \ + xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \ + _mm_stream_si128((__m128i*)(p_buffer), xmm4); \ + xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \ + _mm_stream_si128((__m128i*)(p_buffer+4), xmm6); \ + xmm0 = _mm_unpackhi_epi8(xmm0, 
xmm2); \ + xmm1 = _mm_unpackhi_epi8(xmm1, xmm3); \ + xmm5 = xmm0; \ + xmm5 = _mm_unpacklo_epi16(xmm5, xmm1); \ + _mm_stream_si128((__m128i*)(p_buffer+8), xmm5); \ + xmm0 = _mm_unpackhi_epi16(xmm0, xmm1); \ + _mm_stream_si128((__m128i*)(p_buffer+12), xmm0); + +#define SSE2_UNPACK_32_ARGB_UNALIGNED \ + xmm3 = _mm_setzero_si128(); \ + xmm4 = xmm0; \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \ + xmm5 = xmm1; \ + xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \ + xmm6 = xmm4; \ + xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \ + _mm_storeu_si128((__m128i*)(p_buffer), xmm4); \ + xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \ + _mm_storeu_si128((__m128i*)(p_buffer+4), xmm6); \ + xmm0 = _mm_unpackhi_epi8(xmm0, xmm2); \ + xmm1 = _mm_unpackhi_epi8(xmm1, xmm3); \ + xmm5 = xmm0; \ + xmm5 = _mm_unpacklo_epi16(xmm5, xmm1); \ + _mm_storeu_si128((__m128i*)(p_buffer+8), xmm5); \ + xmm0 = _mm_unpackhi_epi16(xmm0, xmm1); \ + _mm_storeu_si128((__m128i*)(p_buffer+12), xmm0); + +#define SSE2_UNPACK_32_BGRA_ALIGNED \ + xmm3 = _mm_setzero_si128(); \ + xmm4 = xmm2; \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm0); \ + xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ + xmm5 = xmm3; \ + xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \ _mm_stream_si128((__m128i*)(p_buffer), xmm3); \ - xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \ + xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \ _mm_stream_si128((__m128i*)(p_buffer+4), xmm5); \ - xmm3 = _mm_setzero_si128(); \ - xmm4 = xmm2; \ - xmm0 = _mm_unpackhi_epi8(xmm0, xmm4); \ - xmm1 = _mm_unpackhi_epi8(xmm1, xmm3); \ - xmm5 = xmm3; \ - xmm3 = _mm_unpacklo_epi16(xmm3, xmm1); \ - _mm_stream_si128((__m128i*)(p_buffer+8), xmm3); \ - xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \ - _mm_stream_si128((__m128i*)(p_buffer+12), xmm5); \ - -#define SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED \ - xmm3 = _mm_setzero_si128(); \ - xmm4 = xmm2; \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm0); \ - xmm1 = _mm_unpacklo_epi8(xmm1, xmm3); \ - xmm5 = xmm3; \ - xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \ + xmm6 = _mm_setzero_si128(); \ + xmm2 = _mm_unpackhi_epi8(xmm2, xmm0); \ + xmm6 = _mm_unpackhi_epi8(xmm6, xmm1); \ + xmm0 = xmm6; \ + xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \ + _mm_stream_si128((__m128i*)(p_buffer+8), xmm6); \ + xmm0 = _mm_unpackhi_epi16(xmm0, xmm2); \ + _mm_stream_si128((__m128i*)(p_buffer+12), xmm0); + +#define SSE2_UNPACK_32_BGRA_UNALIGNED \ + xmm3 = _mm_setzero_si128(); \ + xmm4 = xmm2; \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm0); \ + xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ + xmm5 = xmm3; \ + xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \ _mm_storeu_si128((__m128i*)(p_buffer), xmm3); \ - xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \ + xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \ _mm_storeu_si128((__m128i*)(p_buffer+4), xmm5); \ - xmm3 = _mm_setzero_si128(); \ - xmm4 = xmm2; \ - xmm0 = _mm_unpackhi_epi8(xmm0, xmm4); \ - xmm1 = _mm_unpackhi_epi8(xmm1, xmm3); \ - xmm5 = xmm3; \ - xmm3 = _mm_unpacklo_epi16(xmm3, xmm1); \ - _mm_storeu_si128((__m128i*)(p_buffer+8), xmm3); \ - xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \ - _mm_storeu_si128((__m128i*)(p_buffer+12), xmm5); \ + xmm6 = _mm_setzero_si128(); \ + xmm2 = _mm_unpackhi_epi8(xmm2, xmm0); \ + xmm6 = _mm_unpackhi_epi8(xmm6, xmm1); \ + xmm0 = xmm6; \ + xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \ + _mm_storeu_si128((__m128i*)(p_buffer+8), xmm6); \ + xmm0 = _mm_unpackhi_epi16(xmm0, xmm2); \ + _mm_storeu_si128((__m128i*)(p_buffer+12), xmm0); + +#define SSE2_UNPACK_32_ABGR_ALIGNED \ + ; + +#define SSE2_UNPACK_32_ABGR_UNALIGNED \ + ; +#endif + +#endif diff --git a/modules/video_chroma/i420_yuy2.c b/modules/video_chroma/i420_yuy2.c index 
8a76fcb46f..dc08896f5e 100644 --- a/modules/video_chroma/i420_yuy2.c +++ b/modules/video_chroma/i420_yuy2.c @@ -307,7 +307,7 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source, #if defined (MODULE_NAME_IS_i420_yuy2_mmx) /* re-enable FPU registers */ - __asm__ __volatile__ ( "emms" ); + MMX_END; #endif #if defined (MODULE_NAME_IS_i420_yuy2_altivec) @@ -348,8 +348,6 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source, p_line1 += i_dest_margin; p_line2 += i_dest_margin; } - /* make sure all SSE2 stores are visible thereafter */ - __asm__ __volatile__ ( "sfence" ); } else { @@ -379,6 +377,8 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source, p_line2 += i_dest_margin; } } + /* make sure all SSE2 stores are visible thereafter */ + SSE2_END; #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2) } @@ -518,7 +518,7 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, #if defined (MODULE_NAME_IS_i420_yuy2_mmx) /* re-enable FPU registers */ - __asm__ __volatile__ ( "emms" ); + MMX_END; #endif #if defined (MODULE_NAME_IS_i420_yuy2_altivec) @@ -558,8 +558,6 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, p_line1 += i_dest_margin; p_line2 += i_dest_margin; } - /* make sure all SSE2 stores are visible thereafter */ - __asm__ __volatile__ ( "sfence" ); } else { @@ -589,6 +587,8 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, p_line2 += i_dest_margin; } } + /* make sure all SSE2 stores are visible thereafter */ + SSE2_END; #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2) } @@ -727,7 +727,7 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source, #if defined (MODULE_NAME_IS_i420_yuy2_mmx) /* re-enable FPU registers */ - __asm__ __volatile__ ( "emms" ); + MMX_END; #endif #if defined (MODULE_NAME_IS_i420_yuy2_altivec) @@ -767,8 +767,6 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source, p_line1 += i_dest_margin; p_line2 += i_dest_margin; } - /* make sure all SSE2 stores are visible thereafter */ - __asm__ __volatile__ ( "sfence" ); } else { @@ -798,6 +796,8 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source, p_line2 += i_dest_margin; } } + /* make sure all SSE2 stores are visible thereafter */ + SSE2_END; #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2) } @@ -871,7 +871,7 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source, #if defined (MODULE_NAME_IS_i420_yuy2_mmx) /* re-enable FPU registers */ - __asm__ __volatile__ ( "emms" ); + MMX_END; #endif #else // defined(MODULE_NAME_IS_i420_yuy2_sse2) @@ -907,8 +907,6 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source, p_line1 += i_dest_margin; p_line2 += i_dest_margin; } - /* make sure all SSE2 stores are visible thereafter */ - __asm__ __volatile__ ( "sfence" ); } else { @@ -938,6 +936,8 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source, p_line2 += i_dest_margin; } } + /* make sure all SSE2 stores are visible thereafter */ + SSE2_END; #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2) } #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec) diff --git a/modules/video_chroma/i420_yuy2.h b/modules/video_chroma/i420_yuy2.h index 441e578481..a881c6e5a8 100644 --- a/modules/video_chroma/i420_yuy2.h +++ b/modules/video_chroma/i420_yuy2.h @@ -24,17 +24,26 @@ #ifdef MODULE_NAME_IS_i420_yuy2_mmx -#define MMX_CALL(MMX_INSTRUCTIONS) \ - do { \ - __asm__ __volatile__( \ - ".p2align 3 \n\t" \ - MMX_INSTRUCTIONS \ - : \ - : "r" (p_line1), "r" (p_line2), "r" (p_y1), "r" 
(p_y2), \ - "r" (p_u), "r" (p_v) ); \ - p_line1 += 16; p_line2 += 16; p_y1 += 8; p_y2 += 8; p_u += 4; p_v += 4; \ +#if defined(CAN_COMPILE_MMX) + +/* MMX assembly */ + +#define MMX_CALL(MMX_INSTRUCTIONS) \ + do { \ + __asm__ __volatile__( \ + ".p2align 3 \n\t" \ + MMX_INSTRUCTIONS \ + : \ + : "r" (p_line1), "r" (p_line2), \ + "r" (p_y1), "r" (p_y2), \ + "r" (p_u), "r" (p_v) ); \ + p_line1 += 16; p_line2 += 16; \ + p_y1 += 8; p_y2 += 8; \ + p_u += 4; p_v += 4; \ } while(0) +#define MMX_END __asm__ __volatile__ ( "emms" ) + #define MMX_YUV420_YUYV " \n\ movd (%4), %%mm1 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0 \n\ movd (%5), %%mm2 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0 \n\ @@ -111,23 +120,99 @@ packuswb %%mm1, %%mm1 # pack Y Y6 Y4 Y2 Y0 Y6 Y4 Y2 Y0 \n\ punpcklbw %%mm2, %%mm1 # v2 Y6 u2 Y4 v0 Y2 u0 Y0 \n\ movq %%mm1, (%1) # Store YUYV \n\ " +#elif defined(HAVE_MMX_INTRINSICS) + +/* MMX intrinsics */ + +#include <mmintrin.h> + +#define MMX_CALL(MMX_INSTRUCTIONS) \ + do { \ + __m64 mm0, mm1, mm2, mm3, mm4; \ + MMX_INSTRUCTIONS \ + p_line1 += 16; p_line2 += 16; \ + p_y1 += 8; p_y2 += 8; \ + p_u += 4; p_v += 4; \ + } while(0) + +#define MMX_END _mm_empty() + +#define MMX_YUV420_YUYV \ + mm1 = _mm_cvtsi32_si64(*(int*)p_u); \ + mm2 = _mm_cvtsi32_si64(*(int*)p_v); \ + mm0 = (__m64)*(uint64_t*)p_y1; \ + mm3 = (__m64)*(uint64_t*)p_y2; \ + mm1 = _mm_unpacklo_pi8(mm1, mm2); \ + mm2 = mm0; \ + mm2 = _mm_unpacklo_pi8(mm2, mm1); \ + *(uint64_t*)p_line1 = (uint64_t)mm2; \ + mm0 = _mm_unpackhi_pi8(mm0, mm1); \ + *(uint64_t*)(p_line1 + 8) = (uint64_t)mm0; \ + mm4 = mm3; \ + mm4 = _mm_unpacklo_pi8(mm4, mm1); \ + *(uint64_t*)p_line2 = (uint64_t)mm4; \ + mm3 = _mm_unpackhi_pi8(mm3, mm1); \ + *(uint64_t*)(p_line2 + 8) = (uint64_t)mm3; + +#define MMX_YUV420_YVYU \ + mm2 = _mm_cvtsi32_si64(*(int*)p_u); \ + mm1 = _mm_cvtsi32_si64(*(int*)p_v); \ + mm0 = (__m64)*(uint64_t*)p_y1; \ + mm3 = (__m64)*(uint64_t*)p_y2; \ + mm1 = _mm_unpacklo_pi8(mm1, mm2); \ + mm2 = mm0; \ + mm2 = _mm_unpacklo_pi8(mm2, mm1); \ + *(uint64_t*)p_line1 = (uint64_t)mm2; \ + mm0 = _mm_unpackhi_pi8(mm0, mm1); \ + *(uint64_t*)(p_line1 + 8) = (uint64_t)mm0; \ + mm4 = mm3; \ + mm4 = _mm_unpacklo_pi8(mm4, mm1); \ + *(uint64_t*)p_line2 = (uint64_t)mm4; \ + mm3 = _mm_unpackhi_pi8(mm3, mm1); \ + *(uint64_t*)(p_line2 + 8) = (uint64_t)mm3; + +#define MMX_YUV420_UYVY \ + mm1 = _mm_cvtsi32_si64(*(int*)p_u); \ + mm2 = _mm_cvtsi32_si64(*(int*)p_v); \ + mm0 = (__m64)*(uint64_t*)p_y1; \ + mm3 = (__m64)*(uint64_t*)p_y2; \ + mm1 = _mm_unpacklo_pi8(mm1, mm2); \ + mm2 = mm1; \ + mm2 = _mm_unpacklo_pi8(mm2, mm0); \ + *(uint64_t*)p_line1 = (uint64_t)mm2; \ + mm2 = mm1; \ + mm2 = _mm_unpackhi_pi8(mm2, mm0); \ + *(uint64_t*)(p_line1 + 8) = (uint64_t)mm2; \ + mm4 = mm1; \ + mm4 = _mm_unpacklo_pi8(mm4, mm3); \ + *(uint64_t*)p_line2 = (uint64_t)mm4; \ + mm1 = _mm_unpackhi_pi8(mm1, mm3); \ + *(uint64_t*)(p_line2 + 8) = (uint64_t)mm1; + +#endif #elif defined( MODULE_NAME_IS_i420_yuy2_sse2 ) -/* SSE2 */ - -#define SSE2_CALL(SSE2_INSTRUCTIONS) \ - do { \ - __asm__ __volatile__( \ - ".p2align 3 \n\t" \ - SSE2_INSTRUCTIONS \ - : \ - : "r" (p_line1), "r" (p_line2), "r" (p_y1), "r" (p_y2), \ - "r" (p_u), "r" (p_v) ); \ - p_line1 += 32; p_line2 += 32; p_y1 += 16; p_y2 += 16; \ - p_u += 8; p_v += 8; \ +#if defined(CAN_COMPILE_SSE2) + +/* SSE2 assembly */ + +#define SSE2_CALL(SSE2_INSTRUCTIONS) \ + do { \ + __asm__ __volatile__( \ + ".p2align 3 \n\t" \ + SSE2_INSTRUCTIONS \ + : \ + : "r" (p_line1), "r" (p_line2), \ + "r" (p_y1), "r" (p_y2), \ + "r" (p_u), "r" (p_v) ); \ + p_line1 += 32; p_line2 += 32; \ + p_y1 += 16; p_y2 += 16; \ + p_u += 8; p_v += 8; \ }
while(0) +#define SSE2_END __asm__ __volatile__ ( "sfence" ::: "memory" ) + #define SSE2_YUV420_YUYV_ALIGNED " \n\ movq (%4), %%xmm1 # Load 8 Cb u7 u6 u5 u4 u3 u2 u1 u0 \n\ movq (%5), %%xmm2 # Load 8 Cr v7 06 v5 v4 v3 v2 v1 v0 \n\ @@ -151,6 +236,8 @@ movq (%4), %%xmm1 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\ movq (%5), %%xmm2 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\ movdqu (%2), %%xmm0 # Load 16 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\ movdqu (%3), %%xmm3 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ +prefetchnta (%0) # Tell CPU not to cache output YUYV data \n\ +prefetchnta (%1) # Tell CPU not to cache output YUYV data \n\ punpcklbw %%xmm2, %%xmm1 # v3 u3 v2 u2 v1 u1 v0 u0 \n\ movdqa %%xmm0, %%xmm2 # y7 y6 y5 y4 y3 y2 y1 y0 \n\ punpcklbw %%xmm1, %%xmm2 # v1 y3 u1 y2 v0 y1 u0 y0 \n\ @@ -164,80 +251,213 @@ punpckhbw %%xmm1, %%xmm3 # v3 Y7 u3 Y6 v2 Y5 u2 Y4 \n\ movdqu %%xmm3, 16(%1) # Store high YUYV \n\ " -#define SSE2_YUV420_YVYU_ALIGNED " \n\ -movq (%4), %%xmm2 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\ -movq (%5), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\ -movdqa (%2), %%xmm0 # Load 16 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\ -movdqa (%3), %%xmm3 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ -punpcklbw %%xmm2, %%xmm1 # u3 v3 u2 v2 u1 v1 u0 v0 \n\ -movdqa %%xmm0, %%xmm2 # y7 y6 y5 y4 y3 y2 y1 y0 \n\ -punpcklbw %%xmm1, %%xmm2 # u1 y3 v1 y2 u0 y1 v0 y0 \n\ -movntdq %%xmm2, (%0) # Store low YUYV \n\ -punpckhbw %%xmm1, %%xmm0 # u3 y7 v3 y6 u2 y5 v2 y4 \n\ -movntdq %%xmm0, 16(%0) # Store high YUYV \n\ -movdqa %%xmm3, %%xmm4 # Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ -punpcklbw %%xmm1, %%xmm4 # u1 Y3 v1 Y2 u0 Y1 v0 Y0 \n\ -movntdq %%xmm4, (%1) # Store low YUYV \n\ -punpckhbw %%xmm1, %%xmm3 # u3 Y7 v3 Y6 u2 Y5 v2 Y4 \n\ -movntdq %%xmm3, 16(%1) # Store high YUYV \n\ +#define SSE2_YUV420_YVYU_ALIGNED " \n\ +movq (%4), %%xmm2 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\ +movq (%5), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\ +movdqa (%2), %%xmm0 # Load 16 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\ +movdqa (%3), %%xmm3 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ +punpcklbw %%xmm2, %%xmm1 # u3 v3 u2 v2 u1 v1 u0 v0 \n\ +movdqa %%xmm0, %%xmm2 # y7 y6 y5 y4 y3 y2 y1 y0 \n\ +punpcklbw %%xmm1, %%xmm2 # u1 y3 v1 y2 u0 y1 v0 y0 \n\ +movntdq %%xmm2, (%0) # Store low YUYV \n\ +punpckhbw %%xmm1, %%xmm0 # u3 y7 v3 y6 u2 y5 v2 y4 \n\ +movntdq %%xmm0, 16(%0) # Store high YUYV \n\ +movdqa %%xmm3, %%xmm4 # Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ +punpcklbw %%xmm1, %%xmm4 # u1 Y3 v1 Y2 u0 Y1 v0 Y0 \n\ +movntdq %%xmm4, (%1) # Store low YUYV \n\ +punpckhbw %%xmm1, %%xmm3 # u3 Y7 v3 Y6 u2 Y5 v2 Y4 \n\ +movntdq %%xmm3, 16(%1) # Store high YUYV \n\ " -#define SSE2_YUV420_YVYU_UNALIGNED " \n\ -movq (%4), %%xmm2 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\ -movq (%5), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\ -movdqu (%2), %%xmm0 # Load 16 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\ -movdqu (%3), %%xmm3 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ -punpcklbw %%xmm2, %%xmm1 # u3 v3 u2 v2 u1 v1 u0 v0 \n\ -movdqu %%xmm0, %%xmm2 # y7 y6 y5 y4 y3 y2 y1 y0 \n\ -punpcklbw %%xmm1, %%xmm2 # u1 y3 v1 y2 u0 y1 v0 y0 \n\ -movdqu %%xmm2, (%0) # Store low YUYV \n\ -punpckhbw %%xmm1, %%xmm0 # u3 y7 v3 y6 u2 y5 v2 y4 \n\ -movdqu %%xmm0, 16(%0) # Store high YUYV \n\ -movdqu %%xmm3, %%xmm4 # Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ -punpcklbw %%xmm1, %%xmm4 # u1 Y3 v1 Y2 u0 Y1 v0 Y0 \n\ -movdqu %%xmm4, (%1) # Store low YUYV \n\ -punpckhbw %%xmm1, %%xmm3 # u3 Y7 v3 Y6 u2 Y5 v2 Y4 \n\ -movdqu %%xmm3, 16(%1) # Store high YUYV \n\ +#define SSE2_YUV420_YVYU_UNALIGNED " \n\ +movq (%4), %%xmm2 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\ +movq (%5), %%xmm1 # 
Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\ +movdqu (%2), %%xmm0 # Load 16 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\ +movdqu (%3), %%xmm3 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ +prefetchnta (%0) # Tell CPU not to cache output YVYU data \n\ +prefetchnta (%1) # Tell CPU not to cache output YVYU data \n\ +punpcklbw %%xmm2, %%xmm1 # u3 v3 u2 v2 u1 v1 u0 v0 \n\ +movdqu %%xmm0, %%xmm2 # y7 y6 y5 y4 y3 y2 y1 y0 \n\ +punpcklbw %%xmm1, %%xmm2 # u1 y3 v1 y2 u0 y1 v0 y0 \n\ +movdqu %%xmm2, (%0) # Store low YUYV \n\ +punpckhbw %%xmm1, %%xmm0 # u3 y7 v3 y6 u2 y5 v2 y4 \n\ +movdqu %%xmm0, 16(%0) # Store high YUYV \n\ +movdqu %%xmm3, %%xmm4 # Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ +punpcklbw %%xmm1, %%xmm4 # u1 Y3 v1 Y2 u0 Y1 v0 Y0 \n\ +movdqu %%xmm4, (%1) # Store low YUYV \n\ +punpckhbw %%xmm1, %%xmm3 # u3 Y7 v3 Y6 u2 Y5 v2 Y4 \n\ +movdqu %%xmm3, 16(%1) # Store high YUYV \n\ " -#define SSE2_YUV420_UYVY_ALIGNED " \n\ -movq (%4), %%xmm1 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\ -movq (%5), %%xmm2 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\ -movdqa (%2), %%xmm0 # Load 16 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\ -movdqa (%3), %%xmm3 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ -punpcklbw %%xmm2, %%xmm1 # v3 u3 v2 u2 v1 u1 v0 u0 \n\ -movdqa %%xmm1, %%xmm2 # v3 u3 v2 u2 v1 u1 v0 u0 \n\ -punpcklbw %%xmm0, %%xmm2 # y3 v1 y2 u1 y1 v0 y0 u0 \n\ -movntdq %%xmm2, (%0) # Store low UYVY \n\ -movdqa %%xmm1, %%xmm2 # u3 v3 u2 v2 u1 v1 u0 v0 \n\ -punpckhbw %%xmm0, %%xmm2 # y3 v1 y2 u1 y1 v0 y0 u0 \n\ -movntdq %%xmm2, 16(%0) # Store high UYVY \n\ -movdqa %%xmm1, %%xmm4 # u3 v3 u2 v2 u1 v1 u0 v0 \n\ -punpcklbw %%xmm3, %%xmm4 # Y3 v1 Y2 u1 Y1 v0 Y0 u0 \n\ -movntdq %%xmm4, (%1) # Store low UYVY \n\ -punpckhbw %%xmm3, %%xmm1 # Y7 v3 Y6 u3 Y5 v2 Y4 u2 \n\ -movntdq %%xmm1, 16(%1) # Store high UYVY \n\ +#define SSE2_YUV420_UYVY_ALIGNED " \n\ +movq (%4), %%xmm1 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\ +movq (%5), %%xmm2 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\ +movdqa (%2), %%xmm0 # Load 16 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\ +movdqa (%3), %%xmm3 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ +punpcklbw %%xmm2, %%xmm1 # v3 u3 v2 u2 v1 u1 v0 u0 \n\ +movdqa %%xmm1, %%xmm2 # v3 u3 v2 u2 v1 u1 v0 u0 \n\ +punpcklbw %%xmm0, %%xmm2 # y3 v1 y2 u1 y1 v0 y0 u0 \n\ +movntdq %%xmm2, (%0) # Store low UYVY \n\ +movdqa %%xmm1, %%xmm2 # u3 v3 u2 v2 u1 v1 u0 v0 \n\ +punpckhbw %%xmm0, %%xmm2 # y3 v1 y2 u1 y1 v0 y0 u0 \n\ +movntdq %%xmm2, 16(%0) # Store high UYVY \n\ +movdqa %%xmm1, %%xmm4 # u3 v3 u2 v2 u1 v1 u0 v0 \n\ +punpcklbw %%xmm3, %%xmm4 # Y3 v1 Y2 u1 Y1 v0 Y0 u0 \n\ +movntdq %%xmm4, (%1) # Store low UYVY \n\ +punpckhbw %%xmm3, %%xmm1 # Y7 v3 Y6 u3 Y5 v2 Y4 u2 \n\ +movntdq %%xmm1, 16(%1) # Store high UYVY \n\ " -#define SSE2_YUV420_UYVY_UNALIGNED " \n\ -movq (%4), %%xmm1 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\ -movq (%5), %%xmm2 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\ -movdqu (%2), %%xmm0 # Load 16 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\ -movdqu (%3), %%xmm3 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ -punpcklbw %%xmm2, %%xmm1 # v3 u3 v2 u2 v1 u1 v0 u0 \n\ -movdqu %%xmm1, %%xmm2 # v3 u3 v2 u2 v1 u1 v0 u0 \n\ -punpcklbw %%xmm0, %%xmm2 # y3 v1 y2 u1 y1 v0 y0 u0 \n\ -movdqu %%xmm2, (%0) # Store low UYVY \n\ -movdqu %%xmm1, %%xmm2 # u3 v3 u2 v2 u1 v1 u0 v0 \n\ -punpckhbw %%xmm0, %%xmm2 # y3 v1 y2 u1 y1 v0 y0 u0 \n\ -movdqu %%xmm2, 16(%0) # Store high UYVY \n\ -movdqu %%xmm1, %%xmm4 # u3 v3 u2 v2 u1 v1 u0 v0 \n\ -punpcklbw %%xmm3, %%xmm4 # Y3 v1 Y2 u1 Y1 v0 Y0 u0 \n\ -movdqu %%xmm4, (%1) # Store low UYVY \n\ -punpckhbw %%xmm3, %%xmm1 # Y7 v3 Y6 u3 Y5 v2 Y4 u2 \n\ -movdqu %%xmm1, 16(%1) # Store high UYVY \n\ +#define 
SSE2_YUV420_UYVY_UNALIGNED " \n\ +movq (%4), %%xmm1 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\ +movq (%5), %%xmm2 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\ +movdqu (%2), %%xmm0 # Load 16 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\ +movdqu (%3), %%xmm3 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ +prefetchnta (%0) # Tell CPU not to cache output UYVY data \n\ +prefetchnta (%1) # Tell CPU not to cache output UYVY data \n\ +punpcklbw %%xmm2, %%xmm1 # v3 u3 v2 u2 v1 u1 v0 u0 \n\ +movdqu %%xmm1, %%xmm2 # v3 u3 v2 u2 v1 u1 v0 u0 \n\ +punpcklbw %%xmm0, %%xmm2 # y3 v1 y2 u1 y1 v0 y0 u0 \n\ +movdqu %%xmm2, (%0) # Store low UYVY \n\ +movdqu %%xmm1, %%xmm2 # u3 v3 u2 v2 u1 v1 u0 v0 \n\ +punpckhbw %%xmm0, %%xmm2 # y3 v1 y2 u1 y1 v0 y0 u0 \n\ +movdqu %%xmm2, 16(%0) # Store high UYVY \n\ +movdqu %%xmm1, %%xmm4 # u3 v3 u2 v2 u1 v1 u0 v0 \n\ +punpcklbw %%xmm3, %%xmm4 # Y3 v1 Y2 u1 Y1 v0 Y0 u0 \n\ +movdqu %%xmm4, (%1) # Store low UYVY \n\ +punpckhbw %%xmm3, %%xmm1 # Y7 v3 Y6 u3 Y5 v2 Y4 u2 \n\ +movdqu %%xmm1, 16(%1) # Store high UYVY \n\ " +#elif defined(HAVE_SSE2_INTRINSICS) + +/* SSE2 intrinsics */ + +#include <emmintrin.h> + +#define SSE2_CALL(SSE2_INSTRUCTIONS) \ + do { \ + __m128i xmm0, xmm1, xmm2, xmm3, xmm4; \ + SSE2_INSTRUCTIONS \ + p_line1 += 32; p_line2 += 32; \ + p_y1 += 16; p_y2 += 16; \ + p_u += 8; p_v += 8; \ + } while(0) + +#define SSE2_END _mm_sfence() + +#define SSE2_YUV420_YUYV_ALIGNED \ + xmm1 = _mm_loadl_epi64((__m128i *)p_u); \ + xmm2 = _mm_loadl_epi64((__m128i *)p_v); \ + xmm0 = _mm_load_si128((__m128i *)p_y1); \ + xmm3 = _mm_load_si128((__m128i *)p_y2); \ + xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \ + xmm2 = xmm0; \ + xmm2 = _mm_unpacklo_epi8(xmm2, xmm1); \ + _mm_stream_si128((__m128i*)(p_line1), xmm2); \ + xmm0 = _mm_unpackhi_epi8(xmm0, xmm1); \ + _mm_stream_si128((__m128i*)(p_line1+16), xmm0); \ + xmm4 = xmm3; \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm1); \ + _mm_stream_si128((__m128i*)(p_line2), xmm4); \ + xmm3 = _mm_unpackhi_epi8(xmm3, xmm1); \ + _mm_stream_si128((__m128i*)(p_line2+16), xmm3); + +#define SSE2_YUV420_YUYV_UNALIGNED \ + xmm1 = _mm_loadl_epi64((__m128i *)p_u); \ + xmm2 = _mm_loadl_epi64((__m128i *)p_v); \ + xmm0 = _mm_loadu_si128((__m128i *)p_y1); \ + xmm3 = _mm_loadu_si128((__m128i *)p_y2); \ + _mm_prefetch(p_line1, _MM_HINT_NTA); \ + _mm_prefetch(p_line2, _MM_HINT_NTA); \ + xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \ + xmm2 = xmm0; \ + xmm2 = _mm_unpacklo_epi8(xmm2, xmm1); \ + _mm_storeu_si128((__m128i*)(p_line1), xmm2); \ + xmm0 = _mm_unpackhi_epi8(xmm0, xmm1); \ + _mm_storeu_si128((__m128i*)(p_line1+16), xmm0); \ + xmm4 = xmm3; \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm1); \ + _mm_storeu_si128((__m128i*)(p_line2), xmm4); \ + xmm3 = _mm_unpackhi_epi8(xmm3, xmm1); \ + _mm_storeu_si128((__m128i*)(p_line2+16), xmm3); + +#define SSE2_YUV420_YVYU_ALIGNED \ + xmm1 = _mm_loadl_epi64((__m128i *)p_v); \ + xmm2 = _mm_loadl_epi64((__m128i *)p_u); \ + xmm0 = _mm_load_si128((__m128i *)p_y1); \ + xmm3 = _mm_load_si128((__m128i *)p_y2); \ + xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \ + xmm2 = xmm0; \ + xmm2 = _mm_unpacklo_epi8(xmm2, xmm1); \ + _mm_stream_si128((__m128i*)(p_line1), xmm2); \ + xmm0 = _mm_unpackhi_epi8(xmm0, xmm1); \ + _mm_stream_si128((__m128i*)(p_line1+16), xmm0); \ + xmm4 = xmm3; \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm1); \ + _mm_stream_si128((__m128i*)(p_line2), xmm4); \ + xmm3 = _mm_unpackhi_epi8(xmm3, xmm1); \ + _mm_stream_si128((__m128i*)(p_line2+16), xmm3); + +#define SSE2_YUV420_YVYU_UNALIGNED \ + xmm1 = _mm_loadl_epi64((__m128i *)p_v); \ + xmm2 = _mm_loadl_epi64((__m128i *)p_u); \ + xmm0 = _mm_loadu_si128((__m128i 
*)p_y1); \ + xmm3 = _mm_loadu_si128((__m128i *)p_y2); \ + _mm_prefetch(p_line1, _MM_HINT_NTA); \ + _mm_prefetch(p_line2, _MM_HINT_NTA); \ + xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \ + xmm2 = xmm0; \ + xmm2 = _mm_unpacklo_epi8(xmm2, xmm1); \ + _mm_storeu_si128((__m128i*)(p_line1), xmm2); \ + xmm0 = _mm_unpackhi_epi8(xmm0, xmm1); \ + _mm_storeu_si128((__m128i*)(p_line1+16), xmm0); \ + xmm4 = xmm3; \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm1); \ + _mm_storeu_si128((__m128i*)(p_line2), xmm4); \ + xmm3 = _mm_unpackhi_epi8(xmm3, xmm1); \ + _mm_storeu_si128((__m128i*)(p_line2+16), xmm3); + +#define SSE2_YUV420_UYVY_ALIGNED \ + xmm1 = _mm_loadl_epi64((__m128i *)p_u); \ + xmm2 = _mm_loadl_epi64((__m128i *)p_v); \ + xmm0 = _mm_load_si128((__m128i *)p_y1); \ + xmm3 = _mm_load_si128((__m128i *)p_y2); \ + xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \ + xmm2 = xmm1; \ + xmm2 = _mm_unpacklo_epi8(xmm2, xmm0); \ + _mm_stream_si128((__m128i*)(p_line1), xmm2); \ + xmm2 = xmm1; \ + xmm2 = _mm_unpackhi_epi8(xmm2, xmm0); \ + _mm_stream_si128((__m128i*)(p_line1+16), xmm2); \ + xmm4 = xmm1; \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm3); \ + _mm_stream_si128((__m128i*)(p_line2), xmm4); \ + xmm1 = _mm_unpackhi_epi8(xmm1, xmm3); \ + _mm_stream_si128((__m128i*)(p_line2+16), xmm1); + +#define SSE2_YUV420_UYVY_UNALIGNED \ + xmm1 = _mm_loadl_epi64((__m128i *)p_u); \ + xmm2 = _mm_loadl_epi64((__m128i *)p_v); \ + xmm0 = _mm_loadu_si128((__m128i *)p_y1); \ + xmm3 = _mm_loadu_si128((__m128i *)p_y2); \ + _mm_prefetch(p_line1, _MM_HINT_NTA); \ + _mm_prefetch(p_line2, _MM_HINT_NTA); \ + xmm1 = _mm_unpacklo_epi8(xmm1, xmm2); \ + xmm2 = xmm1; \ + xmm2 = _mm_unpacklo_epi8(xmm2, xmm0); \ + _mm_storeu_si128((__m128i*)(p_line1), xmm2); \ + xmm2 = xmm1; \ + xmm2 = _mm_unpackhi_epi8(xmm2, xmm0); \ + _mm_storeu_si128((__m128i*)(p_line1+16), xmm2); \ + xmm4 = xmm1; \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm3); \ + _mm_storeu_si128((__m128i*)(p_line2), xmm4); \ + xmm1 = _mm_unpackhi_epi8(xmm1, xmm3); \ + _mm_storeu_si128((__m128i*)(p_line2+16), xmm1); + +#endif + +#endif /* Used in both accelerated and C modules */ diff --git a/modules/video_chroma/i422_yuy2.c b/modules/video_chroma/i422_yuy2.c index 5afb683362..4bc4528912 100644 --- a/modules/video_chroma/i422_yuy2.c +++ b/modules/video_chroma/i422_yuy2.c @@ -5,6 +5,7 @@ * $Id$ * * Authors: Samuel Hocevar + * Damien Fouilleul * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -152,7 +153,7 @@ static void I422_YUY2( vout_thread_t *p_vout, picture_t *p_source, for( i_y = p_vout->render.i_height ; i_y-- ; ) { - uint8_t *p_line = p_pixels; + uint8_t *p_line = p_pixels; for( i_x = p_vout->render.i_width / 8 ; i_x-- ; ) { #if defined (MODULE_NAME_IS_i422_yuy2) @@ -160,15 +161,17 @@ static void I422_YUY2( vout_thread_t *p_vout, picture_t *p_source, C_YUV422_YUYV( p_line, p_y, p_u, p_v ); C_YUV422_YUYV( p_line, p_y, p_u, p_v ); C_YUV422_YUYV( p_line, p_y, p_u, p_v ); -#else - __asm__( ".p2align 3" MMX_YUV422_YUYV - : : "r" (p_line), "r" (p_y), "r" (p_u), "r" (p_v) ); - - p_line += 16; p_y += 8; p_u += 4; p_v += 4; +#elif defined (MODULE_NAME_IS_i422_yuy2_mmx) + MMX_CALL( MMX_YUV422_YUYV ); #endif } - p_pixels += i_pitch; + p_pixels += i_pitch; } +#if defined (MODULE_NAME_IS_i422_yuy2_mmx) + MMX_END; +#elif defined (MODULE_NAME_IS_i422_yuy2_sse2) + SSE2_END; +#endif } /***************************************************************************** @@ -187,7 +190,7 @@ static void I422_YVYU( vout_thread_t *p_vout, picture_t 
*p_source, for( i_y = p_vout->render.i_height ; i_y-- ; ) { - uint8_t *p_line = p_pixels; + uint8_t *p_line = p_pixels; for( i_x = p_vout->render.i_width / 8 ; i_x-- ; ) { #if defined (MODULE_NAME_IS_i422_yuy2) @@ -195,15 +198,17 @@ static void I422_YVYU( vout_thread_t *p_vout, picture_t *p_source, C_YUV422_YVYU( p_line, p_y, p_u, p_v ); C_YUV422_YVYU( p_line, p_y, p_u, p_v ); C_YUV422_YVYU( p_line, p_y, p_u, p_v ); -#else - __asm__( ".p2align 3" MMX_YUV422_YVYU - : : "r" (p_line), "r" (p_y), "r" (p_u), "r" (p_v) ); - - p_line += 16; p_y += 8; p_u += 4; p_v += 4; +#elif defined (MODULE_NAME_IS_i422_yuy2_mmx) + MMX_CALL( MMX_YUV422_YVYU ); #endif } - p_pixels += i_pitch; + p_pixels += i_pitch; } +#if defined (MODULE_NAME_IS_i422_yuy2_mmx) + MMX_END; +#elif defined (MODULE_NAME_IS_i422_yuy2_sse2) + SSE2_END; +#endif } /***************************************************************************** @@ -222,7 +227,7 @@ static void I422_UYVY( vout_thread_t *p_vout, picture_t *p_source, for( i_y = p_vout->render.i_height ; i_y-- ; ) { - uint8_t *p_line = p_pixels; + uint8_t *p_line = p_pixels; for( i_x = p_vout->render.i_width / 8 ; i_x-- ; ) { #if defined (MODULE_NAME_IS_i422_yuy2) @@ -230,15 +235,17 @@ static void I422_UYVY( vout_thread_t *p_vout, picture_t *p_source, C_YUV422_UYVY( p_line, p_y, p_u, p_v ); C_YUV422_UYVY( p_line, p_y, p_u, p_v ); C_YUV422_UYVY( p_line, p_y, p_u, p_v ); -#else - __asm__( ".p2align 3" MMX_YUV422_UYVY - : : "r" (p_line), "r" (p_y), "r" (p_u), "r" (p_v) ); - - p_line += 16; p_y += 8; p_u += 4; p_v += 4; +#elif defined (MODULE_NAME_IS_i422_yuy2_mmx) + MMX_CALL( MMX_YUV422_UYVY ); #endif } - p_pixels += i_pitch; + p_pixels += i_pitch; } +#if defined (MODULE_NAME_IS_i422_yuy2_mmx) + MMX_END; +#elif defined (MODULE_NAME_IS_i422_yuy2_sse2) + SSE2_END; +#endif } /***************************************************************************** @@ -275,14 +282,16 @@ static void I422_cyuv( vout_thread_t *p_vout, picture_t *p_source, C_YUV422_UYVY( p_line, p_y, p_u, p_v ); C_YUV422_UYVY( p_line, p_y, p_u, p_v ); C_YUV422_UYVY( p_line, p_y, p_u, p_v ); -#else - __asm__( ".p2align 3" MMX_YUV422_UYVY - : : "r" (p_line), "r" (p_y), "r" (p_u), "r" (p_v) ); - - p_line += 16; p_y += 8; p_u += 4; p_v += 4; +#elif defined (MODULE_NAME_IS_i422_yuy2_mmx) + MMX_CALL( MMX_YUV422_UYVY ); #endif } } +#if defined (MODULE_NAME_IS_i422_yuy2_mmx) + MMX_END; +#elif defined (MODULE_NAME_IS_i422_yuy2_sse2) + SSE2_END; +#endif } /***************************************************************************** diff --git a/modules/video_chroma/i422_yuy2.h b/modules/video_chroma/i422_yuy2.h index f502e710af..85794be1b9 100644 --- a/modules/video_chroma/i422_yuy2.h +++ b/modules/video_chroma/i422_yuy2.h @@ -5,6 +5,7 @@ * $Id$ * * Authors: Samuel Hocevar + * Damien Fouilleul * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -23,6 +24,24 @@ #ifdef MODULE_NAME_IS_i422_yuy2_mmx +#if defined(CAN_COMPILE_MMX) + +/* MMX assembly */ + +#define MMX_CALL(MMX_INSTRUCTIONS) \ + do { \ + __asm__ __volatile__( \ + ".p2align 3 \n\t" \ + MMX_INSTRUCTIONS \ + : \ + : "r" (p_line), "r" (p_y), \ + "r" (p_u), "r" (p_v) ); \ + p_line += 16; p_y += 8; \ + p_u += 4; p_v += 4; \ + } while(0) + +#define MMX_END __asm__ __volatile__ ( "emms" ) + #define MMX_YUV422_YUYV " \n\ movq (%1), %%mm0 # Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\ movd (%2), %%mm1 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0 \n\ @@ -62,7 +81,36 @@ movq %%mm1, 8(%0) # Store high 
UYVY \n\ #define MMX_YUV422_Y211 " \n\ " -#else +#elif defined(HAVE_MMX_INTRINSICS) + +/* MMX intrinsics */ + +#include <mmintrin.h> + +#define MMX_END _mm_empty() + +#endif + +#elif defined( MODULE_NAME_IS_i422_yuy2_sse2 ) + +#if defined(CAN_COMPILE_SSE2) + +/* SSE2 assembly */ + +#define SSE2_END __asm__ __volatile__ ( "sfence" ::: "memory" ) + +#elif defined(HAVE_SSE2_INTRINSICS) + +/* SSE2 intrinsics */ + +#include <emmintrin.h> + + +#define SSE2_END _mm_sfence() + +#endif + +#elif defined (MODULE_NAME_IS_i422_yuy2) #define C_YUV422_YUYV( p_line, p_y, p_u, p_v ) \ *(p_line)++ = *(p_y)++; \