From 9acaa4b2e175fb575070d684acdb178bc7a542d2 Mon Sep 17 00:00:00 2001 From: Damien Fouilleul Date: Fri, 15 Jun 2007 16:36:41 +0000 Subject: [PATCH] - video_chromas: more SSE2 and MMX support and optimization, added SSE2 i420 -> RGB acceleration --- configure.ac | 4 +- modules/video_chroma/Modules.am | 7 + modules/video_chroma/i420_rgb.c | 63 +- modules/video_chroma/i420_rgb.h | 7 +- modules/video_chroma/i420_rgb16.c | 1484 +++++++++++++++++++++++---- modules/video_chroma/i420_rgb_mmx.h | 734 +++++++++++-- modules/video_chroma/i420_yuy2.c | 20 +- modules/video_chroma/i420_yuy2.h | 24 +- 8 files changed, 2046 insertions(+), 297 deletions(-) diff --git a/configure.ac b/configure.ac index 70782d0f09..1f0ebc47c6 100644 --- a/configure.ac +++ b/configure.ac @@ -1274,7 +1274,7 @@ MMXEXT_MODULES="memcpymmxext" #MMXEXT_MODULES="${MMXEXT_MODULES} idctmmxext motionmmxext" THREEDNOW_MODULES="memcpy3dn" SSE_MODULES="" -SSE2_MODULES="i420_yuy2_sse2" +SSE2_MODULES="i420_rgb_sse2 i420_yuy2_sse2" ALTIVEC_MODULES="memcpyaltivec i420_yuy2_altivec" #ALTIVEC_MODULES="${ALTIVEC_MODULES} idctaltivec motionaltivec" @@ -1325,7 +1325,7 @@ AC_CACHE_CHECK([if \$CC groks SSE2 intrinsics], [ac_cv_c_sse2_intrinsics=no])]) if test "${ac_cv_c_sse2_intrinsics}" != "no"; then AC_DEFINE(HAVE_SSE2_INTRINSICS, 1, Define if SSE2 intrinsics are available.) - dnl VLC_ADD_CFLAGS([i420_rgb_sse2],[-msse2]) + VLC_ADD_CFLAGS([i420_rgb_sse2],[-msse2]) fi AC_CACHE_CHECK([if \$CC groks MMX inline assembly], diff --git a/modules/video_chroma/Modules.am b/modules/video_chroma/Modules.am index ae77124afa..2fe5c54029 100644 --- a/modules/video_chroma/Modules.am +++ b/modules/video_chroma/Modules.am @@ -13,6 +13,13 @@ SOURCES_i420_rgb_mmx = \ i420_rgb_mmx.h \ $(NULL) +SOURCES_i420_rgb_sse2 = \ + i420_rgb.c \ + i420_rgb.h \ + i420_rgb16.c \ + i420_rgb_mmx.h \ + $(NULL) + SOURCES_i420_yuy2 = \ i420_yuy2.c \ i420_yuy2.h \ diff --git a/modules/video_chroma/i420_rgb.c b/modules/video_chroma/i420_rgb.c index 512e8789ab..068c84504d 100644 --- a/modules/video_chroma/i420_rgb.c +++ b/modules/video_chroma/i420_rgb.c @@ -4,7 +4,8 @@ * Copyright (C) 2000, 2001, 2004 the VideoLAN team * $Id$ * - * Author: Sam Hocevar + * Authors: Sam Hocevar + * Damien Fouilleul * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -72,6 +73,11 @@ vlc_module_begin(); "RV15,RV16,RV24,RV32 conversions") ); set_capability( "chroma", 100 ); add_requirement( MMX ); +#elif defined (MODULE_NAME_IS_i420_rgb_sse2) + set_description( _( "SSE2 I420,IYUV,YV12 to " + "RV15,RV16,RV24,RV32 conversions") ); + set_capability( "chroma", 120 ); + add_requirement( SSE2 ); #endif set_callbacks( Activate, Deactivate ); vlc_module_end(); @@ -107,19 +113,30 @@ static int Activate( vlc_object_t *p_this ) #endif case VLC_FOURCC('R','V','1','5'): case VLC_FOURCC('R','V','1','6'): -#if defined (MODULE_NAME_IS_i420_rgb_mmx) +#if ! 
defined (MODULE_NAME_IS_i420_rgb) /* If we don't have support for the bitmasks, bail out */ - if( ( p_vout->output.i_rmask != 0x7c00 - || p_vout->output.i_gmask != 0x03e0 - || p_vout->output.i_bmask != 0x001f ) - && ( p_vout->output.i_rmask != 0xf800 - || p_vout->output.i_gmask != 0x07e0 - || p_vout->output.i_bmask != 0x001f ) ) + if( ( p_vout->output.i_rmask == 0x7c00 + && p_vout->output.i_gmask == 0x03e0 + && p_vout->output.i_bmask == 0x001f ) ) { - return -1; + /* R5G5B6 pixel format */ + msg_Dbg(p_this, "RGB pixel format is R5G5B5"); + p_vout->chroma.pf_convert = E_(I420_R5G5B5); } -#endif + else if( ( p_vout->output.i_rmask == 0xf800 + && p_vout->output.i_gmask == 0x07e0 + && p_vout->output.i_bmask == 0x001f ) ) + { + /* R5G6B5 pixel format */ + msg_Dbg(p_this, "RGB pixel format is R5G6B5"); + p_vout->chroma.pf_convert = E_(I420_R5G6B5); + } + else + return -1; +#else + // generic C chroma converter */ p_vout->chroma.pf_convert = E_(I420_RGB16); +#endif break; #if 0 @@ -128,16 +145,30 @@ static int Activate( vlc_object_t *p_this ) #endif case VLC_FOURCC('R','V','3','2'): -#if defined (MODULE_NAME_IS_i420_rgb_mmx) +#if ! defined (MODULE_NAME_IS_i420_rgb) /* If we don't have support for the bitmasks, bail out */ - if( p_vout->output.i_rmask != 0x00ff0000 - || p_vout->output.i_gmask != 0x0000ff00 - || p_vout->output.i_bmask != 0x000000ff ) + if( p_vout->output.i_rmask == 0x00ff0000 + && p_vout->output.i_gmask == 0x0000ff00 + && p_vout->output.i_bmask == 0x000000ff ) { - return -1; + /* A8R8G8B8 pixel format */ + msg_Dbg(p_this, "RGB pixel format is A8R8G8B8"); + p_vout->chroma.pf_convert = E_(I420_A8R8G8B8); } -#endif + else if( p_vout->output.i_rmask == 0x0000ff00 + && p_vout->output.i_gmask == 0x00ff0000 + && p_vout->output.i_bmask == 0xff000000 ) + { + /* B8G8R8A8 pixel format */ + msg_Dbg(p_this, "RGB pixel format is B8G8R8A8"); + p_vout->chroma.pf_convert = E_(I420_B8G8R8A8); + } + else + return -1; +#else + // generic C chroma converter */ p_vout->chroma.pf_convert = E_(I420_RGB32); +#endif break; default: diff --git a/modules/video_chroma/i420_rgb.h b/modules/video_chroma/i420_rgb.h index 83781a092b..15fadf4ced 100644 --- a/modules/video_chroma/i420_rgb.h +++ b/modules/video_chroma/i420_rgb.h @@ -58,9 +58,14 @@ struct chroma_sys_t #ifdef MODULE_NAME_IS_i420_rgb void E_(I420_RGB8) ( vout_thread_t *, picture_t *, picture_t * ); void E_(I420_RGB16_dither) ( vout_thread_t *, picture_t *, picture_t * ); -#endif void E_(I420_RGB16) ( vout_thread_t *, picture_t *, picture_t * ); void E_(I420_RGB32) ( vout_thread_t *, picture_t *, picture_t * ); +#else // if defined(MODULE_NAME_IS_i420_rgb_mmx) +void E_(I420_R5G5B5) ( vout_thread_t *, picture_t *, picture_t * ); +void E_(I420_R5G6B5) ( vout_thread_t *, picture_t *, picture_t * ); +void E_(I420_A8R8G8B8) ( vout_thread_t *, picture_t *, picture_t * ); +void E_(I420_B8G8R8A8) ( vout_thread_t *, picture_t *, picture_t * ); +#endif /***************************************************************************** * CONVERT_*_PIXEL: pixel conversion macros diff --git a/modules/video_chroma/i420_rgb16.c b/modules/video_chroma/i420_rgb16.c index c33dd5ffb8..27f0715a49 100644 --- a/modules/video_chroma/i420_rgb16.c +++ b/modules/video_chroma/i420_rgb16.c @@ -5,6 +5,7 @@ * $Id$ * * Authors: Samuel Hocevar + * Damien Fouilleul * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -38,6 +39,11 @@ # include # endif # include "i420_rgb_mmx.h" +#elif defined 
(MODULE_NAME_IS_i420_rgb_sse2) +# if defined(HAVE_SSE2_INTRINSICS) +# include +# endif +# include "i420_rgb_mmx.h" #endif static void SetOffset( int, int, int, int, vlc_bool_t *, @@ -202,6 +208,9 @@ void E_(I420_RGB16_dither)( vout_thread_t *p_vout, picture_t *p_src, * - input: 2 lines (2 Y lines, 1 U/V line) * - output: 1 line *****************************************************************************/ + +#if defined (MODULE_NAME_IS_i420_rgb) + void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src, picture_t *p_dest ) { @@ -220,12 +229,10 @@ void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src, int i_scale_count; /* scale modulo counter */ int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */ uint16_t * p_pic_start; /* beginning of the current line for copy */ -#if defined (MODULE_NAME_IS_i420_rgb) int i_uval, i_vval; /* U and V samples */ int i_red, i_green, i_blue; /* U and V modified samples */ uint16_t * p_yuv = p_vout->chroma.p_sys->p_rgb16; uint16_t * p_ybase; /* Y dependant conversion table */ -#endif /* Conversion buffer pointer */ uint16_t * p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer; @@ -268,7 +275,6 @@ void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src, p_pic_start = p_pic; p_buffer = b_hscale ? p_buffer_start : p_pic; -#if defined (MODULE_NAME_IS_i420_rgb) for ( i_x = p_vout->render.i_width / 8; i_x--; ) { CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2); @@ -276,131 +282,20 @@ void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src, CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2); CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2); } -#elif defined (MODULE_NAME_IS_i420_rgb_mmx) - if( p_vout->output.i_rmask == 0x7c00 ) - { - /* 15bpp 5/5/5 */ - for ( i_x = p_vout->render.i_width / 8; i_x--; ) - { -# if defined (HAVE_MMX_INTRINSICS) - __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; - uint64_t tmp64; - INTRINSICS_INIT_16 - INTRINSICS_YUV_MUL - INTRINSICS_YUV_ADD - INTRINSICS_UNPACK_15 -# else - __asm__( MMX_INIT_16 - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); - - __asm__( ".p2align 3" - MMX_YUV_MUL - MMX_YUV_ADD - MMX_UNPACK_15 - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); -# endif - - p_y += 8; - p_u += 4; - p_v += 4; - p_buffer += 8; - } - } - else - { - /* 16bpp 5/6/5 */ - for ( i_x = p_vout->render.i_width / 8; i_x--; ) - { -# if defined (HAVE_MMX_INTRINSICS) - __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; - uint64_t tmp64; - INTRINSICS_INIT_16 - INTRINSICS_YUV_MUL - INTRINSICS_YUV_ADD - INTRINSICS_UNPACK_16 -# else - __asm__( MMX_INIT_16 - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); - - __asm__( ".p2align 3" - MMX_YUV_MUL - MMX_YUV_ADD - MMX_UNPACK_16 - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); -# endif - - p_y += 8; - p_u += 4; - p_v += 4; - p_buffer += 8; - } - } -#endif /* Here we do some unaligned reads and duplicate conversions, but * at least we have all the pixels */ if( i_rewind ) { -#if defined (MODULE_NAME_IS_i420_rgb_mmx) -# if defined (HAVE_MMX_INTRINSICS) - __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; - uint64_t tmp64; -# endif -#endif p_y -= i_rewind; p_u -= i_rewind >> 1; p_v -= i_rewind >> 1; p_buffer -= i_rewind; -#if defined (MODULE_NAME_IS_i420_rgb) + CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2); CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2); CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2); CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2); -#elif defined (MODULE_NAME_IS_i420_rgb_mmx) - -# if defined (HAVE_MMX_INTRINSICS) - INTRINSICS_INIT_16 -# else - __asm__( MMX_INIT_16 - : : "r" (p_y), "r" 
(p_u), "r" (p_v), "r" (p_buffer) ); -# endif - - if( p_vout->output.i_rmask == 0x7c00 ) - { - /* 15bpp 5/5/5 */ -# if defined (HAVE_MMX_INTRINSICS) - INTRINSICS_YUV_MUL - INTRINSICS_YUV_ADD - INTRINSICS_UNPACK_15 -# else - __asm__( ".p2align 3" - MMX_YUV_MUL - MMX_YUV_ADD - MMX_UNPACK_15 - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); -# endif - } - else - { -# if defined (HAVE_MMX_INTRINSICS) - INTRINSICS_YUV_MUL - INTRINSICS_YUV_ADD - INTRINSICS_UNPACK_16 -# else - /* 16bpp 5/6/5 */ - __asm__( ".p2align 3" - MMX_YUV_MUL - MMX_YUV_ADD - MMX_UNPACK_16 - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); -# endif - } - - p_y += 8; - p_u += 4; - p_v += 4; - p_buffer += 8; -#endif } SCALE_WIDTH; SCALE_HEIGHT( 420, 2 ); @@ -414,21 +309,13 @@ void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src, } } -/***************************************************************************** - * I420_RGB32: color YUV 4:2:0 to RGB 32 bpp - ***************************************************************************** - * Horizontal alignment needed: - * - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed - * - output: 1 pixel (2 bytes), margins allowed - * Vertical alignment needed: - * - input: 2 lines (2 Y lines, 1 U/V line) - * - output: 1 line - *****************************************************************************/ -void E_(I420_RGB32)( vout_thread_t *p_vout, picture_t *p_src, +#else // defined (MODULE_NAME_IS_i420_rgb_mmx) + +void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src, picture_t *p_dest ) { /* We got this one from the old arguments */ - uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels; + uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels; uint8_t *p_y = p_src->Y_PIXELS; uint8_t *p_u = p_src->U_PIXELS; uint8_t *p_v = p_src->V_PIXELS; @@ -441,17 +328,11 @@ void E_(I420_RGB32)( vout_thread_t *p_vout, picture_t *p_src, int i_rewind; int i_scale_count; /* scale modulo counter */ int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */ - uint32_t * p_pic_start; /* beginning of the current line for copy */ -#if defined (MODULE_NAME_IS_i420_rgb) - int i_uval, i_vval; /* U and V samples */ - int i_red, i_green, i_blue; /* U and V modified samples */ - uint32_t * p_yuv = p_vout->chroma.p_sys->p_rgb32; - uint32_t * p_ybase; /* Y dependant conversion table */ -#endif + uint16_t * p_pic_start; /* beginning of the current line for copy */ /* Conversion buffer pointer */ - uint32_t * p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer; - uint32_t * p_buffer; + uint16_t * p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer; + uint16_t * p_buffer; /* Offset array pointer */ int * p_offset_start = p_vout->chroma.p_sys->p_offset; @@ -464,15 +345,6 @@ void E_(I420_RGB32)( vout_thread_t *p_vout, picture_t *p_src, i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; - if( p_vout->render.i_width & 7 ) - { - i_rewind = 8 - ( p_vout->render.i_width & 7 ); - } - else - { - i_rewind = 0; - } - /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1' * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */ @@ -480,11 +352,191 @@ void E_(I420_RGB32)( vout_thread_t *p_vout, picture_t *p_src, p_vout->output.i_width, p_vout->output.i_height, &b_hscale, &i_vscale, p_offset_start ); + /* * Perform conversion */ i_scale_count = ( i_vscale == 1 ) ? 
p_vout->output.i_height : p_vout->render.i_height; + +#if defined (MODULE_NAME_IS_i420_rgb_sse2) + + if( p_vout->render.i_width & 15 ) + { + i_rewind = 16 - ( p_vout->render.i_width & 15 ); + } + else + { + i_rewind = 0; + } + + /* + ** SSE2 128 bits fetch/store instructions are faster + ** if memory access is 16 bytes aligned + */ + + p_buffer = b_hscale ? p_buffer_start : p_pic; + if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch| + p_dest->p->i_pitch| + ((int)p_y)| + ((int)p_buffer))) ) + { + /* use faster SSE2 aligned fetch and store */ + for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + { + p_pic_start = p_pic; + + for ( i_x = p_vout->render.i_width/16; i_x--; ) + { +#if defined (CAN_COMPILE_SSE2) + __asm__( ".p2align 3" + SSE2_INIT_16_ALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_15_ALIGNED + : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); +#else + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + SSE2_INTRINSICS_INIT_16_ALIGNED + SSE2_INTRINSICS_YUV_MUL + SSE2_INTRINSICS_YUV_ADD + SSE2_INTRINSICS_UNPACK_15_ALIGNED +#endif + p_y += 16; + p_u += 8; + p_v += 8; + p_buffer += 16; + } + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + +#if defined (CAN_COMPILE_SSE2) + __asm__( ".p2align 3" + SSE2_INIT_16_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_15_UNALIGNED + : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); +#else + { + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + + SSE2_INTRINSICS_INIT_16_UNALIGNED + SSE2_INTRINSICS_YUV_MUL + SSE2_INTRINSICS_YUV_ADD + SSE2_INTRINSICS_UNPACK_15_UNALIGNED + } +#endif + p_y += 16; + p_u += 8; + p_v += 8; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 2 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + p_buffer = b_hscale ? p_buffer_start : p_pic; + } + /* make sure all SSE2 stores are visible thereafter */ +#if defined (CAN_COMPILE_SSE2) + __asm__ __volatile__ ( "sfence" ); +#else + _mm_sfence(); +#endif + } + else + { + /* use slower SSE2 unaligned fetch and store */ + for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + { + p_pic_start = p_pic; + p_buffer = b_hscale ? 
p_buffer_start : p_pic; + + for ( i_x = p_vout->render.i_width/16; i_x--; ) + { +#if defined (CAN_COMPILE_SSE2) + __asm__( ".p2align 3" + SSE2_INIT_16_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_15_UNALIGNED + : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); +#else + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + SSE2_INTRINSICS_INIT_16_UNALIGNED + SSE2_INTRINSICS_YUV_MUL + SSE2_INTRINSICS_YUV_ADD + SSE2_INTRINSICS_UNPACK_15_UNALIGNED +#endif + p_y += 16; + p_u += 8; + p_v += 8; + p_buffer += 16; + } + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + +#if defined (CAN_COMPILE_SSE2) + __asm__( ".p2align 3" + SSE2_INIT_16_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_15_UNALIGNED + : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); +#else + { + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + + SSE2_INTRINSICS_INIT_16_UNALIGNED + SSE2_INTRINSICS_YUV_MUL + SSE2_INTRINSICS_YUV_ADD + SSE2_INTRINSICS_UNPACK_15_UNALIGNED + } +#endif + p_y += 16; + p_u += 8; + p_v += 8; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 2 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + p_buffer = b_hscale ? p_buffer_start : p_pic; + } + } +#else // defined (MODULE_NAME_IS_i420_rgb_mmx) + + if( p_vout->render.i_width & 7 ) + { + i_rewind = 8 - ( p_vout->render.i_width & 7 ); + } + else + { + i_rewind = 0; + } + for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) { p_pic_start = p_pic; @@ -492,81 +544,62 @@ void E_(I420_RGB32)( vout_thread_t *p_vout, picture_t *p_src, for ( i_x = p_vout->render.i_width / 8; i_x--; ) { -#if defined (MODULE_NAME_IS_i420_rgb) - CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); - CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); - CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); - CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); -#elif defined (MODULE_NAME_IS_i420_rgb_mmx) -# if defined (HAVE_MMX_INTRINSICS) - __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; - uint64_t tmp64; - INTRINSICS_INIT_32 - INTRINSICS_YUV_MUL - INTRINSICS_YUV_ADD - INTRINSICS_UNPACK_32 -# else - __asm__( MMX_INIT_32 - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); - +#if defined (CAN_COMPILE_MMX) __asm__( ".p2align 3" + MMX_INIT_16 MMX_YUV_MUL MMX_YUV_ADD - MMX_UNPACK_32 + MMX_UNPACK_15 : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); -# endif +#else + __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; + uint64_t tmp64; + MMX_INTRINSICS_INIT_16 + MMX_INTRINSICS_YUV_MUL + MMX_INTRINSICS_YUV_ADD + MMX_INTRINSICS_UNPACK_15 +#endif p_y += 8; p_u += 4; p_v += 4; p_buffer += 8; -#endif } /* Here we do some unaligned reads and duplicate conversions, but * at least we have all the pixels */ if( i_rewind ) { -#if defined (MODULE_NAME_IS_i420_rgb_mmx) -# if defined (HAVE_MMX_INTRINSICS) - __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; - uint64_t tmp64; -# endif -#endif p_y -= i_rewind; p_u -= i_rewind >> 1; p_v -= i_rewind >> 1; p_buffer -= i_rewind; -#if defined (MODULE_NAME_IS_i420_rgb) - CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); - CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); - CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); - CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); -#elif defined (MODULE_NAME_IS_i420_rgb_mmx) -# if defined (HAVE_MMX_INTRINSICS) - INTRINSICS_INIT_32 - INTRINSICS_YUV_MUL - INTRINSICS_YUV_ADD - INTRINSICS_UNPACK_32 -# else - __asm__( MMX_INIT_32 - : : 
"r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); +#if defined (CAN_COMPILE_MMX) __asm__( ".p2align 3" + MMX_INIT_16 MMX_YUV_MUL MMX_YUV_ADD - MMX_UNPACK_32 + MMX_UNPACK_15 : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); -# endif +#else + { + __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; + uint64_t tmp64; + MMX_INTRINSICS_INIT_16 + MMX_INTRINSICS_YUV_MUL + MMX_INTRINSICS_YUV_ADD + MMX_INTRINSICS_UNPACK_15 + } +#endif p_y += 8; p_u += 4; p_v += 4; p_buffer += 8; -#endif } SCALE_WIDTH; - SCALE_HEIGHT( 420, 4 ); + SCALE_HEIGHT( 420, 2 ); p_y += i_source_margin; if( i_y % 2 ) @@ -575,8 +608,1081 @@ void E_(I420_RGB32)( vout_thread_t *p_vout, picture_t *p_src, p_v += i_source_margin_c; } } + /* re-enable FPU registers */ +#if defined (CAN_COMPILE_MMX) + __asm__ __volatile__ ( "emms" ); +#else + _mm_empty(); +#endif + +#endif } +void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src, + picture_t *p_dest ) +{ + /* We got this one from the old arguments */ + uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels; + uint8_t *p_y = p_src->Y_PIXELS; + uint8_t *p_u = p_src->U_PIXELS; + uint8_t *p_v = p_src->V_PIXELS; + + vlc_bool_t b_hscale; /* horizontal scaling type */ + unsigned int i_vscale; /* vertical scaling type */ + unsigned int i_x, i_y; /* horizontal and vertical indexes */ + + int i_right_margin; + int i_rewind; + int i_scale_count; /* scale modulo counter */ + int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */ + uint16_t * p_pic_start; /* beginning of the current line for copy */ + + /* Conversion buffer pointer */ + uint16_t * p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer; + uint16_t * p_buffer; + + /* Offset array pointer */ + int * p_offset_start = p_vout->chroma.p_sys->p_offset; + int * p_offset; + + const int i_source_margin = p_src->p[0].i_pitch + - p_src->p[0].i_visible_pitch; + const int i_source_margin_c = p_src->p[1].i_pitch + - p_src->p[1].i_visible_pitch; + + i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; + + /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered + * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1' + * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */ + SetOffset( p_vout->render.i_width, p_vout->render.i_height, + p_vout->output.i_width, p_vout->output.i_height, + &b_hscale, &i_vscale, p_offset_start ); + + + /* + * Perform conversion + */ + i_scale_count = ( i_vscale == 1 ) ? + p_vout->output.i_height : p_vout->render.i_height; + +#if defined (MODULE_NAME_IS_i420_rgb_sse2) + + if( p_vout->render.i_width & 15 ) + { + i_rewind = 16 - ( p_vout->render.i_width & 15 ); + } + else + { + i_rewind = 0; + } + + /* + ** SSE2 128 bits fetch/store instructions are faster + ** if memory access is 16 bytes aligned + */ + + p_buffer = b_hscale ? 
p_buffer_start : p_pic; + if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch| + p_dest->p->i_pitch| + ((int)p_y)| + ((int)p_buffer))) ) + { + /* use faster SSE2 aligned fetch and store */ + for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + { + p_pic_start = p_pic; + + for ( i_x = p_vout->render.i_width/16; i_x--; ) + { +#if defined (CAN_COMPILE_SSE2) + __asm__( ".p2align 3" + SSE2_INIT_16_ALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_16_ALIGNED + : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); +#else + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + SSE2_INTRINSICS_INIT_16_ALIGNED + SSE2_INTRINSICS_YUV_MUL + SSE2_INTRINSICS_YUV_ADD + SSE2_INTRINSICS_UNPACK_16_ALIGNED +#endif + p_y += 16; + p_u += 8; + p_v += 8; + p_buffer += 16; + } + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + +#if defined (CAN_COMPILE_SSE2) + __asm__( ".p2align 3" + SSE2_INIT_16_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_16_UNALIGNED + : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); +#else + { + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + + SSE2_INTRINSICS_INIT_16_UNALIGNED + SSE2_INTRINSICS_YUV_MUL + SSE2_INTRINSICS_YUV_ADD + SSE2_INTRINSICS_UNPACK_16_UNALIGNED + } +#endif + p_y += 16; + p_u += 8; + p_v += 8; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 2 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + p_buffer = b_hscale ? p_buffer_start : p_pic; + } + /* make sure all SSE2 stores are visible thereafter */ +#if defined (CAN_COMPILE_SSE2) + __asm__ __volatile__ ( "sfence" ); +#else + _mm_sfence(); +#endif + } + else + { + /* use slower SSE2 unaligned fetch and store */ + for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + { + p_pic_start = p_pic; + p_buffer = b_hscale ? p_buffer_start : p_pic; + + for ( i_x = p_vout->render.i_width/16; i_x--; ) + { +#if defined (CAN_COMPILE_SSE2) + __asm__( ".p2align 3" + SSE2_INIT_16_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_16_UNALIGNED + : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); +#else + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + SSE2_INTRINSICS_INIT_16_UNALIGNED + SSE2_INTRINSICS_YUV_MUL + SSE2_INTRINSICS_YUV_ADD + SSE2_INTRINSICS_UNPACK_16_UNALIGNED +#endif + p_y += 16; + p_u += 8; + p_v += 8; + p_buffer += 16; + } + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + +#if defined (CAN_COMPILE_SSE2) + __asm__( ".p2align 3" + SSE2_INIT_16_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_16_UNALIGNED + : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); +#else + { + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + + SSE2_INTRINSICS_INIT_16_UNALIGNED + SSE2_INTRINSICS_YUV_MUL + SSE2_INTRINSICS_YUV_ADD + SSE2_INTRINSICS_UNPACK_16_UNALIGNED + } +#endif + p_y += 16; + p_u += 8; + p_v += 8; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 2 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + p_buffer = b_hscale ? 
p_buffer_start : p_pic; + } + } +#else // defined (MODULE_NAME_IS_i420_rgb_mmx) + + if( p_vout->render.i_width & 7 ) + { + i_rewind = 8 - ( p_vout->render.i_width & 7 ); + } + else + { + i_rewind = 0; + } + + for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + { + p_pic_start = p_pic; + p_buffer = b_hscale ? p_buffer_start : p_pic; + + for ( i_x = p_vout->render.i_width / 8; i_x--; ) + { +#if defined (CAN_COMPILE_MMX) + __asm__( ".p2align 3" + MMX_INIT_16 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_16 + : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); +#else + __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; + uint64_t tmp64; + MMX_INTRINSICS_INIT_16 + MMX_INTRINSICS_YUV_MUL + MMX_INTRINSICS_YUV_ADD + MMX_INTRINSICS_UNPACK_16 +#endif + + p_y += 8; + p_u += 4; + p_v += 4; + p_buffer += 8; + } + + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + +#if defined (CAN_COMPILE_MMX) + __asm__( ".p2align 3" + MMX_INIT_16 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_16 + : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); +#else + { + __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; + uint64_t tmp64; + + MMX_INTRINSICS_INIT_16 + MMX_INTRINSICS_YUV_MUL + MMX_INTRINSICS_YUV_ADD + MMX_INTRINSICS_UNPACK_16 + } +#endif + p_y += 8; + p_u += 4; + p_v += 4; + p_buffer += 8; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 2 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + } + /* re-enable FPU registers */ +#if defined (CAN_COMPILE_MMX) + __asm__ __volatile__ ( "emms" ); +#else + _mm_empty(); +#endif + +#endif +} + +#endif + +/***************************************************************************** + * I420_RGB32: color YUV 4:2:0 to RGB 32 bpp + ***************************************************************************** + * Horizontal alignment needed: + * - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed + * - output: 1 pixel (2 bytes), margins allowed + * Vertical alignment needed: + * - input: 2 lines (2 Y lines, 1 U/V line) + * - output: 1 line + *****************************************************************************/ + +#if defined (MODULE_NAME_IS_i420_rgb) + +void E_(I420_RGB32)( vout_thread_t *p_vout, picture_t *p_src, + picture_t *p_dest ) +{ + /* We got this one from the old arguments */ + uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels; + uint8_t *p_y = p_src->Y_PIXELS; + uint8_t *p_u = p_src->U_PIXELS; + uint8_t *p_v = p_src->V_PIXELS; + + vlc_bool_t b_hscale; /* horizontal scaling type */ + unsigned int i_vscale; /* vertical scaling type */ + unsigned int i_x, i_y; /* horizontal and vertical indexes */ + + int i_right_margin; + int i_rewind; + int i_scale_count; /* scale modulo counter */ + int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */ + uint32_t * p_pic_start; /* beginning of the current line for copy */ + int i_uval, i_vval; /* U and V samples */ + int i_red, i_green, i_blue; /* U and V modified samples */ + uint32_t * p_yuv = p_vout->chroma.p_sys->p_rgb32; + uint32_t * p_ybase; /* Y dependant conversion table */ + + /* Conversion buffer pointer */ + uint32_t * p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer; + uint32_t * p_buffer; + + /* Offset array pointer */ + int * p_offset_start = p_vout->chroma.p_sys->p_offset; + int * p_offset; + + const int i_source_margin = p_src->p[0].i_pitch + - 
p_src->p[0].i_visible_pitch; + const int i_source_margin_c = p_src->p[1].i_pitch + - p_src->p[1].i_visible_pitch; + + i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; + + if( p_vout->render.i_width & 7 ) + { + i_rewind = 8 - ( p_vout->render.i_width & 7 ); + } + else + { + i_rewind = 0; + } + + /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered + * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1' + * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */ + SetOffset( p_vout->render.i_width, p_vout->render.i_height, + p_vout->output.i_width, p_vout->output.i_height, + &b_hscale, &i_vscale, p_offset_start ); + + /* + * Perform conversion + */ + i_scale_count = ( i_vscale == 1 ) ? + p_vout->output.i_height : p_vout->render.i_height; + for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + { + p_pic_start = p_pic; + p_buffer = b_hscale ? p_buffer_start : p_pic; + + for ( i_x = p_vout->render.i_width / 8; i_x--; ) + { + CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); + CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); + CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); + CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); + } + + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); + CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); + CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); + CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 4 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + } +} + +#else // defined (MODULE_NAME_IS_i420_rgb_mmx) || defined (MODULE_NAME_IS_i420_rgb_sse2) + +void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src, + picture_t *p_dest ) +{ + /* We got this one from the old arguments */ + uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels; + uint8_t *p_y = p_src->Y_PIXELS; + uint8_t *p_u = p_src->U_PIXELS; + uint8_t *p_v = p_src->V_PIXELS; + + vlc_bool_t b_hscale; /* horizontal scaling type */ + unsigned int i_vscale; /* vertical scaling type */ + unsigned int i_x, i_y; /* horizontal and vertical indexes */ + + int i_right_margin; + int i_rewind; + int i_scale_count; /* scale modulo counter */ + int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */ + uint32_t * p_pic_start; /* beginning of the current line for copy */ + /* Conversion buffer pointer */ + uint32_t * p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer; + uint32_t * p_buffer; + + /* Offset array pointer */ + int * p_offset_start = p_vout->chroma.p_sys->p_offset; + int * p_offset; + + const int i_source_margin = p_src->p[0].i_pitch + - p_src->p[0].i_visible_pitch; + const int i_source_margin_c = p_src->p[1].i_pitch + - p_src->p[1].i_visible_pitch; + + i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; + + /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered + * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1' + * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */ + SetOffset( p_vout->render.i_width, p_vout->render.i_height, + p_vout->output.i_width, p_vout->output.i_height, + &b_hscale, &i_vscale, p_offset_start ); + + /* + * Perform conversion + */ + i_scale_count = ( i_vscale == 1 ) ? 
+ p_vout->output.i_height : p_vout->render.i_height; + +#if defined (MODULE_NAME_IS_i420_rgb_sse2) + + if( p_vout->render.i_width & 15 ) + { + i_rewind = 16 - ( p_vout->render.i_width & 15 ); + } + else + { + i_rewind = 0; + } + + /* + ** SSE2 128 bits fetch/store instructions are faster + ** if memory access is 16 bytes aligned + */ + + p_buffer = b_hscale ? p_buffer_start : p_pic; + if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch| + p_dest->p->i_pitch| + ((int)p_y)| + ((int)p_buffer))) ) + { + /* use faster SSE2 aligned fetch and store */ + for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + { + p_pic_start = p_pic; + + for ( i_x = p_vout->render.i_width / 16; i_x--; ) + { +#if defined (CAN_COMPILE_SSE2) + /* use inline SSE2 assembly */ + __asm__( ".p2align 3" + SSE2_INIT_32_ALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_ARGB_ALIGNED + : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); +#else + /* otherwise use SSE2 C intrinsics wrappers */ + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + + SSE2_INTRINSICS_INIT_32_ALIGNED + SSE2_INTRINSICS_YUV_MUL + SSE2_INTRINSICS_YUV_ADD + SSE2_INTRINSICS_UNPACK_32_ARGB_ALIGNED +#endif + p_y += 16; + p_u += 8; + p_v += 8; + p_buffer += 16; + } + + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; +#if defined (CAN_COMPILE_SSE2) + /* use inline SSE2 assembly */ + __asm__( ".p2align 3" + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_ARGB_UNALIGNED + : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); +#else + /* otherwise use SSE2 intrinsics wrappers */ + { + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + + SSE2_INTRINSICS_INIT_32_UNALIGNED + SSE2_INTRINSICS_YUV_MUL + SSE2_INTRINSICS_YUV_ADD + SSE2_INTRINSICS_UNPACK_32_ARGB_UNALIGNED + } +#endif + p_y += 16; + p_u += 4; + p_v += 4; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 4 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + p_buffer = b_hscale ? p_buffer_start : p_pic; + } + /* make sure all SSE2 stores are visible thereafter */ +#if defined (CAN_COMPILE_SSE2) + __asm__ __volatile__ ( "sfence" ); +#else + _mm_sfence(); +#endif + } + else + { + /* use slower SSE2 unaligned fetch and store */ + for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + { + p_pic_start = p_pic; + p_buffer = b_hscale ? 
p_buffer_start : p_pic; + + for ( i_x = p_vout->render.i_width / 16; i_x--; ) + { +#if defined (CAN_COMPILE_SSE2) + /* use inline SSE2 assembly */ + __asm__( ".p2align 3" + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_ARGB_UNALIGNED + : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); +#else + /* otherwise use SSE2 C intrinsics wrappers */ + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + + SSE2_INTRINSICS_INIT_32_UNALIGNED + SSE2_INTRINSICS_YUV_MUL + SSE2_INTRINSICS_YUV_ADD + SSE2_INTRINSICS_UNPACK_32_ARGB_UNALIGNED +#endif + p_y += 16; + p_u += 8; + p_v += 8; + p_buffer += 16; + } + + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; +#if defined (CAN_COMPILE_SSE2) + /* use inline SSE2 assembly */ + __asm__( ".p2align 3" + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_ARGB_UNALIGNED + : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); +#else + /* otherwise use SSE2 intrinsics wrappers */ + { + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + + SSE2_INTRINSICS_INIT_32_UNALIGNED + SSE2_INTRINSICS_YUV_MUL + SSE2_INTRINSICS_YUV_ADD + SSE2_INTRINSICS_UNPACK_32_ARGB_UNALIGNED + } +#endif + p_y += 16; + p_u += 8; + p_v += 8; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 4 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + p_buffer = b_hscale ? p_buffer_start : p_pic; + } + } + +#else + + if( p_vout->render.i_width & 7 ) + { + i_rewind = 8 - ( p_vout->render.i_width & 7 ); + } + else + { + i_rewind = 0; + } + + for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + { + p_pic_start = p_pic; + p_buffer = b_hscale ? 
p_buffer_start : p_pic; + + for ( i_x = p_vout->render.i_width / 8; i_x--; ) + { +#if defined (CAN_COMPILE_MMX) + /* use inline MMX assembly */ + __asm__( MMX_INIT_32 + : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); + + __asm__( ".p2align 3" + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_32_ARGB + : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); +#else + /* otherwise use MMX C intrinsics wrappers */ + __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; + uint64_t tmp64; + + MMX_INTRINSICS_INIT_32 + MMX_INTRINSICS_YUV_MUL + MMX_INTRINSICS_YUV_ADD + MMX_INTRINSICS_UNPACK_32_ARGB +#endif + p_y += 8; + p_u += 4; + p_v += 4; + p_buffer += 8; + } + + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; +#if defined (CAN_COMPILE_MMX) + /* use inline MMX assembly */ + __asm__( ".p2align 3" + MMX_INIT_32 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_32_ARGB + : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); +#else + /* otherwise use MMX intrinsics wrappers */ + { + __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; + uint64_t tmp64; + + MMX_INTRINSICS_INIT_32 + MMX_INTRINSICS_YUV_MUL + MMX_INTRINSICS_YUV_ADD + MMX_INTRINSICS_UNPACK_32_ARGB + } +#endif + p_y += 8; + p_u += 4; + p_v += 4; + p_buffer += 8; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 4 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + } + /* re-enable FPU registers */ +#if defined (CAN_COMPILE_MMX) + __asm__ __volatile__ ( "emms" ); +#else + _mm_empty(); +#endif + +#endif +} + +void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src, + picture_t *p_dest ) +{ + /* We got this one from the old arguments */ + uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels; + uint8_t *p_y = p_src->Y_PIXELS; + uint8_t *p_u = p_src->U_PIXELS; + uint8_t *p_v = p_src->V_PIXELS; + + vlc_bool_t b_hscale; /* horizontal scaling type */ + unsigned int i_vscale; /* vertical scaling type */ + unsigned int i_x, i_y; /* horizontal and vertical indexes */ + + int i_right_margin; + int i_rewind; + int i_scale_count; /* scale modulo counter */ + int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */ + uint32_t * p_pic_start; /* beginning of the current line for copy */ + /* Conversion buffer pointer */ + uint32_t * p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer; + uint32_t * p_buffer; + + /* Offset array pointer */ + int * p_offset_start = p_vout->chroma.p_sys->p_offset; + int * p_offset; + + const int i_source_margin = p_src->p[0].i_pitch + - p_src->p[0].i_visible_pitch; + const int i_source_margin_c = p_src->p[1].i_pitch + - p_src->p[1].i_visible_pitch; + + i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; + + /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered + * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1' + * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */ + SetOffset( p_vout->render.i_width, p_vout->render.i_height, + p_vout->output.i_width, p_vout->output.i_height, + &b_hscale, &i_vscale, p_offset_start ); + + /* + * Perform conversion + */ + i_scale_count = ( i_vscale == 1 ) ? 
+ p_vout->output.i_height : p_vout->render.i_height; + +#if defined (MODULE_NAME_IS_i420_rgb_sse2) + + if( p_vout->render.i_width & 15 ) + { + i_rewind = 16 - ( p_vout->render.i_width & 15 ); + } + else + { + i_rewind = 0; + } + + /* + ** SSE2 128 bits fetch/store instructions are faster + ** if memory access is 16 bytes aligned + */ + + p_buffer = b_hscale ? p_buffer_start : p_pic; + if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch| + p_dest->p->i_pitch| + ((int)p_y)| + ((int)p_buffer))) ) + { + /* use faster SSE2 aligned fetch and store */ + for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + { + p_pic_start = p_pic; + + for ( i_x = p_vout->render.i_width / 16; i_x--; ) + { +#if defined (CAN_COMPILE_SSE2) + /* use inline SSE2 assembly */ + __asm__( ".p2align 3" + SSE2_INIT_32_ALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_BGRA_ALIGNED + : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); +#else + /* otherwise use SSE2 C intrinsics wrappers */ + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + + SSE2_INTRINSICS_INIT_32_ALIGNED + SSE2_INTRINSICS_YUV_MUL + SSE2_INTRINSICS_YUV_ADD + SSE2_INTRINSICS_UNPACK_32_BGRA_ALIGNED +#endif + p_y += 16; + p_u += 8; + p_v += 8; + p_buffer += 16; + } + + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; +#if defined (CAN_COMPILE_SSE2) + /* use inline SSE2 assembly */ + __asm__( ".p2align 3" + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_BGRA_UNALIGNED + : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); +#else + /* otherwise use SSE2 intrinsics wrappers */ + { + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + + SSE2_INTRINSICS_INIT_32_UNALIGNED + SSE2_INTRINSICS_YUV_MUL + SSE2_INTRINSICS_YUV_ADD + SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED + } +#endif + p_y += 16; + p_u += 4; + p_v += 4; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 4 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + p_buffer = b_hscale ? p_buffer_start : p_pic; + } + /* make sure all SSE2 stores are visible thereafter */ +#if defined (CAN_COMPILE_SSE2) + __asm__ __volatile__ ( "sfence" ); +#else + _mm_sfence(); +#endif + } + else + { + /* use slower SSE2 unaligned fetch and store */ + for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + { + p_pic_start = p_pic; + p_buffer = b_hscale ? 
p_buffer_start : p_pic; + + for ( i_x = p_vout->render.i_width / 16; i_x--; ) + { +#if defined (CAN_COMPILE_SSE2) + /* use inline SSE2 assembly */ + __asm__( ".p2align 3" + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_BGRA_UNALIGNED + : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); +#else + /* otherwise use SSE2 C intrinsics wrappers */ + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + + SSE2_INTRINSICS_INIT_32_UNALIGNED + SSE2_INTRINSICS_YUV_MUL + SSE2_INTRINSICS_YUV_ADD + SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED +#endif + p_y += 16; + p_u += 8; + p_v += 8; + p_buffer += 16; + } + + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; +#if defined (CAN_COMPILE_SSE2) + /* use inline SSE2 assembly */ + __asm__( ".p2align 3" + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_BGRA_UNALIGNED + : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); +#else + /* otherwise use SSE2 intrinsics wrappers */ + { + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + + SSE2_INTRINSICS_INIT_32_UNALIGNED + SSE2_INTRINSICS_YUV_MUL + SSE2_INTRINSICS_YUV_ADD + SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED + } +#endif + p_y += 16; + p_u += 8; + p_v += 8; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 4 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + p_buffer = b_hscale ? p_buffer_start : p_pic; + } + } + +#else + + if( p_vout->render.i_width & 7 ) + { + i_rewind = 8 - ( p_vout->render.i_width & 7 ); + } + else + { + i_rewind = 0; + } + + for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + { + p_pic_start = p_pic; + p_buffer = b_hscale ? 
p_buffer_start : p_pic; + + for ( i_x = p_vout->render.i_width / 8; i_x--; ) + { +#if defined (CAN_COMPILE_MMX) + /* use inline MMX assembly */ + __asm__( MMX_INIT_32 + : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); + + __asm__( ".p2align 3" + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_32_ARGB + : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); +#else + /* otherwise use MMX C intrinsics wrappers */ + __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; + uint64_t tmp64; + + MMX_INTRINSICS_INIT_32 + MMX_INTRINSICS_YUV_MUL + MMX_INTRINSICS_YUV_ADD + MMX_INTRINSICS_UNPACK_32_BGRA +#endif + p_y += 8; + p_u += 4; + p_v += 4; + p_buffer += 8; + } + + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; +#if defined (CAN_COMPILE_MMX) + /* use inline MMX assembly */ + __asm__( ".p2align 3" + MMX_INIT_32 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_32_BGRA + : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); +#else + /* otherwise use MMX intrinsics wrappers */ + { + __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; + uint64_t tmp64; + + MMX_INTRINSICS_INIT_32 + MMX_INTRINSICS_YUV_MUL + MMX_INTRINSICS_YUV_ADD + MMX_INTRINSICS_UNPACK_32_BGRA + } +#endif + p_y += 8; + p_u += 4; + p_v += 4; + p_buffer += 8; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 4 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + } + /* re-enable FPU registers */ +#if defined (CAN_COMPILE_MMX) + __asm__ __volatile__ ( "emms" ); +#else + _mm_empty(); +#endif + +#endif +} + +#endif + /* Following functions are local */ /***************************************************************************** diff --git a/modules/video_chroma/i420_rgb_mmx.h b/modules/video_chroma/i420_rgb_mmx.h index 1c2f9bb5b0..42b33d412c 100644 --- a/modules/video_chroma/i420_rgb_mmx.h +++ b/modules/video_chroma/i420_rgb_mmx.h @@ -1,12 +1,13 @@ /***************************************************************************** * transforms_yuvmmx.h: MMX YUV transformation assembly ***************************************************************************** - * Copyright (C) 1999-2004 the VideoLAN team + * Copyright (C) 1999-2007 the VideoLAN team * $Id$ * * Authors: Olie Lho * Gaël Hendryckx * Samuel Hocevar + * Damien Fouilleul * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -53,21 +54,48 @@ USED_U64(mmx_mask_fc) = 0xfcfcfcfcfcfcfcfcULL; #endif #define MMX_INIT_16 " \n\ -movd (%1), %%mm0 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0 \n\ -movd (%2), %%mm1 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0 \n\ +movd (%1), %%mm0 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0 \n\ +movd (%2), %%mm1 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0 \n\ pxor %%mm4, %%mm4 # zero mm4 \n\ -movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ -#movl $0, (%3) # cache preload for image \n\ +movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ " -#define INTRINSICS_INIT_16 \ - tmp64 = *(uint32_t *)p_u; \ - mm0 = (__m64)tmp64; \ - tmp64 = *(uint32_t *)p_v; \ - mm1 = (__m64)tmp64; \ - mm4 = (__m64)(uint64_t)0; \ - mm6 = (__m64)*(uint64_t *)p_y; \ - /* *(uint16_t *)p_buffer = 0; */ +#define SSE2_INIT_16_ALIGNED " \n\ +prefetcht1 (%3) # cache preload for image \n\ +movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\ +movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\ +pxor %%xmm4, %%xmm4 # zero 
mm4 \n\ +movdqa (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ +" + +#define SSE2_INIT_16_UNALIGNED " \n\ +prefetcht1 (%3) # cache preload for image \n\ +movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\ +movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\ +pxor %%xmm4, %%xmm4 # zero mm4 \n\ +movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ +" + +#define MMX_INTRINSICS_INIT_16 \ + tmp64 = *(uint32_t *)p_u; \ + mm0 = (__m64)tmp64; \ + tmp64 = *(uint32_t *)p_v; \ + mm1 = (__m64)tmp64; \ + mm4 = _mm_setzero_si64(); \ + mm6 = (__m64)*(uint64_t *)p_y; \ + +#define SSE2_INTRINSICS_INIT_16_ALIGNED \ + xmm0 = _mm_loadl_epi64((__m128i *)p_u); \ + xmm1 = _mm_loadl_epi64((__m128i *)p_v); \ + xmm4 = _mm_setzero_si128(); \ + xmm6 = _mm_load_si128((__m128i *)p_y); \ + +#define SSE2_INTRINSICS_INIT_16_UNALIGNED \ + _mm_prefetch(p_buffer, _MM_HINT_T1); \ + xmm0 = _mm_loadl_epi64((__m128i *)p_u); \ + xmm1 = _mm_loadl_epi64((__m128i *)p_v); \ + xmm4 = _mm_setzero_si128(); \ + xmm6 = _mm_loadu_si128((__m128i *)p_y); \ #define MMX_INIT_16_GRAY " \n\ movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ @@ -76,21 +104,49 @@ movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ #define MMX_INIT_32 " \n\ movd (%1), %%mm0 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0 \n\ -movl $0, (%3) # cache preload for image \n\ +movl $0, (%3) # cache preload for image \n\ movd (%2), %%mm1 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0 \n\ -pxor %%mm4, %%mm4 # zero mm4 \n\ +pxor %%mm4, %%mm4 # zero mm4 \n\ movq (%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ " -#define INTRINSICS_INIT_32 \ - tmp64 = *(uint32_t *)p_u; \ - mm0 = (__m64)tmp64; \ - *(uint16_t *)p_buffer = 0; \ - tmp64 = *(uint32_t *)p_v; \ - mm1 = (__m64)tmp64; \ - mm4 = (__m64)(uint64_t)0; \ +#define SSE2_INIT_32_ALIGNED " \n\ +movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\ +movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\ +pxor %%xmm4, %%xmm4 # zero mm4 \n\ +movdqa (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ +" + +#define SSE2_INIT_32_UNALIGNED " \n\ +prefetcht1 (%3) # cache preload for image \n\ +movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\ +movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\ +pxor %%xmm4, %%xmm4 # zero mm4 \n\ +movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ +" + +#define MMX_INTRINSICS_INIT_32 \ + tmp64 = *(uint32_t *)p_u; \ + mm0 = (__m64)tmp64; \ + *(uint16_t *)p_buffer = 0; \ + tmp64 = *(uint32_t *)p_v; \ + mm1 = (__m64)tmp64; \ + mm4 = _mm_setzero_si64(); \ mm6 = (__m64)*(uint64_t *)p_y; +#define SSE2_INTRINSICS_INIT_32_ALIGNED \ + xmm0 = _mm_loadl_epi64((__m128i *)p_u); \ + xmm1 = _mm_loadl_epi64((__m128i *)p_v); \ + xmm4 = _mm_setzero_si128(); \ + xmm6 = _mm_load_si128((__m128i *)p_y); \ + +#define SSE2_INTRINSICS_INIT_32_UNALIGNED \ + _mm_prefetch(p_buffer, _MM_HINT_T1); \ + xmm0 = _mm_loadl_epi64((__m128i *)p_u); \ + xmm1 = _mm_loadl_epi64((__m128i *)p_v); \ + xmm4 = _mm_setzero_si128(); \ + xmm6 = _mm_loadu_si128((__m128i *)p_y); \ + /* * Do the multiply part of the conversion for even and odd pixels, * register usage: @@ -126,7 +182,58 @@ pmulhw mmx_Y_coeff"G", %%mm6 # Mul 4 Y even 00 y6 00 y4 00 y2 00 y0 \n\ pmulhw mmx_Y_coeff"G", %%mm7 # Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1 \n\ " -#define INTRINSICS_YUV_MUL \ +#define SSE2_YUV_MUL " \n\ +# convert the chroma part \n\ +punpcklbw %%xmm4, %%xmm0 # scatter 8 Cb 00 u3 00 u2 00 u1 00 u0 \n\ +punpcklbw %%xmm4, %%xmm1 # scatter 8 Cr 00 v3 00 v2 00 v1 00 v0 \n\ +movl $0x00800080, %%eax # \n\ +movd %%eax, %%xmm5 # 
\n\ +pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to 0080 0080 ... 0080 0080 \n\ +psubsw %%xmm5, %%xmm0 # Cb -= 128 \n\ +psubsw %%xmm5, %%xmm1 # Cr -= 128 \n\ +psllw $3, %%xmm0 # Promote precision \n\ +psllw $3, %%xmm1 # Promote precision \n\ +movdqa %%xmm0, %%xmm2 # Copy 8 Cb 00 u3 00 u2 00 u1 00 u0 \n\ +movdqa %%xmm1, %%xmm3 # Copy 8 Cr 00 v3 00 v2 00 v1 00 v0 \n\ +movl $0xf37df37d, %%eax # \n\ +movd %%eax, %%xmm5 # \n\ +pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to f37d f37d ... f37d f37d \n\ +pmulhw %%xmm5, %%xmm2 # Mul Cb with green coeff -> Cb green \n\ +movl $0xe5fce5fc, %%eax # \n\ +movd %%eax, %%xmm5 # \n\ +pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to e5fc e5fc ... e5fc e5fc \n\ +pmulhw %%xmm5, %%xmm3 # Mul Cr with green coeff -> Cr green \n\ +movl $0x40934093, %%eax # \n\ +movd %%eax, %%xmm5 # \n\ +pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to 4093 4093 ... 4093 4093 \n\ +pmulhw %%xmm5, %%xmm0 # Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 \n\ +movl $0x33123312, %%eax # \n\ +movd %%eax, %%xmm5 # \n\ +pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to 3312 3312 ... 3312 3312 \n\ +pmulhw %%xmm5, %%xmm1 # Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 \n\ +paddsw %%xmm3, %%xmm2 # Cb green + Cr green -> Cgreen \n\ + \n\ +# convert the luma part \n\ +movl $0x10101010, %%eax # \n\ +movd %%eax, %%xmm5 # \n\ +pshufd $0, %%xmm5, %%xmm5 # Set xmm5 to 1010 1010 ... 1010 1010 \n\ +psubusb %%xmm5, %%xmm6 # Y -= 16 \n\ +movdqa %%xmm6, %%xmm7 # Copy 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ +movl $0x00ff00ff, %%eax # \n\ +movd %%eax, %%xmm5 # \n\ +pshufd $0, %%xmm5, %%xmm5 # set xmm5 to 00ff 00ff ... 00ff 00ff \n\ +pand %%xmm5, %%xmm6 # get Y even 00 Y6 00 Y4 00 Y2 00 Y0 \n\ +psrlw $8, %%xmm7 # get Y odd 00 Y7 00 Y5 00 Y3 00 Y1 \n\ +psllw $3, %%xmm6 # Promote precision \n\ +psllw $3, %%xmm7 # Promote precision \n\ +movl $0x253f253f, %%eax # \n\ +movd %%eax, %%xmm5 # \n\ +pshufd $0, %%xmm5, %%xmm5 # set xmm5 to 253f 253f ... 
253f 253f \n\ +pmulhw %%xmm5, %%xmm6 # Mul 8 Y even 00 y6 00 y4 00 y2 00 y0 \n\ +pmulhw %%xmm5, %%xmm7 # Mul 8 Y odd 00 y7 00 y5 00 y3 00 y1 \n\ +" + +#define MMX_INTRINSICS_YUV_MUL \ mm0 = _mm_unpacklo_pi8(mm0, mm4); \ mm1 = _mm_unpacklo_pi8(mm1, mm4); \ mm0 = _mm_subs_pi16(mm0, (__m64)mmx_80w); \ @@ -150,6 +257,38 @@ pmulhw mmx_Y_coeff"G", %%mm7 # Mul 4 Y odd 00 y7 00 y5 00 y3 00 y1 \n\ mm6 = _mm_mulhi_pi16(mm6, (__m64)mmx_Y_coeff); \ mm7 = _mm_mulhi_pi16(mm7, (__m64)mmx_Y_coeff); +#define SSE2_INTRINSICS_YUV_MUL \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm4); \ + xmm1 = _mm_unpacklo_epi8(xmm1, xmm4); \ + xmm5 = _mm_set1_epi32(0x80808080UL); \ + xmm0 = _mm_subs_epi16(xmm0, xmm5); \ + xmm1 = _mm_subs_epi16(xmm1, xmm5); \ + xmm0 = _mm_slli_epi16(xmm0, 3); \ + xmm1 = _mm_slli_epi16(xmm1, 3); \ + xmm2 = xmm0; \ + xmm3 = xmm1; \ + xmm5 = _mm_set1_epi32(0xf37df37dUL); \ + xmm2 = _mm_mulhi_epi16(xmm2, xmm5); \ + xmm5 = _mm_set1_epi32(0xe5fce5fcUL); \ + xmm3 = _mm_mulhi_epi16(xmm3, xmm5); \ + xmm5 = _mm_set1_epi32(0x40934093UL); \ + xmm0 = _mm_mulhi_epi16(xmm0, xmm5); \ + xmm5 = _mm_set1_epi32(0x33123312UL); \ + xmm1 = _mm_mulhi_epi16(xmm1, xmm5); \ + xmm2 = _mm_adds_epi16(xmm2, xmm3); \ + \ + xmm5 = _mm_set1_epi32(0x10101010UL); \ + xmm6 = _mm_subs_epu8(xmm6, xmm5); \ + xmm7 = xmm6; \ + xmm5 = _mm_set1_epi32(0x00ff00ffUL); \ + xmm6 = _mm_and_si128(xmm6, xmm5); \ + xmm7 = _mm_srli_epi16(xmm7, 8); \ + xmm6 = _mm_slli_epi16(xmm6, 3); \ + xmm7 = _mm_slli_epi16(xmm7, 3); \ + xmm5 = _mm_set1_epi32(0x253f253fUL); \ + xmm6 = _mm_mulhi_epi16(xmm6, xmm5); \ + xmm7 = _mm_mulhi_epi16(xmm7, xmm5); + /* * Do the addition part of the conversion for even and odd pixels, * register usage: @@ -186,7 +325,35 @@ punpcklbw %%mm4, %%mm1 # R7 R6 R5 R4 R3 R2 R1 R0 \n\ punpcklbw %%mm5, %%mm2 # G7 G6 G5 G4 G3 G2 G1 G0 \n\ " -#define INTRINSICS_YUV_ADD \ +#define SSE2_YUV_ADD " \n\ +# Do horizontal and vertical scaling \n\ +movdqa %%xmm0, %%xmm3 # Copy Cblue \n\ +movdqa %%xmm1, %%xmm4 # Copy Cred \n\ +movdqa %%xmm2, %%xmm5 # Copy Cgreen \n\ +paddsw %%xmm6, %%xmm0 # Y even + Cblue 00 B6 00 B4 00 B2 00 B0 \n\ +paddsw %%xmm7, %%xmm3 # Y odd + Cblue 00 B7 00 B5 00 B3 00 B1 \n\ +paddsw %%xmm6, %%xmm1 # Y even + Cred 00 R6 00 R4 00 R2 00 R0 \n\ +paddsw %%xmm7, %%xmm4 # Y odd + Cred 00 R7 00 R5 00 R3 00 R1 \n\ +paddsw %%xmm6, %%xmm2 # Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 \n\ +paddsw %%xmm7, %%xmm5 # Y odd + Cgreen 00 G7 00 G5 00 G3 00 G1 \n\ + \n\ +# Limit RGB even to 0..255 \n\ +packuswb %%xmm0, %%xmm0 # B6 B4 B2 B0 / B6 B4 B2 B0 \n\ +packuswb %%xmm1, %%xmm1 # R6 R4 R2 R0 / R6 R4 R2 R0 \n\ +packuswb %%xmm2, %%xmm2 # G6 G4 G2 G0 / G6 G4 G2 G0 \n\ + \n\ +# Limit RGB odd to 0..255 \n\ +packuswb %%xmm3, %%xmm3 # B7 B5 B3 B1 / B7 B5 B3 B1 \n\ +packuswb %%xmm4, %%xmm4 # R7 R5 R3 R1 / R7 R5 R3 R1 \n\ +packuswb %%xmm5, %%xmm5 # G7 G5 G3 G1 / G7 G5 G3 G1 \n\ + \n\ +# Interleave RGB even and odd \n\ +punpcklbw %%xmm3, %%xmm0 # B7 B6 B5 B4 B3 B2 B1 B0 \n\ +punpcklbw %%xmm4, %%xmm1 # R7 R6 R5 R4 R3 R2 R1 R0 \n\ +punpcklbw %%xmm5, %%xmm2 # G7 G6 G5 G4 G3 G2 G1 G0 \n\ +" + +#define MMX_INTRINSICS_YUV_ADD \ mm3 = mm0; \ mm4 = mm1; \ mm5 = mm2; \ @@ -209,6 +376,29 @@ punpcklbw %%mm5, %%mm2 # G7 G6 G5 G4 G3 G2 G1 G0 \n\ mm1 = _mm_unpacklo_pi8(mm1, mm4); \ mm2 = _mm_unpacklo_pi8(mm2, mm5); +#define SSE2_INTRINSICS_YUV_ADD \ + xmm3 = xmm0; \ + xmm4 = xmm1; \ + xmm5 = xmm2; \ + xmm0 = _mm_adds_epi16(xmm0, xmm6); \ + xmm3 = _mm_adds_epi16(xmm3, xmm7); \ + xmm1 = _mm_adds_epi16(xmm1, xmm6); \ + xmm4 = _mm_adds_epi16(xmm4, xmm7); \ + xmm2 = 
_mm_adds_epi16(xmm2, xmm6); \ + xmm5 = _mm_adds_epi16(xmm5, xmm7); \ + \ + xmm0 = _mm_packus_epi16(xmm0, xmm0); \ + xmm1 = _mm_packus_epi16(xmm1, xmm1); \ + xmm2 = _mm_packus_epi16(xmm2, xmm2); \ + \ + xmm3 = _mm_packus_epi16(xmm3, xmm3); \ + xmm4 = _mm_packus_epi16(xmm4, xmm4); \ + xmm5 = _mm_packus_epi16(xmm5, xmm5); \ + \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm3); \ + xmm1 = _mm_unpacklo_epi8(xmm1, xmm4); \ + xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); + /* * Grayscale case, only use Y */ @@ -287,13 +477,71 @@ movd 4(%2), %%mm1 # Load 4 Cr __ __ __ __ v3 v2 v1 v0 \n\ movq %%mm5, 8(%3) # store pixel 4-7 \n\ " -#define INTRINSICS_UNPACK_15 \ +#define SSE2_UNPACK_15_ALIGNED " \n\ +# mask unneeded bits off \n\ +movl $0xf8f8f8f8, %%eax # \n\ +movd %%eax, %%xmm5 # \n\ +pshufd $0, %%xmm5, %%xmm5 # set xmm5 to f8f8 f8f8 ... f8f8 f8f8 \n\ +pand %%xmm5, %%xmm0 # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\ +psrlw $3,%%xmm0 # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\ +pand %%xmm5, %%xmm2 # g7g6g5g4 g3______ g7g6g5g4 g3______ \n\ +pand %%xmm5, %%xmm1 # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\ +psrlw $1,%%xmm1 # __r7r6r5 r4r3____ __r7r6r5 r4r3____ \n\ +pxor %%xmm4, %%xmm4 # zero mm4 \n\ +movdqa %%xmm0, %%xmm5 # Copy B15-B0 \n\ +movdqa %%xmm2, %%xmm7 # Copy G15-G0 \n\ + \n\ +# convert rgb24 plane to rgb15 pack for pixel 0-7 \n\ +punpcklbw %%xmm4, %%xmm2 # ________ ________ g7g6g5g4 g3______ \n\ +punpcklbw %%xmm1, %%xmm0 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\ +psllw $2,%%xmm2 # ________ ____g7g6 g5g4g3__ ________ \n\ +por %%xmm2, %%xmm0 # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\ +movntdq %%xmm0, (%3) # store pixel 0-7 \n\ + \n\ +# convert rgb24 plane to rgb15 pack for pixel 8-15 \n\ +punpckhbw %%xmm4, %%xmm7 # ________ ________ g7g6g5g4 g3______ \n\ +punpckhbw %%xmm1, %%xmm5 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\ +psllw $2,%%xmm7 # ________ ____g7g6 g5g4g3__ ________ \n\ +por %%xmm7, %%xmm5 # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\ +movntdq %%xmm5, 16(%3) # store pixel 4-7 \n\ +" + +#define SSE2_UNPACK_15_UNALIGNED " \n\ +# mask unneeded bits off \n\ +movl $0xf8f8f8f8, %%eax # \n\ +movd %%eax, %%xmm5 # \n\ +pshufd $0, %%xmm5, %%xmm5 # set xmm5 to f8f8 f8f8 ... 
f8f8 f8f8 \n\ +pand %%xmm5, %%xmm0 # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\ +psrlw $3,%%xmm0 # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\ +pand %%xmm5, %%xmm2 # g7g6g5g4 g3______ g7g6g5g4 g3______ \n\ +pand %%xmm5, %%xmm1 # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\ +psrlw $1,%%xmm1 # __r7r6r5 r4r3____ __r7r6r5 r4r3____ \n\ +pxor %%xmm4, %%xmm4 # zero xmm4 \n\ +movdqa %%xmm0, %%xmm5 # Copy B15-B0 \n\ +movdqa %%xmm2, %%xmm7 # Copy G15-G0 \n\ + \n\ +# convert rgb24 plane to rgb15 pack for pixel 0-7 \n\ +punpcklbw %%xmm4, %%xmm2 # ________ ________ g7g6g5g4 g3______ \n\ +punpcklbw %%xmm1, %%xmm0 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\ +psllw $2,%%xmm2 # ________ ____g7g6 g5g4g3__ ________ \n\ +por %%xmm2, %%xmm0 # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\ +movdqu %%xmm0, (%3) # store pixel 0-7 \n\ + \n\ +# convert rgb24 plane to rgb15 pack for pixel 8-15 \n\ +punpckhbw %%xmm4, %%xmm7 # ________ ________ g7g6g5g4 g3______ \n\ +punpckhbw %%xmm1, %%xmm5 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\ +psllw $2,%%xmm7 # ________ ____g7g6 g5g4g3__ ________ \n\ +por %%xmm7, %%xmm5 # r7r6r5r4 r3__g7g6 g5g4g3b7 b6b5b4b3 \n\ +movdqu %%xmm5, 16(%3) # store pixel 8-15 \n\ +" + +#define MMX_INTRINSICS_UNPACK_15 \ mm0 = _mm_and_si64(mm0, (__m64)mmx_mask_f8); \ mm0 = _mm_srli_pi16(mm0, 3); \ mm2 = _mm_and_si64(mm2, (__m64)mmx_mask_f8); \ mm1 = _mm_and_si64(mm1, (__m64)mmx_mask_f8); \ mm1 = _mm_srli_pi16(mm1, 1); \ - mm4 = (__m64)(uint64_t)0; \ + mm4 = _mm_setzero_si64(); \ mm5 = mm0; \ mm7 = mm2; \ \ @@ -315,6 +563,52 @@ movq %%mm5, 8(%3) # store pixel 4-7 \n\ mm1 = (__m64)tmp64; \ *(uint64_t *)(p_buffer + 4) = (uint64_t)mm5; +#define SSE2_INTRINSICS_UNPACK_15_ALIGNED \ + xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \ + xmm0 = _mm_and_si128(xmm0, xmm5); \ + xmm0 = _mm_srli_epi16(xmm0, 3); \ + xmm2 = _mm_and_si128(xmm2, xmm5); \ + xmm1 = _mm_and_si128(xmm1, xmm5); \ + xmm1 = _mm_srli_epi16(xmm1, 1); \ + xmm4 = _mm_setzero_si128(); \ + xmm5 = xmm0; \ + xmm7 = xmm2; \ + \ + xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm2 = _mm_slli_epi16(xmm2, 2); \ + xmm0 = _mm_or_si128(xmm0, xmm2); \ + _mm_stream_si128((__m128i*)p_buffer, xmm0); \ + \ + xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \ + xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \ + xmm7 = _mm_slli_epi16(xmm7, 2); \ + xmm5 = _mm_or_si128(xmm5, xmm7); \ + _mm_stream_si128((__m128i*)(p_buffer+8), xmm5); + +#define SSE2_INTRINSICS_UNPACK_15_UNALIGNED \ + xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \ + xmm0 = _mm_and_si128(xmm0, xmm5); \ + xmm0 = _mm_srli_epi16(xmm0, 3); \ + xmm2 = _mm_and_si128(xmm2, xmm5); \ + xmm1 = _mm_and_si128(xmm1, xmm5); \ + xmm1 = _mm_srli_epi16(xmm1, 1); \ + xmm4 = _mm_setzero_si128(); \ + xmm5 = xmm0; \ + xmm7 = xmm2; \ + \ + xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm2 = _mm_slli_epi16(xmm2, 2); \ + xmm0 = _mm_or_si128(xmm0, xmm2); \ + _mm_storeu_si128((__m128i*)p_buffer, xmm0); \ + \ + xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \ + xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \ + xmm7 = _mm_slli_epi16(xmm7, 2); \ + xmm5 = _mm_or_si128(xmm5, xmm7); \ + _mm_storeu_si128((__m128i*)(p_buffer+8), xmm5); + /* * convert RGB plane to RGB 16 bits, * mm0 -> B, mm1 -> R, mm2 -> G, @@ -350,12 +644,74 @@ movd 4(%2), %%mm1 # Load 4 Cr __ __ __ __ v3 v2 v1 v0 \n\ movq %%mm5, 8(%3) # store pixel 4-7 \n\ " -#define INTRINSICS_UNPACK_16 \ +#define SSE2_UNPACK_16_ALIGNED " \n\ +# mask unneeded bits off \n\ +movl $0xf8f8f8f8, %%eax # \n\ +movd %%eax, %%xmm5 # \n\ +pshufd $0, %%xmm5, %%xmm5 # set xmm5 to f8f8 f8f8 ... 
f8f8 f8f8 \n\ +pand %%xmm5, %%xmm0 # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\ +pand %%xmm5, %%xmm1 # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\ +movl $0xfcfcfcfc, %%eax # \n\ +movd %%eax, %%xmm5 # \n\ +pshufd $0, %%xmm5, %%xmm5 # set xmm5 to f8f8 f8f8 ... f8f8 f8f8 \n\ +pand %%xmm5, %%xmm2 # g7g6g5g4 g3g2____ g7g6g5g4 g3g2____ \n\ +psrlw $3,%%xmm0 # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\ +pxor %%xmm4, %%xmm4 # zero mm4 \n\ +movdqa %%xmm0, %%xmm5 # Copy B15-B0 \n\ +movdqa %%xmm2, %%xmm7 # Copy G15-G0 \n\ + \n\ +# convert rgb24 plane to rgb16 pack for pixel 0-7 \n\ +punpcklbw %%xmm4, %%xmm2 # ________ ________ g7g6g5g4 g3g2____ \n\ +punpcklbw %%xmm1, %%xmm0 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\ +psllw $3,%%xmm2 # ________ __g7g6g5 g4g3g2__ ________ \n\ +por %%xmm2, %%xmm0 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\ +movntdq %%xmm0, (%3) # store pixel 0-7 \n\ + \n\ +# convert rgb24 plane to rgb16 pack for pixel 8-15 \n\ +punpckhbw %%xmm4, %%xmm7 # ________ ________ g7g6g5g4 g3g2____ \n\ +punpckhbw %%xmm1, %%xmm5 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\ +psllw $3,%%xmm7 # ________ __g7g6g5 g4g3g2__ ________ \n\ +por %%xmm7, %%xmm5 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\ +movntdq %%xmm5, 16(%3) # store pixel 4-7 \n\ +" + +#define SSE2_UNPACK_16_UNALIGNED " \n\ +# mask unneeded bits off \n\ +movl $0xf8f8f8f8, %%eax # \n\ +movd %%eax, %%xmm5 # \n\ +pshufd $0, %%xmm5, %%xmm5 # set xmm5 to f8f8 f8f8 ... f8f8 f8f8 \n\ +pand %%xmm5, %%xmm0 # b7b6b5b4 b3______ b7b6b5b4 b3______ \n\ +pand %%xmm5, %%xmm1 # r7r6r5r4 r3______ r7r6r5r4 r3______ \n\ +movl $0xfcfcfcfc, %%eax # \n\ +movd %%eax, %%xmm5 # \n\ +pshufd $0, %%xmm5, %%xmm5 # set xmm5 to f8f8 f8f8 ... f8f8 f8f8 \n\ +pand %%xmm5, %%xmm2 # g7g6g5g4 g3g2____ g7g6g5g4 g3g2____ \n\ +psrlw $3,%%xmm0 # ______b7 b6b5b4b3 ______b7 b6b5b4b3 \n\ +pxor %%xmm4, %%xmm4 # zero mm4 \n\ +movdqa %%xmm0, %%xmm5 # Copy B15-B0 \n\ +movdqa %%xmm2, %%xmm7 # Copy G15-G0 \n\ + \n\ +# convert rgb24 plane to rgb16 pack for pixel 0-7 \n\ +punpcklbw %%xmm4, %%xmm2 # ________ ________ g7g6g5g4 g3g2____ \n\ +punpcklbw %%xmm1, %%xmm0 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\ +psllw $3,%%xmm2 # ________ __g7g6g5 g4g3g2__ ________ \n\ +por %%xmm2, %%xmm0 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\ +movdqu %%xmm0, (%3) # store pixel 0-7 \n\ + \n\ +# convert rgb24 plane to rgb16 pack for pixel 8-15 \n\ +punpckhbw %%xmm4, %%xmm7 # ________ ________ g7g6g5g4 g3g2____ \n\ +punpckhbw %%xmm1, %%xmm5 # r7r6r5r4 r3______ ______b7 b6b5b4b3 \n\ +psllw $3,%%xmm7 # ________ __g7g6g5 g4g3g2__ ________ \n\ +por %%xmm7, %%xmm5 # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 \n\ +movdqu %%xmm5, 16(%3) # store pixel 4-7 \n\ +" + +#define MMX_INTRINSICS_UNPACK_16 \ mm0 = _mm_and_si64(mm0, (__m64)mmx_mask_f8); \ mm2 = _mm_and_si64(mm2, (__m64)mmx_mask_fc); \ mm1 = _mm_and_si64(mm1, (__m64)mmx_mask_f8); \ mm0 = _mm_srli_pi16(mm0, 3); \ - mm4 = (__m64)(uint64_t)0; \ + mm4 = _mm_setzero_si64(); \ mm5 = mm0; \ mm7 = mm2; \ \ @@ -377,62 +733,294 @@ movq %%mm5, 8(%3) # store pixel 4-7 \n\ mm1 = (__m64)tmp64; \ *(uint64_t *)(p_buffer + 4) = (uint64_t)mm5; +#define SSE2_INTRINSICS_UNPACK_16_ALIGNED \ + xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \ + xmm0 = _mm_and_si128(xmm0, xmm5); \ + xmm1 = _mm_and_si128(xmm1, xmm5); \ + xmm5 = _mm_set1_epi32(0xfcfcfcfcUL); \ + xmm2 = _mm_and_si128(xmm2, xmm5); \ + xmm0 = _mm_srli_epi16(xmm0, 3); \ + xmm4 = _mm_setzero_si128(); \ + xmm5 = xmm0; \ + xmm7 = xmm2; \ + \ + xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm2 = _mm_slli_epi16(xmm2, 3); \ + 
xmm0 = _mm_or_si128(xmm0, xmm2); \ + _mm_stream_si128((__m128i*)p_buffer, xmm0); \ + \ + xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \ + xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \ + xmm7 = _mm_slli_epi16(xmm7, 3); \ + xmm5 = _mm_or_si128(xmm5, xmm7); \ + _mm_stream_si128((__m128i*)(p_buffer+8), xmm5); + +#define SSE2_INTRINSICS_UNPACK_16_UNALIGNED \ + xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \ + xmm0 = _mm_and_si128(xmm0, xmm5); \ + xmm1 = _mm_and_si128(xmm1, xmm5); \ + xmm5 = _mm_set1_epi32(0xfcfcfcfcUL); \ + xmm2 = _mm_and_si128(xmm2, xmm5); \ + xmm0 = _mm_srli_epi16(xmm0, 3); \ + xmm4 = _mm_setzero_si128(); \ + xmm5 = xmm0; \ + xmm7 = xmm2; \ + \ + xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm2 = _mm_slli_epi16(xmm2, 3); \ + xmm0 = _mm_or_si128(xmm0, xmm2); \ + _mm_storeu_si128((__m128i*)p_buffer, xmm0); \ + \ + xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \ + xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \ + xmm7 = _mm_slli_epi16(xmm7, 3); \ + xmm5 = _mm_or_si128(xmm5, xmm7); \ + _mm_storeu_si128((__m128i*)(p_buffer+8), xmm5); + /* * convert RGB plane to RGB packed format, - * mm0 -> B, mm1 -> R, mm2 -> G, mm3 -> 0, - * mm4 -> GB, mm5 -> AR pixel 4-7, - * mm6 -> GB, mm7 -> AR pixel 0-3 + * mm0 -> B, mm1 -> R, mm2 -> G */ -#define MMX_UNPACK_32 " \n\ +#define MMX_UNPACK_32_ARGB " \n\ pxor %%mm3, %%mm3 # zero mm3 \n\ -movq %%mm0, %%mm6 # B7 B6 B5 B4 B3 B2 B1 B0 \n\ -movq %%mm1, %%mm7 # R7 R6 R5 R4 R3 R2 R1 R0 \n\ movq %%mm0, %%mm4 # B7 B6 B5 B4 B3 B2 B1 B0 \n\ +punpcklbw %%mm2, %%mm4 # G3 B3 G2 B2 G1 B1 G0 B0 \n\ movq %%mm1, %%mm5 # R7 R6 R5 R4 R3 R2 R1 R0 \n\ -punpcklbw %%mm2, %%mm6 # G3 B3 G2 B2 G1 B1 G0 B0 \n\ -punpcklbw %%mm3, %%mm7 # 00 R3 00 R2 00 R1 00 R0 \n\ -punpcklwd %%mm7, %%mm6 # 00 R1 B1 G1 00 R0 B0 G0 \n\ -movq %%mm6, (%3) # Store ARGB1 ARGB0 \n\ -movq %%mm0, %%mm6 # B7 B6 B5 B4 B3 B2 B1 B0 \n\ -punpcklbw %%mm2, %%mm6 # G3 B3 G2 B2 G1 B1 G0 B0 \n\ -punpckhwd %%mm7, %%mm6 # 00 R3 G3 B3 00 R2 B3 G2 \n\ +punpcklbw %%mm3, %%mm5 # 00 R3 00 R2 00 R1 00 R0 \n\ +movq %%mm4, %%mm6 # G3 B3 G2 B2 G1 B1 G0 B0 \n\ +punpcklwd %%mm5, %%mm4 # 00 R1 B1 G1 00 R0 B0 G0 \n\ +movq %%mm4, (%3) # Store ARGB1 ARGB0 \n\ +punpckhwd %%mm5, %%mm6 # 00 R3 B3 G3 00 R2 B2 G2 \n\ movq %%mm6, 8(%3) # Store ARGB3 ARGB2 \n\ -punpckhbw %%mm2, %%mm4 # G7 B7 G6 B6 G5 B5 G4 B4 \n\ -punpckhbw %%mm3, %%mm5 # 00 R7 00 R6 00 R5 00 R4 \n\ -punpcklwd %%mm5, %%mm4 # 00 R5 B5 G5 00 R4 B4 G4 \n\ -movq %%mm4, 16(%3) # Store ARGB5 ARGB4 \n\ -movq %%mm0, %%mm4 # B7 B6 B5 B4 B3 B2 B1 B0 \n\ -punpckhbw %%mm2, %%mm4 # G7 B7 G6 B6 G5 B5 G4 B4 \n\ -punpckhwd %%mm5, %%mm4 # 00 R7 G7 B7 00 R6 B6 G6 \n\ -movq %%mm4, 24(%3) # Store ARGB7 ARGB6 \n\ - \n\ -#movd 4(%1), %%mm0 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0 \n\ -#movd 4(%2), %%mm1 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0 \n\ -#pxor %%mm4, %%mm4 # zero mm4 \n\ -#movq 8(%0), %%mm6 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ +punpckhbw %%mm2, %%mm0 # G7 B7 G6 B6 G5 B5 G4 B4 \n\ +punpckhbw %%mm3, %%mm1 # 00 R7 00 R6 00 R5 00 R4 \n\ +movq %%mm0, %%mm5 # G7 B7 G6 B6 G5 B5 G4 B4 \n\ +punpcklwd %%mm1, %%mm5 # 00 R5 B5 G5 00 R4 B4 G4 \n\ +movq %%mm5, 16(%3) # Store ARGB5 ARGB4 \n\ +punpckhwd %%mm1, %%mm0 # 00 R7 B7 G7 00 R6 B6 G6 \n\ +movq %%mm0, 24(%3) # Store ARGB7 ARGB6 \n\ +" + +#define SSE2_UNPACK_32_ARGB_ALIGNED " \n\ +pxor %%xmm3, %%xmm3 # zero xmm3 \n\ +movdqa %%xmm0, %%xmm4 # B7 B6 B5 B4 B3 B2 B1 B0 \n\ +punpcklbw %%xmm2, %%xmm4 # G3 B3 G2 B2 G1 B1 G0 B0 \n\ +movdqa %%xmm1, %%xmm5 # R7 R6 R5 R4 R3 R2 R1 R0 \n\ +punpcklbw %%xmm3, %%xmm5 # 00 R3 00 R2 00 R1 00 R0 \n\ +movdqa 
%%xmm4, %%xmm6 # G3 B3 G2 B2 G1 B1 G0 B0 \n\ +punpcklwd %%xmm5, %%xmm4 # 00 R1 B1 G1 00 R0 B0 G0 \n\ +movntdq %%xmm4, (%3) # Store ARGB3 ARGB2 ARGB1 ARGB0 \n\ +punpckhwd %%xmm5, %%xmm6 # 00 R3 B3 G3 00 R2 B2 G2 \n\ +movntdq %%xmm6, 16(%3) # Store ARGB7 ARGB6 ARGB5 ARGB4 \n\ +punpckhbw %%xmm2, %%xmm0 # G7 B7 G6 B6 G5 B5 G4 B4 \n\ +punpckhbw %%xmm3, %%xmm1 # 00 R7 00 R6 00 R5 00 R4 \n\ +movdqa %%xmm0, %%xmm5 # G7 B7 G6 B6 G5 B5 G4 B4 \n\ +punpcklwd %%xmm1, %%xmm5 # 00 R5 B5 G5 00 R4 B4 G4 \n\ +movntdq %%xmm5, 32(%3) # Store ARGB11 ARGB10 ARGB9 ARGB8 \n\ +punpckhwd %%xmm1, %%xmm0 # 00 R7 B7 G7 00 R6 B6 G6 \n\ +movntdq %%xmm0, 48(%3) # Store ARGB15 ARGB14 ARGB13 ARGB12 \n\ +" + +#define SSE2_UNPACK_32_ARGB_UNALIGNED " \n\ +pxor %%xmm3, %%xmm3 # zero xmm3 \n\ +movdqa %%xmm0, %%xmm4 # B7 B6 B5 B4 B3 B2 B1 B0 \n\ +punpcklbw %%xmm2, %%xmm4 # G3 B3 G2 B2 G1 B1 G0 B0 \n\ +movdqa %%xmm1, %%xmm5 # R7 R6 R5 R4 R3 R2 R1 R0 \n\ +punpcklbw %%xmm3, %%xmm5 # 00 R3 00 R2 00 R1 00 R0 \n\ +movdqa %%xmm4, %%xmm6 # G3 B3 G2 B2 G1 B1 G0 B0 \n\ +punpcklwd %%xmm5, %%xmm4 # 00 R1 B1 G1 00 R0 B0 G0 \n\ +movdqu %%xmm4, (%3) # Store ARGB3 ARGB2 ARGB1 ARGB0 \n\ +punpckhwd %%xmm5, %%xmm6 # 00 R3 B3 G3 00 R2 B2 G2 \n\ +movdqu %%xmm6, 16(%3) # Store ARGB7 ARGB6 ARGB5 ARGB4 \n\ +punpckhbw %%xmm2, %%xmm0 # G7 B7 G6 B6 G5 B5 G4 B4 \n\ +punpckhbw %%xmm3, %%xmm1 # 00 R7 00 R6 00 R5 00 R4 \n\ +movdqa %%xmm0, %%xmm5 # G7 B7 G6 B6 G5 B5 G4 B4 \n\ +punpcklwd %%xmm1, %%xmm5 # 00 R5 B5 G5 00 R4 B4 G4 \n\ +movdqu %%xmm5, 32(%3) # Store ARGB11 ARGB10 ARGB9 ARGB8 \n\ +punpckhwd %%xmm1, %%xmm0 # 00 R7 B7 G7 00 R6 B6 G6 \n\ +movdqu %%xmm0, 48(%3) # Store ARGB15 ARGB14 ARGB13 ARGB12 \n\ " -#define INTRINSICS_UNPACK_32 \ - mm3 = (__m64)(uint64_t)0; \ - mm6 = mm0; \ - mm7 = mm1; \ +#define MMX_INTRINSICS_UNPACK_32_ARGB \ + mm3 = _mm_setzero_si64(); \ mm4 = mm0; \ + mm4 = _mm_unpacklo_pi8(mm4, mm2); \ mm5 = mm1; \ - mm6 = _mm_unpacklo_pi8(mm6, mm2); \ - mm7 = _mm_unpacklo_pi8(mm7, mm3); \ - mm6 = _mm_unpacklo_pi16(mm6, mm7); \ - *(uint64_t *)p_buffer = (uint64_t)mm6; \ - mm6 = mm0; \ - mm6 = _mm_unpacklo_pi8(mm6, mm2); \ - mm6 = _mm_unpackhi_pi16(mm6, mm7); \ - *(uint64_t *)(p_buffer + 2) = (uint64_t)mm6; \ - mm4 = _mm_unpackhi_pi8(mm4, mm2); \ - mm5 = _mm_unpackhi_pi8(mm5, mm3); \ + mm5 = _mm_unpacklo_pi8(mm5, mm3); \ + mm6 = mm4; \ mm4 = _mm_unpacklo_pi16(mm4, mm5); \ - *(uint64_t *)(p_buffer + 4) = (uint64_t)mm4; \ - mm4 = mm0; \ - mm4 = _mm_unpackhi_pi8(mm4, mm2); \ - mm4 = _mm_unpackhi_pi16(mm4, mm5); \ - *(uint64_t *)(p_buffer + 6) = (uint64_t)mm4; \ + *(uint64_t *)p_buffer = (uint64_t)mm4; \ + mm6 = _mm_unpackhi_pi16(mm6, mm5); \ + *(uint64_t *)(p_buffer + 2) = (uint64_t)mm6; \ + mm0 = _mm_unpackhi_pi8(mm0, mm2); \ + mm1 = _mm_unpackhi_pi8(mm1, mm3); \ + mm5 = mm0; \ + mm5 = _mm_unpacklo_pi16(mm5, mm1); \ + *(uint64_t *)(p_buffer + 4) = (uint64_t)mm5; \ + mm0 = _mm_unpackhi_pi16(mm0, mm1); \ + *(uint64_t *)(p_buffer + 6) = (uint64_t)mm0; + +#define SSE2_INTRINSICS_UNPACK_32_ARGB_ALIGNED \ + xmm3 = _mm_setzero_si128(); \ + xmm4 = xmm0; \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \ + xmm5 = xmm1; \ + xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \ + xmm6 = xmm4; \ + xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \ + _mm_stream_si128((__m128i*)(p_buffer), xmm4); \ + xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \ + _mm_stream_si128((__m128i*)(p_buffer+4), xmm6); \ + xmm0 = _mm_unpackhi_epi8(xmm0, xmm2); \ + xmm1 = _mm_unpackhi_epi8(xmm1, xmm3); \ + xmm5 = xmm0; \ + xmm5 = _mm_unpacklo_epi16(xmm5, xmm1); \ + _mm_stream_si128((__m128i*)(p_buffer+8), xmm5); \ + xmm0 
= _mm_unpackhi_epi16(xmm0, xmm1); \ + _mm_stream_si128((__m128i*)(p_buffer+12), xmm0); + +#define SSE2_INTRINSICS_UNPACK_32_ARGB_UNALIGNED \ + xmm3 = _mm_setzero_si128(); \ + xmm4 = xmm0; \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \ + xmm5 = xmm1; \ + xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \ + xmm6 = xmm4; \ + xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \ + _mm_storeu_si128((__m128i*)(p_buffer), xmm4); \ + xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \ + _mm_storeu_si128((__m128i*)(p_buffer+4), xmm6); \ + xmm0 = _mm_unpackhi_epi8(xmm0, xmm2); \ + xmm1 = _mm_unpackhi_epi8(xmm1, xmm3); \ + xmm5 = xmm0; \ + xmm5 = _mm_unpacklo_epi16(xmm5, xmm1); \ + _mm_storeu_si128((__m128i*)(p_buffer+8), xmm5); \ + xmm0 = _mm_unpackhi_epi16(xmm0, xmm1); \ + _mm_storeu_si128((__m128i*)(p_buffer+12), xmm0); + +#define MMX_UNPACK_32_BGRA " \n\ +pxor %%mm3, %%mm3 # zero mm3 \n\ +movq %%mm2, %%mm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\ +punpcklbw %%mm0, %%mm4 # B3 G3 B2 G2 B1 G1 B0 G0 \n\ +punpcklbw %%mm1, %%mm3 # R3 00 R2 00 R1 00 R0 00 \n\ +movq %%mm3, %%mm5 # R3 00 R2 00 R1 00 R0 00 \n\ +punpcklwd %%mm4, %%mm3 # B1 G1 R1 00 B0 G0 R0 00 \n\ +movq %%mm3, (%3) # Store BGRA1 BGRA0 \n\ +punpckhwd %%mm4, %%mm5 # B3 G3 R3 00 B2 G2 R2 00 \n\ +movq %%mm5, 8(%3) # Store BGRA3 BGRA2 \n\ +pxor %%mm3, %%mm3 # zero mm3 \n\ +movq %%mm2, %%mm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\ +punpckhbw %%mm0, %%mm4 # B7 G7 B6 G6 B5 G5 B4 G4 \n\ +punpckhbw %%mm1, %%mm3 # R7 00 R6 00 R5 00 R4 00 \n\ +movq %%mm3, %%mm5 # R7 00 R6 00 R5 00 R4 00 \n\ +punpcklwd %%mm1, %%mm3 # B5 G5 R5 00 B4 G4 R4 00 \n\ +movq %%mm3, 16(%3) # Store BGRA5 BGRA4 \n\ +punpckhwd %%mm4, %%mm5 # B7 G7 R7 00 B6 G6 R6 00 \n\ +movq %%mm5, 24(%3) # Store BGRA7 BGRA6 \n\ +" + +#define SSE2_UNPACK_32_BGRA_ALIGNED " \n\ +pxor %%xmm3, %%xmm3 # zero mm3 \n\ +movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\ +punpcklbw %%xmm0, %%xmm4 # B3 G3 B2 G2 B1 G1 B0 G0 \n\ +punpcklbw %%xmm1, %%xmm3 # R3 00 R2 00 R1 00 R0 00 \n\ +movdqa %%xmm3, %%xmm5 # R3 00 R2 00 R1 00 R0 00 \n\ +punpcklwd %%xmm4, %%xmm3 # B1 G1 R1 00 B0 G0 R0 00 \n\ +movntdq %%xmm3, (%3) # Store BGRA3 BGRA2 BGRA1 BGRA0 \n\ +punpckhwd %%xmm4, %%xmm5 # B3 G3 R3 00 B2 G2 R2 00 \n\ +movntdq %%xmm5, 8(%3) # Store BGRA7 BGRA6 BGRA5 BGRA4 \n\ +pxor %%xmm3, %%xmm3 # zero mm3 \n\ +movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\ +punpckhbw %%xmm0, %%xmm4 # B7 G7 B6 G6 B5 G5 B4 G4 \n\ +punpckhbw %%xmm1, %%xmm3 # R7 00 R6 00 R5 00 R4 00 \n\ +movdqa %%xmm3, %%xmm5 # R7 00 R6 00 R5 00 R4 00 \n\ +punpcklwd %%xmm1, %%xmm3 # B5 G5 R5 00 B4 G4 R4 00 \n\ +movntdq %%xmm3, 16(%3) # Store BGRA11 BGRA10 BGRA9 BGRA8 \n\ +punpckhwd %%xmm4, %%xmm5 # B7 G7 R7 00 B6 G6 R6 00 \n\ +movntdq %%xmm5, 24(%3) # Store BGRA15 BGRA14 BGRA13 BGRA12 \n\ +" + +#define SSE2_UNPACK_32_BGRA_UNALIGNED " \n\ +pxor %%xmm3, %%xmm3 # zero mm3 \n\ +movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\ +punpcklbw %%xmm0, %%xmm4 # B3 G3 B2 G2 B1 G1 B0 G0 \n\ +punpcklbw %%xmm1, %%xmm3 # R3 00 R2 00 R1 00 R0 00 \n\ +movdqa %%xmm3, %%xmm5 # R3 00 R2 00 R1 00 R0 00 \n\ +punpcklwd %%xmm4, %%xmm3 # B1 G1 R1 00 B0 G0 R0 00 \n\ +movdqu %%xmm3, (%3) # Store BGRA3 BGRA2 BGRA1 BGRA0 \n\ +punpckhwd %%xmm4, %%xmm5 # B3 G3 R3 00 B2 G2 R2 00 \n\ +movdqu %%xmm5, 8(%3) # Store BGRA7 BGRA6 BGRA5 BGRA4 \n\ +pxor %%xmm3, %%xmm3 # zero mm3 \n\ +movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\ +punpckhbw %%xmm0, %%xmm4 # B7 G7 B6 G6 B5 G5 B4 G4 \n\ +punpckhbw %%xmm1, %%xmm3 # R7 00 R6 00 R5 00 R4 00 \n\ +movdqa %%xmm3, %%xmm5 # R7 00 R6 00 R5 00 R4 00 \n\ +punpcklwd %%xmm1, %%xmm3 # B5 G5 R5 00 B4 G4 R4 00 \n\ 
+movdqu %%xmm3, 16(%3) # Store BGRA11 BGRA10 BGRA9 BGRA8 \n\ +punpckhwd %%xmm4, %%xmm5 # B7 G7 R7 00 B6 G6 R6 00 \n\ +movdqu %%xmm5, 24(%3) # Store BGRA15 BGRA14 BGRA13 BGRA12 \n\ +" + +#define MMX_INTRINSICS_UNPACK_32_BGRA \ + mm3 = _mm_setzero_si64(); \ + mm4 = mm2; \ + mm4 = _mm_unpacklo_pi8(mm4, mm0); \ + mm1 = _mm_unpacklo_pi8(mm1, mm3); \ + mm5 = mm3; \ + mm3 = _mm_unpacklo_pi16(mm3, mm4); \ + *(uint64_t *)p_buffer = (uint64_t)mm3; \ + mm5 = _mm_unpackhi_pi16(mm5, mm4); \ + *(uint64_t *)(p_buffer + 2) = (uint64_t)mm5; \ + mm3 = _mm_setzero_si64(); \ + mm4 = mm2; \ + mm0 = _mm_unpackhi_pi8(mm0, mm4); \ + mm1 = _mm_unpackhi_pi8(mm1, mm3); \ + mm5 = mm3; \ + mm3 = _mm_unpacklo_pi16(mm3, mm1); \ + *(uint64_t *)(p_buffer + 4) = (uint64_t)mm3; \ + mm5 = _mm_unpackhi_pi16(mm5, mm4); \ + *(uint64_t *)(p_buffer + 6) = (uint64_t)mm5; \ + +#define SSE2_INTRINSICS_UNPACK_32_BGRA_ALIGNED \ + xmm3 = _mm_setzero_si128(); \ + xmm4 = xmm2; \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm0); \ + xmm1 = _mm_unpacklo_epi8(xmm1, xmm3); \ + xmm5 = xmm3; \ + xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \ + _mm_stream_si128((__m128i*)(p_buffer), xmm3); \ + xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \ + _mm_stream_si128((__m128i*)(p_buffer+4), xmm5); \ + xmm3 = _mm_setzero_si128(); \ + xmm4 = xmm2; \ + xmm0 = _mm_unpackhi_epi8(xmm0, xmm4); \ + xmm1 = _mm_unpackhi_epi8(xmm1, xmm3); \ + xmm5 = xmm3; \ + xmm3 = _mm_unpacklo_epi16(xmm3, xmm1); \ + _mm_stream_si128((__m128i*)(p_buffer+8), xmm3); \ + xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \ + _mm_stream_si128((__m128i*)(p_buffer+12), xmm5); \ + +#define SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED \ + xmm3 = _mm_setzero_si128(); \ + xmm4 = xmm2; \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm0); \ + xmm1 = _mm_unpacklo_epi8(xmm1, xmm3); \ + xmm5 = xmm3; \ + xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \ + _mm_storeu_si128((__m128i*)(p_buffer), xmm3); \ + xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \ + _mm_storeu_si128((__m128i*)(p_buffer+4), xmm5); \ + xmm3 = _mm_setzero_si128(); \ + xmm4 = xmm2; \ + xmm0 = _mm_unpackhi_epi8(xmm0, xmm4); \ + xmm1 = _mm_unpackhi_epi8(xmm1, xmm3); \ + xmm5 = xmm3; \ + xmm3 = _mm_unpacklo_epi16(xmm3, xmm1); \ + _mm_storeu_si128((__m128i*)(p_buffer+8), xmm3); \ + xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \ + _mm_storeu_si128((__m128i*)(p_buffer+12), xmm5); \ diff --git a/modules/video_chroma/i420_yuy2.c b/modules/video_chroma/i420_yuy2.c index 6901c7337a..8a76fcb46f 100644 --- a/modules/video_chroma/i420_yuy2.c +++ b/modules/video_chroma/i420_yuy2.c @@ -306,7 +306,8 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source, } #if defined (MODULE_NAME_IS_i420_yuy2_mmx) - __asm__ __volatile__("emms" :: ); + /* re-enable FPU registers */ + __asm__ __volatile__ ( "emms" ); #endif #if defined (MODULE_NAME_IS_i420_yuy2_altivec) @@ -347,6 +348,8 @@ static void I420_YUY2( vout_thread_t *p_vout, picture_t *p_source, p_line1 += i_dest_margin; p_line2 += i_dest_margin; } + /* make sure all SSE2 stores are visible thereafter */ + __asm__ __volatile__ ( "sfence" ); } else { @@ -514,7 +517,8 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, } #if defined (MODULE_NAME_IS_i420_yuy2_mmx) - __asm__ __volatile__("emms" :: ); + /* re-enable FPU registers */ + __asm__ __volatile__ ( "emms" ); #endif #if defined (MODULE_NAME_IS_i420_yuy2_altivec) @@ -554,6 +558,8 @@ static void I420_YVYU( vout_thread_t *p_vout, picture_t *p_source, p_line1 += i_dest_margin; p_line2 += i_dest_margin; } + /* make sure all SSE2 stores are visible thereafter */ + __asm__ __volatile__ ( 
"sfence" ); } else { @@ -720,7 +726,8 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source, } #if defined (MODULE_NAME_IS_i420_yuy2_mmx) - __asm__ __volatile__("emms" :: ); + /* re-enable FPU registers */ + __asm__ __volatile__ ( "emms" ); #endif #if defined (MODULE_NAME_IS_i420_yuy2_altivec) @@ -760,6 +767,8 @@ static void I420_UYVY( vout_thread_t *p_vout, picture_t *p_source, p_line1 += i_dest_margin; p_line2 += i_dest_margin; } + /* make sure all SSE2 stores are visible thereafter */ + __asm__ __volatile__ ( "sfence" ); } else { @@ -861,7 +870,8 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source, } #if defined (MODULE_NAME_IS_i420_yuy2_mmx) - __asm__ __volatile__("emms" :: ); + /* re-enable FPU registers */ + __asm__ __volatile__ ( "emms" ); #endif #else // defined(MODULE_NAME_IS_i420_yuy2_sse2) @@ -897,6 +907,8 @@ static void I420_cyuv( vout_thread_t *p_vout, picture_t *p_source, p_line1 += i_dest_margin; p_line2 += i_dest_margin; } + /* make sure all SSE2 stores are visible thereafter */ + __asm__ __volatile__ ( "sfence" ); } else { diff --git a/modules/video_chroma/i420_yuy2.h b/modules/video_chroma/i420_yuy2.h index a9549131ec..441e578481 100644 --- a/modules/video_chroma/i420_yuy2.h +++ b/modules/video_chroma/i420_yuy2.h @@ -136,14 +136,14 @@ movdqa (%3), %%xmm3 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ punpcklbw %%xmm2, %%xmm1 # v7 u7 v6 u6 .. u1 v0 u0 \n\ movdqa %%xmm0, %%xmm2 # y15 y14 y13 .. y2 y1 y0 \n\ punpcklbw %%xmm1, %%xmm2 # v3 y7 u3 .. v0 y1 u0 y0 \n\ -movdqa %%xmm2, (%0) # Store low YUYV \n\ +movntdq %%xmm2, (%0) # Store low YUYV \n\ punpckhbw %%xmm1, %%xmm0 # v3 y7 u3 y6 v2 y5 u2 y4 \n\ -movdqa %%xmm0, 16(%0) # Store high YUYV \n\ +movntdq %%xmm0, 16(%0) # Store high YUYV \n\ movdqa %%xmm3, %%xmm4 # Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ punpcklbw %%xmm1, %%xmm4 # v1 Y3 u1 Y2 v0 Y1 u0 Y0 \n\ -movdqa %%xmm4, (%1) # Store low YUYV \n\ +movntdq %%xmm4, (%1) # Store low YUYV \n\ punpckhbw %%xmm1, %%xmm3 # v3 Y7 u3 Y6 v2 Y5 u2 Y4 \n\ -movdqa %%xmm3, 16(%1) # Store high YUYV \n\ +movntdq %%xmm3, 16(%1) # Store high YUYV \n\ " #define SSE2_YUV420_YUYV_UNALIGNED " \n\ @@ -172,14 +172,14 @@ movdqa (%3), %%xmm3 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ punpcklbw %%xmm2, %%xmm1 # u3 v3 u2 v2 u1 v1 u0 v0 \n\ movdqa %%xmm0, %%xmm2 # y7 y6 y5 y4 y3 y2 y1 y0 \n\ punpcklbw %%xmm1, %%xmm2 # u1 y3 v1 y2 u0 y1 v0 y0 \n\ -movdqa %%xmm2, (%0) # Store low YUYV \n\ +movntdq %%xmm2, (%0) # Store low YUYV \n\ punpckhbw %%xmm1, %%xmm0 # u3 y7 v3 y6 u2 y5 v2 y4 \n\ -movdqa %%xmm0, 16(%0) # Store high YUYV \n\ +movntdq %%xmm0, 16(%0) # Store high YUYV \n\ movdqa %%xmm3, %%xmm4 # Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ punpcklbw %%xmm1, %%xmm4 # u1 Y3 v1 Y2 u0 Y1 v0 Y0 \n\ -movdqa %%xmm4, (%1) # Store low YUYV \n\ +movntdq %%xmm4, (%1) # Store low YUYV \n\ punpckhbw %%xmm1, %%xmm3 # u3 Y7 v3 Y6 u2 Y5 v2 Y4 \n\ -movdqa %%xmm3, 16(%1) # Store high YUYV \n\ +movntdq %%xmm3, 16(%1) # Store high YUYV \n\ " #define SSE2_YUV420_YVYU_UNALIGNED " \n\ @@ -208,15 +208,15 @@ movdqa (%3), %%xmm3 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ punpcklbw %%xmm2, %%xmm1 # v3 u3 v2 u2 v1 u1 v0 u0 \n\ movdqa %%xmm1, %%xmm2 # v3 u3 v2 u2 v1 u1 v0 u0 \n\ punpcklbw %%xmm0, %%xmm2 # y3 v1 y2 u1 y1 v0 y0 u0 \n\ -movdqa %%xmm2, (%0) # Store low UYVY \n\ +movntdq %%xmm2, (%0) # Store low UYVY \n\ movdqa %%xmm1, %%xmm2 # u3 v3 u2 v2 u1 v1 u0 v0 \n\ punpckhbw %%xmm0, %%xmm2 # y3 v1 y2 u1 y1 v0 y0 u0 \n\ -movdqa %%xmm2, 16(%0) # Store high UYVY \n\ +movntdq %%xmm2, 16(%0) # Store high UYVY \n\ movdqa %%xmm1, %%xmm4 # u3 v3 u2 
v2 u1 v1 u0 v0 \n\ punpcklbw %%xmm3, %%xmm4 # Y3 v1 Y2 u1 Y1 v0 Y0 u0 \n\ -movdqa %%xmm4, (%1) # Store low UYVY \n\ +movntdq %%xmm4, (%1) # Store low UYVY \n\ punpckhbw %%xmm3, %%xmm1 # Y7 v3 Y6 u3 Y5 v2 Y4 u2 \n\ -movdqa %%xmm1, 16(%1) # Store high UYVY \n\ +movntdq %%xmm1, 16(%1) # Store high UYVY \n\ " #define SSE2_YUV420_UYVY_UNALIGNED " \n\ -- 2.39.5
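
A note on the conversion constants used above: the values loaded by SSE2_YUV_MUL and SSE2_INTRINSICS_YUV_MUL (0x253f, 0x4093, 0x3312, 0xf37d, 0xe5fc) are the usual ITU-R BT.601 YCbCr-to-RGB coefficients (1.164, 2.018, 1.596, -0.391, -0.813) scaled by 8192. The Y, Cb and Cr values are first promoted with psllw $3 and then multiplied with pmulhw, which keeps the high 16 bits of the signed 32-bit product, so the net effect per component is (value * coefficient) >> 13. A minimal scalar sketch of the same fixed-point math, for reference only (this helper is illustrative and not part of the patch):

    #include <stdint.h>
    #include <stdio.h>

    /* Saturate to 0..255, which is what packuswb does in the SIMD code. */
    static uint8_t clamp_u8(int v)
    {
        return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v);
    }

    /* Scalar equivalent of the SIMD fixed point: (x << 3) fed to pmulhw
     * keeps bits 16..31 of the product, i.e. (x * coeff) >> 13. */
    static void yuv_to_rgb(uint8_t y, uint8_t cb, uint8_t cr,
                           uint8_t *r, uint8_t *g, uint8_t *b)
    {
        int luma = (y < 16) ? 0 : y - 16;          /* psubusb saturates at 0 */
        luma = (luma * 0x253f) >> 13;              /*  1.164 * (Y - 16)      */
        int cb_blue  = ((cb - 128) * 0x4093) >> 13;          /*  2.018 * Cb  */
        int cr_red   = ((cr - 128) * 0x3312) >> 13;          /*  1.596 * Cr  */
        int cb_green = ((cb - 128) * (int16_t)0xf37d) >> 13; /* -0.391 * Cb  */
        int cr_green = ((cr - 128) * (int16_t)0xe5fc) >> 13; /* -0.813 * Cr  */

        *r = clamp_u8(luma + cr_red);
        *g = clamp_u8(luma + cb_green + cr_green);
        *b = clamp_u8(luma + cb_blue);
    }

    int main(void)
    {
        uint8_t r, g, b;
        yuv_to_rgb(81, 90, 240, &r, &g, &b);   /* roughly pure red in BT.601 */
        printf("R=%d G=%d B=%d\n", r, g, b);
        return 0;
    }

The UNPACK macros then only reshuffle these clamped bytes into the RV15/RV16/RV32 bit layouts selected in Activate().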
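Two details of the SSE2 paths are worth spelling out. The *_ALIGNED macro variants store with movntdq (_mm_stream_si128), a cache-bypassing non-temporal store that requires a 16-byte-aligned destination, while the *_UNALIGNED variants fall back to movdqu (_mm_storeu_si128). Because non-temporal stores are weakly ordered, the converters issue an sfence once the picture has been written, which is what the added __asm__ __volatile__ ( "sfence" ) lines in i420_yuy2.c do. A rough sketch of that dispatch pattern in intrinsics form; the helper name and loop below are illustrative only, not code taken from the patch:

    #include <emmintrin.h>  /* SSE2: _mm_stream_si128, _mm_storeu_si128 */
    #include <stdint.h>
    #include <stddef.h>

    /* Illustrative helper: stream 16-byte blocks when the destination is
     * 16-byte aligned (movntdq), otherwise use unaligned stores (movdqu). */
    static void copy_pixels_sse2(uint8_t *p_dst, const uint8_t *p_src, size_t n)
    {
        size_t i = 0;
        if (((uintptr_t)p_dst & 15) == 0)
        {
            for (; i + 16 <= n; i += 16)
            {
                __m128i v = _mm_loadu_si128((const __m128i *)(p_src + i));
                _mm_stream_si128((__m128i *)(p_dst + i), v);   /* movntdq */
            }
            /* Non-temporal stores are weakly ordered: fence before anyone
             * else (another thread, the display path) reads the buffer. */
            _mm_sfence();
        }
        else
        {
            for (; i + 16 <= n; i += 16)
            {
                __m128i v = _mm_loadu_si128((const __m128i *)(p_src + i));
                _mm_storeu_si128((__m128i *)(p_dst + i), v);   /* movdqu */
            }
        }
        for (; i < n; i++)                                     /* scalar tail */
            p_dst[i] = p_src[i];
    }

Writing a frame that the CPU will not read back soon is the ideal case for streaming stores, since they avoid evicting useful data from the cache; the price is the explicit fence before the picture is handed on.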