X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=modules%2Fvideo_chroma%2Fi420_rgb16.c;h=3f0c6734f5d3a20d09399c397a27a860edc1ce45;hb=79bc7d59facaeafe140ecc924b04c9cb6310df61;hp=27f0715a497bb3eaa7472cc413350b69ad9ea564;hpb=9acaa4b2e175fb575070d684acdb178bc7a542d2;p=vlc diff --git a/modules/video_chroma/i420_rgb16.c b/modules/video_chroma/i420_rgb16.c index 27f0715a49..3f0c6734f5 100644 --- a/modules/video_chroma/i420_rgb16.c +++ b/modules/video_chroma/i420_rgb16.c @@ -11,7 +11,7 @@ * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the @@ -25,28 +25,25 @@ /***************************************************************************** * Preamble *****************************************************************************/ -#include /* strerror() */ -#include /* malloc(), free() */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif #include +#include #include #include "i420_rgb.h" #if defined (MODULE_NAME_IS_i420_rgb) # include "i420_rgb_c.h" #elif defined (MODULE_NAME_IS_i420_rgb_mmx) -# if defined(HAVE_MMX_INTRINSICS) -# include -# endif # include "i420_rgb_mmx.h" #elif defined (MODULE_NAME_IS_i420_rgb_sse2) -# if defined(HAVE_SSE2_INTRINSICS) -# include -# endif # include "i420_rgb_mmx.h" #endif -static void SetOffset( int, int, int, int, vlc_bool_t *, +static void SetOffset( int, int, int, int, bool *, unsigned int *, int * ); #if defined (MODULE_NAME_IS_i420_rgb) @@ -60,8 +57,8 @@ static void SetOffset( int, int, int, int, vlc_bool_t *, * - input: 2 lines (2 Y lines, 1 U/V line) * - output: 1 line *****************************************************************************/ -void E_(I420_RGB16_dither)( vout_thread_t *p_vout, picture_t *p_src, - picture_t *p_dest ) +void I420_RGB16_dither( filter_t *p_filter, picture_t *p_src, + picture_t *p_dest ) { /* We got this one from the old arguments */ uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels; @@ -69,7 +66,7 @@ void E_(I420_RGB16_dither)( vout_thread_t *p_vout, picture_t *p_src, uint8_t *p_u = p_src->U_PIXELS; uint8_t *p_v = p_src->V_PIXELS; - vlc_bool_t b_hscale; /* horizontal scaling type */ + bool b_hscale; /* horizontal scaling type */ unsigned int i_vscale; /* vertical scaling type */ unsigned int i_x, i_y; /* horizontal and vertical indexes */ unsigned int i_real_y; /* y % 4 */ @@ -77,19 +74,19 @@ void E_(I420_RGB16_dither)( vout_thread_t *p_vout, picture_t *p_src, int i_right_margin; int i_rewind; int i_scale_count; /* scale modulo counter */ - int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */ + int i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */ uint16_t * p_pic_start; /* beginning of the current line for copy */ int i_uval, i_vval; /* U and V samples */ int i_red, i_green, i_blue; /* U and V modified samples */ - uint16_t * p_yuv = p_vout->chroma.p_sys->p_rgb16; + uint16_t * p_yuv = p_filter->p_sys->p_rgb16; uint16_t * p_ybase; /* Y dependant conversion table */ /* Conversion buffer pointer */ - uint16_t * p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer; + uint16_t * p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer; uint16_t * p_buffer; /* Offset array pointer */ - int * p_offset_start = p_vout->chroma.p_sys->p_offset; + int * p_offset_start = p_filter->p_sys->p_offset; int * p_offset; const int i_source_margin = p_src->p[0].i_pitch @@ -105,17 +102,17 @@ void E_(I420_RGB16_dither)( vout_thread_t *p_vout, picture_t *p_src, for(i_x = 0; i_x < 4; i_x++) { - dither10[i_x] = dither10[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift); - dither11[i_x] = dither11[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift); - dither12[i_x] = dither12[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift); - dither13[i_x] = dither13[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift); + dither10[i_x] = dither10[i_x] << (SHIFT - 4 + p_filter->fmt_out.video.i_rrshift); + dither11[i_x] = dither11[i_x] << (SHIFT - 4 + p_filter->fmt_out.video.i_rrshift); + dither12[i_x] = dither12[i_x] << (SHIFT - 4 + p_filter->fmt_out.video.i_rrshift); + dither13[i_x] = dither13[i_x] << (SHIFT - 4 + p_filter->fmt_out.video.i_rrshift); } i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; - if( p_vout->render.i_width & 7 ) + if( p_filter->fmt_in.video.i_width & 7 ) { - i_rewind = 8 - ( p_vout->render.i_width & 7 ); + i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 ); } else { @@ -125,22 +122,25 @@ void E_(I420_RGB16_dither)( vout_thread_t *p_vout, picture_t *p_src, /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1' * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */ - SetOffset( p_vout->render.i_width, p_vout->render.i_height, - p_vout->output.i_width, p_vout->output.i_height, + SetOffset( p_filter->fmt_in.video.i_width, + p_filter->fmt_in.video.i_height, + p_filter->fmt_out.video.i_width, + p_filter->fmt_out.video.i_height, &b_hscale, &i_vscale, p_offset_start ); /* * Perform conversion */ i_scale_count = ( i_vscale == 1 ) ? - p_vout->output.i_height : p_vout->render.i_height; - for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + p_filter->fmt_out.video.i_height : + p_filter->fmt_in.video.i_height; + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) { i_real_y = i_y & 0x3; p_pic_start = p_pic; p_buffer = b_hscale ? p_buffer_start : p_pic; - for ( i_x = p_vout->render.i_width / 8; i_x--; ) + for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; ) { int *p_dither = dither10; CONVERT_YUV_PIXEL_DITHER(2); @@ -211,8 +211,8 @@ void E_(I420_RGB16_dither)( vout_thread_t *p_vout, picture_t *p_src, #if defined (MODULE_NAME_IS_i420_rgb) -void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src, - picture_t *p_dest ) +void I420_RGB16( filter_t *p_filter, picture_t *p_src, + picture_t *p_dest ) { /* We got this one from the old arguments */ uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels; @@ -220,26 +220,26 @@ void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src, uint8_t *p_u = p_src->U_PIXELS; uint8_t *p_v = p_src->V_PIXELS; - vlc_bool_t b_hscale; /* horizontal scaling type */ + bool b_hscale; /* horizontal scaling type */ unsigned int i_vscale; /* vertical scaling type */ unsigned int i_x, i_y; /* horizontal and vertical indexes */ int i_right_margin; int i_rewind; int i_scale_count; /* scale modulo counter */ - int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */ + int i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */ uint16_t * p_pic_start; /* beginning of the current line for copy */ int i_uval, i_vval; /* U and V samples */ int i_red, i_green, i_blue; /* U and V modified samples */ - uint16_t * p_yuv = p_vout->chroma.p_sys->p_rgb16; + uint16_t * p_yuv = p_filter->p_sys->p_rgb16; uint16_t * p_ybase; /* Y dependant conversion table */ /* Conversion buffer pointer */ - uint16_t * p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer; + uint16_t * p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer; uint16_t * p_buffer; /* Offset array pointer */ - int * p_offset_start = p_vout->chroma.p_sys->p_offset; + int * p_offset_start = p_filter->p_sys->p_offset; int * p_offset; const int i_source_margin = p_src->p[0].i_pitch @@ -249,9 +249,9 @@ void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src, i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; - if( p_vout->render.i_width & 7 ) + if( p_filter->fmt_in.video.i_width & 7 ) { - i_rewind = 8 - ( p_vout->render.i_width & 7 ); + i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 ); } else { @@ -261,21 +261,24 @@ void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src, /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1' * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */ - SetOffset( p_vout->render.i_width, p_vout->render.i_height, - p_vout->output.i_width, p_vout->output.i_height, + SetOffset( p_filter->fmt_in.video.i_width, + p_filter->fmt_in.video.i_height, + p_filter->fmt_out.video.i_width, + p_filter->fmt_out.video.i_height, &b_hscale, &i_vscale, p_offset_start ); /* * Perform conversion */ i_scale_count = ( i_vscale == 1 ) ? - p_vout->output.i_height : p_vout->render.i_height; - for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + p_filter->fmt_out.video.i_height : + p_filter->fmt_in.video.i_height; + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) { p_pic_start = p_pic; p_buffer = b_hscale ? p_buffer_start : p_pic; - for ( i_x = p_vout->render.i_width / 8; i_x--; ) + for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; ) { CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2); CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2); @@ -309,10 +312,10 @@ void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src, } } -#else // defined (MODULE_NAME_IS_i420_rgb_mmx) +#else // ! defined (MODULE_NAME_IS_i420_rgb) -void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src, - picture_t *p_dest ) +void I420_R5G5B5( filter_t *p_filter, picture_t *p_src, + picture_t *p_dest ) { /* We got this one from the old arguments */ uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels; @@ -320,22 +323,22 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src, uint8_t *p_u = p_src->U_PIXELS; uint8_t *p_v = p_src->V_PIXELS; - vlc_bool_t b_hscale; /* horizontal scaling type */ + bool b_hscale; /* horizontal scaling type */ unsigned int i_vscale; /* vertical scaling type */ unsigned int i_x, i_y; /* horizontal and vertical indexes */ int i_right_margin; int i_rewind; int i_scale_count; /* scale modulo counter */ - int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */ + int i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */ uint16_t * p_pic_start; /* beginning of the current line for copy */ /* Conversion buffer pointer */ - uint16_t * p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer; + uint16_t * p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer; uint16_t * p_buffer; /* Offset array pointer */ - int * p_offset_start = p_vout->chroma.p_sys->p_offset; + int * p_offset_start = p_filter->p_sys->p_offset; int * p_offset; const int i_source_margin = p_src->p[0].i_pitch @@ -348,8 +351,10 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src, /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1' * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */ - SetOffset( p_vout->render.i_width, p_vout->render.i_height, - p_vout->output.i_width, p_vout->output.i_height, + SetOffset( p_filter->fmt_in.video.i_width, + p_filter->fmt_in.video.i_height, + p_filter->fmt_out.video.i_width, + p_filter->fmt_out.video.i_height, &b_hscale, &i_vscale, p_offset_start ); @@ -357,13 +362,14 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src, * Perform conversion */ i_scale_count = ( i_vscale == 1 ) ? - p_vout->output.i_height : p_vout->render.i_height; + p_filter->fmt_out.video.i_height : + p_filter->fmt_in.video.i_height; #if defined (MODULE_NAME_IS_i420_rgb_sse2) - if( p_vout->render.i_width & 15 ) + if( p_filter->fmt_in.video.i_width & 15 ) { - i_rewind = 16 - ( p_vout->render.i_width & 15 ); + i_rewind = 16 - ( p_filter->fmt_in.video.i_width & 15 ); } else { @@ -371,37 +377,29 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src, } /* - ** SSE2 128 bits fetch/store instructions are faster + ** SSE2 128 bits fetch/store instructions are faster ** if memory access is 16 bytes aligned */ p_buffer = b_hscale ? p_buffer_start : p_pic; if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch| p_dest->p->i_pitch| - ((int)p_y)| - ((int)p_buffer))) ) + ((intptr_t)p_y)| + ((intptr_t)p_buffer))) ) { /* use faster SSE2 aligned fetch and store */ - for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) { p_pic_start = p_pic; - for ( i_x = p_vout->render.i_width/16; i_x--; ) + for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; ) { -#if defined (CAN_COMPILE_SSE2) - __asm__( ".p2align 3" - SSE2_INIT_16_ALIGNED - SSE2_YUV_MUL - SSE2_YUV_ADD - SSE2_UNPACK_15_ALIGNED - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); -#else - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - SSE2_INTRINSICS_INIT_16_ALIGNED - SSE2_INTRINSICS_YUV_MUL - SSE2_INTRINSICS_YUV_ADD - SSE2_INTRINSICS_UNPACK_15_ALIGNED -#endif + SSE2_CALL ( + SSE2_INIT_16_ALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_15_ALIGNED + ); p_y += 16; p_u += 8; p_v += 8; @@ -416,23 +414,12 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src, p_v -= i_rewind >> 1; p_buffer -= i_rewind; -#if defined (CAN_COMPILE_SSE2) - __asm__( ".p2align 3" - SSE2_INIT_16_UNALIGNED - SSE2_YUV_MUL - SSE2_YUV_ADD - SSE2_UNPACK_15_UNALIGNED - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); -#else - { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - SSE2_INTRINSICS_INIT_16_UNALIGNED - SSE2_INTRINSICS_YUV_MUL - SSE2_INTRINSICS_YUV_ADD - SSE2_INTRINSICS_UNPACK_15_UNALIGNED - } -#endif + SSE2_CALL ( + SSE2_INIT_16_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_15_UNALIGNED + ); p_y += 16; p_u += 8; p_v += 8; @@ -448,37 +435,23 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src, } p_buffer = b_hscale ? p_buffer_start : p_pic; } - /* make sure all SSE2 stores are visible thereafter */ -#if defined (CAN_COMPILE_SSE2) - __asm__ __volatile__ ( "sfence" ); -#else - _mm_sfence(); -#endif } else { /* use slower SSE2 unaligned fetch and store */ - for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) { p_pic_start = p_pic; p_buffer = b_hscale ? p_buffer_start : p_pic; - for ( i_x = p_vout->render.i_width/16; i_x--; ) + for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; ) { -#if defined (CAN_COMPILE_SSE2) - __asm__( ".p2align 3" - SSE2_INIT_16_UNALIGNED - SSE2_YUV_MUL - SSE2_YUV_ADD - SSE2_UNPACK_15_UNALIGNED - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); -#else - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - SSE2_INTRINSICS_INIT_16_UNALIGNED - SSE2_INTRINSICS_YUV_MUL - SSE2_INTRINSICS_YUV_ADD - SSE2_INTRINSICS_UNPACK_15_UNALIGNED -#endif + SSE2_CALL ( + SSE2_INIT_16_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_15_UNALIGNED + ); p_y += 16; p_u += 8; p_v += 8; @@ -493,23 +466,12 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src, p_v -= i_rewind >> 1; p_buffer -= i_rewind; -#if defined (CAN_COMPILE_SSE2) - __asm__( ".p2align 3" - SSE2_INIT_16_UNALIGNED - SSE2_YUV_MUL - SSE2_YUV_ADD - SSE2_UNPACK_15_UNALIGNED - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); -#else - { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - SSE2_INTRINSICS_INIT_16_UNALIGNED - SSE2_INTRINSICS_YUV_MUL - SSE2_INTRINSICS_YUV_ADD - SSE2_INTRINSICS_UNPACK_15_UNALIGNED - } -#endif + SSE2_CALL ( + SSE2_INIT_16_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_15_UNALIGNED + ); p_y += 16; p_u += 8; p_v += 8; @@ -526,40 +488,34 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src, p_buffer = b_hscale ? p_buffer_start : p_pic; } } + + /* make sure all SSE2 stores are visible thereafter */ + SSE2_END; + #else // defined (MODULE_NAME_IS_i420_rgb_mmx) - if( p_vout->render.i_width & 7 ) + if( p_filter->fmt_in.video.i_width & 7 ) { - i_rewind = 8 - ( p_vout->render.i_width & 7 ); + i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 ); } else { i_rewind = 0; } - for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) { p_pic_start = p_pic; p_buffer = b_hscale ? p_buffer_start : p_pic; - for ( i_x = p_vout->render.i_width / 8; i_x--; ) + for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; ) { -#if defined (CAN_COMPILE_MMX) - __asm__( ".p2align 3" - MMX_INIT_16 - MMX_YUV_MUL - MMX_YUV_ADD - MMX_UNPACK_15 - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); -#else - __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; - uint64_t tmp64; - MMX_INTRINSICS_INIT_16 - MMX_INTRINSICS_YUV_MUL - MMX_INTRINSICS_YUV_ADD - MMX_INTRINSICS_UNPACK_15 -#endif - + MMX_CALL ( + MMX_INIT_16 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_15 + ); p_y += 8; p_u += 4; p_v += 4; @@ -575,24 +531,12 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src, p_v -= i_rewind >> 1; p_buffer -= i_rewind; -#if defined (CAN_COMPILE_MMX) - __asm__( ".p2align 3" - MMX_INIT_16 - MMX_YUV_MUL - MMX_YUV_ADD - MMX_UNPACK_15 - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); -#else - { - __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; - uint64_t tmp64; - - MMX_INTRINSICS_INIT_16 - MMX_INTRINSICS_YUV_MUL - MMX_INTRINSICS_YUV_ADD - MMX_INTRINSICS_UNPACK_15 - } -#endif + MMX_CALL ( + MMX_INIT_16 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_15 + ); p_y += 8; p_u += 4; p_v += 4; @@ -609,17 +553,13 @@ void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src, } } /* re-enable FPU registers */ -#if defined (CAN_COMPILE_MMX) - __asm__ __volatile__ ( "emms" ); -#else - _mm_empty(); -#endif + MMX_END; #endif } -void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src, - picture_t *p_dest ) +void I420_R5G6B5( filter_t *p_filter, picture_t *p_src, + picture_t *p_dest ) { /* We got this one from the old arguments */ uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels; @@ -627,22 +567,22 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src, uint8_t *p_u = p_src->U_PIXELS; uint8_t *p_v = p_src->V_PIXELS; - vlc_bool_t b_hscale; /* horizontal scaling type */ + bool b_hscale; /* horizontal scaling type */ unsigned int i_vscale; /* vertical scaling type */ unsigned int i_x, i_y; /* horizontal and vertical indexes */ int i_right_margin; int i_rewind; int i_scale_count; /* scale modulo counter */ - int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */ + int i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */ uint16_t * p_pic_start; /* beginning of the current line for copy */ /* Conversion buffer pointer */ - uint16_t * p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer; + uint16_t * p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer; uint16_t * p_buffer; /* Offset array pointer */ - int * p_offset_start = p_vout->chroma.p_sys->p_offset; + int * p_offset_start = p_filter->p_sys->p_offset; int * p_offset; const int i_source_margin = p_src->p[0].i_pitch @@ -655,8 +595,10 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src, /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1' * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */ - SetOffset( p_vout->render.i_width, p_vout->render.i_height, - p_vout->output.i_width, p_vout->output.i_height, + SetOffset( p_filter->fmt_in.video.i_width, + p_filter->fmt_in.video.i_height, + p_filter->fmt_out.video.i_width, + p_filter->fmt_out.video.i_height, &b_hscale, &i_vscale, p_offset_start ); @@ -664,13 +606,14 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src, * Perform conversion */ i_scale_count = ( i_vscale == 1 ) ? - p_vout->output.i_height : p_vout->render.i_height; + p_filter->fmt_out.video.i_height : + p_filter->fmt_in.video.i_height; #if defined (MODULE_NAME_IS_i420_rgb_sse2) - if( p_vout->render.i_width & 15 ) + if( p_filter->fmt_in.video.i_width & 15 ) { - i_rewind = 16 - ( p_vout->render.i_width & 15 ); + i_rewind = 16 - ( p_filter->fmt_in.video.i_width & 15 ); } else { @@ -678,37 +621,29 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src, } /* - ** SSE2 128 bits fetch/store instructions are faster + ** SSE2 128 bits fetch/store instructions are faster ** if memory access is 16 bytes aligned */ p_buffer = b_hscale ? p_buffer_start : p_pic; if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch| p_dest->p->i_pitch| - ((int)p_y)| - ((int)p_buffer))) ) + ((intptr_t)p_y)| + ((intptr_t)p_buffer))) ) { /* use faster SSE2 aligned fetch and store */ - for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) { p_pic_start = p_pic; - for ( i_x = p_vout->render.i_width/16; i_x--; ) + for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; ) { -#if defined (CAN_COMPILE_SSE2) - __asm__( ".p2align 3" - SSE2_INIT_16_ALIGNED - SSE2_YUV_MUL - SSE2_YUV_ADD - SSE2_UNPACK_16_ALIGNED - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); -#else - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - SSE2_INTRINSICS_INIT_16_ALIGNED - SSE2_INTRINSICS_YUV_MUL - SSE2_INTRINSICS_YUV_ADD - SSE2_INTRINSICS_UNPACK_16_ALIGNED -#endif + SSE2_CALL ( + SSE2_INIT_16_ALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_16_ALIGNED + ); p_y += 16; p_u += 8; p_v += 8; @@ -723,23 +658,12 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src, p_v -= i_rewind >> 1; p_buffer -= i_rewind; -#if defined (CAN_COMPILE_SSE2) - __asm__( ".p2align 3" - SSE2_INIT_16_UNALIGNED - SSE2_YUV_MUL - SSE2_YUV_ADD - SSE2_UNPACK_16_UNALIGNED - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); -#else - { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - SSE2_INTRINSICS_INIT_16_UNALIGNED - SSE2_INTRINSICS_YUV_MUL - SSE2_INTRINSICS_YUV_ADD - SSE2_INTRINSICS_UNPACK_16_UNALIGNED - } -#endif + SSE2_CALL ( + SSE2_INIT_16_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_16_UNALIGNED + ); p_y += 16; p_u += 8; p_v += 8; @@ -755,37 +679,23 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src, } p_buffer = b_hscale ? p_buffer_start : p_pic; } - /* make sure all SSE2 stores are visible thereafter */ -#if defined (CAN_COMPILE_SSE2) - __asm__ __volatile__ ( "sfence" ); -#else - _mm_sfence(); -#endif } else { /* use slower SSE2 unaligned fetch and store */ - for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) { p_pic_start = p_pic; p_buffer = b_hscale ? p_buffer_start : p_pic; - for ( i_x = p_vout->render.i_width/16; i_x--; ) + for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; ) { -#if defined (CAN_COMPILE_SSE2) - __asm__( ".p2align 3" - SSE2_INIT_16_UNALIGNED - SSE2_YUV_MUL - SSE2_YUV_ADD - SSE2_UNPACK_16_UNALIGNED - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); -#else - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - SSE2_INTRINSICS_INIT_16_UNALIGNED - SSE2_INTRINSICS_YUV_MUL - SSE2_INTRINSICS_YUV_ADD - SSE2_INTRINSICS_UNPACK_16_UNALIGNED -#endif + SSE2_CALL( + SSE2_INIT_16_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_16_UNALIGNED + ); p_y += 16; p_u += 8; p_v += 8; @@ -800,23 +710,12 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src, p_v -= i_rewind >> 1; p_buffer -= i_rewind; -#if defined (CAN_COMPILE_SSE2) - __asm__( ".p2align 3" - SSE2_INIT_16_UNALIGNED - SSE2_YUV_MUL - SSE2_YUV_ADD - SSE2_UNPACK_16_UNALIGNED - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); -#else - { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - SSE2_INTRINSICS_INIT_16_UNALIGNED - SSE2_INTRINSICS_YUV_MUL - SSE2_INTRINSICS_YUV_ADD - SSE2_INTRINSICS_UNPACK_16_UNALIGNED - } -#endif + SSE2_CALL( + SSE2_INIT_16_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_16_UNALIGNED + ); p_y += 16; p_u += 8; p_v += 8; @@ -833,40 +732,34 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src, p_buffer = b_hscale ? p_buffer_start : p_pic; } } + + /* make sure all SSE2 stores are visible thereafter */ + SSE2_END; + #else // defined (MODULE_NAME_IS_i420_rgb_mmx) - if( p_vout->render.i_width & 7 ) + if( p_filter->fmt_in.video.i_width & 7 ) { - i_rewind = 8 - ( p_vout->render.i_width & 7 ); + i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 ); } else { i_rewind = 0; } - for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) { p_pic_start = p_pic; p_buffer = b_hscale ? p_buffer_start : p_pic; - for ( i_x = p_vout->render.i_width / 8; i_x--; ) + for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; ) { -#if defined (CAN_COMPILE_MMX) - __asm__( ".p2align 3" - MMX_INIT_16 - MMX_YUV_MUL - MMX_YUV_ADD - MMX_UNPACK_16 - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); -#else - __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; - uint64_t tmp64; - MMX_INTRINSICS_INIT_16 - MMX_INTRINSICS_YUV_MUL - MMX_INTRINSICS_YUV_ADD - MMX_INTRINSICS_UNPACK_16 -#endif - + MMX_CALL ( + MMX_INIT_16 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_16 + ); p_y += 8; p_u += 4; p_v += 4; @@ -882,24 +775,12 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src, p_v -= i_rewind >> 1; p_buffer -= i_rewind; -#if defined (CAN_COMPILE_MMX) - __asm__( ".p2align 3" - MMX_INIT_16 - MMX_YUV_MUL - MMX_YUV_ADD - MMX_UNPACK_16 - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); -#else - { - __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; - uint64_t tmp64; - - MMX_INTRINSICS_INIT_16 - MMX_INTRINSICS_YUV_MUL - MMX_INTRINSICS_YUV_ADD - MMX_INTRINSICS_UNPACK_16 - } -#endif + MMX_CALL ( + MMX_INIT_16 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_16 + ); p_y += 8; p_u += 4; p_v += 4; @@ -916,11 +797,7 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src, } } /* re-enable FPU registers */ -#if defined (CAN_COMPILE_MMX) - __asm__ __volatile__ ( "emms" ); -#else - _mm_empty(); -#endif + MMX_END; #endif } @@ -940,8 +817,8 @@ void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src, #if defined (MODULE_NAME_IS_i420_rgb) -void E_(I420_RGB32)( vout_thread_t *p_vout, picture_t *p_src, - picture_t *p_dest ) +void I420_RGB32( filter_t *p_filter, picture_t *p_src, + picture_t *p_dest ) { /* We got this one from the old arguments */ uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels; @@ -949,26 +826,26 @@ void E_(I420_RGB32)( vout_thread_t *p_vout, picture_t *p_src, uint8_t *p_u = p_src->U_PIXELS; uint8_t *p_v = p_src->V_PIXELS; - vlc_bool_t b_hscale; /* horizontal scaling type */ + bool b_hscale; /* horizontal scaling type */ unsigned int i_vscale; /* vertical scaling type */ unsigned int i_x, i_y; /* horizontal and vertical indexes */ int i_right_margin; int i_rewind; int i_scale_count; /* scale modulo counter */ - int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */ + int i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */ uint32_t * p_pic_start; /* beginning of the current line for copy */ int i_uval, i_vval; /* U and V samples */ int i_red, i_green, i_blue; /* U and V modified samples */ - uint32_t * p_yuv = p_vout->chroma.p_sys->p_rgb32; + uint32_t * p_yuv = p_filter->p_sys->p_rgb32; uint32_t * p_ybase; /* Y dependant conversion table */ /* Conversion buffer pointer */ - uint32_t * p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer; + uint32_t * p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer; uint32_t * p_buffer; /* Offset array pointer */ - int * p_offset_start = p_vout->chroma.p_sys->p_offset; + int * p_offset_start = p_filter->p_sys->p_offset; int * p_offset; const int i_source_margin = p_src->p[0].i_pitch @@ -978,9 +855,9 @@ void E_(I420_RGB32)( vout_thread_t *p_vout, picture_t *p_src, i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; - if( p_vout->render.i_width & 7 ) + if( p_filter->fmt_in.video.i_width & 7 ) { - i_rewind = 8 - ( p_vout->render.i_width & 7 ); + i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 ); } else { @@ -990,21 +867,24 @@ void E_(I420_RGB32)( vout_thread_t *p_vout, picture_t *p_src, /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1' * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */ - SetOffset( p_vout->render.i_width, p_vout->render.i_height, - p_vout->output.i_width, p_vout->output.i_height, + SetOffset( p_filter->fmt_in.video.i_width, + p_filter->fmt_in.video.i_height, + p_filter->fmt_out.video.i_width, + p_filter->fmt_out.video.i_height, &b_hscale, &i_vscale, p_offset_start ); /* * Perform conversion */ i_scale_count = ( i_vscale == 1 ) ? - p_vout->output.i_height : p_vout->render.i_height; - for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + p_filter->fmt_out.video.i_height : + p_filter->fmt_in.video.i_height; + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) { p_pic_start = p_pic; p_buffer = b_hscale ? p_buffer_start : p_pic; - for ( i_x = p_vout->render.i_width / 8; i_x--; ) + for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; ) { CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); @@ -1039,7 +919,7 @@ void E_(I420_RGB32)( vout_thread_t *p_vout, picture_t *p_src, #else // defined (MODULE_NAME_IS_i420_rgb_mmx) || defined (MODULE_NAME_IS_i420_rgb_sse2) -void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src, +void I420_A8R8G8B8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest ) { /* We got this one from the old arguments */ @@ -1048,21 +928,21 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src, uint8_t *p_u = p_src->U_PIXELS; uint8_t *p_v = p_src->V_PIXELS; - vlc_bool_t b_hscale; /* horizontal scaling type */ + bool b_hscale; /* horizontal scaling type */ unsigned int i_vscale; /* vertical scaling type */ unsigned int i_x, i_y; /* horizontal and vertical indexes */ int i_right_margin; int i_rewind; int i_scale_count; /* scale modulo counter */ - int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */ + int i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */ uint32_t * p_pic_start; /* beginning of the current line for copy */ /* Conversion buffer pointer */ - uint32_t * p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer; + uint32_t * p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer; uint32_t * p_buffer; /* Offset array pointer */ - int * p_offset_start = p_vout->chroma.p_sys->p_offset; + int * p_offset_start = p_filter->p_sys->p_offset; int * p_offset; const int i_source_margin = p_src->p[0].i_pitch @@ -1075,21 +955,24 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src, /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1' * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */ - SetOffset( p_vout->render.i_width, p_vout->render.i_height, - p_vout->output.i_width, p_vout->output.i_height, + SetOffset( p_filter->fmt_in.video.i_width, + p_filter->fmt_in.video.i_height, + p_filter->fmt_out.video.i_width, + p_filter->fmt_out.video.i_height, &b_hscale, &i_vscale, p_offset_start ); /* * Perform conversion */ i_scale_count = ( i_vscale == 1 ) ? - p_vout->output.i_height : p_vout->render.i_height; + p_filter->fmt_out.video.i_height : + p_filter->fmt_in.video.i_height; #if defined (MODULE_NAME_IS_i420_rgb_sse2) - if( p_vout->render.i_width & 15 ) + if( p_filter->fmt_in.video.i_width & 15 ) { - i_rewind = 16 - ( p_vout->render.i_width & 15 ); + i_rewind = 16 - ( p_filter->fmt_in.video.i_width & 15 ); } else { @@ -1097,40 +980,29 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src, } /* - ** SSE2 128 bits fetch/store instructions are faster + ** SSE2 128 bits fetch/store instructions are faster ** if memory access is 16 bytes aligned */ p_buffer = b_hscale ? p_buffer_start : p_pic; if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch| p_dest->p->i_pitch| - ((int)p_y)| - ((int)p_buffer))) ) + ((intptr_t)p_y)| + ((intptr_t)p_buffer))) ) { /* use faster SSE2 aligned fetch and store */ - for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) { p_pic_start = p_pic; - for ( i_x = p_vout->render.i_width / 16; i_x--; ) + for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; ) { -#if defined (CAN_COMPILE_SSE2) - /* use inline SSE2 assembly */ - __asm__( ".p2align 3" - SSE2_INIT_32_ALIGNED - SSE2_YUV_MUL - SSE2_YUV_ADD - SSE2_UNPACK_32_ARGB_ALIGNED - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); -#else - /* otherwise use SSE2 C intrinsics wrappers */ - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - SSE2_INTRINSICS_INIT_32_ALIGNED - SSE2_INTRINSICS_YUV_MUL - SSE2_INTRINSICS_YUV_ADD - SSE2_INTRINSICS_UNPACK_32_ARGB_ALIGNED -#endif + SSE2_CALL ( + SSE2_INIT_32_ALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_ARGB_ALIGNED + ); p_y += 16; p_u += 8; p_v += 8; @@ -1145,25 +1017,12 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src, p_u -= i_rewind >> 1; p_v -= i_rewind >> 1; p_buffer -= i_rewind; -#if defined (CAN_COMPILE_SSE2) - /* use inline SSE2 assembly */ - __asm__( ".p2align 3" - SSE2_INIT_32_UNALIGNED - SSE2_YUV_MUL - SSE2_YUV_ADD - SSE2_UNPACK_32_ARGB_UNALIGNED - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); -#else - /* otherwise use SSE2 intrinsics wrappers */ - { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - SSE2_INTRINSICS_INIT_32_UNALIGNED - SSE2_INTRINSICS_YUV_MUL - SSE2_INTRINSICS_YUV_ADD - SSE2_INTRINSICS_UNPACK_32_ARGB_UNALIGNED - } -#endif + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_ARGB_UNALIGNED + ); p_y += 16; p_u += 4; p_v += 4; @@ -1179,40 +1038,23 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src, } p_buffer = b_hscale ? p_buffer_start : p_pic; } - /* make sure all SSE2 stores are visible thereafter */ -#if defined (CAN_COMPILE_SSE2) - __asm__ __volatile__ ( "sfence" ); -#else - _mm_sfence(); -#endif } else { /* use slower SSE2 unaligned fetch and store */ - for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) { p_pic_start = p_pic; p_buffer = b_hscale ? p_buffer_start : p_pic; - for ( i_x = p_vout->render.i_width / 16; i_x--; ) + for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; ) { -#if defined (CAN_COMPILE_SSE2) - /* use inline SSE2 assembly */ - __asm__( ".p2align 3" - SSE2_INIT_32_UNALIGNED - SSE2_YUV_MUL - SSE2_YUV_ADD - SSE2_UNPACK_32_ARGB_UNALIGNED - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); -#else - /* otherwise use SSE2 C intrinsics wrappers */ - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - SSE2_INTRINSICS_INIT_32_UNALIGNED - SSE2_INTRINSICS_YUV_MUL - SSE2_INTRINSICS_YUV_ADD - SSE2_INTRINSICS_UNPACK_32_ARGB_UNALIGNED -#endif + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_ARGB_UNALIGNED + ); p_y += 16; p_u += 8; p_v += 8; @@ -1227,25 +1069,12 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src, p_u -= i_rewind >> 1; p_v -= i_rewind >> 1; p_buffer -= i_rewind; -#if defined (CAN_COMPILE_SSE2) - /* use inline SSE2 assembly */ - __asm__( ".p2align 3" - SSE2_INIT_32_UNALIGNED - SSE2_YUV_MUL - SSE2_YUV_ADD - SSE2_UNPACK_32_ARGB_UNALIGNED - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); -#else - /* otherwise use SSE2 intrinsics wrappers */ - { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - SSE2_INTRINSICS_INIT_32_UNALIGNED - SSE2_INTRINSICS_YUV_MUL - SSE2_INTRINSICS_YUV_ADD - SSE2_INTRINSICS_UNPACK_32_ARGB_UNALIGNED - } -#endif + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_ARGB_UNALIGNED + ); p_y += 16; p_u += 8; p_v += 8; @@ -1263,44 +1092,33 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src, } } -#else + /* make sure all SSE2 stores are visible thereafter */ + SSE2_END; + +#else // defined (MODULE_NAME_IS_i420_rgb_mmx) - if( p_vout->render.i_width & 7 ) + if( p_filter->fmt_in.video.i_width & 7 ) { - i_rewind = 8 - ( p_vout->render.i_width & 7 ); + i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 ); } else { i_rewind = 0; } - for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) { p_pic_start = p_pic; p_buffer = b_hscale ? p_buffer_start : p_pic; - for ( i_x = p_vout->render.i_width / 8; i_x--; ) + for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; ) { -#if defined (CAN_COMPILE_MMX) - /* use inline MMX assembly */ - __asm__( MMX_INIT_32 - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); - - __asm__( ".p2align 3" - MMX_YUV_MUL - MMX_YUV_ADD - MMX_UNPACK_32_ARGB - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); -#else - /* otherwise use MMX C intrinsics wrappers */ - __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; - uint64_t tmp64; - - MMX_INTRINSICS_INIT_32 - MMX_INTRINSICS_YUV_MUL - MMX_INTRINSICS_YUV_ADD - MMX_INTRINSICS_UNPACK_32_ARGB -#endif + MMX_CALL ( + MMX_INIT_32 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_32_ARGB + ); p_y += 8; p_u += 4; p_v += 4; @@ -1315,26 +1133,254 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src, p_u -= i_rewind >> 1; p_v -= i_rewind >> 1; p_buffer -= i_rewind; -#if defined (CAN_COMPILE_MMX) - /* use inline MMX assembly */ - __asm__( ".p2align 3" - MMX_INIT_32 - MMX_YUV_MUL - MMX_YUV_ADD - MMX_UNPACK_32_ARGB - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); -#else - /* otherwise use MMX intrinsics wrappers */ + MMX_CALL ( + MMX_INIT_32 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_32_ARGB + ); + p_y += 8; + p_u += 4; + p_v += 4; + p_buffer += 8; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 4 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + } + + /* re-enable FPU registers */ + MMX_END; + +#endif +} + +void I420_R8G8B8A8( filter_t *p_filter, picture_t *p_src, + picture_t *p_dest ) +{ + /* We got this one from the old arguments */ + uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels; + uint8_t *p_y = p_src->Y_PIXELS; + uint8_t *p_u = p_src->U_PIXELS; + uint8_t *p_v = p_src->V_PIXELS; + + bool b_hscale; /* horizontal scaling type */ + unsigned int i_vscale; /* vertical scaling type */ + unsigned int i_x, i_y; /* horizontal and vertical indexes */ + + int i_right_margin; + int i_rewind; + int i_scale_count; /* scale modulo counter */ + int i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */ + uint32_t * p_pic_start; /* beginning of the current line for copy */ + /* Conversion buffer pointer */ + uint32_t * p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer; + uint32_t * p_buffer; + + /* Offset array pointer */ + int * p_offset_start = p_filter->p_sys->p_offset; + int * p_offset; + + const int i_source_margin = p_src->p[0].i_pitch + - p_src->p[0].i_visible_pitch; + const int i_source_margin_c = p_src->p[1].i_pitch + - p_src->p[1].i_visible_pitch; + + i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; + + /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered + * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1' + * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */ + SetOffset( p_filter->fmt_in.video.i_width, + p_filter->fmt_in.video.i_height, + p_filter->fmt_out.video.i_width, + p_filter->fmt_out.video.i_height, + &b_hscale, &i_vscale, p_offset_start ); + + /* + * Perform conversion + */ + i_scale_count = ( i_vscale == 1 ) ? + p_filter->fmt_out.video.i_height : + p_filter->fmt_in.video.i_height; + +#if defined (MODULE_NAME_IS_i420_rgb_sse2) + + if( p_filter->fmt_in.video.i_width & 15 ) + { + i_rewind = 16 - ( p_filter->fmt_in.video.i_width & 15 ); + } + else + { + i_rewind = 0; + } + + /* + ** SSE2 128 bits fetch/store instructions are faster + ** if memory access is 16 bytes aligned + */ + + p_buffer = b_hscale ? p_buffer_start : p_pic; + if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch| + p_dest->p->i_pitch| + ((intptr_t)p_y)| + ((intptr_t)p_buffer))) ) + { + /* use faster SSE2 aligned fetch and store */ + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) + { + p_pic_start = p_pic; + + for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; ) { - __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; - uint64_t tmp64; + SSE2_CALL ( + SSE2_INIT_32_ALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_RGBA_ALIGNED + ); + p_y += 16; + p_u += 8; + p_v += 8; + p_buffer += 16; + } - MMX_INTRINSICS_INIT_32 - MMX_INTRINSICS_YUV_MUL - MMX_INTRINSICS_YUV_ADD - MMX_INTRINSICS_UNPACK_32_ARGB + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_RGBA_UNALIGNED + ); + p_y += 16; + p_u += 4; + p_v += 4; } -#endif + SCALE_WIDTH; + SCALE_HEIGHT( 420, 4 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + p_buffer = b_hscale ? p_buffer_start : p_pic; + } + } + else + { + /* use slower SSE2 unaligned fetch and store */ + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) + { + p_pic_start = p_pic; + p_buffer = b_hscale ? p_buffer_start : p_pic; + + for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; ) + { + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_RGBA_UNALIGNED + ); + p_y += 16; + p_u += 8; + p_v += 8; + p_buffer += 16; + } + + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_RGBA_UNALIGNED + ); + p_y += 16; + p_u += 8; + p_v += 8; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 4 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + p_buffer = b_hscale ? p_buffer_start : p_pic; + } + } + + /* make sure all SSE2 stores are visible thereafter */ + SSE2_END; + +#else // defined (MODULE_NAME_IS_i420_rgb_mmx) + + if( p_filter->fmt_in.video.i_width & 7 ) + { + i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 ); + } + else + { + i_rewind = 0; + } + + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) + { + p_pic_start = p_pic; + p_buffer = b_hscale ? p_buffer_start : p_pic; + + for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; ) + { + MMX_CALL ( + MMX_INIT_32 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_32_RGBA + ); + p_y += 8; + p_u += 4; + p_v += 4; + p_buffer += 8; + } + + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + MMX_CALL ( + MMX_INIT_32 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_32_RGBA + ); p_y += 8; p_u += 4; p_v += 4; @@ -1350,17 +1396,14 @@ void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src, p_v += i_source_margin_c; } } + /* re-enable FPU registers */ -#if defined (CAN_COMPILE_MMX) - __asm__ __volatile__ ( "emms" ); -#else - _mm_empty(); -#endif + MMX_END; #endif } -void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src, +void I420_B8G8R8A8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest ) { /* We got this one from the old arguments */ @@ -1369,21 +1412,21 @@ void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src, uint8_t *p_u = p_src->U_PIXELS; uint8_t *p_v = p_src->V_PIXELS; - vlc_bool_t b_hscale; /* horizontal scaling type */ + bool b_hscale; /* horizontal scaling type */ unsigned int i_vscale; /* vertical scaling type */ unsigned int i_x, i_y; /* horizontal and vertical indexes */ int i_right_margin; int i_rewind; int i_scale_count; /* scale modulo counter */ - int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */ + int i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */ uint32_t * p_pic_start; /* beginning of the current line for copy */ /* Conversion buffer pointer */ - uint32_t * p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer; + uint32_t * p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer; uint32_t * p_buffer; /* Offset array pointer */ - int * p_offset_start = p_vout->chroma.p_sys->p_offset; + int * p_offset_start = p_filter->p_sys->p_offset; int * p_offset; const int i_source_margin = p_src->p[0].i_pitch @@ -1396,21 +1439,24 @@ void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src, /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1' * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */ - SetOffset( p_vout->render.i_width, p_vout->render.i_height, - p_vout->output.i_width, p_vout->output.i_height, + SetOffset( p_filter->fmt_in.video.i_width, + p_filter->fmt_in.video.i_height, + p_filter->fmt_out.video.i_width, + p_filter->fmt_out.video.i_height, &b_hscale, &i_vscale, p_offset_start ); /* * Perform conversion */ i_scale_count = ( i_vscale == 1 ) ? - p_vout->output.i_height : p_vout->render.i_height; + p_filter->fmt_out.video.i_height : + p_filter->fmt_in.video.i_height; #if defined (MODULE_NAME_IS_i420_rgb_sse2) - if( p_vout->render.i_width & 15 ) + if( p_filter->fmt_in.video.i_width & 15 ) { - i_rewind = 16 - ( p_vout->render.i_width & 15 ); + i_rewind = 16 - ( p_filter->fmt_in.video.i_width & 15 ); } else { @@ -1418,40 +1464,29 @@ void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src, } /* - ** SSE2 128 bits fetch/store instructions are faster + ** SSE2 128 bits fetch/store instructions are faster ** if memory access is 16 bytes aligned */ p_buffer = b_hscale ? p_buffer_start : p_pic; if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch| p_dest->p->i_pitch| - ((int)p_y)| - ((int)p_buffer))) ) + ((intptr_t)p_y)| + ((intptr_t)p_buffer))) ) { /* use faster SSE2 aligned fetch and store */ - for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) { p_pic_start = p_pic; - for ( i_x = p_vout->render.i_width / 16; i_x--; ) + for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; ) { -#if defined (CAN_COMPILE_SSE2) - /* use inline SSE2 assembly */ - __asm__( ".p2align 3" - SSE2_INIT_32_ALIGNED - SSE2_YUV_MUL - SSE2_YUV_ADD - SSE2_UNPACK_32_BGRA_ALIGNED - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); -#else - /* otherwise use SSE2 C intrinsics wrappers */ - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - SSE2_INTRINSICS_INIT_32_ALIGNED - SSE2_INTRINSICS_YUV_MUL - SSE2_INTRINSICS_YUV_ADD - SSE2_INTRINSICS_UNPACK_32_BGRA_ALIGNED -#endif + SSE2_CALL ( + SSE2_INIT_32_ALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_BGRA_ALIGNED + ); p_y += 16; p_u += 8; p_v += 8; @@ -1466,25 +1501,12 @@ void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src, p_u -= i_rewind >> 1; p_v -= i_rewind >> 1; p_buffer -= i_rewind; -#if defined (CAN_COMPILE_SSE2) - /* use inline SSE2 assembly */ - __asm__( ".p2align 3" - SSE2_INIT_32_UNALIGNED - SSE2_YUV_MUL - SSE2_YUV_ADD - SSE2_UNPACK_32_BGRA_UNALIGNED - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); -#else - /* otherwise use SSE2 intrinsics wrappers */ - { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - SSE2_INTRINSICS_INIT_32_UNALIGNED - SSE2_INTRINSICS_YUV_MUL - SSE2_INTRINSICS_YUV_ADD - SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED - } -#endif + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_BGRA_UNALIGNED + ); p_y += 16; p_u += 4; p_v += 4; @@ -1500,40 +1522,23 @@ void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src, } p_buffer = b_hscale ? p_buffer_start : p_pic; } - /* make sure all SSE2 stores are visible thereafter */ -#if defined (CAN_COMPILE_SSE2) - __asm__ __volatile__ ( "sfence" ); -#else - _mm_sfence(); -#endif } else { /* use slower SSE2 unaligned fetch and store */ - for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) { p_pic_start = p_pic; p_buffer = b_hscale ? p_buffer_start : p_pic; - for ( i_x = p_vout->render.i_width / 16; i_x--; ) + for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; ) { -#if defined (CAN_COMPILE_SSE2) - /* use inline SSE2 assembly */ - __asm__( ".p2align 3" - SSE2_INIT_32_UNALIGNED - SSE2_YUV_MUL - SSE2_YUV_ADD - SSE2_UNPACK_32_BGRA_UNALIGNED - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); -#else - /* otherwise use SSE2 C intrinsics wrappers */ - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - SSE2_INTRINSICS_INIT_32_UNALIGNED - SSE2_INTRINSICS_YUV_MUL - SSE2_INTRINSICS_YUV_ADD - SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED -#endif + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_BGRA_UNALIGNED + ); p_y += 16; p_u += 8; p_v += 8; @@ -1548,25 +1553,12 @@ void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src, p_u -= i_rewind >> 1; p_v -= i_rewind >> 1; p_buffer -= i_rewind; -#if defined (CAN_COMPILE_SSE2) - /* use inline SSE2 assembly */ - __asm__( ".p2align 3" - SSE2_INIT_32_UNALIGNED - SSE2_YUV_MUL - SSE2_YUV_ADD - SSE2_UNPACK_32_BGRA_UNALIGNED - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" ); -#else - /* otherwise use SSE2 intrinsics wrappers */ - { - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; - - SSE2_INTRINSICS_INIT_32_UNALIGNED - SSE2_INTRINSICS_YUV_MUL - SSE2_INTRINSICS_YUV_ADD - SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED - } -#endif + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_BGRA_UNALIGNED + ); p_y += 16; p_u += 8; p_v += 8; @@ -1586,42 +1578,28 @@ void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src, #else - if( p_vout->render.i_width & 7 ) + if( p_filter->fmt_in.video.i_width & 7 ) { - i_rewind = 8 - ( p_vout->render.i_width & 7 ); + i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 ); } else { i_rewind = 0; } - for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) { p_pic_start = p_pic; p_buffer = b_hscale ? p_buffer_start : p_pic; - for ( i_x = p_vout->render.i_width / 8; i_x--; ) + for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; ) { -#if defined (CAN_COMPILE_MMX) - /* use inline MMX assembly */ - __asm__( MMX_INIT_32 - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); - - __asm__( ".p2align 3" - MMX_YUV_MUL - MMX_YUV_ADD - MMX_UNPACK_32_ARGB - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); -#else - /* otherwise use MMX C intrinsics wrappers */ - __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; - uint64_t tmp64; - - MMX_INTRINSICS_INIT_32 - MMX_INTRINSICS_YUV_MUL - MMX_INTRINSICS_YUV_ADD - MMX_INTRINSICS_UNPACK_32_BGRA -#endif + MMX_CALL ( + MMX_INIT_32 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_32_BGRA + ); p_y += 8; p_u += 4; p_v += 4; @@ -1636,26 +1614,251 @@ void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src, p_u -= i_rewind >> 1; p_v -= i_rewind >> 1; p_buffer -= i_rewind; -#if defined (CAN_COMPILE_MMX) - /* use inline MMX assembly */ - __asm__( ".p2align 3" - MMX_INIT_32 - MMX_YUV_MUL - MMX_YUV_ADD - MMX_UNPACK_32_BGRA - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); -#else - /* otherwise use MMX intrinsics wrappers */ + MMX_CALL ( + MMX_INIT_32 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_32_BGRA + ); + p_y += 8; + p_u += 4; + p_v += 4; + p_buffer += 8; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 4 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + } + + /* re-enable FPU registers */ + MMX_END; + +#endif +} + +void I420_A8B8G8R8( filter_t *p_filter, picture_t *p_src, + picture_t *p_dest ) +{ + /* We got this one from the old arguments */ + uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels; + uint8_t *p_y = p_src->Y_PIXELS; + uint8_t *p_u = p_src->U_PIXELS; + uint8_t *p_v = p_src->V_PIXELS; + + bool b_hscale; /* horizontal scaling type */ + unsigned int i_vscale; /* vertical scaling type */ + unsigned int i_x, i_y; /* horizontal and vertical indexes */ + + int i_right_margin; + int i_rewind; + int i_scale_count; /* scale modulo counter */ + int i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */ + uint32_t * p_pic_start; /* beginning of the current line for copy */ + /* Conversion buffer pointer */ + uint32_t * p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer; + uint32_t * p_buffer; + + /* Offset array pointer */ + int * p_offset_start = p_filter->p_sys->p_offset; + int * p_offset; + + const int i_source_margin = p_src->p[0].i_pitch + - p_src->p[0].i_visible_pitch; + const int i_source_margin_c = p_src->p[1].i_pitch + - p_src->p[1].i_visible_pitch; + + i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; + + /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered + * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1' + * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */ + SetOffset( p_filter->fmt_in.video.i_width, + p_filter->fmt_in.video.i_height, + p_filter->fmt_out.video.i_width, + p_filter->fmt_out.video.i_height, + &b_hscale, &i_vscale, p_offset_start ); + + /* + * Perform conversion + */ + i_scale_count = ( i_vscale == 1 ) ? + p_filter->fmt_out.video.i_height : + p_filter->fmt_in.video.i_height; + +#if defined (MODULE_NAME_IS_i420_rgb_sse2) + + if( p_filter->fmt_in.video.i_width & 15 ) + { + i_rewind = 16 - ( p_filter->fmt_in.video.i_width & 15 ); + } + else + { + i_rewind = 0; + } + + /* + ** SSE2 128 bits fetch/store instructions are faster + ** if memory access is 16 bytes aligned + */ + + p_buffer = b_hscale ? p_buffer_start : p_pic; + if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch| + p_dest->p->i_pitch| + ((intptr_t)p_y)| + ((intptr_t)p_buffer))) ) + { + /* use faster SSE2 aligned fetch and store */ + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) + { + p_pic_start = p_pic; + + for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; ) + { + SSE2_CALL ( + SSE2_INIT_32_ALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_ABGR_ALIGNED + ); + p_y += 16; + p_u += 8; + p_v += 8; + p_buffer += 16; + } + + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_ABGR_UNALIGNED + ); + p_y += 16; + p_u += 4; + p_v += 4; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 4 ); + + p_y += i_source_margin; + if( i_y % 2 ) { - __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; - uint64_t tmp64; + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + p_buffer = b_hscale ? p_buffer_start : p_pic; + } + } + else + { + /* use slower SSE2 unaligned fetch and store */ + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) + { + p_pic_start = p_pic; + p_buffer = b_hscale ? p_buffer_start : p_pic; - MMX_INTRINSICS_INIT_32 - MMX_INTRINSICS_YUV_MUL - MMX_INTRINSICS_YUV_ADD - MMX_INTRINSICS_UNPACK_32_BGRA + for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; ) + { + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_ABGR_UNALIGNED + ); + p_y += 16; + p_u += 8; + p_v += 8; + p_buffer += 16; } -#endif + + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_ABGR_UNALIGNED + ); + p_y += 16; + p_u += 8; + p_v += 8; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 4 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + p_buffer = b_hscale ? p_buffer_start : p_pic; + } + } + +#else + + if( p_filter->fmt_in.video.i_width & 7 ) + { + i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 ); + } + else + { + i_rewind = 0; + } + + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) + { + p_pic_start = p_pic; + p_buffer = b_hscale ? p_buffer_start : p_pic; + + for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; ) + { + MMX_CALL ( + MMX_INIT_32 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_32_ABGR + ); + p_y += 8; + p_u += 4; + p_v += 4; + p_buffer += 8; + } + + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + MMX_CALL ( + MMX_INIT_32 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_32_ABGR + ); p_y += 8; p_u += 4; p_v += 4; @@ -1671,12 +1874,9 @@ void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src, p_v += i_source_margin_c; } } + /* re-enable FPU registers */ -#if defined (CAN_COMPILE_MMX) - __asm__ __volatile__ ( "emms" ); -#else - _mm_empty(); -#endif + MMX_END; #endif } @@ -1692,7 +1892,7 @@ void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src, * It will also set horizontal and vertical scaling indicators. *****************************************************************************/ static void SetOffset( int i_width, int i_height, int i_pic_width, - int i_pic_height, vlc_bool_t *pb_hscale, + int i_pic_height, bool *pb_hscale, unsigned int *pi_vscale, int *p_offset ) { int i_x; /* x position in destination */