X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=modules%2Fvideo_chroma%2Fi420_rgb16.c;h=3f0c6734f5d3a20d09399c397a27a860edc1ce45;hb=f2b2e37c04b2921e29daa3260dc696646ad4f10c;hp=f15054f8e427a4d306561bf1ed0e7a07cbd39aa6;hpb=4904140690f8630d53466a35914f0a37aa5154a4;p=vlc diff --git a/modules/video_chroma/i420_rgb16.c b/modules/video_chroma/i420_rgb16.c index f15054f8e4..3f0c6734f5 100644 --- a/modules/video_chroma/i420_rgb16.c +++ b/modules/video_chroma/i420_rgb16.c @@ -1,16 +1,17 @@ /***************************************************************************** * i420_rgb16.c : YUV to bitmap RGB conversion module for vlc ***************************************************************************** - * Copyright (C) 2000 VideoLAN + * Copyright (C) 2000 the VideoLAN team * $Id$ * * Authors: Samuel Hocevar + * Damien Fouilleul * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the @@ -18,26 +19,32 @@ * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA. *****************************************************************************/ /***************************************************************************** * Preamble *****************************************************************************/ -#include /* strerror() */ -#include /* malloc(), free() */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif #include -#include +#include +#include #include "i420_rgb.h" #if defined (MODULE_NAME_IS_i420_rgb) # include "i420_rgb_c.h" #elif defined (MODULE_NAME_IS_i420_rgb_mmx) # include "i420_rgb_mmx.h" +#elif defined (MODULE_NAME_IS_i420_rgb_sse2) +# include "i420_rgb_mmx.h" #endif -static void SetOffset( int, int, int, int, vlc_bool_t *, int *, int * ); +static void SetOffset( int, int, int, int, bool *, + unsigned int *, int * ); #if defined (MODULE_NAME_IS_i420_rgb) /***************************************************************************** @@ -50,8 +57,8 @@ static void SetOffset( int, int, int, int, vlc_bool_t *, int *, int * ); * - input: 2 lines (2 Y lines, 1 U/V line) * - output: 1 line *****************************************************************************/ -void E_(I420_RGB16_dithering)( vout_thread_t *p_vout, picture_t *p_src, - picture_t *p_dest ) +void I420_RGB16_dither( filter_t *p_filter, picture_t *p_src, + picture_t *p_dest ) { /* We got this one from the old arguments */ uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels; @@ -59,27 +66,27 @@ void E_(I420_RGB16_dithering)( vout_thread_t *p_vout, picture_t *p_src, uint8_t *p_u = p_src->U_PIXELS; uint8_t *p_v = p_src->V_PIXELS; - vlc_bool_t b_hscale; /* horizontal scaling type */ - int i_vscale; /* vertical scaling type */ + bool b_hscale; /* horizontal scaling type */ + unsigned int i_vscale; /* vertical scaling type */ unsigned int i_x, i_y; /* horizontal and vertical indexes */ unsigned int i_real_y; /* y % 4 */ int i_right_margin; int i_rewind; int i_scale_count; /* scale modulo counter */ - int 
i_chroma_width = p_vout->render.i_width / 2; /* chroma width */ + int i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */ uint16_t * p_pic_start; /* beginning of the current line for copy */ int i_uval, i_vval; /* U and V samples */ int i_red, i_green, i_blue; /* U and V modified samples */ - uint16_t * p_yuv = p_vout->chroma.p_sys->p_rgb16; + uint16_t * p_yuv = p_filter->p_sys->p_rgb16; uint16_t * p_ybase; /* Y dependant conversion table */ /* Conversion buffer pointer */ - uint16_t * p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer; + uint16_t * p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer; uint16_t * p_buffer; /* Offset array pointer */ - int * p_offset_start = p_vout->chroma.p_sys->p_offset; + int * p_offset_start = p_filter->p_sys->p_offset; int * p_offset; const int i_source_margin = p_src->p[0].i_pitch @@ -95,17 +102,17 @@ void E_(I420_RGB16_dithering)( vout_thread_t *p_vout, picture_t *p_src, for(i_x = 0; i_x < 4; i_x++) { - dither10[i_x] = dither10[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift); - dither11[i_x] = dither11[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift); - dither12[i_x] = dither12[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift); - dither13[i_x] = dither13[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift); + dither10[i_x] = dither10[i_x] << (SHIFT - 4 + p_filter->fmt_out.video.i_rrshift); + dither11[i_x] = dither11[i_x] << (SHIFT - 4 + p_filter->fmt_out.video.i_rrshift); + dither12[i_x] = dither12[i_x] << (SHIFT - 4 + p_filter->fmt_out.video.i_rrshift); + dither13[i_x] = dither13[i_x] << (SHIFT - 4 + p_filter->fmt_out.video.i_rrshift); } i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; - if( p_vout->render.i_width & 7 ) + if( p_filter->fmt_in.video.i_width & 7 ) { - i_rewind = 8 - ( p_vout->render.i_width & 7 ); + i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 ); } else { @@ -115,22 +122,25 @@ void E_(I420_RGB16_dithering)( vout_thread_t *p_vout, picture_t *p_src, /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1' * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */ - SetOffset( p_vout->render.i_width, p_vout->render.i_height, - p_vout->output.i_width, p_vout->output.i_height, + SetOffset( p_filter->fmt_in.video.i_width, + p_filter->fmt_in.video.i_height, + p_filter->fmt_out.video.i_width, + p_filter->fmt_out.video.i_height, &b_hscale, &i_vscale, p_offset_start ); /* * Perform conversion */ i_scale_count = ( i_vscale == 1 ) ? - p_vout->output.i_height : p_vout->render.i_height; - for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + p_filter->fmt_out.video.i_height : + p_filter->fmt_in.video.i_height; + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) { i_real_y = i_y & 0x3; p_pic_start = p_pic; p_buffer = b_hscale ? 
p_buffer_start : p_pic; - for ( i_x = p_vout->render.i_width / 8; i_x--; ) + for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; ) { int *p_dither = dither10; CONVERT_YUV_PIXEL_DITHER(2); @@ -198,8 +208,11 @@ void E_(I420_RGB16_dithering)( vout_thread_t *p_vout, picture_t *p_src, * - input: 2 lines (2 Y lines, 1 U/V line) * - output: 1 line *****************************************************************************/ -void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src, - picture_t *p_dest ) + +#if defined (MODULE_NAME_IS_i420_rgb) + +void I420_RGB16( filter_t *p_filter, picture_t *p_src, + picture_t *p_dest ) { /* We got this one from the old arguments */ uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels; @@ -207,28 +220,26 @@ void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src, uint8_t *p_u = p_src->U_PIXELS; uint8_t *p_v = p_src->V_PIXELS; - vlc_bool_t b_hscale; /* horizontal scaling type */ + bool b_hscale; /* horizontal scaling type */ unsigned int i_vscale; /* vertical scaling type */ unsigned int i_x, i_y; /* horizontal and vertical indexes */ int i_right_margin; int i_rewind; int i_scale_count; /* scale modulo counter */ - int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */ + int i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */ uint16_t * p_pic_start; /* beginning of the current line for copy */ -#if defined (MODULE_NAME_IS_i420_rgb) int i_uval, i_vval; /* U and V samples */ int i_red, i_green, i_blue; /* U and V modified samples */ - uint16_t * p_yuv = p_vout->chroma.p_sys->p_rgb16; + uint16_t * p_yuv = p_filter->p_sys->p_rgb16; uint16_t * p_ybase; /* Y dependant conversion table */ -#endif /* Conversion buffer pointer */ - uint16_t * p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer; + uint16_t * p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer; uint16_t * p_buffer; /* Offset array pointer */ - int * p_offset_start = p_vout->chroma.p_sys->p_offset; + int * p_offset_start = p_filter->p_sys->p_offset; int * p_offset; const int i_source_margin = p_src->p[0].i_pitch @@ -238,9 +249,9 @@ void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src, i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; - if( p_vout->render.i_width & 7 ) + if( p_filter->fmt_in.video.i_width & 7 ) { - i_rewind = 8 - ( p_vout->render.i_width & 7 ); + i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 ); } else { @@ -250,70 +261,266 @@ void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src, /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1' * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */ - SetOffset( p_vout->render.i_width, p_vout->render.i_height, - p_vout->output.i_width, p_vout->output.i_height, + SetOffset( p_filter->fmt_in.video.i_width, + p_filter->fmt_in.video.i_height, + p_filter->fmt_out.video.i_width, + p_filter->fmt_out.video.i_height, &b_hscale, &i_vscale, p_offset_start ); /* * Perform conversion */ i_scale_count = ( i_vscale == 1 ) ? - p_vout->output.i_height : p_vout->render.i_height; - for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + p_filter->fmt_out.video.i_height : + p_filter->fmt_in.video.i_height; + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) { p_pic_start = p_pic; p_buffer = b_hscale ? 
p_buffer_start : p_pic; -#if defined (MODULE_NAME_IS_i420_rgb) - for ( i_x = p_vout->render.i_width / 8; i_x--; ) + for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; ) { CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2); CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2); CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2); CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2); } -#elif defined (MODULE_NAME_IS_i420_rgb_mmx) - if( p_vout->output.i_rmask == 0x7c00 ) + + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + + CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2); + CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2); + CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2); + CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2); + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 2 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + } +} + +#else // ! defined (MODULE_NAME_IS_i420_rgb) + +void I420_R5G5B5( filter_t *p_filter, picture_t *p_src, + picture_t *p_dest ) +{ + /* We got this one from the old arguments */ + uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels; + uint8_t *p_y = p_src->Y_PIXELS; + uint8_t *p_u = p_src->U_PIXELS; + uint8_t *p_v = p_src->V_PIXELS; + + bool b_hscale; /* horizontal scaling type */ + unsigned int i_vscale; /* vertical scaling type */ + unsigned int i_x, i_y; /* horizontal and vertical indexes */ + + int i_right_margin; + int i_rewind; + int i_scale_count; /* scale modulo counter */ + int i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */ + uint16_t * p_pic_start; /* beginning of the current line for copy */ + + /* Conversion buffer pointer */ + uint16_t * p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer; + uint16_t * p_buffer; + + /* Offset array pointer */ + int * p_offset_start = p_filter->p_sys->p_offset; + int * p_offset; + + const int i_source_margin = p_src->p[0].i_pitch + - p_src->p[0].i_visible_pitch; + const int i_source_margin_c = p_src->p[1].i_pitch + - p_src->p[1].i_visible_pitch; + + i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; + + /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered + * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1' + * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */ + SetOffset( p_filter->fmt_in.video.i_width, + p_filter->fmt_in.video.i_height, + p_filter->fmt_out.video.i_width, + p_filter->fmt_out.video.i_height, + &b_hscale, &i_vscale, p_offset_start ); + + + /* + * Perform conversion + */ + i_scale_count = ( i_vscale == 1 ) ? + p_filter->fmt_out.video.i_height : + p_filter->fmt_in.video.i_height; + +#if defined (MODULE_NAME_IS_i420_rgb_sse2) + + if( p_filter->fmt_in.video.i_width & 15 ) + { + i_rewind = 16 - ( p_filter->fmt_in.video.i_width & 15 ); + } + else + { + i_rewind = 0; + } + + /* + ** SSE2 128 bits fetch/store instructions are faster + ** if memory access is 16 bytes aligned + */ + + p_buffer = b_hscale ? 
p_buffer_start : p_pic; + if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch| + p_dest->p->i_pitch| + ((intptr_t)p_y)| + ((intptr_t)p_buffer))) ) + { + /* use faster SSE2 aligned fetch and store */ + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) { - /* 15bpp 5/5/5 */ - for ( i_x = p_vout->render.i_width / 8; i_x--; ) + p_pic_start = p_pic; + + for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; ) { - __asm__( MMX_INIT_16 - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); + SSE2_CALL ( + SSE2_INIT_16_ALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_15_ALIGNED + ); + p_y += 16; + p_u += 8; + p_v += 8; + p_buffer += 16; + } + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; - __asm__( ".align 8" - MMX_YUV_MUL - MMX_YUV_ADD - MMX_UNPACK_15 - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); + SSE2_CALL ( + SSE2_INIT_16_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_15_UNALIGNED + ); + p_y += 16; + p_u += 8; + p_v += 8; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 2 ); - p_y += 8; - p_u += 4; - p_v += 4; - p_buffer += 8; + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; } + p_buffer = b_hscale ? p_buffer_start : p_pic; } - else + } + else + { + /* use slower SSE2 unaligned fetch and store */ + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) { - /* 16bpp 5/6/5 */ - for ( i_x = p_vout->render.i_width / 8; i_x--; ) + p_pic_start = p_pic; + p_buffer = b_hscale ? p_buffer_start : p_pic; + + for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; ) + { + SSE2_CALL ( + SSE2_INIT_16_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_15_UNALIGNED + ); + p_y += 16; + p_u += 8; + p_v += 8; + p_buffer += 16; + } + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) { - __asm__( MMX_INIT_16 - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; - __asm__( ".align 8" - MMX_YUV_MUL - MMX_YUV_ADD - MMX_UNPACK_16 - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); + SSE2_CALL ( + SSE2_INIT_16_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_15_UNALIGNED + ); + p_y += 16; + p_u += 8; + p_v += 8; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 2 ); - p_y += 8; - p_u += 4; - p_v += 4; - p_buffer += 8; + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; } + p_buffer = b_hscale ? p_buffer_start : p_pic; + } + } + + /* make sure all SSE2 stores are visible thereafter */ + SSE2_END; + +#else // defined (MODULE_NAME_IS_i420_rgb_mmx) + + if( p_filter->fmt_in.video.i_width & 7 ) + { + i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 ); + } + else + { + i_rewind = 0; + } + + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) + { + p_pic_start = p_pic; + p_buffer = b_hscale ? 
p_buffer_start : p_pic; + + for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; ) + { + MMX_CALL ( + MMX_INIT_16 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_15 + ); + p_y += 8; + p_u += 4; + p_v += 4; + p_buffer += 8; } -#endif /* Here we do some unaligned reads and duplicate conversions, but * at least we have all the pixels */ @@ -323,39 +530,17 @@ void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src, p_u -= i_rewind >> 1; p_v -= i_rewind >> 1; p_buffer -= i_rewind; -#if defined (MODULE_NAME_IS_i420_rgb) - CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2); - CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2); - CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2); - CONVERT_YUV_PIXEL(2); CONVERT_Y_PIXEL(2); -#elif defined (MODULE_NAME_IS_i420_rgb_mmx) - __asm__( MMX_INIT_16 - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); - - if( p_vout->output.i_rmask == 0x7c00 ) - { - /* 15bpp 5/5/5 */ - __asm__( ".align 8" - MMX_YUV_MUL - MMX_YUV_ADD - MMX_UNPACK_15 - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); - } - else - { - /* 16bpp 5/6/5 */ - __asm__( ".align 8" - MMX_YUV_MUL - MMX_YUV_ADD - MMX_UNPACK_16 - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); - } + MMX_CALL ( + MMX_INIT_16 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_15 + ); p_y += 8; p_u += 4; p_v += 4; p_buffer += 8; -#endif } SCALE_WIDTH; SCALE_HEIGHT( 420, 2 ); @@ -367,49 +552,37 @@ void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src, p_v += i_source_margin_c; } } + /* re-enable FPU registers */ + MMX_END; + +#endif } -/***************************************************************************** - * I420_RGB32: color YUV 4:2:0 to RGB 32 bpp - ***************************************************************************** - * Horizontal alignment needed: - * - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed - * - output: 1 pixel (2 bytes), margins allowed - * Vertical alignment needed: - * - input: 2 lines (2 Y lines, 1 U/V line) - * - output: 1 line - *****************************************************************************/ -void E_(I420_RGB32)( vout_thread_t *p_vout, picture_t *p_src, - picture_t *p_dest ) +void I420_R5G6B5( filter_t *p_filter, picture_t *p_src, + picture_t *p_dest ) { /* We got this one from the old arguments */ - uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels; + uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels; uint8_t *p_y = p_src->Y_PIXELS; uint8_t *p_u = p_src->U_PIXELS; uint8_t *p_v = p_src->V_PIXELS; - vlc_bool_t b_hscale; /* horizontal scaling type */ + bool b_hscale; /* horizontal scaling type */ unsigned int i_vscale; /* vertical scaling type */ unsigned int i_x, i_y; /* horizontal and vertical indexes */ int i_right_margin; int i_rewind; int i_scale_count; /* scale modulo counter */ - int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */ - uint32_t * p_pic_start; /* beginning of the current line for copy */ -#if defined (MODULE_NAME_IS_i420_rgb) - int i_uval, i_vval; /* U and V samples */ - int i_red, i_green, i_blue; /* U and V modified samples */ - uint32_t * p_yuv = p_vout->chroma.p_sys->p_rgb32; - uint32_t * p_ybase; /* Y dependant conversion table */ -#endif + int i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */ + uint16_t * p_pic_start; /* beginning of the current line for copy */ /* Conversion buffer pointer */ - uint32_t * p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer; - uint32_t * p_buffer; + uint16_t * p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer; + uint16_t * p_buffer; /* Offset array pointer */ - int * 
p_offset_start = p_vout->chroma.p_sys->p_offset; + int * p_offset_start = p_filter->p_sys->p_offset; int * p_offset; const int i_source_margin = p_src->p[0].i_pitch @@ -419,54 +592,178 @@ void E_(I420_RGB32)( vout_thread_t *p_vout, picture_t *p_src, i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; - if( p_vout->render.i_width & 7 ) - { - i_rewind = 8 - ( p_vout->render.i_width & 7 ); - } - else - { - i_rewind = 0; - } - /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1' * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */ - SetOffset( p_vout->render.i_width, p_vout->render.i_height, - p_vout->output.i_width, p_vout->output.i_height, + SetOffset( p_filter->fmt_in.video.i_width, + p_filter->fmt_in.video.i_height, + p_filter->fmt_out.video.i_width, + p_filter->fmt_out.video.i_height, &b_hscale, &i_vscale, p_offset_start ); + /* * Perform conversion */ i_scale_count = ( i_vscale == 1 ) ? - p_vout->output.i_height : p_vout->render.i_height; - for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) + p_filter->fmt_out.video.i_height : + p_filter->fmt_in.video.i_height; + +#if defined (MODULE_NAME_IS_i420_rgb_sse2) + + if( p_filter->fmt_in.video.i_width & 15 ) { - p_pic_start = p_pic; - p_buffer = b_hscale ? p_buffer_start : p_pic; + i_rewind = 16 - ( p_filter->fmt_in.video.i_width & 15 ); + } + else + { + i_rewind = 0; + } + + /* + ** SSE2 128 bits fetch/store instructions are faster + ** if memory access is 16 bytes aligned + */ - for ( i_x = p_vout->render.i_width / 8; i_x--; ) + p_buffer = b_hscale ? p_buffer_start : p_pic; + if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch| + p_dest->p->i_pitch| + ((intptr_t)p_y)| + ((intptr_t)p_buffer))) ) + { + /* use faster SSE2 aligned fetch and store */ + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) { -#if defined (MODULE_NAME_IS_i420_rgb) - CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); - CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); - CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); - CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); -#elif defined (MODULE_NAME_IS_i420_rgb_mmx) - __asm__( MMX_INIT_32 - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); + p_pic_start = p_pic; - __asm__( ".align 8" - MMX_YUV_MUL - MMX_YUV_ADD - MMX_UNPACK_32 - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); + for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; ) + { + SSE2_CALL ( + SSE2_INIT_16_ALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_16_ALIGNED + ); + p_y += 16; + p_u += 8; + p_v += 8; + p_buffer += 16; + } + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + + SSE2_CALL ( + SSE2_INIT_16_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_16_UNALIGNED + ); + p_y += 16; + p_u += 8; + p_v += 8; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 2 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + p_buffer = b_hscale ? p_buffer_start : p_pic; + } + } + else + { + /* use slower SSE2 unaligned fetch and store */ + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) + { + p_pic_start = p_pic; + p_buffer = b_hscale ? 
p_buffer_start : p_pic; + + for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; ) + { + SSE2_CALL( + SSE2_INIT_16_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_16_UNALIGNED + ); + p_y += 16; + p_u += 8; + p_v += 8; + p_buffer += 16; + } + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + + SSE2_CALL( + SSE2_INIT_16_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_16_UNALIGNED + ); + p_y += 16; + p_u += 8; + p_v += 8; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 2 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + p_buffer = b_hscale ? p_buffer_start : p_pic; + } + } + + /* make sure all SSE2 stores are visible thereafter */ + SSE2_END; + +#else // defined (MODULE_NAME_IS_i420_rgb_mmx) + + if( p_filter->fmt_in.video.i_width & 7 ) + { + i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 ); + } + else + { + i_rewind = 0; + } + + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) + { + p_pic_start = p_pic; + p_buffer = b_hscale ? p_buffer_start : p_pic; + for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; ) + { + MMX_CALL ( + MMX_INIT_16 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_16 + ); p_y += 8; p_u += 4; p_v += 4; p_buffer += 8; -#endif } /* Here we do some unaligned reads and duplicate conversions, but @@ -477,29 +774,20 @@ void E_(I420_RGB32)( vout_thread_t *p_vout, picture_t *p_src, p_u -= i_rewind >> 1; p_v -= i_rewind >> 1; p_buffer -= i_rewind; -#if defined (MODULE_NAME_IS_i420_rgb) - CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); - CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); - CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); - CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); -#elif defined (MODULE_NAME_IS_i420_rgb_mmx) - __asm__( MMX_INIT_32 - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); - - __asm__( ".align 8" - MMX_YUV_MUL - MMX_YUV_ADD - MMX_UNPACK_32 - : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) ); + MMX_CALL ( + MMX_INIT_16 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_16 + ); p_y += 8; p_u += 4; p_v += 4; p_buffer += 8; -#endif } SCALE_WIDTH; - SCALE_HEIGHT( 420, 4 ); + SCALE_HEIGHT( 420, 2 ); p_y += i_source_margin; if( i_y % 2 ) @@ -508,8 +796,1093 @@ void E_(I420_RGB32)( vout_thread_t *p_vout, picture_t *p_src, p_v += i_source_margin_c; } } + /* re-enable FPU registers */ + MMX_END; + +#endif } +#endif + +/***************************************************************************** + * I420_RGB32: color YUV 4:2:0 to RGB 32 bpp + ***************************************************************************** + * Horizontal alignment needed: + * - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed + * - output: 1 pixel (2 bytes), margins allowed + * Vertical alignment needed: + * - input: 2 lines (2 Y lines, 1 U/V line) + * - output: 1 line + *****************************************************************************/ + +#if defined (MODULE_NAME_IS_i420_rgb) + +void I420_RGB32( filter_t *p_filter, picture_t *p_src, + picture_t *p_dest ) +{ + /* We got this one from the old arguments */ + uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels; + uint8_t *p_y = p_src->Y_PIXELS; + uint8_t *p_u = p_src->U_PIXELS; + uint8_t *p_v = p_src->V_PIXELS; + + bool b_hscale; /* horizontal scaling type */ + unsigned int i_vscale; /* vertical scaling type */ + unsigned int i_x, i_y; /* horizontal and vertical indexes */ + + int i_right_margin; + 
int i_rewind; + int i_scale_count; /* scale modulo counter */ + int i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */ + uint32_t * p_pic_start; /* beginning of the current line for copy */ + int i_uval, i_vval; /* U and V samples */ + int i_red, i_green, i_blue; /* U and V modified samples */ + uint32_t * p_yuv = p_filter->p_sys->p_rgb32; + uint32_t * p_ybase; /* Y dependant conversion table */ + + /* Conversion buffer pointer */ + uint32_t * p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer; + uint32_t * p_buffer; + + /* Offset array pointer */ + int * p_offset_start = p_filter->p_sys->p_offset; + int * p_offset; + + const int i_source_margin = p_src->p[0].i_pitch + - p_src->p[0].i_visible_pitch; + const int i_source_margin_c = p_src->p[1].i_pitch + - p_src->p[1].i_visible_pitch; + + i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; + + if( p_filter->fmt_in.video.i_width & 7 ) + { + i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 ); + } + else + { + i_rewind = 0; + } + + /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered + * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1' + * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */ + SetOffset( p_filter->fmt_in.video.i_width, + p_filter->fmt_in.video.i_height, + p_filter->fmt_out.video.i_width, + p_filter->fmt_out.video.i_height, + &b_hscale, &i_vscale, p_offset_start ); + + /* + * Perform conversion + */ + i_scale_count = ( i_vscale == 1 ) ? + p_filter->fmt_out.video.i_height : + p_filter->fmt_in.video.i_height; + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) + { + p_pic_start = p_pic; + p_buffer = b_hscale ? p_buffer_start : p_pic; + + for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; ) + { + CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); + CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); + CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); + CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); + } + + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); + CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); + CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); + CONVERT_YUV_PIXEL(4); CONVERT_Y_PIXEL(4); + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 4 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + } +} + +#else // defined (MODULE_NAME_IS_i420_rgb_mmx) || defined (MODULE_NAME_IS_i420_rgb_sse2) + +void I420_A8R8G8B8( filter_t *p_filter, picture_t *p_src, + picture_t *p_dest ) +{ + /* We got this one from the old arguments */ + uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels; + uint8_t *p_y = p_src->Y_PIXELS; + uint8_t *p_u = p_src->U_PIXELS; + uint8_t *p_v = p_src->V_PIXELS; + + bool b_hscale; /* horizontal scaling type */ + unsigned int i_vscale; /* vertical scaling type */ + unsigned int i_x, i_y; /* horizontal and vertical indexes */ + + int i_right_margin; + int i_rewind; + int i_scale_count; /* scale modulo counter */ + int i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */ + uint32_t * p_pic_start; /* beginning of the current line for copy */ + /* Conversion buffer pointer */ + uint32_t * p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer; + uint32_t * p_buffer; + + /* Offset array pointer */ + int * p_offset_start = p_filter->p_sys->p_offset; + int * p_offset; + + const int i_source_margin = 
p_src->p[0].i_pitch + - p_src->p[0].i_visible_pitch; + const int i_source_margin_c = p_src->p[1].i_pitch + - p_src->p[1].i_visible_pitch; + + i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; + + /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered + * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1' + * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */ + SetOffset( p_filter->fmt_in.video.i_width, + p_filter->fmt_in.video.i_height, + p_filter->fmt_out.video.i_width, + p_filter->fmt_out.video.i_height, + &b_hscale, &i_vscale, p_offset_start ); + + /* + * Perform conversion + */ + i_scale_count = ( i_vscale == 1 ) ? + p_filter->fmt_out.video.i_height : + p_filter->fmt_in.video.i_height; + +#if defined (MODULE_NAME_IS_i420_rgb_sse2) + + if( p_filter->fmt_in.video.i_width & 15 ) + { + i_rewind = 16 - ( p_filter->fmt_in.video.i_width & 15 ); + } + else + { + i_rewind = 0; + } + + /* + ** SSE2 128 bits fetch/store instructions are faster + ** if memory access is 16 bytes aligned + */ + + p_buffer = b_hscale ? p_buffer_start : p_pic; + if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch| + p_dest->p->i_pitch| + ((intptr_t)p_y)| + ((intptr_t)p_buffer))) ) + { + /* use faster SSE2 aligned fetch and store */ + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) + { + p_pic_start = p_pic; + + for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; ) + { + SSE2_CALL ( + SSE2_INIT_32_ALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_ARGB_ALIGNED + ); + p_y += 16; + p_u += 8; + p_v += 8; + p_buffer += 16; + } + + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_ARGB_UNALIGNED + ); + p_y += 16; + p_u += 4; + p_v += 4; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 4 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + p_buffer = b_hscale ? p_buffer_start : p_pic; + } + } + else + { + /* use slower SSE2 unaligned fetch and store */ + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) + { + p_pic_start = p_pic; + p_buffer = b_hscale ? p_buffer_start : p_pic; + + for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; ) + { + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_ARGB_UNALIGNED + ); + p_y += 16; + p_u += 8; + p_v += 8; + p_buffer += 16; + } + + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_ARGB_UNALIGNED + ); + p_y += 16; + p_u += 8; + p_v += 8; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 4 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + p_buffer = b_hscale ? p_buffer_start : p_pic; + } + } + + /* make sure all SSE2 stores are visible thereafter */ + SSE2_END; + +#else // defined (MODULE_NAME_IS_i420_rgb_mmx) + + if( p_filter->fmt_in.video.i_width & 7 ) + { + i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 ); + } + else + { + i_rewind = 0; + } + + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) + { + p_pic_start = p_pic; + p_buffer = b_hscale ? 
p_buffer_start : p_pic; + + for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; ) + { + MMX_CALL ( + MMX_INIT_32 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_32_ARGB + ); + p_y += 8; + p_u += 4; + p_v += 4; + p_buffer += 8; + } + + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + MMX_CALL ( + MMX_INIT_32 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_32_ARGB + ); + p_y += 8; + p_u += 4; + p_v += 4; + p_buffer += 8; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 4 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + } + + /* re-enable FPU registers */ + MMX_END; + +#endif +} + +void I420_R8G8B8A8( filter_t *p_filter, picture_t *p_src, + picture_t *p_dest ) +{ + /* We got this one from the old arguments */ + uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels; + uint8_t *p_y = p_src->Y_PIXELS; + uint8_t *p_u = p_src->U_PIXELS; + uint8_t *p_v = p_src->V_PIXELS; + + bool b_hscale; /* horizontal scaling type */ + unsigned int i_vscale; /* vertical scaling type */ + unsigned int i_x, i_y; /* horizontal and vertical indexes */ + + int i_right_margin; + int i_rewind; + int i_scale_count; /* scale modulo counter */ + int i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */ + uint32_t * p_pic_start; /* beginning of the current line for copy */ + /* Conversion buffer pointer */ + uint32_t * p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer; + uint32_t * p_buffer; + + /* Offset array pointer */ + int * p_offset_start = p_filter->p_sys->p_offset; + int * p_offset; + + const int i_source_margin = p_src->p[0].i_pitch + - p_src->p[0].i_visible_pitch; + const int i_source_margin_c = p_src->p[1].i_pitch + - p_src->p[1].i_visible_pitch; + + i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; + + /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered + * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1' + * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */ + SetOffset( p_filter->fmt_in.video.i_width, + p_filter->fmt_in.video.i_height, + p_filter->fmt_out.video.i_width, + p_filter->fmt_out.video.i_height, + &b_hscale, &i_vscale, p_offset_start ); + + /* + * Perform conversion + */ + i_scale_count = ( i_vscale == 1 ) ? + p_filter->fmt_out.video.i_height : + p_filter->fmt_in.video.i_height; + +#if defined (MODULE_NAME_IS_i420_rgb_sse2) + + if( p_filter->fmt_in.video.i_width & 15 ) + { + i_rewind = 16 - ( p_filter->fmt_in.video.i_width & 15 ); + } + else + { + i_rewind = 0; + } + + /* + ** SSE2 128 bits fetch/store instructions are faster + ** if memory access is 16 bytes aligned + */ + + p_buffer = b_hscale ? 
p_buffer_start : p_pic; + if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch| + p_dest->p->i_pitch| + ((intptr_t)p_y)| + ((intptr_t)p_buffer))) ) + { + /* use faster SSE2 aligned fetch and store */ + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) + { + p_pic_start = p_pic; + + for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; ) + { + SSE2_CALL ( + SSE2_INIT_32_ALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_RGBA_ALIGNED + ); + p_y += 16; + p_u += 8; + p_v += 8; + p_buffer += 16; + } + + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_RGBA_UNALIGNED + ); + p_y += 16; + p_u += 4; + p_v += 4; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 4 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + p_buffer = b_hscale ? p_buffer_start : p_pic; + } + } + else + { + /* use slower SSE2 unaligned fetch and store */ + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) + { + p_pic_start = p_pic; + p_buffer = b_hscale ? p_buffer_start : p_pic; + + for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; ) + { + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_RGBA_UNALIGNED + ); + p_y += 16; + p_u += 8; + p_v += 8; + p_buffer += 16; + } + + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_RGBA_UNALIGNED + ); + p_y += 16; + p_u += 8; + p_v += 8; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 4 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + p_buffer = b_hscale ? p_buffer_start : p_pic; + } + } + + /* make sure all SSE2 stores are visible thereafter */ + SSE2_END; + +#else // defined (MODULE_NAME_IS_i420_rgb_mmx) + + if( p_filter->fmt_in.video.i_width & 7 ) + { + i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 ); + } + else + { + i_rewind = 0; + } + + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) + { + p_pic_start = p_pic; + p_buffer = b_hscale ? 
p_buffer_start : p_pic; + + for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; ) + { + MMX_CALL ( + MMX_INIT_32 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_32_RGBA + ); + p_y += 8; + p_u += 4; + p_v += 4; + p_buffer += 8; + } + + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + MMX_CALL ( + MMX_INIT_32 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_32_RGBA + ); + p_y += 8; + p_u += 4; + p_v += 4; + p_buffer += 8; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 4 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + } + + /* re-enable FPU registers */ + MMX_END; + +#endif +} + +void I420_B8G8R8A8( filter_t *p_filter, picture_t *p_src, + picture_t *p_dest ) +{ + /* We got this one from the old arguments */ + uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels; + uint8_t *p_y = p_src->Y_PIXELS; + uint8_t *p_u = p_src->U_PIXELS; + uint8_t *p_v = p_src->V_PIXELS; + + bool b_hscale; /* horizontal scaling type */ + unsigned int i_vscale; /* vertical scaling type */ + unsigned int i_x, i_y; /* horizontal and vertical indexes */ + + int i_right_margin; + int i_rewind; + int i_scale_count; /* scale modulo counter */ + int i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */ + uint32_t * p_pic_start; /* beginning of the current line for copy */ + /* Conversion buffer pointer */ + uint32_t * p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer; + uint32_t * p_buffer; + + /* Offset array pointer */ + int * p_offset_start = p_filter->p_sys->p_offset; + int * p_offset; + + const int i_source_margin = p_src->p[0].i_pitch + - p_src->p[0].i_visible_pitch; + const int i_source_margin_c = p_src->p[1].i_pitch + - p_src->p[1].i_visible_pitch; + + i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; + + /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered + * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1' + * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */ + SetOffset( p_filter->fmt_in.video.i_width, + p_filter->fmt_in.video.i_height, + p_filter->fmt_out.video.i_width, + p_filter->fmt_out.video.i_height, + &b_hscale, &i_vscale, p_offset_start ); + + /* + * Perform conversion + */ + i_scale_count = ( i_vscale == 1 ) ? + p_filter->fmt_out.video.i_height : + p_filter->fmt_in.video.i_height; + +#if defined (MODULE_NAME_IS_i420_rgb_sse2) + + if( p_filter->fmt_in.video.i_width & 15 ) + { + i_rewind = 16 - ( p_filter->fmt_in.video.i_width & 15 ); + } + else + { + i_rewind = 0; + } + + /* + ** SSE2 128 bits fetch/store instructions are faster + ** if memory access is 16 bytes aligned + */ + + p_buffer = b_hscale ? 
p_buffer_start : p_pic; + if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch| + p_dest->p->i_pitch| + ((intptr_t)p_y)| + ((intptr_t)p_buffer))) ) + { + /* use faster SSE2 aligned fetch and store */ + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) + { + p_pic_start = p_pic; + + for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; ) + { + SSE2_CALL ( + SSE2_INIT_32_ALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_BGRA_ALIGNED + ); + p_y += 16; + p_u += 8; + p_v += 8; + p_buffer += 16; + } + + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_BGRA_UNALIGNED + ); + p_y += 16; + p_u += 4; + p_v += 4; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 4 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + p_buffer = b_hscale ? p_buffer_start : p_pic; + } + } + else + { + /* use slower SSE2 unaligned fetch and store */ + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) + { + p_pic_start = p_pic; + p_buffer = b_hscale ? p_buffer_start : p_pic; + + for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; ) + { + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_BGRA_UNALIGNED + ); + p_y += 16; + p_u += 8; + p_v += 8; + p_buffer += 16; + } + + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_BGRA_UNALIGNED + ); + p_y += 16; + p_u += 8; + p_v += 8; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 4 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + p_buffer = b_hscale ? p_buffer_start : p_pic; + } + } + +#else + + if( p_filter->fmt_in.video.i_width & 7 ) + { + i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 ); + } + else + { + i_rewind = 0; + } + + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) + { + p_pic_start = p_pic; + p_buffer = b_hscale ? 
p_buffer_start : p_pic; + + for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; ) + { + MMX_CALL ( + MMX_INIT_32 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_32_BGRA + ); + p_y += 8; + p_u += 4; + p_v += 4; + p_buffer += 8; + } + + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + MMX_CALL ( + MMX_INIT_32 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_32_BGRA + ); + p_y += 8; + p_u += 4; + p_v += 4; + p_buffer += 8; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 4 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + } + + /* re-enable FPU registers */ + MMX_END; + +#endif +} + +void I420_A8B8G8R8( filter_t *p_filter, picture_t *p_src, + picture_t *p_dest ) +{ + /* We got this one from the old arguments */ + uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels; + uint8_t *p_y = p_src->Y_PIXELS; + uint8_t *p_u = p_src->U_PIXELS; + uint8_t *p_v = p_src->V_PIXELS; + + bool b_hscale; /* horizontal scaling type */ + unsigned int i_vscale; /* vertical scaling type */ + unsigned int i_x, i_y; /* horizontal and vertical indexes */ + + int i_right_margin; + int i_rewind; + int i_scale_count; /* scale modulo counter */ + int i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */ + uint32_t * p_pic_start; /* beginning of the current line for copy */ + /* Conversion buffer pointer */ + uint32_t * p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer; + uint32_t * p_buffer; + + /* Offset array pointer */ + int * p_offset_start = p_filter->p_sys->p_offset; + int * p_offset; + + const int i_source_margin = p_src->p[0].i_pitch + - p_src->p[0].i_visible_pitch; + const int i_source_margin_c = p_src->p[1].i_pitch + - p_src->p[1].i_visible_pitch; + + i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; + + /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered + * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1' + * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */ + SetOffset( p_filter->fmt_in.video.i_width, + p_filter->fmt_in.video.i_height, + p_filter->fmt_out.video.i_width, + p_filter->fmt_out.video.i_height, + &b_hscale, &i_vscale, p_offset_start ); + + /* + * Perform conversion + */ + i_scale_count = ( i_vscale == 1 ) ? + p_filter->fmt_out.video.i_height : + p_filter->fmt_in.video.i_height; + +#if defined (MODULE_NAME_IS_i420_rgb_sse2) + + if( p_filter->fmt_in.video.i_width & 15 ) + { + i_rewind = 16 - ( p_filter->fmt_in.video.i_width & 15 ); + } + else + { + i_rewind = 0; + } + + /* + ** SSE2 128 bits fetch/store instructions are faster + ** if memory access is 16 bytes aligned + */ + + p_buffer = b_hscale ? 
p_buffer_start : p_pic; + if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch| + p_dest->p->i_pitch| + ((intptr_t)p_y)| + ((intptr_t)p_buffer))) ) + { + /* use faster SSE2 aligned fetch and store */ + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) + { + p_pic_start = p_pic; + + for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; ) + { + SSE2_CALL ( + SSE2_INIT_32_ALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_ABGR_ALIGNED + ); + p_y += 16; + p_u += 8; + p_v += 8; + p_buffer += 16; + } + + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_ABGR_UNALIGNED + ); + p_y += 16; + p_u += 4; + p_v += 4; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 4 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + p_buffer = b_hscale ? p_buffer_start : p_pic; + } + } + else + { + /* use slower SSE2 unaligned fetch and store */ + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) + { + p_pic_start = p_pic; + p_buffer = b_hscale ? p_buffer_start : p_pic; + + for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; ) + { + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_ABGR_UNALIGNED + ); + p_y += 16; + p_u += 8; + p_v += 8; + p_buffer += 16; + } + + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + SSE2_CALL ( + SSE2_INIT_32_UNALIGNED + SSE2_YUV_MUL + SSE2_YUV_ADD + SSE2_UNPACK_32_ABGR_UNALIGNED + ); + p_y += 16; + p_u += 8; + p_v += 8; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 4 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + p_buffer = b_hscale ? p_buffer_start : p_pic; + } + } + +#else + + if( p_filter->fmt_in.video.i_width & 7 ) + { + i_rewind = 8 - ( p_filter->fmt_in.video.i_width & 7 ); + } + else + { + i_rewind = 0; + } + + for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ ) + { + p_pic_start = p_pic; + p_buffer = b_hscale ? p_buffer_start : p_pic; + + for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; ) + { + MMX_CALL ( + MMX_INIT_32 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_32_ABGR + ); + p_y += 8; + p_u += 4; + p_v += 4; + p_buffer += 8; + } + + /* Here we do some unaligned reads and duplicate conversions, but + * at least we have all the pixels */ + if( i_rewind ) + { + p_y -= i_rewind; + p_u -= i_rewind >> 1; + p_v -= i_rewind >> 1; + p_buffer -= i_rewind; + MMX_CALL ( + MMX_INIT_32 + MMX_YUV_MUL + MMX_YUV_ADD + MMX_UNPACK_32_ABGR + ); + p_y += 8; + p_u += 4; + p_v += 4; + p_buffer += 8; + } + SCALE_WIDTH; + SCALE_HEIGHT( 420, 4 ); + + p_y += i_source_margin; + if( i_y % 2 ) + { + p_u += i_source_margin_c; + p_v += i_source_margin_c; + } + } + + /* re-enable FPU registers */ + MMX_END; + +#endif +} + +#endif + /* Following functions are local */ /***************************************************************************** @@ -519,8 +1892,8 @@ void E_(I420_RGB32)( vout_thread_t *p_vout, picture_t *p_src, * It will also set horizontal and vertical scaling indicators. 
*****************************************************************************/ static void SetOffset( int i_width, int i_height, int i_pic_width, - int i_pic_height, vlc_bool_t *pb_hscale, - int *pi_vscale, int *p_offset ) + int i_pic_height, bool *pb_hscale, + unsigned int *pi_vscale, int *p_offset ) { int i_x; /* x position in destination */ int i_scale_count; /* modulo counter */
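
A few notes on the loop structure used throughout the converters in this patch. Every inner loop handles pixels in groups of 8 (C and MMX paths) or 16 (SSE2 paths); when the picture width is not a multiple of the group size, the code computes i_rewind and, as its own comment says ("Here we do some unaligned reads and duplicate conversions, but at least we have all the pixels"), steps the source and destination pointers back so that one final group ends exactly on the last pixel of the line. The following is only a minimal standalone sketch of that tail handling; convert_line() and convert_group() are hypothetical names, not part of this patch.

#include <stdint.h>

/* Sketch of the "rewind" tail handling used by the 8- and 16-pixel loops
 * above.  A few pixels near the end of the line are converted twice, but
 * no out-of-bounds read or write ever happens. */
static void convert_line( const uint8_t *p_y, uint16_t *p_out, int i_width )
{
    const int i_group = 8;                      /* pixels handled per step */
    int i_rewind = ( i_width & (i_group - 1) )
                 ? i_group - ( i_width & (i_group - 1) ) : 0;

    for( int i_x = i_width / i_group; i_x--; )
    {
        /* convert_group( p_y, p_out ); */      /* one 8-pixel YUV->RGB step */
        p_y   += i_group;
        p_out += i_group;
    }
    if( i_rewind )
    {
        p_y   -= i_rewind;                      /* step back into the line */
        p_out -= i_rewind;
        /* convert_group( p_y, p_out ); */      /* redo the overlapping group */
    }
}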
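
The SSE2 paths choose between the aligned and unaligned 128-bit load/store variants once per frame, because 16-byte-aligned accesses are faster. The test OR-s the Y-plane pitch, the destination pitch and both start addresses together and masks the result with 15: if any of them has a low bit set, some line of the frame would be misaligned. A minimal sketch of that test, with a hypothetical helper name:

#include <stdint.h>
#include <stdbool.h>

/* Returns true when every line of both buffers stays 16-byte aligned,
 * so the faster aligned SSE2 fetch/store variants can be used. */
static bool use_aligned_sse2( const uint8_t *p_src, const uint8_t *p_dst,
                              int i_src_pitch, int i_dst_pitch )
{
    return 0 == ( 15 & ( i_src_pitch | i_dst_pitch |
                         (intptr_t)p_src | (intptr_t)p_dst ) );
}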
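
Finally, the rule quoted before every SetOffset() call, y1' = x1' * y2/x2 * r2/r1, reads more easily with concrete (hypothetical) numbers: rendering a 720x576 source (aspect ratio r1 = 4:3) onto a 1440x1080 output (r2 = 4:3) stretches x1 = 720 to x1' = 1440, so y1' = 1440 * 1080/1440 * 1 = 1080. SetOffset() then reports horizontal scaling through *pb_hscale and the vertical scaling mode through *pi_vscale, which the conversion loops use to decide whether to render into the intermediate buffer or straight into the destination picture.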