git.sesse.net Git - vlc/blob - modules/video_chroma/i420_rgb16.c

   1 /*****************************************************************************
   2  * i420_rgb16.c : YUV to bitmap RGB conversion module for vlc
   3  *****************************************************************************
   4  * Copyright (C) 2000 the VideoLAN team
   5  * $Id$
   6  *
   7  * Authors: Samuel Hocevar <sam@zoy.org>
   8  *          Damien Fouilleul <damienf@videolan.org>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  23  *****************************************************************************/
  24
  25 /*****************************************************************************
  26  * Preamble
  27  *****************************************************************************/
  28
  29 #ifdef HAVE_CONFIG_H
  30 # include "config.h"
  31 #endif
  32
  33 #include <vlc_common.h>
  34 #include <vlc_filter.h>
  35 #include <vlc_cpu.h>
  36
  37 #include "i420_rgb.h"
  38 #if defined (MODULE_NAME_IS_i420_rgb)
  39 #   include "i420_rgb_c.h"
  40 #   define VLC_TARGET
  41 #elif defined (MODULE_NAME_IS_i420_rgb_mmx)
  42 #   include "../mmx/i420_rgb_mmx.h"
  43 #   define VLC_TARGET VLC_MMX
  44 #elif defined (MODULE_NAME_IS_i420_rgb_sse2)
  45 #   include "../mmx/i420_rgb_mmx.h"
  46 #   define VLC_TARGET VLC_SSE
  47 #endif
  48
  49 static void SetOffset( int, int, int, int, bool *,
  50                        unsigned int *, int * );
  51
  52 #if defined (MODULE_NAME_IS_i420_rgb)
  53 /*****************************************************************************
  54  * I420_RGB16_dither: color YUV 4:2:0 to RGB 16 bpp with dithering
  55  *****************************************************************************
  56  * Horizontal alignment needed:
  57  *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
  58  *  - output: 1 pixel (2 bytes), margins allowed
  59  * Vertical alignment needed:
  60  *  - input: 2 lines (2 Y lines, 1 U/V line)
  61  *  - output: 1 line
  62  *****************************************************************************/
  63 void I420_RGB16_dither( filter_t *p_filter, picture_t *p_src,
  64                                                 picture_t *p_dest )
  65 {
         /* Convert the Y/U/V planes of p_src into 16-bit pixels written to
          * p_dest, selecting one of four 4-entry dither tables (p_dither)
          * before every pixel macro.
          *
          * NOTE(review): several locals that look unused in this body
          * (i_uval, i_vval, i_red, i_green, i_blue, p_ybase, p_offset,
          * i_chroma_width, i_real_y, p_pic_start) are presumably consumed by
          * the CONVERT_*_DITHER / SCALE_* macros, which are defined outside
          * this file -- confirm before renaming or removing any of them. */
  66     /* We got this one from the old arguments */
  67     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
  68     uint8_t  *p_y   = p_src->Y_PIXELS;
  69     uint8_t  *p_u   = p_src->U_PIXELS;
  70     uint8_t  *p_v   = p_src->V_PIXELS;
  71
  72     bool   b_hscale;                        /* horizontal scaling type */
  73     unsigned int i_vscale;                          /* vertical scaling type */
  74     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
  75     unsigned int i_real_y;                                          /* y % 4 */
  76
  77     int         i_right_margin;
  78     int         i_rewind;
  79     int         i_scale_count;                       /* scale modulo counter */
  80     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
  81     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
  82     int         i_uval, i_vval;                           /* U and V samples */
  83     int         i_red, i_green, i_blue;          /* U and V modified samples */
  84     uint16_t *  p_yuv = p_filter->p_sys->p_rgb16;
  85     uint16_t *  p_ybase;                     /* Y dependent conversion table */
  86
  87     /* Conversion buffer pointer */
  88     uint16_t *  p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
  89     uint16_t *  p_buffer;
  90
  91     /* Offset array pointer */
  92     int *       p_offset_start = p_filter->p_sys->p_offset;
  93     int *       p_offset;
  94
         /* Bytes of padding at the end of each source row (pitch minus visible
          * pitch), for the luma plane and for the first chroma plane. */
  95     const int i_source_margin = p_src->p[0].i_pitch
  96                                  - p_src->p[0].i_visible_pitch;
  97     const int i_source_margin_c = p_src->p[1].i_pitch
  98                                  - p_src->p[1].i_visible_pitch;
  99
 100     /* The dithering matrices */
 101     int dither10[4] = {  0x0,  0x8,  0x2,  0xa };
 102     int dither11[4] = {  0xc,  0x4,  0xe,  0x6 };
 103     int dither12[4] = {  0x3,  0xb,  0x1,  0x9 };
 104     int dither13[4] = {  0xf,  0x7,  0xd,  0x5 };
 105
         /* Pre-shift every table entry left by (SHIFT - 4 + i_rrshift), where
          * i_rrshift comes from the output format; SHIFT is defined outside
          * this file. */
 106     for(i_x = 0; i_x < 4; i_x++)
 107     {
 108         dither10[i_x] = dither10[i_x] << (SHIFT - 4 + p_filter->fmt_out.video.i_rrshift);
 109         dither11[i_x] = dither11[i_x] << (SHIFT - 4 + p_filter->fmt_out.video.i_rrshift);
 110         dither12[i_x] = dither12[i_x] << (SHIFT - 4 + p_filter->fmt_out.video.i_rrshift);
 111         dither13[i_x] = dither13[i_x] << (SHIFT - 4 + p_filter->fmt_out.video.i_rrshift);
 112     }
 113
         /* i_right_margin: padding bytes at the end of each destination row.
          * i_rewind: number of pixels missing to make the input width a
          * multiple of 8 (0 when it already is one). */
 114     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
 115     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
 116
 117     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
 118      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
 119      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
 120     SetOffset( p_filter->fmt_in.video.i_width,
 121                p_filter->fmt_in.video.i_height,
 122                p_filter->fmt_out.video.i_width,
 123                p_filter->fmt_out.video.i_height,
 124                &b_hscale, &i_vscale, p_offset_start );
 125
 126     /*
 127      * Perform conversion
 128      */
         /* The counter starts at the output height when i_vscale == 1 and at
          * the input height otherwise; it is presumably updated by
          * SCALE_HEIGHT -- confirm against the macro definition. */
 129     i_scale_count = ( i_vscale == 1 ) ?
 130                     p_filter->fmt_out.video.i_height :
 131                     p_filter->fmt_in.video.i_height;
 132     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
 133     {
             /* i_real_y is not read in this body; presumably it indexes the
              * dither table inside the *_DITHER macros -- confirm. */
 134         i_real_y = i_y & 0x3;
 135         p_pic_start = p_pic;
             /* Convert into the intermediate buffer when horizontal scaling is
              * requested, otherwise straight into the destination row. */
 136         p_buffer = b_hscale ? p_buffer_start : p_pic;
 137
             /* One iteration per complete group of 8 source pixels: eight macro
              * invocations alternating the YUV and Y-only variants while
              * cycling through the four dither tables. */
 138         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
 139         {
 140             int *p_dither = dither10;
 141             CONVERT_YUV_PIXEL_DITHER(2);
 142             p_dither = dither11;
 143             CONVERT_Y_PIXEL_DITHER(2);
 144             p_dither = dither12;
 145             CONVERT_YUV_PIXEL_DITHER(2);
 146             p_dither = dither13;
 147             CONVERT_Y_PIXEL_DITHER(2);
 148             p_dither = dither10;
 149             CONVERT_YUV_PIXEL_DITHER(2);
 150             p_dither = dither11;
 151             CONVERT_Y_PIXEL_DITHER(2);
 152             p_dither = dither12;
 153             CONVERT_YUV_PIXEL_DITHER(2);
 154             p_dither = dither13;
 155             CONVERT_Y_PIXEL_DITHER(2);
 156         }
 157
 158         /* Here we do some unaligned reads and duplicate conversions, but
 159          * at least we have all the pixels */
 160         if( i_rewind )
 161         {
                 /* Step the luma and output pointers back by i_rewind pixels
                  * and the chroma pointers by half as much, then run one more
                  * 8-macro group. */
 162             int *p_dither = dither10;
 163             p_y -= i_rewind;
 164             p_u -= i_rewind >> 1;
 165             p_v -= i_rewind >> 1;
 166             p_buffer -= i_rewind;
 167             CONVERT_YUV_PIXEL_DITHER(2);
 168             p_dither = dither11;
 169             CONVERT_Y_PIXEL_DITHER(2);
 170             p_dither = dither12;
 171             CONVERT_YUV_PIXEL_DITHER(2);
 172             p_dither = dither13;
 173             CONVERT_Y_PIXEL_DITHER(2);
 174             p_dither = dither10;
 175             CONVERT_YUV_PIXEL_DITHER(2);
 176             p_dither = dither11;
 177             CONVERT_Y_PIXEL_DITHER(2);
 178             p_dither = dither12;
 179             CONVERT_YUV_PIXEL_DITHER(2);
 180             p_dither = dither13;
 181             CONVERT_Y_PIXEL_DITHER(2);
 182         }
             /* Scaling macros are defined outside this file. */
 183         SCALE_WIDTH;
 184         SCALE_HEIGHT( 420, 2 );
 185
             /* Skip the luma row padding after every row; the chroma padding is
              * skipped on odd rows only. */
 186         p_y += i_source_margin;
 187         if( i_y % 2 )
 188         {
 189             p_u += i_source_margin_c;
 190             p_v += i_source_margin_c;
 191         }
 192     }
 193 }
 194 #endif
 195
 196 /*****************************************************************************
 197  * I420_RGB16: color YUV 4:2:0 to RGB 16 bpp
 198  *****************************************************************************
 199  * Horizontal alignment needed:
 200  *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
 201  *  - output: 1 pixel (2 bytes), margins allowed
 202  * Vertical alignment needed:
 203  *  - input: 2 lines (2 Y lines, 1 U/V line)
 204  *  - output: 1 line
 205  *****************************************************************************/
 206
 207 #if defined (MODULE_NAME_IS_i420_rgb)
 208
 209 void I420_RGB16( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
 210 {
         /* Convert the Y/U/V planes of p_src into 16-bit pixels written to
          * p_dest, 8 source pixels per group, without dithering.
          *
          * NOTE(review): locals that look unused in this body (i_uval, i_vval,
          * i_red, i_green, i_blue, p_yuv, p_ybase, p_offset, i_chroma_width,
          * p_pic_start) are presumably consumed by the CONVERT_* / SCALE_*
          * macros, which are defined outside this file -- confirm before
          * renaming or removing any of them. */
 211     /* We got this one from the old arguments */
 212     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
 213     uint8_t  *p_y   = p_src->Y_PIXELS;
 214     uint8_t  *p_u   = p_src->U_PIXELS;
 215     uint8_t  *p_v   = p_src->V_PIXELS;
 216
 217     bool  b_hscale;                         /* horizontal scaling type */
 218     unsigned int i_vscale;                          /* vertical scaling type */
 219     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
 220
 221     int         i_right_margin;
 222     int         i_rewind;
 223     int         i_scale_count;                       /* scale modulo counter */
 224     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
 225     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
 226     int         i_uval, i_vval;                           /* U and V samples */
 227     int         i_red, i_green, i_blue;          /* U and V modified samples */
 228     uint16_t *  p_yuv = p_filter->p_sys->p_rgb16;
 229     uint16_t *  p_ybase;                     /* Y dependent conversion table */
 230
 231     /* Conversion buffer pointer */
 232     uint16_t *  p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
 233     uint16_t *  p_buffer;
 234
 235     /* Offset array pointer */
 236     int *       p_offset_start = p_filter->p_sys->p_offset;
 237     int *       p_offset;
 238
         /* Bytes of padding at the end of each source row (pitch minus visible
          * pitch), for the luma plane and for the first chroma plane. */
 239     const int i_source_margin = p_src->p[0].i_pitch
 240                                  - p_src->p[0].i_visible_pitch;
 241     const int i_source_margin_c = p_src->p[1].i_pitch
 242                                  - p_src->p[1].i_visible_pitch;
 243
         /* i_right_margin: padding bytes at the end of each destination row.
          * i_rewind: number of pixels missing to make the input width a
          * multiple of 8 (0 when it already is one). */
 244     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
 245     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
 246
 247     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
 248      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
 249      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
 250     SetOffset( p_filter->fmt_in.video.i_width,
 251                p_filter->fmt_in.video.i_height,
 252                p_filter->fmt_out.video.i_width,
 253                p_filter->fmt_out.video.i_height,
 254                &b_hscale, &i_vscale, p_offset_start );
 255
 256     /*
 257      * Perform conversion
 258      */
         /* The counter starts at the output height when i_vscale == 1 and at
          * the input height otherwise; it is presumably updated by
          * SCALE_HEIGHT -- confirm against the macro definition. */
 259     i_scale_count = ( i_vscale == 1 ) ?
 260                     p_filter->fmt_out.video.i_height :
 261                     p_filter->fmt_in.video.i_height;
 262     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
 263     {
 264         p_pic_start = p_pic;
             /* Convert into the intermediate buffer when horizontal scaling is
              * requested, otherwise straight into the destination row. */
 265         p_buffer = b_hscale ? p_buffer_start : p_pic;
 266
             /* One iteration per complete group of 8 source pixels: four
              * YUV + Y-only macro pairs. */
 267         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
 268         {
 269             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
 270             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
 271             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
 272             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
 273         }
 274
 275         /* Here we do some unaligned reads and duplicate conversions, but
 276          * at least we have all the pixels */
 277         if( i_rewind )
 278         {
                 /* Step the luma and output pointers back by i_rewind pixels
                  * and the chroma pointers by half as much, then run one more
                  * group of four macro pairs. */
 279             p_y -= i_rewind;
 280             p_u -= i_rewind >> 1;
 281             p_v -= i_rewind >> 1;
 282             p_buffer -= i_rewind;
 283
 284             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
 285             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
 286             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
 287             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
 288         }
             /* Scaling macros are defined outside this file. */
 289         SCALE_WIDTH;
 290         SCALE_HEIGHT( 420, 2 );
 291
             /* Skip the luma row padding after every row; the chroma padding is
              * skipped on odd rows only. */
 292         p_y += i_source_margin;
 293         if( i_y % 2 )
 294         {
 295             p_u += i_source_margin_c;
 296             p_v += i_source_margin_c;
 297         }
 298     }
 299 }
 300
 301 #else // ! defined (MODULE_NAME_IS_i420_rgb)
 302
 303 VLC_TARGET
 304 void I420_R5G5B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
 305 {
         /* Convert the Y/U/V planes of p_src into 16-bit pixels written to
          * p_dest using the SSE2 macro sequence (16 pixels per step) or the MMX
          * one (8 pixels per step), depending on which module is being built.
          * Both sequences end with the *_UNPACK_15 step; the sibling
          * I420_R5G6B5 uses *_UNPACK_16 instead.
          *
          * NOTE(review): the SSE2_* / MMX_* / SCALE_* macros are defined
          * outside this file and presumably reference locals such as p_y, p_u,
          * p_v, p_buffer, p_offset, i_chroma_width and p_pic_start by name --
          * confirm before renaming or removing any of them. */
 306     /* We got this one from the old arguments */
 307     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
 308     uint8_t  *p_y   = p_src->Y_PIXELS;
 309     uint8_t  *p_u   = p_src->U_PIXELS;
 310     uint8_t  *p_v   = p_src->V_PIXELS;
 311
 312     bool  b_hscale;                         /* horizontal scaling type */
 313     unsigned int i_vscale;                          /* vertical scaling type */
 314     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
 315
 316     int         i_right_margin;
 317     int         i_rewind;
 318     int         i_scale_count;                       /* scale modulo counter */
 319     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
 320     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
 321
 322     /* Conversion buffer pointer */
 323     uint16_t *  p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
 324     uint16_t *  p_buffer;
 325
 326     /* Offset array pointer */
 327     int *       p_offset_start = p_filter->p_sys->p_offset;
 328     int *       p_offset;
 329
         /* Bytes of padding at the end of each source row (pitch minus visible
          * pitch), for the luma plane and for the first chroma plane. */
 330     const int i_source_margin = p_src->p[0].i_pitch
 331                                  - p_src->p[0].i_visible_pitch;
 332     const int i_source_margin_c = p_src->p[1].i_pitch
 333                                  - p_src->p[1].i_visible_pitch;
 334
         /* Padding bytes at the end of each destination row. */
 335     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
 336
 337     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
 338      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
 339      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
 340     SetOffset( p_filter->fmt_in.video.i_width,
 341                p_filter->fmt_in.video.i_height,
 342                p_filter->fmt_out.video.i_width,
 343                p_filter->fmt_out.video.i_height,
 344                &b_hscale, &i_vscale, p_offset_start );
 345
 346
 347     /*
 348      * Perform conversion
 349      */
         /* The counter starts at the output height when i_vscale == 1 and at
          * the input height otherwise; it is presumably updated by
          * SCALE_HEIGHT -- confirm against the macro definition. */
 350     i_scale_count = ( i_vscale == 1 ) ?
 351                     p_filter->fmt_out.video.i_height :
 352                     p_filter->fmt_in.video.i_height;
 353
 354 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
 355
         /* Number of pixels missing to make the input width a multiple of 16
          * (0 when it already is one). */
 356     i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
 357
 358     /*
 359     ** SSE2 128 bits fetch/store instructions are faster
 360     ** if memory access is 16 bytes aligned
 361     */
 362
         /* OR-ing the source luma pitch, the destination pitch and the two
          * start addresses lets a single mask test check that all four are
          * multiples of 16. */
 363     p_buffer = b_hscale ? p_buffer_start : p_pic;
 364     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
 365                     p_dest->p->i_pitch|
 366                     ((intptr_t)p_y)|
 367                     ((intptr_t)p_buffer))) )
 368     {
 369         /* use faster SSE2 aligned fetch and store */
 370         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
 371         {
 372             p_pic_start = p_pic;
 373
                 /* One iteration per complete group of 16 source pixels. */
 374             for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
 375             {
 376                 SSE2_CALL (
 377                     SSE2_INIT_16_ALIGNED
 378                     SSE2_YUV_MUL
 379                     SSE2_YUV_ADD
 380                     SSE2_UNPACK_15_ALIGNED
 381                 );
 382                 p_y += 16;
 383                 p_u += 8;
 384                 p_v += 8;
 385                 p_buffer += 16;
 386             }
 387             /* Here we do some unaligned reads and duplicate conversions, but
 388              * at least we have all the pixels */
 389             if( i_rewind )
 390             {
                     /* Step back by i_rewind pixels (half as much for chroma)
                      * and convert one more 16-pixel group with the unaligned
                      * macro variants.  p_buffer is not advanced afterwards; it
                      * is reassigned at the bottom of the row loop. */
 391                 p_y -= i_rewind;
 392                 p_u -= i_rewind >> 1;
 393                 p_v -= i_rewind >> 1;
 394                 p_buffer -= i_rewind;
 395
 396                 SSE2_CALL (
 397                     SSE2_INIT_16_UNALIGNED
 398                     SSE2_YUV_MUL
 399                     SSE2_YUV_ADD
 400                     SSE2_UNPACK_15_UNALIGNED
 401                 );
 402                 p_y += 16;
 403                 p_u += 8;
 404                 p_v += 8;
 405             }
 406             SCALE_WIDTH;
 407             SCALE_HEIGHT( 420, 2 );
 408
                 /* Skip the luma row padding after every row; the chroma
                  * padding is skipped on odd rows only. */
 409             p_y += i_source_margin;
 410             if( i_y % 2 )
 411             {
 412                 p_u += i_source_margin_c;
 413                 p_v += i_source_margin_c;
 414             }
                 /* Select the output position for the next row. */
 415             p_buffer = b_hscale ? p_buffer_start : p_pic;
 416         }
 417     }
 418     else
 419     {
 420         /* use slower SSE2 unaligned fetch and store */
 421         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
 422         {
 423             p_pic_start = p_pic;
 424             p_buffer = b_hscale ? p_buffer_start : p_pic;
 425
                 /* One iteration per complete group of 16 source pixels. */
 426             for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
 427             {
 428                 SSE2_CALL (
 429                     SSE2_INIT_16_UNALIGNED
 430                     SSE2_YUV_MUL
 431                     SSE2_YUV_ADD
 432                     SSE2_UNPACK_15_UNALIGNED
 433                 );
 434                 p_y += 16;
 435                 p_u += 8;
 436                 p_v += 8;
 437                 p_buffer += 16;
 438             }
 439             /* Here we do some unaligned reads and duplicate conversions, but
 440              * at least we have all the pixels */
 441             if( i_rewind )
 442             {
                     /* Same tail handling as in the aligned branch. */
 443                 p_y -= i_rewind;
 444                 p_u -= i_rewind >> 1;
 445                 p_v -= i_rewind >> 1;
 446                 p_buffer -= i_rewind;
 447
 448                 SSE2_CALL (
 449                     SSE2_INIT_16_UNALIGNED
 450                     SSE2_YUV_MUL
 451                     SSE2_YUV_ADD
 452                     SSE2_UNPACK_15_UNALIGNED
 453                 );
 454                 p_y += 16;
 455                 p_u += 8;
 456                 p_v += 8;
 457             }
 458             SCALE_WIDTH;
 459             SCALE_HEIGHT( 420, 2 );
 460
 461             p_y += i_source_margin;
 462             if( i_y % 2 )
 463             {
 464                 p_u += i_source_margin_c;
 465                 p_v += i_source_margin_c;
 466             }
 467             p_buffer = b_hscale ? p_buffer_start : p_pic;
 468         }
 469     }
 470
 471     /* make sure all SSE2 stores are visible thereafter */
 472     SSE2_END;
 473
 474 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
 475
         /* Number of pixels missing to make the input width a multiple of 8
          * (0 when it already is one). */
 476     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
 477
 478     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
 479     {
 480         p_pic_start = p_pic;
             /* Convert into the intermediate buffer when horizontal scaling is
              * requested, otherwise straight into the destination row. */
 481         p_buffer = b_hscale ? p_buffer_start : p_pic;
 482
             /* One iteration per complete group of 8 source pixels. */
 483         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
 484         {
 485             MMX_CALL (
 486                 MMX_INIT_16
 487                 MMX_YUV_MUL
 488                 MMX_YUV_ADD
 489                 MMX_UNPACK_15
 490             );
 491             p_y += 8;
 492             p_u += 4;
 493             p_v += 4;
 494             p_buffer += 8;
 495         }
 496
 497         /* Here we do some unaligned reads and duplicate conversions, but
 498          * at least we have all the pixels */
 499         if( i_rewind )
 500         {
                 /* Step back by i_rewind pixels (half as much for chroma) and
                  * convert one more 8-pixel group. */
 501             p_y -= i_rewind;
 502             p_u -= i_rewind >> 1;
 503             p_v -= i_rewind >> 1;
 504             p_buffer -= i_rewind;
 505
 506             MMX_CALL (
 507                 MMX_INIT_16
 508                 MMX_YUV_MUL
 509                 MMX_YUV_ADD
 510                 MMX_UNPACK_15
 511             );
 512             p_y += 8;
 513             p_u += 4;
 514             p_v += 4;
 515             p_buffer += 8;
 516         }
 517         SCALE_WIDTH;
 518         SCALE_HEIGHT( 420, 2 );
 519
             /* Skip the luma row padding after every row; the chroma padding is
              * skipped on odd rows only. */
 520         p_y += i_source_margin;
 521         if( i_y % 2 )
 522         {
 523             p_u += i_source_margin_c;
 524             p_v += i_source_margin_c;
 525         }
 526     }
 527     /* re-enable FPU registers */
 528     MMX_END;
 529
 530 #endif
 531 }
 532
 533 VLC_TARGET
 534 void I420_R5G6B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
 535 {
         /* Convert the Y/U/V planes of p_src into 16-bit pixels written to
          * p_dest using the SSE2 macro sequence (16 pixels per step) or the MMX
          * one (8 pixels per step), depending on which module is being built.
          * Both sequences end with the *_UNPACK_16 step; the sibling
          * I420_R5G5B5 uses *_UNPACK_15 instead.
          *
          * NOTE(review): the SSE2_* / MMX_* / SCALE_* macros are defined
          * outside this file and presumably reference locals such as p_y, p_u,
          * p_v, p_buffer, p_offset, i_chroma_width and p_pic_start by name --
          * confirm before renaming or removing any of them. */
 536     /* We got this one from the old arguments */
 537     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
 538     uint8_t  *p_y   = p_src->Y_PIXELS;
 539     uint8_t  *p_u   = p_src->U_PIXELS;
 540     uint8_t  *p_v   = p_src->V_PIXELS;
 541
 542     bool  b_hscale;                         /* horizontal scaling type */
 543     unsigned int i_vscale;                          /* vertical scaling type */
 544     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
 545
 546     int         i_right_margin;
 547     int         i_rewind;
 548     int         i_scale_count;                       /* scale modulo counter */
 549     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
 550     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
 551
 552     /* Conversion buffer pointer */
 553     uint16_t *  p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
 554     uint16_t *  p_buffer;
 555
 556     /* Offset array pointer */
 557     int *       p_offset_start = p_filter->p_sys->p_offset;
 558     int *       p_offset;
 559
         /* Bytes of padding at the end of each source row (pitch minus visible
          * pitch), for the luma plane and for the first chroma plane. */
 560     const int i_source_margin = p_src->p[0].i_pitch
 561                                  - p_src->p[0].i_visible_pitch;
 562     const int i_source_margin_c = p_src->p[1].i_pitch
 563                                  - p_src->p[1].i_visible_pitch;
 564
         /* Padding bytes at the end of each destination row. */
 565     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
 566
 567     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
 568      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
 569      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
 570     SetOffset( p_filter->fmt_in.video.i_width,
 571                p_filter->fmt_in.video.i_height,
 572                p_filter->fmt_out.video.i_width,
 573                p_filter->fmt_out.video.i_height,
 574                &b_hscale, &i_vscale, p_offset_start );
 575
 576
 577     /*
 578      * Perform conversion
 579      */
         /* The counter starts at the output height when i_vscale == 1 and at
          * the input height otherwise; it is presumably updated by
          * SCALE_HEIGHT -- confirm against the macro definition. */
 580     i_scale_count = ( i_vscale == 1 ) ?
 581                     p_filter->fmt_out.video.i_height :
 582                     p_filter->fmt_in.video.i_height;
 583
 584 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
 585
         /* Number of pixels missing to make the input width a multiple of 16
          * (0 when it already is one). */
 586     i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
 587
 588     /*
 589     ** SSE2 128 bits fetch/store instructions are faster
 590     ** if memory access is 16 bytes aligned
 591     */
 592
         /* OR-ing the source luma pitch, the destination pitch and the two
          * start addresses lets a single mask test check that all four are
          * multiples of 16. */
 593     p_buffer = b_hscale ? p_buffer_start : p_pic;
 594     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
 595                     p_dest->p->i_pitch|
 596                     ((intptr_t)p_y)|
 597                     ((intptr_t)p_buffer))) )
 598     {
 599         /* use faster SSE2 aligned fetch and store */
 600         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
 601         {
 602             p_pic_start = p_pic;
 603
                 /* One iteration per complete group of 16 source pixels. */
 604             for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
 605             {
 606                 SSE2_CALL (
 607                     SSE2_INIT_16_ALIGNED
 608                     SSE2_YUV_MUL
 609                     SSE2_YUV_ADD
 610                     SSE2_UNPACK_16_ALIGNED
 611                 );
 612                 p_y += 16;
 613                 p_u += 8;
 614                 p_v += 8;
 615                 p_buffer += 16;
 616             }
 617             /* Here we do some unaligned reads and duplicate conversions, but
 618              * at least we have all the pixels */
 619             if( i_rewind )
 620             {
                     /* Step back by i_rewind pixels (half as much for chroma)
                      * and convert one more 16-pixel group with the unaligned
                      * macro variants.  p_buffer is not advanced afterwards; it
                      * is reassigned at the bottom of the row loop. */
 621                 p_y -= i_rewind;
 622                 p_u -= i_rewind >> 1;
 623                 p_v -= i_rewind >> 1;
 624                 p_buffer -= i_rewind;
 625
 626                 SSE2_CALL (
 627                     SSE2_INIT_16_UNALIGNED
 628                     SSE2_YUV_MUL
 629                     SSE2_YUV_ADD
 630                     SSE2_UNPACK_16_UNALIGNED
 631                 );
 632                 p_y += 16;
 633                 p_u += 8;
 634                 p_v += 8;
 635             }
 636             SCALE_WIDTH;
 637             SCALE_HEIGHT( 420, 2 );
 638
                 /* Skip the luma row padding after every row; the chroma
                  * padding is skipped on odd rows only. */
 639             p_y += i_source_margin;
 640             if( i_y % 2 )
 641             {
 642                 p_u += i_source_margin_c;
 643                 p_v += i_source_margin_c;
 644             }
                 /* Select the output position for the next row. */
 645             p_buffer = b_hscale ? p_buffer_start : p_pic;
 646         }
 647     }
 648     else
 649     {
 650         /* use slower SSE2 unaligned fetch and store */
 651         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
 652         {
 653             p_pic_start = p_pic;
 654             p_buffer = b_hscale ? p_buffer_start : p_pic;
 655
                 /* One iteration per complete group of 16 source pixels. */
 656             for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
 657             {
 658                 SSE2_CALL(
 659                     SSE2_INIT_16_UNALIGNED
 660                     SSE2_YUV_MUL
 661                     SSE2_YUV_ADD
 662                     SSE2_UNPACK_16_UNALIGNED
 663                 );
 664                 p_y += 16;
 665                 p_u += 8;
 666                 p_v += 8;
 667                 p_buffer += 16;
 668             }
 669             /* Here we do some unaligned reads and duplicate conversions, but
 670              * at least we have all the pixels */
 671             if( i_rewind )
 672             {
                     /* Same tail handling as in the aligned branch. */
 673                 p_y -= i_rewind;
 674                 p_u -= i_rewind >> 1;
 675                 p_v -= i_rewind >> 1;
 676                 p_buffer -= i_rewind;
 677
 678                 SSE2_CALL(
 679                     SSE2_INIT_16_UNALIGNED
 680                     SSE2_YUV_MUL
 681                     SSE2_YUV_ADD
 682                     SSE2_UNPACK_16_UNALIGNED
 683                 );
 684                 p_y += 16;
 685                 p_u += 8;
 686                 p_v += 8;
 687             }
 688             SCALE_WIDTH;
 689             SCALE_HEIGHT( 420, 2 );
 690
 691             p_y += i_source_margin;
 692             if( i_y % 2 )
 693             {
 694                 p_u += i_source_margin_c;
 695                 p_v += i_source_margin_c;
 696             }
 697             p_buffer = b_hscale ? p_buffer_start : p_pic;
 698         }
 699     }
 700
 701     /* make sure all SSE2 stores are visible thereafter */
 702     SSE2_END;
 703
 704 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
 705
         /* Number of pixels missing to make the input width a multiple of 8
          * (0 when it already is one). */
 706     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
 707
 708     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
 709     {
 710         p_pic_start = p_pic;
             /* Convert into the intermediate buffer when horizontal scaling is
              * requested, otherwise straight into the destination row. */
 711         p_buffer = b_hscale ? p_buffer_start : p_pic;
 712
             /* One iteration per complete group of 8 source pixels. */
 713         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
 714         {
 715             MMX_CALL (
 716                 MMX_INIT_16
 717                 MMX_YUV_MUL
 718                 MMX_YUV_ADD
 719                 MMX_UNPACK_16
 720             );
 721             p_y += 8;
 722             p_u += 4;
 723             p_v += 4;
 724             p_buffer += 8;
 725         }
 726
 727         /* Here we do some unaligned reads and duplicate conversions, but
 728          * at least we have all the pixels */
 729         if( i_rewind )
 730         {
                 /* Step back by i_rewind pixels (half as much for chroma) and
                  * convert one more 8-pixel group. */
 731             p_y -= i_rewind;
 732             p_u -= i_rewind >> 1;
 733             p_v -= i_rewind >> 1;
 734             p_buffer -= i_rewind;
 735
 736             MMX_CALL (
 737                 MMX_INIT_16
 738                 MMX_YUV_MUL
 739                 MMX_YUV_ADD
 740                 MMX_UNPACK_16
 741             );
 742             p_y += 8;
 743             p_u += 4;
 744             p_v += 4;
 745             p_buffer += 8;
 746         }
 747         SCALE_WIDTH;
 748         SCALE_HEIGHT( 420, 2 );
 749
             /* Skip the luma row padding after every row; the chroma padding is
              * skipped on odd rows only. */
 750         p_y += i_source_margin;
 751         if( i_y % 2 )
 752         {
 753             p_u += i_source_margin_c;
 754             p_v += i_source_margin_c;
 755         }
 756     }
 757     /* re-enable FPU registers */
 758     MMX_END;
 759
 760 #endif
 761 }
 762
 763 #endif
 764
 765 /*****************************************************************************
 766  * I420_RGB32: color YUV 4:2:0 to RGB 32 bpp
 767  *****************************************************************************
 768  * Horizontal alignment needed:
 769  *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
 770  *  - output: 1 pixel (4 bytes), margins allowed
 771  * Vertical alignment needed:
 772  *  - input: 2 lines (2 Y lines, 1 U/V line)
 773  *  - output: 1 line
 774  *****************************************************************************/
 775
 776 #if defined (MODULE_NAME_IS_i420_rgb)
 777
 778 void I420_RGB32( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
 779 {
         /* Plain C variant (only built when MODULE_NAME_IS_i420_rgb is defined).
          * Walks the source picture line by line and writes 32-bit pixels into
          * p_dest, optionally going through the intermediate line buffer when
          * horizontal scaling is requested.
          *
          * NOTE(review): CONVERT_YUV_PIXEL, CONVERT_Y_PIXEL, SCALE_WIDTH and
          * SCALE_HEIGHT are macros defined outside this excerpt; they presumably
          * read and update the locals declared below by name, so these names and
          * the statement order should not be changed without checking them. */
 780     /* We got this one from the old arguments */
 781     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
 782     uint8_t  *p_y   = p_src->Y_PIXELS;
 783     uint8_t  *p_u   = p_src->U_PIXELS;
 784     uint8_t  *p_v   = p_src->V_PIXELS;
 785
 786     bool  b_hscale;                         /* horizontal scaling type */
 787     unsigned int i_vscale;                          /* vertical scaling type */
 788     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
 789
 790     int         i_right_margin;
 791     int         i_rewind;
 792     int         i_scale_count;                       /* scale modulo counter */
 793     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
 794     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
 795     int         i_uval, i_vval;                           /* U and V samples */
 796     int         i_red, i_green, i_blue;          /* U and V modified samples */
 797     uint32_t *  p_yuv = p_filter->p_sys->p_rgb32;
 798     uint32_t *  p_ybase;                     /* Y dependent conversion table */
 799
 800     /* Conversion buffer pointer */
 801     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
 802     uint32_t *  p_buffer;
 803
 804     /* Offset array pointer */
 805     int *       p_offset_start = p_filter->p_sys->p_offset;
 806     int *       p_offset;
 807
         /* Bytes of padding after the visible part of each luma / chroma line */
 808     const int i_source_margin = p_src->p[0].i_pitch
 809                                  - p_src->p[0].i_visible_pitch;
 810     const int i_source_margin_c = p_src->p[1].i_pitch
 811                                  - p_src->p[1].i_visible_pitch;
 812
         /* Bytes of padding after the visible part of each destination line */
 813     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
         /* Number of pixels missing to round the width up to a multiple of 8
          * (0 when the width already is one) */
 814     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
 815
 816     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
 817      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
 818      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
 819     SetOffset( p_filter->fmt_in.video.i_width,
 820                p_filter->fmt_in.video.i_height,
 821                p_filter->fmt_out.video.i_width,
 822                p_filter->fmt_out.video.i_height,
 823                &b_hscale, &i_vscale, p_offset_start );
 824
 825     /*
 826      * Perform conversion
 827      */
         /* The scale counter starts from the output height when i_vscale is 1
          * and from the input height otherwise */
 828     i_scale_count = ( i_vscale == 1 ) ?
 829                     p_filter->fmt_out.video.i_height :
 830                     p_filter->fmt_in.video.i_height;
 831     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
 832     {
 833         p_pic_start = p_pic;
             /* Convert into the scratch line when scaling horizontally,
              * straight into the destination picture otherwise */
 834         p_buffer = b_hscale ? p_buffer_start : p_pic;
 835
             /* One iteration per group of 8 source pixels */
 836         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
 837         {
 838             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
 839             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
 840             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
 841             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
 842         }
 843
 844         /* Here we do some unaligned reads and duplicate conversions, but
 845          * at least we have all the pixels */
 846         if( i_rewind )
 847         {
                 /* Step back so that one more full group ends exactly on the
                  * last pixel of the line; chroma steps back half as far */
 848             p_y -= i_rewind;
 849             p_u -= i_rewind >> 1;
 850             p_v -= i_rewind >> 1;
 851             p_buffer -= i_rewind;
 852             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
 853             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
 854             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
 855             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
 856         }
 857         SCALE_WIDTH;
 858         SCALE_HEIGHT( 420, 4 );
 859
             /* Skip the luma padding on every line; the chroma padding is only
              * skipped after odd lines */
 860         p_y += i_source_margin;
 861         if( i_y % 2 )
 862         {
 863             p_u += i_source_margin_c;
 864             p_v += i_source_margin_c;
 865         }
 866     }
 867 }
 868
 869 #else // defined (MODULE_NAME_IS_i420_rgb_mmx) || defined (MODULE_NAME_IS_i420_rgb_sse2)
 870
 871 VLC_TARGET
 872 void I420_A8R8G8B8( filter_t *p_filter, picture_t *p_src,
 873                                             picture_t *p_dest )
 874 {
 875     /* We got this one from the old arguments */
 876     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
 877     uint8_t  *p_y   = p_src->Y_PIXELS;
 878     uint8_t  *p_u   = p_src->U_PIXELS;
 879     uint8_t  *p_v   = p_src->V_PIXELS;
 880
 881     bool  b_hscale;                         /* horizontal scaling type */
 882     unsigned int i_vscale;                          /* vertical scaling type */
 883     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
 884
 885     int         i_right_margin;
 886     int         i_rewind;
 887     int         i_scale_count;                       /* scale modulo counter */
 888     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
 889     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
 890     /* Conversion buffer pointer */
 891     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
 892     uint32_t *  p_buffer;
 893
 894     /* Offset array pointer */
 895     int *       p_offset_start = p_filter->p_sys->p_offset;
 896     int *       p_offset;
 897
 898     const int i_source_margin = p_src->p[0].i_pitch
 899                                  - p_src->p[0].i_visible_pitch;
 900     const int i_source_margin_c = p_src->p[1].i_pitch
 901                                  - p_src->p[1].i_visible_pitch;
 902
 903     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
 904
 905     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
 906      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
 907      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
 908     SetOffset( p_filter->fmt_in.video.i_width,
 909                p_filter->fmt_in.video.i_height,
 910                p_filter->fmt_out.video.i_width,
 911                p_filter->fmt_out.video.i_height,
 912                &b_hscale, &i_vscale, p_offset_start );
 913
 914     /*
 915      * Perform conversion
 916      */
 917     i_scale_count = ( i_vscale == 1 ) ?
 918                     p_filter->fmt_out.video.i_height :
 919                     p_filter->fmt_in.video.i_height;
 920
 921 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
 922
 923     i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
 924
 925     /*
 926     ** SSE2 128 bits fetch/store instructions are faster
 927     ** if memory access is 16 bytes aligned
 928     */
 929
 930     p_buffer = b_hscale ? p_buffer_start : p_pic;
 931     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
 932                     p_dest->p->i_pitch|
 933                     ((intptr_t)p_y)|
 934                     ((intptr_t)p_buffer))) )
 935     {
 936         /* use faster SSE2 aligned fetch and store */
 937         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
 938         {
 939             p_pic_start = p_pic;
 940
 941             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
 942             {
 943                 SSE2_CALL (
 944                     SSE2_INIT_32_ALIGNED
 945                     SSE2_YUV_MUL
 946                     SSE2_YUV_ADD
 947                     SSE2_UNPACK_32_ARGB_ALIGNED
 948                 );
 949                 p_y += 16;
 950                 p_u += 8;
 951                 p_v += 8;
 952                 p_buffer += 16;
 953             }
 954
 955             /* Here we do some unaligned reads and duplicate conversions, but
 956              * at least we have all the pixels */
 957             if( i_rewind )
 958             {
 959                 p_y -= i_rewind;
 960                 p_u -= i_rewind >> 1;
 961                 p_v -= i_rewind >> 1;
 962                 p_buffer -= i_rewind;
 963                 SSE2_CALL (
 964                     SSE2_INIT_32_UNALIGNED
 965                     SSE2_YUV_MUL
 966                     SSE2_YUV_ADD
 967                     SSE2_UNPACK_32_ARGB_UNALIGNED
 968                 );
 969                 p_y += 16;
 970                 p_u += 4;
 971                 p_v += 4;
 972             }
 973             SCALE_WIDTH;
 974             SCALE_HEIGHT( 420, 4 );
 975
 976             p_y += i_source_margin;
 977             if( i_y % 2 )
 978             {
 979                 p_u += i_source_margin_c;
 980                 p_v += i_source_margin_c;
 981             }
 982             p_buffer = b_hscale ? p_buffer_start : p_pic;
 983         }
 984     }
 985     else
 986     {
 987         /* use slower SSE2 unaligned fetch and store */
 988         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
 989         {
 990             p_pic_start = p_pic;
 991             p_buffer = b_hscale ? p_buffer_start : p_pic;
 992
 993             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
 994             {
 995                 SSE2_CALL (
 996                     SSE2_INIT_32_UNALIGNED
 997                     SSE2_YUV_MUL
 998                     SSE2_YUV_ADD
 999                     SSE2_UNPACK_32_ARGB_UNALIGNED
1000                 );
1001                 p_y += 16;
1002                 p_u += 8;
1003                 p_v += 8;
1004                 p_buffer += 16;
1005             }
1006
1007             /* Here we do some unaligned reads and duplicate conversions, but
1008              * at least we have all the pixels */
1009             if( i_rewind )
1010             {
1011                 p_y -= i_rewind;
1012                 p_u -= i_rewind >> 1;
1013                 p_v -= i_rewind >> 1;
1014                 p_buffer -= i_rewind;
1015                 SSE2_CALL (
1016                     SSE2_INIT_32_UNALIGNED
1017                     SSE2_YUV_MUL
1018                     SSE2_YUV_ADD
1019                     SSE2_UNPACK_32_ARGB_UNALIGNED
1020                 );
1021                 p_y += 16;
1022                 p_u += 8;
1023                 p_v += 8;
1024             }
1025             SCALE_WIDTH;
1026             SCALE_HEIGHT( 420, 4 );
1027
1028             p_y += i_source_margin;
1029             if( i_y % 2 )
1030             {
1031                 p_u += i_source_margin_c;
1032                 p_v += i_source_margin_c;
1033             }
1034             p_buffer = b_hscale ? p_buffer_start : p_pic;
1035         }
1036     }
1037
1038     /* make sure all SSE2 stores are visible thereafter */
1039     SSE2_END;
1040
1041 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
1042
1043     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
1044
1045     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1046     {
1047         p_pic_start = p_pic;
1048         p_buffer = b_hscale ? p_buffer_start : p_pic;
1049
1050         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
1051         {
1052             MMX_CALL (
1053                 MMX_INIT_32
1054                 MMX_YUV_MUL
1055                 MMX_YUV_ADD
1056                 MMX_UNPACK_32_ARGB
1057             );
1058             p_y += 8;
1059             p_u += 4;
1060             p_v += 4;
1061             p_buffer += 8;
1062         }
1063
1064         /* Here we do some unaligned reads and duplicate conversions, but
1065          * at least we have all the pixels */
1066         if( i_rewind )
1067         {
1068             p_y -= i_rewind;
1069             p_u -= i_rewind >> 1;
1070             p_v -= i_rewind >> 1;
1071             p_buffer -= i_rewind;
1072             MMX_CALL (
1073                 MMX_INIT_32
1074                 MMX_YUV_MUL
1075                 MMX_YUV_ADD
1076                 MMX_UNPACK_32_ARGB
1077             );
1078             p_y += 8;
1079             p_u += 4;
1080             p_v += 4;
1081             p_buffer += 8;
1082         }
1083         SCALE_WIDTH;
1084         SCALE_HEIGHT( 420, 4 );
1085
1086         p_y += i_source_margin;
1087         if( i_y % 2 )
1088         {
1089             p_u += i_source_margin_c;
1090             p_v += i_source_margin_c;
1091         }
1092     }
1093
1094     /* re-enable FPU registers */
1095     MMX_END;
1096
1097 #endif
1098 }
1099
1100 VLC_TARGET
1101 void I420_R8G8B8A8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
1102 {
1103     /* We got this one from the old arguments */
1104     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1105     uint8_t  *p_y   = p_src->Y_PIXELS;
1106     uint8_t  *p_u   = p_src->U_PIXELS;
1107     uint8_t  *p_v   = p_src->V_PIXELS;
1108
1109     bool  b_hscale;                         /* horizontal scaling type */
1110     unsigned int i_vscale;                          /* vertical scaling type */
1111     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
1112
1113     int         i_right_margin;
1114     int         i_rewind;
1115     int         i_scale_count;                       /* scale modulo counter */
1116     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
1117     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
1118     /* Conversion buffer pointer */
1119     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
1120     uint32_t *  p_buffer;
1121
1122     /* Offset array pointer */
1123     int *       p_offset_start = p_filter->p_sys->p_offset;
1124     int *       p_offset;
1125
1126     const int i_source_margin = p_src->p[0].i_pitch
1127                                  - p_src->p[0].i_visible_pitch;
1128     const int i_source_margin_c = p_src->p[1].i_pitch
1129                                  - p_src->p[1].i_visible_pitch;
1130
1131     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1132
1133     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1134      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1135      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1136     SetOffset( p_filter->fmt_in.video.i_width,
1137                p_filter->fmt_in.video.i_height,
1138                p_filter->fmt_out.video.i_width,
1139                p_filter->fmt_out.video.i_height,
1140                &b_hscale, &i_vscale, p_offset_start );
1141
1142     /*
1143      * Perform conversion
1144      */
1145     i_scale_count = ( i_vscale == 1 ) ?
1146                     p_filter->fmt_out.video.i_height :
1147                     p_filter->fmt_in.video.i_height;
1148
1149 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
1150
1151     i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
1152
1153     /*
1154     ** SSE2 128 bits fetch/store instructions are faster
1155     ** if memory access is 16 bytes aligned
1156     */
1157
1158     p_buffer = b_hscale ? p_buffer_start : p_pic;
1159     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1160                     p_dest->p->i_pitch|
1161                     ((intptr_t)p_y)|
1162                     ((intptr_t)p_buffer))) )
1163     {
1164         /* use faster SSE2 aligned fetch and store */
1165         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1166         {
1167             p_pic_start = p_pic;
1168
1169             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1170             {
1171                 SSE2_CALL (
1172                     SSE2_INIT_32_ALIGNED
1173                     SSE2_YUV_MUL
1174                     SSE2_YUV_ADD
1175                     SSE2_UNPACK_32_RGBA_ALIGNED
1176                 );
1177                 p_y += 16;
1178                 p_u += 8;
1179                 p_v += 8;
1180                 p_buffer += 16;
1181             }
1182
1183             /* Here we do some unaligned reads and duplicate conversions, but
1184              * at least we have all the pixels */
1185             if( i_rewind )
1186             {
1187                 p_y -= i_rewind;
1188                 p_u -= i_rewind >> 1;
1189                 p_v -= i_rewind >> 1;
1190                 p_buffer -= i_rewind;
1191                 SSE2_CALL (
1192                     SSE2_INIT_32_UNALIGNED
1193                     SSE2_YUV_MUL
1194                     SSE2_YUV_ADD
1195                     SSE2_UNPACK_32_RGBA_UNALIGNED
1196                 );
1197                 p_y += 16;
1198                 p_u += 4;
1199                 p_v += 4;
1200             }
1201             SCALE_WIDTH;
1202             SCALE_HEIGHT( 420, 4 );
1203
1204             p_y += i_source_margin;
1205             if( i_y % 2 )
1206             {
1207                 p_u += i_source_margin_c;
1208                 p_v += i_source_margin_c;
1209             }
1210             p_buffer = b_hscale ? p_buffer_start : p_pic;
1211         }
1212     }
1213     else
1214     {
1215         /* use slower SSE2 unaligned fetch and store */
1216         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1217         {
1218             p_pic_start = p_pic;
1219             p_buffer = b_hscale ? p_buffer_start : p_pic;
1220
1221             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1222             {
1223                 SSE2_CALL (
1224                     SSE2_INIT_32_UNALIGNED
1225                     SSE2_YUV_MUL
1226                     SSE2_YUV_ADD
1227                     SSE2_UNPACK_32_RGBA_UNALIGNED
1228                 );
1229                 p_y += 16;
1230                 p_u += 8;
1231                 p_v += 8;
1232                 p_buffer += 16;
1233             }
1234
1235             /* Here we do some unaligned reads and duplicate conversions, but
1236              * at least we have all the pixels */
1237             if( i_rewind )
1238             {
1239                 p_y -= i_rewind;
1240                 p_u -= i_rewind >> 1;
1241                 p_v -= i_rewind >> 1;
1242                 p_buffer -= i_rewind;
1243                 SSE2_CALL (
1244                     SSE2_INIT_32_UNALIGNED
1245                     SSE2_YUV_MUL
1246                     SSE2_YUV_ADD
1247                     SSE2_UNPACK_32_RGBA_UNALIGNED
1248                 );
1249                 p_y += 16;
1250                 p_u += 8;
1251                 p_v += 8;
1252             }
1253             SCALE_WIDTH;
1254             SCALE_HEIGHT( 420, 4 );
1255
1256             p_y += i_source_margin;
1257             if( i_y % 2 )
1258             {
1259                 p_u += i_source_margin_c;
1260                 p_v += i_source_margin_c;
1261             }
1262             p_buffer = b_hscale ? p_buffer_start : p_pic;
1263         }
1264     }
1265
1266     /* make sure all SSE2 stores are visible thereafter */
1267     SSE2_END;
1268
1269 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
1270
1271     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
1272
1273     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1274     {
1275         p_pic_start = p_pic;
1276         p_buffer = b_hscale ? p_buffer_start : p_pic;
1277
1278         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
1279         {
1280             MMX_CALL (
1281                 MMX_INIT_32
1282                 MMX_YUV_MUL
1283                 MMX_YUV_ADD
1284                 MMX_UNPACK_32_RGBA
1285             );
1286             p_y += 8;
1287             p_u += 4;
1288             p_v += 4;
1289             p_buffer += 8;
1290         }
1291
1292         /* Here we do some unaligned reads and duplicate conversions, but
1293          * at least we have all the pixels */
1294         if( i_rewind )
1295         {
1296             p_y -= i_rewind;
1297             p_u -= i_rewind >> 1;
1298             p_v -= i_rewind >> 1;
1299             p_buffer -= i_rewind;
1300             MMX_CALL (
1301                 MMX_INIT_32
1302                 MMX_YUV_MUL
1303                 MMX_YUV_ADD
1304                 MMX_UNPACK_32_RGBA
1305             );
1306             p_y += 8;
1307             p_u += 4;
1308             p_v += 4;
1309             p_buffer += 8;
1310         }
1311         SCALE_WIDTH;
1312         SCALE_HEIGHT( 420, 4 );
1313
1314         p_y += i_source_margin;
1315         if( i_y % 2 )
1316         {
1317             p_u += i_source_margin_c;
1318             p_v += i_source_margin_c;
1319         }
1320     }
1321
1322     /* re-enable FPU registers */
1323     MMX_END;
1324
1325 #endif
1326 }
1327
1328 VLC_TARGET
1329 void I420_B8G8R8A8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
1330 {
1331     /* We got this one from the old arguments */
1332     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1333     uint8_t  *p_y   = p_src->Y_PIXELS;
1334     uint8_t  *p_u   = p_src->U_PIXELS;
1335     uint8_t  *p_v   = p_src->V_PIXELS;
1336
1337     bool  b_hscale;                         /* horizontal scaling type */
1338     unsigned int i_vscale;                          /* vertical scaling type */
1339     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
1340
1341     int         i_right_margin;
1342     int         i_rewind;
1343     int         i_scale_count;                       /* scale modulo counter */
1344     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
1345     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
1346     /* Conversion buffer pointer */
1347     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
1348     uint32_t *  p_buffer;
1349
1350     /* Offset array pointer */
1351     int *       p_offset_start = p_filter->p_sys->p_offset;
1352     int *       p_offset;
1353
1354     const int i_source_margin = p_src->p[0].i_pitch
1355                                  - p_src->p[0].i_visible_pitch;
1356     const int i_source_margin_c = p_src->p[1].i_pitch
1357                                  - p_src->p[1].i_visible_pitch;
1358
1359     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1360
1361     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1362      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1363      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1364     SetOffset( p_filter->fmt_in.video.i_width,
1365                p_filter->fmt_in.video.i_height,
1366                p_filter->fmt_out.video.i_width,
1367                p_filter->fmt_out.video.i_height,
1368                &b_hscale, &i_vscale, p_offset_start );
1369
1370     /*
1371      * Perform conversion
1372      */
1373     i_scale_count = ( i_vscale == 1 ) ?
1374                     p_filter->fmt_out.video.i_height :
1375                     p_filter->fmt_in.video.i_height;
1376
1377 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
1378
1379     i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
1380
1381     /*
1382     ** SSE2 128 bits fetch/store instructions are faster
1383     ** if memory access is 16 bytes aligned
1384     */
1385
1386     p_buffer = b_hscale ? p_buffer_start : p_pic;
1387     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1388                     p_dest->p->i_pitch|
1389                     ((intptr_t)p_y)|
1390                     ((intptr_t)p_buffer))) )
1391     {
1392         /* use faster SSE2 aligned fetch and store */
1393         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1394         {
1395             p_pic_start = p_pic;
1396
1397             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1398             {
1399                 SSE2_CALL (
1400                     SSE2_INIT_32_ALIGNED
1401                     SSE2_YUV_MUL
1402                     SSE2_YUV_ADD
1403                     SSE2_UNPACK_32_BGRA_ALIGNED
1404                 );
1405                 p_y += 16;
1406                 p_u += 8;
1407                 p_v += 8;
1408                 p_buffer += 16;
1409             }
1410
1411             /* Here we do some unaligned reads and duplicate conversions, but
1412              * at least we have all the pixels */
1413             if( i_rewind )
1414             {
1415                 p_y -= i_rewind;
1416                 p_u -= i_rewind >> 1;
1417                 p_v -= i_rewind >> 1;
1418                 p_buffer -= i_rewind;
1419                 SSE2_CALL (
1420                     SSE2_INIT_32_UNALIGNED
1421                     SSE2_YUV_MUL
1422                     SSE2_YUV_ADD
1423                     SSE2_UNPACK_32_BGRA_UNALIGNED
1424                 );
1425                 p_y += 16;
1426                 p_u += 4;
1427                 p_v += 4;
1428             }
1429             SCALE_WIDTH;
1430             SCALE_HEIGHT( 420, 4 );
1431
1432             p_y += i_source_margin;
1433             if( i_y % 2 )
1434             {
1435                 p_u += i_source_margin_c;
1436                 p_v += i_source_margin_c;
1437             }
1438             p_buffer = b_hscale ? p_buffer_start : p_pic;
1439         }
1440     }
1441     else
1442     {
1443         /* use slower SSE2 unaligned fetch and store */
1444         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1445         {
1446             p_pic_start = p_pic;
1447             p_buffer = b_hscale ? p_buffer_start : p_pic;
1448
1449             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1450             {
1451                 SSE2_CALL (
1452                     SSE2_INIT_32_UNALIGNED
1453                     SSE2_YUV_MUL
1454                     SSE2_YUV_ADD
1455                     SSE2_UNPACK_32_BGRA_UNALIGNED
1456                 );
1457                 p_y += 16;
1458                 p_u += 8;
1459                 p_v += 8;
1460                 p_buffer += 16;
1461             }
1462
1463             /* Here we do some unaligned reads and duplicate conversions, but
1464              * at least we have all the pixels */
1465             if( i_rewind )
1466             {
1467                 p_y -= i_rewind;
1468                 p_u -= i_rewind >> 1;
1469                 p_v -= i_rewind >> 1;
1470                 p_buffer -= i_rewind;
1471                 SSE2_CALL (
1472                     SSE2_INIT_32_UNALIGNED
1473                     SSE2_YUV_MUL
1474                     SSE2_YUV_ADD
1475                     SSE2_UNPACK_32_BGRA_UNALIGNED
1476                 );
1477                 p_y += 16;
1478                 p_u += 8;
1479                 p_v += 8;
1480             }
1481             SCALE_WIDTH;
1482             SCALE_HEIGHT( 420, 4 );
1483
1484             p_y += i_source_margin;
1485             if( i_y % 2 )
1486             {
1487                 p_u += i_source_margin_c;
1488                 p_v += i_source_margin_c;
1489             }
1490             p_buffer = b_hscale ? p_buffer_start : p_pic;
1491         }
1492     }
1493
1494 #else
1495
1496     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
1497
1498     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1499     {
1500         p_pic_start = p_pic;
1501         p_buffer = b_hscale ? p_buffer_start : p_pic;
1502
1503         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
1504         {
1505             MMX_CALL (
1506                 MMX_INIT_32
1507                 MMX_YUV_MUL
1508                 MMX_YUV_ADD
1509                 MMX_UNPACK_32_BGRA
1510             );
1511             p_y += 8;
1512             p_u += 4;
1513             p_v += 4;
1514             p_buffer += 8;
1515         }
1516
1517         /* Here we do some unaligned reads and duplicate conversions, but
1518          * at least we have all the pixels */
1519         if( i_rewind )
1520         {
1521             p_y -= i_rewind;
1522             p_u -= i_rewind >> 1;
1523             p_v -= i_rewind >> 1;
1524             p_buffer -= i_rewind;
1525             MMX_CALL (
1526                 MMX_INIT_32
1527                 MMX_YUV_MUL
1528                 MMX_YUV_ADD
1529                 MMX_UNPACK_32_BGRA
1530             );
1531             p_y += 8;
1532             p_u += 4;
1533             p_v += 4;
1534             p_buffer += 8;
1535         }
1536         SCALE_WIDTH;
1537         SCALE_HEIGHT( 420, 4 );
1538
1539         p_y += i_source_margin;
1540         if( i_y % 2 )
1541         {
1542             p_u += i_source_margin_c;
1543             p_v += i_source_margin_c;
1544         }
1545     }
1546
1547     /* re-enable FPU registers */
1548     MMX_END;
1549
1550 #endif
1551 }
1552
1553 VLC_TARGET
1554 void I420_A8B8G8R8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
1555 {
1556     /* We got this one from the old arguments */
1557     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1558     uint8_t  *p_y   = p_src->Y_PIXELS;
1559     uint8_t  *p_u   = p_src->U_PIXELS;
1560     uint8_t  *p_v   = p_src->V_PIXELS;
1561
1562     bool  b_hscale;                         /* horizontal scaling type */
1563     unsigned int i_vscale;                          /* vertical scaling type */
1564     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
1565
1566     int         i_right_margin;
1567     int         i_rewind;
1568     int         i_scale_count;                       /* scale modulo counter */
1569     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
1570     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
1571     /* Conversion buffer pointer */
1572     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
1573     uint32_t *  p_buffer;
1574
1575     /* Offset array pointer */
1576     int *       p_offset_start = p_filter->p_sys->p_offset;
1577     int *       p_offset;
1578
1579     const int i_source_margin = p_src->p[0].i_pitch
1580                                  - p_src->p[0].i_visible_pitch;
1581     const int i_source_margin_c = p_src->p[1].i_pitch
1582                                  - p_src->p[1].i_visible_pitch;
1583
1584     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1585
1586     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1587      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1588      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1589     SetOffset( p_filter->fmt_in.video.i_width,
1590                p_filter->fmt_in.video.i_height,
1591                p_filter->fmt_out.video.i_width,
1592                p_filter->fmt_out.video.i_height,
1593                &b_hscale, &i_vscale, p_offset_start );
1594
1595     /*
1596      * Perform conversion
1597      */
1598     i_scale_count = ( i_vscale == 1 ) ?
1599                     p_filter->fmt_out.video.i_height :
1600                     p_filter->fmt_in.video.i_height;
1601
1602 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
1603
1604     i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
1605
1606     /*
1607     ** SSE2 128 bits fetch/store instructions are faster
1608     ** if memory access is 16 bytes aligned
1609     */
1610
1611     p_buffer = b_hscale ? p_buffer_start : p_pic;
1612     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1613                     p_dest->p->i_pitch|
1614                     ((intptr_t)p_y)|
1615                     ((intptr_t)p_buffer))) )
1616     {
1617         /* use faster SSE2 aligned fetch and store */
1618         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1619         {
1620             p_pic_start = p_pic;
1621
1622             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1623             {
1624                 SSE2_CALL (
1625                     SSE2_INIT_32_ALIGNED
1626                     SSE2_YUV_MUL
1627                     SSE2_YUV_ADD
1628                     SSE2_UNPACK_32_ABGR_ALIGNED
1629                 );
1630                 p_y += 16;
1631                 p_u += 8;
1632                 p_v += 8;
1633                 p_buffer += 16;
1634             }
1635
1636             /* Here we do some unaligned reads and duplicate conversions, but
1637              * at least we have all the pixels */
1638             if( i_rewind )
1639             {
1640                 p_y -= i_rewind;
1641                 p_u -= i_rewind >> 1;
1642                 p_v -= i_rewind >> 1;
1643                 p_buffer -= i_rewind;
1644                 SSE2_CALL (
1645                     SSE2_INIT_32_UNALIGNED
1646                     SSE2_YUV_MUL
1647                     SSE2_YUV_ADD
1648                     SSE2_UNPACK_32_ABGR_UNALIGNED
1649                 );
1650                 p_y += 16;
1651                 p_u += 4;
1652                 p_v += 4;
1653             }
1654             SCALE_WIDTH;
1655             SCALE_HEIGHT( 420, 4 );
1656
1657             p_y += i_source_margin;
1658             if( i_y % 2 )
1659             {
1660                 p_u += i_source_margin_c;
1661                 p_v += i_source_margin_c;
1662             }
1663             p_buffer = b_hscale ? p_buffer_start : p_pic;
1664         }
1665     }
1666     else
1667     {
1668         /* use slower SSE2 unaligned fetch and store */
1669         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1670         {
1671             p_pic_start = p_pic;
1672             p_buffer = b_hscale ? p_buffer_start : p_pic;
1673
1674             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1675             {
1676                 SSE2_CALL (
1677                     SSE2_INIT_32_UNALIGNED
1678                     SSE2_YUV_MUL
1679                     SSE2_YUV_ADD
1680                     SSE2_UNPACK_32_ABGR_UNALIGNED
1681                 );
1682                 p_y += 16;
1683                 p_u += 8;
1684                 p_v += 8;
1685                 p_buffer += 16;
1686             }
1687
1688             /* Here we do some unaligned reads and duplicate conversions, but
1689              * at least we have all the pixels */
1690             if( i_rewind )
1691             {
1692                 p_y -= i_rewind;
1693                 p_u -= i_rewind >> 1;
1694                 p_v -= i_rewind >> 1;
1695                 p_buffer -= i_rewind;
1696                 SSE2_CALL (
1697                     SSE2_INIT_32_UNALIGNED
1698                     SSE2_YUV_MUL
1699                     SSE2_YUV_ADD
1700                     SSE2_UNPACK_32_ABGR_UNALIGNED
1701                 );
1702                 p_y += 16;
1703                 p_u += 8;
1704                 p_v += 8;
1705             }
1706             SCALE_WIDTH;
1707             SCALE_HEIGHT( 420, 4 );
1708
1709             p_y += i_source_margin;
1710             if( i_y % 2 )
1711             {
1712                 p_u += i_source_margin_c;
1713                 p_v += i_source_margin_c;
1714             }
1715             p_buffer = b_hscale ? p_buffer_start : p_pic;
1716         }
1717     }
1718
1719 #else
1720
1721     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
1722
1723     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1724     {
1725         p_pic_start = p_pic;
1726         p_buffer = b_hscale ? p_buffer_start : p_pic;
1727
1728         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
1729         {
1730             MMX_CALL (
1731                 MMX_INIT_32
1732                 MMX_YUV_MUL
1733                 MMX_YUV_ADD
1734                 MMX_UNPACK_32_ABGR
1735             );
1736             p_y += 8;
1737             p_u += 4;
1738             p_v += 4;
1739             p_buffer += 8;
1740         }
1741
1742         /* Here we do some unaligned reads and duplicate conversions, but
1743          * at least we have all the pixels */
1744         if( i_rewind )
1745         {
1746             p_y -= i_rewind;
1747             p_u -= i_rewind >> 1;
1748             p_v -= i_rewind >> 1;
1749             p_buffer -= i_rewind;
1750             MMX_CALL (
1751                 MMX_INIT_32
1752                 MMX_YUV_MUL
1753                 MMX_YUV_ADD
1754                 MMX_UNPACK_32_ABGR
1755             );
1756             p_y += 8;
1757             p_u += 4;
1758             p_v += 4;
1759             p_buffer += 8;
1760         }
1761         SCALE_WIDTH;
1762         SCALE_HEIGHT( 420, 4 );
1763
1764         p_y += i_source_margin;
1765         if( i_y % 2 )
1766         {
1767             p_u += i_source_margin_c;
1768             p_v += i_source_margin_c;
1769         }
1770     }
1771
1772     /* re-enable FPU registers */
1773     MMX_END;
1774
1775 #endif
1776 }
1777
1778 #endif
1779
1780 /* Following functions are local */
1781
/*****************************************************************************
 * SetOffset: fill the horizontal offset table and report scaling modes
 *****************************************************************************
 * i_width / i_height are the source dimensions, i_pic_width / i_pic_height
 * the destination picture dimensions.
 *
 * Horizontal axis:
 *  - equal widths: *pb_hscale is cleared and p_offset is left untouched;
 *  - wider destination: *pb_hscale is set; for every source column the table
 *    receives zero or more 0 entries followed by a single 1;
 *  - narrower destination: *pb_hscale is set; the table receives one entry
 *    (always >= 1) per destination column.
 *
 * Vertical axis: *pi_vscale is set to 0 when heights match, 1 when the
 * destination is taller and -1 (stored in an unsigned int) when it is shorter.
 *****************************************************************************/
static void SetOffset( int i_width, int i_height, int i_pic_width,
                       int i_pic_height, bool *pb_hscale,
                       unsigned int *pi_vscale, int *p_offset )
{
    const int i_width_delta  = i_pic_width - i_width;
    const int i_height_delta = i_pic_height - i_height;
    int i_remaining;                        /* columns still to be processed */
    int i_accum;                                     /* running error term */

    /*
     * Horizontal offset table
     */
    if( i_width_delta == 0 )
    {
        /* Same width: conversion writes straight into the picture */
        *pb_hscale = false;
    }
    else if( i_width_delta > 0 )
    {
        /* Destination is wider: walk the source columns */
        *pb_hscale = true;
        i_accum = i_pic_width;
        i_remaining = i_width;
        while( i_remaining-- )
        {
            /* Emit a 0 for as long as the error term stays positive */
            i_accum -= i_width;
            while( i_accum > 0 )
            {
                *p_offset++ = 0;
                i_accum -= i_width;
            }
            /* ... then close the run with a 1 */
            *p_offset++ = 1;
            i_accum += i_pic_width;
        }
    }
    else
    {
        /* Destination is narrower: walk the destination columns */
        *pb_hscale = true;
        i_accum = i_width;
        i_remaining = i_pic_width;
        while( i_remaining-- )
        {
            int i_step = 1;

            /* Grow the step for as long as the error term stays positive */
            i_accum -= i_pic_width;
            while( i_accum > 0 )
            {
                i_step++;
                i_accum -= i_pic_width;
            }
            *p_offset++ = i_step;
            i_accum += i_width;
        }
    }

    /*
     * Vertical scaling indicator
     */
    if( i_height_delta > 0 )
    {
        *pi_vscale = 1;
    }
    else if( i_height_delta < 0 )
    {
        /* Deliberately wraps: callers receive (unsigned int)-1 */
        *pi_vscale = -1;
    }
    else
    {
        *pi_vscale = 0;
    }
}
1851