git.sesse.net Git - vlc/blob - modules/video_chroma/i420_rgb16.c
video_chroma: a few SSE2 fixes
1 /*****************************************************************************
2  * i420_rgb16.c : YUV to bitmap RGB conversion module for vlc
3  *****************************************************************************
4  * Copyright (C) 2000 the VideoLAN team
5  * $Id$
6  *
7  * Authors: Samuel Hocevar <sam@zoy.org>
8  *          Damien Fouilleul <damienf@videolan.org>
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License as published by
12  * the Free Software Foundation; either version 2 of the License, or
13  * (at your option) any later version.
14  * 
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with this program; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23  *****************************************************************************/
24
25 /*****************************************************************************
26  * Preamble
27  *****************************************************************************/
28 #include <string.h>                                            /* strerror() */
29 #include <stdlib.h>                                      /* malloc(), free() */
30
31 #include <vlc/vlc.h>
32 #include <vlc_vout.h>
33
34 #include "i420_rgb.h"
35 #if defined (MODULE_NAME_IS_i420_rgb)
36 #   include "i420_rgb_c.h"
37 #elif defined (MODULE_NAME_IS_i420_rgb_mmx)
38 #   if defined(HAVE_MMX_INTRINSICS)
39 #       include <mmintrin.h>
40 #   endif
41 #   include "i420_rgb_mmx.h"
42 #elif defined (MODULE_NAME_IS_i420_rgb_sse2)
43 #   if defined(HAVE_SSE2_INTRINSICS)
44 #       include <emmintrin.h>
45 #   endif
46 #   include "i420_rgb_mmx.h"
47 #endif
48
49 static void SetOffset( int, int, int, int, vlc_bool_t *,
50                        unsigned int *, int * );
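/* As its call sites below suggest, SetOffset() compares the render and output
 * sizes, selects the horizontal (b_hscale) and vertical (i_vscale) scaling
 * modes, and fills the offset table walked by the SCALE_WIDTH macro; being
 * static, it is defined further down in this file. */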
51
52 #if defined (MODULE_NAME_IS_i420_rgb)
53 /*****************************************************************************
54  * I420_RGB16: color YUV 4:2:0 to RGB 16 bpp with dithering
55  *****************************************************************************
56  * Horizontal alignment needed:
57  *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
58  *  - output: 1 pixel (2 bytes), margins allowed
59  * Vertical alignment needed:
60  *  - input: 2 lines (2 Y lines, 1 U/V line)
61  *  - output: 1 line
62  *****************************************************************************/
63 void E_(I420_RGB16_dither)( vout_thread_t *p_vout, picture_t *p_src,
64                                                       picture_t *p_dest )
65 {
66     /* We got this one from the old arguments */
67     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
68     uint8_t  *p_y   = p_src->Y_PIXELS;
69     uint8_t  *p_u   = p_src->U_PIXELS;
70     uint8_t  *p_v   = p_src->V_PIXELS;
71
72     vlc_bool_t   b_hscale;                        /* horizontal scaling type */
73     unsigned int i_vscale;                          /* vertical scaling type */
74     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
75     unsigned int i_real_y;                                          /* y % 4 */
76
77     int         i_right_margin;
78     int         i_rewind;
79     int         i_scale_count;                       /* scale modulo counter */
80     int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
81     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
82     int         i_uval, i_vval;                           /* U and V samples */
83     int         i_red, i_green, i_blue;          /* U and V modified samples */
84     uint16_t *  p_yuv = p_vout->chroma.p_sys->p_rgb16;
85     uint16_t *  p_ybase;                     /* Y dependent conversion table */
86
87     /* Conversion buffer pointer */
88     uint16_t *  p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
89     uint16_t *  p_buffer;
90
91     /* Offset array pointer */
92     int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
93     int *       p_offset;
94
95     const int i_source_margin = p_src->p[0].i_pitch
96                                  - p_src->p[0].i_visible_pitch;
97     const int i_source_margin_c = p_src->p[1].i_pitch
98                                  - p_src->p[1].i_visible_pitch;
99
100     /* The dithering matrices */
101     int dither10[4] = {  0x0,  0x8,  0x2,  0xa };
102     int dither11[4] = {  0xc,  0x4,  0xe,  0x6 };
103     int dither12[4] = {  0x3,  0xb,  0x1,  0x9 };
104     int dither13[4] = {  0xf,  0x7,  0xd,  0x5 };
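    /* These are the rows of the standard 4x4 Bayer ordered-dither matrix
     * (thresholds 0..15); the loop below shifts each threshold left by
     * (SHIFT - 4 + i_rrshift) so that it sits at the fixed-point scale used
     * by CONVERT_YUV_PIXEL_DITHER for the output's red depth. */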
105
106     for(i_x = 0; i_x < 4; i_x++)
107     {
108         dither10[i_x] = dither10[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
109         dither11[i_x] = dither11[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
110         dither12[i_x] = dither12[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
111         dither13[i_x] = dither13[i_x] << (SHIFT - 4 + p_vout->output.i_rrshift);
112     }
113
114     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
115
116     if( p_vout->render.i_width & 7 )
117     {
118         i_rewind = 8 - ( p_vout->render.i_width & 7 );
119     }
120     else
121     {
122         i_rewind = 0;
123     }
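    /* i_rewind is how far the pointers must step back so that the final
     * 8-pixel batch still ends exactly on the last column when the width is
     * not a multiple of 8. */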
124
125     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
126      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
127      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
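    /* For instance (illustrative numbers only): a 720x576 source rendered on
     * a 768x576 output with the same aspect ratio (r2/r1 = 1) and x1' = 768
     * gives y1' = 768 * 576/768 * 1 = 576. */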
128     SetOffset( p_vout->render.i_width, p_vout->render.i_height,
129                p_vout->output.i_width, p_vout->output.i_height,
130                &b_hscale, &i_vscale, p_offset_start );
131
132     /*
133      * Perform conversion
134      */
135     i_scale_count = ( i_vscale == 1 ) ?
136                     p_vout->output.i_height : p_vout->render.i_height;
137     for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
138     {
139         i_real_y = i_y & 0x3;
140         p_pic_start = p_pic;
141         p_buffer = b_hscale ? p_buffer_start : p_pic;
142
143         for ( i_x = p_vout->render.i_width / 8; i_x--; )
144         {
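            /* Chroma is horizontally subsampled, so a full YUV fetch is only
             * done for every second pixel; the CONVERT_Y_PIXEL_DITHER steps
             * reuse the chroma-derived values from the preceding pixel. */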
145             int *p_dither = dither10;
146             CONVERT_YUV_PIXEL_DITHER(2);
147             p_dither = dither11;
148             CONVERT_Y_PIXEL_DITHER(2);
149             p_dither = dither12;
150             CONVERT_YUV_PIXEL_DITHER(2);
151             p_dither = dither13;
152             CONVERT_Y_PIXEL_DITHER(2);
153             p_dither = dither10;
154             CONVERT_YUV_PIXEL_DITHER(2);
155             p_dither = dither11;
156             CONVERT_Y_PIXEL_DITHER(2);
157             p_dither = dither12;
158             CONVERT_YUV_PIXEL_DITHER(2);
159             p_dither = dither13;
160             CONVERT_Y_PIXEL_DITHER(2);
161         }
162
163         /* Here we do some unaligned reads and duplicate conversions, but
164          * at least we have all the pixels */
165         if( i_rewind )
166         {
167             int *p_dither = dither10;
168             p_y -= i_rewind;
169             p_u -= i_rewind >> 1;
170             p_v -= i_rewind >> 1;
171             p_buffer -= i_rewind;
172             CONVERT_YUV_PIXEL_DITHER(2);
173             p_dither = dither11;
174             CONVERT_Y_PIXEL_DITHER(2);
175             p_dither = dither12;
176             CONVERT_YUV_PIXEL_DITHER(2);
177             p_dither = dither13;
178             CONVERT_Y_PIXEL_DITHER(2);
179             p_dither = dither10;
180             CONVERT_YUV_PIXEL_DITHER(2);
181             p_dither = dither11;
182             CONVERT_Y_PIXEL_DITHER(2);
183             p_dither = dither12;
184             CONVERT_YUV_PIXEL_DITHER(2);
185             p_dither = dither13;
186             CONVERT_Y_PIXEL_DITHER(2);
187         }
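        /* SCALE_WIDTH stretches the converted line from the intermediate
         * buffer into the picture when horizontal scaling is enabled, and
         * SCALE_HEIGHT duplicates or drops lines according to i_scale_count
         * (both are macros from the i420_rgb_*.h headers). */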
188         SCALE_WIDTH;
189         SCALE_HEIGHT( 420, 2 );
190
191         p_y += i_source_margin;
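        /* in 4:2:0 the chroma planes have one line per two luma lines, so
         * their padding is only skipped on every other pass */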
192         if( i_y % 2 )
193         {
194             p_u += i_source_margin_c;
195             p_v += i_source_margin_c;
196         }
197     }
198 }
199 #endif
200
201 /*****************************************************************************
202  * I420_RGB16: color YUV 4:2:0 to RGB 16 bpp
203  *****************************************************************************
204  * Horizontal alignment needed:
205  *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
206  *  - output: 1 pixel (2 bytes), margins allowed
207  * Vertical alignment needed:
208  *  - input: 2 lines (2 Y lines, 1 U/V line)
209  *  - output: 1 line
210  *****************************************************************************/
211
212 #if defined (MODULE_NAME_IS_i420_rgb)
213
214 void E_(I420_RGB16)( vout_thread_t *p_vout, picture_t *p_src,
215                                             picture_t *p_dest )
216 {
217     /* We got this one from the old arguments */
218     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
219     uint8_t  *p_y   = p_src->Y_PIXELS;
220     uint8_t  *p_u   = p_src->U_PIXELS;
221     uint8_t  *p_v   = p_src->V_PIXELS;
222
223     vlc_bool_t  b_hscale;                         /* horizontal scaling type */
224     unsigned int i_vscale;                          /* vertical scaling type */
225     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
226
227     int         i_right_margin;
228     int         i_rewind;
229     int         i_scale_count;                       /* scale modulo counter */
230     int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
231     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
232     int         i_uval, i_vval;                           /* U and V samples */
233     int         i_red, i_green, i_blue;          /* U and V modified samples */
234     uint16_t *  p_yuv = p_vout->chroma.p_sys->p_rgb16;
235     uint16_t *  p_ybase;                     /* Y dependent conversion table */
236
237     /* Conversion buffer pointer */
238     uint16_t *  p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
239     uint16_t *  p_buffer;
240
241     /* Offset array pointer */
242     int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
243     int *       p_offset;
244
245     const int i_source_margin = p_src->p[0].i_pitch
246                                  - p_src->p[0].i_visible_pitch;
247     const int i_source_margin_c = p_src->p[1].i_pitch
248                                  - p_src->p[1].i_visible_pitch;
249
250     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
251
252     if( p_vout->render.i_width & 7 )
253     {
254         i_rewind = 8 - ( p_vout->render.i_width & 7 );
255     }
256     else
257     {
258         i_rewind = 0;
259     }
260
261     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
262      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
263      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
264     SetOffset( p_vout->render.i_width, p_vout->render.i_height,
265                p_vout->output.i_width, p_vout->output.i_height,
266                &b_hscale, &i_vscale, p_offset_start );
267
268     /*
269      * Perform conversion
270      */
271     i_scale_count = ( i_vscale == 1 ) ?
272                     p_vout->output.i_height : p_vout->render.i_height;
273     for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
274     {
275         p_pic_start = p_pic;
276         p_buffer = b_hscale ? p_buffer_start : p_pic;
277
278         for ( i_x = p_vout->render.i_width / 8; i_x--; )
279         {
280             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
281             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
282             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
283             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
284         }
285
286         /* Here we do some unaligned reads and duplicate conversions, but
287          * at least we have all the pixels */
288         if( i_rewind )
289         {
290             p_y -= i_rewind;
291             p_u -= i_rewind >> 1;
292             p_v -= i_rewind >> 1;
293             p_buffer -= i_rewind;
294
295             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
296             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
297             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
298             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
299         }
300         SCALE_WIDTH;
301         SCALE_HEIGHT( 420, 2 );
302
303         p_y += i_source_margin;
304         if( i_y % 2 )
305         {
306             p_u += i_source_margin_c;
307             p_v += i_source_margin_c;
308         }
309     }
310 }
311
312 #else // defined (MODULE_NAME_IS_i420_rgb_mmx) || defined (MODULE_NAME_IS_i420_rgb_sse2)
313
314 void E_(I420_R5G5B5)( vout_thread_t *p_vout, picture_t *p_src,
315                                             picture_t *p_dest )
316 {
317     /* We got this one from the old arguments */
318     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
319     uint8_t  *p_y   = p_src->Y_PIXELS;
320     uint8_t  *p_u   = p_src->U_PIXELS;
321     uint8_t  *p_v   = p_src->V_PIXELS;
322
323     vlc_bool_t  b_hscale;                         /* horizontal scaling type */
324     unsigned int i_vscale;                          /* vertical scaling type */
325     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
326
327     int         i_right_margin;
328     int         i_rewind;
329     int         i_scale_count;                       /* scale modulo counter */
330     int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
331     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
332
333     /* Conversion buffer pointer */
334     uint16_t *  p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
335     uint16_t *  p_buffer;
336
337     /* Offset array pointer */
338     int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
339     int *       p_offset;
340
341     const int i_source_margin = p_src->p[0].i_pitch
342                                  - p_src->p[0].i_visible_pitch;
343     const int i_source_margin_c = p_src->p[1].i_pitch
344                                  - p_src->p[1].i_visible_pitch;
345
346     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
347
348     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
349      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
350      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
351     SetOffset( p_vout->render.i_width, p_vout->render.i_height,
352                p_vout->output.i_width, p_vout->output.i_height,
353                &b_hscale, &i_vscale, p_offset_start );
354
355
356     /*
357      * Perform conversion
358      */
359     i_scale_count = ( i_vscale == 1 ) ?
360                     p_vout->output.i_height : p_vout->render.i_height;
361
362 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
363
364     if( p_vout->render.i_width & 15 )
365     {
366         i_rewind = 16 - ( p_vout->render.i_width & 15 );
367     }
368     else
369     {
370         i_rewind = 0;
371     }
372
373     /*
374     ** SSE2 128-bit fetch/store instructions are faster
375     ** when the memory access is 16-byte aligned
376     */
377
378     p_buffer = b_hscale ? p_buffer_start : p_pic;
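    /* ORing both pitches and both start addresses lets a single test of the
     * low four bits decide, once per picture, whether every 16-byte access in
     * the loops below will be aligned. */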
379     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
380                     p_dest->p->i_pitch|
381                     ((uintptr_t)p_y)|
382                     ((uintptr_t)p_buffer))) )
383     {
384         /* use faster SSE2 aligned fetch and store */
385         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
386         {
387             p_pic_start = p_pic;
388
389             for ( i_x = p_vout->render.i_width/16; i_x--; )
390             {
391 #if defined (CAN_COMPILE_SSE2)
392                 __asm__( ".p2align 3"
393                          SSE2_INIT_16_ALIGNED
394                          SSE2_YUV_MUL
395                          SSE2_YUV_ADD
396                          SSE2_UNPACK_15_ALIGNED
397                          : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
398 #else
399                 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
400                 SSE2_INTRINSICS_INIT_16_ALIGNED
401                 SSE2_INTRINSICS_YUV_MUL
402                 SSE2_INTRINSICS_YUV_ADD
403                 SSE2_INTRINSICS_UNPACK_15_ALIGNED
404 #endif
405                 p_y += 16;
406                 p_u += 8;
407                 p_v += 8;
408                 p_buffer += 16;
409             }
410             /* Here we do some unaligned reads and duplicate conversions, but
411              * at least we have all the pixels */
412             if( i_rewind )
413             {
414                 p_y -= i_rewind;
415                 p_u -= i_rewind >> 1;
416                 p_v -= i_rewind >> 1;
417                 p_buffer -= i_rewind;
418
419 #if defined (CAN_COMPILE_SSE2)
420                 __asm__( ".p2align 3"
421                          SSE2_INIT_16_UNALIGNED
422                          SSE2_YUV_MUL
423                          SSE2_YUV_ADD
424                          SSE2_UNPACK_15_UNALIGNED
425                          : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
426 #else
427                 {
428                     __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
429
430                     SSE2_INTRINSICS_INIT_16_UNALIGNED
431                     SSE2_INTRINSICS_YUV_MUL
432                     SSE2_INTRINSICS_YUV_ADD
433                     SSE2_INTRINSICS_UNPACK_15_UNALIGNED
434                 }
435 #endif
436                 p_y += 16;
437                 p_u += 8;
438                 p_v += 8;
439             }
440             SCALE_WIDTH;
441             SCALE_HEIGHT( 420, 2 );
442
443             p_y += i_source_margin;
444             if( i_y % 2 )
445             {
446                 p_u += i_source_margin_c;
447                 p_v += i_source_margin_c;
448             }
449             p_buffer = b_hscale ? p_buffer_start : p_pic;
450         }
451     }
452     else
453     {
454         /* use slower SSE2 unaligned fetch and store */
455         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
456         {
457             p_pic_start = p_pic;
458             p_buffer = b_hscale ? p_buffer_start : p_pic;
459
460             for ( i_x = p_vout->render.i_width/16; i_x--; )
461             {
462 #if defined (CAN_COMPILE_SSE2)
463                 __asm__( ".p2align 3"
464                          SSE2_INIT_16_UNALIGNED
465                          SSE2_YUV_MUL
466                          SSE2_YUV_ADD
467                          SSE2_UNPACK_15_UNALIGNED
468                          : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
469 #else
470                 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
471                 SSE2_INTRINSICS_INIT_16_UNALIGNED
472                 SSE2_INTRINSICS_YUV_MUL
473                 SSE2_INTRINSICS_YUV_ADD
474                 SSE2_INTRINSICS_UNPACK_15_UNALIGNED
475 #endif
476                 p_y += 16;
477                 p_u += 8;
478                 p_v += 8;
479                 p_buffer += 16;
480             }
481             /* Here we do some unaligned reads and duplicate conversions, but
482              * at least we have all the pixels */
483             if( i_rewind )
484             {
485                 p_y -= i_rewind;
486                 p_u -= i_rewind >> 1;
487                 p_v -= i_rewind >> 1;
488                 p_buffer -= i_rewind;
489
490 #if defined (CAN_COMPILE_SSE2)
491                 __asm__( ".p2align 3"
492                          SSE2_INIT_16_UNALIGNED
493                          SSE2_YUV_MUL
494                          SSE2_YUV_ADD
495                          SSE2_UNPACK_15_UNALIGNED
496                          : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
497 #else
498                 {
499                     __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
500
501                     SSE2_INTRINSICS_INIT_16_UNALIGNED
502                     SSE2_INTRINSICS_YUV_MUL
503                     SSE2_INTRINSICS_YUV_ADD
504                     SSE2_INTRINSICS_UNPACK_15_UNALIGNED
505                 }
506 #endif
507                 p_y += 16;
508                 p_u += 8;
509                 p_v += 8;
510             }
511             SCALE_WIDTH;
512             SCALE_HEIGHT( 420, 2 );
513
514             p_y += i_source_margin;
515             if( i_y % 2 )
516             {
517                 p_u += i_source_margin_c;
518                 p_v += i_source_margin_c;
519             }
520             p_buffer = b_hscale ? p_buffer_start : p_pic;
521         }
522     }
523
524     /* make sure all SSE2 stores are visible thereafter */
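    /* (presumably needed because the _ALIGNED unpack variants use
     * non-temporal stores, which are weakly ordered with respect to
     * ordinary writes) */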
525 #if defined (CAN_COMPILE_SSE2)
526     __asm__ __volatile__ ( "sfence" ::: "memory" );
527 #else
528     _mm_sfence();
529 #endif
530
531 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
532
533     if( p_vout->render.i_width & 7 )
534     {
535         i_rewind = 8 - ( p_vout->render.i_width & 7 );
536     }
537     else
538     {
539         i_rewind = 0;
540     }
541
542     for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
543     {
544         p_pic_start = p_pic;
545         p_buffer = b_hscale ? p_buffer_start : p_pic;
546
547         for ( i_x = p_vout->render.i_width / 8; i_x--; )
548         {
549 #if defined (CAN_COMPILE_MMX)
550             __asm__( ".p2align 3"
551                      MMX_INIT_16
552                      MMX_YUV_MUL
553                      MMX_YUV_ADD
554                      MMX_UNPACK_15
555                      : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
556 #else
557             __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
558             uint64_t tmp64;
559             MMX_INTRINSICS_INIT_16
560             MMX_INTRINSICS_YUV_MUL
561             MMX_INTRINSICS_YUV_ADD
562             MMX_INTRINSICS_UNPACK_15
563 #endif
564
565             p_y += 8;
566             p_u += 4;
567             p_v += 4;
568             p_buffer += 8;
569         }
570
571         /* Here we do some unaligned reads and duplicate conversions, but
572          * at least we have all the pixels */
573         if( i_rewind )
574         {
575             p_y -= i_rewind;
576             p_u -= i_rewind >> 1;
577             p_v -= i_rewind >> 1;
578             p_buffer -= i_rewind;
579
580 #if defined (CAN_COMPILE_MMX)
581             __asm__( ".p2align 3"
582                      MMX_INIT_16
583                      MMX_YUV_MUL
584                      MMX_YUV_ADD
585                      MMX_UNPACK_15
586                      : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
587 #else
588             {
589                 __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
590                 uint64_t tmp64;
591
592                 MMX_INTRINSICS_INIT_16
593                 MMX_INTRINSICS_YUV_MUL
594                 MMX_INTRINSICS_YUV_ADD
595                 MMX_INTRINSICS_UNPACK_15
596             }
597 #endif
598             p_y += 8;
599             p_u += 4;
600             p_v += 4;
601             p_buffer += 8;
602         }
603         SCALE_WIDTH;
604         SCALE_HEIGHT( 420, 2 );
605
606         p_y += i_source_margin;
607         if( i_y % 2 )
608         {
609             p_u += i_source_margin_c;
610             p_v += i_source_margin_c;
611         }
612     }
613     /* re-enable FPU registers */
614 #if defined (CAN_COMPILE_MMX)
615     __asm__ __volatile__ ( "emms" );
616 #else
617     _mm_empty();
618 #endif
619
620 #endif
621 }
622
623 void E_(I420_R5G6B5)( vout_thread_t *p_vout, picture_t *p_src,
624                                             picture_t *p_dest )
625 {
626     /* We got this one from the old arguments */
627     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
628     uint8_t  *p_y   = p_src->Y_PIXELS;
629     uint8_t  *p_u   = p_src->U_PIXELS;
630     uint8_t  *p_v   = p_src->V_PIXELS;
631
632     vlc_bool_t  b_hscale;                         /* horizontal scaling type */
633     unsigned int i_vscale;                          /* vertical scaling type */
634     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
635
636     int         i_right_margin;
637     int         i_rewind;
638     int         i_scale_count;                       /* scale modulo counter */
639     int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
640     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
641
642     /* Conversion buffer pointer */
643     uint16_t *  p_buffer_start = (uint16_t*)p_vout->chroma.p_sys->p_buffer;
644     uint16_t *  p_buffer;
645
646     /* Offset array pointer */
647     int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
648     int *       p_offset;
649
650     const int i_source_margin = p_src->p[0].i_pitch
651                                  - p_src->p[0].i_visible_pitch;
652     const int i_source_margin_c = p_src->p[1].i_pitch
653                                  - p_src->p[1].i_visible_pitch;
654
655     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
656
657     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
658      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
659      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
660     SetOffset( p_vout->render.i_width, p_vout->render.i_height,
661                p_vout->output.i_width, p_vout->output.i_height,
662                &b_hscale, &i_vscale, p_offset_start );
663
664
665     /*
666      * Perform conversion
667      */
668     i_scale_count = ( i_vscale == 1 ) ?
669                     p_vout->output.i_height : p_vout->render.i_height;
670
671 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
672
673     if( p_vout->render.i_width & 15 )
674     {
675         i_rewind = 16 - ( p_vout->render.i_width & 15 );
676     }
677     else
678     {
679         i_rewind = 0;
680     }
681
682     /*
683     ** SSE2 128-bit fetch/store instructions are faster
684     ** when the memory access is 16-byte aligned
685     */
686
687     p_buffer = b_hscale ? p_buffer_start : p_pic;
688     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
689                     p_dest->p->i_pitch|
690                     ((uintptr_t)p_y)|
691                     ((uintptr_t)p_buffer))) )
692     {
693         /* use faster SSE2 aligned fetch and store */
694         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
695         {
696             p_pic_start = p_pic;
697
698             for ( i_x = p_vout->render.i_width/16; i_x--; )
699             {
700 #if defined (CAN_COMPILE_SSE2)
701                 __asm__( ".p2align 3"
702                          SSE2_INIT_16_ALIGNED
703                          SSE2_YUV_MUL
704                          SSE2_YUV_ADD
705                          SSE2_UNPACK_16_ALIGNED
706                          : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
707 #else
708                 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
709                 SSE2_INTRINSICS_INIT_16_ALIGNED
710                 SSE2_INTRINSICS_YUV_MUL
711                 SSE2_INTRINSICS_YUV_ADD
712                 SSE2_INTRINSICS_UNPACK_16_ALIGNED
713 #endif
714                 p_y += 16;
715                 p_u += 8;
716                 p_v += 8;
717                 p_buffer += 16;
718             }
719             /* Here we do some unaligned reads and duplicate conversions, but
720              * at least we have all the pixels */
721             if( i_rewind )
722             {
723                 p_y -= i_rewind;
724                 p_u -= i_rewind >> 1;
725                 p_v -= i_rewind >> 1;
726                 p_buffer -= i_rewind;
727
728 #if defined (CAN_COMPILE_SSE2)
729                 __asm__( ".p2align 3"
730                          SSE2_INIT_16_UNALIGNED
731                          SSE2_YUV_MUL
732                          SSE2_YUV_ADD
733                          SSE2_UNPACK_16_UNALIGNED
734                          : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
735 #else
736                 {
737                     __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
738
739                     SSE2_INTRINSICS_INIT_16_UNALIGNED
740                     SSE2_INTRINSICS_YUV_MUL
741                     SSE2_INTRINSICS_YUV_ADD
742                     SSE2_INTRINSICS_UNPACK_16_UNALIGNED
743                 }
744 #endif
745                 p_y += 16;
746                 p_u += 8;
747                 p_v += 8;
748             }
749             SCALE_WIDTH;
750             SCALE_HEIGHT( 420, 2 );
751
752             p_y += i_source_margin;
753             if( i_y % 2 )
754             {
755                 p_u += i_source_margin_c;
756                 p_v += i_source_margin_c;
757             }
758             p_buffer = b_hscale ? p_buffer_start : p_pic;
759         }
760     }
761     else
762     {
763         /* use slower SSE2 unaligned fetch and store */
764         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
765         {
766             p_pic_start = p_pic;
767             p_buffer = b_hscale ? p_buffer_start : p_pic;
768
769             for ( i_x = p_vout->render.i_width/16; i_x--; )
770             {
771 #if defined (CAN_COMPILE_SSE2)
772                 __asm__( ".p2align 3"
773                          SSE2_INIT_16_UNALIGNED
774                          SSE2_YUV_MUL
775                          SSE2_YUV_ADD
776                          SSE2_UNPACK_16_UNALIGNED
777                          : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
778 #else
779                 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
780                 SSE2_INTRINSICS_INIT_16_UNALIGNED
781                 SSE2_INTRINSICS_YUV_MUL
782                 SSE2_INTRINSICS_YUV_ADD
783                 SSE2_INTRINSICS_UNPACK_16_UNALIGNED
784 #endif
785                 p_y += 16;
786                 p_u += 8;
787                 p_v += 8;
788                 p_buffer += 16;
789             }
790             /* Here we do some unaligned reads and duplicate conversions, but
791              * at least we have all the pixels */
792             if( i_rewind )
793             {
794                 p_y -= i_rewind;
795                 p_u -= i_rewind >> 1;
796                 p_v -= i_rewind >> 1;
797                 p_buffer -= i_rewind;
798
799 #if defined (CAN_COMPILE_SSE2)
800                 __asm__( ".p2align 3"
801                          SSE2_INIT_16_UNALIGNED
802                          SSE2_YUV_MUL
803                          SSE2_YUV_ADD
804                          SSE2_UNPACK_16_UNALIGNED
805                          : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
806 #else
807                 {
808                     __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
809
810                     SSE2_INTRINSICS_INIT_16_UNALIGNED
811                     SSE2_INTRINSICS_YUV_MUL
812                     SSE2_INTRINSICS_YUV_ADD
813                     SSE2_INTRINSICS_UNPACK_16_UNALIGNED
814                 }
815 #endif
816                 p_y += 16;
817                 p_u += 8;
818                 p_v += 8;
819             }
820             SCALE_WIDTH;
821             SCALE_HEIGHT( 420, 2 );
822
823             p_y += i_source_margin;
824             if( i_y % 2 )
825             {
826                 p_u += i_source_margin_c;
827                 p_v += i_source_margin_c;
828             }
829             p_buffer = b_hscale ? p_buffer_start : p_pic;
830         }
831     }
832
833     /* make sure all SSE2 stores are visible thereafter */
834 #if defined (CAN_COMPILE_SSE2)
835     __asm__ __volatile__ ( "sfence" ::: "memory" );
836 #else
837     _mm_sfence();
838 #endif
839
840 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
841
842     if( p_vout->render.i_width & 7 )
843     {
844         i_rewind = 8 - ( p_vout->render.i_width & 7 );
845     }
846     else
847     {
848         i_rewind = 0;
849     }
850
851     for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
852     {
853         p_pic_start = p_pic;
854         p_buffer = b_hscale ? p_buffer_start : p_pic;
855
856         for ( i_x = p_vout->render.i_width / 8; i_x--; )
857         {
858 #if defined (CAN_COMPILE_MMX)
859             __asm__( ".p2align 3"
860                      MMX_INIT_16
861                      MMX_YUV_MUL
862                      MMX_YUV_ADD
863                      MMX_UNPACK_16
864                      : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
865 #else
866             __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
867             uint64_t tmp64;
868             MMX_INTRINSICS_INIT_16
869             MMX_INTRINSICS_YUV_MUL
870             MMX_INTRINSICS_YUV_ADD
871             MMX_INTRINSICS_UNPACK_16
872 #endif
873
874             p_y += 8;
875             p_u += 4;
876             p_v += 4;
877             p_buffer += 8;
878         }
879
880         /* Here we do some unaligned reads and duplicate conversions, but
881          * at least we have all the pixels */
882         if( i_rewind )
883         {
884             p_y -= i_rewind;
885             p_u -= i_rewind >> 1;
886             p_v -= i_rewind >> 1;
887             p_buffer -= i_rewind;
888
889 #if defined (CAN_COMPILE_MMX)
890             __asm__( ".p2align 3"
891                      MMX_INIT_16
892                      MMX_YUV_MUL
893                      MMX_YUV_ADD
894                      MMX_UNPACK_16
895                      : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
896 #else
897             {
898                 __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
899                 uint64_t tmp64;
900
901                 MMX_INTRINSICS_INIT_16
902                 MMX_INTRINSICS_YUV_MUL
903                 MMX_INTRINSICS_YUV_ADD
904                 MMX_INTRINSICS_UNPACK_16
905             }
906 #endif
907             p_y += 8;
908             p_u += 4;
909             p_v += 4;
910             p_buffer += 8;
911         }
912         SCALE_WIDTH;
913         SCALE_HEIGHT( 420, 2 );
914
915         p_y += i_source_margin;
916         if( i_y % 2 )
917         {
918             p_u += i_source_margin_c;
919             p_v += i_source_margin_c;
920         }
921     }
922     /* re-enable FPU registers */
923 #if defined (CAN_COMPILE_MMX)
924     __asm__ __volatile__ ( "emms" );
925 #else
926     _mm_empty();
927 #endif
928
929 #endif
930 }
931
932 #endif
933
934 /*****************************************************************************
935  * I420_RGB32: color YUV 4:2:0 to RGB 32 bpp
936  *****************************************************************************
937  * Horizontal alignment needed:
938  *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
939  *  - output: 1 pixel (4 bytes), margins allowed
940  * Vertical alignment needed:
941  *  - input: 2 lines (2 Y lines, 1 U/V line)
942  *  - output: 1 line
943  *****************************************************************************/
944
945 #if defined (MODULE_NAME_IS_i420_rgb)
946
947 void E_(I420_RGB32)( vout_thread_t *p_vout, picture_t *p_src,
948                                             picture_t *p_dest )
949 {
950     /* We got this one from the old arguments */
951     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
952     uint8_t  *p_y   = p_src->Y_PIXELS;
953     uint8_t  *p_u   = p_src->U_PIXELS;
954     uint8_t  *p_v   = p_src->V_PIXELS;
955
956     vlc_bool_t  b_hscale;                         /* horizontal scaling type */
957     unsigned int i_vscale;                          /* vertical scaling type */
958     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
959
960     int         i_right_margin;
961     int         i_rewind;
962     int         i_scale_count;                       /* scale modulo counter */
963     int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
964     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
965     int         i_uval, i_vval;                           /* U and V samples */
966     int         i_red, i_green, i_blue;          /* U and V modified samples */
967     uint32_t *  p_yuv = p_vout->chroma.p_sys->p_rgb32;
968     uint32_t *  p_ybase;                     /* Y dependent conversion table */
969
970     /* Conversion buffer pointer */
971     uint32_t *  p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
972     uint32_t *  p_buffer;
973
974     /* Offset array pointer */
975     int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
976     int *       p_offset;
977
978     const int i_source_margin = p_src->p[0].i_pitch
979                                  - p_src->p[0].i_visible_pitch;
980     const int i_source_margin_c = p_src->p[1].i_pitch
981                                  - p_src->p[1].i_visible_pitch;
982
983     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
984
985     if( p_vout->render.i_width & 7 )
986     {
987         i_rewind = 8 - ( p_vout->render.i_width & 7 );
988     }
989     else
990     {
991         i_rewind = 0;
992     }
993
994     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
995      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
996      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
997     SetOffset( p_vout->render.i_width, p_vout->render.i_height,
998                p_vout->output.i_width, p_vout->output.i_height,
999                &b_hscale, &i_vscale, p_offset_start );
1000
1001     /*
1002      * Perform conversion
1003      */
1004     i_scale_count = ( i_vscale == 1 ) ?
1005                     p_vout->output.i_height : p_vout->render.i_height;
1006     for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1007     {
1008         p_pic_start = p_pic;
1009         p_buffer = b_hscale ? p_buffer_start : p_pic;
1010
1011         for ( i_x = p_vout->render.i_width / 8; i_x--; )
1012         {
1013             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
1014             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
1015             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
1016             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
1017         }
1018
1019         /* Here we do some unaligned reads and duplicate conversions, but
1020          * at least we have all the pixels */
1021         if( i_rewind )
1022         {
1023             p_y -= i_rewind;
1024             p_u -= i_rewind >> 1;
1025             p_v -= i_rewind >> 1;
1026             p_buffer -= i_rewind;
1027             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
1028             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
1029             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
1030             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
1031         }
1032         SCALE_WIDTH;
1033         SCALE_HEIGHT( 420, 4 );
1034
1035         p_y += i_source_margin;
1036         if( i_y % 2 )
1037         {
1038             p_u += i_source_margin_c;
1039             p_v += i_source_margin_c;
1040         }
1041     }
1042 }
1043
1044 #else // defined (MODULE_NAME_IS_i420_rgb_mmx) || defined (MODULE_NAME_IS_i420_rgb_sse2)
1045
1046 void E_(I420_A8R8G8B8)( vout_thread_t *p_vout, picture_t *p_src,
1047                                             picture_t *p_dest )
1048 {
1049     /* We got this one from the old arguments */
1050     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1051     uint8_t  *p_y   = p_src->Y_PIXELS;
1052     uint8_t  *p_u   = p_src->U_PIXELS;
1053     uint8_t  *p_v   = p_src->V_PIXELS;
1054
1055     vlc_bool_t  b_hscale;                         /* horizontal scaling type */
1056     unsigned int i_vscale;                          /* vertical scaling type */
1057     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
1058
1059     int         i_right_margin;
1060     int         i_rewind;
1061     int         i_scale_count;                       /* scale modulo counter */
1062     int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
1063     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
1064     /* Conversion buffer pointer */
1065     uint32_t *  p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
1066     uint32_t *  p_buffer;
1067
1068     /* Offset array pointer */
1069     int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
1070     int *       p_offset;
1071
1072     const int i_source_margin = p_src->p[0].i_pitch
1073                                  - p_src->p[0].i_visible_pitch;
1074     const int i_source_margin_c = p_src->p[1].i_pitch
1075                                  - p_src->p[1].i_visible_pitch;
1076
1077     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1078
1079     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1080      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1081      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1082     SetOffset( p_vout->render.i_width, p_vout->render.i_height,
1083                p_vout->output.i_width, p_vout->output.i_height,
1084                &b_hscale, &i_vscale, p_offset_start );
1085
1086     /*
1087      * Perform conversion
1088      */
1089     i_scale_count = ( i_vscale == 1 ) ?
1090                     p_vout->output.i_height : p_vout->render.i_height;
1091
1092 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
1093
1094     if( p_vout->render.i_width & 15 )
1095     {
1096         i_rewind = 16 - ( p_vout->render.i_width & 15 );
1097     }
1098     else
1099     {
1100         i_rewind = 0;
1101     }
1102
1103     /*
1104     ** SSE2 128-bit fetch/store instructions are faster
1105     ** when the memory access is 16-byte aligned
1106     */
1107
1108     p_buffer = b_hscale ? p_buffer_start : p_pic;
1109     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1110                     p_dest->p->i_pitch|
1111                     ((uintptr_t)p_y)|
1112                     ((uintptr_t)p_buffer))) )
1113     {
1114         /* use faster SSE2 aligned fetch and store */
1115         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1116         {
1117             p_pic_start = p_pic;
1118
1119             for ( i_x = p_vout->render.i_width / 16; i_x--; )
1120             {
1121 #if defined (CAN_COMPILE_SSE2)
1122                 /* use inline SSE2 assembly */
1123                 __asm__( ".p2align 3"
1124                          SSE2_INIT_32_ALIGNED
1125                          SSE2_YUV_MUL
1126                          SSE2_YUV_ADD
1127                          SSE2_UNPACK_32_ARGB_ALIGNED
1128                          : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
1129 #else
1130                 /* otherwise use SSE2 C intrinsics wrappers */
1131                 __m128i  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
1132
1133                 SSE2_INTRINSICS_INIT_32_ALIGNED
1134                 SSE2_INTRINSICS_YUV_MUL
1135                 SSE2_INTRINSICS_YUV_ADD
1136                 SSE2_INTRINSICS_UNPACK_32_ARGB_ALIGNED
1137 #endif
1138                 p_y += 16;
1139                 p_u += 8;
1140                 p_v += 8;
1141                 p_buffer += 16;
1142             }
1143
1144             /* Here we do some unaligned reads and duplicate conversions, but
1145              * at least we have all the pixels */
1146             if( i_rewind )
1147             {
1148                 p_y -= i_rewind;
1149                 p_u -= i_rewind >> 1;
1150                 p_v -= i_rewind >> 1;
1151                 p_buffer -= i_rewind;
1152 #if defined (CAN_COMPILE_SSE2)
1153                 /* use inline SSE2 assembly */
1154                 __asm__( ".p2align 3"
1155                          SSE2_INIT_32_UNALIGNED
1156                          SSE2_YUV_MUL
1157                          SSE2_YUV_ADD
1158                          SSE2_UNPACK_32_ARGB_UNALIGNED
1159                          : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
1160 #else
1161                 /* otherwise use SSE2 intrinsics wrappers */
1162                 {
1163                     __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
1164
1165                     SSE2_INTRINSICS_INIT_32_UNALIGNED
1166                     SSE2_INTRINSICS_YUV_MUL
1167                     SSE2_INTRINSICS_YUV_ADD
1168                     SSE2_INTRINSICS_UNPACK_32_ARGB_UNALIGNED
1169                 }
1170 #endif
1171                 p_y += 16;
1172                 p_u += 8;
1173                 p_v += 8;
1174             }
1175             SCALE_WIDTH;
1176             SCALE_HEIGHT( 420, 4 );
1177
1178             p_y += i_source_margin;
1179             if( i_y % 2 )
1180             {
1181                 p_u += i_source_margin_c;
1182                 p_v += i_source_margin_c;
1183             }
1184             p_buffer = b_hscale ? p_buffer_start : p_pic;
1185         }
1186     }
1187     else
1188     {
1189         /* use slower SSE2 unaligned fetch and store */
1190         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1191         {
1192             p_pic_start = p_pic;
1193             p_buffer = b_hscale ? p_buffer_start : p_pic;
1194
1195             for ( i_x = p_vout->render.i_width / 16; i_x--; )
1196             {
1197 #if defined (CAN_COMPILE_SSE2)
1198                 /* use inline SSE2 assembly */
1199                 __asm__( ".p2align 3"
1200                          SSE2_INIT_32_UNALIGNED
1201                          SSE2_YUV_MUL
1202                          SSE2_YUV_ADD
1203                          SSE2_UNPACK_32_ARGB_UNALIGNED
1204                          : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
1205 #else
1206                 /* otherwise use SSE2 C intrinsics wrappers */
1207                 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
1208
1209                 SSE2_INTRINSICS_INIT_32_UNALIGNED
1210                 SSE2_INTRINSICS_YUV_MUL
1211                 SSE2_INTRINSICS_YUV_ADD
1212                 SSE2_INTRINSICS_UNPACK_32_ARGB_UNALIGNED
1213 #endif
1214                 p_y += 16;
1215                 p_u += 8;
1216                 p_v += 8;
1217                 p_buffer += 16;
1218             }
1219
1220             /* Here we do some unaligned reads and duplicate conversions, but
1221              * at least we have all the pixels */
1222             if( i_rewind )
1223             {
1224                 p_y -= i_rewind;
1225                 p_u -= i_rewind >> 1;
1226                 p_v -= i_rewind >> 1;
1227                 p_buffer -= i_rewind;
1228 #if defined (CAN_COMPILE_SSE2)
1229                 /* use inline SSE2 assembly */
1230                 __asm__( ".p2align 3"
1231                          SSE2_INIT_32_UNALIGNED
1232                          SSE2_YUV_MUL
1233                          SSE2_YUV_ADD
1234                          SSE2_UNPACK_32_ARGB_UNALIGNED
1235                          : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
1236 #else
1237                 /* otherwise use SSE2 intrinsics wrappers */
1238                 {
1239                     __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
1240
1241                     SSE2_INTRINSICS_INIT_32_UNALIGNED
1242                     SSE2_INTRINSICS_YUV_MUL
1243                     SSE2_INTRINSICS_YUV_ADD
1244                     SSE2_INTRINSICS_UNPACK_32_ARGB_UNALIGNED
1245                 }
1246 #endif
1247                 p_y += 16;
1248                 p_u += 8;
1249                 p_v += 8;
1250             }
1251             SCALE_WIDTH;
1252             SCALE_HEIGHT( 420, 4 );
1253
1254             p_y += i_source_margin;
1255             if( i_y % 2 )
1256             {
1257                 p_u += i_source_margin_c;
1258                 p_v += i_source_margin_c;
1259             }
1260             p_buffer = b_hscale ? p_buffer_start : p_pic;
1261         }
1262     }
1263
1264     /* make sure all SSE2 stores are visible thereafter */
1265 #if defined (CAN_COMPILE_SSE2)
1266     __asm__ __volatile__ ( "sfence" ::: "memory" );
1267 #else
1268     _mm_sfence();
1269 #endif
1270
1271 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
1272
1273     if( p_vout->render.i_width & 7 )
1274     {
1275         i_rewind = 8 - ( p_vout->render.i_width & 7 );
1276     }
1277     else
1278     {
1279         i_rewind = 0;
1280     }
1281
1282     for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1283     {
1284         p_pic_start = p_pic;
1285         p_buffer = b_hscale ? p_buffer_start : p_pic;
1286
1287         for ( i_x = p_vout->render.i_width / 8; i_x--; )
1288         {
1289 #if defined (CAN_COMPILE_MMX)
1290             /* use inline MMX assembly */
1291             __asm__( MMX_INIT_32
1292                      : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
1293
1294             __asm__( ".p2align 3"
1295                      MMX_YUV_MUL
1296                      MMX_YUV_ADD
1297                      MMX_UNPACK_32_ARGB
1298                      : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
1299 #else
1300             /* otherwise use MMX C intrinsics wrappers */
1301             __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
1302             uint64_t tmp64;
1303
1304             MMX_INTRINSICS_INIT_32
1305             MMX_INTRINSICS_YUV_MUL
1306             MMX_INTRINSICS_YUV_ADD
1307             MMX_INTRINSICS_UNPACK_32_ARGB
1308 #endif
1309             p_y += 8;
1310             p_u += 4;
1311             p_v += 4;
1312             p_buffer += 8;
1313         }
1314
1315         /* Here we do some unaligned reads and duplicate conversions, but
1316          * at least we have all the pixels */
1317         if( i_rewind )
1318         {
1319             p_y -= i_rewind;
1320             p_u -= i_rewind >> 1;
1321             p_v -= i_rewind >> 1;
1322             p_buffer -= i_rewind;
1323 #if defined (CAN_COMPILE_MMX)
1324             /* use inline MMX assembly */
1325             __asm__( ".p2align 3"
1326                      MMX_INIT_32
1327                      MMX_YUV_MUL
1328                      MMX_YUV_ADD
1329                      MMX_UNPACK_32_ARGB
1330                      : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
1331 #else
1332             /* otherwise use MMX intrinsics wrappers */
1333             {
1334                 __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
1335                 uint64_t tmp64;
1336
1337                 MMX_INTRINSICS_INIT_32
1338                 MMX_INTRINSICS_YUV_MUL
1339                 MMX_INTRINSICS_YUV_ADD
1340                 MMX_INTRINSICS_UNPACK_32_ARGB
1341             }
1342 #endif
1343             p_y += 8;
1344             p_u += 4;
1345             p_v += 4;
1346             p_buffer += 8;
1347         }
1348         SCALE_WIDTH;
1349         SCALE_HEIGHT( 420, 4 );
1350
1351         p_y += i_source_margin;
1352         if( i_y % 2 )
1353         {
1354             p_u += i_source_margin_c;
1355             p_v += i_source_margin_c;
1356         }
1357     }
1358     /* re-enable FPU registers */
1359 #if defined (CAN_COMPILE_MMX)
1360     __asm__ __volatile__ ( "emms" );
1361 #else
1362     _mm_empty();
1363 #endif
1364
1365 #endif
1366 }
1367
1368 void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src,
1369                                             picture_t *p_dest )
1370 {
1371     /* We got this one from the old arguments */
1372     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1373     uint8_t  *p_y   = p_src->Y_PIXELS;
1374     uint8_t  *p_u   = p_src->U_PIXELS;
1375     uint8_t  *p_v   = p_src->V_PIXELS;
1376
1377     vlc_bool_t  b_hscale;                         /* horizontal scaling type */
1378     unsigned int i_vscale;                          /* vertical scaling type */
1379     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
1380
1381     int         i_right_margin;
1382     int         i_rewind;
1383     int         i_scale_count;                       /* scale modulo counter */
1384     int         i_chroma_width = p_vout->render.i_width / 2; /* chroma width */
1385     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
1386     /* Conversion buffer pointer */
1387     uint32_t *  p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer;
1388     uint32_t *  p_buffer;
1389
1390     /* Offset array pointer */
1391     int *       p_offset_start = p_vout->chroma.p_sys->p_offset;
1392     int *       p_offset;
1393
1394     const int i_source_margin = p_src->p[0].i_pitch
1395                                  - p_src->p[0].i_visible_pitch;
1396     const int i_source_margin_c = p_src->p[1].i_pitch
1397                                  - p_src->p[1].i_visible_pitch;
1398
1399     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1400
1401     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1402      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1403      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1404     SetOffset( p_vout->render.i_width, p_vout->render.i_height,
1405                p_vout->output.i_width, p_vout->output.i_height,
1406                &b_hscale, &i_vscale, p_offset_start );
1407
1408     /*
1409      * Perform conversion
1410      */
1411     i_scale_count = ( i_vscale == 1 ) ?
1412                     p_vout->output.i_height : p_vout->render.i_height;
1413
1414 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
1415
1416     if( p_vout->render.i_width & 15 )
1417     {
1418         i_rewind = 16 - ( p_vout->render.i_width & 15 );
1419     }
1420     else
1421     {
1422         i_rewind = 0;
1423     }
1424
1425     /*
1426     ** SSE2 128-bit fetch/store instructions are faster
1427     ** when the memory access is 16-byte aligned
1428     */
1429
1430     p_buffer = b_hscale ? p_buffer_start : p_pic;
1431     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1432                     p_dest->p->i_pitch|
1433                     ((uintptr_t)p_y)|
1434                     ((uintptr_t)p_buffer))) )
1435     {
1436         /* use faster SSE2 aligned fetch and store */
1437         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1438         {
1439             p_pic_start = p_pic;
1440
1441             for ( i_x = p_vout->render.i_width / 16; i_x--; )
1442             {
1443 #if defined (CAN_COMPILE_SSE2)
1444                 /* use inline SSE2 assembly */
1445                 __asm__( ".p2align 3"
1446                          SSE2_INIT_32_ALIGNED
1447                          SSE2_YUV_MUL
1448                          SSE2_YUV_ADD
1449                          SSE2_UNPACK_32_BGRA_ALIGNED
1450                          : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
1451 #else
1452                 /* otherwise use SSE2 C intrinsics wrappers */
1453                 __m128i  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
1454
1455                 SSE2_INTRINSICS_INIT_32_ALIGNED
1456                 SSE2_INTRINSICS_YUV_MUL
1457                 SSE2_INTRINSICS_YUV_ADD
1458                 SSE2_INTRINSICS_UNPACK_32_BGRA_ALIGNED
1459 #endif
1460                 p_y += 16;
1461                 p_u += 8;
1462                 p_v += 8;
1463                 p_buffer += 16;
1464             }
1465
1466             /* Here we do some unaligned reads and duplicate conversions, but
1467              * at least we have all the pixels */
1468             if( i_rewind )
1469             {
1470                 p_y -= i_rewind;
1471                 p_u -= i_rewind >> 1;
1472                 p_v -= i_rewind >> 1;
1473                 p_buffer -= i_rewind;
1474 #if defined (CAN_COMPILE_SSE2)
1475                 /* use inline SSE2 assembly */
1476                 __asm__( ".p2align 3"
1477                          SSE2_INIT_32_UNALIGNED
1478                          SSE2_YUV_MUL
1479                          SSE2_YUV_ADD
1480                          SSE2_UNPACK_32_BGRA_UNALIGNED
1481                          : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
1482 #else
1483                 /* otherwise use SSE2 intrinsics wrappers */
1484                 {
1485                     __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
1486
1487                     SSE2_INTRINSICS_INIT_32_UNALIGNED
1488                     SSE2_INTRINSICS_YUV_MUL
1489                     SSE2_INTRINSICS_YUV_ADD
1490                     SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED
1491                 }
1492 #endif
1493                 p_y += 16;
1494                 p_u += 8;
1495                 p_v += 8;
1496             }
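            /* SCALE_WIDTH handles horizontal scaling of the converted line
             * (via the offset table when b_hscale is set) and SCALE_HEIGHT
             * handles vertical scaling and stepping to the next output line;
             * the "4" is the number of bytes per output pixel */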
1497             SCALE_WIDTH;
1498             SCALE_HEIGHT( 420, 4 );
1499
1500             p_y += i_source_margin;
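            /* 4:2:0 chroma is subsampled vertically, so the U and V margins
             * are only added every second luma line */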
1501             if( i_y % 2 )
1502             {
1503                 p_u += i_source_margin_c;
1504                 p_v += i_source_margin_c;
1505             }
1506             p_buffer = b_hscale ? p_buffer_start : p_pic;
1507         }
1508     }
1509     else
1510     {
1511         /* use slower SSE2 unaligned fetch and store */
1512         for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1513         {
1514             p_pic_start = p_pic;
1515             p_buffer = b_hscale ? p_buffer_start : p_pic;
1516
1517             for ( i_x = p_vout->render.i_width / 16; i_x--; )
1518             {
1519 #if defined (CAN_COMPILE_SSE2)
1520                 /* use inline SSE2 assembly */
1521                 __asm__( ".p2align 3"
1522                          SSE2_INIT_32_UNALIGNED
1523                          SSE2_YUV_MUL
1524                          SSE2_YUV_ADD
1525                          SSE2_UNPACK_32_BGRA_UNALIGNED
1526                          : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
1527 #else
1528                 /* otherwise use SSE2 C intrinsics wrappers */
1529                 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
1530
1531                 SSE2_INTRINSICS_INIT_32_UNALIGNED
1532                 SSE2_INTRINSICS_YUV_MUL
1533                 SSE2_INTRINSICS_YUV_ADD
1534                 SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED
1535 #endif
1536                 p_y += 16;
1537                 p_u += 8;
1538                 p_v += 8;
1539                 p_buffer += 16;
1540             }
1541
1542             /* Here we do some unaligned reads and duplicate conversions, but
1543              * at least we have all the pixels */
1544             if( i_rewind )
1545             {
1546                 p_y -= i_rewind;
1547                 p_u -= i_rewind >> 1;
1548                 p_v -= i_rewind >> 1;
1549                 p_buffer -= i_rewind;
1550 #if defined (CAN_COMPILE_SSE2)
1551                 /* use inline SSE2 assembly */
1552                 __asm__( ".p2align 3"
1553                          SSE2_INIT_32_UNALIGNED
1554                          SSE2_YUV_MUL
1555                          SSE2_YUV_ADD
1556                          SSE2_UNPACK_32_BGRA_UNALIGNED
1557                          : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) : "eax" );
1558 #else
1559                 /* otherwise use SSE2 intrinsics wrappers */
1560                 {
1561                     __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
1562
1563                     SSE2_INTRINSICS_INIT_32_UNALIGNED
1564                     SSE2_INTRINSICS_YUV_MUL
1565                     SSE2_INTRINSICS_YUV_ADD
1566                     SSE2_INTRINSICS_UNPACK_32_BGRA_UNALIGNED
1567                 }
1568 #endif
1569                 p_y += 16;
1570                 p_u += 8;
1571                 p_v += 8;
1572             }
1573             SCALE_WIDTH;
1574             SCALE_HEIGHT( 420, 4 );
1575
1576             p_y += i_source_margin;
1577             if( i_y % 2 )
1578             {
1579                 p_u += i_source_margin_c;
1580                 p_v += i_source_margin_c;
1581             }
1582             p_buffer = b_hscale ? p_buffer_start : p_pic;
1583         }
1584     }
1585
1586 #else
1587
1588     if( p_vout->render.i_width & 7 )
1589     {
1590         i_rewind = 8 - ( p_vout->render.i_width & 7 );
1591     }
1592     else
1593     {
1594         i_rewind = 0;
1595     }
1596
1597     for( i_y = 0; i_y < p_vout->render.i_height; i_y++ )
1598     {
1599         p_pic_start = p_pic;
1600         p_buffer = b_hscale ? p_buffer_start : p_pic;
1601
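        /* Each iteration consumes 8 Y samples and 4 U/V samples and
         * produces 8 packed 32-bit pixels */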
1602         for ( i_x = p_vout->render.i_width / 8; i_x--; )
1603         {
1604 #if defined (CAN_COMPILE_MMX)
1605             /* use inline MMX assembly */
1606             __asm__( MMX_INIT_32
1607                      : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
1608
1609             __asm__( ".p2align 3"
1610                      MMX_YUV_MUL
1611                      MMX_YUV_ADD
1612                      MMX_UNPACK_32_BGRA
1613                      : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
1614 #else
1615             /* otherwise use MMX C intrinsics wrappers */
1616             __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
1617             uint64_t tmp64;
1618
1619             MMX_INTRINSICS_INIT_32
1620             MMX_INTRINSICS_YUV_MUL
1621             MMX_INTRINSICS_YUV_ADD
1622             MMX_INTRINSICS_UNPACK_32_BGRA
1623 #endif
1624             p_y += 8;
1625             p_u += 4;
1626             p_v += 4;
1627             p_buffer += 8;
1628         }
1629
1630         /* Here we do some unaligned reads and duplicate conversions, but
1631          * at least we have all the pixels */
1632         if( i_rewind )
1633         {
1634             p_y -= i_rewind;
1635             p_u -= i_rewind >> 1;
1636             p_v -= i_rewind >> 1;
1637             p_buffer -= i_rewind;
1638 #if defined (CAN_COMPILE_MMX)
1639             /* use inline MMX assembly */
1640             __asm__( ".p2align 3"
1641                      MMX_INIT_32
1642                      MMX_YUV_MUL
1643                      MMX_YUV_ADD
1644                      MMX_UNPACK_32_BGRA
1645                      : : "r" (p_y), "r" (p_u), "r" (p_v), "r" (p_buffer) );
1646 #else
1647             /* otherwise use MMX intrinsics wrappers */
1648             {
1649                 __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
1650                 uint64_t tmp64;
1651
1652                 MMX_INTRINSICS_INIT_32
1653                 MMX_INTRINSICS_YUV_MUL
1654                 MMX_INTRINSICS_YUV_ADD
1655                 MMX_INTRINSICS_UNPACK_32_BGRA
1656             }
1657 #endif
1658             p_y += 8;
1659             p_u += 4;
1660             p_v += 4;
1661             p_buffer += 8;
1662         }
1663         SCALE_WIDTH;
1664         SCALE_HEIGHT( 420, 4 );
1665
1666         p_y += i_source_margin;
1667         if( i_y % 2 )
1668         {
1669             p_u += i_source_margin_c;
1670             p_v += i_source_margin_c;
1671         }
1672     }
1673     /* empty the MMX state to re-enable use of the FPU registers */
1674 #if defined (CAN_COMPILE_MMX)
1675     __asm__ __volatile__ ( "emms" );
1676 #else
1677     _mm_empty();
1678 #endif
1679
1680 #endif
1681 }
1682
1683 #endif
1684
1685 /* Following functions are local */
1686
1687 /*****************************************************************************
1688  * SetOffset: build offset array for conversion functions
1689  *****************************************************************************
1690  * This function will build an offset array used in later conversion functions.
1691  * It will also set horizontal and vertical scaling indicators.
1692  *****************************************************************************/
1693 static void SetOffset( int i_width, int i_height, int i_pic_width,
1694                        int i_pic_height, vlc_bool_t *pb_hscale,
1695                        unsigned int *pi_vscale, int *p_offset )
1696 {
1697     int i_x;                                    /* x position in destination */
1698     int i_scale_count;                                     /* modulo counter */
1699
1700     /*
1701      * Prepare horizontal offset array
1702      */
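    /* Each entry of the offset array tells the conversion loop how far to
     * advance in the source for one destination pixel: 0 re-uses the current
     * source pixel (extension), 1 or more skips source pixels (reduction) */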
1703     if( i_pic_width - i_width == 0 )
1704     {
1705         /* No horizontal scaling: YUV conversion is done directly to picture */
1706         *pb_hscale = 0;
1707     }
1708     else if( i_pic_width - i_width > 0 )
1709     {
1710         /* Prepare scaling array for horizontal extension */
1711         *pb_hscale = 1;
1712         i_scale_count = i_pic_width;
1713         for( i_x = i_width; i_x--; )
1714         {
1715             while( (i_scale_count -= i_width) > 0 )
1716             {
1717                 *p_offset++ = 0;
1718             }
1719             *p_offset++ = 1;
1720             i_scale_count += i_pic_width;
1721         }
1722     }
1723     else /* if( i_pic_width - i_width < 0 ) */
1724     {
1725         /* Prepare scaling array for horizontal reduction */
1726         *pb_hscale = 1;
1727         i_scale_count = i_width;
1728         for( i_x = i_pic_width; i_x--; )
1729         {
1730             *p_offset = 1;
1731             while( (i_scale_count -= i_pic_width) > 0 )
1732             {
1733                 *p_offset += 1;
1734             }
1735             p_offset++;
1736             i_scale_count += i_width;
1737         }
1738     }
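    /* As an illustrative sketch only (the real work is done by the
     * SCALE_WIDTH macro, which also deals with the pixel format), a consumer
     * of this table would walk it roughly like:
     *     for( i = 0; i < i_pic_width; i++ )
     *     {
     *         *p_dst++ = *p_src;        // emit one destination pixel
     *         p_src   += *p_offset++;   // advance 0..n source pixels
     *     }
     */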
1739
1740     /*
1741      * Set vertical scaling indicator
1742      */
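    /* 0 means no vertical scaling, 1 means expansion; reduction is flagged
     * with -1, which wraps in the unsigned *pi_vscale */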
1743     if( i_pic_height - i_height == 0 )
1744     {
1745         *pi_vscale = 0;
1746     }
1747     else if( i_pic_height - i_height > 0 )
1748     {
1749         *pi_vscale = 1;
1750     }
1751     else /* if( i_pic_height - i_height < 0 ) */
1752     {
1753         *pi_vscale = -1;
1754     }
1755 }
1756